diff options
author | Jason Ekstrand <[email protected]> | 2017-02-28 09:10:43 -0800 |
---|---|---|
committer | Emil Velikov <[email protected]> | 2017-03-13 11:16:34 +0000 |
commit | 700bebb958e93f4d472c383de62ced9db8e64bec (patch) | |
tree | 0075c098c56c338f38ba0db80b9dba3e7e268a17 /src/intel/compiler | |
parent | d0d4a5f43b4dd79bd7bfff7c7deaade10bfebf7c (diff) |
i965: Move the back-end compiler to src/intel/compiler
Mostly a dummy git mv with a couple of noticeable parts:
- With the earlier header cleanups, nothing in src/intel depends on
files from src/mesa/drivers/dri/i965/
- Both Autoconf and Android builds are addressed. Thanks to Mauro and
Tapani for the fixups in the latter
- brw_util.[ch] is not really compiler specific, so it's moved to i965.
v2:
- move brw_eu_defines.h instead of brw_defines.h
- remove no-longer applicable includes
- add missing vulkan/ prefix in the Android build (thanks Tapani)
v3:
- don't list brw_defines.h in src/intel/Makefile.sources (Jason)
- rebase on top of the oa patches
[Emil Velikov: commit message, various small fixes throughout]
Signed-off-by: Emil Velikov <[email protected]>
Reviewed-by: Jason Ekstrand <[email protected]>
Diffstat (limited to 'src/intel/compiler')
95 files changed, 63683 insertions, 0 deletions
diff --git a/src/intel/compiler/.gitignore b/src/intel/compiler/.gitignore new file mode 100644 index 00000000000..e844421b336 --- /dev/null +++ b/src/intel/compiler/.gitignore @@ -0,0 +1,10 @@ +brw_nir_trig_workarounds.c +test_eu_compact +test_eu_validate +test_fs_cmod_propagation +test_fs_copy_propagation +test_fs_saturate_propagation +test_vec4_cmod_propagation +test_vec4_copy_propagation +test_vec4_register_coalesce +test_vf_float_conversions diff --git a/src/intel/compiler/brw_cfg.cpp b/src/intel/compiler/brw_cfg.cpp new file mode 100644 index 00000000000..fad12eec588 --- /dev/null +++ b/src/intel/compiler/brw_cfg.cpp @@ -0,0 +1,531 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * Authors: + * Eric Anholt <[email protected]> + * + */ + +#include "brw_cfg.h" + +/** @file brw_cfg.cpp + * + * Walks the shader instructions generated and creates a set of basic + * blocks with successor/predecessor edges connecting them. + */ + +static bblock_t * +pop_stack(exec_list *list) +{ + bblock_link *link = (bblock_link *)list->get_tail(); + bblock_t *block = link->block; + link->link.remove(); + + return block; +} + +static exec_node * +link(void *mem_ctx, bblock_t *block) +{ + bblock_link *l = new(mem_ctx) bblock_link(block); + return &l->link; +} + +bblock_t::bblock_t(cfg_t *cfg) : + cfg(cfg), idom(NULL), start_ip(0), end_ip(0), num(0), cycle_count(0) +{ + instructions.make_empty(); + parents.make_empty(); + children.make_empty(); +} + +void +bblock_t::add_successor(void *mem_ctx, bblock_t *successor) +{ + successor->parents.push_tail(::link(mem_ctx, this)); + children.push_tail(::link(mem_ctx, successor)); +} + +bool +bblock_t::is_predecessor_of(const bblock_t *block) const +{ + foreach_list_typed_safe (bblock_link, parent, link, &block->parents) { + if (parent->block == this) { + return true; + } + } + + return false; +} + +bool +bblock_t::is_successor_of(const bblock_t *block) const +{ + foreach_list_typed_safe (bblock_link, child, link, &block->children) { + if (child->block == this) { + return true; + } + } + + return false; +} + +static bool +ends_block(const backend_instruction *inst) +{ + enum opcode op = inst->opcode; + + return op == BRW_OPCODE_IF || + op == BRW_OPCODE_ELSE || + op == BRW_OPCODE_CONTINUE || + op == BRW_OPCODE_BREAK || + op == BRW_OPCODE_WHILE; +} + +static bool +starts_block(const backend_instruction *inst) +{ + enum opcode op = inst->opcode; + + return op == BRW_OPCODE_DO || + op == BRW_OPCODE_ENDIF; +} + +bool +bblock_t::can_combine_with(const bblock_t *that) const +{ + if ((const bblock_t *)this->link.next != that) + return false; + + if (ends_block(this->end()) || + starts_block(that->start())) + return false; + + 
return true; +} + +void +bblock_t::combine_with(bblock_t *that) +{ + assert(this->can_combine_with(that)); + foreach_list_typed (bblock_link, link, link, &this->children) { + assert(link->block == that); + } + foreach_list_typed (bblock_link, link, link, &that->parents) { + assert(link->block == this); + } + + this->end_ip = that->end_ip; + this->instructions.append_list(&that->instructions); + + this->cfg->remove_block(that); +} + +void +bblock_t::dump(backend_shader *s) const +{ + int ip = this->start_ip; + foreach_inst_in_block(backend_instruction, inst, this) { + fprintf(stderr, "%5d: ", ip); + s->dump_instruction(inst); + ip++; + } +} + +cfg_t::cfg_t(exec_list *instructions) +{ + mem_ctx = ralloc_context(NULL); + block_list.make_empty(); + blocks = NULL; + num_blocks = 0; + idom_dirty = true; + cycle_count = 0; + + bblock_t *cur = NULL; + int ip = 0; + + bblock_t *entry = new_block(); + bblock_t *cur_if = NULL; /**< BB ending with IF. */ + bblock_t *cur_else = NULL; /**< BB ending with ELSE. */ + bblock_t *cur_endif = NULL; /**< BB starting with ENDIF. */ + bblock_t *cur_do = NULL; /**< BB starting with DO. */ + bblock_t *cur_while = NULL; /**< BB immediately following WHILE. */ + exec_list if_stack, else_stack, do_stack, while_stack; + bblock_t *next; + + set_next_block(&cur, entry, ip); + + foreach_in_list_safe(backend_instruction, inst, instructions) { + /* set_next_block wants the post-incremented ip */ + ip++; + + inst->exec_node::remove(); + + switch (inst->opcode) { + case BRW_OPCODE_IF: + cur->instructions.push_tail(inst); + + /* Push our information onto a stack so we can recover from + * nested ifs. + */ + if_stack.push_tail(link(mem_ctx, cur_if)); + else_stack.push_tail(link(mem_ctx, cur_else)); + + cur_if = cur; + cur_else = NULL; + cur_endif = NULL; + + /* Set up our immediately following block, full of "then" + * instructions. 
+ */ + next = new_block(); + cur_if->add_successor(mem_ctx, next); + + set_next_block(&cur, next, ip); + break; + + case BRW_OPCODE_ELSE: + cur->instructions.push_tail(inst); + + cur_else = cur; + + next = new_block(); + assert(cur_if != NULL); + cur_if->add_successor(mem_ctx, next); + + set_next_block(&cur, next, ip); + break; + + case BRW_OPCODE_ENDIF: { + if (cur->instructions.is_empty()) { + /* New block was just created; use it. */ + cur_endif = cur; + } else { + cur_endif = new_block(); + + cur->add_successor(mem_ctx, cur_endif); + + set_next_block(&cur, cur_endif, ip - 1); + } + + cur->instructions.push_tail(inst); + + if (cur_else) { + cur_else->add_successor(mem_ctx, cur_endif); + } else { + assert(cur_if != NULL); + cur_if->add_successor(mem_ctx, cur_endif); + } + + assert(cur_if->end()->opcode == BRW_OPCODE_IF); + assert(!cur_else || cur_else->end()->opcode == BRW_OPCODE_ELSE); + + /* Pop the stack so we're in the previous if/else/endif */ + cur_if = pop_stack(&if_stack); + cur_else = pop_stack(&else_stack); + break; + } + case BRW_OPCODE_DO: + /* Push our information onto a stack so we can recover from + * nested loops. + */ + do_stack.push_tail(link(mem_ctx, cur_do)); + while_stack.push_tail(link(mem_ctx, cur_while)); + + /* Set up the block just after the while. Don't know when exactly + * it will start, yet. + */ + cur_while = new_block(); + + if (cur->instructions.is_empty()) { + /* New block was just created; use it. 
*/ + cur_do = cur; + } else { + cur_do = new_block(); + + cur->add_successor(mem_ctx, cur_do); + + set_next_block(&cur, cur_do, ip - 1); + } + + cur->instructions.push_tail(inst); + break; + + case BRW_OPCODE_CONTINUE: + cur->instructions.push_tail(inst); + + assert(cur_do != NULL); + cur->add_successor(mem_ctx, cur_do); + + next = new_block(); + if (inst->predicate) + cur->add_successor(mem_ctx, next); + + set_next_block(&cur, next, ip); + break; + + case BRW_OPCODE_BREAK: + cur->instructions.push_tail(inst); + + assert(cur_while != NULL); + cur->add_successor(mem_ctx, cur_while); + + next = new_block(); + if (inst->predicate) + cur->add_successor(mem_ctx, next); + + set_next_block(&cur, next, ip); + break; + + case BRW_OPCODE_WHILE: + cur->instructions.push_tail(inst); + + assert(cur_do != NULL && cur_while != NULL); + cur->add_successor(mem_ctx, cur_do); + + if (inst->predicate) + cur->add_successor(mem_ctx, cur_while); + + set_next_block(&cur, cur_while, ip); + + /* Pop the stack so we're in the previous loop */ + cur_do = pop_stack(&do_stack); + cur_while = pop_stack(&while_stack); + break; + + default: + cur->instructions.push_tail(inst); + break; + } + } + + cur->end_ip = ip - 1; + + make_block_array(); +} + +cfg_t::~cfg_t() +{ + ralloc_free(mem_ctx); +} + +void +cfg_t::remove_block(bblock_t *block) +{ + foreach_list_typed_safe (bblock_link, predecessor, link, &block->parents) { + /* Remove block from all of its predecessors' successor lists. */ + foreach_list_typed_safe (bblock_link, successor, link, + &predecessor->block->children) { + if (block == successor->block) { + successor->link.remove(); + ralloc_free(successor); + } + } + + /* Add removed-block's successors to its predecessors' successor lists. 
*/ + foreach_list_typed (bblock_link, successor, link, &block->children) { + if (!successor->block->is_successor_of(predecessor->block)) { + predecessor->block->children.push_tail(link(mem_ctx, + successor->block)); + } + } + } + + foreach_list_typed_safe (bblock_link, successor, link, &block->children) { + /* Remove block from all of its childrens' parents lists. */ + foreach_list_typed_safe (bblock_link, predecessor, link, + &successor->block->parents) { + if (block == predecessor->block) { + predecessor->link.remove(); + ralloc_free(predecessor); + } + } + + /* Add removed-block's predecessors to its successors' predecessor lists. */ + foreach_list_typed (bblock_link, predecessor, link, &block->parents) { + if (!predecessor->block->is_predecessor_of(successor->block)) { + successor->block->parents.push_tail(link(mem_ctx, + predecessor->block)); + } + } + } + + block->link.remove(); + + for (int b = block->num; b < this->num_blocks - 1; b++) { + this->blocks[b] = this->blocks[b + 1]; + this->blocks[b]->num = b; + } + + this->blocks[this->num_blocks - 1]->num = this->num_blocks - 2; + this->num_blocks--; + idom_dirty = true; +} + +bblock_t * +cfg_t::new_block() +{ + bblock_t *block = new(mem_ctx) bblock_t(this); + + return block; +} + +void +cfg_t::set_next_block(bblock_t **cur, bblock_t *block, int ip) +{ + if (*cur) { + (*cur)->end_ip = ip - 1; + } + + block->start_ip = ip; + block->num = num_blocks++; + block_list.push_tail(&block->link); + *cur = block; +} + +void +cfg_t::make_block_array() +{ + blocks = ralloc_array(mem_ctx, bblock_t *, num_blocks); + + int i = 0; + foreach_block (block, this) { + blocks[i++] = block; + } + assert(i == num_blocks); +} + +void +cfg_t::dump(backend_shader *s) +{ + if (idom_dirty) + calculate_idom(); + + foreach_block (block, this) { + if (block->idom) + fprintf(stderr, "START B%d IDOM(B%d)", block->num, block->idom->num); + else + fprintf(stderr, "START B%d IDOM(none)", block->num); + + foreach_list_typed(bblock_link, link, 
link, &block->parents) { + fprintf(stderr, " <-B%d", + link->block->num); + } + fprintf(stderr, "\n"); + if (s != NULL) + block->dump(s); + fprintf(stderr, "END B%d", block->num); + foreach_list_typed(bblock_link, link, link, &block->children) { + fprintf(stderr, " ->B%d", + link->block->num); + } + fprintf(stderr, "\n"); + } +} + +/* Calculates the immediate dominator of each block, according to "A Simple, + * Fast Dominance Algorithm" by Keith D. Cooper, Timothy J. Harvey, and Ken + * Kennedy. + * + * The authors claim that for control flow graphs of sizes normally encountered + * (less than 1000 nodes) that this algorithm is significantly faster than + * others like Lengauer-Tarjan. + */ +void +cfg_t::calculate_idom() +{ + foreach_block(block, this) { + block->idom = NULL; + } + blocks[0]->idom = blocks[0]; + + bool changed; + do { + changed = false; + + foreach_block(block, this) { + if (block->num == 0) + continue; + + bblock_t *new_idom = NULL; + foreach_list_typed(bblock_link, parent, link, &block->parents) { + if (parent->block->idom) { + if (new_idom == NULL) { + new_idom = parent->block; + } else if (parent->block->idom != NULL) { + new_idom = intersect(parent->block, new_idom); + } + } + } + + if (block->idom != new_idom) { + block->idom = new_idom; + changed = true; + } + } + } while (changed); + + idom_dirty = false; +} + +bblock_t * +cfg_t::intersect(bblock_t *b1, bblock_t *b2) +{ + /* Note, the comparisons here are the opposite of what the paper says + * because we index blocks from beginning -> end (i.e. reverse post-order) + * instead of post-order like they assume. 
+ */ + while (b1->num != b2->num) { + while (b1->num > b2->num) + b1 = b1->idom; + while (b2->num > b1->num) + b2 = b2->idom; + } + assert(b1); + return b1; +} + +void +cfg_t::dump_cfg() +{ + printf("digraph CFG {\n"); + for (int b = 0; b < num_blocks; b++) { + bblock_t *block = this->blocks[b]; + + foreach_list_typed_safe (bblock_link, child, link, &block->children) { + printf("\t%d -> %d\n", b, child->block->num); + } + } + printf("}\n"); +} + +void +cfg_t::dump_domtree() +{ + printf("digraph DominanceTree {\n"); + foreach_block(block, this) { + if (block->idom) { + printf("\t%d -> %d\n", block->idom->num, block->num); + } + } + printf("}\n"); +} diff --git a/src/intel/compiler/brw_cfg.h b/src/intel/compiler/brw_cfg.h new file mode 100644 index 00000000000..b8af40f725f --- /dev/null +++ b/src/intel/compiler/brw_cfg.h @@ -0,0 +1,358 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <[email protected]> + * + */ + +#pragma once +#ifndef BRW_CFG_H +#define BRW_CFG_H + +#include "brw_shader.h" + +struct bblock_t; + +struct bblock_link { +#ifdef __cplusplus + DECLARE_RALLOC_CXX_OPERATORS(bblock_link) + + bblock_link(bblock_t *block) + : block(block) + { + } +#endif + + struct exec_node link; + struct bblock_t *block; +}; + +struct backend_instruction; + +struct bblock_t { +#ifdef __cplusplus + DECLARE_RALLOC_CXX_OPERATORS(bblock_t) + + explicit bblock_t(cfg_t *cfg); + + void add_successor(void *mem_ctx, bblock_t *successor); + bool is_predecessor_of(const bblock_t *block) const; + bool is_successor_of(const bblock_t *block) const; + bool can_combine_with(const bblock_t *that) const; + void combine_with(bblock_t *that); + void dump(backend_shader *s) const; + + backend_instruction *start(); + const backend_instruction *start() const; + backend_instruction *end(); + const backend_instruction *end() const; + + bblock_t *next(); + const bblock_t *next() const; + bblock_t *prev(); + const bblock_t *prev() const; + + bool starts_with_control_flow() const; + bool ends_with_control_flow() const; + + backend_instruction *first_non_control_flow_inst(); + backend_instruction *last_non_control_flow_inst(); +#endif + + struct exec_node link; + struct cfg_t *cfg; + struct bblock_t *idom; + + int start_ip; + int end_ip; + + struct exec_list instructions; + struct exec_list parents; + struct exec_list children; + int num; + + unsigned cycle_count; +}; + +static inline struct backend_instruction * +bblock_start(struct bblock_t *block) +{ + return (struct backend_instruction *)exec_list_get_head(&block->instructions); +} + +static inline const struct 
backend_instruction * +bblock_start_const(const struct bblock_t *block) +{ + return (const struct backend_instruction *)exec_list_get_head_const(&block->instructions); +} + +static inline struct backend_instruction * +bblock_end(struct bblock_t *block) +{ + return (struct backend_instruction *)exec_list_get_tail(&block->instructions); +} + +static inline const struct backend_instruction * +bblock_end_const(const struct bblock_t *block) +{ + return (const struct backend_instruction *)exec_list_get_tail_const(&block->instructions); +} + +static inline struct bblock_t * +bblock_next(struct bblock_t *block) +{ + if (exec_node_is_tail_sentinel(block->link.next)) + return NULL; + + return (struct bblock_t *)block->link.next; +} + +static inline const struct bblock_t * +bblock_next_const(const struct bblock_t *block) +{ + if (exec_node_is_tail_sentinel(block->link.next)) + return NULL; + + return (const struct bblock_t *)block->link.next; +} + +static inline struct bblock_t * +bblock_prev(struct bblock_t *block) +{ + if (exec_node_is_head_sentinel(block->link.prev)) + return NULL; + + return (struct bblock_t *)block->link.prev; +} + +static inline const struct bblock_t * +bblock_prev_const(const struct bblock_t *block) +{ + if (exec_node_is_head_sentinel(block->link.prev)) + return NULL; + + return (const struct bblock_t *)block->link.prev; +} + +static inline bool +bblock_starts_with_control_flow(const struct bblock_t *block) +{ + enum opcode op = bblock_start_const(block)->opcode; + return op == BRW_OPCODE_DO || op == BRW_OPCODE_ENDIF; +} + +static inline bool +bblock_ends_with_control_flow(const struct bblock_t *block) +{ + enum opcode op = bblock_end_const(block)->opcode; + return op == BRW_OPCODE_IF || + op == BRW_OPCODE_ELSE || + op == BRW_OPCODE_WHILE || + op == BRW_OPCODE_BREAK || + op == BRW_OPCODE_CONTINUE; +} + +static inline struct backend_instruction * +bblock_first_non_control_flow_inst(struct bblock_t *block) +{ + struct backend_instruction *inst = 
bblock_start(block); + if (bblock_starts_with_control_flow(block)) +#ifdef __cplusplus + inst = (struct backend_instruction *)inst->next; +#else + inst = (struct backend_instruction *)inst->link.next; +#endif + return inst; +} + +static inline struct backend_instruction * +bblock_last_non_control_flow_inst(struct bblock_t *block) +{ + struct backend_instruction *inst = bblock_end(block); + if (bblock_ends_with_control_flow(block)) +#ifdef __cplusplus + inst = (struct backend_instruction *)inst->prev; +#else + inst = (struct backend_instruction *)inst->link.prev; +#endif + return inst; +} + +#ifdef __cplusplus +inline backend_instruction * +bblock_t::start() +{ + return bblock_start(this); +} + +inline const backend_instruction * +bblock_t::start() const +{ + return bblock_start_const(this); +} + +inline backend_instruction * +bblock_t::end() +{ + return bblock_end(this); +} + +inline const backend_instruction * +bblock_t::end() const +{ + return bblock_end_const(this); +} + +inline bblock_t * +bblock_t::next() +{ + return bblock_next(this); +} + +inline const bblock_t * +bblock_t::next() const +{ + return bblock_next_const(this); +} + +inline bblock_t * +bblock_t::prev() +{ + return bblock_prev(this); +} + +inline const bblock_t * +bblock_t::prev() const +{ + return bblock_prev_const(this); +} + +inline bool +bblock_t::starts_with_control_flow() const +{ + return bblock_starts_with_control_flow(this); +} + +inline bool +bblock_t::ends_with_control_flow() const +{ + return bblock_ends_with_control_flow(this); +} + +inline backend_instruction * +bblock_t::first_non_control_flow_inst() +{ + return bblock_first_non_control_flow_inst(this); +} + +inline backend_instruction * +bblock_t::last_non_control_flow_inst() +{ + return bblock_last_non_control_flow_inst(this); +} +#endif + +struct cfg_t { +#ifdef __cplusplus + DECLARE_RALLOC_CXX_OPERATORS(cfg_t) + + cfg_t(exec_list *instructions); + ~cfg_t(); + + void remove_block(bblock_t *block); + + bblock_t *new_block(); + 
void set_next_block(bblock_t **cur, bblock_t *block, int ip); + void make_block_array(); + void calculate_idom(); + static bblock_t *intersect(bblock_t *b1, bblock_t *b2); + + void dump(backend_shader *s); + void dump_cfg(); + void dump_domtree(); +#endif + void *mem_ctx; + + /** Ordered list (by ip) of basic blocks */ + struct exec_list block_list; + struct bblock_t **blocks; + int num_blocks; + + bool idom_dirty; + + unsigned cycle_count; +}; + +/* Note that this is implemented with a double for loop -- break will + * break from the inner loop only! + */ +#define foreach_block_and_inst(__block, __type, __inst, __cfg) \ + foreach_block (__block, __cfg) \ + foreach_inst_in_block (__type, __inst, __block) + +/* Note that this is implemented with a double for loop -- break will + * break from the inner loop only! + */ +#define foreach_block_and_inst_safe(__block, __type, __inst, __cfg) \ + foreach_block_safe (__block, __cfg) \ + foreach_inst_in_block_safe (__type, __inst, __block) + +#define foreach_block(__block, __cfg) \ + foreach_list_typed (bblock_t, __block, link, &(__cfg)->block_list) + +#define foreach_block_reverse(__block, __cfg) \ + foreach_list_typed_reverse (bblock_t, __block, link, &(__cfg)->block_list) + +#define foreach_block_safe(__block, __cfg) \ + foreach_list_typed_safe (bblock_t, __block, link, &(__cfg)->block_list) + +#define foreach_block_reverse_safe(__block, __cfg) \ + foreach_list_typed_reverse_safe (bblock_t, __block, link, &(__cfg)->block_list) + +#define foreach_inst_in_block(__type, __inst, __block) \ + foreach_in_list(__type, __inst, &(__block)->instructions) + +#define foreach_inst_in_block_safe(__type, __inst, __block) \ + for (__type *__inst = (__type *)__block->instructions.head_sentinel.next, \ + *__next = (__type *)__inst->next; \ + __next != NULL; \ + __inst = __next, \ + __next = (__type *)__next->next) + +#define foreach_inst_in_block_reverse(__type, __inst, __block) \ + foreach_in_list_reverse(__type, __inst, 
&(__block)->instructions) + +#define foreach_inst_in_block_reverse_safe(__type, __inst, __block) \ + foreach_in_list_reverse_safe(__type, __inst, &(__block)->instructions) + +#define foreach_inst_in_block_starting_from(__type, __scan_inst, __inst) \ + for (__type *__scan_inst = (__type *)__inst->next; \ + !__scan_inst->is_tail_sentinel(); \ + __scan_inst = (__type *)__scan_inst->next) + +#define foreach_inst_in_block_reverse_starting_from(__type, __scan_inst, __inst) \ + for (__type *__scan_inst = (__type *)__inst->prev; \ + !__scan_inst->is_head_sentinel(); \ + __scan_inst = (__type *)__scan_inst->prev) + +#endif /* BRW_CFG_H */ diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c new file mode 100644 index 00000000000..cd9473f9a3b --- /dev/null +++ b/src/intel/compiler/brw_compiler.c @@ -0,0 +1,160 @@ +/* + * Copyright © 2015-2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_compiler.h" +#include "brw_shader.h" +#include "brw_eu.h" +#include "common/gen_debug.h" +#include "compiler/nir/nir.h" +#include "main/errors.h" +#include "util/debug.h" + +#define COMMON_OPTIONS \ + .lower_sub = true, \ + .lower_fdiv = true, \ + .lower_scmp = true, \ + .lower_fmod32 = true, \ + .lower_fmod64 = false, \ + .lower_bitfield_extract = true, \ + .lower_bitfield_insert = true, \ + .lower_uadd_carry = true, \ + .lower_usub_borrow = true, \ + .lower_fdiv = true, \ + .lower_flrp64 = true, \ + .native_integers = true, \ + .use_interpolated_input_intrinsics = true, \ + .vertex_id_zero_based = true + +static const struct nir_shader_compiler_options scalar_nir_options = { + COMMON_OPTIONS, + .lower_pack_half_2x16 = true, + .lower_pack_snorm_2x16 = true, + .lower_pack_snorm_4x8 = true, + .lower_pack_unorm_2x16 = true, + .lower_pack_unorm_4x8 = true, + .lower_unpack_half_2x16 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_snorm_4x8 = true, + .lower_unpack_unorm_2x16 = true, + .lower_unpack_unorm_4x8 = true, + .max_unroll_iterations = 32, +}; + +static const struct nir_shader_compiler_options vector_nir_options = { + COMMON_OPTIONS, + + /* In the vec4 backend, our dpN instruction replicates its result to all the + * components of a vec4. We would like NIR to give us replicated fdot + * instructions because it can optimize better for us. + */ + .fdot_replicates = true, + + /* Prior to Gen6, there are no three source operations for SIMD4x2. 
*/ + .lower_flrp32 = true, + + .lower_pack_snorm_2x16 = true, + .lower_pack_unorm_2x16 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_unorm_2x16 = true, + .lower_extract_byte = true, + .lower_extract_word = true, + .max_unroll_iterations = 32, +}; + +static const struct nir_shader_compiler_options vector_nir_options_gen6 = { + COMMON_OPTIONS, + + /* In the vec4 backend, our dpN instruction replicates its result to all the + * components of a vec4. We would like NIR to give us replicated fdot + * instructions because it can optimize better for us. + */ + .fdot_replicates = true, + + .lower_pack_snorm_2x16 = true, + .lower_pack_unorm_2x16 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_unorm_2x16 = true, + .lower_extract_byte = true, + .lower_extract_word = true, + .max_unroll_iterations = 32, +}; + +struct brw_compiler * +brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo) +{ + struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler); + + compiler->devinfo = devinfo; + + brw_fs_alloc_reg_sets(compiler); + brw_vec4_alloc_reg_set(compiler); + brw_init_compaction_tables(devinfo); + + compiler->precise_trig = env_var_as_boolean("INTEL_PRECISE_TRIG", false); + + compiler->scalar_stage[MESA_SHADER_VERTEX] = + devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS); + compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = + devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TCS", true); + compiler->scalar_stage[MESA_SHADER_TESS_EVAL] = + devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TES", true); + compiler->scalar_stage[MESA_SHADER_GEOMETRY] = + devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", true); + compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true; + compiler->scalar_stage[MESA_SHADER_COMPUTE] = true; + + /* We want the GLSL compiler to emit code that uses condition codes */ + for (int i = 0; i < MESA_SHADER_STAGES; i++) { + compiler->glsl_compiler_options[i].MaxUnrollIterations = 0; + 
compiler->glsl_compiler_options[i].MaxIfDepth = + devinfo->gen < 6 ? 16 : UINT_MAX; + + compiler->glsl_compiler_options[i].EmitNoIndirectInput = true; + compiler->glsl_compiler_options[i].EmitNoIndirectUniform = false; + + bool is_scalar = compiler->scalar_stage[i]; + + compiler->glsl_compiler_options[i].EmitNoIndirectOutput = is_scalar; + compiler->glsl_compiler_options[i].EmitNoIndirectTemp = is_scalar; + compiler->glsl_compiler_options[i].OptimizeForAOS = !is_scalar; + + if (is_scalar) { + compiler->glsl_compiler_options[i].NirOptions = &scalar_nir_options; + } else { + compiler->glsl_compiler_options[i].NirOptions = + devinfo->gen < 6 ? &vector_nir_options : &vector_nir_options_gen6; + } + + compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true; + compiler->glsl_compiler_options[i].ClampBlockIndicesToArrayBounds = true; + } + + compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false; + compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false; + compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectOutput = false; + + if (compiler->scalar_stage[MESA_SHADER_GEOMETRY]) + compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false; + + return compiler; +} diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h new file mode 100644 index 00000000000..85257d494af --- /dev/null +++ b/src/intel/compiler/brw_compiler.h @@ -0,0 +1,1057 @@ +/* + * Copyright © 2010 - 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + 
* + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#include <stdio.h> +#include "common/gen_device_info.h" +#include "main/mtypes.h" +#include "main/macros.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ra_regs; +struct nir_shader; +struct brw_program; +union gl_constant_value; + +struct brw_compiler { + const struct gen_device_info *devinfo; + + struct { + struct ra_regs *regs; + + /** + * Array of the ra classes for the unaligned contiguous register + * block sizes used. + */ + int *classes; + + /** + * Mapping for register-allocated objects in *regs to the first + * GRF for that object. + */ + uint8_t *ra_reg_to_grf; + } vec4_reg_set; + + struct { + struct ra_regs *regs; + + /** + * Array of the ra classes for the unaligned contiguous register + * block sizes used, indexed by register size. + */ + int classes[16]; + + /** + * Mapping from classes to ra_reg ranges. Each of the per-size + * classes corresponds to a range of ra_reg nodes. This array stores + * those ranges in the form of first ra_reg in each class and the + * total number of ra_reg elements in the last array element. This + * way the range of the i'th class is given by: + * [ class_to_ra_reg_range[i], class_to_ra_reg_range[i+1] ) + */ + int class_to_ra_reg_range[17]; + + /** + * Mapping for register-allocated objects in *regs to the first + * GRF for that object. 
+ */ + uint8_t *ra_reg_to_grf; + + /** + * ra class for the aligned pairs we use for PLN, which doesn't + * appear in *classes. + */ + int aligned_pairs_class; + } fs_reg_sets[3]; + + void (*shader_debug_log)(void *, const char *str, ...) PRINTFLIKE(2, 3); + void (*shader_perf_log)(void *, const char *str, ...) PRINTFLIKE(2, 3); + + bool scalar_stage[MESA_SHADER_STAGES]; + struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES]; + + /** + * Apply workarounds for SIN and COS output range problems. + * This can negatively impact performance. + */ + bool precise_trig; +}; + + +/** + * Program key structures. + * + * When drawing, we look for the currently bound shaders in the program + * cache. This is essentially a hash table lookup, and these are the keys. + * + * Sometimes OpenGL features specified as state need to be simulated via + * shader code, due to a mismatch between the API and the hardware. This + * is often referred to as "non-orthogonal state" or "NOS". We store NOS + * in the program key so it's considered when searching for a program. If + * we haven't seen a particular combination before, we have to recompile a + * new specialized version. + * + * Shader compilation should not look up state in gl_context directly, but + * instead use the copy in the program key. This guarantees recompiles will + * happen correctly. + * + * @{ + */ + +enum PACKED gen6_gather_sampler_wa { + WA_SIGN = 1, /* whether we need to sign extend */ + WA_8BIT = 2, /* if we have an 8bit format needing wa */ + WA_16BIT = 4, /* if we have a 16bit format needing wa */ +}; + +/** + * Sampler information needed by VS, WM, and GS program cache keys. + */ +struct brw_sampler_prog_key_data { + /** + * EXT_texture_swizzle and DEPTH_TEXTURE_MODE swizzles. + */ + uint16_t swizzles[MAX_SAMPLERS]; + + uint32_t gl_clamp_mask[3]; + + /** + * For RG32F, gather4's channel select is broken. 
+ */ + uint32_t gather_channel_quirk_mask; + + /** + * Whether this sampler uses the compressed multisample surface layout. + */ + uint32_t compressed_multisample_layout_mask; + + /** + * Whether this sampler is using 16x multisampling. If so fetching from + * this sampler will be handled with a different instruction, ld2dms_w + * instead of ld2dms. + */ + uint32_t msaa_16; + + /** + * For Sandybridge, which shader w/a we need for gather quirks. + */ + enum gen6_gather_sampler_wa gen6_gather_wa[MAX_SAMPLERS]; + + /** + * Texture units that have a YUV image bound. + */ + uint32_t y_u_v_image_mask; + uint32_t y_uv_image_mask; + uint32_t yx_xuxv_image_mask; +}; + +/** + * The VF can't natively handle certain types of attributes, such as GL_FIXED + * or most 10_10_10_2 types. These flags enable various VS workarounds to + * "fix" attributes at the beginning of shaders. + */ +#define BRW_ATTRIB_WA_COMPONENT_MASK 7 /* mask for GL_FIXED scale channel count */ +#define BRW_ATTRIB_WA_NORMALIZE 8 /* normalize in shader */ +#define BRW_ATTRIB_WA_BGRA 16 /* swap r/b channels in shader */ +#define BRW_ATTRIB_WA_SIGN 32 /* interpret as signed in shader */ +#define BRW_ATTRIB_WA_SCALE 64 /* interpret as scaled in shader */ + +/** The program key for Vertex Shaders. */ +struct brw_vs_prog_key { + unsigned program_string_id; + + /** + * Per-attribute workaround flags + * + * For each attribute, a combination of BRW_ATTRIB_WA_*. + */ + uint8_t gl_attrib_wa_flags[VERT_ATTRIB_MAX]; + + bool copy_edgeflag:1; + + bool clamp_vertex_color:1; + + /** + * How many user clipping planes are being uploaded to the vertex shader as + * push constants. + * + * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to + * clip distances. 
+ */ + unsigned nr_userclip_plane_consts:4; + + /** + * For pre-Gen6 hardware, a bitfield indicating which texture coordinates + * are going to be replaced with point coordinates (as a consequence of a + * call to glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)). Because + * our SF thread requires exact matching between VS outputs and FS inputs, + * these texture coordinates will need to be unconditionally included in + * the VUE, even if they aren't written by the vertex shader. + */ + uint8_t point_coord_replace; + + struct brw_sampler_prog_key_data tex; +}; + +/** The program key for Tessellation Control Shaders. */ +struct brw_tcs_prog_key +{ + unsigned program_string_id; + + GLenum tes_primitive_mode; + + unsigned input_vertices; + + /** A bitfield of per-patch outputs written. */ + uint32_t patch_outputs_written; + + /** A bitfield of per-vertex outputs written. */ + uint64_t outputs_written; + + bool quads_workaround; + + struct brw_sampler_prog_key_data tex; +}; + +/** The program key for Tessellation Evaluation Shaders. */ +struct brw_tes_prog_key +{ + unsigned program_string_id; + + /** A bitfield of per-patch inputs read. */ + uint32_t patch_inputs_read; + + /** A bitfield of per-vertex inputs read. */ + uint64_t inputs_read; + + struct brw_sampler_prog_key_data tex; +}; + +/** The program key for Geometry Shaders. */ +struct brw_gs_prog_key +{ + unsigned program_string_id; + + struct brw_sampler_prog_key_data tex; +}; + +/* A big lookup table is used to figure out which and how many + * additional regs will be inserted before the main payload in the WM + * program execution. These mainly relate to depth and stencil + * processing and the early-depth-test optimization. 
+ */ +enum brw_wm_iz_bits { + BRW_WM_IZ_PS_KILL_ALPHATEST_BIT = 0x1, + BRW_WM_IZ_PS_COMPUTES_DEPTH_BIT = 0x2, + BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT = 0x4, + BRW_WM_IZ_DEPTH_TEST_ENABLE_BIT = 0x8, + BRW_WM_IZ_STENCIL_WRITE_ENABLE_BIT = 0x10, + BRW_WM_IZ_STENCIL_TEST_ENABLE_BIT = 0x20, + BRW_WM_IZ_BIT_MAX = 0x40 +}; + +enum brw_wm_aa_enable { + BRW_WM_AA_NEVER, + BRW_WM_AA_SOMETIMES, + BRW_WM_AA_ALWAYS +}; + +/** The program key for Fragment/Pixel Shaders. */ +struct brw_wm_prog_key { + /* Some collection of BRW_WM_IZ_* */ + uint8_t iz_lookup; + bool stats_wm:1; + bool flat_shade:1; + unsigned nr_color_regions:5; + bool replicate_alpha:1; + bool clamp_fragment_color:1; + bool persample_interp:1; + bool multisample_fbo:1; + enum brw_wm_aa_enable line_aa:2; + bool high_quality_derivatives:1; + bool force_dual_color_blend:1; + bool coherent_fb_fetch:1; + + uint16_t drawable_height; + uint64_t input_slots_valid; + unsigned program_string_id; + GLenum alpha_test_func; /* < For Gen4/5 MRT alpha test */ + float alpha_test_ref; + + struct brw_sampler_prog_key_data tex; +}; + +struct brw_cs_prog_key { + uint32_t program_string_id; + struct brw_sampler_prog_key_data tex; +}; + +/* + * Image metadata structure as laid out in the shader parameter + * buffer. Entries have to be 16B-aligned for the vec4 back-end to be + * able to use them. That's okay because the padding and any unused + * entries [most of them except when we're doing untyped surface + * access] will be removed by the uniform packing pass. + */ +#define BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET 0 +#define BRW_IMAGE_PARAM_OFFSET_OFFSET 4 +#define BRW_IMAGE_PARAM_SIZE_OFFSET 8 +#define BRW_IMAGE_PARAM_STRIDE_OFFSET 12 +#define BRW_IMAGE_PARAM_TILING_OFFSET 16 +#define BRW_IMAGE_PARAM_SWIZZLING_OFFSET 20 +#define BRW_IMAGE_PARAM_SIZE 24 + +struct brw_image_param { + /** Surface binding table index. */ + uint32_t surface_idx; + + /** Offset applied to the X and Y surface coordinates. 
*/ + uint32_t offset[2]; + + /** Surface X, Y and Z dimensions. */ + uint32_t size[3]; + + /** X-stride in bytes, Y-stride in pixels, horizontal slice stride in + * pixels, vertical slice stride in pixels. + */ + uint32_t stride[4]; + + /** Log2 of the tiling modulus in the X, Y and Z dimension. */ + uint32_t tiling[3]; + + /** + * Right shift to apply for bit 6 address swizzling. Two different + * swizzles can be specified and will be applied one after the other. The + * resulting address will be: + * + * addr' = addr ^ ((1 << 6) & ((addr >> swizzling[0]) ^ + * (addr >> swizzling[1]))) + * + * Use \c 0xff if any of the swizzles is not required. + */ + uint32_t swizzling[2]; +}; + +/** Max number of render targets in a shader */ +#define BRW_MAX_DRAW_BUFFERS 8 + +/** + * Max number of binding table entries used for stream output. + * + * From the OpenGL 3.0 spec, table 6.44 (Transform Feedback State), the + * minimum value of MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS is 64. + * + * On Gen6, the size of transform feedback data is limited not by the number + * of components but by the number of binding table entries we set aside. We + * use one binding table entry for a float, one entry for a vector, and one + * entry per matrix column. Since the only way we can communicate our + * transform feedback capabilities to the client is via + * MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS, we need to plan for the + * worst case, in which all the varyings are floats, so we use up one binding + * table entry per component. Therefore we need to set aside at least 64 + * binding table entries for use by transform feedback. + * + * Note: since we don't currently pack varyings, it is currently impossible + * for the client to actually use up all of these binding table entries--if + * all of their varyings were floats, they would run out of varying slots and + * fail to link. 
But that's a bug, so it seems prudent to go ahead and + * allocate the number of binding table entries we will need once the bug is + * fixed. + */ +#define BRW_MAX_SOL_BINDINGS 64 + +/** + * Binding table index for the first gen6 SOL binding. + */ +#define BRW_GEN6_SOL_BINDING_START 0 + +/** + * Stride in bytes between shader_time entries. + * + * We separate entries by a cacheline to reduce traffic between EUs writing to + * different entries. + */ +#define BRW_SHADER_TIME_STRIDE 64 + +struct brw_stage_prog_data { + struct { + /** size of our binding table. */ + uint32_t size_bytes; + + /** @{ + * surface indices for the various groups of surfaces + */ + uint32_t pull_constants_start; + uint32_t texture_start; + uint32_t gather_texture_start; + uint32_t ubo_start; + uint32_t ssbo_start; + uint32_t abo_start; + uint32_t image_start; + uint32_t shader_time_start; + uint32_t plane_start[3]; + /** @} */ + } binding_table; + + GLuint nr_params; /**< number of float params/constants */ + GLuint nr_pull_params; + unsigned nr_image_params; + + unsigned curb_read_length; + unsigned total_scratch; + unsigned total_shared; + + /** + * Register where the thread expects to find input data from the URB + * (typically uniforms, followed by vertex or fragment attributes). + */ + unsigned dispatch_grf_start_reg; + + bool use_alt_mode; /**< Use ALT floating point mode? Otherwise, IEEE. */ + + /* Pointers to tracked values (only valid once + * _mesa_load_state_parameters has been called at runtime). + */ + const union gl_constant_value **param; + const union gl_constant_value **pull_param; + + /** Image metadata passed to the shader as uniforms. */ + struct brw_image_param *image_param; +}; + +static inline void +brw_mark_surface_used(struct brw_stage_prog_data *prog_data, + unsigned surf_index) +{ + /* A binding table index is 8 bits and the top 3 values are reserved for + * special things (stateless and SLM). 
+ */ + assert(surf_index <= 252); + + prog_data->binding_table.size_bytes = + MAX2(prog_data->binding_table.size_bytes, (surf_index + 1) * 4); +} + +/* Data about a particular attempt to compile a program. Note that + * there can be many of these, each in a different GL state + * corresponding to a different brw_wm_prog_key struct, with different + * compiled programs. + */ +struct brw_wm_prog_data { + struct brw_stage_prog_data base; + + GLuint num_varying_inputs; + + uint8_t reg_blocks_0; + uint8_t reg_blocks_2; + + uint8_t dispatch_grf_start_reg_2; + uint32_t prog_offset_2; + + struct { + /** @{ + * surface indices the WM-specific surfaces + */ + uint32_t render_target_start; + uint32_t render_target_read_start; + /** @} */ + } binding_table; + + uint8_t computed_depth_mode; + bool computed_stencil; + + bool early_fragment_tests; + bool post_depth_coverage; + bool inner_coverage; + bool dispatch_8; + bool dispatch_16; + bool dual_src_blend; + bool persample_dispatch; + bool uses_pos_offset; + bool uses_omask; + bool uses_kill; + bool uses_src_depth; + bool uses_src_w; + bool uses_sample_mask; + bool has_side_effects; + bool pulls_bary; + + bool contains_flat_varying; + bool contains_noperspective_varying; + + /** + * Mask of which interpolation modes are required by the fragment shader. + * Used in hardware setup on gen6+. + */ + uint32_t barycentric_interp_modes; + + /** + * Mask of which FS inputs are marked flat by the shader source. This is + * needed for setting up 3DSTATE_SF/SBE. + */ + uint32_t flat_inputs; + + /* Mapping of VUE slots to interpolation modes. + * Used by the Gen4-5 clip/sf/wm stages. + */ + unsigned char interp_mode[65]; /* BRW_VARYING_SLOT_COUNT */ + + /** + * Map from gl_varying_slot to the position within the FS setup data + * payload where the varying's attribute vertex deltas should be delivered. + * For varying slots that are not used by the FS, the value is -1. 
+ */ + int urb_setup[VARYING_SLOT_MAX]; +}; + +struct brw_push_const_block { + unsigned dwords; /* Dword count, not reg aligned */ + unsigned regs; + unsigned size; /* Bytes, register aligned */ +}; + +struct brw_cs_prog_data { + struct brw_stage_prog_data base; + + GLuint dispatch_grf_start_reg_16; + unsigned local_size[3]; + unsigned simd_size; + unsigned threads; + bool uses_barrier; + bool uses_num_work_groups; + int thread_local_id_index; + + struct { + struct brw_push_const_block cross_thread; + struct brw_push_const_block per_thread; + struct brw_push_const_block total; + } push; + + struct { + /** @{ + * surface indices the CS-specific surfaces + */ + uint32_t work_groups_start; + /** @} */ + } binding_table; +}; + +/** + * Enum representing the i965-specific vertex results that don't correspond + * exactly to any element of gl_varying_slot. The values of this enum are + * assigned such that they don't conflict with gl_varying_slot. + */ +typedef enum +{ + BRW_VARYING_SLOT_NDC = VARYING_SLOT_MAX, + BRW_VARYING_SLOT_PAD, + /** + * Technically this is not a varying but just a placeholder that + * compile_sf_prog() inserts into its VUE map to cause the gl_PointCoord + * builtin variable to be compiled correctly. see compile_sf_prog() for + * more info. + */ + BRW_VARYING_SLOT_PNTC, + BRW_VARYING_SLOT_COUNT +} brw_varying_slot; + +/** + * We always program SF to start reading at an offset of 1 (2 varying slots) + * from the start of the vertex URB entry. This causes it to skip: + * - VARYING_SLOT_PSIZ and BRW_VARYING_SLOT_NDC on gen4-5 + * - VARYING_SLOT_PSIZ and VARYING_SLOT_POS on gen6+ + */ +#define BRW_SF_URB_ENTRY_READ_OFFSET 1 + +/** + * Bitmask indicating which fragment shader inputs represent varyings (and + * hence have to be delivered to the fragment shader by the SF/SBE stage). 
+ */ +#define BRW_FS_VARYING_INPUT_MASK \ + (BITFIELD64_RANGE(0, VARYING_SLOT_MAX) & \ + ~VARYING_BIT_POS & ~VARYING_BIT_FACE) + +/** + * Data structure recording the relationship between the gl_varying_slot enum + * and "slots" within the vertex URB entry (VUE). A "slot" is defined as a + * single octaword within the VUE (128 bits). + * + * Note that each BRW register contains 256 bits (2 octawords), so when + * accessing the VUE in URB_NOSWIZZLE mode, each register corresponds to two + * consecutive VUE slots. When accessing the VUE in URB_INTERLEAVED mode (as + * in a vertex shader), each register corresponds to a single VUE slot, since + * it contains data for two separate vertices. + */ +struct brw_vue_map { + /** + * Bitfield representing all varying slots that are (a) stored in this VUE + * map, and (b) actually written by the shader. Does not include any of + * the additional varying slots defined in brw_varying_slot. + */ + uint64_t slots_valid; + + /** + * Is this VUE map for a separate shader pipeline? + * + * Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched + * without the linker having a chance to dead code eliminate unused varyings. + * + * This means that we have to use a fixed slot layout, based on the output's + * location field, rather than assigning slots in a compact contiguous block. + */ + bool separate; + + /** + * Map from gl_varying_slot value to VUE slot. For gl_varying_slots that are + * not stored in a slot (because they are not written, or because + * additional processing is applied before storing them in the VUE), the + * value is -1. + */ + signed char varying_to_slot[VARYING_SLOT_TESS_MAX]; + + /** + * Map from VUE slot to gl_varying_slot value. For slots that do not + * directly correspond to a gl_varying_slot, the value comes from + * brw_varying_slot. + * + * For slots that are not in use, the value is BRW_VARYING_SLOT_PAD. 
+ */ + signed char slot_to_varying[VARYING_SLOT_TESS_MAX]; + + /** + * Total number of VUE slots in use + */ + int num_slots; + + /** + * Number of per-patch VUE slots. Only valid for tessellation control + * shader outputs and tessellation evaluation shader inputs. + */ + int num_per_patch_slots; + + /** + * Number of per-vertex VUE slots. Only valid for tessellation control + * shader outputs and tessellation evaluation shader inputs. + */ + int num_per_vertex_slots; +}; + +void brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map); + +/** + * Convert a VUE slot number into a byte offset within the VUE. + */ +static inline GLuint brw_vue_slot_to_offset(GLuint slot) +{ + return 16*slot; +} + +/** + * Convert a vertex output (brw_varying_slot) into a byte offset within the + * VUE. + */ +static inline +GLuint brw_varying_to_offset(const struct brw_vue_map *vue_map, GLuint varying) +{ + return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]); +} + +void brw_compute_vue_map(const struct gen_device_info *devinfo, + struct brw_vue_map *vue_map, + uint64_t slots_valid, + bool separate_shader); + +void brw_compute_tess_vue_map(struct brw_vue_map *const vue_map, + uint64_t slots_valid, + uint32_t is_patch); + +/* brw_interpolation_map.c */ +void brw_setup_vue_interpolation(struct brw_vue_map *vue_map, + struct nir_shader *nir, + struct brw_wm_prog_data *prog_data, + const struct gen_device_info *devinfo); + +enum shader_dispatch_mode { + DISPATCH_MODE_4X1_SINGLE = 0, + DISPATCH_MODE_4X2_DUAL_INSTANCE = 1, + DISPATCH_MODE_4X2_DUAL_OBJECT = 2, + DISPATCH_MODE_SIMD8 = 3, +}; + +/** + * @defgroup Tessellator parameter enumerations. + * + * These correspond to the hardware values in 3DSTATE_TE, and are provided + * as part of the tessellation evaluation shader. 
+ * + * @{ + */ +enum brw_tess_partitioning { + BRW_TESS_PARTITIONING_INTEGER = 0, + BRW_TESS_PARTITIONING_ODD_FRACTIONAL = 1, + BRW_TESS_PARTITIONING_EVEN_FRACTIONAL = 2, +}; + +enum brw_tess_output_topology { + BRW_TESS_OUTPUT_TOPOLOGY_POINT = 0, + BRW_TESS_OUTPUT_TOPOLOGY_LINE = 1, + BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW = 2, + BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW = 3, +}; + +enum brw_tess_domain { + BRW_TESS_DOMAIN_QUAD = 0, + BRW_TESS_DOMAIN_TRI = 1, + BRW_TESS_DOMAIN_ISOLINE = 2, +}; +/** @} */ + +struct brw_vue_prog_data { + struct brw_stage_prog_data base; + struct brw_vue_map vue_map; + + /** Should the hardware deliver input VUE handles for URB pull loads? */ + bool include_vue_handles; + + GLuint urb_read_length; + GLuint total_grf; + + uint32_t clip_distance_mask; + uint32_t cull_distance_mask; + + /* Used for calculating urb partitions. In the VS, this is the size of the + * URB entry used for both input and output to the thread. In the GS, this + * is the size of the URB entry used for output. + */ + GLuint urb_entry_size; + + enum shader_dispatch_mode dispatch_mode; +}; + +struct brw_vs_prog_data { + struct brw_vue_prog_data base; + + GLbitfield64 inputs_read; + GLbitfield64 double_inputs_read; + + unsigned nr_attributes; + unsigned nr_attribute_slots; + + bool uses_vertexid; + bool uses_instanceid; + bool uses_basevertex; + bool uses_baseinstance; + bool uses_drawid; +}; + +struct brw_tcs_prog_data +{ + struct brw_vue_prog_data base; + + /** Number vertices in output patch */ + int instances; +}; + + +struct brw_tes_prog_data +{ + struct brw_vue_prog_data base; + + enum brw_tess_partitioning partitioning; + enum brw_tess_output_topology output_topology; + enum brw_tess_domain domain; +}; + +struct brw_gs_prog_data +{ + struct brw_vue_prog_data base; + + unsigned vertices_in; + + /** + * Size of an output vertex, measured in HWORDS (32 bytes). 
+ */ + unsigned output_vertex_size_hwords; + + unsigned output_topology; + + /** + * Size of the control data (cut bits or StreamID bits), in hwords (32 + * bytes). 0 if there is no control data. + */ + unsigned control_data_header_size_hwords; + + /** + * Format of the control data (either GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID + * if the control data is StreamID bits, or + * GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT if the control data is cut bits). + * Ignored if control_data_header_size is 0. + */ + unsigned control_data_format; + + bool include_primitive_id; + + /** + * The number of vertices emitted, if constant - otherwise -1. + */ + int static_vertex_count; + + int invocations; + + /** + * Gen6: Provoking vertex convention for odd-numbered triangles + * in tristrips. + */ + GLuint pv_first:1; + + /** + * Gen6: Number of varyings that are output to transform feedback. + */ + GLuint num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */ + + /** + * Gen6: Map from the index of a transform feedback binding table entry to the + * gl_varying_slot that should be streamed out through that binding table + * entry. + */ + unsigned char transform_feedback_bindings[64 /* BRW_MAX_SOL_BINDINGS */]; + + /** + * Gen6: Map from the index of a transform feedback binding table entry to the + * swizzles that should be used when streaming out data through that + * binding table entry. 
+ */ + unsigned char transform_feedback_swizzles[64 /* BRW_MAX_SOL_BINDINGS */]; +}; + +#define DEFINE_PROG_DATA_DOWNCAST(stage) \ +static inline struct brw_##stage##_prog_data * \ +brw_##stage##_prog_data(struct brw_stage_prog_data *prog_data) \ +{ \ + return (struct brw_##stage##_prog_data *) prog_data; \ +} +DEFINE_PROG_DATA_DOWNCAST(vue) +DEFINE_PROG_DATA_DOWNCAST(vs) +DEFINE_PROG_DATA_DOWNCAST(tcs) +DEFINE_PROG_DATA_DOWNCAST(tes) +DEFINE_PROG_DATA_DOWNCAST(gs) +DEFINE_PROG_DATA_DOWNCAST(wm) +DEFINE_PROG_DATA_DOWNCAST(cs) +DEFINE_PROG_DATA_DOWNCAST(ff_gs) +DEFINE_PROG_DATA_DOWNCAST(clip) +DEFINE_PROG_DATA_DOWNCAST(sf) +#undef DEFINE_PROG_DATA_DOWNCAST + +/** @} */ + +struct brw_compiler * +brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo); + +/** + * Compile a vertex shader. + * + * Returns the final assembly and the program's size. + */ +const unsigned * +brw_compile_vs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_vs_prog_key *key, + struct brw_vs_prog_data *prog_data, + const struct nir_shader *shader, + gl_clip_plane *clip_planes, + bool use_legacy_snorm_formula, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str); + +/** + * Compile a tessellation control shader. + * + * Returns the final assembly and the program's size. + */ +const unsigned * +brw_compile_tcs(const struct brw_compiler *compiler, + void *log_data, + void *mem_ctx, + const struct brw_tcs_prog_key *key, + struct brw_tcs_prog_data *prog_data, + const struct nir_shader *nir, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str); + +/** + * Compile a tessellation evaluation shader. + * + * Returns the final assembly and the program's size. 
+ */ +const unsigned * +brw_compile_tes(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_tes_prog_key *key, + const struct brw_vue_map *input_vue_map, + struct brw_tes_prog_data *prog_data, + const struct nir_shader *shader, + struct gl_program *prog, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str); + +/** + * Compile a geometry shader. + * + * Returns the final assembly and the program's size. + */ +const unsigned * +brw_compile_gs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_gs_prog_key *key, + struct brw_gs_prog_data *prog_data, + const struct nir_shader *shader, + struct gl_program *prog, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str); + +/** + * Compile a fragment shader. + * + * Returns the final assembly and the program's size. + */ +const unsigned * +brw_compile_fs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_wm_prog_key *key, + struct brw_wm_prog_data *prog_data, + const struct nir_shader *shader, + struct gl_program *prog, + int shader_time_index8, + int shader_time_index16, + bool allow_spilling, + bool use_rep_send, struct brw_vue_map *vue_map, + unsigned *final_assembly_size, + char **error_str); + +/** + * Compile a compute shader. + * + * Returns the final assembly and the program's size. 
+ */ +const unsigned * +brw_compile_cs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_cs_prog_key *key, + struct brw_cs_prog_data *prog_data, + const struct nir_shader *shader, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str); + +static inline uint32_t +encode_slm_size(unsigned gen, uint32_t bytes) +{ + uint32_t slm_size = 0; + + /* Shared Local Memory is specified as powers of two, and encoded in + * INTERFACE_DESCRIPTOR_DATA with the following representations: + * + * Size | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB | + * ------------------------------------------------------------------- + * Gen7-8 | 0 | none | none | 1 | 2 | 4 | 8 | 16 | + * ------------------------------------------------------------------- + * Gen9+ | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | + */ + assert(bytes <= 64 * 1024); + + if (bytes > 0) { + /* Shared Local Memory Size is specified as powers of two. */ + slm_size = util_next_power_of_two(bytes); + + if (gen >= 9) { + /* Use a minimum of 1kB; turn an exponent of 10 (1024 bytes) into 1. */ + slm_size = ffs(MAX2(slm_size, 1024)) - 10; + } else { + /* Use a minimum of 4kB; convert to the pre-Gen9 representation. */ + slm_size = MAX2(slm_size, 4096) / 4096; + } + } + + return slm_size; +} + +/** + * Return true if the given shader stage is dispatched contiguously by the + * relevant fixed function starting from channel 0 of the SIMD thread, which + * implies that the dispatch mask of a thread can be assumed to have the form + * '2^n - 1' for some n. 
+ */ +static inline bool +brw_stage_has_packed_dispatch(const struct gen_device_info *devinfo, + gl_shader_stage stage, + const struct brw_stage_prog_data *prog_data) +{ + /* The code below makes assumptions about the hardware's thread dispatch + * behavior that could be proven wrong in future generations -- Make sure + * to do a full test run with brw_fs_test_dispatch_packing() hooked up to + * the NIR front-end before changing this assertion. + */ + assert(devinfo->gen <= 9); + + switch (stage) { + case MESA_SHADER_FRAGMENT: { + /* The PSD discards subspans coming in with no lit samples, which in the + * per-pixel shading case implies that each subspan will either be fully + * lit (due to the VMask being used to allow derivative computations), + * or not dispatched at all. In per-sample dispatch mode individual + * samples from the same subspan have a fixed relative location within + * the SIMD thread, so dispatch of unlit samples cannot be avoided in + * general and we should return false. + */ + const struct brw_wm_prog_data *wm_prog_data = + (const struct brw_wm_prog_data *)prog_data; + return !wm_prog_data->persample_dispatch; + } + case MESA_SHADER_COMPUTE: + /* Compute shaders will be spawned with either a fully enabled dispatch + * mask or with whatever bottom/right execution mask was given to the + * GPGPU walker command to be used along the workgroup edges -- In both + * cases the dispatch mask is required to be tightly packed for our + * invocation index calculations to work. + */ + return true; + default: + /* Most remaining fixed functions are limited to use a packed dispatch + * mask due to the hardware representation of the dispatch mask as a + * single counter representing the number of enabled channels. 
+ */ + return true; + } +} + +#ifdef __cplusplus +} /* extern "C" */ +#endif diff --git a/src/intel/compiler/brw_dead_control_flow.cpp b/src/intel/compiler/brw_dead_control_flow.cpp new file mode 100644 index 00000000000..114dc6cb212 --- /dev/null +++ b/src/intel/compiler/brw_dead_control_flow.cpp @@ -0,0 +1,119 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_dead_control_flow.cpp + * + * This file implements the dead control flow elimination optimization pass. 
+ */ + +#include "brw_shader.h" +#include "brw_cfg.h" + +/* Look for and eliminate dead control flow: + * + * - if/endif + * - else in else/endif + * - then in if/else/endif + */ +bool +dead_control_flow_eliminate(backend_shader *s) +{ + bool progress = false; + + foreach_block_safe (block, s->cfg) { + bblock_t *prev_block = block->prev(); + + if (!prev_block) + continue; + + backend_instruction *const inst = block->start(); + backend_instruction *const prev_inst = prev_block->end(); + + /* ENDIF instructions, by definition, can only be found at the start of + * basic blocks. + */ + if (inst->opcode == BRW_OPCODE_ENDIF && + prev_inst->opcode == BRW_OPCODE_ELSE) { + bblock_t *const else_block = prev_block; + backend_instruction *const else_inst = prev_inst; + + else_inst->remove(else_block); + progress = true; + } else if (inst->opcode == BRW_OPCODE_ENDIF && + prev_inst->opcode == BRW_OPCODE_IF) { + bblock_t *const endif_block = block; + bblock_t *const if_block = prev_block; + backend_instruction *const endif_inst = inst; + backend_instruction *const if_inst = prev_inst; + + bblock_t *earlier_block = NULL, *later_block = NULL; + + if (if_block->start_ip == if_block->end_ip) { + earlier_block = if_block->prev(); + } else { + earlier_block = if_block; + } + if_inst->remove(if_block); + + if (endif_block->start_ip == endif_block->end_ip) { + later_block = endif_block->next(); + } else { + later_block = endif_block; + } + endif_inst->remove(endif_block); + + assert((earlier_block == NULL) == (later_block == NULL)); + if (earlier_block && earlier_block->can_combine_with(later_block)) { + earlier_block->combine_with(later_block); + + /* If ENDIF was in its own block, then we've now deleted it and + * merged the two surrounding blocks, the latter of which the + * __next block pointer was pointing to. 
+ */ + if (endif_block != later_block) { + __next = earlier_block->next(); + } + } + + progress = true; + } else if (inst->opcode == BRW_OPCODE_ELSE && + prev_inst->opcode == BRW_OPCODE_IF) { + bblock_t *const else_block = block; + backend_instruction *const if_inst = prev_inst; + backend_instruction *const else_inst = inst; + + /* Since the else-branch is becoming the new then-branch, the + * condition has to be inverted. + */ + if_inst->predicate_inverse = !if_inst->predicate_inverse; + else_inst->remove(else_block); + + progress = true; + } + } + + if (progress) + s->invalidate_live_intervals(); + + return progress; +} diff --git a/src/intel/compiler/brw_dead_control_flow.h b/src/intel/compiler/brw_dead_control_flow.h new file mode 100644 index 00000000000..83fd9b1e79e --- /dev/null +++ b/src/intel/compiler/brw_dead_control_flow.h @@ -0,0 +1,26 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_shader.h" + +bool dead_control_flow_eliminate(backend_shader *s); diff --git a/src/intel/compiler/brw_disasm.c b/src/intel/compiler/brw_disasm.c new file mode 100644 index 00000000000..536a003dcbe --- /dev/null +++ b/src/intel/compiler/brw_disasm.c @@ -0,0 +1,1646 @@ +/* + * Copyright © 2008 Keith Packard + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that copyright + * notice and this permission notice appear in supporting documentation, and + * that the name of the copyright holders not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. The copyright holders make no representations + * about the suitability of this software for any purpose. It is provided "as + * is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THIS SOFTWARE. 
+ */ + +#include <stdio.h> +#include <string.h> +#include <stdarg.h> + +#include "brw_eu_defines.h" +#include "brw_inst.h" +#include "brw_shader.h" +#include "brw_reg.h" +#include "brw_inst.h" +#include "brw_eu.h" + +static bool +has_jip(const struct gen_device_info *devinfo, enum opcode opcode) +{ + if (devinfo->gen < 6) + return false; + + return opcode == BRW_OPCODE_IF || + opcode == BRW_OPCODE_ELSE || + opcode == BRW_OPCODE_ENDIF || + opcode == BRW_OPCODE_WHILE || + opcode == BRW_OPCODE_BREAK || + opcode == BRW_OPCODE_CONTINUE || + opcode == BRW_OPCODE_HALT; +} + +static bool +has_uip(const struct gen_device_info *devinfo, enum opcode opcode) +{ + if (devinfo->gen < 6) + return false; + + return (devinfo->gen >= 7 && opcode == BRW_OPCODE_IF) || + (devinfo->gen >= 8 && opcode == BRW_OPCODE_ELSE) || + opcode == BRW_OPCODE_BREAK || + opcode == BRW_OPCODE_CONTINUE || + opcode == BRW_OPCODE_HALT; +} + +static bool +has_branch_ctrl(const struct gen_device_info *devinfo, enum opcode opcode) +{ + if (devinfo->gen < 8) + return false; + + return opcode == BRW_OPCODE_IF || + opcode == BRW_OPCODE_ELSE; + /* opcode == BRW_OPCODE_GOTO; */ +} + +static bool +is_logic_instruction(unsigned opcode) +{ + return opcode == BRW_OPCODE_AND || + opcode == BRW_OPCODE_NOT || + opcode == BRW_OPCODE_OR || + opcode == BRW_OPCODE_XOR; +} + +const char *const conditional_modifier[16] = { + [BRW_CONDITIONAL_NONE] = "", + [BRW_CONDITIONAL_Z] = ".z", + [BRW_CONDITIONAL_NZ] = ".nz", + [BRW_CONDITIONAL_G] = ".g", + [BRW_CONDITIONAL_GE] = ".ge", + [BRW_CONDITIONAL_L] = ".l", + [BRW_CONDITIONAL_LE] = ".le", + [BRW_CONDITIONAL_R] = ".r", + [BRW_CONDITIONAL_O] = ".o", + [BRW_CONDITIONAL_U] = ".u", +}; + +static const char *const m_negate[2] = { + [0] = "", + [1] = "-", +}; + +static const char *const _abs[2] = { + [0] = "", + [1] = "(abs)", +}; + +static const char *const m_bitnot[2] = { "", "~" }; + +static const char *const vert_stride[16] = { + [0] = "0", + [1] = "1", + [2] = "2", + [3] = "4", + 
[4] = "8", + [5] = "16", + [6] = "32", + [15] = "VxH", +}; + +static const char *const width[8] = { + [0] = "1", + [1] = "2", + [2] = "4", + [3] = "8", + [4] = "16", +}; + +static const char *const horiz_stride[4] = { + [0] = "0", + [1] = "1", + [2] = "2", + [3] = "4" +}; + +static const char *const chan_sel[4] = { + [0] = "x", + [1] = "y", + [2] = "z", + [3] = "w", +}; + +static const char *const debug_ctrl[2] = { + [0] = "", + [1] = ".breakpoint" +}; + +static const char *const saturate[2] = { + [0] = "", + [1] = ".sat" +}; + +static const char *const cmpt_ctrl[2] = { + [0] = "", + [1] = "compacted" +}; + +static const char *const accwr[2] = { + [0] = "", + [1] = "AccWrEnable" +}; + +static const char *const branch_ctrl[2] = { + [0] = "", + [1] = "BranchCtrl" +}; + +static const char *const wectrl[2] = { + [0] = "", + [1] = "WE_all" +}; + +static const char *const exec_size[8] = { + [0] = "1", + [1] = "2", + [2] = "4", + [3] = "8", + [4] = "16", + [5] = "32" +}; + +static const char *const pred_inv[2] = { + [0] = "+", + [1] = "-" +}; + +const char *const pred_ctrl_align16[16] = { + [1] = "", + [2] = ".x", + [3] = ".y", + [4] = ".z", + [5] = ".w", + [6] = ".any4h", + [7] = ".all4h", +}; + +static const char *const pred_ctrl_align1[16] = { + [BRW_PREDICATE_NORMAL] = "", + [BRW_PREDICATE_ALIGN1_ANYV] = ".anyv", + [BRW_PREDICATE_ALIGN1_ALLV] = ".allv", + [BRW_PREDICATE_ALIGN1_ANY2H] = ".any2h", + [BRW_PREDICATE_ALIGN1_ALL2H] = ".all2h", + [BRW_PREDICATE_ALIGN1_ANY4H] = ".any4h", + [BRW_PREDICATE_ALIGN1_ALL4H] = ".all4h", + [BRW_PREDICATE_ALIGN1_ANY8H] = ".any8h", + [BRW_PREDICATE_ALIGN1_ALL8H] = ".all8h", + [BRW_PREDICATE_ALIGN1_ANY16H] = ".any16h", + [BRW_PREDICATE_ALIGN1_ALL16H] = ".all16h", + [BRW_PREDICATE_ALIGN1_ANY32H] = ".any32h", + [BRW_PREDICATE_ALIGN1_ALL32H] = ".all32h", +}; + +static const char *const thread_ctrl[4] = { + [BRW_THREAD_NORMAL] = "", + [BRW_THREAD_ATOMIC] = "atomic", + [BRW_THREAD_SWITCH] = "switch", +}; + +static const char *const 
compr_ctrl[4] = { + [0] = "", + [1] = "sechalf", + [2] = "compr", + [3] = "compr4", +}; + +static const char *const dep_ctrl[4] = { + [0] = "", + [1] = "NoDDClr", + [2] = "NoDDChk", + [3] = "NoDDClr,NoDDChk", +}; + +static const char *const mask_ctrl[4] = { + [0] = "", + [1] = "nomask", +}; + +static const char *const access_mode[2] = { + [0] = "align1", + [1] = "align16", +}; + +static const char * const reg_encoding[] = { + [BRW_HW_REG_TYPE_UD] = "UD", + [BRW_HW_REG_TYPE_D] = "D", + [BRW_HW_REG_TYPE_UW] = "UW", + [BRW_HW_REG_TYPE_W] = "W", + [BRW_HW_REG_NON_IMM_TYPE_UB] = "UB", + [BRW_HW_REG_NON_IMM_TYPE_B] = "B", + [GEN7_HW_REG_NON_IMM_TYPE_DF] = "DF", + [BRW_HW_REG_TYPE_F] = "F", + [GEN8_HW_REG_TYPE_UQ] = "UQ", + [GEN8_HW_REG_TYPE_Q] = "Q", + [GEN8_HW_REG_NON_IMM_TYPE_HF] = "HF", +}; + +static const char *const three_source_reg_encoding[] = { + [BRW_3SRC_TYPE_F] = "F", + [BRW_3SRC_TYPE_D] = "D", + [BRW_3SRC_TYPE_UD] = "UD", + [BRW_3SRC_TYPE_DF] = "DF", +}; + +static const char *const reg_file[4] = { + [0] = "A", + [1] = "g", + [2] = "m", + [3] = "imm", +}; + +static const char *const writemask[16] = { + [0x0] = ".", + [0x1] = ".x", + [0x2] = ".y", + [0x3] = ".xy", + [0x4] = ".z", + [0x5] = ".xz", + [0x6] = ".yz", + [0x7] = ".xyz", + [0x8] = ".w", + [0x9] = ".xw", + [0xa] = ".yw", + [0xb] = ".xyw", + [0xc] = ".zw", + [0xd] = ".xzw", + [0xe] = ".yzw", + [0xf] = "", +}; + +static const char *const end_of_thread[2] = { + [0] = "", + [1] = "EOT" +}; + +/* SFIDs on Gen4-5 */ +static const char *const gen4_sfid[16] = { + [BRW_SFID_NULL] = "null", + [BRW_SFID_MATH] = "math", + [BRW_SFID_SAMPLER] = "sampler", + [BRW_SFID_MESSAGE_GATEWAY] = "gateway", + [BRW_SFID_DATAPORT_READ] = "read", + [BRW_SFID_DATAPORT_WRITE] = "write", + [BRW_SFID_URB] = "urb", + [BRW_SFID_THREAD_SPAWNER] = "thread_spawner", + [BRW_SFID_VME] = "vme", +}; + +static const char *const gen6_sfid[16] = { + [BRW_SFID_NULL] = "null", + [BRW_SFID_MATH] = "math", + [BRW_SFID_SAMPLER] = "sampler", + 
[BRW_SFID_MESSAGE_GATEWAY] = "gateway", + [BRW_SFID_URB] = "urb", + [BRW_SFID_THREAD_SPAWNER] = "thread_spawner", + [GEN6_SFID_DATAPORT_SAMPLER_CACHE] = "sampler", + [GEN6_SFID_DATAPORT_RENDER_CACHE] = "render", + [GEN6_SFID_DATAPORT_CONSTANT_CACHE] = "const", + [GEN7_SFID_DATAPORT_DATA_CACHE] = "data", + [GEN7_SFID_PIXEL_INTERPOLATOR] = "pixel interp", + [HSW_SFID_DATAPORT_DATA_CACHE_1] = "dp data 1", + [HSW_SFID_CRE] = "cre", +}; + +static const char *const gen7_gateway_subfuncid[8] = { + [BRW_MESSAGE_GATEWAY_SFID_OPEN_GATEWAY] = "open", + [BRW_MESSAGE_GATEWAY_SFID_CLOSE_GATEWAY] = "close", + [BRW_MESSAGE_GATEWAY_SFID_FORWARD_MSG] = "forward msg", + [BRW_MESSAGE_GATEWAY_SFID_GET_TIMESTAMP] = "get timestamp", + [BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG] = "barrier msg", + [BRW_MESSAGE_GATEWAY_SFID_UPDATE_GATEWAY_STATE] = "update state", + [BRW_MESSAGE_GATEWAY_SFID_MMIO_READ_WRITE] = "mmio read/write", +}; + +static const char *const gen4_dp_read_port_msg_type[4] = { + [0b00] = "OWord Block Read", + [0b01] = "OWord Dual Block Read", + [0b10] = "Media Block Read", + [0b11] = "DWord Scattered Read", +}; + +static const char *const g45_dp_read_port_msg_type[8] = { + [0b000] = "OWord Block Read", + [0b010] = "OWord Dual Block Read", + [0b100] = "Media Block Read", + [0b110] = "DWord Scattered Read", + [0b001] = "Render Target UNORM Read", + [0b011] = "AVC Loop Filter Read", +}; + +static const char *const dp_write_port_msg_type[8] = { + [0b000] = "OWord block write", + [0b001] = "OWord dual block write", + [0b010] = "media block write", + [0b011] = "DWord scattered write", + [0b100] = "RT write", + [0b101] = "streamed VB write", + [0b110] = "RT UNORM write", /* G45+ */ + [0b111] = "flush render cache", +}; + +static const char *const dp_rc_msg_type_gen6[16] = { + [BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ] = "OWORD block read", + [GEN6_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ] = "RT UNORM read", + [GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ] = "OWORD dual block 
read", + [GEN6_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ] = "media block read", + [GEN6_DATAPORT_READ_MESSAGE_OWORD_UNALIGN_BLOCK_READ] = + "OWORD unaligned block read", + [GEN6_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ] = "DWORD scattered read", + [GEN6_DATAPORT_WRITE_MESSAGE_DWORD_ATOMIC_WRITE] = "DWORD atomic write", + [GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE] = "OWORD block write", + [GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE] = + "OWORD dual block write", + [GEN6_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE] = "media block write", + [GEN6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE] = + "DWORD scattered write", + [GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE] = "RT write", + [GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE] = "streamed VB write", + [GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_UNORM_WRITE] = "RT UNORM write", +}; + +static const char *const dp_rc_msg_type_gen7[16] = { + [GEN7_DATAPORT_RC_MEDIA_BLOCK_READ] = "media block read", + [GEN7_DATAPORT_RC_TYPED_SURFACE_READ] = "typed surface read", + [GEN7_DATAPORT_RC_TYPED_ATOMIC_OP] = "typed atomic op", + [GEN7_DATAPORT_RC_MEMORY_FENCE] = "memory fence", + [GEN7_DATAPORT_RC_MEDIA_BLOCK_WRITE] = "media block write", + [GEN7_DATAPORT_RC_RENDER_TARGET_WRITE] = "RT write", + [GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE] = "typed surface write" +}; + +static const char *const dp_rc_msg_type_gen9[16] = { + [GEN9_DATAPORT_RC_RENDER_TARGET_WRITE] = "RT write", + [GEN9_DATAPORT_RC_RENDER_TARGET_READ] = "RT read" +}; + +static const char *const * +dp_rc_msg_type(const struct gen_device_info *devinfo) +{ + return (devinfo->gen >= 9 ? dp_rc_msg_type_gen9 : + devinfo->gen >= 7 ? dp_rc_msg_type_gen7 : + devinfo->gen >= 6 ? 
dp_rc_msg_type_gen6 : + dp_write_port_msg_type); +} + +static const char *const m_rt_write_subtype[] = { + [0b000] = "SIMD16", + [0b001] = "SIMD16/RepData", + [0b010] = "SIMD8/DualSrcLow", + [0b011] = "SIMD8/DualSrcHigh", + [0b100] = "SIMD8", + [0b101] = "SIMD8/ImageWrite", /* Gen6+ */ + [0b111] = "SIMD16/RepData-111", /* no idea how this is different than 1 */ +}; + +static const char *const dp_dc0_msg_type_gen7[16] = { + [GEN7_DATAPORT_DC_OWORD_BLOCK_READ] = "DC OWORD block read", + [GEN7_DATAPORT_DC_UNALIGNED_OWORD_BLOCK_READ] = + "DC unaligned OWORD block read", + [GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_READ] = "DC OWORD dual block read", + [GEN7_DATAPORT_DC_DWORD_SCATTERED_READ] = "DC DWORD scattered read", + [GEN7_DATAPORT_DC_BYTE_SCATTERED_READ] = "DC byte scattered read", + [GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ] = "DC untyped surface read", + [GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP] = "DC untyped atomic", + [GEN7_DATAPORT_DC_MEMORY_FENCE] = "DC mfence", + [GEN7_DATAPORT_DC_OWORD_BLOCK_WRITE] = "DC OWORD block write", + [GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE] = "DC OWORD dual block write", + [GEN7_DATAPORT_DC_DWORD_SCATTERED_WRITE] = "DC DWORD scatterd write", + [GEN7_DATAPORT_DC_BYTE_SCATTERED_WRITE] = "DC byte scattered write", + [GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE] = "DC untyped surface write", +}; + +static const char *const dp_dc1_msg_type_hsw[16] = { + [HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ] = "untyped surface read", + [HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP] = "DC untyped atomic op", + [HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2] = + "DC untyped 4x2 atomic op", + [HSW_DATAPORT_DC_PORT1_MEDIA_BLOCK_READ] = "DC media block read", + [HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ] = "DC typed surface read", + [HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP] = "DC typed atomic", + [HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2] = "DC typed 4x2 atomic op", + [HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE] = "DC untyped surface write", + 
[HSW_DATAPORT_DC_PORT1_MEDIA_BLOCK_WRITE] = "DC media block write", + [HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP] = "DC atomic counter op", + [HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2] = + "DC 4x2 atomic counter op", + [HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE] = "DC typed surface write", +}; + +static const char *const aop[16] = { + [BRW_AOP_AND] = "and", + [BRW_AOP_OR] = "or", + [BRW_AOP_XOR] = "xor", + [BRW_AOP_MOV] = "mov", + [BRW_AOP_INC] = "inc", + [BRW_AOP_DEC] = "dec", + [BRW_AOP_ADD] = "add", + [BRW_AOP_SUB] = "sub", + [BRW_AOP_REVSUB] = "revsub", + [BRW_AOP_IMAX] = "imax", + [BRW_AOP_IMIN] = "imin", + [BRW_AOP_UMAX] = "umax", + [BRW_AOP_UMIN] = "umin", + [BRW_AOP_CMPWR] = "cmpwr", + [BRW_AOP_PREDEC] = "predec", +}; + +static const char * const pixel_interpolator_msg_types[4] = { + [GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET] = "per_message_offset", + [GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE] = "sample_position", + [GEN7_PIXEL_INTERPOLATOR_LOC_CENTROID] = "centroid", + [GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET] = "per_slot_offset", +}; + +static const char *const math_function[16] = { + [BRW_MATH_FUNCTION_INV] = "inv", + [BRW_MATH_FUNCTION_LOG] = "log", + [BRW_MATH_FUNCTION_EXP] = "exp", + [BRW_MATH_FUNCTION_SQRT] = "sqrt", + [BRW_MATH_FUNCTION_RSQ] = "rsq", + [BRW_MATH_FUNCTION_SIN] = "sin", + [BRW_MATH_FUNCTION_COS] = "cos", + [BRW_MATH_FUNCTION_SINCOS] = "sincos", + [BRW_MATH_FUNCTION_FDIV] = "fdiv", + [BRW_MATH_FUNCTION_POW] = "pow", + [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER] = "intdivmod", + [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT] = "intdiv", + [BRW_MATH_FUNCTION_INT_DIV_REMAINDER] = "intmod", + [GEN8_MATH_FUNCTION_INVM] = "invm", + [GEN8_MATH_FUNCTION_RSQRTM] = "rsqrtm", +}; + +static const char *const math_saturate[2] = { + [0] = "", + [1] = "sat" +}; + +static const char *const math_signed[2] = { + [0] = "", + [1] = "signed" +}; + +static const char *const math_scalar[2] = { + [0] = "", + [1] = "scalar" +}; + +static const char *const 
math_precision[2] = { + [0] = "", + [1] = "partial_precision" +}; + +static const char *const gen5_urb_opcode[] = { + [0] = "urb_write", + [1] = "ff_sync", +}; + +static const char *const gen7_urb_opcode[] = { + [BRW_URB_OPCODE_WRITE_HWORD] = "write HWord", + [BRW_URB_OPCODE_WRITE_OWORD] = "write OWord", + [BRW_URB_OPCODE_READ_HWORD] = "read HWord", + [BRW_URB_OPCODE_READ_OWORD] = "read OWord", + [GEN7_URB_OPCODE_ATOMIC_MOV] = "atomic mov", /* Gen7+ */ + [GEN7_URB_OPCODE_ATOMIC_INC] = "atomic inc", /* Gen7+ */ + [GEN8_URB_OPCODE_ATOMIC_ADD] = "atomic add", /* Gen8+ */ + [GEN8_URB_OPCODE_SIMD8_WRITE] = "SIMD8 write", /* Gen8+ */ + [GEN8_URB_OPCODE_SIMD8_READ] = "SIMD8 read", /* Gen8+ */ + /* [9-15] - reserved */ +}; + +static const char *const urb_swizzle[4] = { + [BRW_URB_SWIZZLE_NONE] = "", + [BRW_URB_SWIZZLE_INTERLEAVE] = "interleave", + [BRW_URB_SWIZZLE_TRANSPOSE] = "transpose", +}; + +static const char *const urb_allocate[2] = { + [0] = "", + [1] = "allocate" +}; + +static const char *const urb_used[2] = { + [0] = "", + [1] = "used" +}; + +static const char *const urb_complete[2] = { + [0] = "", + [1] = "complete" +}; + +static const char *const gen5_sampler_msg_type[] = { + [GEN5_SAMPLER_MESSAGE_SAMPLE] = "sample", + [GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS] = "sample_b", + [GEN5_SAMPLER_MESSAGE_SAMPLE_LOD] = "sample_l", + [GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE] = "sample_c", + [GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS] = "sample_d", + [GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE] = "sample_b_c", + [GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE] = "sample_l_c", + [GEN5_SAMPLER_MESSAGE_SAMPLE_LD] = "ld", + [GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4] = "gather4", + [GEN5_SAMPLER_MESSAGE_LOD] = "lod", + [GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO] = "resinfo", + [GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO] = "sampleinfo", + [GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C] = "gather4_c", + [GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO] = "gather4_po", + [GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C] = "gather4_po_c", + 
[HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE] = "sample_d_c", + [GEN9_SAMPLER_MESSAGE_SAMPLE_LZ] = "sample_lz", + [GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ] = "sample_c_lz", + [GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ] = "ld_lz", + [GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W] = "ld2dms_w", + [GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS] = "ld_mcs", + [GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS] = "ld2dms", + [GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS] = "ld2dss", +}; + +static const char *const gen5_sampler_simd_mode[4] = { + [BRW_SAMPLER_SIMD_MODE_SIMD4X2] = "SIMD4x2", + [BRW_SAMPLER_SIMD_MODE_SIMD8] = "SIMD8", + [BRW_SAMPLER_SIMD_MODE_SIMD16] = "SIMD16", + [BRW_SAMPLER_SIMD_MODE_SIMD32_64] = "SIMD32/64", +}; + +static const char *const sampler_target_format[4] = { + [0] = "F", + [2] = "UD", + [3] = "D" +}; + + +static int column; + +static int +string(FILE *file, const char *string) +{ + fputs(string, file); + column += strlen(string); + return 0; +} + +static int +format(FILE *f, const char *format, ...) PRINTFLIKE(2, 3); + +static int +format(FILE *f, const char *format, ...) 
+{ + char buf[1024]; + va_list args; + va_start(args, format); + + vsnprintf(buf, sizeof(buf) - 1, format, args); + va_end(args); + string(f, buf); + return 0; +} + +static int +newline(FILE *f) +{ + putc('\n', f); + column = 0; + return 0; +} + +static int +pad(FILE *f, int c) +{ + do + string(f, " "); + while (column < c); + return 0; +} + +static int +control(FILE *file, const char *name, const char *const ctrl[], + unsigned id, int *space) +{ + if (!ctrl[id]) { + fprintf(file, "*** invalid %s value %d ", name, id); + return 1; + } + if (ctrl[id][0]) { + if (space && *space) + string(file, " "); + string(file, ctrl[id]); + if (space) + *space = 1; + } + return 0; +} + +static int +print_opcode(FILE *file, const struct gen_device_info *devinfo, + enum opcode id) +{ + const struct opcode_desc *desc = brw_opcode_desc(devinfo, id); + if (!desc) { + format(file, "*** invalid opcode value %d ", id); + return 1; + } + string(file, desc->name); + return 0; +} + +static int +reg(FILE *file, unsigned _reg_file, unsigned _reg_nr) +{ + int err = 0; + + /* Clear the Compr4 instruction compression bit. 
*/ + if (_reg_file == BRW_MESSAGE_REGISTER_FILE) + _reg_nr &= ~BRW_MRF_COMPR4; + + if (_reg_file == BRW_ARCHITECTURE_REGISTER_FILE) { + switch (_reg_nr & 0xf0) { + case BRW_ARF_NULL: + string(file, "null"); + break; + case BRW_ARF_ADDRESS: + format(file, "a%d", _reg_nr & 0x0f); + break; + case BRW_ARF_ACCUMULATOR: + format(file, "acc%d", _reg_nr & 0x0f); + break; + case BRW_ARF_FLAG: + format(file, "f%d", _reg_nr & 0x0f); + break; + case BRW_ARF_MASK: + format(file, "mask%d", _reg_nr & 0x0f); + break; + case BRW_ARF_MASK_STACK: + format(file, "msd%d", _reg_nr & 0x0f); + break; + case BRW_ARF_STATE: + format(file, "sr%d", _reg_nr & 0x0f); + break; + case BRW_ARF_CONTROL: + format(file, "cr%d", _reg_nr & 0x0f); + break; + case BRW_ARF_NOTIFICATION_COUNT: + format(file, "n%d", _reg_nr & 0x0f); + break; + case BRW_ARF_IP: + string(file, "ip"); + return -1; + break; + case BRW_ARF_TDR: + format(file, "tdr0"); + return -1; + case BRW_ARF_TIMESTAMP: + format(file, "tm%d", _reg_nr & 0x0f); + break; + default: + format(file, "ARF%d", _reg_nr); + break; + } + } else { + err |= control(file, "src reg file", reg_file, _reg_file, NULL); + format(file, "%d", _reg_nr); + } + return err; +} + +static int +dest(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst) +{ + unsigned elem_size = brw_element_size(devinfo, inst, dst); + int err = 0; + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + if (brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) { + err |= reg(file, brw_inst_dst_reg_file(devinfo, inst), + brw_inst_dst_da_reg_nr(devinfo, inst)); + if (err == -1) + return 0; + if (brw_inst_dst_da1_subreg_nr(devinfo, inst)) + format(file, ".%"PRIu64, brw_inst_dst_da1_subreg_nr(devinfo, inst) / + elem_size); + string(file, "<"); + err |= control(file, "horiz stride", horiz_stride, + brw_inst_dst_hstride(devinfo, inst), NULL); + string(file, ">"); + err |= control(file, "dest reg encoding", reg_encoding, + brw_inst_dst_reg_type(devinfo, inst), 
NULL); + } else { + string(file, "g[a0"); + if (brw_inst_dst_ia_subreg_nr(devinfo, inst)) + format(file, ".%"PRIu64, brw_inst_dst_ia_subreg_nr(devinfo, inst) / + elem_size); + if (brw_inst_dst_ia1_addr_imm(devinfo, inst)) + format(file, " %d", brw_inst_dst_ia1_addr_imm(devinfo, inst)); + string(file, "]<"); + err |= control(file, "horiz stride", horiz_stride, + brw_inst_dst_hstride(devinfo, inst), NULL); + string(file, ">"); + err |= control(file, "dest reg encoding", reg_encoding, + brw_inst_dst_reg_type(devinfo, inst), NULL); + } + } else { + if (brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) { + err |= reg(file, brw_inst_dst_reg_file(devinfo, inst), + brw_inst_dst_da_reg_nr(devinfo, inst)); + if (err == -1) + return 0; + if (brw_inst_dst_da16_subreg_nr(devinfo, inst)) + format(file, ".%u", 16 / elem_size); + string(file, "<1>"); + err |= control(file, "writemask", writemask, + brw_inst_da16_writemask(devinfo, inst), NULL); + err |= control(file, "dest reg encoding", reg_encoding, + brw_inst_dst_reg_type(devinfo, inst), NULL); + } else { + err = 1; + string(file, "Indirect align16 address mode not supported"); + } + } + + return 0; +} + +static int +dest_3src(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst) +{ + int err = 0; + uint32_t reg_file; + + if (devinfo->gen == 6 && brw_inst_3src_dst_reg_file(devinfo, inst)) + reg_file = BRW_MESSAGE_REGISTER_FILE; + else + reg_file = BRW_GENERAL_REGISTER_FILE; + + err |= reg(file, reg_file, brw_inst_3src_dst_reg_nr(devinfo, inst)); + if (err == -1) + return 0; + if (brw_inst_3src_dst_subreg_nr(devinfo, inst)) + format(file, ".%"PRIu64, brw_inst_3src_dst_subreg_nr(devinfo, inst)); + string(file, "<1>"); + err |= control(file, "writemask", writemask, + brw_inst_3src_dst_writemask(devinfo, inst), NULL); + err |= control(file, "dest reg encoding", three_source_reg_encoding, + brw_inst_3src_dst_type(devinfo, inst), NULL); + + return 0; +} + +static int +src_align1_region(FILE *file, + 
unsigned _vert_stride, unsigned _width, + unsigned _horiz_stride) +{ + int err = 0; + string(file, "<"); + err |= control(file, "vert stride", vert_stride, _vert_stride, NULL); + string(file, ","); + err |= control(file, "width", width, _width, NULL); + string(file, ","); + err |= control(file, "horiz_stride", horiz_stride, _horiz_stride, NULL); + string(file, ">"); + return err; +} + +static int +src_da1(FILE *file, + const struct gen_device_info *devinfo, + unsigned opcode, + unsigned type, unsigned _reg_file, + unsigned _vert_stride, unsigned _width, unsigned _horiz_stride, + unsigned reg_num, unsigned sub_reg_num, unsigned __abs, + unsigned _negate) +{ + int err = 0; + + if (devinfo->gen >= 8 && is_logic_instruction(opcode)) + err |= control(file, "bitnot", m_bitnot, _negate, NULL); + else + err |= control(file, "negate", m_negate, _negate, NULL); + + err |= control(file, "abs", _abs, __abs, NULL); + + err |= reg(file, _reg_file, reg_num); + if (err == -1) + return 0; + if (sub_reg_num) { + unsigned elem_size = brw_hw_reg_type_to_size(devinfo, type, _reg_file); + format(file, ".%d", sub_reg_num / elem_size); /* use formal style like spec */ + } + src_align1_region(file, _vert_stride, _width, _horiz_stride); + err |= control(file, "src reg encoding", reg_encoding, type, NULL); + return err; +} + +static int +src_ia1(FILE *file, + const struct gen_device_info *devinfo, + unsigned opcode, + unsigned type, + unsigned _reg_file, + int _addr_imm, + unsigned _addr_subreg_nr, + unsigned _negate, + unsigned __abs, + unsigned _horiz_stride, unsigned _width, unsigned _vert_stride) +{ + int err = 0; + + if (devinfo->gen >= 8 && is_logic_instruction(opcode)) + err |= control(file, "bitnot", m_bitnot, _negate, NULL); + else + err |= control(file, "negate", m_negate, _negate, NULL); + + err |= control(file, "abs", _abs, __abs, NULL); + + string(file, "g[a0"); + if (_addr_subreg_nr) + format(file, ".%d", _addr_subreg_nr); + if (_addr_imm) + format(file, " %d", _addr_imm); + 
string(file, "]"); + src_align1_region(file, _vert_stride, _width, _horiz_stride); + err |= control(file, "src reg encoding", reg_encoding, type, NULL); + return err; +} + +static int +src_swizzle(FILE *file, unsigned swiz) +{ + unsigned x = BRW_GET_SWZ(swiz, BRW_CHANNEL_X); + unsigned y = BRW_GET_SWZ(swiz, BRW_CHANNEL_Y); + unsigned z = BRW_GET_SWZ(swiz, BRW_CHANNEL_Z); + unsigned w = BRW_GET_SWZ(swiz, BRW_CHANNEL_W); + int err = 0; + + if (x == y && x == z && x == w) { + string(file, "."); + err |= control(file, "channel select", chan_sel, x, NULL); + } else if (swiz != BRW_SWIZZLE_XYZW) { + string(file, "."); + err |= control(file, "channel select", chan_sel, x, NULL); + err |= control(file, "channel select", chan_sel, y, NULL); + err |= control(file, "channel select", chan_sel, z, NULL); + err |= control(file, "channel select", chan_sel, w, NULL); + } + return err; +} + +static int +src_da16(FILE *file, + const struct gen_device_info *devinfo, + unsigned opcode, + unsigned _reg_type, + unsigned _reg_file, + unsigned _vert_stride, + unsigned _reg_nr, + unsigned _subreg_nr, + unsigned __abs, + unsigned _negate, + unsigned swz_x, unsigned swz_y, unsigned swz_z, unsigned swz_w) +{ + int err = 0; + + if (devinfo->gen >= 8 && is_logic_instruction(opcode)) + err |= control(file, "bitnot", m_bitnot, _negate, NULL); + else + err |= control(file, "negate", m_negate, _negate, NULL); + + err |= control(file, "abs", _abs, __abs, NULL); + + err |= reg(file, _reg_file, _reg_nr); + if (err == -1) + return 0; + if (_subreg_nr) { + unsigned elem_size = + brw_hw_reg_type_to_size(devinfo, _reg_type, _reg_file); + + /* bit4 for subreg number byte addressing. Make this same meaning as + in da1 case, so output looks consistent. 
*/ + format(file, ".%d", 16 / elem_size); + } + string(file, "<"); + err |= control(file, "vert stride", vert_stride, _vert_stride, NULL); + string(file, ">"); + err |= src_swizzle(file, BRW_SWIZZLE4(swz_x, swz_y, swz_z, swz_w)); + err |= control(file, "src da16 reg type", reg_encoding, _reg_type, NULL); + return err; +} + +static int +src0_3src(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst) +{ + int err = 0; + unsigned src0_subreg_nr = brw_inst_3src_src0_subreg_nr(devinfo, inst); + + err |= control(file, "negate", m_negate, + brw_inst_3src_src0_negate(devinfo, inst), NULL); + err |= control(file, "abs", _abs, brw_inst_3src_src0_abs(devinfo, inst), NULL); + + err |= reg(file, BRW_GENERAL_REGISTER_FILE, + brw_inst_3src_src0_reg_nr(devinfo, inst)); + if (err == -1) + return 0; + if (src0_subreg_nr || brw_inst_3src_src0_rep_ctrl(devinfo, inst)) + format(file, ".%d", src0_subreg_nr); + if (brw_inst_3src_src0_rep_ctrl(devinfo, inst)) + string(file, "<0,1,0>"); + else { + string(file, "<4,4,1>"); + err |= src_swizzle(file, brw_inst_3src_src0_swizzle(devinfo, inst)); + } + err |= control(file, "src da16 reg type", three_source_reg_encoding, + brw_inst_3src_src_type(devinfo, inst), NULL); + return err; +} + +static int +src1_3src(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst) +{ + int err = 0; + unsigned src1_subreg_nr = brw_inst_3src_src1_subreg_nr(devinfo, inst); + + err |= control(file, "negate", m_negate, + brw_inst_3src_src1_negate(devinfo, inst), NULL); + err |= control(file, "abs", _abs, brw_inst_3src_src1_abs(devinfo, inst), NULL); + + err |= reg(file, BRW_GENERAL_REGISTER_FILE, + brw_inst_3src_src1_reg_nr(devinfo, inst)); + if (err == -1) + return 0; + if (src1_subreg_nr || brw_inst_3src_src1_rep_ctrl(devinfo, inst)) + format(file, ".%d", src1_subreg_nr); + if (brw_inst_3src_src1_rep_ctrl(devinfo, inst)) + string(file, "<0,1,0>"); + else { + string(file, "<4,4,1>"); + err |= src_swizzle(file, 
brw_inst_3src_src1_swizzle(devinfo, inst)); + } + err |= control(file, "src da16 reg type", three_source_reg_encoding, + brw_inst_3src_src_type(devinfo, inst), NULL); + return err; +} + + +static int +src2_3src(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst) +{ + int err = 0; + unsigned src2_subreg_nr = brw_inst_3src_src2_subreg_nr(devinfo, inst); + + err |= control(file, "negate", m_negate, + brw_inst_3src_src2_negate(devinfo, inst), NULL); + err |= control(file, "abs", _abs, brw_inst_3src_src2_abs(devinfo, inst), NULL); + + err |= reg(file, BRW_GENERAL_REGISTER_FILE, + brw_inst_3src_src2_reg_nr(devinfo, inst)); + if (err == -1) + return 0; + if (src2_subreg_nr || brw_inst_3src_src2_rep_ctrl(devinfo, inst)) + format(file, ".%d", src2_subreg_nr); + if (brw_inst_3src_src2_rep_ctrl(devinfo, inst)) + string(file, "<0,1,0>"); + else { + string(file, "<4,4,1>"); + err |= src_swizzle(file, brw_inst_3src_src2_swizzle(devinfo, inst)); + } + err |= control(file, "src da16 reg type", three_source_reg_encoding, + brw_inst_3src_src_type(devinfo, inst), NULL); + return err; +} + +static int +imm(FILE *file, const struct gen_device_info *devinfo, unsigned type, brw_inst *inst) +{ + switch (type) { + case BRW_HW_REG_TYPE_UD: + format(file, "0x%08xUD", brw_inst_imm_ud(devinfo, inst)); + break; + case BRW_HW_REG_TYPE_D: + format(file, "%dD", brw_inst_imm_d(devinfo, inst)); + break; + case BRW_HW_REG_TYPE_UW: + format(file, "0x%04xUW", (uint16_t) brw_inst_imm_ud(devinfo, inst)); + break; + case BRW_HW_REG_TYPE_W: + format(file, "%dW", (int16_t) brw_inst_imm_d(devinfo, inst)); + break; + case BRW_HW_REG_IMM_TYPE_UV: + format(file, "0x%08xUV", brw_inst_imm_ud(devinfo, inst)); + break; + case BRW_HW_REG_IMM_TYPE_VF: + format(file, "[%-gF, %-gF, %-gF, %-gF]VF", + brw_vf_to_float(brw_inst_imm_ud(devinfo, inst)), + brw_vf_to_float(brw_inst_imm_ud(devinfo, inst) >> 8), + brw_vf_to_float(brw_inst_imm_ud(devinfo, inst) >> 16), + brw_vf_to_float(brw_inst_imm_ud(devinfo, 
inst) >> 24)); + break; + case BRW_HW_REG_IMM_TYPE_V: + format(file, "0x%08xV", brw_inst_imm_ud(devinfo, inst)); + break; + case BRW_HW_REG_TYPE_F: + format(file, "%-gF", brw_inst_imm_f(devinfo, inst)); + break; + case GEN8_HW_REG_IMM_TYPE_DF: + format(file, "%-gDF", brw_inst_imm_df(devinfo, inst)); + break; + case GEN8_HW_REG_IMM_TYPE_HF: + string(file, "Half Float IMM"); + break; + } + return 0; +} + +static int +src0(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst) +{ + if (brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE) { + return imm(file, devinfo, brw_inst_src0_reg_type(devinfo, inst), inst); + } else if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + if (brw_inst_src0_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) { + return src_da1(file, + devinfo, + brw_inst_opcode(devinfo, inst), + brw_inst_src0_reg_type(devinfo, inst), + brw_inst_src0_reg_file(devinfo, inst), + brw_inst_src0_vstride(devinfo, inst), + brw_inst_src0_width(devinfo, inst), + brw_inst_src0_hstride(devinfo, inst), + brw_inst_src0_da_reg_nr(devinfo, inst), + brw_inst_src0_da1_subreg_nr(devinfo, inst), + brw_inst_src0_abs(devinfo, inst), + brw_inst_src0_negate(devinfo, inst)); + } else { + return src_ia1(file, + devinfo, + brw_inst_opcode(devinfo, inst), + brw_inst_src0_reg_type(devinfo, inst), + brw_inst_src0_reg_file(devinfo, inst), + brw_inst_src0_ia1_addr_imm(devinfo, inst), + brw_inst_src0_ia_subreg_nr(devinfo, inst), + brw_inst_src0_negate(devinfo, inst), + brw_inst_src0_abs(devinfo, inst), + brw_inst_src0_hstride(devinfo, inst), + brw_inst_src0_width(devinfo, inst), + brw_inst_src0_vstride(devinfo, inst)); + } + } else { + if (brw_inst_src0_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) { + return src_da16(file, + devinfo, + brw_inst_opcode(devinfo, inst), + brw_inst_src0_reg_type(devinfo, inst), + brw_inst_src0_reg_file(devinfo, inst), + brw_inst_src0_vstride(devinfo, inst), + brw_inst_src0_da_reg_nr(devinfo, inst), + 
brw_inst_src0_da16_subreg_nr(devinfo, inst), + brw_inst_src0_abs(devinfo, inst), + brw_inst_src0_negate(devinfo, inst), + brw_inst_src0_da16_swiz_x(devinfo, inst), + brw_inst_src0_da16_swiz_y(devinfo, inst), + brw_inst_src0_da16_swiz_z(devinfo, inst), + brw_inst_src0_da16_swiz_w(devinfo, inst)); + } else { + string(file, "Indirect align16 address mode not supported"); + return 1; + } + } +} + +static int +src1(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst) +{ + if (brw_inst_src1_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE) { + return imm(file, devinfo, brw_inst_src1_reg_type(devinfo, inst), inst); + } else if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + if (brw_inst_src1_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) { + return src_da1(file, + devinfo, + brw_inst_opcode(devinfo, inst), + brw_inst_src1_reg_type(devinfo, inst), + brw_inst_src1_reg_file(devinfo, inst), + brw_inst_src1_vstride(devinfo, inst), + brw_inst_src1_width(devinfo, inst), + brw_inst_src1_hstride(devinfo, inst), + brw_inst_src1_da_reg_nr(devinfo, inst), + brw_inst_src1_da1_subreg_nr(devinfo, inst), + brw_inst_src1_abs(devinfo, inst), + brw_inst_src1_negate(devinfo, inst)); + } else { + return src_ia1(file, + devinfo, + brw_inst_opcode(devinfo, inst), + brw_inst_src1_reg_type(devinfo, inst), + brw_inst_src1_reg_file(devinfo, inst), + brw_inst_src1_ia1_addr_imm(devinfo, inst), + brw_inst_src1_ia_subreg_nr(devinfo, inst), + brw_inst_src1_negate(devinfo, inst), + brw_inst_src1_abs(devinfo, inst), + brw_inst_src1_hstride(devinfo, inst), + brw_inst_src1_width(devinfo, inst), + brw_inst_src1_vstride(devinfo, inst)); + } + } else { + if (brw_inst_src1_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) { + return src_da16(file, + devinfo, + brw_inst_opcode(devinfo, inst), + brw_inst_src1_reg_type(devinfo, inst), + brw_inst_src1_reg_file(devinfo, inst), + brw_inst_src1_vstride(devinfo, inst), + brw_inst_src1_da_reg_nr(devinfo, inst), + 
brw_inst_src1_da16_subreg_nr(devinfo, inst), + brw_inst_src1_abs(devinfo, inst), + brw_inst_src1_negate(devinfo, inst), + brw_inst_src1_da16_swiz_x(devinfo, inst), + brw_inst_src1_da16_swiz_y(devinfo, inst), + brw_inst_src1_da16_swiz_z(devinfo, inst), + brw_inst_src1_da16_swiz_w(devinfo, inst)); + } else { + string(file, "Indirect align16 address mode not supported"); + return 1; + } + } +} + +static int +qtr_ctrl(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst) +{ + int qtr_ctl = brw_inst_qtr_control(devinfo, inst); + int exec_size = 1 << brw_inst_exec_size(devinfo, inst); + + if (exec_size < 8) { + const unsigned nib_ctl = devinfo->gen < 7 ? 0 : + brw_inst_nib_control(devinfo, inst); + format(file, " %dN", qtr_ctl * 2 + nib_ctl + 1); + } else if (exec_size == 8) { + switch (qtr_ctl) { + case 0: + string(file, " 1Q"); + break; + case 1: + string(file, " 2Q"); + break; + case 2: + string(file, " 3Q"); + break; + case 3: + string(file, " 4Q"); + break; + } + } else if (exec_size == 16) { + if (qtr_ctl < 2) + string(file, " 1H"); + else + string(file, " 2H"); + } + return 0; +} + +#ifdef DEBUG +static __attribute__((__unused__)) int +brw_disassemble_imm(const struct gen_device_info *devinfo, + uint32_t dw3, uint32_t dw2, uint32_t dw1, uint32_t dw0) +{ + brw_inst inst; + inst.data[0] = (((uint64_t) dw1) << 32) | ((uint64_t) dw0); + inst.data[1] = (((uint64_t) dw3) << 32) | ((uint64_t) dw2); + return brw_disassemble_inst(stderr, devinfo, &inst, false); +} +#endif + +int +brw_disassemble_inst(FILE *file, const struct gen_device_info *devinfo, + brw_inst *inst, bool is_compacted) +{ + int err = 0; + int space = 0; + + const enum opcode opcode = brw_inst_opcode(devinfo, inst); + const struct opcode_desc *desc = brw_opcode_desc(devinfo, opcode); + + if (brw_inst_pred_control(devinfo, inst)) { + string(file, "("); + err |= control(file, "predicate inverse", pred_inv, + brw_inst_pred_inv(devinfo, inst), NULL); + format(file, "f%"PRIu64, devinfo->gen >= 7 ? 
brw_inst_flag_reg_nr(devinfo, inst) : 0); + if (brw_inst_flag_subreg_nr(devinfo, inst)) + format(file, ".%"PRIu64, brw_inst_flag_subreg_nr(devinfo, inst)); + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + err |= control(file, "predicate control align1", pred_ctrl_align1, + brw_inst_pred_control(devinfo, inst), NULL); + } else { + err |= control(file, "predicate control align16", pred_ctrl_align16, + brw_inst_pred_control(devinfo, inst), NULL); + } + string(file, ") "); + } + + err |= print_opcode(file, devinfo, opcode); + err |= control(file, "saturate", saturate, brw_inst_saturate(devinfo, inst), + NULL); + + err |= control(file, "debug control", debug_ctrl, + brw_inst_debug_control(devinfo, inst), NULL); + + if (opcode == BRW_OPCODE_MATH) { + string(file, " "); + err |= control(file, "function", math_function, + brw_inst_math_function(devinfo, inst), NULL); + } else if (opcode != BRW_OPCODE_SEND && opcode != BRW_OPCODE_SENDC) { + err |= control(file, "conditional modifier", conditional_modifier, + brw_inst_cond_modifier(devinfo, inst), NULL); + + /* If we're using the conditional modifier, print which flags reg is + * used for it. Note that on gen6+, the embedded-condition SEL and + * control flow doesn't update flags. + */ + if (brw_inst_cond_modifier(devinfo, inst) && + (devinfo->gen < 6 || (opcode != BRW_OPCODE_SEL && + opcode != BRW_OPCODE_IF && + opcode != BRW_OPCODE_WHILE))) { + format(file, ".f%"PRIu64, + devinfo->gen >= 7 ? 
brw_inst_flag_reg_nr(devinfo, inst) : 0); + if (brw_inst_flag_subreg_nr(devinfo, inst)) + format(file, ".%"PRIu64, brw_inst_flag_subreg_nr(devinfo, inst)); + } + } + + if (opcode != BRW_OPCODE_NOP && opcode != BRW_OPCODE_NENOP) { + string(file, "("); + err |= control(file, "execution size", exec_size, + brw_inst_exec_size(devinfo, inst), NULL); + string(file, ")"); + } + + if (opcode == BRW_OPCODE_SEND && devinfo->gen < 6) + format(file, " %"PRIu64, brw_inst_base_mrf(devinfo, inst)); + + if (has_uip(devinfo, opcode)) { + /* Instructions that have UIP also have JIP. */ + pad(file, 16); + format(file, "JIP: %d", brw_inst_jip(devinfo, inst)); + pad(file, 32); + format(file, "UIP: %d", brw_inst_uip(devinfo, inst)); + } else if (has_jip(devinfo, opcode)) { + pad(file, 16); + if (devinfo->gen >= 7) { + format(file, "JIP: %d", brw_inst_jip(devinfo, inst)); + } else { + format(file, "JIP: %d", brw_inst_gen6_jump_count(devinfo, inst)); + } + } else if (devinfo->gen < 6 && (opcode == BRW_OPCODE_BREAK || + opcode == BRW_OPCODE_CONTINUE || + opcode == BRW_OPCODE_ELSE)) { + pad(file, 16); + format(file, "Jump: %d", brw_inst_gen4_jump_count(devinfo, inst)); + pad(file, 32); + format(file, "Pop: %"PRIu64, brw_inst_gen4_pop_count(devinfo, inst)); + } else if (devinfo->gen < 6 && (opcode == BRW_OPCODE_IF || + opcode == BRW_OPCODE_IFF || + opcode == BRW_OPCODE_HALT)) { + pad(file, 16); + format(file, "Jump: %d", brw_inst_gen4_jump_count(devinfo, inst)); + } else if (devinfo->gen < 6 && opcode == BRW_OPCODE_ENDIF) { + pad(file, 16); + format(file, "Pop: %"PRIu64, brw_inst_gen4_pop_count(devinfo, inst)); + } else if (opcode == BRW_OPCODE_JMPI) { + pad(file, 16); + err |= src1(file, devinfo, inst); + } else if (desc && desc->nsrc == 3) { + pad(file, 16); + err |= dest_3src(file, devinfo, inst); + + pad(file, 32); + err |= src0_3src(file, devinfo, inst); + + pad(file, 48); + err |= src1_3src(file, devinfo, inst); + + pad(file, 64); + err |= src2_3src(file, devinfo, inst); + } else if 
(desc) { + if (desc->ndst > 0) { + pad(file, 16); + err |= dest(file, devinfo, inst); + } + + if (desc->nsrc > 0) { + pad(file, 32); + err |= src0(file, devinfo, inst); + } + + if (desc->nsrc > 1) { + pad(file, 48); + err |= src1(file, devinfo, inst); + } + } + + if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) { + enum brw_message_target sfid = brw_inst_sfid(devinfo, inst); + + if (brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE) { + /* show the indirect descriptor source */ + pad(file, 48); + err |= src1(file, devinfo, inst); + } + + newline(file); + pad(file, 16); + space = 0; + + fprintf(file, " "); + err |= control(file, "SFID", devinfo->gen >= 6 ? gen6_sfid : gen4_sfid, + sfid, &space); + + + if (brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE) { + format(file, " indirect"); + } else { + switch (sfid) { + case BRW_SFID_MATH: + err |= control(file, "math function", math_function, + brw_inst_math_msg_function(devinfo, inst), &space); + err |= control(file, "math saturate", math_saturate, + brw_inst_math_msg_saturate(devinfo, inst), &space); + err |= control(file, "math signed", math_signed, + brw_inst_math_msg_signed_int(devinfo, inst), &space); + err |= control(file, "math scalar", math_scalar, + brw_inst_math_msg_data_type(devinfo, inst), &space); + err |= control(file, "math precision", math_precision, + brw_inst_math_msg_precision(devinfo, inst), &space); + break; + case BRW_SFID_SAMPLER: + if (devinfo->gen >= 5) { + err |= control(file, "sampler message", gen5_sampler_msg_type, + brw_inst_sampler_msg_type(devinfo, inst), &space); + err |= control(file, "sampler simd mode", gen5_sampler_simd_mode, + brw_inst_sampler_simd_mode(devinfo, inst), &space); + format(file, " Surface = %"PRIu64" Sampler = %"PRIu64, + brw_inst_binding_table_index(devinfo, inst), + brw_inst_sampler(devinfo, inst)); + } else { + format(file, " (%"PRIu64", %"PRIu64", %"PRIu64", ", + brw_inst_binding_table_index(devinfo, inst), + 
brw_inst_sampler(devinfo, inst), + brw_inst_sampler_msg_type(devinfo, inst)); + if (!devinfo->is_g4x) { + err |= control(file, "sampler target format", + sampler_target_format, + brw_inst_sampler_return_format(devinfo, inst), NULL); + } + string(file, ")"); + } + break; + case GEN6_SFID_DATAPORT_SAMPLER_CACHE: + case GEN6_SFID_DATAPORT_CONSTANT_CACHE: + /* aka BRW_SFID_DATAPORT_READ on Gen4-5 */ + if (devinfo->gen >= 6) { + format(file, " (%"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64")", + brw_inst_binding_table_index(devinfo, inst), + brw_inst_dp_msg_control(devinfo, inst), + brw_inst_dp_msg_type(devinfo, inst), + devinfo->gen >= 7 ? 0 : brw_inst_dp_write_commit(devinfo, inst)); + } else { + bool is_965 = devinfo->gen == 4 && !devinfo->is_g4x; + err |= control(file, "DP read message type", + is_965 ? gen4_dp_read_port_msg_type : + g45_dp_read_port_msg_type, + brw_inst_dp_read_msg_type(devinfo, inst), + &space); + + format(file, " MsgCtrl = 0x%"PRIx64, + brw_inst_dp_read_msg_control(devinfo, inst)); + + format(file, " Surface = %"PRIu64, brw_inst_binding_table_index(devinfo, inst)); + } + break; + + case GEN6_SFID_DATAPORT_RENDER_CACHE: { + /* aka BRW_SFID_DATAPORT_WRITE on Gen4-5 */ + unsigned msg_type = brw_inst_dp_write_msg_type(devinfo, inst); + + err |= control(file, "DP rc message type", + dp_rc_msg_type(devinfo), msg_type, &space); + + bool is_rt_write = msg_type == + (devinfo->gen >= 6 ? 
GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE + : BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE); + + if (is_rt_write) { + err |= control(file, "RT message type", m_rt_write_subtype, + brw_inst_rt_message_type(devinfo, inst), &space); + if (devinfo->gen >= 6 && brw_inst_rt_slot_group(devinfo, inst)) + string(file, " Hi"); + if (brw_inst_rt_last(devinfo, inst)) + string(file, " LastRT"); + if (devinfo->gen < 7 && brw_inst_dp_write_commit(devinfo, inst)) + string(file, " WriteCommit"); + } else { + format(file, " MsgCtrl = 0x%"PRIx64, + brw_inst_dp_write_msg_control(devinfo, inst)); + } + + format(file, " Surface = %"PRIu64, brw_inst_binding_table_index(devinfo, inst)); + break; + } + + case BRW_SFID_URB: { + unsigned opcode = brw_inst_urb_opcode(devinfo, inst); + + format(file, " %"PRIu64, brw_inst_urb_global_offset(devinfo, inst)); + + space = 1; + + err |= control(file, "urb opcode", + devinfo->gen >= 7 ? gen7_urb_opcode + : gen5_urb_opcode, + opcode, &space); + + if (devinfo->gen >= 7 && + brw_inst_urb_per_slot_offset(devinfo, inst)) { + string(file, " per-slot"); + } + + if (opcode == GEN8_URB_OPCODE_SIMD8_WRITE || + opcode == GEN8_URB_OPCODE_SIMD8_READ) { + if (brw_inst_urb_channel_mask_present(devinfo, inst)) + string(file, " masked"); + } else { + err |= control(file, "urb swizzle", urb_swizzle, + brw_inst_urb_swizzle_control(devinfo, inst), + &space); + } + + if (devinfo->gen < 7) { + err |= control(file, "urb allocate", urb_allocate, + brw_inst_urb_allocate(devinfo, inst), &space); + err |= control(file, "urb used", urb_used, + brw_inst_urb_used(devinfo, inst), &space); + } + if (devinfo->gen < 8) { + err |= control(file, "urb complete", urb_complete, + brw_inst_urb_complete(devinfo, inst), &space); + } + break; + } + case BRW_SFID_THREAD_SPAWNER: + break; + + case BRW_SFID_MESSAGE_GATEWAY: + format(file, " (%s)", + gen7_gateway_subfuncid[brw_inst_gateway_subfuncid(devinfo, inst)]); + break; + + case GEN7_SFID_DATAPORT_DATA_CACHE: + if (devinfo->gen >= 
7) { + format(file, " ("); + + err |= control(file, "DP DC0 message type", + dp_dc0_msg_type_gen7, + brw_inst_dp_msg_type(devinfo, inst), &space); + + format(file, ", %"PRIu64", ", brw_inst_binding_table_index(devinfo, inst)); + + switch (brw_inst_dp_msg_type(devinfo, inst)) { + case GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP: + control(file, "atomic op", aop, + brw_inst_imm_ud(devinfo, inst) >> 8 & 0xf, &space); + break; + default: + format(file, "%"PRIu64, brw_inst_dp_msg_control(devinfo, inst)); + } + format(file, ")"); + break; + } + /* FALLTHROUGH */ + + case HSW_SFID_DATAPORT_DATA_CACHE_1: { + if (devinfo->gen >= 7) { + format(file, " ("); + + unsigned msg_ctrl = brw_inst_dp_msg_control(devinfo, inst); + + err |= control(file, "DP DC1 message type", + dp_dc1_msg_type_hsw, + brw_inst_dp_msg_type(devinfo, inst), &space); + + format(file, ", Surface = %"PRIu64", ", + brw_inst_binding_table_index(devinfo, inst)); + + switch (brw_inst_dp_msg_type(devinfo, inst)) { + case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP: + case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP: + case HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP: + format(file, "SIMD%d,", (msg_ctrl & (1 << 4)) ? 
8 : 16); + /* fallthrough */ + case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2: + case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2: + case HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2: + control(file, "atomic op", aop, msg_ctrl & 0xf, &space); + break; + case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ: + case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE: + case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ: + case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE: { + static const char *simd_modes[] = { "4x2", "16", "8" }; + format(file, "SIMD%s, Mask = 0x%x", + simd_modes[msg_ctrl >> 4], msg_ctrl & 0xf); + break; + } + default: + format(file, "0x%x", msg_ctrl); + } + format(file, ")"); + break; + } + /* FALLTHROUGH */ + } + + case GEN7_SFID_PIXEL_INTERPOLATOR: + if (devinfo->gen >= 7) { + format(file, " (%s, %s, 0x%02"PRIx64")", + brw_inst_pi_nopersp(devinfo, inst) ? "linear" : "persp", + pixel_interpolator_msg_types[brw_inst_pi_message_type(devinfo, inst)], + brw_inst_pi_message_data(devinfo, inst)); + break; + } + /* FALLTHROUGH */ + + default: + format(file, "unsupported shared function ID %d", sfid); + break; + } + + if (space) + string(file, " "); + format(file, "mlen %"PRIu64, brw_inst_mlen(devinfo, inst)); + format(file, " rlen %"PRIu64, brw_inst_rlen(devinfo, inst)); + } + } + pad(file, 64); + if (opcode != BRW_OPCODE_NOP && opcode != BRW_OPCODE_NENOP) { + string(file, "{"); + space = 1; + err |= control(file, "access mode", access_mode, + brw_inst_access_mode(devinfo, inst), &space); + if (devinfo->gen >= 6) { + err |= control(file, "write enable control", wectrl, + brw_inst_mask_control(devinfo, inst), &space); + } else { + err |= control(file, "mask control", mask_ctrl, + brw_inst_mask_control(devinfo, inst), &space); + } + err |= control(file, "dependency control", dep_ctrl, + ((brw_inst_no_dd_check(devinfo, inst) << 1) | + brw_inst_no_dd_clear(devinfo, inst)), &space); + + if (devinfo->gen >= 6) + err |= qtr_ctrl(file, devinfo, inst); + else { + if 
(brw_inst_qtr_control(devinfo, inst) == BRW_COMPRESSION_COMPRESSED && + desc && desc->ndst > 0 && + brw_inst_dst_reg_file(devinfo, inst) == BRW_MESSAGE_REGISTER_FILE && + brw_inst_dst_da_reg_nr(devinfo, inst) & BRW_MRF_COMPR4) { + format(file, " compr4"); + } else { + err |= control(file, "compression control", compr_ctrl, + brw_inst_qtr_control(devinfo, inst), &space); + } + } + + err |= control(file, "compaction", cmpt_ctrl, is_compacted, &space); + err |= control(file, "thread control", thread_ctrl, + brw_inst_thread_control(devinfo, inst), &space); + if (has_branch_ctrl(devinfo, opcode)) { + err |= control(file, "branch ctrl", branch_ctrl, + brw_inst_branch_control(devinfo, inst), &space); + } else if (devinfo->gen >= 6) { + err |= control(file, "acc write control", accwr, + brw_inst_acc_wr_control(devinfo, inst), &space); + } + if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) + err |= control(file, "end of thread", end_of_thread, + brw_inst_eot(devinfo, inst), &space); + if (space) + string(file, " "); + string(file, "}"); + } + string(file, ";"); + newline(file); + return err; +} diff --git a/src/intel/compiler/brw_eu.c b/src/intel/compiler/brw_eu.c new file mode 100644 index 00000000000..77400c19914 --- /dev/null +++ b/src/intel/compiler/brw_eu.c @@ -0,0 +1,719 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "brw_eu_defines.h" +#include "brw_eu.h" +#include "brw_shader.h" +#include "common/gen_debug.h" + +#include "util/ralloc.h" + +/** + * Converts a BRW_REGISTER_TYPE_* enum to a short string (F, UD, and so on). + * + * This is different than reg_encoding from brw_disasm.c in that it operates + * on the abstract enum values, rather than the generation-specific encoding. 
+ */ +const char * +brw_reg_type_letters(unsigned type) +{ + const char *names[] = { + [BRW_REGISTER_TYPE_UD] = "UD", + [BRW_REGISTER_TYPE_D] = "D", + [BRW_REGISTER_TYPE_UW] = "UW", + [BRW_REGISTER_TYPE_W] = "W", + [BRW_REGISTER_TYPE_F] = "F", + [BRW_REGISTER_TYPE_UB] = "UB", + [BRW_REGISTER_TYPE_B] = "B", + [BRW_REGISTER_TYPE_UV] = "UV", + [BRW_REGISTER_TYPE_V] = "V", + [BRW_REGISTER_TYPE_VF] = "VF", + [BRW_REGISTER_TYPE_DF] = "DF", + [BRW_REGISTER_TYPE_HF] = "HF", + [BRW_REGISTER_TYPE_UQ] = "UQ", + [BRW_REGISTER_TYPE_Q] = "Q", + }; + assert(type <= BRW_REGISTER_TYPE_Q); + return names[type]; +} + +/* Returns a conditional modifier that negates the condition. */ +enum brw_conditional_mod +brw_negate_cmod(uint32_t cmod) +{ + switch (cmod) { + case BRW_CONDITIONAL_Z: + return BRW_CONDITIONAL_NZ; + case BRW_CONDITIONAL_NZ: + return BRW_CONDITIONAL_Z; + case BRW_CONDITIONAL_G: + return BRW_CONDITIONAL_LE; + case BRW_CONDITIONAL_GE: + return BRW_CONDITIONAL_L; + case BRW_CONDITIONAL_L: + return BRW_CONDITIONAL_GE; + case BRW_CONDITIONAL_LE: + return BRW_CONDITIONAL_G; + default: + return ~0; + } +} + +/* Returns the corresponding conditional mod for swapping src0 and + * src1 in e.g. CMP. + */ +enum brw_conditional_mod +brw_swap_cmod(uint32_t cmod) +{ + switch (cmod) { + case BRW_CONDITIONAL_Z: + case BRW_CONDITIONAL_NZ: + return cmod; + case BRW_CONDITIONAL_G: + return BRW_CONDITIONAL_L; + case BRW_CONDITIONAL_GE: + return BRW_CONDITIONAL_LE; + case BRW_CONDITIONAL_L: + return BRW_CONDITIONAL_G; + case BRW_CONDITIONAL_LE: + return BRW_CONDITIONAL_GE; + default: + return BRW_CONDITIONAL_NONE; + } +} + +/** + * Get the least significant bit offset of the i+1-th component of immediate + * type \p type. For \p i equal to the two's complement of j, return the + * offset of the j-th component starting from the end of the vector. For + * scalar register types return zero. 
+ */ +static unsigned +imm_shift(enum brw_reg_type type, unsigned i) +{ + assert(type != BRW_REGISTER_TYPE_UV && type != BRW_REGISTER_TYPE_V && + "Not implemented."); + + if (type == BRW_REGISTER_TYPE_VF) + return 8 * (i & 3); + else + return 0; +} + +/** + * Swizzle an arbitrary immediate \p x of the given type according to the + * permutation specified as \p swz. + */ +uint32_t +brw_swizzle_immediate(enum brw_reg_type type, uint32_t x, unsigned swz) +{ + if (imm_shift(type, 1)) { + const unsigned n = 32 / imm_shift(type, 1); + uint32_t y = 0; + + for (unsigned i = 0; i < n; i++) { + /* Shift the specified component all the way to the right and left to + * discard any undesired L/MSBs, then shift it right into component i. + */ + y |= x >> imm_shift(type, (i & ~3) + BRW_GET_SWZ(swz, i & 3)) + << imm_shift(type, ~0u) + >> imm_shift(type, ~0u - i); + } + + return y; + } else { + return x; + } +} + +void +brw_set_default_exec_size(struct brw_codegen *p, unsigned value) +{ + brw_inst_set_exec_size(p->devinfo, p->current, value); +} + +void brw_set_default_predicate_control( struct brw_codegen *p, unsigned pc ) +{ + brw_inst_set_pred_control(p->devinfo, p->current, pc); +} + +void brw_set_default_predicate_inverse(struct brw_codegen *p, bool predicate_inverse) +{ + brw_inst_set_pred_inv(p->devinfo, p->current, predicate_inverse); +} + +void brw_set_default_flag_reg(struct brw_codegen *p, int reg, int subreg) +{ + if (p->devinfo->gen >= 7) + brw_inst_set_flag_reg_nr(p->devinfo, p->current, reg); + + brw_inst_set_flag_subreg_nr(p->devinfo, p->current, subreg); +} + +void brw_set_default_access_mode( struct brw_codegen *p, unsigned access_mode ) +{ + brw_inst_set_access_mode(p->devinfo, p->current, access_mode); +} + +void +brw_set_default_compression_control(struct brw_codegen *p, + enum brw_compression compression_control) +{ + if (p->devinfo->gen >= 6) { + /* Since we don't use the SIMD32 support in gen6, we translate + * the pre-gen6 compression control here. 
+ */ + switch (compression_control) { + case BRW_COMPRESSION_NONE: + /* This is the "use the first set of bits of dmask/vmask/arf + * according to execsize" option. + */ + brw_inst_set_qtr_control(p->devinfo, p->current, GEN6_COMPRESSION_1Q); + break; + case BRW_COMPRESSION_2NDHALF: + /* For SIMD8, this is "use the second set of 8 bits." */ + brw_inst_set_qtr_control(p->devinfo, p->current, GEN6_COMPRESSION_2Q); + break; + case BRW_COMPRESSION_COMPRESSED: + /* For SIMD16 instruction compression, use the first set of 16 bits + * since we don't do SIMD32 dispatch. + */ + brw_inst_set_qtr_control(p->devinfo, p->current, GEN6_COMPRESSION_1H); + break; + default: + unreachable("not reached"); + } + } else { + brw_inst_set_qtr_control(p->devinfo, p->current, compression_control); + } +} + +/** + * Enable or disable instruction compression on the given instruction leaving + * the currently selected channel enable group untouched. + */ +void +brw_inst_set_compression(const struct gen_device_info *devinfo, + brw_inst *inst, bool on) +{ + if (devinfo->gen >= 6) { + /* No-op, the EU will figure out for us whether the instruction needs to + * be compressed. + */ + } else { + /* The channel group and compression controls are non-orthogonal, there + * are two possible representations for uncompressed instructions and we + * may need to preserve the current one to avoid changing the selected + * channel group inadvertently. + */ + if (on) + brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_COMPRESSED); + else if (brw_inst_qtr_control(devinfo, inst) + == BRW_COMPRESSION_COMPRESSED) + brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE); + } +} + +void +brw_set_default_compression(struct brw_codegen *p, bool on) +{ + brw_inst_set_compression(p->devinfo, p->current, on); +} + +/** + * Apply the range of channel enable signals given by + * [group, group + exec_size) to the instruction passed as argument. 
+ */ +void +brw_inst_set_group(const struct gen_device_info *devinfo, + brw_inst *inst, unsigned group) +{ + if (devinfo->gen >= 7) { + assert(group % 4 == 0 && group < 32); + brw_inst_set_qtr_control(devinfo, inst, group / 8); + brw_inst_set_nib_control(devinfo, inst, (group / 4) % 2); + + } else if (devinfo->gen == 6) { + assert(group % 8 == 0 && group < 32); + brw_inst_set_qtr_control(devinfo, inst, group / 8); + + } else { + assert(group % 8 == 0 && group < 16); + /* The channel group and compression controls are non-orthogonal, there + * are two possible representations for group zero and we may need to + * preserve the current one to avoid changing the selected compression + * enable inadvertently. + */ + if (group == 8) + brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_2NDHALF); + else if (brw_inst_qtr_control(devinfo, inst) == BRW_COMPRESSION_2NDHALF) + brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE); + } +} + +void +brw_set_default_group(struct brw_codegen *p, unsigned group) +{ + brw_inst_set_group(p->devinfo, p->current, group); +} + +void brw_set_default_mask_control( struct brw_codegen *p, unsigned value ) +{ + brw_inst_set_mask_control(p->devinfo, p->current, value); +} + +void brw_set_default_saturate( struct brw_codegen *p, bool enable ) +{ + brw_inst_set_saturate(p->devinfo, p->current, enable); +} + +void brw_set_default_acc_write_control(struct brw_codegen *p, unsigned value) +{ + if (p->devinfo->gen >= 6) + brw_inst_set_acc_wr_control(p->devinfo, p->current, value); +} + +void brw_push_insn_state( struct brw_codegen *p ) +{ + assert(p->current != &p->stack[BRW_EU_MAX_INSN_STACK-1]); + memcpy(p->current + 1, p->current, sizeof(brw_inst)); + p->current++; +} + +void brw_pop_insn_state( struct brw_codegen *p ) +{ + assert(p->current != p->stack); + p->current--; +} + + +/*********************************************************************** + */ +void +brw_init_codegen(const struct gen_device_info *devinfo, + struct 
brw_codegen *p, void *mem_ctx) +{ + memset(p, 0, sizeof(*p)); + + p->devinfo = devinfo; + /* + * Set the initial instruction store array size to 1024, if found that + * isn't enough, then it will double the store size at brw_next_insn() + * until out of memory. + */ + p->store_size = 1024; + p->store = rzalloc_array(mem_ctx, brw_inst, p->store_size); + p->nr_insn = 0; + p->current = p->stack; + memset(p->current, 0, sizeof(p->current[0])); + + p->mem_ctx = mem_ctx; + + /* Some defaults? + */ + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_set_default_mask_control(p, BRW_MASK_ENABLE); /* what does this do? */ + brw_set_default_saturate(p, 0); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + + /* Set up control flow stack */ + p->if_stack_depth = 0; + p->if_stack_array_size = 16; + p->if_stack = rzalloc_array(mem_ctx, int, p->if_stack_array_size); + + p->loop_stack_depth = 0; + p->loop_stack_array_size = 16; + p->loop_stack = rzalloc_array(mem_ctx, int, p->loop_stack_array_size); + p->if_depth_in_loop = rzalloc_array(mem_ctx, int, p->loop_stack_array_size); +} + + +const unsigned *brw_get_program( struct brw_codegen *p, + unsigned *sz ) +{ + *sz = p->next_insn_offset; + return (const unsigned *)p->store; +} + +void +brw_disassemble(const struct gen_device_info *devinfo, + void *assembly, int start, int end, FILE *out) +{ + bool dump_hex = (INTEL_DEBUG & DEBUG_HEX) != 0; + + for (int offset = start; offset < end;) { + brw_inst *insn = assembly + offset; + brw_inst uncompacted; + bool compacted = brw_inst_cmpt_control(devinfo, insn); + if (0) + fprintf(out, "0x%08x: ", offset); + + if (compacted) { + brw_compact_inst *compacted = (void *)insn; + if (dump_hex) { + fprintf(out, "0x%08x 0x%08x ", + ((uint32_t *)insn)[1], + ((uint32_t *)insn)[0]); + } + + brw_uncompact_instruction(devinfo, &uncompacted, compacted); + insn = &uncompacted; + offset += 8; + } else { + if (dump_hex) { + fprintf(out, "0x%08x 0x%08x 0x%08x 0x%08x ", + ((uint32_t *)insn)[3], 
+ ((uint32_t *)insn)[2], + ((uint32_t *)insn)[1], + ((uint32_t *)insn)[0]); + } + offset += 16; + } + + brw_disassemble_inst(out, devinfo, insn, compacted); + } +} + +enum gen { + GEN4 = (1 << 0), + GEN45 = (1 << 1), + GEN5 = (1 << 2), + GEN6 = (1 << 3), + GEN7 = (1 << 4), + GEN75 = (1 << 5), + GEN8 = (1 << 6), + GEN9 = (1 << 7), + GEN_ALL = ~0 +}; + +#define GEN_LT(gen) ((gen) - 1) +#define GEN_GE(gen) (~GEN_LT(gen)) +#define GEN_LE(gen) (GEN_LT(gen) | (gen)) + +static const struct opcode_desc opcode_10_descs[] = { + { .name = "dim", .nsrc = 1, .ndst = 1, .gens = GEN75 }, + { .name = "smov", .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN8) }, +}; + +static const struct opcode_desc opcode_35_descs[] = { + { .name = "iff", .nsrc = 0, .ndst = 0, .gens = GEN_LE(GEN5) }, + { .name = "brc", .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN7) }, +}; + +static const struct opcode_desc opcode_38_descs[] = { + { .name = "do", .nsrc = 0, .ndst = 0, .gens = GEN_LE(GEN5) }, + { .name = "case", .nsrc = 0, .ndst = 0, .gens = GEN6 }, +}; + +static const struct opcode_desc opcode_44_descs[] = { + { .name = "msave", .nsrc = 0, .ndst = 0, .gens = GEN_LE(GEN5) }, + { .name = "call", .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN6) }, +}; + +static const struct opcode_desc opcode_45_descs[] = { + { .name = "mrest", .nsrc = 0, .ndst = 0, .gens = GEN_LE(GEN5) }, + { .name = "ret", .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN6) }, +}; + +static const struct opcode_desc opcode_46_descs[] = { + { .name = "push", .nsrc = 0, .ndst = 0, .gens = GEN_LE(GEN5) }, + { .name = "fork", .nsrc = 0, .ndst = 0, .gens = GEN6 }, + { .name = "goto", .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN8) }, +}; + +static const struct opcode_desc opcode_descs[128] = { + [BRW_OPCODE_ILLEGAL] = { + .name = "illegal", .nsrc = 0, .ndst = 0, .gens = GEN_ALL, + }, + [BRW_OPCODE_MOV] = { + .name = "mov", .nsrc = 1, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_SEL] = { + .name = "sel", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_MOVI] 
= { + .name = "movi", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN45), + }, + [BRW_OPCODE_NOT] = { + .name = "not", .nsrc = 1, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_AND] = { + .name = "and", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_OR] = { + .name = "or", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_XOR] = { + .name = "xor", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_SHR] = { + .name = "shr", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_SHL] = { + .name = "shl", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, + }, + [10] = { + .table = opcode_10_descs, .size = ARRAY_SIZE(opcode_10_descs), + }, + /* Reserved - 11 */ + [BRW_OPCODE_ASR] = { + .name = "asr", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, + }, + /* Reserved - 13-15 */ + [BRW_OPCODE_CMP] = { + .name = "cmp", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_CMPN] = { + .name = "cmpn", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_CSEL] = { + .name = "csel", .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN8), + }, + [BRW_OPCODE_F32TO16] = { + .name = "f32to16", .nsrc = 1, .ndst = 1, .gens = GEN7 | GEN75, + }, + [BRW_OPCODE_F16TO32] = { + .name = "f16to32", .nsrc = 1, .ndst = 1, .gens = GEN7 | GEN75, + }, + /* Reserved - 21-22 */ + [BRW_OPCODE_BFREV] = { + .name = "bfrev", .nsrc = 1, .ndst = 1, .gens = GEN_GE(GEN7), + }, + [BRW_OPCODE_BFE] = { + .name = "bfe", .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN7), + }, + [BRW_OPCODE_BFI1] = { + .name = "bfi1", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN7), + }, + [BRW_OPCODE_BFI2] = { + .name = "bfi2", .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN7), + }, + /* Reserved - 27-31 */ + [BRW_OPCODE_JMPI] = { + .name = "jmpi", .nsrc = 0, .ndst = 0, .gens = GEN_ALL, + }, + [33] = { + .name = "brd", .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN7), + }, + [BRW_OPCODE_IF] = { + .name = "if", .nsrc = 0, .ndst = 0, .gens = GEN_ALL, + }, + [35] = { + .table = opcode_35_descs, .size = ARRAY_SIZE(opcode_35_descs), + }, + 
[BRW_OPCODE_ELSE] = { + .name = "else", .nsrc = 0, .ndst = 0, .gens = GEN_ALL, + }, + [BRW_OPCODE_ENDIF] = { + .name = "endif", .nsrc = 0, .ndst = 0, .gens = GEN_ALL, + }, + [38] = { + .table = opcode_38_descs, .size = ARRAY_SIZE(opcode_38_descs), + }, + [BRW_OPCODE_WHILE] = { + .name = "while", .nsrc = 0, .ndst = 0, .gens = GEN_ALL, + }, + [BRW_OPCODE_BREAK] = { + .name = "break", .nsrc = 0, .ndst = 0, .gens = GEN_ALL, + }, + [BRW_OPCODE_CONTINUE] = { + .name = "cont", .nsrc = 0, .ndst = 0, .gens = GEN_ALL, + }, + [BRW_OPCODE_HALT] = { + .name = "halt", .nsrc = 0, .ndst = 0, .gens = GEN_ALL, + }, + [43] = { + .name = "calla", .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN75), + }, + [44] = { + .table = opcode_44_descs, .size = ARRAY_SIZE(opcode_44_descs), + }, + [45] = { + .table = opcode_45_descs, .size = ARRAY_SIZE(opcode_45_descs), + }, + [46] = { + .table = opcode_46_descs, .size = ARRAY_SIZE(opcode_46_descs), + }, + [47] = { + .name = "pop", .nsrc = 2, .ndst = 0, .gens = GEN_LE(GEN5), + }, + [BRW_OPCODE_WAIT] = { + .name = "wait", .nsrc = 1, .ndst = 0, .gens = GEN_ALL, + }, + [BRW_OPCODE_SEND] = { + .name = "send", .nsrc = 1, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_SENDC] = { + .name = "sendc", .nsrc = 1, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_SENDS] = { + .name = "sends", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN9), + }, + [BRW_OPCODE_SENDSC] = { + .name = "sendsc", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN9), + }, + /* Reserved 53-55 */ + [BRW_OPCODE_MATH] = { + .name = "math", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN6), + }, + /* Reserved 57-63 */ + [BRW_OPCODE_ADD] = { + .name = "add", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_MUL] = { + .name = "mul", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_AVG] = { + .name = "avg", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_FRC] = { + .name = "frc", .nsrc = 1, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_RNDU] = { + .name = "rndu", .nsrc = 1, .ndst = 1, .gens = 
GEN_ALL, + }, + [BRW_OPCODE_RNDD] = { + .name = "rndd", .nsrc = 1, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_RNDE] = { + .name = "rnde", .nsrc = 1, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_RNDZ] = { + .name = "rndz", .nsrc = 1, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_MAC] = { + .name = "mac", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_MACH] = { + .name = "mach", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_LZD] = { + .name = "lzd", .nsrc = 1, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_FBH] = { + .name = "fbh", .nsrc = 1, .ndst = 1, .gens = GEN_GE(GEN7), + }, + [BRW_OPCODE_FBL] = { + .name = "fbl", .nsrc = 1, .ndst = 1, .gens = GEN_GE(GEN7), + }, + [BRW_OPCODE_CBIT] = { + .name = "cbit", .nsrc = 1, .ndst = 1, .gens = GEN_GE(GEN7), + }, + [BRW_OPCODE_ADDC] = { + .name = "addc", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN7), + }, + [BRW_OPCODE_SUBB] = { + .name = "subb", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN7), + }, + [BRW_OPCODE_SAD2] = { + .name = "sad2", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_SADA2] = { + .name = "sada2", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, + }, + /* Reserved 82-83 */ + [BRW_OPCODE_DP4] = { + .name = "dp4", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_DPH] = { + .name = "dph", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_DP3] = { + .name = "dp3", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_DP2] = { + .name = "dp2", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, + }, + /* Reserved 88 */ + [BRW_OPCODE_LINE] = { + .name = "line", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, + }, + [BRW_OPCODE_PLN] = { + .name = "pln", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN45), + }, + [BRW_OPCODE_MAD] = { + .name = "mad", .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN6), + }, + [BRW_OPCODE_LRP] = { + .name = "lrp", .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN6), + }, + [93] = { + .name = "madm", .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN8), + }, + /* Reserved 94-124 */ + 
[BRW_OPCODE_NENOP] = { + .name = "nenop", .nsrc = 0, .ndst = 0, .gens = GEN45, + }, + [BRW_OPCODE_NOP] = { + .name = "nop", .nsrc = 0, .ndst = 0, .gens = GEN_ALL, + }, +}; + +static enum gen +gen_from_devinfo(const struct gen_device_info *devinfo) +{ + switch (devinfo->gen) { + case 4: return devinfo->is_g4x ? GEN45 : GEN4; + case 5: return GEN5; + case 6: return GEN6; + case 7: return devinfo->is_haswell ? GEN75 : GEN7; + case 8: return GEN8; + case 9: return GEN9; + default: + unreachable("not reached"); + } +} + +/* Return the matching opcode_desc for the specified opcode number and + * hardware generation, or NULL if the opcode is not supported by the device. + */ +const struct opcode_desc * +brw_opcode_desc(const struct gen_device_info *devinfo, enum opcode opcode) +{ + if (opcode >= ARRAY_SIZE(opcode_descs)) + return NULL; + + enum gen gen = gen_from_devinfo(devinfo); + if (opcode_descs[opcode].gens != 0) { + if ((opcode_descs[opcode].gens & gen) != 0) { + return &opcode_descs[opcode]; + } + } else if (opcode_descs[opcode].table != NULL) { + const struct opcode_desc *table = opcode_descs[opcode].table; + for (unsigned i = 0; i < opcode_descs[opcode].size; i++) { + if ((table[i].gens & gen) != 0) { + return &table[i]; + } + } + } + return NULL; +} diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h new file mode 100644 index 00000000000..f4225952333 --- /dev/null +++ b/src/intel/compiler/brw_eu.h @@ -0,0 +1,612 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#ifndef BRW_EU_H +#define BRW_EU_H + +#include <stdbool.h> +#include "brw_inst.h" +#include "brw_eu_defines.h" +#include "brw_reg.h" +#include "intel_asm_annotation.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define BRW_EU_MAX_INSN_STACK 5 + +/* A helper for accessing the last instruction emitted. This makes it easy + * to set various bits on an instruction without having to create a temporary + * variable and assign the emitted instruction to it. 
+ */ +#define brw_last_inst (&p->store[p->nr_insn - 1]) + +struct brw_codegen { + brw_inst *store; + int store_size; + unsigned nr_insn; + unsigned int next_insn_offset; + + void *mem_ctx; + + /* Allow clients to push/pop instruction state: + */ + brw_inst stack[BRW_EU_MAX_INSN_STACK]; + bool compressed_stack[BRW_EU_MAX_INSN_STACK]; + brw_inst *current; + + bool single_program_flow; + const struct gen_device_info *devinfo; + + /* Control flow stacks: + * - if_stack contains IF and ELSE instructions which must be patched + * (and popped) once the matching ENDIF instruction is encountered. + * + * Just store the instruction pointer(an index). + */ + int *if_stack; + int if_stack_depth; + int if_stack_array_size; + + /** + * loop_stack contains the instruction pointers of the starts of loops which + * must be patched (and popped) once the matching WHILE instruction is + * encountered. + */ + int *loop_stack; + /** + * pre-gen6, the BREAK and CONT instructions had to tell how many IF/ENDIF + * blocks they were popping out of, to fix up the mask stack. This tracks + * the IF/ENDIF nesting in each current nested loop level. 
+ */ + int *if_depth_in_loop; + int loop_stack_depth; + int loop_stack_array_size; +}; + +void brw_pop_insn_state( struct brw_codegen *p ); +void brw_push_insn_state( struct brw_codegen *p ); +void brw_set_default_exec_size(struct brw_codegen *p, unsigned value); +void brw_set_default_mask_control( struct brw_codegen *p, unsigned value ); +void brw_set_default_saturate( struct brw_codegen *p, bool enable ); +void brw_set_default_access_mode( struct brw_codegen *p, unsigned access_mode ); +void brw_inst_set_compression(const struct gen_device_info *devinfo, + brw_inst *inst, bool on); +void brw_set_default_compression(struct brw_codegen *p, bool on); +void brw_inst_set_group(const struct gen_device_info *devinfo, + brw_inst *inst, unsigned group); +void brw_set_default_group(struct brw_codegen *p, unsigned group); +void brw_set_default_compression_control(struct brw_codegen *p, enum brw_compression c); +void brw_set_default_predicate_control( struct brw_codegen *p, unsigned pc ); +void brw_set_default_predicate_inverse(struct brw_codegen *p, bool predicate_inverse); +void brw_set_default_flag_reg(struct brw_codegen *p, int reg, int subreg); +void brw_set_default_acc_write_control(struct brw_codegen *p, unsigned value); + +void brw_init_codegen(const struct gen_device_info *, struct brw_codegen *p, + void *mem_ctx); +int brw_disassemble_inst(FILE *file, const struct gen_device_info *devinfo, + struct brw_inst *inst, bool is_compacted); +void brw_disassemble(const struct gen_device_info *devinfo, void *assembly, + int start, int end, FILE *out); +const unsigned *brw_get_program( struct brw_codegen *p, unsigned *sz ); + +brw_inst *brw_next_insn(struct brw_codegen *p, unsigned opcode); +void brw_set_dest(struct brw_codegen *p, brw_inst *insn, struct brw_reg dest); +void brw_set_src0(struct brw_codegen *p, brw_inst *insn, struct brw_reg reg); + +void gen6_resolve_implied_move(struct brw_codegen *p, + struct brw_reg *src, + unsigned msg_reg_nr); + +/* Helpers for regular 
instructions: + */ +#define ALU1(OP) \ +brw_inst *brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src0); + +#define ALU2(OP) \ +brw_inst *brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src0, \ + struct brw_reg src1); + +#define ALU3(OP) \ +brw_inst *brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src0, \ + struct brw_reg src1, \ + struct brw_reg src2); + +#define ROUND(OP) \ +void brw_##OP(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0); + +ALU1(MOV) +ALU2(SEL) +ALU1(NOT) +ALU2(AND) +ALU2(OR) +ALU2(XOR) +ALU2(SHR) +ALU2(SHL) +ALU1(DIM) +ALU2(ASR) +ALU1(F32TO16) +ALU1(F16TO32) +ALU2(ADD) +ALU2(AVG) +ALU2(MUL) +ALU1(FRC) +ALU1(RNDD) +ALU2(MAC) +ALU2(MACH) +ALU1(LZD) +ALU2(DP4) +ALU2(DPH) +ALU2(DP3) +ALU2(DP2) +ALU2(LINE) +ALU2(PLN) +ALU3(MAD) +ALU3(LRP) +ALU1(BFREV) +ALU3(BFE) +ALU2(BFI1) +ALU3(BFI2) +ALU1(FBH) +ALU1(FBL) +ALU1(CBIT) +ALU2(ADDC) +ALU2(SUBB) +ALU2(MAC) + +ROUND(RNDZ) +ROUND(RNDE) + +#undef ALU1 +#undef ALU2 +#undef ALU3 +#undef ROUND + + +/* Helpers for SEND instruction: + */ +void brw_set_sampler_message(struct brw_codegen *p, + brw_inst *insn, + unsigned binding_table_index, + unsigned sampler, + unsigned msg_type, + unsigned response_length, + unsigned msg_length, + unsigned header_present, + unsigned simd_mode, + unsigned return_format); + +void brw_set_message_descriptor(struct brw_codegen *p, + brw_inst *inst, + enum brw_message_target sfid, + unsigned msg_length, + unsigned response_length, + bool header_present, + bool end_of_thread); + +void brw_set_dp_read_message(struct brw_codegen *p, + brw_inst *insn, + unsigned binding_table_index, + unsigned msg_control, + unsigned msg_type, + unsigned target_cache, + unsigned msg_length, + bool header_present, + unsigned response_length); + +void brw_set_dp_write_message(struct brw_codegen *p, + brw_inst *insn, + unsigned binding_table_index, + unsigned msg_control, + unsigned msg_type, + unsigned 
target_cache, + unsigned msg_length, + bool header_present, + unsigned last_render_target, + unsigned response_length, + unsigned end_of_thread, + unsigned send_commit_msg); + +void brw_urb_WRITE(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + enum brw_urb_write_flags flags, + unsigned msg_length, + unsigned response_length, + unsigned offset, + unsigned swizzle); + +/** + * Send message to shared unit \p sfid with a possibly indirect descriptor \p + * desc. If \p desc is not an immediate it will be transparently loaded to an + * address register using an OR instruction. The returned instruction can be + * passed as argument to the usual brw_set_*_message() functions in order to + * specify any additional descriptor bits -- If \p desc is an immediate this + * will be the SEND instruction itself, otherwise it will be the OR + * instruction. + */ +struct brw_inst * +brw_send_indirect_message(struct brw_codegen *p, + unsigned sfid, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg desc); + +void brw_ff_sync(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + bool allocate, + unsigned response_length, + bool eot); + +void brw_svb_write(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + unsigned binding_table_index, + bool send_commit_msg); + +void brw_fb_WRITE(struct brw_codegen *p, + struct brw_reg payload, + struct brw_reg implied_header, + unsigned msg_control, + unsigned binding_table_index, + unsigned msg_length, + unsigned response_length, + bool eot, + bool last_render_target, + bool header_present); + +brw_inst *gen9_fb_READ(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg payload, + unsigned binding_table_index, + unsigned msg_length, + unsigned response_length, + bool per_sample); + +void brw_SAMPLE(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + unsigned 
binding_table_index, + unsigned sampler, + unsigned msg_type, + unsigned response_length, + unsigned msg_length, + unsigned header_present, + unsigned simd_mode, + unsigned return_format); + +void brw_adjust_sampler_state_pointer(struct brw_codegen *p, + struct brw_reg header, + struct brw_reg sampler_index); + +void gen4_math(struct brw_codegen *p, + struct brw_reg dest, + unsigned function, + unsigned msg_reg_nr, + struct brw_reg src, + unsigned precision ); + +void gen6_math(struct brw_codegen *p, + struct brw_reg dest, + unsigned function, + struct brw_reg src0, + struct brw_reg src1); + +void brw_oword_block_read(struct brw_codegen *p, + struct brw_reg dest, + struct brw_reg mrf, + uint32_t offset, + uint32_t bind_table_index); + +unsigned brw_scratch_surface_idx(const struct brw_codegen *p); + +void brw_oword_block_read_scratch(struct brw_codegen *p, + struct brw_reg dest, + struct brw_reg mrf, + int num_regs, + unsigned offset); + +void brw_oword_block_write_scratch(struct brw_codegen *p, + struct brw_reg mrf, + int num_regs, + unsigned offset); + +void gen7_block_read_scratch(struct brw_codegen *p, + struct brw_reg dest, + int num_regs, + unsigned offset); + +void brw_shader_time_add(struct brw_codegen *p, + struct brw_reg payload, + uint32_t surf_index); + +/** + * Return the generation-specific jump distance scaling factor. + * + * Given the number of instructions to jump, we need to scale by + * some number to obtain the actual jump distance to program in an + * instruction. + */ +static inline unsigned +brw_jump_scale(const struct gen_device_info *devinfo) +{ + /* Broadwell measures jump targets in bytes. */ + if (devinfo->gen >= 8) + return 16; + + /* Ironlake and later measure jump targets in 64-bit data chunks (in order + * to support compaction), so each 128-bit instruction requires 2 chunks. + */ + if (devinfo->gen >= 5) + return 2; + + /* Gen4 simply uses the number of 128-bit instructions. 
*/ + return 1; +} + +void brw_barrier(struct brw_codegen *p, struct brw_reg src); + +/* If/else/endif. Works by manipulating the execution flags on each + * channel. + */ +brw_inst *brw_IF(struct brw_codegen *p, unsigned execute_size); +brw_inst *gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional, + struct brw_reg src0, struct brw_reg src1); + +void brw_ELSE(struct brw_codegen *p); +void brw_ENDIF(struct brw_codegen *p); + +/* DO/WHILE loops: + */ +brw_inst *brw_DO(struct brw_codegen *p, unsigned execute_size); + +brw_inst *brw_WHILE(struct brw_codegen *p); + +brw_inst *brw_BREAK(struct brw_codegen *p); +brw_inst *brw_CONT(struct brw_codegen *p); +brw_inst *gen6_HALT(struct brw_codegen *p); + +/* Forward jumps: + */ +void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx); + +brw_inst *brw_JMPI(struct brw_codegen *p, struct brw_reg index, + unsigned predicate_control); + +void brw_NOP(struct brw_codegen *p); + +void brw_WAIT(struct brw_codegen *p); + +/* Special case: there is never a destination, execution size will be + * taken from src0: + */ +void brw_CMP(struct brw_codegen *p, + struct brw_reg dest, + unsigned conditional, + struct brw_reg src0, + struct brw_reg src1); + +void +brw_untyped_atomic(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg surface, + unsigned atomic_op, + unsigned msg_length, + bool response_expected); + +void +brw_untyped_surface_read(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg surface, + unsigned msg_length, + unsigned num_channels); + +void +brw_untyped_surface_write(struct brw_codegen *p, + struct brw_reg payload, + struct brw_reg surface, + unsigned msg_length, + unsigned num_channels); + +void +brw_typed_atomic(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg surface, + unsigned atomic_op, + unsigned msg_length, + bool response_expected); + +void +brw_typed_surface_read(struct brw_codegen 
*p, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg surface, + unsigned msg_length, + unsigned num_channels); + +void +brw_typed_surface_write(struct brw_codegen *p, + struct brw_reg payload, + struct brw_reg surface, + unsigned msg_length, + unsigned num_channels); + +void +brw_memory_fence(struct brw_codegen *p, + struct brw_reg dst); + +void +brw_pixel_interpolator_query(struct brw_codegen *p, + struct brw_reg dest, + struct brw_reg mrf, + bool noperspective, + unsigned mode, + struct brw_reg data, + unsigned msg_length, + unsigned response_length); + +void +brw_find_live_channel(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg mask); + +void +brw_broadcast(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + struct brw_reg idx); + +/*********************************************************************** + * brw_eu_util.c: + */ + +void brw_copy_indirect_to_indirect(struct brw_codegen *p, + struct brw_indirect dst_ptr, + struct brw_indirect src_ptr, + unsigned count); + +void brw_copy_from_indirect(struct brw_codegen *p, + struct brw_reg dst, + struct brw_indirect ptr, + unsigned count); + +void brw_copy4(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + unsigned count); + +void brw_copy8(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + unsigned count); + +void brw_math_invert( struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src); + +void brw_set_src1(struct brw_codegen *p, brw_inst *insn, struct brw_reg reg); + +void brw_set_uip_jip(struct brw_codegen *p, int start_offset); + +enum brw_conditional_mod brw_negate_cmod(uint32_t cmod); +enum brw_conditional_mod brw_swap_cmod(uint32_t cmod); + +/* brw_eu_compact.c */ +void brw_init_compaction_tables(const struct gen_device_info *devinfo); +void brw_compact_instructions(struct brw_codegen *p, int start_offset, + int num_annotations, struct annotation *annotation); +void brw_uncompact_instruction(const struct 
gen_device_info *devinfo, + brw_inst *dst, brw_compact_inst *src); +bool brw_try_compact_instruction(const struct gen_device_info *devinfo, + brw_compact_inst *dst, brw_inst *src); + +void brw_debug_compact_uncompact(const struct gen_device_info *devinfo, + brw_inst *orig, brw_inst *uncompacted); + +/* brw_eu_validate.c */ +bool brw_validate_instructions(const struct brw_codegen *p, int start_offset, + struct annotation_info *annotation); + +static inline int +next_offset(const struct gen_device_info *devinfo, void *store, int offset) +{ + brw_inst *insn = (brw_inst *)((char *)store + offset); + + if (brw_inst_cmpt_control(devinfo, insn)) + return offset + 8; + else + return offset + 16; +} + +struct opcode_desc { + /* The union is an implementation detail used by brw_opcode_desc() to handle + * opcodes that have been reused for different instructions across hardware + * generations. + * + * The gens field acts as a tag. If it is non-zero, name points to a string + * containing the instruction mnemonic. If it is zero, the table field is + * valid and either points to a secondary opcode_desc table with 'size' + * elements or is NULL and no such instruction exists for the opcode. + */ + union { + struct { + char *name; + int nsrc; + }; + struct { + const struct opcode_desc *table; + unsigned size; + }; + }; + int ndst; + int gens; +}; + +const struct opcode_desc * +brw_opcode_desc(const struct gen_device_info *devinfo, enum opcode opcode); + +static inline bool +is_3src(const struct gen_device_info *devinfo, enum opcode opcode) +{ + const struct opcode_desc *desc = brw_opcode_desc(devinfo, opcode); + return desc && desc->nsrc == 3; +} + +/** Maximum SEND message length */ +#define BRW_MAX_MSG_LENGTH 15 + +/** First MRF register used by spills */ +#define FIRST_SPILL_MRF(gen) ((gen) == 6 ? 21 : 13) + +/** First MRF register used by pull loads */ +#define FIRST_PULL_LOAD_MRF(gen) ((gen) == 6 ? 
16 : 13) + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/intel/compiler/brw_eu_compact.c b/src/intel/compiler/brw_eu_compact.c new file mode 100644 index 00000000000..b2af76d533a --- /dev/null +++ b/src/intel/compiler/brw_eu_compact.c @@ -0,0 +1,1579 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_eu_compact.c + * + * Instruction compaction is a feature of G45 and newer hardware that allows + * for a smaller instruction encoding. + * + * The instruction cache is on the order of 32KB, and many programs generate + * far more instructions than that. The instruction cache is built to barely + * keep up with instruction dispatch ability in cache hit cases -- L1 + * instruction cache misses that still hit in the next level could limit + * throughput by around 50%. 
+ * + * The idea of instruction compaction is that most instructions use a tiny + * subset of the GPU functionality, so we can encode what would be a 16 byte + * instruction in 8 bytes using some lookup tables for various fields. + * + * + * Instruction compaction capabilities vary subtly by generation. + * + * G45's support for instruction compaction is very limited. Jump counts on + * this generation are in units of 16-byte uncompacted instructions. As such, + * all jump targets must be 16-byte aligned. Also, all instructions must be + * naturally aligned, i.e. uncompacted instructions must be 16-byte aligned. + * A G45-only instruction, NENOP, must be used to provide padding to align + * uncompacted instructions. + * + * Gen5 removes these restrictions and changes jump counts to be in units of + * 8-byte compacted instructions, allowing jump targets to be only 8-byte + * aligned. Uncompacted instructions can also be placed on 8-byte boundaries. + * + * Gen6 adds the ability to compact instructions with a limited range of + * immediate values. Compactable immediates have 12 unrestricted bits, and a + * 13th bit that's replicated through the high 20 bits, to create the 32-bit + * value of DW3 in the uncompacted instruction word. + * + * On Gen7 we can compact some control flow instructions with a small positive + * immediate in the low bits of DW3, like ENDIF with the JIP field. Other + * control flow instructions with UIP cannot be compacted, because of the + * replicated 13th bit. No control flow instructions can be compacted on Gen6 + * since the jump count field is not in DW3. + * + * break JIP/UIP + * cont JIP/UIP + * halt JIP/UIP + * if JIP/UIP + * else JIP (plus UIP on BDW+) + * endif JIP + * while JIP (must be negative) + * + * Gen 8 adds support for compacting 3-src instructions. 
+ */ + +#include "brw_eu.h" +#include "brw_shader.h" +#include "intel_asm_annotation.h" +#include "common/gen_debug.h" + +static const uint32_t g45_control_index_table[32] = { + 0b00000000000000000, + 0b01000000000000000, + 0b00110000000000000, + 0b00000000000000010, + 0b00100000000000000, + 0b00010000000000000, + 0b01000000000100000, + 0b01000000100000000, + 0b01010000000100000, + 0b00000000100000010, + 0b11000000000000000, + 0b00001000100000010, + 0b01001000100000000, + 0b00000000100000000, + 0b11000000000100000, + 0b00001000100000000, + 0b10110000000000000, + 0b11010000000100000, + 0b00110000100000000, + 0b00100000100000000, + 0b01000000000001000, + 0b01000000000000100, + 0b00111100000000000, + 0b00101011000000000, + 0b00110000000010000, + 0b00010000100000000, + 0b01000000000100100, + 0b01000000000101000, + 0b00110000000000110, + 0b00000000000001010, + 0b01010000000101000, + 0b01010000000100100 +}; + +static const uint32_t g45_datatype_table[32] = { + 0b001000000000100001, + 0b001011010110101101, + 0b001000001000110001, + 0b001111011110111101, + 0b001011010110101100, + 0b001000000110101101, + 0b001000000000100000, + 0b010100010110110001, + 0b001100011000101101, + 0b001000000000100010, + 0b001000001000110110, + 0b010000001000110001, + 0b001000001000110010, + 0b011000001000110010, + 0b001111011110111100, + 0b001000000100101000, + 0b010100011000110001, + 0b001010010100101001, + 0b001000001000101001, + 0b010000001000110110, + 0b101000001000110001, + 0b001011011000101101, + 0b001000000100001001, + 0b001011011000101100, + 0b110100011000110001, + 0b001000001110111101, + 0b110000001000110001, + 0b011000000100101010, + 0b101000001000101001, + 0b001011010110001100, + 0b001000000110100001, + 0b001010010100001000 +}; + +static const uint16_t g45_subreg_table[32] = { + 0b000000000000000, + 0b000000010000000, + 0b000001000000000, + 0b000100000000000, + 0b000000000100000, + 0b100000000000000, + 0b000000000010000, + 0b001100000000000, + 0b001010000000000, + 0b000000100000000, + 
0b001000000000000, + 0b000000000001000, + 0b000000001000000, + 0b000000000000001, + 0b000010000000000, + 0b000000010100000, + 0b000000000000111, + 0b000001000100000, + 0b011000000000000, + 0b000000110000000, + 0b000000000000010, + 0b000000000000100, + 0b000000001100000, + 0b000100000000010, + 0b001110011000110, + 0b001110100001000, + 0b000110011000110, + 0b000001000011000, + 0b000110010000100, + 0b001100000000110, + 0b000000010000110, + 0b000001000110000 +}; + +static const uint16_t g45_src_index_table[32] = { + 0b000000000000, + 0b010001101000, + 0b010110001000, + 0b011010010000, + 0b001101001000, + 0b010110001010, + 0b010101110000, + 0b011001111000, + 0b001000101000, + 0b000000101000, + 0b010001010000, + 0b111101101100, + 0b010110001100, + 0b010001101100, + 0b011010010100, + 0b010001001100, + 0b001100101000, + 0b000000000010, + 0b111101001100, + 0b011001101000, + 0b010101001000, + 0b000000000100, + 0b000000101100, + 0b010001101010, + 0b000000111000, + 0b010101011000, + 0b000100100000, + 0b010110000000, + 0b010000000100, + 0b010000111000, + 0b000101100000, + 0b111101110100 +}; + +static const uint32_t gen6_control_index_table[32] = { + 0b00000000000000000, + 0b01000000000000000, + 0b00110000000000000, + 0b00000000100000000, + 0b00010000000000000, + 0b00001000100000000, + 0b00000000100000010, + 0b00000000000000010, + 0b01000000100000000, + 0b01010000000000000, + 0b10110000000000000, + 0b00100000000000000, + 0b11010000000000000, + 0b11000000000000000, + 0b01001000100000000, + 0b01000000000001000, + 0b01000000000000100, + 0b00000000000001000, + 0b00000000000000100, + 0b00111000100000000, + 0b00001000100000010, + 0b00110000100000000, + 0b00110000000000001, + 0b00100000000000001, + 0b00110000000000010, + 0b00110000000000101, + 0b00110000000001001, + 0b00110000000010000, + 0b00110000000000011, + 0b00110000000000100, + 0b00110000100001000, + 0b00100000000001001 +}; + +static const uint32_t gen6_datatype_table[32] = { + 0b001001110000000000, + 0b001000110000100000, + 
0b001001110000000001, + 0b001000000001100000, + 0b001010110100101001, + 0b001000000110101101, + 0b001100011000101100, + 0b001011110110101101, + 0b001000000111101100, + 0b001000000001100001, + 0b001000110010100101, + 0b001000000001000001, + 0b001000001000110001, + 0b001000001000101001, + 0b001000000000100000, + 0b001000001000110010, + 0b001010010100101001, + 0b001011010010100101, + 0b001000000110100101, + 0b001100011000101001, + 0b001011011000101100, + 0b001011010110100101, + 0b001011110110100101, + 0b001111011110111101, + 0b001111011110111100, + 0b001111011110111101, + 0b001111011110011101, + 0b001111011110111110, + 0b001000000000100001, + 0b001000000000100010, + 0b001001111111011101, + 0b001000001110111110, +}; + +static const uint16_t gen6_subreg_table[32] = { + 0b000000000000000, + 0b000000000000100, + 0b000000110000000, + 0b111000000000000, + 0b011110000001000, + 0b000010000000000, + 0b000000000010000, + 0b000110000001100, + 0b001000000000000, + 0b000001000000000, + 0b000001010010100, + 0b000000001010110, + 0b010000000000000, + 0b110000000000000, + 0b000100000000000, + 0b000000010000000, + 0b000000000001000, + 0b100000000000000, + 0b000001010000000, + 0b001010000000000, + 0b001100000000000, + 0b000000001010100, + 0b101101010010100, + 0b010100000000000, + 0b000000010001111, + 0b011000000000000, + 0b111110000000000, + 0b101000000000000, + 0b000000000001111, + 0b000100010001111, + 0b001000010001111, + 0b000110000000000, +}; + +static const uint16_t gen6_src_index_table[32] = { + 0b000000000000, + 0b010110001000, + 0b010001101000, + 0b001000101000, + 0b011010010000, + 0b000100100000, + 0b010001101100, + 0b010101110000, + 0b011001111000, + 0b001100101000, + 0b010110001100, + 0b001000100000, + 0b010110001010, + 0b000000000010, + 0b010101010000, + 0b010101101000, + 0b111101001100, + 0b111100101100, + 0b011001110000, + 0b010110001001, + 0b010101011000, + 0b001101001000, + 0b010000101100, + 0b010000000000, + 0b001101110000, + 0b001100010000, + 0b001100000000, + 
0b010001101010, + 0b001101111000, + 0b000001110000, + 0b001100100000, + 0b001101010000, +}; + +static const uint32_t gen7_control_index_table[32] = { + 0b0000000000000000010, + 0b0000100000000000000, + 0b0000100000000000001, + 0b0000100000000000010, + 0b0000100000000000011, + 0b0000100000000000100, + 0b0000100000000000101, + 0b0000100000000000111, + 0b0000100000000001000, + 0b0000100000000001001, + 0b0000100000000001101, + 0b0000110000000000000, + 0b0000110000000000001, + 0b0000110000000000010, + 0b0000110000000000011, + 0b0000110000000000100, + 0b0000110000000000101, + 0b0000110000000000111, + 0b0000110000000001001, + 0b0000110000000001101, + 0b0000110000000010000, + 0b0000110000100000000, + 0b0001000000000000000, + 0b0001000000000000010, + 0b0001000000000000100, + 0b0001000000100000000, + 0b0010110000000000000, + 0b0010110000000010000, + 0b0011000000000000000, + 0b0011000000100000000, + 0b0101000000000000000, + 0b0101000000100000000 +}; + +static const uint32_t gen7_datatype_table[32] = { + 0b001000000000000001, + 0b001000000000100000, + 0b001000000000100001, + 0b001000000001100001, + 0b001000000010111101, + 0b001000001011111101, + 0b001000001110100001, + 0b001000001110100101, + 0b001000001110111101, + 0b001000010000100001, + 0b001000110000100000, + 0b001000110000100001, + 0b001001010010100101, + 0b001001110010100100, + 0b001001110010100101, + 0b001111001110111101, + 0b001111011110011101, + 0b001111011110111100, + 0b001111011110111101, + 0b001111111110111100, + 0b000000001000001100, + 0b001000000000111101, + 0b001000000010100101, + 0b001000010000100000, + 0b001001010010100100, + 0b001001110010000100, + 0b001010010100001001, + 0b001101111110111101, + 0b001111111110111101, + 0b001011110110101100, + 0b001010010100101000, + 0b001010110100101000 +}; + +static const uint16_t gen7_subreg_table[32] = { + 0b000000000000000, + 0b000000000000001, + 0b000000000001000, + 0b000000000001111, + 0b000000000010000, + 0b000000010000000, + 0b000000100000000, + 0b000000110000000, + 
0b000001000000000, + 0b000001000010000, + 0b000010100000000, + 0b001000000000000, + 0b001000000000001, + 0b001000010000001, + 0b001000010000010, + 0b001000010000011, + 0b001000010000100, + 0b001000010000111, + 0b001000010001000, + 0b001000010001110, + 0b001000010001111, + 0b001000110000000, + 0b001000111101000, + 0b010000000000000, + 0b010000110000000, + 0b011000000000000, + 0b011110010000111, + 0b100000000000000, + 0b101000000000000, + 0b110000000000000, + 0b111000000000000, + 0b111000000011100 +}; + +static const uint16_t gen7_src_index_table[32] = { + 0b000000000000, + 0b000000000010, + 0b000000010000, + 0b000000010010, + 0b000000011000, + 0b000000100000, + 0b000000101000, + 0b000001001000, + 0b000001010000, + 0b000001110000, + 0b000001111000, + 0b001100000000, + 0b001100000010, + 0b001100001000, + 0b001100010000, + 0b001100010010, + 0b001100100000, + 0b001100101000, + 0b001100111000, + 0b001101000000, + 0b001101000010, + 0b001101001000, + 0b001101010000, + 0b001101100000, + 0b001101101000, + 0b001101110000, + 0b001101110001, + 0b001101111000, + 0b010001101000, + 0b010001101001, + 0b010001101010, + 0b010110001000 +}; + +static const uint32_t gen8_control_index_table[32] = { + 0b0000000000000000010, + 0b0000100000000000000, + 0b0000100000000000001, + 0b0000100000000000010, + 0b0000100000000000011, + 0b0000100000000000100, + 0b0000100000000000101, + 0b0000100000000000111, + 0b0000100000000001000, + 0b0000100000000001001, + 0b0000100000000001101, + 0b0000110000000000000, + 0b0000110000000000001, + 0b0000110000000000010, + 0b0000110000000000011, + 0b0000110000000000100, + 0b0000110000000000101, + 0b0000110000000000111, + 0b0000110000000001001, + 0b0000110000000001101, + 0b0000110000000010000, + 0b0000110000100000000, + 0b0001000000000000000, + 0b0001000000000000010, + 0b0001000000000000100, + 0b0001000000100000000, + 0b0010110000000000000, + 0b0010110000000010000, + 0b0011000000000000000, + 0b0011000000100000000, + 0b0101000000000000000, + 0b0101000000100000000 +}; 
+ +static const uint32_t gen8_datatype_table[32] = { + 0b001000000000000000001, + 0b001000000000001000000, + 0b001000000000001000001, + 0b001000000000011000001, + 0b001000000000101011101, + 0b001000000010111011101, + 0b001000000011101000001, + 0b001000000011101000101, + 0b001000000011101011101, + 0b001000001000001000001, + 0b001000011000001000000, + 0b001000011000001000001, + 0b001000101000101000101, + 0b001000111000101000100, + 0b001000111000101000101, + 0b001011100011101011101, + 0b001011101011100011101, + 0b001011101011101011100, + 0b001011101011101011101, + 0b001011111011101011100, + 0b000000000010000001100, + 0b001000000000001011101, + 0b001000000000101000101, + 0b001000001000001000000, + 0b001000101000101000100, + 0b001000111000100000100, + 0b001001001001000001001, + 0b001010111011101011101, + 0b001011111011101011101, + 0b001001111001101001100, + 0b001001001001001001000, + 0b001001011001001001000 +}; + +static const uint16_t gen8_subreg_table[32] = { + 0b000000000000000, + 0b000000000000001, + 0b000000000001000, + 0b000000000001111, + 0b000000000010000, + 0b000000010000000, + 0b000000100000000, + 0b000000110000000, + 0b000001000000000, + 0b000001000010000, + 0b000001010000000, + 0b001000000000000, + 0b001000000000001, + 0b001000010000001, + 0b001000010000010, + 0b001000010000011, + 0b001000010000100, + 0b001000010000111, + 0b001000010001000, + 0b001000010001110, + 0b001000010001111, + 0b001000110000000, + 0b001000111101000, + 0b010000000000000, + 0b010000110000000, + 0b011000000000000, + 0b011110010000111, + 0b100000000000000, + 0b101000000000000, + 0b110000000000000, + 0b111000000000000, + 0b111000000011100 +}; + +static const uint16_t gen8_src_index_table[32] = { + 0b000000000000, + 0b000000000010, + 0b000000010000, + 0b000000010010, + 0b000000011000, + 0b000000100000, + 0b000000101000, + 0b000001001000, + 0b000001010000, + 0b000001110000, + 0b000001111000, + 0b001100000000, + 0b001100000010, + 0b001100001000, + 0b001100010000, + 0b001100010010, + 
0b001100100000, + 0b001100101000, + 0b001100111000, + 0b001101000000, + 0b001101000010, + 0b001101001000, + 0b001101010000, + 0b001101100000, + 0b001101101000, + 0b001101110000, + 0b001101110001, + 0b001101111000, + 0b010001101000, + 0b010001101001, + 0b010001101010, + 0b010110001000 +}; + +/* This is actually the control index table for Cherryview (26 bits), but the + * only difference from Broadwell (24 bits) is that it has two extra 0-bits at + * the start. + * + * The low 24 bits have the same mappings on both hardware. + */ +static const uint32_t gen8_3src_control_index_table[4] = { + 0b00100000000110000000000001, + 0b00000000000110000000000001, + 0b00000000001000000000000001, + 0b00000000001000000000100001 +}; + +/* This is actually the control index table for Cherryview (49 bits), but the + * only difference from Broadwell (46 bits) is that it has three extra 0-bits + * at the start. + * + * The low 44 bits have the same mappings on both hardware, and since the high + * three bits on Broadwell are zero, we can reuse Cherryview's table. + */ +static const uint64_t gen8_3src_source_index_table[4] = { + 0b0000001110010011100100111001000001111000000000000, + 0b0000001110010011100100111001000001111000000000010, + 0b0000001110010011100100111001000001111000000001000, + 0b0000001110010011100100111001000001111000000100000 +}; + +static const uint32_t *control_index_table; +static const uint32_t *datatype_table; +static const uint16_t *subreg_table; +static const uint16_t *src_index_table; + +static bool +set_control_index(const struct gen_device_info *devinfo, + brw_compact_inst *dst, brw_inst *src) +{ + uint32_t uncompacted = devinfo->gen >= 8 /* 17b/G45; 19b/IVB+ */ + ? 
(brw_inst_bits(src, 33, 31) << 16) | /* 3b */ + (brw_inst_bits(src, 23, 12) << 4) | /* 12b */ + (brw_inst_bits(src, 10, 9) << 2) | /* 2b */ + (brw_inst_bits(src, 34, 34) << 1) | /* 1b */ + (brw_inst_bits(src, 8, 8)) /* 1b */ + : (brw_inst_bits(src, 31, 31) << 16) | /* 1b */ + (brw_inst_bits(src, 23, 8)); /* 16b */ + + /* On gen7, the flag register and subregister numbers are integrated into + * the control index. + */ + if (devinfo->gen == 7) + uncompacted |= brw_inst_bits(src, 90, 89) << 17; /* 2b */ + + for (int i = 0; i < 32; i++) { + if (control_index_table[i] == uncompacted) { + brw_compact_inst_set_control_index(devinfo, dst, i); + return true; + } + } + + return false; +} + +static bool +set_datatype_index(const struct gen_device_info *devinfo, brw_compact_inst *dst, + brw_inst *src) +{ + uint32_t uncompacted = devinfo->gen >= 8 /* 18b/G45+; 21b/BDW+ */ + ? (brw_inst_bits(src, 63, 61) << 18) | /* 3b */ + (brw_inst_bits(src, 94, 89) << 12) | /* 6b */ + (brw_inst_bits(src, 46, 35)) /* 12b */ + : (brw_inst_bits(src, 63, 61) << 15) | /* 3b */ + (brw_inst_bits(src, 46, 32)); /* 15b */ + + for (int i = 0; i < 32; i++) { + if (datatype_table[i] == uncompacted) { + brw_compact_inst_set_datatype_index(devinfo, dst, i); + return true; + } + } + + return false; +} + +static bool +set_subreg_index(const struct gen_device_info *devinfo, brw_compact_inst *dst, + brw_inst *src, bool is_immediate) +{ + uint16_t uncompacted = /* 15b */ + (brw_inst_bits(src, 52, 48) << 0) | /* 5b */ + (brw_inst_bits(src, 68, 64) << 5); /* 5b */ + + if (!is_immediate) + uncompacted |= brw_inst_bits(src, 100, 96) << 10; /* 5b */ + + for (int i = 0; i < 32; i++) { + if (subreg_table[i] == uncompacted) { + brw_compact_inst_set_subreg_index(devinfo, dst, i); + return true; + } + } + + return false; +} + +static bool +get_src_index(uint16_t uncompacted, + uint16_t *compacted) +{ + for (int i = 0; i < 32; i++) { + if (src_index_table[i] == uncompacted) { + *compacted = i; + return true; + } + } + + 
return false; +} + +static bool +set_src0_index(const struct gen_device_info *devinfo, + brw_compact_inst *dst, brw_inst *src) +{ + uint16_t compacted; + uint16_t uncompacted = brw_inst_bits(src, 88, 77); /* 12b */ + + if (!get_src_index(uncompacted, &compacted)) + return false; + + brw_compact_inst_set_src0_index(devinfo, dst, compacted); + + return true; +} + +static bool +set_src1_index(const struct gen_device_info *devinfo, brw_compact_inst *dst, + brw_inst *src, bool is_immediate) +{ + uint16_t compacted; + + if (is_immediate) { + compacted = (brw_inst_imm_ud(devinfo, src) >> 8) & 0x1f; + } else { + uint16_t uncompacted = brw_inst_bits(src, 120, 109); /* 12b */ + + if (!get_src_index(uncompacted, &compacted)) + return false; + } + + brw_compact_inst_set_src1_index(devinfo, dst, compacted); + + return true; +} + +static bool +set_3src_control_index(const struct gen_device_info *devinfo, + brw_compact_inst *dst, brw_inst *src) +{ + assert(devinfo->gen >= 8); + + uint32_t uncompacted = /* 24b/BDW; 26b/CHV */ + (brw_inst_bits(src, 34, 32) << 21) | /* 3b */ + (brw_inst_bits(src, 28, 8)); /* 21b */ + + if (devinfo->gen >= 9 || devinfo->is_cherryview) + uncompacted |= brw_inst_bits(src, 36, 35) << 24; /* 2b */ + + for (unsigned i = 0; i < ARRAY_SIZE(gen8_3src_control_index_table); i++) { + if (gen8_3src_control_index_table[i] == uncompacted) { + brw_compact_inst_set_3src_control_index(devinfo, dst, i); + return true; + } + } + + return false; +} + +static bool +set_3src_source_index(const struct gen_device_info *devinfo, + brw_compact_inst *dst, brw_inst *src) +{ + assert(devinfo->gen >= 8); + + uint64_t uncompacted = /* 46b/BDW; 49b/CHV */ + (brw_inst_bits(src, 83, 83) << 43) | /* 1b */ + (brw_inst_bits(src, 114, 107) << 35) | /* 8b */ + (brw_inst_bits(src, 93, 86) << 27) | /* 8b */ + (brw_inst_bits(src, 72, 65) << 19) | /* 8b */ + (brw_inst_bits(src, 55, 37)); /* 19b */ + + if (devinfo->gen >= 9 || devinfo->is_cherryview) { + uncompacted |= + (brw_inst_bits(src, 
126, 125) << 47) | /* 2b */ + (brw_inst_bits(src, 105, 104) << 45) | /* 2b */ + (brw_inst_bits(src, 84, 84) << 44); /* 1b */ + } else { + uncompacted |= + (brw_inst_bits(src, 125, 125) << 45) | /* 1b */ + (brw_inst_bits(src, 104, 104) << 44); /* 1b */ + } + + for (unsigned i = 0; i < ARRAY_SIZE(gen8_3src_source_index_table); i++) { + if (gen8_3src_source_index_table[i] == uncompacted) { + brw_compact_inst_set_3src_source_index(devinfo, dst, i); + return true; + } + } + + return false; +} + +static bool +has_unmapped_bits(const struct gen_device_info *devinfo, brw_inst *src) +{ + /* EOT can only be mapped on a send if the src1 is an immediate */ + if ((brw_inst_opcode(devinfo, src) == BRW_OPCODE_SENDC || + brw_inst_opcode(devinfo, src) == BRW_OPCODE_SEND) && + brw_inst_eot(devinfo, src)) + return true; + + /* Check for instruction bits that don't map to any of the fields of the + * compacted instruction. The instruction cannot be compacted if any of + * them are set. They overlap with: + * - NibCtrl (bit 47 on Gen7, bit 11 on Gen8) + * - Dst.AddrImm[9] (bit 47 on Gen8) + * - Src0.AddrImm[9] (bit 95 on Gen8) + * - Imm64[27:31] (bits 91-95 on Gen7, bit 95 on Gen8) + * - UIP[31] (bit 95 on Gen8) + */ + if (devinfo->gen >= 8) { + assert(!brw_inst_bits(src, 7, 7)); + return brw_inst_bits(src, 95, 95) || + brw_inst_bits(src, 47, 47) || + brw_inst_bits(src, 11, 11); + } else { + assert(!brw_inst_bits(src, 7, 7) && + !(devinfo->gen < 7 && brw_inst_bits(src, 90, 90))); + return brw_inst_bits(src, 95, 91) || + brw_inst_bits(src, 47, 47); + } +} + +static bool +has_3src_unmapped_bits(const struct gen_device_info *devinfo, brw_inst *src) +{ + /* Check for three-source instruction bits that don't map to any of the + * fields of the compacted instruction. All of them seem to be reserved + * bits currently. 
+ */ + if (devinfo->gen >= 9 || devinfo->is_cherryview) { + assert(!brw_inst_bits(src, 127, 127) && + !brw_inst_bits(src, 7, 7)); + } else { + assert(devinfo->gen >= 8); + assert(!brw_inst_bits(src, 127, 126) && + !brw_inst_bits(src, 105, 105) && + !brw_inst_bits(src, 84, 84) && + !brw_inst_bits(src, 36, 35) && + !brw_inst_bits(src, 7, 7)); + } + + return false; +} + +static bool +brw_try_compact_3src_instruction(const struct gen_device_info *devinfo, + brw_compact_inst *dst, brw_inst *src) +{ + assert(devinfo->gen >= 8); + + if (has_3src_unmapped_bits(devinfo, src)) + return false; + +#define compact(field) \ + brw_compact_inst_set_3src_##field(devinfo, dst, brw_inst_3src_##field(devinfo, src)) + + compact(opcode); + + if (!set_3src_control_index(devinfo, dst, src)) + return false; + + if (!set_3src_source_index(devinfo, dst, src)) + return false; + + compact(dst_reg_nr); + compact(src0_rep_ctrl); + brw_compact_inst_set_3src_cmpt_control(devinfo, dst, true); + compact(debug_control); + compact(saturate); + compact(src1_rep_ctrl); + compact(src2_rep_ctrl); + compact(src0_reg_nr); + compact(src1_reg_nr); + compact(src2_reg_nr); + compact(src0_subreg_nr); + compact(src1_subreg_nr); + compact(src2_subreg_nr); + +#undef compact + + return true; +} + +/* Compacted instructions have 12-bits for immediate sources, and a 13th bit + * that's replicated through the high 20 bits. + * + * Effectively this means we get 12-bit integers, 0.0f, and some limited uses + * of packed vectors as compactable immediates. + */ +static bool +is_compactable_immediate(unsigned imm) +{ + /* We get the low 12 bits as-is. */ + imm &= ~0xfff; + + /* We get one bit replicated through the top 20 bits. */ + return imm == 0 || imm == 0xfffff000; +} + +/** + * Tries to compact instruction src into dst. + * + * It doesn't modify dst unless src is compactable, which is relied on by + * brw_compact_instructions(). 
+ */ +bool +brw_try_compact_instruction(const struct gen_device_info *devinfo, + brw_compact_inst *dst, brw_inst *src) +{ + brw_compact_inst temp; + + assert(brw_inst_cmpt_control(devinfo, src) == 0); + + if (is_3src(devinfo, brw_inst_opcode(devinfo, src))) { + if (devinfo->gen >= 8) { + memset(&temp, 0, sizeof(temp)); + if (brw_try_compact_3src_instruction(devinfo, &temp, src)) { + *dst = temp; + return true; + } else { + return false; + } + } else { + return false; + } + } + + bool is_immediate = + brw_inst_src0_reg_file(devinfo, src) == BRW_IMMEDIATE_VALUE || + brw_inst_src1_reg_file(devinfo, src) == BRW_IMMEDIATE_VALUE; + if (is_immediate && + (devinfo->gen < 6 || + !is_compactable_immediate(brw_inst_imm_ud(devinfo, src)))) { + return false; + } + + if (has_unmapped_bits(devinfo, src)) + return false; + + memset(&temp, 0, sizeof(temp)); + +#define compact(field) \ + brw_compact_inst_set_##field(devinfo, &temp, brw_inst_##field(devinfo, src)) + + compact(opcode); + compact(debug_control); + + if (!set_control_index(devinfo, &temp, src)) + return false; + if (!set_datatype_index(devinfo, &temp, src)) + return false; + if (!set_subreg_index(devinfo, &temp, src, is_immediate)) + return false; + + if (devinfo->gen >= 6) { + compact(acc_wr_control); + } else { + compact(mask_control_ex); + } + + compact(cond_modifier); + + if (devinfo->gen <= 6) + compact(flag_subreg_nr); + + brw_compact_inst_set_cmpt_control(devinfo, &temp, true); + + if (!set_src0_index(devinfo, &temp, src)) + return false; + if (!set_src1_index(devinfo, &temp, src, is_immediate)) + return false; + + brw_compact_inst_set_dst_reg_nr(devinfo, &temp, + brw_inst_dst_da_reg_nr(devinfo, src)); + brw_compact_inst_set_src0_reg_nr(devinfo, &temp, + brw_inst_src0_da_reg_nr(devinfo, src)); + + if (is_immediate) { + brw_compact_inst_set_src1_reg_nr(devinfo, &temp, + brw_inst_imm_ud(devinfo, src) & 0xff); + } else { + brw_compact_inst_set_src1_reg_nr(devinfo, &temp, + brw_inst_src1_da_reg_nr(devinfo, src)); + } 
+ +#undef compact + + *dst = temp; + + return true; +} + +static void +set_uncompacted_control(const struct gen_device_info *devinfo, brw_inst *dst, + brw_compact_inst *src) +{ + uint32_t uncompacted = + control_index_table[brw_compact_inst_control_index(devinfo, src)]; + + if (devinfo->gen >= 8) { + brw_inst_set_bits(dst, 33, 31, (uncompacted >> 16)); + brw_inst_set_bits(dst, 23, 12, (uncompacted >> 4) & 0xfff); + brw_inst_set_bits(dst, 10, 9, (uncompacted >> 2) & 0x3); + brw_inst_set_bits(dst, 34, 34, (uncompacted >> 1) & 0x1); + brw_inst_set_bits(dst, 8, 8, (uncompacted >> 0) & 0x1); + } else { + brw_inst_set_bits(dst, 31, 31, (uncompacted >> 16) & 0x1); + brw_inst_set_bits(dst, 23, 8, (uncompacted & 0xffff)); + + if (devinfo->gen == 7) + brw_inst_set_bits(dst, 90, 89, uncompacted >> 17); + } +} + +static void +set_uncompacted_datatype(const struct gen_device_info *devinfo, brw_inst *dst, + brw_compact_inst *src) +{ + uint32_t uncompacted = + datatype_table[brw_compact_inst_datatype_index(devinfo, src)]; + + if (devinfo->gen >= 8) { + brw_inst_set_bits(dst, 63, 61, (uncompacted >> 18)); + brw_inst_set_bits(dst, 94, 89, (uncompacted >> 12) & 0x3f); + brw_inst_set_bits(dst, 46, 35, (uncompacted >> 0) & 0xfff); + } else { + brw_inst_set_bits(dst, 63, 61, (uncompacted >> 15)); + brw_inst_set_bits(dst, 46, 32, (uncompacted & 0x7fff)); + } +} + +static void +set_uncompacted_subreg(const struct gen_device_info *devinfo, brw_inst *dst, + brw_compact_inst *src) +{ + uint16_t uncompacted = + subreg_table[brw_compact_inst_subreg_index(devinfo, src)]; + + brw_inst_set_bits(dst, 100, 96, (uncompacted >> 10)); + brw_inst_set_bits(dst, 68, 64, (uncompacted >> 5) & 0x1f); + brw_inst_set_bits(dst, 52, 48, (uncompacted >> 0) & 0x1f); +} + +static void +set_uncompacted_src0(const struct gen_device_info *devinfo, brw_inst *dst, + brw_compact_inst *src) +{ + uint32_t compacted = brw_compact_inst_src0_index(devinfo, src); + uint16_t uncompacted = src_index_table[compacted]; + + 
brw_inst_set_bits(dst, 88, 77, uncompacted); +} + +static void +set_uncompacted_src1(const struct gen_device_info *devinfo, brw_inst *dst, + brw_compact_inst *src, bool is_immediate) +{ + if (is_immediate) { + signed high5 = brw_compact_inst_src1_index(devinfo, src); + /* Replicate top bit of src1_index into high 20 bits of the immediate. */ + brw_inst_set_imm_ud(devinfo, dst, (high5 << 27) >> 19); + } else { + uint16_t uncompacted = + src_index_table[brw_compact_inst_src1_index(devinfo, src)]; + + brw_inst_set_bits(dst, 120, 109, uncompacted); + } +} + +static void +set_uncompacted_3src_control_index(const struct gen_device_info *devinfo, + brw_inst *dst, brw_compact_inst *src) +{ + assert(devinfo->gen >= 8); + + uint32_t compacted = brw_compact_inst_3src_control_index(devinfo, src); + uint32_t uncompacted = gen8_3src_control_index_table[compacted]; + + brw_inst_set_bits(dst, 34, 32, (uncompacted >> 21) & 0x7); + brw_inst_set_bits(dst, 28, 8, (uncompacted >> 0) & 0x1fffff); + + if (devinfo->gen >= 9 || devinfo->is_cherryview) + brw_inst_set_bits(dst, 36, 35, (uncompacted >> 24) & 0x3); +} + +static void +set_uncompacted_3src_source_index(const struct gen_device_info *devinfo, + brw_inst *dst, brw_compact_inst *src) +{ + assert(devinfo->gen >= 8); + + uint32_t compacted = brw_compact_inst_3src_source_index(devinfo, src); + uint64_t uncompacted = gen8_3src_source_index_table[compacted]; + + brw_inst_set_bits(dst, 83, 83, (uncompacted >> 43) & 0x1); + brw_inst_set_bits(dst, 114, 107, (uncompacted >> 35) & 0xff); + brw_inst_set_bits(dst, 93, 86, (uncompacted >> 27) & 0xff); + brw_inst_set_bits(dst, 72, 65, (uncompacted >> 19) & 0xff); + brw_inst_set_bits(dst, 55, 37, (uncompacted >> 0) & 0x7ffff); + + if (devinfo->gen >= 9 || devinfo->is_cherryview) { + brw_inst_set_bits(dst, 126, 125, (uncompacted >> 47) & 0x3); + brw_inst_set_bits(dst, 105, 104, (uncompacted >> 45) & 0x3); + brw_inst_set_bits(dst, 84, 84, (uncompacted >> 44) & 0x1); + } else { + 
brw_inst_set_bits(dst, 125, 125, (uncompacted >> 45) & 0x1); + brw_inst_set_bits(dst, 104, 104, (uncompacted >> 44) & 0x1); + } +} + +static void +brw_uncompact_3src_instruction(const struct gen_device_info *devinfo, + brw_inst *dst, brw_compact_inst *src) +{ + assert(devinfo->gen >= 8); + +#define uncompact(field) \ + brw_inst_set_3src_##field(devinfo, dst, brw_compact_inst_3src_##field(devinfo, src)) + + uncompact(opcode); + + set_uncompacted_3src_control_index(devinfo, dst, src); + set_uncompacted_3src_source_index(devinfo, dst, src); + + uncompact(dst_reg_nr); + uncompact(src0_rep_ctrl); + brw_inst_set_3src_cmpt_control(devinfo, dst, false); + uncompact(debug_control); + uncompact(saturate); + uncompact(src1_rep_ctrl); + uncompact(src2_rep_ctrl); + uncompact(src0_reg_nr); + uncompact(src1_reg_nr); + uncompact(src2_reg_nr); + uncompact(src0_subreg_nr); + uncompact(src1_subreg_nr); + uncompact(src2_subreg_nr); + +#undef uncompact +} + +void +brw_uncompact_instruction(const struct gen_device_info *devinfo, brw_inst *dst, + brw_compact_inst *src) +{ + memset(dst, 0, sizeof(*dst)); + + if (devinfo->gen >= 8 && + is_3src(devinfo, brw_compact_inst_3src_opcode(devinfo, src))) { + brw_uncompact_3src_instruction(devinfo, dst, src); + return; + } + +#define uncompact(field) \ + brw_inst_set_##field(devinfo, dst, brw_compact_inst_##field(devinfo, src)) + + uncompact(opcode); + uncompact(debug_control); + + set_uncompacted_control(devinfo, dst, src); + set_uncompacted_datatype(devinfo, dst, src); + + /* src0/1 register file fields are in the datatype table. 
*/ + bool is_immediate = brw_inst_src0_reg_file(devinfo, dst) == BRW_IMMEDIATE_VALUE || + brw_inst_src1_reg_file(devinfo, dst) == BRW_IMMEDIATE_VALUE; + + set_uncompacted_subreg(devinfo, dst, src); + + if (devinfo->gen >= 6) { + uncompact(acc_wr_control); + } else { + uncompact(mask_control_ex); + } + + uncompact(cond_modifier); + + if (devinfo->gen <= 6) + uncompact(flag_subreg_nr); + + set_uncompacted_src0(devinfo, dst, src); + set_uncompacted_src1(devinfo, dst, src, is_immediate); + + brw_inst_set_dst_da_reg_nr(devinfo, dst, + brw_compact_inst_dst_reg_nr(devinfo, src)); + brw_inst_set_src0_da_reg_nr(devinfo, dst, + brw_compact_inst_src0_reg_nr(devinfo, src)); + + if (is_immediate) { + brw_inst_set_imm_ud(devinfo, dst, + brw_inst_imm_ud(devinfo, dst) | + brw_compact_inst_src1_reg_nr(devinfo, src)); + } else { + brw_inst_set_src1_da_reg_nr(devinfo, dst, + brw_compact_inst_src1_reg_nr(devinfo, src)); + } + +#undef uncompact +} + +void brw_debug_compact_uncompact(const struct gen_device_info *devinfo, + brw_inst *orig, + brw_inst *uncompacted) +{ + fprintf(stderr, "Instruction compact/uncompact changed (gen%d):\n", + devinfo->gen); + + fprintf(stderr, " before: "); + brw_disassemble_inst(stderr, devinfo, orig, true); + + fprintf(stderr, " after: "); + brw_disassemble_inst(stderr, devinfo, uncompacted, false); + + uint32_t *before_bits = (uint32_t *)orig; + uint32_t *after_bits = (uint32_t *)uncompacted; + fprintf(stderr, " changed bits:\n"); + for (int i = 0; i < 128; i++) { + uint32_t before = before_bits[i / 32] & (1 << (i & 31)); + uint32_t after = after_bits[i / 32] & (1 << (i & 31)); + + if (before != after) { + fprintf(stderr, " bit %d, %s to %s\n", i, + before ? "set" : "unset", + after ? 
"set" : "unset"); + } + } +} + +static int +compacted_between(int old_ip, int old_target_ip, int *compacted_counts) +{ + int this_compacted_count = compacted_counts[old_ip]; + int target_compacted_count = compacted_counts[old_target_ip]; + return target_compacted_count - this_compacted_count; +} + +static void +update_uip_jip(const struct gen_device_info *devinfo, brw_inst *insn, + int this_old_ip, int *compacted_counts) +{ + /* JIP and UIP are in units of: + * - bytes on Gen8+; and + * - compacted instructions on Gen6+. + */ + int shift = devinfo->gen >= 8 ? 3 : 0; + + int32_t jip_compacted = brw_inst_jip(devinfo, insn) >> shift; + jip_compacted -= compacted_between(this_old_ip, + this_old_ip + (jip_compacted / 2), + compacted_counts); + brw_inst_set_jip(devinfo, insn, jip_compacted << shift); + + if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_ENDIF || + brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE || + (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_ELSE && devinfo->gen <= 7)) + return; + + int32_t uip_compacted = brw_inst_uip(devinfo, insn) >> shift; + uip_compacted -= compacted_between(this_old_ip, + this_old_ip + (uip_compacted / 2), + compacted_counts); + brw_inst_set_uip(devinfo, insn, uip_compacted << shift); +} + +static void +update_gen4_jump_count(const struct gen_device_info *devinfo, brw_inst *insn, + int this_old_ip, int *compacted_counts) +{ + assert(devinfo->gen == 5 || devinfo->is_g4x); + + /* Jump Count is in units of: + * - uncompacted instructions on G45; and + * - compacted instructions on Gen5. + */ + int shift = devinfo->is_g4x ? 
1 : 0; + + int jump_count_compacted = brw_inst_gen4_jump_count(devinfo, insn) << shift; + + int target_old_ip = this_old_ip + (jump_count_compacted / 2); + + int this_compacted_count = compacted_counts[this_old_ip]; + int target_compacted_count = compacted_counts[target_old_ip]; + + jump_count_compacted -= (target_compacted_count - this_compacted_count); + brw_inst_set_gen4_jump_count(devinfo, insn, jump_count_compacted >> shift); +} + +void +brw_init_compaction_tables(const struct gen_device_info *devinfo) +{ + assert(g45_control_index_table[ARRAY_SIZE(g45_control_index_table) - 1] != 0); + assert(g45_datatype_table[ARRAY_SIZE(g45_datatype_table) - 1] != 0); + assert(g45_subreg_table[ARRAY_SIZE(g45_subreg_table) - 1] != 0); + assert(g45_src_index_table[ARRAY_SIZE(g45_src_index_table) - 1] != 0); + assert(gen6_control_index_table[ARRAY_SIZE(gen6_control_index_table) - 1] != 0); + assert(gen6_datatype_table[ARRAY_SIZE(gen6_datatype_table) - 1] != 0); + assert(gen6_subreg_table[ARRAY_SIZE(gen6_subreg_table) - 1] != 0); + assert(gen6_src_index_table[ARRAY_SIZE(gen6_src_index_table) - 1] != 0); + assert(gen7_control_index_table[ARRAY_SIZE(gen7_control_index_table) - 1] != 0); + assert(gen7_datatype_table[ARRAY_SIZE(gen7_datatype_table) - 1] != 0); + assert(gen7_subreg_table[ARRAY_SIZE(gen7_subreg_table) - 1] != 0); + assert(gen7_src_index_table[ARRAY_SIZE(gen7_src_index_table) - 1] != 0); + assert(gen8_control_index_table[ARRAY_SIZE(gen8_control_index_table) - 1] != 0); + assert(gen8_datatype_table[ARRAY_SIZE(gen8_datatype_table) - 1] != 0); + assert(gen8_subreg_table[ARRAY_SIZE(gen8_subreg_table) - 1] != 0); + assert(gen8_src_index_table[ARRAY_SIZE(gen8_src_index_table) - 1] != 0); + + switch (devinfo->gen) { + case 9: + case 8: + control_index_table = gen8_control_index_table; + datatype_table = gen8_datatype_table; + subreg_table = gen8_subreg_table; + src_index_table = gen8_src_index_table; + break; + case 7: + control_index_table = gen7_control_index_table; + 
datatype_table = gen7_datatype_table; + subreg_table = gen7_subreg_table; + src_index_table = gen7_src_index_table; + break; + case 6: + control_index_table = gen6_control_index_table; + datatype_table = gen6_datatype_table; + subreg_table = gen6_subreg_table; + src_index_table = gen6_src_index_table; + break; + case 5: + case 4: + control_index_table = g45_control_index_table; + datatype_table = g45_datatype_table; + subreg_table = g45_subreg_table; + src_index_table = g45_src_index_table; + break; + default: + unreachable("unknown generation"); + } +} + +void +brw_compact_instructions(struct brw_codegen *p, int start_offset, + int num_annotations, struct annotation *annotation) +{ + if (unlikely(INTEL_DEBUG & DEBUG_NO_COMPACTION)) + return; + + const struct gen_device_info *devinfo = p->devinfo; + void *store = p->store + start_offset / 16; + /* For an instruction at byte offset 16*i before compaction, this is the + * number of compacted instructions minus the number of padding NOP/NENOPs + * that preceded it. + */ + int compacted_counts[(p->next_insn_offset - start_offset) / sizeof(brw_inst)]; + /* For an instruction at byte offset 8*i after compaction, this was its IP + * (in 16-byte units) before compaction. 
+ */ + int old_ip[(p->next_insn_offset - start_offset) / sizeof(brw_compact_inst)]; + + if (devinfo->gen == 4 && !devinfo->is_g4x) + return; + + int offset = 0; + int compacted_count = 0; + for (int src_offset = 0; src_offset < p->next_insn_offset - start_offset; + src_offset += sizeof(brw_inst)) { + brw_inst *src = store + src_offset; + void *dst = store + offset; + + old_ip[offset / sizeof(brw_compact_inst)] = src_offset / sizeof(brw_inst); + compacted_counts[src_offset / sizeof(brw_inst)] = compacted_count; + + /* dst and src are both offsets into store and may coincide, so keep a + * copy of the original instruction for the debug round-trip check below. + */ + brw_inst saved = *src; + + if (brw_try_compact_instruction(devinfo, dst, src)) { + compacted_count++; + + if (INTEL_DEBUG) { + brw_inst uncompacted; + brw_uncompact_instruction(devinfo, &uncompacted, dst); + if (memcmp(&saved, &uncompacted, sizeof(uncompacted))) { + brw_debug_compact_uncompact(devinfo, &saved, &uncompacted); + } + } + + offset += sizeof(brw_compact_inst); + } else { + /* All uncompacted instructions need to be aligned on G45. */ + if ((offset & sizeof(brw_compact_inst)) != 0 && devinfo->is_g4x){ + brw_compact_inst *align = store + offset; + memset(align, 0, sizeof(*align)); + brw_compact_inst_set_opcode(devinfo, align, BRW_OPCODE_NENOP); + brw_compact_inst_set_cmpt_control(devinfo, align, true); + offset += sizeof(brw_compact_inst); + compacted_count--; + compacted_counts[src_offset / sizeof(brw_inst)] = compacted_count; + old_ip[offset / sizeof(brw_compact_inst)] = src_offset / sizeof(brw_inst); + + dst = store + offset; + } + + /* If we didn't compact this instruction, we need to move it down into + * place. + */ + if (offset != src_offset) { + memmove(dst, src, sizeof(brw_inst)); + } + offset += sizeof(brw_inst); + } + } + + /* Fix up control flow offsets.
*/ + p->next_insn_offset = start_offset + offset; + for (offset = 0; offset < p->next_insn_offset - start_offset; + offset = next_offset(devinfo, store, offset)) { + brw_inst *insn = store + offset; + int this_old_ip = old_ip[offset / sizeof(brw_compact_inst)]; + int this_compacted_count = compacted_counts[this_old_ip]; + + switch (brw_inst_opcode(devinfo, insn)) { + case BRW_OPCODE_BREAK: + case BRW_OPCODE_CONTINUE: + case BRW_OPCODE_HALT: + if (devinfo->gen >= 6) { + update_uip_jip(devinfo, insn, this_old_ip, compacted_counts); + } else { + update_gen4_jump_count(devinfo, insn, this_old_ip, + compacted_counts); + } + break; + + case BRW_OPCODE_IF: + case BRW_OPCODE_IFF: + case BRW_OPCODE_ELSE: + case BRW_OPCODE_ENDIF: + case BRW_OPCODE_WHILE: + if (devinfo->gen >= 7) { + if (brw_inst_cmpt_control(devinfo, insn)) { + brw_inst uncompacted; + brw_uncompact_instruction(devinfo, &uncompacted, + (brw_compact_inst *)insn); + + update_uip_jip(devinfo, &uncompacted, this_old_ip, + compacted_counts); + + bool ret = brw_try_compact_instruction(devinfo, + (brw_compact_inst *)insn, + &uncompacted); + assert(ret); (void)ret; + } else { + update_uip_jip(devinfo, insn, this_old_ip, compacted_counts); + } + } else if (devinfo->gen == 6) { + assert(!brw_inst_cmpt_control(devinfo, insn)); + + /* Jump Count is in units of compacted instructions on Gen6. */ + int jump_count_compacted = brw_inst_gen6_jump_count(devinfo, insn); + + int target_old_ip = this_old_ip + (jump_count_compacted / 2); + int target_compacted_count = compacted_counts[target_old_ip]; + jump_count_compacted -= (target_compacted_count - this_compacted_count); + brw_inst_set_gen6_jump_count(devinfo, insn, jump_count_compacted); + } else { + update_gen4_jump_count(devinfo, insn, this_old_ip, + compacted_counts); + } + break; + + case BRW_OPCODE_ADD: + /* Add instructions modifying the IP register use an immediate src1, + * and Gens that use this cannot compact instructions with immediate + * operands. 
+ */ + if (brw_inst_cmpt_control(devinfo, insn)) + break; + + if (brw_inst_dst_reg_file(devinfo, insn) == BRW_ARCHITECTURE_REGISTER_FILE && + brw_inst_dst_da_reg_nr(devinfo, insn) == BRW_ARF_IP) { + assert(brw_inst_src1_reg_file(devinfo, insn) == BRW_IMMEDIATE_VALUE); + + int shift = 3; + int jump_compacted = brw_inst_imm_d(devinfo, insn) >> shift; + + int target_old_ip = this_old_ip + (jump_compacted / 2); + int target_compacted_count = compacted_counts[target_old_ip]; + jump_compacted -= (target_compacted_count - this_compacted_count); + brw_inst_set_imm_ud(devinfo, insn, jump_compacted << shift); + } + break; + } + } + + /* p->nr_insn is counting the number of uncompacted instructions still, so + * divide. We do want to be sure there's a valid instruction in any + * alignment padding, so that the next compression pass (for the FS 8/16 + * compile passes) parses correctly. + */ + if (p->next_insn_offset & sizeof(brw_compact_inst)) { + brw_compact_inst *align = store + offset; + memset(align, 0, sizeof(*align)); + brw_compact_inst_set_opcode(devinfo, align, BRW_OPCODE_NOP); + brw_compact_inst_set_cmpt_control(devinfo, align, true); + p->next_insn_offset += sizeof(brw_compact_inst); + } + p->nr_insn = p->next_insn_offset / sizeof(brw_inst); + + /* Update the instruction offsets for each annotation. 
*/ + if (annotation) { + for (int offset = 0, i = 0; i < num_annotations; i++) { + while (start_offset + old_ip[offset / sizeof(brw_compact_inst)] * + sizeof(brw_inst) != annotation[i].offset) { + assert(start_offset + old_ip[offset / sizeof(brw_compact_inst)] * + sizeof(brw_inst) < annotation[i].offset); + offset = next_offset(devinfo, store, offset); + } + + annotation[i].offset = start_offset + offset; + + offset = next_offset(devinfo, store, offset); + } + + annotation[num_annotations].offset = p->next_insn_offset; + } +} diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h new file mode 100644 index 00000000000..5848f920448 --- /dev/null +++ b/src/intel/compiler/brw_eu_defines.h @@ -0,0 +1,1246 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + +#ifndef BRW_EU_DEFINES_H +#define BRW_EU_DEFINES_H + +#include "util/macros.h" + +/* The following hunk, up to "Execution Unit", is used by both the + * intel/compiler and i965 codebases. */ + +/* Mask with bits [high:low] (inclusive) set. NOTE: a field covering all 32 + * bits would shift 1u by 32, which is undefined for a 32-bit unsigned int. */ +#define INTEL_MASK(high, low) (((1u<<((high)-(low)+1))-1)<<(low)) +/* Shift value into place and assert that it fits within field ## _MASK. + * Using the GNU statement expression extension */ +#define SET_FIELD(value, field) \ + ({ \ + uint32_t fieldval = (value) << field ## _SHIFT; \ + assert((fieldval & ~ field ## _MASK) == 0); \ + fieldval & field ## _MASK; \ + }) + +/* Extract bits [high:low] of data. Every argument is parenthesized so that + * a compound expression such as (a | b) is masked as a whole. */ +#define GET_BITS(data, high, low) (((data) & INTEL_MASK((high), (low))) >> (low)) +/* Extract the field selected by field ## _MASK and field ## _SHIFT from word. */ +#define GET_FIELD(word, field) (((word) & field ## _MASK) >> field ## _SHIFT) + +#define _3DPRIM_POINTLIST 0x01 +#define _3DPRIM_LINELIST 0x02 +#define _3DPRIM_LINESTRIP 0x03 +#define _3DPRIM_TRILIST 0x04 +#define _3DPRIM_TRISTRIP 0x05 +#define _3DPRIM_TRIFAN 0x06 +#define _3DPRIM_QUADLIST 0x07 +#define _3DPRIM_QUADSTRIP 0x08 +#define _3DPRIM_LINELIST_ADJ 0x09 /* G45+ */ +#define _3DPRIM_LINESTRIP_ADJ 0x0A /* G45+ */ +#define _3DPRIM_TRILIST_ADJ 0x0B /* G45+ */ +#define _3DPRIM_TRISTRIP_ADJ 0x0C /* G45+ */ +#define _3DPRIM_TRISTRIP_REVERSE 0x0D +#define _3DPRIM_POLYGON 0x0E +#define _3DPRIM_RECTLIST 0x0F +#define _3DPRIM_LINELOOP 0x10 +#define _3DPRIM_POINTLIST_BF 0x11 +#define _3DPRIM_LINESTRIP_CONT 0x12 +#define _3DPRIM_LINESTRIP_BF 0x13 +#define _3DPRIM_LINESTRIP_CONT_BF 0x14 +#define _3DPRIM_TRIFAN_NOSTIPPLE 0x16 +/* Patch lists use consecutive values starting at 0x20 for n == 1; n must be + * in the range 1..32 (asserted). */ +#define _3DPRIM_PATCHLIST(n) ({ assert((n) > 0 && (n) <= 32); 0x20 + ((n) - 1); }) + +enum brw_barycentric_mode { + BRW_BARYCENTRIC_PERSPECTIVE_PIXEL = 0, + BRW_BARYCENTRIC_PERSPECTIVE_CENTROID = 1, + BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE = 2, + BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL = 3, + BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID = 4, + BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE = 5, + BRW_BARYCENTRIC_MODE_COUNT = 6 +}; +#define BRW_BARYCENTRIC_NONPERSPECTIVE_BITS \ + 
((1 << BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL) | \ + (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID) | \ + (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE)) + +enum brw_pixel_shader_computed_depth_mode { + BRW_PSCDEPTH_OFF = 0, /* PS does not compute depth */ + BRW_PSCDEPTH_ON = 1, /* PS computes depth; no guarantee about value */ + BRW_PSCDEPTH_ON_GE = 2, /* PS guarantees output depth >= source depth */ + BRW_PSCDEPTH_ON_LE = 3, /* PS guarantees output depth <= source depth */ +}; + +/* Bitfields for the URB_WRITE message, DW2 of message header: */ +#define URB_WRITE_PRIM_END 0x1 +#define URB_WRITE_PRIM_START 0x2 +#define URB_WRITE_PRIM_TYPE_SHIFT 2 + +# define GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT 0 +# define GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID 1 + +/* Execution Unit (EU) defines + */ + +#define BRW_ALIGN_1 0 +#define BRW_ALIGN_16 1 + +#define BRW_ADDRESS_DIRECT 0 +#define BRW_ADDRESS_REGISTER_INDIRECT_REGISTER 1 + +#define BRW_CHANNEL_X 0 +#define BRW_CHANNEL_Y 1 +#define BRW_CHANNEL_Z 2 +#define BRW_CHANNEL_W 3 + +enum brw_compression { + BRW_COMPRESSION_NONE = 0, + BRW_COMPRESSION_2NDHALF = 1, + BRW_COMPRESSION_COMPRESSED = 2, +}; + +#define GEN6_COMPRESSION_1Q 0 +#define GEN6_COMPRESSION_2Q 1 +#define GEN6_COMPRESSION_3Q 2 +#define GEN6_COMPRESSION_4Q 3 +#define GEN6_COMPRESSION_1H 0 +#define GEN6_COMPRESSION_2H 2 + +enum PACKED brw_conditional_mod { + BRW_CONDITIONAL_NONE = 0, + BRW_CONDITIONAL_Z = 1, + BRW_CONDITIONAL_NZ = 2, + BRW_CONDITIONAL_EQ = 1, /* Z */ + BRW_CONDITIONAL_NEQ = 2, /* NZ */ + BRW_CONDITIONAL_G = 3, + BRW_CONDITIONAL_GE = 4, + BRW_CONDITIONAL_L = 5, + BRW_CONDITIONAL_LE = 6, + BRW_CONDITIONAL_R = 7, /* Gen <= 5 */ + BRW_CONDITIONAL_O = 8, + BRW_CONDITIONAL_U = 9, +}; + +#define BRW_DEBUG_NONE 0 +#define BRW_DEBUG_BREAKPOINT 1 + +#define BRW_DEPENDENCY_NORMAL 0 +#define BRW_DEPENDENCY_NOTCLEARED 1 +#define BRW_DEPENDENCY_NOTCHECKED 2 +#define BRW_DEPENDENCY_DISABLE 3 + +enum PACKED brw_execution_size { + BRW_EXECUTE_1 = 0, + BRW_EXECUTE_2 = 1, 
+ BRW_EXECUTE_4 = 2, + BRW_EXECUTE_8 = 3, + BRW_EXECUTE_16 = 4, + BRW_EXECUTE_32 = 5, +}; + +enum PACKED brw_horizontal_stride { + BRW_HORIZONTAL_STRIDE_0 = 0, + BRW_HORIZONTAL_STRIDE_1 = 1, + BRW_HORIZONTAL_STRIDE_2 = 2, + BRW_HORIZONTAL_STRIDE_4 = 3, +}; + +#define BRW_INSTRUCTION_NORMAL 0 +#define BRW_INSTRUCTION_SATURATE 1 + +#define BRW_MASK_ENABLE 0 +#define BRW_MASK_DISABLE 1 + +/** @{ + * + * Gen6 has replaced "mask enable/disable" with WECtrl, which is + * effectively the same but much simpler to think about. Now, there + * are two contributors ANDed together to whether channels are + * executed: The predication on the instruction, and the channel write + * enable. + */ +/** + * This is the default value. It means that a channel's write enable is set + * if the per-channel IP is pointing at this instruction. + */ +#define BRW_WE_NORMAL 0 +/** + * This is used like BRW_MASK_DISABLE, and causes all channels to have + * their write enable set. Note that predication still contributes to + * whether the channel actually gets written. + */ +#define BRW_WE_ALL 1 +/** @} */ + +enum opcode { + /* These are the actual hardware opcodes. 
*/ + BRW_OPCODE_ILLEGAL = 0, + BRW_OPCODE_MOV = 1, + BRW_OPCODE_SEL = 2, + BRW_OPCODE_MOVI = 3, /**< G45+ */ + BRW_OPCODE_NOT = 4, + BRW_OPCODE_AND = 5, + BRW_OPCODE_OR = 6, + BRW_OPCODE_XOR = 7, + BRW_OPCODE_SHR = 8, + BRW_OPCODE_SHL = 9, + BRW_OPCODE_DIM = 10, /**< Gen7.5 only */ /* Reused */ + // BRW_OPCODE_SMOV = 10, /**< Gen8+ */ /* Reused */ + /* Reserved - 11 */ + BRW_OPCODE_ASR = 12, + /* Reserved - 13-15 */ + BRW_OPCODE_CMP = 16, + BRW_OPCODE_CMPN = 17, + BRW_OPCODE_CSEL = 18, /**< Gen8+ */ + BRW_OPCODE_F32TO16 = 19, /**< Gen7 only */ + BRW_OPCODE_F16TO32 = 20, /**< Gen7 only */ + /* Reserved - 21-22 */ + BRW_OPCODE_BFREV = 23, /**< Gen7+ */ + BRW_OPCODE_BFE = 24, /**< Gen7+ */ + BRW_OPCODE_BFI1 = 25, /**< Gen7+ */ + BRW_OPCODE_BFI2 = 26, /**< Gen7+ */ + /* Reserved - 27-31 */ + BRW_OPCODE_JMPI = 32, + // BRW_OPCODE_BRD = 33, /**< Gen7+ */ + BRW_OPCODE_IF = 34, + BRW_OPCODE_IFF = 35, /**< Pre-Gen6 */ /* Reused */ + // BRW_OPCODE_BRC = 35, /**< Gen7+ */ /* Reused */ + BRW_OPCODE_ELSE = 36, + BRW_OPCODE_ENDIF = 37, + BRW_OPCODE_DO = 38, /**< Pre-Gen6 */ /* Reused */ + // BRW_OPCODE_CASE = 38, /**< Gen6 only */ /* Reused */ + BRW_OPCODE_WHILE = 39, + BRW_OPCODE_BREAK = 40, + BRW_OPCODE_CONTINUE = 41, + BRW_OPCODE_HALT = 42, + // BRW_OPCODE_CALLA = 43, /**< Gen7.5+ */ + // BRW_OPCODE_MSAVE = 44, /**< Pre-Gen6 */ /* Reused */ + // BRW_OPCODE_CALL = 44, /**< Gen6+ */ /* Reused */ + // BRW_OPCODE_MREST = 45, /**< Pre-Gen6 */ /* Reused */ + // BRW_OPCODE_RET = 45, /**< Gen6+ */ /* Reused */ + // BRW_OPCODE_PUSH = 46, /**< Pre-Gen6 */ /* Reused */ + // BRW_OPCODE_FORK = 46, /**< Gen6 only */ /* Reused */ + // BRW_OPCODE_GOTO = 46, /**< Gen8+ */ /* Reused */ + // BRW_OPCODE_POP = 47, /**< Pre-Gen6 */ + BRW_OPCODE_WAIT = 48, + BRW_OPCODE_SEND = 49, + BRW_OPCODE_SENDC = 50, + BRW_OPCODE_SENDS = 51, /**< Gen9+ */ + BRW_OPCODE_SENDSC = 52, /**< Gen9+ */ + /* Reserved 53-55 */ + BRW_OPCODE_MATH = 56, /**< Gen6+ */ + /* Reserved 57-63 */ + BRW_OPCODE_ADD = 64, + 
BRW_OPCODE_MUL = 65, + BRW_OPCODE_AVG = 66, + BRW_OPCODE_FRC = 67, + BRW_OPCODE_RNDU = 68, + BRW_OPCODE_RNDD = 69, + BRW_OPCODE_RNDE = 70, + BRW_OPCODE_RNDZ = 71, + BRW_OPCODE_MAC = 72, + BRW_OPCODE_MACH = 73, + BRW_OPCODE_LZD = 74, + BRW_OPCODE_FBH = 75, /**< Gen7+ */ + BRW_OPCODE_FBL = 76, /**< Gen7+ */ + BRW_OPCODE_CBIT = 77, /**< Gen7+ */ + BRW_OPCODE_ADDC = 78, /**< Gen7+ */ + BRW_OPCODE_SUBB = 79, /**< Gen7+ */ + BRW_OPCODE_SAD2 = 80, + BRW_OPCODE_SADA2 = 81, + /* Reserved 82-83 */ + BRW_OPCODE_DP4 = 84, + BRW_OPCODE_DPH = 85, + BRW_OPCODE_DP3 = 86, + BRW_OPCODE_DP2 = 87, + /* Reserved 88 */ + BRW_OPCODE_LINE = 89, + BRW_OPCODE_PLN = 90, /**< G45+ */ + BRW_OPCODE_MAD = 91, /**< Gen6+ */ + BRW_OPCODE_LRP = 92, /**< Gen6+ */ + // BRW_OPCODE_MADM = 93, /**< Gen8+ */ + /* Reserved 94-124 */ + BRW_OPCODE_NENOP = 125, /**< G45 only */ + BRW_OPCODE_NOP = 126, + /* Reserved 127 */ + + /* These are compiler backend opcodes that get translated into other + * instructions. + */ + FS_OPCODE_FB_WRITE = 128, + + /** + * Same as FS_OPCODE_FB_WRITE but expects its arguments separately as + * individual sources instead of as a single payload blob. The + * position/ordering of the arguments are defined by the enum + * fb_write_logical_srcs. + */ + FS_OPCODE_FB_WRITE_LOGICAL, + + FS_OPCODE_REP_FB_WRITE, + + FS_OPCODE_FB_READ, + FS_OPCODE_FB_READ_LOGICAL, + + SHADER_OPCODE_RCP, + SHADER_OPCODE_RSQ, + SHADER_OPCODE_SQRT, + SHADER_OPCODE_EXP2, + SHADER_OPCODE_LOG2, + SHADER_OPCODE_POW, + SHADER_OPCODE_INT_QUOTIENT, + SHADER_OPCODE_INT_REMAINDER, + SHADER_OPCODE_SIN, + SHADER_OPCODE_COS, + + /** + * Texture sampling opcodes. + * + * LOGICAL opcodes are eventually translated to the matching non-LOGICAL + * opcode but instead of taking a single payload blob they expect their + * arguments separately as individual sources. The position/ordering of the + * arguments are defined by the enum tex_logical_srcs. 
+ */ + SHADER_OPCODE_TEX, + SHADER_OPCODE_TEX_LOGICAL, + SHADER_OPCODE_TXD, + SHADER_OPCODE_TXD_LOGICAL, + SHADER_OPCODE_TXF, + SHADER_OPCODE_TXF_LOGICAL, + SHADER_OPCODE_TXF_LZ, + SHADER_OPCODE_TXL, + SHADER_OPCODE_TXL_LOGICAL, + SHADER_OPCODE_TXL_LZ, + SHADER_OPCODE_TXS, + SHADER_OPCODE_TXS_LOGICAL, + FS_OPCODE_TXB, + FS_OPCODE_TXB_LOGICAL, + SHADER_OPCODE_TXF_CMS, + SHADER_OPCODE_TXF_CMS_LOGICAL, + SHADER_OPCODE_TXF_CMS_W, + SHADER_OPCODE_TXF_CMS_W_LOGICAL, + SHADER_OPCODE_TXF_UMS, + SHADER_OPCODE_TXF_UMS_LOGICAL, + SHADER_OPCODE_TXF_MCS, + SHADER_OPCODE_TXF_MCS_LOGICAL, + SHADER_OPCODE_LOD, + SHADER_OPCODE_LOD_LOGICAL, + SHADER_OPCODE_TG4, + SHADER_OPCODE_TG4_LOGICAL, + SHADER_OPCODE_TG4_OFFSET, + SHADER_OPCODE_TG4_OFFSET_LOGICAL, + SHADER_OPCODE_SAMPLEINFO, + SHADER_OPCODE_SAMPLEINFO_LOGICAL, + + /** + * Combines multiple sources of size 1 into a larger virtual GRF. + * For example, parameters for a send-from-GRF message. Or, updating + * channels of a size 4 VGRF used to store vec4s such as texturing results. + * + * This will be lowered into MOVs from each source to consecutive offsets + * of the destination VGRF. + * + * src[0] may be BAD_FILE. If so, the lowering pass skips emitting the MOV, + * but still reserves the first channel of the destination VGRF. This can be + * used to reserve space for, say, a message header set up by the generators. + */ + SHADER_OPCODE_LOAD_PAYLOAD, + + /** + * Packs a number of sources into a single value. Unlike LOAD_PAYLOAD, this + * acts intra-channel, obtaining the final value for each channel by + * combining the sources values for the same channel, the first source + * occupying the lowest bits and the last source occupying the highest + * bits. + */ + FS_OPCODE_PACK, + + SHADER_OPCODE_SHADER_TIME_ADD, + + /** + * Typed and untyped surface access opcodes. 
+ * + * LOGICAL opcodes are eventually translated to the matching non-LOGICAL + * opcode but instead of taking a single payload blob they expect their + * arguments separately as individual sources: + * + * Source 0: [required] Surface coordinates. + * Source 1: [optional] Operation source. + * Source 2: [required] Surface index. + * Source 3: [required] Number of coordinate components (as UD immediate). + * Source 4: [required] Opcode-specific control immediate, same as source 2 + * of the matching non-LOGICAL opcode. + */ + SHADER_OPCODE_UNTYPED_ATOMIC, + SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, + SHADER_OPCODE_UNTYPED_SURFACE_READ, + SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, + SHADER_OPCODE_UNTYPED_SURFACE_WRITE, + SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, + + SHADER_OPCODE_TYPED_ATOMIC, + SHADER_OPCODE_TYPED_ATOMIC_LOGICAL, + SHADER_OPCODE_TYPED_SURFACE_READ, + SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL, + SHADER_OPCODE_TYPED_SURFACE_WRITE, + SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL, + + SHADER_OPCODE_MEMORY_FENCE, + + SHADER_OPCODE_GEN4_SCRATCH_READ, + SHADER_OPCODE_GEN4_SCRATCH_WRITE, + SHADER_OPCODE_GEN7_SCRATCH_READ, + + /** + * Gen8+ SIMD8 URB Read messages. + */ + SHADER_OPCODE_URB_READ_SIMD8, + SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, + + SHADER_OPCODE_URB_WRITE_SIMD8, + SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT, + SHADER_OPCODE_URB_WRITE_SIMD8_MASKED, + SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT, + + /** + * Return the index of an arbitrary live channel (i.e. one of the channels + * enabled in the current execution mask) and assign it to the first + * component of the destination. Expected to be used as input for the + * BROADCAST pseudo-opcode. + */ + SHADER_OPCODE_FIND_LIVE_CHANNEL, + + /** + * Pick the channel from its first source register given by the index + * specified as second source. Useful for variable indexing of surfaces. 
+ * + * Note that because the result of this instruction is by definition + * uniform and it can always be splatted to multiple channels using a + * scalar regioning mode, only the first channel of the destination region + * is guaranteed to be updated, which implies that BROADCAST instructions + * should usually be marked force_writemask_all. + */ + SHADER_OPCODE_BROADCAST, + + VEC4_OPCODE_MOV_BYTES, + VEC4_OPCODE_PACK_BYTES, + VEC4_OPCODE_UNPACK_UNIFORM, + VEC4_OPCODE_FROM_DOUBLE, + VEC4_OPCODE_TO_DOUBLE, + VEC4_OPCODE_PICK_LOW_32BIT, + VEC4_OPCODE_PICK_HIGH_32BIT, + VEC4_OPCODE_SET_LOW_32BIT, + VEC4_OPCODE_SET_HIGH_32BIT, + + FS_OPCODE_DDX_COARSE, + FS_OPCODE_DDX_FINE, + /** + * Compute dFdy(), dFdyCoarse(), or dFdyFine(). + */ + FS_OPCODE_DDY_COARSE, + FS_OPCODE_DDY_FINE, + FS_OPCODE_CINTERP, + FS_OPCODE_LINTERP, + FS_OPCODE_PIXEL_X, + FS_OPCODE_PIXEL_Y, + FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, + FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7, + FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4, + FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7, + FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL, + FS_OPCODE_GET_BUFFER_SIZE, + FS_OPCODE_MOV_DISPATCH_TO_FLAGS, + FS_OPCODE_DISCARD_JUMP, + FS_OPCODE_SET_SAMPLE_ID, + FS_OPCODE_PACK_HALF_2x16_SPLIT, + FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, + FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, + FS_OPCODE_PLACEHOLDER_HALT, + FS_OPCODE_INTERPOLATE_AT_SAMPLE, + FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, + FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, + + VS_OPCODE_URB_WRITE, + VS_OPCODE_PULL_CONSTANT_LOAD, + VS_OPCODE_PULL_CONSTANT_LOAD_GEN7, + VS_OPCODE_SET_SIMD4X2_HEADER_GEN9, + + VS_OPCODE_GET_BUFFER_SIZE, + + VS_OPCODE_UNPACK_FLAGS_SIMD4X2, + + /** + * Write geometry shader output data to the URB. + * + * Unlike VS_OPCODE_URB_WRITE, this opcode doesn't do an implied move from + * R0 to the first MRF. This allows the geometry shader to override the + * "Slot {0,1} Offset" fields in the message header. 
+ */ + GS_OPCODE_URB_WRITE, + + /** + * Write geometry shader output data to the URB and request a new URB + * handle (gen6). + * + * This opcode doesn't do an implied move from R0 to the first MRF. + */ + GS_OPCODE_URB_WRITE_ALLOCATE, + + /** + * Terminate the geometry shader thread by doing an empty URB write. + * + * This opcode doesn't do an implied move from R0 to the first MRF. This + * allows the geometry shader to override the "GS Number of Output Vertices + * for Slot {0,1}" fields in the message header. + */ + GS_OPCODE_THREAD_END, + + /** + * Set the "Slot {0,1} Offset" fields of a URB_WRITE message header. + * + * - dst is the MRF containing the message header. + * + * - src0.x indicates which portion of the URB should be written to (e.g. a + * vertex number) + * + * - src1 is an immediate multiplier which will be applied to src0 + * (e.g. the size of a single vertex in the URB). + * + * Note: the hardware will apply this offset *in addition to* the offset in + * vec4_instruction::offset. + */ + GS_OPCODE_SET_WRITE_OFFSET, + + /** + * Set the "GS Number of Output Vertices for Slot {0,1}" fields of a + * URB_WRITE message header. + * + * - dst is the MRF containing the message header. + * + * - src0.x is the vertex count. The upper 16 bits will be ignored. + */ + GS_OPCODE_SET_VERTEX_COUNT, + + /** + * Set DWORD 2 of dst to the value in src. + */ + GS_OPCODE_SET_DWORD_2, + + /** + * Prepare the dst register for storage in the "Channel Mask" fields of a + * URB_WRITE message header. + * + * DWORD 4 of dst is shifted left by 4 bits, so that later, + * GS_OPCODE_SET_CHANNEL_MASKS can OR DWORDs 0 and 4 together to form the + * final channel mask. + * + * Note: since GS_OPCODE_SET_CHANNEL_MASKS ORs DWORDs 0 and 4 together to + * form the final channel mask, DWORDs 0 and 4 of the dst register must not + * have any extraneous bits set prior to execution of this opcode (that is, + * they should be in the range 0x0 to 0xf). 
+ */ + GS_OPCODE_PREPARE_CHANNEL_MASKS, + + /** + * Set the "Channel Mask" fields of a URB_WRITE message header. + * + * - dst is the MRF containing the message header. + * + * - src.x is the channel mask, as prepared by + * GS_OPCODE_PREPARE_CHANNEL_MASKS. DWORDs 0 and 4 are OR'ed together to + * form the final channel mask. + */ + GS_OPCODE_SET_CHANNEL_MASKS, + + /** + * Get the "Instance ID" fields from the payload. + * + * - dst is the GRF for gl_InvocationID. + */ + GS_OPCODE_GET_INSTANCE_ID, + + /** + * Send a FF_SYNC message to allocate initial URB handles (gen6). + * + * - dst will be used as the writeback register for the FF_SYNC operation. + * + * - src0 is the number of primitives written. + * + * - src1 is the value to hold in M0.0: number of SO vertices to write + * and number of SO primitives needed. Its value will be overwritten + * with the SVBI values if transform feedback is enabled. + * + * Note: This opcode uses an implicit MRF register for the ff_sync message + * header, so the caller is expected to set inst->base_mrf and initialize + * that MRF register to r0. This opcode will also write to this MRF register + * to include the allocated URB handle so it can then be reused directly as + * the header in the URB write operation we are allocating the handle for. + */ + GS_OPCODE_FF_SYNC, + + /** + * Move r0.1 (which holds PrimitiveID information in gen6) to a separate + * register. + * + * - dst is the GRF where PrimitiveID information will be moved. + */ + GS_OPCODE_SET_PRIMITIVE_ID, + + /** + * Write transform feedback data to the SVB by sending a SVB WRITE message. + * Used in gen6. + * + * - dst is the MRF register containing the message header. + * + * - src0 is the register where the vertex data is going to be copied from. + * + * - src1 is the destination register when write commit occurs. + */ + GS_OPCODE_SVB_WRITE, + + /** + * Set destination index in the SVB write message payload (M0.5). Used + * in gen6 for transform feedback. 
+ * + * - dst is the header to save the destination indices for SVB WRITE. + * - src is the register that holds the destination indices value. + */ + GS_OPCODE_SVB_SET_DST_INDEX, + + /** + * Prepare Mx.0 subregister for being used in the FF_SYNC message header. + * Used in gen6 for transform feedback. + * + * - dst will hold the register with the final Mx.0 value. + * + * - src0 has the number of vertices emitted in SO (NumSOVertsToWrite) + * + * - src1 has the number of needed primitives for SO (NumSOPrimsNeeded) + * + * - src2 is the value to hold in M0: number of SO vertices to write + * and number of SO primitives needed. + */ + GS_OPCODE_FF_SYNC_SET_PRIMITIVES, + + /** + * Terminate the compute shader. + */ + CS_OPCODE_CS_TERMINATE, + + /** + * GLSL barrier() + */ + SHADER_OPCODE_BARRIER, + + /** + * Calculate the high 32-bits of a 32x32 multiply. + */ + SHADER_OPCODE_MULH, + + /** + * A MOV that uses VxH indirect addressing. + * + * Source 0: A register to start from (HW_REG). + * Source 1: An indirect offset (in bytes, UD GRF). + * Source 2: The length of the region that could be accessed (in bytes, + * UD immediate). + */ + SHADER_OPCODE_MOV_INDIRECT, + + VEC4_OPCODE_URB_READ, + TCS_OPCODE_GET_INSTANCE_ID, + TCS_OPCODE_URB_WRITE, + TCS_OPCODE_SET_INPUT_URB_OFFSETS, + TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, + TCS_OPCODE_GET_PRIMITIVE_ID, + TCS_OPCODE_CREATE_BARRIER_HEADER, + TCS_OPCODE_SRC0_010_IS_ZERO, + TCS_OPCODE_RELEASE_INPUT, + TCS_OPCODE_THREAD_END, + + TES_OPCODE_GET_PRIMITIVE_ID, + TES_OPCODE_CREATE_INPUT_READ_HEADER, + TES_OPCODE_ADD_INDIRECT_URB_OFFSET, +}; + +enum brw_urb_write_flags { + BRW_URB_WRITE_NO_FLAGS = 0, + + /** + * Causes a new URB entry to be allocated, and its address stored in the + * destination register (gen < 7). + */ + BRW_URB_WRITE_ALLOCATE = 0x1, + + /** + * Causes the current URB entry to be deallocated (gen < 7). + */ + BRW_URB_WRITE_UNUSED = 0x2, + + /** + * Causes the thread to terminate. 
+ */ + BRW_URB_WRITE_EOT = 0x4, + + /** + * Indicates that the given URB entry is complete, and may be sent further + * down the 3D pipeline (gen < 7). + */ + BRW_URB_WRITE_COMPLETE = 0x8, + + /** + * Indicates that an additional offset (which may be different for the two + * vec4 slots) is stored in the message header (gen == 7). + */ + BRW_URB_WRITE_PER_SLOT_OFFSET = 0x10, + + /** + * Indicates that the channel masks in the URB_WRITE message header should + * not be overridden to 0xff (gen == 7). + */ + BRW_URB_WRITE_USE_CHANNEL_MASKS = 0x20, + + /** + * Indicates that the data should be sent to the URB using the + * URB_WRITE_OWORD message rather than URB_WRITE_HWORD (gen == 7). This + * causes offsets to be interpreted as multiples of an OWORD instead of an + * HWORD, and only allows one OWORD to be written. + */ + BRW_URB_WRITE_OWORD = 0x40, + + /** + * Convenient combination of flags: end the thread while simultaneously + * marking the given URB entry as complete. + */ + BRW_URB_WRITE_EOT_COMPLETE = BRW_URB_WRITE_EOT | BRW_URB_WRITE_COMPLETE, + + /** + * Convenient combination of flags: mark the given URB entry as complete + * and simultaneously allocate a new one. 
+ */ + BRW_URB_WRITE_ALLOCATE_COMPLETE = + BRW_URB_WRITE_ALLOCATE | BRW_URB_WRITE_COMPLETE, +}; + +enum fb_write_logical_srcs { + FB_WRITE_LOGICAL_SRC_COLOR0, /* REQUIRED */ + FB_WRITE_LOGICAL_SRC_COLOR1, /* for dual source blend messages */ + FB_WRITE_LOGICAL_SRC_SRC0_ALPHA, + FB_WRITE_LOGICAL_SRC_SRC_DEPTH, /* gl_FragDepth */ + FB_WRITE_LOGICAL_SRC_DST_DEPTH, /* GEN4-5: passthrough from thread */ + FB_WRITE_LOGICAL_SRC_SRC_STENCIL, /* gl_FragStencilRefARB */ + FB_WRITE_LOGICAL_SRC_OMASK, /* Sample Mask (gl_SampleMask) */ + FB_WRITE_LOGICAL_SRC_COMPONENTS, /* REQUIRED */ + FB_WRITE_LOGICAL_NUM_SRCS +}; + +enum tex_logical_srcs { + /** Texture coordinates */ + TEX_LOGICAL_SRC_COORDINATE, + /** Shadow comparator */ + TEX_LOGICAL_SRC_SHADOW_C, + /** dPdx if the operation takes explicit derivatives, otherwise LOD value */ + TEX_LOGICAL_SRC_LOD, + /** dPdy if the operation takes explicit derivatives */ + TEX_LOGICAL_SRC_LOD2, + /** Sample index */ + TEX_LOGICAL_SRC_SAMPLE_INDEX, + /** MCS data */ + TEX_LOGICAL_SRC_MCS, + /** REQUIRED: Texture surface index */ + TEX_LOGICAL_SRC_SURFACE, + /** Texture sampler index */ + TEX_LOGICAL_SRC_SAMPLER, + /** Texel offset for gathers */ + TEX_LOGICAL_SRC_TG4_OFFSET, + /** REQUIRED: Number of coordinate components (as UD immediate) */ + TEX_LOGICAL_SRC_COORD_COMPONENTS, + /** REQUIRED: Number of derivative components (as UD immediate) */ + TEX_LOGICAL_SRC_GRAD_COMPONENTS, + + TEX_LOGICAL_NUM_SRCS, +}; + +#ifdef __cplusplus +/** + * Allow brw_urb_write_flags enums to be ORed together. 
+ */ +inline brw_urb_write_flags +operator|(brw_urb_write_flags x, brw_urb_write_flags y) +{ + return static_cast<brw_urb_write_flags>(static_cast<int>(x) | + static_cast<int>(y)); +} +#endif + +enum PACKED brw_predicate { + BRW_PREDICATE_NONE = 0, + BRW_PREDICATE_NORMAL = 1, + BRW_PREDICATE_ALIGN1_ANYV = 2, + BRW_PREDICATE_ALIGN1_ALLV = 3, + BRW_PREDICATE_ALIGN1_ANY2H = 4, + BRW_PREDICATE_ALIGN1_ALL2H = 5, + BRW_PREDICATE_ALIGN1_ANY4H = 6, + BRW_PREDICATE_ALIGN1_ALL4H = 7, + BRW_PREDICATE_ALIGN1_ANY8H = 8, + BRW_PREDICATE_ALIGN1_ALL8H = 9, + BRW_PREDICATE_ALIGN1_ANY16H = 10, + BRW_PREDICATE_ALIGN1_ALL16H = 11, + BRW_PREDICATE_ALIGN1_ANY32H = 12, + BRW_PREDICATE_ALIGN1_ALL32H = 13, + BRW_PREDICATE_ALIGN16_REPLICATE_X = 2, + BRW_PREDICATE_ALIGN16_REPLICATE_Y = 3, + BRW_PREDICATE_ALIGN16_REPLICATE_Z = 4, + BRW_PREDICATE_ALIGN16_REPLICATE_W = 5, + BRW_PREDICATE_ALIGN16_ANY4H = 6, + BRW_PREDICATE_ALIGN16_ALL4H = 7, +}; + +enum PACKED brw_reg_file { + BRW_ARCHITECTURE_REGISTER_FILE = 0, + BRW_GENERAL_REGISTER_FILE = 1, + BRW_MESSAGE_REGISTER_FILE = 2, + BRW_IMMEDIATE_VALUE = 3, + + ARF = BRW_ARCHITECTURE_REGISTER_FILE, + FIXED_GRF = BRW_GENERAL_REGISTER_FILE, + MRF = BRW_MESSAGE_REGISTER_FILE, + IMM = BRW_IMMEDIATE_VALUE, + + /* These are not hardware values */ + VGRF, + ATTR, + UNIFORM, /* prog_data->params[reg] */ + BAD_FILE, +}; + +#define BRW_HW_REG_TYPE_UD 0 +#define BRW_HW_REG_TYPE_D 1 +#define BRW_HW_REG_TYPE_UW 2 +#define BRW_HW_REG_TYPE_W 3 +#define BRW_HW_REG_TYPE_F 7 +#define GEN8_HW_REG_TYPE_UQ 8 +#define GEN8_HW_REG_TYPE_Q 9 + +#define BRW_HW_REG_NON_IMM_TYPE_UB 4 +#define BRW_HW_REG_NON_IMM_TYPE_B 5 +#define GEN7_HW_REG_NON_IMM_TYPE_DF 6 +#define GEN8_HW_REG_NON_IMM_TYPE_HF 10 + +#define BRW_HW_REG_IMM_TYPE_UV 4 /* Gen6+ packed unsigned immediate vector */ +#define BRW_HW_REG_IMM_TYPE_VF 5 /* packed float immediate vector */ +#define BRW_HW_REG_IMM_TYPE_V 6 /* packed int imm. 
vector; uword dest only */ +#define GEN8_HW_REG_IMM_TYPE_DF 10 +#define GEN8_HW_REG_IMM_TYPE_HF 11 + +/* SNB adds 3-src instructions (MAD and LRP) that only operate on floats, so + * the types were implied. IVB adds BFE and BFI2 that operate on doublewords + * and unsigned doublewords, so a new field is also available in the da3src + * struct (part of struct brw_instruction.bits1 in brw_structs.h) to select + * dst and shared-src types. The values are different from BRW_REGISTER_TYPE_*. + */ +#define BRW_3SRC_TYPE_F 0 +#define BRW_3SRC_TYPE_D 1 +#define BRW_3SRC_TYPE_UD 2 +#define BRW_3SRC_TYPE_DF 3 + +#define BRW_ARF_NULL 0x00 +#define BRW_ARF_ADDRESS 0x10 +#define BRW_ARF_ACCUMULATOR 0x20 +#define BRW_ARF_FLAG 0x30 +#define BRW_ARF_MASK 0x40 +#define BRW_ARF_MASK_STACK 0x50 +#define BRW_ARF_MASK_STACK_DEPTH 0x60 +#define BRW_ARF_STATE 0x70 +#define BRW_ARF_CONTROL 0x80 +#define BRW_ARF_NOTIFICATION_COUNT 0x90 +#define BRW_ARF_IP 0xA0 +#define BRW_ARF_TDR 0xB0 +#define BRW_ARF_TIMESTAMP 0xC0 + +#define BRW_MRF_COMPR4 (1 << 7) + +#define BRW_AMASK 0 +#define BRW_IMASK 1 +#define BRW_LMASK 2 +#define BRW_CMASK 3 + + + +#define BRW_THREAD_NORMAL 0 +#define BRW_THREAD_ATOMIC 1 +#define BRW_THREAD_SWITCH 2 + +enum PACKED brw_vertical_stride { + BRW_VERTICAL_STRIDE_0 = 0, + BRW_VERTICAL_STRIDE_1 = 1, + BRW_VERTICAL_STRIDE_2 = 2, + BRW_VERTICAL_STRIDE_4 = 3, + BRW_VERTICAL_STRIDE_8 = 4, + BRW_VERTICAL_STRIDE_16 = 5, + BRW_VERTICAL_STRIDE_32 = 6, + BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL = 0xF, +}; + +enum PACKED brw_width { + BRW_WIDTH_1 = 0, + BRW_WIDTH_2 = 1, + BRW_WIDTH_4 = 2, + BRW_WIDTH_8 = 3, + BRW_WIDTH_16 = 4, +}; + +/** + * Message target: Shared Function ID for where to SEND a message. + * + * These are enumerated in the ISA reference under "send - Send Message". 
+ * In particular, see the following tables: + * - G45 PRM, Volume 4, Table 14-15 "Message Descriptor Definition" + * - Sandybridge PRM, Volume 4 Part 2, Table 8-16 "Extended Message Descriptor" + * - Ivybridge PRM, Volume 1 Part 1, section 3.2.7 "GPE Function IDs" + */ +enum brw_message_target { + BRW_SFID_NULL = 0, + BRW_SFID_MATH = 1, /* Only valid on Gen4-5 */ + BRW_SFID_SAMPLER = 2, + BRW_SFID_MESSAGE_GATEWAY = 3, + BRW_SFID_DATAPORT_READ = 4, + BRW_SFID_DATAPORT_WRITE = 5, + BRW_SFID_URB = 6, + BRW_SFID_THREAD_SPAWNER = 7, + BRW_SFID_VME = 8, + + GEN6_SFID_DATAPORT_SAMPLER_CACHE = 4, + GEN6_SFID_DATAPORT_RENDER_CACHE = 5, + GEN6_SFID_DATAPORT_CONSTANT_CACHE = 9, + + GEN7_SFID_DATAPORT_DATA_CACHE = 10, + GEN7_SFID_PIXEL_INTERPOLATOR = 11, + HSW_SFID_DATAPORT_DATA_CACHE_1 = 12, + HSW_SFID_CRE = 13, +}; + +#define GEN7_MESSAGE_TARGET_DP_DATA_CACHE 10 + +#define BRW_SAMPLER_RETURN_FORMAT_FLOAT32 0 +#define BRW_SAMPLER_RETURN_FORMAT_UINT32 2 +#define BRW_SAMPLER_RETURN_FORMAT_SINT32 3 + +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE 0 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE 0 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS 0 +#define BRW_SAMPLER_MESSAGE_SIMD8_KILLPIX 1 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD 1 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD 1 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS 2 +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS 2 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE 0 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE 2 +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE 0 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE 1 +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE 1 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO 2 +#define BRW_SAMPLER_MESSAGE_SIMD16_RESINFO 2 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_LD 3 +#define BRW_SAMPLER_MESSAGE_SIMD8_LD 3 +#define BRW_SAMPLER_MESSAGE_SIMD16_LD 3 + +#define GEN5_SAMPLER_MESSAGE_SAMPLE 0 +#define GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS 1 +#define 
GEN5_SAMPLER_MESSAGE_SAMPLE_LOD 2 +#define GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE 3 +#define GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS 4 +#define GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE 5 +#define GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE 6 +#define GEN5_SAMPLER_MESSAGE_SAMPLE_LD 7 +#define GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4 8 +#define GEN5_SAMPLER_MESSAGE_LOD 9 +#define GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO 10 +#define GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO 11 +#define GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C 16 +#define GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO 17 +#define GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C 18 +#define HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE 20 +#define GEN9_SAMPLER_MESSAGE_SAMPLE_LZ 24 +#define GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ 25 +#define GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ 26 +#define GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W 28 +#define GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS 29 +#define GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS 30 +#define GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS 31 + +/* for GEN5 only */ +#define BRW_SAMPLER_SIMD_MODE_SIMD4X2 0 +#define BRW_SAMPLER_SIMD_MODE_SIMD8 1 +#define BRW_SAMPLER_SIMD_MODE_SIMD16 2 +#define BRW_SAMPLER_SIMD_MODE_SIMD32_64 3 + +/* GEN9 changes SIMD mode 0 to mean SIMD8D, but lets us get the SIMD4x2 + * behavior by setting bit 22 of dword 2 in the message header. */ +#define GEN9_SAMPLER_SIMD_MODE_SIMD8D 0 +#define GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2 (1 << 22) + +#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW 0 +#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH 1 +#define BRW_DATAPORT_OWORD_BLOCK_2_OWORDS 2 +#define BRW_DATAPORT_OWORD_BLOCK_4_OWORDS 3 +#define BRW_DATAPORT_OWORD_BLOCK_8_OWORDS 4 +#define BRW_DATAPORT_OWORD_BLOCK_DWORDS(n) \ + ((n) == 4 ? BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW : \ + (n) == 8 ? BRW_DATAPORT_OWORD_BLOCK_2_OWORDS : \ + (n) == 16 ? BRW_DATAPORT_OWORD_BLOCK_4_OWORDS : \ + (n) == 32 ? 
BRW_DATAPORT_OWORD_BLOCK_8_OWORDS : \ + (abort(), ~0)) + +#define BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD 0 +#define BRW_DATAPORT_OWORD_DUAL_BLOCK_4OWORDS 2 + +#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS 2 +#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS 3 + +/* This one stays the same across generations. */ +#define BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ 0 +/* GEN4 */ +#define BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 1 +#define BRW_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ 2 +#define BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 3 +/* G45, GEN5 */ +#define G45_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ 1 +#define G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 2 +#define G45_DATAPORT_READ_MESSAGE_AVC_LOOP_FILTER_READ 3 +#define G45_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ 4 +#define G45_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 6 +/* GEN6 */ +#define GEN6_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ 1 +#define GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 2 +#define GEN6_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ 4 +#define GEN6_DATAPORT_READ_MESSAGE_OWORD_UNALIGN_BLOCK_READ 5 +#define GEN6_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 6 + +#define BRW_DATAPORT_READ_TARGET_DATA_CACHE 0 +#define BRW_DATAPORT_READ_TARGET_RENDER_CACHE 1 +#define BRW_DATAPORT_READ_TARGET_SAMPLER_CACHE 2 + +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE 0 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED 1 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01 2 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23 3 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01 4 + +#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE 0 +#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE 1 +#define BRW_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE 2 +#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE 3 +#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE 4 +#define 
BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VERTEX_BUFFER_WRITE 5 +#define BRW_DATAPORT_WRITE_MESSAGE_FLUSH_RENDER_CACHE 7 + +/* GEN6 */ +#define GEN6_DATAPORT_WRITE_MESSAGE_DWORD_ATOMIC_WRITE 7 +#define GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE 8 +#define GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE 9 +#define GEN6_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE 10 +#define GEN6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE 11 +#define GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE 12 +#define GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE 13 +#define GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_UNORM_WRITE 14 + +/* GEN7 */ +#define GEN7_DATAPORT_RC_MEDIA_BLOCK_READ 4 +#define GEN7_DATAPORT_RC_TYPED_SURFACE_READ 5 +#define GEN7_DATAPORT_RC_TYPED_ATOMIC_OP 6 +#define GEN7_DATAPORT_RC_MEMORY_FENCE 7 +#define GEN7_DATAPORT_RC_MEDIA_BLOCK_WRITE 10 +#define GEN7_DATAPORT_RC_RENDER_TARGET_WRITE 12 +#define GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE 13 +#define GEN7_DATAPORT_DC_OWORD_BLOCK_READ 0 +#define GEN7_DATAPORT_DC_UNALIGNED_OWORD_BLOCK_READ 1 +#define GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_READ 2 +#define GEN7_DATAPORT_DC_DWORD_SCATTERED_READ 3 +#define GEN7_DATAPORT_DC_BYTE_SCATTERED_READ 4 +#define GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ 5 +#define GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP 6 +#define GEN7_DATAPORT_DC_MEMORY_FENCE 7 +#define GEN7_DATAPORT_DC_OWORD_BLOCK_WRITE 8 +#define GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE 10 +#define GEN7_DATAPORT_DC_DWORD_SCATTERED_WRITE 11 +#define GEN7_DATAPORT_DC_BYTE_SCATTERED_WRITE 12 +#define GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE 13 + +#define GEN7_DATAPORT_SCRATCH_READ ((1 << 18) | \ + (0 << 17)) +#define GEN7_DATAPORT_SCRATCH_WRITE ((1 << 18) | \ + (1 << 17)) +#define GEN7_DATAPORT_SCRATCH_NUM_REGS_SHIFT 12 + +#define GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET 0 +#define GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE 1 +#define GEN7_PIXEL_INTERPOLATOR_LOC_CENTROID 2 +#define GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET 3 + +/* HSW */ +#define 
HSW_DATAPORT_DC_PORT0_OWORD_BLOCK_READ 0 +#define HSW_DATAPORT_DC_PORT0_UNALIGNED_OWORD_BLOCK_READ 1 +#define HSW_DATAPORT_DC_PORT0_OWORD_DUAL_BLOCK_READ 2 +#define HSW_DATAPORT_DC_PORT0_DWORD_SCATTERED_READ 3 +#define HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ 4 +#define HSW_DATAPORT_DC_PORT0_MEMORY_FENCE 7 +#define HSW_DATAPORT_DC_PORT0_OWORD_BLOCK_WRITE 8 +#define HSW_DATAPORT_DC_PORT0_OWORD_DUAL_BLOCK_WRITE 10 +#define HSW_DATAPORT_DC_PORT0_DWORD_SCATTERED_WRITE 11 +#define HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE 12 + +#define HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ 1 +#define HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP 2 +#define HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2 3 +#define HSW_DATAPORT_DC_PORT1_MEDIA_BLOCK_READ 4 +#define HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ 5 +#define HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP 6 +#define HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2 7 +#define HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE 9 +#define HSW_DATAPORT_DC_PORT1_MEDIA_BLOCK_WRITE 10 +#define HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP 11 +#define HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2 12 +#define HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE 13 + +/* GEN9 */ +#define GEN9_DATAPORT_RC_RENDER_TARGET_WRITE 12 +#define GEN9_DATAPORT_RC_RENDER_TARGET_READ 13 + +/* Dataport special binding table indices: */ +#define BRW_BTI_STATELESS 255 +#define GEN7_BTI_SLM 254 +/* Note that on Gen8+ BTI 255 was redefined to be IA-coherent according to the + * hardware spec, however because the DRM sets bit 4 of HDC_CHICKEN0 on BDW, + * CHV and at least some pre-production steppings of SKL due to + * WaForceEnableNonCoherent, HDC memory access may have been overridden by the + * kernel to be non-coherent (matching the behavior of the same BTI on + * pre-Gen8 hardware) and BTI 255 may actually be an alias for BTI 253. + */ +#define GEN8_BTI_STATELESS_IA_COHERENT 255 +#define GEN8_BTI_STATELESS_NON_COHERENT 253 + +/* dataport atomic operations. 
*/ +#define BRW_AOP_AND 1 +#define BRW_AOP_OR 2 +#define BRW_AOP_XOR 3 +#define BRW_AOP_MOV 4 +#define BRW_AOP_INC 5 +#define BRW_AOP_DEC 6 +#define BRW_AOP_ADD 7 +#define BRW_AOP_SUB 8 +#define BRW_AOP_REVSUB 9 +#define BRW_AOP_IMAX 10 +#define BRW_AOP_IMIN 11 +#define BRW_AOP_UMAX 12 +#define BRW_AOP_UMIN 13 +#define BRW_AOP_CMPWR 14 +#define BRW_AOP_PREDEC 15 + +#define BRW_MATH_FUNCTION_INV 1 +#define BRW_MATH_FUNCTION_LOG 2 +#define BRW_MATH_FUNCTION_EXP 3 +#define BRW_MATH_FUNCTION_SQRT 4 +#define BRW_MATH_FUNCTION_RSQ 5 +#define BRW_MATH_FUNCTION_SIN 6 +#define BRW_MATH_FUNCTION_COS 7 +#define BRW_MATH_FUNCTION_SINCOS 8 /* gen4, gen5 */ +#define BRW_MATH_FUNCTION_FDIV 9 /* gen6+ */ +#define BRW_MATH_FUNCTION_POW 10 +#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER 11 +#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT 12 +#define BRW_MATH_FUNCTION_INT_DIV_REMAINDER 13 +#define GEN8_MATH_FUNCTION_INVM 14 +#define GEN8_MATH_FUNCTION_RSQRTM 15 + +#define BRW_MATH_INTEGER_UNSIGNED 0 +#define BRW_MATH_INTEGER_SIGNED 1 + +#define BRW_MATH_PRECISION_FULL 0 +#define BRW_MATH_PRECISION_PARTIAL 1 + +#define BRW_MATH_SATURATE_NONE 0 +#define BRW_MATH_SATURATE_SATURATE 1 + +#define BRW_MATH_DATA_VECTOR 0 +#define BRW_MATH_DATA_SCALAR 1 + +#define BRW_URB_OPCODE_WRITE_HWORD 0 +#define BRW_URB_OPCODE_WRITE_OWORD 1 +#define BRW_URB_OPCODE_READ_HWORD 2 +#define BRW_URB_OPCODE_READ_OWORD 3 +#define GEN7_URB_OPCODE_ATOMIC_MOV 4 +#define GEN7_URB_OPCODE_ATOMIC_INC 5 +#define GEN8_URB_OPCODE_ATOMIC_ADD 6 +#define GEN8_URB_OPCODE_SIMD8_WRITE 7 +#define GEN8_URB_OPCODE_SIMD8_READ 8 + +#define BRW_URB_SWIZZLE_NONE 0 +#define BRW_URB_SWIZZLE_INTERLEAVE 1 +#define BRW_URB_SWIZZLE_TRANSPOSE 2 + +#define BRW_SCRATCH_SPACE_SIZE_1K 0 +#define BRW_SCRATCH_SPACE_SIZE_2K 1 +#define BRW_SCRATCH_SPACE_SIZE_4K 2 +#define BRW_SCRATCH_SPACE_SIZE_8K 3 +#define BRW_SCRATCH_SPACE_SIZE_16K 4 +#define BRW_SCRATCH_SPACE_SIZE_32K 5 +#define BRW_SCRATCH_SPACE_SIZE_64K 6 +#define 
BRW_SCRATCH_SPACE_SIZE_128K 7 +#define BRW_SCRATCH_SPACE_SIZE_256K 8 +#define BRW_SCRATCH_SPACE_SIZE_512K 9 +#define BRW_SCRATCH_SPACE_SIZE_1M 10 +#define BRW_SCRATCH_SPACE_SIZE_2M 11 + +#define BRW_MESSAGE_GATEWAY_SFID_OPEN_GATEWAY 0 +#define BRW_MESSAGE_GATEWAY_SFID_CLOSE_GATEWAY 1 +#define BRW_MESSAGE_GATEWAY_SFID_FORWARD_MSG 2 +#define BRW_MESSAGE_GATEWAY_SFID_GET_TIMESTAMP 3 +#define BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG 4 +#define BRW_MESSAGE_GATEWAY_SFID_UPDATE_GATEWAY_STATE 5 +#define BRW_MESSAGE_GATEWAY_SFID_MMIO_READ_WRITE 6 + + +/* Gen7 "GS URB Entry Allocation Size" is a U9-1 field, so the maximum gs_size + * is 2^9, or 512. It's counted in multiples of 64 bytes. + * + * Identical for VS, DS, and HS. + */ +#define GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES (512*64) +#define GEN7_MAX_DS_URB_ENTRY_SIZE_BYTES (512*64) +#define GEN7_MAX_HS_URB_ENTRY_SIZE_BYTES (512*64) +#define GEN7_MAX_VS_URB_ENTRY_SIZE_BYTES (512*64) + +/* Gen6 "GS URB Entry Allocation Size" is defined as a number of 1024-bit + * (128 bytes) URB rows and the maximum allowed value is 5 rows. + */ +#define GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES (5*128) + +/* GS Thread Payload + */ +/* R0 */ +# define GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT 27 + +#endif /* BRW_EU_DEFINES_H */ diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c new file mode 100644 index 00000000000..058742d4f6e --- /dev/null +++ b/src/intel/compiler/brw_eu_emit.c @@ -0,0 +1,3675 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "brw_eu_defines.h" +#include "brw_eu.h" + +#include "util/ralloc.h" + +/** + * Prior to Sandybridge, the SEND instruction accepted non-MRF source + * registers, implicitly moving the operand to a message register. + * + * On Sandybridge, this is no longer the case. This function performs the + * explicit move; it should be called before emitting a SEND instruction. 
+ */ +void +gen6_resolve_implied_move(struct brw_codegen *p, + struct brw_reg *src, + unsigned msg_reg_nr) +{ + const struct gen_device_info *devinfo = p->devinfo; + if (devinfo->gen < 6) + return; + + if (src->file == BRW_MESSAGE_REGISTER_FILE) + return; + + if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) { + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD), + retype(*src, BRW_REGISTER_TYPE_UD)); + brw_pop_insn_state(p); + } + *src = brw_message_reg(msg_reg_nr); +} + +static void +gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg) +{ + /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"): + * "The send with EOT should use register space R112-R127 for <src>. This is + * to enable loading of a new thread into the same slot while the message + * with EOT for current thread is pending dispatch." + * + * Since we're pretending to have 16 MRFs anyway, we may as well use the + * registers required for messages with EOT. + */ + const struct gen_device_info *devinfo = p->devinfo; + if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) { + reg->file = BRW_GENERAL_REGISTER_FILE; + reg->nr += GEN7_MRF_HACK_START; + } +} + +/** + * Convert a brw_reg_type enumeration value into the hardware representation. + * + * The hardware encoding may depend on whether the value is an immediate. 
+ */ +unsigned +brw_reg_type_to_hw_type(const struct gen_device_info *devinfo, + enum brw_reg_type type, enum brw_reg_file file) +{ + if (file == BRW_IMMEDIATE_VALUE) { + static const int imm_hw_types[] = { + [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD, + [BRW_REGISTER_TYPE_D] = BRW_HW_REG_TYPE_D, + [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW, + [BRW_REGISTER_TYPE_W] = BRW_HW_REG_TYPE_W, + [BRW_REGISTER_TYPE_F] = BRW_HW_REG_TYPE_F, + [BRW_REGISTER_TYPE_UB] = -1, + [BRW_REGISTER_TYPE_B] = -1, + [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV, + [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF, + [BRW_REGISTER_TYPE_V] = BRW_HW_REG_IMM_TYPE_V, + [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF, + [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF, + [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ, + [BRW_REGISTER_TYPE_Q] = GEN8_HW_REG_TYPE_Q, + }; + assert(type < ARRAY_SIZE(imm_hw_types)); + assert(imm_hw_types[type] != -1); + assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_DF); + return imm_hw_types[type]; + } else { + /* Non-immediate registers */ + static const int hw_types[] = { + [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD, + [BRW_REGISTER_TYPE_D] = BRW_HW_REG_TYPE_D, + [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW, + [BRW_REGISTER_TYPE_W] = BRW_HW_REG_TYPE_W, + [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB, + [BRW_REGISTER_TYPE_B] = BRW_HW_REG_NON_IMM_TYPE_B, + [BRW_REGISTER_TYPE_F] = BRW_HW_REG_TYPE_F, + [BRW_REGISTER_TYPE_UV] = -1, + [BRW_REGISTER_TYPE_VF] = -1, + [BRW_REGISTER_TYPE_V] = -1, + [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF, + [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF, + [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ, + [BRW_REGISTER_TYPE_Q] = GEN8_HW_REG_TYPE_Q, + }; + assert(type < ARRAY_SIZE(hw_types)); + assert(hw_types[type] != -1); + assert(devinfo->gen >= 7 || type < BRW_REGISTER_TYPE_DF); + assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_Q); + return hw_types[type]; + } +} + +/** + * Return the element size given 
a hardware register type and file. + * + * The hardware encoding may depend on whether the value is an immediate. + */ +unsigned +brw_hw_reg_type_to_size(const struct gen_device_info *devinfo, + unsigned type, enum brw_reg_file file) +{ + if (file == BRW_IMMEDIATE_VALUE) { + static const unsigned imm_hw_sizes[] = { + [BRW_HW_REG_TYPE_UD] = 4, + [BRW_HW_REG_TYPE_D] = 4, + [BRW_HW_REG_TYPE_UW] = 2, + [BRW_HW_REG_TYPE_W] = 2, + [BRW_HW_REG_IMM_TYPE_UV] = 2, + [BRW_HW_REG_IMM_TYPE_VF] = 4, + [BRW_HW_REG_IMM_TYPE_V] = 2, + [BRW_HW_REG_TYPE_F] = 4, + [GEN8_HW_REG_TYPE_UQ] = 8, + [GEN8_HW_REG_TYPE_Q] = 8, + [GEN8_HW_REG_IMM_TYPE_DF] = 8, + [GEN8_HW_REG_IMM_TYPE_HF] = 2, + }; + assert(type < ARRAY_SIZE(imm_hw_sizes)); + assert(devinfo->gen >= 6 || type != BRW_HW_REG_IMM_TYPE_UV); + assert(devinfo->gen >= 8 || type <= BRW_HW_REG_TYPE_F); + return imm_hw_sizes[type]; + } else { + /* Non-immediate registers */ + static const unsigned hw_sizes[] = { + [BRW_HW_REG_TYPE_UD] = 4, + [BRW_HW_REG_TYPE_D] = 4, + [BRW_HW_REG_TYPE_UW] = 2, + [BRW_HW_REG_TYPE_W] = 2, + [BRW_HW_REG_NON_IMM_TYPE_UB] = 1, + [BRW_HW_REG_NON_IMM_TYPE_B] = 1, + [GEN7_HW_REG_NON_IMM_TYPE_DF] = 8, + [BRW_HW_REG_TYPE_F] = 4, + [GEN8_HW_REG_TYPE_UQ] = 8, + [GEN8_HW_REG_TYPE_Q] = 8, + [GEN8_HW_REG_NON_IMM_TYPE_HF] = 2, + }; + assert(type < ARRAY_SIZE(hw_sizes)); + assert(devinfo->gen >= 7 || + (type < GEN7_HW_REG_NON_IMM_TYPE_DF || type == BRW_HW_REG_TYPE_F)); + assert(devinfo->gen >= 8 || type <= BRW_HW_REG_TYPE_F); + return hw_sizes[type]; + } +} + +void +brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest) +{ + const struct gen_device_info *devinfo = p->devinfo; + + if (dest.file == BRW_MESSAGE_REGISTER_FILE) + assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen)); + else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE) + assert(dest.nr < 128); + + gen7_convert_mrf_to_grf(p, &dest); + + brw_inst_set_dst_reg_file(devinfo, inst, dest.file); + brw_inst_set_dst_reg_type(devinfo, inst, 
+ brw_reg_type_to_hw_type(devinfo, dest.type, + dest.file)); + brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode); + + if (dest.address_mode == BRW_ADDRESS_DIRECT) { + brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr); + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr); + if (dest.hstride == BRW_HORIZONTAL_STRIDE_0) + dest.hstride = BRW_HORIZONTAL_STRIDE_1; + brw_inst_set_dst_hstride(devinfo, inst, dest.hstride); + } else { + brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16); + brw_inst_set_da16_writemask(devinfo, inst, dest.writemask); + if (dest.file == BRW_GENERAL_REGISTER_FILE || + dest.file == BRW_MESSAGE_REGISTER_FILE) { + assert(dest.writemask != 0); + } + /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1: + * Although Dst.HorzStride is a don't care for Align16, HW needs + * this to be programmed as "01". + */ + brw_inst_set_dst_hstride(devinfo, inst, 1); + } + } else { + brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr); + + /* These are different sizes in align1 vs align16: + */ + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + brw_inst_set_dst_ia1_addr_imm(devinfo, inst, + dest.indirect_offset); + if (dest.hstride == BRW_HORIZONTAL_STRIDE_0) + dest.hstride = BRW_HORIZONTAL_STRIDE_1; + brw_inst_set_dst_hstride(devinfo, inst, dest.hstride); + } else { + brw_inst_set_dst_ia16_addr_imm(devinfo, inst, + dest.indirect_offset); + /* even ignored in da16, still need to set as '01' */ + brw_inst_set_dst_hstride(devinfo, inst, 1); + } + } + + /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8) + * or 16 (SIMD16), as that's normally correct. However, when dealing with + * small registers, we automatically reduce it to match the register size. + * + * In platforms that support fp64 we can emit instructions with a width of + * 4 that need two SIMD8 registers and an exec_size of 8 or 16. 
In these + * cases we need to make sure that these instructions have their exec sizes + * set properly when they are emitted and we can't rely on this code to fix + * it. + */ + bool fix_exec_size; + if (devinfo->gen >= 6) + fix_exec_size = dest.width < BRW_EXECUTE_4; + else + fix_exec_size = dest.width < BRW_EXECUTE_8; + + if (fix_exec_size) + brw_inst_set_exec_size(devinfo, inst, dest.width); +} + +static void +validate_reg(const struct gen_device_info *devinfo, + brw_inst *inst, struct brw_reg reg) +{ + const int hstride_for_reg[] = {0, 1, 2, 4}; + const int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32}; + const int width_for_reg[] = {1, 2, 4, 8, 16}; + const int execsize_for_reg[] = {1, 2, 4, 8, 16, 32}; + int width, hstride, vstride, execsize; + + if (reg.file == BRW_IMMEDIATE_VALUE) { + /* 3.3.6: Region Parameters. Restriction: Immediate vectors + * mean the destination has to be 128-bit aligned and the + * destination horiz stride has to be a word. + */ + if (reg.type == BRW_REGISTER_TYPE_V) { + unsigned UNUSED elem_size = brw_element_size(devinfo, inst, dst); + assert(hstride_for_reg[brw_inst_dst_hstride(devinfo, inst)] * + elem_size == 2); + } + + return; + } + + if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE && + reg.file == BRW_ARF_NULL) + return; + + /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5: + * + * "Swizzling is not allowed when an accumulator is used as an implicit + * source or an explicit source in an instruction." 
+ */ + if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE && + reg.nr == BRW_ARF_ACCUMULATOR) + assert(reg.swizzle == BRW_SWIZZLE_XYZW); + + assert(reg.hstride >= 0 && reg.hstride < ARRAY_SIZE(hstride_for_reg)); + hstride = hstride_for_reg[reg.hstride]; + + if (reg.vstride == 0xf) { + vstride = -1; + } else { + assert(reg.vstride >= 0 && reg.vstride < ARRAY_SIZE(vstride_for_reg)); + vstride = vstride_for_reg[reg.vstride]; + } + + assert(reg.width >= 0 && reg.width < ARRAY_SIZE(width_for_reg)); + width = width_for_reg[reg.width]; + + assert(brw_inst_exec_size(devinfo, inst) >= 0 && + brw_inst_exec_size(devinfo, inst) < ARRAY_SIZE(execsize_for_reg)); + execsize = execsize_for_reg[brw_inst_exec_size(devinfo, inst)]; + + /* Restrictions from 3.3.10: Register Region Restrictions. */ + /* 3. */ + assert(execsize >= width); + + /* 4. */ + if (execsize == width && hstride != 0) { + assert(vstride == -1 || vstride == width * hstride); + } + + /* 5. */ + if (execsize == width && hstride == 0) { + /* no restriction on vstride. */ + } + + /* 6. */ + if (width == 1) { + assert(hstride == 0); + } + + /* 7. */ + if (execsize == 1 && width == 1) { + assert(hstride == 0); + assert(vstride == 0); + } + + /* 8. */ + if (vstride == 0 && hstride == 0) { + assert(width == 1); + } + + /* 10. Check destination issues. */ +} + +static bool +is_compactable_immediate(unsigned imm) +{ + /* We get the low 12 bits as-is. */ + imm &= ~0xfff; + + /* We get one bit replicated through the top 20 bits. 
 */ + return imm == 0 || imm == 0xfffff000; +} + +void +brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg) +{ + const struct gen_device_info *devinfo = p->devinfo; + + if (reg.file == BRW_MESSAGE_REGISTER_FILE) + assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen)); + else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE) + assert(reg.nr < 128); + + gen7_convert_mrf_to_grf(p, &reg); + + if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND || + brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) { + /* Any source modifiers or regions will be ignored, since this just + * identifies the MRF/GRF to start reading the message contents from. + * Check for some likely failures. + */ + assert(!reg.negate); + assert(!reg.abs); + assert(reg.address_mode == BRW_ADDRESS_DIRECT); + } + + validate_reg(devinfo, inst, reg); + + brw_inst_set_src0_reg_file(devinfo, inst, reg.file); + brw_inst_set_src0_reg_type(devinfo, inst, + brw_reg_type_to_hw_type(devinfo, reg.type, reg.file)); + brw_inst_set_src0_abs(devinfo, inst, reg.abs); + brw_inst_set_src0_negate(devinfo, inst, reg.negate); + brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode); + + if (reg.file == BRW_IMMEDIATE_VALUE) { + if (reg.type == BRW_REGISTER_TYPE_DF || + brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM) + brw_inst_set_imm_df(devinfo, inst, reg.df); + else if (reg.type == BRW_REGISTER_TYPE_UQ || + reg.type == BRW_REGISTER_TYPE_Q) + brw_inst_set_imm_uq(devinfo, inst, reg.u64); + else + brw_inst_set_imm_ud(devinfo, inst, reg.ud); + + /* The Bspec's section titled "Non-present Operands" claims that if src0 + * is an immediate that src1's type must be the same as that of src0. + * + * The SNB+ DataTypeIndex instruction compaction tables contain mappings + * that do not follow this rule. 
E.g., from the IVB/HSW table: + * + * DataTypeIndex 18-Bit Mapping Mapped Meaning + * 3 001000001011111101 r:f | i:vf | a:ud | <1> | dir | + * + * And from the SNB table: + * + * DataTypeIndex 18-Bit Mapping Mapped Meaning + * 8 001000000111101100 a:w | i:w | a:ud | <1> | dir | + * + * Neither of these cause warnings from the simulator when used, + * compacted or otherwise. In fact, all compaction mappings that have an + * immediate in src0 use a:ud for src1. + * + * The GM45 instruction compaction tables do not contain mapped meanings + * so it's not clear whether it has the restriction. We'll assume it was + * lifted on SNB. (FINISHME: decode the GM45 tables and check.) + * + * Don't do any of this for 64-bit immediates, since the src1 fields + * overlap with the immediate and setting them would overwrite the + * immediate we set. + */ + if (type_sz(reg.type) < 8) { + brw_inst_set_src1_reg_file(devinfo, inst, + BRW_ARCHITECTURE_REGISTER_FILE); + if (devinfo->gen < 6) { + brw_inst_set_src1_reg_type(devinfo, inst, + brw_inst_src0_reg_type(devinfo, inst)); + } else { + brw_inst_set_src1_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD); + } + } + + /* Compacted instructions only have 12-bits (plus 1 for the other 20) + * for immediate values. Presumably the hardware engineers realized + * that the only useful floating-point value that could be represented + * in this format is 0.0, which can also be represented as a VF-typed + * immediate, so they gave us the previously mentioned mapping on IVB+. + * + * Strangely, we do have a mapping for imm:f in src1, so we don't need + * to do this there. + * + * If we see a 0.0:F, change the type to VF so that it can be compacted. 
+ */ + if (brw_inst_imm_ud(devinfo, inst) == 0x0 && + brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_F && + brw_inst_dst_reg_type(devinfo, inst) != GEN7_HW_REG_NON_IMM_TYPE_DF) { + brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_IMM_TYPE_VF); + } + + /* There are no mappings for dst:d | i:d, so if the immediate is suitable + * set the types to :UD so the instruction can be compacted. + */ + if (is_compactable_immediate(brw_inst_imm_ud(devinfo, inst)) && + brw_inst_cond_modifier(devinfo, inst) == BRW_CONDITIONAL_NONE && + brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D && + brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D) { + brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD); + brw_inst_set_dst_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD); + } + } else { + if (reg.address_mode == BRW_ADDRESS_DIRECT) { + brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr); + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr); + } else { + brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16); + } + } else { + brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr); + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset); + } else { + brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset); + } + } + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + if (reg.width == BRW_WIDTH_1 && + brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) { + brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0); + brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1); + brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0); + } else { + brw_inst_set_src0_hstride(devinfo, inst, reg.hstride); + brw_inst_set_src0_width(devinfo, inst, reg.width); + brw_inst_set_src0_vstride(devinfo, inst, reg.vstride); + } + } else { + brw_inst_set_src0_da16_swiz_x(devinfo, inst, + 
 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X)); + brw_inst_set_src0_da16_swiz_y(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y)); + brw_inst_set_src0_da16_swiz_z(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z)); + brw_inst_set_src0_da16_swiz_w(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W)); + + /* This is an oddity of the fact we're using the same + * descriptions for registers in align_16 as align_1: + */ + if (reg.vstride == BRW_VERTICAL_STRIDE_8) + brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4); + else + brw_inst_set_src0_vstride(devinfo, inst, reg.vstride); + } + } +} + + +void +brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg) +{ + const struct gen_device_info *devinfo = p->devinfo; + + if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE) + assert(reg.nr < 128); + + /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5: + * + * "Accumulator registers may be accessed explicitly as src0 + * operands only." + */ + assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE || + reg.nr != BRW_ARF_ACCUMULATOR); + + gen7_convert_mrf_to_grf(p, &reg); + assert(reg.file != BRW_MESSAGE_REGISTER_FILE); + + validate_reg(devinfo, inst, reg); + + brw_inst_set_src1_reg_file(devinfo, inst, reg.file); + brw_inst_set_src1_reg_type(devinfo, inst, + brw_reg_type_to_hw_type(devinfo, reg.type, reg.file)); + brw_inst_set_src1_abs(devinfo, inst, reg.abs); + brw_inst_set_src1_negate(devinfo, inst, reg.negate); + + /* Only src1 can be immediate in two-argument instructions. 
+ */ + assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE); + + if (reg.file == BRW_IMMEDIATE_VALUE) { + /* two-argument instructions can only use 32-bit immediates */ + assert(type_sz(reg.type) < 8); + brw_inst_set_imm_ud(devinfo, inst, reg.ud); + } else { + /* This is a hardware restriction, which may or may not be lifted + * in the future: + */ + assert (reg.address_mode == BRW_ADDRESS_DIRECT); + /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */ + + brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr); + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr); + } else { + brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16); + } + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + if (reg.width == BRW_WIDTH_1 && + brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) { + brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0); + brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1); + brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0); + } else { + brw_inst_set_src1_hstride(devinfo, inst, reg.hstride); + brw_inst_set_src1_width(devinfo, inst, reg.width); + brw_inst_set_src1_vstride(devinfo, inst, reg.vstride); + } + } else { + brw_inst_set_src1_da16_swiz_x(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X)); + brw_inst_set_src1_da16_swiz_y(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y)); + brw_inst_set_src1_da16_swiz_z(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z)); + brw_inst_set_src1_da16_swiz_w(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W)); + + /* This is an oddity of the fact we're using the same + * descriptions for registers in align_16 as align_1: + */ + if (reg.vstride == BRW_VERTICAL_STRIDE_8) + brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4); + else + brw_inst_set_src1_vstride(devinfo, inst, reg.vstride); + } + } +} + +/** + * Set the Message Descriptor and Extended Message 
Descriptor fields + * for SEND messages. + * + * \note This zeroes out the Function Control bits, so it must be called + * \b before filling out any message-specific data. Callers can + * choose not to fill in irrelevant bits; they will be zero. + */ +void +brw_set_message_descriptor(struct brw_codegen *p, + brw_inst *inst, + enum brw_message_target sfid, + unsigned msg_length, + unsigned response_length, + bool header_present, + bool end_of_thread) +{ + const struct gen_device_info *devinfo = p->devinfo; + + brw_set_src1(p, inst, brw_imm_d(0)); + + /* For indirect sends, `inst` will not be the SEND/SENDC instruction + * itself; instead, it will be a MOV/OR into the address register. + * + * In this case, we avoid setting the extended message descriptor bits, + * since they go on the later SEND/SENDC instead and if set here would + * instead clobber the conditionalmod bits. + */ + unsigned opcode = brw_inst_opcode(devinfo, inst); + if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) { + brw_inst_set_sfid(devinfo, inst, sfid); + } + + brw_inst_set_mlen(devinfo, inst, msg_length); + brw_inst_set_rlen(devinfo, inst, response_length); + brw_inst_set_eot(devinfo, inst, end_of_thread); + + if (devinfo->gen >= 5) { + brw_inst_set_header_present(devinfo, inst, header_present); + } +} + +static void brw_set_math_message( struct brw_codegen *p, + brw_inst *inst, + unsigned function, + unsigned integer_type, + bool low_precision, + unsigned dataType ) +{ + const struct gen_device_info *devinfo = p->devinfo; + unsigned msg_length; + unsigned response_length; + + /* Infer message length from the function */ + switch (function) { + case BRW_MATH_FUNCTION_POW: + case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT: + case BRW_MATH_FUNCTION_INT_DIV_REMAINDER: + case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER: + msg_length = 2; + break; + default: + msg_length = 1; + break; + } + + /* Infer response length from the function */ + switch (function) { + case 
BRW_MATH_FUNCTION_SINCOS: + case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER: + response_length = 2; + break; + default: + response_length = 1; + break; + } + + + brw_set_message_descriptor(p, inst, BRW_SFID_MATH, + msg_length, response_length, false, false); + brw_inst_set_math_msg_function(devinfo, inst, function); + brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type); + brw_inst_set_math_msg_precision(devinfo, inst, low_precision); + brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst)); + brw_inst_set_math_msg_data_type(devinfo, inst, dataType); + brw_inst_set_saturate(devinfo, inst, 0); +} + + +static void brw_set_ff_sync_message(struct brw_codegen *p, + brw_inst *insn, + bool allocate, + unsigned response_length, + bool end_of_thread) +{ + const struct gen_device_info *devinfo = p->devinfo; + + brw_set_message_descriptor(p, insn, BRW_SFID_URB, + 1, response_length, true, end_of_thread); + brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */ + brw_inst_set_urb_allocate(devinfo, insn, allocate); + /* The following fields are not used by FF_SYNC: */ + brw_inst_set_urb_global_offset(devinfo, insn, 0); + brw_inst_set_urb_swizzle_control(devinfo, insn, 0); + brw_inst_set_urb_used(devinfo, insn, 0); + brw_inst_set_urb_complete(devinfo, insn, 0); +} + +static void brw_set_urb_message( struct brw_codegen *p, + brw_inst *insn, + enum brw_urb_write_flags flags, + unsigned msg_length, + unsigned response_length, + unsigned offset, + unsigned swizzle_control ) +{ + const struct gen_device_info *devinfo = p->devinfo; + + assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE); + assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE)); + assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET)); + + brw_set_message_descriptor(p, insn, BRW_SFID_URB, + msg_length, response_length, true, + flags & BRW_URB_WRITE_EOT); + + if (flags & BRW_URB_WRITE_OWORD) { + assert(msg_length == 2); /* header + one 
OWORD of data */ + brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD); + } else { + brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD); + } + + brw_inst_set_urb_global_offset(devinfo, insn, offset); + brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control); + + if (devinfo->gen < 8) { + brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE)); + } + + if (devinfo->gen < 7) { + brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE)); + brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED)); + } else { + brw_inst_set_urb_per_slot_offset(devinfo, insn, + !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET)); + } +} + +void +brw_set_dp_write_message(struct brw_codegen *p, + brw_inst *insn, + unsigned binding_table_index, + unsigned msg_control, + unsigned msg_type, + unsigned target_cache, + unsigned msg_length, + bool header_present, + unsigned last_render_target, + unsigned response_length, + unsigned end_of_thread, + unsigned send_commit_msg) +{ + const struct gen_device_info *devinfo = p->devinfo; + const unsigned sfid = (devinfo->gen >= 6 ? 
target_cache : + BRW_SFID_DATAPORT_WRITE); + + brw_set_message_descriptor(p, insn, sfid, msg_length, response_length, + header_present, end_of_thread); + + brw_inst_set_binding_table_index(devinfo, insn, binding_table_index); + brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type); + brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control); + brw_inst_set_rt_last(devinfo, insn, last_render_target); + if (devinfo->gen < 7) { + brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg); + } +} + +void +brw_set_dp_read_message(struct brw_codegen *p, + brw_inst *insn, + unsigned binding_table_index, + unsigned msg_control, + unsigned msg_type, + unsigned target_cache, + unsigned msg_length, + bool header_present, + unsigned response_length) +{ + const struct gen_device_info *devinfo = p->devinfo; + const unsigned sfid = (devinfo->gen >= 6 ? target_cache : + BRW_SFID_DATAPORT_READ); + + brw_set_message_descriptor(p, insn, sfid, msg_length, response_length, + header_present, false); + + brw_inst_set_binding_table_index(devinfo, insn, binding_table_index); + brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type); + brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control); + if (devinfo->gen < 6) + brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache); +} + +void +brw_set_sampler_message(struct brw_codegen *p, + brw_inst *inst, + unsigned binding_table_index, + unsigned sampler, + unsigned msg_type, + unsigned response_length, + unsigned msg_length, + unsigned header_present, + unsigned simd_mode, + unsigned return_format) +{ + const struct gen_device_info *devinfo = p->devinfo; + + brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length, + response_length, header_present, false); + + brw_inst_set_binding_table_index(devinfo, inst, binding_table_index); + brw_inst_set_sampler(devinfo, inst, sampler); + brw_inst_set_sampler_msg_type(devinfo, inst, msg_type); + if (devinfo->gen >= 5) { + brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode); 
+ } else if (devinfo->gen == 4 && !devinfo->is_g4x) { + brw_inst_set_sampler_return_format(devinfo, inst, return_format); + } +} + +static void +gen7_set_dp_scratch_message(struct brw_codegen *p, + brw_inst *inst, + bool write, + bool dword, + bool invalidate_after_read, + unsigned num_regs, + unsigned addr_offset, + unsigned mlen, + unsigned rlen, + bool header_present) +{ + const struct gen_device_info *devinfo = p->devinfo; + assert(num_regs == 1 || num_regs == 2 || num_regs == 4 || + (devinfo->gen >= 8 && num_regs == 8)); + const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) : + num_regs - 1); + + brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE, + mlen, rlen, header_present, false); + brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */ + brw_inst_set_scratch_read_write(devinfo, inst, write); + brw_inst_set_scratch_type(devinfo, inst, dword); + brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read); + brw_inst_set_scratch_block_size(devinfo, inst, block_size); + brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset); +} + +#define next_insn brw_next_insn +brw_inst * +brw_next_insn(struct brw_codegen *p, unsigned opcode) +{ + const struct gen_device_info *devinfo = p->devinfo; + brw_inst *insn; + + if (p->nr_insn + 1 > p->store_size) { + p->store_size <<= 1; + p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size); + } + + p->next_insn_offset += 16; + insn = &p->store[p->nr_insn++]; + memcpy(insn, p->current, sizeof(*insn)); + + brw_inst_set_opcode(devinfo, insn, opcode); + return insn; +} + +static brw_inst * +brw_alu1(struct brw_codegen *p, unsigned opcode, + struct brw_reg dest, struct brw_reg src) +{ + brw_inst *insn = next_insn(p, opcode); + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src); + return insn; +} + +static brw_inst * +brw_alu2(struct brw_codegen *p, unsigned opcode, + struct brw_reg dest, struct brw_reg src0, struct brw_reg 
src1) +{ + /* 64-bit immediates are only supported on 1-src instructions */ + assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4); + assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4); + + brw_inst *insn = next_insn(p, opcode); + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_src1(p, insn, src1); + return insn; +} + +static int +get_3src_subreg_nr(struct brw_reg reg) +{ + /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions + * use 32-bit units (components 0..7). Since they only support F/D/UD + * types, this doesn't lose any flexibility, but uses fewer bits. + */ + return reg.subnr / 4; +} + +static brw_inst * +brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest, + struct brw_reg src0, struct brw_reg src1, struct brw_reg src2) +{ + const struct gen_device_info *devinfo = p->devinfo; + brw_inst *inst = next_insn(p, opcode); + + gen7_convert_mrf_to_grf(p, &dest); + + assert(brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16); + + assert(dest.file == BRW_GENERAL_REGISTER_FILE || + dest.file == BRW_MESSAGE_REGISTER_FILE); + assert(dest.nr < 128); + assert(dest.address_mode == BRW_ADDRESS_DIRECT); + assert(dest.type == BRW_REGISTER_TYPE_F || + dest.type == BRW_REGISTER_TYPE_DF || + dest.type == BRW_REGISTER_TYPE_D || + dest.type == BRW_REGISTER_TYPE_UD); + if (devinfo->gen == 6) { + brw_inst_set_3src_dst_reg_file(devinfo, inst, + dest.file == BRW_MESSAGE_REGISTER_FILE); + } + brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr); + brw_inst_set_3src_dst_subreg_nr(devinfo, inst, dest.subnr / 16); + brw_inst_set_3src_dst_writemask(devinfo, inst, dest.writemask); + + assert(src0.file == BRW_GENERAL_REGISTER_FILE); + assert(src0.address_mode == BRW_ADDRESS_DIRECT); + assert(src0.nr < 128); + brw_inst_set_3src_src0_swizzle(devinfo, inst, src0.swizzle); + brw_inst_set_3src_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0)); + brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr); 
+ brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs); + brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate); + brw_inst_set_3src_src0_rep_ctrl(devinfo, inst, + src0.vstride == BRW_VERTICAL_STRIDE_0); + + assert(src1.file == BRW_GENERAL_REGISTER_FILE); + assert(src1.address_mode == BRW_ADDRESS_DIRECT); + assert(src1.nr < 128); + brw_inst_set_3src_src1_swizzle(devinfo, inst, src1.swizzle); + brw_inst_set_3src_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1)); + brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr); + brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs); + brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate); + brw_inst_set_3src_src1_rep_ctrl(devinfo, inst, + src1.vstride == BRW_VERTICAL_STRIDE_0); + + assert(src2.file == BRW_GENERAL_REGISTER_FILE); + assert(src2.address_mode == BRW_ADDRESS_DIRECT); + assert(src2.nr < 128); + brw_inst_set_3src_src2_swizzle(devinfo, inst, src2.swizzle); + brw_inst_set_3src_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2)); + brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr); + brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs); + brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate); + brw_inst_set_3src_src2_rep_ctrl(devinfo, inst, + src2.vstride == BRW_VERTICAL_STRIDE_0); + + if (devinfo->gen >= 7) { + /* Set both the source and destination types based on dest.type, + * ignoring the source register types. The MAD and LRP emitters ensure + * that all four types are float. The BFE and BFI2 emitters, however, + * may send us mixed D and UD types and want us to ignore that and use + * the destination type. 
+ */ + switch (dest.type) { + case BRW_REGISTER_TYPE_F: + brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_F); + brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_F); + break; + case BRW_REGISTER_TYPE_DF: + brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_DF); + brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_DF); + break; + case BRW_REGISTER_TYPE_D: + brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_D); + brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_D); + break; + case BRW_REGISTER_TYPE_UD: + brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_UD); + brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_UD); + break; + default: + unreachable("not reached"); + } + } + + return inst; +} + + +/*********************************************************************** + * Convenience routines. + */ +#define ALU1(OP) \ +brw_inst *brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src0) \ +{ \ + return brw_alu1(p, BRW_OPCODE_##OP, dest, src0); \ +} + +#define ALU2(OP) \ +brw_inst *brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src0, \ + struct brw_reg src1) \ +{ \ + return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \ +} + +#define ALU3(OP) \ +brw_inst *brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src0, \ + struct brw_reg src1, \ + struct brw_reg src2) \ +{ \ + return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \ +} + +#define ALU3F(OP) \ +brw_inst *brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src0, \ + struct brw_reg src1, \ + struct brw_reg src2) \ +{ \ + assert(dest.type == BRW_REGISTER_TYPE_F || \ + dest.type == BRW_REGISTER_TYPE_DF); \ + if (dest.type == BRW_REGISTER_TYPE_F) { \ + assert(src0.type == BRW_REGISTER_TYPE_F); \ + assert(src1.type == BRW_REGISTER_TYPE_F); \ + assert(src2.type == BRW_REGISTER_TYPE_F); \ + } else if (dest.type == BRW_REGISTER_TYPE_DF) { \ + assert(src0.type == 
BRW_REGISTER_TYPE_DF); \ + assert(src1.type == BRW_REGISTER_TYPE_DF); \ + assert(src2.type == BRW_REGISTER_TYPE_DF); \ + } \ + return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \ +} + +/* Rounding operations (other than RNDD) require two instructions - the first + * stores a rounded value (possibly the wrong way) in the dest register, but + * also sets a per-channel "increment bit" in the flag register. A predicated + * add of 1.0 fixes dest to contain the desired result. + * + * Sandybridge and later appear to round correctly without an ADD. + */ +#define ROUND(OP) \ +void brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src) \ +{ \ + const struct gen_device_info *devinfo = p->devinfo; \ + brw_inst *rnd, *add; \ + rnd = next_insn(p, BRW_OPCODE_##OP); \ + brw_set_dest(p, rnd, dest); \ + brw_set_src0(p, rnd, src); \ + \ + if (devinfo->gen < 6) { \ + /* turn on round-increments */ \ + brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R); \ + add = brw_ADD(p, dest, dest, brw_imm_f(1.0f)); \ + brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL); \ + } \ +} + + +ALU1(MOV) +ALU2(SEL) +ALU1(NOT) +ALU2(AND) +ALU2(OR) +ALU2(XOR) +ALU2(SHR) +ALU2(SHL) +ALU1(DIM) +ALU2(ASR) +ALU1(FRC) +ALU1(RNDD) +ALU2(MAC) +ALU2(MACH) +ALU1(LZD) +ALU2(DP4) +ALU2(DPH) +ALU2(DP3) +ALU2(DP2) +ALU3F(MAD) +ALU3F(LRP) +ALU1(BFREV) +ALU3(BFE) +ALU2(BFI1) +ALU3(BFI2) +ALU1(FBH) +ALU1(FBL) +ALU1(CBIT) +ALU2(ADDC) +ALU2(SUBB) + +ROUND(RNDZ) +ROUND(RNDE) + + +brw_inst * +brw_ADD(struct brw_codegen *p, struct brw_reg dest, + struct brw_reg src0, struct brw_reg src1) +{ + /* 6.2.2: add */ + if (src0.type == BRW_REGISTER_TYPE_F || + (src0.file == BRW_IMMEDIATE_VALUE && + src0.type == BRW_REGISTER_TYPE_VF)) { + assert(src1.type != BRW_REGISTER_TYPE_UD); + assert(src1.type != BRW_REGISTER_TYPE_D); + } + + if (src1.type == BRW_REGISTER_TYPE_F || + (src1.file == BRW_IMMEDIATE_VALUE && + src1.type == BRW_REGISTER_TYPE_VF)) { + assert(src0.type != 
BRW_REGISTER_TYPE_UD); + assert(src0.type != BRW_REGISTER_TYPE_D); + } + + return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1); +} + +brw_inst * +brw_AVG(struct brw_codegen *p, struct brw_reg dest, + struct brw_reg src0, struct brw_reg src1) +{ + assert(dest.type == src0.type); + assert(src0.type == src1.type); + switch (src0.type) { + case BRW_REGISTER_TYPE_B: + case BRW_REGISTER_TYPE_UB: + case BRW_REGISTER_TYPE_W: + case BRW_REGISTER_TYPE_UW: + case BRW_REGISTER_TYPE_D: + case BRW_REGISTER_TYPE_UD: + break; + default: + unreachable("Bad type for brw_AVG"); + } + + return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1); +} + +brw_inst * +brw_MUL(struct brw_codegen *p, struct brw_reg dest, + struct brw_reg src0, struct brw_reg src1) +{ + /* 6.32.38: mul */ + if (src0.type == BRW_REGISTER_TYPE_D || + src0.type == BRW_REGISTER_TYPE_UD || + src1.type == BRW_REGISTER_TYPE_D || + src1.type == BRW_REGISTER_TYPE_UD) { + assert(dest.type != BRW_REGISTER_TYPE_F); + } + + if (src0.type == BRW_REGISTER_TYPE_F || + (src0.file == BRW_IMMEDIATE_VALUE && + src0.type == BRW_REGISTER_TYPE_VF)) { + assert(src1.type != BRW_REGISTER_TYPE_UD); + assert(src1.type != BRW_REGISTER_TYPE_D); + } + + if (src1.type == BRW_REGISTER_TYPE_F || + (src1.file == BRW_IMMEDIATE_VALUE && + src1.type == BRW_REGISTER_TYPE_VF)) { + assert(src0.type != BRW_REGISTER_TYPE_UD); + assert(src0.type != BRW_REGISTER_TYPE_D); + } + + assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE || + src0.nr != BRW_ARF_ACCUMULATOR); + assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE || + src1.nr != BRW_ARF_ACCUMULATOR); + + return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1); +} + +brw_inst * +brw_LINE(struct brw_codegen *p, struct brw_reg dest, + struct brw_reg src0, struct brw_reg src1) +{ + src0.vstride = BRW_VERTICAL_STRIDE_0; + src0.width = BRW_WIDTH_1; + src0.hstride = BRW_HORIZONTAL_STRIDE_0; + return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1); +} + +brw_inst * +brw_PLN(struct brw_codegen *p, struct brw_reg 
dest, + struct brw_reg src0, struct brw_reg src1) +{ + src0.vstride = BRW_VERTICAL_STRIDE_0; + src0.width = BRW_WIDTH_1; + src0.hstride = BRW_HORIZONTAL_STRIDE_0; + src1.vstride = BRW_VERTICAL_STRIDE_8; + src1.width = BRW_WIDTH_8; + src1.hstride = BRW_HORIZONTAL_STRIDE_1; + return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1); +} + +brw_inst * +brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src) +{ + const struct gen_device_info *devinfo = p->devinfo; + const bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16; + /* The F32TO16 instruction doesn't support 32-bit destination types in + * Align1 mode, and neither does the Gen8 implementation in terms of a + * converting MOV. Gen7 does zero out the high 16 bits in Align16 mode as + * an undocumented feature. + */ + const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD && + (!align16 || devinfo->gen >= 8)); + brw_inst *inst; + + if (align16) { + assert(dst.type == BRW_REGISTER_TYPE_UD); + } else { + assert(dst.type == BRW_REGISTER_TYPE_UD || + dst.type == BRW_REGISTER_TYPE_W || + dst.type == BRW_REGISTER_TYPE_UW || + dst.type == BRW_REGISTER_TYPE_HF); + } + + brw_push_insn_state(p); + + if (needs_zero_fill) { + brw_set_default_access_mode(p, BRW_ALIGN_1); + dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2); + } + + if (devinfo->gen >= 8) { + inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src); + } else { + assert(devinfo->gen == 7); + inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src); + } + + if (needs_zero_fill) { + brw_inst_set_no_dd_clear(devinfo, inst, true); + inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0)); + brw_inst_set_no_dd_check(devinfo, inst, true); + } + + brw_pop_insn_state(p); + return inst; +} + +brw_inst * +brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src) +{ + const struct gen_device_info *devinfo = p->devinfo; + bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16; + + if (align16) { + 
assert(src.type == BRW_REGISTER_TYPE_UD); + } else { + /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32: + * + * Because this instruction does not have a 16-bit floating-point + * type, the source data type must be Word (W). The destination type + * must be F (Float). + */ + if (src.type == BRW_REGISTER_TYPE_UD) + src = spread(retype(src, BRW_REGISTER_TYPE_W), 2); + + assert(src.type == BRW_REGISTER_TYPE_W || + src.type == BRW_REGISTER_TYPE_UW || + src.type == BRW_REGISTER_TYPE_HF); + } + + if (devinfo->gen >= 8) { + return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF)); + } else { + assert(devinfo->gen == 7); + return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src); + } +} + + +void brw_NOP(struct brw_codegen *p) +{ + brw_inst *insn = next_insn(p, BRW_OPCODE_NOP); + memset(insn, 0, sizeof(*insn)); + brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP); +} + + + + + +/*********************************************************************** + * Comparisons, if/else/endif + */ + +brw_inst * +brw_JMPI(struct brw_codegen *p, struct brw_reg index, + unsigned predicate_control) +{ + const struct gen_device_info *devinfo = p->devinfo; + struct brw_reg ip = brw_ip_reg(); + brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index); + + brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_2); + brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE); + brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE); + brw_inst_set_pred_control(devinfo, inst, predicate_control); + + return inst; +} + +static void +push_if_stack(struct brw_codegen *p, brw_inst *inst) +{ + p->if_stack[p->if_stack_depth] = inst - p->store; + + p->if_stack_depth++; + if (p->if_stack_array_size <= p->if_stack_depth) { + p->if_stack_array_size *= 2; + p->if_stack = reralloc(p->mem_ctx, p->if_stack, int, + p->if_stack_array_size); + } +} + +static brw_inst * +pop_if_stack(struct brw_codegen *p) +{ + p->if_stack_depth--; + return &p->store[p->if_stack[p->if_stack_depth]]; +} + +static void 
+push_loop_stack(struct brw_codegen *p, brw_inst *inst) +{ + if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) { + p->loop_stack_array_size *= 2; + p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int, + p->loop_stack_array_size); + p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int, + p->loop_stack_array_size); + } + + p->loop_stack[p->loop_stack_depth] = inst - p->store; + p->loop_stack_depth++; + p->if_depth_in_loop[p->loop_stack_depth] = 0; +} + +static brw_inst * +get_inner_do_insn(struct brw_codegen *p) +{ + return &p->store[p->loop_stack[p->loop_stack_depth - 1]]; +} + +/* EU takes the value from the flag register and pushes it onto some + * sort of a stack (presumably merging with any flag value already on + * the stack). Within an if block, the flags at the top of the stack + * control execution on each channel of the unit, eg. on each of the + * 16 pixel values in our wm programs. + * + * When the matching 'else' instruction is reached (presumably by + * countdown of the instruction count patched in by our ELSE/ENDIF + * functions), the relevant flags are inverted. + * + * When the matching 'endif' instruction is reached, the flags are + * popped off. If the stack is now empty, normal execution resumes. 
+ */ +brw_inst * +brw_IF(struct brw_codegen *p, unsigned execute_size) +{ + const struct gen_device_info *devinfo = p->devinfo; + brw_inst *insn; + + insn = next_insn(p, BRW_OPCODE_IF); + + /* Override the defaults for this instruction: + */ + if (devinfo->gen < 6) { + brw_set_dest(p, insn, brw_ip_reg()); + brw_set_src0(p, insn, brw_ip_reg()); + brw_set_src1(p, insn, brw_imm_d(0x0)); + } else if (devinfo->gen == 6) { + brw_set_dest(p, insn, brw_imm_w(0)); + brw_inst_set_gen6_jump_count(devinfo, insn, 0); + brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); + brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); + } else if (devinfo->gen == 7) { + brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); + brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); + brw_set_src1(p, insn, brw_imm_w(0)); + brw_inst_set_jip(devinfo, insn, 0); + brw_inst_set_uip(devinfo, insn, 0); + } else { + brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); + brw_set_src0(p, insn, brw_imm_d(0)); + brw_inst_set_jip(devinfo, insn, 0); + brw_inst_set_uip(devinfo, insn, 0); + } + + brw_inst_set_exec_size(devinfo, insn, execute_size); + brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); + brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL); + brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE); + if (!p->single_program_flow && devinfo->gen < 6) + brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH); + + push_if_stack(p, insn); + p->if_depth_in_loop[p->loop_stack_depth]++; + return insn; +} + +/* This function is only used for gen6-style IF instructions with an + * embedded comparison (conditional modifier). It is not used on gen7. 
+ */ +brw_inst * +gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional, + struct brw_reg src0, struct brw_reg src1) +{ + const struct gen_device_info *devinfo = p->devinfo; + brw_inst *insn; + + insn = next_insn(p, BRW_OPCODE_IF); + + brw_set_dest(p, insn, brw_imm_w(0)); + brw_inst_set_exec_size(devinfo, insn, + brw_inst_exec_size(devinfo, p->current)); + brw_inst_set_gen6_jump_count(devinfo, insn, 0); + brw_set_src0(p, insn, src0); + brw_set_src1(p, insn, src1); + + assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE); + assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE); + brw_inst_set_cond_modifier(devinfo, insn, conditional); + + push_if_stack(p, insn); + return insn; +} + +/** + * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs. + */ +static void +convert_IF_ELSE_to_ADD(struct brw_codegen *p, + brw_inst *if_inst, brw_inst *else_inst) +{ + const struct gen_device_info *devinfo = p->devinfo; + + /* The next instruction (where the ENDIF would be, if it existed) */ + brw_inst *next_inst = &p->store[p->nr_insn]; + + assert(p->single_program_flow); + assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF); + assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE); + assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1); + + /* Convert IF to an ADD instruction that moves the instruction pointer + * to the first instruction of the ELSE block. If there is no ELSE + * block, point to where ENDIF would be. Reverse the predicate. + * + * There's no need to execute an ENDIF since we don't need to do any + * stack operations, and if we're currently executing, we just want to + * continue normally. + */ + brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD); + brw_inst_set_pred_inv(devinfo, if_inst, true); + + if (else_inst != NULL) { + /* Convert ELSE to an ADD instruction that points where the ENDIF + * would be. 
+ */ + brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD); + + brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16); + brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16); + } else { + brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16); + } +} + +/** + * Patch IF and ELSE instructions with appropriate jump targets. + */ +static void +patch_IF_ELSE(struct brw_codegen *p, + brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst) +{ + const struct gen_device_info *devinfo = p->devinfo; + + /* We shouldn't be patching IF and ELSE instructions in single program flow + * mode when gen < 6, because in single program flow mode on those + * platforms, we convert flow control instructions to conditional ADDs that + * operate on IP (see brw_ENDIF). + * + * However, on Gen6, writing to IP doesn't work in single program flow mode + * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may + * not be updated by non-flow control instructions."). And on later + * platforms, there is no significant benefit to converting control flow + * instructions to conditional ADDs. So we do patch IF and ELSE + * instructions in single program flow mode on those platforms. + */ + if (devinfo->gen < 6) + assert(!p->single_program_flow); + + assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF); + assert(endif_inst != NULL); + assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE); + + unsigned br = brw_jump_scale(devinfo); + + assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF); + brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst)); + + if (else_inst == NULL) { + /* Patch IF -> ENDIF */ + if (devinfo->gen < 6) { + /* Turn it into an IFF, which means no mask stack operations for + * all-false and jumping past the ENDIF. 
+ */ + brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF); + brw_inst_set_gen4_jump_count(devinfo, if_inst, + br * (endif_inst - if_inst + 1)); + brw_inst_set_gen4_pop_count(devinfo, if_inst, 0); + } else if (devinfo->gen == 6) { + /* As of gen6, there is no IFF and IF must point to the ENDIF. */ + brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst)); + } else { + brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst)); + brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst)); + } + } else { + brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst)); + + /* Patch IF -> ELSE */ + if (devinfo->gen < 6) { + brw_inst_set_gen4_jump_count(devinfo, if_inst, + br * (else_inst - if_inst)); + brw_inst_set_gen4_pop_count(devinfo, if_inst, 0); + } else if (devinfo->gen == 6) { + brw_inst_set_gen6_jump_count(devinfo, if_inst, + br * (else_inst - if_inst + 1)); + } + + /* Patch ELSE -> ENDIF */ + if (devinfo->gen < 6) { + /* BRW_OPCODE_ELSE pre-gen6 should point just past the + * matching ENDIF. + */ + brw_inst_set_gen4_jump_count(devinfo, else_inst, + br * (endif_inst - else_inst + 1)); + brw_inst_set_gen4_pop_count(devinfo, else_inst, 1); + } else if (devinfo->gen == 6) { + /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */ + brw_inst_set_gen6_jump_count(devinfo, else_inst, + br * (endif_inst - else_inst)); + } else { + /* The IF instruction's JIP should point just past the ELSE */ + brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1)); + /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */ + brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst)); + brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst)); + if (devinfo->gen >= 8) { + /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both + * should point to ENDIF. 
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}

/**
 * Emit an ELSE instruction and push it on the IF stack.
 *
 * All jump fields (gen6 jump count, or JIP/UIP on gen7+) are written as zero
 * here; brw_ENDIF() pops this instruction again and passes it to
 * patch_IF_ELSE() together with the matching IF and ENDIF.
 */
void
brw_ELSE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   /* The operand layout of ELSE differs per generation. */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}

/**
 * Close the innermost IF / ELSE block.
 *
 * Pops the IF (and the ELSE, if one was emitted) from the IF stack. Either
 * an ENDIF instruction is emitted and the popped instructions are patched via
 * patch_IF_ELSE(), or -- on Gen4/5 in single program flow mode -- no ENDIF is
 * emitted and the popped instructions are handed to convert_IF_ELSE_to_ADD().
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP. On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of the instruction
    * store memory (p->store), so call it first, before popping any
    * instruction pointers that refer into the store.
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* The operand layout of ENDIF differs per generation. */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}

/**
 * Emit a BREAK instruction and return it.
 *
 * The jump operand is emitted as a zero immediate. Pre-gen6, the pop count is
 * taken from p->if_depth_in_loop[] for the current loop stack depth. The
 * execution size is copied from p->current.
 */
brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->gen >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn,
                          brw_inst_exec_size(devinfo, p->current));

   return insn;
}

/**
 * Emit a CONTINUE instruction and return it.
 *
 * The jump operand is emitted as a zero immediate. Pre-gen6, the pop count is
 * taken from p->if_depth_in_loop[] for the current loop stack depth. The
 * execution size is copied from p->current.
 */
brw_inst *
brw_CONT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   }

   if (devinfo->gen < 6) {
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn,
                          brw_inst_exec_size(devinfo, p->current));
   return insn;
}

/**
 * Emit a HALT instruction and return it.
 *
 * The jump operand is emitted as a zero immediate, to be updated later. The
 * execution size is copied from p->current.
 */
brw_inst *
gen6_HALT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn,
                          brw_inst_exec_size(devinfo, p->current));
   return insn;
}

/* DO/WHILE loop:
 *
 * The DO/WHILE is just an unterminated loop -- break or continue are
 * used for control within the loop. We have a few ways they can be
 * done.
 *
 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
 * jip and no DO instruction.
 *
 * For non-uniform control flow pre-gen6, there's a DO instruction to
 * push the mask, and a WHILE to jump back, and BREAK to get out and
 * pop the mask.
 *
 * For gen6, there's no more mask stack, so no need for DO. WHILE
 * just points back to the first instruction of the loop.
 */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (devinfo->gen >= 6 || p->single_program_flow) {
      /* No DO instruction is emitted; the loop stack records the slot the
       * next emitted instruction will occupy.
       */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}

/**
 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
 * instruction here.
 *
 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
 * nesting, since it can always just point to the end of the block/current loop.
 */
static void
brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *do_inst = get_inner_do_insn(p);
   brw_inst *inst;
   unsigned br = brw_jump_scale(devinfo);

   assert(devinfo->gen < 6);

   /* Walk backwards from the WHILE to (but not including) the matching DO. */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
          brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* BREAK targets the instruction just past the WHILE. */
         brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
                 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* CONTINUE targets the WHILE itself. */
         brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
      }
   }
}

/**
 * Close the innermost loop opened with brw_DO() and return the emitted
 * instruction.
 *
 * On gen6+ a WHILE is emitted whose jump target is the loop start recorded by
 * brw_DO(). Pre-gen6 this is either an ADD to IP (single program flow) or a
 * WHILE paired with a real DO instruction, in which case the loop's BREAK and
 * CONTINUE instructions are patched as well. The loop stack depth is
 * decremented in every case.
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      /* next_insn() is called before get_inner_do_insn() in every branch. */
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, brw_imm_d(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn,
                             brw_inst_exec_size(devinfo, p->current));

   } else {
      if (p->single_program_flow) {
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         /* The IP offset is the instruction distance times 16. */
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

         brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}

/* FORWARD JUMPS:
 *
 * Patch the JMPI instruction at index jmp_insn_idx so that its jump count
 * covers every instruction emitted after it so far (up to p->nr_insn). The
 * distance is doubled on gen5+.
 */
void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *jmp_insn = &p->store[jmp_insn_idx];
   unsigned jmpi = 1;

   if (devinfo->gen >= 5)
      jmpi = 2;

   assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
   assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);

   brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
                                jmpi * (p->nr_insn - jmp_insn_idx - 1));
}

/* To integrate with the above, it makes sense that the comparison
 * instruction should populate the flag register. It might be simpler
 * just to use the flag reg for most WM tasks?
 */
void brw_CMP(struct brw_codegen *p,
             struct brw_reg dest,
             unsigned conditional,
             struct brw_reg src0,
             struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);

   brw_inst_set_cond_modifier(devinfo, insn, conditional);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
    * page says:
    * "Any CMP instruction with a null destination must use a {switch}."
    *
    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
    * mentioned on their work-arounds pages.
    */
   if (devinfo->gen == 7) {
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          dest.nr == BRW_ARF_NULL) {
         brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
      }
   }
}

/***********************************************************************
 * Helpers for the various SEND message types:
 */

/** Extended math function, float[8].
 *
 * Pre-gen6 only (asserted): math is issued as a SEND message whose base MRF
 * is msg_reg_nr. The message is flagged as integer when src has type
 * BRW_REGISTER_TYPE_D, and as scalar or vector data depending on
 * has_scalar_region(src).
 */
void gen4_math(struct brw_codegen *p,
               struct brw_reg dest,
               unsigned function,
               unsigned msg_reg_nr,
               struct brw_reg src,
               unsigned precision )
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   unsigned data_type;
   if (has_scalar_region(src)) {
      data_type = BRW_MATH_DATA_SCALAR;
   } else {
      data_type = BRW_MATH_DATA_VECTOR;
   }

   assert(devinfo->gen < 6);

   /* Example code doesn't set predicate_control for send
    * instructions.
    */
   brw_inst_set_pred_control(devinfo, insn, 0);
   brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   brw_set_math_message(p,
                        insn,
                        function,
                        src.type == BRW_REGISTER_TYPE_D,
                        precision,
                        data_type);
}

/**
 * Extended math for gen6+ (asserted), emitted as a MATH instruction.
 *
 * The asserts below spell out the operand restrictions this helper enforces:
 * destination file and stride, source strides on gen6, integer versus float
 * source types depending on the math function, and no source modifiers on
 * gen6.
 */
void gen6_math(struct brw_codegen *p,
               struct brw_reg dest,
               unsigned function,
               struct brw_reg src0,
               struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(devinfo->gen >= 6);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (devinfo->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (devinfo->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   brw_inst_set_math_function(devinfo, insn, function);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}

/**
 * Return the right surface index to access the thread scratch space using
 * stateless dataport messages.
 */
unsigned
brw_scratch_surface_idx(const struct brw_codegen *p)
{
   /* The scratch space is thread-local so IA coherency is unnecessary. */
   if (p->devinfo->gen >= 8)
      return GEN8_BTI_STATELESS_NON_COHERENT;
   else
      return BRW_BTI_STATELESS;
}

/**
 * Write a block of OWORDs (half a GRF each) to the scratch buffer,
 * using a constant offset per channel.
 *
 * The offset must be aligned to oword size (16 bytes). Used for
 * register spilling.
 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
                                   struct brw_reg mrf,
                                   int num_regs,
                                   unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
   uint32_t msg_type;

   /* On gen6+ the byte offset is converted to units of 16 bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* One header register plus the data registers. */
   const unsigned mlen = 1 + num_regs;

   /* Set up the message header. This is g0, with g0.2 filled with
    * the offset. We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
                                         BRW_REGISTER_TYPE_UW);

      brw_inst_set_compression(devinfo, insn, false);

      /* NOTE(review): elsewhere in this file brw_inst_exec_size() is compared
       * against BRW_EXECUTE_16 (brw_fb_WRITE) or used as a shift count
       * (brw_oword_block_read), which suggests it returns an encoded value
       * rather than a channel count. Comparing it with the literal 16 here
       * looks suspicious -- confirm.
       */
      if (brw_inst_exec_size(devinfo, insn) >= 16)
         src_header = vec16(src_header);

      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
      if (devinfo->gen < 6)
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection. Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->gen >= 6) {
         dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
         send_commit_msg = 0;
      } else {
         dest = src_header;
         send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (devinfo->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      if (devinfo->gen >= 6)
         msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
         msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
                               insn,
                               brw_scratch_surface_idx(p),
                               BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                               msg_type,
                               target_cache,
                               mlen,
                               true, /* header_present */
                               0, /* not a render target */
                               send_commit_msg, /* response_length */
                               0, /* eot */
                               send_commit_msg);
   }
}


/**
 * Read a block of owords (half a GRF each) from the scratch buffer
 * using a constant index per channel.
 *
 * Offset must be aligned to oword size (16 bytes). Used for register
 * spilling.
 */
void
brw_oword_block_read_scratch(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             int num_regs,
                             unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* On gen6+ the byte offset is converted to units of 16 bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   if (p->devinfo->gen >= 7) {
      /* On gen 7 and above, we no longer have message registers and we can
       * send from any register we want. By using the destination register
       * for the message, we guarantee that the implied message write won't
       * accidentally overwrite anything. This has been a problem because
       * the MRF registers and source for the final FB write are both fixed
       * and may overlap.
       */
      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
   } else {
      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   }
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   const unsigned rlen = num_regs;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);

   /* Build the message header: a copy of g0 with element 2 set to offset. */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

      assert(brw_inst_pred_control(devinfo, insn) == 0);
      brw_inst_set_compression(devinfo, insn, false);

      brw_set_dest(p, insn, dest); /* UW? */
      if (devinfo->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
      }

      brw_set_dp_read_message(p,
                              insn,
                              brw_scratch_surface_idx(p),
                              BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                              BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
                              target_cache,
                              1, /* msg_length */
                              true, /* header_present */
                              rlen);
   }
}

/**
 * Read num_regs registers from scratch space into dest using the gen7
 * scratch message, with g0 as the header.
 *
 * The byte offset is converted to units of REG_SIZE and must then fit in
 * 12 bits (asserted).
 */
void
gen7_block_read_scratch(struct brw_codegen *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);

   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));

   gen7_set_dp_scratch_message(p, insn,
                               false, /* scratch read */
                               false, /* OWords */
                               false, /* invalidate after read */
                               num_regs,
                               offset,
                               1, /* mlen: just g0 */
                               num_regs, /* rlen */
                               true); /* header present */
}

/**
 * Read float[4] vectors from the data port constant cache.
 * Location (in buffer) should be a multiple of 16.
 * Used for fetching shader constants.
 */
void brw_oword_block_read(struct brw_codegen *p,
                          struct brw_reg dest,
                          struct brw_reg mrf,
                          uint32_t offset,
                          uint32_t bind_table_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
       BRW_DATAPORT_READ_TARGET_DATA_CACHE);
   /* The exec size field of p->current is used as a power-of-two exponent. */
   const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Nested state push: only the header setup is forced to SIMD8. */
   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_MOV(p,
           retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                               mrf.nr,
                               2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(offset));
   brw_pop_insn_state(p);

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   brw_set_dp_read_message(p, insn, bind_table_index,
                           BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
                           BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                           target_cache,
                           1, /* msg_length */
                           true, /* header_present */
                           DIV_ROUND_UP(exec_size, 8)); /* response_length */

   brw_pop_insn_state(p);
}


/**
 * Emit a render target write message.
 *
 * On gen6+ the message is a SENDC whose source is the payload register
 * itself. Pre-gen6 it is a SEND with implied_header as the source and the
 * payload (which must be an MRF) as the base MRF. The destination is a null
 * register sized by the execution size of p->current.
 */
void brw_fb_WRITE(struct brw_codegen *p,
                  struct brw_reg payload,
                  struct brw_reg implied_header,
                  unsigned msg_control,
                  unsigned binding_table_index,
                  unsigned msg_length,
                  unsigned response_length,
                  bool eot,
                  bool last_render_target,
                  bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
   brw_inst *insn;
   unsigned msg_type;
   struct brw_reg dest, src0;

   if (brw_inst_exec_size(devinfo, p->current) >= BRW_EXECUTE_16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_dp_write_message(p,
                            insn,
                            binding_table_index,
                            msg_control,
                            msg_type,
                            target_cache,
                            msg_length,
                            header_present,
                            last_render_target,
                            response_length,
                            eot,
                            0 /* send_commit_msg */);
}

/**
 * Emit a gen9+ (asserted) render target read as a SENDC and return it.
 *
 * The message control combines per_sample (bit 5) with a subtype that is 0
 * when p->current executes at BRW_EXECUTE_16 and 1 otherwise. The RT slot
 * group is derived from the quarter control of p->current.
 */
brw_inst *
gen9_fb_READ(struct brw_codegen *p,
             struct brw_reg dst,
             struct brw_reg payload,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool per_sample)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(devinfo->gen >= 9);
   const unsigned msg_subtype =
      brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16 ? 0 : 1;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);

   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, payload);
   brw_set_dp_read_message(p, insn, binding_table_index,
                           per_sample << 5 | msg_subtype,
                           GEN9_DATAPORT_RC_RENDER_TARGET_READ,
                           GEN6_SFID_DATAPORT_RENDER_CACHE,
                           msg_length, true /* header_present */,
                           response_length);
   brw_inst_set_rt_slot_group(devinfo, insn,
                              brw_inst_qtr_control(devinfo, p->current) / 2);

   return insn;
}

/**
 * Texture sample instruction.
 * Note: the msg_type plus msg_length values determine exactly what kind
 * of sampling operation is performed. See volume 4, page 161 of docs.
 */
void brw_SAMPLE(struct brw_codegen *p,
                struct brw_reg dest,
                unsigned msg_reg_nr,
                struct brw_reg src0,
                unsigned binding_table_index,
                unsigned sampler,
                unsigned msg_type,
                unsigned response_length,
                unsigned msg_length,
                unsigned header_present,
                unsigned simd_mode,
                unsigned return_format)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* msg_reg_nr is unsigned, so -1 is converted to the all-ones value for
    * this comparison; passing that value skips the implied move.
    */
   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    * "Instruction compression is not allowed for this instruction (that
    * is, send). The hardware behavior is undefined if this instruction is
    * set as compressed. However, compress control can be set to "SecHalf"
    * to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf. For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_sampler_message(p, insn,
                           binding_table_index,
                           sampler,
                           msg_type,
                           response_length,
                           msg_length,
                           header_present,
                           simd_mode,
                           return_format);
}

/* Adjust the message header's sampler state pointer to
 * select the correct group of 16 samplers.
 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct gen_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.ud;

      /* Indices below 16 need no adjustment. */
      if (sampler >= 16) {
         assert(devinfo->is_haswell || devinfo->gen >= 8);
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->gen < 8 && !devinfo->is_haswell) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      /* Keep bits 4-7 of the index and shift them left by 4 to form the
       * offset that is added to g0.3.
       */
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
   }
}

/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.
Or perhaps + * just having the caller instantiate the fields in dword3 itself. + */ +void brw_urb_WRITE(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + enum brw_urb_write_flags flags, + unsigned msg_length, + unsigned response_length, + unsigned offset, + unsigned swizzle) +{ + const struct gen_device_info *devinfo = p->devinfo; + brw_inst *insn; + + gen6_resolve_implied_move(p, &src0, msg_reg_nr); + + if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) { + /* Enable Channel Masks in the URB_WRITE_HWORD message header */ + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5), + BRW_REGISTER_TYPE_UD), + retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), + brw_imm_ud(0xff00)); + brw_pop_insn_state(p); + } + + insn = next_insn(p, BRW_OPCODE_SEND); + + assert(msg_length < BRW_MAX_MRF(devinfo->gen)); + + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_src1(p, insn, brw_imm_d(0)); + + if (devinfo->gen < 6) + brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr); + + brw_set_urb_message(p, + insn, + flags, + msg_length, + response_length, + offset, + swizzle); +} + +struct brw_inst * +brw_send_indirect_message(struct brw_codegen *p, + unsigned sfid, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg desc) +{ + const struct gen_device_info *devinfo = p->devinfo; + struct brw_inst *send; + int setup; + + dst = retype(dst, BRW_REGISTER_TYPE_UW); + + assert(desc.type == BRW_REGISTER_TYPE_UD); + + /* We hold on to the setup instruction (the SEND in the direct case, the OR + * in the indirect case) by its index in the instruction store. The + * pointer returned by next_insn() may become invalid if emitting the SEND + * in the indirect case reallocs the store. 
+ */ + + if (desc.file == BRW_IMMEDIATE_VALUE) { + setup = p->nr_insn; + send = next_insn(p, BRW_OPCODE_SEND); + brw_set_src1(p, send, desc); + + } else { + struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + + /* Load the indirect descriptor to an address register using OR so the + * caller can specify additional descriptor bits with the usual + * brw_set_*_message() helper functions. + */ + setup = p->nr_insn; + brw_OR(p, addr, desc, brw_imm_ud(0)); + + brw_pop_insn_state(p); + + send = next_insn(p, BRW_OPCODE_SEND); + brw_set_src1(p, send, addr); + } + + if (dst.width < BRW_EXECUTE_8) + brw_inst_set_exec_size(devinfo, send, dst.width); + + brw_set_dest(p, send, dst); + brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD)); + brw_inst_set_sfid(devinfo, send, sfid); + + return &p->store[setup]; +} + +static struct brw_inst * +brw_send_indirect_surface_message(struct brw_codegen *p, + unsigned sfid, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg surface, + unsigned message_len, + unsigned response_len, + bool header_present) +{ + const struct gen_device_info *devinfo = p->devinfo; + struct brw_inst *insn; + + if (surface.file != BRW_IMMEDIATE_VALUE) { + struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + + /* Mask out invalid bits from the surface index to avoid hangs e.g. when + * some surface array is accessed out of bounds. 
+ */ + insn = brw_AND(p, addr, + suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)), + BRW_GET_SWZ(surface.swizzle, 0)), + brw_imm_ud(0xff)); + + brw_pop_insn_state(p); + + surface = addr; + } + + insn = brw_send_indirect_message(p, sfid, dst, payload, surface); + brw_inst_set_mlen(devinfo, insn, message_len); + brw_inst_set_rlen(devinfo, insn, response_len); + brw_inst_set_header_present(devinfo, insn, header_present); + + return insn; +} + +static bool +while_jumps_before_offset(const struct gen_device_info *devinfo, + brw_inst *insn, int while_offset, int start_offset) +{ + int scale = 16 / brw_jump_scale(devinfo); + int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn) + : brw_inst_jip(devinfo, insn); + assert(jip < 0); + return while_offset + jip * scale <= start_offset; +} + + +static int +brw_find_next_block_end(struct brw_codegen *p, int start_offset) +{ + int offset; + void *store = p->store; + const struct gen_device_info *devinfo = p->devinfo; + + int depth = 0; + + for (offset = next_offset(devinfo, store, start_offset); + offset < p->next_insn_offset; + offset = next_offset(devinfo, store, offset)) { + brw_inst *insn = store + offset; + + switch (brw_inst_opcode(devinfo, insn)) { + case BRW_OPCODE_IF: + depth++; + break; + case BRW_OPCODE_ENDIF: + if (depth == 0) + return offset; + depth--; + break; + case BRW_OPCODE_WHILE: + /* If the while doesn't jump before our instruction, it's the end + * of a sibling do...while loop. Ignore it. + */ + if (!while_jumps_before_offset(devinfo, insn, offset, start_offset)) + continue; + /* fallthrough */ + case BRW_OPCODE_ELSE: + case BRW_OPCODE_HALT: + if (depth == 0) + return offset; + } + } + + return 0; +} + +/* There is no DO instruction on gen6, so to find the end of the loop + * we have to see if the loop is jumping back before our start + * instruction. 
+ */ +static int +brw_find_loop_end(struct brw_codegen *p, int start_offset) +{ + const struct gen_device_info *devinfo = p->devinfo; + int offset; + void *store = p->store; + + assert(devinfo->gen >= 6); + + /* Always start after the instruction (such as a WHILE) we're trying to fix + * up. + */ + for (offset = next_offset(devinfo, store, start_offset); + offset < p->next_insn_offset; + offset = next_offset(devinfo, store, offset)) { + brw_inst *insn = store + offset; + + if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) { + if (while_jumps_before_offset(devinfo, insn, offset, start_offset)) + return offset; + } + } + assert(!"not reached"); + return start_offset; +} + +/* After program generation, go back and update the UIP and JIP of + * BREAK, CONT, and HALT instructions to their correct locations. + */ +void +brw_set_uip_jip(struct brw_codegen *p, int start_offset) +{ + const struct gen_device_info *devinfo = p->devinfo; + int offset; + int br = brw_jump_scale(devinfo); + int scale = 16 / br; + void *store = p->store; + + if (devinfo->gen < 6) + return; + + for (offset = start_offset; offset < p->next_insn_offset; offset += 16) { + brw_inst *insn = store + offset; + assert(brw_inst_cmpt_control(devinfo, insn) == 0); + + int block_end_offset = brw_find_next_block_end(p, offset); + switch (brw_inst_opcode(devinfo, insn)) { + case BRW_OPCODE_BREAK: + assert(block_end_offset != 0); + brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale); + /* Gen7 UIP points to WHILE; Gen6 points just after it */ + brw_inst_set_uip(devinfo, insn, + (brw_find_loop_end(p, offset) - offset + + (devinfo->gen == 6 ? 
16 : 0)) / scale); + break; + case BRW_OPCODE_CONTINUE: + assert(block_end_offset != 0); + brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale); + brw_inst_set_uip(devinfo, insn, + (brw_find_loop_end(p, offset) - offset) / scale); + + assert(brw_inst_uip(devinfo, insn) != 0); + assert(brw_inst_jip(devinfo, insn) != 0); + break; + + case BRW_OPCODE_ENDIF: { + int32_t jump = (block_end_offset == 0) ? + 1 * br : (block_end_offset - offset) / scale; + if (devinfo->gen >= 7) + brw_inst_set_jip(devinfo, insn, jump); + else + brw_inst_set_gen6_jump_count(devinfo, insn, jump); + break; + } + + case BRW_OPCODE_HALT: + /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19): + * + * "In case of the halt instruction not inside any conditional + * code block, the value of <JIP> and <UIP> should be the + * same. In case of the halt instruction inside conditional code + * block, the <UIP> should be the end of the program, and the + * <JIP> should be end of the most inner conditional code block." + * + * The uip will have already been set by whoever set up the + * instruction. 
+ */ + if (block_end_offset == 0) { + brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn)); + } else { + brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale); + } + assert(brw_inst_uip(devinfo, insn) != 0); + assert(brw_inst_jip(devinfo, insn) != 0); + break; + } + } +} + +void brw_ff_sync(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + bool allocate, + unsigned response_length, + bool eot) +{ + const struct gen_device_info *devinfo = p->devinfo; + brw_inst *insn; + + gen6_resolve_implied_move(p, &src0, msg_reg_nr); + + insn = next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_src1(p, insn, brw_imm_d(0)); + + if (devinfo->gen < 6) + brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr); + + brw_set_ff_sync_message(p, + insn, + allocate, + response_length, + eot); +} + +/** + * Emit the SEND instruction necessary to generate stream output data on Gen6 + * (for transform feedback). + * + * If send_commit_msg is true, this is the last piece of stream output data + * from this thread, so send the data as a committed write. According to the + * Sandy Bridge PRM (volume 2 part 1, section 4.5.1): + * + * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all + * writes are complete by sending the final write as a committed write." + */ +void +brw_svb_write(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + unsigned binding_table_index, + bool send_commit_msg) +{ + const struct gen_device_info *devinfo = p->devinfo; + const unsigned target_cache = + (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE : + devinfo->gen >= 6 ? 
GEN6_SFID_DATAPORT_RENDER_CACHE : + BRW_DATAPORT_READ_TARGET_RENDER_CACHE); + brw_inst *insn; + + gen6_resolve_implied_move(p, &src0, msg_reg_nr); + + insn = next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_src1(p, insn, brw_imm_d(0)); + brw_set_dp_write_message(p, insn, + binding_table_index, + 0, /* msg_control: ignored */ + GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE, + target_cache, + 1, /* msg_length */ + true, /* header_present */ + 0, /* last_render_target: ignored */ + send_commit_msg, /* response_length */ + 0, /* end_of_thread */ + send_commit_msg); /* send_commit_msg */ +} + +static unsigned +brw_surface_payload_size(struct brw_codegen *p, + unsigned num_channels, + bool has_simd4x2, + bool has_simd16) +{ + if (has_simd4x2 && + brw_inst_access_mode(p->devinfo, p->current) == BRW_ALIGN_16) + return 1; + else if (has_simd16 && + brw_inst_exec_size(p->devinfo, p->current) == BRW_EXECUTE_16) + return 2 * num_channels; + else + return num_channels; +} + +static void +brw_set_dp_untyped_atomic_message(struct brw_codegen *p, + brw_inst *insn, + unsigned atomic_op, + bool response_expected) +{ + const struct gen_device_info *devinfo = p->devinfo; + unsigned msg_control = + atomic_op | /* Atomic Operation Type: BRW_AOP_* */ + (response_expected ? 
1 << 5 : 0); /* Return data expected */ + + if (devinfo->gen >= 8 || devinfo->is_haswell) { + if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) { + if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16) + msg_control |= 1 << 4; /* SIMD8 mode */ + + brw_inst_set_dp_msg_type(devinfo, insn, + HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP); + } else { + brw_inst_set_dp_msg_type(devinfo, insn, + HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2); + } + } else { + brw_inst_set_dp_msg_type(devinfo, insn, + GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP); + + if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16) + msg_control |= 1 << 4; /* SIMD8 mode */ + } + + brw_inst_set_dp_msg_control(devinfo, insn, msg_control); +} + +void +brw_untyped_atomic(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg surface, + unsigned atomic_op, + unsigned msg_length, + bool response_expected) +{ + const struct gen_device_info *devinfo = p->devinfo; + const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ? + HSW_SFID_DATAPORT_DATA_CACHE_1 : + GEN7_SFID_DATAPORT_DATA_CACHE); + const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1; + /* Mask out unused components -- This is especially important in Align16 + * mode on generations that don't have native support for SIMD4x2 atomics, + * because unused but enabled components will cause the dataport to perform + * additional atomic operations on the addresses that happen to be in the + * uninitialized Y, Z and W coordinates of the payload. + */ + const unsigned mask = align1 ? 
WRITEMASK_XYZW : WRITEMASK_X; + struct brw_inst *insn = brw_send_indirect_surface_message( + p, sfid, brw_writemask(dst, mask), payload, surface, msg_length, + brw_surface_payload_size(p, response_expected, + devinfo->gen >= 8 || devinfo->is_haswell, true), + align1); + + brw_set_dp_untyped_atomic_message( + p, insn, atomic_op, response_expected); +} + +static void +brw_set_dp_untyped_surface_read_message(struct brw_codegen *p, + struct brw_inst *insn, + unsigned num_channels) +{ + const struct gen_device_info *devinfo = p->devinfo; + /* Set mask of 32-bit channels to drop. */ + unsigned msg_control = 0xf & (0xf << num_channels); + + if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) { + if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16) + msg_control |= 1 << 4; /* SIMD16 mode */ + else + msg_control |= 2 << 4; /* SIMD8 mode */ + } + + brw_inst_set_dp_msg_type(devinfo, insn, + (devinfo->gen >= 8 || devinfo->is_haswell ? + HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ : + GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ)); + brw_inst_set_dp_msg_control(devinfo, insn, msg_control); +} + +void +brw_untyped_surface_read(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg surface, + unsigned msg_length, + unsigned num_channels) +{ + const struct gen_device_info *devinfo = p->devinfo; + const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ? + HSW_SFID_DATAPORT_DATA_CACHE_1 : + GEN7_SFID_DATAPORT_DATA_CACHE); + struct brw_inst *insn = brw_send_indirect_surface_message( + p, sfid, dst, payload, surface, msg_length, + brw_surface_payload_size(p, num_channels, true, true), + false); + + brw_set_dp_untyped_surface_read_message( + p, insn, num_channels); +} + +static void +brw_set_dp_untyped_surface_write_message(struct brw_codegen *p, + struct brw_inst *insn, + unsigned num_channels) +{ + const struct gen_device_info *devinfo = p->devinfo; + /* Set mask of 32-bit channels to drop. 
*/ + unsigned msg_control = 0xf & (0xf << num_channels); + + if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) { + if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16) + msg_control |= 1 << 4; /* SIMD16 mode */ + else + msg_control |= 2 << 4; /* SIMD8 mode */ + } else { + if (devinfo->gen >= 8 || devinfo->is_haswell) + msg_control |= 0 << 4; /* SIMD4x2 mode */ + else + msg_control |= 2 << 4; /* SIMD8 mode */ + } + + brw_inst_set_dp_msg_type(devinfo, insn, + devinfo->gen >= 8 || devinfo->is_haswell ? + HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE : + GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE); + brw_inst_set_dp_msg_control(devinfo, insn, msg_control); +} + +void +brw_untyped_surface_write(struct brw_codegen *p, + struct brw_reg payload, + struct brw_reg surface, + unsigned msg_length, + unsigned num_channels) +{ + const struct gen_device_info *devinfo = p->devinfo; + const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ? + HSW_SFID_DATAPORT_DATA_CACHE_1 : + GEN7_SFID_DATAPORT_DATA_CACHE); + const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1; + /* Mask out unused components -- See comment in brw_untyped_atomic(). */ + const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ? + WRITEMASK_X : WRITEMASK_XYZW; + struct brw_inst *insn = brw_send_indirect_surface_message( + p, sfid, brw_writemask(brw_null_reg(), mask), + payload, surface, msg_length, 0, align1); + + brw_set_dp_untyped_surface_write_message( + p, insn, num_channels); +} + +static void +brw_set_dp_typed_atomic_message(struct brw_codegen *p, + struct brw_inst *insn, + unsigned atomic_op, + bool response_expected) +{ + const struct gen_device_info *devinfo = p->devinfo; + unsigned msg_control = + atomic_op | /* Atomic Operation Type: BRW_AOP_* */ + (response_expected ? 
1 << 5 : 0); /* Return data expected */ + + if (devinfo->gen >= 8 || devinfo->is_haswell) { + if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) { + if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1) + msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */ + + brw_inst_set_dp_msg_type(devinfo, insn, + HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP); + } else { + brw_inst_set_dp_msg_type(devinfo, insn, + HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2); + } + + } else { + brw_inst_set_dp_msg_type(devinfo, insn, + GEN7_DATAPORT_RC_TYPED_ATOMIC_OP); + + if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1) + msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */ + } + + brw_inst_set_dp_msg_control(devinfo, insn, msg_control); +} + +void +brw_typed_atomic(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg surface, + unsigned atomic_op, + unsigned msg_length, + bool response_expected) { + const struct gen_device_info *devinfo = p->devinfo; + const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ? + HSW_SFID_DATAPORT_DATA_CACHE_1 : + GEN6_SFID_DATAPORT_RENDER_CACHE); + const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1); + /* Mask out unused components -- See comment in brw_untyped_atomic(). */ + const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X; + struct brw_inst *insn = brw_send_indirect_surface_message( + p, sfid, brw_writemask(dst, mask), payload, surface, msg_length, + brw_surface_payload_size(p, response_expected, + devinfo->gen >= 8 || devinfo->is_haswell, false), + true); + + brw_set_dp_typed_atomic_message( + p, insn, atomic_op, response_expected); +} + +static void +brw_set_dp_typed_surface_read_message(struct brw_codegen *p, + struct brw_inst *insn, + unsigned num_channels) +{ + const struct gen_device_info *devinfo = p->devinfo; + /* Set mask of unused channels. 
*/ + unsigned msg_control = 0xf & (0xf << num_channels); + + if (devinfo->gen >= 8 || devinfo->is_haswell) { + if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) { + if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1) + msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */ + else + msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */ + } + + brw_inst_set_dp_msg_type(devinfo, insn, + HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ); + } else { + if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) { + if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1) + msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */ + } + + brw_inst_set_dp_msg_type(devinfo, insn, + GEN7_DATAPORT_RC_TYPED_SURFACE_READ); + } + + brw_inst_set_dp_msg_control(devinfo, insn, msg_control); +} + +void +brw_typed_surface_read(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg surface, + unsigned msg_length, + unsigned num_channels) +{ + const struct gen_device_info *devinfo = p->devinfo; + const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ? + HSW_SFID_DATAPORT_DATA_CACHE_1 : + GEN6_SFID_DATAPORT_RENDER_CACHE); + struct brw_inst *insn = brw_send_indirect_surface_message( + p, sfid, dst, payload, surface, msg_length, + brw_surface_payload_size(p, num_channels, + devinfo->gen >= 8 || devinfo->is_haswell, false), + true); + + brw_set_dp_typed_surface_read_message( + p, insn, num_channels); +} + +static void +brw_set_dp_typed_surface_write_message(struct brw_codegen *p, + struct brw_inst *insn, + unsigned num_channels) +{ + const struct gen_device_info *devinfo = p->devinfo; + /* Set mask of unused channels. 
*/ + unsigned msg_control = 0xf & (0xf << num_channels); + + if (devinfo->gen >= 8 || devinfo->is_haswell) { + if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) { + if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1) + msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */ + else + msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */ + } + + brw_inst_set_dp_msg_type(devinfo, insn, + HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE); + + } else { + if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) { + if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1) + msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */ + } + + brw_inst_set_dp_msg_type(devinfo, insn, + GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE); + } + + brw_inst_set_dp_msg_control(devinfo, insn, msg_control); +} + +void +brw_typed_surface_write(struct brw_codegen *p, + struct brw_reg payload, + struct brw_reg surface, + unsigned msg_length, + unsigned num_channels) +{ + const struct gen_device_info *devinfo = p->devinfo; + const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ? + HSW_SFID_DATAPORT_DATA_CACHE_1 : + GEN6_SFID_DATAPORT_RENDER_CACHE); + const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1); + /* Mask out unused components -- See comment in brw_untyped_atomic(). */ + const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ? + WRITEMASK_X : WRITEMASK_XYZW); + struct brw_inst *insn = brw_send_indirect_surface_message( + p, sfid, brw_writemask(brw_null_reg(), mask), + payload, surface, msg_length, 0, true); + + brw_set_dp_typed_surface_write_message( + p, insn, num_channels); +} + +static void +brw_set_memory_fence_message(struct brw_codegen *p, + struct brw_inst *insn, + enum brw_message_target sfid, + bool commit_enable) +{ + const struct gen_device_info *devinfo = p->devinfo; + + brw_set_message_descriptor(p, insn, sfid, + 1 /* message length */, + (commit_enable ? 
1 : 0) /* response length */, + true /* header present */, + false); + + switch (sfid) { + case GEN6_SFID_DATAPORT_RENDER_CACHE: + brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE); + break; + case GEN7_SFID_DATAPORT_DATA_CACHE: + brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE); + break; + default: + unreachable("Not reached"); + } + + if (commit_enable) + brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5); +} + +void +brw_memory_fence(struct brw_codegen *p, + struct brw_reg dst) +{ + const struct gen_device_info *devinfo = p->devinfo; + const bool commit_enable = devinfo->gen == 7 && !devinfo->is_haswell; + struct brw_inst *insn; + + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_exec_size(p, BRW_EXECUTE_1); + dst = vec1(dst); + + /* Set dst as destination for dependency tracking, the MEMORY_FENCE + * message doesn't write anything back. + */ + insn = next_insn(p, BRW_OPCODE_SEND); + dst = retype(dst, BRW_REGISTER_TYPE_UW); + brw_set_dest(p, insn, dst); + brw_set_src0(p, insn, dst); + brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE, + commit_enable); + + if (devinfo->gen == 7 && !devinfo->is_haswell) { + /* IVB does typed surface access through the render cache, so we need to + * flush it too. Use a different register so both flushes can be + * pipelined by the hardware. + */ + insn = next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, insn, offset(dst, 1)); + brw_set_src0(p, insn, offset(dst, 1)); + brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE, + commit_enable); + + /* Now write the response of the second message into the response of the + * first to trigger a pipeline stall -- This way future render and data + * cache messages will be properly ordered with respect to past data and + * render cache messages. 
+ */ + brw_MOV(p, dst, offset(dst, 1)); + } + + brw_pop_insn_state(p); +} + +void +brw_pixel_interpolator_query(struct brw_codegen *p, + struct brw_reg dest, + struct brw_reg mrf, + bool noperspective, + unsigned mode, + struct brw_reg data, + unsigned msg_length, + unsigned response_length) +{ + const struct gen_device_info *devinfo = p->devinfo; + struct brw_inst *insn; + const uint16_t exec_size = brw_inst_exec_size(devinfo, p->current); + + /* brw_send_indirect_message will automatically use a direct send message + * if data is actually immediate. + */ + insn = brw_send_indirect_message(p, + GEN7_SFID_PIXEL_INTERPOLATOR, + dest, + mrf, + vec1(data)); + brw_inst_set_mlen(devinfo, insn, msg_length); + brw_inst_set_rlen(devinfo, insn, response_length); + + brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16); + brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */ + brw_inst_set_pi_nopersp(devinfo, insn, noperspective); + brw_inst_set_pi_message_type(devinfo, insn, mode); +} + +void +brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst, + struct brw_reg mask) +{ + const struct gen_device_info *devinfo = p->devinfo; + const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current); + const unsigned qtr_control = brw_inst_qtr_control(devinfo, p->current); + brw_inst *inst; + + assert(devinfo->gen >= 7); + assert(mask.type == BRW_REGISTER_TYPE_UD); + + brw_push_insn_state(p); + + if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) { + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + + if (devinfo->gen >= 8) { + /* Getting the first active channel index is easy on Gen8: Just find + * the first bit set in the execution mask. The register exists on + * HSW already but it reads back as all ones when the current + * instruction has execution masking disabled, so it's kind of + * useless. 
+ */ + struct brw_reg exec_mask = + retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD); + + if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) { + /* Unfortunately, ce0 does not take into account the thread + * dispatch mask, which may be a problem in cases where it's not + * tightly packed (i.e. it doesn't have the form '2^n - 1' for + * some n). Combine ce0 with the given dispatch (or vector) mask + * to mask off those channels which were never dispatched by the + * hardware. + */ + brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8)); + brw_AND(p, vec1(dst), exec_mask, vec1(dst)); + exec_mask = vec1(dst); + } + + /* Quarter control has the effect of magically shifting the value of + * ce0 so you'll get the first active channel relative to the + * specified quarter control as result. + */ + inst = brw_FBL(p, vec1(dst), exec_mask); + } else { + const struct brw_reg flag = brw_flag_reg(1, 0); + + brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0)); + + /* Run enough instructions returning zero with execution masking and + * a conditional modifier enabled in order to get the full execution + * mask in f1.0. We could use a single 32-wide move here if it + * weren't because of the hardware bug that causes channel enables to + * be applied incorrectly to the second half of 32-wide instructions + * on Gen7. + */ + const unsigned lower_size = MIN2(16, exec_size); + for (unsigned i = 0; i < exec_size / lower_size; i++) { + inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), + brw_imm_uw(0)); + brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE); + brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control); + brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z); + brw_inst_set_flag_reg_nr(devinfo, inst, 1); + brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1); + } + + /* Find the first bit set in the exec_size-wide portion of the flag + * register that was updated by the last sequence of MOV + * instructions. 
+ */ + const enum brw_reg_type type = brw_int_type(exec_size / 8, false); + brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control)); + } + } else { + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + + if (devinfo->gen >= 8 && + mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) { + /* In SIMD4x2 mode the first active channel index is just the + * negation of the first bit of the mask register. Note that ce0 + * doesn't take into account the dispatch mask, so the Gen7 path + * should be used instead unless you have the guarantee that the + * dispatch mask is tightly packed (i.e. it has the form '2^n - 1' + * for some n). + */ + inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X), + negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)), + brw_imm_ud(1)); + + } else { + /* Overwrite the destination without and with execution masking to + * find out which of the channels is active. + */ + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_4); + brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X), + brw_imm_ud(1)); + + inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X), + brw_imm_ud(0)); + brw_pop_insn_state(p); + brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE); + } + } + + brw_pop_insn_state(p); +} + +void +brw_broadcast(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + struct brw_reg idx) +{ + const struct gen_device_info *devinfo = p->devinfo; + const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1; + brw_inst *inst; + + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4); + + assert(src.file == BRW_GENERAL_REGISTER_FILE && + src.address_mode == BRW_ADDRESS_DIRECT); + + if ((src.vstride == 0 && (src.hstride == 0 || !align1)) || + idx.file == BRW_IMMEDIATE_VALUE) { + /* Trivial, the source is already uniform or the index is a constant. 
+ * We will typically not get here if the optimizer is doing its job, but + * asserting would be mean. + */ + const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0; + brw_MOV(p, dst, + (align1 ? stride(suboffset(src, i), 0, 1, 0) : + stride(suboffset(src, 4 * i), 0, 4, 1))); + } else { + if (align1) { + const struct brw_reg addr = + retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD); + const unsigned offset = src.nr * REG_SIZE + src.subnr; + /* Limit in bytes of the signed indirect addressing immediate. */ + const unsigned limit = 512; + + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + + /* Take into account the component size and horizontal stride. */ + assert(src.vstride == src.hstride + src.width); + brw_SHL(p, addr, vec1(idx), + brw_imm_ud(_mesa_logbase2(type_sz(src.type)) + + src.hstride - 1)); + + /* We can only address up to limit bytes using the indirect + * addressing immediate, account for the difference if the source + * register is above this limit. + */ + if (offset >= limit) + brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit)); + + brw_pop_insn_state(p); + + /* Use indirect addressing to fetch the specified component. */ + brw_MOV(p, dst, + retype(brw_vec1_indirect(addr.subnr, offset % limit), + src.type)); + } else { + /* In SIMD4x2 mode the index can be either zero or one, replicate it + * to all bits of a flag register, + */ + inst = brw_MOV(p, + brw_null_reg(), + stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1)); + brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE); + brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ); + brw_inst_set_flag_reg_nr(devinfo, inst, 1); + + /* and use predicated SEL to pick the right channel. 
*/ + inst = brw_SEL(p, dst, + stride(suboffset(src, 4), 4, 4, 1), + stride(src, 4, 4, 1)); + brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL); + brw_inst_set_flag_reg_nr(devinfo, inst, 1); + } + } + + brw_pop_insn_state(p); +} + +/** + * This instruction is generated as a single-channel align1 instruction by + * both the VS and FS stages when using INTEL_DEBUG=shader_time. + * + * We can't use the typed atomic op in the FS because that has the execution + * mask ANDed with the pixel mask, but we just want to write the one dword for + * all the pixels. + * + * We don't use the SIMD4x2 atomic ops in the VS because want to just write + * one u32. So we use the same untyped atomic write message as the pixel + * shader. + * + * The untyped atomic operation requires a BUFFER surface type with RAW + * format, and is only accessible through the legacy DATA_CACHE dataport + * messages. + */ +void brw_shader_time_add(struct brw_codegen *p, + struct brw_reg payload, + uint32_t surf_index) +{ + const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ? + HSW_SFID_DATAPORT_DATA_CACHE_1 : + GEN7_SFID_DATAPORT_DATA_CACHE); + assert(p->devinfo->gen >= 7); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); + + /* We use brw_vec1_reg and unmasked because we want to increment the given + * offset only once. 
+ */ + brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_NULL, 0)); + brw_set_src0(p, send, brw_vec1_reg(payload.file, + payload.nr, 0)); + brw_set_src1(p, send, brw_imm_ud(0)); + brw_set_message_descriptor(p, send, sfid, 2, 0, false, false); + brw_inst_set_binding_table_index(p->devinfo, send, surf_index); + brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false); + + brw_pop_insn_state(p); +} + + +/** + * Emit the SEND message for a barrier + */ +void +brw_barrier(struct brw_codegen *p, struct brw_reg src) +{ + const struct gen_device_info *devinfo = p->devinfo; + struct brw_inst *inst; + + assert(devinfo->gen >= 7); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + inst = next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW)); + brw_set_src0(p, inst, src); + brw_set_src1(p, inst, brw_null_reg()); + + brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY, + 1 /* msg_length */, + 0 /* response_length */, + false /* header_present */, + false /* end_of_thread */); + + brw_inst_set_gateway_notify(devinfo, inst, 1); + brw_inst_set_gateway_subfuncid(devinfo, inst, + BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG); + + brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE); + brw_pop_insn_state(p); +} + + +/** + * Emit the wait instruction for a barrier + */ +void +brw_WAIT(struct brw_codegen *p) +{ + const struct gen_device_info *devinfo = p->devinfo; + struct brw_inst *insn; + + struct brw_reg src = brw_notification_reg(); + + insn = next_insn(p, BRW_OPCODE_WAIT); + brw_set_dest(p, insn, src); + brw_set_src0(p, insn, src); + brw_set_src1(p, insn, brw_null_reg()); + + brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1); + brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); +} diff --git a/src/intel/compiler/brw_eu_util.c b/src/intel/compiler/brw_eu_util.c new file mode 100644 index 00000000000..8c84cb45008 --- /dev/null +++ 
b/src/intel/compiler/brw_eu_util.c @@ -0,0 +1,123 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "brw_eu_defines.h" +#include "brw_eu.h" + + +void brw_math_invert( struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src) +{ + gen4_math(p, + dst, + BRW_MATH_FUNCTION_INV, + 0, + src, + BRW_MATH_PRECISION_FULL); +} + + + +void brw_copy4(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + unsigned count) +{ + unsigned i; + + dst = vec4(dst); + src = vec4(src); + + for (i = 0; i < count; i++) + { + unsigned delta = i*32; + brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta)); + brw_MOV(p, byte_offset(dst, delta+16), byte_offset(src, delta+16)); + } +} + + +void brw_copy8(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + unsigned count) +{ + unsigned i; + + dst = vec8(dst); + src = vec8(src); + + for (i = 0; i < count; i++) + { + unsigned delta = i*32; + brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta)); + } +} + + +void brw_copy_indirect_to_indirect(struct brw_codegen *p, + struct brw_indirect dst_ptr, + struct brw_indirect src_ptr, + unsigned count) +{ + unsigned i; + + for (i = 0; i < count; i++) + { + unsigned delta = i*32; + brw_MOV(p, deref_4f(dst_ptr, delta), deref_4f(src_ptr, delta)); + brw_MOV(p, deref_4f(dst_ptr, delta+16), deref_4f(src_ptr, delta+16)); + } +} + + +void brw_copy_from_indirect(struct brw_codegen *p, + struct brw_reg dst, + struct brw_indirect ptr, + unsigned count) +{ + unsigned i; + + dst = vec4(dst); + + for (i = 0; i < count; i++) + { + unsigned delta = i*32; + brw_MOV(p, byte_offset(dst, delta), deref_4f(ptr, delta)); + brw_MOV(p, byte_offset(dst, delta+16), deref_4f(ptr, delta+16)); + } +} + + + + diff --git a/src/intel/compiler/brw_eu_validate.c b/src/intel/compiler/brw_eu_validate.c new file mode 100644 index 00000000000..64615af44ac --- /dev/null +++ b/src/intel/compiler/brw_eu_validate.c @@ -0,0 +1,1051 @@ +/* + * 
Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_eu_validate.c + * + * This file implements a pass that validates shader assembly. + */ + +#include "brw_eu.h" + +/* We're going to do lots of string concatenation, so this should help. 
*/ +struct string { + char *str; + size_t len; +}; + +static void +cat(struct string *dest, const struct string src) +{ + dest->str = realloc(dest->str, dest->len + src.len + 1); + memcpy(dest->str + dest->len, src.str, src.len); + dest->str[dest->len + src.len] = '\0'; + dest->len = dest->len + src.len; +} +#define CAT(dest, src) cat(&dest, (struct string){src, strlen(src)}) + +#define error(str) "\tERROR: " str "\n" +#define ERROR_INDENT "\t " + +#define ERROR(msg) ERROR_IF(true, msg) +#define ERROR_IF(cond, msg) \ + do { \ + if (cond) { \ + CAT(error_msg, error(msg)); \ + } \ + } while(0) + +#define CHECK(func, args...) \ + do { \ + struct string __msg = func(devinfo, inst, ##args); \ + if (__msg.str) { \ + cat(&error_msg, __msg); \ + free(__msg.str); \ + } \ + } while (0) + +static bool +inst_is_send(const struct gen_device_info *devinfo, const brw_inst *inst) +{ + switch (brw_inst_opcode(devinfo, inst)) { + case BRW_OPCODE_SEND: + case BRW_OPCODE_SENDC: + case BRW_OPCODE_SENDS: + case BRW_OPCODE_SENDSC: + return true; + default: + return false; + } +} + +static unsigned +signed_type(unsigned type) +{ + switch (type) { + case BRW_HW_REG_TYPE_UD: return BRW_HW_REG_TYPE_D; + case BRW_HW_REG_TYPE_UW: return BRW_HW_REG_TYPE_W; + case BRW_HW_REG_NON_IMM_TYPE_UB: return BRW_HW_REG_NON_IMM_TYPE_B; + case GEN8_HW_REG_TYPE_UQ: return GEN8_HW_REG_TYPE_Q; + default: return type; + } +} + +static bool +inst_is_raw_move(const struct gen_device_info *devinfo, const brw_inst *inst) +{ + unsigned dst_type = signed_type(brw_inst_dst_reg_type(devinfo, inst)); + unsigned src_type = signed_type(brw_inst_src0_reg_type(devinfo, inst)); + + if (brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE && + (brw_inst_src0_negate(devinfo, inst) || + brw_inst_src0_abs(devinfo, inst))) + return false; + + return brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MOV && + brw_inst_saturate(devinfo, inst) == 0 && + dst_type == src_type; +} + +static bool +dst_is_null(const struct 
gen_device_info *devinfo, const brw_inst *inst) +{ + return brw_inst_dst_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE && + brw_inst_dst_da_reg_nr(devinfo, inst) == BRW_ARF_NULL; +} + +static bool +src0_is_null(const struct gen_device_info *devinfo, const brw_inst *inst) +{ + return brw_inst_src0_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE && + brw_inst_src0_da_reg_nr(devinfo, inst) == BRW_ARF_NULL; +} + +static bool +src1_is_null(const struct gen_device_info *devinfo, const brw_inst *inst) +{ + return brw_inst_src1_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE && + brw_inst_src1_da_reg_nr(devinfo, inst) == BRW_ARF_NULL; +} + +static bool +src0_is_grf(const struct gen_device_info *devinfo, const brw_inst *inst) +{ + return brw_inst_src0_reg_file(devinfo, inst) == BRW_GENERAL_REGISTER_FILE; +} + +static bool +src0_has_scalar_region(const struct gen_device_info *devinfo, const brw_inst *inst) +{ + return brw_inst_src0_vstride(devinfo, inst) == BRW_VERTICAL_STRIDE_0 && + brw_inst_src0_width(devinfo, inst) == BRW_WIDTH_1 && + brw_inst_src0_hstride(devinfo, inst) == BRW_HORIZONTAL_STRIDE_0; +} + +static bool +src1_has_scalar_region(const struct gen_device_info *devinfo, const brw_inst *inst) +{ + return brw_inst_src1_vstride(devinfo, inst) == BRW_VERTICAL_STRIDE_0 && + brw_inst_src1_width(devinfo, inst) == BRW_WIDTH_1 && + brw_inst_src1_hstride(devinfo, inst) == BRW_HORIZONTAL_STRIDE_0; +} + +static unsigned +num_sources_from_inst(const struct gen_device_info *devinfo, + const brw_inst *inst) +{ + const struct opcode_desc *desc = + brw_opcode_desc(devinfo, brw_inst_opcode(devinfo, inst)); + unsigned math_function; + + if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MATH) { + math_function = brw_inst_math_function(devinfo, inst); + } else if (devinfo->gen < 6 && + brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND) { + if (brw_inst_sfid(devinfo, inst) == BRW_SFID_MATH) { + /* src1 must be a descriptor (including the information to 
determine + * that the SEND is doing an extended math operation), but src0 can + * actually be null since it serves as the source of the implicit GRF + * to MRF move. + * + * If we stop using that functionality, we'll have to revisit this. + */ + return 2; + } else { + /* Send instructions are allowed to have null sources since they use + * the base_mrf field to specify which message register source. + */ + return 0; + } + } else { + assert(desc->nsrc < 4); + return desc->nsrc; + } + + switch (math_function) { + case BRW_MATH_FUNCTION_INV: + case BRW_MATH_FUNCTION_LOG: + case BRW_MATH_FUNCTION_EXP: + case BRW_MATH_FUNCTION_SQRT: + case BRW_MATH_FUNCTION_RSQ: + case BRW_MATH_FUNCTION_SIN: + case BRW_MATH_FUNCTION_COS: + case BRW_MATH_FUNCTION_SINCOS: + case GEN8_MATH_FUNCTION_INVM: + case GEN8_MATH_FUNCTION_RSQRTM: + return 1; + case BRW_MATH_FUNCTION_FDIV: + case BRW_MATH_FUNCTION_POW: + case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER: + case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT: + case BRW_MATH_FUNCTION_INT_DIV_REMAINDER: + return 2; + default: + unreachable("not reached"); + } +} + +static struct string +sources_not_null(const struct gen_device_info *devinfo, + const brw_inst *inst) +{ + unsigned num_sources = num_sources_from_inst(devinfo, inst); + struct string error_msg = { .str = NULL, .len = 0 }; + + /* Nothing to test. 3-src instructions can only have GRF sources, and + * there's no bit to control the file. 
+ */ + if (num_sources == 3) + return (struct string){}; + + if (num_sources >= 1) + ERROR_IF(src0_is_null(devinfo, inst), "src0 is null"); + + if (num_sources == 2) + ERROR_IF(src1_is_null(devinfo, inst), "src1 is null"); + + return error_msg; +} + +static struct string +send_restrictions(const struct gen_device_info *devinfo, + const brw_inst *inst) +{ + struct string error_msg = { .str = NULL, .len = 0 }; + + if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND) { + ERROR_IF(brw_inst_src0_address_mode(devinfo, inst) != BRW_ADDRESS_DIRECT, + "send must use direct addressing"); + + if (devinfo->gen >= 7) { + ERROR_IF(!src0_is_grf(devinfo, inst), "send from non-GRF"); + ERROR_IF(brw_inst_eot(devinfo, inst) && + brw_inst_src0_da_reg_nr(devinfo, inst) < 112, + "send with EOT must use g112-g127"); + } + } + + return error_msg; +} + +static bool +is_unsupported_inst(const struct gen_device_info *devinfo, + const brw_inst *inst) +{ + return brw_opcode_desc(devinfo, brw_inst_opcode(devinfo, inst)) == NULL; +} + +static unsigned +execution_type_for_type(unsigned type, bool is_immediate) +{ + /* The meaning of the type bits is dependent on whether the operand is an + * immediate, so normalize them first. 
+ */ + if (is_immediate) { + switch (type) { + case BRW_HW_REG_IMM_TYPE_UV: + case BRW_HW_REG_IMM_TYPE_V: + type = BRW_HW_REG_TYPE_W; + break; + case BRW_HW_REG_IMM_TYPE_VF: + type = BRW_HW_REG_TYPE_F; + break; + case GEN8_HW_REG_IMM_TYPE_DF: + type = GEN7_HW_REG_NON_IMM_TYPE_DF; + break; + case GEN8_HW_REG_IMM_TYPE_HF: + type = GEN8_HW_REG_NON_IMM_TYPE_HF; + break; + default: + break; + } + } + + switch (type) { + case BRW_HW_REG_TYPE_UD: + case BRW_HW_REG_TYPE_D: + return BRW_HW_REG_TYPE_D; + case BRW_HW_REG_TYPE_UW: + case BRW_HW_REG_TYPE_W: + case BRW_HW_REG_NON_IMM_TYPE_UB: + case BRW_HW_REG_NON_IMM_TYPE_B: + return BRW_HW_REG_TYPE_W; + case GEN8_HW_REG_TYPE_UQ: + case GEN8_HW_REG_TYPE_Q: + return GEN8_HW_REG_TYPE_Q; + case BRW_HW_REG_TYPE_F: + case GEN7_HW_REG_NON_IMM_TYPE_DF: + case GEN8_HW_REG_NON_IMM_TYPE_HF: + return type; + default: + unreachable("not reached"); + } +} + +/** + * Returns the execution type of an instruction \p inst + */ +static unsigned +execution_type(const struct gen_device_info *devinfo, const brw_inst *inst) +{ + unsigned num_sources = num_sources_from_inst(devinfo, inst); + unsigned src0_exec_type, src1_exec_type; + unsigned src0_type = brw_inst_src0_reg_type(devinfo, inst); + unsigned src1_type = brw_inst_src1_reg_type(devinfo, inst); + + bool src0_is_immediate = + brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE; + bool src1_is_immediate = + brw_inst_src1_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE; + + /* Execution data type is independent of destination data type, except in + * mixed F/HF instructions on CHV and SKL+. 
+ */ + unsigned dst_exec_type = brw_inst_dst_reg_type(devinfo, inst); + + src0_exec_type = execution_type_for_type(src0_type, src0_is_immediate); + if (num_sources == 1) { + if ((devinfo->gen >= 9 || devinfo->is_cherryview) && + src0_exec_type == GEN8_HW_REG_NON_IMM_TYPE_HF) { + return dst_exec_type; + } + return src0_exec_type; + } + + src1_exec_type = execution_type_for_type(src1_type, src1_is_immediate); + if (src0_exec_type == src1_exec_type) + return src0_exec_type; + + /* Mixed operand types where one is float is float on Gen < 6 + * (and not allowed on later platforms) + */ + if (devinfo->gen < 6 && + (src0_exec_type == BRW_HW_REG_TYPE_F || + src1_exec_type == BRW_HW_REG_TYPE_F)) + return BRW_HW_REG_TYPE_F; + + if (src0_exec_type == GEN8_HW_REG_TYPE_Q || + src1_exec_type == GEN8_HW_REG_TYPE_Q) + return GEN8_HW_REG_TYPE_Q; + + if (src0_exec_type == BRW_HW_REG_TYPE_D || + src1_exec_type == BRW_HW_REG_TYPE_D) + return BRW_HW_REG_TYPE_D; + + if (src0_exec_type == BRW_HW_REG_TYPE_W || + src1_exec_type == BRW_HW_REG_TYPE_W) + return BRW_HW_REG_TYPE_W; + + if (src0_exec_type == GEN7_HW_REG_NON_IMM_TYPE_DF || + src1_exec_type == GEN7_HW_REG_NON_IMM_TYPE_DF) + return GEN7_HW_REG_NON_IMM_TYPE_DF; + + if (devinfo->gen >= 9 || devinfo->is_cherryview) { + if (dst_exec_type == BRW_HW_REG_TYPE_F || + src0_exec_type == BRW_HW_REG_TYPE_F || + src1_exec_type == BRW_HW_REG_TYPE_F) { + return BRW_HW_REG_TYPE_F; + } else { + return GEN8_HW_REG_NON_IMM_TYPE_HF; + } + } + + assert(src0_exec_type == BRW_HW_REG_TYPE_F); + return BRW_HW_REG_TYPE_F; +} + +/** + * Returns whether a region is packed + * + * A region is packed if its elements are adjacent in memory, with no + * intervening space, no overlap, and no replicated values. 
+ */ +static bool +is_packed(unsigned vstride, unsigned width, unsigned hstride) +{ + if (vstride == width) { + if (vstride == 1) { + return hstride == 0; + } else { + return hstride == 1; + } + } + + return false; +} + +/** + * Checks restrictions listed in "General Restrictions Based on Operand Types" + * in the "Register Region Restrictions" section. + */ +static struct string +general_restrictions_based_on_operand_types(const struct gen_device_info *devinfo, + const brw_inst *inst) +{ + const struct opcode_desc *desc = + brw_opcode_desc(devinfo, brw_inst_opcode(devinfo, inst)); + unsigned num_sources = num_sources_from_inst(devinfo, inst); + unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst); + struct string error_msg = { .str = NULL, .len = 0 }; + + if (num_sources == 3) + return (struct string){}; + + if (inst_is_send(devinfo, inst)) + return (struct string){}; + + if (exec_size == 1) + return (struct string){}; + + if (desc->ndst == 0) + return (struct string){}; + + /* The PRMs say: + * + * Where n is the largest element size in bytes for any source or + * destination operand type, ExecSize * n must be <= 64. + * + * But we do not attempt to enforce it, because it is implied by other + * rules: + * + * - that the destination stride must match the execution data type + * - sources may not span more than two adjacent GRF registers + * - destination may not span more than two adjacent GRF registers + * + * In fact, checking it would weaken testing of the other rules. 
+ */ + + if (num_sources == 3) + return (struct string){}; + + if (exec_size == 1) + return (struct string){}; + + if (inst_is_send(devinfo, inst)) + return (struct string){}; + + if (desc->ndst == 0) + return (struct string){}; + + unsigned dst_stride = 1 << (brw_inst_dst_hstride(devinfo, inst) - 1); + bool dst_type_is_byte = + brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_NON_IMM_TYPE_B || + brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_NON_IMM_TYPE_UB; + + if (dst_type_is_byte) { + if (is_packed(exec_size * dst_stride, exec_size, dst_stride)) { + if (!inst_is_raw_move(devinfo, inst)) { + ERROR("Only raw MOV supports a packed-byte destination"); + return error_msg; + } else { + return (struct string){}; + } + } + } + + unsigned exec_type = execution_type(devinfo, inst); + unsigned exec_type_size = + brw_hw_reg_type_to_size(devinfo, exec_type, BRW_GENERAL_REGISTER_FILE); + unsigned dst_type_size = brw_element_size(devinfo, inst, dst); + + if (exec_type_size > dst_type_size) { + ERROR_IF(dst_stride * dst_type_size != exec_type_size, + "Destination stride must be equal to the ratio of the sizes of " + "the execution data type to the destination type"); + + unsigned subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst); + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1 && + brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) { + /* The i965 PRM says: + * + * Implementation Restriction: The relaxed alignment rule for byte + * destination (#10.5) is not supported. 
+ */ + if ((devinfo->gen > 4 || devinfo->is_g4x) && dst_type_is_byte) { + ERROR_IF(subreg % exec_type_size != 0 && + subreg % exec_type_size != 1, + "Destination subreg must be aligned to the size of the " + "execution data type (or to the next lowest byte for byte " + "destinations)"); + } else { + ERROR_IF(subreg % exec_type_size != 0, + "Destination subreg must be aligned to the size of the " + "execution data type"); + } + } + } + + return error_msg; +} + +/** + * Checks restrictions listed in "General Restrictions on Regioning Parameters" + * in the "Register Region Restrictions" section. + */ +static struct string +general_restrictions_on_region_parameters(const struct gen_device_info *devinfo, + const brw_inst *inst) +{ + const struct opcode_desc *desc = + brw_opcode_desc(devinfo, brw_inst_opcode(devinfo, inst)); + unsigned num_sources = num_sources_from_inst(devinfo, inst); + unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst); + struct string error_msg = { .str = NULL, .len = 0 }; + + if (num_sources == 3) + return (struct string){}; + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16) { + if (desc->ndst != 0 && !dst_is_null(devinfo, inst)) + ERROR_IF(brw_inst_dst_hstride(devinfo, inst) != BRW_HORIZONTAL_STRIDE_1, + "Destination Horizontal Stride must be 1"); + + if (num_sources >= 1) { + if (devinfo->is_haswell || devinfo->gen >= 8) { + ERROR_IF(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE && + brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0 && + brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_2 && + brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4, + "In Align16 mode, only VertStride of 0, 2, or 4 is allowed"); + } else { + ERROR_IF(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE && + brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0 && + brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4, + "In Align16 mode, only VertStride of 0 or 4 is allowed"); 
+ } + } + + if (num_sources == 2) { + if (devinfo->is_haswell || devinfo->gen >= 8) { + ERROR_IF(brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE && + brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0 && + brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_2 && + brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4, + "In Align16 mode, only VertStride of 0, 2, or 4 is allowed"); + } else { + ERROR_IF(brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE && + brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0 && + brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4, + "In Align16 mode, only VertStride of 0 or 4 is allowed"); + } + } + + return error_msg; + } + + for (unsigned i = 0; i < num_sources; i++) { + unsigned vstride, width, hstride, element_size, subreg; + +#define DO_SRC(n) \ + if (brw_inst_src ## n ## _reg_file(devinfo, inst) == \ + BRW_IMMEDIATE_VALUE) \ + continue; \ + \ + vstride = brw_inst_src ## n ## _vstride(devinfo, inst) ? \ + (1 << (brw_inst_src ## n ## _vstride(devinfo, inst) - 1)) : 0; \ + width = 1 << brw_inst_src ## n ## _width(devinfo, inst); \ + hstride = brw_inst_src ## n ## _hstride(devinfo, inst) ? \ + (1 << (brw_inst_src ## n ## _hstride(devinfo, inst) - 1)) : 0; \ + element_size = brw_element_size(devinfo, inst, src ## n); \ + subreg = brw_inst_src ## n ## _da1_subreg_nr(devinfo, inst) + + if (i == 0) { + DO_SRC(0); + } else if (i == 1) { + DO_SRC(1); + } +#undef DO_SRC + + /* ExecSize must be greater than or equal to Width. */ + ERROR_IF(exec_size < width, "ExecSize must be greater than or equal " + "to Width"); + + /* If ExecSize = Width and HorzStride ≠ 0, + * VertStride must be set to Width * HorzStride. 
+ */ + if (exec_size == width && hstride != 0) { + ERROR_IF(vstride != width * hstride, + "If ExecSize = Width and HorzStride ≠ 0, " + "VertStride must be set to Width * HorzStride"); + } + + /* If Width = 1, HorzStride must be 0 regardless of the values of + * ExecSize and VertStride. + */ + if (width == 1) { + ERROR_IF(hstride != 0, + "If Width = 1, HorzStride must be 0 regardless " + "of the values of ExecSize and VertStride"); + } + + /* If ExecSize = Width = 1, both VertStride and HorzStride must be 0. */ + if (exec_size == 1 && width == 1) { + ERROR_IF(vstride != 0 || hstride != 0, + "If ExecSize = Width = 1, both VertStride " + "and HorzStride must be 0"); + } + + /* If VertStride = HorzStride = 0, Width must be 1 regardless of the + * value of ExecSize. + */ + if (vstride == 0 && hstride == 0) { + ERROR_IF(width != 1, + "If VertStride = HorzStride = 0, Width must be " + "1 regardless of the value of ExecSize"); + } + + /* VertStride must be used to cross GRF register boundaries. This rule + * implies that elements within a 'Width' cannot cross GRF boundaries. + */ + const uint64_t mask = (1 << element_size) - 1; + unsigned rowbase = subreg; + + for (int y = 0; y < exec_size / width; y++) { + uint64_t access_mask = 0; + unsigned offset = rowbase; + + for (int x = 0; x < width; x++) { + access_mask |= mask << offset; + offset += hstride * element_size; + } + + rowbase += vstride * element_size; + + if ((uint32_t)access_mask != 0 && (access_mask >> 32) != 0) { + ERROR("VertStride must be used to cross GRF register boundaries"); + break; + } + } + } + + /* Dst.HorzStride must not be 0. 
*/ + if (desc->ndst != 0 && !dst_is_null(devinfo, inst)) { + ERROR_IF(brw_inst_dst_hstride(devinfo, inst) == BRW_HORIZONTAL_STRIDE_0, + "Destination Horizontal Stride must not be 0"); + } + + return error_msg; +} + +/** + * Creates an \p access_mask for an \p exec_size, \p element_size, and a region + * + * An \p access_mask is a 32-element array of uint64_t, where each uint64_t is + * a bitmask of bytes accessed by the region. + * + * For instance the access mask of the source gX.1<4,2,2>F in an exec_size = 4 + * instruction would be + * + * access_mask[0] = 0x00000000000000F0 + * access_mask[1] = 0x000000000000F000 + * access_mask[2] = 0x0000000000F00000 + * access_mask[3] = 0x00000000F0000000 + * access_mask[4-31] = 0 + * + * because the first execution channel accesses bytes 7-4 and the second + * execution channel accesses bytes 15-12, etc. + */ +static void +align1_access_mask(uint64_t access_mask[static 32], + unsigned exec_size, unsigned element_size, unsigned subreg, + unsigned vstride, unsigned width, unsigned hstride) +{ + const uint64_t mask = (1 << element_size) - 1; + unsigned rowbase = subreg; + unsigned element = 0; + + for (int y = 0; y < exec_size / width; y++) { + unsigned offset = rowbase; + + for (int x = 0; x < width; x++) { + access_mask[element++] = mask << offset; + offset += hstride * element_size; + } + + rowbase += vstride * element_size; + } + + assert(element == 0 || element == exec_size); +} + +/** + * Returns the number of registers accessed according to the \p access_mask + */ +static int +registers_read(const uint64_t access_mask[static 32]) +{ + int regs_read = 0; + + for (unsigned i = 0; i < 32; i++) { + if (access_mask[i] > 0xFFFFFFFF) { + return 2; + } else if (access_mask[i]) { + regs_read = 1; + } + } + + return regs_read; +} + +/** + * Checks restrictions listed in "Region Alignment Rules" in the "Register + * Region Restrictions" section. 
+ */ +static struct string +region_alignment_rules(const struct gen_device_info *devinfo, + const brw_inst *inst) +{ + const struct opcode_desc *desc = + brw_opcode_desc(devinfo, brw_inst_opcode(devinfo, inst)); + unsigned num_sources = num_sources_from_inst(devinfo, inst); + unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst); + uint64_t dst_access_mask[32], src0_access_mask[32], src1_access_mask[32]; + struct string error_msg = { .str = NULL, .len = 0 }; + + if (num_sources == 3) + return (struct string){}; + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16) + return (struct string){}; + + if (inst_is_send(devinfo, inst)) + return (struct string){}; + + memset(dst_access_mask, 0, sizeof(dst_access_mask)); + memset(src0_access_mask, 0, sizeof(src0_access_mask)); + memset(src1_access_mask, 0, sizeof(src1_access_mask)); + + for (unsigned i = 0; i < num_sources; i++) { + unsigned vstride, width, hstride, element_size, subreg; + + /* In Direct Addressing mode, a source cannot span more than 2 adjacent + * GRF registers. + */ + +#define DO_SRC(n) \ + if (brw_inst_src ## n ## _address_mode(devinfo, inst) != \ + BRW_ADDRESS_DIRECT) \ + continue; \ + \ + if (brw_inst_src ## n ## _reg_file(devinfo, inst) == \ + BRW_IMMEDIATE_VALUE) \ + continue; \ + \ + vstride = brw_inst_src ## n ## _vstride(devinfo, inst) ? \ + (1 << (brw_inst_src ## n ## _vstride(devinfo, inst) - 1)) : 0; \ + width = 1 << brw_inst_src ## n ## _width(devinfo, inst); \ + hstride = brw_inst_src ## n ## _hstride(devinfo, inst) ? 
\ + (1 << (brw_inst_src ## n ## _hstride(devinfo, inst) - 1)) : 0; \ + element_size = brw_element_size(devinfo, inst, src ## n); \ + subreg = brw_inst_src ## n ## _da1_subreg_nr(devinfo, inst); \ + align1_access_mask(src ## n ## _access_mask, \ + exec_size, element_size, subreg, \ + vstride, width, hstride) + + if (i == 0) { + DO_SRC(0); + } else if (i == 1) { + DO_SRC(1); + } +#undef DO_SRC + + unsigned num_vstride = exec_size / width; + unsigned num_hstride = width; + unsigned vstride_elements = (num_vstride - 1) * vstride; + unsigned hstride_elements = (num_hstride - 1) * hstride; + unsigned offset = (vstride_elements + hstride_elements) * element_size + + subreg; + ERROR_IF(offset >= 64, + "A source cannot span more than 2 adjacent GRF registers"); + } + + if (desc->ndst == 0 || dst_is_null(devinfo, inst)) + return error_msg; + + unsigned stride = 1 << (brw_inst_dst_hstride(devinfo, inst) - 1); + unsigned element_size = brw_element_size(devinfo, inst, dst); + unsigned subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst); + unsigned offset = ((exec_size - 1) * stride * element_size) + subreg; + ERROR_IF(offset >= 64, + "A destination cannot span more than 2 adjacent GRF registers"); + + if (error_msg.str) + return error_msg; + + align1_access_mask(dst_access_mask, exec_size, element_size, subreg, + exec_size == 1 ? 0 : exec_size * stride, + exec_size == 1 ? 1 : exec_size, + exec_size == 1 ? 0 : stride); + + unsigned dst_regs = registers_read(dst_access_mask); + unsigned src0_regs = registers_read(src0_access_mask); + unsigned src1_regs = registers_read(src1_access_mask); + + /* The SNB, IVB, HSW, BDW, and CHV PRMs say: + * + * When an instruction has a source region spanning two registers and a + * destination region contained in one register, the number of elements + * must be the same between two sources and one of the following must be + * true: + * + * 1. The destination region is entirely contained in the lower OWord + * of a register. + * 2. 
The destination region is entirely contained in the upper OWord + * of a register. + * 3. The destination elements are evenly split between the two OWords + * of a register. + */ + if (devinfo->gen <= 8) { + if (dst_regs == 1 && (src0_regs == 2 || src1_regs == 2)) { + unsigned upper_oword_writes = 0, lower_oword_writes = 0; + + for (unsigned i = 0; i < exec_size; i++) { + if (dst_access_mask[i] > 0x0000FFFF) { + upper_oword_writes++; + } else { + assert(dst_access_mask[i] != 0); + lower_oword_writes++; + } + } + + ERROR_IF(lower_oword_writes != 0 && + upper_oword_writes != 0 && + upper_oword_writes != lower_oword_writes, + "Writes must be to only one OWord or " + "evenly split between OWords"); + } + } + + /* The IVB and HSW PRMs say: + * + * When an instruction has a source region that spans two registers and + * the destination spans two registers, the destination elements must be + * evenly split between the two registers [...] + * + * The SNB PRM contains similar wording (but written in a much more + * confusing manner). + * + * The BDW PRM says: + * + * When destination spans two registers, the source may be one or two + * registers. The destination elements must be evenly split between the + * two registers. + * + * The SKL PRM says: + * + * When destination of MATH instruction spans two registers, the + * destination elements must be evenly split between the two registers. + * + * It is not known whether this restriction applies to KBL other Gens after + * SKL. + */ + if (devinfo->gen <= 8 || + brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MATH) { + + /* Nothing explicitly states that on Gen < 8 elements must be evenly + * split between two destination registers in the two exceptional + * source-region-spans-one-register cases, but since Broadwell requires + * evenly split writes regardless of source region, we assume that it was + * an oversight and require it. 
+ */ + if (dst_regs == 2) { + unsigned upper_reg_writes = 0, lower_reg_writes = 0; + + for (unsigned i = 0; i < exec_size; i++) { + if (dst_access_mask[i] > 0xFFFFFFFF) { + upper_reg_writes++; + } else { + assert(dst_access_mask[i] != 0); + lower_reg_writes++; + } + } + + ERROR_IF(upper_reg_writes != lower_reg_writes, + "Writes must be evenly split between the two " + "destination registers"); + } + } + + /* The IVB and HSW PRMs say: + * + * When an instruction has a source region that spans two registers and + * the destination spans two registers, the destination elements must be + * evenly split between the two registers and each destination register + * must be entirely derived from one source register. + * + * Note: In such cases, the regioning parameters must ensure that the + * offset from the two source registers is the same. + * + * The SNB PRM contains similar wording (but written in a much more + * confusing manner). + * + * There are effectively three rules stated here: + * + * For an instruction with a source and a destination spanning two + * registers, + * + * (1) destination elements must be evenly split between the two + * registers + * (2) all destination elements in a register must be derived + * from one source register + * (3) the offset (i.e. the starting location in each of the two + * registers spanned by a region) must be the same in the two + * registers spanned by a region + * + * It is impossible to violate rule (1) without violating (2) or (3), so we + * do not attempt to validate it. 
+ */ + if (devinfo->gen <= 7 && dst_regs == 2) { + for (unsigned i = 0; i < num_sources; i++) { +#define DO_SRC(n) \ + if (src ## n ## _regs <= 1) \ + continue; \ + \ + for (unsigned i = 0; i < exec_size; i++) { \ + if ((dst_access_mask[i] > 0xFFFFFFFF) != \ + (src ## n ## _access_mask[i] > 0xFFFFFFFF)) { \ + ERROR("Each destination register must be entirely derived " \ + "from one source register"); \ + break; \ + } \ + } \ + \ + unsigned offset_0 = \ + brw_inst_src ## n ## _da1_subreg_nr(devinfo, inst); \ + unsigned offset_1 = offset_0; \ + \ + for (unsigned i = 0; i < exec_size; i++) { \ + if (src ## n ## _access_mask[i] > 0xFFFFFFFF) { \ + offset_1 = __builtin_ctzll(src ## n ## _access_mask[i]) - 32; \ + break; \ + } \ + } \ + \ + ERROR_IF(offset_0 != offset_1, \ + "The offset from the two source registers " \ + "must be the same") + + if (i == 0) { + DO_SRC(0); + } else if (i == 1) { + DO_SRC(1); + } +#undef DO_SRC + } + } + + /* The IVB and HSW PRMs say: + * + * When destination spans two registers, the source MUST span two + * registers. The exception to the above rule: + * 1. When source is scalar, the source registers are not + * incremented. + * 2. When source is packed integer Word and destination is packed + * integer DWord, the source register is not incremented by the + * source sub register is incremented. + * + * The SNB PRM does not contain this rule, but the internal documentation + * indicates that it applies to SNB as well. We assume that the rule applies + * to Gen <= 5 although their PRMs do not state it. + * + * While the documentation explicitly says in exception (2) that the + * destination must be an integer DWord, the hardware allows at least a + * float destination type as well. We emit such instructions from + * + * fs_visitor::emit_interpolation_setup_gen6 + * fs_visitor::emit_fragcoord_interpolation + * + * and have for years with no ill effects. 
+ * + * Additionally the simulator source code indicates that the real condition + * is that the size of the destination type is 4 bytes. + */ + if (devinfo->gen <= 7 && dst_regs == 2) { + bool dst_is_packed_dword = + is_packed(exec_size * stride, exec_size, stride) && + brw_element_size(devinfo, inst, dst) == 4; + + for (unsigned i = 0; i < num_sources; i++) { +#define DO_SRC(n) \ + unsigned vstride, width, hstride; \ + vstride = brw_inst_src ## n ## _vstride(devinfo, inst) ? \ + (1 << (brw_inst_src ## n ## _vstride(devinfo, inst) - 1)) : 0; \ + width = 1 << brw_inst_src ## n ## _width(devinfo, inst); \ + hstride = brw_inst_src ## n ## _hstride(devinfo, inst) ? \ + (1 << (brw_inst_src ## n ## _hstride(devinfo, inst) - 1)) : 0; \ + bool src ## n ## _is_packed_word = \ + is_packed(vstride, width, hstride) && \ + (brw_inst_src ## n ## _reg_type(devinfo, inst) == BRW_HW_REG_TYPE_W || \ + brw_inst_src ## n ## _reg_type(devinfo, inst) == BRW_HW_REG_TYPE_UW); \ + \ + ERROR_IF(src ## n ## _regs == 1 && \ + !src ## n ## _has_scalar_region(devinfo, inst) && \ + !(dst_is_packed_dword && src ## n ## _is_packed_word), \ + "When the destination spans two registers, the source must " \ + "span two registers\n" ERROR_INDENT "(exceptions for scalar " \ + "source and packed-word to packed-dword expansion)") + + if (i == 0) { + DO_SRC(0); + } else if (i == 1) { + DO_SRC(1); + } +#undef DO_SRC + } + } + + return error_msg; +} + +bool +brw_validate_instructions(const struct brw_codegen *p, int start_offset, + struct annotation_info *annotation) +{ + const struct gen_device_info *devinfo = p->devinfo; + const void *store = p->store; + bool valid = true; + + for (int src_offset = start_offset; src_offset < p->next_insn_offset; + src_offset += sizeof(brw_inst)) { + struct string error_msg = { .str = NULL, .len = 0 }; + const brw_inst *inst = store + src_offset; + + if (is_unsupported_inst(devinfo, inst)) { + ERROR("Instruction not supported on this Gen"); + } else { + 
CHECK(sources_not_null); + CHECK(send_restrictions); + CHECK(general_restrictions_based_on_operand_types); + CHECK(general_restrictions_on_region_parameters); + CHECK(region_alignment_rules); + } + + if (error_msg.str && annotation) { + annotation_insert_error(annotation, src_offset, error_msg.str); + } + valid = valid && error_msg.len == 0; + free(error_msg.str); + } + + return valid; +} diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp new file mode 100644 index 00000000000..c410efc29d6 --- /dev/null +++ b/src/intel/compiler/brw_fs.cpp @@ -0,0 +1,6805 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_fs.cpp + * + * This file drives the GLSL IR -> LIR translation, contains the + * optimizations on the LIR, and drives the generation of native code + * from the LIR. 
+ */ + +#include "main/macros.h" +#include "brw_eu.h" +#include "brw_fs.h" +#include "brw_nir.h" +#include "brw_vec4_gs_visitor.h" +#include "brw_cfg.h" +#include "brw_dead_control_flow.h" +#include "common/gen_debug.h" +#include "compiler/glsl_types.h" +#include "compiler/nir/nir_builder.h" +#include "program/prog_parameter.h" + +using namespace brw; + +static unsigned get_lowered_simd_width(const struct gen_device_info *devinfo, + const fs_inst *inst); + +void +fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg *src, unsigned sources) +{ + memset(this, 0, sizeof(*this)); + + this->src = new fs_reg[MAX2(sources, 3)]; + for (unsigned i = 0; i < sources; i++) + this->src[i] = src[i]; + + this->opcode = opcode; + this->dst = dst; + this->sources = sources; + this->exec_size = exec_size; + this->base_mrf = -1; + + assert(dst.file != IMM && dst.file != UNIFORM); + + assert(this->exec_size != 0); + + this->conditional_mod = BRW_CONDITIONAL_NONE; + + /* This will be the case for almost all instructions. 
*/ + switch (dst.file) { + case VGRF: + case ARF: + case FIXED_GRF: + case MRF: + case ATTR: + this->size_written = dst.component_size(exec_size); + break; + case BAD_FILE: + this->size_written = 0; + break; + case IMM: + case UNIFORM: + unreachable("Invalid destination register file"); + } + + this->writes_accumulator = false; +} + +fs_inst::fs_inst() +{ + init(BRW_OPCODE_NOP, 8, dst, NULL, 0); +} + +fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size) +{ + init(opcode, exec_size, reg_undef, NULL, 0); +} + +fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst) +{ + init(opcode, exec_size, dst, NULL, 0); +} + +fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg &src0) +{ + const fs_reg src[1] = { src0 }; + init(opcode, exec_size, dst, src, 1); +} + +fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg &src0, const fs_reg &src1) +{ + const fs_reg src[2] = { src0, src1 }; + init(opcode, exec_size, dst, src, 2); +} + +fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg &src0, const fs_reg &src1, const fs_reg &src2) +{ + const fs_reg src[3] = { src0, src1, src2 }; + init(opcode, exec_size, dst, src, 3); +} + +fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst, + const fs_reg src[], unsigned sources) +{ + init(opcode, exec_width, dst, src, sources); +} + +fs_inst::fs_inst(const fs_inst &that) +{ + memcpy(this, &that, sizeof(that)); + + this->src = new fs_reg[MAX2(that.sources, 3)]; + + for (unsigned i = 0; i < that.sources; i++) + this->src[i] = that.src[i]; +} + +fs_inst::~fs_inst() +{ + delete[] this->src; +} + +void +fs_inst::resize_sources(uint8_t num_sources) +{ + if (this->sources != num_sources) { + fs_reg *src = new fs_reg[MAX2(num_sources, 3)]; + + for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i) + src[i] = this->src[i]; + + delete[] this->src; + this->src = src; + this->sources = 
num_sources; + } +} + +void +fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, + const fs_reg &dst, + const fs_reg &surf_index, + const fs_reg &varying_offset, + uint32_t const_offset) +{ + /* We have our constant surface use a pitch of 4 bytes, so our index can + * be any component of a vector, and then we load 4 contiguous + * components starting from that. + * + * We break down the const_offset to a portion added to the variable offset + * and a portion done using fs_reg::offset, which means that if you have + * GLSL using something like "uniform vec4 a[20]; gl_FragColor = a[i]", + * we'll temporarily generate 4 vec4 loads from offset i * 4, and CSE can + * later notice that those loads are all the same and eliminate the + * redundant ones. + */ + fs_reg vec4_offset = vgrf(glsl_type::uint_type); + bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf)); + + /* The pull load message will load a vec4 (16 bytes). If we are loading + * a double this means we are only loading 2 elements worth of data. + * We also want to use a 32-bit data type for the dst of the load operation + * so other parts of the driver don't get confused about the size of the + * result. + */ + fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4); + fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL, + vec4_result, surf_index, vec4_offset); + inst->size_written = 4 * vec4_result.component_size(inst->exec_size); + + if (type_sz(dst.type) == 8) { + shuffle_32bit_load_result_to_64bit_data( + bld, retype(vec4_result, dst.type), vec4_result, 2); + } + + vec4_result.type = dst.type; + bld.MOV(dst, offset(vec4_result, bld, + (const_offset & 0xf) / type_sz(vec4_result.type))); +} + +/** + * A helper for MOV generation for fixing up broken hardware SEND dependency + * handling. 
+ */ +void +fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf) +{ + /* The caller always wants uncompressed to emit the minimal extra + * dependencies, and to avoid having to deal with aligning its regs to 2. + */ + const fs_builder ubld = bld.annotate("send dependency resolve") + .half(0); + + ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F)); +} + +bool +fs_inst::equals(fs_inst *inst) const +{ + return (opcode == inst->opcode && + dst.equals(inst->dst) && + src[0].equals(inst->src[0]) && + src[1].equals(inst->src[1]) && + src[2].equals(inst->src[2]) && + saturate == inst->saturate && + predicate == inst->predicate && + conditional_mod == inst->conditional_mod && + mlen == inst->mlen && + base_mrf == inst->base_mrf && + target == inst->target && + eot == inst->eot && + header_size == inst->header_size && + shadow_compare == inst->shadow_compare && + exec_size == inst->exec_size && + offset == inst->offset); +} + +bool +fs_inst::is_send_from_grf() const +{ + switch (opcode) { + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7: + case SHADER_OPCODE_SHADER_TIME_ADD: + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + case SHADER_OPCODE_UNTYPED_ATOMIC: + case SHADER_OPCODE_UNTYPED_SURFACE_READ: + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: + case SHADER_OPCODE_TYPED_ATOMIC: + case SHADER_OPCODE_TYPED_SURFACE_READ: + case SHADER_OPCODE_TYPED_SURFACE_WRITE: + case SHADER_OPCODE_URB_WRITE_SIMD8: + case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT: + case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED: + case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: + case SHADER_OPCODE_URB_READ_SIMD8: + case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: + return true; + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + return src[1].file == VGRF; + case FS_OPCODE_FB_WRITE: + case FS_OPCODE_FB_READ: + return src[0].file == VGRF; + default: + if (is_tex()) + return src[0].file == VGRF; + + return false; 
+ } +} + +/** + * Returns true if this instruction's sources and destinations cannot + * safely be the same register. + * + * In most cases, a register can be written over safely by the same + * instruction that is its last use. For a single instruction, the + * sources are dereferenced before writing of the destination starts + * (naturally). + * + * However, there are a few cases where this can be problematic: + * + * - Virtual opcodes that translate to multiple instructions in the + * code generator: if src == dst and one instruction writes the + * destination before a later instruction reads the source, then + * src will have been clobbered. + * + * - SIMD16 compressed instructions with certain regioning (see below). + * + * The register allocator uses this information to set up conflicts between + * GRF sources and the destination. + */ +bool +fs_inst::has_source_and_destination_hazard() const +{ + switch (opcode) { + case FS_OPCODE_PACK_HALF_2x16_SPLIT: + /* Multiple partial writes to the destination */ + return true; + default: + /* The SIMD16 compressed instruction + * + * add(16) g4<1>F g4<8,8,1>F g6<8,8,1>F + * + * is actually decoded in hardware as: + * + * add(8) g4<1>F g4<8,8,1>F g6<8,8,1>F + * add(8) g5<1>F g5<8,8,1>F g7<8,8,1>F + * + * Which is safe. However, if we have uniform accesses + * happening, we get into trouble: + * + * add(8) g4<1>F g4<0,1,0>F g6<8,8,1>F + * add(8) g5<1>F g4<0,1,0>F g7<8,8,1>F + * + * Now our destination for the first instruction overwrote the + * second instruction's src0, and we get garbage for those 8 + * pixels. There's a similar issue for the pre-gen6 + * pixel_x/pixel_y, which are registers of 16-bit values and thus + * would get stomped by the first decode as well. 
+ */ + if (exec_size == 16) { + for (int i = 0; i < sources; i++) { + if (src[i].file == VGRF && (src[i].stride == 0 || + src[i].type == BRW_REGISTER_TYPE_UW || + src[i].type == BRW_REGISTER_TYPE_W || + src[i].type == BRW_REGISTER_TYPE_UB || + src[i].type == BRW_REGISTER_TYPE_B)) { + return true; + } + } + } + return false; + } +} + +bool +fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const +{ + if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD) + return false; + + fs_reg reg = this->src[0]; + if (reg.file != VGRF || reg.offset != 0 || reg.stride != 1) + return false; + + if (grf_alloc.sizes[reg.nr] * REG_SIZE != this->size_written) + return false; + + for (int i = 0; i < this->sources; i++) { + reg.type = this->src[i].type; + if (!this->src[i].equals(reg)) + return false; + + if (i < this->header_size) { + reg.offset += REG_SIZE; + } else { + reg = horiz_offset(reg, this->exec_size); + } + } + + return true; +} + +bool +fs_inst::can_do_source_mods(const struct gen_device_info *devinfo) +{ + if (devinfo->gen == 6 && is_math()) + return false; + + if (is_send_from_grf()) + return false; + + if (!backend_instruction::can_do_source_mods()) + return false; + + return true; +} + +bool +fs_inst::can_change_types() const +{ + return dst.type == src[0].type && + !src[0].abs && !src[0].negate && !saturate && + (opcode == BRW_OPCODE_MOV || + (opcode == BRW_OPCODE_SEL && + dst.type == src[1].type && + predicate != BRW_PREDICATE_NONE && + !src[1].abs && !src[1].negate)); +} + +bool +fs_inst::has_side_effects() const +{ + return this->eot || backend_instruction::has_side_effects(); +} + +void +fs_reg::init() +{ + memset(this, 0, sizeof(*this)); + stride = 1; +} + +/** Generic unset register constructor. 
*/ +fs_reg::fs_reg() +{ + init(); + this->file = BAD_FILE; +} + +fs_reg::fs_reg(struct ::brw_reg reg) : + backend_reg(reg) +{ + this->offset = 0; + this->stride = 1; + if (this->file == IMM && + (this->type != BRW_REGISTER_TYPE_V && + this->type != BRW_REGISTER_TYPE_UV && + this->type != BRW_REGISTER_TYPE_VF)) { + this->stride = 0; + } +} + +bool +fs_reg::equals(const fs_reg &r) const +{ + return (this->backend_reg::equals(r) && + stride == r.stride); +} + +bool +fs_reg::is_contiguous() const +{ + return stride == 1; +} + +unsigned +fs_reg::component_size(unsigned width) const +{ + const unsigned stride = ((file != ARF && file != FIXED_GRF) ? this->stride : + hstride == 0 ? 0 : + 1 << (hstride - 1)); + return MAX2(width * stride, 1) * type_sz(type); +} + +extern "C" int +type_size_scalar(const struct glsl_type *type) +{ + unsigned int size, i; + + switch (type->base_type) { + case GLSL_TYPE_UINT: + case GLSL_TYPE_INT: + case GLSL_TYPE_FLOAT: + case GLSL_TYPE_BOOL: + return type->components(); + case GLSL_TYPE_DOUBLE: + case GLSL_TYPE_UINT64: + case GLSL_TYPE_INT64: + return type->components() * 2; + case GLSL_TYPE_ARRAY: + return type_size_scalar(type->fields.array) * type->length; + case GLSL_TYPE_STRUCT: + size = 0; + for (i = 0; i < type->length; i++) { + size += type_size_scalar(type->fields.structure[i].type); + } + return size; + case GLSL_TYPE_SAMPLER: + /* Samplers take up no register space, since they're baked in at + * link time. + */ + return 0; + case GLSL_TYPE_ATOMIC_UINT: + return 0; + case GLSL_TYPE_SUBROUTINE: + return 1; + case GLSL_TYPE_IMAGE: + return BRW_IMAGE_PARAM_SIZE; + case GLSL_TYPE_VOID: + case GLSL_TYPE_ERROR: + case GLSL_TYPE_INTERFACE: + case GLSL_TYPE_FUNCTION: + unreachable("not reached"); + } + + return 0; +} + +/** + * Create a MOV to read the timestamp register. + * + * The caller is responsible for emitting the MOV. The return value is + * the destination of the MOV, with extra parameters set. 
+ */ +fs_reg +fs_visitor::get_timestamp(const fs_builder &bld) +{ + assert(devinfo->gen >= 7); + + fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_TIMESTAMP, + 0), + BRW_REGISTER_TYPE_UD)); + + fs_reg dst = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + + /* We want to read the 3 fields we care about even if it's not enabled in + * the dispatch. + */ + bld.group(4, 0).exec_all().MOV(dst, ts); + + return dst; +} + +void +fs_visitor::emit_shader_time_begin() +{ + /* We want only the low 32 bits of the timestamp. Since it's running + * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds, + * which is plenty of time for our purposes. It is identical across the + * EUs, but since it's tracking GPU core speed it will increment at a + * varying rate as render P-states change. + */ + shader_start_time = component( + get_timestamp(bld.annotate("shader time start")), 0); +} + +void +fs_visitor::emit_shader_time_end() +{ + /* Insert our code just before the final SEND with EOT. */ + exec_node *end = this->instructions.get_tail(); + assert(end && ((fs_inst *) end)->eot); + const fs_builder ibld = bld.annotate("shader time end") + .exec_all().at(NULL, end); + const fs_reg timestamp = get_timestamp(ibld); + + /* We only use the low 32 bits of the timestamp - see + * emit_shader_time_begin()). + * + * We could also check if render P-states have changed (or anything + * else that might disrupt timing) by setting smear to 2 and checking if + * that field is != 0. + */ + const fs_reg shader_end_time = component(timestamp, 0); + + /* Check that there weren't any timestamp reset events (assuming these + * were the only two timestamp reads that happened). 
+ */ + const fs_reg reset = component(timestamp, 2); + set_condmod(BRW_CONDITIONAL_Z, + ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u))); + ibld.IF(BRW_PREDICATE_NORMAL); + + fs_reg start = shader_start_time; + start.negate = true; + const fs_reg diff = component(fs_reg(VGRF, alloc.allocate(1), + BRW_REGISTER_TYPE_UD), + 0); + const fs_builder cbld = ibld.group(1, 0); + cbld.group(1, 0).ADD(diff, start, shader_end_time); + + /* If there were no instructions between the two timestamp gets, the diff + * is 2 cycles. Remove that overhead, so I can forget about that when + * trying to determine the time taken for single instructions. + */ + cbld.ADD(diff, diff, brw_imm_ud(-2u)); + SHADER_TIME_ADD(cbld, 0, diff); + SHADER_TIME_ADD(cbld, 1, brw_imm_ud(1u)); + ibld.emit(BRW_OPCODE_ELSE); + SHADER_TIME_ADD(cbld, 2, brw_imm_ud(1u)); + ibld.emit(BRW_OPCODE_ENDIF); +} + +void +fs_visitor::SHADER_TIME_ADD(const fs_builder &bld, + int shader_time_subindex, + fs_reg value) +{ + int index = shader_time_index * 3 + shader_time_subindex; + struct brw_reg offset = brw_imm_d(index * BRW_SHADER_TIME_STRIDE); + + fs_reg payload; + if (dispatch_width == 8) + payload = vgrf(glsl_type::uvec2_type); + else + payload = vgrf(glsl_type::uint_type); + + bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value); +} + +void +fs_visitor::vfail(const char *format, va_list va) +{ + char *msg; + + if (failed) + return; + + failed = true; + + msg = ralloc_vasprintf(mem_ctx, format, va); + msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg); + + this->fail_msg = msg; + + if (debug_enabled) { + fprintf(stderr, "%s", msg); + } +} + +void +fs_visitor::fail(const char *format, ...) +{ + va_list va; + + va_start(va, format); + vfail(format, va); + va_end(va); +} + +/** + * Mark this program as impossible to compile with dispatch width greater + * than n. 
+ * + * During the SIMD8 compile (which happens first), we can detect and flag + * things that are unsupported in SIMD16+ mode, so the compiler can skip the + * SIMD16+ compile altogether. + * + * During a compile of dispatch width greater than n (if one happens anyway), + * this just calls fail(). + */ +void +fs_visitor::limit_dispatch_width(unsigned n, const char *msg) +{ + if (dispatch_width > n) { + fail("%s", msg); + } else { + max_dispatch_width = n; + compiler->shader_perf_log(log_data, + "Shader dispatch width limited to SIMD%d: %s", + n, msg); + } +} + +/** + * Returns true if the instruction has a flag that means it won't + * update an entire destination register. + * + * For example, dead code elimination and live variable analysis want to know + * when a write to a variable screens off any preceding values that were in + * it. + */ +bool +fs_inst::is_partial_write() const +{ + return ((this->predicate && this->opcode != BRW_OPCODE_SEL) || + (this->exec_size * type_sz(this->dst.type)) < 32 || + !this->dst.is_contiguous() || + this->dst.offset % REG_SIZE != 0); +} + +unsigned +fs_inst::components_read(unsigned i) const +{ + /* Return zero if the source is not present. */ + if (src[i].file == BAD_FILE) + return 0; + + switch (opcode) { + case FS_OPCODE_LINTERP: + if (i == 0) + return 2; + else + return 1; + + case FS_OPCODE_PIXEL_X: + case FS_OPCODE_PIXEL_Y: + assert(i == 0); + return 2; + + case FS_OPCODE_FB_WRITE_LOGICAL: + assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM); + /* First/second FB write color. 
*/ + if (i < 2) + return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud; + else + return 1; + + case SHADER_OPCODE_TEX_LOGICAL: + case SHADER_OPCODE_TXD_LOGICAL: + case SHADER_OPCODE_TXF_LOGICAL: + case SHADER_OPCODE_TXL_LOGICAL: + case SHADER_OPCODE_TXS_LOGICAL: + case FS_OPCODE_TXB_LOGICAL: + case SHADER_OPCODE_TXF_CMS_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + case SHADER_OPCODE_TXF_UMS_LOGICAL: + case SHADER_OPCODE_TXF_MCS_LOGICAL: + case SHADER_OPCODE_LOD_LOGICAL: + case SHADER_OPCODE_TG4_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + case SHADER_OPCODE_SAMPLEINFO_LOGICAL: + assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM && + src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM); + /* Texture coordinates. */ + if (i == TEX_LOGICAL_SRC_COORDINATE) + return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud; + /* Texture derivatives. */ + else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) && + opcode == SHADER_OPCODE_TXD_LOGICAL) + return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud; + /* Texture offset. */ + else if (i == TEX_LOGICAL_SRC_TG4_OFFSET) + return 2; + /* MCS */ + else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL) + return 2; + else + return 1; + + case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: + assert(src[3].file == IMM); + /* Surface coordinates. */ + if (i == 0) + return src[3].ud; + /* Surface operation source (ignored for reads). */ + else if (i == 1) + return 0; + else + return 1; + + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: + assert(src[3].file == IMM && + src[4].file == IMM); + /* Surface coordinates. */ + if (i == 0) + return src[3].ud; + /* Surface operation source. 
*/ + else if (i == 1) + return src[4].ud; + else + return 1; + + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: { + assert(src[3].file == IMM && + src[4].file == IMM); + const unsigned op = src[4].ud; + /* Surface coordinates. */ + if (i == 0) + return src[3].ud; + /* Surface operation source. */ + else if (i == 1 && op == BRW_AOP_CMPWR) + return 2; + else if (i == 1 && (op == BRW_AOP_INC || op == BRW_AOP_DEC || + op == BRW_AOP_PREDEC)) + return 0; + else + return 1; + } + + default: + return 1; + } +} + +unsigned +fs_inst::size_read(int arg) const +{ + switch (opcode) { + case FS_OPCODE_FB_WRITE: + case FS_OPCODE_FB_READ: + case SHADER_OPCODE_URB_WRITE_SIMD8: + case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT: + case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED: + case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: + case SHADER_OPCODE_URB_READ_SIMD8: + case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: + case SHADER_OPCODE_UNTYPED_ATOMIC: + case SHADER_OPCODE_UNTYPED_SURFACE_READ: + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: + case SHADER_OPCODE_TYPED_ATOMIC: + case SHADER_OPCODE_TYPED_SURFACE_READ: + case SHADER_OPCODE_TYPED_SURFACE_WRITE: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + if (arg == 0) + return mlen * REG_SIZE; + break; + + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7: + /* The payload is actually stored in src1 */ + if (arg == 1) + return mlen * REG_SIZE; + break; + + case FS_OPCODE_LINTERP: + if (arg == 1) + return 16; + break; + + case SHADER_OPCODE_LOAD_PAYLOAD: + if (arg < this->header_size) + return REG_SIZE; + break; + + case CS_OPCODE_CS_TERMINATE: + case SHADER_OPCODE_BARRIER: + return REG_SIZE; + + case SHADER_OPCODE_MOV_INDIRECT: + if (arg == 0) { + assert(src[2].file == IMM); + return src[2].ud; + } + break; + + default: + if (is_tex() && arg == 0 && src[0].file == VGRF) + return mlen * REG_SIZE; + break; + } + + switch (src[arg].file) { + case UNIFORM: + case IMM: + return components_read(arg) * 
type_sz(src[arg].type); + case BAD_FILE: + case ARF: + case FIXED_GRF: + case VGRF: + case ATTR: + return components_read(arg) * src[arg].component_size(exec_size); + case MRF: + unreachable("MRF registers are not allowed as sources"); + } + return 0; +} + +namespace { + /* Return the subset of flag registers that an instruction could + * potentially read or write based on the execution controls and flag + * subregister number of the instruction. + */ + unsigned + flag_mask(const fs_inst *inst) + { + const unsigned start = inst->flag_subreg * 16 + inst->group; + const unsigned end = start + inst->exec_size; + return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1); + } +} + +unsigned +fs_inst::flags_read(const gen_device_info *devinfo) const +{ + /* XXX - This doesn't consider explicit uses of the flag register as source + * region. + */ + if (predicate == BRW_PREDICATE_ALIGN1_ANYV || + predicate == BRW_PREDICATE_ALIGN1_ALLV) { + /* The vertical predication modes combine corresponding bits from + * f0.0 and f1.0 on Gen7+, and f0.0 and f0.1 on older hardware. + */ + const unsigned shift = devinfo->gen >= 7 ? 4 : 2; + return flag_mask(this) << shift | flag_mask(this); + } else if (predicate) { + return flag_mask(this); + } else { + return 0; + } +} + +unsigned +fs_inst::flags_written() const +{ + /* XXX - This doesn't consider explicit uses of the flag register as + * destination region. + */ + if ((conditional_mod && (opcode != BRW_OPCODE_SEL && + opcode != BRW_OPCODE_IF && + opcode != BRW_OPCODE_WHILE)) || + opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) { + return flag_mask(this); + } else { + return 0; + } +} + +/** + * Returns how many MRFs an FS opcode will write over. + * + * Note that this is not the 0 or 1 implied writes in an actual gen + * instruction -- the FS opcodes often generate MOVs in addition. 
+ */ +int +fs_visitor::implied_mrf_writes(fs_inst *inst) +{ + if (inst->mlen == 0) + return 0; + + if (inst->base_mrf == -1) + return 0; + + switch (inst->opcode) { + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + return 1 * dispatch_width / 8; + case SHADER_OPCODE_POW: + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + return 2 * dispatch_width / 8; + case SHADER_OPCODE_TEX: + case FS_OPCODE_TXB: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_MCS: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_TG4_OFFSET: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_LOD: + case SHADER_OPCODE_SAMPLEINFO: + return 1; + case FS_OPCODE_FB_WRITE: + return 2; + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + case SHADER_OPCODE_GEN4_SCRATCH_READ: + return 1; + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4: + return inst->mlen; + case SHADER_OPCODE_GEN4_SCRATCH_WRITE: + return inst->mlen; + default: + unreachable("not reached"); + } +} + +fs_reg +fs_visitor::vgrf(const glsl_type *const type) +{ + int reg_width = dispatch_width / 8; + return fs_reg(VGRF, alloc.allocate(type_size_scalar(type) * reg_width), + brw_type_for_base_type(type)); +} + +fs_reg::fs_reg(enum brw_reg_file file, int nr) +{ + init(); + this->file = file; + this->nr = nr; + this->type = BRW_REGISTER_TYPE_F; + this->stride = (file == UNIFORM ? 0 : 1); +} + +fs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type) +{ + init(); + this->file = file; + this->nr = nr; + this->type = type; + this->stride = (file == UNIFORM ? 0 : 1); +} + +/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch. 
+ * This brings in those uniform definitions + */ +void +fs_visitor::import_uniforms(fs_visitor *v) +{ + this->push_constant_loc = v->push_constant_loc; + this->pull_constant_loc = v->pull_constant_loc; + this->uniforms = v->uniforms; +} + +void +fs_visitor::emit_fragcoord_interpolation(fs_reg wpos) +{ + assert(stage == MESA_SHADER_FRAGMENT); + + /* gl_FragCoord.x */ + bld.MOV(wpos, this->pixel_x); + wpos = offset(wpos, bld, 1); + + /* gl_FragCoord.y */ + bld.MOV(wpos, this->pixel_y); + wpos = offset(wpos, bld, 1); + + /* gl_FragCoord.z */ + if (devinfo->gen >= 6) { + bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))); + } else { + bld.emit(FS_OPCODE_LINTERP, wpos, + this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL], + interp_reg(VARYING_SLOT_POS, 2)); + } + wpos = offset(wpos, bld, 1); + + /* gl_FragCoord.w: Already set up in emit_interpolation */ + bld.MOV(wpos, this->wpos_w); +} + +enum brw_barycentric_mode +brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op) +{ + /* Barycentric modes don't make sense for flat inputs. */ + assert(mode != INTERP_MODE_FLAT); + + unsigned bary; + switch (op) { + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_at_offset: + bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL; + break; + case nir_intrinsic_load_barycentric_centroid: + bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID; + break; + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_at_sample: + bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE; + break; + default: + unreachable("invalid intrinsic"); + } + + if (mode == INTERP_MODE_NOPERSPECTIVE) + bary += 3; + + return (enum brw_barycentric_mode) bary; +} + +/** + * Turn one of the two CENTROID barycentric modes into PIXEL mode. 
+ */ +static enum brw_barycentric_mode +centroid_to_pixel(enum brw_barycentric_mode bary) +{ + assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID || + bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID); + return (enum brw_barycentric_mode) ((unsigned) bary - 1); +} + +fs_reg * +fs_visitor::emit_frontfacing_interpolation() +{ + fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type)); + + if (devinfo->gen >= 6) { + /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create + * a boolean result from this (~0/true or 0/false). + * + * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish + * this task in only one instruction: + * - a negation source modifier will flip the bit; and + * - a W -> D type conversion will sign extend the bit into the high + * word of the destination. + * + * An ASR 15 fills the low word of the destination. + */ + fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W)); + g0.negate = true; + + bld.ASR(*reg, g0, brw_imm_d(15)); + } else { + /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create + * a boolean result from this (1/true or 0/false). + * + * Like in the above case, since the bit is the MSB of g1.6:UD we can use + * the negation source modifier to flip it. Unfortunately the SHR + * instruction only operates on UD (or D with an abs source modifier) + * sources without negation. + * + * Instead, use ASR (which will give ~0/true or 0/false). 
+ */ + fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D)); + g1_6.negate = true; + + bld.ASR(*reg, g1_6, brw_imm_d(31)); + } + + return reg; +} + +void +fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos) +{ + assert(stage == MESA_SHADER_FRAGMENT); + struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data); + assert(dst.type == BRW_REGISTER_TYPE_F); + + if (wm_prog_data->persample_dispatch) { + /* Convert int_sample_pos to floating point */ + bld.MOV(dst, int_sample_pos); + /* Scale to the range [0, 1] */ + bld.MUL(dst, dst, brw_imm_f(1 / 16.0f)); + } + else { + /* From ARB_sample_shading specification: + * "When rendering to a non-multisample buffer, or if multisample + * rasterization is disabled, gl_SamplePosition will always be + * (0.5, 0.5). + */ + bld.MOV(dst, brw_imm_f(0.5f)); + } +} + +fs_reg * +fs_visitor::emit_samplepos_setup() +{ + assert(devinfo->gen >= 6); + + const fs_builder abld = bld.annotate("compute sample position"); + fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type)); + fs_reg pos = *reg; + fs_reg int_sample_x = vgrf(glsl_type::int_type); + fs_reg int_sample_y = vgrf(glsl_type::int_type); + + /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16 + * mode will be enabled. + * + * From the Ivy Bridge PRM, volume 2 part 1, page 344: + * R31.1:0 Position Offset X/Y for Slot[3:0] + * R31.3:2 Position Offset X/Y for Slot[7:4] + * ..... + * + * The X, Y sample positions come in as bytes in thread payload. So, read + * the positions using vstride=16, width=8, hstride=2. 
+ */ + struct brw_reg sample_pos_reg = + stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0), + BRW_REGISTER_TYPE_B), 16, 8, 2); + + if (dispatch_width == 8) { + abld.MOV(int_sample_x, fs_reg(sample_pos_reg)); + } else { + abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)); + abld.half(1).MOV(half(int_sample_x, 1), + fs_reg(suboffset(sample_pos_reg, 16))); + } + /* Compute gl_SamplePosition.x */ + compute_sample_position(pos, int_sample_x); + pos = offset(pos, abld, 1); + if (dispatch_width == 8) { + abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))); + } else { + abld.half(0).MOV(half(int_sample_y, 0), + fs_reg(suboffset(sample_pos_reg, 1))); + abld.half(1).MOV(half(int_sample_y, 1), + fs_reg(suboffset(sample_pos_reg, 17))); + } + /* Compute gl_SamplePosition.y */ + compute_sample_position(pos, int_sample_y); + return reg; +} + +fs_reg * +fs_visitor::emit_sampleid_setup() +{ + assert(stage == MESA_SHADER_FRAGMENT); + brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; + assert(devinfo->gen >= 6); + + const fs_builder abld = bld.annotate("compute sample id"); + fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type)); + + if (!key->multisample_fbo) { + /* As per GL_ARB_sample_shading specification: + * "When rendering to a non-multisample buffer, or if multisample + * rasterization is disabled, gl_SampleID will always be zero." 
+ */ + abld.MOV(*reg, brw_imm_d(0)); + } else if (devinfo->gen >= 8) { + /* Sample ID comes in as 4-bit numbers in g1.0: + * + * 15:12 Slot 3 SampleID (only used in SIMD16) + * 11:8 Slot 2 SampleID (only used in SIMD16) + * 7:4 Slot 1 SampleID + * 3:0 Slot 0 SampleID + * + * Each slot corresponds to four channels, so we want to replicate each + * half-byte value to 4 channels in a row: + * + * dst+0: .7 .6 .5 .4 .3 .2 .1 .0 + * 7:4 7:4 7:4 7:4 3:0 3:0 3:0 3:0 + * + * dst+1: .7 .6 .5 .4 .3 .2 .1 .0 (if SIMD16) + * 15:12 15:12 15:12 15:12 11:8 11:8 11:8 11:8 + * + * First, we read g1.0 with a <1,8,0>UB region, causing the first 8 + * channels to read the first byte (7:0), and the second group of 8 + * channels to read the second byte (15:8). Then, we shift right by + * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3 + * values into place. Finally, we AND with 0xf to keep the low nibble. + * + * shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V + * and(16) dst<1>D tmp<8,8,1>W 0xf:W + * + * TODO: These payload bits exist on Gen7 too, but they appear to always + * be zero, so this code fails to work. We should find out why. + */ + fs_reg tmp(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W); + + abld.SHR(tmp, fs_reg(stride(retype(brw_vec1_grf(1, 0), + BRW_REGISTER_TYPE_B), 1, 8, 0)), + brw_imm_v(0x44440000)); + abld.AND(*reg, tmp, brw_imm_w(0xf)); + } else { + const fs_reg t1 = component(fs_reg(VGRF, alloc.allocate(1), + BRW_REGISTER_TYPE_D), 0); + const fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W); + + /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with + * 8x multisampling, subspan 0 will represent sample N (where N + * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or + * 7. We can find the value of N by looking at R0.0 bits 7:6 + * ("Starting Sample Pair Index (SSPI)") and multiplying by two + * (since samples are always delivered in pairs). That is, we + * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. 
Then + * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in + * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, + * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by + * populating a temporary variable with the sequence (0, 1, 2, 3), + * and then reading from it using vstride=1, width=4, hstride=0. + * These computations hold good for 4x multisampling as well. + * + * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1): + * the first four slots are sample 0 of subspan 0; the next four + * are sample 1 of subspan 0; the third group is sample 0 of + * subspan 1, and finally sample 1 of subspan 1. + */ + + /* SKL+ has an extra bit for the Starting Sample Pair Index to + * accomodate 16x MSAA. + */ + abld.exec_all().group(1, 0) + .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)), + brw_imm_ud(0xc0)); + abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5)); + + /* This works for both SIMD8 and SIMD16 */ + abld.exec_all().group(4, 0).MOV(t2, brw_imm_v(0x3210)); + + /* This special instruction takes care of setting vstride=1, + * width=4, hstride=0 of t2 during an ADD instruction. + */ + abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2); + } + + return reg; +} + +fs_reg * +fs_visitor::emit_samplemaskin_setup() +{ + assert(stage == MESA_SHADER_FRAGMENT); + struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data); + assert(devinfo->gen >= 6); + + fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type)); + + fs_reg coverage_mask(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0), + BRW_REGISTER_TYPE_D)); + + if (wm_prog_data->persample_dispatch) { + /* gl_SampleMaskIn[] comes from two sources: the input coverage mask, + * and a mask representing which sample is being processed by the + * current shader invocation. 
+ * + * From the OES_sample_variables specification: + * "When per-sample shading is active due to the use of a fragment input + * qualified by "sample" or due to the use of the gl_SampleID or + * gl_SamplePosition variables, only the bit for the current sample is + * set in gl_SampleMaskIn." + */ + const fs_builder abld = bld.annotate("compute gl_SampleMaskIn"); + + if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE) + nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup(); + + fs_reg one = vgrf(glsl_type::int_type); + fs_reg enabled_mask = vgrf(glsl_type::int_type); + abld.MOV(one, brw_imm_d(1)); + abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]); + abld.AND(*reg, enabled_mask, coverage_mask); + } else { + /* In per-pixel mode, the coverage mask is sufficient. */ + *reg = coverage_mask; + } + return reg; +} + +fs_reg +fs_visitor::resolve_source_modifiers(const fs_reg &src) +{ + if (!src.abs && !src.negate) + return src; + + fs_reg temp = bld.vgrf(src.type); + bld.MOV(temp, src); + + return temp; +} + +void +fs_visitor::emit_discard_jump() +{ + assert(brw_wm_prog_data(this->prog_data)->uses_kill); + + /* For performance, after a discard, jump to the end of the + * shader if all relevant channels have been discarded. 
+ */ + fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP); + discard_jump->flag_subreg = 1; + + discard_jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H; + discard_jump->predicate_inverse = true; +} + +void +fs_visitor::emit_gs_thread_end() +{ + assert(stage == MESA_SHADER_GEOMETRY); + + struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); + + if (gs_compile->control_data_header_size_bits > 0) { + emit_gs_control_data_bits(this->final_gs_vertex_count); + } + + const fs_builder abld = bld.annotate("thread end"); + fs_inst *inst; + + if (gs_prog_data->static_vertex_count != -1) { + foreach_in_list_reverse(fs_inst, prev, &this->instructions) { + if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 || + prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED || + prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT || + prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) { + prev->eot = true; + + /* Delete now dead instructions. */ + foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) { + if (dead == prev) + break; + dead->remove(); + } + return; + } else if (prev->is_control_flow() || prev->has_side_effects()) { + break; + } + } + fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD))); + inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr); + inst->mlen = 1; + } else { + fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2); + fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2); + sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); + sources[1] = this->final_gs_vertex_count; + abld.LOAD_PAYLOAD(payload, sources, 2, 2); + inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload); + inst->mlen = 2; + } + inst->eot = true; + inst->offset = 0; +} + +void +fs_visitor::assign_curb_setup() +{ + prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8; + + /* Map the offsets in the UNIFORM file to fixed HW regs. 
*/ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + for (unsigned int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == UNIFORM) { + int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4; + int constant_nr; + if (uniform_nr >= 0 && uniform_nr < (int) uniforms) { + constant_nr = push_constant_loc[uniform_nr]; + } else { + /* Section 5.11 of the OpenGL 4.1 spec says: + * "Out-of-bounds reads return undefined values, which include + * values from other variables of the active program or zero." + * Just return the first push constant. + */ + constant_nr = 0; + } + + struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs + + constant_nr / 8, + constant_nr % 8); + brw_reg.abs = inst->src[i].abs; + brw_reg.negate = inst->src[i].negate; + + assert(inst->src[i].stride == 0); + inst->src[i] = byte_offset( + retype(brw_reg, inst->src[i].type), + inst->src[i].offset % 4); + } + } + } + + /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */ + this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length; +} + +void +fs_visitor::calculate_urb_setup() +{ + assert(stage == MESA_SHADER_FRAGMENT); + struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; + + memset(prog_data->urb_setup, -1, + sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX); + + int urb_next = 0; + /* Figure out where each of the incoming setup attributes lands. */ + if (devinfo->gen >= 6) { + if (_mesa_bitcount_64(nir->info->inputs_read & + BRW_FS_VARYING_INPUT_MASK) <= 16) { + /* The SF/SBE pipeline stage can do arbitrary rearrangement of the + * first 16 varying inputs, so we can put them wherever we want. + * Just put them in order. + * + * This is useful because it means that (a) inputs not used by the + * fragment shader won't take up valuable register space, and (b) we + * won't have to recompile the fragment shader if it gets paired with + * a different vertex (or geometry) shader. 
+ */ + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { + if (nir->info->inputs_read & BRW_FS_VARYING_INPUT_MASK & + BITFIELD64_BIT(i)) { + prog_data->urb_setup[i] = urb_next++; + } + } + } else { + bool include_vue_header = + nir->info->inputs_read & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT); + + /* We have enough input varyings that the SF/SBE pipeline stage can't + * arbitrarily rearrange them to suit our whim; we have to put them + * in an order that matches the output of the previous pipeline stage + * (geometry or vertex shader). + */ + struct brw_vue_map prev_stage_vue_map; + brw_compute_vue_map(devinfo, &prev_stage_vue_map, + key->input_slots_valid, + nir->info->separate_shader); + int first_slot = + include_vue_header ? 0 : 2 * BRW_SF_URB_ENTRY_READ_OFFSET; + + assert(prev_stage_vue_map.num_slots <= first_slot + 32); + for (int slot = first_slot; slot < prev_stage_vue_map.num_slots; + slot++) { + int varying = prev_stage_vue_map.slot_to_varying[slot]; + if (varying != BRW_VARYING_SLOT_PAD && + (nir->info->inputs_read & BRW_FS_VARYING_INPUT_MASK & + BITFIELD64_BIT(varying))) { + prog_data->urb_setup[varying] = slot - first_slot; + } + } + urb_next = prev_stage_vue_map.num_slots - first_slot; + } + } else { + /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */ + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { + /* Point size is packed into the header, not as a general attribute */ + if (i == VARYING_SLOT_PSIZ) + continue; + + if (key->input_slots_valid & BITFIELD64_BIT(i)) { + /* The back color slot is skipped when the front color is + * also written to. In addition, some slots can be + * written in the vertex shader and not read in the + * fragment shader. So the register number must always be + * incremented, mapped or not. 
+ */ + if (_mesa_varying_slot_in_fs((gl_varying_slot) i)) + prog_data->urb_setup[i] = urb_next; + urb_next++; + } + } + + /* + * It's a FS only attribute, and we did interpolation for this attribute + * in SF thread. So, count it here, too. + * + * See compile_sf_prog() for more info. + */ + if (nir->info->inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC)) + prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++; + } + + prog_data->num_varying_inputs = urb_next; +} + +void +fs_visitor::assign_urb_setup() +{ + assert(stage == MESA_SHADER_FRAGMENT); + struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + + int urb_start = payload.num_regs + prog_data->base.curb_read_length; + + /* Offset all the urb_setup[] index by the actual position of the + * setup regs, now that the location of the constants has been chosen. + */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->opcode == FS_OPCODE_LINTERP) { + assert(inst->src[1].file == FIXED_GRF); + inst->src[1].nr += urb_start; + } + + if (inst->opcode == FS_OPCODE_CINTERP) { + assert(inst->src[0].file == FIXED_GRF); + inst->src[0].nr += urb_start; + } + } + + /* Each attribute is 4 setup channels, each of which is half a reg. */ + this->first_non_payload_grf += prog_data->num_varying_inputs * 2; +} + +void +fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst) +{ + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == ATTR) { + int grf = payload.num_regs + + prog_data->curb_read_length + + inst->src[i].nr + + inst->src[i].offset / REG_SIZE; + + /* As explained at brw_reg_from_fs_reg, From the Haswell PRM: + * + * VertStride must be used to cross GRF register boundaries. This + * rule implies that elements within a 'Width' cannot cross GRF + * boundaries. + * + * So, for registers that are large enough, we have to split the exec + * size in two and trust the compression state to sort it out. 
+ */ + unsigned total_size = inst->exec_size * + inst->src[i].stride * + type_sz(inst->src[i].type); + + assert(total_size <= 2 * REG_SIZE); + const unsigned exec_size = + (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2; + + unsigned width = inst->src[i].stride == 0 ? 1 : exec_size; + struct brw_reg reg = + stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type), + inst->src[i].offset % REG_SIZE), + exec_size * inst->src[i].stride, + width, inst->src[i].stride); + reg.abs = inst->src[i].abs; + reg.negate = inst->src[i].negate; + + inst->src[i] = reg; + } + } +} + +void +fs_visitor::assign_vs_urb_setup() +{ + struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data); + + assert(stage == MESA_SHADER_VERTEX); + + /* Each attribute is 4 regs. */ + this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots; + + assert(vs_prog_data->base.urb_read_length <= 15); + + /* Rewrite all ATTR file references to the hw grf that they land in. */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + convert_attr_sources_to_hw_regs(inst); + } +} + +void +fs_visitor::assign_tcs_single_patch_urb_setup() +{ + assert(stage == MESA_SHADER_TESS_CTRL); + + /* Rewrite all ATTR file references to HW_REGs. */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + convert_attr_sources_to_hw_regs(inst); + } +} + +void +fs_visitor::assign_tes_urb_setup() +{ + assert(stage == MESA_SHADER_TESS_EVAL); + + struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data); + + first_non_payload_grf += 8 * vue_prog_data->urb_read_length; + + /* Rewrite all ATTR file references to HW_REGs. 
*/ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + convert_attr_sources_to_hw_regs(inst); + } +} + +void +fs_visitor::assign_gs_urb_setup() +{ + assert(stage == MESA_SHADER_GEOMETRY); + + struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data); + + first_non_payload_grf += + 8 * vue_prog_data->urb_read_length * nir->info->gs.vertices_in; + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + /* Rewrite all ATTR file references to GRFs. */ + convert_attr_sources_to_hw_regs(inst); + } +} + + +/** + * Split large virtual GRFs into separate components if we can. + * + * This is mostly duplicated with what brw_fs_vector_splitting does, + * but that's really conservative because it's afraid of doing + * splitting that doesn't result in real progress after the rest of + * the optimization phases, which would cause infinite looping in + * optimization. We can do it once here, safely. This also has the + * opportunity to split interpolated values, or maybe even uniforms, + * which we don't have at the IR level. + * + * We want to split, because virtual GRFs are what we register + * allocate and spill (due to contiguousness requirements for some + * instructions), and they're what we naturally generate in the + * codegen process, but most virtual GRFs don't actually need to be + * contiguous sets of GRFs. If we split, we'll end up with reduced + * live intervals and better dead code elimination and coalescing. + */ +void +fs_visitor::split_virtual_grfs() +{ + /* Compact the register file so we eliminate dead vgrfs. This + * only defines split points for live registers, so if we have + * too large dead registers they will hit assertions later. + */ + compact_virtual_grfs(); + + int num_vars = this->alloc.count; + + /* Count the total number of registers */ + int reg_count = 0; + int vgrf_to_reg[num_vars]; + for (int i = 0; i < num_vars; i++) { + vgrf_to_reg[i] = reg_count; + reg_count += alloc.sizes[i]; + } + + /* An array of "split points". 
For each register slot, this indicates + * if this slot can be separated from the previous slot. Every time an + * instruction uses multiple elements of a register (as a source or + * destination), we mark the used slots as inseparable. Then we go + * through and split the registers into the smallest pieces we can. + */ + bool split_points[reg_count]; + memset(split_points, 0, sizeof(split_points)); + + /* Mark all used registers as fully splittable */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->dst.file == VGRF) { + int reg = vgrf_to_reg[inst->dst.nr]; + for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++) + split_points[reg + j] = true; + } + + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF) { + int reg = vgrf_to_reg[inst->src[i].nr]; + for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++) + split_points[reg + j] = true; + } + } + } + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->dst.file == VGRF) { + int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE; + for (unsigned j = 1; j < regs_written(inst); j++) + split_points[reg + j] = false; + } + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF) { + int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE; + for (unsigned j = 1; j < regs_read(inst, i); j++) + split_points[reg + j] = false; + } + } + } + + int new_virtual_grf[reg_count]; + int new_reg_offset[reg_count]; + + int reg = 0; + for (int i = 0; i < num_vars; i++) { + /* The first one should always be 0 as a quick sanity check. 
*/ + assert(split_points[reg] == false); + + /* j = 0 case */ + new_reg_offset[reg] = 0; + reg++; + int offset = 1; + + /* j > 0 case */ + for (unsigned j = 1; j < alloc.sizes[i]; j++) { + /* If this is a split point, reset the offset to 0 and allocate a + * new virtual GRF for the previous offset many registers + */ + if (split_points[reg]) { + assert(offset <= MAX_VGRF_SIZE); + int grf = alloc.allocate(offset); + for (int k = reg - offset; k < reg; k++) + new_virtual_grf[k] = grf; + offset = 0; + } + new_reg_offset[reg] = offset; + offset++; + reg++; + } + + /* The last one gets the original register number */ + assert(offset <= MAX_VGRF_SIZE); + alloc.sizes[i] = offset; + for (int k = reg - offset; k < reg; k++) + new_virtual_grf[k] = i; + } + assert(reg == reg_count); + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->dst.file == VGRF) { + reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE; + inst->dst.nr = new_virtual_grf[reg]; + inst->dst.offset = new_reg_offset[reg] * REG_SIZE + + inst->dst.offset % REG_SIZE; + assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]); + } + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF) { + reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE; + inst->src[i].nr = new_virtual_grf[reg]; + inst->src[i].offset = new_reg_offset[reg] * REG_SIZE + + inst->src[i].offset % REG_SIZE; + assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]); + } + } + } + invalidate_live_intervals(); +} + +/** + * Remove unused virtual GRFs and compact the virtual_grf_* arrays. + * + * During code generation, we create tons of temporary variables, many of + * which get immediately killed and are never used again. Yet, in later + * optimization and analysis passes, such as compute_live_intervals, we need + * to loop over all the virtual GRFs. Compacting them can save a lot of + * overhead. 
+ */ +bool +fs_visitor::compact_virtual_grfs() +{ + bool progress = false; + int remap_table[this->alloc.count]; + memset(remap_table, -1, sizeof(remap_table)); + + /* Mark which virtual GRFs are used. */ + foreach_block_and_inst(block, const fs_inst, inst, cfg) { + if (inst->dst.file == VGRF) + remap_table[inst->dst.nr] = 0; + + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF) + remap_table[inst->src[i].nr] = 0; + } + } + + /* Compact the GRF arrays. */ + int new_index = 0; + for (unsigned i = 0; i < this->alloc.count; i++) { + if (remap_table[i] == -1) { + /* We just found an unused register. This means that we are + * actually going to compact something. + */ + progress = true; + } else { + remap_table[i] = new_index; + alloc.sizes[new_index] = alloc.sizes[i]; + invalidate_live_intervals(); + ++new_index; + } + } + + this->alloc.count = new_index; + + /* Patch all the instructions to use the newly renumbered registers */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->dst.file == VGRF) + inst->dst.nr = remap_table[inst->dst.nr]; + + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF) + inst->src[i].nr = remap_table[inst->src[i].nr]; + } + } + + /* Patch all the references to delta_xy, since they're used in register + * allocation. If they're unused, switch them to BAD_FILE so we don't + * think some random VGRF is delta_xy. 
+ */ + for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) { + if (delta_xy[i].file == VGRF) { + if (remap_table[delta_xy[i].nr] != -1) { + delta_xy[i].nr = remap_table[delta_xy[i].nr]; + } else { + delta_xy[i].file = BAD_FILE; + } + } + } + + return progress; +} + +static void +set_push_pull_constant_loc(unsigned uniform, int *chunk_start, + unsigned *max_chunk_bitsize, + bool contiguous, unsigned bitsize, + const unsigned target_bitsize, + int *push_constant_loc, int *pull_constant_loc, + unsigned *num_push_constants, + unsigned *num_pull_constants, + const unsigned max_push_components, + const unsigned max_chunk_size, + struct brw_stage_prog_data *stage_prog_data) +{ + /* This is the first live uniform in the chunk */ + if (*chunk_start < 0) + *chunk_start = uniform; + + /* Keep track of the maximum bit size access in contiguous uniforms */ + *max_chunk_bitsize = MAX2(*max_chunk_bitsize, bitsize); + + /* If this element does not need to be contiguous with the next, we + * split at this point and everything between chunk_start and u forms a + * single chunk. + */ + if (!contiguous) { + /* If bitsize doesn't match the target one, skip it */ + if (*max_chunk_bitsize != target_bitsize) { + /* FIXME: right now we only support 32 and 64-bit accesses */ + assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8); + *max_chunk_bitsize = 0; + *chunk_start = -1; + return; + } + + unsigned chunk_size = uniform - *chunk_start + 1; + + /* Decide whether we should push or pull this parameter. In the + * Vulkan driver, push constants are explicitly exposed via the API + * so we push everything. In GL, we only push small arrays. 
+ */ + if (stage_prog_data->pull_param == NULL || + (*num_push_constants + chunk_size <= max_push_components && + chunk_size <= max_chunk_size)) { + assert(*num_push_constants + chunk_size <= max_push_components); + for (unsigned j = *chunk_start; j <= uniform; j++) + push_constant_loc[j] = (*num_push_constants)++; + } else { + for (unsigned j = *chunk_start; j <= uniform; j++) + pull_constant_loc[j] = (*num_pull_constants)++; + } + + *max_chunk_bitsize = 0; + *chunk_start = -1; + } +} + +/** + * Assign UNIFORM file registers to either push constants or pull constants. + * + * We allow a fragment shader to have more than the specified minimum + * maximum number of fragment shader uniform components (64). If + * there are too many of these, they'd fill up all of register space. + * So, this will push some of them out to the pull constant buffer and + * update the program to load them. + */ +void +fs_visitor::assign_constant_locations() +{ + /* Only the first compile gets to decide on locations. */ + if (dispatch_width != min_dispatch_width) + return; + + bool is_live[uniforms]; + memset(is_live, 0, sizeof(is_live)); + unsigned bitsize_access[uniforms]; + memset(bitsize_access, 0, sizeof(bitsize_access)); + + /* For each uniform slot, a value of true indicates that the given slot and + * the next slot must remain contiguous. This is used to keep us from + * splitting arrays apart. + */ + bool contiguous[uniforms]; + memset(contiguous, 0, sizeof(contiguous)); + + int thread_local_id_index = + (stage == MESA_SHADER_COMPUTE) ? + brw_cs_prog_data(stage_prog_data)->thread_local_id_index : -1; + + /* First, we walk through the instructions and do two things: + * + * 1) Figure out which uniforms are live. + * + * 2) Mark any indirectly used ranges of registers as contiguous. + * + * Note that we don't move constant-indexed accesses to arrays. No + * testing has been done of the performance impact of this choice. 
+ */ + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + for (int i = 0 ; i < inst->sources; i++) { + if (inst->src[i].file != UNIFORM) + continue; + + int constant_nr = inst->src[i].nr + inst->src[i].offset / 4; + + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { + assert(inst->src[2].ud % 4 == 0); + unsigned last = constant_nr + (inst->src[2].ud / 4) - 1; + assert(last < uniforms); + + for (unsigned j = constant_nr; j < last; j++) { + is_live[j] = true; + contiguous[j] = true; + bitsize_access[j] = MAX2(bitsize_access[j], type_sz(inst->src[i].type)); + } + is_live[last] = true; + bitsize_access[last] = MAX2(bitsize_access[last], type_sz(inst->src[i].type)); + } else { + if (constant_nr >= 0 && constant_nr < (int) uniforms) { + int regs_read = inst->components_read(i) * + type_sz(inst->src[i].type) / 4; + for (int j = 0; j < regs_read; j++) { + is_live[constant_nr + j] = true; + bitsize_access[constant_nr + j] = + MAX2(bitsize_access[constant_nr + j], type_sz(inst->src[i].type)); + } + } + } + } + } + + if (thread_local_id_index >= 0 && !is_live[thread_local_id_index]) + thread_local_id_index = -1; + + /* Only allow 16 registers (128 uniform components) as push constants. + * + * Just demote the end of the list. We could probably do better + * here, demoting things that are rarely used in the program first. + * + * If changing this value, note the limitation about total_regs in + * brw_curbe.c. + */ + unsigned int max_push_components = 16 * 8; + if (thread_local_id_index >= 0) + max_push_components--; /* Save a slot for the thread ID */ + + /* We push small arrays, but no bigger than 16 floats. This is big enough + * for a vec4 but hopefully not large enough to push out other stuff. We + * should probably use a better heuristic at some point. 
+ */ + const unsigned int max_chunk_size = 16; + + unsigned int num_push_constants = 0; + unsigned int num_pull_constants = 0; + + push_constant_loc = ralloc_array(mem_ctx, int, uniforms); + pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); + + /* Default to -1 meaning no location */ + memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc)); + memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc)); + + int chunk_start = -1; + unsigned max_chunk_bitsize = 0; + + /* First push 64-bit uniforms to ensure they are properly aligned */ + const unsigned uniform_64_bit_size = type_sz(BRW_REGISTER_TYPE_DF); + for (unsigned u = 0; u < uniforms; u++) { + if (!is_live[u]) + continue; + + set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize, + contiguous[u], bitsize_access[u], + uniform_64_bit_size, + push_constant_loc, pull_constant_loc, + &num_push_constants, &num_pull_constants, + max_push_components, max_chunk_size, + stage_prog_data); + + } + + /* Then push the rest of uniforms */ + const unsigned uniform_32_bit_size = type_sz(BRW_REGISTER_TYPE_F); + for (unsigned u = 0; u < uniforms; u++) { + if (!is_live[u]) + continue; + + /* Skip thread_local_id_index to put it in the last push register. */ + if (thread_local_id_index == (int)u) + continue; + + set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize, + contiguous[u], bitsize_access[u], + uniform_32_bit_size, + push_constant_loc, pull_constant_loc, + &num_push_constants, &num_pull_constants, + max_push_components, max_chunk_size, + stage_prog_data); + } + + /* Add the CS local thread ID uniform at the end of the push constants */ + if (thread_local_id_index >= 0) + push_constant_loc[thread_local_id_index] = num_push_constants++; + + /* As the uniforms are going to be reordered, take the data from a temporary + * copy of the original param[]. 
+ */ + gl_constant_value **param = ralloc_array(NULL, gl_constant_value*, + stage_prog_data->nr_params); + memcpy(param, stage_prog_data->param, + sizeof(gl_constant_value*) * stage_prog_data->nr_params); + stage_prog_data->nr_params = num_push_constants; + stage_prog_data->nr_pull_params = num_pull_constants; + + /* Up until now, the param[] array has been indexed by reg + offset + * of UNIFORM registers. Move pull constants into pull_param[] and + * condense param[] to only contain the uniforms we chose to push. + * + * NOTE: Because we are condensing the params[] array, we know that + * push_constant_loc[i] <= i and we can do it in one smooth loop without + * having to make a copy. + */ + int new_thread_local_id_index = -1; + for (unsigned int i = 0; i < uniforms; i++) { + const gl_constant_value *value = param[i]; + + if (pull_constant_loc[i] != -1) { + stage_prog_data->pull_param[pull_constant_loc[i]] = value; + } else if (push_constant_loc[i] != -1) { + stage_prog_data->param[push_constant_loc[i]] = value; + if (thread_local_id_index == (int)i) + new_thread_local_id_index = push_constant_loc[i]; + } + } + ralloc_free(param); + + if (stage == MESA_SHADER_COMPUTE) + brw_cs_prog_data(stage_prog_data)->thread_local_id_index = + new_thread_local_id_index; +} + +/** + * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD + * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs. + */ +void +fs_visitor::lower_constant_loads() +{ + const unsigned index = stage_prog_data->binding_table.pull_constants_start; + + foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { + /* Set up the annotation tracking for new generated instructions. 
*/ + const fs_builder ibld(this, block, inst); + + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file != UNIFORM) + continue; + + /* We'll handle this case later */ + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) + continue; + + unsigned location = inst->src[i].nr + inst->src[i].offset / 4; + if (location >= uniforms) + continue; /* Out of bounds access */ + + int pull_index = pull_constant_loc[location]; + + if (pull_index == -1) + continue; + + assert(inst->src[i].stride == 0); + + const unsigned index = stage_prog_data->binding_table.pull_constants_start; + const unsigned block_sz = 64; /* Fetch one cacheline at a time. */ + const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0); + const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD); + const unsigned base = pull_index * 4; + + ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, + dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1))); + + /* Rewrite the instruction to use the temporary VGRF. */ + inst->src[i].file = VGRF; + inst->src[i].nr = dst.nr; + inst->src[i].offset = (base & (block_sz - 1)) + + inst->src[i].offset % 4; + + brw_mark_surface_used(prog_data, index); + } + + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && + inst->src[0].file == UNIFORM) { + + unsigned location = inst->src[0].nr + inst->src[0].offset / 4; + if (location >= uniforms) + continue; /* Out of bounds access */ + + int pull_index = pull_constant_loc[location]; + + if (pull_index == -1) + continue; + + VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst, + brw_imm_ud(index), + inst->src[1], + pull_index * 4); + inst->remove(block); + + brw_mark_surface_used(prog_data, index); + } + } + invalidate_live_intervals(); +} + +bool +fs_visitor::opt_algebraic() +{ + bool progress = false; + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + switch (inst->opcode) { + case BRW_OPCODE_MOV: + if (inst->src[0].file != IMM) + break; + + if (inst->saturate) { + if (inst->dst.type != inst->src[0].type) + 
assert(!"unimplemented: saturate mixed types"); + + if (brw_saturate_immediate(inst->dst.type, + &inst->src[0].as_brw_reg())) { + inst->saturate = false; + progress = true; + } + } + break; + + case BRW_OPCODE_MUL: + if (inst->src[1].file != IMM) + continue; + + /* a * 1.0 = a */ + if (inst->src[1].is_one()) { + inst->opcode = BRW_OPCODE_MOV; + inst->src[1] = reg_undef; + progress = true; + break; + } + + /* a * -1.0 = -a */ + if (inst->src[1].is_negative_one()) { + inst->opcode = BRW_OPCODE_MOV; + inst->src[0].negate = !inst->src[0].negate; + inst->src[1] = reg_undef; + progress = true; + break; + } + + /* a * 0.0 = 0.0 */ + if (inst->src[1].is_zero()) { + inst->opcode = BRW_OPCODE_MOV; + inst->src[0] = inst->src[1]; + inst->src[1] = reg_undef; + progress = true; + break; + } + + if (inst->src[0].file == IMM) { + assert(inst->src[0].type == BRW_REGISTER_TYPE_F); + inst->opcode = BRW_OPCODE_MOV; + inst->src[0].f *= inst->src[1].f; + inst->src[1] = reg_undef; + progress = true; + break; + } + break; + case BRW_OPCODE_ADD: + if (inst->src[1].file != IMM) + continue; + + /* a + 0.0 = a */ + if (inst->src[1].is_zero()) { + inst->opcode = BRW_OPCODE_MOV; + inst->src[1] = reg_undef; + progress = true; + break; + } + + if (inst->src[0].file == IMM) { + assert(inst->src[0].type == BRW_REGISTER_TYPE_F); + inst->opcode = BRW_OPCODE_MOV; + inst->src[0].f += inst->src[1].f; + inst->src[1] = reg_undef; + progress = true; + break; + } + break; + case BRW_OPCODE_OR: + if (inst->src[0].equals(inst->src[1])) { + inst->opcode = BRW_OPCODE_MOV; + inst->src[1] = reg_undef; + progress = true; + break; + } + break; + case BRW_OPCODE_LRP: + if (inst->src[1].equals(inst->src[2])) { + inst->opcode = BRW_OPCODE_MOV; + inst->src[0] = inst->src[1]; + inst->src[1] = reg_undef; + inst->src[2] = reg_undef; + progress = true; + break; + } + break; + case BRW_OPCODE_CMP: + if (inst->conditional_mod == BRW_CONDITIONAL_GE && + inst->src[0].abs && + inst->src[0].negate && + inst->src[1].is_zero()) { 
+ inst->src[0].abs = false; + inst->src[0].negate = false; + inst->conditional_mod = BRW_CONDITIONAL_Z; + progress = true; + break; + } + break; + case BRW_OPCODE_SEL: + if (inst->src[0].equals(inst->src[1])) { + inst->opcode = BRW_OPCODE_MOV; + inst->src[1] = reg_undef; + inst->predicate = BRW_PREDICATE_NONE; + inst->predicate_inverse = false; + progress = true; + } else if (inst->saturate && inst->src[1].file == IMM) { + switch (inst->conditional_mod) { + case BRW_CONDITIONAL_LE: + case BRW_CONDITIONAL_L: + switch (inst->src[1].type) { + case BRW_REGISTER_TYPE_F: + if (inst->src[1].f >= 1.0f) { + inst->opcode = BRW_OPCODE_MOV; + inst->src[1] = reg_undef; + inst->conditional_mod = BRW_CONDITIONAL_NONE; + progress = true; + } + break; + default: + break; + } + break; + case BRW_CONDITIONAL_GE: + case BRW_CONDITIONAL_G: + switch (inst->src[1].type) { + case BRW_REGISTER_TYPE_F: + if (inst->src[1].f <= 0.0f) { + inst->opcode = BRW_OPCODE_MOV; + inst->src[1] = reg_undef; + inst->conditional_mod = BRW_CONDITIONAL_NONE; + progress = true; + } + break; + default: + break; + } + default: + break; + } + } + break; + case BRW_OPCODE_MAD: + if (inst->src[1].is_zero() || inst->src[2].is_zero()) { + inst->opcode = BRW_OPCODE_MOV; + inst->src[1] = reg_undef; + inst->src[2] = reg_undef; + progress = true; + } else if (inst->src[0].is_zero()) { + inst->opcode = BRW_OPCODE_MUL; + inst->src[0] = inst->src[2]; + inst->src[2] = reg_undef; + progress = true; + } else if (inst->src[1].is_one()) { + inst->opcode = BRW_OPCODE_ADD; + inst->src[1] = inst->src[2]; + inst->src[2] = reg_undef; + progress = true; + } else if (inst->src[2].is_one()) { + inst->opcode = BRW_OPCODE_ADD; + inst->src[2] = reg_undef; + progress = true; + } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) { + inst->opcode = BRW_OPCODE_ADD; + inst->src[1].f *= inst->src[2].f; + inst->src[2] = reg_undef; + progress = true; + } + break; + case SHADER_OPCODE_BROADCAST: + if (is_uniform(inst->src[0])) { + 
inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + inst->force_writemask_all = true; + progress = true; + } else if (inst->src[1].file == IMM) { + inst->opcode = BRW_OPCODE_MOV; + inst->src[0] = component(inst->src[0], + inst->src[1].ud); + inst->sources = 1; + inst->force_writemask_all = true; + progress = true; + } + break; + + default: + break; + } + + /* Swap if src[0] is immediate. */ + if (progress && inst->is_commutative()) { + if (inst->src[0].file == IMM) { + fs_reg tmp = inst->src[1]; + inst->src[1] = inst->src[0]; + inst->src[0] = tmp; + } + } + } + return progress; +} + +/** + * Optimize sample messages that have constant zero values for the trailing + * texture coordinates. We can just reduce the message length for these + * instructions instead of reserving a register for it. Trailing parameters + * that aren't sent default to zero anyway. This will cause the dead code + * eliminator to remove the MOV instruction that would otherwise be emitted to + * set up the zero value. + */ +bool +fs_visitor::opt_zero_samples() +{ + /* Gen4 infers the texturing opcode based on the message length so we can't + * change it. + */ + if (devinfo->gen < 5) + return false; + + bool progress = false; + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (!inst->is_tex()) + continue; + + fs_inst *load_payload = (fs_inst *) inst->prev; + + if (load_payload->is_head_sentinel() || + load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD) + continue; + + /* We don't want to remove the message header or the first parameter. 
+ * Removing the first parameter is not allowed, see the Haswell PRM + * volume 7, page 149: + * + * "Parameter 0 is required except for the sampleinfo message, which + * has no parameter 0" + */ + while (inst->mlen > inst->header_size + inst->exec_size / 8 && + load_payload->src[(inst->mlen - inst->header_size) / + (inst->exec_size / 8) + + inst->header_size - 1].is_zero()) { + inst->mlen -= inst->exec_size / 8; + progress = true; + } + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + +/** + * Optimize sample messages which are followed by the final RT write. + * + * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its + * results sent directly to the framebuffer, bypassing the EU. Recognize the + * final texturing results copied to the framebuffer write payload and modify + * them to write to the framebuffer directly. + */ +bool +fs_visitor::opt_sampler_eot() +{ + brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; + + if (stage != MESA_SHADER_FRAGMENT) + return false; + + if (devinfo->gen < 9 && !devinfo->is_cherryview) + return false; + + /* FINISHME: It should be possible to implement this optimization when there + * are multiple drawbuffers. + */ + if (key->nr_color_regions != 1) + return false; + + /* Requires emitting a bunch of saturating MOV instructions during logical + * send lowering to clamp the color payload, which the sampler unit isn't + * going to do for us. + */ + if (key->clamp_fragment_color) + return false; + + /* Look for a texturing instruction immediately before the final FB_WRITE. */ + bblock_t *block = cfg->blocks[cfg->num_blocks - 1]; + fs_inst *fb_write = (fs_inst *)block->end(); + assert(fb_write->eot); + assert(fb_write->opcode == FS_OPCODE_FB_WRITE_LOGICAL); + + /* There wasn't one; nothing to do. 
*/ + if (unlikely(fb_write->prev->is_head_sentinel())) + return false; + + fs_inst *tex_inst = (fs_inst *) fb_write->prev; + + /* 3D Sampler » Messages » Message Format + * + * “Response Length of zero is allowed on all SIMD8* and SIMD16* sampler + * messages except sample+killpix, resinfo, sampleinfo, LOD, and gather4*” + */ + if (tex_inst->opcode != SHADER_OPCODE_TEX_LOGICAL && + tex_inst->opcode != SHADER_OPCODE_TXD_LOGICAL && + tex_inst->opcode != SHADER_OPCODE_TXF_LOGICAL && + tex_inst->opcode != SHADER_OPCODE_TXL_LOGICAL && + tex_inst->opcode != FS_OPCODE_TXB_LOGICAL && + tex_inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL && + tex_inst->opcode != SHADER_OPCODE_TXF_CMS_W_LOGICAL && + tex_inst->opcode != SHADER_OPCODE_TXF_UMS_LOGICAL) + return false; + + /* XXX - This shouldn't be necessary. */ + if (tex_inst->prev->is_head_sentinel()) + return false; + + /* Check that the FB write sources are fully initialized by the single + * texturing instruction. + */ + for (unsigned i = 0; i < FB_WRITE_LOGICAL_NUM_SRCS; i++) { + if (i == FB_WRITE_LOGICAL_SRC_COLOR0) { + if (!fb_write->src[i].equals(tex_inst->dst) || + fb_write->size_read(i) != tex_inst->size_written) + return false; + } else if (i != FB_WRITE_LOGICAL_SRC_COMPONENTS) { + if (fb_write->src[i].file != BAD_FILE) + return false; + } + } + + assert(!tex_inst->eot); /* We can't get here twice */ + assert((tex_inst->offset & (0xff << 24)) == 0); + + const fs_builder ibld(this, block, tex_inst); + + tex_inst->offset |= fb_write->target << 24; + tex_inst->eot = true; + tex_inst->dst = ibld.null_reg_ud(); + tex_inst->size_written = 0; + fb_write->remove(cfg->blocks[cfg->num_blocks - 1]); + + /* Marking EOT is sufficient, lower_logical_sends() will notice the EOT + * flag and submit a header together with the sampler message as required + * by the hardware. 
+ */ + invalidate_live_intervals(); + return true; +} + +bool +fs_visitor::opt_register_renaming() +{ + bool progress = false; + int depth = 0; + + int remap[alloc.count]; + memset(remap, -1, sizeof(int) * alloc.count); + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) { + depth++; + } else if (inst->opcode == BRW_OPCODE_ENDIF || + inst->opcode == BRW_OPCODE_WHILE) { + depth--; + } + + /* Rewrite instruction sources. */ + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF && + remap[inst->src[i].nr] != -1 && + remap[inst->src[i].nr] != inst->src[i].nr) { + inst->src[i].nr = remap[inst->src[i].nr]; + progress = true; + } + } + + const int dst = inst->dst.nr; + + if (depth == 0 && + inst->dst.file == VGRF && + alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written && + !inst->is_partial_write()) { + if (remap[dst] == -1) { + remap[dst] = dst; + } else { + remap[dst] = alloc.allocate(regs_written(inst)); + inst->dst.nr = remap[dst]; + progress = true; + } + } else if (inst->dst.file == VGRF && + remap[dst] != -1 && + remap[dst] != dst) { + inst->dst.nr = remap[dst]; + progress = true; + } + } + + if (progress) { + invalidate_live_intervals(); + + for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) { + if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != -1) { + delta_xy[i].nr = remap[delta_xy[i].nr]; + } + } + } + + return progress; +} + +/** + * Remove redundant or useless discard jumps. 
+ * + * For example, we can eliminate jumps in the following sequence: + * + * discard-jump (redundant with the next jump) + * discard-jump (useless; jumps to the next instruction) + * placeholder-halt + */ +bool +fs_visitor::opt_redundant_discard_jumps() +{ + bool progress = false; + + bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1]; + + fs_inst *placeholder_halt = NULL; + foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) { + if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) { + placeholder_halt = inst; + break; + } + } + + if (!placeholder_halt) + return false; + + /* Delete any HALTs immediately before the placeholder halt. */ + for (fs_inst *prev = (fs_inst *) placeholder_halt->prev; + !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP; + prev = (fs_inst *) placeholder_halt->prev) { + prev->remove(last_bblock); + progress = true; + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + +/** + * Compute a bitmask with GRF granularity with a bit set for each GRF starting + * from \p r.offset which overlaps the region starting at \p s.offset and + * spanning \p ds bytes. + */ +static inline unsigned +mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds) +{ + const int rel_offset = reg_offset(s) - reg_offset(r); + const int shift = rel_offset / REG_SIZE; + const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE); + assert(reg_space(r) == reg_space(s) && + shift >= 0 && shift < int(8 * sizeof(unsigned))); + return ((1 << n) - 1) << shift; +} + +bool +fs_visitor::compute_to_mrf() +{ + bool progress = false; + int next_ip = 0; + + /* No MRFs on Gen >= 7. 
*/ + if (devinfo->gen >= 7) + return false; + + calculate_live_intervals(); + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + int ip = next_ip; + next_ip++; + + if (inst->opcode != BRW_OPCODE_MOV || + inst->is_partial_write() || + inst->dst.file != MRF || inst->src[0].file != VGRF || + inst->dst.type != inst->src[0].type || + inst->src[0].abs || inst->src[0].negate || + !inst->src[0].is_contiguous() || + inst->src[0].offset % REG_SIZE != 0) + continue; + + /* Can't compute-to-MRF this GRF if someone else was going to + * read it later. + */ + if (this->virtual_grf_end[inst->src[0].nr] > ip) + continue; + + /* Found a move of a GRF to a MRF. Let's see if we can go rewrite the + * things that computed the value of all GRFs of the source region. The + * regs_left bitset keeps track of the registers we haven't yet found a + * generating instruction for. + */ + unsigned regs_left = (1 << regs_read(inst, 0)) - 1; + + foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { + if (regions_overlap(scan_inst->dst, scan_inst->size_written, + inst->src[0], inst->size_read(0))) { + /* Found the last thing to write our reg we want to turn + * into a compute-to-MRF. + */ + + /* If this one instruction didn't populate all the + * channels, bail. We might be able to rewrite everything + * that writes that reg, but it would require smarter + * tracking. + */ + if (scan_inst->is_partial_write()) + break; + + /* Handling things not fully contained in the source of the copy + * would need us to understand coalescing out more than one MOV at + * a time. + */ + if (!region_contained_in(scan_inst->dst, scan_inst->size_written, + inst->src[0], inst->size_read(0))) + break; + + /* SEND instructions can't have MRF as a destination. */ + if (scan_inst->mlen) + break; + + if (devinfo->gen == 6) { + /* gen6 math instructions must have the destination be + * GRF, so no compute-to-MRF for them. 
+ */ + if (scan_inst->is_math()) { + break; + } + } + + /* Clear the bits for any registers this instruction overwrites. */ + regs_left &= ~mask_relative_to( + inst->src[0], scan_inst->dst, scan_inst->size_written); + if (!regs_left) + break; + } + + /* We don't handle control flow here. Most computation of + * values that end up in MRFs are shortly before the MRF + * write anyway. + */ + if (block->start() == scan_inst) + break; + + /* You can't read from an MRF, so if someone else reads our + * MRF's source GRF that we wanted to rewrite, that stops us. + */ + bool interfered = false; + for (int i = 0; i < scan_inst->sources; i++) { + if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i), + inst->src[0], inst->size_read(0))) { + interfered = true; + } + } + if (interfered) + break; + + if (regions_overlap(scan_inst->dst, scan_inst->size_written, + inst->dst, inst->size_written)) { + /* If somebody else writes our MRF here, we can't + * compute-to-MRF before that. + */ + break; + } + + if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 && + regions_overlap(fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE, + inst->dst, inst->size_written)) { + /* Found a SEND instruction, which means that there are + * live values in MRFs from base_mrf to base_mrf + + * scan_inst->mlen - 1. Don't go pushing our MRF write up + * above it. + */ + break; + } + } + + if (regs_left) + continue; + + /* Found all generating instructions of our MRF's source value, so it + * should be safe to rewrite them to point to the MRF directly. + */ + regs_left = (1 << regs_read(inst, 0)) - 1; + + foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { + if (regions_overlap(scan_inst->dst, scan_inst->size_written, + inst->src[0], inst->size_read(0))) { + /* Clear the bits for any registers this instruction overwrites. 
*/ + regs_left &= ~mask_relative_to( + inst->src[0], scan_inst->dst, scan_inst->size_written); + + const unsigned rel_offset = reg_offset(scan_inst->dst) - + reg_offset(inst->src[0]); + + if (inst->dst.nr & BRW_MRF_COMPR4) { + /* Apply the same address transformation done by the hardware + * for COMPR4 MRF writes. + */ + assert(rel_offset < 2 * REG_SIZE); + scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4; + + /* Clear the COMPR4 bit if the generating instruction is not + * compressed. + */ + if (scan_inst->size_written < 2 * REG_SIZE) + scan_inst->dst.nr &= ~BRW_MRF_COMPR4; + + } else { + /* Calculate the MRF number the result of this instruction is + * ultimately written to. + */ + scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE; + } + + scan_inst->dst.file = MRF; + scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE; + scan_inst->saturate |= inst->saturate; + if (!regs_left) + break; + } + } + + assert(!regs_left); + inst->remove(block); + progress = true; + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + +/** + * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control + * flow. We could probably do better here with some form of divergence + * analysis. + */ +bool +fs_visitor::eliminate_find_live_channel() +{ + bool progress = false; + unsigned depth = 0; + + if (!brw_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) { + /* The optimization below assumes that channel zero is live on thread + * dispatch, which may not be the case if the fixed function dispatches + * threads sparsely. + */ + return false; + } + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + switch (inst->opcode) { + case BRW_OPCODE_IF: + case BRW_OPCODE_DO: + depth++; + break; + + case BRW_OPCODE_ENDIF: + case BRW_OPCODE_WHILE: + depth--; + break; + + case FS_OPCODE_DISCARD_JUMP: + /* This can potentially make control flow non-uniform until the end + * of the program. 
+ */ + return progress; + + case SHADER_OPCODE_FIND_LIVE_CHANNEL: + if (depth == 0) { + inst->opcode = BRW_OPCODE_MOV; + inst->src[0] = brw_imm_ud(0u); + inst->sources = 1; + inst->force_writemask_all = true; + progress = true; + } + break; + + default: + break; + } + } + + return progress; +} + +/** + * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE + * instructions to FS_OPCODE_REP_FB_WRITE. + */ +void +fs_visitor::emit_repclear_shader() +{ + brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; + int base_mrf = 0; + int color_mrf = base_mrf + 2; + fs_inst *mov; + + if (uniforms > 0) { + mov = bld.exec_all().group(4, 0) + .MOV(brw_message_reg(color_mrf), + fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)); + } else { + struct brw_reg reg = + brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4, + BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); + + mov = bld.exec_all().group(4, 0) + .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg)); + } + + fs_inst *write; + if (key->nr_color_regions == 1) { + write = bld.emit(FS_OPCODE_REP_FB_WRITE); + write->saturate = key->clamp_fragment_color; + write->base_mrf = color_mrf; + write->target = 0; + write->header_size = 0; + write->mlen = 1; + } else { + assume(key->nr_color_regions > 0); + for (int i = 0; i < key->nr_color_regions; ++i) { + write = bld.emit(FS_OPCODE_REP_FB_WRITE); + write->saturate = key->clamp_fragment_color; + write->base_mrf = base_mrf; + write->target = i; + write->header_size = 2; + write->mlen = 3; + } + } + write->eot = true; + + calculate_cfg(); + + assign_constant_locations(); + assign_curb_setup(); + + /* Now that we have the uniform assigned, go ahead and force it to a vec4. */ + if (uniforms > 0) { + assert(mov->src[0].file == FIXED_GRF); + mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0); + } +} + +/** + * Walks through basic blocks, looking for repeated MRF writes and + * removing the later ones. 
+ */ +bool +fs_visitor::remove_duplicate_mrf_writes() +{ + fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->gen)]; + bool progress = false; + + /* Need to update the MRF tracking for compressed instructions. */ + if (dispatch_width >= 16) + return false; + + memset(last_mrf_move, 0, sizeof(last_mrf_move)); + + foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { + if (inst->is_control_flow()) { + memset(last_mrf_move, 0, sizeof(last_mrf_move)); + } + + if (inst->opcode == BRW_OPCODE_MOV && + inst->dst.file == MRF) { + fs_inst *prev_inst = last_mrf_move[inst->dst.nr]; + if (prev_inst && inst->equals(prev_inst)) { + inst->remove(block); + progress = true; + continue; + } + } + + /* Clear out the last-write records for MRFs that were overwritten. */ + if (inst->dst.file == MRF) { + last_mrf_move[inst->dst.nr] = NULL; + } + + if (inst->mlen > 0 && inst->base_mrf != -1) { + /* Found a SEND instruction, which will include two or fewer + * implied MRF writes. We could do better here. + */ + for (int i = 0; i < implied_mrf_writes(inst); i++) { + last_mrf_move[inst->base_mrf + i] = NULL; + } + } + + /* Clear out any MRF move records whose sources got overwritten. */ + for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) { + if (last_mrf_move[i] && + regions_overlap(inst->dst, inst->size_written, + last_mrf_move[i]->src[0], + last_mrf_move[i]->size_read(0))) { + last_mrf_move[i] = NULL; + } + } + + if (inst->opcode == BRW_OPCODE_MOV && + inst->dst.file == MRF && + inst->src[0].file != ARF && + !inst->is_partial_write()) { + last_mrf_move[inst->dst.nr] = inst; + } + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + +static void +clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len) +{ + /* Clear the flag for registers that actually got read (as expected). 
*/ + for (int i = 0; i < inst->sources; i++) { + int grf; + if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) { + grf = inst->src[i].nr; + } else { + continue; + } + + if (grf >= first_grf && + grf < first_grf + grf_len) { + deps[grf - first_grf] = false; + if (inst->exec_size == 16) + deps[grf - first_grf + 1] = false; + } + } +} + +/** + * Implements this workaround for the original 965: + * + * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not + * check for post destination dependencies on this instruction, software + * must ensure that there is no destination hazard for the case of ‘write + * followed by a posted write’ shown in the following example. + * + * 1. mov r3 0 + * 2. send r3.xy <rest of send instruction> + * 3. mov r2 r3 + * + * Due to no post-destination dependency check on the ‘send’, the above + * code sequence could have two instructions (1 and 2) in flight at the + * same time that both consider ‘r3’ as the target of their final writes. + */ +void +fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block, + fs_inst *inst) +{ + int write_len = regs_written(inst); + int first_write_grf = inst->dst.nr; + bool needs_dep[BRW_MAX_MRF(devinfo->gen)]; + assert(write_len < (int)sizeof(needs_dep) - 1); + + memset(needs_dep, false, sizeof(needs_dep)); + memset(needs_dep, true, write_len); + + clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len); + + /* Walk backwards looking for writes to registers we're writing which + * aren't read since being written. If we hit the start of the program, + * we assume that there are no outstanding dependencies on entry to the + * program. + */ + foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { + /* If we hit control flow, assume that there *are* outstanding + * dependencies, and force their cleanup before our instruction. 
+ */ + if (block->start() == scan_inst && block->num != 0) { + for (int i = 0; i < write_len; i++) { + if (needs_dep[i]) + DEP_RESOLVE_MOV(fs_builder(this, block, inst), + first_write_grf + i); + } + return; + } + + /* We insert our reads as late as possible on the assumption that any + * instruction but a MOV that might have left us an outstanding + * dependency has more latency than a MOV. + */ + if (scan_inst->dst.file == VGRF) { + for (unsigned i = 0; i < regs_written(scan_inst); i++) { + int reg = scan_inst->dst.nr + i; + + if (reg >= first_write_grf && + reg < first_write_grf + write_len && + needs_dep[reg - first_write_grf]) { + DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg); + needs_dep[reg - first_write_grf] = false; + if (scan_inst->exec_size == 16) + needs_dep[reg - first_write_grf + 1] = false; + } + } + } + + /* Clear the flag for registers that actually got read (as expected). */ + clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len); + + /* Continue the loop only if we haven't resolved all the dependencies */ + int i; + for (i = 0; i < write_len; i++) { + if (needs_dep[i]) + break; + } + if (i == write_len) + return; + } +} + +/** + * Implements this workaround for the original 965: + * + * "[DevBW, DevCL] Errata: A destination register from a send can not be + * used as a destination register until after it has been sourced by an + * instruction with a different destination register. + */ +void +fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst) +{ + int write_len = regs_written(inst); + int first_write_grf = inst->dst.nr; + bool needs_dep[BRW_MAX_MRF(devinfo->gen)]; + assert(write_len < (int)sizeof(needs_dep) - 1); + + memset(needs_dep, false, sizeof(needs_dep)); + memset(needs_dep, true, write_len); + /* Walk forwards looking for writes to registers we're writing which aren't + * read before being written. 
+ */ + foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) { + /* If we hit control flow, force resolve all remaining dependencies. */ + if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) { + for (int i = 0; i < write_len; i++) { + if (needs_dep[i]) + DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst), + first_write_grf + i); + } + return; + } + + /* Clear the flag for registers that actually got read (as expected). */ + clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len); + + /* We insert our reads as late as possible since they're reading the + * result of a SEND, which has massive latency. + */ + if (scan_inst->dst.file == VGRF && + scan_inst->dst.nr >= first_write_grf && + scan_inst->dst.nr < first_write_grf + write_len && + needs_dep[scan_inst->dst.nr - first_write_grf]) { + DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst), + scan_inst->dst.nr); + needs_dep[scan_inst->dst.nr - first_write_grf] = false; + } + + /* Continue the loop only if we haven't resolved all the dependencies */ + int i; + for (i = 0; i < write_len; i++) { + if (needs_dep[i]) + break; + } + if (i == write_len) + return; + } +} + +void +fs_visitor::insert_gen4_send_dependency_workarounds() +{ + if (devinfo->gen != 4 || devinfo->is_g4x) + return; + + bool progress = false; + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->mlen != 0 && inst->dst.file == VGRF) { + insert_gen4_pre_send_dependency_workarounds(block, inst); + insert_gen4_post_send_dependency_workarounds(block, inst); + progress = true; + } + } + + if (progress) + invalidate_live_intervals(); +} + +/** + * Turns the generic expression-style uniform pull constant load instruction + * into a hardware-specific series of instructions for loading a pull + * constant. 
+ * + * The expression style allows the CSE pass before this to optimize out + * repeated loads from the same offset, and gives the pre-register-allocation + * scheduling full flexibility, while the conversion to native instructions + * allows the post-register-allocation scheduler the best information + * possible. + * + * Note that execution masking for setting up pull constant loads is special: + * the channels that need to be written are unrelated to the current execution + * mask, since a later instruction will use one of the result channels as a + * source operand for all 8 or 16 of its channels. + */ +void +fs_visitor::lower_uniform_pull_constant_loads() +{ + foreach_block_and_inst (block, fs_inst, inst, cfg) { + if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD) + continue; + + if (devinfo->gen >= 7) { + const fs_builder ubld = fs_builder(this, block, inst).exec_all(); + const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD); + + ubld.group(8, 0).MOV(payload, + retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + ubld.group(1, 0).MOV(component(payload, 2), + brw_imm_ud(inst->src[1].ud / 16)); + + inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7; + inst->src[1] = payload; + inst->header_size = 1; + inst->mlen = 1; + + invalidate_live_intervals(); + } else { + /* Before register allocation, we didn't tell the scheduler about the + * MRF we use. We know it's safe to use this MRF because nothing + * else does except for register spill/unspill, which generates and + * uses its MRF within a single IR instruction. 
+ */ + inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1; + inst->mlen = 1; + } + } +} + +bool +fs_visitor::lower_load_payload() +{ + bool progress = false; + + foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { + if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) + continue; + + assert(inst->dst.file == MRF || inst->dst.file == VGRF); + assert(inst->saturate == false); + fs_reg dst = inst->dst; + + /* Get rid of COMPR4. We'll add it back in if we need it */ + if (dst.file == MRF) + dst.nr = dst.nr & ~BRW_MRF_COMPR4; + + const fs_builder ibld(this, block, inst); + const fs_builder hbld = ibld.exec_all().group(8, 0); + + for (uint8_t i = 0; i < inst->header_size; i++) { + if (inst->src[i].file != BAD_FILE) { + fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD); + fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD); + hbld.MOV(mov_dst, mov_src); + } + dst = offset(dst, hbld, 1); + } + + if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) && + inst->exec_size > 8) { + /* In this case, the payload portion of the LOAD_PAYLOAD isn't + * a straightforward copy. Instead, the result of the + * LOAD_PAYLOAD is treated as interleaved and the first four + * non-header sources are unpacked as: + * + * m + 0: r0 + * m + 1: g0 + * m + 2: b0 + * m + 3: a0 + * m + 4: r1 + * m + 5: g1 + * m + 6: b1 + * m + 7: a1 + * + * This is used for gen <= 5 fb writes. + */ + assert(inst->exec_size == 16); + assert(inst->header_size + 4 <= inst->sources); + for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) { + if (inst->src[i].file != BAD_FILE) { + if (devinfo->has_compr4) { + fs_reg compr4_dst = retype(dst, inst->src[i].type); + compr4_dst.nr |= BRW_MRF_COMPR4; + ibld.MOV(compr4_dst, inst->src[i]); + } else { + /* Platform doesn't have COMPR4. 
We have to fake it */ + fs_reg mov_dst = retype(dst, inst->src[i].type); + ibld.half(0).MOV(mov_dst, half(inst->src[i], 0)); + mov_dst.nr += 4; + ibld.half(1).MOV(mov_dst, half(inst->src[i], 1)); + } + } + + dst.nr++; + } + + /* The loop above only ever incremented us through the first set + * of 4 registers. However, thanks to the magic of COMPR4, we + * actually wrote to the first 8 registers, so we need to take + * that into account now. + */ + dst.nr += 4; + + /* The COMPR4 code took care of the first 4 sources. We'll let + * the regular path handle any remaining sources. Yes, we are + * modifying the instruction but we're about to delete it so + * this really doesn't hurt anything. + */ + inst->header_size += 4; + } + + for (uint8_t i = inst->header_size; i < inst->sources; i++) { + if (inst->src[i].file != BAD_FILE) + ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]); + dst = offset(dst, ibld, 1); + } + + inst->remove(block); + progress = true; + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + +bool +fs_visitor::lower_integer_multiplication() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + const fs_builder ibld(this, block, inst); + + if (inst->opcode == BRW_OPCODE_MUL) { + if (inst->dst.is_accumulator() || + (inst->dst.type != BRW_REGISTER_TYPE_D && + inst->dst.type != BRW_REGISTER_TYPE_UD)) + continue; + + /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit + * operation directly, but CHV/BXT cannot. + */ + if (devinfo->gen >= 8 && + !devinfo->is_cherryview && !devinfo->is_broxton) + continue; + + if (inst->src[1].file == IMM && + inst->src[1].ud < (1 << 16)) { + /* The MUL instruction isn't commutative. On Gen <= 6, only the low + * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of + * src1 are used. + * + * If multiplying by an immediate value that fits in 16-bits, do a + * single MUL instruction with that value in the proper location. 
+ */ + if (devinfo->gen < 7) { + fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), + inst->dst.type); + ibld.MOV(imm, inst->src[1]); + ibld.MUL(inst->dst, imm, inst->src[0]); + } else { + const bool ud = (inst->src[1].type == BRW_REGISTER_TYPE_UD); + ibld.MUL(inst->dst, inst->src[0], + ud ? brw_imm_uw(inst->src[1].ud) + : brw_imm_w(inst->src[1].d)); + } + } else { + /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot + * do 32-bit integer multiplication in one instruction, but instead + * must do a sequence (which actually calculates a 64-bit result): + * + * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D + * mach(8) null g3<8,8,1>D g4<8,8,1>D + * mov(8) g2<1>D acc0<8,8,1>D + * + * But on Gen > 6, the ability to use second accumulator register + * (acc1) for non-float data types was removed, preventing a simple + * implementation in SIMD16. A 16-channel result can be calculated by + * executing the three instructions twice in SIMD8, once with quarter + * control of 1Q for the first eight channels and again with 2Q for + * the second eight channels. + * + * Which accumulator register is implicitly accessed (by AccWrEnable + * for instance) is determined by the quarter control. Unfortunately + * Ivybridge (and presumably Baytrail) has a hardware bug in which an + * implicit accumulator access by an instruction with 2Q will access + * acc1 regardless of whether the data type is usable in acc1. + * + * Specifically, the 2Q mach(8) writes acc1 which does not exist for + * integer data types. 
+ * + * Since we only want the low 32-bits of the result, we can do two + * 32-bit x 16-bit multiplies (like the mul and mach are doing), and + * adjust the high result and add them (like the mach is doing): + * + * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW + * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW + * shl(8) g9<1>D g8<8,8,1>D 16D + * add(8) g2<1>D g7<8,8,1>D g9<8,8,1>D + * + * We avoid the shl instruction by realizing that we only want to add + * the low 16-bits of the "high" result to the high 16-bits of the + * "low" result and using proper regioning on the add: + * + * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW + * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW + * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW + * + * Since it does not use the (single) accumulator register, we can + * schedule multi-component multiplications much better. + */ + + fs_reg orig_dst = inst->dst; + if (orig_dst.is_null() || orig_dst.file == MRF) { + inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8), + inst->dst.type); + } + fs_reg low = inst->dst; + fs_reg high(VGRF, alloc.allocate(dispatch_width / 8), + inst->dst.type); + + if (devinfo->gen >= 7) { + if (inst->src[1].file == IMM) { + ibld.MUL(low, inst->src[0], + brw_imm_uw(inst->src[1].ud & 0xffff)); + ibld.MUL(high, inst->src[0], + brw_imm_uw(inst->src[1].ud >> 16)); + } else { + ibld.MUL(low, inst->src[0], + subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0)); + ibld.MUL(high, inst->src[0], + subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1)); + } + } else { + ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0), + inst->src[1]); + ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1), + inst->src[1]); + } + + ibld.ADD(subscript(inst->dst, BRW_REGISTER_TYPE_UW, 1), + subscript(low, BRW_REGISTER_TYPE_UW, 1), + subscript(high, BRW_REGISTER_TYPE_UW, 0)); + + if (inst->conditional_mod || orig_dst.file == MRF) { + set_condmod(inst->conditional_mod, + ibld.MOV(orig_dst, inst->dst)); + } + } + + } else if (inst->opcode == 
SHADER_OPCODE_MULH) { + /* Should have been lowered to 8-wide. */ + assert(inst->exec_size <= get_lowered_simd_width(devinfo, inst)); + const fs_reg acc = retype(brw_acc_reg(inst->exec_size), + inst->dst.type); + fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]); + fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]); + + if (devinfo->gen >= 8) { + /* Until Gen8, integer multiplies read 32-bits from one source, + * and 16-bits from the other, and relying on the MACH instruction + * to generate the high bits of the result. + * + * On Gen8, the multiply instruction does a full 32x32-bit + * multiply, but in order to do a 64-bit multiply we can simulate + * the previous behavior and then use a MACH instruction. + * + * FINISHME: Don't use source modifiers on src1. + */ + assert(mul->src[1].type == BRW_REGISTER_TYPE_D || + mul->src[1].type == BRW_REGISTER_TYPE_UD); + mul->src[1].type = BRW_REGISTER_TYPE_UW; + mul->src[1].stride *= 2; + + } else if (devinfo->gen == 7 && !devinfo->is_haswell && + inst->group > 0) { + /* Among other things the quarter control bits influence which + * accumulator register is used by the hardware for instructions + * that access the accumulator implicitly (e.g. MACH). A + * second-half instruction would normally map to acc1, which + * doesn't exist on Gen7 and up (the hardware does emulate it for + * floating-point instructions *only* by taking advantage of the + * extra precision of acc0 not normally used for floating point + * arithmetic). + * + * HSW and up are careful enough not to try to access an + * accumulator register that doesn't exist, but on earlier Gen7 + * hardware we need to make sure that the quarter control bits are + * zero to avoid non-deterministic behaviour and emit an extra MOV + * to get the result masked correctly according to the current + * channel enables. 
+ */ + mach->group = 0; + mach->force_writemask_all = true; + mach->dst = ibld.vgrf(inst->dst.type); + ibld.MOV(inst->dst, mach->dst); + } + } else { + continue; + } + + inst->remove(block); + progress = true; + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + +bool +fs_visitor::lower_minmax() +{ + assert(devinfo->gen < 6); + + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + const fs_builder ibld(this, block, inst); + + if (inst->opcode == BRW_OPCODE_SEL && + inst->predicate == BRW_PREDICATE_NONE) { + /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of + * the original SEL.L/GE instruction + */ + ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1], + inst->conditional_mod); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->conditional_mod = BRW_CONDITIONAL_NONE; + + progress = true; + } + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + +static void +setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key, + fs_reg *dst, fs_reg color, unsigned components) +{ + if (key->clamp_fragment_color) { + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4); + assert(color.type == BRW_REGISTER_TYPE_F); + + for (unsigned i = 0; i < components; i++) + set_saturate(true, + bld.MOV(offset(tmp, bld, i), offset(color, bld, i))); + + color = tmp; + } + + for (unsigned i = 0; i < components; i++) + dst[i] = offset(color, bld, i); +} + +static void +lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, + const struct brw_wm_prog_data *prog_data, + const brw_wm_prog_key *key, + const fs_visitor::thread_payload &payload) +{ + assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM); + const gen_device_info *devinfo = bld.shader->devinfo; + const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0]; + const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1]; + const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA]; + const 
fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH]; + const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH]; + const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL]; + fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK]; + const unsigned components = + inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud; + + /* We can potentially have a message length of up to 15, so we have to set + * base_mrf to either 0 or 1 in order to fit in m0..m15. + */ + fs_reg sources[15]; + int header_size = 2, payload_header_size; + unsigned length = 0; + + /* From the Sandy Bridge PRM, volume 4, page 198: + * + * "Dispatched Pixel Enables. One bit per pixel indicating + * which pixels were originally enabled when the thread was + * dispatched. This field is only required for the end-of- + * thread message and on all dual-source messages." + */ + if (devinfo->gen >= 6 && + (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) && + color1.file == BAD_FILE && + key->nr_color_regions == 1) { + header_size = 0; + } + + if (header_size != 0) { + assert(header_size == 2); + /* Allocate 2 registers for a header */ + length += 2; + } + + if (payload.aa_dest_stencil_reg) { + sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1)); + bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha") + .MOV(sources[length], + fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0))); + length++; + } + + if (sample_mask.file != BAD_FILE) { + sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1), + BRW_REGISTER_TYPE_UD); + + /* Hand over gl_SampleMask. Only the lower 16 bits of each channel are + * relevant. Since it's unsigned single words one vgrf is always + * 16-wide, but only the lower or higher 8 channels will be used by the + * hardware when doing a SIMD8 write depending on whether we have + * selected the subspans for the first or second half respectively. 
+ */ + assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4); + sample_mask.type = BRW_REGISTER_TYPE_UW; + sample_mask.stride *= 2; + + bld.exec_all().annotate("FB write oMask") + .MOV(horiz_offset(retype(sources[length], BRW_REGISTER_TYPE_UW), + inst->group), + sample_mask); + length++; + } + + payload_header_size = length; + + if (src0_alpha.file != BAD_FILE) { + /* FIXME: This is being passed at the wrong location in the payload and + * doesn't work when gl_SampleMask and MRTs are used simultaneously. + * It's supposed to be immediately before oMask but there seems to be no + * reasonable way to pass them in the correct order because LOAD_PAYLOAD + * requires header sources to form a contiguous segment at the beginning + * of the message and src0_alpha has per-channel semantics. + */ + setup_color_payload(bld, key, &sources[length], src0_alpha, 1); + length++; + } else if (key->replicate_alpha && inst->target != 0) { + /* Handle the case when fragment shader doesn't write to draw buffer + * zero. No need to call setup_color_payload() for src0_alpha because + * alpha value will be undefined. + */ + length++; + } + + setup_color_payload(bld, key, &sources[length], color0, components); + length += 4; + + if (color1.file != BAD_FILE) { + setup_color_payload(bld, key, &sources[length], color1, components); + length += 4; + } + + if (src_depth.file != BAD_FILE) { + sources[length] = src_depth; + length++; + } + + if (dst_depth.file != BAD_FILE) { + sources[length] = dst_depth; + length++; + } + + if (src_stencil.file != BAD_FILE) { + assert(devinfo->gen >= 9); + assert(bld.dispatch_width() != 16); + + /* XXX: src_stencil is only available on gen9+. dst_depth is never + * available on gen9+. As such it's impossible to have both enabled at the + * same time and therefore length cannot overrun the array. 
+ */ + assert(length < 15); + + sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.exec_all().annotate("FB write OS") + .MOV(retype(sources[length], BRW_REGISTER_TYPE_UB), + subscript(src_stencil, BRW_REGISTER_TYPE_UB, 0)); + length++; + } + + fs_inst *load; + if (devinfo->gen >= 7) { + /* Send from the GRF */ + fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F); + load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size); + payload.nr = bld.shader->alloc.allocate(regs_written(load)); + load->dst = payload; + + inst->src[0] = payload; + inst->resize_sources(1); + } else { + /* Send from the MRF */ + load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F), + sources, length, payload_header_size); + + /* On pre-SNB, we have to interlace the color values. LOAD_PAYLOAD + * will do this for us if we just give it a COMPR4 destination. + */ + if (devinfo->gen < 6 && bld.dispatch_width() == 16) + load->dst.nr |= BRW_MRF_COMPR4; + + inst->resize_sources(0); + inst->base_mrf = 1; + } + + inst->opcode = FS_OPCODE_FB_WRITE; + inst->mlen = regs_written(load); + inst->header_size = header_size; +} + +static void +lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst) +{ + const fs_builder &ubld = bld.exec_all(); + const unsigned length = 2; + const fs_reg header = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD, length); + + ubld.group(16, 0) + .MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + + inst->resize_sources(1); + inst->src[0] = header; + inst->opcode = FS_OPCODE_FB_READ; + inst->mlen = length; + inst->header_size = length; +} + +static void +lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op, + const fs_reg &coordinate, + const fs_reg &shadow_c, + const fs_reg &lod, const fs_reg &lod2, + const fs_reg &surface, + const fs_reg &sampler, + unsigned coord_components, + unsigned grad_components) +{ + const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB || + op == SHADER_OPCODE_TXF || 
op == SHADER_OPCODE_TXS); + fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F); + fs_reg msg_end = msg_begin; + + /* g0 header. */ + msg_end = offset(msg_end, bld.group(8, 0), 1); + + for (unsigned i = 0; i < coord_components; i++) + bld.MOV(retype(offset(msg_end, bld, i), coordinate.type), + offset(coordinate, bld, i)); + + msg_end = offset(msg_end, bld, coord_components); + + /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8 + * require all three components to be present and zero if they are unused. + */ + if (coord_components > 0 && + (has_lod || shadow_c.file != BAD_FILE || + (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) { + /* msg_end was already advanced past the coordinates written above, + * so the zero-padding slots must be addressed relative to it rather + * than relative to the start of the coordinates. + */ + for (unsigned i = coord_components; i < 3; i++) + bld.MOV(offset(msg_end, bld, i - coord_components), + brw_imm_f(0.0f)); + + msg_end = offset(msg_end, bld, 3 - coord_components); + } + + if (op == SHADER_OPCODE_TXD) { + /* TXD unsupported in SIMD16 mode. */ + assert(bld.dispatch_width() == 8); + + /* the slots for u and v are always present, but r is optional */ + if (coord_components < 2) + msg_end = offset(msg_end, bld, 2 - coord_components); + + /* P = u, v, r + * dPdx = dudx, dvdx, drdx + * dPdy = dudy, dvdy, drdy + * + * 1-arg: Does not exist. + * + * 2-arg: dudx dvdx dudy dvdy + * dPdx.x dPdx.y dPdy.x dPdy.y + * m4 m5 m6 m7 + * + * 3-arg: dudx dvdx drdx dudy dvdy drdy + * dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z + * m5 m6 m7 m8 m9 m10 + */ + for (unsigned i = 0; i < grad_components; i++) + bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i)); + + msg_end = offset(msg_end, bld, MAX2(grad_components, 2)); + + for (unsigned i = 0; i < grad_components; i++) + bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i)); + + msg_end = offset(msg_end, bld, MAX2(grad_components, 2)); + } + + if (has_lod) { + /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without* + * shadow comparator (including RESINFO) it's unsupported in SIMD8 mode. + */ + assert(shadow_c.file != BAD_FILE ? 
bld.dispatch_width() == 8 : + bld.dispatch_width() == 16); + + const brw_reg_type type = + (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ? + BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F); + bld.MOV(retype(msg_end, type), lod); + msg_end = offset(msg_end, bld, 1); + } + + if (shadow_c.file != BAD_FILE) { + if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) { + /* There's no plain shadow compare message, so we use shadow + * compare with a bias of 0.0. + */ + bld.MOV(msg_end, brw_imm_f(0.0f)); + msg_end = offset(msg_end, bld, 1); + } + + bld.MOV(msg_end, shadow_c); + msg_end = offset(msg_end, bld, 1); + } + + inst->opcode = op; + inst->src[0] = reg_undef; + inst->src[1] = surface; + inst->src[2] = sampler; + inst->resize_sources(3); + inst->base_mrf = msg_begin.nr; + inst->mlen = msg_end.nr - msg_begin.nr; + inst->header_size = 1; +} + +static void +lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op, + const fs_reg &coordinate, + const fs_reg &shadow_c, + const fs_reg &lod, const fs_reg &lod2, + const fs_reg &sample_index, + const fs_reg &surface, + const fs_reg &sampler, + unsigned coord_components, + unsigned grad_components) +{ + fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F); + fs_reg msg_coords = message; + unsigned header_size = 0; + + if (inst->offset != 0) { + /* The offsets set up by the visitor are in the m1 header, so we can't + * go headerless. 
+ */ + header_size = 1; + message.nr--; + } + + for (unsigned i = 0; i < coord_components; i++) + bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type), + offset(coordinate, bld, i)); + + fs_reg msg_end = offset(msg_coords, bld, coord_components); + fs_reg msg_lod = offset(msg_coords, bld, 4); + + if (shadow_c.file != BAD_FILE) { + fs_reg msg_shadow = msg_lod; + bld.MOV(msg_shadow, shadow_c); + msg_lod = offset(msg_shadow, bld, 1); + msg_end = msg_lod; + } + + switch (op) { + case SHADER_OPCODE_TXL: + case FS_OPCODE_TXB: + bld.MOV(msg_lod, lod); + msg_end = offset(msg_lod, bld, 1); + break; + case SHADER_OPCODE_TXD: + /** + * P = u, v, r + * dPdx = dudx, dvdx, drdx + * dPdy = dudy, dvdy, drdy + * + * Load up these values: + * - dudx dudy dvdx dvdy drdx drdy + * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z + */ + msg_end = msg_lod; + for (unsigned i = 0; i < grad_components; i++) { + bld.MOV(msg_end, offset(lod, bld, i)); + msg_end = offset(msg_end, bld, 1); + + bld.MOV(msg_end, offset(lod2, bld, i)); + msg_end = offset(msg_end, bld, 1); + } + break; + case SHADER_OPCODE_TXS: + msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD); + bld.MOV(msg_lod, lod); + msg_end = offset(msg_lod, bld, 1); + break; + case SHADER_OPCODE_TXF: + msg_lod = offset(msg_coords, bld, 3); + bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod); + msg_end = offset(msg_lod, bld, 1); + break; + case SHADER_OPCODE_TXF_CMS: + msg_lod = offset(msg_coords, bld, 3); + /* lod */ + bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)); + /* sample index */ + bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index); + msg_end = offset(msg_lod, bld, 2); + break; + default: + break; + } + + inst->opcode = op; + inst->src[0] = reg_undef; + inst->src[1] = surface; + inst->src[2] = sampler; + inst->resize_sources(3); + inst->base_mrf = message.nr; + inst->mlen = msg_end.nr - message.nr; + inst->header_size = header_size; + + /* Message length > MAX_SAMPLER_MESSAGE_SIZE 
disallowed by hardware. */ + assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE); +} + +static bool +is_high_sampler(const struct gen_device_info *devinfo, const fs_reg &sampler) +{ + if (devinfo->gen < 8 && !devinfo->is_haswell) + return false; + + return sampler.file != IMM || sampler.ud >= 16; +} + +static void +lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op, + const fs_reg &coordinate, + const fs_reg &shadow_c, + fs_reg lod, const fs_reg &lod2, + const fs_reg &sample_index, + const fs_reg &mcs, + const fs_reg &surface, + const fs_reg &sampler, + const fs_reg &tg4_offset, + unsigned coord_components, + unsigned grad_components) +{ + const gen_device_info *devinfo = bld.shader->devinfo; + unsigned reg_width = bld.dispatch_width() / 8; + unsigned header_size = 0, length = 0; + fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE]; + for (unsigned i = 0; i < ARRAY_SIZE(sources); i++) + sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F); + + if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET || + inst->offset != 0 || inst->eot || + op == SHADER_OPCODE_SAMPLEINFO || + is_high_sampler(devinfo, sampler)) { + /* For general texture offsets (no txf workaround), we need a header to + * put them in. Note that we're only reserving space for it in the + * message payload as it will be initialized implicitly by the + * generator. + * + * TG4 needs to place its channel select in the header, for interaction + * with ARB_texture_swizzle. The sampler index is only 4-bits, so for + * larger sampler numbers we need to offset the Sampler State Pointer in + * the header. + */ + header_size = 1; + sources[0] = fs_reg(); + length++; + + /* If we're requesting fewer than four channels worth of response, + * and we have an explicit header, we need to set up the sampler + * writemask. It's reversed from normal: 1 means "don't write". 
+ */ + if (!inst->eot && regs_written(inst) != 4 * reg_width) { + assert(regs_written(inst) % reg_width == 0); + unsigned mask = ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf; + inst->offset |= mask << 12; + } + } + + if (shadow_c.file != BAD_FILE) { + bld.MOV(sources[length], shadow_c); + length++; + } + + bool coordinate_done = false; + + /* Set up the LOD info */ + switch (op) { + case FS_OPCODE_TXB: + case SHADER_OPCODE_TXL: + if (devinfo->gen >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) { + op = SHADER_OPCODE_TXL_LZ; + break; + } + bld.MOV(sources[length], lod); + length++; + break; + case SHADER_OPCODE_TXD: + /* TXD should have been lowered in SIMD16 mode. */ + assert(bld.dispatch_width() == 8); + + /* Load dPdx and the coordinate together: + * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z + */ + for (unsigned i = 0; i < coord_components; i++) { + bld.MOV(sources[length++], offset(coordinate, bld, i)); + + /* For cube map array, the coordinate is (u,v,r,ai) but there are + * only derivatives for (u, v, r). + */ + if (i < grad_components) { + bld.MOV(sources[length++], offset(lod, bld, i)); + bld.MOV(sources[length++], offset(lod2, bld, i)); + } + } + + coordinate_done = true; + break; + case SHADER_OPCODE_TXS: + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod); + length++; + break; + case SHADER_OPCODE_TXF: + /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. + * On Gen9 they are u, v, lod, r + */ + bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), coordinate); + + if (devinfo->gen >= 9) { + if (coord_components >= 2) { + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), + offset(coordinate, bld, 1)); + } else { + sources[length] = brw_imm_d(0); + } + length++; + } + + if (devinfo->gen >= 9 && lod.is_zero()) { + op = SHADER_OPCODE_TXF_LZ; + } else { + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod); + length++; + } + + for (unsigned i = devinfo->gen >= 9 ? 
2 : 1; i < coord_components; i++) + bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), + offset(coordinate, bld, i)); + + coordinate_done = true; + break; + + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_CMS_W: + case SHADER_OPCODE_TXF_UMS: + case SHADER_OPCODE_TXF_MCS: + if (op == SHADER_OPCODE_TXF_UMS || + op == SHADER_OPCODE_TXF_CMS || + op == SHADER_OPCODE_TXF_CMS_W) { + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index); + length++; + } + + if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) { + /* Data from the multisample control surface. */ + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs); + length++; + + /* On Gen9+ we'll use ld2dms_w instead which has two registers for + * the MCS data. + */ + if (op == SHADER_OPCODE_TXF_CMS_W) { + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), + mcs.file == IMM ? + mcs : + offset(mcs, bld, 1)); + length++; + } + } + + /* There is no offsetting for this message; just copy in the integer + * texture coordinates. 
+ */ + for (unsigned i = 0; i < coord_components; i++) + bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), + offset(coordinate, bld, i)); + + coordinate_done = true; + break; + case SHADER_OPCODE_TG4_OFFSET: + /* More crazy intermixing */ + for (unsigned i = 0; i < 2; i++) /* u, v */ + bld.MOV(sources[length++], offset(coordinate, bld, i)); + + for (unsigned i = 0; i < 2; i++) /* offu, offv */ + bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), + offset(tg4_offset, bld, i)); + + if (coord_components == 3) /* r if present */ + bld.MOV(sources[length++], offset(coordinate, bld, 2)); + + coordinate_done = true; + break; + default: + break; + } + + /* Set up the coordinate (except for cases where it was done above) */ + if (!coordinate_done) { + for (unsigned i = 0; i < coord_components; i++) + bld.MOV(sources[length++], offset(coordinate, bld, i)); + } + + int mlen; + if (reg_width == 2) + mlen = length * reg_width - header_size; + else + mlen = length * reg_width; + + const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen), + BRW_REGISTER_TYPE_F); + bld.LOAD_PAYLOAD(src_payload, sources, length, header_size); + + /* Generate the SEND. */ + inst->opcode = op; + inst->src[0] = src_payload; + inst->src[1] = surface; + inst->src[2] = sampler; + inst->resize_sources(3); + inst->mlen = mlen; + inst->header_size = header_size; + + /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. 
*/ + assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE); +} + +static void +lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op) +{ + const gen_device_info *devinfo = bld.shader->devinfo; + const fs_reg &coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE]; + const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C]; + const fs_reg &lod = inst->src[TEX_LOGICAL_SRC_LOD]; + const fs_reg &lod2 = inst->src[TEX_LOGICAL_SRC_LOD2]; + const fs_reg &sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX]; + const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS]; + const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE]; + const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER]; + const fs_reg &tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET]; + assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM); + const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud; + assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM); + const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud; + + if (devinfo->gen >= 7) { + lower_sampler_logical_send_gen7(bld, inst, op, coordinate, + shadow_c, lod, lod2, sample_index, + mcs, surface, sampler, tg4_offset, + coord_components, grad_components); + } else if (devinfo->gen >= 5) { + lower_sampler_logical_send_gen5(bld, inst, op, coordinate, + shadow_c, lod, lod2, sample_index, + surface, sampler, + coord_components, grad_components); + } else { + lower_sampler_logical_send_gen4(bld, inst, op, coordinate, + shadow_c, lod, lod2, + surface, sampler, + coord_components, grad_components); + } +} + +/** + * Initialize the header present in some typed and untyped surface + * messages. 
+ */ +static fs_reg +emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask) +{ + fs_builder ubld = bld.exec_all().group(8, 0); + const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD); + ubld.MOV(dst, brw_imm_d(0)); + ubld.MOV(component(dst, 7), sample_mask); + return dst; +} + +static void +lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op, + const fs_reg &sample_mask) +{ + /* Get the logical send arguments. */ + const fs_reg &addr = inst->src[0]; + const fs_reg &src = inst->src[1]; + const fs_reg &surface = inst->src[2]; + const UNUSED fs_reg &dims = inst->src[3]; + const fs_reg &arg = inst->src[4]; + + /* Calculate the total number of components of the payload. */ + const unsigned addr_sz = inst->components_read(0); + const unsigned src_sz = inst->components_read(1); + const unsigned header_sz = (sample_mask.file == BAD_FILE ? 0 : 1); + const unsigned sz = header_sz + addr_sz + src_sz; + + /* Allocate space for the payload. */ + fs_reg *const components = new fs_reg[sz]; + const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz); + unsigned n = 0; + + /* Construct the payload. */ + if (header_sz) + components[n++] = emit_surface_header(bld, sample_mask); + + for (unsigned i = 0; i < addr_sz; i++) + components[n++] = offset(addr, bld, i); + + for (unsigned i = 0; i < src_sz; i++) + components[n++] = offset(src, bld, i); + + bld.LOAD_PAYLOAD(payload, components, sz, header_sz); + + /* Update the original instruction. 
*/ + inst->opcode = op; + inst->mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8; + inst->header_size = header_sz; + + inst->src[0] = payload; + inst->src[1] = surface; + inst->src[2] = arg; + inst->resize_sources(3); + + delete[] components; +} + +static void +lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst) +{ + const gen_device_info *devinfo = bld.shader->devinfo; + + if (devinfo->gen >= 7) { + /* We are switching the instruction from an ALU-like instruction to a + * send-from-grf instruction. Since sends can't handle strides or + * source modifiers, we have to make a copy of the offset source. + */ + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.MOV(tmp, inst->src[1]); + inst->src[1] = tmp; + + inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7; + + } else { + const fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->gen), + BRW_REGISTER_TYPE_UD); + + bld.MOV(byte_offset(payload, REG_SIZE), inst->src[1]); + + inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4; + inst->resize_sources(1); + inst->base_mrf = payload.nr; + inst->header_size = 1; + inst->mlen = 1 + inst->exec_size / 8; + } +} + +static void +lower_math_logical_send(const fs_builder &bld, fs_inst *inst) +{ + assert(bld.shader->devinfo->gen < 6); + + inst->base_mrf = 2; + inst->mlen = inst->sources * inst->exec_size / 8; + + if (inst->sources > 1) { + /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13 + * "Message Payload": + * + * "Operand0[7]. For the INT DIV functions, this operand is the + * denominator." + * ... + * "Operand1[7]. For the INT DIV functions, this operand is the + * numerator." + */ + const bool is_int_div = inst->opcode != SHADER_OPCODE_POW; + const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0]; + const fs_reg src1 = is_int_div ? 
inst->src[0] : inst->src[1]; + + inst->resize_sources(1); + inst->src[0] = src0; + + assert(inst->exec_size == 8); + bld.MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type), src1); + } +} + +bool +fs_visitor::lower_logical_sends() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + const fs_builder ibld(this, block, inst); + + switch (inst->opcode) { + case FS_OPCODE_FB_WRITE_LOGICAL: + assert(stage == MESA_SHADER_FRAGMENT); + lower_fb_write_logical_send(ibld, inst, + brw_wm_prog_data(prog_data), + (const brw_wm_prog_key *)key, + payload); + break; + + case FS_OPCODE_FB_READ_LOGICAL: + lower_fb_read_logical_send(ibld, inst); + break; + + case SHADER_OPCODE_TEX_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX); + break; + + case SHADER_OPCODE_TXD_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD); + break; + + case SHADER_OPCODE_TXF_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF); + break; + + case SHADER_OPCODE_TXL_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL); + break; + + case SHADER_OPCODE_TXS_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS); + break; + + case FS_OPCODE_TXB_LOGICAL: + lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB); + break; + + case SHADER_OPCODE_TXF_CMS_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS); + break; + + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W); + break; + + case SHADER_OPCODE_TXF_UMS_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS); + break; + + case SHADER_OPCODE_TXF_MCS_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS); + break; + + case SHADER_OPCODE_LOD_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD); + break; + + case SHADER_OPCODE_TG4_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4); + break; + + case 
SHADER_OPCODE_TG4_OFFSET_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET); + break; + + case SHADER_OPCODE_SAMPLEINFO_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO); + break; + + case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: + lower_surface_logical_send(ibld, inst, + SHADER_OPCODE_UNTYPED_SURFACE_READ, + fs_reg()); + break; + + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: + lower_surface_logical_send(ibld, inst, + SHADER_OPCODE_UNTYPED_SURFACE_WRITE, + ibld.sample_mask_reg()); + break; + + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: + lower_surface_logical_send(ibld, inst, + SHADER_OPCODE_UNTYPED_ATOMIC, + ibld.sample_mask_reg()); + break; + + case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: + lower_surface_logical_send(ibld, inst, + SHADER_OPCODE_TYPED_SURFACE_READ, + brw_imm_d(0xffff)); + break; + + case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: + lower_surface_logical_send(ibld, inst, + SHADER_OPCODE_TYPED_SURFACE_WRITE, + ibld.sample_mask_reg()); + break; + + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: + lower_surface_logical_send(ibld, inst, + SHADER_OPCODE_TYPED_ATOMIC, + ibld.sample_mask_reg()); + break; + + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: + lower_varying_pull_constant_logical_send(ibld, inst); + break; + + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + case SHADER_OPCODE_POW: + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + /* The math opcodes are overloaded for the send-like and + * expression-like instructions which seems kind of icky. Gen6+ has + * a native (but rather quirky) MATH instruction so we don't need to + * do anything here. On Gen4-5 we'll have to lower the Gen6-like + * logical instructions (which we can easily recognize because they + * have mlen = 0) into send-like virtual instructions. 
+ */ + if (devinfo->gen < 6 && inst->mlen == 0) { + lower_math_logical_send(ibld, inst); + break; + + } else { + continue; + } + + default: + continue; + } + + progress = true; + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + +/** + * Get the closest allowed SIMD width for instruction \p inst accounting for + * some common regioning and execution control restrictions that apply to FPU + * instructions. These restrictions don't necessarily have any relevance to + * instructions not executed by the FPU pipeline like extended math, control + * flow or send message instructions. + * + * For virtual opcodes it's really up to the instruction -- In some cases + * (e.g. where a virtual instruction unrolls into a simple sequence of FPU + * instructions) it may simplify virtual instruction lowering if we can + * enforce FPU-like regioning restrictions already on the virtual instruction, + * in other cases (e.g. virtual send-like instructions) this may be + * excessively restrictive. + */ +static unsigned +get_fpu_lowered_simd_width(const struct gen_device_info *devinfo, + const fs_inst *inst) +{ + /* Maximum execution size representable in the instruction controls. */ + unsigned max_width = MIN2(32, inst->exec_size); + + /* According to the PRMs: + * "A. In Direct Addressing mode, a source cannot span more than 2 + * adjacent GRF registers. + * B. A destination cannot span more than 2 adjacent GRF registers." + * + * Look for the source or destination with the largest register region + * which is the one that is going to limit the overall execution size of + * the instruction due to this rule. + */ + unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE); + + for (unsigned i = 0; i < inst->sources; i++) + reg_count = MAX2(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE)); + + /* Calculate the maximum execution size of the instruction based on the + * factor by which it goes over the hardware limit of 2 GRFs. 
+ */ + if (reg_count > 2) + max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, 2)); + + /* According to the IVB PRMs: + * "When destination spans two registers, the source MUST span two + * registers. The exception to the above rule: + * + * - When source is scalar, the source registers are not incremented. + * - When source is packed integer Word and destination is packed + * integer DWord, the source register is not incremented but the + * source sub register is incremented." + * + * The hardware specs from Gen4 to Gen7.5 mention similar regioning + * restrictions. The code below intentionally doesn't check whether the + * destination type is integer because empirically the hardware doesn't + * seem to care what the actual type is as long as it's dword-aligned. + */ + if (devinfo->gen < 8) { + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->size_written > REG_SIZE && + inst->size_read(i) != 0 && inst->size_read(i) <= REG_SIZE && + !is_uniform(inst->src[i]) && + !(type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 && + type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1)) { + const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE); + max_width = MIN2(max_width, inst->exec_size / reg_count); + } + } + } + + /* From the IVB PRMs: + * "When an instruction is SIMD32, the low 16 bits of the execution mask + * are applied for both halves of the SIMD32 instruction. If different + * execution mask channels are required, split the instruction into two + * SIMD16 instructions." + * + * There is similar text in the HSW PRMs. Gen4-6 don't even implement + * 32-wide control flow support in hardware and will behave similarly. + */ + if (devinfo->gen < 8 && !inst->force_writemask_all) + max_width = MIN2(max_width, 16); + + /* From the IVB PRMs (applies to HSW too): + * "Instructions with condition modifiers must not use SIMD32." 
+ * + * From the BDW PRMs (applies to later hardware too): + * "Ternary instruction with condition modifiers must not use SIMD32." + */ + if (inst->conditional_mod && (devinfo->gen < 8 || inst->is_3src(devinfo))) + max_width = MIN2(max_width, 16); + + /* From the IVB PRMs (applies to other devices that don't have the + * gen_device_info::supports_simd16_3src flag set): + * "In Align16 access mode, SIMD16 is not allowed for DW operations and + * SIMD8 is not allowed for DF operations." + */ + if (inst->is_3src(devinfo) && !devinfo->supports_simd16_3src) + max_width = MIN2(max_width, inst->exec_size / reg_count); + + /* Pre-Gen8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is + * the 8-bit quarter of the execution mask signals specified in the + * instruction control fields) for the second compressed half of any + * single-precision instruction (for double-precision instructions + * it's hardwired to use NibCtrl+1, at least on HSW), which means that + * the EU will apply the wrong execution controls for the second + * sequential GRF write if the number of channels per GRF is not exactly + * eight in single-precision mode (or four in double-float mode). + * + * In this situation we calculate the maximum size of the split + * instructions so they only ever write to a single register. + */ + if (devinfo->gen < 8 && inst->size_written > REG_SIZE && + !inst->force_writemask_all) { + const unsigned channels_per_grf = inst->exec_size / + DIV_ROUND_UP(inst->size_written, REG_SIZE); + unsigned exec_type_size = 0; + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file != BAD_FILE) + exec_type_size = MAX2(exec_type_size, type_sz(inst->src[i].type)); + } + assert(exec_type_size); + + /* The hardware shifts exactly 8 channels per compressed half of the + * instruction in single-precision mode and exactly 4 in double-precision. + */ + if (channels_per_grf != (exec_type_size == 8 ? 
4 : 8)) + max_width = MIN2(max_width, channels_per_grf); + } + + /* Only power-of-two execution sizes are representable in the instruction + * control fields. + */ + return 1 << _mesa_logbase2(max_width); +} + +/** + * Get the maximum allowed SIMD width for instruction \p inst accounting for + * various payload size restrictions that apply to sampler message + * instructions. + * + * This is only intended to provide a maximum theoretical bound for the + * execution size of the message based on the number of argument components + * alone, which in most cases will determine whether the SIMD8 or SIMD16 + * variant of the message can be used, though some messages may have + * additional restrictions not accounted for here (e.g. pre-ILK hardware uses + * the message length to determine the exact SIMD width and argument count, + * which makes a number of sampler message combinations impossible to + * represent). + */ +static unsigned +get_sampler_lowered_simd_width(const struct gen_device_info *devinfo, + const fs_inst *inst) +{ + /* Calculate the number of coordinate components that have to be present + * assuming that additional arguments follow the texel coordinates in the + * message payload. On IVB+ there is no need for padding, on ILK-SNB we + * need to pad to four or three components depending on the message, + * pre-ILK we need to pad to at most three components. + */ + const unsigned req_coord_components = + (devinfo->gen >= 7 || + !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 : + (devinfo->gen >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL && + inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 : + 3; + + /* On Gen9+ the LOD argument is for free if we're able to use the LZ + * variant of the TXL or TXF message. 
+ */ + const bool implicit_lod = devinfo->gen >= 9 && + (inst->opcode == SHADER_OPCODE_TXL || + inst->opcode == SHADER_OPCODE_TXF) && + inst->src[TEX_LOGICAL_SRC_LOD].is_zero(); + + /* Calculate the total number of argument components that need to be passed + * to the sampler unit. + */ + const unsigned num_payload_components = + MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE), + req_coord_components) + + inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) + + (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) + + inst->components_read(TEX_LOGICAL_SRC_LOD2) + + inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) + + (inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ? + inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) + + inst->components_read(TEX_LOGICAL_SRC_MCS); + + /* SIMD16 messages with more than five arguments exceed the maximum message + * size supported by the sampler, regardless of whether a header is + * provided or not. + */ + return MIN2(inst->exec_size, + num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16); +} + +/** + * Get the closest native SIMD width supported by the hardware for instruction + * \p inst. The instruction will be left untouched by + * fs_visitor::lower_simd_width() if the returned value is equal to the + * original execution size. 
+ */ +static unsigned +get_lowered_simd_width(const struct gen_device_info *devinfo, + const fs_inst *inst) +{ + switch (inst->opcode) { + case BRW_OPCODE_MOV: + case BRW_OPCODE_SEL: + case BRW_OPCODE_NOT: + case BRW_OPCODE_AND: + case BRW_OPCODE_OR: + case BRW_OPCODE_XOR: + case BRW_OPCODE_SHR: + case BRW_OPCODE_SHL: + case BRW_OPCODE_ASR: + case BRW_OPCODE_CMPN: + case BRW_OPCODE_CSEL: + case BRW_OPCODE_F32TO16: + case BRW_OPCODE_F16TO32: + case BRW_OPCODE_BFREV: + case BRW_OPCODE_BFE: + case BRW_OPCODE_ADD: + case BRW_OPCODE_MUL: + case BRW_OPCODE_AVG: + case BRW_OPCODE_FRC: + case BRW_OPCODE_RNDU: + case BRW_OPCODE_RNDD: + case BRW_OPCODE_RNDE: + case BRW_OPCODE_RNDZ: + case BRW_OPCODE_LZD: + case BRW_OPCODE_FBH: + case BRW_OPCODE_FBL: + case BRW_OPCODE_CBIT: + case BRW_OPCODE_SAD2: + case BRW_OPCODE_MAD: + case BRW_OPCODE_LRP: + case FS_OPCODE_PACK: + return get_fpu_lowered_simd_width(devinfo, inst); + + case BRW_OPCODE_CMP: { + /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that + * when the destination is a GRF the dependency-clear bit on the flag + * register is cleared early. + * + * Suggested workarounds are to disable coissuing CMP instructions + * or to split CMP(16) instructions into two CMP(8) instructions. + * + * We choose to split into CMP(8) instructions since disabling + * coissuing would affect CMP instructions not otherwise affected by + * the errata. + */ + const unsigned max_width = (devinfo->gen == 7 && !devinfo->is_haswell && + !inst->dst.is_null() ? 8 : ~0); + return MIN2(max_width, get_fpu_lowered_simd_width(devinfo, inst)); + } + case BRW_OPCODE_BFI1: + case BRW_OPCODE_BFI2: + /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we + * should + * "Force BFI instructions to be executed always in SIMD8." + */ + return MIN2(devinfo->is_haswell ? 
8 : ~0u, + get_fpu_lowered_simd_width(devinfo, inst)); + + case BRW_OPCODE_IF: + assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16); + return inst->exec_size; + + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + /* Unary extended math instructions are limited to SIMD8 on Gen4 and + * Gen6. + */ + return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) : + devinfo->gen == 5 || devinfo->is_g4x ? MIN2(16, inst->exec_size) : + MIN2(8, inst->exec_size)); + + case SHADER_OPCODE_POW: + /* SIMD16 is only allowed on Gen7+. */ + return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) : + MIN2(8, inst->exec_size)); + + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + /* Integer division is limited to SIMD8 on all generations. */ + return MIN2(8, inst->exec_size); + + case FS_OPCODE_LINTERP: + case FS_OPCODE_GET_BUFFER_SIZE: + case FS_OPCODE_DDX_COARSE: + case FS_OPCODE_DDX_FINE: + case FS_OPCODE_DDY_COARSE: + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7: + case FS_OPCODE_PACK_HALF_2x16_SPLIT: + case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X: + case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y: + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + return MIN2(16, inst->exec_size); + + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: + /* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch + * message used to implement varying pull constant loads, so expand it + * to SIMD16. An alternative with longer message payload length but + * shorter return payload would be to use the SIMD8 sampler message that + * takes (header, u, v, r) as parameters instead of (header, u). + */ + return (devinfo->gen == 4 ? 
16 : MIN2(16, inst->exec_size)); + + case FS_OPCODE_DDY_FINE: + /* The implementation of this virtual opcode may require emitting + * compressed Align16 instructions, which are severely limited on some + * generations. + * + * From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register + * Region Restrictions): + * + * "In Align16 access mode, SIMD16 is not allowed for DW operations + * and SIMD8 is not allowed for DF operations." + * + * In this context, "DW operations" means "operations acting on 32-bit + * values", so it includes operations on floats. + * + * Gen4 has a similar restriction. From the i965 PRM, section 11.5.3 + * (Instruction Compression -> Rules and Restrictions): + * + * "A compressed instruction must be in Align1 access mode. Align16 + * mode instructions cannot be compressed." + * + * Similar text exists in the g45 PRM. + * + * Empirically, compressed align16 instructions using odd register + * numbers don't appear to work on Sandybridge either. + */ + return (devinfo->gen == 4 || devinfo->gen == 6 || + (devinfo->gen == 7 && !devinfo->is_haswell) ? + MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size)); + + case SHADER_OPCODE_MULH: + /* MULH is lowered to the MUL/MACH sequence using the accumulator, which + * is 8-wide on Gen7+. + */ + return (devinfo->gen >= 7 ? 8 : + get_fpu_lowered_simd_width(devinfo, inst)); + + case FS_OPCODE_FB_WRITE_LOGICAL: + /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them + * here. + */ + assert(devinfo->gen != 6 || + inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE || + inst->exec_size == 8); + /* Dual-source FB writes are unsupported in SIMD16 mode. */ + return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ? 
+ 8 : MIN2(16, inst->exec_size)); + + case FS_OPCODE_FB_READ_LOGICAL: + return MIN2(16, inst->exec_size); + + case SHADER_OPCODE_TEX_LOGICAL: + case SHADER_OPCODE_TXF_CMS_LOGICAL: + case SHADER_OPCODE_TXF_UMS_LOGICAL: + case SHADER_OPCODE_TXF_MCS_LOGICAL: + case SHADER_OPCODE_LOD_LOGICAL: + case SHADER_OPCODE_TG4_LOGICAL: + case SHADER_OPCODE_SAMPLEINFO_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + return get_sampler_lowered_simd_width(devinfo, inst); + + case SHADER_OPCODE_TXD_LOGICAL: + /* TXD is unsupported in SIMD16 mode. */ + return 8; + + case SHADER_OPCODE_TXL_LOGICAL: + case FS_OPCODE_TXB_LOGICAL: + /* Only one execution size is representable pre-ILK depending on whether + * the shadow reference argument is present. + */ + if (devinfo->gen == 4) + return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8; + else + return get_sampler_lowered_simd_width(devinfo, inst); + + case SHADER_OPCODE_TXF_LOGICAL: + case SHADER_OPCODE_TXS_LOGICAL: + /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD + * messages. Use SIMD16 instead. 
+ */ + if (devinfo->gen == 4) + return 16; + else + return get_sampler_lowered_simd_width(devinfo, inst); + + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: + return 8; + + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: + return MIN2(16, inst->exec_size); + + case SHADER_OPCODE_URB_READ_SIMD8: + case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: + case SHADER_OPCODE_URB_WRITE_SIMD8: + case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT: + case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED: + case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: + return MIN2(8, inst->exec_size); + + case SHADER_OPCODE_MOV_INDIRECT: + /* Prior to Broadwell, we only have 8 address subregisters */ + return MIN3(devinfo->gen >= 8 ? 16 : 8, + 2 * REG_SIZE / (inst->dst.stride * type_sz(inst->dst.type)), + inst->exec_size); + + case SHADER_OPCODE_LOAD_PAYLOAD: { + const unsigned reg_count = + DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE); + + if (reg_count > 2) { + /* Only LOAD_PAYLOAD instructions with per-channel destination region + * can be easily lowered (which excludes headers and heterogeneous + * types). + */ + assert(!inst->header_size); + for (unsigned i = 0; i < inst->sources; i++) + assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) || + inst->src[i].file == BAD_FILE); + + return inst->exec_size / DIV_ROUND_UP(reg_count, 2); + } else { + return inst->exec_size; + } + } + default: + return inst->exec_size; + } +} + +/** + * Return true if splitting out the group of channels of instruction \p inst + * given by lbld.group() requires allocating a temporary for the i-th source + * of the lowered instruction. 
+ */ +static inline bool +needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i) +{ + return !(is_periodic(inst->src[i], lbld.dispatch_width()) || + (inst->components_read(i) == 1 && + lbld.dispatch_width() <= inst->exec_size)); +} + +/** + * Extract the data that would be consumed by the channel group given by + * lbld.group() from the i-th source region of instruction \p inst and return + * it as result in packed form. If any copy instructions are required they + * will be emitted before the given \p inst in \p block. + */ +static fs_reg +emit_unzip(const fs_builder &lbld, bblock_t *block, fs_inst *inst, + unsigned i) +{ + /* Specified channel group from the source region. */ + const fs_reg src = horiz_offset(inst->src[i], lbld.group()); + + if (needs_src_copy(lbld, inst, i)) { + /* Builder of the right width to perform the copy avoiding uninitialized + * data if the lowered execution size is greater than the original + * execution size of the instruction. + */ + const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(), + inst->exec_size), 0); + const fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i)); + + for (unsigned k = 0; k < inst->components_read(i); ++k) + cbld.at(block, inst) + .MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k)); + + return tmp; + + } else if (is_periodic(inst->src[i], lbld.dispatch_width())) { + /* The source is invariant for all dispatch_width-wide groups of the + * original region. + */ + return inst->src[i]; + + } else { + /* We can just point the lowered instruction at the right channel group + * from the original region. + */ + return src; + } +} + +/** + * Return true if splitting out the group of channels of instruction \p inst + * given by lbld.group() requires allocating a temporary for the destination + * of the lowered instruction and copying the data back to the original + * destination region. 
+ */ +static inline bool +needs_dst_copy(const fs_builder &lbld, const fs_inst *inst) +{ + /* If the instruction writes more than one component we'll have to shuffle + * the results of multiple lowered instructions in order to make sure that + * they end up arranged correctly in the original destination region. + */ + if (inst->size_written > inst->dst.component_size(inst->exec_size)) + return true; + + /* If the lowered execution size is larger than the original the result of + * the instruction won't fit in the original destination, so we'll have to + * allocate a temporary in any case. + */ + if (lbld.dispatch_width() > inst->exec_size) + return true; + + for (unsigned i = 0; i < inst->sources; i++) { + /* If we already made a copy of the source for other reasons there won't + * be any overlap with the destination. + */ + if (needs_src_copy(lbld, inst, i)) + continue; + + /* In order to keep the logic simple we emit a copy whenever the + * destination region doesn't exactly match an overlapping source, which + * may point at the source and destination not being aligned group by + * group which could cause one of the lowered instructions to overwrite + * the data read from the same source by other lowered instructions. + */ + if (regions_overlap(inst->dst, inst->size_written, + inst->src[i], inst->size_read(i)) && + !inst->dst.equals(inst->src[i])) + return true; + } + + return false; +} + +/** + * Insert data from a packed temporary into the channel group given by + * lbld.group() of the destination region of instruction \p inst and return + * the temporary as result. If any copy instructions are required they will + * be emitted around the given \p inst in \p block. + */ +static fs_reg +emit_zip(const fs_builder &lbld, bblock_t *block, fs_inst *inst) +{ + /* Builder of the right width to perform the copy avoiding uninitialized + * data if the lowered execution size is greater than the original + * execution size of the instruction. 
+ */ + const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(), + inst->exec_size), 0); + + /* Specified channel group from the destination region. */ + const fs_reg dst = horiz_offset(inst->dst, lbld.group()); + const unsigned dst_size = inst->size_written / + inst->dst.component_size(inst->exec_size); + + if (needs_dst_copy(lbld, inst)) { + const fs_reg tmp = lbld.vgrf(inst->dst.type, dst_size); + + if (inst->predicate) { + /* Handle predication by copying the original contents of + * the destination into the temporary before emitting the + * lowered instruction. + */ + for (unsigned k = 0; k < dst_size; ++k) + cbld.at(block, inst) + .MOV(offset(tmp, lbld, k), offset(dst, inst->exec_size, k)); + } + + for (unsigned k = 0; k < dst_size; ++k) + cbld.at(block, inst->next) + .MOV(offset(dst, inst->exec_size, k), offset(tmp, lbld, k)); + + return tmp; + + } else { + /* No need to allocate a temporary for the lowered instruction, just + * take the right group of channels from the original region. + */ + return dst; + } +} + +bool +fs_visitor::lower_simd_width() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + const unsigned lower_width = get_lowered_simd_width(devinfo, inst); + + if (lower_width != inst->exec_size) { + /* Builder matching the original instruction. We may also need to + * emit an instruction of width larger than the original, set the + * execution size of the builder to the highest of both for now so + * we're sure that both cases can be handled. + */ + const unsigned max_width = MAX2(inst->exec_size, lower_width); + const fs_builder ibld = bld.at(block, inst) + .exec_all(inst->force_writemask_all) + .group(max_width, inst->group / max_width); + + /* Split the copies in chunks of the execution width of either the + * original or the lowered instruction, whichever is lower. 
+ */ + const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width); + const unsigned dst_size = inst->size_written / + inst->dst.component_size(inst->exec_size); + + assert(!inst->writes_accumulator && !inst->mlen); + + for (unsigned i = 0; i < n; i++) { + /* Emit a copy of the original instruction with the lowered width. + * If the EOT flag was set throw it away except for the last + * instruction to avoid killing the thread prematurely. + */ + fs_inst split_inst = *inst; + split_inst.exec_size = lower_width; + split_inst.eot = inst->eot && i == n - 1; + + /* Select the correct channel enables for the i-th group, then + * transform the sources and destination and emit the lowered + * instruction. + */ + const fs_builder lbld = ibld.group(lower_width, i); + + for (unsigned j = 0; j < inst->sources; j++) + split_inst.src[j] = emit_unzip(lbld, block, inst, j); + + split_inst.dst = emit_zip(lbld, block, inst); + split_inst.size_written = + split_inst.dst.component_size(lower_width) * dst_size; + + lbld.emit(split_inst); + } + + inst->remove(block); + progress = true; + } + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + +void +fs_visitor::dump_instructions() +{ + dump_instructions(NULL); +} + +void +fs_visitor::dump_instructions(const char *name) +{ + FILE *file = stderr; + if (name && geteuid() != 0) { + file = fopen(name, "w"); + if (!file) + file = stderr; + } + + if (cfg) { + calculate_register_pressure(); + int ip = 0, max_pressure = 0; + foreach_block_and_inst(block, backend_instruction, inst, cfg) { + max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]); + fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip); + dump_instruction(inst, file); + ip++; + } + fprintf(file, "Maximum %3d registers live at once.\n", max_pressure); + } else { + int ip = 0; + foreach_in_list(backend_instruction, inst, &instructions) { + fprintf(file, "%4d: ", ip++); + dump_instruction(inst, file); + } + } + + if (file != stderr) { + fclose(file); + } +} 
+ +void +fs_visitor::dump_instruction(backend_instruction *be_inst) +{ + dump_instruction(be_inst, stderr); +} + +void +fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) +{ + fs_inst *inst = (fs_inst *)be_inst; + + if (inst->predicate) { + fprintf(file, "(%cf0.%d) ", + inst->predicate_inverse ? '-' : '+', + inst->flag_subreg); + } + + fprintf(file, "%s", brw_instruction_name(devinfo, inst->opcode)); + if (inst->saturate) + fprintf(file, ".sat"); + if (inst->conditional_mod) { + fprintf(file, "%s", conditional_modifier[inst->conditional_mod]); + if (!inst->predicate && + (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL && + inst->opcode != BRW_OPCODE_IF && + inst->opcode != BRW_OPCODE_WHILE))) { + fprintf(file, ".f0.%d", inst->flag_subreg); + } + } + fprintf(file, "(%d) ", inst->exec_size); + + if (inst->mlen) { + fprintf(file, "(mlen: %d) ", inst->mlen); + } + + if (inst->eot) { + fprintf(file, "(EOT) "); + } + + switch (inst->dst.file) { + case VGRF: + fprintf(file, "vgrf%d", inst->dst.nr); + break; + case FIXED_GRF: + fprintf(file, "g%d", inst->dst.nr); + break; + case MRF: + fprintf(file, "m%d", inst->dst.nr); + break; + case BAD_FILE: + fprintf(file, "(null)"); + break; + case UNIFORM: + fprintf(file, "***u%d***", inst->dst.nr); + break; + case ATTR: + fprintf(file, "***attr%d***", inst->dst.nr); + break; + case ARF: + switch (inst->dst.nr) { + case BRW_ARF_NULL: + fprintf(file, "null"); + break; + case BRW_ARF_ADDRESS: + fprintf(file, "a0.%d", inst->dst.subnr); + break; + case BRW_ARF_ACCUMULATOR: + fprintf(file, "acc%d", inst->dst.subnr); + break; + case BRW_ARF_FLAG: + fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr); + break; + default: + fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr); + break; + } + break; + case IMM: + unreachable("not reached"); + } + + if (inst->dst.offset || + (inst->dst.file == VGRF && + alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) { + const unsigned reg_size = 
(inst->dst.file == UNIFORM ? 4 : REG_SIZE); + fprintf(file, "+%d.%d", inst->dst.offset / reg_size, + inst->dst.offset % reg_size); + } + + if (inst->dst.stride != 1) + fprintf(file, "<%u>", inst->dst.stride); + fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type)); + + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].negate) + fprintf(file, "-"); + if (inst->src[i].abs) + fprintf(file, "|"); + switch (inst->src[i].file) { + case VGRF: + fprintf(file, "vgrf%d", inst->src[i].nr); + break; + case FIXED_GRF: + fprintf(file, "g%d", inst->src[i].nr); + break; + case MRF: + fprintf(file, "***m%d***", inst->src[i].nr); + break; + case ATTR: + fprintf(file, "attr%d", inst->src[i].nr); + break; + case UNIFORM: + fprintf(file, "u%d", inst->src[i].nr); + break; + case BAD_FILE: + fprintf(file, "(null)"); + break; + case IMM: + switch (inst->src[i].type) { + case BRW_REGISTER_TYPE_F: + fprintf(file, "%-gf", inst->src[i].f); + break; + case BRW_REGISTER_TYPE_DF: + fprintf(file, "%fdf", inst->src[i].df); + break; + case BRW_REGISTER_TYPE_W: + case BRW_REGISTER_TYPE_D: + fprintf(file, "%dd", inst->src[i].d); + break; + case BRW_REGISTER_TYPE_UW: + case BRW_REGISTER_TYPE_UD: + fprintf(file, "%uu", inst->src[i].ud); + break; + case BRW_REGISTER_TYPE_VF: + fprintf(file, "[%-gF, %-gF, %-gF, %-gF]", + brw_vf_to_float((inst->src[i].ud >> 0) & 0xff), + brw_vf_to_float((inst->src[i].ud >> 8) & 0xff), + brw_vf_to_float((inst->src[i].ud >> 16) & 0xff), + brw_vf_to_float((inst->src[i].ud >> 24) & 0xff)); + break; + default: + fprintf(file, "???"); + break; + } + break; + case ARF: + switch (inst->src[i].nr) { + case BRW_ARF_NULL: + fprintf(file, "null"); + break; + case BRW_ARF_ADDRESS: + fprintf(file, "a0.%d", inst->src[i].subnr); + break; + case BRW_ARF_ACCUMULATOR: + fprintf(file, "acc%d", inst->src[i].subnr); + break; + case BRW_ARF_FLAG: + fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr); + break; + default: + fprintf(file, "arf%d.%d", 
inst->src[i].nr & 0xf, inst->src[i].subnr); + break; + } + break; + } + + if (inst->src[i].offset || + (inst->src[i].file == VGRF && + alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) { + const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE); + fprintf(file, "+%d.%d", inst->src[i].offset / reg_size, + inst->src[i].offset % reg_size); + } + + if (inst->src[i].abs) + fprintf(file, "|"); + + if (inst->src[i].file != IMM) { + unsigned stride; + if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) { + unsigned hstride = inst->src[i].hstride; + stride = (hstride == 0 ? 0 : (1 << (hstride - 1))); + } else { + stride = inst->src[i].stride; + } + if (stride != 1) + fprintf(file, "<%u>", stride); + + fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type)); + } + + if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE) + fprintf(file, ", "); + } + + fprintf(file, " "); + + if (inst->force_writemask_all) + fprintf(file, "NoMask "); + + if (inst->exec_size != dispatch_width) + fprintf(file, "group%d ", inst->group); + + fprintf(file, "\n"); +} + +/** + * Possibly returns an instruction that set up @param reg. + * + * Sometimes we want to take the result of some expression/variable + * dereference tree and rewrite the instruction generating the result + * of the tree. When processing the tree, we know that the + * instructions generated are all writing temporaries that are dead + * outside of this tree. So, if we have some instructions that write + * a temporary, we're free to point that temp write somewhere else. + * + * Note that this doesn't guarantee that the instruction generated + * only reg -- it might be the size=4 destination of a texture instruction. 
+ */ +fs_inst * +fs_visitor::get_instruction_generating_reg(fs_inst *start, + fs_inst *end, + const fs_reg ®) +{ + if (end == start || + end->is_partial_write() || + !reg.equals(end->dst)) { + return NULL; + } else { + return end; + } +} + +void +fs_visitor::setup_fs_payload_gen6() +{ + assert(stage == MESA_SHADER_FRAGMENT); + struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + + assert(devinfo->gen >= 6); + + /* R0-1: masks, pixel X/Y coordinates. */ + payload.num_regs = 2; + /* R2: only for 32-pixel dispatch.*/ + + /* R3-26: barycentric interpolation coordinates. These appear in the + * same order that they appear in the brw_barycentric_mode + * enum. Each set of coordinates occupies 2 registers if dispatch width + * == 8 and 4 registers if dispatch width == 16. Coordinates only + * appear if they were enabled using the "Barycentric Interpolation + * Mode" bits in WM_STATE. + */ + for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) { + if (prog_data->barycentric_interp_modes & (1 << i)) { + payload.barycentric_coord_reg[i] = payload.num_regs; + payload.num_regs += 2; + if (dispatch_width == 16) { + payload.num_regs += 2; + } + } + } + + /* R27: interpolated depth if uses source depth */ + prog_data->uses_src_depth = + (nir->info->inputs_read & (1 << VARYING_SLOT_POS)) != 0; + if (prog_data->uses_src_depth) { + payload.source_depth_reg = payload.num_regs; + payload.num_regs++; + if (dispatch_width == 16) { + /* R28: interpolated depth if not SIMD8. */ + payload.num_regs++; + } + } + + /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */ + prog_data->uses_src_w = + (nir->info->inputs_read & (1 << VARYING_SLOT_POS)) != 0; + if (prog_data->uses_src_w) { + payload.source_w_reg = payload.num_regs; + payload.num_regs++; + if (dispatch_width == 16) { + /* R30: interpolated W if not SIMD8. */ + payload.num_regs++; + } + } + + /* R31: MSAA position offsets. 
*/ + if (prog_data->persample_dispatch && + (nir->info->system_values_read & SYSTEM_BIT_SAMPLE_POS)) { + /* From the Ivy Bridge PRM documentation for 3DSTATE_PS: + * + * "MSDISPMODE_PERSAMPLE is required in order to select + * POSOFFSET_SAMPLE" + * + * So we can only really get sample positions if we are doing real + * per-sample dispatch. If we need gl_SamplePosition and we don't have + * persample dispatch, we hard-code it to 0.5. + */ + prog_data->uses_pos_offset = true; + payload.sample_pos_reg = payload.num_regs; + payload.num_regs++; + } + + /* R32: MSAA input coverage mask */ + prog_data->uses_sample_mask = + (nir->info->system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0; + if (prog_data->uses_sample_mask) { + assert(devinfo->gen >= 7); + payload.sample_mask_in_reg = payload.num_regs; + payload.num_regs++; + if (dispatch_width == 16) { + /* R33: input coverage mask if not SIMD8. */ + payload.num_regs++; + } + } + + /* R34-: bary for 32-pixel. */ + /* R58-59: interp W for 32-pixel. */ + + if (nir->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { + source_depth_to_render_target = true; + } +} + +void +fs_visitor::setup_vs_payload() +{ + /* R0: thread header, R1: urb handles */ + payload.num_regs = 2; +} + +void +fs_visitor::setup_gs_payload() +{ + assert(stage == MESA_SHADER_GEOMETRY); + + struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); + struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data); + + /* R0: thread header, R1: output URB handles */ + payload.num_regs = 2; + + if (gs_prog_data->include_primitive_id) { + /* R2: Primitive ID 0..7 */ + payload.num_regs++; + } + + /* Use a maximum of 24 registers for push-model inputs. */ + const unsigned max_push_components = 24; + + /* If pushing our inputs would take too many registers, reduce the URB read + * length (which is in HWords, or 8 registers), and resort to pulling. 
+ * + * Note that the GS reads <URB Read Length> HWords for every vertex - so we + * have to multiply by VerticesIn to obtain the total storage requirement. + */ + if (8 * vue_prog_data->urb_read_length * nir->info->gs.vertices_in > + max_push_components || gs_prog_data->invocations > 1) { + gs_prog_data->base.include_vue_handles = true; + + /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */ + payload.num_regs += nir->info->gs.vertices_in; + + vue_prog_data->urb_read_length = + ROUND_DOWN_TO(max_push_components / nir->info->gs.vertices_in, 8) / 8; + } +} + +void +fs_visitor::setup_cs_payload() +{ + assert(devinfo->gen >= 7); + payload.num_regs = 1; +} + +void +fs_visitor::calculate_register_pressure() +{ + invalidate_live_intervals(); + calculate_live_intervals(); + + unsigned num_instructions = 0; + foreach_block(block, cfg) + num_instructions += block->instructions.length(); + + regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions); + + for (unsigned reg = 0; reg < alloc.count; reg++) { + for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++) + regs_live_at_ip[ip] += alloc.sizes[reg]; + } +} + +/** + * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones. + * + * The needs_unlit_centroid_workaround ends up producing one of these per + * channel of centroid input, so it's good to clean them up. + * + * An assumption here is that nothing ever modifies the dispatched pixels + * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware + * dictates that anyway. 
+ */ +bool +fs_visitor::opt_drop_redundant_mov_to_flags() +{ + bool flag_mov_found[2] = {false}; + bool progress = false; + + /* Instructions removed by this pass can only be added if this were true */ + if (!devinfo->needs_unlit_centroid_workaround) + return false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + if (inst->is_control_flow()) { + memset(flag_mov_found, 0, sizeof(flag_mov_found)); + } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) { + if (!flag_mov_found[inst->flag_subreg]) { + flag_mov_found[inst->flag_subreg] = true; + } else { + inst->remove(block); + progress = true; + } + } else if (inst->flags_written()) { + flag_mov_found[inst->flag_subreg] = false; + } + } + + return progress; +} + +void +fs_visitor::optimize() +{ + /* Start by validating the shader we currently have. */ + validate(); + + /* bld is the common builder object pointing at the end of the program we + * used to translate it into i965 IR. For the optimization and lowering + * passes coming next, any code added after the end of the program without + * having explicitly called fs_builder::at() clearly points at a mistake. + * Ideally optimization passes wouldn't be part of the visitor so they + * wouldn't have access to bld at all, but they do, so just in case some + * pass forgets to ask for a location explicitly set it to NULL here to + * make it trip. The dispatch width is initialized to a bogus value to + * make sure that optimizations set the execution controls explicitly to + * match the code they are manipulating instead of relying on the defaults. + */ + bld = fs_builder(this, 64); + + assign_constant_locations(); + lower_constant_loads(); + + validate(); + + split_virtual_grfs(); + validate(); + +#define OPT(pass, args...) 
({ \ + pass_num++; \ + bool this_progress = pass(args); \ + \ + if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \ + char filename[64]; \ + snprintf(filename, 64, "%s%d-%s-%02d-%02d-" #pass, \ + stage_abbrev, dispatch_width, nir->info->name, iteration, pass_num); \ + \ + backend_shader::dump_instructions(filename); \ + } \ + \ + validate(); \ + \ + progress = progress || this_progress; \ + this_progress; \ + }) + + if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) { + char filename[64]; + snprintf(filename, 64, "%s%d-%s-00-00-start", + stage_abbrev, dispatch_width, nir->info->name); + + backend_shader::dump_instructions(filename); + } + + bool progress = false; + int iteration = 0; + int pass_num = 0; + + OPT(opt_drop_redundant_mov_to_flags); + + do { + progress = false; + pass_num = 0; + iteration++; + + OPT(remove_duplicate_mrf_writes); + + OPT(opt_algebraic); + OPT(opt_cse); + OPT(opt_copy_propagation); + OPT(opt_predicated_break, this); + OPT(opt_cmod_propagation); + OPT(dead_code_eliminate); + OPT(opt_peephole_sel); + OPT(dead_control_flow_eliminate, this); + OPT(opt_register_renaming); + OPT(opt_saturate_propagation); + OPT(register_coalesce); + OPT(compute_to_mrf); + OPT(eliminate_find_live_channel); + + OPT(compact_virtual_grfs); + } while (progress); + + progress = false; + pass_num = 0; + + if (OPT(lower_pack)) { + OPT(register_coalesce); + OPT(dead_code_eliminate); + } + + if (OPT(lower_d2x)) { + OPT(opt_copy_propagation); + OPT(dead_code_eliminate); + } + + OPT(lower_simd_width); + + /* After SIMD lowering just in case we had to unroll the EOT send. */ + OPT(opt_sampler_eot); + + OPT(lower_logical_sends); + + if (progress) { + OPT(opt_copy_propagation); + /* Only run after logical send lowering because it's easier to implement + * in terms of physical sends. 
+ */ + if (OPT(opt_zero_samples)) + OPT(opt_copy_propagation); + /* Run after logical send lowering to give it a chance to CSE the + * LOAD_PAYLOAD instructions created to construct the payloads of + * e.g. texturing messages in cases where it wasn't possible to CSE the + * whole logical instruction. + */ + OPT(opt_cse); + OPT(register_coalesce); + OPT(compute_to_mrf); + OPT(dead_code_eliminate); + OPT(remove_duplicate_mrf_writes); + OPT(opt_peephole_sel); + } + + OPT(opt_redundant_discard_jumps); + + if (OPT(lower_load_payload)) { + split_virtual_grfs(); + OPT(register_coalesce); + OPT(compute_to_mrf); + OPT(dead_code_eliminate); + } + + OPT(opt_combine_constants); + OPT(lower_integer_multiplication); + + if (devinfo->gen <= 5 && OPT(lower_minmax)) { + OPT(opt_cmod_propagation); + OPT(opt_cse); + OPT(opt_copy_propagation); + OPT(dead_code_eliminate); + } + + lower_uniform_pull_constant_loads(); + + validate(); +} + +/** + * Three source instruction must have a GRF/MRF destination register. + * ARF NULL is not allowed. Fix that up by allocating a temporary GRF. + */ +void +fs_visitor::fixup_3src_null_dest() +{ + bool progress = false; + + foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { + if (inst->is_3src(devinfo) && inst->dst.is_null()) { + inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8), + inst->dst.type); + progress = true; + } + } + + if (progress) + invalidate_live_intervals(); +} + +void +fs_visitor::allocate_registers(bool allow_spilling) +{ + bool allocated_without_spills; + + static const enum instruction_scheduler_mode pre_modes[] = { + SCHEDULE_PRE, + SCHEDULE_PRE_NON_LIFO, + SCHEDULE_PRE_LIFO, + }; + + bool spill_all = allow_spilling && (INTEL_DEBUG & DEBUG_SPILL_FS); + + /* Try each scheduling heuristic to see if it can successfully register + * allocate without spilling. They should be ordered by decreasing + * performance but increasing likelihood of allocating. 
+ */ + for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) { + schedule_instructions(pre_modes[i]); + + if (0) { + assign_regs_trivial(); + allocated_without_spills = true; + } else { + allocated_without_spills = assign_regs(false, spill_all); + } + if (allocated_without_spills) + break; + } + + if (!allocated_without_spills) { + if (!allow_spilling) + fail("Failure to register allocate and spilling is not allowed."); + + /* We assume that any spilling is worse than just dropping back to + * SIMD8. There's probably actually some intermediate point where + * SIMD16 with a couple of spills is still better. + */ + if (dispatch_width > min_dispatch_width) { + fail("Failure to register allocate. Reduce number of " + "live scalar values to avoid this."); + } else { + compiler->shader_perf_log(log_data, + "%s shader triggered register spilling. " + "Try reducing the number of live scalar " + "values to improve performance.\n", + stage_name); + } + + /* Since we're out of heuristics, just go spill registers until we + * get an allocation. + */ + while (!assign_regs(true, spill_all)) { + if (failed) + break; + } + } + + /* This must come after all optimization and register allocation, since + * it inserts dead code that happens to have side effects, and it does + * so based on the actual physical registers in use. + */ + insert_gen4_send_dependency_workarounds(); + + if (failed) + return; + + schedule_instructions(SCHEDULE_POST); + + if (last_scratch > 0) { + MAYBE_UNUSED unsigned max_scratch_size = 2 * 1024 * 1024; + + prog_data->total_scratch = brw_get_scratch_size(last_scratch); + + if (stage == MESA_SHADER_COMPUTE) { + if (devinfo->is_haswell) { + /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space" + * field documentation, Haswell supports a minimum of 2kB of + * scratch space for compute shaders, unlike every other stage + * and platform. 
+ */ + prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048); + } else if (devinfo->gen <= 7) { + /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space" + * field documentation, platforms prior to Haswell measure scratch + * size linearly with a range of [1kB, 12kB] and 1kB granularity. + */ + prog_data->total_scratch = ALIGN(last_scratch, 1024); + max_scratch_size = 12 * 1024; + } + } + + /* We currently only support up to 2MB of scratch space. If we + * need to support more eventually, the documentation suggests + * that we could allocate a larger buffer, and partition it out + * ourselves. We'd just have to undo the hardware's address + * calculation by subtracting (FFTID * Per Thread Scratch Space) + * and then add FFTID * (Larger Per Thread Scratch Space). + * + * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline > + * Thread Group Tracking > Local Memory/Scratch Space. + */ + assert(prog_data->total_scratch < max_scratch_size); + } +} + +bool +fs_visitor::run_vs(gl_clip_plane *clip_planes) +{ + assert(stage == MESA_SHADER_VERTEX); + + setup_vs_payload(); + + if (shader_time_index >= 0) + emit_shader_time_begin(); + + emit_nir_code(); + + if (failed) + return false; + + compute_clip_distance(clip_planes); + + emit_urb_writes(); + + if (shader_time_index >= 0) + emit_shader_time_end(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + assign_vs_urb_setup(); + + fixup_3src_null_dest(); + allocate_registers(true); + + return !failed; +} + +bool +fs_visitor::run_tcs_single_patch() +{ + assert(stage == MESA_SHADER_TESS_CTRL); + + struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); + + /* r1-r4 contain the ICP handles. 
*/ + payload.num_regs = 5; + + if (shader_time_index >= 0) + emit_shader_time_begin(); + + /* Initialize gl_InvocationID */ + fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW); + fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210))); + bld.MOV(channels_ud, channels_uw); + + if (tcs_prog_data->instances == 1) { + invocation_id = channels_ud; + } else { + invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD); + + /* Get instance number from g0.2 bits 23:17, and multiply it by 8. */ + fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)), + brw_imm_ud(INTEL_MASK(23, 17))); + bld.SHR(instance_times_8, t, brw_imm_ud(17 - 3)); + + bld.ADD(invocation_id, instance_times_8, channels_ud); + } + + /* Fix the dispatch mask */ + if (nir->info->tess.tcs_vertices_out % 8) { + bld.CMP(bld.null_reg_ud(), invocation_id, + brw_imm_ud(nir->info->tess.tcs_vertices_out), BRW_CONDITIONAL_L); + bld.IF(BRW_PREDICATE_NORMAL); + } + + emit_nir_code(); + + if (nir->info->tess.tcs_vertices_out % 8) { + bld.emit(BRW_OPCODE_ENDIF); + } + + /* Emit EOT write; set TR DS Cache bit */ + fs_reg srcs[3] = { + fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)), + fs_reg(brw_imm_ud(WRITEMASK_X << 16)), + fs_reg(brw_imm_ud(0)), + }; + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3); + bld.LOAD_PAYLOAD(payload, srcs, 3, 2); + + fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED, + bld.null_reg_ud(), payload); + inst->mlen = 3; + inst->eot = true; + + if (shader_time_index >= 0) + emit_shader_time_end(); + + if (failed) + return false; + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + assign_tcs_single_patch_urb_setup(); + + fixup_3src_null_dest(); + allocate_registers(true); + + return !failed; +} + +bool +fs_visitor::run_tes() +{ + assert(stage == MESA_SHADER_TESS_EVAL); + + /* R0: thread header, R1-3: 
gl_TessCoord.xyz, R4: URB handles */ + payload.num_regs = 5; + + if (shader_time_index >= 0) + emit_shader_time_begin(); + + emit_nir_code(); + + if (failed) + return false; + + emit_urb_writes(); + + if (shader_time_index >= 0) + emit_shader_time_end(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + assign_tes_urb_setup(); + + fixup_3src_null_dest(); + allocate_registers(true); + + return !failed; +} + +bool +fs_visitor::run_gs() +{ + assert(stage == MESA_SHADER_GEOMETRY); + + setup_gs_payload(); + + this->final_gs_vertex_count = vgrf(glsl_type::uint_type); + + if (gs_compile->control_data_header_size_bits > 0) { + /* Create a VGRF to store accumulated control data bits. */ + this->control_data_bits = vgrf(glsl_type::uint_type); + + /* If we're outputting more than 32 control data bits, then EmitVertex() + * will set control_data_bits to 0 after emitting the first vertex. + * Otherwise, we need to initialize it to 0 here. + */ + if (gs_compile->control_data_header_size_bits <= 32) { + const fs_builder abld = bld.annotate("initialize control data bits"); + abld.MOV(this->control_data_bits, brw_imm_ud(0u)); + } + } + + if (shader_time_index >= 0) + emit_shader_time_begin(); + + emit_nir_code(); + + emit_gs_thread_end(); + + if (shader_time_index >= 0) + emit_shader_time_end(); + + if (failed) + return false; + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + assign_gs_urb_setup(); + + fixup_3src_null_dest(); + allocate_registers(true); + + return !failed; +} + +bool +fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) +{ + struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data); + brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key; + + assert(stage == MESA_SHADER_FRAGMENT); + + if (devinfo->gen >= 6) + setup_fs_payload_gen6(); + else + setup_fs_payload_gen4(); + + if (0) { + emit_dummy_fs(); + } else if (do_rep_send) { + assert(dispatch_width == 16); + emit_repclear_shader(); + } else { + if 
(shader_time_index >= 0) + emit_shader_time_begin(); + + calculate_urb_setup(); + if (nir->info->inputs_read > 0 || + (nir->info->outputs_read > 0 && !wm_key->coherent_fb_fetch)) { + if (devinfo->gen < 6) + emit_interpolation_setup_gen4(); + else + emit_interpolation_setup_gen6(); + } + + /* We handle discards by keeping track of the still-live pixels in f0.1. + * Initialize it with the dispatched pixels. + */ + if (wm_prog_data->uses_kill) { + fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS); + discard_init->flag_subreg = 1; + } + + /* Generate FS IR for main(). (the visitor only descends into + * functions called "main"). + */ + emit_nir_code(); + + if (failed) + return false; + + if (wm_prog_data->uses_kill) + bld.emit(FS_OPCODE_PLACEHOLDER_HALT); + + if (wm_key->alpha_test_func) + emit_alpha_test(); + + emit_fb_writes(); + + if (shader_time_index >= 0) + emit_shader_time_end(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + assign_urb_setup(); + + fixup_3src_null_dest(); + allocate_registers(allow_spilling); + + if (failed) + return false; + } + + return !failed; +} + +bool +fs_visitor::run_cs() +{ + assert(stage == MESA_SHADER_COMPUTE); + + setup_cs_payload(); + + if (shader_time_index >= 0) + emit_shader_time_begin(); + + if (devinfo->is_haswell && prog_data->total_shared > 0) { + /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */ + const fs_builder abld = bld.exec_all().group(1, 0); + abld.MOV(retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW), + suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1)); + } + + emit_nir_code(); + + if (failed) + return false; + + emit_cs_terminate(); + + if (shader_time_index >= 0) + emit_shader_time_end(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + + fixup_3src_null_dest(); + allocate_registers(true); + + if (failed) + return false; + + return !failed; +} + +/** + * Return a bitfield where bit n is set if barycentric interpolation mode n + * (see enum 
brw_barycentric_mode) is needed by the fragment shader. + * + * We examine the load_barycentric intrinsics rather than looking at input + * variables so that we catch interpolateAtCentroid() messages too, which + * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up. + */ +static unsigned +brw_compute_barycentric_interp_modes(const struct gen_device_info *devinfo, + const nir_shader *shader) +{ + unsigned barycentric_interp_modes = 0; + + nir_foreach_function(f, shader) { + if (!f->impl) + continue; + + nir_foreach_block(block, f->impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_load_interpolated_input) + continue; + + /* Ignore WPOS; it doesn't require interpolation. */ + if (nir_intrinsic_base(intrin) == VARYING_SLOT_POS) + continue; + + intrin = nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr); + enum glsl_interp_mode interp = (enum glsl_interp_mode) + nir_intrinsic_interp_mode(intrin); + nir_intrinsic_op bary_op = intrin->intrinsic; + enum brw_barycentric_mode bary = + brw_barycentric_mode(interp, bary_op); + + barycentric_interp_modes |= 1 << bary; + + if (devinfo->needs_unlit_centroid_workaround && + bary_op == nir_intrinsic_load_barycentric_centroid) + barycentric_interp_modes |= 1 << centroid_to_pixel(bary); + } + } + } + + return barycentric_interp_modes; +} + +static void +brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data, + const nir_shader *shader) +{ + prog_data->flat_inputs = 0; + + nir_foreach_variable(var, &shader->inputs) { + int input_index = prog_data->urb_setup[var->data.location]; + + if (input_index < 0) + continue; + + /* flat shading */ + if (var->data.interpolation == INTERP_MODE_FLAT) + prog_data->flat_inputs |= (1 << input_index); + } +} + +static uint8_t +computed_depth_mode(const nir_shader *shader) +{ + if (shader->info->outputs_written & 
BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { + switch (shader->info->fs.depth_layout) { + case FRAG_DEPTH_LAYOUT_NONE: + case FRAG_DEPTH_LAYOUT_ANY: + return BRW_PSCDEPTH_ON; + case FRAG_DEPTH_LAYOUT_GREATER: + return BRW_PSCDEPTH_ON_GE; + case FRAG_DEPTH_LAYOUT_LESS: + return BRW_PSCDEPTH_ON_LE; + case FRAG_DEPTH_LAYOUT_UNCHANGED: + return BRW_PSCDEPTH_OFF; + } + } + return BRW_PSCDEPTH_OFF; +} + +/** + * Move load_interpolated_input with simple (payload-based) barycentric modes + * to the top of the program so we don't emit multiple PLNs for the same input. + * + * This works around CSE not being able to handle non-dominating cases + * such as: + * + * if (...) { + * interpolate input + * } else { + * interpolate the same exact input + * } + * + * This should be replaced by global value numbering someday. + */ +void +move_interpolation_to_top(nir_shader *nir) +{ + nir_foreach_function(f, nir) { + if (!f->impl) + continue; + + nir_block *top = nir_start_block(f->impl); + exec_node *cursor_node = NULL; + + nir_foreach_block(block, f->impl) { + if (block == top) + continue; + + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_load_interpolated_input) + continue; + nir_intrinsic_instr *bary_intrinsic = + nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr); + nir_intrinsic_op op = bary_intrinsic->intrinsic; + + /* Leave interpolateAtSample/Offset() where they are. 
*/ + if (op == nir_intrinsic_load_barycentric_at_sample || + op == nir_intrinsic_load_barycentric_at_offset) + continue; + + nir_instr *move[3] = { + &bary_intrinsic->instr, + intrin->src[1].ssa->parent_instr, + instr + }; + + for (unsigned i = 0; i < ARRAY_SIZE(move); i++) { + if (move[i]->block != top) { + move[i]->block = top; + exec_node_remove(&move[i]->node); + if (cursor_node) { + exec_node_insert_after(cursor_node, &move[i]->node); + } else { + exec_list_push_head(&top->instr_list, &move[i]->node); + } + cursor_node = &move[i]->node; + } + } + } + } + nir_metadata_preserve(f->impl, (nir_metadata) + ((unsigned) nir_metadata_block_index | + (unsigned) nir_metadata_dominance)); + } +} + +/** + * Demote per-sample barycentric intrinsics to centroid. + * + * Useful when rendering to a non-multisampled buffer. + */ +static void +demote_sample_qualifiers(nir_shader *nir) +{ + nir_foreach_function(f, nir) { + if (!f->impl) + continue; + + nir_builder b; + nir_builder_init(&b, f->impl); + + nir_foreach_block(block, f->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_load_barycentric_sample && + intrin->intrinsic != nir_intrinsic_load_barycentric_at_sample) + continue; + + b.cursor = nir_before_instr(instr); + nir_ssa_def *centroid = + nir_load_barycentric(&b, nir_intrinsic_load_barycentric_centroid, + nir_intrinsic_interp_mode(intrin)); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, + nir_src_for_ssa(centroid)); + nir_instr_remove(instr); + } + } + + nir_metadata_preserve(f->impl, (nir_metadata) + ((unsigned) nir_metadata_block_index | + (unsigned) nir_metadata_dominance)); + } +} + +/** + * Pre-gen6, the register file of the EUs was shared between threads, + * and each thread used some subset allocated on a 16-register block + * granularity. The unit states wanted these block counts. 
+ */ +static inline int +brw_register_blocks(int reg_count) +{ + return ALIGN(reg_count, 16) / 16 - 1; +} + +const unsigned * +brw_compile_fs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_wm_prog_key *key, + struct brw_wm_prog_data *prog_data, + const nir_shader *src_shader, + struct gl_program *prog, + int shader_time_index8, int shader_time_index16, + bool allow_spilling, + bool use_rep_send, struct brw_vue_map *vue_map, + unsigned *final_assembly_size, + char **error_str) +{ + const struct gen_device_info *devinfo = compiler->devinfo; + + nir_shader *shader = nir_shader_clone(mem_ctx, src_shader); + shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true); + brw_nir_lower_fs_inputs(shader, devinfo, key); + brw_nir_lower_fs_outputs(shader); + + if (devinfo->gen < 6) { + brw_setup_vue_interpolation(vue_map, shader, prog_data, devinfo); + } + + if (!key->multisample_fbo) + NIR_PASS_V(shader, demote_sample_qualifiers); + NIR_PASS_V(shader, move_interpolation_to_top); + shader = brw_postprocess_nir(shader, compiler, true); + + /* key->alpha_test_func means simulating alpha testing via discards, + * so the shader definitely kills pixels. 
+ */ + prog_data->uses_kill = shader->info->fs.uses_discard || + key->alpha_test_func; + prog_data->uses_omask = key->multisample_fbo && + shader->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK); + prog_data->computed_depth_mode = computed_depth_mode(shader); + prog_data->computed_stencil = + shader->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); + + prog_data->persample_dispatch = + key->multisample_fbo && + (key->persample_interp || + (shader->info->system_values_read & (SYSTEM_BIT_SAMPLE_ID | + SYSTEM_BIT_SAMPLE_POS)) || + shader->info->fs.uses_sample_qualifier || + shader->info->outputs_read); + + prog_data->early_fragment_tests = shader->info->fs.early_fragment_tests; + prog_data->post_depth_coverage = shader->info->fs.post_depth_coverage; + prog_data->inner_coverage = shader->info->fs.inner_coverage; + + prog_data->barycentric_interp_modes = + brw_compute_barycentric_interp_modes(compiler->devinfo, shader); + + cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL; + uint8_t simd8_grf_start = 0, simd16_grf_start = 0; + unsigned simd8_grf_used = 0, simd16_grf_used = 0; + + fs_visitor v8(compiler, log_data, mem_ctx, key, + &prog_data->base, prog, shader, 8, + shader_time_index8); + if (!v8.run_fs(allow_spilling, false /* do_rep_send */)) { + if (error_str) + *error_str = ralloc_strdup(mem_ctx, v8.fail_msg); + + return NULL; + } else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) { + simd8_cfg = v8.cfg; + simd8_grf_start = v8.payload.num_regs; + simd8_grf_used = v8.grf_used; + } + + if (v8.max_dispatch_width >= 16 && + likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) { + /* Try a SIMD16 compile */ + fs_visitor v16(compiler, log_data, mem_ctx, key, + &prog_data->base, prog, shader, 16, + shader_time_index16); + v16.import_uniforms(&v8); + if (!v16.run_fs(allow_spilling, use_rep_send)) { + compiler->shader_perf_log(log_data, + "SIMD16 shader failed to compile: %s", + v16.fail_msg); + } else { + simd16_cfg = v16.cfg; + simd16_grf_start = 
v16.payload.num_regs; + simd16_grf_used = v16.grf_used; + } + } + + /* When the caller requests a repclear shader, they want SIMD16-only */ + if (use_rep_send) + simd8_cfg = NULL; + + /* Prior to Iron Lake, the PS had a single shader offset with a jump table + * at the top to select the shader. We've never implemented that. + * Instead, we just give them exactly one shader and we pick the widest one + * available. + */ + if (compiler->devinfo->gen < 5 && simd16_cfg) + simd8_cfg = NULL; + + if (prog_data->persample_dispatch) { + /* Starting with SandyBridge (where we first get MSAA), the different + * pixel dispatch combinations are grouped into classifications A + * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1). On all hardware + * generations, the only configurations supporting persample dispatch + * are those in which only one dispatch width is enabled. + * + * If computed depth is enabled, SNB only allows SIMD8 while IVB+ + * allow SIMD8 or SIMD16 so we choose SIMD16 if available. + */ + if (compiler->devinfo->gen == 6 && + prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) { + simd16_cfg = NULL; + } else if (simd16_cfg) { + simd8_cfg = NULL; + } + } + + /* We have to compute the flat inputs after the visitor is finished running + * because it relies on prog_data->urb_setup which is computed in + * fs_visitor::calculate_urb_setup(). + */ + brw_compute_flat_inputs(prog_data, shader); + + fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base, + v8.promoted_constants, v8.runtime_check_aads_emit, + MESA_SHADER_FRAGMENT); + + if (unlikely(INTEL_DEBUG & DEBUG_WM)) { + g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s", + shader->info->label ? 
+ shader->info->label : "unnamed", + shader->info->name)); + } + + if (simd8_cfg) { + prog_data->dispatch_8 = true; + g.generate_code(simd8_cfg, 8); + prog_data->base.dispatch_grf_start_reg = simd8_grf_start; + prog_data->reg_blocks_0 = brw_register_blocks(simd8_grf_used); + + if (simd16_cfg) { + prog_data->dispatch_16 = true; + prog_data->prog_offset_2 = g.generate_code(simd16_cfg, 16); + prog_data->dispatch_grf_start_reg_2 = simd16_grf_start; + prog_data->reg_blocks_2 = brw_register_blocks(simd16_grf_used); + } + } else if (simd16_cfg) { + prog_data->dispatch_16 = true; + g.generate_code(simd16_cfg, 16); + prog_data->base.dispatch_grf_start_reg = simd16_grf_start; + prog_data->reg_blocks_0 = brw_register_blocks(simd16_grf_used); + } + + return g.get_assembly(final_assembly_size); +} + +fs_reg * +fs_visitor::emit_cs_work_group_id_setup() +{ + assert(stage == MESA_SHADER_COMPUTE); + + fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type)); + + struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD)); + struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD)); + struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD)); + + bld.MOV(*reg, r0_1); + bld.MOV(offset(*reg, bld, 1), r0_6); + bld.MOV(offset(*reg, bld, 2), r0_7); + + return reg; +} + +static void +fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords) +{ + block->dwords = dwords; + block->regs = DIV_ROUND_UP(dwords, 8); + block->size = block->regs * 32; +} + +static void +cs_fill_push_const_info(const struct gen_device_info *devinfo, + struct brw_cs_prog_data *cs_prog_data) +{ + const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; + bool fill_thread_id = + cs_prog_data->thread_local_id_index >= 0 && + cs_prog_data->thread_local_id_index < (int)prog_data->nr_params; + bool cross_thread_supported = devinfo->gen > 7 || devinfo->is_haswell; + + /* The thread ID should be stored in the last param dword */ + 
assert(prog_data->nr_params > 0 || !fill_thread_id); + assert(!fill_thread_id || + cs_prog_data->thread_local_id_index == + (int)prog_data->nr_params - 1); + + unsigned cross_thread_dwords, per_thread_dwords; + if (!cross_thread_supported) { + cross_thread_dwords = 0u; + per_thread_dwords = prog_data->nr_params; + } else if (fill_thread_id) { + /* Fill all but the last register with cross-thread payload */ + cross_thread_dwords = 8 * (cs_prog_data->thread_local_id_index / 8); + per_thread_dwords = prog_data->nr_params - cross_thread_dwords; + assert(per_thread_dwords > 0 && per_thread_dwords <= 8); + } else { + /* Fill all data using cross-thread payload */ + cross_thread_dwords = prog_data->nr_params; + per_thread_dwords = 0u; + } + + fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords); + fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords); + + unsigned total_dwords = + (cs_prog_data->push.per_thread.size * cs_prog_data->threads + + cs_prog_data->push.cross_thread.size) / 4; + fill_push_const_block_info(&cs_prog_data->push.total, total_dwords); + + assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 || + cs_prog_data->push.per_thread.size == 0); + assert(cs_prog_data->push.cross_thread.dwords + + cs_prog_data->push.per_thread.dwords == + prog_data->nr_params); +} + +static void +cs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, unsigned size) +{ + cs_prog_data->simd_size = size; + unsigned group_size = cs_prog_data->local_size[0] * + cs_prog_data->local_size[1] * cs_prog_data->local_size[2]; + cs_prog_data->threads = (group_size + size - 1) / size; +} + +const unsigned * +brw_compile_cs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_cs_prog_key *key, + struct brw_cs_prog_data *prog_data, + const nir_shader *src_shader, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str) +{ + nir_shader *shader = nir_shader_clone(mem_ctx, src_shader); 
+ shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true); + brw_nir_lower_cs_shared(shader); + prog_data->base.total_shared += shader->num_shared; + + /* Now that we cloned the nir_shader, we can update num_uniforms based on + * the thread_local_id_index. + */ + assert(prog_data->thread_local_id_index >= 0); + shader->num_uniforms = + MAX2(shader->num_uniforms, + (unsigned)4 * (prog_data->thread_local_id_index + 1)); + + brw_nir_lower_intrinsics(shader, &prog_data->base); + shader = brw_postprocess_nir(shader, compiler, true); + + prog_data->local_size[0] = shader->info->cs.local_size[0]; + prog_data->local_size[1] = shader->info->cs.local_size[1]; + prog_data->local_size[2] = shader->info->cs.local_size[2]; + unsigned local_workgroup_size = + shader->info->cs.local_size[0] * shader->info->cs.local_size[1] * + shader->info->cs.local_size[2]; + + unsigned max_cs_threads = compiler->devinfo->max_cs_threads; + unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads); + + cfg_t *cfg = NULL; + const char *fail_msg = NULL; + + /* Now the main event: Visit the shader IR and generate our CS IR for it. 
+ */ + fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base, + NULL, /* Never used in core profile */ + shader, 8, shader_time_index); + if (simd_required <= 8) { + if (!v8.run_cs()) { + fail_msg = v8.fail_msg; + } else { + cfg = v8.cfg; + cs_set_simd_size(prog_data, 8); + cs_fill_push_const_info(compiler->devinfo, prog_data); + prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs; + } + } + + fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base, + NULL, /* Never used in core profile */ + shader, 16, shader_time_index); + if (likely(!(INTEL_DEBUG & DEBUG_NO16)) && + !fail_msg && v8.max_dispatch_width >= 16 && + simd_required <= 16) { + /* Try a SIMD16 compile */ + if (simd_required <= 8) + v16.import_uniforms(&v8); + if (!v16.run_cs()) { + compiler->shader_perf_log(log_data, + "SIMD16 shader failed to compile: %s", + v16.fail_msg); + if (!cfg) { + fail_msg = + "Couldn't generate SIMD16 program and not " + "enough threads for SIMD8"; + } + } else { + cfg = v16.cfg; + cs_set_simd_size(prog_data, 16); + cs_fill_push_const_info(compiler->devinfo, prog_data); + prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs; + } + } + + fs_visitor v32(compiler, log_data, mem_ctx, key, &prog_data->base, + NULL, /* Never used in core profile */ + shader, 32, shader_time_index); + if (!fail_msg && v8.max_dispatch_width >= 32 && + (simd_required > 16 || (INTEL_DEBUG & DEBUG_DO32))) { + /* Try a SIMD32 compile */ + if (simd_required <= 8) + v32.import_uniforms(&v8); + else if (simd_required <= 16) + v32.import_uniforms(&v16); + + if (!v32.run_cs()) { + compiler->shader_perf_log(log_data, + "SIMD32 shader failed to compile: %s", + v32.fail_msg); + if (!cfg) { + fail_msg = + "Couldn't generate SIMD32 program and not " + "enough threads for SIMD16"; + } + } else { + cfg = v32.cfg; + cs_set_simd_size(prog_data, 32); + cs_fill_push_const_info(compiler->devinfo, prog_data); + } + } + + if (unlikely(cfg == NULL)) { + assert(fail_msg); + if (error_str) + 
*error_str = ralloc_strdup(mem_ctx, fail_msg); + + return NULL; + } + + fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base, + v8.promoted_constants, v8.runtime_check_aads_emit, + MESA_SHADER_COMPUTE); + if (INTEL_DEBUG & DEBUG_CS) { + char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s", + shader->info->label ? shader->info->label : + "unnamed", + shader->info->name); + g.enable_debug(name); + } + + g.generate_code(cfg, prog_data->simd_size); + + return g.get_assembly(final_assembly_size); +} + +/** + * Test the dispatch mask packing assumptions of + * brw_stage_has_packed_dispatch(). Call this from e.g. the top of + * fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is + * executed with an unexpected dispatch mask. + */ +static UNUSED void +brw_fs_test_dispatch_packing(const fs_builder &bld) +{ + const gl_shader_stage stage = bld.shader->stage; + + if (brw_stage_has_packed_dispatch(bld.shader->devinfo, stage, + bld.shader->stage_prog_data)) { + const fs_builder ubld = bld.exec_all().group(1, 0); + const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0); + const fs_reg mask = (stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() : + brw_dmask_reg()); + + ubld.ADD(tmp, mask, brw_imm_ud(1)); + ubld.AND(tmp, mask, tmp); + + /* This will loop forever if the dispatch mask doesn't have the expected + * form '2^n-1', in which case tmp will be non-zero. 
+ */ + bld.emit(BRW_OPCODE_DO); + bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ); + set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE)); + } +} diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h new file mode 100644 index 00000000000..00861ce5dad --- /dev/null +++ b/src/intel/compiler/brw_fs.h @@ -0,0 +1,500 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * Authors: + * Eric Anholt <[email protected]> + * + */ + +#pragma once + +#include "brw_shader.h" +#include "brw_ir_fs.h" +#include "brw_fs_builder.h" +#include "compiler/nir/nir.h" + +struct bblock_t; +namespace { + struct acp_entry; +} + +namespace brw { + class fs_live_variables; +} + +struct brw_gs_compile; + +/** + * Convenience overload of offset() that forwards to the width-taking + * overload, using the dispatch width of \p bld as the SIMD width. + */ +static inline fs_reg +offset(const fs_reg &reg, const brw::fs_builder &bld, unsigned delta) +{ + return offset(reg, bld.dispatch_width(), delta); +} + +/** + * The fragment shader front-end. + * + * Translates either GLSL IR or Mesa IR (for ARB_fragment_program) into FS IR. + */ +class fs_visitor : public backend_shader +{ +public: + fs_visitor(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const void *key, + struct brw_stage_prog_data *prog_data, + struct gl_program *prog, + const nir_shader *shader, + unsigned dispatch_width, + int shader_time_index, + const struct brw_vue_map *input_vue_map = NULL); + fs_visitor(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + struct brw_gs_compile *gs_compile, + struct brw_gs_prog_data *prog_data, + const nir_shader *shader, + int shader_time_index); + void init(); + ~fs_visitor(); + + fs_reg vgrf(const glsl_type *const type); + void import_uniforms(fs_visitor *v); + void setup_uniform_clipplane_values(gl_clip_plane *clip_planes); + void compute_clip_distance(gl_clip_plane *clip_planes); + + fs_inst *get_instruction_generating_reg(fs_inst *start, + fs_inst *end, + const fs_reg &reg); + + void VARYING_PULL_CONSTANT_LOAD(const brw::fs_builder &bld, + const fs_reg &dst, + const fs_reg &surf_index, + const fs_reg &varying_offset, + uint32_t const_offset); + void DEP_RESOLVE_MOV(const brw::fs_builder &bld, int grf); + + bool run_fs(bool allow_spilling, bool do_rep_send); + bool run_vs(gl_clip_plane *clip_planes); + bool run_tcs_single_patch(); + bool run_tes(); + bool run_gs(); + bool run_cs(); + void optimize(); + void allocate_registers(bool allow_spilling); + void 
setup_fs_payload_gen4(); + void setup_fs_payload_gen6(); + void setup_vs_payload(); + void setup_gs_payload(); + void setup_cs_payload(); + void fixup_3src_null_dest(); + void assign_curb_setup(); + void calculate_urb_setup(); + void assign_urb_setup(); + void convert_attr_sources_to_hw_regs(fs_inst *inst); + void assign_vs_urb_setup(); + void assign_tcs_single_patch_urb_setup(); + void assign_tes_urb_setup(); + void assign_gs_urb_setup(); + bool assign_regs(bool allow_spilling, bool spill_all); + void assign_regs_trivial(); + void calculate_payload_ranges(int payload_node_count, + int *payload_last_use_ip); + void setup_payload_interference(struct ra_graph *g, int payload_reg_count, + int first_payload_node); + int choose_spill_reg(struct ra_graph *g); + void spill_reg(int spill_reg); + void split_virtual_grfs(); + bool compact_virtual_grfs(); + void assign_constant_locations(); + void lower_constant_loads(); + void invalidate_live_intervals(); + void calculate_live_intervals(); + void calculate_register_pressure(); + void validate(); + bool opt_algebraic(); + bool opt_redundant_discard_jumps(); + bool opt_cse(); + bool opt_cse_local(bblock_t *block); + bool opt_copy_propagation(); + bool try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry); + bool try_constant_propagate(fs_inst *inst, acp_entry *entry); + bool opt_copy_propagation_local(void *mem_ctx, bblock_t *block, + exec_list *acp); + bool opt_drop_redundant_mov_to_flags(); + bool opt_register_renaming(); + bool register_coalesce(); + bool compute_to_mrf(); + bool eliminate_find_live_channel(); + bool dead_code_eliminate(); + bool remove_duplicate_mrf_writes(); + + bool opt_sampler_eot(); + bool virtual_grf_interferes(int a, int b); + void schedule_instructions(instruction_scheduler_mode mode); + void insert_gen4_send_dependency_workarounds(); + void insert_gen4_pre_send_dependency_workarounds(bblock_t *block, + fs_inst *inst); + void insert_gen4_post_send_dependency_workarounds(bblock_t *block, + 
fs_inst *inst); + void vfail(const char *msg, va_list args); + void fail(const char *msg, ...); + void limit_dispatch_width(unsigned n, const char *msg); + void lower_uniform_pull_constant_loads(); + bool lower_load_payload(); + bool lower_pack(); + bool lower_d2x(); + bool lower_logical_sends(); + bool lower_integer_multiplication(); + bool lower_minmax(); + bool lower_simd_width(); + bool opt_combine_constants(); + + void emit_dummy_fs(); + void emit_repclear_shader(); + void emit_fragcoord_interpolation(fs_reg wpos); + fs_reg *emit_frontfacing_interpolation(); + fs_reg *emit_samplepos_setup(); + fs_reg *emit_sampleid_setup(); + fs_reg *emit_samplemaskin_setup(); + fs_reg *emit_vs_system_value(int location); + void emit_interpolation_setup_gen4(); + void emit_interpolation_setup_gen6(); + void compute_sample_position(fs_reg dst, fs_reg int_sample_pos); + fs_reg emit_mcs_fetch(const fs_reg &coordinate, unsigned components, + const fs_reg &sampler); + void emit_gen6_gather_wa(uint8_t wa, fs_reg dst); + fs_reg resolve_source_modifiers(const fs_reg &src); + void emit_discard_jump(); + bool opt_peephole_sel(); + bool opt_peephole_predicated_break(); + bool opt_saturate_propagation(); + bool opt_cmod_propagation(); + bool opt_zero_samples(); + + void emit_nir_code(); + void nir_setup_outputs(); + void nir_setup_uniforms(); + void nir_emit_system_values(); + void nir_emit_impl(nir_function_impl *impl); + void nir_emit_cf_list(exec_list *list); + void nir_emit_if(nir_if *if_stmt); + void nir_emit_loop(nir_loop *loop); + void nir_emit_block(nir_block *block); + void nir_emit_instr(nir_instr *instr); + void nir_emit_alu(const brw::fs_builder &bld, nir_alu_instr *instr); + void nir_emit_load_const(const brw::fs_builder &bld, + nir_load_const_instr *instr); + void nir_emit_vs_intrinsic(const brw::fs_builder &bld, + nir_intrinsic_instr *instr); + void nir_emit_tcs_intrinsic(const brw::fs_builder &bld, + nir_intrinsic_instr *instr); + void nir_emit_gs_intrinsic(const 
brw::fs_builder &bld, + nir_intrinsic_instr *instr); + void nir_emit_fs_intrinsic(const brw::fs_builder &bld, + nir_intrinsic_instr *instr); + void nir_emit_cs_intrinsic(const brw::fs_builder &bld, + nir_intrinsic_instr *instr); + void nir_emit_intrinsic(const brw::fs_builder &bld, + nir_intrinsic_instr *instr); + void nir_emit_tes_intrinsic(const brw::fs_builder &bld, + nir_intrinsic_instr *instr); + void nir_emit_ssbo_atomic(const brw::fs_builder &bld, + int op, nir_intrinsic_instr *instr); + void nir_emit_shared_atomic(const brw::fs_builder &bld, + int op, nir_intrinsic_instr *instr); + void nir_emit_texture(const brw::fs_builder &bld, + nir_tex_instr *instr); + void nir_emit_jump(const brw::fs_builder &bld, + nir_jump_instr *instr); + fs_reg get_nir_src(const nir_src &src); + fs_reg get_nir_src_imm(const nir_src &src); + fs_reg get_nir_dest(const nir_dest &dest); + fs_reg get_nir_image_deref(const nir_deref_var *deref); + fs_reg get_indirect_offset(nir_intrinsic_instr *instr); + void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst, + unsigned wr_mask); + + bool optimize_extract_to_float(nir_alu_instr *instr, + const fs_reg &result); + bool optimize_frontfacing_ternary(nir_alu_instr *instr, + const fs_reg &result); + + void emit_alpha_test(); + fs_inst *emit_single_fb_write(const brw::fs_builder &bld, + fs_reg color1, fs_reg color2, + fs_reg src0_alpha, unsigned components); + void emit_fb_writes(); + fs_inst *emit_non_coherent_fb_read(const brw::fs_builder &bld, + const fs_reg &dst, unsigned target); + void emit_urb_writes(const fs_reg &gs_vertex_count = fs_reg()); + void set_gs_stream_control_data_bits(const fs_reg &vertex_count, + unsigned stream_id); + void emit_gs_control_data_bits(const fs_reg &vertex_count); + void emit_gs_end_primitive(const nir_src &vertex_count_nir_src); + void emit_gs_vertex(const nir_src &vertex_count_nir_src, + unsigned stream_id); + void emit_gs_thread_end(); + void emit_gs_input_load(const fs_reg &dst, const nir_src 
&vertex_src, + unsigned base_offset, const nir_src &offset_src, + unsigned num_components, unsigned first_component); + void emit_cs_terminate(); + fs_reg *emit_cs_work_group_id_setup(); + + void emit_barrier(); + + void emit_shader_time_begin(); + void emit_shader_time_end(); + void SHADER_TIME_ADD(const brw::fs_builder &bld, + int shader_time_subindex, + fs_reg value); + + fs_reg get_timestamp(const brw::fs_builder &bld); + + struct brw_reg interp_reg(int location, int channel); + + int implied_mrf_writes(fs_inst *inst); + + virtual void dump_instructions(); + virtual void dump_instructions(const char *name); + void dump_instruction(backend_instruction *inst); + void dump_instruction(backend_instruction *inst, FILE *file); + + const void *const key; + const struct brw_sampler_prog_key_data *key_tex; + + struct brw_gs_compile *gs_compile; + + struct brw_stage_prog_data *prog_data; + struct gl_program *prog; + + const struct brw_vue_map *input_vue_map; + + int *virtual_grf_start; + int *virtual_grf_end; + brw::fs_live_variables *live_intervals; + + int *regs_live_at_ip; + + /** Number of uniform variable components visited. */ + unsigned uniforms; + + /** Byte-offset for the next available spot in the scratch space buffer. */ + unsigned last_scratch; + + /** + * Array mapping UNIFORM register numbers to the pull parameter index, + * or -1 if this uniform register isn't being uploaded as a pull constant. + */ + int *pull_constant_loc; + + /** + * Array mapping UNIFORM register numbers to the push parameter index, + * or -1 if this uniform register isn't being uploaded as a push constant. 
+ */ + int *push_constant_loc; + + fs_reg frag_depth; + fs_reg frag_stencil; + fs_reg sample_mask; + fs_reg outputs[VARYING_SLOT_MAX]; + fs_reg dual_src_output; + int first_non_payload_grf; + /** Either BRW_MAX_GRF or GEN7_MRF_HACK_START */ + unsigned max_grf; + + fs_reg *nir_locals; + fs_reg *nir_ssa_values; + fs_reg *nir_system_values; + + bool failed; + char *fail_msg; + + /** Register numbers for thread payload fields. */ + struct thread_payload { + uint8_t source_depth_reg; + uint8_t source_w_reg; + uint8_t aa_dest_stencil_reg; + uint8_t dest_depth_reg; + uint8_t sample_pos_reg; + uint8_t sample_mask_in_reg; + uint8_t barycentric_coord_reg[BRW_BARYCENTRIC_MODE_COUNT]; + uint8_t local_invocation_id_reg; + + /** The number of thread payload registers the hardware will supply. */ + uint8_t num_regs; + } payload; + + bool source_depth_to_render_target; + bool runtime_check_aads_emit; + + fs_reg pixel_x; + fs_reg pixel_y; + fs_reg wpos_w; + fs_reg pixel_w; + fs_reg delta_xy[BRW_BARYCENTRIC_MODE_COUNT]; + fs_reg shader_start_time; + fs_reg userplane[MAX_CLIP_PLANES]; + fs_reg final_gs_vertex_count; + fs_reg control_data_bits; + fs_reg invocation_id; + + unsigned grf_used; + bool spilled_any_registers; + + const unsigned dispatch_width; /**< 8, 16 or 32 */ + unsigned min_dispatch_width; + unsigned max_dispatch_width; + + int shader_time_index; + + unsigned promoted_constants; + brw::fs_builder bld; +}; + +/** + * The fragment shader code generator. + * + * Translates FS IR to actual i965 assembly code. 
+ */ +class fs_generator +{ +public: + fs_generator(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const void *key, + struct brw_stage_prog_data *prog_data, + unsigned promoted_constants, + bool runtime_check_aads_emit, + gl_shader_stage stage); + ~fs_generator(); + + void enable_debug(const char *shader_name); + int generate_code(const cfg_t *cfg, int dispatch_width); + const unsigned *get_assembly(unsigned int *assembly_size); + +private: + void fire_fb_write(fs_inst *inst, + struct brw_reg payload, + struct brw_reg implied_header, + GLuint nr); + void generate_fb_write(fs_inst *inst, struct brw_reg payload); + void generate_fb_read(fs_inst *inst, struct brw_reg dst, + struct brw_reg payload); + void generate_urb_read(fs_inst *inst, struct brw_reg dst, struct brw_reg payload); + void generate_urb_write(fs_inst *inst, struct brw_reg payload); + void generate_cs_terminate(fs_inst *inst, struct brw_reg payload); + void generate_barrier(fs_inst *inst, struct brw_reg src); + void generate_linterp(fs_inst *inst, struct brw_reg dst, + struct brw_reg *src); + void generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src, + struct brw_reg surface_index, + struct brw_reg sampler_index); + void generate_get_buffer_size(fs_inst *inst, struct brw_reg dst, + struct brw_reg src, + struct brw_reg surf_index); + void generate_ddx(enum opcode op, struct brw_reg dst, struct brw_reg src); + void generate_ddy(enum opcode op, struct brw_reg dst, struct brw_reg src); + void generate_scratch_write(fs_inst *inst, struct brw_reg src); + void generate_scratch_read(fs_inst *inst, struct brw_reg dst); + void generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst); + void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst, + struct brw_reg index, + struct brw_reg offset); + void generate_uniform_pull_constant_load_gen7(fs_inst *inst, + struct brw_reg dst, + struct brw_reg surf_index, + struct brw_reg payload); + void 
generate_varying_pull_constant_load_gen4(fs_inst *inst, + struct brw_reg dst, + struct brw_reg index); + void generate_varying_pull_constant_load_gen7(fs_inst *inst, + struct brw_reg dst, + struct brw_reg index, + struct brw_reg offset); + void generate_mov_dispatch_to_flags(fs_inst *inst); + + void generate_pixel_interpolator_query(fs_inst *inst, + struct brw_reg dst, + struct brw_reg src, + struct brw_reg msg_data, + unsigned msg_type); + + void generate_set_sample_id(fs_inst *inst, + struct brw_reg dst, + struct brw_reg src0, + struct brw_reg src1); + + void generate_discard_jump(fs_inst *inst); + + void generate_pack_half_2x16_split(fs_inst *inst, + struct brw_reg dst, + struct brw_reg x, + struct brw_reg y); + void generate_unpack_half_2x16_split(fs_inst *inst, + struct brw_reg dst, + struct brw_reg src); + + void generate_shader_time_add(fs_inst *inst, + struct brw_reg payload, + struct brw_reg offset, + struct brw_reg value); + + void generate_mov_indirect(fs_inst *inst, + struct brw_reg dst, + struct brw_reg reg, + struct brw_reg indirect_byte_offset); + + bool patch_discard_jumps_to_fb_writes(); + + const struct brw_compiler *compiler; + void *log_data; /* Passed to compiler->*_log functions */ + + const struct gen_device_info *devinfo; + + struct brw_codegen *p; + const void * const key; + struct brw_stage_prog_data * const prog_data; + + unsigned dispatch_width; /**< 8, 16 or 32 */ + + exec_list discard_halt_patches; + unsigned promoted_constants; + bool runtime_check_aads_emit; + bool debug_flag; + const char *shader_name; + gl_shader_stage stage; + void *mem_ctx; +}; + +void shuffle_32bit_load_result_to_64bit_data(const brw::fs_builder &bld, + const fs_reg &dst, + const fs_reg &src, + uint32_t components); + +void shuffle_64bit_data_for_32bit_write(const brw::fs_builder &bld, + const fs_reg &dst, + const fs_reg &src, + uint32_t components); +fs_reg setup_imm_df(const brw::fs_builder &bld, + double v); + +enum brw_barycentric_mode 
brw_barycentric_mode(enum glsl_interp_mode mode, + nir_intrinsic_op op); diff --git a/src/intel/compiler/brw_fs_builder.h b/src/intel/compiler/brw_fs_builder.h new file mode 100644 index 00000000000..87394bc17b3 --- /dev/null +++ b/src/intel/compiler/brw_fs_builder.h @@ -0,0 +1,662 @@ +/* -*- c++ -*- */ +/* + * Copyright © 2010-2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef BRW_FS_BUILDER_H +#define BRW_FS_BUILDER_H + +#include "brw_ir_fs.h" +#include "brw_shader.h" + +namespace brw { + /** + * Toolbox to assemble an FS IR program out of individual instructions. + * + * This object is meant to have an interface consistent with + * brw::vec4_builder. They cannot be fully interchangeable because + * brw::fs_builder generates scalar code while brw::vec4_builder generates + * vector code. 
+ */ + class fs_builder { + public: + /** Type used in this IR to represent a source of an instruction. */ + typedef fs_reg src_reg; + + /** Type used in this IR to represent the destination of an instruction. */ + typedef fs_reg dst_reg; + + /** Type used in this IR to represent an instruction. */ + typedef fs_inst instruction; + + /** + * Construct an fs_builder that inserts instructions into \p shader. + * \p dispatch_width gives the native execution width of the program. + */ + fs_builder(backend_shader *shader, + unsigned dispatch_width) : + shader(shader), block(NULL), cursor(NULL), + _dispatch_width(dispatch_width), + _group(0), + force_writemask_all(false), + annotation() + { + } + + /** + * Construct an fs_builder that inserts instructions into \p shader + * before instruction \p inst in basic block \p block. The default + * execution controls and debug annotation are initialized from the + * instruction passed as argument. + */ + fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) : + shader(shader), block(block), cursor(inst), + _dispatch_width(inst->exec_size), + _group(inst->group), + force_writemask_all(inst->force_writemask_all) + { + annotation.str = inst->annotation; + annotation.ir = inst->ir; + } + + /** + * Construct an fs_builder that inserts instructions before \p cursor in + * basic block \p block, inheriting other code generation parameters + * from this. + */ + fs_builder + at(bblock_t *block, exec_node *cursor) const + { + fs_builder bld = *this; + bld.block = block; + bld.cursor = cursor; + return bld; + } + + /** + * Construct an fs_builder appending instructions at the end of the + * instruction list of the shader, inheriting other code generation + * parameters from this. 
+ */ + fs_builder + at_end() const + { + return at(NULL, (exec_node *)&shader->instructions.tail_sentinel); + } + + /** + * Construct a builder specifying the default SIMD width and group of + * channel enable signals, inheriting other code generation parameters + * from this. + * + * \p n gives the default SIMD width, \p i gives the slot group used for + * predication and control flow masking in multiples of \p n channels. + */ + fs_builder + group(unsigned n, unsigned i) const + { + assert(force_writemask_all || + (n <= dispatch_width() && i < dispatch_width() / n)); + fs_builder bld = *this; + bld._dispatch_width = n; + bld._group += i * n; + return bld; + } + + /** + * Alias for group() with width equal to eight. + */ + fs_builder + half(unsigned i) const + { + return group(8, i); + } + + /** + * Construct a builder with per-channel control flow execution masking + * disabled if \p b is true. If control flow execution masking is + * already disabled this has no effect. + */ + fs_builder + exec_all(bool b = true) const + { + fs_builder bld = *this; + if (b) + bld.force_writemask_all = true; + return bld; + } + + /** + * Construct a builder with the given debug annotation info. + */ + fs_builder + annotate(const char *str, const void *ir = NULL) const + { + fs_builder bld = *this; + bld.annotation.str = str; + bld.annotation.ir = ir; + return bld; + } + + /** + * Get the SIMD width in use. + */ + unsigned + dispatch_width() const + { + return _dispatch_width; + } + + /** + * Get the channel group in use. + */ + unsigned + group() const + { + return _group; + } + + /** + * Allocate a virtual register of natural vector size (one for this IR) + * and SIMD width. \p n gives the amount of space to allocate in + * dispatch_width units (which is just enough space for one logical + * component in this IR). 
+ */ + dst_reg + vgrf(enum brw_reg_type type, unsigned n = 1) const + { + assert(dispatch_width() <= 32); + + if (n > 0) + return dst_reg(VGRF, shader->alloc.allocate( + DIV_ROUND_UP(n * type_sz(type) * dispatch_width(), + REG_SIZE)), + type); + else + return retype(null_reg_ud(), type); + } + + /** + * Create a null register of floating type. + */ + dst_reg + null_reg_f() const + { + return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F)); + } + + dst_reg + null_reg_df() const + { + return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF)); + } + + /** + * Create a null register of signed integer type. + */ + dst_reg + null_reg_d() const + { + return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + } + + /** + * Create a null register of unsigned integer type. + */ + dst_reg + null_reg_ud() const + { + return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD)); + } + + /** + * Get the mask of SIMD channels enabled by dispatch and not yet + * disabled by discard. + */ + src_reg + sample_mask_reg() const + { + assert(shader->stage != MESA_SHADER_FRAGMENT || + group() + dispatch_width() <= 16); + if (shader->stage != MESA_SHADER_FRAGMENT) { + return brw_imm_d(0xffffffff); + } else if (brw_wm_prog_data(shader->stage_prog_data)->uses_kill) { + return brw_flag_reg(0, 1); + } else { + return retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD); + } + } + + /** + * Insert an instruction into the program. + */ + instruction * + emit(const instruction &inst) const + { + return emit(new(shader->mem_ctx) instruction(inst)); + } + + /** + * Create and insert a nullary control instruction into the program. + */ + instruction * + emit(enum opcode opcode) const + { + return emit(instruction(opcode, dispatch_width())); + } + + /** + * Create and insert a nullary instruction into the program. 
+ */ + instruction * + emit(enum opcode opcode, const dst_reg &dst) const + { + return emit(instruction(opcode, dispatch_width(), dst)); + } + + /** + * Create and insert a unary instruction into the program. + */ + instruction * + emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const + { + switch (opcode) { + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + return emit(instruction(opcode, dispatch_width(), dst, + fix_math_operand(src0))); + + default: + return emit(instruction(opcode, dispatch_width(), dst, src0)); + } + } + + /** + * Create and insert a binary instruction into the program. + */ + instruction * + emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, + const src_reg &src1) const + { + switch (opcode) { + case SHADER_OPCODE_POW: + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + return emit(instruction(opcode, dispatch_width(), dst, + fix_math_operand(src0), + fix_math_operand(src1))); + + default: + return emit(instruction(opcode, dispatch_width(), dst, src0, src1)); + + } + } + + /** + * Create and insert a ternary instruction into the program. + */ + instruction * + emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, + const src_reg &src1, const src_reg &src2) const + { + switch (opcode) { + case BRW_OPCODE_BFE: + case BRW_OPCODE_BFI2: + case BRW_OPCODE_MAD: + case BRW_OPCODE_LRP: + return emit(instruction(opcode, dispatch_width(), dst, + fix_3src_operand(src0), + fix_3src_operand(src1), + fix_3src_operand(src2))); + + default: + return emit(instruction(opcode, dispatch_width(), dst, + src0, src1, src2)); + } + } + + /** + * Create and insert an instruction with a variable number of sources + * into the program. 
+ */ + instruction * + emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[], + unsigned n) const + { + return emit(instruction(opcode, dispatch_width(), dst, srcs, n)); + } + + /** + * Insert a preallocated instruction into the program. + */ + instruction * + emit(instruction *inst) const + { + assert(inst->exec_size <= 32); + assert(inst->exec_size == dispatch_width() || + force_writemask_all); + + inst->group = _group; + inst->force_writemask_all = force_writemask_all; + inst->annotation = annotation.str; + inst->ir = annotation.ir; + + if (block) + static_cast<instruction *>(cursor)->insert_before(block, inst); + else + cursor->insert_before(inst); + + return inst; + } + + /** + * Select \p src0 if the comparison of both sources with the given + * conditional mod evaluates to true, otherwise select \p src1. + * + * Generally useful to get the minimum or maximum of two values. + */ + instruction * + emit_minmax(const dst_reg &dst, const src_reg &src0, + const src_reg &src1, brw_conditional_mod mod) const + { + assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L); + + return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0), + fix_unsigned_negate(src1))); + } + + /** + * Copy any live channel from \p src to the first channel of the result. + */ + src_reg + emit_uniformize(const src_reg &src) const + { + /* FIXME: We use a vector chan_index and dst to allow constant and + * copy propagation to move result all the way into the consuming + * instruction (typically a surface index or sampler index for a + * send). This uses 1 or 3 extra hw registers in 16 or 32 wide + * dispatch. Once we teach const/copy propagation about scalars we + * should go back to scalar destinations here. 
+ */ + const fs_builder ubld = exec_all(); + const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD); + const dst_reg dst = vgrf(src.type); + + ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index); + ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0)); + + return src_reg(component(dst, 0)); + } + + /** + * Assorted arithmetic ops. + * @{ + */ +#define ALU1(op) \ + instruction * \ + op(const dst_reg &dst, const src_reg &src0) const \ + { \ + return emit(BRW_OPCODE_##op, dst, src0); \ + } + +#define ALU2(op) \ + instruction * \ + op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \ + { \ + return emit(BRW_OPCODE_##op, dst, src0, src1); \ + } + +#define ALU2_ACC(op) \ + instruction * \ + op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \ + { \ + instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \ + inst->writes_accumulator = true; \ + return inst; \ + } + +#define ALU3(op) \ + instruction * \ + op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \ + const src_reg &src2) const \ + { \ + return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \ + } + + ALU2(ADD) + ALU2_ACC(ADDC) + ALU2(AND) + ALU2(ASR) + ALU2(AVG) + ALU3(BFE) + ALU2(BFI1) + ALU3(BFI2) + ALU1(BFREV) + ALU1(CBIT) + ALU2(CMPN) + ALU3(CSEL) + ALU1(DIM) + ALU2(DP2) + ALU2(DP3) + ALU2(DP4) + ALU2(DPH) + ALU1(F16TO32) + ALU1(F32TO16) + ALU1(FBH) + ALU1(FBL) + ALU1(FRC) + ALU2(LINE) + ALU1(LZD) + ALU2(MAC) + ALU2_ACC(MACH) + ALU3(MAD) + ALU1(MOV) + ALU2(MUL) + ALU1(NOT) + ALU2(OR) + ALU2(PLN) + ALU1(RNDD) + ALU1(RNDE) + ALU1(RNDU) + ALU1(RNDZ) + ALU2(SAD2) + ALU2_ACC(SADA2) + ALU2(SEL) + ALU2(SHL) + ALU2(SHR) + ALU2_ACC(SUBB) + ALU2(XOR) + +#undef ALU3 +#undef ALU2_ACC +#undef ALU2 +#undef ALU1 + /** @} */ + + /** + * CMP: Sets the low bit of the destination channels with the result + * of the comparison, while the upper bits are undefined, and updates + * the flag register with the packed 16 bits of the result. 
+ */ + instruction * + CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1, + brw_conditional_mod condition) const + { + /* Take the instruction: + * + * CMP null<d> src0<f> src1<f> + * + * Original gen4 does type conversion to the destination type + * before comparison, producing garbage results for floating + * point comparisons. + * + * The destination type doesn't matter on newer generations, + * so we set the type to match src0 so we can compact the + * instruction. + */ + return set_condmod(condition, + emit(BRW_OPCODE_CMP, retype(dst, src0.type), + fix_unsigned_negate(src0), + fix_unsigned_negate(src1))); + } + + /** + * Gen4 predicated IF. + */ + instruction * + IF(brw_predicate predicate) const + { + return set_predicate(predicate, emit(BRW_OPCODE_IF)); + } + + /** + * Emit a linear interpolation instruction. + */ + instruction * + LRP(const dst_reg &dst, const src_reg &x, const src_reg &y, + const src_reg &a) const + { + if (shader->devinfo->gen >= 6) { + /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so + * we need to reorder the operands. + */ + return emit(BRW_OPCODE_LRP, dst, a, y, x); + + } else { + /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */ + const dst_reg y_times_a = vgrf(dst.type); + const dst_reg one_minus_a = vgrf(dst.type); + const dst_reg x_times_one_minus_a = vgrf(dst.type); + + MUL(y_times_a, y, a); + ADD(one_minus_a, negate(a), brw_imm_f(1.0f)); + MUL(x_times_one_minus_a, x, src_reg(one_minus_a)); + return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)); + } + } + + /** + * Collect a number of registers in a contiguous range of registers. 
+ */ + instruction * + LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src, + unsigned sources, unsigned header_size) const + { + instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources); + inst->header_size = header_size; + inst->size_written = header_size * REG_SIZE; + for (unsigned i = header_size; i < sources; i++) { + inst->size_written += + ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride, + REG_SIZE); + } + + return inst; + } + + backend_shader *shader; + + private: + /** + * Workaround for negation of UD registers. See comment in + * fs_generator::generate_code() for more details. + */ + src_reg + fix_unsigned_negate(const src_reg &src) const + { + if (src.type == BRW_REGISTER_TYPE_UD && + src.negate) { + dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD); + MOV(temp, src); + return src_reg(temp); + } else { + return src; + } + } + + /** + * Workaround for source register modes not supported by the ternary + * instruction encoding. + */ + src_reg + fix_3src_operand(const src_reg &src) const + { + if (src.file == VGRF || src.file == UNIFORM || src.stride > 1) { + return src; + } else { + dst_reg expanded = vgrf(src.type); + MOV(expanded, src); + return expanded; + } + } + + /** + * Workaround for source register modes not supported by the math + * instruction. + */ + src_reg + fix_math_operand(const src_reg &src) const + { + /* Can't do hstride == 0 args on gen6 math, so expand it out. We + * might be able to do better by doing execsize = 1 math and then + * expanding that result out, but we would need to be careful with + * masking. + * + * Gen6 hardware ignores source modifiers (negate and abs) on math + * instructions, so we also move to a temp to set those up. 
+ * + * Gen7 relaxes most of the above restrictions, but still can't use IMM + * operands to math + */ + if ((shader->devinfo->gen == 6 && + (src.file == IMM || src.file == UNIFORM || + src.abs || src.negate)) || + (shader->devinfo->gen == 7 && src.file == IMM)) { + const dst_reg tmp = vgrf(src.type); + MOV(tmp, src); + return tmp; + } else { + return src; + } + } + + bblock_t *block; + exec_node *cursor; + + unsigned _dispatch_width; + unsigned _group; + bool force_writemask_all; + + /** Debug annotation info. */ + struct { + const char *str; + const void *ir; + } annotation; + }; +} + +#endif diff --git a/src/intel/compiler/brw_fs_cmod_propagation.cpp b/src/intel/compiler/brw_fs_cmod_propagation.cpp new file mode 100644 index 00000000000..2d50c92e9e3 --- /dev/null +++ b/src/intel/compiler/brw_fs_cmod_propagation.cpp @@ -0,0 +1,183 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_fs.h" +#include "brw_cfg.h" +#include "brw_eu.h" + +/** @file brw_fs_cmod_propagation.cpp + * + * Implements a pass that propagates the conditional modifier from a CMP x 0.0 + * instruction into the instruction that generated x. For instance, in this + * sequence + * + * add(8) g70<1>F g69<8,8,1>F 4096F + * cmp.ge.f0(8) null g70<8,8,1>F 0F + * + * we can do the comparison as part of the ADD instruction directly: + * + * add.ge.f0(8) g70<1>F g69<8,8,1>F 4096F + * + * If there had been a use of the flag register and another CMP using g70 + * + * add.ge.f0(8) g70<1>F g69<8,8,1>F 4096F + * (+f0) sel(8) g71<F> g72<8,8,1>F g73<8,8,1>F + * cmp.ge.f0(8) null g70<8,8,1>F 0F + * + * we can recognize that the CMP is generating the flag value that already + * exists and therefore remove the instruction. + */ + +static bool +opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block) +{ + bool progress = false; + int ip = block->end_ip + 1; + + foreach_inst_in_block_reverse_safe(fs_inst, inst, block) { + ip--; + + if ((inst->opcode != BRW_OPCODE_AND && + inst->opcode != BRW_OPCODE_CMP && + inst->opcode != BRW_OPCODE_MOV) || + inst->predicate != BRW_PREDICATE_NONE || + !inst->dst.is_null() || + inst->src[0].file != VGRF || + inst->src[0].abs) + continue; + + /* Only an AND.NZ can be propagated. Many AND.Z instructions are + * generated (for ir_unop_not in fs_visitor::emit_bool_to_cond_code). + * Propagating those would require inverting the condition on the CMP. + * This changes both the flag value and the register destination of the + * CMP. That result may be used elsewhere, so we can't change its value + * on a whim. 
+ */ + if (inst->opcode == BRW_OPCODE_AND && + !(inst->src[1].is_one() && + inst->conditional_mod == BRW_CONDITIONAL_NZ && + !inst->src[0].negate)) + continue; + + if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) + continue; + + if (inst->opcode == BRW_OPCODE_MOV && + inst->conditional_mod != BRW_CONDITIONAL_NZ) + continue; + + bool read_flag = false; + foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { + if (regions_overlap(scan_inst->dst, scan_inst->size_written, + inst->src[0], inst->size_read(0))) { + if (scan_inst->is_partial_write() || + scan_inst->dst.offset != inst->src[0].offset || + scan_inst->exec_size != inst->exec_size) + break; + + /* CMP's result is the same regardless of dest type. */ + if (inst->conditional_mod == BRW_CONDITIONAL_NZ && + scan_inst->opcode == BRW_OPCODE_CMP && + (inst->dst.type == BRW_REGISTER_TYPE_D || + inst->dst.type == BRW_REGISTER_TYPE_UD)) { + inst->remove(block); + progress = true; + break; + } + + /* If the AND wasn't handled by the previous case, it isn't safe + * to remove it. + */ + if (inst->opcode == BRW_OPCODE_AND) + break; + + /* Comparisons operate differently for ints and floats */ + if (scan_inst->dst.type != inst->dst.type && + (scan_inst->dst.type == BRW_REGISTER_TYPE_F || + inst->dst.type == BRW_REGISTER_TYPE_F)) + break; + + /* If the instruction generating inst's source also wrote the + * flag, and inst is doing a simple .nz comparison, then inst + * is redundant - the appropriate value is already in the flag + * register. Delete inst. 
+ */ + if (inst->conditional_mod == BRW_CONDITIONAL_NZ && + !inst->src[0].negate && + scan_inst->flags_written()) { + inst->remove(block); + progress = true; + break; + } + + /* The conditional mod of the CMP/CMPN instructions behaves + * specially because the flag output is not calculated from the + * result of the instruction, but the other way around, which + * means that even if the condmod to propagate and the condmod + * from the CMP instruction are the same they will in general give + * different results because they are evaluated based on different + * inputs. + */ + if (scan_inst->opcode == BRW_OPCODE_CMP || + scan_inst->opcode == BRW_OPCODE_CMPN) + break; + + /* Otherwise, try propagating the conditional. */ + enum brw_conditional_mod cond = + inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod) + : inst->conditional_mod; + + if (scan_inst->can_do_cmod() && + ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) || + scan_inst->conditional_mod == cond)) { + scan_inst->conditional_mod = cond; + inst->remove(block); + progress = true; + } + break; + } + + if (scan_inst->flags_written()) + break; + + read_flag = read_flag || scan_inst->flags_read(devinfo); + } + } + + return progress; +} + +bool +fs_visitor::opt_cmod_propagation() +{ + bool progress = false; + + foreach_block_reverse(block, cfg) { + progress = opt_cmod_propagation_local(devinfo, block) || progress; + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp b/src/intel/compiler/brw_fs_combine_constants.cpp new file mode 100644 index 00000000000..e0c95d379b8 --- /dev/null +++ b/src/intel/compiler/brw_fs_combine_constants.cpp @@ -0,0 +1,329 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, 
including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_fs_combine_constants.cpp + * + * This file contains the opt_combine_constants() pass that runs after the + * regular optimization loop. It passes over the instruction list and + * selectively promotes immediate values to registers by emitting a mov(1) + * instruction. + * + * This is useful on Gen 7 particularly, because a few instructions can be + * coissued (i.e., issued in the same cycle as another thread on the same EU + * issues an instruction) under some circumstances, one of which is that they + * cannot use immediate values. + */ + +#include "brw_fs.h" +#include "brw_cfg.h" + +using namespace brw; + +static const bool debug = false; + +/* Returns whether an instruction could co-issue if its immediate source were + * replaced with a GRF source. 
+ */ +static bool +could_coissue(const struct gen_device_info *devinfo, const fs_inst *inst) +{ + if (devinfo->gen != 7) + return false; + + switch (inst->opcode) { + case BRW_OPCODE_MOV: + case BRW_OPCODE_CMP: + case BRW_OPCODE_ADD: + case BRW_OPCODE_MUL: + return true; + default: + return false; + } +} + +/** + * Returns true for instructions that don't support immediate sources. + */ +static bool +must_promote_imm(const struct gen_device_info *devinfo, const fs_inst *inst) +{ + switch (inst->opcode) { + case SHADER_OPCODE_POW: + return devinfo->gen < 8; + case BRW_OPCODE_MAD: + case BRW_OPCODE_LRP: + return true; + default: + return false; + } +} + +/** A box for putting fs_regs in a linked list. */ +struct reg_link { + DECLARE_RALLOC_CXX_OPERATORS(reg_link) + + reg_link(fs_reg *reg) : reg(reg) {} + + struct exec_node link; + fs_reg *reg; +}; + +static struct exec_node * +link(void *mem_ctx, fs_reg *reg) +{ + reg_link *l = new(mem_ctx) reg_link(reg); + return &l->link; +} + +/** + * Information about an immediate value. + */ +struct imm { + /** The common ancestor of all blocks using this immediate value. */ + bblock_t *block; + + /** + * The instruction generating the immediate value, if all uses are contained + * within a single basic block. Otherwise, NULL. + */ + fs_inst *inst; + + /** + * A list of fs_regs that refer to this immediate. If we promote it, we'll + * have to patch these up to refer to the new GRF. + */ + exec_list *uses; + + /** The immediate value. We currently only handle floats. */ + float val; + + /** + * The GRF register and subregister number where we've decided to store the + * constant value. + */ + uint8_t subreg_offset; + uint16_t nr; + + /** The number of coissuable instructions using this immediate. */ + uint16_t uses_by_coissue; + + /** + * Whether this constant is used by an instruction that can't handle an + * immediate source (and already has to be promoted to a GRF). 
+ */ + bool must_promote; + + uint16_t first_use_ip; + uint16_t last_use_ip; +}; + +/** The working set of information about immediates. */ +struct table { + struct imm *imm; + int size; + int len; +}; + +static struct imm * +find_imm(struct table *table, float val) +{ + for (int i = 0; i < table->len; i++) { + if (table->imm[i].val == val) { + return &table->imm[i]; + } + } + return NULL; +} + +static struct imm * +new_imm(struct table *table, void *mem_ctx) +{ + if (table->len == table->size) { + table->size *= 2; + table->imm = reralloc(mem_ctx, table->imm, struct imm, table->size); + } + return &table->imm[table->len++]; +} + +/** + * Comparator used for sorting an array of imm structures. + * + * We sort by basic block number, then last use IP, then first use IP (least + * to greatest). This sorting causes immediates live in the same area to be + * allocated to the same register in the hopes that all values will be dead + * about the same time and the register can be reused. + */ +static int +compare(const void *_a, const void *_b) +{ + const struct imm *a = (const struct imm *)_a, + *b = (const struct imm *)_b; + + int block_diff = a->block->num - b->block->num; + if (block_diff) + return block_diff; + + int end_diff = a->last_use_ip - b->last_use_ip; + if (end_diff) + return end_diff; + + return a->first_use_ip - b->first_use_ip; +} + +bool +fs_visitor::opt_combine_constants() +{ + void *const_ctx = ralloc_context(NULL); + + struct table table; + table.size = 8; + table.len = 0; + table.imm = ralloc_array(const_ctx, struct imm, table.size); + + cfg->calculate_idom(); + unsigned ip = -1; + + /* Make a pass through all instructions and count the number of times each + * constant is used by coissueable instructions or instructions that cannot + * take immediate arguments. 
+ */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + ip++; + + if (!could_coissue(devinfo, inst) && !must_promote_imm(devinfo, inst)) + continue; + + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file != IMM || + inst->src[i].type != BRW_REGISTER_TYPE_F) + continue; + + float val = !inst->can_do_source_mods(devinfo) ? inst->src[i].f : + fabs(inst->src[i].f); + struct imm *imm = find_imm(&table, val); + + if (imm) { + bblock_t *intersection = cfg_t::intersect(block, imm->block); + if (intersection != imm->block) + imm->inst = NULL; + imm->block = intersection; + imm->uses->push_tail(link(const_ctx, &inst->src[i])); + imm->uses_by_coissue += could_coissue(devinfo, inst); + imm->must_promote = imm->must_promote || must_promote_imm(devinfo, inst); + imm->last_use_ip = ip; + } else { + imm = new_imm(&table, const_ctx); + imm->block = block; + imm->inst = inst; + imm->uses = new(const_ctx) exec_list(); + imm->uses->push_tail(link(const_ctx, &inst->src[i])); + imm->val = val; + imm->uses_by_coissue = could_coissue(devinfo, inst); + imm->must_promote = must_promote_imm(devinfo, inst); + imm->first_use_ip = ip; + imm->last_use_ip = ip; + } + } + } + + /* Remove constants from the table that don't have enough uses to make them + * profitable to store in a register. + */ + for (int i = 0; i < table.len;) { + struct imm *imm = &table.imm[i]; + + if (!imm->must_promote && imm->uses_by_coissue < 4) { + table.imm[i] = table.imm[table.len - 1]; + table.len--; + continue; + } + i++; + } + if (table.len == 0) { + ralloc_free(const_ctx); + return false; + } + if (cfg->num_blocks != 1) + qsort(table.imm, table.len, sizeof(struct imm), compare); + + /* Insert MOVs to load the constant values into GRFs. 
*/ + fs_reg reg(VGRF, alloc.allocate(1)); + reg.stride = 0; + for (int i = 0; i < table.len; i++) { + struct imm *imm = &table.imm[i]; + /* Insert it either before the instruction that generated the immediate + * or after the last non-control flow instruction of the common ancestor. + */ + exec_node *n = (imm->inst ? imm->inst : + imm->block->last_non_control_flow_inst()->next); + const fs_builder ibld = bld.at(imm->block, n).exec_all().group(1, 0); + + ibld.MOV(reg, brw_imm_f(imm->val)); + imm->nr = reg.nr; + imm->subreg_offset = reg.offset; + + reg.offset += sizeof(float); + if (reg.offset == 8 * sizeof(float)) { + reg.nr = alloc.allocate(1); + reg.offset = 0; + } + } + promoted_constants = table.len; + + /* Rewrite the immediate sources to refer to the new GRFs. */ + for (int i = 0; i < table.len; i++) { + foreach_list_typed(reg_link, link, link, table.imm[i].uses) { + fs_reg *reg = link->reg; + reg->file = VGRF; + reg->nr = table.imm[i].nr; + reg->offset = table.imm[i].subreg_offset; + reg->stride = 0; + reg->negate = signbit(reg->f) != signbit(table.imm[i].val); + assert((isnan(reg->f) && isnan(table.imm[i].val)) || + fabsf(reg->f) == fabs(table.imm[i].val)); + } + } + + if (debug) { + for (int i = 0; i < table.len; i++) { + struct imm *imm = &table.imm[i]; + + printf("%.3fF - block %3d, reg %3d sub %2d, Uses: (%2d, %2d), " + "IP: %4d to %4d, length %4d\n", + imm->val, + imm->block->num, + imm->nr, + imm->subreg_offset, + imm->must_promote, + imm->uses_by_coissue, + imm->first_use_ip, + imm->last_use_ip, + imm->last_use_ip - imm->first_use_ip); + } + } + + ralloc_free(const_ctx); + invalidate_live_intervals(); + + return true; +} diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp new file mode 100644 index 00000000000..cb117396089 --- /dev/null +++ b/src/intel/compiler/brw_fs_copy_propagation.cpp @@ -0,0 +1,869 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of 
charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_fs_copy_propagation.cpp + * + * Support for global copy propagation in two passes: A local pass that does + * intra-block copy (and constant) propagation, and a global pass that uses + * dataflow analysis on the copies available at the end of each block to re-do + * local copy propagation with more copies available. + * + * See Muchnick's Advanced Compiler Design and Implementation, section + * 12.5 (p356). + */ + +#define ACP_HASH_SIZE 16 + +#include "util/bitset.h" +#include "brw_fs.h" +#include "brw_cfg.h" +#include "brw_eu.h" + +namespace { /* avoid conflict with opt_copy_propagation_elements */ +struct acp_entry : public exec_node { + fs_reg dst; + fs_reg src; + uint8_t size_written; + uint8_t size_read; + enum opcode opcode; + bool saturate; +}; + +struct block_data { + /** + * Which entries in the fs_copy_prop_dataflow acp table are live at the + * start of this block. 
This is the useful output of the analysis, since + * it lets us plug those into the local copy propagation on the second + * pass. + */ + BITSET_WORD *livein; + + /** + * Which entries in the fs_copy_prop_dataflow acp table are live at the end + * of this block. This is done in initial setup from the per-block acps + * returned by the first local copy prop pass. + */ + BITSET_WORD *liveout; + + /** + * Which entries in the fs_copy_prop_dataflow acp table are generated by + * instructions in this block which reach the end of the block without + * being killed. + */ + BITSET_WORD *copy; + + /** + * Which entries in the fs_copy_prop_dataflow acp table are killed over the + * course of this block. + */ + BITSET_WORD *kill; +}; + +class fs_copy_prop_dataflow +{ +public: + fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg, + exec_list *out_acp[ACP_HASH_SIZE]); + + void setup_initial_values(); + void run(); + + void dump_block_data() const UNUSED; + + void *mem_ctx; + cfg_t *cfg; + + acp_entry **acp; + int num_acp; + int bitset_words; + + struct block_data *bd; +}; +} /* anonymous namespace */ + +fs_copy_prop_dataflow::fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg, + exec_list *out_acp[ACP_HASH_SIZE]) + : mem_ctx(mem_ctx), cfg(cfg) +{ + bd = rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks); + + num_acp = 0; + foreach_block (block, cfg) { + for (int i = 0; i < ACP_HASH_SIZE; i++) { + num_acp += out_acp[block->num][i].length(); + } + } + + acp = rzalloc_array(mem_ctx, struct acp_entry *, num_acp); + + bitset_words = BITSET_WORDS(num_acp); + + int next_acp = 0; + foreach_block (block, cfg) { + bd[block->num].livein = rzalloc_array(bd, BITSET_WORD, bitset_words); + bd[block->num].liveout = rzalloc_array(bd, BITSET_WORD, bitset_words); + bd[block->num].copy = rzalloc_array(bd, BITSET_WORD, bitset_words); + bd[block->num].kill = rzalloc_array(bd, BITSET_WORD, bitset_words); + + for (int i = 0; i < ACP_HASH_SIZE; i++) { + foreach_in_list(acp_entry, entry, 
&out_acp[block->num][i]) { + acp[next_acp] = entry; + + /* opt_copy_propagation_local populates out_acp with copies created + * in a block which are still live at the end of the block. This + * is exactly what we want in the COPY set. + */ + BITSET_SET(bd[block->num].copy, next_acp); + + next_acp++; + } + } + } + + assert(next_acp == num_acp); + + setup_initial_values(); + run(); +} + +/** + * Set up initial values for each of the data flow sets, prior to running + * the fixed-point algorithm. + */ +void +fs_copy_prop_dataflow::setup_initial_values() +{ + /* Initialize the COPY and KILL sets. */ + foreach_block (block, cfg) { + foreach_inst_in_block(fs_inst, inst, block) { + if (inst->dst.file != VGRF) + continue; + + /* Mark ACP entries which are killed by this instruction. */ + for (int i = 0; i < num_acp; i++) { + if (regions_overlap(inst->dst, inst->size_written, + acp[i]->dst, acp[i]->size_written) || + regions_overlap(inst->dst, inst->size_written, + acp[i]->src, acp[i]->size_read)) { + BITSET_SET(bd[block->num].kill, i); + } + } + } + } + + /* Populate the initial values for the livein and liveout sets. For the + * block at the start of the program, livein = 0 and liveout = copy. + * For the others, set liveout to 0 (the empty set) and livein to ~0 + * (the universal set). + */ + foreach_block (block, cfg) { + if (block->parents.is_empty()) { + for (int i = 0; i < bitset_words; i++) { + bd[block->num].livein[i] = 0u; + bd[block->num].liveout[i] = bd[block->num].copy[i]; + } + } else { + for (int i = 0; i < bitset_words; i++) { + bd[block->num].liveout[i] = 0u; + bd[block->num].livein[i] = ~0u; + } + } + } +} + +/** + * Iterate the liveout and livein dataflow equations over every block until + * no set changes, i.e. until a fixed point is reached. + */ +void +fs_copy_prop_dataflow::run() +{ + bool progress; + + do { + progress = false; + + /* Update liveout for all blocks.
*/ + foreach_block (block, cfg) { + if (block->parents.is_empty()) + continue; + + for (int i = 0; i < bitset_words; i++) { + const BITSET_WORD old_liveout = bd[block->num].liveout[i]; + + bd[block->num].liveout[i] = + bd[block->num].copy[i] | (bd[block->num].livein[i] & + ~bd[block->num].kill[i]); + + if (old_liveout != bd[block->num].liveout[i]) + progress = true; + } + } + + /* Update livein for all blocks. If a copy is live out of all parent + * blocks, it's live coming in to this block. + */ + foreach_block (block, cfg) { + if (block->parents.is_empty()) + continue; + + for (int i = 0; i < bitset_words; i++) { + const BITSET_WORD old_livein = bd[block->num].livein[i]; + + bd[block->num].livein[i] = ~0u; + foreach_list_typed(bblock_link, parent_link, link, &block->parents) { + bblock_t *parent = parent_link->block; + bd[block->num].livein[i] &= bd[parent->num].liveout[i]; + } + + if (old_livein != bd[block->num].livein[i]) + progress = true; + } + } + } while (progress); +} + +void +fs_copy_prop_dataflow::dump_block_data() const +{ + foreach_block (block, cfg) { + fprintf(stderr, "Block %d [%d, %d] (parents ", block->num, + block->start_ip, block->end_ip); + foreach_list_typed(bblock_link, link, link, &block->parents) { + bblock_t *parent = link->block; + fprintf(stderr, "%d ", parent->num); + } + fprintf(stderr, "):\n"); + fprintf(stderr, " livein = 0x"); + for (int i = 0; i < bitset_words; i++) + fprintf(stderr, "%08x", bd[block->num].livein[i]); + fprintf(stderr, ", liveout = 0x"); + for (int i = 0; i < bitset_words; i++) + fprintf(stderr, "%08x", bd[block->num].liveout[i]); + fprintf(stderr, ",\n copy = 0x"); + for (int i = 0; i < bitset_words; i++) + fprintf(stderr, "%08x", bd[block->num].copy[i]); + fprintf(stderr, ", kill = 0x"); + for (int i = 0; i < bitset_words; i++) + fprintf(stderr, "%08x", bd[block->num].kill[i]); + fprintf(stderr, "\n"); + } +} + +static bool +is_logic_op(enum opcode opcode) +{ + return (opcode == BRW_OPCODE_AND || + opcode == 
BRW_OPCODE_OR || + opcode == BRW_OPCODE_XOR || + opcode == BRW_OPCODE_NOT); +} + +static bool +can_take_stride(fs_inst *inst, unsigned arg, unsigned stride, + const gen_device_info *devinfo) +{ + if (stride > 4) + return false; + + /* 3-source instructions can only be Align16, which restricts what strides + * they can take. They can only take a stride of 1 (the usual case), or 0 + * with a special "repctrl" bit. But the repctrl bit doesn't work for + * 64-bit datatypes, so if the source type is 64-bit then only a stride of + * 1 is allowed. From the Broadwell PRM, Volume 7 "3D Media GPGPU", page + * 944: + * + * This is applicable to 32b datatypes and 16b datatype. 64b datatypes + * cannot use the replicate control. + */ + if (inst->is_3src(devinfo)) { + if (type_sz(inst->src[arg].type) > 4) + return stride == 1; + else + return stride == 1 || stride == 0; + } + + /* From the Broadwell PRM, Volume 2a "Command Reference - Instructions", + * page 391 ("Extended Math Function"): + * + * The following restrictions apply for align1 mode: Scalar source is + * supported. Source and destination horizontal stride must be the + * same. + * + * From the Haswell PRM Volume 2b "Command Reference - Instructions", page + * 134 ("Extended Math Function"): + * + * Scalar source is supported. Source and destination horizontal stride + * must be 1. + * + * and similar language exists for IVB and SNB. Pre-SNB, math instructions + * are sends, so the sources are moved to MRF's and there are no + * restrictions. 
+ */ + if (inst->is_math()) { + if (devinfo->gen == 6 || devinfo->gen == 7) { + assert(inst->dst.stride == 1); + return stride == 1 || stride == 0; + } else if (devinfo->gen >= 8) { + return stride == inst->dst.stride || stride == 0; + } + } + + return true; +} + +bool +fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) +{ + if (inst->src[arg].file != VGRF) + return false; + + if (entry->src.file == IMM) + return false; + assert(entry->src.file == VGRF || entry->src.file == UNIFORM || + entry->src.file == ATTR); + + if (entry->opcode == SHADER_OPCODE_LOAD_PAYLOAD && + inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) + return false; + + assert(entry->dst.file == VGRF); + if (inst->src[arg].nr != entry->dst.nr) + return false; + + /* Bail if inst is reading a range that isn't contained in the range + * that entry is writing. + */ + if (!region_contained_in(inst->src[arg], inst->size_read(arg), + entry->dst, entry->size_written)) + return false; + + /* we can't generally copy-propagate UD negations because we + * can end up accessing the resulting values as signed integers + * instead. See also resolve_ud_negate() and comment in + * fs_generator::generate_code. + */ + if (entry->src.type == BRW_REGISTER_TYPE_UD && + entry->src.negate) + return false; + + bool has_source_modifiers = entry->src.abs || entry->src.negate; + + if ((has_source_modifiers || entry->src.file == UNIFORM || + !entry->src.is_contiguous()) && + !inst->can_do_source_mods(devinfo)) + return false; + + if (has_source_modifiers && + inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE) + return false; + + /* Bail if the result of composing both strides would exceed the + * hardware limit. 
+ */ + if (!can_take_stride(inst, arg, entry->src.stride * inst->src[arg].stride, + devinfo)) + return false; + + /* Bail if the instruction type is larger than the execution type of the + * copy, what implies that each channel is reading multiple channels of the + * destination of the copy, and simply replacing the sources would give a + * program with different semantics. + */ + if (type_sz(entry->dst.type) < type_sz(inst->src[arg].type)) + return false; + + /* Bail if the result of composing both strides cannot be expressed + * as another stride. This avoids, for example, trying to transform + * this: + * + * MOV (8) rX<1>UD rY<0;1,0>UD + * FOO (8) ... rX<8;8,1>UW + * + * into this: + * + * FOO (8) ... rY<0;1,0>UW + * + * Which would have different semantics. + */ + if (entry->src.stride != 1 && + (inst->src[arg].stride * + type_sz(inst->src[arg].type)) % type_sz(entry->src.type) != 0) + return false; + + /* Since semantics of source modifiers are type-dependent we need to + * ensure that the meaning of the instruction remains the same if we + * change the type. If the sizes of the types are different the new + * instruction will read a different amount of data than the original + * and the semantics will always be different. 
+ */ + if (has_source_modifiers && + entry->dst.type != inst->src[arg].type && + (!inst->can_change_types() || + type_sz(entry->dst.type) != type_sz(inst->src[arg].type))) + return false; + + if (devinfo->gen >= 8 && (entry->src.negate || entry->src.abs) && + is_logic_op(inst->opcode)) { + return false; + } + + if (entry->saturate) { + switch(inst->opcode) { + case BRW_OPCODE_SEL: + if ((inst->conditional_mod != BRW_CONDITIONAL_GE && + inst->conditional_mod != BRW_CONDITIONAL_L) || + inst->src[1].file != IMM || + inst->src[1].f < 0.0 || + inst->src[1].f > 1.0) { + return false; + } + break; + default: + return false; + } + } + + inst->src[arg].file = entry->src.file; + inst->src[arg].nr = entry->src.nr; + inst->src[arg].stride *= entry->src.stride; + inst->saturate = inst->saturate || entry->saturate; + + /* Compute the offset of inst->src[arg] relative to entry->dst */ + const unsigned rel_offset = inst->src[arg].offset - entry->dst.offset; + + /* Compute the first component of the copy that the instruction is + * reading, and the base byte offset within that component. + */ + assert(entry->dst.offset % REG_SIZE == 0 && entry->dst.stride == 1); + const unsigned component = rel_offset / type_sz(entry->dst.type); + const unsigned suboffset = rel_offset % type_sz(entry->dst.type); + + /* Calculate the byte offset at the origin of the copy of the given + * component and suboffset. + */ + inst->src[arg].offset = suboffset + + component * entry->src.stride * type_sz(entry->src.type) + + entry->src.offset; + + if (has_source_modifiers) { + if (entry->dst.type != inst->src[arg].type) { + /* We are propagating source modifiers from a MOV with a different + * type. If we got here, then we can just change the source and + * destination types of the instruction and keep going. 
+ */ + assert(inst->can_change_types()); + for (int i = 0; i < inst->sources; i++) { + inst->src[i].type = entry->dst.type; + } + inst->dst.type = entry->dst.type; + } + + if (!inst->src[arg].abs) { + inst->src[arg].abs = entry->src.abs; + inst->src[arg].negate ^= entry->src.negate; + } + } + + return true; +} + + +bool +fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry) +{ + bool progress = false; + + if (entry->src.file != IMM) + return false; + if (type_sz(entry->src.type) > 4) + return false; + if (entry->saturate) + return false; + + for (int i = inst->sources - 1; i >= 0; i--) { + if (inst->src[i].file != VGRF) + continue; + + assert(entry->dst.file == VGRF); + if (inst->src[i].nr != entry->dst.nr) + continue; + + /* Bail if inst is reading a range that isn't contained in the range + * that entry is writing. + */ + if (!region_contained_in(inst->src[i], inst->size_read(i), + entry->dst, entry->size_written)) + continue; + + /* If the type sizes don't match each channel of the instruction is + * either extracting a portion of the constant (which could be handled + * with some effort but the code below doesn't) or reading multiple + * channels of the source at once. + */ + if (type_sz(inst->src[i].type) != type_sz(entry->dst.type)) + continue; + + fs_reg val = entry->src; + val.type = inst->src[i].type; + + if (inst->src[i].abs) { + if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) || + !brw_abs_immediate(val.type, &val.as_brw_reg())) { + continue; + } + } + + if (inst->src[i].negate) { + if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) || + !brw_negate_immediate(val.type, &val.as_brw_reg())) { + continue; + } + } + + switch (inst->opcode) { + case BRW_OPCODE_MOV: + case SHADER_OPCODE_LOAD_PAYLOAD: + case FS_OPCODE_PACK: + inst->src[i] = val; + progress = true; + break; + + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + /* FINISHME: Promote non-float constants and remove this. 
*/ + if (devinfo->gen < 8) + break; + /* fallthrough */ + case SHADER_OPCODE_POW: + /* Allow constant propagation into src1 (except on Gen 6 which + * doesn't support scalar source math), and let constant combining + * promote the constant on Gen < 8. + */ + if (devinfo->gen == 6) + break; + /* fallthrough */ + case BRW_OPCODE_BFI1: + case BRW_OPCODE_ASR: + case BRW_OPCODE_SHL: + case BRW_OPCODE_SHR: + case BRW_OPCODE_SUBB: + if (i == 1) { + inst->src[i] = val; + progress = true; + } + break; + + case BRW_OPCODE_MACH: + case BRW_OPCODE_MUL: + case SHADER_OPCODE_MULH: + case BRW_OPCODE_ADD: + case BRW_OPCODE_OR: + case BRW_OPCODE_AND: + case BRW_OPCODE_XOR: + case BRW_OPCODE_ADDC: + if (i == 1) { + inst->src[i] = val; + progress = true; + } else if (i == 0 && inst->src[1].file != IMM) { + /* Fit this constant in by commuting the operands. + * Exception: we can't do this for 32-bit integer MUL/MACH + * because it's asymmetric. + * + * The BSpec says for Broadwell that + * + * "When multiplying DW x DW, the dst cannot be accumulator." + * + * Integer MUL with a non-accumulator destination will be lowered + * by lower_integer_multiplication(), so don't restrict it. 
+ */ + if (((inst->opcode == BRW_OPCODE_MUL && + inst->dst.is_accumulator()) || + inst->opcode == BRW_OPCODE_MACH) && + (inst->src[1].type == BRW_REGISTER_TYPE_D || + inst->src[1].type == BRW_REGISTER_TYPE_UD)) + break; + inst->src[0] = inst->src[1]; + inst->src[1] = val; + progress = true; + } + break; + + case BRW_OPCODE_CMP: + case BRW_OPCODE_IF: + if (i == 1) { + inst->src[i] = val; + progress = true; + } else if (i == 0 && inst->src[1].file != IMM) { + enum brw_conditional_mod new_cmod; + + new_cmod = brw_swap_cmod(inst->conditional_mod); + if (new_cmod != BRW_CONDITIONAL_NONE) { + /* Fit this constant in by swapping the operands and + * flipping the test + */ + inst->src[0] = inst->src[1]; + inst->src[1] = val; + inst->conditional_mod = new_cmod; + progress = true; + } + } + break; + + case BRW_OPCODE_SEL: + if (i == 1) { + inst->src[i] = val; + progress = true; + } else if (i == 0 && inst->src[1].file != IMM) { + inst->src[0] = inst->src[1]; + inst->src[1] = val; + + /* If this was predicated, flipping operands means + * we also need to flip the predicate. + */ + if (inst->conditional_mod == BRW_CONDITIONAL_NONE) { + inst->predicate_inverse = + !inst->predicate_inverse; + } + progress = true; + } + break; + + case SHADER_OPCODE_UNTYPED_ATOMIC: + case SHADER_OPCODE_UNTYPED_SURFACE_READ: + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: + case SHADER_OPCODE_TYPED_ATOMIC: + case SHADER_OPCODE_TYPED_SURFACE_READ: + case SHADER_OPCODE_TYPED_SURFACE_WRITE: + /* We only propagate into the surface argument of the + * instruction. Everything else goes through LOAD_PAYLOAD. + */ + if (i == 1) { + inst->src[i] = val; + progress = true; + } + break; + + case FS_OPCODE_FB_WRITE_LOGICAL: + /* The stencil and omask sources of FS_OPCODE_FB_WRITE_LOGICAL are + * bit-cast using a strided region so they cannot be immediates. 
+ */ + if (i != FB_WRITE_LOGICAL_SRC_SRC_STENCIL && + i != FB_WRITE_LOGICAL_SRC_OMASK) { + inst->src[i] = val; + progress = true; + } + break; + + case SHADER_OPCODE_TEX_LOGICAL: + case SHADER_OPCODE_TXD_LOGICAL: + case SHADER_OPCODE_TXF_LOGICAL: + case SHADER_OPCODE_TXL_LOGICAL: + case SHADER_OPCODE_TXS_LOGICAL: + case FS_OPCODE_TXB_LOGICAL: + case SHADER_OPCODE_TXF_CMS_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + case SHADER_OPCODE_TXF_UMS_LOGICAL: + case SHADER_OPCODE_TXF_MCS_LOGICAL: + case SHADER_OPCODE_LOD_LOGICAL: + case SHADER_OPCODE_TG4_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: + inst->src[i] = val; + progress = true; + break; + + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + case SHADER_OPCODE_BROADCAST: + inst->src[i] = val; + progress = true; + break; + + case BRW_OPCODE_MAD: + case BRW_OPCODE_LRP: + inst->src[i] = val; + progress = true; + break; + + default: + break; + } + } + + return progress; +} + +static bool +can_propagate_from(fs_inst *inst) +{ + return (inst->opcode == BRW_OPCODE_MOV && + inst->dst.file == VGRF && + ((inst->src[0].file == VGRF && + !regions_overlap(inst->dst, inst->size_written, + inst->src[0], inst->size_read(0))) || + inst->src[0].file == ATTR || + inst->src[0].file == UNIFORM || + inst->src[0].file == IMM) && + inst->src[0].type == inst->dst.type && + !inst->is_partial_write()); +} + +/* Walks a basic block and does copy propagation on it using the acp + * list. + */ +bool +fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block, + exec_list *acp) +{ + bool progress = false; + + foreach_inst_in_block(fs_inst, inst, block) { + /* Try propagating into this instruction. 
*/ + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file != VGRF) + continue; + + foreach_in_list(acp_entry, entry, &acp[inst->src[i].nr % ACP_HASH_SIZE]) { + if (try_constant_propagate(inst, entry)) + progress = true; + else if (try_copy_propagate(inst, i, entry)) + progress = true; + } + } + + /* kill the destination from the ACP */ + if (inst->dst.file == VGRF) { + foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.nr % ACP_HASH_SIZE]) { + if (regions_overlap(entry->dst, entry->size_written, + inst->dst, inst->size_written)) + entry->remove(); + } + + /* Oops, we only have the chaining hash based on the destination, not + * the source, so walk across the entire table. + */ + for (int i = 0; i < ACP_HASH_SIZE; i++) { + foreach_in_list_safe(acp_entry, entry, &acp[i]) { + /* Make sure we kill the entry if this instruction overwrites + * _any_ of the registers that it reads + */ + if (regions_overlap(entry->src, entry->size_read, + inst->dst, inst->size_written)) + entry->remove(); + } + } + } + + /* If this instruction's source could potentially be folded into the + * operand of another instruction, add it to the ACP. + */ + if (can_propagate_from(inst)) { + acp_entry *entry = ralloc(copy_prop_ctx, acp_entry); + entry->dst = inst->dst; + entry->src = inst->src[0]; + entry->size_written = inst->size_written; + entry->size_read = inst->size_read(0); + entry->opcode = inst->opcode; + entry->saturate = inst->saturate; + acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry); + } else if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD && + inst->dst.file == VGRF) { + int offset = 0; + for (int i = 0; i < inst->sources; i++) { + int effective_width = i < inst->header_size ? 
8 : inst->exec_size; + assert(effective_width * type_sz(inst->src[i].type) % REG_SIZE == 0); + const unsigned size_written = effective_width * + type_sz(inst->src[i].type); + if (inst->src[i].file == VGRF) { + acp_entry *entry = rzalloc(copy_prop_ctx, acp_entry); + entry->dst = byte_offset(inst->dst, offset); + entry->src = inst->src[i]; + entry->size_written = size_written; + entry->size_read = inst->size_read(i); + entry->opcode = inst->opcode; + if (!entry->dst.equals(inst->src[i])) { + acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry); + } else { + ralloc_free(entry); + } + } + offset += size_written; + } + } + } + + return progress; +} + +bool +fs_visitor::opt_copy_propagation() +{ + bool progress = false; + void *copy_prop_ctx = ralloc_context(NULL); + exec_list *out_acp[cfg->num_blocks]; + + for (int i = 0; i < cfg->num_blocks; i++) + out_acp[i] = new exec_list [ACP_HASH_SIZE]; + + /* First, walk through each block doing local copy propagation and getting + * the set of copies available at the end of the block. + */ + foreach_block (block, cfg) { + progress = opt_copy_propagation_local(copy_prop_ctx, block, + out_acp[block->num]) || progress; + } + + /* Do dataflow analysis for those available copies. */ + fs_copy_prop_dataflow dataflow(copy_prop_ctx, cfg, out_acp); + + /* Next, re-run local copy propagation, this time with the set of copies + * provided by the dataflow analysis available at the start of a block. 
+ */ + foreach_block (block, cfg) { + exec_list in_acp[ACP_HASH_SIZE]; + + for (int i = 0; i < dataflow.num_acp; i++) { + if (BITSET_TEST(dataflow.bd[block->num].livein, i)) { + struct acp_entry *entry = dataflow.acp[i]; + in_acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry); + } + } + + progress = opt_copy_propagation_local(copy_prop_ctx, block, in_acp) || + progress; + } + + for (int i = 0; i < cfg->num_blocks; i++) + delete [] out_acp[i]; + ralloc_free(copy_prop_ctx); + + if (progress) + invalidate_live_intervals(); + + return progress; +} diff --git a/src/intel/compiler/brw_fs_cse.cpp b/src/intel/compiler/brw_fs_cse.cpp new file mode 100644 index 00000000000..48220efd730 --- /dev/null +++ b/src/intel/compiler/brw_fs_cse.cpp @@ -0,0 +1,380 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "brw_fs.h" +#include "brw_cfg.h" + +/** @file brw_fs_cse.cpp + * + * Support for local common subexpression elimination. + * + * See Muchnick's Advanced Compiler Design and Implementation, section + * 13.1 (p378). + */ + +using namespace brw; + +namespace { +struct aeb_entry : public exec_node { + /** The instruction that generates the expression value. */ + fs_inst *generator; + + /** The temporary where the value is stored. */ + fs_reg tmp; +}; +} + +static bool +is_expression(const fs_visitor *v, const fs_inst *const inst) +{ + switch (inst->opcode) { + case BRW_OPCODE_MOV: + case BRW_OPCODE_SEL: + case BRW_OPCODE_NOT: + case BRW_OPCODE_AND: + case BRW_OPCODE_OR: + case BRW_OPCODE_XOR: + case BRW_OPCODE_SHR: + case BRW_OPCODE_SHL: + case BRW_OPCODE_ASR: + case BRW_OPCODE_CMP: + case BRW_OPCODE_CMPN: + case BRW_OPCODE_ADD: + case BRW_OPCODE_MUL: + case SHADER_OPCODE_MULH: + case BRW_OPCODE_FRC: + case BRW_OPCODE_RNDU: + case BRW_OPCODE_RNDD: + case BRW_OPCODE_RNDE: + case BRW_OPCODE_RNDZ: + case BRW_OPCODE_LINE: + case BRW_OPCODE_PLN: + case BRW_OPCODE_MAD: + case BRW_OPCODE_LRP: + case FS_OPCODE_FB_READ_LOGICAL: + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7: + case FS_OPCODE_CINTERP: + case FS_OPCODE_LINTERP: + case SHADER_OPCODE_FIND_LIVE_CHANNEL: + case SHADER_OPCODE_BROADCAST: + case SHADER_OPCODE_MOV_INDIRECT: + case SHADER_OPCODE_TEX_LOGICAL: + case SHADER_OPCODE_TXD_LOGICAL: + case SHADER_OPCODE_TXF_LOGICAL: + case SHADER_OPCODE_TXL_LOGICAL: + case SHADER_OPCODE_TXS_LOGICAL: + case FS_OPCODE_TXB_LOGICAL: + case SHADER_OPCODE_TXF_CMS_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + case SHADER_OPCODE_TXF_UMS_LOGICAL: + case SHADER_OPCODE_TXF_MCS_LOGICAL: + case SHADER_OPCODE_LOD_LOGICAL: + case SHADER_OPCODE_TG4_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + case FS_OPCODE_PACK: + return true; + case SHADER_OPCODE_RCP: + case 
SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_POW: + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + return inst->mlen < 2; + case SHADER_OPCODE_LOAD_PAYLOAD: + return !inst->is_copy_payload(v->alloc); + default: + return inst->is_send_from_grf() && !inst->has_side_effects() && + !inst->is_volatile(); + } +} + +static bool +operands_match(const fs_inst *a, const fs_inst *b, bool *negate) +{ + fs_reg *xs = a->src; + fs_reg *ys = b->src; + + if (a->opcode == BRW_OPCODE_MAD) { + return xs[0].equals(ys[0]) && + ((xs[1].equals(ys[1]) && xs[2].equals(ys[2])) || + (xs[2].equals(ys[1]) && xs[1].equals(ys[2]))); + } else if (a->opcode == BRW_OPCODE_MUL && a->dst.type == BRW_REGISTER_TYPE_F) { + bool xs0_negate = xs[0].negate; + bool xs1_negate = xs[1].file == IMM ? xs[1].f < 0.0f + : xs[1].negate; + bool ys0_negate = ys[0].negate; + bool ys1_negate = ys[1].file == IMM ? ys[1].f < 0.0f + : ys[1].negate; + float xs1_imm = xs[1].f; + float ys1_imm = ys[1].f; + + xs[0].negate = false; + xs[1].negate = false; + ys[0].negate = false; + ys[1].negate = false; + xs[1].f = fabsf(xs[1].f); + ys[1].f = fabsf(ys[1].f); + + bool ret = (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) || + (xs[1].equals(ys[0]) && xs[0].equals(ys[1])); + + xs[0].negate = xs0_negate; + xs[1].negate = xs[1].file == IMM ? false : xs1_negate; + ys[0].negate = ys0_negate; + ys[1].negate = ys[1].file == IMM ? 
false : ys1_negate; + xs[1].f = xs1_imm; + ys[1].f = ys1_imm; + + *negate = (xs0_negate != xs1_negate) != (ys0_negate != ys1_negate); + if (*negate && (a->saturate || b->saturate)) + return false; + return ret; + } else if (!a->is_commutative()) { + bool match = true; + for (int i = 0; i < a->sources; i++) { + if (!xs[i].equals(ys[i])) { + match = false; + break; + } + } + return match; + } else { + return (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) || + (xs[1].equals(ys[0]) && xs[0].equals(ys[1])); + } +} + +static bool +instructions_match(fs_inst *a, fs_inst *b, bool *negate) +{ + return a->opcode == b->opcode && + a->force_writemask_all == b->force_writemask_all && + a->exec_size == b->exec_size && + a->group == b->group && + a->saturate == b->saturate && + a->predicate == b->predicate && + a->predicate_inverse == b->predicate_inverse && + a->conditional_mod == b->conditional_mod && + a->flag_subreg == b->flag_subreg && + a->dst.type == b->dst.type && + a->offset == b->offset && + a->mlen == b->mlen && + a->size_written == b->size_written && + a->base_mrf == b->base_mrf && + a->eot == b->eot && + a->header_size == b->header_size && + a->shadow_compare == b->shadow_compare && + a->pi_noperspective == b->pi_noperspective && + a->target == b->target && + a->sources == b->sources && + operands_match(a, b, negate); +} + +static void +create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate) +{ + unsigned written = regs_written(inst); + unsigned dst_width = + DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE); + fs_inst *copy; + + if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD || + written != dst_width) { + fs_reg *payload; + int sources, header_size; + if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) { + sources = inst->sources; + header_size = inst->header_size; + } else { + assert(written % dst_width == 0); + sources = written / dst_width; + header_size = 0; + } + + assert(src.file == VGRF); + payload = 
ralloc_array(bld.shader->mem_ctx, fs_reg, sources); + for (int i = 0; i < header_size; i++) { + payload[i] = src; + src.offset += REG_SIZE; + } + for (int i = header_size; i < sources; i++) { + payload[i] = src; + src = offset(src, bld, 1); + } + copy = bld.LOAD_PAYLOAD(inst->dst, payload, sources, header_size); + } else { + copy = bld.MOV(inst->dst, src); + copy->group = inst->group; + copy->force_writemask_all = inst->force_writemask_all; + copy->src[0].negate = negate; + } + assert(regs_written(copy) == written); +} + +bool +fs_visitor::opt_cse_local(bblock_t *block) +{ + bool progress = false; + exec_list aeb; + + void *cse_ctx = ralloc_context(NULL); + + int ip = block->start_ip; + foreach_inst_in_block(fs_inst, inst, block) { + /* Skip some cases. */ + if (is_expression(this, inst) && !inst->is_partial_write() && + ((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) || + inst->dst.is_null())) + { + bool found = false; + bool negate = false; + + foreach_in_list_use_after(aeb_entry, entry, &aeb) { + /* Match current instruction's expression against those in AEB. */ + if (!(entry->generator->dst.is_null() && !inst->dst.is_null()) && + instructions_match(inst, entry->generator, &negate)) { + found = true; + progress = true; + break; + } + } + + if (!found) { + if (inst->opcode != BRW_OPCODE_MOV || + (inst->opcode == BRW_OPCODE_MOV && + inst->src[0].file == IMM && + inst->src[0].type == BRW_REGISTER_TYPE_VF)) { + /* Our first sighting of this expression. Create an entry. */ + aeb_entry *entry = ralloc(cse_ctx, aeb_entry); + entry->tmp = reg_undef; + entry->generator = inst; + aeb.push_tail(entry); + } + } else { + /* This is at least our second sighting of this expression. + * If we don't have a temporary already, make one. 
+ */ + bool no_existing_temp = entry->tmp.file == BAD_FILE; + if (no_existing_temp && !entry->generator->dst.is_null()) { + const fs_builder ibld = fs_builder(this, block, entry->generator) + .at(block, entry->generator->next); + int written = regs_written(entry->generator); + + entry->tmp = fs_reg(VGRF, alloc.allocate(written), + entry->generator->dst.type); + + create_copy_instr(ibld, entry->generator, entry->tmp, false); + + entry->generator->dst = entry->tmp; + } + + /* dest <- temp */ + if (!inst->dst.is_null()) { + assert(inst->size_written == entry->generator->size_written); + assert(inst->dst.type == entry->tmp.type); + const fs_builder ibld(this, block, inst); + + create_copy_instr(ibld, inst, entry->tmp, negate); + } + + /* Set our iterator so that next time through the loop inst->next + * will get the instruction in the basic block after the one we've + * removed. + */ + fs_inst *prev = (fs_inst *)inst->prev; + + inst->remove(block); + inst = prev; + } + } + + foreach_in_list_safe(aeb_entry, entry, &aeb) { + /* Kill all AEB entries that write a different value to or read from + * the flag register if we just wrote it. + */ + if (inst->flags_written()) { + bool negate; /* dummy */ + if (entry->generator->flags_read(devinfo) || + (entry->generator->flags_written() && + !instructions_match(inst, entry->generator, &negate))) { + entry->remove(); + ralloc_free(entry); + continue; + } + } + + for (int i = 0; i < entry->generator->sources; i++) { + fs_reg *src_reg = &entry->generator->src[i]; + + /* Kill all AEB entries that use the destination we just + * overwrote. + */ + if (regions_overlap(inst->dst, inst->size_written, + entry->generator->src[i], + entry->generator->size_read(i))) { + entry->remove(); + ralloc_free(entry); + break; + } + + /* Kill any AEB entries using registers that don't get reused any + * more -- a sure sign they'll fail operands_match(). 
+ */ + if (src_reg->file == VGRF && virtual_grf_end[src_reg->nr] < ip) { + entry->remove(); + ralloc_free(entry); + break; + } + } + } + + ip++; + } + + ralloc_free(cse_ctx); + + return progress; +} + +bool +fs_visitor::opt_cse() +{ + bool progress = false; + + calculate_live_intervals(); + + foreach_block (block, cfg) { + progress = opt_cse_local(block) || progress; + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} diff --git a/src/intel/compiler/brw_fs_dead_code_eliminate.cpp b/src/intel/compiler/brw_fs_dead_code_eliminate.cpp new file mode 100644 index 00000000000..7adb4278919 --- /dev/null +++ b/src/intel/compiler/brw_fs_dead_code_eliminate.cpp @@ -0,0 +1,148 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "brw_fs.h" +#include "brw_fs_live_variables.h" +#include "brw_cfg.h" + +/** @file brw_fs_dead_code_eliminate.cpp + * + * Dataflow-aware dead code elimination. + * + * Walks the instruction list from the bottom, removing instructions that + * have results that both aren't used in later blocks and haven't been read + * yet in the tail end of this block. + */ + +/** + * Is it safe to eliminate the instruction? + */ +static bool +can_eliminate(const fs_inst *inst, BITSET_WORD *flag_live) +{ + return !inst->is_control_flow() && + !inst->has_side_effects() && + !(flag_live[0] & inst->flags_written()) && + !inst->writes_accumulator; +} + +/** + * Is it safe to omit the write, making the destination ARF null? + */ +static bool +can_omit_write(const fs_inst *inst) +{ + switch (inst->opcode) { + case SHADER_OPCODE_UNTYPED_ATOMIC: + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_TYPED_ATOMIC: + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: + return true; + default: + /* We can eliminate the destination write for ordinary instructions, + * but not most SENDs. + */ + if (inst->opcode < 128 && inst->mlen == 0) + return true; + + /* It might not be safe for other virtual opcodes. 
*/ + return false; + } +} + +bool +fs_visitor::dead_code_eliminate() +{ + bool progress = false; + + calculate_live_intervals(); + + int num_vars = live_intervals->num_vars; + BITSET_WORD *live = rzalloc_array(NULL, BITSET_WORD, BITSET_WORDS(num_vars)); + BITSET_WORD *flag_live = rzalloc_array(NULL, BITSET_WORD, 1); + + foreach_block_reverse_safe(block, cfg) { + memcpy(live, live_intervals->block_data[block->num].liveout, + sizeof(BITSET_WORD) * BITSET_WORDS(num_vars)); + memcpy(flag_live, live_intervals->block_data[block->num].flag_liveout, + sizeof(BITSET_WORD)); + + foreach_inst_in_block_reverse_safe(fs_inst, inst, block) { + if (inst->dst.file == VGRF) { + const unsigned var = live_intervals->var_from_reg(inst->dst); + bool result_live = false; + + for (unsigned i = 0; i < regs_written(inst); i++) + result_live |= BITSET_TEST(live, var + i); + + if (!result_live && + (can_omit_write(inst) || can_eliminate(inst, flag_live))) { + inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type)); + progress = true; + } + } + + if (inst->dst.is_null() && can_eliminate(inst, flag_live)) { + inst->opcode = BRW_OPCODE_NOP; + progress = true; + } + + if (inst->dst.file == VGRF) { + if (!inst->is_partial_write()) { + int var = live_intervals->var_from_reg(inst->dst); + for (unsigned i = 0; i < regs_written(inst); i++) { + BITSET_CLEAR(live, var + i); + } + } + } + + if (!inst->predicate && inst->exec_size >= 8) + flag_live[0] &= ~inst->flags_written(); + + if (inst->opcode == BRW_OPCODE_NOP) { + inst->remove(block); + continue; + } + + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF) { + int var = live_intervals->var_from_reg(inst->src[i]); + + for (unsigned j = 0; j < regs_read(inst, i); j++) { + BITSET_SET(live, var + j); + } + } + } + + flag_live[0] |= inst->flags_read(devinfo); + } + } + + ralloc_free(live); + ralloc_free(flag_live); + + if (progress) + invalidate_live_intervals(); + + return progress; +} diff --git 
a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp new file mode 100644 index 00000000000..aeed6a11977 --- /dev/null +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -0,0 +1,2126 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_fs_generator.cpp + * + * This file supports generating code from the FS LIR to the actual + * native instructions. 
+ */ + +#include "brw_eu.h" +#include "brw_fs.h" +#include "brw_cfg.h" + +static enum brw_reg_file +brw_file_from_reg(fs_reg *reg) +{ + switch (reg->file) { + case ARF: + return BRW_ARCHITECTURE_REGISTER_FILE; + case FIXED_GRF: + case VGRF: + return BRW_GENERAL_REGISTER_FILE; + case MRF: + return BRW_MESSAGE_REGISTER_FILE; + case IMM: + return BRW_IMMEDIATE_VALUE; + case BAD_FILE: + case ATTR: + case UNIFORM: + unreachable("not reached"); + } + return BRW_ARCHITECTURE_REGISTER_FILE; +} + +static struct brw_reg +brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen, bool compressed) +{ + struct brw_reg brw_reg; + + switch (reg->file) { + case MRF: + assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(gen)); + /* Fallthrough */ + case VGRF: + if (reg->stride == 0) { + brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0); + } else { + /* From the Haswell PRM: + * + * "VertStride must be used to cross GRF register boundaries. This + * rule implies that elements within a 'Width' cannot cross GRF + * boundaries." + * + * The maximum width value that could satisfy this restriction is: + */ + const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type)); + + /* Because the hardware can only split source regions at a whole + * multiple of width during decompression (i.e. vertically), clamp + * the value obtained above to the physical execution size of a + * single decompressed chunk of the instruction: + */ + const unsigned phys_width = compressed ? inst->exec_size / 2 : + inst->exec_size; + + /* XXX - The equation above is strictly speaking not correct on + * hardware that supports unbalanced GRF writes -- On Gen9+ + * each decompressed chunk of the instruction may have a + * different execution size when the number of components + * written to each destination GRF is not the same. 
+ */ + const unsigned width = MIN2(reg_width, phys_width); + brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0); + brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride); + } + + brw_reg = retype(brw_reg, reg->type); + brw_reg = byte_offset(brw_reg, reg->offset); + brw_reg.abs = reg->abs; + brw_reg.negate = reg->negate; + break; + case ARF: + case FIXED_GRF: + case IMM: + assert(reg->offset == 0); + brw_reg = reg->as_brw_reg(); + break; + case BAD_FILE: + /* Probably unused. */ + brw_reg = brw_null_reg(); + break; + case ATTR: + case UNIFORM: + unreachable("not reached"); + } + + return brw_reg; +} + +fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const void *key, + struct brw_stage_prog_data *prog_data, + unsigned promoted_constants, + bool runtime_check_aads_emit, + gl_shader_stage stage) + + : compiler(compiler), log_data(log_data), + devinfo(compiler->devinfo), key(key), + prog_data(prog_data), + promoted_constants(promoted_constants), + runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false), + stage(stage), mem_ctx(mem_ctx) +{ + p = rzalloc(mem_ctx, struct brw_codegen); + brw_init_codegen(devinfo, p, mem_ctx); +} + +fs_generator::~fs_generator() +{ +} + +class ip_record : public exec_node { +public: + DECLARE_RALLOC_CXX_OPERATORS(ip_record) + + ip_record(int ip) + { + this->ip = ip; + } + + int ip; +}; + +bool +fs_generator::patch_discard_jumps_to_fb_writes() +{ + if (devinfo->gen < 6 || this->discard_halt_patches.is_empty()) + return false; + + int scale = brw_jump_scale(p->devinfo); + + /* There is a somewhat strange undocumented requirement of using + * HALT, according to the simulator. If some channel has HALTed to + * a particular UIP, then by the end of the program, every channel + * must have HALTed to that UIP. Furthermore, the tracking is a + * stack, so you can't do the final halt of a UIP after starting + * halting to a new UIP. 
+ * + * Symptoms of not emitting this instruction on actual hardware + * included GPU hangs and sparkly rendering on the piglit discard + * tests. + */ + brw_inst *last_halt = gen6_HALT(p); + brw_inst_set_uip(p->devinfo, last_halt, 1 * scale); + brw_inst_set_jip(p->devinfo, last_halt, 1 * scale); + + int ip = p->nr_insn; + + foreach_in_list(ip_record, patch_ip, &discard_halt_patches) { + brw_inst *patch = &p->store[patch_ip->ip]; + + assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT); + /* HALT takes a half-instruction distance from the pre-incremented IP. */ + brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale); + } + + this->discard_halt_patches.make_empty(); + return true; +} + +void +fs_generator::fire_fb_write(fs_inst *inst, + struct brw_reg payload, + struct brw_reg implied_header, + GLuint nr) +{ + uint32_t msg_control; + + struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + + if (devinfo->gen < 6) { + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0)); + brw_pop_insn_state(p); + } + + if (inst->opcode == FS_OPCODE_REP_FB_WRITE) + msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED; + else if (prog_data->dual_src_blend) { + if (!inst->group) + msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01; + else + msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23; + } else if (inst->exec_size == 16) + msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; + else + msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01; + + uint32_t surf_index = + prog_data->binding_table.render_target_start + inst->target; + + bool last_render_target = inst->eot || + 
(prog_data->dual_src_blend && dispatch_width == 16); + + + brw_fb_WRITE(p, + payload, + implied_header, + msg_control, + surf_index, + nr, + 0, + inst->eot, + last_render_target, + inst->header_size != 0); + + brw_mark_surface_used(&prog_data->base, surf_index); +} + +void +fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload) +{ + struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key; + struct brw_reg implied_header; + + if (devinfo->gen < 8 && !devinfo->is_haswell) { + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + } + + if (inst->base_mrf >= 0) + payload = brw_message_reg(inst->base_mrf); + + /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied + * move, here's g1. + */ + if (inst->header_size != 0) { + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_default_flag_reg(p, 0, 0); + + /* On HSW, the GPU will use the predicate on SENDC, unless the header is + * present. + */ + if (prog_data->uses_kill) { + struct brw_reg pixel_mask; + + if (devinfo->gen >= 6) + pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); + else + pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); + + brw_MOV(p, pixel_mask, brw_flag_reg(0, 1)); + } + + if (devinfo->gen >= 6) { + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_16); + brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); + brw_MOV(p, + retype(payload, BRW_REGISTER_TYPE_UD), + retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + brw_pop_insn_state(p); + + if (inst->target > 0 && key->replicate_alpha) { + /* Set "Source0 Alpha Present to RenderTarget" bit in message + * header. 
+ */ + brw_OR(p, + vec1(retype(payload, BRW_REGISTER_TYPE_UD)), + vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)), + brw_imm_ud(0x1 << 11)); + } + + if (inst->target > 0) { + /* Set the render target index for choosing BLEND_STATE. */ + brw_MOV(p, retype(vec1(suboffset(payload, 2)), + BRW_REGISTER_TYPE_UD), + brw_imm_ud(inst->target)); + } + + /* Set computes stencil to render target */ + if (prog_data->computed_stencil) { + brw_OR(p, + vec1(retype(payload, BRW_REGISTER_TYPE_UD)), + vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)), + brw_imm_ud(0x1 << 14)); + } + + implied_header = brw_null_reg(); + } else { + implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); + } + + brw_pop_insn_state(p); + } else { + implied_header = brw_null_reg(); + } + + if (!runtime_check_aads_emit) { + fire_fb_write(inst, payload, implied_header, inst->mlen); + } else { + /* This can only happen in gen < 6 */ + assert(devinfo->gen < 6); + + struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD)); + + /* Check runtime bit to detect if we have to send AA data or not */ + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + brw_AND(p, + v1_null_ud, + retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD), + brw_imm_ud(1<<26)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + + int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store; + brw_inst_set_exec_size(p->devinfo, brw_last_inst, BRW_EXECUTE_1); + { + /* Don't send AA data */ + fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1); + } + brw_land_fwd_jump(p, jmp); + fire_fb_write(inst, payload, implied_header, inst->mlen); + } +} + +void +fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst, + struct brw_reg payload) +{ + assert(inst->size_written % REG_SIZE == 0); + struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + const unsigned surf_index = + prog_data->binding_table.render_target_start 
+ inst->target; + + gen9_fb_READ(p, dst, payload, surf_index, + inst->header_size, inst->size_written / REG_SIZE, + prog_data->persample_dispatch); + + brw_mark_surface_used(&prog_data->base, surf_index); +} + +void +fs_generator::generate_mov_indirect(fs_inst *inst, + struct brw_reg dst, + struct brw_reg reg, + struct brw_reg indirect_byte_offset) +{ + assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD); + assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE); + + unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr; + + if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) { + imm_byte_offset += indirect_byte_offset.ud; + + reg.nr = imm_byte_offset / REG_SIZE; + reg.subnr = imm_byte_offset % REG_SIZE; + brw_MOV(p, dst, reg); + } else { + /* Prior to Broadwell, there are only 8 address registers. */ + assert(inst->exec_size == 8 || devinfo->gen >= 8); + + /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */ + struct brw_reg addr = vec8(brw_address_reg(0)); + + /* The destination stride of an instruction (in bytes) must be greater + * than or equal to the size of the rest of the instruction. Since the + * address register is of type UW, we can't use a D-type instruction. + * In order to get around this, we retype to UW and use a stride. + */ + indirect_byte_offset = + retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW); + + /* There are a number of reasons why we don't use the base offset here. + * One reason is that the field is only 9 bits which means we can only + * use it to access the first 16 GRFs. Also, from the Haswell PRM + * section "Register Region Restrictions": + * + * "The lower bits of the AddressImmediate must not overflow to + * change the register address. The lower 5 bits of Address + * Immediate when added to lower 5 bits of address register gives + * the sub-register offset. The upper bits of Address Immediate + * when added to upper bits of address register gives the register + * address. 
Any overflow from sub-register offset is dropped." + * + * Since the indirect may cause us to cross a register boundary, this + * makes the base offset almost useless. We could try and do something + * clever where we use an actual base offset if base_offset % 32 == 0 but + * that would mean we were generating different code depending on the + * base offset. Instead, for the sake of consistency, we'll just do the + * add ourselves. This restriction is only listed in the Haswell PRM + * but empirical testing indicates that it applies on all older + * generations and is lifted on Broadwell. + * + * In the end, while base_offset is nice to look at in the generated + * code, using it saves us 0 instructions and would require quite a bit + * of case-by-case work. It's just not worth it. + */ + brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset)); + struct brw_reg ind_src = brw_VxH_indirect(0, 0); + + brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type)); + + if (devinfo->gen == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE && + !inst->get_next()->is_tail_sentinel() && + ((fs_inst *)inst->get_next())->mlen > 0) { + /* From the Sandybridge PRM: + * + * "[Errata: DevSNB(SNB)] If MRF register is updated by any + * instruction that “indexed/indirect” source AND is followed by a + * send, the instruction requires a “Switch”. This is to avoid + * race condition where send may dispatch before MRF is updated." 
+ */ + brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH); + } + } +} + +void +fs_generator::generate_urb_read(fs_inst *inst, + struct brw_reg dst, + struct brw_reg header) +{ + assert(inst->size_written % REG_SIZE == 0); + assert(header.file == BRW_GENERAL_REGISTER_FILE); + assert(header.type == BRW_REGISTER_TYPE_UD); + + brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD)); + brw_set_src0(p, send, header); + brw_set_src1(p, send, brw_imm_ud(0u)); + + brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB); + brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ); + + if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT) + brw_inst_set_urb_per_slot_offset(p->devinfo, send, true); + + brw_inst_set_mlen(p->devinfo, send, inst->mlen); + brw_inst_set_rlen(p->devinfo, send, inst->size_written / REG_SIZE); + brw_inst_set_header_present(p->devinfo, send, true); + brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset); +} + +void +fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload) +{ + brw_inst *insn; + + insn = brw_next_insn(p, BRW_OPCODE_SEND); + + brw_set_dest(p, insn, brw_null_reg()); + brw_set_src0(p, insn, payload); + brw_set_src1(p, insn, brw_imm_d(0)); + + brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB); + brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE); + + if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT || + inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) + brw_inst_set_urb_per_slot_offset(p->devinfo, insn, true); + + if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED || + inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) + brw_inst_set_urb_channel_mask_present(p->devinfo, insn, true); + + brw_inst_set_mlen(p->devinfo, insn, inst->mlen); + brw_inst_set_rlen(p->devinfo, insn, 0); + brw_inst_set_eot(p->devinfo, insn, inst->eot); + brw_inst_set_header_present(p->devinfo, insn, true); + 
brw_inst_set_urb_global_offset(p->devinfo, insn, inst->offset); +} + +void +fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload) +{ + struct brw_inst *insn; + + insn = brw_next_insn(p, BRW_OPCODE_SEND); + + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW)); + brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW)); + brw_set_src1(p, insn, brw_imm_d(0)); + + /* Terminate a compute shader by sending a message to the thread spawner. + */ + brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER); + brw_inst_set_mlen(devinfo, insn, 1); + brw_inst_set_rlen(devinfo, insn, 0); + brw_inst_set_eot(devinfo, insn, inst->eot); + brw_inst_set_header_present(devinfo, insn, false); + + brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */ + brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */ + + /* Note that even though the thread has a URB resource associated with it, + * we set the "do not dereference URB" bit, because the URB resource is + * managed by the fixed-function unit, so it will free it automatically. 
+ */ + brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */ + + brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); +} + +void +fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src) +{ + brw_barrier(p, src); + brw_WAIT(p); +} + +void +fs_generator::generate_linterp(fs_inst *inst, + struct brw_reg dst, struct brw_reg *src) +{ + /* PLN reads: + * / in SIMD16 \ + * ----------------------------------- + * | src1+0 | src1+1 | src1+2 | src1+3 | + * |-----------------------------------| + * |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)| + * ----------------------------------- + * + * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys: + * + * ----------------------------------- + * | src1+0 | src1+1 | src1+2 | src1+3 | + * |-----------------------------------| + * |(x0, x1)|(y0, y1)| | | in SIMD8 + * |-----------------------------------| + * |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16 + * ----------------------------------- + * + * See also: emit_interpolation_setup_gen4(). 
+ */ + struct brw_reg delta_x = src[0]; + struct brw_reg delta_y = offset(src[0], inst->exec_size / 8); + struct brw_reg interp = src[1]; + + if (devinfo->has_pln && + (devinfo->gen >= 7 || (delta_x.nr & 1) == 0)) { + brw_PLN(p, dst, interp, delta_x); + } else { + brw_LINE(p, brw_null_reg(), interp, delta_x); + brw_MAC(p, dst, suboffset(interp, 1), delta_y); + } +} + +void +fs_generator::generate_get_buffer_size(fs_inst *inst, + struct brw_reg dst, + struct brw_reg src, + struct brw_reg surf_index) +{ + assert(devinfo->gen >= 7); + assert(surf_index.file == BRW_IMMEDIATE_VALUE); + + uint32_t simd_mode; + int rlen = 4; + + switch (inst->exec_size) { + case 8: + simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; + break; + case 16: + simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; + break; + default: + unreachable("Invalid width for texture instruction"); + } + + if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) { + rlen = 8; + dst = vec16(dst); + } + + brw_SAMPLE(p, + retype(dst, BRW_REGISTER_TYPE_UW), + inst->base_mrf, + src, + surf_index.ud, + 0, + GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO, + rlen, /* response length */ + inst->mlen, + inst->header_size > 0, + simd_mode, + BRW_SAMPLER_RETURN_FORMAT_SINT32); + + brw_mark_surface_used(prog_data, surf_index.ud); +} + +void +fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src, + struct brw_reg surface_index, + struct brw_reg sampler_index) +{ + assert(inst->size_written % REG_SIZE == 0); + int msg_type = -1; + uint32_t simd_mode; + uint32_t return_format; + bool is_combined_send = inst->eot; + + switch (dst.type) { + case BRW_REGISTER_TYPE_D: + return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32; + break; + case BRW_REGISTER_TYPE_UD: + return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32; + break; + default: + return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32; + break; + } + + /* Stomp the resinfo output type to UINT32. On gens 4-5, the output type + * is set as part of the message descriptor. 
On gen4, the PRM seems to + * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on + * later gens UINT32 is required. Once you hit Sandy Bridge, the bit is + * gone from the message descriptor entirely and you just get UINT32 all + * the time regardless. Since we can really only do non-UINT32 on gen4, + * just stomp it to UINT32 all the time. + */ + if (inst->opcode == SHADER_OPCODE_TXS) + return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32; + + switch (inst->exec_size) { + case 8: + simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; + break; + case 16: + simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; + break; + default: + unreachable("Invalid width for texture instruction"); + } + + if (devinfo->gen >= 5) { + switch (inst->opcode) { + case SHADER_OPCODE_TEX: + if (inst->shadow_compare) { + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE; + } else { + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE; + } + break; + case FS_OPCODE_TXB: + if (inst->shadow_compare) { + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE; + } else { + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS; + } + break; + case SHADER_OPCODE_TXL: + if (inst->shadow_compare) { + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE; + } else { + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD; + } + break; + case SHADER_OPCODE_TXL_LZ: + assert(devinfo->gen >= 9); + if (inst->shadow_compare) { + msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ; + } else { + msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LZ; + } + break; + case SHADER_OPCODE_TXS: + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO; + break; + case SHADER_OPCODE_TXD: + if (inst->shadow_compare) { + /* Gen7.5+. 
Otherwise, lowered in NIR */ + assert(devinfo->gen >= 8 || devinfo->is_haswell); + msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE; + } else { + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS; + } + break; + case SHADER_OPCODE_TXF: + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; + break; + case SHADER_OPCODE_TXF_LZ: + assert(devinfo->gen >= 9); + msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ; + break; + case SHADER_OPCODE_TXF_CMS_W: + assert(devinfo->gen >= 9); + msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W; + break; + case SHADER_OPCODE_TXF_CMS: + if (devinfo->gen >= 7) + msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS; + else + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; + break; + case SHADER_OPCODE_TXF_UMS: + assert(devinfo->gen >= 7); + msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS; + break; + case SHADER_OPCODE_TXF_MCS: + assert(devinfo->gen >= 7); + msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS; + break; + case SHADER_OPCODE_LOD: + msg_type = GEN5_SAMPLER_MESSAGE_LOD; + break; + case SHADER_OPCODE_TG4: + if (inst->shadow_compare) { + assert(devinfo->gen >= 7); + msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C; + } else { + assert(devinfo->gen >= 6); + msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4; + } + break; + case SHADER_OPCODE_TG4_OFFSET: + assert(devinfo->gen >= 7); + if (inst->shadow_compare) { + msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C; + } else { + msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO; + } + break; + case SHADER_OPCODE_SAMPLEINFO: + msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO; + break; + default: + unreachable("not reached"); + } + } else { + switch (inst->opcode) { + case SHADER_OPCODE_TEX: + /* Note that G45 and older determines shadow compare and dispatch width + * from message length for most messages. 
+ */ + if (inst->exec_size == 8) { + msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE; + if (inst->shadow_compare) { + assert(inst->mlen == 6); + } else { + assert(inst->mlen <= 4); + } + } else { + if (inst->shadow_compare) { + msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE; + assert(inst->mlen == 9); + } else { + msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE; + assert(inst->mlen <= 7 && inst->mlen % 2 == 1); + } + } + break; + case FS_OPCODE_TXB: + if (inst->shadow_compare) { + assert(inst->exec_size == 8); + assert(inst->mlen == 6); + msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE; + } else { + assert(inst->mlen == 9); + msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS; + simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; + } + break; + case SHADER_OPCODE_TXL: + if (inst->shadow_compare) { + assert(inst->exec_size == 8); + assert(inst->mlen == 6); + msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE; + } else { + assert(inst->mlen == 9); + msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD; + simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; + } + break; + case SHADER_OPCODE_TXD: + /* There is no sample_d_c message; comparisons are done manually */ + assert(inst->exec_size == 8); + assert(inst->mlen == 7 || inst->mlen == 10); + msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS; + break; + case SHADER_OPCODE_TXF: + assert(inst->mlen <= 9 && inst->mlen % 2 == 1); + msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD; + simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; + break; + case SHADER_OPCODE_TXS: + assert(inst->mlen == 3); + msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO; + simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; + break; + default: + unreachable("not reached"); + } + } + assert(msg_type != -1); + + if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) { + dst = vec16(dst); + } + + assert(devinfo->gen < 7 || inst->header_size == 0 || + src.file == BRW_GENERAL_REGISTER_FILE); + + assert(sampler_index.type == BRW_REGISTER_TYPE_UD); + + /* Load the message header if present. 
If there's a texture offset, + * we need to set it up explicitly and load the offset bitfield. + * Otherwise, we can use an implied move from g0 to the first message reg. + */ + if (inst->header_size != 0) { + if (devinfo->gen < 6 && !inst->offset) { + /* Set up an implied move from g0 to the MRF. */ + src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); + } else { + struct brw_reg header_reg; + + if (devinfo->gen >= 7) { + header_reg = src; + } else { + assert(inst->base_mrf != -1); + header_reg = brw_message_reg(inst->base_mrf); + } + + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + /* Explicitly set up the message header by copying g0 to the MRF. */ + brw_MOV(p, header_reg, brw_vec8_grf(0, 0)); + + if (inst->offset) { + /* Set the offset bits in DWord 2. */ + brw_MOV(p, get_element_ud(header_reg, 2), + brw_imm_ud(inst->offset)); + } else if (stage != MESA_SHADER_VERTEX && + stage != MESA_SHADER_FRAGMENT) { + /* The vertex and fragment stages have g0.2 set to 0, so + * header0.2 is 0 when g0 is copied. Other stages may not, so we + * must set it to 0 to avoid setting undesirable bits in the + * message. + */ + brw_MOV(p, get_element_ud(header_reg, 2), brw_imm_ud(0)); + } + + brw_adjust_sampler_state_pointer(p, header_reg, sampler_index); + brw_pop_insn_state(p); + } + } + + uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 || + inst->opcode == SHADER_OPCODE_TG4_OFFSET) + ? 
prog_data->binding_table.gather_texture_start + : prog_data->binding_table.texture_start; + + if (surface_index.file == BRW_IMMEDIATE_VALUE && + sampler_index.file == BRW_IMMEDIATE_VALUE) { + uint32_t surface = surface_index.ud; + uint32_t sampler = sampler_index.ud; + + brw_SAMPLE(p, + retype(dst, BRW_REGISTER_TYPE_UW), + inst->base_mrf, + src, + surface + base_binding_table_index, + sampler % 16, + msg_type, + inst->size_written / REG_SIZE, + inst->mlen, + inst->header_size != 0, + simd_mode, + return_format); + + brw_mark_surface_used(prog_data, surface + base_binding_table_index); + } else { + /* Non-const sampler index */ + + struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); + struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD)); + struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD)); + + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_access_mode(p, BRW_ALIGN_1); + + if (brw_regs_equal(&surface_reg, &sampler_reg)) { + brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101)); + } else { + if (sampler_reg.file == BRW_IMMEDIATE_VALUE) { + brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8)); + } else { + brw_SHL(p, addr, sampler_reg, brw_imm_ud(8)); + brw_OR(p, addr, addr, surface_reg); + } + } + if (base_binding_table_index) + brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index)); + brw_AND(p, addr, addr, brw_imm_ud(0xfff)); + + brw_pop_insn_state(p); + + /* dst = send(offset, a0.0 | <descriptor>) */ + brw_inst *insn = brw_send_indirect_message( + p, BRW_SFID_SAMPLER, dst, src, addr); + brw_set_sampler_message(p, insn, + 0 /* surface */, + 0 /* sampler */, + msg_type, + inst->size_written / REG_SIZE, + inst->mlen /* mlen */, + inst->header_size != 0 /* header */, + simd_mode, + return_format); + + /* visitor knows more than we do about the surface limit required, + * so has already done marking. 
+ */ + } + + if (is_combined_send) { + brw_inst_set_eot(p->devinfo, brw_last_inst, true); + brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC); + } +} + + +/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input + * looking like: + * + * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br + * + * Ideally, we want to produce: + * + * DDX DDY + * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl) + * (ss0.tr - ss0.tl) (ss0.tr - ss0.br) + * (ss0.br - ss0.bl) (ss0.tl - ss0.bl) + * (ss0.br - ss0.bl) (ss0.tr - ss0.br) + * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl) + * (ss1.tr - ss1.tl) (ss1.tr - ss1.br) + * (ss1.br - ss1.bl) (ss1.tl - ss1.bl) + * (ss1.br - ss1.bl) (ss1.tr - ss1.br) + * + * and add another set of two more subspans if in 16-pixel dispatch mode. + * + * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result + * for each pair, and vertstride = 2 jumps us 2 elements after processing a + * pair. But the ideal approximation may impose a huge performance cost on + * sample_d. On at least Haswell, sample_d instruction does some + * optimizations if the same LOD is used for all pixels in the subspan. + * + * For DDY, we need to use ALIGN16 mode since it's capable of doing the + * appropriate swizzling. 
+ */
+void
+fs_generator::generate_ddx(enum opcode opcode,
+                           struct brw_reg dst, struct brw_reg src)
+{
+   unsigned vstride, width;
+
+   if (opcode == FS_OPCODE_DDX_FINE) {
+      /* produce accurate derivatives */
+      vstride = BRW_VERTICAL_STRIDE_2;
+      width = BRW_WIDTH_2;
+   } else {
+      /* replicate the derivative at the top-left pixel to other pixels */
+      vstride = BRW_VERTICAL_STRIDE_4;
+      width = BRW_WIDTH_4;
+   }
+
+   /* src0 and src1 describe the same register with the same region; they
+    * differ only in the third brw_reg() argument (1 vs. 0), presumably the
+    * sub-register/element offset -- confirm against brw_reg().  The ADD
+    * below therefore emits src0 - src1.
+    */
+   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
+                                 src.negate, src.abs,
+                                 BRW_REGISTER_TYPE_F,
+                                 vstride,
+                                 width,
+                                 BRW_HORIZONTAL_STRIDE_0,
+                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
+                                 src.negate, src.abs,
+                                 BRW_REGISTER_TYPE_F,
+                                 vstride,
+                                 width,
+                                 BRW_HORIZONTAL_STRIDE_0,
+                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+   brw_ADD(p, dst, src0, negate(src1));
+}
+
+/* Emit the Y derivative of src into dst.
+ *
+ * FS_OPCODE_DDY_FINE emits a single ADD in ALIGN16 mode whose operands use
+ * the XYXY and ZWZW swizzles; any other opcode emits an ADD with horizontal
+ * stride 0 so that one difference is replicated across each group of four
+ * channels.  In both cases the ADD computes src1 + (-src0).
+ *
+ * NOTE(review): the comment previously here said a "negate_value" boolean is
+ * used to negate the derivative computation for FBOs, since they place the
+ * origin at the upper left instead of the lower left.  This function takes
+ * no such parameter, so that flip is presumably handled elsewhere -- confirm.
+ */
+void
+fs_generator::generate_ddy(enum opcode opcode,
+                           struct brw_reg dst, struct brw_reg src)
+{
+   if (opcode == FS_OPCODE_DDY_FINE) {
+      /* produce accurate derivatives */
+      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
+                                    src.negate, src.abs,
+                                    BRW_REGISTER_TYPE_F,
+                                    BRW_VERTICAL_STRIDE_4,
+                                    BRW_WIDTH_4,
+                                    BRW_HORIZONTAL_STRIDE_1,
+                                    BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
+      struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
+                                    src.negate, src.abs,
+                                    BRW_REGISTER_TYPE_F,
+                                    BRW_VERTICAL_STRIDE_4,
+                                    BRW_WIDTH_4,
+                                    BRW_HORIZONTAL_STRIDE_1,
+                                    BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
+      /* ALIGN16 is only wanted for this one ADD, so the default state is
+       * saved and restored around it.
+       */
+      brw_push_insn_state(p);
+      brw_set_default_access_mode(p, BRW_ALIGN_16);
+      brw_ADD(p, dst, negate(src0), src1);
+      brw_pop_insn_state(p);
+   } else {
+      /* replicate the derivative at the top-left pixel to other pixels */
+      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
+                                    src.negate, src.abs,
+                                    BRW_REGISTER_TYPE_F,
+                                    BRW_VERTICAL_STRIDE_4,
+                                    BRW_WIDTH_4,
+                                    BRW_HORIZONTAL_STRIDE_0,
+                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+      struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
+                                    src.negate, src.abs,
+                                    BRW_REGISTER_TYPE_F,
+                                    BRW_VERTICAL_STRIDE_4,
+                                    BRW_WIDTH_4,
+                                    BRW_HORIZONTAL_STRIDE_0,
+                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+      brw_ADD(p, dst, negate(src0), src1);
+   }
+}
+
+/* Emit the HALT for a discard jump and record the index of that instruction
+ * in discard_halt_patches so it can be patched later (see below).
+ */
+void
+fs_generator::generate_discard_jump(fs_inst *inst)
+{
+   assert(devinfo->gen >= 6);
+
+   /* This HALT will be patched up at FB write time to point UIP at the end of
+    * the program, and at brw_uip_jip() JIP will be set to the end of the
+    * current block (or the program).
+    */
+   this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
+   gen6_HALT(p);
+}
+
+/* Emit the scratch write for a spill of src.
+ *
+ * The write is split into exec_size / lower_size pieces.  For each piece the
+ * data is first MOVed into the MRF following inst->base_mrf and then written
+ * with brw_oword_block_write_scratch() at an increasing scratch offset.
+ */
+void
+fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
+{
+   /* The 32-wide messages only respect the first 16-wide half of the channel
+    * enable signals which are replicated identically for the second group of
+    * 16 channels, so we cannot use them unless the write is marked
+    * force_writemask_all.
+    */
+   const unsigned lower_size = inst->force_writemask_all ? inst->exec_size :
+                               MIN2(16, inst->exec_size);
+   /* Each channel is 4 bytes wide, so this is the size of one piece counted
+    * in units of REG_SIZE.
+    */
+   const unsigned block_size = 4 * lower_size / REG_SIZE;
+   assert(inst->mlen != 0);
+
+   brw_push_insn_state(p);
+   brw_set_default_exec_size(p, cvt(lower_size) - 1);
+   brw_set_default_compression(p, lower_size > 8);
+
+   for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
+      brw_set_default_group(p, inst->group + lower_size * i);
+
+      brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0),
+              retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD));
+
+      brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
+                                    block_size,
+                                    inst->offset + block_size * REG_SIZE * i);
+   }
+
+   brw_pop_insn_state(p);
+}
+
+/* Emit a scratch read (fill) into dst through the message register
+ * inst->base_mrf.  exec_size / 8 is passed as the block count (presumably
+ * the number of registers to read -- confirm against
+ * brw_oword_block_read_scratch()).
+ */
+void
+fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
+{
+   assert(inst->exec_size <= 16 || inst->force_writemask_all);
+   assert(inst->mlen != 0);
+
+   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
+                                inst->exec_size / 8, inst->offset);
+}
+
+/* Variant of generate_scratch_read() that goes through
+ * gen7_block_read_scratch(); it takes no message register and does not
+ * require inst->mlen to be set.
+ */
+void
+fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
+{
+   assert(inst->exec_size <= 16 || inst->force_writemask_all);
+
+   gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
+}
+
+/* Emit an oword block read of a pull constant.
+ *
+ * Both the surface index and the read offset must be UD immediates, and the
+ * destination type must be 4 bytes wide.
+ */
+void
+fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
+                                                  struct brw_reg dst,
+                                                  struct brw_reg index,
+                                                  struct brw_reg offset)
+{
+   assert(type_sz(dst.type) == 4);
+   assert(inst->mlen != 0);
+
+   assert(index.file == BRW_IMMEDIATE_VALUE &&
+          index.type == BRW_REGISTER_TYPE_UD);
+   uint32_t surf_index = index.ud;
+
+   assert(offset.file == BRW_IMMEDIATE_VALUE &&
+          offset.type == BRW_REGISTER_TYPE_UD);
+   uint32_t read_offset = offset.ud;
+
+   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
+                        read_offset, surf_index);
+}
+
+/* Gen7 variant of the uniform pull constant load: the message payload lives
+ * in a GRF instead of an MRF.
+ *
+ * With an immediate surface index a plain SEND is emitted.  Otherwise the
+ * index is masked to its low 8 bits into a0.0 and an indirect send is used.
+ */
+void
+fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
+                                                       struct brw_reg dst,
+                                                       struct brw_reg index,
+                                                       struct brw_reg payload)
+{
+   assert(index.type == BRW_REGISTER_TYPE_UD);
+   assert(payload.file == BRW_GENERAL_REGISTER_FILE);
+   assert(type_sz(dst.type) == 4);
+
+   if (index.file == BRW_IMMEDIATE_VALUE) {
+      const uint32_t surf_index = index.ud;
+
+      /* Only the allocation of the SEND happens with masking disabled; the
+       * previous default state is restored before its fields are filled in.
+       */
+      brw_push_insn_state(p);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+      brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+      brw_pop_insn_state(p);
+
+      brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
+      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
+      brw_set_dp_read_message(p, send, surf_index,
+                              BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
+                              GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
+                              GEN6_SFID_DATAPORT_CONSTANT_CACHE,
+                              1, /* mlen */
+                              true, /* header */
+                              DIV_ROUND_UP(inst->size_written, REG_SIZE));
+
+   } else {
+      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
+
+      brw_push_insn_state(p);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+      /* a0.0 = surf_index & 0xff */
+      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
+      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
+      brw_set_dest(p, insn_and, addr);
+      brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
+      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
+
+      /* dst = send(payload, a0.0 | <descriptor>) */
+      brw_inst *insn = brw_send_indirect_message(
+         p, GEN6_SFID_DATAPORT_CONSTANT_CACHE,
+         retype(dst, BRW_REGISTER_TYPE_UD),
+         retype(payload, BRW_REGISTER_TYPE_UD), addr);
+      brw_set_dp_read_message(p, insn, 0 /* surface */,
+                              BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
+                              GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
+                              GEN6_SFID_DATAPORT_CONSTANT_CACHE,
+                              1, /* mlen */
+                              true, /* header */
+                              DIV_ROUND_UP(inst->size_written, REG_SIZE));
+
+      brw_pop_insn_state(p);
+   }
+}
+
+/* Pre-gen7 varying-offset pull constant load, implemented as a sampler LD
+ * message with a header.  The surface index must be a UD immediate.
+ */
+void
+fs_generator::generate_varying_pull_constant_load_gen4(fs_inst *inst,
+                                                       struct brw_reg dst,
+                                                       struct brw_reg index)
+{
+   assert(devinfo->gen < 7); /* Should use the gen7 variant. */
+   assert(inst->header_size != 0);
+   assert(inst->mlen);
+
+   assert(index.file == BRW_IMMEDIATE_VALUE &&
+          index.type == BRW_REGISTER_TYPE_UD);
+   uint32_t surf_index = index.ud;
+
+   uint32_t simd_mode, rlen, msg_type;
+   if (inst->exec_size == 16) {
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+      rlen = 8;
+   } else {
+      assert(inst->exec_size == 8);
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
+      rlen = 4;
+   }
+
+   if (devinfo->gen >= 5)
+      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
+   else {
+      /* We always use the SIMD16 message so that we only have to load U, and
+       * not V or R.
+       */
+      msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
+      assert(inst->mlen == 3);
+      assert(inst->size_written == 8 * REG_SIZE);
+      /* Overrides the rlen/simd_mode chosen from exec_size above. */
+      rlen = 8;
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+   }
+
+   struct brw_reg header = brw_vec8_grf(0, 0);
+   gen6_resolve_implied_move(p, &header, inst->base_mrf);
+
+   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_inst_set_compression(devinfo, send, false);
+   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
+   brw_set_src0(p, send, header);
+   if (devinfo->gen < 6)
+      brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf);
+
+   /* Our surface is set up as floats, regardless of what actual data is
+    * stored in it.
+    */
+   uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
+   brw_set_sampler_message(p, send,
+                           surf_index,
+                           0, /* sampler (unused) */
+                           msg_type,
+                           rlen,
+                           inst->mlen,
+                           inst->header_size != 0,
+                           simd_mode,
+                           return_format);
+}
+
+/* Gen7+ varying-offset pull constant load: a header-less sampler LD message
+ * whose payload is the `offset` register.
+ *
+ * With an immediate surface index a plain SEND is emitted.  Otherwise the
+ * index is masked to its low 8 bits into a0.0 and an indirect send is used.
+ */
+void
+fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
+                                                       struct brw_reg dst,
+                                                       struct brw_reg index,
+                                                       struct brw_reg offset)
+{
+   assert(devinfo->gen >= 7);
+   /* Varying-offset pull constant loads are treated as a normal expression on
+    * gen7, so the fact that it's a send message is hidden at the IR level.
+    */
+   assert(inst->header_size == 0);
+   assert(!inst->mlen);
+   assert(index.type == BRW_REGISTER_TYPE_UD);
+
+   uint32_t simd_mode, rlen, mlen;
+   if (inst->exec_size == 16) {
+      mlen = 2;
+      rlen = 8;
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+   } else {
+      assert(inst->exec_size == 8);
+      mlen = 1;
+      rlen = 4;
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
+   }
+
+   if (index.file == BRW_IMMEDIATE_VALUE) {
+
+      uint32_t surf_index = index.ud;
+
+      brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+      brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
+      brw_set_src0(p, send, offset);
+      brw_set_sampler_message(p, send,
+                              surf_index,
+                              0, /* LD message ignores sampler unit */
+                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+                              rlen,
+                              mlen,
+                              false, /* no header */
+                              simd_mode,
+                              0);
+
+   } else {
+
+      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
+
+      brw_push_insn_state(p);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+      /* a0.0 = surf_index & 0xff */
+      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
+      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
+      brw_set_dest(p, insn_and, addr);
+      brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
+      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
+
+      /* Unlike the uniform gen7 variant, the state is restored before the
+       * send is emitted, so only the AND runs with masking disabled.
+       */
+      brw_pop_insn_state(p);
+
+      /* dst = send(offset, a0.0 | <descriptor>) */
+      brw_inst *insn = brw_send_indirect_message(
+         p, BRW_SFID_SAMPLER, retype(dst, BRW_REGISTER_TYPE_UW),
+         offset, addr);
+      brw_set_sampler_message(p, insn,
+                              0 /* surface */,
+                              0 /* sampler */,
+                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+                              rlen /* rlen */,
+                              mlen /* mlen */,
+                              false /* header */,
+                              simd_mode,
+                              0);
+   }
+}
+
+/**
+ * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
+ * into the flags register (f0.0).
+ *
+ * Used only on Gen6 and above.
+ */
+void
+fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
+{
+   const struct brw_reg flag_dst = brw_flag_reg(0, inst->flag_subreg);
+
+   /* The mask is read as a UW from g1.7 on Gen6+ and from g0.0 on older
+    * generations.
+    *
+    * NOTE(review): the pre-Gen6 path suggests this is not Gen6-only, unlike
+    * what the header comment says -- confirm.
+    */
+   const struct brw_reg mask_src =
+      retype(devinfo->gen >= 6 ? brw_vec1_grf(1, 7) : brw_vec1_grf(0, 0),
+             BRW_REGISTER_TYPE_UW);
+
+   /* The copy must happen regardless of the current execution mask. */
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, flag_dst, mask_src);
+   brw_pop_insn_state(p);
+}
+
+/* Emit a pixel interpolator query of kind msg_type.
+ *
+ * dst is retyped to UW and the written size, which must be a whole number of
+ * registers, is forwarded in REG_SIZE units.  msg_data must be of type UD.
+ */
+void
+fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
+                                                struct brw_reg dst,
+                                                struct brw_reg src,
+                                                struct brw_reg msg_data,
+                                                unsigned msg_type)
+{
+   assert(inst->size_written % REG_SIZE == 0);
+   assert(msg_data.type == BRW_REGISTER_TYPE_UD);
+
+   const struct brw_reg query_dst = retype(dst, BRW_REGISTER_TYPE_UW);
+
+   brw_pixel_interpolator_query(p, query_dst, src, inst->pi_noperspective,
+                                msg_type, msg_data, inst->mlen,
+                                inst->size_written / REG_SIZE);
+}
+
+/* Sets vstride=1, width=4, hstride=0 of register src1 during
+ * the ADD instruction.
+ */
+void
+fs_generator::generate_set_sample_id(fs_inst *inst,
+                                     struct brw_reg dst,
+                                     struct brw_reg src0,
+                                     struct brw_reg src1)
+{
+   assert(dst.type == BRW_REGISTER_TYPE_D ||
+          dst.type == BRW_REGISTER_TYPE_UD);
+   assert(src0.type == BRW_REGISTER_TYPE_D ||
+          src0.type == BRW_REGISTER_TYPE_UD);
+
+   struct brw_reg reg = stride(src1, 1, 4, 0);
+   if (devinfo->gen >= 8 || inst->exec_size == 8) {
+      brw_ADD(p, dst, src0, reg);
+   } else if (inst->exec_size == 16) {
+      /* Pre-Gen8 SIMD16: emit two SIMD8 ADDs, one per half, with the second
+       * one reading src1 two elements further on.
+       */
+      brw_push_insn_state(p);
+      brw_set_default_exec_size(p, BRW_EXECUTE_8);
+      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_ADD(p, firsthalf(dst), firsthalf(src0), reg);
+      brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
+      brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(reg, 2));
+      brw_pop_insn_state(p);
+   }
+   /* NOTE(review): on gen < 8 with an exec_size other than 8 or 16 nothing is
+    * emitted at all -- confirm that this combination cannot occur.
+    */
+}
+
+/* Pack two floats into one UD per channel as half-floats: y ends up in the
+ * upper 16 bits and x in the lower 16 bits (packHalf2x16 layout).
+ */
+void
+fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
+                                            struct brw_reg dst,
+                                            struct brw_reg x,
+                                            struct brw_reg y)
+{
+   assert(devinfo->gen >= 7);
+   assert(dst.type == BRW_REGISTER_TYPE_UD);
+   assert(x.type == BRW_REGISTER_TYPE_F);
+   assert(y.type == BRW_REGISTER_TYPE_F);
+
+   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
+    *
+    *   Because this instruction does not have a 16-bit floating-point type,
+    *   the destination data type must be Word (W).
+    *
+    *   The destination must be DWord-aligned and specify a horizontal stride
+    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
+    *   each destination channel and the upper word is not modified.
+    */
+   struct brw_reg dst_w = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
+
+   /* Give each 32-bit channel of dst the form below, where "." means
+    * unchanged.
+    *   0x....hhhh
+    */
+   brw_F32TO16(p, dst_w, y);
+
+   /* Now the form:
+    *   0xhhhh0000
+    */
+   brw_SHL(p, dst, dst, brw_imm_ud(16u));
+
+   /* And, finally the form of packHalf2x16's output:
+    *   0xhhhhllll
+    */
+   brw_F32TO16(p, dst_w, x);
+}
+
+/* Convert one half-float word of every UD channel of src to a float in dst:
+ * the lower word for FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, the upper word for
+ * FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y.
+ */
+void
+fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
+                                              struct brw_reg dst,
+                                              struct brw_reg src)
+{
+   assert(devinfo->gen >= 7);
+   assert(dst.type == BRW_REGISTER_TYPE_F);
+   assert(src.type == BRW_REGISTER_TYPE_UD);
+
+   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
+    *
+    *   Because this instruction does not have a 16-bit floating-point type,
+    *   the source data type must be Word (W). The destination type must be
+    *   F (Float).
+    */
+   struct brw_reg src_w = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
+
+   /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
+    * For the Y case, we wish to access only the upper word; therefore
+    * a 16-bit subregister offset is needed.
+    */
+   assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
+          inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
+   if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
+      src_w.subnr += 2;
+
+   brw_F16TO32(p, dst, src_w);
+}
+
+/* Emit a shader-time accumulation.
+ *
+ * `offset` (an immediate) is moved into the first element of the payload
+ * register and `value` (a GRF or an immediate) into the first element of the
+ * register after it; brw_shader_time_add() is then emitted on the payload and
+ * the shader-time surface is marked as used.
+ */
+void
+fs_generator::generate_shader_time_add(fs_inst *inst,
+                                       struct brw_reg payload,
+                                       struct brw_reg offset,
+                                       struct brw_reg value)
+{
+   assert(devinfo->gen >= 7);
+   brw_push_insn_state(p);
+   /* NOTE(review): `true` presumably stands for BRW_MASK_DISABLE, which is
+    * what the other generators in this file pass here -- confirm and prefer
+    * the named constant.
+    */
+   brw_set_default_mask_control(p, true);
+
+   assert(payload.file == BRW_GENERAL_REGISTER_FILE);
+   struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
+                                          offset.type);
+   struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
+                                         value.type);
+
+   assert(offset.file == BRW_IMMEDIATE_VALUE);
+   if (value.file == BRW_GENERAL_REGISTER_FILE) {
+      /* Read value as a scalar: <0;1,0> region. */
+      value.width = BRW_WIDTH_1;
+      value.hstride = BRW_HORIZONTAL_STRIDE_0;
+      value.vstride = BRW_VERTICAL_STRIDE_0;
+   } else {
+      assert(value.file == BRW_IMMEDIATE_VALUE);
+   }
+
+   /* Trying to deal with setup of the params from the IR is crazy in the FS8
+    * case, and we don't really care about squeezing every bit of performance
+    * out of this path, so we just emit the MOVs from here.
+    */
+   brw_MOV(p, payload_offset, offset);
+   brw_MOV(p, payload_value, value);
+   brw_shader_time_add(p, payload,
+                       prog_data->binding_table.shader_time_start);
+   brw_pop_insn_state(p);
+
+   brw_mark_surface_used(prog_data,
+                         prog_data->binding_table.shader_time_start);
+}
+
+/* Turn on the debug paths of generate_code() (annotation, statistics and
+ * assembly dump on stderr) and remember the name printed in that output.
+ * The name is stored by plain assignment, so presumably the caller must keep
+ * the string alive -- confirm against the member's type.
+ */
+void
+fs_generator::enable_debug(const char *shader_name)
+{
+   debug_flag = true;
+   this->shader_name = shader_name;
+}
+
+/* Generate native code for every instruction of cfg.
+ *
+ * The program is first padded with NOPs to a 64-byte boundary.  Each IR
+ * instruction is then translated according to its opcode, after which
+ * brw_set_uip_jip() is run, the result is validated and compacted, and
+ * statistics are reported through shader_debug_log() (and on stderr when
+ * debugging is enabled).
+ *
+ * Returns the offset in the program at which the generated code starts.
+ */
+int
+fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
+{
+   /* align to 64 byte boundary. */
+   while (p->next_insn_offset % 64)
+      brw_NOP(p);
+
+   this->dispatch_width = dispatch_width;
+
+   int start_offset = p->next_insn_offset;
+   int spill_count = 0, fill_count = 0;
+   int loop_count = 0;
+
+   struct annotation_info annotation;
+   memset(&annotation, 0, sizeof(annotation));
+
+   foreach_block_and_inst (block, fs_inst, inst, cfg) {
+      /* NOTE(review): src[] has room for three operands, which assumes
+       * inst->sources <= 3 for everything that reaches this loop -- confirm.
+       */
+      struct brw_reg src[3], dst;
+      unsigned int last_insn_offset = p->next_insn_offset;
+      bool multiple_instructions_emitted = false;
+
+      /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the
+       * "Register Region Restrictions" section: for BDW, SKL:
+       *
+       *    "A POW/FDIV operation must not be followed by an instruction
+       *     that requires two destination registers."
+       *
+       * The documentation is often lacking annotations for Atom parts,
+       * and empirically this affects CHV as well.
+       */
+      if (devinfo->gen >= 8 &&
+          p->nr_insn > 1 &&
+          brw_inst_opcode(devinfo, brw_last_inst) == BRW_OPCODE_MATH &&
+          brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW &&
+          inst->dst.component_size(inst->exec_size) > REG_SIZE) {
+         brw_NOP(p);
+         /* Keep last_insn_offset pointing at the instruction emitted for
+          * this IR instruction rather than at the NOP.
+          */
+         last_insn_offset = p->next_insn_offset;
+      }
+
+      if (unlikely(debug_flag))
+         annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);
+
+      /* If the instruction writes to more than one register, it needs to be
+       * explicitly marked as compressed on Gen <= 5. On Gen >= 6 the
+       * hardware figures out by itself what the right compression mode is,
+       * but we still need to know whether the instruction is compressed to
+       * set up the source register regions appropriately.
+       *
+       * XXX - This is wrong for instructions that write a single register but
+       *       read more than one which should strictly speaking be treated as
+       *       compressed. For instructions that don't write any registers it
+       *       relies on the destination being a null register of the correct
+       *       type and regioning so the instruction is considered compressed
+       *       or not accordingly.
+       */
+      const bool compressed =
+           inst->dst.component_size(inst->exec_size) > REG_SIZE;
+      brw_set_default_compression(p, compressed);
+      brw_set_default_group(p, inst->group);
+
+      for (unsigned int i = 0; i < inst->sources; i++) {
+         src[i] = brw_reg_from_fs_reg(inst, &inst->src[i], devinfo->gen,
+                                      compressed);
+
+         /* The accumulator result appears to get used for the
+          * conditional modifier generation. When negating a UD
+          * value, there is a 33rd bit generated for the sign in the
+          * accumulator value, so now you can't check, for example,
+          * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
+          */
+         assert(!inst->conditional_mod ||
+                inst->src[i].type != BRW_REGISTER_TYPE_UD ||
+                !inst->src[i].negate);
+      }
+      dst = brw_reg_from_fs_reg(inst, &inst->dst, devinfo->gen, compressed);
+
+      /* Program the encoder defaults from the IR instruction; the cases
+       * below override them where an opcode needs something different.
+       */
+      brw_set_default_access_mode(p, BRW_ALIGN_1);
+      brw_set_default_predicate_control(p, inst->predicate);
+      brw_set_default_predicate_inverse(p, inst->predicate_inverse);
+      brw_set_default_flag_reg(p, 0, inst->flag_subreg);
+      brw_set_default_saturate(p, inst->saturate);
+      brw_set_default_mask_control(p, inst->force_writemask_all);
+      brw_set_default_acc_write_control(p, inst->writes_accumulator);
+      brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);
+
+      assert(inst->force_writemask_all || inst->exec_size >= 4);
+      assert(inst->force_writemask_all || inst->group % inst->exec_size == 0);
+      assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
+      assert(inst->mlen <= BRW_MAX_MSG_LENGTH);
+
+      switch (inst->opcode) {
+      case BRW_OPCODE_MOV:
+         brw_MOV(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_ADD:
+         brw_ADD(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_MUL:
+         brw_MUL(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_AVG:
+         brw_AVG(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_MACH:
+         brw_MACH(p, dst, src[0], src[1]);
+         break;
+
+      case BRW_OPCODE_LINE:
+         brw_LINE(p, dst, src[0], src[1]);
+         break;
+
+      case BRW_OPCODE_MAD:
+         assert(devinfo->gen >= 6);
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         brw_MAD(p, dst, src[0], src[1], src[2]);
+         break;
+
+      case BRW_OPCODE_LRP:
+         assert(devinfo->gen >= 6);
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         brw_LRP(p, dst, src[0], src[1], src[2]);
+         break;
+
+      case BRW_OPCODE_FRC:
+         brw_FRC(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_RNDD:
+         brw_RNDD(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_RNDE:
+         brw_RNDE(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_RNDZ:
+         brw_RNDZ(p, dst, src[0]);
+         break;
+
+      case BRW_OPCODE_AND:
+         brw_AND(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_OR:
+         brw_OR(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_XOR:
+         brw_XOR(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_NOT:
+         brw_NOT(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_ASR:
+         brw_ASR(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_SHR:
+         brw_SHR(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_SHL:
+         brw_SHL(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_F32TO16:
+         assert(devinfo->gen >= 7);
+         brw_F32TO16(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_F16TO32:
+         assert(devinfo->gen >= 7);
+         brw_F16TO32(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_CMP:
+         if (inst->exec_size >= 16 && devinfo->gen == 7 && !devinfo->is_haswell &&
+             dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
+            /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround
+             * implemented in the compiler is not sufficient. Overriding the
+             * type when the destination is the null register is necessary but
+             * not sufficient by itself.
+             */
+            assert(dst.nr == BRW_ARF_NULL);
+            dst.type = BRW_REGISTER_TYPE_D;
+         }
+         brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
+         break;
+      case BRW_OPCODE_SEL:
+         brw_SEL(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_BFREV:
+         assert(devinfo->gen >= 7);
+         /* BFREV only supports UD type for src and dst. */
+         brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
+                   retype(src[0], BRW_REGISTER_TYPE_UD));
+         break;
+      case BRW_OPCODE_FBH:
+         assert(devinfo->gen >= 7);
+         /* FBH only supports UD type for dst. */
+         brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+         break;
+      case BRW_OPCODE_FBL:
+         assert(devinfo->gen >= 7);
+         /* FBL only supports UD type for dst. */
+         brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+         break;
+      case BRW_OPCODE_LZD:
+         brw_LZD(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_CBIT:
+         assert(devinfo->gen >= 7);
+         /* CBIT only supports UD type for dst. */
+         brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+         break;
+      case BRW_OPCODE_ADDC:
+         assert(devinfo->gen >= 7);
+         brw_ADDC(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_SUBB:
+         assert(devinfo->gen >= 7);
+         brw_SUBB(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_MAC:
+         brw_MAC(p, dst, src[0], src[1]);
+         break;
+
+      case BRW_OPCODE_BFE:
+         assert(devinfo->gen >= 7);
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         brw_BFE(p, dst, src[0], src[1], src[2]);
+         break;
+
+      case BRW_OPCODE_BFI1:
+         assert(devinfo->gen >= 7);
+         brw_BFI1(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_BFI2:
+         assert(devinfo->gen >= 7);
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         brw_BFI2(p, dst, src[0], src[1], src[2]);
+         break;
+
+      case BRW_OPCODE_IF:
+         if (inst->src[0].file != BAD_FILE) {
+            /* The instruction has an embedded compare (only allowed on gen6) */
+            assert(devinfo->gen == 6);
+            gen6_IF(p, inst->conditional_mod, src[0], src[1]);
+         } else {
+            brw_IF(p, brw_inst_exec_size(devinfo, p->current));
+         }
+         break;
+
+      case BRW_OPCODE_ELSE:
+         brw_ELSE(p);
+         break;
+      case BRW_OPCODE_ENDIF:
+         brw_ENDIF(p);
+         break;
+
+      case BRW_OPCODE_DO:
+         brw_DO(p, brw_inst_exec_size(devinfo, p->current));
+         break;
+
+      case BRW_OPCODE_BREAK:
+         brw_BREAK(p);
+         break;
+      case BRW_OPCODE_CONTINUE:
+         brw_CONT(p);
+         break;
+
+      case BRW_OPCODE_WHILE:
+         brw_WHILE(p);
+         loop_count++;
+         break;
+
+      case SHADER_OPCODE_RCP:
+      case SHADER_OPCODE_RSQ:
+      case SHADER_OPCODE_SQRT:
+      case SHADER_OPCODE_EXP2:
+      case SHADER_OPCODE_LOG2:
+      case SHADER_OPCODE_SIN:
+      case SHADER_OPCODE_COS:
+         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
+         if (devinfo->gen >= 6) {
+            assert(inst->mlen == 0);
+            assert(devinfo->gen >= 7 || inst->exec_size == 8);
+            gen6_math(p, dst, brw_math_function(inst->opcode),
+                      src[0], brw_null_reg());
+         } else {
+            assert(inst->mlen >= 1);
+            assert(devinfo->gen == 5 || devinfo->is_g4x || inst->exec_size == 8);
+            gen4_math(p, dst,
+                      brw_math_function(inst->opcode),
+                      inst->base_mrf, src[0],
+                      BRW_MATH_PRECISION_FULL);
+         }
+         break;
+      case SHADER_OPCODE_INT_QUOTIENT:
+      case SHADER_OPCODE_INT_REMAINDER:
+      case SHADER_OPCODE_POW:
+         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
+         if (devinfo->gen >= 6) {
+            assert(inst->mlen == 0);
+            assert((devinfo->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) ||
+                   inst->exec_size == 8);
+            gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
+         } else {
+            assert(inst->mlen >= 1);
+            assert(inst->exec_size == 8);
+            gen4_math(p, dst, brw_math_function(inst->opcode),
+                      inst->base_mrf, src[0],
+                      BRW_MATH_PRECISION_FULL);
+         }
+         break;
+      case FS_OPCODE_CINTERP:
+         brw_MOV(p, dst, src[0]);
+         break;
+      case FS_OPCODE_LINTERP:
+         generate_linterp(inst, dst, src);
+         break;
+      case FS_OPCODE_PIXEL_X:
+         assert(src[0].type == BRW_REGISTER_TYPE_UW);
+         src[0].subnr = 0 * type_sz(src[0].type);
+         brw_MOV(p, dst, stride(src[0], 8, 4, 1));
+         break;
+      case FS_OPCODE_PIXEL_Y:
+         assert(src[0].type == BRW_REGISTER_TYPE_UW);
+         src[0].subnr = 4 * type_sz(src[0].type);
+         brw_MOV(p, dst, stride(src[0], 8, 4, 1));
+         break;
+      case FS_OPCODE_GET_BUFFER_SIZE:
+         generate_get_buffer_size(inst, dst, src[0], src[1]);
+         break;
+      case SHADER_OPCODE_TEX:
+      case FS_OPCODE_TXB:
+      case SHADER_OPCODE_TXD:
+      case SHADER_OPCODE_TXF:
+      case SHADER_OPCODE_TXF_LZ:
+      case SHADER_OPCODE_TXF_CMS:
+      case SHADER_OPCODE_TXF_CMS_W:
+      case SHADER_OPCODE_TXF_UMS:
+      case SHADER_OPCODE_TXF_MCS:
+      case SHADER_OPCODE_TXL:
+      case SHADER_OPCODE_TXL_LZ:
+      case SHADER_OPCODE_TXS:
+      case SHADER_OPCODE_LOD:
+      case SHADER_OPCODE_TG4:
+      case SHADER_OPCODE_TG4_OFFSET:
+      case SHADER_OPCODE_SAMPLEINFO:
+         generate_tex(inst, dst, src[0], src[1], src[2]);
+         break;
+      case FS_OPCODE_DDX_COARSE:
+      case FS_OPCODE_DDX_FINE:
+         generate_ddx(inst->opcode, dst, src[0]);
+         break;
+      case FS_OPCODE_DDY_COARSE:
+      case FS_OPCODE_DDY_FINE:
+         generate_ddy(inst->opcode, dst, src[0]);
+         break;
+
+      case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
+         generate_scratch_write(inst, src[0]);
+         spill_count++;
+         break;
+
+      case SHADER_OPCODE_GEN4_SCRATCH_READ:
+         generate_scratch_read(inst, dst);
+         fill_count++;
+         break;
+
+      case SHADER_OPCODE_GEN7_SCRATCH_READ:
+         generate_scratch_read_gen7(inst, dst);
+         fill_count++;
+         break;
+
+      case SHADER_OPCODE_MOV_INDIRECT:
+         generate_mov_indirect(inst, dst, src[0], src[1]);
+         break;
+
+      case SHADER_OPCODE_URB_READ_SIMD8:
+      case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
+         generate_urb_read(inst, dst, src[0]);
+         break;
+
+      case SHADER_OPCODE_URB_WRITE_SIMD8:
+      case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
+         generate_urb_write(inst, src[0]);
+         break;
+
+      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+         assert(inst->force_writemask_all);
+         generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
+         break;
+
+      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
+         assert(inst->force_writemask_all);
+         generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
+         break;
+
+      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
+         generate_varying_pull_constant_load_gen4(inst, dst, src[0]);
+         break;
+
+      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
+         generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
+         break;
+
+      case FS_OPCODE_REP_FB_WRITE:
+      case FS_OPCODE_FB_WRITE:
+         generate_fb_write(inst, src[0]);
+         break;
+
+      case FS_OPCODE_FB_READ:
+         generate_fb_read(inst, dst, src[0]);
+         break;
+
+      case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
+         generate_mov_dispatch_to_flags(inst);
+         break;
+
+      case FS_OPCODE_DISCARD_JUMP:
+         generate_discard_jump(inst);
+         break;
+
+      case SHADER_OPCODE_SHADER_TIME_ADD:
+         generate_shader_time_add(inst, src[0], src[1], src[2]);
+         break;
+
+      case SHADER_OPCODE_UNTYPED_ATOMIC:
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
+         brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud,
+                            inst->mlen, !inst->dst.is_null());
+         break;
+
+      case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
+         brw_untyped_surface_read(p, dst, src[0], src[1],
+                                  inst->mlen, src[2].ud);
+         break;
+
+      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
+         brw_untyped_surface_write(p, src[0], src[1],
+                                   inst->mlen, src[2].ud);
+         break;
+
+      case SHADER_OPCODE_TYPED_ATOMIC:
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
+         brw_typed_atomic(p, dst, src[0], src[1],
+                          src[2].ud, inst->mlen, !inst->dst.is_null());
+         break;
+
+      case SHADER_OPCODE_TYPED_SURFACE_READ:
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
+         brw_typed_surface_read(p, dst, src[0], src[1],
+                                inst->mlen, src[2].ud);
+         break;
+
+      case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
+         brw_typed_surface_write(p, src[0], src[1], inst->mlen, src[2].ud);
+         break;
+
+      case SHADER_OPCODE_MEMORY_FENCE:
+         brw_memory_fence(p, dst);
+         break;
+
+      case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
+         const struct brw_reg mask =
+            brw_stage_has_packed_dispatch(devinfo, stage,
+                                          prog_data) ? brw_imm_ud(~0u) :
+            stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
+            brw_dmask_reg();
+         brw_find_live_channel(p, dst, mask);
+         break;
+      }
+
+      case SHADER_OPCODE_BROADCAST:
+         assert(inst->force_writemask_all);
+         brw_broadcast(p, dst, src[0], src[1]);
+         break;
+
+      case FS_OPCODE_SET_SAMPLE_ID:
+         generate_set_sample_id(inst, dst, src[0], src[1]);
+         break;
+
+      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
+          generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
+          break;
+
+      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
+      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
+         generate_unpack_half_2x16_split(inst, dst, src[0]);
+         break;
+
+      case FS_OPCODE_PLACEHOLDER_HALT:
+         /* This is the place where the final HALT needs to be inserted if
+          * we've emitted any discards. If not, this will emit no code.
+          */
+         if (!patch_discard_jumps_to_fb_writes()) {
+            if (unlikely(debug_flag)) {
+               annotation.ann_count--;
+            }
+         }
+         break;
+
+      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
+         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
+                                           GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
+         break;
+
+      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
+         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
+                                           GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
+         break;
+
+      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
+         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
+                                           GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
+         break;
+
+      case CS_OPCODE_CS_TERMINATE:
+         generate_cs_terminate(inst, src[0]);
+         break;
+
+      case SHADER_OPCODE_BARRIER:
+         generate_barrier(inst, src[0]);
+         break;
+
+      case BRW_OPCODE_DIM:
+         assert(devinfo->is_haswell);
+         assert(src[0].type == BRW_REGISTER_TYPE_DF);
+         assert(dst.type == BRW_REGISTER_TYPE_DF);
+         brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
+         break;
+
+      default:
+         unreachable("Unsupported opcode");
+
+      case SHADER_OPCODE_LOAD_PAYLOAD:
+         unreachable("Should be lowered by lower_load_payload()");
+      }
+
+      /* NOTE(review): nothing in this function assigns
+       * multiple_instructions_emitted after its initialization to false, so
+       * this early continue is never taken in the code visible here.
+       */
+      if (multiple_instructions_emitted)
+         continue;
+
+      /* These fields are applied after the fact to the single native
+       * instruction (16 bytes) emitted for this IR instruction.
+       */
+      if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
+         assert(p->next_insn_offset == last_insn_offset + 16 ||
+                !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
+                 "emitting more than 1 instruction");
+
+         brw_inst *last = &p->store[last_insn_offset / 16];
+
+         if (inst->conditional_mod)
+            brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
+         brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
+         brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
+      }
+   }
+
+   brw_set_uip_jip(p, start_offset);
+   annotation_finalize(&annotation, p->next_insn_offset);
+
+   /* `validated` only exists in builds without NDEBUG; the assert(validated)
+    * further down compiles away when NDEBUG is defined, so it never refers
+    * to an undeclared name.
+    */
+#ifndef NDEBUG
+   bool validated = brw_validate_instructions(p, start_offset, &annotation);
+#else
+   if (unlikely(debug_flag))
+      brw_validate_instructions(p, start_offset, &annotation);
+#endif
+
+   int before_size = p->next_insn_offset - start_offset;
+   brw_compact_instructions(p, start_offset, annotation.ann_count,
+                            annotation.ann);
+   int after_size = p->next_insn_offset - start_offset;
+
+   if (unlikely(debug_flag)) {
+      fprintf(stderr, "Native code for %s\n"
+              "SIMD%d shader: %d instructions. %d loops. %u cycles. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d"
+              " bytes (%.0f%%)\n",
+              shader_name, dispatch_width, before_size / 16, loop_count, cfg->cycle_count,
+              spill_count, fill_count, promoted_constants, before_size, after_size,
+              100.0f * (before_size - after_size) / before_size);
+
+      dump_assembly(p->store, annotation.ann_count, annotation.ann,
+                    p->devinfo);
+      ralloc_free(annotation.mem_ctx);
+   }
+   assert(validated);
+
+   compiler->shader_debug_log(log_data,
+                              "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
+                              "%d:%d spills:fills, Promoted %u constants, "
+                              "compacted %d to %d bytes.",
+                              _mesa_shader_stage_to_abbrev(stage),
+                              dispatch_width, before_size / 16,
+                              loop_count, cfg->cycle_count, spill_count,
+                              fill_count, promoted_constants, before_size,
+                              after_size);
+
+   return start_offset;
+}
+
+/* Return the generated program as obtained from brw_get_program();
+ * assembly_size is forwarded to it unchanged (presumably it receives the
+ * program size -- confirm against brw_get_program()).
+ */
+const unsigned *
+fs_generator::get_assembly(unsigned int *assembly_size)
+{
+   return brw_get_program(p, assembly_size);
+}
diff --git a/src/intel/compiler/brw_fs_live_variables.cpp b/src/intel/compiler/brw_fs_live_variables.cpp
new file mode 100644
index 00000000000..c449672a519
--- /dev/null
+++ b/src/intel/compiler/brw_fs_live_variables.cpp
@@ -0,0 +1,334 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the
following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <[email protected]> + * + */ + +#include "brw_cfg.h" +#include "brw_fs_live_variables.h" + +using namespace brw; + +#define MAX_INSTRUCTION (1 << 30) + +/** @file brw_fs_live_variables.cpp + * + * Support for calculating liveness information about virtual GRFs. + * + * This produces a live interval for each whole virtual GRF. We could + * choose to expose per-component live intervals for VGRFs of size > 1, + * but we currently do not. It is easier for the consumers of this + * information to work with whole VGRFs. + * + * However, we internally track use/def information at the per-GRF level for + * greater accuracy. Large VGRFs may be accessed piecemeal over many + * (possibly non-adjacent) instructions. In this case, examining a single + * instruction is insufficient to decide whether a whole VGRF is ultimately + * used or defined. Tracking individual components allows us to easily + * assemble this information. + * + * See Muchnick's Advanced Compiler Design and Implementation, section + * 14.1 (p444). 
+ */ + +void +fs_live_variables::setup_one_read(struct block_data *bd, fs_inst *inst, + int ip, const fs_reg ®) +{ + int var = var_from_reg(reg); + assert(var < num_vars); + + start[var] = MIN2(start[var], ip); + end[var] = MAX2(end[var], ip); + + /* The use[] bitset marks when the block makes use of a variable (VGRF + * channel) without having completely defined that variable within the + * block. + */ + if (!BITSET_TEST(bd->def, var)) + BITSET_SET(bd->use, var); +} + +void +fs_live_variables::setup_one_write(struct block_data *bd, fs_inst *inst, + int ip, const fs_reg ®) +{ + int var = var_from_reg(reg); + assert(var < num_vars); + + start[var] = MIN2(start[var], ip); + end[var] = MAX2(end[var], ip); + + /* The def[] bitset marks when an initialization in a block completely + * screens off previous updates of that variable (VGRF channel). + */ + if (inst->dst.file == VGRF && !inst->is_partial_write()) { + if (!BITSET_TEST(bd->use, var)) + BITSET_SET(bd->def, var); + } +} + +/** + * Sets up the use[] and def[] bitsets. + * + * The basic-block-level live variable analysis needs to know which + * variables get used before they're completely defined, and which + * variables are completely defined before they're used. + * + * These are tracked at the per-component level, rather than whole VGRFs. 
+ */ +void +fs_live_variables::setup_def_use() +{ + int ip = 0; + + foreach_block (block, cfg) { + assert(ip == block->start_ip); + if (block->num > 0) + assert(cfg->blocks[block->num - 1]->end_ip == ip - 1); + + struct block_data *bd = &block_data[block->num]; + + foreach_inst_in_block(fs_inst, inst, block) { + /* Set use[] for this instruction */ + for (unsigned int i = 0; i < inst->sources; i++) { + fs_reg reg = inst->src[i]; + + if (reg.file != VGRF) + continue; + + for (unsigned j = 0; j < regs_read(inst, i); j++) { + setup_one_read(bd, inst, ip, reg); + reg.offset += REG_SIZE; + } + } + + bd->flag_use[0] |= inst->flags_read(v->devinfo) & ~bd->flag_def[0]; + + /* Set def[] for this instruction */ + if (inst->dst.file == VGRF) { + fs_reg reg = inst->dst; + for (unsigned j = 0; j < regs_written(inst); j++) { + setup_one_write(bd, inst, ip, reg); + reg.offset += REG_SIZE; + } + } + + if (!inst->predicate && inst->exec_size >= 8) + bd->flag_def[0] |= inst->flags_written() & ~bd->flag_use[0]; + + ip++; + } + } +} + +/** + * The algorithm incrementally sets bits in liveout and livein, + * propagating it through control flow. It will eventually terminate + * because it only ever adds bits, and stops when no bits are added in + * a pass. 
+ */ +void +fs_live_variables::compute_live_variables() +{ + bool cont = true; + + while (cont) { + cont = false; + + foreach_block_reverse (block, cfg) { + struct block_data *bd = &block_data[block->num]; + + /* Update liveout */ + foreach_list_typed(bblock_link, child_link, link, &block->children) { + struct block_data *child_bd = &block_data[child_link->block->num]; + + for (int i = 0; i < bitset_words; i++) { + BITSET_WORD new_liveout = (child_bd->livein[i] & + ~bd->liveout[i]); + if (new_liveout) { + bd->liveout[i] |= new_liveout; + cont = true; + } + } + BITSET_WORD new_liveout = (child_bd->flag_livein[0] & + ~bd->flag_liveout[0]); + if (new_liveout) { + bd->flag_liveout[0] |= new_liveout; + cont = true; + } + } + + /* Update livein */ + for (int i = 0; i < bitset_words; i++) { + BITSET_WORD new_livein = (bd->use[i] | + (bd->liveout[i] & + ~bd->def[i])); + if (new_livein & ~bd->livein[i]) { + bd->livein[i] |= new_livein; + cont = true; + } + } + BITSET_WORD new_livein = (bd->flag_use[0] | + (bd->flag_liveout[0] & + ~bd->flag_def[0])); + if (new_livein & ~bd->flag_livein[0]) { + bd->flag_livein[0] |= new_livein; + cont = true; + } + } + } +} + +/** + * Extend the start/end ranges for each variable to account for the + * new information calculated from control flow. 
+ */ +void +fs_live_variables::compute_start_end() +{ + foreach_block (block, cfg) { + struct block_data *bd = &block_data[block->num]; + + for (int i = 0; i < num_vars; i++) { + if (BITSET_TEST(bd->livein, i)) { + start[i] = MIN2(start[i], block->start_ip); + end[i] = MAX2(end[i], block->start_ip); + } + + if (BITSET_TEST(bd->liveout, i)) { + start[i] = MIN2(start[i], block->end_ip); + end[i] = MAX2(end[i], block->end_ip); + } + } + } +} + +fs_live_variables::fs_live_variables(fs_visitor *v, const cfg_t *cfg) + : v(v), cfg(cfg) +{ + mem_ctx = ralloc_context(NULL); + + num_vgrfs = v->alloc.count; + num_vars = 0; + var_from_vgrf = rzalloc_array(mem_ctx, int, num_vgrfs); + for (int i = 0; i < num_vgrfs; i++) { + var_from_vgrf[i] = num_vars; + num_vars += v->alloc.sizes[i]; + } + + vgrf_from_var = rzalloc_array(mem_ctx, int, num_vars); + for (int i = 0; i < num_vgrfs; i++) { + for (unsigned j = 0; j < v->alloc.sizes[i]; j++) { + vgrf_from_var[var_from_vgrf[i] + j] = i; + } + } + + start = ralloc_array(mem_ctx, int, num_vars); + end = rzalloc_array(mem_ctx, int, num_vars); + for (int i = 0; i < num_vars; i++) { + start[i] = MAX_INSTRUCTION; + end[i] = -1; + } + + block_data= rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks); + + bitset_words = BITSET_WORDS(num_vars); + for (int i = 0; i < cfg->num_blocks; i++) { + block_data[i].def = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words); + block_data[i].use = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words); + block_data[i].livein = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words); + block_data[i].liveout = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words); + + block_data[i].flag_def[0] = 0; + block_data[i].flag_use[0] = 0; + block_data[i].flag_livein[0] = 0; + block_data[i].flag_liveout[0] = 0; + } + + setup_def_use(); + compute_live_variables(); + compute_start_end(); +} + +fs_live_variables::~fs_live_variables() +{ + ralloc_free(mem_ctx); +} + +void +fs_visitor::invalidate_live_intervals() +{ + 
ralloc_free(live_intervals); + live_intervals = NULL; +} + +/** + * Compute the live intervals for each virtual GRF. + * + * This uses the per-component use/def data, but combines it to produce + * information about whole VGRFs. + */ +void +fs_visitor::calculate_live_intervals() +{ + if (this->live_intervals) + return; + + int num_vgrfs = this->alloc.count; + ralloc_free(this->virtual_grf_start); + ralloc_free(this->virtual_grf_end); + virtual_grf_start = ralloc_array(mem_ctx, int, num_vgrfs); + virtual_grf_end = ralloc_array(mem_ctx, int, num_vgrfs); + + for (int i = 0; i < num_vgrfs; i++) { + virtual_grf_start[i] = MAX_INSTRUCTION; + virtual_grf_end[i] = -1; + } + + this->live_intervals = new(mem_ctx) fs_live_variables(this, cfg); + + /* Merge the per-component live ranges to whole VGRF live ranges. */ + for (int i = 0; i < live_intervals->num_vars; i++) { + int vgrf = live_intervals->vgrf_from_var[i]; + virtual_grf_start[vgrf] = MIN2(virtual_grf_start[vgrf], + live_intervals->start[i]); + virtual_grf_end[vgrf] = MAX2(virtual_grf_end[vgrf], + live_intervals->end[i]); + } +} + +bool +fs_live_variables::vars_interfere(int a, int b) +{ + return !(end[b] <= start[a] || + end[a] <= start[b]); +} + +bool +fs_visitor::virtual_grf_interferes(int a, int b) +{ + return !(virtual_grf_end[a] <= virtual_grf_start[b] || + virtual_grf_end[b] <= virtual_grf_start[a]); +} diff --git a/src/intel/compiler/brw_fs_live_variables.h b/src/intel/compiler/brw_fs_live_variables.h new file mode 100644 index 00000000000..91d1e42cbc1 --- /dev/null +++ b/src/intel/compiler/brw_fs_live_variables.h @@ -0,0 +1,115 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of 
the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <[email protected]> + * + */ + +#include "brw_fs.h" +#include "util/bitset.h" + +struct cfg_t; + +namespace brw { + +struct block_data { + /** + * Which variables are defined before being used in the block. + * + * Note that for our purposes, "defined" means unconditionally, completely + * defined. + */ + BITSET_WORD *def; + + /** + * Which variables are used before being defined in the block. + */ + BITSET_WORD *use; + + /** Which defs reach the entry point of the block. */ + BITSET_WORD *livein; + + /** Which defs reach the exit point of the block. */ + BITSET_WORD *liveout; + + BITSET_WORD flag_def[1]; + BITSET_WORD flag_use[1]; + BITSET_WORD flag_livein[1]; + BITSET_WORD flag_liveout[1]; +}; + +class fs_live_variables { +public: + DECLARE_RALLOC_CXX_OPERATORS(fs_live_variables) + + fs_live_variables(fs_visitor *v, const cfg_t *cfg); + ~fs_live_variables(); + + bool vars_interfere(int a, int b); + int var_from_reg(const fs_reg ®) const + { + return var_from_vgrf[reg.nr] + reg.offset / REG_SIZE; + } + + /** Map from virtual GRF number to index in block_data arrays. 
*/ + int *var_from_vgrf; + + /** + * Map from any index in block_data to the virtual GRF containing it. + * + * For alloc.sizes of [1, 2, 3], vgrf_from_var would contain + * [0, 1, 1, 2, 2, 2]. + */ + int *vgrf_from_var; + + int num_vars; + int num_vgrfs; + int bitset_words; + + /** @{ + * Final computed live ranges for each var (each component of each virtual + * GRF). + */ + int *start; + int *end; + /** @} */ + + /** Per-basic-block information on live variables */ + struct block_data *block_data; + +protected: + void setup_def_use(); + void setup_one_read(struct block_data *bd, fs_inst *inst, int ip, + const fs_reg ®); + void setup_one_write(struct block_data *bd, fs_inst *inst, int ip, + const fs_reg ®); + void compute_live_variables(); + void compute_start_end(); + + fs_visitor *v; + const cfg_t *cfg; + void *mem_ctx; + +}; + +} /* namespace brw */ diff --git a/src/intel/compiler/brw_fs_lower_d2x.cpp b/src/intel/compiler/brw_fs_lower_d2x.cpp new file mode 100644 index 00000000000..a2db1154615 --- /dev/null +++ b/src/intel/compiler/brw_fs_lower_d2x.cpp @@ -0,0 +1,78 @@ +/* + * Copyright © 2015 Connor Abbott + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_fs.h" +#include "brw_cfg.h" +#include "brw_fs_builder.h" + +using namespace brw; + +bool +fs_visitor::lower_d2x() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + if (inst->opcode != BRW_OPCODE_MOV) + continue; + + if (inst->dst.type != BRW_REGISTER_TYPE_F && + inst->dst.type != BRW_REGISTER_TYPE_D && + inst->dst.type != BRW_REGISTER_TYPE_UD) + continue; + + if (inst->src[0].type != BRW_REGISTER_TYPE_DF && + inst->src[0].type != BRW_REGISTER_TYPE_UQ && + inst->src[0].type != BRW_REGISTER_TYPE_Q) + continue; + + assert(inst->dst.file == VGRF); + assert(inst->saturate == false); + fs_reg dst = inst->dst; + + const fs_builder ibld(this, block, inst); + + /* From the Broadwell PRM, 3D Media GPGPU, "Double Precision Float to + * Single Precision Float": + * + * The upper Dword of every Qword will be written with undefined + * value when converting DF to F. + * + * So we need to allocate a temporary that's two registers, and then do + * a strided MOV to get the lower DWord of every Qword that has the + * result. 
+ */ + fs_reg temp = ibld.vgrf(inst->src[0].type, 1); + fs_reg strided_temp = subscript(temp, inst->dst.type, 0); + ibld.MOV(strided_temp, inst->src[0]); + ibld.MOV(dst, strided_temp); + + inst->remove(block); + progress = true; + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} diff --git a/src/intel/compiler/brw_fs_lower_pack.cpp b/src/intel/compiler/brw_fs_lower_pack.cpp new file mode 100644 index 00000000000..7afaae095bd --- /dev/null +++ b/src/intel/compiler/brw_fs_lower_pack.cpp @@ -0,0 +1,55 @@ +/* + * Copyright © 2015 Connor Abbott + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "brw_fs.h" +#include "brw_cfg.h" +#include "brw_fs_builder.h" + +using namespace brw; + +bool +fs_visitor::lower_pack() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + if (inst->opcode != FS_OPCODE_PACK) + continue; + + assert(inst->dst.file == VGRF); + assert(inst->saturate == false); + fs_reg dst = inst->dst; + + const fs_builder ibld(this, block, inst); + for (unsigned i = 0; i < inst->sources; i++) + ibld.MOV(subscript(dst, inst->src[i].type, i), inst->src[i]); + + inst->remove(block); + progress = true; + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp new file mode 100644 index 00000000000..d403dec5357 --- /dev/null +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -0,0 +1,4679 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "compiler/glsl/ir.h" +#include "brw_fs.h" +#include "brw_fs_surface_builder.h" +#include "brw_nir.h" + +using namespace brw; +using namespace brw::surface_access; + +void +fs_visitor::emit_nir_code() +{ + /* emit the arrays used for inputs and outputs - load/store intrinsics will + * be converted to reads/writes of these arrays + */ + nir_setup_outputs(); + nir_setup_uniforms(); + nir_emit_system_values(); + + /* get the main function and emit it */ + nir_foreach_function(function, nir) { + assert(strcmp(function->name, "main") == 0); + assert(function->impl); + nir_emit_impl(function->impl); + } +} + +void +fs_visitor::nir_setup_outputs() +{ + if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT) + return; + + nir_foreach_variable(var, &nir->outputs) { + const unsigned vec4s = + var->data.compact ? 
DIV_ROUND_UP(glsl_get_length(var->type), 4) + : type_size_vec4(var->type); + fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * vec4s); + for (unsigned i = 0; i < vec4s; i++) { + if (outputs[var->data.driver_location + i].file == BAD_FILE) + outputs[var->data.driver_location + i] = offset(reg, bld, 4 * i); + } + } +} + +void +fs_visitor::nir_setup_uniforms() +{ + if (dispatch_width != min_dispatch_width) + return; + + uniforms = nir->num_uniforms / 4; +} + +static bool +emit_system_values_block(nir_block *block, fs_visitor *v) +{ + fs_reg *reg; + + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_vertex_id: + unreachable("should be lowered by lower_vertex_id()."); + + case nir_intrinsic_load_vertex_id_zero_base: + assert(v->stage == MESA_SHADER_VERTEX); + reg = &v->nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE]; + if (reg->file == BAD_FILE) + *reg = *v->emit_vs_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); + break; + + case nir_intrinsic_load_base_vertex: + assert(v->stage == MESA_SHADER_VERTEX); + reg = &v->nir_system_values[SYSTEM_VALUE_BASE_VERTEX]; + if (reg->file == BAD_FILE) + *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_VERTEX); + break; + + case nir_intrinsic_load_instance_id: + assert(v->stage == MESA_SHADER_VERTEX); + reg = &v->nir_system_values[SYSTEM_VALUE_INSTANCE_ID]; + if (reg->file == BAD_FILE) + *reg = *v->emit_vs_system_value(SYSTEM_VALUE_INSTANCE_ID); + break; + + case nir_intrinsic_load_base_instance: + assert(v->stage == MESA_SHADER_VERTEX); + reg = &v->nir_system_values[SYSTEM_VALUE_BASE_INSTANCE]; + if (reg->file == BAD_FILE) + *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_INSTANCE); + break; + + case nir_intrinsic_load_draw_id: + assert(v->stage == MESA_SHADER_VERTEX); + reg = &v->nir_system_values[SYSTEM_VALUE_DRAW_ID]; + if (reg->file == BAD_FILE) + *reg = 
*v->emit_vs_system_value(SYSTEM_VALUE_DRAW_ID); + break; + + case nir_intrinsic_load_invocation_id: + if (v->stage == MESA_SHADER_TESS_CTRL) + break; + assert(v->stage == MESA_SHADER_GEOMETRY); + reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID]; + if (reg->file == BAD_FILE) { + const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL); + fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); + fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.SHR(iid, g1, brw_imm_ud(27u)); + *reg = iid; + } + break; + + case nir_intrinsic_load_sample_pos: + assert(v->stage == MESA_SHADER_FRAGMENT); + reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS]; + if (reg->file == BAD_FILE) + *reg = *v->emit_samplepos_setup(); + break; + + case nir_intrinsic_load_sample_id: + assert(v->stage == MESA_SHADER_FRAGMENT); + reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID]; + if (reg->file == BAD_FILE) + *reg = *v->emit_sampleid_setup(); + break; + + case nir_intrinsic_load_sample_mask_in: + assert(v->stage == MESA_SHADER_FRAGMENT); + assert(v->devinfo->gen >= 7); + reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN]; + if (reg->file == BAD_FILE) + *reg = *v->emit_samplemaskin_setup(); + break; + + case nir_intrinsic_load_work_group_id: + assert(v->stage == MESA_SHADER_COMPUTE); + reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID]; + if (reg->file == BAD_FILE) + *reg = *v->emit_cs_work_group_id_setup(); + break; + + case nir_intrinsic_load_helper_invocation: + assert(v->stage == MESA_SHADER_FRAGMENT); + reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION]; + if (reg->file == BAD_FILE) { + const fs_builder abld = + v->bld.annotate("gl_HelperInvocation", NULL); + + /* On Gen6+ (gl_HelperInvocation is only exposed on Gen7+) the + * pixel mask is in g1.7 of the thread payload. + * + * We move the per-channel pixel enable bit to the low bit of each + * channel by shifting the byte containing the pixel mask by the + * vector immediate 0x76543210UV. 
+ * + * The region of <1,8,0> reads only 1 byte (the pixel masks for + * subspans 0 and 1) in SIMD8 and an additional byte (the pixel + * masks for 2 and 3) in SIMD16. + */ + fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1); + abld.SHR(shifted, + stride(byte_offset(retype(brw_vec1_grf(1, 0), + BRW_REGISTER_TYPE_UB), 28), + 1, 8, 0), + brw_imm_v(0x76543210)); + + /* A set bit in the pixel mask means the channel is enabled, but + * that is the opposite of gl_HelperInvocation so we need to invert + * the mask. + * + * The negate source-modifier bit of logical instructions on Gen8+ + * performs 1's complement negation, so we can use that instead of + * a NOT instruction. + */ + fs_reg inverted = negate(shifted); + if (v->devinfo->gen < 8) { + inverted = abld.vgrf(BRW_REGISTER_TYPE_UW); + abld.NOT(inverted, shifted); + } + + /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing + * with 1 and negating. + */ + fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.AND(anded, inverted, brw_imm_uw(1)); + + fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1); + abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D))); + *reg = dst; + } + break; + + default: + break; + } + } + + return true; +} + +void +fs_visitor::nir_emit_system_values() +{ + nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX); + for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) { + nir_system_values[i] = fs_reg(); + } + + nir_foreach_function(function, nir) { + assert(strcmp(function->name, "main") == 0); + assert(function->impl); + nir_foreach_block(block, function->impl) { + emit_system_values_block(block, this); + } + } +} + +void +fs_visitor::nir_emit_impl(nir_function_impl *impl) +{ + nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc); + for (unsigned i = 0; i < impl->reg_alloc; i++) { + nir_locals[i] = fs_reg(); + } + + foreach_list_typed(nir_register, reg, node, &impl->registers) { + unsigned array_elems = + reg->num_array_elems == 0 ? 
1 : reg->num_array_elems; + unsigned size = array_elems * reg->num_components; + const brw_reg_type reg_type = + reg->bit_size == 32 ? BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF; + nir_locals[reg->index] = bld.vgrf(reg_type, size); + } + + nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg, + impl->ssa_alloc); + + nir_emit_cf_list(&impl->body); +} + +void +fs_visitor::nir_emit_cf_list(exec_list *list) +{ + exec_list_validate(list); + foreach_list_typed(nir_cf_node, node, node, list) { + switch (node->type) { + case nir_cf_node_if: + nir_emit_if(nir_cf_node_as_if(node)); + break; + + case nir_cf_node_loop: + nir_emit_loop(nir_cf_node_as_loop(node)); + break; + + case nir_cf_node_block: + nir_emit_block(nir_cf_node_as_block(node)); + break; + + default: + unreachable("Invalid CFG node block"); + } + } +} + +void +fs_visitor::nir_emit_if(nir_if *if_stmt) +{ + /* first, put the condition into f0 */ + fs_inst *inst = bld.MOV(bld.null_reg_d(), + retype(get_nir_src(if_stmt->condition), + BRW_REGISTER_TYPE_D)); + inst->conditional_mod = BRW_CONDITIONAL_NZ; + + bld.IF(BRW_PREDICATE_NORMAL); + + nir_emit_cf_list(&if_stmt->then_list); + + /* note: if the else is empty, dead CF elimination will remove it */ + bld.emit(BRW_OPCODE_ELSE); + + nir_emit_cf_list(&if_stmt->else_list); + + bld.emit(BRW_OPCODE_ENDIF); +} + +void +fs_visitor::nir_emit_loop(nir_loop *loop) +{ + bld.emit(BRW_OPCODE_DO); + + nir_emit_cf_list(&loop->body); + + bld.emit(BRW_OPCODE_WHILE); +} + +void +fs_visitor::nir_emit_block(nir_block *block) +{ + nir_foreach_instr(instr, block) { + nir_emit_instr(instr); + } +} + +void +fs_visitor::nir_emit_instr(nir_instr *instr) +{ + const fs_builder abld = bld.annotate(NULL, instr); + + switch (instr->type) { + case nir_instr_type_alu: + nir_emit_alu(abld, nir_instr_as_alu(instr)); + break; + + case nir_instr_type_intrinsic: + switch (stage) { + case MESA_SHADER_VERTEX: + nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr)); + break; + case 
MESA_SHADER_TESS_CTRL: + nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr)); + break; + case MESA_SHADER_TESS_EVAL: + nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr)); + break; + case MESA_SHADER_GEOMETRY: + nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr)); + break; + case MESA_SHADER_FRAGMENT: + nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr)); + break; + case MESA_SHADER_COMPUTE: + nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr)); + break; + default: + unreachable("unsupported shader stage"); + } + break; + + case nir_instr_type_tex: + nir_emit_texture(abld, nir_instr_as_tex(instr)); + break; + + case nir_instr_type_load_const: + nir_emit_load_const(abld, nir_instr_as_load_const(instr)); + break; + + case nir_instr_type_ssa_undef: + /* We create a new VGRF for undefs on every use (by handling + * them in get_nir_src()), rather than for each definition. + * This helps register coalescing eliminate MOVs from undef. + */ + break; + + case nir_instr_type_jump: + nir_emit_jump(abld, nir_instr_as_jump(instr)); + break; + + default: + unreachable("unknown instruction type"); + } +} + +/** + * Recognizes a parent instruction of nir_op_extract_* and changes the type to + * match instr. 
+ */ +bool +fs_visitor::optimize_extract_to_float(nir_alu_instr *instr, + const fs_reg &result) +{ + if (!instr->src[0].src.is_ssa || + !instr->src[0].src.ssa->parent_instr) + return false; + + if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu) + return false; + + nir_alu_instr *src0 = + nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); + + if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 && + src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16) + return false; + + nir_const_value *element = nir_src_as_const_value(src0->src[1].src); + assert(element != NULL); + + /* Element type to extract.*/ + const brw_reg_type type = brw_int_type( + src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1, + src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8); + + fs_reg op0 = get_nir_src(src0->src[0].src); + op0.type = brw_type_for_nir_type(devinfo, + (nir_alu_type)(nir_op_infos[src0->op].input_types[0] | + nir_src_bit_size(src0->src[0].src))); + op0 = offset(op0, bld, src0->src[0].swizzle[0]); + + set_saturate(instr->dest.saturate, + bld.MOV(result, subscript(op0, type, element->u32[0]))); + return true; +} + +bool +fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr, + const fs_reg &result) +{ + if (!instr->src[0].src.is_ssa || + instr->src[0].src.ssa->parent_instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *src0 = + nir_instr_as_intrinsic(instr->src[0].src.ssa->parent_instr); + + if (src0->intrinsic != nir_intrinsic_load_front_face) + return false; + + nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src); + if (!value1 || fabsf(value1->f32[0]) != 1.0f) + return false; + + nir_const_value *value2 = nir_src_as_const_value(instr->src[2].src); + if (!value2 || fabsf(value2->f32[0]) != 1.0f) + return false; + + fs_reg tmp = vgrf(glsl_type::int_type); + + if (devinfo->gen >= 6) { + /* Bit 15 of g0.0 is 0 if the polygon is front facing. 
*/ + fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W)); + + /* For (gl_FrontFacing ? 1.0 : -1.0), emit: + * + * or(8) tmp.1<2>W g0.0<0,1,0>W 0x00003f80W + * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D + * + * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0). + * + * This negation looks like it's safe in practice, because bits 0:4 will + * surely be TRIANGLES + */ + + if (value1->f32[0] == -1.0f) { + g0.negate = true; + } + + bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1), + g0, brw_imm_uw(0x3f80)); + } else { + /* Bit 31 of g1.6 is 0 if the polygon is front facing. */ + fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D)); + + /* For (gl_FrontFacing ? 1.0 : -1.0), emit: + * + * or(8) tmp<1>D g1.6<0,1,0>D 0x3f800000D + * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D + * + * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0). + * + * This negation looks like it's safe in practice, because bits 0:4 will + * surely be TRIANGLES + */ + + if (value1->f32[0] == -1.0f) { + g1_6.negate = true; + } + + bld.OR(tmp, g1_6, brw_imm_d(0x3f800000)); + } + bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000)); + + return true; +} + +static void +emit_find_msb_using_lzd(const fs_builder &bld, + const fs_reg &result, + const fs_reg &src, + bool is_signed) +{ + fs_inst *inst; + fs_reg temp = src; + + if (is_signed) { + /* LZD of an absolute value source almost always does the right + * thing. There are two problem values: + * + * * 0x80000000. Since abs(0x80000000) == 0x80000000, LZD returns + * 0. However, findMSB(int(0x80000000)) == 30. + * + * * 0xffffffff. Since abs(0xffffffff) == 1, LZD returns + * 31. Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: + * + * For a value of zero or negative one, -1 will be returned. + * + * * Negative powers of two. LZD(abs(-(1<<x))) returns x, but + * findMSB(-(1<<x)) should return x-1. 
+ * + * For all negative number cases, including 0x80000000 and + * 0xffffffff, the correct value is obtained from LZD if instead of + * negating the (already negative) value the logical-not is used. A + * conditonal logical-not can be achieved in two instructions. + */ + temp = bld.vgrf(BRW_REGISTER_TYPE_D); + + bld.ASR(temp, src, brw_imm_d(31)); + bld.XOR(temp, temp, src); + } + + bld.LZD(retype(result, BRW_REGISTER_TYPE_UD), + retype(temp, BRW_REGISTER_TYPE_UD)); + + /* LZD counts from the MSB side, while GLSL's findMSB() wants the count + * from the LSB side. Subtract the result from 31 to convert the MSB + * count into an LSB count. If no bits are set, LZD will return 32. + * 31-32 = -1, which is exactly what findMSB() is supposed to return. + */ + inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31)); + inst->src[0].negate = true; +} + +void +fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) +{ + struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key; + fs_inst *inst; + + fs_reg result = get_nir_dest(instr->dest.dest); + result.type = brw_type_for_nir_type(devinfo, + (nir_alu_type)(nir_op_infos[instr->op].output_type | + nir_dest_bit_size(instr->dest.dest))); + + fs_reg op[4]; + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { + op[i] = get_nir_src(instr->src[i].src); + op[i].type = brw_type_for_nir_type(devinfo, + (nir_alu_type)(nir_op_infos[instr->op].input_types[i] | + nir_src_bit_size(instr->src[i].src))); + op[i].abs = instr->src[i].abs; + op[i].negate = instr->src[i].negate; + } + + /* We get a bunch of mov's out of the from_ssa pass and they may still + * be vectorized. We'll handle them as a special-case. We'll also + * handle vecN here because it's basically the same thing. 
+ */ + switch (instr->op) { + case nir_op_imov: + case nir_op_fmov: + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: { + fs_reg temp = result; + bool need_extra_copy = false; + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { + if (!instr->src[i].src.is_ssa && + instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) { + need_extra_copy = true; + temp = bld.vgrf(result.type, 4); + break; + } + } + + for (unsigned i = 0; i < 4; i++) { + if (!(instr->dest.write_mask & (1 << i))) + continue; + + if (instr->op == nir_op_imov || instr->op == nir_op_fmov) { + inst = bld.MOV(offset(temp, bld, i), + offset(op[0], bld, instr->src[0].swizzle[i])); + } else { + inst = bld.MOV(offset(temp, bld, i), + offset(op[i], bld, instr->src[i].swizzle[0])); + } + inst->saturate = instr->dest.saturate; + } + + /* In this case the source and destination registers were the same, + * so we need to insert an extra set of moves in order to deal with + * any swizzling. + */ + if (need_extra_copy) { + for (unsigned i = 0; i < 4; i++) { + if (!(instr->dest.write_mask & (1 << i))) + continue; + + bld.MOV(offset(result, bld, i), offset(temp, bld, i)); + } + } + return; + } + default: + break; + } + + /* At this point, we have dealt with any instruction that operates on + * more than a single channel. Therefore, we can just adjust the source + * and destination registers for that channel and emit the instruction. + */ + unsigned channel = 0; + if (nir_op_infos[instr->op].output_size == 0) { + /* Since NIR is doing the scalarizing for us, we should only ever see + * vectorized operations with a single channel. 
+ */ + assert(_mesa_bitcount(instr->dest.write_mask) == 1); + channel = ffs(instr->dest.write_mask) - 1; + + result = offset(result, bld, channel); + } + + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { + assert(nir_op_infos[instr->op].input_sizes[i] < 2); + op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]); + } + + switch (instr->op) { + case nir_op_i2f: + case nir_op_u2f: + case nir_op_i642d: + case nir_op_u642d: + if (optimize_extract_to_float(instr, result)) + return; + inst = bld.MOV(result, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_f2d: + case nir_op_i2d: + case nir_op_u2d: + /* CHV PRM, vol07, 3D Media GPGPU Engine, Register Region Restrictions: + * + * "When source or destination is 64b (...), regioning in Align1 + * must follow these rules: + * + * 1. Source and destination horizontal stride must be aligned to + * the same qword. + * (...)" + * + * This means that 32-bit to 64-bit conversions need to have the 32-bit + * data elements aligned to 64-bit. This restriction does not apply to + * BDW and later. 
+ */ + if (nir_dest_bit_size(instr->dest.dest) == 64 && + nir_src_bit_size(instr->src[0].src) == 32 && + (devinfo->is_cherryview || devinfo->is_broxton)) { + fs_reg tmp = bld.vgrf(result.type, 1); + tmp = subscript(tmp, op[0].type, 0); + inst = bld.MOV(tmp, op[0]); + inst = bld.MOV(result, tmp); + inst->saturate = instr->dest.saturate; + break; + } + /* fallthrough */ + case nir_op_f2i64: + case nir_op_f2u64: + case nir_op_i2i64: + case nir_op_i2u64: + case nir_op_u2i64: + case nir_op_u2u64: + case nir_op_b2i64: + case nir_op_d2f: + case nir_op_d2i: + case nir_op_d2u: + case nir_op_i642f: + case nir_op_u642f: + case nir_op_u2i32: + case nir_op_i2i32: + case nir_op_u2u32: + case nir_op_i2u32: + if (instr->op == nir_op_b2i64) { + bld.MOV(result, negate(op[0])); + } else { + inst = bld.MOV(result, op[0]); + inst->saturate = instr->dest.saturate; + } + break; + + case nir_op_f2i: + case nir_op_f2u: + bld.MOV(result, op[0]); + break; + + case nir_op_fsign: { + if (op[0].abs) { + /* Straightforward since the source can be assumed to be + * non-negative. + */ + set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(result, op[0])); + set_predicate(BRW_PREDICATE_NORMAL, bld.MOV(result, brw_imm_f(1.0f))); + + } else if (type_sz(op[0].type) < 8) { + /* AND(val, 0x80000000) gives the sign bit. + * + * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not + * zero. 
+ */ + bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ); + + fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD); + op[0].type = BRW_REGISTER_TYPE_UD; + result.type = BRW_REGISTER_TYPE_UD; + bld.AND(result_int, op[0], brw_imm_ud(0x80000000u)); + + inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u)); + inst->predicate = BRW_PREDICATE_NORMAL; + if (instr->dest.saturate) { + inst = bld.MOV(result, result); + inst->saturate = true; + } + } else { + /* For doubles we do the same but we need to consider: + * + * - 2-src instructions can't operate with 64-bit immediates + * - The sign is encoded in the high 32-bit of each DF + * - CMP with DF requires special handling in SIMD16 + * - We need to produce a DF result. + */ + + /* 2-src instructions can't have 64-bit immediates, so put 0.0 in + * a register and compare with that. + */ + fs_reg tmp = vgrf(glsl_type::double_type); + bld.MOV(tmp, setup_imm_df(bld, 0.0)); + + /* A direct DF CMP using the flag register (null dst) won't work in + * SIMD16 because the CMP will be split in two by lower_simd_width, + * resulting in two CMP instructions with the same dst (NULL), + * leading to dead code elimination of the first one. In SIMD8, + * however, there is no need to split the CMP and we can save some + * work. + */ + fs_reg dst_tmp = vgrf(glsl_type::double_type); + bld.CMP(dst_tmp, op[0], tmp, BRW_CONDITIONAL_NZ); + + /* In SIMD16 we want to avoid using a NULL dst register with DF CMP, + * so we store the result of the comparison in a vgrf instead and + * then we generate a UD comparison from that that won't have to + * be split by lower_simd_width. This is what NIR does to handle + * double comparisons in the general case. 
+ */ + if (bld.dispatch_width() == 16 ) { + fs_reg dst_tmp_ud = retype(dst_tmp, BRW_REGISTER_TYPE_UD); + bld.MOV(dst_tmp_ud, subscript(dst_tmp, BRW_REGISTER_TYPE_UD, 0)); + bld.CMP(bld.null_reg_ud(), + dst_tmp_ud, brw_imm_ud(0), BRW_CONDITIONAL_NZ); + } + + /* Get the high 32-bit of each double component where the sign is */ + fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD); + bld.MOV(result_int, subscript(op[0], BRW_REGISTER_TYPE_UD, 1)); + + /* Get the sign bit */ + bld.AND(result_int, result_int, brw_imm_ud(0x80000000u)); + + /* Add 1.0 to the sign, predicated to skip the case of op[0] == 0.0 */ + inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u)); + inst->predicate = BRW_PREDICATE_NORMAL; + + /* Convert from 32-bit float to 64-bit double */ + result.type = BRW_REGISTER_TYPE_DF; + inst = bld.MOV(result, retype(result_int, BRW_REGISTER_TYPE_F)); + + if (instr->dest.saturate) { + inst = bld.MOV(result, result); + inst->saturate = true; + } + } + break; + } + + case nir_op_isign: + /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1). + * -> non-negative val generates 0x00000000. + * Predicated OR sets 1 if val is positive. 
+ */ + assert(nir_dest_bit_size(instr->dest.dest) < 64); + bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_G); + bld.ASR(result, op[0], brw_imm_d(31)); + inst = bld.OR(result, result, brw_imm_d(1)); + inst->predicate = BRW_PREDICATE_NORMAL; + break; + + case nir_op_frcp: + inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fexp2: + inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_flog2: + inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fsin: + inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fcos: + inst = bld.emit(SHADER_OPCODE_COS, result, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fddx: + if (fs_key->high_quality_derivatives) { + inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]); + } else { + inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]); + } + inst->saturate = instr->dest.saturate; + break; + case nir_op_fddx_fine: + inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]); + inst->saturate = instr->dest.saturate; + break; + case nir_op_fddx_coarse: + inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]); + inst->saturate = instr->dest.saturate; + break; + case nir_op_fddy: + if (fs_key->high_quality_derivatives) { + inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]); + } else { + inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]); + } + inst->saturate = instr->dest.saturate; + break; + case nir_op_fddy_fine: + inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]); + inst->saturate = instr->dest.saturate; + break; + case nir_op_fddy_coarse: + inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_iadd: + case nir_op_fadd: + inst = bld.ADD(result, op[0], op[1]); + inst->saturate = 
instr->dest.saturate; + break; + + case nir_op_fmul: + inst = bld.MUL(result, op[0], op[1]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_imul: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + bld.MUL(result, op[0], op[1]); + break; + + case nir_op_imul_high: + case nir_op_umul_high: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]); + break; + + case nir_op_idiv: + case nir_op_udiv: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]); + break; + + case nir_op_uadd_carry: + unreachable("Should have been lowered by carry_to_arith()."); + + case nir_op_usub_borrow: + unreachable("Should have been lowered by borrow_to_arith()."); + + case nir_op_umod: + case nir_op_irem: + /* According to the sign table for INT DIV in the Ivy Bridge PRM, it + * appears that our hardware just does the right thing for signed + * remainder. + */ + assert(nir_dest_bit_size(instr->dest.dest) < 64); + bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]); + break; + + case nir_op_imod: { + /* Get a regular C-style remainder. If a % b == 0, set the predicate. */ + bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]); + + /* Math instructions don't support conditional mod */ + inst = bld.MOV(bld.null_reg_d(), result); + inst->conditional_mod = BRW_CONDITIONAL_NZ; + + /* Now, we need to determine if signs of the sources are different. + * When we XOR the sources, the top bit is 0 if they are the same and 1 + * if they are different. We can then use a conditional modifier to + * turn that into a predicate. This leads us to an XOR.l instruction. + * + * Technically, according to the PRM, you're not allowed to use .l on a + * XOR instruction. However, emperical experiments and Curro's reading + * of the simulator source both indicate that it's safe. 
+ */ + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D); + inst = bld.XOR(tmp, op[0], op[1]); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->conditional_mod = BRW_CONDITIONAL_L; + + /* If the result of the initial remainder operation is non-zero and the + * two sources have different signs, add in a copy of op[1] to get the + * final integer modulus value. + */ + inst = bld.ADD(result, result, op[1]); + inst->predicate = BRW_PREDICATE_NORMAL; + break; + } + + case nir_op_flt: + case nir_op_fge: + case nir_op_feq: + case nir_op_fne: { + fs_reg dest = result; + if (nir_src_bit_size(instr->src[0].src) > 32) { + dest = bld.vgrf(BRW_REGISTER_TYPE_DF, 1); + } + brw_conditional_mod cond; + switch (instr->op) { + case nir_op_flt: + cond = BRW_CONDITIONAL_L; + break; + case nir_op_fge: + cond = BRW_CONDITIONAL_GE; + break; + case nir_op_feq: + cond = BRW_CONDITIONAL_Z; + break; + case nir_op_fne: + cond = BRW_CONDITIONAL_NZ; + break; + default: + unreachable("bad opcode"); + } + bld.CMP(dest, op[0], op[1], cond); + if (nir_src_bit_size(instr->src[0].src) > 32) { + bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0)); + } + break; + } + + case nir_op_ilt: + case nir_op_ult: + case nir_op_ige: + case nir_op_uge: + case nir_op_ieq: + case nir_op_ine: { + fs_reg dest = result; + if (nir_src_bit_size(instr->src[0].src) > 32) { + dest = bld.vgrf(BRW_REGISTER_TYPE_UQ, 1); + } + + brw_conditional_mod cond; + switch (instr->op) { + case nir_op_ilt: + case nir_op_ult: + cond = BRW_CONDITIONAL_L; + break; + case nir_op_ige: + case nir_op_uge: + cond = BRW_CONDITIONAL_GE; + break; + case nir_op_ieq: + cond = BRW_CONDITIONAL_Z; + break; + case nir_op_ine: + cond = BRW_CONDITIONAL_NZ; + break; + default: + unreachable("bad opcode"); + } + bld.CMP(dest, op[0], op[1], cond); + if (nir_src_bit_size(instr->src[0].src) > 32) { + bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0)); + } + break; + } + + case nir_op_inot: + if (devinfo->gen >= 8) { + op[0] = 
resolve_source_modifiers(op[0]); + } + bld.NOT(result, op[0]); + break; + case nir_op_ixor: + if (devinfo->gen >= 8) { + op[0] = resolve_source_modifiers(op[0]); + op[1] = resolve_source_modifiers(op[1]); + } + bld.XOR(result, op[0], op[1]); + break; + case nir_op_ior: + if (devinfo->gen >= 8) { + op[0] = resolve_source_modifiers(op[0]); + op[1] = resolve_source_modifiers(op[1]); + } + bld.OR(result, op[0], op[1]); + break; + case nir_op_iand: + if (devinfo->gen >= 8) { + op[0] = resolve_source_modifiers(op[0]); + op[1] = resolve_source_modifiers(op[1]); + } + bld.AND(result, op[0], op[1]); + break; + + case nir_op_fdot2: + case nir_op_fdot3: + case nir_op_fdot4: + case nir_op_ball_fequal2: + case nir_op_ball_iequal2: + case nir_op_ball_fequal3: + case nir_op_ball_iequal3: + case nir_op_ball_fequal4: + case nir_op_ball_iequal4: + case nir_op_bany_fnequal2: + case nir_op_bany_inequal2: + case nir_op_bany_fnequal3: + case nir_op_bany_inequal3: + case nir_op_bany_fnequal4: + case nir_op_bany_inequal4: + unreachable("Lowered by nir_lower_alu_reductions"); + + case nir_op_fnoise1_1: + case nir_op_fnoise1_2: + case nir_op_fnoise1_3: + case nir_op_fnoise1_4: + case nir_op_fnoise2_1: + case nir_op_fnoise2_2: + case nir_op_fnoise2_3: + case nir_op_fnoise2_4: + case nir_op_fnoise3_1: + case nir_op_fnoise3_2: + case nir_op_fnoise3_3: + case nir_op_fnoise3_4: + case nir_op_fnoise4_1: + case nir_op_fnoise4_2: + case nir_op_fnoise4_3: + case nir_op_fnoise4_4: + unreachable("not reached: should be handled by lower_noise"); + + case nir_op_ldexp: + unreachable("not reached: should be handled by ldexp_to_arith()"); + + case nir_op_fsqrt: + inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_frsq: + inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_b2i: + case nir_op_b2f: + bld.MOV(result, negate(op[0])); + break; + + case nir_op_f2b: + bld.CMP(result, 
op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ); + break; + + case nir_op_i642b: + case nir_op_d2b: { + /* two-argument instructions can't take 64-bit immediates */ + fs_reg zero; + fs_reg tmp; + + if (instr->op == nir_op_d2b) { + zero = vgrf(glsl_type::double_type); + tmp = vgrf(glsl_type::double_type); + } else { + zero = vgrf(glsl_type::int64_t_type); + tmp = vgrf(glsl_type::int64_t_type); + } + + bld.MOV(zero, setup_imm_df(bld, 0.0)); + /* A SIMD16 execution needs to be split in two instructions, so use + * a vgrf instead of the flag register as dst so instruction splitting + * works + */ + bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ); + bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0)); + break; + } + case nir_op_i2b: + bld.CMP(result, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ); + break; + + case nir_op_ftrunc: + inst = bld.RNDZ(result, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fceil: { + op[0].negate = !op[0].negate; + fs_reg temp = vgrf(glsl_type::float_type); + bld.RNDD(temp, op[0]); + temp.negate = true; + inst = bld.MOV(result, temp); + inst->saturate = instr->dest.saturate; + break; + } + case nir_op_ffloor: + inst = bld.RNDD(result, op[0]); + inst->saturate = instr->dest.saturate; + break; + case nir_op_ffract: + inst = bld.FRC(result, op[0]); + inst->saturate = instr->dest.saturate; + break; + case nir_op_fround_even: + inst = bld.RNDE(result, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fquantize2f16: { + fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D); + fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F); + fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F); + + /* The destination stride must be at least as big as the source stride. 
*/ + tmp16.type = BRW_REGISTER_TYPE_W; + tmp16.stride = 2; + + /* Check for denormal */ + fs_reg abs_src0 = op[0]; + abs_src0.abs = true; + bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)), + BRW_CONDITIONAL_L); + /* Get the appropriately signed zero */ + bld.AND(retype(zero, BRW_REGISTER_TYPE_UD), + retype(op[0], BRW_REGISTER_TYPE_UD), + brw_imm_ud(0x80000000)); + /* Do the actual F32 -> F16 -> F32 conversion */ + bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]); + bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16); + /* Select that or zero based on normal status */ + inst = bld.SEL(result, zero, tmp32); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->saturate = instr->dest.saturate; + break; + } + + case nir_op_imin: + case nir_op_umin: + case nir_op_fmin: + inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_imax: + case nir_op_umax: + case nir_op_fmax: + inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_pack_snorm_2x16: + case nir_op_pack_snorm_4x8: + case nir_op_pack_unorm_2x16: + case nir_op_pack_unorm_4x8: + case nir_op_unpack_snorm_2x16: + case nir_op_unpack_snorm_4x8: + case nir_op_unpack_unorm_2x16: + case nir_op_unpack_unorm_4x8: + case nir_op_unpack_half_2x16: + case nir_op_pack_half_2x16: + unreachable("not reached: should be handled by lower_packing_builtins"); + + case nir_op_unpack_half_2x16_split_x: + inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, result, op[0]); + inst->saturate = instr->dest.saturate; + break; + case nir_op_unpack_half_2x16_split_y: + inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, result, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_pack_64_2x32_split: + bld.emit(FS_OPCODE_PACK, result, op[0], op[1]); + break; + + case nir_op_unpack_64_2x32_split_x: + case nir_op_unpack_64_2x32_split_y: { + if (instr->op == 
nir_op_unpack_64_2x32_split_x) + bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0)); + else + bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1)); + break; + } + + case nir_op_fpow: + inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_bitfield_reverse: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + bld.BFREV(result, op[0]); + break; + + case nir_op_bit_count: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + bld.CBIT(result, op[0]); + break; + + case nir_op_ufind_msb: { + assert(nir_dest_bit_size(instr->dest.dest) < 64); + emit_find_msb_using_lzd(bld, result, op[0], false); + break; + } + + case nir_op_ifind_msb: { + assert(nir_dest_bit_size(instr->dest.dest) < 64); + + if (devinfo->gen < 7) { + emit_find_msb_using_lzd(bld, result, op[0], true); + } else { + bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]); + + /* FBH counts from the MSB side, while GLSL's findMSB() wants the + * count from the LSB side. If FBH didn't return an error + * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB + * count into an LSB count. + */ + bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ); + + inst = bld.ADD(result, result, brw_imm_d(31)); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->src[0].negate = true; + } + break; + } + + case nir_op_find_lsb: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + + if (devinfo->gen < 7) { + fs_reg temp = vgrf(glsl_type::int_type); + + /* (x & -x) generates a value that consists of only the LSB of x. + * For all powers of 2, findMSB(y) == findLSB(y). + */ + fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D); + fs_reg negated_src = src; + + /* One must be negated, and the other must be non-negated. It + * doesn't matter which is which. 
+ */ + negated_src.negate = true; + src.negate = false; + + bld.AND(temp, src, negated_src); + emit_find_msb_using_lzd(bld, result, temp, false); + } else { + bld.FBL(result, op[0]); + } + break; + + case nir_op_ubitfield_extract: + case nir_op_ibitfield_extract: + unreachable("should have been lowered"); + case nir_op_ubfe: + case nir_op_ibfe: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + bld.BFE(result, op[2], op[1], op[0]); + break; + case nir_op_bfm: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + bld.BFI1(result, op[0], op[1]); + break; + case nir_op_bfi: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + bld.BFI2(result, op[0], op[1], op[2]); + break; + + case nir_op_bitfield_insert: + unreachable("not reached: should have been lowered"); + + case nir_op_ishl: + bld.SHL(result, op[0], op[1]); + break; + case nir_op_ishr: + bld.ASR(result, op[0], op[1]); + break; + case nir_op_ushr: + bld.SHR(result, op[0], op[1]); + break; + + case nir_op_pack_half_2x16_split: + bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]); + break; + + case nir_op_ffma: + inst = bld.MAD(result, op[2], op[1], op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_flrp: + inst = bld.LRP(result, op[0], op[1], op[2]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_bcsel: + if (optimize_frontfacing_ternary(instr, result)) + return; + + bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ); + inst = bld.SEL(result, op[1], op[2]); + inst->predicate = BRW_PREDICATE_NORMAL; + break; + + case nir_op_extract_u8: + case nir_op_extract_i8: { + const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8); + nir_const_value *byte = nir_src_as_const_value(instr->src[1].src); + assert(byte != NULL); + bld.MOV(result, subscript(op[0], type, byte->u32[0])); + break; + } + + case nir_op_extract_u16: + case nir_op_extract_i16: { + const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16); + 
nir_const_value *word = nir_src_as_const_value(instr->src[1].src); + assert(word != NULL); + bld.MOV(result, subscript(op[0], type, word->u32[0])); + break; + } + + default: + unreachable("unhandled instruction"); + } + + /* If we need to do a boolean resolve, replace the result with -(x & 1) + * to sign extend the low bit to 0/~0 + */ + if (devinfo->gen <= 5 && + (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) { + fs_reg masked = vgrf(glsl_type::int_type); + bld.AND(masked, result, brw_imm_d(1)); + masked.negate = true; + bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked); + } +} + +void +fs_visitor::nir_emit_load_const(const fs_builder &bld, + nir_load_const_instr *instr) +{ + const brw_reg_type reg_type = + instr->def.bit_size == 32 ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF; + fs_reg reg = bld.vgrf(reg_type, instr->def.num_components); + + switch (instr->def.bit_size) { + case 32: + for (unsigned i = 0; i < instr->def.num_components; i++) + bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i32[i])); + break; + + case 64: + for (unsigned i = 0; i < instr->def.num_components; i++) + bld.MOV(offset(reg, bld, i), + setup_imm_df(bld, instr->value.f64[i])); + break; + + default: + unreachable("Invalid bit size"); + } + + nir_ssa_values[instr->def.index] = reg; +} + +fs_reg +fs_visitor::get_nir_src(const nir_src &src) +{ + fs_reg reg; + if (src.is_ssa) { + if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) { + const brw_reg_type reg_type = src.ssa->bit_size == 32 ? 
+ BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF; + reg = bld.vgrf(reg_type, src.ssa->num_components); + } else { + reg = nir_ssa_values[src.ssa->index]; + } + } else { + /* We don't handle indirects on locals */ + assert(src.reg.indirect == NULL); + reg = offset(nir_locals[src.reg.reg->index], bld, + src.reg.base_offset * src.reg.reg->num_components); + } + + /* to avoid floating-point denorm flushing problems, set the type by + * default to D - instructions that need floating point semantics will set + * this to F if they need to + */ + return retype(reg, BRW_REGISTER_TYPE_D); +} + +/** + * Return an IMM for constants; otherwise call get_nir_src() as normal. + */ +fs_reg +fs_visitor::get_nir_src_imm(const nir_src &src) +{ + nir_const_value *val = nir_src_as_const_value(src); + return val ? fs_reg(brw_imm_d(val->i32[0])) : get_nir_src(src); +} + +fs_reg +fs_visitor::get_nir_dest(const nir_dest &dest) +{ + if (dest.is_ssa) { + const brw_reg_type reg_type = + dest.ssa.bit_size == 32 ? BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF; + nir_ssa_values[dest.ssa.index] = + bld.vgrf(reg_type, dest.ssa.num_components); + return nir_ssa_values[dest.ssa.index]; + } else { + /* We don't handle indirects on locals */ + assert(dest.reg.indirect == NULL); + return offset(nir_locals[dest.reg.reg->index], bld, + dest.reg.base_offset * dest.reg.reg->num_components); + } +} + +fs_reg +fs_visitor::get_nir_image_deref(const nir_deref_var *deref) +{ + fs_reg image(UNIFORM, deref->var->data.driver_location / 4, + BRW_REGISTER_TYPE_UD); + fs_reg indirect; + unsigned indirect_max = 0; + + for (const nir_deref *tail = &deref->deref; tail->child; + tail = tail->child) { + const nir_deref_array *deref_array = nir_deref_as_array(tail->child); + assert(tail->child->deref_type == nir_deref_type_array); + const unsigned size = glsl_get_length(tail->type); + const unsigned element_size = type_size_scalar(deref_array->deref.type); + const unsigned base = MIN2(deref_array->base_offset, size - 1); + image = 
offset(image, bld, base * element_size); + + if (deref_array->deref_array_type == nir_deref_array_type_indirect) { + fs_reg tmp = vgrf(glsl_type::uint_type); + + /* Accessing an invalid surface index with the dataport can result + * in a hang. According to the spec "if the index used to + * select an individual element is negative or greater than or + * equal to the size of the array, the results of the operation + * are undefined but may not lead to termination" -- which is one + * of the possible outcomes of the hang. Clamp the index to + * prevent access outside of the array bounds. + */ + bld.emit_minmax(tmp, retype(get_nir_src(deref_array->indirect), + BRW_REGISTER_TYPE_UD), + brw_imm_ud(size - base - 1), BRW_CONDITIONAL_L); + + indirect_max += element_size * (tail->type->length - 1); + + bld.MUL(tmp, tmp, brw_imm_ud(element_size * 4)); + if (indirect.file == BAD_FILE) { + indirect = tmp; + } else { + bld.ADD(indirect, indirect, tmp); + } + } + } + + if (indirect.file == BAD_FILE) { + return image; + } else { + /* Emit a pile of MOVs to load the uniform into a temporary. The + * dead-code elimination pass will get rid of what we don't use. 
+ */ + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, BRW_IMAGE_PARAM_SIZE); + for (unsigned j = 0; j < BRW_IMAGE_PARAM_SIZE; j++) { + bld.emit(SHADER_OPCODE_MOV_INDIRECT, + offset(tmp, bld, j), offset(image, bld, j), + indirect, brw_imm_ud((indirect_max + 1) * 4)); + } + return tmp; + } +} + +void +fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst, + unsigned wr_mask) +{ + for (unsigned i = 0; i < 4; i++) { + if (!((wr_mask >> i) & 1)) + continue; + + fs_inst *new_inst = new(mem_ctx) fs_inst(inst); + new_inst->dst = offset(new_inst->dst, bld, i); + for (unsigned j = 0; j < new_inst->sources; j++) + if (new_inst->src[j].file == VGRF) + new_inst->src[j] = offset(new_inst->src[j], bld, i); + + bld.emit(new_inst); + } +} + +/** + * Get the matching channel register datatype for an image intrinsic of the + * specified GLSL image type. + */ +static brw_reg_type +get_image_base_type(const glsl_type *type) +{ + switch ((glsl_base_type)type->sampled_type) { + case GLSL_TYPE_UINT: + return BRW_REGISTER_TYPE_UD; + case GLSL_TYPE_INT: + return BRW_REGISTER_TYPE_D; + case GLSL_TYPE_FLOAT: + return BRW_REGISTER_TYPE_F; + default: + unreachable("Not reached."); + } +} + +/** + * Get the appropriate atomic op for an image atomic intrinsic. + */ +static unsigned +get_image_atomic_op(nir_intrinsic_op op, const glsl_type *type) +{ + switch (op) { + case nir_intrinsic_image_atomic_add: + return BRW_AOP_ADD; + case nir_intrinsic_image_atomic_min: + return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ? + BRW_AOP_IMIN : BRW_AOP_UMIN); + case nir_intrinsic_image_atomic_max: + return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ? 
+ BRW_AOP_IMAX : BRW_AOP_UMAX); + case nir_intrinsic_image_atomic_and: + return BRW_AOP_AND; + case nir_intrinsic_image_atomic_or: + return BRW_AOP_OR; + case nir_intrinsic_image_atomic_xor: + return BRW_AOP_XOR; + case nir_intrinsic_image_atomic_exchange: + return BRW_AOP_MOV; + case nir_intrinsic_image_atomic_comp_swap: + return BRW_AOP_CMPWR; + default: + unreachable("Not reachable."); + } +} + +static fs_inst * +emit_pixel_interpolater_send(const fs_builder &bld, + enum opcode opcode, + const fs_reg &dst, + const fs_reg &src, + const fs_reg &desc, + glsl_interp_mode interpolation) +{ + struct brw_wm_prog_data *wm_prog_data = + brw_wm_prog_data(bld.shader->stage_prog_data); + fs_inst *inst; + fs_reg payload; + int mlen; + + if (src.file == BAD_FILE) { + /* Dummy payload */ + payload = bld.vgrf(BRW_REGISTER_TYPE_F, 1); + mlen = 1; + } else { + payload = src; + mlen = 2 * bld.dispatch_width() / 8; + } + + inst = bld.emit(opcode, dst, payload, desc); + inst->mlen = mlen; + /* 2 floats per slot returned */ + inst->size_written = 2 * dst.component_size(inst->exec_size); + inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE; + + wm_prog_data->pulls_bary = true; + + return inst; +} + +/** + * Computes 1 << x, given a D/UD register containing some value x. + */ +static fs_reg +intexp2(const fs_builder &bld, const fs_reg &x) +{ + assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D); + + fs_reg result = bld.vgrf(x.type, 1); + fs_reg one = bld.vgrf(x.type, 1); + + bld.MOV(one, retype(brw_imm_d(1), one.type)); + bld.SHL(result, one, x); + return result; +} + +void +fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src) +{ + assert(stage == MESA_SHADER_GEOMETRY); + + struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); + + if (gs_compile->control_data_header_size_bits == 0) + return; + + /* We can only do EndPrimitive() functionality when the control data + * consists of cut bits. 
Fortunately, the only time it isn't is when the + * output type is points, in which case EndPrimitive() is a no-op. + */ + if (gs_prog_data->control_data_format != + GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) { + return; + } + + /* Cut bits use one bit per vertex. */ + assert(gs_compile->control_data_bits_per_vertex == 1); + + fs_reg vertex_count = get_nir_src(vertex_count_nir_src); + vertex_count.type = BRW_REGISTER_TYPE_UD; + + /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting + * vertex n, 0 otherwise. So all we need to do here is mark bit + * (vertex_count - 1) % 32 in the cut_bits register to indicate that + * EndPrimitive() was called after emitting vertex (vertex_count - 1); + * vec4_gs_visitor::emit_control_data_bits() will take care of the rest. + * + * Note that if EndPrimitive() is called before emitting any vertices, this + * will cause us to set bit 31 of the control_data_bits register to 1. + * That's fine because: + * + * - If max_vertices < 32, then vertex number 31 (zero-based) will never be + * output, so the hardware will ignore cut bit 31. + * + * - If max_vertices == 32, then vertex number 31 is guaranteed to be the + * last vertex, so setting cut bit 31 has no effect (since the primitive + * is automatically ended when the GS terminates). + * + * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the + * control_data_bits register to 0 when the first vertex is emitted. + */ + + const fs_builder abld = bld.annotate("end primitive"); + + /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */ + fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu)); + fs_reg mask = intexp2(abld, prev_count); + /* Note: we're relying on the fact that the GEN SHL instruction only pays + * attention to the lower 5 bits of its second source argument, so on this + * architecture, 1 << (vertex_count - 1) is equivalent to 1 << + * ((vertex_count - 1) % 32). 
+ */ + abld.OR(this->control_data_bits, this->control_data_bits, mask); +} + +void +fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count) +{ + assert(stage == MESA_SHADER_GEOMETRY); + assert(gs_compile->control_data_bits_per_vertex != 0); + + struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); + + const fs_builder abld = bld.annotate("emit control data bits"); + const fs_builder fwa_bld = bld.exec_all(); + + /* We use a single UD register to accumulate control data bits (32 bits + * for each of the SIMD8 channels). So we need to write a DWord (32 bits) + * at a time. + * + * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets. + * We have to select a 128-bit group via the Global and Per-Slot Offsets, then + * use the Channel Mask phase to enable/disable which DWord within that + * group to write. (Remember, different SIMD8 channels may have emitted + * different numbers of vertices, so we may need per-slot offsets.) + * + * Channel masking presents an annoying problem: we may have to replicate + * the data up to 4 times: + * + * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data. + * + * To avoid penalizing shaders that emit a small number of vertices, we + * can avoid these sometimes: if the size of the control data header is + * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will + * land in the same 128-bit group, so we can skip per-slot offsets. + * + * Similarly, if the control data header is <= 32 bits, there is only one + * DWord, so we can skip channel masks.
+ */ + enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8; + + fs_reg channel_mask, per_slot_offset; + + if (gs_compile->control_data_header_size_bits > 32) { + opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; + channel_mask = vgrf(glsl_type::uint_type); + } + + if (gs_compile->control_data_header_size_bits > 128) { + opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT; + per_slot_offset = vgrf(glsl_type::uint_type); + } + + /* Figure out which DWord we're trying to write to using the formula: + * + * dword_index = (vertex_count - 1) * bits_per_vertex / 32 + * + * Since bits_per_vertex is a power of two, and is known at compile + * time, this can be optimized to: + * + * dword_index = (vertex_count - 1) >> (5 - log2(bits_per_vertex)) + * + * NOTE(review): the code below shifts by 6 - util_last_bit(bits_per_vertex); + * that matches this formula only if util_last_bit() returns log2 + 1 for a + * power of two -- confirm against its definition. + */ + if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) { + fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu)); + unsigned log2_bits_per_vertex = + util_last_bit(gs_compile->control_data_bits_per_vertex); + abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex)); + + if (per_slot_offset.file != BAD_FILE) { + /* Set the per-slot offset to dword_index / 4, so that we'll write to + * the appropriate OWord within the control data header. + */ + abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u)); + } + + /* Set the channel masks to 1 << (dword_index % 4), so that we'll + * write to the appropriate DWORD within the OWORD. + */ + fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fwa_bld.AND(channel, dword_index, brw_imm_ud(3u)); + channel_mask = intexp2(fwa_bld, channel); + /* Then the channel masks need to be in bits 23:16. */ + fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u)); + } + + /* Store the control data bits in the message payload and send it.
*/ + int mlen = 2; + if (channel_mask.file != BAD_FILE) + mlen += 4; /* channel masks, plus 3 extra copies of the data */ + if (per_slot_offset.file != BAD_FILE) + mlen++; + + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); + fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen); + int i = 0; + sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); + if (per_slot_offset.file != BAD_FILE) + sources[i++] = per_slot_offset; + if (channel_mask.file != BAD_FILE) + sources[i++] = channel_mask; + while (i < mlen) { + sources[i++] = this->control_data_bits; + } + + abld.LOAD_PAYLOAD(payload, sources, mlen, mlen); + fs_inst *inst = abld.emit(opcode, reg_undef, payload); + inst->mlen = mlen; + /* We need to increment Global Offset by 256-bits to make room for + * Broadwell's extra "Vertex Count" payload at the beginning of the + * URB entry. Since this is an OWord message, Global Offset is counted + * in 128-bit units, so we must set it to 2. + */ + if (gs_prog_data->static_vertex_count == -1) + inst->offset = 2; +} + +void +fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count, + unsigned stream_id) +{ + /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */ + + /* Note: we are calling this *before* increasing vertex_count, so + * this->vertex_count == vertex_count - 1 in the formula above. + */ + + /* Stream mode uses 2 bits per vertex */ + assert(gs_compile->control_data_bits_per_vertex == 2); + + /* Must be a valid stream */ + assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS); + + /* Control data bits are initialized to 0 so we don't have to set any + * bits when sending vertices to stream 0. 
+ */ + if (stream_id == 0) + return; + + const fs_builder abld = bld.annotate("set stream control data bits", NULL); + + /* reg::sid = stream_id */ + fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.MOV(sid, brw_imm_ud(stream_id)); + + /* reg:shift_count = 2 * (vertex_count - 1) */ + fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.SHL(shift_count, vertex_count, brw_imm_ud(1u)); + + /* Note: we're relying on the fact that the GEN SHL instruction only pays + * attention to the lower 5 bits of its second source argument, so on this + * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to + * stream_id << ((2 * (vertex_count - 1)) % 32). + */ + fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.SHL(mask, sid, shift_count); + abld.OR(this->control_data_bits, this->control_data_bits, mask); +} + +void +fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src, + unsigned stream_id) +{ + assert(stage == MESA_SHADER_GEOMETRY); + + struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); + + fs_reg vertex_count = get_nir_src(vertex_count_nir_src); + vertex_count.type = BRW_REGISTER_TYPE_UD; + + /* Haswell and later hardware ignores the "Render Stream Select" bits + * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled, + * and instead sends all primitives down the pipeline for rasterization. + * If the SOL stage is enabled, "Render Stream Select" is honored and + * primitives bound to non-zero streams are discarded after stream output. + * + * Since the only purpose of primitives sent to non-zero streams is to + * be recorded by transform feedback, we can simply discard all geometry + * bound to these streams when transform feedback is disabled. + */ + if (stream_id > 0 && !nir->info->has_transform_feedback_varyings) + return; + + /* If we're outputting 32 control data bits or less, then we can wait + * until the shader is over to output them all. Otherwise we need to + * output them as we go.
Now is the time to do it, since we're about to + * output the vertex_count'th vertex, so it's guaranteed that the + * control data bits associated with the (vertex_count - 1)th vertex are + * correct. + */ + if (gs_compile->control_data_header_size_bits > 32) { + const fs_builder abld = + bld.annotate("emit vertex: emit control data bits"); + + /* Only emit control data bits if we've finished accumulating a batch + * of 32 bits. This is the case when: + * + * (vertex_count * bits_per_vertex) % 32 == 0 + * + * (in other words, when the last 5 bits of vertex_count * + * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some + * integer n (which is always the case, since bits_per_vertex is + * always 1 or 2), this is equivalent to requiring that the last 5-n + * bits of vertex_count are 0: + * + * vertex_count & (2^(5-n) - 1) == 0 + * + * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is + * equivalent to: + * + * vertex_count & (32 / bits_per_vertex - 1) == 0 + * + * TODO: If vertex_count is an immediate, we could do some of this math + * at compile time... + */ + fs_inst *inst = + abld.AND(bld.null_reg_d(), vertex_count, + brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u)); + inst->conditional_mod = BRW_CONDITIONAL_Z; + + abld.IF(BRW_PREDICATE_NORMAL); + /* If vertex_count is 0, then no control data bits have been + * accumulated yet, so we can skip emitting them. + */ + abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u), + BRW_CONDITIONAL_NEQ); + abld.IF(BRW_PREDICATE_NORMAL); + emit_gs_control_data_bits(vertex_count); + abld.emit(BRW_OPCODE_ENDIF); + + /* Reset control_data_bits to 0 so we can start accumulating a new + * batch. + * + * Note: in the case where vertex_count == 0, this neutralizes the + * effect of any call to EndPrimitive() that the shader may have + * made before outputting its first vertex. 
+ */ + inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u)); + inst->force_writemask_all = true; + abld.emit(BRW_OPCODE_ENDIF); + } + + emit_urb_writes(vertex_count); + + /* In stream mode we have to set control data bits for all vertices + * unless we have disabled control data bits completely (which we + * do for GL_POINTS outputs that don't use streams). + */ + if (gs_compile->control_data_header_size_bits > 0 && + gs_prog_data->control_data_format == + GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) { + set_gs_stream_control_data_bits(vertex_count, stream_id); + } +} + +void +fs_visitor::emit_gs_input_load(const fs_reg &dst, + const nir_src &vertex_src, + unsigned base_offset, + const nir_src &offset_src, + unsigned num_components, + unsigned first_component) +{ + struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); + + nir_const_value *vertex_const = nir_src_as_const_value(vertex_src); + nir_const_value *offset_const = nir_src_as_const_value(offset_src); + const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8; + + /* Offset 0 is the VUE header, which contains VARYING_SLOT_LAYER [.y], + * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w]. Only + * gl_PointSize is available as a GS input, however, so it must be that. + */ + const bool is_point_size = (base_offset == 0); + + /* TODO: figure out push input layout for invocations == 1 */ + if (gs_prog_data->invocations == 1 && + offset_const != NULL && vertex_const != NULL && + 4 * (base_offset + offset_const->u32[0]) < push_reg_count) { + int imm_offset = (base_offset + offset_const->u32[0]) * 4 + + vertex_const->u32[0] * push_reg_count; + /* This input was pushed into registers.
*/ + if (is_point_size) { + /* gl_PointSize comes in .w */ + bld.MOV(dst, fs_reg(ATTR, imm_offset + 3, dst.type)); + } else { + for (unsigned i = 0; i < num_components; i++) { + bld.MOV(offset(dst, bld, i), + fs_reg(ATTR, imm_offset + i + first_component, dst.type)); + } + } + return; + } + + /* Resort to the pull model. Ensure the VUE handles are provided. */ + gs_prog_data->base.include_vue_handles = true; + + unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2; + fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + + if (gs_prog_data->invocations == 1) { + if (vertex_const) { + /* The vertex index is constant; just select the proper URB handle. */ + icp_handle = + retype(brw_vec8_grf(first_icp_handle + vertex_const->i32[0], 0), + BRW_REGISTER_TYPE_UD); + } else { + /* The vertex index is non-constant. We need to use indirect + * addressing to fetch the proper URB handle. + * + * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0> + * indicating that channel <n> should read the handle from + * DWord <n>. We convert that to bytes by multiplying by 4. + * + * Next, we convert the vertex index to bytes by multiplying + * by 32 (shifting by 5), and add the two together. This is + * the final indirect byte offset. 
+ */ + fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_W, 1); + fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + + /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */ + bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210))); + /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */ + bld.SHL(channel_offsets, sequence, brw_imm_ud(2u)); + /* Convert vertex_index to bytes (multiply by 32) */ + bld.SHL(vertex_offset_bytes, + retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), + brw_imm_ud(5u)); + bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets); + + /* Use first_icp_handle as the base offset. There is one register + * of URB handles per vertex, so inform the register allocator that + * we might read up to nir->info->gs.vertices_in registers. + */ + bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, + retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type), + fs_reg(icp_offset_bytes), + brw_imm_ud(nir->info->gs.vertices_in * REG_SIZE)); + } + } else { + assert(gs_prog_data->invocations > 1); + + if (vertex_const) { + assert(devinfo->gen >= 9 || vertex_const->i32[0] <= 5); + bld.MOV(icp_handle, + retype(brw_vec1_grf(first_icp_handle + + vertex_const->i32[0] / 8, + vertex_const->i32[0] % 8), + BRW_REGISTER_TYPE_UD)); + } else { + /* The vertex index is non-constant. We need to use indirect + * addressing to fetch the proper URB handle. + * + */ + fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + + /* Convert vertex_index to bytes (multiply by 4) */ + bld.SHL(icp_offset_bytes, + retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), + brw_imm_ud(2u)); + + /* Use first_icp_handle as the base offset. There is one DWord + * of URB handles per vertex, so inform the register allocator that + * we might read up to ceil(nir->info->gs.vertices_in / 8) registers. 
+ */ + bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, + retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type), + fs_reg(icp_offset_bytes), + brw_imm_ud(DIV_ROUND_UP(nir->info->gs.vertices_in, 8) * + REG_SIZE)); + } + } + + fs_inst *inst; + + fs_reg tmp_dst = dst; + fs_reg indirect_offset = get_nir_src(offset_src); + unsigned num_iterations = 1; + unsigned orig_num_components = num_components; + + if (type_sz(dst.type) == 8) { + if (num_components > 2) { + num_iterations = 2; + num_components = 2; + } + fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type); + tmp_dst = tmp; + first_component = first_component / 2; + } + + for (unsigned iter = 0; iter < num_iterations; iter++) { + if (offset_const) { + /* Constant indexing - use global offset. */ + if (first_component != 0) { + unsigned read_components = num_components + first_component; + fs_reg tmp = bld.vgrf(dst.type, read_components); + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle); + inst->size_written = read_components * + tmp.component_size(inst->exec_size); + for (unsigned i = 0; i < num_components; i++) { + bld.MOV(offset(tmp_dst, bld, i), + offset(tmp, bld, i + first_component)); + } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp_dst, + icp_handle); + inst->size_written = num_components * + tmp_dst.component_size(inst->exec_size); + } + inst->offset = base_offset + offset_const->u32[0]; + inst->mlen = 1; + } else { + /* Indirect indexing - use per-slot offsets as well. 
*/ + const fs_reg srcs[] = { icp_handle, indirect_offset }; + unsigned read_components = num_components + first_component; + fs_reg tmp = bld.vgrf(dst.type, read_components); + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); + bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); + if (first_component != 0) { + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, + payload); + inst->size_written = read_components * + tmp.component_size(inst->exec_size); + for (unsigned i = 0; i < num_components; i++) { + bld.MOV(offset(tmp_dst, bld, i), + offset(tmp, bld, i + first_component)); + } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp_dst, + payload); + inst->size_written = num_components * + tmp_dst.component_size(inst->exec_size); + } + inst->offset = base_offset; + inst->mlen = 2; + } + + if (type_sz(dst.type) == 8) { + shuffle_32bit_load_result_to_64bit_data( + bld, tmp_dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), num_components); + + for (unsigned c = 0; c < num_components; c++) + bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c)); + } + + if (num_iterations > 1) { + num_components = orig_num_components - 2; + if(offset_const) { + base_offset++; + } else { + fs_reg new_indirect = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld.ADD(new_indirect, indirect_offset, brw_imm_ud(1u)); + indirect_offset = new_indirect; + } + } + } + + if (is_point_size) { + /* Read the whole VUE header (because of alignment) and read .w. */ + fs_reg tmp = bld.vgrf(dst.type, 4); + inst->dst = tmp; + inst->size_written = 4 * REG_SIZE; + bld.MOV(dst, offset(tmp, bld, 3)); + } +} + +fs_reg +fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr) +{ + nir_src *offset_src = nir_get_io_offset_src(instr); + nir_const_value *const_value = nir_src_as_const_value(*offset_src); + + if (const_value) { + /* The only constant offset we should find is 0. brw_nir.c's + * add_const_offset_to_base() will fold other constant offsets + * into instr->const_index[0]. 
+ */ + assert(const_value->u32[0] == 0); + return fs_reg(); + } + + return get_nir_src(*offset_src); +} + +static void +do_untyped_vector_read(const fs_builder &bld, + const fs_reg dest, + const fs_reg surf_index, + const fs_reg offset_reg, + unsigned num_components) +{ + if (type_sz(dest.type) == 4) { + fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg, + 1 /* dims */, + num_components, + BRW_PREDICATE_NONE); + read_result.type = dest.type; + for (unsigned i = 0; i < num_components; i++) + bld.MOV(offset(dest, bld, i), offset(read_result, bld, i)); + } else if (type_sz(dest.type) == 8) { + /* Reading a dvec, so we need to: + * + * 1. Multiply num_components by 2, to account for the fact that we + * need to read 64-bit components. + * 2. Shuffle the result of the load to form valid 64-bit elements + * 3. Emit a second load (for components z/w) if needed. + */ + fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.MOV(read_offset, offset_reg); + + int iters = num_components <= 2 ? 1 : 2; + + /* Load the dvec, the first iteration loads components x/y, the second + * iteration, if needed, loads components z/w + */ + for (int it = 0; it < iters; it++) { + /* Compute number of components to read in this iteration */ + int iter_components = MIN2(2, num_components); + num_components -= iter_components; + + /* Read. Since this message reads 32-bit components, we need to + * read twice as many components. 
+ */ + fs_reg read_result = emit_untyped_read(bld, surf_index, read_offset, + 1 /* dims */, + iter_components * 2, + BRW_PREDICATE_NONE); + + /* Shuffle the 32-bit load result into valid 64-bit data */ + const fs_reg packed_result = bld.vgrf(dest.type, iter_components); + shuffle_32bit_load_result_to_64bit_data( + bld, packed_result, read_result, iter_components); + + /* Move each component to its destination */ + read_result = retype(read_result, BRW_REGISTER_TYPE_DF); + for (int c = 0; c < iter_components; c++) { + bld.MOV(offset(dest, bld, it * 2 + c), + offset(packed_result, bld, c)); + } + + bld.ADD(read_offset, read_offset, brw_imm_ud(16)); + } + } else { + unreachable("Unsupported type"); + } +} + +void +fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + assert(stage == MESA_SHADER_VERTEX); + + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_dest(instr->dest); + + switch (instr->intrinsic) { + case nir_intrinsic_load_vertex_id: + unreachable("should be lowered by lower_vertex_id()"); + + case nir_intrinsic_load_vertex_id_zero_base: + case nir_intrinsic_load_base_vertex: + case nir_intrinsic_load_instance_id: + case nir_intrinsic_load_base_instance: + case nir_intrinsic_load_draw_id: { + gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); + fs_reg val = nir_system_values[sv]; + assert(val.file != BAD_FILE); + dest.type = val.type; + bld.MOV(dest, val); + break; + } + + case nir_intrinsic_load_input: { + fs_reg src = fs_reg(ATTR, instr->const_index[0], dest.type); + unsigned first_component = nir_intrinsic_component(instr); + unsigned num_components = instr->num_components; + enum brw_reg_type type = dest.type; + + nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); + assert(const_offset && "Indirect input loads not allowed"); + src = offset(src, bld, const_offset->u32[0]); + + for (unsigned j = 0; j < num_components; j++) { + 
bld.MOV(offset(dest, bld, j), offset(src, bld, j + first_component)); + } + + if (type == BRW_REGISTER_TYPE_DF) { + /* Once the double vector is read, set again its original register + * type to continue with normal execution. + */ + src = retype(src, type); + dest = retype(dest, type); + } + + if (type_sz(src.type) == 8) { + shuffle_32bit_load_result_to_64bit_data(bld, + dest, + retype(dest, BRW_REGISTER_TYPE_F), + instr->num_components); + } + break; + } + + default: + nir_emit_intrinsic(bld, instr); + break; + } +} + +void +fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + assert(stage == MESA_SHADER_TESS_CTRL); + struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key; + struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); + + fs_reg dst; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dst = get_nir_dest(instr->dest); + + switch (instr->intrinsic) { + case nir_intrinsic_load_primitive_id: + bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1))); + break; + case nir_intrinsic_load_invocation_id: + bld.MOV(retype(dst, invocation_id.type), invocation_id); + break; + case nir_intrinsic_load_patch_vertices_in: + bld.MOV(retype(dst, BRW_REGISTER_TYPE_D), + brw_imm_d(tcs_key->input_vertices)); + break; + + case nir_intrinsic_barrier: { + if (tcs_prog_data->instances == 1) + break; + + fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg m0_2 = component(m0, 2); + + const fs_builder chanbld = bld.exec_all().group(1, 0); + + /* Zero the message header */ + bld.exec_all().MOV(m0, brw_imm_ud(0u)); + + /* Copy "Barrier ID" from r0.2, bits 16:13 */ + chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), + brw_imm_ud(INTEL_MASK(16, 13))); + + /* Shift it up to bits 27:24. 
*/ + chanbld.SHL(m0_2, m0_2, brw_imm_ud(11)); + + /* Set the Barrier Count and the enable bit */ + chanbld.OR(m0_2, m0_2, + brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15))); + + bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0); + break; + } + + case nir_intrinsic_load_input: + unreachable("nir_lower_io should never give us these."); + break; + + case nir_intrinsic_load_per_vertex_input: { + fs_reg indirect_offset = get_indirect_offset(instr); + unsigned imm_offset = instr->const_index[0]; + + const nir_src &vertex_src = instr->src[0]; + nir_const_value *vertex_const = nir_src_as_const_value(vertex_src); + + fs_inst *inst; + + fs_reg icp_handle; + + if (vertex_const) { + /* Emit a MOV to resolve <0,1,0> regioning. */ + icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld.MOV(icp_handle, + retype(brw_vec1_grf(1 + (vertex_const->i32[0] >> 3), + vertex_const->i32[0] & 7), + BRW_REGISTER_TYPE_UD)); + } else if (tcs_prog_data->instances == 1 && + vertex_src.is_ssa && + vertex_src.ssa->parent_instr->type == nir_instr_type_intrinsic && + nir_instr_as_intrinsic(vertex_src.ssa->parent_instr)->intrinsic == nir_intrinsic_load_invocation_id) { + /* For the common case of only 1 instance, an array index of + * gl_InvocationID means reading g1. Skip all the indirect work. + */ + icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD); + } else { + /* The vertex index is non-constant. We need to use indirect + * addressing to fetch the proper URB handle. + */ + icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + + /* Each ICP handle is a single DWord (4 bytes) */ + fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld.SHL(vertex_offset_bytes, + retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), + brw_imm_ud(2u)); + + /* Start at g1. We might read up to 4 registers. 
*/ + bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, + retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes, + brw_imm_ud(4 * REG_SIZE)); + } + + /* We can only read two double components with each URB read, so + * we send two read messages in that case, each one loading up to + * two double components. + */ + unsigned num_iterations = 1; + unsigned num_components = instr->num_components; + unsigned first_component = nir_intrinsic_component(instr); + fs_reg orig_dst = dst; + if (type_sz(dst.type) == 8) { + first_component = first_component / 2; + if (instr->num_components > 2) { + num_iterations = 2; + num_components = 2; + } + + fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type); + dst = tmp; + } + + for (unsigned iter = 0; iter < num_iterations; iter++) { + if (indirect_offset.file == BAD_FILE) { + /* Constant indexing - use global offset. */ + if (first_component != 0) { + unsigned read_components = num_components + first_component; + fs_reg tmp = bld.vgrf(dst.type, read_components); + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle); + for (unsigned i = 0; i < num_components; i++) { + bld.MOV(offset(dst, bld, i), + offset(tmp, bld, i + first_component)); + } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle); + } + inst->offset = imm_offset; + inst->mlen = 1; + } else { + /* Indirect indexing - use per-slot offsets as well. 
*/ + const fs_reg srcs[] = { icp_handle, indirect_offset }; + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); + bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); + if (first_component != 0) { + unsigned read_components = num_components + first_component; + fs_reg tmp = bld.vgrf(dst.type, read_components); + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, + payload); + for (unsigned i = 0; i < num_components; i++) { + bld.MOV(offset(dst, bld, i), + offset(tmp, bld, i + first_component)); + } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, + payload); + } + inst->offset = imm_offset; + inst->mlen = 2; + } + inst->size_written = (num_components + first_component) * + inst->dst.component_size(inst->exec_size); + + /* If we are reading 64-bit data using 32-bit read messages we need + * build proper 64-bit data elements by shuffling the low and high + * 32-bit components around like we do for other things like UBOs + * or SSBOs. + */ + if (type_sz(dst.type) == 8) { + shuffle_32bit_load_result_to_64bit_data( + bld, dst, retype(dst, BRW_REGISTER_TYPE_F), num_components); + + for (unsigned c = 0; c < num_components; c++) { + bld.MOV(offset(orig_dst, bld, iter * 2 + c), + offset(dst, bld, c)); + } + } + + /* Copy the temporary to the destination to deal with writemasking. + * + * Also attempt to deal with gl_PointSize being in the .w component. 
+ */ + if (inst->offset == 0 && indirect_offset.file == BAD_FILE) { + assert(type_sz(dst.type) < 8); + inst->dst = bld.vgrf(dst.type, 4); + inst->size_written = 4 * REG_SIZE; + bld.MOV(dst, offset(inst->dst, bld, 3)); + } + + /* If we are loading double data and we need a second read message + * adjust the write offset + */ + if (num_iterations > 1) { + num_components = instr->num_components - 2; + imm_offset++; + } + } + break; + } + + case nir_intrinsic_load_output: + case nir_intrinsic_load_per_vertex_output: { + fs_reg indirect_offset = get_indirect_offset(instr); + unsigned imm_offset = instr->const_index[0]; + unsigned first_component = nir_intrinsic_component(instr); + + fs_inst *inst; + if (indirect_offset.file == BAD_FILE) { + /* Replicate the patch handle to all enabled channels */ + fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld.MOV(patch_handle, + retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)); + + { + if (first_component != 0) { + unsigned read_components = + instr->num_components + first_component; + fs_reg tmp = bld.vgrf(dst.type, read_components); + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, + patch_handle); + inst->size_written = read_components * REG_SIZE; + for (unsigned i = 0; i < instr->num_components; i++) { + bld.MOV(offset(dst, bld, i), + offset(tmp, bld, i + first_component)); + } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, + patch_handle); + inst->size_written = instr->num_components * REG_SIZE; + } + inst->offset = imm_offset; + inst->mlen = 1; + } + } else { + /* Indirect indexing - use per-slot offsets as well. 
*/ + const fs_reg srcs[] = { + retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), + indirect_offset + }; + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); + bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); + if (first_component != 0) { + unsigned read_components = + instr->num_components + first_component; + fs_reg tmp = bld.vgrf(dst.type, read_components); + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, + payload); + inst->size_written = read_components * REG_SIZE; + for (unsigned i = 0; i < instr->num_components; i++) { + bld.MOV(offset(dst, bld, i), + offset(tmp, bld, i + first_component)); + } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, + payload); + inst->size_written = instr->num_components * REG_SIZE; + } + inst->offset = imm_offset; + inst->mlen = 2; + } + break; + } + + case nir_intrinsic_store_output: + case nir_intrinsic_store_per_vertex_output: { + fs_reg value = get_nir_src(instr->src[0]); + bool is_64bit = (instr->src[0].is_ssa ? 
+ instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64; + fs_reg indirect_offset = get_indirect_offset(instr); + unsigned imm_offset = instr->const_index[0]; + unsigned swiz = BRW_SWIZZLE_XYZW; + unsigned mask = instr->const_index[1]; + unsigned header_regs = 0; + fs_reg srcs[7]; + srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD); + + if (indirect_offset.file != BAD_FILE) { + srcs[header_regs++] = indirect_offset; + } + + if (mask == 0) + break; + + unsigned num_components = util_last_bit(mask); + enum opcode opcode; + + /* We can only pack two 64-bit components in a single message, so send + * 2 messages if we have more components + */ + unsigned num_iterations = 1; + unsigned iter_components = num_components; + unsigned first_component = nir_intrinsic_component(instr); + if (is_64bit) { + first_component = first_component / 2; + if (instr->num_components > 2) { + num_iterations = 2; + iter_components = 2; + } + } + + /* 64-bit data needs to be shuffled before we can write it to the URB. + * We will use this temporary to shuffle the components in each + * iteration. + */ + fs_reg tmp = + fs_reg(VGRF, alloc.allocate(2 * iter_components), value.type); + + mask = mask << first_component; + + for (unsigned iter = 0; iter < num_iterations; iter++) { + if (!is_64bit && mask != WRITEMASK_XYZW) { + srcs[header_regs++] = brw_imm_ud(mask << 16); + opcode = indirect_offset.file != BAD_FILE ? + SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT : + SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; + } else if (is_64bit && ((mask & WRITEMASK_XY) != WRITEMASK_XY)) { + /* Expand the 64-bit mask to 32-bit channels. We only handle + * two channels in each iteration, so we only care about X/Y. + */ + unsigned mask32 = 0; + if (mask & WRITEMASK_X) + mask32 |= WRITEMASK_XY; + if (mask & WRITEMASK_Y) + mask32 |= WRITEMASK_ZW; + + /* If the mask does not include any of the channels X or Y there + * is nothing to do in this iteration.
Move on to the next couple + * of 64-bit channels. + */ + if (!mask32) { + mask >>= 2; + imm_offset++; + continue; + } + + srcs[header_regs++] = brw_imm_ud(mask32 << 16); + opcode = indirect_offset.file != BAD_FILE ? + SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT : + SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; + } else { + opcode = indirect_offset.file != BAD_FILE ? + SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT : + SHADER_OPCODE_URB_WRITE_SIMD8; + } + + for (unsigned i = 0; i < iter_components; i++) { + if (!(mask & (1 << (i + first_component)))) + continue; + + if (!is_64bit) { + srcs[header_regs + i + first_component] = + offset(value, bld, BRW_GET_SWZ(swiz, i)); + } else { + /* We need to shuffle the 64-bit data to match the layout + * expected by our 32-bit URB write messages. We use a temporary + * for that. + */ + unsigned channel = BRW_GET_SWZ(swiz, iter * 2 + i); + shuffle_64bit_data_for_32bit_write(bld, + retype(offset(tmp, bld, 2 * i), BRW_REGISTER_TYPE_F), + retype(offset(value, bld, 2 * channel), BRW_REGISTER_TYPE_DF), + 1); + + /* Now copy the data to the destination */ + fs_reg dest = fs_reg(VGRF, alloc.allocate(2), value.type); + unsigned idx = 2 * i; + bld.MOV(dest, offset(tmp, bld, idx)); + bld.MOV(offset(dest, bld, 1), offset(tmp, bld, idx + 1)); + srcs[header_regs + idx + first_component * 2] = dest; + srcs[header_regs + idx + 1 + first_component * 2] = + offset(dest, bld, 1); + } + } + + unsigned mlen = + header_regs + (is_64bit ? 2 * iter_components : iter_components) + + (is_64bit ? 2 * first_component : first_component); + fs_reg payload = + bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); + bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs); + + fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload); + inst->offset = imm_offset; + inst->mlen = mlen; + + /* If this is a 64-bit attribute, select the next two 64-bit channels + * to be handled in the next iteration. 
+ */ + if (is_64bit) { + mask >>= 2; + imm_offset++; + } + } + break; + } + + default: + nir_emit_intrinsic(bld, instr); + break; + } +} + +void +fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + assert(stage == MESA_SHADER_TESS_EVAL); + struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data); + + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_dest(instr->dest); + + switch (instr->intrinsic) { + case nir_intrinsic_load_primitive_id: + bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1))); + break; + case nir_intrinsic_load_tess_coord: + /* gl_TessCoord is part of the payload in g1-3 */ + for (unsigned i = 0; i < 3; i++) { + bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0))); + } + break; + + case nir_intrinsic_load_input: + case nir_intrinsic_load_per_vertex_input: { + fs_reg indirect_offset = get_indirect_offset(instr); + unsigned imm_offset = instr->const_index[0]; + unsigned first_component = nir_intrinsic_component(instr); + + if (type_sz(dest.type) == 8) { + first_component = first_component / 2; + } + + fs_inst *inst; + if (indirect_offset.file == BAD_FILE) { + /* Arbitrarily only push up to 32 vec4 slots worth of data, + * which is 16 registers (since each holds 2 vec4 slots). 
+ */ + const unsigned max_push_slots = 32; + if (imm_offset < max_push_slots) { + fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type); + for (int i = 0; i < instr->num_components; i++) { + unsigned comp = 16 / type_sz(dest.type) * (imm_offset % 2) + + i + first_component; + bld.MOV(offset(dest, bld, i), component(src, comp)); + } + tes_prog_data->base.urb_read_length = + MAX2(tes_prog_data->base.urb_read_length, + DIV_ROUND_UP(imm_offset + 1, 2)); + } else { + /* Replicate the patch handle to all enabled channels */ + const fs_reg srcs[] = { + retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD) + }; + fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0); + + if (first_component != 0) { + unsigned read_components = + instr->num_components + first_component; + fs_reg tmp = bld.vgrf(dest.type, read_components); + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, + patch_handle); + inst->size_written = read_components * REG_SIZE; + for (unsigned i = 0; i < instr->num_components; i++) { + bld.MOV(offset(dest, bld, i), + offset(tmp, bld, i + first_component)); + } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest, + patch_handle); + inst->size_written = instr->num_components * REG_SIZE; + } + inst->mlen = 1; + inst->offset = imm_offset; + } + } else { + /* Indirect indexing - use per-slot offsets as well. */ + + /* We can only read two double components with each URB read, so + * we send two read messages in that case, each one loading up to + * two double components. 
+ */ + unsigned num_iterations = 1; + unsigned num_components = instr->num_components; + fs_reg orig_dest = dest; + if (type_sz(dest.type) == 8) { + if (instr->num_components > 2) { + num_iterations = 2; + num_components = 2; + } + fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dest.type); + dest = tmp; + } + + for (unsigned iter = 0; iter < num_iterations; iter++) { + const fs_reg srcs[] = { + retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), + indirect_offset + }; + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); + bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); + + if (first_component != 0) { + unsigned read_components = + num_components + first_component; + fs_reg tmp = bld.vgrf(dest.type, read_components); + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, + payload); + for (unsigned i = 0; i < num_components; i++) { + bld.MOV(offset(dest, bld, i), + offset(tmp, bld, i + first_component)); + } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest, + payload); + } + inst->mlen = 2; + inst->offset = imm_offset; + inst->size_written = (num_components + first_component) * + inst->dst.component_size(inst->exec_size); + + /* If we are reading 64-bit data using 32-bit read messages we need + * build proper 64-bit data elements by shuffling the low and high + * 32-bit components around like we do for other things like UBOs + * or SSBOs. 
+ */ + if (type_sz(dest.type) == 8) { + shuffle_32bit_load_result_to_64bit_data( + bld, dest, retype(dest, BRW_REGISTER_TYPE_F), num_components); + + for (unsigned c = 0; c < num_components; c++) { + bld.MOV(offset(orig_dest, bld, iter * 2 + c), + offset(dest, bld, c)); + } + } + + /* If we are loading double data and we need a second read message + * adjust the offset + */ + if (num_iterations > 1) { + num_components = instr->num_components - 2; + imm_offset++; + } + } + } + break; + } + default: + nir_emit_intrinsic(bld, instr); + break; + } +} + +void +fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + assert(stage == MESA_SHADER_GEOMETRY); + fs_reg indirect_offset; + + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_dest(instr->dest); + + switch (instr->intrinsic) { + case nir_intrinsic_load_primitive_id: + assert(stage == MESA_SHADER_GEOMETRY); + assert(brw_gs_prog_data(prog_data)->include_primitive_id); + bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), + retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD)); + break; + + case nir_intrinsic_load_input: + unreachable("load_input intrinsics are invalid for the GS stage"); + + case nir_intrinsic_load_per_vertex_input: + emit_gs_input_load(dest, instr->src[0], instr->const_index[0], + instr->src[1], instr->num_components, + nir_intrinsic_component(instr)); + break; + + case nir_intrinsic_emit_vertex_with_counter: + emit_gs_vertex(instr->src[0], instr->const_index[0]); + break; + + case nir_intrinsic_end_primitive_with_counter: + emit_gs_end_primitive(instr->src[0]); + break; + + case nir_intrinsic_set_vertex_count: + bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0])); + break; + + case nir_intrinsic_load_invocation_id: { + fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID]; + assert(val.file != BAD_FILE); + dest.type = val.type; + bld.MOV(dest, val); + break; + } + + default: + nir_emit_intrinsic(bld, instr); + 
break; + } +} + +/** + * Fetch the current render target layer index. + */ +static fs_reg +fetch_render_target_array_index(const fs_builder &bld) +{ + if (bld.shader->devinfo->gen >= 6) { + /* The render target array index is provided in the thread payload as + * bits 26:16 of r0.0. + */ + const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1), + brw_imm_uw(0x7ff)); + return idx; + } else { + /* Pre-SNB we only ever render into the first layer of the framebuffer + * since layered rendering is not implemented. + */ + return brw_imm_ud(0); + } +} + +/** + * Fake non-coherent framebuffer read implemented using TXF to fetch from the + * framebuffer at the current fragment coordinates and sample index. + */ +fs_inst * +fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, + unsigned target) +{ + const struct gen_device_info *devinfo = bld.shader->devinfo; + + assert(bld.shader->stage == MESA_SHADER_FRAGMENT); + const brw_wm_prog_key *wm_key = + reinterpret_cast<const brw_wm_prog_key *>(key); + assert(!wm_key->coherent_fb_fetch); + const struct brw_wm_prog_data *wm_prog_data = + brw_wm_prog_data(stage_prog_data); + + /* Calculate the surface index relative to the start of the texture binding + * table block, since that's what the texturing messages expect. + */ + const unsigned surface = target + + wm_prog_data->binding_table.render_target_read_start - + wm_prog_data->base.binding_table.texture_start; + + brw_mark_surface_used( + bld.shader->stage_prog_data, + wm_prog_data->binding_table.render_target_read_start + target); + + /* Calculate the fragment coordinates. */ + const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3); + bld.MOV(offset(coords, bld, 0), pixel_x); + bld.MOV(offset(coords, bld, 1), pixel_y); + bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld)); + + /* Calculate the sample index and MCS payload when multisampling. 
Luckily + * the MCS fetch message behaves deterministically for UMS surfaces, so it + * shouldn't be necessary to recompile based on whether the framebuffer is + * CMS or UMS. + */ + if (wm_key->multisample_fbo && + nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE) + nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup(); + + const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID]; + const fs_reg mcs = wm_key->multisample_fbo ? + emit_mcs_fetch(coords, 3, brw_imm_ud(surface)) : fs_reg(); + + /* Use either a normal or a CMS texel fetch message depending on whether + * the framebuffer is single or multisample. On SKL+ use the wide CMS + * message just in case the framebuffer uses 16x multisampling, it should + * be equivalent to the normal CMS fetch for lower multisampling modes. + */ + const opcode op = !wm_key->multisample_fbo ? SHADER_OPCODE_TXF_LOGICAL : + devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL : + SHADER_OPCODE_TXF_CMS_LOGICAL; + + /* Emit the instruction. */ + const fs_reg srcs[] = { coords, fs_reg(), brw_imm_ud(0), fs_reg(), + sample, mcs, + brw_imm_ud(surface), brw_imm_ud(0), + fs_reg(), brw_imm_ud(3), brw_imm_ud(0) }; + STATIC_ASSERT(ARRAY_SIZE(srcs) == TEX_LOGICAL_NUM_SRCS); + + fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs)); + inst->size_written = 4 * inst->dst.component_size(inst->exec_size); + + return inst; +} + +/** + * Actual coherent framebuffer read implemented using the native render target + * read message. Requires SKL+. 
+ */ +static fs_inst * +emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target) +{ + assert(bld.shader->devinfo->gen >= 9); + fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst); + inst->target = target; + inst->size_written = 4 * inst->dst.component_size(inst->exec_size); + + return inst; +} + +static fs_reg +alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n) +{ + if (n && regs[0].file != BAD_FILE) { + return regs[0]; + + } else { + const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size); + + for (unsigned i = 0; i < n; i++) + regs[i] = tmp; + + return tmp; + } +} + +static fs_reg +alloc_frag_output(fs_visitor *v, unsigned location) +{ + assert(v->stage == MESA_SHADER_FRAGMENT); + const brw_wm_prog_key *const key = + reinterpret_cast<const brw_wm_prog_key *>(v->key); + const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION); + const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX); + + if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1)) + return alloc_temporary(v->bld, 4, &v->dual_src_output, 1); + + else if (l == FRAG_RESULT_COLOR) + return alloc_temporary(v->bld, 4, v->outputs, + MAX2(key->nr_color_regions, 1)); + + else if (l == FRAG_RESULT_DEPTH) + return alloc_temporary(v->bld, 1, &v->frag_depth, 1); + + else if (l == FRAG_RESULT_STENCIL) + return alloc_temporary(v->bld, 1, &v->frag_stencil, 1); + + else if (l == FRAG_RESULT_SAMPLE_MASK) + return alloc_temporary(v->bld, 1, &v->sample_mask, 1); + + else if (l >= FRAG_RESULT_DATA0 && + l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS) + return alloc_temporary(v->bld, 4, + &v->outputs[l - FRAG_RESULT_DATA0], 1); + + else + unreachable("Invalid location"); +} + +void +fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + assert(stage == MESA_SHADER_FRAGMENT); + + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_dest(instr->dest); + + switch 
(instr->intrinsic) { + case nir_intrinsic_load_front_face: + bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), + *emit_frontfacing_interpolation()); + break; + + case nir_intrinsic_load_sample_pos: { + fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS]; + assert(sample_pos.file != BAD_FILE); + dest.type = sample_pos.type; + bld.MOV(dest, sample_pos); + bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1)); + break; + } + + case nir_intrinsic_load_layer_id: + dest.type = BRW_REGISTER_TYPE_UD; + bld.MOV(dest, fetch_render_target_array_index(bld)); + break; + + case nir_intrinsic_load_helper_invocation: + case nir_intrinsic_load_sample_mask_in: + case nir_intrinsic_load_sample_id: { + gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); + fs_reg val = nir_system_values[sv]; + assert(val.file != BAD_FILE); + dest.type = val.type; + bld.MOV(dest, val); + break; + } + + case nir_intrinsic_store_output: { + const fs_reg src = get_nir_src(instr->src[0]); + const nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); + assert(const_offset && "Indirect output stores not allowed"); + const unsigned location = nir_intrinsic_base(instr) + + SET_FIELD(const_offset->u32[0], BRW_NIR_FRAG_OUTPUT_LOCATION); + const fs_reg new_dest = retype(alloc_frag_output(this, location), + src.type); + + for (unsigned j = 0; j < instr->num_components; j++) + bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j), + offset(src, bld, j)); + + break; + } + + case nir_intrinsic_load_output: { + const unsigned l = GET_FIELD(nir_intrinsic_base(instr), + BRW_NIR_FRAG_OUTPUT_LOCATION); + assert(l >= FRAG_RESULT_DATA0); + nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); + assert(const_offset && "Indirect output loads not allowed"); + const unsigned target = l - FRAG_RESULT_DATA0 + const_offset->u32[0]; + const fs_reg tmp = bld.vgrf(dest.type, 4); + + if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch) + 
emit_coherent_fb_read(bld, tmp, target); + else + emit_non_coherent_fb_read(bld, tmp, target); + + for (unsigned j = 0; j < instr->num_components; j++) { + bld.MOV(offset(dest, bld, j), + offset(tmp, bld, nir_intrinsic_component(instr) + j)); + } + + break; + } + + case nir_intrinsic_discard: + case nir_intrinsic_discard_if: { + /* We track our discarded pixels in f0.1. By predicating on it, we can + * update just the flag bits that aren't yet discarded. If there's no + * condition, we emit a CMP of g0 != g0, so all currently executing + * channels will get turned off. + */ + fs_inst *cmp; + if (instr->intrinsic == nir_intrinsic_discard_if) { + cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]), + brw_imm_d(0), BRW_CONDITIONAL_Z); + } else { + fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0), + BRW_REGISTER_TYPE_UW)); + cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ); + } + cmp->predicate = BRW_PREDICATE_NORMAL; + cmp->flag_subreg = 1; + + if (devinfo->gen >= 6) { + emit_discard_jump(); + } + break; + } + + case nir_intrinsic_load_input: { + /* load_input is only used for flat inputs */ + unsigned base = nir_intrinsic_base(instr); + unsigned component = nir_intrinsic_component(instr); + unsigned num_components = instr->num_components; + enum brw_reg_type type = dest.type; + + /* Special case fields in the VUE header */ + if (base == VARYING_SLOT_LAYER) + component = 1; + else if (base == VARYING_SLOT_VIEWPORT) + component = 2; + + if (nir_dest_bit_size(instr->dest) == 64) { + /* const_index is in 32-bit type size units that could not be aligned + * with DF. We need to read the double vector as if it was a float + * vector of twice the number of components to fetch the right data. 
+ */ + type = BRW_REGISTER_TYPE_F; + num_components *= 2; + } + + for (unsigned int i = 0; i < num_components; i++) { + struct brw_reg interp = interp_reg(base, component + i); + interp = suboffset(interp, 3); + bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld, i), + retype(fs_reg(interp), type)); + } + + if (nir_dest_bit_size(instr->dest) == 64) { + shuffle_32bit_load_result_to_64bit_data(bld, + dest, + retype(dest, type), + instr->num_components); + } + break; + } + + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_sample: + /* Do nothing - load_interpolated_input handling will handle it later. */ + break; + + case nir_intrinsic_load_barycentric_at_sample: { + const glsl_interp_mode interpolation = + (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); + + nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]); + + if (const_sample) { + unsigned msg_data = const_sample->i32[0] << 4; + + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SAMPLE, + dest, + fs_reg(), /* src */ + brw_imm_ud(msg_data), + interpolation); + } else { + const fs_reg sample_src = retype(get_nir_src(instr->src[0]), + BRW_REGISTER_TYPE_UD); + + if (nir_src_is_dynamically_uniform(instr->src[0])) { + const fs_reg sample_id = bld.emit_uniformize(sample_src); + const fs_reg msg_data = vgrf(glsl_type::uint_type); + bld.exec_all().group(1, 0) + .SHL(msg_data, sample_id, brw_imm_ud(4u)); + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SAMPLE, + dest, + fs_reg(), /* src */ + msg_data, + interpolation); + } else { + /* Make a loop that sends a message to the pixel interpolater + * for the sample number in each live channel. If there are + * multiple channels with the same sample number then these + * will be handled simultaneously with a single interation of + * the loop. 
+ */ + bld.emit(BRW_OPCODE_DO); + + /* Get the next live sample number into sample_id_reg */ + const fs_reg sample_id = bld.emit_uniformize(sample_src); + + /* Set the flag register so that we can perform the send + * message on all channels that have the same sample number + */ + bld.CMP(bld.null_reg_ud(), + sample_src, sample_id, + BRW_CONDITIONAL_EQ); + const fs_reg msg_data = vgrf(glsl_type::uint_type); + bld.exec_all().group(1, 0) + .SHL(msg_data, sample_id, brw_imm_ud(4u)); + fs_inst *inst = + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SAMPLE, + dest, + fs_reg(), /* src */ + msg_data, + interpolation); + set_predicate(BRW_PREDICATE_NORMAL, inst); + + /* Continue the loop if there are any live channels left */ + set_predicate_inv(BRW_PREDICATE_NORMAL, + true, /* inverse */ + bld.emit(BRW_OPCODE_WHILE)); + } + } + break; + } + + case nir_intrinsic_load_barycentric_at_offset: { + const glsl_interp_mode interpolation = + (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); + + nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); + + if (const_offset) { + unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf; + unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf; + + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, + dest, + fs_reg(), /* src */ + brw_imm_ud(off_x | (off_y << 4)), + interpolation); + } else { + fs_reg src = vgrf(glsl_type::ivec2_type); + fs_reg offset_src = retype(get_nir_src(instr->src[0]), + BRW_REGISTER_TYPE_F); + for (int i = 0; i < 2; i++) { + fs_reg temp = vgrf(glsl_type::float_type); + bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f)); + fs_reg itemp = vgrf(glsl_type::int_type); + /* float to int */ + bld.MOV(itemp, temp); + + /* Clamp the upper end of the range to +7/16. 
+ * ARB_gpu_shader5 requires that we support a maximum offset + * of +0.5, which isn't representable in a S0.4 value -- if + * we didn't clamp it, we'd end up with -8/16, which is the + * opposite of what the shader author wanted. + * + * This is legal due to ARB_gpu_shader5's quantization + * rules: + * + * "Not all values of <offset> may be supported; x and y + * offsets may be rounded to fixed-point values with the + * number of fraction bits given by the + * implementation-dependent constant + * FRAGMENT_INTERPOLATION_OFFSET_BITS" + */ + set_condmod(BRW_CONDITIONAL_L, + bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7))); + } + + const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET; + emit_pixel_interpolater_send(bld, + opcode, + dest, + src, + brw_imm_ud(0u), + interpolation); + } + break; + } + + case nir_intrinsic_load_interpolated_input: { + if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) { + emit_fragcoord_interpolation(dest); + break; + } + + assert(instr->src[0].ssa && + instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic); + nir_intrinsic_instr *bary_intrinsic = + nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr); + nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic; + enum glsl_interp_mode interp_mode = + (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic); + fs_reg dst_xy; + + if (bary_intrin == nir_intrinsic_load_barycentric_at_offset || + bary_intrin == nir_intrinsic_load_barycentric_at_sample) { + /* Use the result of the PI message */ + dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F); + } else { + /* Use the delta_xy values computed from the payload */ + enum brw_barycentric_mode bary = + brw_barycentric_mode(interp_mode, bary_intrin); + + dst_xy = this->delta_xy[bary]; + } + + for (unsigned int i = 0; i < instr->num_components; i++) { + fs_reg interp = + fs_reg(interp_reg(nir_intrinsic_base(instr), + nir_intrinsic_component(instr) + i)); + interp.type = BRW_REGISTER_TYPE_F; 
+ dest.type = BRW_REGISTER_TYPE_F; + + if (devinfo->gen < 6 && interp_mode == INTERP_MODE_SMOOTH) { + fs_reg tmp = vgrf(glsl_type::float_type); + bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp); + bld.MUL(offset(dest, bld, i), tmp, this->pixel_w); + } else { + bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp); + } + } + break; + } + + default: + nir_emit_intrinsic(bld, instr); + break; + } +} + +void +fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + assert(stage == MESA_SHADER_COMPUTE); + struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data); + + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_dest(instr->dest); + + switch (instr->intrinsic) { + case nir_intrinsic_barrier: + emit_barrier(); + cs_prog_data->uses_barrier = true; + break; + + case nir_intrinsic_load_local_invocation_id: + case nir_intrinsic_load_work_group_id: { + gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); + fs_reg val = nir_system_values[sv]; + assert(val.file != BAD_FILE); + dest.type = val.type; + for (unsigned i = 0; i < 3; i++) + bld.MOV(offset(dest, bld, i), offset(val, bld, i)); + break; + } + + case nir_intrinsic_load_num_work_groups: { + const unsigned surface = + cs_prog_data->binding_table.work_groups_start; + + cs_prog_data->uses_num_work_groups = true; + + fs_reg surf_index = brw_imm_ud(surface); + brw_mark_surface_used(prog_data, surface); + + /* Read the 3 GLuint components of gl_NumWorkGroups */ + for (unsigned i = 0; i < 3; i++) { + fs_reg read_result = + emit_untyped_read(bld, surf_index, + brw_imm_ud(i << 2), + 1 /* dims */, 1 /* size */, + BRW_PREDICATE_NONE); + read_result.type = dest.type; + bld.MOV(dest, read_result); + dest = offset(dest, bld, 1); + } + break; + } + + case nir_intrinsic_shared_atomic_add: + nir_emit_shared_atomic(bld, BRW_AOP_ADD, instr); + break; + case nir_intrinsic_shared_atomic_imin: + nir_emit_shared_atomic(bld, 
BRW_AOP_IMIN, instr); + break; + case nir_intrinsic_shared_atomic_umin: + nir_emit_shared_atomic(bld, BRW_AOP_UMIN, instr); + break; + case nir_intrinsic_shared_atomic_imax: + nir_emit_shared_atomic(bld, BRW_AOP_IMAX, instr); + break; + case nir_intrinsic_shared_atomic_umax: + nir_emit_shared_atomic(bld, BRW_AOP_UMAX, instr); + break; + case nir_intrinsic_shared_atomic_and: + nir_emit_shared_atomic(bld, BRW_AOP_AND, instr); + break; + case nir_intrinsic_shared_atomic_or: + nir_emit_shared_atomic(bld, BRW_AOP_OR, instr); + break; + case nir_intrinsic_shared_atomic_xor: + nir_emit_shared_atomic(bld, BRW_AOP_XOR, instr); + break; + case nir_intrinsic_shared_atomic_exchange: + nir_emit_shared_atomic(bld, BRW_AOP_MOV, instr); + break; + case nir_intrinsic_shared_atomic_comp_swap: + nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr); + break; + + case nir_intrinsic_load_shared: { + assert(devinfo->gen >= 7); + + fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM); + + /* Get the offset to read from */ + fs_reg offset_reg; + nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); + if (const_offset) { + offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]); + } else { + offset_reg = vgrf(glsl_type::uint_type); + bld.ADD(offset_reg, + retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD), + brw_imm_ud(instr->const_index[0])); + } + + /* Read the vector */ + do_untyped_vector_read(bld, dest, surf_index, offset_reg, + instr->num_components); + break; + } + + case nir_intrinsic_store_shared: { + assert(devinfo->gen >= 7); + + /* Block index */ + fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM); + + /* Value */ + fs_reg val_reg = get_nir_src(instr->src[0]); + + /* Writemask */ + unsigned writemask = instr->const_index[1]; + + /* get_nir_src() retypes to integer. Be wary of 64-bit types though + * since the untyped writes below operate in units of 32-bits, which + * means that we need to write twice as many components each time. 
+ * Also, we have to suffle 64-bit data to be in the appropriate layout + * expected by our 32-bit write messages. + */ + unsigned type_size = 4; + unsigned bit_size = instr->src[0].is_ssa ? + instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size; + if (bit_size == 64) { + type_size = 8; + fs_reg tmp = + fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type); + shuffle_64bit_data_for_32bit_write( + bld, + retype(tmp, BRW_REGISTER_TYPE_F), + retype(val_reg, BRW_REGISTER_TYPE_DF), + instr->num_components); + val_reg = tmp; + } + + unsigned type_slots = type_size / 4; + + /* Combine groups of consecutive enabled channels in one write + * message. We use ffs to find the first enabled channel and then ffs on + * the bit-inverse, down-shifted writemask to determine the length of + * the block of enabled bits. + */ + while (writemask) { + unsigned first_component = ffs(writemask) - 1; + unsigned length = ffs(~(writemask >> first_component)) - 1; + + /* We can't write more than 2 64-bit components at once. Limit the + * length of the write to what we can do and let the next iteration + * handle the rest + */ + if (type_size > 4) + length = MIN2(2, length); + + fs_reg offset_reg; + nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); + if (const_offset) { + offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0] + + type_size * first_component); + } else { + offset_reg = vgrf(glsl_type::uint_type); + bld.ADD(offset_reg, + retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD), + brw_imm_ud(instr->const_index[0] + type_size * first_component)); + } + + emit_untyped_write(bld, surf_index, offset_reg, + offset(val_reg, bld, first_component * type_slots), + 1 /* dims */, length * type_slots, + BRW_PREDICATE_NONE); + + /* Clear the bits in the writemask that we just wrote, then try + * again to see if more channels are left. 
+ */ + writemask &= (15 << (first_component + length)); + } + + break; + } + + default: + nir_emit_intrinsic(bld, instr); + break; + } +} + +void +fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) +{ + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_dest(instr->dest); + + switch (instr->intrinsic) { + case nir_intrinsic_atomic_counter_inc: + case nir_intrinsic_atomic_counter_dec: + case nir_intrinsic_atomic_counter_read: + case nir_intrinsic_atomic_counter_add: + case nir_intrinsic_atomic_counter_min: + case nir_intrinsic_atomic_counter_max: + case nir_intrinsic_atomic_counter_and: + case nir_intrinsic_atomic_counter_or: + case nir_intrinsic_atomic_counter_xor: + case nir_intrinsic_atomic_counter_exchange: + case nir_intrinsic_atomic_counter_comp_swap: { + if (stage == MESA_SHADER_FRAGMENT && + instr->intrinsic != nir_intrinsic_atomic_counter_read) + brw_wm_prog_data(prog_data)->has_side_effects = true; + + /* Get some metadata from the image intrinsic. */ + const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; + + /* Get the arguments of the atomic intrinsic. */ + const fs_reg offset = get_nir_src(instr->src[0]); + const unsigned surface = (stage_prog_data->binding_table.abo_start + + instr->const_index[0]); + const fs_reg src0 = (info->num_srcs >= 2 + ? get_nir_src(instr->src[1]) : fs_reg()); + const fs_reg src1 = (info->num_srcs >= 3 + ? get_nir_src(instr->src[2]) : fs_reg()); + fs_reg tmp; + + assert(info->num_srcs <= 3); + + /* Emit a surface read or atomic op. */ + if (instr->intrinsic == nir_intrinsic_atomic_counter_read) { + tmp = emit_untyped_read(bld, brw_imm_ud(surface), offset, 1, 1); + } else { + tmp = emit_untyped_atomic(bld, brw_imm_ud(surface), offset, src0, + src1, 1, 1, + get_atomic_counter_op(instr->intrinsic)); + } + + /* Assign the result. */ + bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), tmp); + + /* Mark the surface as used. 
*/ + brw_mark_surface_used(stage_prog_data, surface); + break; + } + + case nir_intrinsic_image_load: + case nir_intrinsic_image_store: + case nir_intrinsic_image_atomic_add: + case nir_intrinsic_image_atomic_min: + case nir_intrinsic_image_atomic_max: + case nir_intrinsic_image_atomic_and: + case nir_intrinsic_image_atomic_or: + case nir_intrinsic_image_atomic_xor: + case nir_intrinsic_image_atomic_exchange: + case nir_intrinsic_image_atomic_comp_swap: { + using namespace image_access; + + if (stage == MESA_SHADER_FRAGMENT && + instr->intrinsic != nir_intrinsic_image_load) + brw_wm_prog_data(prog_data)->has_side_effects = true; + + /* Get the referenced image variable and type. */ + const nir_variable *var = instr->variables[0]->var; + const glsl_type *type = var->type->without_array(); + const brw_reg_type base_type = get_image_base_type(type); + + /* Get some metadata from the image intrinsic. */ + const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; + const unsigned arr_dims = type->sampler_array ? 1 : 0; + const unsigned surf_dims = type->coordinate_components() - arr_dims; + const unsigned format = var->data.image.format; + + /* Get the arguments of the image intrinsic. */ + const fs_reg image = get_nir_image_deref(instr->variables[0]); + const fs_reg addr = retype(get_nir_src(instr->src[0]), + BRW_REGISTER_TYPE_UD); + const fs_reg src0 = (info->num_srcs >= 3 ? + retype(get_nir_src(instr->src[2]), base_type) : + fs_reg()); + const fs_reg src1 = (info->num_srcs >= 4 ? + retype(get_nir_src(instr->src[3]), base_type) : + fs_reg()); + fs_reg tmp; + + /* Emit an image load, store or atomic op. */ + if (instr->intrinsic == nir_intrinsic_image_load) + tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format); + + else if (instr->intrinsic == nir_intrinsic_image_store) + emit_image_store(bld, image, addr, src0, surf_dims, arr_dims, + var->data.image.write_only ? 
GL_NONE : format); + + else + tmp = emit_image_atomic(bld, image, addr, src0, src1, + surf_dims, arr_dims, info->dest_components, + get_image_atomic_op(instr->intrinsic, type)); + + /* Assign the result. */ + for (unsigned c = 0; c < info->dest_components; ++c) + bld.MOV(offset(retype(dest, base_type), bld, c), + offset(tmp, bld, c)); + break; + } + + case nir_intrinsic_memory_barrier_atomic_counter: + case nir_intrinsic_memory_barrier_buffer: + case nir_intrinsic_memory_barrier_image: + case nir_intrinsic_memory_barrier: { + const fs_builder ubld = bld.group(8, 0); + const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); + ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp) + ->size_written = 2 * REG_SIZE; + break; + } + + case nir_intrinsic_group_memory_barrier: + case nir_intrinsic_memory_barrier_shared: + /* We treat these workgroup-level barriers as no-ops. This should be + * safe at present and as long as: + * + * - Memory access instructions are not subsequently reordered by the + * compiler back-end. + * + * - All threads from a given compute shader workgroup fit within a + * single subslice and therefore talk to the same HDC shared unit + * what supposedly guarantees ordering and coherency between threads + * from the same workgroup. This may change in the future when we + * start splitting workgroups across multiple subslices. + * + * - The context is not in fault-and-stream mode, which could cause + * memory transactions (including to SLM) prior to the barrier to be + * replayed after the barrier if a pagefault occurs. This shouldn't + * be a problem up to and including SKL because fault-and-stream is + * not usable due to hardware issues, but that's likely to change in + * the future. 
+ */ + break; + + case nir_intrinsic_shader_clock: { + /* We cannot do anything if there is an event, so ignore it for now */ + const fs_reg shader_clock = get_timestamp(bld); + const fs_reg srcs[] = { component(shader_clock, 0), + component(shader_clock, 1) }; + bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0); + break; + } + + case nir_intrinsic_image_size: { + /* Get the referenced image variable and type. */ + const nir_variable *var = instr->variables[0]->var; + const glsl_type *type = var->type->without_array(); + + /* Get the size of the image. */ + const fs_reg image = get_nir_image_deref(instr->variables[0]); + const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET); + + /* For 1DArray image types, the array index is stored in the Z component. + * Fix this by swizzling the Z component to the Y component. + */ + const bool is_1d_array_image = + type->sampler_dimensionality == GLSL_SAMPLER_DIM_1D && + type->sampler_array; + + /* For CubeArray images, we should count the number of cubes instead + * of the number of faces. Fix it by dividing the (Z component) by 6. + */ + const bool is_cube_array_image = + type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE && + type->sampler_array; + + /* Copy all the components. 
*/ + const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; + for (unsigned c = 0; c < info->dest_components; ++c) { + if ((int)c >= type->coordinate_components()) { + bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c), + brw_imm_d(1)); + } else if (c == 1 && is_1d_array_image) { + bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c), + offset(size, bld, 2)); + } else if (c == 2 && is_cube_array_image) { + bld.emit(SHADER_OPCODE_INT_QUOTIENT, + offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c), + offset(size, bld, c), brw_imm_d(6)); + } else { + bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c), + offset(size, bld, c)); + } + } + + break; + } + + case nir_intrinsic_image_samples: + /* The driver does not support multi-sampled images. */ + bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1)); + break; + + case nir_intrinsic_load_uniform: { + /* Offsets are in bytes but they should always be multiples of 4 */ + assert(instr->const_index[0] % 4 == 0); + + fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type); + + nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); + if (const_offset) { + /* Offsets are in bytes but they should always be multiples of 4 */ + assert(const_offset->u32[0] % 4 == 0); + src.offset = const_offset->u32[0]; + + for (unsigned j = 0; j < instr->num_components; j++) { + bld.MOV(offset(dest, bld, j), offset(src, bld, j)); + } + } else { + fs_reg indirect = retype(get_nir_src(instr->src[0]), + BRW_REGISTER_TYPE_UD); + + /* We need to pass a size to the MOV_INDIRECT but we don't want it to + * go past the end of the uniform. In order to keep the n'th + * component from running past, we subtract off the size of all but + * one component of the vector. 
+ */ + assert(instr->const_index[1] >= + instr->num_components * (int) type_sz(dest.type)); + unsigned read_size = instr->const_index[1] - + (instr->num_components - 1) * type_sz(dest.type); + + bool supports_64bit_indirects = + !devinfo->is_cherryview && !devinfo->is_broxton; + + if (type_sz(dest.type) != 8 || supports_64bit_indirects) { + for (unsigned j = 0; j < instr->num_components; j++) { + bld.emit(SHADER_OPCODE_MOV_INDIRECT, + offset(dest, bld, j), offset(src, bld, j), + indirect, brw_imm_ud(read_size)); + } + } else { + const unsigned num_mov_indirects = + type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD); + /* We read a little bit less per MOV INDIRECT, as they are now + * 32-bits ones instead of 64-bit. Fix read_size then. + */ + const unsigned read_size_32bit = read_size - + (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD); + for (unsigned j = 0; j < instr->num_components; j++) { + for (unsigned i = 0; i < num_mov_indirects; i++) { + bld.emit(SHADER_OPCODE_MOV_INDIRECT, + subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i), + subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i), + indirect, brw_imm_ud(read_size_32bit)); + } + } + } + } + break; + } + + case nir_intrinsic_load_ubo: { + nir_const_value *const_index = nir_src_as_const_value(instr->src[0]); + fs_reg surf_index; + + if (const_index) { + const unsigned index = stage_prog_data->binding_table.ubo_start + + const_index->u32[0]; + surf_index = brw_imm_ud(index); + brw_mark_surface_used(prog_data, index); + } else { + /* The block index is not a constant. Evaluate the index expression + * per-channel and add the base UBO index; we have to select a value + * from any live channel. + */ + surf_index = vgrf(glsl_type::uint_type); + bld.ADD(surf_index, get_nir_src(instr->src[0]), + brw_imm_ud(stage_prog_data->binding_table.ubo_start)); + surf_index = bld.emit_uniformize(surf_index); + + /* Assume this may touch any UBO. 
It would be nice to provide + * a tighter bound, but the array information is already lowered away. + */ + brw_mark_surface_used(prog_data, + stage_prog_data->binding_table.ubo_start + + nir->info->num_ubos - 1); + } + + nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); + if (const_offset == NULL) { + fs_reg base_offset = retype(get_nir_src(instr->src[1]), + BRW_REGISTER_TYPE_UD); + + for (int i = 0; i < instr->num_components; i++) + VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index, + base_offset, i * type_sz(dest.type)); + } else { + /* Even if we are loading doubles, a pull constant load will load + * a 32-bit vec4, so should only reserve vgrf space for that. If we + * need to load a full dvec4 we will have to emit 2 loads. This is + * similar to demote_pull_constants(), except that in that case we + * see individual accesses to each component of the vector and then + * we let CSE deal with duplicate loads. Here we see a vector access + * and we have to split it if necessary. + */ + const unsigned type_size = type_sz(dest.type); + const unsigned block_sz = 64; /* Fetch one cacheline at a time. */ + const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0); + const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD); + + for (unsigned c = 0; c < instr->num_components;) { + const unsigned base = const_offset->u32[0] + c * type_size; + /* Number of usable components in the next block-aligned load. 
*/ + const unsigned count = MIN2(instr->num_components - c, + (block_sz - base % block_sz) / type_size); + + ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, + packed_consts, surf_index, + brw_imm_ud(base & ~(block_sz - 1))); + + const fs_reg consts = + retype(byte_offset(packed_consts, base & (block_sz - 1)), + dest.type); + + for (unsigned d = 0; d < count; d++) + bld.MOV(offset(dest, bld, c + d), component(consts, d)); + + c += count; + } + } + break; + } + + case nir_intrinsic_load_ssbo: { + assert(devinfo->gen >= 7); + + nir_const_value *const_uniform_block = + nir_src_as_const_value(instr->src[0]); + + fs_reg surf_index; + if (const_uniform_block) { + unsigned index = stage_prog_data->binding_table.ssbo_start + + const_uniform_block->u32[0]; + surf_index = brw_imm_ud(index); + brw_mark_surface_used(prog_data, index); + } else { + surf_index = vgrf(glsl_type::uint_type); + bld.ADD(surf_index, get_nir_src(instr->src[0]), + brw_imm_ud(stage_prog_data->binding_table.ssbo_start)); + + /* Assume this may touch any UBO. It would be nice to provide + * a tighter bound, but the array information is already lowered away. 
+ */ + brw_mark_surface_used(prog_data, + stage_prog_data->binding_table.ssbo_start + + nir->info->num_ssbos - 1); + } + + fs_reg offset_reg; + nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); + if (const_offset) { + offset_reg = brw_imm_ud(const_offset->u32[0]); + } else { + offset_reg = get_nir_src(instr->src[1]); + } + + /* Read the vector */ + do_untyped_vector_read(bld, dest, surf_index, offset_reg, + instr->num_components); + + break; + } + + case nir_intrinsic_store_ssbo: { + assert(devinfo->gen >= 7); + + if (stage == MESA_SHADER_FRAGMENT) + brw_wm_prog_data(prog_data)->has_side_effects = true; + + /* Block index */ + fs_reg surf_index; + nir_const_value *const_uniform_block = + nir_src_as_const_value(instr->src[1]); + if (const_uniform_block) { + unsigned index = stage_prog_data->binding_table.ssbo_start + + const_uniform_block->u32[0]; + surf_index = brw_imm_ud(index); + brw_mark_surface_used(prog_data, index); + } else { + surf_index = vgrf(glsl_type::uint_type); + bld.ADD(surf_index, get_nir_src(instr->src[1]), + brw_imm_ud(stage_prog_data->binding_table.ssbo_start)); + + brw_mark_surface_used(prog_data, + stage_prog_data->binding_table.ssbo_start + + nir->info->num_ssbos - 1); + } + + /* Value */ + fs_reg val_reg = get_nir_src(instr->src[0]); + + /* Writemask */ + unsigned writemask = instr->const_index[0]; + + /* get_nir_src() retypes to integer. Be wary of 64-bit types though + * since the untyped writes below operate in units of 32-bits, which + * means that we need to write twice as many components each time. + * Also, we have to suffle 64-bit data to be in the appropriate layout + * expected by our 32-bit write messages. + */ + unsigned type_size = 4; + unsigned bit_size = instr->src[0].is_ssa ? 
+ instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size; + if (bit_size == 64) { + type_size = 8; + fs_reg tmp = + fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type); + shuffle_64bit_data_for_32bit_write(bld, + retype(tmp, BRW_REGISTER_TYPE_F), + retype(val_reg, BRW_REGISTER_TYPE_DF), + instr->num_components); + val_reg = tmp; + } + + unsigned type_slots = type_size / 4; + + /* Combine groups of consecutive enabled channels in one write + * message. We use ffs to find the first enabled channel and then ffs on + * the bit-inverse, down-shifted writemask to determine the length of + * the block of enabled bits. + */ + while (writemask) { + unsigned first_component = ffs(writemask) - 1; + unsigned length = ffs(~(writemask >> first_component)) - 1; + + /* We can't write more than 2 64-bit components at once. Limit the + * length of the write to what we can do and let the next iteration + * handle the rest + */ + if (type_size > 4) + length = MIN2(2, length); + + fs_reg offset_reg; + nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]); + if (const_offset) { + offset_reg = brw_imm_ud(const_offset->u32[0] + + type_size * first_component); + } else { + offset_reg = vgrf(glsl_type::uint_type); + bld.ADD(offset_reg, + retype(get_nir_src(instr->src[2]), BRW_REGISTER_TYPE_UD), + brw_imm_ud(type_size * first_component)); + } + + + emit_untyped_write(bld, surf_index, offset_reg, + offset(val_reg, bld, first_component * type_slots), + 1 /* dims */, length * type_slots, + BRW_PREDICATE_NONE); + + /* Clear the bits in the writemask that we just wrote, then try + * again to see if more channels are left. 
+ */ + writemask &= (15 << (first_component + length)); + } + break; + } + + case nir_intrinsic_store_output: { + fs_reg src = get_nir_src(instr->src[0]); + + nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); + assert(const_offset && "Indirect output stores not allowed"); + fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld, + 4 * const_offset->u32[0]), src.type); + + unsigned num_components = instr->num_components; + unsigned first_component = nir_intrinsic_component(instr); + unsigned bit_size = instr->src[0].is_ssa ? + instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size; + if (bit_size == 64) { + fs_reg tmp = + fs_reg(VGRF, alloc.allocate(2 * num_components), + BRW_REGISTER_TYPE_F); + shuffle_64bit_data_for_32bit_write( + bld, tmp, retype(src, BRW_REGISTER_TYPE_DF), num_components); + src = retype(tmp, src.type); + num_components *= 2; + } + + for (unsigned j = 0; j < num_components; j++) { + bld.MOV(offset(new_dest, bld, j + first_component), + offset(src, bld, j)); + } + break; + } + + case nir_intrinsic_ssbo_atomic_add: + nir_emit_ssbo_atomic(bld, BRW_AOP_ADD, instr); + break; + case nir_intrinsic_ssbo_atomic_imin: + nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr); + break; + case nir_intrinsic_ssbo_atomic_umin: + nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr); + break; + case nir_intrinsic_ssbo_atomic_imax: + nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr); + break; + case nir_intrinsic_ssbo_atomic_umax: + nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr); + break; + case nir_intrinsic_ssbo_atomic_and: + nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr); + break; + case nir_intrinsic_ssbo_atomic_or: + nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr); + break; + case nir_intrinsic_ssbo_atomic_xor: + nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr); + break; + case nir_intrinsic_ssbo_atomic_exchange: + nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr); + break; + case nir_intrinsic_ssbo_atomic_comp_swap: + nir_emit_ssbo_atomic(bld, 
BRW_AOP_CMPWR, instr); + break; + + case nir_intrinsic_get_buffer_size: { + nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]); + unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0; + + /* A resinfo's sampler message is used to get the buffer size. The + * SIMD8's writeback message consists of four registers and SIMD16's + * writeback message consists of 8 destination registers (two per each + * component). Because we are only interested on the first channel of + * the first returned component, where resinfo returns the buffer size + * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of + * the dispatch width. + */ + const fs_builder ubld = bld.exec_all().group(8, 0); + fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4); + + /* Set LOD = 0 */ + ubld.MOV(src_payload, brw_imm_d(0)); + + const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index; + fs_inst *inst = ubld.emit(FS_OPCODE_GET_BUFFER_SIZE, ret_payload, + src_payload, brw_imm_ud(index)); + inst->header_size = 0; + inst->mlen = 1; + inst->size_written = 4 * REG_SIZE; + + bld.MOV(retype(dest, ret_payload.type), component(ret_payload, 0)); + brw_mark_surface_used(prog_data, index); + break; + } + + case nir_intrinsic_load_channel_num: { + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW); + dest = retype(dest, BRW_REGISTER_TYPE_UD); + const fs_builder allbld8 = bld.group(8, 0).exec_all(); + allbld8.MOV(tmp, brw_imm_v(0x76543210)); + if (dispatch_width > 8) + allbld8.ADD(byte_offset(tmp, 16), tmp, brw_imm_uw(8u)); + if (dispatch_width > 16) { + const fs_builder allbld16 = bld.group(16, 0).exec_all(); + allbld16.ADD(byte_offset(tmp, 32), tmp, brw_imm_uw(16u)); + } + bld.MOV(dest, tmp); + break; + } + + default: + unreachable("unknown intrinsic"); + } +} + +void +fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld, + int op, nir_intrinsic_instr *instr) +{ + if (stage == 
MESA_SHADER_FRAGMENT) + brw_wm_prog_data(prog_data)->has_side_effects = true; + + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_dest(instr->dest); + + fs_reg surface; + nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]); + if (const_surface) { + unsigned surf_index = stage_prog_data->binding_table.ssbo_start + + const_surface->u32[0]; + surface = brw_imm_ud(surf_index); + brw_mark_surface_used(prog_data, surf_index); + } else { + surface = vgrf(glsl_type::uint_type); + bld.ADD(surface, get_nir_src(instr->src[0]), + brw_imm_ud(stage_prog_data->binding_table.ssbo_start)); + + /* Assume this may touch any SSBO, so mark the highest SSBO binding + * table entry as used. This is the same thing we do for other + * UBO/SSBO accesses with a non-constant surface index. + */ + brw_mark_surface_used(prog_data, + stage_prog_data->binding_table.ssbo_start + + nir->info->num_ssbos - 1); + } + + fs_reg offset = get_nir_src(instr->src[1]); + fs_reg data1 = get_nir_src(instr->src[2]); + fs_reg data2; + if (op == BRW_AOP_CMPWR) + data2 = get_nir_src(instr->src[3]); + + /* Emit the actual atomic operation */ + + fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset, + data1, data2, + 1 /* dims */, 1 /* rsize */, + op, + BRW_PREDICATE_NONE); + dest.type = atomic_result.type; + bld.MOV(dest, atomic_result); +} + +void +fs_visitor::nir_emit_shared_atomic(const fs_builder &bld, + int op, nir_intrinsic_instr *instr) +{ + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_dest(instr->dest); + + fs_reg surface = brw_imm_ud(GEN7_BTI_SLM); + fs_reg offset; + fs_reg data1 = get_nir_src(instr->src[1]); + fs_reg data2; + if (op == BRW_AOP_CMPWR) + data2 = get_nir_src(instr->src[2]); + + /* Get the offset */ + nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); + if (const_offset) { + offset = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]); + } else { + offset = vgrf(glsl_type::uint_type); + bld.ADD(offset, + 
retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD), + brw_imm_ud(instr->const_index[0])); + } + + /* Emit the actual atomic operation */ + + fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset, + data1, data2, + 1 /* dims */, 1 /* rsize */, + op, + BRW_PREDICATE_NONE); + dest.type = atomic_result.type; + bld.MOV(dest, atomic_result); +} + +void +fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) +{ + unsigned texture = instr->texture_index; + unsigned sampler = instr->sampler_index; + + fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; + + srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture); + srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler); + + int lod_components = 0; + + /* The hardware requires a LOD for buffer textures */ + if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) + srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0); + + uint32_t header_bits = 0; + for (unsigned i = 0; i < instr->num_srcs; i++) { + fs_reg src = get_nir_src(instr->src[i].src); + switch (instr->src[i].src_type) { + case nir_tex_src_bias: + srcs[TEX_LOGICAL_SRC_LOD] = + retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F); + break; + case nir_tex_src_comparator: + srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F); + break; + case nir_tex_src_coord: + switch (instr->op) { + case nir_texop_txf: + case nir_texop_txf_ms: + case nir_texop_txf_ms_mcs: + case nir_texop_samples_identical: + srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D); + break; + default: + srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F); + break; + } + break; + case nir_tex_src_ddx: + srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F); + lod_components = nir_tex_instr_src_size(instr, i); + break; + case nir_tex_src_ddy: + srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F); + break; + case nir_tex_src_lod: + switch (instr->op) { + case nir_texop_txs: + srcs[TEX_LOGICAL_SRC_LOD] = + 
retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD); + break; + case nir_texop_txf: + srcs[TEX_LOGICAL_SRC_LOD] = + retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D); + break; + default: + srcs[TEX_LOGICAL_SRC_LOD] = + retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F); + break; + } + break; + case nir_tex_src_ms_index: + srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD); + break; + + case nir_tex_src_offset: { + nir_const_value *const_offset = + nir_src_as_const_value(instr->src[i].src); + unsigned offset_bits = 0; + if (const_offset && + brw_texture_offset(const_offset->i32, + nir_tex_instr_src_size(instr, i), + &offset_bits)) { + header_bits |= offset_bits; + } else { + srcs[TEX_LOGICAL_SRC_TG4_OFFSET] = + retype(src, BRW_REGISTER_TYPE_D); + } + break; + } + + case nir_tex_src_projector: + unreachable("should be lowered"); + + case nir_tex_src_texture_offset: { + /* Figure out the highest possible texture index and mark it as used */ + uint32_t max_used = texture + instr->texture_array_size - 1; + if (instr->op == nir_texop_tg4 && devinfo->gen < 8) { + max_used += stage_prog_data->binding_table.gather_texture_start; + } else { + max_used += stage_prog_data->binding_table.texture_start; + } + brw_mark_surface_used(prog_data, max_used); + + /* Emit code to evaluate the actual indexing expression */ + fs_reg tmp = vgrf(glsl_type::uint_type); + bld.ADD(tmp, src, brw_imm_ud(texture)); + srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp); + break; + } + + case nir_tex_src_sampler_offset: { + /* Emit code to evaluate the actual indexing expression */ + fs_reg tmp = vgrf(glsl_type::uint_type); + bld.ADD(tmp, src, brw_imm_ud(sampler)); + srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp); + break; + } + + case nir_tex_src_ms_mcs: + assert(instr->op == nir_texop_txf_ms); + srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D); + break; + + case nir_tex_src_plane: { + nir_const_value 
*const_plane = + nir_src_as_const_value(instr->src[i].src); + const uint32_t plane = const_plane->u32[0]; + const uint32_t texture_index = + instr->texture_index + + stage_prog_data->binding_table.plane_start[plane] - + stage_prog_data->binding_table.texture_start; + + srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index); + break; + } + + default: + unreachable("unknown texture source"); + } + } + + if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE && + (instr->op == nir_texop_txf_ms || + instr->op == nir_texop_samples_identical)) { + if (devinfo->gen >= 7 && + key_tex->compressed_multisample_layout_mask & (1 << texture)) { + srcs[TEX_LOGICAL_SRC_MCS] = + emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE], + instr->coord_components, + srcs[TEX_LOGICAL_SRC_SURFACE]); + } else { + srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u); + } + } + + srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components); + srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components); + + if (instr->op == nir_texop_query_levels || + (instr->op == nir_texop_tex && stage != MESA_SHADER_FRAGMENT)) { + /* textureQueryLevels() and texture() are implemented in terms of TXS + * and TXL respectively, so we need to pass a valid LOD argument. + */ + assert(srcs[TEX_LOGICAL_SRC_LOD].file == BAD_FILE); + srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0u); + } + + enum opcode opcode; + switch (instr->op) { + case nir_texop_tex: + opcode = (stage == MESA_SHADER_FRAGMENT ? 
SHADER_OPCODE_TEX_LOGICAL : + SHADER_OPCODE_TXL_LOGICAL); + break; + case nir_texop_txb: + opcode = FS_OPCODE_TXB_LOGICAL; + break; + case nir_texop_txl: + opcode = SHADER_OPCODE_TXL_LOGICAL; + break; + case nir_texop_txd: + opcode = SHADER_OPCODE_TXD_LOGICAL; + break; + case nir_texop_txf: + opcode = SHADER_OPCODE_TXF_LOGICAL; + break; + case nir_texop_txf_ms: + if ((key_tex->msaa_16 & (1 << sampler))) + opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL; + else + opcode = SHADER_OPCODE_TXF_CMS_LOGICAL; + break; + case nir_texop_txf_ms_mcs: + opcode = SHADER_OPCODE_TXF_MCS_LOGICAL; + break; + case nir_texop_query_levels: + case nir_texop_txs: + opcode = SHADER_OPCODE_TXS_LOGICAL; + break; + case nir_texop_lod: + opcode = SHADER_OPCODE_LOD_LOGICAL; + break; + case nir_texop_tg4: + if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE) + opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL; + else + opcode = SHADER_OPCODE_TG4_LOGICAL; + break; + case nir_texop_texture_samples: + opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL; + break; + case nir_texop_samples_identical: { + fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D); + + /* If mcs is an immediate value, it means there is no MCS. In that case + * just return false. + */ + if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) { + bld.MOV(dst, brw_imm_ud(0u)); + } else if ((key_tex->msaa_16 & (1 << sampler))) { + fs_reg tmp = vgrf(glsl_type::uint_type); + bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS], + offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1)); + bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ); + } else { + bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u), + BRW_CONDITIONAL_EQ); + } + return; + } + default: + unreachable("unknown texture opcode"); + } + + if (instr->op == nir_texop_tg4) { + if (instr->component == 1 && + key_tex->gather_channel_quirk_mask & (1 << texture)) { + /* gather4 sampler is broken for green channel on RG32F -- + * we must ask for blue instead. 
+ */ + header_bits |= 2 << 16; + } else { + header_bits |= instr->component << 16; + } + } + + fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4); + fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs)); + inst->offset = header_bits; + + const unsigned dest_size = nir_tex_instr_dest_size(instr); + if (devinfo->gen >= 9 && + instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) { + unsigned write_mask = instr->dest.is_ssa ? + nir_ssa_def_components_read(&instr->dest.ssa): + (1 << dest_size) - 1; + assert(write_mask != 0); /* dead code should have been eliminated */ + inst->size_written = util_last_bit(write_mask) * + inst->dst.component_size(inst->exec_size); + } else { + inst->size_written = 4 * inst->dst.component_size(inst->exec_size); + } + + if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE) + inst->shadow_compare = true; + + if (instr->op == nir_texop_tg4 && devinfo->gen == 6) + emit_gen6_gather_wa(key_tex->gen6_gather_wa[texture], dst); + + fs_reg nir_dest[4]; + for (unsigned i = 0; i < dest_size; i++) + nir_dest[i] = offset(dst, bld, i); + + if (instr->op == nir_texop_query_levels) { + /* # levels is in .w */ + nir_dest[0] = offset(dst, bld, 3); + } else if (instr->op == nir_texop_txs && + dest_size >= 3 && devinfo->gen < 7) { + /* Gen4-6 return 0 instead of 1 for single layer surfaces. 
*/ + fs_reg depth = offset(dst, bld, 2); + nir_dest[2] = vgrf(glsl_type::int_type); + bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE); + } + + bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0); +} + +void +fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr) +{ + switch (instr->type) { + case nir_jump_break: + bld.emit(BRW_OPCODE_BREAK); + break; + case nir_jump_continue: + bld.emit(BRW_OPCODE_CONTINUE); + break; + case nir_jump_return: + default: + unreachable("unknown jump"); + } +} + +/** + * This helper takes the result of a load operation that reads 32-bit elements + * in this format: + * + * x x x x x x x x + * y y y y y y y y + * z z z z z z z z + * w w w w w w w w + * + * and shuffles the data to get this: + * + * x y x y x y x y + * x y x y x y x y + * z w z w z w z w + * z w z w z w z w + * + * Which is exactly what we want if the load is reading 64-bit components + * like doubles, where x represents the low 32-bit of the x double component + * and y represents the high 32-bit of the x double component (likewise with + * z and w for double component y). The parameter @components represents + * the number of 64-bit components present in @src. This would typically be + * 2 at most, since we can only fit 2 double elements in the result of a + * vec4 load. + * + * Notice that @dst and @src can be the same register. + */ +void +shuffle_32bit_load_result_to_64bit_data(const fs_builder &bld, + const fs_reg &dst, + const fs_reg &src, + uint32_t components) +{ + assert(type_sz(src.type) == 4); + assert(type_sz(dst.type) == 8); + + /* A temporary that we will use to shuffle the 32-bit data of each + * component in the vector into valid 64-bit data. We can't write directly + * to dst because dst can be (and would usually be) the same as src + * and in that case the first MOV in the loop below would overwrite the + * data read in the second MOV. 
+ */ + fs_reg tmp = bld.vgrf(dst.type); + + for (unsigned i = 0; i < components; i++) { + const fs_reg component_i = offset(src, bld, 2 * i); + + bld.MOV(subscript(tmp, src.type, 0), component_i); + bld.MOV(subscript(tmp, src.type, 1), offset(component_i, bld, 1)); + + bld.MOV(offset(dst, bld, i), tmp); + } +} + +/** + * This helper does the inverse operation of + * shuffle_32bit_load_result_to_64bit_data(). + * + * We need to do this when we are going to use untyped write messages that + * operate with 32-bit components in order to arrange our 64-bit data to be + * in the expected layout. + * + * Notice that callers of this function, unlike in the case of the inverse + * operation, would typically need to call this with dst and src being + * different registers, since they would otherwise corrupt the original + * 64-bit data they are about to write. Because of this the function checks + * that the src and dst regions involved in the operation do not overlap. + */ +void +shuffle_64bit_data_for_32bit_write(const fs_builder &bld, + const fs_reg &dst, + const fs_reg &src, + uint32_t components) +{ + assert(type_sz(src.type) == 8); + assert(type_sz(dst.type) == 4); + + assert(!regions_overlap( + dst, 2 * components * dst.component_size(bld.dispatch_width()), + src, components * src.component_size(bld.dispatch_width()))); + + for (unsigned i = 0; i < components; i++) { + const fs_reg component_i = offset(src, bld, i); + bld.MOV(offset(dst, bld, 2 * i), subscript(component_i, dst.type, 0)); + bld.MOV(offset(dst, bld, 2 * i + 1), subscript(component_i, dst.type, 1)); + } +} + +fs_reg +setup_imm_df(const fs_builder &bld, double v) +{ + const struct gen_device_info *devinfo = bld.shader->devinfo; + assert(devinfo->gen >= 7); + + if (devinfo->gen >= 8) + return brw_imm_df(v); + + /* gen7.5 does not support DF immediates directly, but the DIM + * instruction allows setting a 64-bit immediate value. 
+ */ + if (devinfo->is_haswell) { + const fs_builder ubld = bld.exec_all().group(1, 0); + fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1); + ubld.DIM(dst, brw_imm_df(v)); + return component(dst, 0); + } + + /* gen7 does not support DF immediates, so we generate a 64-bit constant by + * writing the low 32-bit of the constant to suboffset 0 of a VGRF and + * the high 32-bit to suboffset 4 and then applying a stride of 0. + * + * Alternatively, we could also produce a normal VGRF (without stride 0) + * by writing to all the channels in the VGRF, however, that would hit the + * gen7 bug where we have to split writes that span more than 1 register + * into instructions with a width of 4 (otherwise the write to the second + * register written runs into an execmask hardware bug) which isn't very + * nice. + */ + union { + double d; + struct { + uint32_t i1; + uint32_t i2; + }; + } di; + + di.d = v; + + const fs_builder ubld = bld.exec_all().group(1, 0); + const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); + ubld.MOV(tmp, brw_imm_ud(di.i1)); + ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2)); + + return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0); +} diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp new file mode 100644 index 00000000000..5c6f3d490f0 --- /dev/null +++ b/src/intel/compiler/brw_fs_reg_allocate.cpp @@ -0,0 +1,992 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * 
paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <[email protected]> + * + */ + +#include "brw_eu.h" +#include "brw_fs.h" +#include "brw_cfg.h" +#include "util/register_allocate.h" + +using namespace brw; + +static void +assign_reg(unsigned *reg_hw_locations, fs_reg *reg) +{ + if (reg->file == VGRF) { + reg->nr = reg_hw_locations[reg->nr] + reg->offset / REG_SIZE; + reg->offset %= REG_SIZE; + } +} + +void +fs_visitor::assign_regs_trivial() +{ + unsigned hw_reg_mapping[this->alloc.count + 1]; + unsigned i; + int reg_width = dispatch_width / 8; + + /* Note that compressed instructions require alignment to 2 registers, + * which is why the first non-payload GRF is aligned to reg_width below.
*/ + hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width); + for (i = 1; i <= this->alloc.count; i++) { + hw_reg_mapping[i] = (hw_reg_mapping[i - 1] + + this->alloc.sizes[i - 1]); + } + this->grf_used = hw_reg_mapping[this->alloc.count]; + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + assign_reg(hw_reg_mapping, &inst->dst); + for (i = 0; i < inst->sources; i++) { + assign_reg(hw_reg_mapping, &inst->src[i]); + } + } + + if (this->grf_used >= max_grf) { + fail("Ran out of regs on trivial allocator (%d/%d)\n", + this->grf_used, max_grf); + } else { + this->alloc.count = this->grf_used; + } + +} + +static void +brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width) +{ + const struct gen_device_info *devinfo = compiler->devinfo; + int base_reg_count = BRW_MAX_GRF; + const int index = _mesa_logbase2(dispatch_width / 8); + + if (dispatch_width > 8 && devinfo->gen >= 7) { + /* For IVB+, we don't need the PLN hacks or the even-reg alignment in + * SIMD16. Therefore, we can use the exact same register sets for + * SIMD16 as we do for SIMD8 and we don't need to recalculate them. + * + * Note that this copies fs_reg_sets[0], so the SIMD8 set has to have + * been built before this path is taken. + */ + compiler->fs_reg_sets[index] = compiler->fs_reg_sets[0]; + return; + } + + /* The registers used to make up almost all values handled in the compiler + * are a scalar value occupying a single register (or 2 registers in the + * case of SIMD16, which is handled by dividing base_reg_count by 2 and + * multiplying allocated register numbers by 2). Things that were + * aggregates of scalar values at the GLSL level were split to scalar + * values by split_virtual_grfs(). + * + * However, texture SEND messages return a series of contiguous registers + * to write into. We currently always ask for 4 registers, but we may + * convert that to use less some day. + * + * Additionally, on gen5 we need aligned pairs of registers for the PLN + * instruction, and on gen4 we need 8 contiguous regs for workaround simd16 + * texturing. 
+ */ + const int class_count = MAX_VGRF_SIZE; + int class_sizes[MAX_VGRF_SIZE]; + for (unsigned i = 0; i < MAX_VGRF_SIZE; i++) + class_sizes[i] = i + 1; + + memset(compiler->fs_reg_sets[index].class_to_ra_reg_range, 0, + sizeof(compiler->fs_reg_sets[index].class_to_ra_reg_range)); + int *class_to_ra_reg_range = compiler->fs_reg_sets[index].class_to_ra_reg_range; + + /* Compute the total number of registers across all classes. */ + int ra_reg_count = 0; + for (int i = 0; i < class_count; i++) { + if (devinfo->gen <= 5 && dispatch_width >= 16) { + /* From the G45 PRM: + * + * In order to reduce the hardware complexity, the following + * rules and restrictions apply to the compressed instruction: + * ... + * * Operand Alignment Rule: With the exceptions listed below, a + * source/destination operand in general should be aligned to + * even 256-bit physical register with a region size equal to + * two 256-bit physical register + */ + ra_reg_count += (base_reg_count - (class_sizes[i] - 1)) / 2; + } else { + ra_reg_count += base_reg_count - (class_sizes[i] - 1); + } + /* Mark the last register. We'll fill in the beginnings later. */ + class_to_ra_reg_range[class_sizes[i]] = ra_reg_count; + } + + /* Fill out the rest of the range markers. + * + * NOTE(review): the bound 17 is hard-coded; presumably it stands for + * MAX_VGRF_SIZE + 1, since the largest index written above is + * class_sizes[class_count - 1] == MAX_VGRF_SIZE -- confirm against the + * declared size of class_to_ra_reg_range. + */ + for (int i = 1; i < 17; ++i) { + if (class_to_ra_reg_range[i] == 0) + class_to_ra_reg_range[i] = class_to_ra_reg_range[i-1]; + } + + uint8_t *ra_reg_to_grf = ralloc_array(compiler, uint8_t, ra_reg_count); + struct ra_regs *regs = ra_alloc_reg_set(compiler, ra_reg_count, false); + if (devinfo->gen >= 6) + ra_set_allocate_round_robin(regs); + int *classes = ralloc_array(compiler, int, class_count); + int aligned_pairs_class = -1; + + /* Allocate space for q values. We allocate class_count + 1 because we + * want to leave room for the aligned pairs class if we have it. 
*/ + unsigned int **q_values = ralloc_array(compiler, unsigned int *, + class_count + 1); + for (int i = 0; i < class_count + 1; ++i) + q_values[i] = ralloc_array(q_values, unsigned int, class_count + 1); + + /* Now, add the registers to their classes, and add the conflicts + * between them and the base GRF registers (and also each other). + */ + int reg = 0; + int pairs_base_reg = 0; + int pairs_reg_count = 0; + for (int i = 0; i < class_count; i++) { + int class_reg_count; + if (devinfo->gen <= 5 && dispatch_width >= 16) { + class_reg_count = (base_reg_count - (class_sizes[i] - 1)) / 2; + + /* See comment below. The only difference here is that we are + * dealing with pairs of registers instead of single registers. + * Registers of odd sizes simply get rounded up. */ + for (int j = 0; j < class_count; j++) + q_values[i][j] = (class_sizes[i] + 1) / 2 + + (class_sizes[j] + 1) / 2 - 1; + } else { + class_reg_count = base_reg_count - (class_sizes[i] - 1); + + /* From register_allocate.c: + * + * q(B,C) (indexed by C, B is this register class) in + * Runeson/Nyström paper. This is "how many registers of B could + * the worst choice register from C conflict with". + * + * If we just let the register allocation algorithm compute these + * values, it is extremely expensive. However, since all of our + * registers are laid out, we can very easily compute them + * ourselves. View the register from C as fixed starting at GRF n + * somewhere in the middle, and the register from B as sliding back + * and forth. Then the first register to conflict from B is the + * one starting at n - class_size[B] + 1 and the last register to + * conflict will start at n + class_size[C] - 1. Therefore, the + * number of conflicts from B is class_size[B] + class_size[C] - 1. 
+ * + * +-+-+-+-+-+-+ +-+-+-+-+-+-+ + * B | | | | | |n| --> | | | | | | | + * +-+-+-+-+-+-+ +-+-+-+-+-+-+ + * +-+-+-+-+-+ + * C |n| | | | | + * +-+-+-+-+-+ + */ + for (int j = 0; j < class_count; j++) + q_values[i][j] = class_sizes[i] + class_sizes[j] - 1; + } + classes[i] = ra_alloc_reg_class(regs); + + /* Save this off for the aligned pair class at the end. */ + if (class_sizes[i] == 2) { + pairs_base_reg = reg; + pairs_reg_count = class_reg_count; + } + + if (devinfo->gen <= 5 && dispatch_width >= 16) { + for (int j = 0; j < class_reg_count; j++) { + ra_class_add_reg(regs, classes[i], reg); + + ra_reg_to_grf[reg] = j * 2; + + for (int base_reg = j; + base_reg < j + (class_sizes[i] + 1) / 2; + base_reg++) { + ra_add_reg_conflict(regs, base_reg, reg); + } + + reg++; + } + } else { + for (int j = 0; j < class_reg_count; j++) { + ra_class_add_reg(regs, classes[i], reg); + + ra_reg_to_grf[reg] = j; + + for (int base_reg = j; + base_reg < j + class_sizes[i]; + base_reg++) { + ra_add_reg_conflict(regs, base_reg, reg); + } + + reg++; + } + } + } + assert(reg == ra_reg_count); + + /* Applying transitivity to all of the base registers gives us the + * appropriate register conflict relationships everywhere. + */ + for (int reg = 0; reg < base_reg_count; reg++) + ra_make_reg_conflicts_transitive(regs, reg); + + /* Add a special class for aligned pairs, which we'll put delta_xy + * in on Gen <= 6 so that we can do PLN. + */ + if (devinfo->has_pln && dispatch_width == 8 && devinfo->gen <= 6) { + aligned_pairs_class = ra_alloc_reg_class(regs); + + for (int i = 0; i < pairs_reg_count; i++) { + if ((ra_reg_to_grf[pairs_base_reg + i] & 1) == 0) { + ra_class_add_reg(regs, aligned_pairs_class, pairs_base_reg + i); + } + } + + for (int i = 0; i < class_count; i++) { + /* These are a little counter-intuitive because the pair registers + * are required to be aligned while the register they are + * potentially interfering with are not. 
In the case where the + * size is even, the worst-case is that the register is + * odd-aligned. In the odd-size case, it doesn't matter. + */ + q_values[class_count][i] = class_sizes[i] / 2 + 1; + q_values[i][class_count] = class_sizes[i] + 1; + } + q_values[class_count][class_count] = 1; + } + + ra_set_finalize(regs, q_values); + + ralloc_free(q_values); + + compiler->fs_reg_sets[index].regs = regs; + for (unsigned i = 0; i < ARRAY_SIZE(compiler->fs_reg_sets[index].classes); i++) + compiler->fs_reg_sets[index].classes[i] = -1; + for (int i = 0; i < class_count; i++) + compiler->fs_reg_sets[index].classes[class_sizes[i] - 1] = classes[i]; + compiler->fs_reg_sets[index].ra_reg_to_grf = ra_reg_to_grf; + compiler->fs_reg_sets[index].aligned_pairs_class = aligned_pairs_class; +} + +void +brw_fs_alloc_reg_sets(struct brw_compiler *compiler) +{ + brw_alloc_reg_set(compiler, 8); + brw_alloc_reg_set(compiler, 16); + brw_alloc_reg_set(compiler, 32); +} + +static int +count_to_loop_end(const bblock_t *block) +{ + if (block->end()->opcode == BRW_OPCODE_WHILE) + return block->end_ip; + + int depth = 1; + /* Skip the first block, since we don't want to count the do the calling + * function found. 
+ */ + for (block = block->next(); + depth > 0; + block = block->next()) { + if (block->start()->opcode == BRW_OPCODE_DO) + depth++; + if (block->end()->opcode == BRW_OPCODE_WHILE) { + depth--; + if (depth == 0) + return block->end_ip; + } + } + unreachable("not reached"); +} + +void fs_visitor::calculate_payload_ranges(int payload_node_count, + int *payload_last_use_ip) +{ + int loop_depth = 0; + int loop_end_ip = 0; + + for (int i = 0; i < payload_node_count; i++) + payload_last_use_ip[i] = -1; + + int ip = 0; + foreach_block_and_inst(block, fs_inst, inst, cfg) { + switch (inst->opcode) { + case BRW_OPCODE_DO: + loop_depth++; + + /* Since payload regs are defined only at the start of the shader + * execution, any uses of the payload within a loop mean the live + * interval extends to the end of the outermost loop. Find the ip of + * the end now. + */ + if (loop_depth == 1) + loop_end_ip = count_to_loop_end(block); + break; + case BRW_OPCODE_WHILE: + loop_depth--; + break; + default: + break; + } + + int use_ip; + if (loop_depth > 0) + use_ip = loop_end_ip; + else + use_ip = ip; + + /* Note that UNIFORM args have been turned into FIXED_GRF by + * assign_curbe_setup(), and interpolation uses fixed hardware regs from + * the start (see interp_reg()). + * + * NOTE(review): in the inner loop below the bounds assert runs after + * the store to payload_last_use_ip[node_nr + j], so a build without + * asserts would silently write past the array if node_nr + j ever + * reached payload_node_count -- confirm that cannot happen. + */ + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == FIXED_GRF) { + int node_nr = inst->src[i].nr; + if (node_nr >= payload_node_count) + continue; + + for (unsigned j = 0; j < regs_read(inst, i); j++) { + payload_last_use_ip[node_nr + j] = use_ip; + assert(node_nr + j < unsigned(payload_node_count)); + } + } + } + + /* Special case instructions which have extra implied registers used. */ + switch (inst->opcode) { + case CS_OPCODE_CS_TERMINATE: + payload_last_use_ip[0] = use_ip; + break; + + default: + if (inst->eot) { + /* We could omit this for the !inst->header_present case, except + * that the simulator apparently incorrectly reads from g0/g1 + * instead of sideband. 
It also really freaks out driver + * developers to see g0 used in unusual places, so just always + * reserve it. + */ + payload_last_use_ip[0] = use_ip; + payload_last_use_ip[1] = use_ip; + } + break; + } + + ip++; + } +} + + +/** + * Sets up interference between thread payload registers and the virtual GRFs + * to be allocated for program temporaries. + * + * We want to be able to reallocate the payload for our virtual GRFs, notably + * because the setup coefficients for a full set of 16 FS inputs takes up 8 of + * our 128 registers. + * + * The layout of the payload registers is: + * + * 0..payload.num_regs-1: fixed function setup (including bary coordinates). + * payload.num_regs..payload.num_regs+curb_read_length-1: uniform data + * payload.num_regs+curb_read_length..first_non_payload_grf-1: setup coefficients. + * + * And we have payload_node_count nodes covering these registers in order + * (note that in SIMD16, a node is two registers). + */ +void +fs_visitor::setup_payload_interference(struct ra_graph *g, + int payload_node_count, + int first_payload_node) +{ + int payload_last_use_ip[payload_node_count]; + calculate_payload_ranges(payload_node_count, payload_last_use_ip); + + for (int i = 0; i < payload_node_count; i++) { + if (payload_last_use_ip[i] == -1) + continue; + + /* Mark the payload node as interfering with any virtual grf that is + * live between the start of the program and our last use of the payload + * node. + */ + for (unsigned j = 0; j < this->alloc.count; j++) { + /* Note that we use a <= comparison, unlike virtual_grf_interferes(), + * in order to not have to worry about the uniform issue described in + * calculate_live_intervals(). + */ + if (this->virtual_grf_start[j] <= payload_last_use_ip[i]) { + ra_add_node_interference(g, first_payload_node + i, j); + } + } + } + + for (int i = 0; i < payload_node_count; i++) { + /* Mark each payload node as being allocated to its physical register. 
+ * + * The alternative would be to have per-physical-register classes, which + * would just be silly. + */ + if (devinfo->gen <= 5 && dispatch_width >= 16) { + /* We have to divide by 2 here because we only have even numbered + * registers. Some of the payload registers will be odd, but + * that's ok because their physical register numbers have already + * been assigned. The only thing this is used for is interference. + */ + ra_set_node_reg(g, first_payload_node + i, i / 2); + } else { + ra_set_node_reg(g, first_payload_node + i, i); + } + } +} + +/** + * Sets the mrf_used array to indicate which MRFs are used by the shader IR. + * + * mrf_used must have room for BRW_MAX_MRF(gen) entries; all of them are + * cleared here before the used ones are marked. + * + * This is used in assign_regs() to decide which of the GRFs that we use as + * MRFs on gen7 get normally register allocated, and in register spilling to + * see if we can actually use MRFs to do spills without overwriting normal MRF + * contents. + */ +static void +get_used_mrfs(fs_visitor *v, bool *mrf_used) +{ + int reg_width = v->dispatch_width / 8; + + memset(mrf_used, 0, BRW_MAX_MRF(v->devinfo->gen) * sizeof(bool)); + + foreach_block_and_inst(block, fs_inst, inst, v->cfg) { + if (inst->dst.file == MRF) { + int reg = inst->dst.nr & ~BRW_MRF_COMPR4; + mrf_used[reg] = true; + if (reg_width == 2) { + if (inst->dst.nr & BRW_MRF_COMPR4) { + mrf_used[reg + 4] = true; + } else { + mrf_used[reg + 1] = true; + } + } + } + + if (inst->mlen > 0) { + for (int i = 0; i < v->implied_mrf_writes(inst); i++) { + mrf_used[inst->base_mrf + i] = true; + } + } + } +} + +/** + * Sets interference between virtual GRFs and usage of the high GRFs for SEND + * messages (treated as MRFs in code generation). 
+ */ +static void +setup_mrf_hack_interference(fs_visitor *v, struct ra_graph *g, + int first_mrf_node, int *first_used_mrf) +{ + bool mrf_used[BRW_MAX_MRF(v->devinfo->gen)]; + get_used_mrfs(v, mrf_used); + + *first_used_mrf = BRW_MAX_MRF(v->devinfo->gen); + for (int i = 0; i < BRW_MAX_MRF(v->devinfo->gen); i++) { + /* Mark each MRF reg node as being allocated to its physical register. + * + * The alternative would be to have per-physical-register classes, which + * would just be silly. + */ + ra_set_node_reg(g, first_mrf_node + i, GEN7_MRF_HACK_START + i); + + /* Since we don't have any live/dead analysis on the MRFs, just mark all + * that are used as conflicting with all virtual GRFs. + * + * The lowest used MRF index is also reported through *first_used_mrf + * (it stays at BRW_MAX_MRF(gen) when no MRF is used). + */ + if (mrf_used[i]) { + if (i < *first_used_mrf) + *first_used_mrf = i; + + for (unsigned j = 0; j < v->alloc.count; j++) { + ra_add_node_interference(g, first_mrf_node + i, j); + } + } + } +} + +bool +fs_visitor::assign_regs(bool allow_spilling, bool spill_all) +{ + /* Most of this allocation was written for a reg_width of 1 + * (dispatch_width == 8). In extending to SIMD16, the code was + * left in place and it was converted to have the hardware + * registers it's allocating be contiguous physical pairs of regs + * for reg_width == 2. 
+ */ + int reg_width = dispatch_width / 8; + unsigned hw_reg_mapping[this->alloc.count]; + int payload_node_count = ALIGN(this->first_non_payload_grf, reg_width); + int rsi = _mesa_logbase2(reg_width); /* Which compiler->fs_reg_sets[] to use */ + calculate_live_intervals(); + + int node_count = this->alloc.count; + int first_payload_node = node_count; + node_count += payload_node_count; + int first_mrf_hack_node = node_count; + if (devinfo->gen >= 7) + node_count += BRW_MAX_GRF - GEN7_MRF_HACK_START; + struct ra_graph *g = + ra_alloc_interference_graph(compiler->fs_reg_sets[rsi].regs, node_count); + + for (unsigned i = 0; i < this->alloc.count; i++) { + unsigned size = this->alloc.sizes[i]; + int c; + + assert(size <= ARRAY_SIZE(compiler->fs_reg_sets[rsi].classes) && + "Register allocation relies on split_virtual_grfs()"); + c = compiler->fs_reg_sets[rsi].classes[size - 1]; + + /* Special case: on pre-GEN6 hardware that supports PLN, the + * second operand of a PLN instruction needs to be an + * even-numbered register, so we have a special register class + * (fs_reg_sets[].aligned_pairs_class) to handle this case. pre-GEN6 always + * uses this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL] as the + * second operand of a PLN instruction (since it doesn't support + * any other interpolation modes). So all we need to do is find + * that register and set it to the appropriate class. + */ + if (compiler->fs_reg_sets[rsi].aligned_pairs_class >= 0 && + this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].file == VGRF && + this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].nr == i) { + c = compiler->fs_reg_sets[rsi].aligned_pairs_class; + } + + ra_set_node_class(g, i, c); + + for (unsigned j = 0; j < i; j++) { + if (virtual_grf_interferes(i, j)) { + ra_add_node_interference(g, i, j); + } + } + } + + /* Certain instructions can't safely use the same register for their + * sources and destination. Add interference. + * + * NOTE(review): the loop below always inspects src[0..2] rather than + * iterating up to inst->sources -- confirm that every instruction + * reporting a source/destination hazard has at least three sources.
+ */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) { + for (unsigned i = 0; i < 3; i++) { + if (inst->src[i].file == VGRF) { + ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr); + } + } + } + } + + setup_payload_interference(g, payload_node_count, first_payload_node); + if (devinfo->gen >= 7) { + int first_used_mrf = BRW_MAX_MRF(devinfo->gen); + setup_mrf_hack_interference(this, g, first_mrf_hack_node, + &first_used_mrf); + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + /* When we do send-from-GRF for FB writes, we need to ensure that + * the last write instruction sends from a high register. This is + * because the vertex fetcher wants to start filling the low + * payload registers while the pixel data port is still working on + * writing out the memory. If we don't do this, we get rendering + * artifacts. + * + * We could just do "something high". Instead, we just pick the + * highest register that works. + */ + if (inst->eot) { + int size = alloc.sizes[inst->src[0].nr]; + int reg = compiler->fs_reg_sets[rsi].class_to_ra_reg_range[size] - 1; + + /* If something happened to spill, we want to push the EOT send + * register early enough in the register file that we don't + * conflict with any used MRF hack registers. + */ + reg -= BRW_MAX_MRF(devinfo->gen) - first_used_mrf; + + ra_set_node_reg(g, inst->src[0].nr, reg); + break; + } + } + } + + if (dispatch_width > 8) { + /* In 16-wide dispatch we have an issue where a compressed + * instruction is actually two instructions executed simultaneously. + * It's actually ok to have the source and destination registers be + * the same. In this case, each instruction over-writes its own + * source and there's no problem. The real problem here is if the + * source and destination registers are off by one. Then you can end + * up in a scenario where the first instruction over-writes the + * source of the second instruction. 
Since the compiler doesn't know + * about this level of granularity, we simply make the source and + * destination interfere. + */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->dst.file != VGRF) + continue; + + for (int i = 0; i < inst->sources; ++i) { + if (inst->src[i].file == VGRF) { + ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr); + } + } + } + } + + /* Debug of register spilling: Go spill everything. Each call spills + * one register and returns false; the spill_all caller presumably keeps + * calling until choose_spill_reg() finds nothing left to spill. + */ + if (unlikely(spill_all)) { + int reg = choose_spill_reg(g); + + if (reg != -1) { + spill_reg(reg); + ralloc_free(g); + return false; + } + } + + if (!ra_allocate(g)) { + /* Failed to allocate registers. Spill a reg, and the caller will + * loop back into here to try again. + * + * Nothing is spilled when allow_spilling is false, but false is + * returned in every case. + */ + int reg = choose_spill_reg(g); + + if (reg == -1) { + fail("no register to spill:\n"); + dump_instructions(NULL); + } else if (allow_spilling) { + spill_reg(reg); + } + + ralloc_free(g); + + return false; + } + + /* Get the chosen virtual registers for each node, and map virtual + * regs in the register classes back down to real hardware reg + * numbers. + */ + this->grf_used = payload_node_count; + for (unsigned i = 0; i < this->alloc.count; i++) { + int reg = ra_get_node_reg(g, i); + + hw_reg_mapping[i] = compiler->fs_reg_sets[rsi].ra_reg_to_grf[reg]; + this->grf_used = MAX2(this->grf_used, + hw_reg_mapping[i] + this->alloc.sizes[i]); + } + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + assign_reg(hw_reg_mapping, &inst->dst); + for (int i = 0; i < inst->sources; i++) { + assign_reg(hw_reg_mapping, &inst->src[i]); + } + } + + this->alloc.count = this->grf_used; + + ralloc_free(g); + + return true; +} + +namespace { + /** + * Maximum spill block size we expect to encounter in 32B units. 
+ * + * This is somewhat arbitrary and doesn't necessarily limit the maximum + * variable size that can be spilled -- A higher value will allow a + * variable of a given size to be spilled more efficiently with a smaller + * number of scratch messages, but will increase the likelihood of a + * collision between the MRFs reserved for spilling and other MRFs used by + * the program (and possibly increase GRF register pressure on platforms + * without hardware MRFs), which could cause register allocation to fail. + * + * For the moment reserve just enough space so a register of 32 bit + * component type and natural region width can be spilled without splitting + * into multiple (force_writemask_all) scratch messages. + */ + unsigned + spill_max_size(const backend_shader *s) + { + /* FINISHME - On Gen7+ it should be possible to avoid this limit + * altogether by spilling directly from the temporary GRF + * allocated to hold the result of the instruction (and the + * scratch write header). + */ + /* FINISHME - The shader's dispatch width probably belongs in + * backend_shader (or some nonexistent fs_shader class?) + * rather than in the visitor class. + */ + return static_cast<const fs_visitor *>(s)->dispatch_width / 8; + } + + /** + * First MRF register available for spilling: the top of the MRF file + * minus spill_max_size() registers and one more register (presumably the + * message header -- emit_spill() uses mlen = 1 + reg_size). + */ + unsigned + spill_base_mrf(const backend_shader *s) + { + return BRW_MAX_MRF(s->devinfo->gen) - spill_max_size(s) - 1; + } +} + +static void +emit_unspill(const fs_builder &bld, fs_reg dst, + uint32_t spill_offset, unsigned count) +{ + const gen_device_info *devinfo = bld.shader->devinfo; + const unsigned reg_size = dst.component_size(bld.dispatch_width()) / + REG_SIZE; + assert(count % reg_size == 0); + + for (unsigned i = 0; i < count / reg_size; i++) { + /* The Gen7 descriptor-based offset is 12 bits of HWORD units. 
Because + * the Gen7-style scratch block read is hardwired to BTI 255, on Gen9+ + * it would cause the DC to do an IA-coherent read, which largely + * outweighs the slight advantage from not having to provide the address + * as part of the message header, so we're better off using plain old + * oword block reads. + */ + bool gen7_read = (devinfo->gen >= 7 && devinfo->gen < 9 && + spill_offset < (1 << 12) * REG_SIZE); + fs_inst *unspill_inst = bld.emit(gen7_read ? + SHADER_OPCODE_GEN7_SCRATCH_READ : + SHADER_OPCODE_GEN4_SCRATCH_READ, + dst); + unspill_inst->offset = spill_offset; + + if (!gen7_read) { + unspill_inst->base_mrf = spill_base_mrf(bld.shader); + unspill_inst->mlen = 1; /* header contains offset */ + } + + dst.offset += reg_size * REG_SIZE; + spill_offset += reg_size * REG_SIZE; + } +} + +static void +emit_spill(const fs_builder &bld, fs_reg src, + uint32_t spill_offset, unsigned count) +{ + const unsigned reg_size = src.component_size(bld.dispatch_width()) / + REG_SIZE; + assert(count % reg_size == 0); + + for (unsigned i = 0; i < count / reg_size; i++) { + fs_inst *spill_inst = + bld.emit(SHADER_OPCODE_GEN4_SCRATCH_WRITE, bld.null_reg_f(), src); + src.offset += reg_size * REG_SIZE; + spill_inst->offset = spill_offset + i * reg_size * REG_SIZE; + spill_inst->mlen = 1 + reg_size; /* header, value */ + spill_inst->base_mrf = spill_base_mrf(bld.shader); + } +} + +int +fs_visitor::choose_spill_reg(struct ra_graph *g) +{ + float loop_scale = 1.0; + float spill_costs[this->alloc.count]; + bool no_spill[this->alloc.count]; + + for (unsigned i = 0; i < this->alloc.count; i++) { + spill_costs[i] = 0.0; + no_spill[i] = false; + } + + /* Calculate costs for spilling nodes. Call it a cost of 1 per + * spill/unspill we'll have to do, and guess that the insides of + * loops run 10 times. 
+ */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + for (unsigned int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF) + spill_costs[inst->src[i].nr] += loop_scale; + } + + if (inst->dst.file == VGRF) + spill_costs[inst->dst.nr] += DIV_ROUND_UP(inst->size_written, REG_SIZE) + * loop_scale; + + switch (inst->opcode) { + + case BRW_OPCODE_DO: + loop_scale *= 10; + break; + + case BRW_OPCODE_WHILE: + loop_scale /= 10; + break; + + case SHADER_OPCODE_GEN4_SCRATCH_WRITE: + if (inst->src[0].file == VGRF) + no_spill[inst->src[0].nr] = true; + break; + + case SHADER_OPCODE_GEN4_SCRATCH_READ: + case SHADER_OPCODE_GEN7_SCRATCH_READ: + if (inst->dst.file == VGRF) + no_spill[inst->dst.nr] = true; + break; + + default: + break; + } + } + + for (unsigned i = 0; i < this->alloc.count; i++) { + if (!no_spill[i]) + ra_set_node_spill_cost(g, i, spill_costs[i]); + } + + return ra_get_best_spill_node(g); +} + +void +fs_visitor::spill_reg(int spill_reg) +{ + int size = alloc.sizes[spill_reg]; + unsigned int spill_offset = last_scratch; + assert(ALIGN(spill_offset, 16) == spill_offset); /* oword read/write req. */ + + /* Spills may use MRFs 13-15 in the SIMD16 case. Our texturing is done + * using up to 11 MRFs starting from either m1 or m2, and fb writes can use + * up to m13 (gen6+ simd16: 2 header + 8 color + 2 src0alpha + 2 omask) or + * m15 (gen4-5 simd16: 2 header + 8 color + 1 aads + 2 src depth + 2 dst + * depth), starting from m1. In summary: We may not be able to spill in + * SIMD16 mode, because we'd stomp the FB writes. 
+ */ + if (!spilled_any_registers) { + bool mrf_used[BRW_MAX_MRF(devinfo->gen)]; + get_used_mrfs(this, mrf_used); + + for (int i = spill_base_mrf(this); i < BRW_MAX_MRF(devinfo->gen); i++) { + if (mrf_used[i]) { + fail("Register spilling not supported with m%d used", i); + return; + } + } + + spilled_any_registers = true; + } + + last_scratch += size * REG_SIZE; + + /* Generate spill/unspill instructions for the objects being + * spilled. Right now, we spill or unspill the whole thing to a + * virtual grf of the same size. For most instructions, though, we + * could just spill/unspill the GRF being accessed. + */ + foreach_block_and_inst (block, fs_inst, inst, cfg) { + const fs_builder ibld = fs_builder(this, block, inst); + + for (unsigned int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF && + inst->src[i].nr == spill_reg) { + int count = regs_read(inst, i); + int subset_spill_offset = spill_offset + + ROUND_DOWN_TO(inst->src[i].offset, REG_SIZE); + fs_reg unspill_dst(VGRF, alloc.allocate(count)); + + inst->src[i].nr = unspill_dst.nr; + inst->src[i].offset %= REG_SIZE; + + /* We read the largest power-of-two divisor of the register count + * (because only POT scratch read blocks are allowed by the + * hardware) up to the maximum supported block size. + */ + const unsigned width = + MIN2(32, 1u << (ffs(MAX2(1, count) * 8) - 1)); + + /* Set exec_all() on unspill messages under the (rather + * pessimistic) assumption that there is no one-to-one + * correspondence between channels of the spilled variable in + * scratch space and the scratch read message, which operates on + * 32 bit channels. It shouldn't hurt in any case because the + * unspill destination is a block-local temporary. 
+ */ + emit_unspill(ibld.exec_all().group(width, 0), + unspill_dst, subset_spill_offset, count); + } + } + + if (inst->dst.file == VGRF && + inst->dst.nr == spill_reg) { + int subset_spill_offset = spill_offset + + ROUND_DOWN_TO(inst->dst.offset, REG_SIZE); + fs_reg spill_src(VGRF, alloc.allocate(regs_written(inst))); + + inst->dst.nr = spill_src.nr; + inst->dst.offset %= REG_SIZE; + + /* If we're immediately spilling the register, we should not use + * destination dependency hints. Doing so will cause the GPU to + * try to read and write the register at the same time and may + * hang the GPU. + */ + inst->no_dd_clear = false; + inst->no_dd_check = false; + + /* Calculate the execution width of the scratch messages (which work + * in terms of 32 bit components so we have a fixed number of eight + * channels per spilled register). We attempt to write one + * exec_size-wide component of the variable at a time without + * exceeding the maximum number of (fake) MRF registers reserved for + * spills. + */ + const unsigned width = 8 * MIN2( + DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE), + spill_max_size(this)); + + /* Spills should only write data initialized by the instruction for + * whichever channels are enabled in the execution mask. If that's + * not possible we'll have to emit a matching unspill before the + * instruction and set force_writemask_all on the spill. + */ + const bool per_channel = + inst->dst.is_contiguous() && type_sz(inst->dst.type) == 4 && + inst->exec_size == width; + + /* Builder used to emit the scratch messages. */ + const fs_builder ubld = ibld.exec_all(!per_channel).group(width, 0); + + /* If our write is going to affect just part of the + * regs_written(inst), then we need to unspill the destination since + * we write back out all of the regs_written(). 
If the original + * instruction had force_writemask_all set and is not a partial + * write, there should be no need for the unspill since the + * instruction will be overwriting the whole destination in any case. + */ + if (inst->is_partial_write() || + (!inst->force_writemask_all && !per_channel)) + emit_unspill(ubld, spill_src, subset_spill_offset, + regs_written(inst)); + + emit_spill(ubld.at(block, inst->next), spill_src, + subset_spill_offset, regs_written(inst)); + } + } + + invalidate_live_intervals(); +} diff --git a/src/intel/compiler/brw_fs_register_coalesce.cpp b/src/intel/compiler/brw_fs_register_coalesce.cpp new file mode 100644 index 00000000000..952276faed8 --- /dev/null +++ b/src/intel/compiler/brw_fs_register_coalesce.cpp @@ -0,0 +1,295 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +/** @file brw_fs_register_coalesce.cpp + * + * Implements register coalescing: Checks if the two registers involved in a + * raw move don't interfere, in which case they can both be stored in the same + * place and the MOV removed. + * + * To do this, all uses of the source of the MOV in the shader are replaced + * with the destination of the MOV. For example: + * + * add vgrf3:F, vgrf1:F, vgrf2:F + * mov vgrf4:F, vgrf3:F + * mul vgrf5:F, vgrf5:F, vgrf4:F + * + * becomes + * + * add vgrf4:F, vgrf1:F, vgrf2:F + * mul vgrf5:F, vgrf5:F, vgrf4:F + */ + +#include "brw_fs.h" +#include "brw_cfg.h" +#include "brw_fs_live_variables.h" + +static bool +is_nop_mov(const fs_inst *inst) +{ + if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) { + fs_reg dst = inst->dst; + for (int i = 0; i < inst->sources; i++) { + if (!dst.equals(inst->src[i])) { + return false; + } + dst.offset += (i < inst->header_size ? REG_SIZE : + inst->exec_size * dst.stride * + type_sz(inst->src[i].type)); + } + return true; + } else if (inst->opcode == BRW_OPCODE_MOV) { + return inst->dst.equals(inst->src[0]); + } + + return false; +} + +static bool +is_coalesce_candidate(const fs_visitor *v, const fs_inst *inst) +{ + if ((inst->opcode != BRW_OPCODE_MOV && + inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) || + inst->is_partial_write() || + inst->saturate || + inst->src[0].file != VGRF || + inst->src[0].negate || + inst->src[0].abs || + !inst->src[0].is_contiguous() || + inst->dst.file != VGRF || + inst->dst.type != inst->src[0].type) { + return false; + } + + if (v->alloc.sizes[inst->src[0].nr] > + v->alloc.sizes[inst->dst.nr]) + return false; + + if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) { + if (!inst->is_copy_payload(v->alloc)) { + return false; + } + } + + return true; +} + +static bool +can_coalesce_vars(brw::fs_live_variables *live_intervals, + const cfg_t *cfg, const fs_inst *inst, + int dst_var, int src_var) +{ + if (!live_intervals->vars_interfere(src_var, dst_var)) + return true; + + int 
dst_start = live_intervals->start[dst_var]; + int dst_end = live_intervals->end[dst_var]; + int src_start = live_intervals->start[src_var]; + int src_end = live_intervals->end[src_var]; + + /* Variables interfere and one live range isn't a subset of the other. */ + if ((dst_end > src_end && src_start < dst_start) || + (src_end > dst_end && dst_start < src_start)) + return false; + + /* Check for a write to either register in the intersection of their live + * ranges. + */ + int start_ip = MAX2(dst_start, src_start); + int end_ip = MIN2(dst_end, src_end); + + foreach_block(block, cfg) { + if (block->end_ip < start_ip) + continue; + + int scan_ip = block->start_ip - 1; + + foreach_inst_in_block(fs_inst, scan_inst, block) { + scan_ip++; + + /* Ignore anything before the intersection of the live ranges */ + if (scan_ip < start_ip) + continue; + + /* Ignore the copying instruction itself */ + if (scan_inst == inst) + continue; + + if (scan_ip > end_ip) + return true; /* registers do not interfere */ + + if (regions_overlap(scan_inst->dst, scan_inst->size_written, + inst->dst, inst->size_written) || + regions_overlap(scan_inst->dst, scan_inst->size_written, + inst->src[0], inst->size_read(0))) + return false; /* registers interfere */ + } + } + + return true; +} + +bool +fs_visitor::register_coalesce() +{ + bool progress = false; + + calculate_live_intervals(); + + int src_size = 0; + int channels_remaining = 0; + int src_reg = -1, dst_reg = -1; + int dst_reg_offset[MAX_VGRF_SIZE]; + fs_inst *mov[MAX_VGRF_SIZE]; + int dst_var[MAX_VGRF_SIZE]; + int src_var[MAX_VGRF_SIZE]; + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (!is_coalesce_candidate(this, inst)) + continue; + + if (is_nop_mov(inst)) { + inst->opcode = BRW_OPCODE_NOP; + progress = true; + continue; + } + + if (src_reg != inst->src[0].nr) { + src_reg = inst->src[0].nr; + + src_size = alloc.sizes[inst->src[0].nr]; + assert(src_size <= MAX_VGRF_SIZE); + + channels_remaining = src_size; + memset(mov, 0, 
sizeof(mov)); + + dst_reg = inst->dst.nr; + } + + if (dst_reg != inst->dst.nr) + continue; + + if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) { + for (int i = 0; i < src_size; i++) { + dst_reg_offset[i] = i; + } + mov[0] = inst; + channels_remaining -= regs_written(inst); + } else { + const int offset = inst->src[0].offset / REG_SIZE; + if (mov[offset]) { + /* This is the second time that this offset in the register has + * been set. This means, in particular, that inst->dst was + * live before this instruction and that the live ranges of + * inst->dst and inst->src[0] overlap and we can't coalesce the + * two variables. Let's ensure that doesn't happen. + */ + channels_remaining = -1; + continue; + } + for (unsigned i = 0; i < MAX2(inst->size_written / REG_SIZE, 1); i++) + dst_reg_offset[offset + i] = inst->dst.offset / REG_SIZE + i; + mov[offset] = inst; + channels_remaining -= regs_written(inst); + } + + if (channels_remaining) + continue; + + bool can_coalesce = true; + for (int i = 0; i < src_size; i++) { + if (dst_reg_offset[i] != dst_reg_offset[0] + i) { + /* Registers are out-of-order. 
*/ + can_coalesce = false; + src_reg = -1; + break; + } + + dst_var[i] = live_intervals->var_from_vgrf[dst_reg] + dst_reg_offset[i]; + src_var[i] = live_intervals->var_from_vgrf[src_reg] + i; + + if (!can_coalesce_vars(live_intervals, cfg, inst, + dst_var[i], src_var[i])) { + can_coalesce = false; + src_reg = -1; + break; + } + } + + if (!can_coalesce) + continue; + + progress = true; + + for (int i = 0; i < src_size; i++) { + if (mov[i]) { + mov[i]->opcode = BRW_OPCODE_NOP; + mov[i]->conditional_mod = BRW_CONDITIONAL_NONE; + mov[i]->dst = reg_undef; + for (int j = 0; j < mov[i]->sources; j++) { + mov[i]->src[j] = reg_undef; + } + } + } + + foreach_block_and_inst(block, fs_inst, scan_inst, cfg) { + if (scan_inst->dst.file == VGRF && + scan_inst->dst.nr == src_reg) { + scan_inst->dst.nr = dst_reg; + scan_inst->dst.offset = scan_inst->dst.offset % REG_SIZE + + dst_reg_offset[scan_inst->dst.offset / REG_SIZE] * REG_SIZE; + } + + for (int j = 0; j < scan_inst->sources; j++) { + if (scan_inst->src[j].file == VGRF && + scan_inst->src[j].nr == src_reg) { + scan_inst->src[j].nr = dst_reg; + scan_inst->src[j].offset = scan_inst->src[j].offset % REG_SIZE + + dst_reg_offset[scan_inst->src[j].offset / REG_SIZE] * REG_SIZE; + } + } + } + + for (int i = 0; i < src_size; i++) { + live_intervals->start[dst_var[i]] = + MIN2(live_intervals->start[dst_var[i]], + live_intervals->start[src_var[i]]); + live_intervals->end[dst_var[i]] = + MAX2(live_intervals->end[dst_var[i]], + live_intervals->end[src_var[i]]); + } + src_reg = -1; + } + + if (progress) { + foreach_block_and_inst_safe (block, backend_instruction, inst, cfg) { + if (inst->opcode == BRW_OPCODE_NOP) { + inst->remove(block); + } + } + + invalidate_live_intervals(); + } + + return progress; +} diff --git a/src/intel/compiler/brw_fs_saturate_propagation.cpp b/src/intel/compiler/brw_fs_saturate_propagation.cpp new file mode 100644 index 00000000000..1c97a507d8c --- /dev/null +++ 
b/src/intel/compiler/brw_fs_saturate_propagation.cpp @@ -0,0 +1,156 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_fs.h" +#include "brw_fs_live_variables.h" +#include "brw_cfg.h" + +/** @file brw_fs_saturate_propagation.cpp + * + * Implements a pass that propagates the SAT modifier from a MOV.SAT into the + * instruction that produced the source of the MOV.SAT, thereby allowing the + * MOV's src and dst to be coalesced and the MOV removed. 
+ * + * For instance, + * + * ADD tmp, src0, src1 + * MOV.SAT dst, tmp + * + * would be transformed into + * + * ADD.SAT tmp, src0, src1 + * MOV dst, tmp + */ + +static bool +opt_saturate_propagation_local(fs_visitor *v, bblock_t *block) +{ + bool progress = false; + int ip = block->end_ip + 1; + + foreach_inst_in_block_reverse(fs_inst, inst, block) { + ip--; + + if (inst->opcode != BRW_OPCODE_MOV || + !inst->saturate || + inst->dst.file != VGRF || + inst->dst.type != inst->src[0].type || + inst->src[0].file != VGRF || + inst->src[0].abs) + continue; + + int src_var = v->live_intervals->var_from_reg(inst->src[0]); + int src_end_ip = v->live_intervals->end[src_var]; + + bool interfered = false; + foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { + if (regions_overlap(scan_inst->dst, scan_inst->size_written, + inst->src[0], inst->size_read(0))) { + if (scan_inst->is_partial_write() || + (scan_inst->dst.type != inst->dst.type && + !scan_inst->can_change_types())) + break; + + if (scan_inst->saturate) { + inst->saturate = false; + progress = true; + } else if (src_end_ip == ip || inst->dst.equals(inst->src[0])) { + if (scan_inst->can_do_saturate()) { + if (scan_inst->dst.type != inst->dst.type) { + scan_inst->dst.type = inst->dst.type; + for (int i = 0; i < scan_inst->sources; i++) { + scan_inst->src[i].type = inst->dst.type; + } + } + + if (inst->src[0].negate) { + if (scan_inst->opcode == BRW_OPCODE_MUL) { + scan_inst->src[0].negate = !scan_inst->src[0].negate; + inst->src[0].negate = false; + } else if (scan_inst->opcode == BRW_OPCODE_MAD) { + scan_inst->src[0].negate = !scan_inst->src[0].negate; + scan_inst->src[1].negate = !scan_inst->src[1].negate; + inst->src[0].negate = false; + } else if (scan_inst->opcode == BRW_OPCODE_ADD) { + if (scan_inst->src[1].file == IMM) { + if (!brw_negate_immediate(scan_inst->src[1].type, + &scan_inst->src[1].as_brw_reg())) { + break; + } + } else { + scan_inst->src[1].negate = !scan_inst->src[1].negate; + } + 
scan_inst->src[0].negate = !scan_inst->src[0].negate; + inst->src[0].negate = false; + } else { + break; + } + } + + scan_inst->saturate = true; + inst->saturate = false; + progress = true; + } + } + break; + } + for (int i = 0; i < scan_inst->sources; i++) { + if (scan_inst->src[i].file == VGRF && + scan_inst->src[i].nr == inst->src[0].nr && + scan_inst->src[i].offset / REG_SIZE == + inst->src[0].offset / REG_SIZE) { + if (scan_inst->opcode != BRW_OPCODE_MOV || + !scan_inst->saturate || + scan_inst->src[0].abs || + scan_inst->src[0].negate || + scan_inst->src[0].abs != inst->src[0].abs || + scan_inst->src[0].negate != inst->src[0].negate) { + interfered = true; + break; + } + } + } + + if (interfered) + break; + } + } + + return progress; +} + +bool +fs_visitor::opt_saturate_propagation() +{ + bool progress = false; + + calculate_live_intervals(); + + foreach_block (block, cfg) { + progress = opt_saturate_propagation_local(this, block) || progress; + } + + /* Live intervals are still valid. */ + + return progress; +} diff --git a/src/intel/compiler/brw_fs_sel_peephole.cpp b/src/intel/compiler/brw_fs_sel_peephole.cpp new file mode 100644 index 00000000000..8cd897f72e0 --- /dev/null +++ b/src/intel/compiler/brw_fs_sel_peephole.cpp @@ -0,0 +1,220 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_fs.h" +#include "brw_cfg.h" + +/** @file brw_fs_sel_peephole.cpp + * + * This file contains the opt_peephole_sel() optimization pass that replaces + * MOV instructions to the same destination in the "then" and "else" bodies of + * an if statement with SEL instructions. + */ + +/* Four MOVs seems to be pretty typical, so I picked the next power of two in + * the hopes that it would handle almost anything possible in a single + * pass. + */ +#define MAX_MOVS 8 /**< The maximum number of MOVs to attempt to match. */ + +using namespace brw; + +/** + * Scans forwards from an IF counting consecutive MOV instructions in the + * "then" and "else" blocks of the if statement. + * + * A pointer to the bblock_t following the IF is passed as the <then_block> + * argument. The function stores pointers to the MOV instructions in the + * <then_mov> and <else_mov> arrays. + * + * \return the minimum number of MOVs found in the two branches or zero if + * an error occurred. + * + * E.g.: + * IF ... + * then_mov[0] = MOV g4, ... + * then_mov[1] = MOV g5, ... + * then_mov[2] = MOV g6, ... + * ELSE ... + * else_mov[0] = MOV g4, ... + * else_mov[1] = MOV g5, ... + * else_mov[2] = MOV g7, ... + * ENDIF + * returns 3. 
+ */ +static int +count_movs_from_if(fs_inst *then_mov[MAX_MOVS], fs_inst *else_mov[MAX_MOVS], + bblock_t *then_block, bblock_t *else_block) +{ + int then_movs = 0; + foreach_inst_in_block(fs_inst, inst, then_block) { + if (then_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV) + break; + + then_mov[then_movs] = inst; + then_movs++; + } + + int else_movs = 0; + foreach_inst_in_block(fs_inst, inst, else_block) { + if (else_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV) + break; + + else_mov[else_movs] = inst; + else_movs++; + } + + return MIN2(then_movs, else_movs); +} + +/** + * Try to replace IF/MOV+/ELSE/MOV+/ENDIF with SEL. + * + * Many GLSL shaders contain the following pattern: + * + * x = condition ? foo : bar + * + * or + * + * if (...) a.xyzw = foo.xyzw; + * else a.xyzw = bar.xyzw; + * + * The compiler emits an ir_if tree for this, since each subexpression might be + * a complex tree that could have side-effects or short-circuit logic. + * + * However, the common case is to simply select one of two constants or + * variable values---which is exactly what SEL is for. In this case, the + * assembly looks like: + * + * (+f0) IF + * MOV dst src0 + * ... + * ELSE + * MOV dst src1 + * ... + * ENDIF + * + * where each pair of MOVs to a common destination and can be easily translated + * into + * + * (+f0) SEL dst src0 src1 + * + * If src0 is an immediate value, we promote it to a temporary GRF. + */ +bool +fs_visitor::opt_peephole_sel() +{ + bool progress = false; + + foreach_block (block, cfg) { + /* IF instructions, by definition, can only be found at the ends of + * basic blocks. 
+ */ + fs_inst *if_inst = (fs_inst *)block->end(); + if (if_inst->opcode != BRW_OPCODE_IF) + continue; + + fs_inst *else_mov[MAX_MOVS] = { NULL }; + fs_inst *then_mov[MAX_MOVS] = { NULL }; + + bblock_t *then_block = block->next(); + bblock_t *else_block = NULL; + foreach_list_typed(bblock_link, child, link, &block->children) { + if (child->block != then_block) { + if (child->block->prev()->end()->opcode == BRW_OPCODE_ELSE) { + else_block = child->block; + } + break; + } + } + if (else_block == NULL) + continue; + + int movs = count_movs_from_if(then_mov, else_mov, then_block, else_block); + + if (movs == 0) + continue; + + /* Generate SEL instructions for pairs of MOVs to a common destination. */ + for (int i = 0; i < movs; i++) { + if (!then_mov[i] || !else_mov[i]) + break; + + /* Check that the MOVs are the right form. */ + if (!then_mov[i]->dst.equals(else_mov[i]->dst) || + then_mov[i]->exec_size != else_mov[i]->exec_size || + then_mov[i]->group != else_mov[i]->group || + then_mov[i]->force_writemask_all != else_mov[i]->force_writemask_all || + then_mov[i]->is_partial_write() || + else_mov[i]->is_partial_write() || + then_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE || + else_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE) { + movs = i; + break; + } + + /* Check that source types for mov operations match. */ + if (then_mov[i]->src[0].type != else_mov[i]->src[0].type) { + movs = i; + break; + } + } + + if (movs == 0) + continue; + + for (int i = 0; i < movs; i++) { + const fs_builder ibld = fs_builder(this, then_block, then_mov[i]) + .at(block, if_inst); + + if (then_mov[i]->src[0].equals(else_mov[i]->src[0])) { + ibld.MOV(then_mov[i]->dst, then_mov[i]->src[0]); + } else { + /* Only the last source register can be a constant, so if the MOV + * in the "then" clause uses a constant, we need to put it in a + * temporary. 
+ */ + fs_reg src0(then_mov[i]->src[0]); + if (src0.file == IMM) { + src0 = vgrf(glsl_type::float_type); + src0.type = then_mov[i]->src[0].type; + ibld.MOV(src0, then_mov[i]->src[0]); + } + + set_predicate_inv(if_inst->predicate, if_inst->predicate_inverse, + ibld.SEL(then_mov[i]->dst, src0, + else_mov[i]->src[0])); + } + + then_mov[i]->remove(then_block); + else_mov[i]->remove(else_block); + } + + progress = true; + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} diff --git a/src/intel/compiler/brw_fs_surface_builder.cpp b/src/intel/compiler/brw_fs_surface_builder.cpp new file mode 100644 index 00000000000..8990a5ca710 --- /dev/null +++ b/src/intel/compiler/brw_fs_surface_builder.cpp @@ -0,0 +1,1194 @@ +/* + * Copyright © 2013-2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "isl/isl.h" +#include "brw_fs_surface_builder.h" +#include "brw_fs.h" + +using namespace brw; + +namespace brw { + namespace surface_access { + namespace { + /** + * Generate a logical send opcode for a surface message and return + * the result. + */ + fs_reg + emit_send(const fs_builder &bld, enum opcode opcode, + const fs_reg &addr, const fs_reg &src, const fs_reg &surface, + unsigned dims, unsigned arg, unsigned rsize, + brw_predicate pred = BRW_PREDICATE_NONE) + { + /* Reduce the dynamically uniform surface index to a single + * scalar. + */ + const fs_reg usurface = bld.emit_uniformize(surface); + const fs_reg srcs[] = { + addr, src, usurface, brw_imm_ud(dims), brw_imm_ud(arg) + }; + const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize); + fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs)); + + inst->size_written = rsize * dst.component_size(inst->exec_size); + inst->predicate = pred; + return dst; + } + } + + /** + * Emit an untyped surface read opcode. \p dims determines the number + * of components of the address and \p size the number of components of + * the returned value. + */ + fs_reg + emit_untyped_read(const fs_builder &bld, + const fs_reg &surface, const fs_reg &addr, + unsigned dims, unsigned size, + brw_predicate pred) + { + return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, + addr, fs_reg(), surface, dims, size, size, pred); + } + + /** + * Emit an untyped surface write opcode. \p dims determines the number + * of components of the address and \p size the number of components of + * the argument. + */ + void + emit_untyped_write(const fs_builder &bld, const fs_reg &surface, + const fs_reg &addr, const fs_reg &src, + unsigned dims, unsigned size, + brw_predicate pred) + { + emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, + addr, src, surface, dims, size, 0, pred); + } + + /** + * Emit an untyped surface atomic opcode. 
\p dims determines the number + * of components of the address and \p rsize the number of components of + * the returned value (either zero or one). + */ + fs_reg + emit_untyped_atomic(const fs_builder &bld, + const fs_reg &surface, const fs_reg &addr, + const fs_reg &src0, const fs_reg &src1, + unsigned dims, unsigned rsize, unsigned op, + brw_predicate pred) + { + /* FINISHME: Factor out this frequently recurring pattern into a + * helper function. + */ + const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE); + const fs_reg srcs[] = { src0, src1 }; + const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n); + bld.LOAD_PAYLOAD(tmp, srcs, n, 0); + + return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, + addr, tmp, surface, dims, op, rsize, pred); + } + + /** + * Emit a typed surface read opcode. \p dims determines the number of + * components of the address and \p size the number of components of the + * returned value. + */ + fs_reg + emit_typed_read(const fs_builder &bld, const fs_reg &surface, + const fs_reg &addr, unsigned dims, unsigned size) + { + return emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL, + addr, fs_reg(), surface, dims, size, size); + } + + /** + * Emit a typed surface write opcode. \p dims determines the number of + * components of the address and \p size the number of components of the + * argument. + */ + void + emit_typed_write(const fs_builder &bld, const fs_reg &surface, + const fs_reg &addr, const fs_reg &src, + unsigned dims, unsigned size) + { + emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL, + addr, src, surface, dims, size, 0); + } + + /** + * Emit a typed surface atomic opcode. \p dims determines the number of + * components of the address and \p rsize the number of components of + * the returned value (either zero or one). 
+ */ + fs_reg + emit_typed_atomic(const fs_builder &bld, const fs_reg &surface, + const fs_reg &addr, + const fs_reg &src0, const fs_reg &src1, + unsigned dims, unsigned rsize, unsigned op, + brw_predicate pred) + { + /* FINISHME: Factor out this frequently recurring pattern into a + * helper function. + */ + const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE); + const fs_reg srcs[] = { src0, src1 }; + const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n); + bld.LOAD_PAYLOAD(tmp, srcs, n, 0); + + return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL, + addr, tmp, surface, dims, op, rsize); + } + } +} + +namespace { + namespace image_format_info { + /* The higher compiler layers use the GL enums for image formats even if + * they come in from SPIR-V or Vulkan. We need to turn them into an ISL + * enum before we can use them. + */ + enum isl_format + isl_format_for_gl_format(uint32_t gl_format) + { + switch (gl_format) { + case GL_R8: return ISL_FORMAT_R8_UNORM; + case GL_R8_SNORM: return ISL_FORMAT_R8_SNORM; + case GL_R8UI: return ISL_FORMAT_R8_UINT; + case GL_R8I: return ISL_FORMAT_R8_SINT; + case GL_RG8: return ISL_FORMAT_R8G8_UNORM; + case GL_RG8_SNORM: return ISL_FORMAT_R8G8_SNORM; + case GL_RG8UI: return ISL_FORMAT_R8G8_UINT; + case GL_RG8I: return ISL_FORMAT_R8G8_SINT; + case GL_RGBA8: return ISL_FORMAT_R8G8B8A8_UNORM; + case GL_RGBA8_SNORM: return ISL_FORMAT_R8G8B8A8_SNORM; + case GL_RGBA8UI: return ISL_FORMAT_R8G8B8A8_UINT; + case GL_RGBA8I: return ISL_FORMAT_R8G8B8A8_SINT; + case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT; + case GL_RGB10_A2: return ISL_FORMAT_R10G10B10A2_UNORM; + case GL_RGB10_A2UI: return ISL_FORMAT_R10G10B10A2_UINT; + case GL_R16: return ISL_FORMAT_R16_UNORM; + case GL_R16_SNORM: return ISL_FORMAT_R16_SNORM; + case GL_R16F: return ISL_FORMAT_R16_FLOAT; + case GL_R16UI: return ISL_FORMAT_R16_UINT; + case GL_R16I: return ISL_FORMAT_R16_SINT; + case GL_RG16: return ISL_FORMAT_R16G16_UNORM; + case GL_RG16_SNORM: 
return ISL_FORMAT_R16G16_SNORM; + case GL_RG16F: return ISL_FORMAT_R16G16_FLOAT; + case GL_RG16UI: return ISL_FORMAT_R16G16_UINT; + case GL_RG16I: return ISL_FORMAT_R16G16_SINT; + case GL_RGBA16: return ISL_FORMAT_R16G16B16A16_UNORM; + case GL_RGBA16_SNORM: return ISL_FORMAT_R16G16B16A16_SNORM; + case GL_RGBA16F: return ISL_FORMAT_R16G16B16A16_FLOAT; + case GL_RGBA16UI: return ISL_FORMAT_R16G16B16A16_UINT; + case GL_RGBA16I: return ISL_FORMAT_R16G16B16A16_SINT; + case GL_R32F: return ISL_FORMAT_R32_FLOAT; + case GL_R32UI: return ISL_FORMAT_R32_UINT; + case GL_R32I: return ISL_FORMAT_R32_SINT; + case GL_RG32F: return ISL_FORMAT_R32G32_FLOAT; + case GL_RG32UI: return ISL_FORMAT_R32G32_UINT; + case GL_RG32I: return ISL_FORMAT_R32G32_SINT; + case GL_RGBA32F: return ISL_FORMAT_R32G32B32A32_FLOAT; + case GL_RGBA32UI: return ISL_FORMAT_R32G32B32A32_UINT; + case GL_RGBA32I: return ISL_FORMAT_R32G32B32A32_SINT; + case GL_NONE: return ISL_FORMAT_UNSUPPORTED; + default: + assert(!"Invalid image format"); + return ISL_FORMAT_UNSUPPORTED; + } + } + + /** + * Simple 4-tuple of scalars used to pass around per-color component + * values. + */ + struct color_u { + color_u(unsigned x = 0) : r(x), g(x), b(x), a(x) + { + } + + color_u(unsigned r, unsigned g, unsigned b, unsigned a) : + r(r), g(g), b(b), a(a) + { + } + + unsigned + operator[](unsigned i) const + { + const unsigned xs[] = { r, g, b, a }; + return xs[i]; + } + + unsigned r, g, b, a; + }; + + /** + * Return the per-channel bitfield widths for a given image format. + */ + inline color_u + get_bit_widths(isl_format format) + { + const isl_format_layout *fmtl = isl_format_get_layout(format); + + return color_u(fmtl->channels.r.bits, + fmtl->channels.g.bits, + fmtl->channels.b.bits, + fmtl->channels.a.bits); + } + + /** + * Return the per-channel bitfield shifts for a given image format. 
+       */
+      inline color_u
+      get_bit_shifts(isl_format format)
+      {
+         /* Each channel starts where the previous one ends. */
+         const color_u bits = get_bit_widths(format);
+         const unsigned g_shift = bits.r;
+         const unsigned b_shift = g_shift + bits.g;
+         const unsigned a_shift = b_shift + bits.b;
+
+         return color_u(0, g_shift, b_shift, a_shift);
+      }
+
+      /**
+       * Tell whether every channel that is present (non-zero width) is as
+       * wide as the red channel.
+       */
+      inline bool
+      is_homogeneous(isl_format format)
+      {
+         const color_u bits = get_bit_widths(format);
+
+         if (bits.g != 0 && bits.g != bits.r)
+            return false;
+
+         if (bits.b != 0 && bits.b != bits.r)
+            return false;
+
+         return bits.a == 0 || bits.a == bits.r;
+      }
+
+      /**
+       * Tell whether converting to or from \p format is a plain copy: either
+       * all present channels are 32 bits wide, or the format is its own
+       * lowered storage image format.
+       */
+      inline bool
+      is_conversion_trivial(const gen_device_info *devinfo, isl_format format)
+      {
+         if (get_bit_widths(format).r == 32 && is_homogeneous(format))
+            return true;
+
+         return format == isl_lower_storage_image_format(devinfo, format);
+      }
+
+      /**
+       * Tell whether the lowered storage image format has the same channel
+       * bit widths as \p format, regardless of the data types involved.
+       */
+      inline bool
+      has_supported_bit_layout(const gen_device_info *devinfo,
+                               isl_format format)
+      {
+         const color_u bits = get_bit_widths(format);
+         const color_u lowered_bits = get_bit_widths(
+            isl_lower_storage_image_format(devinfo, format));
+
+         /* Compare r, g, b and a in turn. */
+         for (unsigned c = 0; c < 4; ++c) {
+            if (bits[c] != lowered_bits[c])
+               return false;
+         }
+
+         return true;
+      }
+
+      /**
+       * Tell whether the lowered storage image format has more channels than
+       * \p format, i.e. each component has to be spread over several
+       * components of the format used by the hardware (RG32 and friends
+       * implemented as RGBA16UI).
+       */
+      inline bool
+      has_split_bit_layout(const gen_device_info *devinfo, isl_format format)
+      {
+         const isl_format lowered =
+            isl_lower_storage_image_format(devinfo, format);
+         const unsigned num_channels = isl_format_get_num_channels(format);
+         const unsigned num_lowered_channels =
+            isl_format_get_num_channels(lowered);
+
+         return num_channels < num_lowered_channels;
+      }
+
+      /**
+       * Return true if the hardware returns garbage in the unused high bits
+       * of each component.
This may happen on IVB because we rely on the
+       * undocumented behavior that typed reads from surfaces of the
+       * unsupported R8 and R16 formats return useful data in their least
+       * significant bits.
+       *
+       * Concretely this is reported only for Gen7 devices other than
+       * Haswell, and only when \p format is lowered to R16_UINT or R8_UINT.
+       */
+      inline bool
+      has_undefined_high_bits(const gen_device_info *devinfo,
+                              isl_format format)
+      {
+         const isl_format lower_format =
+            isl_lower_storage_image_format(devinfo, format);
+
+         return (devinfo->gen == 7 && !devinfo->is_haswell &&
+                 (lower_format == ISL_FORMAT_R16_UINT ||
+                  lower_format == ISL_FORMAT_R8_UINT));
+      }
+
+      /**
+       * Return true if the format represents values as signed integers
+       * requiring sign extension when unpacking, i.e. if it has an SNORM or
+       * SINT channel.
+       */
+      inline bool
+      needs_sign_extension(isl_format format)
+      {
+         return isl_format_has_snorm_channel(format) ||
+                isl_format_has_sint_channel(format);
+      }
+   }
+
+   namespace image_validity {
+      /**
+       * Check whether the bound image is suitable for untyped access.
+       *
+       * On Gen7 other than Haswell this emits a comparison of the first
+       * stride component against four, predicated on \p pred, and returns
+       * BRW_PREDICATE_NORMAL.  On every other device nothing is emitted and
+       * \p pred is returned unchanged.
+       */
+      brw_predicate
+      emit_untyped_image_check(const fs_builder &bld, const fs_reg &image,
+                               brw_predicate pred)
+      {
+         const gen_device_info *devinfo = bld.shader->devinfo;
+         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
+
+         if (devinfo->gen == 7 && !devinfo->is_haswell) {
+            /* Check whether the first stride component (i.e. the Bpp value)
+             * is greater than four, which on Gen7 indicates that a surface of
+             * type RAW has been bound for untyped access. Reading or writing
+             * to a surface of type other than RAW using untyped surface
+             * messages causes a hang on IVB and VLV.
+             */
+            set_predicate(pred,
+                          bld.CMP(bld.null_reg_ud(), stride, brw_imm_d(4),
+                                  BRW_CONDITIONAL_G));
+
+            return BRW_PREDICATE_NORMAL;
+         } else {
+            /* More recent generations handle the format mismatch
+             * gracefully.
+             */
+            return pred;
+         }
+      }
+
+      /**
+       * Check whether there is an image bound at the given index and write
+       * the comparison result to f0.0. Returns an appropriate predication
+       * mode to use on subsequent image operations.
+       */
+      brw_predicate
+      emit_typed_atomic_check(const fs_builder &bld, const fs_reg &image)
+      {
+         const gen_device_info *devinfo = bld.shader->devinfo;
+         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
+
+         if (devinfo->gen == 7 && !devinfo->is_haswell) {
+            /* Check the first component of the size field to find out if the
+             * image is bound. Necessary on IVB for typed atomics because
+             * they don't seem to respect null surfaces and will happily
+             * corrupt or read random memory when no image is bound.
+             */
+            bld.CMP(bld.null_reg_ud(),
+                    retype(size, BRW_REGISTER_TYPE_UD),
+                    brw_imm_d(0), BRW_CONDITIONAL_NZ);
+
+            return BRW_PREDICATE_NORMAL;
+         } else {
+            /* More recent platforms implement compliant behavior when a null
+             * surface is bound.
+             */
+            return BRW_PREDICATE_NONE;
+         }
+      }
+
+      /**
+       * Check whether the provided coordinates are within the image bounds
+       * and write the comparison result to f0.0. Returns an appropriate
+       * predication mode to use on subsequent image operations.
+       *
+       * One "less than" comparison against the image size is emitted per
+       * coordinate. The first is unpredicated and each following one is
+       * predicated on the result of the previous ones. The return value is
+       * always BRW_PREDICATE_NORMAL, even when \p dims is zero and no
+       * comparison has been emitted.
+       */
+      brw_predicate
+      emit_bounds_check(const fs_builder &bld, const fs_reg &image,
+                        const fs_reg &addr, unsigned dims)
+      {
+         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
+
+         for (unsigned c = 0; c < dims; ++c)
+            set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL,
+                          bld.CMP(bld.null_reg_ud(),
+                                  offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c),
+                                  offset(size, bld, c),
+                                  BRW_CONDITIONAL_L));
+
+         return BRW_PREDICATE_NORMAL;
+      }
+   }
+
+   namespace image_coordinates {
+      /**
+       * Return the total number of coordinates needed to address a texel of
+       * the surface, which may be more than the sum of \p surf_dims and \p
+       * arr_dims if padding is required.
+       *
+       * One padding coordinate is added for a one-dimensional surface with
+       * one array dimension when \p format is a supported format that has no
+       * matching typed storage image format on this device.
+       */
+      unsigned
+      num_image_coordinates(const fs_builder &bld,
+                            unsigned surf_dims, unsigned arr_dims,
+                            isl_format format)
+      {
+         /* HSW in vec4 mode and our software coordinate handling for untyped
+          * reads want the array index to be at the Z component.
+          */
+         const bool array_index_at_z =
+            format != ISL_FORMAT_UNSUPPORTED &&
+            !isl_has_matching_typed_storage_image_format(
+               bld.shader->devinfo, format);
+         const unsigned zero_dims =
+            ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0);
+
+         return surf_dims + zero_dims + arr_dims;
+      }
+
+      /**
+       * Transform image coordinates into the form expected by the
+       * implementation.
+       *
+       * If num_image_coordinates() asks for a padding coordinate, a new
+       * register holding (addr.x, 0, addr.y) is built and returned;
+       * otherwise \p addr is returned as is.
+       */
+      fs_reg
+      emit_image_coordinates(const fs_builder &bld, const fs_reg &addr,
+                             unsigned surf_dims, unsigned arr_dims,
+                             isl_format format)
+      {
+         const unsigned dims =
+            num_image_coordinates(bld, surf_dims, arr_dims, format);
+
+         if (dims > surf_dims + arr_dims) {
+            assert(surf_dims == 1 && arr_dims == 1 && dims == 3);
+            /* The array index is required to be passed in as the Z component,
+             * insert a zero at the Y component to shift it to the right
+             * position.
+             *
+             * FINISHME: Factor out this frequently recurring pattern into a
+             * helper function.
+             */
+            const fs_reg srcs[] = { addr, brw_imm_d(0), offset(addr, bld, 1) };
+            const fs_reg dst = bld.vgrf(addr.type, dims);
+            bld.LOAD_PAYLOAD(dst, srcs, dims, 0);
+            return dst;
+         } else {
+            return addr;
+         }
+      }
+
+      /**
+       * Calculate the offset in memory of the texel given by \p coord.
+       *
+       * This is meant to be used with untyped surface messages to access a
+       * tiled surface, which involves taking into account the tiling and
+       * swizzling modes of the surface manually so it will hopefully not
+       * happen very often.
+       *
+       * The tiling algorithm implemented here matches either the X or Y
+       * tiling layouts supported by the hardware depending on the tiling
+       * coefficients passed to the program as uniforms. See Volume 1 Part 2
+       * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth
+       * explanation of the hardware tiling format.
+ */ + fs_reg + emit_address_calculation(const fs_builder &bld, const fs_reg &image, + const fs_reg &coord, unsigned dims) + { + const gen_device_info *devinfo = bld.shader->devinfo; + const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET); + const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET); + const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET); + const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET); + const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); + const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); + const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); + const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); + const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD); + + /* Shift the coordinates by the fixed surface offset. It may be + * non-zero if the image is a single slice of a higher-dimensional + * surface, or if a non-zero mipmap level of the surface is bound to + * the pipeline. The offset needs to be applied here rather than at + * surface state set-up time because the desired slice-level may + * start mid-tile, so simply shifting the surface base address + * wouldn't give a well-formed tiled surface in the general case. + */ + for (unsigned c = 0; c < 2; ++c) + bld.ADD(offset(addr, bld, c), offset(off, bld, c), + (c < dims ? + offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) : + fs_reg(brw_imm_d(0)))); + + /* The layout of 3-D textures in memory is sort-of like a tiling + * format. At each miplevel, the slices are arranged in rows of + * 2^level slices per row. The slice row is stored in tmp.y and + * the slice within the row is stored in tmp.x. + * + * The layout of 2-D array textures and cubemaps is much simpler: + * Depending on whether the ARYSPC_LOD0 layout is in use it will be + * stored in memory as an array of slices, each one being a 2-D + * arrangement of miplevels, or as a 2D arrangement of miplevels, + * each one being an array of slices. 
In either case the separation + * between slices of the same LOD is equal to the qpitch value + * provided as stride.w. + * + * This code can be made to handle either 2D arrays and 3D textures + * by passing in the miplevel as tile.z for 3-D textures and 0 in + * tile.z for 2-D array textures. + * + * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface + * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion + * of the hardware 3D texture and 2D array layouts. + */ + if (dims > 2) { + /* Decompose z into a major (tmp.y) and a minor (tmp.x) + * index. + */ + bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), brw_imm_d(0), + offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2)); + bld.SHR(offset(tmp, bld, 1), + offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2), + offset(tile, bld, 2)); + + /* Take into account the horizontal (tmp.x) and vertical (tmp.y) + * slice offset. + */ + for (unsigned c = 0; c < 2; ++c) { + bld.MUL(offset(tmp, bld, c), + offset(stride, bld, 2 + c), offset(tmp, bld, c)); + bld.ADD(offset(addr, bld, c), + offset(addr, bld, c), offset(tmp, bld, c)); + } + } + + if (dims > 1) { + /* Calculate the major/minor x and y indices. In order to + * accommodate both X and Y tiling, the Y-major tiling format is + * treated as being a bunch of narrow X-tiles placed next to each + * other. This means that the tile width for Y-tiling is actually + * the width of one sub-column of the Y-major tile where each 4K + * tile has 8 512B sub-columns. + * + * The major Y value is the row of tiles in which the pixel lives. + * The major X value is the tile sub-column in which the pixel + * lives; for X tiling, this is the same as the tile column, for Y + * tiling, each tile has 8 sub-columns. The minor X and Y indices + * are the position within the sub-column. + */ + for (unsigned c = 0; c < 2; ++c) { + /* Calculate the minor x and y indices. 
*/ + bld.BFE(offset(minor, bld, c), offset(tile, bld, c), + brw_imm_d(0), offset(addr, bld, c)); + + /* Calculate the major x and y indices. */ + bld.SHR(offset(major, bld, c), + offset(addr, bld, c), offset(tile, bld, c)); + } + + /* Calculate the texel index from the start of the tile row and + * the vertical coordinate of the row. + * Equivalent to: + * tmp.x = (major.x << tile.y << tile.x) + + * (minor.y << tile.x) + minor.x + * tmp.y = major.y << tile.y + */ + bld.SHL(tmp, major, offset(tile, bld, 1)); + bld.ADD(tmp, tmp, offset(minor, bld, 1)); + bld.SHL(tmp, tmp, offset(tile, bld, 0)); + bld.ADD(tmp, tmp, minor); + bld.SHL(offset(tmp, bld, 1), + offset(major, bld, 1), offset(tile, bld, 1)); + + /* Add it to the start of the tile row. */ + bld.MUL(offset(tmp, bld, 1), + offset(tmp, bld, 1), offset(stride, bld, 1)); + bld.ADD(tmp, tmp, offset(tmp, bld, 1)); + + /* Multiply by the Bpp value. */ + bld.MUL(dst, tmp, stride); + + if (devinfo->gen < 8 && !devinfo->is_baytrail) { + /* Take into account the two dynamically specified shifts. + * Both need are used to implement swizzling of X-tiled + * surfaces. For Y-tiled surfaces only one bit needs to be + * XOR-ed with bit 6 of the memory address, so a swz value of + * 0xff (actually interpreted as 31 by the hardware) will be + * provided to cause the relevant bit of tmp.y to be zero and + * turn the first XOR into the identity. For linear surfaces + * or platforms lacking address swizzling both shifts will be + * 0xff causing the relevant bits of both tmp.x and .y to be + * zero, what effectively disables swizzling. + */ + for (unsigned c = 0; c < 2; ++c) + bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c)); + + /* XOR tmp.x and tmp.y with bit 6 of the memory address. */ + bld.XOR(tmp, tmp, offset(tmp, bld, 1)); + bld.AND(tmp, tmp, brw_imm_d(1 << 6)); + bld.XOR(dst, dst, tmp); + } + + } else { + /* Multiply by the Bpp/stride value. 
Note that the addr.y may be + * non-zero even if the image is one-dimensional because a + * vertical offset may have been applied above to select a + * non-zero slice or level of a higher-dimensional texture. + */ + bld.MUL(offset(addr, bld, 1), + offset(addr, bld, 1), offset(stride, bld, 1)); + bld.ADD(addr, addr, offset(addr, bld, 1)); + bld.MUL(dst, addr, stride); + } + + return dst; + } + } + + namespace image_format_conversion { + using image_format_info::color_u; + + namespace { + /** + * Maximum representable value in an unsigned integer with the given + * number of bits. + */ + inline unsigned + scale(unsigned n) + { + return (1 << n) - 1; + } + } + + /** + * Pack the vector \p src in a bitfield given the per-component bit + * shifts and widths. Note that bitfield components are not allowed to + * cross 32-bit boundaries. + */ + fs_reg + emit_pack(const fs_builder &bld, const fs_reg &src, + const color_u &shifts, const color_u &widths) + { + const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4); + bool seen[4] = {}; + + for (unsigned c = 0; c < 4; ++c) { + if (widths[c]) { + const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); + + /* Shift each component left to the correct bitfield position. */ + bld.SHL(tmp, offset(src, bld, c), brw_imm_ud(shifts[c] % 32)); + + /* Add everything up. */ + if (seen[shifts[c] / 32]) { + bld.OR(offset(dst, bld, shifts[c] / 32), + offset(dst, bld, shifts[c] / 32), tmp); + } else { + bld.MOV(offset(dst, bld, shifts[c] / 32), tmp); + seen[shifts[c] / 32] = true; + } + } + } + + return dst; + } + + /** + * Unpack a vector from the bitfield \p src given the per-component bit + * shifts and widths. Note that bitfield components are not allowed to + * cross 32-bit boundaries. 
+ */ + fs_reg + emit_unpack(const fs_builder &bld, const fs_reg &src, + const color_u &shifts, const color_u &widths) + { + const fs_reg dst = bld.vgrf(src.type, 4); + + for (unsigned c = 0; c < 4; ++c) { + if (widths[c]) { + /* Shift left to discard the most significant bits. */ + bld.SHL(offset(dst, bld, c), + offset(src, bld, shifts[c] / 32), + brw_imm_ud(32 - shifts[c] % 32 - widths[c])); + + /* Shift back to the least significant bits using an arithmetic + * shift to get sign extension on signed types. + */ + bld.ASR(offset(dst, bld, c), + offset(dst, bld, c), brw_imm_ud(32 - widths[c])); + } + } + + return dst; + } + + /** + * Convert an integer vector into another integer vector of the + * specified bit widths, properly handling overflow. + */ + fs_reg + emit_convert_to_integer(const fs_builder &bld, const fs_reg &src, + const color_u &widths, bool is_signed) + { + const unsigned s = (is_signed ? 1 : 0); + const fs_reg dst = bld.vgrf( + is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4); + assert(src.type == dst.type); + + for (unsigned c = 0; c < 4; ++c) { + if (widths[c]) { + /* Clamp to the maximum value. */ + bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c), + brw_imm_d((int)scale(widths[c] - s)), + BRW_CONDITIONAL_L); + + /* Clamp to the minimum value. */ + if (is_signed) + bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c), + brw_imm_d(-(int)scale(widths[c] - s) - 1), + BRW_CONDITIONAL_GE); + + /* Mask off all but the bits we actually want. Otherwise, if + * we pass a negative number into the hardware when it's + * expecting something like UINT8, it will happily clamp it to + * +255 for us. + */ + if (is_signed && widths[c] < 32) + bld.AND(offset(dst, bld, c), offset(dst, bld, c), + brw_imm_d(scale(widths[c]))); + } + } + + return dst; + } + + /** + * Convert a normalized fixed-point vector of the specified signedness + * and bit widths into a floating point vector. 
+ */ + fs_reg + emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src, + const color_u &widths, bool is_signed) + { + const unsigned s = (is_signed ? 1 : 0); + const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4); + + for (unsigned c = 0; c < 4; ++c) { + if (widths[c]) { + /* Convert to float. */ + bld.MOV(offset(dst, bld, c), offset(src, bld, c)); + + /* Divide by the normalization constants. */ + bld.MUL(offset(dst, bld, c), offset(dst, bld, c), + brw_imm_f(1.0f / scale(widths[c] - s))); + + /* Clamp to the minimum value. */ + if (is_signed) + bld.emit_minmax(offset(dst, bld, c), + offset(dst, bld, c), brw_imm_f(-1.0f), + BRW_CONDITIONAL_GE); + } + } + return dst; + } + + /** + * Convert a floating-point vector into a normalized fixed-point vector + * of the specified signedness and bit widths. + */ + fs_reg + emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src, + const color_u &widths, bool is_signed) + { + const unsigned s = (is_signed ? 1 : 0); + const fs_reg dst = bld.vgrf( + is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4); + const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F); + + for (unsigned c = 0; c < 4; ++c) { + if (widths[c]) { + /* Clamp the normalized floating-point argument. */ + if (is_signed) { + bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c), + brw_imm_f(-1.0f), BRW_CONDITIONAL_GE); + + bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c), + brw_imm_f(1.0f), BRW_CONDITIONAL_L); + } else { + set_saturate(true, bld.MOV(offset(fdst, bld, c), + offset(src, bld, c))); + } + + /* Multiply by the normalization constants. */ + bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c), + brw_imm_f((float)scale(widths[c] - s))); + + /* Convert to integer. */ + bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c)); + bld.MOV(offset(dst, bld, c), offset(fdst, bld, c)); + + /* Mask off all but the bits we actually want. 
Otherwise, if + * we pass a negative number into the hardware when it's + * expecting something like UINT8, it will happily clamp it to + * +255 for us. + */ + if (is_signed && widths[c] < 32) + bld.AND(offset(dst, bld, c), offset(dst, bld, c), + brw_imm_d(scale(widths[c]))); + } + } + + return dst; + } + + /** + * Convert a floating point vector of the specified bit widths into a + * 32-bit floating point vector. + */ + fs_reg + emit_convert_from_float(const fs_builder &bld, const fs_reg &src, + const color_u &widths) + { + const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4); + const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F); + + for (unsigned c = 0; c < 4; ++c) { + if (widths[c]) { + bld.MOV(offset(dst, bld, c), offset(src, bld, c)); + + /* Extend 10-bit and 11-bit floating point numbers to 15 bits. + * This works because they have a 5-bit exponent just like the + * 16-bit floating point format, and they have no sign bit. + */ + if (widths[c] < 16) + bld.SHL(offset(dst, bld, c), + offset(dst, bld, c), brw_imm_ud(15 - widths[c])); + + /* Convert to 32-bit floating point. */ + bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c)); + } + } + + return fdst; + } + + /** + * Convert a vector into a floating point vector of the specified bit + * widths. + */ + fs_reg + emit_convert_to_float(const fs_builder &bld, const fs_reg &src, + const color_u &widths) + { + const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4); + const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F); + + for (unsigned c = 0; c < 4; ++c) { + if (widths[c]) { + bld.MOV(offset(fdst, bld, c), offset(src, bld, c)); + + /* Clamp to the minimum value. */ + if (widths[c] < 16) + bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c), + brw_imm_f(0.0f), BRW_CONDITIONAL_GE); + + /* Convert to 16-bit floating-point. */ + bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c)); + + /* Discard the least significant bits to get floating point + * numbers of the requested width. 
This works because the + * 10-bit and 11-bit floating point formats have a 5-bit + * exponent just like the 16-bit format, and they have no sign + * bit. + */ + if (widths[c] < 16) + bld.SHR(offset(dst, bld, c), offset(dst, bld, c), + brw_imm_ud(15 - widths[c])); + } + } + + return dst; + } + + /** + * Fill missing components of a vector with 0, 0, 0, 1. + */ + fs_reg + emit_pad(const fs_builder &bld, const fs_reg &src, + const color_u &widths) + { + const fs_reg dst = bld.vgrf(src.type, 4); + const unsigned pad[] = { 0, 0, 0, 1 }; + + for (unsigned c = 0; c < 4; ++c) + bld.MOV(offset(dst, bld, c), + widths[c] ? offset(src, bld, c) + : fs_reg(brw_imm_ud(pad[c]))); + + return dst; + } + } +} + +namespace brw { + namespace image_access { + /** + * Load a vector from a surface of the given format and dimensionality + * at the given coordinates. \p surf_dims and \p arr_dims give the + * number of non-array and array coordinates of the image respectively. + */ + fs_reg + emit_image_load(const fs_builder &bld, + const fs_reg &image, const fs_reg &addr, + unsigned surf_dims, unsigned arr_dims, + unsigned gl_format) + { + using namespace image_format_info; + using namespace image_format_conversion; + using namespace image_validity; + using namespace image_coordinates; + using namespace surface_access; + const gen_device_info *devinfo = bld.shader->devinfo; + const isl_format format = isl_format_for_gl_format(gl_format); + const isl_format lower_format = + isl_lower_storage_image_format(devinfo, format); + fs_reg tmp; + + /* Transform the image coordinates into actual surface coordinates. */ + const fs_reg saddr = + emit_image_coordinates(bld, addr, surf_dims, arr_dims, format); + const unsigned dims = + num_image_coordinates(bld, surf_dims, arr_dims, format); + + if (isl_has_matching_typed_storage_image_format(devinfo, format)) { + /* Hopefully we get here most of the time... 
*/ + tmp = emit_typed_read(bld, image, saddr, dims, + isl_format_get_num_channels(lower_format)); + } else { + /* Untyped surface reads return 32 bits of the surface per + * component, without any sort of unpacking or type conversion, + */ + const unsigned size = isl_format_get_layout(format)->bpb / 32; + /* they don't properly handle out of bounds access, so we have to + * check manually if the coordinates are valid and predicate the + * surface read on the result, + */ + const brw_predicate pred = + emit_untyped_image_check(bld, image, + emit_bounds_check(bld, image, + saddr, dims)); + + /* and they don't know about surface coordinates, we need to + * convert them to a raw memory offset. + */ + const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims); + + tmp = emit_untyped_read(bld, image, laddr, 1, size, pred); + + /* An out of bounds surface access should give zero as result. */ + for (unsigned c = 0; c < size; ++c) + set_predicate(pred, bld.SEL(offset(tmp, bld, c), + offset(tmp, bld, c), brw_imm_d(0))); + } + + /* Set the register type to D instead of UD if the data type is + * represented as a signed integer in memory so that sign extension + * is handled correctly by unpack. + */ + if (needs_sign_extension(format)) + tmp = retype(tmp, BRW_REGISTER_TYPE_D); + + if (!has_supported_bit_layout(devinfo, format)) { + /* Unpack individual vector components from the bitfield if the + * hardware is unable to do it for us. + */ + if (has_split_bit_layout(devinfo, format)) + tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format), + get_bit_widths(lower_format)); + else + tmp = emit_unpack(bld, tmp, get_bit_shifts(format), + get_bit_widths(format)); + + } else if ((needs_sign_extension(format) && + !is_conversion_trivial(devinfo, format)) || + has_undefined_high_bits(devinfo, format)) { + /* Perform a trivial unpack even though the bit layout matches in + * order to get the most significant bits of each component + * initialized properly. 
+ */ + tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96), + get_bit_widths(format)); + } + + if (!isl_format_has_int_channel(format)) { + if (is_conversion_trivial(devinfo, format)) { + /* Just need to cast the vector to the target type. */ + tmp = retype(tmp, BRW_REGISTER_TYPE_F); + } else { + /* Do the right sort of type conversion to float. */ + if (isl_format_has_float_channel(format)) + tmp = emit_convert_from_float( + bld, tmp, get_bit_widths(format)); + else + tmp = emit_convert_from_scaled( + bld, tmp, get_bit_widths(format), + isl_format_has_snorm_channel(format)); + } + } + + /* Initialize missing components of the result. */ + return emit_pad(bld, tmp, get_bit_widths(format)); + } + + /** + * Store a vector in a surface of the given format and dimensionality at + * the given coordinates. \p surf_dims and \p arr_dims give the number + * of non-array and array coordinates of the image respectively. + */ + void + emit_image_store(const fs_builder &bld, const fs_reg &image, + const fs_reg &addr, const fs_reg &src, + unsigned surf_dims, unsigned arr_dims, + unsigned gl_format) + { + using namespace image_format_info; + using namespace image_format_conversion; + using namespace image_validity; + using namespace image_coordinates; + using namespace surface_access; + const isl_format format = isl_format_for_gl_format(gl_format); + const gen_device_info *devinfo = bld.shader->devinfo; + + /* Transform the image coordinates into actual surface coordinates. */ + const fs_reg saddr = + emit_image_coordinates(bld, addr, surf_dims, arr_dims, format); + const unsigned dims = + num_image_coordinates(bld, surf_dims, arr_dims, format); + + if (gl_format == GL_NONE) { + /* We don't know what the format is, but that's fine because it + * implies write-only access, and typed surface writes are always + * able to take care of type conversion and packing for us. 
+ */ + emit_typed_write(bld, image, saddr, src, dims, 4); + + } else { + const isl_format lower_format = + isl_lower_storage_image_format(devinfo, format); + fs_reg tmp = src; + + if (!is_conversion_trivial(devinfo, format)) { + /* Do the right sort of type conversion. */ + if (isl_format_has_float_channel(format)) + tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format)); + + else if (isl_format_has_int_channel(format)) + tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format), + isl_format_has_sint_channel(format)); + + else + tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format), + isl_format_has_snorm_channel(format)); + } + + /* We're down to bit manipulation at this point. */ + tmp = retype(tmp, BRW_REGISTER_TYPE_UD); + + if (!has_supported_bit_layout(devinfo, format)) { + /* Pack the vector components into a bitfield if the hardware + * is unable to do it for us. + */ + if (has_split_bit_layout(devinfo, format)) + tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format), + get_bit_widths(lower_format)); + + else + tmp = emit_pack(bld, tmp, get_bit_shifts(format), + get_bit_widths(format)); + } + + if (isl_has_matching_typed_storage_image_format(devinfo, format)) { + /* Hopefully we get here most of the time... */ + emit_typed_write(bld, image, saddr, tmp, dims, + isl_format_get_num_channels(lower_format)); + + } else { + /* Untyped surface writes store 32 bits of the surface per + * component, without any sort of packing or type conversion, + */ + const unsigned size = isl_format_get_layout(format)->bpb / 32; + + /* they don't properly handle out of bounds access, so we have + * to check manually if the coordinates are valid and predicate + * the surface write on the result, + */ + const brw_predicate pred = + emit_untyped_image_check(bld, image, + emit_bounds_check(bld, image, + saddr, dims)); + + /* and, phew, they don't know about surface coordinates, we + * need to convert them to a raw memory offset. 
+ */ + const fs_reg laddr = emit_address_calculation( + bld, image, saddr, dims); + + emit_untyped_write(bld, image, laddr, tmp, 1, size, pred); + } + } + } + + /** + * Perform an atomic read-modify-write operation in a surface of the + * given dimensionality at the given coordinates. \p surf_dims and \p + * arr_dims give the number of non-array and array coordinates of the + * image respectively. Main building block of the imageAtomic GLSL + * built-ins. + */ + fs_reg + emit_image_atomic(const fs_builder &bld, + const fs_reg &image, const fs_reg &addr, + const fs_reg &src0, const fs_reg &src1, + unsigned surf_dims, unsigned arr_dims, + unsigned rsize, unsigned op) + { + using namespace image_validity; + using namespace image_coordinates; + using namespace surface_access; + /* Avoid performing an atomic operation on an unbound surface. */ + const brw_predicate pred = emit_typed_atomic_check(bld, image); + + /* Transform the image coordinates into actual surface coordinates. */ + const fs_reg saddr = + emit_image_coordinates(bld, addr, surf_dims, arr_dims, + ISL_FORMAT_R32_UINT); + const unsigned dims = + num_image_coordinates(bld, surf_dims, arr_dims, + ISL_FORMAT_R32_UINT); + + /* Thankfully we can do without untyped atomics here. */ + const fs_reg tmp = emit_typed_atomic(bld, image, saddr, src0, src1, + dims, rsize, op, pred); + + /* An unbound surface access should give zero as result. 
*/ + if (rsize && pred) + set_predicate(pred, bld.SEL(tmp, tmp, brw_imm_d(0))); + + return retype(tmp, src0.type); + } + } +} diff --git a/src/intel/compiler/brw_fs_surface_builder.h b/src/intel/compiler/brw_fs_surface_builder.h new file mode 100644 index 00000000000..32b56d387f6 --- /dev/null +++ b/src/intel/compiler/brw_fs_surface_builder.h @@ -0,0 +1,88 @@ +/* -*- c++ -*- */ +/* + * Copyright © 2013-2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef BRW_FS_SURFACE_BUILDER_H +#define BRW_FS_SURFACE_BUILDER_H + +#include "brw_fs_builder.h" + +namespace brw { + namespace surface_access { + fs_reg + emit_untyped_read(const fs_builder &bld, + const fs_reg &surface, const fs_reg &addr, + unsigned dims, unsigned size, + brw_predicate pred = BRW_PREDICATE_NONE); + + void + emit_untyped_write(const fs_builder &bld, const fs_reg &surface, + const fs_reg &addr, const fs_reg &src, + unsigned dims, unsigned size, + brw_predicate pred = BRW_PREDICATE_NONE); + + fs_reg + emit_untyped_atomic(const fs_builder &bld, + const fs_reg &surface, const fs_reg &addr, + const fs_reg &src0, const fs_reg &src1, + unsigned dims, unsigned rsize, unsigned op, + brw_predicate pred = BRW_PREDICATE_NONE); + + fs_reg + emit_typed_read(const fs_builder &bld, const fs_reg &surface, + const fs_reg &addr, unsigned dims, unsigned size); + + void + emit_typed_write(const fs_builder &bld, const fs_reg &surface, + const fs_reg &addr, const fs_reg &src, + unsigned dims, unsigned size); + + fs_reg + emit_typed_atomic(const fs_builder &bld, const fs_reg &surface, + const fs_reg &addr, + const fs_reg &src0, const fs_reg &src1, + unsigned dims, unsigned rsize, unsigned op, + brw_predicate pred = BRW_PREDICATE_NONE); + } + + namespace image_access { + fs_reg + emit_image_load(const fs_builder &bld, + const fs_reg &image, const fs_reg &addr, + unsigned surf_dims, unsigned arr_dims, + unsigned gl_format); + + void + emit_image_store(const fs_builder &bld, const fs_reg &image, + const fs_reg &addr, const fs_reg &src, + unsigned surf_dims, unsigned arr_dims, + unsigned gl_format); + fs_reg + emit_image_atomic(const fs_builder &bld, + const fs_reg &image, const fs_reg &addr, + const fs_reg &src0, const fs_reg &src1, + unsigned surf_dims, unsigned arr_dims, + unsigned rsize, unsigned op); + } +} +#endif diff --git a/src/intel/compiler/brw_fs_validate.cpp b/src/intel/compiler/brw_fs_validate.cpp new file mode 100644 index 00000000000..676942c19c0 
--- /dev/null +++ b/src/intel/compiler/brw_fs_validate.cpp @@ -0,0 +1,57 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_fs_validate.cpp + * + * Implements a pass that validates various invariants of the IR. The current + * pass only validates that GRF's uses are sane. More can be added later. 
+ */ + +#include "brw_fs.h" +#include "brw_cfg.h" + +#define fsv_assert(cond) \ + if (!(cond)) { \ + fprintf(stderr, "ASSERT: Scalar %s validation failed!\n", stage_abbrev); \ + dump_instruction(inst, stderr); \ + fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__, #cond); \ + abort(); \ + } + +void +fs_visitor::validate() +{ + foreach_block_and_inst (block, fs_inst, inst, cfg) { + if (inst->dst.file == VGRF) { + fsv_assert(inst->dst.offset / REG_SIZE + regs_written(inst) <= + alloc.sizes[inst->dst.nr]); + } + + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF) { + fsv_assert(inst->src[i].offset / REG_SIZE + regs_read(inst, i) <= + alloc.sizes[inst->src[i].nr]); + } + } + } +} diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp new file mode 100644 index 00000000000..cea38d86237 --- /dev/null +++ b/src/intel/compiler/brw_fs_visitor.cpp @@ -0,0 +1,953 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_fs_visitor.cpp + * + * This file supports generating the FS LIR from the GLSL IR. The LIR + * makes it easier to do backend-specific optimizations than doing so + * in the GLSL IR or in the native code. + */ +#include "brw_fs.h" +#include "compiler/glsl_types.h" + +using namespace brw; + +fs_reg * +fs_visitor::emit_vs_system_value(int location) +{ + fs_reg *reg = new(this->mem_ctx) + fs_reg(ATTR, 4 * _mesa_bitcount_64(nir->info->inputs_read), + BRW_REGISTER_TYPE_D); + struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data); + + switch (location) { + case SYSTEM_VALUE_BASE_VERTEX: + reg->offset = 0; + vs_prog_data->uses_basevertex = true; + break; + case SYSTEM_VALUE_BASE_INSTANCE: + reg->offset = REG_SIZE; + vs_prog_data->uses_baseinstance = true; + break; + case SYSTEM_VALUE_VERTEX_ID: + unreachable("should have been lowered"); + case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE: + reg->offset = 2 * REG_SIZE; + vs_prog_data->uses_vertexid = true; + break; + case SYSTEM_VALUE_INSTANCE_ID: + reg->offset = 3 * REG_SIZE; + vs_prog_data->uses_instanceid = true; + break; + case SYSTEM_VALUE_DRAW_ID: + if (nir->info->system_values_read & + (BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX) | + BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) | + BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) | + BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) + reg->nr += 4; + reg->offset = 0; + vs_prog_data->uses_drawid = true; + break; + default: + unreachable("not reached"); + } + + return reg; +} + +/* Sample from the MCS surface attached to this multisample texture. 
*/ +fs_reg +fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components, + const fs_reg &texture) +{ + const fs_reg dest = vgrf(glsl_type::uvec4_type); + + fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; + srcs[TEX_LOGICAL_SRC_COORDINATE] = coordinate; + srcs[TEX_LOGICAL_SRC_SURFACE] = texture; + srcs[TEX_LOGICAL_SRC_SAMPLER] = texture; + srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(components); + srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0); + + fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs, + ARRAY_SIZE(srcs)); + + /* We only care about one or two regs of response, but the sampler always + * writes 4/8. + */ + inst->size_written = 4 * dest.component_size(inst->exec_size); + + return dest; +} + +/** + * Apply workarounds for Gen6 gather with UINT/SINT + */ +void +fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst) +{ + if (!wa) + return; + + int width = (wa & WA_8BIT) ? 8 : 16; + + for (int i = 0; i < 4; i++) { + fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F); + /* Convert from UNORM to UINT */ + bld.MUL(dst_f, dst_f, brw_imm_f((1 << width) - 1)); + bld.MOV(dst, dst_f); + + if (wa & WA_SIGN) { + /* Reinterpret the UINT value as a signed INT value by + * shifting the sign bit into place, then shifting back + * preserving sign. + */ + bld.SHL(dst, dst, brw_imm_d(32 - width)); + bld.ASR(dst, dst, brw_imm_d(32 - width)); + } + + dst = offset(dst, bld, 1); + } +} + +/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */ +void +fs_visitor::emit_dummy_fs() +{ + int reg_width = dispatch_width / 8; + + /* Everyone's favorite color. 
*/ + const float color[4] = { 1.0, 0.0, 1.0, 0.0 }; + for (int i = 0; i < 4; i++) { + bld.MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F), + brw_imm_f(color[i])); + } + + fs_inst *write; + write = bld.emit(FS_OPCODE_FB_WRITE); + write->eot = true; + if (devinfo->gen >= 6) { + write->base_mrf = 2; + write->mlen = 4 * reg_width; + } else { + write->header_size = 2; + write->base_mrf = 0; + write->mlen = 2 + 4 * reg_width; + } + + /* Tell the SF we don't have any inputs. Gen4-5 require at least one + * varying to avoid GPU hangs, so set that. + */ + struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data); + wm_prog_data->num_varying_inputs = devinfo->gen < 6 ? 1 : 0; + memset(wm_prog_data->urb_setup, -1, + sizeof(wm_prog_data->urb_setup[0]) * VARYING_SLOT_MAX); + + /* We don't have any uniforms. */ + stage_prog_data->nr_params = 0; + stage_prog_data->nr_pull_params = 0; + stage_prog_data->curb_read_length = 0; + stage_prog_data->dispatch_grf_start_reg = 2; + wm_prog_data->dispatch_grf_start_reg_2 = 2; + grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */ + + calculate_cfg(); +} + +/* The register location here is relative to the start of the URB + * data. It will get adjusted to be a real location before + * generate_code() time. + */ +struct brw_reg +fs_visitor::interp_reg(int location, int channel) +{ + assert(stage == MESA_SHADER_FRAGMENT); + struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + int regnr = prog_data->urb_setup[location] * 2 + channel / 2; + int stride = (channel & 1) * 4; + + assert(prog_data->urb_setup[location] != -1); + + return brw_vec1_grf(regnr, stride); +} + +/** Emits the interpolation for the varying inputs. 
*/
void
fs_visitor::emit_interpolation_setup_gen4()
{
   struct brw_reg g1_as_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   /* Pixel centers: add per-pixel offsets, packed in a vector immediate,
    * to the X (g1.4) and Y (g1.5) words read from g1.
    */
   fs_builder center_bld = bld.annotate("compute pixel centers");
   this->pixel_x = vgrf(glsl_type::uint_type);
   this->pixel_y = vgrf(glsl_type::uint_type);
   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
   this->pixel_y.type = BRW_REGISTER_TYPE_UW;

   const fs_reg subspan_x(stride(suboffset(g1_as_uw, 4), 2, 4, 0));
   const fs_reg x_offsets(brw_imm_v(0x10101010));
   center_bld.ADD(this->pixel_x, subspan_x, x_offsets);

   const fs_reg subspan_y(stride(suboffset(g1_as_uw, 5), 2, 4, 0));
   const fs_reg y_offsets(brw_imm_v(0x11001100));
   center_bld.ADD(this->pixel_y, subspan_y, y_offsets);

   /* Deltas: pixel position plus the negated start values in g1.0/g1.1. */
   fs_builder delta_bld = bld.annotate("compute pixel deltas from v0");

   this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL] =
      vgrf(glsl_type::vec2_type);
   const fs_reg &deltas = this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL];
   const fs_reg neg_x0(negate(brw_vec1_grf(1, 0)));
   const fs_reg neg_y0(negate(brw_vec1_grf(1, 1)));

   if (!devinfo->has_pln || dispatch_width != 16) {
      delta_bld.ADD(offset(deltas, delta_bld, 0), this->pixel_x, neg_x0);
      delta_bld.ADD(offset(deltas, delta_bld, 1), this->pixel_y, neg_y0);
   } else {
      /* SIMD16 with PLN: for each half i the X deltas go into the first
       * half and the Y deltas into the second half of component i.
       */
      for (unsigned h = 0; h < 2; h++) {
         delta_bld.half(h).ADD(half(offset(deltas, delta_bld, h), 0),
                               half(this->pixel_x, h), neg_x0);
         delta_bld.half(h).ADD(half(offset(deltas, delta_bld, h), 1),
                               half(this->pixel_y, h), neg_y0);
      }
   }

   fs_builder w_bld = bld.annotate("compute pos.w and 1/pos.w");

   /* wpos.w is always part of the setup data because the other attributes
    * need it for their interpolation.
    */
   this->wpos_w = vgrf(glsl_type::float_type);
   w_bld.emit(FS_OPCODE_LINTERP, wpos_w, deltas,
              interp_reg(VARYING_SLOT_POS, 3));

   /* The per-pixel 1/W is the reciprocal of wpos.w. */
   this->pixel_w = vgrf(glsl_type::float_type);
   w_bld.emit(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
}

/** Emits the interpolation for the varying inputs.
*/ +void +fs_visitor::emit_interpolation_setup_gen6() +{ + struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); + + fs_builder abld = bld.annotate("compute pixel centers"); + if (devinfo->gen >= 8 || dispatch_width == 8) { + /* The "Register Region Restrictions" page says for BDW (and newer, + * presumably): + * + * "When destination spans two registers, the source may be one or + * two registers. The destination elements must be evenly split + * between the two registers." + * + * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16 to + * compute our pixel centers. + */ + fs_reg int_pixel_xy(VGRF, alloc.allocate(dispatch_width / 8), + BRW_REGISTER_TYPE_UW); + + const fs_builder dbld = abld.exec_all().group(dispatch_width * 2, 0); + dbld.ADD(int_pixel_xy, + fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)), + fs_reg(brw_imm_v(0x11001010))); + + this->pixel_x = vgrf(glsl_type::float_type); + this->pixel_y = vgrf(glsl_type::float_type); + abld.emit(FS_OPCODE_PIXEL_X, this->pixel_x, int_pixel_xy); + abld.emit(FS_OPCODE_PIXEL_Y, this->pixel_y, int_pixel_xy); + } else { + /* The "Register Region Restrictions" page says for SNB, IVB, HSW: + * + * "When destination spans two registers, the source MUST span two + * registers." + * + * Since the GRF source of the ADD will only read a single register, we + * must do two separate ADDs in SIMD16. + */ + fs_reg int_pixel_x = vgrf(glsl_type::uint_type); + fs_reg int_pixel_y = vgrf(glsl_type::uint_type); + int_pixel_x.type = BRW_REGISTER_TYPE_UW; + int_pixel_y.type = BRW_REGISTER_TYPE_UW; + abld.ADD(int_pixel_x, + fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)), + fs_reg(brw_imm_v(0x10101010))); + abld.ADD(int_pixel_y, + fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)), + fs_reg(brw_imm_v(0x11001100))); + + /* As of gen6, we can no longer mix float and int sources. We have + * to turn the integer pixel centers into floats for their actual + * use. 
+ */ + this->pixel_x = vgrf(glsl_type::float_type); + this->pixel_y = vgrf(glsl_type::float_type); + abld.MOV(this->pixel_x, int_pixel_x); + abld.MOV(this->pixel_y, int_pixel_y); + } + + abld = bld.annotate("compute pos.w"); + this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0)); + this->wpos_w = vgrf(glsl_type::float_type); + abld.emit(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w); + + struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(prog_data); + uint32_t centroid_modes = wm_prog_data->barycentric_interp_modes & + (1 << BRW_BARYCENTRIC_PERSPECTIVE_CENTROID | + 1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID); + + for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) { + uint8_t reg = payload.barycentric_coord_reg[i]; + this->delta_xy[i] = fs_reg(brw_vec16_grf(reg, 0)); + + if (devinfo->needs_unlit_centroid_workaround && + (centroid_modes & (1 << i))) { + /* Get the pixel/sample mask into f0 so that we know which + * pixels are lit. Then, for each channel that is unlit, + * replace the centroid data with non-centroid data. 
+ */ + bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS); + + uint8_t pixel_reg = payload.barycentric_coord_reg[i - 1]; + + set_predicate_inv(BRW_PREDICATE_NORMAL, true, + bld.half(0).MOV(brw_vec8_grf(reg, 0), + brw_vec8_grf(pixel_reg, 0))); + set_predicate_inv(BRW_PREDICATE_NORMAL, true, + bld.half(0).MOV(brw_vec8_grf(reg + 1, 0), + brw_vec8_grf(pixel_reg + 1, 0))); + if (dispatch_width == 16) { + set_predicate_inv(BRW_PREDICATE_NORMAL, true, + bld.half(1).MOV(brw_vec8_grf(reg + 2, 0), + brw_vec8_grf(pixel_reg + 2, 0))); + set_predicate_inv(BRW_PREDICATE_NORMAL, true, + bld.half(1).MOV(brw_vec8_grf(reg + 3, 0), + brw_vec8_grf(pixel_reg + 3, 0))); + } + assert(dispatch_width != 32); /* not implemented yet */ + } + } +} + +static enum brw_conditional_mod +cond_for_alpha_func(GLenum func) +{ + switch(func) { + case GL_GREATER: + return BRW_CONDITIONAL_G; + case GL_GEQUAL: + return BRW_CONDITIONAL_GE; + case GL_LESS: + return BRW_CONDITIONAL_L; + case GL_LEQUAL: + return BRW_CONDITIONAL_LE; + case GL_EQUAL: + return BRW_CONDITIONAL_EQ; + case GL_NOTEQUAL: + return BRW_CONDITIONAL_NEQ; + default: + unreachable("Not reached"); + } +} + +/** + * Alpha test support for when we compile it into the shader instead + * of using the normal fixed-function alpha test. 
+ */ +void +fs_visitor::emit_alpha_test() +{ + assert(stage == MESA_SHADER_FRAGMENT); + brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; + const fs_builder abld = bld.annotate("Alpha test"); + + fs_inst *cmp; + if (key->alpha_test_func == GL_ALWAYS) + return; + + if (key->alpha_test_func == GL_NEVER) { + /* f0.1 = 0 */ + fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0), + BRW_REGISTER_TYPE_UW)); + cmp = abld.CMP(bld.null_reg_f(), some_reg, some_reg, + BRW_CONDITIONAL_NEQ); + } else { + /* RT0 alpha */ + fs_reg color = offset(outputs[0], bld, 3); + + /* f0.1 &= func(color, ref) */ + cmp = abld.CMP(bld.null_reg_f(), color, brw_imm_f(key->alpha_test_ref), + cond_for_alpha_func(key->alpha_test_func)); + } + cmp->predicate = BRW_PREDICATE_NORMAL; + cmp->flag_subreg = 1; +} + +fs_inst * +fs_visitor::emit_single_fb_write(const fs_builder &bld, + fs_reg color0, fs_reg color1, + fs_reg src0_alpha, unsigned components) +{ + assert(stage == MESA_SHADER_FRAGMENT); + struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + + /* Hand over gl_FragDepth or the payload depth. */ + const fs_reg dst_depth = (payload.dest_depth_reg ? + fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0)) : + fs_reg()); + fs_reg src_depth, src_stencil; + + if (source_depth_to_render_target) { + if (nir->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) + src_depth = frag_depth; + else + src_depth = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)); + } + + if (nir->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) + src_stencil = frag_stencil; + + const fs_reg sources[] = { + color0, color1, src0_alpha, src_depth, dst_depth, src_stencil, + (prog_data->uses_omask ? 
sample_mask : fs_reg()), + brw_imm_ud(components) + }; + assert(ARRAY_SIZE(sources) - 1 == FB_WRITE_LOGICAL_SRC_COMPONENTS); + fs_inst *write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, fs_reg(), + sources, ARRAY_SIZE(sources)); + + if (prog_data->uses_kill) { + write->predicate = BRW_PREDICATE_NORMAL; + write->flag_subreg = 1; + } + + return write; +} + +void +fs_visitor::emit_fb_writes() +{ + assert(stage == MESA_SHADER_FRAGMENT); + struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; + + fs_inst *inst = NULL; + + if (source_depth_to_render_target && devinfo->gen == 6) { + /* For outputting oDepth on gen6, SIMD8 writes have to be used. This + * would require SIMD8 moves of each half to message regs, e.g. by using + * the SIMD lowering pass. Unfortunately this is more difficult than it + * sounds because the SIMD8 single-source message lacks channel selects + * for the second and third subspans. + */ + limit_dispatch_width(8, "Depth writes unsupported in SIMD16+ mode.\n"); + } + + if (nir->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) { + /* From the 'Render Target Write message' section of the docs: + * "Output Stencil is not supported with SIMD16 Render Target Write + * Messages." + */ + limit_dispatch_width(8, "gl_FragStencilRefARB unsupported " + "in SIMD16+ mode.\n"); + } + + for (int target = 0; target < key->nr_color_regions; target++) { + /* Skip over outputs that weren't written. 
*/ + if (this->outputs[target].file == BAD_FILE) + continue; + + const fs_builder abld = bld.annotate( + ralloc_asprintf(this->mem_ctx, "FB write target %d", target)); + + fs_reg src0_alpha; + if (devinfo->gen >= 6 && key->replicate_alpha && target != 0) + src0_alpha = offset(outputs[0], bld, 3); + + inst = emit_single_fb_write(abld, this->outputs[target], + this->dual_src_output, src0_alpha, 4); + inst->target = target; + } + + prog_data->dual_src_blend = (this->dual_src_output.file != BAD_FILE); + assert(!prog_data->dual_src_blend || key->nr_color_regions == 1); + + if (inst == NULL) { + /* Even if there's no color buffers enabled, we still need to send + * alpha out the pipeline to our null renderbuffer to support + * alpha-testing, alpha-to-coverage, and so on. + */ + /* FINISHME: Factor out this frequently recurring pattern into a + * helper function. + */ + const fs_reg srcs[] = { reg_undef, reg_undef, + reg_undef, offset(this->outputs[0], bld, 3) }; + const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 4); + bld.LOAD_PAYLOAD(tmp, srcs, 4, 0); + + inst = emit_single_fb_write(bld, tmp, reg_undef, reg_undef, 4); + inst->target = 0; + } + + inst->eot = true; +} + +void +fs_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes) +{ + const struct brw_vs_prog_key *key = + (const struct brw_vs_prog_key *) this->key; + + for (int i = 0; i < key->nr_userclip_plane_consts; i++) { + this->userplane[i] = fs_reg(UNIFORM, uniforms); + for (int j = 0; j < 4; ++j) { + stage_prog_data->param[uniforms + j] = + (gl_constant_value *) &clip_planes[i][j]; + } + uniforms += 4; + } +} + +/** + * Lower legacy fixed-function and gl_ClipVertex clipping to clip distances. + * + * This does nothing if the shader uses gl_ClipDistance or user clipping is + * disabled altogether. 
+ */ +void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes) +{ + struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data); + const struct brw_vs_prog_key *key = + (const struct brw_vs_prog_key *) this->key; + + /* Bail unless some sort of legacy clipping is enabled */ + if (key->nr_userclip_plane_consts == 0) + return; + + /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables): + * + * "If a linked set of shaders forming the vertex stage contains no + * static write to gl_ClipVertex or gl_ClipDistance, but the + * application has requested clipping against user clip planes through + * the API, then the coordinate written to gl_Position is used for + * comparison against the user clip planes." + * + * This function is only called if the shader didn't write to + * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping + * if the user wrote to it; otherwise we use gl_Position. + */ + + gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX; + if (!(vue_prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) + clip_vertex = VARYING_SLOT_POS; + + /* If the clip vertex isn't written, skip this. Typically this means + * the GS will set up clipping. 
*/ + if (outputs[clip_vertex].file == BAD_FILE) + return; + + setup_uniform_clipplane_values(clip_planes); + + const fs_builder abld = bld.annotate("user clip distances"); + + this->outputs[VARYING_SLOT_CLIP_DIST0] = vgrf(glsl_type::vec4_type); + this->outputs[VARYING_SLOT_CLIP_DIST1] = vgrf(glsl_type::vec4_type); + + for (int i = 0; i < key->nr_userclip_plane_consts; i++) { + fs_reg u = userplane[i]; + const fs_reg output = offset(outputs[VARYING_SLOT_CLIP_DIST0 + i / 4], + bld, i & 3); + + abld.MUL(output, outputs[clip_vertex], u); + for (int j = 1; j < 4; j++) { + u.nr = userplane[i].nr + j; + abld.MAD(output, output, offset(outputs[clip_vertex], bld, j), u); + } + } +} + +void +fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count) +{ + int slot, urb_offset, length; + int starting_urb_offset = 0; + const struct brw_vue_prog_data *vue_prog_data = + brw_vue_prog_data(this->prog_data); + const struct brw_vs_prog_key *vs_key = + (const struct brw_vs_prog_key *) this->key; + const GLbitfield64 psiz_mask = + VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ; + const struct brw_vue_map *vue_map = &vue_prog_data->vue_map; + bool flush; + fs_reg sources[8]; + fs_reg urb_handle; + + if (stage == MESA_SHADER_TESS_EVAL) + urb_handle = fs_reg(retype(brw_vec8_grf(4, 0), BRW_REGISTER_TYPE_UD)); + else + urb_handle = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); + + /* If we don't have any valid slots to write, just do a minimal urb write + * send to terminate the shader. This includes 1 slot of undefined data, + * because it's invalid to write 0 data: + * + * From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions - + * Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read > + * Write Data Payload: + * + * "The write data payload can be between 1 and 8 message phases long." + */ + if (vue_map->slots_valid == 0) { + /* For GS, just turn EmitVertex() into a no-op. 
We don't want it to + * end the thread, and emit_gs_thread_end() already emits a SEND with + * EOT at the end of the program for us. + */ + if (stage == MESA_SHADER_GEOMETRY) + return; + + fs_reg payload = fs_reg(VGRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD); + bld.exec_all().MOV(payload, urb_handle); + + fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload); + inst->eot = true; + inst->mlen = 2; + inst->offset = 1; + return; + } + + opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8; + int header_size = 1; + fs_reg per_slot_offsets; + + if (stage == MESA_SHADER_GEOMETRY) { + const struct brw_gs_prog_data *gs_prog_data = + brw_gs_prog_data(this->prog_data); + + /* We need to increment the Global Offset to skip over the control data + * header and the extra "Vertex Count" field (1 HWord) at the beginning + * of the VUE. We're counting in OWords, so the units are doubled. + */ + starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords; + if (gs_prog_data->static_vertex_count == -1) + starting_urb_offset += 2; + + /* We also need to use per-slot offsets. The per-slot offset is the + * Vertex Count. SIMD8 mode processes 8 different primitives at a + * time; each may output a different number of vertices. + */ + opcode = SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT; + header_size++; + + /* The URB offset is in 128-bit units, so we need to multiply by 2 */ + const int output_vertex_size_owords = + gs_prog_data->output_vertex_size_hwords * 2; + + if (gs_vertex_count.file == IMM) { + per_slot_offsets = brw_imm_ud(output_vertex_size_owords * + gs_vertex_count.ud); + } else { + per_slot_offsets = vgrf(glsl_type::int_type); + bld.MUL(per_slot_offsets, gs_vertex_count, + brw_imm_ud(output_vertex_size_owords)); + } + } + + length = 0; + urb_offset = starting_urb_offset; + flush = false; + + /* SSO shaders can have VUE slots allocated which are never actually + * written to, so ignore them when looking for the last (written) slot. 
+ */ + int last_slot = vue_map->num_slots - 1; + while (last_slot > 0 && + (vue_map->slot_to_varying[last_slot] == BRW_VARYING_SLOT_PAD || + outputs[vue_map->slot_to_varying[last_slot]].file == BAD_FILE)) { + last_slot--; + } + + for (slot = 0; slot < vue_map->num_slots; slot++) { + int varying = vue_map->slot_to_varying[slot]; + switch (varying) { + case VARYING_SLOT_PSIZ: { + /* The point size varying slot is the vue header and is always in the + * vue map. But often none of the special varyings that live there + * are written and in that case we can skip writing to the vue + * header, provided the corresponding state properly clamps the + * values further down the pipeline. */ + if ((vue_map->slots_valid & psiz_mask) == 0) { + assert(length == 0); + urb_offset++; + break; + } + + fs_reg zero(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + bld.MOV(zero, brw_imm_ud(0u)); + + sources[length++] = zero; + if (vue_map->slots_valid & VARYING_BIT_LAYER) + sources[length++] = this->outputs[VARYING_SLOT_LAYER]; + else + sources[length++] = zero; + + if (vue_map->slots_valid & VARYING_BIT_VIEWPORT) + sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT]; + else + sources[length++] = zero; + + if (vue_map->slots_valid & VARYING_BIT_PSIZ) + sources[length++] = this->outputs[VARYING_SLOT_PSIZ]; + else + sources[length++] = zero; + break; + } + case BRW_VARYING_SLOT_NDC: + case VARYING_SLOT_EDGE: + unreachable("unexpected scalar vs output"); + break; + + default: + /* gl_Position is always in the vue map, but isn't always written by + * the shader. Other varyings (clip distances) get added to the vue + * map but don't always get written. In those cases, the + * corresponding this->output[] slot will be invalid we and can skip + * the urb write for the varying. If we've already queued up a vue + * slot for writing we flush a mlen 5 urb write, otherwise we just + * advance the urb_offset. 
+ */ + if (varying == BRW_VARYING_SLOT_PAD || + this->outputs[varying].file == BAD_FILE) { + if (length > 0) + flush = true; + else + urb_offset++; + break; + } + + if (stage == MESA_SHADER_VERTEX && vs_key->clamp_vertex_color && + (varying == VARYING_SLOT_COL0 || + varying == VARYING_SLOT_COL1 || + varying == VARYING_SLOT_BFC0 || + varying == VARYING_SLOT_BFC1)) { + /* We need to clamp these guys, so do a saturating MOV into a + * temp register and use that for the payload. + */ + for (int i = 0; i < 4; i++) { + fs_reg reg = fs_reg(VGRF, alloc.allocate(1), outputs[varying].type); + fs_reg src = offset(this->outputs[varying], bld, i); + set_saturate(true, bld.MOV(reg, src)); + sources[length++] = reg; + } + } else { + for (unsigned i = 0; i < 4; i++) + sources[length++] = offset(this->outputs[varying], bld, i); + } + break; + } + + const fs_builder abld = bld.annotate("URB write"); + + /* If we've queued up 8 registers of payload (2 VUE slots), if this is + * the last slot or if we need to flush (see BAD_FILE varying case + * above), emit a URB write send now to flush out the data. 
+ */ + if (length == 8 || slot == last_slot) + flush = true; + if (flush) { + fs_reg *payload_sources = + ralloc_array(mem_ctx, fs_reg, length + header_size); + fs_reg payload = fs_reg(VGRF, alloc.allocate(length + header_size), + BRW_REGISTER_TYPE_F); + payload_sources[0] = urb_handle; + + if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT) + payload_sources[1] = per_slot_offsets; + + memcpy(&payload_sources[header_size], sources, + length * sizeof sources[0]); + + abld.LOAD_PAYLOAD(payload, payload_sources, length + header_size, + header_size); + + fs_inst *inst = abld.emit(opcode, reg_undef, payload); + inst->eot = slot == last_slot && stage != MESA_SHADER_GEOMETRY; + inst->mlen = length + header_size; + inst->offset = urb_offset; + urb_offset = starting_urb_offset + slot + 1; + length = 0; + flush = false; + } + } +} + +void +fs_visitor::emit_cs_terminate() +{ + assert(devinfo->gen >= 7); + + /* We are getting the thread ID from the compute shader header */ + assert(stage == MESA_SHADER_COMPUTE); + + /* We can't directly send from g0, since sends with EOT have to use + * g112-127. So, copy it to a virtual register, The register allocator will + * make sure it uses the appropriate register range. + */ + struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD); + fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + bld.group(8, 0).exec_all().MOV(payload, g0); + + /* Send a message to the thread spawner to terminate the thread. */ + fs_inst *inst = bld.exec_all() + .emit(CS_OPCODE_CS_TERMINATE, reg_undef, payload); + inst->eot = true; +} + +void +fs_visitor::emit_barrier() +{ + assert(devinfo->gen >= 7); + const uint32_t barrier_id_mask = + devinfo->gen >= 9 ? 
0x8f000000u : 0x0f000000u; + + /* We are getting the barrier ID from the compute shader header */ + assert(stage == MESA_SHADER_COMPUTE); + + fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + + const fs_builder pbld = bld.exec_all().group(8, 0); + + /* Clear the message payload */ + pbld.MOV(payload, brw_imm_ud(0u)); + + /* Copy the barrier id from r0.2 to the message payload reg.2 */ + fs_reg r0_2 = fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)); + pbld.AND(component(payload, 2), r0_2, brw_imm_ud(barrier_id_mask)); + + /* Emit a gateway "barrier" message using the payload we set up, followed + * by a wait instruction. + */ + bld.exec_all().emit(SHADER_OPCODE_BARRIER, reg_undef, payload); +} + +fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const void *key, + struct brw_stage_prog_data *prog_data, + struct gl_program *prog, + const nir_shader *shader, + unsigned dispatch_width, + int shader_time_index, + const struct brw_vue_map *input_vue_map) + : backend_shader(compiler, log_data, mem_ctx, shader, prog_data), + key(key), gs_compile(NULL), prog_data(prog_data), prog(prog), + input_vue_map(input_vue_map), + dispatch_width(dispatch_width), + shader_time_index(shader_time_index), + bld(fs_builder(this, dispatch_width).at_end()) +{ + init(); +} + +fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + struct brw_gs_compile *c, + struct brw_gs_prog_data *prog_data, + const nir_shader *shader, + int shader_time_index) + : backend_shader(compiler, log_data, mem_ctx, shader, + &prog_data->base.base), + key(&c->key), gs_compile(c), + prog_data(&prog_data->base.base), prog(NULL), + dispatch_width(8), + shader_time_index(shader_time_index), + bld(fs_builder(this, dispatch_width).at_end()) +{ + init(); +} + + +void +fs_visitor::init() +{ + switch (stage) { + case MESA_SHADER_FRAGMENT: + key_tex = &((const brw_wm_prog_key *) key)->tex; + break; + case 
MESA_SHADER_VERTEX: + key_tex = &((const brw_vs_prog_key *) key)->tex; + break; + case MESA_SHADER_TESS_CTRL: + key_tex = &((const brw_tcs_prog_key *) key)->tex; + break; + case MESA_SHADER_TESS_EVAL: + key_tex = &((const brw_tes_prog_key *) key)->tex; + break; + case MESA_SHADER_GEOMETRY: + key_tex = &((const brw_gs_prog_key *) key)->tex; + break; + case MESA_SHADER_COMPUTE: + key_tex = &((const brw_cs_prog_key*) key)->tex; + break; + default: + unreachable("unhandled shader stage"); + } + + if (stage == MESA_SHADER_COMPUTE) { + const struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data); + unsigned size = cs_prog_data->local_size[0] * + cs_prog_data->local_size[1] * + cs_prog_data->local_size[2]; + size = DIV_ROUND_UP(size, devinfo->max_cs_threads); + min_dispatch_width = size > 16 ? 32 : (size > 8 ? 16 : 8); + } else { + min_dispatch_width = 8; + } + + this->max_dispatch_width = 32; + this->prog_data = this->stage_prog_data; + + this->failed = false; + + this->nir_locals = NULL; + this->nir_ssa_values = NULL; + + memset(&this->payload, 0, sizeof(this->payload)); + this->source_depth_to_render_target = false; + this->runtime_check_aads_emit = false; + this->first_non_payload_grf = 0; + this->max_grf = devinfo->gen >= 7 ? 
GEN7_MRF_HACK_START : BRW_MAX_GRF; + + this->virtual_grf_start = NULL; + this->virtual_grf_end = NULL; + this->live_intervals = NULL; + this->regs_live_at_ip = NULL; + + this->uniforms = 0; + this->last_scratch = 0; + this->pull_constant_loc = NULL; + this->push_constant_loc = NULL; + + this->promoted_constants = 0, + + this->spilled_any_registers = false; +} + +fs_visitor::~fs_visitor() +{ +} diff --git a/src/intel/compiler/brw_inst.h b/src/intel/compiler/brw_inst.h new file mode 100644 index 00000000000..a0b8fb66dd6 --- /dev/null +++ b/src/intel/compiler/brw_inst.h @@ -0,0 +1,866 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file brw_inst.h + * + * A representation of i965 EU assembly instructions, with helper methods to + * get and set various fields. This is the actual hardware format. 
+ */ + +#ifndef BRW_INST_H +#define BRW_INST_H + +#include <assert.h> +#include <stdint.h> + +#include "brw_eu_defines.h" +#include "common/gen_device_info.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* brw_context.h has a forward declaration of brw_inst, so name the struct. */ +typedef struct brw_inst { + uint64_t data[2]; +} brw_inst; + +static inline uint64_t brw_inst_bits(const brw_inst *inst, + unsigned high, unsigned low); +static inline void brw_inst_set_bits(brw_inst *inst, + unsigned high, unsigned low, + uint64_t value); + +#define FC(name, high, low, assertions) \ +static inline void \ +brw_inst_set_##name(const struct gen_device_info *devinfo, \ + brw_inst *inst, uint64_t v) \ +{ \ + assert(assertions); \ + (void) devinfo; \ + brw_inst_set_bits(inst, high, low, v); \ +} \ +static inline uint64_t \ +brw_inst_##name(const struct gen_device_info *devinfo, \ + const brw_inst *inst) \ +{ \ + assert(assertions); \ + (void) devinfo; \ + return brw_inst_bits(inst, high, low); \ +} + +/* A simple macro for fields which stay in the same place on all generations. */ +#define F(name, high, low) FC(name, high, low, true) + +#define BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, hi7, lo7, hi8, lo8) \ + unsigned high, low; \ + if (devinfo->gen >= 8) { \ + high = hi8; low = lo8; \ + } else if (devinfo->gen >= 7) { \ + high = hi7; low = lo7; \ + } else if (devinfo->gen >= 6) { \ + high = hi6; low = lo6; \ + } else if (devinfo->gen >= 5) { \ + high = hi5; low = lo5; \ + } else if (devinfo->is_g4x) { \ + high = hi45; low = lo45; \ + } else { \ + high = hi4; low = lo4; \ + } \ + assert(((int) high) != -1 && ((int) low) != -1); \ + +/* A general macro for cases where the field has moved to several different + * bit locations across generations. GCC appears to combine cases where the + * bits are identical, removing some of the inefficiency. 
+ */ +#define FF(name, hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, hi7, lo7, hi8, lo8)\ +static inline void \ +brw_inst_set_##name(const struct gen_device_info *devinfo, \ + brw_inst *inst, uint64_t value) \ +{ \ + BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, hi7, lo7, hi8, lo8) \ + brw_inst_set_bits(inst, high, low, value); \ +} \ +static inline uint64_t \ +brw_inst_##name(const struct gen_device_info *devinfo, const brw_inst *inst) \ +{ \ + BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, hi7, lo7, hi8, lo8) \ + return brw_inst_bits(inst, high, low); \ +} + +/* A macro for fields which moved as of Gen8+. */ +#define F8(name, gen4_high, gen4_low, gen8_high, gen8_low) \ +FF(name, \ + /* 4: */ gen4_high, gen4_low, \ + /* 4.5: */ gen4_high, gen4_low, \ + /* 5: */ gen4_high, gen4_low, \ + /* 6: */ gen4_high, gen4_low, \ + /* 7: */ gen4_high, gen4_low, \ + /* 8: */ gen8_high, gen8_low); + +F(src1_vstride, 120, 117) +F(src1_width, 116, 114) +F(src1_da16_swiz_w, 115, 114) +F(src1_da16_swiz_z, 113, 112) +F(src1_hstride, 113, 112) +F(src1_address_mode, 111, 111) +/** Src1.SrcMod @{ */ +F(src1_negate, 110, 110) +F(src1_abs, 109, 109) +/** @} */ +F8(src1_ia_subreg_nr, /* 4+ */ 108, 106, /* 8+ */ 108, 105) +F(src1_da_reg_nr, 108, 101) +F(src1_da16_subreg_nr, 100, 100) +F(src1_da1_subreg_nr, 100, 96) +F(src1_da16_swiz_y, 99, 98) +F(src1_da16_swiz_x, 97, 96) +F8(src1_reg_type, /* 4+ */ 46, 44, /* 8+ */ 94, 91) +F8(src1_reg_file, /* 4+ */ 43, 42, /* 8+ */ 90, 89) +F(src0_vstride, 88, 85) +F(src0_width, 84, 82) +F(src0_da16_swiz_w, 83, 82) +F(src0_da16_swiz_z, 81, 80) +F(src0_hstride, 81, 80) +F(src0_address_mode, 79, 79) +/** Src0.SrcMod @{ */ +F(src0_negate, 78, 78) +F(src0_abs, 77, 77) +/** @} */ +F8(src0_ia_subreg_nr, /* 4+ */ 76, 74, /* 8+ */ 76, 73) +F(src0_da_reg_nr, 76, 69) +F(src0_da16_subreg_nr, 68, 68) +F(src0_da1_subreg_nr, 68, 64) +F(src0_da16_swiz_y, 67, 66) +F(src0_da16_swiz_x, 65, 64) +F(dst_address_mode, 63, 63) +F(dst_hstride, 62, 61) +F8(dst_ia_subreg_nr, /* 
4+ */ 60, 58, /* 8+ */ 60, 57) +F(dst_da_reg_nr, 60, 53) +F(dst_da16_subreg_nr, 52, 52) +F(dst_da1_subreg_nr, 52, 48) +F(da16_writemask, 51, 48) /* Dst.ChanEn */ +F8(src0_reg_type, /* 4+ */ 41, 39, /* 8+ */ 46, 43) +F8(src0_reg_file, /* 4+ */ 38, 37, /* 8+ */ 42, 41) +F8(dst_reg_type, /* 4+ */ 36, 34, /* 8+ */ 40, 37) +F8(dst_reg_file, /* 4+ */ 33, 32, /* 8+ */ 36, 35) +F8(mask_control, /* 4+ */ 9, 9, /* 8+ */ 34, 34) +FF(flag_reg_nr, + /* 4-6: doesn't exist */ -1, -1, -1, -1, -1, -1, -1, -1, + /* 7: */ 90, 90, + /* 8: */ 33, 33) +F8(flag_subreg_nr, /* 4+ */ 89, 89, /* 8+ */ 32, 32) +F(saturate, 31, 31) +F(debug_control, 30, 30) +F(cmpt_control, 29, 29) +FC(branch_control, 28, 28, devinfo->gen >= 8) +FC(acc_wr_control, 28, 28, devinfo->gen >= 6) +FC(mask_control_ex, 28, 28, devinfo->is_g4x || devinfo->gen == 5) +F(cond_modifier, 27, 24) +FC(math_function, 27, 24, devinfo->gen >= 6) +F(exec_size, 23, 21) +F(pred_inv, 20, 20) +F(pred_control, 19, 16) +F(thread_control, 15, 14) +F(qtr_control, 13, 12) +FF(nib_control, + /* 4-6: doesn't exist */ -1, -1, -1, -1, -1, -1, -1, -1, + /* 7: */ 47, 47, + /* 8: */ 11, 11) +F8(no_dd_check, /* 4+ */ 11, 11, /* 8+ */ 10, 10) +F8(no_dd_clear, /* 4+ */ 10, 10, /* 8+ */ 9, 9) +F(access_mode, 8, 8) +/* Bit 7 is Reserved (for future Opcode expansion) */ +F(opcode, 6, 0) + +/** + * Three-source instructions: + * @{ + */ +F(3src_src2_reg_nr, 125, 118) +F(3src_src2_subreg_nr, 117, 115) /* Extra discontiguous bit on CHV? */ +F(3src_src2_swizzle, 114, 107) +F(3src_src2_rep_ctrl, 106, 106) +F(3src_src1_reg_nr, 104, 97) +F(3src_src1_subreg_nr, 96, 94) /* Extra discontiguous bit on CHV? */ +F(3src_src1_swizzle, 93, 86) +F(3src_src1_rep_ctrl, 85, 85) +F(3src_src0_reg_nr, 83, 76) +F(3src_src0_subreg_nr, 75, 73) /* Extra discontiguous bit on CHV? 
*/ +F(3src_src0_swizzle, 72, 65) +F(3src_src0_rep_ctrl, 64, 64) +F(3src_dst_reg_nr, 63, 56) +F(3src_dst_subreg_nr, 55, 53) +F(3src_dst_writemask, 52, 49) +F8(3src_nib_ctrl, 47, 47, 11, 11) /* only exists on IVB+ */ +F8(3src_dst_type, 45, 44, 48, 46) /* only exists on IVB+ */ +F8(3src_src_type, 43, 42, 45, 43) +F8(3src_src2_negate, 41, 41, 42, 42) +F8(3src_src2_abs, 40, 40, 41, 41) +F8(3src_src1_negate, 39, 39, 40, 40) +F8(3src_src1_abs, 38, 38, 39, 39) +F8(3src_src0_negate, 37, 37, 38, 38) +F8(3src_src0_abs, 36, 36, 37, 37) +F8(3src_flag_reg_nr, 34, 34, 33, 33) +F8(3src_flag_subreg_nr, 33, 33, 32, 32) +FF(3src_dst_reg_file, + /* 4-5: doesn't exist - no 3-source instructions */ -1, -1, -1, -1, -1, -1, + /* 6: */ 32, 32, + /* 7-8: doesn't exist - no MRFs */ -1, -1, -1, -1) +F(3src_saturate, 31, 31) +F(3src_debug_control, 30, 30) +F(3src_cmpt_control, 29, 29) +F(3src_acc_wr_control, 28, 28) +F(3src_cond_modifier, 27, 24) +F(3src_exec_size, 23, 21) +F(3src_pred_inv, 20, 20) +F(3src_pred_control, 19, 16) +F(3src_thread_control, 15, 14) +F(3src_qtr_control, 13, 12) +F8(3src_no_dd_check, 11, 11, 10, 10) +F8(3src_no_dd_clear, 10, 10, 9, 9) +F8(3src_mask_control, 9, 9, 34, 34) +F(3src_access_mode, 8, 8) +/* Bit 7 is Reserved (for future Opcode expansion) */ +F(3src_opcode, 6, 0) +/** @} */ + +/** + * Flow control instruction bits: + * @{ + */ +static inline void +brw_inst_set_uip(const struct gen_device_info *devinfo, + brw_inst *inst, int32_t value) +{ + assert(devinfo->gen >= 6); + + if (devinfo->gen >= 8) { + brw_inst_set_bits(inst, 95, 64, (uint32_t)value); + } else { + assert(value <= (1 << 16) - 1); + assert(value > -(1 << 16)); + brw_inst_set_bits(inst, 127, 112, (uint16_t)value); + } +} + +static inline int32_t +brw_inst_uip(const struct gen_device_info *devinfo, const brw_inst *inst) +{ + assert(devinfo->gen >= 6); + + if (devinfo->gen >= 8) { + return brw_inst_bits(inst, 95, 64); + } else { + return (int16_t)brw_inst_bits(inst, 127, 112); + } +} + +static inline 
void +brw_inst_set_jip(const struct gen_device_info *devinfo, + brw_inst *inst, int32_t value) +{ + assert(devinfo->gen >= 6); + + if (devinfo->gen >= 8) { + brw_inst_set_bits(inst, 127, 96, (uint32_t)value); + } else { + assert(value <= (1 << 15) - 1); + assert(value >= -(1 << 15)); + brw_inst_set_bits(inst, 111, 96, (uint16_t)value); + } +} + +static inline int32_t +brw_inst_jip(const struct gen_device_info *devinfo, const brw_inst *inst) +{ + assert(devinfo->gen >= 6); + + if (devinfo->gen >= 8) { + return brw_inst_bits(inst, 127, 96); + } else { + return (int16_t)brw_inst_bits(inst, 111, 96); + } +} + +/** Like FC, but using int16_t to handle negative jump targets. */ +#define FJ(name, high, low, assertions) \ +static inline void \ +brw_inst_set_##name(const struct gen_device_info *devinfo, brw_inst *inst, int16_t v) \ +{ \ + assert(assertions); \ + (void) devinfo; \ + brw_inst_set_bits(inst, high, low, (uint16_t) v); \ +} \ +static inline int16_t \ +brw_inst_##name(const struct gen_device_info *devinfo, const brw_inst *inst) \ +{ \ + assert(assertions); \ + (void) devinfo; \ + return brw_inst_bits(inst, high, low); \ +} + +FJ(gen6_jump_count, 63, 48, devinfo->gen == 6) +FJ(gen4_jump_count, 111, 96, devinfo->gen < 6) +FC(gen4_pop_count, 115, 112, devinfo->gen < 6) +/** @} */ + +/* Message descriptor bits */ +#define MD(x) ((x) + 96) + +/** + * Fields for SEND messages: + * @{ + */ +F(eot, 127, 127) +FF(mlen, + /* 4: */ 119, 116, + /* 4.5: */ 119, 116, + /* 5: */ 124, 121, + /* 6: */ 124, 121, + /* 7: */ 124, 121, + /* 8: */ 124, 121); +FF(rlen, + /* 4: */ 115, 112, + /* 4.5: */ 115, 112, + /* 5: */ 120, 116, + /* 6: */ 120, 116, + /* 7: */ 120, 116, + /* 8: */ 120, 116); +FF(header_present, + /* 4: doesn't exist */ -1, -1, -1, -1, + /* 5: */ 115, 115, + /* 6: */ 115, 115, + /* 7: */ 115, 115, + /* 8: */ 115, 115) +F(gateway_notify, MD(16), MD(15)) +FF(function_control, + /* 4: */ 111, 96, + /* 4.5: */ 111, 96, + /* 5: */ 114, 96, + /* 6: */ 114, 96, + /* 7: */ 
114, 96, + /* 8: */ 114, 96) +FF(gateway_subfuncid, + /* 4: */ MD(1), MD(0), + /* 4.5: */ MD(1), MD(0), + /* 5: */ MD(1), MD(0), /* 2:0, but bit 2 is reserved MBZ */ + /* 6: */ MD(2), MD(0), + /* 7: */ MD(2), MD(0), + /* 8: */ MD(2), MD(0)) +FF(sfid, + /* 4: */ 123, 120, /* called msg_target */ + /* 4.5 */ 123, 120, + /* 5: */ 95, 92, + /* 6: */ 27, 24, + /* 7: */ 27, 24, + /* 8: */ 27, 24) +FC(base_mrf, 27, 24, devinfo->gen < 6); +/** @} */ + +/** + * URB message function control bits: + * @{ + */ +FF(urb_per_slot_offset, + /* 4-6: */ -1, -1, -1, -1, -1, -1, -1, -1, + /* 7: */ MD(16), MD(16), + /* 8: */ MD(17), MD(17)) +FC(urb_channel_mask_present, MD(15), MD(15), devinfo->gen >= 8) +FC(urb_complete, MD(15), MD(15), devinfo->gen < 8) +FC(urb_used, MD(14), MD(14), devinfo->gen < 7) +FC(urb_allocate, MD(13), MD(13), devinfo->gen < 7) +FF(urb_swizzle_control, + /* 4: */ MD(11), MD(10), + /* 4.5: */ MD(11), MD(10), + /* 5: */ MD(11), MD(10), + /* 6: */ MD(11), MD(10), + /* 7: */ MD(14), MD(14), + /* 8: */ MD(15), MD(15)) +FF(urb_global_offset, + /* 4: */ MD( 9), MD(4), + /* 4.5: */ MD( 9), MD(4), + /* 5: */ MD( 9), MD(4), + /* 6: */ MD( 9), MD(4), + /* 7: */ MD(13), MD(3), + /* 8: */ MD(14), MD(4)) +FF(urb_opcode, + /* 4: */ MD( 3), MD(0), + /* 4.5: */ MD( 3), MD(0), + /* 5: */ MD( 3), MD(0), + /* 6: */ MD( 3), MD(0), + /* 7: */ MD( 2), MD(0), + /* 8: */ MD( 3), MD(0)) +/** @} */ + +/** + * Gen4-5 math messages: + * @{ + */ +FC(math_msg_data_type, MD(7), MD(7), devinfo->gen < 6) +FC(math_msg_saturate, MD(6), MD(6), devinfo->gen < 6) +FC(math_msg_precision, MD(5), MD(5), devinfo->gen < 6) +FC(math_msg_signed_int, MD(4), MD(4), devinfo->gen < 6) +FC(math_msg_function, MD(3), MD(0), devinfo->gen < 6) +/** @} */ + +/** + * Sampler message function control bits: + * @{ + */ +FF(sampler_simd_mode, + /* 4: doesn't exist */ -1, -1, -1, -1, + /* 5: */ MD(17), MD(16), + /* 6: */ MD(17), MD(16), + /* 7: */ MD(18), MD(17), + /* 8: */ MD(18), MD(17)) +FF(sampler_msg_type, + /* 4: 
*/ MD(15), MD(14), + /* 4.5: */ MD(15), MD(12), + /* 5: */ MD(15), MD(12), + /* 6: */ MD(15), MD(12), + /* 7: */ MD(16), MD(12), + /* 8: */ MD(16), MD(12)) +FC(sampler_return_format, MD(13), MD(12), devinfo->gen == 4 && !devinfo->is_g4x) +F(sampler, MD(11), MD(8)) +F(binding_table_index, MD( 7), MD(0)) /* also used by other messages */ +/** @} */ + +/** + * Data port message function control bits: + * @{ + */ +FC(dp_category, MD(18), MD(18), devinfo->gen >= 7) + +/* Gen4-5 store fields in different bits for read/write messages. */ +FF(dp_read_msg_type, + /* 4: */ MD(13), MD(12), + /* 4.5: */ MD(13), MD(11), + /* 5: */ MD(13), MD(11), + /* 6: */ MD(16), MD(13), + /* 7: */ MD(17), MD(14), + /* 8: */ MD(17), MD(14)) +FF(dp_write_msg_type, + /* 4: */ MD(14), MD(12), + /* 4.5: */ MD(14), MD(12), + /* 5: */ MD(14), MD(12), + /* 6: */ MD(16), MD(13), + /* 7: */ MD(17), MD(14), + /* 8: */ MD(17), MD(14)) +FF(dp_read_msg_control, + /* 4: */ MD(11), MD( 8), + /* 4.5: */ MD(10), MD( 8), + /* 5: */ MD(10), MD( 8), + /* 6: */ MD(12), MD( 8), + /* 7: */ MD(13), MD( 8), + /* 8: */ MD(13), MD( 8)) +FF(dp_write_msg_control, + /* 4: */ MD(11), MD( 8), + /* 4.5: */ MD(11), MD( 8), + /* 5: */ MD(11), MD( 8), + /* 6: */ MD(12), MD( 8), + /* 7: */ MD(13), MD( 8), + /* 8: */ MD(13), MD( 8)) +FC(dp_read_target_cache, MD(15), MD(14), devinfo->gen < 6); + +FF(dp_write_commit, + /* 4: */ MD(15), MD(15), + /* 4.5: */ MD(15), MD(15), + /* 5: */ MD(15), MD(15), + /* 6: */ MD(17), MD(17), + /* 7+: does not exist */ -1, -1, -1, -1) + +/* Gen6+ use the same bit locations for everything. 
*/ +FF(dp_msg_type, + /* 4-5: use dp_read_msg_type or dp_write_msg_type instead */ + -1, -1, -1, -1, -1, -1, + /* 6: */ MD(16), MD(13), + /* 7: */ MD(17), MD(14), + /* 8: */ MD(17), MD(14)) +FF(dp_msg_control, + /* 4: */ MD(11), MD( 8), + /* 4.5-5: use dp_read_msg_control or dp_write_msg_control */ -1, -1, -1, -1, + /* 6: */ MD(12), MD( 8), + /* 7: */ MD(13), MD( 8), + /* 8: */ MD(13), MD( 8)) +/** @} */ + +/** + * Scratch message bits (Gen7+): + * @{ + */ +FC(scratch_read_write, MD(17), MD(17), devinfo->gen >= 7) /* 0 = read, 1 = write */ +FC(scratch_type, MD(16), MD(16), devinfo->gen >= 7) /* 0 = OWord, 1 = DWord */ +FC(scratch_invalidate_after_read, MD(15), MD(15), devinfo->gen >= 7) +FC(scratch_block_size, MD(13), MD(12), devinfo->gen >= 7) +FC(scratch_addr_offset, MD(11), MD( 0), devinfo->gen >= 7) +/** @} */ + +/** + * Render Target message function control bits: + * @{ + */ +FF(rt_last, + /* 4: */ MD(11), MD(11), + /* 4.5: */ MD(11), MD(11), + /* 5: */ MD(11), MD(11), + /* 6: */ MD(12), MD(12), + /* 7: */ MD(12), MD(12), + /* 8: */ MD(12), MD(12)) +FC(rt_slot_group, MD(11), MD(11), devinfo->gen >= 6) +F(rt_message_type, MD(10), MD( 8)) +/** @} */ + +/** + * Thread Spawn message function control bits: + * @{ + */ +F(ts_resource_select, MD( 4), MD( 4)) +F(ts_request_type, MD( 1), MD( 1)) +F(ts_opcode, MD( 0), MD( 0)) +/** @} */ + +/** + * Pixel Interpolator message function control bits: + * @{ + */ +F(pi_simd_mode, MD(16), MD(16)) +F(pi_nopersp, MD(14), MD(14)) +F(pi_message_type, MD(13), MD(12)) +F(pi_slot_group, MD(11), MD(11)) +F(pi_message_data, MD(7), MD(0)) +/** @} */ + +/** + * Immediates: + * @{ + */ +static inline int +brw_inst_imm_d(const struct gen_device_info *devinfo, const brw_inst *insn) +{ + (void) devinfo; + return brw_inst_bits(insn, 127, 96); +} + +static inline unsigned +brw_inst_imm_ud(const struct gen_device_info *devinfo, const brw_inst *insn) +{ + (void) devinfo; + return brw_inst_bits(insn, 127, 96); +} + +static inline float 
+brw_inst_imm_f(const struct gen_device_info *devinfo, const brw_inst *insn) +{ + union { + float f; + uint32_t u; + } ft; + (void) devinfo; + ft.u = brw_inst_bits(insn, 127, 96); + return ft.f; +} + +static inline double +brw_inst_imm_df(const struct gen_device_info *devinfo, const brw_inst *insn) +{ + union { + double d; + uint64_t u; + } dt; + (void) devinfo; + dt.u = brw_inst_bits(insn, 127, 64); + return dt.d; +} + +static inline void +brw_inst_set_imm_d(const struct gen_device_info *devinfo, + brw_inst *insn, int value) +{ + (void) devinfo; + return brw_inst_set_bits(insn, 127, 96, value); +} + +static inline void +brw_inst_set_imm_ud(const struct gen_device_info *devinfo, + brw_inst *insn, unsigned value) +{ + (void) devinfo; + return brw_inst_set_bits(insn, 127, 96, value); +} + +static inline void +brw_inst_set_imm_f(const struct gen_device_info *devinfo, + brw_inst *insn, float value) +{ + union { + float f; + uint32_t u; + } ft; + (void) devinfo; + ft.f = value; + brw_inst_set_bits(insn, 127, 96, ft.u); +} + +static inline void +brw_inst_set_imm_df(const struct gen_device_info *devinfo, + brw_inst *insn, double value) +{ + union { + double d; + uint64_t u; + } dt; + (void) devinfo; + dt.d = value; + brw_inst_set_bits(insn, 127, 64, dt.u); +} + +static inline void +brw_inst_set_imm_uq(const struct gen_device_info *devinfo, + brw_inst *insn, uint64_t value) +{ + (void) devinfo; + brw_inst_set_bits(insn, 127, 64, value); +} + +/** @} */ + +/* The AddrImm fields are split into two discontiguous sections on Gen8+ */ +#define BRW_IA1_ADDR_IMM(reg, g4_high, g4_low, g8_nine, g8_high, g8_low) \ +static inline void \ +brw_inst_set_##reg##_ia1_addr_imm(const struct gen_device_info *devinfo, \ + brw_inst *inst, \ + unsigned value) \ +{ \ + assert((value & ~0x3ff) == 0); \ + if (devinfo->gen >= 8) { \ + brw_inst_set_bits(inst, g8_high, g8_low, value & 0x1ff); \ + brw_inst_set_bits(inst, g8_nine, g8_nine, value >> 9); \ + } else { \ + brw_inst_set_bits(inst, g4_high, 
g4_low, value); \ + } \ +} \ +static inline unsigned \ +brw_inst_##reg##_ia1_addr_imm(const struct gen_device_info *devinfo, \ + const brw_inst *inst) \ +{ \ + if (devinfo->gen >= 8) { \ + return brw_inst_bits(inst, g8_high, g8_low) | \ + (brw_inst_bits(inst, g8_nine, g8_nine) << 9); \ + } else { \ + return brw_inst_bits(inst, g4_high, g4_low); \ + } \ +} + +/* AddrImm[9:0] for Align1 Indirect Addressing */ +/* -Gen 4- ----Gen8---- */ +BRW_IA1_ADDR_IMM(src1, 105, 96, 121, 104, 96) +BRW_IA1_ADDR_IMM(src0, 73, 64, 95, 72, 64) +BRW_IA1_ADDR_IMM(dst, 57, 48, 47, 56, 48) + +#define BRW_IA16_ADDR_IMM(reg, g4_high, g4_low, g8_nine, g8_high, g8_low) \ +static inline void \ +brw_inst_set_##reg##_ia16_addr_imm(const struct gen_device_info *devinfo, \ + brw_inst *inst, unsigned value) \ +{ \ + assert((value & ~0x3ff) == 0); \ + if (devinfo->gen >= 8) { \ + brw_inst_set_bits(inst, g8_high, g8_low, value & 0x1ff); \ + brw_inst_set_bits(inst, g8_nine, g8_nine, value >> 9); \ + } else { \ + brw_inst_set_bits(inst, g4_high, g4_low, value >> 9); \ + } \ +} \ +static inline unsigned \ +brw_inst_##reg##_ia16_addr_imm(const struct gen_device_info *devinfo, \ + const brw_inst *inst) \ +{ \ + if (devinfo->gen >= 8) { \ + return brw_inst_bits(inst, g8_high, g8_low) | \ + (brw_inst_bits(inst, g8_nine, g8_nine) << 9); \ + } else { \ + return brw_inst_bits(inst, g4_high, g4_low); \ + } \ +} + +/* AddrImm[9:0] for Align16 Indirect Addressing: + * Compared to Align1, these are missing the low 4 bits. + * -Gen 4- ----Gen8---- + */ +BRW_IA16_ADDR_IMM(src1, 105, 96, 121, 104, 100) +BRW_IA16_ADDR_IMM(src0, 73, 64, 95, 72, 68) +BRW_IA16_ADDR_IMM(dst, 57, 52, 47, 56, 52) + +/** + * Fetch a set of contiguous bits from the instruction. + * + * Bits indices range from 0..127; fields may not cross 64-bit boundaries. + */ +static inline uint64_t +brw_inst_bits(const brw_inst *inst, unsigned high, unsigned low) +{ + /* We assume the field doesn't cross 64-bit boundaries. 
*/ + const unsigned word = high / 64; + assert(word == low / 64); + + high %= 64; + low %= 64; + + const uint64_t mask = (~0ull >> (64 - (high - low + 1))); + + return (inst->data[word] >> low) & mask; +} + +/** + * Set bits in the instruction, with proper shifting and masking. + * + * Bits indices range from 0..127; fields may not cross 64-bit boundaries. + */ +static inline void +brw_inst_set_bits(brw_inst *inst, unsigned high, unsigned low, uint64_t value) +{ + const unsigned word = high / 64; + assert(word == low / 64); + + high %= 64; + low %= 64; + + const uint64_t mask = (~0ull >> (64 - (high - low + 1))) << low; + + /* Make sure the supplied value actually fits in the given bitfield. */ + assert((value & (mask >> low)) == value); + + inst->data[word] = (inst->data[word] & ~mask) | (value << low); +} + +#undef BRW_IA16_ADDR_IMM +#undef BRW_IA1_ADDR_IMM +#undef MD +#undef F8 +#undef FF +#undef BOUNDS +#undef F +#undef FC + +typedef struct { + uint64_t data; +} brw_compact_inst; + +/** + * Fetch a set of contiguous bits from the compacted instruction. + * + * Bits indices range from 0..63. + */ +static inline unsigned +brw_compact_inst_bits(const brw_compact_inst *inst, unsigned high, unsigned low) +{ + const uint64_t mask = (1ull << (high - low + 1)) - 1; + + return (inst->data >> low) & mask; +} + +/** + * Set bits in the compacted instruction. + * + * Bits indices range from 0..63. + */ +static inline void +brw_compact_inst_set_bits(brw_compact_inst *inst, unsigned high, unsigned low, + uint64_t value) +{ + const uint64_t mask = ((1ull << (high - low + 1)) - 1) << low; + + /* Make sure the supplied value actually fits in the given bitfield. 
*/ + assert((value & (mask >> low)) == value); + + inst->data = (inst->data & ~mask) | (value << low); +} + +#define FC(name, high, low, assertions) \ +static inline void \ +brw_compact_inst_set_##name(const struct gen_device_info *devinfo, \ + brw_compact_inst *inst, unsigned v) \ +{ \ + assert(assertions); \ + (void) devinfo; \ + brw_compact_inst_set_bits(inst, high, low, v); \ +} \ +static inline unsigned \ +brw_compact_inst_##name(const struct gen_device_info *devinfo, \ + const brw_compact_inst *inst) \ +{ \ + assert(assertions); \ + (void) devinfo; \ + return brw_compact_inst_bits(inst, high, low); \ +} + +/* A simple macro for fields which stay in the same place on all generations. */ +#define F(name, high, low) FC(name, high, low, true) + +F(src1_reg_nr, 63, 56) +F(src0_reg_nr, 55, 48) +F(dst_reg_nr, 47, 40) +F(src1_index, 39, 35) +F(src0_index, 34, 30) +F(cmpt_control, 29, 29) /* Same location as brw_inst */ +FC(flag_subreg_nr, 28, 28, devinfo->gen <= 6) +F(cond_modifier, 27, 24) /* Same location as brw_inst */ +FC(acc_wr_control, 23, 23, devinfo->gen >= 6) +FC(mask_control_ex, 23, 23, devinfo->is_g4x || devinfo->gen == 5) +F(subreg_index, 22, 18) +F(datatype_index, 17, 13) +F(control_index, 12, 8) +F(debug_control, 7, 7) +F(opcode, 6, 0) /* Same location as brw_inst */ + +/** + * (Gen8+) Compacted three-source instructions: + * @{ + */ +FC(3src_src2_reg_nr, 63, 57, devinfo->gen >= 8) +FC(3src_src1_reg_nr, 56, 50, devinfo->gen >= 8) +FC(3src_src0_reg_nr, 49, 43, devinfo->gen >= 8) +FC(3src_src2_subreg_nr, 42, 40, devinfo->gen >= 8) +FC(3src_src1_subreg_nr, 39, 37, devinfo->gen >= 8) +FC(3src_src0_subreg_nr, 36, 34, devinfo->gen >= 8) +FC(3src_src2_rep_ctrl, 33, 33, devinfo->gen >= 8) +FC(3src_src1_rep_ctrl, 32, 32, devinfo->gen >= 8) +FC(3src_saturate, 31, 31, devinfo->gen >= 8) +FC(3src_debug_control, 30, 30, devinfo->gen >= 8) +FC(3src_cmpt_control, 29, 29, devinfo->gen >= 8) +FC(3src_src0_rep_ctrl, 28, 28, devinfo->gen >= 8) +/* Reserved */ 
+FC(3src_dst_reg_nr, 18, 12, devinfo->gen >= 8) +FC(3src_source_index, 11, 10, devinfo->gen >= 8) +FC(3src_control_index, 9, 8, devinfo->gen >= 8) +/* Bit 7 is Reserved (for future Opcode expansion) */ +FC(3src_opcode, 6, 0, devinfo->gen >= 8) +/** @} */ + +#undef F + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/intel/compiler/brw_interpolation_map.c b/src/intel/compiler/brw_interpolation_map.c new file mode 100644 index 00000000000..7b9f58eb6ee --- /dev/null +++ b/src/intel/compiler/brw_interpolation_map.c @@ -0,0 +1,109 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "brw_compiler.h" +#include "compiler/nir/nir.h" + +static char const *get_qual_name(int mode) +{ + switch (mode) { + case INTERP_MODE_NONE: return "none"; + case INTERP_MODE_FLAT: return "flat"; + case INTERP_MODE_SMOOTH: return "smooth"; + case INTERP_MODE_NOPERSPECTIVE: return "nopersp"; + default: return "???"; + } +} + +static void +gen4_frag_prog_set_interp_modes(struct brw_wm_prog_data *prog_data, + struct brw_vue_map *vue_map, + unsigned location, unsigned slot_count, + enum glsl_interp_mode interp) +{ + for (unsigned k = 0; k < slot_count; k++) { + unsigned slot = vue_map->varying_to_slot[location + k]; + if (slot != -1 && prog_data->interp_mode[slot] == INTERP_MODE_NONE) { + prog_data->interp_mode[slot] = interp; + + if (prog_data->interp_mode[slot] == INTERP_MODE_FLAT) { + prog_data->contains_flat_varying = true; + } else if (prog_data->interp_mode[slot] == INTERP_MODE_NOPERSPECTIVE) { + prog_data->contains_noperspective_varying = true; + } + } + } +} + +/* Set up interpolation modes for every element in the VUE */ +void +brw_setup_vue_interpolation(struct brw_vue_map *vue_map, nir_shader *nir, + struct brw_wm_prog_data *prog_data, + const struct gen_device_info *devinfo) +{ + /* Initialise interp_mode. INTERP_MODE_NONE == 0 */ + memset(prog_data->interp_mode, 0, sizeof(prog_data->interp_mode)); + + if (!vue_map) + return; + + /* HPOS always wants noperspective. setting it up here allows + * us to not need special handling in the SF program. 
+ */ + unsigned pos_slot = vue_map->varying_to_slot[VARYING_SLOT_POS]; + if (pos_slot != -1) {; + prog_data->interp_mode[pos_slot] = INTERP_MODE_NOPERSPECTIVE; + prog_data->contains_noperspective_varying = true; + } + + foreach_list_typed(nir_variable, var, node, &nir->inputs) { + unsigned location = var->data.location; + unsigned slot_count = glsl_count_attribute_slots(var->type, false); + + gen4_frag_prog_set_interp_modes(prog_data, vue_map, location, slot_count, + var->data.interpolation); + + if (location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1) { + location = location + VARYING_SLOT_BFC0 - VARYING_SLOT_COL0; + gen4_frag_prog_set_interp_modes(prog_data, vue_map, location, + slot_count, var->data.interpolation); + } + } + + bool debug = false; + if (debug) { + fprintf(stderr, "VUE map:\n"); + for (int i = 0; i < vue_map->num_slots; i++) { + int varying = vue_map->slot_to_varying[i]; + if (varying == -1) { + fprintf(stderr, "%d: --\n", i); + continue; + } + + fprintf(stderr, "%d: %d %s ofs %d\n", + i, varying, + get_qual_name(prog_data->interp_mode[i]), + brw_vue_slot_to_offset(i)); + } + } +} diff --git a/src/intel/compiler/brw_ir_allocator.h b/src/intel/compiler/brw_ir_allocator.h new file mode 100644 index 00000000000..b1237ed38e7 --- /dev/null +++ b/src/intel/compiler/brw_ir_allocator.h @@ -0,0 +1,87 @@ +/* -*- c++ -*- */ +/* + * Copyright © 2010-2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all 
copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef BRW_IR_ALLOCATOR_H +#define BRW_IR_ALLOCATOR_H + +#include "main/macros.h" + +namespace brw { + /** + * Simple allocator used to keep track of virtual GRFs. + */ + class simple_allocator { + public: + simple_allocator() : + sizes(NULL), offsets(NULL), count(0), total_size(0), capacity(0) + { + } + + ~simple_allocator() + { + free(offsets); + free(sizes); + } + + unsigned + allocate(unsigned size) + { + if (capacity <= count) { + capacity = MAX2(16, capacity * 2); + sizes = (unsigned *)realloc(sizes, capacity * sizeof(unsigned)); + offsets = (unsigned *)realloc(offsets, capacity * sizeof(unsigned)); + } + + sizes[count] = size; + offsets[count] = total_size; + total_size += size; + + return count++; + } + + /** + * Array of sizes for each allocation. The allocation unit is up to the + * back-end, but it's expected to be one scalar value in the FS back-end + * and one vec4 in the VEC4 back-end. + */ + unsigned *sizes; + + /** + * Array of offsets from the start of the VGRF space in allocation + * units. + */ + unsigned *offsets; + + /** Total number of VGRFs allocated. */ + unsigned count; + + /** Cumulative size in allocation units. 
*/ + unsigned total_size; + + private: + unsigned capacity; + }; +} + +#endif diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h new file mode 100644 index 00000000000..cad371248c4 --- /dev/null +++ b/src/intel/compiler/brw_ir_fs.h @@ -0,0 +1,451 @@ +/* -*- c++ -*- */ +/* + * Copyright © 2010-2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef BRW_IR_FS_H +#define BRW_IR_FS_H + +#include "brw_shader.h" + +class fs_inst; + +class fs_reg : public backend_reg { +public: + DECLARE_RALLOC_CXX_OPERATORS(fs_reg) + + void init(); + + fs_reg(); + fs_reg(struct ::brw_reg reg); + fs_reg(enum brw_reg_file file, int nr); + fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type); + + bool equals(const fs_reg &r) const; + bool is_contiguous() const; + + /** + * Return the size in bytes of a single logical component of the + * register assuming the given execution width. + */ + unsigned component_size(unsigned width) const; + + /** Register region horizontal stride */ + uint8_t stride; +}; + +static inline fs_reg +negate(fs_reg reg) +{ + assert(reg.file != IMM); + reg.negate = !reg.negate; + return reg; +} + +static inline fs_reg +retype(fs_reg reg, enum brw_reg_type type) +{ + reg.type = type; + return reg; +} + +static inline fs_reg +byte_offset(fs_reg reg, unsigned delta) +{ + switch (reg.file) { + case BAD_FILE: + break; + case VGRF: + case ATTR: + case UNIFORM: + reg.offset += delta; + break; + case MRF: { + const unsigned suboffset = reg.offset + delta; + reg.nr += suboffset / REG_SIZE; + reg.offset = suboffset % REG_SIZE; + break; + } + case ARF: + case FIXED_GRF: { + const unsigned suboffset = reg.subnr + delta; + reg.nr += suboffset / REG_SIZE; + reg.subnr = suboffset % REG_SIZE; + break; + } + case IMM: + default: + assert(delta == 0); + } + return reg; +} + +static inline fs_reg +horiz_offset(const fs_reg ®, unsigned delta) +{ + switch (reg.file) { + case BAD_FILE: + case UNIFORM: + case IMM: + /* These only have a single component that is implicitly splatted. A + * horizontal offset should be a harmless no-op. + * XXX - Handle vector immediates correctly. 
+ */ + return reg; + case VGRF: + case MRF: + case ATTR: + return byte_offset(reg, delta * reg.stride * type_sz(reg.type)); + case ARF: + case FIXED_GRF: + if (reg.is_null()) { + return reg; + } else { + const unsigned stride = reg.hstride ? 1 << (reg.hstride - 1) : 0; + return byte_offset(reg, delta * stride * type_sz(reg.type)); + } + } + unreachable("Invalid register file"); +} + +static inline fs_reg +offset(fs_reg reg, unsigned width, unsigned delta) +{ + switch (reg.file) { + case BAD_FILE: + break; + case ARF: + case FIXED_GRF: + case MRF: + case VGRF: + case ATTR: + case UNIFORM: + return byte_offset(reg, delta * reg.component_size(width)); + case IMM: + assert(delta == 0); + } + return reg; +} + +/** + * Get the scalar channel of \p reg given by \p idx and replicate it to all + * channels of the result. + */ +static inline fs_reg +component(fs_reg reg, unsigned idx) +{ + reg = horiz_offset(reg, idx); + reg.stride = 0; + return reg; +} + +/** + * Return an integer identifying the discrete address space a register is + * contained in. A register is by definition fully contained in the single + * reg_space it belongs to, so two registers with different reg_space ids are + * guaranteed not to overlap. Most register files are a single reg_space of + * its own, only the VGRF file is composed of multiple discrete address + * spaces, one for each VGRF allocation. + */ +static inline uint32_t +reg_space(const fs_reg &r) +{ + return r.file << 16 | (r.file == VGRF ? r.nr : 0); +} + +/** + * Return the base offset in bytes of a register relative to the start of its + * reg_space(). + */ +static inline unsigned +reg_offset(const fs_reg &r) +{ + return (r.file == VGRF || r.file == IMM ? 0 : r.nr) * + (r.file == UNIFORM ? 4 : REG_SIZE) + r.offset + + (r.file == ARF || r.file == FIXED_GRF ? 
r.subnr : 0); +} + +/** + * Return the amount of padding in bytes left unused between individual + * components of register \p r due to a (horizontal) stride value greater than + * one, or zero if components are tightly packed in the register file. + */ +static inline unsigned +reg_padding(const fs_reg &r) +{ + const unsigned stride = ((r.file != ARF && r.file != FIXED_GRF) ? r.stride : + r.hstride == 0 ? 0 : + 1 << (r.hstride - 1)); + return (MAX2(1, stride) - 1) * type_sz(r.type); +} + +/** + * Return whether the register region starting at \p r and spanning \p dr + * bytes could potentially overlap the register region starting at \p s and + * spanning \p ds bytes. + */ +static inline bool +regions_overlap(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds) +{ + if (r.file == MRF && (r.nr & BRW_MRF_COMPR4)) { + fs_reg t = r; + t.nr &= ~BRW_MRF_COMPR4; + /* COMPR4 regions are translated by the hardware during decompression + * into two separate half-regions 4 MRFs apart from each other. + */ + return regions_overlap(t, dr / 2, s, ds) || + regions_overlap(byte_offset(t, 4 * REG_SIZE), dr / 2, s, ds); + + } else if (s.file == MRF && (s.nr & BRW_MRF_COMPR4)) { + return regions_overlap(s, ds, r, dr); + + } else { + return reg_space(r) == reg_space(s) && + !(reg_offset(r) + dr <= reg_offset(s) || + reg_offset(s) + ds <= reg_offset(r)); + } +} + +/** + * Check that the register region given by r [r.offset, r.offset + dr[ + * is fully contained inside the register region given by s + * [s.offset, s.offset + ds[. + */ +static inline bool +region_contained_in(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds) +{ + return reg_space(r) == reg_space(s) && + reg_offset(r) >= reg_offset(s) && + reg_offset(r) + dr <= reg_offset(s) + ds; +} + +/** + * Return whether the given register region is n-periodic, i.e. whether the + * original region remains invariant after shifting it by \p n scalar + * channels. 
+ */ +static inline bool +is_periodic(const fs_reg ®, unsigned n) +{ + if (reg.file == BAD_FILE || reg.is_null()) { + return true; + + } else if (reg.file == IMM) { + const unsigned period = (reg.type == BRW_REGISTER_TYPE_UV || + reg.type == BRW_REGISTER_TYPE_V ? 8 : + reg.type == BRW_REGISTER_TYPE_VF ? 4 : + 1); + return n % period == 0; + + } else if (reg.file == ARF || reg.file == FIXED_GRF) { + const unsigned period = (reg.hstride == 0 && reg.vstride == 0 ? 1 : + reg.vstride == 0 ? 1 << reg.width : + ~0); + return n % period == 0; + + } else { + return reg.stride == 0; + } +} + +static inline bool +is_uniform(const fs_reg ®) +{ + return is_periodic(reg, 1); +} + +/** + * Get the specified 8-component quarter of a register. + * XXX - Maybe come up with a less misleading name for this (e.g. quarter())? + */ +static inline fs_reg +half(const fs_reg ®, unsigned idx) +{ + assert(idx < 2); + return horiz_offset(reg, 8 * idx); +} + +/** + * Reinterpret each channel of register \p reg as a vector of values of the + * given smaller type and take the i-th subcomponent from each. + */ +static inline fs_reg +subscript(fs_reg reg, brw_reg_type type, unsigned i) +{ + assert((i + 1) * type_sz(type) <= type_sz(reg.type)); + + if (reg.file == ARF || reg.file == FIXED_GRF) { + /* The stride is encoded inconsistently for fixed GRF and ARF registers + * as the log2 of the actual vertical and horizontal strides. + */ + const int delta = _mesa_logbase2(type_sz(reg.type)) - + _mesa_logbase2(type_sz(type)); + reg.hstride += (reg.hstride ? delta : 0); + reg.vstride += (reg.vstride ? 
delta : 0); + + } else if (reg.file == IMM) { + assert(reg.type == type); + + } else { + reg.stride *= type_sz(reg.type) / type_sz(type); + } + + return byte_offset(retype(reg, type), i * type_sz(type)); +} + +static const fs_reg reg_undef; + +class fs_inst : public backend_instruction { + fs_inst &operator=(const fs_inst &); + + void init(enum opcode opcode, uint8_t exec_width, const fs_reg &dst, + const fs_reg *src, unsigned sources); + +public: + DECLARE_RALLOC_CXX_OPERATORS(fs_inst) + + fs_inst(); + fs_inst(enum opcode opcode, uint8_t exec_size); + fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst); + fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg &src0); + fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg &src0, const fs_reg &src1); + fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg &src0, const fs_reg &src1, const fs_reg &src2); + fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg src[], unsigned sources); + fs_inst(const fs_inst &that); + ~fs_inst(); + + void resize_sources(uint8_t num_sources); + + bool equals(fs_inst *inst) const; + bool is_send_from_grf() const; + bool is_partial_write() const; + bool is_copy_payload(const brw::simple_allocator &grf_alloc) const; + unsigned components_read(unsigned i) const; + unsigned size_read(int arg) const; + bool can_do_source_mods(const struct gen_device_info *devinfo); + bool can_change_types() const; + bool has_side_effects() const; + bool has_source_and_destination_hazard() const; + + /** + * Return the subset of flag registers read by the instruction as a bitset + * with byte granularity. + */ + unsigned flags_read(const gen_device_info *devinfo) const; + + /** + * Return the subset of flag registers updated by the instruction (either + * partially or fully) as a bitset with byte granularity. 
+ */ + unsigned flags_written() const; + + fs_reg dst; + fs_reg *src; + + uint8_t sources; /**< Number of fs_reg sources. */ + + bool eot:1; + bool pi_noperspective:1; /**< Pixel interpolator noperspective flag */ +}; + +/** + * Make the execution of \p inst dependent on the evaluation of a possibly + * inverted predicate. + */ +static inline fs_inst * +set_predicate_inv(enum brw_predicate pred, bool inverse, + fs_inst *inst) +{ + inst->predicate = pred; + inst->predicate_inverse = inverse; + return inst; +} + +/** + * Make the execution of \p inst dependent on the evaluation of a predicate. + */ +static inline fs_inst * +set_predicate(enum brw_predicate pred, fs_inst *inst) +{ + return set_predicate_inv(pred, false, inst); +} + +/** + * Write the result of evaluating the condition given by \p mod to a flag + * register. + */ +static inline fs_inst * +set_condmod(enum brw_conditional_mod mod, fs_inst *inst) +{ + inst->conditional_mod = mod; + return inst; +} + +/** + * Clamp the result of \p inst to the saturation range of its destination + * datatype. + */ +static inline fs_inst * +set_saturate(bool saturate, fs_inst *inst) +{ + inst->saturate = saturate; + return inst; +} + +/** + * Return the number of dataflow registers written by the instruction (either + * fully or partially) counted from 'floor(reg_offset(inst->dst) / + * register_size)'. The somewhat arbitrary register size unit is 4B for the + * UNIFORM and IMM files and 32B for all other files. + */ +inline unsigned +regs_written(const fs_inst *inst) +{ + assert(inst->dst.file != UNIFORM && inst->dst.file != IMM); + return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE + + inst->size_written - + MIN2(inst->size_written, reg_padding(inst->dst)), + REG_SIZE); +} + +/** + * Return the number of dataflow registers read by the instruction (either + * fully or partially) counted from 'floor(reg_offset(inst->src[i]) / + * register_size)'. 
The somewhat arbitrary register size unit is 4B for the + * UNIFORM and IMM files and 32B for all other files. + */ +inline unsigned +regs_read(const fs_inst *inst, unsigned i) +{ + const unsigned reg_size = + inst->src[i].file == UNIFORM || inst->src[i].file == IMM ? 4 : REG_SIZE; + return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size + + inst->size_read(i) - + MIN2(inst->size_read(i), reg_padding(inst->src[i])), + reg_size); +} + +#endif diff --git a/src/intel/compiler/brw_ir_vec4.h b/src/intel/compiler/brw_ir_vec4.h new file mode 100644 index 00000000000..bd026eb2aeb --- /dev/null +++ b/src/intel/compiler/brw_ir_vec4.h @@ -0,0 +1,409 @@ +/* -*- c++ -*- */ +/* + * Copyright © 2011-2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef BRW_IR_VEC4_H +#define BRW_IR_VEC4_H + +#include "brw_shader.h" + +namespace brw { + +class dst_reg; + +class src_reg : public backend_reg +{ +public: + DECLARE_RALLOC_CXX_OPERATORS(src_reg) + + void init(); + + src_reg(enum brw_reg_file file, int nr, const glsl_type *type); + src_reg(); + src_reg(struct ::brw_reg reg); + + bool equals(const src_reg &r) const; + + src_reg(class vec4_visitor *v, const struct glsl_type *type); + src_reg(class vec4_visitor *v, const struct glsl_type *type, int size); + + explicit src_reg(const dst_reg &reg); + + src_reg *reladdr; +}; + +static inline src_reg +retype(src_reg reg, enum brw_reg_type type) +{ + reg.type = type; + return reg; +} + +namespace detail { + +static inline void +add_byte_offset(backend_reg *reg, unsigned bytes) +{ + switch (reg->file) { + case BAD_FILE: + break; + case VGRF: + case ATTR: + case UNIFORM: + reg->offset += bytes; + assert(reg->offset % 16 == 0); + break; + case MRF: { + const unsigned suboffset = reg->offset + bytes; + reg->nr += suboffset / REG_SIZE; + reg->offset = suboffset % REG_SIZE; + assert(reg->offset % 16 == 0); + break; + } + case ARF: + case FIXED_GRF: { + const unsigned suboffset = reg->subnr + bytes; + reg->nr += suboffset / REG_SIZE; + reg->subnr = suboffset % REG_SIZE; + assert(reg->subnr % 16 == 0); + break; + } + default: + assert(bytes == 0); + } +} + +} /* namepace detail */ + +static inline src_reg +byte_offset(src_reg reg, unsigned bytes) +{ + detail::add_byte_offset(&reg, bytes); + return reg; +} + +static inline src_reg +offset(src_reg reg, unsigned width, unsigned delta) +{ + const unsigned stride = (reg.file == UNIFORM ? 0 : 4); + const unsigned num_components = MAX2(width / 4 * stride, 4); + return byte_offset(reg, num_components * type_sz(reg.type) * delta); +} + +static inline src_reg +horiz_offset(src_reg reg, unsigned delta) +{ + return byte_offset(reg, delta * type_sz(reg.type)); +} + +/** + * Reswizzle a given source register. + * \sa brw_swizzle(). 
+ */ +static inline src_reg +swizzle(src_reg reg, unsigned swizzle) +{ + if (reg.file == IMM) + reg.ud = brw_swizzle_immediate(reg.type, reg.ud, swizzle); + else + reg.swizzle = brw_compose_swizzle(swizzle, reg.swizzle); + + return reg; +} + +static inline src_reg +negate(src_reg reg) +{ + assert(reg.file != IMM); + reg.negate = !reg.negate; + return reg; +} + +static inline bool +is_uniform(const src_reg &reg) +{ + return (reg.file == IMM || reg.file == UNIFORM || reg.is_null()) && + (!reg.reladdr || is_uniform(*reg.reladdr)); +} + +class dst_reg : public backend_reg +{ +public: + DECLARE_RALLOC_CXX_OPERATORS(dst_reg) + + void init(); + + dst_reg(); + dst_reg(enum brw_reg_file file, int nr); + dst_reg(enum brw_reg_file file, int nr, const glsl_type *type, + unsigned writemask); + dst_reg(enum brw_reg_file file, int nr, brw_reg_type type, + unsigned writemask); + dst_reg(struct ::brw_reg reg); + dst_reg(class vec4_visitor *v, const struct glsl_type *type); + + explicit dst_reg(const src_reg &reg); + + bool equals(const dst_reg &r) const; + + src_reg *reladdr; +}; + +static inline dst_reg +retype(dst_reg reg, enum brw_reg_type type) +{ + reg.type = type; + return reg; +} + +static inline dst_reg +byte_offset(dst_reg reg, unsigned bytes) +{ + detail::add_byte_offset(&reg, bytes); + return reg; +} + +static inline dst_reg +offset(dst_reg reg, unsigned width, unsigned delta) +{ + const unsigned stride = (reg.file == UNIFORM ? 0 : 4); + const unsigned num_components = MAX2(width / 4 * stride, 4); + return byte_offset(reg, num_components * type_sz(reg.type) * delta); +} + +static inline dst_reg +horiz_offset(dst_reg reg, unsigned delta) +{ + return byte_offset(reg, delta * type_sz(reg.type)); +} + +static inline dst_reg +writemask(dst_reg reg, unsigned mask) +{ + assert(reg.file != IMM); + assert((reg.writemask & mask) != 0); + reg.writemask &= mask; + return reg; +} + +/** + * Return an integer identifying the discrete address space a register is + * contained in. 
A register is by definition fully contained in the single + * reg_space it belongs to, so two registers with different reg_space ids are + * guaranteed not to overlap. Most register files are a single reg_space of + * its own, only the VGRF file is composed of multiple discrete address + * spaces, one for each VGRF allocation. + */ +static inline uint32_t +reg_space(const backend_reg &r) +{ + return r.file << 16 | (r.file == VGRF ? r.nr : 0); +} + +/** + * Return the base offset in bytes of a register relative to the start of its + * reg_space(). + */ +static inline unsigned +reg_offset(const backend_reg &r) +{ + return (r.file == VGRF || r.file == IMM ? 0 : r.nr) * + (r.file == UNIFORM ? 16 : REG_SIZE) + r.offset + + (r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0); +} + +/** + * Return whether the register region starting at \p r and spanning \p dr + * bytes could potentially overlap the register region starting at \p s and + * spanning \p ds bytes. + */ +static inline bool +regions_overlap(const backend_reg &r, unsigned dr, + const backend_reg &s, unsigned ds) +{ + if (r.file == MRF && (r.nr & BRW_MRF_COMPR4)) { + /* COMPR4 regions are translated by the hardware during decompression + * into two separate half-regions 4 MRFs apart from each other. 
+ */ + backend_reg t0 = r; + t0.nr &= ~BRW_MRF_COMPR4; + backend_reg t1 = t0; + t1.offset += 4 * REG_SIZE; + return regions_overlap(t0, dr / 2, s, ds) || + regions_overlap(t1, dr / 2, s, ds); + + } else if (s.file == MRF && (s.nr & BRW_MRF_COMPR4)) { + return regions_overlap(s, ds, r, dr); + + } else { + return reg_space(r) == reg_space(s) && + !(reg_offset(r) + dr <= reg_offset(s) || + reg_offset(s) + ds <= reg_offset(r)); + } +} + +class vec4_instruction : public backend_instruction { +public: + DECLARE_RALLOC_CXX_OPERATORS(vec4_instruction) + + vec4_instruction(enum opcode opcode, + const dst_reg &dst = dst_reg(), + const src_reg &src0 = src_reg(), + const src_reg &src1 = src_reg(), + const src_reg &src2 = src_reg()); + + dst_reg dst; + src_reg src[3]; + + enum brw_urb_write_flags urb_write_flags; + + unsigned sol_binding; /**< gen6: SOL binding table index */ + bool sol_final_write; /**< gen6: send commit message */ + unsigned sol_vertex; /**< gen6: used for setting dst index in SVB header */ + + bool is_send_from_grf(); + unsigned size_read(unsigned arg) const; + bool can_reswizzle(const struct gen_device_info *devinfo, int dst_writemask, + int swizzle, int swizzle_mask); + void reswizzle(int dst_writemask, int swizzle); + bool can_do_source_mods(const struct gen_device_info *devinfo); + bool can_do_writemask(const struct gen_device_info *devinfo); + bool can_change_types() const; + bool has_source_and_destination_hazard() const; + + bool is_align1_partial_write() + { + return opcode == VEC4_OPCODE_SET_LOW_32BIT || + opcode == VEC4_OPCODE_SET_HIGH_32BIT; + } + + bool reads_flag() + { + return predicate || opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2; + } + + bool reads_flag(unsigned c) + { + if (opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2) + return true; + + switch (predicate) { + case BRW_PREDICATE_NONE: + return false; + case BRW_PREDICATE_ALIGN16_REPLICATE_X: + return c == 0; + case BRW_PREDICATE_ALIGN16_REPLICATE_Y: + return c == 1; + case 
BRW_PREDICATE_ALIGN16_REPLICATE_Z: + return c == 2; + case BRW_PREDICATE_ALIGN16_REPLICATE_W: + return c == 3; + default: + return true; + } + } + + bool writes_flag() + { + return (conditional_mod && (opcode != BRW_OPCODE_SEL && + opcode != BRW_OPCODE_IF && + opcode != BRW_OPCODE_WHILE)); + } +}; + +/** + * Make the execution of \p inst dependent on the evaluation of a possibly + * inverted predicate. + */ +inline vec4_instruction * +set_predicate_inv(enum brw_predicate pred, bool inverse, + vec4_instruction *inst) +{ + inst->predicate = pred; + inst->predicate_inverse = inverse; + return inst; +} + +/** + * Make the execution of \p inst dependent on the evaluation of a predicate. + */ +inline vec4_instruction * +set_predicate(enum brw_predicate pred, vec4_instruction *inst) +{ + return set_predicate_inv(pred, false, inst); +} + +/** + * Write the result of evaluating the condition given by \p mod to a flag + * register. + */ +inline vec4_instruction * +set_condmod(enum brw_conditional_mod mod, vec4_instruction *inst) +{ + inst->conditional_mod = mod; + return inst; +} + +/** + * Clamp the result of \p inst to the saturation range of its destination + * datatype. + */ +inline vec4_instruction * +set_saturate(bool saturate, vec4_instruction *inst) +{ + inst->saturate = saturate; + return inst; +} + +/** + * Return the number of dataflow registers written by the instruction (either + * fully or partially) counted from 'floor(reg_offset(inst->dst) / + * register_size)'. The somewhat arbitrary register size unit is 16B for the + * UNIFORM and IMM files and 32B for all other files. 
+ */ +inline unsigned +regs_written(const vec4_instruction *inst) +{ + assert(inst->dst.file != UNIFORM && inst->dst.file != IMM); + return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE + inst->size_written, + REG_SIZE); +} + +/** + * Return the number of dataflow registers read by the instruction (either + * fully or partially) counted from 'floor(reg_offset(inst->src[i]) / + * register_size)'. The somewhat arbitrary register size unit is 16B for the + * UNIFORM and IMM files and 32B for all other files. + */ +inline unsigned +regs_read(const vec4_instruction *inst, unsigned i) +{ + const unsigned reg_size = + inst->src[i].file == UNIFORM || inst->src[i].file == IMM ? 16 : REG_SIZE; + return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size + inst->size_read(i), + reg_size); +} + +} /* namespace brw */ + +#endif diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c new file mode 100644 index 00000000000..f86308521e9 --- /dev/null +++ b/src/intel/compiler/brw_nir.c @@ -0,0 +1,764 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_nir.h" +#include "brw_shader.h" +#include "common/gen_debug.h" +#include "compiler/glsl_types.h" +#include "compiler/nir/nir_builder.h" + +static bool +is_input(nir_intrinsic_instr *intrin) +{ + return intrin->intrinsic == nir_intrinsic_load_input || + intrin->intrinsic == nir_intrinsic_load_per_vertex_input || + intrin->intrinsic == nir_intrinsic_load_interpolated_input; +} + +static bool +is_output(nir_intrinsic_instr *intrin) +{ + return intrin->intrinsic == nir_intrinsic_load_output || + intrin->intrinsic == nir_intrinsic_load_per_vertex_output || + intrin->intrinsic == nir_intrinsic_store_output || + intrin->intrinsic == nir_intrinsic_store_per_vertex_output; +} + +/** + * In many cases, we just add the base and offset together, so there's no + * reason to keep them separate. Sometimes, combining them is essential: + * if a shader only accesses part of a compound variable (such as a matrix + * or array), the variable's base may not actually exist in the VUE map. + * + * This pass adds constant offsets to instr->const_index[0], and resets + * the offset source to 0. Non-constant offsets remain unchanged - since + * we don't know what part of a compound variable is accessed, we allocate + * storage for the entire thing. 
+ */ + +static bool +add_const_offset_to_base_block(nir_block *block, nir_builder *b, + nir_variable_mode mode) +{ + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + if ((mode == nir_var_shader_in && is_input(intrin)) || + (mode == nir_var_shader_out && is_output(intrin))) { + nir_src *offset = nir_get_io_offset_src(intrin); + nir_const_value *const_offset = nir_src_as_const_value(*offset); + + if (const_offset) { + intrin->const_index[0] += const_offset->u32[0]; + b->cursor = nir_before_instr(&intrin->instr); + nir_instr_rewrite_src(&intrin->instr, offset, + nir_src_for_ssa(nir_imm_int(b, 0))); + } + } + } + return true; +} + +static void +add_const_offset_to_base(nir_shader *nir, nir_variable_mode mode) +{ + nir_foreach_function(f, nir) { + if (f->impl) { + nir_builder b; + nir_builder_init(&b, f->impl); + nir_foreach_block(block, f->impl) { + add_const_offset_to_base_block(block, &b, mode); + } + } + } +} + +static bool +remap_vs_attrs(nir_block *block, shader_info *nir_info) +{ + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + if (intrin->intrinsic == nir_intrinsic_load_input) { + /* Attributes come in a contiguous block, ordered by their + * gl_vert_attrib value. That means we can compute the slot + * number for an attribute by masking out the enabled attributes + * before it and counting the bits. 
+ */ + int attr = intrin->const_index[0]; + int slot = _mesa_bitcount_64(nir_info->inputs_read & + BITFIELD64_MASK(attr)); + intrin->const_index[0] = 4 * slot; + } + } + return true; +} + +static bool +remap_inputs_with_vue_map(nir_block *block, const struct brw_vue_map *vue_map) +{ + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + if (intrin->intrinsic == nir_intrinsic_load_input || + intrin->intrinsic == nir_intrinsic_load_per_vertex_input) { + int vue_slot = vue_map->varying_to_slot[intrin->const_index[0]]; + assert(vue_slot != -1); + intrin->const_index[0] = vue_slot; + } + } + return true; +} + +static bool +remap_tess_levels(nir_builder *b, nir_intrinsic_instr *intr, + GLenum primitive_mode) +{ + const int location = nir_intrinsic_base(intr); + const unsigned component = nir_intrinsic_component(intr); + bool out_of_bounds; + + if (location == VARYING_SLOT_TESS_LEVEL_INNER) { + switch (primitive_mode) { + case GL_QUADS: + /* gl_TessLevelInner[0..1] lives at DWords 3-2 (reversed). */ + nir_intrinsic_set_base(intr, 0); + nir_intrinsic_set_component(intr, 3 - component); + out_of_bounds = false; + break; + case GL_TRIANGLES: + /* gl_TessLevelInner[0] lives at DWord 4. */ + nir_intrinsic_set_base(intr, 1); + out_of_bounds = component > 0; + break; + case GL_ISOLINES: + out_of_bounds = true; + break; + default: + unreachable("Bogus tessellation domain"); + } + } else if (location == VARYING_SLOT_TESS_LEVEL_OUTER) { + if (primitive_mode == GL_ISOLINES) { + /* gl_TessLevelOuter[0..1] lives at DWords 6-7 (in order). 
*/ + nir_intrinsic_set_base(intr, 1); + nir_intrinsic_set_component(intr, 2 + nir_intrinsic_component(intr)); + out_of_bounds = component > 1; + } else { + /* Triangles use DWords 7-5 (reversed); Quads use 7-4 (reversed) */ + nir_intrinsic_set_base(intr, 1); + nir_intrinsic_set_component(intr, 3 - nir_intrinsic_component(intr)); + out_of_bounds = component == 3 && primitive_mode == GL_TRIANGLES; + } + } else { + return false; + } + + if (out_of_bounds) { + if (nir_intrinsic_infos[intr->intrinsic].has_dest) { + b->cursor = nir_before_instr(&intr->instr); + nir_ssa_def *undef = nir_ssa_undef(b, 1, 32); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(undef)); + } + nir_instr_remove(&intr->instr); + } + + return true; +} + +static bool +remap_patch_urb_offsets(nir_block *block, nir_builder *b, + const struct brw_vue_map *vue_map, + GLenum tes_primitive_mode) +{ + const bool is_passthrough_tcs = b->shader->info->name && + strcmp(b->shader->info->name, "passthrough") == 0; + + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + gl_shader_stage stage = b->shader->stage; + + if ((stage == MESA_SHADER_TESS_CTRL && is_output(intrin)) || + (stage == MESA_SHADER_TESS_EVAL && is_input(intrin))) { + + if (!is_passthrough_tcs && + remap_tess_levels(b, intrin, tes_primitive_mode)) + continue; + + int vue_slot = vue_map->varying_to_slot[intrin->const_index[0]]; + assert(vue_slot != -1); + intrin->const_index[0] = vue_slot; + + nir_src *vertex = nir_get_io_vertex_index_src(intrin); + if (vertex) { + nir_const_value *const_vertex = nir_src_as_const_value(*vertex); + if (const_vertex) { + intrin->const_index[0] += const_vertex->u32[0] * + vue_map->num_per_vertex_slots; + } else { + b->cursor = nir_before_instr(&intrin->instr); + + /* Multiply by the number of per-vertex slots. 
*/ + nir_ssa_def *vertex_offset = + nir_imul(b, + nir_ssa_for_src(b, *vertex, 1), + nir_imm_int(b, + vue_map->num_per_vertex_slots)); + + /* Add it to the existing offset */ + nir_src *offset = nir_get_io_offset_src(intrin); + nir_ssa_def *total_offset = + nir_iadd(b, vertex_offset, + nir_ssa_for_src(b, *offset, 1)); + + nir_instr_rewrite_src(&intrin->instr, offset, + nir_src_for_ssa(total_offset)); + } + } + } + } + return true; +} + +void +brw_nir_lower_vs_inputs(nir_shader *nir, + bool is_scalar, + bool use_legacy_snorm_formula, + const uint8_t *vs_attrib_wa_flags) +{ + /* Start with the location of the variable's base. */ + foreach_list_typed(nir_variable, var, node, &nir->inputs) { + var->data.driver_location = var->data.location; + } + + /* Now use nir_lower_io to walk dereference chains. Attribute arrays are + * loaded as one vec4 or dvec4 per element (or matrix column), depending on + * whether it is a double-precision type or not. + */ + nir_lower_io(nir, nir_var_shader_in, type_size_vec4, 0); + + /* This pass needs actual constants */ + nir_opt_constant_folding(nir); + + add_const_offset_to_base(nir, nir_var_shader_in); + + brw_nir_apply_attribute_workarounds(nir, use_legacy_snorm_formula, + vs_attrib_wa_flags); + + if (is_scalar) { + /* Finally, translate VERT_ATTRIB_* values into the actual registers. */ + + nir_foreach_function(function, nir) { + if (function->impl) { + nir_foreach_block(block, function->impl) { + remap_vs_attrs(block, nir->info); + } + } + } + } +} + +void +brw_nir_lower_vue_inputs(nir_shader *nir, bool is_scalar, + const struct brw_vue_map *vue_map) +{ + foreach_list_typed(nir_variable, var, node, &nir->inputs) { + var->data.driver_location = var->data.location; + } + + /* Inputs are stored in vec4 slots, so use type_size_vec4(). 
*/ + nir_lower_io(nir, nir_var_shader_in, type_size_vec4, 0); + + if (is_scalar || nir->stage != MESA_SHADER_GEOMETRY) { + /* This pass needs actual constants */ + nir_opt_constant_folding(nir); + + add_const_offset_to_base(nir, nir_var_shader_in); + + nir_foreach_function(function, nir) { + if (function->impl) { + nir_foreach_block(block, function->impl) { + remap_inputs_with_vue_map(block, vue_map); + } + } + } + } +} + +void +brw_nir_lower_tes_inputs(nir_shader *nir, const struct brw_vue_map *vue_map) +{ + foreach_list_typed(nir_variable, var, node, &nir->inputs) { + var->data.driver_location = var->data.location; + } + + nir_lower_io(nir, nir_var_shader_in, type_size_vec4, 0); + + /* This pass needs actual constants */ + nir_opt_constant_folding(nir); + + add_const_offset_to_base(nir, nir_var_shader_in); + + nir_foreach_function(function, nir) { + if (function->impl) { + nir_builder b; + nir_builder_init(&b, function->impl); + nir_foreach_block(block, function->impl) { + remap_patch_urb_offsets(block, &b, vue_map, + nir->info->tess.primitive_mode); + } + } + } +} + +void +brw_nir_lower_fs_inputs(nir_shader *nir, + const struct gen_device_info *devinfo, + const struct brw_wm_prog_key *key) +{ + foreach_list_typed(nir_variable, var, node, &nir->inputs) { + var->data.driver_location = var->data.location; + + /* Apply default interpolation mode. + * + * Everything defaults to smooth except for the legacy GL color + * built-in variables, which might be flat depending on API state. + */ + if (var->data.interpolation == INTERP_MODE_NONE) { + const bool flat = key->flat_shade && + (var->data.location == VARYING_SLOT_COL0 || + var->data.location == VARYING_SLOT_COL1); + + var->data.interpolation = flat ? INTERP_MODE_FLAT + : INTERP_MODE_SMOOTH; + } + + /* On Ironlake and below, there is only one interpolation mode. + * Centroid interpolation doesn't mean anything on this hardware -- + * there is no multisampling. 
+ */ + if (devinfo->gen < 6) { + var->data.centroid = false; + var->data.sample = false; + } + } + + nir_lower_io_options lower_io_options = 0; + if (key->persample_interp) + lower_io_options |= nir_lower_io_force_sample_interpolation; + + nir_lower_io(nir, nir_var_shader_in, type_size_vec4, lower_io_options); + + /* This pass needs actual constants */ + nir_opt_constant_folding(nir); + + add_const_offset_to_base(nir, nir_var_shader_in); +} + +void +brw_nir_lower_vue_outputs(nir_shader *nir, + bool is_scalar) +{ + nir_foreach_variable(var, &nir->outputs) { + var->data.driver_location = var->data.location; + } + + nir_lower_io(nir, nir_var_shader_out, type_size_vec4, 0); +} + +void +brw_nir_lower_tcs_outputs(nir_shader *nir, const struct brw_vue_map *vue_map, + GLenum tes_primitive_mode) +{ + nir_foreach_variable(var, &nir->outputs) { + var->data.driver_location = var->data.location; + } + + nir_lower_io(nir, nir_var_shader_out, type_size_vec4, 0); + + /* This pass needs actual constants */ + nir_opt_constant_folding(nir); + + add_const_offset_to_base(nir, nir_var_shader_out); + + nir_foreach_function(function, nir) { + if (function->impl) { + nir_builder b; + nir_builder_init(&b, function->impl); + nir_foreach_block(block, function->impl) { + remap_patch_urb_offsets(block, &b, vue_map, tes_primitive_mode); + } + } + } +} + +void +brw_nir_lower_fs_outputs(nir_shader *nir) +{ + nir_foreach_variable(var, &nir->outputs) { + var->data.driver_location = + SET_FIELD(var->data.index, BRW_NIR_FRAG_OUTPUT_INDEX) | + SET_FIELD(var->data.location, BRW_NIR_FRAG_OUTPUT_LOCATION); + } + + nir_lower_io(nir, nir_var_shader_out, type_size_dvec4, 0); +} + +void +brw_nir_lower_cs_shared(nir_shader *nir) +{ + nir_assign_var_locations(&nir->shared, &nir->num_shared, + type_size_scalar_bytes); + nir_lower_io(nir, nir_var_shared, type_size_scalar_bytes, 0); +} + +#define OPT(pass, ...) 
({ \ + bool this_progress = false; \ + NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \ + if (this_progress) \ + progress = true; \ + this_progress; \ +}) + +#define OPT_V(pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__) + +static nir_shader * +nir_optimize(nir_shader *nir, const struct brw_compiler *compiler, + bool is_scalar) +{ + nir_variable_mode indirect_mask = 0; + if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectInput) + indirect_mask |= nir_var_shader_in; + if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectOutput) + indirect_mask |= nir_var_shader_out; + if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectTemp) + indirect_mask |= nir_var_local; + + bool progress; + do { + progress = false; + OPT_V(nir_lower_vars_to_ssa); + OPT(nir_opt_copy_prop_vars); + + if (is_scalar) { + OPT(nir_lower_alu_to_scalar); + } + + OPT(nir_copy_prop); + + if (is_scalar) { + OPT(nir_lower_phis_to_scalar); + } + + OPT(nir_copy_prop); + OPT(nir_opt_dce); + OPT(nir_opt_cse); + OPT(nir_opt_peephole_select, 0); + OPT(nir_opt_algebraic); + OPT(nir_opt_constant_folding); + OPT(nir_opt_dead_cf); + if (OPT(nir_opt_trivial_continues)) { + /* If nir_opt_trivial_continues makes progress, then we need to clean + * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll + * to make progress. 
+ */ + OPT(nir_copy_prop); + OPT(nir_opt_dce); + } + OPT(nir_opt_if); + if (nir->options->max_unroll_iterations != 0) { + OPT(nir_opt_loop_unroll, indirect_mask); + } + OPT(nir_opt_remove_phis); + OPT(nir_opt_undef); + OPT_V(nir_lower_doubles, nir_lower_drcp | + nir_lower_dsqrt | + nir_lower_drsq | + nir_lower_dtrunc | + nir_lower_dfloor | + nir_lower_dceil | + nir_lower_dfract | + nir_lower_dround_even | + nir_lower_dmod); + OPT_V(nir_lower_64bit_pack); + } while (progress); + + return nir; +} + +/* Does some simple lowering and runs the standard suite of optimizations + * + * This is intended to be called more-or-less directly after you get the + * shader out of GLSL or some other source. While it is geared towards i965, + * it is not at all generator-specific except for the is_scalar flag. Even + * there, it is safe to call with is_scalar = false for a shader that is + * intended for the FS backend as long as nir_optimize is called again with + * is_scalar = true to scalarize everything prior to code gen. 
+ */ +nir_shader * +brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir) +{ + const struct gen_device_info *devinfo = compiler->devinfo; + bool progress; /* Written by OPT and OPT_V */ + (void)progress; + + const bool is_scalar = compiler->scalar_stage[nir->stage]; + + if (nir->stage == MESA_SHADER_GEOMETRY) + OPT(nir_lower_gs_intrinsics); + + /* See also brw_nir_trig_workarounds.py */ + if (compiler->precise_trig && + !(devinfo->gen >= 10 || devinfo->is_kabylake)) + OPT(brw_nir_apply_trig_workarounds); + + static const nir_lower_tex_options tex_options = { + .lower_txp = ~0, + .lower_txf_offset = true, + .lower_rect_offset = true, + .lower_txd_cube_map = true, + }; + + OPT(nir_lower_tex, &tex_options); + OPT(nir_normalize_cubemap_coords); + + OPT(nir_lower_global_vars_to_local); + + OPT(nir_split_var_copies); + + nir = nir_optimize(nir, compiler, is_scalar); + + if (is_scalar) { + OPT_V(nir_lower_load_const_to_scalar); + } + + /* Lower a bunch of stuff */ + OPT_V(nir_lower_var_copies); + + OPT_V(nir_lower_clip_cull_distance_arrays); + + nir_variable_mode indirect_mask = 0; + if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectInput) + indirect_mask |= nir_var_shader_in; + if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectOutput) + indirect_mask |= nir_var_shader_out; + if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectTemp) + indirect_mask |= nir_var_local; + + nir_lower_indirect_derefs(nir, indirect_mask); + + nir_lower_int64(nir, nir_lower_imul64 | + nir_lower_isign64 | + nir_lower_divmod64); + + /* Get rid of split copies */ + nir = nir_optimize(nir, compiler, is_scalar); + + OPT(nir_remove_dead_variables, nir_var_local); + + return nir; +} + +/* Prepare the given shader for codegen + * + * This function is intended to be called right before going into the actual + * backend and is highly backend-specific. 
Also, once this function has been
+ * called on a shader, it will no longer be in SSA form so most optimizations
+ * will not work.
+ */
+nir_shader *
+brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
+                    bool is_scalar)
+{
+   const struct gen_device_info *devinfo = compiler->devinfo;
+   bool debug_enabled =
+      (INTEL_DEBUG & intel_debug_flag_for_shader_stage(nir->stage));
+
+   bool progress; /* Written by OPT and OPT_V */
+   (void)progress;
+
+   nir = nir_optimize(nir, compiler, is_scalar);
+
+   if (devinfo->gen >= 6) {
+      /* Try and fuse multiply-adds (only attempted on gen6 and later) */
+      OPT(brw_nir_opt_peephole_ffma);
+   }
+
+   OPT(nir_opt_algebraic_late);
+
+   OPT_V(nir_lower_to_source_mods);
+   OPT(nir_copy_prop);
+   OPT(nir_opt_dce);
+   OPT(nir_opt_move_comparisons);
+
+   OPT(nir_lower_locals_to_regs);
+
+   if (unlikely(debug_enabled)) {
+      /* Re-index SSA defs so we print more sensible numbers. */
+      nir_foreach_function(function, nir) {
+         if (function->impl)
+            nir_index_ssa_defs(function->impl);
+      }
+
+      fprintf(stderr, "NIR (SSA form) for %s shader:\n",
+              _mesa_shader_stage_to_string(nir->stage));
+      nir_print_shader(nir, stderr);
+   }
+
+   OPT_V(nir_convert_from_ssa, true); /* From here on the shader is no longer
+                                       * in SSA form, as the function comment
+                                       * warns. */
+
+   if (!is_scalar) {
+      OPT_V(nir_move_vec_src_uses_to_dest);
+      OPT(nir_lower_vec_to_movs);
+   }
+
+   /* This is the last pass we run before we start emitting stuff.  It
+    * determines when we need to insert boolean resolves on Gen <= 5.  We
+    * run it last because it stashes data in instr->pass_flags and we don't
+    * want that to be squashed by other NIR passes.
+    */
+   if (devinfo->gen <= 5)
+      brw_nir_analyze_boolean_resolves(nir);
+
+   nir_sweep(nir);
+
+   if (unlikely(debug_enabled)) {
+      fprintf(stderr, "NIR (final form) for %s shader:\n",
+              _mesa_shader_stage_to_string(nir->stage));
+      nir_print_shader(nir, stderr);
+   }
+
+   return nir;
+}
+
+nir_shader *
+brw_nir_apply_sampler_key(nir_shader *nir,
+                          const struct brw_compiler *compiler,
+                          const struct brw_sampler_prog_key_data *key_tex,
+                          bool is_scalar)
+{
+   const struct gen_device_info *devinfo = compiler->devinfo;
+   nir_lower_tex_options tex_options = { 0 }; /* Zero-initialised; only the
+                                               * fields assigned below are
+                                               * requested. */
+
+   /* Iron Lake and prior require lowering of all rectangle textures */
+   if (devinfo->gen < 6)
+      tex_options.lower_rect = true;
+
+   /* Prior to Broadwell, our hardware can't actually do GL_CLAMP */
+   if (devinfo->gen < 8) {
+      tex_options.saturate_s = key_tex->gl_clamp_mask[0];
+      tex_options.saturate_t = key_tex->gl_clamp_mask[1];
+      tex_options.saturate_r = key_tex->gl_clamp_mask[2];
+   }
+
+   /* Prior to Haswell, we have to fake texture swizzle
+    *
+    * NOTE(review): there is no generation check in this loop; presumably the
+    * key only carries non-NOOP swizzles when faking is needed -- confirm
+    * against the code that fills in the key.
+    */
+   for (unsigned s = 0; s < MAX_SAMPLERS; s++) {
+      if (key_tex->swizzles[s] == SWIZZLE_NOOP)
+         continue;
+
+      tex_options.swizzle_result |= (1 << s); /* One bit per sampler whose
+                                               * swizzle is not NOOP. */
+      for (unsigned c = 0; c < 4; c++)
+         tex_options.swizzles[s][c] = GET_SWZ(key_tex->swizzles[s], c);
+   }
+
+   /* Prior to Haswell, we have to lower gradients on shadow samplers */
+   tex_options.lower_txd_shadow = devinfo->gen < 8 && !devinfo->is_haswell;
+
+   tex_options.lower_y_uv_external = key_tex->y_uv_image_mask;
+   tex_options.lower_y_u_v_external = key_tex->y_u_v_image_mask;
+   tex_options.lower_yx_xuxv_external = key_tex->yx_xuxv_image_mask;
+
+   if (nir_lower_tex(nir, &tex_options)) {
+      nir_validate_shader(nir);
+      nir = nir_optimize(nir, compiler, is_scalar);
+   }
+
+   return nir;
+}
+
+enum brw_reg_type
+brw_type_for_nir_type(const struct gen_device_info *devinfo, nir_alu_type type)
+{
+   switch (type) {
+   case nir_type_uint:
+   case nir_type_uint32:
+      return BRW_REGISTER_TYPE_UD;
+   case nir_type_bool:
+   case nir_type_int:
+   case nir_type_bool32:
+   case nir_type_int32:
+      return BRW_REGISTER_TYPE_D;
+   case nir_type_float:
+   case nir_type_float32:
+      return BRW_REGISTER_TYPE_F;
+   case nir_type_float64:
+      return BRW_REGISTER_TYPE_DF;
+   case nir_type_int64: /* Below gen8, 64-bit integers map to the DF type. */
+      return devinfo->gen < 8 ? BRW_REGISTER_TYPE_DF : BRW_REGISTER_TYPE_Q;
+   case nir_type_uint64:
+      return devinfo->gen < 8 ? BRW_REGISTER_TYPE_DF : BRW_REGISTER_TYPE_UQ;
+   default:
+      unreachable("unknown type");
+   }
+
+   return BRW_REGISTER_TYPE_F; /* Every switch arm returns or hits
+                                * unreachable(); presumably kept only to
+                                * quiet missing-return warnings. */
+}
+
+/* Returns the glsl_base_type corresponding to a nir_alu_type.
+ * This is used by both brw_vec4_nir and brw_fs_nir.
+ */
+enum glsl_base_type
+brw_glsl_base_type_for_nir_type(nir_alu_type type)
+{
+   switch (type) {
+   case nir_type_float:
+   case nir_type_float32:
+      return GLSL_TYPE_FLOAT;
+
+   case nir_type_float64:
+      return GLSL_TYPE_DOUBLE;
+
+   case nir_type_int:
+   case nir_type_int32:
+      return GLSL_TYPE_INT;
+
+   case nir_type_uint:
+   case nir_type_uint32:
+      return GLSL_TYPE_UINT;
+
+   default:
+      unreachable("bad type");
+   }
+}
diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h
new file mode 100644
index 00000000000..76d7ec89f9b
--- /dev/null
+++ b/src/intel/compiler/brw_nir.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "brw_reg.h"
+#include "compiler/nir/nir.h"
+#include "brw_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int type_size_scalar(const struct glsl_type *type);
+int type_size_vec4(const struct glsl_type *type);
+int type_size_dvec4(const struct glsl_type *type);
+
+static inline int
+type_size_scalar_bytes(const struct glsl_type *type)
+{
+   return type_size_scalar(type) * 4; /* each scalar slot counts as 4 bytes */
+}
+
+static inline int
+type_size_vec4_bytes(const struct glsl_type *type)
+{
+   return type_size_vec4(type) * 16; /* each vec4 slot counts as 16 bytes */
+}
+
+/* Flags set in the instr->pass_flags field by i965 analysis passes */
+enum {
+   BRW_NIR_NON_BOOLEAN           = 0x0, /* value is not known to be a boolean */
+
+   /* Indicates that the given instruction's destination is a boolean
+    * value but that it needs to be resolved before it can be used.
+    * On Gen <= 5, CMP instructions return a 32-bit value where the bottom
+    * bit represents the actual true/false value of the compare and the top
+    * 31 bits are undefined.  In order to use this value, we have to do a
+    * "resolve" operation by replacing the value of the CMP with -(x & 1)
+    * to sign-extend the bottom bit to 0/~0.
+    */
+   BRW_NIR_BOOLEAN_NEEDS_RESOLVE = 0x1,
+
+   /* Indicates that the given instruction's destination is a boolean
+    * value that has intentionally been left unresolved.  Not all boolean
+    * values need to be resolved immediately.  For instance, if we have
+    *
+    *    CMP r1 r2 r3
+    *    CMP r4 r5 r6
+    *    AND r7 r1 r4
+    *
+    * We don't have to resolve the result of the two CMP instructions
+    * immediately because the AND still does an AND of the bottom bits.
+    * Instead, we can save ourselves instructions by delaying the resolve
+    * until after the AND.  The result of the two CMP instructions is left
+    * as BRW_NIR_BOOLEAN_UNRESOLVED.
+    */
+   BRW_NIR_BOOLEAN_UNRESOLVED    = 0x2,
+
+   /* Indicates that the given instruction's destination is a boolean
+    * value that does not need a resolve.  For instance, if you AND two
+    * values that are BRW_NIR_BOOLEAN_NEEDS_RESOLVE then we know that both
+    * values will be 0/~0 before we get them and the result of the AND is
+    * also guaranteed to be 0/~0 and does not need a resolve.
+    */
+   BRW_NIR_BOOLEAN_NO_RESOLVE    = 0x3,
+
+   /* A mask to mask the boolean status values off of instr->pass_flags */
+   BRW_NIR_BOOLEAN_MASK          = 0x3,
+};
+
+void brw_nir_analyze_boolean_resolves(nir_shader *nir);
+
+nir_shader *brw_preprocess_nir(const struct brw_compiler *compiler,
+                               nir_shader *nir);
+
+bool brw_nir_lower_intrinsics(nir_shader *nir,
+                              struct brw_stage_prog_data *prog_data);
+void brw_nir_lower_vs_inputs(nir_shader *nir,
+                             bool is_scalar,
+                             bool use_legacy_snorm_formula,
+                             const uint8_t *vs_attrib_wa_flags);
+void brw_nir_lower_vue_inputs(nir_shader *nir, bool is_scalar,
+                              const struct brw_vue_map *vue_map);
+void brw_nir_lower_tes_inputs(nir_shader *nir, const struct brw_vue_map *vue);
+void brw_nir_lower_fs_inputs(nir_shader *nir,
+                             const struct gen_device_info *devinfo,
+                             const struct brw_wm_prog_key *key);
+void brw_nir_lower_vue_outputs(nir_shader *nir, bool is_scalar);
+void brw_nir_lower_tcs_outputs(nir_shader *nir, const struct brw_vue_map *vue,
+                               GLenum tes_primitive_mode);
+void brw_nir_lower_fs_outputs(nir_shader *nir);
+void brw_nir_lower_cs_shared(nir_shader *nir);
+
+nir_shader *brw_postprocess_nir(nir_shader *nir,
+                                const struct brw_compiler *compiler,
+                                bool is_scalar);
+
+bool brw_nir_apply_attribute_workarounds(nir_shader *nir,
+                                         bool use_legacy_snorm_formula,
+                                         const uint8_t *attrib_wa_flags);
+
+bool brw_nir_apply_trig_workarounds(nir_shader *nir);
+
+void brw_nir_apply_tcs_quads_workaround(nir_shader *nir);
+
+nir_shader *brw_nir_apply_sampler_key(nir_shader *nir,
+                                      const struct brw_compiler *compiler,
+                                      const struct brw_sampler_prog_key_data *key,
+                                      bool is_scalar);
+
+enum brw_reg_type brw_type_for_nir_type(const struct gen_device_info *devinfo,
+                                        nir_alu_type type);
+
+enum glsl_base_type brw_glsl_base_type_for_nir_type(nir_alu_type type);
+
+void brw_nir_setup_glsl_uniforms(nir_shader *shader,
+                                 const struct gl_program *prog,
+                                 struct brw_stage_prog_data *stage_prog_data,
+                                 bool is_scalar);
+
+void brw_nir_setup_arb_uniforms(nir_shader *shader, struct gl_program *prog,
+                                struct brw_stage_prog_data *stage_prog_data);
+
+bool brw_nir_opt_peephole_ffma(nir_shader *shader);
+
+#define BRW_NIR_FRAG_OUTPUT_INDEX_SHIFT 0
+#define BRW_NIR_FRAG_OUTPUT_INDEX_MASK INTEL_MASK(0, 0)
+#define BRW_NIR_FRAG_OUTPUT_LOCATION_SHIFT 1
+#define BRW_NIR_FRAG_OUTPUT_LOCATION_MASK INTEL_MASK(31, 1)
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/intel/compiler/brw_nir_analyze_boolean_resolves.c b/src/intel/compiler/brw_nir_analyze_boolean_resolves.c
new file mode 100644
index 00000000000..4ad26e21103
--- /dev/null
+++ b/src/intel/compiler/brw_nir_analyze_boolean_resolves.c
@@ -0,0 +1,269 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this
permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Jason Ekstrand <[email protected]> + */ + +#include "brw_nir.h" + +/* + * This file implements an analysis pass that determines when we have to do + * a boolean resolve on Gen <= 5. Instructions that need a boolean resolve + * will have the booleans portion of the instr->pass_flags field set to + * BRW_NIR_BOOLEAN_NEEDS_RESOLVE. + */ + + +/** Returns the resolve status for the given source + * + * If the source has a parent instruction then the resolve status is the + * status of the parent instruction. If the source does not have a parent + * instruction then we don't know so we return NON_BOOLEAN. + */ +static uint8_t +get_resolve_status_for_src(nir_src *src) +{ + if (src->is_ssa) { + nir_instr *src_instr = src->ssa->parent_instr; + uint8_t resolve_status = src_instr->pass_flags & BRW_NIR_BOOLEAN_MASK; + + /* If the source instruction needs resolve, then from the perspective + * of the user, it's a true boolean. + */ + if (resolve_status == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) + resolve_status = BRW_NIR_BOOLEAN_NO_RESOLVE; + return resolve_status; + } else { + return BRW_NIR_NON_BOOLEAN; + } +} + +/** Marks the given source as needing a resolve + * + * If the given source corresponds to an unresolved boolean it marks it as + * needing a resolve. Otherwise, we leave it alone. 
+ */
+static bool
+src_mark_needs_resolve(nir_src *src, void *void_state)
+{
+   if (src->is_ssa) {
+      nir_instr *src_instr = src->ssa->parent_instr;
+      uint8_t resolve_status = src_instr->pass_flags & BRW_NIR_BOOLEAN_MASK;
+
+      /* If the source instruction is unresolved, then mark it as needing
+       * to be resolved.
+       */
+      if (resolve_status == BRW_NIR_BOOLEAN_UNRESOLVED) {
+         src_instr->pass_flags &= ~BRW_NIR_BOOLEAN_MASK;
+         src_instr->pass_flags |= BRW_NIR_BOOLEAN_NEEDS_RESOLVE;
+      }
+
+   }
+
+   return true; /* Unconditional; presumably tells nir_foreach_src() to keep
+                 * iterating.  void_state is never read. */
+}
+
+static bool
+analyze_boolean_resolves_block(nir_block *block)
+{
+   nir_foreach_instr(instr, block) {
+      switch (instr->type) {
+      case nir_instr_type_alu: {
+         /* For ALU instructions, the resolve status is handled in a
+          * three-step process.
+          *
+          * 1) Look at the instruction type and sources and determine if it
+          *    can be left unresolved.
+          *
+          * 2) Look at the destination and see if we have to resolve
+          *    anyway.  (This is the case if this instruction is not the
+          *    only instruction writing to a given register.)
+          *
+          * 3) If the instruction has a resolve status other than
+          *    BOOL_UNRESOLVED or BOOL_NEEDS_RESOLVE then we walk through
+          *    the sources and ensure that they are also resolved.  This
+          *    ensures that we don't end up with any stray unresolved
+          *    booleans going into ADDs or something like that.
+          */
+
+         uint8_t resolve_status;
+         nir_alu_instr *alu = nir_instr_as_alu(instr);
+         switch (alu->op) {
+         case nir_op_ball_fequal2:
+         case nir_op_ball_iequal2:
+         case nir_op_ball_fequal3:
+         case nir_op_ball_iequal3:
+         case nir_op_ball_fequal4:
+         case nir_op_ball_iequal4:
+         case nir_op_bany_fnequal2:
+         case nir_op_bany_inequal2:
+         case nir_op_bany_fnequal3:
+         case nir_op_bany_inequal3:
+         case nir_op_bany_fnequal4:
+         case nir_op_bany_inequal4:
+            /* These are only implemented by the vec4 backend and its
+             * implementation emits resolved booleans.  At some point in the
+             * future, this may change and we'll have to remove some of the
+             * above cases.
+             */
+            resolve_status = BRW_NIR_BOOLEAN_NO_RESOLVE;
+            break;
+
+         case nir_op_imov:
+         case nir_op_inot:
+            /* This is a single-source instruction.  Just copy the resolve
+             * status from the source.
+             */
+            resolve_status = get_resolve_status_for_src(&alu->src[0].src);
+            break;
+
+         case nir_op_iand:
+         case nir_op_ior:
+         case nir_op_ixor: {
+            uint8_t src0_status = get_resolve_status_for_src(&alu->src[0].src);
+            uint8_t src1_status = get_resolve_status_for_src(&alu->src[1].src);
+
+            if (src0_status == src1_status) {
+               resolve_status = src0_status;
+            } else if (src0_status == BRW_NIR_NON_BOOLEAN ||
+                       src1_status == BRW_NIR_NON_BOOLEAN) {
+               /* If one of the sources is a non-boolean then the whole
+                * thing is a non-boolean.
+                */
+               resolve_status = BRW_NIR_NON_BOOLEAN;
+            } else {
+               /* At this point one of them is a true boolean and one is a
+                * boolean that needs a resolve.  We could either resolve the
+                * unresolved source or we could resolve here.  If we resolve
+                * the unresolved source then we get two resolves for the price
+                * of one.  Just set this one to BOOLEAN_NO_RESOLVE and we'll
+                * let the code below force a resolve on the unresolved source.
+                */
+               resolve_status = BRW_NIR_BOOLEAN_NO_RESOLVE;
+            }
+            break;
+         }
+
+         default:
+            if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) == nir_type_bool) {
+               /* These instructions will turn into a CMP when we actually
+                * emit them so the result will have to be resolved before it
+                * can be used.
+                */
+               resolve_status = BRW_NIR_BOOLEAN_UNRESOLVED;
+
+               /* Even though the destination is allowed to be left
+                * unresolved, the sources are treated as regular integers or
+                * floats so they need to be resolved.
+                */
+               nir_foreach_src(instr, src_mark_needs_resolve, NULL);
+            } else {
+               resolve_status = BRW_NIR_NON_BOOLEAN;
+            }
+         }
+
+         /* If the destination is SSA, go ahead and allow unresolved booleans.
+          * If the destination register doesn't have a well-defined parent_instr
+          * we need to resolve immediately.
+          */
+         if (!alu->dest.dest.is_ssa &&
+             resolve_status == BRW_NIR_BOOLEAN_UNRESOLVED) {
+            resolve_status = BRW_NIR_BOOLEAN_NEEDS_RESOLVE;
+         }
+
+         instr->pass_flags = (instr->pass_flags & ~BRW_NIR_BOOLEAN_MASK) |
+                             resolve_status;
+
+         /* Finally, resolve sources if it's needed */
+         switch (resolve_status) {
+         case BRW_NIR_BOOLEAN_NEEDS_RESOLVE:
+         case BRW_NIR_BOOLEAN_UNRESOLVED:
+            /* This instruction is either unresolved or we're doing the
+             * resolve here; leave the sources alone.
+             */
+            break;
+
+         case BRW_NIR_BOOLEAN_NO_RESOLVE:
+         case BRW_NIR_NON_BOOLEAN:
+            nir_foreach_src(instr, src_mark_needs_resolve, NULL);
+            break;
+
+         default:
+            unreachable("Invalid boolean flag");
+         }
+
+         break;
+      }
+
+      case nir_instr_type_load_const: {
+         nir_load_const_instr *load = nir_instr_as_load_const(instr);
+
+         /* For load_const instructions, it's a boolean exactly when it holds
+          * one of the values NIR_TRUE or NIR_FALSE.
+          *
+          * Since load_const instructions don't have any sources, we don't
+          * have to worry about resolving them.
+          *
+          * Only the first component (u32[0]) is inspected.
+          */
+         instr->pass_flags &= ~BRW_NIR_BOOLEAN_MASK;
+         if (load->value.u32[0] == NIR_TRUE || load->value.u32[0] == NIR_FALSE) {
+            instr->pass_flags |= BRW_NIR_BOOLEAN_NO_RESOLVE;
+         } else {
+            instr->pass_flags |= BRW_NIR_NON_BOOLEAN;
+         }
+         continue;
+      }
+
+      default:
+         /* Everything else is an unknown non-boolean value and needs to
+          * have all sources resolved.
+          */
+         instr->pass_flags = (instr->pass_flags & ~BRW_NIR_BOOLEAN_MASK) |
+                             BRW_NIR_NON_BOOLEAN;
+         nir_foreach_src(instr, src_mark_needs_resolve, NULL);
+         continue;
+      }
+   }
+
+   nir_if *following_if = nir_block_get_following_if(block);
+   if (following_if)
+      src_mark_needs_resolve(&following_if->condition, NULL); /* The condition
+                                                                * of an if after
+                                                                * this block is
+                                                                * a use too. */
+
+   return true;
+}
+
+static void
+analyze_boolean_resolves_impl(nir_function_impl *impl)
+{
+   nir_foreach_block(block, impl) {
+      analyze_boolean_resolves_block(block);
+   }
+}
+
+void
+brw_nir_analyze_boolean_resolves(nir_shader *shader)
+{
+   nir_foreach_function(function, shader) {
+      if (function->impl)
+         analyze_boolean_resolves_impl(function->impl);
+   }
+}
diff --git a/src/intel/compiler/brw_nir_attribute_workarounds.c b/src/intel/compiler/brw_nir_attribute_workarounds.c
new file mode 100644
index 00000000000..d695771f04a
--- /dev/null
+++ b/src/intel/compiler/brw_nir_attribute_workarounds.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "compiler/nir/nir_builder.h"
+#include "brw_nir.h"
+
+/**
+ * Prior to Haswell, the hardware can't natively support GL_FIXED or
+ * 2_10_10_10_REV vertex formats.  This pass inserts extra shader code
+ * to produce the correct values.
+ */
+
+struct attr_wa_state {
+   nir_builder builder;
+   bool impl_progress; /* set once the current function impl was modified */
+   bool use_legacy_snorm_formula;
+   const uint8_t *wa_flags; /* indexed by a load_input's first const index */
+};
+
+static bool
+apply_attr_wa_block(nir_block *block, struct attr_wa_state *state)
+{
+   nir_builder *b = &state->builder;
+
+   nir_foreach_instr_safe(instr, block) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+      if (intrin->intrinsic != nir_intrinsic_load_input)
+         continue;
+
+      uint8_t wa_flags = state->wa_flags[intrin->const_index[0]];
+      if (wa_flags == 0)
+         continue;
+
+      b->cursor = nir_after_instr(instr); /* fixup code goes right after the
+                                           * load it corrects */
+
+      nir_ssa_def *val = &intrin->dest.ssa;
+
+      /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
+       * come in as floating point conversions of the integer values.
+       *
+       * The low bits of wa_flags selected by BRW_ATTRIB_WA_COMPONENT_MASK
+       * give the number of leading components to rescale.
+       */
+      if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
+         nir_ssa_def *scaled =
+            nir_fmul(b, val, nir_imm_float(b, 1.0f / 65536.0f));
+         nir_ssa_def *comps[4];
+         for (int i = 0; i < val->num_components; i++) {
+            bool rescale = i < (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK);
+            comps[i] = nir_channel(b, rescale ? scaled : val, i);
+         }
+         val = nir_vec(b, comps, val->num_components);
+      }
+
+      /* Do sign recovery for 2101010 formats if required. */
+      if (wa_flags & BRW_ATTRIB_WA_SIGN) {
+         /* sign recovery shift: <22, 22, 22, 30> */
+         nir_ssa_def *shift = nir_imm_ivec4(b, 22, 22, 22, 30);
+         val = nir_ishr(b, nir_ishl(b, val, shift), shift);
+      }
+
+      /* Apply BGRA swizzle if required. */
+      if (wa_flags & BRW_ATTRIB_WA_BGRA) {
+         val = nir_swizzle(b, val, (unsigned[4]){2,1,0,3}, 4, true);
+      }
+
+      if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
+         /* ES 3.0 has different rules for converting signed normalized
+          * fixed-point numbers than desktop GL.
+          */
+         if ((wa_flags & BRW_ATTRIB_WA_SIGN) &&
+             !state->use_legacy_snorm_formula) {
+            /* According to equation 2.2 of the ES 3.0 specification,
+             * signed normalization conversion is done by:
+             *
+             * f = c / (2^(b-1)-1)
+             */
+            nir_ssa_def *es3_normalize_factor =
+               nir_imm_vec4(b, 1.0f / ((1 << 9) - 1), 1.0f / ((1 << 9) - 1),
+                               1.0f / ((1 << 9) - 1), 1.0f / ((1 << 1) - 1));
+            val = nir_fmax(b,
+                           nir_fmul(b, nir_i2f(b, val), es3_normalize_factor),
+                           nir_imm_float(b, -1.0f));
+         } else {
+            /* The following equations are from the OpenGL 3.2 specification:
+             *
+             * 2.1 unsigned normalization
+             * f = c/(2^n-1)
+             *
+             * 2.2 signed normalization
+             * f = (2c+1)/(2^n-1)
+             *
+             * Both of these share a common divisor, which we handle by
+             * multiplying by 1 / (2^b - 1) for b = <10, 10, 10, 2>.
+             */
+            nir_ssa_def *normalize_factor =
+               nir_imm_vec4(b, 1.0f / ((1 << 10) - 1), 1.0f / ((1 << 10) - 1),
+                               1.0f / ((1 << 10) - 1), 1.0f / ((1 << 2)  - 1));
+
+            if (wa_flags & BRW_ATTRIB_WA_SIGN) {
+               /* For signed normalization, the numerator is 2c+1. */
+               nir_ssa_def *two = nir_imm_float(b, 2.0f);
+               nir_ssa_def *one = nir_imm_float(b, 1.0f);
+               val = nir_fadd(b, nir_fmul(b, nir_i2f(b, val), two), one);
+            } else {
+               /* For unsigned normalization, the numerator is just c. */
+               val = nir_u2f(b, val);
+            }
+            val = nir_fmul(b, val, normalize_factor);
+         }
+      }
+
+      if (wa_flags & BRW_ATTRIB_WA_SCALE) {
+         val = (wa_flags & BRW_ATTRIB_WA_SIGN) ? nir_i2f(b, val)
+                                               : nir_u2f(b, val);
+      }
+
+      nir_ssa_def_rewrite_uses_after(&intrin->dest.ssa, nir_src_for_ssa(val),
+                                     val->parent_instr); /* Presumably only
+                                                          * uses after the
+                                                          * fixup code are
+                                                          * redirected, so the
+                                                          * fixup keeps reading
+                                                          * the raw load. */
+      state->impl_progress = true;
+   }
+
+   return true;
+}
+
+bool
+brw_nir_apply_attribute_workarounds(nir_shader *shader,
+                                    bool use_legacy_snorm_formula,
+                                    const uint8_t *attrib_wa_flags)
+{
+   bool progress = false;
+   struct attr_wa_state state = {
+      .use_legacy_snorm_formula = use_legacy_snorm_formula,
+      .wa_flags = attrib_wa_flags,
+   };
+
+   nir_foreach_function(func, shader) {
+      if (!func->impl)
+         continue;
+
+      nir_builder_init(&state.builder, func->impl);
+      state.impl_progress = false; /* tracked per impl, folded into progress */
+
+      nir_foreach_block(block, func->impl) {
+         apply_attr_wa_block(block, &state);
+      }
+
+      if (state.impl_progress) {
+         nir_metadata_preserve(func->impl, nir_metadata_block_index |
+                                           nir_metadata_dominance);
+         progress = true;
+      }
+   }
+
+   return progress;
+}
diff --git a/src/intel/compiler/brw_nir_intrinsics.c b/src/intel/compiler/brw_nir_intrinsics.c
new file mode 100644
index 00000000000..901a1fb0ab9
--- /dev/null
+++ b/src/intel/compiler/brw_nir_intrinsics.c
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_nir.h" +#include "compiler/nir/nir_builder.h" + +struct lower_intrinsics_state { + nir_shader *nir; + union { + struct brw_stage_prog_data *prog_data; + struct brw_cs_prog_data *cs_prog_data; + }; + nir_function_impl *impl; + bool progress; + nir_builder builder; + bool cs_thread_id_used; +}; + +static nir_ssa_def * +read_thread_local_id(struct lower_intrinsics_state *state) +{ + nir_builder *b = &state->builder; + nir_shader *nir = state->nir; + const unsigned *sizes = nir->info->cs.local_size; + const unsigned group_size = sizes[0] * sizes[1] * sizes[2]; + + /* Some programs have local_size dimensions so small that the thread local + * ID will always be 0. 
+ */ + if (group_size <= 8) + return nir_imm_int(b, 0); + + assert(state->cs_prog_data->thread_local_id_index >= 0); + state->cs_thread_id_used = true; + const int id_index = state->cs_prog_data->thread_local_id_index; + + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform); + load->num_components = 1; + load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0)); + nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL); + nir_intrinsic_set_base(load, id_index * sizeof(uint32_t)); + nir_intrinsic_set_range(load, sizeof(uint32_t)); + nir_builder_instr_insert(b, &load->instr); + return &load->dest.ssa; +} + +static bool +lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state, + nir_block *block) +{ + bool progress = false; + nir_builder *b = &state->builder; + nir_shader *nir = state->nir; + + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr); + + b->cursor = nir_after_instr(&intrinsic->instr); + + nir_ssa_def *sysval; + switch (intrinsic->intrinsic) { + case nir_intrinsic_load_local_invocation_index: { + assert(nir->stage == MESA_SHADER_COMPUTE); + /* We construct the local invocation index from: + * + * gl_LocalInvocationIndex = + * cs_thread_local_id + channel_num; + */ + nir_ssa_def *thread_local_id = read_thread_local_id(state); + nir_ssa_def *channel = nir_load_channel_num(b); + sysval = nir_iadd(b, channel, thread_local_id); + break; + } + + case nir_intrinsic_load_local_invocation_id: { + assert(nir->stage == MESA_SHADER_COMPUTE); + /* We lower gl_LocalInvocationID from gl_LocalInvocationIndex based + * on this formula: + * + * gl_LocalInvocationID.x = + * gl_LocalInvocationIndex % gl_WorkGroupSize.x; + * gl_LocalInvocationID.y = + * (gl_LocalInvocationIndex / gl_WorkGroupSize.x) % + * gl_WorkGroupSize.y; + * gl_LocalInvocationID.z = + * (gl_LocalInvocationIndex / + * (gl_WorkGroupSize.x * 
gl_WorkGroupSize.y)) % + * gl_WorkGroupSize.z; + */ + unsigned *size = nir->info->cs.local_size; + + nir_ssa_def *local_index = nir_load_local_invocation_index(b); + + nir_const_value uvec3; + uvec3.u32[0] = 1; + uvec3.u32[1] = size[0]; + uvec3.u32[2] = size[0] * size[1]; + nir_ssa_def *div_val = nir_build_imm(b, 3, 32, uvec3); + uvec3.u32[0] = size[0]; + uvec3.u32[1] = size[1]; + uvec3.u32[2] = size[2]; + nir_ssa_def *mod_val = nir_build_imm(b, 3, 32, uvec3); + + sysval = nir_umod(b, nir_udiv(b, local_index, div_val), mod_val); + break; + } + + default: + continue; + } + + nir_ssa_def_rewrite_uses(&intrinsic->dest.ssa, nir_src_for_ssa(sysval)); + nir_instr_remove(&intrinsic->instr); + + state->progress = true; + } + + return progress; +} + +static void +lower_cs_intrinsics_convert_impl(struct lower_intrinsics_state *state) +{ + nir_builder_init(&state->builder, state->impl); + + nir_foreach_block(block, state->impl) { + lower_cs_intrinsics_convert_block(state, block); + } + + nir_metadata_preserve(state->impl, + nir_metadata_block_index | nir_metadata_dominance); +} + +bool +brw_nir_lower_intrinsics(nir_shader *nir, struct brw_stage_prog_data *prog_data) +{ + /* Currently we only lower intrinsics for compute shaders */ + if (nir->stage != MESA_SHADER_COMPUTE) + return false; + + bool progress = false; + struct lower_intrinsics_state state; + memset(&state, 0, sizeof(state)); + state.nir = nir; + state.prog_data = prog_data; + + do { + state.progress = false; + nir_foreach_function(function, nir) { + if (function->impl) { + state.impl = function->impl; + lower_cs_intrinsics_convert_impl(&state); + } + } + progress |= state.progress; + } while (state.progress); + + if (nir->stage == MESA_SHADER_COMPUTE && !state.cs_thread_id_used) + state.cs_prog_data->thread_local_id_index = -1; + + return progress; +} diff --git a/src/intel/compiler/brw_nir_opt_peephole_ffma.c b/src/intel/compiler/brw_nir_opt_peephole_ffma.c new file mode 100644 index 00000000000..cc225e1847b --- 
/dev/null +++ b/src/intel/compiler/brw_nir_opt_peephole_ffma.c @@ -0,0 +1,297 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Jason Ekstrand ([email protected]) + * + */ + +#include "brw_nir.h" +#include "compiler/nir/nir_builder.h" + +/* + * Implements a small peephole optimization that looks for a multiply that + * is only ever used in an add and replaces both with an fma. 
+ */
+
+static inline bool
+are_all_uses_fadd(nir_ssa_def *def)
+{
+   if (!list_empty(&def->if_uses)) /* a use as an if-condition is not an fadd */
+      return false;
+
+   nir_foreach_use(use_src, def) {
+      nir_instr *use_instr = use_src->parent_instr;
+
+      if (use_instr->type != nir_instr_type_alu)
+         return false;
+
+      nir_alu_instr *use_alu = nir_instr_as_alu(use_instr);
+      switch (use_alu->op) {
+      case nir_op_fadd:
+         break; /* This one's ok */
+
+      case nir_op_imov:
+      case nir_op_fmov:
+      case nir_op_fneg:
+      case nir_op_fabs: /* Look through moves, negates and abs: the uses of
+                         * their results must all be fadds as well. */
+         assert(use_alu->dest.dest.is_ssa);
+         if (!are_all_uses_fadd(&use_alu->dest.dest.ssa))
+            return false;
+         break;
+
+      default:
+         return false;
+      }
+   }
+
+   return true;
+}
+
+static nir_alu_instr *
+get_mul_for_src(nir_alu_src *src, int num_components,
+                uint8_t swizzle[4], bool *negate, bool *abs)
+{
+   uint8_t swizzle_tmp[4]; /* snapshot of swizzle[] used when composing below */
+   assert(src->src.is_ssa && !src->abs && !src->negate);
+
+   nir_instr *instr = src->src.ssa->parent_instr;
+   if (instr->type != nir_instr_type_alu)
+      return NULL;
+
+   nir_alu_instr *alu = nir_instr_as_alu(instr);
+
+   /* We want to bail if any of the other ALU operations involved is labeled
+    * exact.  One reason for this is that, while the value that is changing is
+    * actually the result of the add and not the multiply, the intention of
+    * the user when they specify an exact multiply is that they want *that*
+    * value and what they don't care about is the add.  Another reason is that
+    * SPIR-V explicitly requires this behaviour.
+    */
+   if (alu->exact)
+      return NULL;
+
+   switch (alu->op) {
+   case nir_op_imov:
+   case nir_op_fmov:
+      alu = get_mul_for_src(&alu->src[0], num_components, swizzle, negate, abs);
+      break;
+
+   case nir_op_fneg:
+      alu = get_mul_for_src(&alu->src[0], num_components, swizzle, negate, abs);
+      *negate = !*negate; /* an fneg on the path flips the accumulated sign */
+      break;
+
+   case nir_op_fabs:
+      alu = get_mul_for_src(&alu->src[0], num_components, swizzle, negate, abs);
+      *negate = false; /* abs discards any sign accumulated closer to the mul */
+      *abs = true;
+      break;
+
+   case nir_op_fmul:
+      /* Only absorb a fmul into a ffma if the fmul is only used in fadd
+       * operations.  This prevents us from being too aggressive with our
+       * fusing which can actually lead to more instructions.
+       */
+      if (!are_all_uses_fadd(&alu->dest.dest.ssa))
+         return NULL;
+      break;
+
+   default:
+      return NULL;
+   }
+
+   if (!alu)
+      return NULL;
+
+   /* Copy swizzle data before overwriting it to avoid setting a wrong swizzle.
+    *
+    * Example:
+    *   Former swizzle[] = xyzw
+    *   src->swizzle[] = zyxx
+    *
+    *   Expected output swizzle = zyxx
+    *   If we reuse swizzle in the loop, then output swizzle would be zyzz.
+    */
+   memcpy(swizzle_tmp, swizzle, 4*sizeof(uint8_t));
+   for (int i = 0; i < num_components; i++)
+      swizzle[i] = swizzle_tmp[src->swizzle[i]];
+
+   return alu;
+}
+
+/**
+ * Given a list of (at least two) nir_alu_src's, tells if any of them is a
+ * constant value and is used only once.
 */
static bool
any_alu_src_is_a_constant(nir_alu_src srcs[])
{
   /* Only the first two entries of srcs[] are inspected. */
   for (unsigned i = 0; i < 2; i++) {
      if (srcs[i].src.ssa->parent_instr->type == nir_instr_type_load_const) {
         nir_load_const_instr *load_const =
            nir_instr_as_load_const (srcs[i].src.ssa->parent_instr);

         /* "Used only once": exactly one regular use and no if-uses. */
         if (list_is_singular(&load_const->def.uses) &&
             list_empty(&load_const->def.if_uses)) {
            return true;
         }
      }
   }

   return false;
}

/**
 * Fuses fadd(fmul(a, b), c) patterns found in \p block into ffma(a, b, c).
 *
 * For every non-exact fadd whose two sources differ, each source is tried in
 * turn with get_mul_for_src().  On a match a new ffma is inserted before the
 * add, all uses of the add are rewritten to it and the add is removed.
 *
 * \return true if at least one fadd was replaced.
 */
static bool
brw_nir_opt_peephole_ffma_block(nir_builder *b, nir_block *block)
{
   bool progress = false;

   nir_foreach_instr_safe(instr, block) {
      if (instr->type != nir_instr_type_alu)
         continue;

      nir_alu_instr *add = nir_instr_as_alu(instr);
      if (add->op != nir_op_fadd)
         continue;

      assert(add->dest.dest.is_ssa);
      if (add->exact)
         continue;

      assert(add->src[0].src.is_ssa && add->src[1].src.is_ssa);

      /* This is the case a + a.  We would rather handle this with an
       * algebraic reduction than fuse it.  Also, we want to only fuse
       * things where the multiply is used only once and, in this case,
       * it would be used twice by the same instruction.
       */
      if (add->src[0].src.ssa == add->src[1].src.ssa)
         continue;

      nir_alu_instr *mul;
      uint8_t add_mul_src, swizzle[4];
      bool negate, abs;
      /* Try each fadd source as the multiply; the first match wins.  The
       * swizzle/negate/abs accumulators are reset before every attempt.
       */
      for (add_mul_src = 0; add_mul_src < 2; add_mul_src++) {
         for (unsigned i = 0; i < 4; i++)
            swizzle[i] = i;

         negate = false;
         abs = false;

         mul = get_mul_for_src(&add->src[add_mul_src],
                               add->dest.dest.ssa.num_components,
                               swizzle, &negate, &abs);

         if (mul != NULL)
            break;
      }

      if (mul == NULL)
         continue;

      unsigned bit_size = add->dest.dest.ssa.bit_size;

      nir_ssa_def *mul_src[2];
      mul_src[0] = mul->src[0].src.ssa;
      mul_src[1] = mul->src[1].src.ssa;

      /* If any of the operands of the fmul and any of the fadd is a constant,
       * we bypass because it will be more efficient as the constants will be
       * propagated as operands, potentially saving two load_const instructions.
       */
      if (any_alu_src_is_a_constant(mul->src) &&
          any_alu_src_is_a_constant(add->src)) {
         continue;
      }

      b->cursor = nir_before_instr(&add->instr);

      /* Re-apply the modifiers that were peeled off on the way to the fmul:
       * abs goes on both factors, negate on just one of them.
       */
      if (abs) {
         for (unsigned i = 0; i < 2; i++)
            mul_src[i] = nir_fabs(b, mul_src[i]);
      }

      if (negate)
         mul_src[0] = nir_fneg(b, mul_src[0]);

      nir_alu_instr *ffma = nir_alu_instr_create(b->shader, nir_op_ffma);
      ffma->dest.saturate = add->dest.saturate;
      ffma->dest.write_mask = add->dest.write_mask;

      for (unsigned i = 0; i < 2; i++) {
         ffma->src[i].src = nir_src_for_ssa(mul_src[i]);
         /* Route the fmul's own source swizzle through the accumulated one. */
         for (unsigned j = 0; j < add->dest.dest.ssa.num_components; j++)
            ffma->src[i].swizzle[j] = mul->src[i].swizzle[swizzle[j]];
      }
      /* The addend is whichever fadd source was not the multiply. */
      nir_alu_src_copy(&ffma->src[2], &add->src[1 - add_mul_src], ffma);

      assert(add->dest.dest.is_ssa);

      nir_ssa_dest_init(&ffma->instr, &ffma->dest.dest,
                        add->dest.dest.ssa.num_components,
                        bit_size,
                        add->dest.dest.ssa.name);
      nir_ssa_def_rewrite_uses(&add->dest.dest.ssa,
                               nir_src_for_ssa(&ffma->dest.dest.ssa));

      nir_builder_instr_insert(b, &ffma->instr);
      assert(list_empty(&add->dest.dest.ssa.uses));
      nir_instr_remove(&add->instr);

      progress = true;
   }

   return progress;
}

/**
 * Runs the ffma peephole over every block of \p impl.
 *
 * When anything changed, only block-index and dominance metadata are declared
 * preserved.
 */
static bool
brw_nir_opt_peephole_ffma_impl(nir_function_impl *impl)
{
   bool progress = false;

   nir_builder builder;
   nir_builder_init(&builder, impl);

   nir_foreach_block(block, impl) {
      progress |= brw_nir_opt_peephole_ffma_block(&builder, block);
   }

   if (progress)
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);

   return progress;
}

/**
 * Pass entry point: applies the ffma peephole to every function in \p shader
 * that has an implementation.
 *
 * \return true if any function was modified.
 */
bool
brw_nir_opt_peephole_ffma(nir_shader *shader)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= brw_nir_opt_peephole_ffma_impl(function->impl);
   }

   return progress;
}
diff --git a/src/intel/compiler/brw_nir_tcs_workarounds.c b/src/intel/compiler/brw_nir_tcs_workarounds.c
new file mode 100644
index 00000000000..a85f493c704
--- /dev/null
+++
b/src/intel/compiler/brw_nir_tcs_workarounds.c @@ -0,0 +1,152 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "compiler/nir/nir_builder.h" +#include "brw_nir.h" + +/** + * Implements the WaPreventHSTessLevelsInterference workaround (for Gen7-8). 
+ * + * From the Broadwell PRM, Volume 7 (3D-Media-GPGPU), Page 494 (below the + * definition of the patch header layouts): + * + * "HW Bug: The Tessellation stage will incorrectly add domain points + * along patch edges under the following conditions, which may result + * in conformance failures and/or cracking artifacts: + * + * * QUAD domain + * * INTEGER partitioning + * * All three TessFactors in a given U or V direction (e.g., V + * direction: UEQ0, InsideV, UEQ1) are all exactly 1.0 + * * All three TessFactors in the other direction are > 1.0 and all + * round up to the same integer value (e.g, U direction: + * VEQ0 = 3.1, InsideU = 3.7, VEQ1 = 3.4) + * + * The suggested workaround (to be implemented as part of the postamble + * to the HS shader in the HS kernel) is: + * + * if ( + * (TF[UEQ0] > 1.0) || + * (TF[VEQ0] > 1.0) || + * (TF[UEQ1] > 1.0) || + * (TF[VEQ1] > 1.0) || + * (TF[INSIDE_U] > 1.0) || + * (TF[INSIDE_V] > 1.0) ) + * { + * TF[INSIDE_U] = (TF[INSIDE_U] == 1.0) ? 2.0 : TF[INSIDE_U]; + * TF[INSIDE_V] = (TF[INSIDE_V] == 1.0) ? 2.0 : TF[INSIDE_V]; + * }" + * + * There's a subtlety here. Intel internal HSD-ES bug 1208668495 notes + * that the above workaround fails to fix certain GL/ES CTS tests which + * have inside tessellation factors of -1.0. This can be explained by + * a quote from the ARB_tessellation_shader specification: + * + * "If "equal_spacing" is used, the floating-point tessellation level is + * first clamped to the range [1,<max>], where <max> is implementation- + * dependent maximum tessellation level (MAX_TESS_GEN_LEVEL)." + * + * In other words, the actual inner tessellation factor used is + * clamp(TF[INSIDE_*], 1.0, 64.0). So we want to compare the clamped + * value against 1.0. To accomplish this, we change the comparison from + * (TF[INSIDE_*] == 1.0) to (TF[INSIDE_*] <= 1.0). 
+ */ + +static inline nir_ssa_def * +load_output(nir_builder *b, int num_components, int offset, int component) +{ + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_output); + nir_ssa_dest_init(&load->instr, &load->dest, num_components, 32, NULL); + load->num_components = num_components; + load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0)); + nir_intrinsic_set_base(load, offset); + nir_intrinsic_set_component(load, component); + + nir_builder_instr_insert(b, &load->instr); + + return &load->dest.ssa; +} + +static void +emit_quads_workaround(nir_builder *b, nir_block *block) +{ + b->cursor = nir_after_block_before_jump(block); + + nir_ssa_def *inner = load_output(b, 2, 0, 2); + nir_ssa_def *outer = load_output(b, 4, 1, 0); + + nir_ssa_def *any_greater_than_1 = + nir_ior(b, nir_bany(b, nir_flt(b, nir_imm_float(b, 1.0f), outer)), + nir_bany(b, nir_flt(b, nir_imm_float(b, 1.0f), inner))); + + nir_if *if_stmt = nir_if_create(b->shader); + if_stmt->condition = nir_src_for_ssa(any_greater_than_1); + nir_builder_cf_insert(b, &if_stmt->cf_node); + + /* Fill out the new then-block */ + b->cursor = nir_after_cf_list(&if_stmt->then_list); + + inner = nir_bcsel(b, nir_fge(b, nir_imm_float(b, 1.0f), inner), + nir_imm_float(b, 2.0f), inner); + + nir_intrinsic_instr *store = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output); + store->num_components = 2; + nir_intrinsic_set_write_mask(store, WRITEMASK_XY); + nir_intrinsic_set_component(store, 2); + store->src[0] = nir_src_for_ssa(inner); + store->src[1] = nir_src_for_ssa(nir_imm_int(b, 0)); + nir_builder_instr_insert(b, &store->instr); +} + +void +brw_nir_apply_tcs_quads_workaround(nir_shader *nir) +{ + assert(nir->stage == MESA_SHADER_TESS_CTRL); + + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + + nir_builder b; + nir_builder_init(&b, impl); + + /* emit_quads_workaround() inserts an if statement into each block, + * which splits it in two. 
This changes the set of predecessors of + * the end block. We want to process the original set, so to be safe, + * save it off to an array first. + */ + const unsigned num_end_preds = impl->end_block->predecessors->entries; + nir_block *end_preds[num_end_preds]; + unsigned i = 0; + struct set_entry *entry; + + set_foreach(impl->end_block->predecessors, entry) { + end_preds[i++] = (nir_block *) entry->key; + } + + for (i = 0; i < num_end_preds; i++) { + emit_quads_workaround(&b, end_preds[i]); + } + + nir_metadata_preserve(impl, 0); +} diff --git a/src/intel/compiler/brw_nir_trig_workarounds.py b/src/intel/compiler/brw_nir_trig_workarounds.py new file mode 100644 index 00000000000..6a77d64dbd4 --- /dev/null +++ b/src/intel/compiler/brw_nir_trig_workarounds.py @@ -0,0 +1,43 @@ +# +# Copyright (C) 2016 Intel Corporation +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
+ +import nir_algebraic + +# Prior to Kaby Lake, The SIN and COS instructions on Intel hardware can +# produce values slightly outside of the [-1.0, 1.0] range for a small set of +# values. Obviously, this can break everyone's expectations about trig +# functions. This appears to be fixed in Kaby Lake. +# +# According to an internal presentation, the COS instruction can produce +# a value up to 1.000027 for inputs in the range (0.08296, 0.09888). One +# suggested workaround is to multiply by 0.99997, scaling down the +# amplitude slightly. Apparently this also minimizes the error function, +# reducing the maximum error from 0.00006 to about 0.00003. + +trig_workarounds = [ + (('fsin', 'x'), ('fmul', ('fsin', 'x'), 0.99997)), + (('fcos', 'x'), ('fmul', ('fcos', 'x'), 0.99997)), +] + +print '#include "brw_nir.h"' +print nir_algebraic.AlgebraicPass("brw_nir_apply_trig_workarounds", + trig_workarounds).render() diff --git a/src/intel/compiler/brw_packed_float.c b/src/intel/compiler/brw_packed_float.c new file mode 100644 index 00000000000..9b7687a756f --- /dev/null +++ b/src/intel/compiler/brw_packed_float.c @@ -0,0 +1,75 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#include "brw_reg.h" + +union fu { + float f; + unsigned u; + struct { + unsigned mantissa:23; + unsigned exponent:8; + unsigned sign:1; + } s; +}; + +int +brw_float_to_vf(float f) +{ + union fu fu = { .f = f }; + + /* ±0.0f is special cased. */ + if (f == 0.0f) + return fu.s.sign << 7; + + unsigned mantissa = fu.s.mantissa >> (23 - 4); + unsigned exponent = fu.s.exponent - (127 - 3); + unsigned vf = (fu.s.sign << 7) | (exponent << 4) | mantissa; + + /* 0.125 would have had the same representation as 0.0, so reject it. */ + if ((vf & 0x7f) == 0) + return -1; + + /* Make sure the mantissa fits in 4-bits and the exponent in 3-bits. */ + if (fu.u & 0x7ffff || exponent > 7) + return -1; + + return vf; +} + +float +brw_vf_to_float(unsigned char vf) +{ + union fu fu; + + /* ±0.0f is special cased. 
*/ + if (vf == 0x00 || vf == 0x80) { + fu.u = vf << 24; + return fu.f; + } + + fu.s.sign = vf >> 7; + fu.s.exponent = ((vf & 0x70) >> 4) + (127 - 3); + fu.s.mantissa = (vf & 0xf) << (23 - 4); + + return fu.f; +} diff --git a/src/intel/compiler/brw_predicated_break.cpp b/src/intel/compiler/brw_predicated_break.cpp new file mode 100644 index 00000000000..607715dace4 --- /dev/null +++ b/src/intel/compiler/brw_predicated_break.cpp @@ -0,0 +1,148 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_cfg.h" + +using namespace brw; + +/** @file brw_predicated_break.cpp + * + * Loops are often structured as + * + * loop: + * CMP.f0 + * (+f0) IF + * BREAK + * ENDIF + * ... + * WHILE loop + * + * This peephole pass removes the IF and ENDIF instructions and predicates the + * BREAK, dropping two instructions from the loop body. 
 *
 * If the loop was a DO { ... } WHILE loop, it looks like
 *
 * loop:
 *    ...
 *    CMP.f0
 *    (+f0) IF
 *    BREAK
 *    ENDIF
 *    WHILE loop
 *
 * and we can remove the BREAK instruction and predicate the WHILE.
 */

/**
 * Runs the predicated-break peephole over every block of \p s.
 *
 * A block qualifies when it holds exactly one instruction, that instruction
 * is a BREAK or CONTINUE, the previous block ends in an IF and the next block
 * starts with an ENDIF.
 *
 * \return true if any block was rewritten, in which case the shader's live
 *         intervals are invalidated.
 */
bool
opt_predicated_break(backend_shader *s)
{
   bool progress = false;

   foreach_block (block, s->cfg) {
      /* Only blocks consisting of a single instruction are candidates. */
      if (block->start_ip != block->end_ip)
         continue;

      /* BREAK and CONTINUE instructions, by definition, can only be found at
       * the ends of basic blocks.
       */
      backend_instruction *jump_inst = block->end();
      if (jump_inst->opcode != BRW_OPCODE_BREAK &&
          jump_inst->opcode != BRW_OPCODE_CONTINUE)
         continue;

      backend_instruction *if_inst = block->prev()->end();
      if (if_inst->opcode != BRW_OPCODE_IF)
         continue;

      backend_instruction *endif_inst = block->next()->start();
      if (endif_inst->opcode != BRW_OPCODE_ENDIF)
         continue;

      bblock_t *jump_block = block;
      bblock_t *if_block = jump_block->prev();
      bblock_t *endif_block = jump_block->next();

      /* Move the IF's predicate onto the jump itself. */
      jump_inst->predicate = if_inst->predicate;
      jump_inst->predicate_inverse = if_inst->predicate_inverse;

      /* If the IF (or ENDIF) is the only instruction in its block, the
       * neighbouring block is the one to link against.  This has to be
       * decided before the instruction is removed.
       */
      bblock_t *earlier_block = if_block;
      if (if_block->start_ip == if_block->end_ip) {
         earlier_block = if_block->prev();
      }

      if_inst->remove(if_block);

      bblock_t *later_block = endif_block;
      if (endif_block->start_ip == endif_block->end_ip) {
         later_block = endif_block->next();
      }
      endif_inst->remove(endif_block);

      /* Rewire the CFG edges around the now-predicated jump. */
      if (!earlier_block->ends_with_control_flow()) {
         earlier_block->children.make_empty();
         earlier_block->add_successor(s->cfg->mem_ctx, jump_block);
      }

      if (!later_block->starts_with_control_flow()) {
         later_block->parents.make_empty();
      }
      jump_block->add_successor(s->cfg->mem_ctx, later_block);

      if (earlier_block->can_combine_with(jump_block)) {
         earlier_block->combine_with(jump_block);

         /* Continue the iteration from the merged block. */
         block = earlier_block;
      }

      /* Now look at the first instruction of the block following the BREAK.  If
       * it's a WHILE, we can delete the break, predicate the WHILE, and join
       * the two basic blocks.
       */
      bblock_t *while_block = earlier_block->next();
      backend_instruction *while_inst = while_block->start();

      if (jump_inst->opcode == BRW_OPCODE_BREAK &&
          while_inst->opcode == BRW_OPCODE_WHILE &&
          while_inst->predicate == BRW_PREDICATE_NONE) {
         jump_inst->remove(earlier_block);
         /* The WHILE loops when the BREAK would not have fired, hence the
          * inverted predicate sense.
          */
         while_inst->predicate = jump_inst->predicate;
         while_inst->predicate_inverse = !jump_inst->predicate_inverse;

         earlier_block->children.make_empty();
         earlier_block->add_successor(s->cfg->mem_ctx, while_block);

         assert(earlier_block->can_combine_with(while_block));
         earlier_block->combine_with(while_block);

         earlier_block->next()->parents.make_empty();
         earlier_block->add_successor(s->cfg->mem_ctx, earlier_block->next());
      }

      progress = true;
   }

   if (progress)
      s->invalidate_live_intervals();

   return progress;
}
diff --git a/src/intel/compiler/brw_reg.h b/src/intel/compiler/brw_reg.h
new file mode 100644
index 00000000000..f8c3340e452
--- /dev/null
+++ b/src/intel/compiler/brw_reg.h
@@ -0,0 +1,1135 @@
/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + +/** @file brw_reg.h + * + * This file defines struct brw_reg, which is our representation for EU + * registers. They're not a hardware specific format, just an abstraction + * that intends to capture the full flexibility of the hardware registers. + * + * The brw_eu_emit.c layer's brw_set_dest/brw_set_src[01] functions encode + * the abstract brw_reg type into the actual hardware instruction encoding. + */ + +#ifndef BRW_REG_H +#define BRW_REG_H + +#include <stdbool.h> +#include "main/compiler.h" +#include "main/macros.h" +#include "program/prog_instruction.h" +#include "brw_eu_defines.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct gen_device_info; + +/** Number of general purpose registers (VS, WM, etc) */ +#define BRW_MAX_GRF 128 + +/** + * First GRF used for the MRF hack. + * + * On gen7, MRFs are no longer used, and contiguous GRFs are used instead. We + * haven't converted our compiler to be aware of this, so it asks for MRFs and + * brw_eu_emit.c quietly converts them to be accesses of the top GRFs. The + * register allocators have to be careful of this to avoid corrupting the "MRF"s + * with actual GRF allocations. + */ +#define GEN7_MRF_HACK_START 112 + +/** Number of message register file registers */ +#define BRW_MAX_MRF(gen) (gen == 6 ? 
24 : 16) + +#define BRW_SWIZZLE4(a,b,c,d) (((a)<<0) | ((b)<<2) | ((c)<<4) | ((d)<<6)) +#define BRW_GET_SWZ(swz, idx) (((swz) >> ((idx)*2)) & 0x3) + +#define BRW_SWIZZLE_NOOP BRW_SWIZZLE4(0,1,2,3) +#define BRW_SWIZZLE_XYZW BRW_SWIZZLE4(0,1,2,3) +#define BRW_SWIZZLE_XXXX BRW_SWIZZLE4(0,0,0,0) +#define BRW_SWIZZLE_YYYY BRW_SWIZZLE4(1,1,1,1) +#define BRW_SWIZZLE_ZZZZ BRW_SWIZZLE4(2,2,2,2) +#define BRW_SWIZZLE_WWWW BRW_SWIZZLE4(3,3,3,3) +#define BRW_SWIZZLE_XYXY BRW_SWIZZLE4(0,1,0,1) +#define BRW_SWIZZLE_YXYX BRW_SWIZZLE4(1,0,1,0) +#define BRW_SWIZZLE_XZXZ BRW_SWIZZLE4(0,2,0,2) +#define BRW_SWIZZLE_YZXW BRW_SWIZZLE4(1,2,0,3) +#define BRW_SWIZZLE_YWYW BRW_SWIZZLE4(1,3,1,3) +#define BRW_SWIZZLE_ZXYW BRW_SWIZZLE4(2,0,1,3) +#define BRW_SWIZZLE_ZWZW BRW_SWIZZLE4(2,3,2,3) +#define BRW_SWIZZLE_WZWZ BRW_SWIZZLE4(3,2,3,2) +#define BRW_SWIZZLE_WZYX BRW_SWIZZLE4(3,2,1,0) +#define BRW_SWIZZLE_XXZZ BRW_SWIZZLE4(0,0,2,2) +#define BRW_SWIZZLE_YYWW BRW_SWIZZLE4(1,1,3,3) +#define BRW_SWIZZLE_YXWZ BRW_SWIZZLE4(1,0,3,2) + +#define BRW_SWZ_COMP_INPUT(comp) (BRW_SWIZZLE_XYZW >> ((comp)*2)) +#define BRW_SWZ_COMP_OUTPUT(comp) (BRW_SWIZZLE_XYZW << ((comp)*2)) + +static inline bool +brw_is_single_value_swizzle(unsigned swiz) +{ + return (swiz == BRW_SWIZZLE_XXXX || + swiz == BRW_SWIZZLE_YYYY || + swiz == BRW_SWIZZLE_ZZZZ || + swiz == BRW_SWIZZLE_WWWW); +} + +/** + * Compute the swizzle obtained from the application of \p swz0 on the result + * of \p swz1. The argument ordering is expected to match function + * composition. + */ +static inline unsigned +brw_compose_swizzle(unsigned swz0, unsigned swz1) +{ + return BRW_SWIZZLE4( + BRW_GET_SWZ(swz1, BRW_GET_SWZ(swz0, 0)), + BRW_GET_SWZ(swz1, BRW_GET_SWZ(swz0, 1)), + BRW_GET_SWZ(swz1, BRW_GET_SWZ(swz0, 2)), + BRW_GET_SWZ(swz1, BRW_GET_SWZ(swz0, 3))); +} + +/** + * Return the result of applying swizzle \p swz to shuffle the bits of \p mask + * (AKA image). 
+ */ +static inline unsigned +brw_apply_swizzle_to_mask(unsigned swz, unsigned mask) +{ + unsigned result = 0; + + for (unsigned i = 0; i < 4; i++) { + if (mask & (1 << BRW_GET_SWZ(swz, i))) + result |= 1 << i; + } + + return result; +} + +/** + * Return the result of applying the inverse of swizzle \p swz to shuffle the + * bits of \p mask (AKA preimage). Useful to find out which components are + * read from a swizzled source given the instruction writemask. + */ +static inline unsigned +brw_apply_inv_swizzle_to_mask(unsigned swz, unsigned mask) +{ + unsigned result = 0; + + for (unsigned i = 0; i < 4; i++) { + if (mask & (1 << i)) + result |= 1 << BRW_GET_SWZ(swz, i); + } + + return result; +} + +/** + * Construct an identity swizzle for the set of enabled channels given by \p + * mask. The result will only reference channels enabled in the provided \p + * mask, assuming that \p mask is non-zero. The constructed swizzle will + * satisfy the property that for any instruction OP and any mask: + * + * brw_OP(p, brw_writemask(dst, mask), + * brw_swizzle(src, brw_swizzle_for_mask(mask))); + * + * will be equivalent to the same instruction without swizzle: + * + * brw_OP(p, brw_writemask(dst, mask), src); + */ +static inline unsigned +brw_swizzle_for_mask(unsigned mask) +{ + unsigned last = (mask ? ffs(mask) - 1 : 0); + unsigned swz[4]; + + for (unsigned i = 0; i < 4; i++) + last = swz[i] = (mask & (1 << i) ? i : last); + + return BRW_SWIZZLE4(swz[0], swz[1], swz[2], swz[3]); +} + +/** + * Construct an identity swizzle for the first \p n components of a vector. + * When only a subset of channels of a vec4 are used we don't want to + * reference the other channels, as that will tell optimization passes that + * those other channels are used. + */ +static inline unsigned +brw_swizzle_for_size(unsigned n) +{ + return brw_swizzle_for_mask((1 << n) - 1); +} + +/** + * Converse of brw_swizzle_for_mask(). 
 * Returns the mask of components
 * accessed by the specified swizzle \p swz.
 */
static inline unsigned
brw_mask_for_swizzle(unsigned swz)
{
   return brw_apply_inv_swizzle_to_mask(swz, ~0);
}

/**
 * Abstract register data types.  The values are stored in the 4-bit
 * brw_reg::type field declared below.
 */
enum PACKED brw_reg_type {
   BRW_REGISTER_TYPE_UD = 0,
   BRW_REGISTER_TYPE_D,
   BRW_REGISTER_TYPE_UW,
   BRW_REGISTER_TYPE_W,
   BRW_REGISTER_TYPE_F,

   /** Non-immediates only: @{ */
   BRW_REGISTER_TYPE_UB,
   BRW_REGISTER_TYPE_B,
   /** @} */

   /** Immediates only: @{ */
   BRW_REGISTER_TYPE_UV, /* Gen6+ */
   BRW_REGISTER_TYPE_V,
   BRW_REGISTER_TYPE_VF,
   /** @} */

   BRW_REGISTER_TYPE_DF, /* Gen7+ (no immediates until Gen8+) */

   /* Gen8+ */
   BRW_REGISTER_TYPE_HF,
   BRW_REGISTER_TYPE_UQ,
   BRW_REGISTER_TYPE_Q,
};

unsigned brw_reg_type_to_hw_type(const struct gen_device_info *devinfo,
                                 enum brw_reg_type type, enum brw_reg_file file);

/* Size of an instruction operand's element; \p operand is pasted into the
 * brw_inst_<operand>_reg_type / _reg_file accessor names.
 */
#define brw_element_size(devinfo, inst, operand)                             \
   brw_hw_reg_type_to_size(devinfo,                                          \
                           brw_inst_ ## operand ## _reg_type(devinfo, inst), \
                           brw_inst_ ## operand ## _reg_file(devinfo, inst))
unsigned brw_hw_reg_type_to_size(const struct gen_device_info *devinfo,
                                 unsigned type, enum brw_reg_file file);

const char *brw_reg_type_letters(unsigned brw_reg_type);
uint32_t brw_swizzle_immediate(enum brw_reg_type type, uint32_t x, unsigned swz);

/* Register size used for byte-offset arithmetic on nr/subnr. */
#define REG_SIZE (8*4)

/* These aren't hardware structs, just something useful for us to pass around:
 *
 * Align1 operation has a lot of control over input ranges.  Used in
 * WM programs to implement shaders decomposed into "channel serial"
 * or "structure of array" form:
 */
struct brw_reg {
   /* First word: the register's identity.  `bits` aliases all of the
    * bitfields so they can be compared in one go.
    */
   union {
      struct {
         enum brw_reg_type type:4;
         enum brw_reg_file file:3;      /* :2 hardware format */
         unsigned negate:1;             /* source only */
         unsigned abs:1;                /* source only */
         unsigned address_mode:1;       /* relative addressing, hopefully! */
         unsigned pad0:1;
         unsigned subnr:5;              /* :1 in align16 */
         unsigned nr:16;
      };
      uint32_t bits;
   };

   /* Second part: either the region/swizzle description of a register, or
    * one of the immediate value members.
    */
   union {
      struct {
         unsigned swizzle:8;          /* src only, align16 only */
         unsigned writemask:4;        /* dest only, align16 only */
         int  indirect_offset:10;     /* relative addressing offset */
         unsigned vstride:4;          /* source only */
         unsigned width:3;            /* src only, align1 only */
         unsigned hstride:2;          /* align1 only */
         unsigned pad1:1;
      };

      double df;
      uint64_t u64;
      int64_t d64;
      float f;
      int   d;
      unsigned ud;
   };
};

/**
 * Compares two registers for equality.
 *
 * The first word must match exactly.  Of the second part, all 64 bits are
 * compared for DF immediates and only the 32-bit `ud` view otherwise.
 *
 * NOTE(review): Q/UQ immediates also take the 32-bit path here, so two 64-bit
 * integer immediates differing only in their upper half would compare equal
 * -- confirm whether such immediates can reach this function.
 */
static inline bool
brw_regs_equal(const struct brw_reg *a, const struct brw_reg *b)
{
   const bool df = a->type == BRW_REGISTER_TYPE_DF && a->file == IMM;
   return a->bits == b->bits && (df ? a->u64 == b->u64 : a->ud == b->ud);
}

struct brw_indirect {
   unsigned addr_subnr:4;
   int addr_offset:10;
   unsigned pad:18;
};


/**
 * Returns the size in bytes of one element of register type \p type.
 * Any value not listed below hits unreachable().
 */
static inline unsigned
type_sz(unsigned type)
{
   switch(type) {
   case BRW_REGISTER_TYPE_UQ:
   case BRW_REGISTER_TYPE_Q:
   case BRW_REGISTER_TYPE_DF:
      return 8;
   case BRW_REGISTER_TYPE_UD:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_F:
   case BRW_REGISTER_TYPE_VF:
      return 4;
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UV:
   case BRW_REGISTER_TYPE_V:
   case BRW_REGISTER_TYPE_HF:
      return 2;
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_B:
      return 1;
   default:
      unreachable("not reached");
   }
}

/**
 * Return an integer type of the requested size and signedness.
 *
 * \p sz is in bytes and must be 1, 2, 4 or 8.
 */
static inline enum brw_reg_type
brw_int_type(unsigned sz, bool is_signed)
{
   switch (sz) {
   case 1:
      return (is_signed ? BRW_REGISTER_TYPE_B : BRW_REGISTER_TYPE_UB);
   case 2:
      return (is_signed ? BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW);
   case 4:
      return (is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD);
   case 8:
      return (is_signed ? BRW_REGISTER_TYPE_Q : BRW_REGISTER_TYPE_UQ);
   default:
      unreachable("Not reached.");
   }
}

/**
 * Construct a brw_reg.
 * \param file      one of the BRW_x_REGISTER_FILE values
 * \param nr        register number/index
 * \param subnr     register sub number
 * \param negate    register negate modifier
 * \param abs       register abs modifier
 * \param type      one of BRW_REGISTER_TYPE_x
 * \param vstride   one of BRW_VERTICAL_STRIDE_x
 * \param width     one of BRW_WIDTH_x
 * \param hstride   one of BRW_HORIZONTAL_STRIDE_x
 * \param swizzle   one of BRW_SWIZZLE_x
 * \param writemask WRITEMASK_X/Y/Z/W bitfield
 */
static inline struct brw_reg
brw_reg(enum brw_reg_file file,
        unsigned nr,
        unsigned subnr,
        unsigned negate,
        unsigned abs,
        enum brw_reg_type type,
        unsigned vstride,
        unsigned width,
        unsigned hstride,
        unsigned swizzle,
        unsigned writemask)
{
   struct brw_reg reg;
   if (file == BRW_GENERAL_REGISTER_FILE)
      assert(nr < BRW_MAX_GRF);
   else if (file == BRW_ARCHITECTURE_REGISTER_FILE)
      assert(nr <= BRW_ARF_TIMESTAMP);
   /* Asserting on the MRF register number requires to know the hardware gen
    * (gen6 has 24 MRF registers), which we don't know here, so we assert
    * for that in the generators and in brw_eu_emit.c
    */

   reg.type = type;
   reg.file = file;
   reg.negate = negate;
   reg.abs = abs;
   reg.address_mode = BRW_ADDRESS_DIRECT;
   reg.pad0 = 0;
   /* The subnr argument counts elements of `type`; the field holds bytes. */
   reg.subnr = subnr * type_sz(type);
   reg.nr = nr;

   /* Could do better: If the reg is r5.3<0;1,0>, we probably want to
    * set swizzle and writemask to W, as the lower bits of subnr will
    * be lost when converted to align16.  This is probably too much to
    * keep track of as you'd want it adjusted by suboffset(), etc.
    * Perhaps fix up when converting to align16?
    */
   reg.swizzle = swizzle;
   reg.writemask = writemask;
   reg.indirect_offset = 0;
   reg.vstride = vstride;
   reg.width = width;
   reg.hstride = hstride;
   reg.pad1 = 0;
   return reg;
}

/** Construct float[16] register */
static inline struct brw_reg
brw_vec16_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
{
   return brw_reg(file,
                  nr,
                  subnr,
                  0,
                  0,
                  BRW_REGISTER_TYPE_F,
                  BRW_VERTICAL_STRIDE_16,
                  BRW_WIDTH_16,
                  BRW_HORIZONTAL_STRIDE_1,
                  BRW_SWIZZLE_XYZW,
                  WRITEMASK_XYZW);
}

/** Construct float[8] register */
static inline struct brw_reg
brw_vec8_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
{
   return brw_reg(file,
                  nr,
                  subnr,
                  0,
                  0,
                  BRW_REGISTER_TYPE_F,
                  BRW_VERTICAL_STRIDE_8,
                  BRW_WIDTH_8,
                  BRW_HORIZONTAL_STRIDE_1,
                  BRW_SWIZZLE_XYZW,
                  WRITEMASK_XYZW);
}

/** Construct float[4] register */
static inline struct brw_reg
brw_vec4_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
{
   return brw_reg(file,
                  nr,
                  subnr,
                  0,
                  0,
                  BRW_REGISTER_TYPE_F,
                  BRW_VERTICAL_STRIDE_4,
                  BRW_WIDTH_4,
                  BRW_HORIZONTAL_STRIDE_1,
                  BRW_SWIZZLE_XYZW,
                  WRITEMASK_XYZW);
}

/** Construct float[2] register */
static inline struct brw_reg
brw_vec2_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
{
   return brw_reg(file,
                  nr,
                  subnr,
                  0,
                  0,
                  BRW_REGISTER_TYPE_F,
                  BRW_VERTICAL_STRIDE_2,
                  BRW_WIDTH_2,
                  BRW_HORIZONTAL_STRIDE_1,
                  BRW_SWIZZLE_XYXY,
                  WRITEMASK_XY);
}

/** Construct float[1] register */
static inline struct brw_reg
brw_vec1_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
{
   return brw_reg(file,
                  nr,
                  subnr,
                  0,
                  0,
                  BRW_REGISTER_TYPE_F,
                  BRW_VERTICAL_STRIDE_0,
                  BRW_WIDTH_1,
                  BRW_HORIZONTAL_STRIDE_0,
                  BRW_SWIZZLE_XXXX,
                  WRITEMASK_X);
}

/**
 * Construct a float register of \p width elements by dispatching to the
 * matching brw_vecN_reg() helper.  \p width must be 1, 2, 4, 8 or 16.
 */
static inline struct brw_reg
brw_vecn_reg(unsigned width, enum brw_reg_file file,
             unsigned nr, unsigned subnr)
{
   switch (width) {
   case 1:
      return brw_vec1_reg(file, nr, subnr);
   case 2:
      return brw_vec2_reg(file, nr, subnr);
   case 4:
      return brw_vec4_reg(file, nr, subnr);
   case 8:
      return brw_vec8_reg(file, nr, subnr);
   case 16:
      return brw_vec16_reg(file, nr, subnr);
   default:
      unreachable("Invalid register width");
   }
}

/** Return a copy of \p reg with its type replaced by \p type. */
static inline struct brw_reg
retype(struct brw_reg reg, enum brw_reg_type type)
{
   reg.type = type;
   return reg;
}

/** Return \p reg unchanged. */
static inline struct brw_reg
firsthalf(struct brw_reg reg)
{
   return reg;
}

/**
 * Return \p reg advanced by one register number, unless its vertical stride
 * is zero, in which case it is returned unchanged.
 */
static inline struct brw_reg
sechalf(struct brw_reg reg)
{
   if (reg.vstride)
      reg.nr++;
   return reg;
}

/** Return \p reg advanced by \p delta whole registers. */
static inline struct brw_reg
offset(struct brw_reg reg, unsigned delta)
{
   reg.nr += delta;
   return reg;
}


/**
 * Return \p reg advanced by \p bytes bytes, carrying from subnr into nr in
 * units of REG_SIZE.
 */
static inline struct brw_reg
byte_offset(struct brw_reg reg, unsigned bytes)
{
   unsigned newoffset = reg.nr * REG_SIZE + reg.subnr + bytes;
   reg.nr = newoffset / REG_SIZE;
   reg.subnr = newoffset % REG_SIZE;
   return reg;
}

/** Return \p reg advanced by \p delta elements of its current type. */
static inline struct brw_reg
suboffset(struct brw_reg reg, unsigned delta)
{
   return byte_offset(reg, delta * type_sz(reg.type));
}

/** Construct unsigned word[16] register */
static inline struct brw_reg
brw_uw16_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
{
   /* Retype first so that subnr is counted in words, not floats. */
   return suboffset(retype(brw_vec16_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr);
}

/** Construct unsigned word[8] register */
static inline struct brw_reg
brw_uw8_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
{
   return suboffset(retype(brw_vec8_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr);
}

/** Construct unsigned word[1] register */
static inline struct brw_reg
brw_uw1_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
{
   return suboffset(retype(brw_vec1_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr);
}

/** Construct unsigned dword[1] register */
static inline struct brw_reg
brw_ud1_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
{
   return retype(brw_vec1_reg(file, nr, subnr), BRW_REGISTER_TYPE_UD);
}

static inline struct brw_reg
brw_imm_reg(enum brw_reg_type type)
{
return brw_reg(BRW_IMMEDIATE_VALUE, + 0, + 0, + 0, + 0, + type, + BRW_VERTICAL_STRIDE_0, + BRW_WIDTH_1, + BRW_HORIZONTAL_STRIDE_0, + 0, + 0); +} + +/** Construct double immediate register */ +static inline struct brw_reg +brw_imm_df(double df) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_DF); + imm.df = df; + return imm; +} + +/** Construct float immediate register */ +static inline struct brw_reg +brw_imm_f(float f) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_F); + imm.f = f; + return imm; +} + +/** Construct integer immediate register */ +static inline struct brw_reg +brw_imm_d(int d) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_D); + imm.d = d; + return imm; +} + +/** Construct uint immediate register */ +static inline struct brw_reg +brw_imm_ud(unsigned ud) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UD); + imm.ud = ud; + return imm; +} + +/** + * Construct ushort immediate register. + * + * The 16-bit value is replicated into both halves of the dword. The shift + * is done on a uint32_t because \a uw promotes to a signed int, and shifting + * a value with bit 15 set left by 16 would overflow it. + */ +static inline struct brw_reg +brw_imm_uw(uint16_t uw) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UW); + imm.ud = uw | ((uint32_t)uw << 16); + return imm; +} + +/** + * Construct short immediate register. + * + * The 16-bit pattern of \a w is replicated into both halves of the dword. + * The value is narrowed to uint16_t first so that a negative \a w is neither + * left-shifted as a negative int nor sign-extended over the upper half. + */ +static inline struct brw_reg +brw_imm_w(int16_t w) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_W); + imm.ud = (uint16_t)w | ((uint32_t)(uint16_t)w << 16); + return imm; +} + +/* brw_imm_b and brw_imm_ub aren't supported by hardware - the type + * numbers alias with _V and _VF below: + */ + +/** Construct vector of eight signed half-byte values */ +static inline struct brw_reg +brw_imm_v(unsigned v) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_V); + imm.ud = v; + return imm; +} + +/** Construct vector of eight unsigned half-byte values */ +static inline struct brw_reg +brw_imm_uv(unsigned uv) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UV); + imm.ud = uv; + return imm; +} + +/** Construct vector of four 8-bit float values */ +static inline struct brw_reg +brw_imm_vf(unsigned v) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_VF); + imm.ud = v; + return imm; +} +
+static inline struct brw_reg +brw_imm_vf4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_VF); + imm.vstride = BRW_VERTICAL_STRIDE_0; + imm.width = BRW_WIDTH_4; + imm.hstride = BRW_HORIZONTAL_STRIDE_1; + imm.ud = ((v0 << 0) | (v1 << 8) | (v2 << 16) | (v3 << 24)); + return imm; +} + + +static inline struct brw_reg +brw_address(struct brw_reg reg) +{ + return brw_imm_uw(reg.nr * REG_SIZE + reg.subnr); +} + +/** Construct float[1] general-purpose register */ +static inline struct brw_reg +brw_vec1_grf(unsigned nr, unsigned subnr) +{ + return brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +/** Construct float[2] general-purpose register */ +static inline struct brw_reg +brw_vec2_grf(unsigned nr, unsigned subnr) +{ + return brw_vec2_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +/** Construct float[4] general-purpose register */ +static inline struct brw_reg +brw_vec4_grf(unsigned nr, unsigned subnr) +{ + return brw_vec4_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +/** Construct float[8] general-purpose register */ +static inline struct brw_reg +brw_vec8_grf(unsigned nr, unsigned subnr) +{ + return brw_vec8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +/** Construct float[16] general-purpose register */ +static inline struct brw_reg +brw_vec16_grf(unsigned nr, unsigned subnr) +{ + return brw_vec16_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +static inline struct brw_reg +brw_vecn_grf(unsigned width, unsigned nr, unsigned subnr) +{ + return brw_vecn_reg(width, BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + + +static inline struct brw_reg +brw_uw8_grf(unsigned nr, unsigned subnr) +{ + return brw_uw8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +static inline struct brw_reg +brw_uw16_grf(unsigned nr, unsigned subnr) +{ + return brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + + +/** Construct null register (usually used for setting condition codes) */ +static inline struct brw_reg 
+brw_null_reg(void) +{ + return brw_vec8_reg(BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_NULL, 0); +} + +static inline struct brw_reg +brw_null_vec(unsigned width) +{ + return brw_vecn_reg(width, BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_NULL, 0); +} + +static inline struct brw_reg +brw_address_reg(unsigned subnr) +{ + return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_ADDRESS, subnr); +} + +/* If/else instructions break in align16 mode if writemask & swizzle + * aren't xyzw. This goes against the convention for other scalar + * regs: + */ +static inline struct brw_reg +brw_ip_reg(void) +{ + return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_IP, + 0, + 0, + 0, + BRW_REGISTER_TYPE_UD, + BRW_VERTICAL_STRIDE_4, /* ? */ + BRW_WIDTH_1, + BRW_HORIZONTAL_STRIDE_0, + BRW_SWIZZLE_XYZW, /* NOTE! */ + WRITEMASK_XYZW); /* NOTE! */ +} + +static inline struct brw_reg +brw_notification_reg(void) +{ + return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_NOTIFICATION_COUNT, + 0, + 0, + 0, + BRW_REGISTER_TYPE_UD, + BRW_VERTICAL_STRIDE_0, + BRW_WIDTH_1, + BRW_HORIZONTAL_STRIDE_0, + BRW_SWIZZLE_XXXX, + WRITEMASK_X); +} + +static inline struct brw_reg +brw_sr0_reg(unsigned subnr) +{ + return brw_ud1_reg(BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_STATE, subnr); +} + +static inline struct brw_reg +brw_acc_reg(unsigned width) +{ + return brw_vecn_reg(width, BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_ACCUMULATOR, 0); +} + +static inline struct brw_reg +brw_flag_reg(int reg, int subreg) +{ + return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_FLAG + reg, subreg); +} + +/** + * Return the mask register present in Gen4-5, or the related register present + * in Gen7.5 and later hardware referred to as "channel enable" register in + * the documentation. 
+ */ +static inline struct brw_reg +brw_mask_reg(unsigned subnr) +{ + return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_MASK, subnr); +} + +/** Construct the UD scalar at dword sub-register 3 of the state register */ +static inline struct brw_reg +brw_vmask_reg(void) +{ + return brw_sr0_reg(3); +} + +/** Construct the UD scalar at dword sub-register 2 of the state register */ +static inline struct brw_reg +brw_dmask_reg(void) +{ + return brw_sr0_reg(2); +} + +/** Construct float[8] message register \a nr */ +static inline struct brw_reg +brw_message_reg(unsigned nr) +{ + return brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, nr, 0); +} + +/** Construct a UD message register of the given \a width */ +static inline struct brw_reg +brw_uvec_mrf(unsigned width, unsigned nr, unsigned subnr) +{ + return retype(brw_vecn_reg(width, BRW_MESSAGE_REGISTER_FILE, nr, subnr), + BRW_REGISTER_TYPE_UD); +} + +/* This is almost always called with a numeric constant argument, so + * make things easy to evaluate at compile time: + * + * 0 maps to 0 and the powers of two 1..32 map to 1..6; any other value + * also yields 0. + */ +static inline unsigned cvt(unsigned val) +{ + switch (val) { + case 0: return 0; + case 1: return 1; + case 2: return 2; + case 4: return 3; + case 8: return 4; + case 16: return 5; + case 32: return 6; + } + return 0; +} + +/** + * Set the region of \a reg from plain element counts: both strides are + * encoded with cvt() and the width with cvt() - 1. + */ +static inline struct brw_reg +stride(struct brw_reg reg, unsigned vstride, unsigned width, unsigned hstride) +{ + reg.vstride = cvt(vstride); + reg.width = cvt(width) - 1; + reg.hstride = cvt(hstride); + return reg; +} + +/** + * Multiply the vertical and horizontal stride of a register by the given + * factor \a s.
+ */ +static inline struct brw_reg +spread(struct brw_reg reg, unsigned s) +{ + if (s) { + assert(_mesa_is_pow_two(s)); + + if (reg.hstride) + reg.hstride += cvt(s) - 1; + + if (reg.vstride) + reg.vstride += cvt(s) - 1; + + return reg; + } else { + return stride(reg, 0, 1, 0); + } +} + +static inline struct brw_reg +vec16(struct brw_reg reg) +{ + return stride(reg, 16,16,1); +} + +static inline struct brw_reg +vec8(struct brw_reg reg) +{ + return stride(reg, 8,8,1); +} + +static inline struct brw_reg +vec4(struct brw_reg reg) +{ + return stride(reg, 4,4,1); +} + +static inline struct brw_reg +vec2(struct brw_reg reg) +{ + return stride(reg, 2,2,1); +} + +static inline struct brw_reg +vec1(struct brw_reg reg) +{ + return stride(reg, 0,1,0); +} + + +static inline struct brw_reg +get_element(struct brw_reg reg, unsigned elt) +{ + return vec1(suboffset(reg, elt)); +} + +static inline struct brw_reg +get_element_ud(struct brw_reg reg, unsigned elt) +{ + return vec1(suboffset(retype(reg, BRW_REGISTER_TYPE_UD), elt)); +} + +static inline struct brw_reg +get_element_d(struct brw_reg reg, unsigned elt) +{ + return vec1(suboffset(retype(reg, BRW_REGISTER_TYPE_D), elt)); +} + +static inline struct brw_reg +brw_swizzle(struct brw_reg reg, unsigned swz) +{ + if (reg.file == BRW_IMMEDIATE_VALUE) + reg.ud = brw_swizzle_immediate(reg.type, reg.ud, swz); + else + reg.swizzle = brw_compose_swizzle(swz, reg.swizzle); + + return reg; +} + +static inline struct brw_reg +brw_writemask(struct brw_reg reg, unsigned mask) +{ + assert(reg.file != BRW_IMMEDIATE_VALUE); + reg.writemask &= mask; + return reg; +} + +static inline struct brw_reg +brw_set_writemask(struct brw_reg reg, unsigned mask) +{ + assert(reg.file != BRW_IMMEDIATE_VALUE); + reg.writemask = mask; + return reg; +} + +static inline unsigned +brw_writemask_for_size(unsigned n) +{ + return (1 << n) - 1; +} + +static inline unsigned +brw_writemask_for_component_packing(unsigned n, unsigned first_component) +{ + 
assert(first_component + n <= 4); + return (((1 << n) - 1) << first_component); +} + +static inline struct brw_reg +negate(struct brw_reg reg) +{ + reg.negate ^= 1; + return reg; +} + +static inline struct brw_reg +brw_abs(struct brw_reg reg) +{ + reg.abs = 1; + reg.negate = 0; + return reg; +} + +/************************************************************************/ + +static inline struct brw_reg +brw_vec4_indirect(unsigned subnr, int offset) +{ + struct brw_reg reg = brw_vec4_grf(0, 0); + reg.subnr = subnr; + reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER; + reg.indirect_offset = offset; + return reg; +} + +static inline struct brw_reg +brw_vec1_indirect(unsigned subnr, int offset) +{ + struct brw_reg reg = brw_vec1_grf(0, 0); + reg.subnr = subnr; + reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER; + reg.indirect_offset = offset; + return reg; +} + +static inline struct brw_reg +brw_VxH_indirect(unsigned subnr, int offset) +{ + struct brw_reg reg = brw_vec1_grf(0, 0); + reg.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL; + reg.subnr = subnr; + reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER; + reg.indirect_offset = offset; + return reg; +} + +static inline struct brw_reg +deref_4f(struct brw_indirect ptr, int offset) +{ + return brw_vec4_indirect(ptr.addr_subnr, ptr.addr_offset + offset); +} + +static inline struct brw_reg +deref_1f(struct brw_indirect ptr, int offset) +{ + return brw_vec1_indirect(ptr.addr_subnr, ptr.addr_offset + offset); +} + +static inline struct brw_reg +deref_4b(struct brw_indirect ptr, int offset) +{ + return retype(deref_4f(ptr, offset), BRW_REGISTER_TYPE_B); +} + +static inline struct brw_reg +deref_1uw(struct brw_indirect ptr, int offset) +{ + return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_UW); +} + +static inline struct brw_reg +deref_1d(struct brw_indirect ptr, int offset) +{ + return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_D); +} + +static inline struct brw_reg +deref_1ud(struct 
brw_indirect ptr, int offset) +{ + return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_UD); +} + +static inline struct brw_reg +get_addr_reg(struct brw_indirect ptr) +{ + return brw_address_reg(ptr.addr_subnr); +} + +static inline struct brw_indirect +brw_indirect_offset(struct brw_indirect ptr, int offset) +{ + ptr.addr_offset += offset; + return ptr; +} + +static inline struct brw_indirect +brw_indirect(unsigned addr_subnr, int offset) +{ + struct brw_indirect ptr; + ptr.addr_subnr = addr_subnr; + ptr.addr_offset = offset; + ptr.pad = 0; + return ptr; +} + +static inline bool +region_matches(struct brw_reg reg, enum brw_vertical_stride v, + enum brw_width w, enum brw_horizontal_stride h) +{ + return reg.vstride == v && + reg.width == w && + reg.hstride == h; +} + +#define has_scalar_region(reg) \ + region_matches(reg, BRW_VERTICAL_STRIDE_0, BRW_WIDTH_1, \ + BRW_HORIZONTAL_STRIDE_0) + +/* brw_packed_float.c */ +int brw_float_to_vf(float f); +float brw_vf_to_float(unsigned char vf); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp new file mode 100644 index 00000000000..b3f7e877c80 --- /dev/null +++ b/src/intel/compiler/brw_schedule_instructions.cpp @@ -0,0 +1,1753 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <[email protected]> + * + */ + +#include "brw_fs.h" +#include "brw_fs_live_variables.h" +#include "brw_vec4.h" +#include "brw_cfg.h" +#include "brw_shader.h" + +using namespace brw; + +/** @file brw_schedule_instructions.cpp + * + * List scheduling of FS instructions. + * + * The basic model of the list scheduler is to take a basic block, + * compute a DAG of the dependencies (RAW ordering with latency, WAW + * ordering with latency, WAR ordering), and make a list of the DAG heads. + * Heuristically pick a DAG head, then put all the children that are + * now DAG heads into the list of things to schedule. + * + * The heuristic is the important part. We're trying to be cheap, + * since actually computing the optimal scheduling is NP-complete. + * What we do is track a "current clock". When we schedule a node, we + * update the earliest-unblocked clock time of its children, and + * increment the clock. Then, when trying to schedule, we just pick + * the earliest-unblocked instruction to schedule. + * + * Note that often there will be many things which could execute + * immediately, and there is a range of heuristic options to choose + * from in picking among those.
+ */ + +static bool debug = false; + +class instruction_scheduler; + +class schedule_node : public exec_node +{ +public: + schedule_node(backend_instruction *inst, instruction_scheduler *sched); + void set_latency_gen4(); + void set_latency_gen7(bool is_haswell); + + backend_instruction *inst; + schedule_node **children; + int *child_latency; + int child_count; + int parent_count; + int child_array_size; + int unblocked_time; + int latency; + + /** + * Which iteration of pushing groups of children onto the candidates list + * this node was a part of. + */ + unsigned cand_generation; + + /** + * This is the sum of the instruction's latency plus the maximum delay of + * its children, or just the issue_time if it's a leaf node. + */ + int delay; + + /** + * Preferred exit node among the (direct or indirect) successors of this + * node. Among the scheduler nodes blocked by this node, this will be the + * one that may cause earliest program termination, or NULL if none of the + * successors is an exit node. + */ + schedule_node *exit; + + bool is_barrier; +}; + +/** + * Lower bound of the scheduling time after which one of the instructions + * blocked by this node may lead to program termination. + * + * exit_unblocked_time() determines a strict partial ordering relation '«' on + * the set of scheduler nodes as follows: + * + * n « m <-> exit_unblocked_time(n) < exit_unblocked_time(m) + * + * which can be used to heuristically order nodes according to how early they + * can unblock an exit node and lead to program termination. + */ +static inline int +exit_unblocked_time(const schedule_node *n) +{ + return n->exit ? 
n->exit->unblocked_time : INT_MAX; +} + +void +schedule_node::set_latency_gen4() +{ + int chans = 8; + int math_latency = 22; + + switch (inst->opcode) { + case SHADER_OPCODE_RCP: + this->latency = 1 * chans * math_latency; + break; + case SHADER_OPCODE_RSQ: + this->latency = 2 * chans * math_latency; + break; + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_LOG2: + /* full precision log. partial is 2. */ + this->latency = 3 * chans * math_latency; + break; + case SHADER_OPCODE_INT_REMAINDER: + case SHADER_OPCODE_EXP2: + /* full precision. partial is 3, same throughput. */ + this->latency = 4 * chans * math_latency; + break; + case SHADER_OPCODE_POW: + this->latency = 8 * chans * math_latency; + break; + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + /* minimum latency, max is 12 rounds. */ + this->latency = 5 * chans * math_latency; + break; + default: + this->latency = 2; + break; + } +} + +void +schedule_node::set_latency_gen7(bool is_haswell) +{ + switch (inst->opcode) { + case BRW_OPCODE_MAD: + /* 2 cycles + * (since the last two src operands are in different register banks): + * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q }; + * + * 3 cycles on IVB, 4 on HSW + * (since the last two src operands are in the same register bank): + * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q }; + * + * 18 cycles on IVB, 16 on HSW + * (since the last two src operands are in different register banks): + * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q }; + * mov(8) null g4<4,5,1>F { align16 WE_normal 1Q }; + * + * 20 cycles on IVB, 18 on HSW + * (since the last two src operands are in the same register bank): + * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q }; + * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q }; + */ + + /* Our register allocator doesn't know about register banks, so use the + * higher 
latency. + */ + latency = is_haswell ? 16 : 18; + break; + + case BRW_OPCODE_LRP: + /* 2 cycles + * (since the last two src operands are in different register banks): + * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q }; + * + * 3 cycles on IVB, 4 on HSW + * (since the last two src operands are in the same register bank): + * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q }; + * + * 16 cycles on IVB, 14 on HSW + * (since the last two src operands are in different register banks): + * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q }; + * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q }; + * + * 16 cycles + * (since the last two src operands are in the same register bank): + * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q }; + * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q }; + */ + + /* Our register allocator doesn't know about register banks, so use the + * higher latency. + */ + latency = 14; + break; + + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + /* 2 cycles: + * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q }; + * + * 18 cycles: + * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + * + * Same for exp2, log2, rsq, sqrt, sin, cos. + */ + latency = is_haswell ? 14 : 16; + break; + + case SHADER_OPCODE_POW: + /* 2 cycles: + * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q }; + * + * 26 cycles: + * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + */ + latency = is_haswell ? 
22 : 24; + break; + + case SHADER_OPCODE_TEX: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_LZ: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXL_LZ: + /* 18 cycles: + * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; + * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; + * send(8) g4<1>UW g114<8,8,1>F + * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; + * + * 697 +/-49 cycles (min 610, n=26): + * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; + * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; + * send(8) g4<1>UW g114<8,8,1>F + * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + * + * So the latency on our first texture load of the batchbuffer takes + * ~700 cycles, since the caches are cold at that point. + * + * 840 +/- 92 cycles (min 720, n=25): + * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; + * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; + * send(8) g4<1>UW g114<8,8,1>F + * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + * send(8) g4<1>UW g114<8,8,1>F + * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + * + * On the second load, it takes just an extra ~140 cycles, and after + * accounting for the 14 cycles of the MOV's latency, that makes ~130. + * + * 683 +/- 49 cycles (min = 602, n=47): + * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; + * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; + * send(8) g4<1>UW g114<8,8,1>F + * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; + * send(8) g50<1>UW g114<8,8,1>F + * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + * + * The unit appears to be pipelined, since this matches up with the + * cache-cold case, despite there being two loads here. 
If you replace + * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39). + * + * So, take some number between the cache-hot 140 cycles and the + * cache-cold 700 cycles. No particular tuning was done on this. + * + * I haven't done significant testing of the non-TEX opcodes. TXL at + * least looked about the same as TEX. + */ + latency = 200; + break; + + case SHADER_OPCODE_TXS: + /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41 + * cycles (n=15): + * mov(8) g114<1>UD 0D { align1 WE_normal 1Q }; + * send(8) g6<1>UW g114<8,8,1>F + * sampler (10, 0, 10, 1) mlen 1 rlen 4 { align1 WE_normal 1Q }; + * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1Q }; + * + * + * Two loads was 535 +/- 30 cycles (n=19): + * mov(16) g114<1>UD 0D { align1 WE_normal 1H }; + * send(16) g6<1>UW g114<8,8,1>F + * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H }; + * mov(16) g114<1>UD 0D { align1 WE_normal 1H }; + * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1H }; + * send(16) g8<1>UW g114<8,8,1>F + * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H }; + * mov(16) g8<1>F g8<8,8,1>D { align1 WE_normal 1H }; + * add(16) g6<1>F g6<8,8,1>F g8<8,8,1>F { align1 WE_normal 1H }; + * + * Since the only caches that should matter are just the + * instruction/state cache containing the surface state, assume that we + * always have hot caches. 
+ */ + latency = 100; + break; + + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4: + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7: + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7: + case VS_OPCODE_PULL_CONSTANT_LOAD: + /* testing using varying-index pull constants: + * + * 16 cycles: + * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q }; + * send(8) g4<1>F g4<8,8,1>D + * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q }; + * + * ~480 cycles: + * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q }; + * send(8) g4<1>F g4<8,8,1>D + * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + * + * ~620 cycles: + * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q }; + * send(8) g4<1>F g4<8,8,1>D + * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + * send(8) g4<1>F g4<8,8,1>D + * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + * + * So, if it's cache-hot, it's about 140. If it's cache cold, it's + * about 460. We expect to mostly be cache hot, so pick something more + * in that direction. + */ + latency = 200; + break; + + case SHADER_OPCODE_GEN7_SCRATCH_READ: + /* Testing a load from offset 0, that had been previously written: + * + * send(8) g114<1>UW g0<8,8,1>F data (0, 0, 0) mlen 1 rlen 1 { align1 WE_normal 1Q }; + * mov(8) null g114<8,8,1>F { align1 WE_normal 1Q }; + * + * The cycles spent seemed to be grouped around 40-50 (as low as 38), + * then around 140. Presumably this is cache hit vs miss. 
+ */ + latency = 50; + break; + + case SHADER_OPCODE_UNTYPED_ATOMIC: + case SHADER_OPCODE_TYPED_ATOMIC: + /* Test code: + * mov(8) g112<1>ud 0x00000000ud { align1 WE_all 1Q }; + * mov(1) g112.7<1>ud g1.7<0,1,0>ud { align1 WE_all }; + * mov(8) g113<1>ud 0x00000000ud { align1 WE_normal 1Q }; + * send(8) g4<1>ud g112<8,8,1>ud + * data (38, 5, 6) mlen 2 rlen 1 { align1 WE_normal 1Q }; + * + * Running it 100 times as fragment shader on a 128x128 quad + * gives an average latency of 13867 cycles per atomic op, + * standard deviation 3%. Note that this is a rather + * pessimistic estimate, the actual latency in cases with few + * collisions between threads and favorable pipelining has been + * seen to be reduced by a factor of 100. + */ + latency = 14000; + break; + + case SHADER_OPCODE_UNTYPED_SURFACE_READ: + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: + case SHADER_OPCODE_TYPED_SURFACE_READ: + case SHADER_OPCODE_TYPED_SURFACE_WRITE: + /* Test code: + * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q }; + * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all }; + * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q }; + * send(8) g4<1>UD g112<8,8,1>UD + * data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q }; + * . + * . [repeats 8 times] + * . + * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q }; + * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all }; + * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q }; + * send(8) g4<1>UD g112<8,8,1>UD + * data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q }; + * + * Running it 100 times as fragment shader on a 128x128 quad + * gives an average latency of 583 cycles per surface read, + * standard deviation 0.9%. + */ + latency = is_haswell ? 
300 : 600; + break; + + default: + /* 2 cycles: + * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q }; + * + * 16 cycles: + * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q }; + * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; + */ + latency = 14; + break; + } +} + +class instruction_scheduler { +public: + instruction_scheduler(backend_shader *s, int grf_count, + int hw_reg_count, int block_count, + instruction_scheduler_mode mode) + { + this->bs = s; + this->mem_ctx = ralloc_context(NULL); + this->grf_count = grf_count; + this->hw_reg_count = hw_reg_count; + this->instructions.make_empty(); + this->instructions_to_schedule = 0; + this->post_reg_alloc = (mode == SCHEDULE_POST); + this->mode = mode; + if (!post_reg_alloc) { + this->reg_pressure_in = rzalloc_array(mem_ctx, int, block_count); + + this->livein = ralloc_array(mem_ctx, BITSET_WORD *, block_count); + for (int i = 0; i < block_count; i++) + this->livein[i] = rzalloc_array(mem_ctx, BITSET_WORD, + BITSET_WORDS(grf_count)); + + this->liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count); + for (int i = 0; i < block_count; i++) + this->liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD, + BITSET_WORDS(grf_count)); + + this->hw_liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count); + for (int i = 0; i < block_count; i++) + this->hw_liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD, + BITSET_WORDS(hw_reg_count)); + + this->written = rzalloc_array(mem_ctx, bool, grf_count); + + this->reads_remaining = rzalloc_array(mem_ctx, int, grf_count); + + this->hw_reads_remaining = rzalloc_array(mem_ctx, int, hw_reg_count); + } else { + this->reg_pressure_in = NULL; + this->livein = NULL; + this->liveout = NULL; + this->hw_liveout = NULL; + this->written = NULL; + this->reads_remaining = NULL; + this->hw_reads_remaining = NULL; + } + } + + ~instruction_scheduler() + { + ralloc_free(this->mem_ctx); + } + void add_barrier_deps(schedule_node *n); + void add_dep(schedule_node *before, schedule_node *after, int 
latency); + void add_dep(schedule_node *before, schedule_node *after); + /* run() is the per-CFG entry point; the functions below it are the per-block steps it calls. */ + void run(cfg_t *cfg); + void add_insts_from_block(bblock_t *block); + void compute_delays(); + void compute_exits(); + virtual void calculate_deps() = 0; + virtual schedule_node *choose_instruction_to_schedule() = 0; + + /** + * Returns how many cycles it takes the instruction to issue. + * + * Instructions in gen hardware are handled one simd4 vector at a time, + * with 1 cycle per vector dispatched. Thus SIMD8 pixel shaders take 2 + * cycles to dispatch and SIMD16 (compressed) instructions take 4. + */ + virtual int issue_time(backend_instruction *inst) = 0; + /* Register-pressure bookkeeping hooks supplied by the subclasses. */ + virtual void count_reads_remaining(backend_instruction *inst) = 0; + virtual void setup_liveness(cfg_t *cfg) = 0; + virtual void update_register_pressure(backend_instruction *inst) = 0; + virtual int get_register_pressure_benefit(backend_instruction *inst) = 0; + + void schedule_instructions(bblock_t *block); + + void *mem_ctx; /* ralloc context for the schedule nodes and tracking arrays */ + + bool post_reg_alloc; /* set from mode == SCHEDULE_POST */ + int instructions_to_schedule; /* instructions of the current block not yet scheduled */ + int grf_count; + int hw_reg_count; + int reg_pressure; /* running estimate while a block is being scheduled */ + int block_idx; /* block->num of the block being scheduled */ + exec_list instructions; /* list of schedule_node for the current block */ + backend_shader *bs; + + instruction_scheduler_mode mode; + + /* + * The register pressure at the beginning of each basic block. + */ + + int *reg_pressure_in; + + /* + * The virtual GRF's whose range overlaps the beginning of each basic block. + */ + + BITSET_WORD **livein; + + /* + * The virtual GRF's whose range overlaps the end of each basic block. + */ + + BITSET_WORD **liveout; + + /* + * The hardware GRF's whose range overlaps the end of each basic block. + */ + + BITSET_WORD **hw_liveout; + + /* + * Whether we've scheduled a write for this virtual GRF yet. + */ + + bool *written; + + /* + * How many reads we haven't scheduled for this virtual GRF yet. + */ + + int *reads_remaining; + + /* + * How many reads we haven't scheduled for this hardware GRF yet. 
+ */ + + int *hw_reads_remaining; +}; + /** Scheduler for the fs_inst instructions of an fs_visitor. */ +class fs_instruction_scheduler : public instruction_scheduler +{ +public: + fs_instruction_scheduler(fs_visitor *v, int grf_count, int hw_reg_count, + int block_count, + instruction_scheduler_mode mode); + void calculate_deps(); + bool is_compressed(fs_inst *inst); + schedule_node *choose_instruction_to_schedule(); + int issue_time(backend_instruction *inst); + fs_visitor *v; + + void count_reads_remaining(backend_instruction *inst); + void setup_liveness(cfg_t *cfg); + void update_register_pressure(backend_instruction *inst); + int get_register_pressure_benefit(backend_instruction *inst); +}; + /* Forwards all arguments to the base class and keeps the visitor in v. */ +fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v, + int grf_count, int hw_reg_count, + int block_count, + instruction_scheduler_mode mode) + : instruction_scheduler(v, grf_count, hw_reg_count, block_count, mode), + v(v) +{ +} + /** Returns true when source number src of inst equals one of the sources before it. */ +static bool +is_src_duplicate(fs_inst *inst, int src) +{ + for (int i = 0; i < src; i++) + if (inst->src[i].equals(inst->src[src])) + return true; + + return false; +} + /** + * Adds the reads performed by one instruction to reads_remaining (VGRF + * sources) and hw_reads_remaining (FIXED_GRF sources whose nr is below + * hw_reg_count, one count per register read). A source that repeats an + * earlier one is skipped. Does nothing when reads_remaining is NULL. + */ +void +fs_instruction_scheduler::count_reads_remaining(backend_instruction *be) +{ + fs_inst *inst = (fs_inst *)be; + + if (!reads_remaining) + return; + + for (int i = 0; i < inst->sources; i++) { + if (is_src_duplicate(inst, i)) + continue; + + if (inst->src[i].file == VGRF) { + reads_remaining[inst->src[i].nr]++; + } else if (inst->src[i].file == FIXED_GRF) { + if (inst->src[i].nr >= hw_reg_count) + continue; + + for (unsigned j = 0; j < regs_read(inst, i); j++) + hw_reads_remaining[inst->src[i].nr + j]++; + } + } +} + /** Fills livein, liveout, hw_liveout and reg_pressure_in for every block of cfg. */ +void +fs_instruction_scheduler::setup_liveness(cfg_t *cfg) +{ + /* First, compute liveness on a per-GRF level using the in/out sets from + * liveness calculation. 
+ */ + for (int block = 0; block < cfg->num_blocks; block++) { + for (int i = 0; i < v->live_intervals->num_vars; i++) { + if (BITSET_TEST(v->live_intervals->block_data[block].livein, i)) { + int vgrf = v->live_intervals->vgrf_from_var[i]; + if (!BITSET_TEST(livein[block], vgrf)) { /* add each VGRF size to the pressure only once */ + reg_pressure_in[block] += v->alloc.sizes[vgrf]; + BITSET_SET(livein[block], vgrf); + } + } + + if (BITSET_TEST(v->live_intervals->block_data[block].liveout, i)) + BITSET_SET(liveout[block], v->live_intervals->vgrf_from_var[i]); + } + } + + /* Now, extend the live in/live out sets for when a range crosses a block + * boundary, which matches what our register allocator/interference code + * does to account for force_writemask_all and incompatible exec_mask's. + */ + for (int block = 0; block < cfg->num_blocks - 1; block++) { + for (int i = 0; i < grf_count; i++) { + if (v->virtual_grf_start[i] <= cfg->blocks[block]->end_ip && + v->virtual_grf_end[i] >= cfg->blocks[block + 1]->start_ip) { + if (!BITSET_TEST(livein[block + 1], i)) { + reg_pressure_in[block + 1] += v->alloc.sizes[i]; + BITSET_SET(livein[block + 1], i); + } + + BITSET_SET(liveout[block], i); + } + } + } + /* Finally the first hw_reg_count hardware registers: for each one with a + * recorded last-use ip, add one to reg_pressure_in of every block that + * starts at or before that ip, and set its hw_liveout bit in every block + * that ends at or before that ip. + */ + int payload_last_use_ip[hw_reg_count]; + v->calculate_payload_ranges(hw_reg_count, payload_last_use_ip); + + for (int i = 0; i < hw_reg_count; i++) { + if (payload_last_use_ip[i] == -1) + continue; + + for (int block = 0; block < cfg->num_blocks; block++) { + if (cfg->blocks[block]->start_ip <= payload_last_use_ip[i]) + reg_pressure_in[block]++; + + if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i]) + BITSET_SET(hw_liveout[block], i); + } + } +} + /** + * Records that an instruction has been scheduled: marks a VGRF destination + * as written and decrements the read counters for its sources, mirroring + * the increments done by count_reads_remaining(). Does nothing when + * reads_remaining is NULL. + */ +void +fs_instruction_scheduler::update_register_pressure(backend_instruction *be) +{ + fs_inst *inst = (fs_inst *)be; + + if (!reads_remaining) + return; + + if (inst->dst.file == VGRF) { + written[inst->dst.nr] = true; + } + + for (int i = 0; i < inst->sources; i++) { + if (is_src_duplicate(inst, i)) + continue; + + if (inst->src[i].file == VGRF) { + 
reads_remaining[inst->src[i].nr]--; + } else if (inst->src[i].file == FIXED_GRF && + inst->src[i].nr < hw_reg_count) { + for (unsigned off = 0; off < regs_read(inst, i); off++) + hw_reads_remaining[inst->src[i].nr + off]--; + } + } +} + /** + * Estimates how much scheduling this instruction next would lower the + * register pressure of the current block (block_idx). The result drops by + * the size of a VGRF destination that is neither live-in nor written yet, + * and grows for every source holding the single remaining read of a + * register that is not live-out. + */ +int +fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be) +{ + fs_inst *inst = (fs_inst *)be; + int benefit = 0; + + if (inst->dst.file == VGRF) { + if (!BITSET_TEST(livein[block_idx], inst->dst.nr) && + !written[inst->dst.nr]) + benefit -= v->alloc.sizes[inst->dst.nr]; + } + + for (int i = 0; i < inst->sources; i++) { + if (is_src_duplicate(inst, i)) + continue; + + if (inst->src[i].file == VGRF && + !BITSET_TEST(liveout[block_idx], inst->src[i].nr) && + reads_remaining[inst->src[i].nr] == 1) + benefit += v->alloc.sizes[inst->src[i].nr]; + /* Hardware registers below hw_reg_count count as one each. */ + if (inst->src[i].file == FIXED_GRF && + inst->src[i].nr < hw_reg_count) { + for (unsigned off = 0; off < regs_read(inst, i); off++) { + int reg = inst->src[i].nr + off; + if (!BITSET_TEST(hw_liveout[block_idx], reg) && + hw_reads_remaining[reg] == 1) { + benefit++; + } + } + } + } + + return benefit; +} + /** + * Scheduler for the vec4_instruction stream of a vec4_visitor. It is always + * constructed in SCHEDULE_POST mode, and its register-pressure hooks are + * empty. + */ +class vec4_instruction_scheduler : public instruction_scheduler +{ +public: + vec4_instruction_scheduler(vec4_visitor *v, int grf_count); + void calculate_deps(); + schedule_node *choose_instruction_to_schedule(); + int issue_time(backend_instruction *inst); + vec4_visitor *v; + + void count_reads_remaining(backend_instruction *inst); + void setup_liveness(cfg_t *cfg); + void update_register_pressure(backend_instruction *inst); + int get_register_pressure_benefit(backend_instruction *inst); +}; + /* Passes 0 for hw_reg_count and block_count together with SCHEDULE_POST. */ +vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v, + int grf_count) + : instruction_scheduler(v, grf_count, 0, 0, SCHEDULE_POST), + v(v) +{ +} + /* Intentionally empty. */ +void +vec4_instruction_scheduler::count_reads_remaining(backend_instruction *be) +{ +} + /* Intentionally empty. */ +void +vec4_instruction_scheduler::setup_liveness(cfg_t *cfg) +{ +} + +void 
+vec4_instruction_scheduler::update_register_pressure(backend_instruction *be) +{ +} + /* Always reports no benefit. */ +int +vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be) +{ + return 0; +} + /** + * Creates the scheduling node for one instruction, with no children or + * parents yet, and selects its latency. + */ +schedule_node::schedule_node(backend_instruction *inst, + instruction_scheduler *sched) +{ + const struct gen_device_info *devinfo = sched->bs->devinfo; + + this->inst = inst; + this->child_array_size = 0; + this->children = NULL; + this->child_latency = NULL; + this->child_count = 0; + this->parent_count = 0; + this->unblocked_time = 0; + this->cand_generation = 0; + this->delay = 0; + this->exit = NULL; + this->is_barrier = false; + + /* We can't measure Gen6 timings directly but expect them to be much + * closer to Gen7 than Gen4. Before register allocation every node + * simply gets a latency of 1. + */ + if (!sched->post_reg_alloc) + this->latency = 1; + else if (devinfo->gen >= 6) + set_latency_gen7(devinfo->is_haswell); + else + set_latency_gen4(); +} + /** + * Allocates a schedule_node from mem_ctx for every instruction of the block, + * appends it to the instructions list, and sets instructions_to_schedule + * from the ip range of the block. + */ +void +instruction_scheduler::add_insts_from_block(bblock_t *block) +{ + foreach_inst_in_block(backend_instruction, inst, block) { + schedule_node *n = new(mem_ctx) schedule_node(inst, this); + + instructions.push_tail(n); + } + + this->instructions_to_schedule = block->end_ip - block->start_ip + 1; +} + +/** Computation of the delay member of each node. The list is walked in + * reverse: a node without children gets its issue time, any other node the + * maximum over its children of its own latency plus the delay of the child. + */ +void +instruction_scheduler::compute_delays() +{ + foreach_in_list_reverse(schedule_node, n, &instructions) { + if (!n->child_count) { + n->delay = issue_time(n->inst); + } else { + for (int i = 0; i < n->child_count; i++) { + assert(n->children[i]->delay); + n->delay = MAX2(n->delay, n->latency + n->children[i]->delay); + } + } + } +} + /** Computes the unblocked_time lower bound and the exit member of each node. */ +void +instruction_scheduler::compute_exits() +{ + /* Calculate a lower bound of the scheduling time of each node in the + * graph. This is analogous to the node's critical path but calculated + * from the top instead of from the bottom of the block. 
+ */ + foreach_in_list(schedule_node, n, &instructions) { + for (int i = 0; i < n->child_count; i++) { /* push the bound of n down to each child */ + n->children[i]->unblocked_time = + MAX2(n->children[i]->unblocked_time, + n->unblocked_time + issue_time(n->inst) + n->child_latency[i]); + } + } + + /* Calculate the exit of each node by induction based on the exit nodes of + * its children. The preferred exit of a node is the one among the exit + * nodes of its children which can be unblocked first according to the + * optimistic unblocked time estimate calculated above. + * + * A FS_OPCODE_DISCARD_JUMP node starts out as its own exit and every + * other node starts with none. exit_unblocked_time() is defined outside + * this part of the file. + */ + foreach_in_list_reverse(schedule_node, n, &instructions) { + n->exit = (n->inst->opcode == FS_OPCODE_DISCARD_JUMP ? n : NULL); + + for (int i = 0; i < n->child_count; i++) { + if (exit_unblocked_time(n->children[i]) < exit_unblocked_time(n)) + n->exit = n->children[i]->exit; + } + } +} + +/** + * Add a dependency between two instruction nodes. + * + * The @after node will be scheduled after @before. We will try to + * schedule it @latency cycles after @before, but no guarantees there. 
+ */ +void +instruction_scheduler::add_dep(schedule_node *before, schedule_node *after, + int latency) +{ + if (!before || !after) /* a missing endpoint means there is nothing to order */ + return; + + assert(before != after); + /* If the edge already exists, only raise its latency. */ + for (int i = 0; i < before->child_count; i++) { + if (before->children[i] == after) { + before->child_latency[i] = MAX2(before->child_latency[i], latency); + return; + } + } + /* Grow both child arrays when full: to 16 entries at first, then doubling. */ + if (before->child_array_size <= before->child_count) { + if (before->child_array_size < 16) + before->child_array_size = 16; + else + before->child_array_size *= 2; + + before->children = reralloc(mem_ctx, before->children, + schedule_node *, + before->child_array_size); + before->child_latency = reralloc(mem_ctx, before->child_latency, + int, before->child_array_size); + } + + before->children[before->child_count] = after; + before->child_latency[before->child_count] = latency; + before->child_count++; + after->parent_count++; +} + /** Overload that uses before->latency as the edge latency; a NULL before is ignored. */ +void +instruction_scheduler::add_dep(schedule_node *before, schedule_node *after) +{ + if (!before) + return; + + add_dep(before, after, before->latency); +} + +/** + * Sometimes we really want this node to execute after everything that + * was before it and before everything that followed it. This adds + * the deps to do so. + * + * All edges are added with latency 0, and the walk in each direction stops + * after linking to the nearest node already marked is_barrier. + */ +void +instruction_scheduler::add_barrier_deps(schedule_node *n) +{ + schedule_node *prev = (schedule_node *)n->prev; + schedule_node *next = (schedule_node *)n->next; + + n->is_barrier = true; + + if (prev) { + while (!prev->is_head_sentinel()) { + add_dep(prev, n, 0); + if (prev->is_barrier) + break; + prev = (schedule_node *)prev->prev; + } + } + + if (next) { + while (!next->is_tail_sentinel()) { + add_dep(n, next, 0); + if (next->is_barrier) + break; + next = (schedule_node *)next->next; + } + } +} + +/* instruction scheduling needs to be aware of when an MRF write + * actually writes 2 MRFs. 
+ */ +bool +fs_instruction_scheduler::is_compressed(fs_inst *inst) +{ + return inst->exec_size == 16; +} + /** + * Returns true for instructions that get barrier dependencies: the + * placeholder halt, control flow, and anything with side effects. + */ +static bool +is_scheduling_barrier(const fs_inst *inst) +{ + return inst->opcode == FS_OPCODE_PLACEHOLDER_HALT || + inst->is_control_flow() || + inst->has_side_effects(); +} + /** + * Builds the dependency edges between the nodes of the current block in two + * passes: a forward walk adding read-after-write and write-after-write + * edges, then a reverse walk adding write-after-read edges. + */ +void +fs_instruction_scheduler::calculate_deps() +{ + /* Pre-register-allocation, this tracks the last write per VGRF offset. + * After register allocation, reg_offsets are gone and we track individual + * GRF registers. The array reserves 16 slots per register. + */ + schedule_node *last_grf_write[grf_count * 16]; + schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->gen)]; + schedule_node *last_conditional_mod[4] = {}; + schedule_node *last_accumulator_write = NULL; + /* Fixed HW registers are assumed to be separate from the virtual + * GRFs, so they can be tracked separately. We don't really write + * to fixed GRFs much, so don't bother tracking them on a more + * granular level. + */ + schedule_node *last_fixed_grf_write = NULL; + + memset(last_grf_write, 0, sizeof(last_grf_write)); + memset(last_mrf_write, 0, sizeof(last_mrf_write)); + + /* top-to-bottom dependencies: RAW and WAW. */ + foreach_in_list(schedule_node, n, &instructions) { + fs_inst *inst = (fs_inst *)n->inst; + + if (is_scheduling_barrier(inst)) + add_barrier_deps(n); + + /* read-after-write deps. 
*/ + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF) { + if (post_reg_alloc) { + for (unsigned r = 0; r < regs_read(inst, i); r++) + add_dep(last_grf_write[inst->src[i].nr + r], n); + } else { /* pre-RA index: nr * 16 plus the register offset within the VGRF */ + for (unsigned r = 0; r < regs_read(inst, i); r++) { + add_dep(last_grf_write[inst->src[i].nr * 16 + + inst->src[i].offset / REG_SIZE + r], n); + } + } + } else if (inst->src[i].file == FIXED_GRF) { + if (post_reg_alloc) { + for (unsigned r = 0; r < regs_read(inst, i); r++) + add_dep(last_grf_write[inst->src[i].nr + r], n); + } else { + add_dep(last_fixed_grf_write, n); + } + } else if (inst->src[i].is_accumulator()) { + add_dep(last_accumulator_write, n); + } else if (inst->src[i].file == ARF) { /* any other ARF source makes n a barrier */ + add_barrier_deps(n); + } + } + /* Depend on the last writer of each of the mlen MRFs starting at base_mrf. */ + if (inst->base_mrf != -1) { + for (int i = 0; i < inst->mlen; i++) { + /* It looks like the MRF regs are released in the send + * instruction once it's sent, not when the result comes + * back. + */ + add_dep(last_mrf_write[inst->base_mrf + i], n); + } + } + /* Each set bit of the flags_read() mask selects one last_conditional_mod slot. */ + if (const unsigned mask = inst->flags_read(v->devinfo)) { + assert(mask < (1 << ARRAY_SIZE(last_conditional_mod))); + + for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) { + if (mask & (1 << i)) + add_dep(last_conditional_mod[i], n); + } + } + + if (inst->reads_accumulator_implicitly()) { + add_dep(last_accumulator_write, n); + } + + /* write-after-write deps. 
*/ + if (inst->dst.file == VGRF) { + if (post_reg_alloc) { + for (unsigned r = 0; r < regs_written(inst); r++) { + add_dep(last_grf_write[inst->dst.nr + r], n); + last_grf_write[inst->dst.nr + r] = n; + } + } else { + for (unsigned r = 0; r < regs_written(inst); r++) { + add_dep(last_grf_write[inst->dst.nr * 16 + + inst->dst.offset / REG_SIZE + r], n); + last_grf_write[inst->dst.nr * 16 + + inst->dst.offset / REG_SIZE + r] = n; + } + } + } else if (inst->dst.file == MRF) { + int reg = inst->dst.nr & ~BRW_MRF_COMPR4; /* register number with the COMPR4 bit masked off */ + + add_dep(last_mrf_write[reg], n); + last_mrf_write[reg] = n; + if (is_compressed(inst)) { /* second register written: 4 further on with COMPR4, else the next one */ + if (inst->dst.nr & BRW_MRF_COMPR4) + reg += 4; + else + reg++; + add_dep(last_mrf_write[reg], n); + last_mrf_write[reg] = n; + } + } else if (inst->dst.file == FIXED_GRF) { /* recorded as the last writer only; no WAW edge is added here */ + if (post_reg_alloc) { + for (unsigned r = 0; r < regs_written(inst); r++) + last_grf_write[inst->dst.nr + r] = n; + } else { + last_fixed_grf_write = n; + } + } else if (inst->dst.is_accumulator()) { + add_dep(last_accumulator_write, n); + last_accumulator_write = n; + } else if (inst->dst.file == ARF && !inst->dst.is_null()) { + add_barrier_deps(n); + } + /* The first implied_mrf_writes(inst) MRFs from base_mrf count as written by n. */ + if (inst->mlen > 0 && inst->base_mrf != -1) { + for (int i = 0; i < v->implied_mrf_writes(inst); i++) { + add_dep(last_mrf_write[inst->base_mrf + i], n); + last_mrf_write[inst->base_mrf + i] = n; + } + } + + if (const unsigned mask = inst->flags_written()) { + assert(mask < (1 << ARRAY_SIZE(last_conditional_mod))); + + for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) { + if (mask & (1 << i)) { + add_dep(last_conditional_mod[i], n, 0); + last_conditional_mod[i] = n; + } + } + } + + if (inst->writes_accumulator_implicitly(v->devinfo) && + !inst->dst.is_accumulator()) { + add_dep(last_accumulator_write, n); + last_accumulator_write = n; + } + } + + /* bottom-to-top dependencies: WAR. The trackers are cleared first so that, + * during the reverse walk, they name the nearest later writer. + */ + memset(last_grf_write, 0, sizeof(last_grf_write)); + memset(last_mrf_write, 0, sizeof(last_mrf_write)); + memset(last_conditional_mod, 0, 
sizeof(last_conditional_mod)); + last_accumulator_write = NULL; + last_fixed_grf_write = NULL; + + foreach_in_list_reverse_safe(schedule_node, n, &instructions) { + fs_inst *inst = (fs_inst *)n->inst; + + /* write-after-read deps. Edges run from this reader to the later + * writer; GRF and explicit accumulator sources use latency 0. + */ + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF) { + if (post_reg_alloc) { + for (unsigned r = 0; r < regs_read(inst, i); r++) + add_dep(n, last_grf_write[inst->src[i].nr + r], 0); + } else { + for (unsigned r = 0; r < regs_read(inst, i); r++) { + add_dep(n, last_grf_write[inst->src[i].nr * 16 + + inst->src[i].offset / REG_SIZE + r], 0); + } + } + } else if (inst->src[i].file == FIXED_GRF) { + if (post_reg_alloc) { + for (unsigned r = 0; r < regs_read(inst, i); r++) + add_dep(n, last_grf_write[inst->src[i].nr + r], 0); + } else { + add_dep(n, last_fixed_grf_write, 0); + } + } else if (inst->src[i].is_accumulator()) { + add_dep(n, last_accumulator_write, 0); + } else if (inst->src[i].file == ARF) { + add_barrier_deps(n); + } + } + + if (inst->base_mrf != -1) { + for (int i = 0; i < inst->mlen; i++) { + /* It looks like the MRF regs are released in the send + * instruction once it's sent, not when the result comes + * back. Hence the fixed latency of 2 on this edge. + */ + add_dep(n, last_mrf_write[inst->base_mrf + i], 2); + } + } + /* Flag and implicit accumulator reads use the two-argument add_dep(), i.e. the latency of n. */ + if (const unsigned mask = inst->flags_read(v->devinfo)) { + assert(mask < (1 << ARRAY_SIZE(last_conditional_mod))); + + for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) { + if (mask & (1 << i)) + add_dep(n, last_conditional_mod[i]); + } + } + + if (inst->reads_accumulator_implicitly()) { + add_dep(n, last_accumulator_write); + } + + /* Update the things this instruction wrote, so earlier reads + * can mark this as WAR dependency. 
+ */ + if (inst->dst.file == VGRF) { + if (post_reg_alloc) { + for (unsigned r = 0; r < regs_written(inst); r++) + last_grf_write[inst->dst.nr + r] = n; + } else { + for (unsigned r = 0; r < regs_written(inst); r++) { + last_grf_write[inst->dst.nr * 16 + + inst->dst.offset / REG_SIZE + r] = n; + } + } + } else if (inst->dst.file == MRF) { + int reg = inst->dst.nr & ~BRW_MRF_COMPR4; + + last_mrf_write[reg] = n; + + if (is_compressed(inst)) { /* also record the second register written */ + if (inst->dst.nr & BRW_MRF_COMPR4) + reg += 4; + else + reg++; + + last_mrf_write[reg] = n; + } + } else if (inst->dst.file == FIXED_GRF) { + if (post_reg_alloc) { + for (unsigned r = 0; r < regs_written(inst); r++) + last_grf_write[inst->dst.nr + r] = n; + } else { + last_fixed_grf_write = n; + } + } else if (inst->dst.is_accumulator()) { + last_accumulator_write = n; + } else if (inst->dst.file == ARF && !inst->dst.is_null()) { + add_barrier_deps(n); + } + + if (inst->mlen > 0 && inst->base_mrf != -1) { + for (int i = 0; i < v->implied_mrf_writes(inst); i++) { + last_mrf_write[inst->base_mrf + i] = n; + } + } + + if (const unsigned mask = inst->flags_written()) { + assert(mask < (1 << ARRAY_SIZE(last_conditional_mod))); + + for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) { + if (mask & (1 << i)) + last_conditional_mod[i] = n; + } + } + + if (inst->writes_accumulator_implicitly(v->devinfo)) { + last_accumulator_write = n; + } + } +} + /** Returns true for vec4 instructions that get barrier dependencies: control flow and side effects. */ +static bool +is_scheduling_barrier(const vec4_instruction *inst) +{ + return inst->is_control_flow() || + inst->has_side_effects(); +} + /** + * vec4 counterpart of the dependency builder: a forward walk for + * read-after-write and write-after-write edges, then a reverse walk for + * write-after-read edges. It tracks one last writer per GRF and a single + * last_conditional_mod node. + */ +void +vec4_instruction_scheduler::calculate_deps() +{ + schedule_node *last_grf_write[grf_count]; + schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->gen)]; + schedule_node *last_conditional_mod = NULL; + schedule_node *last_accumulator_write = NULL; + /* Fixed HW registers are assumed to be separate from the virtual + * GRFs, so they can be tracked separately. 
We don't really write + * to fixed GRFs much, so don't bother tracking them on a more + * granular level. + */ + schedule_node *last_fixed_grf_write = NULL; + + memset(last_grf_write, 0, sizeof(last_grf_write)); + memset(last_mrf_write, 0, sizeof(last_mrf_write)); + + /* top-to-bottom dependencies: RAW and WAW. */ + foreach_in_list(schedule_node, n, &instructions) { + vec4_instruction *inst = (vec4_instruction *)n->inst; + + if (is_scheduling_barrier(inst)) + add_barrier_deps(n); + + /* read-after-write deps. Exactly three source slots are examined. */ + for (int i = 0; i < 3; i++) { + if (inst->src[i].file == VGRF) { + for (unsigned j = 0; j < regs_read(inst, i); ++j) + add_dep(last_grf_write[inst->src[i].nr + j], n); + } else if (inst->src[i].file == FIXED_GRF) { + add_dep(last_fixed_grf_write, n); + } else if (inst->src[i].is_accumulator()) { + assert(last_accumulator_write); /* an accumulator read must follow a write within the block */ + add_dep(last_accumulator_write, n); + } else if (inst->src[i].file == ARF) { + add_barrier_deps(n); + } + } + /* Messages not sent from the GRF read mlen MRFs starting at base_mrf. */ + if (!inst->is_send_from_grf()) { + for (int i = 0; i < inst->mlen; i++) { + /* It looks like the MRF regs are released in the send + * instruction once it's sent, not when the result comes + * back. + */ + add_dep(last_mrf_write[inst->base_mrf + i], n); + } + } + + if (inst->reads_flag()) { + assert(last_conditional_mod); /* a flag read must follow a flag write within the block */ + add_dep(last_conditional_mod, n); + } + + if (inst->reads_accumulator_implicitly()) { + assert(last_accumulator_write); + add_dep(last_accumulator_write, n); + } + + /* write-after-write deps. 
*/ + if (inst->dst.file == VGRF) { + for (unsigned j = 0; j < regs_written(inst); ++j) { + add_dep(last_grf_write[inst->dst.nr + j], n); + last_grf_write[inst->dst.nr + j] = n; + } + } else if (inst->dst.file == MRF) { + add_dep(last_mrf_write[inst->dst.nr], n); + last_mrf_write[inst->dst.nr] = n; + } else if (inst->dst.file == FIXED_GRF) { /* recorded as the last writer only; no WAW edge is added here */ + last_fixed_grf_write = n; + } else if (inst->dst.is_accumulator()) { + add_dep(last_accumulator_write, n); + last_accumulator_write = n; + } else if (inst->dst.file == ARF && !inst->dst.is_null()) { + add_barrier_deps(n); + } + /* The first implied_mrf_writes(inst) MRFs from base_mrf count as written by n. */ + if (inst->mlen > 0 && !inst->is_send_from_grf()) { + for (int i = 0; i < v->implied_mrf_writes(inst); i++) { + add_dep(last_mrf_write[inst->base_mrf + i], n); + last_mrf_write[inst->base_mrf + i] = n; + } + } + + if (inst->writes_flag()) { + add_dep(last_conditional_mod, n, 0); + last_conditional_mod = n; + } + + if (inst->writes_accumulator_implicitly(v->devinfo) && + !inst->dst.is_accumulator()) { + add_dep(last_accumulator_write, n); + last_accumulator_write = n; + } + } + + /* bottom-to-top dependencies: WAR. The trackers are cleared first so that, + * during the reverse walk, they name the nearest later writer. + */ + memset(last_grf_write, 0, sizeof(last_grf_write)); + memset(last_mrf_write, 0, sizeof(last_mrf_write)); + last_conditional_mod = NULL; + last_accumulator_write = NULL; + last_fixed_grf_write = NULL; + + foreach_in_list_reverse_safe(schedule_node, n, &instructions) { + vec4_instruction *inst = (vec4_instruction *)n->inst; + + /* write-after-read deps. 
*/ + for (int i = 0; i < 3; i++) { + if (inst->src[i].file == VGRF) { + for (unsigned j = 0; j < regs_read(inst, i); ++j) + add_dep(n, last_grf_write[inst->src[i].nr + j]); + } else if (inst->src[i].file == FIXED_GRF) { + add_dep(n, last_fixed_grf_write); + } else if (inst->src[i].is_accumulator()) { + add_dep(n, last_accumulator_write); + } else if (inst->src[i].file == ARF) { + add_barrier_deps(n); + } + } + + if (!inst->is_send_from_grf()) { + for (int i = 0; i < inst->mlen; i++) { + /* It looks like the MRF regs are released in the send + * instruction once it's sent, not when the result comes + * back. Hence the fixed latency of 2 on this edge. + */ + add_dep(n, last_mrf_write[inst->base_mrf + i], 2); + } + } + + if (inst->reads_flag()) { + add_dep(n, last_conditional_mod); + } + + if (inst->reads_accumulator_implicitly()) { + add_dep(n, last_accumulator_write); + } + + /* Update the things this instruction wrote, so earlier reads + * can mark this as WAR dependency. + */ + if (inst->dst.file == VGRF) { + for (unsigned j = 0; j < regs_written(inst); ++j) + last_grf_write[inst->dst.nr + j] = n; + } else if (inst->dst.file == MRF) { + last_mrf_write[inst->dst.nr] = n; + } else if (inst->dst.file == FIXED_GRF) { + last_fixed_grf_write = n; + } else if (inst->dst.is_accumulator()) { + last_accumulator_write = n; + } else if (inst->dst.file == ARF && !inst->dst.is_null()) { + add_barrier_deps(n); + } + + if (inst->mlen > 0 && !inst->is_send_from_grf()) { + for (int i = 0; i < v->implied_mrf_writes(inst); i++) { + last_mrf_write[inst->base_mrf + i] = n; + } + } + + if (inst->writes_flag()) { + last_conditional_mod = n; + } + + if (inst->writes_accumulator_implicitly(v->devinfo)) { + last_accumulator_write = n; + } + } +} + /** + * Picks the next node to schedule from the instructions list. In the + * SCHEDULE_PRE and SCHEDULE_POST modes the choice is driven by unblocked + * times; in the remaining modes it is driven by register pressure. Returns + * NULL only when the list is empty. + */ +schedule_node * +fs_instruction_scheduler::choose_instruction_to_schedule() +{ + schedule_node *chosen = NULL; + + if (mode == SCHEDULE_PRE || mode == SCHEDULE_POST) { + int chosen_time = 0; + + /* Of the instructions ready to execute or the closest to being ready, + * choose the one most 
likely to unblock an early program exit, or + * otherwise the oldest one. + */ + foreach_in_list(schedule_node, n, &instructions) { + if (!chosen || + exit_unblocked_time(n) < exit_unblocked_time(chosen) || + (exit_unblocked_time(n) == exit_unblocked_time(chosen) && + n->unblocked_time < chosen_time)) { /* strict comparisons keep the earlier list entry on ties */ + chosen = n; + chosen_time = n->unblocked_time; + } + } + } else { + /* Before register allocation, we don't care about the latencies of + * instructions. All we care about is reducing live intervals of + * variables so that we can avoid register spilling, or get SIMD16 + * shaders which naturally do a better job of hiding instruction + * latency. + */ + foreach_in_list(schedule_node, n, &instructions) { + fs_inst *inst = (fs_inst *)n->inst; + + if (!chosen) { /* the first candidate is the initial choice */ + chosen = n; + continue; + } + + /* Most important: If we can definitely reduce register pressure, do + * so immediately. + */ + int register_pressure_benefit = get_register_pressure_benefit(n->inst); + int chosen_register_pressure_benefit = + get_register_pressure_benefit(chosen->inst); + + if (register_pressure_benefit > 0 && + register_pressure_benefit > chosen_register_pressure_benefit) { + chosen = n; + continue; + } else if (chosen_register_pressure_benefit > 0 && + (register_pressure_benefit < + chosen_register_pressure_benefit)) { + continue; + } + + if (mode == SCHEDULE_PRE_LIFO) { + /* Prefer instructions that recently became available for + * scheduling. These are the things that are most likely to + * (eventually) make a variable dead and reduce register pressure. + * Typical register pressure estimates don't work for us because + * most of our pressure comes from texturing, where no single + * instruction to schedule will make a vec4 value dead. + */ + if (n->cand_generation > chosen->cand_generation) { + chosen = n; + continue; + } else if (n->cand_generation < chosen->cand_generation) { + continue; + } + + /* On MRF-using chips, prefer non-SEND instructions. 
If we don't + * do this, then because we prefer instructions that just became + * candidates, we'll end up in a pattern of scheduling a SEND, + * then the MRFs for the next SEND, then the next SEND, then the + * MRFs, etc., without ever consuming the results of a send. + */ + if (v->devinfo->gen < 7) { + fs_inst *chosen_inst = (fs_inst *)chosen->inst; + + /* We use size_written > 4 * exec_size as our test for the kind + * of send instruction to avoid -- only sends generate many + * regs, and a single-result send is probably actually reducing + * register pressure. + */ + if (inst->size_written <= 4 * inst->exec_size && + chosen_inst->size_written > 4 * chosen_inst->exec_size) { + chosen = n; + continue; + } else if (inst->size_written > chosen_inst->size_written) { + continue; + } + } + } + + /* For instructions pushed on the cands list at the same time, prefer + * the one with the highest delay to the end of the program. This is + * most likely to have its values able to be consumed first (such as + * for a large tree of lowered ubo loads, which appear reversed in + * the instruction stream with respect to when they can be consumed). + */ + if (n->delay > chosen->delay) { + chosen = n; + continue; + } else if (n->delay < chosen->delay) { + continue; + } + + /* Prefer the node most likely to unblock an early program exit. + */ + if (exit_unblocked_time(n) < exit_unblocked_time(chosen)) { + chosen = n; + continue; + } else if (exit_unblocked_time(n) > exit_unblocked_time(chosen)) { + continue; + } + + /* If all other metrics are equal, we prefer the first instruction in + * the list (program execution). Nothing needs to be done for that: + * chosen is simply left unchanged. + */ + } + } + + return chosen; +} + /** + * Picks the candidate with the smallest unblocked_time, keeping the earliest + * list entry on ties. Returns NULL only when the list is empty. + */ +schedule_node * +vec4_instruction_scheduler::choose_instruction_to_schedule() +{ + schedule_node *chosen = NULL; + int chosen_time = 0; + + /* Of the instructions ready to execute or the closest to being ready, + * choose the oldest one. 
+ */ + foreach_in_list(schedule_node, n, &instructions) { + if (!chosen || n->unblocked_time < chosen_time) { + chosen = n; + chosen_time = n->unblocked_time; + } + } + + return chosen; +} + /** Issue time in cycles: 4 for compressed (exec_size 16) instructions, otherwise 2. */ +int +fs_instruction_scheduler::issue_time(backend_instruction *inst) +{ + if (is_compressed((fs_inst *)inst)) + return 4; + else + return 2; +} + /** Issue time in cycles; the same value for every vec4 instruction. */ +int +vec4_instruction_scheduler::issue_time(backend_instruction *inst) +{ + /* We always execute as two vec4s in parallel. */ + return 2; +} + /** + * Schedules the nodes previously added for this block. Nodes are chosen + * one at a time, their instructions are moved to the tail of the block in + * the chosen order, and the simulated clock ends up in block->cycle_count. + */ +void +instruction_scheduler::schedule_instructions(bblock_t *block) +{ + const struct gen_device_info *devinfo = bs->devinfo; + int time = 0; + if (!post_reg_alloc) + reg_pressure = reg_pressure_in[block->num]; + block_idx = block->num; + + /* Remove non-DAG heads from the list. */ + foreach_in_list_safe(schedule_node, n, &instructions) { + if (n->parent_count != 0) + n->remove(); + } + + unsigned cand_generation = 1; + while (!instructions.is_empty()) { + schedule_node *chosen = choose_instruction_to_schedule(); + + /* Schedule this instruction. */ + assert(chosen); + chosen->remove(); + chosen->inst->exec_node::remove(); + block->instructions.push_tail(chosen->inst); + instructions_to_schedule--; + + if (!post_reg_alloc) { /* the benefit is computed before the counters are updated */ + reg_pressure -= get_register_pressure_benefit(chosen->inst); + update_register_pressure(chosen->inst); + } + + /* If we expected a delay for scheduling, then bump the clock to reflect + * that. In reality, the hardware will switch to another hyperthread + * and may not return to dispatching our thread for a while even after + * we're unblocked. After this, we have the time when the chosen + * instruction will start executing. + */ + time = MAX2(time, chosen->unblocked_time); + + /* Update the clock for how soon an instruction could start after the + * chosen one. 
+ */ + time += issue_time(chosen->inst); + + if (debug) { + fprintf(stderr, "clock %4d, scheduled: ", time); + bs->dump_instruction(chosen->inst); + if (!post_reg_alloc) + fprintf(stderr, "(register pressure %d)\n", reg_pressure); + } + + /* Now that we've scheduled a new instruction, some of its + * children can be promoted to the list of instructions ready to + * be scheduled. Update the children's unblocked time for this + * DAG edge as we do so. The children are visited from last to first + * and pushed at the head of the list. + */ + for (int i = chosen->child_count - 1; i >= 0; i--) { + schedule_node *child = chosen->children[i]; + + child->unblocked_time = MAX2(child->unblocked_time, + time + chosen->child_latency[i]); + + if (debug) { + fprintf(stderr, "\tchild %d, %d parents: ", i, child->parent_count); + bs->dump_instruction(child->inst); + } + + child->cand_generation = cand_generation; + child->parent_count--; + if (child->parent_count == 0) { /* last parent scheduled: the child becomes a candidate */ + if (debug) { + fprintf(stderr, "\t\tnow available\n"); + } + instructions.push_head(child); + } + } + cand_generation++; /* children of later picks get a larger generation number */ + + /* Shared resource: the mathbox. There's one mathbox per EU on Gen6+ + * but it's more limited pre-gen6, so if we send something off to it then + * the next math instruction isn't going to make progress until the first + * is done. 
+ */ + if (devinfo->gen < 6 && chosen->inst->is_math()) { + foreach_in_list(schedule_node, n, &instructions) { + if (n->inst->is_math()) + n->unblocked_time = MAX2(n->unblocked_time, + time + chosen->latency); + } + } + } + + assert(instructions_to_schedule == 0); + + block->cycle_count = time; +} + +static unsigned get_cycle_count(cfg_t *cfg) +{ + unsigned count = 0, multiplier = 1; + foreach_block(block, cfg) { + if (block->start()->opcode == BRW_OPCODE_DO) + multiplier *= 10; /* assume that loops execute ~10 times */ + + count += block->cycle_count * multiplier; + + if (block->end()->opcode == BRW_OPCODE_WHILE) + multiplier /= 10; + } + + return count; +} + +void +instruction_scheduler::run(cfg_t *cfg) +{ + if (debug && !post_reg_alloc) { + fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n", + post_reg_alloc); + bs->dump_instructions(); + } + + if (!post_reg_alloc) + setup_liveness(cfg); + + foreach_block(block, cfg) { + if (reads_remaining) { + memset(reads_remaining, 0, + grf_count * sizeof(*reads_remaining)); + memset(hw_reads_remaining, 0, + hw_reg_count * sizeof(*hw_reads_remaining)); + memset(written, 0, grf_count * sizeof(*written)); + + foreach_inst_in_block(fs_inst, inst, block) + count_reads_remaining(inst); + } + + add_insts_from_block(block); + + calculate_deps(); + + compute_delays(); + compute_exits(); + + schedule_instructions(block); + } + + if (debug && !post_reg_alloc) { + fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n", + post_reg_alloc); + bs->dump_instructions(); + } + + cfg->cycle_count = get_cycle_count(cfg); +} + +void +fs_visitor::schedule_instructions(instruction_scheduler_mode mode) +{ + if (mode != SCHEDULE_POST) + calculate_live_intervals(); + + int grf_count; + if (mode == SCHEDULE_POST) + grf_count = grf_used; + else + grf_count = alloc.count; + + fs_instruction_scheduler sched(this, grf_count, first_non_payload_grf, + cfg->num_blocks, mode); + sched.run(cfg); + + invalidate_live_intervals(); +} 
+ +void +vec4_visitor::opt_schedule_instructions() +{ + vec4_instruction_scheduler sched(this, prog_data->total_grf); + sched.run(cfg); + + invalidate_live_intervals(); +} diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp new file mode 100644 index 00000000000..bfaa5e7bfe2 --- /dev/null +++ b/src/intel/compiler/brw_shader.cpp @@ -0,0 +1,1273 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "brw_cfg.h" +#include "brw_eu.h" +#include "brw_fs.h" +#include "brw_nir.h" +#include "brw_vec4_tes.h" +#include "common/gen_debug.h" +#include "main/uniforms.h" +#include "util/macros.h" + +enum brw_reg_type +brw_type_for_base_type(const struct glsl_type *type) +{ + switch (type->base_type) { + case GLSL_TYPE_FLOAT: + return BRW_REGISTER_TYPE_F; + case GLSL_TYPE_INT: + case GLSL_TYPE_BOOL: + case GLSL_TYPE_SUBROUTINE: + return BRW_REGISTER_TYPE_D; + case GLSL_TYPE_UINT: + return BRW_REGISTER_TYPE_UD; + case GLSL_TYPE_ARRAY: + return brw_type_for_base_type(type->fields.array); + case GLSL_TYPE_STRUCT: + case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_ATOMIC_UINT: + /* These should be overridden with the type of the member when + * dereferenced into. BRW_REGISTER_TYPE_UD seems like a likely + * way to trip up if we don't. + */ + return BRW_REGISTER_TYPE_UD; + case GLSL_TYPE_IMAGE: + return BRW_REGISTER_TYPE_UD; + case GLSL_TYPE_DOUBLE: + return BRW_REGISTER_TYPE_DF; + case GLSL_TYPE_UINT64: + return BRW_REGISTER_TYPE_UQ; + case GLSL_TYPE_INT64: + return BRW_REGISTER_TYPE_Q; + case GLSL_TYPE_VOID: + case GLSL_TYPE_ERROR: + case GLSL_TYPE_INTERFACE: + case GLSL_TYPE_FUNCTION: + unreachable("not reached"); + } + + return BRW_REGISTER_TYPE_F; +} + +enum brw_conditional_mod +brw_conditional_for_comparison(unsigned int op) +{ + switch (op) { + case ir_binop_less: + return BRW_CONDITIONAL_L; + case ir_binop_greater: + return BRW_CONDITIONAL_G; + case ir_binop_lequal: + return BRW_CONDITIONAL_LE; + case ir_binop_gequal: + return BRW_CONDITIONAL_GE; + case ir_binop_equal: + case ir_binop_all_equal: /* same as equal for scalars */ + return BRW_CONDITIONAL_Z; + case ir_binop_nequal: + case ir_binop_any_nequal: /* same as nequal for scalars */ + return BRW_CONDITIONAL_NZ; + default: + unreachable("not reached: bad operation for comparison"); + } +} + +uint32_t +brw_math_function(enum opcode op) +{ + switch (op) { + case SHADER_OPCODE_RCP: + return 
BRW_MATH_FUNCTION_INV; + case SHADER_OPCODE_RSQ: + return BRW_MATH_FUNCTION_RSQ; + case SHADER_OPCODE_SQRT: + return BRW_MATH_FUNCTION_SQRT; + case SHADER_OPCODE_EXP2: + return BRW_MATH_FUNCTION_EXP; + case SHADER_OPCODE_LOG2: + return BRW_MATH_FUNCTION_LOG; + case SHADER_OPCODE_POW: + return BRW_MATH_FUNCTION_POW; + case SHADER_OPCODE_SIN: + return BRW_MATH_FUNCTION_SIN; + case SHADER_OPCODE_COS: + return BRW_MATH_FUNCTION_COS; + case SHADER_OPCODE_INT_QUOTIENT: + return BRW_MATH_FUNCTION_INT_DIV_QUOTIENT; + case SHADER_OPCODE_INT_REMAINDER: + return BRW_MATH_FUNCTION_INT_DIV_REMAINDER; + default: + unreachable("not reached: unknown math function"); + } +} + +bool +brw_texture_offset(int *offsets, unsigned num_components, uint32_t *offset_bits) +{ + if (!offsets) return false; /* nonconstant offset; caller will handle it. */ + + /* offset out of bounds; caller will handle it. */ + for (unsigned i = 0; i < num_components; i++) + if (offsets[i] > 7 || offsets[i] < -8) + return false; + + /* Combine all three offsets into a single unsigned dword: + * + * bits 11:8 - U Offset (X component) + * bits 7:4 - V Offset (Y component) + * bits 3:0 - R Offset (Z component) + */ + *offset_bits = 0; + for (unsigned i = 0; i < num_components; i++) { + const unsigned shift = 4 * (2 - i); + *offset_bits |= (offsets[i] << shift) & (0xF << shift); + } + return true; +} + +const char * +brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op) +{ + switch (op) { + case BRW_OPCODE_ILLEGAL ... BRW_OPCODE_NOP: + /* The DO instruction doesn't exist on Gen6+, but we use it to mark the + * start of a loop in the IR. 
+ */ + if (devinfo->gen >= 6 && op == BRW_OPCODE_DO) + return "do"; + + assert(brw_opcode_desc(devinfo, op)->name); + return brw_opcode_desc(devinfo, op)->name; + case FS_OPCODE_FB_WRITE: + return "fb_write"; + case FS_OPCODE_FB_WRITE_LOGICAL: + return "fb_write_logical"; + case FS_OPCODE_REP_FB_WRITE: + return "rep_fb_write"; + case FS_OPCODE_FB_READ: + return "fb_read"; + case FS_OPCODE_FB_READ_LOGICAL: + return "fb_read_logical"; + + case SHADER_OPCODE_RCP: + return "rcp"; + case SHADER_OPCODE_RSQ: + return "rsq"; + case SHADER_OPCODE_SQRT: + return "sqrt"; + case SHADER_OPCODE_EXP2: + return "exp2"; + case SHADER_OPCODE_LOG2: + return "log2"; + case SHADER_OPCODE_POW: + return "pow"; + case SHADER_OPCODE_INT_QUOTIENT: + return "int_quot"; + case SHADER_OPCODE_INT_REMAINDER: + return "int_rem"; + case SHADER_OPCODE_SIN: + return "sin"; + case SHADER_OPCODE_COS: + return "cos"; + + case SHADER_OPCODE_TEX: + return "tex"; + case SHADER_OPCODE_TEX_LOGICAL: + return "tex_logical"; + case SHADER_OPCODE_TXD: + return "txd"; + case SHADER_OPCODE_TXD_LOGICAL: + return "txd_logical"; + case SHADER_OPCODE_TXF: + return "txf"; + case SHADER_OPCODE_TXF_LOGICAL: + return "txf_logical"; + case SHADER_OPCODE_TXF_LZ: + return "txf_lz"; + case SHADER_OPCODE_TXL: + return "txl"; + case SHADER_OPCODE_TXL_LOGICAL: + return "txl_logical"; + case SHADER_OPCODE_TXL_LZ: + return "txl_lz"; + case SHADER_OPCODE_TXS: + return "txs"; + case SHADER_OPCODE_TXS_LOGICAL: + return "txs_logical"; + case FS_OPCODE_TXB: + return "txb"; + case FS_OPCODE_TXB_LOGICAL: + return "txb_logical"; + case SHADER_OPCODE_TXF_CMS: + return "txf_cms"; + case SHADER_OPCODE_TXF_CMS_LOGICAL: + return "txf_cms_logical"; + case SHADER_OPCODE_TXF_CMS_W: + return "txf_cms_w"; + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + return "txf_cms_w_logical"; + case SHADER_OPCODE_TXF_UMS: + return "txf_ums"; + case SHADER_OPCODE_TXF_UMS_LOGICAL: + return "txf_ums_logical"; + case SHADER_OPCODE_TXF_MCS: + return "txf_mcs"; + case 
SHADER_OPCODE_TXF_MCS_LOGICAL: + return "txf_mcs_logical"; + case SHADER_OPCODE_LOD: + return "lod"; + case SHADER_OPCODE_LOD_LOGICAL: + return "lod_logical"; + case SHADER_OPCODE_TG4: + return "tg4"; + case SHADER_OPCODE_TG4_LOGICAL: + return "tg4_logical"; + case SHADER_OPCODE_TG4_OFFSET: + return "tg4_offset"; + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + return "tg4_offset_logical"; + case SHADER_OPCODE_SAMPLEINFO: + return "sampleinfo"; + case SHADER_OPCODE_SAMPLEINFO_LOGICAL: + return "sampleinfo_logical"; + + case SHADER_OPCODE_SHADER_TIME_ADD: + return "shader_time_add"; + + case SHADER_OPCODE_UNTYPED_ATOMIC: + return "untyped_atomic"; + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: + return "untyped_atomic_logical"; + case SHADER_OPCODE_UNTYPED_SURFACE_READ: + return "untyped_surface_read"; + case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: + return "untyped_surface_read_logical"; + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: + return "untyped_surface_write"; + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: + return "untyped_surface_write_logical"; + case SHADER_OPCODE_TYPED_ATOMIC: + return "typed_atomic"; + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: + return "typed_atomic_logical"; + case SHADER_OPCODE_TYPED_SURFACE_READ: + return "typed_surface_read"; + case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: + return "typed_surface_read_logical"; + case SHADER_OPCODE_TYPED_SURFACE_WRITE: + return "typed_surface_write"; + case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: + return "typed_surface_write_logical"; + case SHADER_OPCODE_MEMORY_FENCE: + return "memory_fence"; + + case SHADER_OPCODE_LOAD_PAYLOAD: + return "load_payload"; + case FS_OPCODE_PACK: + return "pack"; + + case SHADER_OPCODE_GEN4_SCRATCH_READ: + return "gen4_scratch_read"; + case SHADER_OPCODE_GEN4_SCRATCH_WRITE: + return "gen4_scratch_write"; + case SHADER_OPCODE_GEN7_SCRATCH_READ: + return "gen7_scratch_read"; + case SHADER_OPCODE_URB_WRITE_SIMD8: + return "gen8_urb_write_simd8"; + case 
SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT: + return "gen8_urb_write_simd8_per_slot"; + case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED: + return "gen8_urb_write_simd8_masked"; + case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: + return "gen8_urb_write_simd8_masked_per_slot"; + case SHADER_OPCODE_URB_READ_SIMD8: + return "urb_read_simd8"; + case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: + return "urb_read_simd8_per_slot"; + + case SHADER_OPCODE_FIND_LIVE_CHANNEL: + return "find_live_channel"; + case SHADER_OPCODE_BROADCAST: + return "broadcast"; + + case VEC4_OPCODE_MOV_BYTES: + return "mov_bytes"; + case VEC4_OPCODE_PACK_BYTES: + return "pack_bytes"; + case VEC4_OPCODE_UNPACK_UNIFORM: + return "unpack_uniform"; + case VEC4_OPCODE_FROM_DOUBLE: + return "double_to_single"; + case VEC4_OPCODE_TO_DOUBLE: + return "single_to_double"; + case VEC4_OPCODE_PICK_LOW_32BIT: + return "pick_low_32bit"; + case VEC4_OPCODE_PICK_HIGH_32BIT: + return "pick_high_32bit"; + case VEC4_OPCODE_SET_LOW_32BIT: + return "set_low_32bit"; + case VEC4_OPCODE_SET_HIGH_32BIT: + return "set_high_32bit"; + + case FS_OPCODE_DDX_COARSE: + return "ddx_coarse"; + case FS_OPCODE_DDX_FINE: + return "ddx_fine"; + case FS_OPCODE_DDY_COARSE: + return "ddy_coarse"; + case FS_OPCODE_DDY_FINE: + return "ddy_fine"; + + case FS_OPCODE_CINTERP: + return "cinterp"; + case FS_OPCODE_LINTERP: + return "linterp"; + + case FS_OPCODE_PIXEL_X: + return "pixel_x"; + case FS_OPCODE_PIXEL_Y: + return "pixel_y"; + + case FS_OPCODE_GET_BUFFER_SIZE: + return "fs_get_buffer_size"; + + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + return "uniform_pull_const"; + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7: + return "uniform_pull_const_gen7"; + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4: + return "varying_pull_const_gen4"; + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7: + return "varying_pull_const_gen7"; + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: + return "varying_pull_const_logical"; + + case 
FS_OPCODE_MOV_DISPATCH_TO_FLAGS: + return "mov_dispatch_to_flags"; + case FS_OPCODE_DISCARD_JUMP: + return "discard_jump"; + + case FS_OPCODE_SET_SAMPLE_ID: + return "set_sample_id"; + + case FS_OPCODE_PACK_HALF_2x16_SPLIT: + return "pack_half_2x16_split"; + case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X: + return "unpack_half_2x16_split_x"; + case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y: + return "unpack_half_2x16_split_y"; + + case FS_OPCODE_PLACEHOLDER_HALT: + return "placeholder_halt"; + + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + return "interp_sample"; + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + return "interp_shared_offset"; + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + return "interp_per_slot_offset"; + + case VS_OPCODE_URB_WRITE: + return "vs_urb_write"; + case VS_OPCODE_PULL_CONSTANT_LOAD: + return "pull_constant_load"; + case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: + return "pull_constant_load_gen7"; + + case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9: + return "set_simd4x2_header_gen9"; + + case VS_OPCODE_GET_BUFFER_SIZE: + return "vs_get_buffer_size"; + + case VS_OPCODE_UNPACK_FLAGS_SIMD4X2: + return "unpack_flags_simd4x2"; + + case GS_OPCODE_URB_WRITE: + return "gs_urb_write"; + case GS_OPCODE_URB_WRITE_ALLOCATE: + return "gs_urb_write_allocate"; + case GS_OPCODE_THREAD_END: + return "gs_thread_end"; + case GS_OPCODE_SET_WRITE_OFFSET: + return "set_write_offset"; + case GS_OPCODE_SET_VERTEX_COUNT: + return "set_vertex_count"; + case GS_OPCODE_SET_DWORD_2: + return "set_dword_2"; + case GS_OPCODE_PREPARE_CHANNEL_MASKS: + return "prepare_channel_masks"; + case GS_OPCODE_SET_CHANNEL_MASKS: + return "set_channel_masks"; + case GS_OPCODE_GET_INSTANCE_ID: + return "get_instance_id"; + case GS_OPCODE_FF_SYNC: + return "ff_sync"; + case GS_OPCODE_SET_PRIMITIVE_ID: + return "set_primitive_id"; + case GS_OPCODE_SVB_WRITE: + return "gs_svb_write"; + case GS_OPCODE_SVB_SET_DST_INDEX: + return "gs_svb_set_dst_index"; + case GS_OPCODE_FF_SYNC_SET_PRIMITIVES: + return 
"gs_ff_sync_set_primitives"; + case CS_OPCODE_CS_TERMINATE: + return "cs_terminate"; + case SHADER_OPCODE_BARRIER: + return "barrier"; + case SHADER_OPCODE_MULH: + return "mulh"; + case SHADER_OPCODE_MOV_INDIRECT: + return "mov_indirect"; + + case VEC4_OPCODE_URB_READ: + return "urb_read"; + case TCS_OPCODE_GET_INSTANCE_ID: + return "tcs_get_instance_id"; + case TCS_OPCODE_URB_WRITE: + return "tcs_urb_write"; + case TCS_OPCODE_SET_INPUT_URB_OFFSETS: + return "tcs_set_input_urb_offsets"; + case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: + return "tcs_set_output_urb_offsets"; + case TCS_OPCODE_GET_PRIMITIVE_ID: + return "tcs_get_primitive_id"; + case TCS_OPCODE_CREATE_BARRIER_HEADER: + return "tcs_create_barrier_header"; + case TCS_OPCODE_SRC0_010_IS_ZERO: + return "tcs_src0<0,1,0>_is_zero"; + case TCS_OPCODE_RELEASE_INPUT: + return "tcs_release_input"; + case TCS_OPCODE_THREAD_END: + return "tcs_thread_end"; + case TES_OPCODE_CREATE_INPUT_READ_HEADER: + return "tes_create_input_read_header"; + case TES_OPCODE_ADD_INDIRECT_URB_OFFSET: + return "tes_add_indirect_urb_offset"; + case TES_OPCODE_GET_PRIMITIVE_ID: + return "tes_get_primitive_id"; + } + + unreachable("not reached"); +} + +bool +brw_saturate_immediate(enum brw_reg_type type, struct brw_reg *reg) +{ + union { + unsigned ud; + int d; + float f; + double df; + } imm, sat_imm = { 0 }; + + const unsigned size = type_sz(type); + + /* We want to either do a 32-bit or 64-bit data copy, the type is otherwise + * irrelevant, so just check the size of the type and copy from/to an + * appropriately sized field. + */ + if (size < 8) + imm.ud = reg->ud; + else + imm.df = reg->df; + + switch (type) { + case BRW_REGISTER_TYPE_UD: + case BRW_REGISTER_TYPE_D: + case BRW_REGISTER_TYPE_UW: + case BRW_REGISTER_TYPE_W: + case BRW_REGISTER_TYPE_UQ: + case BRW_REGISTER_TYPE_Q: + /* Nothing to do. 
*/ + return false; + case BRW_REGISTER_TYPE_F: + sat_imm.f = CLAMP(imm.f, 0.0f, 1.0f); + break; + case BRW_REGISTER_TYPE_DF: + sat_imm.df = CLAMP(imm.df, 0.0, 1.0); + break; + case BRW_REGISTER_TYPE_UB: + case BRW_REGISTER_TYPE_B: + unreachable("no UB/B immediates"); + case BRW_REGISTER_TYPE_V: + case BRW_REGISTER_TYPE_UV: + case BRW_REGISTER_TYPE_VF: + unreachable("unimplemented: saturate vector immediate"); + case BRW_REGISTER_TYPE_HF: + unreachable("unimplemented: saturate HF immediate"); + } + + if (size < 8) { + if (imm.ud != sat_imm.ud) { + reg->ud = sat_imm.ud; + return true; + } + } else { + if (imm.df != sat_imm.df) { + reg->df = sat_imm.df; + return true; + } + } + return false; +} + +bool +brw_negate_immediate(enum brw_reg_type type, struct brw_reg *reg) +{ + switch (type) { + case BRW_REGISTER_TYPE_D: + case BRW_REGISTER_TYPE_UD: + reg->d = -reg->d; + return true; + case BRW_REGISTER_TYPE_W: + case BRW_REGISTER_TYPE_UW: + reg->d = -(int16_t)reg->ud; + return true; + case BRW_REGISTER_TYPE_F: + reg->f = -reg->f; + return true; + case BRW_REGISTER_TYPE_VF: + reg->ud ^= 0x80808080; + return true; + case BRW_REGISTER_TYPE_DF: + reg->df = -reg->df; + return true; + case BRW_REGISTER_TYPE_UQ: + case BRW_REGISTER_TYPE_Q: + reg->d64 = -reg->d64; + return true; + case BRW_REGISTER_TYPE_UB: + case BRW_REGISTER_TYPE_B: + unreachable("no UB/B immediates"); + case BRW_REGISTER_TYPE_UV: + case BRW_REGISTER_TYPE_V: + assert(!"unimplemented: negate UV/V immediate"); + case BRW_REGISTER_TYPE_HF: + assert(!"unimplemented: negate HF immediate"); + } + + return false; +} + +bool +brw_abs_immediate(enum brw_reg_type type, struct brw_reg *reg) +{ + switch (type) { + case BRW_REGISTER_TYPE_D: + reg->d = abs(reg->d); + return true; + case BRW_REGISTER_TYPE_W: + reg->d = abs((int16_t)reg->ud); + return true; + case BRW_REGISTER_TYPE_F: + reg->f = fabsf(reg->f); + return true; + case BRW_REGISTER_TYPE_DF: + reg->df = fabs(reg->df); + return true; + case BRW_REGISTER_TYPE_VF: + 
reg->ud &= ~0x80808080; + return true; + case BRW_REGISTER_TYPE_Q: + reg->d64 = imaxabs(reg->d64); + return true; + case BRW_REGISTER_TYPE_UB: + case BRW_REGISTER_TYPE_B: + unreachable("no UB/B immediates"); + case BRW_REGISTER_TYPE_UQ: + case BRW_REGISTER_TYPE_UD: + case BRW_REGISTER_TYPE_UW: + case BRW_REGISTER_TYPE_UV: + /* Presumably the absolute value modifier on an unsigned source is a + * nop, but it would be nice to confirm. + */ + assert(!"unimplemented: abs unsigned immediate"); + case BRW_REGISTER_TYPE_V: + assert(!"unimplemented: abs V immediate"); + case BRW_REGISTER_TYPE_HF: + assert(!"unimplemented: abs HF immediate"); + } + + return false; +} + +/** + * Get the appropriate atomic op (BRW_AOP_*) for an atomic counter intrinsic. + */ +unsigned +get_atomic_counter_op(nir_intrinsic_op op) +{ + switch (op) { + case nir_intrinsic_atomic_counter_inc: + return BRW_AOP_INC; + case nir_intrinsic_atomic_counter_dec: + return BRW_AOP_PREDEC; + case nir_intrinsic_atomic_counter_add: + return BRW_AOP_ADD; + case nir_intrinsic_atomic_counter_min: + return BRW_AOP_UMIN; + case nir_intrinsic_atomic_counter_max: + return BRW_AOP_UMAX; + case nir_intrinsic_atomic_counter_and: + return BRW_AOP_AND; + case nir_intrinsic_atomic_counter_or: + return BRW_AOP_OR; + case nir_intrinsic_atomic_counter_xor: + return BRW_AOP_XOR; + case nir_intrinsic_atomic_counter_exchange: + return BRW_AOP_MOV; + case nir_intrinsic_atomic_counter_comp_swap: + return BRW_AOP_CMPWR; + default: + unreachable("Not reachable."); + } +} + +backend_shader::backend_shader(const struct brw_compiler *compiler, + void *log_data, + void *mem_ctx, + const nir_shader *shader, + struct brw_stage_prog_data *stage_prog_data) + : compiler(compiler), + log_data(log_data), + devinfo(compiler->devinfo), + nir(shader), + stage_prog_data(stage_prog_data), + mem_ctx(mem_ctx), + cfg(NULL), + stage(shader->stage) +{ + debug_enabled = INTEL_DEBUG & intel_debug_flag_for_shader_stage(stage); + stage_name = 
_mesa_shader_stage_to_string(stage); + stage_abbrev = _mesa_shader_stage_to_abbrev(stage); +} + +bool +backend_reg::equals(const backend_reg &r) const +{ + return brw_regs_equal(this, &r) && offset == r.offset; +} + +bool +backend_reg::is_zero() const +{ + if (file != IMM) + return false; + + switch (type) { + case BRW_REGISTER_TYPE_F: + return f == 0; + case BRW_REGISTER_TYPE_DF: + return df == 0; + case BRW_REGISTER_TYPE_D: + case BRW_REGISTER_TYPE_UD: + return d == 0; + case BRW_REGISTER_TYPE_UQ: + case BRW_REGISTER_TYPE_Q: + return u64 == 0; + default: + return false; + } +} + +bool +backend_reg::is_one() const +{ + if (file != IMM) + return false; + + switch (type) { + case BRW_REGISTER_TYPE_F: + return f == 1.0f; + case BRW_REGISTER_TYPE_DF: + return df == 1.0; + case BRW_REGISTER_TYPE_D: + case BRW_REGISTER_TYPE_UD: + return d == 1; + case BRW_REGISTER_TYPE_UQ: + case BRW_REGISTER_TYPE_Q: + return u64 == 1; + default: + return false; + } +} + +bool +backend_reg::is_negative_one() const +{ + if (file != IMM) + return false; + + switch (type) { + case BRW_REGISTER_TYPE_F: + return f == -1.0; + case BRW_REGISTER_TYPE_DF: + return df == -1.0; + case BRW_REGISTER_TYPE_D: + return d == -1; + case BRW_REGISTER_TYPE_Q: + return d64 == -1; + default: + return false; + } +} + +bool +backend_reg::is_null() const +{ + return file == ARF && nr == BRW_ARF_NULL; +} + + +bool +backend_reg::is_accumulator() const +{ + return file == ARF && nr == BRW_ARF_ACCUMULATOR; +} + +bool +backend_instruction::is_commutative() const +{ + switch (opcode) { + case BRW_OPCODE_AND: + case BRW_OPCODE_OR: + case BRW_OPCODE_XOR: + case BRW_OPCODE_ADD: + case BRW_OPCODE_MUL: + case SHADER_OPCODE_MULH: + return true; + case BRW_OPCODE_SEL: + /* MIN and MAX are commutative. 
*/ + if (conditional_mod == BRW_CONDITIONAL_GE || + conditional_mod == BRW_CONDITIONAL_L) { + return true; + } + /* fallthrough */ + default: + return false; + } +} + +bool +backend_instruction::is_3src(const struct gen_device_info *devinfo) const +{ + return ::is_3src(devinfo, opcode); +} + +bool +backend_instruction::is_tex() const +{ + return (opcode == SHADER_OPCODE_TEX || + opcode == FS_OPCODE_TXB || + opcode == SHADER_OPCODE_TXD || + opcode == SHADER_OPCODE_TXF || + opcode == SHADER_OPCODE_TXF_LZ || + opcode == SHADER_OPCODE_TXF_CMS || + opcode == SHADER_OPCODE_TXF_CMS_W || + opcode == SHADER_OPCODE_TXF_UMS || + opcode == SHADER_OPCODE_TXF_MCS || + opcode == SHADER_OPCODE_TXL || + opcode == SHADER_OPCODE_TXL_LZ || + opcode == SHADER_OPCODE_TXS || + opcode == SHADER_OPCODE_LOD || + opcode == SHADER_OPCODE_TG4 || + opcode == SHADER_OPCODE_TG4_OFFSET || + opcode == SHADER_OPCODE_SAMPLEINFO); +} + +bool +backend_instruction::is_math() const +{ + return (opcode == SHADER_OPCODE_RCP || + opcode == SHADER_OPCODE_RSQ || + opcode == SHADER_OPCODE_SQRT || + opcode == SHADER_OPCODE_EXP2 || + opcode == SHADER_OPCODE_LOG2 || + opcode == SHADER_OPCODE_SIN || + opcode == SHADER_OPCODE_COS || + opcode == SHADER_OPCODE_INT_QUOTIENT || + opcode == SHADER_OPCODE_INT_REMAINDER || + opcode == SHADER_OPCODE_POW); +} + +bool +backend_instruction::is_control_flow() const +{ + switch (opcode) { + case BRW_OPCODE_DO: + case BRW_OPCODE_WHILE: + case BRW_OPCODE_IF: + case BRW_OPCODE_ELSE: + case BRW_OPCODE_ENDIF: + case BRW_OPCODE_BREAK: + case BRW_OPCODE_CONTINUE: + return true; + default: + return false; + } +} + +bool +backend_instruction::can_do_source_mods() const +{ + switch (opcode) { + case BRW_OPCODE_ADDC: + case BRW_OPCODE_BFE: + case BRW_OPCODE_BFI1: + case BRW_OPCODE_BFI2: + case BRW_OPCODE_BFREV: + case BRW_OPCODE_CBIT: + case BRW_OPCODE_FBH: + case BRW_OPCODE_FBL: + case BRW_OPCODE_SUBB: + return false; + default: + return true; + } +} + +bool 
+backend_instruction::can_do_saturate() const +{ + switch (opcode) { + case BRW_OPCODE_ADD: + case BRW_OPCODE_ASR: + case BRW_OPCODE_AVG: + case BRW_OPCODE_DP2: + case BRW_OPCODE_DP3: + case BRW_OPCODE_DP4: + case BRW_OPCODE_DPH: + case BRW_OPCODE_F16TO32: + case BRW_OPCODE_F32TO16: + case BRW_OPCODE_LINE: + case BRW_OPCODE_LRP: + case BRW_OPCODE_MAC: + case BRW_OPCODE_MAD: + case BRW_OPCODE_MATH: + case BRW_OPCODE_MOV: + case BRW_OPCODE_MUL: + case SHADER_OPCODE_MULH: + case BRW_OPCODE_PLN: + case BRW_OPCODE_RNDD: + case BRW_OPCODE_RNDE: + case BRW_OPCODE_RNDU: + case BRW_OPCODE_RNDZ: + case BRW_OPCODE_SEL: + case BRW_OPCODE_SHL: + case BRW_OPCODE_SHR: + case FS_OPCODE_LINTERP: + case SHADER_OPCODE_COS: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_POW: + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_SQRT: + return true; + default: + return false; + } +} + +bool +backend_instruction::can_do_cmod() const +{ + switch (opcode) { + case BRW_OPCODE_ADD: + case BRW_OPCODE_ADDC: + case BRW_OPCODE_AND: + case BRW_OPCODE_ASR: + case BRW_OPCODE_AVG: + case BRW_OPCODE_CMP: + case BRW_OPCODE_CMPN: + case BRW_OPCODE_DP2: + case BRW_OPCODE_DP3: + case BRW_OPCODE_DP4: + case BRW_OPCODE_DPH: + case BRW_OPCODE_F16TO32: + case BRW_OPCODE_F32TO16: + case BRW_OPCODE_FRC: + case BRW_OPCODE_LINE: + case BRW_OPCODE_LRP: + case BRW_OPCODE_LZD: + case BRW_OPCODE_MAC: + case BRW_OPCODE_MACH: + case BRW_OPCODE_MAD: + case BRW_OPCODE_MOV: + case BRW_OPCODE_MUL: + case BRW_OPCODE_NOT: + case BRW_OPCODE_OR: + case BRW_OPCODE_PLN: + case BRW_OPCODE_RNDD: + case BRW_OPCODE_RNDE: + case BRW_OPCODE_RNDU: + case BRW_OPCODE_RNDZ: + case BRW_OPCODE_SAD2: + case BRW_OPCODE_SADA2: + case BRW_OPCODE_SHL: + case BRW_OPCODE_SHR: + case BRW_OPCODE_SUBB: + case BRW_OPCODE_XOR: + case FS_OPCODE_CINTERP: + case FS_OPCODE_LINTERP: + return true; + default: + return false; + } +} + +bool 
+backend_instruction::reads_accumulator_implicitly() const +{ + switch (opcode) { + case BRW_OPCODE_MAC: + case BRW_OPCODE_MACH: + case BRW_OPCODE_SADA2: + return true; + default: + return false; + } +} + +bool +backend_instruction::writes_accumulator_implicitly(const struct gen_device_info *devinfo) const +{ + return writes_accumulator || + (devinfo->gen < 6 && + ((opcode >= BRW_OPCODE_ADD && opcode < BRW_OPCODE_NOP) || + (opcode >= FS_OPCODE_DDX_COARSE && opcode <= FS_OPCODE_LINTERP && + opcode != FS_OPCODE_CINTERP))); +} + +bool +backend_instruction::has_side_effects() const +{ + switch (opcode) { + case SHADER_OPCODE_UNTYPED_ATOMIC: + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_GEN4_SCRATCH_WRITE: + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: + case SHADER_OPCODE_TYPED_ATOMIC: + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_WRITE: + case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: + case SHADER_OPCODE_MEMORY_FENCE: + case SHADER_OPCODE_URB_WRITE_SIMD8: + case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT: + case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED: + case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: + case FS_OPCODE_FB_WRITE: + case FS_OPCODE_FB_WRITE_LOGICAL: + case SHADER_OPCODE_BARRIER: + case TCS_OPCODE_URB_WRITE: + case TCS_OPCODE_RELEASE_INPUT: + return true; + default: + return false; + } +} + +bool +backend_instruction::is_volatile() const +{ + switch (opcode) { + case SHADER_OPCODE_UNTYPED_SURFACE_READ: + case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_READ: + case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_URB_READ_SIMD8: + case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: + case VEC4_OPCODE_URB_READ: + return true; + default: + return false; + } +} + +#ifndef NDEBUG +static bool +inst_is_in_block(const bblock_t *block, const backend_instruction *inst) +{ + bool found = false; + foreach_inst_in_block 
(backend_instruction, i, block) { + if (inst == i) { + found = true; + } + } + return found; +} +#endif + +static void +adjust_later_block_ips(bblock_t *start_block, int ip_adjustment) +{ + for (bblock_t *block_iter = start_block->next(); + block_iter; + block_iter = block_iter->next()) { + block_iter->start_ip += ip_adjustment; + block_iter->end_ip += ip_adjustment; + } +} + +void +backend_instruction::insert_after(bblock_t *block, backend_instruction *inst) +{ + assert(this != inst); + + if (!this->is_head_sentinel()) + assert(inst_is_in_block(block, this) || !"Instruction not in block"); + + block->end_ip++; + + adjust_later_block_ips(block, 1); + + exec_node::insert_after(inst); +} + +void +backend_instruction::insert_before(bblock_t *block, backend_instruction *inst) +{ + assert(this != inst); + + if (!this->is_tail_sentinel()) + assert(inst_is_in_block(block, this) || !"Instruction not in block"); + + block->end_ip++; + + adjust_later_block_ips(block, 1); + + exec_node::insert_before(inst); +} + +void +backend_instruction::insert_before(bblock_t *block, exec_list *list) +{ + assert(inst_is_in_block(block, this) || !"Instruction not in block"); + + unsigned num_inst = list->length(); + + block->end_ip += num_inst; + + adjust_later_block_ips(block, num_inst); + + exec_node::insert_before(list); +} + +void +backend_instruction::remove(bblock_t *block) +{ + assert(inst_is_in_block(block, this) || !"Instruction not in block"); + + adjust_later_block_ips(block, -1); + + if (block->start_ip == block->end_ip) { + block->cfg->remove_block(block); + } else { + block->end_ip--; + } + + exec_node::remove(); +} + +void +backend_shader::dump_instructions() +{ + dump_instructions(NULL); +} + +void +backend_shader::dump_instructions(const char *name) +{ + FILE *file = stderr; + if (name && geteuid() != 0) { + file = fopen(name, "w"); + if (!file) + file = stderr; + } + + if (cfg) { + int ip = 0; + foreach_block_and_inst(block, backend_instruction, inst, cfg) { + if 
(!unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) + fprintf(file, "%4d: ", ip++); + dump_instruction(inst, file); + } + } else { + int ip = 0; + foreach_in_list(backend_instruction, inst, &instructions) { + if (!unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) + fprintf(file, "%4d: ", ip++); + dump_instruction(inst, file); + } + } + + if (file != stderr) { + fclose(file); + } +} + +void +backend_shader::calculate_cfg() +{ + if (this->cfg) + return; + cfg = new(mem_ctx) cfg_t(&this->instructions); +} + +extern "C" const unsigned * +brw_compile_tes(const struct brw_compiler *compiler, + void *log_data, + void *mem_ctx, + const struct brw_tes_prog_key *key, + const struct brw_vue_map *input_vue_map, + struct brw_tes_prog_data *prog_data, + const nir_shader *src_shader, + struct gl_program *prog, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str) +{ + const struct gen_device_info *devinfo = compiler->devinfo; + const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_EVAL]; + + nir_shader *nir = nir_shader_clone(mem_ctx, src_shader); + nir->info->inputs_read = key->inputs_read; + nir->info->patch_inputs_read = key->patch_inputs_read; + + nir = brw_nir_apply_sampler_key(nir, compiler, &key->tex, is_scalar); + brw_nir_lower_tes_inputs(nir, input_vue_map); + brw_nir_lower_vue_outputs(nir, is_scalar); + nir = brw_postprocess_nir(nir, compiler, is_scalar); + + brw_compute_vue_map(devinfo, &prog_data->base.vue_map, + nir->info->outputs_written, + nir->info->separate_shader); + + unsigned output_size_bytes = prog_data->base.vue_map.num_slots * 4 * 4; + + assert(output_size_bytes >= 1); + if (output_size_bytes > GEN7_MAX_DS_URB_ENTRY_SIZE_BYTES) { + if (error_str) + *error_str = ralloc_strdup(mem_ctx, "DS outputs exceed maximum size"); + return NULL; + } + + prog_data->base.clip_distance_mask = + ((1 << nir->info->clip_distance_array_size) - 1); + prog_data->base.cull_distance_mask = + ((1 << nir->info->cull_distance_array_size) - 1) << + 
nir->info->clip_distance_array_size; + + /* URB entry sizes are stored as a multiple of 64 bytes. */ + prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64; + prog_data->base.urb_read_length = 0; + + STATIC_ASSERT(BRW_TESS_PARTITIONING_INTEGER == TESS_SPACING_EQUAL - 1); + STATIC_ASSERT(BRW_TESS_PARTITIONING_ODD_FRACTIONAL == + TESS_SPACING_FRACTIONAL_ODD - 1); + STATIC_ASSERT(BRW_TESS_PARTITIONING_EVEN_FRACTIONAL == + TESS_SPACING_FRACTIONAL_EVEN - 1); + + prog_data->partitioning = + (enum brw_tess_partitioning) (nir->info->tess.spacing - 1); + + switch (nir->info->tess.primitive_mode) { + case GL_QUADS: + prog_data->domain = BRW_TESS_DOMAIN_QUAD; + break; + case GL_TRIANGLES: + prog_data->domain = BRW_TESS_DOMAIN_TRI; + break; + case GL_ISOLINES: + prog_data->domain = BRW_TESS_DOMAIN_ISOLINE; + break; + default: + unreachable("invalid domain shader primitive mode"); + } + + if (nir->info->tess.point_mode) { + prog_data->output_topology = BRW_TESS_OUTPUT_TOPOLOGY_POINT; + } else if (nir->info->tess.primitive_mode == GL_ISOLINES) { + prog_data->output_topology = BRW_TESS_OUTPUT_TOPOLOGY_LINE; + } else { + /* Hardware winding order is backwards from OpenGL */ + prog_data->output_topology = + nir->info->tess.ccw ? 
BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW + : BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW; + } + + if (unlikely(INTEL_DEBUG & DEBUG_TES)) { + fprintf(stderr, "TES Input "); + brw_print_vue_map(stderr, input_vue_map); + fprintf(stderr, "TES Output "); + brw_print_vue_map(stderr, &prog_data->base.vue_map); + } + + if (is_scalar) { + fs_visitor v(compiler, log_data, mem_ctx, (void *) key, + &prog_data->base.base, NULL, nir, 8, + shader_time_index, input_vue_map); + if (!v.run_tes()) { + if (error_str) + *error_str = ralloc_strdup(mem_ctx, v.fail_msg); + return NULL; + } + + prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs; + prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8; + + fs_generator g(compiler, log_data, mem_ctx, (void *) key, + &prog_data->base.base, v.promoted_constants, false, + MESA_SHADER_TESS_EVAL); + if (unlikely(INTEL_DEBUG & DEBUG_TES)) { + g.enable_debug(ralloc_asprintf(mem_ctx, + "%s tessellation evaluation shader %s", + nir->info->label ? nir->info->label + : "unnamed", + nir->info->name)); + } + + g.generate_code(v.cfg, 8); + + return g.get_assembly(final_assembly_size); + } else { + brw::vec4_tes_visitor v(compiler, log_data, key, prog_data, + nir, mem_ctx, shader_time_index); + if (!v.run()) { + if (error_str) + *error_str = ralloc_strdup(mem_ctx, v.fail_msg); + return NULL; + } + + if (unlikely(INTEL_DEBUG & DEBUG_TES)) + v.dump_instructions(); + + return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir, + &prog_data->base, v.cfg, + final_assembly_size); + } +} diff --git a/src/intel/compiler/brw_shader.h b/src/intel/compiler/brw_shader.h new file mode 100644 index 00000000000..5a253e66570 --- /dev/null +++ b/src/intel/compiler/brw_shader.h @@ -0,0 +1,295 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + 
* the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#include <stdint.h> +#include "brw_reg.h" +#include "brw_compiler.h" +#include "brw_eu_defines.h" +#include "brw_inst.h" +#include "compiler/nir/nir.h" + +#ifdef __cplusplus +#include "brw_ir_allocator.h" +#endif + +#define MAX_SAMPLER_MESSAGE_SIZE 11 +#define MAX_VGRF_SIZE 16 + +#ifdef __cplusplus +struct backend_reg : private brw_reg +{ + backend_reg() {} + backend_reg(const struct brw_reg ®) : brw_reg(reg) {} + + const brw_reg &as_brw_reg() const + { + assert(file == ARF || file == FIXED_GRF || file == MRF || file == IMM); + assert(offset == 0); + return static_cast<const brw_reg &>(*this); + } + + brw_reg &as_brw_reg() + { + assert(file == ARF || file == FIXED_GRF || file == MRF || file == IMM); + assert(offset == 0); + return static_cast<brw_reg &>(*this); + } + + bool equals(const backend_reg &r) const; + + bool is_zero() const; + bool is_one() const; + bool is_negative_one() const; + bool is_null() const; + bool is_accumulator() const; + + /** Offset from the start of the (virtual) register in bytes. 
*/ + uint16_t offset; + + using brw_reg::type; + using brw_reg::file; + using brw_reg::negate; + using brw_reg::abs; + using brw_reg::address_mode; + using brw_reg::subnr; + using brw_reg::nr; + + using brw_reg::swizzle; + using brw_reg::writemask; + using brw_reg::indirect_offset; + using brw_reg::vstride; + using brw_reg::width; + using brw_reg::hstride; + + using brw_reg::df; + using brw_reg::f; + using brw_reg::d; + using brw_reg::ud; +}; +#endif + +struct cfg_t; +struct bblock_t; + +#ifdef __cplusplus +struct backend_instruction : public exec_node { + bool is_3src(const struct gen_device_info *devinfo) const; + bool is_tex() const; + bool is_math() const; + bool is_control_flow() const; + bool is_commutative() const; + bool can_do_source_mods() const; + bool can_do_saturate() const; + bool can_do_cmod() const; + bool reads_accumulator_implicitly() const; + bool writes_accumulator_implicitly(const struct gen_device_info *devinfo) const; + + void remove(bblock_t *block); + void insert_after(bblock_t *block, backend_instruction *inst); + void insert_before(bblock_t *block, backend_instruction *inst); + void insert_before(bblock_t *block, exec_list *list); + + /** + * True if the instruction has side effects other than writing to + * its destination registers. You are expected not to reorder or + * optimize these out unless you know what you are doing. + */ + bool has_side_effects() const; + + /** + * True if the instruction might be affected by side effects of other + * instructions. + */ + bool is_volatile() const; +#else +struct backend_instruction { + struct exec_node link; +#endif + /** @{ + * Annotation for the generated IR. One of the two can be set. + */ + const void *ir; + const char *annotation; + /** @} */ + + /** + * Execution size of the instruction. This is used by the generator to + * generate the correct binary for the given instruction. Current valid + * values are 1, 4, 8, 16, 32. 
+ */ + uint8_t exec_size; + + /** + * Channel group from the hardware execution and predication mask that + * should be applied to the instruction. The subset of channel enable + * signals (calculated from the EU control flow and predication state) + * given by [group, group + exec_size) will be used to mask GRF writes and + * any other side effects of the instruction. + */ + uint8_t group; + + uint32_t offset; /**< spill/unspill offset or texture offset bitfield */ + uint8_t mlen; /**< SEND message length */ + int8_t base_mrf; /**< First MRF in the SEND message, if mlen is nonzero. */ + uint8_t target; /**< MRT target. */ + unsigned size_written; /**< Data written to the destination register in bytes. */ + + enum opcode opcode; /* BRW_OPCODE_* or FS_OPCODE_* */ + enum brw_conditional_mod conditional_mod; /**< BRW_CONDITIONAL_* */ + enum brw_predicate predicate; + bool predicate_inverse:1; + bool writes_accumulator:1; /**< instruction implicitly writes accumulator */ + bool force_writemask_all:1; + bool no_dd_clear:1; + bool no_dd_check:1; + bool saturate:1; + bool shadow_compare:1; + + /* Chooses which flag subregister (f0.0 or f0.1) is used for conditional + * mod and predication. + */ + unsigned flag_subreg:1; + + /** The number of hardware registers used for a message header. 
*/ + uint8_t header_size; +}; + +#ifdef __cplusplus + +enum instruction_scheduler_mode { + SCHEDULE_PRE, + SCHEDULE_PRE_NON_LIFO, + SCHEDULE_PRE_LIFO, + SCHEDULE_POST, +}; + +struct backend_shader { +protected: + + backend_shader(const struct brw_compiler *compiler, + void *log_data, + void *mem_ctx, + const nir_shader *shader, + struct brw_stage_prog_data *stage_prog_data); + +public: + + const struct brw_compiler *compiler; + void *log_data; /* Passed to compiler->*_log functions */ + + const struct gen_device_info * const devinfo; + const nir_shader *nir; + struct brw_stage_prog_data * const stage_prog_data; + + /** ralloc context for temporary data used during compile */ + void *mem_ctx; + + /** + * List of either fs_inst or vec4_instruction (inheriting from + * backend_instruction) + */ + exec_list instructions; + + cfg_t *cfg; + + gl_shader_stage stage; + bool debug_enabled; + const char *stage_name; + const char *stage_abbrev; + + brw::simple_allocator alloc; + + virtual void dump_instruction(backend_instruction *inst) = 0; + virtual void dump_instruction(backend_instruction *inst, FILE *file) = 0; + virtual void dump_instructions(); + virtual void dump_instructions(const char *name); + + void calculate_cfg(); + + virtual void invalidate_live_intervals() = 0; +}; + +bool brw_texture_offset(int *offsets, + unsigned num_components, + uint32_t *offset_bits); + +void brw_setup_image_uniform_values(gl_shader_stage stage, + struct brw_stage_prog_data *stage_prog_data, + unsigned param_start_index, + const gl_uniform_storage *storage); + +#else +struct backend_shader; +#endif /* __cplusplus */ + +enum brw_reg_type brw_type_for_base_type(const struct glsl_type *type); +enum brw_conditional_mod brw_conditional_for_comparison(unsigned int op); +uint32_t brw_math_function(enum opcode op); +const char *brw_instruction_name(const struct gen_device_info *devinfo, + enum opcode op); +bool brw_saturate_immediate(enum brw_reg_type type, struct brw_reg *reg); +bool 
brw_negate_immediate(enum brw_reg_type type, struct brw_reg *reg); +bool brw_abs_immediate(enum brw_reg_type type, struct brw_reg *reg); + +bool opt_predicated_break(struct backend_shader *s); + +#ifdef __cplusplus +extern "C" { +#endif + +/* brw_fs_reg_allocate.cpp */ +void brw_fs_alloc_reg_sets(struct brw_compiler *compiler); + +/* brw_vec4_reg_allocate.cpp */ +void brw_vec4_alloc_reg_set(struct brw_compiler *compiler); + +/* brw_disasm.c */ +extern const char *const conditional_modifier[16]; +extern const char *const pred_ctrl_align16[16]; + +/* Per-thread scratch space is a power-of-two multiple of 1KB. */ +static inline int +brw_get_scratch_size(int size) +{ + return MAX2(1024, util_next_power_of_two(size)); +} + +/** + * Scratch data used when compiling a GLSL geometry shader. + */ +struct brw_gs_compile +{ + struct brw_gs_prog_key key; + struct brw_vue_map input_vue_map; + + unsigned control_data_bits_per_vertex; + unsigned control_data_header_size_bits; +}; + +unsigned get_atomic_counter_op(nir_intrinsic_op op); + +#ifdef __cplusplus +} +#endif diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp new file mode 100644 index 00000000000..d7c09093032 --- /dev/null +++ b/src/intel/compiler/brw_vec4.cpp @@ -0,0 +1,2851 @@ +/* + * Copyright © 2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_vec4.h" +#include "brw_fs.h" +#include "brw_cfg.h" +#include "brw_nir.h" +#include "brw_vec4_builder.h" +#include "brw_vec4_live_variables.h" +#include "brw_vec4_vs.h" +#include "brw_dead_control_flow.h" +#include "common/gen_debug.h" +#include "program/prog_parameter.h" + +#define MAX_INSTRUCTION (1 << 30) + +using namespace brw; + +namespace brw { + +void +src_reg::init() +{ + memset(this, 0, sizeof(*this)); + + this->file = BAD_FILE; +} + +src_reg::src_reg(enum brw_reg_file file, int nr, const glsl_type *type) +{ + init(); + + this->file = file; + this->nr = nr; + if (type && (type->is_scalar() || type->is_vector() || type->is_matrix())) + this->swizzle = brw_swizzle_for_size(type->vector_elements); + else + this->swizzle = BRW_SWIZZLE_XYZW; + if (type) + this->type = brw_type_for_base_type(type); +} + +/** Generic unset register constructor. 
*/ +src_reg::src_reg() +{ + init(); +} + +src_reg::src_reg(struct ::brw_reg reg) : + backend_reg(reg) +{ + this->offset = 0; + this->reladdr = NULL; +} + +src_reg::src_reg(const dst_reg ®) : + backend_reg(reg) +{ + this->reladdr = reg.reladdr; + this->swizzle = brw_swizzle_for_mask(reg.writemask); +} + +void +dst_reg::init() +{ + memset(this, 0, sizeof(*this)); + this->file = BAD_FILE; + this->writemask = WRITEMASK_XYZW; +} + +dst_reg::dst_reg() +{ + init(); +} + +dst_reg::dst_reg(enum brw_reg_file file, int nr) +{ + init(); + + this->file = file; + this->nr = nr; +} + +dst_reg::dst_reg(enum brw_reg_file file, int nr, const glsl_type *type, + unsigned writemask) +{ + init(); + + this->file = file; + this->nr = nr; + this->type = brw_type_for_base_type(type); + this->writemask = writemask; +} + +dst_reg::dst_reg(enum brw_reg_file file, int nr, brw_reg_type type, + unsigned writemask) +{ + init(); + + this->file = file; + this->nr = nr; + this->type = type; + this->writemask = writemask; +} + +dst_reg::dst_reg(struct ::brw_reg reg) : + backend_reg(reg) +{ + this->offset = 0; + this->reladdr = NULL; +} + +dst_reg::dst_reg(const src_reg ®) : + backend_reg(reg) +{ + this->writemask = brw_mask_for_swizzle(reg.swizzle); + this->reladdr = reg.reladdr; +} + +bool +dst_reg::equals(const dst_reg &r) const +{ + return (this->backend_reg::equals(r) && + (reladdr == r.reladdr || + (reladdr && r.reladdr && reladdr->equals(*r.reladdr)))); +} + +bool +vec4_instruction::is_send_from_grf() +{ + switch (opcode) { + case SHADER_OPCODE_SHADER_TIME_ADD: + case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: + case SHADER_OPCODE_UNTYPED_ATOMIC: + case SHADER_OPCODE_UNTYPED_SURFACE_READ: + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: + case SHADER_OPCODE_TYPED_ATOMIC: + case SHADER_OPCODE_TYPED_SURFACE_READ: + case SHADER_OPCODE_TYPED_SURFACE_WRITE: + case VEC4_OPCODE_URB_READ: + case TCS_OPCODE_URB_WRITE: + case TCS_OPCODE_RELEASE_INPUT: + case SHADER_OPCODE_BARRIER: + return true; + default: + return 
false; + } +} + +/** + * Returns true if this instruction's sources and destinations cannot + * safely be the same register. + * + * In most cases, a register can be written over safely by the same + * instruction that is its last use. For a single instruction, the + * sources are dereferenced before writing of the destination starts + * (naturally). + * + * However, there are a few cases where this can be problematic: + * + * - Virtual opcodes that translate to multiple instructions in the + * code generator: if src == dst and one instruction writes the + * destination before a later instruction reads the source, then + * src will have been clobbered. + * + * The register allocator uses this information to set up conflicts between + * GRF sources and the destination. + */ +bool +vec4_instruction::has_source_and_destination_hazard() const +{ + switch (opcode) { + case TCS_OPCODE_SET_INPUT_URB_OFFSETS: + case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: + case TES_OPCODE_ADD_INDIRECT_URB_OFFSET: + return true; + default: + /* 8-wide compressed DF operations are executed as two 4-wide operations, + * so we have a src/dst hazard if the first half of the instruction + * overwrites the source of the second half. Prevent this by marking + * compressed instructions as having src/dst hazards, so the register + * allocator assigns safe register regions for dst and srcs. 
+ */ + return size_written > REG_SIZE; + } +} + +unsigned +vec4_instruction::size_read(unsigned arg) const +{ + switch (opcode) { + case SHADER_OPCODE_SHADER_TIME_ADD: + case SHADER_OPCODE_UNTYPED_ATOMIC: + case SHADER_OPCODE_UNTYPED_SURFACE_READ: + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: + case SHADER_OPCODE_TYPED_ATOMIC: + case SHADER_OPCODE_TYPED_SURFACE_READ: + case SHADER_OPCODE_TYPED_SURFACE_WRITE: + case TCS_OPCODE_URB_WRITE: + if (arg == 0) + return mlen * REG_SIZE; + break; + case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: + if (arg == 1) + return mlen * REG_SIZE; + break; + default: + break; + } + + switch (src[arg].file) { + case BAD_FILE: + return 0; + case IMM: + case UNIFORM: + return 4 * type_sz(src[arg].type); + default: + /* XXX - Represent actual vertical stride. */ + return exec_size * type_sz(src[arg].type); + } +} + +bool +vec4_instruction::can_do_source_mods(const struct gen_device_info *devinfo) +{ + if (devinfo->gen == 6 && is_math()) + return false; + + if (is_send_from_grf()) + return false; + + if (!backend_instruction::can_do_source_mods()) + return false; + + return true; +} + +bool +vec4_instruction::can_do_writemask(const struct gen_device_info *devinfo) +{ + switch (opcode) { + case SHADER_OPCODE_GEN4_SCRATCH_READ: + case VEC4_OPCODE_FROM_DOUBLE: + case VEC4_OPCODE_TO_DOUBLE: + case VEC4_OPCODE_PICK_LOW_32BIT: + case VEC4_OPCODE_PICK_HIGH_32BIT: + case VEC4_OPCODE_SET_LOW_32BIT: + case VEC4_OPCODE_SET_HIGH_32BIT: + case VS_OPCODE_PULL_CONSTANT_LOAD: + case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: + case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9: + case TCS_OPCODE_SET_INPUT_URB_OFFSETS: + case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: + case TES_OPCODE_CREATE_INPUT_READ_HEADER: + case TES_OPCODE_ADD_INDIRECT_URB_OFFSET: + case VEC4_OPCODE_URB_READ: + case SHADER_OPCODE_MOV_INDIRECT: + return false; + default: + /* The MATH instruction on Gen6 only executes in align1 mode, which does + * not support writemasking. 
+ */ + if (devinfo->gen == 6 && is_math()) + return false; + + if (is_tex()) + return false; + + return true; + } +} + +bool +vec4_instruction::can_change_types() const +{ + return dst.type == src[0].type && + !src[0].abs && !src[0].negate && !saturate && + (opcode == BRW_OPCODE_MOV || + (opcode == BRW_OPCODE_SEL && + dst.type == src[1].type && + predicate != BRW_PREDICATE_NONE && + !src[1].abs && !src[1].negate)); +} + +/** + * Returns how many MRFs an opcode will write over. + * + * Note that this is not the 0 or 1 implied writes in an actual gen + * instruction -- the generate_* functions generate additional MOVs + * for setup. + */ +int +vec4_visitor::implied_mrf_writes(vec4_instruction *inst) +{ + if (inst->mlen == 0 || inst->is_send_from_grf()) + return 0; + + switch (inst->opcode) { + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + return 1; + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + case SHADER_OPCODE_POW: + case TCS_OPCODE_THREAD_END: + return 2; + case VS_OPCODE_URB_WRITE: + return 1; + case VS_OPCODE_PULL_CONSTANT_LOAD: + return 2; + case SHADER_OPCODE_GEN4_SCRATCH_READ: + return 2; + case SHADER_OPCODE_GEN4_SCRATCH_WRITE: + return 3; + case GS_OPCODE_URB_WRITE: + case GS_OPCODE_URB_WRITE_ALLOCATE: + case GS_OPCODE_THREAD_END: + return 0; + case GS_OPCODE_FF_SYNC: + return 1; + case TCS_OPCODE_URB_WRITE: + return 0; + case SHADER_OPCODE_SHADER_TIME_ADD: + return 0; + case SHADER_OPCODE_TEX: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_CMS_W: + case SHADER_OPCODE_TXF_MCS: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_TG4_OFFSET: + case SHADER_OPCODE_SAMPLEINFO: + case VS_OPCODE_GET_BUFFER_SIZE: + return inst->header_size; + default: + unreachable("not reached"); + } +} + +bool 
+src_reg::equals(const src_reg &r) const +{ + return (this->backend_reg::equals(r) && + !reladdr && !r.reladdr); +} + +bool +vec4_visitor::opt_vector_float() +{ + bool progress = false; + + foreach_block(block, cfg) { + int last_reg = -1, last_offset = -1; + enum brw_reg_file last_reg_file = BAD_FILE; + + uint8_t imm[4] = { 0 }; + int inst_count = 0; + vec4_instruction *imm_inst[4]; + unsigned writemask = 0; + enum brw_reg_type dest_type = BRW_REGISTER_TYPE_F; + + foreach_inst_in_block_safe(vec4_instruction, inst, block) { + int vf = -1; + enum brw_reg_type need_type; + + /* Look for unconditional MOVs from an immediate with a partial + * writemask. Skip type-conversion MOVs other than integer 0, + * where the type doesn't matter. See if the immediate can be + * represented as a VF. + */ + if (inst->opcode == BRW_OPCODE_MOV && + inst->src[0].file == IMM && + inst->predicate == BRW_PREDICATE_NONE && + inst->dst.writemask != WRITEMASK_XYZW && + type_sz(inst->src[0].type) < 8 && + (inst->src[0].type == inst->dst.type || inst->src[0].d == 0)) { + + vf = brw_float_to_vf(inst->src[0].d); + need_type = BRW_REGISTER_TYPE_D; + + if (vf == -1) { + vf = brw_float_to_vf(inst->src[0].f); + need_type = BRW_REGISTER_TYPE_F; + } + } else { + last_reg = -1; + } + + /* If this wasn't a MOV, or the destination register doesn't match, + * or we have to switch destination types, then this breaks our + * sequence. Combine anything we've accumulated so far. 
+ */ + if (last_reg != inst->dst.nr || + last_offset != inst->dst.offset || + last_reg_file != inst->dst.file || + (vf > 0 && dest_type != need_type)) { + + if (inst_count > 1) { + unsigned vf; + memcpy(&vf, imm, sizeof(vf)); + vec4_instruction *mov = MOV(imm_inst[0]->dst, brw_imm_vf(vf)); + mov->dst.type = dest_type; + mov->dst.writemask = writemask; + inst->insert_before(block, mov); + + for (int i = 0; i < inst_count; i++) { + imm_inst[i]->remove(block); + } + + progress = true; + } + + inst_count = 0; + last_reg = -1; + writemask = 0; + dest_type = BRW_REGISTER_TYPE_F; + + for (int i = 0; i < 4; i++) { + imm[i] = 0; + } + } + + /* Record this instruction's value (if it was representable). */ + if (vf != -1) { + if ((inst->dst.writemask & WRITEMASK_X) != 0) + imm[0] = vf; + if ((inst->dst.writemask & WRITEMASK_Y) != 0) + imm[1] = vf; + if ((inst->dst.writemask & WRITEMASK_Z) != 0) + imm[2] = vf; + if ((inst->dst.writemask & WRITEMASK_W) != 0) + imm[3] = vf; + + writemask |= inst->dst.writemask; + imm_inst[inst_count++] = inst; + + last_reg = inst->dst.nr; + last_offset = inst->dst.offset; + last_reg_file = inst->dst.file; + if (vf > 0) + dest_type = need_type; + } + } + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + +/* Replaces unused channels of a swizzle with channels that are used. + * + * For instance, this pass transforms + * + * mov vgrf4.yz, vgrf5.wxzy + * + * into + * + * mov vgrf4.yz, vgrf5.xxzx + * + * This eliminates false uses of some channels, letting dead code elimination + * remove the instructions that wrote them. + */ +bool +vec4_visitor::opt_reduce_swizzle() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { + if (inst->dst.file == BAD_FILE || + inst->dst.file == ARF || + inst->dst.file == FIXED_GRF || + inst->is_send_from_grf()) + continue; + + unsigned swizzle; + + /* Determine which channels of the sources are read. 
*/ + switch (inst->opcode) { + case VEC4_OPCODE_PACK_BYTES: + case BRW_OPCODE_DP4: + case BRW_OPCODE_DPH: /* FINISHME: DPH reads only three channels of src0, + * but all four of src1. + */ + swizzle = brw_swizzle_for_size(4); + break; + case BRW_OPCODE_DP3: + swizzle = brw_swizzle_for_size(3); + break; + case BRW_OPCODE_DP2: + swizzle = brw_swizzle_for_size(2); + break; + + case VEC4_OPCODE_TO_DOUBLE: + case VEC4_OPCODE_FROM_DOUBLE: + case VEC4_OPCODE_PICK_LOW_32BIT: + case VEC4_OPCODE_PICK_HIGH_32BIT: + case VEC4_OPCODE_SET_LOW_32BIT: + case VEC4_OPCODE_SET_HIGH_32BIT: + swizzle = brw_swizzle_for_size(4); + break; + + default: + swizzle = brw_swizzle_for_mask(inst->dst.writemask); + break; + } + + /* Update sources' swizzles. */ + for (int i = 0; i < 3; i++) { + if (inst->src[i].file != VGRF && + inst->src[i].file != ATTR && + inst->src[i].file != UNIFORM) + continue; + + const unsigned new_swizzle = + brw_compose_swizzle(swizzle, inst->src[i].swizzle); + if (inst->src[i].swizzle != new_swizzle) { + inst->src[i].swizzle = new_swizzle; + progress = true; + } + } + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + +void +vec4_visitor::split_uniform_registers() +{ + /* Prior to this, uniforms have been in an array sized according to + * the number of vector uniforms present, sparsely filled (so an + * aggregate results in reg indices being skipped over). Now we're + * going to cut those aggregates up so each .nr index is one + * vector. The goal is to make elimination of unused uniform + * components easier later. 
+ */ + foreach_block_and_inst(block, vec4_instruction, inst, cfg) { + for (int i = 0 ; i < 3; i++) { + if (inst->src[i].file != UNIFORM) + continue; + + assert(!inst->src[i].reladdr); + + inst->src[i].nr += inst->src[i].offset / 16; + inst->src[i].offset %= 16; + } + } +} + +void +vec4_visitor::pack_uniform_registers() +{ + uint8_t chans_used[this->uniforms]; + int new_loc[this->uniforms]; + int new_chan[this->uniforms]; + + memset(chans_used, 0, sizeof(chans_used)); + memset(new_loc, 0, sizeof(new_loc)); + memset(new_chan, 0, sizeof(new_chan)); + + /* Find which uniform vectors are actually used by the program. We + * expect unused vector elements when we've moved array access out + * to pull constants, and from some GLSL code generators like wine. + */ + foreach_block_and_inst(block, vec4_instruction, inst, cfg) { + unsigned readmask; + switch (inst->opcode) { + case VEC4_OPCODE_PACK_BYTES: + case BRW_OPCODE_DP4: + case BRW_OPCODE_DPH: + readmask = 0xf; + break; + case BRW_OPCODE_DP3: + readmask = 0x7; + break; + case BRW_OPCODE_DP2: + readmask = 0x3; + break; + default: + readmask = inst->dst.writemask; + break; + } + + for (int i = 0 ; i < 3; i++) { + if (inst->src[i].file != UNIFORM) + continue; + + assert(type_sz(inst->src[i].type) % 4 == 0); + unsigned channel_size = type_sz(inst->src[i].type) / 4; + + int reg = inst->src[i].nr; + for (int c = 0; c < 4; c++) { + if (!(readmask & (1 << c))) + continue; + + unsigned channel = BRW_GET_SWZ(inst->src[i].swizzle, c) + 1; + unsigned used = MAX2(chans_used[reg], channel * channel_size); + if (used <= 4) + chans_used[reg] = used; + else + chans_used[reg + 1] = used - 4; + } + } + + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && + inst->src[0].file == UNIFORM) { + assert(inst->src[2].file == BRW_IMMEDIATE_VALUE); + assert(inst->src[0].subnr == 0); + + unsigned bytes_read = inst->src[2].ud; + assert(bytes_read % 4 == 0); + unsigned vec4s_read = DIV_ROUND_UP(bytes_read, 16); + + /* We just mark every register touched 
by a MOV_INDIRECT as being + * fully used. This ensures that it doesn't broken up piecewise by + * the next part of our packing algorithm. + */ + int reg = inst->src[0].nr; + for (unsigned i = 0; i < vec4s_read; i++) + chans_used[reg + i] = 4; + } + } + + int new_uniform_count = 0; + + /* Now, figure out a packing of the live uniform vectors into our + * push constants. + */ + for (int src = 0; src < uniforms; src++) { + int size = chans_used[src]; + + if (size == 0) + continue; + + int dst; + /* Find the lowest place we can slot this uniform in. */ + for (dst = 0; dst < src; dst++) { + if (chans_used[dst] + size <= 4) + break; + } + + if (src == dst) { + new_loc[src] = dst; + new_chan[src] = 0; + } else { + new_loc[src] = dst; + new_chan[src] = chans_used[dst]; + + /* Move the references to the data */ + for (int j = 0; j < size; j++) { + stage_prog_data->param[dst * 4 + new_chan[src] + j] = + stage_prog_data->param[src * 4 + j]; + } + + chans_used[dst] += size; + chans_used[src] = 0; + } + + new_uniform_count = MAX2(new_uniform_count, dst + 1); + } + + this->uniforms = new_uniform_count; + + /* Now, update the instructions for our repacked uniforms. */ + foreach_block_and_inst(block, vec4_instruction, inst, cfg) { + for (int i = 0 ; i < 3; i++) { + int src = inst->src[i].nr; + + if (inst->src[i].file != UNIFORM) + continue; + + inst->src[i].nr = new_loc[src]; + inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src], + new_chan[src], new_chan[src]); + } + } +} + +/** + * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a). + * + * While GLSL IR also performs this optimization, we end up with it in + * our instruction stream for a couple of reasons. One is that we + * sometimes generate silly instructions, for example in array access + * where we'll generate "ADD offset, index, base" even if base is 0. 
+ * The other is that GLSL IR's constant propagation doesn't track the
+ * components of aggregates, so some VS patterns (initialize matrix to
+ * 0, accumulate in vertex blending factors) end up breaking down to
+ * instructions involving 0.
+ *
+ * Returns true when at least one instruction was rewritten; in that case
+ * the live intervals are invalidated before returning.
+ */
+bool
+vec4_visitor::opt_algebraic()
+{
+   bool progress = false;
+
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      switch (inst->opcode) {
+      case BRW_OPCODE_MOV:
+         if (inst->src[0].file != IMM)
+            break;
+         /* Only MOVs with an immediate source are considered below. */
+         if (inst->saturate) {
+            if (inst->dst.type != inst->src[0].type)
+               assert(!"unimplemented: saturate mixed types");
+            /* The .sat modifier is dropped when brw_saturate_immediate()
+             * returns true; presumably it clamped the immediate in place --
+             * confirm against its definition.
+             */
+            if (brw_saturate_immediate(inst->dst.type,
+                                       &inst->src[0].as_brw_reg())) {
+               inst->saturate = false;
+               progress = true;
+            }
+         }
+         break;
+      /* An unpack whose source is not a UNIFORM becomes a plain MOV. */
+      case VEC4_OPCODE_UNPACK_UNIFORM:
+         if (inst->src[0].file != UNIFORM) {
+            inst->opcode = BRW_OPCODE_MOV;
+            progress = true;
+         }
+         break;
+      /* a + 0 -> MOV a */
+      case BRW_OPCODE_ADD:
+         if (inst->src[1].is_zero()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[1] = src_reg();
+            progress = true;
+         }
+         break;
+      /* a * 0 -> MOV 0 (immediate typed like src[0]; only F, D and UD are
+       * handled), a * 1 -> MOV a, a * -1 -> MOV -a.
+       */
+      case BRW_OPCODE_MUL:
+         if (inst->src[1].is_zero()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            switch (inst->src[0].type) {
+            case BRW_REGISTER_TYPE_F:
+               inst->src[0] = brw_imm_f(0.0f);
+               break;
+            case BRW_REGISTER_TYPE_D:
+               inst->src[0] = brw_imm_d(0);
+               break;
+            case BRW_REGISTER_TYPE_UD:
+               inst->src[0] = brw_imm_ud(0u);
+               break;
+            default:
+               unreachable("not reached");
+            }
+            inst->src[1] = src_reg();
+            progress = true;
+         } else if (inst->src[1].is_one()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[1] = src_reg();
+            progress = true;
+         } else if (inst->src[1].is_negative_one()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0].negate = !inst->src[0].negate;
+            inst->src[1] = src_reg();
+            progress = true;
+         }
+         break;
+      case BRW_OPCODE_CMP: /* CMP.ge -|a|, 0  ->  CMP.z a, 0 */
+         if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
+             inst->src[0].abs &&
+             inst->src[0].negate &&
+             inst->src[1].is_zero()) {
+            inst->src[0].abs = false;
+            inst->src[0].negate = false;
+            inst->conditional_mod = BRW_CONDITIONAL_Z;
+            progress = true;
+            break;
+         }
+         break;
+      case SHADER_OPCODE_BROADCAST: /* uniform value or index 0 -> MOV */
+         if (is_uniform(inst->src[0]) ||
+             inst->src[1].is_zero()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[1] = src_reg();
+            inst->force_writemask_all = true;
+            progress = true;
+         }
+         break;
+
+      default:
+         break;
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+/**
+ * Only a limited number of hardware registers may be used for push
+ * constants, so this turns access to the overflowed constants into
+ * pull constants.
+ *
+ * Nothing is changed when all uniforms fit in the push-constant budget.
+ */
+void
+vec4_visitor::move_push_constants_to_pull_constants()
+{
+   int pull_constant_loc[this->uniforms]; /* per vec4 uniform: index of its
+                                           * vec4 in pull_param, or -1 when
+                                           * it stays a push constant */
+
+   /* Only allow 32 registers (256 uniform components) as push constants,
+    * which is the limit on gen6.
+    *
+    * If changing this value, note the limitation about total_regs in
+    * brw_curbe.c.
+    */
+   int max_uniform_components = 32 * 8;
+   if (this->uniforms * 4 <= max_uniform_components)
+      return;
+
+   /* Make some sort of choice as to which uniforms get sent to pull
+    * constants. We could potentially do something clever here like
+    * look for the most infrequently used uniform vec4s, but leave
+    * that for later.
+    */
+   for (int i = 0; i < this->uniforms * 4; i += 4) {
+      pull_constant_loc[i / 4] = -1;
+
+      if (i >= max_uniform_components) {
+         const gl_constant_value **values = &stage_prog_data->param[i];
+
+         /* Try to find an existing copy of this uniform in the pull
+          * constants if it was part of an array access already.
+          */
+         for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) {
+            int matches;
+
+            for (matches = 0; matches < 4; matches++) {
+               if (stage_prog_data->pull_param[j + matches] != values[matches])
+                  break;
+            }
+
+            if (matches == 4) {
+               pull_constant_loc[i / 4] = j / 4;
+               break;
+            }
+         }
+         /* No existing copy: append the four components as a new vec4. */
+         if (pull_constant_loc[i / 4] == -1) {
+            assert(stage_prog_data->nr_pull_params % 4 == 0);
+            pull_constant_loc[i / 4] = stage_prog_data->nr_pull_params / 4;
+
+            for (int j = 0; j < 4; j++) {
+               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
+                  values[j];
+            }
+         }
+      }
+   }
+
+   /* Now actually rewrite usage of the things we've moved to pull
+    * constants.
+    */
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      for (int i = 0 ; i < 3; i++) {
+         if (inst->src[i].file != UNIFORM ||
+             pull_constant_loc[inst->src[i].nr] == -1)
+            continue;
+
+         int uniform = inst->src[i].nr;
+         /* 8-byte source types are loaded into a dvec4 temporary. */
+         const glsl_type *temp_type = type_sz(inst->src[i].type) == 8 ?
+            glsl_type::dvec4_type : glsl_type::vec4_type;
+         dst_reg temp = dst_reg(this, temp_type);
+
+         emit_pull_constant_load(block, inst, temp, inst->src[i],
+                                 pull_constant_loc[uniform], src_reg());
+         /* Point the source at the temporary that received the load. */
+         inst->src[i].file = temp.file;
+         inst->src[i].nr = temp.nr;
+         inst->src[i].offset %= 16;
+         inst->src[i].reladdr = NULL;
+      }
+   }
+
+   /* Repack push constants to remove the now-unused ones. */
+   pack_uniform_registers();
+}
+
+/* Conditions for which we want to avoid setting the dependency control bits.
+ *
+ * Returns true when \p inst matches any of the cases checked below.
+ */
+bool
+vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
+{
+#define IS_DWORD(reg) \
+   (reg.type == BRW_REGISTER_TYPE_UD || \
+    reg.type == BRW_REGISTER_TYPE_D)
+
+#define IS_64BIT(reg) (reg.file != BAD_FILE && type_sz(reg.type) == 8)
+
+   /* From the Cherryview and Broadwell PRMs:
+    *
+    * "When source or destination datatype is 64b or operation is integer DWord
+    * multiply, DepCtrl must not be used."
+    *
+    * SKL PRMs don't include this restriction, however, gen7 seems to be
+    * affected, at least by the 64b restriction, since DepCtrl with double
+    * precision instructions seems to produce GPU hangs in some cases.
+    */
+   if (devinfo->gen == 8 || devinfo->is_broxton) {
+      if (inst->opcode == BRW_OPCODE_MUL &&
+         IS_DWORD(inst->src[0]) &&
+         IS_DWORD(inst->src[1]))
+         return true;
+   }
+
+   if (devinfo->gen >= 7 && devinfo->gen <= 8) {
+      if (IS_64BIT(inst->dst) || IS_64BIT(inst->src[0]) ||
+          IS_64BIT(inst->src[1]) || IS_64BIT(inst->src[2]))
+      return true;
+   }
+
+#undef IS_64BIT
+#undef IS_DWORD
+
+   if (devinfo->gen >= 8) {
+      if (inst->opcode == BRW_OPCODE_F32TO16)
+         return true;
+   }
+
+   /*
+    * mlen:
+    * In the presence of send messages, totally interrupt dependency
+    * control. They're long enough that the chance of dependency
+    * control around them just doesn't matter.
+    *
+    * predicate:
+    * From the Ivy Bridge PRM, volume 4 part 3.7, page 80:
+    * When a sequence of NoDDChk and NoDDClr are used, the last instruction that
+    * completes the scoreboard clear must have a non-zero execution mask. This
+    * means, if any kind of predication can change the execution mask or channel
+    * enable of the last instruction, the optimization must be avoided. This is
+    * to avoid instructions being shot down the pipeline when no writes are
+    * required.
+    *
+    * math:
+    * Dependency control does not work well over math instructions.
+    * NB: Discovered empirically
+    */
+   return (inst->mlen || inst->predicate || inst->is_math());
+}
+
+/**
+ * Sets the dependency control fields on instructions after register
+ * allocation and before the generator is run.
+ *
+ * When you have a sequence of instructions like:
+ *
+ * DP4 temp.x vertex uniform[0]
+ * DP4 temp.y vertex uniform[0]
+ * DP4 temp.z vertex uniform[0]
+ * DP4 temp.w vertex uniform[0]
+ *
+ * The hardware doesn't know that it can actually run the later instructions
+ * while the previous ones are in flight, producing stalls.
However, we have
+ * manual fields we can set in the instructions that let it do so.
+ *
+ * Tracking is per basic block: both "last write" tables are cleared at the
+ * start of every block.
+ */
+void
+vec4_visitor::opt_set_dependency_control()
+{
+   vec4_instruction *last_grf_write[BRW_MAX_GRF]; /* last tracked writer of
+                                                   * each GRF, NULL if none */
+   uint8_t grf_channels_written[BRW_MAX_GRF]; /* writemask bits accumulated
+                                               * for that tracked writer chain;
+                                               * only read while the matching
+                                               * last_grf_write entry is set */
+   vec4_instruction *last_mrf_write[BRW_MAX_GRF]; /* NOTE(review): the MRF
+                                                   * tables are also sized
+                                                   * with BRW_MAX_GRF */
+   uint8_t mrf_channels_written[BRW_MAX_GRF];
+
+   assert(prog_data->total_grf ||
+          !"Must be called after register allocation");
+
+   foreach_block (block, cfg) {
+      memset(last_grf_write, 0, sizeof(last_grf_write));
+      memset(last_mrf_write, 0, sizeof(last_mrf_write));
+
+      foreach_inst_in_block (vec4_instruction, inst, block) {
+         /* If we read from a register that we were doing dependency control
+          * on, don't do dependency control across the read.
+          *
+          * A FIXED_GRF source drops every tracked GRF write, not just one.
+          */
+         for (int i = 0; i < 3; i++) {
+            int reg = inst->src[i].nr + inst->src[i].offset / REG_SIZE;
+            if (inst->src[i].file == VGRF) {
+               last_grf_write[reg] = NULL;
+            } else if (inst->src[i].file == FIXED_GRF) {
+               memset(last_grf_write, 0, sizeof(last_grf_write));
+               break;
+            }
+            assert(inst->src[i].file != MRF);
+         }
+         /* Unsafe instructions reset all tracking and are never marked. */
+         if (is_dep_ctrl_unsafe(inst)) {
+            memset(last_grf_write, 0, sizeof(last_grf_write));
+            memset(last_mrf_write, 0, sizeof(last_mrf_write));
+            continue;
+         }
+
+         /* Now, see if we can do dependency control for this instruction
+          * against a previous one writing to its destination.
+          *
+          * The pair is marked only when both write the same register at the
+          * same offset and this write touches no channel already written.
+          */
+         int reg = inst->dst.nr + inst->dst.offset / REG_SIZE;
+         if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) {
+            if (last_grf_write[reg] &&
+                last_grf_write[reg]->dst.offset == inst->dst.offset &&
+                !(inst->dst.writemask & grf_channels_written[reg])) {
+               last_grf_write[reg]->no_dd_clear = true;
+               inst->no_dd_check = true;
+            } else {
+               grf_channels_written[reg] = 0;
+            }
+
+            last_grf_write[reg] = inst;
+            grf_channels_written[reg] |= inst->dst.writemask;
+         } else if (inst->dst.file == MRF) {
+            if (last_mrf_write[reg] &&
+                last_mrf_write[reg]->dst.offset == inst->dst.offset &&
+                !(inst->dst.writemask & mrf_channels_written[reg])) {
+               last_mrf_write[reg]->no_dd_clear = true;
+               inst->no_dd_check = true;
+            } else {
+               mrf_channels_written[reg] = 0;
+            }
+
+            last_mrf_write[reg] = inst;
+            mrf_channels_written[reg] |= inst->dst.writemask;
+         }
+      }
+   }
+}
+/**
+ * Returns whether this instruction can be rewritten by reswizzle() for the
+ * given destination writemask and swizzle.
+ *
+ * \p swizzle_mask is the set of channels referenced by \p swizzle; the
+ * instruction must not write any channel outside of it. Instructions with
+ * a message length (mlen > 0) or an accumulator source are rejected.
+ */
+bool
+vec4_instruction::can_reswizzle(const struct gen_device_info *devinfo,
+                                int dst_writemask,
+                                int swizzle,
+                                int swizzle_mask)
+{
+   /* Gen6 MATH instructions can not execute in align16 mode, so swizzles
+    * are not allowed.
+    */
+   if (devinfo->gen == 6 && is_math() && swizzle != BRW_SWIZZLE_XYZW)
+      return false;
+   /* Without writemask support only a full XYZW writemask is acceptable. */
+   if (!can_do_writemask(devinfo) && dst_writemask != WRITEMASK_XYZW)
+      return false;
+
+   /* If this instruction sets anything not referenced by swizzle, then we'd
+    * totally break it when we reswizzle.
+    */
+   if (dst.writemask & ~swizzle_mask)
+      return false;
+
+   if (mlen > 0)
+      return false;
+
+   for (int i = 0; i < 3; i++) {
+      if (src[i].is_accumulator())
+         return false;
+   }
+
+   return true;
+}
+
+/**
+ * For any channels in the swizzle's source that were populated by this
+ * instruction, rewrite the instruction to put the appropriate result directly
+ * in those channels.
+ *
+ * e.g.
for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy z.yy_x
+ */
+void
+vec4_instruction::reswizzle(int dst_writemask, int swizzle)
+{
+   /* For the dot-product and pack_bytes opcodes the destination write mask
+    * does not correspond to the source swizzle, so their sources are left
+    * untouched.
+    */
+   const bool writemask_tracks_swizzle =
+      !(opcode == BRW_OPCODE_DP4 || opcode == BRW_OPCODE_DPH ||
+        opcode == BRW_OPCODE_DP3 || opcode == BRW_OPCODE_DP2 ||
+        opcode == VEC4_OPCODE_PACK_BYTES);
+
+   if (writemask_tracks_swizzle) {
+      for (int s = 0; s < 3; s++) {
+         /* BAD_FILE and IMM sources are skipped. */
+         if (src[s].file != BAD_FILE && src[s].file != IMM)
+            src[s].swizzle = brw_compose_swizzle(swizzle, src[s].swizzle);
+      }
+   }
+
+   /* Push the original mask of written components through the swizzle,
+    * then keep only what the requested writemask allows.
+    */
+   const unsigned swizzled_mask =
+      brw_apply_swizzle_to_mask(swizzle, dst.writemask);
+   dst.writemask = dst_writemask & swizzled_mask;
+}
+
+/*
+ * Tries to reduce extra MOV instructions by taking temporary GRFs that get
+ * just written and then MOVed into another reg and making the original write
+ * of the GRF write directly to the final destination instead.
+ */
+bool
+vec4_visitor::opt_register_coalesce()
+{
+   bool progress = false;
+   int next_ip = 0; /* running instruction index, compared with live ranges */
+
+   calculate_live_intervals();
+
+   foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
+      int ip = next_ip;
+      next_ip++;
+      /* Candidates: unpredicated, same-type MOVs from a VGRF without source
+       * modifiers or reladdr, writing a VGRF or an MRF.
+       */
+      if (inst->opcode != BRW_OPCODE_MOV ||
+          (inst->dst.file != VGRF && inst->dst.file != MRF) ||
+          inst->predicate ||
+          inst->src[0].file != VGRF ||
+          inst->dst.type != inst->src[0].type ||
+          inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
+         continue;
+
+      /* Remove no-op MOVs */
+      if (inst->dst.file == inst->src[0].file &&
+          inst->dst.nr == inst->src[0].nr &&
+          inst->dst.offset == inst->src[0].offset) {
+         bool is_nop_mov = true;
+         /* A no-op needs an identity swizzle on every written channel. */
+         for (unsigned c = 0; c < 4; c++) {
+            if ((inst->dst.writemask & (1 << c)) == 0)
+               continue;
+
+            if (BRW_GET_SWZ(inst->src[0].swizzle, c) != c) {
+               is_nop_mov = false;
+               break;
+            }
+         }
+
+         if (is_nop_mov) {
+            inst->remove(block);
+            progress = true;
+            continue;
+         }
+      }
+
+      bool to_mrf = (inst->dst.file == MRF);
+
+      /* Can't coalesce this GRF if someone else was going to
+       * read it later.
+       */
+      if (var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 8) > ip)
+         continue;
+
+      /* We need to check interference with the final destination between this
+       * instruction and the earliest instruction involved in writing the GRF
+       * we're eliminating. To do that, keep track of which of our source
+       * channels we've seen initialized.
+       */
+      const unsigned chans_needed =
+         brw_apply_inv_swizzle_to_mask(inst->src[0].swizzle,
+                                       inst->dst.writemask);
+      unsigned chans_remaining = chans_needed;
+
+      /* Now walk up the instruction stream trying to see if we can rewrite
+       * everything writing to the temporary to write into the destination
+       * instead.
+       */
+      vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev;
+      foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst,
+                                                  inst) {
+         _scan_inst = scan_inst;
+
+         if (regions_overlap(inst->src[0], inst->size_read(0),
+                             scan_inst->dst, scan_inst->size_written)) {
+            /* Found something writing to the reg we want to coalesce away. */
+            if (to_mrf) {
+               /* SEND instructions can't have MRF as a destination. */
+               if (scan_inst->mlen)
+                  break;
+
+               if (devinfo->gen == 6) {
+                  /* gen6 math instructions must have the destination be
+                   * VGRF, so no compute-to-MRF for them.
+                   */
+                  if (scan_inst->is_math()) {
+                     break;
+                  }
+               }
+            }
+
+            /* This doesn't handle saturation on the instruction we
+             * want to coalesce away if the register types do not match.
+             * But if scan_inst is a non type-converting 'mov', we can fix
+             * the types later.
+             */
+            if (inst->saturate &&
+                inst->dst.type != scan_inst->dst.type &&
+                !(scan_inst->opcode == BRW_OPCODE_MOV &&
+                  scan_inst->dst.type == scan_inst->src[0].type))
+               break;
+
+            /* Only allow coalescing between registers of the same type size.
+             * Otherwise we would need to make the pass aware of the fact that
+             * channel sizes are different for single and double precision.
+             */
+            if (type_sz(inst->src[0].type) != type_sz(scan_inst->src[0].type))
+               break;
+
+            /* Check that scan_inst writes the same amount of data as the
+             * instruction, otherwise coalescing would lead to writing a
+             * different (larger or smaller) region of the destination
+             */
+            if (scan_inst->size_written != inst->size_written)
+               break;
+
+            /* If we can't handle the swizzle, bail. */
+            if (!scan_inst->can_reswizzle(devinfo, inst->dst.writemask,
+                                          inst->src[0].swizzle,
+                                          chans_needed)) {
+               break;
+            }
+
+            /* This only handles coalescing writes of 8 channels (1 register
+             * for single-precision and 2 registers for double-precision)
+             * starting at the source offset of the copy instruction.
+             */
+            if (DIV_ROUND_UP(scan_inst->size_written,
+                             type_sz(scan_inst->dst.type)) > 8 ||
+                scan_inst->dst.offset != inst->src[0].offset)
+               break;
+
+            /* Mark which channels we found unconditional writes for. */
+            if (!scan_inst->predicate)
+               chans_remaining &= ~scan_inst->dst.writemask;
+
+            if (chans_remaining == 0)
+               break;
+         }
+
+         /* You can't read from an MRF, so if someone else reads our MRF's
+          * source GRF that we wanted to rewrite, that stops us. If it's a
+          * GRF we're trying to coalesce to, we don't actually handle
+          * rewriting sources so bail in that case as well.
+          */
+         bool interfered = false;
+         for (int i = 0; i < 3; i++) {
+            if (regions_overlap(inst->src[0], inst->size_read(0),
+                                scan_inst->src[i], scan_inst->size_read(i)))
+               interfered = true;
+         }
+         if (interfered)
+            break;
+
+         /* If somebody else writes the same channels of our destination here,
+          * we can't coalesce before that.
+          */
+         if (regions_overlap(inst->dst, inst->size_written,
+                             scan_inst->dst, scan_inst->size_written) &&
+             (inst->dst.writemask & scan_inst->dst.writemask) != 0) {
+            break;
+         }
+
+         /* Check for reads of the register we're trying to coalesce into. We
+          * can't go rewriting instructions above that to put some other value
+          * in the register instead.
+          */
+         if (to_mrf && scan_inst->mlen > 0) {
+            if (inst->dst.nr >= scan_inst->base_mrf &&
+                inst->dst.nr < scan_inst->base_mrf + scan_inst->mlen) {
+               break;
+            }
+         } else {
+            for (int i = 0; i < 3; i++) {
+               if (regions_overlap(inst->dst, inst->size_written,
+                                   scan_inst->src[i], scan_inst->size_read(i)))
+                  interfered = true;
+            }
+            if (interfered)
+               break;
+         }
+      }
+
+      if (chans_remaining == 0) {
+         /* If we've made it here, we have an MOV we want to coalesce out, and
+          * a scan_inst pointing to the earliest instruction involved in
+          * computing the value. Now go rewrite the instruction stream
+          * between the two.
+          */
+         vec4_instruction *scan_inst = _scan_inst;
+         while (scan_inst != inst) {
+            if (scan_inst->dst.file == VGRF &&
+                scan_inst->dst.nr == inst->src[0].nr &&
+                scan_inst->dst.offset == inst->src[0].offset) {
+               scan_inst->reswizzle(inst->dst.writemask,
+                                    inst->src[0].swizzle);
+               scan_inst->dst.file = inst->dst.file;
+               scan_inst->dst.nr = inst->dst.nr;
+               scan_inst->dst.offset = inst->dst.offset;
+               if (inst->saturate &&
+                   inst->dst.type != scan_inst->dst.type) {
+                  /* If we have reached this point, scan_inst is a non
+                   * type-converting 'mov' and we can modify its register types
+                   * to match the ones in inst. Otherwise, we could have an
+                   * incorrect saturation result.
+                   */
+                  scan_inst->dst.type = inst->dst.type;
+                  scan_inst->src[0].type = inst->src[0].type;
+               }
+               scan_inst->saturate |= inst->saturate;
+            }
+            scan_inst = (vec4_instruction *)scan_inst->next;
+         }
+         inst->remove(block);
+         progress = true;
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+/**
+ * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
+ * flow. We could probably do better here with some form of divergence
+ * analysis.
+ *
+ * Returns true when at least one instruction was turned into a MOV of 0.
+ */
+bool
+vec4_visitor::eliminate_find_live_channel()
+{
+   bool progress = false;
+   unsigned depth = 0; /* IF/DO nesting depth at the current instruction */
+
+   if (!brw_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
+      /* The optimization below assumes that channel zero is live on thread
+       * dispatch, which may not be the case if the fixed function dispatches
+       * threads sparsely.
+       */
+      return false;
+   }
+
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      switch (inst->opcode) {
+      case BRW_OPCODE_IF:
+      case BRW_OPCODE_DO:
+         depth++;
+         break;
+
+      case BRW_OPCODE_ENDIF:
+      case BRW_OPCODE_WHILE:
+         depth--;
+         break;
+
+      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
+         if (depth == 0) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0] = brw_imm_d(0);
+            inst->force_writemask_all = true;
+            progress = true;
+         }
+         break;
+
+      default:
+         break;
+      }
+   }
+
+   return progress;
+}
+
+/**
+ * Splits virtual GRFs requesting more than one contiguous physical register.
+ *
+ * We initially create large virtual GRFs for temporary structures, arrays,
+ * and matrices, so that the visitor functions can add offsets to work their
+ * way down to the actual member being accessed. But when it comes to
+ * optimization, we'd like to treat each register as individual storage if
+ * possible.
+ *
+ * So far, the only thing that might prevent splitting is a send message from
+ * a GRF on IVB.
+ */
+void
+vec4_visitor::split_virtual_grfs()
+{
+   int num_vars = this->alloc.count;
+   int new_virtual_grf[num_vars]; /* first newly allocated VGRF per split reg */
+   bool split_grf[num_vars];
+
+   memset(new_virtual_grf, 0, sizeof(new_virtual_grf));
+
+   /* Try to split every VGRF whose size is not exactly one register. */
+   for (int i = 0; i < num_vars; i++) {
+      split_grf[i] = this->alloc.sizes[i] != 1;
+   }
+
+   /* Check that the instructions are compatible with the registers we're trying
+    * to split.
+    *
+    * Any VGRF that is written or read more than one register at a time is
+    * left unsplit.
+    */
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      if (inst->dst.file == VGRF && regs_written(inst) > 1)
+         split_grf[inst->dst.nr] = false;
+
+      for (int i = 0; i < 3; i++) {
+         if (inst->src[i].file == VGRF && regs_read(inst, i) > 1)
+            split_grf[inst->src[i].nr] = false;
+      }
+   }
+
+   /* Allocate new space for split regs. Note that the virtual
+    * numbers will be contiguous.
+    *
+    * The original VGRF keeps register 0; registers 1..size-1 move to the
+    * newly allocated numbers.
+    */
+   for (int i = 0; i < num_vars; i++) {
+      if (!split_grf[i])
+         continue;
+
+      new_virtual_grf[i] = alloc.allocate(1);
+      for (unsigned j = 2; j < this->alloc.sizes[i]; j++) {
+         unsigned reg = alloc.allocate(1);
+         assert(reg == new_virtual_grf[i] + j - 1);
+         (void) reg;
+      }
+      this->alloc.sizes[i] = 1;
+   }
+   /* Renumber every access that lands beyond register 0 of a split VGRF. */
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      if (inst->dst.file == VGRF && split_grf[inst->dst.nr] &&
+          inst->dst.offset / REG_SIZE != 0) {
+         inst->dst.nr = (new_virtual_grf[inst->dst.nr] +
+                         inst->dst.offset / REG_SIZE - 1);
+         inst->dst.offset %= REG_SIZE;
+      }
+      for (int i = 0; i < 3; i++) {
+         if (inst->src[i].file == VGRF && split_grf[inst->src[i].nr] &&
+             inst->src[i].offset / REG_SIZE != 0) {
+            inst->src[i].nr = (new_virtual_grf[inst->src[i].nr] +
+                               inst->src[i].offset / REG_SIZE - 1);
+            inst->src[i].offset %= REG_SIZE;
+         }
+      }
+   }
+   invalidate_live_intervals();
+}
+/** Dump \p be_inst to stderr. */
+void
+vec4_visitor::dump_instruction(backend_instruction *be_inst)
+{
+   dump_instruction(be_inst, stderr);
+}
+/**
+ * Print a one-line textual form of \p be_inst to \p file.
+ *
+ * The line holds, in order: the predicate (if any), the opcode name with the
+ * execution size, ".sat" and the conditional modifier, the destination
+ * (register, offset, writemask, type), up to three sources (printing stops
+ * at the first BAD_FILE source), then the NoMask / group annotations.
+ */
+void
+vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
+{
+   vec4_instruction *inst = (vec4_instruction *)be_inst;
+
+   if (inst->predicate) {
+      fprintf(file, "(%cf0.%d%s) ",
+              inst->predicate_inverse ? '-' : '+',
+              inst->flag_subreg,
+              pred_ctrl_align16[inst->predicate]);
+   }
+
+   fprintf(file, "%s(%d)", brw_instruction_name(devinfo, inst->opcode),
+           inst->exec_size);
+   if (inst->saturate)
+      fprintf(file, ".sat");
+   if (inst->conditional_mod) {
+      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
+      if (!inst->predicate &&
+          (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
+                                inst->opcode != BRW_OPCODE_IF &&
+                                inst->opcode != BRW_OPCODE_WHILE))) {
+         fprintf(file, ".f0.%d", inst->flag_subreg);
+      }
+   }
+   fprintf(file, " ");
+   /* Destination register. */
+   switch (inst->dst.file) {
+   case VGRF:
+      fprintf(file, "vgrf%d", inst->dst.nr);
+      break;
+   case FIXED_GRF:
+      fprintf(file, "g%d", inst->dst.nr);
+      break;
+   case MRF:
+      fprintf(file, "m%d", inst->dst.nr);
+      break;
+   case ARF:
+      switch (inst->dst.nr) {
+      case BRW_ARF_NULL:
+         fprintf(file, "null");
+         break;
+      case BRW_ARF_ADDRESS:
+         fprintf(file, "a0.%d", inst->dst.subnr);
+         break;
+      case BRW_ARF_ACCUMULATOR:
+         fprintf(file, "acc%d", inst->dst.subnr);
+         break;
+      case BRW_ARF_FLAG:
+         fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
+         break;
+      default:
+         fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
+         break;
+      }
+      break;
+   case BAD_FILE:
+      fprintf(file, "(null)");
+      break;
+   case IMM:
+   case ATTR:
+   case UNIFORM:
+      unreachable("not reached");
+   }
+   if (inst->dst.offset ||
+       (inst->dst.file == VGRF &&
+        alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
+      const unsigned reg_size = (inst->dst.file == UNIFORM ? 16 : REG_SIZE);
+      fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
+              inst->dst.offset % reg_size);
+   }
+   if (inst->dst.writemask != WRITEMASK_XYZW) {
+      fprintf(file, ".");
+      if (inst->dst.writemask & 1)
+         fprintf(file, "x");
+      if (inst->dst.writemask & 2)
+         fprintf(file, "y");
+      if (inst->dst.writemask & 4)
+         fprintf(file, "z");
+      if (inst->dst.writemask & 8)
+         fprintf(file, "w");
+   }
+   fprintf(file, ":%s", brw_reg_type_letters(inst->dst.type));
+
+   if (inst->src[0].file != BAD_FILE)
+      fprintf(file, ", ");
+   /* Sources: modifiers, register, offset, swizzle, type. */
+   for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
+      if (inst->src[i].negate)
+         fprintf(file, "-");
+      if (inst->src[i].abs)
+         fprintf(file, "|");
+      switch (inst->src[i].file) {
+      case VGRF:
+         fprintf(file, "vgrf%d", inst->src[i].nr);
+         break;
+      case FIXED_GRF:
+         fprintf(file, "g%d.%d", inst->src[i].nr, inst->src[i].subnr);
+         break;
+      case ATTR:
+         fprintf(file, "attr%d", inst->src[i].nr);
+         break;
+      case UNIFORM:
+         fprintf(file, "u%d", inst->src[i].nr);
+         break;
+      case IMM:
+         switch (inst->src[i].type) {
+         case BRW_REGISTER_TYPE_F:
+            fprintf(file, "%fF", inst->src[i].f);
+            break;
+         case BRW_REGISTER_TYPE_DF:
+            fprintf(file, "%fDF", inst->src[i].df);
+            break;
+         case BRW_REGISTER_TYPE_D:
+            fprintf(file, "%dD", inst->src[i].d);
+            break;
+         case BRW_REGISTER_TYPE_UD:
+            fprintf(file, "%uU", inst->src[i].ud);
+            break;
+         case BRW_REGISTER_TYPE_VF:
+            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
+                    brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),
+                    brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),
+                    brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
+                    brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
+            break;
+         default:
+            fprintf(file, "???");
+            break;
+         }
+         break;
+      case ARF:
+         switch (inst->src[i].nr) {
+         case BRW_ARF_NULL:
+            fprintf(file, "null");
+            break;
+         case BRW_ARF_ADDRESS:
+            fprintf(file, "a0.%d", inst->src[i].subnr);
+            break;
+         case BRW_ARF_ACCUMULATOR:
+            fprintf(file, "acc%d", inst->src[i].subnr);
+            break;
+         case BRW_ARF_FLAG:
+            fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
+            break;
+         default:
+            fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
+            break;
+         }
+         break;
+      case BAD_FILE:
+         fprintf(file, "(null)");
+         break;
+      case MRF:
+         unreachable("not reached");
+      }
+
+      if (inst->src[i].offset ||
+          (inst->src[i].file == VGRF &&
+           alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
+         const unsigned reg_size = (inst->src[i].file == UNIFORM ? 16 : REG_SIZE);
+         fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
+                 inst->src[i].offset % reg_size);
+      }
+
+      if (inst->src[i].file != IMM) {
+         static const char *chans[4] = {"x", "y", "z", "w"};
+         fprintf(file, ".");
+         for (int c = 0; c < 4; c++) {
+            fprintf(file, "%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
+         }
+      }
+
+      if (inst->src[i].abs)
+         fprintf(file, "|");
+
+      if (inst->src[i].file != IMM) {
+         fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
+      }
+
+      if (i < 2 && inst->src[i + 1].file != BAD_FILE)
+         fprintf(file, ", ");
+   }
+
+   if (inst->force_writemask_all)
+      fprintf(file, " NoMask");
+
+   if (inst->exec_size != 8)
+      fprintf(file, " group%d", inst->group);
+
+   fprintf(file, "\n");
+}
+
+/**
+ * Build the brw_reg used to read attribute slot \p attr with type \p type.
+ *
+ * The region width is REG_SIZE / 2 / MAX2(4, type_sz(type)). When
+ * \p interleaved is set the register is GRF attr / 2 at sub-register
+ * (attr % 2) * 4 with a <0; width, 1> region; otherwise it is GRF \p attr
+ * at sub-register 0.
+ */
+static inline struct brw_reg
+attribute_to_hw_reg(int attr, brw_reg_type type, bool interleaved)
+{
+   struct brw_reg reg;
+
+   unsigned width = REG_SIZE / 2 / MAX2(4, type_sz(type));
+   if (interleaved) {
+      reg = stride(brw_vecn_grf(width, attr / 2, (attr % 2) * 4), 0, width, 1);
+   } else {
+      reg = brw_vecn_grf(width, attr, 0);
+   }
+
+   reg.type = type;
+   return reg;
+}
+
+
+/**
+ * Replace each register of type ATTR in this->instructions with a reference
+ * to a fixed HW register.
+ *
+ * If interleaved is true, then each attribute takes up half a register, with
+ * register N containing attribute 2*N in its first half and attribute 2*N+1
+ * in its second half (this corresponds to the payload setup used by geometry
+ * shaders in "single" or "dual instanced" dispatch mode).
If interleaved is
+ * false, then each attribute takes up a whole register, with register N
+ * containing attribute N (this corresponds to the payload setup used by
+ * vertex shaders, and by geometry shaders in "dual object" dispatch mode).
+ */
+void
+vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
+                                          bool interleaved)
+{
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      for (int i = 0; i < 3; i++) {
+         if (inst->src[i].file != ATTR)
+            continue;
+
+         /* The map is indexed by attribute number plus whole-register offset. */
+         int grf = attribute_map[inst->src[i].nr +
+                                 inst->src[i].offset / REG_SIZE];
+         assert(inst->src[i].offset % REG_SIZE == 0);
+
+         /* All attributes used in the shader need to have been assigned a
+          * hardware register by the caller
+          */
+         assert(grf != 0);
+
+         /* Carry the swizzle and the abs/negate modifiers over. */
+         struct brw_reg reg =
+            attribute_to_hw_reg(grf, inst->src[i].type, interleaved);
+         reg.swizzle = inst->src[i].swizzle;
+         if (inst->src[i].abs)
+            reg = brw_abs(reg);
+         if (inst->src[i].negate)
+            reg = negate(reg);
+
+         inst->src[i] = reg;
+      }
+   }
+}
+
+/**
+ * Assign payload registers, starting at \p payload_reg, to the vertex
+ * attributes read by the shader and lower all ATTR sources to them.
+ *
+ * Inputs are taken from inputs_read lowest bit first; an input flagged in
+ * double_inputs_read takes two consecutive slots.
+ *
+ * \return payload_reg + vs_prog_data->nr_attribute_slots.
+ */
+int
+vec4_vs_visitor::setup_attributes(int payload_reg)
+{
+   int nr_attributes;
+   int attribute_map[VERT_ATTRIB_MAX + 2];
+   memset(attribute_map, 0, sizeof(attribute_map));
+
+   nr_attributes = 0;
+   GLbitfield64 vs_inputs = vs_prog_data->inputs_read;
+   while (vs_inputs) {
+      GLuint first = ffsll(vs_inputs) - 1;
+      int needed_slots =
+         (vs_prog_data->double_inputs_read & BITFIELD64_BIT(first)) ? 2 : 1;
+      for (int c = 0; c < needed_slots; c++) {
+         attribute_map[first + c] = payload_reg + nr_attributes;
+         nr_attributes++;
+         vs_inputs &= ~BITFIELD64_BIT(first + c);
+      }
+   }
+
+   /* VertexID is stored by the VF as the last vertex element, but we
+    * don't represent it with a flag in inputs_read, so we call it
+    * VERT_ATTRIB_MAX.
+    */
+   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid ||
+       vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance) {
+      attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
+      nr_attributes++;
+   }
+
+   /* gl_DrawID gets the slot after that, at VERT_ATTRIB_MAX + 1. */
+   if (vs_prog_data->uses_drawid) {
+      attribute_map[VERT_ATTRIB_MAX + 1] = payload_reg + nr_attributes;
+      nr_attributes++;
+   }
+
+   lower_attributes_to_hw_regs(attribute_map, false /* interleaved */);
+
+   return payload_reg + vs_prog_data->nr_attribute_slots;
+}
+
+/**
+ * Reserve the push-constant registers starting at \p reg.
+ *
+ * Records \p reg as dispatch_grf_start_reg, then sets nr_params and
+ * curb_read_length from the number of uniforms (two vec4 uniforms per
+ * register).
+ *
+ * \return the first register after the push constants.
+ */
+int
+vec4_visitor::setup_uniforms(int reg)
+{
+   prog_data->base.dispatch_grf_start_reg = reg;
+
+   /* The pre-gen6 VS requires that some push constants get loaded no
+    * matter what, or the GPU would hang.
+    */
+   if (devinfo->gen < 6 && this->uniforms == 0) {
+      stage_prog_data->param =
+         reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4);
+      for (unsigned int i = 0; i < 4; i++) {
+         unsigned int slot = this->uniforms * 4 + i;
+         static gl_constant_value zero = { 0.0 };
+         stage_prog_data->param[slot] = &zero;
+      }
+
+      this->uniforms++;
+      reg++;
+   } else {
+      reg += ALIGN(uniforms, 2) / 2;
+   }
+
+   stage_prog_data->nr_params = this->uniforms * 4;
+
+   prog_data->base.curb_read_length =
+      reg - prog_data->base.dispatch_grf_start_reg;
+
+   return reg;
+}
+
+/**
+ * Lay out the thread payload: g0, then the push constants, then the vertex
+ * attributes. The first register after them is stored in
+ * first_non_payload_grf.
+ */
+void
+vec4_vs_visitor::setup_payload(void)
+{
+   int reg = 0;
+
+   /* The payload always contains important data in g0, which contains
+    * the URB handles that are passed on to the URB write at the end
+    * of the thread. So, we always start push constants at g1.
+    */
+   reg++;
+
+   reg = setup_uniforms(reg);
+
+   reg = setup_attributes(reg);
+
+   this->first_non_payload_grf = reg;
+}
+
+/**
+ * Rewrite every unpredicated SEL as a CMP followed by a predicated SEL.
+ *
+ * The CMP is built before the SEL from its two sources and its conditional
+ * modifier; the SEL then becomes normally predicated and loses the modifier.
+ * Only valid before gen6 (asserted).
+ *
+ * \return true when at least one SEL was rewritten.
+ */
+bool
+vec4_visitor::lower_minmax()
+{
+   assert(devinfo->gen < 6);
+
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      const vec4_builder ibld(this, block, inst);
+
+      if (inst->opcode == BRW_OPCODE_SEL &&
+          inst->predicate == BRW_PREDICATE_NONE) {
+         /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of
+          * the original SEL.L/GE instruction
+          */
+         ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
+                  inst->conditional_mod);
+         inst->predicate = BRW_PREDICATE_NORMAL;
+         inst->conditional_mod = BRW_CONDITIONAL_NONE;
+
+         progress = true;
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+/**
+ * Emit a MOV of the ARF timestamp register into a new uvec4 temporary and
+ * return that temporary as a source. Requires gen7 or later (asserted).
+ */
+src_reg
+vec4_visitor::get_timestamp()
+{
+   assert(devinfo->gen >= 7);
+
+   src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+                                BRW_ARF_TIMESTAMP,
+                                0,
+                                0,
+                                0,
+                                BRW_REGISTER_TYPE_UD,
+                                BRW_VERTICAL_STRIDE_0,
+                                BRW_WIDTH_4,
+                                BRW_HORIZONTAL_STRIDE_4,
+                                BRW_SWIZZLE_XYZW,
+                                WRITEMASK_XYZW));
+
+   dst_reg dst = dst_reg(this, glsl_type::uvec4_type);
+
+   vec4_instruction *mov = emit(MOV(dst, ts));
+   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
+    * even if it's not enabled in the dispatch.
+    */
+   mov->force_writemask_all = true;
+
+   return src_reg(dst);
+}
+
+/** Record the timestamp at the start of the shader in shader_start_time. */
+void
+vec4_visitor::emit_shader_time_begin()
+{
+   current_annotation = "shader time start";
+   shader_start_time = get_timestamp();
+}
+
+/**
+ * Read the timestamp again and emit the shader-time writes.
+ *
+ * When the reset test passes, the elapsed cycles (minus 2) go to subindex 0
+ * and a count of 1 to subindex 1; otherwise a count of 1 goes to subindex 2.
+ */
+void
+vec4_visitor::emit_shader_time_end()
+{
+   current_annotation = "shader time end";
+   src_reg shader_end_time = get_timestamp();
+
+
+   /* Check that there weren't any timestamp reset events (assuming these
+    * were the only two timestamp reads that happened).
+    */
+   src_reg reset_end = shader_end_time;
+   reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
+   vec4_instruction *test = emit(AND(dst_null_ud(), reset_end, brw_imm_ud(1u)));
+   test->conditional_mod = BRW_CONDITIONAL_Z;
+
+   emit(IF(BRW_PREDICATE_NORMAL));
+
+   /* Take the current timestamp and get the delta. */
+   shader_start_time.negate = true;
+   dst_reg diff = dst_reg(this, glsl_type::uint_type);
+   emit(ADD(diff, shader_start_time, shader_end_time));
+
+   /* If there were no instructions between the two timestamp gets, the diff
+    * is 2 cycles. Remove that overhead, so I can forget about that when
+    * trying to determine the time taken for single instructions.
+    */
+   emit(ADD(diff, src_reg(diff), brw_imm_ud(-2u)));
+
+   emit_shader_time_write(0, src_reg(diff));
+   emit_shader_time_write(1, brw_imm_ud(1u));
+   emit(BRW_OPCODE_ELSE);
+   emit_shader_time_write(2, brw_imm_ud(1u));
+   emit(BRW_OPCODE_ENDIF);
+}
+
+/**
+ * Emit a SHADER_TIME_ADD of \p value for the given sub-counter.
+ *
+ * The two-register payload holds the byte offset
+ * (shader_time_index * 3 + shader_time_subindex) * BRW_SHADER_TIME_STRIDE
+ * in its first register and \p value in its second.
+ */
+void
+vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value)
+{
+   dst_reg dst =
+      dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2));
+
+   dst_reg offset = dst;
+   dst_reg time = dst;
+   time.offset += REG_SIZE;
+
+   offset.type = BRW_REGISTER_TYPE_UD;
+   int index = shader_time_index * 3 + shader_time_subindex;
+   emit(MOV(offset, brw_imm_d(index * BRW_SHADER_TIME_STRIDE)));
+
+   time.type = BRW_REGISTER_TYPE_UD;
+   emit(MOV(time, value));
+
+   vec4_instruction *inst =
+      emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst));
+   inst->mlen = 2;
+}
+
+/**
+ * Replace the sources and the destination of every instruction with
+ * hardware brw_reg descriptions.
+ *
+ * ARF and IMM sources, and FIXED_GRF sources that are not 8 bytes wide, are
+ * left as they are. MRF and ATTR sources must not reach this point.
+ */
+void
+vec4_visitor::convert_to_hw_regs()
+{
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      for (int i = 0; i < 3; i++) {
+         struct src_reg &src = inst->src[i];
+         struct brw_reg reg;
+         switch (src.file) {
+         case VGRF: {
+            const unsigned type_size = type_sz(src.type);
+            const unsigned width = REG_SIZE / 2 / MAX2(4, type_size);
+            reg = byte_offset(brw_vecn_grf(width, src.nr, 0), src.offset);
+            reg.type = src.type;
+            reg.abs = src.abs;
+            reg.negate = src.negate;
+            break;
+         }
+
+         case UNIFORM: {
+            /* Two vec4 uniforms share one register, starting at
+             * dispatch_grf_start_reg.
+             */
+            const unsigned width = REG_SIZE / 2 / MAX2(4, type_sz(src.type));
+            reg = stride(byte_offset(brw_vec4_grf(
+                                        prog_data->base.dispatch_grf_start_reg +
+                                        src.nr / 2, src.nr % 2 * 4),
+                                     src.offset),
+                         0, width, 1);
+            reg.type = src.type;
+            reg.abs = src.abs;
+            reg.negate = src.negate;
+
+            /* This should have been moved to pull constants. */
+            assert(!src.reladdr);
+            break;
+         }
+
+         case FIXED_GRF:
+            if (type_sz(src.type) == 8) {
+               reg = src.as_brw_reg();
+               break;
+            }
+            /* fallthrough */
+         case ARF:
+         case IMM:
+            continue;
+
+         case BAD_FILE:
+            /* Probably unused. */
+            reg = brw_null_reg();
+            break;
+
+         case MRF:
+         case ATTR:
+            unreachable("not reached");
+         }
+
+         /* Pass the address of the local so the swizzle is applied to it
+          * before it replaces the source.
+          */
+         apply_logical_swizzle(&reg, inst, i);
+         src = reg;
+      }
+
+      if (inst->is_3src(devinfo)) {
+         /* 3-src instructions with scalar sources support arbitrary subnr,
+          * but don't actually use swizzles. Convert swizzle into subnr.
+          * Skip this for double-precision instructions: RepCtrl=1 is not
+          * allowed for them and needs special handling.
+          */
+         for (int i = 0; i < 3; i++) {
+            if (inst->src[i].vstride == BRW_VERTICAL_STRIDE_0 &&
+                type_sz(inst->src[i].type) < 8) {
+               assert(brw_is_single_value_swizzle(inst->src[i].swizzle));
+               inst->src[i].subnr += 4 * BRW_GET_SWZ(inst->src[i].swizzle, 0);
+            }
+         }
+      }
+
+      dst_reg &dst = inst->dst;
+      struct brw_reg reg;
+
+      switch (inst->dst.file) {
+      case VGRF:
+         reg = byte_offset(brw_vec8_grf(dst.nr, 0), dst.offset);
+         reg.type = dst.type;
+         reg.writemask = dst.writemask;
+         break;
+
+      case MRF:
+         reg = byte_offset(brw_message_reg(dst.nr), dst.offset);
+         assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
+         reg.type = dst.type;
+         reg.writemask = dst.writemask;
+         break;
+
+      case ARF:
+      case FIXED_GRF:
+         reg = dst.as_brw_reg();
+         break;
+
+      case BAD_FILE:
+         reg = brw_null_reg();
+         break;
+
+      case IMM:
+      case ATTR:
+      case UNIFORM:
+         unreachable("not reached");
+      }
+
+      dst = reg;
+   }
+}
+
+/**
+ * Returns true for tessellation evaluation shaders, and for geometry shaders
+ * unless \p dispatch_mode is DISPATCH_MODE_4X2_DUAL_OBJECT; false for every
+ * other stage.
+ */
+static bool
+stage_uses_interleaved_attributes(unsigned stage,
+                                  enum shader_dispatch_mode dispatch_mode)
+{
+   switch (stage) {
+   case MESA_SHADER_TESS_EVAL:
+      return true;
+   case MESA_SHADER_GEOMETRY:
+      return dispatch_mode != DISPATCH_MODE_4X2_DUAL_OBJECT;
+   default:
+      return false;
+   }
+}
+
+/**
+ * Get the closest native SIMD width supported by the hardware for instruction
+ * \p inst. The instruction will be left untouched by
+ * vec4_visitor::lower_simd_width() if the returned value matches the
+ * instruction's original execution size.
+ */ +static unsigned +get_lowered_simd_width(const struct gen_device_info *devinfo, + enum shader_dispatch_mode dispatch_mode, + unsigned stage, const vec4_instruction *inst) +{ + /* Do not split some instructions that require special handling */ + switch (inst->opcode) { + case SHADER_OPCODE_GEN4_SCRATCH_READ: + case SHADER_OPCODE_GEN4_SCRATCH_WRITE: + return inst->exec_size; + default: + break; + } + + unsigned lowered_width = MIN2(16, inst->exec_size); + + /* We need to split some cases of double-precision instructions that write + * 2 registers. We only need to care about this in gen7 because that is the + * only hardware that implements fp64 in Align16. + */ + if (devinfo->gen == 7 && inst->size_written > REG_SIZE) { + /* Align16 8-wide double-precision SEL does not work well. Verified + * empirically. + */ + if (inst->opcode == BRW_OPCODE_SEL && type_sz(inst->dst.type) == 8) + lowered_width = MIN2(lowered_width, 4); + + /* HSW PRM, 3D Media GPGPU Engine, Region Alignment Rules for Direct + * Register Addressing: + * + * "When destination spans two registers, the source MUST span two + * registers." + */ + for (unsigned i = 0; i < 3; i++) { + if (inst->src[i].file == BAD_FILE) + continue; + if (inst->size_read(i) <= REG_SIZE) + lowered_width = MIN2(lowered_width, 4); + + /* Interleaved attribute setups use a vertical stride of 0, which + * makes them hit the associated instruction decompression bug in gen7. + * Split them to prevent this. 
+ */ + if (inst->src[i].file == ATTR && + stage_uses_interleaved_attributes(stage, dispatch_mode)) + lowered_width = MIN2(lowered_width, 4); + } + } + + return lowered_width; +} + +static bool +dst_src_regions_overlap(vec4_instruction *inst) +{ + if (inst->size_written == 0) + return false; + + unsigned dst_start = inst->dst.offset; + unsigned dst_end = dst_start + inst->size_written - 1; + for (int i = 0; i < 3; i++) { + if (inst->src[i].file == BAD_FILE) + continue; + + if (inst->dst.file != inst->src[i].file || + inst->dst.nr != inst->src[i].nr) + continue; + + unsigned src_start = inst->src[i].offset; + unsigned src_end = src_start + inst->size_read(i) - 1; + + if ((dst_start >= src_start && dst_start <= src_end) || + (dst_end >= src_start && dst_end <= src_end) || + (dst_start <= src_start && dst_end >= src_end)) { + return true; + } + } + + return false; +} + +bool +vec4_visitor::lower_simd_width() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { + const unsigned lowered_width = + get_lowered_simd_width(devinfo, prog_data->dispatch_mode, stage, inst); + assert(lowered_width <= inst->exec_size); + if (lowered_width == inst->exec_size) + continue; + + /* We need to deal with source / destination overlaps when splitting. + * The hardware supports reading from and writing to the same register + * in the same instruction, but we need to be careful that each split + * instruction we produce does not corrupt the source of the next. + * + * The easiest way to handle this is to make the split instructions write + * to temporaries if there is an src/dst overlap and then move from the + * temporaries to the original destination. We also need to consider + * instructions that do partial writes via align1 opcodes, in which case + * we need to make sure that the we initialize the temporary with the + * value of the instruction's dst. 
+ */ + bool needs_temp = dst_src_regions_overlap(inst); + for (unsigned n = 0; n < inst->exec_size / lowered_width; n++) { + unsigned channel_offset = lowered_width * n; + + unsigned size_written = lowered_width * type_sz(inst->dst.type); + + /* Create the split instruction from the original so that we copy all + * relevant instruction fields, then set the width and calculate the + * new dst/src regions. + */ + vec4_instruction *linst = new(mem_ctx) vec4_instruction(*inst); + linst->exec_size = lowered_width; + linst->group = channel_offset; + linst->size_written = size_written; + + /* Compute split dst region */ + dst_reg dst; + if (needs_temp) { + unsigned num_regs = DIV_ROUND_UP(size_written, REG_SIZE); + dst = retype(dst_reg(VGRF, alloc.allocate(num_regs)), + inst->dst.type); + if (inst->is_align1_partial_write()) { + vec4_instruction *copy = MOV(dst, src_reg(inst->dst)); + copy->exec_size = lowered_width; + copy->group = channel_offset; + copy->size_written = size_written; + inst->insert_before(block, copy); + } + } else { + dst = horiz_offset(inst->dst, channel_offset); + } + linst->dst = dst; + + /* Compute split source regions */ + for (int i = 0; i < 3; i++) { + if (linst->src[i].file == BAD_FILE) + continue; + + if (!is_uniform(linst->src[i])) + linst->src[i] = horiz_offset(linst->src[i], channel_offset); + } + + inst->insert_before(block, linst); + + /* If we used a temporary to store the result of the split + * instruction, copy the result to the original destination + */ + if (needs_temp) { + vec4_instruction *mov = + MOV(offset(inst->dst, lowered_width, n), src_reg(dst)); + mov->exec_size = lowered_width; + mov->group = channel_offset; + mov->size_written = size_written; + mov->predicate = inst->predicate; + inst->insert_before(block, mov); + } + } + + inst->remove(block); + progress = true; + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + +static bool +is_align1_df(vec4_instruction *inst) +{ + switch (inst->opcode) { + 
case VEC4_OPCODE_FROM_DOUBLE: + case VEC4_OPCODE_TO_DOUBLE: + case VEC4_OPCODE_PICK_LOW_32BIT: + case VEC4_OPCODE_PICK_HIGH_32BIT: + case VEC4_OPCODE_SET_LOW_32BIT: + case VEC4_OPCODE_SET_HIGH_32BIT: + return true; + default: + return false; + } +} + +static brw_predicate +scalarize_predicate(brw_predicate predicate, unsigned writemask) +{ + if (predicate != BRW_PREDICATE_NORMAL) + return predicate; + + switch (writemask) { + case WRITEMASK_X: + return BRW_PREDICATE_ALIGN16_REPLICATE_X; + case WRITEMASK_Y: + return BRW_PREDICATE_ALIGN16_REPLICATE_Y; + case WRITEMASK_Z: + return BRW_PREDICATE_ALIGN16_REPLICATE_Z; + case WRITEMASK_W: + return BRW_PREDICATE_ALIGN16_REPLICATE_W; + default: + unreachable("invalid writemask"); + } +} + +/* Gen7 has a hardware decompression bug that we can exploit to represent + * handful of additional swizzles natively. + */ +static bool +is_gen7_supported_64bit_swizzle(vec4_instruction *inst, unsigned arg) +{ + switch (inst->src[arg].swizzle) { + case BRW_SWIZZLE_XXXX: + case BRW_SWIZZLE_YYYY: + case BRW_SWIZZLE_ZZZZ: + case BRW_SWIZZLE_WWWW: + case BRW_SWIZZLE_XYXY: + case BRW_SWIZZLE_YXYX: + case BRW_SWIZZLE_ZWZW: + case BRW_SWIZZLE_WZWZ: + return true; + default: + return false; + } +} + +/* 64-bit sources use regions with a width of 2. These 2 elements in each row + * can be addressed using 32-bit swizzles (which is what the hardware supports) + * but it also means that the swizzle we apply on the first two components of a + * dvec4 is coupled with the swizzle we use for the last 2. In other words, + * only some specific swizzle combinations can be natively supported. + * + * FIXME: we can go an step further and implement even more swizzle + * variations using only partial scalarization. 
+ * + * For more details see: + * https://bugs.freedesktop.org/show_bug.cgi?id=92760#c82 + */ +bool +vec4_visitor::is_supported_64bit_region(vec4_instruction *inst, unsigned arg) +{ + const src_reg &src = inst->src[arg]; + assert(type_sz(src.type) == 8); + + /* Uniform regions have a vstride=0. Because we use 2-wide rows with + * 64-bit regions it means that we cannot access components Z/W, so + * return false for any such case. Interleaved attributes will also be + * mapped to GRF registers with a vstride of 0, so apply the same + * treatment. + */ + if ((is_uniform(src) || + (stage_uses_interleaved_attributes(stage, prog_data->dispatch_mode) && + src.file == ATTR)) && + (brw_mask_for_swizzle(src.swizzle) & 12)) + return false; + + switch (src.swizzle) { + case BRW_SWIZZLE_XYZW: + case BRW_SWIZZLE_XXZZ: + case BRW_SWIZZLE_YYWW: + case BRW_SWIZZLE_YXWZ: + return true; + default: + return devinfo->gen == 7 && is_gen7_supported_64bit_swizzle(inst, arg); + } +} + +bool +vec4_visitor::scalarize_df() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { + /* Skip DF instructions that operate in Align1 mode */ + if (is_align1_df(inst)) + continue; + + /* Check if this is a double-precision instruction */ + bool is_double = type_sz(inst->dst.type) == 8; + for (int arg = 0; !is_double && arg < 3; arg++) { + is_double = inst->src[arg].file != BAD_FILE && + type_sz(inst->src[arg].type) == 8; + } + + if (!is_double) + continue; + + /* Skip the lowering for specific regioning scenarios that we can + * support natively. + */ + bool skip_lowering = true; + + /* XY and ZW writemasks operate in 32-bit, which means that they don't + * have a native 64-bit representation and they should always be split. 
+ */ + if (inst->dst.writemask == WRITEMASK_XY || + inst->dst.writemask == WRITEMASK_ZW) { + skip_lowering = false; + } else { + for (unsigned i = 0; i < 3; i++) { + if (inst->src[i].file == BAD_FILE || type_sz(inst->src[i].type) < 8) + continue; + skip_lowering = skip_lowering && is_supported_64bit_region(inst, i); + } + } + + if (skip_lowering) + continue; + + /* Generate scalar instructions for each enabled channel */ + for (unsigned chan = 0; chan < 4; chan++) { + unsigned chan_mask = 1 << chan; + if (!(inst->dst.writemask & chan_mask)) + continue; + + vec4_instruction *scalar_inst = new(mem_ctx) vec4_instruction(*inst); + + for (unsigned i = 0; i < 3; i++) { + unsigned swz = BRW_GET_SWZ(inst->src[i].swizzle, chan); + scalar_inst->src[i].swizzle = BRW_SWIZZLE4(swz, swz, swz, swz); + } + + scalar_inst->dst.writemask = chan_mask; + + if (inst->predicate != BRW_PREDICATE_NONE) { + scalar_inst->predicate = + scalarize_predicate(inst->predicate, chan_mask); + } + + inst->insert_before(block, scalar_inst); + } + + inst->remove(block); + progress = true; + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + +bool +vec4_visitor::lower_64bit_mad_to_mul_add() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { + if (inst->opcode != BRW_OPCODE_MAD) + continue; + + if (type_sz(inst->dst.type) != 8) + continue; + + dst_reg mul_dst = dst_reg(this, glsl_type::dvec4_type); + + /* Use the copy constructor so we copy all relevant instruction fields + * from the original mad into the add and mul instructions + */ + vec4_instruction *mul = new(mem_ctx) vec4_instruction(*inst); + mul->opcode = BRW_OPCODE_MUL; + mul->dst = mul_dst; + mul->src[0] = inst->src[1]; + mul->src[1] = inst->src[2]; + mul->src[2].file = BAD_FILE; + + vec4_instruction *add = new(mem_ctx) vec4_instruction(*inst); + add->opcode = BRW_OPCODE_ADD; + add->src[0] = src_reg(mul_dst); + add->src[1] = inst->src[0]; + add->src[2].file = BAD_FILE; + 
+ inst->insert_before(block, mul); + inst->insert_before(block, add); + inst->remove(block); + + progress = true; + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + +/* The align16 hardware can only do 32-bit swizzle channels, so we need to + * translate the logical 64-bit swizzle channels that we use in the Vec4 IR + * to 32-bit swizzle channels in hardware registers. + * + * @inst and @arg identify the original vec4 IR source operand we need to + * translate the swizzle for and @hw_reg is the hardware register where we + * will write the hardware swizzle to use. + * + * This pass assumes that Align16/DF instructions have been fully scalarized + * previously so there is just one 64-bit swizzle channel to deal with for any + * given Vec4 IR source. + */ +void +vec4_visitor::apply_logical_swizzle(struct brw_reg *hw_reg, + vec4_instruction *inst, int arg) +{ + src_reg reg = inst->src[arg]; + + if (reg.file == BAD_FILE || reg.file == BRW_IMMEDIATE_VALUE) + return; + + /* If this is not a 64-bit operand or this is a scalar instruction we don't + * need to do anything about the swizzles. + */ + if(type_sz(reg.type) < 8 || is_align1_df(inst)) { + hw_reg->swizzle = reg.swizzle; + return; + } + + /* Take the 64-bit logical swizzle channel and translate it to 32-bit */ + assert(brw_is_single_value_swizzle(reg.swizzle) || + is_supported_64bit_region(inst, arg)); + + if (is_supported_64bit_region(inst, arg) && + !is_gen7_supported_64bit_swizzle(inst, arg)) { + /* Supported 64-bit swizzles are those such that their first two + * components, when expanded to 32-bit swizzles, match the semantics + * of the original 64-bit swizzle with 2-wide row regioning. + */ + unsigned swizzle0 = BRW_GET_SWZ(reg.swizzle, 0); + unsigned swizzle1 = BRW_GET_SWZ(reg.swizzle, 1); + hw_reg->swizzle = BRW_SWIZZLE4(swizzle0 * 2, swizzle0 * 2 + 1, + swizzle1 * 2, swizzle1 * 2 + 1); + } else { + /* If we got here then we have one of the following: + * + * 1. 
An unsupported swizzle, which should be single-value thanks to the + * scalarization pass. + * + * 2. A gen7 supported swizzle. These can be single-value or double-value + * swizzles. If the latter, they are never cross-dvec2 channels. For + * these we always need to activate the gen7 vstride=0 exploit. + */ + unsigned swizzle0 = BRW_GET_SWZ(reg.swizzle, 0); + unsigned swizzle1 = BRW_GET_SWZ(reg.swizzle, 1); + assert((swizzle0 < 2) == (swizzle1 < 2)); + + /* To gain access to Z/W components we need to select the second half + * of the register and then use a X/Y swizzle to select Z/W respectively. + */ + if (swizzle0 >= 2) { + *hw_reg = suboffset(*hw_reg, 2); + swizzle0 -= 2; + swizzle1 -= 2; + } + + /* All gen7-specific supported swizzles require the vstride=0 exploit */ + if (devinfo->gen == 7 && is_gen7_supported_64bit_swizzle(inst, arg)) + hw_reg->vstride = BRW_VERTICAL_STRIDE_0; + + /* Any 64-bit source with an offset at 16B is intended to address the + * second half of a register and needs a vertical stride of 0 so we: + * + * 1. Don't violate register region restrictions. + * 2. Activate the gen7 instruction decompresion bug exploit when + * execsize > 4 + */ + if (hw_reg->subnr % REG_SIZE == 16) { + assert(devinfo->gen == 7); + hw_reg->vstride = BRW_VERTICAL_STRIDE_0; + } + + hw_reg->swizzle = BRW_SWIZZLE4(swizzle0 * 2, swizzle0 * 2 + 1, + swizzle1 * 2, swizzle1 * 2 + 1); + } +} + +bool +vec4_visitor::run() +{ + if (shader_time_index >= 0) + emit_shader_time_begin(); + + emit_prolog(); + + emit_nir_code(); + if (failed) + return false; + base_ir = NULL; + + emit_thread_end(); + + calculate_cfg(); + + /* Before any optimization, push array accesses out to scratch + * space where we need them to be. This pass may allocate new + * virtual GRFs, so we want to do it early. It also makes sure + * that we have reladdr computations available for CSE, since we'll + * often do repeated subexpressions for those. 
+ */ + move_grf_array_access_to_scratch(); + move_uniform_array_access_to_pull_constants(); + + pack_uniform_registers(); + move_push_constants_to_pull_constants(); + split_virtual_grfs(); + +#define OPT(pass, args...) ({ \ + pass_num++; \ + bool this_progress = pass(args); \ + \ + if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \ + char filename[64]; \ + snprintf(filename, 64, "%s-%s-%02d-%02d-" #pass, \ + stage_abbrev, nir->info->name, iteration, pass_num); \ + \ + backend_shader::dump_instructions(filename); \ + } \ + \ + progress = progress || this_progress; \ + this_progress; \ + }) + + + if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) { + char filename[64]; + snprintf(filename, 64, "%s-%s-00-00-start", + stage_abbrev, nir->info->name); + + backend_shader::dump_instructions(filename); + } + + bool progress; + int iteration = 0; + int pass_num = 0; + do { + progress = false; + pass_num = 0; + iteration++; + + OPT(opt_predicated_break, this); + OPT(opt_reduce_swizzle); + OPT(dead_code_eliminate); + OPT(dead_control_flow_eliminate, this); + OPT(opt_copy_propagation); + OPT(opt_cmod_propagation); + OPT(opt_cse); + OPT(opt_algebraic); + OPT(opt_register_coalesce); + OPT(eliminate_find_live_channel); + } while (progress); + + pass_num = 0; + + if (OPT(opt_vector_float)) { + OPT(opt_cse); + OPT(opt_copy_propagation, false); + OPT(opt_copy_propagation, true); + OPT(dead_code_eliminate); + } + + if (devinfo->gen <= 5 && OPT(lower_minmax)) { + OPT(opt_cmod_propagation); + OPT(opt_cse); + OPT(opt_copy_propagation); + OPT(dead_code_eliminate); + } + + if (OPT(lower_simd_width)) { + OPT(opt_copy_propagation); + OPT(dead_code_eliminate); + } + + if (failed) + return false; + + OPT(lower_64bit_mad_to_mul_add); + + /* Run this before payload setup because tesselation shaders + * rely on it to prevent cross dvec2 regioning on DF attributes + * that are setup so that XY are on the second half of register and + * ZW are in the first half of the next. 
+ */ + OPT(scalarize_df); + + setup_payload(); + + if (unlikely(INTEL_DEBUG & DEBUG_SPILL_VEC4)) { + /* Debug of register spilling: Go spill everything. */ + const int grf_count = alloc.count; + float spill_costs[alloc.count]; + bool no_spill[alloc.count]; + evaluate_spill_costs(spill_costs, no_spill); + for (int i = 0; i < grf_count; i++) { + if (no_spill[i]) + continue; + spill_reg(i); + } + + /* We want to run this after spilling because 64-bit (un)spills need to + * emit code to shuffle 64-bit data for the 32-bit scratch read/write + * messages that can produce unsupported 64-bit swizzle regions. + */ + OPT(scalarize_df); + } + + bool allocated_without_spills = reg_allocate(); + + if (!allocated_without_spills) { + compiler->shader_perf_log(log_data, + "%s shader triggered register spilling. " + "Try reducing the number of live vec4 values " + "to improve performance.\n", + stage_name); + + while (!reg_allocate()) { + if (failed) + return false; + } + + /* We want to run this after spilling because 64-bit (un)spills need to + * emit code to shuffle 64-bit data for the 32-bit scratch read/write + * messages that can produce unsupported 64-bit swizzle regions. + */ + OPT(scalarize_df); + } + + opt_schedule_instructions(); + + opt_set_dependency_control(); + + convert_to_hw_regs(); + + if (last_scratch > 0) { + prog_data->base.total_scratch = + brw_get_scratch_size(last_scratch * REG_SIZE); + } + + return !failed; +} + +} /* namespace brw */ + +extern "C" { + +/** + * Compile a vertex shader. + * + * Returns the final assembly and the program's size. 
+ */ +const unsigned * +brw_compile_vs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_vs_prog_key *key, + struct brw_vs_prog_data *prog_data, + const nir_shader *src_shader, + gl_clip_plane *clip_planes, + bool use_legacy_snorm_formula, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str) +{ + const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX]; + nir_shader *shader = nir_shader_clone(mem_ctx, src_shader); + shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, is_scalar); + brw_nir_lower_vs_inputs(shader, is_scalar, + use_legacy_snorm_formula, key->gl_attrib_wa_flags); + brw_nir_lower_vue_outputs(shader, is_scalar); + shader = brw_postprocess_nir(shader, compiler, is_scalar); + + const unsigned *assembly = NULL; + + prog_data->base.clip_distance_mask = + ((1 << shader->info->clip_distance_array_size) - 1); + prog_data->base.cull_distance_mask = + ((1 << shader->info->cull_distance_array_size) - 1) << + shader->info->clip_distance_array_size; + + unsigned nr_attribute_slots = _mesa_bitcount_64(prog_data->inputs_read); + + /* gl_VertexID and gl_InstanceID are system values, but arrive via an + * incoming vertex attribute. So, add an extra slot. + */ + if (shader->info->system_values_read & + (BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX) | + BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) | + BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) | + BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) { + nr_attribute_slots++; + } + + /* gl_DrawID has its very own vec4 */ + if (shader->info->system_values_read & + BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID)) { + nr_attribute_slots++; + } + + unsigned nr_attributes = nr_attribute_slots - + DIV_ROUND_UP(_mesa_bitcount_64(shader->info->double_inputs_read), 2); + + /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry + * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. 
Empirically, in + * vec4 mode, the hardware appears to wedge unless we read something. + */ + if (is_scalar) + prog_data->base.urb_read_length = + DIV_ROUND_UP(nr_attribute_slots, 2); + else + prog_data->base.urb_read_length = + DIV_ROUND_UP(MAX2(nr_attribute_slots, 1), 2); + + prog_data->nr_attributes = nr_attributes; + prog_data->nr_attribute_slots = nr_attribute_slots; + + /* Since vertex shaders reuse the same VUE entry for inputs and outputs + * (overwriting the original contents), we need to make sure the size is + * the larger of the two. + */ + const unsigned vue_entries = + MAX2(nr_attribute_slots, (unsigned)prog_data->base.vue_map.num_slots); + + if (compiler->devinfo->gen == 6) + prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8); + else + prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4); + + if (INTEL_DEBUG & DEBUG_VS) { + fprintf(stderr, "VS Output "); + brw_print_vue_map(stderr, &prog_data->base.vue_map); + } + + if (is_scalar) { + prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8; + + fs_visitor v(compiler, log_data, mem_ctx, key, &prog_data->base.base, + NULL, /* prog; Only used for TEXTURE_RECTANGLE on gen < 8 */ + shader, 8, shader_time_index); + if (!v.run_vs(clip_planes)) { + if (error_str) + *error_str = ralloc_strdup(mem_ctx, v.fail_msg); + + return NULL; + } + + prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs; + + fs_generator g(compiler, log_data, mem_ctx, (void *) key, + &prog_data->base.base, v.promoted_constants, + v.runtime_check_aads_emit, MESA_SHADER_VERTEX); + if (INTEL_DEBUG & DEBUG_VS) { + const char *debug_name = + ralloc_asprintf(mem_ctx, "%s vertex shader %s", + shader->info->label ? 
shader->info->label : + "unnamed", + shader->info->name); + + g.enable_debug(debug_name); + } + g.generate_code(v.cfg, 8); + assembly = g.get_assembly(final_assembly_size); + } + + if (!assembly) { + prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT; + + vec4_vs_visitor v(compiler, log_data, key, prog_data, + shader, clip_planes, mem_ctx, + shader_time_index, use_legacy_snorm_formula); + if (!v.run()) { + if (error_str) + *error_str = ralloc_strdup(mem_ctx, v.fail_msg); + + return NULL; + } + + assembly = brw_vec4_generate_assembly(compiler, log_data, mem_ctx, + shader, &prog_data->base, v.cfg, + final_assembly_size); + } + + return assembly; +} + +} /* extern "C" */ diff --git a/src/intel/compiler/brw_vec4.h b/src/intel/compiler/brw_vec4.h new file mode 100644 index 00000000000..a84048d8c6a --- /dev/null +++ b/src/intel/compiler/brw_vec4.h @@ -0,0 +1,399 @@ +/* + * Copyright © 2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
 * IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_VEC4_H
#define BRW_VEC4_H

#include "brw_shader.h"

#ifdef __cplusplus
#include "brw_ir_vec4.h"
#endif

#include "compiler/glsl/ir.h"
#include "compiler/nir/nir.h"


#ifdef __cplusplus
extern "C" {
#endif

/* C-linkage entry point that turns a vec4 CFG into assembly; the size of the
 * result is written to \p out_assembly_size.
 */
const unsigned *
brw_vec4_generate_assembly(const struct brw_compiler *compiler,
                           void *log_data,
                           void *mem_ctx,
                           const nir_shader *nir,
                           struct brw_vue_prog_data *prog_data,
                           const struct cfg_t *cfg,
                           unsigned *out_assembly_size);

#ifdef __cplusplus
} /* extern "C" */

namespace brw {

class vec4_live_variables;

/**
 * The vertex shader front-end.
 *
 * Translates either GLSL IR or Mesa IR (for ARB_vertex_program and
 * fixed-function) into VS IR.
 *
 * NOTE(review): the description above looks stale; the constructor takes a
 * nir_shader and the class declares emit_nir_code() / nir_emit_*() entry
 * points, so presumably the input is NIR now. Confirm and update.
 */
class vec4_visitor : public backend_shader
{
public:
   vec4_visitor(const struct brw_compiler *compiler,
                void *log_data,
                const struct brw_sampler_prog_key_data *key,
                struct brw_vue_prog_data *prog_data,
                const nir_shader *shader,
                void *mem_ctx,
                bool no_spills,
                int shader_time_index);
   virtual ~vec4_visitor();

   /* Null destination registers, one per register type. */
   dst_reg dst_null_f()
   {
      return dst_reg(brw_null_reg());
   }

   dst_reg dst_null_df()
   {
      return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
   }

   dst_reg dst_null_d()
   {
      return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   }

   dst_reg dst_null_ud()
   {
      return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
   }

   const struct brw_sampler_prog_key_data * const key_tex;
   struct brw_vue_prog_data * const prog_data;
   char *fail_msg;
   bool failed;

   /**
    * GLSL IR currently being processed, which is associated with our
    * driver IR instructions for debugging purposes.
    */
   const void *base_ir;
   const char *current_annotation;

   int first_non_payload_grf;
   unsigned int max_grf;
   int *virtual_grf_start;
   int *virtual_grf_end;
   brw::vec4_live_variables *live_intervals;
   dst_reg userplane[MAX_CLIP_PLANES];

   bool need_all_constants_in_pull_buffer;

   /* Regs for vertex results.  Generated at ir_variable visiting time
    * for the ir->location's used.
    */
   dst_reg output_reg[VARYING_SLOT_TESS_MAX][4];
   unsigned output_num_components[VARYING_SLOT_TESS_MAX][4];
   const char *output_reg_annotation[VARYING_SLOT_TESS_MAX];
   int uniforms;

   src_reg shader_start_time;

   /* Top-level compile driver; returns false on failure. */
   bool run();
   void fail(const char *msg, ...);

   int setup_uniforms(int payload_reg);

   /* Register allocation, spilling and the optimization passes. */
   bool reg_allocate_trivial();
   bool reg_allocate();
   void evaluate_spill_costs(float *spill_costs, bool *no_spill);
   int choose_spill_reg(struct ra_graph *g);
   void spill_reg(int spill_reg);
   void move_grf_array_access_to_scratch();
   void move_uniform_array_access_to_pull_constants();
   void move_push_constants_to_pull_constants();
   void split_uniform_registers();
   void pack_uniform_registers();
   void calculate_live_intervals();
   void invalidate_live_intervals();
   void split_virtual_grfs();
   bool opt_vector_float();
   bool opt_reduce_swizzle();
   bool dead_code_eliminate();
   int var_range_start(unsigned v, unsigned n) const;
   int var_range_end(unsigned v, unsigned n) const;
   bool virtual_grf_interferes(int a, int b);
   bool opt_cmod_propagation();
   bool opt_copy_propagation(bool do_constant_prop = true);
   bool opt_cse_local(bblock_t *block);
   bool opt_cse();
   bool opt_algebraic();
   bool opt_register_coalesce();
   bool eliminate_find_live_channel();
   bool is_dep_ctrl_unsafe(const vec4_instruction *inst);
   void opt_set_dependency_control();
   void opt_schedule_instructions();
   void convert_to_hw_regs();

   /* 64-bit (double-precision) lowering helpers. */
   bool is_supported_64bit_region(vec4_instruction *inst, unsigned arg);
   bool lower_simd_width();
   bool scalarize_df();
   bool lower_64bit_mad_to_mul_add();
   void apply_logical_swizzle(struct brw_reg *hw_reg,
                              vec4_instruction *inst, int arg);

   vec4_instruction *emit(vec4_instruction *inst);

   vec4_instruction *emit(enum opcode opcode);
   vec4_instruction *emit(enum opcode opcode, const dst_reg &dst);
   vec4_instruction *emit(enum opcode opcode, const dst_reg &dst,
                          const src_reg &src0);
   vec4_instruction *emit(enum opcode opcode, const dst_reg &dst,
                          const src_reg &src0, const src_reg &src1);
   vec4_instruction *emit(enum opcode opcode, const dst_reg &dst,
                          const src_reg &src0, const src_reg &src1,
                          const src_reg &src2);

   vec4_instruction *emit_before(bblock_t *block,
                                 vec4_instruction *inst,
                                 vec4_instruction *new_inst);

   /* EMITn(op) declares a builder named after the opcode that takes a
    * destination and n sources and returns a vec4_instruction pointer.
    */
#define EMIT1(op) vec4_instruction *op(const dst_reg &, const src_reg &);
#define EMIT2(op) vec4_instruction *op(const dst_reg &, const src_reg &, const src_reg &);
#define EMIT3(op) vec4_instruction *op(const dst_reg &, const src_reg &, const src_reg &, const src_reg &);
   EMIT1(MOV)
   EMIT1(NOT)
   EMIT1(RNDD)
   EMIT1(RNDE)
   EMIT1(RNDZ)
   EMIT1(FRC)
   EMIT1(F32TO16)
   EMIT1(F16TO32)
   EMIT2(ADD)
   EMIT2(MUL)
   EMIT2(MACH)
   EMIT2(MAC)
   EMIT2(AND)
   EMIT2(OR)
   EMIT2(XOR)
   EMIT2(DP3)
   EMIT2(DP4)
   EMIT2(DPH)
   EMIT2(SHL)
   EMIT2(SHR)
   EMIT2(ASR)
   vec4_instruction *CMP(dst_reg dst, src_reg src0, src_reg src1,
                         enum brw_conditional_mod condition);
   vec4_instruction *IF(src_reg src0, src_reg src1,
                        enum brw_conditional_mod condition);
   vec4_instruction *IF(enum brw_predicate predicate);
   EMIT1(SCRATCH_READ)
   EMIT2(SCRATCH_WRITE)
   EMIT3(LRP)
   EMIT1(BFREV)
   EMIT3(BFE)
   EMIT2(BFI1)
   EMIT3(BFI2)
   EMIT1(FBH)
   EMIT1(FBL)
   EMIT1(CBIT)
   EMIT3(MAD)
   EMIT2(ADDC)
   EMIT2(SUBB)
   EMIT1(DIM)

#undef EMIT1
#undef EMIT2
#undef EMIT3

   int implied_mrf_writes(vec4_instruction *inst);

   vec4_instruction *emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
                                 src_reg src0, src_reg src1);

   vec4_instruction *emit_lrp(const dst_reg &dst, const src_reg &x,
                              const src_reg &y, const src_reg &a);

   /**
    * Copy any live channel from \p src to the first channel of the
    * result.
    */
   src_reg emit_uniformize(const src_reg &src);

   src_reg fix_3src_operand(const src_reg &src);
   src_reg resolve_source_modifiers(const src_reg &src);

   vec4_instruction *emit_math(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                               const src_reg &src1 = src_reg());

   src_reg fix_math_operand(const src_reg &src);

   void emit_pack_half_2x16(dst_reg dst, src_reg src0);
   void emit_unpack_half_2x16(dst_reg dst, src_reg src0);
   void emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0);
   void emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0);
   void emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0);
   void emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0);

   void emit_texture(ir_texture_opcode op,
                     dst_reg dest,
                     const glsl_type *dest_type,
                     src_reg coordinate,
                     int coord_components,
                     src_reg shadow_comparator,
                     src_reg lod, src_reg lod2,
                     src_reg sample_index,
                     uint32_t constant_offset,
                     src_reg offset_value,
                     src_reg mcs,
                     uint32_t surface, src_reg surface_reg,
                     src_reg sampler_reg);

   src_reg emit_mcs_fetch(const glsl_type *coordinate_type, src_reg coordinate,
                          src_reg surface);
   void emit_gen6_gather_wa(uint8_t wa, dst_reg dst);

   void emit_ndc_computation();
   void emit_psiz_and_flags(dst_reg reg);
   vec4_instruction *emit_generic_urb_slot(dst_reg reg, int varying, int comp);
   virtual void emit_urb_slot(dst_reg reg, int varying);

   void emit_shader_time_begin();
   void emit_shader_time_end();
   void emit_shader_time_write(int shader_time_subindex, src_reg value);

   src_reg get_scratch_offset(bblock_t *block, vec4_instruction *inst,
                              src_reg *reladdr, int reg_offset);
   void emit_scratch_read(bblock_t *block, vec4_instruction *inst,
                          dst_reg dst,
                          src_reg orig_src,
                          int base_offset);
   void emit_scratch_write(bblock_t *block, vec4_instruction *inst,
                           int base_offset);
   void emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
                                dst_reg dst,
                                src_reg orig_src,
                                int base_offset,
                                src_reg indirect);
   void emit_pull_constant_load_reg(dst_reg dst,
                                    src_reg surf_index,
                                    src_reg offset,
                                    bblock_t *before_block,
                                    vec4_instruction *before_inst);
   src_reg emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
                                vec4_instruction *inst, src_reg src);

   void resolve_ud_negate(src_reg *reg);

   bool lower_minmax();

   src_reg get_timestamp();

   void dump_instruction(backend_instruction *inst);
   void dump_instruction(backend_instruction *inst, FILE *file);

   bool is_high_sampler(src_reg sampler);

   bool optimize_predicate(nir_alu_instr *instr, enum brw_predicate *predicate);

   void emit_conversion_from_double(dst_reg dst, src_reg src, bool saturate,
                                    brw_reg_type single_type);
   void emit_conversion_to_double(dst_reg dst, src_reg src, bool saturate,
                                  brw_reg_type single_type);

   src_reg setup_imm_df(double v);

   vec4_instruction *shuffle_64bit_data(dst_reg dst, src_reg src,
                                        bool for_write,
                                        bblock_t *block = NULL,
                                        vec4_instruction *ref = NULL);

   /* NIR translation hooks; all virtual so subclasses can override them. */
   virtual void emit_nir_code();
   virtual void nir_setup_uniforms();
   virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
   virtual void nir_setup_system_values();
   virtual void nir_emit_impl(nir_function_impl *impl);
   virtual void nir_emit_cf_list(exec_list *list);
   virtual void nir_emit_if(nir_if *if_stmt);
   virtual void nir_emit_loop(nir_loop *loop);
   virtual void nir_emit_block(nir_block *block);
   virtual void nir_emit_instr(nir_instr *instr);
   virtual void nir_emit_load_const(nir_load_const_instr *instr);
   virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
   virtual void nir_emit_alu(nir_alu_instr *instr);
   virtual void nir_emit_jump(nir_jump_instr *instr);
   virtual void nir_emit_texture(nir_tex_instr *instr);
   virtual void nir_emit_undef(nir_ssa_undef_instr *instr);
   virtual void nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr);

   dst_reg get_nir_dest(const nir_dest &dest, enum brw_reg_type type);
   dst_reg get_nir_dest(const nir_dest &dest, nir_alu_type type);
   dst_reg get_nir_dest(const nir_dest &dest);
   src_reg get_nir_src(const nir_src &src, enum brw_reg_type type,
                       unsigned num_components = 4);
   src_reg get_nir_src(const nir_src &src, nir_alu_type type,
                       unsigned num_components = 4);
   src_reg get_nir_src(const nir_src &src,
                       unsigned num_components = 4);
   src_reg get_indirect_offset(nir_intrinsic_instr *instr);

   virtual dst_reg *make_reg_for_system_value(int location) = 0;

   dst_reg *nir_locals;
   dst_reg *nir_ssa_values;
   dst_reg *nir_system_values;

protected:
   void emit_vertex();
   void lower_attributes_to_hw_regs(const int *attribute_map,
                                    bool interleaved);
   void setup_payload_interference(struct ra_graph *g, int first_payload_node,
                                   int reg_node_count);
   /* Stage-specific hooks that every concrete visitor must implement. */
   virtual void setup_payload() = 0;
   virtual void emit_prolog() = 0;
   virtual void emit_thread_end() = 0;
   virtual void emit_urb_write_header(int mrf) = 0;
   virtual vec4_instruction *emit_urb_write_opcode(bool complete) = 0;
   virtual void gs_emit_vertex(int stream_id);
   virtual void gs_end_primitive();

private:
   /**
    * If true, then register allocation should fail instead of spilling.
    */
   const bool no_spills;

   int shader_time_index;

   unsigned last_scratch; /**< measured in 32-byte (register size) units */
};

} /* namespace brw */
#endif /* __cplusplus */

#endif /* BRW_VEC4_H */
diff --git a/src/intel/compiler/brw_vec4_builder.h b/src/intel/compiler/brw_vec4_builder.h
new file mode 100644
index 00000000000..4c3efe8457b
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_builder.h
@@ -0,0 +1,634 @@
/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_VEC4_BUILDER_H
#define BRW_VEC4_BUILDER_H

#include "brw_ir_vec4.h"
#include "brw_ir_allocator.h"

namespace brw {
   /**
    * Toolbox to assemble a VEC4 IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::fs_builder.
They cannot be fully interchangeable because + * brw::fs_builder generates scalar code while brw::vec4_builder generates + * vector code. + */ + class vec4_builder { + public: + /** Type used in this IR to represent a source of an instruction. */ + typedef brw::src_reg src_reg; + + /** Type used in this IR to represent the destination of an instruction. */ + typedef brw::dst_reg dst_reg; + + /** Type used in this IR to represent an instruction. */ + typedef vec4_instruction instruction; + + /** + * Construct a vec4_builder that inserts instructions into \p shader. + */ + vec4_builder(backend_shader *shader, unsigned dispatch_width = 8) : + shader(shader), block(NULL), cursor(NULL), + _dispatch_width(dispatch_width), _group(0), + force_writemask_all(false), + annotation() + { + } + + /** + * Construct a vec4_builder that inserts instructions into \p shader + * before instruction \p inst in basic block \p block. The default + * execution controls and debug annotation are initialized from the + * instruction passed as argument. + */ + vec4_builder(backend_shader *shader, bblock_t *block, instruction *inst) : + shader(shader), block(block), cursor(inst), + _dispatch_width(inst->exec_size), _group(inst->group), + force_writemask_all(inst->force_writemask_all) + { + annotation.str = inst->annotation; + annotation.ir = inst->ir; + } + + /** + * Construct a vec4_builder that inserts instructions before \p cursor + * in basic block \p block, inheriting other code generation parameters + * from this. + */ + vec4_builder + at(bblock_t *block, exec_node *cursor) const + { + vec4_builder bld = *this; + bld.block = block; + bld.cursor = cursor; + return bld; + } + + /** + * Construct a vec4_builder appending instructions at the end of the + * instruction list of the shader, inheriting other code generation + * parameters from this. 
+ */ + vec4_builder + at_end() const + { + return at(NULL, (exec_node *)&shader->instructions.tail_sentinel); + } + + /** + * Construct a builder specifying the default SIMD width and group of + * channel enable signals, inheriting other code generation parameters + * from this. + * + * \p n gives the default SIMD width, \p i gives the slot group used for + * predication and control flow masking in multiples of \p n channels. + */ + vec4_builder + group(unsigned n, unsigned i) const + { + assert(force_writemask_all || + (n <= dispatch_width() && i < dispatch_width() / n)); + vec4_builder bld = *this; + bld._dispatch_width = n; + bld._group += i * n; + return bld; + } + + /** + * Construct a builder with per-channel control flow execution masking + * disabled if \p b is true. If control flow execution masking is + * already disabled this has no effect. + */ + vec4_builder + exec_all(bool b = true) const + { + vec4_builder bld = *this; + if (b) + bld.force_writemask_all = true; + return bld; + } + + /** + * Construct a builder with the given debug annotation info. + */ + vec4_builder + annotate(const char *str, const void *ir = NULL) const + { + vec4_builder bld = *this; + bld.annotation.str = str; + bld.annotation.ir = ir; + return bld; + } + + /** + * Get the SIMD width in use. + */ + unsigned + dispatch_width() const + { + return _dispatch_width; + } + + /** + * Get the channel group in use. + */ + unsigned + group() const + { + return _group; + } + + /** + * Allocate a virtual register of natural vector size (four for this IR) + * and SIMD width. \p n gives the amount of space to allocate in + * dispatch_width units (which is just enough space for four logical + * components in this IR). 
+ */ + dst_reg + vgrf(enum brw_reg_type type, unsigned n = 1) const + { + assert(dispatch_width() <= 32); + + if (n > 0) + return retype(dst_reg(VGRF, shader->alloc.allocate( + n * DIV_ROUND_UP(type_sz(type), 4))), + type); + else + return retype(null_reg_ud(), type); + } + + /** + * Create a null register of floating type. + */ + dst_reg + null_reg_f() const + { + return dst_reg(retype(brw_null_vec(dispatch_width()), + BRW_REGISTER_TYPE_F)); + } + + /** + * Create a null register of signed integer type. + */ + dst_reg + null_reg_d() const + { + return dst_reg(retype(brw_null_vec(dispatch_width()), + BRW_REGISTER_TYPE_D)); + } + + /** + * Create a null register of unsigned integer type. + */ + dst_reg + null_reg_ud() const + { + return dst_reg(retype(brw_null_vec(dispatch_width()), + BRW_REGISTER_TYPE_UD)); + } + + /** + * Insert an instruction into the program. + */ + instruction * + emit(const instruction &inst) const + { + return emit(new(shader->mem_ctx) instruction(inst)); + } + + /** + * Create and insert a nullary control instruction into the program. + */ + instruction * + emit(enum opcode opcode) const + { + return emit(instruction(opcode)); + } + + /** + * Create and insert a nullary instruction into the program. + */ + instruction * + emit(enum opcode opcode, const dst_reg &dst) const + { + return emit(instruction(opcode, dst)); + } + + /** + * Create and insert a unary instruction into the program. + */ + instruction * + emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const + { + switch (opcode) { + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + return fix_math_instruction( + emit(instruction(opcode, dst, + fix_math_operand(src0)))); + + default: + return emit(instruction(opcode, dst, src0)); + } + } + + /** + * Create and insert a binary instruction into the program. 
+ */ + instruction * + emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, + const src_reg &src1) const + { + switch (opcode) { + case SHADER_OPCODE_POW: + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + return fix_math_instruction( + emit(instruction(opcode, dst, + fix_math_operand(src0), + fix_math_operand(src1)))); + + default: + return emit(instruction(opcode, dst, src0, src1)); + } + } + + /** + * Create and insert a ternary instruction into the program. + */ + instruction * + emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, + const src_reg &src1, const src_reg &src2) const + { + switch (opcode) { + case BRW_OPCODE_BFE: + case BRW_OPCODE_BFI2: + case BRW_OPCODE_MAD: + case BRW_OPCODE_LRP: + return emit(instruction(opcode, dst, + fix_3src_operand(src0), + fix_3src_operand(src1), + fix_3src_operand(src2))); + + default: + return emit(instruction(opcode, dst, src0, src1, src2)); + } + } + + /** + * Insert a preallocated instruction into the program. + */ + instruction * + emit(instruction *inst) const + { + inst->exec_size = dispatch_width(); + inst->group = group(); + inst->force_writemask_all = force_writemask_all; + inst->size_written = inst->exec_size * type_sz(inst->dst.type); + inst->annotation = annotation.str; + inst->ir = annotation.ir; + + if (block) + static_cast<instruction *>(cursor)->insert_before(block, inst); + else + cursor->insert_before(inst); + + return inst; + } + + /** + * Select \p src0 if the comparison of both sources with the given + * conditional mod evaluates to true, otherwise select \p src1. + * + * Generally useful to get the minimum or maximum of two values. 
+ */ + instruction * + emit_minmax(const dst_reg &dst, const src_reg &src0, + const src_reg &src1, brw_conditional_mod mod) const + { + assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L); + + return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0), + fix_unsigned_negate(src1))); + } + + /** + * Copy any live channel from \p src to the first channel of the result. + */ + src_reg + emit_uniformize(const src_reg &src) const + { + const vec4_builder ubld = exec_all(); + const dst_reg chan_index = + writemask(vgrf(BRW_REGISTER_TYPE_UD), WRITEMASK_X); + const dst_reg dst = vgrf(src.type); + + ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index); + ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, src_reg(chan_index)); + + return src_reg(dst); + } + + /** + * Assorted arithmetic ops. + * @{ + */ +#define ALU1(op) \ + instruction * \ + op(const dst_reg &dst, const src_reg &src0) const \ + { \ + return emit(BRW_OPCODE_##op, dst, src0); \ + } + +#define ALU2(op) \ + instruction * \ + op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \ + { \ + return emit(BRW_OPCODE_##op, dst, src0, src1); \ + } + +#define ALU2_ACC(op) \ + instruction * \ + op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \ + { \ + instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \ + inst->writes_accumulator = true; \ + return inst; \ + } + +#define ALU3(op) \ + instruction * \ + op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \ + const src_reg &src2) const \ + { \ + return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \ + } + + ALU2(ADD) + ALU2_ACC(ADDC) + ALU2(AND) + ALU2(ASR) + ALU2(AVG) + ALU3(BFE) + ALU2(BFI1) + ALU3(BFI2) + ALU1(BFREV) + ALU1(CBIT) + ALU2(CMPN) + ALU3(CSEL) + ALU1(DIM) + ALU2(DP2) + ALU2(DP3) + ALU2(DP4) + ALU2(DPH) + ALU1(F16TO32) + ALU1(F32TO16) + ALU1(FBH) + ALU1(FBL) + ALU1(FRC) + ALU2(LINE) + ALU1(LZD) + ALU2(MAC) + ALU2_ACC(MACH) + ALU3(MAD) + ALU1(MOV) + ALU2(MUL) + ALU1(NOT) + ALU2(OR) + ALU2(PLN) 
+ ALU1(RNDD) + ALU1(RNDE) + ALU1(RNDU) + ALU1(RNDZ) + ALU2(SAD2) + ALU2_ACC(SADA2) + ALU2(SEL) + ALU2(SHL) + ALU2(SHR) + ALU2_ACC(SUBB) + ALU2(XOR) + +#undef ALU3 +#undef ALU2_ACC +#undef ALU2 +#undef ALU1 + /** @} */ + + /** + * CMP: Sets the low bit of the destination channels with the result + * of the comparison, while the upper bits are undefined, and updates + * the flag register with the packed 16 bits of the result. + */ + instruction * + CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1, + brw_conditional_mod condition) const + { + /* Take the instruction: + * + * CMP null<d> src0<f> src1<f> + * + * Original gen4 does type conversion to the destination type + * before comparison, producing garbage results for floating + * point comparisons. + * + * The destination type doesn't matter on newer generations, + * so we set the type to match src0 so we can compact the + * instruction. + */ + return set_condmod(condition, + emit(BRW_OPCODE_CMP, retype(dst, src0.type), + fix_unsigned_negate(src0), + fix_unsigned_negate(src1))); + } + + /** + * Gen4 predicated IF. + */ + instruction * + IF(brw_predicate predicate) const + { + return set_predicate(predicate, emit(BRW_OPCODE_IF)); + } + + /** + * Gen6 IF with embedded comparison. + */ + instruction * + IF(const src_reg &src0, const src_reg &src1, + brw_conditional_mod condition) const + { + assert(shader->devinfo->gen == 6); + return set_condmod(condition, + emit(BRW_OPCODE_IF, + null_reg_d(), + fix_unsigned_negate(src0), + fix_unsigned_negate(src1))); + } + + /** + * Emit a linear interpolation instruction. + */ + instruction * + LRP(const dst_reg &dst, const src_reg &x, const src_reg &y, + const src_reg &a) const + { + if (shader->devinfo->gen >= 6) { + /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so + * we need to reorder the operands. + */ + return emit(BRW_OPCODE_LRP, dst, a, y, x); + + } else { + /* We can't use the LRP instruction. Emit x*(1-a) + y*a. 
*/ + const dst_reg y_times_a = vgrf(dst.type); + const dst_reg one_minus_a = vgrf(dst.type); + const dst_reg x_times_one_minus_a = vgrf(dst.type); + + MUL(y_times_a, y, a); + ADD(one_minus_a, negate(a), brw_imm_f(1.0f)); + MUL(x_times_one_minus_a, x, src_reg(one_minus_a)); + return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)); + } + } + + backend_shader *shader; + + protected: + /** + * Workaround for negation of UD registers. See comment in + * fs_generator::generate_code() for the details. + */ + src_reg + fix_unsigned_negate(const src_reg &src) const + { + if (src.type == BRW_REGISTER_TYPE_UD && src.negate) { + dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD); + MOV(temp, src); + return src_reg(temp); + } else { + return src; + } + } + + /** + * Workaround for register access modes not supported by the ternary + * instruction encoding. + */ + src_reg + fix_3src_operand(const src_reg &src) const + { + /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be + * able to use vertical stride of zero to replicate the vec4 uniform, like + * + * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7] + * + * But you can't, since vertical stride is always four in three-source + * instructions. Instead, insert a MOV instruction to do the replication so + * that the three-source instruction can consume it. + */ + + /* The MOV is only needed if the source is a uniform or immediate. */ + if (src.file != UNIFORM && src.file != IMM) + return src; + + if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle)) + return src; + + const dst_reg expanded = vgrf(src.type); + emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src); + return src_reg(expanded); + } + + /** + * Workaround for register access modes not supported by the math + * instruction. + */ + src_reg + fix_math_operand(const src_reg &src) const + { + /* The gen6 math instruction ignores the source modifiers -- + * swizzle, abs, negate, and at least some parts of the register + * region description. 
+ * + * Rather than trying to enumerate all these cases, *always* expand the + * operand to a temp GRF for gen6. + * + * For gen7, keep the operand as-is, except if immediate, which gen7 still + * can't use. + */ + if (shader->devinfo->gen == 6 || + (shader->devinfo->gen == 7 && src.file == IMM)) { + const dst_reg tmp = vgrf(src.type); + MOV(tmp, src); + return src_reg(tmp); + } else { + return src; + } + } + + /** + * Workaround other weirdness of the math instruction. + */ + instruction * + fix_math_instruction(instruction *inst) const + { + if (shader->devinfo->gen == 6 && + inst->dst.writemask != WRITEMASK_XYZW) { + const dst_reg tmp = vgrf(inst->dst.type); + MOV(inst->dst, src_reg(tmp)); + inst->dst = tmp; + + } else if (shader->devinfo->gen < 6) { + const unsigned sources = (inst->src[1].file == BAD_FILE ? 1 : 2); + inst->base_mrf = 1; + inst->mlen = sources; + } + + return inst; + } + + bblock_t *block; + exec_node *cursor; + + unsigned _dispatch_width; + unsigned _group; + bool force_writemask_all; + + /** Debug annotation info. 
*/ + struct { + const char *str; + const void *ir; + } annotation; + }; +} + +#endif diff --git a/src/intel/compiler/brw_vec4_cmod_propagation.cpp b/src/intel/compiler/brw_vec4_cmod_propagation.cpp new file mode 100644 index 00000000000..4454cdbfc94 --- /dev/null +++ b/src/intel/compiler/brw_vec4_cmod_propagation.cpp @@ -0,0 +1,172 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +/** @file brw_vec4_cmod_propagation.cpp + * + * Really similar to brw_fs_cmod_propagation but adapted to vec4 needs. Check + * brw_fs_cmod_propagation for further details on the rationale behind this + * optimization. 
+ */ + +#include "brw_vec4.h" +#include "brw_cfg.h" +#include "brw_eu.h" + +namespace brw { + +static bool +opt_cmod_propagation_local(bblock_t *block) +{ + bool progress = false; + int ip = block->end_ip + 1; + + foreach_inst_in_block_reverse_safe(vec4_instruction, inst, block) { + ip--; + + if ((inst->opcode != BRW_OPCODE_AND && + inst->opcode != BRW_OPCODE_CMP && + inst->opcode != BRW_OPCODE_MOV) || + inst->predicate != BRW_PREDICATE_NONE || + !inst->dst.is_null() || + inst->src[0].file != VGRF || + inst->src[0].abs) + continue; + + if (inst->opcode == BRW_OPCODE_AND && + !(inst->src[1].is_one() && + inst->conditional_mod == BRW_CONDITIONAL_NZ && + !inst->src[0].negate)) + continue; + + if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) + continue; + + if (inst->opcode == BRW_OPCODE_MOV && + inst->conditional_mod != BRW_CONDITIONAL_NZ) + continue; + + bool read_flag = false; + foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst, inst) { + if (regions_overlap(inst->src[0], inst->size_read(0), + scan_inst->dst, scan_inst->size_written)) { + if ((scan_inst->predicate && scan_inst->opcode != BRW_OPCODE_SEL) || + scan_inst->dst.offset != inst->src[0].offset || + (scan_inst->dst.writemask != WRITEMASK_X && + scan_inst->dst.writemask != WRITEMASK_XYZW) || + (scan_inst->dst.writemask == WRITEMASK_XYZW && + inst->src[0].swizzle != BRW_SWIZZLE_XYZW) || + (inst->dst.writemask & ~scan_inst->dst.writemask) != 0 || + scan_inst->exec_size != inst->exec_size || + scan_inst->group != inst->group) { + break; + } + + /* CMP's result is the same regardless of dest type. */ + if (inst->conditional_mod == BRW_CONDITIONAL_NZ && + scan_inst->opcode == BRW_OPCODE_CMP && + (inst->dst.type == BRW_REGISTER_TYPE_D || + inst->dst.type == BRW_REGISTER_TYPE_UD)) { + inst->remove(block); + progress = true; + break; + } + + /* If the AND wasn't handled by the previous case, it isn't safe + * to remove it. 
+ */ + if (inst->opcode == BRW_OPCODE_AND) + break; + + /* Comparisons operate differently for ints and floats */ + if (scan_inst->dst.type != inst->dst.type && + (scan_inst->dst.type == BRW_REGISTER_TYPE_F || + inst->dst.type == BRW_REGISTER_TYPE_F)) + break; + + /* If the instruction generating inst's source also wrote the + * flag, and inst is doing a simple .nz comparison, then inst + * is redundant - the appropriate value is already in the flag + * register. Delete inst. + */ + if (inst->conditional_mod == BRW_CONDITIONAL_NZ && + !inst->src[0].negate && + scan_inst->writes_flag()) { + inst->remove(block); + progress = true; + break; + } + + /* The conditional mod of the CMP/CMPN instructions behaves + * specially because the flag output is not calculated from the + * result of the instruction, but the other way around, which + * means that even if the condmod to propagate and the condmod + * from the CMP instruction are the same they will in general give + * different results because they are evaluated based on different + * inputs. + */ + if (scan_inst->opcode == BRW_OPCODE_CMP || + scan_inst->opcode == BRW_OPCODE_CMPN) + break; + + /* Otherwise, try propagating the conditional. */ + enum brw_conditional_mod cond = + inst->src[0].negate ? 
brw_swap_cmod(inst->conditional_mod) + : inst->conditional_mod; + + if (scan_inst->can_do_cmod() && + ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) || + scan_inst->conditional_mod == cond)) { + scan_inst->conditional_mod = cond; + inst->remove(block); + progress = true; + } + break; + } + + if (scan_inst->writes_flag()) + break; + + read_flag = read_flag || scan_inst->reads_flag(); + } + } + + return progress; +} + +bool +vec4_visitor::opt_cmod_propagation() +{ + bool progress = false; + + foreach_block_reverse(block, cfg) { + progress = opt_cmod_propagation_local(block) || progress; + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + +} /* namespace brw */ diff --git a/src/intel/compiler/brw_vec4_copy_propagation.cpp b/src/intel/compiler/brw_vec4_copy_propagation.cpp new file mode 100644 index 00000000000..e7f6f93f8bd --- /dev/null +++ b/src/intel/compiler/brw_vec4_copy_propagation.cpp @@ -0,0 +1,558 @@ +/* + * Copyright © 2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file brw_vec4_copy_propagation.cpp + * + * Implements tracking of values copied between registers, and + * optimizations based on that: copy propagation and constant + * propagation. + */ + +#include "brw_vec4.h" +#include "brw_cfg.h" +#include "brw_eu.h" + +namespace brw { + +struct copy_entry { + src_reg *value[4]; + int saturatemask; +}; + +static bool +is_direct_copy(vec4_instruction *inst) +{ + return (inst->opcode == BRW_OPCODE_MOV && + !inst->predicate && + inst->dst.file == VGRF && + inst->dst.offset % REG_SIZE == 0 && + !inst->dst.reladdr && + !inst->src[0].reladdr && + (inst->dst.type == inst->src[0].type || + (inst->dst.type == BRW_REGISTER_TYPE_F && + inst->src[0].type == BRW_REGISTER_TYPE_VF))); +} + +static bool +is_dominated_by_previous_instruction(vec4_instruction *inst) +{ + return (inst->opcode != BRW_OPCODE_DO && + inst->opcode != BRW_OPCODE_WHILE && + inst->opcode != BRW_OPCODE_ELSE && + inst->opcode != BRW_OPCODE_ENDIF); +} + +static bool +is_channel_updated(vec4_instruction *inst, src_reg *values[4], int ch) +{ + const src_reg *src = values[ch]; + + /* consider GRF only */ + assert(inst->dst.file == VGRF); + if (!src || src->file != VGRF) + return false; + + return regions_overlap(*src, REG_SIZE, inst->dst, inst->size_written) && + (inst->dst.offset != src->offset || + inst->dst.writemask & (1 << BRW_GET_SWZ(src->swizzle, ch))); +} + +static bool +is_logic_op(enum opcode opcode) +{ + return (opcode == BRW_OPCODE_AND || + opcode == BRW_OPCODE_OR || + opcode == BRW_OPCODE_XOR || + opcode == BRW_OPCODE_NOT); +} + +/** + * Get the origin of a copy as a single register if all components present in + * the given readmask originate from the same register and have 
compatible + * regions, otherwise return a BAD_FILE register. + */ +static src_reg +get_copy_value(const copy_entry &entry, unsigned readmask) +{ + unsigned swz[4] = {}; + src_reg value; + + for (unsigned i = 0; i < 4; i++) { + if (readmask & (1 << i)) { + if (entry.value[i]) { + src_reg src = *entry.value[i]; + + if (src.file == IMM) { + swz[i] = i; + } else { + swz[i] = BRW_GET_SWZ(src.swizzle, i); + /* Overwrite the original swizzle so the src_reg::equals call + * below doesn't care about it, the correct swizzle will be + * calculated once the swizzles of all components are known. + */ + src.swizzle = BRW_SWIZZLE_XYZW; + } + + if (value.file == BAD_FILE) { + value = src; + } else if (!value.equals(src)) { + return src_reg(); + } + } else { + return src_reg(); + } + } + } + + return swizzle(value, + brw_compose_swizzle(brw_swizzle_for_mask(readmask), + BRW_SWIZZLE4(swz[0], swz[1], + swz[2], swz[3]))); +} + +static bool +try_constant_propagate(const struct gen_device_info *devinfo, + vec4_instruction *inst, + int arg, const copy_entry *entry) +{ + /* For constant propagation, we only handle the same constant + * across all 4 channels. Some day, we should handle the 8-bit + * float vector format, which would let us constant propagate + * vectors better. + * We could be more aggressive here -- some channels might not get used + * based on the destination writemask. + */ + src_reg value = + get_copy_value(*entry, + brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle, + WRITEMASK_XYZW)); + + if (value.file != IMM) + return false; + + /* 64-bit types can't be used except for one-source instructions, which + * higher levels should have constant folded away, so there's no point in + * propagating immediates here. + */ + if (type_sz(value.type) == 8 || type_sz(inst->src[arg].type) == 8) + return false; + + if (value.type == BRW_REGISTER_TYPE_VF) { + /* The result of bit-casting the component values of a vector float + * cannot in general be represented as an immediate. 
+ */ + if (inst->src[arg].type != BRW_REGISTER_TYPE_F) + return false; + } else { + value.type = inst->src[arg].type; + } + + if (inst->src[arg].abs) { + if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) || + !brw_abs_immediate(value.type, &value.as_brw_reg())) { + return false; + } + } + + if (inst->src[arg].negate) { + if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) || + !brw_negate_immediate(value.type, &value.as_brw_reg())) { + return false; + } + } + + value = swizzle(value, inst->src[arg].swizzle); + + switch (inst->opcode) { + case BRW_OPCODE_MOV: + case SHADER_OPCODE_BROADCAST: + inst->src[arg] = value; + return true; + + case SHADER_OPCODE_POW: + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + if (devinfo->gen < 8) + break; + /* fallthrough */ + case BRW_OPCODE_DP2: + case BRW_OPCODE_DP3: + case BRW_OPCODE_DP4: + case BRW_OPCODE_DPH: + case BRW_OPCODE_BFI1: + case BRW_OPCODE_ASR: + case BRW_OPCODE_SHL: + case BRW_OPCODE_SHR: + case BRW_OPCODE_SUBB: + if (arg == 1) { + inst->src[arg] = value; + return true; + } + break; + + case BRW_OPCODE_MACH: + case BRW_OPCODE_MUL: + case SHADER_OPCODE_MULH: + case BRW_OPCODE_ADD: + case BRW_OPCODE_OR: + case BRW_OPCODE_AND: + case BRW_OPCODE_XOR: + case BRW_OPCODE_ADDC: + if (arg == 1) { + inst->src[arg] = value; + return true; + } else if (arg == 0 && inst->src[1].file != IMM) { + /* Fit this constant in by commuting the operands. Exception: we + * can't do this for 32-bit integer MUL/MACH because it's asymmetric. + */ + if ((inst->opcode == BRW_OPCODE_MUL || + inst->opcode == BRW_OPCODE_MACH) && + (inst->src[1].type == BRW_REGISTER_TYPE_D || + inst->src[1].type == BRW_REGISTER_TYPE_UD)) + break; + inst->src[0] = inst->src[1]; + inst->src[1] = value; + return true; + } + break; + case GS_OPCODE_SET_WRITE_OFFSET: + /* This is just a multiply by a constant with special strides. + * The generator will handle immediates in both arguments (generating + * a single MOV of the product). 
So feel free to propagate in src0. + */ + inst->src[arg] = value; + return true; + + case BRW_OPCODE_CMP: + if (arg == 1) { + inst->src[arg] = value; + return true; + } else if (arg == 0 && inst->src[1].file != IMM) { + enum brw_conditional_mod new_cmod; + + new_cmod = brw_swap_cmod(inst->conditional_mod); + if (new_cmod != BRW_CONDITIONAL_NONE) { + /* Fit this constant in by swapping the operands and + * flipping the test. + */ + inst->src[0] = inst->src[1]; + inst->src[1] = value; + inst->conditional_mod = new_cmod; + return true; + } + } + break; + + case BRW_OPCODE_SEL: + if (arg == 1) { + inst->src[arg] = value; + return true; + } else if (arg == 0 && inst->src[1].file != IMM) { + inst->src[0] = inst->src[1]; + inst->src[1] = value; + + /* If this was predicated, flipping operands means + * we also need to flip the predicate. + */ + if (inst->conditional_mod == BRW_CONDITIONAL_NONE) { + inst->predicate_inverse = !inst->predicate_inverse; + } + return true; + } + break; + + default: + break; + } + + return false; +} + +static bool +is_align1_opcode(unsigned opcode) +{ + switch (opcode) { + case VEC4_OPCODE_FROM_DOUBLE: + case VEC4_OPCODE_TO_DOUBLE: + case VEC4_OPCODE_PICK_LOW_32BIT: + case VEC4_OPCODE_PICK_HIGH_32BIT: + case VEC4_OPCODE_SET_LOW_32BIT: + case VEC4_OPCODE_SET_HIGH_32BIT: + return true; + default: + return false; + } +} + +static bool +try_copy_propagate(const struct gen_device_info *devinfo, + vec4_instruction *inst, int arg, + const copy_entry *entry, int attributes_per_reg) +{ + /* Build up the value we are propagating as if it were the source of a + * single MOV + */ + src_reg value = + get_copy_value(*entry, + brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle, + WRITEMASK_XYZW)); + + /* Check that we can propagate that value */ + if (value.file != UNIFORM && + value.file != VGRF && + value.file != ATTR) + return false; + + /* In gen < 8 instructions that write 2 registers also need to read 2 + * registers. 
Make sure we don't break that restriction by copy + * propagating from a uniform. + */ + if (devinfo->gen < 8 && inst->size_written > REG_SIZE && is_uniform(value)) + return false; + + /* There is a regioning restriction such that if execsize == width + * and hstride != 0 then the vstride can't be 0. When we split instrutions + * that take a single-precision source (like F->DF conversions) we end up + * with a 4-wide source on an instruction with an execution size of 4. + * If we then copy-propagate the source from a uniform we also end up with a + * vstride of 0 and we violate the restriction. + */ + if (inst->exec_size == 4 && value.file == UNIFORM && + type_sz(value.type) == 4) + return false; + + /* If the type of the copy value is different from the type of the + * instruction then the swizzles and writemasks involved don't have the same + * meaning and simply replacing the source would produce different semantics. + */ + if (type_sz(value.type) != type_sz(inst->src[arg].type)) + return false; + + if (devinfo->gen >= 8 && (value.negate || value.abs) && + is_logic_op(inst->opcode)) { + return false; + } + + if (inst->src[arg].offset % REG_SIZE || value.offset % REG_SIZE) + return false; + + bool has_source_modifiers = value.negate || value.abs; + + /* gen6 math and gen7+ SENDs from GRFs ignore source modifiers on + * instructions. 
+ */ + if ((has_source_modifiers || value.file == UNIFORM || + value.swizzle != BRW_SWIZZLE_XYZW) && !inst->can_do_source_mods(devinfo)) + return false; + + if (has_source_modifiers && + value.type != inst->src[arg].type && + !inst->can_change_types()) + return false; + + if (has_source_modifiers && + inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE) + return false; + + unsigned composed_swizzle = brw_compose_swizzle(inst->src[arg].swizzle, + value.swizzle); + + /* Instructions that operate on vectors in ALIGN1 mode will ignore swizzles + * so copy-propagation won't be safe if the composed swizzle is anything + * other than the identity. + */ + if (is_align1_opcode(inst->opcode) && composed_swizzle != BRW_SWIZZLE_XYZW) + return false; + + if (inst->is_3src(devinfo) && + (value.file == UNIFORM || + (value.file == ATTR && attributes_per_reg != 1)) && + !brw_is_single_value_swizzle(composed_swizzle)) + return false; + + if (inst->is_send_from_grf()) + return false; + + /* we can't generally copy-propagate UD negations becuse we + * end up accessing the resulting values as signed integers + * instead. See also resolve_ud_negate(). + */ + if (value.negate && + value.type == BRW_REGISTER_TYPE_UD) + return false; + + /* Don't report progress if this is a noop. */ + if (value.equals(inst->src[arg])) + return false; + + const unsigned dst_saturate_mask = inst->dst.writemask & + brw_apply_swizzle_to_mask(inst->src[arg].swizzle, entry->saturatemask); + + if (dst_saturate_mask) { + /* We either saturate all or nothing. */ + if (dst_saturate_mask != inst->dst.writemask) + return false; + + /* Limit saturate propagation only to SEL with src1 bounded within 0.0 + * and 1.0, otherwise skip copy propagate altogether. 
+ */ + switch(inst->opcode) { + case BRW_OPCODE_SEL: + if (arg != 0 || + inst->src[0].type != BRW_REGISTER_TYPE_F || + inst->src[1].file != IMM || + inst->src[1].type != BRW_REGISTER_TYPE_F || + inst->src[1].f < 0.0 || + inst->src[1].f > 1.0) { + return false; + } + if (!inst->saturate) + inst->saturate = true; + break; + default: + return false; + } + } + + /* Build the final value */ + if (inst->src[arg].abs) { + value.negate = false; + value.abs = true; + } + if (inst->src[arg].negate) + value.negate = !value.negate; + + value.swizzle = composed_swizzle; + if (has_source_modifiers && + value.type != inst->src[arg].type) { + assert(inst->can_change_types()); + for (int i = 0; i < 3; i++) { + inst->src[i].type = value.type; + } + inst->dst.type = value.type; + } else { + value.type = inst->src[arg].type; + } + + inst->src[arg] = value; + return true; +} + +bool +vec4_visitor::opt_copy_propagation(bool do_constant_prop) +{ + /* If we are in dual instanced or single mode, then attributes are going + * to be interleaved, so one register contains two attribute slots. + */ + const int attributes_per_reg = + prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2; + bool progress = false; + struct copy_entry entries[alloc.total_size]; + + memset(&entries, 0, sizeof(entries)); + + foreach_block_and_inst(block, vec4_instruction, inst, cfg) { + /* This pass only works on basic blocks. If there's flow + * control, throw out all our information and start from + * scratch. + * + * This should really be fixed by using a structure like in + * src/glsl/opt_copy_propagation.cpp to track available copies. 
+ */ + if (!is_dominated_by_previous_instruction(inst)) { + memset(&entries, 0, sizeof(entries)); + continue; + } + + /* For each source arg, see if each component comes from a copy + * from the same type file (IMM, VGRF, UNIFORM), and try + * optimizing out access to the copy result + */ + for (int i = 2; i >= 0; i--) { + /* Copied values end up in GRFs, and we don't track reladdr + * accesses. + */ + if (inst->src[i].file != VGRF || + inst->src[i].reladdr) + continue; + + /* We only handle register-aligned single GRF copies. */ + if (inst->size_read(i) != REG_SIZE || + inst->src[i].offset % REG_SIZE) + continue; + + const unsigned reg = (alloc.offsets[inst->src[i].nr] + + inst->src[i].offset / REG_SIZE); + const copy_entry &entry = entries[reg]; + + if (do_constant_prop && try_constant_propagate(devinfo, inst, i, &entry)) + progress = true; + else if (try_copy_propagate(devinfo, inst, i, &entry, attributes_per_reg)) + progress = true; + } + + /* Track available source registers. */ + if (inst->dst.file == VGRF) { + const int reg = + alloc.offsets[inst->dst.nr] + inst->dst.offset / REG_SIZE; + + /* Update our destination's current channel values. For a direct copy, + * the value is the newly propagated source. Otherwise, we don't know + * the new value, so clear it. + */ + bool direct_copy = is_direct_copy(inst); + entries[reg].saturatemask &= ~inst->dst.writemask; + for (int i = 0; i < 4; i++) { + if (inst->dst.writemask & (1 << i)) { + entries[reg].value[i] = direct_copy ? &inst->src[0] : NULL; + entries[reg].saturatemask |= + inst->saturate && direct_copy ? 1 << i : 0; + } + } + + /* Clear the records for any registers whose current value came from + * our destination's updated channels, as the two are no longer equal. 
+ */ + if (inst->dst.reladdr) + memset(&entries, 0, sizeof(entries)); + else { + for (unsigned i = 0; i < alloc.total_size; i++) { + for (int j = 0; j < 4; j++) { + if (is_channel_updated(inst, entries[i].value, j)) { + entries[i].value[j] = NULL; + entries[i].saturatemask &= ~(1 << j); + } + } + } + } + } + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + +} /* namespace brw */ diff --git a/src/intel/compiler/brw_vec4_cse.cpp b/src/intel/compiler/brw_vec4_cse.cpp new file mode 100644 index 00000000000..2e65ef78548 --- /dev/null +++ b/src/intel/compiler/brw_vec4_cse.cpp @@ -0,0 +1,296 @@ +/* + * Copyright © 2012, 2013, 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "brw_vec4.h" +#include "brw_vec4_live_variables.h" +#include "brw_cfg.h" + +using namespace brw; + +/** @file brw_vec4_cse.cpp + * + * Support for local common subexpression elimination. + * + * See Muchnick's Advanced Compiler Design and Implementation, section + * 13.1 (p378). + */ + +namespace { +struct aeb_entry : public exec_node { + /** The instruction that generates the expression value. */ + vec4_instruction *generator; + + /** The temporary where the value is stored. */ + src_reg tmp; +}; +} + +static bool +is_expression(const vec4_instruction *const inst) +{ + switch (inst->opcode) { + case BRW_OPCODE_MOV: + case BRW_OPCODE_SEL: + case BRW_OPCODE_NOT: + case BRW_OPCODE_AND: + case BRW_OPCODE_OR: + case BRW_OPCODE_XOR: + case BRW_OPCODE_SHR: + case BRW_OPCODE_SHL: + case BRW_OPCODE_ASR: + case BRW_OPCODE_CMP: + case BRW_OPCODE_CMPN: + case BRW_OPCODE_ADD: + case BRW_OPCODE_MUL: + case SHADER_OPCODE_MULH: + case BRW_OPCODE_FRC: + case BRW_OPCODE_RNDU: + case BRW_OPCODE_RNDD: + case BRW_OPCODE_RNDE: + case BRW_OPCODE_RNDZ: + case BRW_OPCODE_LINE: + case BRW_OPCODE_PLN: + case BRW_OPCODE_MAD: + case BRW_OPCODE_LRP: + case VEC4_OPCODE_UNPACK_UNIFORM: + case SHADER_OPCODE_FIND_LIVE_CHANNEL: + case SHADER_OPCODE_BROADCAST: + case TCS_OPCODE_SET_INPUT_URB_OFFSETS: + case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: + return true; + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_POW: + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + return inst->mlen == 0; + default: + return false; + } +} + +static bool +operands_match(const vec4_instruction *a, const vec4_instruction *b) +{ + const src_reg *xs = a->src; + const src_reg *ys = b->src; + + if (a->opcode == BRW_OPCODE_MAD) { + return xs[0].equals(ys[0]) && + ((xs[1].equals(ys[1]) && xs[2].equals(ys[2])) || + (xs[2].equals(ys[1]) && 
xs[1].equals(ys[2]))); + } else if (!a->is_commutative()) { + return xs[0].equals(ys[0]) && xs[1].equals(ys[1]) && xs[2].equals(ys[2]); + } else { + return (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) || + (xs[1].equals(ys[0]) && xs[0].equals(ys[1])); + } +} + +static bool +instructions_match(vec4_instruction *a, vec4_instruction *b) +{ + return a->opcode == b->opcode && + a->saturate == b->saturate && + a->predicate == b->predicate && + a->predicate_inverse == b->predicate_inverse && + a->conditional_mod == b->conditional_mod && + a->flag_subreg == b->flag_subreg && + a->dst.type == b->dst.type && + a->offset == b->offset && + a->mlen == b->mlen && + a->base_mrf == b->base_mrf && + a->header_size == b->header_size && + a->shadow_compare == b->shadow_compare && + a->dst.writemask == b->dst.writemask && + a->force_writemask_all == b->force_writemask_all && + a->size_written == b->size_written && + a->exec_size == b->exec_size && + a->group == b->group && + operands_match(a, b); +} + +bool +vec4_visitor::opt_cse_local(bblock_t *block) +{ + bool progress = false; + exec_list aeb; + + void *cse_ctx = ralloc_context(NULL); + + int ip = block->start_ip; + foreach_inst_in_block (vec4_instruction, inst, block) { + /* Skip some cases. */ + if (is_expression(inst) && !inst->predicate && inst->mlen == 0 && + ((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) || + inst->dst.is_null())) + { + bool found = false; + + foreach_in_list_use_after(aeb_entry, entry, &aeb) { + /* Match current instruction's expression against those in AEB. */ + if (!(entry->generator->dst.is_null() && !inst->dst.is_null()) && + instructions_match(inst, entry->generator)) { + found = true; + progress = true; + break; + } + } + + if (!found) { + if (inst->opcode != BRW_OPCODE_MOV || + (inst->opcode == BRW_OPCODE_MOV && + inst->src[0].file == IMM && + inst->src[0].type == BRW_REGISTER_TYPE_VF)) { + /* Our first sighting of this expression. Create an entry. 
*/ + aeb_entry *entry = ralloc(cse_ctx, aeb_entry); + entry->tmp = src_reg(); /* file will be BAD_FILE */ + entry->generator = inst; + aeb.push_tail(entry); + } + } else { + /* This is at least our second sighting of this expression. + * If we don't have a temporary already, make one. + */ + bool no_existing_temp = entry->tmp.file == BAD_FILE; + if (no_existing_temp && !entry->generator->dst.is_null()) { + entry->tmp = retype(src_reg(VGRF, alloc.allocate( + regs_written(entry->generator)), + NULL), inst->dst.type); + + const unsigned width = entry->generator->exec_size; + unsigned component_size = width * type_sz(entry->tmp.type); + unsigned num_copy_movs = + DIV_ROUND_UP(entry->generator->size_written, component_size); + for (unsigned i = 0; i < num_copy_movs; ++i) { + vec4_instruction *copy = + MOV(offset(entry->generator->dst, width, i), + offset(entry->tmp, width, i)); + copy->exec_size = width; + copy->group = entry->generator->group; + copy->force_writemask_all = + entry->generator->force_writemask_all; + entry->generator->insert_after(block, copy); + } + + entry->generator->dst = dst_reg(entry->tmp); + } + + /* dest <- temp */ + if (!inst->dst.is_null()) { + assert(inst->dst.type == entry->tmp.type); + const unsigned width = inst->exec_size; + unsigned component_size = width * type_sz(inst->dst.type); + unsigned num_copy_movs = + DIV_ROUND_UP(inst->size_written, component_size); + for (unsigned i = 0; i < num_copy_movs; ++i) { + vec4_instruction *copy = + MOV(offset(inst->dst, width, i), + offset(entry->tmp, width, i)); + copy->exec_size = inst->exec_size; + copy->group = inst->group; + copy->force_writemask_all = inst->force_writemask_all; + inst->insert_before(block, copy); + } + } + + /* Set our iterator so that next time through the loop inst->next + * will get the instruction in the basic block after the one we've + * removed. 
+ */ + vec4_instruction *prev = (vec4_instruction *)inst->prev; + + inst->remove(block); + inst = prev; + } + } + + foreach_in_list_safe(aeb_entry, entry, &aeb) { + /* Kill all AEB entries that write a different value to or read from + * the flag register if we just wrote it. + */ + if (inst->writes_flag()) { + if (entry->generator->reads_flag() || + (entry->generator->writes_flag() && + !instructions_match(inst, entry->generator))) { + entry->remove(); + ralloc_free(entry); + continue; + } + } + + for (int i = 0; i < 3; i++) { + src_reg *src = &entry->generator->src[i]; + + /* Kill all AEB entries that use the destination we just + * overwrote. + */ + if (inst->dst.file == entry->generator->src[i].file && + inst->dst.nr == entry->generator->src[i].nr) { + entry->remove(); + ralloc_free(entry); + break; + } + + /* Kill any AEB entries using registers that don't get reused any + * more -- a sure sign they'll fail operands_match(). + */ + if (src->file == VGRF) { + if (var_range_end(var_from_reg(alloc, dst_reg(*src)), 8) < ip) { + entry->remove(); + ralloc_free(entry); + break; + } + } + } + } + + ip++; + } + + ralloc_free(cse_ctx); + + return progress; +} + +bool +vec4_visitor::opt_cse() +{ + bool progress = false; + + calculate_live_intervals(); + + foreach_block (block, cfg) { + progress = opt_cse_local(block) || progress; + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} diff --git a/src/intel/compiler/brw_vec4_dead_code_eliminate.cpp b/src/intel/compiler/brw_vec4_dead_code_eliminate.cpp new file mode 100644 index 00000000000..5b22a096dd1 --- /dev/null +++ b/src/intel/compiler/brw_vec4_dead_code_eliminate.cpp @@ -0,0 +1,160 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, 
modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_vec4.h" +#include "brw_vec4_live_variables.h" +#include "brw_cfg.h" + +/** @file brw_vec4_dead_code_eliminate.cpp + * + * Dataflow-aware dead code elimination. + * + * Walks the instruction list from the bottom, removing instructions that + * have results that both aren't used in later blocks and haven't been read + * yet in the tail end of this block. 
+ */ + +using namespace brw; + +bool +vec4_visitor::dead_code_eliminate() +{ + bool progress = false; + + calculate_live_intervals(); + + int num_vars = live_intervals->num_vars; + BITSET_WORD *live = rzalloc_array(NULL, BITSET_WORD, BITSET_WORDS(num_vars)); + BITSET_WORD *flag_live = rzalloc_array(NULL, BITSET_WORD, 1); + + foreach_block_reverse_safe(block, cfg) { + memcpy(live, live_intervals->block_data[block->num].liveout, + sizeof(BITSET_WORD) * BITSET_WORDS(num_vars)); + memcpy(flag_live, live_intervals->block_data[block->num].flag_liveout, + sizeof(BITSET_WORD)); + + foreach_inst_in_block_reverse_safe(vec4_instruction, inst, block) { + if ((inst->dst.file == VGRF && !inst->has_side_effects()) || + (inst->dst.is_null() && inst->writes_flag())){ + bool result_live[4] = { false }; + if (inst->dst.file == VGRF) { + for (unsigned i = 0; i < DIV_ROUND_UP(inst->size_written, 16); i++) { + for (int c = 0; c < 4; c++) { + const unsigned v = var_from_reg(alloc, inst->dst, c, i); + result_live[c] |= BITSET_TEST(live, v); + } + } + } else { + for (unsigned c = 0; c < 4; c++) + result_live[c] = BITSET_TEST(flag_live, c); + } + + /* If the instruction can't do writemasking, then it's all or + * nothing. 
+ */ + if (!inst->can_do_writemask(devinfo)) { + bool result = result_live[0] | result_live[1] | + result_live[2] | result_live[3]; + result_live[0] = result; + result_live[1] = result; + result_live[2] = result; + result_live[3] = result; + } + + for (int c = 0; c < 4; c++) { + if (!result_live[c] && inst->dst.writemask & (1 << c)) { + inst->dst.writemask &= ~(1 << c); + progress = true; + + if (inst->dst.writemask == 0) { + if (inst->writes_accumulator || inst->writes_flag()) { + inst->dst = dst_reg(retype(brw_null_reg(), inst->dst.type)); + } else { + inst->opcode = BRW_OPCODE_NOP; + break; + } + } + } + } + } + + if (inst->dst.is_null() && inst->writes_flag()) { + bool combined_live = false; + for (unsigned c = 0; c < 4; c++) + combined_live |= BITSET_TEST(flag_live, c); + + if (!combined_live) { + inst->opcode = BRW_OPCODE_NOP; + progress = true; + } + } + + if (inst->dst.file == VGRF && !inst->predicate && + !inst->is_align1_partial_write()) { + for (unsigned i = 0; i < DIV_ROUND_UP(inst->size_written, 16); i++) { + for (int c = 0; c < 4; c++) { + if (inst->dst.writemask & (1 << c)) { + const unsigned v = var_from_reg(alloc, inst->dst, c, i); + BITSET_CLEAR(live, v); + } + } + } + } + + if (inst->writes_flag() && !inst->predicate) { + for (unsigned c = 0; c < 4; c++) + BITSET_CLEAR(flag_live, c); + } + + if (inst->opcode == BRW_OPCODE_NOP) { + inst->remove(block); + continue; + } + + for (int i = 0; i < 3; i++) { + if (inst->src[i].file == VGRF) { + for (unsigned j = 0; j < DIV_ROUND_UP(inst->size_read(i), 16); j++) { + for (int c = 0; c < 4; c++) { + const unsigned v = var_from_reg(alloc, inst->src[i], c, j); + BITSET_SET(live, v); + } + } + } + } + + for (unsigned c = 0; c < 4; c++) { + if (inst->reads_flag(c)) { + BITSET_SET(flag_live, c); + } + } + } + } + + ralloc_free(live); + ralloc_free(flag_live); + + if (progress) + invalidate_live_intervals(); + + return progress; +} diff --git a/src/intel/compiler/brw_vec4_generator.cpp 
b/src/intel/compiler/brw_vec4_generator.cpp new file mode 100644 index 00000000000..2ac287f17fa --- /dev/null +++ b/src/intel/compiler/brw_vec4_generator.cpp @@ -0,0 +1,2217 @@ +/* Copyright © 2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_vec4.h" +#include "brw_cfg.h" +#include "brw_eu.h" +#include "common/gen_debug.h" + +using namespace brw; + +static void +generate_math1_gen4(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg src) +{ + gen4_math(p, + dst, + brw_math_function(inst->opcode), + inst->base_mrf, + src, + BRW_MATH_PRECISION_FULL); +} + +static void +check_gen6_math_src_arg(struct brw_reg src) +{ + /* Source swizzles are ignored. 
*/ + assert(!src.abs); + assert(!src.negate); + assert(src.swizzle == BRW_SWIZZLE_XYZW); +} + +static void +generate_math_gen6(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg src0, + struct brw_reg src1) +{ + /* Can't do writemask because math can't be align16. */ + assert(dst.writemask == WRITEMASK_XYZW); + /* Source swizzles are ignored. */ + check_gen6_math_src_arg(src0); + if (src1.file == BRW_GENERAL_REGISTER_FILE) + check_gen6_math_src_arg(src1); + + brw_set_default_access_mode(p, BRW_ALIGN_1); + gen6_math(p, dst, brw_math_function(inst->opcode), src0, src1); + brw_set_default_access_mode(p, BRW_ALIGN_16); +} + +static void +generate_math2_gen4(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg src0, + struct brw_reg src1) +{ + /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13 + * "Message Payload": + * + * "Operand0[7]. For the INT DIV functions, this operand is the + * denominator." + * ... + * "Operand1[7]. For the INT DIV functions, this operand is the + * numerator." + */ + bool is_int_div = inst->opcode != SHADER_OPCODE_POW; + struct brw_reg &op0 = is_int_div ? src1 : src0; + struct brw_reg &op1 = is_int_div ? 
src0 : src1; + + brw_push_insn_state(p); + brw_set_default_saturate(p, false); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), op1.type), op1); + brw_pop_insn_state(p); + + gen4_math(p, + dst, + brw_math_function(inst->opcode), + inst->base_mrf, + op0, + BRW_MATH_PRECISION_FULL); +} + +static void +generate_tex(struct brw_codegen *p, + struct brw_vue_prog_data *prog_data, + gl_shader_stage stage, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg src, + struct brw_reg surface_index, + struct brw_reg sampler_index) +{ + const struct gen_device_info *devinfo = p->devinfo; + int msg_type = -1; + + if (devinfo->gen >= 5) { + switch (inst->opcode) { + case SHADER_OPCODE_TEX: + case SHADER_OPCODE_TXL: + if (inst->shadow_compare) { + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE; + } else { + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD; + } + break; + case SHADER_OPCODE_TXD: + if (inst->shadow_compare) { + /* Gen7.5+. Otherwise, lowered by brw_lower_texture_gradients(). 
*/ + assert(devinfo->gen >= 8 || devinfo->is_haswell); + msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE; + } else { + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS; + } + break; + case SHADER_OPCODE_TXF: + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; + break; + case SHADER_OPCODE_TXF_CMS_W: + assert(devinfo->gen >= 9); + msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W; + break; + case SHADER_OPCODE_TXF_CMS: + if (devinfo->gen >= 7) + msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS; + else + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; + break; + case SHADER_OPCODE_TXF_MCS: + assert(devinfo->gen >= 7); + msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS; + break; + case SHADER_OPCODE_TXS: + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO; + break; + case SHADER_OPCODE_TG4: + if (inst->shadow_compare) { + msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C; + } else { + msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4; + } + break; + case SHADER_OPCODE_TG4_OFFSET: + if (inst->shadow_compare) { + msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C; + } else { + msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO; + } + break; + case SHADER_OPCODE_SAMPLEINFO: + msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO; + break; + default: + unreachable("should not get here: invalid vec4 texture opcode"); + } + } else { + switch (inst->opcode) { + case SHADER_OPCODE_TEX: + case SHADER_OPCODE_TXL: + if (inst->shadow_compare) { + msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE; + assert(inst->mlen == 3); + } else { + msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD; + assert(inst->mlen == 2); + } + break; + case SHADER_OPCODE_TXD: + /* There is no sample_d_c message; comparisons are done manually. 
*/ + msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS; + assert(inst->mlen == 4); + break; + case SHADER_OPCODE_TXF: + msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_LD; + assert(inst->mlen == 2); + break; + case SHADER_OPCODE_TXS: + msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO; + assert(inst->mlen == 2); + break; + default: + unreachable("should not get here: invalid vec4 texture opcode"); + } + } + + assert(msg_type != -1); + + assert(sampler_index.type == BRW_REGISTER_TYPE_UD); + + /* Load the message header if present. If there's a texture offset, we need + * to set it up explicitly and load the offset bitfield. Otherwise, we can + * use an implied move from g0 to the first message register. + */ + if (inst->header_size != 0) { + if (devinfo->gen < 6 && !inst->offset) { + /* Set up an implied move from g0 to the MRF. */ + src = brw_vec8_grf(0, 0); + } else { + struct brw_reg header = + retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD); + uint32_t dw2 = 0; + + /* Explicitly set up the message header by copying g0 to the MRF. */ + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_MOV(p, header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + + brw_set_default_access_mode(p, BRW_ALIGN_1); + + if (inst->offset) + /* Set the texel offset bits in DWord 2. */ + dw2 = inst->offset; + + if (devinfo->gen >= 9) + /* SKL+ overloads BRW_SAMPLER_SIMD_MODE_SIMD4X2 to also do SIMD8D, + * based on bit 22 in the header. + */ + dw2 |= GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2; + + /* The VS, DS, and FS stages have the g0.2 payload delivered as 0, + * so header0.2 is 0 when g0 is copied. The HS and GS stages do + * not, so we must set to to 0 to avoid setting undesirable bits + * in the message header. 
+ */ + if (dw2 || + stage == MESA_SHADER_TESS_CTRL || + stage == MESA_SHADER_GEOMETRY) { + brw_MOV(p, get_element_ud(header, 2), brw_imm_ud(dw2)); + } + + brw_adjust_sampler_state_pointer(p, header, sampler_index); + brw_pop_insn_state(p); + } + } + + uint32_t return_format; + + switch (dst.type) { + case BRW_REGISTER_TYPE_D: + return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32; + break; + case BRW_REGISTER_TYPE_UD: + return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32; + break; + default: + return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32; + break; + } + + uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 || + inst->opcode == SHADER_OPCODE_TG4_OFFSET) + ? prog_data->base.binding_table.gather_texture_start + : prog_data->base.binding_table.texture_start; + + if (surface_index.file == BRW_IMMEDIATE_VALUE && + sampler_index.file == BRW_IMMEDIATE_VALUE) { + uint32_t surface = surface_index.ud; + uint32_t sampler = sampler_index.ud; + + brw_SAMPLE(p, + dst, + inst->base_mrf, + src, + surface + base_binding_table_index, + sampler % 16, + msg_type, + 1, /* response length */ + inst->mlen, + inst->header_size != 0, + BRW_SAMPLER_SIMD_MODE_SIMD4X2, + return_format); + + brw_mark_surface_used(&prog_data->base, sampler + base_binding_table_index); + } else { + /* Non-constant sampler index. 
*/ + + struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); + struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD)); + struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD)); + + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_access_mode(p, BRW_ALIGN_1); + + if (brw_regs_equal(&surface_reg, &sampler_reg)) { + brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101)); + } else { + if (sampler_reg.file == BRW_IMMEDIATE_VALUE) { + brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8)); + } else { + brw_SHL(p, addr, sampler_reg, brw_imm_ud(8)); + brw_OR(p, addr, addr, surface_reg); + } + } + if (base_binding_table_index) + brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index)); + brw_AND(p, addr, addr, brw_imm_ud(0xfff)); + + brw_pop_insn_state(p); + + if (inst->base_mrf != -1) + gen6_resolve_implied_move(p, &src, inst->base_mrf); + + /* dst = send(offset, a0.0 | <descriptor>) */ + brw_inst *insn = brw_send_indirect_message( + p, BRW_SFID_SAMPLER, dst, src, addr); + brw_set_sampler_message(p, insn, + 0 /* surface */, + 0 /* sampler */, + msg_type, + 1 /* rlen */, + inst->mlen /* mlen */, + inst->header_size != 0 /* header */, + BRW_SAMPLER_SIMD_MODE_SIMD4X2, + return_format); + + /* visitor knows more than we do about the surface limit required, + * so has already done marking. 
+ */ + } +} + +static void +generate_vs_urb_write(struct brw_codegen *p, vec4_instruction *inst) +{ + brw_urb_WRITE(p, + brw_null_reg(), /* dest */ + inst->base_mrf, /* starting mrf reg nr */ + brw_vec8_grf(0, 0), /* src */ + inst->urb_write_flags, + inst->mlen, + 0, /* response len */ + inst->offset, /* urb destination offset */ + BRW_URB_SWIZZLE_INTERLEAVE); +} + +static void +generate_gs_urb_write(struct brw_codegen *p, vec4_instruction *inst) +{ + struct brw_reg src = brw_message_reg(inst->base_mrf); + brw_urb_WRITE(p, + brw_null_reg(), /* dest */ + inst->base_mrf, /* starting mrf reg nr */ + src, + inst->urb_write_flags, + inst->mlen, + 0, /* response len */ + inst->offset, /* urb destination offset */ + BRW_URB_SWIZZLE_INTERLEAVE); +} + +static void +generate_gs_urb_write_allocate(struct brw_codegen *p, vec4_instruction *inst) +{ + struct brw_reg src = brw_message_reg(inst->base_mrf); + + /* We pass the temporary passed in src0 as the writeback register */ + brw_urb_WRITE(p, + inst->src[0].as_brw_reg(), /* dest */ + inst->base_mrf, /* starting mrf reg nr */ + src, + BRW_URB_WRITE_ALLOCATE_COMPLETE, + inst->mlen, + 1, /* response len */ + inst->offset, /* urb destination offset */ + BRW_URB_SWIZZLE_INTERLEAVE); + + /* Now put allocated urb handle in dst.0 */ + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_MOV(p, get_element_ud(inst->dst.as_brw_reg(), 0), + get_element_ud(inst->src[0].as_brw_reg(), 0)); + brw_pop_insn_state(p); +} + +static void +generate_gs_thread_end(struct brw_codegen *p, vec4_instruction *inst) +{ + struct brw_reg src = brw_message_reg(inst->base_mrf); + brw_urb_WRITE(p, + brw_null_reg(), /* dest */ + inst->base_mrf, /* starting mrf reg nr */ + src, + BRW_URB_WRITE_EOT | inst->urb_write_flags, + inst->mlen, + 0, /* response len */ + 0, /* urb destination offset */ + BRW_URB_SWIZZLE_INTERLEAVE); +} + +static void +generate_gs_set_write_offset(struct 
brw_codegen *p, + struct brw_reg dst, + struct brw_reg src0, + struct brw_reg src1) +{ + /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message + * Header: M0.3): + * + * Slot 0 Offset. This field, after adding to the Global Offset field + * in the message descriptor, specifies the offset (in 256-bit units) + * from the start of the URB entry, as referenced by URB Handle 0, at + * which the data will be accessed. + * + * Similar text describes DWORD M0.4, which is slot 1 offset. + * + * Therefore, we want to multiply DWORDs 0 and 4 of src0 (the x components + * of the register for geometry shader invocations 0 and 1) by the + * immediate value in src1, and store the result in DWORDs 3 and 4 of dst. + * + * We can do this with the following EU instruction: + * + * mul(2) dst.3<1>UD src0<8;2,4>UD src1<...>UW { Align1 WE_all } + */ + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + assert(p->devinfo->gen >= 7 && + src1.file == BRW_IMMEDIATE_VALUE && + src1.type == BRW_REGISTER_TYPE_UD && + src1.ud <= USHRT_MAX); + if (src0.file == BRW_IMMEDIATE_VALUE) { + brw_MOV(p, suboffset(stride(dst, 2, 2, 1), 3), + brw_imm_ud(src0.ud * src1.ud)); + } else { + brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4), + retype(src1, BRW_REGISTER_TYPE_UW)); + } + brw_pop_insn_state(p); +} + +static void +generate_gs_set_vertex_count(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src) +{ + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + + if (p->devinfo->gen >= 8) { + /* Move the vertex count into the second MRF for the EOT write. */ + brw_MOV(p, retype(brw_message_reg(dst.nr + 1), BRW_REGISTER_TYPE_UD), + src); + } else { + /* If we think of the src and dst registers as composed of 8 DWORDs each, + * we want to pick up the contents of DWORDs 0 and 4 from src, truncate + * them to WORDs, and then pack them into DWORD 2 of dst. 
+ * + * It's easier to get the EU to do this if we think of the src and dst + * registers as composed of 16 WORDS each; then, we want to pick up the + * contents of WORDs 0 and 8 from src, and pack them into WORDs 4 and 5 + * of dst. + * + * We can do that by the following EU instruction: + * + * mov (2) dst.4<1>:uw src<8;1,0>:uw { Align1, Q1, NoMask } + */ + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_MOV(p, + suboffset(stride(retype(dst, BRW_REGISTER_TYPE_UW), 2, 2, 1), 4), + stride(retype(src, BRW_REGISTER_TYPE_UW), 8, 1, 0)); + } + brw_pop_insn_state(p); +} + +static void +generate_gs_svb_write(struct brw_codegen *p, + struct brw_vue_prog_data *prog_data, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg src0, + struct brw_reg src1) +{ + int binding = inst->sol_binding; + bool final_write = inst->sol_final_write; + + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_4); + /* Copy Vertex data into M0.x */ + brw_MOV(p, stride(dst, 4, 4, 1), + stride(retype(src0, BRW_REGISTER_TYPE_UD), 4, 4, 1)); + brw_pop_insn_state(p); + + brw_push_insn_state(p); + /* Send SVB Write */ + brw_svb_write(p, + final_write ? src1 : brw_null_reg(), /* dest == src1 */ + 1, /* msg_reg_nr */ + dst, /* src0 == previous dst */ + BRW_GEN6_SOL_BINDING_START + binding, /* binding_table_index */ + final_write); /* send_commit_msg */ + + /* Finally, wait for the write commit to occur so that we can proceed to + * other things safely. + * + * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3: + * + * The write commit does not modify the destination register, but + * merely clears the dependency associated with the destination + * register. Thus, a simple “mov” instruction using the register as a + * source is sufficient to wait for the write commit to occur. 
+ */ + if (final_write) { + brw_MOV(p, src1, src1); + } + brw_pop_insn_state(p); +} + +static void +generate_gs_svb_set_destination_index(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg src) +{ + int vertex = inst->sol_vertex; + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_MOV(p, get_element_ud(dst, 5), get_element_ud(src, vertex)); + brw_pop_insn_state(p); +} + +static void +generate_gs_set_dword_2(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src) +{ + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_MOV(p, suboffset(vec1(dst), 2), suboffset(vec1(src), 0)); + brw_pop_insn_state(p); +} + +static void +generate_gs_prepare_channel_masks(struct brw_codegen *p, + struct brw_reg dst) +{ + /* We want to left shift just DWORD 4 (the x component belonging to the + * second geometry shader invocation) by 4 bits. So generate the + * instruction: + * + * shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD { align1 WE_all } + */ + dst = suboffset(vec1(dst), 4); + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_SHL(p, dst, dst, brw_imm_ud(4)); + brw_pop_insn_state(p); +} + +static void +generate_gs_set_channel_masks(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src) +{ + /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message + * Header: M0.5): + * + * 15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask + * + * When Swizzle Control = URB_INTERLEAVED this bit controls Vertex 1 + * DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit controls + * Vertex 0 DATA[7]. This bit is ANDed with the corresponding + * channel enable to determine the final channel enable. 
For the + * URB_READ_OWORD & URB_READ_HWORD messages, when final channel + * enable is 1 it indicates that Vertex 1 DATA [3] will be included + * in the writeback message. For the URB_WRITE_OWORD & + * URB_WRITE_HWORD messages, when final channel enable is 1 it + * indicates that Vertex 1 DATA [3] will be written to the surface. + * + * 0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included + * 1: Vertex DATA [3] / Vertex 0 DATA[7] channel included + * + * 14 Vertex 1 DATA [2] Channel Mask + * 13 Vertex 1 DATA [1] Channel Mask + * 12 Vertex 1 DATA [0] Channel Mask + * 11 Vertex 0 DATA [3] Channel Mask + * 10 Vertex 0 DATA [2] Channel Mask + * 9 Vertex 0 DATA [1] Channel Mask + * 8 Vertex 0 DATA [0] Channel Mask + * + * (This is from a section of the PRM that is agnostic to the particular + * type of shader being executed, so "Vertex 0" and "Vertex 1" refer to + * geometry shader invocations 0 and 1, respectively). Since we have the + * enable flags for geometry shader invocation 0 in bits 3:0 of DWORD 0, + * and the enable flags for geometry shader invocation 1 in bits 7:0 of + * DWORD 4, we just need to OR them together and store the result in bits + * 15:8 of DWORD 5. + * + * It's easier to get the EU to do this if we think of the src and dst + * registers as composed of 32 bytes each; then, we want to pick up the + * contents of bytes 0 and 16 from src, OR them together, and store them in + * byte 21. + * + * We can do that by the following EU instruction: + * + * or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB { align1 WE_all } + * + * Note: this relies on the source register having zeros in (a) bits 7:4 of + * DWORD 0 and (b) bits 3:0 of DWORD 4. 
We can rely on (b) because the + * source register was prepared by GS_OPCODE_PREPARE_CHANNEL_MASKS (which + * shifts DWORD 4 left by 4 bits), and we can rely on (a) because prior to + * the execution of GS_OPCODE_PREPARE_CHANNEL_MASKS, DWORDs 0 and 4 need to + * contain valid channel mask values (which are in the range 0x0-0xf). + */ + dst = retype(dst, BRW_REGISTER_TYPE_UB); + src = retype(src, BRW_REGISTER_TYPE_UB); + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_OR(p, suboffset(vec1(dst), 21), vec1(src), suboffset(vec1(src), 16)); + brw_pop_insn_state(p); +} + +static void +generate_gs_get_instance_id(struct brw_codegen *p, + struct brw_reg dst) +{ + /* We want to right shift R0.0 & R0.1 by GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT + * and store into dst.0 & dst.4. So generate the instruction: + * + * shr(8) dst<1> R0<1,4,0> GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT { align1 WE_normal 1Q } + */ + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + dst = retype(dst, BRW_REGISTER_TYPE_UD); + struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + brw_SHR(p, dst, stride(r0, 1, 4, 0), + brw_imm_ud(GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT)); + brw_pop_insn_state(p); +} + +static void +generate_gs_ff_sync_set_primitives(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src0, + struct brw_reg src1, + struct brw_reg src2) +{ + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + /* Save src0 data in 16:31 bits of dst.0 */ + brw_AND(p, suboffset(vec1(dst), 0), suboffset(vec1(src0), 0), + brw_imm_ud(0xffffu)); + brw_SHL(p, suboffset(vec1(dst), 0), suboffset(vec1(dst), 0), brw_imm_ud(16)); + /* Save src1 data in 0:15 bits of dst.0 */ + brw_AND(p, suboffset(vec1(src2), 0), suboffset(vec1(src1), 0), + brw_imm_ud(0xffffu)); + brw_OR(p, suboffset(vec1(dst), 0), + suboffset(vec1(dst), 0), + suboffset(vec1(src2), 0)); + brw_pop_insn_state(p); +} + 
+static void +generate_gs_ff_sync(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg src0, + struct brw_reg src1) +{ + /* This opcode uses an implied MRF register for: + * - the header of the ff_sync message. And as such it is expected to be + * initialized to r0 before calling here. + * - the destination where we will write the allocated URB handle. + */ + struct brw_reg header = + retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD); + + /* Overwrite dword 0 of the header (SO vertices to write) and + * dword 1 (number of primitives written). + */ + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_MOV(p, get_element_ud(header, 0), get_element_ud(src1, 0)); + brw_MOV(p, get_element_ud(header, 1), get_element_ud(src0, 0)); + brw_pop_insn_state(p); + + /* Allocate URB handle in dst */ + brw_ff_sync(p, + dst, + 0, + header, + 1, /* allocate */ + 1, /* response length */ + 0 /* eot */); + + /* Now put allocated urb handle in header.0 */ + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_MOV(p, get_element_ud(header, 0), get_element_ud(dst, 0)); + + /* src1 is not an immediate when we use transform feedback */ + if (src1.file != BRW_IMMEDIATE_VALUE) { + brw_set_default_exec_size(p, BRW_EXECUTE_4); + brw_MOV(p, brw_vec4_grf(src1.nr, 0), brw_vec4_grf(dst.nr, 1)); + } + + brw_pop_insn_state(p); +} + +static void +generate_gs_set_primitive_id(struct brw_codegen *p, struct brw_reg dst) +{ + /* In gen6, PrimitiveID is delivered in R0.1 of the payload */ + struct brw_reg src = brw_vec8_grf(0, 0); + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_MOV(p, get_element_ud(dst, 0), get_element_ud(src, 1)); + brw_pop_insn_state(p); +} + +static void +generate_tcs_get_instance_id(struct brw_codegen 
*p, struct brw_reg dst) +{ + const struct gen_device_info *devinfo = p->devinfo; + const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail; + + /* "Instance Count" comes as part of the payload in r0.2 bits 23:17. + * + * Since we operate in SIMD4x2 mode, we need run half as many threads + * as necessary. So we assign (2i + 1, 2i) as the thread counts. We + * shift right by one less to accomplish the multiplication by two. + */ + dst = retype(dst, BRW_REGISTER_TYPE_UD); + struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + + const int mask = ivb ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17); + const int shift = ivb ? 16 : 17; + + brw_AND(p, get_element_ud(dst, 0), get_element_ud(r0, 2), brw_imm_ud(mask)); + brw_SHR(p, get_element_ud(dst, 0), get_element_ud(dst, 0), + brw_imm_ud(shift - 1)); + brw_ADD(p, get_element_ud(dst, 4), get_element_ud(dst, 0), brw_imm_ud(1)); + + brw_pop_insn_state(p); +} + +static void +generate_tcs_urb_write(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg urb_header) +{ + const struct gen_device_info *devinfo = p->devinfo; + + brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, send, brw_null_reg()); + brw_set_src0(p, send, urb_header); + + brw_set_message_descriptor(p, send, BRW_SFID_URB, + inst->mlen /* mlen */, 0 /* rlen */, + true /* header */, false /* eot */); + brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_WRITE_OWORD); + brw_inst_set_urb_global_offset(devinfo, send, inst->offset); + if (inst->urb_write_flags & BRW_URB_WRITE_EOT) { + brw_inst_set_eot(devinfo, send, 1); + } else { + brw_inst_set_urb_per_slot_offset(devinfo, send, 1); + brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE); + } + + /* what happens to swizzles? 
*/ +} + + +static void +generate_tcs_input_urb_offsets(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg vertex, + struct brw_reg offset) +{ + /* Generates an URB read/write message header for HS/DS operation. + * Inputs are a vertex index, and a byte offset from the beginning of + * the vertex. */ + + /* If `vertex` is not an immediate, we clobber a0.0 */ + + assert(vertex.file == BRW_IMMEDIATE_VALUE || vertex.file == BRW_GENERAL_REGISTER_FILE); + assert(vertex.type == BRW_REGISTER_TYPE_UD || vertex.type == BRW_REGISTER_TYPE_D); + + assert(dst.file == BRW_GENERAL_REGISTER_FILE); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_MOV(p, dst, brw_imm_ud(0)); + + /* m0.5 bits 8-15 are channel enables */ + brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00)); + + /* m0.0-0.1: URB handles */ + if (vertex.file == BRW_IMMEDIATE_VALUE) { + uint32_t vertex_index = vertex.ud; + struct brw_reg index_reg = brw_vec1_grf( + 1 + (vertex_index >> 3), vertex_index & 7); + + brw_MOV(p, vec2(get_element_ud(dst, 0)), + retype(index_reg, BRW_REGISTER_TYPE_UD)); + } else { + /* Use indirect addressing. ICP Handles are DWords (single channels + * of a register) and start at g1.0. + * + * In order to start our region at g1.0, we add 8 to the vertex index, + * effectively skipping over the 8 channels in g0.0. This gives us a + * DWord offset to the ICP Handle. + * + * Indirect addressing works in terms of bytes, so we then multiply + * the DWord offset by 4 (by shifting left by 2). 
+ */ + struct brw_reg addr = brw_address_reg(0); + + /* bottom half: m0.0 = g[1.0 + vertex.0]UD */ + brw_ADD(p, addr, retype(get_element_ud(vertex, 0), BRW_REGISTER_TYPE_UW), + brw_imm_uw(0x8)); + brw_SHL(p, addr, addr, brw_imm_uw(2)); + brw_MOV(p, get_element_ud(dst, 0), deref_1ud(brw_indirect(0, 0), 0)); + + /* top half: m0.1 = g[1.0 + vertex.4]UD */ + brw_ADD(p, addr, retype(get_element_ud(vertex, 4), BRW_REGISTER_TYPE_UW), + brw_imm_uw(0x8)); + brw_SHL(p, addr, addr, brw_imm_uw(2)); + brw_MOV(p, get_element_ud(dst, 1), deref_1ud(brw_indirect(0, 0), 0)); + } + + /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */ + if (offset.file != ARF) + brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0)); + + brw_pop_insn_state(p); +} + + +static void +generate_tcs_output_urb_offsets(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg write_mask, + struct brw_reg offset) +{ + /* Generates an URB read/write message header for HS/DS operation, for the patch URB entry. 
*/ + assert(dst.file == BRW_GENERAL_REGISTER_FILE || dst.file == BRW_MESSAGE_REGISTER_FILE); + + assert(write_mask.file == BRW_IMMEDIATE_VALUE); + assert(write_mask.type == BRW_REGISTER_TYPE_UD); + + brw_push_insn_state(p); + + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_MOV(p, dst, brw_imm_ud(0)); + + unsigned mask = write_mask.ud; + + /* m0.5 bits 15:12 and 11:8 are channel enables */ + brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud((mask << 8) | (mask << 12))); + + /* HS patch URB handle is delivered in r0.0 */ + struct brw_reg urb_handle = brw_vec1_grf(0, 0); + + /* m0.0-0.1: URB handles */ + brw_MOV(p, vec2(get_element_ud(dst, 0)), + retype(urb_handle, BRW_REGISTER_TYPE_UD)); + + /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */ + if (offset.file != ARF) + brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0)); + + brw_pop_insn_state(p); +} + +static void +generate_tes_create_input_read_header(struct brw_codegen *p, + struct brw_reg dst) +{ + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + + /* Initialize the register to 0 */ + brw_MOV(p, dst, brw_imm_ud(0)); + + /* Enable all the channels in m0.5 bits 15:8 */ + brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00)); + + /* Copy g1.3 (the patch URB handle) to m0.0 and m0.1. For safety, + * mask out irrelevant "Reserved" bits, as they're not marked MBZ. 
+ */ + brw_AND(p, vec2(get_element_ud(dst, 0)), + retype(brw_vec1_grf(1, 3), BRW_REGISTER_TYPE_UD), + brw_imm_ud(0x1fff)); + brw_pop_insn_state(p); +} + +static void +generate_tes_add_indirect_urb_offset(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg header, + struct brw_reg offset) +{ + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + + brw_MOV(p, dst, header); + /* m0.3-0.4: 128-bit-granular offsets into the URB from the handles */ + brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0)); + + brw_pop_insn_state(p); +} + +static void +generate_vec4_urb_read(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg header) +{ + const struct gen_device_info *devinfo = p->devinfo; + + assert(header.file == BRW_GENERAL_REGISTER_FILE); + assert(header.type == BRW_REGISTER_TYPE_UD); + + brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, send, dst); + brw_set_src0(p, send, header); + + brw_set_message_descriptor(p, send, BRW_SFID_URB, + 1 /* mlen */, 1 /* rlen */, + true /* header */, false /* eot */); + brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD); + brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE); + brw_inst_set_urb_per_slot_offset(devinfo, send, 1); + + brw_inst_set_urb_global_offset(devinfo, send, inst->offset); +} + +static void +generate_tcs_release_input(struct brw_codegen *p, + struct brw_reg header, + struct brw_reg vertex, + struct brw_reg is_unpaired) +{ + const struct gen_device_info *devinfo = p->devinfo; + + assert(vertex.file == BRW_IMMEDIATE_VALUE); + assert(vertex.type == BRW_REGISTER_TYPE_UD); + + /* m0.0-0.1: URB handles */ + struct brw_reg urb_handles = + retype(brw_vec2_grf(1 + (vertex.ud >> 3), vertex.ud & 7), + BRW_REGISTER_TYPE_UD); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, 
BRW_MASK_DISABLE); + brw_MOV(p, header, brw_imm_ud(0)); + brw_MOV(p, vec2(get_element_ud(header, 0)), urb_handles); + brw_pop_insn_state(p); + + brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, send, brw_null_reg()); + brw_set_src0(p, send, header); + brw_set_message_descriptor(p, send, BRW_SFID_URB, + 1 /* mlen */, 0 /* rlen */, + true /* header */, false /* eot */); + brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD); + brw_inst_set_urb_complete(devinfo, send, 1); + brw_inst_set_urb_swizzle_control(devinfo, send, is_unpaired.ud ? + BRW_URB_SWIZZLE_NONE : + BRW_URB_SWIZZLE_INTERLEAVE); +} + +static void +generate_tcs_thread_end(struct brw_codegen *p, vec4_instruction *inst) +{ + struct brw_reg header = brw_message_reg(inst->base_mrf); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_MOV(p, header, brw_imm_ud(0)); + brw_MOV(p, get_element_ud(header, 5), brw_imm_ud(WRITEMASK_X << 8)); + brw_MOV(p, get_element_ud(header, 0), + retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)); + brw_MOV(p, brw_message_reg(inst->base_mrf + 1), brw_imm_ud(0u)); + brw_pop_insn_state(p); + + brw_urb_WRITE(p, + brw_null_reg(), /* dest */ + inst->base_mrf, /* starting mrf reg nr */ + header, + BRW_URB_WRITE_EOT | BRW_URB_WRITE_OWORD | + BRW_URB_WRITE_USE_CHANNEL_MASKS, + inst->mlen, + 0, /* response len */ + 0, /* urb destination offset */ + 0); +} + +static void +generate_tes_get_primitive_id(struct brw_codegen *p, struct brw_reg dst) +{ + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_MOV(p, dst, retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_D)); + brw_pop_insn_state(p); +} + +static void +generate_tcs_get_primitive_id(struct brw_codegen *p, struct brw_reg dst) +{ + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_MOV(p, dst, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD)); + brw_pop_insn_state(p); +} + 
+static void +generate_tcs_create_barrier_header(struct brw_codegen *p, + struct brw_vue_prog_data *prog_data, + struct brw_reg dst) +{ + const struct gen_device_info *devinfo = p->devinfo; + const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail; + struct brw_reg m0_2 = get_element_ud(dst, 2); + unsigned instances = ((struct brw_tcs_prog_data *) prog_data)->instances; + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + + /* Zero the message header */ + brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)); + + /* Copy "Barrier ID" from r0.2, bits 16:13 (Gen7.5+) or 15:12 (Gen7) */ + brw_AND(p, m0_2, + retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), + brw_imm_ud(ivb ? INTEL_MASK(15, 12) : INTEL_MASK(16, 13))); + + /* Shift it up to bits 27:24. */ + brw_SHL(p, m0_2, get_element_ud(dst, 2), brw_imm_ud(ivb ? 12 : 11)); + + /* Set the Barrier Count and the enable bit */ + brw_OR(p, m0_2, m0_2, brw_imm_ud(instances << 9 | (1 << 15))); + + brw_pop_insn_state(p); +} + +static void +generate_oword_dual_block_offsets(struct brw_codegen *p, + struct brw_reg m1, + struct brw_reg index) +{ + int second_vertex_offset; + + if (p->devinfo->gen >= 6) + second_vertex_offset = 1; + else + second_vertex_offset = 16; + + m1 = retype(m1, BRW_REGISTER_TYPE_D); + + /* Set up M1 (message payload). Only the block offsets in M1.0 and + * M1.4 are used, and the rest are ignored. 
+ */ + struct brw_reg m1_0 = suboffset(vec1(m1), 0); + struct brw_reg m1_4 = suboffset(vec1(m1), 4); + struct brw_reg index_0 = suboffset(vec1(index), 0); + struct brw_reg index_4 = suboffset(vec1(index), 4); + + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_access_mode(p, BRW_ALIGN_1); + + brw_MOV(p, m1_0, index_0); + + if (index.file == BRW_IMMEDIATE_VALUE) { + index_4.ud += second_vertex_offset; + brw_MOV(p, m1_4, index_4); + } else { + brw_ADD(p, m1_4, index_4, brw_imm_d(second_vertex_offset)); + } + + brw_pop_insn_state(p); +} + +static void +generate_unpack_flags(struct brw_codegen *p, + struct brw_reg dst) +{ + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_access_mode(p, BRW_ALIGN_1); + + struct brw_reg flags = brw_flag_reg(0, 0); + struct brw_reg dst_0 = suboffset(vec1(dst), 0); + struct brw_reg dst_4 = suboffset(vec1(dst), 4); + + brw_AND(p, dst_0, flags, brw_imm_ud(0x0f)); + brw_AND(p, dst_4, flags, brw_imm_ud(0xf0)); + brw_SHR(p, dst_4, dst_4, brw_imm_ud(4)); + + brw_pop_insn_state(p); +} + +static void +generate_scratch_read(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg index) +{ + const struct gen_device_info *devinfo = p->devinfo; + struct brw_reg header = brw_vec8_grf(0, 0); + + gen6_resolve_implied_move(p, &header, inst->base_mrf); + + generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1), + index); + + uint32_t msg_type; + + if (devinfo->gen >= 6) + msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; + else if (devinfo->gen == 5 || devinfo->is_g4x) + msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; + else + msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; + + const unsigned target_cache = + devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE : + devinfo->gen >= 6 ? 
GEN6_SFID_DATAPORT_RENDER_CACHE : + BRW_DATAPORT_READ_TARGET_RENDER_CACHE; + + /* Each of the 8 channel enables is considered for whether each + * dword is written. + */ + brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, send, dst); + brw_set_src0(p, send, header); + if (devinfo->gen < 6) + brw_inst_set_cond_modifier(devinfo, send, inst->base_mrf); + brw_set_dp_read_message(p, send, + brw_scratch_surface_idx(p), + BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD, + msg_type, target_cache, + 2, /* mlen */ + true, /* header_present */ + 1 /* rlen */); +} + +static void +generate_scratch_write(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg src, + struct brw_reg index) +{ + const struct gen_device_info *devinfo = p->devinfo; + const unsigned target_cache = + (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE : + devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE : + BRW_DATAPORT_READ_TARGET_RENDER_CACHE); + struct brw_reg header = brw_vec8_grf(0, 0); + bool write_commit; + + /* If the instruction is predicated, we'll predicate the send, not + * the header setup. + */ + brw_set_default_predicate_control(p, false); + + gen6_resolve_implied_move(p, &header, inst->base_mrf); + + generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1), + index); + + brw_MOV(p, + retype(brw_message_reg(inst->base_mrf + 2), BRW_REGISTER_TYPE_D), + retype(src, BRW_REGISTER_TYPE_D)); + + uint32_t msg_type; + + if (devinfo->gen >= 7) + msg_type = GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE; + else if (devinfo->gen == 6) + msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE; + else + msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE; + + brw_set_default_predicate_control(p, inst->predicate); + + /* Pre-gen6, we have to specify write commits to ensure ordering + * between reads and writes within a thread. Afterwards, that's + * guaranteed and write commits only matter for inter-thread + * synchronization. 
+ */ + if (devinfo->gen >= 6) { + write_commit = false; + } else { + /* The visitor set up our destination register to be g0. This + * means that when the next read comes along, we will end up + * reading from g0 and causing a block on the write commit. For + * write-after-read, we are relying on the value of the previous + * read being used (and thus blocking on completion) before our + * write is executed. This means we have to be careful in + * instruction scheduling to not violate this assumption. + */ + write_commit = true; + } + + /* Each of the 8 channel enables is considered for whether each + * dword is written. + */ + brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, send, dst); + brw_set_src0(p, send, header); + if (devinfo->gen < 6) + brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf); + brw_set_dp_write_message(p, send, + brw_scratch_surface_idx(p), + BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD, + msg_type, + target_cache, + 3, /* mlen */ + true, /* header present */ + false, /* not a render target write */ + write_commit, /* rlen */ + false, /* eot */ + write_commit); +} + +static void +generate_pull_constant_load(struct brw_codegen *p, + struct brw_vue_prog_data *prog_data, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg index, + struct brw_reg offset) +{ + const struct gen_device_info *devinfo = p->devinfo; + const unsigned target_cache = + (devinfo->gen >= 6 ? 
GEN6_SFID_DATAPORT_SAMPLER_CACHE : + BRW_DATAPORT_READ_TARGET_DATA_CACHE); + assert(index.file == BRW_IMMEDIATE_VALUE && + index.type == BRW_REGISTER_TYPE_UD); + uint32_t surf_index = index.ud; + + struct brw_reg header = brw_vec8_grf(0, 0); + + gen6_resolve_implied_move(p, &header, inst->base_mrf); + + if (devinfo->gen >= 6) { + if (offset.file == BRW_IMMEDIATE_VALUE) { + brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), + BRW_REGISTER_TYPE_D), + brw_imm_d(offset.ud >> 4)); + } else { + brw_SHR(p, retype(brw_message_reg(inst->base_mrf + 1), + BRW_REGISTER_TYPE_D), + offset, brw_imm_d(4)); + } + } else { + brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), + BRW_REGISTER_TYPE_D), + offset); + } + + uint32_t msg_type; + + if (devinfo->gen >= 6) + msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; + else if (devinfo->gen == 5 || devinfo->is_g4x) + msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; + else + msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; + + /* Each of the 8 channel enables is considered for whether each + * dword is written. 
+ */ + brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, send, dst); + brw_set_src0(p, send, header); + if (devinfo->gen < 6) + brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf); + brw_set_dp_read_message(p, send, + surf_index, + BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD, + msg_type, + target_cache, + 2, /* mlen */ + true, /* header_present */ + 1 /* rlen */); +} + +static void +generate_get_buffer_size(struct brw_codegen *p, + struct brw_vue_prog_data *prog_data, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg src, + struct brw_reg surf_index) +{ + assert(p->devinfo->gen >= 7); + assert(surf_index.type == BRW_REGISTER_TYPE_UD && + surf_index.file == BRW_IMMEDIATE_VALUE); + + brw_SAMPLE(p, + dst, + inst->base_mrf, + src, + surf_index.ud, + 0, + GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO, + 1, /* response length */ + inst->mlen, + inst->header_size > 0, + BRW_SAMPLER_SIMD_MODE_SIMD4X2, + BRW_SAMPLER_RETURN_FORMAT_SINT32); + + brw_mark_surface_used(&prog_data->base, surf_index.ud); +} + +static void +generate_pull_constant_load_gen7(struct brw_codegen *p, + struct brw_vue_prog_data *prog_data, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg surf_index, + struct brw_reg offset) +{ + assert(surf_index.type == BRW_REGISTER_TYPE_UD); + + if (surf_index.file == BRW_IMMEDIATE_VALUE) { + + brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, insn, dst); + brw_set_src0(p, insn, offset); + brw_set_sampler_message(p, insn, + surf_index.ud, + 0, /* LD message ignores sampler unit */ + GEN5_SAMPLER_MESSAGE_SAMPLE_LD, + 1, /* rlen */ + inst->mlen, + inst->header_size != 0, + BRW_SAMPLER_SIMD_MODE_SIMD4X2, + 0); + + brw_mark_surface_used(&prog_data->base, surf_index.ud); + + } else { + + struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); + + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_access_mode(p, BRW_ALIGN_1); + + /* a0.0 = 
surf_index & 0xff */ + brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND); + brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1); + brw_set_dest(p, insn_and, addr); + brw_set_src0(p, insn_and, vec1(retype(surf_index, BRW_REGISTER_TYPE_UD))); + brw_set_src1(p, insn_and, brw_imm_ud(0x0ff)); + + brw_pop_insn_state(p); + + /* dst = send(offset, a0.0 | <descriptor>) */ + brw_inst *insn = brw_send_indirect_message( + p, BRW_SFID_SAMPLER, dst, offset, addr); + brw_set_sampler_message(p, insn, + 0 /* surface */, + 0 /* sampler */, + GEN5_SAMPLER_MESSAGE_SAMPLE_LD, + 1 /* rlen */, + inst->mlen, + inst->header_size != 0, + BRW_SAMPLER_SIMD_MODE_SIMD4X2, + 0); + } +} + +static void +generate_set_simd4x2_header_gen9(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst) +{ + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_MOV(p, vec8(dst), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_MOV(p, get_element_ud(dst, 2), + brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2)); + + brw_pop_insn_state(p); +} + +static void +generate_mov_indirect(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst, struct brw_reg reg, + struct brw_reg indirect, struct brw_reg length) +{ + assert(indirect.type == BRW_REGISTER_TYPE_UD); + assert(p->devinfo->gen >= 6); + + unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr * (REG_SIZE / 2); + + /* This instruction acts in align1 mode */ + assert(dst.writemask == WRITEMASK_XYZW); + + if (indirect.file == BRW_IMMEDIATE_VALUE) { + imm_byte_offset += indirect.ud; + + reg.nr = imm_byte_offset / REG_SIZE; + reg.subnr = (imm_byte_offset / (REG_SIZE / 2)) % 2; + unsigned shift = (imm_byte_offset / 4) % 4; + reg.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift); + + brw_MOV(p, dst, reg); + } else { + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); 
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE); + + struct brw_reg addr = vec8(brw_address_reg(0)); + + /* We need to move the indirect value into the address register. In + * order to make things make some sense, we want to respect at least the + * X component of the swizzle. In order to do that, we need to convert + * the subnr (probably 0) to an align1 subnr and add in the swizzle. + */ + assert(brw_is_single_value_swizzle(indirect.swizzle)); + indirect.subnr = (indirect.subnr * 4 + BRW_GET_SWZ(indirect.swizzle, 0)); + + /* We then use a region of <8,4,0>:uw to pick off the first 2 bytes of + * the indirect and splat it out to all four channels of the given half + * of a0. + */ + indirect.subnr *= 2; + indirect = stride(retype(indirect, BRW_REGISTER_TYPE_UW), 8, 4, 0); + brw_ADD(p, addr, indirect, brw_imm_uw(imm_byte_offset)); + + /* Now we need to incorporate the swizzle from the source register */ + if (reg.swizzle != BRW_SWIZZLE_XXXX) { + uint32_t uv_swiz = BRW_GET_SWZ(reg.swizzle, 0) << 2 | + BRW_GET_SWZ(reg.swizzle, 1) << 6 | + BRW_GET_SWZ(reg.swizzle, 2) << 10 | + BRW_GET_SWZ(reg.swizzle, 3) << 14; + uv_swiz |= uv_swiz << 16; + + brw_ADD(p, addr, addr, brw_imm_uv(uv_swiz)); + } + + brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), reg.type)); + + brw_pop_insn_state(p); + } +} + +static void +generate_code(struct brw_codegen *p, + const struct brw_compiler *compiler, + void *log_data, + const nir_shader *nir, + struct brw_vue_prog_data *prog_data, + const struct cfg_t *cfg) +{ + const struct gen_device_info *devinfo = p->devinfo; + const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->stage); + bool debug_flag = INTEL_DEBUG & + intel_debug_flag_for_shader_stage(nir->stage); + struct annotation_info annotation; + memset(&annotation, 0, sizeof(annotation)); + int spill_count = 0, fill_count = 0; + int loop_count = 0; + + foreach_block_and_inst (block, vec4_instruction, inst, cfg) { + struct brw_reg src[3], dst; + + if (unlikely(debug_flag)) + 
annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset); + + for (unsigned int i = 0; i < 3; i++) { + src[i] = inst->src[i].as_brw_reg(); + } + dst = inst->dst.as_brw_reg(); + + brw_set_default_predicate_control(p, inst->predicate); + brw_set_default_predicate_inverse(p, inst->predicate_inverse); + brw_set_default_flag_reg(p, 0, inst->flag_subreg); + brw_set_default_saturate(p, inst->saturate); + brw_set_default_mask_control(p, inst->force_writemask_all); + brw_set_default_acc_write_control(p, inst->writes_accumulator); + brw_set_default_exec_size(p, cvt(inst->exec_size) - 1); + + assert(inst->group % inst->exec_size == 0); + assert(inst->group % 8 == 0 || + inst->dst.type == BRW_REGISTER_TYPE_DF || + inst->src[0].type == BRW_REGISTER_TYPE_DF || + inst->src[1].type == BRW_REGISTER_TYPE_DF || + inst->src[2].type == BRW_REGISTER_TYPE_DF); + if (!inst->force_writemask_all) + brw_set_default_group(p, inst->group); + + assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen)); + assert(inst->mlen <= BRW_MAX_MSG_LENGTH); + + unsigned pre_emit_nr_insn = p->nr_insn; + + switch (inst->opcode) { + case VEC4_OPCODE_UNPACK_UNIFORM: + case BRW_OPCODE_MOV: + brw_MOV(p, dst, src[0]); + break; + case BRW_OPCODE_ADD: + brw_ADD(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_MUL: + brw_MUL(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_MACH: + brw_MACH(p, dst, src[0], src[1]); + break; + + case BRW_OPCODE_MAD: + assert(devinfo->gen >= 6); + brw_MAD(p, dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_FRC: + brw_FRC(p, dst, src[0]); + break; + case BRW_OPCODE_RNDD: + brw_RNDD(p, dst, src[0]); + break; + case BRW_OPCODE_RNDE: + brw_RNDE(p, dst, src[0]); + break; + case BRW_OPCODE_RNDZ: + brw_RNDZ(p, dst, src[0]); + break; + + case BRW_OPCODE_AND: + brw_AND(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_OR: + brw_OR(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_XOR: + brw_XOR(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_NOT: + 
brw_NOT(p, dst, src[0]); + break; + case BRW_OPCODE_ASR: + brw_ASR(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_SHR: + brw_SHR(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_SHL: + brw_SHL(p, dst, src[0], src[1]); + break; + + case BRW_OPCODE_CMP: + brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); + break; + case BRW_OPCODE_SEL: + brw_SEL(p, dst, src[0], src[1]); + break; + + case BRW_OPCODE_DPH: + brw_DPH(p, dst, src[0], src[1]); + break; + + case BRW_OPCODE_DP4: + brw_DP4(p, dst, src[0], src[1]); + break; + + case BRW_OPCODE_DP3: + brw_DP3(p, dst, src[0], src[1]); + break; + + case BRW_OPCODE_DP2: + brw_DP2(p, dst, src[0], src[1]); + break; + + case BRW_OPCODE_F32TO16: + assert(devinfo->gen >= 7); + brw_F32TO16(p, dst, src[0]); + break; + + case BRW_OPCODE_F16TO32: + assert(devinfo->gen >= 7); + brw_F16TO32(p, dst, src[0]); + break; + + case BRW_OPCODE_LRP: + assert(devinfo->gen >= 6); + brw_LRP(p, dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_BFREV: + assert(devinfo->gen >= 7); + /* BFREV only supports UD type for src and dst. */ + brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD), + retype(src[0], BRW_REGISTER_TYPE_UD)); + break; + case BRW_OPCODE_FBH: + assert(devinfo->gen >= 7); + /* FBH only supports UD type for dst. */ + brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]); + break; + case BRW_OPCODE_FBL: + assert(devinfo->gen >= 7); + /* FBL only supports UD type for dst. */ + brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]); + break; + case BRW_OPCODE_LZD: + brw_LZD(p, dst, src[0]); + break; + case BRW_OPCODE_CBIT: + assert(devinfo->gen >= 7); + /* CBIT only supports UD type for dst. 
*/ + brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]); + break; + case BRW_OPCODE_ADDC: + assert(devinfo->gen >= 7); + brw_ADDC(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_SUBB: + assert(devinfo->gen >= 7); + brw_SUBB(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_MAC: + brw_MAC(p, dst, src[0], src[1]); + break; + + case BRW_OPCODE_BFE: + assert(devinfo->gen >= 7); + brw_BFE(p, dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_BFI1: + assert(devinfo->gen >= 7); + brw_BFI1(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_BFI2: + assert(devinfo->gen >= 7); + brw_BFI2(p, dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_IF: + if (!inst->src[0].is_null()) { + /* The instruction has an embedded compare (only allowed on gen6) */ + assert(devinfo->gen == 6); + gen6_IF(p, inst->conditional_mod, src[0], src[1]); + } else { + brw_inst *if_inst = brw_IF(p, BRW_EXECUTE_8); + brw_inst_set_pred_control(p->devinfo, if_inst, inst->predicate); + } + break; + + case BRW_OPCODE_ELSE: + brw_ELSE(p); + break; + case BRW_OPCODE_ENDIF: + brw_ENDIF(p); + break; + + case BRW_OPCODE_DO: + brw_DO(p, BRW_EXECUTE_8); + break; + + case BRW_OPCODE_BREAK: + brw_BREAK(p); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + break; + case BRW_OPCODE_CONTINUE: + brw_CONT(p); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + break; + + case BRW_OPCODE_WHILE: + brw_WHILE(p); + loop_count++; + break; + + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); + if (devinfo->gen >= 7) { + gen6_math(p, dst, brw_math_function(inst->opcode), src[0], + brw_null_reg()); + } else if (devinfo->gen == 6) { + generate_math_gen6(p, inst, dst, src[0], brw_null_reg()); + } else { + generate_math1_gen4(p, inst, dst, src[0]); + } + break; + + case SHADER_OPCODE_POW: + case 
SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); + if (devinfo->gen >= 7) { + gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]); + } else if (devinfo->gen == 6) { + generate_math_gen6(p, inst, dst, src[0], src[1]); + } else { + generate_math2_gen4(p, inst, dst, src[0], src[1]); + } + break; + + case SHADER_OPCODE_TEX: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_CMS_W: + case SHADER_OPCODE_TXF_MCS: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_TG4_OFFSET: + case SHADER_OPCODE_SAMPLEINFO: + generate_tex(p, prog_data, nir->stage, + inst, dst, src[0], src[1], src[2]); + break; + + case VS_OPCODE_URB_WRITE: + generate_vs_urb_write(p, inst); + break; + + case SHADER_OPCODE_GEN4_SCRATCH_READ: + generate_scratch_read(p, inst, dst, src[0]); + fill_count++; + break; + + case SHADER_OPCODE_GEN4_SCRATCH_WRITE: + generate_scratch_write(p, inst, dst, src[0], src[1]); + spill_count++; + break; + + case VS_OPCODE_PULL_CONSTANT_LOAD: + generate_pull_constant_load(p, prog_data, inst, dst, src[0], src[1]); + break; + + case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: + generate_pull_constant_load_gen7(p, prog_data, inst, dst, src[0], src[1]); + break; + + case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9: + generate_set_simd4x2_header_gen9(p, inst, dst); + break; + + + case VS_OPCODE_GET_BUFFER_SIZE: + generate_get_buffer_size(p, prog_data, inst, dst, src[0], src[1]); + break; + + case GS_OPCODE_URB_WRITE: + generate_gs_urb_write(p, inst); + break; + + case GS_OPCODE_URB_WRITE_ALLOCATE: + generate_gs_urb_write_allocate(p, inst); + break; + + case GS_OPCODE_SVB_WRITE: + generate_gs_svb_write(p, prog_data, inst, dst, src[0], src[1]); + break; + + case GS_OPCODE_SVB_SET_DST_INDEX: + generate_gs_svb_set_destination_index(p, inst, dst, src[0]); + break; + + case GS_OPCODE_THREAD_END: + 
generate_gs_thread_end(p, inst); + break; + + case GS_OPCODE_SET_WRITE_OFFSET: + generate_gs_set_write_offset(p, dst, src[0], src[1]); + break; + + case GS_OPCODE_SET_VERTEX_COUNT: + generate_gs_set_vertex_count(p, dst, src[0]); + break; + + case GS_OPCODE_FF_SYNC: + generate_gs_ff_sync(p, inst, dst, src[0], src[1]); + break; + + case GS_OPCODE_FF_SYNC_SET_PRIMITIVES: + generate_gs_ff_sync_set_primitives(p, dst, src[0], src[1], src[2]); + break; + + case GS_OPCODE_SET_PRIMITIVE_ID: + generate_gs_set_primitive_id(p, dst); + break; + + case GS_OPCODE_SET_DWORD_2: + generate_gs_set_dword_2(p, dst, src[0]); + break; + + case GS_OPCODE_PREPARE_CHANNEL_MASKS: + generate_gs_prepare_channel_masks(p, dst); + break; + + case GS_OPCODE_SET_CHANNEL_MASKS: + generate_gs_set_channel_masks(p, dst, src[0]); + break; + + case GS_OPCODE_GET_INSTANCE_ID: + generate_gs_get_instance_id(p, dst); + break; + + case SHADER_OPCODE_SHADER_TIME_ADD: + brw_shader_time_add(p, src[0], + prog_data->base.binding_table.shader_time_start); + brw_mark_surface_used(&prog_data->base, + prog_data->base.binding_table.shader_time_start); + break; + + case SHADER_OPCODE_UNTYPED_ATOMIC: + assert(src[2].file == BRW_IMMEDIATE_VALUE); + brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen, + !inst->dst.is_null()); + break; + + case SHADER_OPCODE_UNTYPED_SURFACE_READ: + assert(src[2].file == BRW_IMMEDIATE_VALUE); + brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen, + src[2].ud); + break; + + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: + assert(src[2].file == BRW_IMMEDIATE_VALUE); + brw_untyped_surface_write(p, src[0], src[1], inst->mlen, + src[2].ud); + break; + + case SHADER_OPCODE_TYPED_ATOMIC: + assert(src[2].file == BRW_IMMEDIATE_VALUE); + brw_typed_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen, + !inst->dst.is_null()); + break; + + case SHADER_OPCODE_TYPED_SURFACE_READ: + assert(src[2].file == BRW_IMMEDIATE_VALUE); + brw_typed_surface_read(p, dst, src[0], src[1], inst->mlen, + 
src[2].ud); + break; + + case SHADER_OPCODE_TYPED_SURFACE_WRITE: + assert(src[2].file == BRW_IMMEDIATE_VALUE); + brw_typed_surface_write(p, src[0], src[1], inst->mlen, + src[2].ud); + break; + + case SHADER_OPCODE_MEMORY_FENCE: + brw_memory_fence(p, dst); + break; + + case SHADER_OPCODE_FIND_LIVE_CHANNEL: { + const struct brw_reg mask = + brw_stage_has_packed_dispatch(devinfo, nir->stage, + &prog_data->base) ? brw_imm_ud(~0u) : + brw_dmask_reg(); + brw_find_live_channel(p, dst, mask); + break; + } + + case SHADER_OPCODE_BROADCAST: + assert(inst->force_writemask_all); + brw_broadcast(p, dst, src[0], src[1]); + break; + + case VS_OPCODE_UNPACK_FLAGS_SIMD4X2: + generate_unpack_flags(p, dst); + break; + + case VEC4_OPCODE_MOV_BYTES: { + /* Moves the low byte from each channel, using an Align1 access mode + * and a <4,1,0> source region. + */ + assert(src[0].type == BRW_REGISTER_TYPE_UB || + src[0].type == BRW_REGISTER_TYPE_B); + + brw_set_default_access_mode(p, BRW_ALIGN_1); + src[0].vstride = BRW_VERTICAL_STRIDE_4; + src[0].width = BRW_WIDTH_1; + src[0].hstride = BRW_HORIZONTAL_STRIDE_0; + brw_MOV(p, dst, src[0]); + brw_set_default_access_mode(p, BRW_ALIGN_16); + break; + } + + case VEC4_OPCODE_FROM_DOUBLE: { + assert(type_sz(src[0].type) == 8); + assert(type_sz(dst.type) == 4); + + brw_set_default_access_mode(p, BRW_ALIGN_1); + + dst.hstride = BRW_HORIZONTAL_STRIDE_2; + dst.width = BRW_WIDTH_4; + src[0].vstride = BRW_VERTICAL_STRIDE_4; + src[0].width = BRW_WIDTH_4; + brw_MOV(p, dst, src[0]); + + struct brw_reg dst_as_src = dst; + dst.hstride = BRW_HORIZONTAL_STRIDE_1; + dst.width = BRW_WIDTH_8; + brw_MOV(p, dst, dst_as_src); + + brw_set_default_access_mode(p, BRW_ALIGN_16); + break; + } + + case VEC4_OPCODE_TO_DOUBLE: { + assert(type_sz(src[0].type) == 4); + assert(type_sz(dst.type) == 8); + + brw_set_default_access_mode(p, BRW_ALIGN_1); + + struct brw_reg tmp = retype(dst, src[0].type); + tmp.hstride = BRW_HORIZONTAL_STRIDE_2; + tmp.width = BRW_WIDTH_4; + 
src[0].vstride = BRW_VERTICAL_STRIDE_4; + src[0].hstride = BRW_HORIZONTAL_STRIDE_1; + src[0].width = BRW_WIDTH_4; + brw_MOV(p, tmp, src[0]); + + tmp.vstride = BRW_VERTICAL_STRIDE_8; + tmp.hstride = BRW_HORIZONTAL_STRIDE_2; + tmp.width = BRW_WIDTH_4; + brw_MOV(p, dst, tmp); + + brw_set_default_access_mode(p, BRW_ALIGN_16); + break; + } + + case VEC4_OPCODE_PICK_LOW_32BIT: + case VEC4_OPCODE_PICK_HIGH_32BIT: { + /* Stores the low/high 32-bit of each 64-bit element in src[0] into + * dst using ALIGN1 mode and a <8,4,2>:UD region on the source. + */ + assert(type_sz(src[0].type) == 8); + assert(type_sz(dst.type) == 4); + + brw_set_default_access_mode(p, BRW_ALIGN_1); + + dst = retype(dst, BRW_REGISTER_TYPE_UD); + dst.hstride = BRW_HORIZONTAL_STRIDE_1; + + src[0] = retype(src[0], BRW_REGISTER_TYPE_UD); + if (inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT) + src[0] = suboffset(src[0], 1); + src[0].vstride = BRW_VERTICAL_STRIDE_8; + src[0].width = BRW_WIDTH_4; + src[0].hstride = BRW_HORIZONTAL_STRIDE_2; + brw_MOV(p, dst, src[0]); + + brw_set_default_access_mode(p, BRW_ALIGN_16); + break; + } + + case VEC4_OPCODE_SET_LOW_32BIT: + case VEC4_OPCODE_SET_HIGH_32BIT: { + /* Reads consecutive 32-bit elements from src[0] and writes + * them to the low/high 32-bit of each 64-bit element in dst. 
+ */ + assert(type_sz(src[0].type) == 4); + assert(type_sz(dst.type) == 8); + + brw_set_default_access_mode(p, BRW_ALIGN_1); + + dst = retype(dst, BRW_REGISTER_TYPE_UD); + if (inst->opcode == VEC4_OPCODE_SET_HIGH_32BIT) + dst = suboffset(dst, 1); + dst.hstride = BRW_HORIZONTAL_STRIDE_2; + + src[0] = retype(src[0], BRW_REGISTER_TYPE_UD); + src[0].vstride = BRW_VERTICAL_STRIDE_4; + src[0].width = BRW_WIDTH_4; + src[0].hstride = BRW_HORIZONTAL_STRIDE_1; + brw_MOV(p, dst, src[0]); + + brw_set_default_access_mode(p, BRW_ALIGN_16); + break; + } + + case VEC4_OPCODE_PACK_BYTES: { + /* Is effectively: + * + * mov(8) dst<16,4,1>:UB src<4,1,0>:UB + * + * but destinations' only regioning is horizontal stride, so instead we + * have to use two instructions: + * + * mov(4) dst<1>:UB src<4,1,0>:UB + * mov(4) dst.16<1>:UB src.16<4,1,0>:UB + * + * where they pack the four bytes from the low and high four DW. + */ + assert(_mesa_is_pow_two(dst.writemask) && + dst.writemask != 0); + unsigned offset = __builtin_ctz(dst.writemask); + + dst.type = BRW_REGISTER_TYPE_UB; + + brw_set_default_access_mode(p, BRW_ALIGN_1); + + src[0].type = BRW_REGISTER_TYPE_UB; + src[0].vstride = BRW_VERTICAL_STRIDE_4; + src[0].width = BRW_WIDTH_1; + src[0].hstride = BRW_HORIZONTAL_STRIDE_0; + dst.subnr = offset * 4; + struct brw_inst *insn = brw_MOV(p, dst, src[0]); + brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4); + brw_inst_set_no_dd_clear(p->devinfo, insn, true); + brw_inst_set_no_dd_check(p->devinfo, insn, inst->no_dd_check); + + src[0].subnr = 16; + dst.subnr = 16 + offset * 4; + insn = brw_MOV(p, dst, src[0]); + brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4); + brw_inst_set_no_dd_clear(p->devinfo, insn, inst->no_dd_clear); + brw_inst_set_no_dd_check(p->devinfo, insn, true); + + brw_set_default_access_mode(p, BRW_ALIGN_16); + break; + } + + case TCS_OPCODE_URB_WRITE: + generate_tcs_urb_write(p, inst, src[0]); + break; + + case VEC4_OPCODE_URB_READ: + generate_vec4_urb_read(p, inst, 
dst, src[0]); + break; + + case TCS_OPCODE_SET_INPUT_URB_OFFSETS: + generate_tcs_input_urb_offsets(p, dst, src[0], src[1]); + break; + + case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: + generate_tcs_output_urb_offsets(p, dst, src[0], src[1]); + break; + + case TCS_OPCODE_GET_INSTANCE_ID: + generate_tcs_get_instance_id(p, dst); + break; + + case TCS_OPCODE_GET_PRIMITIVE_ID: + generate_tcs_get_primitive_id(p, dst); + break; + + case TCS_OPCODE_CREATE_BARRIER_HEADER: + generate_tcs_create_barrier_header(p, prog_data, dst); + break; + + case TES_OPCODE_CREATE_INPUT_READ_HEADER: + generate_tes_create_input_read_header(p, dst); + break; + + case TES_OPCODE_ADD_INDIRECT_URB_OFFSET: + generate_tes_add_indirect_urb_offset(p, dst, src[0], src[1]); + break; + + case TES_OPCODE_GET_PRIMITIVE_ID: + generate_tes_get_primitive_id(p, dst); + break; + + case TCS_OPCODE_SRC0_010_IS_ZERO: + /* If src_reg had stride like fs_reg, we wouldn't need this. */ + brw_MOV(p, brw_null_reg(), stride(src[0], 0, 1, 0)); + break; + + case TCS_OPCODE_RELEASE_INPUT: + generate_tcs_release_input(p, dst, src[0], src[1]); + break; + + case TCS_OPCODE_THREAD_END: + generate_tcs_thread_end(p, inst); + break; + + case SHADER_OPCODE_BARRIER: + brw_barrier(p, src[0]); + brw_WAIT(p); + break; + + case SHADER_OPCODE_MOV_INDIRECT: + generate_mov_indirect(p, inst, dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_DIM: + assert(devinfo->is_haswell); + assert(src[0].type == BRW_REGISTER_TYPE_DF); + assert(dst.type == BRW_REGISTER_TYPE_DF); + brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F)); + break; + + default: + unreachable("Unsupported opcode"); + } + + if (inst->opcode == VEC4_OPCODE_PACK_BYTES) { + /* Handled dependency hints in the generator. 
*/ + + assert(!inst->conditional_mod); + } else if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) { + assert(p->nr_insn == pre_emit_nr_insn + 1 || + !"conditional_mod, no_dd_check, or no_dd_clear set for IR " + "emitting more than 1 instruction"); + + brw_inst *last = &p->store[pre_emit_nr_insn]; + + if (inst->conditional_mod) + brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod); + brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear); + brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check); + } + } + + brw_set_uip_jip(p, 0); + annotation_finalize(&annotation, p->next_insn_offset); + +#ifndef NDEBUG + bool validated = brw_validate_instructions(p, 0, &annotation); +#else + if (unlikely(debug_flag)) + brw_validate_instructions(p, 0, &annotation); +#endif + + int before_size = p->next_insn_offset; + brw_compact_instructions(p, 0, annotation.ann_count, annotation.ann); + int after_size = p->next_insn_offset; + + if (unlikely(debug_flag)) { + fprintf(stderr, "Native code for %s %s shader %s:\n", + nir->info->label ? nir->info->label : "unnamed", + _mesa_shader_stage_to_string(nir->stage), nir->info->name); + + fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d " + "spills:fills. 
Compacted %d to %d bytes (%.0f%%)\n", + stage_abbrev, before_size / 16, loop_count, cfg->cycle_count, + spill_count, fill_count, before_size, after_size, + 100.0f * (before_size - after_size) / before_size); + + dump_assembly(p->store, annotation.ann_count, annotation.ann, + p->devinfo); + ralloc_free(annotation.mem_ctx); + } + assert(validated); + + compiler->shader_debug_log(log_data, + "%s vec4 shader: %d inst, %d loops, %u cycles, " + "%d:%d spills:fills, compacted %d to %d bytes.", + stage_abbrev, before_size / 16, + loop_count, cfg->cycle_count, spill_count, + fill_count, before_size, after_size); + +} + +extern "C" const unsigned * +brw_vec4_generate_assembly(const struct brw_compiler *compiler, + void *log_data, + void *mem_ctx, + const nir_shader *nir, + struct brw_vue_prog_data *prog_data, + const struct cfg_t *cfg, + unsigned *out_assembly_size) +{ + struct brw_codegen *p = rzalloc(mem_ctx, struct brw_codegen); + brw_init_codegen(compiler->devinfo, p, mem_ctx); + brw_set_default_access_mode(p, BRW_ALIGN_16); + + generate_code(p, compiler, log_data, nir, prog_data, cfg); + + return brw_get_program(p, out_assembly_size); +} diff --git a/src/intel/compiler/brw_vec4_gs_nir.cpp b/src/intel/compiler/brw_vec4_gs_nir.cpp new file mode 100644 index 00000000000..ed8c03b0594 --- /dev/null +++ b/src/intel/compiler/brw_vec4_gs_nir.cpp @@ -0,0 +1,145 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all 
copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_vec4_gs_visitor.h" + +namespace brw { + +void +vec4_gs_visitor::nir_setup_inputs() +{ +} + +void +vec4_gs_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr) +{ + dst_reg *reg; + + switch (instr->intrinsic) { + case nir_intrinsic_load_primitive_id: + /* We'll just read g1 directly; don't create a temporary. */ + break; + + case nir_intrinsic_load_invocation_id: + reg = &this->nir_system_values[SYSTEM_VALUE_INVOCATION_ID]; + if (reg->file == BAD_FILE) + *reg = *this->make_reg_for_system_value(SYSTEM_VALUE_INVOCATION_ID); + break; + + default: + vec4_visitor::nir_setup_system_value_intrinsic(instr); + } + +} + +void +vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) +{ + dst_reg dest; + src_reg src; + + switch (instr->intrinsic) { + case nir_intrinsic_load_per_vertex_input: { + /* The EmitNoIndirectInput flag guarantees our vertex index will + * be constant. We should handle indirects someday. 
+ */ + nir_const_value *vertex = nir_src_as_const_value(instr->src[0]); + nir_const_value *offset_reg = nir_src_as_const_value(instr->src[1]); + + if (nir_dest_bit_size(instr->dest) == 64) { + src = src_reg(ATTR, BRW_VARYING_SLOT_COUNT * vertex->u32[0] + + instr->const_index[0] + offset_reg->u32[0], + glsl_type::dvec4_type); + + dst_reg tmp = dst_reg(this, glsl_type::dvec4_type); + shuffle_64bit_data(tmp, src, false); + + src = src_reg(tmp); + src.swizzle = BRW_SWZ_COMP_INPUT(nir_intrinsic_component(instr) / 2); + + /* Write to dst reg taking into account original writemask */ + dest = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_DF); + dest.writemask = brw_writemask_for_size(instr->num_components); + emit(MOV(dest, src)); + } else { + /* Make up a type...we have no way of knowing... */ + const glsl_type *const type = glsl_type::ivec(instr->num_components); + + src = src_reg(ATTR, BRW_VARYING_SLOT_COUNT * vertex->u32[0] + + instr->const_index[0] + offset_reg->u32[0], + type); + src.swizzle = BRW_SWZ_COMP_INPUT(nir_intrinsic_component(instr)); + + /* gl_PointSize is passed in the .w component of the VUE header */ + if (instr->const_index[0] == VARYING_SLOT_PSIZ) + src.swizzle = BRW_SWIZZLE_WWWW; + + dest = get_nir_dest(instr->dest, src.type); + dest.writemask = brw_writemask_for_size(instr->num_components); + emit(MOV(dest, src)); + } + break; + } + + case nir_intrinsic_load_input: + unreachable("nir_lower_io should have produced per_vertex intrinsics"); + + case nir_intrinsic_emit_vertex_with_counter: { + this->vertex_count = + retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD); + int stream_id = instr->const_index[0]; + gs_emit_vertex(stream_id); + break; + } + + case nir_intrinsic_end_primitive_with_counter: + this->vertex_count = + retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD); + gs_end_primitive(); + break; + + case nir_intrinsic_set_vertex_count: + this->vertex_count = + retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD); + 
break; + + case nir_intrinsic_load_primitive_id: + assert(gs_prog_data->include_primitive_id); + dest = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D); + emit(MOV(dest, retype(brw_vec4_grf(1, 0), BRW_REGISTER_TYPE_D))); + break; + + case nir_intrinsic_load_invocation_id: { + src_reg invocation_id = + src_reg(nir_system_values[SYSTEM_VALUE_INVOCATION_ID]); + assert(invocation_id.file != BAD_FILE); + dest = get_nir_dest(instr->dest, invocation_id.type); + emit(MOV(dest, invocation_id)); + break; + } + + default: + vec4_visitor::nir_emit_intrinsic(instr); + } +} +} diff --git a/src/intel/compiler/brw_vec4_gs_visitor.cpp b/src/intel/compiler/brw_vec4_gs_visitor.cpp new file mode 100644 index 00000000000..4a8b5be30e1 --- /dev/null +++ b/src/intel/compiler/brw_vec4_gs_visitor.cpp @@ -0,0 +1,933 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +/** + * \file brw_vec4_gs_visitor.cpp + * + * Geometry-shader-specific code derived from the vec4_visitor class. + */ + +#include "brw_vec4_gs_visitor.h" +#include "gen6_gs_visitor.h" +#include "brw_fs.h" +#include "brw_nir.h" +#include "common/gen_debug.h" + +namespace brw { + +vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler, + void *log_data, + struct brw_gs_compile *c, + struct brw_gs_prog_data *prog_data, + const nir_shader *shader, + void *mem_ctx, + bool no_spills, + int shader_time_index) + : vec4_visitor(compiler, log_data, &c->key.tex, + &prog_data->base, shader, mem_ctx, + no_spills, shader_time_index), + c(c), + gs_prog_data(prog_data) +{ +} + + +dst_reg * +vec4_gs_visitor::make_reg_for_system_value(int location) +{ + dst_reg *reg = new(mem_ctx) dst_reg(this, glsl_type::int_type); + + switch (location) { + case SYSTEM_VALUE_INVOCATION_ID: + this->current_annotation = "initialize gl_InvocationID"; + if (gs_prog_data->invocations > 1) + emit(GS_OPCODE_GET_INSTANCE_ID, *reg); + else + emit(MOV(*reg, brw_imm_ud(0))); + break; + default: + unreachable("not reached"); + } + + return reg; +} + + +int +vec4_gs_visitor::setup_varying_inputs(int payload_reg, int *attribute_map, + int attributes_per_reg) +{ + /* For geometry shaders there are N copies of the input attributes, where N + * is the number of input vertices. attribute_map[BRW_VARYING_SLOT_COUNT * + * i + j] represents attribute j for vertex i. + * + * Note that GS inputs are read from the VUE 256 bits (2 vec4's) at a time, + * so the total number of input slots that will be delivered to the GS (and + * thus the stride of the input arrays) is urb_read_length * 2. 
+ */ + const unsigned num_input_vertices = nir->info->gs.vertices_in; + assert(num_input_vertices <= MAX_GS_INPUT_VERTICES); + unsigned input_array_stride = prog_data->urb_read_length * 2; + + for (int slot = 0; slot < c->input_vue_map.num_slots; slot++) { + int varying = c->input_vue_map.slot_to_varying[slot]; + for (unsigned vertex = 0; vertex < num_input_vertices; vertex++) { + attribute_map[BRW_VARYING_SLOT_COUNT * vertex + varying] = + attributes_per_reg * payload_reg + input_array_stride * vertex + + slot; + } + } + + int regs_used = ALIGN(input_array_stride * num_input_vertices, + attributes_per_reg) / attributes_per_reg; + return payload_reg + regs_used; +} + + +void +vec4_gs_visitor::setup_payload() +{ + int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES]; + + /* If we are in dual instanced or single mode, then attributes are going + * to be interleaved, so one register contains two attribute slots. + */ + int attributes_per_reg = + prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2; + + /* If a geometry shader tries to read from an input that wasn't written by + * the vertex shader, that produces undefined results, but it shouldn't + * crash anything. So initialize attribute_map to zeros--that ensures that + * these undefined results are read from r0. + */ + memset(attribute_map, 0, sizeof(attribute_map)); + + int reg = 0; + + /* The payload always contains important data in r0, which contains + * the URB handles that are passed on to the URB write at the end + * of the thread. + */ + reg++; + + /* If the shader uses gl_PrimitiveIDIn, that goes in r1. 
*/ + if (gs_prog_data->include_primitive_id) + attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg++; + + reg = setup_uniforms(reg); + + reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg); + + lower_attributes_to_hw_regs(attribute_map, attributes_per_reg > 1); + + this->first_non_payload_grf = reg; +} + + +void +vec4_gs_visitor::emit_prolog() +{ + /* In vertex shaders, r0.2 is guaranteed to be initialized to zero. In + * geometry shaders, it isn't (it contains a bunch of information we don't + * need, like the input primitive type). We need r0.2 to be zero in order + * to build scratch read/write messages correctly (otherwise this value + * will be interpreted as a global offset, causing us to do our scratch + * reads/writes to garbage memory). So just set it to zero at the top of + * the shader. + */ + this->current_annotation = "clear r0.2"; + dst_reg r0(retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD)); + vec4_instruction *inst = emit(GS_OPCODE_SET_DWORD_2, r0, brw_imm_ud(0u)); + inst->force_writemask_all = true; + + /* Create a virtual register to hold the vertex count */ + this->vertex_count = src_reg(this, glsl_type::uint_type); + + /* Initialize the vertex_count register to 0 */ + this->current_annotation = "initialize vertex_count"; + inst = emit(MOV(dst_reg(this->vertex_count), brw_imm_ud(0u))); + inst->force_writemask_all = true; + + if (c->control_data_header_size_bits > 0) { + /* Create a virtual register to hold the current set of control data + * bits. + */ + this->control_data_bits = src_reg(this, glsl_type::uint_type); + + /* If we're outputting more than 32 control data bits, then EmitVertex() + * will set control_data_bits to 0 after emitting the first vertex. + * Otherwise, we need to initialize it to 0 here. 
+ */ + if (c->control_data_header_size_bits <= 32) { + this->current_annotation = "initialize control data bits"; + inst = emit(MOV(dst_reg(this->control_data_bits), brw_imm_ud(0u))); + inst->force_writemask_all = true; + } + } + + this->current_annotation = NULL; +} + +void +vec4_gs_visitor::emit_thread_end() +{ + if (c->control_data_header_size_bits > 0) { + /* During shader execution, we only ever call emit_control_data_bits() + * just prior to outputting a vertex. Therefore, the control data bits + * corresponding to the most recently output vertex still need to be + * emitted. + */ + current_annotation = "thread end: emit control data bits"; + emit_control_data_bits(); + } + + /* MRF 0 is reserved for the debugger, so start with message header + * in MRF 1. + */ + int base_mrf = 1; + + bool static_vertex_count = gs_prog_data->static_vertex_count != -1; + + /* If the previous instruction was a URB write, we don't need to issue + * a second one - we can just set the EOT bit on the previous write. + * + * Skip this on Gen8+ unless there's a static vertex count, as we also + * need to write the vertex count out, and combining the two may not be + * possible (or at least not straightforward). 
+ */ + vec4_instruction *last = (vec4_instruction *) instructions.get_tail(); + if (last && last->opcode == GS_OPCODE_URB_WRITE && + !(INTEL_DEBUG & DEBUG_SHADER_TIME) && + devinfo->gen >= 8 && static_vertex_count) { + last->urb_write_flags = BRW_URB_WRITE_EOT | last->urb_write_flags; + return; + } + + current_annotation = "thread end"; + dst_reg mrf_reg(MRF, base_mrf); + src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + vec4_instruction *inst = emit(MOV(mrf_reg, r0)); + inst->force_writemask_all = true; + if (devinfo->gen < 8 || !static_vertex_count) + emit(GS_OPCODE_SET_VERTEX_COUNT, mrf_reg, this->vertex_count); + if (INTEL_DEBUG & DEBUG_SHADER_TIME) + emit_shader_time_end(); + inst = emit(GS_OPCODE_THREAD_END); + inst->base_mrf = base_mrf; + inst->mlen = devinfo->gen >= 8 && !static_vertex_count ? 2 : 1; +} + + +void +vec4_gs_visitor::emit_urb_write_header(int mrf) +{ + /* The SEND instruction that writes the vertex data to the VUE will use + * per_slot_offset=true, which means that DWORDs 3 and 4 of the message + * header specify an offset (in multiples of 256 bits) into the URB entry + * at which the write should take place. + * + * So we have to prepare a message header with the appropriate offset + * values. + */ + dst_reg mrf_reg(MRF, mrf); + src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + this->current_annotation = "URB write header"; + vec4_instruction *inst = emit(MOV(mrf_reg, r0)); + inst->force_writemask_all = true; + emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, this->vertex_count, + brw_imm_ud(gs_prog_data->output_vertex_size_hwords)); +} + + +vec4_instruction * +vec4_gs_visitor::emit_urb_write_opcode(bool complete) +{ + /* We don't care whether the vertex is complete, because in general + * geometry shaders output multiple vertices, and we don't terminate the + * thread until all vertices are complete. 
+ */ + (void) complete; + + vec4_instruction *inst = emit(GS_OPCODE_URB_WRITE); + inst->offset = gs_prog_data->control_data_header_size_hwords; + + /* We need to increment Global Offset by 1 to make room for Broadwell's + * extra "Vertex Count" payload at the beginning of the URB entry. + */ + if (devinfo->gen >= 8 && gs_prog_data->static_vertex_count == -1) + inst->offset++; + + inst->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET; + return inst; +} + + +/** + * Write out a batch of 32 control data bits from the control_data_bits + * register to the URB. + * + * The current value of the vertex_count register determines which DWORD in + * the URB receives the control data bits. The control_data_bits register is + * assumed to contain the correct data for the vertex that was most recently + * output, and all previous vertices that share the same DWORD. + * + * This function takes care of ensuring that if no vertices have been output + * yet, no control bits are emitted. + */ +void +vec4_gs_visitor::emit_control_data_bits() +{ + assert(c->control_data_bits_per_vertex != 0); + + /* Since the URB_WRITE_OWORD message operates with 128-bit (vec4 sized) + * granularity, we need to use two tricks to ensure that the batch of 32 + * control data bits is written to the appropriate DWORD in the URB. To + * select which vec4 we are writing to, we use the "slot {0,1} offset" + * fields of the message header. To select which DWORD in the vec4 we are + * writing to, we use the channel mask fields of the message header. To + * avoid penalizing geometry shaders that emit a small number of vertices + * with extra bookkeeping, we only do each of these tricks when + * c->prog_data.control_data_header_size_bits is large enough to make it + * necessary. + * + * Note: this means that if we're outputting just a single DWORD of control + * data bits, we'll actually replicate it four times since we won't do any + * channel masking. 
But that's not a problem since in this case the + * hardware only pays attention to the first DWORD. + */ + enum brw_urb_write_flags urb_write_flags = BRW_URB_WRITE_OWORD; + if (c->control_data_header_size_bits > 32) + urb_write_flags = urb_write_flags | BRW_URB_WRITE_USE_CHANNEL_MASKS; + if (c->control_data_header_size_bits > 128) + urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET; + + /* If we are using either channel masks or a per-slot offset, then we + * need to figure out which DWORD we are trying to write to, using the + * formula: + * + * dword_index = (vertex_count - 1) * bits_per_vertex / 32 + * + * Since bits_per_vertex is a power of two, and is known at compile + * time, this can be optimized to: + * + * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex)) + */ + src_reg dword_index(this, glsl_type::uint_type); + if (urb_write_flags) { + src_reg prev_count(this, glsl_type::uint_type); + emit(ADD(dst_reg(prev_count), this->vertex_count, + brw_imm_ud(0xffffffffu))); + unsigned log2_bits_per_vertex = + util_last_bit(c->control_data_bits_per_vertex); + emit(SHR(dst_reg(dword_index), prev_count, + brw_imm_ud(6 - log2_bits_per_vertex))); + } + + /* Start building the URB write message. The first MRF gets a copy of + * R0. + */ + int base_mrf = 1; + dst_reg mrf_reg(MRF, base_mrf); + src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + vec4_instruction *inst = emit(MOV(mrf_reg, r0)); + inst->force_writemask_all = true; + + if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) { + /* Set the per-slot offset to dword_index / 4, so that we'll write to + * the appropriate OWORD within the control data header.
+ */ + src_reg per_slot_offset(this, glsl_type::uint_type); + emit(SHR(dst_reg(per_slot_offset), dword_index, brw_imm_ud(2u))); + emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, + brw_imm_ud(1u)); + } + + if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) { + /* Set the channel masks to 1 << (dword_index % 4), so that we'll + * write to the appropriate DWORD within the OWORD. We need to do + * this computation with force_writemask_all, otherwise garbage data + * from invocation 0 might clobber the mask for invocation 1 when + * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks + * together. + */ + src_reg channel(this, glsl_type::uint_type); + inst = emit(AND(dst_reg(channel), dword_index, brw_imm_ud(3u))); + inst->force_writemask_all = true; + src_reg one(this, glsl_type::uint_type); + inst = emit(MOV(dst_reg(one), brw_imm_ud(1u))); + inst->force_writemask_all = true; + src_reg channel_mask(this, glsl_type::uint_type); + inst = emit(SHL(dst_reg(channel_mask), one, channel)); + inst->force_writemask_all = true; + emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask), + channel_mask); + emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask); + } + + /* Store the control data bits in the message payload and send it. */ + dst_reg mrf_reg2(MRF, base_mrf + 1); + inst = emit(MOV(mrf_reg2, this->control_data_bits)); + inst->force_writemask_all = true; + inst = emit(GS_OPCODE_URB_WRITE); + inst->urb_write_flags = urb_write_flags; + /* We need to increment Global Offset by 256-bits to make room for + * Broadwell's extra "Vertex Count" payload at the beginning of the + * URB entry. Since this is an OWord message, Global Offset is counted + * in 128-bit units, so we must set it to 2. 
+ */ + if (devinfo->gen >= 8 && gs_prog_data->static_vertex_count == -1) + inst->offset = 2; + inst->base_mrf = base_mrf; + inst->mlen = 2; +} + +void +vec4_gs_visitor::set_stream_control_data_bits(unsigned stream_id) +{ + /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */ + + /* Note: we are calling this *before* increasing vertex_count, so + * this->vertex_count == vertex_count - 1 in the formula above. + */ + + /* Stream mode uses 2 bits per vertex */ + assert(c->control_data_bits_per_vertex == 2); + + /* Must be a valid stream */ + assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS); + + /* Control data bits are initialized to 0 so we don't have to set any + * bits when sending vertices to stream 0. + */ + if (stream_id == 0) + return; + + /* reg::sid = stream_id */ + src_reg sid(this, glsl_type::uint_type); + emit(MOV(dst_reg(sid), brw_imm_ud(stream_id))); + + /* reg:shift_count = 2 * (vertex_count - 1) */ + src_reg shift_count(this, glsl_type::uint_type); + emit(SHL(dst_reg(shift_count), this->vertex_count, brw_imm_ud(1u))); + + /* Note: we're relying on the fact that the GEN SHL instruction only pays + * attention to the lower 5 bits of its second source argument, so on this + * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to + * stream_id << ((2 * (vertex_count - 1)) % 32). + */ + src_reg mask(this, glsl_type::uint_type); + emit(SHL(dst_reg(mask), sid, shift_count)); + emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask)); +} + +void +vec4_gs_visitor::gs_emit_vertex(int stream_id) +{ + this->current_annotation = "emit vertex: safety check"; + + /* Haswell and later hardware ignores the "Render Stream Select" bits + * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled, + * and instead sends all primitives down the pipeline for rasterization. 
+ * If the SOL stage is enabled, "Render Stream Select" is honored and + * primitives bound to non-zero streams are discarded after stream output. + * + * Since the only purpose of primitives sent to non-zero streams is to + * be recorded by transform feedback, we can simply discard all geometry + * bound to these streams when transform feedback is disabled. + */ + if (stream_id > 0 && !nir->info->has_transform_feedback_varyings) + return; + + /* If we're outputting 32 control data bits or less, then we can wait + * until the shader is over to output them all. Otherwise we need to + * output them as we go. Now is the time to do it, since we're about to + * output the vertex_count'th vertex, so it's guaranteed that the + * control data bits associated with the (vertex_count - 1)th vertex are + * correct. + */ + if (c->control_data_header_size_bits > 32) { + this->current_annotation = "emit vertex: emit control data bits"; + /* Only emit control data bits if we've finished accumulating a batch + * of 32 bits. This is the case when: + * + * (vertex_count * bits_per_vertex) % 32 == 0 + * + * (in other words, when the last 5 bits of vertex_count * + * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some + * integer n (which is always the case, since bits_per_vertex is + * always 1 or 2), this is equivalent to requiring that the last 5-n + * bits of vertex_count are 0: + * + * vertex_count & (2^(5-n) - 1) == 0 + * + * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is + * equivalent to: + * + * vertex_count & (32 / bits_per_vertex - 1) == 0 + */ + vec4_instruction *inst = + emit(AND(dst_null_ud(), this->vertex_count, + brw_imm_ud(32 / c->control_data_bits_per_vertex - 1))); + inst->conditional_mod = BRW_CONDITIONAL_Z; + + emit(IF(BRW_PREDICATE_NORMAL)); + { + /* If vertex_count is 0, then no control data bits have been + * accumulated yet, so we skip emitting them.
+ */ + emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u), + BRW_CONDITIONAL_NEQ)); + emit(IF(BRW_PREDICATE_NORMAL)); + emit_control_data_bits(); + emit(BRW_OPCODE_ENDIF); + + /* Reset control_data_bits to 0 so we can start accumulating a new + * batch. + * + * Note: in the case where vertex_count == 0, this neutralizes the + * effect of any call to EndPrimitive() that the shader may have + * made before outputting its first vertex. + */ + inst = emit(MOV(dst_reg(this->control_data_bits), brw_imm_ud(0u))); + inst->force_writemask_all = true; + } + emit(BRW_OPCODE_ENDIF); + } + + this->current_annotation = "emit vertex: vertex data"; + emit_vertex(); + + /* In stream mode we have to set control data bits for all vertices + * unless we have disabled control data bits completely (which we do + * for GL_POINTS outputs that don't use streams). + */ + if (c->control_data_header_size_bits > 0 && + gs_prog_data->control_data_format == + GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) { + this->current_annotation = "emit vertex: Stream control data bits"; + set_stream_control_data_bits(stream_id); + } + + this->current_annotation = NULL; +} + +void +vec4_gs_visitor::gs_end_primitive() +{ + /* We can only do EndPrimitive() functionality when the control data + * consists of cut bits. Fortunately, the only time it isn't is when the + * output type is points, in which case EndPrimitive() is a no-op. + */ + if (gs_prog_data->control_data_format != + GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) { + return; + } + + if (c->control_data_header_size_bits == 0) + return; + + /* Cut bits use one bit per vertex. */ + assert(c->control_data_bits_per_vertex == 1); + + /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting + * vertex n, 0 otherwise.
So all we need to do here is mark bit + * (vertex_count - 1) % 32 in the cut_bits register to indicate that + * EndPrimitive() was called after emitting vertex (vertex_count - 1); + * vec4_gs_visitor::emit_control_data_bits() will take care of the rest. + * + * Note that if EndPrimitive() is called before emitting any vertices, this + * will cause us to set bit 31 of the control_data_bits register to 1. + * That's fine because: + * + * - If max_vertices < 32, then vertex number 31 (zero-based) will never be + * output, so the hardware will ignore cut bit 31. + * + * - If max_vertices == 32, then vertex number 31 is guaranteed to be the + * last vertex, so setting cut bit 31 has no effect (since the primitive + * is automatically ended when the GS terminates). + * + * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the + * control_data_bits register to 0 when the first vertex is emitted. + */ + + /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */ + src_reg one(this, glsl_type::uint_type); + emit(MOV(dst_reg(one), brw_imm_ud(1u))); + src_reg prev_count(this, glsl_type::uint_type); + emit(ADD(dst_reg(prev_count), this->vertex_count, brw_imm_ud(0xffffffffu))); + src_reg mask(this, glsl_type::uint_type); + /* Note: we're relying on the fact that the GEN SHL instruction only pays + * attention to the lower 5 bits of its second source argument, so on this + * architecture, 1 << (vertex_count - 1) is equivalent to 1 << + * ((vertex_count - 1) % 32).
+ */ + emit(SHL(dst_reg(mask), one, prev_count)); + emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask)); +} + +static const GLuint gl_prim_to_hw_prim[GL_TRIANGLE_STRIP_ADJACENCY+1] = { + [GL_POINTS] =_3DPRIM_POINTLIST, + [GL_LINES] = _3DPRIM_LINELIST, + [GL_LINE_LOOP] = _3DPRIM_LINELOOP, + [GL_LINE_STRIP] = _3DPRIM_LINESTRIP, + [GL_TRIANGLES] = _3DPRIM_TRILIST, + [GL_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP, + [GL_TRIANGLE_FAN] = _3DPRIM_TRIFAN, + [GL_QUADS] = _3DPRIM_QUADLIST, + [GL_QUAD_STRIP] = _3DPRIM_QUADSTRIP, + [GL_POLYGON] = _3DPRIM_POLYGON, + [GL_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ, + [GL_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ, + [GL_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ, + [GL_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ, +}; + +extern "C" const unsigned * +brw_compile_gs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_gs_prog_key *key, + struct brw_gs_prog_data *prog_data, + const nir_shader *src_shader, + struct gl_program *prog, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str) +{ + struct brw_gs_compile c; + memset(&c, 0, sizeof(c)); + c.key = *key; + + const bool is_scalar = compiler->scalar_stage[MESA_SHADER_GEOMETRY]; + nir_shader *shader = nir_shader_clone(mem_ctx, src_shader); + + /* The GLSL linker will have already matched up GS inputs and the outputs + * of prior stages. The driver does extend VS outputs in some cases, but + * only for legacy OpenGL or Gen4-5 hardware, neither of which offer + * geometry shader support. So we can safely ignore that. + * + * For SSO pipelines, we use a fixed VUE map layout based on variable + * locations, so we can rely on rendezvous-by-location making this work. 
+ */ + GLbitfield64 inputs_read = shader->info->inputs_read; + brw_compute_vue_map(compiler->devinfo, + &c.input_vue_map, inputs_read, + shader->info->separate_shader); + + shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, is_scalar); + brw_nir_lower_vue_inputs(shader, is_scalar, &c.input_vue_map); + brw_nir_lower_vue_outputs(shader, is_scalar); + shader = brw_postprocess_nir(shader, compiler, is_scalar); + + prog_data->base.clip_distance_mask = + ((1 << shader->info->clip_distance_array_size) - 1); + prog_data->base.cull_distance_mask = + ((1 << shader->info->cull_distance_array_size) - 1) << + shader->info->clip_distance_array_size; + + prog_data->include_primitive_id = + (shader->info->system_values_read & (1 << SYSTEM_VALUE_PRIMITIVE_ID)) != 0; + + prog_data->invocations = shader->info->gs.invocations; + + if (compiler->devinfo->gen >= 8) + prog_data->static_vertex_count = nir_gs_count_vertices(shader); + + if (compiler->devinfo->gen >= 7) { + if (shader->info->gs.output_primitive == GL_POINTS) { + /* When the output type is points, the geometry shader may output data + * to multiple streams, and EndPrimitive() has no effect. So we + * configure the hardware to interpret the control data as stream ID. + */ + prog_data->control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID; + + /* We only have to emit control bits if we are using streams */ + if (prog && prog->info.gs.uses_streams) + c.control_data_bits_per_vertex = 2; + else + c.control_data_bits_per_vertex = 0; + } else { + /* When the output type is triangle_strip or line_strip, EndPrimitive() + * may be used to terminate the current strip and start a new one + * (similar to primitive restart), and outputting data to multiple + * streams is not supported. So we configure the hardware to interpret + * the control data as EndPrimitive information (a.k.a. "cut bits"). 
+ */ + prog_data->control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT; + + /* We only need to output control data if the shader actually calls + * EndPrimitive(). + */ + c.control_data_bits_per_vertex = + shader->info->gs.uses_end_primitive ? 1 : 0; + } + } else { + /* There are no control data bits in gen6. */ + c.control_data_bits_per_vertex = 0; + } + c.control_data_header_size_bits = + shader->info->gs.vertices_out * c.control_data_bits_per_vertex; + + /* 1 HWORD = 32 bytes = 256 bits */ + prog_data->control_data_header_size_hwords = + ALIGN(c.control_data_header_size_bits, 256) / 256; + + /* Compute the output vertex size. + * + * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex + * Size (p168): + * + * [0,62] indicating [1,63] 16B units + * + * Specifies the size of each vertex stored in the GS output entry + * (following any Control Header data) as a number of 128-bit units + * (minus one). + * + * Programming Restrictions: The vertex size must be programmed as a + * multiple of 32B units with the following exception: Rendering is + * disabled (as per SOL stage state) and the vertex size output by the + * GS thread is 16B. + * + * If rendering is enabled (as per SOL state) the vertex size must be + * programmed as a multiple of 32B units. In other words, the only time + * software can program a vertex size with an odd number of 16B units + * is when rendering is disabled. + * + * Note: B=bytes in the above text. + * + * It doesn't seem worth the extra trouble to optimize the case where the + * vertex size is 16B (especially since this would require special-casing + * the GEN assembly that writes to the URB). So we just set the vertex + * size to a multiple of 32B (2 vec4's) in all cases. + * + * The maximum output vertex size is 62*16 = 992 bytes (31 hwords). 
We + * budget that as follows: + * + * 512 bytes for varyings (a varying component is 4 bytes and + * gl_MaxGeometryOutputComponents = 128) + * 16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16 + * bytes) + * 16 bytes overhead for gl_Position (we allocate it a slot in the VUE + * even if it's not used) + * 32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots + * whenever clip planes are enabled, even if the shader doesn't + * write to gl_ClipDistance) + * 16 bytes overhead since the VUE size must be a multiple of 32 bytes + * (see below)--this causes up to 1 VUE slot to be wasted + * 400 bytes available for varying packing overhead + * + * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes) + * per interpolation type, so this is plenty. + * + */ + unsigned output_vertex_size_bytes = prog_data->base.vue_map.num_slots * 16; + assert(compiler->devinfo->gen == 6 || + output_vertex_size_bytes <= GEN7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES); + prog_data->output_vertex_size_hwords = + ALIGN(output_vertex_size_bytes, 32) / 32; + + /* Compute URB entry size. The maximum allowed URB entry size is 32k. 
+ * That divides up as follows: + * + * 64 bytes for the control data header (cut indices or StreamID bits) + * 4096 bytes for varyings (a varying component is 4 bytes and + * gl_MaxGeometryTotalOutputComponents = 1024) + * 4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16 + * bytes/vertex and gl_MaxGeometryOutputVertices is 256) + * 4096 bytes overhead for gl_Position (we allocate it a slot in the VUE + * even if it's not used) + * 8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots + * whenever clip planes are enabled, even if the shader doesn't + * write to gl_ClipDistance) + * 4096 bytes overhead since the VUE size must be a multiple of 32 + * bytes (see above)--this causes up to 1 VUE slot to be wasted + * 8128 bytes available for varying packing overhead + * + * Worst-case varying packing overhead is 3/4 of a varying slot per + * interpolation type, which works out to 3072 bytes, so this would allow + * us to accommodate 2 interpolation types without any danger of running + * out of URB space. + * + * In practice, the risk of running out of URB space is very small, since + * the above figures are all worst-case, and most of them scale with the + * number of output vertices. So we'll just calculate the amount of space + * we need, and if it's too large, fail to compile. + * + * The above is for gen7+ where we have a single URB entry that will hold + * all the output. In gen6, we will have to allocate URB entries for every + * vertex we emit, so our URB entries only need to be large enough to hold + * a single vertex. Also, gen6 does not have a control data header. 
+ */ + unsigned output_size_bytes; + if (compiler->devinfo->gen >= 7) { + output_size_bytes = + prog_data->output_vertex_size_hwords * 32 * shader->info->gs.vertices_out; + output_size_bytes += 32 * prog_data->control_data_header_size_hwords; + } else { + output_size_bytes = prog_data->output_vertex_size_hwords * 32; + } + + /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output, + * which comes before the control header. + */ + if (compiler->devinfo->gen >= 8) + output_size_bytes += 32; + + /* Shaders can technically set max_vertices = 0, at which point we + * may have a URB size of 0 bytes. Nothing good can come from that, + * so enforce a minimum size. + */ + if (output_size_bytes == 0) + output_size_bytes = 1; + + unsigned max_output_size_bytes = GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES; + if (compiler->devinfo->gen == 6) + max_output_size_bytes = GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES; + if (output_size_bytes > max_output_size_bytes) + return NULL; + + + /* URB entry sizes are stored as a multiple of 64 bytes in gen7+ and + * a multiple of 128 bytes in gen6. + */ + if (compiler->devinfo->gen >= 7) + prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64; + else + prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128; + + assert(shader->info->gs.output_primitive < ARRAY_SIZE(gl_prim_to_hw_prim)); + prog_data->output_topology = + gl_prim_to_hw_prim[shader->info->gs.output_primitive]; + + prog_data->vertices_in = shader->info->gs.vertices_in; + + /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we + * need to program a URB read length of ceiling(num_slots / 2). + */ + prog_data->base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2; + + /* Now that prog_data setup is done, we are ready to actually compile the + * program. 
+ */ + if (unlikely(INTEL_DEBUG & DEBUG_GS)) { + fprintf(stderr, "GS Input "); + brw_print_vue_map(stderr, &c.input_vue_map); + fprintf(stderr, "GS Output "); + brw_print_vue_map(stderr, &prog_data->base.vue_map); + } + + if (is_scalar) { + fs_visitor v(compiler, log_data, mem_ctx, &c, prog_data, shader, + shader_time_index); + if (v.run_gs()) { + prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8; + prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs; + + fs_generator g(compiler, log_data, mem_ctx, &c.key, + &prog_data->base.base, v.promoted_constants, + false, MESA_SHADER_GEOMETRY); + if (unlikely(INTEL_DEBUG & DEBUG_GS)) { + const char *label = + shader->info->label ? shader->info->label : "unnamed"; + char *name = ralloc_asprintf(mem_ctx, "%s geometry shader %s", + label, shader->info->name); + g.enable_debug(name); + } + g.generate_code(v.cfg, 8); + return g.get_assembly(final_assembly_size); + } + } + + if (compiler->devinfo->gen >= 7) { + /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do + * so without spilling. If the GS invocations count > 1, then we can't use + * dual object mode. + */ + if (prog_data->invocations <= 1 && + likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) { + prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT; + + vec4_gs_visitor v(compiler, log_data, &c, prog_data, shader, + mem_ctx, true /* no_spills */, shader_time_index); + if (v.run()) { + return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, + shader, &prog_data->base, v.cfg, + final_assembly_size); + } + } + } + + /* Either we failed to compile in DUAL_OBJECT mode (probably because it + * would have required spilling) or DUAL_OBJECT mode is disabled. So fall + * back to DUAL_INSTANCED or SINGLE mode, which consumes fewer registers. + * + * FIXME: Single dispatch mode requires that the driver can handle + * interleaving of input registers, but this is already supported (dual + * instance mode has the same requirement). 
However, to take full advantage + * of single dispatch mode to reduce register pressure we would also need to + * do interleaved outputs, but currently, the vec4 visitor and generator + * classes do not support this, so at the moment register pressure in + * single and dual instance modes is the same. + * + * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 "3DSTATE_GS" + * "If InstanceCount>1, DUAL_OBJECT mode is invalid. Software will likely + * want to use DUAL_INSTANCE mode for higher performance, but SINGLE mode + * is also supported. When InstanceCount=1 (one instance per object) software + * can decide which dispatch mode to use. DUAL_OBJECT mode would likely be + * the best choice for performance, followed by SINGLE mode." + * + * So SINGLE mode is more performant when invocations == 1 and DUAL_INSTANCE + * mode is more performant when invocations > 1. Gen6 only supports + * SINGLE mode. + */ + if (prog_data->invocations <= 1 || compiler->devinfo->gen < 7) + prog_data->base.dispatch_mode = DISPATCH_MODE_4X1_SINGLE; + else + prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_INSTANCE; + + vec4_gs_visitor *gs = NULL; + const unsigned *ret = NULL; + + if (compiler->devinfo->gen >= 7) + gs = new vec4_gs_visitor(compiler, log_data, &c, prog_data, + shader, mem_ctx, false /* no_spills */, + shader_time_index); + else + gs = new gen6_gs_visitor(compiler, log_data, &c, prog_data, prog, + shader, mem_ctx, false /* no_spills */, + shader_time_index); + + if (!gs->run()) { + if (error_str) + *error_str = ralloc_strdup(mem_ctx, gs->fail_msg); + } else { + ret = brw_vec4_generate_assembly(compiler, log_data, mem_ctx, shader, + &prog_data->base, gs->cfg, + final_assembly_size); + } + + delete gs; + return ret; +} + + +} /* namespace brw */ diff --git a/src/intel/compiler/brw_vec4_gs_visitor.h b/src/intel/compiler/brw_vec4_gs_visitor.h new file mode 100644 index 00000000000..09221f928d1 --- /dev/null +++ b/src/intel/compiler/brw_vec4_gs_visitor.h @@ -0,0 +1,81 @@ +/* + * 
Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * \file brw_vec4_gs_visitor.h + * + * Geometry-shader-specific code derived from the vec4_visitor class. 
+ */ + +#ifndef BRW_VEC4_GS_VISITOR_H +#define BRW_VEC4_GS_VISITOR_H + +#include "brw_vec4.h" + +#define MAX_GS_INPUT_VERTICES 6 + +#ifdef __cplusplus +namespace brw { + +class vec4_gs_visitor : public vec4_visitor +{ +public: + vec4_gs_visitor(const struct brw_compiler *compiler, + void *log_data, + struct brw_gs_compile *c, + struct brw_gs_prog_data *prog_data, + const nir_shader *shader, + void *mem_ctx, + bool no_spills, + int shader_time_index); + + virtual void nir_setup_inputs(); + virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr); + +protected: + virtual dst_reg *make_reg_for_system_value(int location); + virtual void setup_payload(); + virtual void emit_prolog(); + virtual void emit_thread_end(); + virtual void emit_urb_write_header(int mrf); + virtual vec4_instruction *emit_urb_write_opcode(bool complete); + virtual void gs_emit_vertex(int stream_id); + virtual void gs_end_primitive(); + virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr); + +protected: + int setup_varying_inputs(int payload_reg, int *attribute_map, + int attributes_per_reg); + void emit_control_data_bits(); + void set_stream_control_data_bits(unsigned stream_id); + + src_reg vertex_count; + src_reg control_data_bits; + const struct brw_gs_compile * const c; + struct brw_gs_prog_data * const gs_prog_data; +}; + +} /* namespace brw */ +#endif /* __cplusplus */ + +#endif /* BRW_VEC4_GS_VISITOR_H */ diff --git a/src/intel/compiler/brw_vec4_live_variables.cpp b/src/intel/compiler/brw_vec4_live_variables.cpp new file mode 100644 index 00000000000..73f658cd8fa --- /dev/null +++ b/src/intel/compiler/brw_vec4_live_variables.cpp @@ -0,0 +1,343 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, 
publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <[email protected]> + * + */ + +#include "brw_cfg.h" +#include "brw_vec4_live_variables.h" + +using namespace brw; + +/** @file brw_vec4_live_variables.cpp + * + * Support for computing at the basic block level which variables + * (virtual GRFs in our case) are live at entry and exit. + * + * See Muchnick's Advanced Compiler Design and Implementation, section + * 14.1 (p444). + */ + +/** + * Sets up the use[] and def[] arrays. + * + * The basic-block-level live variable analysis needs to know which + * variables get used before they're completely defined, and which + * variables are completely defined before they're used. + * + * We independently track each channel of a vec4. This is because we need to + * be able to recognize a sequence like: + * + * ... + * DP4 tmp.x a b; + * DP4 tmp.y c d; + * MUL result.xy tmp.xy e.xy + * ... + * + * as having tmp live only across that sequence (assuming it's used nowhere + * else), because it's a common pattern. A more conservative approach that + * doesn't get tmp marked as deffed in this block will tend to result in + * spilling. 
+ */ +void +vec4_live_variables::setup_def_use() +{ + int ip = 0; + + foreach_block (block, cfg) { + assert(ip == block->start_ip); + if (block->num > 0) + assert(cfg->blocks[block->num - 1]->end_ip == ip - 1); + + foreach_inst_in_block(vec4_instruction, inst, block) { + struct block_data *bd = &block_data[block->num]; + + /* Set use[] for this instruction */ + for (unsigned int i = 0; i < 3; i++) { + if (inst->src[i].file == VGRF) { + for (unsigned j = 0; j < DIV_ROUND_UP(inst->size_read(i), 16); j++) { + for (int c = 0; c < 4; c++) { + const unsigned v = var_from_reg(alloc, inst->src[i], c, j); + if (!BITSET_TEST(bd->def, v)) + BITSET_SET(bd->use, v); + } + } + } + } + for (unsigned c = 0; c < 4; c++) { + if (inst->reads_flag(c) && + !BITSET_TEST(bd->flag_def, c)) { + BITSET_SET(bd->flag_use, c); + } + } + + /* Check for unconditional writes to whole registers. These + * are the things that screen off preceding definitions of a + * variable, and thus qualify for being in def[]. + */ + if (inst->dst.file == VGRF && + (!inst->predicate || inst->opcode == BRW_OPCODE_SEL)) { + for (unsigned i = 0; i < DIV_ROUND_UP(inst->size_written, 16); i++) { + for (int c = 0; c < 4; c++) { + if (inst->dst.writemask & (1 << c)) { + const unsigned v = var_from_reg(alloc, inst->dst, c, i); + if (!BITSET_TEST(bd->use, v)) + BITSET_SET(bd->def, v); + } + } + } + } + if (inst->writes_flag()) { + for (unsigned c = 0; c < 4; c++) { + if ((inst->dst.writemask & (1 << c)) && + !BITSET_TEST(bd->flag_use, c)) { + BITSET_SET(bd->flag_def, c); + } + } + } + + ip++; + } + } +} + +/** + * The algorithm incrementally sets bits in liveout and livein, + * propagating it through control flow. It will eventually terminate + * because it only ever adds bits, and stops when no bits are added in + * a pass. 
+ */ +void +vec4_live_variables::compute_live_variables() +{ + bool cont = true; + + while (cont) { + cont = false; + + foreach_block_reverse (block, cfg) { + struct block_data *bd = &block_data[block->num]; + + /* Update liveout */ + foreach_list_typed(bblock_link, child_link, link, &block->children) { + struct block_data *child_bd = &block_data[child_link->block->num]; + + for (int i = 0; i < bitset_words; i++) { + BITSET_WORD new_liveout = (child_bd->livein[i] & + ~bd->liveout[i]); + if (new_liveout) { + bd->liveout[i] |= new_liveout; + cont = true; + } + } + BITSET_WORD new_liveout = (child_bd->flag_livein[0] & + ~bd->flag_liveout[0]); + if (new_liveout) { + bd->flag_liveout[0] |= new_liveout; + cont = true; + } + } + + /* Update livein */ + for (int i = 0; i < bitset_words; i++) { + BITSET_WORD new_livein = (bd->use[i] | + (bd->liveout[i] & + ~bd->def[i])); + if (new_livein & ~bd->livein[i]) { + bd->livein[i] |= new_livein; + cont = true; + } + } + BITSET_WORD new_livein = (bd->flag_use[0] | + (bd->flag_liveout[0] & + ~bd->flag_def[0])); + if (new_livein & ~bd->flag_livein[0]) { + bd->flag_livein[0] |= new_livein; + cont = true; + } + } + } +} + +vec4_live_variables::vec4_live_variables(const simple_allocator &alloc, + cfg_t *cfg) + : alloc(alloc), cfg(cfg) +{ + mem_ctx = ralloc_context(NULL); + + num_vars = alloc.total_size * 8; + block_data = rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks); + + bitset_words = BITSET_WORDS(num_vars); + for (int i = 0; i < cfg->num_blocks; i++) { + block_data[i].def = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words); + block_data[i].use = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words); + block_data[i].livein = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words); + block_data[i].liveout = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words); + + block_data[i].flag_def[0] = 0; + block_data[i].flag_use[0] = 0; + block_data[i].flag_livein[0] = 0; + block_data[i].flag_liveout[0] = 0; + } + + setup_def_use(); + 
compute_live_variables(); +} + +vec4_live_variables::~vec4_live_variables() +{ + ralloc_free(mem_ctx); +} + +#define MAX_INSTRUCTION (1 << 30) + +/** + * Computes a conservative start/end of the live intervals for each virtual GRF. + * + * We could expose per-channel live intervals to the consumer based on the + * information we computed in vec4_live_variables, except that our only + * current user is virtual_grf_interferes(). So we instead union the + * per-channel ranges into a per-vgrf range for virtual_grf_start[] and + * virtual_grf_end[]. + * + * We could potentially have virtual_grf_interferes() do the test per-channel, + * which would let some interesting register allocation occur (particularly on + * code-generated GLSL sequences from the Cg compiler which does register + * allocation at the GLSL level and thus reuses components of the variable + * with distinct lifetimes). But right now the complexity of doing so doesn't + * seem worth it, since having virtual_grf_interferes() be cheap is important + * for register allocation performance. + */ +void +vec4_visitor::calculate_live_intervals() +{ + if (this->live_intervals) + return; + + int *start = ralloc_array(mem_ctx, int, this->alloc.total_size * 8); + int *end = ralloc_array(mem_ctx, int, this->alloc.total_size * 8); + ralloc_free(this->virtual_grf_start); + ralloc_free(this->virtual_grf_end); + this->virtual_grf_start = start; + this->virtual_grf_end = end; + + for (unsigned i = 0; i < this->alloc.total_size * 8; i++) { + start[i] = MAX_INSTRUCTION; + end[i] = -1; + } + + /* Start by setting up the intervals with no knowledge of control + * flow. 
+ */ + int ip = 0; + foreach_block_and_inst(block, vec4_instruction, inst, cfg) { + for (unsigned int i = 0; i < 3; i++) { + if (inst->src[i].file == VGRF) { + for (unsigned j = 0; j < DIV_ROUND_UP(inst->size_read(i), 16); j++) { + for (int c = 0; c < 4; c++) { + const unsigned v = var_from_reg(alloc, inst->src[i], c, j); + start[v] = MIN2(start[v], ip); + end[v] = ip; + } + } + } + } + + if (inst->dst.file == VGRF) { + for (unsigned i = 0; i < DIV_ROUND_UP(inst->size_written, 16); i++) { + for (int c = 0; c < 4; c++) { + if (inst->dst.writemask & (1 << c)) { + const unsigned v = var_from_reg(alloc, inst->dst, c, i); + start[v] = MIN2(start[v], ip); + end[v] = ip; + } + } + } + } + + ip++; + } + + /* Now, extend those intervals using our analysis of control flow. + * + * The control flow-aware analysis was done at a channel level, while at + * this point we're distilling it down to vgrfs. + */ + this->live_intervals = new(mem_ctx) vec4_live_variables(alloc, cfg); + + foreach_block (block, cfg) { + struct block_data *bd = &live_intervals->block_data[block->num]; + + for (int i = 0; i < live_intervals->num_vars; i++) { + if (BITSET_TEST(bd->livein, i)) { + start[i] = MIN2(start[i], block->start_ip); + end[i] = MAX2(end[i], block->start_ip); + } + + if (BITSET_TEST(bd->liveout, i)) { + start[i] = MIN2(start[i], block->end_ip); + end[i] = MAX2(end[i], block->end_ip); + } + } + } +} + +void +vec4_visitor::invalidate_live_intervals() +{ + ralloc_free(live_intervals); + live_intervals = NULL; +} + +int +vec4_visitor::var_range_start(unsigned v, unsigned n) const +{ + int start = INT_MAX; + + for (unsigned i = 0; i < n; i++) + start = MIN2(start, virtual_grf_start[v + i]); + + return start; +} + +int +vec4_visitor::var_range_end(unsigned v, unsigned n) const +{ + int end = INT_MIN; + + for (unsigned i = 0; i < n; i++) + end = MAX2(end, virtual_grf_end[v + i]); + + return end; +} + +bool +vec4_visitor::virtual_grf_interferes(int a, int b) +{ + return !((var_range_end(8 * 
alloc.offsets[a], 8 * alloc.sizes[a]) <= + var_range_start(8 * alloc.offsets[b], 8 * alloc.sizes[b])) || + (var_range_end(8 * alloc.offsets[b], 8 * alloc.sizes[b]) <= + var_range_start(8 * alloc.offsets[a], 8 * alloc.sizes[a]))); +} diff --git a/src/intel/compiler/brw_vec4_live_variables.h b/src/intel/compiler/brw_vec4_live_variables.h new file mode 100644 index 00000000000..8807c453743 --- /dev/null +++ b/src/intel/compiler/brw_vec4_live_variables.h @@ -0,0 +1,112 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <[email protected]> + * + */ + +#include "util/bitset.h" +#include "brw_vec4.h" + +namespace brw { + +struct block_data { + /** + * Which variables are defined before being used in the block. + * + * Note that for our purposes, "defined" means unconditionally, completely + * defined. 
+ */ + BITSET_WORD *def; + + /** + * Which variables are used before being defined in the block. + */ + BITSET_WORD *use; + + /** Which defs reach the entry point of the block. */ + BITSET_WORD *livein; + + /** Which defs reach the exit point of the block. */ + BITSET_WORD *liveout; + + BITSET_WORD flag_def[1]; + BITSET_WORD flag_use[1]; + BITSET_WORD flag_livein[1]; + BITSET_WORD flag_liveout[1]; +}; + +class vec4_live_variables { +public: + DECLARE_RALLOC_CXX_OPERATORS(vec4_live_variables) + + vec4_live_variables(const simple_allocator &alloc, cfg_t *cfg); + ~vec4_live_variables(); + + int num_vars; + int bitset_words; + + /** Per-basic-block information on live variables */ + struct block_data *block_data; + +protected: + void setup_def_use(); + void compute_live_variables(); + + const simple_allocator &alloc; + cfg_t *cfg; + void *mem_ctx; +}; + +/* Returns the variable index for the k-th dword of the c-th component of + * register reg. + */ +inline unsigned +var_from_reg(const simple_allocator &alloc, const src_reg ®, + unsigned c = 0, unsigned k = 0) +{ + assert(reg.file == VGRF && reg.nr < alloc.count && c < 4); + const unsigned csize = DIV_ROUND_UP(type_sz(reg.type), 4); + unsigned result = + 8 * (alloc.offsets[reg.nr] + reg.offset / REG_SIZE) + + (BRW_GET_SWZ(reg.swizzle, c) + k / csize * 4) * csize + k % csize; + /* Do not exceed the limit for this register */ + assert(result < 8 * (alloc.offsets[reg.nr] + alloc.sizes[reg.nr])); + return result; +} + +inline unsigned +var_from_reg(const simple_allocator &alloc, const dst_reg ®, + unsigned c = 0, unsigned k = 0) +{ + assert(reg.file == VGRF && reg.nr < alloc.count && c < 4); + const unsigned csize = DIV_ROUND_UP(type_sz(reg.type), 4); + unsigned result = + 8 * (alloc.offsets[reg.nr] + reg.offset / REG_SIZE) + + (c + k / csize * 4) * csize + k % csize; + /* Do not exceed the limit for this register */ + assert(result < 8 * (alloc.offsets[reg.nr] + alloc.sizes[reg.nr])); + return result; +} + +} /* namespace 
brw */ diff --git a/src/intel/compiler/brw_vec4_nir.cpp b/src/intel/compiler/brw_vec4_nir.cpp new file mode 100644 index 00000000000..4e88b795049 --- /dev/null +++ b/src/intel/compiler/brw_vec4_nir.cpp @@ -0,0 +1,2407 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "brw_nir.h" +#include "brw_vec4.h" +#include "brw_vec4_builder.h" +#include "brw_vec4_surface_builder.h" + +using namespace brw; +using namespace brw::surface_access; + +namespace brw { + +void +vec4_visitor::emit_nir_code() +{ + if (nir->num_uniforms > 0) + nir_setup_uniforms(); + + nir_setup_system_values(); + + /* get the main function and emit it */ + nir_foreach_function(function, nir) { + assert(strcmp(function->name, "main") == 0); + assert(function->impl); + nir_emit_impl(function->impl); + } +} + +void +vec4_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr) +{ + dst_reg *reg; + + switch (instr->intrinsic) { + case nir_intrinsic_load_vertex_id: + unreachable("should be lowered by lower_vertex_id()."); + + case nir_intrinsic_load_vertex_id_zero_base: + reg = &nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE]; + if (reg->file == BAD_FILE) + *reg = *make_reg_for_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); + break; + + case nir_intrinsic_load_base_vertex: + reg = &nir_system_values[SYSTEM_VALUE_BASE_VERTEX]; + if (reg->file == BAD_FILE) + *reg = *make_reg_for_system_value(SYSTEM_VALUE_BASE_VERTEX); + break; + + case nir_intrinsic_load_instance_id: + reg = &nir_system_values[SYSTEM_VALUE_INSTANCE_ID]; + if (reg->file == BAD_FILE) + *reg = *make_reg_for_system_value(SYSTEM_VALUE_INSTANCE_ID); + break; + + case nir_intrinsic_load_base_instance: + reg = &nir_system_values[SYSTEM_VALUE_BASE_INSTANCE]; + if (reg->file == BAD_FILE) + *reg = *make_reg_for_system_value(SYSTEM_VALUE_BASE_INSTANCE); + break; + + case nir_intrinsic_load_draw_id: + reg = &nir_system_values[SYSTEM_VALUE_DRAW_ID]; + if (reg->file == BAD_FILE) + *reg = *make_reg_for_system_value(SYSTEM_VALUE_DRAW_ID); + break; + + default: + break; + } +} + +static bool +setup_system_values_block(nir_block *block, vec4_visitor *v) +{ + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = 
nir_instr_as_intrinsic(instr); + v->nir_setup_system_value_intrinsic(intrin); + } + + return true; +} + +void +vec4_visitor::nir_setup_system_values() +{ + nir_system_values = ralloc_array(mem_ctx, dst_reg, SYSTEM_VALUE_MAX); + for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) { + nir_system_values[i] = dst_reg(); + } + + nir_foreach_function(function, nir) { + assert(strcmp(function->name, "main") == 0); + assert(function->impl); + nir_foreach_block(block, function->impl) { + setup_system_values_block(block, this); + } + } +} + +void +vec4_visitor::nir_setup_uniforms() +{ + uniforms = nir->num_uniforms / 16; +} + +void +vec4_visitor::nir_emit_impl(nir_function_impl *impl) +{ + nir_locals = ralloc_array(mem_ctx, dst_reg, impl->reg_alloc); + for (unsigned i = 0; i < impl->reg_alloc; i++) { + nir_locals[i] = dst_reg(); + } + + foreach_list_typed(nir_register, reg, node, &impl->registers) { + unsigned array_elems = + reg->num_array_elems == 0 ? 1 : reg->num_array_elems; + const unsigned num_regs = array_elems * DIV_ROUND_UP(reg->bit_size, 32); + nir_locals[reg->index] = dst_reg(VGRF, alloc.allocate(num_regs)); + + if (reg->bit_size == 64) + nir_locals[reg->index].type = BRW_REGISTER_TYPE_DF; + } + + nir_ssa_values = ralloc_array(mem_ctx, dst_reg, impl->ssa_alloc); + + nir_emit_cf_list(&impl->body); +} + +void +vec4_visitor::nir_emit_cf_list(exec_list *list) +{ + exec_list_validate(list); + foreach_list_typed(nir_cf_node, node, node, list) { + switch (node->type) { + case nir_cf_node_if: + nir_emit_if(nir_cf_node_as_if(node)); + break; + + case nir_cf_node_loop: + nir_emit_loop(nir_cf_node_as_loop(node)); + break; + + case nir_cf_node_block: + nir_emit_block(nir_cf_node_as_block(node)); + break; + + default: + unreachable("Invalid CFG node block"); + } + } +} + +void +vec4_visitor::nir_emit_if(nir_if *if_stmt) +{ + /* First, put the condition in f0 */ + src_reg condition = get_nir_src(if_stmt->condition, BRW_REGISTER_TYPE_D, 1); + vec4_instruction *inst = 
emit(MOV(dst_null_d(), condition)); + inst->conditional_mod = BRW_CONDITIONAL_NZ; + + /* We can just predicate based on the X channel, as the condition only + * goes on its own line */ + emit(IF(BRW_PREDICATE_ALIGN16_REPLICATE_X)); + + nir_emit_cf_list(&if_stmt->then_list); + + /* note: if the else is empty, dead CF elimination will remove it */ + emit(BRW_OPCODE_ELSE); + + nir_emit_cf_list(&if_stmt->else_list); + + emit(BRW_OPCODE_ENDIF); +} + +void +vec4_visitor::nir_emit_loop(nir_loop *loop) +{ + emit(BRW_OPCODE_DO); + + nir_emit_cf_list(&loop->body); + + emit(BRW_OPCODE_WHILE); +} + +void +vec4_visitor::nir_emit_block(nir_block *block) +{ + nir_foreach_instr(instr, block) { + nir_emit_instr(instr); + } +} + +void +vec4_visitor::nir_emit_instr(nir_instr *instr) +{ + base_ir = instr; + + switch (instr->type) { + case nir_instr_type_load_const: + nir_emit_load_const(nir_instr_as_load_const(instr)); + break; + + case nir_instr_type_intrinsic: + nir_emit_intrinsic(nir_instr_as_intrinsic(instr)); + break; + + case nir_instr_type_alu: + nir_emit_alu(nir_instr_as_alu(instr)); + break; + + case nir_instr_type_jump: + nir_emit_jump(nir_instr_as_jump(instr)); + break; + + case nir_instr_type_tex: + nir_emit_texture(nir_instr_as_tex(instr)); + break; + + case nir_instr_type_ssa_undef: + nir_emit_undef(nir_instr_as_ssa_undef(instr)); + break; + + default: + fprintf(stderr, "VS instruction not yet implemented by NIR->vec4\n"); + break; + } +} + +static dst_reg +dst_reg_for_nir_reg(vec4_visitor *v, nir_register *nir_reg, + unsigned base_offset, nir_src *indirect) +{ + dst_reg reg; + + reg = v->nir_locals[nir_reg->index]; + if (nir_reg->bit_size == 64) + reg.type = BRW_REGISTER_TYPE_DF; + reg = offset(reg, 8, base_offset); + if (indirect) { + reg.reladdr = + new(v->mem_ctx) src_reg(v->get_nir_src(*indirect, + BRW_REGISTER_TYPE_D, + 1)); + } + return reg; +} + +dst_reg +vec4_visitor::get_nir_dest(const nir_dest &dest) +{ + if (dest.is_ssa) { + dst_reg dst = + dst_reg(VGRF, 
alloc.allocate(DIV_ROUND_UP(dest.ssa.bit_size, 32))); + if (dest.ssa.bit_size == 64) + dst.type = BRW_REGISTER_TYPE_DF; + nir_ssa_values[dest.ssa.index] = dst; + return dst; + } else { + return dst_reg_for_nir_reg(this, dest.reg.reg, dest.reg.base_offset, + dest.reg.indirect); + } +} + +dst_reg +vec4_visitor::get_nir_dest(const nir_dest &dest, enum brw_reg_type type) +{ + return retype(get_nir_dest(dest), type); +} + +dst_reg +vec4_visitor::get_nir_dest(const nir_dest &dest, nir_alu_type type) +{ + return get_nir_dest(dest, brw_type_for_nir_type(devinfo, type)); +} + +src_reg +vec4_visitor::get_nir_src(const nir_src &src, enum brw_reg_type type, + unsigned num_components) +{ + dst_reg reg; + + if (src.is_ssa) { + assert(src.ssa != NULL); + reg = nir_ssa_values[src.ssa->index]; + } + else { + reg = dst_reg_for_nir_reg(this, src.reg.reg, src.reg.base_offset, + src.reg.indirect); + } + + reg = retype(reg, type); + + src_reg reg_as_src = src_reg(reg); + reg_as_src.swizzle = brw_swizzle_for_size(num_components); + return reg_as_src; +} + +src_reg +vec4_visitor::get_nir_src(const nir_src &src, nir_alu_type type, + unsigned num_components) +{ + return get_nir_src(src, brw_type_for_nir_type(devinfo, type), + num_components); +} + +src_reg +vec4_visitor::get_nir_src(const nir_src &src, unsigned num_components) +{ + /* if type is not specified, default to signed int */ + return get_nir_src(src, nir_type_int32, num_components); +} + +src_reg +vec4_visitor::get_indirect_offset(nir_intrinsic_instr *instr) +{ + nir_src *offset_src = nir_get_io_offset_src(instr); + nir_const_value *const_value = nir_src_as_const_value(*offset_src); + + if (const_value) { + /* The only constant offset we should find is 0. brw_nir.c's + * add_const_offset_to_base() will fold other constant offsets + * into instr->const_index[0]. 
+ */ + assert(const_value->u32[0] == 0); + return src_reg(); + } + + return get_nir_src(*offset_src, BRW_REGISTER_TYPE_UD, 1); +} + +void +vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr) +{ + dst_reg reg; + + if (instr->def.bit_size == 64) { + reg = dst_reg(VGRF, alloc.allocate(2)); + reg.type = BRW_REGISTER_TYPE_DF; + } else { + reg = dst_reg(VGRF, alloc.allocate(1)); + reg.type = BRW_REGISTER_TYPE_D; + } + + unsigned remaining = brw_writemask_for_size(instr->def.num_components); + + /* @FIXME: consider emitting vector operations to save some MOVs in + * cases where the components are representable in 8 bits. + * For now, we emit a MOV for each distinct value. + */ + for (unsigned i = 0; i < instr->def.num_components; i++) { + unsigned writemask = 1 << i; + + if ((remaining & writemask) == 0) + continue; + + for (unsigned j = i; j < instr->def.num_components; j++) { + if ((instr->def.bit_size == 32 && + instr->value.u32[i] == instr->value.u32[j]) || + (instr->def.bit_size == 64 && + instr->value.f64[i] == instr->value.f64[j])) { + writemask |= 1 << j; + } + } + + reg.writemask = writemask; + if (instr->def.bit_size == 64) { + emit(MOV(reg, setup_imm_df(instr->value.f64[i]))); + } else { + emit(MOV(reg, brw_imm_d(instr->value.i32[i]))); + } + + remaining &= ~writemask; + } + + /* Set final writemask */ + reg.writemask = brw_writemask_for_size(instr->def.num_components); + + nir_ssa_values[instr->def.index] = reg; +} + +void +vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) +{ + dst_reg dest; + src_reg src; + + switch (instr->intrinsic) { + + case nir_intrinsic_load_input: { + nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); + + /* We set EmitNoIndirectInput for VS */ + assert(const_offset); + + dest = get_nir_dest(instr->dest); + dest.writemask = brw_writemask_for_size(instr->num_components); + + src = src_reg(ATTR, instr->const_index[0] + const_offset->u32[0], + glsl_type::uvec4_type); + src = retype(src, 
dest.type); + + bool is_64bit = nir_dest_bit_size(instr->dest) == 64; + if (is_64bit) { + dst_reg tmp = dst_reg(this, glsl_type::dvec4_type); + src.swizzle = BRW_SWIZZLE_XYZW; + shuffle_64bit_data(tmp, src, false); + emit(MOV(dest, src_reg(tmp))); + } else { + /* Swizzle source based on component layout qualifier */ + src.swizzle = BRW_SWZ_COMP_INPUT(nir_intrinsic_component(instr)); + emit(MOV(dest, src)); + } + break; + } + + case nir_intrinsic_store_output: { + nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); + assert(const_offset); + + int varying = instr->const_index[0] + const_offset->u32[0]; + + bool is_64bit = nir_src_bit_size(instr->src[0]) == 64; + if (is_64bit) { + src_reg data; + src = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_DF, + instr->num_components); + data = src_reg(this, glsl_type::dvec4_type); + shuffle_64bit_data(dst_reg(data), src, true); + src = retype(data, BRW_REGISTER_TYPE_F); + } else { + src = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F, + instr->num_components); + } + + unsigned c = nir_intrinsic_component(instr); + output_reg[varying][c] = dst_reg(src); + output_num_components[varying][c] = instr->num_components; + + unsigned num_components = instr->num_components; + if (is_64bit) + num_components *= 2; + + output_reg[varying][c] = dst_reg(src); + output_num_components[varying][c] = MIN2(4, num_components); + + if (is_64bit && num_components > 4) { + assert(num_components <= 8); + output_reg[varying + 1][c] = byte_offset(dst_reg(src), REG_SIZE); + output_num_components[varying + 1][c] = num_components - 4; + } + break; + } + + case nir_intrinsic_get_buffer_size: { + nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]); + unsigned ssbo_index = const_uniform_block ? 
const_uniform_block->u32[0] : 0; + + const unsigned index = + prog_data->base.binding_table.ssbo_start + ssbo_index; + dst_reg result_dst = get_nir_dest(instr->dest); + vec4_instruction *inst = new(mem_ctx) + vec4_instruction(VS_OPCODE_GET_BUFFER_SIZE, result_dst); + + inst->base_mrf = 2; + inst->mlen = 1; /* always at least one */ + inst->src[1] = brw_imm_ud(index); + + /* MRF for the first parameter */ + src_reg lod = brw_imm_d(0); + int param_base = inst->base_mrf; + int writemask = WRITEMASK_X; + emit(MOV(dst_reg(MRF, param_base, glsl_type::int_type, writemask), lod)); + + emit(inst); + + brw_mark_surface_used(&prog_data->base, index); + break; + } + + case nir_intrinsic_store_ssbo: { + assert(devinfo->gen >= 7); + + /* Block index */ + src_reg surf_index; + nir_const_value *const_uniform_block = + nir_src_as_const_value(instr->src[1]); + if (const_uniform_block) { + unsigned index = prog_data->base.binding_table.ssbo_start + + const_uniform_block->u32[0]; + surf_index = brw_imm_ud(index); + brw_mark_surface_used(&prog_data->base, index); + } else { + surf_index = src_reg(this, glsl_type::uint_type); + emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[1], 1), + brw_imm_ud(prog_data->base.binding_table.ssbo_start))); + surf_index = emit_uniformize(surf_index); + + brw_mark_surface_used(&prog_data->base, + prog_data->base.binding_table.ssbo_start + + nir->info->num_ssbos - 1); + } + + /* Offset */ + src_reg offset_reg; + nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]); + if (const_offset) { + offset_reg = brw_imm_ud(const_offset->u32[0]); + } else { + offset_reg = get_nir_src(instr->src[2], 1); + } + + /* Value */ + src_reg val_reg = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F, 4); + + /* Writemask */ + unsigned write_mask = instr->const_index[0]; + + /* IvyBridge does not have a native SIMD4x2 untyped write message so untyped + * writes will use SIMD8 mode. 
In order to hide this and keep symmetry across + * typed and untyped messages and across hardware platforms, the + * current implementation of the untyped messages will transparently convert + * the SIMD4x2 payload into an equivalent SIMD8 payload by transposing it + * and enabling only channel X on the SEND instruction. + * + * The above works well for full vector writes, but not for partial writes + * where we want to write some channels and not others, like when we have + * code such as v.xyw = vec3(1,2,4). Because the untyped write messages are + * quite restrictive with regards to the channel enables we can configure in + * the message descriptor (not all combinations are allowed) we cannot simply + * implement these scenarios with a single message while keeping the + * aforementioned symmetry in the implementation. For now we have decided that + * it is better to keep the symmetry to reduce complexity, so in situations + * such as the one described we end up emitting two untyped write messages + * (one for xy and another for w). + * + * The code below packs consecutive channels into a single write message, + * detects gaps in the vector write and if needed, sends a second message + * with the remaining channels. If in the future we decide that we want to + * emit a single message at the expense of losing the symmetry in the + * implementation we can: + * + * 1) For IvyBridge: Only use the red channel of the untyped write SIMD8 + * message payload. In this mode we can write up to 8 offsets and dwords + * to the red channel only (for the two vec4s in the SIMD4x2 execution) + * and select which of the 8 channels carry data to write by setting the + * appropriate writemask in the dst register of the SEND instruction. + * It would require to write a new generator opcode specifically for + * IvyBridge since we would need to prepare a SIMD8 payload that could + * use any channel, not just X. 
+ * + * 2) For Haswell+: Simply send a single write message but set the writemask + * on the dst of the SEND instruction to select the channels we want to + * write. It would require to modify the current messages to receive + * and honor the writemask provided. + */ + const vec4_builder bld = vec4_builder(this).at_end() + .annotate(current_annotation, base_ir); + + unsigned type_slots = nir_src_bit_size(instr->src[0]) / 32; + if (type_slots == 2) { + dst_reg tmp = dst_reg(this, glsl_type::dvec4_type); + shuffle_64bit_data(tmp, retype(val_reg, tmp.type), true); + val_reg = src_reg(retype(tmp, BRW_REGISTER_TYPE_F)); + } + + uint8_t swizzle[4] = { 0, 0, 0, 0}; + int num_channels = 0; + unsigned skipped_channels = 0; + int num_components = instr->num_components; + for (int i = 0; i < num_components; i++) { + /* Read components Z/W of a dvec from the appropriate place. We will + * also have to adjust the swizzle (we do that with the '% 4' below) + */ + if (i == 2 && type_slots == 2) + val_reg = byte_offset(val_reg, REG_SIZE); + + /* Check if this channel needs to be written. If so, record the + * channel we need to take the data from in the swizzle array + */ + int component_mask = 1 << i; + int write_test = write_mask & component_mask; + if (write_test) { + /* If we are writing doubles we have to write 2 channels worth of + * of data (64 bits) for each double component. + */ + swizzle[num_channels++] = (i * type_slots) % 4; + if (type_slots == 2) + swizzle[num_channels++] = (i * type_slots + 1) % 4; + } + + /* If we don't have to write this channel it means we have a gap in the + * vector, so write the channels we accumulated until now, if any. 
Do + * the same if this was the last component in the vector, if we have + * enough channels for a full vec4 write or if we have processed + * components XY of a dvec (since components ZW are not in the same + * SIMD register) + */ + if (!write_test || i == num_components - 1 || num_channels == 4 || + (i == 1 && type_slots == 2)) { + if (num_channels > 0) { + /* We have channels to write, so update the offset we need to + * write at to skip the channels we skipped, if any. + */ + if (skipped_channels > 0) { + if (offset_reg.file == IMM) { + offset_reg.ud += 4 * skipped_channels; + } else { + emit(ADD(dst_reg(offset_reg), offset_reg, + brw_imm_ud(4 * skipped_channels))); + } + } + + /* Swizzle the data register so we take the data from the channels + * we need to write and send the write message. This will write + * num_channels consecutive dwords starting at offset. + */ + val_reg.swizzle = + BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]); + emit_untyped_write(bld, surf_index, offset_reg, val_reg, + 1 /* dims */, num_channels /* size */, + BRW_PREDICATE_NONE); + + /* If we have to do a second write we will have to update the + * offset so that we jump over the channels we have just written + * now. 
+ */ + skipped_channels = num_channels; + + /* Restart the count for the next write message */ + num_channels = 0; + } + + /* If we didn't write the channel, increase skipped count */ + if (!write_test) + skipped_channels += type_slots; + } + } + + break; + } + + case nir_intrinsic_load_ssbo: { + assert(devinfo->gen >= 7); + + nir_const_value *const_uniform_block = + nir_src_as_const_value(instr->src[0]); + + src_reg surf_index; + if (const_uniform_block) { + unsigned index = prog_data->base.binding_table.ssbo_start + + const_uniform_block->u32[0]; + surf_index = brw_imm_ud(index); + + brw_mark_surface_used(&prog_data->base, index); + } else { + surf_index = src_reg(this, glsl_type::uint_type); + emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[0], 1), + brw_imm_ud(prog_data->base.binding_table.ssbo_start))); + surf_index = emit_uniformize(surf_index); + + /* Assume this may touch any UBO. It would be nice to provide + * a tighter bound, but the array information is already lowered away. 
+ */ + brw_mark_surface_used(&prog_data->base, + prog_data->base.binding_table.ssbo_start + + nir->info->num_ssbos - 1); + } + + src_reg offset_reg; + nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); + if (const_offset) { + offset_reg = brw_imm_ud(const_offset->u32[0]); + } else { + offset_reg = get_nir_src(instr->src[1], 1); + } + + /* Read the vector */ + const vec4_builder bld = vec4_builder(this).at_end() + .annotate(current_annotation, base_ir); + + src_reg read_result; + dst_reg dest = get_nir_dest(instr->dest); + if (type_sz(dest.type) < 8) { + read_result = emit_untyped_read(bld, surf_index, offset_reg, + 1 /* dims */, 4 /* size*/, + BRW_PREDICATE_NONE); + } else { + src_reg shuffled = src_reg(this, glsl_type::dvec4_type); + + src_reg temp; + temp = emit_untyped_read(bld, surf_index, offset_reg, + 1 /* dims */, 4 /* size*/, + BRW_PREDICATE_NONE); + emit(MOV(dst_reg(retype(shuffled, temp.type)), temp)); + + if (offset_reg.file == IMM) + offset_reg.ud += 16; + else + emit(ADD(dst_reg(offset_reg), offset_reg, brw_imm_ud(16))); + + temp = emit_untyped_read(bld, surf_index, offset_reg, + 1 /* dims */, 4 /* size*/, + BRW_PREDICATE_NONE); + emit(MOV(dst_reg(retype(byte_offset(shuffled, REG_SIZE), temp.type)), + temp)); + + read_result = src_reg(this, glsl_type::dvec4_type); + shuffle_64bit_data(dst_reg(read_result), shuffled, false); + } + + read_result.type = dest.type; + read_result.swizzle = brw_swizzle_for_size(instr->num_components); + emit(MOV(dest, read_result)); + break; + } + + case nir_intrinsic_ssbo_atomic_add: + nir_emit_ssbo_atomic(BRW_AOP_ADD, instr); + break; + case nir_intrinsic_ssbo_atomic_imin: + nir_emit_ssbo_atomic(BRW_AOP_IMIN, instr); + break; + case nir_intrinsic_ssbo_atomic_umin: + nir_emit_ssbo_atomic(BRW_AOP_UMIN, instr); + break; + case nir_intrinsic_ssbo_atomic_imax: + nir_emit_ssbo_atomic(BRW_AOP_IMAX, instr); + break; + case nir_intrinsic_ssbo_atomic_umax: + nir_emit_ssbo_atomic(BRW_AOP_UMAX, instr); + break; + 
case nir_intrinsic_ssbo_atomic_and: + nir_emit_ssbo_atomic(BRW_AOP_AND, instr); + break; + case nir_intrinsic_ssbo_atomic_or: + nir_emit_ssbo_atomic(BRW_AOP_OR, instr); + break; + case nir_intrinsic_ssbo_atomic_xor: + nir_emit_ssbo_atomic(BRW_AOP_XOR, instr); + break; + case nir_intrinsic_ssbo_atomic_exchange: + nir_emit_ssbo_atomic(BRW_AOP_MOV, instr); + break; + case nir_intrinsic_ssbo_atomic_comp_swap: + nir_emit_ssbo_atomic(BRW_AOP_CMPWR, instr); + break; + + case nir_intrinsic_load_vertex_id: + unreachable("should be lowered by lower_vertex_id()"); + + case nir_intrinsic_load_vertex_id_zero_base: + case nir_intrinsic_load_base_vertex: + case nir_intrinsic_load_instance_id: + case nir_intrinsic_load_base_instance: + case nir_intrinsic_load_draw_id: + case nir_intrinsic_load_invocation_id: { + gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); + src_reg val = src_reg(nir_system_values[sv]); + assert(val.file != BAD_FILE); + dest = get_nir_dest(instr->dest, val.type); + emit(MOV(dest, val)); + break; + } + + case nir_intrinsic_load_uniform: { + /* Offsets are in bytes but they should always be multiples of 4 */ + assert(nir_intrinsic_base(instr) % 4 == 0); + + dest = get_nir_dest(instr->dest); + + src = src_reg(dst_reg(UNIFORM, nir_intrinsic_base(instr) / 16)); + src.type = dest.type; + + /* Uniforms don't actually have to be vec4 aligned. In the case that + * it isn't, we have to use a swizzle to shift things around. They + * do still have the std140 alignment requirement that vec2's have to + * be vec2-aligned and vec3's and vec4's have to be vec4-aligned. + * + * The swizzle also works in the indirect case as the generator adds + * the swizzle to the offset for us. 
+ */ + unsigned shift = (nir_intrinsic_base(instr) % 16) / 4; + assert(shift + instr->num_components <= 4); + + nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); + if (const_offset) { + /* Offsets are in bytes but they should always be multiples of 4 */ + assert(const_offset->u32[0] % 4 == 0); + + unsigned offset = const_offset->u32[0] + shift * 4; + src.offset = ROUND_DOWN_TO(offset, 16); + shift = (offset % 16) / 4; + src.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift); + + emit(MOV(dest, src)); + } else { + src.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift); + + src_reg indirect = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1); + + /* MOV_INDIRECT is going to stomp the whole thing anyway */ + dest.writemask = WRITEMASK_XYZW; + + emit(SHADER_OPCODE_MOV_INDIRECT, dest, src, + indirect, brw_imm_ud(instr->const_index[1])); + } + break; + } + + case nir_intrinsic_atomic_counter_read: + case nir_intrinsic_atomic_counter_inc: + case nir_intrinsic_atomic_counter_dec: { + unsigned surf_index = prog_data->base.binding_table.abo_start + + (unsigned) instr->const_index[0]; + const vec4_builder bld = + vec4_builder(this).at_end().annotate(current_annotation, base_ir); + + /* Get some metadata from the image intrinsic. */ + const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; + + /* Get the arguments of the atomic intrinsic. */ + src_reg offset = get_nir_src(instr->src[0], nir_type_int32, + instr->num_components); + const src_reg surface = brw_imm_ud(surf_index); + const src_reg src0 = (info->num_srcs >= 2 + ? get_nir_src(instr->src[1]) : src_reg()); + const src_reg src1 = (info->num_srcs >= 3 + ? 
get_nir_src(instr->src[2]) : src_reg()); + + src_reg tmp; + + dest = get_nir_dest(instr->dest); + + if (instr->intrinsic == nir_intrinsic_atomic_counter_read) { + tmp = emit_untyped_read(bld, surface, offset, 1, 1); + } else { + tmp = emit_untyped_atomic(bld, surface, offset, + src0, src1, + 1, 1, + get_atomic_counter_op(instr->intrinsic)); + } + + bld.MOV(retype(dest, tmp.type), tmp); + brw_mark_surface_used(stage_prog_data, surf_index); + break; + } + + case nir_intrinsic_load_ubo: { + nir_const_value *const_block_index = nir_src_as_const_value(instr->src[0]); + src_reg surf_index; + + dest = get_nir_dest(instr->dest); + + if (const_block_index) { + /* The block index is a constant, so just emit the binding table entry + * as an immediate. + */ + const unsigned index = prog_data->base.binding_table.ubo_start + + const_block_index->u32[0]; + surf_index = brw_imm_ud(index); + brw_mark_surface_used(&prog_data->base, index); + } else { + /* The block index is not a constant. Evaluate the index expression + * per-channel and add the base UBO index; we have to select a value + * from any live channel. + */ + surf_index = src_reg(this, glsl_type::uint_type); + emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[0], nir_type_int32, + instr->num_components), + brw_imm_ud(prog_data->base.binding_table.ubo_start))); + surf_index = emit_uniformize(surf_index); + + /* Assume this may touch any UBO. It would be nice to provide + * a tighter bound, but the array information is already lowered away. 
+ */ + brw_mark_surface_used(&prog_data->base, + prog_data->base.binding_table.ubo_start + + nir->info->num_ubos - 1); + } + + src_reg offset_reg; + nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); + if (const_offset) { + offset_reg = brw_imm_ud(const_offset->u32[0] & ~15); + } else { + offset_reg = get_nir_src(instr->src[1], nir_type_uint32, 1); + } + + src_reg packed_consts; + if (nir_dest_bit_size(instr->dest) == 32) { + packed_consts = src_reg(this, glsl_type::vec4_type); + emit_pull_constant_load_reg(dst_reg(packed_consts), + surf_index, + offset_reg, + NULL, NULL /* before_block/inst */); + } else { + src_reg temp = src_reg(this, glsl_type::dvec4_type); + src_reg temp_float = retype(temp, BRW_REGISTER_TYPE_F); + + emit_pull_constant_load_reg(dst_reg(temp_float), + surf_index, offset_reg, NULL, NULL); + if (offset_reg.file == IMM) + offset_reg.ud += 16; + else + emit(ADD(dst_reg(offset_reg), offset_reg, brw_imm_ud(16u))); + emit_pull_constant_load_reg(dst_reg(byte_offset(temp_float, REG_SIZE)), + surf_index, offset_reg, NULL, NULL); + + packed_consts = src_reg(this, glsl_type::dvec4_type); + shuffle_64bit_data(dst_reg(packed_consts), temp, false); + } + + packed_consts.swizzle = brw_swizzle_for_size(instr->num_components); + if (const_offset) { + unsigned type_size = type_sz(dest.type); + packed_consts.swizzle += + BRW_SWIZZLE4(const_offset->u32[0] % 16 / type_size, + const_offset->u32[0] % 16 / type_size, + const_offset->u32[0] % 16 / type_size, + const_offset->u32[0] % 16 / type_size); + } + + emit(MOV(dest, retype(packed_consts, dest.type))); + + break; + } + + case nir_intrinsic_memory_barrier: { + const vec4_builder bld = + vec4_builder(this).at_end().annotate(current_annotation, base_ir); + const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); + bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp) + ->size_written = 2 * REG_SIZE; + break; + } + + case nir_intrinsic_shader_clock: { + /* We cannot do anything if there is an event, so ignore it 
for now */ + const src_reg shader_clock = get_timestamp(); + const enum brw_reg_type type = brw_type_for_base_type(glsl_type::uvec2_type); + + dest = get_nir_dest(instr->dest, type); + emit(MOV(dest, shader_clock)); + break; + } + + default: + unreachable("Unknown intrinsic"); + } +} + +void +vec4_visitor::nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr) +{ + dst_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_dest(instr->dest); + + src_reg surface; + nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]); + if (const_surface) { + unsigned surf_index = prog_data->base.binding_table.ssbo_start + + const_surface->u32[0]; + surface = brw_imm_ud(surf_index); + brw_mark_surface_used(&prog_data->base, surf_index); + } else { + surface = src_reg(this, glsl_type::uint_type); + emit(ADD(dst_reg(surface), get_nir_src(instr->src[0]), + brw_imm_ud(prog_data->base.binding_table.ssbo_start))); + + /* Assume this may touch any UBO. This is the same we do for other + * UBO/SSBO accesses with non-constant surface. 
+ */ + brw_mark_surface_used(&prog_data->base, + prog_data->base.binding_table.ssbo_start + + nir->info->num_ssbos - 1); + } + + src_reg offset = get_nir_src(instr->src[1], 1); + src_reg data1 = get_nir_src(instr->src[2], 1); + src_reg data2; + if (op == BRW_AOP_CMPWR) + data2 = get_nir_src(instr->src[3], 1); + + /* Emit the actual atomic operation operation */ + const vec4_builder bld = + vec4_builder(this).at_end().annotate(current_annotation, base_ir); + + src_reg atomic_result = emit_untyped_atomic(bld, surface, offset, + data1, data2, + 1 /* dims */, 1 /* rsize */, + op, + BRW_PREDICATE_NONE); + dest.type = atomic_result.type; + bld.MOV(dest, atomic_result); +} + +static unsigned +brw_swizzle_for_nir_swizzle(uint8_t swizzle[4]) +{ + return BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]); +} + +static enum brw_conditional_mod +brw_conditional_for_nir_comparison(nir_op op) +{ + switch (op) { + case nir_op_flt: + case nir_op_ilt: + case nir_op_ult: + return BRW_CONDITIONAL_L; + + case nir_op_fge: + case nir_op_ige: + case nir_op_uge: + return BRW_CONDITIONAL_GE; + + case nir_op_feq: + case nir_op_ieq: + case nir_op_ball_fequal2: + case nir_op_ball_iequal2: + case nir_op_ball_fequal3: + case nir_op_ball_iequal3: + case nir_op_ball_fequal4: + case nir_op_ball_iequal4: + return BRW_CONDITIONAL_Z; + + case nir_op_fne: + case nir_op_ine: + case nir_op_bany_fnequal2: + case nir_op_bany_inequal2: + case nir_op_bany_fnequal3: + case nir_op_bany_inequal3: + case nir_op_bany_fnequal4: + case nir_op_bany_inequal4: + return BRW_CONDITIONAL_NZ; + + default: + unreachable("not reached: bad operation for comparison"); + } +} + +bool +vec4_visitor::optimize_predicate(nir_alu_instr *instr, + enum brw_predicate *predicate) +{ + if (!instr->src[0].src.is_ssa || + instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu) + return false; + + nir_alu_instr *cmp_instr = + nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); + + switch (cmp_instr->op) { + case 
nir_op_bany_fnequal2: + case nir_op_bany_inequal2: + case nir_op_bany_fnequal3: + case nir_op_bany_inequal3: + case nir_op_bany_fnequal4: + case nir_op_bany_inequal4: + *predicate = BRW_PREDICATE_ALIGN16_ANY4H; + break; + case nir_op_ball_fequal2: + case nir_op_ball_iequal2: + case nir_op_ball_fequal3: + case nir_op_ball_iequal3: + case nir_op_ball_fequal4: + case nir_op_ball_iequal4: + *predicate = BRW_PREDICATE_ALIGN16_ALL4H; + break; + default: + return false; + } + + unsigned size_swizzle = + brw_swizzle_for_size(nir_op_infos[cmp_instr->op].input_sizes[0]); + + src_reg op[2]; + assert(nir_op_infos[cmp_instr->op].num_inputs == 2); + for (unsigned i = 0; i < 2; i++) { + nir_alu_type type = nir_op_infos[cmp_instr->op].input_types[i]; + unsigned bit_size = nir_src_bit_size(cmp_instr->src[i].src); + type = (nir_alu_type) (((unsigned) type) | bit_size); + op[i] = get_nir_src(cmp_instr->src[i].src, type, 4); + unsigned base_swizzle = + brw_swizzle_for_nir_swizzle(cmp_instr->src[i].swizzle); + op[i].swizzle = brw_compose_swizzle(size_swizzle, base_swizzle); + op[i].abs = cmp_instr->src[i].abs; + op[i].negate = cmp_instr->src[i].negate; + } + + emit(CMP(dst_null_d(), op[0], op[1], + brw_conditional_for_nir_comparison(cmp_instr->op))); + + return true; +} + +static void +emit_find_msb_using_lzd(const vec4_builder &bld, + const dst_reg &dst, + const src_reg &src, + bool is_signed) +{ + vec4_instruction *inst; + src_reg temp = src; + + if (is_signed) { + /* LZD of an absolute value source almost always does the right + * thing. There are two problem values: + * + * * 0x80000000. Since abs(0x80000000) == 0x80000000, LZD returns + * 0. However, findMSB(int(0x80000000)) == 30. + * + * * 0xffffffff. Since abs(0xffffffff) == 1, LZD returns + * 31. Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: + * + * For a value of zero or negative one, -1 will be returned. + * + * * Negative powers of two. LZD(abs(-(1<<x))) returns x, but + * findMSB(-(1<<x)) should return x-1. 
+ * + * For all negative number cases, including 0x80000000 and + * 0xffffffff, the correct value is obtained from LZD if instead of + * negating the (already negative) value the logical-not is used. A + * conditonal logical-not can be achieved in two instructions. + */ + temp = src_reg(bld.vgrf(BRW_REGISTER_TYPE_D)); + + bld.ASR(dst_reg(temp), src, brw_imm_d(31)); + bld.XOR(dst_reg(temp), temp, src); + } + + bld.LZD(retype(dst, BRW_REGISTER_TYPE_UD), + retype(temp, BRW_REGISTER_TYPE_UD)); + + /* LZD counts from the MSB side, while GLSL's findMSB() wants the count + * from the LSB side. Subtract the result from 31 to convert the MSB count + * into an LSB count. If no bits are set, LZD will return 32. 31-32 = -1, + * which is exactly what findMSB() is supposed to return. + */ + inst = bld.ADD(dst, retype(src_reg(dst), BRW_REGISTER_TYPE_D), + brw_imm_d(31)); + inst->src[0].negate = true; +} + +void +vec4_visitor::emit_conversion_from_double(dst_reg dst, src_reg src, + bool saturate, + brw_reg_type single_type) +{ + /* BDW PRM vol 15 - workarounds: + * DF->f format conversion for Align16 has wrong emask calculation when + * source is immediate. 
+ */ + if (devinfo->gen == 8 && single_type == BRW_REGISTER_TYPE_F && + src.file == BRW_IMMEDIATE_VALUE) { + vec4_instruction *inst = emit(MOV(dst, brw_imm_f(src.df))); + inst->saturate = saturate; + return; + } + + dst_reg temp = dst_reg(this, glsl_type::dvec4_type); + emit(MOV(temp, src)); + + dst_reg temp2 = dst_reg(this, glsl_type::dvec4_type); + temp2 = retype(temp2, single_type); + emit(VEC4_OPCODE_FROM_DOUBLE, temp2, src_reg(temp)) + ->size_written = 2 * REG_SIZE; + + vec4_instruction *inst = emit(MOV(dst, src_reg(temp2))); + inst->saturate = saturate; +} + +void +vec4_visitor::emit_conversion_to_double(dst_reg dst, src_reg src, + bool saturate, + brw_reg_type single_type) +{ + dst_reg tmp_dst = dst_reg(src_reg(this, glsl_type::dvec4_type)); + src_reg tmp_src = retype(src_reg(this, glsl_type::vec4_type), single_type); + emit(MOV(dst_reg(tmp_src), retype(src, single_type))); + emit(VEC4_OPCODE_TO_DOUBLE, tmp_dst, tmp_src); + vec4_instruction *inst = emit(MOV(dst, src_reg(tmp_dst))); + inst->saturate = saturate; +} + +src_reg +vec4_visitor::setup_imm_df(double v) +{ + assert(devinfo->gen >= 7); + + if (devinfo->gen >= 8) + return brw_imm_df(v); + + /* gen7.5 does not support DF immediates straighforward but the DIM + * instruction allows to set the 64-bit immediate value. + */ + if (devinfo->is_haswell) { + dst_reg dst = retype(dst_reg(VGRF, alloc.allocate(2)), BRW_REGISTER_TYPE_DF); + emit(DIM(dst, brw_imm_df(v)))->force_writemask_all = true; + return swizzle(src_reg(retype(dst, BRW_REGISTER_TYPE_DF)), BRW_SWIZZLE_XXXX); + } + + /* gen7 does not support DF immediates */ + union { + double d; + struct { + uint32_t i1; + uint32_t i2; + }; + } di; + + di.d = v; + + /* Write the low 32-bit of the constant to the X:UD channel and the + * high 32-bit to the Y:UD channel to build the constant in a VGRF. + * We have to do this twice (offset 0 and offset 1), since a DF VGRF takes + * two SIMD8 registers in SIMD4x2 execution. 
Finally, return a swizzle + * XXXX so any access to the VGRF only reads the constant data in these + * channels. + */ + const dst_reg tmp = + retype(dst_reg(VGRF, alloc.allocate(2)), BRW_REGISTER_TYPE_UD); + for (int n = 0; n < 2; n++) { + emit(MOV(writemask(offset(tmp, 8, n), WRITEMASK_X), brw_imm_ud(di.i1))) + ->force_writemask_all = true; + emit(MOV(writemask(offset(tmp, 8, n), WRITEMASK_Y), brw_imm_ud(di.i2))) + ->force_writemask_all = true; + } + + return swizzle(src_reg(retype(tmp, BRW_REGISTER_TYPE_DF)), BRW_SWIZZLE_XXXX); +} + +void +vec4_visitor::nir_emit_alu(nir_alu_instr *instr) +{ + vec4_instruction *inst; + + nir_alu_type dst_type = (nir_alu_type) (nir_op_infos[instr->op].output_type | + nir_dest_bit_size(instr->dest.dest)); + dst_reg dst = get_nir_dest(instr->dest.dest, dst_type); + dst.writemask = instr->dest.write_mask; + + src_reg op[4]; + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { + nir_alu_type src_type = (nir_alu_type) + (nir_op_infos[instr->op].input_types[i] | + nir_src_bit_size(instr->src[i].src)); + op[i] = get_nir_src(instr->src[i].src, src_type, 4); + op[i].swizzle = brw_swizzle_for_nir_swizzle(instr->src[i].swizzle); + op[i].abs = instr->src[i].abs; + op[i].negate = instr->src[i].negate; + } + + switch (instr->op) { + case nir_op_imov: + case nir_op_fmov: + inst = emit(MOV(dst, op[0])); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: + unreachable("not reached: should be handled by lower_vec_to_movs()"); + + case nir_op_i2f: + case nir_op_u2f: + inst = emit(MOV(dst, op[0])); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_f2i: + case nir_op_f2u: + inst = emit(MOV(dst, op[0])); + break; + + case nir_op_d2f: + emit_conversion_from_double(dst, op[0], instr->dest.saturate, + BRW_REGISTER_TYPE_F); + break; + + case nir_op_f2d: + emit_conversion_to_double(dst, op[0], instr->dest.saturate, + BRW_REGISTER_TYPE_F); + break; + + case 
nir_op_d2i: + case nir_op_d2u: + emit_conversion_from_double(dst, op[0], instr->dest.saturate, + instr->op == nir_op_d2i ? BRW_REGISTER_TYPE_D : + BRW_REGISTER_TYPE_UD); + break; + + case nir_op_i2d: + case nir_op_u2d: + emit_conversion_to_double(dst, op[0], instr->dest.saturate, + instr->op == nir_op_i2d ? BRW_REGISTER_TYPE_D : + BRW_REGISTER_TYPE_UD); + break; + + case nir_op_iadd: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + case nir_op_fadd: + inst = emit(ADD(dst, op[0], op[1])); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fmul: + inst = emit(MUL(dst, op[0], op[1])); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_imul: { + assert(nir_dest_bit_size(instr->dest.dest) < 64); + if (devinfo->gen < 8) { + nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src); + nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src); + + /* For integer multiplication, the MUL uses the low 16 bits of one of + * the operands (src0 through SNB, src1 on IVB and later). The MACH + * accumulates in the contribution of the upper 16 bits of that + * operand. If we can determine that one of the args is in the low + * 16 bits, though, we can just emit a single MUL. 
+ */ + if (value0 && value0->u32[0] < (1 << 16)) { + if (devinfo->gen < 7) + emit(MUL(dst, op[0], op[1])); + else + emit(MUL(dst, op[1], op[0])); + } else if (value1 && value1->u32[0] < (1 << 16)) { + if (devinfo->gen < 7) + emit(MUL(dst, op[1], op[0])); + else + emit(MUL(dst, op[0], op[1])); + } else { + struct brw_reg acc = retype(brw_acc_reg(8), dst.type); + + emit(MUL(acc, op[0], op[1])); + emit(MACH(dst_null_d(), op[0], op[1])); + emit(MOV(dst, src_reg(acc))); + } + } else { + emit(MUL(dst, op[0], op[1])); + } + break; + } + + case nir_op_imul_high: + case nir_op_umul_high: { + assert(nir_dest_bit_size(instr->dest.dest) < 64); + struct brw_reg acc = retype(brw_acc_reg(8), dst.type); + + if (devinfo->gen >= 8) + emit(MUL(acc, op[0], retype(op[1], BRW_REGISTER_TYPE_UW))); + else + emit(MUL(acc, op[0], op[1])); + + emit(MACH(dst, op[0], op[1])); + break; + } + + case nir_op_frcp: + inst = emit_math(SHADER_OPCODE_RCP, dst, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fexp2: + inst = emit_math(SHADER_OPCODE_EXP2, dst, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_flog2: + inst = emit_math(SHADER_OPCODE_LOG2, dst, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fsin: + inst = emit_math(SHADER_OPCODE_SIN, dst, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fcos: + inst = emit_math(SHADER_OPCODE_COS, dst, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_idiv: + case nir_op_udiv: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + emit_math(SHADER_OPCODE_INT_QUOTIENT, dst, op[0], op[1]); + break; + + case nir_op_umod: + case nir_op_irem: + /* According to the sign table for INT DIV in the Ivy Bridge PRM, it + * appears that our hardware just does the right thing for signed + * remainder. 
+ */ + assert(nir_dest_bit_size(instr->dest.dest) < 64); + emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]); + break; + + case nir_op_imod: { + /* Get a regular C-style remainder. If a % b == 0, set the predicate. */ + inst = emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]); + + /* Math instructions don't support conditional mod */ + inst = emit(MOV(dst_null_d(), src_reg(dst))); + inst->conditional_mod = BRW_CONDITIONAL_NZ; + + /* Now, we need to determine if signs of the sources are different. + * When we XOR the sources, the top bit is 0 if they are the same and 1 + * if they are different. We can then use a conditional modifier to + * turn that into a predicate. This leads us to an XOR.l instruction. + * + * Technically, according to the PRM, you're not allowed to use .l on a + * XOR instruction. However, emperical experiments and Curro's reading + * of the simulator source both indicate that it's safe. + */ + src_reg tmp = src_reg(this, glsl_type::ivec4_type); + inst = emit(XOR(dst_reg(tmp), op[0], op[1])); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->conditional_mod = BRW_CONDITIONAL_L; + + /* If the result of the initial remainder operation is non-zero and the + * two sources have different signs, add in a copy of op[1] to get the + * final integer modulus value. 
+ */ + inst = emit(ADD(dst, src_reg(dst), op[1])); + inst->predicate = BRW_PREDICATE_NORMAL; + break; + } + + case nir_op_ldexp: + unreachable("not reached: should be handled by ldexp_to_arith()"); + + case nir_op_fsqrt: + inst = emit_math(SHADER_OPCODE_SQRT, dst, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_frsq: + inst = emit_math(SHADER_OPCODE_RSQ, dst, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fpow: + inst = emit_math(SHADER_OPCODE_POW, dst, op[0], op[1]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_uadd_carry: { + assert(nir_dest_bit_size(instr->dest.dest) < 64); + struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD); + + emit(ADDC(dst_null_ud(), op[0], op[1])); + emit(MOV(dst, src_reg(acc))); + break; + } + + case nir_op_usub_borrow: { + assert(nir_dest_bit_size(instr->dest.dest) < 64); + struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD); + + emit(SUBB(dst_null_ud(), op[0], op[1])); + emit(MOV(dst, src_reg(acc))); + break; + } + + case nir_op_ftrunc: + inst = emit(RNDZ(dst, op[0])); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fceil: { + src_reg tmp = src_reg(this, glsl_type::float_type); + tmp.swizzle = + brw_swizzle_for_size(instr->src[0].src.is_ssa ? 
+ instr->src[0].src.ssa->num_components : + instr->src[0].src.reg.reg->num_components); + + op[0].negate = !op[0].negate; + emit(RNDD(dst_reg(tmp), op[0])); + tmp.negate = true; + inst = emit(MOV(dst, tmp)); + inst->saturate = instr->dest.saturate; + break; + } + + case nir_op_ffloor: + inst = emit(RNDD(dst, op[0])); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_ffract: + inst = emit(FRC(dst, op[0])); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fround_even: + inst = emit(RNDE(dst, op[0])); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fquantize2f16: { + /* See also vec4_visitor::emit_pack_half_2x16() */ + src_reg tmp16 = src_reg(this, glsl_type::uvec4_type); + src_reg tmp32 = src_reg(this, glsl_type::vec4_type); + src_reg zero = src_reg(this, glsl_type::vec4_type); + + /* Check for denormal */ + src_reg abs_src0 = op[0]; + abs_src0.abs = true; + emit(CMP(dst_null_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)), + BRW_CONDITIONAL_L)); + /* Get the appropriately signed zero */ + emit(AND(retype(dst_reg(zero), BRW_REGISTER_TYPE_UD), + retype(op[0], BRW_REGISTER_TYPE_UD), + brw_imm_ud(0x80000000))); + /* Do the actual F32 -> F16 -> F32 conversion */ + emit(F32TO16(dst_reg(tmp16), op[0])); + emit(F16TO32(dst_reg(tmp32), tmp16)); + /* Select that or zero based on normal status */ + inst = emit(BRW_OPCODE_SEL, dst, zero, tmp32); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->saturate = instr->dest.saturate; + break; + } + + case nir_op_imin: + case nir_op_umin: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + case nir_op_fmin: + inst = emit_minmax(BRW_CONDITIONAL_L, dst, op[0], op[1]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_imax: + case nir_op_umax: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + case nir_op_fmax: + inst = emit_minmax(BRW_CONDITIONAL_GE, dst, op[0], op[1]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fddx: + case nir_op_fddx_coarse: + 
case nir_op_fddx_fine: + case nir_op_fddy: + case nir_op_fddy_coarse: + case nir_op_fddy_fine: + unreachable("derivatives are not valid in vertex shaders"); + + case nir_op_ilt: + case nir_op_ult: + case nir_op_ige: + case nir_op_uge: + case nir_op_ieq: + case nir_op_ine: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + /* Fallthrough */ + case nir_op_flt: + case nir_op_fge: + case nir_op_feq: + case nir_op_fne: { + enum brw_conditional_mod conditional_mod = + brw_conditional_for_nir_comparison(instr->op); + + if (nir_src_bit_size(instr->src[0].src) < 64) { + emit(CMP(dst, op[0], op[1], conditional_mod)); + } else { + /* Produce a 32-bit boolean result from the DF comparison by selecting + * only the low 32-bit in each DF produced. Do this in a temporary + * so we can then move from there to the result using align16 again + * to honor the original writemask. + */ + dst_reg temp = dst_reg(this, glsl_type::dvec4_type); + emit(CMP(temp, op[0], op[1], conditional_mod)); + dst_reg result = dst_reg(this, glsl_type::bvec4_type); + emit(VEC4_OPCODE_PICK_LOW_32BIT, result, src_reg(temp)); + emit(MOV(dst, src_reg(result))); + } + break; + } + + case nir_op_ball_iequal2: + case nir_op_ball_iequal3: + case nir_op_ball_iequal4: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + /* Fallthrough */ + case nir_op_ball_fequal2: + case nir_op_ball_fequal3: + case nir_op_ball_fequal4: { + unsigned swiz = + brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]); + + emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz), + brw_conditional_for_nir_comparison(instr->op))); + emit(MOV(dst, brw_imm_d(0))); + inst = emit(MOV(dst, brw_imm_d(~0))); + inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H; + break; + } + + case nir_op_bany_inequal2: + case nir_op_bany_inequal3: + case nir_op_bany_inequal4: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + /* Fallthrough */ + case nir_op_bany_fnequal2: + case nir_op_bany_fnequal3: + case nir_op_bany_fnequal4: { + unsigned 
swiz = + brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]); + + emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz), + brw_conditional_for_nir_comparison(instr->op))); + + emit(MOV(dst, brw_imm_d(0))); + inst = emit(MOV(dst, brw_imm_d(~0))); + inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H; + break; + } + + case nir_op_inot: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + if (devinfo->gen >= 8) { + op[0] = resolve_source_modifiers(op[0]); + } + emit(NOT(dst, op[0])); + break; + + case nir_op_ixor: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + if (devinfo->gen >= 8) { + op[0] = resolve_source_modifiers(op[0]); + op[1] = resolve_source_modifiers(op[1]); + } + emit(XOR(dst, op[0], op[1])); + break; + + case nir_op_ior: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + if (devinfo->gen >= 8) { + op[0] = resolve_source_modifiers(op[0]); + op[1] = resolve_source_modifiers(op[1]); + } + emit(OR(dst, op[0], op[1])); + break; + + case nir_op_iand: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + if (devinfo->gen >= 8) { + op[0] = resolve_source_modifiers(op[0]); + op[1] = resolve_source_modifiers(op[1]); + } + emit(AND(dst, op[0], op[1])); + break; + + case nir_op_b2i: + case nir_op_b2f: + emit(MOV(dst, negate(op[0]))); + break; + + case nir_op_f2b: + emit(CMP(dst, op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ)); + break; + + case nir_op_d2b: { + /* We use a MOV with conditional_mod to check if the provided value is + * 0.0. We want this to flush denormalized numbers to zero, so we set a + * source modifier on the source operand to trigger this, as source + * modifiers don't affect the result of the testing against 0.0. 
+ */ + src_reg value = op[0]; + value.abs = true; + vec4_instruction *inst = emit(MOV(dst_null_df(), value)); + inst->conditional_mod = BRW_CONDITIONAL_NZ; + + src_reg one = src_reg(this, glsl_type::ivec4_type); + emit(MOV(dst_reg(one), brw_imm_d(~0))); + inst = emit(BRW_OPCODE_SEL, dst, one, brw_imm_d(0)); + inst->predicate = BRW_PREDICATE_NORMAL; + break; + } + + case nir_op_i2b: + emit(CMP(dst, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ)); + break; + + case nir_op_fnoise1_1: + case nir_op_fnoise1_2: + case nir_op_fnoise1_3: + case nir_op_fnoise1_4: + case nir_op_fnoise2_1: + case nir_op_fnoise2_2: + case nir_op_fnoise2_3: + case nir_op_fnoise2_4: + case nir_op_fnoise3_1: + case nir_op_fnoise3_2: + case nir_op_fnoise3_3: + case nir_op_fnoise3_4: + case nir_op_fnoise4_1: + case nir_op_fnoise4_2: + case nir_op_fnoise4_3: + case nir_op_fnoise4_4: + unreachable("not reached: should be handled by lower_noise"); + + case nir_op_unpack_half_2x16_split_x: + case nir_op_unpack_half_2x16_split_y: + case nir_op_pack_half_2x16_split: + unreachable("not reached: should not occur in vertex shader"); + + case nir_op_unpack_snorm_2x16: + case nir_op_unpack_unorm_2x16: + case nir_op_pack_snorm_2x16: + case nir_op_pack_unorm_2x16: + unreachable("not reached: should be handled by lower_packing_builtins"); + + case nir_op_pack_uvec4_to_uint: + unreachable("not reached"); + + case nir_op_pack_uvec2_to_uint: { + dst_reg tmp1 = dst_reg(this, glsl_type::uint_type); + tmp1.writemask = WRITEMASK_X; + op[0].swizzle = BRW_SWIZZLE_YYYY; + emit(SHL(tmp1, op[0], src_reg(brw_imm_ud(16u)))); + + dst_reg tmp2 = dst_reg(this, glsl_type::uint_type); + tmp2.writemask = WRITEMASK_X; + op[0].swizzle = BRW_SWIZZLE_XXXX; + emit(AND(tmp2, op[0], src_reg(brw_imm_ud(0xffffu)))); + + emit(OR(dst, src_reg(tmp1), src_reg(tmp2))); + break; + } + + case nir_op_pack_64_2x32_split: { + dst_reg result = dst_reg(this, glsl_type::dvec4_type); + dst_reg tmp = dst_reg(this, glsl_type::uvec4_type); + emit(MOV(tmp, 
retype(op[0], BRW_REGISTER_TYPE_UD))); + emit(VEC4_OPCODE_SET_LOW_32BIT, result, src_reg(tmp)); + emit(MOV(tmp, retype(op[1], BRW_REGISTER_TYPE_UD))); + emit(VEC4_OPCODE_SET_HIGH_32BIT, result, src_reg(tmp)); + emit(MOV(dst, src_reg(result))); + break; + } + + case nir_op_unpack_64_2x32_split_x: + case nir_op_unpack_64_2x32_split_y: { + enum opcode oper = (instr->op == nir_op_unpack_64_2x32_split_x) ? + VEC4_OPCODE_PICK_LOW_32BIT : VEC4_OPCODE_PICK_HIGH_32BIT; + dst_reg tmp = dst_reg(this, glsl_type::dvec4_type); + emit(MOV(tmp, op[0])); + dst_reg tmp2 = dst_reg(this, glsl_type::uvec4_type); + emit(oper, tmp2, src_reg(tmp)); + emit(MOV(dst, src_reg(tmp2))); + break; + } + + case nir_op_unpack_half_2x16: + /* As NIR does not guarantee that we have a correct swizzle outside the + * boundaries of a vector, and the implementation of emit_unpack_half_2x16 + * uses the source operand in an operation with WRITEMASK_Y while our + * source operand has only size 1, it accessed incorrect data producing + * regressions in Piglit. We repeat the swizzle of the first component on the + * rest of components to avoid regressions. In the vec4_visitor IR code path + * this is not needed because the operand has already the correct swizzle. 
+ */ + op[0].swizzle = brw_compose_swizzle(BRW_SWIZZLE_XXXX, op[0].swizzle); + emit_unpack_half_2x16(dst, op[0]); + break; + + case nir_op_pack_half_2x16: + emit_pack_half_2x16(dst, op[0]); + break; + + case nir_op_unpack_unorm_4x8: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + emit_unpack_unorm_4x8(dst, op[0]); + break; + + case nir_op_pack_unorm_4x8: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + emit_pack_unorm_4x8(dst, op[0]); + break; + + case nir_op_unpack_snorm_4x8: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + emit_unpack_snorm_4x8(dst, op[0]); + break; + + case nir_op_pack_snorm_4x8: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + emit_pack_snorm_4x8(dst, op[0]); + break; + + case nir_op_bitfield_reverse: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + emit(BFREV(dst, op[0])); + break; + + case nir_op_bit_count: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + emit(CBIT(dst, op[0])); + break; + + case nir_op_ufind_msb: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + emit_find_msb_using_lzd(vec4_builder(this).at_end(), dst, op[0], false); + break; + + case nir_op_ifind_msb: { + assert(nir_dest_bit_size(instr->dest.dest) < 64); + vec4_builder bld = vec4_builder(this).at_end(); + src_reg src(dst); + + if (devinfo->gen < 7) { + emit_find_msb_using_lzd(bld, dst, op[0], true); + } else { + emit(FBH(retype(dst, BRW_REGISTER_TYPE_UD), op[0])); + + /* FBH counts from the MSB side, while GLSL's findMSB() wants the + * count from the LSB side. If FBH didn't return an error + * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB + * count into an LSB count. 
+ */ + bld.CMP(dst_null_d(), src, brw_imm_d(-1), BRW_CONDITIONAL_NZ); + + inst = bld.ADD(dst, src, brw_imm_d(31)); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->src[0].negate = true; + } + break; + } + + case nir_op_find_lsb: { + assert(nir_dest_bit_size(instr->dest.dest) < 64); + vec4_builder bld = vec4_builder(this).at_end(); + + if (devinfo->gen < 7) { + dst_reg temp = bld.vgrf(BRW_REGISTER_TYPE_D); + + /* (x & -x) generates a value that consists of only the LSB of x. + * For all powers of 2, findMSB(y) == findLSB(y). + */ + src_reg src = src_reg(retype(op[0], BRW_REGISTER_TYPE_D)); + src_reg negated_src = src; + + /* One must be negated, and the other must be non-negated. It + * doesn't matter which is which. + */ + negated_src.negate = true; + src.negate = false; + + bld.AND(temp, src, negated_src); + emit_find_msb_using_lzd(bld, dst, src_reg(temp), false); + } else { + bld.FBL(dst, op[0]); + } + break; + } + + case nir_op_ubitfield_extract: + case nir_op_ibitfield_extract: + unreachable("should have been lowered"); + case nir_op_ubfe: + case nir_op_ibfe: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + op[0] = fix_3src_operand(op[0]); + op[1] = fix_3src_operand(op[1]); + op[2] = fix_3src_operand(op[2]); + + emit(BFE(dst, op[2], op[1], op[0])); + break; + + case nir_op_bfm: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + emit(BFI1(dst, op[0], op[1])); + break; + + case nir_op_bfi: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + op[0] = fix_3src_operand(op[0]); + op[1] = fix_3src_operand(op[1]); + op[2] = fix_3src_operand(op[2]); + + emit(BFI2(dst, op[0], op[1], op[2])); + break; + + case nir_op_bitfield_insert: + unreachable("not reached: should have been lowered"); + + case nir_op_fsign: + if (type_sz(op[0].type) < 8) { + /* AND(val, 0x80000000) gives the sign bit. + * + * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not + * zero. 
+ */ + emit(CMP(dst_null_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ)); + + op[0].type = BRW_REGISTER_TYPE_UD; + dst.type = BRW_REGISTER_TYPE_UD; + emit(AND(dst, op[0], brw_imm_ud(0x80000000u))); + + inst = emit(OR(dst, src_reg(dst), brw_imm_ud(0x3f800000u))); + inst->predicate = BRW_PREDICATE_NORMAL; + dst.type = BRW_REGISTER_TYPE_F; + + if (instr->dest.saturate) { + inst = emit(MOV(dst, src_reg(dst))); + inst->saturate = true; + } + } else { + /* For doubles we do the same but we need to consider: + * + * - We use a MOV with conditional_mod instead of a CMP so that we can + * skip loading a 0.0 immediate. We use a source modifier on the + * source of the MOV so that we flush denormalized values to 0. + * Since we want to compare against 0, this won't alter the result. + * - We need to extract the high 32-bit of each DF where the sign + * is stored. + * - We need to produce a DF result. + */ + + /* Check for zero */ + src_reg value = op[0]; + value.abs = true; + inst = emit(MOV(dst_null_df(), value)); + inst->conditional_mod = BRW_CONDITIONAL_NZ; + + /* AND each high 32-bit channel with 0x80000000u */ + dst_reg tmp = dst_reg(this, glsl_type::uvec4_type); + emit(VEC4_OPCODE_PICK_HIGH_32BIT, tmp, op[0]); + emit(AND(tmp, src_reg(tmp), brw_imm_ud(0x80000000u))); + + /* Add 1.0 to each channel, predicated to skip the cases where the + * channel's value was 0 + */ + inst = emit(OR(tmp, src_reg(tmp), brw_imm_ud(0x3f800000u))); + inst->predicate = BRW_PREDICATE_NORMAL; + + /* Now convert the result from float to double */ + emit_conversion_to_double(dst, src_reg(tmp), instr->dest.saturate, + BRW_REGISTER_TYPE_F); + } + break; + + case nir_op_isign: + /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1). + * -> non-negative val generates 0x00000000. + * Predicated OR sets 1 if val is positive. 
+ */ + assert(nir_dest_bit_size(instr->dest.dest) < 64); + emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_G)); + emit(ASR(dst, op[0], brw_imm_d(31))); + inst = emit(OR(dst, src_reg(dst), brw_imm_d(1))); + inst->predicate = BRW_PREDICATE_NORMAL; + break; + + case nir_op_ishl: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + emit(SHL(dst, op[0], op[1])); + break; + + case nir_op_ishr: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + emit(ASR(dst, op[0], op[1])); + break; + + case nir_op_ushr: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + emit(SHR(dst, op[0], op[1])); + break; + + case nir_op_ffma: + if (type_sz(dst.type) == 8) { + dst_reg mul_dst = dst_reg(this, glsl_type::dvec4_type); + emit(MUL(mul_dst, op[1], op[0])); + inst = emit(ADD(dst, src_reg(mul_dst), op[2])); + inst->saturate = instr->dest.saturate; + } else { + op[0] = fix_3src_operand(op[0]); + op[1] = fix_3src_operand(op[1]); + op[2] = fix_3src_operand(op[2]); + + inst = emit(MAD(dst, op[2], op[1], op[0])); + inst->saturate = instr->dest.saturate; + } + break; + + case nir_op_flrp: + inst = emit_lrp(dst, op[0], op[1], op[2]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_bcsel: + enum brw_predicate predicate; + if (!optimize_predicate(instr, &predicate)) { + emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ)); + switch (dst.writemask) { + case WRITEMASK_X: + predicate = BRW_PREDICATE_ALIGN16_REPLICATE_X; + break; + case WRITEMASK_Y: + predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Y; + break; + case WRITEMASK_Z: + predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Z; + break; + case WRITEMASK_W: + predicate = BRW_PREDICATE_ALIGN16_REPLICATE_W; + break; + default: + predicate = BRW_PREDICATE_NORMAL; + break; + } + } + inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]); + inst->predicate = predicate; + break; + + case nir_op_fdot_replicated2: + inst = emit(BRW_OPCODE_DP2, dst, op[0], op[1]); + inst->saturate = instr->dest.saturate; + break; + + case 
nir_op_fdot_replicated3: + inst = emit(BRW_OPCODE_DP3, dst, op[0], op[1]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fdot_replicated4: + inst = emit(BRW_OPCODE_DP4, dst, op[0], op[1]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_fdph_replicated: + inst = emit(BRW_OPCODE_DPH, dst, op[0], op[1]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_iabs: + case nir_op_ineg: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + case nir_op_fabs: + case nir_op_fneg: + case nir_op_fsat: + unreachable("not reached: should be lowered by lower_source mods"); + + case nir_op_fdiv: + unreachable("not reached: should be lowered by DIV_TO_MUL_RCP in the compiler"); + + case nir_op_fmod: + unreachable("not reached: should be lowered by MOD_TO_FLOOR in the compiler"); + + case nir_op_fsub: + case nir_op_isub: + unreachable("not reached: should be handled by ir_sub_to_add_neg"); + + default: + unreachable("Unimplemented ALU operation"); + } + + /* If we need to do a boolean resolve, replace the result with -(x & 1) + * to sign extend the low bit to 0/~0 + */ + if (devinfo->gen <= 5 && + (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == + BRW_NIR_BOOLEAN_NEEDS_RESOLVE) { + dst_reg masked = dst_reg(this, glsl_type::int_type); + masked.writemask = dst.writemask; + emit(AND(masked, src_reg(dst), brw_imm_d(1))); + src_reg masked_neg = src_reg(masked); + masked_neg.negate = true; + emit(MOV(retype(dst, BRW_REGISTER_TYPE_D), masked_neg)); + } +} + +void +vec4_visitor::nir_emit_jump(nir_jump_instr *instr) +{ + switch (instr->type) { + case nir_jump_break: + emit(BRW_OPCODE_BREAK); + break; + + case nir_jump_continue: + emit(BRW_OPCODE_CONTINUE); + break; + + case nir_jump_return: + /* fall through */ + default: + unreachable("unknown jump"); + } +} + +enum ir_texture_opcode +ir_texture_opcode_for_nir_texop(nir_texop texop) +{ + enum ir_texture_opcode op; + + switch (texop) { + case nir_texop_lod: op = ir_lod; break; + case 
nir_texop_query_levels: op = ir_query_levels; break; + case nir_texop_texture_samples: op = ir_texture_samples; break; + case nir_texop_tex: op = ir_tex; break; + case nir_texop_tg4: op = ir_tg4; break; + case nir_texop_txb: op = ir_txb; break; + case nir_texop_txd: op = ir_txd; break; + case nir_texop_txf: op = ir_txf; break; + case nir_texop_txf_ms: op = ir_txf_ms; break; + case nir_texop_txl: op = ir_txl; break; + case nir_texop_txs: op = ir_txs; break; + case nir_texop_samples_identical: op = ir_samples_identical; break; + default: + unreachable("unknown texture opcode"); + } + + return op; +} +const glsl_type * +glsl_type_for_nir_alu_type(nir_alu_type alu_type, + unsigned components) +{ + return glsl_type::get_instance(brw_glsl_base_type_for_nir_type(alu_type), + components, 1); +} + +void +vec4_visitor::nir_emit_texture(nir_tex_instr *instr) +{ + unsigned texture = instr->texture_index; + unsigned sampler = instr->sampler_index; + src_reg texture_reg = brw_imm_ud(texture); + src_reg sampler_reg = brw_imm_ud(sampler); + src_reg coordinate; + const glsl_type *coord_type = NULL; + src_reg shadow_comparator; + src_reg offset_value; + src_reg lod, lod2; + src_reg sample_index; + src_reg mcs; + + const glsl_type *dest_type = + glsl_type_for_nir_alu_type(instr->dest_type, + nir_tex_instr_dest_size(instr)); + dst_reg dest = get_nir_dest(instr->dest, instr->dest_type); + + /* The hardware requires a LOD for buffer textures */ + if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) + lod = brw_imm_d(0); + + /* Load the texture operation sources */ + uint32_t constant_offset = 0; + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_comparator: + shadow_comparator = get_nir_src(instr->src[i].src, + BRW_REGISTER_TYPE_F, 1); + break; + + case nir_tex_src_coord: { + unsigned src_size = nir_tex_instr_src_size(instr, i); + + switch (instr->op) { + case nir_texop_txf: + case nir_texop_txf_ms: + case nir_texop_samples_identical: + 
coordinate = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, + src_size); + coord_type = glsl_type::ivec(src_size); + break; + + default: + coordinate = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, + src_size); + coord_type = glsl_type::vec(src_size); + break; + } + break; + } + + case nir_tex_src_ddx: + lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, + nir_tex_instr_src_size(instr, i)); + break; + + case nir_tex_src_ddy: + lod2 = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, + nir_tex_instr_src_size(instr, i)); + break; + + case nir_tex_src_lod: + switch (instr->op) { + case nir_texop_txs: + case nir_texop_txf: + lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 1); + break; + + default: + lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, 1); + break; + } + break; + + case nir_tex_src_ms_index: { + sample_index = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 1); + break; + } + + case nir_tex_src_offset: { + nir_const_value *const_offset = + nir_src_as_const_value(instr->src[i].src); + if (!const_offset || + !brw_texture_offset(const_offset->i32, + nir_tex_instr_src_size(instr, i), + &constant_offset)) { + offset_value = + get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 2); + } + break; + } + + case nir_tex_src_texture_offset: { + /* The highest texture which may be used by this operation is + * the last element of the array. Mark it here, because the generator + * doesn't have enough information to determine the bound. 
+ */ + uint32_t array_size = instr->texture_array_size; + uint32_t max_used = texture + array_size - 1; + if (instr->op == nir_texop_tg4) { + max_used += prog_data->base.binding_table.gather_texture_start; + } else { + max_used += prog_data->base.binding_table.texture_start; + } + + brw_mark_surface_used(&prog_data->base, max_used); + + /* Emit code to evaluate the actual indexing expression */ + src_reg src = get_nir_src(instr->src[i].src, 1); + src_reg temp(this, glsl_type::uint_type); + emit(ADD(dst_reg(temp), src, brw_imm_ud(texture))); + texture_reg = emit_uniformize(temp); + break; + } + + case nir_tex_src_sampler_offset: { + /* Emit code to evaluate the actual indexing expression */ + src_reg src = get_nir_src(instr->src[i].src, 1); + src_reg temp(this, glsl_type::uint_type); + emit(ADD(dst_reg(temp), src, brw_imm_ud(sampler))); + sampler_reg = emit_uniformize(temp); + break; + } + + case nir_tex_src_projector: + unreachable("Should be lowered by do_lower_texture_projection"); + + case nir_tex_src_bias: + unreachable("LOD bias is not valid for vertex shaders.\n"); + + default: + unreachable("unknown texture source"); + } + } + + if (instr->op == nir_texop_txf_ms || + instr->op == nir_texop_samples_identical) { + assert(coord_type != NULL); + if (devinfo->gen >= 7 && + key_tex->compressed_multisample_layout_mask & (1 << texture)) { + mcs = emit_mcs_fetch(coord_type, coordinate, texture_reg); + } else { + mcs = brw_imm_ud(0u); + } + } + + /* Stuff the channel select bits in the top of the texture offset */ + if (instr->op == nir_texop_tg4) { + if (instr->component == 1 && + (key_tex->gather_channel_quirk_mask & (1 << texture))) { + /* gather4 sampler is broken for green channel on RG32F -- + * we must ask for blue instead. 
+ */ + constant_offset |= 2 << 16; + } else { + constant_offset |= instr->component << 16; + } + } + + ir_texture_opcode op = ir_texture_opcode_for_nir_texop(instr->op); + + emit_texture(op, dest, dest_type, coordinate, instr->coord_components, + shadow_comparator, + lod, lod2, sample_index, + constant_offset, offset_value, mcs, + texture, texture_reg, sampler_reg); +} + +void +vec4_visitor::nir_emit_undef(nir_ssa_undef_instr *instr) +{ + nir_ssa_values[instr->def.index] = + dst_reg(VGRF, alloc.allocate(DIV_ROUND_UP(instr->def.bit_size, 32))); +} + +/* SIMD4x2 64bit data is stored in register space like this: + * + * r0.0:DF x0 y0 z0 w0 + * r1.0:DF x1 y1 z1 w1 + * + * When we need to write data such as this to memory using 32-bit write + * messages we need to shuffle it in this fashion: + * + * r0.0:DF x0 y0 x1 y1 (to be written at base offset) + * r0.0:DF z0 w0 z1 w1 (to be written at base offset + 16) + * + * We need to do the inverse operation when we read using 32-bit messages, + * which we can do by applying the same exact shuffling on the 64-bit data + * read, only that because the data for each vertex is positioned differently + * we need to apply different channel enables. + * + * This function takes 64bit data and shuffles it as explained above. + * + * The @for_write parameter is used to specify if the shuffling is being done + * for proper SIMD4x2 64-bit data that needs to be shuffled prior to a 32-bit + * write message (for_write = true), or instead we are doing the inverse + * operation and we have just read 64-bit data using a 32-bit messages that we + * need to shuffle to create valid SIMD4x2 64-bit data (for_write = false). + * + * If @block and @ref are non-NULL, then the shuffling is done after @ref, + * otherwise the instructions are emitted normally at the end. The function + * returns the last instruction inserted. + * + * Notice that @src and @dst cannot be the same register. 
+ */ +vec4_instruction * +vec4_visitor::shuffle_64bit_data(dst_reg dst, src_reg src, bool for_write, + bblock_t *block, vec4_instruction *ref) +{ + assert(type_sz(src.type) == 8); + assert(type_sz(dst.type) == 8); + assert(!regions_overlap(dst, 2 * REG_SIZE, src, 2 * REG_SIZE)); + assert(!ref == !block); + + const vec4_builder bld = !ref ? vec4_builder(this).at_end() : + vec4_builder(this).at(block, ref->next); + + /* Resolve swizzle in src */ + vec4_instruction *inst; + if (src.swizzle != BRW_SWIZZLE_XYZW) { + dst_reg data = dst_reg(this, glsl_type::dvec4_type); + inst = bld.MOV(data, src); + src = src_reg(data); + } + + /* dst+0.XY = src+0.XY */ + inst = bld.group(4, 0).MOV(writemask(dst, WRITEMASK_XY), src); + + /* dst+0.ZW = src+1.XY */ + inst = bld.group(4, for_write ? 1 : 0) + .MOV(writemask(dst, WRITEMASK_ZW), + swizzle(byte_offset(src, REG_SIZE), BRW_SWIZZLE_XYXY)); + + /* dst+1.XY = src+0.ZW */ + inst = bld.group(4, for_write ? 0 : 1) + .MOV(writemask(byte_offset(dst, REG_SIZE), WRITEMASK_XY), + swizzle(src, BRW_SWIZZLE_ZWZW)); + + /* dst+1.ZW = src+1.ZW */ + inst = bld.group(4, 1) + .MOV(writemask(byte_offset(dst, REG_SIZE), WRITEMASK_ZW), + byte_offset(src, REG_SIZE)); + + return inst; +} + +} diff --git a/src/intel/compiler/brw_vec4_reg_allocate.cpp b/src/intel/compiler/brw_vec4_reg_allocate.cpp new file mode 100644 index 00000000000..e3b46cc2f7f --- /dev/null +++ b/src/intel/compiler/brw_vec4_reg_allocate.cpp @@ -0,0 +1,558 @@ +/* + * Copyright © 2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright 
notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "util/register_allocate.h" +#include "brw_vec4.h" +#include "brw_cfg.h" + +using namespace brw; + +namespace brw { + +static void +assign(unsigned int *reg_hw_locations, backend_reg *reg) +{ + if (reg->file == VGRF) { + reg->nr = reg_hw_locations[reg->nr] + reg->offset / REG_SIZE; + reg->offset %= REG_SIZE; + } +} + +bool +vec4_visitor::reg_allocate_trivial() +{ + unsigned int hw_reg_mapping[this->alloc.count]; + bool virtual_grf_used[this->alloc.count]; + int next; + + /* Calculate which virtual GRFs are actually in use after whatever + * optimization passes have occurred. 
+ */ + for (unsigned i = 0; i < this->alloc.count; i++) { + virtual_grf_used[i] = false; + } + + foreach_block_and_inst(block, vec4_instruction, inst, cfg) { + if (inst->dst.file == VGRF) + virtual_grf_used[inst->dst.nr] = true; + + for (unsigned i = 0; i < 3; i++) { + if (inst->src[i].file == VGRF) + virtual_grf_used[inst->src[i].nr] = true; + } + } + + hw_reg_mapping[0] = this->first_non_payload_grf; + next = hw_reg_mapping[0] + this->alloc.sizes[0]; + for (unsigned i = 1; i < this->alloc.count; i++) { + if (virtual_grf_used[i]) { + hw_reg_mapping[i] = next; + next += this->alloc.sizes[i]; + } + } + prog_data->total_grf = next; + + foreach_block_and_inst(block, vec4_instruction, inst, cfg) { + assign(hw_reg_mapping, &inst->dst); + assign(hw_reg_mapping, &inst->src[0]); + assign(hw_reg_mapping, &inst->src[1]); + assign(hw_reg_mapping, &inst->src[2]); + } + + if (prog_data->total_grf > max_grf) { + fail("Ran out of regs on trivial allocator (%d/%d)\n", + prog_data->total_grf, max_grf); + return false; + } + + return true; +} + +extern "C" void +brw_vec4_alloc_reg_set(struct brw_compiler *compiler) +{ + int base_reg_count = + compiler->devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF; + + /* After running split_virtual_grfs(), almost all VGRFs will be of size 1. + * SEND-from-GRF sources cannot be split, so we also need classes for each + * potential message length. + */ + const int class_count = MAX_VGRF_SIZE; + int class_sizes[MAX_VGRF_SIZE]; + + for (int i = 0; i < class_count; i++) + class_sizes[i] = i + 1; + + /* Compute the total number of registers across all classes. 
*/ + int ra_reg_count = 0; + for (int i = 0; i < class_count; i++) { + ra_reg_count += base_reg_count - (class_sizes[i] - 1); + } + + ralloc_free(compiler->vec4_reg_set.ra_reg_to_grf); + compiler->vec4_reg_set.ra_reg_to_grf = ralloc_array(compiler, uint8_t, ra_reg_count); + ralloc_free(compiler->vec4_reg_set.regs); + compiler->vec4_reg_set.regs = ra_alloc_reg_set(compiler, ra_reg_count, false); + if (compiler->devinfo->gen >= 6) + ra_set_allocate_round_robin(compiler->vec4_reg_set.regs); + ralloc_free(compiler->vec4_reg_set.classes); + compiler->vec4_reg_set.classes = ralloc_array(compiler, int, class_count); + + /* Now, add the registers to their classes, and add the conflicts + * between them and the base GRF registers (and also each other). + */ + int reg = 0; + unsigned *q_values[MAX_VGRF_SIZE]; + for (int i = 0; i < class_count; i++) { + int class_reg_count = base_reg_count - (class_sizes[i] - 1); + compiler->vec4_reg_set.classes[i] = ra_alloc_reg_class(compiler->vec4_reg_set.regs); + + q_values[i] = new unsigned[MAX_VGRF_SIZE]; + + for (int j = 0; j < class_reg_count; j++) { + ra_class_add_reg(compiler->vec4_reg_set.regs, compiler->vec4_reg_set.classes[i], reg); + + compiler->vec4_reg_set.ra_reg_to_grf[reg] = j; + + for (int base_reg = j; + base_reg < j + class_sizes[i]; + base_reg++) { + ra_add_reg_conflict(compiler->vec4_reg_set.regs, base_reg, reg); + } + + reg++; + } + + for (int j = 0; j < class_count; j++) { + /* Calculate the q values manually because the algorithm used by + * ra_set_finalize() to do it has higher complexity affecting the + * start-up time of some applications. q(i, j) is just the maximum + * number of registers from class i a register from class j can + * conflict with. 
+ */ + q_values[i][j] = class_sizes[i] + class_sizes[j] - 1; + } + } + assert(reg == ra_reg_count); + + for (int reg = 0; reg < base_reg_count; reg++) + ra_make_reg_conflicts_transitive(compiler->vec4_reg_set.regs, reg); + + ra_set_finalize(compiler->vec4_reg_set.regs, q_values); + + for (int i = 0; i < MAX_VGRF_SIZE; i++) + delete[] q_values[i]; +} + +void +vec4_visitor::setup_payload_interference(struct ra_graph *g, + int first_payload_node, + int reg_node_count) +{ + int payload_node_count = this->first_non_payload_grf; + + for (int i = 0; i < payload_node_count; i++) { + /* Mark each payload reg node as being allocated to its physical register. + * + * The alternative would be to have per-physical register classes, which + * would just be silly. + */ + ra_set_node_reg(g, first_payload_node + i, i); + + /* For now, just mark each payload node as interfering with every other + * node to be allocated. + */ + for (int j = 0; j < reg_node_count; j++) { + ra_add_node_interference(g, first_payload_node + i, j); + } + } +} + +bool +vec4_visitor::reg_allocate() +{ + unsigned int hw_reg_mapping[alloc.count]; + int payload_reg_count = this->first_non_payload_grf; + + /* Using the trivial allocator can be useful in debugging undefined + * register access as a result of broken optimization passes. 
+ */ + if (0) + return reg_allocate_trivial(); + + calculate_live_intervals(); + + int node_count = alloc.count; + int first_payload_node = node_count; + node_count += payload_reg_count; + struct ra_graph *g = + ra_alloc_interference_graph(compiler->vec4_reg_set.regs, node_count); + + for (unsigned i = 0; i < alloc.count; i++) { + int size = this->alloc.sizes[i]; + assert(size >= 1 && size <= MAX_VGRF_SIZE); + ra_set_node_class(g, i, compiler->vec4_reg_set.classes[size - 1]); + + for (unsigned j = 0; j < i; j++) { + if (virtual_grf_interferes(i, j)) { + ra_add_node_interference(g, i, j); + } + } + } + + /* Certain instructions can't safely use the same register for their + * sources and destination. Add interference. + */ + foreach_block_and_inst(block, vec4_instruction, inst, cfg) { + if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) { + for (unsigned i = 0; i < 3; i++) { + if (inst->src[i].file == VGRF) { + ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr); + } + } + } + } + + setup_payload_interference(g, first_payload_node, node_count); + + if (!ra_allocate(g)) { + /* Failed to allocate registers. Spill a reg, and the caller will + * loop back into here to try again. + */ + int reg = choose_spill_reg(g); + if (this->no_spills) { + fail("Failure to register allocate. Reduce number of live " + "values to avoid this."); + } else if (reg == -1) { + fail("no register to spill\n"); + } else { + spill_reg(reg); + } + ralloc_free(g); + return false; + } + + /* Get the chosen virtual registers for each node, and map virtual + * regs in the register classes back down to real hardware reg + * numbers. 
+ */ + prog_data->total_grf = payload_reg_count; + for (unsigned i = 0; i < alloc.count; i++) { + int reg = ra_get_node_reg(g, i); + + hw_reg_mapping[i] = compiler->vec4_reg_set.ra_reg_to_grf[reg]; + prog_data->total_grf = MAX2(prog_data->total_grf, + hw_reg_mapping[i] + alloc.sizes[i]); + } + + foreach_block_and_inst(block, vec4_instruction, inst, cfg) { + assign(hw_reg_mapping, &inst->dst); + assign(hw_reg_mapping, &inst->src[0]); + assign(hw_reg_mapping, &inst->src[1]); + assign(hw_reg_mapping, &inst->src[2]); + } + + ralloc_free(g); + + return true; +} + +/** + * When we decide to spill a register, instead of blindly spilling every use, + * save unspills when the spill register is used (read) in consecutive + * instructions. This can potentially save a bunch of unspills that would + * have very little impact in register allocation anyway. + * + * Notice that we need to account for this behavior when spilling a register + * and when evaluating spilling costs. This function is designed so it can + * be called from both places and avoid repeating the logic. + * + * - When we call this function from spill_reg(), we pass in scratch_reg the + * actual unspill/spill register that we want to reuse in the current + * instruction. + * + * - When we call this from evaluate_spill_costs(), we pass the register for + * which we are evaluating spilling costs. + * + * In either case, we check if the previous instructions read scratch_reg until + * we find one that writes to it with a compatible mask or does not read/write + * scratch_reg at all. 
+ */ +static bool +can_use_scratch_for_source(const vec4_instruction *inst, unsigned i, + unsigned scratch_reg) +{ + assert(inst->src[i].file == VGRF); + bool prev_inst_read_scratch_reg = false; + + /* See if any previous source in the same instructions reads scratch_reg */ + for (unsigned n = 0; n < i; n++) { + if (inst->src[n].file == VGRF && inst->src[n].nr == scratch_reg) + prev_inst_read_scratch_reg = true; + } + + /* Now check if previous instructions read/write scratch_reg */ + for (vec4_instruction *prev_inst = (vec4_instruction *) inst->prev; + !prev_inst->is_head_sentinel(); + prev_inst = (vec4_instruction *) prev_inst->prev) { + + /* If the previous instruction writes to scratch_reg then we can reuse + * it if the write is not conditional and the channels we write are + * compatible with our read mask + */ + if (prev_inst->dst.file == VGRF && prev_inst->dst.nr == scratch_reg) { + return (!prev_inst->predicate || prev_inst->opcode == BRW_OPCODE_SEL) && + (brw_mask_for_swizzle(inst->src[i].swizzle) & + ~prev_inst->dst.writemask) == 0; + } + + /* Skip scratch read/writes so that instructions generated by spilling + * other registers (that won't read/write scratch_reg) do not stop us from + * reusing scratch_reg for this instruction. + */ + if (prev_inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE || + prev_inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_READ) + continue; + + /* If the previous instruction does not write to scratch_reg, then check + * if it reads it + */ + int n; + for (n = 0; n < 3; n++) { + if (prev_inst->src[n].file == VGRF && + prev_inst->src[n].nr == scratch_reg) { + prev_inst_read_scratch_reg = true; + break; + } + } + if (n == 3) { + /* The previous instruction does not read scratch_reg. At this point, + * if no previous instruction has read scratch_reg it means that we + * will need to unspill it here and we can't reuse it (so we return + * false). 
Otherwise, if we found at least one consecutive instruction + * that read scratch_reg, then we know that we got here from + * evaluate_spill_costs (since for the spill_reg path any block of + * consecutive instructions using scratch_reg must start with a write + * to that register, so we would've exited the loop in the check for + * the write that we have at the start of this loop), and in that case + * it means that we found the point at which the scratch_reg would be + * unspilled. Since we always unspill a full vec4, it means that we + * have all the channels available and we can just return true to + * signal that we can reuse the register in the current instruction + * too. + */ + return prev_inst_read_scratch_reg; + } + } + + return prev_inst_read_scratch_reg; +} + +static inline unsigned +spill_cost_for_type(enum brw_reg_type type) +{ + /* Spilling of a 64-bit register involves emitting 2 32-bit scratch + * messages plus the 64b/32b shuffling code. + */ + return type_sz(type) == 8 ? 2.25f : 1.0f; +} + +void +vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill) +{ + float loop_scale = 1.0; + + unsigned *reg_type_size = (unsigned *) + ralloc_size(NULL, this->alloc.count * sizeof(unsigned)); + + for (unsigned i = 0; i < this->alloc.count; i++) { + spill_costs[i] = 0.0; + no_spill[i] = alloc.sizes[i] != 1 && alloc.sizes[i] != 2; + reg_type_size[i] = 0; + } + + /* Calculate costs for spilling nodes. Call it a cost of 1 per + * spill/unspill we'll have to do, and guess that the insides of + * loops run 10 times. + */ + foreach_block_and_inst(block, vec4_instruction, inst, cfg) { + for (unsigned int i = 0; i < 3; i++) { + if (inst->src[i].file == VGRF && !no_spill[inst->src[i].nr]) { + /* We will only unspill src[i] it it wasn't unspilled for the + * previous instruction, in which case we'll just reuse the scratch + * reg for this instruction. 
+ */ + if (!can_use_scratch_for_source(inst, i, inst->src[i].nr)) { + spill_costs[inst->src[i].nr] += + loop_scale * spill_cost_for_type(inst->src[i].type); + if (inst->src[i].reladdr || + inst->src[i].offset >= REG_SIZE) + no_spill[inst->src[i].nr] = true; + + /* We don't support unspills of partial DF reads. + * + * Our 64-bit unspills are implemented with two 32-bit scratch + * messages, each one reading that for both SIMD4x2 threads that + * we need to shuffle into correct 64-bit data. Ensure that we + * are reading data for both threads. + */ + if (type_sz(inst->src[i].type) == 8 && inst->exec_size != 8) + no_spill[inst->src[i].nr] = true; + } + + /* We can't spill registers that mix 32-bit and 64-bit access (that + * contain 64-bit data that is operated on via 32-bit instructions) + */ + unsigned type_size = type_sz(inst->src[i].type); + if (reg_type_size[inst->src[i].nr] == 0) + reg_type_size[inst->src[i].nr] = type_size; + else if (reg_type_size[inst->src[i].nr] != type_size) + no_spill[inst->src[i].nr] = true; + } + } + + if (inst->dst.file == VGRF && !no_spill[inst->dst.nr]) { + spill_costs[inst->dst.nr] += + loop_scale * spill_cost_for_type(inst->dst.type); + if (inst->dst.reladdr || inst->dst.offset >= REG_SIZE) + no_spill[inst->dst.nr] = true; + + /* We don't support spills of partial DF writes. + * + * Our 64-bit spills are implemented with two 32-bit scratch messages, + * each one writing that for both SIMD4x2 threads. Ensure that we + * are writing data for both threads. + */ + if (type_sz(inst->dst.type) == 8 && inst->exec_size != 8) + no_spill[inst->dst.nr] = true; + + /* FROM_DOUBLE opcodes are set up so that they use a dst register + * with a size of 2 even if they only produce a single-precision + * result (this is so that the opcode can use the larger register to + * produce a 64-bit aligned intermediary result as required by the + * hardware during the conversion process).
This creates a problem for + * spilling though, because when we attempt to emit a spill for the + * dst we see a 32-bit destination and emit a scratch write that + * allocates a single spill register. + */ + if (inst->opcode == VEC4_OPCODE_FROM_DOUBLE) + no_spill[inst->dst.nr] = true; + + /* We can't spill registers that mix 32-bit and 64-bit access (that + * contain 64-bit data that is operated on via 32-bit instructions) + */ + unsigned type_size = type_sz(inst->dst.type); + if (reg_type_size[inst->dst.nr] == 0) + reg_type_size[inst->dst.nr] = type_size; + else if (reg_type_size[inst->dst.nr] != type_size) + no_spill[inst->dst.nr] = true; + } + + switch (inst->opcode) { + + case BRW_OPCODE_DO: + loop_scale *= 10; + break; + + case BRW_OPCODE_WHILE: + loop_scale /= 10; + break; + + case SHADER_OPCODE_GEN4_SCRATCH_READ: + case SHADER_OPCODE_GEN4_SCRATCH_WRITE: + for (int i = 0; i < 3; i++) { + if (inst->src[i].file == VGRF) + no_spill[inst->src[i].nr] = true; + } + if (inst->dst.file == VGRF) + no_spill[inst->dst.nr] = true; + break; + + default: + break; + } + } + + ralloc_free(reg_type_size); +} + +int +vec4_visitor::choose_spill_reg(struct ra_graph *g) +{ + float spill_costs[this->alloc.count]; + bool no_spill[this->alloc.count]; + + evaluate_spill_costs(spill_costs, no_spill); + + for (unsigned i = 0; i < this->alloc.count; i++) { + if (!no_spill[i]) + ra_set_node_spill_cost(g, i, spill_costs[i]); + } + + return ra_get_best_spill_node(g); +} + +void +vec4_visitor::spill_reg(int spill_reg_nr) +{ + assert(alloc.sizes[spill_reg_nr] == 1 || alloc.sizes[spill_reg_nr] == 2); + unsigned int spill_offset = last_scratch; + last_scratch += alloc.sizes[spill_reg_nr]; + + /* Generate spill/unspill instructions for the objects being spilled. 
*/ + int scratch_reg = -1; + foreach_block_and_inst(block, vec4_instruction, inst, cfg) { + for (unsigned int i = 0; i < 3; i++) { + if (inst->src[i].file == VGRF && inst->src[i].nr == spill_reg_nr) { + if (scratch_reg == -1 || + !can_use_scratch_for_source(inst, i, scratch_reg)) { + /* We need to unspill anyway so make sure we read the full vec4 + * in any case. This way, the cached register can be reused + * for consecutive instructions that read different channels of + * the same vec4. + */ + scratch_reg = alloc.allocate(alloc.sizes[spill_reg_nr]); + src_reg temp = inst->src[i]; + temp.nr = scratch_reg; + temp.offset = 0; + temp.swizzle = BRW_SWIZZLE_XYZW; + emit_scratch_read(block, inst, + dst_reg(temp), inst->src[i], spill_offset); + temp.offset = inst->src[i].offset; + } + assert(scratch_reg != -1); + inst->src[i].nr = scratch_reg; + } + } + + if (inst->dst.file == VGRF && inst->dst.nr == spill_reg_nr) { + emit_scratch_write(block, inst, spill_offset); + scratch_reg = inst->dst.nr; + } + } + + invalidate_live_intervals(); +} + +} /* namespace brw */ diff --git a/src/intel/compiler/brw_vec4_surface_builder.cpp b/src/intel/compiler/brw_vec4_surface_builder.cpp new file mode 100644 index 00000000000..00c94fedca2 --- /dev/null +++ b/src/intel/compiler/brw_vec4_surface_builder.cpp @@ -0,0 +1,332 @@ +/* + * Copyright © 2013-2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * 
Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_vec4_surface_builder.h" + +using namespace brw; + +namespace { + namespace array_utils { + /** + * Copy one every \p src_stride logical components of the argument into + * one every \p dst_stride logical components of the result. + */ + src_reg + emit_stride(const vec4_builder &bld, const src_reg &src, unsigned size, + unsigned dst_stride, unsigned src_stride) + { + if (src_stride == 1 && dst_stride == 1) { + return src; + } else { + const dst_reg dst = bld.vgrf(src.type, + DIV_ROUND_UP(size * dst_stride, 4)); + + for (unsigned i = 0; i < size; ++i) + bld.MOV(writemask(offset(dst, 8, i * dst_stride / 4), + 1 << (i * dst_stride % 4)), + swizzle(offset(src, 8, i * src_stride / 4), + brw_swizzle_for_mask(1 << (i * src_stride % 4)))); + + return src_reg(dst); + } + } + + /** + * Convert a VEC4 into an array of registers with the layout expected by + * the recipient shared unit. If \p has_simd4x2 is true the argument is + * left unmodified in SIMD4x2 form, otherwise it will be rearranged into + * a SIMD8 vector. + */ + src_reg + emit_insert(const vec4_builder &bld, const src_reg &src, + unsigned n, bool has_simd4x2) + { + if (src.file == BAD_FILE || n == 0) { + return src_reg(); + + } else { + /* Pad unused components with zeroes. 
*/ + const unsigned mask = (1 << n) - 1; + const dst_reg tmp = bld.vgrf(src.type); + + bld.MOV(writemask(tmp, mask), src); + if (n < 4) + bld.MOV(writemask(tmp, ~mask), brw_imm_d(0)); + + return emit_stride(bld, src_reg(tmp), n, has_simd4x2 ? 1 : 4, 1); + } + } + + /** + * Convert an array of registers back into a VEC4 according to the + * layout expected from some shared unit. If \p has_simd4x2 is true the + * argument is left unmodified in SIMD4x2 form, otherwise it will be + * rearranged from SIMD8 form. + */ + src_reg + emit_extract(const vec4_builder &bld, const src_reg src, + unsigned n, bool has_simd4x2) + { + if (src.file == BAD_FILE || n == 0) { + return src_reg(); + + } else { + return emit_stride(bld, src, n, 1, has_simd4x2 ? 1 : 4); + } + } + } +} + +namespace brw { + namespace surface_access { + namespace { + using namespace array_utils; + + /** + * Generate a send opcode for a surface message and return the + * result. + */ + src_reg + emit_send(const vec4_builder &bld, enum opcode op, + const src_reg &header, + const src_reg &addr, unsigned addr_sz, + const src_reg &src, unsigned src_sz, + const src_reg &surface, + unsigned arg, unsigned ret_sz, + brw_predicate pred = BRW_PREDICATE_NONE) + { + /* Calculate the total number of components of the payload. */ + const unsigned header_sz = (header.file == BAD_FILE ? 0 : 1); + const unsigned sz = header_sz + addr_sz + src_sz; + + /* Construct the payload. */ + const dst_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz); + unsigned n = 0; + + if (header_sz) + bld.exec_all().MOV(offset(payload, 8, n++), + retype(header, BRW_REGISTER_TYPE_UD)); + + for (unsigned i = 0; i < addr_sz; i++) + bld.MOV(offset(payload, 8, n++), + offset(retype(addr, BRW_REGISTER_TYPE_UD), 8, i)); + + for (unsigned i = 0; i < src_sz; i++) + bld.MOV(offset(payload, 8, n++), + offset(retype(src, BRW_REGISTER_TYPE_UD), 8, i)); + + /* Reduce the dynamically uniform surface index to a single + * scalar. 
+ */ + const src_reg usurface = bld.emit_uniformize(surface); + + /* Emit the message send instruction. */ + const dst_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, ret_sz); + vec4_instruction *inst = + bld.emit(op, dst, src_reg(payload), usurface, brw_imm_ud(arg)); + inst->mlen = sz; + inst->size_written = ret_sz * REG_SIZE; + inst->header_size = header_sz; + inst->predicate = pred; + + return src_reg(dst); + } + } + + /** + * Emit an untyped surface read opcode. \p dims determines the number + * of components of the address and \p size the number of components of + * the returned value. + */ + src_reg + emit_untyped_read(const vec4_builder &bld, + const src_reg &surface, const src_reg &addr, + unsigned dims, unsigned size, + brw_predicate pred) + { + return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ, src_reg(), + emit_insert(bld, addr, dims, true), 1, + src_reg(), 0, + surface, size, 1, pred); + } + + /** + * Emit an untyped surface write opcode. \p dims determines the number + * of components of the address and \p size the number of components of + * the argument. + */ + void + emit_untyped_write(const vec4_builder &bld, const src_reg &surface, + const src_reg &addr, const src_reg &src, + unsigned dims, unsigned size, + brw_predicate pred) + { + const bool has_simd4x2 = (bld.shader->devinfo->gen >= 8 || + bld.shader->devinfo->is_haswell); + emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE, src_reg(), + emit_insert(bld, addr, dims, has_simd4x2), + has_simd4x2 ? 1 : dims, + emit_insert(bld, src, size, has_simd4x2), + has_simd4x2 ? 1 : size, + surface, size, 0, pred); + } + + /** + * Emit an untyped surface atomic opcode. \p dims determines the number + * of components of the address and \p rsize the number of components of + * the returned value (either zero or one). 
+ */ + src_reg + emit_untyped_atomic(const vec4_builder &bld, + const src_reg &surface, const src_reg &addr, + const src_reg &src0, const src_reg &src1, + unsigned dims, unsigned rsize, unsigned op, + brw_predicate pred) + { + const bool has_simd4x2 = (bld.shader->devinfo->gen >= 8 || + bld.shader->devinfo->is_haswell); + + /* Zip the components of both sources, they are represented as the X + * and Y components of the same vector. + */ + const unsigned size = (src0.file != BAD_FILE) + (src1.file != BAD_FILE); + const dst_reg srcs = bld.vgrf(BRW_REGISTER_TYPE_UD); + + if (size >= 1) + bld.MOV(writemask(srcs, WRITEMASK_X), src0); + if (size >= 2) + bld.MOV(writemask(srcs, WRITEMASK_Y), src1); + + return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC, src_reg(), + emit_insert(bld, addr, dims, has_simd4x2), + has_simd4x2 ? 1 : dims, + emit_insert(bld, src_reg(srcs), size, has_simd4x2), + has_simd4x2 && size ? 1 : size, + surface, op, rsize, pred); + } + + namespace { + /** + * Initialize the header present in typed surface messages. + */ + src_reg + emit_typed_message_header(const vec4_builder &bld) + { + const vec4_builder ubld = bld.exec_all(); + const dst_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD); + + ubld.MOV(dst, brw_imm_d(0)); + + if (bld.shader->devinfo->gen == 7 && + !bld.shader->devinfo->is_haswell) { + /* The sample mask is used on IVB for the SIMD8 messages that + * have no SIMD4x2 variant. We only use the two X channels + * in that case, mask everything else out. + */ + ubld.MOV(writemask(dst, WRITEMASK_W), brw_imm_d(0x11)); + } + + return src_reg(dst); + } + } + + /** + * Emit a typed surface read opcode. \p dims determines the number of + * components of the address and \p size the number of components of the + * returned value. 
+ */ + src_reg + emit_typed_read(const vec4_builder &bld, const src_reg &surface, + const src_reg &addr, unsigned dims, unsigned size) + { + const bool has_simd4x2 = (bld.shader->devinfo->gen >= 8 || + bld.shader->devinfo->is_haswell); + const src_reg tmp = + emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ, + emit_typed_message_header(bld), + emit_insert(bld, addr, dims, has_simd4x2), + has_simd4x2 ? 1 : dims, + src_reg(), 0, + surface, size, + has_simd4x2 ? 1 : size); + + return emit_extract(bld, tmp, size, has_simd4x2); + } + + /** + * Emit a typed surface write opcode. \p dims determines the number of + * components of the address and \p size the number of components of the + * argument. + */ + void + emit_typed_write(const vec4_builder &bld, const src_reg &surface, + const src_reg &addr, const src_reg &src, + unsigned dims, unsigned size) + { + const bool has_simd4x2 = (bld.shader->devinfo->gen >= 8 || + bld.shader->devinfo->is_haswell); + emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE, + emit_typed_message_header(bld), + emit_insert(bld, addr, dims, has_simd4x2), + has_simd4x2 ? 1 : dims, + emit_insert(bld, src, size, has_simd4x2), + has_simd4x2 ? 1 : size, + surface, size, 0); + } + + /** + * Emit a typed surface atomic opcode. \p dims determines the number of + * components of the address and \p rsize the number of components of + * the returned value (either zero or one). + */ + src_reg + emit_typed_atomic(const vec4_builder &bld, + const src_reg &surface, const src_reg &addr, + const src_reg &src0, const src_reg &src1, + unsigned dims, unsigned rsize, unsigned op, + brw_predicate pred) + { + const bool has_simd4x2 = (bld.shader->devinfo->gen >= 8 || + bld.shader->devinfo->is_haswell); + + /* Zip the components of both sources, they are represented as the X + * and Y components of the same vector. 
+ */ + const unsigned size = (src0.file != BAD_FILE) + (src1.file != BAD_FILE); + const dst_reg srcs = bld.vgrf(BRW_REGISTER_TYPE_UD); + + if (size >= 1) + bld.MOV(writemask(srcs, WRITEMASK_X), src0); + if (size >= 2) + bld.MOV(writemask(srcs, WRITEMASK_Y), src1); + + return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC, + emit_typed_message_header(bld), + emit_insert(bld, addr, dims, has_simd4x2), + has_simd4x2 ? 1 : dims, + emit_insert(bld, src_reg(srcs), size, has_simd4x2), + has_simd4x2 ? 1 : size, + surface, op, rsize, pred); + } + } +} diff --git a/src/intel/compiler/brw_vec4_surface_builder.h b/src/intel/compiler/brw_vec4_surface_builder.h new file mode 100644 index 00000000000..6e61c0fce9b --- /dev/null +++ b/src/intel/compiler/brw_vec4_surface_builder.h @@ -0,0 +1,69 @@ +/* -*- c++ -*- */ +/* + * Copyright © 2013-2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef BRW_VEC4_SURFACE_BUILDER_H +#define BRW_VEC4_SURFACE_BUILDER_H + +#include "brw_vec4_builder.h" + +namespace brw { + namespace surface_access { + src_reg + emit_untyped_read(const vec4_builder &bld, + const src_reg &surface, const src_reg &addr, + unsigned dims, unsigned size, + brw_predicate pred = BRW_PREDICATE_NONE); + + void + emit_untyped_write(const vec4_builder &bld, const src_reg &surface, + const src_reg &addr, const src_reg &src, + unsigned dims, unsigned size, + brw_predicate pred = BRW_PREDICATE_NONE); + + src_reg + emit_untyped_atomic(const vec4_builder &bld, + const src_reg &surface, const src_reg &addr, + const src_reg &src0, const src_reg &src1, + unsigned dims, unsigned rsize, unsigned op, + brw_predicate pred = BRW_PREDICATE_NONE); + + src_reg + emit_typed_read(const vec4_builder &bld, const src_reg &surface, + const src_reg &addr, unsigned dims, unsigned size); + + void + emit_typed_write(const vec4_builder &bld, const src_reg &surface, + const src_reg &addr, const src_reg &src, + unsigned dims, unsigned size); + + src_reg + emit_typed_atomic(const vec4_builder &bld, const src_reg &surface, + const src_reg &addr, + const src_reg &src0, const src_reg &src1, + unsigned dims, unsigned rsize, unsigned op, + brw_predicate pred = BRW_PREDICATE_NONE); + } +} + +#endif diff --git a/src/intel/compiler/brw_vec4_tcs.cpp b/src/intel/compiler/brw_vec4_tcs.cpp new file mode 100644 index 00000000000..d4a647d029f --- /dev/null +++ b/src/intel/compiler/brw_vec4_tcs.cpp @@ -0,0 +1,516 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, 
subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * \file brw_vec4_tcs.cpp + * + * Tessellation control shader specific code derived from the vec4_visitor class. + */ + +#include "brw_nir.h" +#include "brw_vec4_tcs.h" +#include "brw_fs.h" +#include "common/gen_debug.h" + +namespace brw { + +vec4_tcs_visitor::vec4_tcs_visitor(const struct brw_compiler *compiler, + void *log_data, + const struct brw_tcs_prog_key *key, + struct brw_tcs_prog_data *prog_data, + const nir_shader *nir, + void *mem_ctx, + int shader_time_index, + const struct brw_vue_map *input_vue_map) + : vec4_visitor(compiler, log_data, &key->tex, &prog_data->base, + nir, mem_ctx, false, shader_time_index), + input_vue_map(input_vue_map), key(key) +{ +} + + +void +vec4_tcs_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr) +{ +} + +dst_reg * +vec4_tcs_visitor::make_reg_for_system_value(int location) +{ + return NULL; +} + + +void +vec4_tcs_visitor::setup_payload() +{ + int reg = 0; + + /* The payload always contains important data in r0, which contains + * the URB handles that are passed on to the URB write at the end + * of the thread. + */ + reg++; + + /* r1.0 - r4.7 may contain the input control point URB handles, + * which we use to pull vertex data.
+ */ + reg += 4; + + /* Push constants may start at r5.0 */ + reg = setup_uniforms(reg); + + this->first_non_payload_grf = reg; +} + + +void +vec4_tcs_visitor::emit_prolog() +{ + invocation_id = src_reg(this, glsl_type::uint_type); + emit(TCS_OPCODE_GET_INSTANCE_ID, dst_reg(invocation_id)); + + /* HS threads are dispatched with the dispatch mask set to 0xFF. + * If there are an odd number of output vertices, then the final + * HS instance dispatched will only have its bottom half doing real + * work, and so we need to disable the upper half: + */ + if (nir->info->tess.tcs_vertices_out % 2) { + emit(CMP(dst_null_d(), invocation_id, + brw_imm_ud(nir->info->tess.tcs_vertices_out), + BRW_CONDITIONAL_L)); + + /* Matching ENDIF is in emit_thread_end() */ + emit(IF(BRW_PREDICATE_NORMAL)); + } +} + + +void +vec4_tcs_visitor::emit_thread_end() +{ + vec4_instruction *inst; + current_annotation = "thread end"; + + if (nir->info->tess.tcs_vertices_out % 2) { + emit(BRW_OPCODE_ENDIF); + } + + if (devinfo->gen == 7) { + struct brw_tcs_prog_data *tcs_prog_data = + (struct brw_tcs_prog_data *) prog_data; + + current_annotation = "release input vertices"; + + /* Synchronize all threads, so we know that no one is still + * using the input URB handles. + */ + if (tcs_prog_data->instances > 1) { + dst_reg header = dst_reg(this, glsl_type::uvec4_type); + emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header); + emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header)); + } + + /* Make thread 0 (invocations <1, 0>) release pairs of ICP handles. + * We want to compare the bottom half of invocation_id with 0, but + * use that truth value for the top half as well. Unfortunately, + * we don't have stride in the vec4 world, nor UV immediates in + * align16, so we need an opcode to get invocation_id<0,4,0>. 
+ */ + set_condmod(BRW_CONDITIONAL_Z, + emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(), + invocation_id)); + emit(IF(BRW_PREDICATE_NORMAL)); + for (unsigned i = 0; i < key->input_vertices; i += 2) { + /* If we have an odd number of input vertices, the last will be + * unpaired. We don't want to use an interleaved URB write in + * that case. + */ + const bool is_unpaired = i == key->input_vertices - 1; + + dst_reg header(this, glsl_type::uvec4_type); + emit(TCS_OPCODE_RELEASE_INPUT, header, brw_imm_ud(i), + brw_imm_ud(is_unpaired)); + } + emit(BRW_OPCODE_ENDIF); + } + + if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME)) + emit_shader_time_end(); + + inst = emit(TCS_OPCODE_THREAD_END); + inst->base_mrf = 14; + inst->mlen = 2; +} + + +void +vec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst, + const src_reg &vertex_index, + unsigned base_offset, + unsigned first_component, + const src_reg &indirect_offset) +{ + vec4_instruction *inst; + dst_reg temp(this, glsl_type::ivec4_type); + temp.type = dst.type; + + /* Set up the message header to reference the proper parts of the URB */ + dst_reg header = dst_reg(this, glsl_type::uvec4_type); + inst = emit(TCS_OPCODE_SET_INPUT_URB_OFFSETS, header, vertex_index, + indirect_offset); + inst->force_writemask_all = true; + + /* Read into a temporary, ignoring writemasking. */ + inst = emit(VEC4_OPCODE_URB_READ, temp, src_reg(header)); + inst->offset = base_offset; + inst->mlen = 1; + inst->base_mrf = -1; + + /* Copy the temporary to the destination to deal with writemasking. + * + * Also attempt to deal with gl_PointSize being in the .w component. 
+ */ + if (inst->offset == 0 && indirect_offset.file == BAD_FILE) { + emit(MOV(dst, swizzle(src_reg(temp), BRW_SWIZZLE_WWWW))); + } else { + src_reg src = src_reg(temp); + src.swizzle = BRW_SWZ_COMP_INPUT(first_component); + emit(MOV(dst, src)); + } +} + +void +vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst, + unsigned base_offset, + unsigned first_component, + const src_reg &indirect_offset) +{ + vec4_instruction *inst; + + /* Set up the message header to reference the proper parts of the URB */ + dst_reg header = dst_reg(this, glsl_type::uvec4_type); + inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, header, + brw_imm_ud(dst.writemask << first_component), indirect_offset); + inst->force_writemask_all = true; + + vec4_instruction *read = emit(VEC4_OPCODE_URB_READ, dst, src_reg(header)); + read->offset = base_offset; + read->mlen = 1; + read->base_mrf = -1; + + if (first_component) { + /* Read into a temporary and copy with a swizzle and writemask. */ + read->dst = retype(dst_reg(this, glsl_type::ivec4_type), dst.type); + emit(MOV(dst, swizzle(src_reg(read->dst), + BRW_SWZ_COMP_INPUT(first_component)))); + } +} + +void +vec4_tcs_visitor::emit_urb_write(const src_reg &value, + unsigned writemask, + unsigned base_offset, + const src_reg &indirect_offset) +{ + if (writemask == 0) + return; + + src_reg message(this, glsl_type::uvec4_type, 2); + vec4_instruction *inst; + + inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, dst_reg(message), + brw_imm_ud(writemask), indirect_offset); + inst->force_writemask_all = true; + inst = emit(MOV(byte_offset(dst_reg(retype(message, value.type)), REG_SIZE), + value)); + inst->force_writemask_all = true; + + inst = emit(TCS_OPCODE_URB_WRITE, dst_null_f(), message); + inst->offset = base_offset; + inst->mlen = 2; + inst->base_mrf = -1; +} + +void +vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) +{ + switch (instr->intrinsic) { + case nir_intrinsic_load_invocation_id: + emit(MOV(get_nir_dest(instr->dest, 
BRW_REGISTER_TYPE_UD), + invocation_id)); + break; + case nir_intrinsic_load_primitive_id: + emit(TCS_OPCODE_GET_PRIMITIVE_ID, + get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD)); + break; + case nir_intrinsic_load_patch_vertices_in: + emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D), + brw_imm_d(key->input_vertices))); + break; + case nir_intrinsic_load_per_vertex_input: { + src_reg indirect_offset = get_indirect_offset(instr); + unsigned imm_offset = instr->const_index[0]; + + nir_const_value *vertex_const = nir_src_as_const_value(instr->src[0]); + src_reg vertex_index = + vertex_const ? src_reg(brw_imm_ud(vertex_const->u32[0])) + : get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1); + + unsigned first_component = nir_intrinsic_component(instr); + if (nir_dest_bit_size(instr->dest) == 64) { + /* We need to emit up to two 32-bit URB reads, then shuffle + * the result into a temporary, then move to the destination + * honoring the writemask + * + * We don't need to divide first_component by 2 because + * emit_input_urb_read takes a 32-bit type. 
+ */ + dst_reg tmp = dst_reg(this, glsl_type::dvec4_type); + dst_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D); + emit_input_urb_read(tmp_d, vertex_index, imm_offset, + first_component, indirect_offset); + if (instr->num_components > 2) { + emit_input_urb_read(byte_offset(tmp_d, REG_SIZE), vertex_index, + imm_offset + 1, 0, indirect_offset); + } + + src_reg tmp_src = retype(src_reg(tmp_d), BRW_REGISTER_TYPE_DF); + dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type); + shuffle_64bit_data(shuffled, tmp_src, false); + + dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_DF); + dst.writemask = brw_writemask_for_size(instr->num_components); + emit(MOV(dst, src_reg(shuffled))); + } else { + dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D); + dst.writemask = brw_writemask_for_size(instr->num_components); + emit_input_urb_read(dst, vertex_index, imm_offset, + first_component, indirect_offset); + } + break; + } + case nir_intrinsic_load_input: + unreachable("nir_lower_io should use load_per_vertex_input intrinsics"); + break; + case nir_intrinsic_load_output: + case nir_intrinsic_load_per_vertex_output: { + src_reg indirect_offset = get_indirect_offset(instr); + unsigned imm_offset = instr->const_index[0]; + + dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D); + dst.writemask = brw_writemask_for_size(instr->num_components); + + emit_output_urb_read(dst, imm_offset, nir_intrinsic_component(instr), + indirect_offset); + break; + } + case nir_intrinsic_store_output: + case nir_intrinsic_store_per_vertex_output: { + src_reg value = get_nir_src(instr->src[0]); + unsigned mask = instr->const_index[1]; + unsigned swiz = BRW_SWIZZLE_XYZW; + + src_reg indirect_offset = get_indirect_offset(instr); + unsigned imm_offset = instr->const_index[0]; + + unsigned first_component = nir_intrinsic_component(instr); + if (first_component) { + if (nir_src_bit_size(instr->src[0]) == 64) + first_component /= 2; + assert(swiz == BRW_SWIZZLE_XYZW); + swiz = 
BRW_SWZ_COMP_OUTPUT(first_component); + mask = mask << first_component; + } + + if (nir_src_bit_size(instr->src[0]) == 64) { + /* For 64-bit data we need to shuffle the data before we write and + * emit two messages. Also, since each channel is twice as large we + * need to fix the writemask in each 32-bit message to account for it. + */ + value = swizzle(retype(value, BRW_REGISTER_TYPE_DF), swiz); + dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type); + shuffle_64bit_data(shuffled, value, true); + src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F)); + + for (int n = 0; n < 2; n++) { + unsigned fixed_mask = 0; + if (mask & WRITEMASK_X) + fixed_mask |= WRITEMASK_XY; + if (mask & WRITEMASK_Y) + fixed_mask |= WRITEMASK_ZW; + emit_urb_write(shuffled_float, fixed_mask, + imm_offset, indirect_offset); + + shuffled_float = byte_offset(shuffled_float, REG_SIZE); + mask >>= 2; + imm_offset++; + } + } else { + emit_urb_write(swizzle(value, swiz), mask, + imm_offset, indirect_offset); + } + break; + } + + case nir_intrinsic_barrier: { + dst_reg header = dst_reg(this, glsl_type::uvec4_type); + emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header); + emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header)); + break; + } + + default: + vec4_visitor::nir_emit_intrinsic(instr); + } +} + + +extern "C" const unsigned * +brw_compile_tcs(const struct brw_compiler *compiler, + void *log_data, + void *mem_ctx, + const struct brw_tcs_prog_key *key, + struct brw_tcs_prog_data *prog_data, + const nir_shader *src_shader, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str) +{ + const struct gen_device_info *devinfo = compiler->devinfo; + struct brw_vue_prog_data *vue_prog_data = &prog_data->base; + const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL]; + + nir_shader *nir = nir_shader_clone(mem_ctx, src_shader); + nir->info->outputs_written = key->outputs_written; + nir->info->patch_outputs_written = key->patch_outputs_written; + + 
struct brw_vue_map input_vue_map; + brw_compute_vue_map(devinfo, &input_vue_map, nir->info->inputs_read, + nir->info->separate_shader); + brw_compute_tess_vue_map(&vue_prog_data->vue_map, + nir->info->outputs_written, + nir->info->patch_outputs_written); + + nir = brw_nir_apply_sampler_key(nir, compiler, &key->tex, is_scalar); + brw_nir_lower_vue_inputs(nir, is_scalar, &input_vue_map); + brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map, + key->tes_primitive_mode); + if (key->quads_workaround) + brw_nir_apply_tcs_quads_workaround(nir); + + nir = brw_postprocess_nir(nir, compiler, is_scalar); + + if (is_scalar) + prog_data->instances = DIV_ROUND_UP(nir->info->tess.tcs_vertices_out, 8); + else + prog_data->instances = DIV_ROUND_UP(nir->info->tess.tcs_vertices_out, 2); + + /* Compute URB entry size. The maximum allowed URB entry size is 32k. + * That divides up as follows: + * + * 32 bytes for the patch header (tessellation factors) + * 480 bytes for per-patch varyings (a varying component is 4 bytes and + * gl_MaxTessPatchComponents = 120) + * 16384 bytes for per-vertex varyings (a varying component is 4 bytes, + * gl_MaxPatchVertices = 32 and + * gl_MaxTessControlOutputComponents = 128) + * + * 15808 bytes left for varying packing overhead + */ + const int num_per_patch_slots = vue_prog_data->vue_map.num_per_patch_slots; + const int num_per_vertex_slots = vue_prog_data->vue_map.num_per_vertex_slots; + unsigned output_size_bytes = 0; + /* Note that the patch header is counted in num_per_patch_slots. */ + output_size_bytes += num_per_patch_slots * 16; + output_size_bytes += nir->info->tess.tcs_vertices_out * + num_per_vertex_slots * 16; + + assert(output_size_bytes >= 1); + if (output_size_bytes > GEN7_MAX_HS_URB_ENTRY_SIZE_BYTES) + return NULL; + + /* URB entry sizes are stored as a multiple of 64 bytes. 
*/ + vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64; + + /* HS does not use the usual payload pushing from URB to GRFs, + * because we don't have enough registers for a full-size payload, and + * the hardware is broken on Haswell anyway. + */ + vue_prog_data->urb_read_length = 0; + + if (unlikely(INTEL_DEBUG & DEBUG_TCS)) { + fprintf(stderr, "TCS Input "); + brw_print_vue_map(stderr, &input_vue_map); + fprintf(stderr, "TCS Output "); + brw_print_vue_map(stderr, &vue_prog_data->vue_map); + } + + if (is_scalar) { + fs_visitor v(compiler, log_data, mem_ctx, (void *) key, + &prog_data->base.base, NULL, nir, 8, + shader_time_index, &input_vue_map); + if (!v.run_tcs_single_patch()) { + if (error_str) + *error_str = ralloc_strdup(mem_ctx, v.fail_msg); + return NULL; + } + + prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs; + prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8; + + fs_generator g(compiler, log_data, mem_ctx, (void *) key, + &prog_data->base.base, v.promoted_constants, false, + MESA_SHADER_TESS_CTRL); + if (unlikely(INTEL_DEBUG & DEBUG_TCS)) { + g.enable_debug(ralloc_asprintf(mem_ctx, + "%s tessellation control shader %s", + nir->info->label ? 
nir->info->label + : "unnamed", + nir->info->name)); + } + + g.generate_code(v.cfg, 8); + + return g.get_assembly(final_assembly_size); + } else { + vec4_tcs_visitor v(compiler, log_data, key, prog_data, + nir, mem_ctx, shader_time_index, &input_vue_map); + if (!v.run()) { + if (error_str) + *error_str = ralloc_strdup(mem_ctx, v.fail_msg); + return NULL; + } + + if (unlikely(INTEL_DEBUG & DEBUG_TCS)) + v.dump_instructions(); + + + return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir, + &prog_data->base, v.cfg, + final_assembly_size); + } +} + + +} /* namespace brw */ diff --git a/src/intel/compiler/brw_vec4_tcs.h b/src/intel/compiler/brw_vec4_tcs.h new file mode 100644 index 00000000000..030eb5e6603 --- /dev/null +++ b/src/intel/compiler/brw_vec4_tcs.h @@ -0,0 +1,88 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +/** + * \file brw_vec4_tcs.h + * + * The vec4-mode tessellation control shader compiler backend. + */ + +#ifndef BRW_VEC4_TCS_H +#define BRW_VEC4_TCS_H + +#include "brw_compiler.h" +#include "brw_vec4.h" + +#ifdef __cplusplus +namespace brw { + +class vec4_tcs_visitor : public vec4_visitor +{ +public: + vec4_tcs_visitor(const struct brw_compiler *compiler, + void *log_data, + const struct brw_tcs_prog_key *key, + struct brw_tcs_prog_data *prog_data, + const nir_shader *nir, + void *mem_ctx, + int shader_time_index, + const struct brw_vue_map *input_vue_map); + +protected: + virtual dst_reg *make_reg_for_system_value(int location); + virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr); + virtual void setup_payload(); + virtual void emit_prolog(); + virtual void emit_thread_end(); + + virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr); + + void emit_input_urb_read(const dst_reg &dst, + const src_reg &vertex_index, + unsigned base_offset, + unsigned first_component, + const src_reg &indirect_offset); + void emit_output_urb_read(const dst_reg &dst, + unsigned base_offset, + unsigned first_component, + const src_reg &indirect_offset); + + void emit_urb_write(const src_reg &value, unsigned writemask, + unsigned base_offset, const src_reg &indirect_offset); + + /* we do not use the normal end-of-shader URB write mechanism -- but every vec4 stage + * must provide implementations of these: + */ + virtual void emit_urb_write_header(int mrf) {} + virtual vec4_instruction *emit_urb_write_opcode(bool complete) { return NULL; } + + const struct brw_vue_map *input_vue_map; + + const struct brw_tcs_prog_key *key; + src_reg invocation_id; +}; + +} /* namespace brw */ +#endif /* __cplusplus */ + +#endif /* BRW_VEC4_TCS_H */ diff --git a/src/intel/compiler/brw_vec4_tes.cpp b/src/intel/compiler/brw_vec4_tes.cpp new file mode 100644 index 00000000000..bcf9a87eb01 --- /dev/null +++ b/src/intel/compiler/brw_vec4_tes.cpp @@ -0,0 +1,296 @@ +/* + * 
Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * \file brw_vec4_tes.cpp + * + * Tessellation evaluation shader specific code derived from the vec4_visitor class. 
+ */ + +#include "brw_vec4_tes.h" +#include "brw_cfg.h" +#include "common/gen_debug.h" + +namespace brw { + +vec4_tes_visitor::vec4_tes_visitor(const struct brw_compiler *compiler, + void *log_data, + const struct brw_tes_prog_key *key, + struct brw_tes_prog_data *prog_data, + const nir_shader *shader, + void *mem_ctx, + int shader_time_index) + : vec4_visitor(compiler, log_data, &key->tex, &prog_data->base, + shader, mem_ctx, false, shader_time_index) +{ +} + + +dst_reg * +vec4_tes_visitor::make_reg_for_system_value(int location) +{ + return NULL; +} + +void +vec4_tes_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr) +{ + switch (instr->intrinsic) { + case nir_intrinsic_load_tess_level_outer: + case nir_intrinsic_load_tess_level_inner: + break; + default: + vec4_visitor::nir_setup_system_value_intrinsic(instr); + } +} + + +void +vec4_tes_visitor::setup_payload() +{ + int reg = 0; + + /* The payload always contains important data in r0 and r1, which contains + * the URB handles that are passed on to the URB write at the end + * of the thread. + */ + reg += 2; + + reg = setup_uniforms(reg); + + foreach_block_and_inst(block, vec4_instruction, inst, cfg) { + for (int i = 0; i < 3; i++) { + if (inst->src[i].file != ATTR) + continue; + + bool is_64bit = type_sz(inst->src[i].type) == 8; + + unsigned slot = inst->src[i].nr + inst->src[i].offset / 16; + struct brw_reg grf = brw_vec4_grf(reg + slot / 2, 4 * (slot % 2)); + grf = stride(grf, 0, is_64bit ? 2 : 4, 1); + grf.swizzle = inst->src[i].swizzle; + grf.type = inst->src[i].type; + grf.abs = inst->src[i].abs; + grf.negate = inst->src[i].negate; + + /* For 64-bit attributes we can end up with components XY in the + * second half of a register and components ZW in the first half + * of the next. Fix it up here. + */ + if (is_64bit && grf.subnr > 0) { + /* We can't do swizzles that mix XY and ZW channels in this case. + * Such cases should have been handled by the scalarization pass. 
+ */ + assert((brw_mask_for_swizzle(grf.swizzle) & 0x3) ^ + (brw_mask_for_swizzle(grf.swizzle) & 0xc)); + if (brw_mask_for_swizzle(grf.swizzle) & 0xc) { + grf.subnr = 0; + grf.nr++; + grf.swizzle -= BRW_SWIZZLE_ZZZZ; + } + } + + inst->src[i] = grf; + } + } + + reg += 8 * prog_data->urb_read_length; + + this->first_non_payload_grf = reg; +} + + +void +vec4_tes_visitor::emit_prolog() +{ + input_read_header = src_reg(this, glsl_type::uvec4_type); + emit(TES_OPCODE_CREATE_INPUT_READ_HEADER, dst_reg(input_read_header)); + + this->current_annotation = NULL; +} + + +void +vec4_tes_visitor::emit_urb_write_header(int mrf) +{ + /* No need to do anything for DS; an implied write to this MRF will be + * performed by VS_OPCODE_URB_WRITE. + */ + (void) mrf; +} + + +vec4_instruction * +vec4_tes_visitor::emit_urb_write_opcode(bool complete) +{ + /* For DS, the URB writes end the thread. */ + if (complete) { + if (INTEL_DEBUG & DEBUG_SHADER_TIME) + emit_shader_time_end(); + } + + vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE); + inst->urb_write_flags = complete ? + BRW_URB_WRITE_EOT_COMPLETE : BRW_URB_WRITE_NO_FLAGS; + + return inst; +} + +void +vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) +{ + const struct brw_tes_prog_data *tes_prog_data = + (const struct brw_tes_prog_data *) prog_data; + + switch (instr->intrinsic) { + case nir_intrinsic_load_tess_coord: + /* gl_TessCoord is part of the payload in g1 channels 0-2 and 4-6. 
*/ + emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F), + src_reg(brw_vec8_grf(1, 0)))); + break; + case nir_intrinsic_load_tess_level_outer: + if (tes_prog_data->domain == BRW_TESS_DOMAIN_ISOLINE) { + emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F), + swizzle(src_reg(ATTR, 1, glsl_type::vec4_type), + BRW_SWIZZLE_ZWZW))); + } else { + emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F), + swizzle(src_reg(ATTR, 1, glsl_type::vec4_type), + BRW_SWIZZLE_WZYX))); + } + break; + case nir_intrinsic_load_tess_level_inner: + if (tes_prog_data->domain == BRW_TESS_DOMAIN_QUAD) { + emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F), + swizzle(src_reg(ATTR, 0, glsl_type::vec4_type), + BRW_SWIZZLE_WZYX))); + } else { + emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F), + src_reg(ATTR, 1, glsl_type::float_type))); + } + break; + case nir_intrinsic_load_primitive_id: + emit(TES_OPCODE_GET_PRIMITIVE_ID, + get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD)); + break; + + case nir_intrinsic_load_input: + case nir_intrinsic_load_per_vertex_input: { + src_reg indirect_offset = get_indirect_offset(instr); + unsigned imm_offset = instr->const_index[0]; + src_reg header = input_read_header; + bool is_64bit = nir_dest_bit_size(instr->dest) == 64; + unsigned first_component = nir_intrinsic_component(instr); + if (is_64bit) + first_component /= 2; + + if (indirect_offset.file != BAD_FILE) { + header = src_reg(this, glsl_type::uvec4_type); + emit(TES_OPCODE_ADD_INDIRECT_URB_OFFSET, dst_reg(header), + input_read_header, indirect_offset); + } else { + /* Arbitrarily only push up to 24 vec4 slots worth of data, + * which is 12 registers (since each holds 2 vec4 slots). + */ + const unsigned max_push_slots = 24; + if (imm_offset < max_push_slots) { + const glsl_type *src_glsl_type = + is_64bit ? 
glsl_type::dvec4_type : glsl_type::ivec4_type; + src_reg src = src_reg(ATTR, imm_offset, src_glsl_type); + src.swizzle = BRW_SWZ_COMP_INPUT(first_component); + + const brw_reg_type dst_reg_type = + is_64bit ? BRW_REGISTER_TYPE_DF : BRW_REGISTER_TYPE_D; + emit(MOV(get_nir_dest(instr->dest, dst_reg_type), src)); + + prog_data->urb_read_length = + MAX2(prog_data->urb_read_length, + DIV_ROUND_UP(imm_offset + (is_64bit ? 2 : 1), 2)); + break; + } + } + + if (!is_64bit) { + dst_reg temp(this, glsl_type::ivec4_type); + vec4_instruction *read = + emit(VEC4_OPCODE_URB_READ, temp, src_reg(header)); + read->offset = imm_offset; + read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET; + + src_reg src = src_reg(temp); + src.swizzle = BRW_SWZ_COMP_INPUT(first_component); + + /* Copy to target. We might end up with some funky writemasks landing + * in here, but we really don't want them in the above pseudo-ops. + */ + dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D); + dst.writemask = brw_writemask_for_size(instr->num_components); + emit(MOV(dst, src)); + } else { + /* For 64-bit we need to load twice as many 32-bit components, and for + * dvec3/4 we need to emit 2 URB Read messages + */ + dst_reg temp(this, glsl_type::dvec4_type); + dst_reg temp_d = retype(temp, BRW_REGISTER_TYPE_D); + + vec4_instruction *read = + emit(VEC4_OPCODE_URB_READ, temp_d, src_reg(header)); + read->offset = imm_offset; + read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET; + + if (instr->num_components > 2) { + read = emit(VEC4_OPCODE_URB_READ, byte_offset(temp_d, REG_SIZE), + src_reg(header)); + read->offset = imm_offset + 1; + read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET; + } + + src_reg temp_as_src = src_reg(temp); + temp_as_src.swizzle = BRW_SWZ_COMP_INPUT(first_component); + + dst_reg shuffled(this, glsl_type::dvec4_type); + shuffle_64bit_data(shuffled, temp_as_src, false); + + dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_DF); + dst.writemask = 
brw_writemask_for_size(instr->num_components); + emit(MOV(dst, src_reg(shuffled))); + } + break; + } + default: + vec4_visitor::nir_emit_intrinsic(instr); + } +} + + +void +vec4_tes_visitor::emit_thread_end() +{ + /* For DS, we always end the thread by emitting a single vertex. + * emit_urb_write_opcode() will take care of setting the eot flag on the + * SEND instruction. + */ + emit_vertex(); +} + +} /* namespace brw */ diff --git a/src/intel/compiler/brw_vec4_tes.h b/src/intel/compiler/brw_vec4_tes.h new file mode 100644 index 00000000000..31a28f35974 --- /dev/null +++ b/src/intel/compiler/brw_vec4_tes.h @@ -0,0 +1,68 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * \file brw_vec4_tes.h + * + * The vec4 mode tessellation evaluation shader compiler backend. 
+ */ + +#ifndef BRW_VEC4_TES_H +#define BRW_VEC4_TES_H + +#include "brw_vec4.h" + +#ifdef __cplusplus +namespace brw { + +class vec4_tes_visitor : public vec4_visitor +{ +public: + vec4_tes_visitor(const struct brw_compiler *compiler, + void *log_data, + const struct brw_tes_prog_key *key, + struct brw_tes_prog_data *prog_data, + const nir_shader *nir, + void *mem_ctx, + int shader_time_index); + +protected: + virtual dst_reg *make_reg_for_system_value(int location); + virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr); + virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr); + + virtual void setup_payload(); + virtual void emit_prolog(); + virtual void emit_thread_end(); + + virtual void emit_urb_write_header(int mrf); + virtual vec4_instruction *emit_urb_write_opcode(bool complete); + +private: + src_reg input_read_header; +}; + +} /* namespace brw */ +#endif /* __cplusplus */ + +#endif /* BRW_VEC4_TES_H */ diff --git a/src/intel/compiler/brw_vec4_visitor.cpp b/src/intel/compiler/brw_vec4_visitor.cpp new file mode 100644 index 00000000000..262a084ca87 --- /dev/null +++ b/src/intel/compiler/brw_vec4_visitor.cpp @@ -0,0 +1,1917 @@ +/* + * Copyright © 2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_vec4.h" +#include "brw_cfg.h" +#include "brw_eu.h" + +namespace brw { + +vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst, + const src_reg &src0, const src_reg &src1, + const src_reg &src2) +{ + this->opcode = opcode; + this->dst = dst; + this->src[0] = src0; + this->src[1] = src1; + this->src[2] = src2; + this->saturate = false; + this->force_writemask_all = false; + this->no_dd_clear = false; + this->no_dd_check = false; + this->writes_accumulator = false; + this->conditional_mod = BRW_CONDITIONAL_NONE; + this->predicate = BRW_PREDICATE_NONE; + this->predicate_inverse = false; + this->target = 0; + this->shadow_compare = false; + this->ir = NULL; + this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS; + this->header_size = 0; + this->flag_subreg = 0; + this->mlen = 0; + this->base_mrf = 0; + this->offset = 0; + this->exec_size = 8; + this->group = 0; + this->size_written = (dst.file == BAD_FILE ? 
+ 0 : this->exec_size * type_sz(dst.type)); + this->annotation = NULL; +} + +vec4_instruction * +vec4_visitor::emit(vec4_instruction *inst) +{ + inst->ir = this->base_ir; + inst->annotation = this->current_annotation; + + this->instructions.push_tail(inst); + + return inst; +} + +vec4_instruction * +vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst, + vec4_instruction *new_inst) +{ + new_inst->ir = inst->ir; + new_inst->annotation = inst->annotation; + + inst->insert_before(block, new_inst); + + return inst; +} + +vec4_instruction * +vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, + const src_reg &src1, const src_reg &src2) +{ + return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2)); +} + + +vec4_instruction * +vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, + const src_reg &src1) +{ + return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1)); +} + +vec4_instruction * +vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) +{ + return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0)); +} + +vec4_instruction * +vec4_visitor::emit(enum opcode opcode, const dst_reg &dst) +{ + return emit(new(mem_ctx) vec4_instruction(opcode, dst)); +} + +vec4_instruction * +vec4_visitor::emit(enum opcode opcode) +{ + return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg())); +} + +#define ALU1(op) \ + vec4_instruction * \ + vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \ + { \ + return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \ + } + +#define ALU2(op) \ + vec4_instruction * \ + vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \ + const src_reg &src1) \ + { \ + return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \ + src0, src1); \ + } + +#define ALU2_ACC(op) \ + vec4_instruction * \ + vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \ + const src_reg &src1) \ + { \ + vec4_instruction *inst = 
new(mem_ctx) vec4_instruction( \ + BRW_OPCODE_##op, dst, src0, src1); \ + inst->writes_accumulator = true; \ + return inst; \ + } + +#define ALU3(op) \ + vec4_instruction * \ + vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \ + const src_reg &src1, const src_reg &src2) \ + { \ + assert(devinfo->gen >= 6); \ + return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \ + src0, src1, src2); \ + } + +ALU1(NOT) +ALU1(MOV) +ALU1(FRC) +ALU1(RNDD) +ALU1(RNDE) +ALU1(RNDZ) +ALU1(F32TO16) +ALU1(F16TO32) +ALU2(ADD) +ALU2(MUL) +ALU2_ACC(MACH) +ALU2(AND) +ALU2(OR) +ALU2(XOR) +ALU2(DP3) +ALU2(DP4) +ALU2(DPH) +ALU2(SHL) +ALU2(SHR) +ALU2(ASR) +ALU3(LRP) +ALU1(BFREV) +ALU3(BFE) +ALU2(BFI1) +ALU3(BFI2) +ALU1(FBH) +ALU1(FBL) +ALU1(CBIT) +ALU3(MAD) +ALU2_ACC(ADDC) +ALU2_ACC(SUBB) +ALU2(MAC) +ALU1(DIM) + +/** Gen4 predicated IF. */ +vec4_instruction * +vec4_visitor::IF(enum brw_predicate predicate) +{ + vec4_instruction *inst; + + inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF); + inst->predicate = predicate; + + return inst; +} + +/** Gen6 IF with embedded comparison. */ +vec4_instruction * +vec4_visitor::IF(src_reg src0, src_reg src1, + enum brw_conditional_mod condition) +{ + assert(devinfo->gen == 6); + + vec4_instruction *inst; + + resolve_ud_negate(&src0); + resolve_ud_negate(&src1); + + inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(), + src0, src1); + inst->conditional_mod = condition; + + return inst; +} + +/** + * CMP: Sets the low bit of the destination channels with the result + * of the comparison, while the upper bits are undefined, and updates + * the flag register with the packed 16 bits of the result. 
+ */ +vec4_instruction * +vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, + enum brw_conditional_mod condition) +{ + vec4_instruction *inst; + + /* Take the instruction: + * + * CMP null<d> src0<f> src1<f> + * + * Original gen4 does type conversion to the destination type before + * comparison, producing garbage results for floating point comparisons. + * + * The destination type doesn't matter on newer generations, so we set the + * type to match src0 so we can compact the instruction. + */ + dst.type = src0.type; + + resolve_ud_negate(&src0); + resolve_ud_negate(&src1); + + inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1); + inst->conditional_mod = condition; + + return inst; +} + +vec4_instruction * +vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index) +{ + vec4_instruction *inst; + + inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ, + dst, index); + inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1; + inst->mlen = 2; + + return inst; +} + +vec4_instruction * +vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src, + const src_reg &index) +{ + vec4_instruction *inst; + + inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE, + dst, src, index); + inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen); + inst->mlen = 3; + + return inst; +} + +src_reg +vec4_visitor::fix_3src_operand(const src_reg &src) +{ + /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be + * able to use vertical stride of zero to replicate the vec4 uniform, like + * + * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7] + * + * But you can't, since vertical stride is always four in three-source + * instructions. Instead, insert a MOV instruction to do the replication so + * that the three-source instruction can consume it. + */ + + /* The MOV is only needed if the source is a uniform or immediate. 
*/ + if (src.file != UNIFORM && src.file != IMM) + return src; + + if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle)) + return src; + + dst_reg expanded = dst_reg(this, glsl_type::vec4_type); + expanded.type = src.type; + emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src); + return src_reg(expanded); +} + +src_reg +vec4_visitor::resolve_source_modifiers(const src_reg &src) +{ + if (!src.abs && !src.negate) + return src; + + dst_reg resolved = dst_reg(this, glsl_type::ivec4_type); + resolved.type = src.type; + emit(MOV(resolved, src)); + + return src_reg(resolved); +} + +src_reg +vec4_visitor::fix_math_operand(const src_reg &src) +{ + if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE) + return src; + + /* The gen6 math instruction ignores the source modifiers -- + * swizzle, abs, negate, and at least some parts of the register + * region description. + * + * Rather than trying to enumerate all these cases, *always* expand the + * operand to a temp GRF for gen6. + * + * For gen7, keep the operand as-is, except if immediate, which gen7 still + * can't use. + */ + + if (devinfo->gen == 7 && src.file != IMM) + return src; + + dst_reg expanded = dst_reg(this, glsl_type::vec4_type); + expanded.type = src.type; + emit(MOV(expanded, src)); + return src_reg(expanded); +} + +vec4_instruction * +vec4_visitor::emit_math(enum opcode opcode, + const dst_reg &dst, + const src_reg &src0, const src_reg &src1) +{ + vec4_instruction *math = + emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1)); + + if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) { + /* MATH on Gen6 must be align1, so we can't do writemasks. */ + math->dst = dst_reg(this, glsl_type::vec4_type); + math->dst.type = dst.type; + math = emit(MOV(dst, src_reg(math->dst))); + } else if (devinfo->gen < 6) { + math->base_mrf = 1; + math->mlen = src1.file == BAD_FILE ? 
1 : 2; + } + + return math; +} + +void +vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0) +{ + if (devinfo->gen < 7) { + unreachable("ir_unop_pack_half_2x16 should be lowered"); + } + + assert(dst.type == BRW_REGISTER_TYPE_UD); + assert(src0.type == BRW_REGISTER_TYPE_F); + + /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16: + * + * Because this instruction does not have a 16-bit floating-point type, + * the destination data type must be Word (W). + * + * The destination must be DWord-aligned and specify a horizontal stride + * (HorzStride) of 2. The 16-bit result is stored in the lower word of + * each destination channel and the upper word is not modified. + * + * The above restriction implies that the f32to16 instruction must use + * align1 mode, because only in align1 mode is it possible to specify + * horizontal stride. We choose here to defy the hardware docs and emit + * align16 instructions. + * + * (I [chadv] did attempt to emit align1 instructions for VS f32to16 + * instructions. I was partially successful in that the code passed all + * tests. However, the code was dubiously correct and fragile, and the + * tests were not harsh enough to probe that frailty. Not trusting the + * code, I chose instead to remain in align16 mode in defiance of the hw + * docs). + * + * I've [chadv] experimentally confirmed that, on gen7 hardware and the + * simulator, emitting a f32to16 in align16 mode with UD as destination + * data type is safe. The behavior differs from that specified in the PRM + * in that the upper word of each destination channel is cleared to 0. + */ + + dst_reg tmp_dst(this, glsl_type::uvec2_type); + src_reg tmp_src(tmp_dst); + +#if 0 + /* Verify the undocumented behavior on which the following instructions + * rely. If f32to16 fails to clear the upper word of the X and Y channels, + * then the result of the bit-or instruction below will be incorrect. 
+ * + * You should inspect the disasm output in order to verify that the MOV is + * not optimized away. + */ + emit(MOV(tmp_dst, brw_imm_ud(0x12345678u))); +#endif + + /* Give tmp the form below, where "." means untouched. + * + * w z y x w z y x + * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll| + * + * That the upper word of each write-channel be 0 is required for the + * following bit-shift and bit-or instructions to work. Note that this + * relies on the undocumented hardware behavior mentioned above. + */ + tmp_dst.writemask = WRITEMASK_XY; + emit(F32TO16(tmp_dst, src0)); + + /* Give the write-channels of dst the form: + * 0xhhhh0000 + */ + tmp_src.swizzle = BRW_SWIZZLE_YYYY; + emit(SHL(dst, tmp_src, brw_imm_ud(16u))); + + /* Finally, give the write-channels of dst the form of packHalf2x16's + * output: + * 0xhhhhllll + */ + tmp_src.swizzle = BRW_SWIZZLE_XXXX; + emit(OR(dst, src_reg(dst), tmp_src)); +} + +void +vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0) +{ + if (devinfo->gen < 7) { + unreachable("ir_unop_unpack_half_2x16 should be lowered"); + } + + assert(dst.type == BRW_REGISTER_TYPE_F); + assert(src0.type == BRW_REGISTER_TYPE_UD); + + /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32: + * + * Because this instruction does not have a 16-bit floating-point type, + * the source data type must be Word (W). The destination type must be + * F (Float). + * + * To use W as the source data type, we must adjust horizontal strides, + * which is only possible in align1 mode. All my [chadv] attempts at + * emitting align1 instructions for unpackHalf2x16 failed to pass the + * Piglit tests, so I gave up. + * + * I've verified that, on gen7 hardware and the simulator, it is safe to + * emit f16to32 in align16 mode with UD as source data type. 
+ */ + + dst_reg tmp_dst(this, glsl_type::uvec2_type); + src_reg tmp_src(tmp_dst); + + tmp_dst.writemask = WRITEMASK_X; + emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu))); + + tmp_dst.writemask = WRITEMASK_Y; + emit(SHR(tmp_dst, src0, brw_imm_ud(16u))); + + dst.writemask = WRITEMASK_XY; + emit(F16TO32(dst, tmp_src)); +} + +void +vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0) +{ + /* Instead of splitting the 32-bit integer, shifting, and ORing it back + * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate + * is not suitable to generate the shift values, but we can use the packed + * vector float and a type-converting MOV. + */ + dst_reg shift(this, glsl_type::uvec4_type); + emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78))); + + dst_reg shifted(this, glsl_type::uvec4_type); + src0.swizzle = BRW_SWIZZLE_XXXX; + emit(SHR(shifted, src0, src_reg(shift))); + + shifted.type = BRW_REGISTER_TYPE_UB; + dst_reg f(this, glsl_type::vec4_type); + emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted)); + + emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f))); +} + +void +vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0) +{ + /* Instead of splitting the 32-bit integer, shifting, and ORing it back + * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate + * is not suitable to generate the shift values, but we can use the packed + * vector float and a type-converting MOV. 
+ */ + dst_reg shift(this, glsl_type::uvec4_type); + emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78))); + + dst_reg shifted(this, glsl_type::uvec4_type); + src0.swizzle = BRW_SWIZZLE_XXXX; + emit(SHR(shifted, src0, src_reg(shift))); + + shifted.type = BRW_REGISTER_TYPE_B; + dst_reg f(this, glsl_type::vec4_type); + emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted)); + + dst_reg scaled(this, glsl_type::vec4_type); + emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f))); + + dst_reg max(this, glsl_type::vec4_type); + emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f)); + emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f)); +} + +void +vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0) +{ + dst_reg saturated(this, glsl_type::vec4_type); + vec4_instruction *inst = emit(MOV(saturated, src0)); + inst->saturate = true; + + dst_reg scaled(this, glsl_type::vec4_type); + emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f))); + + dst_reg rounded(this, glsl_type::vec4_type); + emit(RNDE(rounded, src_reg(scaled))); + + dst_reg u(this, glsl_type::uvec4_type); + emit(MOV(u, src_reg(rounded))); + + src_reg bytes(u); + emit(VEC4_OPCODE_PACK_BYTES, dst, bytes); +} + +void +vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0) +{ + dst_reg max(this, glsl_type::vec4_type); + emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f)); + + dst_reg min(this, glsl_type::vec4_type); + emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f)); + + dst_reg scaled(this, glsl_type::vec4_type); + emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f))); + + dst_reg rounded(this, glsl_type::vec4_type); + emit(RNDE(rounded, src_reg(scaled))); + + dst_reg i(this, glsl_type::ivec4_type); + emit(MOV(i, src_reg(rounded))); + + src_reg bytes(i); + emit(VEC4_OPCODE_PACK_BYTES, dst, bytes); +} + +/* + * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 == + * false) elements needed to pack a 
type.
 */
static int
type_size_xvec4(const struct glsl_type *type, bool as_vec4)
{
   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
   case GLSL_TYPE_DOUBLE:
   case GLSL_TYPE_UINT64:
   case GLSL_TYPE_INT64: {
      if (!type->is_matrix()) {
         /* Every scalar or vector gets a whole slot, whatever its width.
          * That wastes space for things like lone floats, but keeps array
          * addressing simple; a later pass may pack scalars more tightly.
          * Dual-slot types count twice when measuring in vec4 units.
          */
         return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
      }

      /* Matrices take one entry per column, doubled for dual-slot columns
       * when measuring in vec4 units.
       */
      const glsl_type *column = type->column_type();
      const unsigned slots_per_column =
         (as_vec4 && column->is_dual_slot()) ? 2 : 1;
      return type->matrix_columns * slots_per_column;
   }

   case GLSL_TYPE_ARRAY:
      assert(type->length > 0);
      return type_size_xvec4(type->fields.array, as_vec4) * type->length;

   case GLSL_TYPE_STRUCT: {
      /* A struct is the sum of its members. */
      int total = 0;
      for (unsigned int field = 0; field < type->length; field++)
         total += type_size_xvec4(type->fields.structure[field].type,
                                  as_vec4);
      return total;
   }

   case GLSL_TYPE_SUBROUTINE:
      return 1;

   case GLSL_TYPE_SAMPLER:
      /* Samplers are baked in at link time and need no register space. */
      return 0;

   case GLSL_TYPE_ATOMIC_UINT:
      return 0;

   case GLSL_TYPE_IMAGE:
      return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);

   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
   case GLSL_TYPE_FUNCTION:
      unreachable("not reached");
   }

   return 0;
}

/**
 * Returns the minimum number of vec4 elements needed to pack a type.
 *
 * For simple types, it will return 1 (a single vec4); for matrices, the
 * number of columns; for array and struct, the sum of the vec4_size of
 * each of its elements; and for sampler and atomic, zero.
 *
 * This method is useful to calculate how much register space is needed to
 * store a particular type.
+ */ +extern "C" int +type_size_vec4(const struct glsl_type *type) +{ + return type_size_xvec4(type, true); +} + +/** + * Returns the minimum number of dvec4 elements needed to pack a type. + * + * For simple types, it will return 1 (a single dvec4); for matrices, the + * number of columns; for array and struct, the sum of the dvec4_size of + * each of its elements; and for sampler and atomic, zero. + * + * This method is useful to calculate how much register space is needed to + * store a particular type. + * + * Measuring double-precision vertex inputs as dvec4 is required because + * ARB_vertex_attrib_64bit states that these uses the same number of locations + * than the single-precision version. That is, two consecutives dvec4 would be + * located in location "x" and location "x+1", not "x+2". + * + * In order to map vec4/dvec4 vertex inputs in the proper ATTRs, + * remap_vs_attrs() will take in account both the location and also if the + * type fits in one or two vec4 slots. + */ +extern "C" int +type_size_dvec4(const struct glsl_type *type) +{ + return type_size_xvec4(type, false); +} + +src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type) +{ + init(); + + this->file = VGRF; + this->nr = v->alloc.allocate(type_size_vec4(type)); + + if (type->is_array() || type->is_record()) { + this->swizzle = BRW_SWIZZLE_NOOP; + } else { + this->swizzle = brw_swizzle_for_size(type->vector_elements); + } + + this->type = brw_type_for_base_type(type); +} + +src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size) +{ + assert(size > 0); + + init(); + + this->file = VGRF; + this->nr = v->alloc.allocate(type_size_vec4(type) * size); + + this->swizzle = BRW_SWIZZLE_NOOP; + + this->type = brw_type_for_base_type(type); +} + +dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type) +{ + init(); + + this->file = VGRF; + this->nr = v->alloc.allocate(type_size_vec4(type)); + + if (type->is_array() || type->is_record()) { + 
this->writemask = WRITEMASK_XYZW; + } else { + this->writemask = (1 << type->vector_elements) - 1; + } + + this->type = brw_type_for_base_type(type); +} + +vec4_instruction * +vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst, + src_reg src0, src_reg src1) +{ + vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1); + inst->conditional_mod = conditionalmod; + return inst; +} + +vec4_instruction * +vec4_visitor::emit_lrp(const dst_reg &dst, + const src_reg &x, const src_reg &y, const src_reg &a) +{ + if (devinfo->gen >= 6) { + /* Note that the instruction's argument order is reversed from GLSL + * and the IR. + */ + return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y), + fix_3src_operand(x))); + } else { + /* Earlier generations don't support three source operations, so we + * need to emit x*(1-a) + y*a. + */ + dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type); + dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type); + dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type); + y_times_a.writemask = dst.writemask; + one_minus_a.writemask = dst.writemask; + x_times_one_minus_a.writemask = dst.writemask; + + emit(MUL(y_times_a, y, a)); + emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f))); + emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a))); + return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a))); + } +} + +/** + * Emits the instructions needed to perform a pull constant load. before_block + * and before_inst can be NULL in which case the instruction will be appended + * to the end of the instruction list. 
+ */ +void +vec4_visitor::emit_pull_constant_load_reg(dst_reg dst, + src_reg surf_index, + src_reg offset_reg, + bblock_t *before_block, + vec4_instruction *before_inst) +{ + assert((before_inst == NULL && before_block == NULL) || + (before_inst && before_block)); + + vec4_instruction *pull; + + if (devinfo->gen >= 9) { + /* Gen9+ needs a message header in order to use SIMD4x2 mode */ + src_reg header(this, glsl_type::uvec4_type, 2); + + pull = new(mem_ctx) + vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9, + dst_reg(header)); + + if (before_inst) + emit_before(before_block, before_inst, pull); + else + emit(pull); + + dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE), + offset_reg.type); + pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg); + + if (before_inst) + emit_before(before_block, before_inst, pull); + else + emit(pull); + + pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7, + dst, + surf_index, + header); + pull->mlen = 2; + pull->header_size = 1; + } else if (devinfo->gen >= 7) { + dst_reg grf_offset = dst_reg(this, glsl_type::uint_type); + + grf_offset.type = offset_reg.type; + + pull = MOV(grf_offset, offset_reg); + + if (before_inst) + emit_before(before_block, before_inst, pull); + else + emit(pull); + + pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7, + dst, + surf_index, + src_reg(grf_offset)); + pull->mlen = 1; + } else { + pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD, + dst, + surf_index, + offset_reg); + pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1; + pull->mlen = 1; + } + + if (before_inst) + emit_before(before_block, before_inst, pull); + else + emit(pull); +} + +src_reg +vec4_visitor::emit_uniformize(const src_reg &src) +{ + const src_reg chan_index(this, glsl_type::uint_type); + const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type), + src.type); + + emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index)) + ->force_writemask_all = 
true; + emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index) + ->force_writemask_all = true; + + return src_reg(dst); +} + +src_reg +vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type, + src_reg coordinate, src_reg surface) +{ + vec4_instruction *inst = + new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS, + dst_reg(this, glsl_type::uvec4_type)); + inst->base_mrf = 2; + inst->src[1] = surface; + inst->src[2] = surface; + + int param_base; + + if (devinfo->gen >= 9) { + /* Gen9+ needs a message header in order to use SIMD4x2 mode */ + vec4_instruction *header_inst = new(mem_ctx) + vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9, + dst_reg(MRF, inst->base_mrf)); + + emit(header_inst); + + inst->mlen = 2; + inst->header_size = 1; + param_base = inst->base_mrf + 1; + } else { + inst->mlen = 1; + param_base = inst->base_mrf; + } + + /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */ + int coord_mask = (1 << coordinate_type->vector_elements) - 1; + int zero_mask = 0xf & ~coord_mask; + + emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask), + coordinate)); + + emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask), + brw_imm_d(0))); + + emit(inst); + return src_reg(inst->dst); +} + +bool +vec4_visitor::is_high_sampler(src_reg sampler) +{ + if (devinfo->gen < 8 && !devinfo->is_haswell) + return false; + + return sampler.file != IMM || sampler.ud >= 16; +} + +void +vec4_visitor::emit_texture(ir_texture_opcode op, + dst_reg dest, + const glsl_type *dest_type, + src_reg coordinate, + int coord_components, + src_reg shadow_comparator, + src_reg lod, src_reg lod2, + src_reg sample_index, + uint32_t constant_offset, + src_reg offset_value, + src_reg mcs, + uint32_t surface, + src_reg surface_reg, + src_reg sampler_reg) +{ + /* The sampler can only meaningfully compute LOD for fragment shader + * messages. For all other stages, we change the opcode to TXL and hardcode + * the LOD to 0. 
+ * + * textureQueryLevels() is implemented in terms of TXS so we need to pass a + * valid LOD argument. + */ + if (op == ir_tex || op == ir_query_levels) { + assert(lod.file == BAD_FILE); + lod = brw_imm_f(0.0f); + } + + enum opcode opcode; + switch (op) { + case ir_tex: opcode = SHADER_OPCODE_TXL; break; + case ir_txl: opcode = SHADER_OPCODE_TXL; break; + case ir_txd: opcode = SHADER_OPCODE_TXD; break; + case ir_txf: opcode = SHADER_OPCODE_TXF; break; + case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W : + SHADER_OPCODE_TXF_CMS); break; + case ir_txs: opcode = SHADER_OPCODE_TXS; break; + case ir_tg4: opcode = offset_value.file != BAD_FILE + ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break; + case ir_query_levels: opcode = SHADER_OPCODE_TXS; break; + case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break; + case ir_txb: + unreachable("TXB is not valid for vertex shaders."); + case ir_lod: + unreachable("LOD is not valid for vertex shaders."); + case ir_samples_identical: { + /* There are some challenges implementing this for vec4, and it seems + * unlikely to be used anyway. For now, just return false ways. + */ + emit(MOV(dest, brw_imm_ud(0u))); + return; + } + default: + unreachable("Unrecognized tex op"); + } + + vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest); + + inst->offset = constant_offset; + + /* The message header is necessary for: + * - Gen4 (always) + * - Gen9+ for selecting SIMD4x2 + * - Texel offsets + * - Gather channel selection + * - Sampler indices too large to fit in a 4-bit value. + * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal + */ + inst->header_size = + (devinfo->gen < 5 || devinfo->gen >= 9 || + inst->offset != 0 || op == ir_tg4 || + op == ir_texture_samples || + is_high_sampler(sampler_reg)) ? 
1 : 0; + inst->base_mrf = 2; + inst->mlen = inst->header_size; + inst->dst.writemask = WRITEMASK_XYZW; + inst->shadow_compare = shadow_comparator.file != BAD_FILE; + + inst->src[1] = surface_reg; + inst->src[2] = sampler_reg; + + /* MRF for the first parameter */ + int param_base = inst->base_mrf + inst->header_size; + + if (op == ir_txs || op == ir_query_levels) { + int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X; + emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod)); + inst->mlen++; + } else if (op == ir_texture_samples) { + inst->dst.writemask = WRITEMASK_X; + } else { + /* Load the coordinate */ + /* FINISHME: gl_clamp_mask and saturate */ + int coord_mask = (1 << coord_components) - 1; + int zero_mask = 0xf & ~coord_mask; + + emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask), + coordinate)); + inst->mlen++; + + if (zero_mask != 0) { + emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask), + brw_imm_d(0))); + } + /* Load the shadow comparator */ + if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) { + emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type, + WRITEMASK_X), + shadow_comparator)); + inst->mlen++; + } + + /* Load the LOD info */ + if (op == ir_tex || op == ir_txl) { + int mrf, writemask; + if (devinfo->gen >= 5) { + mrf = param_base + 1; + if (shadow_comparator.file != BAD_FILE) { + writemask = WRITEMASK_Y; + /* mlen already incremented */ + } else { + writemask = WRITEMASK_X; + inst->mlen++; + } + } else /* devinfo->gen == 4 */ { + mrf = param_base; + writemask = WRITEMASK_W; + } + emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod)); + } else if (op == ir_txf) { + emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod)); + } else if (op == ir_txf_ms) { + emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X), + sample_index)); + if (opcode == SHADER_OPCODE_TXF_CMS_W) { + /* MCS data is stored in the first two channels 
of ‘mcs’, but we + * need to get it into the .y and .z channels of the second vec4 + * of params. + */ + mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1); + emit(MOV(dst_reg(MRF, param_base + 1, + glsl_type::uint_type, WRITEMASK_YZ), + mcs)); + } else if (devinfo->gen >= 7) { + /* MCS data is in the first channel of `mcs`, but we need to get it into + * the .y channel of the second vec4 of params, so replicate .x across + * the whole vec4 and then mask off everything except .y + */ + mcs.swizzle = BRW_SWIZZLE_XXXX; + emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y), + mcs)); + } + inst->mlen++; + } else if (op == ir_txd) { + const brw_reg_type type = lod.type; + + if (devinfo->gen >= 5) { + lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y); + lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y); + emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod)); + emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2)); + inst->mlen++; + + if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) { + lod.swizzle = BRW_SWIZZLE_ZZZZ; + lod2.swizzle = BRW_SWIZZLE_ZZZZ; + emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod)); + emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2)); + inst->mlen++; + + if (shadow_comparator.file != BAD_FILE) { + emit(MOV(dst_reg(MRF, param_base + 2, + shadow_comparator.type, WRITEMASK_Z), + shadow_comparator)); + } + } + } else /* devinfo->gen == 4 */ { + emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod)); + emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2)); + inst->mlen += 2; + } + } else if (op == ir_tg4 && offset_value.file != BAD_FILE) { + if (shadow_comparator.file != BAD_FILE) { + emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W), + shadow_comparator)); + } + + emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY), + offset_value)); + inst->mlen++; + } + } + + 
emit(inst); + + /* fixup num layers (z) for cube arrays: hardware returns faces * layers; + * spec requires layers. + */ + if (op == ir_txs && devinfo->gen < 7) { + /* Gen4-6 return 0 instead of 1 for single layer surfaces. */ + emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z), + src_reg(inst->dst), brw_imm_d(1)); + } + + if (devinfo->gen == 6 && op == ir_tg4) { + emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst); + } + + if (op == ir_query_levels) { + /* # levels is in .w */ + src_reg swizzled(dest); + swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, + SWIZZLE_W, SWIZZLE_W); + emit(MOV(dest, swizzled)); + } +} + +/** + * Apply workarounds for Gen6 gather with UINT/SINT + */ +void +vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst) +{ + if (!wa) + return; + + int width = (wa & WA_8BIT) ? 8 : 16; + dst_reg dst_f = dst; + dst_f.type = BRW_REGISTER_TYPE_F; + + /* Convert from UNORM to UINT */ + emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1)))); + emit(MOV(dst, src_reg(dst_f))); + + if (wa & WA_SIGN) { + /* Reinterpret the UINT value as a signed INT value by + * shifting the sign bit into place, then shifting back + * preserving sign. 
+ */ + emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width))); + emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width))); + } +} + +void +vec4_visitor::gs_emit_vertex(int /* stream_id */) +{ + unreachable("not reached"); +} + +void +vec4_visitor::gs_end_primitive() +{ + unreachable("not reached"); +} + +void +vec4_visitor::emit_ndc_computation() +{ + if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE) + return; + + /* Get the position */ + src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]); + + /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */ + dst_reg ndc = dst_reg(this, glsl_type::vec4_type); + output_reg[BRW_VARYING_SLOT_NDC][0] = ndc; + output_num_components[BRW_VARYING_SLOT_NDC][0] = 4; + + current_annotation = "NDC"; + dst_reg ndc_w = ndc; + ndc_w.writemask = WRITEMASK_W; + src_reg pos_w = pos; + pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W); + emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w); + + dst_reg ndc_xyz = ndc; + ndc_xyz.writemask = WRITEMASK_XYZ; + + emit(MUL(ndc_xyz, pos, src_reg(ndc_w))); +} + +void +vec4_visitor::emit_psiz_and_flags(dst_reg reg) +{ + if (devinfo->gen < 6 && + ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) || + output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE || + devinfo->has_negative_rhw_bug)) { + dst_reg header1 = dst_reg(this, glsl_type::uvec4_type); + dst_reg header1_w = header1; + header1_w.writemask = WRITEMASK_W; + + emit(MOV(header1, brw_imm_ud(0u))); + + if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) { + src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]); + + current_annotation = "Point size"; + emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11)))); + emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8))); + } + + if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) { + current_annotation = "Clipping flags"; + dst_reg flags0 = dst_reg(this, glsl_type::uint_type); + dst_reg flags1 = dst_reg(this, glsl_type::uint_type); + + emit(CMP(dst_null_f(), 
src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L)); + emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0)); + emit(OR(header1_w, src_reg(header1_w), src_reg(flags0))); + + emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L)); + emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0)); + emit(SHL(flags1, src_reg(flags1), brw_imm_d(4))); + emit(OR(header1_w, src_reg(header1_w), src_reg(flags1))); + } + + /* i965 clipping workaround: + * 1) Test for -ve rhw + * 2) If set, + * set ndc = (0,0,0,0) + * set ucp[6] = 1 + * + * Later, clipping will detect ucp[6] and ensure the primitive is + * clipped against all fixed planes. + */ + if (devinfo->has_negative_rhw_bug && + output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) { + src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]); + ndc_w.swizzle = BRW_SWIZZLE_WWWW; + emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L)); + vec4_instruction *inst; + inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6))); + inst->predicate = BRW_PREDICATE_NORMAL; + output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F; + inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f))); + inst->predicate = BRW_PREDICATE_NORMAL; + } + + emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1))); + } else if (devinfo->gen < 6) { + emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u))); + } else { + emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0))); + if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) { + dst_reg reg_w = reg; + reg_w.writemask = WRITEMASK_W; + src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]); + reg_as_src.type = reg_w.type; + reg_as_src.swizzle = brw_swizzle_for_size(1); + emit(MOV(reg_w, reg_as_src)); + } + if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) { + dst_reg reg_y = reg; + reg_y.writemask = WRITEMASK_Y; + reg_y.type = 
BRW_REGISTER_TYPE_D; + output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type; + emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0]))); + } + if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) { + dst_reg reg_z = reg; + reg_z.writemask = WRITEMASK_Z; + reg_z.type = BRW_REGISTER_TYPE_D; + output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type; + emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0]))); + } + } +} + +vec4_instruction * +vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component) +{ + assert(varying < VARYING_SLOT_MAX); + + unsigned num_comps = output_num_components[varying][component]; + if (num_comps == 0) + return NULL; + + assert(output_reg[varying][component].type == reg.type); + current_annotation = output_reg_annotation[varying]; + if (output_reg[varying][component].file != BAD_FILE) { + src_reg src = src_reg(output_reg[varying][component]); + src.swizzle = BRW_SWZ_COMP_OUTPUT(component); + reg.writemask = + brw_writemask_for_component_packing(num_comps, component); + return emit(MOV(reg, src)); + } + return NULL; +} + +void +vec4_visitor::emit_urb_slot(dst_reg reg, int varying) +{ + reg.type = BRW_REGISTER_TYPE_F; + output_reg[varying][0].type = reg.type; + + switch (varying) { + case VARYING_SLOT_PSIZ: + { + /* PSIZ is always in slot 0, and is coupled with other flags. */ + current_annotation = "indices, point width, clip flags"; + emit_psiz_and_flags(reg); + break; + } + case BRW_VARYING_SLOT_NDC: + current_annotation = "NDC"; + if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) + emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]))); + break; + case VARYING_SLOT_POS: + current_annotation = "gl_Position"; + if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE) + emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0]))); + break; + case VARYING_SLOT_EDGE: + /* This is present when doing unfilled polygons. 
We're supposed to copy + * the edge flag from the user-provided vertex array + * (glEdgeFlagPointer), or otherwise we'll copy from the current value + * of that attribute (starts as 1.0f). This is then used in clipping to + * determine which edges should be drawn as wireframe. + */ + current_annotation = "edge flag"; + emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG, + glsl_type::float_type, WRITEMASK_XYZW)))); + break; + case BRW_VARYING_SLOT_PAD: + /* No need to write to this slot */ + break; + default: + for (int i = 0; i < 4; i++) { + emit_generic_urb_slot(reg, varying, i); + } + break; + } +} + +static int +align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen) +{ + if (devinfo->gen >= 6) { + /* URB data written (does not include the message header reg) must + * be a multiple of 256 bits, or 2 VS registers. See vol5c.5, + * section 5.4.3.2.2: URB_INTERLEAVED. + * + * URB entries are allocated on a multiple of 1024 bits, so an + * extra 128 bits written here to make the end align to 256 is + * no problem. + */ + if ((mlen % 2) != 1) + mlen++; + } + + return mlen; +} + + +/** + * Generates the VUE payload plus the necessary URB write instructions to + * output it. + * + * The VUE layout is documented in Volume 2a. + */ +void +vec4_visitor::emit_vertex() +{ + /* MRF 0 is reserved for the debugger, so start with message header + * in MRF 1. + */ + int base_mrf = 1; + int mrf = base_mrf; + /* In the process of generating our URB write message contents, we + * may need to unspill a register or load from an array. Those + * reads would use MRFs 14-15. + */ + int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen); + + /* The following assertion verifies that max_usable_mrf causes an + * even-numbered amount of URB write data, which will meet gen6's + * requirements for length alignment. + */ + assert ((max_usable_mrf - base_mrf) % 2 == 0); + + /* First mrf is the g0-based message header containing URB handles and + * such. 
+ */ + emit_urb_write_header(mrf++); + + if (devinfo->gen < 6) { + emit_ndc_computation(); + } + + /* We may need to split this up into several URB writes, so do them in a + * loop. + */ + int slot = 0; + bool complete = false; + do { + /* URB offset is in URB row increments, and each of our MRFs is half of + * one of those, since we're doing interleaved writes. + */ + int offset = slot / 2; + + mrf = base_mrf + 1; + for (; slot < prog_data->vue_map.num_slots; ++slot) { + emit_urb_slot(dst_reg(MRF, mrf++), + prog_data->vue_map.slot_to_varying[slot]); + + /* If this was max_usable_mrf, we can't fit anything more into this + * URB WRITE. Same thing if we reached the maximum length available. + */ + if (mrf > max_usable_mrf || + align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) { + slot++; + break; + } + } + + complete = slot >= prog_data->vue_map.num_slots; + current_annotation = "URB write"; + vec4_instruction *inst = emit_urb_write_opcode(complete); + inst->base_mrf = base_mrf; + inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf); + inst->offset += offset; + } while(!complete); +} + + +src_reg +vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst, + src_reg *reladdr, int reg_offset) +{ + /* Because we store the values to scratch interleaved like our + * vertex data, we need to scale the vec4 index by 2. + */ + int message_header_scale = 2; + + /* Pre-gen6, the message header uses byte offsets instead of vec4 + * (16-byte) offset units. + */ + if (devinfo->gen < 6) + message_header_scale *= 16; + + if (reladdr) { + /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have + * to multiply the reladdr by 2. Notice that the reg_offset part + * is in units of 16 bytes and is used to select the low/high 16-byte + * chunk of a full dvec4, so we don't want to multiply that part. 
+ */ + src_reg index = src_reg(this, glsl_type::int_type); + if (type_sz(inst->dst.type) < 8) { + emit_before(block, inst, ADD(dst_reg(index), *reladdr, + brw_imm_d(reg_offset))); + emit_before(block, inst, MUL(dst_reg(index), index, + brw_imm_d(message_header_scale))); + } else { + emit_before(block, inst, MUL(dst_reg(index), *reladdr, + brw_imm_d(message_header_scale * 2))); + emit_before(block, inst, ADD(dst_reg(index), index, + brw_imm_d(reg_offset * message_header_scale))); + } + return index; + } else { + return brw_imm_d(reg_offset * message_header_scale); + } +} + +/** + * Emits an instruction before @inst to load the value named by @orig_src + * from scratch space at @base_offset to @temp. + * + * @base_offset is measured in 32-byte units (the size of a register). + */ +void +vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst, + dst_reg temp, src_reg orig_src, + int base_offset) +{ + assert(orig_src.offset % REG_SIZE == 0); + int reg_offset = base_offset + orig_src.offset / REG_SIZE; + src_reg index = get_scratch_offset(block, inst, orig_src.reladdr, + reg_offset); + + if (type_sz(orig_src.type) < 8) { + emit_before(block, inst, SCRATCH_READ(temp, index)); + } else { + dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type); + dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F); + emit_before(block, inst, SCRATCH_READ(shuffled_float, index)); + index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1); + vec4_instruction *last_read = + SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index); + emit_before(block, inst, last_read); + shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read); + } +} + +/** + * Emits an instruction after @inst to store the value to be written + * to @orig_dst to scratch space at @base_offset, from @temp. + * + * @base_offset is measured in 32-byte units (the size of a register). 
+ */ +void +vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst, + int base_offset) +{ + assert(inst->dst.offset % REG_SIZE == 0); + int reg_offset = base_offset + inst->dst.offset / REG_SIZE; + src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr, + reg_offset); + + /* Create a temporary register to store *inst's result in. + * + * We have to be careful in MOVing from our temporary result register in + * the scratch write. If we swizzle from channels of the temporary that + * weren't initialized, it will confuse live interval analysis, which will + * make spilling fail to make progress. + */ + bool is_64bit = type_sz(inst->dst.type) == 8; + const glsl_type *alloc_type = + is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type; + const src_reg temp = swizzle(retype(src_reg(this, alloc_type), + inst->dst.type), + brw_swizzle_for_mask(inst->dst.writemask)); + + if (!is_64bit) { + dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), + inst->dst.writemask)); + vec4_instruction *write = SCRATCH_WRITE(dst, temp, index); + if (inst->opcode != BRW_OPCODE_SEL) + write->predicate = inst->predicate; + write->ir = inst->ir; + write->annotation = inst->annotation; + inst->insert_after(block, write); + } else { + dst_reg shuffled = dst_reg(this, alloc_type); + vec4_instruction *last = + shuffle_64bit_data(shuffled, temp, true, block, inst); + src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F)); + + uint8_t mask = 0; + if (inst->dst.writemask & WRITEMASK_X) + mask |= WRITEMASK_XY; + if (inst->dst.writemask & WRITEMASK_Y) + mask |= WRITEMASK_ZW; + if (mask) { + dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask)); + + vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index); + if (inst->opcode != BRW_OPCODE_SEL) + write->predicate = inst->predicate; + write->ir = inst->ir; + write->annotation = inst->annotation; + last->insert_after(block, write); + } + + mask = 0; + if (inst->dst.writemask & 
WRITEMASK_Z) + mask |= WRITEMASK_XY; + if (inst->dst.writemask & WRITEMASK_W) + mask |= WRITEMASK_ZW; + if (mask) { + dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask)); + + src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr, + reg_offset + 1); + vec4_instruction *write = + SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index); + if (inst->opcode != BRW_OPCODE_SEL) + write->predicate = inst->predicate; + write->ir = inst->ir; + write->annotation = inst->annotation; + last->insert_after(block, write); + } + } + + inst->dst.file = temp.file; + inst->dst.nr = temp.nr; + inst->dst.offset %= REG_SIZE; + inst->dst.reladdr = NULL; +} + +/** + * Checks if \p src and/or \p src.reladdr require a scratch read, and if so, + * adds the scratch read(s) before \p inst. The function also checks for + * recursive reladdr scratch accesses, issuing the corresponding scratch + * loads and rewriting reladdr references accordingly. + * + * \return \p src if it did not require a scratch load, otherwise, the + * register holding the result of the scratch load that the caller should + * use to rewrite src. + */ +src_reg +vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block, + vec4_instruction *inst, src_reg src) +{ + /* Resolve recursive reladdr scratch access by calling ourselves + * with src.reladdr + */ + if (src.reladdr) + *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst, + *src.reladdr); + + /* Now handle scratch access on src */ + if (src.file == VGRF && scratch_loc[src.nr] != -1) { + dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ? + glsl_type::dvec4_type : glsl_type::vec4_type); + emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]); + src.nr = temp.nr; + src.offset %= REG_SIZE; + src.reladdr = NULL; + } + + return src; +} + +/** + * We can't generally support array access in GRF space, because a + * single instruction's destination can only span 2 contiguous + * registers. 
So, we send all GRF arrays that get variable index + * access to scratch space. + */ +void +vec4_visitor::move_grf_array_access_to_scratch() +{ + int scratch_loc[this->alloc.count]; + memset(scratch_loc, -1, sizeof(scratch_loc)); + + /* First, calculate the set of virtual GRFs that need to be punted + * to scratch due to having any array access on them, and where in + * scratch. + */ + foreach_block_and_inst(block, vec4_instruction, inst, cfg) { + if (inst->dst.file == VGRF && inst->dst.reladdr) { + if (scratch_loc[inst->dst.nr] == -1) { + scratch_loc[inst->dst.nr] = last_scratch; + last_scratch += this->alloc.sizes[inst->dst.nr]; + } + + for (src_reg *iter = inst->dst.reladdr; + iter->reladdr; + iter = iter->reladdr) { + if (iter->file == VGRF && scratch_loc[iter->nr] == -1) { + scratch_loc[iter->nr] = last_scratch; + last_scratch += this->alloc.sizes[iter->nr]; + } + } + } + + for (int i = 0 ; i < 3; i++) { + for (src_reg *iter = &inst->src[i]; + iter->reladdr; + iter = iter->reladdr) { + if (iter->file == VGRF && scratch_loc[iter->nr] == -1) { + scratch_loc[iter->nr] = last_scratch; + last_scratch += this->alloc.sizes[iter->nr]; + } + } + } + } + + /* Now, for anything that will be accessed through scratch, rewrite + * it to load/store. Note that this is a _safe list walk, because + * we may generate a new scratch_write instruction after the one + * we're processing. + */ + foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { + /* Set up the annotation tracking for new generated instructions. */ + base_ir = inst->ir; + current_annotation = inst->annotation; + + /* First handle scratch access on the dst. Notice we have to handle + * the case where the dst's reladdr also points to scratch space. 
+ */ + if (inst->dst.reladdr) + *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst, + *inst->dst.reladdr); + + /* Now that we have handled any (possibly recursive) reladdr scratch + * accesses for dst we can safely do the scratch write for dst itself + */ + if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1) + emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]); + + /* Now handle scratch access on any src. In this case, since inst->src[i] + * already is a src_reg, we can just call emit_resolve_reladdr with + * inst->src[i] and it will take care of handling scratch loads for + * both src and src.reladdr (recursively). + */ + for (int i = 0 ; i < 3; i++) { + inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst, + inst->src[i]); + } + } +} + +/** + * Emits an instruction before @inst to load the value named by @orig_src + * from the pull constant buffer (surface) at @base_offset to @temp. + * + * If @indirect is a valid register (its file is not BAD_FILE), it is added + * to the constant byte offset of each load. + */ +void +vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst, + dst_reg temp, src_reg orig_src, + int base_offset, src_reg indirect) +{ + assert(orig_src.offset % 16 == 0); + const unsigned index = prog_data->base.binding_table.pull_constants_start; + + /* For 64-bit loads we need to emit two 32-bit load messages and we also + * need to shuffle the 32-bit data result into proper 64-bit data. To do + * that we emit the 32-bit loads into a temporary and we shuffle the result + * into the original destination. + */ + dst_reg orig_temp = temp; + bool is_64bit = type_sz(orig_src.type) == 8; + if (is_64bit) { + assert(type_sz(temp.type) == 8); + dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type); + temp = retype(temp_df, BRW_REGISTER_TYPE_F); + } + + src_reg src = orig_src; + for (int i = 0; i < (is_64bit ? 
2 : 1); i++) { + int reg_offset = base_offset + src.offset / 16; + + src_reg offset; + if (indirect.file != BAD_FILE) { + offset = src_reg(this, glsl_type::uint_type); + emit_before(block, inst, ADD(dst_reg(offset), indirect, + brw_imm_ud(reg_offset * 16))); + } else if (devinfo->gen >= 8) { + /* Store the offset in a GRF so we can send-from-GRF. */ + offset = src_reg(this, glsl_type::uint_type); + emit_before(block, inst, MOV(dst_reg(offset), + brw_imm_ud(reg_offset * 16))); + } else { + offset = brw_imm_d(reg_offset * 16); + } + + emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE), + brw_imm_ud(index), + offset, + block, inst); + + /* Step the source 16 bytes forward for the second load of a 64-bit value. */ + src = byte_offset(src, 16); + } + + brw_mark_surface_used(&prog_data->base, index); + + if (is_64bit) { + temp = retype(temp, BRW_REGISTER_TYPE_DF); + shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst); + } +} + +/** + * Implements array access of uniforms by inserting a + * PULL_CONSTANT_LOAD instruction. + * + * Unlike temporary GRF array access (where we don't support it due to + * the difficulty of doing relative addressing on instruction + * destinations), we could potentially do array access of uniforms + * that were loaded in GRF space as push constants. In real-world + * usage we've seen, though, the arrays being used are always larger + * than we could load as push constants, so just always move all + * uniform array access out to a pull constant buffer. + */ +void +vec4_visitor::move_uniform_array_access_to_pull_constants() +{ + /* The Vulkan driver doesn't support pull constants other than UBOs so + * everything has to be pushed regardless. + */ + if (stage_prog_data->pull_param == NULL) { + split_uniform_registers(); + return; + } + + int pull_constant_loc[this->uniforms]; + memset(pull_constant_loc, -1, sizeof(pull_constant_loc)); + + /* First, walk through the instructions and determine which things need to + * be pulled. 
We mark something as needing to be pulled by setting + * pull_constant_loc to 0. + */ + foreach_block_and_inst(block, vec4_instruction, inst, cfg) { + /* We only care about MOV_INDIRECT of a uniform */ + if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT || + inst->src[0].file != UNIFORM) + continue; + + int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16; + + for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++) + pull_constant_loc[uniform_nr + j] = 0; + } + + /* Next, we walk the list of uniforms and assign real pull constant + * locations and set their corresponding entries in pull_param. + */ + for (int j = 0; j < this->uniforms; j++) { + if (pull_constant_loc[j] < 0) + continue; + + pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4; + + for (int i = 0; i < 4; i++) { + stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] + = stage_prog_data->param[j * 4 + i]; + } + } + + /* Finally, we can walk through the instructions and lower MOV_INDIRECT + * instructions to actual uniform pulls. + */ + foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { + /* We only care about MOV_INDIRECT of a uniform */ + if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT || + inst->src[0].file != UNIFORM) + continue; + + int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16; + + assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP); + + emit_pull_constant_load(block, inst, inst->dst, inst->src[0], + pull_constant_loc[uniform_nr], inst->src[1]); + inst->remove(block); + } + + /* Now there are no accesses of the UNIFORM file with a reladdr, so + * no need to track them as larger-than-vec4 objects. This will be + * relied on in cutting out unused uniform vectors from push + * constants. 
+ */ + split_uniform_registers(); +} + +/** + * If @reg is an unsigned (UD) source with the negate modifier set, copies it + * through a MOV into a new uvec4 temporary and makes @reg refer to that + * temporary instead. Any other register is left untouched. + */ +void +vec4_visitor::resolve_ud_negate(src_reg *reg) +{ + if (reg->type != BRW_REGISTER_TYPE_UD || + !reg->negate) + return; + + src_reg temp = src_reg(this, glsl_type::uvec4_type); + emit(BRW_OPCODE_MOV, dst_reg(temp), *reg); + *reg = temp; +} + +vec4_visitor::vec4_visitor(const struct brw_compiler *compiler, + void *log_data, + const struct brw_sampler_prog_key_data *key_tex, + struct brw_vue_prog_data *prog_data, + const nir_shader *shader, + void *mem_ctx, + bool no_spills, + int shader_time_index) + : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base), + key_tex(key_tex), + prog_data(prog_data), + fail_msg(NULL), + first_non_payload_grf(0), + need_all_constants_in_pull_buffer(false), + no_spills(no_spills), + shader_time_index(shader_time_index), + last_scratch(0) +{ + this->failed = false; + + this->base_ir = NULL; + this->current_annotation = NULL; + memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation)); + + memset(this->output_num_components, 0, sizeof(this->output_num_components)); + + this->virtual_grf_start = NULL; + this->virtual_grf_end = NULL; + this->live_intervals = NULL; + + this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF; + + this->uniforms = 0; +} + +vec4_visitor::~vec4_visitor() +{ +} + + +/** + * Records a compile failure described by a printf-style message. Only the + * first call takes effect: the formatted message is stored in fail_msg and + * is also printed to stderr when debug_enabled is set. + */ +void +vec4_visitor::fail(const char *format, ...) 
+{ + va_list va; + char *msg; + + if (failed) + return; + + failed = true; + + va_start(va, format); + msg = ralloc_vasprintf(mem_ctx, format, va); + va_end(va); + msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg); + + this->fail_msg = msg; + + if (debug_enabled) { + fprintf(stderr, "%s", msg); + } +} + +} /* namespace brw */ diff --git a/src/intel/compiler/brw_vec4_vs.h b/src/intel/compiler/brw_vec4_vs.h new file mode 100644 index 00000000000..8c346d7636a --- /dev/null +++ b/src/intel/compiler/brw_vec4_vs.h @@ -0,0 +1,68 @@ +/* + * Copyright © 2006 - 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef BRW_VEC4_VS_VISITOR_H +#define BRW_VEC4_VS_VISITOR_H + +#include "brw_vec4.h" + +namespace brw { + +class vec4_vs_visitor : public vec4_visitor +{ +public: + vec4_vs_visitor(const struct brw_compiler *compiler, + void *log_data, + const struct brw_vs_prog_key *key, + struct brw_vs_prog_data *vs_prog_data, + const nir_shader *shader, + gl_clip_plane *clip_planes, + void *mem_ctx, + int shader_time_index, + bool use_legacy_snorm_formula); + +protected: + virtual dst_reg *make_reg_for_system_value(int location); + virtual void setup_payload(); + virtual void emit_prolog(); + virtual void emit_thread_end(); + virtual void emit_urb_write_header(int mrf); + virtual void emit_urb_slot(dst_reg reg, int varying); + virtual vec4_instruction *emit_urb_write_opcode(bool complete); + +private: + int setup_attributes(int payload_reg); + void setup_uniform_clipplane_values(); + void emit_clip_distances(dst_reg reg, int offset); + + const struct brw_vs_prog_key *const key; + struct brw_vs_prog_data * const vs_prog_data; + + gl_clip_plane *clip_planes; + + bool use_legacy_snorm_formula; +}; + +} /* namespace brw */ + +#endif /* BRW_VEC4_VS_VISITOR_H */ diff --git a/src/intel/compiler/brw_vec4_vs_visitor.cpp b/src/intel/compiler/brw_vec4_vs_visitor.cpp new file mode 100644 index 00000000000..0cec77990d6 --- /dev/null +++ b/src/intel/compiler/brw_vec4_vs_visitor.cpp @@ -0,0 +1,221 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * 
paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + + +#include "brw_vec4_vs.h" +#include "common/gen_debug.h" + +namespace brw { + +void +vec4_vs_visitor::emit_prolog() +{ +} + + +dst_reg * +vec4_vs_visitor::make_reg_for_system_value(int location) +{ + /* VertexID is stored by the VF as the last vertex element, but + * we don't represent it with a flag in inputs_read, so we call + * it VERT_ATTRIB_MAX, which setup_attributes() picks up on. + */ + dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX); + + switch (location) { + case SYSTEM_VALUE_BASE_VERTEX: + reg->writemask = WRITEMASK_X; + vs_prog_data->uses_basevertex = true; + break; + case SYSTEM_VALUE_BASE_INSTANCE: + reg->writemask = WRITEMASK_Y; + vs_prog_data->uses_baseinstance = true; + break; + case SYSTEM_VALUE_VERTEX_ID: + case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE: + reg->writemask = WRITEMASK_Z; + vs_prog_data->uses_vertexid = true; + break; + case SYSTEM_VALUE_INSTANCE_ID: + reg->writemask = WRITEMASK_W; + vs_prog_data->uses_instanceid = true; + break; + case SYSTEM_VALUE_DRAW_ID: + reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX + 1); + reg->writemask = WRITEMASK_X; + vs_prog_data->uses_drawid = true; + break; + default: + unreachable("not reached"); + } + + return reg; +} + + +void +vec4_vs_visitor::emit_urb_write_header(int mrf) +{ + /* No need to do anything for VS; an implied write to this MRF will be + * performed by VS_OPCODE_URB_WRITE. 
+ */ + (void) mrf; +} + + +vec4_instruction * +vec4_vs_visitor::emit_urb_write_opcode(bool complete) +{ + /* For VS, the URB writes end the thread. */ + if (complete) { + if (INTEL_DEBUG & DEBUG_SHADER_TIME) + emit_shader_time_end(); + } + + vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE); + inst->urb_write_flags = complete ? + BRW_URB_WRITE_EOT_COMPLETE : BRW_URB_WRITE_NO_FLAGS; + + return inst; +} + + +void +vec4_vs_visitor::emit_urb_slot(dst_reg reg, int varying) +{ + reg.type = BRW_REGISTER_TYPE_F; + output_reg[varying][0].type = reg.type; + + switch (varying) { + case VARYING_SLOT_COL0: + case VARYING_SLOT_COL1: + case VARYING_SLOT_BFC0: + case VARYING_SLOT_BFC1: { + /* These built-in varyings are only supported in compatibility mode, + * and we only support GS in core profile. So, this must be a vertex + * shader. + */ + vec4_instruction *inst = emit_generic_urb_slot(reg, varying, 0); + if (inst && key->clamp_vertex_color) + inst->saturate = true; + break; + } + default: + return vec4_visitor::emit_urb_slot(reg, varying); + } +} + + +void +vec4_vs_visitor::emit_clip_distances(dst_reg reg, int offset) +{ + /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables): + * + * "If a linked set of shaders forming the vertex stage contains no + * static write to gl_ClipVertex or gl_ClipDistance, but the + * application has requested clipping against user clip planes through + * the API, then the coordinate written to gl_Position is used for + * comparison against the user clip planes." + * + * This function is only called if the shader didn't write to + * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping + * if the user wrote to it; otherwise we use gl_Position. 
+ */ + gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX; + if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) { + clip_vertex = VARYING_SLOT_POS; + } + + for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4; + ++i) { + reg.writemask = 1 << i; + emit(DP4(reg, + src_reg(output_reg[clip_vertex][0]), + src_reg(this->userplane[i + offset]))); + } +} + + +void +vec4_vs_visitor::setup_uniform_clipplane_values() +{ + for (int i = 0; i < key->nr_userclip_plane_consts; ++i) { + this->userplane[i] = dst_reg(UNIFORM, this->uniforms); + this->userplane[i].type = BRW_REGISTER_TYPE_F; + for (int j = 0; j < 4; ++j) { + stage_prog_data->param[this->uniforms * 4 + j] = + (gl_constant_value *) &clip_planes[i][j]; + } + ++this->uniforms; + } +} + + +void +vec4_vs_visitor::emit_thread_end() +{ + setup_uniform_clipplane_values(); + + /* Lower legacy ff and ClipVertex clipping to clip distances */ + if (key->nr_userclip_plane_consts > 0) { + current_annotation = "user clip distances"; + + output_reg[VARYING_SLOT_CLIP_DIST0][0] = + dst_reg(this, glsl_type::vec4_type); + output_reg[VARYING_SLOT_CLIP_DIST1][0] = + dst_reg(this, glsl_type::vec4_type); + output_num_components[VARYING_SLOT_CLIP_DIST0][0] = 4; + output_num_components[VARYING_SLOT_CLIP_DIST1][0] = 4; + + emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0][0], 0); + emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1][0], 4); + } + + /* For VS, we always end the thread by emitting a single vertex. + * emit_urb_write_opcode() will take care of setting the eot flag on the + * SEND instruction. 
+ */ + emit_vertex(); +} + + +vec4_vs_visitor::vec4_vs_visitor(const struct brw_compiler *compiler, + void *log_data, + const struct brw_vs_prog_key *key, + struct brw_vs_prog_data *vs_prog_data, + const nir_shader *shader, + gl_clip_plane *clip_planes, + void *mem_ctx, + int shader_time_index, + bool use_legacy_snorm_formula) + : vec4_visitor(compiler, log_data, &key->tex, &vs_prog_data->base, shader, + mem_ctx, false /* no_spills */, shader_time_index), + key(key), + vs_prog_data(vs_prog_data), + clip_planes(clip_planes), + use_legacy_snorm_formula(use_legacy_snorm_formula) +{ +} + + +} /* namespace brw */ diff --git a/src/intel/compiler/brw_vue_map.c b/src/intel/compiler/brw_vue_map.c new file mode 100644 index 00000000000..e14cba8f67d --- /dev/null +++ b/src/intel/compiler/brw_vue_map.c @@ -0,0 +1,307 @@ +/* + * Copyright © 2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +/** + * @file brw_vue_map.c + * + * This file computes the "VUE map" for a (non-fragment) shader stage, which + * describes the layout of its output varyings. The VUE map is used to match + * outputs from one stage with the inputs of the next. + * + * Largely, varyings can be placed however we like - producers/consumers simply + * have to agree on the layout. However, there is also a "VUE Header" that + * prescribes a fixed-layout for items that interact with fixed function + * hardware, such as the clipper and rasterizer. + * + * Authors: + * Paul Berry <[email protected]> + * Chris Forbes <[email protected]> + * Eric Anholt <[email protected]> + */ + + +#include "brw_compiler.h" +#include "common/gen_debug.h" + +static inline void +assign_vue_slot(struct brw_vue_map *vue_map, int varying, int slot) +{ + /* Make sure this varying hasn't been assigned a slot already */ + assert (vue_map->varying_to_slot[varying] == -1); + + vue_map->varying_to_slot[varying] = slot; + vue_map->slot_to_varying[slot] = varying; +} + +/** + * Compute the VUE map for a shader stage. + */ +void +brw_compute_vue_map(const struct gen_device_info *devinfo, + struct brw_vue_map *vue_map, + uint64_t slots_valid, + bool separate) +{ + /* Keep using the packed/contiguous layout on old hardware - we only need + * the SSO layout when using geometry/tessellation shaders or 32 FS input + * varyings, which only exist on Gen >= 6. It's also a bit more efficient. + */ + if (devinfo->gen < 6) + separate = false; + + if (separate) { + /* In SSO mode, we don't know whether the adjacent stage will + * read/write gl_ClipDistance, which has a fixed slot location. + * We have to assume the worst and reserve a slot for it, or else + * the rest of our varyings will be off by a slot. + * + * Note that we don't have to worry about COL/BFC, as those built-in + * variables only exist in legacy GL, which only supports VS and FS. 
+ */ + slots_valid |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0); + slots_valid |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1); + } + + vue_map->slots_valid = slots_valid; + vue_map->separate = separate; + + /* gl_Layer and gl_ViewportIndex don't get their own varying slots -- they + * are stored in the first VUE slot (VARYING_SLOT_PSIZ). + */ + slots_valid &= ~(VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT); + + /* Make sure that the values we store in vue_map->varying_to_slot and + * vue_map->slot_to_varying won't overflow the signed chars that are used + * to store them. Note that since vue_map->slot_to_varying sometimes holds + * values equal to BRW_VARYING_SLOT_COUNT, we need to ensure that + * BRW_VARYING_SLOT_COUNT is <= 127, not 128. + */ + STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 127); + + for (int i = 0; i < BRW_VARYING_SLOT_COUNT; ++i) { + vue_map->varying_to_slot[i] = -1; + vue_map->slot_to_varying[i] = BRW_VARYING_SLOT_PAD; + } + + int slot = 0; + + /* VUE header: format depends on chip generation and whether clipping is + * enabled. + * + * See the Sandybridge PRM, Volume 2 Part 1, section 1.5.1 (page 30), + * "Vertex URB Entry (VUE) Formats" which describes the VUE header layout. + */ + if (devinfo->gen < 6) { + /* There are 8 dwords in VUE header pre-Ironlake: + * dword 0-3 is indices, point width, clip flags. + * dword 4-7 is ndc position + * dword 8-11 is the first vertex data. + * + * On Ironlake the VUE header is nominally 20 dwords, but the hardware + * will accept the same header layout as Gen4 [and should be a bit faster] + */ + assign_vue_slot(vue_map, VARYING_SLOT_PSIZ, slot++); + assign_vue_slot(vue_map, BRW_VARYING_SLOT_NDC, slot++); + assign_vue_slot(vue_map, VARYING_SLOT_POS, slot++); + } else { + /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge: + * dword 0-3 of the header is indices, point width, clip flags. + * dword 4-7 is the 4D space position + * dword 8-15 of the vertex header is the user clip distance if + * enabled. 
+ * dword 8-11 or 16-19 is the first vertex element data we fill. + */ + assign_vue_slot(vue_map, VARYING_SLOT_PSIZ, slot++); + assign_vue_slot(vue_map, VARYING_SLOT_POS, slot++); + if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)) + assign_vue_slot(vue_map, VARYING_SLOT_CLIP_DIST0, slot++); + if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)) + assign_vue_slot(vue_map, VARYING_SLOT_CLIP_DIST1, slot++); + + /* front and back colors need to be consecutive so that we can use + * ATTRIBUTE_SWIZZLE_INPUTATTR_FACING to swizzle them when doing + * two-sided color. + */ + if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_COL0)) + assign_vue_slot(vue_map, VARYING_SLOT_COL0, slot++); + if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_BFC0)) + assign_vue_slot(vue_map, VARYING_SLOT_BFC0, slot++); + if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_COL1)) + assign_vue_slot(vue_map, VARYING_SLOT_COL1, slot++); + if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_BFC1)) + assign_vue_slot(vue_map, VARYING_SLOT_BFC1, slot++); + } + + /* The hardware doesn't care about the rest of the vertex outputs, so we + * can assign them however we like. For normal programs, we simply assign + * them contiguously. + * + * For separate shader pipelines, we first assign built-in varyings + * contiguous slots. This works because ARB_separate_shader_objects + * requires that all shaders have matching built-in varying interface + * blocks. Next, we assign generic varyings based on their location + * (either explicit or linker assigned). This guarantees a fixed layout. + * + * We generally don't need to assign a slot for VARYING_SLOT_CLIP_VERTEX, + * since it's encoded as the clip distances by emit_clip_distances(). + * However, it may be output by transform feedback, and we'd rather not + * recompute state when TF changes, so we just always include it. 
+ */ + uint64_t builtins = slots_valid & BITFIELD64_MASK(VARYING_SLOT_VAR0); + while (builtins != 0) { + const int varying = ffsll(builtins) - 1; + if (vue_map->varying_to_slot[varying] == -1) { + assign_vue_slot(vue_map, varying, slot++); + } + builtins &= ~BITFIELD64_BIT(varying); + } + + const int first_generic_slot = slot; + uint64_t generics = slots_valid & ~BITFIELD64_MASK(VARYING_SLOT_VAR0); + while (generics != 0) { + const int varying = ffsll(generics) - 1; + if (separate) { + slot = first_generic_slot + varying - VARYING_SLOT_VAR0; + } + assign_vue_slot(vue_map, varying, slot++); + generics &= ~BITFIELD64_BIT(varying); + } + + vue_map->num_slots = slot; + vue_map->num_per_vertex_slots = 0; + vue_map->num_per_patch_slots = 0; +} + +/** + * Compute the VUE map for tessellation control shader outputs and + * tessellation evaluation shader inputs. + */ +void +brw_compute_tess_vue_map(struct brw_vue_map *vue_map, + uint64_t vertex_slots, + uint32_t patch_slots) +{ + /* I don't think anything actually uses this... */ + vue_map->slots_valid = vertex_slots; + + /* separate isn't really meaningful, but make sure it's initialized */ + vue_map->separate = false; + + vertex_slots &= ~(VARYING_BIT_TESS_LEVEL_OUTER | + VARYING_BIT_TESS_LEVEL_INNER); + + /* Make sure that the values we store in vue_map->varying_to_slot and + * vue_map->slot_to_varying won't overflow the signed chars that are used + * to store them. Note that since vue_map->slot_to_varying sometimes holds + * values equal to VARYING_SLOT_TESS_MAX , we need to ensure that + * VARYING_SLOT_TESS_MAX is <= 127, not 128. + */ + STATIC_ASSERT(VARYING_SLOT_TESS_MAX <= 127); + + for (int i = 0; i < VARYING_SLOT_TESS_MAX ; ++i) { + vue_map->varying_to_slot[i] = -1; + vue_map->slot_to_varying[i] = BRW_VARYING_SLOT_PAD; + } + + int slot = 0; + + /* The first 8 DWords are reserved for the "Patch Header". + * + * VARYING_SLOT_TESS_LEVEL_OUTER / INNER live here, but the exact layout + * depends on the domain type. 
They might not be in slots 0 and 1 as + * described here, but pretending they're separate allows us to uniquely + * identify them by distinct slot locations. + */ + assign_vue_slot(vue_map, VARYING_SLOT_TESS_LEVEL_INNER, slot++); + assign_vue_slot(vue_map, VARYING_SLOT_TESS_LEVEL_OUTER, slot++); + + /* first assign per-patch varyings */ + while (patch_slots != 0) { + const int varying = ffsll(patch_slots) - 1; + if (vue_map->varying_to_slot[varying + VARYING_SLOT_PATCH0] == -1) { + assign_vue_slot(vue_map, varying + VARYING_SLOT_PATCH0, slot++); + } + patch_slots &= ~BITFIELD64_BIT(varying); + } + + /* apparently, including the patch header... */ + vue_map->num_per_patch_slots = slot; + + /* then assign per-vertex varyings for each vertex in our patch */ + while (vertex_slots != 0) { + const int varying = ffsll(vertex_slots) - 1; + if (vue_map->varying_to_slot[varying] == -1) { + assign_vue_slot(vue_map, varying, slot++); + } + vertex_slots &= ~BITFIELD64_BIT(varying); + } + + vue_map->num_per_vertex_slots = slot - vue_map->num_per_patch_slots; + vue_map->num_slots = slot; +} + +static const char * +varying_name(brw_varying_slot slot) +{ + assume(slot < BRW_VARYING_SLOT_COUNT); + + if (slot < VARYING_SLOT_MAX) + return gl_varying_slot_name(slot); + + static const char *brw_names[] = { + [BRW_VARYING_SLOT_NDC - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_NDC", + [BRW_VARYING_SLOT_PAD - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_PAD", + [BRW_VARYING_SLOT_PNTC - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_PNTC", + }; + + return brw_names[slot - VARYING_SLOT_MAX]; +} + +void +brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map) +{ + if (vue_map->num_per_vertex_slots > 0 || vue_map->num_per_patch_slots > 0) { + fprintf(fp, "PUE map (%d slots, %d/patch, %d/vertex, %s)\n", + vue_map->num_slots, + vue_map->num_per_patch_slots, + vue_map->num_per_vertex_slots, + vue_map->separate ? 
"SSO" : "non-SSO"); + for (int i = 0; i < vue_map->num_slots; i++) { + if (vue_map->slot_to_varying[i] >= VARYING_SLOT_PATCH0) { + fprintf(fp, " [%d] VARYING_SLOT_PATCH%d\n", i, + vue_map->slot_to_varying[i] - VARYING_SLOT_PATCH0); + } else { + fprintf(fp, " [%d] %s\n", i, + varying_name(vue_map->slot_to_varying[i])); + } + } + } else { + fprintf(fp, "VUE map (%d slots, %s)\n", + vue_map->num_slots, vue_map->separate ? "SSO" : "non-SSO"); + for (int i = 0; i < vue_map->num_slots; i++) { + fprintf(fp, " [%d] %s\n", i, + varying_name(vue_map->slot_to_varying[i])); + } + } + fprintf(fp, "\n"); +} diff --git a/src/intel/compiler/brw_wm_iz.cpp b/src/intel/compiler/brw_wm_iz.cpp new file mode 100644 index 00000000000..5162a369765 --- /dev/null +++ b/src/intel/compiler/brw_wm_iz.cpp @@ -0,0 +1,169 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "brw_fs.h" + + +#undef P /* promoted depth */ +#undef C /* computed */ +#undef N /* non-promoted? */ + +#define P 0 +#define C 1 +#define N 2 + +static const struct { + GLuint mode:2; + GLuint sd_present:1; + GLuint sd_to_rt:1; + GLuint dd_present:1; + GLuint ds_present:1; +} wm_iz_table[BRW_WM_IZ_BIT_MAX] = +{ + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { N, 1, 1, 0, 0 }, + { N, 0, 1, 0, 0 }, + { N, 0, 1, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { C, 0, 1, 1, 0 }, + { C, 0, 1, 1, 0 }, + { P, 0, 0, 0, 0 }, + { N, 1, 1, 0, 0 }, + { C, 0, 1, 1, 0 }, + { C, 0, 1, 1, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { N, 1, 1, 0, 0 }, + { N, 0, 1, 0, 0 }, + { N, 0, 1, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { C, 0, 1, 1, 0 }, + { C, 0, 1, 1, 0 }, + { P, 0, 0, 0, 0 }, + { N, 1, 1, 0, 0 }, + { C, 0, 1, 1, 0 }, + { C, 0, 1, 1, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { N, 1, 1, 0, 1 }, + { N, 0, 1, 0, 1 }, + { N, 0, 1, 0, 1 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { C, 0, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { P, 0, 0, 0, 0 }, + { N, 1, 1, 0, 1 }, + { C, 0, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { P, 0, 0, 0, 0 }, + { C, 0, 0, 0, 1 }, + { P, 0, 0, 0, 0 }, + { C, 0, 1, 0, 1 }, + { P, 0, 0, 0, 0 }, + { C, 1, 1, 0, 1 }, + { C, 0, 1, 0, 1 }, + { C, 0, 1, 0, 1 }, + { P, 0, 0, 0, 0 }, + { C, 1, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { C, 0, 1, 1, 1 
}, + { P, 0, 0, 0, 0 }, + { C, 1, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { C, 0, 1, 1, 1 } +}; + +/** + * Takes no parameters: reads key->line_aa (BRW_WM_AA_NEVER, BRW_WM_AA_ALWAYS or BRW_WM_AA_SOMETIMES) + * and key->iz_lookup (bitmask of BRW_WM_IZ_* flags) to assign the payload registers. + */ +void fs_visitor::setup_fs_payload_gen4() +{ + assert(stage == MESA_SHADER_FRAGMENT); + struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; + GLuint reg = 2; + bool kill_stats_promoted_workaround = false; + int lookup = key->iz_lookup; + + assert(lookup < BRW_WM_IZ_BIT_MAX); + + /* Crazy workaround in the windowizer, which we need to track in + * our register allocation and render target writes. See the "If + * statistics are enabled..." paragraph of 11.5.3.2: Early Depth + * Test Cases [Pre-DevGT] of the 3D Pipeline - Windower B-Spec. + */ + if (key->stats_wm && + (lookup & BRW_WM_IZ_PS_KILL_ALPHATEST_BIT) && + wm_iz_table[lookup].mode == P) { + kill_stats_promoted_workaround = true; + } + + prog_data->uses_src_depth = + (nir->info->inputs_read & (1 << VARYING_SLOT_POS)) != 0; + if (wm_iz_table[lookup].sd_present || prog_data->uses_src_depth || + kill_stats_promoted_workaround) { + payload.source_depth_reg = reg; + reg += 2; + } + + if (wm_iz_table[lookup].sd_to_rt || kill_stats_promoted_workaround) + source_depth_to_render_target = true; + + if (wm_iz_table[lookup].ds_present || key->line_aa != BRW_WM_AA_NEVER) { + payload.aa_dest_stencil_reg = reg; + runtime_check_aads_emit = + !wm_iz_table[lookup].ds_present && key->line_aa == BRW_WM_AA_SOMETIMES; + reg++; + } + + if (wm_iz_table[lookup].dd_present) { + payload.dest_depth_reg = reg; + reg+=2; + } + + payload.num_regs = reg; +} + diff --git a/src/intel/compiler/gen6_gs_visitor.cpp b/src/intel/compiler/gen6_gs_visitor.cpp new file mode 100644 index 00000000000..075bc4ad487 --- /dev/null +++ b/src/intel/compiler/gen6_gs_visitor.cpp @@ -0,0 +1,753 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby 
granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * This code is based on original work by Ilia Mirkin. + */ + +/** + * \file gen6_gs_visitor.cpp + * + * Gen6 geometry shader implementation + */ + +#include "gen6_gs_visitor.h" +#include "brw_eu.h" + +namespace brw { + +void +gen6_gs_visitor::emit_prolog() +{ + vec4_gs_visitor::emit_prolog(); + + /* Gen6 geometry shaders require to allocate an initial VUE handle via + * FF_SYNC message, however the documentation remarks that only one thread + * can write to the URB simultaneously and the FF_SYNC message provides the + * synchronization mechanism for this, so using this message effectively + * stalls the thread until it is its turn to write to the URB. Because of + * this, the best way to implement geometry shader algorithms in gen6 is to + * execute the algorithm before the FF_SYNC message to maximize parallelism. 
+ * + * To achieve this we buffer the geometry shader outputs for each emitted + * vertex in vertex_output during operation. Then, when we have processed + * the last vertex (that is, at thread end time), we send the FF_SYNC + * message to allocate the initial VUE handle and write all buffered vertex + * data to the URB in one go. + * + * For each emitted vertex, vertex_output will hold vue_map.num_slots + * data items plus one additional item to hold required flags + * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message) + * which come right after the data items for that vertex. Vertex data and + * flags for the next vertex come right after the data items and flags for + * the previous vertex. + */ + this->current_annotation = "gen6 prolog"; + this->vertex_output = src_reg(this, + glsl_type::uint_type, + (prog_data->vue_map.num_slots + 1) * + nir->info->gs.vertices_out); + this->vertex_output_offset = src_reg(this, glsl_type::uint_type); + emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u))); + + /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES), + * so initialize it once to R0. + */ + vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1), + retype(brw_vec8_grf(0, 0), + BRW_REGISTER_TYPE_UD))); + inst->force_writemask_all = true; + + /* This will be used as a temporary to store writeback data of FF_SYNC + * and URB_WRITE messages. + */ + this->temp = src_reg(this, glsl_type::uint_type); + + /* This will be used to know when we are processing the first vertex of + * a primitive. We will set this to URB_WRITE_PRIM_START only when we know + * that we are processing the first vertex in the primitive and to zero + * otherwise. This way we can use its value directly in the URB write + * headers. 
+ */ + this->first_vertex = src_reg(this, glsl_type::uint_type); + emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(URB_WRITE_PRIM_START))); + + /* The FF_SYNC message requires to know the number of primitives generated, + * so keep a counter for this. + */ + this->prim_count = src_reg(this, glsl_type::uint_type); + emit(MOV(dst_reg(this->prim_count), brw_imm_ud(0u))); + + if (prog->info.has_transform_feedback_varyings) { + /* Create a virtual register to hold destination indices in SOL */ + this->destination_indices = src_reg(this, glsl_type::uvec4_type); + /* Create a virtual register to hold number of written primitives */ + this->sol_prim_written = src_reg(this, glsl_type::uint_type); + /* Create a virtual register to hold Streamed Vertex Buffer Indices */ + this->svbi = src_reg(this, glsl_type::uvec4_type); + /* Create a virtual register to hold max values of SVBI */ + this->max_svbi = src_reg(this, glsl_type::uvec4_type); + emit(MOV(dst_reg(this->max_svbi), + src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD)))); + + xfb_setup(); + } + + /* PrimitiveID is delivered in r0.1 of the thread payload. If the program + * needs it we have to move it to a separate register where we can map + * the attribute. + * + * Notice that we cannot use a virtual register for this, because we need to + * map all input attributes to hardware registers in setup_payload(), + * which happens before virtual registers are mapped to hardware registers. + * We could work around that issue if we were able to compute the first + * non-payload register here and move the PrimitiveID information to that + * register, but we can't because at this point we don't know the final + * number of uniforms that will be included in the payload. 
+ * + * So, what we do is to place PrimitiveID information in r1, which is always + * delivered as part of the payload, but it is only populated with data + * relevant for transform feedback when we set GEN6_GS_SVBI_PAYLOAD_ENABLE + * in the 3DSTATE_GS state packet. That information can be obtained by other + * means though, so we can safely use r1 for this purpose. + */ + if (gs_prog_data->include_primitive_id) { + this->primitive_id = + src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); + emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id)); + } +} + +void +gen6_gs_visitor::gs_emit_vertex(int stream_id) +{ + this->current_annotation = "gen6 emit vertex"; + + /* Buffer all output slots for this vertex in vertex_output */ + for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) { + int varying = prog_data->vue_map.slot_to_varying[slot]; + if (varying != VARYING_SLOT_PSIZ) { + dst_reg dst(this->vertex_output); + dst.reladdr = ralloc(mem_ctx, src_reg); + memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg)); + emit_urb_slot(dst, varying); + } else { + /* The PSIZ slot can pack multiple varyings in different channels + * and emit_urb_slot() will produce a MOV instruction for each of + * them. Since we are writing to an array, that will translate to + * possibly multiple MOV instructions with an array destination and + * each will generate a scratch write with the same offset into + * scratch space (thus, each one overwriting the previous). This is + * not what we want. What we will do instead is emit PSIZ to a + * regular temporary register, then move that register into the + * array. This way we only have one instruction with an array + * destination and we only produce a single scratch write. 
+ */ + dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type)); + emit_urb_slot(tmp, varying); + dst_reg dst(this->vertex_output); + dst.reladdr = ralloc(mem_ctx, src_reg); + memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg)); + vec4_instruction *inst = emit(MOV(dst, src_reg(tmp))); + inst->force_writemask_all = true; + } + + emit(ADD(dst_reg(this->vertex_output_offset), + this->vertex_output_offset, brw_imm_ud(1u))); + } + + /* Now buffer flags for this vertex */ + dst_reg dst(this->vertex_output); + dst.reladdr = ralloc(mem_ctx, src_reg); + memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg)); + if (nir->info->gs.output_primitive == GL_POINTS) { + /* If we are outputting points, then every vertex has PrimStart and + * PrimEnd set. + */ + emit(MOV(dst, brw_imm_d((_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) | + URB_WRITE_PRIM_START | URB_WRITE_PRIM_END))); + emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u))); + } else { + /* Otherwise, we can only set the PrimStart flag, which we have stored + * in the first_vertex register. We will have to wait until we execute + * EndPrimitive() or we end the thread to set the PrimEnd flag on a + * vertex. + */ + emit(OR(dst, this->first_vertex, + brw_imm_ud(gs_prog_data->output_topology << + URB_WRITE_PRIM_TYPE_SHIFT))); + emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(0u))); + } + emit(ADD(dst_reg(this->vertex_output_offset), + this->vertex_output_offset, brw_imm_ud(1u))); +} + +void +gen6_gs_visitor::gs_end_primitive() +{ + this->current_annotation = "gen6 end primitive"; + /* Calling EndPrimitive() is optional for point output. In this case we set + * the PrimEnd flag when we process EmitVertex(). 
+ */ + if (nir->info->gs.output_primitive == GL_POINTS) + return; + + /* Otherwise we know that the last vertex we have processed was the last + * vertex in the primitive and we need to set its PrimEnd flag, so do this + * unless we haven't emitted that vertex at all (vertex_count != 0). + * + * Notice that we have already incremented vertex_count when we processed + * the last emit_vertex, so we need to take that into account in the + * comparison below (hence the num_output_vertices + 1 in the comparison + * below). + */ + unsigned num_output_vertices = nir->info->gs.vertices_out; + emit(CMP(dst_null_ud(), this->vertex_count, + brw_imm_ud(num_output_vertices + 1), BRW_CONDITIONAL_L)); + vec4_instruction *inst = emit(CMP(dst_null_ud(), + this->vertex_count, brw_imm_ud(0u), + BRW_CONDITIONAL_NEQ)); + inst->predicate = BRW_PREDICATE_NORMAL; + emit(IF(BRW_PREDICATE_NORMAL)); + { + /* vertex_output_offset is already pointing at the first entry of the + * next vertex. So subtract 1 to modify the flags for the previous + * vertex. + */ + src_reg offset(this, glsl_type::uint_type); + emit(ADD(dst_reg(offset), this->vertex_output_offset, brw_imm_d(-1))); + + src_reg dst(this->vertex_output); + dst.reladdr = ralloc(mem_ctx, src_reg); + memcpy(dst.reladdr, &offset, sizeof(src_reg)); + + emit(OR(dst_reg(dst), dst, brw_imm_d(URB_WRITE_PRIM_END))); + emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u))); + + /* Set the first vertex flag to indicate that the next vertex will start + * a primitive. + */ + emit(MOV(dst_reg(this->first_vertex), brw_imm_d(URB_WRITE_PRIM_START))); + } + emit(BRW_OPCODE_ENDIF); +} + +void +gen6_gs_visitor::emit_urb_write_header(int mrf) +{ + this->current_annotation = "gen6 urb header"; + /* Compute offset of the flags for the current vertex in vertex_output and + * write them in dw2 of the message header. 
+ * + * Notice that by the time that emit_thread_end() calls here + * vertex_output_offset should point to the first data item of the current + * vertex in vertex_output, thus we only need to add the number of output + * slots per vertex to that offset to obtain the flags data offset. + */ + src_reg flags_offset(this, glsl_type::uint_type); + emit(ADD(dst_reg(flags_offset), + this->vertex_output_offset, + brw_imm_d(prog_data->vue_map.num_slots))); + + src_reg flags_data(this->vertex_output); + flags_data.reladdr = ralloc(mem_ctx, src_reg); + memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg)); + + emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data); +} + +static int +align_interleaved_urb_mlen(int mlen) +{ + /* URB data written (does not include the message header reg) must + * be a multiple of 256 bits, or 2 VS registers. See vol5c.5, + * section 5.4.3.2.2: URB_INTERLEAVED. + */ + if ((mlen % 2) != 1) + mlen++; + return mlen; +} + +void +gen6_gs_visitor::emit_urb_write_opcode(bool complete, int base_mrf, + int last_mrf, int urb_offset) +{ + vec4_instruction *inst = NULL; + + if (!complete) { + /* If the vertex is not complete we don't have to do anything special */ + inst = emit(GS_OPCODE_URB_WRITE); + inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS; + } else { + /* Otherwise we always request to allocate a new VUE handle. If this is + * the last write before the EOT message and the new handle never gets + * used it will be dereferenced when we send the EOT message. This is + * necessary to avoid different setups for the EOT message (one for the + * case when there is no output and another for the case when there is) + * which would require to end the program with an IF/ELSE/ENDIF block, + * something we do not want. 
+ */ + inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE); + inst->urb_write_flags = BRW_URB_WRITE_COMPLETE; + inst->dst = dst_reg(MRF, base_mrf); + inst->src[0] = this->temp; + } + + inst->base_mrf = base_mrf; + inst->mlen = align_interleaved_urb_mlen(last_mrf - base_mrf); + inst->offset = urb_offset; +} + +void +gen6_gs_visitor::emit_thread_end() +{ + /* Make sure the current primitive is ended: we know it is not ended when + * first_vertex is zero. This is only relevant for outputs other than + * points because in the point case we set PrimEnd on all vertices. + */ + if (nir->info->gs.output_primitive != GL_POINTS) { + emit(CMP(dst_null_ud(), this->first_vertex, brw_imm_ud(0u), BRW_CONDITIONAL_Z)); + emit(IF(BRW_PREDICATE_NORMAL)); + gs_end_primitive(); + emit(BRW_OPCODE_ENDIF); + } + + /* Here we have to: + * 1) Emit an FF_SYNC message to obtain an initial VUE handle. + * 2) Loop over all buffered vertex data and write it to corresponding + * URB entries. + * 3) Allocate new VUE handles for all vertices other than the first. + * 4) Send a final EOT message. + */ + + /* MRF 0 is reserved for the debugger, so start with message header + * in MRF 1. + */ + int base_mrf = 1; + + /* In the process of generating our URB write message contents, we + * may need to unspill a register or load from an array. Those + * reads would use MRFs 21..23 + */ + int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen); + + /* Issue the FF_SYNC message and obtain the initial VUE handle. 
*/ + emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u), BRW_CONDITIONAL_G)); + emit(IF(BRW_PREDICATE_NORMAL)); + { + this->current_annotation = "gen6 thread end: ff_sync"; + + vec4_instruction *inst; + if (prog->info.has_transform_feedback_varyings) { + src_reg sol_temp(this, glsl_type::uvec4_type); + emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES, + dst_reg(this->svbi), + this->vertex_count, + this->prim_count, + sol_temp); + inst = emit(GS_OPCODE_FF_SYNC, + dst_reg(this->temp), this->prim_count, this->svbi); + } else { + inst = emit(GS_OPCODE_FF_SYNC, + dst_reg(this->temp), this->prim_count, brw_imm_ud(0u)); + } + inst->base_mrf = base_mrf; + + /* Loop over all buffered vertices and emit URB write messages */ + this->current_annotation = "gen6 thread end: urb writes init"; + src_reg vertex(this, glsl_type::uint_type); + emit(MOV(dst_reg(vertex), brw_imm_ud(0u))); + emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u))); + + this->current_annotation = "gen6 thread end: urb writes"; + emit(BRW_OPCODE_DO); + { + emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE)); + inst = emit(BRW_OPCODE_BREAK); + inst->predicate = BRW_PREDICATE_NORMAL; + + /* First we prepare the message header */ + emit_urb_write_header(base_mrf); + + /* Then add vertex data to the message in interleaved fashion */ + int slot = 0; + bool complete = false; + do { + int mrf = base_mrf + 1; + + /* URB offset is in URB row increments, and each of our MRFs is half + * of one of those, since we're doing interleaved writes. 
+ */ + int urb_offset = slot / 2; + + for (; slot < prog_data->vue_map.num_slots; ++slot) { + int varying = prog_data->vue_map.slot_to_varying[slot]; + current_annotation = output_reg_annotation[varying]; + + /* Compute offset of this slot for the current vertex + * in vertex_output + */ + src_reg data(this->vertex_output); + data.reladdr = ralloc(mem_ctx, src_reg); + memcpy(data.reladdr, &this->vertex_output_offset, + sizeof(src_reg)); + + /* Copy this slot to the appropriate message register */ + dst_reg reg = dst_reg(MRF, mrf); + reg.type = output_reg[varying][0].type; + data.type = reg.type; + vec4_instruction *inst = emit(MOV(reg, data)); + inst->force_writemask_all = true; + + mrf++; + emit(ADD(dst_reg(this->vertex_output_offset), + this->vertex_output_offset, brw_imm_ud(1u))); + + /* If this was max_usable_mrf, we can't fit anything more into + * this URB WRITE. Same if we reached the max. message length. + */ + if (mrf > max_usable_mrf || + align_interleaved_urb_mlen(mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) { + slot++; + break; + } + } + + complete = slot >= prog_data->vue_map.num_slots; + emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset); + } while (!complete); + + /* Skip over the flags data item so that vertex_output_offset points + * to the first data item of the next vertex, so that we can start + * writing the next vertex. + */ + emit(ADD(dst_reg(this->vertex_output_offset), + this->vertex_output_offset, brw_imm_ud(1u))); + + emit(ADD(dst_reg(vertex), vertex, brw_imm_ud(1u))); + } + emit(BRW_OPCODE_WHILE); + + if (prog->info.has_transform_feedback_varyings) + xfb_write(); + } + emit(BRW_OPCODE_ENDIF); + + /* Finally, emit EOT message. + * + * In gen6 we need to end the thread differently depending on whether we have + * emitted at least one vertex or not. In case we did, the EOT message must + * always include the COMPLETE flag or else the GPU hangs. If we have not + * produced any output we can't use the COMPLETE flag. 
+ * + * However, this would lead us to end the program with an ENDIF opcode, + * which we want to avoid, so what we do is that we always request a new + * VUE handle every time we do a URB WRITE, even for the last vertex we emit. + * With this we make sure that whether we have emitted at least one vertex + * or none at all, we have to finish the thread without writing to the URB, + * which works for both cases by setting the COMPLETE and UNUSED flags in + * the EOT message. + */ + this->current_annotation = "gen6 thread end: EOT"; + + if (prog->info.has_transform_feedback_varyings) { + /* When emitting EOT, set SONumPrimsWritten Increment Value. */ + src_reg data(this, glsl_type::uint_type); + emit(AND(dst_reg(data), this->sol_prim_written, brw_imm_ud(0xffffu))); + emit(SHL(dst_reg(data), data, brw_imm_ud(16u))); + emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data); + } + + vec4_instruction *inst = emit(GS_OPCODE_THREAD_END); + inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED; + inst->base_mrf = base_mrf; + inst->mlen = 1; +} + +void +gen6_gs_visitor::setup_payload() +{ + int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES]; + + /* Attributes are going to be interleaved, so one register contains two + * attribute slots. + */ + int attributes_per_reg = 2; + + /* If a geometry shader tries to read from an input that wasn't written by + * the vertex shader, that produces undefined results, but it shouldn't + * crash anything. So initialize attribute_map to zeros--that ensures that + * these undefined results are read from r0. + */ + memset(attribute_map, 0, sizeof(attribute_map)); + + int reg = 0; + + /* The payload always contains important data in r0. */ + reg++; + + /* r1 is always part of the payload and it holds information relevant + * for transform feedback when we set the GEN6_GS_SVBI_PAYLOAD_ENABLE bit in + * the 3DSTATE_GS packet. 
We will overwrite it with the PrimitiveID + * information (and move the original value to a virtual register if + * necessary). + */ + if (gs_prog_data->include_primitive_id) + attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg; + reg++; + + reg = setup_uniforms(reg); + + reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg); + + lower_attributes_to_hw_regs(attribute_map, true); + + this->first_non_payload_grf = reg; +} + +void +gen6_gs_visitor::xfb_setup() +{ + static const unsigned swizzle_for_offset[4] = { + BRW_SWIZZLE4(0, 1, 2, 3), + BRW_SWIZZLE4(1, 2, 3, 3), + BRW_SWIZZLE4(2, 3, 3, 3), + BRW_SWIZZLE4(3, 3, 3, 3) + }; + + const struct gl_transform_feedback_info *linked_xfb_info = + this->prog->sh.LinkedTransformFeedback; + int i; + + /* Make sure that the VUE slots won't overflow the unsigned chars in + * prog_data->transform_feedback_bindings[]. + */ + STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256); + + /* Make sure that we don't need more binding table entries than we've + * set aside for use in transform feedback. (We shouldn't, since we + * set aside enough binding table entries to have one per component). 
+ */ + assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS); + + gs_prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs; + for (i = 0; i < gs_prog_data->num_transform_feedback_bindings; i++) { + gs_prog_data->transform_feedback_bindings[i] = + linked_xfb_info->Outputs[i].OutputRegister; + gs_prog_data->transform_feedback_swizzles[i] = + swizzle_for_offset[linked_xfb_info->Outputs[i].ComponentOffset]; + } +} + +void +gen6_gs_visitor::xfb_write() +{ + unsigned num_verts; + + if (!gs_prog_data->num_transform_feedback_bindings) + return; + + switch (gs_prog_data->output_topology) { + case _3DPRIM_POINTLIST: + num_verts = 1; + break; + case _3DPRIM_LINELIST: + case _3DPRIM_LINESTRIP: + case _3DPRIM_LINELOOP: + num_verts = 2; + break; + case _3DPRIM_TRILIST: + case _3DPRIM_TRIFAN: + case _3DPRIM_TRISTRIP: + case _3DPRIM_RECTLIST: + num_verts = 3; + break; + case _3DPRIM_QUADLIST: + case _3DPRIM_QUADSTRIP: + case _3DPRIM_POLYGON: + num_verts = 3; + break; + default: + unreachable("Unexpected primitive type in Gen6 SOL program."); + } + + this->current_annotation = "gen6 thread end: svb writes init"; + + emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u))); + emit(MOV(dst_reg(this->sol_prim_written), brw_imm_ud(0u))); + + /* Check that at least one primitive can be written + * + * Note: since we use the binding table to keep track of buffer offsets + * and stride, the GS doesn't need to keep track of a separate pointer + * into each buffer; it uses a single pointer which increments by 1 for + * each vertex. So we use SVBI0 for this pointer, regardless of whether + * transform feedback is in interleaved or separate attribs mode. + */ + src_reg sol_temp(this, glsl_type::uvec4_type); + emit(ADD(dst_reg(sol_temp), this->svbi, brw_imm_ud(num_verts))); + + /* Compare SVBI calculated number with the maximum value, which is + * in R1.4 (previously saved in this->max_svbi) for gen6. 
+ */ + emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE)); + emit(IF(BRW_PREDICATE_NORMAL)); + { + vec4_instruction *inst = emit(MOV(dst_reg(destination_indices), + brw_imm_vf4(brw_float_to_vf(0.0), + brw_float_to_vf(1.0), + brw_float_to_vf(2.0), + brw_float_to_vf(0.0)))); + inst->force_writemask_all = true; + + emit(ADD(dst_reg(this->destination_indices), + this->destination_indices, + this->svbi)); + } + emit(BRW_OPCODE_ENDIF); + + /* Write transform feedback data for all processed vertices. */ + for (int i = 0; i < (int)nir->info->gs.vertices_out; i++) { + emit(MOV(dst_reg(sol_temp), brw_imm_d(i))); + emit(CMP(dst_null_d(), sol_temp, this->vertex_count, + BRW_CONDITIONAL_L)); + emit(IF(BRW_PREDICATE_NORMAL)); + { + xfb_program(i, num_verts); + } + emit(BRW_OPCODE_ENDIF); + } +} + +void +gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts) +{ + unsigned binding; + unsigned num_bindings = gs_prog_data->num_transform_feedback_bindings; + src_reg sol_temp(this, glsl_type::uvec4_type); + + /* Check for buffer overflow: we need room to write the complete primitive + * (all vertices). Otherwise, avoid writing any vertices for it + */ + emit(ADD(dst_reg(sol_temp), this->sol_prim_written, brw_imm_ud(1u))); + emit(MUL(dst_reg(sol_temp), sol_temp, brw_imm_ud(num_verts))); + emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi)); + emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE)); + emit(IF(BRW_PREDICATE_NORMAL)); + { + /* Avoid overwriting MRF 1 as it is used as URB write message header */ + dst_reg mrf_reg(MRF, 2); + + this->current_annotation = "gen6: emit SOL vertex data"; + /* For each vertex, generate code to output each varying using the + * appropriate binding table entry. 
+ */ + for (binding = 0; binding < num_bindings; ++binding) { + unsigned char varying = + gs_prog_data->transform_feedback_bindings[binding]; + + /* Set up the correct destination index for this vertex */ + vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX, + mrf_reg, + this->destination_indices); + inst->sol_vertex = vertex % num_verts; + + /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1: + * + * "Prior to End of Thread with a URB_WRITE, the kernel must + * ensure that all writes are complete by sending the final + * write as a committed write." + */ + bool final_write = binding == (unsigned) num_bindings - 1 && + inst->sol_vertex == num_verts - 1; + + /* Compute offset of this varying for the current vertex + * in vertex_output + */ + this->current_annotation = output_reg_annotation[varying]; + src_reg data(this->vertex_output); + data.reladdr = ralloc(mem_ctx, src_reg); + int offset = get_vertex_output_offset_for_varying(vertex, varying); + emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_d(offset))); + memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg)); + data.type = output_reg[varying][0].type; + + /* PSIZ, LAYER and VIEWPORT are packed in different channels of the + * same slot, so make sure we write the appropriate channel + */ + if (varying == VARYING_SLOT_PSIZ) + data.swizzle = BRW_SWIZZLE_WWWW; + else if (varying == VARYING_SLOT_LAYER) + data.swizzle = BRW_SWIZZLE_YYYY; + else if (varying == VARYING_SLOT_VIEWPORT) + data.swizzle = BRW_SWIZZLE_ZZZZ; + else + data.swizzle = gs_prog_data->transform_feedback_swizzles[binding]; + + /* Write data */ + inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp); + inst->sol_binding = binding; + inst->sol_final_write = final_write; + + if (final_write) { + /* This is the last vertex of the primitive, then increment + * SO num primitive counter and destination indices. 
+ */ + emit(ADD(dst_reg(this->destination_indices), + this->destination_indices, + brw_imm_ud(num_verts))); + emit(ADD(dst_reg(this->sol_prim_written), + this->sol_prim_written, brw_imm_ud(1u))); + } + + } + this->current_annotation = NULL; + } + emit(BRW_OPCODE_ENDIF); +} + +int +gen6_gs_visitor::get_vertex_output_offset_for_varying(int vertex, int varying) +{ + /* Find the output slot assigned to this varying. + * + * VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot + * as VARYING_SLOT_PSIZ. + */ + if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT) + varying = VARYING_SLOT_PSIZ; + int slot = prog_data->vue_map.varying_to_slot[varying]; + + if (slot < 0) { + /* This varying does not exist in the VUE so we are not writing to it + * and its value is undefined. We still want to return a valid offset + * into vertex_output though, to prevent any out-of-bound accesses into + * the vertex_output array. Since the value for this varying is undefined + * we don't really care for the value we assign to it, so any offset + * within the limits of vertex_output will do. 
+ */ + slot = 0; + } + + return vertex * (prog_data->vue_map.num_slots + 1) + slot; +} + +} /* namespace brw */ diff --git a/src/intel/compiler/gen6_gs_visitor.h b/src/intel/compiler/gen6_gs_visitor.h new file mode 100644 index 00000000000..1bdcf925880 --- /dev/null +++ b/src/intel/compiler/gen6_gs_visitor.h @@ -0,0 +1,91 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
/**
 * Geometry-shader visitor derived from vec4_gs_visitor.
 *
 * It overrides the emission hooks listed in the protected section and adds
 * private helpers and registers for transform feedback (the xfb_* methods and
 * the members under "Transform Feedback members").
 */
class gen6_gs_visitor : public vec4_gs_visitor
{
public:
   /* Every argument except \p prog is forwarded unchanged to the
    * vec4_gs_visitor constructor; \p prog is stored in the member of the same
    * name.
    */
   gen6_gs_visitor(const struct brw_compiler *comp,
                   void *log_data,
                   struct brw_gs_compile *c,
                   struct brw_gs_prog_data *prog_data,
                   struct gl_program *prog,
                   const nir_shader *shader,
                   void *mem_ctx,
                   bool no_spills,
                   int shader_time_index) :
      vec4_gs_visitor(comp, log_data, c, prog_data, shader, mem_ctx, no_spills,
                      shader_time_index),
      prog(prog)
   {
   }

protected:
   /* Virtual hooks; presumably overrides of vec4_gs_visitor virtuals (the
    * base class is not visible here -- confirm against brw_vec4_gs_visitor.h).
    */
   virtual void emit_prolog();
   virtual void emit_thread_end();
   virtual void gs_emit_vertex(int stream_id);
   virtual void gs_end_primitive();
   virtual void emit_urb_write_header(int mrf);
   virtual void emit_urb_write_opcode(bool complete,
                                      int base_mrf,
                                      int last_mrf,
                                      int urb_offset);
   virtual void setup_payload();

private:
   /* Transform feedback helpers. */
   void xfb_write();
   void xfb_program(unsigned vertex, unsigned num_verts);
   void xfb_setup();

   /* Maps a (vertex, varying) pair to an index into vertex_output. */
   int get_vertex_output_offset_for_varying(int vertex, int varying);

   /* Program handed to the constructor; never modified through this member. */
   const struct gl_program *prog;

   src_reg vertex_output;
   src_reg vertex_output_offset;
   src_reg temp;
   src_reg first_vertex;
   src_reg prim_count;
   src_reg primitive_id;

   /* Transform Feedback members */
   src_reg sol_prim_written;
   src_reg svbi;
   src_reg max_svbi;
   src_reg destination_indices;
};
restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_cfg.h" +#include "brw_eu.h" +#include "common/gen_debug.h" +#include "intel_asm_annotation.h" +#include "compiler/nir/nir.h" + +void +dump_assembly(void *assembly, int num_annotations, struct annotation *annotation, + const struct gen_device_info *devinfo) +{ + const char *last_annotation_string = NULL; + const void *last_annotation_ir = NULL; + + for (int i = 0; i < num_annotations; i++) { + int start_offset = annotation[i].offset; + int end_offset = annotation[i + 1].offset; + + if (annotation[i].block_start) { + fprintf(stderr, " START B%d", annotation[i].block_start->num); + foreach_list_typed(struct bblock_link, predecessor_link, link, + &annotation[i].block_start->parents) { + struct bblock_t *predecessor_block = predecessor_link->block; + fprintf(stderr, " <-B%d", predecessor_block->num); + } + fprintf(stderr, " (%u cycles)\n", annotation[i].block_start->cycle_count); + } + + if (last_annotation_ir != annotation[i].ir) { + last_annotation_ir = annotation[i].ir; + if (last_annotation_ir) { + fprintf(stderr, " "); + 
nir_print_instr(annotation[i].ir, stderr); + fprintf(stderr, "\n"); + } + } + + if (last_annotation_string != annotation[i].annotation) { + last_annotation_string = annotation[i].annotation; + if (last_annotation_string) + fprintf(stderr, " %s\n", last_annotation_string); + } + + brw_disassemble(devinfo, assembly, start_offset, end_offset, stderr); + + if (annotation[i].error) { + fputs(annotation[i].error, stderr); + } + + if (annotation[i].block_end) { + fprintf(stderr, " END B%d", annotation[i].block_end->num); + foreach_list_typed(struct bblock_link, successor_link, link, + &annotation[i].block_end->children) { + struct bblock_t *successor_block = successor_link->block; + fprintf(stderr, " ->B%d", successor_block->num); + } + fprintf(stderr, "\n"); + } + } + fprintf(stderr, "\n"); +} + +static bool +annotation_array_ensure_space(struct annotation_info *annotation) +{ + if (annotation->ann_size <= annotation->ann_count) { + int old_size = annotation->ann_size; + annotation->ann_size = MAX2(1024, annotation->ann_size * 2); + annotation->ann = reralloc(annotation->mem_ctx, annotation->ann, + struct annotation, annotation->ann_size); + if (!annotation->ann) + return false; + + memset(annotation->ann + old_size, 0, + (annotation->ann_size - old_size) * sizeof(struct annotation)); + } + + return true; +} + +void annotate(const struct gen_device_info *devinfo, + struct annotation_info *annotation, const struct cfg_t *cfg, + struct backend_instruction *inst, unsigned offset) +{ + if (annotation->mem_ctx == NULL) + annotation->mem_ctx = ralloc_context(NULL); + + if (!annotation_array_ensure_space(annotation)) + return; + + struct annotation *ann = &annotation->ann[annotation->ann_count++]; + ann->offset = offset; + if ((INTEL_DEBUG & DEBUG_ANNOTATION) != 0) { + ann->ir = inst->ir; + ann->annotation = inst->annotation; + } + + if (bblock_start(cfg->blocks[annotation->cur_block]) == inst) { + ann->block_start = cfg->blocks[annotation->cur_block]; + } + + /* There is no 
hardware DO instruction on Gen6+, so since DO always + * starts a basic block, we need to set the .block_start of the next + * instruction's annotation with a pointer to the bblock started by + * the DO. + * + * There's also only complication from emitting an annotation without + * a corresponding hardware instruction to disassemble. + */ + if (devinfo->gen >= 6 && inst->opcode == BRW_OPCODE_DO) { + annotation->ann_count--; + } + + if (bblock_end(cfg->blocks[annotation->cur_block]) == inst) { + ann->block_end = cfg->blocks[annotation->cur_block]; + annotation->cur_block++; + } +} + +void +annotation_finalize(struct annotation_info *annotation, + unsigned next_inst_offset) +{ + if (!annotation->ann_count) + return; + + if (annotation->ann_count == annotation->ann_size) { + annotation->ann = reralloc(annotation->mem_ctx, annotation->ann, + struct annotation, annotation->ann_size + 1); + } + annotation->ann[annotation->ann_count].offset = next_inst_offset; +} + +void +annotation_insert_error(struct annotation_info *annotation, unsigned offset, + const char *error) +{ + struct annotation *ann; + + if (!annotation->ann_count) + return; + + /* We may have to split an annotation, so ensure we have enough space + * allocated for that case up front. 
+ */ + if (!annotation_array_ensure_space(annotation)) + return; + + assume(annotation->ann_count > 0); + + for (int i = 0; i < annotation->ann_count; i++) { + struct annotation *cur = &annotation->ann[i]; + struct annotation *next = &annotation->ann[i + 1]; + ann = cur; + + if (next->offset <= offset) + continue; + + if (offset + sizeof(brw_inst) != next->offset) { + memmove(next, cur, + (annotation->ann_count - i + 2) * sizeof(struct annotation)); + cur->error = NULL; + cur->error_length = 0; + cur->block_end = NULL; + next->offset = offset + sizeof(brw_inst); + next->block_start = NULL; + annotation->ann_count++; + } + break; + } + + if (ann->error) + ralloc_strcat(&ann->error, error); + else + ann->error = ralloc_strdup(annotation->mem_ctx, error); +} diff --git a/src/intel/compiler/intel_asm_annotation.h b/src/intel/compiler/intel_asm_annotation.h new file mode 100644 index 00000000000..2d905b10a96 --- /dev/null +++ b/src/intel/compiler/intel_asm_annotation.h @@ -0,0 +1,80 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef _INTEL_ASM_ANNOTATION_H +#define _INTEL_ASM_ANNOTATION_H + +#ifdef __cplusplus +extern "C" { +#endif + +struct backend_instruction; +struct cfg_t; + +struct annotation { + int offset; + + size_t error_length; + char *error; + + /* Pointers to the basic block in the CFG if the instruction group starts + * or ends a basic block. + */ + struct bblock_t *block_start; + struct bblock_t *block_end; + + /* Annotation for the generated IR. One of the two can be set. */ + const void *ir; + const char *annotation; +}; + +struct annotation_info { + void *mem_ctx; + struct annotation *ann; + int ann_count; + int ann_size; + + /** Block index in the cfg. */ + int cur_block; +}; + +void +dump_assembly(void *assembly, int num_annotations, struct annotation *annotation, + const struct gen_device_info *devinfo); + +void +annotate(const struct gen_device_info *devinfo, + struct annotation_info *annotation, const struct cfg_t *cfg, + struct backend_instruction *inst, unsigned offset); +void +annotation_finalize(struct annotation_info *annotation, unsigned offset); + +void +annotation_insert_error(struct annotation_info *annotation, unsigned offset, + const char *error); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* _INTEL_ASM_ANNOTATION_H */ diff --git a/src/intel/compiler/test_eu_compact.c b/src/intel/compiler/test_eu_compact.c new file mode 100644 index 00000000000..77a57f4aa65 --- /dev/null +++ b/src/intel/compiler/test_eu_compact.c @@ -0,0 +1,300 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software 
without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <stdbool.h> +#include "util/ralloc.h" +#include "brw_eu.h" + +static bool +test_compact_instruction(struct brw_codegen *p, brw_inst src) +{ + brw_compact_inst dst; + memset(&dst, 0xd0, sizeof(dst)); + + if (brw_try_compact_instruction(p->devinfo, &dst, &src)) { + brw_inst uncompacted; + + brw_uncompact_instruction(p->devinfo, &uncompacted, &dst); + if (memcmp(&uncompacted, &src, sizeof(src))) { + brw_debug_compact_uncompact(p->devinfo, &src, &uncompacted); + return false; + } + } else { + brw_compact_inst unchanged; + memset(&unchanged, 0xd0, sizeof(unchanged)); + /* It's not supposed to change dst unless it compacted. */ + if (memcmp(&unchanged, &dst, sizeof(dst))) { + fprintf(stderr, "Failed to compact, but dst changed\n"); + fprintf(stderr, " Instruction: "); + brw_disassemble_inst(stderr, p->devinfo, &src, false); + return false; + } + } + + return true; +} + +/** + * When doing fuzz testing, pad bits won't round-trip. 
+ * + * This sort of a superset of skip_bit, which is testing for changing bits that + * aren't worth testing for fuzzing. We also just want to clear bits that + * become meaningless once fuzzing twiddles a related bit. + */ +static void +clear_pad_bits(const struct gen_device_info *devinfo, brw_inst *inst) +{ + if (brw_inst_opcode(devinfo, inst) != BRW_OPCODE_SEND && + brw_inst_opcode(devinfo, inst) != BRW_OPCODE_SENDC && + brw_inst_opcode(devinfo, inst) != BRW_OPCODE_BREAK && + brw_inst_opcode(devinfo, inst) != BRW_OPCODE_CONTINUE && + brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE && + brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE) { + brw_inst_set_bits(inst, 127, 111, 0); + } +} + +static bool +skip_bit(const struct gen_device_info *devinfo, brw_inst *src, int bit) +{ + /* pad bit */ + if (bit == 7) + return true; + + /* The compact bit -- uncompacted can't have it set. */ + if (bit == 29) + return true; + + /* pad bit */ + if (bit == 47) + return true; + + /* pad bits */ + if (bit >= 90 && bit <= 95) + return true; + + /* sometimes these are pad bits. 
*/ + if (brw_inst_opcode(devinfo, src) != BRW_OPCODE_SEND && + brw_inst_opcode(devinfo, src) != BRW_OPCODE_SENDC && + brw_inst_opcode(devinfo, src) != BRW_OPCODE_BREAK && + brw_inst_opcode(devinfo, src) != BRW_OPCODE_CONTINUE && + brw_inst_src0_reg_file(devinfo, src) != BRW_IMMEDIATE_VALUE && + brw_inst_src1_reg_file(devinfo, src) != BRW_IMMEDIATE_VALUE && + bit >= 121) { + return true; + } + + return false; +} + +static bool +test_fuzz_compact_instruction(struct brw_codegen *p, brw_inst src) +{ + for (int bit0 = 0; bit0 < 128; bit0++) { + if (skip_bit(p->devinfo, &src, bit0)) + continue; + + for (int bit1 = 0; bit1 < 128; bit1++) { + brw_inst instr = src; + uint32_t *bits = (uint32_t *)&instr; + + if (skip_bit(p->devinfo, &src, bit1)) + continue; + + bits[bit0 / 32] ^= (1 << (bit0 & 31)); + bits[bit1 / 32] ^= (1 << (bit1 & 31)); + + clear_pad_bits(p->devinfo, &instr); + + if (!test_compact_instruction(p, instr)) { + printf(" twiddled bits for fuzzing %d, %d\n", bit0, bit1); + return false; + } + } + } + + return true; +} + +static void +gen_ADD_GRF_GRF_GRF(struct brw_codegen *p) +{ + struct brw_reg g0 = brw_vec8_grf(0, 0); + struct brw_reg g2 = brw_vec8_grf(2, 0); + struct brw_reg g4 = brw_vec8_grf(4, 0); + + brw_ADD(p, g0, g2, g4); +} + +static void +gen_ADD_GRF_GRF_IMM(struct brw_codegen *p) +{ + struct brw_reg g0 = brw_vec8_grf(0, 0); + struct brw_reg g2 = brw_vec8_grf(2, 0); + + brw_ADD(p, g0, g2, brw_imm_f(1.0)); +} + +static void +gen_ADD_GRF_GRF_IMM_d(struct brw_codegen *p) +{ + struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_D); + struct brw_reg g2 = retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_D); + + brw_ADD(p, g0, g2, brw_imm_d(1)); +} + +static void +gen_MOV_GRF_GRF(struct brw_codegen *p) +{ + struct brw_reg g0 = brw_vec8_grf(0, 0); + struct brw_reg g2 = brw_vec8_grf(2, 0); + + brw_MOV(p, g0, g2); +} + +static void +gen_ADD_MRF_GRF_GRF(struct brw_codegen *p) +{ + struct brw_reg m6 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 6, 0); + 
struct brw_reg g2 = brw_vec8_grf(2, 0); + struct brw_reg g4 = brw_vec8_grf(4, 0); + + brw_ADD(p, m6, g2, g4); +} + +static void +gen_ADD_vec1_GRF_GRF_GRF(struct brw_codegen *p) +{ + struct brw_reg g0 = brw_vec1_grf(0, 0); + struct brw_reg g2 = brw_vec1_grf(2, 0); + struct brw_reg g4 = brw_vec1_grf(4, 0); + + brw_ADD(p, g0, g2, g4); +} + +static void +gen_PLN_MRF_GRF_GRF(struct brw_codegen *p) +{ + struct brw_reg m6 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 6, 0); + struct brw_reg interp = brw_vec1_grf(2, 0); + struct brw_reg g4 = brw_vec8_grf(4, 0); + + brw_PLN(p, m6, interp, g4); +} + +static void +gen_f0_0_MOV_GRF_GRF(struct brw_codegen *p) +{ + struct brw_reg g0 = brw_vec8_grf(0, 0); + struct brw_reg g2 = brw_vec8_grf(2, 0); + + brw_push_insn_state(p); + brw_set_default_predicate_control(p, true); + brw_MOV(p, g0, g2); + brw_pop_insn_state(p); +} + +/* The handling of f0.1 vs f0.0 changes between gen6 and gen7. Explicitly test + * it, so that we run the fuzzing can run over all the other bits that might + * interact with it. 
+ */ +static void +gen_f0_1_MOV_GRF_GRF(struct brw_codegen *p) +{ + struct brw_reg g0 = brw_vec8_grf(0, 0); + struct brw_reg g2 = brw_vec8_grf(2, 0); + + brw_push_insn_state(p); + brw_set_default_predicate_control(p, true); + brw_inst *mov = brw_MOV(p, g0, g2); + brw_inst_set_flag_subreg_nr(p->devinfo, mov, 1); + brw_pop_insn_state(p); +} + +struct { + void (*func)(struct brw_codegen *p); +} tests[] = { + { gen_MOV_GRF_GRF }, + { gen_ADD_GRF_GRF_GRF }, + { gen_ADD_GRF_GRF_IMM }, + { gen_ADD_GRF_GRF_IMM_d }, + { gen_ADD_MRF_GRF_GRF }, + { gen_ADD_vec1_GRF_GRF_GRF }, + { gen_PLN_MRF_GRF_GRF }, + { gen_f0_0_MOV_GRF_GRF }, + { gen_f0_1_MOV_GRF_GRF }, +}; + +static bool +run_tests(const struct gen_device_info *devinfo) +{ + brw_init_compaction_tables(devinfo); + bool fail = false; + + for (int i = 0; i < ARRAY_SIZE(tests); i++) { + for (int align_16 = 0; align_16 <= 1; align_16++) { + struct brw_codegen *p = rzalloc(NULL, struct brw_codegen); + brw_init_codegen(devinfo, p, p); + + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + if (align_16) + brw_set_default_access_mode(p, BRW_ALIGN_16); + else + brw_set_default_access_mode(p, BRW_ALIGN_1); + + tests[i].func(p); + assert(p->nr_insn == 1); + + if (!test_compact_instruction(p, p->store[0])) { + fail = true; + continue; + } + + if (!test_fuzz_compact_instruction(p, p->store[0])) { + fail = true; + continue; + } + + ralloc_free(p); + } + } + + return fail; +} + +int +main(int argc, char **argv) +{ + struct gen_device_info *devinfo = calloc(1, sizeof(*devinfo)); + devinfo->gen = 6; + bool fail = false; + + for (devinfo->gen = 6; devinfo->gen <= 7; devinfo->gen++) { + fail |= run_tests(devinfo); + } + + return fail; +} diff --git a/src/intel/compiler/test_eu_validate.cpp b/src/intel/compiler/test_eu_validate.cpp new file mode 100644 index 00000000000..76652dc43d0 --- /dev/null +++ b/src/intel/compiler/test_eu_validate.cpp @@ -0,0 +1,847 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby 
granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
/* Sub-generation selector used by gens[] below.  Values start at 1, so the
 * zero value that results from omitting the field in an initializer matches
 * none of them.
 */
enum subgen {
   IS_G45 = 1,
   IS_BYT,
   IS_HSW,
   IS_CHV,
   IS_BXT,
   IS_KBL,
};

/* Parameter table for the test suite: a short name (used as the test-name
 * suffix by gen_name), the generation number and an optional sub-generation.
 */
static const struct gen_info {
   const char *name;
   int gen;
   enum subgen subgen;
} gens[] = {
   { "brw", 4 },
   { "g45", 4, IS_G45 },
   { "ilk", 5 },
   { "snb", 6 },
   { "ivb", 7 },
   { "byt", 7, IS_BYT },
   { "hsw", 7, IS_HSW },
   { "bdw", 8 },
   { "chv", 8, IS_CHV },
   { "skl", 9 },
   { "bxt", 9, IS_BXT },
   { "kbl", 9, IS_KBL },
};

/* Test fixture parameterized over gens[].  It owns a zero-initialized
 * brw_codegen (allocated in the constructor, freed in the destructor) and a
 * gen_device_info that SetUp() fills in from the test parameter.
 */
class validation_test: public ::testing::TestWithParam<struct gen_info> {
   virtual void SetUp();

public:
   validation_test();
   virtual ~validation_test();

   struct brw_codegen *p;
   struct gen_device_info devinfo;
};

validation_test::validation_test()
{
   p = rzalloc(NULL, struct brw_codegen);
   memset(&devinfo, 0, sizeof(devinfo));
}

validation_test::~validation_test()
{
   ralloc_free(p);
}

/* Translate the test parameter into devinfo and initialize the code
 * generator with it.
 */
void validation_test::SetUp()
{
   struct gen_info info = GetParam();

   devinfo.gen = info.gen;
   devinfo.is_g4x = info.subgen == IS_G45;
   devinfo.is_baytrail = info.subgen == IS_BYT;
   devinfo.is_haswell = info.subgen == IS_HSW;
   devinfo.is_cherryview = info.subgen == IS_CHV;
   devinfo.is_broxton = info.subgen == IS_BXT;
   devinfo.is_kabylake = info.subgen == IS_KBL;

   brw_init_codegen(&devinfo, p, p);
}

/* Name generator for the instantiation below: returns the "name" field of
 * the parameter.
 */
struct gen_name {
   template <class ParamType>
   std::string
   operator()(const ::testing::TestParamInfo<ParamType>& info) const {
      return info.param.name;
   }
};

INSTANTIATE_TEST_CASE_P(eu_assembly, validation_test,
                        ::testing::ValuesIn(gens),
                        gen_name());

/* Run brw_validate_instructions() over everything emitted into \p p and
 * return its result.
 *
 * When the TEST_DEBUG environment variable is set, a two-entry annotation
 * array is prepared beforehand (one group plus the end-offset entry) and the
 * assembly is dumped afterwards.
 */
static bool
validate(struct brw_codegen *p)
{
   const bool print = getenv("TEST_DEBUG");
   struct annotation_info annotation;
   memset(&annotation, 0, sizeof(annotation));

   if (print) {
      annotation.mem_ctx = ralloc_context(NULL);
      annotation.ann_count = 1;
      annotation.ann_size = 2;
      annotation.ann = rzalloc_array(annotation.mem_ctx, struct annotation,
                                     annotation.ann_size);
      annotation.ann[annotation.ann_count].offset = p->next_insn_offset;
   }

   bool ret = brw_validate_instructions(p, 0, &annotation);

   if (print) {
      dump_assembly(p->store, annotation.ann_count, annotation.ann, p->devinfo);
      ralloc_free(annotation.mem_ctx);
   }

   return ret;
}

/* Shorthands used by the tests: the most recently emitted instruction, the
 * vec8 region of GRF 0, and the null register.
 */
#define last_inst    (&p->store[p->nr_insn - 1])
#define g0           brw_vec8_grf(0, 0)
#define null         brw_null_reg()

/* Reset the instruction store so that a test can emit and validate a fresh
 * instruction sequence.
 */
static void
clear_instructions(struct brw_codegen *p)
{
   p->next_insn_offset = 0;
   p->nr_insn = 0;
}

/* A plain ADD g0, g0, g0 must validate. */
TEST_P(validation_test, sanity)
{
   brw_ADD(p, g0, g0, g0);

   EXPECT_TRUE(validate(p));
}

/* The null register as src0 must be rejected. */
TEST_P(validation_test, src0_null_reg)
{
   brw_MOV(p, g0, null);

   EXPECT_FALSE(validate(p));
}

/* The null register as src1 must be rejected. */
TEST_P(validation_test, src1_null_reg)
{
   brw_ADD(p, g0, g0, null);

   EXPECT_FALSE(validate(p));
}

/* A math instruction with a null src0 must be rejected, whichever of the
 * two emission helpers is used for the generation.
 */
TEST_P(validation_test, math_src0_null_reg)
{
   if (devinfo.gen >= 6) {
      gen6_math(p, g0, BRW_MATH_FUNCTION_SIN, null, null);
   } else {
      gen4_math(p, g0, BRW_MATH_FUNCTION_SIN, 0, null, BRW_MATH_PRECISION_FULL);
   }

   EXPECT_FALSE(validate(p));
}

/* A two-source math instruction with a null src1 must be rejected; only
 * checked on Gen6+.
 */
TEST_P(validation_test, math_src1_null_reg)
{
   if (devinfo.gen >= 6) {
      gen6_math(p, g0, BRW_MATH_FUNCTION_POW, g0, null);
      EXPECT_FALSE(validate(p));
   } else {
      /* Math instructions on Gen4/5 are actually SEND messages with payloads.
       * src1 is an immediate message descriptor set by gen4_math.
       */
   }
}

/* Opcode 46 must be rejected on Gen7 only. */
TEST_P(validation_test, opcode46)
{
   /* opcode 46 is "push" on Gen 4 and 5
    *              "fork" on Gen 6
    *              reserved on Gen 7
    *              "goto" on Gen8+
    */
   brw_next_insn(p, 46);

   if (devinfo.gen == 7) {
      EXPECT_FALSE(validate(p));
   } else {
      EXPECT_TRUE(validate(p));
   }
}
/* When the Execution Data Type is wider than the destination data type, the
 * destination must [...] specify a HorzStride equal to the ratio in sizes of
 * the two data types.
 */
TEST_P(validation_test, dest_stride_must_be_equal_to_the_ratio_of_exec_size_to_dest_size)
{
   /* W destination with D sources and an untouched destination stride:
    * expected to fail.
    */
   brw_ADD(p, g0, g0, g0);
   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);

   EXPECT_FALSE(validate(p));

   clear_instructions(p);

   /* Same types with the destination stride set to 2: expected to pass. */
   brw_ADD(p, g0, g0, g0);
   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);

   EXPECT_TRUE(validate(p));
}

/* When the Execution Data Type is wider than the destination data type, the
 * destination must be aligned as required by the wider execution data type
 * [...]
 */
TEST_P(validation_test, dst_subreg_must_be_aligned_to_exec_type_size)
{
   /* W destination at subregister 2 with D sources: expected to fail. */
   brw_ADD(p, g0, g0, g0);
   brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 2);
   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);

   EXPECT_FALSE(validate(p));

   clear_instructions(p);

   /* Exec size 4, destination at subregister 8 and <4;4,1> D sources:
    * expected to pass.
    */
   brw_ADD(p, g0, g0, g0);
   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_4);
   brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 8);
   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4);
   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);

   EXPECT_TRUE(validate(p));
}

/* ExecSize must be greater than or equal to Width. */
TEST_P(validation_test, exec_size_less_than_width)
{
   /* Width 16 on src0: expected to fail. */
   brw_ADD(p, g0, g0, g0);
   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_16);

   EXPECT_FALSE(validate(p));

   clear_instructions(p);

   /* Width 16 on src1: expected to fail. */
   brw_ADD(p, g0, g0, g0);
   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_16);

   EXPECT_FALSE(validate(p));
}

/* If ExecSize = Width and HorzStride ≠ 0,
 * VertStride must be set to Width * HorzStride.
 */
TEST_P(validation_test, vertical_stride_is_width_by_horizontal_stride)
{
   /* Vertical stride 4 on src0 with everything else untouched: expected to
    * fail.
    */
   brw_ADD(p, g0, g0, g0);
   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);

   EXPECT_FALSE(validate(p));

   clear_instructions(p);

   /* Same on src1. */
   brw_ADD(p, g0, g0, g0);
   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);

   EXPECT_FALSE(validate(p));
}

/* If Width = 1, HorzStride must be 0 regardless of the values
 * of ExecSize and VertStride.
 */
TEST_P(validation_test, horizontal_stride_must_be_0_if_width_is_1)
{
   /* <0;1,1> region on src0: expected to fail. */
   brw_ADD(p, g0, g0, g0);
   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_0);
   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_1);
   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);

   EXPECT_FALSE(validate(p));

   clear_instructions(p);

   /* <0;1,1> region on src1: expected to fail. */
   brw_ADD(p, g0, g0, g0);
   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_0);
   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_1);
   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);

   EXPECT_FALSE(validate(p));
}
/* If ExecSize = Width = 1, both VertStride and HorzStride must be 0. */
TEST_P(validation_test, scalar_region_must_be_0_1_0)
{
   struct brw_reg g0_0 = brw_vec1_grf(0, 0);

   /* Exec size 1 with a <1;1,0> region on src0: expected to fail. */
   brw_ADD(p, g0, g0, g0_0);
   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_1);
   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_1);
   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_1);
   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);

   EXPECT_FALSE(validate(p));

   clear_instructions(p);

   /* Exec size 1 with a <1;1,0> region on src1: expected to fail. */
   brw_ADD(p, g0, g0_0, g0);
   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_1);
   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_1);
   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_1);
   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);

   EXPECT_FALSE(validate(p));
}

/* If VertStride = HorzStride = 0, Width must be 1 regardless of the value
 * of ExecSize.
 */
TEST_P(validation_test, zero_stride_implies_0_1_0)
{
   /* <0;2,0> region on src0: expected to fail. */
   brw_ADD(p, g0, g0, g0);
   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_0);
   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_2);
   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);

   EXPECT_FALSE(validate(p));

   clear_instructions(p);

   /* <0;2,0> region on src1: expected to fail. */
   brw_ADD(p, g0, g0, g0);
   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_0);
   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_2);
   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);

   EXPECT_FALSE(validate(p));
}
/* Dst.HorzStride must not be 0. */
TEST_P(validation_test, dst_horizontal_stride_0)
{
   /* Destination stride 0 in the default access mode: expected to fail. */
   brw_ADD(p, g0, g0, g0);
   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);

   EXPECT_FALSE(validate(p));

   clear_instructions(p);

   /* Same with Align16 access mode selected: expected to fail. */
   brw_set_default_access_mode(p, BRW_ALIGN_16);

   brw_ADD(p, g0, g0, g0);
   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);

   EXPECT_FALSE(validate(p));
}

/* VertStride must be used to cross GRF register boundaries. This rule implies
 * that elements within a 'Width' cannot cross GRF boundaries.
 */
TEST_P(validation_test, must_not_cross_grf_boundary_in_a_width)
{
   /* src0 starting at subregister 4: expected to fail. */
   brw_ADD(p, g0, g0, g0);
   brw_inst_set_src0_da1_subreg_nr(&devinfo, last_inst, 4);

   EXPECT_FALSE(validate(p));

   clear_instructions(p);

   /* src1 starting at subregister 4: expected to fail. */
   brw_ADD(p, g0, g0, g0);
   brw_inst_set_src1_da1_subreg_nr(&devinfo, last_inst, 4);

   EXPECT_FALSE(validate(p));

   clear_instructions(p);

   /* <4;4,2> region on src0: expected to fail. */
   brw_ADD(p, g0, g0, g0);
   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4);
   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);

   EXPECT_FALSE(validate(p));

   clear_instructions(p);

   /* <4;4,2> region on src1: expected to fail. */
   brw_ADD(p, g0, g0, g0);
   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);

   EXPECT_FALSE(validate(p));
}

/* Destination Horizontal must be 1 in Align16 */
TEST_P(validation_test, dst_hstride_on_align16_must_be_1)
{
   brw_set_default_access_mode(p, BRW_ALIGN_16);

   /* Destination stride 2: expected to fail. */
   brw_ADD(p, g0, g0, g0);
   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);

   EXPECT_FALSE(validate(p));

   clear_instructions(p);

   /* Destination stride 1: expected to pass. */
   brw_ADD(p, g0, g0, g0);
   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);

   EXPECT_TRUE(validate(p));
}
/* VertStride must be 0 or 4 in Align16 */
TEST_P(validation_test, vstride_on_align16_must_be_0_or_4)
{
   /* Every vertical stride paired with the expected validation result.
    * Stride 2 is expected to pass only on Haswell and Gen8+.
    */
   const struct {
      enum brw_vertical_stride vstride;
      bool expected_result;
   } vstride[] = {
      { BRW_VERTICAL_STRIDE_0, true },
      { BRW_VERTICAL_STRIDE_1, false },
      { BRW_VERTICAL_STRIDE_2, devinfo.is_haswell || devinfo.gen >= 8 },
      { BRW_VERTICAL_STRIDE_4, true },
      { BRW_VERTICAL_STRIDE_8, false },
      { BRW_VERTICAL_STRIDE_16, false },
      { BRW_VERTICAL_STRIDE_32, false },
      { BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL, false },
   };

   brw_set_default_access_mode(p, BRW_ALIGN_16);

   /* First pass: apply each stride to src0. */
   for (unsigned i = 0; i < sizeof(vstride) / sizeof(vstride[0]); i++) {
      brw_ADD(p, g0, g0, g0);
      brw_inst_set_src0_vstride(&devinfo, last_inst, vstride[i].vstride);

      EXPECT_EQ(vstride[i].expected_result, validate(p));

      clear_instructions(p);
   }

   /* Second pass: apply each stride to src1. */
   for (unsigned i = 0; i < sizeof(vstride) / sizeof(vstride[0]); i++) {
      brw_ADD(p, g0, g0, g0);
      brw_inst_set_src1_vstride(&devinfo, last_inst, vstride[i].vstride);

      EXPECT_EQ(vstride[i].expected_result, validate(p));

      clear_instructions(p);
   }
}
+ */ +TEST_P(validation_test, source_cannot_span_more_than_2_registers) +{ + /* Case 1: exec size 32, all operands typed W, src1 region set to vstride 16 / width 8 / hstride 2; expected to fail. */ brw_ADD(p, g0, g0, g0); + brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_32); + brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16); + brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_8); + brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2); + + EXPECT_FALSE(validate(p)); + + clear_instructions(p); + + /* Case 2: the same types and src1 region at exec size 16, with src1's subregister number set to 2; expected to validate. */ brw_ADD(p, g0, g0, g0); + brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16); + brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16); + brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_8); + brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2); + brw_inst_set_src1_da1_subreg_nr(&devinfo, last_inst, 2); + + EXPECT_TRUE(validate(p)); + + clear_instructions(p); + + /* Case 3: only the exec size is changed, to 16; types and regions stay as brw_ADD emitted them; expected to validate. */ brw_ADD(p, g0, g0, g0); + brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16); + + EXPECT_TRUE(validate(p)); +} + +/* A destination cannot span more than 2 adjacent GRF registers.
*/ +TEST_P(validation_test, destination_cannot_span_more_than_2_registers) +{ + brw_ADD(p, g0, g0, g0); + brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_32); + brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2); + brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + + EXPECT_FALSE(validate(p)); + + clear_instructions(p); + + brw_ADD(p, g0, g0, g0); + brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_8); + brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 6); + brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_4); + brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16); + brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4); + brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1); + brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16); + brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4); + brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1); + + EXPECT_TRUE(validate(p)); +} + +TEST_P(validation_test, src_region_spans_two_regs_dst_region_spans_one) +{ + /* Writes to dest are to the lower OWord */ + brw_ADD(p, g0, g0, g0); + brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16); + brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4); + brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2); + + EXPECT_TRUE(validate(p)); + + 
clear_instructions(p); + + /* Writes to dest are to the upper OWord */ + brw_ADD(p, g0, g0, g0); + brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 16); + brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16); + brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4); + brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2); + + EXPECT_TRUE(validate(p)); + + clear_instructions(p); + + /* Writes to dest are evenly split between OWords */ + brw_ADD(p, g0, g0, g0); + brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16); + brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16); + brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_8); + brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2); + + EXPECT_TRUE(validate(p)); + + clear_instructions(p); + + /* Writes to dest are uneven between OWords */ + brw_ADD(p, g0, g0, g0); + brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_4); + brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 10); + brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4); + brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4); + brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1); + brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16); + brw_inst_set_src1_width(&devinfo, last_inst, 
BRW_WIDTH_2); + brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1); + + if (devinfo.gen >= 9) { + EXPECT_TRUE(validate(p)); + } else { + EXPECT_FALSE(validate(p)); + } +} + +TEST_P(validation_test, dst_elements_must_be_evenly_split_between_registers) +{ + brw_ADD(p, g0, g0, g0); + brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 4); + + if (devinfo.gen >= 9) { + EXPECT_TRUE(validate(p)); + } else { + EXPECT_FALSE(validate(p)); + } + + clear_instructions(p); + + brw_ADD(p, g0, g0, g0); + brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16); + + EXPECT_TRUE(validate(p)); + + clear_instructions(p); + + if (devinfo.gen >= 6) { + gen6_math(p, g0, BRW_MATH_FUNCTION_SIN, g0, null); + + EXPECT_TRUE(validate(p)); + + clear_instructions(p); + + gen6_math(p, g0, BRW_MATH_FUNCTION_SIN, g0, null); + brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 4); + + EXPECT_FALSE(validate(p)); + } +} + +TEST_P(validation_test, two_src_two_dst_source_offsets_must_be_same) +{ + brw_ADD(p, g0, g0, g0); + brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_4); + brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_4); + brw_inst_set_src0_da1_subreg_nr(&devinfo, last_inst, 16); + brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_2); + brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_1); + brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0); + brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4); + brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4); + brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1); + + if (devinfo.gen <= 7) { + EXPECT_FALSE(validate(p)); + } else { + EXPECT_TRUE(validate(p)); + } + + clear_instructions(p); + + brw_ADD(p, g0, g0, g0); + brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_4); + brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_4); + brw_inst_set_src0_vstride(&devinfo, last_inst, 
BRW_VERTICAL_STRIDE_4); + brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_1); + brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0); + brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_8); + brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_2); + brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1); + + EXPECT_TRUE(validate(p)); +} + +#if 0 +TEST_P(validation_test, two_src_two_dst_each_dst_must_be_derived_from_one_src) +{ + // mov (16) r10.0<2>:w r12.4<4;4,1>:w + + brw_MOV(p, g0, g0); + brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16); + brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2); + brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src0_da1_subreg_nr(&devinfo, last_inst, 8); + brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4); + brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4); + brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_4); + + EXPECT_FALSE(validate(p)); + + clear_instructions(p); + +#if 0 + brw_ADD(p, g0, g0, g0); + brw_inst_set_src1_da1_subreg_nr(&devinfo, last_inst, 16); + brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4); + brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4); + brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1); + + EXPECT_FALSE(validate(p)); + #endif +} +#endif + +TEST_P(validation_test, one_src_two_dst) +{ + struct brw_reg g0_0 = brw_vec1_grf(0, 0); + + brw_ADD(p, g0, g0_0, g0_0); + brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16); + + EXPECT_TRUE(validate(p)); + + clear_instructions(p); + + brw_ADD(p, g0, g0, g0); + brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16); + brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D); + brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + 
brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + + EXPECT_TRUE(validate(p)); + + clear_instructions(p); + + brw_ADD(p, g0, g0, g0); + brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16); + brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2); + brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_0); + brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_1); + brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0); + + if (devinfo.gen >= 8) { + EXPECT_TRUE(validate(p)); + } else { + EXPECT_FALSE(validate(p)); + } + + clear_instructions(p); + + brw_ADD(p, g0, g0, g0); + brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16); + brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2); + brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_0); + brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_1); + brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0); + brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W); + + if (devinfo.gen >= 8) { + EXPECT_TRUE(validate(p)); + } else { + EXPECT_FALSE(validate(p)); + } +} + +TEST_P(validation_test, packed_byte_destination) +{ + static const struct { + enum brw_reg_type dst_type; + enum brw_reg_type src_type; + bool neg, abs, sat; + bool expected_result; + } move[] = { + { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_UB, 0, 0, 0, true }, + { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_B , 0, 0, 0, true }, + { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_B , 0, 0, 0, true }, + { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_UB, 0, 0, 0, true }, + + { 
BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_UB, 1, 0, 0, false }, + { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_B , 1, 0, 0, false }, + { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_B , 1, 0, 0, false }, + { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_UB, 1, 0, 0, false }, + + { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_UB, 0, 1, 0, false }, + { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_B , 0, 1, 0, false }, + { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_B , 0, 1, 0, false }, + { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_UB, 0, 1, 0, false }, + + { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_UB, 0, 0, 1, false }, + { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_B , 0, 0, 1, false }, + { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_B , 0, 0, 1, false }, + { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_UB, 0, 0, 1, false }, + + { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_UW, 0, 0, 0, false }, + { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_W , 0, 0, 0, false }, + { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_UD, 0, 0, 0, false }, + { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_D , 0, 0, 0, false }, + }; + + for (unsigned i = 0; i < sizeof(move) / sizeof(move[0]); i++) { + brw_MOV(p, retype(g0, move[i].dst_type), retype(g0, move[i].src_type)); + brw_inst_set_src0_negate(&devinfo, last_inst, move[i].neg); + brw_inst_set_src0_abs(&devinfo, last_inst, move[i].abs); + brw_inst_set_saturate(&devinfo, last_inst, move[i].sat); + + EXPECT_EQ(move[i].expected_result, validate(p)); + + clear_instructions(p); + } + + brw_SEL(p, retype(g0, BRW_REGISTER_TYPE_UB), + retype(g0, BRW_REGISTER_TYPE_UB), + retype(g0, BRW_REGISTER_TYPE_UB)); + brw_inst_set_pred_control(&devinfo, last_inst, BRW_PREDICATE_NORMAL); + + EXPECT_FALSE(validate(p)); + + clear_instructions(p); + + brw_SEL(p, retype(g0, BRW_REGISTER_TYPE_B), + retype(g0, BRW_REGISTER_TYPE_B), + retype(g0, BRW_REGISTER_TYPE_B)); + brw_inst_set_pred_control(&devinfo, last_inst, BRW_PREDICATE_NORMAL); + + EXPECT_FALSE(validate(p)); +} + +TEST_P(validation_test, 
byte_destination_relaxed_alignment) +{ + brw_SEL(p, retype(g0, BRW_REGISTER_TYPE_B), + retype(g0, BRW_REGISTER_TYPE_W), + retype(g0, BRW_REGISTER_TYPE_W)); + brw_inst_set_pred_control(&devinfo, last_inst, BRW_PREDICATE_NORMAL); + brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2); + + EXPECT_TRUE(validate(p)); + + clear_instructions(p); + + brw_SEL(p, retype(g0, BRW_REGISTER_TYPE_B), + retype(g0, BRW_REGISTER_TYPE_W), + retype(g0, BRW_REGISTER_TYPE_W)); + brw_inst_set_pred_control(&devinfo, last_inst, BRW_PREDICATE_NORMAL); + brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2); + brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 1); + + if (devinfo.gen > 4 || devinfo.is_g4x) { + EXPECT_TRUE(validate(p)); + } else { + EXPECT_FALSE(validate(p)); + } + +} diff --git a/src/intel/compiler/test_fs_cmod_propagation.cpp b/src/intel/compiler/test_fs_cmod_propagation.cpp new file mode 100644 index 00000000000..a97e374f74e --- /dev/null +++ b/src/intel/compiler/test_fs_cmod_propagation.cpp @@ -0,0 +1,556 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <gtest/gtest.h> +#include "brw_fs.h" +#include "brw_cfg.h" +#include "program/program.h" + +using namespace brw; + +/* gtest fixture (derives from ::testing::Test). SetUp() assigns every member below except shader_prog, which it never touches. */ class cmod_propagation_test : public ::testing::Test { + virtual void SetUp(); + +public: + struct brw_compiler *compiler; + struct gen_device_info *devinfo; + struct gl_context *ctx; + struct brw_wm_prog_data *prog_data; + struct gl_shader_program *shader_prog; + fs_visitor *v; +}; + +/* Thin fs_visitor subclass: its constructor forwards compiler, &prog_data->base and shader, and passes NULL for the other pointer arguments plus the constants 8 and -1 (presumably dispatch width and a "no index" marker -- confirm against fs_visitor's constructor). */ class cmod_propagation_fs_visitor : public fs_visitor +{ +public: + cmod_propagation_fs_visitor(struct brw_compiler *compiler, + struct brw_wm_prog_data *prog_data, + nir_shader *shader) + : fs_visitor(compiler, NULL, NULL, NULL, + &prog_data->base, (struct gl_program *) NULL, + shader, 8, -1) {} +}; + + +/* Allocates zero-filled (calloc) context, compiler and device-info objects, points compiler->devinfo at the device info, allocates prog_data with ralloc, creates a MESA_SHADER_FRAGMENT NIR shader and constructs the visitor. Nothing allocated here is released in this chunk. */ void cmod_propagation_test::SetUp() +{ + ctx = (struct gl_context *)calloc(1, sizeof(*ctx)); + compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler)); + devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo)); + compiler->devinfo = devinfo; + + prog_data = ralloc(NULL, struct brw_wm_prog_data); + nir_shader *shader = + nir_shader_create(NULL, MESA_SHADER_FRAGMENT, NULL, NULL); + + v = new cmod_propagation_fs_visitor(compiler, prog_data, shader); + + /* Assigned after the visitor is constructed; the visitor was given the compiler whose devinfo pointer refers to this same object. */ devinfo->gen = 4; +} + +/* Returns the instruction reached by following `next` num times from block->start(); num is not checked against the block's length. */ static fs_inst * +instruction(bblock_t *block, int num) +{ + fs_inst *inst = (fs_inst *)block->start(); + for (int i = 0; i < num; i++) { + inst = (fs_inst *)inst->next; + } + return inst; +} + +/* Runs v->opt_cmod_propagation() and returns its result. When the TEST_DEBUG environment variable is set, prints "Before"/"After" headings to stderr and calls v->cfg->dump(v) around the pass. */ static bool +cmod_propagation(fs_visitor *v) +{ + const bool print = getenv("TEST_DEBUG"); + + if (print) { + fprintf(stderr, "= Before =\n"); + v->cfg->dump(v); + } + + bool ret = v->opt_cmod_propagation(); + + if (print) { + fprintf(stderr, "\n= After =\n"); + v->cfg->dump(v); + } + + return ret; +} +
+TEST_F(cmod_propagation_test, basic) +{ + const fs_builder &bld = v->bld; + fs_reg dest = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg src1 = v->vgrf(glsl_type::float_type); + fs_reg zero(brw_imm_f(0.0f)); + bld.ADD(dest, src0, src1); + bld.CMP(bld.null_reg_f(), dest, zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add(8) dest src0 src1 + * 1: cmp.ge.f0(8) null dest 0.0f + * + * = After = + * 0: add.ge.f0(8) dest src0 src1 + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_TRUE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(0, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod); +} + +TEST_F(cmod_propagation_test, cmp_nonzero) +{ + const fs_builder &bld = v->bld; + fs_reg dest = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg src1 = v->vgrf(glsl_type::float_type); + fs_reg nonzero(brw_imm_f(1.0f)); + bld.ADD(dest, src0, src1); + bld.CMP(bld.null_reg_f(), dest, nonzero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add(8) dest src0 src1 + * 1: cmp.ge.f0(8) null dest 1.0f + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod); +} + +TEST_F(cmod_propagation_test, non_cmod_instruction) +{ + const fs_builder &bld = v->bld; + fs_reg dest = v->vgrf(glsl_type::uint_type); + fs_reg src0 = v->vgrf(glsl_type::uint_type); + fs_reg zero(brw_imm_ud(0u)); + 
bld.FBL(dest, src0); + bld.CMP(bld.null_reg_ud(), dest, zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: fbl(8) dest src0 + * 1: cmp.ge.f0(8) null dest 0u + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_FBL, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod); +} + +TEST_F(cmod_propagation_test, intervening_flag_write) +{ + const fs_builder &bld = v->bld; + fs_reg dest = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg src1 = v->vgrf(glsl_type::float_type); + fs_reg src2 = v->vgrf(glsl_type::float_type); + fs_reg zero(brw_imm_f(0.0f)); + bld.ADD(dest, src0, src1); + bld.CMP(bld.null_reg_f(), src2, zero, BRW_CONDITIONAL_GE); + bld.CMP(bld.null_reg_f(), dest, zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add(8) dest src0 src1 + * 1: cmp.ge.f0(8) null src2 0.0f + * 2: cmp.ge.f0(8) null dest 0.0f + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod); +} + +TEST_F(cmod_propagation_test, intervening_flag_read) +{ + const fs_builder &bld = v->bld; + fs_reg dest0 = v->vgrf(glsl_type::float_type); + fs_reg dest1 = 
v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg src1 = v->vgrf(glsl_type::float_type); + fs_reg src2 = v->vgrf(glsl_type::float_type); + fs_reg zero(brw_imm_f(0.0f)); + bld.ADD(dest0, src0, src1); + set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero)); + bld.CMP(bld.null_reg_f(), dest0, zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add(8) dest0 src0 src1 + * 1: (+f0) sel(8) dest1 src2 0.0f + * 2: cmp.ge.f0(8) null dest0 0.0f + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod); +} + +TEST_F(cmod_propagation_test, intervening_dest_write) +{ + const fs_builder &bld = v->bld; + fs_reg dest = v->vgrf(glsl_type::vec4_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg src1 = v->vgrf(glsl_type::float_type); + fs_reg src2 = v->vgrf(glsl_type::vec2_type); + fs_reg zero(brw_imm_f(0.0f)); + bld.ADD(offset(dest, bld, 2), src0, src1); + bld.emit(SHADER_OPCODE_TEX, dest, src2) + ->size_written = 4 * REG_SIZE; + bld.CMP(bld.null_reg_f(), offset(dest, bld, 2), zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add(8) dest+2 src0 src1 + * 1: tex(8) rlen 4 dest+0 src2 + * 2: cmp.ge.f0(8) null dest+2 0.0f + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, 
block0->end_ip); + /* Each surviving instruction is checked as an (opcode, conditional_mod) pair. The TEX sits at index 1, so its modifier must be read from index 1; re-reading index 0 would leave the TEX unchecked. */ EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(SHADER_OPCODE_TEX, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 1)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod); +} + +TEST_F(cmod_propagation_test, intervening_flag_read_same_value) +{ + const fs_builder &bld = v->bld; + fs_reg dest0 = v->vgrf(glsl_type::float_type); + fs_reg dest1 = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg src1 = v->vgrf(glsl_type::float_type); + fs_reg src2 = v->vgrf(glsl_type::float_type); + fs_reg zero(brw_imm_f(0.0f)); + set_condmod(BRW_CONDITIONAL_GE, bld.ADD(dest0, src0, src1)); + set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero)); + bld.CMP(bld.null_reg_f(), dest0, zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add.ge.f0(8) dest0 src0 src1 + * 1: (+f0) sel(8) dest1 src2 0.0f + * 2: cmp.ge.f0(8) null dest0 0.0f + * + * = After = + * 0: add.ge.f0(8) dest0 src0 src1 + * 1: (+f0) sel(8) dest1 src2 0.0f + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + + EXPECT_TRUE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate); +} + +TEST_F(cmod_propagation_test, negate) +{ + const fs_builder &bld = v->bld; + fs_reg dest = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg src1 = v->vgrf(glsl_type::float_type); + fs_reg zero(brw_imm_f(0.0f)); +
bld.ADD(dest, src0, src1); + dest.negate = true; + bld.CMP(bld.null_reg_f(), dest, zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add(8) dest src0 src1 + * 1: cmp.ge.f0(8) null -dest 0.0f + * + * = After = + * 0: add.le.f0(8) dest src0 src1 + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_TRUE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(0, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_LE, instruction(block0, 0)->conditional_mod); +} + +TEST_F(cmod_propagation_test, movnz) +{ + const fs_builder &bld = v->bld; + fs_reg dest = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg src1 = v->vgrf(glsl_type::float_type); + bld.CMP(dest, src0, src1, BRW_CONDITIONAL_GE); + set_condmod(BRW_CONDITIONAL_NZ, + bld.MOV(bld.null_reg_f(), dest)); + + /* = Before = + * + * 0: cmp.ge.f0(8) dest src0 src1 + * 1: mov.nz.f0(8) null dest + * + * = After = + * 0: cmp.ge.f0(8) dest src0 src1 + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_TRUE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(0, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod); +} + +TEST_F(cmod_propagation_test, different_types_cmod_with_zero) +{ + const fs_builder &bld = v->bld; + fs_reg dest = v->vgrf(glsl_type::int_type); + fs_reg src0 = v->vgrf(glsl_type::int_type); + fs_reg src1 = v->vgrf(glsl_type::int_type); + fs_reg zero(brw_imm_f(0.0f)); + bld.ADD(dest, src0, src1); + bld.CMP(bld.null_reg_f(), retype(dest, BRW_REGISTER_TYPE_F), zero, + BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add(8) dest:D src0:D src1:D + * 1: cmp.ge.f0(8) null:F dest:F 0.0f + * + * = After = + * 
(no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod); +} + +TEST_F(cmod_propagation_test, andnz_one) +{ + const fs_builder &bld = v->bld; + fs_reg dest = v->vgrf(glsl_type::int_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg zero(brw_imm_f(0.0f)); + fs_reg one(brw_imm_d(1)); + + bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L); + set_condmod(BRW_CONDITIONAL_NZ, + bld.AND(bld.null_reg_d(), dest, one)); + + /* = Before = + * 0: cmp.l.f0(8) dest:F src0:F 0F + * 1: and.nz.f0(8) null:D dest:D 1D + * + * = After = + * 0: cmp.l.f0(8) dest:F src0:F 0F + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_TRUE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(0, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod); + EXPECT_TRUE(retype(dest, BRW_REGISTER_TYPE_F) + .equals(instruction(block0, 0)->dst)); +} + +TEST_F(cmod_propagation_test, andnz_non_one) +{ + const fs_builder &bld = v->bld; + fs_reg dest = v->vgrf(glsl_type::int_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg zero(brw_imm_f(0.0f)); + fs_reg nonone(brw_imm_d(38)); + + bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L); + set_condmod(BRW_CONDITIONAL_NZ, + bld.AND(bld.null_reg_d(), dest, nonone)); + + /* = Before = + * 0: cmp.l.f0(8) dest:F src0:F 0F + * 1: and.nz.f0(8) null:D dest:D 38D + * + * = After = + * (no changes) + */ + + 
v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_AND, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod); +} + +TEST_F(cmod_propagation_test, andz_one) +{ + const fs_builder &bld = v->bld; + fs_reg dest = v->vgrf(glsl_type::int_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg zero(brw_imm_f(0.0f)); + fs_reg one(brw_imm_d(1)); + + bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L); + set_condmod(BRW_CONDITIONAL_Z, + bld.AND(bld.null_reg_d(), dest, one)); + + /* = Before = + * 0: cmp.l.f0(8) dest:F src0:F 0F + * 1: and.z.f0(8) null:D dest:D 1D + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_AND, instruction(block0, 1)->opcode); + /* The AND was built with BRW_CONDITIONAL_Z and the pass is expected to leave it untouched, so assert against the same name that was set. */ EXPECT_EQ(BRW_CONDITIONAL_Z, instruction(block0, 1)->conditional_mod); +} diff --git a/src/intel/compiler/test_fs_copy_propagation.cpp b/src/intel/compiler/test_fs_copy_propagation.cpp new file mode 100644 index 00000000000..37736ec86f4 --- /dev/null +++ b/src/intel/compiler/test_fs_copy_propagation.cpp @@ -0,0 +1,213 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in
the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <gtest/gtest.h> +#include "brw_fs.h" +#include "brw_cfg.h" +#include "program/program.h" + +using namespace brw; + +class copy_propagation_test : public ::testing::Test { + virtual void SetUp(); + +public: + struct brw_compiler *compiler; + struct gen_device_info *devinfo; + struct gl_context *ctx; + struct brw_wm_prog_data *prog_data; + struct gl_shader_program *shader_prog; + fs_visitor *v; +}; + +class copy_propagation_fs_visitor : public fs_visitor +{ +public: + copy_propagation_fs_visitor(struct brw_compiler *compiler, + struct brw_wm_prog_data *prog_data, + nir_shader *shader) + : fs_visitor(compiler, NULL, NULL, NULL, + &prog_data->base, (struct gl_program *) NULL, + shader, 8, -1) {} +}; + + +void copy_propagation_test::SetUp() +{ + ctx = (struct gl_context *)calloc(1, sizeof(*ctx)); + compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler)); + devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo)); + compiler->devinfo = devinfo; + + prog_data = ralloc(NULL, struct brw_wm_prog_data); + 
/* Table-driven test: for each (conditional mod, immediate) pair it emits a
 * saturating MOV followed by a SEL of the MOV result against the immediate,
 * runs copy propagation, and compares the outcome with expected_result.
 */
TEST_F(copy_propagation_test, maxmax_sat_imm)
{
   const fs_builder &bld = v->bld;
   fs_reg vgrf0 = v->vgrf(glsl_type::float_type);
   fs_reg vgrf1 = v->vgrf(glsl_type::float_type);
   fs_reg vgrf2 = v->vgrf(glsl_type::float_type);

   static const struct {
      enum brw_conditional_mod conditional_mod;
      float immediate;
      bool expected_result;
   } test[] = {
      /*   conditional mod,     imm, expected_result */
      { BRW_CONDITIONAL_GE  ,  0.1f, true },
      { BRW_CONDITIONAL_L   ,  0.1f, true },
      { BRW_CONDITIONAL_GE  ,  0.5f, true },
      { BRW_CONDITIONAL_L   ,  0.5f, true },
      { BRW_CONDITIONAL_GE  ,  0.9f, true },
      { BRW_CONDITIONAL_L   ,  0.9f, true },
      { BRW_CONDITIONAL_GE  , -1.5f, false },
      { BRW_CONDITIONAL_L   , -1.5f, false },
      { BRW_CONDITIONAL_GE  ,  1.5f, false },
      { BRW_CONDITIONAL_L   ,  1.5f, false },

      { BRW_CONDITIONAL_NONE, 0.5f, false },
      { BRW_CONDITIONAL_Z   , 0.5f, false },
      { BRW_CONDITIONAL_NZ  , 0.5f, false },
      { BRW_CONDITIONAL_G   , 0.5f, false },
      { BRW_CONDITIONAL_LE  , 0.5f, false },
      { BRW_CONDITIONAL_R   , 0.5f, false },
      { BRW_CONDITIONAL_O   , 0.5f, false },
      { BRW_CONDITIONAL_U   , 0.5f, false },
   };

   for (unsigned i = 0; i < sizeof(test) / sizeof(test[0]); i++) {
      /* The instruction pointers are captured at emit time and checked
       * directly after the pass, instead of being looked up in the block.
       */
      fs_inst *mov = set_saturate(true, bld.MOV(vgrf0, vgrf1));
      fs_inst *sel = set_condmod(test[i].conditional_mod,
                                 bld.SEL(vgrf2, vgrf0,
                                         brw_imm_f(test[i].immediate)));

      v->calculate_cfg();

      bblock_t *block0 = v->cfg->blocks[0];

      EXPECT_EQ(0, block0->start_ip);
      EXPECT_EQ(1, block0->end_ip);

      EXPECT_EQ(test[i].expected_result, copy_propagation(v));
      EXPECT_EQ(0, block0->start_ip);
      EXPECT_EQ(1, block0->end_ip);

      /* The saturating MOV itself is expected to be unchanged in every
       * case.
       */
      EXPECT_EQ(BRW_OPCODE_MOV, mov->opcode);
      EXPECT_TRUE(mov->saturate);
      EXPECT_TRUE(mov->dst.equals(vgrf0));
      EXPECT_TRUE(mov->src[0].equals(vgrf1));

      /* When propagation is expected, the SEL must read the MOV's source
       * (vgrf1) and carry the saturate flag; otherwise it must still read
       * vgrf0 and not saturate.
       */
      EXPECT_EQ(BRW_OPCODE_SEL, sel->opcode);
      EXPECT_EQ(test[i].conditional_mod, sel->conditional_mod);
      EXPECT_EQ(test[i].expected_result, sel->saturate);
      EXPECT_TRUE(sel->dst.equals(vgrf2));
      if (test[i].expected_result) {
         EXPECT_TRUE(sel->src[0].equals(vgrf1));
      } else {
         EXPECT_TRUE(sel->src[0].equals(vgrf0));
      }
      EXPECT_TRUE(sel->src[1].equals(brw_imm_f(test[i].immediate)));

      /* Drop the CFG so the next iteration builds a fresh one; the
       * block-0 range checks above expect exactly two instructions.
       */
      delete v->cfg;
      v->cfg = NULL;
   }
}
+ */ + +#include <gtest/gtest.h> +#include "brw_fs.h" +#include "brw_cfg.h" +#include "program/program.h" + +using namespace brw; + +class saturate_propagation_test : public ::testing::Test { + virtual void SetUp(); + +public: + struct brw_compiler *compiler; + struct gen_device_info *devinfo; + struct gl_context *ctx; + struct brw_wm_prog_data *prog_data; + struct gl_shader_program *shader_prog; + fs_visitor *v; +}; + +class saturate_propagation_fs_visitor : public fs_visitor +{ +public: + saturate_propagation_fs_visitor(struct brw_compiler *compiler, + struct brw_wm_prog_data *prog_data, + nir_shader *shader) + : fs_visitor(compiler, NULL, NULL, NULL, + &prog_data->base, (struct gl_program *) NULL, + shader, 8, -1) {} +}; + + +void saturate_propagation_test::SetUp() +{ + ctx = (struct gl_context *)calloc(1, sizeof(*ctx)); + compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler)); + devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo)); + compiler->devinfo = devinfo; + + prog_data = ralloc(NULL, struct brw_wm_prog_data); + nir_shader *shader = + nir_shader_create(NULL, MESA_SHADER_FRAGMENT, NULL, NULL); + + v = new saturate_propagation_fs_visitor(compiler, prog_data, shader); + + devinfo->gen = 4; +} + +static fs_inst * +instruction(bblock_t *block, int num) +{ + fs_inst *inst = (fs_inst *)block->start(); + for (int i = 0; i < num; i++) { + inst = (fs_inst *)inst->next; + } + return inst; +} + +static bool +saturate_propagation(fs_visitor *v) +{ + const bool print = false; + + if (print) { + fprintf(stderr, "= Before =\n"); + v->cfg->dump(v); + } + + bool ret = v->opt_saturate_propagation(); + + if (print) { + fprintf(stderr, "\n= After =\n"); + v->cfg->dump(v); + } + + return ret; +} + +TEST_F(saturate_propagation_test, basic) +{ + const fs_builder &bld = v->bld; + fs_reg dst0 = v->vgrf(glsl_type::float_type); + fs_reg dst1 = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg src1 = 
v->vgrf(glsl_type::float_type); + bld.ADD(dst0, src0, src1); + set_saturate(true, bld.MOV(dst1, dst0)); + + /* = Before = + * + * 0: add(8) dst0 src0 src1 + * 1: mov.sat(8) dst1 dst0 + * + * = After = + * 0: add.sat(8) dst0 src0 src1 + * 1: mov(8) dst1 dst0 + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_TRUE(saturate_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_TRUE(instruction(block0, 0)->saturate); + EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode); + EXPECT_FALSE(instruction(block0, 1)->saturate); +} + +TEST_F(saturate_propagation_test, other_non_saturated_use) +{ + const fs_builder &bld = v->bld; + fs_reg dst0 = v->vgrf(glsl_type::float_type); + fs_reg dst1 = v->vgrf(glsl_type::float_type); + fs_reg dst2 = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg src1 = v->vgrf(glsl_type::float_type); + bld.ADD(dst0, src0, src1); + set_saturate(true, bld.MOV(dst1, dst0)); + bld.ADD(dst2, dst0, src0); + + /* = Before = + * + * 0: add(8) dst0 src0 src1 + * 1: mov.sat(8) dst1 dst0 + * 2: add(8) dst2 dst0 src0 + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + + EXPECT_FALSE(saturate_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_FALSE(instruction(block0, 0)->saturate); + EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode); + EXPECT_TRUE(instruction(block0, 1)->saturate); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 2)->opcode); +} + +TEST_F(saturate_propagation_test, predicated_instruction) +{ + const fs_builder &bld = v->bld; + fs_reg dst0 = v->vgrf(glsl_type::float_type); + fs_reg 
dst1 = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg src1 = v->vgrf(glsl_type::float_type); + bld.ADD(dst0, src0, src1) + ->predicate = BRW_PREDICATE_NORMAL; + set_saturate(true, bld.MOV(dst1, dst0)); + + /* = Before = + * + * 0: (+f0) add(8) dst0 src0 src1 + * 1: mov.sat(8) dst1 dst0 + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(saturate_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_FALSE(instruction(block0, 0)->saturate); + EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode); + EXPECT_TRUE(instruction(block0, 1)->saturate); +} + +TEST_F(saturate_propagation_test, neg_mov_sat) +{ + const fs_builder &bld = v->bld; + fs_reg dst0 = v->vgrf(glsl_type::float_type); + fs_reg dst1 = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + bld.RNDU(dst0, src0); + dst0.negate = true; + set_saturate(true, bld.MOV(dst1, dst0)); + + /* = Before = + * + * 0: rndu(8) dst0 src0 + * 1: mov.sat(8) dst1 -dst0 + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(saturate_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_RNDU, instruction(block0, 0)->opcode); + EXPECT_FALSE(instruction(block0, 0)->saturate); + EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode); + EXPECT_TRUE(instruction(block0, 1)->saturate); +} + +TEST_F(saturate_propagation_test, add_neg_mov_sat) +{ + const fs_builder &bld = v->bld; + fs_reg dst0 = v->vgrf(glsl_type::float_type); + fs_reg dst1 = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg src1 = 
v->vgrf(glsl_type::float_type); + bld.ADD(dst0, src0, src1); + dst0.negate = true; + set_saturate(true, bld.MOV(dst1, dst0)); + + /* = Before = + * + * 0: add(8) dst0 src0 src1 + * 1: mov.sat(8) dst1 -dst0 + * + * = After = + * 0: add.sat(8) dst0 -src0 -src1 + * 1: mov(8) dst1 dst0 + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_TRUE(saturate_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_TRUE(instruction(block0, 0)->saturate); + EXPECT_TRUE(instruction(block0, 0)->src[0].negate); + EXPECT_TRUE(instruction(block0, 0)->src[1].negate); + EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode); + EXPECT_FALSE(instruction(block0, 1)->saturate); +} + +TEST_F(saturate_propagation_test, mul_neg_mov_sat) +{ + const fs_builder &bld = v->bld; + fs_reg dst0 = v->vgrf(glsl_type::float_type); + fs_reg dst1 = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg src1 = v->vgrf(glsl_type::float_type); + bld.MUL(dst0, src0, src1); + dst0.negate = true; + set_saturate(true, bld.MOV(dst1, dst0)); + + /* = Before = + * + * 0: mul(8) dst0 src0 src1 + * 1: mov.sat(8) dst1 -dst0 + * + * = After = + * 0: mul.sat(8) dst0 src0 -src1 + * 1: mov(8) dst1 dst0 + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_TRUE(saturate_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode); + EXPECT_TRUE(instruction(block0, 0)->saturate); + EXPECT_TRUE(instruction(block0, 0)->src[0].negate); + EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode); + EXPECT_FALSE(instruction(block0, 1)->saturate); + EXPECT_FALSE(instruction(block0, 1)->src[0].negate); +} + 
+TEST_F(saturate_propagation_test, mul_mov_sat_neg_mov_sat) +{ + const fs_builder &bld = v->bld; + fs_reg dst0 = v->vgrf(glsl_type::float_type); + fs_reg dst1 = v->vgrf(glsl_type::float_type); + fs_reg dst2 = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg src1 = v->vgrf(glsl_type::float_type); + bld.MUL(dst0, src0, src1); + set_saturate(true, bld.MOV(dst1, dst0)); + dst0.negate = true; + set_saturate(true, bld.MOV(dst2, dst0)); + + /* = Before = + * + * 0: mul(8) dst0 src0 src1 + * 1: mov.sat(8) dst1 dst0 + * 2: mov.sat(8) dst2 -dst0 + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + + EXPECT_FALSE(saturate_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode); + EXPECT_FALSE(instruction(block0, 0)->saturate); + EXPECT_FALSE(instruction(block0, 0)->src[1].negate); + EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode); + EXPECT_TRUE(instruction(block0, 1)->saturate); + EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 2)->opcode); + EXPECT_TRUE(instruction(block0, 2)->src[0].negate); + EXPECT_TRUE(instruction(block0, 2)->saturate); +} + +TEST_F(saturate_propagation_test, mul_neg_mov_sat_neg_mov_sat) +{ + const fs_builder &bld = v->bld; + fs_reg dst0 = v->vgrf(glsl_type::float_type); + fs_reg dst1 = v->vgrf(glsl_type::float_type); + fs_reg dst2 = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg src1 = v->vgrf(glsl_type::float_type); + bld.MUL(dst0, src0, src1); + dst0.negate = true; + set_saturate(true, bld.MOV(dst1, dst0)); + set_saturate(true, bld.MOV(dst2, dst0)); + + /* = Before = + * + * 0: mul(8) dst0 src0 src1 + * 1: mov.sat(8) dst1 -dst0 + * 2: mov.sat(8) dst2 -dst0 + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; 
+ + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + + EXPECT_FALSE(saturate_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode); + EXPECT_FALSE(instruction(block0, 0)->saturate); + EXPECT_FALSE(instruction(block0, 0)->src[1].negate); + EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode); + EXPECT_TRUE(instruction(block0, 1)->src[0].negate); + EXPECT_TRUE(instruction(block0, 1)->saturate); + EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 2)->opcode); + EXPECT_TRUE(instruction(block0, 2)->src[0].negate); + EXPECT_TRUE(instruction(block0, 2)->saturate); +} + +TEST_F(saturate_propagation_test, abs_mov_sat) +{ + const fs_builder &bld = v->bld; + fs_reg dst0 = v->vgrf(glsl_type::float_type); + fs_reg dst1 = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg src1 = v->vgrf(glsl_type::float_type); + bld.ADD(dst0, src0, src1); + dst0.abs = true; + set_saturate(true, bld.MOV(dst1, dst0)); + + /* = Before = + * + * 0: add(8) dst0 src0 src1 + * 1: mov.sat(8) dst1 (abs)dst0 + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(saturate_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_FALSE(instruction(block0, 0)->saturate); + EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode); + EXPECT_TRUE(instruction(block0, 1)->saturate); +} + +TEST_F(saturate_propagation_test, producer_saturates) +{ + const fs_builder &bld = v->bld; + fs_reg dst0 = v->vgrf(glsl_type::float_type); + fs_reg dst1 = v->vgrf(glsl_type::float_type); + fs_reg dst2 = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg src1 = v->vgrf(glsl_type::float_type); + set_saturate(true, 
bld.ADD(dst0, src0, src1)); + set_saturate(true, bld.MOV(dst1, dst0)); + bld.MOV(dst2, dst0); + + /* = Before = + * + * 0: add.sat(8) dst0 src0 src1 + * 1: mov.sat(8) dst1 dst0 + * 2: mov(8) dst2 dst0 + * + * = After = + * 0: add.sat(8) dst0 src0 src1 + * 1: mov(8) dst1 dst0 + * 2: mov(8) dst2 dst0 + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + + EXPECT_TRUE(saturate_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_TRUE(instruction(block0, 0)->saturate); + EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode); + EXPECT_FALSE(instruction(block0, 1)->saturate); +} + +TEST_F(saturate_propagation_test, intervening_saturating_copy) +{ + const fs_builder &bld = v->bld; + fs_reg dst0 = v->vgrf(glsl_type::float_type); + fs_reg dst1 = v->vgrf(glsl_type::float_type); + fs_reg dst2 = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg src1 = v->vgrf(glsl_type::float_type); + bld.ADD(dst0, src0, src1); + set_saturate(true, bld.MOV(dst1, dst0)); + set_saturate(true, bld.MOV(dst2, dst0)); + + /* = Before = + * + * 0: add(8) dst0 src0 src1 + * 1: mov.sat(8) dst1 dst0 + * 2: mov.sat(8) dst2 dst0 + * + * = After = + * 0: add.sat(8) dst0 src0 src1 + * 1: mov(8) dst1 dst0 + * 2: mov(8) dst2 dst0 + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + + EXPECT_TRUE(saturate_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_TRUE(instruction(block0, 0)->saturate); + EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode); + EXPECT_FALSE(instruction(block0, 1)->saturate); + EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 2)->opcode); + EXPECT_FALSE(instruction(block0, 
2)->saturate); +} + +TEST_F(saturate_propagation_test, intervening_dest_write) +{ + const fs_builder &bld = v->bld; + fs_reg dst0 = v->vgrf(glsl_type::vec4_type); + fs_reg dst1 = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg src1 = v->vgrf(glsl_type::float_type); + fs_reg src2 = v->vgrf(glsl_type::vec2_type); + bld.ADD(offset(dst0, bld, 2), src0, src1); + bld.emit(SHADER_OPCODE_TEX, dst0, src2) + ->size_written = 4 * REG_SIZE; + set_saturate(true, bld.MOV(dst1, offset(dst0, bld, 2))); + + /* = Before = + * + * 0: add(8) dst0+2 src0 src1 + * 1: tex(8) rlen 4 dst0+0 src2 + * 2: mov.sat(8) dst1 dst0+2 + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + + EXPECT_FALSE(saturate_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_FALSE(instruction(block0, 0)->saturate); + EXPECT_EQ(SHADER_OPCODE_TEX, instruction(block0, 1)->opcode); + EXPECT_FALSE(instruction(block0, 0)->saturate); + EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 2)->opcode); + EXPECT_TRUE(instruction(block0, 2)->saturate); +} + +TEST_F(saturate_propagation_test, mul_neg_mov_sat_mov_sat) +{ + const fs_builder &bld = v->bld; + fs_reg dst0 = v->vgrf(glsl_type::float_type); + fs_reg dst1 = v->vgrf(glsl_type::float_type); + fs_reg dst2 = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg src1 = v->vgrf(glsl_type::float_type); + bld.MUL(dst0, src0, src1); + dst0.negate = true; + set_saturate(true, bld.MOV(dst1, dst0)); + dst0.negate = false; + set_saturate(true, bld.MOV(dst2, dst0)); + + /* = Before = + * + * 0: mul(8) dst0 src0 src1 + * 1: mov.sat(8) dst1 -dst0 + * 2: mov.sat(8) dst2 dst0 + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, 
block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + + EXPECT_FALSE(saturate_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode); + EXPECT_FALSE(instruction(block0, 0)->saturate); + EXPECT_FALSE(instruction(block0, 0)->src[1].negate); + EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode); + EXPECT_TRUE(instruction(block0, 1)->saturate); + EXPECT_TRUE(instruction(block0, 1)->src[0].negate); + EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 2)->opcode); + EXPECT_TRUE(instruction(block0, 2)->saturate); +} diff --git a/src/intel/compiler/test_vec4_cmod_propagation.cpp b/src/intel/compiler/test_vec4_cmod_propagation.cpp new file mode 100644 index 00000000000..7d9792b4a55 --- /dev/null +++ b/src/intel/compiler/test_vec4_cmod_propagation.cpp @@ -0,0 +1,823 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Based on test_fs_cmod_propagation.cpp + */ + +#include <gtest/gtest.h> +#include "brw_vec4.h" +#include "brw_vec4_builder.h" +#include "brw_cfg.h" +#include "program/program.h" + +using namespace brw; + +class cmod_propagation_test : public ::testing::Test { + virtual void SetUp(); + +public: + struct brw_compiler *compiler; + struct gen_device_info *devinfo; + struct gl_context *ctx; + struct gl_shader_program *shader_prog; + struct brw_vue_prog_data *prog_data; + vec4_visitor *v; +}; + +class cmod_propagation_vec4_visitor : public vec4_visitor +{ +public: + cmod_propagation_vec4_visitor(struct brw_compiler *compiler, + nir_shader *shader, + struct brw_vue_prog_data *prog_data) + : vec4_visitor(compiler, NULL, NULL, prog_data, shader, NULL, + false, -1) + { + prog_data->dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT; + } + +protected: + /* Dummy implementation for pure virtual methods */ + virtual dst_reg *make_reg_for_system_value(int location) + { + unreachable("Not reached"); + } + + virtual void setup_payload() + { + unreachable("Not reached"); + } + + virtual void emit_prolog() + { + unreachable("Not reached"); + } + + virtual void emit_program_code() + { + unreachable("Not reached"); + } + + virtual void emit_thread_end() + { + unreachable("Not reached"); + } + + virtual void emit_urb_write_header(int mrf) + { + unreachable("Not reached"); + } + + virtual vec4_instruction *emit_urb_write_opcode(bool complete) + { + unreachable("Not reached"); + } +}; + + +void cmod_propagation_test::SetUp() +{ + ctx = (struct gl_context *)calloc(1, sizeof(*ctx)); + compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler)); + devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo)); + 
prog_data = (struct brw_vue_prog_data *)calloc(1, sizeof(*prog_data)); + compiler->devinfo = devinfo; + + nir_shader *shader = + nir_shader_create(NULL, MESA_SHADER_VERTEX, NULL, NULL); + + v = new cmod_propagation_vec4_visitor(compiler, shader, prog_data); + + devinfo->gen = 4; +} + +static vec4_instruction * +instruction(bblock_t *block, int num) +{ + vec4_instruction *inst = (vec4_instruction *)block->start(); + for (int i = 0; i < num; i++) { + inst = (vec4_instruction *)inst->next; + } + return inst; +} + +static bool +cmod_propagation(vec4_visitor *v) +{ + const bool print = getenv("TEST_DEBUG"); + + if (print) { + fprintf(stderr, "= Before =\n"); + v->dump_instructions(); + } + + bool ret = v->opt_cmod_propagation(); + + if (print) { + fprintf(stderr, "\n= After =\n"); + v->dump_instructions(); + } + + return ret; +} + +TEST_F(cmod_propagation_test, basic) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::float_type); + src_reg src0 = src_reg(v, glsl_type::float_type); + src_reg src1 = src_reg(v, glsl_type::float_type); + src_reg zero(brw_imm_f(0.0f)); + dst_reg dest_null = bld.null_reg_f(); + dest_null.writemask = WRITEMASK_X; + + bld.ADD(dest, src0, src1); + bld.CMP(dest_null, src_reg(dest), zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add dest.x src0.xxxx src1.xxxx + * 1: cmp.ge.f0 null.x dest.xxxx 0.0f + * + * = After = + * 0: add.ge.f0 dest.x src0.xxxx src1.xxxx + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_TRUE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(0, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod); +} + +TEST_F(cmod_propagation_test, basic_different_dst_writemask) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, 
glsl_type::float_type); + src_reg src0 = src_reg(v, glsl_type::float_type); + src_reg src1 = src_reg(v, glsl_type::float_type); + src_reg zero(brw_imm_f(0.0f)); + dst_reg dest_null = bld.null_reg_f(); + + bld.ADD(dest, src0, src1); + bld.CMP(dest_null, src_reg(dest), zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add dest.x src0 src1 + * 1: cmp.ge.f0 null.xyzw dest 0.0f + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod); +} + +TEST_F(cmod_propagation_test, andz_one) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::int_type); + src_reg src0 = src_reg(v, glsl_type::float_type); + src_reg zero(brw_imm_f(0.0f)); + src_reg one(brw_imm_d(1)); + + bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L); + set_condmod(BRW_CONDITIONAL_Z, + bld.AND(bld.null_reg_d(), src_reg(dest), one)); + + /* = Before = + * 0: cmp.l.f0 dest:F src0:F 0F + * 1: and.z.f0 null:D dest:D 1D + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_AND, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_EQ, instruction(block0, 1)->conditional_mod); +} + 
+TEST_F(cmod_propagation_test, non_cmod_instruction) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::uint_type); + src_reg src0 = src_reg(v, glsl_type::uint_type); + src_reg zero(brw_imm_ud(0u)); + bld.FBL(dest, src0); + bld.CMP(bld.null_reg_ud(), src_reg(dest), zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: fbl dest src0 + * 1: cmp.ge.f0 null dest 0u + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_FBL, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod); +} + +TEST_F(cmod_propagation_test, intervening_flag_write) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::float_type); + src_reg src0 = src_reg(v, glsl_type::float_type); + src_reg src1 = src_reg(v, glsl_type::float_type); + src_reg src2 = src_reg(v, glsl_type::float_type); + src_reg zero(brw_imm_f(0.0f)); + bld.ADD(dest, src0, src1); + bld.CMP(bld.null_reg_f(), src2, zero, BRW_CONDITIONAL_GE); + bld.CMP(bld.null_reg_f(), src_reg(dest), zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add dest src0 src1 + * 1: cmp.ge.f0 null src2 0.0f + * 2: cmp.ge.f0 null dest 0.0f + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod); + 
EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod); +} + +TEST_F(cmod_propagation_test, intervening_flag_read) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest0 = dst_reg(v, glsl_type::float_type); + dst_reg dest1 = dst_reg(v, glsl_type::float_type); + src_reg src0 = src_reg(v, glsl_type::float_type); + src_reg src1 = src_reg(v, glsl_type::float_type); + src_reg src2 = src_reg(v, glsl_type::float_type); + src_reg zero(brw_imm_f(0.0f)); + bld.ADD(dest0, src0, src1); + set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero)); + bld.CMP(bld.null_reg_f(), src_reg(dest0), zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add dest0 src0 src1 + * 1: (+f0) sel dest1 src2 0.0f + * 2: cmp.ge.f0 null dest0 0.0f + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod); +} + +TEST_F(cmod_propagation_test, intervening_dest_write) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::vec4_type); + src_reg src0 = src_reg(v, glsl_type::float_type); + src_reg src1 = src_reg(v, glsl_type::float_type); + src_reg src2 = src_reg(v, glsl_type::vec2_type); + src_reg zero(brw_imm_f(0.0f)); + bld.ADD(offset(dest, 8, 2), src0, src1); + bld.emit(SHADER_OPCODE_TEX, dest, src2) + ->size_written = 4 * REG_SIZE; + bld.CMP(bld.null_reg_f(), offset(src_reg(dest), 8, 2), zero, BRW_CONDITIONAL_GE); + + /* = 
Before = + * + * 0: add dest+2 src0 src1 + * 1: tex rlen 4 dest+0 src2 + * 2: cmp.ge.f0 null dest+2 0.0f + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(SHADER_OPCODE_TEX, instruction(block0, 1)->opcode); + /* The intervening TEX (instruction 1) must stay unconditional as well. */ + EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 1)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod); +} + +TEST_F(cmod_propagation_test, intervening_flag_read_same_value) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest0 = dst_reg(v, glsl_type::float_type); + dst_reg dest1 = dst_reg(v, glsl_type::float_type); + src_reg src0 = src_reg(v, glsl_type::float_type); + src_reg src1 = src_reg(v, glsl_type::float_type); + src_reg src2 = src_reg(v, glsl_type::float_type); + src_reg zero(brw_imm_f(0.0f)); + dst_reg dest_null = bld.null_reg_f(); + dest_null.writemask = WRITEMASK_X; + + set_condmod(BRW_CONDITIONAL_GE, bld.ADD(dest0, src0, src1)); + set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero)); + bld.CMP(dest_null, src_reg(dest0), zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add.ge.f0 dest0 src0 src1 + * 1: (+f0) sel dest1 src2 0.0f + * 2: cmp.ge.f0 null.x dest0 0.0f + * + * = After = + * 0: add.ge.f0 dest0 src0 src1 + * 1: (+f0) sel dest1 src2 0.0f + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + + EXPECT_TRUE(cmod_propagation(v)); + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); 
+ EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate); +} + +TEST_F(cmod_propagation_test, negate) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::float_type); + src_reg src0 = src_reg(v, glsl_type::float_type); + src_reg src1 = src_reg(v, glsl_type::float_type); + src_reg zero(brw_imm_f(0.0f)); + bld.ADD(dest, src0, src1); + src_reg tmp_src = src_reg(dest); + tmp_src.negate = true; + dst_reg dest_null = bld.null_reg_f(); + dest_null.writemask = WRITEMASK_X; + bld.CMP(dest_null, tmp_src, zero, BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add dest src0 src1 + * 1: cmp.ge.f0 null.x -dest 0.0f + * + * = After = + * 0: add.le.f0 dest src0 src1 + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_TRUE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(0, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_LE, instruction(block0, 0)->conditional_mod); +} + +TEST_F(cmod_propagation_test, movnz) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::float_type); + src_reg src0 = src_reg(v, glsl_type::float_type); + src_reg src1 = src_reg(v, glsl_type::float_type); + dst_reg dest_null = bld.null_reg_f(); + dest_null.writemask = WRITEMASK_X; + + bld.CMP(dest, src0, src1, BRW_CONDITIONAL_L); + set_condmod(BRW_CONDITIONAL_NZ, + bld.MOV(dest_null, src_reg(dest))); + + /* = Before = + * + * 0: cmp.l.f0 dest:F src0:F src1:F + * 1: mov.nz.f0 null.x dest:F + * + * = After = + * 0: cmp.l.f0 dest src0:F src1:F + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_TRUE(cmod_propagation(v)); + 
+ ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(0, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod); +} + +TEST_F(cmod_propagation_test, different_types_cmod_with_zero) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::int_type); + src_reg src0 = src_reg(v, glsl_type::int_type); + src_reg src1 = src_reg(v, glsl_type::int_type); + src_reg zero(brw_imm_f(0.0f)); + bld.ADD(dest, src0, src1); + bld.CMP(bld.null_reg_f(), retype(src_reg(dest), BRW_REGISTER_TYPE_F), zero, + BRW_CONDITIONAL_GE); + + /* = Before = + * + * 0: add dest:D src0:D src1:D + * 1: cmp.ge.f0 null:F dest:F 0.0f + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod); +} + +TEST_F(cmod_propagation_test, andnz_non_one) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::int_type); + src_reg src0 = src_reg(v, glsl_type::float_type); + src_reg zero(brw_imm_f(0.0f)); + src_reg nonone(brw_imm_d(38)); + + bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L); + set_condmod(BRW_CONDITIONAL_NZ, + bld.AND(bld.null_reg_d(), src_reg(dest), nonone)); + + /* = Before = + * 0: cmp.l.f0 dest:F src0:F 0F + * 1: and.nz.f0 null:D dest:D 38D + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(1, 
block0->end_ip); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_AND, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod); +} + +/* Note that the "basic" test uses glsl_type::float_type operands, while this + * one uses glsl_type::vec4_type operands. */ +TEST_F(cmod_propagation_test, basic_vec4) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::vec4_type); + src_reg src0 = src_reg(v, glsl_type::vec4_type); + src_reg src1 = src_reg(v, glsl_type::vec4_type); + src_reg zero(brw_imm_f(0.0f)); + + bld.MUL(dest, src0, src1); + bld.CMP(bld.null_reg_f(), src_reg(dest), zero, BRW_CONDITIONAL_NZ); + + /* = Before = + * 0: mul dest.xyzw src0.xyzw src1.xyzw + * 1: cmp.nz.f0.0 null.xyzw dest.xyzw 0.0f + * + * = After = + * 0: mul.nz.f0.0 dest.xyzw src0.xyzw src1.xyzw + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_TRUE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(0, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod); +} + +TEST_F(cmod_propagation_test, basic_vec4_different_dst_writemask) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::vec4_type); + dest.writemask = WRITEMASK_X; + src_reg src0 = src_reg(v, glsl_type::vec4_type); + src_reg src1 = src_reg(v, glsl_type::vec4_type); + src_reg zero(brw_imm_f(0.0f)); + dst_reg dest_null = bld.null_reg_f(); + + bld.MUL(dest, src0, src1); + bld.CMP(dest_null, src_reg(dest), zero, BRW_CONDITIONAL_NZ); + + /* = Before = + * 0: mul dest.x src0 src1 + * 1: cmp.nz.f0.0 null dest 0.0f + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + 
EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod); +} + +TEST_F(cmod_propagation_test, mad_one_component_vec4) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::vec4_type); + dest.writemask = WRITEMASK_X; + src_reg src0 = src_reg(v, glsl_type::vec4_type); + src_reg src1 = src_reg(v, glsl_type::vec4_type); + src_reg src2 = src_reg(v, glsl_type::vec4_type); + src0.swizzle = src1.swizzle = src2.swizzle = BRW_SWIZZLE_XXXX; + src2.negate = true; + src_reg zero(brw_imm_f(0.0f)); + src_reg tmp(dest); + tmp.swizzle = BRW_SWIZZLE_XXXX; + dst_reg dest_null = bld.null_reg_f(); + dest_null.writemask = WRITEMASK_X; + + bld.MAD(dest, src0, src1, src2); + bld.CMP(dest_null, tmp, zero, BRW_CONDITIONAL_L); + + /* = Before = + * + * 0: mad dest.x:F src0.xxxx:F src1.xxxx:F -src2.xxxx:F + * 1: cmp.l.f0.0 null.x:F dest.xxxx:F 0.0f + * + * = After = + * 0: mad.l.f0 dest.x:F src0.xxxx:F src1.xxxx:F -src2.xxxx:F + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_TRUE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(0, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_MAD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod); +} + +TEST_F(cmod_propagation_test, mad_more_one_component_vec4) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::vec4_type); + dest.writemask = WRITEMASK_XW; + src_reg src0 = src_reg(v, glsl_type::vec4_type); + src_reg src1 = 
src_reg(v, glsl_type::vec4_type); + src_reg src2 = src_reg(v, glsl_type::vec4_type); + src0.swizzle = src1.swizzle = src2.swizzle = BRW_SWIZZLE_XXXX; + src2.negate = true; + src_reg zero(brw_imm_f(0.0f)); + src_reg tmp(dest); + tmp.swizzle = BRW_SWIZZLE_XXXX; + dst_reg dest_null = bld.null_reg_f(); + + bld.MAD(dest, src0, src1, src2); + bld.CMP(dest_null, tmp, zero, BRW_CONDITIONAL_L); + + /* = Before = + * + * 0: mad dest.xw:F src0.xxxx:F src1.xxxx:F -src2.xxxx:F + * 1: cmp.l.f0.0 null:F dest.xxxx:F 0.0f + * + * = After = + * (No changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_MAD, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 1)->conditional_mod); +} + +TEST_F(cmod_propagation_test, cmp_mov_vec4) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::ivec4_type); + dest.writemask = WRITEMASK_X; + src_reg src0 = src_reg(v, glsl_type::ivec4_type); + src0.swizzle = BRW_SWIZZLE_XXXX; + src0.file = UNIFORM; + src_reg nonone = retype(brw_imm_d(16), BRW_REGISTER_TYPE_D); + src_reg mov_src = src_reg(dest); + mov_src.swizzle = BRW_SWIZZLE_XXXX; + dst_reg dest_null = bld.null_reg_d(); + dest_null.writemask = WRITEMASK_X; + + bld.CMP(dest, src0, nonone, BRW_CONDITIONAL_GE); + set_condmod(BRW_CONDITIONAL_NZ, + bld.MOV(dest_null, mov_src)); + + /* = Before = + * + * 0: cmp.ge.f0 dest.x:D u.xxxx:D 16D + * 1: mov.nz.f0 null.x:D dest.xxxx:D + * + * = After = + * 0: cmp.ge.f0 dest.x:D u.xxxx:D 16D + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + 
EXPECT_TRUE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(0, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod); +} + +TEST_F(cmod_propagation_test, mul_cmp_different_channels_vec4) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + dst_reg dest = dst_reg(v, glsl_type::vec4_type); + src_reg src0 = src_reg(v, glsl_type::vec4_type); + src_reg src1 = src_reg(v, glsl_type::vec4_type); + src_reg zero(brw_imm_f(0.0f)); + src_reg cmp_src = src_reg(dest); + cmp_src.swizzle = BRW_SWIZZLE4(0,1,3,2); + + bld.MUL(dest, src0, src1); + bld.CMP(bld.null_reg_f(), cmp_src, zero, BRW_CONDITIONAL_NZ); + + /* = Before = + * 0: mul dest src0 src1 + * 1: cmp.nz.f0.0 null dest.xywz 0.0f + * + * = After = + * (No changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod); +} diff --git a/src/intel/compiler/test_vec4_copy_propagation.cpp b/src/intel/compiler/test_vec4_copy_propagation.cpp new file mode 100644 index 00000000000..f4f91d8c8c7 --- /dev/null +++ b/src/intel/compiler/test_vec4_copy_propagation.cpp @@ -0,0 +1,181 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the 
Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <gtest/gtest.h> +#include "brw_vec4.h" +#include "program/program.h" + +using namespace brw; + +int ret = 0; + +class copy_propagation_test : public ::testing::Test { + virtual void SetUp(); + +public: + struct brw_compiler *compiler; + struct gen_device_info *devinfo; + struct gl_context *ctx; + struct gl_shader_program *shader_prog; + struct brw_vue_prog_data *prog_data; + vec4_visitor *v; +}; + +class copy_propagation_vec4_visitor : public vec4_visitor +{ +public: + copy_propagation_vec4_visitor(struct brw_compiler *compiler, + nir_shader *shader, + struct brw_vue_prog_data *prog_data) + : vec4_visitor(compiler, NULL, NULL, prog_data, shader, NULL, + false /* no_spills */, -1) + { + prog_data->dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT; + } + +protected: + virtual dst_reg *make_reg_for_system_value(int location) + { + unreachable("Not reached"); + } + + virtual void setup_payload() + { + unreachable("Not reached"); + } + + virtual void emit_prolog() + { + unreachable("Not reached"); + } + + virtual void emit_thread_end() + { + unreachable("Not reached"); + } + + virtual void emit_urb_write_header(int mrf) + { + unreachable("Not reached"); + } + + virtual vec4_instruction 
*emit_urb_write_opcode(bool complete) + { + unreachable("Not reached"); + } +}; + + +void copy_propagation_test::SetUp() +{ + ctx = (struct gl_context *)calloc(1, sizeof(*ctx)); + compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler)); + devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo)); + prog_data = (struct brw_vue_prog_data *)calloc(1, sizeof(*prog_data)); + compiler->devinfo = devinfo; + + nir_shader *shader = + nir_shader_create(NULL, MESA_SHADER_VERTEX, NULL, NULL); + + v = new copy_propagation_vec4_visitor(compiler, shader, prog_data); + + devinfo->gen = 4; +} + +static void +copy_propagation(vec4_visitor *v) +{ + bool print = false; + + if (print) { + fprintf(stderr, "instructions before:\n"); + v->dump_instructions(); + } + + v->calculate_cfg(); + v->opt_copy_propagation(); + + if (print) { + fprintf(stderr, "instructions after:\n"); + v->dump_instructions(); + } +} + +TEST_F(copy_propagation_test, test_swizzle_swizzle) +{ + dst_reg a = dst_reg(v, glsl_type::vec4_type); + dst_reg b = dst_reg(v, glsl_type::vec4_type); + dst_reg c = dst_reg(v, glsl_type::vec4_type); + + v->emit(v->ADD(a, src_reg(a), src_reg(a))); + + v->emit(v->MOV(b, swizzle(src_reg(a), BRW_SWIZZLE4(SWIZZLE_Y, + SWIZZLE_Z, + SWIZZLE_W, + SWIZZLE_X)))); + + vec4_instruction *test_mov = + v->MOV(c, swizzle(src_reg(b), BRW_SWIZZLE4(SWIZZLE_Y, + SWIZZLE_Z, + SWIZZLE_W, + SWIZZLE_X))); + v->emit(test_mov); + + copy_propagation(v); + + EXPECT_EQ(test_mov->src[0].nr, a.nr); + EXPECT_EQ(test_mov->src[0].swizzle, BRW_SWIZZLE4(SWIZZLE_Z, + SWIZZLE_W, + SWIZZLE_X, + SWIZZLE_Y)); +} + +TEST_F(copy_propagation_test, test_swizzle_writemask) +{ + dst_reg a = dst_reg(v, glsl_type::vec4_type); + dst_reg b = dst_reg(v, glsl_type::vec4_type); + dst_reg c = dst_reg(v, glsl_type::vec4_type); + + v->emit(v->MOV(b, swizzle(src_reg(a), BRW_SWIZZLE4(SWIZZLE_X, + SWIZZLE_Y, + SWIZZLE_X, + SWIZZLE_Z)))); + + v->emit(v->MOV(writemask(a, WRITEMASK_XYZ), brw_imm_f(1.0f))); + + vec4_instruction 
*test_mov = + v->MOV(c, swizzle(src_reg(b), BRW_SWIZZLE4(SWIZZLE_W, + SWIZZLE_W, + SWIZZLE_W, + SWIZZLE_W))); + v->emit(test_mov); + + copy_propagation(v); + + /* should not copy propagate */ + EXPECT_EQ(test_mov->src[0].nr, b.nr); + EXPECT_EQ(test_mov->src[0].swizzle, BRW_SWIZZLE4(SWIZZLE_W, + SWIZZLE_W, + SWIZZLE_W, + SWIZZLE_W)); +} diff --git a/src/intel/compiler/test_vec4_register_coalesce.cpp b/src/intel/compiler/test_vec4_register_coalesce.cpp new file mode 100644 index 00000000000..a3dbb0a72e4 --- /dev/null +++ b/src/intel/compiler/test_vec4_register_coalesce.cpp @@ -0,0 +1,242 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include <gtest/gtest.h> +#include "brw_vec4.h" +#include "program/program.h" + +using namespace brw; + +int ret = 0; + +#define register_coalesce(v) _register_coalesce(v, __func__) + +class register_coalesce_test : public ::testing::Test { + virtual void SetUp(); + +public: + struct brw_compiler *compiler; + struct gen_device_info *devinfo; + struct gl_context *ctx; + struct gl_shader_program *shader_prog; + struct brw_vue_prog_data *prog_data; + vec4_visitor *v; +}; + + +class register_coalesce_vec4_visitor : public vec4_visitor +{ +public: + register_coalesce_vec4_visitor(struct brw_compiler *compiler, + nir_shader *shader, + struct brw_vue_prog_data *prog_data) + : vec4_visitor(compiler, NULL, NULL, prog_data, shader, NULL, + false /* no_spills */, -1) + { + prog_data->dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT; + } + +protected: + virtual dst_reg *make_reg_for_system_value(int location) + { + unreachable("Not reached"); + } + + virtual void setup_payload() + { + unreachable("Not reached"); + } + + virtual void emit_prolog() + { + unreachable("Not reached"); + } + + virtual void emit_thread_end() + { + unreachable("Not reached"); + } + + virtual void emit_urb_write_header(int mrf) + { + unreachable("Not reached"); + } + + virtual vec4_instruction *emit_urb_write_opcode(bool complete) + { + unreachable("Not reached"); + } +}; + + +void register_coalesce_test::SetUp() +{ + ctx = (struct gl_context *)calloc(1, sizeof(*ctx)); + compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler)); + devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo)); + prog_data = (struct brw_vue_prog_data *)calloc(1, sizeof(*prog_data)); + compiler->devinfo = devinfo; + + nir_shader *shader = + nir_shader_create(NULL, MESA_SHADER_VERTEX, NULL, NULL); + + v = new register_coalesce_vec4_visitor(compiler, shader, prog_data); + + devinfo->gen = 4; +} + +static void +_register_coalesce(vec4_visitor *v, const char *func) +{ + bool print = false; + + if (print) { + 
printf("%s: instructions before:\n", func); + v->dump_instructions(); + } + + v->calculate_cfg(); + v->opt_register_coalesce(); + + if (print) { + printf("%s: instructions after:\n", func); + v->dump_instructions(); + } +} + +TEST_F(register_coalesce_test, test_compute_to_mrf) +{ + src_reg something = src_reg(v, glsl_type::float_type); + dst_reg temp = dst_reg(v, glsl_type::float_type); + dst_reg init; + + dst_reg m0 = dst_reg(MRF, 0); + m0.writemask = WRITEMASK_X; + m0.type = BRW_REGISTER_TYPE_F; + + vec4_instruction *mul = v->emit(v->MUL(temp, something, brw_imm_f(1.0f))); + v->emit(v->MOV(m0, src_reg(temp))); + + register_coalesce(v); + + EXPECT_EQ(mul->dst.file, MRF); +} + + +TEST_F(register_coalesce_test, test_multiple_use) +{ + src_reg something = src_reg(v, glsl_type::float_type); + dst_reg temp = dst_reg(v, glsl_type::vec4_type); + dst_reg init; + + dst_reg m0 = dst_reg(MRF, 0); + m0.writemask = WRITEMASK_X; + m0.type = BRW_REGISTER_TYPE_F; + + dst_reg m1 = dst_reg(MRF, 1); + m1.writemask = WRITEMASK_XYZW; + m1.type = BRW_REGISTER_TYPE_F; + + src_reg src = src_reg(temp); + vec4_instruction *mul = v->emit(v->MUL(temp, something, brw_imm_f(1.0f))); + src.swizzle = BRW_SWIZZLE_XXXX; + v->emit(v->MOV(m0, src)); + src.swizzle = BRW_SWIZZLE_XYZW; + v->emit(v->MOV(m1, src)); + + register_coalesce(v); + + EXPECT_NE(mul->dst.file, MRF); +} + +TEST_F(register_coalesce_test, test_dp4_mrf) +{ + src_reg some_src_1 = src_reg(v, glsl_type::vec4_type); + src_reg some_src_2 = src_reg(v, glsl_type::vec4_type); + dst_reg init; + + dst_reg m0 = dst_reg(MRF, 0); + m0.writemask = WRITEMASK_Y; + m0.type = BRW_REGISTER_TYPE_F; + + dst_reg temp = dst_reg(v, glsl_type::float_type); + + vec4_instruction *dp4 = v->emit(v->DP4(temp, some_src_1, some_src_2)); + v->emit(v->MOV(m0, src_reg(temp))); + + register_coalesce(v); + + EXPECT_EQ(dp4->dst.file, MRF); + EXPECT_EQ(dp4->dst.writemask, WRITEMASK_Y); +} + +TEST_F(register_coalesce_test, test_dp4_grf) +{ + src_reg some_src_1 = 
src_reg(v, glsl_type::vec4_type); + src_reg some_src_2 = src_reg(v, glsl_type::vec4_type); + dst_reg init; + + dst_reg to = dst_reg(v, glsl_type::vec4_type); + dst_reg temp = dst_reg(v, glsl_type::float_type); + + vec4_instruction *dp4 = v->emit(v->DP4(temp, some_src_1, some_src_2)); + to.writemask = WRITEMASK_Y; + v->emit(v->MOV(to, src_reg(temp))); + + /* if we don't do something with the result, the automatic dead code + * elimination will remove all our instructions. + */ + src_reg src = src_reg(to); + src.negate = true; + v->emit(v->MOV(dst_reg(MRF, 0), src)); + + register_coalesce(v); + + EXPECT_EQ(dp4->dst.nr, to.nr); + EXPECT_EQ(dp4->dst.writemask, WRITEMASK_Y); +} + +TEST_F(register_coalesce_test, test_channel_mul_grf) +{ + src_reg some_src_1 = src_reg(v, glsl_type::vec4_type); + src_reg some_src_2 = src_reg(v, glsl_type::vec4_type); + dst_reg init; + + dst_reg to = dst_reg(v, glsl_type::vec4_type); + dst_reg temp = dst_reg(v, glsl_type::float_type); + + vec4_instruction *mul = v->emit(v->MUL(temp, some_src_1, some_src_2)); + to.writemask = WRITEMASK_Y; + v->emit(v->MOV(to, src_reg(temp))); + + /* if we don't do something with the result, the automatic dead code + * elimination will remove all our instructions. 
+ */ + src_reg src = src_reg(to); + src.negate = true; + v->emit(v->MOV(dst_reg(MRF, 0), src)); + + register_coalesce(v); + + EXPECT_EQ(mul->dst.nr, to.nr); +} diff --git a/src/intel/compiler/test_vf_float_conversions.cpp b/src/intel/compiler/test_vf_float_conversions.cpp new file mode 100644 index 00000000000..7af97d0d097 --- /dev/null +++ b/src/intel/compiler/test_vf_float_conversions.cpp @@ -0,0 +1,110 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <gtest/gtest.h> +#include <math.h> +#include "brw_reg.h" + +class vf_float_conversion_test : public ::testing::Test { + virtual void SetUp(); + +public: + float vf_to_float[128]; +}; + +void vf_float_conversion_test::SetUp() { + /* 0 is special cased. 
*/ + vf_to_float[0] = 0.0; + + for (int vf = 1; vf < 128; vf++) { + int ebits = (vf >> 4) & 0x7; + int mbits = vf & 0xf; + + float x = 1.0f + mbits / 16.0f; + int exp = ebits - 3; + + vf_to_float[vf] = ldexpf(x, exp); + } +} + +union fu { + float f; + unsigned u; +}; + +static unsigned +f2u(float f) +{ + union fu fu; + fu.f = f; + return fu.u; +} + +TEST_F(vf_float_conversion_test, test_vf_to_float) +{ + for (int vf = 0; vf < 256; vf++) { + float expected = vf_to_float[vf % 128]; + if (vf > 127) + expected = -expected; + + EXPECT_EQ(f2u(expected), f2u(brw_vf_to_float(vf))); + } +} + +TEST_F(vf_float_conversion_test, test_float_to_vf) +{ + for (int vf = 0; vf < 256; vf++) { + float f = vf_to_float[vf % 128]; + if (vf > 127) + f = -f; + + EXPECT_EQ(vf, brw_float_to_vf(f)); + } +} + +TEST_F(vf_float_conversion_test, test_special_case_0) +{ + /* ±0.0f are special cased to the VFs that would otherwise correspond + * to ±0.125f. Make sure we can't convert these values to VF. + */ + EXPECT_EQ(brw_float_to_vf(+0.125f), -1); + EXPECT_EQ(brw_float_to_vf(-0.125f), -1); + + EXPECT_EQ(f2u(brw_vf_to_float(brw_float_to_vf(+0.0f))), f2u(+0.0f)); + EXPECT_EQ(f2u(brw_vf_to_float(brw_float_to_vf(-0.0f))), f2u(-0.0f)); +} + +TEST_F(vf_float_conversion_test, test_nonrepresentable_float_input) +{ + EXPECT_EQ(brw_float_to_vf(+32.0f), -1); + EXPECT_EQ(brw_float_to_vf(-32.0f), -1); + + EXPECT_EQ(brw_float_to_vf(+16.5f), -1); + EXPECT_EQ(brw_float_to_vf(-16.5f), -1); + + EXPECT_EQ(brw_float_to_vf(+8.25f), -1); + EXPECT_EQ(brw_float_to_vf(-8.25f), -1); + + EXPECT_EQ(brw_float_to_vf(+4.125f), -1); + EXPECT_EQ(brw_float_to_vf(-4.125f), -1); +} |