diff options
author | Vadim Girlin <[email protected]> | 2013-04-30 20:51:36 +0400 |
---|---|---|
committer | Vadim Girlin <[email protected]> | 2013-04-30 21:50:47 +0400 |
commit | 2cd769179345799d383f92dd615991755ec24be1 (patch) | |
tree | 9863c9e92e645cad35a861b7de76f0c98d64b0d0 /src/gallium/drivers/r600/sb | |
parent | fbb065d629d2f79a6224fc3e5e89d5acc275e3b4 (diff) |
r600g/sb: initial commit of the optimizing shader backend
Diffstat (limited to 'src/gallium/drivers/r600/sb')
35 files changed, 17498 insertions, 0 deletions
diff --git a/src/gallium/drivers/r600/sb/notes.markdown b/src/gallium/drivers/r600/sb/notes.markdown new file mode 100644 index 00000000000..056497754a2 --- /dev/null +++ b/src/gallium/drivers/r600/sb/notes.markdown @@ -0,0 +1,413 @@ +r600-sb +======= + +* * * * * + +Debugging +--------- + +### Environment variables + +- **R600\_DEBUG** + + There are new flags: + + - **sb** - Enable optimization of graphics shaders + - **sbcl** - Enable optimization of compute shaders (experimental) + - **sbdry** - Dry run, optimize but use source bytecode - + useful if you only want to check shader dumps + without the risk of lockups and other problems + - **sbstat** - Print optimization statistics (only time so far) + - **sbdump** - Print IR after some passes. + +### Regression debugging + +If there are any regressions as compared to the default backend +(R600\_SB=0), it's possible to use the following environment variables +to find the incorrectly optimized shader that causes the regression. + +- **R600\_SB\_DSKIP\_MODE** - allows to skip optimization for some + shaders + - 0 - disabled (default) + - 1 - skip optimization for the shaders in the range + [R600\_SB\_DSKIP\_START; R600\_SB\_DSKIP\_END], that is, + optimize only the shaders that are not in this range + - 2 - optimize only the shaders in the range + [R600\_SB\_DSKIP\_START; R600\_SB\_DSKIP\_END] + +- **R600\_SB\_DSKIP\_START** - start of the range (1-based) + +- **R600\_SB\_DSKIP\_END** - end of the range (1-based) + +Example - optimize only the shaders 5, 6, and 7: + + R600_SB_DSKIP_START=5 R600_SB_DSKIP_END=7 R600_SB_DSKIP_MODE=2 + +All shaders compiled by the application are numbered starting from 1, +the number of shaders used by the application may be obtained by running +it with "R600_DEBUG=sb,sbstat" - it will print "sb: shader \#index\#" +for each compiled shader. 
+ +After figuring out the total number of shaders used by the application, +the variables above allow using bisection to find the shader that is +the cause of the regression. E.g. if the application uses 100 shaders, we +can divide the range [1; 100] and run the application with the +optimization enabled only for the first half of the shaders: + + R600_SB_DSKIP_START=1 R600_SB_DSKIP_END=50 R600_SB_DSKIP_MODE=2 <app> + +If the regression is reproduced with these parameters, then the failing +shader is in the range [1; 50], if it's not reproduced - then it's in +the range [51; 100]. Then we can divide the new range again and repeat +the testing, until we reduce the range to a single failing shader. + +*NOTE: This method relies on the assumption that the application +produces the same sequence of the shaders on each run. It's not always +true - some applications may produce different sequences of the shaders, +in such cases tools like apitrace may be used to record a trace +of the application, then this method may be applied when replaying the +trace - also this may be faster and/or more convenient than testing the +application itself.* + +* * * * * + +Intermediate Representation +--------------------------- + +### Values + +All kinds of the operands (literal constants, references to kcache +constants, references to GPRs, etc) are currently represented by the +**value** class (possibly it makes sense to switch to a hierarchy of +classes derived from **value** instead, to save some memory). + +All values (except some pseudo values like the exec\_mask or predicate +register) represent 32-bit scalar values - there are no vector values, +CF/FETCH instructions use groups of 4 values for src and dst operands. + +### Nodes + +Shader programs are represented using a tree data structure, some +nodes contain a list of subnodes. 
+ +#### Control flow nodes + +Control flow information is represented using four special node types +(based on the ideas from [[1]](#references) ) + +- **region\_node** - single-entry, single-exit region. + + All loops and if's in the program are enclosed in region nodes. + Region nodes have two containers for phi nodes - + region\_node::loop\_phi contains the phi expressions to be executed + at the region entry, region\_node::phi contains the phi expressions + to be executed at the region exit. It's the only type of the node + that contains associated phi expressions. + +- **depart\_node** - "depart region \$id after { ... }" + + Depart target region (jump to exit point) after executing contained + code. + +- **repeat\_node** - "repeat region \$id after { ... }" + + Repeat target region (jump to entry point) after executing contained + code. + +- **if\_node** - "if (cond) { ... }" + + Execute contained code if condition is true. The difference from + [[1]](#references) is that we don't have associated phi expressions + for the **if\_node**, we enclose **if\_node** in the + **region\_node** and store corresponding phi's in the + **region\_node**, this allows more uniform handling. + +The target region of depart and repeat nodes is always the region where +they are located (possibly in the nested region), there are no arbitrary +jumps/goto's - control flow in the program is always structured. 
+ +Typical control flow constructs can be represented as in the following +examples: + +GLSL: + + if (cond) { + < 1 > + } else { + < 2 > + } + +IR: + + region #0 { + depart region #0 after { + if (cond) { + depart region #0 after { + < 1 > + } + } + < 2 > + } + <region #0 phi nodes > + } + +GLSL: + + while (cond) { + < 1 > + } + +IR: + + region #0 { + <region #0 loop_phi nodes> + repeat region #0 after { + region #1 { + depart region #1 after { + if (!cond) { + depart region #0 + } + } + } + < 1 > + } + <region #0 phi nodes> + } + +'Break' and 'continue' inside the loops are directly translated to the +depart and repeat nodes for the corresponding loop region. + +This may look a bit too complicated, but in fact this allows simpler +and more uniform handling of the control flow. + +All loop\_phi and phi nodes for a given region always have the same number +of source operands. The number of source operands for +region\_node::loop\_phi nodes is 1 + number of repeat nodes that +reference this region as a target. The number of source operands for +region\_node::phi nodes is equal to the number of depart nodes that +reference this region as a target. All depart/repeat nodes for the +region have unique indices equal to the index of the corresponding source operand of the +phi/loop\_phi nodes. + +The first source operand for region\_node::loop\_phi nodes (src[0]) is an +incoming value that enters the region from the outside. Each remaining +source operand comes from the corresponding repeat node. 
+ +More complex example: + +GLSL: + + a = 1; + while (a < 5) { + a = a * 2; + if (b == 3) { + continue; + } else { + a = 6; + } + if (c == 4) + break; + a = a + 1; + } + +IR with SSA form: + + a.1 = 1; + region #0 { + // loop phi values: src[0] - incoming, src[1] - from repeat_1, src[2] - from repeat_2 + region #0 loop_phi: a.2 = phi a.1, a.6, a.3 + + repeat_1 region #0 after { + a.3 = a.2 * 2; + cond1 = (b == 3); + region #1 { + depart_0 region #1 after { + if (cond1) { + repeat_2 region #0; + } + } + a.4 = 6; + + region #1 phi: a.5 = phi a.4; // src[0] - from depart_0 + } + cond2 = (c == 4); + region #2 { + depart_0 region #2 after { + if (cond2) { + depart_0 region #0; + } + } + } + a.6 = a.5 + 1; + } + + region #0 phi: a.7 = phi a.5 // src[0] from depart_0 + } + +Phi nodes with a single source operand are just copies; they are not +really necessary, but this allows handling all **depart\_node**s in a +uniform way. + +#### Instruction nodes + +Instruction nodes represent different kinds of instructions - +**alu\_node**, **cf\_node**, **fetch\_node**, etc. Each of them contains +the "bc" structure where all fields of the bytecode are stored (the type +is **bc\_alu** for **alu\_node**, etc). The operands are represented +using vectors of pointers to the **value** class (node::src, node::dst). + +#### SSA-specific nodes + +Phi nodes currently don't have a special node class, they are stored as +**node**. Destination vector contains a single destination value, source +vector contains 1 or more source values. + +Psi nodes [[5], [6]](#references) also don't have a special node class +and are stored as **node**. Source vector contains 3 values for each source +operand - the **value** of predicate, **value** of corresponding +PRED\_SEL field, and the source **value** itself. + +### Indirect addressing + +A special kind of value (VLK\_RELREG) is used to represent indirect +operands. These values don't have SSA versions. 
The representation is +mostly based on [[2]](#references). An indirect operand contains the +"offset/address" value (value::rel) (e.g. some SSA version of the AR +register value, though after some passes it may be any value - constant, +register, etc), also it contains the maydef and mayuse vectors of +pointers to **value**s (similar to dst/src vectors in the **node**) to +represent the effects of aliasing in the SSA form. + +E.g. if we have the array R5.x ... R8.x and the following instruction: + + MOV R0.x, R[5 + AR].x + +then the source indirect operand is represented with the VLK\_RELREG value, +value::rel is AR, value::maydef is empty (in fact it always contains the +same number of elements as mayuse to simplify the handling, but they are +NULLs), value::mayuse contains [R5.x, R6.x, R7.x, R8.x] (or the +corresponding SSA versions after ssa\_rename). + +Additional "virtual variables" as in [HSSA [2]](#references) are not +used, also there is no special handling for "zero versions". Typical +programs in our case are small, indirect addressing is rare, array sizes +are limited by max gpr number, so we don't really need to use special +tricks to avoid the explosion of value versions. Also this allows more +precise liveness computation for array elements without modifications to +the algorithms. + +With the following instruction: + + MOV R[5+AR].x, R0.x + +we'll have both maydef and mayuse vectors for dst operand filled with +array values initially: [R5.x, R6.x, R7.x, R8.x]. After the ssa\_rename +pass mayuse will contain previous versions, maydef will contain new +potentially-defined versions. + +* * * * * + +Passes +------ + +- **bc\_parser** - creates the IR from the source bytecode, + initializes src and dst value vectors for instruction nodes. Most + ALU nodes have one dst operand and the number of source operands is + equal to the number of source operands for the ISA instruction. 
+ Nodes for PREDSETxx instructions have 3 dst operands - dst[0] is dst + gpr as in the original instruction, other two are pseudo-operands + that represent possibly updated predicate and exec\_mask. Predicate + values are used in the predicated alu instructions (node::pred), + exec\_mask values are used in the if\_nodes (if\_node::cond). Each + vector operand in the CF/TEX/VTX instructions is represented with 4 + values - components of the vector. + +- **ssa\_prepare** - creates phi expressions. + +- **ssa\_rename** - renames the values (assigns versions). + +- **liveness** - liveness computation, sets 'dead' flag for unused + nodes and values, optionally computes interference information for + the values. + +- **dce\_cleanup** - eliminates 'dead' nodes, also removes some + unnecessary nodes created by bc\_parser, e.g. the nodes for the JUMP + instructions in the source, containers for ALU groups (they were + only needed for the ssa\_rename pass) + +- **if\_conversion** - converts control flow with if\_nodes to the + data flow in cases where it can improve performance (small alu-only + branches). Both branches are executed speculatively and the phi + expressions are replaced with conditional moves (CNDxx) to select + the final value using the same condition predicate as was used by + the original if\_node. E.g. **if\_node** used dst[2] from PREDSETxx + instruction, CNDxx now uses dst[0] from the same PREDSETxx + instruction. + +- **peephole** - peephole optimizations + +- **gvn** - Global Value Numbering [[2]](#references), + [[3]](#references) + +- **gcm** - Global Code Motion [[3]](#references). Also performs + grouping of the instructions of the same kind (CF/FETCH/ALU). + +- register allocation passes, some ideas are used from + [[4]](#references), but implementation is simplified to make it more + efficient in terms of the compilation speed (e.g. no recursive + recoloring) while achieving good enough results. 
+ + - **ra\_split** - prepares the program for register allocation. + Splits live ranges for constrained values by inserting the + copies to/from temporary values, so that the live range of the + constrained values becomes minimal. + + - **ra\_coalesce** - performs global allocation on registers used + in CF/FETCH instructions. It's performed first to make sure they + end up in the same GPR. Also tries to allocate all values + involved in copies (inserted by the ra\_split pass) to the same + register, so that the copies may be eliminated. + + - **ra\_init** - allocates gpr arrays (if indirect addressing is + used), and remaining values. + +- **post\_scheduler** - ALU scheduler, handles VLIW packing and + performs the final register allocation for local values inside ALU + clauses. Eliminates all coalesced copies (if src and dst of the copy + are allocated to the same register). + +- **ra\_checker** - optional debugging pass that tries to catch basic + errors of the scheduler or regalloc. + +- **bc\_finalize** - propagates the regalloc information from values + in node::src and node::dst vectors to the bytecode fields, converts + control flow structure (region/depart/repeat) to the target + instructions (JUMP/ELSE/POP, + LOOP\_START/LOOP\_END/LOOP\_CONTINUE/LOOP\_BREAK). + +- **bc\_builder** - builds the final bytecode. + +* * * * * + +References +---------- + +[1] ["Tree-Based Code Optimization. A Thesis Proposal", Carl +McConnell](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.38.4210&rep=rep1&type=pdf) + +[2] ["Effective Representation of Aliases and Indirect Memory Operations +in SSA Form", Fred Chow, Sun Chan, Shin-Ming Liu, Raymond Lo, Mark +Streich](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.33.6974&rep=rep1&type=pdf) + +[3] ["Global Code Motion. 
Global Value Numbering.", Cliff +Click](http://www.cs.washington.edu/education/courses/cse501/06wi/reading/click-pldi95.pdf) + +[4] ["Register Allocation for Programs in SSA Form", Sebastian +Hack](http://digbib.ubka.uni-karlsruhe.de/volltexte/documents/6532) + +[5] ["An extension to the SSA representation for predicated code", +Francois de +Ferriere](http://www.cdl.uni-saarland.de/ssasem/talks/Francois.de.Ferriere.pdf) + +[6] ["Improvements to the Psi-SSA Representation", F. de +Ferriere](http://www.scopesconf.org/scopes-07/presentations/3_Presentation.pdf) diff --git a/src/gallium/drivers/r600/sb/sb_bc.h b/src/gallium/drivers/r600/sb/sb_bc.h new file mode 100644 index 00000000000..be68f9fd925 --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_bc.h @@ -0,0 +1,785 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Vadim Girlin + */ + +#ifndef SB_BC_H_ +#define SB_BC_H_ + +extern "C" { +#include <stdint.h> +#include "r600_isa.h" +} + +#include <vector> +#include <stack> + +struct r600_bytecode; +struct r600_shader; + +namespace r600_sb { + +class hw_encoding_format; +class alu_node; +class cf_node; +class fetch_node; +class alu_group_node; +class region_node; +class shader; + +enum shader_target +{ + TARGET_UNKNOWN, + TARGET_VS, + TARGET_PS, + TARGET_GS, + TARGET_COMPUTE, + TARGET_FETCH, + + TARGET_NUM +}; + +enum sb_hw_class_bits +{ + HB_R6 = (1<<0), + HB_R7 = (1<<1), + HB_EG = (1<<2), + HB_CM = (1<<3), + + HB_R6R7 = (HB_R6 | HB_R7), + HB_EGCM = (HB_EG | HB_CM), + HB_R6R7EG = (HB_R6 | HB_R7 | HB_EG), + HB_R7EGCM = (HB_R7 | HB_EG | HB_CM), + + HB_ALL = (HB_R6 | HB_R7 | HB_EG | HB_CM) +}; + +enum sb_hw_chip +{ + HW_CHIP_UNKNOWN, + HW_CHIP_R600, + HW_CHIP_RV610, + HW_CHIP_RV630, + HW_CHIP_RV670, + HW_CHIP_RV620, + HW_CHIP_RV635, + HW_CHIP_RS780, + HW_CHIP_RS880, + HW_CHIP_RV770, + HW_CHIP_RV730, + HW_CHIP_RV710, + HW_CHIP_RV740, + HW_CHIP_CEDAR, + HW_CHIP_REDWOOD, + HW_CHIP_JUNIPER, + HW_CHIP_CYPRESS, + HW_CHIP_HEMLOCK, + HW_CHIP_PALM, + HW_CHIP_SUMO, + HW_CHIP_SUMO2, + HW_CHIP_BARTS, + HW_CHIP_TURKS, + HW_CHIP_CAICOS, + HW_CHIP_CAYMAN, + HW_CHIP_ARUBA +}; + +enum sb_hw_class +{ + HW_CLASS_UNKNOWN, + HW_CLASS_R600, + HW_CLASS_R700, + HW_CLASS_EVERGREEN, + HW_CLASS_CAYMAN +}; + +enum alu_slots { + SLOT_X = 0, + SLOT_Y = 1, + SLOT_Z = 2, + SLOT_W = 3, + SLOT_TRANS = 4 +}; + +enum misc_consts { + MAX_ALU_LITERALS = 4, + MAX_ALU_SLOTS = 128, + MAX_GPR = 128, + MAX_CHAN = 4 + +}; + +enum alu_src_sel { + + ALU_SRC_LDS_OQ_A = 219, + ALU_SRC_LDS_OQ_B = 220, + ALU_SRC_LDS_OQ_A_POP = 221, + ALU_SRC_LDS_OQ_B_POP = 222, + ALU_SRC_LDS_DIRECT_A = 223, + ALU_SRC_LDS_DIRECT_B = 224, + ALU_SRC_TIME_HI = 227, + ALU_SRC_TIME_LO = 228, + ALU_SRC_MASK_HI = 229, + ALU_SRC_MASK_LO = 230, + ALU_SRC_HW_WAVE_ID = 231, + ALU_SRC_SIMD_ID = 232, + ALU_SRC_SE_ID = 233, + 
ALU_SRC_HW_THREADGRP_ID = 234, + ALU_SRC_WAVE_ID_IN_GRP = 235, + ALU_SRC_NUM_THREADGRP_WAVES = 236, + ALU_SRC_HW_ALU_ODD = 237, + ALU_SRC_LOOP_IDX = 238, + ALU_SRC_PARAM_BASE_ADDR = 240, + ALU_SRC_NEW_PRIM_MASK = 241, + ALU_SRC_PRIM_MASK_HI = 242, + ALU_SRC_PRIM_MASK_LO = 243, + ALU_SRC_1_DBL_L = 244, + ALU_SRC_1_DBL_M = 245, + ALU_SRC_0_5_DBL_L = 246, + ALU_SRC_0_5_DBL_M = 247, + ALU_SRC_0 = 248, + ALU_SRC_1 = 249, + ALU_SRC_1_INT = 250, + ALU_SRC_M_1_INT = 251, + ALU_SRC_0_5 = 252, + ALU_SRC_LITERAL = 253, + ALU_SRC_PV = 254, + ALU_SRC_PS = 255, + + ALU_SRC_PARAM_OFFSET = 448 +}; + +enum alu_predicate_select +{ + PRED_SEL_OFF = 0, +// RESERVED = 1, + PRED_SEL_0 = 2, + PRED_SEL_1 = 3 +}; + + +enum alu_omod { + OMOD_OFF = 0, + OMOD_M2 = 1, + OMOD_M4 = 2, + OMOD_D2 = 3 +}; + +enum alu_index_mode { + INDEX_AR_X = 0, + INDEX_AR_Y_R600 = 1, + INDEX_AR_Z_R600 = 2, + INDEX_AR_W_R600 = 3, + + INDEX_LOOP = 4, + INDEX_GLOBAL = 5, + INDEX_GLOBAL_AR_X = 6 +}; + +enum alu_cayman_mova_dst { + CM_MOVADST_AR_X, + CM_MOVADST_PC, + CM_MOVADST_IDX0, + CM_MOVADST_IDX1, + CM_MOVADST_CG0, // clause-global byte 0 + CM_MOVADST_CG1, + CM_MOVADST_CG2, + CM_MOVADST_CG3 +}; + +enum alu_cayman_exec_mask_op { + CM_EMO_DEACTIVATE, + CM_EMO_BREAK, + CM_EMO_CONTINUE, + CM_EMO_KILL +}; + + +enum cf_exp_type { + EXP_PIXEL, + EXP_POS, + EXP_PARAM, + + EXP_TYPE_COUNT +}; + +enum cf_mem_type { + MEM_WRITE, + MEM_WRITE_IND, + MEM_WRITE_ACK, + MEM_WRITE_IND_ACK +}; + + +enum alu_kcache_mode { + KC_LOCK_NONE, + KC_LOCK_1, + KC_LOCK_2, + KC_LOCK_LOOP +}; + +enum alu_kcache_index_mode { + KC_INDEX_NONE, + KC_INDEX_0, + KC_INDEX_1, + KC_INDEX_INVALID +}; + +enum chan_select { + SEL_X = 0, + SEL_Y = 1, + SEL_Z = 2, + SEL_W = 3, + SEL_0 = 4, + SEL_1 = 5, +// RESERVED = 6, + SEL_MASK = 7 +}; + +enum bank_swizzle { + VEC_012 = 0, + VEC_021 = 1, + VEC_120 = 2, + VEC_102 = 3, + VEC_201 = 4, + VEC_210 = 5, + + VEC_NUM = 6, + + SCL_210 = 0, + SCL_122 = 1, + SCL_212 = 2, + SCL_221 = 3, + + SCL_NUM = 4 + +}; + +enum 
sched_queue_id { + SQ_CF, + SQ_ALU, + SQ_TEX, + SQ_VTX, + + SQ_NUM +}; + +struct literal { + union { + int32_t i; + uint32_t u; + float f; + }; + + literal(int32_t i = 0) : i(i) {} + literal(uint32_t u) : u(u) {} + literal(float f) : f(f) {} + literal(double f) : f(f) {} + operator uint32_t() const { return u; } + bool operator ==(literal l) { return u == l.u; } + bool operator ==(int v_int) { return i == v_int; } + bool operator ==(unsigned v_uns) { return u == v_uns; } +}; + +struct bc_kcache { + unsigned mode; + unsigned bank; + unsigned addr; + unsigned index_mode; +} ; + +// TODO optimize bc structures + +struct bc_cf { + + bc_kcache kc[4]; + + unsigned id; + + + const cf_op_info * op_ptr; + unsigned op; + + unsigned addr:32; + + unsigned alt_const:1; + unsigned uses_waterfall:1; + + unsigned barrier:1; + unsigned count:7; + unsigned pop_count:3; + unsigned call_count:6; + unsigned whole_quad_mode:1; + unsigned valid_pixel_mode:1; + + unsigned jumptable_sel:3; + unsigned cf_const:5; + unsigned cond:2; + unsigned end_of_program:1; + + unsigned array_base:13; + unsigned elem_size:2; + unsigned index_gpr:7; + unsigned rw_gpr:7; + unsigned rw_rel:1; + unsigned type:2; + + unsigned burst_count:4; + unsigned mark:1; + unsigned sel[4]; + + unsigned array_size:12; + unsigned comp_mask:4; + + unsigned rat_id:4; + unsigned rat_inst:6; + unsigned rat_index_mode:2; + + void set_op(unsigned op) { this->op = op; op_ptr = r600_isa_cf(op); } + + bool is_alu_extended() { + assert(op_ptr->flags & CF_ALU); + return kc[2].mode != KC_LOCK_NONE || kc[3].mode != KC_LOCK_NONE; + } + +}; + +struct bc_alu_src { + unsigned sel:9; + unsigned chan:2; + unsigned neg:1; + unsigned abs:1; + unsigned rel:1; + literal value; +}; + +struct bc_alu { + const alu_op_info * op_ptr; + unsigned op; + + bc_alu_src src[3]; + + unsigned dst_gpr:7; + unsigned dst_chan:2; + unsigned dst_rel:1; + unsigned clamp:1; + unsigned omod:2; + unsigned bank_swizzle:3; + + unsigned index_mode:3; + unsigned last:1; + 
unsigned pred_sel:2; + + unsigned fog_merge:1; + unsigned write_mask:1; + unsigned update_exec_mask:1; + unsigned update_pred:1; + + unsigned slot:3; + + alu_op_flags slot_flags; + + void set_op(unsigned op) { + this->op = op; + op_ptr = r600_isa_alu(op); + } +}; + +struct bc_fetch { + const fetch_op_info * op_ptr; + unsigned op; + + unsigned bc_frac_mode:1; + unsigned fetch_whole_quad:1; + unsigned resource_id:8; + + unsigned src_gpr:7; + unsigned src_rel:1; + unsigned src_sel[4]; + + unsigned dst_gpr:7; + unsigned dst_rel:1; + unsigned dst_sel[4]; + + unsigned alt_const:1; + + unsigned inst_mod:2; + unsigned resource_index_mode:2; + unsigned sampler_index_mode:2; + + unsigned coord_type[4]; + unsigned lod_bias:7; + + unsigned offset[3]; + + unsigned sampler_id:5; + + + unsigned fetch_type:2; + unsigned mega_fetch_count:6; + unsigned coalesced_read:1; + unsigned structured_read:2; + unsigned lds_req:1; + + unsigned data_format:6; + unsigned format_comp_all:1; + unsigned num_format_all:2; + unsigned semantic_id:8; + unsigned srf_mode_all:1; + unsigned use_const_fields:1; + + unsigned const_buf_no_stride:1; + unsigned endian_swap:2; + unsigned mega_fetch:1; + + void set_op(unsigned op) { this->op = op; op_ptr = r600_isa_fetch(op); } +}; + +class sb_context { + +public: + + r600_isa *isa; + + sb_hw_chip hw_chip; + sb_hw_class hw_class; + + unsigned alu_temp_gprs; + unsigned max_fetch; + bool has_trans; + unsigned vtx_src_num; + unsigned num_slots; + bool uses_mova_gpr; + + unsigned stack_entry_size; + + static unsigned dump_pass; + static unsigned dump_stat; + + static unsigned dry_run; + + static unsigned dskip_start; + static unsigned dskip_end; + static unsigned dskip_mode; + + sb_context() + : isa(0), hw_chip(HW_CHIP_UNKNOWN), hw_class(HW_CLASS_UNKNOWN) {} + + int init(r600_isa *isa, sb_hw_chip chip, sb_hw_class cclass); + + bool is_r600() {return hw_class == HW_CLASS_R600;} + bool is_r700() {return hw_class == HW_CLASS_R700;} + bool is_evergreen() {return 
hw_class == HW_CLASS_EVERGREEN;} + bool is_cayman() {return hw_class == HW_CLASS_CAYMAN;} + bool is_egcm() {return hw_class >= HW_CLASS_EVERGREEN;} + + sb_hw_class_bits hw_class_bit() { + switch (hw_class) { + case HW_CLASS_R600:return HB_R6; + case HW_CLASS_R700:return HB_R7; + case HW_CLASS_EVERGREEN:return HB_EG; + case HW_CLASS_CAYMAN:return HB_CM; + default: assert(!"unknown hw class"); return (sb_hw_class_bits)0; + + } + } + + unsigned cf_opcode(unsigned op) { + return r600_isa_cf_opcode(isa->hw_class, op); + } + + unsigned alu_opcode(unsigned op) { + return r600_isa_alu_opcode(isa->hw_class, op); + } + + unsigned alu_slots(unsigned op) { + return r600_isa_alu_slots(isa->hw_class, op); + } + + unsigned alu_slots(const alu_op_info * op_ptr) { + return op_ptr->slots[isa->hw_class]; + } + + /* Returns a bitmask of the ALU slots usable by the given op: + * bits 0-3 (SLOT_X..SLOT_W) are set when the op's slot flags contain + * AF_V, bit 4 (SLOT_TRANS) is set when they contain AF_S and the + * hw class is not Cayman. Hex constants are used because the 0b + * binary-literal prefix is a compiler extension before C++14. */ + unsigned alu_slots_mask(const alu_op_info * op_ptr) { + unsigned mask = 0; + unsigned slot_flags = alu_slots(op_ptr); + if (slot_flags & AF_V) + mask = 0x0F; + if (!is_cayman() && (slot_flags & AF_S)) + mask |= 0x10; + return mask; + } + + unsigned fetch_opcode(unsigned op) { + return r600_isa_fetch_opcode(isa->hw_class, op); + } + + bool is_kcache_sel(unsigned sel) { + return ((sel >= 128 && sel < 192) || (sel >= 256 && sel < 320)); + } + +}; + +#define SB_DUMP_STAT(a) do { if (sb_context::dump_stat) { a } } while (0) +#define SB_DUMP_PASS(a) do { if (sb_context::dump_pass) { a } } while (0) + +class bc_decoder { + + sb_context &ctx; + + uint32_t* dw; + unsigned ndw; + +public: + + bc_decoder(sb_context &sctx, uint32_t *data, unsigned size) + : ctx(sctx), dw(data), ndw(size) {} + + int decode_cf(unsigned &i, bc_cf &bc); + int decode_alu(unsigned &i, bc_alu &bc); + int decode_fetch(unsigned &i, bc_fetch &bc); + +private: + int decode_cf_alu(unsigned &i, bc_cf &bc); + int decode_cf_exp(unsigned &i, bc_cf &bc); + int decode_cf_mem(unsigned &i, bc_cf &bc); + + int decode_fetch_vtx(unsigned &i, bc_fetch &bc); +}; + +// bytecode format definition + +class hw_encoding_format 
{ + const sb_hw_class_bits hw_target; //FIXME: debug - remove after testing + hw_encoding_format(); +protected: + uint32_t value; +public: + hw_encoding_format(sb_hw_class_bits hw) + : hw_target(hw), value(0) {} + hw_encoding_format(uint32_t v, sb_hw_class_bits hw) + : hw_target(hw), value(v) {} + uint32_t get_value(sb_hw_class_bits hw) const { + assert((hw & hw_target) == hw); + return value; + } +}; + +#define BC_FORMAT_BEGIN_HW(fmt, hwset) \ +class fmt##_##hwset : public hw_encoding_format {\ + typedef fmt##_##hwset thistype; \ +public: \ + fmt##_##hwset() : hw_encoding_format(HB_##hwset) {}; \ + fmt##_##hwset(uint32_t v) : hw_encoding_format(v, HB_##hwset) {}; + +#define BC_FORMAT_BEGIN(fmt) BC_FORMAT_BEGIN_HW(fmt, ALL) + +#define BC_FORMAT_END(fmt) }; + +// bytecode format field definition + +#define BC_FIELD(fmt, name, shortname, last_bit, first_bit) \ + thistype & name(unsigned v) { \ + value |= ((v&((1ull<<((last_bit)-(first_bit)+1))-1))<<(first_bit)); \ + return *this; \ + } \ + unsigned get_##name() const { \ + return (value>>(first_bit))&((1ull<<((last_bit)-(first_bit)+1))-1); \ + } \ + +#define BC_RSRVD(fmt, last_bit, first_bit) + +// CLAMP macro defined elsewhere interferes with bytecode field name +#undef CLAMP + +#include "sb_bc_fmt_def.inc" + +#undef BC_FORMAT_BEGIN +#undef BC_FORMAT_END +#undef BC_FIELD +#undef BC_RSRVD + +class bc_parser { + sb_context & ctx; + + bc_decoder *dec; + + r600_bytecode *bc; + r600_shader *pshader; + + uint32_t *dw; + unsigned bc_ndw; + + unsigned max_cf; + + shader *sh; + + int error; + + alu_node *slots[2][5]; + unsigned cgroup; + + typedef std::vector<cf_node*> id_cf_map; + id_cf_map cf_map; + + typedef std::stack<region_node*> region_stack; + region_stack loop_stack; + + int enable_dump; + int optimize; + +public: + + bc_parser(sb_context &sctx, r600_bytecode *bc, r600_shader* pshader, + int dump_source, int optimize) : + ctx(sctx), dec(), bc(bc), pshader(pshader), + dw(), bc_ndw(), max_cf(), + sh(), error(), 
slots(), cgroup(), + cf_map(), loop_stack(), enable_dump(dump_source), + optimize(optimize) { } + + int parse(); + + shader* get_shader() { assert(!error); return sh; } + +private: + + int parse_shader(); + + int parse_decls(); + + int parse_cf(unsigned &i, bool &eop); + + int parse_alu_clause(cf_node *cf); + int parse_alu_group(cf_node* cf, unsigned &i, unsigned &gcnt); + + int parse_fetch_clause(cf_node *cf); + + int prepare_ir(); + int prepare_loop(cf_node *c); + int prepare_if(cf_node *c); + int prepare_alu_clause(cf_node *c); + +}; + + + + +class bytecode { + typedef std::vector<uint32_t> bc_vector; + sb_hw_class_bits hw_class_bit; + + bc_vector bc; + + unsigned pos; + +public: + + bytecode(sb_hw_class_bits hw, unsigned rdw = 256) + : hw_class_bit(hw), pos(0) { bc.reserve(rdw); } + + unsigned ndw() { return bc.size(); } + + void write_data(uint32_t* dst) { + memcpy(dst, bc.data(), 4 * bc.size()); + } + + void align(unsigned a) { + unsigned size = bc.size(); + size = (size + a - 1) & ~(a-1); + bc.resize(size); + } + + void set_size(unsigned sz) { + assert(sz >= bc.size()); + bc.resize(sz); + } + + void seek(unsigned p) { + if (p != pos) { + if (p > bc.size()) { + bc.resize(p); + } + pos = p; + } + } + + unsigned get_pos() { return pos; } + uint32_t *data() { return bc.data(); } + + bytecode & operator <<(uint32_t v) { + if (pos == ndw()) { + bc.push_back(v); + } else + bc.at(pos) = v; + ++pos; + return *this; + } + + bytecode & operator <<(const hw_encoding_format &e) { + *this << e.get_value(hw_class_bit); + return *this; + } + + bytecode & operator <<(const bytecode &b) { + bc.insert(bc.end(), b.bc.begin(), b.bc.end()); + return *this; + } + + uint32_t at(unsigned dw_id) { return bc.at(dw_id); } +}; + + +class bc_builder { + shader &sh; + sb_context &ctx; + bytecode bb; + int error; + +public: + + bc_builder(shader &s); + int build(); + bytecode& get_bytecode() { assert(!error); return bb; } + +private: + + int build_cf(cf_node *n); + + int 
build_cf_alu(cf_node *n); + int build_cf_mem(cf_node *n); + int build_cf_exp(cf_node *n); + + int build_alu_clause(cf_node *n); + int build_alu_group(alu_group_node *n); + int build_alu(alu_node *n); + + int build_fetch_clause(cf_node *n); + int build_fetch_tex(fetch_node *n); + int build_fetch_vtx(fetch_node *n); +}; + +} // namespace r600_sb + +#endif /* SB_BC_H_ */ diff --git a/src/gallium/drivers/r600/sb/sb_bc_builder.cpp b/src/gallium/drivers/r600/sb/sb_bc_builder.cpp new file mode 100644 index 00000000000..b0c2e41c33f --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_bc_builder.cpp @@ -0,0 +1,638 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Vadim Girlin + */ + +#include <iostream> +#include <sstream> + +#include "sb_bc.h" +#include "sb_shader.h" +#include "sb_pass.h" + +namespace r600_sb { + +using std::cerr; + +bc_builder::bc_builder(shader &s) + : sh(s), ctx(s.get_ctx()), bb(ctx.hw_class_bit()), error(0) {} + +int bc_builder::build() { + + container_node *root = sh.root; + int cf_cnt = 0; + + // FIXME reserve total size to avoid reallocs + + for (node_iterator it = root->begin(), end = root->end(); + it != end; ++it) { + + cf_node *cf = static_cast<cf_node*>(*it); + assert(cf->is_cf_inst() || cf->is_alu_clause() || cf->is_fetch_clause()); + + cf_op_flags flags = (cf_op_flags)cf->bc.op_ptr->flags; + + cf->bc.id = cf_cnt++; + + if (flags & CF_ALU) { + if (cf->bc.is_alu_extended()) + cf_cnt++; + } + } + + bb.set_size(cf_cnt << 1); + bb.seek(cf_cnt << 1); + + unsigned cf_pos = 0; + + for (node_iterator I = root->begin(), end = root->end(); + I != end; ++I) { + + cf_node *cf = static_cast<cf_node*>(*I); + cf_op_flags flags = (cf_op_flags)cf->bc.op_ptr->flags; + + if (flags & CF_ALU) { + bb.seek(bb.ndw()); + cf->bc.addr = bb.ndw() >> 1; + build_alu_clause(cf); + cf->bc.count = (bb.ndw() >> 1) - cf->bc.addr - 1; + } else if (flags & CF_FETCH) { + bb.align(4); + bb.seek(bb.ndw()); + cf->bc.addr = bb.ndw() >> 1; + build_fetch_clause(cf); + cf->bc.count = (((bb.ndw() >> 1) - cf->bc.addr) >> 1) - 1; + } else if (cf->jump_target) { + cf->bc.addr = cf->jump_target->bc.id; + if (cf->jump_after_target) + cf->bc.addr += 1; + } + + bb.seek(cf_pos); + build_cf(cf); + cf_pos = bb.get_pos(); + } + + if (sh.enable_dump) { + bc_dump(sh, cerr, &bb).run(); + } + + return 0; +} + +int bc_builder::build_alu_clause(cf_node* n) { + for (node_iterator I = n->begin(), E = n->end(); + I != E; ++I) { + + alu_group_node *g = static_cast<alu_group_node*>(*I); + assert(g->is_valid()); + + build_alu_group(g); + } + return 0; +} + +int bc_builder::build_alu_group(alu_group_node* n) { + + for (node_iterator I = 
n->begin(), E = n->end(); + I != E; ++I) { + + alu_node *a = static_cast<alu_node*>(*I); + assert(a->is_valid()); + build_alu(a); + } + + for(int i = 0, ls = n->literals.size(); i < ls; ++i) { + bb << n->literals.at(i).u; + } + + bb.align(2); + bb.seek(bb.ndw()); + + return 0; +} + +int bc_builder::build_fetch_clause(cf_node* n) { + for (node_iterator I = n->begin(), E = n->end(); + I != E; ++I) { + fetch_node *f = static_cast<fetch_node*>(*I); + + if (f->bc.op_ptr->flags & FF_VTX) + build_fetch_vtx(f); + else + build_fetch_tex(f); + } + return 0; +} + + +int bc_builder::build_cf(cf_node* n) { + const bc_cf &bc = n->bc; + const cf_op_info *cfop = bc.op_ptr; + + if (cfop->flags & CF_ALU) + return build_cf_alu(n); + if (cfop->flags & (CF_EXP | CF_MEM)) + return build_cf_exp(n); + + if (ctx.is_egcm()) { + bb << CF_WORD0_EGCM() + .ADDR(bc.addr) + .JUMPTABLE_SEL(bc.jumptable_sel); + + if (ctx.is_evergreen()) + + bb << CF_WORD1_EG() + .BARRIER(bc.barrier) + .CF_CONST(bc.cf_const) + .CF_INST(ctx.cf_opcode(bc.op)) + .COND(bc.cond) + .COUNT(bc.count) + .END_OF_PROGRAM(bc.end_of_program) + .POP_COUNT(bc.pop_count) + .VALID_PIXEL_MODE(bc.valid_pixel_mode) + .WHOLE_QUAD_MODE(bc.whole_quad_mode); + + else //cayman + + bb << CF_WORD1_CM() + .BARRIER(bc.barrier) + .CF_CONST(bc.cf_const) + .CF_INST(ctx.cf_opcode(bc.op)) + .COND(bc.cond) + .COUNT(bc.count) + .POP_COUNT(bc.pop_count) + .VALID_PIXEL_MODE(bc.valid_pixel_mode); + } else { + bb << CF_WORD0_R6R7() + .ADDR(bc.addr); + + assert(bc.count < ctx.max_fetch); + + bb << CF_WORD1_R6R7() + .BARRIER(bc.barrier) + .CALL_COUNT(bc.call_count) + .CF_CONST(bc.cf_const) + .CF_INST(ctx.cf_opcode(bc.op)) + .COND(bc.cond) + .COUNT(bc.count & 7) + .COUNT_3(bc.count >> 3) + .END_OF_PROGRAM(bc.end_of_program) + .POP_COUNT(bc.pop_count) + .VALID_PIXEL_MODE(bc.valid_pixel_mode) + .WHOLE_QUAD_MODE(bc.whole_quad_mode); + } + + return 0; +} + +int bc_builder::build_cf_alu(cf_node* n) { + const bc_cf &bc = n->bc; + + assert(bc.count < 128); + + if 
(n->bc.is_alu_extended()) { + assert(ctx.is_egcm()); + + bb << CF_ALU_WORD0_EXT_EGCM() + .KCACHE_BANK2(bc.kc[2].bank) + .KCACHE_BANK3(bc.kc[3].bank) + .KCACHE_BANK_INDEX_MODE0(bc.kc[0].index_mode) + .KCACHE_BANK_INDEX_MODE1(bc.kc[1].index_mode) + .KCACHE_BANK_INDEX_MODE2(bc.kc[2].index_mode) + .KCACHE_BANK_INDEX_MODE3(bc.kc[3].index_mode) + .KCACHE_MODE2(bc.kc[2].mode); + + bb << CF_ALU_WORD1_EXT_EGCM() + .BARRIER(bc.barrier) + .CF_INST(ctx.cf_opcode(CF_OP_ALU_EXT)) + .KCACHE_ADDR2(bc.kc[2].addr) + .KCACHE_ADDR3(bc.kc[3].addr) + .KCACHE_MODE3(bc.kc[3].mode); + } + + bb << CF_ALU_WORD0_ALL() + .ADDR(bc.addr) + .KCACHE_BANK0(bc.kc[0].bank) + .KCACHE_BANK1(bc.kc[1].bank) + .KCACHE_MODE0(bc.kc[0].mode); + + assert(bc.count < 128); + + if (ctx.is_r600()) + bb << CF_ALU_WORD1_R6() + .BARRIER(bc.barrier) + .CF_INST(ctx.cf_opcode(bc.op)) + .COUNT(bc.count) + .KCACHE_ADDR0(bc.kc[0].addr) + .KCACHE_ADDR1(bc.kc[1].addr) + .KCACHE_MODE1(bc.kc[1].mode) + .USES_WATERFALL(bc.uses_waterfall) + .WHOLE_QUAD_MODE(bc.whole_quad_mode); + else + bb << CF_ALU_WORD1_R7EGCM() + .ALT_CONST(bc.alt_const) + .BARRIER(bc.barrier) + .CF_INST(ctx.cf_opcode(bc.op)) + .COUNT(bc.count) + .KCACHE_ADDR0(bc.kc[0].addr) + .KCACHE_ADDR1(bc.kc[1].addr) + .KCACHE_MODE1(bc.kc[1].mode) + .WHOLE_QUAD_MODE(bc.whole_quad_mode); + + return 0; +} + +int bc_builder::build_cf_exp(cf_node* n) { + const bc_cf &bc = n->bc; + const cf_op_info *cfop = bc.op_ptr; + + if (cfop->flags & CF_RAT) { + assert(ctx.is_egcm()); + + bb << CF_ALLOC_EXPORT_WORD0_RAT_EGCM() + .ELEM_SIZE(bc.elem_size) + .INDEX_GPR(bc.index_gpr) + .RAT_ID(bc.rat_id) + .RAT_INDEX_MODE(bc.rat_index_mode) + .RAT_INST(bc.rat_inst) + .RW_GPR(bc.rw_gpr) + .RW_REL(bc.rw_rel) + .TYPE(bc.type); + } else { + + bb << CF_ALLOC_EXPORT_WORD0_ALL() + .ARRAY_BASE(bc.array_base) + .ELEM_SIZE(bc.elem_size) + .INDEX_GPR(bc.index_gpr) + .RW_GPR(bc.rw_gpr) + .RW_REL(bc.rw_rel) + .TYPE(bc.type); + } + + if (cfop->flags & CF_EXP) { + + if (!ctx.is_egcm()) + bb << 
CF_ALLOC_EXPORT_WORD1_SWIZ_R6R7() + .BARRIER(bc.barrier) + .BURST_COUNT(bc.burst_count) + .CF_INST(ctx.cf_opcode(bc.op)) + .END_OF_PROGRAM(bc.end_of_program) + .SEL_X(bc.sel[0]) + .SEL_Y(bc.sel[1]) + .SEL_Z(bc.sel[2]) + .SEL_W(bc.sel[3]) + .VALID_PIXEL_MODE(bc.valid_pixel_mode) + .WHOLE_QUAD_MODE(bc.whole_quad_mode); + + else if (ctx.is_evergreen()) + bb << CF_ALLOC_EXPORT_WORD1_SWIZ_EG() + .BARRIER(bc.barrier) + .BURST_COUNT(bc.burst_count) + .CF_INST(ctx.cf_opcode(bc.op)) + .END_OF_PROGRAM(bc.end_of_program) + .MARK(bc.mark) + .SEL_X(bc.sel[0]) + .SEL_Y(bc.sel[1]) + .SEL_Z(bc.sel[2]) + .SEL_W(bc.sel[3]) + .VALID_PIXEL_MODE(bc.valid_pixel_mode); + + else // cayman + bb << CF_ALLOC_EXPORT_WORD1_SWIZ_CM() + .BARRIER(bc.barrier) + .BURST_COUNT(bc.burst_count) + .CF_INST(ctx.cf_opcode(bc.op)) + .MARK(bc.mark) + .SEL_X(bc.sel[0]) + .SEL_Y(bc.sel[1]) + .SEL_Z(bc.sel[2]) + .SEL_W(bc.sel[3]) + .VALID_PIXEL_MODE(bc.valid_pixel_mode); + + } else if (cfop->flags & CF_MEM) { + return build_cf_mem(n); + } + + return 0; +} + +int bc_builder::build_cf_mem(cf_node* n) { + const bc_cf &bc = n->bc; + + if (!ctx.is_egcm()) + bb << CF_ALLOC_EXPORT_WORD1_BUF_R6R7() + .ARRAY_SIZE(bc.array_size) + .BARRIER(bc.barrier) + .BURST_COUNT(bc.burst_count) + .CF_INST(ctx.cf_opcode(bc.op)) + .COMP_MASK(bc.comp_mask) + .END_OF_PROGRAM(bc.end_of_program) + .VALID_PIXEL_MODE(bc.valid_pixel_mode) + .WHOLE_QUAD_MODE(bc.whole_quad_mode); + + else if (ctx.is_evergreen()) + bb << CF_ALLOC_EXPORT_WORD1_BUF_EG() + .ARRAY_SIZE(bc.array_size) + .BARRIER(bc.barrier) + .BURST_COUNT(bc.burst_count) + .CF_INST(ctx.cf_opcode(bc.op)) + .COMP_MASK(bc.comp_mask) + .END_OF_PROGRAM(bc.end_of_program) + .MARK(bc.mark) + .VALID_PIXEL_MODE(bc.valid_pixel_mode); + + else // cayman + bb << CF_ALLOC_EXPORT_WORD1_BUF_CM() + .ARRAY_SIZE(bc.array_size) + .BARRIER(bc.barrier) + .BURST_COUNT(bc.burst_count) + .CF_INST(ctx.cf_opcode(bc.op)) + .COMP_MASK(bc.comp_mask) + .MARK(bc.mark) + .VALID_PIXEL_MODE(bc.valid_pixel_mode); + + 
return 0; +} + +int bc_builder::build_alu(alu_node* n) { + const bc_alu &bc = n->bc; + const alu_op_info *aop = bc.op_ptr; + + bb << ALU_WORD0_ALL() + .INDEX_MODE(bc.index_mode) + .LAST(bc.last) + .PRED_SEL(bc.pred_sel) + .SRC0_SEL(bc.src[0].sel) + .SRC0_CHAN(bc.src[0].chan) + .SRC0_NEG(bc.src[0].neg) + .SRC0_REL(bc.src[0].rel) + .SRC1_SEL(bc.src[1].sel) + .SRC1_CHAN(bc.src[1].chan) + .SRC1_NEG(bc.src[1].neg) + .SRC1_REL(bc.src[1].rel); + + if (aop->src_count<3) { + if (ctx.is_r600()) + bb << ALU_WORD1_OP2_R6() + .ALU_INST(ctx.alu_opcode(bc.op)) + .BANK_SWIZZLE(bc.bank_swizzle) + .CLAMP(bc.clamp) + .DST_GPR(bc.dst_gpr) + .DST_CHAN(bc.dst_chan) + .DST_REL(bc.dst_rel) + .FOG_MERGE(bc.fog_merge) + .OMOD(bc.omod) + .SRC0_ABS(bc.src[0].abs) + .SRC1_ABS(bc.src[1].abs) + .UPDATE_EXEC_MASK(bc.update_exec_mask) + .UPDATE_PRED(bc.update_pred) + .WRITE_MASK(bc.write_mask); + else { + + if (ctx.is_cayman() && (aop->flags & AF_MOVA)) { + + bb << ALU_WORD1_OP2_MOVA_CM() + .ALU_INST(ctx.alu_opcode(bc.op)) + .BANK_SWIZZLE(bc.bank_swizzle) + .CLAMP(bc.clamp) + .MOVA_DST(bc.dst_gpr) + .DST_CHAN(bc.dst_chan) + .DST_REL(bc.dst_rel) + .OMOD(bc.omod) + .UPDATE_EXEC_MASK(bc.update_exec_mask) + .UPDATE_PRED(bc.update_pred) + .WRITE_MASK(bc.write_mask) + .SRC0_ABS(bc.src[0].abs) + .SRC1_ABS(bc.src[1].abs); + + } else if (ctx.is_cayman() && (aop->flags & (AF_PRED|AF_KILL))) { + bb << ALU_WORD1_OP2_EXEC_MASK_CM() + .ALU_INST(ctx.alu_opcode(bc.op)) + .BANK_SWIZZLE(bc.bank_swizzle) + .CLAMP(bc.clamp) + .DST_CHAN(bc.dst_chan) + .DST_REL(bc.dst_rel) + .EXECUTE_MASK_OP(bc.omod) + .UPDATE_EXEC_MASK(bc.update_exec_mask) + .UPDATE_PRED(bc.update_pred) + .WRITE_MASK(bc.write_mask) + .SRC0_ABS(bc.src[0].abs) + .SRC1_ABS(bc.src[1].abs); + + } else + bb << ALU_WORD1_OP2_R7EGCM() + .ALU_INST(ctx.alu_opcode(bc.op)) + .BANK_SWIZZLE(bc.bank_swizzle) + .CLAMP(bc.clamp) + .DST_GPR(bc.dst_gpr) + .DST_CHAN(bc.dst_chan) + .DST_REL(bc.dst_rel) + .OMOD(bc.omod) + .UPDATE_EXEC_MASK(bc.update_exec_mask) + 
.UPDATE_PRED(bc.update_pred) + .WRITE_MASK(bc.write_mask) + .SRC0_ABS(bc.src[0].abs) + .SRC1_ABS(bc.src[1].abs); + + } + } else + bb << ALU_WORD1_OP3_ALL() + .ALU_INST(ctx.alu_opcode(bc.op)) + .BANK_SWIZZLE(bc.bank_swizzle) + .CLAMP(bc.clamp) + .DST_GPR(bc.dst_gpr) + .DST_CHAN(bc.dst_chan) + .DST_REL(bc.dst_rel) + .SRC2_SEL(bc.src[2].sel) + .SRC2_CHAN(bc.src[2].chan) + .SRC2_NEG(bc.src[2].neg) + .SRC2_REL(bc.src[2].rel); + return 0; +} + +int bc_builder::build_fetch_tex(fetch_node* n) { + const bc_fetch &bc = n->bc; + const fetch_op_info *fop = bc.op_ptr; + + assert(!(fop->flags & FF_VTX)); + + if (ctx.is_r600()) + bb << TEX_WORD0_R6() + .BC_FRAC_MODE(bc.bc_frac_mode) + .FETCH_WHOLE_QUAD(bc.fetch_whole_quad) + .RESOURCE_ID(bc.resource_id) + .SRC_GPR(bc.src_gpr) + .SRC_REL(bc.src_rel) + .TEX_INST(ctx.fetch_opcode(bc.op)); + + else if (ctx.is_r700()) + bb << TEX_WORD0_R7() + .ALT_CONST(bc.alt_const) + .BC_FRAC_MODE(bc.bc_frac_mode) + .FETCH_WHOLE_QUAD(bc.fetch_whole_quad) + .RESOURCE_ID(bc.resource_id) + .SRC_GPR(bc.src_gpr) + .SRC_REL(bc.src_rel) + .TEX_INST(ctx.fetch_opcode(bc.op)); + + else + bb << TEX_WORD0_EGCM() + .ALT_CONST(bc.alt_const) + .FETCH_WHOLE_QUAD(bc.fetch_whole_quad) + .INST_MOD(bc.inst_mod) + .RESOURCE_ID(bc.resource_id) + .RESOURCE_INDEX_MODE(bc.resource_index_mode) + .SAMPLER_INDEX_MODE(bc.sampler_index_mode) + .SRC_GPR(bc.src_gpr) + .SRC_REL(bc.src_rel) + .TEX_INST(ctx.fetch_opcode(bc.op)); + + bb << TEX_WORD1_ALL() + .COORD_TYPE_X(bc.coord_type[0]) + .COORD_TYPE_Y(bc.coord_type[1]) + .COORD_TYPE_Z(bc.coord_type[2]) + .COORD_TYPE_W(bc.coord_type[3]) + .DST_GPR(bc.dst_gpr) + .DST_REL(bc.dst_rel) + .DST_SEL_X(bc.dst_sel[0]) + .DST_SEL_Y(bc.dst_sel[1]) + .DST_SEL_Z(bc.dst_sel[2]) + .DST_SEL_W(bc.dst_sel[3]) + .LOD_BIAS(bc.lod_bias); + + bb << TEX_WORD2_ALL() + .OFFSET_X(bc.offset[0]) + .OFFSET_Y(bc.offset[1]) + .OFFSET_Z(bc.offset[2]) + .SAMPLER_ID(bc.sampler_id) + .SRC_SEL_X(bc.src_sel[0]) + .SRC_SEL_Y(bc.src_sel[1]) + .SRC_SEL_Z(bc.src_sel[2]) + 
.SRC_SEL_W(bc.src_sel[3]); + + bb << 0; + return 0; +} + +int bc_builder::build_fetch_vtx(fetch_node* n) { + const bc_fetch &bc = n->bc; + const fetch_op_info *fop = bc.op_ptr; + + assert(fop->flags & FF_VTX); + + if (!ctx.is_cayman()) + bb << VTX_WORD0_R6R7EG() + .BUFFER_ID(bc.resource_id) + .FETCH_TYPE(bc.fetch_type) + .FETCH_WHOLE_QUAD(bc.fetch_whole_quad) + .MEGA_FETCH_COUNT(bc.mega_fetch_count) + .SRC_GPR(bc.src_gpr) + .SRC_REL(bc.src_rel) + .SRC_SEL_X(bc.src_sel[0]) + .VC_INST(ctx.fetch_opcode(bc.op)); + + else + bb << VTX_WORD0_CM() + .BUFFER_ID(bc.resource_id) + .COALESCED_READ(bc.coalesced_read) + .FETCH_TYPE(bc.fetch_type) + .FETCH_WHOLE_QUAD(bc.fetch_whole_quad) + .LDS_REQ(bc.lds_req) + .SRC_GPR(bc.src_gpr) + .SRC_REL(bc.src_rel) + .SRC_SEL_X(bc.src_sel[0]) + .SRC_SEL_Y(bc.src_sel[1]) + .STRUCTURED_READ(bc.structured_read) + .VC_INST(ctx.fetch_opcode(bc.op)); + + if (bc.op == FETCH_OP_SEMFETCH) + bb << VTX_WORD1_SEM_ALL() + .DATA_FORMAT(bc.data_format) + .DST_SEL_X(bc.dst_sel[0]) + .DST_SEL_Y(bc.dst_sel[1]) + .DST_SEL_Z(bc.dst_sel[2]) + .DST_SEL_W(bc.dst_sel[3]) + .FORMAT_COMP_ALL(bc.format_comp_all) + .NUM_FORMAT_ALL(bc.num_format_all) + .SEMANTIC_ID(bc.semantic_id) + .SRF_MODE_ALL(bc.srf_mode_all) + .USE_CONST_FIELDS(bc.use_const_fields); + else + bb << VTX_WORD1_GPR_ALL() + .DATA_FORMAT(bc.data_format) + .DST_GPR(bc.dst_gpr) + .DST_REL(bc.dst_rel) + .DST_SEL_X(bc.dst_sel[0]) + .DST_SEL_Y(bc.dst_sel[1]) + .DST_SEL_Z(bc.dst_sel[2]) + .DST_SEL_W(bc.dst_sel[3]) + .FORMAT_COMP_ALL(bc.format_comp_all) + .NUM_FORMAT_ALL(bc.num_format_all) + .SRF_MODE_ALL(bc.srf_mode_all) + .USE_CONST_FIELDS(bc.use_const_fields); + + switch (ctx.hw_class) { + case HW_CLASS_R600: + bb << VTX_WORD2_R6() + .CONST_BUF_NO_STRIDE(bc.const_buf_no_stride) + .ENDIAN_SWAP(bc.endian_swap) + .MEGA_FETCH(bc.mega_fetch) + .OFFSET(bc.offset[0]); + break; + case HW_CLASS_R700: + bb << VTX_WORD2_R7() + .ALT_CONST(bc.alt_const) + .CONST_BUF_NO_STRIDE(bc.const_buf_no_stride) + 
.ENDIAN_SWAP(bc.endian_swap) + .MEGA_FETCH(bc.mega_fetch) + .OFFSET(bc.offset[0]); + break; + case HW_CLASS_EVERGREEN: + bb << VTX_WORD2_EG() + .ALT_CONST(bc.alt_const) + .BUFFER_INDEX_MODE(bc.resource_index_mode) + .CONST_BUF_NO_STRIDE(bc.const_buf_no_stride) + .ENDIAN_SWAP(bc.endian_swap) + .MEGA_FETCH(bc.mega_fetch) + .OFFSET(bc.offset[0]); + break; + case HW_CLASS_CAYMAN: + bb << VTX_WORD2_CM() + .ALT_CONST(bc.alt_const) + .BUFFER_INDEX_MODE(bc.resource_index_mode) + .CONST_BUF_NO_STRIDE(bc.const_buf_no_stride) + .ENDIAN_SWAP(bc.endian_swap) + .OFFSET(bc.offset[0]); + break; + default: + assert(!"unknown hw class"); + return -1; + } + + bb << 0; + return 0; +} + +} diff --git a/src/gallium/drivers/r600/sb/sb_bc_decoder.cpp b/src/gallium/drivers/r600/sb/sb_bc_decoder.cpp new file mode 100644 index 00000000000..273969f473b --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_bc_decoder.cpp @@ -0,0 +1,553 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Vadim Girlin + */ + +#include "sstream" + +#include "sb_bc.h" + +namespace r600_sb { + +int bc_decoder::decode_cf(unsigned &i, bc_cf& bc) { + int r = 0; + uint32_t dw0 = dw[i]; + uint32_t dw1 = dw[i+1]; + + if ((dw1 >> 29) & 1) { // CF_ALU + return decode_cf_alu(i, bc); + } else { + // CF_INST field encoding on cayman is the same as on evergreen + unsigned opcode = ctx.is_egcm() ? + CF_WORD1_EG(dw1).get_CF_INST() : + CF_WORD1_R6R7(dw1).get_CF_INST(); + + bc.set_op(r600_isa_cf_by_opcode(ctx.isa, opcode, 0)); + + if (bc.op_ptr->flags & CF_EXP) { + return decode_cf_exp(i, bc); + } else if (bc.op_ptr->flags & CF_MEM) { + return decode_cf_mem(i, bc); + } + + if (ctx.is_egcm()) { + CF_WORD0_EGCM w0(dw0); + bc.addr = w0.get_ADDR(); + bc.jumptable_sel = w0.get_JUMPTABLE_SEL(); + + if (ctx.is_evergreen()) { + CF_WORD1_EG w1(dw1); + + bc.barrier = w1.get_BARRIER(); + bc.cf_const = w1.get_CF_CONST(); + bc.cond = w1.get_COND(); + bc.count = w1.get_COUNT(); + bc.end_of_program = w1.get_END_OF_PROGRAM(); + bc.pop_count = w1.get_POP_COUNT(); + bc.valid_pixel_mode = w1.get_VALID_PIXEL_MODE(); + bc.whole_quad_mode = w1.get_WHOLE_QUAD_MODE(); + + } else { // cayman + CF_WORD1_CM w1(dw1); + + bc.barrier = w1.get_BARRIER(); + bc.cf_const = w1.get_CF_CONST(); + bc.cond = w1.get_COND(); + bc.count = w1.get_COUNT(); + bc.pop_count = w1.get_POP_COUNT(); + bc.valid_pixel_mode = w1.get_VALID_PIXEL_MODE(); + } + + + } else { + CF_WORD0_R6R7 w0(dw0); + bc.addr = w0.get_ADDR(); + + CF_WORD1_R6R7 w1(dw1); + bc.barrier = w1.get_BARRIER(); + bc.cf_const = w1.get_CF_CONST(); + bc.cond = w1.get_COND(); + + if (ctx.is_r600()) + bc.count = w1.get_COUNT(); + else + bc.count = w1.get_COUNT() + 
(w1.get_COUNT_3() << 3); + + bc.end_of_program = w1.get_END_OF_PROGRAM(); + bc.pop_count = w1.get_POP_COUNT(); + bc.valid_pixel_mode = w1.get_VALID_PIXEL_MODE(); + bc.whole_quad_mode = w1.get_WHOLE_QUAD_MODE(); + bc.call_count = w1.get_CALL_COUNT(); + } + } + + i += 2; + + return r; +} + +int bc_decoder::decode_cf_alu(unsigned & i, bc_cf& bc) { + int r = 0; + uint32_t dw0 = dw[i++]; + uint32_t dw1 = dw[i++]; + + assert(i <= ndw); + + CF_ALU_WORD0_ALL w0(dw0); + + bc.kc[0].bank = w0.get_KCACHE_BANK0(); + bc.kc[1].bank = w0.get_KCACHE_BANK1(); + bc.kc[0].mode = w0.get_KCACHE_MODE0(); + + bc.addr = w0.get_ADDR(); + + if (ctx.is_r600()) { + CF_ALU_WORD1_R6 w1(dw1); + + bc.set_op(r600_isa_cf_by_opcode(ctx.isa, w1.get_CF_INST(), 1)); + + bc.kc[0].addr = w1.get_KCACHE_ADDR0(); + bc.kc[1].mode = w1.get_KCACHE_MODE1(); + bc.kc[1].addr = w1.get_KCACHE_ADDR1(); + + bc.barrier = w1.get_BARRIER(); + bc.count = w1.get_COUNT(); + bc.whole_quad_mode = w1.get_WHOLE_QUAD_MODE(); + + bc.uses_waterfall = w1.get_USES_WATERFALL(); + } else { + CF_ALU_WORD1_R7EGCM w1(dw1); + + bc.set_op(r600_isa_cf_by_opcode(ctx.isa, w1.get_CF_INST(), 1)); + + if (bc.op == CF_OP_ALU_EXT) { + CF_ALU_WORD0_EXT_EGCM w0(dw0); + CF_ALU_WORD1_EXT_EGCM w1(dw1); + + bc.kc[0].index_mode = w0.get_KCACHE_BANK_INDEX_MODE0(); + bc.kc[1].index_mode = w0.get_KCACHE_BANK_INDEX_MODE1(); + bc.kc[2].index_mode = w0.get_KCACHE_BANK_INDEX_MODE2(); + bc.kc[3].index_mode = w0.get_KCACHE_BANK_INDEX_MODE3(); + bc.kc[2].bank = w0.get_KCACHE_BANK2(); + bc.kc[3].bank = w0.get_KCACHE_BANK3(); + bc.kc[2].mode = w0.get_KCACHE_MODE2(); + bc.kc[3].mode = w1.get_KCACHE_MODE3(); + bc.kc[2].addr = w1.get_KCACHE_ADDR2(); + bc.kc[3].addr = w1.get_KCACHE_ADDR3(); + + r = decode_cf_alu(i, bc); + + } else { + + bc.kc[0].addr = w1.get_KCACHE_ADDR0(); + bc.kc[1].mode = w1.get_KCACHE_MODE1(); + bc.kc[1].addr = w1.get_KCACHE_ADDR1(); + bc.barrier = w1.get_BARRIER(); + bc.count = w1.get_COUNT(); + bc.whole_quad_mode = w1.get_WHOLE_QUAD_MODE(); + + 
bc.alt_const = w1.get_ALT_CONST(); + } + } + return r; +} + +int bc_decoder::decode_cf_exp(unsigned & i, bc_cf& bc) { + int r = 0; + uint32_t dw0 = dw[i++]; + uint32_t dw1 = dw[i++]; + assert(i <= ndw); + + CF_ALLOC_EXPORT_WORD0_ALL w0(dw0); + bc.array_base = w0.get_ARRAY_BASE(); + bc.elem_size = w0.get_ELEM_SIZE(); + bc.index_gpr = w0.get_INDEX_GPR(); + bc.rw_gpr = w0.get_RW_GPR(); + bc.rw_rel = w0.get_RW_REL(); + bc.type = w0.get_TYPE(); + + if (ctx.is_evergreen()) { + CF_ALLOC_EXPORT_WORD1_SWIZ_EG w1(dw1); + bc.barrier = w1.get_BARRIER(); + bc.burst_count = w1.get_BURST_COUNT(); + bc.end_of_program = w1.get_END_OF_PROGRAM(); + bc.sel[0] = w1.get_SEL_X(); + bc.sel[1] = w1.get_SEL_Y(); + bc.sel[2] = w1.get_SEL_Z(); + bc.sel[3] = w1.get_SEL_W(); + bc.valid_pixel_mode = w1.get_VALID_PIXEL_MODE(); + bc.mark = w1.get_MARK(); + + } else if (ctx.is_cayman()) { + CF_ALLOC_EXPORT_WORD1_SWIZ_CM w1(dw1); + bc.barrier = w1.get_BARRIER(); + bc.burst_count = w1.get_BURST_COUNT(); + bc.mark = w1.get_MARK(); + bc.sel[0] = w1.get_SEL_X(); + bc.sel[1] = w1.get_SEL_Y(); + bc.sel[2] = w1.get_SEL_Z(); + bc.sel[3] = w1.get_SEL_W(); + bc.valid_pixel_mode = w1.get_VALID_PIXEL_MODE(); + + } else { // r67 + CF_ALLOC_EXPORT_WORD1_SWIZ_R6R7 w1(dw1); + bc.barrier = w1.get_BARRIER(); + bc.burst_count = w1.get_BURST_COUNT(); + bc.end_of_program = w1.get_END_OF_PROGRAM(); + bc.sel[0] = w1.get_SEL_X(); + bc.sel[1] = w1.get_SEL_Y(); + bc.sel[2] = w1.get_SEL_Z(); + bc.sel[3] = w1.get_SEL_W(); + bc.valid_pixel_mode = w1.get_VALID_PIXEL_MODE(); + bc.whole_quad_mode = w1.get_WHOLE_QUAD_MODE(); + } + + return r; +} + + +int bc_decoder::decode_cf_mem(unsigned & i, bc_cf& bc) { + int r = 0; + uint32_t dw0 = dw[i++]; + uint32_t dw1 = dw[i++]; + assert(i <= ndw); + + if (!(bc.op_ptr->flags & CF_RAT)) { + CF_ALLOC_EXPORT_WORD0_ALL w0(dw0); + bc.array_base = w0.get_ARRAY_BASE(); + bc.elem_size = w0.get_ELEM_SIZE(); + bc.index_gpr = w0.get_INDEX_GPR(); + bc.rw_gpr = w0.get_RW_GPR(); + bc.rw_rel = 
w0.get_RW_REL(); + bc.type = w0.get_TYPE(); + } else { + assert(ctx.is_egcm()); + CF_ALLOC_EXPORT_WORD0_RAT_EGCM w0(dw0); + bc.elem_size = w0.get_ELEM_SIZE(); + bc.index_gpr = w0.get_INDEX_GPR(); + bc.rw_gpr = w0.get_RW_GPR(); + bc.rw_rel = w0.get_RW_REL(); + bc.type = w0.get_TYPE(); + bc.rat_id = w0.get_RAT_ID(); + bc.rat_inst = w0.get_RAT_INST(); + bc.rat_index_mode = w0.get_RAT_INDEX_MODE(); + } + + if (ctx.is_evergreen()) { + CF_ALLOC_EXPORT_WORD1_BUF_EG w1(dw1); + bc.barrier = w1.get_BARRIER(); + bc.burst_count = w1.get_BURST_COUNT(); + bc.end_of_program = w1.get_END_OF_PROGRAM(); + bc.valid_pixel_mode = w1.get_VALID_PIXEL_MODE(); + bc.mark = w1.get_MARK(); + bc.array_size = w1.get_ARRAY_SIZE(); + bc.comp_mask = w1.get_COMP_MASK(); + + } else if (ctx.is_cayman()) { + CF_ALLOC_EXPORT_WORD1_BUF_CM w1(dw1); + bc.barrier = w1.get_BARRIER(); + bc.burst_count = w1.get_BURST_COUNT(); + bc.mark = w1.get_MARK(); + bc.valid_pixel_mode = w1.get_VALID_PIXEL_MODE(); + bc.array_size = w1.get_ARRAY_SIZE(); + bc.comp_mask = w1.get_COMP_MASK(); + + } else { /* r6xx/r7xx: the only BUF word1 layout read here that has WHOLE_QUAD_MODE */ + CF_ALLOC_EXPORT_WORD1_BUF_R6R7 w1(dw1); + bc.barrier = w1.get_BARRIER(); + bc.burst_count = w1.get_BURST_COUNT(); + bc.end_of_program = w1.get_END_OF_PROGRAM(); + bc.valid_pixel_mode = w1.get_VALID_PIXEL_MODE(); + bc.whole_quad_mode = w1.get_WHOLE_QUAD_MODE(); + bc.array_size = w1.get_ARRAY_SIZE(); + bc.comp_mask = w1.get_COMP_MASK(); + } + + return r; +} + +int bc_decoder::decode_alu(unsigned & i, bc_alu& bc) { + int r = 0; + uint32_t dw0 = dw[i++]; + uint32_t dw1 = dw[i++]; + assert(i <= ndw); + + ALU_WORD0_ALL w0(dw0); + bc.index_mode = w0.get_INDEX_MODE(); + bc.last = w0.get_LAST(); + bc.pred_sel = w0.get_PRED_SEL(); + bc.src[0].chan = w0.get_SRC0_CHAN(); + bc.src[0].sel = w0.get_SRC0_SEL(); + bc.src[0].neg = w0.get_SRC0_NEG(); + bc.src[0].rel = w0.get_SRC0_REL(); + bc.src[1].chan = w0.get_SRC1_CHAN(); + bc.src[1].sel = w0.get_SRC1_SEL(); + bc.src[1].neg = 
w0.get_SRC1_NEG(); + bc.src[1].rel = w0.get_SRC1_REL(); + + if ((dw1 >> 15) & 7) { // op3 + ALU_WORD1_OP3_ALL w1(dw1); + bc.set_op(r600_isa_alu_by_opcode(ctx.isa, w1.get_ALU_INST(), 1)); + + bc.bank_swizzle = w1.get_BANK_SWIZZLE(); + bc.clamp = w1.get_CLAMP(); + bc.dst_chan = w1.get_DST_CHAN(); + bc.dst_gpr = w1.get_DST_GPR(); + bc.dst_rel = w1.get_DST_REL(); + + bc.src[2].chan = w1.get_SRC2_CHAN(); + bc.src[2].sel = w1.get_SRC2_SEL(); + bc.src[2].neg = w1.get_SRC2_NEG(); + bc.src[2].rel = w1.get_SRC2_REL(); + + } else { // op2 + if (ctx.is_r600()) { + ALU_WORD1_OP2_R6 w1(dw1); + bc.set_op(r600_isa_alu_by_opcode(ctx.isa, w1.get_ALU_INST(), 0)); + + bc.bank_swizzle = w1.get_BANK_SWIZZLE(); + bc.clamp = w1.get_CLAMP(); + bc.dst_chan = w1.get_DST_CHAN(); + bc.dst_gpr = w1.get_DST_GPR(); + bc.dst_rel = w1.get_DST_REL(); + + bc.omod = w1.get_OMOD(); + bc.src[0].abs = w1.get_SRC0_ABS(); + bc.src[1].abs = w1.get_SRC1_ABS(); + bc.write_mask = w1.get_WRITE_MASK(); + bc.update_exec_mask = w1.get_UPDATE_EXEC_MASK(); + bc.update_pred = w1.get_UPDATE_PRED(); + + bc.fog_merge = w1.get_FOG_MERGE(); + + } else { + ALU_WORD1_OP2_R7EGCM w1(dw1); + bc.set_op(r600_isa_alu_by_opcode(ctx.isa, w1.get_ALU_INST(), 0)); + + bc.bank_swizzle = w1.get_BANK_SWIZZLE(); + bc.clamp = w1.get_CLAMP(); + bc.dst_chan = w1.get_DST_CHAN(); + bc.dst_gpr = w1.get_DST_GPR(); + bc.dst_rel = w1.get_DST_REL(); + + bc.omod = w1.get_OMOD(); + bc.src[0].abs = w1.get_SRC0_ABS(); + bc.src[1].abs = w1.get_SRC1_ABS(); + bc.write_mask = w1.get_WRITE_MASK(); + bc.update_exec_mask = w1.get_UPDATE_EXEC_MASK(); + bc.update_pred = w1.get_UPDATE_PRED(); + } + } + + bc.slot_flags = (alu_op_flags)bc.op_ptr->slots[ctx.isa->hw_class]; + return r; +} + +int bc_decoder::decode_fetch(unsigned & i, bc_fetch& bc) { + int r = 0; + uint32_t dw0 = dw[i]; + uint32_t dw1 = dw[i+1]; + uint32_t dw2 = dw[i+2]; + assert(i + 4 <= ndw); + + unsigned fetch_opcode = dw0 & 0x1F; + + bc.set_op(r600_isa_fetch_by_opcode(ctx.isa, fetch_opcode)); + + 
if (bc.op_ptr->flags & FF_VTX) + return decode_fetch_vtx(i, bc); + + // tex + + if (ctx.is_r600()) { + TEX_WORD0_R6 w0(dw0); + + bc.bc_frac_mode = w0.get_BC_FRAC_MODE(); + bc.fetch_whole_quad = w0.get_FETCH_WHOLE_QUAD(); + bc.resource_id = w0.get_RESOURCE_ID(); + bc.src_gpr = w0.get_SRC_GPR(); + bc.src_rel = w0.get_SRC_REL(); + + } else if (ctx.is_r700()) { /* was a second is_r600() test, which left this R7 branch unreachable; it additionally reads ALT_CONST */ + TEX_WORD0_R7 w0(dw0); + + bc.bc_frac_mode = w0.get_BC_FRAC_MODE(); + bc.fetch_whole_quad = w0.get_FETCH_WHOLE_QUAD(); + bc.resource_id = w0.get_RESOURCE_ID(); + bc.src_gpr = w0.get_SRC_GPR(); + bc.src_rel = w0.get_SRC_REL(); + bc.alt_const = w0.get_ALT_CONST(); + + } else { // eg/cm + TEX_WORD0_EGCM w0(dw0); + + bc.fetch_whole_quad = w0.get_FETCH_WHOLE_QUAD(); + bc.resource_id = w0.get_RESOURCE_ID(); + bc.src_gpr = w0.get_SRC_GPR(); + bc.src_rel = w0.get_SRC_REL(); + bc.alt_const = w0.get_ALT_CONST(); + bc.inst_mod = w0.get_INST_MOD(); + bc.resource_index_mode = w0.get_RESOURCE_INDEX_MODE(); + bc.sampler_index_mode = w0.get_SAMPLER_INDEX_MODE(); + } + + TEX_WORD1_ALL w1(dw1); + bc.coord_type[0] = w1.get_COORD_TYPE_X(); + bc.coord_type[1] = w1.get_COORD_TYPE_Y(); + bc.coord_type[2] = w1.get_COORD_TYPE_Z(); + bc.coord_type[3] = w1.get_COORD_TYPE_W(); + bc.dst_gpr = w1.get_DST_GPR(); + bc.dst_rel = w1.get_DST_REL(); + bc.dst_sel[0] = w1.get_DST_SEL_X(); + bc.dst_sel[1] = w1.get_DST_SEL_Y(); + bc.dst_sel[2] = w1.get_DST_SEL_Z(); + bc.dst_sel[3] = w1.get_DST_SEL_W(); + bc.lod_bias = w1.get_LOD_BIAS(); + + TEX_WORD2_ALL w2(dw2); + bc.offset[0] = w2.get_OFFSET_X(); + bc.offset[1] = w2.get_OFFSET_Y(); + bc.offset[2] = w2.get_OFFSET_Z(); + bc.sampler_id = w2.get_SAMPLER_ID(); + bc.src_sel[0] = w2.get_SRC_SEL_X(); + bc.src_sel[1] = w2.get_SRC_SEL_Y(); + bc.src_sel[2] = w2.get_SRC_SEL_Z(); + bc.src_sel[3] = w2.get_SRC_SEL_W(); + + i += 4; + return r; +} + +int bc_decoder::decode_fetch_vtx(unsigned & i, bc_fetch& bc) { + int r = 0; + uint32_t dw0 = dw[i]; + uint32_t dw1 = dw[i+1]; + uint32_t dw2 = dw[i+2]; + i+= 4; + 
assert(i <= ndw); + + if (ctx.is_cayman()) { + VTX_WORD0_CM w0(dw0); + bc.resource_id = w0.get_BUFFER_ID(); + bc.fetch_type = w0.get_FETCH_TYPE(); + bc.fetch_whole_quad = w0.get_FETCH_WHOLE_QUAD(); + bc.src_gpr = w0.get_SRC_GPR(); + bc.src_rel = w0.get_SRC_REL(); + bc.src_sel[0] = w0.get_SRC_SEL_X(); + bc.coalesced_read = w0.get_COALESCED_READ(); + bc.lds_req = w0.get_LDS_REQ(); + bc.structured_read = w0.get_STRUCTURED_READ(); + + } else { + VTX_WORD0_R6R7EG w0(dw0); + bc.resource_id = w0.get_BUFFER_ID(); + bc.fetch_type = w0.get_FETCH_TYPE(); + bc.fetch_whole_quad = w0.get_FETCH_WHOLE_QUAD(); + bc.mega_fetch_count = w0.get_MEGA_FETCH_COUNT(); + bc.src_gpr = w0.get_SRC_GPR(); + bc.src_rel = w0.get_SRC_REL(); + bc.src_sel[0] = w0.get_SRC_SEL_X(); + } + + if (bc.op == FETCH_OP_SEMFETCH) { + VTX_WORD1_SEM_ALL w1(dw1); + bc.data_format = w1.get_DATA_FORMAT(); + bc.dst_sel[0] = w1.get_DST_SEL_X(); + bc.dst_sel[1] = w1.get_DST_SEL_Y(); + bc.dst_sel[2] = w1.get_DST_SEL_Z(); + bc.dst_sel[3] = w1.get_DST_SEL_W(); + bc.format_comp_all = w1.get_FORMAT_COMP_ALL(); + bc.num_format_all = w1.get_NUM_FORMAT_ALL(); + bc.srf_mode_all = w1.get_SRF_MODE_ALL(); + bc.use_const_fields = w1.get_USE_CONST_FIELDS(); + + bc.semantic_id = w1.get_SEMANTIC_ID(); + + } else { + VTX_WORD1_GPR_ALL w1(dw1); + bc.data_format = w1.get_DATA_FORMAT(); + bc.dst_sel[0] = w1.get_DST_SEL_X(); + bc.dst_sel[1] = w1.get_DST_SEL_Y(); + bc.dst_sel[2] = w1.get_DST_SEL_Z(); + bc.dst_sel[3] = w1.get_DST_SEL_W(); + bc.format_comp_all = w1.get_FORMAT_COMP_ALL(); + bc.num_format_all = w1.get_NUM_FORMAT_ALL(); + bc.srf_mode_all = w1.get_SRF_MODE_ALL(); + bc.use_const_fields = w1.get_USE_CONST_FIELDS(); + + bc.dst_gpr = w1.get_DST_GPR(); + bc.dst_rel = w1.get_DST_REL(); + } + + switch (ctx.hw_class) { + case HW_CLASS_R600: + { + VTX_WORD2_R6 w2(dw2); + bc.const_buf_no_stride = w2.get_CONST_BUF_NO_STRIDE(); + bc.endian_swap = w2.get_ENDIAN_SWAP(); + bc.mega_fetch = w2.get_MEGA_FETCH(); + bc.offset[0] = w2.get_OFFSET(); 
+ break; + } + case HW_CLASS_R700: + { + VTX_WORD2_R7 w2(dw2); + bc.const_buf_no_stride = w2.get_CONST_BUF_NO_STRIDE(); + bc.endian_swap = w2.get_ENDIAN_SWAP(); + bc.mega_fetch = w2.get_MEGA_FETCH(); + bc.offset[0] = w2.get_OFFSET(); + bc.alt_const = w2.get_ALT_CONST(); + break; + } + case HW_CLASS_EVERGREEN: + { + VTX_WORD2_EG w2(dw2); + bc.const_buf_no_stride = w2.get_CONST_BUF_NO_STRIDE(); + bc.endian_swap = w2.get_ENDIAN_SWAP(); + bc.mega_fetch = w2.get_MEGA_FETCH(); + bc.offset[0] = w2.get_OFFSET(); + bc.alt_const = w2.get_ALT_CONST(); + bc.resource_index_mode = w2.get_BUFFER_INDEX_MODE(); + break; + } + case HW_CLASS_CAYMAN: + { + VTX_WORD2_CM w2(dw2); + bc.const_buf_no_stride = w2.get_CONST_BUF_NO_STRIDE(); + bc.endian_swap = w2.get_ENDIAN_SWAP(); + bc.offset[0] = w2.get_OFFSET(); + bc.alt_const = w2.get_ALT_CONST(); + bc.resource_index_mode = w2.get_BUFFER_INDEX_MODE(); + break; + } + default: + assert(!"unknown hw class"); + return -1; + } + + return r; +} + +} diff --git a/src/gallium/drivers/r600/sb/sb_bc_dump.cpp b/src/gallium/drivers/r600/sb/sb_bc_dump.cpp new file mode 100644 index 00000000000..28ae32c2300 --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_bc_dump.cpp @@ -0,0 +1,481 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Vadim Girlin + */ + +#include <sstream> +#include <iomanip> + +#include "sb_bc.h" +#include "sb_shader.h" +#include "sb_pass.h" + +namespace r600_sb { + +static const char* chans = "xyzw01?_"; + +static const char* vec_bs[] = { + "VEC_012", "VEC_021", "VEC_120", "VEC_102", "VEC_201", "VEC_210" +}; + +static const char* scl_bs[] = { + "SCL_210", "SCL_122", "SCL_212", "SCL_221" +}; + + +bool bc_dump::visit(cf_node& n, bool enter) { + if (enter) { + + id = n.bc.id << 1; + + if ((n.bc.op_ptr->flags & CF_ALU) && n.bc.is_alu_extended()) { + dump_dw(id, 2); + id += 2; + o << "\n"; + } + + dump_dw(id, 2); + dump(n); + + if (n.bc.op_ptr->flags & CF_CLAUSE) { + id = n.bc.addr << 1; + new_group = 1; + } + } + return true; +} + +bool bc_dump::visit(alu_node& n, bool enter) { + if (enter) { + o << " "; + dump_dw(id, 2); + + if (new_group) + o << std::setw(5) << ++group_index << " "; + else + o << " "; + + dump(n); + id += 2; + + new_group = n.bc.last; + } else { + if (n.bc.last) { + alu_group_node *g = + static_cast<alu_group_node*>(n.get_alu_group_node()); + assert(g); + for (unsigned k = 0; k < g->literals.size(); ++k) { + o << " "; + dump_dw(id, 1); + id += 1; + o << "\n"; + } + + id = (id + 1) & ~1u; + } + } + + return false; +} + +bool bc_dump::visit(fetch_node& n, bool enter) { + if (enter) { + o << " "; + dump_dw(id, 3); + dump(n); + id += 4; + } + return false; +} + +static void fill_to(std::ostringstream &s, int pos) { + int l = s.str().length(); + if (l < 
pos) + s << std::string(pos-l, ' '); +} + +void bc_dump::dump(cf_node& n) { + std::ostringstream s; + s << n.bc.op_ptr->name; + + if (n.bc.op_ptr->flags & CF_EXP) { + static const char *exp_type[] = {"PIXEL", "POS ", "PARAM"}; + + fill_to(s, 18); + s << " " << exp_type[n.bc.type]; + + if (n.bc.burst_count) { + std::ostringstream s2; + s2 << n.bc.array_base << "-" << n.bc.array_base + n.bc.burst_count; + s << " " << std::setw(5) << std::left << s2.str(); + s << "R" << n.bc.rw_gpr << "-" << + n.bc.rw_gpr + n.bc.burst_count << "."; + } else { + s << " " << std::setw(5) << std::left << n.bc.array_base; + s << "R" << n.bc.rw_gpr << "."; + } + + for (int k = 0; k < 4; ++k) + s << chans[n.bc.sel[k]]; + + } else if (n.bc.op_ptr->flags & (CF_STRM | CF_RAT)) { + static const char *exp_type[] = {"WRITE", "WRITE_IND", "WRITE_ACK", + "WRITE_IND_ACK"}; + fill_to(s, 18); + s << " " << exp_type[n.bc.type]; + s << " " << std::setw(5) << std::left << n.bc.array_base; + s << "R" << n.bc.rw_gpr << "."; + for (int k = 0; k < 4; ++k) + s << ((n.bc.comp_mask & (1 << k)) ? 
chans[k] : '_'); + + if ((n.bc.op_ptr->flags & CF_RAT) && (n.bc.type & 1)) { + s << ", @R" << n.bc.index_gpr << ".xyz"; + } + + s << " ES:" << n.bc.elem_size; + + } else { + + if (n.bc.op_ptr->flags & CF_CLAUSE) { + s << " " << n.bc.count+1; + } + + s << " @" << (n.bc.addr << 1); + + if (n.bc.op_ptr->flags & CF_ALU) { + + for (int k = 0; k < 4; ++k) { + bc_kcache &kc = n.bc.kc[k]; + if (kc.mode) { + s << " KC" << k << "[CB" << kc.bank << ":" << + (kc.addr << 4) << "-" << + (((kc.addr + kc.mode) << 4) - 1) << "]"; + } + } + } + + // CND shows the condition code; the pop count is printed separately as POP + if (n.bc.cond) + s << " CND:" << n.bc.cond; + + if (n.bc.pop_count) + s << " POP:" << n.bc.pop_count; + } + + if (!n.bc.barrier) + s << " NO_BARRIER"; + + if (n.bc.valid_pixel_mode) + s << " VPM"; + + if (n.bc.whole_quad_mode) + s << " WQM"; + + if (n.bc.end_of_program) + s << " EOP"; + + o << s.str() << "\n"; +} + + +static void print_sel(std::ostream &s, int sel, int rel, int index_mode, + int need_brackets) { + if (rel && index_mode >= 5 && sel < 128) + s << "G"; + if (rel || need_brackets) { + s << "["; + } + s << sel; + if (rel) { + if (index_mode == 0 || index_mode == 6) + s << "+AR"; + else if (index_mode == 4) + s << "+AL"; + } + if (rel || need_brackets) { + s << "]"; + } +} + +static void print_dst(std::ostream &s, bc_alu &alu) +{ + unsigned sel = alu.dst_gpr; + char reg_char = 'R'; + if (sel >= 128 - 4) { // clause temporary gpr + sel -= 128 - 4; + reg_char = 'T'; + } + + if (alu.write_mask || alu.op_ptr->src_count == 3) { + s << reg_char; + print_sel(s, sel, alu.dst_rel, alu.index_mode, 0); + } else { + s << "__"; + } + s << "."; + s << chans[alu.dst_chan]; +} + +static void print_src(std::ostream &s, bc_alu &alu, unsigned idx) +{ + bc_alu_src *src = &alu.src[idx]; + unsigned sel = src->sel, need_sel = 1, need_chan = 1, need_brackets = 0; + + if (src->neg) + s <<"-"; + if (src->abs) + s <<"|"; + + if (sel < 128 - 4) { + s << "R"; + } else if (sel < 128) { + s << "T"; + sel -= 128 - 4; + } else if (sel < 160) { + s << "KC0"; 
+ need_brackets = 1; + sel -= 128; + } else if (sel < 192) { + s << "KC1"; + need_brackets = 1; + sel -= 160; + } else if (sel >= 448) { + s << "Param"; + sel -= 448; + } else if (sel >= 288) { + s << "KC3"; + need_brackets = 1; + sel -= 288; + } else if (sel >= 256) { + s << "KC2"; + need_brackets = 1; + sel -= 256; + } else { + need_sel = 0; + need_chan = 0; + switch (sel) { + case ALU_SRC_PS: + s << "PS"; + break; + case ALU_SRC_PV: + s << "PV"; + need_chan = 1; + break; + case ALU_SRC_LITERAL: + s << "[" << std::hex << std::setfill('0') << std::setw(8) + << std::showbase << src->value.u << " " + << std::noshowbase << std::setfill(' ') << std::dec + << src->value.f << "]"; + need_chan = 1; + break; + case ALU_SRC_0_5: + s << "0.5"; + break; + case ALU_SRC_M_1_INT: + s << "-1"; + break; + case ALU_SRC_1_INT: + s << "1"; + break; + case ALU_SRC_1: + s << "1.0"; + break; + case ALU_SRC_0: + s << "0"; + break; + default: + s << "??IMM_" << sel; + break; + } + } + + if (need_sel) + print_sel(s, sel, src->rel, alu.index_mode, need_brackets); + + if (need_chan) { + s << "." << chans[src->chan]; + } + + if (src->abs) + s << "|"; +} +void bc_dump::dump(alu_node& n) { + std::ostringstream s; + static const char *omod_str[] = {"","*2","*4","/2"}; + static const char *slots = "xyzwt"; + + s << (n.bc.update_exec_mask ? 'M' : ' '); + s << (n.bc.update_pred ? 'P' : ' '); + s << " "; + s << (n.bc.pred_sel>=2 ? (n.bc.pred_sel == 2 ? '0' : '1') : ' '); + s << " "; + + s << slots[n.bc.slot] << ": "; + + s << n.bc.op_ptr->name << omod_str[n.bc.omod] << (n.bc.clamp ? "_sat" : ""); + fill_to(s, 26); + s << " "; + + print_dst(s, n.bc); + for (int k = 0; k < n.bc.op_ptr->src_count; ++k) { + s << (k ? 
", " : ", "); + print_src(s, n.bc, k); + } + + if (n.bc.bank_swizzle) { + fill_to(s, 55); + if (n.bc.slot == SLOT_TRANS) + s << " " << scl_bs[n.bc.bank_swizzle]; + else + s << " " << vec_bs[n.bc.bank_swizzle]; + } + + o << s.str() << "\n"; +} + +int bc_dump::init() { + std::ostringstream s; + s << "===== SHADER #" << sh.id; + + if (sh.optimized) + s << " OPT"; + + s << " "; + + std::string target = std::string(" ") + + sh.get_full_target_name() + " ====="; + + while (s.str().length() + target.length() < 80) + s << "="; + + s << target; + + o << "\n" << s.str() << "\n"; + + s.str(std::string()); + + if (bc_data) { + s << "===== " << ndw << " dw ===== " << sh.ngpr + << " gprs ===== " << sh.nstack << " stack "; + } + + while (s.str().length() < 80) + s << "="; + + o << s.str() << "\n"; + + return 0; +} + +int bc_dump::done() { + std::ostringstream s; + s << "===== SHADER_END "; + + while (s.str().length() < 80) + s << "="; + + o << s.str() << "\n\n"; + + return 0; +} + +bc_dump::bc_dump(shader& s, std::ostream& o, bytecode* bc) : + vpass(s), o(o), bc_data(), ndw(), id(), + new_group(), group_index() { + + if (bc) { + bc_data = bc->data(); + ndw = bc->ndw(); + } +} + +void bc_dump::dump(fetch_node& n) { + std::ostringstream s; + static const char * fetch_type[] = {"VERTEX", "INSTANCE", ""}; + + s << n.bc.op_ptr->name; + fill_to(s, 20); + + s << "R"; + print_sel(s, n.bc.dst_gpr, n.bc.dst_rel, INDEX_LOOP, 0); + s << "."; + for (int k = 0; k < 4; ++k) + s << chans[n.bc.dst_sel[k]]; + s << ", "; + + s << "R"; + print_sel(s, n.bc.src_gpr, n.bc.src_rel, INDEX_LOOP, 0); + s << "."; + + unsigned vtx = n.bc.op_ptr->flags & FF_VTX; + unsigned num_src_comp = vtx ? ctx.is_cayman() ? 
2 : 1 : 4; + + for (unsigned k = 0; k < num_src_comp; ++k) + s << chans[n.bc.src_sel[k]]; + + if (vtx && n.bc.offset[0]) { + s << " + " << n.bc.offset[0] << "b "; + } + + s << ", RID:" << n.bc.resource_id; + + if (vtx) { + s << " " << fetch_type[n.bc.fetch_type]; + if (!ctx.is_cayman() && n.bc.mega_fetch_count) + s << " MFC:" << n.bc.mega_fetch_count; + if (n.bc.fetch_whole_quad) + s << " FWQ"; + s << " UCF:" << n.bc.use_const_fields + << " FMT(DTA:" << n.bc.data_format + << " NUM:" << n.bc.num_format_all + << " COMP:" << n.bc.format_comp_all + << " MODE:" << n.bc.srf_mode_all << ")"; + } else { + s << ", SID:" << n.bc.sampler_id; + if (n.bc.lod_bias) + s << " LB:" << n.bc.lod_bias; + s << " CT:"; + for (unsigned k = 0; k < 4; ++k) + s << (n.bc.coord_type[k] ? "N" : "U"); + for (unsigned k = 0; k < 3; ++k) + if (n.bc.offset[k]) + s << " O" << chans[k] << ":" << n.bc.offset[k]; + } + + o << s.str() << "\n"; +} + +void bc_dump::dump_dw(unsigned dw_id, unsigned count) { + if (!bc_data) + return; + + assert(dw_id + count <= ndw); + + o << std::setfill('0') << std::setw(4) << dw_id << " "; + while (count--) { + o << std::setw(8) << std::hex << bc_data[dw_id++] << " "; + } + o << std::setfill(' ') << std::dec; +} + +} // namespace r600_sb diff --git a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp new file mode 100644 index 00000000000..c883e5e5a67 --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp @@ -0,0 +1,833 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the 
following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Vadim Girlin + */ + +#define FBC_DEBUG 0 + +#if FBC_DEBUG +#define FBC_DUMP(q) do { q } while (0) +#else +#define FBC_DUMP(q) +#endif + +#include <iostream> + +#include "sb_bc.h" + +#include "sb_shader.h" + +#include "sb_pass.h" + +namespace r600_sb { + +using std::cerr; + +int bc_finalizer::run() { + + regions_vec &rv = sh.get_regions(); + + for (regions_vec::reverse_iterator I = rv.rbegin(), E = rv.rend(); I != E; + ++I) { + region_node *r = *I; + + assert(r); + + bool loop = r->is_loop(); + + if (loop) + finalize_loop(r); + else + finalize_if(r); + + r->expand(); + } + + run_on(sh.root); + + cf_peephole(); + + // workaround for some problems on r6xx/7xx + // add ALU NOP to each vertex shader + if (!ctx.is_egcm() && sh.target == TARGET_VS) { + cf_node *c = sh.create_clause(NST_ALU_CLAUSE); + + alu_group_node *g = sh.create_alu_group(); + + alu_node *a = sh.create_alu(); + a->bc.set_op(ALU_OP0_NOP); + a->bc.last = 1; + + g->push_back(a); + c->push_back(g); + + sh.root->push_back(c); + + c = sh.create_cf(CF_OP_NOP); + sh.root->push_back(c); + + last_cf = c; + } + + if (last_cf->bc.op_ptr->flags & CF_ALU) { + last_cf = sh.create_cf(CF_OP_NOP); + sh.root->push_back(last_cf); + } + + if (ctx.is_cayman()) + last_cf->insert_after(sh.create_cf(CF_OP_CF_END)); + else + 
last_cf->bc.end_of_program = 1; + + for (unsigned t = EXP_PIXEL; t < EXP_TYPE_COUNT; ++t) { + cf_node *le = last_export[t]; + if (le) + le->bc.set_op(CF_OP_EXPORT_DONE); + } + + sh.ngpr = ngpr; + sh.nstack = nstack; + return 0; +} + +void bc_finalizer::finalize_loop(region_node* r) { + + cf_node *loop_start = sh.create_cf(CF_OP_LOOP_START_DX10); + cf_node *loop_end = sh.create_cf(CF_OP_LOOP_END); + + loop_start->jump_after(loop_end); + loop_end->jump_after(loop_start); + + for (depart_vec::iterator I = r->departs.begin(), E = r->departs.end(); + I != E; ++I) { + depart_node *dep = *I; + cf_node *loop_break = sh.create_cf(CF_OP_LOOP_BREAK); + loop_break->jump(loop_end); + dep->push_back(loop_break); + dep->expand(); + } + + // FIXME produces unnecessary LOOP_CONTINUE + for (repeat_vec::iterator I = r->repeats.begin(), E = r->repeats.end(); + I != E; ++I) { + repeat_node *rep = *I; + if (!(rep->parent == r && rep->prev == NULL)) { + cf_node *loop_cont = sh.create_cf(CF_OP_LOOP_CONTINUE); + loop_cont->jump(loop_end); + rep->push_back(loop_cont); + } + rep->expand(); + } + + r->push_front(loop_start); + r->push_back(loop_end); +} + +void bc_finalizer::finalize_if(region_node* r) { + + update_nstack(r); + + // expecting the following control flow structure here: + // - region + // { + // - depart/repeat 1 (it may be depart/repeat for some outer region) + // { + // - if + // { + // - depart/repeat 2 (possibly for outer region) + // { + // - some optional code + // } + // } + // - optional <else> code> ... 
+ // } + // } + + container_node *repdep1 = static_cast<container_node*>(r->first); + assert(repdep1->is_depart() || repdep1->is_repeat()); + + if_node *n_if = static_cast<if_node*>(repdep1->first); + + if (n_if) { + + + assert(n_if->is_if()); + + container_node *repdep2 = static_cast<container_node*>(n_if->first); + assert(repdep2->is_depart() || repdep2->is_repeat()); + + cf_node *if_jump = sh.create_cf(CF_OP_JUMP); + cf_node *if_pop = sh.create_cf(CF_OP_POP); + + if_pop->bc.pop_count = 1; + if_pop->jump_after(if_pop); + + r->push_front(if_jump); + r->push_back(if_pop); + + bool has_else = n_if->next; + + if (has_else) { + cf_node *nelse = sh.create_cf(CF_OP_ELSE); + n_if->insert_after(nelse); + if_jump->jump(nelse); + nelse->jump_after(if_pop); + nelse->bc.pop_count = 1; + + } else { + if_jump->jump_after(if_pop); + if_jump->bc.pop_count = 1; + } + + n_if->expand(); + } + + for (depart_vec::iterator I = r->departs.begin(), E = r->departs.end(); + I != E; ++I) { + (*I)->expand(); + } + r->departs.clear(); + assert(r->repeats.empty()); +} + +void bc_finalizer::run_on(container_node* c) { + + for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) { + node *n = *I; + + if (n->is_alu_group()) { + finalize_alu_group(static_cast<alu_group_node*>(n)); + } else { + if (n->is_fetch_inst()) { + finalize_fetch(static_cast<fetch_node*>(n)); + } else if (n->is_cf_inst()) { + finalize_cf(static_cast<cf_node*>(n)); + } else if (n->is_alu_clause()) { + + } else if (n->is_fetch_clause()) { + + } else { + assert(!"unexpected node"); + } + + if (n->is_container()) + run_on(static_cast<container_node*>(n)); + } + } +} + +void bc_finalizer::finalize_alu_group(alu_group_node* g) { + + alu_node *last = NULL; + + for (node_iterator I = g->begin(), E = g->end(); I != E; ++I) { + alu_node *n = static_cast<alu_node*>(*I); + unsigned slot = n->bc.slot; + + value *d = n->dst.empty() ? 
NULL : n->dst[0]; + + if (d && d->is_special_reg()) { + assert(n->bc.op_ptr->flags & AF_MOVA); + d = NULL; + } + + sel_chan fdst = d ? d->get_final_gpr() : sel_chan(0, 0); + + if (d) { + assert(fdst.chan() == slot || slot == SLOT_TRANS); + } + + n->bc.dst_gpr = fdst.sel(); + n->bc.dst_chan = d ? fdst.chan() : slot < SLOT_TRANS ? slot : 0; + + + if (d && d->is_rel() && d->rel && !d->rel->is_const()) { + n->bc.dst_rel = 1; + update_ngpr(d->array->gpr.sel() + d->array->array_size -1); + } else { + n->bc.dst_rel = 0; + } + + n->bc.write_mask = d != NULL; + n->bc.last = 0; + + if (n->bc.op_ptr->flags & AF_PRED) { + n->bc.update_pred = (n->dst[1] != NULL); + n->bc.update_exec_mask = (n->dst[2] != NULL); + } + + // FIXME handle predication here + n->bc.pred_sel = PRED_SEL_OFF; + + update_ngpr(n->bc.dst_gpr); + + finalize_alu_src(g, n); + + last = n; + } + + last->bc.last = 1; +} + +void bc_finalizer::finalize_alu_src(alu_group_node* g, alu_node* a) { + vvec &sv = a->src; + + FBC_DUMP( + cerr << "finalize_alu_src: "; + dump::dump_op(a); + cerr << "\n"; + ); + + unsigned si = 0; + + for (vvec::iterator I = sv.begin(), E = sv.end(); I != E; ++I, ++si) { + value *v = *I; + assert(v); + + bc_alu_src &src = a->bc.src[si]; + sel_chan sc; + src.rel = 0; + + sel_chan gpr; + + switch (v->kind) { + case VLK_REL_REG: + sc = v->get_final_gpr(); + src.sel = sc.sel(); + src.chan = sc.chan(); + if (!v->rel->is_const()) { + src.rel = 1; + update_ngpr(v->array->gpr.sel() + v->array->array_size -1); + } else + src.rel = 0; + + break; + case VLK_REG: + gpr = v->get_final_gpr(); + src.sel = gpr.sel(); + src.chan = gpr.chan(); + update_ngpr(src.sel); + break; + case VLK_TEMP: + src.sel = v->gpr.sel(); + src.chan = v->gpr.chan(); + update_ngpr(src.sel); + break; + case VLK_UNDEF: + case VLK_CONST: { + literal lv = v->literal_value; + src.chan = 0; + + if (lv == literal(0)) + src.sel = ALU_SRC_0; + else if (lv == literal(0.5f)) + src.sel = ALU_SRC_0_5; + else if (lv == literal(1.0f)) + src.sel = 
ALU_SRC_1; + else if (lv == literal(1)) + src.sel = ALU_SRC_1_INT; + else if (lv == literal(-1)) + src.sel = ALU_SRC_M_1_INT; + else { + src.sel = ALU_SRC_LITERAL; + src.chan = g->literal_chan(lv); + src.value = lv; + } + break; + } + case VLK_KCACHE: { + cf_node *clause = static_cast<cf_node*>(g->parent); + assert(clause->is_alu_clause()); + sel_chan k = translate_kcache(clause, v); + + assert(k && "kcache translation failed"); + + src.sel = k.sel(); + src.chan = k.chan(); + break; + } + case VLK_PARAM: + case VLK_SPECIAL_CONST: + src.sel = v->select.sel(); + src.chan = v->select.chan(); + break; + default: + assert(!"unknown value kind"); + break; + } + } + + while (si < 3) { + a->bc.src[si++].sel = 0; + } +} + +void bc_finalizer::emit_set_grad(fetch_node* f) { + + assert(f->src.size() == 12); + unsigned ops[2] = { FETCH_OP_SET_GRADIENTS_V, FETCH_OP_SET_GRADIENTS_H }; + + unsigned arg_start = 0; + + for (unsigned op = 0; op < 2; ++op) { + fetch_node *n = sh.create_fetch(); + n->bc.set_op(ops[op]); + + // FIXME extract this loop into a separate method and reuse it + + int reg = -1; + + arg_start += 4; + + for (unsigned chan = 0; chan < 4; ++chan) { + + n->bc.dst_sel[chan] = SEL_MASK; + + unsigned sel = SEL_MASK; + + value *v = f->src[arg_start + chan]; + + if (!v || v->is_undef()) { + sel = SEL_MASK; + } else if (v->is_const()) { + literal l = v->literal_value; + if (l == literal(0)) + sel = SEL_0; + else if (l == literal(1.0f)) + sel = SEL_1; + else { + cerr << "invalid fetch constant operand " << chan << " "; + dump::dump_op(f); + cerr << "\n"; + abort(); + } + + } else if (v->is_any_gpr()) { + unsigned vreg = v->gpr.sel(); + unsigned vchan = v->gpr.chan(); + + if (reg == -1) + reg = vreg; + else if ((unsigned)reg != vreg) { + cerr << "invalid fetch source operand " << chan << " "; + dump::dump_op(f); + cerr << "\n"; + abort(); + } + + sel = vchan; + + } else { + cerr << "invalid fetch source operand " << chan << " "; + dump::dump_op(f); + cerr << "\n"; + 
abort(); + } + + n->bc.src_sel[chan] = sel; + } + + if (reg >= 0) + update_ngpr(reg); + + n->bc.src_gpr = reg >= 0 ? reg : 0; + + f->insert_before(n); + } + +} + +void bc_finalizer::finalize_fetch(fetch_node* f) { + + int reg = -1; + + // src + + unsigned src_count = 4; + + unsigned flags = f->bc.op_ptr->flags; + + if (flags & FF_VTX) { + src_count = 1; + } else if (flags & FF_USEGRAD) { + emit_set_grad(f); + } + + for (unsigned chan = 0; chan < src_count; ++chan) { + + unsigned sel = f->bc.src_sel[chan]; + + if (sel > SEL_W) + continue; + + value *v = f->src[chan]; + + if (v->is_undef()) { + sel = SEL_MASK; + } else if (v->is_const()) { + literal l = v->literal_value; + if (l == literal(0)) + sel = SEL_0; + else if (l == literal(1.0f)) + sel = SEL_1; + else { + cerr << "invalid fetch constant operand " << chan << " "; + dump::dump_op(f); + cerr << "\n"; + abort(); + } + + } else if (v->is_any_gpr()) { + unsigned vreg = v->gpr.sel(); + unsigned vchan = v->gpr.chan(); + + if (reg == -1) + reg = vreg; + else if ((unsigned)reg != vreg) { + cerr << "invalid fetch source operand " << chan << " "; + dump::dump_op(f); + cerr << "\n"; + abort(); + } + + sel = vchan; + + } else { + cerr << "invalid fetch source operand " << chan << " "; + dump::dump_op(f); + cerr << "\n"; + abort(); + } + + f->bc.src_sel[chan] = sel; + } + + if (reg >= 0) + update_ngpr(reg); + + f->bc.src_gpr = reg >= 0 ? 
reg : 0; + + // dst + + reg = -1; + + unsigned dst_swz[4] = {SEL_MASK, SEL_MASK, SEL_MASK, SEL_MASK}; + + for (unsigned chan = 0; chan < 4; ++chan) { + + unsigned sel = f->bc.dst_sel[chan]; + + if (sel == SEL_MASK) + continue; + + value *v = f->dst[chan]; + if (!v) + continue; + + if (v->is_any_gpr()) { + unsigned vreg = v->gpr.sel(); + unsigned vchan = v->gpr.chan(); + + if (reg == -1) + reg = vreg; + else if ((unsigned)reg != vreg) { + cerr << "invalid fetch dst operand " << chan << " "; + dump::dump_op(f); + cerr << "\n"; + abort(); + } + + dst_swz[vchan] = sel; + + } else { + cerr << "invalid fetch dst operand " << chan << " "; + dump::dump_op(f); + cerr << "\n"; + abort(); + } + + } + + for (unsigned i = 0; i < 4; ++i) + f->bc.dst_sel[i] = dst_swz[i]; + + assert(reg >= 0); + + if (reg >= 0) + update_ngpr(reg); + + f->bc.dst_gpr = reg >= 0 ? reg : 0; +} + +void bc_finalizer::finalize_cf(cf_node* c) { + + unsigned flags = c->bc.op_ptr->flags; + + if (flags & CF_CALL) { + update_nstack(c->get_parent_region(), ctx.is_cayman() ? 
1 : 2); + } + + c->bc.end_of_program = 0; + last_cf = c; + + if (flags & CF_EXP) { + c->bc.set_op(CF_OP_EXPORT); + last_export[c->bc.type] = c; + + int reg = -1; + + for (unsigned chan = 0; chan < 4; ++chan) { + + unsigned sel = c->bc.sel[chan]; + + if (sel > SEL_W) + continue; + + value *v = c->src[chan]; + + if (v->is_undef()) { + sel = SEL_MASK; + } else if (v->is_const()) { + literal l = v->literal_value; + if (l == literal(0)) + sel = SEL_0; + else if (l == literal(1.0f)) + sel = SEL_1; + else { + cerr << "invalid export constant operand " << chan << " "; + dump::dump_op(c); + cerr << "\n"; + abort(); + } + + } else if (v->is_any_gpr()) { + unsigned vreg = v->gpr.sel(); + unsigned vchan = v->gpr.chan(); + + if (reg == -1) + reg = vreg; + else if ((unsigned)reg != vreg) { + cerr << "invalid export source operand " << chan << " "; + dump::dump_op(c); + cerr << "\n"; + abort(); + } + + sel = vchan; + + } else { + cerr << "invalid export source operand " << chan << " "; + dump::dump_op(c); + cerr << "\n"; + abort(); + } + + c->bc.sel[chan] = sel; + } + + if (reg >= 0) + update_ngpr(reg); + + c->bc.rw_gpr = reg >= 0 ? reg : 0; + + } else if (flags & CF_MEM) { + + int reg = -1; + unsigned mask = 0; + + for (unsigned chan = 0; chan < 4; ++chan) { + value *v = c->src[chan]; + if (!v || v->is_undef()) + continue; + + if (!v->is_any_gpr() || v->gpr.chan() != chan) { + cerr << "invalid source operand " << chan << " "; + dump::dump_op(c); + cerr << "\n"; + abort(); + } + unsigned vreg = v->gpr.sel(); + if (reg == -1) + reg = vreg; + else if ((unsigned)reg != vreg) { + cerr << "invalid source operand " << chan << " "; + dump::dump_op(c); + cerr << "\n"; + abort(); + } + + mask |= (1 << chan); + } + + assert(reg >= 0 && mask); + + if (reg >= 0) + update_ngpr(reg); + + c->bc.rw_gpr = reg >= 0 ? 
reg : 0; + c->bc.comp_mask = mask; + + if ((flags & CF_RAT) && (c->bc.type & 1)) { + + reg = -1; + + for (unsigned chan = 0; chan < 4; ++chan) { + value *v = c->src[4 + chan]; + if (!v || v->is_undef()) + continue; + + if (!v->is_any_gpr() || v->gpr.chan() != chan) { + cerr << "invalid source operand " << chan << " "; + dump::dump_op(c); + cerr << "\n"; + abort(); + } + unsigned vreg = v->gpr.sel(); + if (reg == -1) + reg = vreg; + else if ((unsigned)reg != vreg) { + cerr << "invalid source operand " << chan << " "; + dump::dump_op(c); + cerr << "\n"; + abort(); + } + } + + assert(reg >= 0); + + if (reg >= 0) + update_ngpr(reg); + + c->bc.index_gpr = reg >= 0 ? reg : 0; + } + + + + } else { + +#if 0 + if ((flags & (CF_BRANCH | CF_LOOP)) && !sh.uses_gradients) { + c->bc.valid_pixel_mode = 1; + } +#endif + + } +} + +sel_chan bc_finalizer::translate_kcache(cf_node* alu, value* v) { + unsigned sel = v->select.sel(); + unsigned bank = sel >> 12; + unsigned chan = v->select.chan(); + static const unsigned kc_base[] = {128, 160, 256, 288}; + + sel &= 4095; + + unsigned line = sel >> 4; + + for (unsigned k = 0; k < 4; ++k) { + bc_kcache &kc = alu->bc.kc[k]; + + if (kc.mode == KC_LOCK_NONE) + break; + + if (kc.bank == bank && (kc.addr == line || + (kc.mode == KC_LOCK_2 && kc.addr + 1 == line))) { + + sel = kc_base[k] + (sel - (kc.addr << 4)); + + return sel_chan(sel, chan); + } + } + + assert(!"kcache translation error"); + return 0; +} + +void bc_finalizer::update_ngpr(unsigned gpr) { + if (gpr < MAX_GPR - ctx.alu_temp_gprs && gpr >= ngpr) + ngpr = gpr + 1; +} + +void bc_finalizer::update_nstack(region_node* r, unsigned add) { + unsigned loops = 0; + unsigned ifs = 0; + + while (r) { + if (r->is_loop()) + ++loops; + else + ++ifs; + + r = r->get_parent_region(); + } + + unsigned stack_elements = (loops * ctx.stack_entry_size) + ifs + add; + + // FIXME calculate more precisely + if (ctx.is_evergreen()) { + ++stack_elements; + } else { + stack_elements += 2; + if 
(ctx.is_cayman()) + ++stack_elements; + } + + unsigned stack_entries = (stack_elements + 3) >> 2; + + if (nstack < stack_entries) + nstack = stack_entries; +} + +void bc_finalizer::cf_peephole() { + + for (node_iterator N, I = sh.root->begin(), E = sh.root->end(); I != E; + I = N) { + N = I; ++N; + + cf_node *c = static_cast<cf_node*>(*I); + + if (c->jump_after_target) { + c->jump_target = static_cast<cf_node*>(c->jump_target->next); + c->jump_after_target = false; + } + + if (c->is_cf_op(CF_OP_POP)) { + node *p = c->prev; + if (p->is_alu_clause()) { + cf_node *a = static_cast<cf_node*>(p); + + if (a->bc.op == CF_OP_ALU) { + a->bc.set_op(CF_OP_ALU_POP_AFTER); + c->remove(); + } + } + } else if (c->is_cf_op(CF_OP_JUMP) && c->jump_target == c->next) { + // if JUMP is immediately followed by its jump target, + // then JUMP is useless and we can eliminate it + c->remove(); + } + } +} + +} // namespace r600_sb diff --git a/src/gallium/drivers/r600/sb/sb_bc_fmt_def.inc b/src/gallium/drivers/r600/sb/sb_bc_fmt_def.inc new file mode 100644 index 00000000000..50f73d7df3b --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_bc_fmt_def.inc @@ -0,0 +1,543 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Vadim Girlin + */ + +// TODO add all formats + +// CF + +BC_FORMAT_BEGIN_HW(CF_WORD0, R6R7) +BC_FIELD(CF_WORD0, ADDR, ADDR, 31, 0) +BC_FORMAT_END(CF_WORD0) + +BC_FORMAT_BEGIN_HW(CF_WORD0, EGCM) +BC_FIELD(CF_WORD0, ADDR, ADDR, 23, 0) +BC_FIELD(CF_WORD0, JUMPTABLE_SEL, JTS, 26, 24) +BC_RSRVD(CF_WORD0, 31, 27) +BC_FORMAT_END(CF_WORD0) + + +BC_FORMAT_BEGIN_HW(CF_GWS_WORD0, EGCM) +BC_FIELD(CF_GWS_WORD0, VALUE, VAL, 9, 0) +BC_RSRVD(CF_GWS_WORD0, 15, 10) +BC_FIELD(CF_GWS_WORD0, RESOURCE, RSRC, 20, 16) +BC_RSRVD(CF_GWS_WORD0, 24, 21) +BC_FIELD(CF_GWS_WORD0, SIGN, SIGN, 25, 25) +BC_FIELD(CF_GWS_WORD0, VAL_INDEX_MODE, VIM, 27, 26) +BC_FIELD(CF_GWS_WORD0, RSRC_INDEX_MODE, RIM, 29, 28) +BC_FIELD(CF_GWS_WORD0, GWS_OPCODE, GWS_OP, 31, 30) +BC_FORMAT_END(CF_GWS_WORD0) + + +BC_FORMAT_BEGIN_HW(CF_WORD1, R6R7) +BC_FIELD(CF_WORD1, POP_COUNT, PC, 2, 0) +BC_FIELD(CF_WORD1, CF_CONST, CF_CONST, 7, 3) +BC_FIELD(CF_WORD1, COND, COND, 9, 8) +BC_FIELD(CF_WORD1, COUNT, COUNT, 12, 10) +BC_FIELD(CF_WORD1, CALL_COUNT, CALL_CNT, 18, 13) +BC_FIELD(CF_WORD1, COUNT_3, COUNT_3, 19, 19) +BC_RSRVD(CF_WORD1, 20, 20) +BC_FIELD(CF_WORD1, END_OF_PROGRAM, EOP, 21, 21) +BC_FIELD(CF_WORD1, VALID_PIXEL_MODE, VPM, 22, 22) +BC_FIELD(CF_WORD1, CF_INST, CF_INST, 29, 23) +BC_FIELD(CF_WORD1, WHOLE_QUAD_MODE, WQM, 30, 30) +BC_FIELD(CF_WORD1, BARRIER, B, 31, 31) +BC_FORMAT_END(CF_WORD1) + +BC_FORMAT_BEGIN_HW(CF_WORD1, EG) +BC_FIELD(CF_WORD1, POP_COUNT, PC, 2, 0) +BC_FIELD(CF_WORD1, CF_CONST, CF_CONST, 7, 3) 
+BC_FIELD(CF_WORD1, COND, COND, 9, 8) +BC_FIELD(CF_WORD1, COUNT, COUNT, 15, 10) +BC_RSRVD(CF_WORD1, 19, 16) +BC_FIELD(CF_WORD1, VALID_PIXEL_MODE, VPM, 20, 20) +BC_FIELD(CF_WORD1, END_OF_PROGRAM, EOP, 21, 21) +BC_FIELD(CF_WORD1, CF_INST, CF_INST, 29, 22) +BC_FIELD(CF_WORD1, WHOLE_QUAD_MODE, WQM, 30, 30) +BC_FIELD(CF_WORD1, BARRIER, B, 31, 31) +BC_FORMAT_END(CF_WORD1) + +BC_FORMAT_BEGIN_HW(CF_WORD1, CM) +BC_FIELD(CF_WORD1, POP_COUNT, PC, 2, 0) +BC_FIELD(CF_WORD1, CF_CONST, CF_CONST, 7, 3) +BC_FIELD(CF_WORD1, COND, COND, 9, 8) +BC_FIELD(CF_WORD1, COUNT, COUNT, 15, 10) +BC_RSRVD(CF_WORD1, 19, 16) +BC_FIELD(CF_WORD1, VALID_PIXEL_MODE, VPM, 20, 20) +BC_RSRVD(CF_WORD1, 21, 21) +BC_FIELD(CF_WORD1, CF_INST, CF_INST, 29, 22) +BC_RSRVD(CF_WORD1, 30, 30) +BC_FIELD(CF_WORD1, BARRIER, B, 31, 31) +BC_FORMAT_END(CF_WORD1) + + +BC_FORMAT_BEGIN(CF_ALU_WORD0) +BC_FIELD(CF_ALU_WORD0, ADDR, ADDR, 21, 0) +BC_FIELD(CF_ALU_WORD0, KCACHE_BANK0, KB0, 25, 22) +BC_FIELD(CF_ALU_WORD0, KCACHE_BANK1, KB1, 29, 26) +BC_FIELD(CF_ALU_WORD0, KCACHE_MODE0, KM0, 31, 30) +BC_FORMAT_END(CF_ALU_WORD0) + +BC_FORMAT_BEGIN_HW(CF_ALU_WORD1, R6) +BC_FIELD(CF_ALU_WORD1, KCACHE_MODE1, KM1, 1, 0) +BC_FIELD(CF_ALU_WORD1, KCACHE_ADDR0, KA0, 9, 2) +BC_FIELD(CF_ALU_WORD1, KCACHE_ADDR1, KA1, 17, 10) +BC_FIELD(CF_ALU_WORD1, COUNT, COUNT, 24, 18) +BC_FIELD(CF_ALU_WORD1, USES_WATERFALL, UW, 25, 25) +BC_FIELD(CF_ALU_WORD1, CF_INST, CF_INST, 29, 26) +BC_FIELD(CF_ALU_WORD1, WHOLE_QUAD_MODE, WQM, 30, 30) +BC_FIELD(CF_ALU_WORD1, BARRIER, B, 31, 31) +BC_FORMAT_END(CF_ALU_WORD1) + +BC_FORMAT_BEGIN_HW(CF_ALU_WORD1, R7EGCM) +BC_FIELD(CF_ALU_WORD1, KCACHE_MODE1, KM1, 1, 0) +BC_FIELD(CF_ALU_WORD1, KCACHE_ADDR0, KA0, 9, 2) +BC_FIELD(CF_ALU_WORD1, KCACHE_ADDR1, KA1, 17, 10) +BC_FIELD(CF_ALU_WORD1, COUNT, COUNT, 24, 18) +BC_FIELD(CF_ALU_WORD1, ALT_CONST, ALT_C, 25, 25) +BC_FIELD(CF_ALU_WORD1, CF_INST, CF_INST, 29, 26) +BC_FIELD(CF_ALU_WORD1, WHOLE_QUAD_MODE, WQM, 30, 30) +BC_FIELD(CF_ALU_WORD1, BARRIER, B, 31, 31) 
+BC_FORMAT_END(CF_ALU_WORD1) + + +BC_FORMAT_BEGIN_HW(CF_ALU_WORD0_EXT, EGCM) +BC_RSRVD(CF_ALU_WORD0_EXT, 3, 0) +BC_FIELD(CF_ALU_WORD0_EXT, KCACHE_BANK_INDEX_MODE0, KBIM0, 5, 4) +BC_FIELD(CF_ALU_WORD0_EXT, KCACHE_BANK_INDEX_MODE1, KBIM1, 7, 6) +BC_FIELD(CF_ALU_WORD0_EXT, KCACHE_BANK_INDEX_MODE2, KBIM2, 9, 8) +BC_FIELD(CF_ALU_WORD0_EXT, KCACHE_BANK_INDEX_MODE3, KBIM3, 11, 10) +BC_RSRVD(CF_ALU_WORD0_EXT, 21, 12) +BC_FIELD(CF_ALU_WORD0_EXT, KCACHE_BANK2, KB2, 25, 22) +BC_FIELD(CF_ALU_WORD0_EXT, KCACHE_BANK3, KB3, 29, 26) +BC_FIELD(CF_ALU_WORD0_EXT, KCACHE_MODE2, KM2, 31, 30) +BC_FORMAT_END(CF_ALU_WORD0_EXT) + +BC_FORMAT_BEGIN_HW(CF_ALU_WORD1_EXT, EGCM) +BC_FIELD(CF_ALU_WORD1_EXT, KCACHE_MODE3, KM3, 1, 0) +BC_FIELD(CF_ALU_WORD1_EXT, KCACHE_ADDR2, KA2, 9, 2) +BC_FIELD(CF_ALU_WORD1_EXT, KCACHE_ADDR3, KA3, 17, 10) +BC_RSRVD(CF_ALU_WORD1_EXT, 25, 18) +BC_FIELD(CF_ALU_WORD1_EXT, CF_INST, CF_INST, 29, 26) +BC_RSRVD(CF_ALU_WORD1_EXT, 30, 30) +BC_FIELD(CF_ALU_WORD1_EXT, BARRIER, B, 31, 31) +BC_FORMAT_END(CF_ALU_WORD1_EXT) + + +BC_FORMAT_BEGIN(CF_ALLOC_EXPORT_WORD0) +BC_FIELD(CF_ALLOC_EXPORT_WORD0, ARRAY_BASE, ARR_BS, 12, 0) +BC_FIELD(CF_ALLOC_EXPORT_WORD0, TYPE, TYPE, 14, 13) +BC_FIELD(CF_ALLOC_EXPORT_WORD0, RW_GPR, RW_GPR, 21, 15) +BC_FIELD(CF_ALLOC_EXPORT_WORD0, RW_REL, RW_REL, 22, 22) +BC_FIELD(CF_ALLOC_EXPORT_WORD0, INDEX_GPR, IND_GPR, 29, 23) +BC_FIELD(CF_ALLOC_EXPORT_WORD0, ELEM_SIZE, ES, 31, 30) +BC_FORMAT_END(CF_ALLOC_EXPORT_WORD0) + +BC_FORMAT_BEGIN_HW(CF_ALLOC_EXPORT_WORD0_RAT, EGCM) +BC_FIELD(CF_ALLOC_EXPORT_WORD0_RAT, RAT_ID, R_ID, 3, 0) +BC_FIELD(CF_ALLOC_EXPORT_WORD0_RAT, RAT_INST, R_INST, 9, 4) +BC_RSRVD(CF_ALLOC_EXPORT_WORD0_RAT, 10, 10) +BC_FIELD(CF_ALLOC_EXPORT_WORD0_RAT, RAT_INDEX_MODE, RIM, 12, 11) +BC_FIELD(CF_ALLOC_EXPORT_WORD0_RAT, TYPE, TYPE, 14, 13) +BC_FIELD(CF_ALLOC_EXPORT_WORD0_RAT, RW_GPR, RW_GPR, 21, 15) +BC_FIELD(CF_ALLOC_EXPORT_WORD0_RAT, RW_REL, RW_REL, 22, 22) +BC_FIELD(CF_ALLOC_EXPORT_WORD0_RAT, INDEX_GPR, IND_GPR, 29, 23) 
+BC_FIELD(CF_ALLOC_EXPORT_WORD0_RAT, ELEM_SIZE, ES, 31, 30) +BC_FORMAT_END(CF_ALLOC_EXPORT_WORD0_RAT) + +BC_FORMAT_BEGIN_HW(CF_ALLOC_EXPORT_WORD1_BUF, R6R7) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, ARRAY_SIZE, ARR_SZ, 11, 0) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, COMP_MASK, MASK, 15, 12) +BC_RSRVD(CF_ALLOC_EXPORT_WORD1_BUF, 16, 16) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, BURST_COUNT, BURST, 20, 17) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, END_OF_PROGRAM, EOP, 21, 21) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, VALID_PIXEL_MODE, VPM, 22, 22) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, CF_INST, CF_INST, 29, 23) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, WHOLE_QUAD_MODE, WQM, 30, 30) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, BARRIER, B, 31, 31) +BC_FORMAT_END(CF_ALLOC_EXPORT_WORD1_BUF) + +BC_FORMAT_BEGIN_HW(CF_ALLOC_EXPORT_WORD1_BUF, EG) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, ARRAY_SIZE, ARR_SZ, 11, 0) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, COMP_MASK, MASK, 15, 12) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, BURST_COUNT, BURST, 19, 16) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, VALID_PIXEL_MODE, VPM, 20, 20) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, END_OF_PROGRAM, EOP, 21, 21) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, CF_INST, CF_INST, 29, 22) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, MARK, MARK, 30, 30) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, BARRIER, B, 31, 31) +BC_FORMAT_END(CF_ALLOC_EXPORT_WORD1_BUF) + +BC_FORMAT_BEGIN_HW(CF_ALLOC_EXPORT_WORD1_BUF, CM) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, ARRAY_SIZE, ARR_SZ, 11, 0) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, COMP_MASK, MASK, 15, 12) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, BURST_COUNT, BURST, 19, 16) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, VALID_PIXEL_MODE, VPM, 20, 20) +BC_RSRVD(CF_ALLOC_EXPORT_WORD1_BUF, 21, 21) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, CF_INST, CF_INST, 29, 22) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, MARK, MARK, 30, 30) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_BUF, BARRIER, B, 31, 31) +BC_FORMAT_END(CF_ALLOC_EXPORT_WORD1_BUF) + +BC_FORMAT_BEGIN_HW(CF_ALLOC_EXPORT_WORD1_SWIZ, R6R7) 
+BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, SEL_X, SEL_X, 2, 0) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, SEL_Y, SEL_Y, 5, 3) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, SEL_Z, SEL_Z, 8, 6) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, SEL_W, SEL_W, 11, 9) +BC_RSRVD(CF_ALLOC_EXPORT_WORD1_SWIZ, 16, 12) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, BURST_COUNT, BURST, 20, 17) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, END_OF_PROGRAM, EOP, 21, 21) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, VALID_PIXEL_MODE, VPM, 22, 22) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, CF_INST, CF_INST, 29, 23) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, WHOLE_QUAD_MODE, WQM, 30, 30) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, BARRIER, B, 31, 31) +BC_FORMAT_END(CF_ALLOC_EXPORT_WORD1_SWIZ) + +BC_FORMAT_BEGIN_HW(CF_ALLOC_EXPORT_WORD1_SWIZ, EG) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, SEL_X, SEL_X, 2, 0) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, SEL_Y, SEL_Y, 5, 3) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, SEL_Z, SEL_Z, 8, 6) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, SEL_W, SEL_W, 11, 9) +BC_RSRVD(CF_ALLOC_EXPORT_WORD1_SWIZ, 15, 12) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, BURST_COUNT, BURST, 19, 16) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, VALID_PIXEL_MODE, VPM, 20, 20) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, END_OF_PROGRAM, EOP, 21, 21) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, CF_INST, CF_INST, 29, 22) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, MARK, M, 30, 30) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, BARRIER, B, 31, 31) +BC_FORMAT_END(CF_ALLOC_EXPORT_WORD1_SWIZ) + +BC_FORMAT_BEGIN_HW(CF_ALLOC_EXPORT_WORD1_SWIZ, CM) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, SEL_X, SEL_X, 2, 0) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, SEL_Y, SEL_Y, 5, 3) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, SEL_Z, SEL_Z, 8, 6) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, SEL_W, SEL_W, 11, 9) +BC_RSRVD(CF_ALLOC_EXPORT_WORD1_SWIZ, 15, 12) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, BURST_COUNT, BURST, 19, 16) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, VALID_PIXEL_MODE, VPM, 20, 20) +BC_RSRVD(CF_ALLOC_EXPORT_WORD1_SWIZ, 21, 21) 
+BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, CF_INST, CF_INST, 29, 22) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, MARK, M, 30, 30) +BC_FIELD(CF_ALLOC_EXPORT_WORD1_SWIZ, BARRIER, B, 31, 31) +BC_FORMAT_END(CF_ALLOC_EXPORT_WORD1_SWIZ) + +// ALU + +BC_FORMAT_BEGIN(ALU_WORD0) +BC_FIELD(ALU_WORD0, SRC0_SEL, S0S, 8, 0) +BC_FIELD(ALU_WORD0, SRC0_REL, S0R, 9, 9) +BC_FIELD(ALU_WORD0, SRC0_CHAN, S0C, 11, 10) +BC_FIELD(ALU_WORD0, SRC0_NEG, S0N, 12, 12) +BC_FIELD(ALU_WORD0, SRC1_SEL, S1S, 21, 13) +BC_FIELD(ALU_WORD0, SRC1_REL, S1R, 22, 22) +BC_FIELD(ALU_WORD0, SRC1_CHAN, S1C, 24, 23) +BC_FIELD(ALU_WORD0, SRC1_NEG, S1N, 25, 25) +BC_FIELD(ALU_WORD0, INDEX_MODE, IM, 28, 26) +BC_FIELD(ALU_WORD0, PRED_SEL, PS, 30, 29) +BC_FIELD(ALU_WORD0, LAST, L, 31, 31) +BC_FORMAT_END(ALU_WORD0) + +BC_FORMAT_BEGIN_HW(ALU_WORD1_OP2, R6) +BC_FIELD(ALU_WORD1_OP2, SRC0_ABS, S0A, 0, 0) +BC_FIELD(ALU_WORD1_OP2, SRC1_ABS, S1A, 1, 1) +BC_FIELD(ALU_WORD1_OP2, UPDATE_EXEC_MASK, UEM, 2, 2) +BC_FIELD(ALU_WORD1_OP2, UPDATE_PRED, UP, 3, 3) +BC_FIELD(ALU_WORD1_OP2, WRITE_MASK, WM, 4, 4) +BC_FIELD(ALU_WORD1_OP2, FOG_MERGE, FM, 5, 5) +BC_FIELD(ALU_WORD1_OP2, OMOD, OMOD, 7, 6) +BC_FIELD(ALU_WORD1_OP2, ALU_INST, INST, 17, 8) +BC_FIELD(ALU_WORD1_OP2, BANK_SWIZZLE, BS, 20, 18) +BC_FIELD(ALU_WORD1_OP2, DST_GPR, DGPR, 27, 21) +BC_FIELD(ALU_WORD1_OP2, DST_REL, DR, 28, 28) +BC_FIELD(ALU_WORD1_OP2, DST_CHAN, DC, 30, 29) +BC_FIELD(ALU_WORD1_OP2, CLAMP, C, 31, 31) +BC_FORMAT_END(ALU_WORD1_OP2) + +BC_FORMAT_BEGIN_HW(ALU_WORD1_OP2, R7EGCM) +BC_FIELD(ALU_WORD1_OP2, SRC0_ABS, S0A, 0, 0) +BC_FIELD(ALU_WORD1_OP2, SRC1_ABS, S1A, 1, 1) +BC_FIELD(ALU_WORD1_OP2, UPDATE_EXEC_MASK, UEM, 2, 2) +BC_FIELD(ALU_WORD1_OP2, UPDATE_PRED, UP, 3, 3) +BC_FIELD(ALU_WORD1_OP2, WRITE_MASK, WM, 4, 4) +BC_FIELD(ALU_WORD1_OP2, OMOD, OMOD, 6, 5) +BC_FIELD(ALU_WORD1_OP2, ALU_INST, INST, 17, 7) +BC_FIELD(ALU_WORD1_OP2, BANK_SWIZZLE, BS, 20, 18) +BC_FIELD(ALU_WORD1_OP2, DST_GPR, DGPR, 27, 21) +BC_FIELD(ALU_WORD1_OP2, DST_REL, DR, 28, 28) +BC_FIELD(ALU_WORD1_OP2, 
DST_CHAN, DC, 30, 29) +BC_FIELD(ALU_WORD1_OP2, CLAMP, C, 31, 31) +BC_FORMAT_END(ALU_WORD1_OP2) + +BC_FORMAT_BEGIN_HW(ALU_WORD1_OP2_MOVA, CM) +BC_FIELD(ALU_WORD1_OP2_MOVA, SRC0_ABS, S0A, 0, 0) +BC_FIELD(ALU_WORD1_OP2_MOVA, SRC1_ABS, S1A, 1, 1) +BC_FIELD(ALU_WORD1_OP2_MOVA, UPDATE_EXEC_MASK, UEM, 2, 2) +BC_FIELD(ALU_WORD1_OP2_MOVA, UPDATE_PRED, UP, 3, 3) +BC_FIELD(ALU_WORD1_OP2_MOVA, WRITE_MASK, WM, 4, 4) +BC_FIELD(ALU_WORD1_OP2_MOVA, OMOD, OMOD, 6, 5) +BC_FIELD(ALU_WORD1_OP2_MOVA, ALU_INST, INST, 17, 7) +BC_FIELD(ALU_WORD1_OP2_MOVA, BANK_SWIZZLE, BS, 20, 18) +BC_FIELD(ALU_WORD1_OP2_MOVA, MOVA_DST, MOVA_DST, 27, 21) +BC_FIELD(ALU_WORD1_OP2_MOVA, DST_REL, DR, 28, 28) +BC_FIELD(ALU_WORD1_OP2_MOVA, DST_CHAN, DC, 30, 29) +BC_FIELD(ALU_WORD1_OP2_MOVA, CLAMP, C, 31, 31) +BC_FORMAT_END(ALU_WORD1_OP2_MOVA) + +BC_FORMAT_BEGIN_HW(ALU_WORD1_OP2_EXEC_MASK, CM) +BC_FIELD(ALU_WORD1_OP2_EXEC_MASK, SRC0_ABS, S0A, 0, 0) +BC_FIELD(ALU_WORD1_OP2_EXEC_MASK, SRC1_ABS, S1A, 1, 1) +BC_FIELD(ALU_WORD1_OP2_EXEC_MASK, UPDATE_EXEC_MASK, UEM, 2, 2) +BC_FIELD(ALU_WORD1_OP2_EXEC_MASK, UPDATE_PRED, UP, 3, 3) +BC_FIELD(ALU_WORD1_OP2_EXEC_MASK, WRITE_MASK, WM, 4, 4) +BC_FIELD(ALU_WORD1_OP2_EXEC_MASK, EXECUTE_MASK_OP, EMO, 6, 5) +BC_FIELD(ALU_WORD1_OP2_EXEC_MASK, ALU_INST, INST, 17, 7) +BC_FIELD(ALU_WORD1_OP2_EXEC_MASK, BANK_SWIZZLE, BS, 20, 18) +BC_FIELD(ALU_WORD1_OP2_EXEC_MASK, DST_GPR, DGPR, 27, 21) +BC_FIELD(ALU_WORD1_OP2_EXEC_MASK, DST_REL, DR, 28, 28) +BC_FIELD(ALU_WORD1_OP2_EXEC_MASK, DST_CHAN, DC, 30, 29) +BC_FIELD(ALU_WORD1_OP2_EXEC_MASK, CLAMP, C, 31, 31) +BC_FORMAT_END(ALU_WORD1_OP2_EXEC_MASK) + +BC_FORMAT_BEGIN(ALU_WORD1_OP3) +BC_FIELD(ALU_WORD1_OP3, SRC2_SEL, S2S, 8, 0) +BC_FIELD(ALU_WORD1_OP3, SRC2_REL, S2R, 9, 9) +BC_FIELD(ALU_WORD1_OP3, SRC2_CHAN, S2C, 11, 10) +BC_FIELD(ALU_WORD1_OP3, SRC2_NEG, S2N, 12, 12) +BC_FIELD(ALU_WORD1_OP3, ALU_INST, INST, 17, 13) +BC_FIELD(ALU_WORD1_OP3, BANK_SWIZZLE, BS, 20, 18) +BC_FIELD(ALU_WORD1_OP3, DST_GPR, DGPR, 27, 21) +BC_FIELD(ALU_WORD1_OP3, 
DST_REL, DR, 28, 28)
+BC_FIELD(ALU_WORD1_OP3, DST_CHAN, DC, 30, 29)
+BC_FIELD(ALU_WORD1_OP3, CLAMP, C, 31, 31)
+BC_FORMAT_END(ALU_WORD1_OP3)
+
+
+BC_FORMAT_BEGIN_HW(ALU_WORD0_LDS_IDX_OP, EGCM)
+BC_FIELD(ALU_WORD0_LDS_IDX_OP, SRC0_SEL, S0S, 8, 0)
+BC_FIELD(ALU_WORD0_LDS_IDX_OP, SRC0_REL, S0R, 9, 9)
+BC_FIELD(ALU_WORD0_LDS_IDX_OP, SRC0_CHAN, S0C, 11, 10)
+BC_FIELD(ALU_WORD0_LDS_IDX_OP, IDX_OFFSET_4, IO4, 12, 12)
+BC_FIELD(ALU_WORD0_LDS_IDX_OP, SRC1_SEL, S1S, 21, 13)
+BC_FIELD(ALU_WORD0_LDS_IDX_OP, SRC1_REL, S1R, 22, 22)
+BC_FIELD(ALU_WORD0_LDS_IDX_OP, SRC1_CHAN, S1C, 24, 23)
+BC_FIELD(ALU_WORD0_LDS_IDX_OP, IDX_OFFSET_5, IO5, 25, 25)
+BC_FIELD(ALU_WORD0_LDS_IDX_OP, INDEX_MODE, IM, 28, 26)
+BC_FIELD(ALU_WORD0_LDS_IDX_OP, PRED_SEL, PS, 30, 29)
+BC_FIELD(ALU_WORD0_LDS_IDX_OP, LAST, L, 31, 31)
+BC_FORMAT_END(ALU_WORD0_LDS_IDX_OP)
+
+BC_FORMAT_BEGIN_HW(ALU_WORD1_LDS_IDX_OP, EGCM)
+BC_FIELD(ALU_WORD1_LDS_IDX_OP, SRC2_SEL, S2S, 8, 0)
+BC_FIELD(ALU_WORD1_LDS_IDX_OP, SRC2_REL, S2R, 9, 9)
+BC_FIELD(ALU_WORD1_LDS_IDX_OP, SRC2_CHAN, S2C, 11, 10)
+BC_FIELD(ALU_WORD1_LDS_IDX_OP, IDX_OFFSET_1, IO1, 12, 12)
+BC_FIELD(ALU_WORD1_LDS_IDX_OP, ALU_INST, INST, 17, 13)
+BC_FIELD(ALU_WORD1_LDS_IDX_OP, BANK_SWIZZLE, BS, 20, 18)
+BC_FIELD(ALU_WORD1_LDS_IDX_OP, LDS_OP, LDS_OP, 26, 21)
+BC_FIELD(ALU_WORD1_LDS_IDX_OP, IDX_OFFSET_0, IO0, 27, 27)
+BC_FIELD(ALU_WORD1_LDS_IDX_OP, IDX_OFFSET_2, IO2, 28, 28)
+BC_FIELD(ALU_WORD1_LDS_IDX_OP, DST_CHAN, DC, 30, 29)
+BC_FIELD(ALU_WORD1_LDS_IDX_OP, IDX_OFFSET_3, IO3, 31, 31)
+BC_FORMAT_END(ALU_WORD1_LDS_IDX_OP)
+
+
+// Low literal dword of an LDS direct access (operand A).
+// Bit ranges are (msb, lsb) and must not overlap: bit 22 is THREAD_REL_A,
+// so the trailing reserved range starts at bit 23.
+BC_FORMAT_BEGIN_HW(ALU_WORD1_LDS_DIRECT_LITERAL_LO, EGCM)
+BC_FIELD(ALU_WORD1_LDS_DIRECT_LITERAL_LO, OFFSET_A, OFS_A, 12, 0)
+BC_FIELD(ALU_WORD1_LDS_DIRECT_LITERAL_LO, STRIDE_A, STR_A, 19, 13)
+BC_RSRVD(ALU_WORD1_LDS_DIRECT_LITERAL_LO, 21, 20)
+BC_FIELD(ALU_WORD1_LDS_DIRECT_LITERAL_LO, THREAD_REL_A, THR_A, 22, 22)
+BC_RSRVD(ALU_WORD1_LDS_DIRECT_LITERAL_LO, 31, 23)
+BC_FORMAT_END(ALU_WORD1_LDS_DIRECT_LITERAL_LO)
+
+// High literal dword of an LDS direct access (operand B).
+// Bit ranges are (msb, lsb) and must not overlap: bit 22 is THREAD_REL_B,
+// so the reserved range between it and DIRECT_READ_32 is 30..23.
+BC_FORMAT_BEGIN_HW(ALU_WORD1_LDS_DIRECT_LITERAL_HI, EGCM)
+BC_FIELD(ALU_WORD1_LDS_DIRECT_LITERAL_HI, OFFSET_B, OFS_B, 12, 0)
+BC_FIELD(ALU_WORD1_LDS_DIRECT_LITERAL_HI, STRIDE_B, STR_B, 19, 13)
+BC_RSRVD(ALU_WORD1_LDS_DIRECT_LITERAL_HI, 21, 20)
+BC_FIELD(ALU_WORD1_LDS_DIRECT_LITERAL_HI, THREAD_REL_B, THR_B, 22, 22)
+BC_RSRVD(ALU_WORD1_LDS_DIRECT_LITERAL_HI, 30, 23)
+BC_FIELD(ALU_WORD1_LDS_DIRECT_LITERAL_HI, DIRECT_READ_32, DR32, 31, 31)
+BC_FORMAT_END(ALU_WORD1_LDS_DIRECT_LITERAL_HI)
+
+
+// VTX
+
+BC_FORMAT_BEGIN_HW(VTX_WORD0, R6R7EG)
+BC_FIELD(VTX_WORD0, VC_INST, INST, 4, 0)
+BC_FIELD(VTX_WORD0, FETCH_TYPE, FT, 6, 5)
+BC_FIELD(VTX_WORD0, FETCH_WHOLE_QUAD, FWQ, 7, 7)
+BC_FIELD(VTX_WORD0, BUFFER_ID, BUF_ID, 15, 8)
+BC_FIELD(VTX_WORD0, SRC_GPR, S_GPR, 22, 16)
+BC_FIELD(VTX_WORD0, SRC_REL, SR, 23, 23)
+BC_FIELD(VTX_WORD0, SRC_SEL_X, SSX, 25, 24)
+BC_FIELD(VTX_WORD0, MEGA_FETCH_COUNT, MFC, 31, 26)
+BC_FORMAT_END(VTX_WORD0)
+
+BC_FORMAT_BEGIN_HW(VTX_WORD0, CM)
+BC_FIELD(VTX_WORD0, VC_INST, INST, 4, 0)
+BC_FIELD(VTX_WORD0, FETCH_TYPE, FT, 6, 5)
+BC_FIELD(VTX_WORD0, FETCH_WHOLE_QUAD, FWQ, 7, 7)
+BC_FIELD(VTX_WORD0, BUFFER_ID, BUF_ID, 15, 8)
+BC_FIELD(VTX_WORD0, SRC_GPR, S_GPR, 22, 16)
+BC_FIELD(VTX_WORD0, SRC_REL, SR, 23, 23)
+BC_FIELD(VTX_WORD0, SRC_SEL_X, SSX, 25, 24)
+BC_FIELD(VTX_WORD0, SRC_SEL_Y, SSY, 27, 26)
+BC_FIELD(VTX_WORD0, STRUCTURED_READ, SR, 29, 28)
+BC_FIELD(VTX_WORD0, LDS_REQ, LR, 30, 30)
+BC_FIELD(VTX_WORD0, COALESCED_READ, CR, 31, 31)
+BC_FORMAT_END(VTX_WORD0)
+
+
+BC_FORMAT_BEGIN(VTX_WORD1_GPR)
+BC_FIELD(VTX_WORD1_GPR, DST_GPR, D_GPR, 6, 0)
+BC_FIELD(VTX_WORD1_GPR, DST_REL, DR, 7, 7)
+BC_RSRVD(VTX_WORD1_GPR, 8, 8)
+BC_FIELD(VTX_WORD1_GPR, DST_SEL_X, DSX, 11, 9)
+BC_FIELD(VTX_WORD1_GPR, DST_SEL_Y, DSY, 14, 12)
+BC_FIELD(VTX_WORD1_GPR, DST_SEL_Z, DSZ, 17, 15)
+BC_FIELD(VTX_WORD1_GPR, DST_SEL_W, DSW, 20, 18)
+BC_FIELD(VTX_WORD1_GPR, USE_CONST_FIELDS, UCF, 21, 21)
+BC_FIELD(VTX_WORD1_GPR, DATA_FORMAT, DFMT, 27, 22)
+BC_FIELD(VTX_WORD1_GPR, NUM_FORMAT_ALL,
NFA, 29, 28) +BC_FIELD(VTX_WORD1_GPR, FORMAT_COMP_ALL, FCA, 30, 30) +BC_FIELD(VTX_WORD1_GPR, SRF_MODE_ALL, SMA, 31, 31) +BC_FORMAT_END(VTX_WORD1_GPR) + +BC_FORMAT_BEGIN(VTX_WORD1_SEM) +BC_FIELD(VTX_WORD1_SEM, SEMANTIC_ID, SID, 7, 0) +BC_RSRVD(VTX_WORD1_SEM, 8, 8) +BC_FIELD(VTX_WORD1_SEM, DST_SEL_X, DSX, 11, 9) +BC_FIELD(VTX_WORD1_SEM, DST_SEL_Y, DSY, 14, 12) +BC_FIELD(VTX_WORD1_SEM, DST_SEL_Z, DSZ, 17, 15) +BC_FIELD(VTX_WORD1_SEM, DST_SEL_W, DSW, 20, 18) +BC_FIELD(VTX_WORD1_SEM, USE_CONST_FIELDS, UCF, 21, 21) +BC_FIELD(VTX_WORD1_SEM, DATA_FORMAT, DFMT, 27, 22) +BC_FIELD(VTX_WORD1_SEM, NUM_FORMAT_ALL, NFA, 29, 28) +BC_FIELD(VTX_WORD1_SEM, FORMAT_COMP_ALL, FCA, 30, 30) +BC_FIELD(VTX_WORD1_SEM, SRF_MODE_ALL, SMA, 31, 31) +BC_FORMAT_END(VTX_WORD1_SEM) + + +BC_FORMAT_BEGIN_HW(VTX_WORD2, R6) +BC_FIELD(VTX_WORD2, OFFSET, OFS, 15, 0) +BC_FIELD(VTX_WORD2, ENDIAN_SWAP, ES, 17, 16) +BC_FIELD(VTX_WORD2, CONST_BUF_NO_STRIDE, CBNS, 18, 18) +BC_FIELD(VTX_WORD2, MEGA_FETCH, MF, 19, 19) +BC_RSRVD(VTX_WORD2, 31, 20) +BC_FORMAT_END(VTX_WORD2) + +BC_FORMAT_BEGIN_HW(VTX_WORD2, R7) +BC_FIELD(VTX_WORD2, OFFSET, OFS, 15, 0) +BC_FIELD(VTX_WORD2, ENDIAN_SWAP, ES, 17, 16) +BC_FIELD(VTX_WORD2, CONST_BUF_NO_STRIDE, CBNS, 18, 18) +BC_FIELD(VTX_WORD2, MEGA_FETCH, MF, 19, 19) +BC_FIELD(VTX_WORD2, ALT_CONST, ALT_C, 20, 20) +BC_RSRVD(VTX_WORD2, 31, 21) +BC_FORMAT_END(VTX_WORD2) + +BC_FORMAT_BEGIN_HW(VTX_WORD2, EG) +BC_FIELD(VTX_WORD2, OFFSET, OFS, 15, 0) +BC_FIELD(VTX_WORD2, ENDIAN_SWAP, ES, 17, 16) +BC_FIELD(VTX_WORD2, CONST_BUF_NO_STRIDE, CBNS, 18, 18) +BC_FIELD(VTX_WORD2, MEGA_FETCH, MF, 19, 19) +BC_FIELD(VTX_WORD2, ALT_CONST, ALT_C, 20, 20) +BC_FIELD(VTX_WORD2, BUFFER_INDEX_MODE, BIM, 22, 21) +BC_RSRVD(VTX_WORD2, 31, 23) +BC_FORMAT_END(VTX_WORD2) + +BC_FORMAT_BEGIN_HW(VTX_WORD2, CM) +BC_FIELD(VTX_WORD2, OFFSET, OFS, 15, 0) +BC_FIELD(VTX_WORD2, ENDIAN_SWAP, ES, 17, 16) +BC_FIELD(VTX_WORD2, CONST_BUF_NO_STRIDE, CBNS, 18, 18) +BC_RSRVD(VTX_WORD2, 19, 19) +BC_FIELD(VTX_WORD2, ALT_CONST, ALT_C, 20, 
20) +BC_FIELD(VTX_WORD2, BUFFER_INDEX_MODE, BIM, 22, 21) +BC_RSRVD(VTX_WORD2, 31, 23) +BC_FORMAT_END(VTX_WORD2) + +// TEX + +BC_FORMAT_BEGIN_HW(TEX_WORD0, R6) +BC_FIELD(TEX_WORD0, TEX_INST, T_INST, 4, 0) +BC_FIELD(TEX_WORD0, BC_FRAC_MODE, BFM, 5, 5) +BC_RSRVD(TEX_WORD0, 6, 6) +BC_FIELD(TEX_WORD0, FETCH_WHOLE_QUAD, FWQ, 7, 7) +BC_FIELD(TEX_WORD0, RESOURCE_ID, RSRC_ID, 15, 8) +BC_FIELD(TEX_WORD0, SRC_GPR, S_GPR, 22, 16) +BC_FIELD(TEX_WORD0, SRC_REL, SR, 23, 23) +BC_RSRVD(TEX_WORD0, 31, 24) +BC_FORMAT_END(TEX_WORD0) + +BC_FORMAT_BEGIN_HW(TEX_WORD0, R7) +BC_FIELD(TEX_WORD0, TEX_INST, T_INST, 4, 0) +BC_FIELD(TEX_WORD0, BC_FRAC_MODE, BFM, 5, 5) +BC_RSRVD(TEX_WORD0, 6, 6) +BC_FIELD(TEX_WORD0, FETCH_WHOLE_QUAD, FWQ, 7, 7) +BC_FIELD(TEX_WORD0, RESOURCE_ID, RSRC_ID, 15, 8) +BC_FIELD(TEX_WORD0, SRC_GPR, S_GPR, 22, 16) +BC_FIELD(TEX_WORD0, SRC_REL, SR, 23, 23) +BC_FIELD(TEX_WORD0, ALT_CONST, ALT_C, 24, 24) +BC_RSRVD(TEX_WORD0, 31, 25) +BC_FORMAT_END(TEX_WORD0) + +BC_FORMAT_BEGIN_HW(TEX_WORD0, EGCM) +BC_FIELD(TEX_WORD0, TEX_INST, T_INST, 4, 0) +BC_FIELD(TEX_WORD0, INST_MOD, IMOD, 6, 5) +BC_FIELD(TEX_WORD0, FETCH_WHOLE_QUAD, FWQ, 7, 7) +BC_FIELD(TEX_WORD0, RESOURCE_ID, RSRC_ID, 15, 8) +BC_FIELD(TEX_WORD0, SRC_GPR, S_GPR, 22, 16) +BC_FIELD(TEX_WORD0, SRC_REL, SR, 23, 23) +BC_FIELD(TEX_WORD0, ALT_CONST, ALT_C, 24, 24) +BC_FIELD(TEX_WORD0, RESOURCE_INDEX_MODE, RIM, 26, 25) +BC_FIELD(TEX_WORD0, SAMPLER_INDEX_MODE, SIM, 28, 27) +BC_RSRVD(TEX_WORD0, 31, 29) +BC_FORMAT_END(TEX_WORD0) + + +BC_FORMAT_BEGIN(TEX_WORD1) +BC_FIELD(TEX_WORD1, DST_GPR, D_GPR, 6, 0) +BC_FIELD(TEX_WORD1, DST_REL, DR, 7, 7) +BC_RSRVD(TEX_WORD1, 8, 8) +BC_FIELD(TEX_WORD1, DST_SEL_X, DSX, 11, 9) +BC_FIELD(TEX_WORD1, DST_SEL_Y, DSY, 14, 12) +BC_FIELD(TEX_WORD1, DST_SEL_Z, DSZ, 17, 15) +BC_FIELD(TEX_WORD1, DST_SEL_W, DSW, 20, 18) +BC_FIELD(TEX_WORD1, LOD_BIAS, LBIAS, 27, 21) +BC_FIELD(TEX_WORD1, COORD_TYPE_X, CTX, 28, 28) +BC_FIELD(TEX_WORD1, COORD_TYPE_Y, CTY, 29, 29) +BC_FIELD(TEX_WORD1, COORD_TYPE_Z, CTZ, 30, 30) 
+BC_FIELD(TEX_WORD1, COORD_TYPE_W, CTW, 31, 31) +BC_FORMAT_END(TEX_WORD1) + + +BC_FORMAT_BEGIN(TEX_WORD2) +BC_FIELD(TEX_WORD2, OFFSET_X, OFS_X, 4, 0) +BC_FIELD(TEX_WORD2, OFFSET_Y, OFS_Y, 9, 5) +BC_FIELD(TEX_WORD2, OFFSET_Z, OFS_Z, 14, 10) +BC_FIELD(TEX_WORD2, SAMPLER_ID, SAMP_ID, 19, 15) +BC_FIELD(TEX_WORD2, SRC_SEL_X, SSX, 22, 20) +BC_FIELD(TEX_WORD2, SRC_SEL_Y, SSY, 25, 23) +BC_FIELD(TEX_WORD2, SRC_SEL_Z, SSZ, 28, 26) +BC_FIELD(TEX_WORD2, SRC_SEL_W, SSW, 31, 29) +BC_FORMAT_END(TEX_WORD2) diff --git a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp new file mode 100644 index 00000000000..fa9e2e0e38d --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp @@ -0,0 +1,763 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Vadim Girlin + */ + +#define BCP_DEBUG 0 + +#if BCP_DEBUG +#define BCP_DUMP(q) do { q } while (0) +#else +#define BCP_DUMP(q) +#endif + +extern "C" { +#include "r600_pipe.h" +#include "r600_shader.h" +} + +#include <stack> + +#include "sb_bc.h" +#include "sb_shader.h" +#include "sb_pass.h" + +namespace r600_sb { + +using std::cerr; + +int bc_parser::parse() { + + dw = bc->bytecode; + bc_ndw = bc->ndw; + max_cf = 0; + + dec = new bc_decoder(ctx, dw, bc_ndw); + + shader_target t = TARGET_UNKNOWN; + + if (pshader) { + switch (bc->type) { + case TGSI_PROCESSOR_FRAGMENT: t = TARGET_PS; break; + case TGSI_PROCESSOR_VERTEX: t = TARGET_VS; break; + case TGSI_PROCESSOR_COMPUTE: t = TARGET_COMPUTE; break; + default: assert(!"unknown shader target"); return -1; break; + } + } else { + if (bc->type == TGSI_PROCESSOR_COMPUTE) + t = TARGET_COMPUTE; + else + t = TARGET_FETCH; + } + + sh = new shader(ctx, t, bc->debug_id, enable_dump); + int r = parse_shader(); + + if (r) + return r; + + if (enable_dump) { + sh->ngpr = bc->ngpr; + sh->nstack = bc->nstack; + bc_dump(*sh, cerr, bc->bytecode, bc_ndw).run(); + } + + if (!optimize) + return 0; + + prepare_ir(); + + delete dec; + return r; +} + +int bc_parser::parse_shader() { + int r = 0; + unsigned i = 0; + bool eop = false; + + sh->init(); + + if (pshader) + parse_decls(); + + do { + eop = false; + if ((r = parse_cf(i, eop))) + return r; + + } while (!eop || (i >> 1) <= max_cf); + + return 0; +} + +int bc_parser::parse_decls() { + +// sh->prepare_regs(rs.bc.ngpr); + + if (pshader->indirect_files & ~(1 << TGSI_FILE_CONSTANT)) { + +#if SB_NO_ARRAY_INFO + + sh->add_gpr_array(0, pshader->bc.ngpr, 0b1111); + +#else + + assert(pshader->num_arrays); + + if (pshader->num_arrays) { + + for (unsigned i = 0; i < pshader->num_arrays; ++i) { + r600_shader_array &a = pshader->arrays[i]; + sh->add_gpr_array(a.gpr_start, a.gpr_count, a.comp_mask); + } + + } else { + sh->add_gpr_array(0, pshader->bc.ngpr, 0b1111); + } + + +#endif 
+ + } + + if (sh->target == TARGET_VS) + sh->add_input(0, 1, 0b1111); + + bool ps_interp = ctx.hw_class >= HW_CLASS_EVERGREEN + && sh->target == TARGET_PS; + + unsigned linear = 0, persp = 0, centroid = 1; + + for (unsigned i = 0; i < pshader->ninput; ++i) { + r600_shader_io & in = pshader->input[i]; + bool preloaded = sh->target == TARGET_PS && !(ps_interp && in.spi_sid); + sh->add_input(in.gpr, preloaded, /*in.write_mask*/ 0b1111); + if (ps_interp && in.spi_sid) { + if (in.interpolate == TGSI_INTERPOLATE_LINEAR || + in.interpolate == TGSI_INTERPOLATE_COLOR) + linear = 1; + else if (in.interpolate == TGSI_INTERPOLATE_PERSPECTIVE) + persp = 1; + if (in.centroid) + centroid = 2; + } + } + + if (ps_interp) { + unsigned mask = (1 << (2 * (linear + persp) * centroid)) - 1; + unsigned gpr = 0; + + while (mask) { + sh->add_input(gpr, true, mask & 0b1111); + ++gpr; + mask >>= 4; + } + } + + + return 0; +} + + +int bc_parser::parse_cf(unsigned &i, bool &eop) { + + int r; + + cf_node *cf = sh->create_cf(); + sh->root->push_back(cf); + + unsigned id = i >> 1; + + cf->bc.id = id; + + if (cf_map.size() < id + 1) + cf_map.resize(id + 1); + + cf_map[id] = cf; + + if ((r = dec->decode_cf(i, cf->bc))) + return r; + + cf_op_flags flags = (cf_op_flags)cf->bc.op_ptr->flags; + + if (flags & CF_ALU) { + if ((r = parse_alu_clause(cf))) + return r; + } else if (flags & CF_FETCH) { + if ((r = parse_fetch_clause(cf))) + return r;; + } else if (flags & CF_EXP) { + assert(!cf->bc.rw_rel); + } else if (flags & (CF_STRM | CF_RAT)) { + assert(!cf->bc.rw_rel); + } else if (cf->bc.op == CF_OP_CALL_FS) { + sh->init_call_fs(cf); + cf->flags |= NF_SCHEDULE_EARLY | NF_DONT_MOVE; + } else if (flags & CF_BRANCH) { + if (cf->bc.addr > max_cf) + max_cf = cf->bc.addr; + } + + eop = cf->bc.end_of_program || cf->bc.op == CF_OP_CF_END || + cf->bc.op == CF_OP_RET; + return 0; +} + +int bc_parser::parse_alu_clause(cf_node* cf) { + unsigned i = cf->bc.addr << 1, cnt = cf->bc.count + 1, gcnt; + + cgroup = 0; + 
memset(slots[0], 0, 5*sizeof(slots[0][0])); + + unsigned ng = 0; + + do { + parse_alu_group(cf, i, gcnt); + assert(gcnt <= cnt); + cnt -= gcnt; + ng++; + } while (cnt); + + return 0; +} + +int bc_parser::parse_alu_group(cf_node* cf, unsigned &i, unsigned &gcnt) { + int r; + alu_node *n; + alu_group_node *g = sh->create_alu_group(); + + cgroup = !cgroup; + memset(slots[cgroup], 0, 5*sizeof(slots[0][0])); + + gcnt = 0; + + do { + n = sh->create_alu(); + g->push_back(n); + + if ((r = dec->decode_alu(i, n->bc))) + return r; + + if (!sh->assign_slot(n, slots[cgroup])) { + assert(!"alu slot assignment failed"); + return -1; + } + + gcnt++; + + } while (gcnt <= 5 && !n->bc.last); + + assert(n->bc.last); + + unsigned literal_mask = 0; + + for (node_iterator I = g->begin(), E = g->end(); + I != E; ++I) { + n = static_cast<alu_node*>(*I); + unsigned src_count = n->bc.op_ptr->src_count; + + if (ctx.alu_slots(n->bc.op) & AF_4SLOT) + n->flags |= NF_ALU_4SLOT; + + n->src.resize(src_count); + + unsigned flags = n->bc.op_ptr->flags; + + if (flags & AF_PRED) { + n->dst.resize(3); + if (n->bc.update_pred) + n->dst[1] = sh->get_special_value(SV_ALU_PRED); + if (n->bc.update_exec_mask) + n->dst[2] = sh->get_special_value(SV_EXEC_MASK); + + n->flags |= NF_DONT_HOIST; + + } else if (flags & AF_KILL) { + + n->dst.resize(2); + n->dst[1] = sh->get_special_value(SV_VALID_MASK); + sh->set_uses_kill(); + + n->flags |= NF_DONT_HOIST | NF_DONT_MOVE | + NF_DONT_KILL | NF_SCHEDULE_EARLY; + + } else { + n->dst.resize(1); + } + + if (flags & AF_MOVA) { + + n->dst[0] = sh->get_special_value(SV_AR_INDEX); + + n->flags |= NF_DONT_HOIST; + + } else if (n->bc.op_ptr->src_count == 3 || n->bc.write_mask) { + assert(!n->bc.dst_rel || n->bc.index_mode == INDEX_AR_X); + + value *v = sh->get_gpr_value(false, n->bc.dst_gpr, n->bc.dst_chan, + n->bc.dst_rel); + + n->dst[0] = v; + } + + if (n->bc.pred_sel) { + sh->has_alu_predication = true; + n->pred = sh->get_special_value(SV_ALU_PRED); + } + + for (unsigned s 
= 0; s < src_count; ++s) { + bc_alu_src &src = n->bc.src[s]; + + if (src.sel == ALU_SRC_LITERAL) { + unsigned chan = src.chan; + + literal_mask |= (1 << chan); + src.value.u = dw[i+chan]; + n->src[s] = sh->get_const_value(src.value); + } else if (src.sel == ALU_SRC_PS || src.sel == ALU_SRC_PV) { + unsigned pgroup = !cgroup, prev_slot = src.sel == ALU_SRC_PS ? + SLOT_TRANS : src.chan; + alu_node *prev_alu = slots[pgroup][prev_slot]; + + assert(prev_alu); + + if (!prev_alu->dst[0]) { + value * t = sh->create_temp_value(); + prev_alu->dst[0] = t; + } + + value *d = prev_alu->dst[0]; + + if (d->is_rel()) { + d = sh->get_gpr_value(true, prev_alu->bc.dst_gpr, + prev_alu->bc.dst_chan, + prev_alu->bc.dst_rel); + } + + n->src[s] = d; + } else if (ctx.is_kcache_sel(src.sel)) { + unsigned sel = src.sel, kc_addr; + unsigned kc_set = ((sel >> 7) & 2) + ((sel >> 5) & 1); + + bc_kcache &kc = cf->bc.kc[kc_set]; + kc_addr = (kc.addr << 4) + (sel & 0x1F); + n->src[s] = sh->get_kcache_value(kc.bank, kc_addr, src.chan); + } else if (src.sel < MAX_GPR) { + value *v = sh->get_gpr_value(true, src.sel, src.chan, src.rel); + + n->src[s] = v; + + } else if (src.sel >= ALU_SRC_PARAM_OFFSET) { + // using slot for value channel because in fact the slot + // determines the channel that is loaded by INTERP_LOAD_P0 + // (and maybe some others). 
+ // otherwise GVN will consider INTERP_LOAD_P0s with the same + // param index as equal instructions and leave only one of them + n->src[s] = sh->get_special_ro_value(sel_chan(src.sel, + n->bc.slot)); + } else { + switch (src.sel) { + case ALU_SRC_0: + n->src[s] = sh->get_const_value(0); + break; + case ALU_SRC_0_5: + n->src[s] = sh->get_const_value(0.5f); + break; + case ALU_SRC_1: + n->src[s] = sh->get_const_value(1.0f); + break; + case ALU_SRC_1_INT: + n->src[s] = sh->get_const_value(1); + break; + case ALU_SRC_M_1_INT: + n->src[s] = sh->get_const_value(-1); + break; + default: + n->src[s] = sh->get_special_ro_value(src.sel); + break; + } + } + } + } + + // pack multislot instructions into alu_packed_node + + alu_packed_node *p = NULL; + for (node_iterator N, I = g->begin(), E = g->end(); I != E; I = N) { + N = I + 1; + alu_node *a = static_cast<alu_node*>(*I); + unsigned sflags = a->bc.slot_flags; + + if (sflags == AF_4V || (ctx.is_cayman() && sflags == AF_S)) { + if (!p) + p = sh->create_alu_packed(); + + a->remove(); + p->push_back(a); + } + } + + if (p) { + g->push_front(p); + } + + unsigned literal_ndw = 0; + while (literal_mask) { + g->literals.push_back(dw[i + literal_ndw]); + literal_ndw += 1; + literal_mask >>= 1; + } + + literal_ndw = (literal_ndw + 1) & ~1u; + + i += literal_ndw; + gcnt += literal_ndw >> 1; + + cf->push_back(g); + return 0; +} + +int bc_parser::parse_fetch_clause(cf_node* cf) { + int r; + unsigned i = cf->bc.addr << 1, cnt = cf->bc.count + 1; + + vvec grad_v, grad_h; + + while (cnt--) { + fetch_node *n = sh->create_fetch(); + cf->push_back(n); + if ((r = dec->decode_fetch(i, n->bc))) + return r; + + unsigned flags = n->bc.op_ptr->flags; + + unsigned vtx = flags & FF_VTX; + unsigned num_src = vtx ? 
ctx.vtx_src_num : 4; + + n->dst.resize(4); + + if (flags & (FF_SETGRAD | FF_USEGRAD | FF_GETGRAD)) { + sh->uses_gradients = true; + } + + if (flags & FF_SETGRAD) { + + vvec *grad = NULL; + + switch (n->bc.op) { + case FETCH_OP_SET_GRADIENTS_V: + grad = &grad_v; + break; + case FETCH_OP_SET_GRADIENTS_H: + grad = &grad_h; + break; + default: + assert(!"unexpected SET_GRAD instruction"); + return -1; + } + + if (grad->empty()) + grad->resize(4); + + for(unsigned s = 0; s < 4; ++s) { + unsigned sw = n->bc.src_sel[s]; + if (sw <= SEL_W) + (*grad)[s] = sh->get_gpr_value(true, n->bc.src_gpr, + sw, false); + else if (sw == SEL_0) + (*grad)[s] = sh->get_const_value(0.0f); + else if (sw == SEL_1) + (*grad)[s] = sh->get_const_value(1.0f); + } + } else { + + if (flags & FF_USEGRAD) { + n->src.resize(12); + std::copy(grad_v.begin(), grad_v.end(), n->src.begin() + 4); + std::copy(grad_h.begin(), grad_h.end(), n->src.begin() + 8); + } else { + n->src.resize(4); + } + + for(int s = 0; s < 4; ++s) { + if (n->bc.dst_sel[s] != SEL_MASK) + n->dst[s] = sh->get_gpr_value(false, n->bc.dst_gpr, s, false); + // NOTE: it doesn't matter here which components of the result we + // are using, but original n->bc.dst_sel should be taken into + // account when building the bytecode + } + for(unsigned s = 0; s < num_src; ++s) { + if (n->bc.src_sel[s] <= SEL_W) + n->src[s] = sh->get_gpr_value(true, n->bc.src_gpr, + n->bc.src_sel[s], false); + } + + } + } + return 0; +} + +int bc_parser::prepare_ir() { + + for(id_cf_map::iterator I = cf_map.begin(), E = cf_map.end(); I != E; ++I) { + cf_node *c = *I; + + if (!c) + continue; + + unsigned flags = c->bc.op_ptr->flags; + + if (flags & CF_LOOP_START) { + prepare_loop(c); + } else if (c->bc.op == CF_OP_JUMP) { + prepare_if(c); + } else if (c->bc.op == CF_OP_LOOP_END) { + loop_stack.pop(); + } else if (c->bc.op == CF_OP_LOOP_CONTINUE) { + assert(!loop_stack.empty()); + repeat_node *rep = sh->create_repeat(loop_stack.top()); + if (c->parent->first != c) + 
rep->move(c->parent->first, c); + c->replace_with(rep); + sh->simplify_dep_rep(rep); + } else if (c->bc.op == CF_OP_LOOP_BREAK) { + assert(!loop_stack.empty()); + depart_node *dep = sh->create_depart(loop_stack.top()); + if (c->parent->first != c) + dep->move(c->parent->first, c); + c->replace_with(dep); + sh->simplify_dep_rep(dep); + } else if (flags & CF_ALU && ctx.is_cayman()) { + // postprocess cayman's 3-slot instructions (ex-trans-only) + // FIXME it shouldn't be required with proper handling + prepare_alu_clause(c); + } else if (flags & CF_EXP) { + + // unroll burst exports + + assert(c->bc.op == CF_OP_EXPORT || c->bc.op == CF_OP_EXPORT_DONE); + + c->bc.set_op(CF_OP_EXPORT); + + unsigned burst_count = c->bc.burst_count; + unsigned eop = c->bc.end_of_program; + + c->bc.end_of_program = 0; + c->bc.burst_count = 0; + + do { + c->src.resize(4); + + for(int s = 0; s < 4; ++s) { + switch (c->bc.sel[s]) { + case SEL_0: + c->src[s] = sh->get_const_value(0.0f); + break; + case SEL_1: + c->src[s] = sh->get_const_value(1.0f); + break; + case SEL_MASK: + break; + default: + if (c->bc.sel[s] <= SEL_W) + c->src[s] = sh->get_gpr_value(true, c->bc.rw_gpr, + c->bc.sel[s], false); + else + assert(!"invalid src_sel for export"); + } + } + + if (!burst_count--) + break; + + cf_node *cf_next = sh->create_cf(); + cf_next->bc = c->bc; + ++cf_next->bc.rw_gpr; + ++cf_next->bc.array_base; + + c->insert_after(cf_next); + c = cf_next; + + } while (1); + + c->bc.end_of_program = eop; + } else if (flags & (CF_STRM | CF_RAT)) { + + unsigned burst_count = c->bc.burst_count; + unsigned eop = c->bc.end_of_program; + + c->bc.end_of_program = 0; + c->bc.burst_count = 0; + + do { + + c->src.resize(4); + + for(int s = 0; s < 4; ++s) { + if (c->bc.comp_mask & (1 << s)) + c->src[s] = + sh->get_gpr_value(true, c->bc.rw_gpr, s, false); + } + + if ((flags & CF_RAT) && (c->bc.type & 1)) { // indexed write + c->src.resize(8); + for(int s = 0; s < 3; ++s) { + c->src[4 + s] = + sh->get_gpr_value(true, 
c->bc.index_gpr, s, false); + } + + // FIXME probably we can relax it a bit + c->flags |= NF_DONT_HOIST | NF_DONT_MOVE; + } + + if (!burst_count--) + break; + + cf_node *cf_next = sh->create_cf(); + cf_next->bc = c->bc; + ++cf_next->bc.rw_gpr; + + // FIXME is it correct? + cf_next->bc.array_base += cf_next->bc.elem_size + 1; + + c->insert_after(cf_next); + c = cf_next; + } while (1); + + c->bc.end_of_program = eop; + + } + } + + assert(loop_stack.empty()); + return 0; +} + +int bc_parser::prepare_loop(cf_node* c) { + + cf_node *end = cf_map[c->bc.addr - 1]; + assert(end->bc.op == CF_OP_LOOP_END); + assert(c->parent == end->parent); + + region_node *reg = sh->create_region(); + repeat_node *rep = sh->create_repeat(reg); + + reg->push_back(rep); + c->insert_before(reg); + rep->move(c, end->next); + + loop_stack.push(reg); + return 0; +} + +int bc_parser::prepare_if(cf_node* c) { + cf_node *c_else = NULL, *end = cf_map[c->bc.addr]; + + BCP_DUMP( + cerr << "parsing JUMP @" << c->bc.id; + cerr << "\n"; + ); + + if (end->bc.op == CF_OP_ELSE) { + BCP_DUMP( + cerr << " found ELSE : "; + dump::dump_op(end); + cerr << "\n"; + ); + + c_else = end; + end = cf_map[c_else->bc.addr]; + } else { + BCP_DUMP( + cerr << " no else\n"; + ); + + c_else = end; + } + + if (c_else->parent != c->parent) + c_else = NULL; + + if (end->parent != c->parent) + end = NULL; + + region_node *reg = sh->create_region(); + + depart_node *dep2 = sh->create_depart(reg); + depart_node *dep = sh->create_depart(reg); + if_node *n_if = sh->create_if(); + + c->insert_before(reg); + + if (c_else != end) + dep->move(c_else, end); + dep2->move(c, end); + + reg->push_back(dep); + dep->push_front(n_if); + n_if->push_back(dep2); + + n_if->cond = sh->get_special_value(SV_EXEC_MASK); + + return 0; +} + +int bc_parser::prepare_alu_clause(cf_node* c) { + + // loop over alu groups + for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) { + assert(I->subtype == NST_ALU_GROUP); + + alu_group_node *g = 
static_cast<alu_group_node*>(*I); + + // loop over alu_group items + for (node_iterator I2 = g->begin(), E2 = g->end(); I2 != E2; ++I2) { + if (I2->subtype != NST_ALU_PACKED_INST) + continue; + + alu_packed_node *p = static_cast<alu_packed_node*>(*I2); + + if (p->count() == 3) { + // cayman's scalar instruction that takes 3 or 4 slots + + // FIXME for simplicity we'll always add 4th slot, + // but probably we might want to always remove 4th slot and make + // sure that regalloc won't choose w component for dst + + alu_node *f = static_cast<alu_node*>(p->first); + alu_node *a = sh->create_alu(); + a->src = f->src; + a->dst.resize(f->dst.size()); + a->bc = f->bc; + a->bc.slot = SLOT_W; + p->push_back(a); + } + } + } + + return 0; +} + +} // namespace r600_sb diff --git a/src/gallium/drivers/r600/sb/sb_context.cpp b/src/gallium/drivers/r600/sb/sb_context.cpp new file mode 100644 index 00000000000..7a259828d08 --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_context.cpp @@ -0,0 +1,82 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Vadim Girlin + */ + +#include "sb_bc.h" + +namespace r600_sb { + +unsigned sb_context::dump_pass = 0; +unsigned sb_context::dump_stat = 0; +unsigned sb_context::dry_run = 0; + +unsigned sb_context::dskip_start = 0; +unsigned sb_context::dskip_end = 0; +unsigned sb_context::dskip_mode = 0; + +int sb_context::init(r600_isa *isa, sb_hw_chip chip, sb_hw_class cclass) { + if (chip == HW_CHIP_UNKNOWN || cclass == HW_CLASS_UNKNOWN) + return -1; + + this->isa = isa; + + hw_chip = chip; + hw_class = cclass; + + alu_temp_gprs = 4; + + max_fetch = is_r600() ? 8 : 16; + + has_trans = !is_cayman(); + + vtx_src_num = 1; + + num_slots = has_trans ? 5 : 4; + + uses_mova_gpr = is_r600() && chip != HW_CHIP_RV670; + + switch (chip) { + case HW_CHIP_RV610: + case HW_CHIP_RS780: + case HW_CHIP_RV620: + case HW_CHIP_RS880: + + case HW_CHIP_RV630: + case HW_CHIP_RV635: + case HW_CHIP_RV730: + case HW_CHIP_RV710: + case HW_CHIP_PALM: + case HW_CHIP_CEDAR: + stack_entry_size = 8; + break; + default: + stack_entry_size = 4; + break; + } + + return 0; +} + +} diff --git a/src/gallium/drivers/r600/sb/sb_core.cpp b/src/gallium/drivers/r600/sb/sb_core.cpp new file mode 100644 index 00000000000..aec838dcacb --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_core.cpp @@ -0,0 +1,279 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to 
permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Vadim Girlin + */ + +#define SB_RA_SCHED_CHECK DEBUG + +extern "C" { +#include "os/os_time.h" +#include "r600_pipe.h" +#include "r600_shader.h" + +#include "sb_public.h" +} + +#include <stack> +#include <map> +#include <iostream> + +#include "sb_bc.h" +#include "sb_shader.h" +#include "sb_pass.h" +#include "sb_sched.h" + +using namespace r600_sb; + +using std::cerr; + +static sb_hw_class translate_chip_class(enum chip_class cc); +static sb_hw_chip translate_chip(enum radeon_family rf); + +sb_context *r600_sb_context_create(struct r600_context *rctx) { + + sb_context *sctx = new sb_context(); + + if (sctx->init(rctx->isa, translate_chip(rctx->family), + translate_chip_class(rctx->chip_class))) { + delete sctx; + sctx = NULL; + } + + unsigned df = rctx->screen->debug_flags; + + sb_context::dump_pass = df & DBG_SB_DUMP; + sb_context::dump_stat = df & DBG_SB_STAT; + sb_context::dry_run = df & DBG_SB_DRY_RUN; + + sb_context::dskip_start = debug_get_num_option("R600_SB_DSKIP_START", 0); + sb_context::dskip_end = debug_get_num_option("R600_SB_DSKIP_END", 0); + sb_context::dskip_mode = debug_get_num_option("R600_SB_DSKIP_MODE", 0); + + return sctx; +} + +void r600_sb_context_destroy(void * sctx) 
{ + if (sctx) + delete (sb_context*)sctx; +} + +int r600_sb_bytecode_process(struct r600_context *rctx, + struct r600_bytecode *bc, + struct r600_shader *pshader, + int dump_source_bytecode, + int optimize) { + int r = 0; + unsigned shader_id = bc->debug_id; + + sb_context *ctx = (sb_context *)rctx->sb_context; + if (!ctx) { + rctx->sb_context = ctx = r600_sb_context_create(rctx); + } + + int64_t time_start = 0; + if (sb_context::dump_stat) { + time_start = os_time_get_nano(); + } + + /* skip some shaders (use shaders from default backend) + * dskip_start - range start, dskip_end - range_end, + * e.g. start = 5, end = 6 means shaders 5 & 6 + * + * dskip_mode == 0 - disabled, + * dskip_mode == 1 - don't process the shaders from the [start;end] range + * dskip_mode == 2 - process only the shaders from the range + */ + if (sb_context::dskip_mode) { + if ((sb_context::dskip_start <= shader_id && + shader_id <= sb_context::dskip_end) == + (sb_context::dskip_mode == 1)) { + cerr << "sb: skipped shader " << shader_id << " : " << "[" + << sb_context::dskip_start << "; " + << sb_context::dskip_end << "] mode " + << sb_context::dskip_mode << "\n"; + return 0; + } + } + + SB_DUMP_STAT( cerr << "\nsb: shader " << shader_id << "\n"; ); + + bc_parser parser(*ctx, bc, pshader, dump_source_bytecode, optimize); + + if ((r = parser.parse())) { + assert(0); + return r; + } + + shader *sh = parser.get_shader(); + SB_DUMP_PASS( cerr << "\n\n###### after parse\n"; sh->dump_ir(); ); + + if (!optimize) { + delete sh; + return 0; + } + +#define SB_RUN_PASS(n, dump) \ + do { \ + r = n(*sh).run(); \ + if (dump) { \ + SB_DUMP_PASS( cerr << "\n\n###### after " << #n << "\n"; sh->dump_ir();); \ + } \ + assert(!r); \ + } while (0) + + SB_RUN_PASS(ssa_prepare, 0); + SB_RUN_PASS(ssa_rename, 1); + + if (sh->has_alu_predication) + SB_RUN_PASS(psi_ops, 1); + + SB_RUN_PASS(liveness, 0); + SB_RUN_PASS(dce_cleanup, 0); + SB_RUN_PASS(def_use, 0); + + sh->set_undef(sh->root->live_before); + + 
SB_RUN_PASS(peephole, 1); + SB_RUN_PASS(if_conversion, 1); + + SB_RUN_PASS(def_use, 0); + + SB_RUN_PASS(gvn, 1); + + SB_RUN_PASS(liveness, 0); + SB_RUN_PASS(dce_cleanup, 1); + SB_RUN_PASS(def_use, 0); + + SB_RUN_PASS(liveness, 0); + SB_RUN_PASS(dce_cleanup, 0); + + SB_RUN_PASS(ra_split, 0); + SB_RUN_PASS(def_use, 0); + + // create 'basic blocks'. it's not like we build CFG, they are just + // container nodes in the correct locations for code placement + sh->create_bbs(); + + SB_RUN_PASS(gcm, 0); + + sh->compute_interferences = true; + SB_RUN_PASS(liveness, 0); + + SB_RUN_PASS(ra_coalesce, 1); + SB_RUN_PASS(ra_init, 1); + + SB_RUN_PASS(post_scheduler, 1); + + sh->expand_bbs(); + +#if SB_RA_SCHED_CHECK + // check code correctness after regalloc/scheduler + SB_RUN_PASS(ra_checker, 0); +#endif + + SB_RUN_PASS(bc_finalizer, 0); + + sh->optimized = true; + + bc_builder builder(*sh); + + if ((r = builder.build())) { + assert(0); + return r; + } + + if (!sb_context::dry_run) { + bytecode &nbc = builder.get_bytecode(); + + free(bc->bytecode); + bc->ndw = nbc.ndw(); + bc->bytecode = (uint32_t*) malloc(bc->ndw << 2); + nbc.write_data(bc->bytecode); + + bc->ngpr = sh->ngpr; + bc->nstack = sh->nstack; + } else { + SB_DUMP_STAT( cerr << "SB_USE_NEW_BYTECODE is not enabled\n"; ); + } + + delete sh; + + if (sb_context::dump_stat) { + int64_t t = os_time_get_nano() - time_start; + + cerr << "sb: processing shader " << shader_id << " done ( " + << ((double)t)/1000000.0 << " ms ).\n"; + } + + return 0; +} + +static sb_hw_chip translate_chip(enum radeon_family rf) { + switch (rf) { + +#define TRANSLATE_CHIP(c) case CHIP_##c: return HW_CHIP_##c + TRANSLATE_CHIP(R600); + TRANSLATE_CHIP(RV610); + TRANSLATE_CHIP(RV630); + TRANSLATE_CHIP(RV670); + TRANSLATE_CHIP(RV620); + TRANSLATE_CHIP(RV635); + TRANSLATE_CHIP(RS780); + TRANSLATE_CHIP(RS880); + TRANSLATE_CHIP(RV770); + TRANSLATE_CHIP(RV730); + TRANSLATE_CHIP(RV710); + TRANSLATE_CHIP(RV740); + TRANSLATE_CHIP(CEDAR); + 
TRANSLATE_CHIP(REDWOOD); + TRANSLATE_CHIP(JUNIPER); + TRANSLATE_CHIP(CYPRESS); + TRANSLATE_CHIP(HEMLOCK); + TRANSLATE_CHIP(PALM); + TRANSLATE_CHIP(SUMO); + TRANSLATE_CHIP(SUMO2); + TRANSLATE_CHIP(BARTS); + TRANSLATE_CHIP(TURKS); + TRANSLATE_CHIP(CAICOS); + TRANSLATE_CHIP(CAYMAN); +#undef TRANSLATE_CHIP + + default: + assert(!"unknown chip"); + return HW_CHIP_UNKNOWN; + } +} + +static sb_hw_class translate_chip_class(enum chip_class cc) { + switch(cc) { + case R600: return HW_CLASS_R600; + case R700: return HW_CLASS_R700; + case EVERGREEN: return HW_CLASS_EVERGREEN; + case CAYMAN: return HW_CLASS_CAYMAN; + + default: + assert(!"unknown chip class"); + return HW_CLASS_UNKNOWN; + } +} diff --git a/src/gallium/drivers/r600/sb/sb_dce_cleanup.cpp b/src/gallium/drivers/r600/sb/sb_dce_cleanup.cpp new file mode 100644 index 00000000000..acd6613166f --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_dce_cleanup.cpp @@ -0,0 +1,133 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Vadim Girlin + */ + +#include "sb_shader.h" + +#include "sb_pass.h" + +namespace r600_sb { + +bool dce_cleanup::visit(node& n, bool enter) { + if (enter) { + } else { + if (n.flags & NF_DEAD) + n.remove(); + else + cleanup_dst(n); + } + return true; +} + +bool dce_cleanup::visit(alu_group_node& n, bool enter) { + if (enter) { + } else { + n.expand(); + } + return true; +} + +bool dce_cleanup::visit(cf_node& n, bool enter) { + if (enter) { + if (n.flags & NF_DEAD) + n.remove(); + else + cleanup_dst(n); + } else { + if (n.bc.op_ptr->flags & (CF_CLAUSE | CF_BRANCH | CF_LOOP)) + n.expand(); + } + return true; +} + +bool dce_cleanup::visit(alu_node& n, bool enter) { + if (enter) { + } else { + if (n.flags & NF_DEAD) + n.remove(); + else + cleanup_dst(n); + } + return true; +} + +bool dce_cleanup::visit(alu_packed_node& n, bool enter) { + if (enter) { + } else { + if (n.flags & NF_DEAD) + n.remove(); + else + cleanup_dst(n); + } + return false; +} + +bool dce_cleanup::visit(fetch_node& n, bool enter) { + if (enter) { + } else { + if (n.flags & NF_DEAD) + n.remove(); + else + cleanup_dst(n); + } + return true; +} + +bool dce_cleanup::visit(region_node& n, bool enter) { + if (enter) { + if (n.loop_phi) + run_on(*n.loop_phi); + } else { + if (n.phi) + run_on(*n.phi); + } + return true; +} + +void dce_cleanup::cleanup_dst(node& n) { + cleanup_dst_vec(n.dst); +} + +bool dce_cleanup::visit(container_node& n, bool enter) { + if (enter) { + cleanup_dst(n); + } else { + + } + return true; +} + +void dce_cleanup::cleanup_dst_vec(vvec& vv) { + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value* &v = *I; + if (!v) + continue; + + if (v->is_dead()) + v = NULL; + } +} 
+ +} // namespace r600_sb diff --git a/src/gallium/drivers/r600/sb/sb_def_use.cpp b/src/gallium/drivers/r600/sb/sb_def_use.cpp new file mode 100644 index 00000000000..f35f592c929 --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_def_use.cpp @@ -0,0 +1,167 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Vadim Girlin + */ + +#include "sb_shader.h" + +#include "sb_pass.h" + +namespace r600_sb { + +using std::cerr; + +int def_use::run() { + run_on(sh.root, true); + run_on(sh.root, false); + return 0; +} + +void def_use::process_phi(container_node *c, bool defs, bool uses) { + + for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) { + node *n = *I; + if (uses) + process_uses(n); + if (defs) + process_defs(n, n->dst, false); + } +} + +void def_use::run_on(node* n, bool defs) { + + bool is_region = (n->type == NT_REGION); + bool is_op = (n->type == NT_OP || n->type == NT_IF); + + if (is_op) { + + if (0) { + cerr << "def_use processing op "; + dump::dump_op(n); + cerr << "\n"; + } + + if (defs) + process_defs(n, n->dst, false); + else + process_uses(n); + } else if (is_region & defs) { + region_node *r = static_cast<region_node*>(n); + if (r->loop_phi) + process_phi(r->loop_phi, true, false); + } + + if (n->is_container() && n->subtype != NST_ALU_PACKED_INST) { + container_node *c = static_cast<container_node*>(n); + for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) { + run_on(*I, defs); + } + } + + if (is_region) { + region_node *r = static_cast<region_node*>(n); + if (r->phi) + process_phi(r->phi, defs, !defs); + if (r->loop_phi && !defs) + process_phi(r->loop_phi, false, true); + } +} + +void def_use::process_defs(node *n, vvec &vv, bool arr_def) { + + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value *v = *I; + if (!v) + continue; + + if (arr_def) + v->adef = n; + else + v->def = n; + + v->delete_uses(); + + if (v->is_rel()) { + process_defs(n, v->mdef, true); + } + } +} + +void def_use::process_uses(node* n) { + unsigned k = 0; + + for (vvec::iterator I = n->src.begin(), E = n->src.end(); I != E; + ++I, ++k) { + value *v = *I; + if (!v || v->is_readonly()) + continue; + + if (v->is_rel()) { + if (!v->rel->is_readonly()) + v->rel->add_use(n, UK_SRC_REL, k); + + unsigned k2 = 0; + for (vvec::iterator I = 
v->muse.begin(), E = v->muse.end(); + I != E; ++I, ++k2) { + value *v = *I; + if (!v) + continue; + + v->add_use(n, UK_MAYUSE, k2); + } + } else + v->add_use(n, UK_SRC, k); + } + + k = 0; + for (vvec::iterator I = n->dst.begin(), E = n->dst.end(); I != E; + ++I, ++k) { + value *v = *I; + if (!v || !v->is_rel()) + continue; + + if (!v->rel->is_readonly()) + v->rel->add_use(n, UK_DST_REL, k); + unsigned k2 = 0; + for (vvec::iterator I = v->muse.begin(), E = v->muse.end(); + I != E; ++I, ++k2) { + value *v = *I; + if (!v) + continue; + + v->add_use(n, UK_MAYDEF, k2); + } + } + + if (n->pred) + n->pred->add_use(n, UK_PRED, 0); + + if (n->type == NT_IF) { + if_node *i = static_cast<if_node*>(n); + if (i->cond) + i->cond->add_use(i, UK_COND, 0); + } +} + +} // namespace r600_sb diff --git a/src/gallium/drivers/r600/sb/sb_dump.cpp b/src/gallium/drivers/r600/sb/sb_dump.cpp new file mode 100644 index 00000000000..c2ee34dc0af --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_dump.cpp @@ -0,0 +1,524 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Vadim Girlin + */ + +#include <iostream> +#include <iomanip> + +#include "sb_shader.h" + +#include "sb_pass.h" + +namespace r600_sb { + +using std::cerr; + +bool dump::visit(node& n, bool enter) { + if (enter) { + indent(); + dump_flags(n); + + switch (n.subtype) { + case NST_PHI: + dump_op(n, "* phi"); + break; + case NST_PSI: + dump_op(n, "* psi"); + break; + case NST_COPY: + dump_op(n, "* copy"); + break; + default: + assert(!"invalid node subtype"); + break; + } + cerr << "\n"; + } + return false; +} + +bool dump::visit(container_node& n, bool enter) { + if (enter) { + if (!n.empty()) { + indent(); + dump_flags(n); + cerr << "{ "; + if (!n.dst.empty()) { + cerr << " preloaded inputs ["; + dump_vec(n.dst); + cerr << "] "; + } + dump_live_values(n, true); + } + ++level; + } else { + --level; + if (!n.empty()) { + indent(); + cerr << "} "; + if (!n.src.empty()) { + cerr << " results ["; + dump_vec(n.src); + cerr << "] "; + } + dump_live_values(n, false); + } + } + return true; +} + +bool dump::visit(bb_node& n, bool enter) { + if (enter) { + indent(); + dump_flags(n); + cerr << "{ BB_" << n.id << " loop_level = " << n.loop_level << " "; + dump_live_values(n, true); + ++level; + } else { + --level; + indent(); + cerr << "} end BB_" << n.id << " "; + dump_live_values(n, false); + } + return true; +} + +bool dump::visit(alu_group_node& n, bool enter) { + if (enter) { + indent(); + dump_flags(n); + cerr << "[ "; + dump_live_values(n, true); + + ++level; + } else { + --level; + + indent(); + cerr << "] "; + dump_live_values(n, false); + } + return true; +} + +bool dump::visit(cf_node& n, bool enter) { + if (enter) { + indent(); + dump_flags(n); + dump_op(n, 
n.bc.op_ptr->name); + + if (n.bc.op_ptr->flags & CF_BRANCH) { + cerr << " @" << (n.bc.addr << 1); + } + + dump_common(n); + cerr << "\n"; + + if (!n.empty()) { + indent(); + cerr << "< "; + dump_live_values(n, true); + } + + ++level; + } else { + --level; + if (!n.empty()) { + indent(); + cerr << "> "; + dump_live_values(n, false); + } + } + return true; +} + +bool dump::visit(alu_node& n, bool enter) { + if (enter) { + indent(); + dump_flags(n); + dump_alu(&n); + dump_common(n); + cerr << "\n"; + + ++level; + } else { + --level; + + } + return true; +} + +bool dump::visit(alu_packed_node& n, bool enter) { + if (enter) { + indent(); + dump_flags(n); + dump_op(n, n.op_ptr()->name); + cerr << " "; + dump_live_values(n, true); + + ++level; + } else { + --level; + if (!n.live_after.empty()) { + indent(); + dump_live_values(n, false); + } + + } + // proccess children only if their src/dst aren't moved to this node yet + return n.src.empty(); +} + +bool dump::visit(fetch_node& n, bool enter) { + if (enter) { + indent(); + dump_flags(n); + dump_op(n, n.bc.op_ptr->name); + cerr << "\n"; + + ++level; + } else { + --level; + } + return true; +} + +bool dump::visit(region_node& n, bool enter) { + if (enter) { + indent(); + dump_flags(n); + cerr << "region #" << n.region_id << " "; + dump_common(n); + + if (!n.vars_defined.empty()) { + cerr << "vars_defined: "; + dump_set(sh, n.vars_defined); + } + + dump_live_values(n, true); + + ++level; + + if (n.loop_phi) + run_on(*n.loop_phi); + } else { + --level; + + if (n.phi) + run_on(*n.phi); + + indent(); + dump_live_values(n, false); + } + return true; +} + +bool dump::visit(repeat_node& n, bool enter) { + if (enter) { + indent(); + dump_flags(n); + cerr << "repeat region #" << n.target->region_id; + cerr << (n.empty() ? 
" " : " after { "); + dump_common(n); + cerr << " "; + dump_live_values(n, true); + + ++level; + } else { + --level; + + if (!n.empty()) { + indent(); + cerr << "} end_repeat "; + dump_live_values(n, false); + } + } + return true; +} + +bool dump::visit(depart_node& n, bool enter) { + if (enter) { + indent(); + dump_flags(n); + cerr << "depart region #" << n.target->region_id; + cerr << (n.empty() ? " " : " after { "); + dump_common(n); + cerr << " "; + dump_live_values(n, true); + + ++level; + } else { + --level; + if (!n.empty()) { + indent(); + cerr << "} end_depart "; + dump_live_values(n, false); + } + } + return true; +} + +bool dump::visit(if_node& n, bool enter) { + if (enter) { + indent(); + dump_flags(n); + cerr << "if " << *n.cond << " "; + dump_common(n); + cerr << " "; + dump_live_values(n, true); + + indent(); + cerr <<"{\n"; + + ++level; + } else { + --level; + indent(); + cerr << "} endif "; + dump_live_values(n, false); + } + return true; +} + +void dump::indent() { + cerr << std::setw(level * 4) << ""; +} + +void dump::dump_vec(const vvec & vv) { + bool first = true; + for(vvec::const_iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value *v = *I; + if (!first) + cerr << ", "; + else + first = false; + + if (v) { + cerr << *v; + } else { + cerr << "__"; + } + } +} + +void dump::dump_rels(vvec & vv) { + for(vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value *v = *I; + + if (!v || !v->is_rel()) + continue; + + cerr << "\n\t\t\t\t\t"; + cerr << " rels: " << *v << " : "; + dump_vec(v->mdef); + cerr << " <= "; + dump_vec(v->muse); + } +} + +void dump::dump_op(node &n, const char *name) { + + if (n.pred) { + alu_node &a = static_cast<alu_node&>(n); + cerr << (a.bc.pred_sel-2) << " [" << *a.pred << "] "; + } + + cerr << name; + + bool has_dst = !n.dst.empty(); + + if (n.subtype == NST_CF_INST) { + cf_node *c = static_cast<cf_node*>(&n); + if (c->bc.op_ptr->flags & CF_EXP) { + static const char *exp_type[] = {"PIXEL", "POS ", 
"PARAM"}; + cerr << " " << exp_type[c->bc.type] << " " << c->bc.array_base; + has_dst = false; + } else if (c->bc.op_ptr->flags & CF_STRM) { + static const char *exp_type[] = {"WRITE", "WRITE_IND", "WRITE_ACK", + "WRITE_IND_ACK"}; + cerr << " " << exp_type[c->bc.type] << " " << c->bc.array_base + << " ES:" << c->bc.elem_size; + has_dst = false; + } + } + + cerr << " "; + + if (has_dst) { + dump_vec(n.dst); + cerr << ", "; + } + + dump_vec(n.src); +} + +void dump::dump_set(shader &sh, val_set& v) { + cerr << "["; + for(val_set::iterator I = v.begin(sh), E = v.end(sh); I != E; ++I) { + value *val = *I; + cerr << *val << " "; + } + cerr << "]"; +} + +void dump::dump_common(node& n) { +} + +void dump::dump_flags(node &n) { + if (n.flags & NF_DEAD) + cerr << "### DEAD "; + if (n.flags & NF_REG_CONSTRAINT) + cerr << "R_CONS "; + if (n.flags & NF_CHAN_CONSTRAINT) + cerr << "CH_CONS "; + if (n.flags & NF_ALU_4SLOT) + cerr << "4S "; +} + +void dump::dump_val(value* v) { + cerr << *v; +} + +void dump::dump_alu(alu_node *n) { + + if (n->is_copy_mov()) + cerr << "(copy) "; + + if (n->pred) { + cerr << (n->bc.pred_sel-2) << " [" << n->pred << "] "; + } + + cerr << n->bc.op_ptr->name; + + if (n->bc.omod) { + static const char *omod_str[] = {"", "*2", "*4", "/2"}; + cerr << omod_str[n->bc.omod]; + } + + if (n->bc.clamp) { + cerr << "_sat"; + } + + bool has_dst = !n->dst.empty(); + + cerr << " "; + + if (has_dst) { + dump_vec(n->dst); + cerr << ", "; + } + + unsigned s = 0; + for (vvec::iterator I = n->src.begin(), E = n->src.end(); I != E; + ++I, ++s) { + + bc_alu_src &src = n->bc.src[s]; + + if (src.neg) + cerr << "-"; + + if (src.abs) + cerr << "|"; + + dump_val(*I); + + if (src.abs) + cerr << "|"; + + if (I + 1 != E) + cerr << ", "; + } + + dump_rels(n->dst); + dump_rels(n->src); + +} + +void dump::dump_op(node* n) { + if (n->type == NT_IF) { + dump_op(*n, "IF "); + return; + } + + switch(n->subtype) { + case NST_ALU_INST: + dump_alu(static_cast<alu_node*>(n)); + break; + case 
NST_FETCH_INST: + dump_op(*n, static_cast<fetch_node*>(n)->bc.op_ptr->name); + break; + case NST_CF_INST: + case NST_ALU_CLAUSE: + case NST_TEX_CLAUSE: + case NST_VTX_CLAUSE: + dump_op(*n, static_cast<cf_node*>(n)->bc.op_ptr->name); + break; + case NST_ALU_PACKED_INST: + dump_op(*n, static_cast<alu_packed_node*>(n)->op_ptr()->name); + break; + case NST_PHI: + dump_op(*n, "PHI"); + break; + case NST_PSI: + dump_op(*n, "PSI"); + break; + case NST_COPY: + dump_op(*n, "COPY"); + break; + default: + dump_op(*n, "??unknown_op"); + } +} + +void dump::dump_op_list(container_node* c) { + for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) { + dump_op(*I); + cerr << "\n"; + } +} + +void dump::dump_queue(sched_queue& q) { + for (sched_queue::iterator I = q.begin(), E = q.end(); I != E; ++I) { + dump_op(*I); + cerr << "\n"; + } +} + +void dump::dump_live_values(container_node &n, bool before) { + if (before) { + if (!n.live_before.empty()) { + cerr << "live_before: "; + dump_set(sh, n.live_before); + } + } else { + if (!n.live_after.empty()) { + cerr << "live_after: "; + dump_set(sh, n.live_after); + } + } + cerr << "\n"; +} + +} // namespace r600_sb diff --git a/src/gallium/drivers/r600/sb/sb_expr.cpp b/src/gallium/drivers/r600/sb/sb_expr.cpp new file mode 100644 index 00000000000..e3c7858c3e9 --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_expr.cpp @@ -0,0 +1,611 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * 
paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Vadim Girlin + */ + +#include <cmath> + +#include "sb_shader.h" + +namespace r600_sb { + +value* get_select_value_for_em(shader& sh, value* em) { + if (!em->def) + return NULL; + + node *predset = em->def; + if (!predset->is_pred_set()) + return NULL; + + alu_node *s = sh.clone(static_cast<alu_node*>(predset)); + convert_predset_to_set(sh, s); + + predset->insert_after(s); + + value* &d0 = s->dst[0]; + d0 = sh.create_temp_value(); + d0->def = s; + return d0; +} + +expr_handler::expr_handler(shader& sh) : sh(sh), vt(sh.vt) {} + +value * expr_handler::get_const(const literal &l) { + value *v = sh.get_const_value(l); + if (!v->gvn_source) + vt.add_value(v); + return v; +} + +void expr_handler::assign_source(value *dst, value *src) { + dst->gvn_source = src->gvn_source; +} + +bool expr_handler::equal(value *l, value *r) { + + assert(l != r); + + if (l->gvalue() == r->gvalue()) + return true; + + if (l->def && r->def) + return defs_equal(l, r); + + if (l->is_rel() && r->is_rel()) + return ivars_equal(l, r); + + return false; +} + +bool expr_handler::ivars_equal(value* l, value* r) { + if (l->rel->gvalue() == r->rel->gvalue() + && l->select == r->select) { + + vvec &lv = l->mdef.empty() ? l->muse : l->mdef; + vvec &rv = r->mdef.empty() ? 
r->muse : r->mdef; + + // FIXME: replace this with more precise aliasing test + return lv == rv; + } + return false; +} + +bool expr_handler::defs_equal(value* l, value* r) { + + node *d1 = l->def; + node *d2 = r->def; + + if (d1->type != d2->type || d1->subtype != d2->subtype) + return false; + + if (d1->is_pred_set() || d2->is_pred_set()) + return false; + + if (d1->type == NT_OP) { + switch (d1->subtype) { + case NST_ALU_INST: + return ops_equal( + static_cast<alu_node*>(d1), + static_cast<alu_node*>(d2)); +// case NST_FETCH_INST: return ops_equal(static_cast<fetch_node*>(d1), +// static_cast<fetch_node*>(d2); +// case NST_CF_INST: return ops_equal(static_cast<cf_node*>(d1), +// static_cast<cf_node*>(d2); + default: + break; + } + } + return false; +} + +bool expr_handler::try_fold(value* v) { + assert(!v->gvn_source); + + if (v->def) + try_fold(v->def); + + if (v->gvn_source) + return true; + + return false; +} + +bool expr_handler::try_fold(node* n) { + return n->fold_dispatch(this); +} + +bool expr_handler::fold(node& n) { + if (n.subtype == NST_PHI) { + + value *s = n.src[0]; + + // FIXME disabling phi folding for registers for now, otherwise we lose + // control flow information in some cases + // (GCM fails on tests/shaders/glsl-fs-if-nested-loop.shader_test) + // probably control flow transformation is required to enable it + if (s->is_sgpr()) + return false; + + for(vvec::iterator I = n.src.begin() + 1, E = n.src.end(); I != E; ++I) { + value *v = *I; + if (!s->v_equal(v)) + return false; + } + + assign_source(n.dst[0], s); + } else { + assert(n.subtype == NST_PSI); + assert(n.src.size() >= 6); + + value *s = n.src[2]; + assert(s->gvn_source); + + for(vvec::iterator I = n.src.begin() + 3, E = n.src.end(); I != E; I += 3) { + value *v = *(I+2); + if (!s->v_equal(v)) + return false; + } + assign_source(n.dst[0], s); + } + return true; +} + +bool expr_handler::fold(container_node& n) { + return false; +} + +bool expr_handler::fold_setcc(alu_node &n) { + + 
// TODO + + return false; +} + +bool expr_handler::fold(alu_node& n) { + + if (n.bc.op_ptr->flags & (AF_PRED | AF_KILL)) { + fold_setcc(n); + return false; + } + + switch (n.bc.op_ptr->src_count) { + case 1: return fold_alu_op1(n); + case 2: return fold_alu_op2(n); + case 3: return fold_alu_op3(n); + default: + assert(0); + } + return false; +} + +bool expr_handler::fold(fetch_node& n) { + + unsigned chan = 0; + for (vvec::iterator I = n.dst.begin(), E = n.dst.end(); I != E; ++I) { + value* &v = *I; + if (v) { + if (n.bc.dst_sel[chan] == SEL_0) + assign_source(*I, get_const(0.0f)); + else if (n.bc.dst_sel[chan] == SEL_1) + assign_source(*I, get_const(1.0f)); + } + ++chan; + } + return false; +} + +bool expr_handler::fold(cf_node& n) { + return false; +} + +void expr_handler::apply_alu_src_mod(const bc_alu &bc, unsigned src, + literal &v) { + const bc_alu_src &s = bc.src[src]; + + if (s.abs) + v = fabs(v.f); + if (s.neg) + v = -v.f; +} + +void expr_handler::apply_alu_dst_mod(const bc_alu &bc, literal &v) { + float omod_coeff[] = {2.0f, 4.0, 0.5f}; + + if (bc.omod) + v = v.f * omod_coeff[bc.omod - 1]; + if (bc.clamp) + v = float_clamp(v.f); +} + +bool expr_handler::args_equal(const vvec &l, const vvec &r) { + + assert(l.size() == r.size()); + + int s = l.size(); + + for (int k = 0; k < s; ++k) { + if (!l[k]->v_equal(r[k])) + return false; + } + + return true; +} + +bool expr_handler::ops_equal(const alu_node *l, const alu_node* r) { + const bc_alu &b0 = l->bc; + const bc_alu &b1 = r->bc; + + if (b0.op != b1.op) + return false; + + unsigned src_count = b0.op_ptr->src_count; + + if (b0.index_mode != b1.index_mode) + return false; + + if (b0.clamp != b1.clamp || b0.omod != b1.omod) + return false; + + for (unsigned s = 0; s < src_count; ++s) { + const bc_alu_src &s0 = b0.src[s]; + const bc_alu_src &s1 = b1.src[s]; + + if (s0.abs != s1.abs || s0.neg != s1.neg) + return false; + } + return args_equal(l->src, r->src); +} + +bool expr_handler::fold_alu_op1(alu_node& n) { + 
+ assert(!n.src.empty()); + if (n.src.empty()) + return false; + + value* v0 = n.src[0]; + + assert(v0 && n.dst[0]); + + if (!v0->is_const()) { + if ((n.bc.op == ALU_OP1_MOV || n.bc.op == ALU_OP1_MOVA_INT || + n.bc.op == ALU_OP1_MOVA_GPR_INT) + && n.bc.clamp == 0 && n.bc.omod == 0 + && n.bc.src[0].abs == 0 && n.bc.src[0].neg == 0) { + assign_source(n.dst[0], v0); + return true; + } + return false; + } + + literal dv, cv = v0->get_const_value(); + apply_alu_src_mod(n.bc, 0, cv); + + switch (n.bc.op) { + case ALU_OP1_CEIL: dv = ceil(cv.f); break; + case ALU_OP1_COS: dv = cos(cv.f * 2.0f * M_PI); break; + case ALU_OP1_EXP_IEEE: dv = exp2(cv.f); break; + case ALU_OP1_FLOOR: dv = floor(cv.f); break; + case ALU_OP1_FLT_TO_INT: dv = (int)cv.f; break; // FIXME: round modes ???? + case ALU_OP1_FLT_TO_INT_FLOOR: dv = (int32_t)floor(cv.f); break; + case ALU_OP1_FLT_TO_INT_RPI: dv = (int32_t)floor(cv.f + 0.5f); break; + case ALU_OP1_FLT_TO_INT_TRUNC: dv = (int32_t)trunc(cv.f); break; + case ALU_OP1_FLT_TO_UINT: dv = (uint32_t)cv.f; break; + case ALU_OP1_FRACT: dv = cv.f - floor(cv.f); break; + case ALU_OP1_INT_TO_FLT: dv = (float)cv.i; break; + case ALU_OP1_LOG_CLAMPED: + case ALU_OP1_LOG_IEEE: + if (cv.f != 0.0f) + dv = log2(cv.f); + else + // don't fold to NAN, let the GPU handle it for now + // (prevents degenerate LIT tests from failing) + return false; + break; + case ALU_OP1_MOV: dv = cv; break; + case ALU_OP1_MOVA_INT: dv = cv; break; // FIXME ??? +// case ALU_OP1_MOVA_FLOOR: dv = (int32_t)floor(cv.f); break; +// case ALU_OP1_MOVA_GPR_INT: + case ALU_OP1_NOT_INT: dv = ~cv.i; break; + case ALU_OP1_PRED_SET_INV: + dv = cv.f == 0.0f ? 1.0f : (cv.f == 1.0f ? 
0.0f : cv.f); break; + case ALU_OP1_PRED_SET_RESTORE: dv = cv; break; + case ALU_OP1_RECIPSQRT_CLAMPED: + case ALU_OP1_RECIPSQRT_FF: + case ALU_OP1_RECIPSQRT_IEEE: dv = 1.0f / sqrt(cv.f); break; + case ALU_OP1_RECIP_CLAMPED: + case ALU_OP1_RECIP_FF: + case ALU_OP1_RECIP_IEEE: dv = 1.0f / cv.f; break; +// case ALU_OP1_RECIP_INT: +// case ALU_OP1_RECIP_UINT: +// case ALU_OP1_RNDNE: dv = floor(cv.f + 0.5f); break; + case ALU_OP1_SIN: dv = sin(cv.f * 2.0f * M_PI); break; + case ALU_OP1_SQRT_IEEE: dv = sqrt(cv.f); break; + case ALU_OP1_TRUNC: dv = trunc(cv.f); break; + + default: + return false; + } + + apply_alu_dst_mod(n.bc, dv); + assign_source(n.dst[0], get_const(dv)); + return true; +} + +bool expr_handler::fold_alu_op2(alu_node& n) { + + if (n.src.size() < 2) + return false; + + value* v0 = n.src[0]; + value* v1 = n.src[1]; + + assert(v0 && v1 && n.dst[0]); + + bool isc0 = v0->is_const(); + bool isc1 = v1->is_const(); + + if (!isc0 && !isc1) + return false; + + literal dv, cv0, cv1; + + if (isc0) { + cv0 = v0->get_const_value(); + apply_alu_src_mod(n.bc, 0, cv0); + } + + if (isc1) { + cv1 = v1->get_const_value(); + apply_alu_src_mod(n.bc, 1, cv1); + } + + if (isc0 && isc1) { + switch (n.bc.op) { + case ALU_OP2_ADD: dv = cv0.f + cv1.f; break; + case ALU_OP2_ADDC_UINT: + dv = (uint32_t)(((uint64_t)cv0.u + cv1.u)>>32); break; + case ALU_OP2_ADD_INT: dv = cv0.i + cv1.i; break; + case ALU_OP2_AND_INT: dv = cv0.i & cv1.i; break; + case ALU_OP2_ASHR_INT: dv = cv0.i >> (cv1.i & 0x1F); break; + case ALU_OP2_BFM_INT: + dv = (((1 << (cv0.i & 0x1F)) - 1) << (cv1.i & 0x1F)); break; + case ALU_OP2_LSHL_INT: dv = cv0.i << cv1.i; break; + case ALU_OP2_LSHR_INT: dv = cv0.u >> cv1.u; break; + case ALU_OP2_MAX: + case ALU_OP2_MAX_DX10: dv = cv0.f > cv1.f ? cv0.f : cv1.f; break; + case ALU_OP2_MAX_INT: dv = cv0.i > cv1.i ? cv0.i : cv1.i; break; + case ALU_OP2_MAX_UINT: dv = cv0.u > cv1.u ? cv0.u : cv1.u; break; + case ALU_OP2_MIN: + case ALU_OP2_MIN_DX10: dv = cv0.f < cv1.f ? 
cv0.f : cv1.f; break; + case ALU_OP2_MIN_INT: dv = cv0.i < cv1.i ? cv0.i : cv1.i; break; + case ALU_OP2_MIN_UINT: dv = cv0.u < cv1.u ? cv0.u : cv1.u; break; + case ALU_OP2_MUL: + case ALU_OP2_MUL_IEEE: dv = cv0.f * cv1.f; break; + case ALU_OP2_MULHI_INT: + dv = (int32_t)(((int64_t)cv0.u * cv1.u)>>32); break; + case ALU_OP2_MULHI_UINT: + dv = (uint32_t)(((uint64_t)cv0.u * cv1.u)>>32); break; + case ALU_OP2_MULLO_INT: + dv = (int32_t)(((int64_t)cv0.u * cv1.u) & 0xFFFFFFFF); break; + case ALU_OP2_MULLO_UINT: + dv = (uint32_t)(((uint64_t)cv0.u * cv1.u) & 0xFFFFFFFF); break; + case ALU_OP2_OR_INT: dv = cv0.i | cv1.i; break; + case ALU_OP2_SUB_INT: dv = cv0.i - cv1.i; break; + case ALU_OP2_XOR_INT: dv = cv0.i ^ cv1.i; break; + + case ALU_OP2_SETE: dv = cv0.f == cv1.f ? 1.0f : 0.0f; break; + + default: + return false; + } + + } else { // one source is const + + // TODO handle 1 * anything, 0 * anything, 0 + anything, etc + + return false; + } + + apply_alu_dst_mod(n.bc, dv); + assign_source(n.dst[0], get_const(dv)); + return true; +} + +bool expr_handler::fold_alu_op3(alu_node& n) { + + if (n.src.size() < 3) + return false; + + // TODO handle CNDxx by some common path + + value* v0 = n.src[0]; + value* v1 = n.src[1]; + value* v2 = n.src[2]; + + assert(v0 && v1 && v2 && n.dst[0]); + + bool isc0 = v0->is_const(); + bool isc1 = v1->is_const(); + bool isc2 = v2->is_const(); + + if (!isc0 && !isc1 && !isc2) + return false; + + literal dv, cv0, cv1, cv2; + + if (isc0) { + cv0 = v0->get_const_value(); + apply_alu_src_mod(n.bc, 0, cv0); + } + + if (isc1) { + cv1 = v1->get_const_value(); + apply_alu_src_mod(n.bc, 1, cv1); + } + + if (isc2) { + cv2 = v2->get_const_value(); + apply_alu_src_mod(n.bc, 2, cv2); + } + + if (isc0 && isc1 && isc2) { + switch (n.bc.op) { + case ALU_OP3_MULADD: dv = cv0.f * cv1.f + cv2.f; break; + + // TODO + + default: + return false; + } + + } else { + + // TODO + + return false; + } + + apply_alu_dst_mod(n.bc, dv); + assign_source(n.dst[0], 
get_const(dv)); + return true; +} + +unsigned invert_setcc_condition(unsigned cc, bool &swap_args) { + unsigned ncc = 0; + + switch (cc) { + case AF_CC_E: ncc = AF_CC_NE; break; + case AF_CC_NE: ncc = AF_CC_E; break; + case AF_CC_GE: ncc = AF_CC_GT; swap_args = true; break; + case AF_CC_GT: ncc = AF_CC_GE; swap_args = true; break; + default: + assert(!"unexpected condition code"); + break; + } + return ncc; +} + +unsigned get_setcc_opcode(unsigned cc, unsigned cmp_type, bool int_dst) { + + if (int_dst && cmp_type == AF_FLOAT_CMP) { + switch (cc) { + case AF_CC_E: return ALU_OP2_SETE_DX10; + case AF_CC_NE: return ALU_OP2_SETNE_DX10; + case AF_CC_GT: return ALU_OP2_SETGT_DX10; + case AF_CC_GE: return ALU_OP2_SETGE_DX10; + } + } else { + + switch(cmp_type) { + case AF_FLOAT_CMP: { + switch (cc) { + case AF_CC_E: return ALU_OP2_SETE; + case AF_CC_NE: return ALU_OP2_SETNE; + case AF_CC_GT: return ALU_OP2_SETGT; + case AF_CC_GE: return ALU_OP2_SETGE; + } + break; + } + case AF_INT_CMP: { + switch (cc) { + case AF_CC_E: return ALU_OP2_SETE_INT; + case AF_CC_NE: return ALU_OP2_SETNE_INT; + case AF_CC_GT: return ALU_OP2_SETGT_INT; + case AF_CC_GE: return ALU_OP2_SETGE_INT; + } + break; + } + case AF_UINT_CMP: { + switch (cc) { + case AF_CC_GT: return ALU_OP2_SETGT_UINT; + case AF_CC_GE: return ALU_OP2_SETGE_UINT; + } + break; + } + } + } + + assert(!"unexpected cc&cmp_type combination"); + return ~0u; +} + +unsigned get_predsetcc_opcode(unsigned cc, unsigned cmp_type) { + + switch(cmp_type) { + case AF_FLOAT_CMP: { + switch (cc) { + case AF_CC_E: return ALU_OP2_PRED_SETE; + case AF_CC_NE: return ALU_OP2_PRED_SETNE; + case AF_CC_GT: return ALU_OP2_PRED_SETGT; + case AF_CC_GE: return ALU_OP2_PRED_SETGE; + } + break; + } + case AF_INT_CMP: { + switch (cc) { + case AF_CC_E: return ALU_OP2_PRED_SETE_INT; + case AF_CC_NE: return ALU_OP2_PRED_SETNE_INT; + case AF_CC_GT: return ALU_OP2_PRED_SETGT_INT; + case AF_CC_GE: return ALU_OP2_PRED_SETGE_INT; + } + break; + } + case 
AF_UINT_CMP: { + switch (cc) { + case AF_CC_GT: return ALU_OP2_PRED_SETGT_UINT; + case AF_CC_GE: return ALU_OP2_PRED_SETGE_UINT; + } + break; + } + } + + assert(!"unexpected cc&cmp_type combination"); + return ~0u; +} + +void convert_predset_to_set(shader& sh, alu_node* a) { + + unsigned flags = a->bc.op_ptr->flags; + unsigned cc = flags & AF_CC_MASK; + unsigned cmp_type = flags & AF_CMP_TYPE_MASK; + + bool swap_args = false; + + cc = invert_setcc_condition(cc, swap_args); + + unsigned newop = get_setcc_opcode(cc, cmp_type, true); + + a->dst.resize(1); + a->bc.set_op(newop); + + if (swap_args) { + std::swap(a->src[0], a->src[1]); + std::swap(a->bc.src[0], a->bc.src[1]); + } + + a->bc.update_exec_mask = 0; + a->bc.update_pred = 0; +} + +} // namespace r600_sb diff --git a/src/gallium/drivers/r600/sb/sb_expr.h b/src/gallium/drivers/r600/sb/sb_expr.h new file mode 100644 index 00000000000..7f3bd15ba37 --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_expr.h @@ -0,0 +1,83 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Vadim Girlin + */ + +#ifndef SB_EXPR_H_ +#define SB_EXPR_H_ + +namespace r600_sb { + +inline float float_clamp(float v) { + return v < 0.0f ? 0.0f : (v > 1.0f ? 1.0f : v); +} + +value* get_select_value_for_em(shader &sh, value *em); + +void convert_predset_to_set(shader &sh, alu_node *a); +unsigned invert_setcc_condition(unsigned cc, bool &swap_args); +unsigned get_setcc_opcode(unsigned cc, unsigned cmp_type, bool int_dst); +unsigned get_predsetcc_opcode(unsigned cc, unsigned cmp_type); + +class expr_handler { + + shader &sh; + value_table &vt; + +public: + + expr_handler(shader &sh); + + bool equal(value *l, value *r); + bool defs_equal(value *l, value *r); + bool args_equal(const vvec &l, const vvec &r); + bool ops_equal(const alu_node *l, const alu_node *r); + bool ivars_equal(value *l, value *r); + + value* get_const(const literal &l); + + bool try_fold(value *v); + bool try_fold(node *n); + + bool fold(node &n); + bool fold(container_node &n); + bool fold(alu_node &n); + bool fold(fetch_node &n); + bool fold(cf_node &n); + + bool fold_setcc(alu_node &n); + + bool fold_alu_op1(alu_node &n); + bool fold_alu_op2(alu_node &n); + bool fold_alu_op3(alu_node &n); + + void apply_alu_src_mod(const bc_alu &bc, unsigned src, literal &v); + void apply_alu_dst_mod(const bc_alu &bc, literal &v); + + void assign_source(value *dst, value *src); +}; + +} // namespace r600_sb + +#endif /* SB_EXPR_H_ */ diff --git a/src/gallium/drivers/r600/sb/sb_gcm.cpp b/src/gallium/drivers/r600/sb/sb_gcm.cpp new file mode 100644 index 00000000000..b6d20430750 --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_gcm.cpp @@ -0,0 +1,745 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * 
Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Vadim Girlin + */ + +#define GCM_DEBUG 0 + +#if GCM_DEBUG +#define GCM_DUMP(a) do { a } while(0); +#else +#define GCM_DUMP(a) +#endif + +#include <iostream> +#include <map> + +#include "sb_bc.h" +#include "sb_shader.h" + +#include "sb_pass.h" + +namespace r600_sb { + +using std::cerr; + +int gcm::run() { + + GCM_DUMP( cerr << "==== GCM ==== \n"; sh.dump_ir(); ); + + collect_instructions(sh.root, true); + + init_def_count(uses, pending); + + for (node_iterator N, I = pending.begin(), E = pending.end(); + I != E; I = N) { + N = I; + ++N; + node *o = *I; + + GCM_DUMP( + cerr << "pending : "; + dump::dump_op(o); + cerr << "\n"; + ); + + if (td_is_ready(o)) { + + GCM_DUMP( + cerr << " ready: "; + dump::dump_op(o); + cerr << "\n"; + ); + pending.remove_node(o); + ready.push_back(o); + } else { + } + } + + sched_early(sh.root); + + if (!pending.empty()) { + cerr << "##### gcm_sched_early_pass: unscheduled ops:\n"; + dump::dump_op(pending.front()); + } + + assert(pending.empty()); + + GCM_DUMP( sh.dump_ir(); ); + + GCM_DUMP( cerr << "\n\n ############## gcm late\n\n"; ); + + collect_instructions(sh.root, false); + + init_use_count(uses, pending); + + sched_late(sh.root); + if (!pending.empty()) { + cerr << "##### gcm_sched_late_pass: unscheduled ops:\n"; + dump::dump_op(pending.front()); + } + + assert(ucs_level == 0); + assert(pending.empty()); + + return 0; +} + + +void gcm::collect_instructions(container_node *c, bool early_pass) { + if (c->is_bb()) { + + if (early_pass) { + for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) { + node *n = *I; + if (n->flags & NF_DONT_MOVE) { + op_info &o = op_map[n]; + o.top_bb = o.bottom_bb = static_cast<bb_node*>(c); + } + } + } + + pending.append_from(c); + return; + } + + for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) { + if (I->is_container()) { + collect_instructions(static_cast<container_node*>(*I), early_pass); + } + } +} + +void gcm::sched_early(container_node *n) { + + region_node 
*r = + (n->type == NT_REGION) ? static_cast<region_node*>(n) : NULL; + + if (r && r->loop_phi) { + sched_early(r->loop_phi); + } + + for (node_iterator I = n->begin(), E = n->end(); I != E; ++I) { + if (I->type == NT_OP) { + node *op = *I; + if (op->subtype == NST_PHI) { + td_release_uses(op->dst); + } + } else if (I->is_container()) { + if (I->subtype == NST_BB) { + bb_node* bb = static_cast<bb_node*>(*I); + td_sched_bb(bb); + } else { + sched_early(static_cast<container_node*>(*I)); + } + } + } + + if (r && r->phi) { + sched_early(r->phi); + } +} + +void gcm::td_schedule(bb_node *bb, node *n) { + GCM_DUMP( + cerr << "scheduling : "; + dump::dump_op(n); + cerr << "\n"; + ); + td_release_uses(n->dst); + + bb->push_back(n); + + op_map[n].top_bb = bb; + +} + +void gcm::td_sched_bb(bb_node* bb) { + GCM_DUMP( + cerr << "td scheduling BB_" << bb->id << "\n"; + ); + + while (!ready.empty()) { + for (sq_iterator N, I = ready.begin(), E = ready.end(); I != E; + I = N) { + N = I; ++N; + td_schedule(bb, *I); + ready.erase(I); + } + } +} + +bool gcm::td_is_ready(node* n) { + return uses[n] == 0; +} + +void gcm::td_release_val(value *v) { + + GCM_DUMP( + cerr << "td checking uses: "; + dump::dump_val(v); + cerr << "\n"; + ); + + use_info *u = v->uses; + while (u) { + if (u->op->parent != &pending) { + u = u->next; + continue; + } + + GCM_DUMP( + cerr << "td used in "; + dump::dump_op(u->op); + cerr << "\n"; + ); + + if (--uses[u->op] == 0) { + GCM_DUMP( + cerr << "td released : "; + dump::dump_op(u->op); + cerr << "\n"; + ); + + pending.remove_node(u->op); + ready.push_back(u->op); + } + u = u->next; + } + +} + +void gcm::td_release_uses(vvec& v) { + for (vvec::iterator I = v.begin(), E = v.end(); I != E; ++I) { + value *v = *I; + if (!v) + continue; + + if (v->is_rel()) + td_release_uses(v->mdef); + else + td_release_val(v); + } +} + +void gcm::sched_late(container_node *n) { + + bool stack_pushed = false; + + if (n->is_depart()) { + depart_node *d = 
static_cast<depart_node*>(n); + push_uc_stack(); + stack_pushed = true; + bu_release_phi_defs(d->target->phi, d->dep_id); + } else if (n->is_repeat()) { + repeat_node *r = static_cast<repeat_node*>(n); + assert(r->target->loop_phi); + push_uc_stack(); + stack_pushed = true; + bu_release_phi_defs(r->target->loop_phi, r->rep_id); + } + + for (node_riterator I = n->rbegin(), E = n->rend(); I != E; ++I) { + if (I->is_container()) { + if (I->subtype == NST_BB) { + bb_node* bb = static_cast<bb_node*>(*I); + bu_sched_bb(bb); + } else { + sched_late(static_cast<container_node*>(*I)); + } + } + } + + if (n->type == NT_IF) { + if_node *f = static_cast<if_node*>(n); + if (f->cond) + pending_defs.push_back(f->cond); + } else if (n->type == NT_REGION) { + region_node *r = static_cast<region_node*>(n); + if (r->loop_phi) + bu_release_phi_defs(r->loop_phi, 0); + } + + if (stack_pushed) + pop_uc_stack(); + +} + +void gcm::bu_sched_bb(bb_node* bb) { + GCM_DUMP( + cerr << "bu scheduling BB_" << bb->id << "\n"; + ); + + bu_bb = bb; + + if (!pending_nodes.empty()) { + GCM_DUMP( + cerr << "pending nodes:\n"; + ); + + // TODO consider sorting the exports by array_base, + // possibly it can improve performance + + for (node_list::iterator I = pending_nodes.begin(), + E = pending_nodes.end(); I != E; ++I) { + bu_release_op(*I); + } + pending_nodes.clear(); + GCM_DUMP( + cerr << "pending nodes processed...\n"; + ); + } + + + if (!pending_defs.empty()) { + for (vvec::iterator I = pending_defs.begin(), E = pending_defs.end(); + I != E; ++I) { + bu_release_val(*I); + } + pending_defs.clear(); + } + + for (sched_queue::iterator N, I = ready_above.begin(), E = ready_above.end(); + I != E; I = N) { + N = I; + ++N; + node *n = *I; + if (op_map[n].bottom_bb == bb) { + add_ready(*I); + ready_above.erase(I); + } + } + + unsigned cnt_ready[SQ_NUM]; + + container_node *clause = NULL; + unsigned last_inst_type = ~0; + unsigned last_count = 0; + + bool s = true; + while (s) { + node *n; + + s = false; + 
+ unsigned ready_mask = 0; + + for (unsigned sq = SQ_CF; sq < SQ_NUM; ++sq) { + if (!bu_ready[sq].empty() || !bu_ready_next[sq].empty()) + ready_mask |= (1 << sq); + } + + if (!ready_mask) { + for (unsigned sq = SQ_CF; sq < SQ_NUM; ++sq) { + if (!bu_ready_early[sq].empty()) { + node *n = bu_ready_early[sq].front(); + bu_ready_early[sq].pop_front(); + bu_ready[sq].push_back(n); + break; + } + } + } + + for (unsigned sq = SQ_CF; sq < SQ_NUM; ++sq) { + + if (!bu_ready_next[sq].empty()) + bu_ready[sq].splice(bu_ready[sq].end(), bu_ready_next[sq]); + + cnt_ready[sq] = bu_ready[sq].size(); + + if ((sq == SQ_TEX || sq == SQ_VTX) && + cnt_ready[sq] < ctx.max_fetch/2 && + !bu_ready_next[SQ_ALU].empty()) { + sq = SQ_ALU; + --sq; + continue; + } + + while (!bu_ready[sq].empty()) { + + if (last_inst_type != sq) { + clause = NULL; + last_count = 0; + last_inst_type = sq; + } + + n = bu_ready[sq].front(); + + // real count (e.g. SAMPLE_G will be expanded to 3 instructions, + // 2 SET_GRAD_ + 1 SAMPLE_G + unsigned ncnt = 1; + if (n->is_fetch_inst() && n->src.size() == 12) { + ncnt = 3; + } + + if ((sq == SQ_TEX || sq == SQ_VTX) && + ((last_count >= ctx.max_fetch/2 && + check_alu_ready_count(24)) || + last_count + ncnt > ctx.max_fetch)) + break; + else if (sq == SQ_CF && last_count > 4 && + check_alu_ready_count(24)) + break; + + bu_ready[sq].pop_front(); + + if (sq != SQ_CF) { + if (!clause) { + clause = sh.create_clause(sq == SQ_ALU ? + NST_ALU_CLAUSE : + sq == SQ_TEX ? 
NST_TEX_CLAUSE : + NST_VTX_CLAUSE); + bb->push_front(clause); + } + } else { + clause = bb; + } + + bu_schedule(clause, n); + s = true; + last_count += ncnt; + } + } + } + + bu_bb = NULL; + + GCM_DUMP( + cerr << "bu finished scheduling BB_" << bb->id << "\n"; + ); +} + +void gcm::bu_release_defs(vvec& v, bool src) { + for (vvec::reverse_iterator I = v.rbegin(), E = v.rend(); I != E; ++I) { + value *v = *I; + if (!v || v->is_readonly()) + continue; + + if (v->is_rel()) { + if (!v->rel->is_readonly()) + bu_release_val(v->rel); + bu_release_defs(v->muse, true); + } else if (src) + bu_release_val(v); + } +} + +void gcm::push_uc_stack() { + GCM_DUMP( + cerr << "pushing use count stack prev_level " << ucs_level + << " new level " << (ucs_level + 1) << "\n"; + ); + ++ucs_level; + if (ucs_level == nuc_stk.size()) { + nuc_stk.resize(ucs_level + 1); + } + else { + nuc_stk[ucs_level].clear(); + } +} + +bool gcm::bu_is_ready(node* n) { + nuc_map &cm = nuc_stk[ucs_level]; + nuc_map::iterator F = cm.find(n); + unsigned uc = (F == cm.end() ? 
0 : F->second); + return uc == uses[n]; +} + +void gcm::bu_schedule(container_node* c, node* n) { + GCM_DUMP( + cerr << "bu scheduling : "; + dump::dump_op(n); + cerr << "\n"; + ); + + assert(op_map[n].bottom_bb == bu_bb); + + bu_release_defs(n->src, true); + bu_release_defs(n->dst, false); + + c->push_front(n); +} + +void gcm::dump_uc_stack() { + cerr << "##### uc_stk start ####\n"; + for (unsigned l = 0; l <= ucs_level; ++l) { + nuc_map &m = nuc_stk[l]; + + cerr << "nuc_stk[" << l << "] : @" << &m << "\n"; + + for (nuc_map::iterator I = m.begin(), E = m.end(); I != E; ++I) { + cerr << " uc " << I->second << " for "; + dump::dump_op(I->first); + cerr << "\n"; + } + } + cerr << "##### uc_stk end ####\n"; +} + +void gcm::pop_uc_stack() { + nuc_map &pm = nuc_stk[ucs_level]; + --ucs_level; + nuc_map &cm = nuc_stk[ucs_level]; + + GCM_DUMP( + cerr << "merging use stack from level " << (ucs_level+1) + << " to " << ucs_level << "\n"; + ); + + for (nuc_map::iterator N, I = pm.begin(), E = pm.end(); I != E; ++I) { + node *n = I->first; + + GCM_DUMP( + cerr << " " << cm[n] << " += " << I->second << " for "; + dump::dump_op(n); + cerr << "\n"; + ); + + unsigned uc = cm[n] += I->second; + + if (n->parent == &pending && uc == uses[n]) { + cm.erase(n); + pending_nodes.push_back(n); + GCM_DUMP( + cerr << "pushed pending_node due to stack pop "; + dump::dump_op(n); + cerr << "\n"; + ); + } + } +} + +void gcm::bu_find_best_bb(node *n, op_info &oi) { + + GCM_DUMP( + cerr << " find best bb : "; + dump::dump_op(n); + cerr << "\n"; + ); + + if (oi.bottom_bb) + return; + + // don't hoist generated copies + if (n->flags & NF_DONT_HOIST) { + oi.bottom_bb = bu_bb; + return; + } + + bb_node* best_bb = bu_bb; + bb_node* top_bb = oi.top_bb; + assert(oi.top_bb && !oi.bottom_bb); + + node *c = best_bb; + + // FIXME top_bb may be located inside the loop so we'll never enter it + // in the loop below, and the instruction will be incorrectly placed at the + // beginning of the shader. 
+ // For now just check if top_bb's loop_level is higher than of + // current bb and abort the search for better bb in such case, + // but this problem may require more complete (and more expensive) fix + if (top_bb->loop_level <= best_bb->loop_level) { + while (c && c != top_bb) { + + if (c->prev) { + c = c->prev; + } else { + c = c->parent; + if (!c) + break; + continue; + } + + if (c->subtype == NST_BB) { + bb_node *bb = static_cast<bb_node*>(c); + if (bb->loop_level < best_bb->loop_level) + best_bb = bb; + } + } + } + + oi.bottom_bb = best_bb; +} + +void gcm::add_ready(node *n) { + sched_queue_id sq = sh.get_queue_id(n); + if (n->flags & NF_SCHEDULE_EARLY) + bu_ready_early[sq].push_back(n); + else + bu_ready_next[sq].push_back(n); +} + +void gcm::bu_release_op(node * n) { + op_info &oi = op_map[n]; + + GCM_DUMP( + cerr << " bu release op "; + dump::dump_op(n); + ); + + nuc_stk[ucs_level].erase(n); + pending.remove_node(n); + + bu_find_best_bb(n, oi); + + if (oi.bottom_bb == bu_bb) { + GCM_DUMP( cerr << " ready\n";); + add_ready(n); + } else { + GCM_DUMP( cerr << " ready_above\n";); + ready_above.push_back(n); + } +} + +void gcm::bu_release_phi_defs(container_node* p, unsigned op) +{ + for (node_riterator I = p->rbegin(), E = p->rend(); I != E; ++I) { + node *o = *I; + value *v = o->src[op]; + if (v && !v->is_readonly()) + pending_defs.push_back(o->src[op]); + + } +} + +unsigned gcm::get_uc_vec(vvec &vv) { + unsigned c = 0; + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value *v = *I; + if (!v) + continue; + + if (v->is_rel()) + c += get_uc_vec(v->mdef); + else + c += v->use_count(); + } + return c; +} + +void gcm::init_use_count(nuc_map& m, container_node &s) { + m.clear(); + for (node_iterator I = s.begin(), E = s.end(); I != E; ++I) { + node *n = *I; + unsigned uc = get_uc_vec(n->dst); + GCM_DUMP( + cerr << "uc " << uc << " "; + dump::dump_op(n); + cerr << "\n"; + ); + if (!uc) { + pending_nodes.push_back(n); + GCM_DUMP( + cerr << 
"pushed pending_node in init "; + dump::dump_op(n); + cerr << "\n"; + ); + + } else + m[n] = uc; + } +} + +void gcm::bu_release_val(value* v) { + node *n = v->any_def(); + + if (n && n->parent == &pending) { + unsigned uc = ++nuc_stk[ucs_level][n]; + unsigned uc2 = uses[n]; + + GCM_DUMP( + cerr << "release val "; + dump::dump_val(v); + cerr << " for node "; + dump::dump_op(n); + cerr << " new uc=" << uc << ", total " << uc2 << "\n"; + ); + + if (uc == uc2) + bu_release_op(n); + } + +} + +void gcm::init_def_count(nuc_map& m, container_node& s) { + m.clear(); + for (node_iterator I = s.begin(), E = s.end(); I != E; ++I) { + node *n = *I; + unsigned dc = get_dc_vec(n->src, true) + get_dc_vec(n->dst, false); + m[n] = dc; + + GCM_DUMP( + cerr << "dc " << dc << " "; + dump::dump_op(n); + cerr << "\n"; + ); + } +} + +unsigned gcm::get_dc_vec(vvec& vv, bool src) { + unsigned c = 0; + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value *v = *I; + if (!v || v->is_readonly()) + continue; + + if (v->is_rel()) { + c += v->rel->def != NULL; + c += get_dc_vec(v->muse, true); + } + else if (src) { + c += v->def != NULL; + c += v->adef != NULL; + } + } + return c; +} + +unsigned gcm::real_alu_count(sched_queue& q, unsigned max) { + sq_iterator I(q.begin()), E(q.end()); + unsigned c = 0; + + while (I != E && c < max) { + node *n = *I; + if (n->is_alu_inst()) { + if (!n->is_copy_mov() || !n->src[0]->is_any_gpr()) + ++c; + } else if (n->is_alu_packed()) { + c += static_cast<container_node*>(n)->count(); + } + ++I; + } + + return c; +} + +bool gcm::check_alu_ready_count(unsigned threshold) { + unsigned r = real_alu_count(bu_ready[SQ_ALU], threshold); + if (r >= threshold) + return true; + r += real_alu_count(bu_ready_next[SQ_ALU], threshold - r); + return r >= threshold; +} + +} // namespace r600_sb diff --git a/src/gallium/drivers/r600/sb/sb_gvn.cpp b/src/gallium/drivers/r600/sb/sb_gvn.cpp new file mode 100644 index 00000000000..6798917b189 --- /dev/null +++ 
b/src/gallium/drivers/r600/sb/sb_gvn.cpp @@ -0,0 +1,231 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Vadim Girlin + */ + +#define GVN_DEBUG 0 + +#if GVN_DEBUG +#define GVN_DUMP(q) do { q } while (0) +#else +#define GVN_DUMP(q) +#endif + +#include "sb_shader.h" + +#include "sb_pass.h" + +#include "sb_sched.h" + +namespace r600_sb { + +using std::cerr; + +bool gvn::visit(node& n, bool enter) { + if (enter) { + + + bool rewrite = true; + + if (n.dst[0]->is_agpr()) { + rewrite = false; + } + + + process_op(n, rewrite); + + assert(n.parent); + + if (n.parent->subtype == NST_LOOP_PHI_CONTAINER) { + // There is a problem - sometimes with nested loops + // loop counter initialization for inner loop is incorrectly hoisted + // out of the outer loop + + // FIXME not sure if this is enough to fix a problem completely, + // possibly more complete fix is needed (anyway, the + // problem was seen only in relatively complex + // case involving nested loops and + // indirect access to loop counters (without proper array info + // loop counters may be considered as array elements too), + // was not seen in any tests + // or real apps when proper array information is available in TGSI). 
+ + // For now just mark the instructions that initialize loop counters + // with DONT_HOIST flag to prevent the insts like MOV r, 0 + // (initialization of inner loop's counter with const) + // from being hoisted out of the outer loop + + assert(!n.src.empty()); + value *v = n.src[0]; + + if (v->is_any_gpr() && v->def) + v->def->flags |= NF_DONT_HOIST; + } + + } else { + } + return true; +} + +bool gvn::visit(cf_node& n, bool enter) { + if (enter) { + process_op(n); + } else { + } + return true; +} + +bool gvn::visit(alu_node& n, bool enter) { + if (enter) { + process_op(n); + } else { + } + return true; +} + +bool gvn::visit(alu_packed_node& n, bool enter) { + if (enter) { + process_op(n); + } else { + } + return false; +} + +bool gvn::visit(fetch_node& n, bool enter) { + if (enter) { + process_op(n); + } else { + } + return true; +} + +bool gvn::visit(region_node& n, bool enter) { + if (enter) { +// FIXME: loop_phi sources are undefined yet (except theone from the preceding +// code), can we handle that somehow? +// if (n.loop_phi) +// run_on(*n.loop_phi); + } else { + if (n.loop_phi) + run_on(*n.loop_phi); + + if (n.phi) + run_on(*n.phi); + } + return true; +} + +bool gvn::process_src(value* &v, bool rewrite) { + if (!v->gvn_source) + sh.vt.add_value(v); + + if (rewrite && !v->gvn_source->is_rel()) { + v = v->gvn_source; + return true; + } + return false; +} + +// FIXME: maybe handle it in the scheduler? 
+void gvn::process_alu_src_constants(node &n, value* &v) { + if (n.src.size() < 3) { + process_src(v, true); + return; + } + + if (!v->gvn_source) + sh.vt.add_value(v); + + rp_kcache_tracker kc(sh); + + kc.try_reserve(v->gvn_source->select); + + // don't propagate 3rd constant to the trans-only instruction + if (!n.is_alu_packed()) { + alu_node *a = static_cast<alu_node*>(&n); + if (a->bc.op_ptr->src_count == 3 && !(a->bc.slot_flags & AF_V)) { + unsigned const_count = 0; + for (vvec::iterator I = n.src.begin(), E = n.src.end(); I != E; + ++I) { + value *c = (*I); + if (c && c->is_readonly() && ++const_count == 2) { + process_src(v, false); + return; + } + } + } + } + + for (vvec::iterator I = n.src.begin(), E = n.src.end(); I != E; ++I) { + value *c = (*I); + + if (c->is_kcache() && !kc.try_reserve(c->select)) { + process_src(v, false); + return; + } + } + process_src(v, true); +} + +void gvn::process_op(node& n, bool rewrite) { + + for(vvec::iterator I = n.src.begin(), E = n.src.end(); I != E; ++I) { + value* &v = *I; + if (v) { + if (v->rel) { + process_src(v->rel, rewrite); + } + + if (rewrite && v->gvn_source && v->gvn_source->is_readonly() + && n.is_any_alu()) { + process_alu_src_constants(n, v); + } else { + if (rewrite && (n.is_fetch_op(FETCH_OP_VFETCH) || + n.is_fetch_op(FETCH_OP_SEMFETCH))) + process_src(v, false); + else + process_src(v, rewrite); + } + } + } + if (n.pred) + process_src(n.pred, false); + + if (n.type == NT_IF) { + if_node &i = (if_node&)n; + if (i.cond) + process_src(i.cond, false); + } + + for(vvec::iterator I = n.dst.begin(), E = n.dst.end(); I != E; ++I) { + value *v = *I; + if (v) { + if (v->rel) + process_src(v->rel, rewrite); + sh.vt.add_value(v); + } + } +} + +} // namespace r600_sb diff --git a/src/gallium/drivers/r600/sb/sb_if_conversion.cpp b/src/gallium/drivers/r600/sb/sb_if_conversion.cpp new file mode 100644 index 00000000000..489b049490c --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_if_conversion.cpp @@ -0,0 +1,268 @@ 
+/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Vadim Girlin + */ + +#define IFC_DEBUG 0 + +#if IFC_DEBUG +#define IFC_DUMP(q) do { q } while (0) +#else +#define IFC_DUMP(q) +#endif + +#include "sb_shader.h" + +#include "sb_pass.h" + +namespace r600_sb { + +using std::cerr; + +int if_conversion::run() { + + regions_vec &rv = sh.get_regions(); + + unsigned converted = 0; + + for (regions_vec::reverse_iterator N, I = rv.rbegin(), E = rv.rend(); + I != E; I = N) { + N = I; ++N; + + region_node *r = *I; + if (run_on(r)) { + rv.erase(I.base() - 1); + ++converted; + } + } + return 0; +} + +unsigned if_conversion::try_convert_kills(region_node* r) { + + // handling the simplest (and probably most frequent) case only - + // if - 4 kills - endif + + // TODO handle more complex cases + + depart_node *d1 = static_cast<depart_node*>(r->front()); + if (!d1->is_depart()) + return 0; + + if_node *f = static_cast<if_node*>(d1->front()); + if (!f->is_if()) + return 0; + + depart_node *d2 = static_cast<depart_node*>(f->front()); + if (!d2->is_depart()) + return 0; + + unsigned cnt = 0; + + for (node_iterator I = d2->begin(), E = d2->end(); I != E; ++I) { + alu_node *n = static_cast<alu_node*>(*I); + if (!n->is_alu_inst()) + return 0; + + if (!(n->bc.op_ptr->flags & AF_KILL)) + return 0; + + if (n->bc.op_ptr->src_count != 2 || n->src.size() != 2) + return 0; + + value *s1 = n->src[0], *s2 = n->src[1]; + + // assuming that the KILL with constant operands is "always kill" + + if (!s1 || !s2 || !s1->is_const() || !s2->is_const()) + return 0; + + ++cnt; + } + + if (cnt > 4) + return 0; + + value *cond = f->cond; + value *pred = get_select_value_for_em(sh, cond); + + if (!pred) + return 0; + + for (node_iterator N, I = d2->begin(), E = d2->end(); I != E; I = N) { + N = I; ++N; + + alu_node *n = static_cast<alu_node*>(*I); + + IFC_DUMP( + cerr << "converting "; + dump::dump_op(n); + cerr << " " << n << "\n"; + ); + + n->remove(); + + n->bc.set_op(ALU_OP2_KILLE_INT); + n->src[0] = pred; + n->src[1] = 
sh.get_const_value(0); + // reset src modifiers + memset(&n->bc.src[0], 0, sizeof(bc_alu_src)); + memset(&n->bc.src[1], 0, sizeof(bc_alu_src)); + + r->insert_before(n); + } + + return cnt; +} + + + +bool if_conversion::run_on(region_node* r) { + + if (r->dep_count() != 2 || r->rep_count() != 1) + return false; + + node_stats s; + + r->collect_stats(s); + + IFC_DUMP( + cerr << "ifcvt: region " << r->region_id << " :\n"; + s.dump(); + ); + + if (s.region_count || s.fetch_count || + s.if_count != 1 || s.repeat_count) + return false; + + unsigned real_alu_count = s.alu_count - s.alu_copy_mov_count; + + // if_conversion allows to eliminate JUMP-ALU_POP_AFTER or + // JUMP-ALU-ELSE-ALU_POP_AFTER, for now let's assume that 3 CF instructions + // are eliminated. According to the docs, cost of CF instruction is + // equal to ~40 ALU VLIW instructions (instruction groups), + // so we have eliminated cost equal to ~120 groups in total. + // Let's also assume that we have avg 3 ALU instructions per group, + // This means that potential eliminated cost is about 360 single alu inst. + // On the other hand, we are speculatively executing conditional code now, + // so we are increasing the cost in some cases. In the worst case, we'll + // have to execute real_alu_count additional alu instructions instead of + // jumping over them. Let's assume for now that average added cost is + // + // (0.9 * real_alu_count) + // + // So we should perform if_conversion if + // + // (0.9 * real_alu_count) < 360, or + // + // real_alu_count < 400 + // + // So if real_alu_count is more than 400, than we think that if_conversion + // doesn't make sense. + + // FIXME: We can use more precise heuristic, taking into account sizes of + // the branches and their probability instead of total size. + // Another way to improve this is to consider the number of the groups + // instead of the number of instructions (taking into account actual VLIW + // packing). 
+ // (Currently we don't know anything about packing at this stage, but + // probably we can make some more precise estimations anyway) + + if (real_alu_count > 400) + return false; + + if (s.alu_kill_count) { + unsigned kcnt = try_convert_kills(r); + if (kcnt < s.alu_kill_count) + return false; + } + + IFC_DUMP( cerr << "if_cvt: processing...\n"; ); + + depart_node *nd1 = static_cast<depart_node*>(r->first); + if (!nd1->is_depart()) + return false; + if_node *nif = static_cast<if_node*>(nd1->first); + if (!nif->is_if()) + return false; + depart_node *nd2 = static_cast<depart_node*>(nif->first); + if (!nd2->is_depart()) + return false; + + value *em = nif->cond; + value *select = get_select_value_for_em(sh, em); + + if (!select) + return false; + + for (node_iterator I = r->phi->begin(), E = r->phi->end(); I != E; ++I) { + node *n = *I; + + alu_node *ns = convert_phi(select, n); + + if (ns) + r->insert_after(ns); + } + + nd2->expand(); + nif->expand(); + nd1->expand(); + r->expand(); + + return true; +} + +alu_node* if_conversion::convert_phi(value* select, node* phi) { + assert(phi->dst.size() == 1 || phi->src.size() == 2); + + value *d = phi->dst[0]; + value *v1 = phi->src[0]; + value *v2 = phi->src[1]; + + assert(d); + + if (!d->is_any_gpr()) + return NULL; + + if (v1->is_undef()) { + if (v2->is_undef()) { + return NULL; + } else { + return sh.create_mov(d, v2); + } + } else if (v2->is_undef()) + return sh.create_mov(d, v1); + + alu_node* n = sh.create_alu(); + + n->bc.set_op(ALU_OP3_CNDE_INT); + n->dst.push_back(d); + n->src.push_back(select); + n->src.push_back(v1); + n->src.push_back(v2); + + return n; +} + +} // namespace r600_sb diff --git a/src/gallium/drivers/r600/sb/sb_ir.cpp b/src/gallium/drivers/r600/sb/sb_ir.cpp new file mode 100644 index 00000000000..cbb32378f91 --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_ir.cpp @@ -0,0 +1,553 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to 
any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Vadim Girlin + */ + +#include "sb_bc.h" +#include "sb_shader.h" +#include "sb_pass.h" + +namespace r600_sb { + +using std::cerr; + +bool node::accept(vpass& p, bool enter) { return p.visit(*this, enter); } +bool container_node::accept(vpass& p, bool enter) { return p.visit(*this, enter); } +bool alu_group_node::accept(vpass& p, bool enter) { return p.visit(*this, enter); } +bool alu_node::accept(vpass& p, bool enter) { return p.visit(*this, enter); } +bool cf_node::accept(vpass& p, bool enter) { return p.visit(*this, enter); } +bool fetch_node::accept(vpass& p, bool enter) { return p.visit(*this, enter); } +bool region_node::accept(vpass& p, bool enter) { return p.visit(*this, enter); } + +bool repeat_node::accept(vpass& p, bool enter) { + return p.visit(*this, enter); +} + +bool depart_node::accept(vpass& p, bool enter) { + return p.visit(*this, enter); +} +bool if_node::accept(vpass& p, bool enter) { return p.visit(*this, enter); } +bool bb_node::accept(vpass& p, bool enter) { return p.visit(*this, enter); } +bool alu_packed_node::accept(vpass& p, bool enter) { + return p.visit(*this, enter); +} + +void alu_packed_node::init_args() { + alu_node *p = static_cast<alu_node*>(first); + assert(p->is_valid()); + while (p) { + dst.insert(dst.end(), p->dst.begin(), p->dst.end()); + src.insert(src.end(), p->src.begin(), p->src.end()); + p = static_cast<alu_node*>(p->next); + } + + // if it's packed then it's always multislot, no need to check slot flags + bool repl = (op_ptr()->flags & AF_REPL); + value *replicated_value = NULL; + + for (vvec::iterator I = dst.begin(), E = dst.end(); I != E; ++I) { + value *v = *I; + if (v) { + if (repl) { + if (replicated_value) + v->assign_source(replicated_value); + else + replicated_value = v; + } + + v->def = this; + } + } +} + +void container_node::insert_node_before(node* s, node* n) { + if (s->prev) { + node *sp = s->prev; + sp->next = n; + n->prev = sp; + n->next = s; + s->prev = n; + } else { + n->next = s; 
+ s->prev = n; + first = n; + } + n->parent = this; +} + +void container_node::insert_node_after(node* s, node* n) { + if (s->next) { + node *sn = s->next; + sn->prev = n; + n->next = sn; + n->prev = s; + s->next = n; + } else { + n->prev = s; + s->next = n; + last = n; + } + n->parent = this; +} + +void container_node::move(iterator b, iterator e) { + assert(b != e); + + container_node *source_container = b->parent; + node *l = source_container->cut(b, e); + + first = last = l; + first->parent = this; + + while (last->next) { + last = last->next; + last->parent = this; + } +} + +node* container_node::cut(iterator b, iterator e) { + assert(!*b || b->parent == this); + assert(!*e || e->parent == this); + assert(b != e); + + if (b->prev) { + b->prev->next = *e; + } else { + first = *e; + } + + if (*e) { + e->prev->next = NULL; + e->prev = b->prev; + } else { + last->next = NULL; + last = b->prev; + } + + b->prev = NULL; + + return *b; +} + +unsigned container_node::count() { + unsigned c = 0; + node *t = first; + while (t) { + t = t->next; + c++; + } + return c; +} + +void container_node::remove_node(node *n) { + if (n->prev) + n->prev->next = n->next; + else + first = n->next; + if (n->next) + n->next->prev = n->prev; + else + last = n->prev; + n->parent = NULL; +} + +void container_node::expand(container_node *n) { + if (!n->empty()) { + node *e0 = n->first; + node *e1 = n->last; + + e0->prev = n->prev; + if (e0->prev) { + e0->prev->next = e0; + } else { + first = e0; + } + + e1->next = n->next; + if (e1->next) + e1->next->prev = e1; + else + last = e1; + + do { + e0->parent = this; + e0 = e0->next; + } while (e0 != e1->next); + } else + remove_node(n); +} + +void container_node::push_back(node *n) { + if (last) { + last->next = n; + n->next = NULL; + n->prev = last; + last = n; + } else { + assert(!first); + first = last = n; + n->prev = n->next = NULL; + } + n->parent = this; +} +void container_node::push_front(node *n) { + if (first) { + first->prev = n; + 
n->prev = NULL; + n->next = first; + first = n; + } else { + assert(!last); + first = last = n; + n->prev = n->next = NULL; + } + n->parent = this; +} + +void node::insert_before(node* n) { + parent->insert_node_before(this, n); +} + +void node::insert_after(node* n) { + parent->insert_node_after(this, n); +} + +void node::replace_with(node* n) { + n->prev = prev; + n->next = next; + n->parent = parent; + if (prev) + prev->next = n; + if (next) + next->prev = n; + + if (parent->first == this) + parent->first = n; + + if (parent->last == this) + parent->last = n; + + parent = NULL; + next = prev = NULL; +} + +void container_node::expand() { + parent->expand(this); +} + +void node::remove() {parent->remove_node(this); +} + +value_hash node::hash_src() { + + value_hash h = 12345; + + for (int k = 0, e = src.size(); k < e; ++k) { + value *s = src[k]; + if (s) + h ^= (s->hash()); + } + + return h; +} + + +value_hash node::hash() { + + if (parent && parent->subtype == NST_LOOP_PHI_CONTAINER) + return 47451; + + return hash_src() ^ (subtype << 13) ^ (type << 3); +} + +void r600_sb::container_node::append_from(container_node* c) { + if (!c->first) + return; + + node *b = c->first; + + if (last) { + last->next = c->first; + last->next->prev = last; + } else { + first = c->first; + } + + last = c->last; + c->first = NULL; + c->last = NULL; + + while (b) { + b->parent = this; + b = b->next; + } +} + +bool node::fold_dispatch(expr_handler* ex) { return ex->fold(*this); } +bool container_node::fold_dispatch(expr_handler* ex) { return ex->fold(*this); } +bool alu_node::fold_dispatch(expr_handler* ex) { return ex->fold(*this); } +bool alu_packed_node::fold_dispatch(expr_handler* ex) { return ex->fold(*this); } +bool fetch_node::fold_dispatch(expr_handler* ex) { return ex->fold(*this); } +bool cf_node::fold_dispatch(expr_handler* ex) { return ex->fold(*this); } + +unsigned alu_packed_node::get_slot_mask() { + unsigned mask = 0; + for (node_iterator I = begin(), E = end(); I != E; 
++I) + mask |= 1 << static_cast<alu_node*>(*I)->bc.slot; + return mask; +} + +void alu_packed_node::update_packed_items(sb_context &ctx) { + + vvec::iterator SI(src.begin()), DI(dst.begin()); + + assert(first); + + alu_node *c = static_cast<alu_node*>(first); + unsigned flags = c->bc.op_ptr->flags; + unsigned slot_flags = c->bc.slot_flags; + + // fixup dst for instructions that replicate output + if (((flags & AF_REPL) && slot_flags == AF_4V) || + (ctx.is_cayman() && slot_flags == AF_S)) { + + value *swp[4] = {}; + + unsigned chan; + + for (vvec::iterator I2 = dst.begin(), E2 = dst.end(); + I2 != E2; ++I2) { + value *v = *I2; + if (v) { + chan = v->get_final_chan(); + assert(!swp[chan] || swp[chan] == v); + swp[chan] = v; + } + } + + chan = 0; + for (vvec::iterator I2 = dst.begin(), E2 = dst.end(); + I2 != E2; ++I2, ++chan) { + *I2 = swp[chan]; + } + } + + for (node_iterator I = begin(), E = end(); I != E; ++I) { + alu_node *n = static_cast<alu_node*>(*I); + assert(n); + + for (vvec::iterator I2 = n->src.begin(), E2 = n->src.end(); + I2 != E2; ++I2, ++SI) { + *I2 = *SI; + } + for (vvec::iterator I2 = n->dst.begin(), E2 = n->dst.end(); + I2 != E2; ++I2, ++DI) { + *I2 = *DI; + } + } +} + +bool node::is_cf_op(unsigned op) { + if (!is_cf_inst()) + return false; + cf_node *c = static_cast<cf_node*>(this); + return c->bc.op == op; +} + +bool node::is_alu_op(unsigned op) { + if (!is_alu_inst()) + return false; + alu_node *c = static_cast<alu_node*>(this); + return c->bc.op == op; +} + +bool node::is_fetch_op(unsigned op) { + if (!is_fetch_inst()) + return false; + fetch_node *c = static_cast<fetch_node*>(this); + return c->bc.op == op; +} + + + +bool node::is_mova() { + if (!is_alu_inst()) + return false; + alu_node *a = static_cast<alu_node*>(this); + return (a->bc.op_ptr->flags & AF_MOVA); +} + +bool node::is_pred_set() { + if (!is_alu_inst()) + return false; + alu_node *a = static_cast<alu_node*>(this); + return (a->bc.op_ptr->flags & AF_ANY_PRED); +} + +unsigned 
node::cf_op_flags() { + assert(is_cf_inst()); + cf_node *c = static_cast<cf_node*>(this); + return c->bc.op_ptr->flags; +} + +unsigned node::alu_op_flags() { + assert(is_alu_inst()); + alu_node *c = static_cast<alu_node*>(this); + return c->bc.op_ptr->flags; +} + +unsigned node::fetch_op_flags() { + assert(is_fetch_inst()); + fetch_node *c = static_cast<fetch_node*>(this); + return c->bc.op_ptr->flags; +} + +unsigned node::alu_op_slot_flags() { + assert(is_alu_inst()); + alu_node *c = static_cast<alu_node*>(this); + return c->bc.slot_flags; +} + +region_node* node::get_parent_region() { + node *p = this; + while ((p = p->parent)) + if (p->is_region()) + return static_cast<region_node*>(p); + return NULL; +} + +unsigned container_node::real_alu_count() { + unsigned c = 0; + node *t = first; + while (t) { + if (t->is_alu_inst()) + ++c; + else if (t->is_alu_packed()) + c += static_cast<container_node*>(t)->count(); + t = t->next; + } + return c; +} + +void container_node::collect_stats(node_stats& s) { + + for (node_iterator I = begin(), E = end(); I != E; ++I) { + node *n = *I; + if (n->is_container()) { + static_cast<container_node*>(n)->collect_stats(s); + } + + if (n->is_alu_inst()) { + ++s.alu_count; + alu_node *a = static_cast<alu_node*>(n); + if (a->bc.op_ptr->flags & AF_KILL) + ++s.alu_kill_count; + else if (a->is_copy_mov()) + ++s.alu_copy_mov_count; + } else if (n->is_fetch_inst()) + ++s.fetch_count; + else if (n->is_cf_inst()) + ++s.cf_count; + else if (n->is_region()) { + ++s.region_count; + region_node *r = static_cast<region_node*>(n); + if(r->is_loop()) + ++s.loop_count; + + if (r->phi) + s.phi_count += r->phi->count(); + if (r->loop_phi) + s.loop_phi_count += r->loop_phi->count(); + } + else if (n->is_depart()) + ++s.depart_count; + else if (n->is_repeat()) + ++s.repeat_count; + else if (n->is_if()) + ++s.if_count; + } +} + +void region_node::expand_depart(depart_node *d) { + depart_vec::iterator I = departs.begin() + d->dep_id, E; + I = 
departs.erase(I); + E = departs.end(); + while (I != E) { + --(*I)->dep_id; + ++I; + } + d->expand(); +} + +void region_node::expand_repeat(repeat_node *r) { + repeat_vec::iterator I = repeats.begin() + r->rep_id - 1, E; + I = repeats.erase(I); + E = repeats.end(); + while (I != E) { + --(*I)->rep_id; + ++I; + } + r->expand(); +} + +void node_stats::dump() { + cerr << " alu_count : " << alu_count << "\n"; + cerr << " alu_kill_count : " << alu_kill_count << "\n"; + cerr << " alu_copy_mov_count : " << alu_copy_mov_count << "\n"; + cerr << " cf_count : " << cf_count << "\n"; + cerr << " fetch_count : " << fetch_count << "\n"; + cerr << " region_count : " << region_count << "\n"; + cerr << " loop_count : " << loop_count << "\n"; + cerr << " phi_count : " << phi_count << "\n"; + cerr << " loop_phi_count : " << loop_phi_count << "\n"; + cerr << " depart_count : " << depart_count << "\n"; + cerr << " repeat_count : " << repeat_count << "\n"; + cerr << " if_count : " << if_count << "\n"; +} + +unsigned alu_node::interp_param() { + if (!(bc.op_ptr->flags & AF_INTERP)) + return 0; + unsigned param; + if (bc.op_ptr->src_count == 2) { + param = src[1]->select.sel(); + } else { + param = src[0]->select.sel(); + } + return param + 1; +} + +alu_group_node* alu_node::get_alu_group_node() { + node *p = parent; + if (p) { + if (p->subtype == NST_ALU_PACKED_INST) { + assert(p->parent && p->parent->subtype == NST_ALU_GROUP); + p = p->parent; + } + return static_cast<alu_group_node*>(p); + } + return NULL; +} + +} // namespace r600_sb diff --git a/src/gallium/drivers/r600/sb/sb_ir.h b/src/gallium/drivers/r600/sb/sb_ir.h new file mode 100644 index 00000000000..7f7e71e2335 --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_ir.h @@ -0,0 +1,1153 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software 
without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Vadim Girlin + */ + +#ifndef R600_SB_IR_H_ +#define R600_SB_IR_H_ + +#include <algorithm> +#include <stdint.h> +#include <iostream> +#include <vector> +#include <set> +#include <algorithm> + +#include "sb_bc.h" + +namespace r600_sb { + +enum special_regs { + SV_ALU_PRED = 128, + SV_EXEC_MASK, + SV_AR_INDEX, + SV_VALID_MASK +}; + +class node; +class value; +class shader; + +struct sel_chan +{ + unsigned id; + + sel_chan(unsigned id = 0) : id(id) {} + sel_chan(unsigned sel, unsigned chan) : id(((sel << 2) | chan) + 1) {} + + unsigned sel() const { return sel(id); } + unsigned chan() const {return chan(id); } + operator unsigned() const {return id;} + + static unsigned sel(unsigned idx) { return (idx-1) >> 2; } + static unsigned chan(unsigned idx) { return (idx-1) & 3; } +}; + +inline std::ostream& operator <<(std::ostream& o, sel_chan r) { + static const char * ch = "xyzw"; + o << r.sel() << "." 
<< ch[r.chan()]; + return o; +} + +typedef std::vector<value*> vvec; + +class sb_pool { +protected: + static const unsigned SB_POOL_ALIGN = 8; + static const unsigned SB_POOL_DEFAULT_BLOCK_SIZE = (1 << 16); + + typedef std::vector<void*> block_vector; + + unsigned block_size; + block_vector blocks; + unsigned total_size; + +public: + sb_pool(unsigned block_size = SB_POOL_DEFAULT_BLOCK_SIZE) + : block_size(block_size), blocks(), total_size() {} + + virtual ~sb_pool() { free_all(); } + + void* allocate(unsigned sz); + +protected: + void free_all(); +}; + +template <typename V, typename Comp = std::less<V> > +class sb_set { + typedef std::vector<V> data_vector; + data_vector vec; +public: + + typedef typename data_vector::iterator iterator; + typedef typename data_vector::const_iterator const_iterator; + + sb_set() : vec() {} + ~sb_set() { } + + iterator begin() { return vec.begin(); } + iterator end() { return vec.end(); } + const_iterator begin() const { return vec.begin(); } + const_iterator end() const { return vec.end(); } + + void add_set(const sb_set& s) { + data_vector t; + t.reserve(vec.size() + s.vec.size()); + std::set_union(vec.begin(), vec.end(), s.vec.begin(), s.vec.end(), + std::inserter(t, t.begin()), Comp()); + vec.swap(t); + } + + iterator lower_bound(const V& v) { + return std::lower_bound(vec.begin(), vec.end(), v, Comp()); + } + + std::pair<iterator, bool> insert(const V& v) { + iterator P = lower_bound(v); + if (P != vec.end() && is_equal(*P, v)) + return std::make_pair(P, false); + return std::make_pair(vec.insert(P, v), true); + } + + unsigned erase(const V& v) { + iterator P = lower_bound(v); + if (P == vec.end() || !is_equal(*P, v)) + return 0; + vec.erase(P); + return 1; + } + + void clear() { vec.clear(); } + + bool empty() { return vec.empty(); } + + bool is_equal(const V& v1, const V& v2) { + return !Comp()(v1, v2) && !Comp()(v2, v1); + } + + iterator find(const V& v) { + iterator P = lower_bound(v); + return (P != vec.end() && 
is_equal(*P, v)) ? P : vec.end(); + } + + unsigned size() { return vec.size(); } + void erase(iterator I) { vec.erase(I); } +}; + +template <typename K, typename V, typename KComp = std::less<K> > +class sb_map { + typedef std::pair<K, V> datatype; + + struct Comp { + bool operator()(const datatype &v1, const datatype &v2) { + return KComp()(v1.first, v2.first); + } + }; + + typedef sb_set<datatype, Comp> dataset; + + dataset set; + +public: + + sb_map() : set() {} + + typedef typename dataset::iterator iterator; + + iterator begin() { return set.begin(); } + iterator end() { return set.end(); } + + void clear() { set.clear(); } + + V& operator[](const K& key) { + datatype P = std::make_pair(key, V()); + iterator F = set.find(P); + if (F == set.end()) { + return (*(set.insert(P).first)).second; + } else { + return (*F).second; + } + } + + std::pair<iterator, bool> insert(const datatype& d) { + return set.insert(d); + } + + iterator find(const K& key) { + return set.find(std::make_pair(key, V())); + } + + unsigned erase(const K& key) { + return set.erase(std::make_pair(key, V())); + } + + void erase(iterator I) { + set.erase(I); + } +}; + +class sb_bitset { + typedef uint32_t basetype; + static const unsigned bt_bits = sizeof(basetype) << 3; + std::vector<basetype> data; + unsigned bit_size; + +public: + + sb_bitset() : data(), bit_size() {} + + bool get(unsigned id); + void set(unsigned id, bool bit = true); + bool set_chk(unsigned id, bool bit = true); + + void clear(); + void resize(unsigned size); + + unsigned size() { return bit_size; } + + unsigned find_bit(unsigned start = 0); + + void swap(sb_bitset & bs2); + + bool operator==(const sb_bitset &bs2); + bool operator!=(const sb_bitset &bs2) { return !(*this == bs2); } + + sb_bitset& operator|=(const sb_bitset &bs2) { + if (bit_size < bs2.bit_size) { + resize(bs2.bit_size); + } + + for (unsigned i = 0, c = std::min(data.size(), bs2.data.size()); i < c; + ++i) { + data[i] |= bs2.data[i]; + } + return *this; + } 
+ + sb_bitset& operator&=(const sb_bitset &bs2); + sb_bitset& mask(const sb_bitset &bs2); + + friend sb_bitset operator|(const sb_bitset &b1, const sb_bitset &b2) { + sb_bitset nbs(b1); + nbs |= b2; + return nbs; + } +}; + +class value; + +enum value_kind { + VLK_REG, + VLK_REL_REG, + VLK_SPECIAL_REG, + VLK_TEMP, + + VLK_CONST, + VLK_KCACHE, + VLK_PARAM, + VLK_SPECIAL_CONST, + + VLK_UNDEF +}; + + + +class sb_value_pool : protected sb_pool { + unsigned aligned_elt_size; + +public: + sb_value_pool(unsigned elt_size, unsigned block_elts = 256) + : sb_pool(block_elts * (aligned_elt_size = ((elt_size + + SB_POOL_ALIGN - 1) & ~(SB_POOL_ALIGN - 1)))) {} + + virtual ~sb_value_pool() { delete_all(); } + + value* create(value_kind k, sel_chan regid, unsigned ver); + + value* operator[](unsigned id) { + unsigned offset = id * aligned_elt_size; + unsigned block_id; + if (offset < block_size) { + block_id = 0; + } else { + block_id = offset / block_size; + offset = offset % block_size; + } + return (value*)((char*)blocks[block_id] + offset); + } + + unsigned size() { return total_size / aligned_elt_size; } + +protected: + void delete_all(); +}; + + + + + +class sb_value_set { + + sb_bitset bs; + +public: + sb_value_set() : bs() {} + + class iterator { + sb_value_pool &vp; + sb_value_set *s; + unsigned nb; + public: + iterator(shader &sh, sb_value_set *s, unsigned nb = 0); + + + iterator& operator++() { + if (nb + 1 < s->bs.size()) + nb = s->bs.find_bit(nb + 1); + else + nb = s->bs.size(); + return *this; + } + bool operator !=(const iterator &i) { + return s != i.s || nb != i.nb; + } + bool operator ==(const iterator &i) { return !(*this != i); } + value* operator *() { + return vp[nb]; + } + + + }; + + iterator begin(shader &sh) { + return iterator(sh, this, bs.size() ? 
bs.find_bit(0) : 0); + } + iterator end(shader &sh) { return iterator(sh, this, bs.size()); } + + bool add_set_checked(sb_value_set & s2); + + void add_set(sb_value_set & s2) { + if (bs.size() < s2.bs.size()) + bs.resize(s2.bs.size()); + bs |= s2.bs; + } + + void remove_set(sb_value_set & s2); + + bool add_vec(vvec &vv); + + bool add_val(value *v); + bool contains(value *v); + + bool remove_val(value *v); + + bool remove_vec(vvec &vv); + + void clear(); + + bool empty(); +}; + +typedef sb_value_set val_set; + +struct gpr_array { + sel_chan base_gpr; // original gpr + sel_chan gpr; // assigned by regalloc + unsigned array_size; + + gpr_array(sel_chan base_gpr, unsigned array_size) : base_gpr(base_gpr), + array_size(array_size) {} + + unsigned hash() { return (base_gpr << 10) * array_size; } + + val_set interferences; + vvec refs; + + bool is_dead(); + +}; + +typedef std::vector<gpr_array*> regarray_vec; + +enum value_flags { + VLF_UNDEF = (1 << 0), + VLF_READONLY = (1 << 1), + VLF_DEAD = (1 << 2), + + VLF_PIN_REG = (1 << 3), + VLF_PIN_CHAN = (1 << 4), + + // opposite to alu clause local value - goes through alu clause boundary + // (can't use temp gpr, can't recolor in the alu scheduler, etc) + VLF_GLOBAL = (1 << 5), + VLF_FIXED = (1 << 6), + VLF_PVPS = (1 << 7), + + VLF_PREALLOC = (1 << 8) +}; + +inline value_flags operator |(value_flags l, value_flags r) { + return (value_flags)((unsigned)l|(unsigned)r); +} +inline value_flags operator &(value_flags l, value_flags r) { + return (value_flags)((unsigned)l&(unsigned)r); +} +inline value_flags operator ~(value_flags l) { + return (value_flags)(~(unsigned)l); +} +inline value_flags& operator |=(value_flags &l, value_flags r) { + l = l | r; + return l; +} +inline value_flags& operator &=(value_flags &l, value_flags r) { + l = l & r; + return l; +} + +struct value; + +std::ostream& operator << (std::ostream &o, value &v); + +typedef uint32_t value_hash; + +enum use_kind { + UK_SRC, + UK_SRC_REL, + UK_DST_REL, + 
UK_MAYDEF, + UK_MAYUSE, + UK_PRED, + UK_COND +}; + +struct use_info { + use_info *next; + node *op; + use_kind kind; + int arg; + + use_info(node *n, use_kind kind, int arg, use_info* next) + : next(next), op(n), kind(kind), arg(arg) {} +}; + +enum constraint_kind { + CK_SAME_REG, + CK_PACKED_BS, + CK_PHI +}; + +class shader; +class sb_value_pool; +class ra_chunk; +class ra_constraint; + +class value { +protected: + value(unsigned sh_id, value_kind k, sel_chan select, unsigned ver = 0) + : kind(k), flags(), + rel(), array(), + version(ver), select(select), pin_gpr(select), gpr(), + gvn_source(), ghash(), + def(), adef(), uses(), constraint(), chunk(), + literal_value(), uid(sh_id) {} + + ~value() { delete_uses(); } + + friend class sb_value_pool; +public: + value_kind kind; + value_flags flags; + + vvec mdef; + vvec muse; + value *rel; + gpr_array *array; + + unsigned version; + + sel_chan select; + sel_chan pin_gpr; + sel_chan gpr; + + value *gvn_source; + value_hash ghash; + + node *def, *adef; + use_info *uses; + + ra_constraint *constraint; + ra_chunk *chunk; + + literal literal_value; + + bool is_const() { return kind == VLK_CONST || kind == VLK_UNDEF; } + + bool is_AR() { + return is_special_reg() && select == sel_chan(SV_AR_INDEX, 0); + } + + node* any_def() { + assert(!(def && adef)); + return def ? 
def : adef; + } + + value* gvalue() { + value *v = this; + while (v->gvn_source && v != v->gvn_source) + // FIXME we really shouldn't have such chains + v = v->gvn_source; + return v; + } + + bool is_float_0_or_1() { + value *v = gvalue(); + return v->is_const() && (v->literal_value == literal(0) + || v->literal_value == literal(1.0f)); + } + + bool is_undef() { return gvalue()->kind == VLK_UNDEF; } + + bool is_any_gpr() { + return (kind == VLK_REG || kind == VLK_TEMP); + } + + bool is_agpr() { + return array && is_any_gpr(); + } + + // scalar gpr, as opposed to element of gpr array + bool is_sgpr() { + return !array && is_any_gpr(); + } + + bool is_special_reg() { return kind == VLK_SPECIAL_REG; } + bool is_any_reg() { return is_any_gpr() || is_special_reg(); } + bool is_kcache() { return kind == VLK_KCACHE; } + bool is_rel() { return kind == VLK_REL_REG; } + bool is_readonly() { return flags & VLF_READONLY; } + + bool is_chan_pinned() { return flags & VLF_PIN_CHAN; } + bool is_reg_pinned() { return flags & VLF_PIN_REG; } + + bool is_global(); + void set_global(); + void set_prealloc(); + + bool is_prealloc(); + + bool is_fixed(); + void fix(); + + bool is_dead() { return flags & VLF_DEAD; } + + literal & get_const_value() { + value *v = gvalue(); + assert(v->is_const()); + return v->literal_value; + } + + // true if needs to be encoded as literal in alu + bool is_literal() { + return is_const() + && literal_value != literal(0) + && literal_value != literal(1) + && literal_value != literal(-1) + && literal_value != literal(0.5) + && literal_value != literal(1.0); + } + + void add_use(node *n, use_kind kind, int arg); + + value_hash hash(); + value_hash rel_hash(); + + void assign_source(value *v) { + assert(!gvn_source || gvn_source == this); + gvn_source = v->gvalue(); + } + + bool v_equal(value *v) { return gvalue() == v->gvalue(); } + + unsigned use_count(); + void delete_uses(); + + sel_chan get_final_gpr() { + if (array && array->gpr) { + int reg_offset = 
select.sel() - array->base_gpr.sel(); + if (rel && rel->is_const()) + reg_offset += rel->get_const_value().i; + return array->gpr + (reg_offset << 2); + } else { + return gpr; + } + } + + unsigned get_final_chan() { + if (array) { + assert(array->gpr); + return array->gpr.chan(); + } else { + assert(gpr); + return gpr.chan(); + } + } + + val_set interferences; + unsigned uid; +}; + +class expr_handler; + +class value_table { + typedef std::vector<value*> vt_item; + typedef std::vector<vt_item> vt_table; + + expr_handler &ex; + + unsigned size_bits; + unsigned size; + unsigned size_mask; + + vt_table hashtable; + + unsigned cnt; + +public: + + value_table(expr_handler &ex, unsigned size_bits = 10) + : ex(ex), size_bits(size_bits), size(1u << size_bits), + size_mask(size - 1), hashtable(size), cnt() {} + + ~value_table() {} + + void add_value(value* v); + + bool expr_equal(value* l, value* r); + + unsigned count() { return cnt; } + + void get_values(vvec & v); +}; + +class sb_context; + +enum node_type { + NT_UNKNOWN, + NT_LIST, + NT_OP, + NT_REGION, + NT_REPEAT, + NT_DEPART, + NT_IF, +}; + +enum node_subtype { + NST_UNKNOWN, + NST_LIST, + NST_ALU_GROUP, + NST_ALU_CLAUSE, + NST_ALU_INST, + NST_ALU_PACKED_INST, + NST_CF_INST, + NST_FETCH_INST, + NST_TEX_CLAUSE, + NST_VTX_CLAUSE, + + NST_BB, + + NST_PHI, + NST_PSI, + NST_COPY, + + NST_LOOP_PHI_CONTAINER, + NST_LOOP_CONTINUE, + NST_LOOP_BREAK +}; + +enum node_flags { + NF_EMPTY = 0, + NF_DEAD = (1 << 0), + NF_REG_CONSTRAINT = (1 << 1), + NF_CHAN_CONSTRAINT = (1 << 2), + NF_ALU_4SLOT = (1 << 3), + NF_CONTAINER = (1 << 4), + + NF_COPY_MOV = (1 << 5), + + NF_DONT_KILL = (1 << 6), + NF_DONT_HOIST = (1 << 7), + NF_DONT_MOVE = (1 << 8), + + // for KILLxx - we want to schedule them as early as possible + NF_SCHEDULE_EARLY = (1 << 9) +}; + +inline node_flags operator |(node_flags l, node_flags r) { + return (node_flags)((unsigned)l|(unsigned)r); +} +inline node_flags& operator |=(node_flags &l, node_flags r) { + l = l | r; + 
return l; +} + +inline node_flags& operator &=(node_flags &l, node_flags r) { + l = (node_flags)((unsigned)l & (unsigned)r); + return l; +} + +inline node_flags operator ~(node_flags r) { + return (node_flags)~(unsigned)r; +} + +struct node_stats { + unsigned alu_count; + unsigned alu_kill_count; + unsigned alu_copy_mov_count; + unsigned cf_count; + unsigned fetch_count; + unsigned region_count; + unsigned loop_count; + unsigned phi_count; + unsigned loop_phi_count; + unsigned depart_count; + unsigned repeat_count; + unsigned if_count; + + node_stats() : alu_count(), alu_kill_count(), alu_copy_mov_count(), + cf_count(), fetch_count(), region_count(), + loop_count(), phi_count(), loop_phi_count(), depart_count(), + repeat_count(), if_count() {} + + void dump(); +}; + +class shader; + +class vpass; + +class container_node; +class region_node; + +class node { + +protected: + node(node_type nt, node_subtype nst, node_flags flags = NF_EMPTY) + : prev(), next(), parent(), + type(nt), subtype(nst), flags(flags), + pred(), dst(), src() {} + + virtual ~node() {}; + +public: + node *prev, *next; + container_node *parent; + + node_type type; + node_subtype subtype; + node_flags flags; + + value *pred; + + vvec dst; + vvec src; + + virtual bool is_valid() { return true; } + virtual bool accept(vpass &p, bool enter); + + void insert_before(node *n); + void insert_after(node *n); + void replace_with(node *n); + void remove(); + + virtual value_hash hash(); + value_hash hash_src(); + + virtual bool fold_dispatch(expr_handler *ex); + + bool is_container() { return flags & NF_CONTAINER; } + + bool is_alu_packed() { return subtype == NST_ALU_PACKED_INST; } + bool is_alu_inst() { return subtype == NST_ALU_INST; } + bool is_alu_group() { return subtype == NST_ALU_GROUP; } + bool is_alu_clause() { return subtype == NST_ALU_CLAUSE; } + + bool is_fetch_clause() { + return subtype == NST_TEX_CLAUSE || subtype == NST_VTX_CLAUSE; + } + + bool is_copy() { return subtype == NST_COPY; } + bool 
is_copy_mov() { return flags & NF_COPY_MOV; } + bool is_any_alu() { return is_alu_inst() || is_alu_packed() || is_copy(); } + + bool is_fetch_inst() { return subtype == NST_FETCH_INST; } + bool is_cf_inst() { return subtype == NST_CF_INST; } + + bool is_region() { return type == NT_REGION; } + bool is_depart() { return type == NT_DEPART; } + bool is_repeat() { return type == NT_REPEAT; } + bool is_if() { return type == NT_IF; } + bool is_bb() { return subtype == NST_BB; } + + bool is_phi() { return subtype == NST_PHI; } + + bool is_dead() { return flags & NF_DEAD; } + + bool is_cf_op(unsigned op); + bool is_alu_op(unsigned op); + bool is_fetch_op(unsigned op); + + unsigned cf_op_flags(); + unsigned alu_op_flags(); + unsigned alu_op_slot_flags(); + unsigned fetch_op_flags(); + + bool is_mova(); + bool is_pred_set(); + + bool vec_uses_ar(vvec &vv) { + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value *v = *I; + if (v && v->rel && !v->rel->is_const()) + return true; + } + return false; + } + + bool uses_ar() { + return vec_uses_ar(dst) || vec_uses_ar(src); + } + + + region_node* get_parent_region(); + + friend class shader; +}; + +class container_node : public node { +public: + + container_node(node_type nt = NT_LIST, node_subtype nst = NST_LIST, + node_flags flags = NF_EMPTY) + : node(nt, nst, flags | NF_CONTAINER), first(), last(), + live_after(), live_before() {} + + // child items list + node *first, *last; + + val_set live_after; + val_set live_before; + + class iterator { + node *p; + public: + iterator(node *pp = NULL) : p(pp) {} + iterator & operator ++() { p = p->next; return *this;} + iterator & operator --() { p = p->prev; return *this;} + node* operator *() { return p; } + node* operator ->() { return p; } + const iterator advance(int n) { + if (!n) return *this; + iterator I(p); + if (n > 0) while (n--) ++I; + else while (n++) --I; + return I; + } + const iterator operator +(int n) { return advance(n); } + const iterator operator 
-(int n) { return advance(-n); } + bool operator !=(const iterator &i) { return p != i.p; } + bool operator ==(const iterator &i) { return p == i.p; } + }; + + class riterator { + iterator i; + public: + riterator(node *p = NULL) : i(p) {} + riterator & operator ++() { --i; return *this;} + riterator & operator --() { ++i; return *this;} + node* operator *() { return *i; } + node* operator ->() { return *i; } + bool operator !=(const riterator &r) { return i != r.i; } + bool operator ==(const riterator &r) { return i == r.i; } + }; + + iterator begin() { return first; } + iterator end() { return NULL; } + riterator rbegin() { return last; } + riterator rend() { return NULL; } + + bool empty() { assert(first != NULL || first == last); return !first; } + unsigned count(); + + // used with node containers that represent shceduling queues + // ignores copies and takes into account alu_packed_node items + unsigned real_alu_count(); + + void push_back(node *n); + void push_front(node *n); + + void insert_node_before(node *s, node *n); + void insert_node_after(node *s, node *n); + + void append_from(container_node *c); + + // remove range [b..e) from some container and assign to this container + void move(iterator b, iterator e); + + void expand(); + void expand(container_node *n); + void remove_node(node *n); + + node *cut(iterator b, iterator e); + + void clear() { first = last = NULL; } + + virtual bool is_valid() { return true; } + virtual bool accept(vpass &p, bool enter); + virtual bool fold_dispatch(expr_handler *ex); + + node* front() { return first; } + node* back() { return last; } + + void collect_stats(node_stats &s); + + friend class shader; + + +}; + +typedef container_node::iterator node_iterator; +typedef container_node::riterator node_riterator; + +class alu_group_node : public container_node { +protected: + alu_group_node() : container_node(NT_LIST, NST_ALU_GROUP), literals() {} +public: + + std::vector<literal> literals; + + virtual bool is_valid() { 
return subtype == NST_ALU_GROUP; } + virtual bool accept(vpass &p, bool enter); + + + unsigned literal_chan(literal l) { + std::vector<literal>::iterator F = + std::find(literals.begin(), literals.end(), l); + assert(F != literals.end()); + return F - literals.begin(); + } + + friend class shader; +}; + +class cf_node : public container_node { +protected: + cf_node() : container_node(NT_OP, NST_CF_INST), bc(), jump_target(), + jump_after_target() {}; +public: + bc_cf bc; + + cf_node *jump_target; + bool jump_after_target; + + virtual bool is_valid() { return subtype == NST_CF_INST; } + virtual bool accept(vpass &p, bool enter); + virtual bool fold_dispatch(expr_handler *ex); + + void jump(cf_node *c) { jump_target = c; jump_after_target = false; } + void jump_after(cf_node *c) { jump_target = c; jump_after_target = true; } + + friend class shader; +}; + +class alu_node : public node { +protected: + alu_node() : node(NT_OP, NST_ALU_INST), bc() {}; +public: + bc_alu bc; + + virtual bool is_valid() { return subtype == NST_ALU_INST; } + virtual bool accept(vpass &p, bool enter); + virtual bool fold_dispatch(expr_handler *ex); + + unsigned forced_bank_swizzle() { + return ((bc.op_ptr->flags & AF_INTERP) && (bc.slot_flags == AF_4V)) ? + VEC_210 : 0; + } + + // return param index + 1 if instruction references interpolation param, + // otherwise 0 + unsigned interp_param(); + + alu_group_node *get_alu_group_node(); + + friend class shader; +}; + +// for multi-slot instrs - DOT/INTERP/... 
(maybe useful for 64bit pairs later) +class alu_packed_node : public container_node { +protected: + alu_packed_node() : container_node(NT_OP, NST_ALU_PACKED_INST) {} +public: + + const alu_op_info* op_ptr() { + return static_cast<alu_node*>(first)->bc.op_ptr; + } + unsigned op() { return static_cast<alu_node*>(first)->bc.op; } + void init_args(); + + virtual bool is_valid() { return subtype == NST_ALU_PACKED_INST; } + virtual bool accept(vpass &p, bool enter); + virtual bool fold_dispatch(expr_handler *ex); + + unsigned get_slot_mask(); + void update_packed_items(sb_context &ctx); + + friend class shader; +}; + +class fetch_node : public node { +protected: + fetch_node() : node(NT_OP, NST_FETCH_INST), bc() {}; +public: + bc_fetch bc; + + virtual bool is_valid() { return subtype == NST_FETCH_INST; } + virtual bool accept(vpass &p, bool enter); + virtual bool fold_dispatch(expr_handler *ex); + + bool uses_grad() { return bc.op_ptr->flags & FF_USEGRAD; } + + friend class shader; +}; + +class region_node; + +class repeat_node : public container_node { +protected: + repeat_node(region_node *target, unsigned id) + : container_node(NT_REPEAT, NST_LIST), target(target), rep_id(id) {} +public: + region_node *target; + unsigned rep_id; + + virtual bool accept(vpass &p, bool enter); + + friend class shader; +}; + +class depart_node : public container_node { +protected: + depart_node(region_node *target, unsigned id) + : container_node(NT_DEPART, NST_LIST), target(target), dep_id(id) {} +public: + region_node *target; + unsigned dep_id; + + virtual bool accept(vpass &p, bool enter); + + friend class shader; +}; + +class if_node : public container_node { +protected: + if_node() : container_node(NT_IF, NST_LIST), cond() {}; +public: + value *cond; // glued to pseudo output (dst[2]) of the PRED_SETxxx + + virtual bool accept(vpass &p, bool enter); + + friend class shader; +}; + +typedef std::vector<depart_node*> depart_vec; +typedef std::vector<repeat_node*> repeat_vec; + +class 
region_node : public container_node { +protected: + region_node(unsigned id) : container_node(NT_REGION, NST_LIST), region_id(id), + loop_phi(), phi(), vars_defined(), departs(), repeats() {} +public: + unsigned region_id; + + container_node *loop_phi; + container_node *phi; + + val_set vars_defined; + + depart_vec departs; + repeat_vec repeats; + + virtual bool accept(vpass &p, bool enter); + + unsigned dep_count() { return departs.size(); } + unsigned rep_count() { return repeats.size() + 1; } + + bool is_loop() { return !repeats.empty(); } + + container_node* get_entry_code_location() { + node *p = first; + while (p && (p->is_depart() || p->is_repeat())) + p = static_cast<container_node*>(p)->first; + + container_node *c = static_cast<container_node*>(p); + if (c->is_bb()) + return c; + else + return c->parent; + } + + void expand_depart(depart_node *d); + void expand_repeat(repeat_node *r); + + friend class shader; +}; + +class bb_node : public container_node { +protected: + bb_node(unsigned id, unsigned loop_level) + : container_node(NT_LIST, NST_BB), id(id), loop_level(loop_level) {} +public: + unsigned id; + unsigned loop_level; + + virtual bool accept(vpass &p, bool enter); + + friend class shader; +}; + + +typedef std::vector<region_node*> regions_vec; +typedef std::vector<bb_node*> bbs_vec; +typedef std::list<node*> sched_queue; +typedef sched_queue::iterator sq_iterator; +typedef std::vector<node*> node_vec; +typedef std::list<node*> node_list; +typedef std::set<node*> node_set; + + + +} // namespace r600_sb + +#endif /* R600_SB_IR_H_ */ diff --git a/src/gallium/drivers/r600/sb/sb_liveness.cpp b/src/gallium/drivers/r600/sb/sb_liveness.cpp new file mode 100644 index 00000000000..50988d4f0bb --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_liveness.cpp @@ -0,0 +1,407 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated 
documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Vadim Girlin + */ + +#include "sb_shader.h" + +#include "sb_pass.h" + +#define LIV_DEBUG 0 + +#if LIV_DEBUG +#define LIV_DUMP(a) do { a } while (0) +#else +#define LIV_DUMP(a) +#endif + +namespace r600_sb { + +using std::cerr; + +bool liveness::visit(container_node& n, bool enter) { + if (enter) { + n.live_after = live; + process_ins(n); + } else { + process_outs(n); + n.live_before = live; + } + return true; +} + +bool liveness::visit(bb_node& n, bool enter) { + if (enter) { + n.live_after = live; + } else { + n.live_before = live; + } + return true; +} + +bool liveness::visit(alu_group_node& n, bool enter) { + if (enter) { + } else { + } + return true; +} + +bool liveness::visit(cf_node& n, bool enter) { + if (enter) { + if (n.bc.op == CF_OP_CF_END) { + n.flags |= NF_DEAD; + return false; + } + n.live_after = live; + update_interferences(); + process_op(n); + } else { + n.live_before = live; + } + return true; +} + +bool liveness::visit(alu_node& n, bool enter) { + if (enter) { + update_interferences(); + process_op(n); + } else { + } + return false; +} + +bool liveness::visit(alu_packed_node& n, bool enter) { + if (enter) { + update_interferences(); + process_op(n); + + } else { + } + return false; +} + +bool liveness::visit(fetch_node& n, bool enter) { + if (enter) { + update_interferences(); + process_op(n); + } else { + } + return true; +} + +bool liveness::visit(region_node& n, bool enter) { + if (enter) { + val_set s = live; + + update_interferences(); + + if (n.phi) + process_phi_outs(n.phi); + + n.live_after = live; + + live.clear(); + + if (n.loop_phi) { + n.live_before.clear(); + } + + assert(n.count() == 1); + run_on(*static_cast<container_node*>(*n.begin())); + + // second pass for loops + if (n.loop_phi) { + process_phi_outs(n.loop_phi); + n.live_before = live; + + run_on(*static_cast<container_node*>(*n.begin())); + + update_interferences(); // FIXME is it required + + process_phi_outs(n.loop_phi); + 
process_phi_branch(n.loop_phi, 0); + } + + update_interferences(); // FIXME is it required + + n.live_after = s; + n.live_before = live; + } + return false; +} + +bool liveness::visit(repeat_node& n, bool enter) { + if (enter) { + live = n.target->live_before; + process_phi_branch(n.target->loop_phi, n.rep_id); + } else { + } + return true; +} + +bool liveness::visit(depart_node& n, bool enter) { + if (enter) { + live = n.target->live_after; + if(n.target->phi) + process_phi_branch(n.target->phi, n.dep_id); + } else { + } + return true; +} + +bool liveness::visit(if_node& n, bool enter) { + if (enter) { + assert(n.count() == 1); + n.live_after = live; + + run_on(*static_cast<container_node*>(*n.begin())); + + process_op(n); + live.add_set(n.live_after); + } + return false; +} + +void liveness::update_interferences() { + if (!sh.compute_interferences) + return; + + if (!live_changed) + return; + + LIV_DUMP( + cerr << "interf "; + dump::dump_set(sh, live); + cerr << "\n"; + ); + + val_set& s = live; + for(val_set::iterator I = s.begin(sh), E = s.end(sh); I != E; ++I) { + value *v = *I; + assert(v); + + if (v->array) { + v->array->interferences.add_set(s); + } + + v->interferences.add_set(s); + v->interferences.remove_val(v); + + LIV_DUMP( + cerr << "interferences updated for "; + dump::dump_val(v); + cerr << " : "; + dump::dump_set(sh, v->interferences); + cerr << "\n"; + ); + } + live_changed = false; +} + +bool liveness::remove_val(value *v) { + if (live.remove_val(v)) { + v->flags &= ~VLF_DEAD; + return true; + } + v->flags |= VLF_DEAD; + return false; +} + +bool liveness::process_maydef(value *v) { + bool r = false; + vvec::iterator S(v->muse.begin()); + + for (vvec::iterator I = v->mdef.begin(), E = v->mdef.end(); I != E; + ++I, ++S) { + value *&d = *I, *&u = *S; + if (!d) { + assert(!u); + continue; + } + + bool alive = remove_val(d); + if (alive) { + r = true; + } else { + d = NULL; + u = NULL; + } + } + return r; +} + +bool liveness::remove_vec(vvec &vv) { + 
bool r = false; + for (vvec::reverse_iterator I = vv.rbegin(), E = vv.rend(); I != E; ++I) { + value* &v = *I; + if (!v) + continue; + + if (v->is_rel()) { + r |= process_maydef(v); + } else + r |= remove_val(v); + } + return r; +} + +bool r600_sb::liveness::visit(node& n, bool enter) { + if (enter) { + update_interferences(); + process_op(n); + } + return false; +} + +bool liveness::process_outs(node& n) { + bool alive = remove_vec(n.dst); + if (alive) + live_changed = true; + return alive; +} + +bool liveness::add_vec(vvec &vv, bool src) { + bool r = false; + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value *v = *I; + if (!v || v->is_readonly()) + continue; + + if (v->is_rel()) { + r |= add_vec(v->muse, true); + if (v->rel->is_any_reg()) + r |= live.add_val(v->rel); + + } else if (src) { + r |= live.add_val(v); + } + } + + return r; +} + +void liveness::process_ins(node& n) { + if (!(n.flags & NF_DEAD)) { + + live_changed |= add_vec(n.src, true); + live_changed |= add_vec(n.dst, false); + + if (n.type == NT_IF) { + if_node &in = (if_node&)n; + if (in.cond) + live_changed |= live.add_val(in.cond); + } + if (n.pred) + live_changed |= live.add_val(n.pred); + } +} + +void liveness::process_op(node& n) { + + LIV_DUMP( + cerr << "process_op: "; + dump::dump_op(&n); + cerr << "\n"; + cerr << "process_op: live_after:"; + dump::dump_set(sh, live); + cerr << "\n"; + ); + + if(!n.dst.empty() || n.is_cf_op(CF_OP_CALL_FS)) { + if (!process_outs(n)) { + if (!(n.flags & NF_DONT_KILL)) + n.flags |= NF_DEAD; + } else { + n.flags &= ~NF_DEAD; + } + } + process_ins(n); + + LIV_DUMP( + cerr << "process_op: live_before:"; + dump::dump_set(sh, live); + cerr << "\n"; + ); +} + +int liveness::init() { + + if (sh.compute_interferences) { + gpr_array_vec &vv = sh.arrays(); + for (gpr_array_vec::iterator I = vv.begin(), E = vv.end(); I != E; + ++I) { + gpr_array *a = *I; + a->interferences.clear(); + } + } + + return 0; +} + +void liveness::update_src_vec(vvec &vv, 
bool src) { + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value *v = *I; + + if (!v || !v->is_sgpr()) + continue; + + if (v->rel && v->rel->is_dead()) + v->rel->flags &= ~VLF_DEAD; + + if (src && v->is_dead()) { + v->flags &= ~VLF_DEAD; + } + } +} + +void liveness::process_phi_outs(container_node *phi) { + for (node_iterator I = phi->begin(), E = phi->end(); I != E; ++I) { + node *n = *I; + if (!process_outs(*n)) { + n->flags |= NF_DEAD; + } else { + n->flags &= ~NF_DEAD; + update_src_vec(n->src, true); + update_src_vec(n->dst, false); + } + } +} + +void liveness::process_phi_branch(container_node* phi, unsigned id) { + val_set &s = live; + for (node_iterator I = phi->begin(), E = phi->end(); I != E; ++I) { + node *n = *I; + if (n->is_dead()) + continue; + + value *v = n->src[id]; + + if (!v->is_readonly()) { + live_changed |= s.add_val(v); + v->flags &= ~VLF_DEAD; + } + } +} + +} //namespace r600_sb diff --git a/src/gallium/drivers/r600/sb/sb_pass.cpp b/src/gallium/drivers/r600/sb/sb_pass.cpp new file mode 100644 index 00000000000..aecdec8c2c0 --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_pass.cpp @@ -0,0 +1,105 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Vadim Girlin + */ + +#include "sb_shader.h" +#include "sb_pass.h" + +namespace r600_sb { + +pass::pass(shader &s) : ctx(s.get_ctx()), sh(s) {} + +int pass::run() { return -1; } + +int vpass::init() { return 0; } +int vpass::done() { return 0; } + +int vpass::run() { + int r; + if ((r = init())) + return r; + + run_on(*sh.root); + + if ((r = done())) + return r; + + return 0; +} + +void vpass::run_on(container_node &n) { + if (n.accept(*this, true)) { + + for (node_iterator N, I = n.begin(), E = n.end(); I != E; I = N) { + N = I; + ++N; + + if (I->is_container()) { + container_node *c = static_cast<container_node*>(*I); + run_on(*c); + } else { + I->accept(*this, true); + I->accept(*this, false); + } + } + + } + n.accept(*this, false); +} + +bool vpass::visit(node& n, bool enter) { return true; } +bool vpass::visit(container_node& n, bool enter) { return true; } +bool vpass::visit(alu_group_node& n, bool enter) { return true; } +bool vpass::visit(cf_node& n, bool enter) { return true; } +bool vpass::visit(alu_node& n, bool enter) { return true; } +bool vpass::visit(alu_packed_node& n, bool enter) { return true; } +bool vpass::visit(fetch_node& n, bool enter) { return true; } +bool vpass::visit(region_node& n, bool enter) { return true; } +bool vpass::visit(repeat_node& n, bool enter) { return true; } +bool vpass::visit(depart_node& n, bool enter) { return true; } +bool vpass::visit(if_node& n, bool enter) { return true; } +bool vpass::visit(bb_node& n, bool 
enter) { return true; } + +void rev_vpass::run_on(container_node& n) { + if (n.accept(*this, true)) { + + for (node_riterator N, I = n.rbegin(), E = n.rend(); I != E; I = N) { + N = I; + ++N; + + if (I->is_container()) { + container_node *c = static_cast<container_node*>(*I); + run_on(*c); + } else { + I->accept(*this, true); + I->accept(*this, false); + } + } + + } + n.accept(*this, false); +} + +} // namespace r600_sb diff --git a/src/gallium/drivers/r600/sb/sb_pass.h b/src/gallium/drivers/r600/sb/sb_pass.h new file mode 100644 index 00000000000..ac0a51777e9 --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_pass.h @@ -0,0 +1,681 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Vadim Girlin + */ + +#ifndef SB_PASS_H_ +#define SB_PASS_H_ + +#include <stack> + +namespace r600_sb { + +class pass { +protected: + sb_context &ctx; + shader &sh; + +public: + pass(shader &s); + + virtual int run(); + + virtual ~pass() {} +}; + +class vpass : public pass { + +public: + + vpass(shader &s) : pass(s) {} + + virtual int init(); + virtual int done(); + + virtual int run(); + virtual void run_on(container_node &n); + + virtual bool visit(node &n, bool enter); + virtual bool visit(container_node &n, bool enter); + virtual bool visit(alu_group_node &n, bool enter); + virtual bool visit(cf_node &n, bool enter); + virtual bool visit(alu_node &n, bool enter); + virtual bool visit(alu_packed_node &n, bool enter); + virtual bool visit(fetch_node &n, bool enter); + virtual bool visit(region_node &n, bool enter); + virtual bool visit(repeat_node &n, bool enter); + virtual bool visit(depart_node &n, bool enter); + virtual bool visit(if_node &n, bool enter); + virtual bool visit(bb_node &n, bool enter); + +}; + +class rev_vpass : public vpass { + +public: + rev_vpass(shader &s) : vpass(s) {} + + virtual void run_on(container_node &n); +}; + + +// =================== PASSES + +class bytecode; + +class bc_dump : public vpass { + using vpass::visit; + + std::ostream &o; + + uint32_t *bc_data; + unsigned ndw; + + unsigned id; + + unsigned new_group, group_index; + +public: + + bc_dump(shader &s, std::ostream &o, bytecode *bc = NULL); + + bc_dump(shader &s, std::ostream &o, uint32_t *bc_ptr, unsigned ndw) : + vpass(s), o(o), bc_data(bc_ptr), ndw(ndw), id(), new_group(), group_index() {} + + virtual int init(); + virtual int done(); + + virtual bool visit(cf_node &n, bool enter); + virtual bool visit(alu_node &n, bool enter); + virtual bool visit(fetch_node &n, bool enter); + + void dump_dw(unsigned dw_id, unsigned count = 2); + + void dump(cf_node& n); + void dump(alu_node& n); + void dump(fetch_node& n); +}; + + +class dce_cleanup : public vpass { 
+ using vpass::visit; + +public: + + dce_cleanup(shader &s) : vpass(s) {} + + virtual bool visit(node &n, bool enter); + virtual bool visit(alu_group_node &n, bool enter); + virtual bool visit(cf_node &n, bool enter); + virtual bool visit(alu_node &n, bool enter); + virtual bool visit(alu_packed_node &n, bool enter); + virtual bool visit(fetch_node &n, bool enter); + virtual bool visit(region_node &n, bool enter); + virtual bool visit(container_node &n, bool enter); + +private: + + void cleanup_dst(node &n); + void cleanup_dst_vec(vvec &vv); + +}; + + +class def_use : public pass { + +public: + + def_use(shader &sh) : pass(sh) {} + + virtual int run(); + void run_on(node *n, bool defs); + +private: + + void process_uses(node *n); + void process_defs(node *n, vvec &vv, bool arr_def); + void process_phi(container_node *c, bool defs, bool uses); +}; + + + +class dump : public vpass { + using vpass::visit; + + int level; + +public: + + dump(shader &s) : vpass(s), level(0) {} + + virtual bool visit(node &n, bool enter); + virtual bool visit(container_node &n, bool enter); + virtual bool visit(alu_group_node &n, bool enter); + virtual bool visit(cf_node &n, bool enter); + virtual bool visit(alu_node &n, bool enter); + virtual bool visit(alu_packed_node &n, bool enter); + virtual bool visit(fetch_node &n, bool enter); + virtual bool visit(region_node &n, bool enter); + virtual bool visit(repeat_node &n, bool enter); + virtual bool visit(depart_node &n, bool enter); + virtual bool visit(if_node &n, bool enter); + virtual bool visit(bb_node &n, bool enter); + + + static void dump_op(node &n, const char *name); + static void dump_vec(const vvec & vv); + static void dump_set(shader &sh, val_set & v); + + static void dump_rels(vvec & vv); + + static void dump_val(value *v); + static void dump_op(node *n); + + static void dump_op_list(container_node *c); + static void dump_queue(sched_queue &q); + + static void dump_alu(alu_node *n); + +private: + + void indent(); + + void 
dump_common(node &n); + void dump_flags(node &n); + + void dump_live_values(container_node &n, bool before); +}; + + +// Global Code Motion + +class gcm : public pass { + + sched_queue bu_ready[SQ_NUM]; + sched_queue bu_ready_next[SQ_NUM]; + sched_queue bu_ready_early[SQ_NUM]; + sched_queue ready; + sched_queue ready_above; + + container_node pending; + + struct op_info { + bb_node* top_bb; + bb_node* bottom_bb; + op_info() : top_bb(), bottom_bb() {} + }; + + typedef std::map<node*, op_info> op_info_map; + + typedef std::map<node*, unsigned> nuc_map; + + op_info_map op_map; + nuc_map uses; + + typedef std::vector<nuc_map> nuc_stack; + + nuc_stack nuc_stk; + unsigned ucs_level; + + bb_node * bu_bb; + + vvec pending_defs; + + node_list pending_nodes; + + unsigned cur_sq; + +public: + + gcm(shader &sh) : pass(sh), + bu_ready(), bu_ready_next(), bu_ready_early(), + ready(), op_map(), uses(), nuc_stk(1), ucs_level(), + bu_bb(), pending_defs(), pending_nodes() {} + + virtual int run(); + +private: + + void collect_instructions(container_node *c, bool early_pass); + + void sched_early(container_node *n); + void td_sched_bb(bb_node *bb); + bool td_is_ready(node *n); + void td_release_uses(vvec &v); + void td_release_val(value *v); + void td_schedule(bb_node *bb, node *n); + + void sched_late(container_node *n); + void bu_sched_bb(bb_node *bb); + void bu_release_defs(vvec &v, bool src); + void bu_release_phi_defs(container_node *p, unsigned op); + bool bu_is_ready(node *n); + void bu_release_val(value *v); + void bu_release_op(node * n); + void bu_find_best_bb(node *n, op_info &oi); + void bu_schedule(container_node *bb, node *n); + + void push_uc_stack(); + void pop_uc_stack(); + + void init_def_count(nuc_map &m, container_node &s); + void init_use_count(nuc_map &m, container_node &s); + unsigned get_uc_vec(vvec &vv); + unsigned get_dc_vec(vvec &vv, bool src); + + void add_ready(node *n); + + void dump_uc_stack(); + + unsigned real_alu_count(sched_queue &q, unsigned max); 
+ + // check if we have not less than threshold ready alu instructions + bool check_alu_ready_count(unsigned threshold); +}; + + +class gvn : public vpass { + using vpass::visit; + +public: + + gvn(shader &sh) : vpass(sh) {} + + virtual bool visit(node &n, bool enter); + virtual bool visit(cf_node &n, bool enter); + virtual bool visit(alu_node &n, bool enter); + virtual bool visit(alu_packed_node &n, bool enter); + virtual bool visit(fetch_node &n, bool enter); + virtual bool visit(region_node &n, bool enter); + +private: + + void process_op(node &n, bool rewrite = true); + + // returns true if the value was rewritten + bool process_src(value* &v, bool rewrite); + + + void process_alu_src_constants(node &n, value* &v); +}; + + +class if_conversion : public pass { + +public: + + if_conversion(shader &sh) : pass(sh) {} + + virtual int run(); + + bool run_on(region_node *r); + + alu_node* convert_phi(value *select, node *phi); + + unsigned try_convert_kills(region_node* r); + +}; + + +class liveness : public rev_vpass { + using vpass::visit; + + val_set live; + bool live_changed; + +public: + + liveness(shader &s) : rev_vpass(s), live_changed(false) {} + + virtual int init(); + + virtual bool visit(node &n, bool enter); + virtual bool visit(bb_node &n, bool enter); + virtual bool visit(container_node &n, bool enter); + virtual bool visit(alu_group_node &n, bool enter); + virtual bool visit(cf_node &n, bool enter); + virtual bool visit(alu_node &n, bool enter); + virtual bool visit(alu_packed_node &n, bool enter); + virtual bool visit(fetch_node &n, bool enter); + virtual bool visit(region_node &n, bool enter); + virtual bool visit(repeat_node &n, bool enter); + virtual bool visit(depart_node &n, bool enter); + virtual bool visit(if_node &n, bool enter); + +private: + + void update_interferences(); + void process_op(node &n); + + bool remove_val(value *v); + bool remove_vec(vvec &v); + bool process_outs(node& n); + void process_ins(node& n); + + void 
process_phi_outs(container_node *phi); + void process_phi_branch(container_node *phi, unsigned id); + + bool process_maydef(value *v); + + bool add_vec(vvec &vv, bool src); + + void update_src_vec(vvec &vv, bool src); +}; + + +struct bool_op_info { + bool invert; + unsigned int_cvt; + + alu_node *n; +}; + +class peephole : public pass { + +public: + + peephole(shader &sh) : pass(sh) {} + + virtual int run(); + + void run_on(container_node *c); + + void optimize_cc_op(alu_node *a); + + void optimize_SETcc_op(alu_node *a); + void optimize_CNDcc_op(alu_node *a); + + bool get_bool_op_info(value *b, bool_op_info& bop); + bool get_bool_flt_to_int_source(alu_node* &a); + void convert_float_setcc(alu_node *f2i, alu_node *s); +}; + + +class psi_ops : public rev_vpass { + using rev_vpass::visit; + +public: + + psi_ops(shader &s) : rev_vpass(s) {} + + virtual bool visit(node &n, bool enter); + virtual bool visit(alu_node &n, bool enter); + + bool try_inline(node &n); + bool try_reduce(node &n); + bool eliminate(node &n); + + void unpredicate(node *n); +}; + + +// check correctness of the generated code, e.g.: +// - expected source operand value is the last value written to its gpr, +// - all arguments of phi node should be allocated to the same gpr, +// TODO other tests +class ra_checker : public pass { + + typedef std::map<sel_chan, value *> reg_value_map; + + typedef std::vector<reg_value_map> regmap_stack; + + regmap_stack rm_stack; + unsigned rm_stk_level; + + value* prev_dst[5]; + +public: + + ra_checker(shader &sh) : pass(sh) {} + + virtual int run(); + + void run_on(container_node *c); + + void dump_error(const error_info &e); + void dump_all_errors(); + +private: + + reg_value_map& rmap() { return rm_stack[rm_stk_level]; } + + void push_stack(); + void pop_stack(); + + // when going out of the alu clause, values in the clause temporary gprs, + // AR, predicate values, PS/PV are destroyed + void kill_alu_only_regs(); + void error(node *n, unsigned id, std::string msg); 
+ + void check_phi_src(container_node *p, unsigned id); + void process_phi_dst(container_node *p); + void check_alu_group(alu_group_node *g); + void process_op_dst(node *n); + void check_op_src(node *n); + void check_src_vec(node *n, unsigned id, vvec &vv, bool src); + void check_value_gpr(node *n, unsigned id, value *v); +}; + +// ======================================= + + +class ra_coalesce : public pass { + +public: + + ra_coalesce(shader &sh) : pass(sh) {} + + virtual int run(); +}; + + + +// ======================================= + +class ra_init : public pass { + +public: + + ra_init(shader &sh) : pass(sh) {} + + virtual int run(); + +private: + + void ra_node(container_node *c); + void process_op(node *n); + + void color(value *v); + + void color_bs_constraint(ra_constraint *c); + + void assign_color(value *v, sel_chan c); + void alloc_arrays(); +}; + +// ======================================= + +class ra_split : public pass { + +public: + + ra_split(shader &sh) : pass(sh) {} + + virtual int run(); + + void split(container_node *n); + void split_op(node *n); + void split_alu_packed(alu_packed_node *n); + void split_vector_inst(node *n); + + void split_packed_ins(alu_packed_node *n); + +#if 0 + void split_pinned_outs(node *n); +#endif + + void split_vec(vvec &vv, vvec &v1, vvec &v2, bool allow_swz); + + void split_phi_src(container_node *loc, container_node *c, unsigned id, + bool loop); + void split_phi_dst(node *loc, container_node *c, bool loop); + void init_phi_constraints(container_node *c); +}; + + + +class ssa_prepare : public vpass { + using vpass::visit; + + typedef std::vector<val_set> vd_stk; + vd_stk stk; + + unsigned level; + +public: + ssa_prepare(shader &s) : vpass(s), level(0) {} + + virtual bool visit(cf_node &n, bool enter); + virtual bool visit(alu_node &n, bool enter); + virtual bool visit(fetch_node &n, bool enter); + virtual bool visit(region_node &n, bool enter); + virtual bool visit(repeat_node &n, bool enter); + virtual bool 
visit(depart_node &n, bool enter); + +private: + + void push_stk() { + ++level; + if (level + 1 > stk.size()) + stk.resize(level+1); + else + stk[level].clear(); + } + void pop_stk() { + assert(level); + --level; + stk[level].add_set(stk[level + 1]); + } + + void add_defs(node &n); + + val_set & cur_set() { return stk[level]; } + + container_node* create_phi_nodes(int count); +}; + +class ssa_rename : public vpass { + using vpass::visit; + + typedef sb_map<value*, unsigned> def_map; + + def_map def_count; + std::stack<def_map> rename_stack; + + typedef std::map<uint32_t, value*> val_map; + val_map values; + +public: + + ssa_rename(shader &s) : vpass(s) {} + + virtual int init(); + + virtual bool visit(container_node &n, bool enter); + virtual bool visit(node &n, bool enter); + virtual bool visit(alu_group_node &n, bool enter); + virtual bool visit(cf_node &n, bool enter); + virtual bool visit(alu_node &n, bool enter); + virtual bool visit(alu_packed_node &n, bool enter); + virtual bool visit(fetch_node &n, bool enter); + virtual bool visit(region_node &n, bool enter); + virtual bool visit(repeat_node &n, bool enter); + virtual bool visit(depart_node &n, bool enter); + virtual bool visit(if_node &n, bool enter); + +private: + + void push(node *phi); + void pop(); + + unsigned get_index(def_map& m, value* v); + void set_index(def_map& m, value* v, unsigned index); + unsigned new_index(def_map& m, value* v); + + value* rename_use(node *n, value* v); + value* rename_def(node *def, value* v); + + void rename_src_vec(node *n, vvec &vv, bool src); + void rename_dst_vec(node *def, vvec &vv, bool set_def); + + void rename_src(node *n); + void rename_dst(node *n); + + void rename_phi_args(container_node *phi, unsigned op, bool def); + + void rename_virt(node *n); + void rename_virt_val(node *n, value *v); +}; + +class bc_finalizer : public pass { + + cf_node *last_export[EXP_TYPE_COUNT]; + cf_node *last_cf; + + unsigned ngpr; + unsigned nstack; + +public: + + 
bc_finalizer(shader &sh) : pass(sh), last_export(), last_cf(), ngpr(), + nstack() {} + + virtual int run(); + + void finalize_loop(region_node *r); + void finalize_if(region_node *r); + + void run_on(container_node *c); + + void finalize_alu_group(alu_group_node *g); + void finalize_alu_src(alu_group_node *g, alu_node *a); + + void emit_set_grad(fetch_node* f); + void finalize_fetch(fetch_node *f); + + void finalize_cf(cf_node *c); + + sel_chan translate_kcache(cf_node *alu, value *v); + + void update_ngpr(unsigned gpr); + void update_nstack(region_node *r, unsigned add = 0); + + void cf_peephole(); + +}; + + +} // namespace r600_sb + +#endif /* SB_PASS_H_ */ diff --git a/src/gallium/drivers/r600/sb/sb_peephole.cpp b/src/gallium/drivers/r600/sb/sb_peephole.cpp new file mode 100644 index 00000000000..444765e8779 --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_peephole.cpp @@ -0,0 +1,232 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Vadim Girlin + */ + +#define PPH_DEBUG 0 + +#if PPH_DEBUG +#define PPH_DUMP(q) do { q } while (0) +#else +#define PPH_DUMP(q) +#endif + +#include "sb_shader.h" + +#include "sb_pass.h" + +namespace r600_sb { + +using std::cerr; + +int peephole::run() { + + run_on(sh.root); + + return 0; +} + +void peephole::run_on(container_node* c) { + + for (node_riterator I = c->rbegin(), E = c->rend(); I != E; ++I) { + node *n = *I; + + if (n->is_container()) + run_on(static_cast<container_node*>(n)); + else { + + if (n->is_alu_inst()) { + alu_node *a = static_cast<alu_node*>(n); + + if (a->bc.op_ptr->flags & AF_CC_MASK) { + optimize_cc_op(a); + } else if (a->bc.op == ALU_OP1_FLT_TO_INT) { + + alu_node *s = a; + if (get_bool_flt_to_int_source(s)) { + convert_float_setcc(a, s); + } + } + } + } + } +} + +void peephole::optimize_cc_op(alu_node* a) { + unsigned aflags = a->bc.op_ptr->flags; + + if (aflags & (AF_PRED | AF_SET)) { + optimize_SETcc_op(a); + } else if (aflags & AF_CMOV) { + optimize_CNDcc_op(a); + } +} + +void peephole::convert_float_setcc(alu_node *f2i, alu_node *s) { + alu_node *ns = sh.clone(s); + + ns->dst[0] = f2i->dst[0]; + ns->dst[0]->def = ns; + ns->bc.set_op(ns->bc.op + (ALU_OP2_SETE_DX10 - ALU_OP2_SETE)); + f2i->insert_after(ns); + f2i->remove(); +} + +void peephole::optimize_SETcc_op(alu_node* a) { + + unsigned flags = a->bc.op_ptr->flags; + unsigned cc = flags & AF_CC_MASK; + unsigned cmp_type = flags & AF_CMP_TYPE_MASK; + unsigned dst_type = flags & AF_DST_TYPE_MASK; + bool is_pred = flags & AF_PRED; + + // TODO handle other cases + + if (a->src[1]->is_const() && (cc == AF_CC_E || cc == AF_CC_NE) && + a->src[1]->literal_value == literal(0) && + a->bc.src[0].neg 
== 0 && a->bc.src[0].abs == 0) { + + value *s = a->src[0]; + + bool_op_info bop = {}; + + PPH_DUMP( + cerr << "optSETcc "; + dump::dump_op(a); + cerr << "\n"; + ); + + if (!get_bool_op_info(s, bop)) + return; + + if (cc == AF_CC_E) + bop.invert = !bop.invert; + + bool swap_args = false; + + cc = bop.n->bc.op_ptr->flags & AF_CC_MASK; + + if (bop.invert) + cc = invert_setcc_condition(cc, swap_args); + + if (bop.int_cvt) { + assert(cmp_type != AF_FLOAT_CMP); + cmp_type = AF_FLOAT_CMP; + } + + PPH_DUMP( + cerr << "boi node: "; + dump::dump_op(bop.n); + cerr << " invert: " << bop.invert << " int_cvt: " << bop.int_cvt; + cerr <<"\n"; + ); + + unsigned newop = is_pred ? get_predsetcc_opcode(cc, cmp_type) : + get_setcc_opcode(cc, cmp_type, dst_type != AF_FLOAT_DST); + + a->bc.set_op(newop); + + if (swap_args) { + a->src[0] = bop.n->src[1]; + a->src[1] = bop.n->src[0]; + a->bc.src[0] = bop.n->bc.src[1]; + a->bc.src[1] = bop.n->bc.src[0]; + + } else { + a->src[0] = bop.n->src[0]; + a->src[1] = bop.n->src[1]; + a->bc.src[0] = bop.n->bc.src[0]; + a->bc.src[1] = bop.n->bc.src[1]; + } + } +} + +void peephole::optimize_CNDcc_op(alu_node* a) { + + //TODO +} + +bool peephole::get_bool_flt_to_int_source(alu_node* &a) { + + if (a->bc.op == ALU_OP1_FLT_TO_INT) { + + if (a->bc.src[0].neg || a->bc.src[0].abs || a->bc.src[0].rel) + return false; + + value *s = a->src[0]; + if (!s || !s->def || !s->def->is_alu_inst()) + return false; + + alu_node *dn = static_cast<alu_node*>(s->def); + + if (dn->is_alu_op(ALU_OP1_TRUNC)) { + s = dn->src[0]; + if (!s || !s->def || !s->def->is_alu_inst()) + return false; + + if (dn->bc.src[0].neg != 1 || dn->bc.src[0].abs != 0 || + dn->bc.src[0].rel != 0) { + return false; + } + + dn = static_cast<alu_node*>(s->def); + + } + + if (dn->bc.op_ptr->flags & AF_SET) { + a = dn; + return true; + } + } + return false; +} + +bool peephole::get_bool_op_info(value* b, bool_op_info& bop) { + + node *d = b->def; + + if (!d || !d->is_alu_inst()) + return false; + + 
alu_node *dn = static_cast<alu_node*>(d); + + if (dn->bc.op_ptr->flags & AF_SET) { + bop.n = dn; + + if (dn->bc.op_ptr->flags & AF_DX10) + bop.int_cvt = true; + + return true; + } + + if (get_bool_flt_to_int_source(dn)) { + bop.n = dn; + bop.int_cvt = true; + return true; + } + + return false; +} + +} // namespace r600_sb diff --git a/src/gallium/drivers/r600/sb/sb_psi_ops.cpp b/src/gallium/drivers/r600/sb/sb_psi_ops.cpp new file mode 100644 index 00000000000..7d0a31e511a --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_psi_ops.cpp @@ -0,0 +1,189 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Vadim Girlin + */ + +#include "sb_shader.h" + +#include "sb_pass.h" + +namespace r600_sb { + +bool r600_sb::psi_ops::visit(alu_node& n, bool enter) { + if (enter) { + } + return false; +} + +bool psi_ops::visit(node& n, bool enter) { + if (enter) { + assert(n.subtype == NST_PSI); + + try_inline(n); + + // TODO eliminate predication until there is full support in all passes + // unpredicate instructions and replace psi-nodes with conditional moves + eliminate(n); + } + return false; +} + +value* get_pred_val(node &n) { + value *pred_val = NULL; + + for (vvec::iterator I = n.src.begin(), E = n.src.end(); I != E; I += 3) { + value* &pred = *I; + if (pred) { + if (!pred_val) + pred_val = pred; + else { + assert(pred == pred_val); + } + } + } + return pred_val; +} + +// for now we'll never inline psi's with different predicate values, +// so psi node may only contain the refs to one predicate value. +bool psi_ops::try_inline(node& n) { + assert(n.subtype == NST_PSI); + + vvec &ns = n.src; + + int sz = ns.size(); + assert(sz && (sz % 3 == 0)); + + value *pred_val = get_pred_val(n); + + int ps_mask = 0; + + bool r = false; + + for (int i = sz - 1; i >= 0; i -= 3) { + + if (ps_mask == 3) { + ns.erase(ns.begin(), ns.begin() + i + 1); + return r; + } + + value* val = ns[i]; + value* predsel = ns[i-1]; + int ps = !predsel ? 3 : predsel == sh.get_pred_sel(0) ? 
1 : 2; + + assert(val->def); + + if (val->def->subtype == NST_PSI && ps == 3) { + if (get_pred_val(*val->def) != pred_val) + continue; + + vvec &ds = val->def->src; + + ns.insert(ns.begin() + i + 1, ds.begin(), ds.end()); + ns.erase(ns.begin() + i - 2, ns.begin() + i + 1); + i += ds.size(); + r = true; + + } else { + if ((ps_mask & ps) == ps) { + // this predicate select is subsumed by already handled ops + ns.erase(ns.begin() + i - 2, ns.begin() + i + 1); + } else { + ps_mask |= ps; + } + } + } + return r; +} + +bool psi_ops::try_reduce(node& n) { + assert(n.subtype == NST_PSI); + assert(n.src.size() % 3 == 0); + + // TODO + + return false; +} + +void psi_ops::unpredicate(node *n) { + + if (!n->is_alu_inst()) + return; + + alu_node *a = static_cast<alu_node*>(n); + a->pred = NULL; +} + +bool psi_ops::eliminate(node& n) { + assert(n.subtype == NST_PSI); + assert(n.src.size() == 6); + + value *d = n.dst[0]; + + value *s1 = n.src[2]; + value *s2 = n.src[5]; + + value *pred = n.src[3]; + + bool psel = n.src[4] == sh.get_pred_sel(0); + + value *sel = get_select_value_for_em(sh, pred); + + if (s1->is_undef()) { + if (s2->is_undef()) { + + } else { + n.insert_after(sh.create_mov(d, s2)); + } + } else if (s2->is_undef()) { + n.insert_after(sh.create_mov(d, s1)); + } else { + alu_node *a = sh.create_alu(); + a->bc.set_op(ALU_OP3_CNDE_INT); + + a->dst.push_back(d); + a->src.push_back(sel); + + if (psel) { + a->src.push_back(s1); + a->src.push_back(s2); + } else { + a->src.push_back(s2); + a->src.push_back(s1); + } + + n.insert_after(a); + } + + n.remove(); + + if (s1->is_any_gpr() && !s1->is_undef() && s1->def) + unpredicate(s1->def); + if (s2->is_any_gpr() && !s2->is_undef() && s2->def) + unpredicate(s2->def); + + return false; +} + +} // namespace r600_sb diff --git a/src/gallium/drivers/r600/sb/sb_public.h b/src/gallium/drivers/r600/sb/sb_public.h new file mode 100644 index 00000000000..c9f5f97f9c8 --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_public.h @@ -0,0 +1,40 
@@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Vadim Girlin + */ + +#ifndef R600_SB_H_ +#define R600_SB_H_ + +struct r600_shader; + +void r600_sb_context_destroy(void *sctx); + +int r600_sb_bytecode_process(struct r600_context *rctx, + struct r600_bytecode *bc, + struct r600_shader *pshader, + int dump_source_bytecode, + int optimize); + +#endif //R600_SB_H_ diff --git a/src/gallium/drivers/r600/sb/sb_ra_checker.cpp b/src/gallium/drivers/r600/sb/sb_ra_checker.cpp new file mode 100644 index 00000000000..83510b02158 --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_ra_checker.cpp @@ -0,0 +1,277 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Vadim Girlin + */ + +#include <sstream> + +#include "sb_shader.h" +#include "sb_pass.h" + +namespace r600_sb { + +using std::cerr; + +int ra_checker::run() { + + rm_stack.clear(); + rm_stack.resize(1); + rm_stk_level = 0; + + process_op_dst(sh.root); + + run_on(sh.root); + + assert(rm_stk_level == 0); + + dump_all_errors(); + + assert(sh.errors.empty()); + + return 0; +} + +void ra_checker::dump_error(const error_info &e) { + + cerr << "error at : "; + dump::dump_op(e.n); + + cerr << "\n"; + cerr << " : " << e.message << "\n"; +} + +void ra_checker::dump_all_errors() { + for (error_map::iterator I = sh.errors.begin(), E = sh.errors.end(); + I != E; ++I) { + dump_error(I->second); + } +} + + +void ra_checker::error(node *n, unsigned id, std::string msg) { + error_info e; + e.n = n; + e.arg_index = id; + e.message = msg; + sh.errors.insert(std::make_pair(n, e)); +} + +void ra_checker::push_stack() { + ++rm_stk_level; + if (rm_stack.size() == rm_stk_level) + rm_stack.push_back(rm_stack.back()); + else + rm_stack[rm_stk_level] = rm_stack[rm_stk_level - 1]; +} + +void ra_checker::pop_stack() { + --rm_stk_level; +} + +void ra_checker::kill_alu_only_regs() { + // TODO +} + +void ra_checker::check_value_gpr(node *n, unsigned id, value *v) { + sel_chan gpr = v->gpr; + if (!gpr) { + std::ostringstream o; + o << "operand value " << *v << " is not allocated"; + error(n, id, o.str()); + return; + } + reg_value_map::iterator F = rmap().find(v->gpr); + if (F == rmap().end()) { + std::ostringstream o; + o << "operand value " << *v << " was not previously written to its gpr"; + error(n, id, o.str()); + return; + } + if (!F->second->v_equal(v)) { + std::ostringstream o; + o << "expected operand value " << *v + << ", gpr contains " << *(F->second); + error(n, id, o.str()); + return; + } + + +} + +void ra_checker::check_src_vec(node *n, unsigned id, vvec &vv, bool src) { + + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value *v = *I; + if (!v 
|| !v->is_sgpr()) + continue; + + if (v->is_rel()) { + if (!v->rel) { + std::ostringstream o; + o << "expected relative offset in " << *v; + error(n, id, o.str()); + return; + } + } else if (src) { + check_value_gpr(n, id, v); + } + } +} + +void ra_checker::check_op_src(node *n) { + check_src_vec(n, 0, n->dst, false); + check_src_vec(n, 100, n->src, true); +} + +void ra_checker::process_op_dst(node *n) { + + unsigned id = 0; + + for (vvec::iterator I = n->dst.begin(), E = n->dst.end(); I != E; ++I) { + value *v = *I; + + ++id; + + if (!v) + continue; + + if (v->is_sgpr()) { + + if (!v->gpr) { + std::ostringstream o; + o << "destination operand " << *v << " is not allocated"; + error(n, id, o.str()); + return; + } + + rmap()[v->gpr] = v; + } else if (v->is_rel()) { + if (v->rel->is_const()) { + rmap()[v->get_final_gpr()] = v; + } else { + unsigned sz = v->array->array_size; + unsigned start = v->array->gpr; + for (unsigned i = 0; i < sz; ++i) { + rmap()[start + (i << 2)] = v; + } + } + } + } +} + +void ra_checker::check_phi_src(container_node *p, unsigned id) { + for (node_iterator I = p->begin(), E = p->end(); I != E; ++I) { + node *n = *I; + value *s = n->src[id]; + if (s->is_sgpr()) + check_value_gpr(n, id, s); + } +} + +void ra_checker::process_phi_dst(container_node *p) { + for (node_iterator I = p->begin(), E = p->end(); I != E; ++I) { + node *n = *I; + process_op_dst(n); + } +} + +void ra_checker::check_alu_group(alu_group_node *g) { + + for (node_iterator I = g->begin(), E = g->end(); I != E; ++I) { + node *a = *I; + if (!a->is_alu_inst()) { + std::ostringstream o; + o << "non-alu node inside alu group"; + error(a, 0, o.str()); + return; + } + + check_op_src(a); + } + + std::fill(prev_dst, prev_dst + 5, (value*)NULL); + + for (node_iterator I = g->begin(), E = g->end(); I != E; ++I) { + alu_node *a = static_cast<alu_node*>(*I); + + process_op_dst(a); + + unsigned slot = a->bc.slot; + prev_dst[slot] = a->dst[0]; + } +} + +void 
ra_checker::run_on(container_node* c) { + + if (c->is_region()) { + region_node *r = static_cast<region_node*>(c); + if (r->loop_phi) { + check_phi_src(r->loop_phi, 0); + process_phi_dst(r->loop_phi); + } + } else if (c->is_depart()) { + + push_stack(); + + } else if (c->is_repeat()) { + + push_stack(); + + } + + for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) { + node *n = *I; + + if(n->is_cf_inst() || n->is_fetch_inst()) { + check_op_src(n); + process_op_dst(n); + } + + if (n->is_container()) { + if (n->is_alu_group()) { + check_alu_group(static_cast<alu_group_node*>(n)); + } else { + container_node *nc = static_cast<container_node*>(n); + run_on(nc); + } + } + } + + if (c->is_depart()) { + depart_node *r = static_cast<depart_node*>(c); + check_phi_src(r->target->phi, r->dep_id); + pop_stack(); + } else if (c->is_repeat()) { + repeat_node *r = static_cast<repeat_node*>(c); + assert (r->target->loop_phi); + + pop_stack(); + } else if (c->is_region()) { + region_node *r = static_cast<region_node*>(c); + if (r->phi) + process_phi_dst(r->phi); + } +} + +} // namespace r600_sb diff --git a/src/gallium/drivers/r600/sb/sb_ra_coalesce.cpp b/src/gallium/drivers/r600/sb/sb_ra_coalesce.cpp new file mode 100644 index 00000000000..52e76687b95 --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_ra_coalesce.cpp @@ -0,0 +1,608 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all 
copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Vadim Girlin + */ + +#define RA_DEBUG 0 + +#if RA_DEBUG +#define RA_DUMP(q) do { q } while (0) +#else +#define RA_DUMP(q) +#endif + +#include "sb_shader.h" +#include "sb_pass.h" + +namespace r600_sb { + +using std::cerr; + +int ra_coalesce::run() { + + sh.coal.run(); + + return 0; +} + +void coalescer::add_edge(value* a, value* b, unsigned cost) { + assert(a->is_sgpr() && b->is_sgpr()); + edges.insert(new ra_edge(a,b, cost)); +} + +void coalescer::create_chunk(value *v) { + + assert(v->is_sgpr()); + + ra_chunk *c = new ra_chunk(); + + c->values.push_back(v); + + if (v->is_chan_pinned()) + c->flags |= RCF_PIN_CHAN; + if (v->is_reg_pinned()) { + c->flags |= RCF_PIN_REG; + } + + c->pin = v->pin_gpr; + + RA_DUMP( + cerr << "create_chunk: "; + dump_chunk(c); + ); + + all_chunks.push_back(c); + v->chunk = c; + +} + +void coalescer::unify_chunks(ra_edge *e) { + ra_chunk *c1 = e->a->chunk, *c2 = e->b->chunk; + + RA_DUMP( + cerr << "unify_chunks: "; + dump_chunk(c1); + dump_chunk(c2); + ); + + if (c2->is_chan_pinned() && !c1->is_chan_pinned()) { + c1->flags |= RCF_PIN_CHAN; + c1->pin = sel_chan(c1->pin.sel(), c2->pin.chan()); + } + + if (c2->is_reg_pinned() && !c1->is_reg_pinned()) { + c1->flags |= RCF_PIN_REG; + c1->pin = sel_chan(c2->pin.sel(), c1->pin.chan()); + } + + c1->values.reserve(c1->values.size() + c2->values.size()); + + for (vvec::iterator I = c2->values.begin(), E = c2->values.end(); I != E; + ++I) { 
+ (*I)->chunk = c1; + c1->values.push_back(*I); + } + + chunk_vec::iterator F = std::find(all_chunks.begin(), all_chunks.end(), c2); + assert(F != all_chunks.end()); + + all_chunks.erase(F); + + c1->cost += c2->cost + e->cost; + delete c2; +} + +bool coalescer::chunks_interference(ra_chunk *c1, ra_chunk *c2) { + unsigned pin_flags = (c1->flags & c2->flags) & + (RCF_PIN_CHAN | RCF_PIN_REG); + + if ((pin_flags & RCF_PIN_CHAN) && + c1->pin.chan() != c2->pin.chan()) + return true; + + if ((pin_flags & RCF_PIN_REG) && + c1->pin.sel() != c2->pin.sel()) + return true; + + for (vvec::iterator I = c1->values.begin(), E = c1->values.end(); I != E; + ++I) { + value *v1 = *I; + + for (vvec::iterator I = c2->values.begin(), E = c2->values.end(); I != E; + ++I) { + value *v2 = *I; + + if (!v1->v_equal(v2) && v1->interferences.contains(v2)) + return true; + } + } + return false; +} + +void coalescer::build_chunks() { + + for (edge_queue::iterator I = edges.begin(), E = edges.end(); + I != E; ++I) { + + ra_edge *e = *I; + + if (!e->a->chunk) + create_chunk(e->a); + + if (!e->b->chunk) + create_chunk(e->b); + + ra_chunk *c1 = e->a->chunk, *c2 = e->b->chunk; + + if (c1 == c2) { + c1->cost += e->cost; + } else if (!chunks_interference(c1, c2)) + unify_chunks(e); + } +} + +ra_constraint* coalescer::create_constraint(constraint_kind kind) { + ra_constraint *c = new ra_constraint(kind); + all_constraints.push_back(c); + return c; +} + +void coalescer::dump_edges() { + cerr << "######## affinity edges\n"; + + for (edge_queue::iterator I = edges.begin(), E = edges.end(); + I != E; ++I) { + ra_edge* e = *I; + cerr << " ra_edge "; + dump::dump_val(e->a); + cerr << " <-> "; + dump::dump_val(e->b); + cerr << " cost = " << e->cost << "\n"; + } +} + +void coalescer::dump_chunks() { + cerr << "######## chunks\n"; + + for (chunk_vec::iterator I = all_chunks.begin(), E = all_chunks.end(); + I != E; ++I) { + ra_chunk* c = *I; + dump_chunk(c); + } +} + + +void coalescer::dump_constraint_queue() { + 
cerr << "######## constraints\n"; + + for (constraint_queue::iterator I = constraints.begin(), + E = constraints.end(); I != E; ++I) { + ra_constraint* c = *I; + dump_constraint(c); + } +} + +void coalescer::dump_chunk(ra_chunk* c) { + cerr << " ra_chunk cost = " << c->cost << " : "; + dump::dump_vec(c->values); + + if (c->flags & RCF_PIN_REG) + cerr << " REG = " << c->pin.sel(); + + if (c->flags & RCF_PIN_CHAN) + cerr << " CHAN = " << c->pin.chan(); + + cerr << (c->flags & RCF_GLOBAL ? " GLOBAL" : ""); + + cerr << "\n"; +} + +void coalescer::dump_constraint(ra_constraint* c) { + cerr << " ra_constraint: "; + switch (c->kind) { + case CK_PACKED_BS: cerr << "PACKED_BS"; break; + case CK_PHI: cerr << "PHI"; break; + case CK_SAME_REG: cerr << "SAME_REG"; break; + default: cerr << "UNKNOWN_KIND"; assert(0); break; + } + + cerr << " cost = " << c->cost << " : "; + dump::dump_vec(c->values); + + cerr << "\n"; +} + +void coalescer::get_chunk_interferences(ra_chunk *c, val_set &s) { + + for (vvec::iterator I = c->values.begin(), E = c->values.end(); I != E; + ++I) { + value *v = *I; + s.add_set(v->interferences); + } + s.remove_vec(c->values); +} + +void coalescer::build_chunk_queue() { + for (chunk_vec::iterator I = all_chunks.begin(), + E = all_chunks.end(); I != E; ++I) { + ra_chunk *c = *I; + + if (!c->is_fixed()) + chunks.insert(c); + } +} + +void coalescer::build_constraint_queue() { + for (constraint_vec::iterator I = all_constraints.begin(), + E = all_constraints.end(); I != E; ++I) { + ra_constraint *c = *I; + unsigned cost = 0; + + if (c->values.empty() || !c->values.front()->is_sgpr()) + continue; + + if (c->kind != CK_SAME_REG) + continue; + + for (vvec::iterator I = c->values.begin(), E = c->values.end(); + I != E; ++I) { + value *v = *I; + if (!v->chunk) + create_chunk(v); + else + cost += v->chunk->cost; + } + c->cost = cost; + constraints.insert(c); + } +} + +void coalescer::color_chunks() { + + for (chunk_queue::iterator I = chunks.begin(), E = 
chunks.end(); + I != E; ++I) { + ra_chunk *c = *I; + if (c->is_fixed() || c->values.size() == 1) + continue; + + sb_bitset rb; + val_set interf; + + get_chunk_interferences(c, interf); + + RA_DUMP( + cerr << "color_chunks: "; + dump_chunk(c); + cerr << "\n interferences: "; + dump::dump_set(sh,interf); + cerr << "\n"; + ); + + init_reg_bitset(rb, interf); + + unsigned pass = c->is_reg_pinned() ? 0 : 1; + + unsigned cs = c->is_chan_pinned() ? c->pin.chan() : 0; + unsigned ce = c->is_chan_pinned() ? cs + 1 : 4; + + unsigned color = 0; + + while (pass < 2) { + + unsigned rs, re; + + if (pass == 0) { + rs = c->pin.sel(); + re = rs + 1; + } else { + rs = 0; + re = sh.num_nontemp_gpr(); + } + + for (unsigned reg = rs; reg < re; ++reg) { + for (unsigned chan = cs; chan < ce; ++chan) { + unsigned bit = sel_chan(reg, chan); + if (bit >= rb.size() || !rb.get(bit)) { + color = bit; + break; + } + } + if (color) + break; + } + + if (color) + break; + + ++pass; + } + + assert(color); + color_chunk(c, color); + } +} + +void coalescer::init_reg_bitset(sb_bitset &bs, val_set &vs) { + + for (val_set::iterator I = vs.begin(sh), E = vs.end(sh); I != E; ++I) { + value *v = *I; + + if (!v->is_sgpr()) + continue; + + if (v->gpr) { + if (v->gpr >= bs.size()) + bs.resize(v->gpr + 64); + bs.set(v->gpr, 1); + } + } +} + +void coalescer::color_chunk(ra_chunk *c, sel_chan color) { + + vvec vv = c->values; + + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; + ++I) { + value *v = *I; + + if (v->is_reg_pinned() && v->pin_gpr.sel() != color.sel()) { + detach_value(v); + continue; + } + + if (v->is_chan_pinned() && v->pin_gpr.chan() != color.chan()) { + detach_value(v); + continue; + } + + v->gpr = color; + + if (v->constraint && v->constraint->kind == CK_PHI) + v->fix(); + + + RA_DUMP( + cerr << " assigned " << color << " to "; + dump::dump_val(v); + cerr << "\n"; + ); + } + + c->pin = color; + + if (c->is_reg_pinned()) { + c->fix(); + } +} + +coalescer::~coalescer() { + + // FIXME use 
pool allocator ?? + + for (constraint_vec::iterator I = all_constraints.begin(), + E = all_constraints.end(); I != E; ++I) { + delete (*I); + } + + for (chunk_vec::iterator I = all_chunks.begin(), + E = all_chunks.end(); I != E; ++I) { + delete (*I); + } + + for (edge_queue::iterator I = edges.begin(), E = edges.end(); + I != E; ++I) { + delete (*I); + } +} + +void coalescer::run() { + RA_DUMP( dump_edges(); ); + + build_chunks(); + RA_DUMP( dump_chunks(); ); + + build_constraint_queue(); + RA_DUMP( dump_constraint_queue(); ); + + color_constraints(); + + build_chunk_queue(); + color_chunks(); +} + +void coalescer::color_phi_constraint(ra_constraint* c) { +} + +ra_chunk* coalescer::detach_value(value *v) { + + vvec::iterator F = std::find(v->chunk->values.begin(), + v->chunk->values.end(), v); + + assert(F != v->chunk->values.end()); + v->chunk->values.erase(F); + create_chunk(v); + + if (v->is_reg_pinned()) { + v->chunk->fix(); + } + + RA_DUMP( + cerr << " detached : "; + dump_chunk(v->chunk); + ); + + return v->chunk; + +} + +void coalescer::color_reg_constraint(ra_constraint *c) { + unsigned k, cnt = c->values.size(); + vvec & cv = c->values; + + ra_chunk *ch[4]; + unsigned swz[4] = {0, 1, 2, 3}; + val_set interf[4]; + sb_bitset rb[4]; + + bool reg_pinned = false; + unsigned pin_reg = ~0; + + unsigned chan_mask = 0; + + k = 0; + for (vvec::iterator I = cv.begin(), E = cv.end(); I != E; ++I, ++k) { + value *v = *I; + + if (!v->chunk) + create_chunk(v); + + ch[k] = v->chunk; + + if (v->chunk->is_chan_pinned()) { + unsigned chan = 1 << v->chunk->pin.chan(); + + if (chan & chan_mask) { // channel already in use + ch[k] = detach_value(v); + assert(!ch[k]->is_chan_pinned()); + } else { + chan_mask |= chan; + } + } + + if (v->chunk->is_reg_pinned()) { + if (!reg_pinned) { + reg_pinned = true; + pin_reg = v->chunk->pin.sel(); + } + } + + get_chunk_interferences(ch[k], interf[k]); + init_reg_bitset(rb[k], interf[k]); + } + + unsigned start_reg, end_reg; + + start_reg = 
0; + end_reg = sh.num_nontemp_gpr(); + + unsigned min_reg = end_reg; + unsigned min_swz[4]; + unsigned i, pass = reg_pinned ? 0 : 1; + + bool done = false; + + while (pass < 2) { + + unsigned rs, re; + + if (pass == 0) { + re = pin_reg + 1; + rs = pin_reg; + } else { + re = end_reg; + rs = start_reg; + } + + min_reg = re; + + // cycle on swizzle combinations + do { + for (i = 0; i < cnt; ++i) { + if (ch[i]->flags & RCF_PIN_CHAN) + if (ch[i]->pin.chan() != swz[i]) + break; + } + if (i != cnt) + continue; + + // looking for minimal reg number such that the constrained chunks + // may be colored with the current swizzle combination + for (unsigned reg = rs; reg < min_reg; ++reg) { + for (i = 0; i < cnt; ++i) { + unsigned bit = sel_chan(reg, swz[i]); + if (bit < rb[i].size() && rb[i].get(bit)) + break; + } + if (i == cnt) { + done = true; + min_reg = reg; + std::copy(swz, swz + 4, min_swz); + break; + } + } + + if (pass == 0 && done) + break; + + } while (std::next_permutation(swz, swz + 4)); + + if (pass == 0 && done) + break; + + ++pass; + }; + + assert(done); + + RA_DUMP( + cerr << "min reg = " << min_reg << " min_swz = " + << min_swz[0] << min_swz[1] << min_swz[2] << min_swz[3] << "\n"; + ); + + for (i = 0; i < cnt; ++i) { + sel_chan color(min_reg, min_swz[i]); + ra_chunk *cc = ch[i]; + + if (cc->is_fixed()) { + if (cc->pin != color) + cc = detach_value(cv[i]); + else + continue; + } + + color_chunk(cc, color); + cc->fix(); + } +} + +void coalescer::color_constraints() { + for (constraint_queue::iterator I = constraints.begin(), + E = constraints.end(); I != E; ++I) { + + ra_constraint *c = *I; + + RA_DUMP( + cerr << "color_constraints: "; + dump_constraint(c); + ); + + if (c->kind == CK_SAME_REG) + color_reg_constraint(c); + else if (c->kind == CK_PHI) + color_phi_constraint(c); + } +} + +} // namespace r600_sb diff --git a/src/gallium/drivers/r600/sb/sb_ra_init.cpp b/src/gallium/drivers/r600/sb/sb_ra_init.cpp new file mode 100644 index 00000000000..75b2d5d9a0c 
--- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_ra_init.cpp @@ -0,0 +1,793 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Vadim Girlin + */ + +#define RA_DEBUG 0 + +#if RA_DEBUG +#define RA_DUMP(q) do { q } while (0) +#else +#define RA_DUMP(q) +#endif + +#include <cstring> +#include <iostream> +#include <iomanip> + +#include "sb_bc.h" +#include "sb_shader.h" + +#include "sb_pass.h" + +namespace r600_sb { + +using std::cerr; + +class regbits { + typedef uint32_t basetype; + static const unsigned bt_bytes = sizeof(basetype); + static const unsigned bt_index_shift = 5; + static const unsigned bt_index_mask = (1u << bt_index_shift) - 1; + static const unsigned bt_bits = bt_bytes << 3; + static const unsigned size = MAX_GPR * 4 / bt_bits; + + basetype dta[size]; + + unsigned num_temps; + +public: + + regbits(unsigned num_temps) : dta(), num_temps(num_temps) {} + regbits(unsigned num_temps, unsigned value) : num_temps(num_temps) + { set_all(value); } + + regbits(shader &sh, val_set &vs) : num_temps(sh.get_ctx().alu_temp_gprs) + { set_all(1); from_val_set(sh, vs); } + + void set_all(unsigned val); + void from_val_set(shader &sh, val_set &vs); + + void set(unsigned index); + void clear(unsigned index); + bool get(unsigned index); + + void set(unsigned index, unsigned val); + + sel_chan find_free_bit(unsigned start); + sel_chan find_free_chans(unsigned mask); + sel_chan find_free_array(unsigned size, unsigned mask); + + void dump(); +}; + +// ======================================= + +void regbits::dump() { + for (unsigned i = 0; i < size * bt_bits; ++i) { + + if (!(i & 31)) + cerr << "\n"; + + if (!(i & 3)) + cerr << " " << std::setw(3) << (i / 4) << " "; + + cerr << (get(i) ? 1 : 0); + } +} + + +void regbits::set_all(unsigned v) { + memset(&dta, v ? 
0xFF : 0x00, size * bt_bytes); +} + +void regbits::from_val_set(shader &sh, val_set& vs) { + val_set &s = vs; + unsigned g; + for (val_set::iterator I = s.begin(sh), E = s.end(sh); I != E; ++I) { + value *v = *I; + if (v->is_any_gpr()) { + g = v->get_final_gpr(); + if (!g) + continue; + } else + continue; + + assert(g); + --g; + assert(g < 512); + clear(g); + } +} + +void regbits::set(unsigned index) { + unsigned ih = index >> bt_index_shift; + unsigned il = index & bt_index_mask; + dta[ih] |= ((basetype)1u << il); +} + +void regbits::clear(unsigned index) { + unsigned ih = index >> bt_index_shift; + unsigned il = index & bt_index_mask; + assert(ih < size); + dta[ih] &= ~((basetype)1u << il); +} + +bool regbits::get(unsigned index) { + unsigned ih = index >> bt_index_shift; + unsigned il = index & bt_index_mask; + return dta[ih] & ((basetype)1u << il); +} + +void regbits::set(unsigned index, unsigned val) { + unsigned ih = index >> bt_index_shift; + unsigned il = index & bt_index_mask; + basetype bm = 1u << il; + dta[ih] = (dta[ih] & ~bm) | (val << il); +} + +// free register for ra means the bit is set +sel_chan regbits::find_free_bit(unsigned start) { + unsigned elt = start >> bt_index_shift; + unsigned bit = start & bt_index_mask; + + unsigned end = start < MAX_GPR - num_temps ? MAX_GPR - num_temps : MAX_GPR; + + while (elt < end && !dta[elt]) { + ++elt; + bit = 0; + } + + if (elt >= end) + return 0; + + // FIXME this seems broken when not starting from 0 + + bit += __builtin_ctz(dta[elt]); + return ((elt << bt_index_shift) | bit) + 1; +} + +// find free gpr component to use as indirectly addressable array +sel_chan regbits::find_free_array(unsigned length, unsigned mask) { + unsigned cc[4] = {}; + + // FIXME optimize this. 
though hopefully we won't have a lot of arrays + for (unsigned a = 0; a < MAX_GPR - num_temps; ++a) { + for(unsigned c = 0; c < MAX_CHAN; ++c) { + if (mask & (1 << c)) { + if (get((a << 2) | c)) { + if (++cc[c] == length) + return sel_chan(a - length + 1, c); + } else { + cc[c] = 0; + } + } + } + } + return 0; +} + +sel_chan regbits::find_free_chans(unsigned mask) { + unsigned elt = 0; + unsigned bit = 0; + + basetype cd = dta[elt] >> bit; + + do { + + if (!cd) { + if (++elt < size) + cd = dta[elt]; + else + return 0; + + bit = 0; + } + + unsigned p = __builtin_ctz(cd) & ~(basetype)3u; + + if (p > bt_bits - bit) { + if (++elt < size) + cd = dta[elt]; + else + return 0; + bit = 0; + } + + bit += p; + cd >>= p; + + if ((cd & mask) == mask) { + return ((elt << bt_index_shift) | bit) + 1; + } + + bit += 4; + cd >>= 4; + + } while (1); + + return 0; +} + +// ================================ + +void ra_init::alloc_arrays() { + + gpr_array_vec &ga = sh.arrays(); + + for(gpr_array_vec::iterator I = ga.begin(), E = ga.end(); I != E; ++I) { + gpr_array *a = *I; + + RA_DUMP( + cerr << "array [" << a->array_size << "] at " << a->base_gpr << "\n"; + cerr << "\n"; + ); + + bool dead = a->is_dead(); + + if (dead) { + RA_DUMP( cerr << " DEAD\n"; ); + continue; + } + + val_set &s = a->interferences; + + + for (val_set::iterator I = s.begin(sh), E = s.end(sh); I != E; ++I) { + value *v = *I; + if (v->array == a) + s.remove_val(v); + } + + RA_DUMP( + cerr << " interf: "; + dump::dump_set(sh, s); + cerr << "\n"; + ); + + regbits rb(sh, s); + + sel_chan base = rb.find_free_array(a->array_size, + (1 << a->base_gpr.chan())); + + RA_DUMP( cerr << " found base: " << base << "\n"; ); + + a->gpr = base; + } +} + + +int ra_init::run() { + + alloc_arrays(); + + ra_node(sh.root); + return 0; +} + +void ra_init::ra_node(container_node* c) { + + for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) { + node *n = *I; + if (n->type == NT_OP) { + process_op(n); + } + if (n->is_container() && 
!n->is_alu_packed()) { + ra_node(static_cast<container_node*>(n)); + } + } +} + +void ra_init::process_op(node* n) { + + bool copy = n->is_copy_mov(); + + RA_DUMP( + cerr << "ra_init: process_op : "; + dump::dump_op(n); + cerr << "\n"; + ); + + if (n->is_alu_packed()) { + for (vvec::iterator I = n->src.begin(), E = n->src.end(); I != E; ++I) { + value *v = *I; + if (v && v->is_sgpr() && v->constraint && + v->constraint->kind == CK_PACKED_BS) { + color_bs_constraint(v->constraint); + break; + } + } + } + + if (n->is_fetch_inst() || n->is_cf_inst()) { + for (vvec::iterator I = n->src.begin(), E = n->src.end(); I != E; ++I) { + value *v = *I; + if (v && v->is_sgpr()) + color(v); + } + } + + for (vvec::iterator I = n->dst.begin(), E = n->dst.end(); I != E; ++I) { + value *v = *I; + if (!v) + continue; + if (v->is_sgpr()) { + if (!v->gpr) { + if (copy && !v->constraint) { + value *s = *(n->src.begin() + (I - n->dst.begin())); + assert(s); + if (s->is_sgpr()) { + assign_color(v, s->gpr); + } + } else + color(v); + } + } + } +} + +void ra_init::color_bs_constraint(ra_constraint* c) { + vvec &vv = c->values; + assert(vv.size() <= 8); + + RA_DUMP( + cerr << "color_bs_constraint: "; + dump::dump_vec(vv); + cerr << "\n"; + ); + + regbits rb(ctx.alu_temp_gprs); + + unsigned chan_count[4] = {}; + unsigned allowed_chans = 0b1111; + + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value *v = *I; + sel_chan gpr = v->get_final_gpr(); + + if (!v || v->is_dead()) + continue; + + val_set interf; + + if (v->chunk) + sh.coal.get_chunk_interferences(v->chunk, interf); + else + interf = v->interferences; + + RA_DUMP( + cerr << " processing " << *v << " interferences : "; + dump::dump_set(sh, interf); + cerr << "\n"; + ); + + if (gpr) { + unsigned chan = gpr.chan(); + if (chan_count[chan] < 3) { + ++chan_count[chan]; + continue; + } else { + v->flags &= ~VLF_FIXED; + allowed_chans &= ~(1 << chan); + assert(allowed_chans); + } + } + + v->gpr = 0; + + gpr = 1; + 
rb.set_all(1); + + + rb.from_val_set(sh, interf); + + RA_DUMP( + cerr << " regbits : "; + rb.dump(); + cerr << "\n"; + ); + + while (allowed_chans && gpr.sel() < sh.num_nontemp_gpr()) { + + while (rb.get(gpr - 1) == 0) + gpr = gpr + 1; + + RA_DUMP( + cerr << " trying " << gpr << "\n"; + ); + + unsigned chan = gpr.chan(); + if (chan_count[chan] < 3) { + ++chan_count[chan]; + + if (v->chunk) { + vvec::iterator F = std::find(v->chunk->values.begin(), + v->chunk->values.end(), + v); + v->chunk->values.erase(F); + v->chunk = NULL; + } + + assign_color(v, gpr); + break; + } else { + allowed_chans &= ~(1 << chan); + } + gpr = gpr + 1; + } + + if (!gpr) { + cerr << "color_bs_constraint: failed...\n"; + assert(!"coloring failed"); + } + } +} + +void ra_init::color(value* v) { + + if (v->constraint && v->constraint->kind == CK_PACKED_BS) { + color_bs_constraint(v->constraint); + return; + } + + if (v->chunk && v->chunk->is_fixed()) + return; + + RA_DUMP( + cerr << "coloring "; + dump::dump_val(v); + cerr << " interferences "; + dump::dump_set(sh, v->interferences); + cerr << "\n"; + ); + + if (v->is_reg_pinned()) { + assert(v->is_chan_pinned()); + assign_color(v, v->pin_gpr); + return; + } + + regbits rb(sh, v->interferences); + sel_chan c; + + if (v->is_chan_pinned()) { + RA_DUMP( cerr << "chan_pinned = " << v->pin_gpr.chan() << " "; ); + unsigned mask = 1 << v->pin_gpr.chan(); + c = rb.find_free_chans(mask) + v->pin_gpr.chan(); + } else { + c = rb.find_free_bit(0); + } + + assert(c && c.sel() < 128 - ctx.alu_temp_gprs && "color failed"); + assign_color(v, c); +} + +void ra_init::assign_color(value* v, sel_chan c) { + v->gpr = c; + RA_DUMP( + cerr << "colored "; + dump::dump_val(v); + cerr << " to " << c << "\n"; + ); +} + +// =================================================== + +int ra_split::run() { + split(sh.root); + return 0; +} + +void ra_split::split_phi_src(container_node *loc, container_node *c, + unsigned id, bool loop) { + for (node_iterator I = c->begin(), E = 
c->end(); I != E; ++I) { + node *p = *I; + value* &v = p->src[id], *d = p->dst[0]; + assert(v); + + if (!d->is_sgpr() || v->is_undef()) + continue; + + value *t = sh.create_temp_value(); + if (loop && id == 0) + loc->insert_before(sh.create_copy_mov(t, v)); + else + loc->push_back(sh.create_copy_mov(t, v)); + v = t; + + sh.coal.add_edge(v, d, coalescer::phi_cost); + } +} + +void ra_split::split_phi_dst(node* loc, container_node *c, bool loop) { + for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) { + node *p = *I; + value* &v = p->dst[0]; + assert(v); + + if (!v->is_sgpr()) + continue; + + value *t = sh.create_temp_value(); + node *cp = sh.create_copy_mov(v, t); + if (loop) + static_cast<container_node*>(loc)->push_front(cp); + else + loc->insert_after(cp); + v = t; + } +} + + +void ra_split::init_phi_constraints(container_node *c) { + for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) { + node *p = *I; + ra_constraint *cc = sh.coal.create_constraint(CK_PHI); + cc->values.push_back(p->dst[0]); + + for (vvec::iterator I = p->src.begin(), E = p->src.end(); I != E; ++I) { + value *v = *I; + if (v->is_sgpr()) + cc->values.push_back(v); + } + + cc->update_values(); + } +} + +void ra_split::split(container_node* n) { + + if (n->type == NT_DEPART) { + depart_node *d = static_cast<depart_node*>(n); + if (d->target->phi) + split_phi_src(d, d->target->phi, d->dep_id, false); + } else if (n->type == NT_REPEAT) { + repeat_node *r = static_cast<repeat_node*>(n); + if (r->target->loop_phi) + split_phi_src(r, r->target->loop_phi, r->rep_id, true); + } else if (n->type == NT_REGION) { + region_node *r = static_cast<region_node*>(n); + if (r->phi) { + split_phi_dst(r, r->phi, false); + } + if (r->loop_phi) { + split_phi_dst(r->get_entry_code_location(), r->loop_phi, + true); + split_phi_src(r, r->loop_phi, 0, true); + } + } + + for (node_riterator N, I = n->rbegin(), E = n->rend(); I != E; I = N) { + N = I; + ++N; + node *o = *I; + if (o->type == NT_OP) { + 
split_op(o); + } else if (o->is_container()) { + split(static_cast<container_node*>(o)); + } + } + + if (n->type == NT_REGION) { + region_node *r = static_cast<region_node*>(n); + if (r->phi) + init_phi_constraints(r->phi); + if (r->loop_phi) + init_phi_constraints(r->loop_phi); + } +} + +void ra_split::split_op(node* n) { + switch(n->subtype) { + case NST_ALU_PACKED_INST: + split_alu_packed(static_cast<alu_packed_node*>(n)); + break; + case NST_FETCH_INST: + case NST_CF_INST: + split_vector_inst(n); + default: + break; + } +} + +void ra_split::split_packed_ins(alu_packed_node *n) { + vvec vv = n->src; + vvec sv, dv; + + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + + value *&v = *I; + + if (v && v->is_any_gpr() && !v->is_undef()) { + + vvec::iterator F = std::find(sv.begin(), sv.end(), v); + value *t; + + if (F != sv.end()) { + t = *(dv.begin() + (F - sv.begin())); + } else { + t = sh.create_temp_value(); + sv.push_back(v); + dv.push_back(t); + } + v = t; + } + } + + unsigned cnt = sv.size(); + + if (cnt > 0) { + n->src = vv; + for (vvec::iterator SI = sv.begin(), DI = dv.begin(), SE = sv.end(); + SI != SE; ++SI, ++DI) { + n->insert_before(sh.create_copy_mov(*DI, *SI)); + } + + ra_constraint *c = sh.coal.create_constraint(CK_PACKED_BS); + c->values = dv; + c->update_values(); + } +} + +// TODO handle other packed ops for cayman +void ra_split::split_alu_packed(alu_packed_node* n) { + switch (n->op()) { + case ALU_OP2_DOT4: + case ALU_OP2_CUBE: + split_packed_ins(n); + break; + default: + break; + } +} + +void ra_split::split_vec(vvec &vv, vvec &v1, vvec &v2, bool allow_swz) { + unsigned ch = 0; + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I, ++ch) { + + value* &o = *I; + + if (o) { + + assert(!o->is_dead()); + + if (o->is_undef()) + continue; + + if (allow_swz && o->is_float_0_or_1()) + continue; + + value *t; + vvec::iterator F = + allow_swz ? 
find(v2.begin(), v2.end(), o) : v2.end(); + + if (F != v2.end()) { + t = *(v1.begin() + (F - v2.begin())); + } else { + t = sh.create_temp_value(); + + if (!allow_swz) { + t->flags |= VLF_PIN_CHAN; + t->pin_gpr = sel_chan(0, ch); + } + + v2.push_back(o); + v1.push_back(t); + } + o = t; + } + } +} + +void ra_split::split_vector_inst(node* n) { + ra_constraint *c; + + bool call_fs = n->is_cf_op(CF_OP_CALL_FS); + bool no_src_swizzle = n->is_cf_inst() && (n->cf_op_flags() & CF_MEM); + + no_src_swizzle |= n->is_fetch_op(FETCH_OP_VFETCH) || + n->is_fetch_op(FETCH_OP_SEMFETCH); + + if (!n->src.empty() && !call_fs) { + + // we may have more than one source vector - + // fetch instructions with FF_USEGRAD have gradient values in + // src vectors 1 (src[4-7] and 2 (src[8-11]) + + unsigned nvec = n->src.size() >> 2; + assert(nvec << 2 == n->src.size()); + + for (unsigned nv = 0; nv < nvec; ++nv) { + vvec sv, tv, nsrc(4); + unsigned arg_start = nv << 2; + + std::copy(n->src.begin() + arg_start, + n->src.begin() + arg_start + 4, + nsrc.begin()); + + split_vec(nsrc, tv, sv, !no_src_swizzle); + + unsigned cnt = sv.size(); + + if (no_src_swizzle || cnt) { + + std::copy(nsrc.begin(), nsrc.end(), n->src.begin() + arg_start); + + for(unsigned i = 0, s = tv.size(); i < s; ++i) { + n->insert_before(sh.create_copy_mov(tv[i], sv[i])); + } + + c = sh.coal.create_constraint(CK_SAME_REG); + c->values = tv; + c->update_values(); + } + } + } + + if (!n->dst.empty()) { + vvec sv, tv, ndst = n->dst; + + split_vec(ndst, tv, sv, true); + + if (sv.size()) { + n->dst = ndst; + + node *lp = n; + for(unsigned i = 0, s = tv.size(); i < s; ++i) { + lp->insert_after(sh.create_copy_mov(sv[i], tv[i])); + lp = lp->next; + } + + if (call_fs) { + for (unsigned i = 0, cnt = tv.size(); i < cnt; ++i) { + value *v = tv[i]; + value *s = sv[i]; + if (!v) + continue; + + v->flags |= VLF_PIN_REG | VLF_PIN_CHAN; + s->flags &= ~(VLF_PIN_REG | VLF_PIN_CHAN); + sel_chan sel; + + if (s->is_rel()) { + 
assert(s->rel->is_const()); + sel = sel_chan(s->select.sel() + + s->rel->get_const_value().u, + s->select.chan()); + } else + sel = s->select; + + v->gpr = v->pin_gpr = sel; + v->fix(); + } + } else { + c = sh.coal.create_constraint(CK_SAME_REG); + c->values = tv; + c->update_values(); + } + } + } +} + +} // namespace r600_sb diff --git a/src/gallium/drivers/r600/sb/sb_sched.cpp b/src/gallium/drivers/r600/sb/sb_sched.cpp new file mode 100644 index 00000000000..06c362a861f --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_sched.cpp @@ -0,0 +1,1967 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Vadim Girlin + */ + +#define PSC_DEBUG 0 + +#if PSC_DEBUG +#define PSC_DUMP(a) do { a } while (0) +#else +#define PSC_DUMP(a) +#endif + +#include "sb_bc.h" +#include "sb_shader.h" + +#include "sb_pass.h" +#include "sb_sched.h" + +namespace r600_sb { + +using std::cerr; + +rp_kcache_tracker::rp_kcache_tracker(shader &sh) : rp(), uc(), + sel_count(sh.get_ctx().is_r600() ? 4 : 2) {} + +bool rp_kcache_tracker::try_reserve(sel_chan r) { + unsigned sel = kc_sel(r); + + for (unsigned i = 0; i < sel_count; ++i) { + if (rp[i] == 0) { + rp[i] = sel; + ++uc[i]; + return true; + } + if (rp[i] == sel) { + ++uc[i]; + return true; + } + } + return false; +} + +bool rp_kcache_tracker::try_reserve(node* n) { + bool need_unreserve = false; + vvec::iterator I(n->src.begin()), E(n->src.end()); + + for (; I != E; ++I) { + value *v = *I; + if (v->is_kcache()) { + if (!try_reserve(v->select)) + break; + else + need_unreserve = true; + } + } + if (I == E) + return true; + + if (need_unreserve && I != n->src.begin()) { + do { + --I; + value *v =*I; + if (v->is_kcache()) + unreserve(v->select); + } while (I != n->src.begin()); + } + return false; +} + +inline +void rp_kcache_tracker::unreserve(node* n) { + vvec::iterator I(n->src.begin()), E(n->src.end()); + for (; I != E; ++I) { + value *v = *I; + if (v->is_kcache()) + unreserve(v->select); + } +} + +void rp_kcache_tracker::unreserve(sel_chan r) { + unsigned sel = kc_sel(r); + + for (unsigned i = 0; i < sel_count; ++i) + if (rp[i] == sel) { + if (--uc[i] == 0) + rp[i] = 0; + return; + } + assert(0); + return; +} + +bool literal_tracker::try_reserve(alu_node* n) { + bool need_unreserve = false; + + vvec::iterator I(n->src.begin()), E(n->src.end()); + + for (; I != E; ++I) { + value *v = *I; + if (v->is_literal()) { + if (!try_reserve(v->literal_value)) + break; + else + need_unreserve = true; + } + } + if (I == E) + return true; + + if (need_unreserve && I != n->src.begin()) { + do { + --I; + value *v =*I; + if 
(v->is_literal()) + unreserve(v->literal_value); + } while (I != n->src.begin()); + } + return false; +} + +void literal_tracker::unreserve(alu_node* n) { + unsigned nsrc = n->bc.op_ptr->src_count, i; + + for (i = 0; i < nsrc; ++i) { + value *v = n->src[i]; + if (v->is_literal()) + unreserve(v->literal_value); + } +} + +bool literal_tracker::try_reserve(literal l) { + + PSC_DUMP( cerr << "literal reserve " << l.u << " " << l.f << "\n"; ); + + for (unsigned i = 0; i < MAX_ALU_LITERALS; ++i) { + if (lt[i] == 0) { + lt[i] = l; + ++uc[i]; + PSC_DUMP( cerr << " reserved new uc = " << uc[i] << "\n"; ); + return true; + } else if (lt[i] == l) { + ++uc[i]; + PSC_DUMP( cerr << " reserved uc = " << uc[i] << "\n"; ); + return true; + } + } + PSC_DUMP( cerr << " failed to reserve literal\n"; ); + return false; +} + +void literal_tracker::unreserve(literal l) { + + PSC_DUMP( cerr << "literal unreserve " << l.u << " " << l.f << "\n"; ); + + for (unsigned i = 0; i < MAX_ALU_LITERALS; ++i) { + if (lt[i] == l) { + if (--uc[i] == 0) + lt[i] = 0; + return; + } + } + assert(0); + return; +} + +static inline unsigned bs_cycle_vector(unsigned bs, unsigned src) { + static const unsigned swz[VEC_NUM][3] = { + {0, 1, 2}, {0, 2, 1}, {1, 2, 0}, {1, 0, 2}, {2, 0, 1}, {2, 1, 0} + }; + assert(bs < VEC_NUM && src < 3); + return swz[bs][src]; +} + +static inline unsigned bs_cycle_scalar(unsigned bs, unsigned src) { + static const unsigned swz[SCL_NUM][3] = { + {2, 1, 0}, {1, 2, 2}, {2, 1, 2}, {2, 2, 1} + }; + + if (bs >= SCL_NUM || src >= 3) { + // this prevents gcc warning "array subscript is above array bounds" + // AFAICS we should never hit this path + abort(); + } + return swz[bs][src]; +} + +static inline unsigned bs_cycle(bool trans, unsigned bs, unsigned src) { + return trans ? 
bs_cycle_scalar(bs, src) : bs_cycle_vector(bs, src); +} + +inline +bool rp_gpr_tracker::try_reserve(unsigned cycle, unsigned sel, unsigned chan) { + ++sel; + if (rp[cycle][chan] == 0) { + rp[cycle][chan] = sel; + ++uc[cycle][chan]; + return true; + } else if (rp[cycle][chan] == sel) { + ++uc[cycle][chan]; + return true; + } + return false; +} + +inline +void rp_gpr_tracker::unreserve(alu_node* n) { + unsigned nsrc = n->bc.op_ptr->src_count, i; + unsigned trans = n->bc.slot == SLOT_TRANS; + unsigned bs = n->bc.bank_swizzle; + unsigned opt = !trans + && n->bc.src[0].sel == n->bc.src[1].sel + && n->bc.src[0].chan == n->bc.src[1].chan; + + for (i = 0; i < nsrc; ++i) { + value *v = n->src[i]; + if (v->is_readonly()) + continue; + if (i == 1 && opt) + continue; + unsigned cycle = bs_cycle(trans, bs, i); + unreserve(cycle, n->bc.src[i].sel, n->bc.src[i].chan); + } +} + +inline +void rp_gpr_tracker::unreserve(unsigned cycle, unsigned sel, unsigned chan) { + ++sel; + assert(rp[cycle][chan] == sel && uc[cycle][chan]); + if (--uc[cycle][chan] == 0) + rp[cycle][chan] = 0; +} + +inline +bool rp_gpr_tracker::try_reserve(alu_node* n) { + unsigned nsrc = n->bc.op_ptr->src_count, i; + unsigned trans = n->bc.slot == SLOT_TRANS; + unsigned bs = n->bc.bank_swizzle; + unsigned opt = !trans && nsrc >= 2 && + n->src[0] == n->src[1]; + + bool need_unreserve = false; + unsigned const_count = 0, min_gpr_cycle = 3; + + for (i = 0; i < nsrc; ++i) { + value *v = n->src[i]; + if (v->is_readonly()) { + const_count++; + if (trans && const_count == 3) + break; + } else { + if (i == 1 && opt) + continue; + + unsigned cycle = bs_cycle(trans, bs, i); + + if (trans && cycle < min_gpr_cycle) + min_gpr_cycle = cycle; + + if (const_count && cycle < const_count && trans) + break; + + if (!try_reserve(cycle, n->bc.src[i].sel, n->bc.src[i].chan)) + break; + else + need_unreserve = true; + } + } + + if ((i == nsrc) && (min_gpr_cycle + 1 > const_count)) + return true; + + if (need_unreserve && i--) { + do { + 
value *v = n->src[i]; + if (!v->is_readonly()) { + if (i == 1 && opt) + continue; + unreserve(bs_cycle(trans, bs, i), n->bc.src[i].sel, + n->bc.src[i].chan); + } + } while (i--); + } + return false; +} + +alu_group_tracker::alu_group_tracker(shader &sh) + : sh(sh), kc(sh), + gpr(), lt(), slots(), + max_slots(sh.get_ctx().is_cayman() ? 4 : 5), + has_mova(), uses_ar(), has_predset(), has_kill(), + updates_exec_mask(), chan_count(), interp_param(), next_id() { + + available_slots = sh.get_ctx().has_trans ? 0b11111 : 0b01111; +} + +inline +sel_chan alu_group_tracker::get_value_id(value* v) { + unsigned &id = vmap[v]; + if (!id) + id = ++next_id; + return sel_chan(id, v->get_final_chan()); +} + +inline +void alu_group_tracker::assign_slot(unsigned slot, alu_node* n) { + update_flags(n); + slots[slot] = n; + available_slots &= ~(1 << slot); + + unsigned param = n->interp_param(); + + if (param) { + assert(!interp_param || interp_param == param); + interp_param = param; + } +} + + +void alu_group_tracker::discard_all_slots(container_node &removed_nodes) { + PSC_DUMP( cerr << "agt::discard_all_slots\n"; ); + discard_slots(~available_slots & ((1 << max_slots) - 1), removed_nodes); +} + +void alu_group_tracker::discard_slots(unsigned slot_mask, + container_node &removed_nodes) { + + PSC_DUMP( + cerr << "discard_slots : packed_ops : " << packed_ops.size() << "\n"; + ); + + for (node_vec::iterator N, I = packed_ops.begin(); + I != packed_ops.end(); I = N) { + N = I; ++N; + + alu_packed_node *n = static_cast<alu_packed_node*>(*I); + unsigned pslots = n->get_slot_mask(); + + PSC_DUMP( + cerr << "discard_slots : packed slot_mask : " << pslots << "\n"; + ); + + if (pslots & slot_mask) { + + PSC_DUMP( + cerr << "discard_slots : discarding packed...\n"; + ); + + removed_nodes.push_back(n); + slot_mask &= ~pslots; + N = packed_ops.erase(I); + available_slots |= pslots; + for (unsigned k = 0; k < max_slots; ++k) { + if (pslots & (1 << k)) + slots[k] = NULL; + } + } + } + + for 
(unsigned slot = 0; slot < max_slots; ++slot) { + unsigned slot_bit = 1 << slot; + + if (slot_mask & slot_bit) { + assert(!(available_slots & slot_bit)); + assert(slots[slot]); + + assert(!(slots[slot]->bc.slot_flags & AF_4SLOT)); + + PSC_DUMP( + cerr << "discarding slot " << slot << " : "; + dump::dump_op(slots[slot]); + cerr << "\n"; + ); + + removed_nodes.push_back(slots[slot]); + slots[slot] = NULL; + available_slots |= slot_bit; + } + } + + alu_node *t = slots[4]; + if (t && (t->bc.slot_flags & AF_V)) { + unsigned chan = t->bc.dst_chan; + if (!slots[chan]) { + PSC_DUMP( + cerr << "moving "; + dump::dump_op(t); + cerr << " from trans slot to free slot " << chan << "\n"; + ); + + slots[chan] = t; + slots[4] = NULL; + t->bc.slot = chan; + } + } + + reinit(); +} + +alu_group_node* alu_group_tracker::emit() { + + alu_group_node *g = sh.create_alu_group(); + + lt.init_group_literals(g); + + for (unsigned i = 0; i < max_slots; ++i) { + alu_node *n = slots[i]; + if (n) { + g->push_back(n); + } + } + return g; +} + +bool alu_group_tracker::try_reserve(alu_node* n) { + unsigned nsrc = n->bc.op_ptr->src_count; + unsigned slot = n->bc.slot; + bool trans = slot == 4; + + if (slots[slot]) + return false; + + unsigned flags = n->bc.op_ptr->flags; + + unsigned param = n->interp_param(); + + if (param && interp_param && interp_param != param) + return false; + + if ((flags & AF_KILL) && has_predset) + return false; + if ((flags & AF_ANY_PRED) && (has_kill || has_predset)) + return false; + if ((flags & AF_MOVA) && (has_mova || uses_ar)) + return false; + + if (n->uses_ar() && has_mova) + return false; + + for (unsigned i = 0; i < nsrc; ++i) { + + unsigned last_id = next_id; + + value *v = n->src[i]; + if (!v->is_any_gpr() && !v->is_rel()) + continue; + sel_chan vid = get_value_id(n->src[i]); + + if (vid > last_id && chan_count[vid.chan()] == 3) { + return false; + } + + n->bc.src[i].sel = vid.sel(); + n->bc.src[i].chan = vid.chan(); + } + + if (!lt.try_reserve(n)) + return 
false; + + if (!kc.try_reserve(n)) { + lt.unreserve(n); + return false; + } + + unsigned fbs = n->forced_bank_swizzle(); + + n->bc.bank_swizzle = 0; + + if (!trans & fbs) + n->bc.bank_swizzle = VEC_210; + + if (gpr.try_reserve(n)) { + assign_slot(slot, n); + return true; + } + + if (!fbs) { + unsigned swz_num = trans ? SCL_NUM : VEC_NUM; + for (unsigned bs = 0; bs < swz_num; ++bs) { + n->bc.bank_swizzle = bs; + if (gpr.try_reserve(n)) { + assign_slot(slot, n); + return true; + } + } + } + + gpr.reset(); + + slots[slot] = n; + unsigned forced_swz_slots = 0; + int first_slot = ~0, first_nf = ~0, last_slot = ~0; + unsigned save_bs[5]; + + for (unsigned i = 0; i < max_slots; ++i) { + alu_node *a = slots[i]; + if (a) { + if (first_slot == ~0) + first_slot = i; + last_slot = i; + save_bs[i] = a->bc.bank_swizzle; + if (a->forced_bank_swizzle()) { + assert(i != SLOT_TRANS); + forced_swz_slots |= (1 << i); + a->bc.bank_swizzle = VEC_210; + if (!gpr.try_reserve(a)) + assert("!internal reservation error"); + } else { + if (first_nf == ~0) + first_nf = i; + + a->bc.bank_swizzle = 0; + } + } + } + + if (first_nf == ~0) { + assign_slot(slot, n); + return true; + } + + assert(first_slot != ~0 && last_slot != ~0); + + int i = first_nf; + alu_node *a = slots[i]; + bool backtrack = false; + + while (1) { + + PSC_DUMP( + cerr << " bs: trying s" << i << " bs:" << a->bc.bank_swizzle + << " bt:" << backtrack << "\n"; + ); + + if (!backtrack && gpr.try_reserve(a)) { + PSC_DUMP( + cerr << " bs: reserved s" << i << " bs:" << a->bc.bank_swizzle + << "\n"; + ); + + while ((++i <= last_slot) && !slots[i]); + if (i <= last_slot) + a = slots[i]; + else + break; + } else { + bool itrans = i == SLOT_TRANS; + unsigned max_swz = itrans ? 
SCL_221 : VEC_210; + + if (a->bc.bank_swizzle < max_swz) { + ++a->bc.bank_swizzle; + + PSC_DUMP( + cerr << " bs: inc s" << i << " bs:" << a->bc.bank_swizzle + << "\n"; + ); + + } else { + + a->bc.bank_swizzle = 0; + while ((--i >= first_nf) && !slots[i]); + if (i < first_nf) + break; + a = slots[i]; + PSC_DUMP( + cerr << " bs: unreserve s" << i << " bs:" << a->bc.bank_swizzle + << "\n"; + ); + gpr.unreserve(a); + backtrack = true; + + continue; + } + } + backtrack = false; + } + + if (i == last_slot + 1) { + assign_slot(slot, n); + return true; + } + + // reservation failed, restore previous state + slots[slot] = NULL; + gpr.reset(); + for (unsigned i = 0; i < max_slots; ++i) { + alu_node *a = slots[i]; + if (a) { + a->bc.bank_swizzle = save_bs[i]; + bool b = gpr.try_reserve(a); + assert(b); + } + } + + kc.unreserve(n); + lt.unreserve(n); + return false; +} + +bool alu_group_tracker::try_reserve(alu_packed_node* p) { + bool need_unreserve = false; + node_iterator I(p->begin()), E(p->end()); + + for (; I != E; ++I) { + alu_node *n = static_cast<alu_node*>(*I); + if (!try_reserve(n)) + break; + else + need_unreserve = true; + } + + if (I == E) { + packed_ops.push_back(p); + return true; + } + + if (need_unreserve) { + while (--I != E) { + alu_node *n = static_cast<alu_node*>(*I); + slots[n->bc.slot] = NULL; + } + reinit(); + } + return false; +} + +void alu_group_tracker::reinit() { + alu_node * s[5]; + memcpy(s, slots, sizeof(slots)); + + reset(true); + + for (int i = max_slots - 1; i >= 0; --i) { + if (s[i] && !try_reserve(s[i])) { + cerr << "alu_group_tracker: reinit error on slot " << i << "\n"; + for (unsigned i = 0; i < max_slots; ++i) { + cerr << " slot " << i << " : "; + if (s[i]) + dump::dump_op(s[i]); + + cerr << "\n"; + } + assert(!"alu_group_tracker: reinit error"); + } + } +} + +void alu_group_tracker::reset(bool keep_packed) { + kc.reset(); + gpr.reset(); + lt.reset(); + memset(slots, 0, sizeof(slots)); + vmap.clear(); + next_id = 0; + has_mova = false; 
+ uses_ar = false; + has_predset = false; + has_kill = false; + updates_exec_mask = false; + available_slots = sh.get_ctx().has_trans ? 0b11111 : 0b01111; + interp_param = 0; + + chan_count[0] = 0; + chan_count[1] = 0; + chan_count[2] = 0; + chan_count[3] = 0; + + if (!keep_packed) + packed_ops.clear(); +} + +void alu_group_tracker::update_flags(alu_node* n) { + unsigned flags = n->bc.op_ptr->flags; + has_kill |= (flags & AF_KILL); + has_mova |= (flags & AF_MOVA); + has_predset |= (flags & AF_ANY_PRED); + uses_ar |= n->uses_ar(); + + if (flags & AF_ANY_PRED) { + if (n->dst[2] != NULL) + updates_exec_mask = true; + } +} + +int post_scheduler::run() { + run_on(sh.root); + return 0; +} + +void post_scheduler::run_on(container_node* n) { + + for (node_riterator I = n->rbegin(), E = n->rend(); I != E; ++I) { + if (I->is_container()) { + if (I->subtype == NST_BB) { + bb_node* bb = static_cast<bb_node*>(*I); + schedule_bb(bb); + } else { + run_on(static_cast<container_node*>(*I)); + } + } + } +} + +void post_scheduler::init_uc_val(container_node *c, value *v) { + node *d = v->any_def(); + if (d && d->parent == c) + ++ucm[d]; +} + +void post_scheduler::init_uc_vec(container_node *c, vvec &vv, bool src) { + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value *v = *I; + if (!v || v->is_readonly()) + continue; + + if (v->is_rel()) { + init_uc_val(c, v->rel); + init_uc_vec(c, v->muse, true); + } if (src) { + init_uc_val(c, v); + } + } +} + +unsigned post_scheduler::init_ucm(container_node *c, node *n) { + init_uc_vec(c, n->src, true); + init_uc_vec(c, n->dst, false); + + uc_map::iterator F = ucm.find(n); + return F == ucm.end() ? 
0 : F->second; +} + +void post_scheduler::schedule_bb(bb_node* bb) { + PSC_DUMP( + cerr << "scheduling BB " << bb->id << "\n"; + if (!pending.empty()) + dump::dump_op_list(&pending); + ); + + assert(pending.empty()); + assert(bb_pending.empty()); + assert(ready.empty()); + + bb_pending.append_from(bb); + cur_bb = bb; + + node *n; + + while ((n = bb_pending.back())) { + + PSC_DUMP( + cerr << "post_sched_bb "; + dump::dump_op(n); + cerr << "\n"; + ); + + if (n->subtype == NST_ALU_CLAUSE) { + n->remove(); + process_alu(static_cast<container_node*>(n)); + continue; + } + + n->remove(); + bb->push_front(n); + } + + this->cur_bb = NULL; +} + +void post_scheduler::init_regmap() { + + regmap.clear(); + + PSC_DUMP( + cerr << "init_regmap: live: "; + dump::dump_set(sh, live); + cerr << "\n"; + ); + + for (val_set::iterator I = live.begin(sh), E = live.end(sh); I != E; ++I) { + value *v = *I; + assert(v); + if (!v->is_sgpr() || !v->is_prealloc()) + continue; + + sel_chan r = v->gpr; + + PSC_DUMP( + cerr << "init_regmap: " << r << " <= "; + dump::dump_val(v); + cerr << "\n"; + ); + + assert(r); + regmap[r] = v; + } +} + +void post_scheduler::process_alu(container_node *c) { + + ucm.clear(); + alu.reset(); + + live = c->live_after; + + init_globals(c->live_after, true); + init_globals(c->live_before, true); + + init_regmap(); + + update_local_interferences(); + + for (node_riterator N, I = c->rbegin(), E = c->rend(); I != E; I = N) { + N = I; + ++N; + + node *n = *I; + unsigned uc = init_ucm(c, n); + + PSC_DUMP( + cerr << "process_alu uc=" << uc << " "; + dump::dump_op(n); + cerr << " "; + ); + + if (uc) { + n->remove(); + pending.push_back(n); + PSC_DUMP( cerr << "pending\n"; ); + } else { + release_op(n); + } + } + + schedule_alu(c); +} + +void post_scheduler::update_local_interferences() { + + PSC_DUMP( + cerr << "update_local_interferences : "; + dump::dump_set(sh, live); + cerr << "\n"; + ); + + + for (val_set::iterator I = live.begin(sh), E = live.end(sh); I != E; ++I) { 
+ value *v = *I; + if (v->is_prealloc()) + continue; + + v->interferences.add_set(live); + } +} + +void post_scheduler::update_live_src_vec(vvec &vv, val_set &born, bool src) { + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value *v = *I; + + if (!v) + continue; + + if (src && v->is_any_gpr()) { + if (live.add_val(v)) { + if (!v->is_prealloc()) { + if (!cleared_interf.contains(v)) { + PSC_DUMP( + cerr << "clearing interferences for " << *v << "\n"; + ); + v->interferences.clear(); + cleared_interf.add_val(v); + } + } + born.add_val(v); + } + } else if (v->is_rel()) { + if (!v->rel->is_any_gpr()) + live.add_val(v->rel); + update_live_src_vec(v->muse, born, true); + } + } +} + +void post_scheduler::update_live_dst_vec(vvec &vv) { + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value *v = *I; + if (!v) + continue; + + if (v->is_rel()) { + update_live_dst_vec(v->mdef); + } else if (v->is_any_gpr()) { + if (!live.remove_val(v)) { + PSC_DUMP( + cerr << "failed to remove "; + dump::dump_val(v); + cerr << " from live : "; + dump::dump_set(sh, live); + cerr << "\n"; + ); + } + } + } +} + +void post_scheduler::update_live(node *n, val_set &born) { + update_live_dst_vec(n->dst); + update_live_src_vec(n->src, born, true); + update_live_src_vec(n->dst, born, false); +} + +void post_scheduler::process_group() { + alu_group_tracker &rt = alu.grp(); + + val_set vals_born; + + recolor_locals(); + + PSC_DUMP( + cerr << "process_group: live_before : "; + dump::dump_set(sh, live); + cerr << "\n"; + ); + + for (unsigned s = 0; s < ctx.num_slots; ++s) { + alu_node *n = rt.slot(s); + if (!n) + continue; + + update_live(n, vals_born); + } + + PSC_DUMP( + cerr << "process_group: live_after : "; + dump::dump_set(sh, live); + cerr << "\n"; + ); + + update_local_interferences(); + + for (unsigned i = 0; i < 5; ++i) { + node *n = rt.slot(i); + if (n && !n->is_mova()) { + release_src_values(n); + } + } +} + +void post_scheduler::init_globals(val_set 
&s, bool prealloc) { + + PSC_DUMP( + cerr << "init_globals: "; + dump::dump_set(sh, s); + cerr << "\n"; + ); + + for (val_set::iterator I = s.begin(sh), E = s.end(sh); I != E; ++I) { + value *v = *I; + if (v->is_sgpr() && !v->is_global()) { + v->set_global(); + + if (prealloc && v->is_fixed()) { + v->set_prealloc(); + } + } + } +} + +void post_scheduler::emit_clause() { + + if (alu.current_ar) { + emit_load_ar(); + process_group(); + alu.emit_group(); + } + + alu.emit_clause(cur_bb); +} + +void post_scheduler::schedule_alu(container_node *c) { + + assert(!ready.empty() || !ready_copies.empty()); + + while (1) { + + prev_regmap = regmap; + + if (!prepare_alu_group()) { + if (alu.current_ar) { + emit_load_ar(); + continue; + } else + break; + } + + if (!alu.check_clause_limits()) { + regmap = prev_regmap; + emit_clause(); + init_globals(live, false); + continue; + } + + process_group(); + alu.emit_group(); + }; + + if (!alu.is_empty()) { + emit_clause(); + } + + if (!ready.empty()) { + cerr << "##post_scheduler: unscheduled ready instructions :"; + dump::dump_op_list(&ready); + assert(!"unscheduled ready instructions"); + } + + if (!pending.empty()) { + cerr << "##post_scheduler: unscheduled pending instructions :"; + dump::dump_op_list(&pending); + assert(!"unscheduled pending instructions"); + } +} + +void post_scheduler::add_interferences(value *v, sb_bitset &rb, val_set &vs) { + unsigned chan = v->gpr.chan(); + + for (val_set::iterator I = vs.begin(sh), E = vs.end(sh); + I != E; ++I) { + value *vi = *I; + sel_chan gpr = vi->get_final_gpr(); + + if (vi->is_any_gpr() && gpr && vi != v && + (!v->chunk || v->chunk != vi->chunk) && + vi->is_fixed() && gpr.chan() == chan) { + + unsigned r = gpr.sel(); + + PSC_DUMP( + cerr << "\tadd_interferences: " << *vi << "\n"; + ); + + if (rb.size() <= r) + rb.resize(r + 32); + rb.set(r); + } + } +} + +void post_scheduler::set_color_local_val(value *v, sel_chan color) { + v->gpr = color; + + PSC_DUMP( + cerr << " recolored: "; + 
dump::dump_val(v); + cerr << "\n"; + ); +} + +void post_scheduler::set_color_local(value *v, sel_chan color) { + if (v->chunk) { + vvec &vv = v->chunk->values; + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value *v2 =*I; + set_color_local_val(v2, color); + } + v->chunk->fix(); + } else { + set_color_local_val(v, color); + v->fix(); + } +} + +bool post_scheduler::recolor_local(value *v) { + + sb_bitset rb; + + assert(v->is_sgpr()); + assert(!v->is_prealloc()); + assert(v->gpr); + + unsigned chan = v->gpr.chan(); + + PSC_DUMP( + cerr << "recolor_local: "; + dump::dump_val(v); + cerr << " interferences: "; + dump::dump_set(sh, v->interferences); + cerr << "\n"; + if (v->chunk) { + cerr << " in chunk: "; + coalescer::dump_chunk(v->chunk); + cerr << "\n"; + } + ); + + if (v->chunk) { + for (vvec::iterator I = v->chunk->values.begin(), + E = v->chunk->values.end(); I != E; ++I) { + value *v2 = *I; + + PSC_DUMP( cerr << " add_interferences for " << *v2 << " :\n"; ); + + add_interferences(v, rb, v2->interferences); + } + } else { + add_interferences(v, rb, v->interferences); + } + + PSC_DUMP( + unsigned sz = rb.size(); + cerr << "registers bits: " << sz; + for (unsigned r = 0; r < sz; ++r) { + if ((r & 7) == 0) + cerr << "\n " << r << " "; + cerr << (rb.get(r) ? 1 : 0); + } + ); + + bool no_temp_gprs = v->is_global(); + unsigned rs, re, pass = no_temp_gprs ? 
1 : 0; + + while (pass < 2) { + + if (pass == 0) { + rs = sh.first_temp_gpr(); + re = MAX_GPR; + } else { + rs = 0; + re = sh.num_nontemp_gpr(); + } + + for (unsigned reg = rs; reg < re; ++reg) { + if (reg >= rb.size() || !rb.get(reg)) { + // color found + set_color_local(v, sel_chan(reg, chan)); + return true; + } + } + ++pass; + } + + assert(!"recolor_local failed"); + return true; +} + +void post_scheduler::emit_load_ar() { + + regmap = prev_regmap; + alu.discard_current_group(); + + alu_group_tracker &rt = alu.grp(); + alu_node *a = alu.create_ar_load(); + + if (!rt.try_reserve(a)) { + cerr << "can't emit AR load : "; + dump::dump_op(a); + cerr << "\n"; + } + + alu.current_ar = 0; +} + +bool post_scheduler::unmap_dst_val(value *d) { + + if (d == alu.current_ar) { + emit_load_ar(); + return false; + } + + if (d->is_prealloc()) { + sel_chan gpr = d->get_final_gpr(); + rv_map::iterator F = regmap.find(gpr); + value *c = NULL; + if (F != regmap.end()) + c = F->second; + + if (c && c!=d && (!c->chunk || c->chunk != d->chunk)) { + PSC_DUMP( + cerr << "dst value conflict : "; + dump::dump_val(d); + cerr << " regmap contains "; + dump::dump_val(c); + cerr << "\n"; + ); + assert(!"scheduler error"); + return false; + } else if (c) { + regmap.erase(F); + } + } + return true; +} + +bool post_scheduler::unmap_dst(alu_node *n) { + value *d = n->dst.empty() ? 
NULL : n->dst[0]; + + if (!d) + return true; + + if (!d->is_rel()) { + if (d && d->is_any_reg()) { + + if (d->is_AR()) { + if (alu.current_ar != d) { + cerr << "loading wrong ar value\n"; + assert(0); + } else { + alu.current_ar = NULL; + } + + } else if (d->is_any_gpr()) { + if (!unmap_dst_val(d)) + return false; + } + } + } else { + for (vvec::iterator I = d->mdef.begin(), E = d->mdef.end(); + I != E; ++I) { + d = *I; + if (!d) + continue; + + assert(d->is_any_gpr()); + + if (!unmap_dst_val(d)) + return false; + } + } + return true; +} + +bool post_scheduler::map_src_val(value *v) { + + if (!v->is_prealloc()) + return true; + + sel_chan gpr = v->get_final_gpr(); + rv_map::iterator F = regmap.find(gpr); + value *c = NULL; + if (F != regmap.end()) { + c = F->second; + if (!v->v_equal(c)) { + PSC_DUMP( + cerr << "can't map src value "; + dump::dump_val(v); + cerr << ", regmap contains "; + dump::dump_val(c); + cerr << "\n"; + ); + return false; + } + } else { + regmap.insert(std::make_pair(gpr, v)); + } + return true; +} + +bool post_scheduler::map_src_vec(vvec &vv, bool src) { + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value *v = *I; + if (!v) + continue; + + if ((!v->is_any_gpr() || !v->is_fixed()) && !v->is_rel()) + continue; + + if (v->is_rel()) { + value *rel = v->rel; + assert(rel); + + if (!rel->is_const()) { + if (!map_src_vec(v->muse, true)) + return false; + + if (rel != alu.current_ar) { + if (alu.current_ar) { + PSC_DUMP( + cerr << " current_AR is " << *alu.current_ar + << " trying to use " << *rel << "\n"; + ); + return false; + } + + alu.current_ar = rel; + + PSC_DUMP( + cerr << " new current_AR assigned: " << *alu.current_ar + << "\n"; + ); + } + } + + } else if (src) { + if (!map_src_val(v)) { + return false; + } + } + } + return true; +} + +bool post_scheduler::map_src(alu_node *n) { + if (!map_src_vec(n->dst, false)) + return false; + + if (!map_src_vec(n->src, true)) + return false; + + return true; +} + +void 
post_scheduler::dump_regmap() { + + cerr << "# REGMAP :\n"; + + for(rv_map::iterator I = regmap.begin(), E = regmap.end(); I != E; ++I) { + cerr << " # " << I->first << " => " << *(I->second) << "\n"; + } + + if (alu.current_ar) + cerr << " current_AR: " << *alu.current_ar << "\n"; + if (alu.current_pr) + cerr << " current_PR: " << *alu.current_pr << "\n"; +} + +void post_scheduler::recolor_locals() { + alu_group_tracker &rt = alu.grp(); + + for (unsigned s = 0; s < ctx.num_slots; ++s) { + alu_node *n = rt.slot(s); + if (n) { + value *d = n->dst[0]; + if (d && d->is_sgpr() && !d->is_prealloc()) { + recolor_local(d); + } + } + } +} + +// returns true if there are interferences +bool post_scheduler::check_interferences() { + + alu_group_tracker &rt = alu.grp(); + + unsigned interf_slots; + + bool discarded = false; + + PSC_DUMP( + cerr << "check_interferences: before: \n"; + dump_regmap(); + ); + + do { + + interf_slots = 0; + + for (unsigned s = 0; s < ctx.num_slots; ++s) { + alu_node *n = rt.slot(s); + if (n) { + if (!unmap_dst(n)) { + return true; + } + } + } + + for (unsigned s = 0; s < ctx.num_slots; ++s) { + alu_node *n = rt.slot(s); + if (n) { + if (!map_src(n)) { + interf_slots |= (1 << s); + } + } + } + + PSC_DUMP( + for (unsigned i = 0; i < 5; ++i) { + if (interf_slots & (1 << i)) { + cerr << "!!!!!! 
interf slot: " << i << " : "; + dump::dump_op(rt.slot(i)); + cerr << "\n"; + } + } + ); + + if (!interf_slots) + break; + + PSC_DUMP( cerr << "ci: discarding slots " << interf_slots << "\n"; ); + + rt.discard_slots(interf_slots, alu.conflict_nodes); + regmap = prev_regmap; + discarded = true; + + } while(1); + + PSC_DUMP( + cerr << "check_interferences: after: \n"; + dump_regmap(); + ); + + return discarded; +} + +// add instruction(s) (alu_node or contents of alu_packed_node) to current group +// returns the number of added instructions on success +unsigned post_scheduler::try_add_instruction(node *n) { + + alu_group_tracker &rt = alu.grp(); + + unsigned avail_slots = rt.avail_slots(); + + if (n->is_alu_packed()) { + alu_packed_node *p = static_cast<alu_packed_node*>(n); + unsigned slots = p->get_slot_mask(); + unsigned cnt = __builtin_popcount(slots); + + if ((slots & avail_slots) != slots) { + PSC_DUMP( cerr << " no slots \n"; ); + return 0; + } + + p->update_packed_items(ctx); + + if (!rt.try_reserve(p)) { + PSC_DUMP( cerr << " reservation failed \n"; ); + return 0; + } + + p->remove(); + return cnt; + + } else { + alu_node *a = static_cast<alu_node*>(n); + value *d = a->dst.empty() ? NULL : a->dst[0]; + + if (d && d->is_special_reg()) { + assert(a->bc.op_ptr->flags & AF_MOVA); + d = NULL; + } + + unsigned allowed_slots = ctx.alu_slots_mask(a->bc.op_ptr); + unsigned slot; + + allowed_slots &= avail_slots; + + if (!allowed_slots) + return 0; + + if (d) { + slot = d->get_final_chan(); + a->bc.dst_chan = slot; + allowed_slots &= (1 << slot) | 0b10000; + } else { + if (a->bc.op_ptr->flags & AF_MOVA) { + if (a->bc.slot_flags & AF_V) + allowed_slots &= (1 << SLOT_X); + else + allowed_slots &= (1 << SLOT_TRANS); + } + } + + // FIXME workaround for some problems with MULADD in trans slot on r700, + // (is it really needed on r600?) 
+ if (a->bc.op == ALU_OP3_MULADD && !ctx.is_egcm()) { + allowed_slots &= 0b01111; + } + + if (!allowed_slots) { + PSC_DUMP( cerr << " no suitable slots\n"; ); + return 0; + } + + slot = __builtin_ctz(allowed_slots); + a->bc.slot = slot; + + PSC_DUMP( cerr << "slot: " << slot << "\n"; ); + + if (!rt.try_reserve(a)) { + PSC_DUMP( cerr << " reservation failed\n"; ); + return 0; + } + + a->remove(); + return 1; + } +} + +bool post_scheduler::check_copy(node *n) { + if (!n->is_copy_mov()) + return false; + + value *s = n->src[0]; + value *d = n->dst[0]; + + if (!s->is_sgpr() || !d->is_sgpr()) + return false; + + if (!s->is_prealloc()) { + recolor_local(s); + } + + if (s->gpr == d->gpr) { + + PSC_DUMP( + cerr << "check_copy: "; + dump::dump_op(n); + cerr << "\n"; + ); + + rv_map::iterator F = regmap.find(d->gpr); + bool gpr_free = (F == regmap.end()); + + if (d->is_prealloc()) { + if (gpr_free) { + PSC_DUMP( cerr << " copy not ready...\n";); + return true; + } + + value *rv = F->second; + if (rv != d && (!rv->chunk || rv->chunk != d->chunk)) { + PSC_DUMP( cerr << " copy not ready(2)...\n";); + return true; + } + + unmap_dst(static_cast<alu_node*>(n)); + } + + if (s->is_prealloc() && !map_src_val(s)) + return true; + + live.remove_val(d); + live.add_val(s); + + release_src_values(n); + n->remove(); + PSC_DUMP( cerr << " copy coalesced...\n";); + return true; + } + return false; +} + +void post_scheduler::dump_group(alu_group_tracker &rt) { + for (unsigned i = 0; i < 5; ++i) { + node *n = rt.slot(i); + if (n) { + cerr << "slot " << i << " : "; + dump::dump_op(n); + cerr << "\n"; + } + } +} + +void post_scheduler::process_ready_copies() { + + node *last; + + do { + last = ready_copies.back(); + + for (node_iterator N, I = ready_copies.begin(), E = ready_copies.end(); + I != E; I = N) { + N = I; ++N; + + node *n = *I; + + if (!check_copy(n)) { + n->remove(); + ready.push_back(n); + } + } + } while (last != ready_copies.back()); + + update_local_interferences(); +} + + +bool 
post_scheduler::prepare_alu_group() { + + alu_group_tracker &rt = alu.grp(); + + unsigned i1 = 0; + + PSC_DUMP( + cerr << "prepare_alu_group: starting...\n"; + dump_group(rt); + ); + + ready.append_from(&alu.conflict_nodes); + + // FIXME rework this loop + + do { + + process_ready_copies(); + + ++i1; + + for (node_iterator N, I = ready.begin(), E = ready.end(); I != E; + I = N) { + N = I; ++N; + node *n = *I; + + PSC_DUMP( + cerr << "p_a_g: "; + dump::dump_op(n); + cerr << "\n"; + ); + + + unsigned cnt = try_add_instruction(n); + + if (!cnt) + continue; + + PSC_DUMP( + cerr << "current group:\n"; + dump_group(rt); + ); + + if (rt.inst_count() == ctx.num_slots) { + PSC_DUMP( cerr << " all slots used\n"; ); + break; + } + } + + if (!check_interferences()) + break; + + // don't try to add more instructions to the group with mova if this + // can lead to breaking clause slot count limit - we don't want mova to + // end up in the end of the new clause instead of beginning of the + // current clause. 
+ if (rt.has_ar_load() && alu.total_slots() > 121) + break; + + if (rt.inst_count() && i1 > 50) + break; + + regmap = prev_regmap; + + } while (1); + + PSC_DUMP( + cerr << " prepare_alu_group done, " << rt.inst_count() + << " slot(s) \n"; + + cerr << "$$$$$$$$PAG i1=" << i1 + << " ready " << ready.count() + << " pending " << pending.count() + << " conflicting " << alu.conflict_nodes.count() + <<"\n"; + + ); + + return rt.inst_count(); +} + +void post_scheduler::release_src_values(node* n) { + release_src_vec(n->src, true); + release_src_vec(n->dst, false); +} + +void post_scheduler::release_op(node *n) { + PSC_DUMP( + cerr << "release_op "; + dump::dump_op(n); + cerr << "\n"; + ); + + n->remove(); + + if (n->is_copy_mov()) { + ready_copies.push_back(n); + } else if (n->is_mova() || n->is_pred_set()) { + ready.push_front(n); + } else { + ready.push_back(n); + } +} + +void post_scheduler::release_src_val(value *v) { + node *d = v->any_def(); + if (d) { + if (!--ucm[d]) + release_op(d); + } +} + +void post_scheduler::release_src_vec(vvec& vv, bool src) { + + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value *v = *I; + if (!v || v->is_readonly()) + continue; + + if (v->is_rel()) { + release_src_val(v->rel); + release_src_vec(v->muse, true); + + } else if (src) { + release_src_val(v); + } + } +} + +void literal_tracker::reset() { + memset(lt, 0, sizeof(lt)); + memset(uc, 0, sizeof(uc)); +} + +void rp_gpr_tracker::reset() { + memset(rp, 0, sizeof(rp)); + memset(uc, 0, sizeof(uc)); +} + +void rp_kcache_tracker::reset() { + memset(rp, 0, sizeof(rp)); + memset(uc, 0, sizeof(uc)); +} + +void alu_kcache_tracker::reset() { + memset(kc, 0, sizeof(kc)); + lines.clear(); +} + +void alu_clause_tracker::reset() { + group = 0; + slot_count = 0; + grp0.reset(); + grp1.reset(); +} + +alu_clause_tracker::alu_clause_tracker(shader &sh) + : sh(sh), kt(sh.get_ctx().hw_class), slot_count(), + grp0(sh), grp1(sh), + group(), clause(), + push_exec_mask(), + 
current_ar(), current_pr() {} + +void alu_clause_tracker::emit_group() { + + assert(grp().inst_count()); + + alu_group_node *g = grp().emit(); + + if (grp().has_update_exec_mask()) { + assert(!push_exec_mask); + push_exec_mask = true; + } + + assert(g); + + if (!clause) { + clause = sh.create_clause(NST_ALU_CLAUSE); + } + + clause->push_front(g); + + slot_count += grp().slot_count(); + + new_group(); + + PSC_DUMP( cerr << " #### group emitted\n"; ); +} + +void alu_clause_tracker::emit_clause(container_node *c) { + assert(clause); + + kt.init_clause(clause->bc); + + assert(!current_ar); + assert(!current_pr); + + if (push_exec_mask) + clause->bc.set_op(CF_OP_ALU_PUSH_BEFORE); + + c->push_front(clause); + + clause = NULL; + push_exec_mask = false; + slot_count = 0; + kt.reset(); + + PSC_DUMP( cerr << "######### ALU clause emitted\n"; ); +} + +bool alu_clause_tracker::check_clause_limits() { + + alu_group_tracker &gt = grp(); + + unsigned slots = gt.slot_count(); + + // reserving slots to load AR and PR values + unsigned reserve_slots = (current_ar ? 1 : 0) + (current_pr ? 
1 : 0); + + if (slot_count + slots > MAX_ALU_SLOTS - reserve_slots) + return false; + + if (!kt.try_reserve(gt)) + return false; + + return true; +} + +void alu_clause_tracker::new_group() { + group = !group; + grp().reset(); +} + +bool alu_clause_tracker::is_empty() { + return clause == NULL; +} + +void literal_tracker::init_group_literals(alu_group_node* g) { + + g->literals.clear(); + for (unsigned i = 0; i < 4; ++i) { + if (!lt[i]) + break; + + g->literals.push_back(lt[i]); + + PSC_DUMP( + cerr << "literal emitted: " << lt[i].f + << " 0x" << std::hex << lt[i].u + << std::dec << " " << lt[i].i << "\n"; + ); + } +} + +bool alu_kcache_tracker::try_reserve(alu_group_tracker& gt) { + rp_kcache_tracker &kt = gt.kcache(); + + if (!kt.num_sels()) + return true; + + sb_set<unsigned> group_lines; + + unsigned nl = kt.get_lines(group_lines); + assert(nl); + + sb_set<unsigned> clause_lines(lines); + lines.add_set(group_lines); + + if (clause_lines.size() == lines.size()) + return true; + + if (update_kc()) + return true; + + lines = clause_lines; + + return false; +} + +unsigned rp_kcache_tracker::get_lines(kc_lines& lines) { + unsigned cnt = 0; + + for (unsigned i = 0; i < sel_count; ++i) { + unsigned line = rp[i]; + + if (!line) + return cnt; + + --line; + line = (sel_count == 2) ? 
line >> 5 : line >> 6; + + if (lines.insert(line).second) + ++cnt; + } + return cnt; +} + +bool alu_kcache_tracker::update_kc() { + unsigned c = 0; + + bc_kcache old_kc[4]; + memcpy(old_kc, kc, sizeof(kc)); + + for (kc_lines::iterator I = lines.begin(), E = lines.end(); I != E; ++I) { + unsigned line = *I; + unsigned bank = line >> 8; + + line &= 0xFF; + + if (c && (bank == kc[c-1].bank) && (kc[c-1].addr + 1 == line)) + ++kc[c-1].mode; + else { + if (c == max_kcs) { + memcpy(kc, old_kc, sizeof(kc)); + return false; + } + + kc[c].mode = KC_LOCK_1; + + kc[c].bank = bank; + kc[c].addr = line; + ++c; + } + } + return true; +} + +alu_node* alu_clause_tracker::create_ar_load() { + alu_node *a = sh.create_alu(); + + // FIXME use MOVA_GPR on R6xx + + if (sh.get_ctx().uses_mova_gpr) { + a->bc.set_op(ALU_OP1_MOVA_GPR_INT); + a->bc.slot = SLOT_TRANS; + } else { + a->bc.set_op(ALU_OP1_MOVA_INT); + a->bc.slot = SLOT_X; + } + + a->dst.resize(1); + a->src.push_back(current_ar); + + PSC_DUMP( + cerr << "created AR load: "; + dump::dump_op(a); + cerr << "\n"; + ); + + return a; +} + +void alu_clause_tracker::discard_current_group() { + PSC_DUMP( cerr << "act::discard_current_group\n"; ); + grp().discard_all_slots(conflict_nodes); +} + +void rp_gpr_tracker::dump() { + cerr << "=== gpr_tracker dump:\n"; + for (int c = 0; c < 3; ++c) { + cerr << "cycle " << c << " "; + for (int h = 0; h < 4; ++h) { + cerr << rp[c][h] << ":" << uc[c][h] << " "; + } + cerr << "\n"; + } +} + +} // namespace r600_sb diff --git a/src/gallium/drivers/r600/sb/sb_sched.h b/src/gallium/drivers/r600/sb/sb_sched.h new file mode 100644 index 00000000000..e74046c9c5a --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_sched.h @@ -0,0 +1,324 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including 
without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Vadim Girlin + */ + +#ifndef SB_SCHED_H_ +#define SB_SCHED_H_ + +namespace r600_sb { + +typedef sb_map<node*, unsigned> uc_map; + +// resource trackers for scheduler + +typedef sb_set<unsigned> kc_lines; + +class rp_kcache_tracker { + unsigned rp[4]; + unsigned uc[4]; + const unsigned sel_count; + + unsigned kc_sel(sel_chan r) { + return sel_count == 4 ? 
(unsigned)r : ((r - 1) >> 1) + 1; + } + +public: + rp_kcache_tracker(shader &sh); + + bool try_reserve(node *n); + void unreserve(node *n); + + + bool try_reserve(sel_chan r); + void unreserve(sel_chan r); + + void reset(); + + unsigned num_sels() { return !!rp[0] + !!rp[1] + !!rp[2] + !!rp[3]; } + + unsigned get_lines(kc_lines &lines); +}; + +class literal_tracker { + literal lt[4]; + unsigned uc[4]; +public: + literal_tracker() : lt(), uc() {} + + bool try_reserve(alu_node *n); + void unreserve(alu_node *n); + + bool try_reserve(literal l); + void unreserve(literal l); + + void reset(); + + unsigned count() { return !!uc[0] + !!uc[1] + !!uc[2] + !!uc[3]; } + + void init_group_literals(alu_group_node *g); + +}; + +class rp_gpr_tracker { + // rp[cycle][elem] + unsigned rp[3][4]; + unsigned uc[3][4]; + +public: + rp_gpr_tracker() : rp(), uc() {} + + bool try_reserve(alu_node *n); + void unreserve(alu_node *n); + + bool try_reserve(unsigned cycle, unsigned sel, unsigned chan); + void unreserve(unsigned cycle, unsigned sel, unsigned chan); + + void reset(); + + void dump(); +}; + +class alu_group_tracker { + + shader &sh; + + rp_kcache_tracker kc; + rp_gpr_tracker gpr; + literal_tracker lt; + + alu_node * slots[5]; + + unsigned available_slots; + + unsigned max_slots; + + typedef std::map<value*, unsigned> value_index_map; + + value_index_map vmap; + + bool has_mova; + bool uses_ar; + bool has_predset; + bool has_kill; + bool updates_exec_mask; + + unsigned chan_count[4]; + + // param index + 1 (0 means that group doesn't refer to Params) + // we can't use more than one param index in a group + unsigned interp_param; + + unsigned next_id; + + node_vec packed_ops; + + void assign_slot(unsigned slot, alu_node *n); + +public: + alu_group_tracker(shader &sh); + + // FIXME use fast bs correctness check (values for same chan <= 3) ?? 
+ bool try_reserve(alu_node *n); + bool try_reserve(alu_packed_node *p); + + void reinit(); + void reset(bool keep_packed = false); + + sel_chan get_value_id(value *v); + void update_flags(alu_node *n); + + alu_node* slot(unsigned i) { return slots[i]; } + + unsigned used_slots() { + return (~available_slots) & ((1 << max_slots) - 1); + } + + unsigned inst_count() { + return __builtin_popcount(used_slots()); + } + + unsigned literal_count() { return lt.count(); } + unsigned literal_slot_count() { return (literal_count() + 1) >> 1; }; + unsigned slot_count() { return inst_count() + literal_slot_count(); } + + alu_group_node* emit(); + + rp_kcache_tracker& kcache() { return kc; } + + bool has_update_exec_mask() { return updates_exec_mask; } + unsigned avail_slots() { return available_slots; } + + void discard_all_slots(container_node &removed_nodes); + void discard_slots(unsigned slot_mask, container_node &removed_nodes); + + bool has_ar_load() { return has_mova; } +}; + +class alu_kcache_tracker { + bc_kcache kc[4]; + sb_set<unsigned> lines; + unsigned max_kcs; + +public: + + alu_kcache_tracker(sb_hw_class hc) + : kc(), lines(), max_kcs(hc >= HW_CLASS_EVERGREEN ? 4 : 2) {} + + void reset(); + bool try_reserve(alu_group_tracker &gt); + bool update_kc(); + void init_clause(bc_cf &bc) { + memcpy(bc.kc, kc, sizeof(kc)); + } +}; + +class alu_clause_tracker { + shader &sh; + + alu_kcache_tracker kt; + unsigned slot_count; + + alu_group_tracker grp0; + alu_group_tracker grp1; + + unsigned group; + + cf_node *clause; + + bool push_exec_mask; + +public: + container_node conflict_nodes; + + // current values of AR and PR registers that we have to preload + // till the end of clause (in fact, beginning, because we're scheduling + // bottom-up) + value *current_ar; + value *current_pr; + + alu_clause_tracker(shader &sh); + + void reset(); + + // current group + alu_group_tracker& grp() { return group ? 
grp1 : grp0; } + // previous group + alu_group_tracker& prev_grp() { return group ? grp0 : grp1; } + + void emit_group(); + void emit_clause(container_node *c); + bool check_clause_limits(); + void new_group(); + bool is_empty(); + + alu_node* create_ar_load(); + + void discard_current_group(); + + unsigned total_slots() { return slot_count; } +}; + +class post_scheduler : public pass { + + container_node ready, ready_copies; // alu only + container_node pending, bb_pending; + bb_node *cur_bb; + val_set live; // values live at the end of the alu clause + uc_map ucm; + alu_clause_tracker alu; + + typedef std::map<sel_chan, value*> rv_map; + rv_map regmap, prev_regmap; + + val_set cleared_interf; + +public: + + post_scheduler(shader &sh) : pass(sh), + ready(), ready_copies(), pending(), cur_bb(), + live(), ucm(), alu(sh), regmap(), cleared_interf() {} + + virtual int run(); + void run_on(container_node *n); + void schedule_bb(bb_node *bb); + + void process_alu(container_node *c); + void schedule_alu(container_node *c); + bool prepare_alu_group(); + + void release_op(node *n); + + void release_src_values(node *n); + void release_src_vec(vvec &vv, bool src); + void release_src_val(value *v); + + void init_uc_val(container_node *c, value *v); + void init_uc_vec(container_node *c, vvec &vv, bool src); + unsigned init_ucm(container_node *c, node *n); + + void init_regmap(); + + bool check_interferences(); + + unsigned try_add_instruction(node *n); + + bool check_copy(node *n); + void dump_group(alu_group_tracker &rt); + + bool unmap_dst(alu_node *n); + bool unmap_dst_val(value *d); + + bool map_src(alu_node *n); + bool map_src_vec(vvec &vv, bool src); + bool map_src_val(value *v); + + bool recolor_local(value *v); + + void update_local_interferences(); + void update_live_src_vec(vvec &vv, val_set &born, bool src); + void update_live_dst_vec(vvec &vv); + void update_live(node *n, val_set &born); + void process_group(); + + void set_color_local_val(value *v, sel_chan 
color); + void set_color_local(value *v, sel_chan color); + + void add_interferences(value *v, sb_bitset &rb, val_set &vs); + + void init_globals(val_set &s, bool prealloc); + + void recolor_locals(); + + void dump_regmap(); + + void emit_load_ar(); + void emit_clause(); + + void process_ready_copies(); +}; + +} // namespace r600_sb + +#endif /* SB_SCHED_H_ */ diff --git a/src/gallium/drivers/r600/sb/sb_shader.cpp b/src/gallium/drivers/r600/sb/sb_shader.cpp new file mode 100644 index 00000000000..91f7c5dba5e --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_shader.cpp @@ -0,0 +1,660 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Vadim Girlin + */ + +#include "sb_bc.h" +#include "sb_shader.h" + +#include "sb_pass.h" + +namespace r600_sb { + +using std::cerr; + +shader::shader(sb_context &sctx, shader_target t, unsigned id, bool dump) +: ctx(sctx), next_temp_value_index(temp_regid_offset), + prep_regs_count(), pred_sels(), + regions(), inputs(), undef(), val_pool(sizeof(value)), + pool(), all_nodes(), errors(), enable_dump(dump), + optimized(), id(id), + coal(*this), bbs(), + target(t), vt(ex), ex(*this), root(), + compute_interferences(), + has_alu_predication(), uses_gradients(), ngpr(), nstack() {} + +bool shader::assign_slot(alu_node* n, alu_node *slots[5]) { + + unsigned slot_flags = ctx.alu_slots(n->bc.op); + unsigned slot = n->bc.dst_chan; + + if (!ctx.is_cayman() && (!(slot_flags & AF_V) || slots[slot]) && + (slot_flags & AF_S)) + slot = SLOT_TRANS; + + if (slots[slot]) + return false; + + n->bc.slot = slot; + slots[slot] = n; + return true; +} + +void shader::add_gpr_values(vvec& vec, unsigned gpr, unsigned comp_mask, + bool src) { + unsigned chan = 0; + while (comp_mask) { + if (comp_mask & 1) { + value *v = get_gpr_value(src, gpr, chan, false); + v->flags |= (VLF_PIN_REG | VLF_PIN_CHAN); + if (!v->is_rel()) { + v->gpr = v->pin_gpr = v->select; + v->fix(); + } + vec.push_back(v); + } + comp_mask >>= 1; + ++chan; + } +} + +cf_node* shader::create_clause(node_subtype nst) { + cf_node *n = create_cf(); + + n->subtype = nst; + + switch (nst) { + case NST_ALU_CLAUSE: n->bc.set_op(CF_OP_ALU); break; + case NST_TEX_CLAUSE: n->bc.set_op(CF_OP_TEX); break; + case NST_VTX_CLAUSE: n->bc.set_op(CF_OP_VTX); break; + default: assert(!"invalid clause type"); break; + } + + n->bc.barrier = 1; + return n; +} + +void shader::create_bbs() { + create_bbs(root, bbs); +} + +void shader::expand_bbs() { + expand_bbs(bbs); +} + +alu_node* shader::create_mov(value* dst, value* src) { + alu_node *n = create_alu(); + n->bc.set_op(ALU_OP1_MOV); + n->dst.push_back(dst); + 
n->src.push_back(src); + dst->def = n; + + return n; +} + +alu_node* shader::create_copy_mov(value* dst, value* src, unsigned affcost) { + alu_node *n = create_mov(dst, src); + + dst->assign_source(src); + n->flags |= NF_COPY_MOV | NF_DONT_HOIST; + + if (affcost && dst->is_sgpr() && src->is_sgpr()) + coal.add_edge(src, dst, affcost); + + return n; +} + +value* shader::get_value(value_kind kind, sel_chan id, + unsigned version) { + if (version == 0 && kind == VLK_REG && id.sel() < prep_regs_count) + return val_pool[id - 1]; + + + unsigned key = (kind << 28) | (version << 16) | id; + value_map::iterator i = reg_values.find(key); + if (i != reg_values.end()) { + return i->second; + } + value *v = create_value(kind, id, version); + reg_values.insert(std::make_pair(key, v)); + return v; +} + +value* shader::get_special_value(unsigned sv_id, unsigned version) { + sel_chan id(sv_id, 0); + return get_value(VLK_SPECIAL_REG, id, version); +} + +void shader::fill_array_values(gpr_array *a, vvec &vv) { + unsigned sz = a->array_size; + vv.resize(sz); + for (unsigned i = 0; i < a->array_size; ++i) { + vv[i] = get_gpr_value(true, a->base_gpr.sel() + i, a->base_gpr.chan(), + false); + } +} + +value* shader::get_gpr_value(bool src, unsigned reg, unsigned chan, bool rel, + unsigned version) { + sel_chan id(reg, chan); + value *v; + gpr_array *a = get_gpr_array(reg, chan); + if (rel) { + assert(a); + v = create_value(VLK_REL_REG, id, 0); + v->rel = get_special_value(SV_AR_INDEX); + fill_array_values(a, v->muse); + if (!src) + fill_array_values(a, v->mdef); + } else { + if (version == 0 && reg < prep_regs_count) + return (val_pool[id - 1]); + + v = get_value(VLK_REG, id, version); + } + + v->array = a; + v->pin_gpr = v->select; + + return v; +} + +value* shader::create_temp_value() { + sel_chan id(++next_temp_value_index, 0); + return get_value(VLK_TEMP, id, 0); +} + +value* shader::get_kcache_value(unsigned bank, unsigned index, unsigned chan) { + return get_ro_value(kcache_values, 
VLK_KCACHE, + sel_chan((bank << 12) | index, chan)); +} + +void shader::add_input(unsigned gpr, bool preloaded, unsigned comp_mask) { + if (inputs.size() <= gpr) + inputs.resize(gpr+1); + + shader_input &i = inputs[gpr]; + i.preloaded = preloaded; + i.comp_mask = comp_mask; + + if (preloaded) { + add_gpr_values(root->dst, gpr, comp_mask, true); + } + +} + +void shader::init() { + assert(!root); + root = create_container(); +} + +void shader::init_call_fs(cf_node* cf) { + unsigned gpr = 0; + + assert(target == TARGET_VS); + + for(inputs_vec::const_iterator I = inputs.begin(), + E = inputs.end(); I != E; ++I, ++gpr) { + if (!I->preloaded) + add_gpr_values(cf->dst, gpr, I->comp_mask, false); + else + add_gpr_values(cf->src, gpr, I->comp_mask, true); + } +} + +void shader::set_undef(val_set& s) { + value *undefined = get_undef_value(); + if (!undefined->gvn_source) + vt.add_value(undefined); + + val_set &vs = s; + + for (val_set::iterator I = vs.begin(*this), E = vs.end(*this); I != E; ++I) { + value *v = *I; + + assert(!v->is_readonly() && !v->is_rel()); + + v->gvn_source = undefined->gvn_source; + } +} +/* +void shader::transfer_pins(vvec& vv, vvec &sv) { + vvec::iterator SI = sv.begin(); + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I, ++SI) { + value *v = *I; + value *sv = *SI; + + v->pin_gpr = sv->pin_gpr; + + if (sv->is_chan_pinned()) { + v->flags |= VLF_PIN_CHAN; + sv->flags &= ~VLF_PIN_CHAN; + } + if (sv->is_reg_pinned()) { + v->flags |= VLF_PIN_REG; + sv->flags &= ~VLF_PIN_REG; + } + } +} +*/ +value* shader::create_value(value_kind k, sel_chan regid, unsigned ver) { + value *v = val_pool.create(k, regid, ver); + return v; +} + +value* shader::get_undef_value() { + if (!undef) + undef = create_value(VLK_UNDEF, 0, 0); + return undef; +} + +node* shader::create_node(node_type nt, node_subtype nst, node_flags flags) { + node *n = new (pool.allocate(sizeof(node))) node(nt, nst, flags); + all_nodes.push_back(n); + return n; +} + +alu_node* 
shader::create_alu() { + alu_node* n = new (pool.allocate(sizeof(alu_node))) alu_node(); + all_nodes.push_back(n); + return n; +} + +alu_group_node* shader::create_alu_group() { + alu_group_node* n = + new (pool.allocate(sizeof(alu_group_node))) alu_group_node(); + all_nodes.push_back(n); + return n; +} + +alu_packed_node* shader::create_alu_packed() { + alu_packed_node* n = + new (pool.allocate(sizeof(alu_packed_node))) alu_packed_node(); + all_nodes.push_back(n); + return n; +} + +cf_node* shader::create_cf() { + cf_node* n = new (pool.allocate(sizeof(cf_node))) cf_node(); + n->bc.barrier = 1; + all_nodes.push_back(n); + return n; +} + +fetch_node* shader::create_fetch() { + fetch_node* n = new (pool.allocate(sizeof(fetch_node))) fetch_node(); + all_nodes.push_back(n); + return n; +} + +region_node* shader::create_region() { + region_node *n = new (pool.allocate(sizeof(region_node))) + region_node(regions.size()); + regions.push_back(n); + all_nodes.push_back(n); + return n; +} + +depart_node* shader::create_depart(region_node* target) { + depart_node* n = new (pool.allocate(sizeof(depart_node))) + depart_node(target, target->departs.size()); + target->departs.push_back(n); + all_nodes.push_back(n); + return n; +} + +repeat_node* shader::create_repeat(region_node* target) { + repeat_node* n = new (pool.allocate(sizeof(repeat_node))) + repeat_node(target, target->repeats.size() + 1); + target->repeats.push_back(n); + all_nodes.push_back(n); + return n; +} + +container_node* shader::create_container(node_type nt, node_subtype nst, + node_flags flags) { + container_node *n = new (pool.allocate(sizeof(container_node))) + container_node(nt, nst, flags); + all_nodes.push_back(n); + return n; +} + +if_node* shader::create_if() { + if_node* n = new (pool.allocate(sizeof(if_node))) if_node(); + all_nodes.push_back(n); + return n; +} + +bb_node* shader::create_bb(unsigned id, unsigned loop_level) { + bb_node* n = new (pool.allocate(sizeof(bb_node))) bb_node(id, 
loop_level); + all_nodes.push_back(n); + return n; +} +/* +void shader::prepare_regs(unsigned cnt) { + assert(!prep_regs_count); + + for (unsigned i = 0; i < cnt*4; ++i) { + value *v = create_value(VLK_REG, i + 1, 0); + assert (v->uid == i + 1); + } + + prep_regs_count = cnt; +} +*/ +value* shader::get_special_ro_value(unsigned sel) { + return get_ro_value(special_ro_values, VLK_PARAM, sel); +} + +value* shader::get_const_value(const literal &v) { + value *val = get_ro_value(const_values, VLK_CONST, v); + val->literal_value = v; + return val; +} + +shader::~shader() { + for (node_vec::iterator I = all_nodes.begin(), E = all_nodes.end(); + I != E; ++I) + (*I)->~node(); +} + +void shader::dump_ir() { + if (ctx.dump_pass) + dump(*this).run(); +} + +value* shader::get_value_version(value* v, unsigned ver) { + assert(!v->is_readonly() && !v->is_rel()); + value *vv = get_value(v->kind, v->select, ver); + assert(vv); + + if (v->array) { + vv->array = v->array; + } + + return vv; +} + +gpr_array* shader::get_gpr_array(unsigned reg, unsigned chan) { + + for (regarray_vec::iterator I = gpr_arrays.begin(), + E = gpr_arrays.end(); I != E; ++I) { + gpr_array* a = *I; + unsigned achan = a->base_gpr.chan(); + unsigned areg = a->base_gpr.sel(); + if (achan == chan && (reg >= areg && reg < areg+a->array_size)) + return a; + } + return NULL; +} + +void shader::add_gpr_array(unsigned gpr_start, unsigned gpr_count, + unsigned comp_mask) { + unsigned chan = 0; + while (comp_mask) { + if (comp_mask & 1) { + gpr_array *a = new gpr_array( + sel_chan(gpr_start, chan), gpr_count); + + SB_DUMP_PASS( cerr << "add_gpr_array: @" << a->base_gpr + << " [" << a->array_size << "]\n"; + ); + + gpr_arrays.push_back(a); + } + comp_mask >>= 1; + ++chan; + } +} + +value* shader::get_pred_sel(int sel) { + assert(sel == 0 || sel == 1); + if (!pred_sels[sel]) + pred_sels[sel] = get_const_value(sel); + + return pred_sels[sel]; +} + +cf_node* shader::create_cf(unsigned op) { + cf_node *c = create_cf(); + 
c->bc.set_op(op); + c->bc.barrier = 1; + return c; +} + +std::string shader::get_full_target_name() { + std::string s = get_shader_target_name(); + s += "/"; + s += get_hw_chip_name(); + s += "/"; + s += get_hw_class_name(); + return s; +} + +const char* shader::get_hw_class_name() { + switch (ctx.hw_class) { +#define TRANSLATE_HW_CLASS(c) case HW_CLASS_##c: return #c + TRANSLATE_HW_CLASS(R600); + TRANSLATE_HW_CLASS(R700); + TRANSLATE_HW_CLASS(EVERGREEN); + TRANSLATE_HW_CLASS(CAYMAN); +#undef TRANSLATE_HW_CLASS + default: + return "INVALID_CHIP_CLASS"; + } +} + +const char* shader::get_hw_chip_name() { + switch (ctx.hw_chip) { +#define TRANSLATE_CHIP(c) case HW_CHIP_##c: return #c + TRANSLATE_CHIP(R600); + TRANSLATE_CHIP(RV610); + TRANSLATE_CHIP(RV630); + TRANSLATE_CHIP(RV670); + TRANSLATE_CHIP(RV620); + TRANSLATE_CHIP(RV635); + TRANSLATE_CHIP(RS780); + TRANSLATE_CHIP(RS880); + TRANSLATE_CHIP(RV770); + TRANSLATE_CHIP(RV730); + TRANSLATE_CHIP(RV710); + TRANSLATE_CHIP(RV740); + TRANSLATE_CHIP(CEDAR); + TRANSLATE_CHIP(REDWOOD); + TRANSLATE_CHIP(JUNIPER); + TRANSLATE_CHIP(CYPRESS); + TRANSLATE_CHIP(HEMLOCK); + TRANSLATE_CHIP(PALM); + TRANSLATE_CHIP(SUMO); + TRANSLATE_CHIP(SUMO2); + TRANSLATE_CHIP(BARTS); + TRANSLATE_CHIP(TURKS); + TRANSLATE_CHIP(CAICOS); + TRANSLATE_CHIP(CAYMAN); +#undef TRANSLATE_CHIP + + default: + assert(!"unknown chip"); + return "INVALID_CHIP"; + } +} + +const char* shader::get_shader_target_name() { + switch (target) { + case TARGET_VS: return "VS"; + case TARGET_PS: return "PS"; + case TARGET_GS: return "GS"; + case TARGET_COMPUTE: return "COMPUTE"; + case TARGET_FETCH: return "FETCH"; + default: + return "INVALID_TARGET"; + } +} + +void shader::simplify_dep_rep(node* dr) { + container_node *p = dr->parent; + if (p->is_repeat()) { + repeat_node *r = static_cast<repeat_node*>(p); + r->target->expand_repeat(r); + } else if (p->is_depart()) { + depart_node *d = static_cast<depart_node*>(p); + d->target->expand_depart(d); + } + if (dr->next) + 
dr->parent->cut(dr->next, NULL); +} + + +// FIXME this is used in some places as the max non-temp gpr, +// (MAX_GPR - 2 * ctx.alu_temp_gprs) should be used for that instead. +unsigned shader::first_temp_gpr() { + return MAX_GPR - ctx.alu_temp_gprs; +} + +unsigned shader::num_nontemp_gpr() { + return MAX_GPR - 2 * ctx.alu_temp_gprs; +} + +void shader::set_uses_kill() { + if (root->src.empty()) + root->src.resize(1); + + if (!root->src[0]) + root->src[0] = get_special_value(SV_VALID_MASK); +} + +alu_node* shader::clone(alu_node* n) { + alu_node *c = create_alu(); + + // FIXME: this may be wrong with indirect operands + c->src = n->src; + c->dst = n->dst; + + c->bc = n->bc; + c->pred = n->pred; + + return c; +} + +value* shader::get_ro_value(value_map& vm, value_kind vk, unsigned key) { + value_map::iterator I = vm.find(key); + if (I != vm.end()) + return I->second; + value *v = create_value(vk, key, 0); + v->flags = VLF_READONLY; + vm.insert(std::make_pair(key, v)); + return v; +} + +void shader::create_bbs(container_node* n, bbs_vec &bbs, int loop_level) { + + bool inside_bb = false; + bool last_inside_bb = true; + node_iterator bb_start(n->begin()), I(bb_start), E(n->end()); + + for (; I != E; ++I) { + node *k = *I; + inside_bb = k->type == NT_OP; + + if (inside_bb && !last_inside_bb) + bb_start = I; + else if (!inside_bb) { + if (last_inside_bb + && I->type != NT_REPEAT + && I->type != NT_DEPART + && I->type != NT_IF) { + bb_node *bb = create_bb(bbs.size(), loop_level); + bbs.push_back(bb); + n->insert_node_before(*bb_start, bb); + if (bb_start != I) + bb->move(bb_start, I); + } + + if (k->is_container()) { + + bool loop = false; + if (k->type == NT_REGION) { + loop = static_cast<region_node*>(k)->is_loop(); + } + + create_bbs(static_cast<container_node*>(k), bbs, + loop_level + loop); + } + } + + if (k->type == NT_DEPART) + return; + + last_inside_bb = inside_bb; + } + + if (last_inside_bb) { + bb_node *bb = create_bb(bbs.size(), loop_level); + bbs.push_back(bb); 
+ if (n->empty()) + n->push_back(bb); + else { + n->insert_node_before(*bb_start, bb); + if (bb_start != n->end()) + bb->move(bb_start, n->end()); + } + } else { + if (n->last && n->last->type == NT_IF) { + bb_node *bb = create_bb(bbs.size(), loop_level); + bbs.push_back(bb); + n->push_back(bb); + } + } +} + +void shader::expand_bbs(bbs_vec &bbs) { + + for (bbs_vec::iterator I = bbs.begin(), E = bbs.end(); I != E; ++I) { + bb_node *b = *I; + b->expand(); + } +} + +sched_queue_id shader::get_queue_id(node* n) { + switch (n->subtype) { + case NST_ALU_INST: + case NST_ALU_PACKED_INST: + case NST_COPY: + case NST_PSI: + return SQ_ALU; + case NST_FETCH_INST: { + fetch_node *f = static_cast<fetch_node*>(n); + if (ctx.is_r600() && (f->bc.op_ptr->flags & FF_VTX)) + return SQ_VTX; + return SQ_TEX; + } + case NST_CF_INST: + return SQ_CF; + default: + assert(0); + return SQ_NUM; + } +} + +} // namespace r600_sb diff --git a/src/gallium/drivers/r600/sb/sb_shader.h b/src/gallium/drivers/r600/sb/sb_shader.h new file mode 100644 index 00000000000..5531fdaa092 --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_shader.h @@ -0,0 +1,406 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Vadim Girlin + */ + +#ifndef SB_SHADER_H_ +#define SB_SHADER_H_ + +#include <list> +#include <string> +#include <map> + +#include "sb_ir.h" +#include "sb_expr.h" + +namespace r600_sb { + +struct shader_input { + unsigned comp_mask; + unsigned preloaded; +}; + +struct error_info { + node *n; + unsigned arg_index; + std::string message; +}; + +typedef std::multimap<node*, error_info> error_map; + +class sb_context; + +typedef std::vector<shader_input> inputs_vec; +typedef std::vector<gpr_array*> gpr_array_vec; + +struct ra_edge { + value *a, *b; + unsigned cost; + + ra_edge(value *a, value *b, unsigned cost) : a(a), b(b), cost(cost) {} +}; + +enum chunk_flags { + RCF_GLOBAL = (1 << 0), + RCF_PIN_CHAN = (1 << 1), + RCF_PIN_REG = (1 << 2), + + RCF_FIXED = (1 << 3), + + RCF_PREALLOC = (1 << 4) +}; + +inline chunk_flags operator |(chunk_flags l, chunk_flags r) { + return (chunk_flags)((unsigned)l|(unsigned)r); +} +inline chunk_flags& operator |=(chunk_flags &l, chunk_flags r) { + l = l | r; + return l; +} + +inline chunk_flags& operator &=(chunk_flags &l, chunk_flags r) { + l = (chunk_flags)((unsigned)l & (unsigned)r); + return l; +} + +inline chunk_flags operator ~(chunk_flags r) { + return (chunk_flags)~(unsigned)r; +} + +struct ra_chunk { + vvec values; + chunk_flags flags; + unsigned cost; + sel_chan pin; + + ra_chunk() : values(), flags(), cost(), pin() {} + + bool is_fixed() { return flags & RCF_FIXED; } + void fix() { flags |= RCF_FIXED; } + + bool 
is_global() { return flags & RCF_GLOBAL; } + void set_global() { flags |= RCF_GLOBAL; } + + bool is_reg_pinned() { return flags & RCF_PIN_REG; } + bool is_chan_pinned() { return flags & RCF_PIN_CHAN; } + + bool is_prealloc() { return flags & RCF_PREALLOC; } + void set_prealloc() { flags |= RCF_PREALLOC; } +}; + +typedef std::vector<ra_chunk*> chunk_vector; + +class ra_constraint { +public: + ra_constraint(constraint_kind kind) : kind(kind) {} + + constraint_kind kind; + vvec values; + unsigned cost; + + void update_values(); + bool check(); +}; + +typedef std::vector<ra_constraint*> constraint_vec; +typedef std::vector<ra_chunk*> chunk_vec; + +// priority queue +// FIXME use something more suitable or custom class ? + +template <class T> +struct cost_compare { + bool operator ()(const T& t1, const T& t2) { + return t1->cost > t2->cost; + } +}; + +template <class T, class Comp> +class queue { + typedef std::vector<T> container; + container cont; + +public: + queue() : cont() {} + + typedef typename container::iterator iterator; + + iterator begin() { return cont.begin(); } + iterator end() { return cont.end(); } + + iterator insert(const T& t) { + iterator I = std::upper_bound(begin(), end(), t, Comp()); + if (I == end()) + cont.push_back(t); + else + cont.insert(I, t); + + return I; + } + + void erase(const T& t) { + std::pair<iterator, iterator> R = + std::equal_range(begin(), end(), t, Comp()); + iterator F = std::find(R.first, R.second, t); + if (F != R.second) + cont.erase(F); + } +}; + +typedef queue<ra_chunk*, cost_compare<ra_chunk*> > chunk_queue; +typedef queue<ra_edge*, cost_compare<ra_edge*> > edge_queue; +typedef queue<ra_constraint*, cost_compare<ra_constraint*> > constraint_queue; + +typedef std::set<ra_chunk*> chunk_set; + +class shader; + +class coalescer { + + shader &sh; + + edge_queue edges; + chunk_queue chunks; + constraint_queue constraints; + + constraint_vec all_constraints; + chunk_vec all_chunks; + +public: + + coalescer(shader &sh) : 
// Central per-shader object of the sb backend: owns the value and node pools,
// the value lookup maps, and the factory methods used by the passes to create
// IR nodes and values.
class shader {

	sb_context &ctx;

	// Lookup tables from a 32-bit key to the value object.
	typedef sb_map<uint32_t, value*> value_map;
	value_map reg_values;

	// read-only values
	value_map const_values; // immediate constants key -const value (uint32_t)
	value_map special_ro_values; // key - hw alu_sel & chan
	value_map kcache_values;

	gpr_array_vec gpr_arrays;

	unsigned next_temp_value_index;

	unsigned prep_regs_count;

	// Two entries; see get_pred_sel().
	value* pred_sels[2];

	regions_vec regions;
	inputs_vec inputs;

	value *undef;

	// val_pool is indexed by (uid - 1), see get_value_by_uid().
	sb_value_pool val_pool;
	sb_pool pool;

	std::vector<node*> all_nodes;

public:
	error_map errors;

	bool enable_dump;
	bool optimized;

	unsigned id;

	coalescer coal;

	// Base of the select range used for temp values: the value printer
	// subtracts this offset from the select of a VLK_TEMP value.
	static const unsigned temp_regid_offset = 512;

	bbs_vec bbs;

	const shader_target target;

	value_table vt;
	expr_handler ex;

	// NOTE(review): presumably the top-level container of the IR tree - confirm
	// against the implementation.
	container_node *root;

	bool compute_interferences;

	bool has_alu_predication;
	bool uses_gradients;

	unsigned ngpr, nstack;

	shader(sb_context &sctx, shader_target t, unsigned id, bool dump);

	~shader();

	sb_context &get_ctx() const { return ctx; }

	// Value accessors. Definitions are not visible here; judging by the
	// private helpers below they look values up by kind/id/version
	// (TODO confirm whether missing values are created on demand).
	value* get_const_value(const literal & v);
	value* get_special_value(unsigned sv_id, unsigned version = 0);
	value* create_temp_value();
	value* get_gpr_value(bool src, unsigned reg, unsigned chan, bool rel,
	                     unsigned version = 0);


	value* get_special_ro_value(unsigned sel);
	value* get_kcache_value(unsigned bank, unsigned index, unsigned chan);

	// Used by the SSA renaming pass to obtain version 'ver' of value 'v'.
	value* get_value_version(value* v, unsigned ver);

	void init();
	void add_gpr_values(vvec& vec, unsigned gpr, unsigned comp_mask, bool src);

	void dump_ir();

	void add_gpr_array(unsigned gpr_start, unsigned gpr_count,
	                   unsigned comp_mask);

	// The SSA renaming pass calls this with (bc.pred_sel - PRED_SEL_0).
	value* get_pred_sel(int sel);
	bool assign_slot(alu_node *n, alu_node *slots[5]);

	gpr_array* get_gpr_array(unsigned reg, unsigned chan);

	void add_input(unsigned gpr, bool preloaded = false,
	               unsigned comp_mask = 0xF);

	// Read-only access to the inputs vector.
	const inputs_vec & get_inputs() {return inputs; }

	// Mutable access to the regions vector.
	regions_vec & get_regions() { return regions; }

	void init_call_fs(cf_node *cf);

	value *get_undef_value();
	void set_undef(val_set &s);

	void transfer_pins(vvec& vv, vvec &sv);

	// Node factories.
	node* create_node(node_type nt, node_subtype nst,
	                  node_flags flags = NF_EMPTY);
	alu_node* create_alu();
	alu_group_node* create_alu_group();
	alu_packed_node* create_alu_packed();
	cf_node* create_cf();
	cf_node* create_cf(unsigned op);
	fetch_node* create_fetch();
	region_node* create_region();
	depart_node* create_depart(region_node *target);
	repeat_node* create_repeat(region_node *target);
	container_node* create_container(node_type nt = NT_LIST,
	                                 node_subtype nst = NST_LIST,
	                                 node_flags flags = NF_EMPTY);
	if_node* create_if();
	bb_node* create_bb(unsigned id, unsigned loop_level);

	void prepare_regs(unsigned cnt);

	// uids are 1-based: uid N maps to element N - 1 of the value pool.
	value* get_value_by_uid(unsigned id) { return val_pool[id - 1]; }

	cf_node* create_clause(node_subtype nst);

	void create_bbs();
	void expand_bbs();

	alu_node* create_mov(value* dst, value* src);
	alu_node* create_copy_mov(value *dst, value *src, unsigned affcost = 1);

	const char * get_hw_class_name();
	const char * get_hw_chip_name();
	const char * get_shader_target_name();

	std::string get_full_target_name();

	void create_bbs(container_node* n, bbs_vec &bbs, int loop_level = 0);
	void expand_bbs(bbs_vec &bbs);

	sched_queue_id get_queue_id(node* n);

	void simplify_dep_rep(node *dr);

	unsigned first_temp_gpr();
	unsigned num_nontemp_gpr();

	// Mutable access to the GPR arrays vector.
	gpr_array_vec& arrays() { return gpr_arrays; }

	void set_uses_kill();

	void fill_array_values(gpr_array *a, vvec &vv);

	alu_node* clone(alu_node *n);

	// Direct access to the value pool (used e.g. by value set iterators).
	sb_value_pool& get_value_pool() { return val_pool; }

private:
	value* create_value(value_kind k, sel_chan regid, unsigned ver);
	value* get_value(value_kind kind, sel_chan id,
	                 unsigned version = 0);
	value* get_ro_value(value_map &vm, value_kind vk, unsigned key);
};
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Vadim Girlin + */ + +#include <stack> +#include <map> + +#include "sb_shader.h" +#include "sb_pass.h" + +namespace r600_sb { + +container_node* ssa_prepare::create_phi_nodes(int count) { + container_node *p = sh.create_container(); + val_set &vars = cur_set(); + node *nn; + + for (val_set::iterator I = vars.begin(sh), E = vars.end(sh); I != E; ++I) { + nn = sh.create_node(NT_OP, NST_PHI); + nn->dst.assign(1, *I); + nn->src.assign(count, *I); + p->push_back(nn); + } + return p; +} + +void ssa_prepare::add_defs(node &n) { + val_set &s = cur_set(); + for (vvec::iterator I = n.dst.begin(), E = n.dst.end(); I != E; ++I) { + value *v = *I; + if (!v) + continue; + + if (v->is_rel()) { + s.add_vec(v->mdef); + } else + s.add_val(v); + } +} + +bool ssa_prepare::visit(cf_node& n, bool enter) { + if (enter) { + push_stk(); + } else { + add_defs(n); + pop_stk(); + } + return true; +} + +bool ssa_prepare::visit(alu_node& n, bool enter) { + if (enter) { + } else { + add_defs(n); + } + return true; +} + +bool ssa_prepare::visit(fetch_node& n, bool enter) { + if (enter) { + } else { + add_defs(n); + } + return true; +} + +bool ssa_prepare::visit(region_node& n, bool enter) { + if (enter) { + + push_stk(); + } else { + cur_set().add_set(n.vars_defined); + if (n.dep_count() > 0) + n.phi = create_phi_nodes(n.dep_count()); + if (n.rep_count() > 1) { + n.loop_phi = create_phi_nodes(n.rep_count()); + n.loop_phi->subtype = NST_LOOP_PHI_CONTAINER; + } + n.vars_defined.clear(); + 
pop_stk(); + } + return true; +} + +bool ssa_prepare::visit(repeat_node& n, bool enter) { + if (enter) { + push_stk(); + } else { + assert(n.target); + n.target->vars_defined.add_set(cur_set()); + cur_set().clear(); + pop_stk(); + } + return true; +} + +bool ssa_prepare::visit(depart_node& n, bool enter) { + if (enter) { + push_stk(); + } else { + assert(n.target); + n.target->vars_defined.add_set(cur_set()); + cur_set().clear(); + pop_stk(); + } + return true; +} + +// =============================== + +int ssa_rename::init() { + rename_stack.push(def_map()); + return 0; +} + +bool ssa_rename::visit(alu_group_node& n, bool enter) { + // taking into account parallel execution of the alu group + if (enter) { + for (node_iterator I = n.begin(), E = n.end(); I != E; ++I) { + I->accept(*this, true); + } + } else { + for (node_iterator I = n.begin(), E = n.end(); I != E; ++I) { + I->accept(*this, false); + } + } + return false; +} + +bool ssa_rename::visit(cf_node& n, bool enter) { + if (enter) { + rename_src(&n); + } else { + rename_dst(&n); + } + return true; +} + +bool ssa_rename::visit(alu_node& n, bool enter) { + if (enter) { + rename_src(&n); + } else { + + node *psi = NULL; + + if (n.pred && n.dst[0]) { + + value *d = n.dst[0]; + unsigned index = get_index(rename_stack.top(), d); + value *p = sh.get_value_version(d, index); + + psi = sh.create_node(NT_OP, NST_PSI); + + container_node *parent; + if (n.parent->subtype == NST_ALU_GROUP) + parent = n.parent; + else { + assert (n.parent->parent->subtype == NST_ALU_GROUP); + parent = n.parent->parent; + } + parent->insert_after(psi); + + assert(n.bc.pred_sel); + + psi->src.resize(6); + psi->src[2] = p; + psi->src[3] = n.pred; + psi->src[4] = sh.get_pred_sel(n.bc.pred_sel - PRED_SEL_0); + psi->src[5] = d; + psi->dst.push_back(d); + } + + rename_dst(&n); + + if (psi) { + rename_src(psi); + rename_dst(psi); + } + + if (!n.dst.empty() && n.dst[0]) { + // FIXME probably use separate pass for such things + if 
((n.bc.op_ptr->flags & AF_INTERP) || n.bc.op == ALU_OP2_CUBE) + n.dst[0]->flags |= VLF_PIN_CHAN; + } + } + return true; +} + +bool ssa_rename::visit(alu_packed_node& n, bool enter) { + if (enter) { + for (node_iterator I = n.begin(), E = n.end(); I != E; ++I) { + I->accept(*this, true); + } + } else { + for (node_iterator I = n.begin(), E = n.end(); I != E; ++I) { + I->accept(*this, false); + } + + n.init_args(); + } + return false; +} + +bool ssa_rename::visit(fetch_node& n, bool enter) { + if (enter) { + rename_src(&n); + rename_dst(&n); + } else { + } + return true; +} + +bool ssa_rename::visit(region_node& n, bool enter) { + if (enter) { + if (n.loop_phi) + rename_phi_args(n.loop_phi, 0, true); + } else { + if (n.phi) + rename_phi_args(n.phi, ~0u, true); + } + return true; +} + +bool ssa_rename::visit(repeat_node& n, bool enter) { + if (enter) { + push(n.target->loop_phi); + } else { + if (n.target->loop_phi) + rename_phi_args(n.target->loop_phi, n.rep_id, false); + pop(); + } + return true; +} + +bool ssa_rename::visit(depart_node& n, bool enter) { + if (enter) { + push(n.target->phi); + } else { + if (n.target->phi) + rename_phi_args(n.target->phi, n.dep_id, false); + pop(); + } + return true; +} + +bool ssa_rename::visit(if_node& n, bool enter) { + if (enter) { + } else { + n.cond = rename_use(&n, n.cond); + } + return true; +} + +void ssa_rename::push(node* phi) { + rename_stack.push(rename_stack.top()); +} + +void ssa_rename::pop() { + rename_stack.pop(); +} + +value* ssa_rename::rename_use(node *n, value* v) { + if (v->version) + return v; + + unsigned index = get_index(rename_stack.top(), v); + v = sh.get_value_version(v, index); + + // if (alu) instruction is predicated and source arg comes from psi node + // (that is, from another predicated instruction through its psi node), + // we can try to select the corresponding source value directly + if (n->pred && v->def && v->def->subtype == NST_PSI) { + assert(n->subtype == NST_ALU_INST); + alu_node *an = 
static_cast<alu_node*>(n); + node *pn = v->def; + // FIXME make it more generic ??? + if (pn->src.size() == 6) { + if (pn->src[3] == n->pred) { + value* ps = sh.get_pred_sel(an->bc.pred_sel - PRED_SEL_0); + if (pn->src[4] == ps) + return pn->src[5]; + else + return pn->src[2]; + } + } + } + return v; +} + +value* ssa_rename::rename_def(node *n, value* v) { + unsigned index = new_index(def_count, v); + set_index(rename_stack.top(), v, index); + value *r = sh.get_value_version(v, index); + return r; +} + +void ssa_rename::rename_src_vec(node *n, vvec &vv, bool src) { + for(vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value* &v = *I; + if (!v || v->is_readonly()) + continue; + + if (v->is_rel()) { + if (!v->rel->is_readonly()) + v->rel = rename_use(n, v->rel); + rename_src_vec(n, v->muse, true); + } else if (src) + v = rename_use(n, v); + } +} + +void ssa_rename::rename_src(node* n) { + if (n->pred) + n->pred = rename_use(n, n->pred); + + rename_src_vec(n, n->src, true); + rename_src_vec(n, n->dst, false); + +} + +void ssa_rename::rename_dst_vec(node *n, vvec &vv, bool set_def) { + + for(vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value* &v = *I; + if (!v) + continue; + + if (v->is_rel()) { + rename_dst_vec(n, v->mdef, false); + } else { + v = rename_def(n, v); + if (set_def) + v->def = n; + } + } +} + +void ssa_rename::rename_dst(node* n) { + rename_dst_vec(n, n->dst, true); +} + +unsigned ssa_rename::get_index(def_map& m, value* v) { + def_map::iterator I = m.find(v); + if (I != m.end()) + return I->second; + return 0; +} + +void ssa_rename::set_index(def_map& m, value* v, unsigned index) { + std::pair<def_map::iterator,bool> r = m.insert(std::make_pair(v, index)); + if (!r.second) + r.first->second = index; +} + +unsigned ssa_rename::new_index(def_map& m, value* v) { + unsigned index = 1; + def_map::iterator I = m.find(v); + if (I != m.end()) + index = ++I->second; + else + m.insert(std::make_pair(v, index)); + return index; +} + 
+bool ssa_rename::visit(node& n, bool enter) { + if (enter) { + assert(n.subtype == NST_PSI); + rename_src(&n); + rename_dst(&n); + } + return false; +} + +bool ssa_rename::visit(container_node& n, bool enter) { + if (enter) { + } else { + // should be root container node + assert(n.parent == NULL); + rename_src_vec(&n, n.src, true); + } + return true; +} + +void ssa_rename::rename_phi_args(container_node* phi, unsigned op, bool def) { + for (node_iterator I = phi->begin(), E = phi->end(); I != E; ++I) { + node *o = *I; + if (op != ~0u) + o->src[op] = rename_use(o, o->src[op]); + if (def) { + o->dst[0] = rename_def(o, o->dst[0]); + o->dst[0]->def = o; + } + } +} + +} // namespace r600_sb diff --git a/src/gallium/drivers/r600/sb/sb_valtable.cpp b/src/gallium/drivers/r600/sb/sb_valtable.cpp new file mode 100644 index 00000000000..2aaccd00dc5 --- /dev/null +++ b/src/gallium/drivers/r600/sb/sb_valtable.cpp @@ -0,0 +1,553 @@ +/* + * Copyright 2013 Vadim Girlin <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Vadim Girlin + */ + +#define VT_DEBUG 0 + +#if VT_DEBUG +#define VT_DUMP(q) do { q } while (0) +#else +#define VT_DUMP(q) +#endif + +#include <iostream> +#include <cstring> + +#include "sb_shader.h" +#include "sb_pass.h" + +namespace r600_sb { + +using std::cerr; + +static const char * chans = "xyzw01?_"; + +std::ostream& operator << (std::ostream &o, value &v) { + + bool dead = v.flags & VLF_DEAD; + + if (dead) + o << "{"; + + switch (v.kind) { + case VLK_SPECIAL_REG: { + switch (v.select.sel()) { + case SV_AR_INDEX: o << "AR"; break; + case SV_ALU_PRED: o << "PR"; break; + case SV_EXEC_MASK: o << "EM"; break; + case SV_VALID_MASK: o << "VM"; break; + default: o << "???specialreg"; break; + } + break; + } + + case VLK_REG: + o << "R" << v.select.sel() << "." + << chans[v.select.chan()]; + + break; + case VLK_KCACHE: { + o << "C" << v.select.sel() << "." << chans[v.select.chan()]; + } + break; + case VLK_CONST: + o << v.literal_value.f << "|" << std::hex + << std::showbase << v.literal_value.u + << std::dec << std::noshowbase; + break; + case VLK_PARAM: + o << "Param" << (v.select.sel() - ALU_SRC_PARAM_OFFSET) + << chans[v.select.chan()]; + break; + case VLK_TEMP: + o << "t" << v.select.sel() - shader::temp_regid_offset; + break; + case VLK_REL_REG: + + o << "A" << v.select; + o << "["; + o << *v.rel; + o << "]"; + + o << "_" << v.uid; + + break; + case VLK_UNDEF: + o << "undef"; + break; + default: + o << v.kind << "?????"; + break; + } + + if (v.version) + o << "." 
<< v.version; + + if (dead) + o << "}"; + + if (v.is_global()) + o << "||"; + if (v.is_fixed()) + o << "F"; + if (v.is_prealloc()) + o << "P"; + + sel_chan g; + + if (v.is_rel()) { + g = v.array->gpr; + } else { + g = v.gpr; + } + + if (g) { + o << "@R" << g.sel() << "." << chans[g.chan()]; + } + + return o; +} + +void value_table::add_value(value* v) { + + if (v->gvn_source) { + return; + } + + VT_DUMP( + cerr << "gvn add_value "; + dump::dump_val(v); + ); + + value_hash hash = v->hash(); + vt_item & vti = hashtable[hash & size_mask]; + vti.push_back(v); + ++cnt; + + if (v->def && ex.try_fold(v)) { + VT_DUMP( + cerr << " folded: "; + dump::dump_val(v->gvn_source); + cerr << "\n"; + ); + return; + } + + int n = 0; + for (vt_item::iterator I = vti.begin(), E = vti.end(); I != E; ++I, ++n) { + value *c = *I; + + if (c == v) + break; + + if (expr_equal(c, v)) { + v->gvn_source = c->gvn_source; + + VT_DUMP( + cerr << " found : equal to "; + dump::dump_val(v->gvn_source); + cerr << "\n"; + ); + return; + } + } + + v->gvn_source = v; + VT_DUMP( + cerr << " added new\n"; + ); +} + +value_hash value::hash() { + if (ghash) + return ghash; + if (is_rel()) + ghash = rel_hash(); + else if (def) + ghash = def->hash(); + else + ghash = ((uintptr_t)this) | 1; + + return ghash; +} + +value_hash value::rel_hash() { + value_hash h = rel ? 
rel->hash() : 0; + h |= select << 10; + h |= array->hash(); + return h; +} + +bool value_table::expr_equal(value* l, value* r) { + return ex.equal(l, r); +} + +void value_table::get_values(vvec& v) { + v.resize(cnt); + + vvec::iterator T = v.begin(); + + for(vt_table::iterator I = hashtable.begin(), E = hashtable.end(); + I != E; ++I) { + T = copy(I->begin(), I->end(), T); + } +} + +void value::add_use(node* n, use_kind kind, int arg) { + if (0) { + cerr << "add_use "; + dump::dump_val(this); + cerr << " => "; + dump::dump_op(n); + cerr << " kind " << kind << " arg " << arg << "\n"; + } + uses = new use_info(n, kind, arg, uses); +} + +unsigned value::use_count() { + use_info *u = uses; + unsigned c = 0; + while (u) { + ++c; + u = u->next; + } + return c; +} + +bool value::is_global() { + if (chunk) + return chunk->is_global(); + return flags & VLF_GLOBAL; +} + +void value::set_global() { + assert(is_sgpr()); + flags |= VLF_GLOBAL; + if (chunk) + chunk->set_global(); +} + +void value::set_prealloc() { + assert(is_sgpr()); + flags |= VLF_PREALLOC; + if (chunk) + chunk->set_prealloc(); +} + +bool value::is_fixed() { + if (array && array->gpr) + return true; + if (chunk) + return chunk->is_fixed(); + return flags & VLF_FIXED; +} + +void value::fix() { + if (chunk) + chunk->fix(); + flags |= VLF_FIXED; +} + +bool value::is_prealloc() { + if (chunk) + return chunk->is_prealloc(); + return flags & VLF_PREALLOC; +} + +void value::delete_uses() { + use_info *u, *c = uses; + while (c) { + u = c->next; + delete c; + c = u; + } + uses = NULL; +} + +void ra_constraint::update_values() { + for (vvec::iterator I = values.begin(), E = values.end(); I != E; ++I) { + assert(!(*I)->constraint); + (*I)->constraint = this; + } +} + +void* sb_pool::allocate(unsigned sz) { + sz = (sz + SB_POOL_ALIGN - 1) & ~(SB_POOL_ALIGN - 1); + assert (sz < (block_size >> 6) && "too big allocation size for sb_pool"); + + unsigned offset = total_size % block_size; + unsigned capacity = block_size * 
blocks.size(); + + if (total_size + sz > capacity) { + total_size = capacity; + void * nb = malloc(block_size); + blocks.push_back(nb); + offset = 0; + } + + total_size += sz; + return ((char*)blocks.back() + offset); +} + +void sb_pool::free_all() { + for (block_vector::iterator I = blocks.begin(), E = blocks.end(); I != E; + ++I) { + free(*I); + } +} + +value* sb_value_pool::create(value_kind k, sel_chan regid, + unsigned ver) { + void* np = allocate(aligned_elt_size); + value *v = new (np) value(size(), k, regid, ver); + return v; +} + +void sb_value_pool::delete_all() { + unsigned bcnt = blocks.size(); + unsigned toffset = 0; + for (unsigned b = 0; b < bcnt; ++b) { + char *bstart = (char*)blocks[b]; + for (unsigned offset = 0; offset < block_size; + offset += aligned_elt_size) { + ((value*)(bstart + offset))->~value(); + toffset += aligned_elt_size; + if (toffset >= total_size) + return; + } + } +} + +bool sb_bitset::get(unsigned id) { + assert(id < bit_size); + unsigned w = id / bt_bits; + unsigned b = id % bt_bits; + return (data[w] >> b) & 1; +} + +void sb_bitset::set(unsigned id, bool bit) { + assert(id < bit_size); + unsigned w = id / bt_bits; + unsigned b = id % bt_bits; + if (w >= data.size()) + data.resize(w + 1); + + if (bit) + data[w] |= (1 << b); + else + data[w] &= ~(1 << b); +} + +inline bool sb_bitset::set_chk(unsigned id, bool bit) { + assert(id < bit_size); + unsigned w = id / bt_bits; + unsigned b = id % bt_bits; + basetype d = data[w]; + basetype dn = (d & ~(1 << b)) | (bit << b); + bool r = (d != dn); + data[w] = r ? 
dn : data[w]; + return r; +} + +void sb_bitset::clear() { + memset(data.data(), 0, sizeof(basetype) * data.size()); +} + +void sb_bitset::resize(unsigned size) { + unsigned cur_data_size = data.size(); + unsigned new_data_size = (size + bt_bits - 1) / bt_bits; + + + if (new_data_size != cur_data_size) + data.resize(new_data_size); + + // make sure that new bits in the existing word are cleared + if (cur_data_size && size > bit_size && bit_size % bt_bits) { + basetype clear_mask = (~(basetype)0u) << (bit_size % bt_bits); + data[cur_data_size - 1] &= ~clear_mask; + } + + bit_size = size; +} + +unsigned sb_bitset::find_bit(unsigned start) { + assert(start < bit_size); + unsigned w = start / bt_bits; + unsigned b = start % bt_bits; + unsigned sz = data.size(); + + while (w < sz) { + basetype d = data[w] >> b; + if (d != 0) { + unsigned pos = __builtin_ctz(d) + b + w * bt_bits; + return pos; + } + + b = 0; + ++w; + } + + return bit_size; +} + +sb_value_set::iterator::iterator(shader& sh, sb_value_set* s, unsigned nb) + : vp(sh.get_value_pool()), s(s), nb(nb) {} + +bool sb_value_set::add_set_checked(sb_value_set& s2) { + if (bs.size() < s2.bs.size()) + bs.resize(s2.bs.size()); + sb_bitset nbs = bs | s2.bs; + if (bs != nbs) { + bs.swap(nbs); + return true; + } + return false; +} + +void r600_sb::sb_value_set::remove_set(sb_value_set& s2) { + bs.mask(s2.bs); +} + +bool sb_value_set::add_val(value* v) { + assert(v); + if (bs.size() < v->uid) + bs.resize(v->uid + 32); + + return bs.set_chk(v->uid - 1, 1); +} + +bool sb_value_set::remove_vec(vvec& vv) { + bool modified = false; + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + if (*I) + modified |= remove_val(*I); + } + return modified; +} + +void sb_value_set::clear() { + bs.clear(); +} + +bool sb_value_set::remove_val(value* v) { + assert(v); + if (bs.size() < v->uid) + return false; + return bs.set_chk(v->uid - 1, 0); +} + +bool r600_sb::sb_value_set::add_vec(vvec& vv) { + bool modified = false; + for 
(vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value *v = *I; + if (v) + modified |= add_val(v); + } + return modified; +} + +bool r600_sb::sb_value_set::contains(value* v) { + unsigned b = v->uid - 1; + if (b < bs.size()) + return bs.get(v->uid - 1); + else + return false; +} + +bool sb_value_set::empty() { + return bs.size() == 0 || bs.find_bit(0) == bs.size(); +} + +void sb_bitset::swap(sb_bitset& bs2) { + std::swap(data, bs2.data); + std::swap(bit_size, bs2.bit_size); +} + +bool sb_bitset::operator ==(const sb_bitset& bs2) { + if (bit_size != bs2.bit_size) + return false; + + for (unsigned i = 0, c = data.size(); i < c; ++i) { + if (data[i] != bs2.data[i]) + return false; + } + return true; +} + +sb_bitset& sb_bitset::operator &=(const sb_bitset& bs2) { + if (bit_size > bs2.bit_size) { + resize(bs2.bit_size); + } + + for (unsigned i = 0, c = std::min(data.size(), bs2.data.size()); i < c; + ++i) { + data[i] &= bs2.data[i]; + } + return *this; +} + +sb_bitset& sb_bitset::mask(const sb_bitset& bs2) { + if (bit_size < bs2.bit_size) { + resize(bs2.bit_size); + } + + for (unsigned i = 0, c = data.size(); i < c; + ++i) { + data[i] &= ~bs2.data[i]; + } + return *this; +} + +bool ra_constraint::check() { + assert(kind == CK_SAME_REG); + + unsigned reg = 0; + + for (vvec::iterator I = values.begin(), E = values.end(); I != E; ++I) { + value *v = *I; + if (!v) + continue; + + if (!v->gpr) + return false; + + if (reg == 0) + reg = v->gpr.sel() + 1; + else if (reg != v->gpr.sel() + 1) + return false; + + if (v->is_chan_pinned()) { + if (v->pin_gpr.chan() != v->gpr.chan()) + return false; + } + } + return true; +} + +bool gpr_array::is_dead() { + return false; +} + +} // namespace r600_sb |