From db193e5ad06e7a2fbcffb3bb5df85d212eb12291 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Fri, 25 Jul 2014 11:15:59 -0400 Subject: freedreno/ir3: split out shader compiler from a3xx Move the bits we want to share between generations from fd3_program to ir3_shader. So overall structure is: fdN_shader_stateobj -> ir3_shader -> ir3_shader_variant -> ir3 |- ... \- ir3_shader_variant -> ir3 So the ir3_shader becomes the topmost generation neutral object, which manages the set of variants each of which generates, compiles, and assembles it's own ir. There is a bit of additional renaming to s/fd3_compiler/ir3_compiler/, etc. Keep the split between the gallium level stateobj and the shader helper object because it might be a good idea to pre-compute some generation specific register values (ie. anything that is independent of linking). Signed-off-by: Rob Clark --- src/gallium/drivers/freedreno/ir3/ir3.h | 480 ++++++++++++++++++++++++++++++++ 1 file changed, 480 insertions(+) create mode 100644 src/gallium/drivers/freedreno/ir3/ir3.h (limited to 'src/gallium/drivers/freedreno/ir3/ir3.h') diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h new file mode 100644 index 00000000000..9ed914ba2e4 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3.h @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2013 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IR3_H_ +#define IR3_H_ + +#include +#include + +#include "instr-a3xx.h" +#include "disasm.h" /* TODO move 'enum shader_t' somewhere else.. */ + +/* low level intermediate representation of an adreno shader program */ + +struct ir3; +struct ir3_instruction; +struct ir3_block; + +struct ir3 * fd_asm_parse(const char *src); + +struct ir3_info { + uint16_t sizedwords; + uint16_t instrs_count; /* expanded to account for rpt's */ + /* NOTE: max_reg, etc, does not include registers not touched + * by the shader (ie. vertex fetched via VFD_DECODE but not + * touched by shader) + */ + int8_t max_reg; /* highest GPR # used by shader */ + int8_t max_half_reg; + int8_t max_const; +}; + +struct ir3_register { + enum { + IR3_REG_CONST = 0x001, + IR3_REG_IMMED = 0x002, + IR3_REG_HALF = 0x004, + IR3_REG_RELATIV= 0x008, + IR3_REG_R = 0x010, + IR3_REG_NEGATE = 0x020, + IR3_REG_ABS = 0x040, + IR3_REG_EVEN = 0x080, + IR3_REG_POS_INF= 0x100, + /* (ei) flag, end-input? Set on last bary, presumably to signal + * that the shader needs no more input: + */ + IR3_REG_EI = 0x200, + /* meta-flags, for intermediate stages of IR, ie. + * before register assignment is done: + */ + IR3_REG_SSA = 0x1000, /* 'instr' is ptr to assigning instr */ + IR3_REG_IA = 0x2000, /* meta-input dst is "assigned" */ + IR3_REG_ADDR = 0x4000, /* register is a0.x */ + } flags; + union { + /* normal registers: + * the component is in the low two bits of the reg #, so + * rN.x becomes: (N << 2) | x + */ + int num; + /* immediate: */ + int iim_val; + float fim_val; + /* relative: */ + int offset; + /* for IR3_REG_SSA, src registers contain ptr back to + * assigning instruction. + */ + struct ir3_instruction *instr; + }; + + /* used for cat5 instructions, but also for internal/IR level + * tracking of what registers are read/written by an instruction. + * wrmask may be a bad name since it is used to represent both + * src and dst that touch multiple adjacent registers. + */ + int wrmask; +}; + +struct ir3_instruction { + struct ir3_block *block; + int category; + opc_t opc; + enum { + /* (sy) flag is set on first instruction, and after sample + * instructions (probably just on RAW hazard). + */ + IR3_INSTR_SY = 0x001, + /* (ss) flag is set on first instruction, and first instruction + * to depend on the result of "long" instructions (RAW hazard): + * + * rcp, rsq, log2, exp2, sin, cos, sqrt + * + * It seems to synchronize until all in-flight instructions are + * completed, for example: + * + * rsq hr1.w, hr1.w + * add.f hr2.z, (neg)hr2.z, hc0.y + * mul.f hr2.w, (neg)hr2.y, (neg)hr2.y + * rsq hr2.x, hr2.x + * (rpt1)nop + * mad.f16 hr2.w, hr2.z, hr2.z, hr2.w + * nop + * mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w + * (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w + * (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x + * + * The last mul.f does not have (ss) set, presumably because the + * (ss) on the previous instruction does the job. + * + * The blob driver also seems to set it on WAR hazards, although + * not really clear if this is needed or just blob compiler being + * sloppy. So far I haven't found a case where removing the (ss) + * causes problems for WAR hazard, but I could just be getting + * lucky: + * + * rcp r1.y, r3.y + * (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z + * + */ + IR3_INSTR_SS = 0x002, + /* (jp) flag is set on jump targets: + */ + IR3_INSTR_JP = 0x004, + IR3_INSTR_UL = 0x008, + IR3_INSTR_3D = 0x010, + IR3_INSTR_A = 0x020, + IR3_INSTR_O = 0x040, + IR3_INSTR_P = 0x080, + IR3_INSTR_S = 0x100, + IR3_INSTR_S2EN = 0x200, + /* meta-flags, for intermediate stages of IR, ie. + * before register assignment is done: + */ + IR3_INSTR_MARK = 0x1000, + } flags; + int repeat; + unsigned regs_count; + struct ir3_register *regs[5]; + union { + struct { + char inv; + char comp; + int immed; + } cat0; + struct { + type_t src_type, dst_type; + } cat1; + struct { + enum { + IR3_COND_LT = 0, + IR3_COND_LE = 1, + IR3_COND_GT = 2, + IR3_COND_GE = 3, + IR3_COND_EQ = 4, + IR3_COND_NE = 5, + } condition; + } cat2; + struct { + unsigned samp, tex; + type_t type; + } cat5; + struct { + type_t type; + int offset; + int iim_val; + } cat6; + /* for meta-instructions, just used to hold extra data + * before instruction scheduling, etc + */ + struct { + int off; /* component/offset */ + } fo; + struct { + struct ir3_block *if_block, *else_block; + } flow; + struct { + struct ir3_block *block; + } inout; + }; + + /* transient values used during various algorithms: */ + union { + /* The instruction depth is the max dependency distance to output. + * + * You can also think of it as the "cost", if we did any sort of + * optimization for register footprint. Ie. a value that is just + * result of moving a const to a reg would have a low cost, so to + * it could make sense to duplicate the instruction at various + * points where the result is needed to reduce register footprint. + */ + unsigned depth; + }; + struct ir3_instruction *next; +#ifdef DEBUG + uint32_t serialno; +#endif +}; + +struct ir3_heap_chunk; + +struct ir3 { + unsigned instrs_count, instrs_sz; + struct ir3_instruction **instrs; + unsigned heap_idx; + struct ir3_heap_chunk *chunk; +}; + +struct ir3_block { + struct ir3 *shader; + unsigned ntemporaries, ninputs, noutputs; + /* maps TGSI_FILE_TEMPORARY index back to the assigning instruction: */ + struct ir3_instruction **temporaries; + struct ir3_instruction **inputs; + struct ir3_instruction **outputs; + /* only a single address register: */ + struct ir3_instruction *address; + struct ir3_block *parent; + struct ir3_instruction *head; +}; + +struct ir3 * ir3_create(void); +void ir3_destroy(struct ir3 *shader); +void * ir3_assemble(struct ir3 *shader, + struct ir3_info *info); +void * ir3_alloc(struct ir3 *shader, int sz); + +struct ir3_block * ir3_block_create(struct ir3 *shader, + unsigned ntmp, unsigned nin, unsigned nout); + +struct ir3_instruction * ir3_instr_create(struct ir3_block *block, + int category, opc_t opc); +struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr); +const char *ir3_instr_name(struct ir3_instruction *instr); + +struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, + int num, int flags); + + +static inline bool ir3_instr_check_mark(struct ir3_instruction *instr) +{ + if (instr->flags & IR3_INSTR_MARK) + return true; /* already visited */ + instr->flags ^= IR3_INSTR_MARK; + return false; +} + +static inline void ir3_clear_mark(struct ir3 *shader) +{ + /* TODO would be nice to drop the instruction array.. for + * new compiler, _clear_mark() is all we use it for, and + * we could probably manage a linked list instead.. + */ + unsigned i; + for (i = 0; i < shader->instrs_count; i++) { + struct ir3_instruction *instr = shader->instrs[i]; + instr->flags &= ~IR3_INSTR_MARK; + } +} + +static inline int ir3_instr_regno(struct ir3_instruction *instr, + struct ir3_register *reg) +{ + unsigned i; + for (i = 0; i < instr->regs_count; i++) + if (reg == instr->regs[i]) + return i; + return -1; +} + + +/* comp: + * 0 - x + * 1 - y + * 2 - z + * 3 - w + */ +static inline uint32_t regid(int num, int comp) +{ + return (num << 2) | (comp & 0x3); +} + +static inline uint32_t reg_num(struct ir3_register *reg) +{ + return reg->num >> 2; +} + +static inline uint32_t reg_comp(struct ir3_register *reg) +{ + return reg->num & 0x3; +} + +static inline bool is_flow(struct ir3_instruction *instr) +{ + return (instr->category == 0); +} + +static inline bool is_kill(struct ir3_instruction *instr) +{ + return is_flow(instr) && (instr->opc == OPC_KILL); +} + +static inline bool is_nop(struct ir3_instruction *instr) +{ + return is_flow(instr) && (instr->opc == OPC_NOP); +} + +static inline bool is_alu(struct ir3_instruction *instr) +{ + return (1 <= instr->category) && (instr->category <= 3); +} + +static inline bool is_sfu(struct ir3_instruction *instr) +{ + return (instr->category == 4); +} + +static inline bool is_tex(struct ir3_instruction *instr) +{ + return (instr->category == 5); +} + +static inline bool is_input(struct ir3_instruction *instr) +{ + return (instr->category == 2) && (instr->opc == OPC_BARY_F); +} + +static inline bool is_meta(struct ir3_instruction *instr) +{ + /* TODO how should we count PHI (and maybe fan-in/out) which + * might actually contribute some instructions to the final + * result? + */ + return (instr->category == -1); +} + +static inline bool is_addr(struct ir3_instruction *instr) +{ + return is_meta(instr) && (instr->opc == OPC_META_DEREF); +} + +static inline bool writes_addr(struct ir3_instruction *instr) +{ + if (instr->regs_count > 0) { + struct ir3_register *dst = instr->regs[0]; + return !!(dst->flags & IR3_REG_ADDR); + } + return false; +} + +static inline bool writes_pred(struct ir3_instruction *instr) +{ + if (instr->regs_count > 0) { + struct ir3_register *dst = instr->regs[0]; + return reg_num(dst) == REG_P0; + } + return false; +} + +static inline bool reg_gpr(struct ir3_register *r) +{ + if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_SSA | IR3_REG_ADDR)) + return false; + if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0)) + return false; + return true; +} + +/* dump: */ +#include +void ir3_dump(struct ir3 *shader, const char *name, + struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3? */, + FILE *f); +void ir3_dump_instr_single(struct ir3_instruction *instr); +void ir3_dump_instr_list(struct ir3_instruction *instr); + +/* flatten if/else: */ +int ir3_block_flatten(struct ir3_block *block); + +/* depth calculation: */ +int ir3_delayslots(struct ir3_instruction *assigner, + struct ir3_instruction *consumer, unsigned n); +void ir3_block_depth(struct ir3_block *block); + +/* copy-propagate: */ +void ir3_block_cp(struct ir3_block *block); + +/* scheduling: */ +void ir3_block_sched(struct ir3_block *block); + +/* register assignment: */ +int ir3_block_ra(struct ir3_block *block, enum shader_t type, + bool half_precision, bool frag_coord, bool frag_face, + bool *has_samp); + +#ifndef ARRAY_SIZE +# define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) +#endif + +/* ************************************************************************* */ +/* split this out or find some helper to use.. like main/bitset.h.. */ + +#include + +#define MAX_REG 256 + +typedef uint8_t regmask_t[2 * MAX_REG / 8]; + +static inline unsigned regmask_idx(struct ir3_register *reg) +{ + unsigned num = reg->num; + assert(num < MAX_REG); + if (reg->flags & IR3_REG_HALF) + num += MAX_REG; + return num; +} + +static inline void regmask_init(regmask_t *regmask) +{ + memset(regmask, 0, sizeof(*regmask)); +} + +static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg) +{ + unsigned idx = regmask_idx(reg); + unsigned i; + for (i = 0; i < 4; i++, idx++) + if (reg->wrmask & (1 << i)) + (*regmask)[idx / 8] |= 1 << (idx % 8); +} + +/* set bits in a if not set in b, conceptually: + * a |= (reg & ~b) + */ +static inline void regmask_set_if_not(regmask_t *a, + struct ir3_register *reg, regmask_t *b) +{ + unsigned idx = regmask_idx(reg); + unsigned i; + for (i = 0; i < 4; i++, idx++) + if (reg->wrmask & (1 << i)) + if (!((*b)[idx / 8] & (1 << (idx % 8)))) + (*a)[idx / 8] |= 1 << (idx % 8); +} + +static inline unsigned regmask_get(regmask_t *regmask, + struct ir3_register *reg) +{ + unsigned idx = regmask_idx(reg); + unsigned i; + for (i = 0; i < 4; i++, idx++) + if (reg->wrmask & (1 << i)) + if ((*regmask)[idx / 8] & (1 << (idx % 8))) + return true; + return false; +} + +/* ************************************************************************* */ + +#endif /* IR3_H_ */ -- cgit v1.2.3