Diffstat (limited to 'src/gallium/drivers/freedreno/a2xx/ir-a2xx.c')
-rw-r--r-- | src/gallium/drivers/freedreno/a2xx/ir-a2xx.c | 734
1 file changed, 453 insertions, 281 deletions
diff --git a/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c b/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c index 42a9ab494e6..af9811864ff 100644 --- a/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c +++ b/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c @@ -35,19 +35,13 @@ #define WARN_MSG(f, ...) DBG("WARN: "f, ##__VA_ARGS__) #define ERROR_MSG(f, ...) DBG("ERROR: "f, ##__VA_ARGS__) -#define REG_MASK 0x3f - -static int cf_emit(struct ir2_cf *cf, instr_cf_t *instr); - static int instr_emit(struct ir2_instruction *instr, uint32_t *dwords, uint32_t idx, struct ir2_shader_info *info); -static void reg_update_stats(struct ir2_register *reg, - struct ir2_shader_info *info, bool dest); -static uint32_t reg_fetch_src_swiz(struct ir2_register *reg, uint32_t n); -static uint32_t reg_fetch_dst_swiz(struct ir2_register *reg); -static uint32_t reg_alu_dst_swiz(struct ir2_register *reg); -static uint32_t reg_alu_src_swiz(struct ir2_register *reg); +static uint32_t reg_fetch_src_swiz(struct ir2_src_register *reg, uint32_t n); +static uint32_t reg_fetch_dst_swiz(struct ir2_dst_register *reg); +static uint32_t reg_alu_dst_swiz(struct ir2_dst_register *reg); +static uint32_t reg_alu_src_swiz(struct ir2_src_register *reg); /* simple allocator to carve allocations out of an up-front allocated heap, * so that we can free everything easily in one shot. @@ -55,7 +49,7 @@ static uint32_t reg_alu_src_swiz(struct ir2_register *reg); static void * ir2_alloc(struct ir2_shader *shader, int sz) { void *ptr = &shader->heap[shader->heap_idx]; - shader->heap_idx += align(sz, 4); + shader->heap_idx += align(sz, 4) / 4; return ptr; } @@ -74,7 +68,9 @@ static char * ir2_strdup(struct ir2_shader *shader, const char *str) struct ir2_shader * ir2_shader_create(void) { DEBUG_MSG(""); - return calloc(1, sizeof(struct ir2_shader)); + struct ir2_shader *shader = calloc(1, sizeof(struct ir2_shader)); + shader->max_reg = -1; + return shader; } void ir2_shader_destroy(struct ir2_shader *shader) @@ -83,189 +79,344 @@ void ir2_shader_destroy(struct ir2_shader *shader) free(shader); } -/* resolve addr/cnt/sequence fields in the individual CF's */ -static int shader_resolve(struct ir2_shader *shader, struct ir2_shader_info *info) +/* check if an instruction is a simple MOV + */ +static struct ir2_instruction * simple_mov(struct ir2_instruction *instr, + bool output) { - uint32_t addr; - unsigned i; - int j; - - addr = shader->cfs_count / 2; - for (i = 0; i < shader->cfs_count; i++) { - struct ir2_cf *cf = shader->cfs[i]; - if ((cf->cf_type == EXEC) || (cf->cf_type == EXEC_END)) { - uint32_t sequence = 0; - - if (cf->exec.addr && (cf->exec.addr != addr)) - WARN_MSG("invalid addr '%d' at CF %d", cf->exec.addr, i); - if (cf->exec.cnt && (cf->exec.cnt != cf->exec.instrs_count)) - WARN_MSG("invalid cnt '%d' at CF %d", cf->exec.cnt, i); - - for (j = cf->exec.instrs_count - 1; j >= 0; j--) { - struct ir2_instruction *instr = cf->exec.instrs[j]; - sequence <<= 2; - if (instr->instr_type == IR2_FETCH) - sequence |= 0x1; - if (instr->sync) - sequence |= 0x2; - } + struct ir2_src_register *src_reg = instr->src_reg; + struct ir2_dst_register *dst_reg = &instr->dst_reg; + struct ir2_register *reg; + unsigned i; + + /* MAXv used for MOV */ + if (instr->instr_type != IR2_ALU_VECTOR || + instr->alu_vector.opc != MAXv) + return NULL; + + /* non identical srcs */ + if (src_reg[0].num != src_reg[1].num) + return NULL; + + /* flags */ + int flags = IR2_REG_NEGATE | IR2_REG_ABS; + if (output) + flags |= IR2_REG_INPUT | IR2_REG_CONST; + if ((src_reg[0].flags & flags) 
|| (src_reg[1].flags & flags)) + return NULL; + + /* clamping */ + if (instr->alu_vector.clamp) + return NULL; + + /* swizzling */ + for (i = 0; i < 4; i++) { + char swiz = (dst_reg->swizzle ? dst_reg->swizzle : "xyzw")[i]; + if (swiz == '_') + continue; + + if (swiz != (src_reg[0].swizzle ? src_reg[0].swizzle : "xyzw")[i] || + swiz != (src_reg[1].swizzle ? src_reg[1].swizzle : "xyzw")[i]) + return NULL; + } + + if (output) + reg = &instr->shader->reg[src_reg[0].num]; + else + reg = &instr->shader->reg[dst_reg->num]; + + assert(reg->write_idx >= 0); + if (reg->write_idx != reg->write_idx2) + return NULL; + + if (!output) + return instr; + + instr = instr->shader->instr[reg->write_idx]; + return instr->instr_type != IR2_ALU_VECTOR ? NULL : instr; +} - cf->exec.addr = addr; - cf->exec.cnt = cf->exec.instrs_count; - cf->exec.sequence = sequence; +static int src_to_reg(struct ir2_instruction *instr, + struct ir2_src_register *reg) +{ + if (reg->flags & IR2_REG_CONST) + return reg->num; - addr += cf->exec.instrs_count; - } - } + return instr->shader->reg[reg->num].reg; +} + +static int dst_to_reg(struct ir2_instruction *instr, + struct ir2_dst_register *reg) +{ + if (reg->flags & IR2_REG_EXPORT) + return reg->num; - info->sizedwords = 3 * addr; + return instr->shader->reg[reg->num].reg; +} - return 0; +static bool mask_get(uint32_t *mask, unsigned index) +{ + return !!(mask[index / 32] & 1 << index % 32); } -void * ir2_shader_assemble(struct ir2_shader *shader, struct ir2_shader_info *info) +static void mask_set(uint32_t *mask, struct ir2_register *reg, int index) { - uint32_t i, j; - uint32_t *ptr, *dwords = NULL; - uint32_t idx = 0; - int ret; - - info->sizedwords = 0; - info->max_reg = -1; - info->max_input_reg = 0; - info->regs_written = 0; - - /* we need an even # of CF's.. insert a NOP if needed */ - if (shader->cfs_count != align(shader->cfs_count, 2)) - ir2_cf_create(shader, NOP); - - /* first pass, resolve sizes and addresses: */ - ret = shader_resolve(shader, info); - if (ret) { - ERROR_MSG("resolve failed: %d", ret); - goto fail; + if (reg) { + unsigned i; + for (i = 0; i < ARRAY_SIZE(reg->regmask); i++) + mask[i] |= reg->regmask[i]; } + if (index >= 0) + mask[index / 32] |= 1 << index % 32; +} - ptr = dwords = calloc(4, info->sizedwords); +static bool sets_pred(struct ir2_instruction *instr) +{ + return instr->instr_type == IR2_ALU_SCALAR && + instr->alu_scalar.opc >= PRED_SETEs && + instr->alu_scalar.opc <= PRED_SET_RESTOREs; +} - /* second pass, emit CF program in pairs: */ - for (i = 0; i < shader->cfs_count; i += 2) { - instr_cf_t *cfs = (instr_cf_t *)ptr; - ret = cf_emit(shader->cfs[i], &cfs[0]); - if (ret) { - ERROR_MSG("CF emit failed: %d\n", ret); - goto fail; + + +void* ir2_shader_assemble(struct ir2_shader *shader, + struct ir2_shader_info *info) +{ + /* NOTES + * blob compiler seems to always puts PRED_* instrs in a CF by + * themselves, and wont combine EQ/NE in the same CF + * (not doing this - doesn't seem to make a difference) + * + * TODO: implement scheduling for combining vector+scalar instructions + * -some vector instructions can be replaced by scalar + */ + + /* first step: + * 1. remove "NOP" MOV instructions generated by TGSI for input/output: + * 2. track information for register allocation, and to remove + * the dead code when some exports are not needed + * 3. 
add additional instructions for a20x hw binning if needed + * NOTE: modifies the shader instrs + * this step could be done as instructions are added by compiler instead + */ + + /* mask of exports that must be generated + * used to avoid calculating ps exports with hw binning + */ + uint64_t export = ~0ull; + /* bitmask of variables required for exports defined by "export" */ + uint32_t export_mask[REG_MASK/32+1] = {}; + + unsigned idx, reg_idx; + unsigned max_input = 0; + int export_size = -1; + + for (idx = 0; idx < shader->instr_count; idx++) { + struct ir2_instruction *instr = shader->instr[idx], *prev; + struct ir2_dst_register dst_reg = instr->dst_reg; + + if (dst_reg.flags & IR2_REG_EXPORT) { + if (dst_reg.num < 32) + export_size++; + + if ((prev = simple_mov(instr, true))) { + /* copy instruction but keep dst */ + *instr = *prev; + instr->dst_reg = dst_reg; + } } - ret = cf_emit(shader->cfs[i+1], &cfs[1]); - if (ret) { - ERROR_MSG("CF emit failed: %d\n", ret); - goto fail; + + for (reg_idx = 0; reg_idx < instr->src_reg_count; reg_idx++) { + struct ir2_src_register *src_reg = &instr->src_reg[reg_idx]; + struct ir2_register *reg; + int num; + + if (src_reg->flags & IR2_REG_CONST) + continue; + + num = src_reg->num; + reg = &shader->reg[num]; + reg->read_idx = idx; + + if (src_reg->flags & IR2_REG_INPUT) { + max_input = MAX2(max_input, num); + } else { + /* bypass simple mov used to set src_reg */ + assert(reg->write_idx >= 0); + prev = shader->instr[reg->write_idx]; + if (simple_mov(prev, false)) { + *src_reg = prev->src_reg[0]; + /* process same src_reg again */ + reg_idx -= 1; + continue; + } + } + + /* update dependencies */ + uint32_t *mask = (dst_reg.flags & IR2_REG_EXPORT) ? + export_mask : shader->reg[dst_reg.num].regmask; + mask_set(mask, reg, num); + if (sets_pred(instr)) + mask_set(export_mask, reg, num); } - ptr += 3; - assert((ptr - dwords) <= info->sizedwords); } - /* third pass, emit ALU/FETCH: */ - for (i = 0; i < shader->cfs_count; i++) { - struct ir2_cf *cf = shader->cfs[i]; - if ((cf->cf_type == EXEC) || (cf->cf_type == EXEC_END)) { - for (j = 0; j < cf->exec.instrs_count; j++) { - ret = instr_emit(cf->exec.instrs[j], ptr, idx++, info); - if (ret) { - ERROR_MSG("instruction emit failed: %d", ret); - goto fail; - } - ptr += 3; - assert((ptr - dwords) <= info->sizedwords); + /* second step: + * emit instructions (with CFs) + RA + */ + instr_cf_t cfs[128], *cf = cfs; + uint32_t alufetch[3*256], *af = alufetch; + + /* RA is done on write, so inputs must be allocated here */ + for (reg_idx = 0; reg_idx <= max_input; reg_idx++) + shader->reg[reg_idx].reg = reg_idx; + info->max_reg = max_input; + + /* CF instr state */ + instr_cf_exec_t exec = { .opc = EXEC }; + instr_cf_alloc_t alloc = { .opc = ALLOC }; + bool need_alloc = 0; + bool pos_export = 0; + + export_size = MAX2(export_size, 0); + + for (idx = 0; idx < shader->instr_count; idx++) { + struct ir2_instruction *instr = shader->instr[idx]; + struct ir2_dst_register *dst_reg = &instr->dst_reg; + unsigned num = dst_reg->num; + struct ir2_register *reg; + + /* a2xx only has 64 registers, so we can use a single 64-bit mask */ + uint64_t regmask = 0ull; + + /* compute the current regmask */ + for (reg_idx = 0; (int) reg_idx <= shader->max_reg; reg_idx++) { + reg = &shader->reg[reg_idx]; + if ((int) idx > reg->write_idx && idx < reg->read_idx) + regmask |= (1ull << reg->reg); + } + + if (dst_reg->flags & IR2_REG_EXPORT) { + /* skip if export is not needed */ + if (!(export & (1ull << num))) + continue; + + /* ALLOC CF: + * 
want to alloc all < 32 at once + * 32/33 and 62/63 come in pairs + * XXX assuming all 3 types are never interleaved + */ + if (num < 32) { + alloc.size = export_size; + alloc.buffer_select = SQ_PARAMETER_PIXEL; + need_alloc = export_size >= 0; + export_size = -1; + } else if (num == 32 || num == 33) { + alloc.size = 0; + alloc.buffer_select = SQ_MEMORY; + need_alloc = num != 33; + } else { + alloc.size = 0; + alloc.buffer_select = SQ_POSITION; + need_alloc = !pos_export; + pos_export = true; } + + } else { + /* skip if dst register not needed to compute exports */ + if (!mask_get(export_mask, num)) + continue; + + /* RA on first write */ + reg = &shader->reg[num]; + if (reg->write_idx == idx) { + reg->reg = ffsll(~regmask) - 1; + info->max_reg = MAX2(info->max_reg, reg->reg); + } + } + + if (exec.count == 6 || (exec.count && need_alloc)) { + *cf++ = *(instr_cf_t*) &exec; + exec.address += exec.count; + exec.serialize = 0; + exec.count = 0; } + + if (need_alloc) { + *cf++ = *(instr_cf_t*) &alloc; + need_alloc = false; + } + + int ret = instr_emit(instr, af, idx, info); af += 3; + assert(!ret); + + if (instr->instr_type == IR2_FETCH) + exec.serialize |= 0x1 << exec.count * 2; + if (instr->sync) + exec.serialize |= 0x2 << exec.count * 2; + exec.count += 1; } - return dwords; -fail: - free(dwords); - return NULL; -} + exec.opc = !export_size ? EXEC : EXEC_END; + *cf++ = *(instr_cf_t*) &exec; + exec.address += exec.count; + exec.serialize = 0; + exec.count = 0; + /* GPU will hang without at least one pixel alloc */ + if (!export_size) { + alloc.size = 0; + alloc.buffer_select = SQ_PARAMETER_PIXEL; + *cf++ = *(instr_cf_t*) &alloc; -struct ir2_cf * ir2_cf_create(struct ir2_shader *shader, instr_cf_opc_t cf_type) -{ - struct ir2_cf *cf = ir2_alloc(shader, sizeof(struct ir2_cf)); - DEBUG_MSG("%d", cf_type); - cf->shader = shader; - cf->cf_type = cf_type; - assert(shader->cfs_count < ARRAY_SIZE(shader->cfs)); - shader->cfs[shader->cfs_count++] = cf; - return cf; -} + exec.opc = EXEC_END; + *cf++ = *(instr_cf_t*) &exec; + } + unsigned num_cfs = cf - cfs; -/* - * CF instructions: - */ + /* insert nop to get an even # of CFs */ + if (num_cfs % 2) { + *cf++ = (instr_cf_t) { .opc = NOP }; + num_cfs++; + } -static int cf_emit(struct ir2_cf *cf, instr_cf_t *instr) -{ - memset(instr, 0, sizeof(*instr)); - - instr->opc = cf->cf_type; - - switch (cf->cf_type) { - case NOP: - break; - case EXEC: - case EXEC_END: - assert(cf->exec.addr <= 0x1ff); - assert(cf->exec.cnt <= 0x6); - assert(cf->exec.sequence <= 0xfff); - instr->exec.address = cf->exec.addr; - instr->exec.count = cf->exec.cnt; - instr->exec.serialize = cf->exec.sequence; - break; - case ALLOC: - assert(cf->alloc.size <= 0xf); - instr->alloc.size = cf->alloc.size; - switch (cf->alloc.type) { - case SQ_POSITION: - case SQ_PARAMETER_PIXEL: - instr->alloc.buffer_select = cf->alloc.type; + /* offset cf addrs */ + for (idx = 0; idx < num_cfs; idx++) { + switch (cfs[idx].opc) { + case EXEC: + case EXEC_END: + cfs[idx].exec.address += num_cfs / 2; break; default: - ERROR_MSG("invalid alloc type: %d", cf->alloc.type); - return -1; + break; + /* XXX and any other address using cf that gets implemented */ } - break; - case COND_EXEC: - case COND_EXEC_END: - case COND_PRED_EXEC: - case COND_PRED_EXEC_END: - case LOOP_START: - case LOOP_END: - case COND_CALL: - case RETURN: - case COND_JMP: - case COND_EXEC_PRED_CLEAN: - case COND_EXEC_PRED_CLEAN_END: - case MARK_VS_FETCH_DONE: - ERROR_MSG("TODO"); - return -1; } - return 0; + /* concatenate cfs+alufetchs */ + 
uint32_t cfdwords = num_cfs / 2 * 3; + uint32_t alufetchdwords = exec.address * 3; + info->sizedwords = cfdwords + alufetchdwords; + uint32_t *dwords = malloc(info->sizedwords * 4); + assert(dwords); + memcpy(dwords, cfs, cfdwords * 4); + memcpy(&dwords[cfdwords], alufetch, alufetchdwords * 4); + return dwords; } - -struct ir2_instruction * ir2_instr_create(struct ir2_cf *cf, int instr_type) +struct ir2_instruction * ir2_instr_create(struct ir2_shader *shader, + int instr_type) { struct ir2_instruction *instr = - ir2_alloc(cf->shader, sizeof(struct ir2_instruction)); + ir2_alloc(shader, sizeof(struct ir2_instruction)); DEBUG_MSG("%d", instr_type); - instr->shader = cf->shader; - instr->pred = cf->shader->pred; + instr->shader = shader; + instr->idx = shader->instr_count; + instr->pred = shader->pred; instr->instr_type = instr_type; - assert(cf->exec.instrs_count < ARRAY_SIZE(cf->exec.instrs)); - cf->exec.instrs[cf->exec.instrs_count++] = instr; + shader->instr[shader->instr_count++] = instr; return instr; } @@ -279,15 +430,11 @@ static int instr_emit_fetch(struct ir2_instruction *instr, struct ir2_shader_info *info) { instr_fetch_t *fetch = (instr_fetch_t *)dwords; - int reg = 0; - struct ir2_register *dst_reg = instr->regs[reg++]; - struct ir2_register *src_reg = instr->regs[reg++]; + struct ir2_dst_register *dst_reg = &instr->dst_reg; + struct ir2_src_register *src_reg = &instr->src_reg[0]; memset(fetch, 0, sizeof(*fetch)); - reg_update_stats(dst_reg, info, true); - reg_update_stats(src_reg, info, false); - fetch->opc = instr->fetch.opc; if (instr->fetch.opc == VTX_FETCH) { @@ -298,9 +445,9 @@ static int instr_emit_fetch(struct ir2_instruction *instr, assert(instr->fetch.const_idx <= 0x1f); assert(instr->fetch.const_idx_sel <= 0x3); - vtx->src_reg = src_reg->num; + vtx->src_reg = src_to_reg(instr, src_reg); vtx->src_swiz = reg_fetch_src_swiz(src_reg, 1); - vtx->dst_reg = dst_reg->num; + vtx->dst_reg = dst_to_reg(instr, dst_reg); vtx->dst_swiz = reg_fetch_dst_swiz(dst_reg); vtx->must_be_one = 1; vtx->const_index = instr->fetch.const_idx; @@ -326,9 +473,9 @@ static int instr_emit_fetch(struct ir2_instruction *instr, assert(instr->fetch.const_idx <= 0x1f); - tex->src_reg = src_reg->num; + tex->src_reg = src_to_reg(instr, src_reg); tex->src_swiz = reg_fetch_src_swiz(src_reg, 3); - tex->dst_reg = dst_reg->num; + tex->dst_reg = dst_to_reg(instr, dst_reg); tex->dst_swiz = reg_fetch_dst_swiz(dst_reg); tex->const_idx = instr->fetch.const_idx; tex->mag_filter = TEX_FILTER_USE_FETCH_CONST; @@ -360,95 +507,62 @@ static int instr_emit_fetch(struct ir2_instruction *instr, * ALU instructions: */ -static int instr_emit_alu(struct ir2_instruction *instr, uint32_t *dwords, +static int instr_emit_alu(struct ir2_instruction *instr_v, + struct ir2_instruction *instr_s, uint32_t *dwords, struct ir2_shader_info *info) { - int reg = 0; instr_alu_t *alu = (instr_alu_t *)dwords; - struct ir2_register *dst_reg = instr->regs[reg++]; - struct ir2_register *src1_reg; - struct ir2_register *src2_reg; - struct ir2_register *src3_reg; + struct ir2_dst_register *vdst_reg, *sdst_reg; + struct ir2_src_register *src1_reg, *src2_reg, *src3_reg; + struct ir2_shader *shader = instr_v ? 
instr_v->shader : instr_s->shader; + enum ir2_pred pred = IR2_PRED_NONE; memset(alu, 0, sizeof(*alu)); - /* handle instructions w/ 3 src operands: */ - switch (instr->alu.vector_opc) { - case MULADDv: - case CNDEv: - case CNDGTEv: - case CNDGTv: - case DOT2ADDv: - /* note: disassembler lists 3rd src first, ie: - * MULADDv Rdst = Rsrc3 + (Rsrc1 * Rsrc2) - * which is the reason for this strange ordering. - */ - src3_reg = instr->regs[reg++]; - break; - default: - src3_reg = NULL; - break; + vdst_reg = NULL; + sdst_reg = NULL; + src1_reg = NULL; + src2_reg = NULL; + src3_reg = NULL; + + if (instr_v) { + vdst_reg = &instr_v->dst_reg; + assert(instr_v->src_reg_count >= 2); + src1_reg = &instr_v->src_reg[0]; + src2_reg = &instr_v->src_reg[1]; + if (instr_v->src_reg_count > 2) + src3_reg = &instr_v->src_reg[2]; + pred = instr_v->pred; } - src1_reg = instr->regs[reg++]; - src2_reg = instr->regs[reg++]; - - reg_update_stats(dst_reg, info, true); - reg_update_stats(src1_reg, info, false); - reg_update_stats(src2_reg, info, false); - - assert((dst_reg->flags & ~IR2_REG_EXPORT) == 0); - assert(!dst_reg->swizzle || (strlen(dst_reg->swizzle) == 4)); - assert((src1_reg->flags & IR2_REG_EXPORT) == 0); - assert(!src1_reg->swizzle || (strlen(src1_reg->swizzle) == 4)); - assert((src2_reg->flags & IR2_REG_EXPORT) == 0); - assert(!src2_reg->swizzle || (strlen(src2_reg->swizzle) == 4)); + if (instr_s) { + sdst_reg = &instr_s->dst_reg; + assert(instr_s->src_reg_count == 1); + assert(!instr_v || vdst_reg->flags == sdst_reg->flags); + assert(!instr_v || pred == instr_s->pred); + if (src3_reg) { + assert(src3_reg->flags == instr_s->src_reg[0].flags); + assert(src3_reg->num == instr_s->src_reg[0].num); + assert(!strcmp(src3_reg->swizzle, instr_s->src_reg[0].swizzle)); + } + src3_reg = &instr_s->src_reg[0]; + pred = instr_s->pred; + } - if (instr->alu.vector_opc == (instr_vector_opc_t)~0) { - alu->vector_opc = MAXv; - alu->vector_write_mask = 0; + if (vdst_reg) { + assert((vdst_reg->flags & ~IR2_REG_EXPORT) == 0); + assert(!vdst_reg->swizzle || (strlen(vdst_reg->swizzle) == 4)); + alu->vector_opc = instr_v->alu_vector.opc; + alu->vector_write_mask = reg_alu_dst_swiz(vdst_reg); + alu->vector_dest = dst_to_reg(instr_v, vdst_reg); } else { - alu->vector_opc = instr->alu.vector_opc; - alu->vector_write_mask = reg_alu_dst_swiz(dst_reg); + alu->vector_opc = MAXv; } - alu->vector_dest = dst_reg->num; - alu->export_data = !!(dst_reg->flags & IR2_REG_EXPORT); - - // TODO predicate case/condition.. 
need to add to parser - - alu->src2_reg = src2_reg->num; - alu->src2_swiz = reg_alu_src_swiz(src2_reg); - alu->src2_reg_negate = !!(src2_reg->flags & IR2_REG_NEGATE); - alu->src2_reg_abs = !!(src2_reg->flags & IR2_REG_ABS); - alu->src2_sel = !(src2_reg->flags & IR2_REG_CONST); - - alu->src1_reg = src1_reg->num; - alu->src1_swiz = reg_alu_src_swiz(src1_reg); - alu->src1_reg_negate = !!(src1_reg->flags & IR2_REG_NEGATE); - alu->src1_reg_abs = !!(src1_reg->flags & IR2_REG_ABS); - alu->src1_sel = !(src1_reg->flags & IR2_REG_CONST); - - alu->vector_clamp = instr->alu.vector_clamp; - alu->scalar_clamp = instr->alu.scalar_clamp; - - if (instr->alu.scalar_opc != (instr_scalar_opc_t)~0) { - struct ir2_register *sdst_reg = instr->regs[reg++]; - - reg_update_stats(sdst_reg, info, true); - - assert(sdst_reg->flags == dst_reg->flags); - - if (src3_reg) { - assert(src3_reg == instr->regs[reg]); - reg++; - } else { - src3_reg = instr->regs[reg++]; - } - - alu->scalar_dest = sdst_reg->num; + if (sdst_reg) { + alu->scalar_opc = instr_s->alu_scalar.opc; alu->scalar_write_mask = reg_alu_dst_swiz(sdst_reg); - alu->scalar_opc = instr->alu.scalar_opc; + alu->scalar_dest = dst_to_reg(instr_s, sdst_reg); } else { /* not sure if this is required, but adreno compiler seems * to always set scalar opc to MAXs if it is not used: @@ -456,13 +570,58 @@ static int instr_emit_alu(struct ir2_instruction *instr, uint32_t *dwords, alu->scalar_opc = MAXs; } - if (src3_reg) { - reg_update_stats(src3_reg, info, false); + alu->export_data = + !!((instr_v ? vdst_reg : sdst_reg)->flags & IR2_REG_EXPORT); - alu->src3_reg = src3_reg->num; + /* export32 has this bit set.. it seems to do more than just set + * the base address of the constants used to zero + * TODO make this less of a hack + */ + if (alu->export_data && alu->vector_dest == 32) { + assert(!instr_s); + alu->relative_addr = 1; + } + + if (src1_reg) { + if (src1_reg->flags & IR2_REG_CONST) { + assert(!(src1_reg->flags & IR2_REG_ABS)); + alu->src1_reg_const = src1_reg->num; + } else { + alu->src1_reg = shader->reg[src1_reg->num].reg; + alu->src1_reg_abs = !!(src1_reg->flags & IR2_REG_ABS); + } + alu->src1_swiz = reg_alu_src_swiz(src1_reg); + alu->src1_reg_negate = !!(src1_reg->flags & IR2_REG_NEGATE); + alu->src1_sel = !(src1_reg->flags & IR2_REG_CONST); + } else { + alu->src1_sel = 1; + } + + if (src2_reg) { + if (src2_reg->flags & IR2_REG_CONST) { + assert(!(src2_reg->flags & IR2_REG_ABS)); + alu->src2_reg_const = src2_reg->num; + } else { + alu->src2_reg = shader->reg[src2_reg->num].reg; + alu->src2_reg_abs = !!(src2_reg->flags & IR2_REG_ABS); + } + alu->src2_swiz = reg_alu_src_swiz(src2_reg); + alu->src2_reg_negate = !!(src2_reg->flags & IR2_REG_NEGATE); + alu->src2_sel = !(src2_reg->flags & IR2_REG_CONST); + } else { + alu->src2_sel = 1; + } + + if (src3_reg) { + if (src3_reg->flags & IR2_REG_CONST) { + assert(!(src3_reg->flags & IR2_REG_ABS)); + alu->src3_reg_const = src3_reg->num; + } else { + alu->src3_reg = shader->reg[src3_reg->num].reg; + alu->src3_reg_abs = !!(src3_reg->flags & IR2_REG_ABS); + } alu->src3_swiz = reg_alu_src_swiz(src3_reg); alu->src3_reg_negate = !!(src3_reg->flags & IR2_REG_NEGATE); - alu->src3_reg_abs = !!(src3_reg->flags & IR2_REG_ABS); alu->src3_sel = !(src3_reg->flags & IR2_REG_CONST); } else { /* not sure if this is required, but adreno compiler seems @@ -471,9 +630,11 @@ static int instr_emit_alu(struct ir2_instruction *instr, uint32_t *dwords, alu->src3_sel = 1; } - if (instr->pred != IR2_PRED_NONE) { - alu->pred_select = (instr->pred == 
IR2_PRED_EQ) ? 3 : 2; - } + alu->vector_clamp = instr_v ? instr_v->alu_vector.clamp : 0; + alu->scalar_clamp = instr_s ? instr_s->alu_scalar.clamp : 0; + + if (pred != IR2_PRED_NONE) + alu->pred_select = (pred == IR2_PRED_EQ) ? 3 : 2; return 0; } @@ -483,51 +644,63 @@ static int instr_emit(struct ir2_instruction *instr, uint32_t *dwords, { switch (instr->instr_type) { case IR2_FETCH: return instr_emit_fetch(instr, dwords, idx, info); - case IR2_ALU: return instr_emit_alu(instr, dwords, info); + case IR2_ALU_VECTOR: return instr_emit_alu(instr, NULL, dwords, info); + case IR2_ALU_SCALAR: return instr_emit_alu(NULL, instr, dwords, info); } return -1; } - -struct ir2_register * ir2_reg_create(struct ir2_instruction *instr, +struct ir2_dst_register * ir2_dst_create(struct ir2_instruction *instr, int num, const char *swizzle, int flags) { - struct ir2_register *reg = - ir2_alloc(instr->shader, sizeof(struct ir2_register)); - DEBUG_MSG("%x, %d, %s", flags, num, swizzle); - assert(num <= REG_MASK); + if (!(flags & IR2_REG_EXPORT)) { + struct ir2_register *reg = &instr->shader->reg[num]; + + unsigned i; + for (i = instr->shader->max_reg + 1; i <= num; i++) + instr->shader->reg[i].write_idx = -1; + instr->shader->max_reg = i - 1; + + if (reg->write_idx < 0) + reg->write_idx = instr->idx; + reg->write_idx2 = instr->idx; + } + + struct ir2_dst_register *reg = &instr->dst_reg; reg->flags = flags; reg->num = num; reg->swizzle = ir2_strdup(instr->shader, swizzle); - assert(instr->regs_count < ARRAY_SIZE(instr->regs)); - instr->regs[instr->regs_count++] = reg; return reg; } -static void reg_update_stats(struct ir2_register *reg, - struct ir2_shader_info *info, bool dest) +struct ir2_src_register * ir2_reg_create(struct ir2_instruction *instr, + int num, const char *swizzle, int flags) { - if (!(reg->flags & (IR2_REG_CONST|IR2_REG_EXPORT))) { - info->max_reg = MAX2(info->max_reg, reg->num); - - if (dest) { - info->regs_written |= (1 << reg->num); - } else if (!(info->regs_written & (1 << reg->num))) { - /* for registers that haven't been written, they must be an - * input register that the thread scheduler (presumably?) 
- * needs to know about: - */ - info->max_input_reg = MAX2(info->max_input_reg, reg->num); - } + assert(instr->src_reg_count + 1 <= ARRAY_SIZE(instr->src_reg)); + if (!(flags & IR2_REG_CONST)) { + struct ir2_register *reg = &instr->shader->reg[num]; + + reg->read_idx = instr->idx; + + unsigned i; + for (i = instr->shader->max_reg + 1; i <= num; i++) + instr->shader->reg[i].write_idx = -1; + instr->shader->max_reg = i - 1; } + + struct ir2_src_register *reg = &instr->src_reg[instr->src_reg_count++]; + reg->flags = flags; + reg->num = num; + reg->swizzle = ir2_strdup(instr->shader, swizzle); + return reg; } -static uint32_t reg_fetch_src_swiz(struct ir2_register *reg, uint32_t n) +static uint32_t reg_fetch_src_swiz(struct ir2_src_register *reg, uint32_t n) { uint32_t swiz = 0; int i; - assert(reg->flags == 0); + assert((reg->flags & ~IR2_REG_INPUT) == 0); assert(reg->swizzle); DEBUG_MSG("fetch src R%d.%s", reg->num, reg->swizzle); @@ -547,7 +720,7 @@ static uint32_t reg_fetch_src_swiz(struct ir2_register *reg, uint32_t n) return swiz; } -static uint32_t reg_fetch_dst_swiz(struct ir2_register *reg) +static uint32_t reg_fetch_dst_swiz(struct ir2_dst_register *reg) { uint32_t swiz = 0; int i; @@ -580,7 +753,7 @@ static uint32_t reg_fetch_dst_swiz(struct ir2_register *reg) } /* actually, a write-mask */ -static uint32_t reg_alu_dst_swiz(struct ir2_register *reg) +static uint32_t reg_alu_dst_swiz(struct ir2_dst_register *reg) { uint32_t swiz = 0; int i; @@ -607,12 +780,11 @@ static uint32_t reg_alu_dst_swiz(struct ir2_register *reg) return swiz; } -static uint32_t reg_alu_src_swiz(struct ir2_register *reg) +static uint32_t reg_alu_src_swiz(struct ir2_src_register *reg) { uint32_t swiz = 0; int i; - assert((reg->flags & IR2_REG_EXPORT) == 0); assert(!reg->swizzle || (strlen(reg->swizzle) == 4)); DEBUG_MSG("vector src R%d.%s", reg->num, reg->swizzle); |
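
Note on the register-allocation scheme introduced above: the rewritten ir2_shader_assemble() assigns a hardware register on a value's first write by building a 64-bit mask of registers whose live range (write_idx .. read_idx) still covers the current instruction, then taking the lowest free one with ffsll(~regmask) - 1 (the a2xx has only 64 GPRs, so a single 64-bit mask is enough). The standalone C sketch below illustrates only that idea; it is not part of the patch, and the names live_range and alloc_reg are invented for the example.

/* Illustrative sketch of first-write register allocation over live ranges,
 * in the spirit of the new ir2_shader_assemble(); not part of the patch. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct live_range { int write_idx, read_idx, reg; };

/* Allocate a register for the value written at instruction 'idx': mark every
 * register still live across 'idx' as busy, then pick the lowest free bit
 * (a2xx has 64 GPRs, so one 64-bit mask covers the whole register file). */
static int alloc_reg(struct live_range *v, unsigned count, int idx)
{
    uint64_t busy = 0;
    for (unsigned i = 0; i < count; i++) {
        if (v[i].reg >= 0 && idx > v[i].write_idx && idx < v[i].read_idx)
            busy |= 1ull << v[i].reg;
    }
    assert(busy != ~0ull);
    return __builtin_ffsll(~busy) - 1;  /* same ffsll(~regmask) - 1 trick as the patch */
}

int main(void)
{
    /* three values: v0 live over (0,3), v1 over (1,2), v2 written at 2 */
    struct live_range v[] = {
        { .write_idx = 0, .read_idx = 3, .reg = 0 },
        { .write_idx = 1, .read_idx = 2, .reg = 1 },
        { .write_idx = 2, .read_idx = 4, .reg = -1 },
    };

    /* at idx 2, v1 is already dead, so its register R1 can be reused */
    v[2].reg = alloc_reg(v, 3, 2);
    printf("v2 -> R%d\n", v[2].reg);  /* prints "v2 -> R1" */
    return 0;
}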