diff options
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/fd3_compiler.c | 21 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/fd3_draw.c | 5 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/fd3_gmem.c | 3 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/fd3_program.c | 6 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/ir3.h | 3 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/ir3_ra.c | 102 |
6 files changed, 130 insertions, 10 deletions
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c index f52003a47ee..818d5611dd9 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c @@ -48,6 +48,25 @@ #include "instr-a3xx.h" #include "ir3.h" +/* NOTE on half/full precision: + * Currently, the front end (ie. basically this file) does everything in + * full precision (with the exception of trans_arl() which doesn't work + * currently.. we reject anything with relative addressing and fallback + * to old compiler). + * + * In the RA step, if half_precision, it will assign the output to hr0.x + * but use full precision everywhere else. + * + * Eventually we'll need a better way to communicate type information + * to RA so that it can more properly assign both half and full precision + * registers. (And presumably double precision pairs for a4xx?) This + * would let us make more use of half precision registers, while still + * keeping things like tex coords in full precision registers. + * + * Since the RA is dealing with patching instruction types for half + * precision output, we can ignore that in the front end and just always + * create full precision instructions. + */ struct fd3_compile_context { const struct tgsi_token *tokens; @@ -2030,7 +2049,7 @@ fd3_compile_shader(struct fd3_shader_variant *so, ir3_dump_instr_list(ctx.block->head); } - ret = ir3_block_ra(ctx.block, so->type); + ret = ir3_block_ra(ctx.block, so->type, key.half_precision); if (ret) goto out; diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c index 7b071b2cd5d..f822aa728fe 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c @@ -103,6 +103,9 @@ fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info) /* do binning pass first: */ .binning_pass = true, .color_two_side = ctx->rasterizer ? ctx->rasterizer->light_twoside : false, + // TODO set .half_precision based on render target format, + // ie. float16 and smaller use half, float32 use full.. + .half_precision = !!(fd_mesa_debug & FD_DBG_FRAGHALF), }; draw_impl(ctx, info, ctx->binning_ring, dirty & ~(FD_DIRTY_BLEND), key); @@ -126,6 +129,7 @@ fd3_clear_binning(struct fd_context *ctx, unsigned dirty) struct fd_ringbuffer *ring = ctx->binning_ring; struct fd3_shader_key key = { .binning_pass = true, + .half_precision = true, }; fd3_emit_state(ctx, ring, &ctx->solid_prog, dirty, key); @@ -166,6 +170,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers, unsigned dirty = ctx->dirty; unsigned ce, i; struct fd3_shader_key key = { + .half_precision = true, }; dirty &= FD_DIRTY_VIEWPORT | FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR; diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c index d1aa8cf1208..dde71ba97b9 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c @@ -44,6 +44,9 @@ #include "fd3_zsa.h" static const struct fd3_shader_key key = { + // XXX should set this based on render target format! We don't + // want half_precision if float32 render target!!! + .half_precision = true, }; static void diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c index 0a7500f1611..34d4dd3330b 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c @@ -101,7 +101,8 @@ create_variant(struct fd3_shader_stateobj *so, struct fd3_shader_key key) v->type = so->type; if (fd_mesa_debug & FD_DBG_DISASM) { - DBG("dump tgsi: type=%d", so->type); + DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", so->type, + key.binning_pass, key.color_two_side, key.half_precision); tgsi_dump(tokens, 0); } @@ -138,7 +139,8 @@ create_variant(struct fd3_shader_stateobj *so, struct fd3_shader_key key) fixup_vp_regfootprint(v); if (fd_mesa_debug & FD_DBG_DISASM) { - DBG("disassemble: type=%d", v->type); + DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type, + key.binning_pass, key.color_two_side, key.half_precision); disasm_a3xx(fd_bo_map(v->bo), v->info.sizedwords, 0, v->type); } diff --git a/src/gallium/drivers/freedreno/a3xx/ir3.h b/src/gallium/drivers/freedreno/a3xx/ir3.h index 894db175076..9327fbdca72 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3.h +++ b/src/gallium/drivers/freedreno/a3xx/ir3.h @@ -379,7 +379,8 @@ void ir3_block_cp(struct ir3_block *block); void ir3_block_sched(struct ir3_block *block); /* register assignment: */ -int ir3_block_ra(struct ir3_block *block, enum shader_t type); +int ir3_block_ra(struct ir3_block *block, enum shader_t type, + bool half_precision); #ifndef ARRAY_SIZE diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c index 06a86ff3b2d..1b3d0e3e1e5 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c +++ b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c @@ -53,10 +53,19 @@ struct ir3_ra_ctx { struct ir3_block *block; enum shader_t type; + bool half_precision; int cnt; bool error; }; +/* sorta ugly way to retrofit half-precision support.. rather than + * passing extra param around, just OR in a high bit. All the low + * value arithmetic (ie. +/- offset within a contiguous vec4, etc) + * will continue to work as long as you don't underflow (and that + * would go badly anyways). + */ +#define REG_HALF 0x8000 + struct ir3_ra_assignment { int8_t off; /* offset of instruction dst within range */ uint8_t num; /* number of components for the range */ @@ -91,7 +100,7 @@ static int output_base(struct ir3_ra_ctx *ctx) * see how because the blob driver always uses r0.x (ie. * all zeros) */ - if (ctx->type == SHADER_FRAGMENT) + if ((ctx->type == SHADER_FRAGMENT) && !ctx->half_precision) return 2; return 0; } @@ -348,12 +357,88 @@ static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v) return (struct ra_assign_visitor *)v; } +static type_t half_type(type_t type) +{ + switch (type) { + case TYPE_F32: return TYPE_F16; + case TYPE_U32: return TYPE_U16; + case TYPE_S32: return TYPE_S16; + /* instructions may already be fixed up: */ + case TYPE_F16: + case TYPE_U16: + case TYPE_S16: + return type; + default: + assert(0); + return ~0; + } +} + +/* some instructions need fix-up if dst register is half precision: */ +static void fixup_half_instr_dst(struct ir3_instruction *instr) +{ + switch (instr->category) { + case 1: /* move instructions */ + instr->cat1.dst_type = half_type(instr->cat1.dst_type); + break; + case 3: + switch (instr->opc) { + case OPC_MAD_F32: + instr->opc = OPC_MAD_F16; + break; + case OPC_SEL_B32: + instr->opc = OPC_SEL_B16; + break; + case OPC_SEL_S32: + instr->opc = OPC_SEL_S16; + break; + case OPC_SEL_F32: + instr->opc = OPC_SEL_F16; + break; + case OPC_SAD_S32: + instr->opc = OPC_SAD_S16; + break; + /* instructions may already be fixed up: */ + case OPC_MAD_F16: + case OPC_SEL_B16: + case OPC_SEL_S16: + case OPC_SEL_F16: + case OPC_SAD_S16: + break; + default: + assert(0); + break; + } + break; + case 5: + instr->cat5.type = half_type(instr->cat5.type); + break; + } +} +/* some instructions need fix-up if src register is half precision: */ +static void fixup_half_instr_src(struct ir3_instruction *instr) +{ + switch (instr->category) { + case 1: /* move instructions */ + instr->cat1.src_type = half_type(instr->cat1.src_type); + break; + } +} + static void ra_assign_reg(struct ir3_visitor *v, struct ir3_instruction *instr, struct ir3_register *reg) { struct ra_assign_visitor *a = ra_assign_visitor(v); reg->flags &= ~IR3_REG_SSA; - reg->num = a->num; + reg->num = a->num & ~REG_HALF; + if (a->num & REG_HALF) { + reg->flags |= IR3_REG_HALF; + /* if dst reg being assigned, patch up the instr: */ + if (reg == instr->regs[0]) + fixup_half_instr_dst(instr); + else + fixup_half_instr_src(instr); + } } static void ra_assign_dst_shader_input(struct ir3_visitor *v, @@ -429,8 +514,8 @@ static void ra_assign(struct ir3_ra_ctx *ctx, /* if we've already visited this instruction, bail now: */ if (ir3_instr_check_mark(assigner)) { - debug_assert(assigner->regs[0]->num == num); - if (assigner->regs[0]->num != num) { + debug_assert(assigner->regs[0]->num == (num & ~REG_HALF)); + if (assigner->regs[0]->num != (num & ~REG_HALF)) { /* impossible situation, should have been resolved * at an earlier stage by inserting extra mov's: */ @@ -593,6 +678,9 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block) base = alloc_block(ctx, NULL, block->noutputs + off); + if (ctx->half_precision) + base |= REG_HALF; + for (i = 0; i < block->noutputs; i++) if (block->outputs[i]) ra_assign(ctx, block->outputs[i], base + i + off); @@ -600,7 +688,7 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block) if (ctx->type == SHADER_FRAGMENT) { for (i = 0; i < block->ninputs; i++) if (block->inputs[i]) - ra_assign(ctx, block->inputs[i], base + i); + ra_assign(ctx, block->inputs[i], (base & ~REG_HALF) + i); } else { for (i = 0; i < block->ninputs; i++) if (block->inputs[i]) @@ -623,11 +711,13 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block) return 0; } -int ir3_block_ra(struct ir3_block *block, enum shader_t type) +int ir3_block_ra(struct ir3_block *block, enum shader_t type, + bool half_precision) { struct ir3_ra_ctx ctx = { .block = block, .type = type, + .half_precision = half_precision, }; ir3_shader_clear_mark(block->shader); return block_ra(&ctx, block); |