summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- src/gallium/drivers/freedreno/a3xx/fd3_compiler.c 21
-rw-r--r-- src/gallium/drivers/freedreno/a3xx/fd3_draw.c 5
-rw-r--r-- src/gallium/drivers/freedreno/a3xx/fd3_gmem.c 3
-rw-r--r-- src/gallium/drivers/freedreno/a3xx/fd3_program.c 6
-rw-r--r-- src/gallium/drivers/freedreno/a3xx/ir3.h 3
-rw-r--r-- src/gallium/drivers/freedreno/a3xx/ir3_ra.c 102
6 files changed, 130 insertions, 10 deletions
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
index f52003a47ee..818d5611dd9 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
@@ -48,6 +48,25 @@
#include "instr-a3xx.h"
#include "ir3.h"
+/* NOTE on half/full precision:
+ * Currently, the front end (ie. basically this file) does everything in
+ * full precision (with the exception of trans_arl(), which doesn't work
+ * currently: we reject anything with relative addressing and fall back
+ * to the old compiler).
+ *
+ * In the RA step, if half_precision, it will assign the output to hr0.x
+ * but use full precision everywhere else.
+ *
+ * Eventually we'll need a better way to communicate type information
+ * to RA so that it can more properly assign both half and full precision
+ * registers. (And presumably double precision pairs for a4xx?) This
+ * would let us make more use of half precision registers, while still
+ * keeping things like tex coords in full precision registers.
+ *
+ * Since the RA is dealing with patching instruction types for half
+ * precision output, we can ignore that in the front end and just always
+ * create full precision instructions.
+ */
struct fd3_compile_context {
const struct tgsi_token *tokens;
@@ -2030,7 +2049,7 @@ fd3_compile_shader(struct fd3_shader_variant *so,
ir3_dump_instr_list(ctx.block->head);
}
- ret = ir3_block_ra(ctx.block, so->type);
+ ret = ir3_block_ra(ctx.block, so->type, key.half_precision);
if (ret)
goto out;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
index 7b071b2cd5d..f822aa728fe 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@@ -103,6 +103,9 @@ fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info)
/* do binning pass first: */
.binning_pass = true,
.color_two_side = ctx->rasterizer ? ctx->rasterizer->light_twoside : false,
+ // TODO set .half_precision based on render target format,
+ // ie. float16 and smaller use half, float32 use full..
+ .half_precision = !!(fd_mesa_debug & FD_DBG_FRAGHALF),
};
draw_impl(ctx, info, ctx->binning_ring,
dirty & ~(FD_DIRTY_BLEND), key);
@@ -126,6 +129,7 @@ fd3_clear_binning(struct fd_context *ctx, unsigned dirty)
struct fd_ringbuffer *ring = ctx->binning_ring;
struct fd3_shader_key key = {
.binning_pass = true,
+ .half_precision = true,
};
fd3_emit_state(ctx, ring, &ctx->solid_prog, dirty, key);
@@ -166,6 +170,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
unsigned dirty = ctx->dirty;
unsigned ce, i;
struct fd3_shader_key key = {
+ .half_precision = true,
};
dirty &= FD_DIRTY_VIEWPORT | FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
index d1aa8cf1208..dde71ba97b9 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
@@ -44,6 +44,9 @@
#include "fd3_zsa.h"
static const struct fd3_shader_key key = {
+ // XXX should set this based on render target format! We don't
+ // want half_precision if float32 render target!!!
+ .half_precision = true,
};
static void
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
index 0a7500f1611..34d4dd3330b 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@@ -101,7 +101,8 @@ create_variant(struct fd3_shader_stateobj *so, struct fd3_shader_key key)
v->type = so->type;
if (fd_mesa_debug & FD_DBG_DISASM) {
- DBG("dump tgsi: type=%d", so->type);
+ DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", so->type,
+ key.binning_pass, key.color_two_side, key.half_precision);
tgsi_dump(tokens, 0);
}
@@ -138,7 +139,8 @@ create_variant(struct fd3_shader_stateobj *so, struct fd3_shader_key key)
fixup_vp_regfootprint(v);
if (fd_mesa_debug & FD_DBG_DISASM) {
- DBG("disassemble: type=%d", v->type);
+ DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
+ key.binning_pass, key.color_two_side, key.half_precision);
disasm_a3xx(fd_bo_map(v->bo), v->info.sizedwords, 0, v->type);
}
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3.h b/src/gallium/drivers/freedreno/a3xx/ir3.h
index 894db175076..9327fbdca72 100644
--- a/src/gallium/drivers/freedreno/a3xx/ir3.h
+++ b/src/gallium/drivers/freedreno/a3xx/ir3.h
@@ -379,7 +379,8 @@ void ir3_block_cp(struct ir3_block *block);
void ir3_block_sched(struct ir3_block *block);
/* register assignment: */
-int ir3_block_ra(struct ir3_block *block, enum shader_t type);
+int ir3_block_ra(struct ir3_block *block, enum shader_t type,
+ bool half_precision);
#ifndef ARRAY_SIZE
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c
index 06a86ff3b2d..1b3d0e3e1e5 100644
--- a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c
@@ -53,10 +53,19 @@
struct ir3_ra_ctx {
struct ir3_block *block;
enum shader_t type;
+ bool half_precision;
int cnt;
bool error;
};
+/* Somewhat ugly way to retrofit half-precision support: rather than
+ * passing an extra param around, just OR a high bit into the register
+ * number. All the low-value arithmetic (i.e. +/- offset within a
+ * contiguous vec4, etc.) will continue to work as long as you don't
+ * underflow (and that would go badly anyway).
+ */
+#define REG_HALF 0x8000
+
struct ir3_ra_assignment {
int8_t off; /* offset of instruction dst within range */
uint8_t num; /* number of components for the range */
@@ -91,7 +100,7 @@ static int output_base(struct ir3_ra_ctx *ctx)
* see how because the blob driver always uses r0.x (ie.
* all zeros)
*/
- if (ctx->type == SHADER_FRAGMENT)
+ if ((ctx->type == SHADER_FRAGMENT) && !ctx->half_precision)
return 2;
return 0;
}
@@ -348,12 +357,88 @@ static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v)
return (struct ra_assign_visitor *)v;
}
+static type_t half_type(type_t type) /* full-precision type -> its half-precision counterpart */
+{
+ switch (type) {
+ case TYPE_F32: return TYPE_F16;
+ case TYPE_U32: return TYPE_U16;
+ case TYPE_S32: return TYPE_S16;
+ /* instructions may already be fixed up, in which case the type is
+  * already one of the half types and is returned unchanged: */
+ case TYPE_F16:
+ case TYPE_U16:
+ case TYPE_S16:
+ return type;
+ default: /* any type not listed above is unexpected here */
+ assert(0);
+ return ~0; /* only reached when the assert above is compiled out */
+ }
+}
+
+/* some instructions need fix-up if dst register is half precision:
+ *
+ *  - category 1 (moves): cat1.dst_type is replaced by half_type() of
+ *    itself
+ *  - category 3: the *32 opcode (MAD_F32, SEL_B32, SEL_S32, SEL_F32,
+ *    SAD_S32) is replaced by the matching *16 opcode; opcodes that are
+ *    already the *16 form are left alone, and any other cat3 opcode
+ *    hits assert(0)
+ *  - category 5: cat5.type is replaced by half_type() of itself
+ *
+ * Instructions of any other category are not modified.
+ */
+static void fixup_half_instr_dst(struct ir3_instruction *instr)
+{
+ switch (instr->category) {
+ case 1: /* move instructions */
+ instr->cat1.dst_type = half_type(instr->cat1.dst_type);
+ break;
+ case 3:
+ switch (instr->opc) {
+ case OPC_MAD_F32:
+ instr->opc = OPC_MAD_F16;
+ break;
+ case OPC_SEL_B32:
+ instr->opc = OPC_SEL_B16;
+ break;
+ case OPC_SEL_S32:
+ instr->opc = OPC_SEL_S16;
+ break;
+ case OPC_SEL_F32:
+ instr->opc = OPC_SEL_F16;
+ break;
+ case OPC_SAD_S32:
+ instr->opc = OPC_SAD_S16;
+ break;
+ /* instructions may already be fixed up: */
+ case OPC_MAD_F16:
+ case OPC_SEL_B16:
+ case OPC_SEL_S16:
+ case OPC_SEL_F16:
+ case OPC_SAD_S16:
+ break;
+ default: /* any cat3 opcode not listed above */
+ assert(0);
+ break;
+ }
+ break;
+ case 5:
+ instr->cat5.type = half_type(instr->cat5.type);
+ break;
+ }
+}
+/* some instructions need fix-up if src register is half precision:
+ *
+ * Only category 1 (move) instructions are patched here: cat1.src_type
+ * is replaced by half_type() of itself.  Instructions of every other
+ * category are left unmodified.
+ */
+static void fixup_half_instr_src(struct ir3_instruction *instr)
+{
+ switch (instr->category) {
+ case 1: /* move instructions */
+ instr->cat1.src_type = half_type(instr->cat1.src_type);
+ break;
+ }
+}
+
static void ra_assign_reg(struct ir3_visitor *v,
struct ir3_instruction *instr, struct ir3_register *reg)
{
struct ra_assign_visitor *a = ra_assign_visitor(v);
reg->flags &= ~IR3_REG_SSA;
- reg->num = a->num;
+ reg->num = a->num & ~REG_HALF; /* REG_HALF is only a marker bit; store the number without it */
+ if (a->num & REG_HALF) {
+ reg->flags |= IR3_REG_HALF;
+ /* if dst reg (ie. regs[0]) being assigned, patch up the instr's
+  * dst side; any other reg is handled as a src: */
+ if (reg == instr->regs[0])
+ fixup_half_instr_dst(instr);
+ else
+ fixup_half_instr_src(instr);
+ }
}
static void ra_assign_dst_shader_input(struct ir3_visitor *v,
@@ -429,8 +514,8 @@ static void ra_assign(struct ir3_ra_ctx *ctx,
/* if we've already visited this instruction, bail now: */
if (ir3_instr_check_mark(assigner)) {
- debug_assert(assigner->regs[0]->num == num);
- if (assigner->regs[0]->num != num) {
+ debug_assert(assigner->regs[0]->num == (num & ~REG_HALF));
+ if (assigner->regs[0]->num != (num & ~REG_HALF)) {
/* impossible situation, should have been resolved
* at an earlier stage by inserting extra mov's:
*/
@@ -593,6 +678,9 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
base = alloc_block(ctx, NULL, block->noutputs + off);
+ if (ctx->half_precision)
+ base |= REG_HALF;
+
for (i = 0; i < block->noutputs; i++)
if (block->outputs[i])
ra_assign(ctx, block->outputs[i], base + i + off);
@@ -600,7 +688,7 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
if (ctx->type == SHADER_FRAGMENT) {
for (i = 0; i < block->ninputs; i++)
if (block->inputs[i])
- ra_assign(ctx, block->inputs[i], base + i);
+ ra_assign(ctx, block->inputs[i], (base & ~REG_HALF) + i);
} else {
for (i = 0; i < block->ninputs; i++)
if (block->inputs[i])
@@ -623,11 +711,13 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
return 0;
}
-int ir3_block_ra(struct ir3_block *block, enum shader_t type)
+int ir3_block_ra(struct ir3_block *block, enum shader_t type,
+ bool half_precision)
{
struct ir3_ra_ctx ctx = {
.block = block,
.type = type,
+ .half_precision = half_precision,
};
ir3_shader_clear_mark(block->shader);
return block_ra(&ctx, block);