6 files changed, 130 insertions, 10 deletions
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
index f52003a47ee..818d5611dd9 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
@@ -48,6 +48,25 @@
 #include "instr-a3xx.h"
 #include "ir3.h"
 
+/* NOTE on half/full precision:
+ * Currently, the front end (i.e. basically this file) does everything in
+ * full precision (with the exception of trans_arl(), which doesn't work
+ * currently; we reject anything with relative addressing and fall back
+ * to the old compiler).
+ *
+ * In the RA step, if half_precision, it will assign the output to hr0.x
+ * but use full precision everywhere else.
+ *
+ * Eventually we'll need a better way to communicate type information
+ * to RA so that it can more properly assign both half and full precision
+ * registers.  (And presumably double precision pairs for a4xx?)  This
+ * would let us make more use of half precision registers, while still
+ * keeping things like tex coords in full precision registers.
+ *
+ * Since the RA is dealing with patching instruction types for half
+ * precision output, we can ignore that in the front end and just always
+ * create full precision instructions.
+ */
 
 struct fd3_compile_context {
 	const struct tgsi_token *tokens;
@@ -2030,7 +2049,7 @@ fd3_compile_shader(struct fd3_shader_variant *so,
 		ir3_dump_instr_list(ctx.block->head);
 	}
 
-	ret = ir3_block_ra(ctx.block, so->type);
+	ret = ir3_block_ra(ctx.block, so->type, key.half_precision);
 	if (ret)
 		goto out;
 
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
index 7b071b2cd5d..f822aa728fe 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@@ -103,6 +103,9 @@ fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info)
 			/* do binning pass first: */
 			.binning_pass = true,
 			.color_two_side = ctx->rasterizer ? ctx->rasterizer->light_twoside : false,
+			// TODO set .half_precision based on render target format,
+			// ie. float16 and smaller use half, float32 use full..
+			.half_precision = !!(fd_mesa_debug & FD_DBG_FRAGHALF),
 	};
 	draw_impl(ctx, info, ctx->binning_ring,
 			dirty & ~(FD_DIRTY_BLEND), key);
@@ -126,6 +129,7 @@ fd3_clear_binning(struct fd_context *ctx, unsigned dirty)
 	struct fd_ringbuffer *ring = ctx->binning_ring;
 	struct fd3_shader_key key = {
 			.binning_pass = true,
+			.half_precision = true,
 	};
 
 	fd3_emit_state(ctx, ring, &ctx->solid_prog, dirty, key);
@@ -166,6 +170,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
 	unsigned dirty = ctx->dirty;
 	unsigned ce, i;
 	struct fd3_shader_key key = {
+			.half_precision = true,
 	};
 
 	dirty &= FD_DIRTY_VIEWPORT | FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
index d1aa8cf1208..dde71ba97b9 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
@@ -44,6 +44,9 @@
 #include "fd3_zsa.h"
 
 static const struct fd3_shader_key key = {
+		// XXX should set this based on render target format!  We don't
+		// want half_precision if float32 render target!!!
+		.half_precision = true,
 };
 
 static void
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
index 0a7500f1611..34d4dd3330b 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@@ -101,7 +101,8 @@ create_variant(struct fd3_shader_stateobj *so, struct fd3_shader_key key)
 	v->type = so->type;
 
 	if (fd_mesa_debug & FD_DBG_DISASM) {
-		DBG("dump tgsi: type=%d", so->type);
+		DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", so->type,
+			key.binning_pass, key.color_two_side, key.half_precision);
 		tgsi_dump(tokens, 0);
 	}
 
@@ -138,7 +139,8 @@ create_variant(struct fd3_shader_stateobj *so, struct fd3_shader_key key)
 		fixup_vp_regfootprint(v);
 
 	if (fd_mesa_debug & FD_DBG_DISASM) {
-		DBG("disassemble: type=%d", v->type);
+		DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
+			key.binning_pass, key.color_two_side, key.half_precision);
 		disasm_a3xx(fd_bo_map(v->bo), v->info.sizedwords, 0, v->type);
 	}
 
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3.h b/src/gallium/drivers/freedreno/a3xx/ir3.h
index 894db175076..9327fbdca72 100644
--- a/src/gallium/drivers/freedreno/a3xx/ir3.h
+++ b/src/gallium/drivers/freedreno/a3xx/ir3.h
@@ -379,7 +379,8 @@ void ir3_block_cp(struct ir3_block *block);
 void ir3_block_sched(struct ir3_block *block);
 
 /* register assignment: */
-int ir3_block_ra(struct ir3_block *block, enum shader_t type);
+int ir3_block_ra(struct ir3_block *block, enum shader_t type,
+		bool half_precision);
 
 
 #ifndef ARRAY_SIZE
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c
index 06a86ff3b2d..1b3d0e3e1e5 100644
--- a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c
@@ -53,10 +53,19 @@
 struct ir3_ra_ctx {
 	struct ir3_block *block;
 	enum shader_t type;
+	bool half_precision;  /* assign block outputs to half-precision regs */
 	int cnt;
 	bool error;
 };
 
+/* Somewhat ugly way to retrofit half-precision support: rather than
+ * passing an extra param around, just OR a high bit into the register
+ * number.  All the low value arithmetic (ie. +/- offset within a
+ * contiguous vec4, etc) will continue to work as long as you don't
+ * underflow (and that would go badly anyways).
+ */
+#define REG_HALF  0x8000
+
 struct ir3_ra_assignment {
 	int8_t  off;        /* offset of instruction dst within range */
 	uint8_t num;        /* number of components for the range */
@@ -91,7 +100,7 @@ static int output_base(struct ir3_ra_ctx *ctx)
 	 * see how because the blob driver always uses r0.x (ie.
 	 * all zeros)
 	 */
-	if (ctx->type == SHADER_FRAGMENT)
+	if ((ctx->type == SHADER_FRAGMENT) && !ctx->half_precision)
 		return 2;
 	return 0;
 }
@@ -348,12 +357,88 @@ static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v)
 	return (struct ra_assign_visitor *)v;
 }
 
+static type_t half_type(type_t type)  /* 32-bit type -> 16-bit equivalent */
+{
+	switch (type) {
+	case TYPE_F32: return TYPE_F16;
+	case TYPE_U32: return TYPE_U16;
+	case TYPE_S32: return TYPE_S16;
+	/* already 16-bit (instruction may already be fixed up), pass through: */
+	case TYPE_F16:
+	case TYPE_U16:
+	case TYPE_S16:
+		return type;
+	default:
+		assert(0);  /* any other type is not expected here */
+		return ~0;
+	}
+}
+
+/* Patch instr to its 16-bit form when its dst register is half precision: */
+static void fixup_half_instr_dst(struct ir3_instruction *instr)
+{
+	switch (instr->category) {
+	case 1: /* move instructions: narrow the dst type */
+		instr->cat1.dst_type = half_type(instr->cat1.dst_type);
+		break;
+	case 3: /* swap the 32-bit opcode for its 16-bit variant */
+		switch (instr->opc) {
+		case OPC_MAD_F32:
+			instr->opc = OPC_MAD_F16;
+			break;
+		case OPC_SEL_B32:
+			instr->opc = OPC_SEL_B16;
+			break;
+		case OPC_SEL_S32:
+			instr->opc = OPC_SEL_S16;
+			break;
+		case OPC_SEL_F32:
+			instr->opc = OPC_SEL_F16;
+			break;
+		case OPC_SAD_S32:
+			instr->opc = OPC_SAD_S16;
+			break;
+		/* instructions may already be fixed up, nothing to do: */
+		case OPC_MAD_F16:
+		case OPC_SEL_B16:
+		case OPC_SEL_S16:
+		case OPC_SEL_F16:
+		case OPC_SAD_S16:
+			break;
+		default:
+			assert(0);  /* opcode with no 16-bit mapping listed above */
+			break;
+		}
+		break;
+	case 5: /* narrow the cat5 type field */
+		instr->cat5.type = half_type(instr->cat5.type);
+		break;
+	}  /* no default: other categories are left unchanged */
+}
+/* Patch instr when a src register is half precision (only cat1 handled): */
+static void fixup_half_instr_src(struct ir3_instruction *instr)
+{
+	switch (instr->category) {
+	case 1: /* move instructions: narrow the src type */
+		instr->cat1.src_type = half_type(instr->cat1.src_type);
+		break;
+	}
+}
+
 static void ra_assign_reg(struct ir3_visitor *v,
 		struct ir3_instruction *instr, struct ir3_register *reg)
 {
 	struct ra_assign_visitor *a = ra_assign_visitor(v);
 	reg->flags &= ~IR3_REG_SSA;
-	reg->num = a->num;
+	reg->num = a->num & ~REG_HALF;  /* strip the half-precision marker bit */
+	if (a->num & REG_HALF) {
+		reg->flags |= IR3_REG_HALF;
+		/* patch up the instr: regs[0] is the dst, any other reg a src: */
+		if (reg == instr->regs[0])
+			fixup_half_instr_dst(instr);
+		else
+			fixup_half_instr_src(instr);
+	}
 }
 
 static void ra_assign_dst_shader_input(struct ir3_visitor *v,
@@ -429,8 +514,8 @@ static void ra_assign(struct ir3_ra_ctx *ctx,
 
 	/* if we've already visited this instruction, bail now: */
 	if (ir3_instr_check_mark(assigner)) {
-		debug_assert(assigner->regs[0]->num == num);
-		if (assigner->regs[0]->num != num) {
+		debug_assert(assigner->regs[0]->num == (num & ~REG_HALF));
+		if (assigner->regs[0]->num != (num & ~REG_HALF)) {
 			/* impossible situation, should have been resolved
 			 * at an earlier stage by inserting extra mov's:
 			 */
@@ -593,6 +678,9 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 
 		base = alloc_block(ctx, NULL, block->noutputs + off);
 
+		if (ctx->half_precision)
+			base |= REG_HALF;
+
 		for (i = 0; i < block->noutputs; i++)
 			if (block->outputs[i])
 				ra_assign(ctx, block->outputs[i], base + i + off);
@@ -600,7 +688,7 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 		if (ctx->type == SHADER_FRAGMENT) {
 			for (i = 0; i < block->ninputs; i++)
 				if (block->inputs[i])
-					ra_assign(ctx, block->inputs[i], base + i);
+					ra_assign(ctx, block->inputs[i], (base & ~REG_HALF) + i);
 		} else {
 			for (i = 0; i < block->ninputs; i++)
 				if (block->inputs[i])
@@ -623,11 +711,13 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 	return 0;
 }
 
-int ir3_block_ra(struct ir3_block *block, enum shader_t type)
+int ir3_block_ra(struct ir3_block *block, enum shader_t type,
+		bool half_precision)
 {
 	struct ir3_ra_ctx ctx = {
 			.block = block,
 			.type = type,
+			.half_precision = half_precision,
 	};
 	ir3_shader_clear_mark(block->shader);
 	return block_ra(&ctx, block);