author		Rob Clark <[email protected]>	2013-05-26 17:13:27 -0400
committer	Rob Clark <[email protected]>	2013-06-08 13:15:51 -0400
commit		18c317b21ddc2ec4538544f9dd69dc568dcf821f (patch)
tree		224e3c0464dcedabbf5784e5c3def952473fd8f8 /src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
parent		213c207b3ac40ae769afe01b8578f566b5e7840d (diff)
freedreno: prepare for a3xx
Split the parts that are specific to adreno a2xx series GPUs from the parts that will be in common with a3xx, so that a3xx support can be added more cleanly.

Signed-off-by: Rob Clark <[email protected]>
Diffstat (limited to 'src/gallium/drivers/freedreno/a2xx/fd2_compiler.c')
-rw-r--r--	src/gallium/drivers/freedreno/a2xx/fd2_compiler.c	1191
1 file changed, 1191 insertions(+), 0 deletions(-)
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
new file mode 100644
index 00000000000..4d0dc323a68
--- /dev/null
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
@@ -0,0 +1,1191 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2012 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#include "pipe/p_state.h"
+#include "util/u_string.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_ureg.h"
+#include "tgsi/tgsi_info.h"
+#include "tgsi/tgsi_strings.h"
+#include "tgsi/tgsi_dump.h"
+
+#include "fd2_compiler.h"
+#include "fd2_program.h"
+#include "fd2_util.h"
+
+#include "instr-a2xx.h"
+#include "ir-a2xx.h"
+
+struct fd2_compile_context {
+ struct fd_program_stateobj *prog;
+ struct fd2_shader_stateobj *so;
+
+ struct tgsi_parse_context parser;
+ unsigned type;
+
+ /* predicate stack: */
+ int pred_depth;
+ enum ir2_pred pred_stack[8];
+
+ /* Internal-Temporary and Predicate register assignment:
+ *
+ * Some TGSI instructions which translate into multiple actual
+ * instructions need one or more temporary registers, which are not
+ * assigned from TGSI perspective (ie. not TGSI_FILE_TEMPORARY).
+ * And some instructions (texture fetch) cannot write directly to
+ * output registers. We could be more clever and re-use dst or a
+ * src register in some cases. But for now don't try to be clever.
+ * Eventually we should implement an optimization pass that re-
+ * juggles the register usage and gets rid of unneeded temporaries.
+ *
+ * The predicate register must be valid across multiple TGSI
+	 * instructions, but internal temporaries do not need to be. For
+	 * this reason, once the predicate register is requested, until it
+	 * is no longer needed, it gets the first register slot after the
+	 * TGSI assigned temporaries (ie. num_regs[TGSI_FILE_TEMPORARY]),
+	 * and the internal temporaries get the register slots above this.
+ */
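+	/* e.g. with two TGSI temporaries: the predicate register, once
+	 * requested, takes slot 2, and internal temporaries then occupy
+	 * slots 3, 4, ... (all mapped to GPR numbers by get_temp_gpr()).
+	 */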
+
+ int pred_reg;
+ int num_internal_temps;
+
+ uint8_t num_regs[TGSI_FILE_COUNT];
+
+ /* maps input register idx to prog->export_linkage idx: */
+ uint8_t input_export_idx[64];
+
+ /* maps output register idx to prog->export_linkage idx: */
+ uint8_t output_export_idx[64];
+
+ /* idx/slot for last compiler generated immediate */
+ unsigned immediate_idx;
+
+	// TODO we could skip emitting exports in the VS that the FS doesn't
+	// need.. and perhaps get rid of num_param..
+ unsigned num_position, num_param;
+ unsigned position, psize;
+
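+	/* bitmask of GPRs written by fetch instrs, used to set the sync
+	 * bit on the first subsequent instruction to read them:
+	 */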
+ uint64_t need_sync;
+
+ /* current exec CF instruction */
+ struct ir2_cf *cf;
+};
+
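+/* flatten a TGSI semantic name/index pair into a single index, with
+ * GENERIC semantics getting distinct slots above the named semantics
+ * (e.g. GENERIC[1] maps to TGSI_SEMANTIC_COUNT + 1):
+ */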
+static int
+semantic_idx(struct tgsi_declaration_semantic *semantic)
+{
+ int idx = semantic->Name;
+ if (idx == TGSI_SEMANTIC_GENERIC)
+ idx = TGSI_SEMANTIC_COUNT + semantic->Index;
+ return idx;
+}
+
+/* assign/get the input/export register # for given semantic idx as
+ * returned by semantic_idx():
+ */
+static int
+export_linkage(struct fd2_compile_context *ctx, int idx)
+{
+ struct fd_program_stateobj *prog = ctx->prog;
+
+ /* if first time we've seen this export, assign the next available slot: */
+ if (prog->export_linkage[idx] == 0xff)
+ prog->export_linkage[idx] = prog->num_exports++;
+
+ return prog->export_linkage[idx];
+}
+
+static unsigned
+compile_init(struct fd2_compile_context *ctx, struct fd_program_stateobj *prog,
+ struct fd2_shader_stateobj *so)
+{
+ unsigned ret;
+
+ ctx->prog = prog;
+ ctx->so = so;
+ ctx->cf = NULL;
+ ctx->pred_depth = 0;
+
+ ret = tgsi_parse_init(&ctx->parser, so->tokens);
+ if (ret != TGSI_PARSE_OK)
+ return ret;
+
+ ctx->type = ctx->parser.FullHeader.Processor.Processor;
+ ctx->position = ~0;
+ ctx->psize = ~0;
+ ctx->num_position = 0;
+ ctx->num_param = 0;
+ ctx->need_sync = 0;
+ ctx->immediate_idx = 0;
+ ctx->pred_reg = -1;
+ ctx->num_internal_temps = 0;
+
+ memset(ctx->num_regs, 0, sizeof(ctx->num_regs));
+ memset(ctx->input_export_idx, 0, sizeof(ctx->input_export_idx));
+ memset(ctx->output_export_idx, 0, sizeof(ctx->output_export_idx));
+
+ /* do first pass to extract declarations: */
+ while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
+ tgsi_parse_token(&ctx->parser);
+
+ switch (ctx->parser.FullToken.Token.Type) {
+ case TGSI_TOKEN_TYPE_DECLARATION: {
+ struct tgsi_full_declaration *decl =
+ &ctx->parser.FullToken.FullDeclaration;
+ if (decl->Declaration.File == TGSI_FILE_OUTPUT) {
+ unsigned name = decl->Semantic.Name;
+
+ assert(decl->Declaration.Semantic); // TODO is this ever not true?
+
+ ctx->output_export_idx[decl->Range.First] =
+ semantic_idx(&decl->Semantic);
+
+ if (ctx->type == TGSI_PROCESSOR_VERTEX) {
+ switch (name) {
+ case TGSI_SEMANTIC_POSITION:
+ ctx->position = ctx->num_regs[TGSI_FILE_OUTPUT];
+ ctx->num_position++;
+ break;
+ case TGSI_SEMANTIC_PSIZE:
+ ctx->psize = ctx->num_regs[TGSI_FILE_OUTPUT];
+ ctx->num_position++;
+ break;
+ case TGSI_SEMANTIC_COLOR:
+ case TGSI_SEMANTIC_GENERIC:
+ ctx->num_param++;
+ break;
+ default:
+ DBG("unknown VS semantic name: %s",
+ tgsi_semantic_names[name]);
+ assert(0);
+ }
+ } else {
+ switch (name) {
+ case TGSI_SEMANTIC_COLOR:
+ case TGSI_SEMANTIC_GENERIC:
+ ctx->num_param++;
+ break;
+ default:
+ DBG("unknown PS semantic name: %s",
+ tgsi_semantic_names[name]);
+ assert(0);
+ }
+ }
+ } else if (decl->Declaration.File == TGSI_FILE_INPUT) {
+ ctx->input_export_idx[decl->Range.First] =
+ semantic_idx(&decl->Semantic);
+ }
+ ctx->num_regs[decl->Declaration.File] =
+ MAX2(ctx->num_regs[decl->Declaration.File], decl->Range.Last + 1);
+ break;
+ }
+ case TGSI_TOKEN_TYPE_IMMEDIATE: {
+ struct tgsi_full_immediate *imm =
+ &ctx->parser.FullToken.FullImmediate;
+ unsigned n = ctx->so->num_immediates++;
+ memcpy(ctx->so->immediates[n].val, imm->u, 16);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+
+	/* TGSI-generated immediates are always entire vec4's; ones we
+ * generate internally are not:
+ */
+ ctx->immediate_idx = ctx->so->num_immediates * 4;
+
+ ctx->so->first_immediate = ctx->num_regs[TGSI_FILE_CONSTANT];
+
+ tgsi_parse_free(&ctx->parser);
+
+ return tgsi_parse_init(&ctx->parser, so->tokens);
+}
+
+static void
+compile_free(struct fd2_compile_context *ctx)
+{
+ tgsi_parse_free(&ctx->parser);
+}
+
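+/* get the current EXEC CF block, starting a new one if none is open
+ * yet or the current one is full:
+ */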
+static struct ir2_cf *
+next_exec_cf(struct fd2_compile_context *ctx)
+{
+ struct ir2_cf *cf = ctx->cf;
+ if (!cf || cf->exec.instrs_count >= ARRAY_SIZE(ctx->cf->exec.instrs))
+ ctx->cf = cf = ir2_cf_create(ctx->so->ir, EXEC);
+ return cf;
+}
+
+static void
+compile_vtx_fetch(struct fd2_compile_context *ctx)
+{
+ struct ir2_instruction **vfetch_instrs = ctx->so->vfetch_instrs;
+ int i;
+ for (i = 0; i < ctx->num_regs[TGSI_FILE_INPUT]; i++) {
+ struct ir2_instruction *instr = ir2_instr_create(
+ next_exec_cf(ctx), IR2_FETCH);
+ instr->fetch.opc = VTX_FETCH;
+
+		ctx->need_sync |= (uint64_t)1 << (i+1);
+
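+		/* dst is R(i+1).xyzw; the src is R0.x, which presumably holds
+		 * the vertex index at the start of the shader:
+		 */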
+ ir2_reg_create(instr, i+1, "xyzw", 0);
+ ir2_reg_create(instr, 0, "x", 0);
+
+ if (i == 0)
+ instr->sync = true;
+
+ vfetch_instrs[i] = instr;
+ }
+ ctx->so->num_vfetch_instrs = i;
+ ctx->cf = NULL;
+}
+
+/*
+ * For vertex shaders (VS):
+ * --- ------ -------------
+ *
+ * Inputs: R1-R(num_input)
+ * Constants: C0-C(num_const-1)
+ * Immediates: C(num_const)-C(num_const+num_imm-1)
+ * Outputs: export0-export(n) and export62, export63
+ * n is # of outputs minus gl_Position (export62) and gl_PointSize (export63)
+ * Temps: R(num_input+1)-R(num_input+num_temps)
+ *
+ * R0 could be clobbered after the vertex fetch instructions.. so we
+ * could use it for one of the temporaries.
+ *
+ * TODO: maybe the vertex fetch part could fetch first input into R0 as
+ * the last vtx fetch instruction, which would let us use the same
+ * register layout in either case.. although this is not what the blob
+ * compiler does.
+ *
+ *
+ * For frag shaders (PS):
+ * --- ---- -------------
+ *
+ * Inputs: R0-R(num_input-1)
+ * Constants: same as VS
+ * Immediates: same as VS
+ * Outputs: export0-export(num_outputs)
+ * Temps: R(num_input)-R(num_input+num_temps-1)
+ *
+ * In either case, immediates are appended after the constants
+ * (uniforms).
+ *
+ */
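+/* As a made-up example of the above layout: a VS with two inputs, one
+ * constant, one immediate, three outputs (position plus two varyings)
+ * and one temporary would use:
+ *
+ *   Inputs:     R1-R2
+ *   Constants:  C0
+ *   Immediates: C1
+ *   Outputs:    export0, export1 (varyings), export62 (position)
+ *   Temps:      R3
+ */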
+
+static unsigned
+get_temp_gpr(struct fd2_compile_context *ctx, int idx)
+{
+ unsigned num = idx + ctx->num_regs[TGSI_FILE_INPUT];
+ if (ctx->type == TGSI_PROCESSOR_VERTEX)
+ num++;
+ return num;
+}
+
+static struct ir2_register *
+add_dst_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
+ const struct tgsi_dst_register *dst)
+{
+ unsigned flags = 0, num = 0;
+ char swiz[5];
+
+ switch (dst->File) {
+ case TGSI_FILE_OUTPUT:
+ flags |= IR2_REG_EXPORT;
+ if (ctx->type == TGSI_PROCESSOR_VERTEX) {
+ if (dst->Index == ctx->position) {
+ num = 62;
+ } else if (dst->Index == ctx->psize) {
+ num = 63;
+ } else {
+ num = export_linkage(ctx,
+ ctx->output_export_idx[dst->Index]);
+ }
+ } else {
+ num = dst->Index;
+ }
+ break;
+ case TGSI_FILE_TEMPORARY:
+ num = get_temp_gpr(ctx, dst->Index);
+ break;
+ default:
+ DBG("unsupported dst register file: %s",
+ tgsi_file_name(dst->File));
+ assert(0);
+ break;
+ }
+
+ swiz[0] = (dst->WriteMask & TGSI_WRITEMASK_X) ? 'x' : '_';
+ swiz[1] = (dst->WriteMask & TGSI_WRITEMASK_Y) ? 'y' : '_';
+ swiz[2] = (dst->WriteMask & TGSI_WRITEMASK_Z) ? 'z' : '_';
+ swiz[3] = (dst->WriteMask & TGSI_WRITEMASK_W) ? 'w' : '_';
+ swiz[4] = '\0';
+
+ return ir2_reg_create(alu, num, swiz, flags);
+}
+
+static struct ir2_register *
+add_src_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
+ const struct tgsi_src_register *src)
+{
+ static const char swiz_vals[] = {
+ 'x', 'y', 'z', 'w',
+ };
+ char swiz[5];
+ unsigned flags = 0, num = 0;
+
+ switch (src->File) {
+ case TGSI_FILE_CONSTANT:
+ num = src->Index;
+ flags |= IR2_REG_CONST;
+ break;
+ case TGSI_FILE_INPUT:
+ if (ctx->type == TGSI_PROCESSOR_VERTEX) {
+ num = src->Index + 1;
+ } else {
+ num = export_linkage(ctx,
+ ctx->input_export_idx[src->Index]);
+ }
+ break;
+ case TGSI_FILE_TEMPORARY:
+ num = get_temp_gpr(ctx, src->Index);
+ break;
+ case TGSI_FILE_IMMEDIATE:
+ num = src->Index + ctx->num_regs[TGSI_FILE_CONSTANT];
+ flags |= IR2_REG_CONST;
+ break;
+ default:
+ DBG("unsupported src register file: %s",
+ tgsi_file_name(src->File));
+ assert(0);
+ break;
+ }
+
+ if (src->Absolute)
+ flags |= IR2_REG_ABS;
+ if (src->Negate)
+ flags |= IR2_REG_NEGATE;
+
+ swiz[0] = swiz_vals[src->SwizzleX];
+ swiz[1] = swiz_vals[src->SwizzleY];
+ swiz[2] = swiz_vals[src->SwizzleZ];
+ swiz[3] = swiz_vals[src->SwizzleW];
+ swiz[4] = '\0';
+
+	if ((ctx->need_sync & ((uint64_t)1 << num)) &&
+			!(flags & IR2_REG_CONST)) {
+		alu->sync = true;
+		ctx->need_sync &= ~((uint64_t)1 << num);
+ }
+
+ return ir2_reg_create(alu, num, swiz, flags);
+}
+
+static void
+add_vector_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
+{
+ switch (inst->Instruction.Saturate) {
+ case TGSI_SAT_NONE:
+ break;
+ case TGSI_SAT_ZERO_ONE:
+ alu->alu.vector_clamp = true;
+ break;
+ case TGSI_SAT_MINUS_PLUS_ONE:
+ DBG("unsupported saturate");
+ assert(0);
+ break;
+ }
+}
+
+static void
+add_scalar_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
+{
+ switch (inst->Instruction.Saturate) {
+ case TGSI_SAT_NONE:
+ break;
+ case TGSI_SAT_ZERO_ONE:
+ alu->alu.scalar_clamp = true;
+ break;
+ case TGSI_SAT_MINUS_PLUS_ONE:
+ DBG("unsupported saturate");
+ assert(0);
+ break;
+ }
+}
+
+static void
+add_regs_vector_1(struct fd2_compile_context *ctx,
+ struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
+{
+ assert(inst->Instruction.NumSrcRegs == 1);
+ assert(inst->Instruction.NumDstRegs == 1);
+
+ add_dst_reg(ctx, alu, &inst->Dst[0].Register);
+ add_src_reg(ctx, alu, &inst->Src[0].Register);
+ add_src_reg(ctx, alu, &inst->Src[0].Register);
+ add_vector_clamp(inst, alu);
+}
+
+static void
+add_regs_vector_2(struct fd2_compile_context *ctx,
+ struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
+{
+ assert(inst->Instruction.NumSrcRegs == 2);
+ assert(inst->Instruction.NumDstRegs == 1);
+
+ add_dst_reg(ctx, alu, &inst->Dst[0].Register);
+ add_src_reg(ctx, alu, &inst->Src[0].Register);
+ add_src_reg(ctx, alu, &inst->Src[1].Register);
+ add_vector_clamp(inst, alu);
+}
+
+static void
+add_regs_vector_3(struct fd2_compile_context *ctx,
+ struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
+{
+ assert(inst->Instruction.NumSrcRegs == 3);
+ assert(inst->Instruction.NumDstRegs == 1);
+
+ add_dst_reg(ctx, alu, &inst->Dst[0].Register);
+ /* maybe should re-arrange the syntax some day, but
+ * in assembler/disassembler and what ir.c expects
+	 * is: MULADDv Rdst = Rsrc2 + Rsrc0 * Rsrc1
+ */
+ add_src_reg(ctx, alu, &inst->Src[2].Register);
+ add_src_reg(ctx, alu, &inst->Src[0].Register);
+ add_src_reg(ctx, alu, &inst->Src[1].Register);
+ add_vector_clamp(inst, alu);
+}
+
+static void
+add_regs_dummy_vector(struct ir2_instruction *alu)
+{
+ /* create dummy, non-written vector dst/src regs
+ * for unused vector instr slot:
+ */
+ ir2_reg_create(alu, 0, "____", 0); /* vector dst */
+ ir2_reg_create(alu, 0, NULL, 0); /* vector src1 */
+ ir2_reg_create(alu, 0, NULL, 0); /* vector src2 */
+}
+
+static void
+add_regs_scalar_1(struct fd2_compile_context *ctx,
+ struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
+{
+ assert(inst->Instruction.NumSrcRegs == 1);
+ assert(inst->Instruction.NumDstRegs == 1);
+
+ add_regs_dummy_vector(alu);
+
+ add_dst_reg(ctx, alu, &inst->Dst[0].Register);
+ add_src_reg(ctx, alu, &inst->Src[0].Register);
+ add_scalar_clamp(inst, alu);
+}
+
+/*
+ * Helpers for TGSI instructions that don't map to a single shader instr:
+ */
+
+static void
+src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
+{
+ src->File = dst->File;
+ src->Indirect = dst->Indirect;
+ src->Dimension = dst->Dimension;
+ src->Index = dst->Index;
+ src->Absolute = 0;
+ src->Negate = 0;
+ src->SwizzleX = TGSI_SWIZZLE_X;
+ src->SwizzleY = TGSI_SWIZZLE_Y;
+ src->SwizzleZ = TGSI_SWIZZLE_Z;
+ src->SwizzleW = TGSI_SWIZZLE_W;
+}
+
+/* Get internal-temp src/dst to use for a sequence of instructions
+ * generated by a single TGSI op.
+ */
+static void
+get_internal_temp(struct fd2_compile_context *ctx,
+ struct tgsi_dst_register *tmp_dst,
+ struct tgsi_src_register *tmp_src)
+{
+ int n;
+
+ tmp_dst->File = TGSI_FILE_TEMPORARY;
+ tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
+ tmp_dst->Indirect = 0;
+ tmp_dst->Dimension = 0;
+
+ /* assign next temporary: */
+ n = ctx->num_internal_temps++;
+ if (ctx->pred_reg != -1)
+ n++;
+
+ tmp_dst->Index = ctx->num_regs[TGSI_FILE_TEMPORARY] + n;
+
+ src_from_dst(tmp_src, tmp_dst);
+}
+
+static void
+get_predicate(struct fd2_compile_context *ctx, struct tgsi_dst_register *dst,
+ struct tgsi_src_register *src)
+{
+ assert(ctx->pred_reg != -1);
+
+ dst->File = TGSI_FILE_TEMPORARY;
+ dst->WriteMask = TGSI_WRITEMASK_W;
+ dst->Indirect = 0;
+ dst->Dimension = 0;
+ dst->Index = get_temp_gpr(ctx, ctx->pred_reg);
+
+ if (src) {
+ src_from_dst(src, dst);
+ src->SwizzleX = TGSI_SWIZZLE_W;
+ src->SwizzleY = TGSI_SWIZZLE_W;
+ src->SwizzleZ = TGSI_SWIZZLE_W;
+ src->SwizzleW = TGSI_SWIZZLE_W;
+ }
+}
+
+static void
+push_predicate(struct fd2_compile_context *ctx, struct tgsi_src_register *src)
+{
+ struct ir2_instruction *alu;
+ struct tgsi_dst_register pred_dst;
+
+	/* NOTE blob compiler seems to always put PRED_* instrs in a CF by
+ * themselves:
+ */
+ ctx->cf = NULL;
+
+ if (ctx->pred_depth == 0) {
+ /* assign predicate register: */
+ ctx->pred_reg = ctx->num_regs[TGSI_FILE_TEMPORARY];
+
+ get_predicate(ctx, &pred_dst, NULL);
+
+ alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, PRED_SETNEs);
+ add_regs_dummy_vector(alu);
+ add_dst_reg(ctx, alu, &pred_dst);
+ add_src_reg(ctx, alu, src);
+ } else {
+ struct tgsi_src_register pred_src;
+
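+		/* nested IF: combine the new condition with the enclosing
+		 * one by multiplying the predicate values, so either being
+		 * false (zero) leaves the result false:
+		 */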
+ get_predicate(ctx, &pred_dst, &pred_src);
+
+ alu = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
+ add_dst_reg(ctx, alu, &pred_dst);
+ add_src_reg(ctx, alu, &pred_src);
+ add_src_reg(ctx, alu, src);
+
+ // XXX need to make PRED_SETE_PUSHv IR2_PRED_NONE.. but need to make
+ // sure src reg is valid if it was calculated with a predicate
+ // condition..
+ alu->pred = IR2_PRED_NONE;
+ }
+
+ /* save previous pred state to restore in pop_predicate(): */
+ ctx->pred_stack[ctx->pred_depth++] = ctx->so->ir->pred;
+
+ ctx->cf = NULL;
+}
+
+static void
+pop_predicate(struct fd2_compile_context *ctx)
+{
+	/* NOTE blob compiler seems to always put PRED_* instrs in a CF by
+ * themselves:
+ */
+ ctx->cf = NULL;
+
+ /* restore previous predicate state: */
+ ctx->so->ir->pred = ctx->pred_stack[--ctx->pred_depth];
+
+ if (ctx->pred_depth != 0) {
+ struct ir2_instruction *alu;
+ struct tgsi_dst_register pred_dst;
+ struct tgsi_src_register pred_src;
+
+ get_predicate(ctx, &pred_dst, &pred_src);
+
+ alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, PRED_SET_POPs);
+ add_regs_dummy_vector(alu);
+ add_dst_reg(ctx, alu, &pred_dst);
+ add_src_reg(ctx, alu, &pred_src);
+ alu->pred = IR2_PRED_NONE;
+ } else {
+ /* predicate register no longer needed: */
+ ctx->pred_reg = -1;
+ }
+
+ ctx->cf = NULL;
+}
+
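+/* find or allocate a const slot for an immediate value; scalar
+ * immediates are packed four to a vec4 slot, and a slot is re-used if
+ * it already holds the same value (or its negation, in which case the
+ * negate flag is set on the src):
+ */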
+static void
+get_immediate(struct fd2_compile_context *ctx,
+ struct tgsi_src_register *reg, uint32_t val)
+{
+ unsigned neg, swiz, idx, i;
+ /* actually maps 1:1 currently.. not sure if that is safe to rely on: */
+ static const unsigned swiz2tgsi[] = {
+ TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
+ };
+
+ for (i = 0; i < ctx->immediate_idx; i++) {
+ swiz = i % 4;
+ idx = i / 4;
+
+ if (ctx->so->immediates[idx].val[swiz] == val) {
+ neg = 0;
+ break;
+ }
+
+		/* note: negating a float flips the sign bit rather than
+		 * taking the two's complement:
+		 */
+		if (ctx->so->immediates[idx].val[swiz] == (val ^ 0x80000000)) {
+ neg = 1;
+ break;
+ }
+ }
+
+ if (i == ctx->immediate_idx) {
+ /* need to generate a new immediate: */
+ swiz = i % 4;
+ idx = i / 4;
+ neg = 0;
+ ctx->so->immediates[idx].val[swiz] = val;
+ ctx->so->num_immediates = idx + 1;
+ ctx->immediate_idx++;
+ }
+
+ reg->File = TGSI_FILE_IMMEDIATE;
+ reg->Indirect = 0;
+ reg->Dimension = 0;
+ reg->Index = idx;
+ reg->Absolute = 0;
+ reg->Negate = neg;
+ reg->SwizzleX = swiz2tgsi[swiz];
+ reg->SwizzleY = swiz2tgsi[swiz];
+ reg->SwizzleZ = swiz2tgsi[swiz];
+ reg->SwizzleW = swiz2tgsi[swiz];
+}
+
+/* POW(a,b) = EXP2(b * LOG2(a)) */
+static void
+translate_pow(struct fd2_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct tgsi_dst_register tmp_dst;
+ struct tgsi_src_register tmp_src;
+ struct ir2_instruction *alu;
+
+ get_internal_temp(ctx, &tmp_dst, &tmp_src);
+
+ alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, LOG_CLAMP);
+ add_regs_dummy_vector(alu);
+ add_dst_reg(ctx, alu, &tmp_dst);
+ add_src_reg(ctx, alu, &inst->Src[0].Register);
+
+ alu = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
+ add_dst_reg(ctx, alu, &tmp_dst);
+ add_src_reg(ctx, alu, &tmp_src);
+ add_src_reg(ctx, alu, &inst->Src[1].Register);
+
+ /* NOTE: some of the instructions, like EXP_IEEE, seem hard-
+ * coded to take their input from the w component.
+ */
+ switch(inst->Dst[0].Register.WriteMask) {
+ case TGSI_WRITEMASK_X:
+ tmp_src.SwizzleW = TGSI_SWIZZLE_X;
+ break;
+ case TGSI_WRITEMASK_Y:
+ tmp_src.SwizzleW = TGSI_SWIZZLE_Y;
+ break;
+ case TGSI_WRITEMASK_Z:
+ tmp_src.SwizzleW = TGSI_SWIZZLE_Z;
+ break;
+ case TGSI_WRITEMASK_W:
+ tmp_src.SwizzleW = TGSI_SWIZZLE_W;
+ break;
+ default:
+ DBG("invalid writemask!");
+ assert(0);
+ break;
+ }
+
+ alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, EXP_IEEE);
+ add_regs_dummy_vector(alu);
+ add_dst_reg(ctx, alu, &inst->Dst[0].Register);
+ add_src_reg(ctx, alu, &tmp_src);
+ add_scalar_clamp(inst, alu);
+}
+
+static void
+translate_tex(struct fd2_compile_context *ctx,
+ struct tgsi_full_instruction *inst, unsigned opc)
+{
+ struct ir2_instruction *instr;
+ struct ir2_register *reg;
+ struct tgsi_dst_register tmp_dst;
+ struct tgsi_src_register tmp_src;
+ const struct tgsi_src_register *coord;
+ bool using_temp = (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) ||
+ (inst->Instruction.Saturate != TGSI_SAT_NONE);
+ int idx;
+
+ if (using_temp || (opc == TGSI_OPCODE_TXP))
+ get_internal_temp(ctx, &tmp_dst, &tmp_src);
+
+ if (opc == TGSI_OPCODE_TXP) {
+ static const char *swiz[] = {
+ [TGSI_SWIZZLE_X] = "xxxx",
+ [TGSI_SWIZZLE_Y] = "yyyy",
+ [TGSI_SWIZZLE_Z] = "zzzz",
+ [TGSI_SWIZZLE_W] = "wwww",
+ };
+
+ /* TXP - Projective Texture Lookup:
+ *
+	 * coord.x = src0.x / src0.w
+	 * coord.y = src0.y / src0.w
+	 * coord.z = src0.z / src0.w
+ * coord.w = src0.w
+ * bias = 0.0
+ *
+ * dst = texture_sample(unit, coord, bias)
+ */
+ instr = ir2_instr_create_alu(next_exec_cf(ctx), MAXv, RECIP_IEEE);
+
+ /* MAXv: */
+ add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "___w";
+ add_src_reg(ctx, instr, &inst->Src[0].Register);
+ add_src_reg(ctx, instr, &inst->Src[0].Register);
+
+ /* RECIP_IEEE: */
+ add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "x___";
+ add_src_reg(ctx, instr, &inst->Src[0].Register)->swizzle =
+ swiz[inst->Src[0].Register.SwizzleW];
+
+ instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
+ add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "xyz_";
+ add_src_reg(ctx, instr, &tmp_src)->swizzle = "xxxx";
+ add_src_reg(ctx, instr, &inst->Src[0].Register);
+
+ coord = &tmp_src;
+ } else {
+ coord = &inst->Src[0].Register;
+ }
+
+ instr = ir2_instr_create(next_exec_cf(ctx), IR2_FETCH);
+ instr->fetch.opc = TEX_FETCH;
+ instr->fetch.is_cube = (inst->Texture.Texture == TGSI_TEXTURE_3D);
+ assert(inst->Texture.NumOffsets <= 1); // TODO what to do in other cases?
+
+ /* save off the tex fetch to be patched later with correct const_idx: */
+ idx = ctx->so->num_tfetch_instrs++;
+ ctx->so->tfetch_instrs[idx].samp_id = inst->Src[1].Register.Index;
+ ctx->so->tfetch_instrs[idx].instr = instr;
+
+ add_dst_reg(ctx, instr, using_temp ? &tmp_dst : &inst->Dst[0].Register);
+ reg = add_src_reg(ctx, instr, coord);
+
+ /* blob compiler always sets 3rd component to same as 1st for 2d: */
+ if (inst->Texture.Texture == TGSI_TEXTURE_2D)
+ reg->swizzle[2] = reg->swizzle[0];
+
+ /* dst register needs to be marked for sync: */
+	ctx->need_sync |= (uint64_t)1 << instr->regs[0]->num;
+
+ /* TODO we need some way to know if the tex fetch needs to sync on alu pipe.. */
+ instr->sync = true;
+
+ if (using_temp) {
+		/* texture fetch can't write directly to export, so if tgsi
+		 * is telling us the dst register is in the output file, we
+		 * load the texture into a temp and then use an ALU instruction
+		 * to move it to the output (likewise if the result needs to be
+		 * clamped):
+		 */
+ instr = ir2_instr_create_alu(next_exec_cf(ctx), MAXv, ~0);
+
+ add_dst_reg(ctx, instr, &inst->Dst[0].Register);
+ add_src_reg(ctx, instr, &tmp_src);
+ add_src_reg(ctx, instr, &tmp_src);
+ add_vector_clamp(inst, instr);
+ }
+}
+
+/* SGE(a,b) = GTE((b - a), 1.0, 0.0) */
+/* SLT(a,b) = GTE((b - a), 0.0, 1.0) */
+static void
+translate_sge_slt(struct fd2_compile_context *ctx,
+ struct tgsi_full_instruction *inst, unsigned opc)
+{
+ struct ir2_instruction *instr;
+ struct tgsi_dst_register tmp_dst;
+ struct tgsi_src_register tmp_src;
+ struct tgsi_src_register tmp_const;
+ float c0, c1;
+
+ switch (opc) {
+ default:
+ assert(0);
+ case TGSI_OPCODE_SGE:
+ c0 = 1.0;
+ c1 = 0.0;
+ break;
+ case TGSI_OPCODE_SLT:
+ c0 = 0.0;
+ c1 = 1.0;
+ break;
+ }
+
+ get_internal_temp(ctx, &tmp_dst, &tmp_src);
+
+ instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0);
+ add_dst_reg(ctx, instr, &tmp_dst);
+ add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= IR2_REG_NEGATE;
+ add_src_reg(ctx, instr, &inst->Src[1].Register);
+
+ instr = ir2_instr_create_alu(next_exec_cf(ctx), CNDGTEv, ~0);
+ add_dst_reg(ctx, instr, &inst->Dst[0].Register);
+ /* maybe should re-arrange the syntax some day, but
+ * in assembler/disassembler and what ir.c expects
+	 * is: MULADDv Rdst = Rsrc2 + Rsrc0 * Rsrc1
+ */
+ get_immediate(ctx, &tmp_const, fui(c0));
+ add_src_reg(ctx, instr, &tmp_const);
+ add_src_reg(ctx, instr, &tmp_src);
+ get_immediate(ctx, &tmp_const, fui(c1));
+ add_src_reg(ctx, instr, &tmp_const);
+}
+
+/* LRP(a,b,c) = (a * b) + ((1 - a) * c) */
+static void
+translate_lrp(struct fd2_compile_context *ctx,
+ struct tgsi_full_instruction *inst,
+ unsigned opc)
+{
+ struct ir2_instruction *instr;
+ struct tgsi_dst_register tmp_dst1, tmp_dst2;
+ struct tgsi_src_register tmp_src1, tmp_src2;
+ struct tgsi_src_register tmp_const;
+
+ get_internal_temp(ctx, &tmp_dst1, &tmp_src1);
+ get_internal_temp(ctx, &tmp_dst2, &tmp_src2);
+
+ get_immediate(ctx, &tmp_const, fui(1.0));
+
+ /* tmp1 = (a * b) */
+ instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
+ add_dst_reg(ctx, instr, &tmp_dst1);
+ add_src_reg(ctx, instr, &inst->Src[0].Register);
+ add_src_reg(ctx, instr, &inst->Src[1].Register);
+
+ /* tmp2 = (1 - a) */
+ instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0);
+ add_dst_reg(ctx, instr, &tmp_dst2);
+ add_src_reg(ctx, instr, &tmp_const);
+ add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= IR2_REG_NEGATE;
+
+ /* tmp2 = tmp2 * c */
+ instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
+ add_dst_reg(ctx, instr, &tmp_dst2);
+ add_src_reg(ctx, instr, &tmp_src2);
+ add_src_reg(ctx, instr, &inst->Src[2].Register);
+
+ /* dst = tmp1 + tmp2 */
+ instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0);
+ add_dst_reg(ctx, instr, &inst->Dst[0].Register);
+ add_src_reg(ctx, instr, &tmp_src1);
+ add_src_reg(ctx, instr, &tmp_src2);
+}
+
+static void
+translate_trig(struct fd2_compile_context *ctx,
+ struct tgsi_full_instruction *inst,
+ unsigned opc)
+{
+ struct ir2_instruction *instr;
+ struct tgsi_dst_register tmp_dst;
+ struct tgsi_src_register tmp_src;
+ struct tgsi_src_register tmp_const;
+ instr_scalar_opc_t op;
+
+ switch (opc) {
+ default:
+ assert(0);
+ case TGSI_OPCODE_SIN:
+ op = SIN;
+ break;
+ case TGSI_OPCODE_COS:
+ op = COS;
+ break;
+ }
+
+ get_internal_temp(ctx, &tmp_dst, &tmp_src);
+
+ tmp_dst.WriteMask = TGSI_WRITEMASK_X;
+ tmp_src.SwizzleX = tmp_src.SwizzleY =
+ tmp_src.SwizzleZ = tmp_src.SwizzleW = TGSI_SWIZZLE_X;
+
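+	/* range-reduce the angle into +/- PI, which is apparently what the
+	 * scalar SIN/COS expect: compute frac(src/(2*PI) + 0.5), then scale
+	 * and bias the result back out to [-PI, PI):
+	 */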
+ /* maybe should re-arrange the syntax some day, but
+ * in assembler/disassembler and what ir.c expects
+	 * is: MULADDv Rdst = Rsrc2 + Rsrc0 * Rsrc1
+ */
+ instr = ir2_instr_create_alu(next_exec_cf(ctx), MULADDv, ~0);
+ add_dst_reg(ctx, instr, &tmp_dst);
+ get_immediate(ctx, &tmp_const, fui(0.5));
+ add_src_reg(ctx, instr, &tmp_const);
+ add_src_reg(ctx, instr, &inst->Src[0].Register);
+ get_immediate(ctx, &tmp_const, fui(0.159155));
+ add_src_reg(ctx, instr, &tmp_const);
+
+ instr = ir2_instr_create_alu(next_exec_cf(ctx), FRACv, ~0);
+ add_dst_reg(ctx, instr, &tmp_dst);
+ add_src_reg(ctx, instr, &tmp_src);
+ add_src_reg(ctx, instr, &tmp_src);
+
+ instr = ir2_instr_create_alu(next_exec_cf(ctx), MULADDv, ~0);
+ add_dst_reg(ctx, instr, &tmp_dst);
+ get_immediate(ctx, &tmp_const, fui(-3.141593));
+ add_src_reg(ctx, instr, &tmp_const);
+ add_src_reg(ctx, instr, &tmp_src);
+ get_immediate(ctx, &tmp_const, fui(6.283185));
+ add_src_reg(ctx, instr, &tmp_const);
+
+ instr = ir2_instr_create_alu(next_exec_cf(ctx), ~0, op);
+ add_regs_dummy_vector(instr);
+ add_dst_reg(ctx, instr, &inst->Dst[0].Register);
+ add_src_reg(ctx, instr, &tmp_src);
+}
+
+/*
+ * Main part of compiler/translator:
+ */
+
+static void
+translate_instruction(struct fd2_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ unsigned opc = inst->Instruction.Opcode;
+ struct ir2_instruction *instr;
+	struct ir2_cf *cf;
+
+ if (opc == TGSI_OPCODE_END)
+ return;
+
+ if (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) {
+ unsigned num = inst->Dst[0].Register.Index;
+ /* seems like we need to ensure that position vs param/pixel
+		 * exports don't end up in the same EXEC clause.. the easy way
+		 * to do this is to force a new EXEC clause on the first
+		 * appearance of a position or param/pixel export.
+ */
+ if ((num == ctx->position) || (num == ctx->psize)) {
+ if (ctx->num_position > 0) {
+ ctx->cf = NULL;
+ ir2_cf_create_alloc(ctx->so->ir, SQ_POSITION,
+ ctx->num_position - 1);
+ ctx->num_position = 0;
+ }
+ } else {
+ if (ctx->num_param > 0) {
+ ctx->cf = NULL;
+ ir2_cf_create_alloc(ctx->so->ir, SQ_PARAMETER_PIXEL,
+ ctx->num_param - 1);
+ ctx->num_param = 0;
+ }
+ }
+ }
+
+ cf = next_exec_cf(ctx);
+
+ /* TODO turn this into a table: */
+ switch (opc) {
+ case TGSI_OPCODE_MOV:
+ instr = ir2_instr_create_alu(cf, MAXv, ~0);
+ add_regs_vector_1(ctx, inst, instr);
+ break;
+ case TGSI_OPCODE_RCP:
+ instr = ir2_instr_create_alu(cf, ~0, RECIP_IEEE);
+ add_regs_scalar_1(ctx, inst, instr);
+ break;
+ case TGSI_OPCODE_RSQ:
+ instr = ir2_instr_create_alu(cf, ~0, RECIPSQ_IEEE);
+ add_regs_scalar_1(ctx, inst, instr);
+ break;
+ case TGSI_OPCODE_MUL:
+ instr = ir2_instr_create_alu(cf, MULv, ~0);
+ add_regs_vector_2(ctx, inst, instr);
+ break;
+ case TGSI_OPCODE_ADD:
+ instr = ir2_instr_create_alu(cf, ADDv, ~0);
+ add_regs_vector_2(ctx, inst, instr);
+ break;
+ case TGSI_OPCODE_DP3:
+ instr = ir2_instr_create_alu(cf, DOT3v, ~0);
+ add_regs_vector_2(ctx, inst, instr);
+ break;
+ case TGSI_OPCODE_DP4:
+ instr = ir2_instr_create_alu(cf, DOT4v, ~0);
+ add_regs_vector_2(ctx, inst, instr);
+ break;
+ case TGSI_OPCODE_MIN:
+ instr = ir2_instr_create_alu(cf, MINv, ~0);
+ add_regs_vector_2(ctx, inst, instr);
+ break;
+ case TGSI_OPCODE_MAX:
+ instr = ir2_instr_create_alu(cf, MAXv, ~0);
+ add_regs_vector_2(ctx, inst, instr);
+ break;
+ case TGSI_OPCODE_SLT:
+ case TGSI_OPCODE_SGE:
+ translate_sge_slt(ctx, inst, opc);
+ break;
+ case TGSI_OPCODE_MAD:
+ instr = ir2_instr_create_alu(cf, MULADDv, ~0);
+ add_regs_vector_3(ctx, inst, instr);
+ break;
+ case TGSI_OPCODE_LRP:
+ translate_lrp(ctx, inst, opc);
+ break;
+ case TGSI_OPCODE_FRC:
+ instr = ir2_instr_create_alu(cf, FRACv, ~0);
+ add_regs_vector_1(ctx, inst, instr);
+ break;
+ case TGSI_OPCODE_FLR:
+ instr = ir2_instr_create_alu(cf, FLOORv, ~0);
+ add_regs_vector_1(ctx, inst, instr);
+ break;
+ case TGSI_OPCODE_EX2:
+ instr = ir2_instr_create_alu(cf, ~0, EXP_IEEE);
+ add_regs_scalar_1(ctx, inst, instr);
+ break;
+ case TGSI_OPCODE_POW:
+ translate_pow(ctx, inst);
+ break;
+ case TGSI_OPCODE_ABS:
+ instr = ir2_instr_create_alu(cf, MAXv, ~0);
+ add_regs_vector_1(ctx, inst, instr);
+ instr->regs[1]->flags |= IR2_REG_NEGATE; /* src0 */
+ break;
+ case TGSI_OPCODE_COS:
+ case TGSI_OPCODE_SIN:
+ translate_trig(ctx, inst, opc);
+ break;
+ case TGSI_OPCODE_TEX:
+ case TGSI_OPCODE_TXP:
+ translate_tex(ctx, inst, opc);
+ break;
+ case TGSI_OPCODE_CMP:
+ instr = ir2_instr_create_alu(cf, CNDGTEv, ~0);
+ add_regs_vector_3(ctx, inst, instr);
+		// TODO this should be src0 if regs were in sane order..
+ instr->regs[2]->flags ^= IR2_REG_NEGATE; /* src1 */
+ break;
+ case TGSI_OPCODE_IF:
+ push_predicate(ctx, &inst->Src[0].Register);
+ ctx->so->ir->pred = IR2_PRED_EQ;
+ break;
+ case TGSI_OPCODE_ELSE:
+ ctx->so->ir->pred = IR2_PRED_NE;
+		/* not sure if this is required in all cases, but the blob
+		 * compiler won't combine EQ and NE in the same CF:
+ */
+ ctx->cf = NULL;
+ break;
+ case TGSI_OPCODE_ENDIF:
+ pop_predicate(ctx);
+ break;
+ case TGSI_OPCODE_F2I:
+ instr = ir2_instr_create_alu(cf, TRUNCv, ~0);
+ add_regs_vector_1(ctx, inst, instr);
+ break;
+ default:
+ DBG("unknown TGSI opc: %s", tgsi_get_opcode_name(opc));
+ tgsi_dump(ctx->so->tokens, 0);
+ assert(0);
+ break;
+ }
+
+ /* internal temporaries are only valid for the duration of a single
+ * TGSI instruction:
+ */
+ ctx->num_internal_temps = 0;
+}
+
+static void
+compile_instructions(struct fd2_compile_context *ctx)
+{
+ while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
+ tgsi_parse_token(&ctx->parser);
+
+ switch (ctx->parser.FullToken.Token.Type) {
+ case TGSI_TOKEN_TYPE_INSTRUCTION:
+ translate_instruction(ctx,
+ &ctx->parser.FullToken.FullInstruction);
+ break;
+ default:
+ break;
+ }
+ }
+
+ ctx->cf->cf_type = EXEC_END;
+}
+
+int
+fd2_compile_shader(struct fd_program_stateobj *prog,
+ struct fd2_shader_stateobj *so)
+{
+ struct fd2_compile_context ctx;
+
+ ir2_shader_destroy(so->ir);
+ so->ir = ir2_shader_create();
+ so->num_vfetch_instrs = so->num_tfetch_instrs = so->num_immediates = 0;
+
+ if (compile_init(&ctx, prog, so) != TGSI_PARSE_OK)
+ return -1;
+
+ if (ctx.type == TGSI_PROCESSOR_VERTEX) {
+ compile_vtx_fetch(&ctx);
+ } else if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
+ prog->num_exports = 0;
+ memset(prog->export_linkage, 0xff,
+ sizeof(prog->export_linkage));
+ }
+
+ compile_instructions(&ctx);
+
+ compile_free(&ctx);
+
+ return 0;
+}
+