From b946984e3bbd91da3111edd0d62f90cfd4967ad3 Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Fri, 3 Sep 2010 18:31:18 +0200
Subject: nvfx: support indirect addressing in vps

Negative or huge offsets not yet supported.
---
 src/gallium/drivers/nvfx/nv30_vertprog.h |  6 ++
 src/gallium/drivers/nvfx/nvfx_screen.c   | 11 +++-
 src/gallium/drivers/nvfx/nvfx_shader.h   |  7 ++-
 src/gallium/drivers/nvfx/nvfx_vertprog.c | 99 ++++++++++++++++++++++++--------
 4 files changed, 96 insertions(+), 27 deletions(-)

(limited to 'src/gallium/drivers')

diff --git a/src/gallium/drivers/nvfx/nv30_vertprog.h b/src/gallium/drivers/nvfx/nv30_vertprog.h
index 9a68f5c1fb0..e8c16b0341a 100644
--- a/src/gallium/drivers/nvfx/nv30_vertprog.h
+++ b/src/gallium/drivers/nvfx/nv30_vertprog.h
@@ -60,6 +60,9 @@
 
 /* DWORD 0 */
 
+/* guess that this is the same as nv40 */
+#define NV30_VP_INST_INDEX_INPUT                                        (1 << 27)
+
 #define NV30_VP_INST_ADDR_REG_SELECT_1        (1 << 24)
 #define NV30_VP_INST_SRC2_ABS           (1 << 23) /* guess */
 #define NV30_VP_INST_SRC1_ABS           (1 << 22) /* guess */
@@ -136,6 +139,9 @@
 #  define NV30_VP_INST_DEST_TC(n)  (8+(n))
 #  define NV30_VP_INST_DEST_CLP(n) (17 + (n))
 
+/* guess that this is the same as nv40 */
+#define NV30_VP_INST_INDEX_CONST                                        (1 << 1)
+
 /* Useful to split the source selection regs into their pieces */
 #define NV30_VP_SRC0_HIGH_SHIFT                                                6
 #define NV30_VP_SRC0_HIGH_MASK                                        0x00007FC0
diff --git a/src/gallium/drivers/nvfx/nvfx_screen.c b/src/gallium/drivers/nvfx/nvfx_screen.c
index affed961d46..42094227e1c 100644
--- a/src/gallium/drivers/nvfx/nvfx_screen.c
+++ b/src/gallium/drivers/nvfx/nvfx_screen.c
@@ -110,7 +110,8 @@ nvfx_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_MAX_VS_INPUTS:
 		return 16;
 	case PIPE_CAP_MAX_VS_CONSTS:
-		return 256;
+		/* XXX: currently more than this doesn't work, but it should be possible to make it work */
+		return 212 - 6;
 	case PIPE_CAP_MAX_VS_TEMPS:
 		return screen->is_nv4x ? 32 : 13;
 	case PIPE_CAP_MAX_VS_ADDRS:
@@ -487,7 +488,13 @@ nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 
 	/* Vtxprog resources */
 	if (nouveau_resource_init(&screen->vp_exec_heap, 0, screen->is_nv4x ? 512 : 256) ||
-	    nouveau_resource_init(&screen->vp_data_heap, 0, 256)) {
+	    /* XXX: this should actually be 468 or 256, but apparently indirect addressing
+	     * cannot read consts starting from 212 on nv40.
+	     * It looks like 44 slots are reserved for something, and there is a "mode switch"
+	     * from 256 slots to 512 slots that we are setting to "256 mode" on nv40, leading
+	     * to 212 = 256 - 44 instead of 468 = 512 - 44 usable slots.
+	     */
+	    nouveau_resource_init(&screen->vp_data_heap, 0, 212)) {
 		nvfx_screen_destroy(pscreen);
 		return NULL;
 	}
diff --git a/src/gallium/drivers/nvfx/nvfx_shader.h b/src/gallium/drivers/nvfx/nvfx_shader.h
index 35006eec3d4..e642a27af86 100644
--- a/src/gallium/drivers/nvfx/nvfx_shader.h
+++ b/src/gallium/drivers/nvfx/nvfx_shader.h
@@ -414,14 +414,16 @@
 #define abs(s) nvfx_src_abs((s))
 
 struct nvfx_reg {
-	uint8_t type;
+	int8_t type;
 	uint32_t index;
 };
 
 struct nvfx_src {
 	struct nvfx_reg reg;
 
-	/* src only */
+	uint8_t indirect : 1;
+	uint8_t indirect_reg : 1;
+	uint8_t indirect_swz : 2;
 	uint8_t negate : 1;
 	uint8_t abs : 1;
 	uint8_t swz[4];
@@ -483,6 +485,7 @@ nvfx_src(struct nvfx_reg reg)
 		.abs = 0,
 		.negate = 0,
 		.swz = { 0, 1, 2, 3 },
+		.indirect = 0,
 	};
 	return temp;
 }
diff --git a/src/gallium/drivers/nvfx/nvfx_vertprog.c b/src/gallium/drivers/nvfx/nvfx_vertprog.c
index ea7e88c5613..838c3aa208b 100644
--- a/src/gallium/drivers/nvfx/nvfx_vertprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_vertprog.c
@@ -46,6 +46,7 @@ struct nvfx_vpc {
 	struct nvfx_reg r_result[PIPE_MAX_SHADER_OUTPUTS];
 	struct nvfx_reg *r_address;
 	struct nvfx_reg *r_temp;
+	struct nvfx_reg *r_const;
 
 	struct nvfx_reg *imm;
 	unsigned nr_imm;
@@ -152,6 +153,18 @@ emit_src(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int pos,
 	       (src.swz[2] << NVFX_VP(SRC_SWZ_Z_SHIFT)) |
 	       (src.swz[3] << NVFX_VP(SRC_SWZ_W_SHIFT)));
 
+	if(src.indirect) {
+		if(src.reg.type == NVFXSR_CONST)
+			hw[3] |= NVFX_VP(INST_INDEX_CONST);
+		else if(src.reg.type == NVFXSR_INPUT)
+			hw[0] |= NVFX_VP(INST_INDEX_INPUT);
+		else
+			assert(0);
+		if(src.indirect_reg)
+			hw[0] |= NVFX_VP(INST_ADDR_REG_SELECT_1);
+		hw[0] |= src.indirect_swz << NVFX_VP(INST_ADDR_SWZ_SHIFT);
+	}
+
 	switch (pos) {
 	case 0:
 		hw[1] |= ((sr & NVFX_VP(SRC0_HIGH_MASK)) >>
@@ -317,6 +330,9 @@ nvfx_vp_emit(struct nvfx_vpc *vpc, struct nvfx_insn insn)
 	emit_src(nvfx, vpc, hw, 0, insn.src[0]);
 	emit_src(nvfx, vpc, hw, 1, insn.src[1]);
 	emit_src(nvfx, vpc, hw, 2, insn.src[2]);
+
+//	if(insn.src[0].indirect || op == NVFX_VP_INST_VEC_OP_ARL)
+//		hw[3] |= NV40_VP_INST_SCA_RESULT;
 }
 
 static inline struct nvfx_src
@@ -328,7 +344,7 @@ tgsi_src(struct nvfx_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
 		src.reg = nvfx_reg(NVFXSR_INPUT, fsrc->Register.Index);
 		break;
 	case TGSI_FILE_CONSTANT:
-		src.reg = constant(vpc, fsrc->Register.Index, 0, 0, 0, 0);
+		src.reg = vpc->r_const[fsrc->Register.Index];
 		break;
 	case TGSI_FILE_IMMEDIATE:
 		src.reg = vpc->imm[fsrc->Register.Index];
@@ -339,7 +355,7 @@ tgsi_src(struct nvfx_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
 	default:
 		NOUVEAU_ERR("bad src file\n");
 		src.reg.index = 0;
-		src.reg.type = 0;
+		src.reg.type = -1;
 		break;
 	}
 
@@ -349,6 +365,22 @@ tgsi_src(struct nvfx_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
 	src.swz[1] = fsrc->Register.SwizzleY;
 	src.swz[2] = fsrc->Register.SwizzleZ;
 	src.swz[3] = fsrc->Register.SwizzleW;
+	src.indirect = 0;
+
+	if(fsrc->Register.Indirect) {
+		if(fsrc->Indirect.File == TGSI_FILE_ADDRESS &&
+				(fsrc->Register.File == TGSI_FILE_CONSTANT || fsrc->Register.File == TGSI_FILE_INPUT))
+		{
+			src.indirect = 1;
+			src.indirect_reg = fsrc->Indirect.Index;
+			src.indirect_swz = fsrc->Indirect.SwizzleX;
+		}
+		else
+		{
+			src.reg.index = 0;
+			src.reg.type = -1;
+		}
+	}
 	return src;
 }
 
@@ -461,6 +493,15 @@ nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
 		}
 	}
 
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		if(src[i].reg.type < 0)
+			return FALSE;
+	}
+
+	if(finst->Dst[0].Register.File == TGSI_FILE_ADDRESS &&
+			finst->Instruction.Opcode != TGSI_OPCODE_ARL)
+		return FALSE;
+
 	dst  = tgsi_dst(vpc, &finst->Dst[0]);
 	mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
 
@@ -761,7 +802,7 @@ static boolean
 nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
 {
 	struct tgsi_parse_context p;
-	int high_temp = -1, high_addr = -1, nr_imm = 0, i;
+	int high_const = -1, high_temp = -1, high_addr = -1, nr_imm = 0, i;
 	struct util_semantic_set set;
 	unsigned char sem_layout[8];
 	unsigned num_outputs;
@@ -814,14 +855,18 @@ nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
 						fdec->Range.Last;
 				}
 				break;
-#if 0 /* this would be nice.. except gallium doesn't track it */
 			case TGSI_FILE_ADDRESS:
 				if (fdec->Range.Last > high_addr) {
 					high_addr =
 						fdec->Range.Last;
 				}
 				break;
-#endif
+			case TGSI_FILE_CONSTANT:
+				if (fdec->Range.Last > high_const) {
+					high_const =
+							fdec->Range.Last;
+				}
+				break;
 			case TGSI_FILE_OUTPUT:
 				if (!nvfx_vertprog_parse_decl_output(nvfx, vpc, fdec))
 					return FALSE;
@@ -831,23 +876,6 @@ nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
 			}
 		}
 			break;
-#if 1 /* yay, parse instructions looking for address regs instead */
-		case TGSI_TOKEN_TYPE_INSTRUCTION:
-		{
-			const struct tgsi_full_instruction *finst;
-			const struct tgsi_full_dst_register *fdst;
-
-			finst = &p.FullToken.FullInstruction;
-			fdst = &finst->Dst[0];
-
-			if (fdst->Register.File == TGSI_FILE_ADDRESS) {
-				if (fdst->Register.Index > high_addr)
-					high_addr = fdst->Register.Index;
-			}
-
-		}
-			break;
-#endif
 		default:
 			break;
 		}
@@ -868,7 +896,13 @@ nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
 	if (++high_addr) {
 		vpc->r_address = CALLOC(high_addr, sizeof(struct nvfx_reg));
 		for (i = 0; i < high_addr; i++)
-			vpc->r_address[i] = temp(vpc);
+			vpc->r_address[i] = nvfx_reg(NVFXSR_TEMP, i);
+	}
+
+	if(++high_const) {
+		vpc->r_const = CALLOC(high_const, sizeof(struct nvfx_reg));
+		for (i = 0; i < high_const; i++)
+			vpc->r_const[i] = constant(vpc, i, 0, 0, 0, 0);
 	}
 
 	vpc->r_temps_discard = 0;
@@ -1037,6 +1071,8 @@ out_err:
 		FREE(vpc->r_temp);
 	if (vpc->r_address)
 		FREE(vpc->r_address);
+	if (vpc->r_const)
+		FREE(vpc->r_const);
 	if (vpc->imm)
 		FREE(vpc->imm);
 	FREE(vpc);
@@ -1116,6 +1152,8 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
                         }
 		}
 
+		//printf("start at %u nc %u\n", vp->data->start, vp->nr_consts);
+
 		/*XXX: handle this some day */
 		assert(vp->data->start >= vp->data_start_min);
 
@@ -1161,6 +1199,8 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 			struct nvfx_relocation* reloc = (struct nvfx_relocation*)((char*)vp->const_relocs.data + i);
 			struct nvfx_vertex_program_exec *vpi = &vp->insns[reloc->location];
 
+			//printf("reloc %i to %i + %i\n", reloc->location, vp->data->start, reloc->target);
+
 			vpi->data[1] &= ~NVFX_VP(INST_CONST_SRC_MASK);
 			vpi->data[1] |=
 					(reloc->target + vp->data->start) <<
@@ -1178,6 +1218,16 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 		if (constbuf)
 			map = (float*)nvfx_buffer(constbuf)->data;
 
+		/*
+		for (i = 0; i < 512; i++) {
+			float v[4] = {0.1, 0.2, 0.3, 0.4};
+			BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_CONST_ID, 5);
+			OUT_RING  (chan, i);
+			OUT_RINGp (chan, (uint32_t *)v, 4);
+			printf("frob %i\n", i);
+		}
+		*/
+
 		for (i = nvfx->use_vp_clipping ? 6 : 0; i < vp->nr_consts; i++) {
 			struct nvfx_vertex_program_data *vpd = &vp->consts[i];
 
@@ -1190,6 +1240,8 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 				       4 * sizeof(float));
 			}
 
+			//printf("upload into %i + %i: %f %f %f %f\n", vp->data->start, i, vpd->value[0], vpd->value[1], vpd->value[2], vpd->value[3]);
+
 			BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_CONST_ID, 5);
 			OUT_RING  (chan, i + vp->data->start);
 			OUT_RINGp (chan, (uint32_t *)vpd->value, 4);
@@ -1202,6 +1254,7 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 		OUT_RING  (chan, vp->exec->start);
 		for (i = 0; i < vp->nr_insns; i++) {
 			BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_INST(0), 4);
+			//printf("%08x %08x %08x %08x\n", vp->insns[i].data[0], vp->insns[i].data[1], vp->insns[i].data[2], vp->insns[i].data[3]);
 			OUT_RINGp (chan, vp->insns[i].data, 4);
 		}
 		vp->clip_nr = -1;
-- 
cgit v1.2.3