From 4d765f7fa3751eae00bbf2b6ee9710bf5bdf95d0 Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Tue, 10 Aug 2010 23:09:53 +0200
Subject: nvfx: support proper shader linkage - adds glsl support

---
 src/gallium/drivers/nvfx/nvfx_state.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'src/gallium/drivers/nvfx/nvfx_state.h')

diff --git a/src/gallium/drivers/nvfx/nvfx_state.h b/src/gallium/drivers/nvfx/nvfx_state.h
index 9ceb2577ecc..e1fa3c7e041 100644
--- a/src/gallium/drivers/nvfx/nvfx_state.h
+++ b/src/gallium/drivers/nvfx/nvfx_state.h
@@ -4,6 +4,8 @@
 #include "pipe/p_state.h"
 #include "tgsi/tgsi_scan.h"
 #include "nouveau/nouveau_statebuf.h"
+#include "util/u_dynarray.h"
+#include "util/u_linkage.h"
 
 struct nvfx_vertex_program_exec {
 	uint32_t data[4];
@@ -18,6 +20,7 @@ struct nvfx_vertex_program_data {
 
 struct nvfx_vertex_program {
 	struct pipe_shader_state pipe;
+	unsigned long long id;
 
 	struct draw_vertex_shader *draw;
 
@@ -30,6 +33,9 @@ struct nvfx_vertex_program {
 	struct nvfx_vertex_program_data *consts;
 	unsigned nr_consts;
 
+	char generic_to_fp_input[256];
+	unsigned texcoord_ouput_mask;
+
 	struct nouveau_resource *exec;
 	unsigned exec_start;
 	struct nouveau_resource *data;
@@ -49,6 +55,7 @@ struct nvfx_fragment_program_data {
 struct nvfx_fragment_program_bo {
 	struct nvfx_fragment_program_bo* next;
 	struct nouveau_bo* bo;
+	unsigned char* slots;
 	char insn[] __attribute__((aligned(16)));
 };
 
@@ -58,6 +65,7 @@ struct nvfx_fragment_program {
 
 	boolean translated;
 	unsigned samplers;
+	unsigned point_sprite_control;
 
 	uint32_t *insn;
 	int       insn_len;
@@ -65,11 +73,27 @@ struct nvfx_fragment_program {
 	struct nvfx_fragment_program_data *consts;
 	unsigned nr_consts;
 
+	unsigned num_slots; /* how many input semantics? */
+	unsigned char slot_to_generic[8]; /* semantics */
+	unsigned char slot_to_fp_input[8]; /* current assignment of slots for each used semantic */
+	struct util_dynarray slot_relocations[8];
+
+	/* This is reset to progs on any relocation update, and decreases every time we
+	 * move to a new prog due to a constant update.
+	 * When this reaches zero, applying relocations is no longer necessary.
+	 */
+	unsigned progs_left_with_obsolete_slot_assignments;
+
+	unsigned long long last_vp_id;
+	unsigned last_sprite_coord_enable;
+
 	uint32_t fp_control;
 
 	unsigned bo_prog_idx;
 	unsigned prog_size;
 	unsigned progs_per_bo;
+	unsigned progs;
+
 	struct nvfx_fragment_program_bo* fpbo;
 };
 
-- 
cgit v1.2.3


From fe3c62dd7728f1cab64978d634fd0be4237d3b23 Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Sat, 21 Aug 2010 18:37:21 +0200
Subject: nvfx: add vertex program control flow

---
 src/gallium/drivers/nvfx/nvfx_state.h    |   3 +-
 src/gallium/drivers/nvfx/nvfx_vertprog.c | 184 ++++++++++++++++++++++++++++---
 2 files changed, 169 insertions(+), 18 deletions(-)

(limited to 'src/gallium/drivers/nvfx/nvfx_state.h')

diff --git a/src/gallium/drivers/nvfx/nvfx_state.h b/src/gallium/drivers/nvfx/nvfx_state.h
index e1fa3c7e041..6d589af5f35 100644
--- a/src/gallium/drivers/nvfx/nvfx_state.h
+++ b/src/gallium/drivers/nvfx/nvfx_state.h
@@ -9,7 +9,6 @@
 
 struct nvfx_vertex_program_exec {
 	uint32_t data[4];
-	boolean has_branch_offset;
 	int const_index;
 };
 
@@ -45,6 +44,8 @@ struct nvfx_vertex_program {
 	uint32_t ir;
 	uint32_t or;
 	uint32_t clip_ctrl;
+
+	struct util_dynarray branch_relocs;
 };
 
 struct nvfx_fragment_program_data {
diff --git a/src/gallium/drivers/nvfx/nvfx_vertprog.c b/src/gallium/drivers/nvfx/nvfx_vertprog.c
index 416e1b4d45a..98fcc928982 100644
--- a/src/gallium/drivers/nvfx/nvfx_vertprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_vertprog.c
@@ -29,6 +29,12 @@
 
 #define NVFX_VP_INST_DEST_CLIP(n) ((~0 - 6) + (n))
 
+struct nvfx_loop_entry
+{
+	unsigned brk_target;
+	unsigned cont_target;
+};
+
 struct nvfx_vpc {
 	struct nvfx_context* nvfx;
 	struct nvfx_vertex_program *vp;
@@ -45,6 +51,9 @@ struct nvfx_vpc {
 	unsigned nr_imm;
 
 	unsigned hpos_idx;
+
+	struct util_dynarray label_relocs;
+	struct util_dynarray loop_stack;
 };
 
 static struct nvfx_reg
@@ -169,6 +178,17 @@ emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot
 	struct nvfx_vertex_program *vp = vpc->vp;
 
 	switch (dst.type) {
+	case NVFXSR_NONE:
+		if(!nvfx->is_nv4x)
+			hw[0] |= NV30_VP_INST_DEST_TEMP_ID_MASK;
+		else {
+			hw[3] |= NV40_VP_INST_DEST_MASK;
+			if (slot == 0)
+				hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK;
+			else
+				hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
+		}
+		break;
 	case NVFXSR_TEMP:
 		if(!nvfx->is_nv4x)
 			hw[0] |= (dst.index << NV30_VP_INST_DEST_TEMP_ID_SHIFT);
@@ -254,7 +274,7 @@ emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot
 
 		if(!nvfx->is_nv4x) {
 			hw[3] |= (dst.index << NV30_VP_INST_DEST_SHIFT);
-			hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK | (1<<20);
+			hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK;
 
 			/*XXX: no way this is entirely correct, someone needs to
 			 *     figure out what exactly it is.
@@ -264,7 +284,7 @@ emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot
 			hw[3] |= (dst.index << NV40_VP_INST_DEST_SHIFT);
 			if (slot == 0) {
 				hw[0] |= NV40_VP_INST_VEC_RESULT;
-				hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK | (1<<20);
+				hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK;
 			} else {
 				hw[3] |= NV40_VP_INST_SCA_RESULT;
 				hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
@@ -292,11 +312,13 @@ nvfx_vp_emit(struct nvfx_vpc *vpc, struct nvfx_insn insn)
 
 	hw = vpc->vpi->data;
 
-	hw[0] |= (NVFX_COND_TR << NVFX_VP(INST_COND_SHIFT));
-	hw[0] |= ((0 << NVFX_VP(INST_COND_SWZ_X_SHIFT)) |
-		  (1 << NVFX_VP(INST_COND_SWZ_Y_SHIFT)) |
-		  (2 << NVFX_VP(INST_COND_SWZ_Z_SHIFT)) |
-		  (3 << NVFX_VP(INST_COND_SWZ_W_SHIFT)));
+	hw[0] |= (insn.cc_test << NVFX_VP(INST_COND_SHIFT));
+	hw[0] |= ((insn.cc_swz[0] << NVFX_VP(INST_COND_SWZ_X_SHIFT)) |
+		  (insn.cc_swz[1] << NVFX_VP(INST_COND_SWZ_Y_SHIFT)) |
+		  (insn.cc_swz[2] << NVFX_VP(INST_COND_SWZ_Z_SHIFT)) |
+		  (insn.cc_swz[3] << NVFX_VP(INST_COND_SWZ_W_SHIFT)));
+	if(insn.cc_update)
+		hw[0] |= NVFX_VP(INST_COND_UPDATE_ENABLE);
 
 	if(!nvfx->is_nv4x) {
 		if(slot == 0)
@@ -327,7 +349,7 @@ nvfx_vp_emit(struct nvfx_vpc *vpc, struct nvfx_insn insn)
 			hw[3] |= (insn.mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT);
 	    } else {
 			hw[1] |= (op << NV40_VP_INST_SCA_OPCODE_SHIFT);
-			hw[0] |= (NV40_VP_INST_VEC_DEST_TEMP_MASK | (1 << 20));
+			hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK ;
 			hw[3] |= (insn.mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT);
 		}
 	}
@@ -374,6 +396,9 @@ tgsi_dst(struct nvfx_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
 	struct nvfx_reg dst;
 
 	switch (fdst->Register.File) {
+	case TGSI_FILE_NULL:
+		dst = nvfx_reg(NVFXSR_NONE, 0);
+		break;
 	case TGSI_FILE_OUTPUT:
 		dst = vpc->r_result[fdst->Register.Index];
 		break;
@@ -405,11 +430,14 @@ tgsi_mask(uint tgsi)
 
 static boolean
 nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
-				const struct tgsi_full_instruction *finst)
+				unsigned idx, const struct tgsi_full_instruction *finst)
 {
 	struct nvfx_src src[3], tmp;
 	struct nvfx_reg dst;
 	struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
+	struct nvfx_insn insn;
+	struct nvfx_label_relocation reloc;
+	struct nvfx_loop_entry loop;
 	int mask;
 	int ai = -1, ci = -1, ii = -1;
 	int i;
@@ -548,8 +576,6 @@ nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
 	case TGSI_OPCODE_RCP:
 		nvfx_vp_emit(vpc, arith(SCA, RCP, dst, mask, none, none, src[0]));
 		break;
-	case TGSI_OPCODE_RET:
-		break;
 	case TGSI_OPCODE_RSQ:
 		nvfx_vp_emit(vpc, arith(SCA, RSQ, dst, mask, none, none, abs(src[0])));
 		break;
@@ -591,6 +617,84 @@ nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
 		nvfx_vp_emit(vpc, arith(VEC, MUL, tmp.reg, mask, swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none));
 		nvfx_vp_emit(vpc, arith(VEC, MAD, dst, (mask & ~NVFX_VP_MASK_W), swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), neg(tmp)));
 		break;
+
+	case TGSI_OPCODE_IF:
+		insn = arith(VEC, MOV, none.reg, NVFX_VP_MASK_X, src[0], none, none);
+		insn.cc_update = 1;
+		nvfx_vp_emit(vpc, insn);
+
+		reloc.location = vpc->vp->nr_insns;
+		reloc.target = finst->Label.Label + 1;
+		util_dynarray_append(&vpc->label_relocs, struct nvfx_label_relocation, reloc);
+
+		insn = arith(SCA, BRA, none.reg, 0, none, none, none);
+		insn.cc_test = NVFX_COND_EQ;
+		insn.cc_swz[0] = insn.cc_swz[1] = insn.cc_swz[2] = insn.cc_swz[3] = 0;
+		nvfx_vp_emit(vpc, insn);
+		break;
+
+	case TGSI_OPCODE_ELSE:
+	case TGSI_OPCODE_BRA:
+	case TGSI_OPCODE_CAL:
+		reloc.location = vpc->vp->nr_insns;
+		reloc.target = finst->Label.Label;
+		util_dynarray_append(&vpc->label_relocs, struct nvfx_label_relocation, reloc);
+
+		if(finst->Instruction.Opcode == TGSI_OPCODE_CAL)
+			insn = arith(SCA, CAL, none.reg, 0, none, none, none);
+		else
+			insn = arith(SCA, BRA, none.reg, 0, none, none, none);
+		nvfx_vp_emit(vpc, insn);
+		break;
+
+	case TGSI_OPCODE_RET:
+		tmp = none;
+		tmp.swz[0] = tmp.swz[1] = tmp.swz[2] = tmp.swz[3] = 0;
+		nvfx_vp_emit(vpc, arith(SCA, RET, none.reg, 0, none, none, tmp));
+		break;
+
+	case TGSI_OPCODE_BGNSUB:
+	case TGSI_OPCODE_ENDSUB:
+	case TGSI_OPCODE_ENDIF:
+		/* nothing to do here */
+		break;
+
+	case TGSI_OPCODE_BGNLOOP:
+		loop.cont_target = idx;
+		loop.brk_target = finst->Label.Label + 1;
+		util_dynarray_append(&vpc->loop_stack, struct nvfx_loop_entry, loop);
+		break;
+
+	case TGSI_OPCODE_ENDLOOP:
+		loop = util_dynarray_pop(&vpc->loop_stack, struct nvfx_loop_entry);
+
+		reloc.location = vpc->vp->nr_insns;
+		reloc.target = loop.cont_target;
+		util_dynarray_append(&vpc->label_relocs, struct nvfx_label_relocation, reloc);
+
+		nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none));
+		break;
+
+	case TGSI_OPCODE_CONT:
+		loop = util_dynarray_top(&vpc->loop_stack, struct nvfx_loop_entry);
+
+		reloc.location = vpc->vp->nr_insns;
+		reloc.target = loop.cont_target;
+		util_dynarray_append(&vpc->label_relocs, struct nvfx_label_relocation, reloc);
+
+		nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none));
+		break;
+
+	case TGSI_OPCODE_BRK:
+		loop = util_dynarray_top(&vpc->loop_stack, struct nvfx_loop_entry);
+
+		reloc.location = vpc->vp->nr_insns;
+		reloc.target = loop.brk_target;
+		util_dynarray_append(&vpc->label_relocs, struct nvfx_label_relocation, reloc);
+
+		nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none));
+		break;
+
 	default:
 		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
 		return FALSE;
@@ -777,6 +881,7 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx,
 	struct tgsi_parse_context parse;
 	struct nvfx_vpc *vpc = NULL;
 	struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
+	struct util_dynarray insns;
 	int i;
 
 	vpc = CALLOC(1, sizeof(struct nvfx_vpc));
@@ -801,6 +906,7 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx,
 
 	tgsi_parse_init(&parse, vp->pipe.tokens);
 
+	util_dynarray_init(&insns);
 	while (!tgsi_parse_end_of_tokens(&parse)) {
 		tgsi_parse_token(&parse);
 
@@ -823,8 +929,10 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx,
 		case TGSI_TOKEN_TYPE_INSTRUCTION:
 		{
 			const struct tgsi_full_instruction *finst;
+			unsigned idx = insns.size >> 2;
+			util_dynarray_append(&insns, unsigned, vp->nr_insns);
 			finst = &parse.FullToken.FullInstruction;
-			if (!nvfx_vertprog_parse_instruction(nvfx, vpc, finst))
+			if (!nvfx_vertprog_parse_instruction(nvfx, vpc, idx, finst))
 				goto out_err;
 		}
 			break;
@@ -833,6 +941,25 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx,
 		}
 	}
 
+	util_dynarray_append(&insns, unsigned, vp->nr_insns);
+
+	for(unsigned i = 0; i < vpc->label_relocs.size; i += sizeof(struct nvfx_label_relocation))
+	{
+		struct nvfx_label_relocation* label_reloc = (struct nvfx_label_relocation*)((char*)vpc->label_relocs.data + i);
+		struct nvfx_label_relocation hw_reloc;
+
+		hw_reloc.location = label_reloc->location;
+		hw_reloc.target = ((unsigned*)insns.data)[label_reloc->target];
+
+		//debug_printf("hw %u -> tgsi %u = hw %u\n", hw_reloc.location, label_reloc->target, hw_reloc.target);
+
+		util_dynarray_append(&vp->branch_relocs, struct nvfx_label_relocation, hw_reloc);
+	}
+	util_dynarray_fini(&insns);
+	util_dynarray_trim(&vp->branch_relocs);
+
+	/* XXX: what if we add a RET before?!  make sure we jump here...*/
+
 	/* Write out HPOS if it was redirected to a temp earlier */
 	if (vpc->r_result[vpc->hpos_idx].type != NVFXSR_OUTPUT) {
 		struct nvfx_reg hpos = nvfx_reg(NVFXSR_OUTPUT,
@@ -866,7 +993,11 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx,
 		nvfx_vp_emit(vpc, arith(VEC, DP4, cdst, mask, htmp, ceqn, none));
 	}
 
-	vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;
+	//vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;
+
+	/* Append NOP + END instruction for branches to the end of the program */
+	nvfx_vp_emit(vpc, arith(VEC, NOP, none.reg, 0, none, none, none));
+        vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST | 0x1000;
 
 	if(debug_get_option_nvfx_dump_vp())
 	{
@@ -879,9 +1010,12 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx,
 		debug_printf("\n");
 	}
 
+	vp->exec_start = -1;
 	vp->translated = TRUE;
 out_err:
 	tgsi_parse_free(&parse);
+	util_dynarray_fini(&vpc->label_relocs);
+	util_dynarray_fini(&vpc->loop_stack);
 	if (vpc->r_temp)
 		FREE(vpc->r_temp);
 	if (vpc->r_address)
@@ -977,11 +1111,27 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 	 * fixup offsets and register IDs.
 	 */
 	if (vp->exec_start != vp->exec->start) {
-		for (i = 0; i < vp->nr_insns; i++) {
-			struct nvfx_vertex_program_exec *vpi = &vp->insns[i];
+		//printf("vp_relocs %u -> %u\n", vp->exec_start, vp->exec->start);
+		for(unsigned i = 0; i < vp->branch_relocs.size; i += sizeof(struct nvfx_label_relocation))
+		{
+			struct nvfx_label_relocation* reloc = (struct nvfx_label_relocation*)((char*)vp->branch_relocs.data + i);
+			uint32_t* hw = vp->insns[reloc->location].data;
+			unsigned target = vp->exec->start + reloc->target;
 
-			if (vpi->has_branch_offset) {
-				assert(0);
+			//debug_printf("vp_reloc hw %u -> hw %u\n", reloc->location, target);
+
+			if(!nvfx->is_nv4x)
+			{
+				hw[2] &=~ NV30_VP_INST_IADDR_MASK;
+				hw[2] |= (target & 0x1ff) << NV30_VP_INST_IADDR_SHIFT;
+			}
+			else
+			{
+				hw[3] &=~ NV40_VP_INST_IADDRL_MASK;
+				hw[3] |= (target & 7) << NV40_VP_INST_IADDRL_SHIFT;
+
+				hw[2] &=~ NV40_VP_INST_IADDRH_MASK;
+				hw[2] |= ((target >> 3) & 0x3f) << NV40_VP_INST_IADDRH_SHIFT;
 			}
 		}
 
-- 
cgit v1.2.3


From a0c45eabf961905ea7bd48b2750fce41c8ba542b Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Sun, 22 Aug 2010 00:21:55 +0200
Subject: nvfx: use relocations array for vp constants

---
 src/gallium/drivers/nvfx/nvfx_fragprog.c | 18 ++++++------
 src/gallium/drivers/nvfx/nvfx_shader.h   |  2 +-
 src/gallium/drivers/nvfx/nvfx_state.h    |  2 +-
 src/gallium/drivers/nvfx/nvfx_vertprog.c | 47 ++++++++++++++++----------------
 4 files changed, 34 insertions(+), 35 deletions(-)

(limited to 'src/gallium/drivers/nvfx/nvfx_state.h')

diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c
index db33ecd78cf..025989ac5bb 100644
--- a/src/gallium/drivers/nvfx/nvfx_fragprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c
@@ -270,7 +270,7 @@ nv40_fp_if(struct nvfx_fpc *fpc, struct nvfx_src src)
 static void
 nv40_fp_cal(struct nvfx_fpc *fpc, unsigned target)
 {
-        struct nvfx_label_relocation reloc;
+        struct nvfx_relocation reloc;
         uint32_t *hw;
         fpc->inst_offset = fpc->fp->insn_len;
         grow_insns(fpc, 4);
@@ -284,7 +284,7 @@ nv40_fp_cal(struct nvfx_fpc *fpc, unsigned target)
         hw[3] = 0;
         reloc.target = target;
         reloc.location = fpc->inst_offset + 2;
-        util_dynarray_append(&fpc->label_relocs, struct nvfx_label_relocation, reloc);
+        util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
 }
 
 static void
@@ -306,7 +306,7 @@ nv40_fp_ret(struct nvfx_fpc *fpc)
 static void
 nv40_fp_rep(struct nvfx_fpc *fpc, unsigned count, unsigned target)
 {
-        struct nvfx_label_relocation reloc;
+        struct nvfx_relocation reloc;
         uint32_t *hw;
         fpc->inst_offset = fpc->fp->insn_len;
         grow_insns(fpc, 4);
@@ -325,7 +325,7 @@ nv40_fp_rep(struct nvfx_fpc *fpc, unsigned count, unsigned target)
         hw[3] = 0; /* | end_offset */
         reloc.target = target;
         reloc.location = fpc->inst_offset + 3;
-        util_dynarray_append(&fpc->label_relocs, struct nvfx_label_relocation, reloc);
+        util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
         //util_dynarray_append(&fpc->loop_stack, unsigned, target);
 }
 
@@ -333,7 +333,7 @@ nv40_fp_rep(struct nvfx_fpc *fpc, unsigned count, unsigned target)
 static void
 nv40_fp_bra(struct nvfx_fpc *fpc, unsigned target)
 {
-        struct nvfx_label_relocation reloc;
+        struct nvfx_relocation reloc;
         uint32_t *hw;
         fpc->inst_offset = fpc->fp->insn_len;
         grow_insns(fpc, 4);
@@ -349,10 +349,10 @@ nv40_fp_bra(struct nvfx_fpc *fpc, unsigned target)
         hw[3] = 0; /* | endif_offset */
         reloc.target = target;
         reloc.location = fpc->inst_offset + 2;
-        util_dynarray_append(&fpc->label_relocs, struct nvfx_label_relocation, reloc);
+        util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
         reloc.target = target;
         reloc.location = fpc->inst_offset + 3;
-        util_dynarray_append(&fpc->label_relocs, struct nvfx_label_relocation, reloc);
+        util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
 }
 
 static void
@@ -1041,9 +1041,9 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
 	}
 	util_dynarray_append(&insns, unsigned, fp->insn_len);
 
-	for(unsigned i = 0; i < fpc->label_relocs.size; i += sizeof(struct nvfx_label_relocation))
+	for(unsigned i = 0; i < fpc->label_relocs.size; i += sizeof(struct nvfx_relocation))
 	{
-		struct nvfx_label_relocation* label_reloc = (struct nvfx_label_relocation*)((char*)fpc->label_relocs.data + i);
+		struct nvfx_relocation* label_reloc = (struct nvfx_relocation*)((char*)fpc->label_relocs.data + i);
 		fp->insn[label_reloc->location] |= ((unsigned*)insns.data)[label_reloc->target];
 	}
 	util_dynarray_fini(&insns);
diff --git a/src/gallium/drivers/nvfx/nvfx_shader.h b/src/gallium/drivers/nvfx/nvfx_shader.h
index 52e684aec3b..c711484ee2b 100644
--- a/src/gallium/drivers/nvfx/nvfx_shader.h
+++ b/src/gallium/drivers/nvfx/nvfx_shader.h
@@ -509,7 +509,7 @@ nvfx_src_abs(struct nvfx_src src)
 	return src;
 }
 
-struct nvfx_label_relocation {
+struct nvfx_relocation {
         unsigned location;
         unsigned target;
 };
diff --git a/src/gallium/drivers/nvfx/nvfx_state.h b/src/gallium/drivers/nvfx/nvfx_state.h
index 6d589af5f35..1247abcfa21 100644
--- a/src/gallium/drivers/nvfx/nvfx_state.h
+++ b/src/gallium/drivers/nvfx/nvfx_state.h
@@ -9,7 +9,6 @@
 
 struct nvfx_vertex_program_exec {
 	uint32_t data[4];
-	int const_index;
 };
 
 struct nvfx_vertex_program_data {
@@ -46,6 +45,7 @@ struct nvfx_vertex_program {
 	uint32_t clip_ctrl;
 
 	struct util_dynarray branch_relocs;
+	struct util_dynarray const_relocs;
 };
 
 struct nvfx_fragment_program_data {
diff --git a/src/gallium/drivers/nvfx/nvfx_vertprog.c b/src/gallium/drivers/nvfx/nvfx_vertprog.c
index 38f37168a18..996680e32cc 100644
--- a/src/gallium/drivers/nvfx/nvfx_vertprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_vertprog.c
@@ -113,6 +113,7 @@ emit_src(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int pos,
 {
 	struct nvfx_vertex_program *vp = vpc->vp;
 	uint32_t sr = 0;
+	struct nvfx_relocation reloc;
 
 	switch (src.reg.type) {
 	case NVFXSR_TEMP:
@@ -128,9 +129,9 @@ emit_src(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int pos,
 	case NVFXSR_CONST:
 		sr |= (NVFX_VP(SRC_REG_TYPE_CONST) <<
 		       NVFX_VP(SRC_REG_TYPE_SHIFT));
-		assert(vpc->vpi->const_index == -1 ||
-		       vpc->vpi->const_index == src.reg.index);
-		vpc->vpi->const_index = src.reg.index;
+		reloc.location = vp->nr_insns - 1;
+		reloc.target = src.reg.index;
+		util_dynarray_append(&vp->const_relocs, struct nvfx_relocation, reloc);
 		break;
 	case NVFXSR_NONE:
 		sr |= (NVFX_VP(SRC_REG_TYPE_INPUT) <<
@@ -308,7 +309,6 @@ nvfx_vp_emit(struct nvfx_vpc *vpc, struct nvfx_insn insn)
 	vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi));
 	vpc->vpi = &vp->insns[vp->nr_insns - 1];
 	memset(vpc->vpi, 0, sizeof(*vpc->vpi));
-	vpc->vpi->const_index = -1;
 
 	hw = vpc->vpi->data;
 
@@ -440,7 +440,7 @@ nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
 	struct nvfx_reg dst;
 	struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
 	struct nvfx_insn insn;
-	struct nvfx_label_relocation reloc;
+	struct nvfx_relocation reloc;
 	struct nvfx_loop_entry loop;
 	int mask;
 	int ai = -1, ci = -1, ii = -1;
@@ -662,7 +662,7 @@ nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
 
 		reloc.location = vpc->vp->nr_insns;
 		reloc.target = finst->Label.Label + 1;
-		util_dynarray_append(&vpc->label_relocs, struct nvfx_label_relocation, reloc);
+		util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
 
 		insn = arith(SCA, BRA, none.reg, 0, none, none, none);
 		insn.cc_test = NVFX_COND_EQ;
@@ -675,7 +675,7 @@ nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
 	case TGSI_OPCODE_CAL:
 		reloc.location = vpc->vp->nr_insns;
 		reloc.target = finst->Label.Label;
-		util_dynarray_append(&vpc->label_relocs, struct nvfx_label_relocation, reloc);
+		util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
 
 		if(finst->Instruction.Opcode == TGSI_OPCODE_CAL)
 			insn = arith(SCA, CAL, none.reg, 0, none, none, none);
@@ -707,7 +707,7 @@ nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
 
 		reloc.location = vpc->vp->nr_insns;
 		reloc.target = loop.cont_target;
-		util_dynarray_append(&vpc->label_relocs, struct nvfx_label_relocation, reloc);
+		util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
 
 		nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none));
 		break;
@@ -717,7 +717,7 @@ nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
 
 		reloc.location = vpc->vp->nr_insns;
 		reloc.target = loop.cont_target;
-		util_dynarray_append(&vpc->label_relocs, struct nvfx_label_relocation, reloc);
+		util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
 
 		nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none));
 		break;
@@ -727,7 +727,7 @@ nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
 
 		reloc.location = vpc->vp->nr_insns;
 		reloc.target = loop.brk_target;
-		util_dynarray_append(&vpc->label_relocs, struct nvfx_label_relocation, reloc);
+		util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
 
 		nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none));
 		break;
@@ -979,17 +979,17 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx,
 
 	util_dynarray_append(&insns, unsigned, vp->nr_insns);
 
-	for(unsigned i = 0; i < vpc->label_relocs.size; i += sizeof(struct nvfx_label_relocation))
+	for(unsigned i = 0; i < vpc->label_relocs.size; i += sizeof(struct nvfx_relocation))
 	{
-		struct nvfx_label_relocation* label_reloc = (struct nvfx_label_relocation*)((char*)vpc->label_relocs.data + i);
-		struct nvfx_label_relocation hw_reloc;
+		struct nvfx_relocation* label_reloc = (struct nvfx_relocation*)((char*)vpc->label_relocs.data + i);
+		struct nvfx_relocation hw_reloc;
 
 		hw_reloc.location = label_reloc->location;
 		hw_reloc.target = ((unsigned*)insns.data)[label_reloc->target];
 
 		//debug_printf("hw %u -> tgsi %u = hw %u\n", hw_reloc.location, label_reloc->target, hw_reloc.target);
 
-		util_dynarray_append(&vp->branch_relocs, struct nvfx_label_relocation, hw_reloc);
+		util_dynarray_append(&vp->branch_relocs, struct nvfx_relocation, hw_reloc);
 	}
 	util_dynarray_fini(&insns);
 	util_dynarray_trim(&vp->branch_relocs);
@@ -1155,9 +1155,9 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 	 */
 	if (vp->exec_start != vp->exec->start) {
 		//printf("vp_relocs %u -> %u\n", vp->exec_start, vp->exec->start);
-		for(unsigned i = 0; i < vp->branch_relocs.size; i += sizeof(struct nvfx_label_relocation))
+		for(unsigned i = 0; i < vp->branch_relocs.size; i += sizeof(struct nvfx_relocation))
 		{
-			struct nvfx_label_relocation* reloc = (struct nvfx_label_relocation*)((char*)vp->branch_relocs.data + i);
+			struct nvfx_relocation* reloc = (struct nvfx_relocation*)((char*)vp->branch_relocs.data + i);
 			uint32_t* hw = vp->insns[reloc->location].data;
 			unsigned target = vp->exec->start + reloc->target;
 
@@ -1182,16 +1182,15 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 	}
 
 	if (vp->nr_consts && vp->data_start != vp->data->start) {
-		for (i = 0; i < vp->nr_insns; i++) {
-			struct nvfx_vertex_program_exec *vpi = &vp->insns[i];
+		for(unsigned i = 0; i < vp->const_relocs.size; i += sizeof(struct nvfx_relocation))
+		{
+			struct nvfx_relocation* reloc = (struct nvfx_relocation*)((char*)vp->const_relocs.data + i);
+			struct nvfx_vertex_program_exec *vpi = &vp->insns[reloc->location];
 
-			if (vpi->const_index >= 0) {
-				vpi->data[1] &= ~NVFX_VP(INST_CONST_SRC_MASK);
-				vpi->data[1] |=
-					(vpi->const_index + vp->data->start) <<
+			vpi->data[1] &= ~NVFX_VP(INST_CONST_SRC_MASK);
+			vpi->data[1] |=
+					(reloc->target + vp->data->start) <<
 					NVFX_VP(INST_CONST_SRC_SHIFT);
-
-			}
 		}
 
 		vp->data_start = vp->data->start;
-- 
cgit v1.2.3


From 7de1f86c49716eeadb443507d16ead933288059c Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Sun, 22 Aug 2010 12:02:41 +0200
Subject: nvfx: simplify and correct fragment program update logic

This version should hopefully be much clearer and thus less likely
to be subtly broken.

Also fixes point sprites on nv40 and possibly some other bugs too.
---
 src/gallium/drivers/nvfx/nvfx_context.c    |   2 +
 src/gallium/drivers/nvfx/nvfx_context.h    |   2 +
 src/gallium/drivers/nvfx/nvfx_fragprog.c   | 185 +++++++++++++++++++----------
 src/gallium/drivers/nvfx/nvfx_state.h      |   3 +-
 src/gallium/drivers/nvfx/nvfx_state_emit.c |   2 +
 src/gallium/drivers/nvfx/nvfx_vertprog.c   |  49 +++-----
 6 files changed, 149 insertions(+), 94 deletions(-)

(limited to 'src/gallium/drivers/nvfx/nvfx_state.h')

diff --git a/src/gallium/drivers/nvfx/nvfx_context.c b/src/gallium/drivers/nvfx/nvfx_context.c
index 8e852010f57..e78fc14da44 100644
--- a/src/gallium/drivers/nvfx/nvfx_context.c
+++ b/src/gallium/drivers/nvfx/nvfx_context.c
@@ -90,6 +90,8 @@ nvfx_create(struct pipe_screen *pscreen, void *priv)
 
 	/* set these to that we init them on first validation */
 	nvfx->state.scissor_enabled = ~0;
+	nvfx->hw_pointsprite_control = -1;
+	nvfx->hw_vp_output = -1;
 	nvfx->use_vertex_buffers = -1;
 
 	LIST_INITHEAD(&nvfx->render_cache);
diff --git a/src/gallium/drivers/nvfx/nvfx_context.h b/src/gallium/drivers/nvfx/nvfx_context.h
index 63fbce87b50..fb4a9da5792 100644
--- a/src/gallium/drivers/nvfx/nvfx_context.h
+++ b/src/gallium/drivers/nvfx/nvfx_context.h
@@ -193,6 +193,8 @@ struct nvfx_context {
 	uint32_t hw_txf[8];
 	struct nvfx_render_target hw_rt[4];
 	struct nvfx_render_target hw_zeta;
+	int hw_pointsprite_control;
+	int hw_vp_output;
 };
 
 static INLINE struct nvfx_context *
diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c
index 025989ac5bb..e40a814e18c 100644
--- a/src/gallium/drivers/nvfx/nvfx_fragprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c
@@ -1101,6 +1101,35 @@ nvfx_fp_memcpy(void* dst, const void* src, size_t len)
 #endif
 }
 
+/* The hardware only supports immediate constants inside the fragment program,
+ * and at least on nv30 doesn't support an indirect linkage table.
+ *
+ * Hence, we need to patch the fragment program itself both to update constants
+ * and update linkage.
+ *
+ * Using a single fragment program would entail unacceptable stalls if the GPU is
+ * already rendering with that fragment program.
+ * Thus, we instead use a "rotating queue" of buffer objects, each of which is
+ * packed with multiple versions of the same program.
+ *
+ * Whenever we need to patch something, we move to the next program and
+ * patch it. If all buffer objects are in use by the GPU, we allocate another one,
+ * expanding the queue.
+ *
+ * As an additional optimization, we record when all the programs have the
+ * current input slot configuration, and at that point we stop patching inputs.
+ * This happens, for instance, if a given fragment program is always used with
+ * the same vertex program (i.e. always with GLSL), or if the layouts match
+ * enough (non-GLSL).
+ *
+ * Note that instead of using multiple programs, we could push commands
+ * on the FIFO to patch a single program: it's not fully clear which option is
+ * faster, but my guess is that the current way is faster.
+ *
+ * We also track the previous slot assignments for each version and don't
+ * patch if they are the same (this could perhaps be removed).
+ */
+
 void
 nvfx_fragprog_validate(struct nvfx_context *nvfx)
 {
@@ -1109,6 +1138,7 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 	int update = 0;
 	struct nvfx_vertex_program* vp;
 	unsigned sprite_coord_enable;
+	boolean update_pointsprite = !!(nvfx->dirty & NVFX_NEW_FRAGPROG);
 
 	if (!fp->translated)
 	{
@@ -1141,80 +1171,96 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 		fp->bo_prog_idx = fp->progs_per_bo - 1;
 	}
 
-	/* we must update constants even on "just" fragprog changes, because
-	   we don't check whether the current constant buffer matches the latest
-	   one bound to this fragment program */
-	if (nvfx->dirty & (NVFX_NEW_FRAGCONST | NVFX_NEW_FRAGPROG))
-		update = TRUE;
-
 	vp = nvfx->render_mode == HW ? nvfx->vertprog : nvfx->swtnl.vertprog;
-	if (fp->last_vp_id != vp->id) {
-		char* vp_sem_table = vp->generic_to_fp_input;
-		unsigned char* fp_semantics = fp->slot_to_generic;
-		unsigned diff = 0;
-		unsigned char* cur_slots;
-		fp->last_vp_id = nvfx->vertprog->id;
-		cur_slots = fp->slot_to_fp_input;
-		for(unsigned i = 0; i < fp->num_slots; ++i) {
-			unsigned char slot_mask = vp_sem_table[fp_semantics[i]];
-			diff |= (slot_mask >> 4) & (slot_mask ^ cur_slots[i]);
-		}
+        sprite_coord_enable = nvfx->rasterizer->pipe.point_quad_rasterization * nvfx->rasterizer->pipe.sprite_coord_enable;
+
+	if (fp->last_vp_id != vp->id || fp->last_sprite_coord_enable != sprite_coord_enable) {
+		int sprite_input = -1;
+		unsigned i;
+		fp->last_vp_id = vp->id;
+		fp->last_sprite_coord_enable = sprite_coord_enable;
 
-		if(diff)
+		if(sprite_coord_enable)
 		{
-			for(unsigned i = 0; i < fp->num_slots; ++i) {
-				/* if 0xff, then this will write to the dummy value at fp->last_layout_mask[0] */
-				fp->slot_to_fp_input[i] = vp_sem_table[fp_semantics[i]] & 0xf;
-				//printf("fp: GENERIC[%i] from fpreg %i\n", fp_semantics[i], fp->slot_to_fp_input[i]);
+			sprite_input = vp->sprite_fp_input;
+			if(sprite_input < 0)
+			{
+				unsigned used_texcoords = 0;
+				for(unsigned i = 0; i < fp->num_slots; ++i) {
+					unsigned generic = fp->slot_to_generic[i];
+					if(!((1 << generic) & sprite_coord_enable))
+					{
+						unsigned char slot_mask = vp->generic_to_fp_input[generic];
+						if(slot_mask >= 0xf0)
+							used_texcoords |= 1 << ((slot_mask & 0xf) - NVFX_FP_OP_INPUT_SRC_TC0);
+					}
+				}
+
+				sprite_input = NVFX_FP_OP_INPUT_SRC_TC(__builtin_ctz(~used_texcoords));
 			}
 
-			fp->progs_left_with_obsolete_slot_assignments = fp->progs;
-			update = TRUE;
+			fp->point_sprite_control |= (1 << (sprite_input - NVFX_FP_OP_INPUT_SRC_TC0 + 8));
 		}
-	}
+		else
+			fp->point_sprite_control = 0;
 
-	// last_sprite_coord_enable
-	sprite_coord_enable = nvfx->rasterizer->pipe.point_quad_rasterization * nvfx->rasterizer->pipe.sprite_coord_enable;
-	if(fp->last_sprite_coord_enable != sprite_coord_enable)
-	{
-		unsigned texcoord_mask = vp->texcoord_ouput_mask;
-		fp->last_sprite_coord_enable = sprite_coord_enable;
-		fp->point_sprite_control = 0;
-		for(unsigned i = 0; i < fp->num_slots; ++i) {
-			if((1 << fp->slot_to_generic[i]) & sprite_coord_enable)
+		for(i = 0; i < fp->num_slots; ++i) {
+			unsigned generic = fp->slot_to_generic[i];
+			if((1 << generic) & sprite_coord_enable)
 			{
-				unsigned fpin = fp->slot_to_fp_input[i];
-				//printf("sprite: slot %i generic %i had texcoord %i\n", i, fp->slot_to_generic[i], fpin - NVFX_FP_OP_INPUT_SRC_TC0);
-				if(fpin >= 0x0f)
-				{
-					unsigned tc = __builtin_ctz(~texcoord_mask);
-					texcoord_mask |= (1 << tc);
-					fp->slot_to_fp_input[i] = fpin = NVFX_FP_OP_INPUT_SRC_TC(tc);
-
-					fp->progs_left_with_obsolete_slot_assignments = fp->progs;
-					update = TRUE;
-				}
-				//printf("sprite: slot %i texcoord %i\n", i, fpin - NVFX_FP_OP_INPUT_SRC_TC0);
-				fp->point_sprite_control |= (1 << (fpin - NVFX_FP_OP_INPUT_SRC_TC0 + 8));
+				if(fp->slot_to_fp_input[i] != sprite_input)
+					goto update_slots;
 			}
 			else
 			{
-				unsigned fpin = fp->slot_to_fp_input[i];
-				if(!(vp->texcoord_ouput_mask & (1 << (fpin - NVFX_FP_OP_INPUT_SRC_TC0))))
-				{
-					fp->slot_to_fp_input[i] = 0x0f;
+				unsigned char slot_mask = vp->generic_to_fp_input[generic];
+				if((slot_mask >> 4) & (slot_mask ^ fp->slot_to_fp_input[i]))
+					goto update_slots;
+			}
+		}
+
+		if(0)
+		{
+update_slots:
+			/* optimization: we start updating from the slot we found the first difference in */
+			for(; i < fp->num_slots; ++i)
+			{
+				unsigned generic = fp->slot_to_generic[i];
+				if((1 << generic) & sprite_coord_enable)
+					fp->slot_to_fp_input[i] = sprite_input;
+				else
+					fp->slot_to_fp_input[i] = vp->generic_to_fp_input[generic] & 0xf;
+			}
 
-					fp->progs_left_with_obsolete_slot_assignments = fp->progs;
-					update = TRUE;
+			if(nvfx->is_nv4x)
+			{
+				fp->or = 0;
+				for(i = 0; i < fp->num_slots; ++i) {
+					unsigned fp_input = fp->slot_to_fp_input[i];
+					if(fp_input == NVFX_FP_OP_INPUT_SRC_TC(8))
+						fp->or |= (1 << 12);
+					else if(fp_input == NVFX_FP_OP_INPUT_SRC_TC(9))
+						fp->or |= (1 << 13);
+					else if(fp_input != 0xf)
+						fp->or |= (1 << (fp_input - NVFX_FP_OP_INPUT_SRC_TC0 + 14));
 				}
 			}
+
+			fp->progs_left_with_obsolete_slot_assignments = fp->progs;
+			goto update;
 		}
 	}
 
-	if(update) {
+	/* We must update constants even on "just" fragprog changes, because
+	  * we don't check whether the current constant buffer matches the latest
+	  * one bound to this fragment program.
+	  * Doing such a check would likely be a pessimization.
+	  */
+	if (nvfx->dirty & (NVFX_NEW_FRAGCONST | NVFX_NEW_FRAGPROG)) {
 		int offset;
 		uint32_t* fpmap;
 
+update:
 		++fp->bo_prog_idx;
 		if(fp->bo_prog_idx >= fp->progs_per_bo)
 		{
@@ -1278,6 +1324,9 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 			}
 		}
 
+		/* We only repatch the input slots while some program versions may still
+		 * have obsolete slot assignments; otherwise we just update constants, for speed.
+		 */
 		if(fp->progs_left_with_obsolete_slot_assignments) {
 			unsigned char* fpbo_slots = &fp->fpbo->slots[fp->bo_prog_idx * 8];
 			for(unsigned i = 0; i < fp->num_slots; ++i) {
@@ -1296,10 +1345,7 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 			}
 			--fp->progs_left_with_obsolete_slot_assignments;
 		}
-	}
 
-	if(update || (nvfx->dirty & NVFX_NEW_FRAGPROG)) {
-		int offset = fp->bo_prog_idx * fp->prog_size;
 		MARK_RING(chan, 8, 1);
 		OUT_RING(chan, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1));
 		OUT_RELOC(chan, fp->fpbo->bo, offset, NOUVEAU_BO_VRAM |
@@ -1316,11 +1362,28 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 		}
 	}
 
-	if(nvfx->dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_SPRITE))
 	{
-		WAIT_RING(chan, 2);
-		OUT_RING(chan, RING_3D(NV34TCL_POINT_SPRITE, 1));
-		OUT_RING(chan, fp->point_sprite_control | nvfx->rasterizer->pipe.point_quad_rasterization);
+		unsigned pointsprite_control = fp->point_sprite_control | nvfx->rasterizer->pipe.point_quad_rasterization;
+		if(pointsprite_control != nvfx->hw_pointsprite_control)
+		{
+			WAIT_RING(chan, 2);
+			OUT_RING(chan, RING_3D(NV34TCL_POINT_SPRITE, 1));
+			OUT_RING(chan, pointsprite_control);
+			nvfx->hw_pointsprite_control = pointsprite_control;
+		}
+	}
+
+	if(nvfx->is_nv4x)
+	{
+		unsigned vp_output = vp->or | fp->or;
+
+		if(vp_output != nvfx->hw_vp_output)
+		{
+			WAIT_RING(chan, 2);
+			OUT_RING(chan, RING_3D(NV40TCL_VP_RESULT_EN, 1));
+			OUT_RING(chan, vp_output);
+			nvfx->hw_vp_output = vp_output;
+		}
 	}
 }
 
diff --git a/src/gallium/drivers/nvfx/nvfx_state.h b/src/gallium/drivers/nvfx/nvfx_state.h
index 1247abcfa21..05d41cfc8dd 100644
--- a/src/gallium/drivers/nvfx/nvfx_state.h
+++ b/src/gallium/drivers/nvfx/nvfx_state.h
@@ -32,7 +32,7 @@ struct nvfx_vertex_program {
 	unsigned nr_consts;
 
 	char generic_to_fp_input[256];
-	unsigned texcoord_ouput_mask;
+	int sprite_fp_input;
 
 	struct nouveau_resource *exec;
 	unsigned exec_start;
@@ -67,6 +67,7 @@ struct nvfx_fragment_program {
 	boolean translated;
 	unsigned samplers;
 	unsigned point_sprite_control;
+	unsigned or;
 
 	uint32_t *insn;
 	int       insn_len;
diff --git a/src/gallium/drivers/nvfx/nvfx_state_emit.c b/src/gallium/drivers/nvfx/nvfx_state_emit.c
index 8e3c342179d..bd89a385d7c 100644
--- a/src/gallium/drivers/nvfx/nvfx_state_emit.c
+++ b/src/gallium/drivers/nvfx/nvfx_state_emit.c
@@ -17,6 +17,8 @@ nvfx_state_validate_common(struct nvfx_context *nvfx)
 	{
 		nvfx->dirty = ~0;
 		nvfx->hw_vtxelt_nr = 16;
+		nvfx->hw_pointsprite_control = -1;
+		nvfx->hw_vp_output = -1;
 		nvfx->screen->cur_ctx = nvfx;
 	}
 
diff --git a/src/gallium/drivers/nvfx/nvfx_vertprog.c b/src/gallium/drivers/nvfx/nvfx_vertprog.c
index 3c3521e1622..806f263dcff 100644
--- a/src/gallium/drivers/nvfx/nvfx_vertprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_vertprog.c
@@ -235,24 +235,10 @@ emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot
 			dst.index = NVFX_VP(INST_DEST_PSZ);
 			break;
 		default:
-			if(!nvfx->is_nv4x) {
-				switch (dst.index) {
-				case NV30_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
-				case NV30_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
-				case NV30_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
-				case NV30_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
-				case NV30_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
-				case NV30_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
-				case NV30_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
-				case NV30_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
-				case NV30_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
-				case NV30_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
-				case NV30_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
-				case NV30_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
-				case NV30_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
-				case NV30_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
-				}
-			} else {
+			if(nvfx->is_nv4x) {
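+				/* vp->or is not needed on nv3x, so it is only built on nv4x;
+				 * texcoord outputs are no longer recorded here because the
+				 * fragment program validation adds them through fp->or */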
 				switch (dst.index) {
 				case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
 				case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
@@ -260,14 +246,6 @@ emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot
 				case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
 				case NV40_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
 				case NV40_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
-				case NV40_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
-				case NV40_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
-				case NV40_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
-				case NV40_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
-				case NV40_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
-				case NV40_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
-				case NV40_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
-				case NV40_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
 				}
 			}
 			break;
@@ -817,13 +795,21 @@ nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
 
 	/* hope 0xf is (0, 0, 0, 1) initialized; otherwise, we are _probably_ not required to do this */
 	memset(vpc->vp->generic_to_fp_input, 0x0f, sizeof(vpc->vp->generic_to_fp_input));
-	vpc->vp->texcoord_ouput_mask = 0;
 	for(int i = 0; i < 8; ++i) {
 		if(sem_layout[i] == 0xff)
 			continue;
-		vpc->vp->texcoord_ouput_mask |= (1 << i);
 		//printf("vp: GENERIC[%i] to fpreg %i\n", sem_layout[i], NVFX_FP_OP_INPUT_SRC_TC(0) + i);
-		vpc->vp->generic_to_fp_input[sem_layout[i]] = 0xf0 | (NVFX_FP_OP_INPUT_SRC_TC(0) + i);
+		vpc->vp->generic_to_fp_input[sem_layout[i]] = 0xf0 | NVFX_FP_OP_INPUT_SRC_TC(i);
+	}
+
+	vpc->vp->sprite_fp_input = -1;
+	for(int i = 0; i < 8; ++i)
+	{
+		if(sem_layout[i] == 0xff)
+		{
+			vpc->vp->sprite_fp_input = NVFX_FP_OP_INPUT_SRC_TC(i);
+			break;
+		}
 	}
 
 	tgsi_parse_init(&p, vpc->vp->pipe.tokens);
@@ -1233,13 +1219,12 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 
 	if(nvfx->dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_UCP))
 	{
-		WAIT_RING(chan, 7);
+		WAIT_RING(chan, 6);
 		OUT_RING(chan, RING_3D(NV34TCL_VP_START_FROM_ID, 1));
 		OUT_RING(chan, vp->exec->start);
 		if(nvfx->is_nv4x) {
-			OUT_RING(chan, RING_3D(NV40TCL_VP_ATTRIB_EN, 2));
+			OUT_RING(chan, RING_3D(NV40TCL_VP_ATTRIB_EN, 1));
 			OUT_RING(chan, vp->ir);
-			OUT_RING(chan, vp->or);
 		}
 		OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANES_ENABLE, 1));
 		OUT_RING(chan, vp->clip_ctrl);
-- 
cgit v1.2.3


From df86f1e7d50e01b92e03dc25fa9e9258d2d4fa2f Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Sun, 22 Aug 2010 16:15:51 +0200
Subject: nvfx: refactor to support multiple fragment program versions

---
 src/gallium/drivers/nvfx/nvfx_context.c  |   4 +
 src/gallium/drivers/nvfx/nvfx_context.h  |   5 +-
 src/gallium/drivers/nvfx/nvfx_draw.c     |  10 +-
 src/gallium/drivers/nvfx/nvfx_fragprog.c | 174 +++++++++++++++++++------------
 src/gallium/drivers/nvfx/nvfx_state.h    |  10 +-
 5 files changed, 127 insertions(+), 76 deletions(-)

(limited to 'src/gallium/drivers/nvfx/nvfx_state.h')

diff --git a/src/gallium/drivers/nvfx/nvfx_context.c b/src/gallium/drivers/nvfx/nvfx_context.c
index 99ad7bfacf7..80b36fb7b91 100644
--- a/src/gallium/drivers/nvfx/nvfx_context.c
+++ b/src/gallium/drivers/nvfx/nvfx_context.c
@@ -33,6 +33,9 @@ nvfx_destroy(struct pipe_context *pipe)
 {
 	struct nvfx_context *nvfx = nvfx_context(pipe);
 
+	if(nvfx->dummy_fs)
+		pipe->delete_fs_state(pipe, nvfx->dummy_fs);
+
 	for(unsigned i = 0; i < nvfx->vtxbuf_nr; ++i)
 		pipe_resource_reference(&nvfx->vtxbuf[i].buffer, 0);
 	pipe_resource_reference(&nvfx->idxbuf.buffer, 0);
@@ -42,6 +45,7 @@ nvfx_destroy(struct pipe_context *pipe)
 
 	if (nvfx->draw)
 		draw_destroy(nvfx->draw);
+
 	FREE(nvfx);
 }
 
diff --git a/src/gallium/drivers/nvfx/nvfx_context.h b/src/gallium/drivers/nvfx/nvfx_context.h
index 02e8ed01784..2134f3c3865 100644
--- a/src/gallium/drivers/nvfx/nvfx_context.h
+++ b/src/gallium/drivers/nvfx/nvfx_context.h
@@ -161,7 +161,7 @@ struct nvfx_context {
 	unsigned stipple[32];
 	struct pipe_clip_state clip;
 	struct nvfx_vertex_program *vertprog;
-	struct nvfx_fragment_program *fragprog;
+	struct nvfx_pipe_fragment_program *fragprog;
 	struct pipe_resource *constbuf[PIPE_SHADER_TYPES];
 	unsigned constbuf_nr[PIPE_SHADER_TYPES];
 	struct nvfx_rasterizer_state *rasterizer;
@@ -174,6 +174,8 @@ struct nvfx_context {
 	struct pipe_index_buffer idxbuf;
 	struct nvfx_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS];
 	struct pipe_sampler_view *fragment_sampler_views[PIPE_MAX_SAMPLERS];
+	struct nvfx_pipe_fragment_program* dummy_fs;
+
 	unsigned nr_samplers;
 	unsigned nr_textures;
 	unsigned dirty_samplers;
@@ -195,6 +197,7 @@ struct nvfx_context {
 	struct nvfx_render_target hw_zeta;
 	int hw_pointsprite_control;
 	int hw_vp_output;
+	struct nvfx_fragment_program* hw_fragprog;
 };
 
 static INLINE struct nvfx_context *
diff --git a/src/gallium/drivers/nvfx/nvfx_draw.c b/src/gallium/drivers/nvfx/nvfx_draw.c
index 331e28418ad..0b179212957 100644
--- a/src/gallium/drivers/nvfx/nvfx_draw.c
+++ b/src/gallium/drivers/nvfx/nvfx_draw.c
@@ -274,19 +274,19 @@ emit_attrib(struct nvfx_context *nvfx, unsigned hw, unsigned emit,
 void
 nvfx_vtxfmt_validate(struct nvfx_context *nvfx)
 {
-	struct nvfx_fragment_program *fp = nvfx->fragprog;
+	struct nvfx_pipe_fragment_program *pfp = nvfx->fragprog;
 	unsigned colour = 0, texcoords = 0, fog = 0, i;
 
 	/* Determine needed fragprog inputs */
-	for (i = 0; i < fp->info.num_inputs; i++) {
-		switch (fp->info.input_semantic_name[i]) {
+	for (i = 0; i < pfp->info.num_inputs; i++) {
+		switch (pfp->info.input_semantic_name[i]) {
 		case TGSI_SEMANTIC_POSITION:
 			break;
 		case TGSI_SEMANTIC_COLOR:
-			colour |= (1 << fp->info.input_semantic_index[i]);
+			colour |= (1 << pfp->info.input_semantic_index[i]);
 			break;
 		case TGSI_SEMANTIC_GENERIC:
-			texcoords |= (1 << fp->info.input_semantic_index[i]);
+			texcoords |= (1 << pfp->info.input_semantic_index[i]);
 			break;
 		case TGSI_SEMANTIC_FOG:
 			fog = 1;
diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c
index e0e31e46894..c4394b25f31 100644
--- a/src/gallium/drivers/nvfx/nvfx_fragprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c
@@ -8,6 +8,7 @@
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_ureg.h"
 
 #include "nvfx_context.h"
 #include "nvfx_shader.h"
@@ -17,6 +18,7 @@
 #define MAX_IMM 32
 
 struct nvfx_fpc {
+	struct nvfx_pipe_fragment_program* pfp;
 	struct nvfx_fragment_program *fp;
 
 	unsigned r_temps;
@@ -379,27 +381,27 @@ tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc)
 
 	switch (fsrc->Register.File) {
 	case TGSI_FILE_INPUT:
-		if(fpc->fp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_POSITION) {
-			assert(fpc->fp->info.input_semantic_index[fsrc->Register.Index] == 0);
+		if(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_POSITION) {
+			assert(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 0);
 			src.reg = nvfx_reg(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_POSITION);
-		} else if(fpc->fp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_COLOR) {
-			if(fpc->fp->info.input_semantic_index[fsrc->Register.Index] == 0)
+		} else if(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_COLOR) {
+			if(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 0)
 				src.reg = nvfx_reg(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_COL0);
-			else if(fpc->fp->info.input_semantic_index[fsrc->Register.Index] == 1)
+			else if(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 1)
 				src.reg = nvfx_reg(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_COL1);
 			else
 				assert(0);
-		} else if(fpc->fp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_FOG) {
-			assert(fpc->fp->info.input_semantic_index[fsrc->Register.Index] == 0);
+		} else if(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_FOG) {
+			assert(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 0);
 			src.reg = nvfx_reg(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_FOGC);
-		} else if(fpc->fp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_FACE) {
+		} else if(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_FACE) {
 			/* TODO: check this has the correct values */
 			/* XXX: what do we do for nv30 here (assuming it lacks facing)?!  */
-			assert(fpc->fp->info.input_semantic_index[fsrc->Register.Index] == 0);
+			assert(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 0);
 			src.reg = nvfx_reg(NVFXSR_INPUT, NV40_FP_OP_INPUT_SRC_FACING);
 		} else {
-			assert(fpc->fp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_GENERIC);
-			src.reg = nvfx_reg(NVFXSR_RELOCATED, fpc->generic_to_slot[fpc->fp->info.input_semantic_index[fsrc->Register.Index]]);
+			assert(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_GENERIC);
+			src.reg = nvfx_reg(NVFXSR_RELOCATED, fpc->generic_to_slot[fpc->pfp->info.input_semantic_index[fsrc->Register.Index]]);
 		}
 		break;
 	case TGSI_FILE_CONSTANT:
@@ -922,7 +924,7 @@ nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc)
 	float const0v[4] = {0, 0, 0, 0};
 	struct nvfx_reg const0;
 
-	fpc->fp->num_slots = util_semantic_set_from_program_file(&set, fpc->fp->pipe.tokens, TGSI_FILE_INPUT);
+	fpc->fp->num_slots = util_semantic_set_from_program_file(&set, fpc->pfp->pipe.tokens, TGSI_FILE_INPUT);
 	if(fpc->fp->num_slots > 8)
 		return FALSE;
 	util_semantic_layout_from_set(fpc->fp->slot_to_generic, &set, 0, 8);
@@ -933,7 +935,7 @@ nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc)
 	const0 = constant(fpc, -1, const0v);
 	assert(const0.index == 0);
 
-	tgsi_parse_init(&p, fpc->fp->pipe.tokens);
+	tgsi_parse_init(&p, fpc->pfp->pipe.tokens);
 	while (!tgsi_parse_end_of_tokens(&p)) {
 		const union tgsi_full_token *tok = &p.FullToken;
 
@@ -999,26 +1001,32 @@ out_err:
 
 DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_fp, "NVFX_DUMP_FP", FALSE)
 
-static void
+static struct nvfx_fragment_program*
 nvfx_fragprog_translate(struct nvfx_context *nvfx,
-			struct nvfx_fragment_program *fp)
+			struct nvfx_pipe_fragment_program *pfp)
 {
 	struct tgsi_parse_context parse;
 	struct nvfx_fpc *fpc = NULL;
 	struct util_dynarray insns;
+	struct nvfx_fragment_program* fp = NULL;
+        const int min_size = 4096;
+
+	fp = CALLOC_STRUCT(nvfx_fragment_program);
+	if(!fp)
+		goto out_err;
 
-	fpc = CALLOC(1, sizeof(struct nvfx_fpc));
+	fpc = CALLOC_STRUCT(nvfx_fpc);
 	if (!fpc)
-		return;
+		goto out_err;
+
+	fpc->pfp = pfp;
 	fpc->fp = fp;
 	fpc->num_regs = 2;
 
-	if (!nvfx_fragprog_prepare(nvfx, fpc)) {
-		FREE(fpc);
-		return;
-	}
+	if (!nvfx_fragprog_prepare(nvfx, fpc))
+		goto out_err;
 
-	tgsi_parse_init(&parse, fp->pipe.tokens);
+	tgsi_parse_init(&parse, pfp->pipe.tokens);
 
 	util_dynarray_init(&insns);
 	while (!tgsi_parse_end_of_tokens(&parse)) {
@@ -1068,7 +1076,7 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
 	if(debug_get_option_nvfx_dump_fp())
 	{
 		debug_printf("\n");
-		tgsi_dump(fp->pipe.tokens, 0);
+		tgsi_dump(pfp->pipe.tokens, 0);
 
 		debug_printf("\n%s fragment program:\n", nvfx->is_nv4x ? "nv4x" : "nv3x");
 		for (unsigned i = 0; i < fp->insn_len; i += 4)
@@ -1076,15 +1084,37 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
 		debug_printf("\n");
 	}
 
-	fp->translated = TRUE;
-out_err:
+        fp->prog_size = (fp->insn_len * 4 + 63) & ~63;
+
+        if(fp->prog_size >= min_size)
+                fp->progs_per_bo = 1;
+        else
+                fp->progs_per_bo = min_size / fp->prog_size;
+        fp->bo_prog_idx = fp->progs_per_bo - 1;
+
+out:
 	tgsi_parse_free(&parse);
-	if (fpc->r_temp)
-		FREE(fpc->r_temp);
-	util_dynarray_fini(&fpc->if_stack);
-	util_dynarray_fini(&fpc->label_relocs);
-	//util_dynarray_fini(&fpc->loop_stack);
-	FREE(fpc);
+	if(fpc)
+	{
+		if (fpc->r_temp)
+			FREE(fpc->r_temp);
+		util_dynarray_fini(&fpc->if_stack);
+		util_dynarray_fini(&fpc->label_relocs);
+		//util_dynarray_fini(&fpc->loop_stack);
+		FREE(fpc);
+	}
+	return fp;
+
+out_err:
+	_debug_printf("Error: failed to compile this fragment program:\n");
+	tgsi_dump(pfp->pipe.tokens, 0);
+
+	if(fp)
+	{
+		FREE(fp);
+		fp = NULL;
+	}
+	goto out;
 }
 
 static inline void
@@ -1134,43 +1164,49 @@ void
 nvfx_fragprog_validate(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
-	struct nvfx_fragment_program *fp = nvfx->fragprog;
-	int update = 0;
+	struct nvfx_pipe_fragment_program *pfp = nvfx->fragprog;
 	struct nvfx_vertex_program* vp;
 	unsigned sprite_coord_enable;
-	boolean update_pointsprite = !!(nvfx->dirty & NVFX_NEW_FRAGPROG);
+	unsigned key = 0;
+	struct nvfx_fragment_program* fp;
 
-	if (!fp->translated)
+	fp = pfp->fps[key];
+	if (!fp)
 	{
-		const int min_size = 4096;
+		fp = nvfx_fragprog_translate(nvfx, pfp);
 
-		nvfx_fragprog_translate(nvfx, fp);
-		if (!fp->translated) {
-			static unsigned dummy[8] = {1, 0, 0, 0, 1, 0, 0, 0};
-			static int warned = 0;
-			if(!warned)
+		if(!fp)
+		{
+			if(!nvfx->dummy_fs)
 			{
-				fprintf(stderr, "nvfx: failed to translate fragment program!\n");
-				warned = 1;
+				struct ureg_program *ureg = ureg_create( TGSI_PROCESSOR_FRAGMENT );
+				if (ureg)
+				{
+					ureg_END( ureg );
+					nvfx->dummy_fs = ureg_create_shader_and_destroy( ureg, &nvfx->pipe );
+				}
+
+				if(!nvfx->dummy_fs)
+				{
+					_debug_printf("Error: unable to create a dummy fragment shader: aborting.");
+					abort();
+				}
 			}
 
-			/* use dummy program: we cannot fail here */
-			fp->translated = TRUE;
-			fp->insn = malloc(sizeof(dummy));
-			memcpy(fp->insn, dummy, sizeof(dummy));
-			fp->insn_len = sizeof(dummy) / sizeof(dummy[0]);
-		}
-		update = TRUE;
+			fp = nvfx_fragprog_translate(nvfx, nvfx->dummy_fs);
 
-		fp->prog_size = (fp->insn_len * 4 + 63) & ~63;
+			if(!fp)
+			{
+				_debug_printf("Error: unable to compile even a dummy fragment shader: aborting.");
+				abort();
+			}
+		}
 
-		if(fp->prog_size >= min_size)
-			fp->progs_per_bo = 1;
-		else
-			fp->progs_per_bo = min_size / fp->prog_size;
-		fp->bo_prog_idx = fp->progs_per_bo - 1;
+		pfp->fps[key] = fp;
 	}
 
+	nvfx->hw_fragprog = fp;
+
 	vp = nvfx->render_mode == HW ? nvfx->vertprog : nvfx->swtnl.vertprog;
         sprite_coord_enable = nvfx->rasterizer->pipe.point_quad_rasterization * nvfx->rasterizer->pipe.sprite_coord_enable;
 
@@ -1391,7 +1427,7 @@ void
 nvfx_fragprog_relocate(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
-	struct nvfx_fragment_program *fp = nvfx->fragprog;
+	struct nvfx_fragment_program *fp = nvfx->hw_fragprog;
 	struct nouveau_bo* bo = fp->fpbo->bo;
 	int offset = fp->bo_prog_idx * fp->prog_size;
 	unsigned fp_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD; // TODO: GART?
@@ -1433,14 +1469,14 @@ static void *
 nvfx_fp_state_create(struct pipe_context *pipe,
                      const struct pipe_shader_state *cso)
 {
-        struct nvfx_fragment_program *fp;
+        struct nvfx_pipe_fragment_program *pfp;
 
-        fp = CALLOC(1, sizeof(struct nvfx_fragment_program));
-        fp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+        pfp = CALLOC(1, sizeof(struct nvfx_pipe_fragment_program));
+        pfp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
 
-        tgsi_scan_shader(fp->pipe.tokens, &fp->info);
+        tgsi_scan_shader(pfp->pipe.tokens, &pfp->info);
 
-        return (void *)fp;
+        return (void *)pfp;
 }
 
 static void
@@ -1456,11 +1492,17 @@ static void
 nvfx_fp_state_delete(struct pipe_context *pipe, void *hwcso)
 {
         struct nvfx_context *nvfx = nvfx_context(pipe);
-        struct nvfx_fragment_program *fp = hwcso;
+        struct nvfx_pipe_fragment_program *pfp = hwcso;
+        unsigned i;
+
+        for(i = 0; i < Elements(pfp->fps); ++i)
+        {
+        	nvfx_fragprog_destroy(nvfx, pfp->fps[i]);
+        	FREE(pfp->fps[i]);
+        }
 
-        nvfx_fragprog_destroy(nvfx, fp);
-        FREE((void*)fp->pipe.tokens);
-        FREE(fp);
+        FREE((void*)pfp->pipe.tokens);
+        FREE(pfp);
 }
 
 void
diff --git a/src/gallium/drivers/nvfx/nvfx_state.h b/src/gallium/drivers/nvfx/nvfx_state.h
index 05d41cfc8dd..fd2174ed690 100644
--- a/src/gallium/drivers/nvfx/nvfx_state.h
+++ b/src/gallium/drivers/nvfx/nvfx_state.h
@@ -61,10 +61,6 @@ struct nvfx_fragment_program_bo {
 };
 
 struct nvfx_fragment_program {
-	struct pipe_shader_state pipe;
-	struct tgsi_shader_info info;
-
-	boolean translated;
 	unsigned samplers;
 	unsigned point_sprite_control;
 	unsigned or;
@@ -99,5 +95,11 @@ struct nvfx_fragment_program {
 	struct nvfx_fragment_program_bo* fpbo;
 };
 
+struct nvfx_pipe_fragment_program {
+        struct pipe_shader_state pipe;
+        struct tgsi_shader_info info;
+
+        struct nvfx_fragment_program* fps[1];
+};
 
 #endif
-- 
cgit v1.2.3


From d507c0812d5a01d29f1f9f6942ec5cfd91ea0375 Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Sun, 22 Aug 2010 23:29:34 +0200
Subject: nvfx: support both sprite coord origins

Now we lie less when claiming OpenGL 2 support.

Also, first piglit result group is now all green, except for
fdo25614-genmipmap, which seems mesa/st's fault.
---
 src/gallium/drivers/nvfx/nvfx_fragprog.c | 139 ++++++++++++++++++++++---------
 src/gallium/drivers/nvfx/nvfx_state.c    |   3 +-
 src/gallium/drivers/nvfx/nvfx_state.h    |   9 +-
 3 files changed, 108 insertions(+), 43 deletions(-)

(limited to 'src/gallium/drivers/nvfx/nvfx_state.h')

diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c
index 47df71f2325..12b002a8198 100644
--- a/src/gallium/drivers/nvfx/nvfx_fragprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c
@@ -25,6 +25,7 @@ struct nvfx_fpc {
 	unsigned long long r_temps_discard;
 	struct nvfx_reg r_result[PIPE_MAX_SHADER_OUTPUTS];
 	struct nvfx_reg *r_temp;
+	unsigned sprite_coord_temp;
 
 	int num_regs;
 
@@ -114,9 +115,10 @@ emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_src src)
 		sr |= (src.reg.index << NVFX_FP_REG_SRC_SHIFT);
 		break;
 	case NVFXSR_RELOCATED:
-		sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT);
+		sr |= (NVFX_FP_REG_TYPE_TEMP << NVFX_FP_REG_TYPE_SHIFT);
+		sr |= (fpc->sprite_coord_temp << NVFX_FP_REG_SRC_SHIFT);
 		//printf("adding relocation at %x for %x\n", fpc->inst_offset, src.index);
-		util_dynarray_append(&fpc->fp->slot_relocations[src.reg.index], unsigned, fpc->inst_offset);
+		util_dynarray_append(&fpc->fp->slot_relocations[src.reg.index], unsigned, fpc->inst_offset + pos + 1);
 		break;
 	case NVFXSR_CONST:
 		if (!fpc->have_const) {
@@ -1003,7 +1005,8 @@ DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_fp, "NVFX_DUMP_FP", FALSE)
 
 static struct nvfx_fragment_program*
 nvfx_fragprog_translate(struct nvfx_context *nvfx,
-			struct nvfx_pipe_fragment_program *pfp)
+			struct nvfx_pipe_fragment_program *pfp,
+			boolean emulate_sprite_flipping)
 {
 	struct tgsi_parse_context parse;
 	struct nvfx_fpc *fpc = NULL;
@@ -1027,8 +1030,20 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
 		goto out_err;
 
 	tgsi_parse_init(&parse, pfp->pipe.tokens);
-
 	util_dynarray_init(&insns);
+
+	if(emulate_sprite_flipping)
+	{
+		struct nvfx_reg reg = temp(fpc);
+		struct nvfx_src sprite_input = nvfx_src(nvfx_reg(NVFXSR_RELOCATED, fp->num_slots));
+		float v[4] = {1, -1, 0, 0};
+		struct nvfx_src imm = nvfx_src(constant(fpc, -1, v));
+
+		fpc->sprite_coord_temp = reg.index;
+		fpc->r_temps_discard = 0ULL;
+		nvfx_fp_emit(fpc, arith(0, MAD, reg, NVFX_FP_MASK_ALL, sprite_input, swz(imm, X, Y, X, X), swz(imm, Z, X, Z, Z)));
+	}
+
 	while (!tgsi_parse_end_of_tokens(&parse)) {
 		tgsi_parse_token(&parse);
 
@@ -1166,14 +1181,16 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
 	struct nvfx_pipe_fragment_program *pfp = nvfx->fragprog;
 	struct nvfx_vertex_program* vp;
-	unsigned sprite_coord_enable;
-	unsigned key = 0;
+	unsigned sprite_coord_enable = nvfx->rasterizer->pipe.point_quad_rasterization * nvfx->rasterizer->pipe.sprite_coord_enable;
+	// TODO: verify whether this sprite_coord_mode test is correct or inverted
+	boolean emulate_sprite_flipping = sprite_coord_enable && nvfx->rasterizer->pipe.sprite_coord_mode;
+	unsigned key = emulate_sprite_flipping;
 	struct nvfx_fragment_program* fp;
 
 	fp = pfp->fps[key];
 	if (!fp)
 	{
-		fp = nvfx_fragprog_translate(nvfx, pfp);
+		fp = nvfx_fragprog_translate(nvfx, pfp, emulate_sprite_flipping);
 
 		if(!fp)
 		{
@@ -1193,7 +1210,8 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 				}
 			}
 
-			fp = nvfx_fragprog_translate(nvfx, nvfx->dummy_fs);
+			fp = nvfx_fragprog_translate(nvfx, nvfx->dummy_fs, FALSE);
+			emulate_sprite_flipping = FALSE;
 
 			if(!fp)
 			{
@@ -1205,21 +1223,19 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 		pfp->fps[key] = fp;
 	}
 
-	nvfx->hw_fragprog = fp;
-
 	vp = nvfx->render_mode == HW ? nvfx->vertprog : nvfx->swtnl.vertprog;
-        sprite_coord_enable = nvfx->rasterizer->pipe.point_quad_rasterization * nvfx->rasterizer->pipe.sprite_coord_enable;
 
 	if (fp->last_vp_id != vp->id || fp->last_sprite_coord_enable != sprite_coord_enable) {
-		int sprite_input = -1;
+		int sprite_real_input = -1;
+		int sprite_reloc_input;
 		unsigned i;
 		fp->last_vp_id = vp->id;
 		fp->last_sprite_coord_enable = sprite_coord_enable;
 
 		if(sprite_coord_enable)
 		{
-			sprite_input = vp->sprite_fp_input;
-			if(sprite_input < 0)
+			sprite_real_input = vp->sprite_fp_input;
+			if(sprite_real_input < 0)
 			{
 				unsigned used_texcoords = 0;
 				for(unsigned i = 0; i < fp->num_slots; ++i) {
@@ -1232,19 +1248,24 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 					}
 				}
 
-				sprite_input = NVFX_FP_OP_INPUT_SRC_TC(__builtin_ctz(~used_texcoords));
+				sprite_real_input = NVFX_FP_OP_INPUT_SRC_TC(__builtin_ctz(~used_texcoords));
 			}
 
-			fp->point_sprite_control |= (1 << (sprite_input - NVFX_FP_OP_INPUT_SRC_TC0 + 8));
+			fp->point_sprite_control |= (1 << (sprite_real_input - NVFX_FP_OP_INPUT_SRC_TC0 + 8));
 		}
 		else
 			fp->point_sprite_control = 0;
 
+		if(emulate_sprite_flipping)
+		   sprite_reloc_input = 0;
+		else
+		   sprite_reloc_input = sprite_real_input;
+
 		for(i = 0; i < fp->num_slots; ++i) {
 			unsigned generic = fp->slot_to_generic[i];
 			if((1 << generic) & sprite_coord_enable)
 			{
-				if(fp->slot_to_fp_input[i] != sprite_input)
+				if(fp->slot_to_fp_input[i] != sprite_reloc_input)
 					goto update_slots;
 			}
 			else
@@ -1255,6 +1276,12 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 			}
 		}
 
+		if(emulate_sprite_flipping)
+		{
+			if(fp->slot_to_fp_input[fp->num_slots] != sprite_real_input)
+				goto update_slots;
+		}
+
 		if(0)
 		{
 update_slots:
@@ -1263,21 +1290,23 @@ update_slots:
 			{
 				unsigned generic = fp->slot_to_generic[i];
 				if((1 << generic) & sprite_coord_enable)
-					fp->slot_to_fp_input[i] = sprite_input;
+					fp->slot_to_fp_input[i] = sprite_reloc_input;
 				else
 					fp->slot_to_fp_input[i] = vp->generic_to_fp_input[generic] & 0xf;
 			}
 
+			fp->slot_to_fp_input[fp->num_slots] = sprite_real_input;
+
 			if(nvfx->is_nv4x)
 			{
 				fp->or = 0;
-				for(i = 0; i < fp->num_slots; ++i) {
+				for(i = 0; i <= fp->num_slots; ++i) {
 					unsigned fp_input = fp->slot_to_fp_input[i];
 					if(fp_input == NVFX_FP_OP_INPUT_SRC_TC(8))
 						fp->or |= (1 << 12);
 					else if(fp_input == NVFX_FP_OP_INPUT_SRC_TC(9))
 						fp->or |= (1 << 13);
-					else if(fp_input != 0xf)
+					else if(fp_input >= NVFX_FP_OP_INPUT_SRC_TC(0) && fp_input <= NVFX_FP_OP_INPUT_SRC_TC(7))
 						fp->or |= (1 << (fp_input - NVFX_FP_OP_INPUT_SRC_TC0 + 14));
 				}
 			}
@@ -1292,7 +1321,7 @@ update_slots:
 	  * one bound to this fragment program.
 	  * Doing such a check would likely be a pessimization.
 	  */
-	if (nvfx->dirty & (NVFX_NEW_FRAGCONST | NVFX_NEW_FRAGPROG)) {
+	if ((nvfx->hw_fragprog != fp) || (nvfx->dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_FRAGCONST))) {
 		int offset;
 		uint32_t* fpmap;
 
@@ -1365,16 +1394,45 @@ update:
 		 */
 		if(fp->progs_left_with_obsolete_slot_assignments) {
 			unsigned char* fpbo_slots = &fp->fpbo->slots[fp->bo_prog_idx * 8];
-			for(unsigned i = 0; i < fp->num_slots; ++i) {
+			/* also relocate sprite coord slot, if any */
+			for(unsigned i = 0; i <= fp->num_slots; ++i) {
 				unsigned value = fp->slot_to_fp_input[i];;
 				if(value != fpbo_slots[i]) {
-					unsigned* p = (unsigned*)fp->slot_relocations[i].data;
-					unsigned* pend = (unsigned*)((char*)fp->slot_relocations[i].data + fp->slot_relocations[i].size);
-					for(; p != pend; ++p) {
-						unsigned off = *p;
-						unsigned dw = fp->insn[off];
-						dw = (dw & ~NVFX_FP_OP_INPUT_SRC_MASK) | (value << NVFX_FP_OP_INPUT_SRC_SHIFT);
-						nvfx_fp_memcpy(&fpmap[*p], &dw, sizeof(dw));
+					unsigned* p;
+					unsigned* begin = (unsigned*)fp->slot_relocations[i].data;
+					unsigned* end = (unsigned*)((char*)fp->slot_relocations[i].data + fp->slot_relocations[i].size);
+					//printf("fp %p reloc slot %u/%u: %u -> %u\n", fp, i, fp->num_slots, fpbo_slots[i], value);
+					if(value == 0)
+					{
+						/* was relocated to an input, switch type to temporary */
+						for(p = begin; p != end; ++p) {
+							unsigned off = *p;
+							unsigned dw = fp->insn[off];
+							dw &=~ NVFX_FP_REG_TYPE_MASK;
+							//printf("reloc_tmp at %x\n", off);
+							nvfx_fp_memcpy(&fpmap[off], &dw, sizeof(dw));
+						}
+					} else {
+						if(!fpbo_slots[i])
+						{
+							/* was relocated to a temporary, switch type to input */
+							for(p= begin; p != end; ++p) {
+								unsigned off = *p;
+								unsigned dw = fp->insn[off];
+								//printf("reloc_in at %x\n", off);
+								dw |= NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT;
+								nvfx_fp_memcpy(&fpmap[off], &dw, sizeof(dw));
+							}
+						}
+
+						/* set the correct input index */
+						for(p = begin; p != end; ++p) {
+							unsigned off = *p & ~3;
+							unsigned dw = fp->insn[off];
+							//printf("reloc&~3 at %x\n", off);
+							dw = (dw & ~NVFX_FP_OP_INPUT_SRC_MASK) | (value << NVFX_FP_OP_INPUT_SRC_SHIFT);
+							nvfx_fp_memcpy(&fpmap[off], &dw, sizeof(dw));
+						}
 					}
 					fpbo_slots[i] = value;
 				}
@@ -1382,6 +1440,8 @@ update:
 			--fp->progs_left_with_obsolete_slot_assignments;
 		}
 
+		nvfx->hw_fragprog = fp;
+
 		MARK_RING(chan, 8, 1);
 		OUT_RING(chan, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1));
 		OUT_RELOC(chan, fp->fpbo->bo, offset, NOUVEAU_BO_VRAM |
@@ -1491,15 +1551,18 @@ nvfx_fp_state_bind(struct pipe_context *pipe, void *hwcso)
 static void
 nvfx_fp_state_delete(struct pipe_context *pipe, void *hwcso)
 {
-        struct nvfx_context *nvfx = nvfx_context(pipe);
-        struct nvfx_pipe_fragment_program *pfp = hwcso;
-        unsigned i;
-
-        for(i = 0; i < Elements(pfp->fps); ++i)
-        {
-        	nvfx_fragprog_destroy(nvfx, pfp->fps[i]);
-        	FREE(pfp->fps[i]);
-        }
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_pipe_fragment_program *pfp = hwcso;
+	unsigned i;
+
+	for(i = 0; i < Elements(pfp->fps); ++i)
+	{
+		if(pfp->fps[i])
+		{
+			nvfx_fragprog_destroy(nvfx, pfp->fps[i]);
+			FREE(pfp->fps[i]);
+		}
+	}
 
         FREE((void*)pfp->pipe.tokens);
         FREE(pfp);
diff --git a/src/gallium/drivers/nvfx/nvfx_state.c b/src/gallium/drivers/nvfx/nvfx_state.c
index cb32e503c8f..5bd7dc07f02 100644
--- a/src/gallium/drivers/nvfx/nvfx_state.c
+++ b/src/gallium/drivers/nvfx/nvfx_state.c
@@ -174,7 +174,8 @@ nvfx_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
 		}
 
 		if(((struct nvfx_rasterizer_state*)hwcso)->pipe.point_quad_rasterization != nvfx->rasterizer->pipe.point_quad_rasterization
-				|| ((struct nvfx_rasterizer_state*)hwcso)->pipe.sprite_coord_enable != nvfx->rasterizer->pipe.sprite_coord_enable)
+				|| ((struct nvfx_rasterizer_state*)hwcso)->pipe.sprite_coord_enable != nvfx->rasterizer->pipe.sprite_coord_enable
+				|| ((struct nvfx_rasterizer_state*)hwcso)->pipe.sprite_coord_mode != nvfx->rasterizer->pipe.sprite_coord_mode)
 		{
 			nvfx->dirty |= NVFX_NEW_SPRITE;
 		}
diff --git a/src/gallium/drivers/nvfx/nvfx_state.h b/src/gallium/drivers/nvfx/nvfx_state.h
index fd2174ed690..37951919182 100644
--- a/src/gallium/drivers/nvfx/nvfx_state.h
+++ b/src/gallium/drivers/nvfx/nvfx_state.h
@@ -71,10 +71,11 @@ struct nvfx_fragment_program {
 	struct nvfx_fragment_program_data *consts;
 	unsigned nr_consts;
 
+	/* the slot at num_slots is for the sprite coordinate, if any */
 	unsigned num_slots; /* how many input semantics? */
-	unsigned char slot_to_generic[8]; /* semantics */
-	unsigned char slot_to_fp_input[8]; /* current assignment of slots for each used semantic */
-	struct util_dynarray slot_relocations[8];
+	unsigned char slot_to_generic[10]; /* semantics */
+	unsigned char slot_to_fp_input[11]; /* current assignment of slots for each used semantic */
+	struct util_dynarray slot_relocations[11];
 
 	/* This is reset to progs on any relocation update, and decreases every time we
 	 * move to a new prog due to a constant update
@@ -99,7 +100,7 @@ struct nvfx_pipe_fragment_program {
         struct pipe_shader_state pipe;
         struct tgsi_shader_info info;
 
-        struct nvfx_fragment_program* fps[1];
+        struct nvfx_fragment_program* fps[2];
 };
 
 #endif
-- 
cgit v1.2.3


From bfaa2577c6474222c79341c0d90685ed579f3414 Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Mon, 23 Aug 2010 00:31:08 +0200
Subject: nvfx: support clip planes sensibly and fix them on nv30

Before, we were discarding the compiled vertex program on each
user clip plane change.

Now we compile the program as if there were 6 clip planes and
dynamically patch in an "end program" bit at the right place.

Also, nv30 should now work.
---
 src/gallium/auxiliary/util/u_dynarray.h    |   3 +
 src/gallium/drivers/nvfx/nv30_vertprog.h   |   5 +-
 src/gallium/drivers/nvfx/nvfx_context.c    |   4 +
 src/gallium/drivers/nvfx/nvfx_context.h    |   1 +
 src/gallium/drivers/nvfx/nvfx_fragprog.c   |  13 ---
 src/gallium/drivers/nvfx/nvfx_state.h      |   4 +-
 src/gallium/drivers/nvfx/nvfx_state_emit.c |  82 ++++++++++++++
 src/gallium/drivers/nvfx/nvfx_vertprog.c   | 170 ++++++++++++-----------------
 8 files changed, 166 insertions(+), 116 deletions(-)

(limited to 'src/gallium/drivers/nvfx/nvfx_state.h')

diff --git a/src/gallium/auxiliary/util/u_dynarray.h b/src/gallium/auxiliary/util/u_dynarray.h
index 9d1c1713a7c..980cadf22d1 100644
--- a/src/gallium/auxiliary/util/u_dynarray.h
+++ b/src/gallium/auxiliary/util/u_dynarray.h
@@ -106,6 +106,9 @@ util_dynarray_trim(struct util_dynarray *buf)
 #define util_dynarray_pop_ptr(buf, type) (type*)((char*)(buf)->data + ((buf)->size -= sizeof(type)))
 #define util_dynarray_pop(buf, type) *util_dynarray_pop_ptr(buf, type)
 #define util_dynarray_contains(buf, type) ((buf)->size >= sizeof(type))
+#define util_dynarray_element(buf, type, idx) ((type*)(buf)->data + (idx))
+#define util_dynarray_begin(buf) ((buf)->data)
+#define util_dynarray_end(buf) ((void*)util_dynarray_element((buf), char, (buf)->size))
 
 #endif /* U_DYNARRAY_H */
 
diff --git a/src/gallium/drivers/nvfx/nv30_vertprog.h b/src/gallium/drivers/nvfx/nv30_vertprog.h
index df92469078c..9a68f5c1fb0 100644
--- a/src/gallium/drivers/nvfx/nv30_vertprog.h
+++ b/src/gallium/drivers/nvfx/nv30_vertprog.h
@@ -125,7 +125,7 @@
 #define NV30_VP_INST_VDEST_WRITEMASK_SHIFT      12    /*NV20*/
 #define NV30_VP_INST_VDEST_WRITEMASK_MASK      (0x0F << 12)  /*NV20*/
 #define NV30_VP_INST_DEST_SHIFT        2
-#define NV30_VP_INST_DEST_MASK        (0x0F <<  2)
+#define NV30_VP_INST_DEST_MASK        (0x1F <<  2)
 #  define NV30_VP_INST_DEST_POS  0
 #  define NV30_VP_INST_DEST_BFC0  1
 #  define NV30_VP_INST_DEST_BFC1  2
@@ -133,7 +133,8 @@
 #  define NV30_VP_INST_DEST_COL1  4
 #  define NV30_VP_INST_DEST_FOGC  5
 #  define NV30_VP_INST_DEST_PSZ   6
-#  define NV30_VP_INST_DEST_TC(n)  (8+n)
+#  define NV30_VP_INST_DEST_TC(n)  (8+(n))
+#  define NV30_VP_INST_DEST_CLP(n) (17 + (n))
 
 /* Useful to split the source selection regs into their pieces */
 #define NV30_VP_SRC0_HIGH_SHIFT                                                6
diff --git a/src/gallium/drivers/nvfx/nvfx_context.c b/src/gallium/drivers/nvfx/nvfx_context.c
index 80b36fb7b91..2f775f92cf5 100644
--- a/src/gallium/drivers/nvfx/nvfx_context.c
+++ b/src/gallium/drivers/nvfx/nvfx_context.c
@@ -75,6 +75,10 @@ nvfx_create(struct pipe_screen *pscreen, void *priv)
 	screen->base.channel->user_private = nvfx;
 
 	nvfx->is_nv4x = screen->is_nv4x;
+	/* TODO: it seems that nv30 might have fixed function clipping usable with vertex programs
+	 * However, my code for that doesn't work, so use vp clipping for all cards, which works.
+	 */
+	nvfx->use_vp_clipping = TRUE;
 
 	nvfx_init_query_functions(nvfx);
 	nvfx_init_surface_functions(nvfx);
diff --git a/src/gallium/drivers/nvfx/nvfx_context.h b/src/gallium/drivers/nvfx/nvfx_context.h
index 2134f3c3865..680f4c6ce0f 100644
--- a/src/gallium/drivers/nvfx/nvfx_context.h
+++ b/src/gallium/drivers/nvfx/nvfx_context.h
@@ -134,6 +134,7 @@ struct nvfx_context {
 	struct nvfx_screen *screen;
 
 	unsigned is_nv4x; /* either 0 or ~0 */
+	boolean use_vp_clipping;
 
 	struct draw_context *draw;
 	struct blitter_context* blitter;
diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c
index a7e43b1513b..23a85c9342e 100644
--- a/src/gallium/drivers/nvfx/nvfx_fragprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c
@@ -1468,19 +1468,6 @@ update:
 			nvfx->hw_pointsprite_control = pointsprite_control;
 		}
 	}
-
-	if(nvfx->is_nv4x)
-	{
-		unsigned vp_output = vp->or | fp->or;
-
-		if(vp_output != nvfx->hw_vp_output)
-		{
-			WAIT_RING(chan, 2);
-			OUT_RING(chan, RING_3D(NV40TCL_VP_RESULT_EN, 1));
-			OUT_RING(chan, vp_output);
-			nvfx->hw_vp_output = vp_output;
-		}
-	}
 }
 
 void
diff --git a/src/gallium/drivers/nvfx/nvfx_state.h b/src/gallium/drivers/nvfx/nvfx_state.h
index 37951919182..e9c1f2c26d2 100644
--- a/src/gallium/drivers/nvfx/nvfx_state.h
+++ b/src/gallium/drivers/nvfx/nvfx_state.h
@@ -24,8 +24,6 @@ struct nvfx_vertex_program {
 
 	boolean translated;
 
-	struct pipe_clip_state ucp;
-
 	struct nvfx_vertex_program_exec *insns;
 	unsigned nr_insns;
 	struct nvfx_vertex_program_data *consts;
@@ -42,7 +40,7 @@ struct nvfx_vertex_program {
 
 	uint32_t ir;
 	uint32_t or;
-	uint32_t clip_ctrl;
+	int clip_nr;
 
 	struct util_dynarray branch_relocs;
 	struct util_dynarray const_relocs;
diff --git a/src/gallium/drivers/nvfx/nvfx_state_emit.c b/src/gallium/drivers/nvfx/nvfx_state_emit.c
index bd89a385d7c..c43a75aaa21 100644
--- a/src/gallium/drivers/nvfx/nvfx_state_emit.c
+++ b/src/gallium/drivers/nvfx/nvfx_state_emit.c
@@ -90,6 +90,74 @@ nvfx_state_validate_common(struct nvfx_context *nvfx)
 	if(dirty & NVFX_NEW_STIPPLE)
 		nvfx_state_stipple_validate(nvfx);
 
+       if(nvfx->dirty & NVFX_NEW_UCP)
+	{
+		unsigned enables[7] =
+		{
+				0,
+				NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0,
+				NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1,
+				NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2,
+				NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3,
+				NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE4,
+				NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE4 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE5,
+		};
+
+		if(!nvfx->use_vp_clipping)
+		{
+			WAIT_RING(chan, 2);
+			OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANES_ENABLE, 1));
+			OUT_RING(chan, 0);
+
+			WAIT_RING(chan, 6 * 4 + 1);
+			OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANE_A(0), nvfx->clip.nr * 4));
+			OUT_RINGp(chan, &nvfx->clip.ucp[0][0], nvfx->clip.nr * 4);
+		}
+
+		WAIT_RING(chan, 2);
+		OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANES_ENABLE, 1));
+		OUT_RING(chan, enables[nvfx->clip.nr]);
+	}
+
+	if(nvfx->use_vp_clipping && (nvfx->dirty & (NVFX_NEW_UCP | NVFX_NEW_VERTPROG)))
+	{
+		unsigned i;
+		struct nvfx_vertex_program* vp = nvfx->vertprog;
+		if(nvfx->clip.nr != vp->clip_nr)
+		{
+			unsigned idx;
+			WAIT_RING(chan, 14);
+
+			/* remove last instruction bit */
+			if(vp->clip_nr >= 0)
+			{
+				idx = vp->nr_insns - 7 + vp->clip_nr;
+				OUT_RING(chan, RING_3D(NV34TCL_VP_UPLOAD_FROM_ID, 1));
+				OUT_RING(chan,  vp->exec->start + idx);
+				OUT_RING(chan, RING_3D(NV34TCL_VP_UPLOAD_INST(0), 4));
+				OUT_RINGp (chan, vp->insns[idx].data, 4);
+			}
+
+			 /* set last instruction bit */
+			idx = vp->nr_insns - 7 + nvfx->clip.nr;
+			OUT_RING(chan, RING_3D(NV34TCL_VP_UPLOAD_FROM_ID, 1));
+			OUT_RING(chan,  vp->exec->start + idx);
+			OUT_RING(chan, RING_3D(NV34TCL_VP_UPLOAD_INST(0), 4));
+			OUT_RINGp(chan, vp->insns[idx].data, 3);
+			OUT_RING(chan, vp->insns[idx].data[3] | 1);
+			vp->clip_nr = nvfx->clip.nr;
+		}
+
+		// TODO: only do this for the ones changed
+		WAIT_RING(chan, 6 * 6);
+		for(i = 0; i < nvfx->clip.nr; ++i)
+		{
+			OUT_RING(chan, RING_3D(NV34TCL_VP_UPLOAD_CONST_ID, 5));
+			OUT_RING(chan, vp->data->start + i);
+			OUT_RINGp (chan, nvfx->clip.ucp[i], 4);
+		}
+	}
+
 	if(dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_FRAGCONST | NVFX_NEW_VERTPROG | NVFX_NEW_SPRITE))
 	{
 		nvfx_fragprog_validate(nvfx);
@@ -97,6 +165,20 @@ nvfx_state_validate_common(struct nvfx_context *nvfx)
 			flush_tex_cache = TRUE; // TODO: do we need this?
 	}
 
+	if(nvfx->is_nv4x)
+	{
+		unsigned vp_output = nvfx->vertprog->or | nvfx->hw_fragprog->or;
+		vp_output |= (1 << (nvfx->clip.nr + 6)) - (1 << 6);
+
+		if(vp_output != nvfx->hw_vp_output)
+		{
+			WAIT_RING(chan, 2);
+			OUT_RING(chan, RING_3D(NV40TCL_VP_RESULT_EN, 1));
+			OUT_RING(chan, vp_output);
+			nvfx->hw_vp_output = vp_output;
+		}
+	}
+
 	if(all_swizzled >= 0)
 		nvfx_framebuffer_validate(nvfx, all_swizzled);
 
diff --git a/src/gallium/drivers/nvfx/nvfx_vertprog.c b/src/gallium/drivers/nvfx/nvfx_vertprog.c
index 3b8d3853b7f..ea7e88c5613 100644
--- a/src/gallium/drivers/nvfx/nvfx_vertprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_vertprog.c
@@ -29,8 +29,6 @@
 #include "nv30_vertprog.h"
 #include "nv40_vertprog.h"
 
-#define NVFX_VP_INST_DEST_CLIP(n) ((~0 - 6) + (n))
-
 struct nvfx_loop_entry
 {
 	unsigned brk_target;
@@ -205,52 +203,33 @@ emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot
 		break;
 	case NVFXSR_OUTPUT:
 		/* TODO: this may be wrong because on nv30 COL0 and BFC0 are swapped */
-		switch (dst.index) {
-		case NVFX_VP_INST_DEST_CLIP(0):
-			vp->or |= (1 << 6);
-			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0;
-			dst.index = NVFX_VP(INST_DEST_FOGC);
-			break;
-		case NVFX_VP_INST_DEST_CLIP(1):
-			vp->or |= (1 << 7);
-			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1;
-			dst.index = NVFX_VP(INST_DEST_FOGC);
-			break;
-		case NVFX_VP_INST_DEST_CLIP(2):
-			vp->or |= (1 << 8);
-			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2;
-			dst.index = NVFX_VP(INST_DEST_FOGC);
-			break;
-		case NVFX_VP_INST_DEST_CLIP(3):
-			vp->or |= (1 << 9);
-			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3;
-			dst.index = NVFX_VP(INST_DEST_PSZ);
-			break;
-		case NVFX_VP_INST_DEST_CLIP(4):
-			vp->or |= (1 << 10);
-			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE4;
-			dst.index = NVFX_VP(INST_DEST_PSZ);
-			break;
-		case NVFX_VP_INST_DEST_CLIP(5):
-			vp->or |= (1 << 11);
-			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE5;
-			dst.index = NVFX_VP(INST_DEST_PSZ);
-			break;
-		default:
-			if(nvfx->is_nv4x) {
-				/* we don't need vp->or on nv3x
-				 * texcoords are handled by fragment program
-				 */
-				switch (dst.index) {
-				case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
-				case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
-				case NV40_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
-				case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
-				case NV40_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
-				case NV40_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
-				}
+		if(nvfx->is_nv4x) {
+			switch (dst.index) {
+			case NV30_VP_INST_DEST_CLP(0):
+				dst.index = NVFX_VP(INST_DEST_FOGC);
+				break;
+			case NV30_VP_INST_DEST_CLP(1):
+				dst.index = NVFX_VP(INST_DEST_FOGC);
+				break;
+			case NV30_VP_INST_DEST_CLP(2):
+				dst.index = NVFX_VP(INST_DEST_FOGC);
+				break;
+			case NV30_VP_INST_DEST_CLP(3):
+				dst.index = NVFX_VP(INST_DEST_PSZ);
+				break;
+			case NV30_VP_INST_DEST_CLP(4):
+				dst.index = NVFX_VP(INST_DEST_PSZ);
+				break;
+			case NV30_VP_INST_DEST_CLP(5):
+				dst.index = NVFX_VP(INST_DEST_PSZ);
+				break;
+			case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
+			case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
+			case NV40_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
+			case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
+			case NV40_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
+			case NV40_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
 			}
-			break;
 		}
 
 		if(!nvfx->is_nv4x) {
@@ -914,6 +893,13 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx,
 	vpc->nvfx = nvfx;
 	vpc->vp = vp;
 
+	/* reserve space for ucps */
+	if(nvfx->use_vp_clipping)
+	{
+		for(i = 0; i < 6; ++i)
+			constant(vpc, -1, 0, 0, 0, 0);
+	}
+
 	if (!nvfx_vertprog_prepare(nvfx, vpc)) {
 		FREE(vpc);
 		return;
@@ -923,7 +909,8 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx,
 	 * planes are enabled.  We need to append code to the vtxprog
 	 * to handle clip planes later.
 	 */
-	if (vp->ucp.nr)  {
+	/* TODO: maybe support patching this depending on whether there are ucps: not sure if it is really matters much */
+	if (nvfx->use_vp_clipping)  {
 		vpc->r_result[vpc->hpos_idx] = temp(vpc);
 		vpc->r_temps_discard = 0;
 	}
@@ -994,34 +981,39 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx,
 	}
 
 	/* Insert code to handle user clip planes */
-	for (i = 0; i < vp->ucp.nr; i++) {
-		struct nvfx_reg cdst = nvfx_reg(NVFXSR_OUTPUT,
-						NVFX_VP_INST_DEST_CLIP(i));
-		struct nvfx_src ceqn = nvfx_src(constant(vpc, -1,
-						 nvfx->clip.ucp[i][0],
-						 nvfx->clip.ucp[i][1],
-						 nvfx->clip.ucp[i][2],
-						 nvfx->clip.ucp[i][3]));
-		struct nvfx_src htmp = nvfx_src(vpc->r_result[vpc->hpos_idx]);
-		unsigned mask;
+	if(nvfx->use_vp_clipping)
+	{
+		for (i = 0; i < 6; i++) {
+			struct nvfx_reg cdst = nvfx_reg(NVFXSR_OUTPUT, NV30_VP_INST_DEST_CLP(i));
+			struct nvfx_src ceqn = nvfx_src(nvfx_reg(NVFXSR_CONST, i));
+			struct nvfx_src htmp = nvfx_src(vpc->r_result[vpc->hpos_idx]);
+			unsigned mask;
 
-		switch (i) {
-		case 0: case 3: mask = NVFX_VP_MASK_Y; break;
-		case 1: case 4: mask = NVFX_VP_MASK_Z; break;
-		case 2: case 5: mask = NVFX_VP_MASK_W; break;
-		default:
-			NOUVEAU_ERR("invalid clip dist #%d\n", i);
-			goto out_err;
-		}
+			if(nvfx->is_nv4x)
+			{
+				switch (i) {
+				case 0: case 3: mask = NVFX_VP_MASK_Y; break;
+				case 1: case 4: mask = NVFX_VP_MASK_Z; break;
+				case 2: case 5: mask = NVFX_VP_MASK_W; break;
+				default:
+					NOUVEAU_ERR("invalid clip dist #%d\n", i);
+					goto out_err;
+				}
+			}
+			else
+				mask = NVFX_VP_MASK_X;
 
-		nvfx_vp_emit(vpc, arith(VEC, DP4, cdst, mask, htmp, ceqn, none));
+			nvfx_vp_emit(vpc, arith(VEC, DP4, cdst, mask, htmp, ceqn, none));
+		}
 	}
+	else
+	{
+		if(vp->nr_insns)
+			vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;
 
-	//vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;
-
-	/* Append NOP + END instruction for branches to the end of the program */
-	nvfx_vp_emit(vpc, arith(VEC, NOP, none.reg, 0, none, none, none));
-        vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST | 0x1000;
+		nvfx_vp_emit(vpc, arith(VEC, NOP, none.reg, 0, none, none, none));
+		vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;
+	}
 
 	if(debug_get_option_nvfx_dump_vp())
 	{
@@ -1034,6 +1026,7 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx,
 		debug_printf("\n");
 	}
 
+	vp->clip_nr = -1;
 	vp->exec_start = -1;
 	vp->translated = TRUE;
 out_err:
@@ -1063,13 +1056,6 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 	if (nvfx->render_mode == HW) {
 		vp = nvfx->vertprog;
 		constbuf = nvfx->constbuf[PIPE_SHADER_VERTEX];
-
-		// TODO: ouch! can't we just use constant slots for these?!
-		if ((nvfx->dirty & NVFX_NEW_UCP) ||
-		    memcmp(&nvfx->clip, &vp->ucp, sizeof(vp->ucp))) {
-			nvfx_vertprog_destroy(nvfx, vp);
-			memcpy(&vp->ucp, &nvfx->clip, sizeof(vp->ucp));
-		}
 	} else {
 		vp = nvfx->swtnl.vertprog;
 		constbuf = NULL;
@@ -1169,7 +1155,7 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 		vp->exec_start = vp->exec->start;
 	}
 
-	if (vp->nr_consts && vp->data_start != vp->data->start) {
+	if (vp->data_start != vp->data->start) {
 		for(unsigned i = 0; i < vp->const_relocs.size; i += sizeof(struct nvfx_relocation))
 		{
 			struct nvfx_relocation* reloc = (struct nvfx_relocation*)((char*)vp->const_relocs.data + i);
@@ -1182,6 +1168,7 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 		}
 
 		vp->data_start = vp->data->start;
+		upload_code = TRUE;
 	}
 
 	/* Update + Upload constant values */
@@ -1191,7 +1178,7 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 		if (constbuf)
 			map = (float*)nvfx_buffer(constbuf)->data;
 
-		for (i = 0; i < vp->nr_consts; i++) {
+		for (i = nvfx->use_vp_clipping ? 6 : 0; i < vp->nr_consts; i++) {
 			struct nvfx_vertex_program_data *vpd = &vp->consts[i];
 
 			if (vpd->index >= 0) {
@@ -1217,9 +1204,10 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 			BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_INST(0), 4);
 			OUT_RINGp (chan, vp->insns[i].data, 4);
 		}
+		vp->clip_nr = -1;
 	}
 
-	if(nvfx->dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_UCP))
+	if(nvfx->dirty & (NVFX_NEW_VERTPROG))
 	{
 		WAIT_RING(chan, 6);
 		OUT_RING(chan, RING_3D(NV34TCL_VP_START_FROM_ID, 1));
@@ -1228,8 +1216,6 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 			OUT_RING(chan, RING_3D(NV40TCL_VP_ATTRIB_EN, 1));
 			OUT_RING(chan, vp->ir);
 		}
-		OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANES_ENABLE, 1));
-		OUT_RING(chan, vp->clip_ctrl);
 	}
 
 	return TRUE;
@@ -1238,27 +1224,15 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 void
 nvfx_vertprog_destroy(struct nvfx_context *nvfx, struct nvfx_vertex_program *vp)
 {
-	vp->translated = FALSE;
-
-	if (vp->nr_insns) {
+	if (vp->nr_insns)
 		FREE(vp->insns);
-		vp->insns = NULL;
-		vp->nr_insns = 0;
-	}
 
-	if (vp->nr_consts) {
+	if (vp->nr_consts)
 		FREE(vp->consts);
-		vp->consts = NULL;
-		vp->nr_consts = 0;
-	}
 
 	nouveau_resource_free(&vp->exec);
-	vp->exec_start = 0;
 	nouveau_resource_free(&vp->data);
-	vp->data_start = 0;
-	vp->data_start_min = 0;
 
-	vp->ir = vp->or = vp->clip_ctrl = 0;
 	util_dynarray_fini(&vp->branch_relocs);
 	util_dynarray_fini(&vp->const_relocs);
 }
-- 
cgit v1.2.3


From 71a8544f89d736d481b15da421110ac275d7c24f Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Sat, 4 Sep 2010 02:57:14 +0200
Subject: nvfx: support all coord conventions in hardware

---
 src/gallium/drivers/nouveau/nouveau_class.h | 10 ++++++++++
 src/gallium/drivers/nvfx/nvfx_fragprog.c    | 10 ++++++++++
 src/gallium/drivers/nvfx/nvfx_screen.c      |  3 +--
 src/gallium/drivers/nvfx/nvfx_state.h       |  1 +
 src/gallium/drivers/nvfx/nvfx_state_emit.c  | 18 ++++++++++++++++++
 src/gallium/drivers/nvfx/nvfx_state_fb.c    |  4 +---
 6 files changed, 41 insertions(+), 5 deletions(-)

(limited to 'src/gallium/drivers/nvfx/nvfx_state.h')

diff --git a/src/gallium/drivers/nouveau/nouveau_class.h b/src/gallium/drivers/nouveau/nouveau_class.h
index 3c2248b6249..79681d277be 100644
--- a/src/gallium/drivers/nouveau/nouveau_class.h
+++ b/src/gallium/drivers/nouveau/nouveau_class.h
@@ -6508,6 +6508,16 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define   NV34TCL_MULTISAMPLE_CONTROL_SAMPLE_ALPHA_TO_ONE				(1 <<  8)
 #define   NV34TCL_MULTISAMPLE_CONTROL_SAMPLE_COVERAGE_SHIFT				16
 #define   NV34TCL_MULTISAMPLE_CONTROL_SAMPLE_COVERAGE_MASK				0xffff0000
+#define  NV34TCL_COORD_CONVENTIONS							0x00001d88
+#define   NV34TCL_COORD_CONVENTIONS_HEIGHT_SHIFT					0
+#define   NV34TCL_COORD_CONVENTIONS_ORIGIN_NORMAL					(0 << 12)
+#define   NV34TCL_COORD_CONVENTIONS_ORIGIN_INVERTED					(1 << 12)
+#define   NV34TCL_COORD_CONVENTIONS_ORIGIN_SHIFT					12
+#define   NV34TCL_COORD_CONVENTIONS_ORIGIN_MASK						(1 << 12)
+#define   NV34TCL_COORD_CONVENTIONS_CENTER_HALF_INTEGER					(0 << 16)
+#define   NV34TCL_COORD_CONVENTIONS_CENTER_INTEGER					(1 << 16)
+#define   NV34TCL_COORD_CONVENTIONS_CENTER_SHIFT					16
+#define   NV34TCL_COORD_CONVENTIONS_CENTER_MASK						(1 << 16)
 #define  NV34TCL_CLEAR_DEPTH_VALUE							0x00001d8c
 #define  NV34TCL_CLEAR_COLOR_VALUE							0x00001d90
 #define   NV34TCL_CLEAR_COLOR_VALUE_B_SHIFT						0
diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c
index 6eb744e6546..79dd22467a6 100644
--- a/src/gallium/drivers/nvfx/nvfx_fragprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c
@@ -1052,6 +1052,16 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
 	fpc->fp = fp;
 	fpc->num_regs = 2;
 
+	for (unsigned i = 0; i < pfp->info.num_properties; ++i) {
+		if (pfp->info.properties[i].name == TGSI_PROPERTY_FS_COORD_ORIGIN) {
+			if(pfp->info.properties[i].data[0])
+				fp->coord_conventions |= NV34TCL_COORD_CONVENTIONS_ORIGIN_INVERTED;
+		} else if (pfp->info.properties[i].name == TGSI_PROPERTY_FS_COORD_PIXEL_CENTER) {
+			if(pfp->info.properties[i].data[0])
+				fp->coord_conventions |= NV34TCL_COORD_CONVENTIONS_CENTER_INTEGER;
+		}
+	}
+
 	if (!nvfx_fragprog_prepare(nvfx, fpc))
 		goto out_err;
 
diff --git a/src/gallium/drivers/nvfx/nvfx_screen.c b/src/gallium/drivers/nvfx/nvfx_screen.c
index 335c500355e..2080f44aef7 100644
--- a/src/gallium/drivers/nvfx/nvfx_screen.c
+++ b/src/gallium/drivers/nvfx/nvfx_screen.c
@@ -74,10 +74,9 @@ nvfx_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 		return 0;
 	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
 	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
-		return 1;
 	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
 	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
-		return 0;
+		return 1;
 	case PIPE_CAP_MAX_FS_INSTRUCTIONS:
 	case PIPE_CAP_MAX_FS_ALU_INSTRUCTIONS:
 	case PIPE_CAP_MAX_FS_TEX_INSTRUCTIONS:
diff --git a/src/gallium/drivers/nvfx/nvfx_state.h b/src/gallium/drivers/nvfx/nvfx_state.h
index e9c1f2c26d2..9200f78a545 100644
--- a/src/gallium/drivers/nvfx/nvfx_state.h
+++ b/src/gallium/drivers/nvfx/nvfx_state.h
@@ -62,6 +62,7 @@ struct nvfx_fragment_program {
 	unsigned samplers;
 	unsigned point_sprite_control;
 	unsigned or;
+	unsigned coord_conventions;
 
 	uint32_t *insn;
 	int       insn_len;
diff --git a/src/gallium/drivers/nvfx/nvfx_state_emit.c b/src/gallium/drivers/nvfx/nvfx_state_emit.c
index 128cf2b6dd3..c84bf60c129 100644
--- a/src/gallium/drivers/nvfx/nvfx_state_emit.c
+++ b/src/gallium/drivers/nvfx/nvfx_state_emit.c
@@ -3,6 +3,21 @@
 #include "nvfx_resource.h"
 #include "draw/draw_context.h"
 
+static void
+nvfx_coord_conventions_validate(struct nvfx_context* nvfx)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	unsigned value = 0;
+	if(nvfx->hw_fragprog->coord_conventions & NV34TCL_COORD_CONVENTIONS_ORIGIN_INVERTED)
+		value |= nvfx->framebuffer.height << NV34TCL_COORD_CONVENTIONS_HEIGHT_SHIFT;
+
+	value |= nvfx->hw_fragprog->coord_conventions;
+
+	WAIT_RING(chan, 2);
+	OUT_RING(chan, RING_3D(NV34TCL_COORD_CONVENTIONS, 1));
+	OUT_RING(chan, value);
+}
+
 static boolean
 nvfx_state_validate_common(struct nvfx_context *nvfx)
 {
@@ -212,6 +227,9 @@ nvfx_state_validate_common(struct nvfx_context *nvfx)
 	        OUT_RING(chan, nvfx->framebuffer.zsbuf && nvfx->zsa->pipe.depth.enabled);
 	}
 
+	if((all_swizzled >= 0) || (dirty & NVFX_NEW_FRAGPROG))
+		nvfx_coord_conventions_validate(nvfx);
+
 	if(flush_tex_cache)
 	{
 		// TODO: what about nv30?
diff --git a/src/gallium/drivers/nvfx/nvfx_state_fb.c b/src/gallium/drivers/nvfx/nvfx_state_fb.c
index 54855290a98..b9d30c4eb13 100644
--- a/src/gallium/drivers/nvfx/nvfx_state_fb.c
+++ b/src/gallium/drivers/nvfx/nvfx_state_fb.c
@@ -167,7 +167,7 @@ nvfx_framebuffer_validate(struct nvfx_context *nvfx, unsigned prepare_result)
 	else
 		rt_format |= NV34TCL_RT_FORMAT_ZETA_Z24S8;
 
-	MARK_RING(chan, 44, 10);
+	MARK_RING(chan, 42, 10);
 
 	if ((rt_enable & NV34TCL_RT_ENABLE_COLOR0) || fb->zsbuf) {
 		struct nvfx_render_target *rt0 = &nvfx->hw_rt[0];
@@ -271,8 +271,6 @@ nvfx_framebuffer_validate(struct nvfx_context *nvfx, unsigned prepare_result)
 	OUT_RING(chan, RING_3D(NV34TCL_VIEWPORT_CLIP_HORIZ(0), 2));
 	OUT_RING(chan, ((w - 1) << 16) | 0);
 	OUT_RING(chan, ((h - 1) << 16) | 0);
-	OUT_RING(chan, RING_3D(0x1d88, 1));
-	OUT_RING(chan, (1 << 12) | h);
 
 	if(!nvfx->is_nv4x) {
 		/* Wonder why this is needed, context should all be set to zero on init */
-- 
cgit v1.2.3


From 8e2badfc269082f4b52a82ac1c5b4350bef0d01b Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Sun, 5 Sep 2010 05:42:59 +0200
Subject: nvfx: add rewritten swtnl support

The old swtnl code was broken by the new shader linkage support for
GLSL.

This is a rewrite of swtnl support, which should instead work properly,
be faster, and be closer to the much better tested hardware pipeline.
---
 src/gallium/drivers/nvfx/nvfx_context.h    |  12 +-
 src/gallium/drivers/nvfx/nvfx_draw.c       | 236 +++++++----------------------
 src/gallium/drivers/nvfx/nvfx_fragprog.c   |   2 +-
 src/gallium/drivers/nvfx/nvfx_screen.c     |   2 +-
 src/gallium/drivers/nvfx/nvfx_state.h      |  19 ++-
 src/gallium/drivers/nvfx/nvfx_state_emit.c |  31 ++--
 src/gallium/drivers/nvfx/nvfx_vbo.c        |  38 +++++
 src/gallium/drivers/nvfx/nvfx_vertprog.c   | 161 +++++++++++++-------
 8 files changed, 238 insertions(+), 263 deletions(-)

(limited to 'src/gallium/drivers/nvfx/nvfx_state.h')

diff --git a/src/gallium/drivers/nvfx/nvfx_context.h b/src/gallium/drivers/nvfx/nvfx_context.h
index b837437c58f..369c2163882 100644
--- a/src/gallium/drivers/nvfx/nvfx_context.h
+++ b/src/gallium/drivers/nvfx/nvfx_context.h
@@ -151,14 +151,6 @@ struct nvfx_context {
 
 	/* HW state derived from pipe states */
 	struct nvfx_state state;
-	struct {
-		struct nvfx_vertex_program *vertprog;
-
-		unsigned nr_attribs;
-		unsigned hw[PIPE_MAX_SHADER_INPUTS];
-		unsigned draw[PIPE_MAX_SHADER_INPUTS];
-		unsigned emit[PIPE_MAX_SHADER_INPUTS];
-	} swtnl;
 
 	enum {
 		HW, SWTNL, SWRAST
@@ -170,7 +162,7 @@ struct nvfx_context {
 	struct pipe_scissor_state scissor;
 	unsigned stipple[32];
 	struct pipe_clip_state clip;
-	struct nvfx_vertex_program *vertprog;
+	struct nvfx_pipe_vertex_program *vertprog;
 	struct nvfx_pipe_fragment_program *fragprog;
 	struct pipe_resource *constbuf[PIPE_SHADER_TYPES];
 	unsigned constbuf_nr[PIPE_SHADER_TYPES];
@@ -208,6 +200,7 @@ struct nvfx_context {
 	int hw_pointsprite_control;
 	int hw_vp_output;
 	struct nvfx_fragment_program* hw_fragprog;
+	struct nvfx_vertex_program* hw_vertprog;
 
 	unsigned relocs_needed;
 };
@@ -326,6 +319,7 @@ extern void nvfx_init_transfer_functions(struct pipe_context *pipe);
 
 /* nvfx_vbo.c */
 extern boolean nvfx_vbo_validate(struct nvfx_context *nvfx);
+extern void nvfx_vbo_swtnl_validate(struct nvfx_context *nvfx);
 extern void nvfx_vbo_relocate(struct nvfx_context *nvfx);
 extern void nvfx_idxbuf_validate(struct nvfx_context* nvfx);
 extern void nvfx_idxbuf_relocate(struct nvfx_context* nvfx);
diff --git a/src/gallium/drivers/nvfx/nvfx_draw.c b/src/gallium/drivers/nvfx/nvfx_draw.c
index 2601d5b8e2e..4bf38a9c181 100644
--- a/src/gallium/drivers/nvfx/nvfx_draw.c
+++ b/src/gallium/drivers/nvfx/nvfx_draw.c
@@ -1,6 +1,5 @@
 #include "pipe/p_shader_tokens.h"
 #include "util/u_inlines.h"
-#include "tgsi/tgsi_ureg.h"
 
 #include "util/u_pack_color.h"
 
@@ -11,11 +10,6 @@
 #include "nvfx_context.h"
 #include "nvfx_resource.h"
 
-/* Simple, but crappy, swtnl path, hopefully we wont need to hit this very
- * often at all.  Uses "quadro style" vertex submission + a fixed vertex
- * layout to avoid the need to generate a vertex program or vtxfmt.
- */
-
 struct nvfx_render_stage {
 	struct draw_stage stage;
 	struct nvfx_context *nvfx;
@@ -28,58 +22,18 @@ nvfx_render_stage(struct draw_stage *stage)
 	return (struct nvfx_render_stage *)stage;
 }
 
-static INLINE void
-nvfx_render_vertex(struct nvfx_context *nvfx, const struct vertex_header *v)
+static void
+nvfx_render_flush(struct draw_stage *stage, unsigned flags)
 {
-	struct nvfx_screen *screen = nvfx->screen;
-	struct nouveau_channel *chan = screen->base.channel;
-	struct nouveau_grobj *eng3d = screen->eng3d;
-	unsigned i;
+	struct nvfx_render_stage *rs = nvfx_render_stage(stage);
+	struct nvfx_context *nvfx = rs->nvfx;
+	struct nouveau_channel *chan = nvfx->screen->base.channel;
 
-	for (i = 0; i < nvfx->swtnl.nr_attribs; i++) {
-		unsigned idx = nvfx->swtnl.draw[i];
-		unsigned hw = nvfx->swtnl.hw[i];
-
-		WAIT_RING(chan, 5);
-		switch (nvfx->swtnl.emit[i]) {
-		case EMIT_OMIT:
-			break;
-		case EMIT_1F:
-			nvfx_emit_vtx_attr(chan, hw, v->data[idx], 1);
-			break;
-		case EMIT_2F:
-			nvfx_emit_vtx_attr(chan, hw, v->data[idx], 2);
-			break;
-		case EMIT_3F:
-			nvfx_emit_vtx_attr(chan, hw, v->data[idx], 3);
-			break;
-		case EMIT_4F:
-			nvfx_emit_vtx_attr(chan, hw, v->data[idx], 4);
-			break;
-		case 0xff:
-			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4F_X(hw), 4);
-			OUT_RING  (chan, fui(v->data[idx][0] / v->data[idx][3]));
-			OUT_RING  (chan, fui(v->data[idx][1] / v->data[idx][3]));
-			OUT_RING  (chan, fui(v->data[idx][2] / v->data[idx][3]));
-			OUT_RING  (chan, fui(1.0f / v->data[idx][3]));
-			break;
-		case EMIT_4UB:
-			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4UB(hw), 1);
-			OUT_RING  (chan, pack_ub4(float_to_ubyte(v->data[idx][0]),
-					    float_to_ubyte(v->data[idx][1]),
-					    float_to_ubyte(v->data[idx][2]),
-					    float_to_ubyte(v->data[idx][3])));
-		case EMIT_4UB_BGRA:
-			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4UB(hw), 1);
-			OUT_RING  (chan, pack_ub4(float_to_ubyte(v->data[idx][2]),
-					    float_to_ubyte(v->data[idx][1]),
-					    float_to_ubyte(v->data[idx][0]),
-					    float_to_ubyte(v->data[idx][3])));
-			break;
-		default:
-			assert(0);
-			break;
-		}
+	if (rs->prim != NV34TCL_VERTEX_BEGIN_END_STOP) {
+		assert(AVAIL_RING(chan) >= 2);
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+		OUT_RING(chan, NV34TCL_VERTEX_BEGIN_END_STOP);
+		rs->prim = NV34TCL_VERTEX_BEGIN_END_STOP;
 	}
 }
 
@@ -92,42 +46,61 @@ nvfx_render_prim(struct draw_stage *stage, struct prim_header *prim,
 
 	struct nvfx_screen *screen = nvfx->screen;
 	struct nouveau_channel *chan = screen->base.channel;
-	struct nouveau_grobj *eng3d = screen->eng3d;
-	unsigned i;
+	boolean no_elements = nvfx->vertprog->draw_no_elements;
+	unsigned num_attribs = nvfx->vertprog->draw_elements;
 
-	/* Ensure there's room for 4xfloat32 + potentially 3 begin/end */
-	if (AVAIL_RING(chan) < ((count * 20) + 6)) {
-		if (rs->prim != NV34TCL_VERTEX_BEGIN_END_STOP) {
-			NOUVEAU_ERR("AIII, missed flush\n");
-			assert(0);
-		}
+	/* we need to account for the flush here as well, even if it is done after
+	 * this function
+	 */
+	if (AVAIL_RING(chan) < ((1 + count * num_attribs * 4) + 6 + 64)) {
+		nvfx_render_flush(stage, 0);
 		FIRE_RING(chan);
 		nvfx_state_emit(nvfx);
+
+		assert(AVAIL_RING(chan) >= ((1 + count * num_attribs * 4) + 6 + 64));
 	}
 
 	/* Switch primitive modes if necessary */
 	if (rs->prim != mode) {
 		if (rs->prim != NV34TCL_VERTEX_BEGIN_END_STOP) {
-			BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
-			OUT_RING  (chan, NV34TCL_VERTEX_BEGIN_END_STOP);
+			OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+			OUT_RING(chan, NV34TCL_VERTEX_BEGIN_END_STOP);
+		}
+
+		/* XXX: emitting any command many times seems to (mostly) fix corruption that would otherwise happen */
+		/* this seems to cause issues on nv3x, and also be unneeded there */
+		if(nvfx->is_nv4x)
+		{
+			int i;
+			for(i = 0; i < 32; ++i)
+			{
+				OUT_RING(chan, RING_3D(0x1dac, 1));
+				OUT_RING(chan, 0);
+			}
 		}
 
-		BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
 		OUT_RING  (chan, mode);
 		rs->prim = mode;
 	}
 
-	/* Emit vertex data */
-	for (i = 0; i < count; i++)
-		nvfx_render_vertex(nvfx, prim->v[i]);
-
-	/* If it's likely we'll need to empty the push buffer soon, finish
-	 * off the primitive now.
-	 */
-	if (AVAIL_RING(chan) < ((count * 20) + 6)) {
-		BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (chan, NV34TCL_VERTEX_BEGIN_END_STOP);
-		rs->prim = NV34TCL_VERTEX_BEGIN_END_STOP;
+	OUT_RING(chan, RING_3D_NI(NV34TCL_VERTEX_DATA, num_attribs * 4 * count));
+	if(no_elements) {
+		OUT_RING(chan, 0);
+		OUT_RING(chan, 0);
+		OUT_RING(chan, 0);
+		OUT_RING(chan, 0);
+	} else {
+		for (unsigned i = 0; i < count; ++i)
+		{
+			struct vertex_header* v = prim->v[i];
+			/* TODO: disable divide where it's causing the problem, and remove this hack */
+			OUT_RING(chan, fui(v->data[0][0] / v->data[0][3]));
+			OUT_RING(chan, fui(v->data[0][1] / v->data[0][3]));
+			OUT_RING(chan, fui(v->data[0][2] / v->data[0][3]));
+			OUT_RING(chan, fui(1.0f / v->data[0][3]));
+			OUT_RINGp(chan, &v->data[1][0], 4 * (num_attribs - 1));
+		}
 	}
 }
 
@@ -149,25 +122,11 @@ nvfx_render_tri(struct draw_stage *draw, struct prim_header *prim)
 	nvfx_render_prim(draw, prim, NV34TCL_VERTEX_BEGIN_END_TRIANGLES, 3);
 }
 
-static void
-nvfx_render_flush(struct draw_stage *draw, unsigned flags)
-{
-	struct nvfx_render_stage *rs = nvfx_render_stage(draw);
-	struct nvfx_context *nvfx = rs->nvfx;
-	struct nvfx_screen *screen = nvfx->screen;
-	struct nouveau_channel *chan = screen->base.channel;
-	struct nouveau_grobj *eng3d = screen->eng3d;
-
-	if (rs->prim != NV34TCL_VERTEX_BEGIN_END_STOP) {
-		BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (chan, NV34TCL_VERTEX_BEGIN_END_STOP);
-		rs->prim = NV34TCL_VERTEX_BEGIN_END_STOP;
-	}
-}
-
 static void
 nvfx_render_reset_stipple_counter(struct draw_stage *draw)
 {
+	/* this doesn't really seem to work, but it matters rather little */
+	nvfx_render_flush(draw, 0);
 }
 
 static void
@@ -176,40 +135,11 @@ nvfx_render_destroy(struct draw_stage *draw)
 	FREE(draw);
 }
 
-static struct nvfx_vertex_program *
-nvfx_create_drawvp(struct nvfx_context *nvfx)
-{
-	struct ureg_program *ureg;
-	uint i;
-
-	ureg = ureg_create( TGSI_PROCESSOR_VERTEX );
-	if (ureg == NULL)
-		return NULL;
-
-	ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0), ureg_DECL_vs_input(ureg, 0));
-	ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0), ureg_DECL_vs_input(ureg, 3));
-	ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 1), ureg_DECL_vs_input(ureg, 4));
-	ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_BCOLOR, 0), ureg_DECL_vs_input(ureg, 3));
-	ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_BCOLOR, 1), ureg_DECL_vs_input(ureg, 4));
-	ureg_MOV(ureg,
-		   ureg_writemask(ureg_DECL_output(ureg, TGSI_SEMANTIC_FOG, 1), TGSI_WRITEMASK_X),
-		   ureg_DECL_vs_input(ureg, 5));
-	for (i = 0; i < 8; ++i)
-		ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, i), ureg_DECL_vs_input(ureg, 8 + i));
-
-	ureg_END( ureg );
-
-	return ureg_create_shader_and_destroy( ureg, &nvfx->pipe );
-}
-
 struct draw_stage *
 nvfx_draw_render_stage(struct nvfx_context *nvfx)
 {
 	struct nvfx_render_stage *render = CALLOC_STRUCT(nvfx_render_stage);
 
-	if (!nvfx->swtnl.vertprog)
-		nvfx->swtnl.vertprog = nvfx_create_drawvp(nvfx);
-
 	render->nvfx = nvfx;
 	render->stage.draw = nvfx->draw;
 	render->stage.point = nvfx_render_point;
@@ -231,6 +161,7 @@ nvfx_draw_vbo_swtnl(struct pipe_context *pipe, const struct pipe_draw_info* info
 
 	if (!nvfx_state_validate_swtnl(nvfx))
 		return;
+
 	nvfx_state_emit(nvfx);
 
 	/* these must be passed without adding the offsets */
@@ -256,62 +187,3 @@ nvfx_draw_vbo_swtnl(struct pipe_context *pipe, const struct pipe_draw_info* info
 
 	draw_flush(nvfx->draw);
 }
-
-static INLINE void
-emit_attrib(struct nvfx_context *nvfx, unsigned hw, unsigned emit,
-	    unsigned semantic, unsigned index)
-{
-	unsigned draw_out = draw_find_shader_output(nvfx->draw, semantic, index);
-	unsigned a = nvfx->swtnl.nr_attribs++;
-
-	nvfx->swtnl.hw[a] = hw;
-	nvfx->swtnl.emit[a] = emit;
-	nvfx->swtnl.draw[a] = draw_out;
-}
-
-void
-nvfx_vtxfmt_validate(struct nvfx_context *nvfx)
-{
-	struct nvfx_pipe_fragment_program *pfp = nvfx->fragprog;
-	unsigned colour = 0, texcoords = 0, fog = 0, i;
-
-	/* Determine needed fragprog inputs */
-	for (i = 0; i < pfp->info.num_inputs; i++) {
-		switch (pfp->info.input_semantic_name[i]) {
-		case TGSI_SEMANTIC_POSITION:
-			break;
-		case TGSI_SEMANTIC_COLOR:
-			colour |= (1 << pfp->info.input_semantic_index[i]);
-			break;
-		case TGSI_SEMANTIC_GENERIC:
-			texcoords |= (1 << pfp->info.input_semantic_index[i]);
-			break;
-		case TGSI_SEMANTIC_FOG:
-			fog = 1;
-			break;
-		default:
-			assert(0);
-		}
-	}
-
-	nvfx->swtnl.nr_attribs = 0;
-
-	/* Map draw vtxprog output to hw attribute IDs */
-	for (i = 0; i < 2; i++) {
-		if (!(colour & (1 << i)))
-			continue;
-		emit_attrib(nvfx, 3 + i, EMIT_4F, TGSI_SEMANTIC_COLOR, i);
-	}
-
-	for (i = 0; i < 8; i++) {
-		if (!(texcoords & (1 << i)))
-			continue;
-		emit_attrib(nvfx, 8 + i, EMIT_4F, TGSI_SEMANTIC_GENERIC, i);
-	}
-
-	if (fog) {
-		emit_attrib(nvfx, 5, EMIT_1F, TGSI_SEMANTIC_FOG, 0);
-	}
-
-	emit_attrib(nvfx, 0, 0xff, TGSI_SEMANTIC_POSITION, 0);
-}
diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c
index 93ba5382419..86df7f00496 100644
--- a/src/gallium/drivers/nvfx/nvfx_fragprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c
@@ -1263,7 +1263,7 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 		pfp->fps[key] = fp;
 	}
 
-	vp = nvfx->render_mode == HW ? nvfx->vertprog : nvfx->swtnl.vertprog;
+	vp = nvfx->hw_vertprog;
 
 	if (fp->last_vp_id != vp->id || fp->last_sprite_coord_enable != sprite_coord_enable) {
 		int sprite_real_input = -1;
diff --git a/src/gallium/drivers/nvfx/nvfx_screen.c b/src/gallium/drivers/nvfx/nvfx_screen.c
index ac8053f26b3..3900821de48 100644
--- a/src/gallium/drivers/nvfx/nvfx_screen.c
+++ b/src/gallium/drivers/nvfx/nvfx_screen.c
@@ -432,7 +432,7 @@ nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		return NULL;
 	}
 
-	screen->force_swtnl = debug_get_bool_option("NOUVEAU_SWTNL", FALSE);
+	screen->force_swtnl = debug_get_bool_option("NVFX_SWTNL", FALSE);
 	screen->trace_draw = debug_get_bool_option("NVFX_TRACE_DRAW", FALSE);
 
 	screen->buffer_allocation_cost = debug_get_num_option("NVFX_BUFFER_ALLOCATION_COST", 16384);
diff --git a/src/gallium/drivers/nvfx/nvfx_state.h b/src/gallium/drivers/nvfx/nvfx_state.h
index 9200f78a545..8fafca1950c 100644
--- a/src/gallium/drivers/nvfx/nvfx_state.h
+++ b/src/gallium/drivers/nvfx/nvfx_state.h
@@ -17,13 +17,8 @@ struct nvfx_vertex_program_data {
 };
 
 struct nvfx_vertex_program {
-	struct pipe_shader_state pipe;
 	unsigned long long id;
 
-	struct draw_vertex_shader *draw;
-
-	boolean translated;
-
 	struct nvfx_vertex_program_exec *insns;
 	unsigned nr_insns;
 	struct nvfx_vertex_program_data *consts;
@@ -46,6 +41,20 @@ struct nvfx_vertex_program {
 	struct util_dynarray const_relocs;
 };
 
+#define NVFX_VP_FAILED ((struct nvfx_vertex_program*)-1)
+
+struct nvfx_pipe_vertex_program {
+	struct pipe_shader_state pipe;
+	struct tgsi_shader_info info;
+
+	unsigned draw_elements;
+	boolean draw_no_elements;
+	struct draw_vertex_shader *draw_vs;
+	struct nvfx_vertex_program* draw_vp;
+
+	struct nvfx_vertex_program* vp;
+};
+
 struct nvfx_fragment_program_data {
 	unsigned offset;
 	unsigned index;
diff --git a/src/gallium/drivers/nvfx/nvfx_state_emit.c b/src/gallium/drivers/nvfx/nvfx_state_emit.c
index 308c25fbe1b..30ef12a95b6 100644
--- a/src/gallium/drivers/nvfx/nvfx_state_emit.c
+++ b/src/gallium/drivers/nvfx/nvfx_state_emit.c
@@ -145,7 +145,7 @@ nvfx_vertprog_ucp_validate(struct nvfx_context* nvfx)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
 	unsigned i;
-	struct nvfx_vertex_program* vp = nvfx->vertprog;
+	struct nvfx_vertex_program* vp = nvfx->hw_vertprog;
 	if(nvfx->clip.nr != vp->clip_nr)
 	{
 		unsigned idx;
@@ -230,7 +230,7 @@ nvfx_state_validate_common(struct nvfx_context *nvfx)
 
 	if(nvfx->render_mode == HW)
 	{
-		if(dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_VERTCONST | NVFX_NEW_UCP))
+		if(dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_VERTCONST))
 		{
 			if(!nvfx_vertprog_validate(nvfx))
 				return FALSE;
@@ -252,12 +252,10 @@ nvfx_state_validate_common(struct nvfx_context *nvfx)
 	}
 	else
 	{
-		/* TODO: this looks a bit misdesigned */
-		if(dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_UCP))
-			nvfx_vertprog_validate(nvfx);
-
-		if(dirty & (NVFX_NEW_ARRAYS | NVFX_NEW_INDEX | NVFX_NEW_FRAGPROG))
-			nvfx_vtxfmt_validate(nvfx);
+		if(dirty & NVFX_NEW_VERTPROG) {
+			if(!nvfx_vertprog_validate(nvfx)) assert(0); /* call kept outside assert(): it must still run when asserts are compiled out */
+			nvfx_vbo_swtnl_validate(nvfx);
+		}
 	}
 
 	if(dirty & NVFX_NEW_RAST)
@@ -284,7 +282,7 @@ nvfx_state_validate_common(struct nvfx_context *nvfx)
 
 	if(nvfx->is_nv4x)
 	{
-		unsigned vp_output = nvfx->vertprog->or | nvfx->hw_fragprog->or;
+		unsigned vp_output = nvfx->hw_vertprog->or | nvfx->hw_fragprog->or;
 		vp_output |= (1 << (nvfx->clip.nr + 6)) - (1 << 6);
 
 		if(vp_output != nvfx->hw_vp_output)
@@ -399,8 +397,6 @@ nvfx_state_relocate(struct nvfx_context *nvfx, unsigned relocs)
 boolean
 nvfx_state_validate(struct nvfx_context *nvfx)
 {
-	boolean was_sw = nvfx->fallback_swtnl ? TRUE : FALSE;
-
 	if (nvfx->render_mode != HW) {
 		/* Don't even bother trying to go back to hw if none
 		 * of the states that caused swtnl previously have changed.
@@ -429,7 +425,11 @@ nvfx_state_validate_swtnl(struct nvfx_context *nvfx)
 
 	/* Setup for swtnl */
 	if (nvfx->render_mode == HW) {
-		NOUVEAU_ERR("hw->swtnl 0x%08x\n", nvfx->fallback_swtnl);
+		static boolean warned = FALSE;
+		if(!warned) {
+			NOUVEAU_ERR("hw->swtnl 0x%08x\n", nvfx->fallback_swtnl);
+			warned = TRUE;
+		}
 		nvfx->pipe.flush(&nvfx->pipe, 0, NULL);
 		nvfx->dirty |= (NVFX_NEW_VIEWPORT |
 				NVFX_NEW_VERTPROG |
@@ -437,8 +437,11 @@ nvfx_state_validate_swtnl(struct nvfx_context *nvfx)
 		nvfx->render_mode = SWTNL;
 	}
 
-	if (nvfx->draw_dirty & NVFX_NEW_VERTPROG)
-		draw_bind_vertex_shader(draw, nvfx->vertprog->draw);
+	if (nvfx->draw_dirty & NVFX_NEW_VERTPROG) {
+		if(!nvfx->vertprog->draw_vs)
+			nvfx->vertprog->draw_vs = draw_create_vertex_shader(draw, &nvfx->vertprog->pipe);
+		draw_bind_vertex_shader(draw, nvfx->vertprog->draw_vs);
+	}
 
 	if (nvfx->draw_dirty & NVFX_NEW_RAST)
            draw_set_rasterizer_state(draw, &nvfx->rasterizer->pipe,
diff --git a/src/gallium/drivers/nvfx/nvfx_vbo.c b/src/gallium/drivers/nvfx/nvfx_vbo.c
index 611de808af5..c35e926a7a1 100644
--- a/src/gallium/drivers/nvfx/nvfx_vbo.c
+++ b/src/gallium/drivers/nvfx/nvfx_vbo.c
@@ -338,6 +338,44 @@ nvfx_vbo_validate(struct nvfx_context *nvfx)
 	return TRUE;
 }
 
+void
+nvfx_vbo_swtnl_validate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	unsigned num_outputs = nvfx->vertprog->draw_elements;
+	int elements = MAX2(num_outputs, nvfx->hw_vtxelt_nr);
+
+	if (!elements)
+		return;
+
+	WAIT_RING(chan, (1 + 6 + 1 + 2) + elements * 2);
+
+	OUT_RING(chan, RING_3D(NV34TCL_VTXFMT(0), elements));
+	for(unsigned i = 0; i < num_outputs; ++i)
+		OUT_RING(chan, (4 << NV34TCL_VTXFMT_SIZE_SHIFT) | NV34TCL_VTXFMT_TYPE_32_FLOAT);
+	for(unsigned i = num_outputs; i < elements; ++i)
+		OUT_RING(chan, NV34TCL_VTXFMT_TYPE_32_FLOAT);
+
+	if(nvfx->is_nv4x) {
+		unsigned i;
+		/* seems to be some kind of cache flushing */
+		for(i = 0; i < 3; ++i) {
+			OUT_RING(chan, RING_3D(0x1718, 1));
+			OUT_RING(chan, 0);
+		}
+	}
+
+	OUT_RING(chan, RING_3D(NV34TCL_VTXBUF_ADDRESS(0), elements));
+	for (unsigned i = 0; i < elements; i++)
+		OUT_RING(chan, 0);
+
+	OUT_RING(chan, RING_3D(0x1710, 1));
+	OUT_RING(chan, 0);
+
+	nvfx->hw_vtxelt_nr = num_outputs;
+	nvfx->relocs_needed &=~ NVFX_RELOCATE_VTXBUF;
+}
+
 void
 nvfx_vbo_relocate(struct nvfx_context *nvfx)
 {
diff --git a/src/gallium/drivers/nvfx/nvfx_vertprog.c b/src/gallium/drivers/nvfx/nvfx_vertprog.c
index bc78ed400a9..30385b26f79 100644
--- a/src/gallium/drivers/nvfx/nvfx_vertprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_vertprog.c
@@ -8,6 +8,7 @@
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_ureg.h"
 
 #include "draw/draw_context.h"
 
@@ -37,6 +38,7 @@ struct nvfx_loop_entry
 
 struct nvfx_vpc {
 	struct nvfx_context* nvfx;
+	struct pipe_shader_state pipe;
 	struct nvfx_vertex_program *vp;
 
 	struct nvfx_vertex_program_exec *vpi;
@@ -813,7 +815,7 @@ nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
 	unsigned num_outputs;
 	unsigned num_texcoords = nvfx->is_nv4x ? 10 : 8;
 
-	num_outputs = util_semantic_set_from_program_file(&set, vpc->vp->pipe.tokens, TGSI_FILE_OUTPUT);
+	num_outputs = util_semantic_set_from_program_file(&set, vpc->pipe.tokens, TGSI_FILE_OUTPUT);
 
 	if(num_outputs > num_texcoords) {
 		NOUVEAU_ERR("too many vertex program outputs: %i\n", num_outputs);
@@ -840,7 +842,7 @@ nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
 		}
 	}
 
-	tgsi_parse_init(&p, vpc->vp->pipe.tokens);
+	tgsi_parse_init(&p, vpc->pipe.tokens);
 	while (!tgsi_parse_end_of_tokens(&p)) {
 		const union tgsi_full_token *tok = &p.FullToken;
 
@@ -917,21 +919,35 @@ nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
 
 DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_vp, "NVFX_DUMP_VP", FALSE)
 
-static void
-nvfx_vertprog_translate(struct nvfx_context *nvfx,
-			struct nvfx_vertex_program *vp)
+static struct nvfx_vertex_program*
+nvfx_vertprog_translate(struct nvfx_context *nvfx, const struct pipe_shader_state* vps)
 {
 	struct tgsi_parse_context parse;
+	struct nvfx_vertex_program* vp = NULL;
 	struct nvfx_vpc *vpc = NULL;
 	struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
 	struct util_dynarray insns;
 	int i;
 
-	vpc = CALLOC(1, sizeof(struct nvfx_vpc));
+	tgsi_parse_init(&parse, vps->tokens);
+
+	vp = CALLOC_STRUCT(nvfx_vertex_program);
+	if(!vp)
+		goto out_err;
+
+	vpc = CALLOC_STRUCT(nvfx_vpc);
 	if (!vpc)
-		return;
+		goto out_err;
+
 	vpc->nvfx = nvfx;
 	vpc->vp = vp;
+	vpc->pipe = *vps;
+
+	{
+		// TODO: use a 64-bit atomic here!
+		static unsigned long long id = 0;
+		vp->id = ++id;
+	}
 
 	/* reserve space for ucps */
 	if(nvfx->use_vp_clipping)
@@ -942,7 +958,7 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx,
 
 	if (!nvfx_vertprog_prepare(nvfx, vpc)) {
 		FREE(vpc);
-		return;
+		return NULL;
 	}
 
 	/* Redirect post-transform vertex position to a temp if user clip
@@ -955,8 +971,6 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx,
 		vpc->r_temps_discard = 0;
 	}
 
-	tgsi_parse_init(&parse, vp->pipe.tokens);
-
 	util_dynarray_init(&insns);
 	while (!tgsi_parse_end_of_tokens(&parse)) {
 		tgsi_parse_token(&parse);
@@ -1058,7 +1072,7 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx,
 	if(debug_get_option_nvfx_dump_vp())
 	{
 		debug_printf("\n");
-		tgsi_dump(vp->pipe.tokens, 0);
+		tgsi_dump(vpc->pipe.tokens, 0);
 
 		debug_printf("\n%s vertex program:\n", nvfx->is_nv4x ? "nv4x" : "nv3x");
 		for (i = 0; i < vp->nr_insns; i++)
@@ -1068,20 +1082,49 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx,
 
 	vp->clip_nr = -1;
 	vp->exec_start = -1;
-	vp->translated = TRUE;
-out_err:
+
+out:
 	tgsi_parse_free(&parse);
-	util_dynarray_fini(&vpc->label_relocs);
-	util_dynarray_fini(&vpc->loop_stack);
-	if (vpc->r_temp)
+	if(vpc) {
+		util_dynarray_fini(&vpc->label_relocs);
+		util_dynarray_fini(&vpc->loop_stack);
 		FREE(vpc->r_temp);
-	if (vpc->r_address)
 		FREE(vpc->r_address);
-	if (vpc->r_const)
 		FREE(vpc->r_const);
-	if (vpc->imm)
 		FREE(vpc->imm);
-	FREE(vpc);
+		FREE(vpc);
+	}
+	return vp;
+
+out_err:
+	FREE(vp);
+	vp = NULL;
+	goto out;
+}
+
+static struct nvfx_vertex_program*
+nvfx_vertprog_translate_draw_vp(struct nvfx_context *nvfx, struct nvfx_pipe_vertex_program* pvp)
+{
+	struct nvfx_vertex_program* vp = NULL;
+	struct pipe_shader_state vps;
+	struct ureg_program *ureg = NULL;
+	unsigned num_outputs = MIN2(pvp->info.num_outputs, 16);
+
+	ureg = ureg_create( TGSI_PROCESSOR_VERTEX );
+	if(ureg == NULL)
+		return 0;
+
+	for (unsigned i = 0; i < num_outputs; i++)
+		ureg_MOV(ureg, ureg_DECL_output(ureg, pvp->info.output_semantic_name[i], pvp->info.output_semantic_index[i]), ureg_DECL_vs_input(ureg, i));
+
+	ureg_END( ureg );
+
+	vps.tokens = ureg_get_tokens(ureg, 0);
+	vp = nvfx_vertprog_translate(nvfx, &vps);
+	ureg_free_tokens(vps.tokens);
+	ureg_destroy(ureg);
+
+	return vp;
 }
 
 boolean
@@ -1090,30 +1133,44 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 	struct nvfx_screen *screen = nvfx->screen;
 	struct nouveau_channel *chan = screen->base.channel;
 	struct nouveau_grobj *eng3d = screen->eng3d;
-	struct nvfx_vertex_program *vp;
+	struct nvfx_pipe_vertex_program *pvp = nvfx->vertprog;
+	struct nvfx_vertex_program* vp;
 	struct pipe_resource *constbuf;
 	boolean upload_code = FALSE, upload_data = FALSE;
 	int i;
 
 	if (nvfx->render_mode == HW) {
-		vp = nvfx->vertprog;
-		constbuf = nvfx->constbuf[PIPE_SHADER_VERTEX];
-	} else {
-		vp = nvfx->swtnl.vertprog;
-		constbuf = NULL;
-	}
-
-	/* Translate TGSI shader into hw bytecode */
-	if (!vp->translated)
-	{
 		nvfx->fallback_swtnl &= ~NVFX_NEW_VERTPROG;
-		nvfx_vertprog_translate(nvfx, vp);
-		if (!vp->translated) {
+		vp = pvp->vp;
+
+		if(!vp) {
+			vp = nvfx_vertprog_translate(nvfx, &pvp->pipe);
+			if(!vp)
+				vp = NVFX_VP_FAILED;
+			pvp->vp = vp;
+		}
+
+		if(vp == NVFX_VP_FAILED) {
 			nvfx->fallback_swtnl |= NVFX_NEW_VERTPROG;
 			return FALSE;
 		}
+
+		constbuf = nvfx->constbuf[PIPE_SHADER_VERTEX];
+	} else {
+		vp = pvp->draw_vp;
+		if(!vp)
+		{
+			pvp->draw_vp = vp = nvfx_vertprog_translate_draw_vp(nvfx, pvp);
+			if(!vp) {
+				_debug_printf("Error: unable to create a swtnl passthrough vertex shader: aborting.");
+				abort();
+			}
+		}
+		constbuf = NULL;
 	}
 
+	nvfx->hw_vertprog = vp;
+
 	/* Allocate hw vtxprog exec slots */
 	if (!vp->exec) {
 		struct nouveau_resource *heap = nvfx->screen->vp_exec_heap;
@@ -1294,24 +1351,22 @@ nvfx_vertprog_destroy(struct nvfx_context *nvfx, struct nvfx_vertex_program *vp)
 
 	util_dynarray_fini(&vp->branch_relocs);
 	util_dynarray_fini(&vp->const_relocs);
+	FREE(vp);
 }
 
 static void *
-nvfx_vp_state_create(struct pipe_context *pipe,
-                     const struct pipe_shader_state *cso)
+nvfx_vp_state_create(struct pipe_context *pipe, const struct pipe_shader_state *cso)
 {
         struct nvfx_context *nvfx = nvfx_context(pipe);
-        struct nvfx_vertex_program *vp;
-
-        // TODO: use a 64-bit atomic here!
-        static unsigned long long id = 0;
+        struct nvfx_pipe_vertex_program *pvp;
 
-        vp = CALLOC(1, sizeof(struct nvfx_vertex_program));
-        vp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
-        vp->draw = draw_create_vertex_shader(nvfx->draw, &vp->pipe);
-        vp->id = ++id;
+        pvp = CALLOC(1, sizeof(struct nvfx_pipe_vertex_program));
+        pvp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+        tgsi_scan_shader(pvp->pipe.tokens, &pvp->info);
+        pvp->draw_elements = MAX2(1, MIN2(pvp->info.num_outputs, 16));
+        pvp->draw_no_elements = pvp->info.num_outputs == 0;
 
-        return (void *)vp;
+        return (void *)pvp;
 }
 
 static void
@@ -1327,13 +1382,17 @@ nvfx_vp_state_bind(struct pipe_context *pipe, void *hwcso)
 static void
 nvfx_vp_state_delete(struct pipe_context *pipe, void *hwcso)
 {
-        struct nvfx_context *nvfx = nvfx_context(pipe);
-        struct nvfx_vertex_program *vp = hwcso;
-
-        draw_delete_vertex_shader(nvfx->draw, vp->draw);
-        nvfx_vertprog_destroy(nvfx, vp);
-        FREE((void*)vp->pipe.tokens);
-        FREE(vp);
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_pipe_vertex_program *pvp = hwcso;
+
+	if(pvp->draw_vs)
+		draw_delete_vertex_shader(nvfx->draw, pvp->draw_vs);
+	if(pvp->vp && pvp->vp != NVFX_VP_FAILED)
+		nvfx_vertprog_destroy(nvfx, pvp->vp);
+	if(pvp->draw_vp)
+		nvfx_vertprog_destroy(nvfx, pvp->draw_vp);
+	FREE((void*)pvp->pipe.tokens);
+	FREE(pvp);
 }
 
 void
-- 
cgit v1.2.3