Merge commit 'origin/master' into drm-gem

author: Eric Anholt <[email protected]> 2008-06-18 14:07:38 -0700
committer: Eric Anholt <[email protected]> 2008-06-18 14:07:38 -0700
commit: 654258a4fe5e7114022c6e02f2844fc469fcc6f3 (patch)
tree: 89d285becb87659ab61ee0ceeb35c76726ae93d2 /src/mesa/drivers/dri/r300
parent: 64adeb163d7da6d75b5664cd2ee3783cadaf63d8 (diff)
parent: cf29ab3ba075905cca786b52617d7dc993f58033 (diff)
23 files changed, 5398 insertions, 3988 deletions
diff --git a/src/mesa/drivers/dri/r300/Makefile b/src/mesa/drivers/dri/r300/Makefile
index 5b2bd0bc2b0..d52b2b4c36d 100644
--- a/src/mesa/drivers/dri/r300/Makefile
+++ b/src/mesa/drivers/dri/r300/Makefile
@@ -28,7 +28,6 @@ DRIVER_SOURCES = \
 		 radeon_span.c \
 		 radeon_state.c \
 		 r300_mem.c \
-		 \
 		 r300_context.c \
 		 r300_ioctl.c \
 		 r300_cmdbuf.c \
@@ -37,9 +36,13 @@ DRIVER_SOURCES = \
 		 r300_texmem.c \
 		 r300_tex.c \
 		 r300_texstate.c \
+		 radeon_program.c \
+		 radeon_program_alu.c \
 		 r300_vertprog.c \
 		 r300_fragprog.c \
+		 r300_fragprog_emit.c \
 		 r500_fragprog.c \
+		 r500_fragprog_emit.c \
 		 r300_shader.c \
 		 r300_emit.c \
 		 r300_swtcl.c \
diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c
index 31cc00a081b..063d4e575ef 100644
--- a/src/mesa/drivers/dri/r300/r300_context.c
+++ b/src/mesa/drivers/dri/r300/r300_context.c
@@ -98,6 +98,7 @@ const struct dri_extension card_extensions[] = {
   {"GL_ARB_fragment_program",		NULL},
   {"GL_ARB_multisample",		GL_ARB_multisample_functions},
   {"GL_ARB_multitexture",		NULL},
+  {"GL_ARB_shadow",			NULL},
   {"GL_ARB_texture_border_clamp",	NULL},
   {"GL_ARB_texture_compression",	GL_ARB_texture_compression_functions},
   {"GL_ARB_texture_cube_map",		NULL},
@@ -116,6 +117,7 @@ const struct dri_extension card_extensions[] = {
   {"GL_EXT_multi_draw_arrays",		GL_EXT_multi_draw_arrays_functions},
   {"GL_EXT_gpu_program_parameters",     GL_EXT_gpu_program_parameters_functions},
   {"GL_EXT_secondary_color", 		GL_EXT_secondary_color_functions},
+  {"GL_EXT_shadow_funcs",		NULL},
   {"GL_EXT_stencil_two_side",		GL_EXT_stencil_two_side_functions},
   {"GL_EXT_stencil_wrap",		NULL},
   {"GL_EXT_texture_edge_clamp",		NULL},
diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h
index a5ec5ee46e2..6279a67ab16 100644
--- a/src/mesa/drivers/dri/r300/r300_context.h
+++ b/src/mesa/drivers/dri/r300/r300_context.h
@@ -73,7 +73,6 @@ typedef struct r300_context *r300ContextPtr;
 	}
 
 #include "r300_vertprog.h"
-#include "r300_fragprog.h"
 #include "r500_fragprog.h"
 
 /**
@@ -179,13 +178,6 @@ struct r300_tex_obj {
 	GLuint bufAddr;		/* Offset to start of locally
 				   shared texture block */
 
-	GLuint dirty_state;	/* Flags (1 per texunit) for
-				   whether or not this texobj
-				   has dirty hardware state
-				   (pp_*) that needs to be
-				   brought into the
-				   texunit. */
-
 	drm_radeon_tex_image_t image[6][RADEON_MAX_TEXTURE_LEVELS];
 	/* Six, for the cube faces */
 
@@ -581,9 +573,7 @@ struct r300_depthbuffer_state {
 };
 
 struct r300_stencilbuffer_state {
-	GLuint clear;
 	GLboolean hw_stencil;
-
 };
 
 /* Vertex shader state */
@@ -663,96 +653,40 @@ struct r300_vertex_program_cont {
 #define PFS_NUM_TEMP_REGS	32
 #define PFS_NUM_CONST_REGS	16
 
-/* Mapping Mesa registers to R300 temporaries */
-struct reg_acc {
-	int reg;		/* Assigned hw temp */
-	unsigned int refcount;	/* Number of uses by mesa program */
-};
+struct r300_pfs_compile_state;
 
-/**
- * Describe the current lifetime information for an R300 temporary
- */
-struct reg_lifetime {
-	/* Index of the first slot where this register is free in the sense
-	   that it can be used as a new destination register.
-	   This is -1 if the register has been assigned to a Mesa register
-	   and the last access to the register has not yet been emitted */
-	int free;
-
-	/* Index of the first slot where this register is currently reserved.
-	   This is used to stop e.g. a scalar operation from being moved
-	   before the allocation time of a register that was first allocated
-	   for a vector operation. */
-	int reserved;
-
-	/* Index of the first slot in which the register can be used as a
-	   source without losing the value that is written by the last
-	   emitted instruction that writes to the register */
-	int vector_valid;
-	int scalar_valid;
-
-	/* Index to the slot where the register was last read.
-	   This is also the first slot in which the register may be written again */
-	int vector_lastread;
-	int scalar_lastread;
-};
 
 /**
- * Store usage information about an ALU instruction slot during the
- * compilation of a fragment program.
+ * Stores state that influences the compilation of a fragment program.
  */
-#define SLOT_SRC_VECTOR  (1<<0)
-#define SLOT_SRC_SCALAR  (1<<3)
-#define SLOT_SRC_BOTH    (SLOT_SRC_VECTOR | SLOT_SRC_SCALAR)
-#define SLOT_OP_VECTOR   (1<<16)
-#define SLOT_OP_SCALAR   (1<<17)
-#define SLOT_OP_BOTH     (SLOT_OP_VECTOR | SLOT_OP_SCALAR)
-
-struct r300_pfs_compile_slot {
-	/* Bitmask indicating which parts of the slot are used, using SLOT_ constants
-	   defined above */
-	unsigned int used;
-
-	/* Selected sources */
-	int vsrc[3];
-	int ssrc[3];
+struct r300_fragment_program_external_state {
+	struct {
+		/**
+		 * If the sampler is used as a shadow sampler,
+		 * this field is:
+		 *  0 - GL_LUMINANCE
+		 *  1 - GL_INTENSITY
+		 *  2 - GL_ALPHA
+		 * depending on the depth texture mode.
+		 */
+		GLuint depth_texture_mode : 2;
+
+		/**
+		 * If the sampler is used as a shadow sampler,
+		 * this field is (texture_compare_func - GL_NEVER).
+		 * [e.g. if compare function is GL_LEQUAL, this field is 3]
+		 *
+		 * Otherwise, this field is 0.
+		 */
+		GLuint texture_compare_func : 3;
+	} unit[16];
 };
 
-/**
- * Store information during compilation of fragment programs.
- */
-struct r300_pfs_compile_state {
-	int nrslots;		/* number of ALU slots used so far */
-
-	/* Track which (parts of) slots are already filled with instructions */
-	struct r300_pfs_compile_slot slot[PFS_MAX_ALU_INST];
-
-	/* Track the validity of R300 temporaries */
-	struct reg_lifetime hwtemps[PFS_NUM_TEMP_REGS];
-
-	/* Used to map Mesa's inputs/temps onto hardware temps */
-	int temp_in_use;
-	struct reg_acc temps[PFS_NUM_TEMP_REGS];
-	struct reg_acc inputs[32];	/* don't actually need 32... */
-
-	/* Track usage of hardware temps, for register allocation,
-	 * indirection detection, etc. */
-	GLuint used_in_node;
-	GLuint dest_in_node;
-};
 
 /**
- * Store everything about a fragment program that is needed
- * to render with that program.
+ * Stores an R300 fragment program in its compiled-to-hardware form.
  */
-struct r300_fragment_program {
-	struct gl_fragment_program mesa_program;
-
-	GLcontext *ctx;
-	GLboolean translated;
-	GLboolean error;
-	struct r300_pfs_compile_state *cs;
-
+struct r300_fragment_program_code {
 	struct {
 		int length;
 		GLuint inst[PFS_MAX_TEX_INST];
@@ -793,19 +727,51 @@ struct r300_fragment_program {
 	int const_nr;
 
 	int max_temp_idx;
-
-	GLboolean WritesDepth;
-	GLuint optimization;
 };
 
-struct r500_fragment_program {
+/**
+ * Store everything about a fragment program that is needed
+ * to render with that program.
+ */
+struct r300_fragment_program {
 	struct gl_fragment_program mesa_program;
 
-	GLcontext *ctx;
 	GLboolean translated;
 	GLboolean error;
-	struct r300_pfs_compile_state *cs;
 
+	struct r300_fragment_program_external_state state;
+	struct r300_fragment_program_code code;
+
+	GLboolean WritesDepth;
+	GLuint optimization;
+};
+
+struct r500_pfs_compile_state;
+
+struct r500_fragment_program_external_state {
+	struct {
+		/**
+		 * If the sampler is used as a shadow sampler,
+		 * this field is:
+		 *  0 - GL_LUMINANCE
+		 *  1 - GL_INTENSITY
+		 *  2 - GL_ALPHA
+		 * depending on the depth texture mode.
+		 */
+		GLuint depth_texture_mode : 2;
+
+		/**
+		 * If the sampler is used as a shadow sampler,
+		 * this field is (texture_compare_func - GL_NEVER).
+		 * [e.g. if compare function is GL_LEQUAL, this field is 3]
+		 *
+		 * Otherwise, this field is 0.
+		 */
+		GLuint texture_compare_func : 3;
+	} unit[16];
+};
+
+struct r500_fragment_program_code {
 	struct {
 		GLuint inst0;
 		GLuint inst1;
@@ -822,17 +788,28 @@ struct r500_fragment_program {
 	int inst_end;
 
 	/* Hardware constants.
-	 * Contains a pointer to the value. The destination of the pointer
-	 * is supposed to be updated when GL state changes.
-	 * Typically, this is either a pointer into
-	 * gl_program_parameter_list::ParameterValues, or a pointer to a
-	 * global constant (e.g. for sin/cos-approximation)
-	 */
+	* Contains a pointer to the value. The destination of the pointer
+	* is supposed to be updated when GL state changes.
+	* Typically, this is either a pointer into
+	* gl_program_parameter_list::ParameterValues, or a pointer to a
+	* global constant (e.g. for sin/cos-approximation)
+	*/
 	const GLfloat *constant[PFS_NUM_CONST_REGS];
 	int const_nr;
 
 	int max_temp_idx;
+};
 
+struct r500_fragment_program {
+	struct gl_fragment_program mesa_program;
+
+	GLcontext *ctx;
+	GLboolean translated;
+	GLboolean error;
+
+	struct r500_fragment_program_external_state state;
+	struct r500_fragment_program_code code;
+
 	GLboolean writes_depth;
 
 	GLuint optimization;
@@ -849,7 +826,6 @@ struct r300_state {
 	struct r300_texture_state texture;
 	int sw_tcl_inputs[VERT_ATTRIB_MAX];
 	struct r300_vertex_shader_state vertex_shader;
-	struct r300_pfs_compile_state pfs_compile;
 	struct r300_dma_region aos[R300_MAX_AOS_ARRAYS];
 	int aos_count;
 
@@ -949,6 +925,7 @@ struct r300_context {
 	driTextureObject swapped;
 	int texture_depth;
 	float initialMaxAnisotropy;
+	float LODBias;
 
 	/* Clientdata textures;
 	 */
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c
index 54b80d20a16..6d24d266fec 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.c
@@ -28,16 +28,14 @@
 /**
  * \file
  *
- * \author Ben Skeggs <[email protected]>
+ * Fragment program compiler. Perform transformations on the intermediate
+ * \ref radeon_program representation (which is essentially the Mesa
+ * program representation plus the notion of clauses) until the program
+ * is in a form where we can translate it more or less directly into
+ * machine-readable form.
  *
+ * \author Ben Skeggs <[email protected]>
  * \author Jerome Glisse <[email protected]>
- *
- * \todo Depth write, WPOS/FOGC inputs
- *
- * \todo FogOption
- *
- * \todo Verify results of opcodes for accuracy, I've only checked them in
- * specific cases.
  */
 
 #include "glheader.h"
@@ -49,1967 +47,225 @@
 
 #include "r300_context.h"
 #include "r300_fragprog.h"
-#include "r300_reg.h"
 #include "r300_state.h"
 
-/*
- * Usefull macros and values
- */
-#define ERROR(fmt, args...) do {			\
-		fprintf(stderr, "%s::%s(): " fmt "\n",	\
-			__FILE__, __FUNCTION__, ##args);	\
-		fp->error = GL_TRUE;			\
-	} while(0)
-
-#define PFS_INVAL 0xFFFFFFFF
-#define COMPILE_STATE struct r300_pfs_compile_state *cs = fp->cs
-
-#define SWIZZLE_XYZ		0
-#define SWIZZLE_XXX		1
-#define SWIZZLE_YYY		2
-#define SWIZZLE_ZZZ		3
-#define SWIZZLE_WWW		4
-#define SWIZZLE_YZX		5
-#define SWIZZLE_ZXY		6
-#define SWIZZLE_WZY		7
-#define SWIZZLE_111		8
-#define SWIZZLE_000		9
-#define SWIZZLE_HHH		10
-
-#define swizzle(r, x, y, z, w) do_swizzle(fp, r,		\
-					  ((SWIZZLE_##x<<0)|	\
-					   (SWIZZLE_##y<<3)|	\
-					   (SWIZZLE_##z<<6)|	\
-					   (SWIZZLE_##w<<9)),	\
-					  0)
-
-#define REG_TYPE_INPUT		0
-#define REG_TYPE_OUTPUT		1
-#define REG_TYPE_TEMP		2
-#define REG_TYPE_CONST		3
-
-#define REG_TYPE_SHIFT		0
-#define REG_INDEX_SHIFT		2
-#define REG_VSWZ_SHIFT		8
-#define REG_SSWZ_SHIFT		13
-#define REG_NEGV_SHIFT		18
-#define REG_NEGS_SHIFT		19
-#define REG_ABS_SHIFT		20
-#define REG_NO_USE_SHIFT	21	// Hack for refcounting
-#define REG_VALID_SHIFT		22	// Does the register contain a defined value?
-#define REG_BUILTIN_SHIFT   23	// Is it a builtin (like all zero/all one)?
-
-#define REG_TYPE_MASK		(0x03 << REG_TYPE_SHIFT)
-#define REG_INDEX_MASK		(0x3F << REG_INDEX_SHIFT)
-#define REG_VSWZ_MASK		(0x1F << REG_VSWZ_SHIFT)
-#define REG_SSWZ_MASK		(0x1F << REG_SSWZ_SHIFT)
-#define REG_NEGV_MASK		(0x01 << REG_NEGV_SHIFT)
-#define REG_NEGS_MASK		(0x01 << REG_NEGS_SHIFT)
-#define REG_ABS_MASK		(0x01 << REG_ABS_SHIFT)
-#define REG_NO_USE_MASK		(0x01 << REG_NO_USE_SHIFT)
-#define REG_VALID_MASK		(0x01 << REG_VALID_SHIFT)
-#define REG_BUILTIN_MASK	(0x01 << REG_BUILTIN_SHIFT)
-
-#define REG(type, index, vswz, sswz, nouse, valid, builtin)	\
-	(((type << REG_TYPE_SHIFT) & REG_TYPE_MASK) |			\
-	 ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK) |		\
-	 ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |		\
-	 ((valid << REG_VALID_SHIFT) & REG_VALID_MASK) |		\
-	 ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |	\
-	 ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |			\
-	 ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
-#define REG_GET_TYPE(reg)						\
-	((reg & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
-#define REG_GET_INDEX(reg)						\
-	((reg & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
-#define REG_GET_VSWZ(reg)						\
-	((reg & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
-#define REG_GET_SSWZ(reg)						\
-	((reg & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
-#define REG_GET_NO_USE(reg)						\
-	((reg & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
-#define REG_GET_VALID(reg)						\
-	((reg & REG_VALID_MASK) >> REG_VALID_SHIFT)
-#define REG_GET_BUILTIN(reg)						\
-	((reg & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)
-#define REG_SET_TYPE(reg, type)						\
-	reg = ((reg & ~REG_TYPE_MASK) |					\
-	       ((type << REG_TYPE_SHIFT) & REG_TYPE_MASK))
-#define REG_SET_INDEX(reg, index)					\
-	reg = ((reg & ~REG_INDEX_MASK) |				\
-	       ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK))
-#define REG_SET_VSWZ(reg, vswz)						\
-	reg = ((reg & ~REG_VSWZ_MASK) |					\
-	       ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK))
-#define REG_SET_SSWZ(reg, sswz)						\
-	reg = ((reg & ~REG_SSWZ_MASK) |					\
-	       ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
-#define REG_SET_NO_USE(reg, nouse)					\
-	reg = ((reg & ~REG_NO_USE_MASK) |				\
-	       ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK))
-#define REG_SET_VALID(reg, valid)					\
-	reg = ((reg & ~REG_VALID_MASK) |				\
-	       ((valid << REG_VALID_SHIFT) & REG_VALID_MASK))
-#define REG_SET_BUILTIN(reg, builtin)					\
-	reg = ((reg & ~REG_BUILTIN_MASK) |				\
-	       ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK))
-#define REG_ABS(reg)							\
-	reg = (reg | REG_ABS_MASK)
-#define REG_NEGV(reg)							\
-	reg = (reg | REG_NEGV_MASK)
-#define REG_NEGS(reg)							\
-	reg = (reg | REG_NEGS_MASK)
-
-/*
- * Datas structures for fragment program generation
- */
-
-/* description of r300 native hw instructions */
-static const struct {
-	const char *name;
-	int argc;
-	int v_op;
-	int s_op;
-} r300_fpop[] = {
-	/* *INDENT-OFF* */
-	{"MAD", 3, R300_ALU_OUTC_MAD, R300_ALU_OUTA_MAD},
-	{"DP3", 2, R300_ALU_OUTC_DP3, R300_ALU_OUTA_DP4},
-	{"DP4", 2, R300_ALU_OUTC_DP4, R300_ALU_OUTA_DP4},
-	{"MIN", 2, R300_ALU_OUTC_MIN, R300_ALU_OUTA_MIN},
-	{"MAX", 2, R300_ALU_OUTC_MAX, R300_ALU_OUTA_MAX},
-	{"CMP", 3, R300_ALU_OUTC_CMP, R300_ALU_OUTA_CMP},
-	{"FRC", 1, R300_ALU_OUTC_FRC, R300_ALU_OUTA_FRC},
-	{"EX2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_EX2},
-	{"LG2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_LG2},
-	{"RCP", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RCP},
-	{"RSQ", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RSQ},
-	{"REPL_ALPHA", 1, R300_ALU_OUTC_REPL_ALPHA, PFS_INVAL},
-	{"CMPH", 3, R300_ALU_OUTC_CMPH, PFS_INVAL},
-	/* *INDENT-ON* */
-};
-
-/* vector swizzles r300 can support natively, with a couple of
- * cases we handle specially
- *
- * REG_VSWZ/REG_SSWZ is an index into this table
- */
-
-/* mapping from SWIZZLE_* to r300 native values for scalar insns */
-#define SWIZZLE_HALF 6
-
-#define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
-					  SWIZZLE_##y, \
-					  SWIZZLE_##z, \
-					  SWIZZLE_ZERO))
-/* native swizzles */
-static const struct r300_pfs_swizzle {
-	GLuint hash;		/* swizzle value this matches */
-	GLuint base;		/* base value for hw swizzle */
-	GLuint stride;		/* difference in base between arg0/1/2 */
-	GLuint flags;
-} v_swiz[] = {
-	/* *INDENT-OFF* */
-	{MAKE_SWZ3(X, Y, Z), R300_ALU_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR},
-	{MAKE_SWZ3(X, X, X), R300_ALU_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR},
-	{MAKE_SWZ3(Y, Y, Y), R300_ALU_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR},
-	{MAKE_SWZ3(Z, Z, Z), R300_ALU_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR},
-	{MAKE_SWZ3(W, W, W), R300_ALU_ARGC_SRC0A, 1, SLOT_SRC_SCALAR},
-	{MAKE_SWZ3(Y, Z, X), R300_ALU_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR},
-	{MAKE_SWZ3(Z, X, Y), R300_ALU_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR},
-	{MAKE_SWZ3(W, Z, Y), R300_ALU_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH},
-	{MAKE_SWZ3(ONE, ONE, ONE), R300_ALU_ARGC_ONE, 0, 0},
-	{MAKE_SWZ3(ZERO, ZERO, ZERO), R300_ALU_ARGC_ZERO, 0, 0},
-	{MAKE_SWZ3(HALF, HALF, HALF), R300_ALU_ARGC_HALF, 0, 0},
-	{PFS_INVAL, 0, 0, 0},
-	/* *INDENT-ON* */
-};
-
-/* used during matching of non-native swizzles */
-#define SWZ_X_MASK (7 << 0)
-#define SWZ_Y_MASK (7 << 3)
-#define SWZ_Z_MASK (7 << 6)
-#define SWZ_W_MASK (7 << 9)
-static const struct {
-	GLuint hash;		/* used to mask matching swizzle components */
-	int mask;		/* actual outmask */
-	int count;		/* count of components matched */
-} s_mask[] = {
-	/* *INDENT-OFF* */
-	{SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK, 1 | 2 | 4, 3},
-	{SWZ_X_MASK | SWZ_Y_MASK, 1 | 2, 2},
-	{SWZ_X_MASK | SWZ_Z_MASK, 1 | 4, 2},
-	{SWZ_Y_MASK | SWZ_Z_MASK, 2 | 4, 2},
-	{SWZ_X_MASK, 1, 1},
-	{SWZ_Y_MASK, 2, 1},
-	{SWZ_Z_MASK, 4, 1},
-	{PFS_INVAL, PFS_INVAL, PFS_INVAL}
-	/* *INDENT-ON* */
-};
-
-static const struct {
-	int base;		/* hw value of swizzle */
-	int stride;		/* difference between SRC0/1/2 */
-	GLuint flags;
-} s_swiz[] = {
-	/* *INDENT-OFF* */
-	{R300_ALU_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR},
-	{R300_ALU_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR},
-	{R300_ALU_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR},
-	{R300_ALU_ARGA_SRC0A, 1, SLOT_SRC_SCALAR},
-	{R300_ALU_ARGA_ZERO, 0, 0},
-	{R300_ALU_ARGA_ONE, 0, 0},
-	{R300_ALU_ARGA_HALF, 0, 0}
-	/* *INDENT-ON* */
-};
-
-/* boiler-plate reg, for convenience */
-static const GLuint undef = REG(REG_TYPE_TEMP,
-				0,
-				SWIZZLE_XYZ,
-				SWIZZLE_W,
-				GL_FALSE,
-				GL_FALSE,
-				GL_FALSE);
-
-/* constant one source */
-static const GLuint pfs_one = REG(REG_TYPE_CONST,
-				  0,
-				  SWIZZLE_111,
-				  SWIZZLE_ONE,
-				  GL_FALSE,
-				  GL_TRUE,
-				  GL_TRUE);
-
-/* constant half source */
-static const GLuint pfs_half = REG(REG_TYPE_CONST,
-				   0,
-				   SWIZZLE_HHH,
-				   SWIZZLE_HALF,
-				   GL_FALSE,
-				   GL_TRUE,
-				   GL_TRUE);
-
-/* constant zero source */
-static const GLuint pfs_zero = REG(REG_TYPE_CONST,
-				   0,
-				   SWIZZLE_000,
-				   SWIZZLE_ZERO,
-				   GL_FALSE,
-				   GL_TRUE,
-				   GL_TRUE);
+#include "radeon_program_alu.h"
 
-/*
- * Common functions prototypes
- */
-static void dump_program(struct r300_fragment_program *fp);
-static void emit_arith(struct r300_fragment_program *fp, int op,
-		       GLuint dest, int mask,
-		       GLuint src0, GLuint src1, GLuint src2, int flags);
 
-/**
- * Get an R300 temporary that can be written to in the given slot.
- */
-static int get_hw_temp(struct r300_fragment_program *fp, int slot)
+static void reset_srcreg(struct prog_src_register* reg)
 {
-	COMPILE_STATE;
-	int r;
-
-	for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
-		if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot)
-			break;
-	}
-
-	if (r >= PFS_NUM_TEMP_REGS) {
-		ERROR("Out of hardware temps\n");
-		return 0;
-	}
-	// Reserved is used to avoid the following scenario:
-	//  R300 temporary X is first assigned to Mesa temporary Y during vector ops
-	//  R300 temporary X is then assigned to Mesa temporary Z for further vector ops
-	//  Then scalar ops on Mesa temporary Z are emitted and move back in time
-	//  to overwrite the value of temporary Y.
-	// End scenario.
-	cs->hwtemps[r].reserved = cs->hwtemps[r].free;
-	cs->hwtemps[r].free = -1;
-
-	// Reset to some value that won't mess things up when the user
-	// tries to read from a temporary that hasn't been assigned a value yet.
-	// In the normal case, vector_valid and scalar_valid should be set to
-	// a sane value by the first emit that writes to this temporary.
-	cs->hwtemps[r].vector_valid = 0;
-	cs->hwtemps[r].scalar_valid = 0;
-
-	if (r > fp->max_temp_idx)
-		fp->max_temp_idx = r;
-
-	return r;
+	_mesa_bzero(reg, sizeof(*reg));
+	reg->Swizzle = SWIZZLE_NOOP;
 }
 
 /**
- * Get an R300 temporary that will act as a TEX destination register.
- */
-static int get_hw_temp_tex(struct r300_fragment_program *fp)
-{
-	COMPILE_STATE;
-	int r;
-
-	for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
-		if (cs->used_in_node & (1 << r))
-			continue;
-
-		// Note: Be very careful here
-		if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0)
-			break;
-	}
-
-	if (r >= PFS_NUM_TEMP_REGS)
-		return get_hw_temp(fp, 0);	/* Will cause an indirection */
-
-	cs->hwtemps[r].reserved = cs->hwtemps[r].free;
-	cs->hwtemps[r].free = -1;
-
-	// Reset to some value that won't mess things up when the user
-	// tries to read from a temporary that hasn't been assigned a value yet.
-	// In the normal case, vector_valid and scalar_valid should be set to
-	// a sane value by the first emit that writes to this temporary.
-	cs->hwtemps[r].vector_valid = cs->nrslots;
-	cs->hwtemps[r].scalar_valid = cs->nrslots;
-
-	if (r > fp->max_temp_idx)
-		fp->max_temp_idx = r;
-
-	return r;
-}
-
-/**
- * Mark the given hardware register as free.
- */
-static void free_hw_temp(struct r300_fragment_program *fp, int idx)
-{
-	COMPILE_STATE;
-
-	// Be very careful here. Consider sequences like
-	//  MAD r0, r1,r2,r3
-	//  TEX r4, ...
-	// The TEX instruction may be moved in front of the MAD instruction
-	// due to the way nodes work. We don't want to alias r1 and r4 in
-	// this case.
-	// I'm certain the register allocation could be further sanitized,
-	// but it's tricky because of stuff that can happen inside emit_tex
-	// and emit_arith.
-	cs->hwtemps[idx].free = cs->nrslots + 1;
-}
-
-/**
- * Create a new Mesa temporary register.
- */
-static GLuint get_temp_reg(struct r300_fragment_program *fp)
-{
-	COMPILE_STATE;
-	GLuint r = undef;
-	GLuint index;
-
-	index = ffs(~cs->temp_in_use);
-	if (!index) {
-		ERROR("Out of program temps\n");
-		return r;
-	}
-
-	cs->temp_in_use |= (1 << --index);
-	cs->temps[index].refcount = 0xFFFFFFFF;
-	cs->temps[index].reg = -1;
-
-	REG_SET_TYPE(r, REG_TYPE_TEMP);
-	REG_SET_INDEX(r, index);
-	REG_SET_VALID(r, GL_TRUE);
-	return r;
-}
-
-/**
- * Create a new Mesa temporary register that will act as the destination
- * register for a texture read.
- */
-static GLuint get_temp_reg_tex(struct r300_fragment_program *fp)
-{
-	COMPILE_STATE;
-	GLuint r = undef;
-	GLuint index;
-
-	index = ffs(~cs->temp_in_use);
-	if (!index) {
-		ERROR("Out of program temps\n");
-		return r;
-	}
-
-	cs->temp_in_use |= (1 << --index);
-	cs->temps[index].refcount = 0xFFFFFFFF;
-	cs->temps[index].reg = get_hw_temp_tex(fp);
-
-	REG_SET_TYPE(r, REG_TYPE_TEMP);
-	REG_SET_INDEX(r, index);
-	REG_SET_VALID(r, GL_TRUE);
-	return r;
-}
-
-/**
- * Free a Mesa temporary and the associated R300 temporary.
- */
-static void free_temp(struct r300_fragment_program *fp, GLuint r)
-{
-	COMPILE_STATE;
-	GLuint index = REG_GET_INDEX(r);
-
-	if (!(cs->temp_in_use & (1 << index)))
-		return;
-
-	if (REG_GET_TYPE(r) == REG_TYPE_TEMP) {
-		free_hw_temp(fp, cs->temps[index].reg);
-		cs->temps[index].reg = -1;
-		cs->temp_in_use &= ~(1 << index);
-	} else if (REG_GET_TYPE(r) == REG_TYPE_INPUT) {
-		free_hw_temp(fp, cs->inputs[index].reg);
-		cs->inputs[index].reg = -1;
-	}
-}
-
-/**
- * Emit a hardware constant/parameter.
+ * Transform TEX, TXP, TXB, and KIL instructions in the following way:
+ *  - premultiply texture coordinates for RECT
+ *  - extract operand swizzles
+ *  - introduce a temporary register when write masks are needed
  *
- * \p cp Stable pointer to an array of 4 floats.
- *  The pointer must be stable in the sense that it remains to be valid
- *  and hold the contents of the constant/parameter throughout the lifetime
- *  of the fragment program (actually, up until the next time the fragment
- *  program is translated).
+ * \todo If/when r5xx uses the radeon_program architecture, this can probably
+ * be reused.
  */
-static GLuint emit_const4fv(struct r300_fragment_program *fp,
-			    const GLfloat * cp)
-{
-	GLuint reg = undef;
-	int index;
-
-	for (index = 0; index < fp->const_nr; ++index) {
-		if (fp->constant[index] == cp)
-			break;
-	}
-
-	if (index >= fp->const_nr) {
-		if (index >= PFS_NUM_CONST_REGS) {
-			ERROR("Out of hw constants!\n");
-			return reg;
-		}
-
-		fp->const_nr++;
-		fp->constant[index] = cp;
-	}
-
-	REG_SET_TYPE(reg, REG_TYPE_CONST);
-	REG_SET_INDEX(reg, index);
-	REG_SET_VALID(reg, GL_TRUE);
-	return reg;
-}
-
-static inline GLuint negate(GLuint r)
-{
-	REG_NEGS(r);
-	REG_NEGV(r);
-	return r;
-}
-
-/* Hack, to prevent clobbering sources used multiple times when
- * emulating non-native instructions
- */
-static inline GLuint keep(GLuint r)
-{
-	REG_SET_NO_USE(r, GL_TRUE);
-	return r;
-}
-
-static inline GLuint absolute(GLuint r)
-{
-	REG_ABS(r);
-	return r;
-}
-
-static int swz_native(struct r300_fragment_program *fp,
-		      GLuint src, GLuint * r, GLuint arbneg)
+static GLboolean transform_TEX(
+	struct radeon_program_transform_context* context,
+	struct prog_instruction* orig_inst, void* data)
 {
-	/* Native swizzle, handle negation */
-	src = (src & ~REG_NEGS_MASK) | (((arbneg >> 3) & 1) << REG_NEGS_SHIFT);
-
-	if ((arbneg & 0x7) == 0x0) {
-		src = src & ~REG_NEGV_MASK;
-		*r = src;
-	} else if ((arbneg & 0x7) == 0x7) {
-		src |= REG_NEGV_MASK;
-		*r = src;
-	} else {
-		if (!REG_GET_VALID(*r))
-			*r = get_temp_reg(fp);
-		src |= REG_NEGV_MASK;
-		emit_arith(fp,
-			   PFS_OP_MAD,
-			   *r, arbneg & 0x7, keep(src), pfs_one, pfs_zero, 0);
-		src = src & ~REG_NEGV_MASK;
-		emit_arith(fp,
-			   PFS_OP_MAD,
-			   *r,
-			   (arbneg ^ 0x7) | WRITEMASK_W,
-			   src, pfs_one, pfs_zero, 0);
-	}
-
-	return 3;
-}
-
-static int swz_emit_partial(struct r300_fragment_program *fp,
-			    GLuint src,
-			    GLuint * r, int mask, int mc, GLuint arbneg)
-{
-	GLuint tmp;
-	GLuint wmask = 0;
-
-	if (!REG_GET_VALID(*r))
-		*r = get_temp_reg(fp);
-
-	/* A partial match, VSWZ/mask define what parts of the
-	 * desired swizzle we match
-	 */
-	if (mc + s_mask[mask].count == 3) {
-		wmask = WRITEMASK_W;
-		src |= ((arbneg >> 3) & 1) << REG_NEGS_SHIFT;
-	}
-
-	tmp = arbneg & s_mask[mask].mask;
-	if (tmp) {
-		tmp = tmp ^ s_mask[mask].mask;
-		if (tmp) {
-			emit_arith(fp,
-				   PFS_OP_MAD,
-				   *r,
-				   arbneg & s_mask[mask].mask,
-				   keep(src) | REG_NEGV_MASK,
-				   pfs_one, pfs_zero, 0);
-			if (!wmask) {
-				REG_SET_NO_USE(src, GL_TRUE);
-			} else {
-				REG_SET_NO_USE(src, GL_FALSE);
-			}
-			emit_arith(fp,
-				   PFS_OP_MAD,
-				   *r, tmp | wmask, src, pfs_one, pfs_zero, 0);
-		} else {
-			if (!wmask) {
-				REG_SET_NO_USE(src, GL_TRUE);
-			} else {
-				REG_SET_NO_USE(src, GL_FALSE);
-			}
-			emit_arith(fp,
-				   PFS_OP_MAD,
-				   *r,
-				   (arbneg & s_mask[mask].mask) | wmask,
-				   src | REG_NEGV_MASK, pfs_one, pfs_zero, 0);
-		}
-	} else {
-		if (!wmask) {
-			REG_SET_NO_USE(src, GL_TRUE);
-		} else {
-			REG_SET_NO_USE(src, GL_FALSE);
-		}
-		emit_arith(fp, PFS_OP_MAD,
-			   *r,
-			   s_mask[mask].mask | wmask,
-			   src, pfs_one, pfs_zero, 0);
-	}
-
-	return s_mask[mask].count;
-}
-
-static GLuint do_swizzle(struct r300_fragment_program *fp,
-			 GLuint src, GLuint arbswz, GLuint arbneg)
-{
-	GLuint r = undef;
-	GLuint vswz;
-	int c_mask = 0;
-	int v_match = 0;
-
-	/* If swizzling from something without an XYZW native swizzle,
-	 * emit result to a temp, and do new swizzle from the temp.
-	 */
-#if 0
-	if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
-		GLuint temp = get_temp_reg(fp);
-		emit_arith(fp,
-			   PFS_OP_MAD,
-			   temp, WRITEMASK_XYZW, src, pfs_one, pfs_zero, 0);
-		src = temp;
-	}
-#endif
-
-	if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
-		GLuint vsrcswz =
-		    (v_swiz[REG_GET_VSWZ(src)].
-		     hash & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK)) |
-		    REG_GET_SSWZ(src) << 9;
-		GLint i;
-
-		GLuint newswz = 0;
-		GLuint offset;
-		for (i = 0; i < 4; ++i) {
-			offset = GET_SWZ(arbswz, i);
-
-			newswz |=
-			    (offset <= 3) ? GET_SWZ(vsrcswz,
-						    offset) << i *
-			    3 : offset << i * 3;
-		}
-
-		arbswz = newswz & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK);
-		REG_SET_SSWZ(src, GET_SWZ(newswz, 3));
-	} else {
-		/* set scalar swizzling */
-		REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
-
-	}
-	do {
-		vswz = REG_GET_VSWZ(src);
-		do {
-			int chash;
-
-			REG_SET_VSWZ(src, vswz);
-			chash = v_swiz[REG_GET_VSWZ(src)].hash &
-			    s_mask[c_mask].hash;
-
-			if (chash == (arbswz & s_mask[c_mask].hash)) {
-				if (s_mask[c_mask].count == 3) {
-					v_match += swz_native(fp,
-							      src, &r, arbneg);
-				} else {
-					v_match += swz_emit_partial(fp,
-								    src,
-								    &r,
-								    c_mask,
-								    v_match,
-								    arbneg);
-				}
-
-				if (v_match == 3)
-					return r;
-
-				/* Fill with something invalid.. all 0's was
-				 * wrong before, matched SWIZZLE_X.  So all
-				 * 1's will be okay for now
-				 */
-				arbswz |= (PFS_INVAL & s_mask[c_mask].hash);
-			}
-		} while (v_swiz[++vswz].hash != PFS_INVAL);
-		REG_SET_VSWZ(src, SWIZZLE_XYZ);
-	} while (s_mask[++c_mask].hash != PFS_INVAL);
-
-	ERROR("should NEVER get here\n");
-	return r;
-}
-
-static GLuint t_src(struct r300_fragment_program *fp,
-		    struct prog_src_register fpsrc)
-{
-	GLuint r = undef;
-
-	switch (fpsrc.File) {
-	case PROGRAM_TEMPORARY:
-		REG_SET_INDEX(r, fpsrc.Index);
-		REG_SET_VALID(r, GL_TRUE);
-		REG_SET_TYPE(r, REG_TYPE_TEMP);
-		break;
-	case PROGRAM_INPUT:
-		REG_SET_INDEX(r, fpsrc.Index);
-		REG_SET_VALID(r, GL_TRUE);
-		REG_SET_TYPE(r, REG_TYPE_INPUT);
-		break;
-	case PROGRAM_LOCAL_PARAM:
-		r = emit_const4fv(fp,
-				  fp->mesa_program.Base.LocalParams[fpsrc.
-								    Index]);
-		break;
-	case PROGRAM_ENV_PARAM:
-		r = emit_const4fv(fp,
-				  fp->ctx->FragmentProgram.Parameters[fpsrc.
-								      Index]);
-		break;
-	case PROGRAM_STATE_VAR:
-	case PROGRAM_NAMED_PARAM:
-	case PROGRAM_CONSTANT:
-		r = emit_const4fv(fp,
-				  fp->mesa_program.Base.Parameters->
-				  ParameterValues[fpsrc.Index]);
-		break;
-	default:
-		ERROR("unknown SrcReg->File %x\n", fpsrc.File);
-		return r;
-	}
-
-	/* no point swizzling ONE/ZERO/HALF constants... */
-	if (REG_GET_VSWZ(r) < SWIZZLE_111 || REG_GET_SSWZ(r) < SWIZZLE_ZERO)
-		r = do_swizzle(fp, r, fpsrc.Swizzle, fpsrc.NegateBase);
-	return r;
-}
-
-static GLuint t_scalar_src(struct r300_fragment_program *fp,
-			   struct prog_src_register fpsrc)
-{
-	struct prog_src_register src = fpsrc;
-	int sc = GET_SWZ(fpsrc.Swizzle, 0);	/* X */
-
-	src.Swizzle = ((sc << 0) | (sc << 3) | (sc << 6) | (sc << 9));
-
-	return t_src(fp, src);
-}
-
-static GLuint t_dst(struct r300_fragment_program *fp,
-		    struct prog_dst_register dest)
-{
-	GLuint r = undef;
-
-	switch (dest.File) {
-	case PROGRAM_TEMPORARY:
-		REG_SET_INDEX(r, dest.Index);
-		REG_SET_VALID(r, GL_TRUE);
-		REG_SET_TYPE(r, REG_TYPE_TEMP);
-		return r;
-	case PROGRAM_OUTPUT:
-		REG_SET_TYPE(r, REG_TYPE_OUTPUT);
-		switch (dest.Index) {
-		case FRAG_RESULT_COLR:
-		case FRAG_RESULT_DEPR:
-			REG_SET_INDEX(r, dest.Index);
-			REG_SET_VALID(r, GL_TRUE);
-			return r;
-		default:
-			ERROR("Bad DstReg->Index 0x%x\n", dest.Index);
-			return r;
-		}
-	default:
-		ERROR("Bad DstReg->File 0x%x\n", dest.File);
-		return r;
-	}
-}
-
-static int t_hw_src(struct r300_fragment_program *fp, GLuint src, GLboolean tex)
-{
-	COMPILE_STATE;
-	int idx;
-	int index = REG_GET_INDEX(src);
-
-	switch (REG_GET_TYPE(src)) {
-	case REG_TYPE_TEMP:
-		/* NOTE: if reg==-1 here, a source is being read that
-		 *       hasn't been written to. Undefined results.
-		 */
-		if (cs->temps[index].reg == -1)
-			cs->temps[index].reg = get_hw_temp(fp, cs->nrslots);
-
-		idx = cs->temps[index].reg;
-
-		if (!REG_GET_NO_USE(src) && (--cs->temps[index].refcount == 0))
-			free_temp(fp, src);
-		break;
-	case REG_TYPE_INPUT:
-		idx = cs->inputs[index].reg;
-
-		if (!REG_GET_NO_USE(src) && (--cs->inputs[index].refcount == 0))
-			free_hw_temp(fp, cs->inputs[index].reg);
-		break;
-	case REG_TYPE_CONST:
-		return (index | SRC_CONST);
-	default:
-		ERROR("Invalid type for source reg\n");
-		return (0 | SRC_CONST);
-	}
+	struct r300_fragment_program_compiler *compiler =
+		(struct r300_fragment_program_compiler*)data;
+	struct prog_instruction inst = *orig_inst;
+	struct prog_instruction* tgt;
+	GLboolean destredirect = GL_FALSE;
+
+	if (inst.Opcode != OPCODE_TEX &&
+	    inst.Opcode != OPCODE_TXB &&
+	    inst.Opcode != OPCODE_TXP &&
+	    inst.Opcode != OPCODE_KIL)
+		return GL_FALSE;
 
-	if (!tex)
-		cs->used_in_node |= (1 << idx);
+	if (inst.Opcode != OPCODE_KIL &&
+	    compiler->fp->mesa_program.Base.ShadowSamplers & (1 << inst.TexSrcUnit)) {
+		GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func;
 
-	return idx;
-}
+		if (comparefunc == GL_NEVER || comparefunc == GL_ALWAYS) {
+			tgt = radeonClauseInsertInstructions(context->compiler, context->dest,
+				context->dest->NumInstructions, 1);
 
-static int t_hw_dst(struct r300_fragment_program *fp,
-		    GLuint dest, GLboolean tex, int slot)
-{
-	COMPILE_STATE;
-	int idx;
-	GLuint index = REG_GET_INDEX(dest);
-	assert(REG_GET_VALID(dest));
-
-	switch (REG_GET_TYPE(dest)) {
-	case REG_TYPE_TEMP:
-		if (cs->temps[REG_GET_INDEX(dest)].reg == -1) {
-			if (!tex) {
-				cs->temps[index].reg = get_hw_temp(fp, slot);
-			} else {
-				cs->temps[index].reg = get_hw_temp_tex(fp);
-			}
-		}
-		idx = cs->temps[index].reg;
-
-		if (!REG_GET_NO_USE(dest) && (--cs->temps[index].refcount == 0))
-			free_temp(fp, dest);
-
-		cs->dest_in_node |= (1 << idx);
-		cs->used_in_node |= (1 << idx);
-		break;
-	case REG_TYPE_OUTPUT:
-		switch (index) {
-		case FRAG_RESULT_COLR:
-			fp->node[fp->cur_node].flags |=
-			    R300_RGBA_OUT;
-			break;
-		case FRAG_RESULT_DEPR:
-			fp->WritesDepth = GL_TRUE;
-			fp->node[fp->cur_node].flags |=
-			    R300_W_OUT;
-			break;
+			tgt->Opcode = OPCODE_MOV;
+			tgt->DstReg = inst.DstReg;
+			tgt->SrcReg[0].File = PROGRAM_BUILTIN;
+			tgt->SrcReg[0].Swizzle = comparefunc == GL_ALWAYS ? SWIZZLE_1111 : SWIZZLE_0000;
+			return GL_TRUE;
 		}
-		return index;
-		break;
-	default:
-		ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
-		return 0;
-	}
-
-	return idx;
-}
-
-static void emit_nop(struct r300_fragment_program *fp)
-{
-	COMPILE_STATE;
 
-	if (cs->nrslots >= PFS_MAX_ALU_INST) {
-		ERROR("Out of ALU instruction slots\n");
-		return;
+		inst.DstReg.File = PROGRAM_TEMPORARY;
+		inst.DstReg.Index = radeonCompilerAllocateTemporary(context->compiler);
+		inst.DstReg.WriteMask = WRITEMASK_XYZW;
 	}
 
-	fp->alu.inst[cs->nrslots].inst0 = NOP_INST0;
-	fp->alu.inst[cs->nrslots].inst1 = NOP_INST1;
-	fp->alu.inst[cs->nrslots].inst2 = NOP_INST2;
-	fp->alu.inst[cs->nrslots].inst3 = NOP_INST3;
-	cs->nrslots++;
-}
 
-static void emit_tex(struct r300_fragment_program *fp,
-		     struct prog_instruction *fpi, int opcode)
-{
-	COMPILE_STATE;
-	GLuint coord = t_src(fp, fpi->SrcReg[0]);
-	GLuint dest = undef, rdest = undef;
-	GLuint din, uin;
-	int unit = fpi->TexSrcUnit;
-	int hwsrc, hwdest;
-	GLuint tempreg = 0;
-
-	/**
-	 * Hardware uses [0..1]x[0..1] range for rectangle textures
+	/* Hardware uses [0..1]x[0..1] range for rectangle textures
 	 * instead of [0..Width]x[0..Height].
 	 * Add a scaling instruction.
-	 *
-	 * \todo Refactor this once we have proper rewriting/optimization
-	 * support for programs.
 	 */
-	if (opcode != R300_TEX_OP_KIL && fpi->TexSrcTarget == TEXTURE_RECT_INDEX) {
+	if (inst.Opcode != OPCODE_KIL && inst.TexSrcTarget == TEXTURE_RECT_INDEX) {
 		gl_state_index tokens[STATE_LENGTH] = {
 			STATE_INTERNAL, STATE_R300_TEXRECT_FACTOR, 0, 0,
 			0
 		};
+
+		int tempreg = radeonCompilerAllocateTemporary(context->compiler);
 		int factor_index;
-		GLuint factorreg;
 
-		tokens[2] = unit;
+		tokens[2] = inst.TexSrcUnit;
 		factor_index =
-			_mesa_add_state_reference(fp->mesa_program.Base.
-						Parameters, tokens);
-		factorreg =
-			emit_const4fv(fp,
-				fp->mesa_program.Base.Parameters->
-				ParameterValues[factor_index]);
-		tempreg = keep(get_temp_reg(fp));
-
-		emit_arith(fp, PFS_OP_MAD, tempreg, WRITEMASK_XYZW,
-			coord, factorreg, pfs_zero, 0);
-
-		coord = tempreg;
-	}
+			_mesa_add_state_reference(
+				compiler->fp->mesa_program.Base.Parameters, tokens);
 
-	/* Texture operations do not support swizzles etc. in hardware,
-	 * so emit an additional arithmetic operation if necessary.
-	 */
-	if (REG_GET_VSWZ(coord) != SWIZZLE_XYZ ||
-	    REG_GET_SSWZ(coord) != SWIZZLE_W ||
-	    coord & (REG_NEGV_MASK | REG_NEGS_MASK | REG_ABS_MASK)) {
-		assert(tempreg == 0);
-		tempreg = keep(get_temp_reg(fp));
-		emit_arith(fp, PFS_OP_MAD, tempreg, WRITEMASK_XYZW,
-			coord, pfs_one, pfs_zero, 0);
-		coord = tempreg;
-	}
-
-	/* Ensure correct node indirection */
-	uin = cs->used_in_node;
-	din = cs->dest_in_node;
-
-	/* Resolve source/dest to hardware registers */
-	hwsrc = t_hw_src(fp, coord, GL_TRUE);
+		tgt = radeonClauseInsertInstructions(context->compiler, context->dest,
+			context->dest->NumInstructions, 1);
 
-	if (opcode != R300_TEX_OP_KIL) {
-		dest = t_dst(fp, fpi->DstReg);
+		tgt->Opcode = OPCODE_MUL;
+		tgt->DstReg.File = PROGRAM_TEMPORARY;
+		tgt->DstReg.Index = tempreg;
+		tgt->SrcReg[0] = inst.SrcReg[0];
+		tgt->SrcReg[1].File = PROGRAM_STATE_VAR;
+		tgt->SrcReg[1].Index = factor_index;
 
-		/* r300 doesn't seem to be able to do TEX->output reg */
-		if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
-			rdest = dest;
-			dest = get_temp_reg_tex(fp);
-		} else if (fpi->DstReg.WriteMask != WRITEMASK_XYZW) {
-			/* in case write mask isn't XYZW */
-			rdest = dest;
-			dest = get_temp_reg_tex(fp);
-		}
-		hwdest =
-		    t_hw_dst(fp, dest, GL_TRUE,
-			     fp->node[fp->cur_node].alu_offset);
-
-		/* Use a temp that hasn't been used in this node, rather
-		 * than causing an indirection
-		 */
-		if (uin & (1 << hwdest)) {
-			free_hw_temp(fp, hwdest);
-			hwdest = get_hw_temp_tex(fp);
-			cs->temps[REG_GET_INDEX(dest)].reg = hwdest;
-		}
-	} else {
-		hwdest = 0;
-		unit = 0;
+		reset_srcreg(&inst.SrcReg[0]);
+		inst.SrcReg[0].File = PROGRAM_TEMPORARY;
+		inst.SrcReg[0].Index = tempreg;
 	}
 
-	/* Indirection if source has been written in this node, or if the
-	 * dest has been read/written in this node
+	/* Texture operations do not support swizzles etc. in hardware,
+	 * so emit an additional arithmetic operation if necessary.
 	 */
-	if ((REG_GET_TYPE(coord) != REG_TYPE_CONST &&
-	     (din & (1 << hwsrc))) || (uin & (1 << hwdest))) {
-
-		/* Finish off current node */
-		if (fp->node[fp->cur_node].alu_offset == cs->nrslots)
-			emit_nop(fp);
-
-		fp->node[fp->cur_node].alu_end =
-		    cs->nrslots - fp->node[fp->cur_node].alu_offset - 1;
-		assert(fp->node[fp->cur_node].alu_end >= 0);
-
-		if (++fp->cur_node >= PFS_MAX_TEX_INDIRECT) {
-			ERROR("too many levels of texture indirection\n");
-			return;
-		}
-
-		/* Start new node */
-		fp->node[fp->cur_node].tex_offset = fp->tex.length;
-		fp->node[fp->cur_node].alu_offset = cs->nrslots;
-		fp->node[fp->cur_node].tex_end = -1;
-		fp->node[fp->cur_node].alu_end = -1;
-		fp->node[fp->cur_node].flags = 0;
-		cs->used_in_node = 0;
-		cs->dest_in_node = 0;
-	}
-
-	if (fp->cur_node == 0)
-		fp->first_node_has_tex = 1;
-
-	fp->tex.inst[fp->tex.length++] = 0 | (hwsrc << R300_SRC_ADDR_SHIFT)
-	    | (hwdest << R300_DST_ADDR_SHIFT)
-	    | (unit << R300_TEX_ID_SHIFT)
-	    | (opcode << R300_TEX_INST_SHIFT);
-
-	cs->dest_in_node |= (1 << hwdest);
-	if (REG_GET_TYPE(coord) != REG_TYPE_CONST)
-		cs->used_in_node |= (1 << hwsrc);
+	if (inst.SrcReg[0].Swizzle != SWIZZLE_NOOP ||
+	    inst.SrcReg[0].Abs || inst.SrcReg[0].NegateBase || inst.SrcReg[0].NegateAbs) {
+		int tempreg = radeonCompilerAllocateTemporary(context->compiler);
 
-	fp->node[fp->cur_node].tex_end++;
+		tgt = radeonClauseInsertInstructions(context->compiler, context->dest,
+			context->dest->NumInstructions, 1);
 
-	/* Copy from temp to output if needed */
-	if (REG_GET_VALID(rdest)) {
-		emit_arith(fp, PFS_OP_MAD, rdest, fpi->DstReg.WriteMask, dest,
-			   pfs_one, pfs_zero, 0);
-		free_temp(fp, dest);
-	}
-
-	/* Free temp register */
-	if (tempreg != 0)
-		free_temp(fp, tempreg);
-}
-
-/**
- * Returns the first slot where we could possibly allow writing to dest,
- * according to register allocation.
- */
-static int get_earliest_allowed_write(struct r300_fragment_program *fp,
-				      GLuint dest, int mask)
-{
-	COMPILE_STATE;
-	int idx;
-	int pos;
-	GLuint index = REG_GET_INDEX(dest);
-	assert(REG_GET_VALID(dest));
-
-	switch (REG_GET_TYPE(dest)) {
-	case REG_TYPE_TEMP:
-		if (cs->temps[index].reg == -1)
-			return 0;
-
-		idx = cs->temps[index].reg;
-		break;
-	case REG_TYPE_OUTPUT:
-		return 0;
-	default:
-		ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
-		return 0;
-	}
+		tgt->Opcode = OPCODE_MOV;
+		tgt->DstReg.File = PROGRAM_TEMPORARY;
+		tgt->DstReg.Index = tempreg;
+		tgt->SrcReg[0] = inst.SrcReg[0];
 
-	pos = cs->hwtemps[idx].reserved;
-	if (mask & WRITEMASK_XYZ) {
-		if (pos < cs->hwtemps[idx].vector_lastread)
-			pos = cs->hwtemps[idx].vector_lastread;
-	}
-	if (mask & WRITEMASK_W) {
-		if (pos < cs->hwtemps[idx].scalar_lastread)
-			pos = cs->hwtemps[idx].scalar_lastread;
+		reset_srcreg(&inst.SrcReg[0]);
+		inst.SrcReg[0].File = PROGRAM_TEMPORARY;
+		inst.SrcReg[0].Index = tempreg;
 	}
 
-	return pos;
-}
+	if (inst.Opcode != OPCODE_KIL) {
+		if (inst.DstReg.File != PROGRAM_TEMPORARY ||
+		    inst.DstReg.WriteMask != WRITEMASK_XYZW) {
+			int tempreg = radeonCompilerAllocateTemporary(context->compiler);
 
-/**
- * Allocates a slot for an ALU instruction that can consist of
- * a vertex part or a scalar part or both.
- *
- * Sources from src (src[0] to src[argc-1]) are added to the slot in the
- * appropriate position (vector and/or scalar), and their positions are
- * recorded in the srcpos array.
- *
- * This function emits instruction code for the source fetch and the
- * argument selection. It does not emit instruction code for the
- * opcode or the destination selection.
- *
- * @return the index of the slot
- */
-static int find_and_prepare_slot(struct r300_fragment_program *fp,
-				 GLboolean emit_vop,
-				 GLboolean emit_sop,
-				 int argc, GLuint * src, GLuint dest, int mask)
-{
-	COMPILE_STATE;
-	int hwsrc[3];
-	int srcpos[3];
-	unsigned int used;
-	int tempused;
-	int tempvsrc[3];
-	int tempssrc[3];
-	int pos;
-	int regnr;
-	int i, j;
-
-	// Determine instruction slots, whether sources are required on
-	// vector or scalar side, and the smallest slot number where
-	// all source registers are available
-	used = 0;
-	if (emit_vop)
-		used |= SLOT_OP_VECTOR;
-	if (emit_sop)
-		used |= SLOT_OP_SCALAR;
-
-	pos = get_earliest_allowed_write(fp, dest, mask);
-
-	if (fp->node[fp->cur_node].alu_offset > pos)
-		pos = fp->node[fp->cur_node].alu_offset;
-	for (i = 0; i < argc; ++i) {
-		if (!REG_GET_BUILTIN(src[i])) {
-			if (emit_vop)
-				used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i;
-			if (emit_sop)
-				used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
-		}
-
-		hwsrc[i] = t_hw_src(fp, src[i], GL_FALSE);	/* Note: sideeffects wrt refcounting! */
-		regnr = hwsrc[i] & 31;
-
-		if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
-			if (used & (SLOT_SRC_VECTOR << i)) {
-				if (cs->hwtemps[regnr].vector_valid > pos)
-					pos = cs->hwtemps[regnr].vector_valid;
-			}
-			if (used & (SLOT_SRC_SCALAR << i)) {
-				if (cs->hwtemps[regnr].scalar_valid > pos)
-					pos = cs->hwtemps[regnr].scalar_valid;
-			}
+			inst.DstReg.File = PROGRAM_TEMPORARY;
+			inst.DstReg.Index = tempreg;
+			inst.DstReg.WriteMask = WRITEMASK_XYZW;
+			destredirect = GL_TRUE;
 		}
 	}
 
-	// Find a slot that fits
-	for (;; ++pos) {
-		if (cs->slot[pos].used & used & SLOT_OP_BOTH)
-			continue;
-
-		if (pos >= cs->nrslots) {
-			if (cs->nrslots >= PFS_MAX_ALU_INST) {
-				ERROR("Out of ALU instruction slots\n");
-				return -1;
-			}
-
-			fp->alu.inst[pos].inst0 = NOP_INST0;
-			fp->alu.inst[pos].inst1 = NOP_INST1;
-			fp->alu.inst[pos].inst2 = NOP_INST2;
-			fp->alu.inst[pos].inst3 = NOP_INST3;
-
-			cs->nrslots++;
-		}
-		// Note: When we need both parts (vector and scalar) of a source,
-		// we always try to put them into the same position. This makes the
-		// code easier to read, and it is optimal (i.e. one doesn't gain
-		// anything by splitting the parts).
-		// It also avoids headaches with swizzles that access both parts (i.e WXY)
-		tempused = cs->slot[pos].used;
-		for (i = 0; i < 3; ++i) {
-			tempvsrc[i] = cs->slot[pos].vsrc[i];
-			tempssrc[i] = cs->slot[pos].ssrc[i];
-		}
-
-		for (i = 0; i < argc; ++i) {
-			int flags = (used >> i) & SLOT_SRC_BOTH;
-
-			if (!flags) {
-				srcpos[i] = 0;
-				continue;
-			}
-
-			for (j = 0; j < 3; ++j) {
-				if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
-					if (tempvsrc[j] != hwsrc[i])
-						continue;
-				}
-
-				if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
-					if (tempssrc[j] != hwsrc[i])
-						continue;
-				}
-
-				break;
-			}
-
-			if (j == 3)
-				break;
-
-			srcpos[i] = j;
-			tempused |= flags << j;
-			if (flags & SLOT_SRC_VECTOR)
-				tempvsrc[j] = hwsrc[i];
-			if (flags & SLOT_SRC_SCALAR)
-				tempssrc[j] = hwsrc[i];
-		}
-
-		if (i == argc)
-			break;
-	}
-
-	// Found a slot, reserve it
-	cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
-	for (i = 0; i < 3; ++i) {
-		cs->slot[pos].vsrc[i] = tempvsrc[i];
-		cs->slot[pos].ssrc[i] = tempssrc[i];
-	}
-
-	for (i = 0; i < argc; ++i) {
-		if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
-			int regnr = hwsrc[i] & 31;
-
-			if (used & (SLOT_SRC_VECTOR << i)) {
-				if (cs->hwtemps[regnr].vector_lastread < pos)
-					cs->hwtemps[regnr].vector_lastread =
-					    pos;
-			}
-			if (used & (SLOT_SRC_SCALAR << i)) {
-				if (cs->hwtemps[regnr].scalar_lastread < pos)
-					cs->hwtemps[regnr].scalar_lastread =
-					    pos;
-			}
-		}
-	}
-
-	// Emit the source fetch code
-	fp->alu.inst[pos].inst1 &= ~R300_ALU_SRC_MASK;
-	fp->alu.inst[pos].inst1 |=
-	    ((cs->slot[pos].vsrc[0] << R300_ALU_SRC0C_SHIFT) |
-	     (cs->slot[pos].vsrc[1] << R300_ALU_SRC1C_SHIFT) |
-	     (cs->slot[pos].vsrc[2] << R300_ALU_SRC2C_SHIFT));
-
-	fp->alu.inst[pos].inst3 &= ~R300_ALU_SRC_MASK;
-	fp->alu.inst[pos].inst3 |=
-	    ((cs->slot[pos].ssrc[0] << R300_ALU_SRC0A_SHIFT) |
-	     (cs->slot[pos].ssrc[1] << R300_ALU_SRC1A_SHIFT) |
-	     (cs->slot[pos].ssrc[2] << R300_ALU_SRC2A_SHIFT));
-
-	// Emit the argument selection code
-	if (emit_vop) {
-		int swz[3];
-
-		for (i = 0; i < 3; ++i) {
-			if (i < argc) {
-				swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
-					  (srcpos[i] *
-					   v_swiz[REG_GET_VSWZ(src[i])].
-					   stride)) | ((src[i] & REG_NEGV_MASK)
-						       ? ARG_NEG : 0) | ((src[i]
-									  &
-									  REG_ABS_MASK)
-									 ?
-									 ARG_ABS
-									 : 0);
-			} else {
-				swz[i] = R300_ALU_ARGC_ZERO;
-			}
-		}
-
-		fp->alu.inst[pos].inst0 &=
-		    ~(R300_ALU_ARG0C_MASK | R300_ALU_ARG1C_MASK |
-		      R300_ALU_ARG2C_MASK);
-		fp->alu.inst[pos].inst0 |=
-		    (swz[0] << R300_ALU_ARG0C_SHIFT) | (swz[1] <<
-							 R300_ALU_ARG1C_SHIFT)
-		    | (swz[2] << R300_ALU_ARG2C_SHIFT);
-	}
-
-	if (emit_sop) {
-		int swz[3];
-
-		for (i = 0; i < 3; ++i) {
-			if (i < argc) {
-				swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
-					  (srcpos[i] *
-					   s_swiz[REG_GET_SSWZ(src[i])].
-					   stride)) | ((src[i] & REG_NEGV_MASK)
-						       ? ARG_NEG : 0) | ((src[i]
-									  &
-									  REG_ABS_MASK)
-									 ?
-									 ARG_ABS
-									 : 0);
-			} else {
-				swz[i] = R300_ALU_ARGA_ZERO;
-			}
-		}
-
-		fp->alu.inst[pos].inst2 &=
-		    ~(R300_ALU_ARG0A_MASK | R300_ALU_ARG1A_MASK |
-		      R300_ALU_ARG2A_MASK);
-		fp->alu.inst[pos].inst2 |=
-		    (swz[0] << R300_ALU_ARG0A_SHIFT) | (swz[1] <<
-							 R300_ALU_ARG1A_SHIFT)
-		    | (swz[2] << R300_ALU_ARG2A_SHIFT);
-	}
-
-	return pos;
-}
-
-/**
- * Append an ALU instruction to the instruction list.
- */
-static void emit_arith(struct r300_fragment_program *fp,
-		       int op,
-		       GLuint dest,
-		       int mask,
-		       GLuint src0, GLuint src1, GLuint src2, int flags)
-{
-	COMPILE_STATE;
-	GLuint src[3] = { src0, src1, src2 };
-	int hwdest;
-	GLboolean emit_vop, emit_sop;
-	int vop, sop, argc;
-	int pos;
-
-	vop = r300_fpop[op].v_op;
-	sop = r300_fpop[op].s_op;
-	argc = r300_fpop[op].argc;
-
-	if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
-	    REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
-		if (mask & WRITEMASK_Z) {
-			mask = WRITEMASK_W;
-		} else {
-			return;
-		}
-	}
-
-	emit_vop = GL_FALSE;
-	emit_sop = GL_FALSE;
-	if ((mask & WRITEMASK_XYZ) || vop == R300_ALU_OUTC_DP3)
-		emit_vop = GL_TRUE;
-	if ((mask & WRITEMASK_W) || vop == R300_ALU_OUTC_REPL_ALPHA)
-		emit_sop = GL_TRUE;
-
-	pos =
-	    find_and_prepare_slot(fp, emit_vop, emit_sop, argc, src, dest,
-				  mask);
-	if (pos < 0)
-		return;
-
-	hwdest = t_hw_dst(fp, dest, GL_FALSE, pos);	/* Note: Side effects wrt register allocation */
-
-	if (flags & PFS_FLAG_SAT) {
-		vop |= R300_ALU_OUTC_CLAMP;
-		sop |= R300_ALU_OUTA_CLAMP;
-	}
-
-	/* Throw the pieces together and get ALU/1 */
-	if (emit_vop) {
-		fp->alu.inst[pos].inst0 |= vop;
-
-		fp->alu.inst[pos].inst1 |= hwdest << R300_ALU_DSTC_SHIFT;
-
-		if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
-			if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
-				fp->alu.inst[pos].inst1 |=
-				    (mask & WRITEMASK_XYZ) <<
-				    R300_ALU_DSTC_OUTPUT_MASK_SHIFT;
-			} else
-				assert(0);
+	tgt = radeonClauseInsertInstructions(context->compiler, context->dest,
+		context->dest->NumInstructions, 1);
+	_mesa_copy_instructions(tgt, &inst, 1);
+
+	if (inst.Opcode != OPCODE_KIL &&
+	    compiler->fp->mesa_program.Base.ShadowSamplers & (1 << inst.TexSrcUnit)) {
+		GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func;
+		GLuint depthmode = compiler->fp->state.unit[inst.TexSrcUnit].depth_texture_mode;
+
+		tgt = radeonClauseInsertInstructions(context->compiler, context->dest,
+			context->dest->NumInstructions, 2);
+
+		tgt[0].Opcode = OPCODE_ADD;
+		tgt[0].DstReg = inst.DstReg;
+		tgt[0].DstReg.WriteMask = orig_inst->DstReg.WriteMask;
+		tgt[0].SrcReg[0].File = PROGRAM_TEMPORARY;
+		tgt[0].SrcReg[0].Index = inst.DstReg.Index;
+		if (depthmode == 0) /* GL_LUMINANCE */
+			tgt[0].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z);
+		else if (depthmode == 2) /* GL_ALPHA */
+			tgt[0].SrcReg[0].Swizzle = SWIZZLE_WWWW;
+		tgt[0].SrcReg[1] = inst.SrcReg[0];
+		tgt[0].SrcReg[1].Swizzle = SWIZZLE_ZZZZ;
+
+		/* Recall that SrcReg[0] is tex, SrcReg[1] is r and:
+		 *   r  < tex  <=>      -tex+r < 0
+		 *   r >= tex  <=> not (-tex+r < 0) */
+		if (comparefunc == GL_LESS || comparefunc == GL_GEQUAL)
+			tgt[0].SrcReg[0].NegateBase = tgt[0].SrcReg[0].NegateBase ^ NEGATE_XYZW;
+		else
+			tgt[0].SrcReg[1].NegateBase = tgt[0].SrcReg[1].NegateBase ^ NEGATE_XYZW;
+
+		tgt[1].Opcode = OPCODE_CMP;
+		tgt[1].DstReg = orig_inst->DstReg;
+		tgt[1].SrcReg[0].File = PROGRAM_TEMPORARY;
+		tgt[1].SrcReg[0].Index = tgt[0].DstReg.Index;
+		tgt[1].SrcReg[1].File = PROGRAM_BUILTIN;
+		tgt[1].SrcReg[2].File = PROGRAM_BUILTIN;
+
+		if (comparefunc == GL_LESS || comparefunc == GL_GREATER) {
+			tgt[1].SrcReg[1].Swizzle = SWIZZLE_1111;
+			tgt[1].SrcReg[2].Swizzle = SWIZZLE_0000;
 		} else {
-			fp->alu.inst[pos].inst1 |=
-			    (mask & WRITEMASK_XYZ) <<
-			    R300_ALU_DSTC_REG_MASK_SHIFT;
-
-			cs->hwtemps[hwdest].vector_valid = pos + 1;
+			tgt[1].SrcReg[1].Swizzle = SWIZZLE_0000;
+			tgt[1].SrcReg[2].Swizzle = SWIZZLE_1111;
 		}
-	}
+	} else if (destredirect) {
+		tgt = radeonClauseInsertInstructions(context->compiler, context->dest,
+			context->dest->NumInstructions, 1);
 
-	/* And now ALU/3 */
-	if (emit_sop) {
-		fp->alu.inst[pos].inst2 |= sop;
-
-		if (mask & WRITEMASK_W) {
-			if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
-				if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
-					fp->alu.inst[pos].inst3 |=
-					    (hwdest << R300_ALU_DSTA_SHIFT) |
-					    R300_ALU_DSTA_OUTPUT;
-				} else if (REG_GET_INDEX(dest) ==
-					   FRAG_RESULT_DEPR) {
-					fp->alu.inst[pos].inst3 |=
-					    R300_ALU_DSTA_DEPTH;
-				} else
-					assert(0);
-			} else {
-				fp->alu.inst[pos].inst3 |=
-				    (hwdest << R300_ALU_DSTA_SHIFT) |
-				    R300_ALU_DSTA_REG;
-
-				cs->hwtemps[hwdest].scalar_valid = pos + 1;
-			}
-		}
+		tgt->Opcode = OPCODE_MOV;
+		tgt->DstReg = orig_inst->DstReg;
+		tgt->SrcReg[0].File = PROGRAM_TEMPORARY;
+		tgt->SrcReg[0].Index = inst.DstReg.Index;
 	}
 
-	return;
+	return GL_TRUE;
 }
 
-#if 0
-static GLuint get_attrib(struct r300_fragment_program *fp, GLuint attr)
+
+static void update_params(r300ContextPtr r300, struct r300_fragment_program *fp)
 {
 	struct gl_fragment_program *mp = &fp->mesa_program;
-	GLuint r = undef;
 
-	if (!(mp->Base.InputsRead & (1 << attr))) {
-		ERROR("Attribute %d was not provided!\n", attr);
-		return undef;
-	}
-
-	REG_SET_TYPE(r, REG_TYPE_INPUT);
-	REG_SET_INDEX(r, attr);
-	REG_SET_VALID(r, GL_TRUE);
-	return r;
+	/* Ask Mesa nicely to fill in ParameterValues for us */
+	if (mp->Base.Parameters)
+		_mesa_load_state_parameters(r300->radeon.glCtx, mp->Base.Parameters);
 }
-#endif
-
-static GLfloat SinCosConsts[2][4] = {
-	{
-	 1.273239545,		// 4/PI
-	 -0.405284735,		// -4/(PI*PI)
-	 3.141592654,		// PI
-	 0.2225			// weight
-	 },
-	{
-	 0.75,
-	 0.0,
-	 0.159154943,		// 1/(2*PI)
-	 6.283185307		// 2*PI
-	 }
-};
+
 
 /**
- * Emit a LIT instruction.
- * \p flags may be PFS_FLAG_SAT
+ * Transform the program to support fragment.position.
  *
- * Definition of LIT (from ARB_fragment_program):
- * tmp = VectorLoad(op0);
- * if (tmp.x < 0) tmp.x = 0;
- * if (tmp.y < 0) tmp.y = 0;
- * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
- * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
- * result.x = 1.0;
- * result.y = tmp.x;
- * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
- * result.w = 1.0;
+ * Introduce a small fragment at the start of the program that will be
+ * the only code that directly reads the FRAG_ATTRIB_WPOS input.
+ * All other code pieces that reference that input will be rewritten
+ * to read from a newly allocated temporary.
  *
- * The longest path of computation is the one leading to result.z,
- * consisting of 5 operations. This implementation of LIT takes
- * 5 slots. So unless there's some special undocumented opcode,
- * this implementation is potentially optimal. Unfortunately,
- * emit_arith is a bit too conservative because it doesn't understand
- * partial writes to the vector component.
+ * \todo if/when r5xx supports the radeon_program architecture, this is a
+ * likely candidate for code sharing.
  */
-static const GLfloat LitConst[4] =
-    { 127.999999, 127.999999, 127.999999, -127.999999 };
-
-static void emit_lit(struct r300_fragment_program *fp,
-		     GLuint dest, int mask, GLuint src, int flags)
+static void insert_WPOS_trailer(struct r300_fragment_program_compiler *compiler)
 {
-	COMPILE_STATE;
-	GLuint cnst;
-	int needTemporary;
-	GLuint temp;
-
-	cnst = emit_const4fv(fp, LitConst);
-
-	needTemporary = 0;
-	if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) {
-		needTemporary = 1;
-	} else if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
-		// LIT is typically followed by DP3/DP4, so there's no point
-		// in creating special code for this case
-		needTemporary = 1;
-	}
-
-	if (needTemporary) {
-		temp = keep(get_temp_reg(fp));
-	} else {
-		temp = keep(dest);
-	}
-
-	// Note: The order of emit_arith inside the slots is relevant,
-	// because emit_arith only looks at scalar vs. vector when resolving
-	// dependencies, and it does not consider individual vector components,
-	// so swizzling between the two parts can create fake dependencies.
-
-	// First slot
-	emit_arith(fp, PFS_OP_MAX, temp, WRITEMASK_XY,
-		   keep(src), pfs_zero, undef, 0);
-	emit_arith(fp, PFS_OP_MAX, temp, WRITEMASK_W, src, cnst, undef, 0);
-
-	// Second slot
-	emit_arith(fp, PFS_OP_MIN, temp, WRITEMASK_Z,
-		   swizzle(temp, W, W, W, W), cnst, undef, 0);
-	emit_arith(fp, PFS_OP_LG2, temp, WRITEMASK_W,
-		   swizzle(temp, Y, Y, Y, Y), undef, undef, 0);
-
-	// Third slot
-	// If desired, we saturate the y result here.
-	// This does not affect the use as a condition variable in the CMP later
-	emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_W,
-		   temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0);
-	emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_Y,
-		   swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags);
-
-	// Fourth slot
-	emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_X,
-		   pfs_one, pfs_one, pfs_zero, 0);
-	emit_arith(fp, PFS_OP_EX2, temp, WRITEMASK_W, temp, undef, undef, 0);
-
-	// Fifth slot
-	emit_arith(fp, PFS_OP_CMP, temp, WRITEMASK_Z,
-		   pfs_zero, swizzle(temp, W, W, W, W),
-		   negate(swizzle(temp, Y, Y, Y, Y)), flags);
-	emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_W, pfs_one, pfs_one,
-		   pfs_zero, 0);
-
-	if (needTemporary) {
-		emit_arith(fp, PFS_OP_MAD, dest, mask,
-			   temp, pfs_one, pfs_zero, flags);
-		free_temp(fp, temp);
-	} else {
-		// Decrease refcount of the destination
-		t_hw_dst(fp, dest, GL_FALSE, cs->nrslots);
-	}
-}
-
-static GLboolean parse_program(struct r300_fragment_program *fp)
-{
-	struct gl_fragment_program *mp = &fp->mesa_program;
-	const struct prog_instruction *inst = mp->Base.Instructions;
-	struct prog_instruction *fpi;
-	GLuint src[3], dest, temp[2];
-	int flags, mask = 0;
-	int const_sin[2];
+	GLuint InputsRead = compiler->fp->mesa_program.Base.InputsRead;
 
-	if (!inst || inst[0].Opcode == OPCODE_END) {
-		ERROR("empty program?\n");
-		return GL_FALSE;
-	}
-
-	for (fpi = mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
-		if (fpi->SaturateMode == SATURATE_ZERO_ONE)
-			flags = PFS_FLAG_SAT;
-		else
-			flags = 0;
-
-		if (fpi->Opcode != OPCODE_KIL) {
-			dest = t_dst(fp, fpi->DstReg);
-			mask = fpi->DstReg.WriteMask;
-		}
-
-		switch (fpi->Opcode) {
-		case OPCODE_ABS:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			emit_arith(fp, PFS_OP_MAD, dest, mask,
-				   absolute(src[0]), pfs_one, pfs_zero, flags);
-			break;
-		case OPCODE_ADD:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			emit_arith(fp, PFS_OP_MAD, dest, mask,
-				   src[0], pfs_one, src[1], flags);
-			break;
-		case OPCODE_CMP:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			src[2] = t_src(fp, fpi->SrcReg[2]);
-			/* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
-			 *    r300 - if src2.c < 0.0 ? src1.c : src0.c
-			 */
-			emit_arith(fp, PFS_OP_CMP, dest, mask,
-				   src[2], src[1], src[0], flags);
-			break;
-		case OPCODE_COS:
-			/*
-			 * cos using a parabola (see SIN):
-			 * cos(x):
-			 *   x = (x/(2*PI))+0.75
-			 *   x = frac(x)
-			 *   x = (x*2*PI)-PI
-			 *   result = sin(x)
-			 */
-			temp[0] = get_temp_reg(fp);
-			const_sin[0] = emit_const4fv(fp, SinCosConsts[0]);
-			const_sin[1] = emit_const4fv(fp, SinCosConsts[1]);
-			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
-
-			/* add 0.5*PI and do range reduction */
-
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
-				   swizzle(src[0], X, X, X, X),
-				   swizzle(const_sin[1], Z, Z, Z, Z),
-				   swizzle(const_sin[1], X, X, X, X), 0);
-
-			emit_arith(fp, PFS_OP_FRC, temp[0], WRITEMASK_X,
-				   swizzle(temp[0], X, X, X, X),
-				   undef, undef, 0);
-
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W),	//2*PI
-				   negate(swizzle(const_sin[0], Z, Z, Z, Z)),	//-PI
-				   0);
-
-			/* SIN */
-
-			emit_arith(fp, PFS_OP_MAD, temp[0],
-				   WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
-								      Z, Z, Z,
-								      Z),
-				   const_sin[0], pfs_zero, 0);
-
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
-				   swizzle(temp[0], Y, Y, Y, Y),
-				   absolute(swizzle(temp[0], Z, Z, Z, Z)),
-				   swizzle(temp[0], X, X, X, X), 0);
-
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
-				   swizzle(temp[0], X, X, X, X),
-				   absolute(swizzle(temp[0], X, X, X, X)),
-				   negate(swizzle(temp[0], X, X, X, X)), 0);
-
-			emit_arith(fp, PFS_OP_MAD, dest, mask,
-				   swizzle(temp[0], Y, Y, Y, Y),
-				   swizzle(const_sin[0], W, W, W, W),
-				   swizzle(temp[0], X, X, X, X), flags);
-
-			free_temp(fp, temp[0]);
-			break;
-		case OPCODE_DP3:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			emit_arith(fp, PFS_OP_DP3, dest, mask,
-				   src[0], src[1], undef, flags);
-			break;
-		case OPCODE_DP4:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			emit_arith(fp, PFS_OP_DP4, dest, mask,
-				   src[0], src[1], undef, flags);
-			break;
-		case OPCODE_DPH:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			/* src0.xyz1 -> temp
-			 * DP4 dest, temp, src1
-			 */
-#if 0
-			temp[0] = get_temp_reg(fp);
-			src[0].s_swz = SWIZZLE_ONE;
-			emit_arith(fp, PFS_OP_MAD, temp[0], mask,
-				   src[0], pfs_one, pfs_zero, 0);
-			emit_arith(fp, PFS_OP_DP4, dest, mask,
-				   temp[0], src[1], undef, flags);
-			free_temp(fp, temp[0]);
-#else
-			emit_arith(fp, PFS_OP_DP4, dest, mask,
-				   swizzle(src[0], X, Y, Z, ONE), src[1],
-				   undef, flags);
-#endif
-			break;
-		case OPCODE_DST:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			/* dest.y = src0.y * src1.y */
-			if (mask & WRITEMASK_Y)
-				emit_arith(fp, PFS_OP_MAD, dest, WRITEMASK_Y,
-					   keep(src[0]), keep(src[1]),
-					   pfs_zero, flags);
-			/* dest.z = src0.z */
-			if (mask & WRITEMASK_Z)
-				emit_arith(fp, PFS_OP_MAD, dest, WRITEMASK_Z,
-					   src[0], pfs_one, pfs_zero, flags);
-			/* result.x = 1.0
-			 * result.w = src1.w */
-			if (mask & WRITEMASK_XW) {
-				REG_SET_VSWZ(src[1], SWIZZLE_111);	/*Cheat */
-				emit_arith(fp, PFS_OP_MAD, dest,
-					   mask & WRITEMASK_XW,
-					   src[1], pfs_one, pfs_zero, flags);
-			}
-			break;
-		case OPCODE_EX2:
-			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
-			emit_arith(fp, PFS_OP_EX2, dest, mask,
-				   src[0], undef, undef, flags);
-			break;
-		case OPCODE_FLR:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			temp[0] = get_temp_reg(fp);
-			/* FRC temp, src0
-			 * MAD dest, src0, 1.0, -temp
-			 */
-			emit_arith(fp, PFS_OP_FRC, temp[0], mask,
-				   keep(src[0]), undef, undef, 0);
-			emit_arith(fp, PFS_OP_MAD, dest, mask,
-				   src[0], pfs_one, negate(temp[0]), flags);
-			free_temp(fp, temp[0]);
-			break;
-		case OPCODE_FRC:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			emit_arith(fp, PFS_OP_FRC, dest, mask,
-				   src[0], undef, undef, flags);
-			break;
-		case OPCODE_KIL:
-			emit_tex(fp, fpi, R300_TEX_OP_KIL);
-			break;
-		case OPCODE_LG2:
-			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
-			emit_arith(fp, PFS_OP_LG2, dest, mask,
-				   src[0], undef, undef, flags);
-			break;
-		case OPCODE_LIT:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			emit_lit(fp, dest, mask, src[0], flags);
-			break;
-		case OPCODE_LRP:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			src[2] = t_src(fp, fpi->SrcReg[2]);
-			/* result = tmp0tmp1 + (1 - tmp0)tmp2
-			 *        = tmp0tmp1 + tmp2 + (-tmp0)tmp2
-			 *     MAD temp, -tmp0, tmp2, tmp2
-			 *     MAD result, tmp0, tmp1, temp
-			 */
-			temp[0] = get_temp_reg(fp);
-			emit_arith(fp, PFS_OP_MAD, temp[0], mask,
-				   negate(keep(src[0])), keep(src[2]), src[2],
-				   0);
-			emit_arith(fp, PFS_OP_MAD, dest, mask,
-				   src[0], src[1], temp[0], flags);
-			free_temp(fp, temp[0]);
-			break;
-		case OPCODE_MAD:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			src[2] = t_src(fp, fpi->SrcReg[2]);
-			emit_arith(fp, PFS_OP_MAD, dest, mask,
-				   src[0], src[1], src[2], flags);
-			break;
-		case OPCODE_MAX:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			emit_arith(fp, PFS_OP_MAX, dest, mask,
-				   src[0], src[1], undef, flags);
-			break;
-		case OPCODE_MIN:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			emit_arith(fp, PFS_OP_MIN, dest, mask,
-				   src[0], src[1], undef, flags);
-			break;
-		case OPCODE_MOV:
-		case OPCODE_SWZ:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			emit_arith(fp, PFS_OP_MAD, dest, mask,
-				   src[0], pfs_one, pfs_zero, flags);
-			break;
-		case OPCODE_MUL:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			emit_arith(fp, PFS_OP_MAD, dest, mask,
-				   src[0], src[1], pfs_zero, flags);
-			break;
-		case OPCODE_POW:
-			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
-			src[1] = t_scalar_src(fp, fpi->SrcReg[1]);
-			temp[0] = get_temp_reg(fp);
-			emit_arith(fp, PFS_OP_LG2, temp[0], WRITEMASK_W,
-				   src[0], undef, undef, 0);
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_W,
-				   temp[0], src[1], pfs_zero, 0);
-			emit_arith(fp, PFS_OP_EX2, dest, fpi->DstReg.WriteMask,
-				   temp[0], undef, undef, 0);
-			free_temp(fp, temp[0]);
-			break;
-		case OPCODE_RCP:
-			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
-			emit_arith(fp, PFS_OP_RCP, dest, mask,
-				   src[0], undef, undef, flags);
-			break;
-		case OPCODE_RSQ:
-			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
-			emit_arith(fp, PFS_OP_RSQ, dest, mask,
-				   absolute(src[0]), pfs_zero, pfs_zero, flags);
-			break;
-		case OPCODE_SCS:
-			/*
-			 * scs using a parabola :
-			 * scs(x):
-			 *   result.x = sin(-abs(x)+0.5*PI)  (cos)
-			 *   result.y = sin(x)               (sin)
-			 *
-			 */
-			temp[0] = get_temp_reg(fp);
-			temp[1] = get_temp_reg(fp);
-			const_sin[0] = emit_const4fv(fp, SinCosConsts[0]);
-			const_sin[1] = emit_const4fv(fp, SinCosConsts[1]);
-			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
-
-			/* x = -abs(x)+0.5*PI */
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(const_sin[0], Z, Z, Z, Z),	//PI
-				   pfs_half,
-				   negate(abs
-					  (swizzle(keep(src[0]), X, X, X, X))),
-				   0);
-
-			/* C*x (sin) */
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_W,
-				   swizzle(const_sin[0], Y, Y, Y, Y),
-				   swizzle(keep(src[0]), X, X, X, X),
-				   pfs_zero, 0);
-
-			/* B*x, C*x (cos) */
-			emit_arith(fp, PFS_OP_MAD, temp[0],
-				   WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
-								      Z, Z, Z,
-								      Z),
-				   const_sin[0], pfs_zero, 0);
-
-			/* B*x (sin) */
-			emit_arith(fp, PFS_OP_MAD, temp[1], WRITEMASK_W,
-				   swizzle(const_sin[0], X, X, X, X),
-				   keep(src[0]), pfs_zero, 0);
-
-			/* y = B*x + C*x*abs(x) (sin) */
-			emit_arith(fp, PFS_OP_MAD, temp[1], WRITEMASK_Z,
-				   absolute(src[0]),
-				   swizzle(temp[0], W, W, W, W),
-				   swizzle(temp[1], W, W, W, W), 0);
-
-			/* y = B*x + C*x*abs(x) (cos) */
-			emit_arith(fp, PFS_OP_MAD, temp[1], WRITEMASK_W,
-				   swizzle(temp[0], Y, Y, Y, Y),
-				   absolute(swizzle(temp[0], Z, Z, Z, Z)),
-				   swizzle(temp[0], X, X, X, X), 0);
-
-			/* y*abs(y) - y (cos), y*abs(y) - y (sin) */
-			emit_arith(fp, PFS_OP_MAD, temp[0],
-				   WRITEMASK_X | WRITEMASK_Y, swizzle(temp[1],
-								      W, Z, Y,
-								      X),
-				   absolute(swizzle(temp[1], W, Z, Y, X)),
-				   negate(swizzle(temp[1], W, Z, Y, X)), 0);
-
-			/* dest.xy = mad(temp.xy, P, temp2.wz) */
-			emit_arith(fp, PFS_OP_MAD, dest,
-				   mask & (WRITEMASK_X | WRITEMASK_Y), temp[0],
-				   swizzle(const_sin[0], W, W, W, W),
-				   swizzle(temp[1], W, Z, Y, X), flags);
-
-			free_temp(fp, temp[0]);
-			free_temp(fp, temp[1]);
-			break;
-		case OPCODE_SGE:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			temp[0] = get_temp_reg(fp);
-			/* temp = src0 - src1
-			 * dest.c = (temp.c < 0.0) ? 0 : 1
-			 */
-			emit_arith(fp, PFS_OP_MAD, temp[0], mask,
-				   src[0], pfs_one, negate(src[1]), 0);
-			emit_arith(fp, PFS_OP_CMP, dest, mask,
-				   pfs_one, pfs_zero, temp[0], 0);
-			free_temp(fp, temp[0]);
-			break;
-		case OPCODE_SIN:
-			/*
-			 *  using a parabola:
-			 * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
-			 * extra precision is obtained by weighting against
-			 * itself squared.
-			 */
-
-			temp[0] = get_temp_reg(fp);
-			const_sin[0] = emit_const4fv(fp, SinCosConsts[0]);
-			const_sin[1] = emit_const4fv(fp, SinCosConsts[1]);
-			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
-
-			/* do range reduction */
-
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
-				   swizzle(keep(src[0]), X, X, X, X),
-				   swizzle(const_sin[1], Z, Z, Z, Z),
-				   pfs_half, 0);
-
-			emit_arith(fp, PFS_OP_FRC, temp[0], WRITEMASK_X,
-				   swizzle(temp[0], X, X, X, X),
-				   undef, undef, 0);
-
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W),	//2*PI
-				   negate(swizzle(const_sin[0], Z, Z, Z, Z)),	//PI
-				   0);
-
-			/* SIN */
-
-			emit_arith(fp, PFS_OP_MAD, temp[0],
-				   WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
-								      Z, Z, Z,
-								      Z),
-				   const_sin[0], pfs_zero, 0);
-
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
-				   swizzle(temp[0], Y, Y, Y, Y),
-				   absolute(swizzle(temp[0], Z, Z, Z, Z)),
-				   swizzle(temp[0], X, X, X, X), 0);
-
-			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
-				   swizzle(temp[0], X, X, X, X),
-				   absolute(swizzle(temp[0], X, X, X, X)),
-				   negate(swizzle(temp[0], X, X, X, X)), 0);
-
-			emit_arith(fp, PFS_OP_MAD, dest, mask,
-				   swizzle(temp[0], Y, Y, Y, Y),
-				   swizzle(const_sin[0], W, W, W, W),
-				   swizzle(temp[0], X, X, X, X), flags);
-
-			free_temp(fp, temp[0]);
-			break;
-		case OPCODE_SLT:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			temp[0] = get_temp_reg(fp);
-			/* temp = src0 - src1
-			 * dest.c = (temp.c < 0.0) ? 1 : 0
-			 */
-			emit_arith(fp, PFS_OP_MAD, temp[0], mask,
-				   src[0], pfs_one, negate(src[1]), 0);
-			emit_arith(fp, PFS_OP_CMP, dest, mask,
-				   pfs_zero, pfs_one, temp[0], 0);
-			free_temp(fp, temp[0]);
-			break;
-		case OPCODE_SUB:
-			src[0] = t_src(fp, fpi->SrcReg[0]);
-			src[1] = t_src(fp, fpi->SrcReg[1]);
-			emit_arith(fp, PFS_OP_MAD, dest, mask,
-				   src[0], pfs_one, negate(src[1]), flags);
-			break;
-		case OPCODE_TEX:
-			emit_tex(fp, fpi, R300_TEX_OP_LD);
-			break;
-		case OPCODE_TXB:
-			emit_tex(fp, fpi, R300_TEX_OP_TXB);
-			break;
-		case OPCODE_TXP:
-			emit_tex(fp, fpi, R300_TEX_OP_TXP);
-			break;
-		case OPCODE_XPD:{
-				src[0] = t_src(fp, fpi->SrcReg[0]);
-				src[1] = t_src(fp, fpi->SrcReg[1]);
-				temp[0] = get_temp_reg(fp);
-				/* temp = src0.zxy * src1.yzx */
-				emit_arith(fp, PFS_OP_MAD, temp[0],
-					   WRITEMASK_XYZ, swizzle(keep(src[0]),
-								  Z, X, Y, W),
-					   swizzle(keep(src[1]), Y, Z, X, W),
-					   pfs_zero, 0);
-				/* dest.xyz = src0.yzx * src1.zxy - temp
-				 * dest.w       = undefined
-				 * */
-				emit_arith(fp, PFS_OP_MAD, dest,
-					   mask & WRITEMASK_XYZ, swizzle(src[0],
-									 Y, Z,
-									 X, W),
-					   swizzle(src[1], Z, X, Y, W),
-					   negate(temp[0]), flags);
-				/* cleanup */
-				free_temp(fp, temp[0]);
-				break;
-			}
-		default:
-			ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
-			break;
-		}
-
-		if (fp->error)
-			return GL_FALSE;
-
-	}
-
-	return GL_TRUE;
-}
+	if (!(InputsRead & FRAG_BIT_WPOS))
+		return;
 
-static void insert_wpos(struct gl_program *prog)
-{
 	static gl_state_index tokens[STATE_LENGTH] = {
 		STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
 	};
 	struct prog_instruction *fpi;
 	GLuint window_index;
 	int i = 0;
-	GLuint tempregi = prog->NumTemporaries;
-	/* should do something else if no temps left... */
-	prog->NumTemporaries++;
+	GLuint tempregi = radeonCompilerAllocateTemporary(&compiler->compiler);
 
-	fpi = _mesa_alloc_instructions(prog->NumInstructions + 3);
-	_mesa_init_instructions(fpi, prog->NumInstructions + 3);
+	fpi = radeonClauseInsertInstructions(&compiler->compiler, &compiler->compiler.Clauses[0], 0, 3);
 
 	/* perspective divide */
 	fpi[i].Opcode = OPCODE_RCP;
@@ -2041,7 +297,7 @@ static void insert_wpos(struct gl_program *prog)
 	i++;
 
 	/* viewport transformation */
-	window_index = _mesa_add_state_reference(prog->Parameters, tokens);
+	window_index = _mesa_add_state_reference(compiler->fp->mesa_program.Base.Parameters, tokens);
 
 	fpi[i].Opcode = OPCODE_MAD;
 
@@ -2066,203 +322,114 @@ static void insert_wpos(struct gl_program *prog)
 	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
 	i++;
 
-	_mesa_copy_instructions(&fpi[i], prog->Instructions,
-				prog->NumInstructions);
-
-	free(prog->Instructions);
-
-	prog->Instructions = fpi;
-
-	prog->NumInstructions += i;
-	fpi = &prog->Instructions[prog->NumInstructions - 1];
-
-	assert(fpi->Opcode == OPCODE_END);
-
-	for (fpi = &prog->Instructions[3]; fpi->Opcode != OPCODE_END; fpi++) {
-		for (i = 0; i < 3; i++)
-			if (fpi->SrcReg[i].File == PROGRAM_INPUT &&
-			    fpi->SrcReg[i].Index == FRAG_ATTRIB_WPOS) {
-				fpi->SrcReg[i].File = PROGRAM_TEMPORARY;
-				fpi->SrcReg[i].Index = tempregi;
+	for (; i < compiler->compiler.Clauses[0].NumInstructions; ++i) {
+		int reg;
+		for (reg = 0; reg < 3; reg++) {
+			if (fpi[i].SrcReg[reg].File == PROGRAM_INPUT &&
+			    fpi[i].SrcReg[reg].Index == FRAG_ATTRIB_WPOS) {
+				fpi[i].SrcReg[reg].File = PROGRAM_TEMPORARY;
+				fpi[i].SrcReg[reg].Index = tempregi;
 			}
-	}
-}
-
-/* - Init structures
- * - Determine what hwregs each input corresponds to
- */
-static void init_program(r300ContextPtr r300, struct r300_fragment_program *fp)
-{
-	struct r300_pfs_compile_state *cs = NULL;
-	struct gl_fragment_program *mp = &fp->mesa_program;
-	struct prog_instruction *fpi;
-	GLuint InputsRead = mp->Base.InputsRead;
-	GLuint temps_used = 0;	/* for fp->temps[] */
-	int i, j;
-
-	/* New compile, reset tracking data */
-	fp->optimization =
-	    driQueryOptioni(&r300->radeon.optionCache, "fp_optimization");
-	fp->translated = GL_FALSE;
-	fp->error = GL_FALSE;
-	fp->cs = cs = &(R300_CONTEXT(fp->ctx)->state.pfs_compile);
-	fp->WritesDepth = GL_FALSE;
-	fp->tex.length = 0;
-	fp->cur_node = 0;
-	fp->first_node_has_tex = 0;
-	fp->const_nr = 0;
-	fp->max_temp_idx = 0;
-	fp->node[0].alu_end = -1;
-	fp->node[0].tex_end = -1;
-
-	_mesa_memset(cs, 0, sizeof(*fp->cs));
-	for (i = 0; i < PFS_MAX_ALU_INST; i++) {
-		for (j = 0; j < 3; j++) {
-			cs->slot[i].vsrc[j] = SRC_CONST;
-			cs->slot[i].ssrc[j] = SRC_CONST;
 		}
 	}
+}
 
-	/* Work out what temps the Mesa inputs correspond to, this must match
-	 * what setup_rs_unit does, which shouldn't be a problem as rs_unit
-	 * configures itself based on the fragprog's InputsRead
-	 *
-	 * NOTE: this depends on get_hw_temp() allocating registers in order,
-	 * starting from register 0.
-	 */
 
-	/* Texcoords come first */
-	for (i = 0; i < fp->ctx->Const.MaxTextureUnits; i++) {
-		if (InputsRead & (FRAG_BIT_TEX0 << i)) {
-			cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0;
-			cs->inputs[FRAG_ATTRIB_TEX0 + i].reg =
-			    get_hw_temp(fp, 0);
-		}
+static GLuint build_dtm(GLuint depthmode)
+{
+	switch(depthmode) {
+	default:
+	case GL_LUMINANCE: return 0;
+	case GL_INTENSITY: return 1;
+	case GL_ALPHA: return 2;
 	}
-	InputsRead &= ~FRAG_BITS_TEX_ANY;
+}
 
-	/* fragment position treated as a texcoord */
-	if (InputsRead & FRAG_BIT_WPOS) {
-		cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
-		cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(fp, 0);
-		insert_wpos(&mp->Base);
-	}
-	InputsRead &= ~FRAG_BIT_WPOS;
+static GLuint build_func(GLuint comparefunc)
+{
+	return comparefunc - GL_NEVER;
+}
 
-	/* Then primary colour */
-	if (InputsRead & FRAG_BIT_COL0) {
-		cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
-		cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(fp, 0);
-	}
-	InputsRead &= ~FRAG_BIT_COL0;
 
-	/* Secondary color */
-	if (InputsRead & FRAG_BIT_COL1) {
-		cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
-		cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(fp, 0);
-	}
-	InputsRead &= ~FRAG_BIT_COL1;
-
-	/* Anything else */
-	if (InputsRead) {
-		WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead);
-		/* force read from hwreg 0 for now */
-		for (i = 0; i < 32; i++)
-			if (InputsRead & (1 << i))
-				cs->inputs[i].reg = 0;
-	}
+/**
+ * Collect all external state that is relevant for compiling the given
+ * fragment program.
+ */
+static void build_state(
+	r300ContextPtr r300,
+	struct r300_fragment_program *fp,
+	struct r300_fragment_program_external_state *state)
+{
+	int unit;
 
-	/* Pre-parse the mesa program, grabbing refcounts on input/temp regs.
-	 * That way, we can free up the reg when it's no longer needed
-	 */
-	if (!mp->Base.Instructions) {
-		ERROR("No instructions found in program\n");
-		return;
-	}
+	_mesa_bzero(state, sizeof(*state));
 
-	for (fpi = mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
-		int idx;
-
-		for (i = 0; i < 3; i++) {
-			idx = fpi->SrcReg[i].Index;
-			switch (fpi->SrcReg[i].File) {
-			case PROGRAM_TEMPORARY:
-				if (!(temps_used & (1 << idx))) {
-					cs->temps[idx].reg = -1;
-					cs->temps[idx].refcount = 1;
-					temps_used |= (1 << idx);
-				} else
-					cs->temps[idx].refcount++;
-				break;
-			case PROGRAM_INPUT:
-				cs->inputs[idx].refcount++;
-				break;
-			default:
-				break;
-			}
-		}
+	for(unit = 0; unit < 16; ++unit) {
+		if (fp->mesa_program.Base.ShadowSamplers & (1 << unit)) {
+			struct gl_texture_object* tex = r300->radeon.glCtx->Texture.Unit[unit]._Current;
 
-		idx = fpi->DstReg.Index;
-		if (fpi->DstReg.File == PROGRAM_TEMPORARY) {
-			if (!(temps_used & (1 << idx))) {
-				cs->temps[idx].reg = -1;
-				cs->temps[idx].refcount = 1;
-				temps_used |= (1 << idx);
-			} else
-				cs->temps[idx].refcount++;
+			state->unit[unit].depth_texture_mode = build_dtm(tex->DepthMode);
+			state->unit[unit].texture_compare_func = build_func(tex->CompareFunc);
 		}
 	}
-	cs->temp_in_use = temps_used;
 }
 
-static void update_params(struct r300_fragment_program *fp)
-{
-	struct gl_fragment_program *mp = &fp->mesa_program;
-
-	/* Ask Mesa nicely to fill in ParameterValues for us */
-	if (mp->Base.Parameters)
-		_mesa_load_state_parameters(fp->ctx, mp->Base.Parameters);
-}
 
 void r300TranslateFragmentShader(r300ContextPtr r300,
 				 struct r300_fragment_program *fp)
 {
+	struct r300_fragment_program_external_state state;
 
-	struct r300_pfs_compile_state *cs = NULL;
+	build_state(r300, fp, &state);
+	if (_mesa_memcmp(&fp->state, &state, sizeof(state))) {
+		/* TODO: cache compiled programs */
+		fp->translated = GL_FALSE;
+		_mesa_memcpy(&fp->state, &state, sizeof(state));
+	}
 
 	if (!fp->translated) {
+		struct r300_fragment_program_compiler compiler;
+
+		compiler.r300 = r300;
+		compiler.fp = fp;
+		compiler.code = &fp->code;
 
-		init_program(r300, fp);
-		cs = fp->cs;
+		radeonCompilerInit(&compiler.compiler, r300->radeon.glCtx, &fp->mesa_program.Base);
 
-		if (parse_program(fp) == GL_FALSE) {
-			dump_program(fp);
-			return;
+		insert_WPOS_trailer(&compiler);
+
+		struct radeon_program_transformation transformations[] = {
+			{ &transform_TEX, &compiler },
+			{ &radeonTransformALU, 0 }
+		};
+		radeonClauseLocalTransform(&compiler.compiler,
+			&compiler.compiler.Clauses[0],
+			2, transformations);
+
+		if (RADEON_DEBUG & DEBUG_PIXEL) {
+			_mesa_printf("Compiler state after transformations:\n");
+			radeonCompilerDump(&compiler.compiler);
 		}
 
-		/* Finish off */
-		fp->node[fp->cur_node].alu_end =
-		    cs->nrslots - fp->node[fp->cur_node].alu_offset - 1;
-		if (fp->node[fp->cur_node].tex_end < 0)
-			fp->node[fp->cur_node].tex_end = 0;
-		fp->alu_offset = 0;
-		fp->alu_end = cs->nrslots - 1;
-		fp->tex_offset = 0;
-		fp->tex_end = fp->tex.length ? fp->tex.length - 1 : 0;
-		assert(fp->node[fp->cur_node].alu_end >= 0);
-		assert(fp->alu_end >= 0);
-
-		fp->translated = GL_TRUE;
-		if (RADEON_DEBUG & DEBUG_PIXEL)
-			dump_program(fp);
-		r300UpdateStateParameters(fp->ctx, _NEW_PROGRAM);
+		if (!r300FragmentProgramEmit(&compiler))
+			fp->error = GL_TRUE;
+
+		radeonCompilerCleanup(&compiler.compiler);
+
+		if (!fp->error)
+			fp->translated = GL_TRUE;
+		if (fp->error || (RADEON_DEBUG & DEBUG_PIXEL))
+			r300FragmentProgramDump(fp, &fp->code);
+		r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM);
 	}
 
-	update_params(fp);
+	update_params(r300, fp);
 }
 
 /* just some random things... */
-static void dump_program(struct r300_fragment_program *fp)
+void r300FragmentProgramDump(
+	struct r300_fragment_program *fp,
+	struct r300_fragment_program_code *code)
 {
 	int n, i, j;
 	static int pc = 0;
@@ -2277,21 +444,21 @@ static void dump_program(struct r300_fragment_program *fp)
 	fprintf(stderr, "Hardware program\n");
 	fprintf(stderr, "----------------\n");
 
-	for (n = 0; n < (fp->cur_node + 1); n++) {
+	for (n = 0; n < (code->cur_node + 1); n++) {
 		fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "
 			"alu_end: %d, tex_end: %d\n", n,
-			fp->node[n].alu_offset,
-			fp->node[n].tex_offset,
-			fp->node[n].alu_end, fp->node[n].tex_end);
+			code->node[n].alu_offset,
+			code->node[n].tex_offset,
+			code->node[n].alu_end, code->node[n].tex_end);
 
-		if (fp->tex.length) {
+		if (code->tex.length) {
 			fprintf(stderr, "  TEX:\n");
-			for (i = fp->node[n].tex_offset;
-			     i <= fp->node[n].tex_offset + fp->node[n].tex_end;
+			for (i = code->node[n].tex_offset;
+			     i <= code->node[n].tex_offset + code->node[n].tex_end;
 			     ++i) {
 				const char *instr;
 
-				switch ((fp->tex.
+				switch ((code->tex.
 					 inst[i] >> R300_TEX_INST_SHIFT) &
 					15) {
 				case R300_TEX_OP_LD:
@@ -2313,20 +480,20 @@ static void dump_program(struct r300_fragment_program *fp)
 				fprintf(stderr,
 					"    %s t%i, %c%i, texture[%i]   (%08x)\n",
 					instr,
-					(fp->tex.
+					(code->tex.
 					 inst[i] >> R300_DST_ADDR_SHIFT) & 31,
 					't',
-					(fp->tex.
+					(code->tex.
 					 inst[i] >> R300_SRC_ADDR_SHIFT) & 31,
-					(fp->tex.
+					(code->tex.
 					 inst[i] & R300_TEX_ID_MASK) >>
 					R300_TEX_ID_SHIFT,
-					fp->tex.inst[i]);
+					code->tex.inst[i]);
 			}
 		}
 
-		for (i = fp->node[n].alu_offset;
-		     i <= fp->node[n].alu_offset + fp->node[n].alu_end; ++i) {
+		for (i = code->node[n].alu_offset;
+		     i <= code->node[n].alu_offset + code->node[n].alu_end; ++i) {
 			char srcc[3][10], dstc[20];
 			char srca[3][10], dsta[20];
 			char argc[3][20];
@@ -2334,8 +501,8 @@ static void dump_program(struct r300_fragment_program *fp)
 			char flags[5], tmp[10];
 
 			for (j = 0; j < 3; ++j) {
-				int regc = fp->alu.inst[i].inst1 >> (j * 6);
-				int rega = fp->alu.inst[i].inst3 >> (j * 6);
+				int regc = code->alu.inst[i].inst1 >> (j * 6);
+				int rega = code->alu.inst[i].inst3 >> (j * 6);
 
 				sprintf(srcc[j], "%c%i",
 					(regc & 32) ? 'c' : 't', regc & 31);
@@ -2345,46 +512,46 @@ static void dump_program(struct r300_fragment_program *fp)
 
 			dstc[0] = 0;
 			sprintf(flags, "%s%s%s",
-				(fp->alu.inst[i].
+				(code->alu.inst[i].
 				 inst1 & R300_ALU_DSTC_REG_X) ? "x" : "",
-				(fp->alu.inst[i].
+				(code->alu.inst[i].
 				 inst1 & R300_ALU_DSTC_REG_Y) ? "y" : "",
-				(fp->alu.inst[i].
+				(code->alu.inst[i].
 				 inst1 & R300_ALU_DSTC_REG_Z) ? "z" : "");
 			if (flags[0] != 0) {
 				sprintf(dstc, "t%i.%s ",
-					(fp->alu.inst[i].
+					(code->alu.inst[i].
 					 inst1 >> R300_ALU_DSTC_SHIFT) & 31,
 					flags);
 			}
 			sprintf(flags, "%s%s%s",
-				(fp->alu.inst[i].
+				(code->alu.inst[i].
 				 inst1 & R300_ALU_DSTC_OUTPUT_X) ? "x" : "",
-				(fp->alu.inst[i].
+				(code->alu.inst[i].
 				 inst1 & R300_ALU_DSTC_OUTPUT_Y) ? "y" : "",
-				(fp->alu.inst[i].
+				(code->alu.inst[i].
 				 inst1 & R300_ALU_DSTC_OUTPUT_Z) ? "z" : "");
 			if (flags[0] != 0) {
 				sprintf(tmp, "o%i.%s",
-					(fp->alu.inst[i].
+					(code->alu.inst[i].
 					 inst1 >> R300_ALU_DSTC_SHIFT) & 31,
 					flags);
 				strcat(dstc, tmp);
 			}
 
 			dsta[0] = 0;
-			if (fp->alu.inst[i].inst3 & R300_ALU_DSTA_REG) {
+			if (code->alu.inst[i].inst3 & R300_ALU_DSTA_REG) {
 				sprintf(dsta, "t%i.w ",
-					(fp->alu.inst[i].
+					(code->alu.inst[i].
 					 inst3 >> R300_ALU_DSTA_SHIFT) & 31);
 			}
-			if (fp->alu.inst[i].inst3 & R300_ALU_DSTA_OUTPUT) {
+			if (code->alu.inst[i].inst3 & R300_ALU_DSTA_OUTPUT) {
 				sprintf(tmp, "o%i.w ",
-					(fp->alu.inst[i].
+					(code->alu.inst[i].
 					 inst3 >> R300_ALU_DSTA_SHIFT) & 31);
 				strcat(dsta, tmp);
 			}
-			if (fp->alu.inst[i].inst3 & R300_ALU_DSTA_DEPTH) {
+			if (code->alu.inst[i].inst3 & R300_ALU_DSTA_DEPTH) {
 				strcat(dsta, "Z");
 			}
 
@@ -2392,12 +559,12 @@ static void dump_program(struct r300_fragment_program *fp)
 				"%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
 				"       w: %3s %3s %3s -> %-20s (%08x)\n", i,
 				srcc[0], srcc[1], srcc[2], dstc,
-				fp->alu.inst[i].inst1, srca[0], srca[1],
-				srca[2], dsta, fp->alu.inst[i].inst3);
+				code->alu.inst[i].inst1, srca[0], srca[1],
+				srca[2], dsta, code->alu.inst[i].inst3);
 
 			for (j = 0; j < 3; ++j) {
-				int regc = fp->alu.inst[i].inst0 >> (j * 7);
-				int rega = fp->alu.inst[i].inst2 >> (j * 7);
+				int regc = code->alu.inst[i].inst0 >> (j * 7);
+				int rega = code->alu.inst[i].inst2 >> (j * 7);
 				int d;
 				char buf[20];
 
@@ -2479,8 +646,8 @@ static void dump_program(struct r300_fragment_program *fp)
 			fprintf(stderr, "     xyz: %8s %8s %8s    op: %08x\n"
 				"       w: %8s %8s %8s    op: %08x\n",
 				argc[0], argc[1], argc[2],
-				fp->alu.inst[i].inst0, arga[0], arga[1],
-				arga[2], fp->alu.inst[i].inst2);
+				code->alu.inst[i].inst0, arga[0], arga[1],
+				arga[2], code->alu.inst[i].inst2);
 		}
 	}
 }
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.h b/src/mesa/drivers/dri/r300/r300_fragprog.h
index 573aacf19a1..7c1e210b044 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.h
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.h
@@ -40,12 +40,7 @@
 #include "shader/prog_instruction.h"
 
 #include "r300_context.h"
-
-typedef struct r300_fragment_program_swizzle {
-	GLuint length;
-	GLuint src[4];
-	GLuint inst[8];
-} r300_fragment_program_swizzle_t;
+#include "radeon_program.h"
 
 /* supported hw opcodes */
 #define PFS_OP_MAD 0
@@ -74,25 +69,6 @@ typedef struct r300_fragment_program_swizzle {
 #define SRC_MASK		(63 << 0)
 #define SRC_STRIDE		6
 
-#define NOP_INST0 (						 \
-		(R300_ALU_OUTC_MAD) |				 \
-		(R300_ALU_ARGC_ZERO << R300_ALU_ARG0C_SHIFT) | \
-		(R300_ALU_ARGC_ZERO << R300_ALU_ARG1C_SHIFT) | \
-		(R300_ALU_ARGC_ZERO << R300_ALU_ARG2C_SHIFT))
-#define NOP_INST1 (					     \
-		((0 | SRC_CONST) << R300_ALU_SRC0C_SHIFT) | \
-		((0 | SRC_CONST) << R300_ALU_SRC1C_SHIFT) | \
-		((0 | SRC_CONST) << R300_ALU_SRC2C_SHIFT))
-#define NOP_INST2 ( \
-		(R300_ALU_OUTA_MAD) |				 \
-		(R300_ALU_ARGA_ZERO << R300_ALU_ARG0A_SHIFT) | \
-		(R300_ALU_ARGA_ZERO << R300_ALU_ARG1A_SHIFT) | \
-		(R300_ALU_ARGA_ZERO << R300_ALU_ARG2A_SHIFT))
-#define NOP_INST3 (					     \
-		((0 | SRC_CONST) << R300_ALU_SRC0A_SHIFT) | \
-		((0 | SRC_CONST) << R300_ALU_SRC1A_SHIFT) | \
-		((0 | SRC_CONST) << R300_ALU_SRC2A_SHIFT))
-
 #define DRI_CONF_FP_OPTIMIZATION_SPEED   0
 #define DRI_CONF_FP_OPTIMIZATION_QUALITY 1
 
@@ -161,4 +137,24 @@ struct r300_fragment_program;
 extern void r300TranslateFragmentShader(r300ContextPtr r300,
 					struct r300_fragment_program *fp);
 
+
+/**
+ * Used internally by the r300 fragment program code to store compile-time
+ * only data.
+ */
+struct r300_fragment_program_compiler {
+	r300ContextPtr r300;
+	struct r300_fragment_program *fp;
+	struct r300_fragment_program_code *code;
+	struct radeon_compiler compiler;
+};
+
+extern void r300FPTransformTextures(struct r300_fragment_program_compiler *compiler);
+extern GLboolean r300FragmentProgramEmit(struct r300_fragment_program_compiler *compiler);
+
+
+extern void r300FragmentProgramDump(
+	struct r300_fragment_program *fp,
+	struct r300_fragment_program_code *code);
+
 #endif
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c
new file mode 100644
index 00000000000..9ba29feb40b
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c
@@ -0,0 +1,2058 @@
+/*
+ * Copyright (C) 2005 Ben Skeggs.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * \file
+ *
+ * Emit the r300_fragment_program_code that can be understood by the hardware.
+ * Input is a pre-transformed radeon_program.
+ *
+ * \author Ben Skeggs <[email protected]>
+ *
+ * \author Jerome Glisse <[email protected]>
+ *
+ * \todo FogOption
+ *
+ * \todo Verify results of opcodes for accuracy, I've only checked them in
+ * specific cases.
+ */
+
+#include "glheader.h"
+#include "macros.h"
+#include "enums.h"
+#include "shader/prog_instruction.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
+
+#include "r300_context.h"
+#include "r300_fragprog.h"
+#include "r300_reg.h"
+#include "r300_state.h"
+
+/* Mapping Mesa registers to R300 temporaries */
+struct reg_acc {
+	int reg;		/* Assigned hw temp */
+	unsigned int refcount;	/* Number of uses by mesa program */
+};
+
+/**
+ * Describe the current lifetime information for an R300 temporary
+ */
+struct reg_lifetime {
+	/* Index of the first slot where this register is free in the sense
+	   that it can be used as a new destination register.
+	   This is -1 if the register has been assigned to a Mesa register
+	   and the last access to the register has not yet been emitted */
+	int free;
+
+	/* Index of the first slot where this register is currently reserved.
+	   This is used to stop e.g. a scalar operation from being moved
+	   before the allocation time of a register that was first allocated
+	   for a vector operation. */
+	int reserved;
+
+	/* Index of the first slot in which the register can be used as a
+	   source without losing the value that is written by the last
+	   emitted instruction that writes to the register */
+	int vector_valid;
+	int scalar_valid;
+
+	/* Index to the slot where the register was last read.
+	   This is also the first slot in which the register may be written again */
+	int vector_lastread;
+	int scalar_lastread;
+};
+
+/**
+ * Store usage information about an ALU instruction slot during the
+ * compilation of a fragment program.
+ */
+#define SLOT_SRC_VECTOR  (1<<0)
+#define SLOT_SRC_SCALAR  (1<<3)
+#define SLOT_SRC_BOTH    (SLOT_SRC_VECTOR | SLOT_SRC_SCALAR)
+#define SLOT_OP_VECTOR   (1<<16)
+#define SLOT_OP_SCALAR   (1<<17)
+#define SLOT_OP_BOTH     (SLOT_OP_VECTOR | SLOT_OP_SCALAR)
+
+struct r300_pfs_compile_slot {
+	/* Bitmask indicating which parts of the slot are used, using SLOT_ constants
+	   defined above */
+	unsigned int used;
+
+	/* Selected sources */
+	int vsrc[3];
+	int ssrc[3];
+};
+
+/**
+ * Store information during compilation of fragment programs.
+ */
+struct r300_pfs_compile_state {
+	struct r300_fragment_program_compiler *compiler;
+
+	int nrslots;		/* number of ALU slots used so far */
+
+	/* Track which (parts of) slots are already filled with instructions */
+	struct r300_pfs_compile_slot slot[PFS_MAX_ALU_INST];
+
+	/* Track the validity of R300 temporaries */
+	struct reg_lifetime hwtemps[PFS_NUM_TEMP_REGS];
+
+	/* Used to map Mesa's inputs/temps onto hardware temps */
+	int temp_in_use;
+	struct reg_acc temps[PFS_NUM_TEMP_REGS];
+	struct reg_acc inputs[32];	/* don't actually need 32... */
+
+	/* Track usage of hardware temps, for register allocation,
+	 * indirection detection, etc. */
+	GLuint used_in_node;
+	GLuint dest_in_node;
+};
+
+
+/*
+ * Useful macros and values
+ */
+#define ERROR(fmt, args...) do {			\
+		fprintf(stderr, "%s::%s(): " fmt "\n",	\
+			__FILE__, __FUNCTION__, ##args);	\
+		fp->error = GL_TRUE;			\
+	} while(0)
+
+#define PFS_INVAL 0xFFFFFFFF
+#define COMPILE_STATE \
+	struct r300_fragment_program *fp = cs->compiler->fp; \
+	struct r300_fragment_program_code *code = cs->compiler->code; \
+	(void)code; (void)fp
+
+#define SWIZZLE_XYZ		0
+#define SWIZZLE_XXX		1
+#define SWIZZLE_YYY		2
+#define SWIZZLE_ZZZ		3
+#define SWIZZLE_WWW		4
+#define SWIZZLE_YZX		5
+#define SWIZZLE_ZXY		6
+#define SWIZZLE_WZY		7
+#define SWIZZLE_111		8
+#define SWIZZLE_000		9
+#define SWIZZLE_HHH		10
+
+#define swizzle(r, x, y, z, w) do_swizzle(cs, r,		\
+					  ((SWIZZLE_##x<<0)|	\
+					   (SWIZZLE_##y<<3)|	\
+					   (SWIZZLE_##z<<6)|	\
+					   (SWIZZLE_##w<<9)),	\
+					  0)
+
+#define REG_TYPE_INPUT		0
+#define REG_TYPE_OUTPUT		1
+#define REG_TYPE_TEMP		2
+#define REG_TYPE_CONST		3
+
+#define REG_TYPE_SHIFT		0
+#define REG_INDEX_SHIFT		2
+#define REG_VSWZ_SHIFT		8
+#define REG_SSWZ_SHIFT		13
+#define REG_NEGV_SHIFT		18
+#define REG_NEGS_SHIFT		19
+#define REG_ABS_SHIFT		20
+#define REG_NO_USE_SHIFT	21	// Hack for refcounting
+#define REG_VALID_SHIFT		22	// Does the register contain a defined value?
+#define REG_BUILTIN_SHIFT   23	// Is it a builtin (like all zero/all one)?
+
+#define REG_TYPE_MASK		(0x03 << REG_TYPE_SHIFT)
+#define REG_INDEX_MASK		(0x3F << REG_INDEX_SHIFT)
+#define REG_VSWZ_MASK		(0x1F << REG_VSWZ_SHIFT)
+#define REG_SSWZ_MASK		(0x1F << REG_SSWZ_SHIFT)
+#define REG_NEGV_MASK		(0x01 << REG_NEGV_SHIFT)
+#define REG_NEGS_MASK		(0x01 << REG_NEGS_SHIFT)
+#define REG_ABS_MASK		(0x01 << REG_ABS_SHIFT)
+#define REG_NO_USE_MASK		(0x01 << REG_NO_USE_SHIFT)
+#define REG_VALID_MASK		(0x01 << REG_VALID_SHIFT)
+#define REG_BUILTIN_MASK	(0x01 << REG_BUILTIN_SHIFT)
+
+/*
+ * Register descriptor helpers.
+ *
+ * A register is described by one integer word whose fields are laid out
+ * by the REG_*_SHIFT / REG_*_MASK constants.  Every macro argument is
+ * parenthesized so that compound expressions (for example "a | b") can be
+ * passed without being torn apart by the shift and mask operators.
+ *
+ * REG() packs the given fields into a descriptor; the negate and abs bits
+ * are left cleared.
+ */
+#define REG(type, index, vswz, sswz, nouse, valid, builtin)		\
+	((((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK) |			\
+	 (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK) |		\
+	 (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |		\
+	 (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK) |		\
+	 (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |	\
+	 (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |			\
+	 (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
+
+/* Field extraction: each yields the field value shifted down to bit 0. */
+#define REG_GET_TYPE(reg)						\
+	(((reg) & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
+#define REG_GET_INDEX(reg)						\
+	(((reg) & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
+#define REG_GET_VSWZ(reg)						\
+	(((reg) & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
+#define REG_GET_SSWZ(reg)						\
+	(((reg) & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
+#define REG_GET_NO_USE(reg)						\
+	(((reg) & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
+#define REG_GET_VALID(reg)						\
+	(((reg) & REG_VALID_MASK) >> REG_VALID_SHIFT)
+#define REG_GET_BUILTIN(reg)						\
+	(((reg) & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)
+
+/*
+ * Field update: each assigns to 'reg', which therefore has to be a
+ * modifiable lvalue.  'reg' is evaluated twice, so it must not have side
+ * effects.  Values that do not fit the field are truncated by the mask.
+ */
+#define REG_SET_TYPE(reg, type)						\
+	((reg) = (((reg) & ~REG_TYPE_MASK) |				\
+		  (((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK)))
+#define REG_SET_INDEX(reg, index)					\
+	((reg) = (((reg) & ~REG_INDEX_MASK) |				\
+		  (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK)))
+#define REG_SET_VSWZ(reg, vswz)						\
+	((reg) = (((reg) & ~REG_VSWZ_MASK) |				\
+		  (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK)))
+#define REG_SET_SSWZ(reg, sswz)						\
+	((reg) = (((reg) & ~REG_SSWZ_MASK) |				\
+		  (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK)))
+#define REG_SET_NO_USE(reg, nouse)					\
+	((reg) = (((reg) & ~REG_NO_USE_MASK) |				\
+		  (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK)))
+#define REG_SET_VALID(reg, valid)					\
+	((reg) = (((reg) & ~REG_VALID_MASK) |				\
+		  (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK)))
+#define REG_SET_BUILTIN(reg, builtin)					\
+	((reg) = (((reg) & ~REG_BUILTIN_MASK) |				\
+		  (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK)))
+
+/* Modifier bits: these only ever set the bit, they never clear it. */
+#define REG_ABS(reg)							\
+	((reg) = ((reg) | REG_ABS_MASK))
+#define REG_NEGV(reg)							\
+	((reg) = ((reg) | REG_NEGV_MASK))
+#define REG_NEGS(reg)							\
+	((reg) = ((reg) | REG_NEGS_MASK))
+/*
+ * Contents of the four instruction words of an empty ALU slot.  emit_nop()
+ * and find_and_prepare_slot() store these into a slot when it is first
+ * allocated: a MAD on both halves whose three arguments all select ZERO,
+ * with all three source fetches pointing at constant 0.
+ */
+#define NOP_INST0 (						 \
+		(R300_ALU_OUTC_MAD) |				 \
+		(R300_ALU_ARGC_ZERO << R300_ALU_ARG0C_SHIFT) | \
+		(R300_ALU_ARGC_ZERO << R300_ALU_ARG1C_SHIFT) | \
+		(R300_ALU_ARGC_ZERO << R300_ALU_ARG2C_SHIFT))
+#define NOP_INST1 (					     \
+		((0 | SRC_CONST) << R300_ALU_SRC0C_SHIFT) | \
+		((0 | SRC_CONST) << R300_ALU_SRC1C_SHIFT) | \
+		((0 | SRC_CONST) << R300_ALU_SRC2C_SHIFT))
+#define NOP_INST2 ( \
+		(R300_ALU_OUTA_MAD) |				 \
+		(R300_ALU_ARGA_ZERO << R300_ALU_ARG0A_SHIFT) | \
+		(R300_ALU_ARGA_ZERO << R300_ALU_ARG1A_SHIFT) | \
+		(R300_ALU_ARGA_ZERO << R300_ALU_ARG2A_SHIFT))
+#define NOP_INST3 (					     \
+		((0 | SRC_CONST) << R300_ALU_SRC0A_SHIFT) | \
+		((0 | SRC_CONST) << R300_ALU_SRC1A_SHIFT) | \
+		((0 | SRC_CONST) << R300_ALU_SRC2A_SHIFT))
+
+
+/*
+ * Data structures for fragment program generation
+ */
+
+/* description of r300 native hw instructions
+ *
+ * name: mnemonic of the operation
+ * argc: number of source arguments the operation consumes
+ * v_op: opcode emit_arith() ORs into inst0 (vector half)
+ * s_op: opcode emit_arith() ORs into inst2 (scalar half); PFS_INVAL for
+ *       the last two entries
+ *
+ * emit_arith() indexes this table directly with its 'op' argument (for
+ * example PFS_OP_MAD), so the entry order presumably has to match the
+ * PFS_OP_* numbering -- confirm against the header that defines it.
+ *
+ * The DP3 entry pairs R300_ALU_OUTC_DP3 with R300_ALU_OUTA_DP4, and the
+ * scalar-only operations (EX2, LG2, RCP, RSQ) use
+ * R300_ALU_OUTC_REPL_ALPHA as their vector opcode.
+ */
+static const struct {
+	const char *name;
+	int argc;
+	int v_op;
+	int s_op;
+} r300_fpop[] = {
+	/* *INDENT-OFF* */
+	{"MAD", 3, R300_ALU_OUTC_MAD, R300_ALU_OUTA_MAD},
+	{"DP3", 2, R300_ALU_OUTC_DP3, R300_ALU_OUTA_DP4},
+	{"DP4", 2, R300_ALU_OUTC_DP4, R300_ALU_OUTA_DP4},
+	{"MIN", 2, R300_ALU_OUTC_MIN, R300_ALU_OUTA_MIN},
+	{"MAX", 2, R300_ALU_OUTC_MAX, R300_ALU_OUTA_MAX},
+	{"CMP", 3, R300_ALU_OUTC_CMP, R300_ALU_OUTA_CMP},
+	{"FRC", 1, R300_ALU_OUTC_FRC, R300_ALU_OUTA_FRC},
+	{"EX2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_EX2},
+	{"LG2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_LG2},
+	{"RCP", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RCP},
+	{"RSQ", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RSQ},
+	{"REPL_ALPHA", 1, R300_ALU_OUTC_REPL_ALPHA, PFS_INVAL},
+	{"CMPH", 3, R300_ALU_OUTC_CMPH, PFS_INVAL},
+	/* *INDENT-ON* */
+};
+
+/* vector swizzles r300 can support natively, with a couple of
+ * cases we handle specially
+ *
+ * REG_VSWZ is an index into v_swiz[]; REG_SSWZ is an index into s_swiz[]
+ * (see find_and_prepare_slot)
+ */
+
+/* extra scalar swizzle selector for the constant half source; entry 6 of
+ * s_swiz[] is the R300_ALU_ARGA_HALF one */
+#define SWIZZLE_HALF 6
+
+#define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
+					  SWIZZLE_##y, \
+					  SWIZZLE_##z, \
+					  SWIZZLE_ZERO))
+/* native swizzles
+ *
+ * Each hash is built with MAKE_SWZ3, i.e. the three vector components
+ * plus SWIZZLE_ZERO in the fourth position.  The table ends with an entry
+ * whose hash is PFS_INVAL; do_swizzle() relies on that terminator.
+ * find_and_prepare_slot() computes the hardware argument selector as
+ * base + srcpos * stride and uses flags to tell which half of the source
+ * (vector and/or scalar) the swizzle reads.
+ */
+static const struct r300_pfs_swizzle {
+	GLuint hash;		/* swizzle value this matches */
+	GLuint base;		/* base value for hw swizzle */
+	GLuint stride;		/* difference in base between arg0/1/2 */
+	GLuint flags;
+} v_swiz[] = {
+	/* *INDENT-OFF* */
+	{MAKE_SWZ3(X, Y, Z), R300_ALU_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR},
+	{MAKE_SWZ3(X, X, X), R300_ALU_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR},
+	{MAKE_SWZ3(Y, Y, Y), R300_ALU_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR},
+	{MAKE_SWZ3(Z, Z, Z), R300_ALU_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR},
+	{MAKE_SWZ3(W, W, W), R300_ALU_ARGC_SRC0A, 1, SLOT_SRC_SCALAR},
+	{MAKE_SWZ3(Y, Z, X), R300_ALU_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR},
+	{MAKE_SWZ3(Z, X, Y), R300_ALU_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR},
+	{MAKE_SWZ3(W, Z, Y), R300_ALU_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH},
+	{MAKE_SWZ3(ONE, ONE, ONE), R300_ALU_ARGC_ONE, 0, 0},
+	{MAKE_SWZ3(ZERO, ZERO, ZERO), R300_ALU_ARGC_ZERO, 0, 0},
+	{MAKE_SWZ3(HALF, HALF, HALF), R300_ALU_ARGC_HALF, 0, 0},
+	{PFS_INVAL, 0, 0, 0},
+	/* *INDENT-ON* */
+};
+
+/* used during matching of non-native swizzles
+ *
+ * SWZ_*_MASK select the 3-bit field of one component inside a packed
+ * swizzle value.  do_swizzle() walks s_mask[] from the first entry (all
+ * three vector components) towards the single-component entries and
+ * stops at the PFS_INVAL terminator.
+ */
+#define SWZ_X_MASK (7 << 0)
+#define SWZ_Y_MASK (7 << 3)
+#define SWZ_Z_MASK (7 << 6)
+#define SWZ_W_MASK (7 << 9)
+static const struct {
+	GLuint hash;		/* used to mask matching swizzle components */
+	int mask;		/* actual outmask */
+	int count;		/* count of components matched */
+} s_mask[] = {
+	/* *INDENT-OFF* */
+	{SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK, 1 | 2 | 4, 3},
+	{SWZ_X_MASK | SWZ_Y_MASK, 1 | 2, 2},
+	{SWZ_X_MASK | SWZ_Z_MASK, 1 | 4, 2},
+	{SWZ_Y_MASK | SWZ_Z_MASK, 2 | 4, 2},
+	{SWZ_X_MASK, 1, 1},
+	{SWZ_Y_MASK, 2, 1},
+	{SWZ_Z_MASK, 4, 1},
+	{PFS_INVAL, PFS_INVAL, PFS_INVAL}
+	/* *INDENT-ON* */
+};
+
+static const struct {
+	int base;		/* hw value of swizzle */
+	int stride;		/* difference between SRC0/1/2 */
+	GLuint flags;		/* SLOT_SRC_* half of the source this selector reads, 0 for constants */
+} s_swiz[] = {	/* indexed by REG_SSWZ; order presumably follows SWIZZLE_X..SWIZZLE_ONE, then SWIZZLE_HALF (6) -- confirm */
+	/* *INDENT-OFF* */
+	{R300_ALU_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR},
+	{R300_ALU_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR},
+	{R300_ALU_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR},
+	{R300_ALU_ARGA_SRC0A, 1, SLOT_SRC_SCALAR},
+	{R300_ALU_ARGA_ZERO, 0, 0},
+	{R300_ALU_ARGA_ONE, 0, 0},
+	{R300_ALU_ARGA_HALF, 0, 0}
+	/* *INDENT-ON* */
+};
+
+/* boiler-plate reg, for convenience: temporary 0 with the XYZ/W swizzle
+ * and the NO_USE, VALID and BUILTIN bits all clear.  Because VALID is
+ * clear it doubles as the "no register" value returned on errors. */
+static const GLuint undef = REG(REG_TYPE_TEMP,
+				0,
+				SWIZZLE_XYZ,
+				SWIZZLE_W,
+				GL_FALSE,
+				GL_FALSE,
+				GL_FALSE);
+
+/* constant one source (VALID and BUILTIN set, swizzles select ONE) */
+static const GLuint pfs_one = REG(REG_TYPE_CONST,
+				  0,
+				  SWIZZLE_111,
+				  SWIZZLE_ONE,
+				  GL_FALSE,
+				  GL_TRUE,
+				  GL_TRUE);
+
+/* constant half source (VALID and BUILTIN set, swizzles select HALF) */
+static const GLuint pfs_half = REG(REG_TYPE_CONST,
+				   0,
+				   SWIZZLE_HHH,
+				   SWIZZLE_HALF,
+				   GL_FALSE,
+				   GL_TRUE,
+				   GL_TRUE);
+
+/* constant zero source (VALID and BUILTIN set, swizzles select ZERO) */
+static const GLuint pfs_zero = REG(REG_TYPE_CONST,
+				   0,
+				   SWIZZLE_000,
+				   SWIZZLE_ZERO,
+				   GL_FALSE,
+				   GL_TRUE,
+				   GL_TRUE);
+
+/*
+ * Forward declarations of common functions
+ */
+static void emit_arith(struct r300_pfs_compile_state *cs, int op,
+		       GLuint dest, int mask,
+		       GLuint src0, GLuint src1, GLuint src2, int flags);
+
+/**
+ * Allocate an R300 hardware temporary that may be written in \p slot.
+ *
+ * The first register whose 'free' slot number is non-negative and not
+ * larger than \p slot is taken.  Returns the register number; when no
+ * register qualifies an error is reported and 0 is returned.
+ */
+static int get_hw_temp(struct r300_pfs_compile_state *cs, int slot)
+{
+	COMPILE_STATE;
+	int hwreg = 0;
+
+	/* Skip registers that are in use (free < 0) or only become free
+	 * after the requested slot. */
+	while (hwreg < PFS_NUM_TEMP_REGS &&
+	       (cs->hwtemps[hwreg].free < 0 ||
+		cs->hwtemps[hwreg].free > slot))
+		hwreg++;
+
+	if (hwreg >= PFS_NUM_TEMP_REGS) {
+		ERROR("Out of hardware temps\n");
+		return 0;
+	}
+
+	/* Remember the slot from which the register was free.  This guards
+	 * against the following:
+	 *  - hardware temp X first backs Mesa temp Y during vector ops,
+	 *  - X is then handed to Mesa temp Z for further vector ops,
+	 *  - scalar ops on Z get scheduled into earlier slots and would
+	 *    overwrite the value of Y.
+	 */
+	cs->hwtemps[hwreg].reserved = cs->hwtemps[hwreg].free;
+	cs->hwtemps[hwreg].free = -1;
+
+	/* Start from a harmless state in case the program reads the
+	 * temporary before writing it; normally the first emit that writes
+	 * the register sets both fields. */
+	cs->hwtemps[hwreg].scalar_valid = 0;
+	cs->hwtemps[hwreg].vector_valid = 0;
+
+	/* Track the highest hardware temporary index handed out. */
+	if (code->max_temp_idx < hwreg)
+		code->max_temp_idx = hwreg;
+
+	return hwreg;
+}
+
+/**
+ * Allocate an R300 hardware temporary to receive the result of a TEX
+ * instruction.
+ *
+ * Only registers that have not been touched in the current node and whose
+ * 'free' slot number is exactly 0 are considered.  If there is none, the
+ * request is passed on to get_hw_temp(cs, 0), which will cause an
+ * indirection.
+ */
+static int get_hw_temp_tex(struct r300_pfs_compile_state *cs)
+{
+	COMPILE_STATE;
+	int hwreg;
+
+	for (hwreg = 0; hwreg < PFS_NUM_TEMP_REGS; hwreg++) {
+		/* Note: Be very careful here */
+		if (!(cs->used_in_node & (1 << hwreg)) &&
+		    cs->hwtemps[hwreg].free == 0)
+			break;
+	}
+
+	if (hwreg >= PFS_NUM_TEMP_REGS) {
+		/* Will cause an indirection */
+		return get_hw_temp(cs, 0);
+	}
+
+	cs->hwtemps[hwreg].reserved = cs->hwtemps[hwreg].free;
+	cs->hwtemps[hwreg].free = -1;
+
+	/* Unlike get_hw_temp(), both halves are marked valid from the
+	 * current end of the ALU slot list onwards. */
+	cs->hwtemps[hwreg].scalar_valid = cs->nrslots;
+	cs->hwtemps[hwreg].vector_valid = cs->nrslots;
+
+	/* Track the highest hardware temporary index handed out. */
+	if (code->max_temp_idx < hwreg)
+		code->max_temp_idx = hwreg;
+
+	return hwreg;
+}
+
+/**
+ * Mark the given hardware register as free.
+ *
+ * The register is not made available immediately: its 'free' slot number
+ * is set to one past the current slot count, and get_hw_temp() only hands
+ * out registers whose 'free' value is not larger than the slot asked for.
+ *
+ * \p idx is used as an index into cs->hwtemps without any range check.
+ */
+static void free_hw_temp(struct r300_pfs_compile_state *cs, int idx)
+{
+	// Be very careful here. Consider sequences like
+	//  MAD r0, r1,r2,r3
+	//  TEX r4, ...
+	// The TEX instruction may be moved in front of the MAD instruction
+	// due to the way nodes work. We don't want to alias r1 and r4 in
+	// this case.
+	// I'm certain the register allocation could be further sanitized,
+	// but it's tricky because of stuff that can happen inside emit_tex
+	// and emit_arith.
+	cs->hwtemps[idx].free = cs->nrslots + 1;
+}
+
+/**
+ * Allocate a fresh Mesa-level temporary register.
+ *
+ * Claims the lowest clear bit of cs->temp_in_use, resets the bookkeeping
+ * of that temporary (no hardware register assigned, refcount 0xFFFFFFFF)
+ * and returns a valid REG_TYPE_TEMP descriptor for it.  When every bit is
+ * taken an error is reported and the invalid register 'undef' is
+ * returned.
+ */
+static GLuint get_temp_reg(struct r300_pfs_compile_state *cs)
+{
+	COMPILE_STATE;
+	GLuint reg = undef;
+	GLuint bit;
+
+	/* ffs() counts from 1; 0 means there is no clear bit left */
+	bit = ffs(~cs->temp_in_use);
+	if (bit == 0) {
+		ERROR("Out of program temps\n");
+		return reg;
+	}
+	bit -= 1;
+
+	cs->temp_in_use |= (1 << bit);
+	cs->temps[bit].reg = -1;
+	cs->temps[bit].refcount = 0xFFFFFFFF;
+
+	REG_SET_VALID(reg, GL_TRUE);
+	REG_SET_INDEX(reg, bit);
+	REG_SET_TYPE(reg, REG_TYPE_TEMP);
+	return reg;
+}
+
+/**
+ * Free a Mesa temporary and the associated R300 temporary.
+ *
+ * For a REG_TYPE_TEMP register the hardware temporary is released, the
+ * mapping is reset to -1 and the bit in cs->temp_in_use is cleared.  For
+ * a REG_TYPE_INPUT register only the hardware temporary is released and
+ * the mapping reset.  Other register types are ignored.
+ *
+ * NOTE(review): the early return tests cs->temp_in_use with the register
+ * index before the type is looked at, so the REG_TYPE_INPUT branch is
+ * only reached when the temporary with the same index happens to be in
+ * use.  The callers visible here only pass temporaries -- confirm whether
+ * the input branch is meant to be reachable.
+ */
+static void free_temp(struct r300_pfs_compile_state *cs, GLuint r)
+{
+	GLuint index = REG_GET_INDEX(r);
+
+	if (!(cs->temp_in_use & (1 << index)))
+		return;
+
+	if (REG_GET_TYPE(r) == REG_TYPE_TEMP) {
+		free_hw_temp(cs, cs->temps[index].reg);
+		cs->temps[index].reg = -1;
+		cs->temp_in_use &= ~(1 << index);
+	} else if (REG_GET_TYPE(r) == REG_TYPE_INPUT) {
+		free_hw_temp(cs, cs->inputs[index].reg);
+		cs->inputs[index].reg = -1;
+	}
+}
+
+/**
+ * Register a hardware constant/parameter and return a register that
+ * refers to it.
+ *
+ * Constants are identified by pointer: if \p cp has been registered
+ * before, the existing slot is reused, otherwise it is appended to
+ * code->constant.  When the constant table is full an error is reported
+ * and the invalid register 'undef' is returned.
+ *
+ * \p cp Stable pointer to an array of 4 floats.
+ *  Only the pointer is stored, so it has to stay valid and keep pointing
+ *  at the contents of the constant/parameter for the lifetime of the
+ *  fragment program (actually, up until the next time the fragment
+ *  program is translated).
+ */
+static GLuint emit_const4fv(struct r300_pfs_compile_state *cs,
+			    const GLfloat * cp)
+{
+	COMPILE_STATE;
+	GLuint reg = undef;
+	int slot = 0;
+
+	/* Look for an earlier registration of the same pointer */
+	while (slot < code->const_nr && code->constant[slot] != cp)
+		slot++;
+
+	if (slot >= code->const_nr) {
+		/* Not registered yet: append, if there is room */
+		if (slot >= PFS_NUM_CONST_REGS) {
+			ERROR("Out of hw constants!\n");
+			return reg;
+		}
+
+		code->constant[slot] = cp;
+		code->const_nr++;
+	}
+
+	REG_SET_VALID(reg, GL_TRUE);
+	REG_SET_INDEX(reg, slot);
+	REG_SET_TYPE(reg, REG_TYPE_CONST);
+	return reg;
+}
+
+/* Return r with both the scalar and the vector negate bit set. */
+static inline GLuint negate(GLuint r)
+{
+	return r | REG_NEGS_MASK | REG_NEGV_MASK;
+}
+
+/* Hack: return r with the NO_USE bit set, so that reading it does not
+ * count as a use.  This keeps sources that are read several times while
+ * emulating non-native instructions from being released too early.
+ */
+static inline GLuint keep(GLuint r)
+{
+	const GLuint no_use = (GL_TRUE << REG_NO_USE_SHIFT) & REG_NO_USE_MASK;
+
+	return (r & ~REG_NO_USE_MASK) | no_use;
+}
+
+/* Return r with the absolute-value bit set. */
+static inline GLuint absolute(GLuint r)
+{
+	return r | REG_ABS_MASK;
+}
+/*
+ * Finish a swizzle that the hardware supports natively by applying the
+ * negate mask \p arbneg to \p src.
+ *
+ * Bit 3 of arbneg replaces the scalar negate bit of src.  For the low
+ * three bits:
+ *  - none set: the vector negate bit is cleared and src is stored in *r;
+ *  - all set:  the vector negate bit is set and src is stored in *r;
+ *  - mixed:    two MADs (src * 1 + 0) are emitted into *r, which is
+ *              replaced by a new temporary first if it is not valid.  The
+ *              first writes the components named in arbneg from the
+ *              negated source, the second writes the remaining vector
+ *              components plus W from the non-negated source.
+ *
+ * Always returns 3, the number of vector components handled.
+ */
+static int swz_native(struct r300_pfs_compile_state *cs,
+		      GLuint src, GLuint * r, GLuint arbneg)
+{
+	COMPILE_STATE;
+
+	/* Native swizzle, handle negation */
+	src = (src & ~REG_NEGS_MASK) | (((arbneg >> 3) & 1) << REG_NEGS_SHIFT);
+
+	if ((arbneg & 0x7) == 0x0) {
+		src = src & ~REG_NEGV_MASK;
+		*r = src;
+	} else if ((arbneg & 0x7) == 0x7) {
+		src |= REG_NEGV_MASK;
+		*r = src;
+	} else {
+		if (!REG_GET_VALID(*r))
+			*r = get_temp_reg(cs);
+		src |= REG_NEGV_MASK;
+		emit_arith(cs,
+			   PFS_OP_MAD,
+			   *r, arbneg & 0x7, keep(src), pfs_one, pfs_zero, 0);
+		src = src & ~REG_NEGV_MASK;
+		emit_arith(cs,
+			   PFS_OP_MAD,
+			   *r,
+			   (arbneg ^ 0x7) | WRITEMASK_W,
+			   src, pfs_one, pfs_zero, 0);
+	}
+
+	return 3;
+}
+/*
+ * Emit the part of a non-native swizzle that the current native swizzle
+ * of \p src gets right.
+ *
+ * \p mask is an index into s_mask[] naming the matched components, \p mc
+ * is the number of vector components already emitted by earlier calls and
+ * \p arbneg is the negate mask.  The result is accumulated in *r, which
+ * is replaced by a new temporary first if it is not valid.
+ *
+ * When this call completes the vector (mc plus the matched count equals
+ * 3) the W component is written as well and bit 3 of arbneg is ORed into
+ * the scalar negate bit of src.  Until then src is passed with the
+ * NO_USE bit set; on the completing call the bit is cleared.
+ *
+ * Depending on how arbneg overlaps the matched components, either one
+ * MAD (src * 1 + 0) is emitted, or two: one from the negated source for
+ * the negated components and one for the rest.
+ *
+ * Returns the number of components matched by s_mask[mask].
+ */
+static int swz_emit_partial(struct r300_pfs_compile_state *cs,
+			    GLuint src,
+			    GLuint * r, int mask, int mc, GLuint arbneg)
+{
+	COMPILE_STATE;
+	GLuint tmp;
+	GLuint wmask = 0;
+
+	if (!REG_GET_VALID(*r))
+		*r = get_temp_reg(cs);
+
+	/* A partial match, VSWZ/mask define what parts of the
+	 * desired swizzle we match
+	 */
+	if (mc + s_mask[mask].count == 3) {
+		wmask = WRITEMASK_W;
+		src |= ((arbneg >> 3) & 1) << REG_NEGS_SHIFT;
+	}
+
+	tmp = arbneg & s_mask[mask].mask;
+	if (tmp) {
+		tmp = tmp ^ s_mask[mask].mask;
+		if (tmp) {
+			emit_arith(cs,
+				   PFS_OP_MAD,
+				   *r,
+				   arbneg & s_mask[mask].mask,
+				   keep(src) | REG_NEGV_MASK,
+				   pfs_one, pfs_zero, 0);
+			if (!wmask) {
+				REG_SET_NO_USE(src, GL_TRUE);
+			} else {
+				REG_SET_NO_USE(src, GL_FALSE);
+			}
+			emit_arith(cs,
+				   PFS_OP_MAD,
+				   *r, tmp | wmask, src, pfs_one, pfs_zero, 0);
+		} else {
+			if (!wmask) {
+				REG_SET_NO_USE(src, GL_TRUE);
+			} else {
+				REG_SET_NO_USE(src, GL_FALSE);
+			}
+			emit_arith(cs,
+				   PFS_OP_MAD,
+				   *r,
+				   (arbneg & s_mask[mask].mask) | wmask,
+				   src | REG_NEGV_MASK, pfs_one, pfs_zero, 0);
+		}
+	} else {
+		if (!wmask) {
+			REG_SET_NO_USE(src, GL_TRUE);
+		} else {
+			REG_SET_NO_USE(src, GL_FALSE);
+		}
+		emit_arith(cs, PFS_OP_MAD,
+			   *r,
+			   s_mask[mask].mask | wmask,
+			   src, pfs_one, pfs_zero, 0);
+	}
+
+	return s_mask[mask].count;
+}
+/*
+ * Apply the swizzle \p arbswz and negate mask \p arbneg to \p src and
+ * return a register that yields the swizzled value.
+ *
+ * If src already carries a swizzle other than XYZ/W, arbswz is first
+ * composed with it.  The scalar swizzle is stored directly in src.  The
+ * vector part is then matched against the native swizzles in v_swiz[],
+ * trying the component sets of s_mask[] in table order: a full
+ * three-component match is finished by swz_native(), partial matches are
+ * emitted piecewise by swz_emit_partial() until three components have
+ * been produced.
+ *
+ * If the tables are exhausted first, an error is reported and whatever
+ * has been accumulated in r is returned.
+ */
+static GLuint do_swizzle(struct r300_pfs_compile_state *cs,
+			 GLuint src, GLuint arbswz, GLuint arbneg)
+{
+	COMPILE_STATE;
+	GLuint r = undef;
+	GLuint vswz;
+	int c_mask = 0;
+	int v_match = 0;
+
+	/* If swizzling from something without an XYZW native swizzle,
+	 * emit result to a temp, and do new swizzle from the temp.
+	 */
+#if 0
+	if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
+		GLuint temp = get_temp_reg(fp);
+		emit_arith(fp,
+			   PFS_OP_MAD,
+			   temp, WRITEMASK_XYZW, src, pfs_one, pfs_zero, 0);
+		src = temp;
+	}
+#endif
+
+	if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
+		GLuint vsrcswz =
+		    (v_swiz[REG_GET_VSWZ(src)].
+		     hash & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK)) |
+		    REG_GET_SSWZ(src) << 9;
+		GLint i;
+
+		GLuint newswz = 0;
+		GLuint offset;
+		for (i = 0; i < 4; ++i) {
+			offset = GET_SWZ(arbswz, i);
+
+			newswz |=
+			    (offset <= 3) ? GET_SWZ(vsrcswz,
+						    offset) << i *
+			    3 : offset << i * 3;
+		}
+
+		arbswz = newswz & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK);
+		REG_SET_SSWZ(src, GET_SWZ(newswz, 3));
+	} else {
+		/* set scalar swizzling */
+		REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
+
+	}
+	do {
+		vswz = REG_GET_VSWZ(src);
+		do {
+			int chash;
+
+			REG_SET_VSWZ(src, vswz);
+			chash = v_swiz[REG_GET_VSWZ(src)].hash &
+			    s_mask[c_mask].hash;
+
+			if (chash == (arbswz & s_mask[c_mask].hash)) {
+				if (s_mask[c_mask].count == 3) {
+					v_match += swz_native(cs,
+							      src, &r, arbneg);
+				} else {
+					v_match += swz_emit_partial(cs,
+								    src,
+								    &r,
+								    c_mask,
+								    v_match,
+								    arbneg);
+				}
+
+				if (v_match == 3)
+					return r;
+
+				/* Fill with something invalid.. all 0's was
+				 * wrong before, matched SWIZZLE_X.  So all
+				 * 1's will be okay for now
+				 */
+				arbswz |= (PFS_INVAL & s_mask[c_mask].hash);
+			}
+		} while (v_swiz[++vswz].hash != PFS_INVAL);
+		REG_SET_VSWZ(src, SWIZZLE_XYZ);
+	} while (s_mask[++c_mask].hash != PFS_INVAL);
+
+	ERROR("should NEVER get here\n");
+	return r;
+}
+
+/**
+ * Translate a Mesa source register into the internal register encoding.
+ *
+ * Temporaries and inputs are referenced by their index.  Local,
+ * environment, state, named and constant parameters are registered
+ * through emit_const4fv().  PROGRAM_BUILTIN yields pfs_one or pfs_zero,
+ * selected by the swizzle.
+ *
+ * Unless the register already selects one of the builtin constant
+ * swizzles, the source swizzle and NegateBase mask are then applied with
+ * do_swizzle(), which may emit instructions.  Abs and NegateAbs are
+ * applied last.
+ *
+ * An unknown register file or an unsupported builtin swizzle is reported
+ * as an error and the invalid register 'undef' is returned right away,
+ * without emitting anything.
+ */
+static GLuint t_src(struct r300_pfs_compile_state *cs,
+		    struct prog_src_register fpsrc)
+{
+	COMPILE_STATE;
+	GLuint r = undef;
+
+	switch (fpsrc.File) {
+	case PROGRAM_TEMPORARY:
+		REG_SET_INDEX(r, fpsrc.Index);
+		REG_SET_VALID(r, GL_TRUE);
+		REG_SET_TYPE(r, REG_TYPE_TEMP);
+		break;
+	case PROGRAM_INPUT:
+		REG_SET_INDEX(r, fpsrc.Index);
+		REG_SET_VALID(r, GL_TRUE);
+		REG_SET_TYPE(r, REG_TYPE_INPUT);
+		break;
+	case PROGRAM_LOCAL_PARAM:
+		r = emit_const4fv(cs,
+				  fp->mesa_program.Base.LocalParams[fpsrc.
+								    Index]);
+		break;
+	case PROGRAM_ENV_PARAM:
+		r = emit_const4fv(cs,
+			cs->compiler->r300->radeon.glCtx->FragmentProgram.Parameters[fpsrc.Index]);
+		break;
+	case PROGRAM_STATE_VAR:
+	case PROGRAM_NAMED_PARAM:
+	case PROGRAM_CONSTANT:
+		r = emit_const4fv(cs,
+				  fp->mesa_program.Base.Parameters->
+				  ParameterValues[fpsrc.Index]);
+		break;
+	case PROGRAM_BUILTIN:
+		switch(fpsrc.Swizzle) {
+		case SWIZZLE_1111: r = pfs_one; break;
+		case SWIZZLE_0000: r = pfs_zero; break;
+		default:
+			ERROR("bad PROGRAM_BUILTIN swizzle %u\n", fpsrc.Swizzle);
+			/* Bail out like the unknown-file case below rather
+			 * than swizzling an invalid register. */
+			return r;
+		}
+		break;
+	default:
+		ERROR("unknown SrcReg->File %x\n", fpsrc.File);
+		return r;
+	}
+
+	/* no point swizzling ONE/ZERO/HALF constants... */
+	if (REG_GET_VSWZ(r) < SWIZZLE_111 || REG_GET_SSWZ(r) < SWIZZLE_ZERO)
+		r = do_swizzle(cs, r, fpsrc.Swizzle, fpsrc.NegateBase);
+	if (fpsrc.Abs)
+		r = absolute(r);
+	if (fpsrc.NegateAbs)
+		r = negate(r);
+	return r;
+}
+
+/**
+ * Translate a source register for a scalar operation: the first swizzle
+ * component is replicated to all four positions before the register is
+ * handed to t_src().
+ */
+static GLuint t_scalar_src(struct r300_pfs_compile_state *cs,
+			   struct prog_src_register fpsrc)
+{
+	const int comp = GET_SWZ(fpsrc.Swizzle, 0);	/* X */
+	int replicated = 0;
+	int i;
+
+	/* one 3-bit field per component */
+	for (i = 0; i < 4; i++)
+		replicated |= comp << (3 * i);
+
+	fpsrc.Swizzle = replicated;
+	return t_src(cs, fpsrc);
+}
+
+/**
+ * Translate a Mesa destination register into the internal register
+ * encoding.
+ *
+ * Temporaries and the two fragment results FRAG_RESULT_COLR and
+ * FRAG_RESULT_DEPR are accepted.  Anything else is reported as an error
+ * and a register without the VALID bit is returned.
+ */
+static GLuint t_dst(struct r300_pfs_compile_state *cs,
+		    struct prog_dst_register dest)
+{
+	COMPILE_STATE;
+	GLuint reg = undef;
+
+	if (dest.File == PROGRAM_TEMPORARY) {
+		REG_SET_TYPE(reg, REG_TYPE_TEMP);
+		REG_SET_INDEX(reg, dest.Index);
+		REG_SET_VALID(reg, GL_TRUE);
+		return reg;
+	}
+
+	if (dest.File != PROGRAM_OUTPUT) {
+		ERROR("Bad DstReg->File 0x%x\n", dest.File);
+		return reg;
+	}
+
+	REG_SET_TYPE(reg, REG_TYPE_OUTPUT);
+	if (dest.Index == FRAG_RESULT_COLR || dest.Index == FRAG_RESULT_DEPR) {
+		REG_SET_INDEX(reg, dest.Index);
+		REG_SET_VALID(reg, GL_TRUE);
+	} else {
+		ERROR("Bad DstReg->Index 0x%x\n", dest.Index);
+	}
+	return reg;
+}
+/*
+ * Resolve a source register to a hardware source index.
+ *
+ * Temporaries and inputs resolve to the hardware temporary that backs
+ * them; a temporary without one gets one allocated on the spot.
+ * Constants resolve to their index ORed with SRC_CONST.  Any other type
+ * is reported as an error and resolves to constant 0.
+ *
+ * Side effects: unless the NO_USE bit is set in src, the refcount of the
+ * temporary/input is decremented and the register is released when it
+ * drops to zero.  Unless \p tex is set, the hardware register is also
+ * recorded in cs->used_in_node.
+ */
+static int t_hw_src(struct r300_pfs_compile_state *cs, GLuint src, GLboolean tex)
+{
+	COMPILE_STATE;
+	int idx;
+	int index = REG_GET_INDEX(src);
+
+	switch (REG_GET_TYPE(src)) {
+	case REG_TYPE_TEMP:
+		/* NOTE: if reg==-1 here, a source is being read that
+		 *       hasn't been written to. Undefined results.
+		 */
+		if (cs->temps[index].reg == -1)
+			cs->temps[index].reg = get_hw_temp(cs, cs->nrslots);
+
+		idx = cs->temps[index].reg;
+
+		if (!REG_GET_NO_USE(src) && (--cs->temps[index].refcount == 0))
+			free_temp(cs, src);
+		break;
+	case REG_TYPE_INPUT:
+		idx = cs->inputs[index].reg;
+
+		if (!REG_GET_NO_USE(src) && (--cs->inputs[index].refcount == 0))
+			free_hw_temp(cs, cs->inputs[index].reg);
+		break;
+	case REG_TYPE_CONST:
+		return (index | SRC_CONST);
+	default:
+		ERROR("Invalid type for source reg\n");
+		return (0 | SRC_CONST);
+	}
+
+	if (!tex)
+		cs->used_in_node |= (1 << idx);
+
+	return idx;
+}
+
+/**
+ * Resolve a destination register to a hardware destination index.
+ *
+ * For an output the index is returned unchanged; the current node is
+ * flagged with R300_RGBA_OUT or R300_W_OUT, and a depth write is also
+ * recorded in fp->WritesDepth.
+ *
+ * For a temporary the backing hardware temporary is returned.  If there
+ * is none yet, one is allocated with get_hw_temp_tex() when \p tex is
+ * set, otherwise with get_hw_temp() for \p slot.  Unless the NO_USE bit
+ * is set, the refcount is decremented and the temporary released when it
+ * drops to zero.  The hardware register is recorded in both
+ * cs->dest_in_node and cs->used_in_node.
+ *
+ * Any other register type is reported as an error and 0 is returned.
+ */
+static int t_hw_dst(struct r300_pfs_compile_state *cs,
+		    GLuint dest, GLboolean tex, int slot)
+{
+	COMPILE_STATE;
+	GLuint index = REG_GET_INDEX(dest);
+	int hwreg;
+
+	assert(REG_GET_VALID(dest));
+
+	if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
+		if (index == FRAG_RESULT_COLR) {
+			code->node[code->cur_node].flags |= R300_RGBA_OUT;
+		} else if (index == FRAG_RESULT_DEPR) {
+			fp->WritesDepth = GL_TRUE;
+			code->node[code->cur_node].flags |= R300_W_OUT;
+		}
+		return index;
+	}
+
+	if (REG_GET_TYPE(dest) != REG_TYPE_TEMP) {
+		ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
+		return 0;
+	}
+
+	/* Temporary: make sure a hardware register backs it */
+	if (cs->temps[index].reg == -1) {
+		if (tex) {
+			cs->temps[index].reg = get_hw_temp_tex(cs);
+		} else {
+			cs->temps[index].reg = get_hw_temp(cs, slot);
+		}
+	}
+	hwreg = cs->temps[index].reg;
+
+	if (!REG_GET_NO_USE(dest) && (--cs->temps[index].refcount == 0))
+		free_temp(cs, dest);
+
+	cs->used_in_node |= (1 << hwreg);
+	cs->dest_in_node |= (1 << hwreg);
+
+	return hwreg;
+}
+
+/**
+ * Append an empty ALU slot (NOP_INST0..3) to the instruction list.
+ * Reports an error and emits nothing when all slots are taken.
+ */
+static void emit_nop(struct r300_pfs_compile_state *cs)
+{
+	COMPILE_STATE;
+
+	if (cs->nrslots < PFS_MAX_ALU_INST) {
+		code->alu.inst[cs->nrslots].inst3 = NOP_INST3;
+		code->alu.inst[cs->nrslots].inst2 = NOP_INST2;
+		code->alu.inst[cs->nrslots].inst1 = NOP_INST1;
+		code->alu.inst[cs->nrslots].inst0 = NOP_INST0;
+		cs->nrslots += 1;
+	} else {
+		ERROR("Out of ALU instruction slots\n");
+	}
+}
+
+/**
+ * Emit a texture instruction.
+ *
+ * \p fpi supplies the coordinate source (SrcReg[0]), the destination and
+ * the texture unit; \p opcode is the hardware TEX opcode.  For
+ * R300_TEX_OP_KIL no destination is resolved and both the destination
+ * register and the unit are emitted as 0.
+ *
+ * A new node (texture indirection) is started when the coordinate
+ * register has been written in the current node, or when the destination
+ * register has already been read or written in it.  If that would exceed
+ * PFS_MAX_TEX_INDIRECT nodes, an error is reported and nothing is
+ * emitted; code->cur_node is left pointing at the last valid node so
+ * that later accesses to code->node[code->cur_node] stay in range.
+ */
+static void emit_tex(struct r300_pfs_compile_state *cs,
+		     struct prog_instruction *fpi, int opcode)
+{
+	COMPILE_STATE;
+	GLuint coord = t_src(cs, fpi->SrcReg[0]);
+	GLuint dest = undef;
+	GLuint din, uin;
+	int unit = fpi->TexSrcUnit;
+	int hwsrc, hwdest;
+
+	/* Ensure correct node indirection: snapshot the node state before
+	 * resolving registers, which modifies it */
+	uin = cs->used_in_node;
+	din = cs->dest_in_node;
+
+	/* Resolve source/dest to hardware registers */
+	hwsrc = t_hw_src(cs, coord, GL_TRUE);
+
+	if (opcode != R300_TEX_OP_KIL) {
+		dest = t_dst(cs, fpi->DstReg);
+
+		hwdest =
+		    t_hw_dst(cs, dest, GL_TRUE,
+			     code->node[code->cur_node].alu_offset);
+
+		/* Use a temp that hasn't been used in this node, rather
+		 * than causing an indirection
+		 */
+		if (uin & (1 << hwdest)) {
+			free_hw_temp(cs, hwdest);
+			hwdest = get_hw_temp_tex(cs);
+			cs->temps[REG_GET_INDEX(dest)].reg = hwdest;
+		}
+	} else {
+		hwdest = 0;
+		unit = 0;
+	}
+
+	/* Indirection if source has been written in this node, or if the
+	 * dest has been read/written in this node
+	 */
+	if ((REG_GET_TYPE(coord) != REG_TYPE_CONST &&
+	     (din & (1 << hwsrc))) || (uin & (1 << hwdest))) {
+
+		/* Finish off current node; a node must contain at least
+		 * one ALU instruction */
+		if (code->node[code->cur_node].alu_offset == cs->nrslots)
+			emit_nop(cs);
+
+		code->node[code->cur_node].alu_end =
+		    cs->nrslots - code->node[code->cur_node].alu_offset - 1;
+		assert(code->node[code->cur_node].alu_end >= 0);
+
+		/* Check before advancing, so that cur_node never ends up
+		 * out of range on the error path */
+		if (code->cur_node + 1 >= PFS_MAX_TEX_INDIRECT) {
+			ERROR("too many levels of texture indirection\n");
+			return;
+		}
+		code->cur_node++;
+
+		/* Start new node */
+		code->node[code->cur_node].tex_offset = code->tex.length;
+		code->node[code->cur_node].alu_offset = cs->nrslots;
+		code->node[code->cur_node].tex_end = -1;
+		code->node[code->cur_node].alu_end = -1;
+		code->node[code->cur_node].flags = 0;
+		cs->used_in_node = 0;
+		cs->dest_in_node = 0;
+	}
+
+	if (code->cur_node == 0)
+		code->first_node_has_tex = 1;
+
+	code->tex.inst[code->tex.length++] = 0 | (hwsrc << R300_SRC_ADDR_SHIFT)
+	    | (hwdest << R300_DST_ADDR_SHIFT)
+	    | (unit << R300_TEX_ID_SHIFT)
+	    | (opcode << R300_TEX_INST_SHIFT);
+
+	cs->dest_in_node |= (1 << hwdest);
+	if (REG_GET_TYPE(coord) != REG_TYPE_CONST)
+		cs->used_in_node |= (1 << hwsrc);
+
+	code->node[code->cur_node].tex_end++;
+}
+
+/**
+ * Compute the first ALU slot in which \p dest may be written as far as
+ * register allocation is concerned.
+ *
+ * Outputs and temporaries that have no hardware register yet may be
+ * written from slot 0.  Otherwise the result is the slot the hardware
+ * register was reserved from, raised to the last slot that reads the
+ * vector part (if \p mask writes X, Y or Z) and to the last slot that
+ * reads the scalar part (if \p mask writes W).
+ */
+static int get_earliest_allowed_write(struct r300_pfs_compile_state *cs,
+				      GLuint dest, int mask)
+{
+	COMPILE_STATE;
+	GLuint index = REG_GET_INDEX(dest);
+	int hwreg;
+	int earliest;
+
+	assert(REG_GET_VALID(dest));
+
+	if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT)
+		return 0;
+
+	if (REG_GET_TYPE(dest) != REG_TYPE_TEMP) {
+		ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
+		return 0;
+	}
+
+	/* No hardware register assigned yet: nothing constrains the write */
+	if (cs->temps[index].reg == -1)
+		return 0;
+	hwreg = cs->temps[index].reg;
+
+	earliest = cs->hwtemps[hwreg].reserved;
+	if ((mask & WRITEMASK_XYZ) &&
+	    cs->hwtemps[hwreg].vector_lastread > earliest)
+		earliest = cs->hwtemps[hwreg].vector_lastread;
+	if ((mask & WRITEMASK_W) &&
+	    cs->hwtemps[hwreg].scalar_lastread > earliest)
+		earliest = cs->hwtemps[hwreg].scalar_lastread;
+
+	return earliest;
+}
+
+/**
+ * Allocates a slot for an ALU instruction that can consist of
+ * a vector part or a scalar part or both.
+ *
+ * Sources from src (src[0] to src[argc-1]) are added to the slot in the
+ * appropriate position (vector and/or scalar), and their positions are
+ * recorded in the srcpos array.
+ *
+ * This function emits instruction code for the source fetch and the
+ * argument selection. It does not emit instruction code for the
+ * opcode or the destination selection.
+ *
+ * The search starts at the latest of: the earliest slot register
+ * allocation allows for dest/mask, the first slot of the current node,
+ * and the slots from which the temporary sources hold valid data. A
+ * slot fits if the requested vector/scalar operation halves are still
+ * unused and every source can share or claim one of its three source
+ * fetch positions. New slots are appended (filled with NOP_INST0..3)
+ * as needed.
+ *
+ * Resolving the sources goes through t_hw_src(), so refcounts are
+ * updated even if no slot is found.
+ *
+ * NOTE(review): the search loop reads cs->slot[pos] before pos is
+ * compared against cs->nrslots / PFS_MAX_ALU_INST; whether that read
+ * can go past the end of cs->slot depends on how the array is sized,
+ * which is not visible here -- confirm.
+ *
+ * @return the index of the slot, or -1 if all ALU slots are taken
+ */
+static int find_and_prepare_slot(struct r300_pfs_compile_state *cs,
+				 GLboolean emit_vop,
+				 GLboolean emit_sop,
+				 int argc, GLuint * src, GLuint dest, int mask)
+{
+	COMPILE_STATE;
+	int hwsrc[3];
+	int srcpos[3];
+	unsigned int used;
+	int tempused;
+	int tempvsrc[3];
+	int tempssrc[3];
+	int pos;
+	int regnr;
+	int i, j;
+
+	// Determine instruction slots, whether sources are required on
+	// vector or scalar side, and the smallest slot number where
+	// all source registers are available
+	used = 0;
+	if (emit_vop)
+		used |= SLOT_OP_VECTOR;
+	if (emit_sop)
+		used |= SLOT_OP_SCALAR;
+
+	pos = get_earliest_allowed_write(cs, dest, mask);
+
+	if (code->node[code->cur_node].alu_offset > pos)
+		pos = code->node[code->cur_node].alu_offset;
+	for (i = 0; i < argc; ++i) {
+		if (!REG_GET_BUILTIN(src[i])) {
+			if (emit_vop)
+				used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i;
+			if (emit_sop)
+				used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
+		}
+
+		hwsrc[i] = t_hw_src(cs, src[i], GL_FALSE);	/* Note: sideeffects wrt refcounting! */
+		regnr = hwsrc[i] & 31;
+
+		if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
+			if (used & (SLOT_SRC_VECTOR << i)) {
+				if (cs->hwtemps[regnr].vector_valid > pos)
+					pos = cs->hwtemps[regnr].vector_valid;
+			}
+			if (used & (SLOT_SRC_SCALAR << i)) {
+				if (cs->hwtemps[regnr].scalar_valid > pos)
+					pos = cs->hwtemps[regnr].scalar_valid;
+			}
+		}
+	}
+
+	// Find a slot that fits
+	for (;; ++pos) {
+		if (cs->slot[pos].used & used & SLOT_OP_BOTH)
+			continue;
+
+		if (pos >= cs->nrslots) {
+			if (cs->nrslots >= PFS_MAX_ALU_INST) {
+				ERROR("Out of ALU instruction slots\n");
+				return -1;
+			}
+
+			code->alu.inst[pos].inst0 = NOP_INST0;
+			code->alu.inst[pos].inst1 = NOP_INST1;
+			code->alu.inst[pos].inst2 = NOP_INST2;
+			code->alu.inst[pos].inst3 = NOP_INST3;
+
+			cs->nrslots++;
+		}
+		// Note: When we need both parts (vector and scalar) of a source,
+		// we always try to put them into the same position. This makes the
+		// code easier to read, and it is optimal (i.e. one doesn't gain
+		// anything by splitting the parts).
+		// It also avoids headaches with swizzles that access both parts (i.e WXY)
+		tempused = cs->slot[pos].used;
+		for (i = 0; i < 3; ++i) {
+			tempvsrc[i] = cs->slot[pos].vsrc[i];
+			tempssrc[i] = cs->slot[pos].ssrc[i];
+		}
+
+		for (i = 0; i < argc; ++i) {
+			int flags = (used >> i) & SLOT_SRC_BOTH;
+
+			if (!flags) {
+				srcpos[i] = 0;
+				continue;
+			}
+
+			for (j = 0; j < 3; ++j) {
+				if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
+					if (tempvsrc[j] != hwsrc[i])
+						continue;
+				}
+
+				if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
+					if (tempssrc[j] != hwsrc[i])
+						continue;
+				}
+
+				break;
+			}
+
+			if (j == 3)
+				break;
+
+			srcpos[i] = j;
+			tempused |= flags << j;
+			if (flags & SLOT_SRC_VECTOR)
+				tempvsrc[j] = hwsrc[i];
+			if (flags & SLOT_SRC_SCALAR)
+				tempssrc[j] = hwsrc[i];
+		}
+
+		if (i == argc)
+			break;
+	}
+
+	// Found a slot, reserve it
+	cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
+	for (i = 0; i < 3; ++i) {
+		cs->slot[pos].vsrc[i] = tempvsrc[i];
+		cs->slot[pos].ssrc[i] = tempssrc[i];
+	}
+
+	for (i = 0; i < argc; ++i) {
+		if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
+			int regnr = hwsrc[i] & 31;
+
+			if (used & (SLOT_SRC_VECTOR << i)) {
+				if (cs->hwtemps[regnr].vector_lastread < pos)
+					cs->hwtemps[regnr].vector_lastread =
+					    pos;
+			}
+			if (used & (SLOT_SRC_SCALAR << i)) {
+				if (cs->hwtemps[regnr].scalar_lastread < pos)
+					cs->hwtemps[regnr].scalar_lastread =
+					    pos;
+			}
+		}
+	}
+
+	// Emit the source fetch code
+	code->alu.inst[pos].inst1 &= ~R300_ALU_SRC_MASK;
+	code->alu.inst[pos].inst1 |=
+	    ((cs->slot[pos].vsrc[0] << R300_ALU_SRC0C_SHIFT) |
+	     (cs->slot[pos].vsrc[1] << R300_ALU_SRC1C_SHIFT) |
+	     (cs->slot[pos].vsrc[2] << R300_ALU_SRC2C_SHIFT));
+
+	code->alu.inst[pos].inst3 &= ~R300_ALU_SRC_MASK;
+	code->alu.inst[pos].inst3 |=
+	    ((cs->slot[pos].ssrc[0] << R300_ALU_SRC0A_SHIFT) |
+	     (cs->slot[pos].ssrc[1] << R300_ALU_SRC1A_SHIFT) |
+	     (cs->slot[pos].ssrc[2] << R300_ALU_SRC2A_SHIFT));
+
+	// Emit the argument selection code
+	if (emit_vop) {
+		int swz[3];
+
+		for (i = 0; i < 3; ++i) {
+			if (i < argc) {
+				swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
+					  (srcpos[i] *
+					   v_swiz[REG_GET_VSWZ(src[i])].
+					   stride)) | ((src[i] & REG_NEGV_MASK)
+						       ? ARG_NEG : 0) | ((src[i]
+									  &
+									  REG_ABS_MASK)
+									 ?
+									 ARG_ABS
+									 : 0);
+			} else {
+				swz[i] = R300_ALU_ARGC_ZERO;
+			}
+		}
+
+		code->alu.inst[pos].inst0 &=
+		    ~(R300_ALU_ARG0C_MASK | R300_ALU_ARG1C_MASK |
+		      R300_ALU_ARG2C_MASK);
+		code->alu.inst[pos].inst0 |=
+		    (swz[0] << R300_ALU_ARG0C_SHIFT) | (swz[1] <<
+							 R300_ALU_ARG1C_SHIFT)
+		    | (swz[2] << R300_ALU_ARG2C_SHIFT);
+	}
+
+	if (emit_sop) {
+		int swz[3];
+
+		for (i = 0; i < 3; ++i) {
+			if (i < argc) {
+				swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
+					  (srcpos[i] *
+					   s_swiz[REG_GET_SSWZ(src[i])].
+					   stride)) | ((src[i] & REG_NEGS_MASK)
+						       ? ARG_NEG : 0) | ((src[i]
+									  &
+									  REG_ABS_MASK)
+									 ?
+									 ARG_ABS
+									 : 0);
+			} else {
+				swz[i] = R300_ALU_ARGA_ZERO;
+			}
+		}
+
+		code->alu.inst[pos].inst2 &=
+		    ~(R300_ALU_ARG0A_MASK | R300_ALU_ARG1A_MASK |
+		      R300_ALU_ARG2A_MASK);
+		code->alu.inst[pos].inst2 |=
+		    (swz[0] << R300_ALU_ARG0A_SHIFT) | (swz[1] <<
+							 R300_ALU_ARG1A_SHIFT)
+		    | (swz[2] << R300_ALU_ARG2A_SHIFT);
+	}
+
+	return pos;
+}
+
+/**
+ * Append an ALU instruction to the instruction list.
+ *
+ * \p op indexes r300_fpop[] (no range check is done), \p mask is the
+ * destination write mask and \p flags may contain PFS_FLAG_SAT to clamp
+ * the result. Only the first argc sources of the operation are used.
+ *
+ * A write to the depth output is only emitted if \p mask contains Z; it
+ * is then turned into a write of W. Other depth writes are dropped.
+ *
+ * The vector half is emitted if mask writes X, Y or Z, or if the vector
+ * opcode is DP3; the scalar half if mask writes W, or if the vector
+ * opcode is REPL_ALPHA. Nothing is emitted if no slot can be found.
+ *
+ * NOTE(review): a DP3 to the depth output enables the vector half, and
+ * the vector half hits assert(0) for any output other than
+ * FRAG_RESULT_COLR -- confirm that callers never emit this combination.
+ */
+static void emit_arith(struct r300_pfs_compile_state *cs,
+		       int op,
+		       GLuint dest,
+		       int mask,
+		       GLuint src0, GLuint src1, GLuint src2, int flags)
+{
+	COMPILE_STATE;
+	GLuint src[3] = { src0, src1, src2 };
+	int hwdest;
+	GLboolean emit_vop, emit_sop;
+	int vop, sop, argc;
+	int pos;
+
+	vop = r300_fpop[op].v_op;
+	sop = r300_fpop[op].s_op;
+	argc = r300_fpop[op].argc;
+
+	if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
+	    REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
+		if (mask & WRITEMASK_Z) {
+			mask = WRITEMASK_W;
+		} else {
+			return;
+		}
+	}
+
+	emit_vop = GL_FALSE;
+	emit_sop = GL_FALSE;
+	if ((mask & WRITEMASK_XYZ) || vop == R300_ALU_OUTC_DP3)
+		emit_vop = GL_TRUE;
+	if ((mask & WRITEMASK_W) || vop == R300_ALU_OUTC_REPL_ALPHA)
+		emit_sop = GL_TRUE;
+
+	pos =
+	    find_and_prepare_slot(cs, emit_vop, emit_sop, argc, src, dest,
+				  mask);
+	if (pos < 0)
+		return;
+
+	hwdest = t_hw_dst(cs, dest, GL_FALSE, pos);	/* Note: Side effects wrt register allocation */
+
+	if (flags & PFS_FLAG_SAT) {
+		vop |= R300_ALU_OUTC_CLAMP;
+		sop |= R300_ALU_OUTA_CLAMP;
+	}
+
+	/* Throw the pieces together and get ALU/1 */
+	if (emit_vop) {
+		code->alu.inst[pos].inst0 |= vop;
+
+		code->alu.inst[pos].inst1 |= hwdest << R300_ALU_DSTC_SHIFT;
+
+		if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
+			if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
+				code->alu.inst[pos].inst1 |=
+				    (mask & WRITEMASK_XYZ) <<
+				    R300_ALU_DSTC_OUTPUT_MASK_SHIFT;
+			} else
+				assert(0);
+		} else {
+			code->alu.inst[pos].inst1 |=
+			    (mask & WRITEMASK_XYZ) <<
+			    R300_ALU_DSTC_REG_MASK_SHIFT;
+
+			cs->hwtemps[hwdest].vector_valid = pos + 1;
+		}
+	}
+
+	/* And now ALU/3 */
+	if (emit_sop) {
+		code->alu.inst[pos].inst2 |= sop;
+
+		if (mask & WRITEMASK_W) {
+			if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
+				if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
+					code->alu.inst[pos].inst3 |=
+					    (hwdest << R300_ALU_DSTA_SHIFT) |
+					    R300_ALU_DSTA_OUTPUT;
+				} else if (REG_GET_INDEX(dest) ==
+					   FRAG_RESULT_DEPR) {
+					code->alu.inst[pos].inst3 |=
+					    R300_ALU_DSTA_DEPTH;
+				} else
+					assert(0);
+			} else {
+				code->alu.inst[pos].inst3 |=
+				    (hwdest << R300_ALU_DSTA_SHIFT) |
+				    R300_ALU_DSTA_REG;
+
+				cs->hwtemps[hwdest].scalar_valid = pos + 1;
+			}
+		}
+	}
+
+	return;
+}
+
+static const GLfloat SinCosConsts[2][4] = {	/* read-only constants for the SIN/COS/SCS expansions; row 0 and row 1 are uploaded separately via emit_const4fv() */
+	{
+	 1.273239545,		// 4/PI
+	 -0.405284735,		// -4/(PI*PI)
+	 3.141592654,		// PI
+	 0.2225			// weight of the y*abs(y) - y correction term
+	 },
+	{
+	 0.75,			// offset added before FRC in the COS range reduction
+	 0.0,			// no visible user swizzles this component
+	 0.159154943,		// 1/(2*PI)
+	 6.283185307		// 2*PI
+	 }
+};
+
+/**
+ * Emit a LIT instruction.
+ * \p flags may be PFS_FLAG_SAT; \p mask selects which result components reach \p dest.
+ *
+ * Definition of LIT (from ARB_fragment_program):
+ * tmp = VectorLoad(op0);
+ * if (tmp.x < 0) tmp.x = 0;
+ * if (tmp.y < 0) tmp.y = 0;
+ * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
+ * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
+ * result.x = 1.0;
+ * result.y = tmp.x;
+ * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
+ * result.w = 1.0;
+ *
+ * The longest path of computation is the one leading to result.z,
+ * consisting of 5 operations. This implementation of LIT takes
+ * 5 slots. So unless there's some special undocumented opcode,
+ * this implementation is potentially optimal. Unfortunately,
+ * emit_arith is a bit too conservative because it doesn't understand
+ * partial writes to the vector components.
+ */
+static const GLfloat LitConst[4] =
+    { 127.999999, 127.999999, 127.999999, -127.999999 };	/* xyz: upper bound (128-epsilon), w: lower bound for tmp.w */
+
+static void emit_lit(struct r300_pfs_compile_state *cs,
+		     GLuint dest, int mask, GLuint src, int flags)
+{
+	COMPILE_STATE;
+	GLuint cnst;
+	int needTemporary;
+	GLuint temp;
+
+	cnst = emit_const4fv(cs, LitConst);
+
+	needTemporary = 0;
+	if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) {
+		needTemporary = 1;	/* the sequence below writes all four components, so a partial write cannot run on dest itself */
+	} else if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
+		// LIT is typically followed by DP3/DP4, so there's no point
+		// in creating special code for this case
+		needTemporary = 1;
+	}
+
+	if (needTemporary) {
+		temp = keep(get_temp_reg(cs));
+	} else {
+		temp = keep(dest);	/* build the result in place */
+	}
+
+	// Note: The order of emit_arith inside the slots is relevant,
+	// because emit_arith only looks at scalar vs. vector when resolving
+	// dependencies, and it does not consider individual vector components,
+	// so swizzling between the two parts can create fake dependencies.
+
+	// First slot
+	emit_arith(cs, PFS_OP_MAX, temp, WRITEMASK_XY,
+		   keep(src), pfs_zero, undef, 0);	/* temp.xy = max(src.xy, 0) */
+	emit_arith(cs, PFS_OP_MAX, temp, WRITEMASK_W, src, cnst, undef, 0);	/* temp.w = max(src.w, lower bound) */
+
+	// Second slot
+	emit_arith(cs, PFS_OP_MIN, temp, WRITEMASK_Z,
+		   swizzle(temp, W, W, W, W), cnst, undef, 0);	/* temp.z = min(temp.w, upper bound) */
+	emit_arith(cs, PFS_OP_LG2, temp, WRITEMASK_W,
+		   swizzle(temp, Y, Y, Y, Y), undef, undef, 0);	/* temp.w = log2(temp.y) */
+
+	// Third slot
+	// If desired, we saturate the y result here.
+	// This does not affect the use as a condition variable in the CMP later
+	emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_W,
+		   temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0);	/* temp.w = temp.w * temp.z */
+	emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_Y,
+		   swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags);	/* temp.y = temp.x (result.y) */
+
+	// Fourth slot
+	emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_X,
+		   pfs_one, pfs_one, pfs_zero, 0);	/* temp.x = 1 (result.x) */
+	emit_arith(cs, PFS_OP_EX2, temp, WRITEMASK_W, temp, undef, undef, 0);	/* temp.w = exp2(temp.w) */
+
+	// Fifth slot
+	emit_arith(cs, PFS_OP_CMP, temp, WRITEMASK_Z,
+		   pfs_zero, swizzle(temp, W, W, W, W),
+		   negate(swizzle(temp, Y, Y, Y, Y)), flags);	/* temp.z = (-temp.y < 0) ? temp.w : 0 (result.z) */
+	emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_W, pfs_one, pfs_one,
+		   pfs_zero, 0);	/* temp.w = 1 (result.w) */
+
+	if (needTemporary) {
+		emit_arith(cs, PFS_OP_MAD, dest, mask,
+			   temp, pfs_one, pfs_zero, flags);	/* copy the requested components: dest = temp * 1 + 0 */
+		free_temp(cs, temp);
+	} else {
+		// Decrease refcount of the destination
+		t_hw_dst(cs, dest, GL_FALSE, cs->nrslots);
+	}
+}
+
+static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_instruction *fpi)
+{	/* Expand one Mesa instruction \p fpi into emit_arith()/emit_lit()/emit_tex() calls; unknown opcodes go to ERROR(). */
+	COMPILE_STATE;
+	GLuint src[3], dest, temp[2];
+	int flags, mask = 0;
+	GLuint const_sin[2];	/* registers returned by emit_const4fv() for the two SinCosConsts rows */
+
+	if (fpi->SaturateMode == SATURATE_ZERO_ONE)
+		flags = PFS_FLAG_SAT;
+	else
+		flags = 0;
+
+	if (fpi->Opcode != OPCODE_KIL) {	/* KIL has no destination; dest stays unset and is not read on that path */
+		dest = t_dst(cs, fpi->DstReg);
+		mask = fpi->DstReg.WriteMask;
+	}
+
+	switch (fpi->Opcode) {
+	case OPCODE_ADD:
+		src[0] = t_src(cs, fpi->SrcReg[0]);
+		src[1] = t_src(cs, fpi->SrcReg[1]);
+		emit_arith(cs, PFS_OP_MAD, dest, mask,
+				src[0], pfs_one, src[1], flags);	/* src0 * 1 + src1 */
+		break;
+	case OPCODE_CMP:
+		src[0] = t_src(cs, fpi->SrcReg[0]);
+		src[1] = t_src(cs, fpi->SrcReg[1]);
+		src[2] = t_src(cs, fpi->SrcReg[2]);
+		/* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
+			*    r300 - if src2.c < 0.0 ? src1.c : src0.c
+			*/
+		emit_arith(cs, PFS_OP_CMP, dest, mask,
+				src[2], src[1], src[0], flags);	/* operands 0 and 2 swapped to match the hardware order */
+		break;
+	case OPCODE_COS:
+		/*
+			* cos using a parabola (see SIN):
+			* cos(x):
+			*   x = (x/(2*PI))+0.75
+			*   x = frac(x)
+			*   x = (x*2*PI)-PI
+			*   result = sin(x)
+			*/
+		temp[0] = get_temp_reg(cs);
+		const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
+		const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
+		src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
+
+		/* add 0.5*PI and do range reduction */
+
+		emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
+				swizzle(src[0], X, X, X, X),
+				swizzle(const_sin[1], Z, Z, Z, Z),
+				swizzle(const_sin[1], X, X, X, X), 0);
+
+		emit_arith(cs, PFS_OP_FRC, temp[0], WRITEMASK_X,
+				swizzle(temp[0], X, X, X, X),
+				undef, undef, 0);
+
+		emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W),	//2*PI
+				negate(swizzle(const_sin[0], Z, Z, Z, Z)),	//-PI
+				0);
+
+		/* SIN */
+
+		emit_arith(cs, PFS_OP_MAD, temp[0],
+				WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
+								Z, Z, Z,
+								Z),
+				const_sin[0], pfs_zero, 0);
+
+		emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
+				swizzle(temp[0], Y, Y, Y, Y),
+				absolute(swizzle(temp[0], Z, Z, Z, Z)),
+				swizzle(temp[0], X, X, X, X), 0);
+
+		emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Y,
+				swizzle(temp[0], X, X, X, X),
+				absolute(swizzle(temp[0], X, X, X, X)),
+				negate(swizzle(temp[0], X, X, X, X)), 0);
+
+		emit_arith(cs, PFS_OP_MAD, dest, mask,
+				swizzle(temp[0], Y, Y, Y, Y),
+				swizzle(const_sin[0], W, W, W, W),
+				swizzle(temp[0], X, X, X, X), flags);
+
+		free_temp(cs, temp[0]);
+		break;
+	case OPCODE_DP3:
+		src[0] = t_src(cs, fpi->SrcReg[0]);
+		src[1] = t_src(cs, fpi->SrcReg[1]);
+		emit_arith(cs, PFS_OP_DP3, dest, mask,
+				src[0], src[1], undef, flags);
+		break;
+	case OPCODE_DP4:
+		src[0] = t_src(cs, fpi->SrcReg[0]);
+		src[1] = t_src(cs, fpi->SrcReg[1]);
+		emit_arith(cs, PFS_OP_DP4, dest, mask,
+				src[0], src[1], undef, flags);
+		break;
+	case OPCODE_DST:
+		src[0] = t_src(cs, fpi->SrcReg[0]);
+		src[1] = t_src(cs, fpi->SrcReg[1]);
+		/* dest.y = src0.y * src1.y */
+		if (mask & WRITEMASK_Y)
+			emit_arith(cs, PFS_OP_MAD, dest, WRITEMASK_Y,
+					keep(src[0]), keep(src[1]),
+					pfs_zero, flags);
+		/* dest.z = src0.z */
+		if (mask & WRITEMASK_Z)
+			emit_arith(cs, PFS_OP_MAD, dest, WRITEMASK_Z,
+					src[0], pfs_one, pfs_zero, flags);
+		/* result.x = 1.0
+			* result.w = src1.w */
+		if (mask & WRITEMASK_XW) {
+			REG_SET_VSWZ(src[1], SWIZZLE_111);	/*Cheat */
+			emit_arith(cs, PFS_OP_MAD, dest,
+					mask & WRITEMASK_XW,
+					src[1], pfs_one, pfs_zero, flags);
+		}
+		break;
+	case OPCODE_EX2:
+		src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
+		emit_arith(cs, PFS_OP_EX2, dest, mask,
+				src[0], undef, undef, flags);
+		break;
+	case OPCODE_FRC:
+		src[0] = t_src(cs, fpi->SrcReg[0]);
+		emit_arith(cs, PFS_OP_FRC, dest, mask,
+				src[0], undef, undef, flags);
+		break;
+	case OPCODE_KIL:
+		emit_tex(cs, fpi, R300_TEX_OP_KIL);
+		break;
+	case OPCODE_LG2:
+		src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
+		emit_arith(cs, PFS_OP_LG2, dest, mask,
+				src[0], undef, undef, flags);
+		break;
+	case OPCODE_LIT:
+		src[0] = t_src(cs, fpi->SrcReg[0]);
+		emit_lit(cs, dest, mask, src[0], flags);
+		break;
+	case OPCODE_LRP:
+		src[0] = t_src(cs, fpi->SrcReg[0]);
+		src[1] = t_src(cs, fpi->SrcReg[1]);
+		src[2] = t_src(cs, fpi->SrcReg[2]);
+		/* result = tmp0tmp1 + (1 - tmp0)tmp2
+			*        = tmp0tmp1 + tmp2 + (-tmp0)tmp2
+			*     MAD temp, -tmp0, tmp2, tmp2
+			*     MAD result, tmp0, tmp1, temp
+			*/
+		temp[0] = get_temp_reg(cs);
+		emit_arith(cs, PFS_OP_MAD, temp[0], mask,
+				negate(keep(src[0])), keep(src[2]), src[2],
+				0);
+		emit_arith(cs, PFS_OP_MAD, dest, mask,
+				src[0], src[1], temp[0], flags);
+		free_temp(cs, temp[0]);
+		break;
+	case OPCODE_MAD:
+		src[0] = t_src(cs, fpi->SrcReg[0]);
+		src[1] = t_src(cs, fpi->SrcReg[1]);
+		src[2] = t_src(cs, fpi->SrcReg[2]);
+		emit_arith(cs, PFS_OP_MAD, dest, mask,
+				src[0], src[1], src[2], flags);
+		break;
+	case OPCODE_MAX:
+		src[0] = t_src(cs, fpi->SrcReg[0]);
+		src[1] = t_src(cs, fpi->SrcReg[1]);
+		emit_arith(cs, PFS_OP_MAX, dest, mask,
+				src[0], src[1], undef, flags);
+		break;
+	case OPCODE_MIN:
+		src[0] = t_src(cs, fpi->SrcReg[0]);
+		src[1] = t_src(cs, fpi->SrcReg[1]);
+		emit_arith(cs, PFS_OP_MIN, dest, mask,
+				src[0], src[1], undef, flags);
+		break;
+	case OPCODE_MOV:
+		src[0] = t_src(cs, fpi->SrcReg[0]);
+		emit_arith(cs, PFS_OP_MAD, dest, mask,
+				src[0], pfs_one, pfs_zero, flags);	/* src0 * 1 + 0 */
+		break;
+	case OPCODE_MUL:
+		src[0] = t_src(cs, fpi->SrcReg[0]);
+		src[1] = t_src(cs, fpi->SrcReg[1]);
+		emit_arith(cs, PFS_OP_MAD, dest, mask,
+				src[0], src[1], pfs_zero, flags);	/* src0 * src1 + 0 */
+		break;
+	case OPCODE_RCP:
+		src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
+		emit_arith(cs, PFS_OP_RCP, dest, mask,
+				src[0], undef, undef, flags);
+		break;
+	case OPCODE_RSQ:
+		src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
+		emit_arith(cs, PFS_OP_RSQ, dest, mask,
+				absolute(src[0]), pfs_zero, pfs_zero, flags);	/* the operand is passed with the absolute-value modifier */
+		break;
+	case OPCODE_SCS:
+		/*
+			* scs using a parabola :
+			* scs(x):
+			*   result.x = sin(-abs(x)+0.5*PI)  (cos)
+			*   result.y = sin(x)               (sin)
+			*
+			*/
+		temp[0] = get_temp_reg(cs);
+		temp[1] = get_temp_reg(cs);
+		const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
+		const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
+		src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
+
+		/* x = -abs(x)+0.5*PI */
+		emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(const_sin[0], Z, Z, Z, Z),	//PI
+				pfs_half,
+				negate(absolute
+					(swizzle(keep(src[0]), X, X, X, X))),	/* absolute() is the register modifier; abs() would be the C library integer function */
+				0);
+
+		/* C*x (sin) */
+		emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_W,
+				swizzle(const_sin[0], Y, Y, Y, Y),
+				swizzle(keep(src[0]), X, X, X, X),
+				pfs_zero, 0);
+
+		/* B*x, C*x (cos) */
+		emit_arith(cs, PFS_OP_MAD, temp[0],
+				WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
+								Z, Z, Z,
+								Z),
+				const_sin[0], pfs_zero, 0);
+
+		/* B*x (sin) */
+		emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_W,
+				swizzle(const_sin[0], X, X, X, X),
+				keep(src[0]), pfs_zero, 0);
+
+		/* y = B*x + C*x*abs(x) (sin) */
+		emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_Z,
+				absolute(src[0]),
+				swizzle(temp[0], W, W, W, W),
+				swizzle(temp[1], W, W, W, W), 0);
+
+		/* y = B*x + C*x*abs(x) (cos) */
+		emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_W,
+				swizzle(temp[0], Y, Y, Y, Y),
+				absolute(swizzle(temp[0], Z, Z, Z, Z)),
+				swizzle(temp[0], X, X, X, X), 0);
+
+		/* y*abs(y) - y (cos), y*abs(y) - y (sin) */
+		emit_arith(cs, PFS_OP_MAD, temp[0],
+				WRITEMASK_X | WRITEMASK_Y, swizzle(temp[1],
+								W, Z, Y,
+								X),
+				absolute(swizzle(temp[1], W, Z, Y, X)),
+				negate(swizzle(temp[1], W, Z, Y, X)), 0);
+
+		/* dest.xy = mad(temp.xy, P, temp2.wz) */
+		emit_arith(cs, PFS_OP_MAD, dest,
+				mask & (WRITEMASK_X | WRITEMASK_Y), temp[0],
+				swizzle(const_sin[0], W, W, W, W),
+				swizzle(temp[1], W, Z, Y, X), flags);
+
+		free_temp(cs, temp[0]);
+		free_temp(cs, temp[1]);
+		break;
+	case OPCODE_SIN:
+		/*
+			*  using a parabola:
+			* sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
+			* extra precision is obtained by weighting against
+			* itself squared.
+			*/
+
+		temp[0] = get_temp_reg(cs);
+		const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
+		const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
+		src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
+
+		/* do range reduction */
+
+		emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
+				swizzle(keep(src[0]), X, X, X, X),
+				swizzle(const_sin[1], Z, Z, Z, Z),
+				pfs_half, 0);
+
+		emit_arith(cs, PFS_OP_FRC, temp[0], WRITEMASK_X,
+				swizzle(temp[0], X, X, X, X),
+				undef, undef, 0);
+
+		emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W),	//2*PI
+				negate(swizzle(const_sin[0], Z, Z, Z, Z)),	//-PI
+				0);
+
+		/* SIN */
+
+		emit_arith(cs, PFS_OP_MAD, temp[0],
+				WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
+								Z, Z, Z,
+								Z),
+				const_sin[0], pfs_zero, 0);
+
+		emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
+				swizzle(temp[0], Y, Y, Y, Y),
+				absolute(swizzle(temp[0], Z, Z, Z, Z)),
+				swizzle(temp[0], X, X, X, X), 0);
+
+		emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Y,
+				swizzle(temp[0], X, X, X, X),
+				absolute(swizzle(temp[0], X, X, X, X)),
+				negate(swizzle(temp[0], X, X, X, X)), 0);
+
+		emit_arith(cs, PFS_OP_MAD, dest, mask,
+				swizzle(temp[0], Y, Y, Y, Y),
+				swizzle(const_sin[0], W, W, W, W),
+				swizzle(temp[0], X, X, X, X), flags);
+
+		free_temp(cs, temp[0]);
+		break;
+	case OPCODE_TEX:
+		emit_tex(cs, fpi, R300_TEX_OP_LD);
+		break;
+	case OPCODE_TXB:
+		emit_tex(cs, fpi, R300_TEX_OP_TXB);
+		break;
+	case OPCODE_TXP:
+		emit_tex(cs, fpi, R300_TEX_OP_TXP);
+		break;
+	default:
+		ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
+		break;
+	}
+}
+
+static GLboolean parse_program(struct r300_pfs_compile_state *cs)
+{
+	COMPILE_STATE;
+	int clause_nr, inst_nr;
+
+	/* Emit every instruction of every clause, in program order. */
+	for (clause_nr = 0; clause_nr < cs->compiler->compiler.NumClauses; clause_nr++) {
+		struct radeon_clause *cur = cs->compiler->compiler.Clauses + clause_nr;
+
+		for (inst_nr = 0; inst_nr < cur->NumInstructions; inst_nr++) {
+			emit_instruction(cs, &cur->Instructions[inst_nr]);
+			if (fp->error)	/* stop at the first instruction that left the error flag set */
+				return GL_FALSE;
+		}
+	}
+
+	/* every instruction was emitted without setting fp->error */
+	return GL_TRUE;
+}
+
+
+/* - Init structures (reset per-compile tracking in fp, code and cs)
+ * - Determine what hwregs each input corresponds to, then pre-count register references
+ */
+static void init_program(struct r300_pfs_compile_state *cs)
+{
+	COMPILE_STATE;
+	struct gl_fragment_program *mp = &fp->mesa_program;
+	GLuint InputsRead = mp->Base.InputsRead;
+	GLuint temps_used = 0;	/* bitmask of Mesa temp indices seen so far (cs->temps[]) */
+	int i, j;
+
+	/* New compile, reset tracking data */
+	fp->optimization =
+	    driQueryOptioni(&cs->compiler->r300->radeon.optionCache, "fp_optimization");
+	fp->translated = GL_FALSE;
+	fp->error = GL_FALSE;
+	fp->WritesDepth = GL_FALSE;
+	code->tex.length = 0;
+	code->cur_node = 0;
+	code->first_node_has_tex = 0;
+	code->const_nr = 0;
+	code->max_temp_idx = 0;
+	code->node[0].alu_end = -1;
+	code->node[0].tex_end = -1;
+
+	for (i = 0; i < PFS_MAX_ALU_INST; i++) {
+		for (j = 0; j < 3; j++) {
+			cs->slot[i].vsrc[j] = SRC_CONST;
+			cs->slot[i].ssrc[j] = SRC_CONST;
+		}
+	}
+
+	/* Work out what temps the Mesa inputs correspond to, this must match
+	 * what setup_rs_unit does, which shouldn't be a problem as rs_unit
+	 * configures itself based on the fragprog's InputsRead
+	 *
+	 * NOTE: this depends on get_hw_temp() allocating registers in order,
+	 * starting from register 0.
+	 */
+
+	/* Texcoords come first */
+	for (i = 0; i < cs->compiler->r300->radeon.glCtx->Const.MaxTextureUnits; i++) {
+		if (InputsRead & (FRAG_BIT_TEX0 << i)) {
+			cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0;
+			cs->inputs[FRAG_ATTRIB_TEX0 + i].reg =
+			    get_hw_temp(cs, 0);
+		}
+	}
+	InputsRead &= ~FRAG_BITS_TEX_ANY;
+
+	/* fragment position treated as a texcoord */
+	if (InputsRead & FRAG_BIT_WPOS) {
+		cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
+		cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(cs, 0);
+	}
+	InputsRead &= ~FRAG_BIT_WPOS;
+
+	/* Then primary colour */
+	if (InputsRead & FRAG_BIT_COL0) {
+		cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
+		cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(cs, 0);
+	}
+	InputsRead &= ~FRAG_BIT_COL0;
+
+	/* Secondary color */
+	if (InputsRead & FRAG_BIT_COL1) {
+		cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
+		cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(cs, 0);
+	}
+	InputsRead &= ~FRAG_BIT_COL1;
+
+	/* Anything else */
+	if (InputsRead) {
+		WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead);
+		/* force read from hwreg 0 for now */
+		for (i = 0; i < 32; i++)
+			if (InputsRead & (1u << i))	/* unsigned shift: 1 << 31 is undefined for int */
+				cs->inputs[i].reg = 0;
+	}
+
+	/* Pre-parse the program, grabbing refcounts on input/temp regs.
+	 * That way, we can free up the reg when it's no longer needed. NOTE(review): only Clauses[0] is scanned here while parse_program() walks every clause - confirm that is intended.
+	 */
+	for (i = 0; i < cs->compiler->compiler.Clauses[0].NumInstructions; ++i) {
+		struct prog_instruction *fpi = cs->compiler->compiler.Clauses[0].Instructions + i;
+		int idx;
+
+		for (j = 0; j < 3; j++) {
+			idx = fpi->SrcReg[j].Index;
+			switch (fpi->SrcReg[j].File) {
+			case PROGRAM_TEMPORARY:
+				if (!(temps_used & (1u << idx))) {	/* first sighting: no hw reg yet, one reference */
+					cs->temps[idx].reg = -1;
+					cs->temps[idx].refcount = 1;
+					temps_used |= (1u << idx);
+				} else
+					cs->temps[idx].refcount++;
+				break;
+			case PROGRAM_INPUT:
+				cs->inputs[idx].refcount++;
+				break;
+			default:
+				break;
+			}
+		}
+
+		idx = fpi->DstReg.Index;
+		if (fpi->DstReg.File == PROGRAM_TEMPORARY) {	/* a write counts as a reference too */
+			if (!(temps_used & (1u << idx))) {
+				cs->temps[idx].reg = -1;
+				cs->temps[idx].refcount = 1;
+				temps_used |= (1u << idx);
+			} else
+				cs->temps[idx].refcount++;
+		}
+	}
+	cs->temp_in_use = temps_used;
+}
+
+
+/**
+ * Last stage of fragment program compilation: walk the radeon_program held
+ * by \p compiler and fill compiler->code. Returns GL_FALSE if parsing failed.
+ */
+GLboolean r300FragmentProgramEmit(struct r300_fragment_program_compiler *compiler)
+{
+	struct r300_fragment_program_code *code = compiler->code;
+	struct r300_pfs_compile_state cs;
+
+	/* start from an all-zero compile state bound to this compiler */
+	_mesa_memset(&cs, 0, sizeof(cs));
+	cs.compiler = compiler;
+	init_program(&cs);
+	if (parse_program(&cs) == GL_FALSE)
+		return GL_FALSE;
+
+	/* whole-program ALU and TEX ranges */
+	code->tex_offset = 0;
+	code->tex_end = (code->tex.length == 0) ? 0 : code->tex.length - 1;
+	code->alu_offset = 0;
+	code->alu_end = cs.nrslots - 1;
+	/* close the node that was current when parsing ended */
+	if (code->node[code->cur_node].tex_end < 0)
+		code->node[code->cur_node].tex_end = 0;
+	code->node[code->cur_node].alu_end = cs.nrslots - code->node[code->cur_node].alu_offset - 1;
+
+	assert(code->node[code->cur_node].alu_end >= 0);
+	assert(code->alu_end >= 0);
+	return GL_TRUE;
+}
+
diff --git a/src/mesa/drivers/dri/r300/r300_ioctl.c b/src/mesa/drivers/dri/r300/r300_ioctl.c
index b0225453d3d..71821a01ea0 100644
--- a/src/mesa/drivers/dri/r300/r300_ioctl.c
+++ b/src/mesa/drivers/dri/r300/r300_ioctl.c
@@ -54,6 +54,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_vertprog.h"
 #include "radeon_reg.h"
 #include "r300_emit.h"
+#include "r300_fragprog.h"
 
 #include "vblank.h"
 
@@ -130,8 +131,6 @@ static void r300ClearBuffer(r300ContextPtr r300, int flags, int buffer)
 			t1 |= R300_Z_ENABLE | R300_Z_WRITE_ENABLE;
 			t2 |=
 			    (R300_ZS_ALWAYS << R300_Z_FUNC_SHIFT);
-		} else { //XXX
-			t1 |= R300_STENCIL_FRONT_BACK;	// disable
 		}
 
 		if (flags & CLEARBUFFER_STENCIL) {
@@ -144,20 +143,13 @@ static void r300ClearBuffer(r300ContextPtr r300, int flags, int buffer)
 			    (R300_ZS_REPLACE <<
 			     R300_S_FRONT_ZPASS_OP_SHIFT) |
 			    (R300_ZS_REPLACE <<
-			     R300_S_FRONT_ZFAIL_OP_SHIFT) |
-			    (R300_ZS_ALWAYS <<
-			     R300_S_BACK_FUNC_SHIFT) |
-			    (R300_ZS_REPLACE <<
-			     R300_S_BACK_SFAIL_OP_SHIFT) |
-			    (R300_ZS_REPLACE <<
-			     R300_S_BACK_ZPASS_OP_SHIFT) |
-			    (R300_ZS_REPLACE <<
-			     R300_S_BACK_ZFAIL_OP_SHIFT);
+			     R300_S_FRONT_ZFAIL_OP_SHIFT);
 		}
 
 		e32(t1);
 		e32(t2);
-		e32(r300->state.stencil.clear);
+		e32(((ctx->Stencil.WriteMask[0] & R300_STENCILREF_MASK) << R300_STENCILWRITEMASK_SHIFT) |
+		    (ctx->Stencil.Clear & R300_STENCILREF_MASK));
 	}
 
 	cmd2 = (drm_r300_cmd_header_t *) r300AllocCmdBuf(r300, 9, __FUNCTION__);
@@ -307,7 +299,6 @@ static void r300EmitClearState(GLcontext * ctx)
 		reg_start(R300_RS_INST_0, 0);
 		e32(R300_RS_INST_COL_CN_WRITE);
 	} else {
-	  
 		R300_STATECHANGE(r300, ri);
 		reg_start(R500_RS_IP_0, 7);
 		for (i = 0; i < 8; ++i) {
diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h
index 21e1dc29de7..8b00f9958cc 100644
--- a/src/mesa/drivers/dri/r300/r300_reg.h
+++ b/src/mesa/drivers/dri/r300/r300_reg.h
@@ -1371,29 +1371,22 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #	define R300_TX_MAG_FILTER_4              (0 << 9)
 #       define R300_TX_MAG_FILTER_NEAREST        (1 << 9)
 #       define R300_TX_MAG_FILTER_LINEAR         (2 << 9)
+#       define R300_TX_MAG_FILTER_ANISO          (3 << 9)
 #       define R300_TX_MAG_FILTER_MASK           (3 << 9)
 #       define R300_TX_MIN_FILTER_NEAREST        (1 << 11)
 #       define R300_TX_MIN_FILTER_LINEAR         (2 << 11)
-#	define R300_TX_MIN_FILTER_NEAREST_MIP_NEAREST       (5  <<  11) /* TODO: use spec */
-#	define R300_TX_MIN_FILTER_NEAREST_MIP_LINEAR        (9  <<  11) /* TODO: use spec */
-#	define R300_TX_MIN_FILTER_LINEAR_MIP_NEAREST        (6  <<  11) /* TODO: use spec */
-#	define R300_TX_MIN_FILTER_LINEAR_MIP_LINEAR         (10 <<  11) /* TODO: use spec */
-
-/* NOTE: NEAREST doesnt seem to exist.
- * Im not seting MAG_FILTER_MASK and (3 << 11) on for all
- * anisotropy modes because that would void selected mag filter
- */
-#	define R300_TX_MIN_FILTER_ANISO_NEAREST             (0 << 13)
-#	define R300_TX_MIN_FILTER_ANISO_LINEAR              (0 << 13)
-#	define R300_TX_MIN_FILTER_ANISO_NEAREST_MIP_NEAREST (1 << 13)
-#	define R300_TX_MIN_FILTER_ANISO_NEAREST_MIP_LINEAR  (2 << 13)
-#       define R300_TX_MIN_FILTER_MASK   ( (15 << 11) | (3 << 13) )
-#	define R300_TX_MAX_ANISO_1_TO_1  (0 << 21)
-#	define R300_TX_MAX_ANISO_2_TO_1  (2 << 21)
-#	define R300_TX_MAX_ANISO_4_TO_1  (4 << 21)
-#	define R300_TX_MAX_ANISO_8_TO_1  (6 << 21)
-#	define R300_TX_MAX_ANISO_16_TO_1 (8 << 21)
-#	define R300_TX_MAX_ANISO_MASK    (14 << 21)
+#	define R300_TX_MIN_FILTER_ANISO          (3 << 11)
+#	define R300_TX_MIN_FILTER_MASK           (3 << 11)
+#	define R300_TX_MIN_FILTER_MIP_NONE       (0 << 13)
+#	define R300_TX_MIN_FILTER_MIP_NEAREST    (1 << 13)
+#	define R300_TX_MIN_FILTER_MIP_LINEAR     (2 << 13)
+#	define R300_TX_MIN_FILTER_MIP_MASK       (3 << 13)
+#	define R300_TX_MAX_ANISO_1_TO_1          (0 << 21)
+#	define R300_TX_MAX_ANISO_2_TO_1          (1 << 21)
+#	define R300_TX_MAX_ANISO_4_TO_1          (2 << 21)
+#	define R300_TX_MAX_ANISO_8_TO_1          (3 << 21)
+#	define R300_TX_MAX_ANISO_16_TO_1         (4 << 21)
+#	define R300_TX_MAX_ANISO_MASK            (7 << 21)
 
 #define R300_TX_FILTER1_0                      0x4440
 #	define R300_CHROMA_KEY_MODE_DISABLE    0
@@ -1401,7 +1394,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #	define R300_CHROMA_KEY_BLEND           2
 #	define R300_MC_ROUND_NORMAL            (0<<2)
 #	define R300_MC_ROUND_MPEG4             (1<<2)
-#	define R300_LOD_BIAS_MASK	    0x1fff
+#	define R300_LOD_BIAS_SHIFT             3
+#	define R300_LOD_BIAS_MASK	       0x1ff8
 #	define R300_EDGE_ANISO_EDGE_DIAG       (0<<13)
 #	define R300_EDGE_ANISO_EDGE_ONLY       (1<<13)
 #	define R300_MC_COORD_TRUNCATE_DISABLE  (0<<14)
@@ -1432,9 +1426,9 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 	   They are given meanings as R, G, B and Alpha by the swizzle
 	   specification */
 #	define R300_TX_FORMAT_X8		    0x0
-#	define R500_TX_FORMAT_X1		    0x0 // bit set in format 2 
+#	define R500_TX_FORMAT_X1		    0x0 // bit set in format 2
 #	define R300_TX_FORMAT_X16		    0x1
-#	define R500_TX_FORMAT_X1_REV		    0x0 // bit set in format 2 
+#	define R500_TX_FORMAT_X1_REV		    0x0 // bit set in format 2
 #	define R300_TX_FORMAT_Y4X4		    0x2
 #	define R300_TX_FORMAT_Y8X8		    0x3
 #	define R300_TX_FORMAT_Y16X16		    0x4
@@ -2238,7 +2232,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #	define R300_RB3D_AARESOLVE_CTL_AARESOLVE_GAMMA_22      (1 << 1)
 #	define R300_RB3D_AARESOLVE_CTL_AARESOLVE_ALPHA_SAMPLE0 (0 << 2)
 #	define R300_RB3D_AARESOLVE_CTL_AARESOLVE_ALPHA_AVERAGE (1 << 2)
-  
+
 
 /* Discard src pixels less than or equal to threshold. */
 #define R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD 0x4ea0
@@ -3179,7 +3173,7 @@ enum {
  * 2 to end: Up to 16380 dwords of vertex data.
  */
 #define R300_PACKET3_3D_DRAW_INDX           0x00002A00
- 
+
 
 /* Specify the full set of vertex arrays as (address, stride).
  * The first parameter is the number of vertex arrays specified.
@@ -3209,7 +3203,7 @@ enum {
 /* Same as R300_PACKET3_3D_DRAW_INDX but without VAP_VTX_FMT */
 #define R300_PACKET3_3D_DRAW_INDX_2         0x00003600
 
-/* Clears a portion of hierachical Z RAM 
+/* Clears a portion of hierarchical Z RAM
  * 3 dword parameters
  * 0. START
  * 1. COUNT: 13:0 (max is 0x3FFF)
diff --git a/src/mesa/drivers/dri/r300/r300_render.c b/src/mesa/drivers/dri/r300/r300_render.c
index fc07105c560..8f74f9d785e 100644
--- a/src/mesa/drivers/dri/r300/r300_render.c
+++ b/src/mesa/drivers/dri/r300/r300_render.c
@@ -74,6 +74,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_reg.h"
 #include "r300_tex.h"
 #include "r300_emit.h"
+#include "r300_fragprog.h"
 extern int future_hw_tcl_on;
 
 /**
diff --git a/src/mesa/drivers/dri/r300/r300_shader.c b/src/mesa/drivers/dri/r300/r300_shader.c
index 5c8fd8a5e58..f30fd986e0f 100644
--- a/src/mesa/drivers/dri/r300/r300_shader.c
+++ b/src/mesa/drivers/dri/r300/r300_shader.c
@@ -28,7 +28,6 @@ static struct gl_program *r300NewProgram(GLcontext * ctx, GLenum target,
 							   target, id);
 		} else {
 			r300_fp = CALLOC_STRUCT(r300_fragment_program);
-			r300_fp->ctx = ctx;
 			return _mesa_init_fragment_program(ctx, &r300_fp->mesa_program,
 							   target, id);
 		}
diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c
index 550f7108542..55d3d55e900 100644
--- a/src/mesa/drivers/dri/r300/r300_state.c
+++ b/src/mesa/drivers/dri/r300/r300_state.c
@@ -60,6 +60,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_state.h"
 #include "r300_reg.h"
 #include "r300_emit.h"
+#include "r300_fragprog.h"
 #include "r300_tex.h"
 
 #include "drirenderbuffer.h"
@@ -525,24 +526,15 @@ static void r300SetDepthState(GLcontext * ctx)
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
 
 	R300_STATECHANGE(r300, zs);
-	r300->hw.zs.cmd[R300_ZS_CNTL_0] &= R300_STENCIL_ENABLE; // XXX
-	r300->hw.zs.cmd[R300_ZS_CNTL_1] &=
-	    ~(R300_ZS_MASK << R300_Z_FUNC_SHIFT);
+	r300->hw.zs.cmd[R300_ZS_CNTL_0] &= R300_STENCIL_ENABLE|R300_STENCIL_FRONT_BACK;
+	r300->hw.zs.cmd[R300_ZS_CNTL_1] &= ~(R300_ZS_MASK << R300_Z_FUNC_SHIFT);
 
-	if (ctx->Depth.Test && ctx->Depth.Func != GL_NEVER) {
+	if (ctx->Depth.Test) {
+		r300->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_Z_ENABLE;
 		if (ctx->Depth.Mask)
-			r300->hw.zs.cmd[R300_ZS_CNTL_0] |=
-			    R300_Z_ENABLE | R300_Z_WRITE_ENABLE | R300_STENCIL_FRONT_BACK; // XXX
-		else
-		    r300->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_Z_ENABLE | R300_STENCIL_FRONT_BACK; // XXX
-
-		r300->hw.zs.cmd[R300_ZS_CNTL_1] |=
-		    translate_func(ctx->Depth.
-				   Func) << R300_Z_FUNC_SHIFT;
-	} else {
-	    r300->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_STENCIL_FRONT_BACK; // XXX
+			r300->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_Z_WRITE_ENABLE;
 		r300->hw.zs.cmd[R300_ZS_CNTL_1] |=
-		    translate_func(GL_NEVER) << R300_Z_FUNC_SHIFT;
+		    translate_func(ctx->Depth.Func) << R300_Z_FUNC_SHIFT;
 	}
 
 	r300SetEarlyZState(ctx);
@@ -925,7 +917,7 @@ static void r300StencilFuncSeparate(GLcontext * ctx, GLenum face,
 	GLuint flag;
 
 	R300_STATECHANGE(rmesa, zs);
-
+	rmesa->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_STENCIL_FRONT_BACK;
 	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] &= ~((R300_ZS_MASK <<
 					       R300_S_FRONT_FUNC_SHIFT)
 					      | (R300_ZS_MASK <<
@@ -1000,17 +992,6 @@ static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
 	}
 }
 
-static void r300ClearStencil(GLcontext * ctx, GLint s)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-
-	rmesa->state.stencil.clear =
-	    ((GLuint) (ctx->Stencil.Clear & R300_STENCILREF_MASK) |
-	     (R300_STENCILREF_MASK << R300_STENCILMASK_SHIFT) |
-	     ((ctx->Stencil.WriteMask[0] & R300_STENCILREF_MASK) <<
-		R300_STENCILMASK_SHIFT));
-}
-
 /* =============================================================
  * Window position and viewport transformation
  */
@@ -1284,7 +1265,7 @@ static unsigned long gen_fixed_filter(unsigned long f)
 		return f;
 
 	mag = f & R300_TX_MAG_FILTER_MASK;
-	min = f & R300_TX_MIN_FILTER_MASK;
+	min = f & (R300_TX_MIN_FILTER_MASK|R300_TX_MIN_FILTER_MIP_MASK);
 
 	/* TODO: Check for anisto filters too */
 	if ((mag != R300_TX_MAG_FILTER_NEAREST)
@@ -1328,18 +1309,19 @@ static void r300SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
 	int i;
 	struct r300_fragment_program *fp = (struct r300_fragment_program *)
 	    (char *)ctx->FragmentProgram._Current;
+	struct r300_fragment_program_code *code = &fp->code;
 
 	R300_STATECHANGE(r300, fpt);
 
-	for (i = 0; i < fp->tex.length; i++) {
+	for (i = 0; i < code->tex.length; i++) {
 		int unit;
 		int opcode;
 		unsigned long val;
 
-		unit = fp->tex.inst[i] >> R300_TEX_ID_SHIFT;
+		unit = code->tex.inst[i] >> R300_TEX_ID_SHIFT;
 		unit &= 15;
 
-		val = fp->tex.inst[i];
+		val = code->tex.inst[i];
 		val &= ~R300_TEX_ID_MASK;
 
 		opcode =
@@ -1361,7 +1343,7 @@ static void r300SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
 	}
 
 	r300->hw.fpt.cmd[R300_FPT_CMD_0] =
-		cmdpacket0(R300_US_TEX_INST_0, fp->tex.length);
+		cmdpacket0(R300_US_TEX_INST_0, code->tex.length);
 }
 
 static void r500SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
@@ -1369,14 +1351,15 @@ static void r500SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
 	int i;
 	struct r500_fragment_program *fp = (struct r500_fragment_program *)
 	    (char *)ctx->FragmentProgram._Current;
+	struct r500_fragment_program_code *code = &fp->code;
 
 	/* find all the texture instructions and relocate the texture units */
-	for (i = 0; i < fp->inst_end + 1; i++) {
-		if ((fp->inst[i].inst0 & 0x3) == R500_INST_TYPE_TEX) {
+	for (i = 0; i < code->inst_end + 1; i++) {
+		if ((code->inst[i].inst0 & 0x3) == R500_INST_TYPE_TEX) {
 			uint32_t val;
 			int unit, opcode, new_unit;
 
-			val = fp->inst[i].inst1;
+			val = code->inst[i].inst1;
 
 			unit = (val >> 16) & 0xf;
 
@@ -1393,11 +1376,23 @@ static void r500SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
 				}
 			}
 			val |= R500_TEX_ID(new_unit);
-			fp->inst[i].inst1 = val;
+			code->inst[i].inst1 = val;
 		}
 	}
 }
 
+static GLuint r300CalculateTexLodBias(GLfloat bias)
+{	/* Encode \p bias (in mip levels) into the LOD_BIAS field of TX_FILTER1. */
+	GLint b;	/* NOTE(review): field taken to be 10-bit two's complement in 1/32 steps, i.e. [-16, 16) - confirm against the register reference */
+	if (!(bias < 16.0f))	/* saturate instead of wrapping; also catches NaN */
+		b = 511;
+	else if (bias < -16.0f)
+		b = -512;
+	else
+		b = (GLint)(bias * 32.0f);
+	return ((GLuint)b << R300_LOD_BIAS_SHIFT) & R300_LOD_BIAS_MASK;	/* the mask keeps only the field's bits of the sign-extended value */
+}
+
 static void r300SetupTextures(GLcontext * ctx)
 {
 	int i, mtu;
@@ -1461,8 +1456,8 @@ static void r300SetupTextures(GLcontext * ctx)
 			r300->hw.tex.filter.cmd[R300_TEX_VALUE_0 +
 						hw_tmu] =
 			    gen_fixed_filter(t->filter) | (hw_tmu << 28);
-			/* Currently disabled! */
-			r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] = 0x0;	//0x20501f80;
+			r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] = t->filter_1
+				| r300CalculateTexLodBias(r300->LODBias);
 			r300->hw.tex.size.cmd[R300_TEX_VALUE_0 + hw_tmu] =
 			    t->size;
 			r300->hw.tex.format.cmd[R300_TEX_VALUE_0 +
@@ -2426,6 +2421,7 @@ static void r300SetupPixelShader(r300ContextPtr rmesa)
 	GLcontext *ctx = rmesa->radeon.glCtx;
 	struct r300_fragment_program *fp = (struct r300_fragment_program *)
 	    (char *)ctx->FragmentProgram._Current;
+	struct r300_fragment_program_code *code;
 	int i, k;
 
 	if (!fp)		/* should only happenen once, just after context is created */
@@ -2437,62 +2433,63 @@ static void r300SetupPixelShader(r300ContextPtr rmesa)
 			__FUNCTION__);
 		return;
 	}
+	code = &fp->code;
 
 	r300SetupTextures(ctx);
 
 	R300_STATECHANGE(rmesa, fpi[0]);
-	rmesa->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_INST_0, fp->alu_end + 1);
-	for (i = 0; i <= fp->alu_end; i++) {
-		rmesa->hw.fpi[0].cmd[R300_FPI_INSTR_0 + i] = fp->alu.inst[i].inst0;
+	rmesa->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_INST_0, code->alu_end + 1);
+	for (i = 0; i <= code->alu_end; i++) {
+		rmesa->hw.fpi[0].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst0;
 	}
 
 	R300_STATECHANGE(rmesa, fpi[1]);
-	rmesa->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_ADDR_0, fp->alu_end + 1);
-	for (i = 0; i <= fp->alu_end; i++) {
-		rmesa->hw.fpi[1].cmd[R300_FPI_INSTR_0 + i] = fp->alu.inst[i].inst1;
+	rmesa->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_ADDR_0, code->alu_end + 1);
+	for (i = 0; i <= code->alu_end; i++) {
+		rmesa->hw.fpi[1].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst1;
 	}
 
 	R300_STATECHANGE(rmesa, fpi[2]);
-	rmesa->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_INST_0, fp->alu_end + 1);
-	for (i = 0; i <= fp->alu_end; i++) {
-		rmesa->hw.fpi[2].cmd[R300_FPI_INSTR_0 + i] = fp->alu.inst[i].inst2;
+	rmesa->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_INST_0, code->alu_end + 1);
+	for (i = 0; i <= code->alu_end; i++) {
+		rmesa->hw.fpi[2].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst2;
 	}
 
 	R300_STATECHANGE(rmesa, fpi[3]);
-	rmesa->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_ADDR_0, fp->alu_end + 1);
-	for (i = 0; i <= fp->alu_end; i++) {
-		rmesa->hw.fpi[3].cmd[R300_FPI_INSTR_0 + i] = fp->alu.inst[i].inst3;
+	rmesa->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_ADDR_0, code->alu_end + 1);
+	for (i = 0; i <= code->alu_end; i++) {
+		rmesa->hw.fpi[3].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst3;
 	}
 
 	R300_STATECHANGE(rmesa, fp);
-	rmesa->hw.fp.cmd[R300_FP_CNTL0] = fp->cur_node | (fp->first_node_has_tex << 3);
-	rmesa->hw.fp.cmd[R300_FP_CNTL1] = fp->max_temp_idx;
+	rmesa->hw.fp.cmd[R300_FP_CNTL0] = code->cur_node | (code->first_node_has_tex << 3);
+	rmesa->hw.fp.cmd[R300_FP_CNTL1] = code->max_temp_idx;
 	rmesa->hw.fp.cmd[R300_FP_CNTL2] =
-	  (fp->alu_offset << R300_PFS_CNTL_ALU_OFFSET_SHIFT) |
-	  (fp->alu_end << R300_PFS_CNTL_ALU_END_SHIFT) |
-	  (fp->tex_offset << R300_PFS_CNTL_TEX_OFFSET_SHIFT) |
-	  (fp->tex_end << R300_PFS_CNTL_TEX_END_SHIFT);
+	  (code->alu_offset << R300_PFS_CNTL_ALU_OFFSET_SHIFT) |
+	  (code->alu_end << R300_PFS_CNTL_ALU_END_SHIFT) |
+	  (code->tex_offset << R300_PFS_CNTL_TEX_OFFSET_SHIFT) |
+	  (code->tex_end << R300_PFS_CNTL_TEX_END_SHIFT);
 	/* I just want to say, the way these nodes are stored.. weird.. */
-	for (i = 0, k = (4 - (fp->cur_node + 1)); i < 4; i++, k++) {
-		if (i < (fp->cur_node + 1)) {
+	for (i = 0, k = (4 - (code->cur_node + 1)); i < 4; i++, k++) {
+		if (i < (code->cur_node + 1)) {
 			rmesa->hw.fp.cmd[R300_FP_NODE0 + k] =
-			  (fp->node[i].alu_offset << R300_ALU_START_SHIFT) |
-			  (fp->node[i].alu_end << R300_ALU_SIZE_SHIFT) |
-			  (fp->node[i].tex_offset << R300_TEX_START_SHIFT) |
-			  (fp->node[i].tex_end << R300_TEX_SIZE_SHIFT) |
-			  fp->node[i].flags;
+			  (code->node[i].alu_offset << R300_ALU_START_SHIFT) |
+			  (code->node[i].alu_end << R300_ALU_SIZE_SHIFT) |
+			  (code->node[i].tex_offset << R300_TEX_START_SHIFT) |
+			  (code->node[i].tex_end << R300_TEX_SIZE_SHIFT) |
+			  code->node[i].flags;
 		} else {
 			rmesa->hw.fp.cmd[R300_FP_NODE0 + (3 - i)] = 0;
 		}
 	}
 
 	R300_STATECHANGE(rmesa, fpp);
-	rmesa->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(R300_PFS_PARAM_0_X, fp->const_nr * 4);
-	for (i = 0; i < fp->const_nr; i++) {
-		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat24(fp->constant[i][0]);
-		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat24(fp->constant[i][1]);
-		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat24(fp->constant[i][2]);
-		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat24(fp->constant[i][3]);
+	rmesa->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(R300_PFS_PARAM_0_X, code->const_nr * 4);
+	for (i = 0; i < code->const_nr; i++) {
+		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat24(code->constant[i][0]);
+		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat24(code->constant[i][1]);
+		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat24(code->constant[i][2]);
+		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat24(code->constant[i][3]);
 	}
 }
 
@@ -2516,6 +2513,7 @@ static void r500SetupPixelShader(r300ContextPtr rmesa)
 	struct r500_fragment_program *fp = (struct r500_fragment_program *)
 	    (char *)ctx->FragmentProgram._Current;
 	int i;
+	struct r500_fragment_program_code *code;
 
 	if (!fp)		/* should only happenen once, just after context is created */
 		return;
@@ -2529,42 +2527,43 @@ static void r500SetupPixelShader(r300ContextPtr rmesa)
 			__FUNCTION__);
 		return;
 	}
+	code = &fp->code;
 
 	r300SetupTextures(ctx);
 
 	R300_STATECHANGE(rmesa, fp);
-	rmesa->hw.fp.cmd[R500_FP_PIXSIZE] = fp->max_temp_idx;
+	rmesa->hw.fp.cmd[R500_FP_PIXSIZE] = code->max_temp_idx;
 
 	rmesa->hw.fp.cmd[R500_FP_CODE_ADDR] =
-	    R500_US_CODE_START_ADDR(fp->inst_offset) |
-	    R500_US_CODE_END_ADDR(fp->inst_end);
+	    R500_US_CODE_START_ADDR(code->inst_offset) |
+	    R500_US_CODE_END_ADDR(code->inst_end);
 	rmesa->hw.fp.cmd[R500_FP_CODE_RANGE] =
-	    R500_US_CODE_RANGE_ADDR(fp->inst_offset) |
-	    R500_US_CODE_RANGE_SIZE(fp->inst_end);
+	    R500_US_CODE_RANGE_ADDR(code->inst_offset) |
+	    R500_US_CODE_RANGE_SIZE(code->inst_end);
 	rmesa->hw.fp.cmd[R500_FP_CODE_OFFSET] =
 	    R500_US_CODE_OFFSET_ADDR(0); /* FIXME when we add flow control */
 
 	R300_STATECHANGE(rmesa, r500fp);
 	/* Emit our shader... */
-	for (i = 0; i < fp->inst_end+1; i++) {
-		rmesa->hw.r500fp.cmd[i*6+1] = fp->inst[i].inst0;
-		rmesa->hw.r500fp.cmd[i*6+2] = fp->inst[i].inst1;
-		rmesa->hw.r500fp.cmd[i*6+3] = fp->inst[i].inst2;
-		rmesa->hw.r500fp.cmd[i*6+4] = fp->inst[i].inst3;
-		rmesa->hw.r500fp.cmd[i*6+5] = fp->inst[i].inst4;
-		rmesa->hw.r500fp.cmd[i*6+6] = fp->inst[i].inst5;
+	for (i = 0; i < code->inst_end+1; i++) {
+		rmesa->hw.r500fp.cmd[i*6+1] = code->inst[i].inst0;
+		rmesa->hw.r500fp.cmd[i*6+2] = code->inst[i].inst1;
+		rmesa->hw.r500fp.cmd[i*6+3] = code->inst[i].inst2;
+		rmesa->hw.r500fp.cmd[i*6+4] = code->inst[i].inst3;
+		rmesa->hw.r500fp.cmd[i*6+5] = code->inst[i].inst4;
+		rmesa->hw.r500fp.cmd[i*6+6] = code->inst[i].inst5;
 	}
 
-	bump_r500fp_count(rmesa->hw.r500fp.cmd, (fp->inst_end + 1) * 6);
+	bump_r500fp_count(rmesa->hw.r500fp.cmd, (code->inst_end + 1) * 6);
 
 	R300_STATECHANGE(rmesa, r500fp_const);
-	for (i = 0; i < fp->const_nr; i++) {
-		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat32(fp->constant[i][0]);
-		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat32(fp->constant[i][1]);
-		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat32(fp->constant[i][2]);
-		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat32(fp->constant[i][3]);
+	for (i = 0; i < code->const_nr; i++) {
+		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat32(code->constant[i][0]);
+		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat32(code->constant[i][1]);
+		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat32(code->constant[i][2]);
+		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat32(code->constant[i][3]);
 	}
-	bump_r500fp_const_count(rmesa->hw.r500fp_const.cmd, fp->const_nr * 4);
+	bump_r500fp_const_count(rmesa->hw.r500fp_const.cmd, code->const_nr * 4);
 
 }
 
@@ -2637,12 +2636,10 @@ void r300InitState(r300ContextPtr r300)
 	case 16:
 		r300->state.depth.scale = 1.0 / (GLfloat) 0xffff;
 		depth_fmt = R300_DEPTHFORMAT_16BIT_INT_Z;
-		r300->state.stencil.clear = 0x00000000;
 		break;
 	case 24:
 		r300->state.depth.scale = 1.0 / (GLfloat) 0xffffff;
 		depth_fmt = R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
-		r300->state.stencil.clear = 0x00ff0000;
 		break;
 	default:
 		fprintf(stderr, "Error: Unsupported depth %d... exiting\n",
@@ -2706,7 +2703,6 @@ void r300InitStateFuncs(struct dd_function_table *functions)
 	functions->ShadeModel = r300ShadeModel;
 
 	/* Stencil related */
-	functions->ClearStencil = r300ClearStencil;
 	functions->StencilFuncSeparate = r300StencilFuncSeparate;
 	functions->StencilMaskSeparate = r300StencilMaskSeparate;
 	functions->StencilOpSeparate = r300StencilOpSeparate;
diff --git a/src/mesa/drivers/dri/r300/r300_tex.c b/src/mesa/drivers/dri/r300/r300_tex.c
index 0b4acec0448..5f54bcad9a3 100644
--- a/src/mesa/drivers/dri/r300/r300_tex.c
+++ b/src/mesa/drivers/dri/r300/r300_tex.c
@@ -160,21 +160,18 @@ static void r300SetTexWrap(r300TexObjPtr t, GLenum swrap, GLenum twrap,
 	t->filter |= hw_qwrap << R300_TX_WRAP_Q_SHIFT;
 }
 
-static void r300SetTexMaxAnisotropy(r300TexObjPtr t, GLfloat max)
+static GLuint aniso_filter(GLfloat anisotropy)
 {
-
-	t->filter &= ~R300_TX_MAX_ANISO_MASK;
-
-	if (max <= 1.0) {
-		t->filter |= R300_TX_MAX_ANISO_1_TO_1;
-	} else if (max <= 2.0) {
-		t->filter |= R300_TX_MAX_ANISO_2_TO_1;
-	} else if (max <= 4.0) {
-		t->filter |= R300_TX_MAX_ANISO_4_TO_1;
-	} else if (max <= 8.0) {
-		t->filter |= R300_TX_MAX_ANISO_8_TO_1;
+	if (anisotropy >= 16.0) {
+		return R300_TX_MAX_ANISO_16_TO_1;
+	} else if (anisotropy >= 8.0) {
+		return R300_TX_MAX_ANISO_8_TO_1;
+	} else if (anisotropy >= 4.0) {
+		return R300_TX_MAX_ANISO_4_TO_1;
+	} else if (anisotropy >= 2.0) {
+		return R300_TX_MAX_ANISO_2_TO_1;
 	} else {
-		t->filter |= R300_TX_MAX_ANISO_16_TO_1;
+		return R300_TX_MAX_ANISO_1_TO_1;
 	}
 }
 
@@ -184,54 +181,47 @@ static void r300SetTexMaxAnisotropy(r300TexObjPtr t, GLfloat max)
  * \param t Texture whose filter modes are to be set
  * \param minf Texture minification mode
  * \param magf Texture magnification mode
+ * \param anisotropy Maximum anisotropy level
  */
-
-static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf)
+static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf, GLfloat anisotropy)
 {
-	GLuint anisotropy = (t->filter & R300_TX_MAX_ANISO_MASK);
+	t->filter &= ~(R300_TX_MIN_FILTER_MASK | R300_TX_MIN_FILTER_MIP_MASK | R300_TX_MAG_FILTER_MASK | R300_TX_MAX_ANISO_MASK);
+	t->filter_1 &= ~R300_EDGE_ANISO_EDGE_ONLY;
 
-	t->filter &= ~(R300_TX_MIN_FILTER_MASK | R300_TX_MAG_FILTER_MASK);
+	/* Note that EXT_texture_filter_anisotropic is extremely vague about
+	 * how anisotropic filtering interacts with the "normal" filter modes.
+	 * When anisotropic filtering is enabled, we override min and mag
+	 * filter settings completely. This includes driconf's settings.
+	 */
+	if (anisotropy >= 2.0 && (minf != GL_NEAREST) && (magf != GL_NEAREST)) {
+		t->filter |= R300_TX_MAG_FILTER_ANISO
+			| R300_TX_MIN_FILTER_ANISO
+			| R300_TX_MIN_FILTER_MIP_LINEAR
+			| aniso_filter(anisotropy);
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "Using maximum anisotropy of %f\n", anisotropy);
+		return;
+	}
 
-	if (anisotropy == R300_TX_MAX_ANISO_1_TO_1) {
-		switch (minf) {
-		case GL_NEAREST:
-			t->filter |= R300_TX_MIN_FILTER_NEAREST;
-			break;
-		case GL_LINEAR:
-			t->filter |= R300_TX_MIN_FILTER_LINEAR;
-			break;
-		case GL_NEAREST_MIPMAP_NEAREST:
-			t->filter |= R300_TX_MIN_FILTER_NEAREST_MIP_NEAREST;
-			break;
-		case GL_NEAREST_MIPMAP_LINEAR:
-			t->filter |= R300_TX_MIN_FILTER_NEAREST_MIP_LINEAR;
-			break;
-		case GL_LINEAR_MIPMAP_NEAREST:
-			t->filter |= R300_TX_MIN_FILTER_LINEAR_MIP_NEAREST;
-			break;
-		case GL_LINEAR_MIPMAP_LINEAR:
-			t->filter |= R300_TX_MIN_FILTER_LINEAR_MIP_LINEAR;
-			break;
-		}
-	} else {
-		switch (minf) {
-		case GL_NEAREST:
-			t->filter |= R300_TX_MIN_FILTER_ANISO_NEAREST;
-			break;
-		case GL_LINEAR:
-			t->filter |= R300_TX_MIN_FILTER_ANISO_LINEAR;
-			break;
-		case GL_NEAREST_MIPMAP_NEAREST:
-		case GL_LINEAR_MIPMAP_NEAREST:
-			t->filter |=
-			    R300_TX_MIN_FILTER_ANISO_NEAREST_MIP_NEAREST;
-			break;
-		case GL_NEAREST_MIPMAP_LINEAR:
-		case GL_LINEAR_MIPMAP_LINEAR:
-			t->filter |=
-			    R300_TX_MIN_FILTER_ANISO_NEAREST_MIP_LINEAR;
-			break;
-		}
+	switch (minf) {
+	case GL_NEAREST:
+		t->filter |= R300_TX_MIN_FILTER_NEAREST;
+		break;
+	case GL_LINEAR:
+		t->filter |= R300_TX_MIN_FILTER_LINEAR;
+		break;
+	case GL_NEAREST_MIPMAP_NEAREST:
+		t->filter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_NEAREST;
+		break;
+	case GL_NEAREST_MIPMAP_LINEAR:
+		t->filter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_LINEAR;
+		break;
+	case GL_LINEAR_MIPMAP_NEAREST:
+		t->filter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_NEAREST;
+		break;
+	case GL_LINEAR_MIPMAP_LINEAR:
+		t->filter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_LINEAR;
+		break;
 	}
 
 	/* Note we don't have 3D mipmaps so only use the mag filter setting
@@ -252,6 +242,20 @@ static void r300SetTexBorderColor(r300TexObjPtr t, GLubyte c[4])
 	t->pp_border_color = PACK_COLOR_8888(c[3], c[0], c[1], c[2]);
 }
 
+static void r300SetTexLodBias(r300TexObjPtr t, GLfloat bias)
+{
+	GLuint b;
+	b = (unsigned int)fabsf(ceilf(bias*31));
+	if (signbit(bias)) {
+		b ^= 0x3ff; /* 10 bits */
+	}
+	b <<= 3;
+	b &= R300_LOD_BIAS_MASK;
+
+	t->filter_1 &= ~R300_LOD_BIAS_MASK;
+	t->filter_1 |= b;
+}
+
 /**
  * Allocate space for and load the mesa images into the texture memory block.
  * This will happen before drawing with a new texture, or drawing with a
@@ -278,8 +282,7 @@ static r300TexObjPtr r300AllocTexObj(struct gl_texture_object *texObj)
 		make_empty_list(&t->base);
 
 		r300SetTexWrap(t, texObj->WrapS, texObj->WrapT, texObj->WrapR);
-		r300SetTexMaxAnisotropy(t, texObj->MaxAnisotropy);
-		r300SetTexFilter(t, texObj->MinFilter, texObj->MagFilter);
+		r300SetTexFilter(t, texObj->MinFilter, texObj->MagFilter, texObj->MaxAnisotropy);
 		r300SetTexBorderColor(t, texObj->_BorderChan);
 	}
 
@@ -976,9 +979,38 @@ r300TexSubImage3D(GLcontext * ctx, GLenum target, GLint level,
 	t->dirty_images[0] |= (1 << level);
 }
 
+/* This feels like a prime target for code reuse, so I'm putting it here
+ * instead of inlining it in TexEnv. */
+static GLenum r300TexUnitTarget(struct gl_texture_unit *unit) {
+	if (unit->_ReallyEnabled & (TEXTURE_RECT_BIT)) {
+		return GL_TEXTURE_RECTANGLE_NV;
+	} else if (unit->_ReallyEnabled & (TEXTURE_1D_BIT)) {
+		return GL_TEXTURE_1D;
+	} else if (unit->_ReallyEnabled & (TEXTURE_2D_BIT)) {
+		return GL_TEXTURE_2D;
+	} else if (unit->_ReallyEnabled & (TEXTURE_3D_BIT)) {
+		return GL_TEXTURE_3D;
+	} else if (unit->_ReallyEnabled & (TEXTURE_CUBE_BIT)) {
+		return GL_TEXTURE_CUBE_MAP;
+	}
+	if (unit->Enabled & (TEXTURE_RECT_BIT)) {
+		return GL_TEXTURE_RECTANGLE_NV;
+	} else if (unit->Enabled & (TEXTURE_1D_BIT)) {
+		return GL_TEXTURE_1D;
+	} else if (unit->Enabled & (TEXTURE_2D_BIT)) {
+		return GL_TEXTURE_2D;
+	} else if (unit->Enabled & (TEXTURE_3D_BIT)) {
+		return GL_TEXTURE_3D;
+	} else if (unit->Enabled & (TEXTURE_CUBE_BIT)) {
+		return GL_TEXTURE_CUBE_MAP;
+	}
+	return 0;
+}
+
 static void r300TexEnv(GLcontext * ctx, GLenum target,
 		       GLenum pname, const GLfloat * param)
 {
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	if (RADEON_DEBUG & DEBUG_STATE) {
 		fprintf(stderr, "%s( %s )\n",
 			__FUNCTION__, _mesa_lookup_enum_by_nr(pname));
@@ -989,41 +1021,24 @@ static void r300TexEnv(GLcontext * ctx, GLenum target,
 	 * between them according to _ReallyEnabled.
 	 */
 	switch (pname) {
-	case GL_TEXTURE_LOD_BIAS_EXT:{
-#if 0				/* Needs to be relocated in order to make sure we got the right tmu */
-			GLfloat bias, min;
-			GLuint b;
-
-			/* The R300's LOD bias is a signed 2's complement value with a
-			 * range of -16.0 <= bias < 16.0.
-			 *
-			 * NOTE: Add a small bias to the bias for conform mipsel.c test.
-			 */
-			bias = *param + .01;
-			min =
-			    driQueryOptionb(&rmesa->radeon.optionCache,
-					    "no_neg_lod_bias") ? 0.0 : -16.0;
-			bias = CLAMP(bias, min, 16.0);
-
-			/* 0.0 - 16.0 == 0x0 - 0x1000 */
-			/* 0.0 - -16.0 == 0x1001 - 0x1fff */
-			b = 0x1000 / 16.0 * bias;
-			b &= R300_LOD_BIAS_MASK;
-
-			if (b !=
-			    (rmesa->hw.tex.unknown1.
-			     cmd[R300_TEX_VALUE_0 +
-				 unit] & R300_LOD_BIAS_MASK)) {
-				R300_STATECHANGE(rmesa, tex.unknown1);
-				rmesa->hw.tex.unknown1.cmd[R300_TEX_VALUE_0 +
-							   unit] &=
-				    ~R300_LOD_BIAS_MASK;
-				rmesa->hw.tex.unknown1.cmd[R300_TEX_VALUE_0 +
-							   unit] |= b;
-			}
-#endif
-			break;
-		}
+	case GL_TEXTURE_LOD_BIAS_EXT: {
+		/* Needs to be relocated in order to make sure we got the right tmu */
+		GLfloat bias, min;
+
+		/* The R300's LOD bias is a signed 2's complement value with a
+		 * range of -16.0 <= bias < 16.0.
+		 *
+		 * NOTE: Add a small bias to the bias for conform mipsel.c test.
+		 */
+		bias = *param + .01;
+		min = driQueryOptionb(&rmesa->radeon.optionCache,
+			"no_neg_lod_bias") ? 0.0 : -16.0;
+		bias = CLAMP(bias, min, 16.0);
+
+		rmesa->LODBias = bias;
+
+		break;
+	}
 
 	default:
 		return;
@@ -1050,8 +1065,7 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
 	case GL_TEXTURE_MIN_FILTER:
 	case GL_TEXTURE_MAG_FILTER:
 	case GL_TEXTURE_MAX_ANISOTROPY_EXT:
-		r300SetTexMaxAnisotropy(t, texObj->MaxAnisotropy);
-		r300SetTexFilter(t, texObj->MinFilter, texObj->MagFilter);
+		r300SetTexFilter(t, texObj->MinFilter, texObj->MagFilter, texObj->MaxAnisotropy);
 		break;
 
 	case GL_TEXTURE_WRAP_S:
@@ -1077,7 +1091,7 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
 		break;
 
 	case GL_DEPTH_TEXTURE_MODE:
-		if (texObj->Image[0][texObj->BaseLevel]->TexFormat->BaseFormat 
+		if (texObj->Image[0][texObj->BaseLevel]->TexFormat->BaseFormat
 		    == GL_DEPTH_COMPONENT) {
 			r300SetDepthTexMode(texObj);
 			break;
@@ -1092,10 +1106,6 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
 	default:
 		return;
 	}
-
-	/* Mark this texobj as dirty (one bit per tex unit)
-	 */
-	t->dirty_state = TEX_ALL;
 }
 
 static void r300BindTexture(GLcontext * ctx, GLenum target,
@@ -1157,6 +1167,10 @@ static struct gl_texture_object *r300NewTextureObject(GLcontext * ctx,
 		return NULL;
 	obj->MaxAnisotropy = rmesa->initialMaxAnisotropy;
 
+	/* Attempt to fill LOD bias, if previously set.
+	 * Should start at 0.0, which won't affect the HW. */
+	obj->LodBias = rmesa->LODBias;
+
 	r300AllocTexObj(obj);
 	return obj;
 }
diff --git a/src/mesa/drivers/dri/r300/r300_texmem.c b/src/mesa/drivers/dri/r300/r300_texmem.c
index 723601ac4a6..69847a4022d 100644
--- a/src/mesa/drivers/dri/r300/r300_texmem.c
+++ b/src/mesa/drivers/dri/r300/r300_texmem.c
@@ -349,7 +349,7 @@ static void r300UploadSubImage(r300ContextPtr rmesa, r300TexObjPtr t,
 	imageWidth = texImage->Width;
 	imageHeight = texImage->Height;
 
-	offset = t->bufAddr + t->base.totalSize / 6 * face;
+	offset = t->bufAddr;
 
 	if (RADEON_DEBUG & (DEBUG_TEXTURE | DEBUG_IOCTL)) {
 		GLint imageX = 0;
@@ -534,10 +534,6 @@ int r300UploadTexImages(r300ContextPtr rmesa, r300TexObjPtr t, GLuint face)
 			/* hope it's safe to add that here... */
 			t->offset |= t->tile_bits;
 		}
-
-		/* Mark this texobj as dirty on all units:
-		 */
-		t->dirty_state = TEX_ALL;
 	}
 
 	/* Let the world know we've used this memory recently.
diff --git a/src/mesa/drivers/dri/r300/r300_texstate.c b/src/mesa/drivers/dri/r300/r300_texstate.c
index 78fa75228e7..bdd20b18e44 100644
--- a/src/mesa/drivers/dri/r300/r300_texstate.c
+++ b/src/mesa/drivers/dri/r300/r300_texstate.c
@@ -127,18 +127,18 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
 {
 	static const GLuint formats[3][3] = {
 		{
-			R300_EASY_TX_FORMAT(X, X, X, X, X16),
 			R300_EASY_TX_FORMAT(X, X, X, ONE, X16),
+			R300_EASY_TX_FORMAT(X, X, X, X, X16),
 			R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, X16),
 		},
 		{
-			R300_EASY_TX_FORMAT(X, X, X, X, X24_Y8),
 			R300_EASY_TX_FORMAT(X, X, X, ONE, X24_Y8),
+			R300_EASY_TX_FORMAT(X, X, X, X, X24_Y8),
 			R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, X24_Y8),
 		},
 		{
-			R300_EASY_TX_FORMAT(X, X, X, X, X32),
 			R300_EASY_TX_FORMAT(X, X, X, ONE, X32),
+			R300_EASY_TX_FORMAT(X, X, X, X, X32),
 			R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, X32),
 		},
 	};
@@ -190,6 +190,112 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
 
 
 /**
+ * Compute sizes and fill in offset and blit information for the given
+ * image (determined by \p face and \p level).
+ *
+ * \param curOffset points to the offset at which the image is to be stored
+ * and is updated by this function according to the size of the image.
+ */
+static void compute_tex_image_offset(
+	struct gl_texture_object *tObj,
+	GLuint face,
+	GLint level,
+	GLint* curOffset)
+{
+	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+	const struct gl_texture_image* texImage;
+	GLuint blitWidth = R300_BLIT_WIDTH_BYTES;
+	GLuint texelBytes;
+	GLuint size;
+
+	texImage = tObj->Image[0][level + t->base.firstLevel];
+	if (!texImage)
+		return;
+
+	texelBytes = texImage->TexFormat->TexelBytes;
+
+	/* find image size in bytes */
+	if (texImage->IsCompressed) {
+		if ((t->format & R300_TX_FORMAT_DXT1) ==
+			R300_TX_FORMAT_DXT1) {
+			// fprintf(stderr,"DXT 1 %d %08X\n", texImage->Width, t->format);
+			if ((texImage->Width + 3) < 8)	/* width one block */
+				size = texImage->CompressedSize * 4;
+			else if ((texImage->Width + 3) < 16)
+				size = texImage->CompressedSize * 2;
+			else
+				size = texImage->CompressedSize;
+		} else {
+			/* DXT3/5, 16 bytes per block */
+			WARN_ONCE
+				("DXT 3/5 suffers from multitexturing problems!\n");
+			// fprintf(stderr,"DXT 3/5 %d\n", texImage->Width);
+			if ((texImage->Width + 3) < 8)
+				size = texImage->CompressedSize * 2;
+			else
+				size = texImage->CompressedSize;
+		}
+	} else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
+		size =
+			((texImage->Width * texelBytes +
+			63) & ~63) * texImage->Height;
+		blitWidth = 64 / texelBytes;
+	} else if (t->tile_bits & R300_TXO_MICRO_TILE) {
+		/* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
+			though the actual offset may be different (if texture is less than
+			32 bytes width) to the untiled case */
+		int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
+		size =
+			(w * ((texImage->Height + 1) / 2)) *
+			texImage->Depth;
+		blitWidth = MAX2(texImage->Width, 64 / texelBytes);
+	} else {
+		int w = (texImage->Width * texelBytes + 31) & ~31;
+		size = w * texImage->Height * texImage->Depth;
+		blitWidth = MAX2(texImage->Width, 64 / texelBytes);
+	}
+	assert(size > 0);
+
+	if (RADEON_DEBUG & DEBUG_TEXTURE)
+		fprintf(stderr, "w=%d h=%d d=%d tb=%d intFormat=%d\n",
+			texImage->Width, texImage->Height,
+			texImage->Depth,
+			texImage->TexFormat->TexelBytes,
+			texImage->InternalFormat);
+
+	/* All images are aligned to a 32-byte offset */
+	*curOffset = (*curOffset + 0x1f) & ~0x1f;
+
+	if (texelBytes) {
+		/* fix x and y coords up later together with offset */
+		t->image[face][level].x = *curOffset;
+		t->image[face][level].y = 0;
+		t->image[face][level].width =
+			MIN2(size / texelBytes, blitWidth);
+		t->image[face][level].height =
+			(size / texelBytes) / t->image[face][level].width;
+	} else {
+		t->image[face][level].x = *curOffset % R300_BLIT_WIDTH_BYTES;
+		t->image[face][level].y = *curOffset / R300_BLIT_WIDTH_BYTES;
+		t->image[face][level].width =
+			MIN2(size, R300_BLIT_WIDTH_BYTES);
+		t->image[face][level].height = size / t->image[face][level].width;
+	}
+
+	if (RADEON_DEBUG & DEBUG_TEXTURE)
+		fprintf(stderr,
+			"level %d, face %d: %dx%d x=%d y=%d w=%d h=%d size=%d at %d\n",
+			level, face, texImage->Width, texImage->Height,
+			t->image[face][level].x, t->image[face][level].y,
+			t->image[face][level].width, t->image[face][level].height,
+			size, *curOffset);
+
+	*curOffset += size;
+}
+
+
+
+/**
  * This function computes the number of bytes of storage needed for
  * the given texture object (all mipmap levels, all cube faces).
  * The \c image[face][level].x/y/width/height parameters for upload/blitting
@@ -206,7 +312,7 @@ static void r300SetTexImages(r300ContextPtr rmesa,
 	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
 	const struct gl_texture_image *baseImage =
 	    tObj->Image[0][tObj->BaseLevel];
-	GLint curOffset, blitWidth;
+	GLint curOffset;
 	GLint i, texelBytes;
 	GLint numLevels;
 	GLint log2Width, log2Height, log2Depth;
@@ -245,8 +351,6 @@ static void r300SetTexImages(r300ContextPtr rmesa,
 	 * The idea is that we lay out the mipmap levels within a block of
 	 * memory organized as a rectangle of width BLIT_WIDTH_BYTES.
 	 */
-	curOffset = 0;
-	blitWidth = R300_BLIT_WIDTH_BYTES;
 	t->tile_bits = 0;
 
 	/* figure out if this texture is suitable for tiling. */
@@ -276,94 +380,20 @@ static void r300SetTexImages(r300ContextPtr rmesa,
 	}
 #endif
 
-	for (i = 0; i < numLevels; i++) {
-		const struct gl_texture_image *texImage;
-		GLuint size;
-
-		texImage = tObj->Image[0][i + t->base.firstLevel];
-		if (!texImage)
-			break;
-
-		/* find image size in bytes */
-		if (texImage->IsCompressed) {
-			if ((t->format & R300_TX_FORMAT_DXT1) ==
-			    R300_TX_FORMAT_DXT1) {
-				// fprintf(stderr,"DXT 1 %d %08X\n", texImage->Width, t->format);
-				if ((texImage->Width + 3) < 8)	/* width one block */
-					size = texImage->CompressedSize * 4;
-				else if ((texImage->Width + 3) < 16)
-					size = texImage->CompressedSize * 2;
-				else
-					size = texImage->CompressedSize;
-			} else {
-				/* DXT3/5, 16 bytes per block */
-				WARN_ONCE
-				    ("DXT 3/5 suffers from multitexturing problems!\n");
-				// fprintf(stderr,"DXT 3/5 %d\n", texImage->Width);
-				if ((texImage->Width + 3) < 8)
-					size = texImage->CompressedSize * 2;
-				else
-					size = texImage->CompressedSize;
-			}
-		} else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-			size =
-			    ((texImage->Width * texelBytes +
-			      63) & ~63) * texImage->Height;
-			blitWidth = 64 / texelBytes;
-		} else if (t->tile_bits & R300_TXO_MICRO_TILE) {
-			/* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
-			   though the actual offset may be different (if texture is less than
-			   32 bytes width) to the untiled case */
-			int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
-			size =
-			    (w * ((texImage->Height + 1) / 2)) *
-			    texImage->Depth;
-			blitWidth = MAX2(texImage->Width, 64 / texelBytes);
-		} else {
-			int w = (texImage->Width * texelBytes + 31) & ~31;
-			size = w * texImage->Height * texImage->Depth;
-			blitWidth = MAX2(texImage->Width, 64 / texelBytes);
-		}
-		assert(size > 0);
-
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "w=%d h=%d d=%d tb=%d intFormat=%d\n",
-				texImage->Width, texImage->Height,
-				texImage->Depth,
-				texImage->TexFormat->TexelBytes,
-				texImage->InternalFormat);
-
-		/* Align to 32-byte offset.  It is faster to do this unconditionally
-		 * (no branch penalty).
-		 */
+	curOffset = 0;
 
-		curOffset = (curOffset + 0x1f) & ~0x1f;
+	if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
+		ASSERT(log2Width == log2Height);
+		t->format |= R300_TX_FORMAT_CUBIC_MAP;
 
-		if (texelBytes) {
-			/* fix x and y coords up later together with offset */
-			t->image[0][i].x = curOffset;
-			t->image[0][i].y = 0;
-			t->image[0][i].width =
-			    MIN2(size / texelBytes, blitWidth);
-			t->image[0][i].height =
-			    (size / texelBytes) / t->image[0][i].width;
-		} else {
-			t->image[0][i].x = curOffset % R300_BLIT_WIDTH_BYTES;
-			t->image[0][i].y = curOffset / R300_BLIT_WIDTH_BYTES;
-			t->image[0][i].width =
-			    MIN2(size, R300_BLIT_WIDTH_BYTES);
-			t->image[0][i].height = size / t->image[0][i].width;
+		for(i = 0; i < numLevels; i++) {
+			GLuint face;
+			for(face = 0; face < 6; face++)
+				compute_tex_image_offset(tObj, face, i, &curOffset);
 		}
-
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr,
-				"level %d: %dx%d x=%d y=%d w=%d h=%d size=%d at %d\n",
-				i, texImage->Width, texImage->Height,
-				t->image[0][i].x, t->image[0][i].y,
-				t->image[0][i].width, t->image[0][i].height,
-				size, curOffset);
-
-		curOffset += size;
+	} else {
+		for (i = 0; i < numLevels; i++)
+			compute_tex_image_offset(tObj, 0, i, &curOffset);
 	}
 
 	/* Align the total size of texture memory block.
@@ -371,26 +401,6 @@ static void r300SetTexImages(r300ContextPtr rmesa,
 	t->base.totalSize =
 	    (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
 
-	/* Setup remaining cube face blits, if needed */
-	if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
-		GLuint face;
-		for (face = 1; face < 6; face++) {
-			for (i = 0; i < numLevels; i++) {
-				t->image[face][i].x = t->image[0][i].x;
-				t->image[face][i].y = t->image[0][i].y;
-				t->image[face][i].width = t->image[0][i].width;
-				t->image[face][i].height =
-				    t->image[0][i].height;
-			}
-		}
-		t->base.totalSize *= 6;	/* total texmem needed */
-	}
-
-	if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
-		ASSERT(log2Width == log2Height);
-		t->format |= R300_TX_FORMAT_CUBIC_MAP;
-	}
-
 	t->size =
 	    (((tObj->Image[0][t->base.firstLevel]->Width -
 	       1) << R300_TX_WIDTHMASK_SHIFT)
@@ -408,7 +418,7 @@ static void r300SetTexImages(r300ContextPtr rmesa,
 		t->pitch |=
 		    (tObj->Image[0][t->base.firstLevel]->Width + 63) & ~(63);
 	} else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-		unsigned int align = blitWidth - 1;
+		unsigned int align = (64 / texelBytes) - 1;
 		t->pitch |= ((tObj->Image[0][t->base.firstLevel]->Width *
 			     texelBytes) + 63) & ~(63);
 		t->size |= R300_TX_SIZE_TXPITCH_EN;
@@ -428,10 +438,6 @@ static void r300SetTexImages(r300ContextPtr rmesa,
 	    if (tObj->Image[0][t->base.firstLevel]->Height > 2048)
 		t->pitch_reg |= R500_TXHEIGHT_BIT11;
 	}
-
-	t->dirty_state = TEX_ALL;
-
-	/* FYI: r300UploadTexImages( rmesa, t ) used to be called here */
 }
 
 /* ================================================================
@@ -568,7 +574,6 @@ static GLboolean r300UpdateTexture(GLcontext * ctx, int unit)
 
 		rmesa->state.texture.unit[unit].texobj = t;
 		t->base.bound |= (1 << unit);
-		t->dirty_state |= 1 << unit;
 		driUpdateTextureLRU((driTextureObject *) t);	/* XXX: should be locked! */
 	}
 
diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.c b/src/mesa/drivers/dri/r300/r500_fragprog.c
index b967aa2d737..5d72ec2784f 100644
--- a/src/mesa/drivers/dri/r300/r500_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r500_fragprog.c
@@ -1,8 +1,5 @@
 /*
- * Copyright (C) 2005 Ben Skeggs.
- *
  * Copyright 2008 Corbin Simpson <[email protected]>
- * Adaptation and modification for ATI/AMD Radeon R500 GPU chipsets.
  *
  * All Rights Reserved.
  *
@@ -28,1439 +25,326 @@
  *
  */
 
+#include "r500_fragprog.h"
+
+static void reset_srcreg(struct prog_src_register* reg)
+{
+	_mesa_bzero(reg, sizeof(*reg));
+	reg->Swizzle = SWIZZLE_NOOP;
+}
+
 /**
- * \file
- *
- * \author Ben Skeggs <[email protected]>
- *
- * \author Jerome Glisse <[email protected]>
- *
- * \author Corbin Simpson <[email protected]>
+ * Transform TEX, TXP, TXB, and KIL instructions in the following way:
+ *  - premultiply texture coordinates for RECT
+ *  - extract operand swizzles
+ *  - introduce a temporary register when write masks are needed
  *
- * \todo Depth write, WPOS/FOGC inputs
- *
- * \todo FogOption
- *
- * \todo Verify results of opcodes for accuracy, I've only checked them in
- * specific cases.
  */
+static GLboolean transform_TEX(
+	struct radeon_program_transform_context* context,
+	struct prog_instruction* orig_inst, void* data)
+{
+	struct r500_fragment_program_compiler *compiler =
+		(struct r500_fragment_program_compiler*)data;
+	struct prog_instruction inst = *orig_inst;
+	struct prog_instruction* tgt;
+	GLboolean destredirect = GL_FALSE;
+
+	if (inst.Opcode != OPCODE_TEX &&
+	    inst.Opcode != OPCODE_TXB &&
+	    inst.Opcode != OPCODE_TXP &&
+	    inst.Opcode != OPCODE_KIL)
+		return GL_FALSE;
 
-#include "glheader.h"
-#include "macros.h"
-#include "enums.h"
-#include "shader/prog_instruction.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-
-#include "r300_context.h"
-#include "r500_fragprog.h"
-#include "r300_reg.h"
-#include "r300_state.h"
+	/* ARB_shadow & EXT_shadow_funcs */
+	if (inst.Opcode != OPCODE_KIL &&
+	    compiler->fp->mesa_program.Base.ShadowSamplers & (1 << inst.TexSrcUnit)) {
+		GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func;
+
+		if (comparefunc == GL_NEVER || comparefunc == GL_ALWAYS) {
+			tgt = radeonClauseInsertInstructions(context->compiler, context->dest,
+				context->dest->NumInstructions, 1);
+
+			tgt->Opcode = OPCODE_MOV;
+			tgt->DstReg.File = inst.DstReg.File;
+			tgt->DstReg.Index = inst.DstReg.Index;
+			tgt->DstReg.WriteMask = inst.DstReg.WriteMask;
+			tgt->SrcReg[0].File = PROGRAM_BUILTIN;
+			tgt->SrcReg[0].Swizzle = comparefunc == GL_ALWAYS ? SWIZZLE_1111 : SWIZZLE_0000;
+			return GL_TRUE;
+		}
 
-/*
- * Useful macros and values
- */
-#define ERROR(fmt, args...) do {			\
-		fprintf(stderr, "%s::%s(): " fmt "\n",	\
-			__FILE__, __FUNCTION__, ##args);	\
-		fp->error = GL_TRUE;			\
-	} while(0)
-
-#define COMPILE_STATE struct r300_pfs_compile_state *cs = fp->cs
-
-#define R500_US_NUM_TEMP_REGS 128
-#define R500_US_NUM_CONST_REGS 256
-
-/* "Register" flags */
-#define REG_CONSTANT (1 << 8)
-#define REG_SRC_REL (1 << 9)
-#define REG_DEST_REL (1 << 7)
-
-/* Swizzle tools */
-#define R500_SWIZZLE_ZERO 4
-#define R500_SWIZZLE_HALF 5
-#define R500_SWIZZLE_ONE 6
-#define R500_SWIZ_RGB_ZERO ((4 << 0) | (4 << 3) | (4 << 6))
-#define R500_SWIZ_RGB_ONE ((6 << 0) | (6 << 3) | (6 << 6))
-#define R500_SWIZ_RGB_RGB ((0 << 0) | (1 << 3) | (2 << 6))
-#define R500_SWIZ_MOD_NEG 1
-#define R500_SWIZ_MOD_ABS 2
-#define R500_SWIZ_MOD_NEG_ABS 3
-/* Swizzles for inst2 */
-#define MAKE_SWIZ_TEX_STRQ(x) (x << 8)
-#define MAKE_SWIZ_TEX_RGBA(x) (x << 24)
-/* Swizzles for inst3 */
-#define MAKE_SWIZ_RGB_A(x) (x << 2)
-#define MAKE_SWIZ_RGB_B(x) (x << 15)
-/* Swizzles for inst4 */
-#define MAKE_SWIZ_ALPHA_A(x) (x << 14)
-#define MAKE_SWIZ_ALPHA_B(x) (x << 21)
-/* Swizzle for inst5 */
-#define MAKE_SWIZ_RGBA_C(x) (x << 14)
-#define MAKE_SWIZ_ALPHA_C(x) (x << 27)
-
-/* Writemasks */
-#define R500_WRITEMASK_G 0x2
-#define R500_WRITEMASK_B 0x4
-#define R500_WRITEMASK_RGB 0x7
-#define R500_WRITEMASK_A 0x8
-#define R500_WRITEMASK_AR 0x9
-#define R500_WRITEMASK_AG 0xA
-#define R500_WRITEMASK_ARG 0xB
-#define R500_WRITEMASK_AB 0xC
-#define R500_WRITEMASK_ARGB 0xF
-
-/* 1/(2pi), needed for quick modulus in trig insts
- * Thanks to glisse for pointing out how to do it! */
-static const GLfloat RCP_2PI[] = {0.15915494309189535,
-	0.15915494309189535,
-	0.15915494309189535,
-	0.15915494309189535};
-
-static const GLfloat LIT[] = {127.999999,
-	127.999999,
-	127.999999,
-	-127.999999};
-
-static void dump_program(struct r500_fragment_program *fp);
-
-static inline GLuint make_rgb_swizzle(struct prog_src_register src) {
-	GLuint swiz = 0x0;
-	GLuint temp;
-	/* This could be optimized, but it should be plenty fast already. */
-	int i;
-	for (i = 0; i < 3; i++) {
-	        temp = GET_SWZ(src.Swizzle, i);
-		/* Fix SWIZZLE_ONE */
-		if (temp == 5) temp++;
-		swiz |= temp << i*3;
+		inst.DstReg.File = PROGRAM_TEMPORARY;
+		inst.DstReg.Index = radeonCompilerAllocateTemporary(context->compiler);
+		inst.DstReg.WriteMask = WRITEMASK_XYZW;
 	}
-	if (src.NegateBase)
-		swiz |= (R500_SWIZ_MOD_NEG << 9);
-	return swiz;
-}
 
-static inline GLuint make_rgba_swizzle(GLuint src) {
-	GLuint swiz = 0x0;
-	GLuint temp;
-	int i;
-	for (i = 0; i < 4; i++) {
-	        temp = GET_SWZ(src, i);
-		/* Fix SWIZZLE_ONE */
-		if (temp == 5) temp++;
-		swiz |= temp << i*3;
+	tgt = radeonClauseInsertInstructions(context->compiler, context->dest,
+		context->dest->NumInstructions, 1);
+	_mesa_copy_instructions(tgt, &inst, 1);
+
+	if (inst.Opcode != OPCODE_KIL &&
+	    compiler->fp->mesa_program.Base.ShadowSamplers & (1 << inst.TexSrcUnit)) {
+		GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func;
+		GLuint depthmode = compiler->fp->state.unit[inst.TexSrcUnit].depth_texture_mode;
+
+		tgt = radeonClauseInsertInstructions(context->compiler, context->dest,
+			context->dest->NumInstructions, 2);
+
+		tgt[0].Opcode = OPCODE_MAD;
+		tgt[0].DstReg = inst.DstReg;
+		tgt[0].DstReg.WriteMask = orig_inst->DstReg.WriteMask;
+		tgt[0].SrcReg[0].File = PROGRAM_TEMPORARY;
+		tgt[0].SrcReg[0].Index = inst.DstReg.Index;
+		if (depthmode == 0) /* GL_LUMINANCE */
+			tgt[0].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z);
+		else if (depthmode == 2) /* GL_ALPHA */
+			tgt[0].SrcReg[0].Swizzle = SWIZZLE_WWWW;
+		tgt[0].SrcReg[1].File = PROGRAM_BUILTIN;
+		tgt[0].SrcReg[1].Swizzle = SWIZZLE_1111;
+		tgt[0].SrcReg[2] = inst.SrcReg[0];
+		tgt[0].SrcReg[2].Swizzle = SWIZZLE_ZZZZ;
+
+		/* Recall that SrcReg[0] is tex, SrcReg[2] is r and:
+		 *   r  < tex  <=>      -tex+r < 0
+		 *   r >= tex  <=> not (-tex+r < 0) */
+		if (comparefunc == GL_LESS || comparefunc == GL_GEQUAL)
+			tgt[0].SrcReg[0].NegateBase = tgt[0].SrcReg[0].NegateBase ^ NEGATE_XYZW;
+		else
+			tgt[0].SrcReg[2].NegateBase = tgt[0].SrcReg[2].NegateBase ^ NEGATE_XYZW;
+
+		tgt[1].Opcode = OPCODE_CMP;
+		tgt[1].DstReg = orig_inst->DstReg;
+		tgt[1].SrcReg[0].File = PROGRAM_TEMPORARY;
+		tgt[1].SrcReg[0].Index = tgt[0].DstReg.Index;
+		tgt[1].SrcReg[1].File = PROGRAM_BUILTIN;
+		tgt[1].SrcReg[2].File = PROGRAM_BUILTIN;
+
+		if (comparefunc == GL_LESS || comparefunc == GL_GREATER) {
+			tgt[1].SrcReg[1].Swizzle = SWIZZLE_1111;
+			tgt[1].SrcReg[2].Swizzle = SWIZZLE_0000;
+		} else {
+			tgt[1].SrcReg[1].Swizzle = SWIZZLE_0000;
+			tgt[1].SrcReg[2].Swizzle = SWIZZLE_1111;
+		}
+	} else if (destredirect) {
+		tgt = radeonClauseInsertInstructions(context->compiler, context->dest,
+			context->dest->NumInstructions, 1);
+
+		tgt->Opcode = OPCODE_MAD;
+		tgt->DstReg = orig_inst->DstReg;
+		tgt->SrcReg[0].File = PROGRAM_TEMPORARY;
+		tgt->SrcReg[0].Index = inst.DstReg.Index;
+		tgt->SrcReg[1].File = PROGRAM_BUILTIN;
+		tgt->SrcReg[1].Swizzle = SWIZZLE_1111;
+		tgt->SrcReg[2].File = PROGRAM_BUILTIN;
+		tgt->SrcReg[2].Swizzle = SWIZZLE_0000;
 	}
-	return swiz;
-}
 
-static inline GLuint make_alpha_swizzle(struct prog_src_register src) {
-	GLuint swiz = GET_SWZ(src.Swizzle, 3);
-
-	if (swiz == 5) swiz++;
-
-	if (src.NegateBase)
-		swiz |= (R500_SWIZ_MOD_NEG << 3);
-
-	return swiz;
+	return GL_TRUE;
 }
 
-static inline GLuint make_sop_swizzle(struct prog_src_register src) {
-	GLuint swiz = GET_SWZ(src.Swizzle, 0);
 
-	if (swiz == 5) swiz++;
-	return swiz;
-}
+static void update_params(r300ContextPtr r300, struct r500_fragment_program *fp)
+{
+	struct gl_fragment_program *mp = &fp->mesa_program;
 
-static inline GLuint make_strq_swizzle(struct prog_src_register src) {
-	GLuint swiz = 0x0, temp = 0x0;
-	int i;
-	for (i = 0; i < 4; i++) {
-		temp = GET_SWZ(src.Swizzle, i) & 0x3;
-		swiz |= temp << i*2;
-	}
-	return swiz;
+	/* Ask Mesa nicely to fill in ParameterValues for us */
+	if (mp->Base.Parameters)
+		_mesa_load_state_parameters(r300->radeon.glCtx, mp->Base.Parameters);
 }
 
-static int get_temp(struct r500_fragment_program *fp, int slot) {
-
-	COMPILE_STATE;
-
-	int r = fp->temp_reg_offset + cs->temp_in_use + slot;
 
-	if (r > R500_US_NUM_TEMP_REGS) {
-		ERROR("Too many temporary registers requested, can't compile!\n");
-	}
-
-	return r;
-}
-
-/* Borrowed verbatim from r300_fragprog since it hasn't changed. */
-static GLuint emit_const4fv(struct r500_fragment_program *fp,
-			    const GLfloat * cp)
+/**
+ * Transform the program to support fragment.position.
+ *
+ * Introduce a small fragment at the start of the program that will be
+ * the only code that directly reads the FRAG_ATTRIB_WPOS input.
+ * All other code pieces that reference that input will be rewritten
+ * to read from a newly allocated temporary.
+ *
+ * \todo if/when r5xx supports the radeon_program architecture, this is a
+ * likely candidate for code sharing.
+ */
+static void insert_WPOS_trailer(struct r500_fragment_program_compiler *compiler)
 {
-	GLuint reg = 0x0;
-	int index;
+	GLuint InputsRead = compiler->fp->mesa_program.Base.InputsRead;
 
-	for (index = 0; index < fp->const_nr; ++index) {
-		if (fp->constant[index] == cp)
-			break;
-	}
+	if (!(InputsRead & FRAG_BIT_WPOS))
+		return;
 
-	if (index >= fp->const_nr) {
-		if (index >= R500_US_NUM_CONST_REGS) {
-			ERROR("Out of hw constants!\n");
-			return reg;
+	static gl_state_index tokens[STATE_LENGTH] = {
+		STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
+	};
+	struct prog_instruction *fpi;
+	GLuint window_index;
+	int i = 0;
+	GLuint tempregi = radeonCompilerAllocateTemporary(&compiler->compiler);
+
+	fpi = radeonClauseInsertInstructions(&compiler->compiler, &compiler->compiler.Clauses[0], 0, 3);
+
+	/* perspective divide */
+	fpi[i].Opcode = OPCODE_RCP;
+
+	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
+	fpi[i].DstReg.Index = tempregi;
+	fpi[i].DstReg.WriteMask = WRITEMASK_W;
+	fpi[i].DstReg.CondMask = COND_TR;
+
+	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
+	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
+	fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
+	i++;
+
+	fpi[i].Opcode = OPCODE_MUL;
+
+	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
+	fpi[i].DstReg.Index = tempregi;
+	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
+	fpi[i].DstReg.CondMask = COND_TR;
+
+	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
+	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
+	fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
+
+	fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
+	fpi[i].SrcReg[1].Index = tempregi;
+	fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
+	i++;
+
+	/* viewport transformation */
+	window_index = _mesa_add_state_reference(compiler->fp->mesa_program.Base.Parameters, tokens);
+
+	fpi[i].Opcode = OPCODE_MAD;
+
+	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
+	fpi[i].DstReg.Index = tempregi;
+	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
+	fpi[i].DstReg.CondMask = COND_TR;
+
+	fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
+	fpi[i].SrcReg[0].Index = tempregi;
+	fpi[i].SrcReg[0].Swizzle =
+	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+
+	fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
+	fpi[i].SrcReg[1].Index = window_index;
+	fpi[i].SrcReg[1].Swizzle =
+	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+
+	fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
+	fpi[i].SrcReg[2].Index = window_index;
+	fpi[i].SrcReg[2].Swizzle =
+	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+	i++;
+
+	for (; i < compiler->compiler.Clauses[0].NumInstructions; ++i) {
+		int reg;
+		for (reg = 0; reg < 3; reg++) {
+			if (fpi[i].SrcReg[reg].File == PROGRAM_INPUT &&
+			    fpi[i].SrcReg[reg].Index == FRAG_ATTRIB_WPOS) {
+				fpi[i].SrcReg[reg].File = PROGRAM_TEMPORARY;
+				fpi[i].SrcReg[reg].Index = tempregi;
+			}
 		}
-
-		fp->const_nr++;
-		fp->constant[index] = cp;
-	}
-
-	reg = index | REG_CONSTANT;
-	return reg;
-}
-
-static GLuint make_src(struct r500_fragment_program *fp, struct prog_src_register src) {
-	COMPILE_STATE;
-	GLuint reg;
-	switch (src.File) {
-	case PROGRAM_TEMPORARY:
-		reg = src.Index + fp->temp_reg_offset;
-		break;
-	case PROGRAM_INPUT:
-		reg = cs->inputs[src.Index].reg;
-		break;
-	case PROGRAM_LOCAL_PARAM:
-		reg = emit_const4fv(fp,
-				    fp->mesa_program.Base.LocalParams[src.
-								      Index]);
-		break;
-	case PROGRAM_ENV_PARAM:
-		reg = emit_const4fv(fp,
-				    fp->ctx->FragmentProgram.Parameters[src.
-									Index]);
-		break;
-	case PROGRAM_STATE_VAR:
-	case PROGRAM_NAMED_PARAM:
-	case PROGRAM_CONSTANT:
-		reg = emit_const4fv(fp, fp->mesa_program.Base.Parameters->
-				    ParameterValues[src.Index]);
-		break;
-	default:
-		ERROR("Can't handle src.File %x\n", src.File);
-		reg = 0x0;
-		break;
 	}
-	return reg;
 }
 
-static GLuint make_dest(struct r500_fragment_program *fp, struct prog_dst_register dest) {
-	GLuint reg;
-	switch (dest.File) {
-		case PROGRAM_TEMPORARY:
-			reg = dest.Index + fp->temp_reg_offset;
-			break;
-		case PROGRAM_OUTPUT:
-			/* Eventually we may need to handle multiple
-			 * rendering targets... */
-			reg = dest.Index;
-			break;
-		default:
-			ERROR("Can't handle dest.File %x\n", dest.File);
-			reg = 0x0;
-			break;
-	}
-	return reg;
-}
 
-static void emit_tex(struct r500_fragment_program *fp,
-		     struct prog_instruction *fpi, int dest, int counter)
+static GLuint build_dtm(GLuint depthmode)
 {
-	int hwsrc, hwdest;
-	GLuint mask;
-
-	mask = fpi->DstReg.WriteMask << 11;
-	hwsrc = make_src(fp, fpi->SrcReg[0]);
-
-	if (fpi->DstReg.File == PROGRAM_OUTPUT) {
-		hwdest = get_temp(fp, 0);
-	} else {
-		hwdest = dest;
-	}
-
-	fp->inst[counter].inst0 = R500_INST_TYPE_TEX | mask
-		| R500_INST_TEX_SEM_WAIT;
-
-	fp->inst[counter].inst1 = R500_TEX_ID(fpi->TexSrcUnit)
-		| R500_TEX_SEM_ACQUIRE | R500_TEX_IGNORE_UNCOVERED;
-	
-	if (fpi->TexSrcTarget == TEXTURE_RECT_INDEX)
-	        fp->inst[counter].inst1 |= R500_TEX_UNSCALED;
-
-	switch (fpi->Opcode) {
-	case OPCODE_KIL:
-		fp->inst[counter].inst1 |= R500_TEX_INST_TEXKILL;
-		break;
-	case OPCODE_TEX:
-		fp->inst[counter].inst1 |= R500_TEX_INST_LD;
-		break;
-	case OPCODE_TXB:
-		fp->inst[counter].inst1 |= R500_TEX_INST_LODBIAS;
-		break;
-	case OPCODE_TXP:
-		fp->inst[counter].inst1 |= R500_TEX_INST_PROJ;
-		break;
+	switch(depthmode) {
 	default:
-		ERROR("emit_tex can't handle opcode %x\n", fpi->Opcode);
+	case GL_LUMINANCE: return 0;
+	case GL_INTENSITY: return 1;
+	case GL_ALPHA: return 2;
 	}
-
-	fp->inst[counter].inst2 = R500_TEX_SRC_ADDR(hwsrc)
-		| MAKE_SWIZ_TEX_STRQ(make_strq_swizzle(fpi->SrcReg[0]))
-		/* | R500_TEX_SRC_S_SWIZ_R | R500_TEX_SRC_T_SWIZ_G
-		| R500_TEX_SRC_R_SWIZ_B | R500_TEX_SRC_Q_SWIZ_A */
-		| R500_TEX_DST_ADDR(hwdest)
-		| R500_TEX_DST_R_SWIZ_R | R500_TEX_DST_G_SWIZ_G
-		| R500_TEX_DST_B_SWIZ_B | R500_TEX_DST_A_SWIZ_A;
-
-	fp->inst[counter].inst3 = 0x0;
-	fp->inst[counter].inst4 = 0x0;
-	fp->inst[counter].inst5 = 0x0;
-
-	if (fpi->DstReg.File == PROGRAM_OUTPUT) {
-		counter++;
-		fp->inst[counter].inst0 = R500_INST_TYPE_OUT
-			| R500_INST_TEX_SEM_WAIT | (mask << 4);
-		fp->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(fp, 0));
-		fp->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(fp, 0));
-		fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-			| MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB)
-			| R500_ALU_RGB_SEL_B_SRC0
-			| MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB)
-			| R500_ALU_RGB_OMOD_DISABLE;
-		fp->inst[counter].inst4 = R500_ALPHA_OP_CMP
-			| R500_ALPHA_ADDRD(dest)
-			| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_ALPHA_SWIZ_A_A)
-			| R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(R500_ALPHA_SWIZ_A_A)
-			| R500_ALPHA_OMOD_DISABLE;
-		fp->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
-			| R500_ALU_RGBA_ADDRD(dest)
-			| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
-			| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
-	}
-}
-
-static void emit_alu(struct r500_fragment_program *fp, int counter, struct prog_instruction *fpi) {
-	/* Ideally, we shouldn't have to explicitly clear memory here! */
-	fp->inst[counter].inst0 = 0x0;
-	fp->inst[counter].inst1 = 0x0;
-	fp->inst[counter].inst2 = 0x0;
-	fp->inst[counter].inst3 = 0x0;
-	fp->inst[counter].inst4 = 0x0;
-	fp->inst[counter].inst5 = 0x0;
-
-	if (fpi->DstReg.File == PROGRAM_OUTPUT) {
-		fp->inst[counter].inst0 = R500_INST_TYPE_OUT;
-
-		if (fpi->DstReg.Index == FRAG_RESULT_COLR)
-			fp->inst[counter].inst0 |= (fpi->DstReg.WriteMask << 15);
-
-		if (fpi->DstReg.Index == FRAG_RESULT_DEPR) {
-			fp->inst[counter].inst4 |= R500_ALPHA_W_OMASK;
-			/* Notify the state emission! */
-			fp->writes_depth = GL_TRUE;
-		}
-	} else {
-		fp->inst[counter].inst0 = R500_INST_TYPE_ALU
-			/* pixel_mask */
-			| (fpi->DstReg.WriteMask << 11);
-	}
-
-	fp->inst[counter].inst0 |= R500_INST_TEX_SEM_WAIT;
 }
 
-static void emit_mov(struct r500_fragment_program *fp, int counter, struct prog_instruction *fpi, GLuint src_reg, GLuint swizzle, GLuint dest) {
-	/* The r3xx shader uses MAD to implement MOV. We are using CMP, since
-	 * it is technically more accurate and recommended by ATI/AMD. */
-	emit_alu(fp, counter, fpi);
-	fp->inst[counter].inst1 = R500_RGB_ADDR0(src_reg);
-	fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src_reg);
-	/* (De)mangle the swizzle from Mesa to R500. */
-	swizzle = make_rgba_swizzle(swizzle);
-	/* 0x1FF is 9 bits, size of an RGB swizzle. */
-	fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-		| MAKE_SWIZ_RGB_A((swizzle & 0x1ff))
-		| R500_ALU_RGB_SEL_B_SRC0
-		| MAKE_SWIZ_RGB_B((swizzle & 0x1ff))
-		| R500_ALU_RGB_OMOD_DISABLE;
-	fp->inst[counter].inst4 |= R500_ALPHA_OP_CMP
-		| R500_ALPHA_ADDRD(dest)
-		| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(GET_SWZ(swizzle, 3))
-		| R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(GET_SWZ(swizzle, 3))
-		| R500_ALPHA_OMOD_DISABLE;
-	fp->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
-		| R500_ALU_RGBA_ADDRD(dest)
-		| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
-		| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
-}
-
-static void emit_mad(struct r500_fragment_program *fp, int counter, struct prog_instruction *fpi, int one, int two, int three) {
-	/* Note: This code was all Corbin's. Corbin is a rather hackish coder.
-	 * If you can make it pretty or fast, please do so! */
-	emit_alu(fp, counter, fpi);
-	/* Common MAD stuff */
-	fp->inst[counter].inst4 |= R500_ALPHA_OP_MAD
-		| R500_ALPHA_ADDRD(make_dest(fp, fpi->DstReg));
-	fp->inst[counter].inst5 |= R500_ALU_RGBA_OP_MAD
-		| R500_ALU_RGBA_ADDRD(make_dest(fp, fpi->DstReg));
-	switch (one) {
-		case 0:
-		case 1:
-		case 2:
-			fp->inst[counter].inst1 |= R500_RGB_ADDR0(make_src(fp, fpi->SrcReg[one]));
-			fp->inst[counter].inst2 |= R500_ALPHA_ADDR0(make_src(fp, fpi->SrcReg[one]));
-			fp->inst[counter].inst3 |= R500_ALU_RGB_SEL_A_SRC0
-				| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[one]));
-			fp->inst[counter].inst4 |= R500_ALPHA_SEL_A_SRC0
-				| MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[one]));
-			break;
-		case R500_SWIZZLE_ZERO:
-			fp->inst[counter].inst3 |= MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ZERO);
-			fp->inst[counter].inst4 |= MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ZERO);
-			break;
-		case R500_SWIZZLE_ONE:
-			fp->inst[counter].inst3 |= MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE);
-			fp->inst[counter].inst4 |= MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE);
-			break;
-		default:
-			ERROR("Bad src index in emit_mad: %d\n", one);
-			break;
-	}
-	switch (two) {
-		case 0:
-		case 1:
-		case 2:
-			fp->inst[counter].inst1 |= R500_RGB_ADDR1(make_src(fp, fpi->SrcReg[two]));
-			fp->inst[counter].inst2 |= R500_ALPHA_ADDR1(make_src(fp, fpi->SrcReg[two]));
-			fp->inst[counter].inst3 |= R500_ALU_RGB_SEL_B_SRC1
-				| MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[two]));
-			fp->inst[counter].inst4 |= R500_ALPHA_SEL_B_SRC1
-				| MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[two]));
-			break;
-		case R500_SWIZZLE_ZERO:
-			fp->inst[counter].inst3 |= MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ZERO);
-			fp->inst[counter].inst4 |= MAKE_SWIZ_ALPHA_B(R500_SWIZZLE_ZERO);
-			break;
-		case R500_SWIZZLE_ONE:
-			fp->inst[counter].inst3 |= MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ONE);
-			fp->inst[counter].inst4 |= MAKE_SWIZ_ALPHA_B(R500_SWIZZLE_ONE);
-			break;
-		default:
-			ERROR("Bad src index in emit_mad: %d\n", two);
-			break;
-	}
-	switch (three) {
-		case 0:
-		case 1:
-		case 2:
-			fp->inst[counter].inst1 |= R500_RGB_ADDR2(make_src(fp, fpi->SrcReg[three]));
-			fp->inst[counter].inst2 |= R500_ALPHA_ADDR2(make_src(fp, fpi->SrcReg[three]));
-			fp->inst[counter].inst5 |= R500_ALU_RGBA_SEL_C_SRC2
-				| MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[three]))
-				| R500_ALU_RGBA_ALPHA_SEL_C_SRC2
-				| MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[three]));
-			break;
-		case R500_SWIZZLE_ZERO:
-			fp->inst[counter].inst5 |= MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
-				| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
-			break;
-		case R500_SWIZZLE_ONE:
-			fp->inst[counter].inst5 |= MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ONE)
-			| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ONE);
-			break;
-		default:
-			ERROR("Bad src index in emit_mad: %d\n", three);
-			break;
-	}
+static GLuint build_func(GLuint comparefunc)
+{
+	return comparefunc - GL_NEVER;
 }
 
-static void emit_sop(struct r500_fragment_program *fp, int counter, struct prog_instruction *fpi, int opcode, GLuint src, GLuint swiz, GLuint dest) {
-	emit_alu(fp, counter, fpi);
-	fp->inst[counter].inst1 = R500_RGB_ADDR0(src);
-	fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src);
-	fp->inst[counter].inst4 |= R500_ALPHA_ADDRD(dest)
-		| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(swiz);
-	fp->inst[counter].inst5 = R500_ALU_RGBA_OP_SOP
-		| R500_ALU_RGBA_ADDRD(dest);
-	switch (opcode) {
-		case OPCODE_COS:
-			fp->inst[counter].inst4 |= R500_ALPHA_OP_COS;
-			break;
-		case OPCODE_EX2:
-			fp->inst[counter].inst4 |= R500_ALPHA_OP_EX2;
-			break;
-		case OPCODE_LG2:
-			fp->inst[counter].inst4 |= R500_ALPHA_OP_LN2;
-			break;
-		case OPCODE_RCP:
-			fp->inst[counter].inst4 |= R500_ALPHA_OP_RCP;
-			break;
-		case OPCODE_RSQ:
-			fp->inst[counter].inst4 |= R500_ALPHA_OP_RSQ;
-			break;
-		case OPCODE_SIN:
-			fp->inst[counter].inst4 |= R500_ALPHA_OP_SIN;
-			break;
-		default:
-			ERROR("Bad opcode in emit_sop: %d\n", opcode);
-			break;
-	}
-}
 
-static GLboolean parse_program(struct r500_fragment_program *fp)
+/**
+ * Collect all external state that is relevant for compiling the given
+ * fragment program.
+ */
+static void build_state(
+	r300ContextPtr r300,
+	struct r500_fragment_program *fp,
+	struct r500_fragment_program_external_state *state)
 {
-	struct gl_fragment_program *mp = &fp->mesa_program;
-	const struct prog_instruction *inst = mp->Base.Instructions;
-	struct prog_instruction *fpi;
-	GLuint src[3], dest = 0;
-	int temp_swiz, counter = 0;
-
-	if (!inst || inst[0].Opcode == OPCODE_END) {
-		ERROR("The program is empty!\n");
-		return GL_FALSE;
-	}
+	int unit;
 
-	for (fpi = mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
-
-		if (fpi->Opcode != OPCODE_KIL) {
-			dest = make_dest(fp, fpi->DstReg);
-		}
+	_mesa_bzero(state, sizeof(*state));
 
-		switch (fpi->Opcode) {
-			case OPCODE_ABS:
-				emit_mov(fp, counter, fpi, make_src(fp, fpi->SrcReg[0]), fpi->SrcReg[0].Swizzle, dest);
-				fp->inst[counter].inst3 |= R500_ALU_RGB_MOD_A_ABS
-					| R500_ALU_RGB_MOD_B_ABS;
-				fp->inst[counter].inst4 |= R500_ALPHA_MOD_A_ABS
-					| R500_ALPHA_MOD_B_ABS;
-				break;
-			case OPCODE_ADD:
-				/* Variation on MAD: 1*src0+src1 */
-				emit_mad(fp, counter, fpi, R500_SWIZZLE_ONE, 0, 1);
-				break;
-		        case OPCODE_CMP:
-				/* This inst's selects need to be swapped as follows:
-				 * 0 -> C ; 1 -> B ; 2 -> A */
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				src[1] = make_src(fp, fpi->SrcReg[1]);
-				src[2] = make_src(fp, fpi->SrcReg[2]);
-				emit_alu(fp, counter, fpi);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[2])
-					| R500_RGB_ADDR1(src[1]) | R500_RGB_ADDR2(src[0]);
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[2])
-					| R500_ALPHA_ADDR1(src[1]) | R500_ALPHA_ADDR2(src[0]);
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[2]))
-					| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
-				fp->inst[counter].inst4 |= R500_ALPHA_OP_CMP
-					| R500_ALPHA_ADDRD(dest)
-					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[2]))
-					| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
-					| R500_ALU_RGBA_ADDRD(dest)
-					| R500_ALU_RGBA_SEL_C_SRC2
-					| MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[0]))
-					| R500_ALU_RGBA_ALPHA_SEL_C_SRC2
-					| MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[0]));
-				break;
-			case OPCODE_COS:
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				src[1] = emit_const4fv(fp, RCP_2PI);
-				fp->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
-					| (R500_WRITEMASK_ARGB << 11);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
-					| R500_RGB_ADDR1(src[1]);
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
-					| R500_ALPHA_ADDR1(src[1]);
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB)
-					| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB);
-				fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
-					| R500_ALPHA_ADDRD(get_temp(fp, 0))
-					| R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A
-					| R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
-					| R500_ALU_RGBA_ADDRD(get_temp(fp, 0))
-					| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
-					| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
-				counter++;
-				fp->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(fp, 0));
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(fp, 0));
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB);
-				fp->inst[counter].inst4 = R500_ALPHA_OP_FRC
-					| R500_ALPHA_ADDRD(get_temp(fp, 1))
-					| R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A;
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC
-					| R500_ALU_RGBA_ADDRD(get_temp(fp, 1));
-				counter++;
-				emit_sop(fp, counter, fpi, OPCODE_COS, get_temp(fp, 1), make_sop_swizzle(fpi->SrcReg[0]), dest);
-				break;
-			case OPCODE_DP3:
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				src[1] = make_src(fp, fpi->SrcReg[1]);
-				emit_alu(fp, counter, fpi);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
-					| R500_RGB_ADDR1(src[1]);
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
-					| R500_ALPHA_ADDR1(src[1]);
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
-					| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
-				fp->inst[counter].inst4 |= R500_ALPHA_OP_DP
-					| R500_ALPHA_ADDRD(dest)
-					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
-					| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_DP3
-					| R500_ALU_RGBA_ADDRD(dest);
-				break;
-			case OPCODE_DP4:
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				src[1] = make_src(fp, fpi->SrcReg[1]);
-				/* Based on DP3 */
-				emit_alu(fp, counter, fpi);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
-					| R500_RGB_ADDR1(src[1]);
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
-					| R500_ALPHA_ADDR1(src[1]);
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
-					| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
-				fp->inst[counter].inst4 |= R500_ALPHA_OP_DP
-					| R500_ALPHA_ADDRD(dest)
-					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
-					| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_DP4
-					| R500_ALU_RGBA_ADDRD(dest);
-				break;
-			case OPCODE_DPH:
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				src[1] = make_src(fp, fpi->SrcReg[1]);
-				/* Based on DP3 */
-				emit_alu(fp, counter, fpi);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
-					| R500_RGB_ADDR1(src[1]);
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
-					| R500_ALPHA_ADDR1(src[1]);
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
-					| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
-				fp->inst[counter].inst4 |= R500_ALPHA_OP_DP
-					| R500_ALPHA_ADDRD(dest)
-					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
-					| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_DP4
-					| R500_ALU_RGBA_ADDRD(dest);
-				break;
-			case OPCODE_DST:
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				src[1] = make_src(fp, fpi->SrcReg[1]);
-				/* [1, src0.y*src1.y, src0.z, src1.w]
-				 * So basically MUL with lotsa swizzling. */
-				emit_alu(fp, counter, fpi);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
-					| R500_RGB_ADDR1(src[1]);
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
-					| R500_ALPHA_ADDR1(src[1]);
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| R500_ALU_RGB_SEL_B_SRC1;
-				/* Select [1, y, z, 1] */
-				temp_swiz = (make_rgb_swizzle(fpi->SrcReg[0]) & ~0x7) | R500_SWIZZLE_ONE;
-				fp->inst[counter].inst3 |= MAKE_SWIZ_RGB_A(temp_swiz);
-				/* Select [1, y, 1, w] */
-				temp_swiz = (make_rgb_swizzle(fpi->SrcReg[0]) & ~0x1c7) | R500_SWIZZLE_ONE | (R500_SWIZZLE_ONE << 6);
-				fp->inst[counter].inst3 |= MAKE_SWIZ_RGB_B(temp_swiz);
-				fp->inst[counter].inst4 |= R500_ALPHA_OP_MAD
-					| R500_ALPHA_ADDRD(dest)
-					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
-					| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
-					| R500_ALU_RGBA_ADDRD(dest)
-					| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
-					| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
-				break;
-			case OPCODE_EX2:
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				emit_sop(fp, counter, fpi, OPCODE_EX2, src[0], make_sop_swizzle(fpi->SrcReg[0]), dest);
-				break;
-			case OPCODE_FLR:
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				fp->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0]);
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]);
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]));
-				fp->inst[counter].inst4 |= R500_ALPHA_OP_FRC
-					| R500_ALPHA_ADDRD(get_temp(fp, 0))
-					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]));
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC
-					| R500_ALU_RGBA_ADDRD(get_temp(fp, 0));
-				counter++;
-				emit_alu(fp, counter, fpi);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
-					| R500_RGB_ADDR1(get_temp(fp, 0));
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
-					| R500_ALPHA_ADDR1(get_temp(fp, 0));
-				fp->inst[counter].inst3 = MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE)
-					| R500_ALU_RGB_SEL_B_SRC0 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[0]));
-				fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
-					| R500_ALPHA_ADDRD(dest)
-					| R500_ALPHA_SWIZ_A_A
-					| R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[0]));
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
-					| R500_ALU_RGBA_ADDRD(dest)
-					| R500_ALU_RGBA_SEL_C_SRC1
-					| MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[0]))
-					| R500_ALU_RGBA_ALPHA_SEL_C_SRC1
-					| MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[0]))
-					| R500_ALU_RGBA_MOD_C_NEG;
-				break;
-			case OPCODE_FRC:
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				emit_alu(fp, counter, fpi);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0]);
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]);
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]));
-				fp->inst[counter].inst4 |= R500_ALPHA_OP_FRC
-					| R500_ALPHA_ADDRD(dest)
-					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]));
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC
-					| R500_ALU_RGBA_ADDRD(dest);
-				break;
-			case OPCODE_LG2:
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				emit_sop(fp, counter, fpi, OPCODE_LG2, src[0], make_sop_swizzle(fpi->SrcReg[0]), dest);
-				break;
-			case OPCODE_LIT:
-				/* To be honest, I have no idea how I came up with the following.
-				 * All I know is that it's based on the r3xx stuff, and was
-				 * concieved with the help of NyQuil. Mmm, MyQuil. */
-
-				/* First instruction */
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				src[1] = emit_const4fv(fp, LIT);
-				fp->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
-					| (R500_WRITEMASK_ARG << 11);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) | R500_RGB_ADDR1(src[1]);
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) | R500_ALPHA_ADDR1(src[1]);
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
-					| MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ZERO);
-				fp->inst[counter].inst4 = R500_ALPHA_OP_MAX
-					| R500_ALPHA_ADDRD(get_temp(fp, 0))
-					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
-					| R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAX
-					| R500_ALU_RGBA_ADDRD(get_temp(fp, 0));
-				counter++;
-				/* Second instruction */
-				fp->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_AB << 11);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(fp, 0)) | R500_RGB_ADDR1(src[1]);
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(fp, 0));
-				/* Select [w, w, w, y] */
-				temp_swiz = 3 | (3 << 3) | (3 << 6);
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(temp_swiz)
-					| R500_ALU_RGB_SEL_B_SRC1
-					| MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB);
-				fp->inst[counter].inst4 = R500_ALPHA_OP_LN2
-					| R500_ALPHA_ADDRD(get_temp(fp, 0))
-					| R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_G;
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MIN
-					| R500_ALU_RGBA_ADDRD(get_temp(fp, 0));
-				counter++;
-				/* Third instruction */
-				fp->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_AG << 11);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(fp, 0));
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(fp, 0));
-				/* Select [x, x, x, z] */
-				temp_swiz = 0;
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(temp_swiz)
-					| R500_ALU_RGB_SEL_B_SRC0
-					| MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ONE);
-				fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
-					| R500_ALPHA_ADDRD(get_temp(fp, 1))
-					| R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A
-					| R500_ALPHA_SEL_B_SRC0 | R500_ALPHA_SWIZ_B_B;
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
-					| R500_ALU_RGBA_ADDRD(get_temp(fp, 1))
-					| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
-					| R500_ALU_RGBA_A_SWIZ_0;
-				counter++;
-				/* Fourth instruction */
-				fp->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_AR << 11);
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(fp, 0));
-				fp->inst[counter].inst3 = MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE)
-					| MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ONE);
-				fp->inst[counter].inst4 = R500_ALPHA_OP_EX2
-					| R500_ALPHA_ADDRD(get_temp(fp, 0))
-					| R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A;
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
-					| R500_ALU_RGBA_ADDRD(get_temp(fp, 0))
-					| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
-					| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
-				counter++;
-				/* Fifth instruction */
-				fp->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_AB << 11);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(fp, 0));
-				/* Select [w, w, w] */
-				temp_swiz = 3 | (3 << 3) | (3 << 6);
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ZERO)
-					| R500_ALU_RGB_SEL_B_SRC0
-					| MAKE_SWIZ_RGB_B(temp_swiz);
-				fp->inst[counter].inst4 |= R500_ALPHA_OP_MAD
-					| R500_ALPHA_ADDRD(get_temp(fp, 0))
-					| R500_ALPHA_SWIZ_A_1
-					| R500_ALPHA_SWIZ_B_1;
-				/* Select [-y, -y, -y] */
-				temp_swiz = 1 | (1 << 3) | (1 << 6);
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
-					| R500_ALU_RGBA_ADDRD(get_temp(fp, 0))
-					| MAKE_SWIZ_RGBA_C(temp_swiz)
-					| R500_ALU_RGBA_MOD_C_NEG
-					| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
-				counter++;
-				/* Final instruction */
-				emit_mov(fp, counter, fpi, get_temp(fp, 0), SWIZZLE_NOOP, dest);
-				break;
-			case OPCODE_LRP:
-				/* src0 * src1 + INV(src0) * src2
-				 * 1) MUL src0, src1, temp
-				 * 2) PRE 1-src0; MAD srcp, src2, temp */
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				src[1] = make_src(fp, fpi->SrcReg[1]);
-				src[2] = make_src(fp, fpi->SrcReg[2]);
-				fp->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
-					| R500_INST_NOP | (R500_WRITEMASK_ARGB << 11);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
-					| R500_RGB_ADDR1(src[1]);
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
-					| R500_ALPHA_ADDR1(src[1]);
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
-					| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
-				fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
-					| R500_ALPHA_ADDRD(get_temp(fp, 0))
-					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
-					| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
-					| R500_ALU_RGBA_ADDRD(get_temp(fp, 0))
-					| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
-					| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
-				counter++;
-				emit_alu(fp, counter, fpi);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
-					| R500_RGB_ADDR1(src[2])
-					| R500_RGB_ADDR2(get_temp(fp, 0))
-					| R500_RGB_SRCP_OP_1_MINUS_RGB0;
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
-					| R500_ALPHA_ADDR1(src[2])
-					| R500_ALPHA_ADDR2(get_temp(fp, 0))
-					| R500_ALPHA_SRCP_OP_1_MINUS_A0;
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRCP
-					| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
-					| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB);
-				fp->inst[counter].inst4 |= R500_ALPHA_OP_MAD
-					| R500_ALPHA_ADDRD(dest)
-					| R500_ALPHA_SEL_A_SRCP | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
-					| R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
-					| R500_ALU_RGBA_ADDRD(dest)
-					| R500_ALU_RGBA_SEL_C_SRC2 | MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[2]))
-					| R500_ALU_RGBA_ALPHA_SEL_C_SRC2
-					| MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[2]));
-				break;
-			case OPCODE_MAD:
-				emit_mad(fp, counter, fpi, 0, 1, 2);
-				break;
-			case OPCODE_MAX:
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				src[1] = make_src(fp, fpi->SrcReg[1]);
-				emit_alu(fp, counter, fpi);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) | R500_RGB_ADDR1(src[1]);
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) | R500_ALPHA_ADDR1(src[1]);
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
-					| R500_ALU_RGB_SEL_B_SRC1
-					| MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
-				fp->inst[counter].inst4 |= R500_ALPHA_OP_MAX
-					| R500_ALPHA_ADDRD(dest)
-					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
-					| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAX
-					| R500_ALU_RGBA_ADDRD(dest);
-				break;
-			case OPCODE_MIN:
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				src[1] = make_src(fp, fpi->SrcReg[1]);
-				emit_alu(fp, counter, fpi);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) | R500_RGB_ADDR1(src[1]);
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) | R500_ALPHA_ADDR1(src[1]);
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
-					| R500_ALU_RGB_SEL_B_SRC1
-					| MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
-				fp->inst[counter].inst4 |= R500_ALPHA_OP_MIN
-					| R500_ALPHA_ADDRD(dest)
-					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
-					| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MIN
-					| R500_ALU_RGBA_ADDRD(dest);
-				break;
-			case OPCODE_MOV:
-				emit_mov(fp, counter, fpi, make_src(fp, fpi->SrcReg[0]), fpi->SrcReg[0].Swizzle, dest);
-				break;
-			case OPCODE_MUL:
-				/* Variation on MAD: src0*src1+0 */
-				emit_mad(fp, counter, fpi, 0, 1, R500_SWIZZLE_ZERO);
-				break;
-			case OPCODE_POW:
-				/* POW(a,b) = EX2(LN2(a)*b) */
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				src[1] = make_src(fp, fpi->SrcReg[1]);
-				emit_sop(fp, counter, fpi, OPCODE_LG2, src[0], make_sop_swizzle(fpi->SrcReg[0]), get_temp(fp, 0));
-				fp->inst[counter].inst0 |= (R500_WRITEMASK_ARGB << 11);
-				counter++;
-				fp->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(fp, 0))
-					| R500_RGB_ADDR1(src[1]);
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(fp, 0))
-					| R500_ALPHA_ADDR1(src[1]);
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
-					| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
-				fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
-					| R500_ALPHA_ADDRD(get_temp(fp, 1))
-					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
-					| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
-					| R500_ALU_RGBA_ADDRD(get_temp(fp, 1))
-					| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
-					| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
-				counter++;
-				emit_sop(fp, counter, fpi, OPCODE_EX2, get_temp(fp, 1), make_sop_swizzle(fpi->SrcReg[0]), dest);
-				break;
-			case OPCODE_RCP:
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				emit_sop(fp, counter, fpi, OPCODE_RCP, src[0], make_sop_swizzle(fpi->SrcReg[0]), dest);
-				break;
-			case OPCODE_RSQ:
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				emit_sop(fp, counter, fpi, OPCODE_RSQ, src[0], make_sop_swizzle(fpi->SrcReg[0]), dest);
-				break;
-			case OPCODE_SCS:
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				src[1] = emit_const4fv(fp, RCP_2PI);
-				fp->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
-					| (R500_WRITEMASK_ARGB << 11);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
-					| R500_RGB_ADDR1(src[1]);
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
-					| R500_ALPHA_ADDR1(src[1]);
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB)
-					| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB);
-				fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
-					| R500_ALPHA_ADDRD(get_temp(fp, 0))
-					| R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A
-					| R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
-					| R500_ALU_RGBA_ADDRD(get_temp(fp, 0))
-					| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
-					| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
-				counter++;
-				fp->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(fp, 0));
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(fp, 0));
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB);
-				fp->inst[counter].inst4 = R500_ALPHA_OP_FRC
-					| R500_ALPHA_ADDRD(get_temp(fp, 1))
-					| R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A;
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC
-					| R500_ALU_RGBA_ADDRD(get_temp(fp, 1));
-				counter++;
-				/* Do a cosine, then a sine, masking out the channels we want to protect. */
-				/* Cosine only goes in R (x) channel. */
-				fpi->DstReg.WriteMask = 0x1;
-				emit_sop(fp, counter, fpi, OPCODE_COS, get_temp(fp, 1), make_sop_swizzle(fpi->SrcReg[0]), dest);
-				counter++;
-				/* Sine only goes in G (y) channel. */
-				fpi->DstReg.WriteMask = 0x2;
-				emit_sop(fp, counter, fpi, OPCODE_SIN, get_temp(fp, 1), make_sop_swizzle(fpi->SrcReg[0]), dest);
-				break;
-			case OPCODE_SGE:
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				src[1] = make_src(fp, fpi->SrcReg[1]);
-				fp->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
-					| (R500_WRITEMASK_ARGB << 11);
-				fp->inst[counter].inst1 = R500_RGB_ADDR1(src[0])
-					| R500_RGB_ADDR2(src[1]);
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR1(src[0])
-					| R500_ALPHA_ADDR2(src[1]);
-				fp->inst[counter].inst3 = /* 1 */
-					MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE)
-					| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[0]));
-				fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
-					| R500_ALPHA_ADDRD(get_temp(fp, 0))
-					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
-					| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[0]));
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
-					| R500_ALU_RGBA_ADDRD(get_temp(fp, 0))
-					| R500_ALU_RGBA_SEL_C_SRC2
-					| MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[1]))
-					| R500_ALU_RGBA_MOD_C_NEG
-					| R500_ALU_RGBA_ALPHA_SEL_C_SRC2
-					| MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[1]))
-					| R500_ALU_RGBA_ALPHA_MOD_C_NEG;
-				counter++;
-				/* This inst's selects need to be swapped as follows:
-				 * 0 -> C ; 1 -> B ; 2 -> A */
-				emit_alu(fp, counter, fpi);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(fp, 0));
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(fp, 0));
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE)
-					| R500_ALU_RGB_SEL_B_SRC0
-					| MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ZERO);
-				fp->inst[counter].inst4 |= R500_ALPHA_OP_CMP
-					| R500_ALPHA_ADDRD(dest)
-					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
-					| R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(R500_SWIZZLE_ZERO);
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
-					| R500_ALU_RGBA_ADDRD(dest)
-					| R500_ALU_RGBA_SEL_C_SRC0
-					| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_RGB)
-					| R500_ALU_RGBA_ALPHA_SEL_C_SRC0
-					| R500_ALU_RGBA_A_SWIZ_A;
-				break;
-			case OPCODE_SIN:
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				src[1] = emit_const4fv(fp, RCP_2PI);
-				fp->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
-					| (R500_WRITEMASK_ARGB << 11);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
-					| R500_RGB_ADDR1(src[1]);
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
-					| R500_ALPHA_ADDR1(src[1]);
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB)
-					| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB);
-				fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
-					| R500_ALPHA_ADDRD(get_temp(fp, 0))
-					| R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A
-					| R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
-					| R500_ALU_RGBA_ADDRD(get_temp(fp, 0))
-					| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
-					| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
-				counter++;
-				fp->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(fp, 0));
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(fp, 0));
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB);
-				fp->inst[counter].inst4 = R500_ALPHA_OP_FRC
-					| R500_ALPHA_ADDRD(get_temp(fp, 1))
-					| R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A;
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC
-					| R500_ALU_RGBA_ADDRD(get_temp(fp, 1));
-				counter++;
-				emit_sop(fp, counter, fpi, OPCODE_SIN, get_temp(fp, 1), make_sop_swizzle(fpi->SrcReg[0]), dest);
-				break;
-			case OPCODE_SLT:
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				src[1] = make_src(fp, fpi->SrcReg[1]);
-				fp->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
-					| (R500_WRITEMASK_ARGB << 11);
-				fp->inst[counter].inst1 = R500_RGB_ADDR1(src[0])
-					| R500_RGB_ADDR2(src[1]);
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR1(src[0])
-					| R500_ALPHA_ADDR2(src[1]);
-				fp->inst[counter].inst3 = /* 1 */
-					MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE)
-					| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[0]));
-				fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
-					| R500_ALPHA_ADDRD(get_temp(fp, 0))
-					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
-					| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[0]));
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
-					| R500_ALU_RGBA_ADDRD(get_temp(fp, 0))
-					| R500_ALU_RGBA_SEL_C_SRC2
-					| MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[1]))
-					| R500_ALU_RGBA_MOD_C_NEG
-					| R500_ALU_RGBA_ALPHA_SEL_C_SRC2
-					| MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[1]))
-					| R500_ALU_RGBA_ALPHA_MOD_C_NEG;
-				counter++;
-				/* This inst's selects need to be swapped as follows:
-				 * 0 -> C ; 1 -> B ; 2 -> A */
-				emit_alu(fp, counter, fpi);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(fp, 0));
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(fp, 0));
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ZERO)
-					| R500_ALU_RGB_SEL_B_SRC0
-					| MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ONE);
-				fp->inst[counter].inst4 |= R500_ALPHA_OP_CMP
-					| R500_ALPHA_ADDRD(dest)
-					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ZERO)
-					| R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(R500_SWIZZLE_ONE);
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
-					| R500_ALU_RGBA_ADDRD(dest)
-					| R500_ALU_RGBA_SEL_C_SRC0
-					| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_RGB)
-					| R500_ALU_RGBA_ALPHA_SEL_C_SRC0
-					| R500_ALU_RGBA_A_SWIZ_A;
-				break;
-			case OPCODE_SUB:
-				/* Variation on MAD: 1*src0-src1 */
-				fpi->SrcReg[1].NegateBase = 0xF; /* NEG_XYZW */
-				emit_mad(fp, counter, fpi, R500_SWIZZLE_ONE, 0, 1);
-				break;
-			case OPCODE_SWZ:
-				/* TODO: The rarer negation masks! */
-				emit_mov(fp, counter, fpi, make_src(fp, fpi->SrcReg[0]), fpi->SrcReg[0].Swizzle, dest);
-				break;
-			case OPCODE_XPD:
-				/* src0 * src1 - src1 * src0
-				 * 1) MUL temp.xyz, src0.yzx, src1.zxy
-				 * 2) MAD src0.zxy, src1.yzx, -temp.xyz */
-				src[0] = make_src(fp, fpi->SrcReg[0]);
-				src[1] = make_src(fp, fpi->SrcReg[1]);
-				fp->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
-					| (R500_WRITEMASK_RGB << 11);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
-					| R500_RGB_ADDR1(src[1]);
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
-					| R500_ALPHA_ADDR1(src[1]);
-				/* Select [y, z, x] */
-				temp_swiz = make_rgb_swizzle(fpi->SrcReg[0]);
-				temp_swiz = (GET_SWZ(temp_swiz, 1) << 0) | (GET_SWZ(temp_swiz, 2) << 3) | (GET_SWZ(temp_swiz, 0) << 6);
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(temp_swiz);
-				/* Select [z, x, y] */
-				temp_swiz = make_rgb_swizzle(fpi->SrcReg[1]);
-				temp_swiz = (GET_SWZ(temp_swiz, 2) << 0) | (GET_SWZ(temp_swiz, 0) << 3) | (GET_SWZ(temp_swiz, 1) << 6);
-				fp->inst[counter].inst3 |= R500_ALU_RGB_SEL_B_SRC1
-					| MAKE_SWIZ_RGB_B(temp_swiz);
-				fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
-					| R500_ALPHA_ADDRD(get_temp(fp, 0))
-					| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
-					| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
-					| R500_ALU_RGBA_ADDRD(get_temp(fp, 0))
-					| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
-					| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
-				counter++;
-				emit_alu(fp, counter, fpi);
-				fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
-					| R500_RGB_ADDR1(src[1])
-					| R500_RGB_ADDR2(get_temp(fp, 0));
-				fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
-					| R500_ALPHA_ADDR1(src[1])
-					| R500_ALPHA_ADDR2(get_temp(fp, 0));
-				/* Select [z, x, y] */
-				temp_swiz = make_rgb_swizzle(fpi->SrcReg[0]);
-				temp_swiz = (GET_SWZ(temp_swiz, 2) << 0) | (GET_SWZ(temp_swiz, 0) << 3) | (GET_SWZ(temp_swiz, 1) << 6);
-				fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
-					| MAKE_SWIZ_RGB_A(temp_swiz);
-				/* Select [y, z, x] */
-				temp_swiz = make_rgb_swizzle(fpi->SrcReg[1]);
-				temp_swiz = (GET_SWZ(temp_swiz, 1) << 0) | (GET_SWZ(temp_swiz, 2) << 3) | (GET_SWZ(temp_swiz, 0) << 6);
-				fp->inst[counter].inst3 |= R500_ALU_RGB_SEL_B_SRC1
-					| MAKE_SWIZ_RGB_B(temp_swiz);
-				fp->inst[counter].inst4 |= R500_ALPHA_OP_MAD
-					| R500_ALPHA_ADDRD(dest)
-					| R500_ALPHA_SWIZ_A_1
-					| R500_ALPHA_SWIZ_B_1;
-				fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
-					| R500_ALU_RGBA_ADDRD(dest)
-					| R500_ALU_RGBA_SEL_C_SRC2
-					| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_RGB)
-					| R500_ALU_RGBA_MOD_C_NEG
-					| R500_ALU_RGBA_A_SWIZ_0;
-				break;
-			case OPCODE_KIL:
-			case OPCODE_TEX:
-			case OPCODE_TXB:
-			case OPCODE_TXP:
-				emit_tex(fp, fpi, dest, counter);
-					if (fpi->DstReg.File == PROGRAM_OUTPUT)
-						counter++;
-				break;
-			default:
-			        ERROR("unknown fpi->Opcode %s\n", _mesa_opcode_string(fpi->Opcode));
-				break;
-		}
+	for(unit = 0; unit < 16; ++unit) {
+		if (fp->mesa_program.Base.ShadowSamplers & (1 << unit)) {
+			struct gl_texture_object* tex = r300->radeon.glCtx->Texture.Unit[unit]._Current;
 
-		/* Finishing touches */
-		if (fpi->SaturateMode == SATURATE_ZERO_ONE) {
-			fp->inst[counter].inst0 |= R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP;
+			state->unit[unit].depth_texture_mode = build_dtm(tex->DepthMode);
+			state->unit[unit].texture_compare_func = build_func(tex->CompareFunc);
 		}
-
-		counter++;
-
-		if (fp->error)
-			return GL_FALSE;
-
 	}
-
-	/* Finish him! (If it's an ALU/OUT instruction...) */
-	if ((fp->inst[counter-1].inst0 & 0x3) == 1) {
-		fp->inst[counter-1].inst0 |= R500_INST_LAST;
-	} else {
-		/* We still need to put an output inst, right? */
-		WARN_ONCE("Final FP instruction is not an OUT.\n");
-	}
-
-	fp->cs->nrslots = counter;
-
-	fp->max_temp_idx++;
-
-	return GL_TRUE;
 }
 
-static void init_program(r300ContextPtr r300, struct r500_fragment_program *fp)
-{
-	struct r300_pfs_compile_state *cs = NULL;
-	struct gl_fragment_program *mp = &fp->mesa_program;
-	struct prog_instruction *fpi;
-	GLuint InputsRead = mp->Base.InputsRead;
-	GLuint temps_used = 0;
-	int i, j;
-
-	/* New compile, reset tracking data */
-	fp->optimization =
-	    driQueryOptioni(&r300->radeon.optionCache, "fp_optimization");
-	fp->translated = GL_FALSE;
-	fp->error = GL_FALSE;
-	fp->cs = cs = &(R300_CONTEXT(fp->ctx)->state.pfs_compile);
-	fp->const_nr = 0;
-	/* Size of pixel stack, plus 1. */
-	fp->max_temp_idx = 1;
-	/* Temp register offset. */
-	fp->temp_reg_offset = 0;
-	/* Whether or not we perform any depth writing. */
-	fp->writes_depth = GL_FALSE;
-
-	_mesa_memset(cs, 0, sizeof(*fp->cs));
-	for (i = 0; i < PFS_MAX_ALU_INST; i++) {
-		for (j = 0; j < 3; j++) {
-			cs->slot[i].vsrc[j] = SRC_CONST;
-			cs->slot[i].ssrc[j] = SRC_CONST;
-		}
-	}
-
-	/* Work out what temps the Mesa inputs correspond to, this must match
-	 * what setup_rs_unit does, which shouldn't be a problem as rs_unit
-	 * configures itself based on the fragprog's InputsRead
-	 *
-	 * NOTE: this depends on get_hw_temp() allocating registers in order,
-	 * starting from register 0, so we're just going to do that instead.
-	 */
-
-	/* Texcoords come first */
-	for (i = 0; i < fp->ctx->Const.MaxTextureUnits; i++) {
-		if (InputsRead & (FRAG_BIT_TEX0 << i)) {
-			cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0;
-			cs->inputs[FRAG_ATTRIB_TEX0 + i].reg =
-				fp->temp_reg_offset;
-			fp->temp_reg_offset++;
-		}
-	}
-	InputsRead &= ~FRAG_BITS_TEX_ANY;
-
-	/* fragment position treated as a texcoord */
-	if (InputsRead & FRAG_BIT_WPOS) {
-		cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
-		cs->inputs[FRAG_ATTRIB_WPOS].reg =
-			fp->temp_reg_offset;
-		fp->temp_reg_offset++;
-	}
-	InputsRead &= ~FRAG_BIT_WPOS;
-
-	/* Then primary colour */
-	if (InputsRead & FRAG_BIT_COL0) {
-		cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
-		cs->inputs[FRAG_ATTRIB_COL0].reg =
-			fp->temp_reg_offset;
-		fp->temp_reg_offset++;
-	}
-	InputsRead &= ~FRAG_BIT_COL0;
-
-	/* Secondary color */
-	if (InputsRead & FRAG_BIT_COL1) {
-		cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
-		cs->inputs[FRAG_ATTRIB_COL1].reg =
-			fp->temp_reg_offset;
-		fp->temp_reg_offset++;
-	}
-	InputsRead &= ~FRAG_BIT_COL1;
-
-	/* Anything else */
-	if (InputsRead) {
-		WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead);
-		/* force read from hwreg 0 for now */
-		for (i = 0; i < 32; i++)
-			if (InputsRead & (1 << i))
-				cs->inputs[i].reg = 0;
-	}
+static void dump_program(struct r500_fragment_program_code *code);
 
-	if (!mp->Base.Instructions) {
-		ERROR("No instructions found in program, going to go die now.\n");
-		return;
-	}
+void r500TranslateFragmentShader(r300ContextPtr r300,
+				 struct r500_fragment_program *fp)
+{
+	struct r500_fragment_program_external_state state;
 
-	for (fpi = mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
-		for (i = 0; i < 3; i++) {
-			if (fpi->SrcReg[i].File == PROGRAM_TEMPORARY) {
-				if (fpi->SrcReg[i].Index >= temps_used)
-					temps_used = fpi->SrcReg[i].Index + 1;
-			}
-		}
+	build_state(r300, fp, &state);
+	if (_mesa_memcmp(&fp->state, &state, sizeof(state))) {
+		/* TODO: cache compiled programs */
+		fp->translated = GL_FALSE;
+		_mesa_memcpy(&fp->state, &state, sizeof(state));
 	}
 
-	cs->temp_in_use = temps_used + 1;
-
-	fp->max_temp_idx = fp->temp_reg_offset + cs->temp_in_use;
-
-	if (RADEON_DEBUG & DEBUG_PIXEL)
-		fprintf(stderr, "FP temp indices: fp->max_temp_idx: %d cs->temp_in_use: %d\n", fp->max_temp_idx, cs->temp_in_use);
-}
+	if (!fp->translated) {
+		struct r500_fragment_program_compiler compiler;
 
-static void update_params(struct r500_fragment_program *fp)
-{
-	struct gl_fragment_program *mp = &fp->mesa_program;
+		compiler.r300 = r300;
+		compiler.fp = fp;
+		compiler.code = &fp->code;
 
-	/* Ask Mesa nicely to fill in ParameterValues for us */
-	if (mp->Base.Parameters)
-		_mesa_load_state_parameters(fp->ctx, mp->Base.Parameters);
-}
+		radeonCompilerInit(&compiler.compiler, r300->radeon.glCtx, &fp->mesa_program.Base);
 
-static void dumb_shader(struct r500_fragment_program *fp)
-{
-	fp->inst[0].inst0 = R500_INST_TYPE_TEX
-		| R500_INST_TEX_SEM_WAIT
-		| R500_INST_RGB_WMASK_R
-		| R500_INST_RGB_WMASK_G
-		| R500_INST_RGB_WMASK_B
-		| R500_INST_ALPHA_WMASK
-		| R500_INST_RGB_CLAMP
-		| R500_INST_ALPHA_CLAMP;
-	fp->inst[0].inst1 = R500_TEX_ID(0)
-		| R500_TEX_INST_LD
-		| R500_TEX_SEM_ACQUIRE
-		| R500_TEX_IGNORE_UNCOVERED;
-	fp->inst[0].inst2 = R500_TEX_SRC_ADDR(0)
-		| R500_TEX_SRC_S_SWIZ_R
-		| R500_TEX_SRC_T_SWIZ_G
-		| R500_TEX_DST_ADDR(0)
-		| R500_TEX_DST_R_SWIZ_R
-		| R500_TEX_DST_G_SWIZ_G
-		| R500_TEX_DST_B_SWIZ_B
-		| R500_TEX_DST_A_SWIZ_A;
-	fp->inst[0].inst3 = R500_DX_ADDR(0)
-		| R500_DX_S_SWIZ_R
-		| R500_DX_T_SWIZ_R
-		| R500_DX_R_SWIZ_R
-		| R500_DX_Q_SWIZ_R
-		| R500_DY_ADDR(0)
-		| R500_DY_S_SWIZ_R
-		| R500_DY_T_SWIZ_R
-		| R500_DY_R_SWIZ_R
-		| R500_DY_Q_SWIZ_R;
-	fp->inst[0].inst4 = 0x0;
-	fp->inst[0].inst5 = 0x0;
-
-	fp->inst[1].inst0 = R500_INST_TYPE_OUT |
-		R500_INST_TEX_SEM_WAIT |
-		R500_INST_LAST |
-		R500_INST_RGB_OMASK_R |
-		R500_INST_RGB_OMASK_G |
-		R500_INST_RGB_OMASK_B |
-		R500_INST_ALPHA_OMASK;
-	fp->inst[1].inst1 = R500_RGB_ADDR0(0) |
-		R500_RGB_ADDR1(0) |
-		R500_RGB_ADDR1_CONST |
-		R500_RGB_ADDR2(0) |
-		R500_RGB_ADDR2_CONST |
-		R500_RGB_SRCP_OP_1_MINUS_2RGB0;
-	fp->inst[1].inst2 = R500_ALPHA_ADDR0(0) |
-		R500_ALPHA_ADDR1(0) |
-		R500_ALPHA_ADDR1_CONST |
-		R500_ALPHA_ADDR2(0) |
-		R500_ALPHA_ADDR2_CONST |
-		R500_ALPHA_SRCP_OP_1_MINUS_2A0;
-	fp->inst[1].inst3 = R500_ALU_RGB_SEL_A_SRC0 |
-		R500_ALU_RGB_R_SWIZ_A_R |
-		R500_ALU_RGB_G_SWIZ_A_G |
-		R500_ALU_RGB_B_SWIZ_A_B |
-		R500_ALU_RGB_SEL_B_SRC0 |
-		R500_ALU_RGB_R_SWIZ_B_1 |
-		R500_ALU_RGB_B_SWIZ_B_1 |
-		R500_ALU_RGB_G_SWIZ_B_1;
-	fp->inst[1].inst4 = R500_ALPHA_OP_MAD |
-		R500_ALPHA_SWIZ_A_A |
-		R500_ALPHA_SWIZ_B_1;
-	fp->inst[1].inst5 = R500_ALU_RGBA_OP_MAD |
-		R500_ALU_RGBA_R_SWIZ_0 |
-		R500_ALU_RGBA_G_SWIZ_0 |
-		R500_ALU_RGBA_B_SWIZ_0 |
-		R500_ALU_RGBA_A_SWIZ_0;
-
-	fp->cs->nrslots = 2;
-	fp->translated = GL_TRUE;
-}
+		insert_WPOS_trailer(&compiler);
 
-void r500TranslateFragmentShader(r300ContextPtr r300,
-				 struct r500_fragment_program *fp)
-{
+		struct radeon_program_transformation transformations[1] = {
+			{ &transform_TEX, &compiler }
+		};
+		radeonClauseLocalTransform(&compiler.compiler,
+			&compiler.compiler.Clauses[0],
+			1, transformations);
 
-	struct r300_pfs_compile_state *cs = NULL;
+		if (RADEON_DEBUG & DEBUG_PIXEL) {
+			_mesa_printf("Compiler state after transformations:\n");
+			radeonCompilerDump(&compiler.compiler);
+		}
 
-	if (!fp->translated) {
+		fp->translated = r500FragmentProgramEmit(&compiler);
 
-		init_program(r300, fp);
-		cs = fp->cs;
+		radeonCompilerCleanup(&compiler.compiler);
 
-		if (parse_program(fp) == GL_FALSE) {
-			ERROR("Huh. Couldn't parse program. There should be additional errors explaining why.\nUsing dumb shader...\n");
-			dumb_shader(fp);
-			fp->inst_offset = 0;
-			fp->inst_end = cs->nrslots - 1;
-			return;
-		}
-		fp->inst_offset = 0;
-		fp->inst_end = cs->nrslots - 1;
+		r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM);
 
-		fp->translated = GL_TRUE;
 		if (RADEON_DEBUG & DEBUG_PIXEL) {
 			fprintf(stderr, "Mesa program:\n");
 			fprintf(stderr, "-------------\n");
 			_mesa_print_program(&fp->mesa_program.Base);
 			fflush(stdout);
-			dump_program(fp);
+			if (fp->translated)
+				dump_program(&fp->code);
 		}
 
-
-		r300UpdateStateParameters(fp->ctx, _NEW_PROGRAM);
 	}
 
-	update_params(fp);
+	update_params(r300, fp);
 
 }
 
@@ -1561,7 +445,7 @@ static char *to_texop(int val)
   return NULL;
 }
 
-static void dump_program(struct r500_fragment_program *fp)
+static void dump_program(struct r500_fragment_program_code *code)
 {
 
   fprintf(stderr, "R500 Fragment Program:\n--------\n");
@@ -1571,18 +455,18 @@ static void dump_program(struct r500_fragment_program *fp)
   uint32_t inst0;
   char *str = NULL;
 
-  if (fp->const_nr) {
+  if (code->const_nr) {
     fprintf(stderr, "--------\nConstants:\n");
-    for (n = 0; n < fp->const_nr; n++) {
+    for (n = 0; n < code->const_nr; n++) {
       fprintf(stderr, "Constant %d: %f %f\n\t %f %f\n", n,
-        fp->constant[n][0], fp->constant[n][1], fp->constant[n][2],
-        fp->constant[n][3]);
+        code->constant[n][0], code->constant[n][1], code->constant[n][2],
+        code->constant[n][3]);
     }
     fprintf(stderr, "--------\n");
   }
 
-  for (n = 0; n < fp->inst_end+1; n++) {
-    inst0 = inst = fp->inst[n].inst0;
+  for (n = 0; n < code->inst_end+1; n++) {
+    inst0 = inst = code->inst[n].inst0;
     fprintf(stderr,"%d\t0:CMN_INST   0x%08x:", n, inst);
     switch(inst & 0x3) {
     case R500_INST_TYPE_ALU: str = "ALU"; break;
@@ -1601,8 +485,8 @@ static void dump_program(struct r500_fragment_program *fp)
     switch(inst0 & 0x3) {
     case 0:
     case 1:
-      fprintf(stderr,"\t1:RGB_ADDR   0x%08x:", fp->inst[n].inst1);
-      inst = fp->inst[n].inst1;
+      fprintf(stderr,"\t1:RGB_ADDR   0x%08x:", code->inst[n].inst1);
+      inst = code->inst[n].inst1;
 
       fprintf(stderr,"Addr0: %d%c, Addr1: %d%c, Addr2: %d%c, srcp:%d\n",
 	      inst & 0xff, (inst & (1<<8)) ? 'c' : 't',
@@ -1610,15 +494,15 @@ static void dump_program(struct r500_fragment_program *fp)
 	      (inst >> 20) & 0xff, (inst & (1<<28)) ? 'c' : 't',
 	      (inst >> 30));
 
-      fprintf(stderr,"\t2:ALPHA_ADDR 0x%08x:", fp->inst[n].inst2);
-      inst = fp->inst[n].inst2;
+      fprintf(stderr,"\t2:ALPHA_ADDR 0x%08x:", code->inst[n].inst2);
+      inst = code->inst[n].inst2;
       fprintf(stderr,"Addr0: %d%c, Addr1: %d%c, Addr2: %d%c, srcp:%d\n",
 	      inst & 0xff, (inst & (1<<8)) ? 'c' : 't',
 	      (inst >> 10) & 0xff, (inst & (1<<18)) ? 'c' : 't',
 	      (inst >> 20) & 0xff, (inst & (1<<28)) ? 'c' : 't',
 	      (inst >> 30));
-      fprintf(stderr,"\t3 RGB_INST:  0x%08x:", fp->inst[n].inst3);
-      inst = fp->inst[n].inst3;
+      fprintf(stderr,"\t3 RGB_INST:  0x%08x:", code->inst[n].inst3);
+      inst = code->inst[n].inst3;
       fprintf(stderr,"rgb_A_src:%d %s/%s/%s %d rgb_B_src:%d %s/%s/%s %d\n",
 	      (inst) & 0x3, toswiz((inst >> 2) & 0x7), toswiz((inst >> 5) & 0x7), toswiz((inst >> 8) & 0x7),
 	      (inst >> 11) & 0x3,
@@ -1626,16 +510,16 @@ static void dump_program(struct r500_fragment_program *fp)
 	      (inst >> 24) & 0x3);
 
 
-      fprintf(stderr,"\t4 ALPHA_INST:0x%08x:", fp->inst[n].inst4);
-      inst = fp->inst[n].inst4;
+      fprintf(stderr,"\t4 ALPHA_INST:0x%08x:", code->inst[n].inst4);
+      inst = code->inst[n].inst4;
       fprintf(stderr,"%s dest:%d%s alp_A_src:%d %s %d alp_B_src:%d %s %d w:%d\n", to_alpha_op(inst & 0xf),
 	      (inst >> 4) & 0x7f, inst & (1<<11) ? "(rel)":"",
 	      (inst >> 12) & 0x3, toswiz((inst >> 14) & 0x7), (inst >> 17) & 0x3,
 	      (inst >> 19) & 0x3, toswiz((inst >> 21) & 0x7), (inst >> 24) & 0x3,
 	      (inst >> 31) & 0x1);
 
-      fprintf(stderr,"\t5 RGBA_INST: 0x%08x:", fp->inst[n].inst5);
-      inst = fp->inst[n].inst5;
+      fprintf(stderr,"\t5 RGBA_INST: 0x%08x:", code->inst[n].inst5);
+      inst = code->inst[n].inst5;
       fprintf(stderr,"%s dest:%d%s rgb_C_src:%d %s/%s/%s %d alp_C_src:%d %s %d\n", toop(inst & 0xf),
 	      (inst >> 4) & 0x7f, inst & (1<<11) ? "(rel)":"",
 	      (inst >> 12) & 0x3, toswiz((inst >> 14) & 0x7), toswiz((inst >> 17) & 0x7), toswiz((inst >> 20) & 0x7),
@@ -1645,11 +529,11 @@ static void dump_program(struct r500_fragment_program *fp)
     case 2:
       break;
     case 3:
-      inst = fp->inst[n].inst1;
+      inst = code->inst[n].inst1;
       fprintf(stderr,"\t1:TEX_INST:  0x%08x: id: %d op:%s, %s, %s %s\n", inst, (inst >> 16) & 0xf,
 	      to_texop((inst >> 22) & 0x7), (inst & (1<<25)) ? "ACQ" : "",
 	      (inst & (1<<26)) ? "IGNUNC" : "", (inst & (1<<27)) ? "UNSCALED" : "SCALED");
-      inst = fp->inst[n].inst2;
+      inst = code->inst[n].inst2;
       fprintf(stderr,"\t2:TEX_ADDR:  0x%08x: src: %d%s %s/%s/%s/%s dst: %d%s %s/%s/%s/%s\n", inst,
 	      inst & 127, inst & (1<<7) ? "(rel)" : "",
 	      toswiz((inst >> 8) & 0x3), toswiz((inst >> 10) & 0x3),
@@ -1658,7 +542,7 @@ static void dump_program(struct r500_fragment_program *fp)
 	      toswiz((inst >> 24) & 0x3), toswiz((inst >> 26) & 0x3),
 	      toswiz((inst >> 28) & 0x3), toswiz((inst >> 30) & 0x3));
 
-      fprintf(stderr,"\t3:TEX_DXDY:  0x%08x\n", fp->inst[n].inst3);
+      fprintf(stderr,"\t3:TEX_DXDY:  0x%08x\n", code->inst[n].inst3);
       break;
     }
     fprintf(stderr,"\n");
diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.h b/src/mesa/drivers/dri/r300/r500_fragprog.h
index 5dd2def1c40..ff6a9002c14 100644
--- a/src/mesa/drivers/dri/r300/r500_fragprog.h
+++ b/src/mesa/drivers/dri/r300/r500_fragprog.h
@@ -36,10 +36,14 @@
 #include "glheader.h"
 #include "macros.h"
 #include "enums.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
 #include "shader/program.h"
 #include "shader/prog_instruction.h"
 
 #include "r300_context.h"
+#include "r300_state.h"
+#include "radeon_program.h"
 
 /* supported hw opcodes */
 #define PFS_OP_MAD 0
@@ -76,4 +80,13 @@ struct r500_fragment_program;
 extern void r500TranslateFragmentShader(r300ContextPtr r300,
 					struct r500_fragment_program *fp);
 
+struct r500_fragment_program_compiler { /* State passed to the program transformations and to r500FragmentProgramEmit(). */
+	r300ContextPtr r300; /* Driver context; presumably the one owning fp - not read by the visible emit code. */
+	struct r500_fragment_program *fp; /* Program being compiled; the emitter sets fp->error and fp->writes_depth. */
+	struct r500_fragment_program_code *code; /* Receives the emitted instructions and constant table. */
+	struct radeon_compiler compiler; /* Generic radeon compiler state (Ctx, Clauses) that the transformations run on. */
+};
+
+extern GLboolean r500FragmentProgramEmit(struct r500_fragment_program_compiler *compiler);
+
 #endif
diff --git a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c
new file mode 100644
index 00000000000..e1ad342690b
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c
@@ -0,0 +1,1533 @@
+/*
+ * Copyright (C) 2005 Ben Skeggs.
+ *
+ * Copyright 2008 Corbin Simpson <[email protected]>
+ * Adaptation and modification for ATI/AMD Radeon R500 GPU chipsets.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * \file
+ *
+ * \author Ben Skeggs <[email protected]>
+ *
+ * \author Jerome Glisse <[email protected]>
+ *
+ * \author Corbin Simpson <[email protected]>
+ *
+ * \todo Depth write, WPOS/FOGC inputs
+ *
+ * \todo FogOption
+ *
+ */
+
+#include "glheader.h"
+#include "macros.h"
+#include "enums.h"
+#include "shader/prog_instruction.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
+
+#include "r300_context.h"
+#include "r500_fragprog.h"
+#include "r300_reg.h"
+#include "r300_state.h"
+
+/* Mapping Mesa registers to R500 temporaries */
+struct reg_acc {
+	int reg;		/* Assigned hw temp */
+	unsigned int refcount;	/* Number of uses by mesa program */
+};
+
+/**
+ * Describe the current lifetime information for an R300 temporary
+ */
+struct reg_lifetime {
+	/* Index of the first slot where this register is free in the sense
+	   that it can be used as a new destination register.
+	   This is -1 if the register has been assigned to a Mesa register
+	   and the last access to the register has not yet been emitted */
+	int free;
+
+	/* Index of the first slot where this register is currently reserved.
+	   This is used to stop e.g. a scalar operation from being moved
+	   before the allocation time of a register that was first allocated
+	   for a vector operation. */
+	int reserved;
+
+	/* Index of the first slot in which the register can be used as a
+	   source without losing the value that is written by the last
+	   emitted instruction that writes to the register */
+	int vector_valid;
+	int scalar_valid;
+
+	/* Index to the slot where the register was last read.
+	   This is also the first slot in which the register may be written again */
+	int vector_lastread;
+	int scalar_lastread;
+};
+
+/**
+ * Store usage information about an ALU instruction slot during the
+ * compilation of a fragment program.
+ */
+#define SLOT_SRC_VECTOR  (1<<0)
+#define SLOT_SRC_SCALAR  (1<<3)
+#define SLOT_SRC_BOTH    (SLOT_SRC_VECTOR | SLOT_SRC_SCALAR)
+#define SLOT_OP_VECTOR   (1<<16)
+#define SLOT_OP_SCALAR   (1<<17)
+#define SLOT_OP_BOTH     (SLOT_OP_VECTOR | SLOT_OP_SCALAR)
+
+struct r500_pfs_compile_slot {
+	/* Bitmask indicating which parts of the slot are used, using SLOT_ constants
+	   defined above */
+	unsigned int used;
+
+	/* Selected sources */
+	int vsrc[3];
+	int ssrc[3];
+};
+
+/**
+ * Store information during compilation of fragment programs.
+ */
+struct r500_pfs_compile_state {
+	struct r500_fragment_program_compiler *compiler;
+
+	/* number of ALU slots used so far */
+	int nrslots;
+
+	/* Track which (parts of) slots are already filled with instructions */
+	struct r500_pfs_compile_slot slot[PFS_MAX_ALU_INST];
+
+	/* Track the validity of R300 temporaries */
+	struct reg_lifetime hwtemps[PFS_NUM_TEMP_REGS];
+
+	/* Used to map Mesa's inputs/temps onto hardware temps */
+	int temp_in_use;
+	struct reg_acc temps[PFS_NUM_TEMP_REGS];
+	struct reg_acc inputs[32];	/* don't actually need 32... */
+
+	/* Track usage of hardware temps, for register allocation,
+	 * indirection detection, etc. */
+	GLuint used_in_node;
+	GLuint dest_in_node;
+};
+
+/*
+ * Useful macros and values
+ */
+#define ERROR(fmt, args...) do {			\
+		fprintf(stderr, "%s::%s(): " fmt "\n",	\
+			__FILE__, __FUNCTION__, ##args);	\
+		cs->compiler->fp->error = GL_TRUE;			\
+	} while(0)
+
+#define PROG_CODE struct r500_fragment_program_code *code = cs->compiler->code
+
+#define R500_US_NUM_TEMP_REGS 128
+#define R500_US_NUM_CONST_REGS 256
+
+/* "Register" flags */
+#define REG_CONSTANT (1 << 8)
+#define REG_SRC_REL (1 << 9)
+#define REG_DEST_REL (1 << 7)
+
+/* Swizzle tools */
+#define R500_SWIZZLE_ZERO 4
+#define R500_SWIZZLE_HALF 5
+#define R500_SWIZZLE_ONE 6
+#define R500_SWIZ_RGB_ZERO ((4 << 0) | (4 << 3) | (4 << 6))
+#define R500_SWIZ_RGB_ONE ((6 << 0) | (6 << 3) | (6 << 6))
+#define R500_SWIZ_RGB_RGB ((0 << 0) | (1 << 3) | (2 << 6))
+#define R500_SWIZ_MOD_NEG 1
+#define R500_SWIZ_MOD_ABS 2
+#define R500_SWIZ_MOD_NEG_ABS 3
+/* Swizzles for inst2 */
+#define MAKE_SWIZ_TEX_STRQ(x) (x << 8)
+#define MAKE_SWIZ_TEX_RGBA(x) (x << 24)
+/* Swizzles for inst3 */
+#define MAKE_SWIZ_RGB_A(x) (x << 2)
+#define MAKE_SWIZ_RGB_B(x) (x << 15)
+/* Swizzles for inst4 */
+#define MAKE_SWIZ_ALPHA_A(x) (x << 14)
+#define MAKE_SWIZ_ALPHA_B(x) (x << 21)
+/* Swizzle for inst5 */
+#define MAKE_SWIZ_RGBA_C(x) (x << 14)
+#define MAKE_SWIZ_ALPHA_C(x) (x << 27)
+
+/* Writemasks */
+#define R500_WRITEMASK_G 0x2
+#define R500_WRITEMASK_B 0x4
+#define R500_WRITEMASK_RGB 0x7
+#define R500_WRITEMASK_A 0x8
+#define R500_WRITEMASK_AR 0x9
+#define R500_WRITEMASK_AG 0xA
+#define R500_WRITEMASK_ARG 0xB
+#define R500_WRITEMASK_AB 0xC
+#define R500_WRITEMASK_ARGB 0xF
+
+/* 1/(2pi), needed for quick modulus in trig insts
+ * Thanks to glisse for pointing out how to do it! */
+static const GLfloat RCP_2PI[] = {0.15915494309189535,
+	0.15915494309189535,
+	0.15915494309189535,
+	0.15915494309189535};
+
+static const GLfloat LIT[] = {127.999999,
+	127.999999,
+	127.999999,
+	-127.999999};
+
+static inline GLuint make_rgb_swizzle(struct prog_src_register src) { /* Pack the first three swizzle selectors of src into a 9-bit RGB swizzle, plus a negate modifier. */
+	GLuint swiz = 0x0;
+	GLuint temp;
+	/* This could be optimized, but it should be plenty fast already. */
+	int i;
+	for (i = 0; i < 3; i++) {
+	        temp = GET_SWZ(src.Swizzle, i);
+		/* Fix SWIZZLE_ONE */
+		if (temp == 5) temp++; /* selector 5 becomes 6 == R500_SWIZZLE_ONE (5 would be R500_SWIZZLE_HALF) */
+		swiz |= temp << i*3; /* component i occupies bits 3*i .. 3*i+2 */
+	}
+	if (src.NegateBase)
+		swiz |= (R500_SWIZ_MOD_NEG << 9); /* modifier goes directly above the nine selector bits */
+	return swiz;
+}
+
+static inline GLuint make_rgba_swizzle(GLuint src) { /* Re-encode a raw four-component swizzle word; no negate handling here. */
+	GLuint swiz = 0x0;
+	GLuint temp;
+	int i;
+	for (i = 0; i < 4; i++) {
+	        temp = GET_SWZ(src, i);
+		/* Fix SWIZZLE_ONE */
+		if (temp == 5) temp++; /* selector 5 becomes 6 == R500_SWIZZLE_ONE */
+		swiz |= temp << i*3; /* same 3-bits-per-component packing, so GET_SWZ still works on the result (emit_mov relies on this) */
+	}
+	return swiz;
+}
+
+static inline GLuint make_alpha_swizzle(struct prog_src_register src) { /* Build the single-selector alpha swizzle from src's fourth component. */
+	GLuint swiz = GET_SWZ(src.Swizzle, 3);
+
+	if (swiz == 5) swiz++; /* selector 5 becomes 6 == R500_SWIZZLE_ONE */
+
+	if (src.NegateBase)
+		swiz |= (R500_SWIZ_MOD_NEG << 3); /* modifier goes directly above the three selector bits */
+
+	return swiz;
+}
+
+static inline GLuint make_sop_swizzle(struct prog_src_register src) { /* Selector for the operand of a scalar op: src's first swizzle component. */
+	GLuint swiz = GET_SWZ(src.Swizzle, 0);
+
+	if (swiz == 5) swiz++; /* selector 5 becomes 6 == R500_SWIZZLE_ONE; NegateBase is not consulted here */
+	return swiz;
+}
+
+static inline GLuint make_strq_swizzle(struct prog_src_register src) { /* Pack src's four swizzle selectors at 2 bits each for a texture coordinate. */
+	GLuint swiz = 0x0, temp = 0x0;
+	int i;
+	for (i = 0; i < 4; i++) {
+		temp = GET_SWZ(src.Swizzle, i) & 0x3; /* only two bits are kept, so selectors above 3 are truncated */
+		swiz |= temp << i*2; /* component i occupies bits 2*i .. 2*i+1 */
+	}
+	return swiz;
+}
+
+static int get_temp(struct r500_pfs_compile_state *cs, int slot) {
+	/* Return the hw register index of scratch temporary number 'slot'. */
+	PROG_CODE;
+	/* Offset past temp_reg_offset and temp_in_use, presumably to stay clear of temps already assigned. */
+	int r = code->temp_reg_offset + cs->temp_in_use + slot;
+	/* Valid indices are 0 .. R500_US_NUM_TEMP_REGS-1, so r equal to the count is already out of range. */
+	if (r >= R500_US_NUM_TEMP_REGS) {
+		ERROR("Too many temporary registers requested, can't compile!\n");
+	}
+	/* The index is returned even on overflow; ERROR() has set fp->error for the caller to see. */
+	return r;
+}
+
+/* Borrowed verbatim from r300_fragprog since it hasn't changed. */
+static GLuint emit_const4fv(struct r500_pfs_compile_state *cs,
+			    const GLfloat * cp)
+{ /* Return (slot | REG_CONSTANT) for the constant slot that refers to cp, appending a new slot if none does. */
+	PROG_CODE;
+
+	GLuint reg = 0x0;
+	int index;
+
+	for (index = 0; index < code->const_nr; ++index) {
+		if (code->constant[index] == cp) /* slots are matched by pointer identity, not by value */
+			break;
+	}
+
+	if (index >= code->const_nr) { /* not found: index == const_nr, i.e. the next free slot */
+		if (index >= R500_US_NUM_CONST_REGS) {
+			ERROR("Out of hw constants!\n");
+			return reg; /* 0 without REG_CONSTANT; ERROR() has set fp->error */
+		}
+
+		code->const_nr++;
+		code->constant[index] = cp; /* stores the pointer, not a copy of the four floats */
+	}
+
+	reg = index | REG_CONSTANT;
+	return reg;
+}
+
+static GLuint make_src(struct r500_pfs_compile_state *cs, struct prog_src_register src) { /* Map a Mesa source register to a hw register value; constants come back with REG_CONSTANT set. */
+	PROG_CODE;
+	GLuint reg;
+	switch (src.File) {
+	case PROGRAM_TEMPORARY:
+		reg = src.Index + code->temp_reg_offset; /* Mesa temp N maps to hw temp N + temp_reg_offset */
+		break;
+	case PROGRAM_INPUT:
+		reg = cs->inputs[src.Index].reg; /* hw temp recorded for this input in the compile state */
+		break;
+	case PROGRAM_LOCAL_PARAM:
+		reg = emit_const4fv(cs,
+			cs->compiler->fp->mesa_program.Base.LocalParams[src.Index]);
+		break;
+	case PROGRAM_ENV_PARAM:
+		reg = emit_const4fv(cs,
+			cs->compiler->compiler.Ctx->FragmentProgram.Parameters[src.Index]); /* env params are read from the GL context, not the program */
+		break;
+	case PROGRAM_STATE_VAR:
+	case PROGRAM_NAMED_PARAM:
+	case PROGRAM_CONSTANT: /* all three are looked up in the program's parameter list */
+		reg = emit_const4fv(cs,
+			cs->compiler->fp->mesa_program.Base.Parameters->ParameterValues[src.Index]);
+		break;
+	case PROGRAM_BUILTIN:
+		reg = 0x0; /* NOTE(review): builtin sources quietly become register 0 - confirm this is intended */
+		break;
+	default:
+		ERROR("Can't handle src.File %x\n", src.File);
+		reg = 0x0; /* placeholder value; ERROR() has set fp->error */
+		break;
+	}
+	return reg;
+}
+
+static GLuint make_dest(struct r500_pfs_compile_state *cs, struct prog_dst_register dest) { /* Map a Mesa destination register to a hw destination index. */
+	PROG_CODE;
+	GLuint reg;
+	switch (dest.File) {
+	case PROGRAM_TEMPORARY:
+		reg = dest.Index + code->temp_reg_offset; /* same mapping make_src uses for temporaries */
+		break;
+	case PROGRAM_OUTPUT:
+		/* Eventually we may need to handle multiple
+			* rendering targets... */
+		reg = dest.Index; /* the Mesa output index is used unchanged */
+		break;
+	case PROGRAM_BUILTIN:
+		reg = 0x0; /* NOTE(review): builtin destinations quietly become register 0 - confirm this is intended */
+		break;
+	default:
+		ERROR("Can't handle dest.File %x\n", dest.File);
+		reg = 0x0; /* placeholder value; ERROR() has set fp->error */
+		break;
+	}
+	return reg;
+}
+
+static void emit_tex(struct r500_pfs_compile_state *cs,
+		     struct prog_instruction *fpi, int dest, int counter)
+{ /* Write a TEX-type instruction for fpi into slot 'counter'; an output destination also gets a copy instruction in slot counter+1. */
+	PROG_CODE;
+	int hwsrc, hwdest;
+	GLuint mask;
+
+	mask = fpi->DstReg.WriteMask << 11; /* same bit position emit_alu uses for ALU destinations */
+	hwsrc = make_src(cs, fpi->SrcReg[0]);
+
+	if (fpi->DstReg.File == PROGRAM_OUTPUT) {
+		hwdest = get_temp(cs, 0); /* fetch into a scratch temp first; it is copied to the output below */
+	} else {
+		hwdest = dest;
+	}
+
+	code->inst[counter].inst0 = R500_INST_TYPE_TEX | mask
+		| R500_INST_TEX_SEM_WAIT;
+
+	code->inst[counter].inst1 = R500_TEX_ID(fpi->TexSrcUnit)
+		| R500_TEX_SEM_ACQUIRE | R500_TEX_IGNORE_UNCOVERED;
+
+	if (fpi->TexSrcTarget == TEXTURE_RECT_INDEX)
+	        code->inst[counter].inst1 |= R500_TEX_UNSCALED; /* rectangle targets are flagged as unscaled */
+
+	switch (fpi->Opcode) { /* choose the hw texture operation */
+	case OPCODE_KIL:
+		code->inst[counter].inst1 |= R500_TEX_INST_TEXKILL;
+		break;
+	case OPCODE_TEX:
+		code->inst[counter].inst1 |= R500_TEX_INST_LD;
+		break;
+	case OPCODE_TXB:
+		code->inst[counter].inst1 |= R500_TEX_INST_LODBIAS;
+		break;
+	case OPCODE_TXP:
+		code->inst[counter].inst1 |= R500_TEX_INST_PROJ;
+		break;
+	default:
+		ERROR("emit_tex can't handle opcode %x\n", fpi->Opcode); /* emission continues; only fp->error is set */
+	}
+
+	code->inst[counter].inst2 = R500_TEX_SRC_ADDR(hwsrc)
+		| MAKE_SWIZ_TEX_STRQ(make_strq_swizzle(fpi->SrcReg[0]))
+		/* | R500_TEX_SRC_S_SWIZ_R | R500_TEX_SRC_T_SWIZ_G
+		| R500_TEX_SRC_R_SWIZ_B | R500_TEX_SRC_Q_SWIZ_A */
+		| R500_TEX_DST_ADDR(hwdest)
+		| R500_TEX_DST_R_SWIZ_R | R500_TEX_DST_G_SWIZ_G
+		| R500_TEX_DST_B_SWIZ_B | R500_TEX_DST_A_SWIZ_A; /* destination swizzle is always the identity */
+
+	code->inst[counter].inst3 = 0x0;
+	code->inst[counter].inst4 = 0x0;
+	code->inst[counter].inst5 = 0x0;
+
+	if (fpi->DstReg.File == PROGRAM_OUTPUT) {
+		counter++; /* local copy only: the caller has to account for the extra slot itself */
+		code->inst[counter].inst0 = R500_INST_TYPE_OUT
+			| R500_INST_TEX_SEM_WAIT | (mask << 4); /* WriteMask << 15, the position emit_alu uses for colour outputs */
+		code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0));
+		code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0));
+		code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+			| MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB)
+			| R500_ALU_RGB_SEL_B_SRC0
+			| MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB)
+			| R500_ALU_RGB_OMOD_DISABLE;
+		code->inst[counter].inst4 = R500_ALPHA_OP_CMP
+			| R500_ALPHA_ADDRD(dest)
+			| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_ALPHA_SWIZ_A_A)
+			| R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(R500_ALPHA_SWIZ_A_A)
+			| R500_ALPHA_OMOD_DISABLE;
+		code->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
+			| R500_ALU_RGBA_ADDRD(dest)
+			| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+			| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO); /* same CMP-with-zero-C layout emit_mov uses for a MOV */
+	}
+}
+
+static void emit_alu(struct r500_pfs_compile_state *cs, int counter, struct prog_instruction *fpi) { /* Clear slot 'counter' and set its type and mask bits from fpi's destination; callers fill in the operation. */
+	PROG_CODE;
+	/* Ideally, we shouldn't have to explicitly clear memory here! */
+	code->inst[counter].inst0 = 0x0;
+	code->inst[counter].inst1 = 0x0;
+	code->inst[counter].inst2 = 0x0;
+	code->inst[counter].inst3 = 0x0;
+	code->inst[counter].inst4 = 0x0;
+	code->inst[counter].inst5 = 0x0;
+
+	if (fpi->DstReg.File == PROGRAM_OUTPUT) {
+		code->inst[counter].inst0 = R500_INST_TYPE_OUT;
+
+		if (fpi->DstReg.Index == FRAG_RESULT_COLR)
+			code->inst[counter].inst0 |= (fpi->DstReg.WriteMask << 15); /* colour outputs carry the write mask at bit 15 */
+
+		if (fpi->DstReg.Index == FRAG_RESULT_DEPR) {
+			code->inst[counter].inst4 |= R500_ALPHA_W_OMASK; /* callers must OR into inst4 afterwards or this bit is lost */
+			/* Notify the state emission! */
+			cs->compiler->fp->writes_depth = GL_TRUE;
+		}
+	} else {
+		code->inst[counter].inst0 = R500_INST_TYPE_ALU
+			/* pixel_mask */
+			| (fpi->DstReg.WriteMask << 11);
+	}
+
+	code->inst[counter].inst0 |= R500_INST_TEX_SEM_WAIT; /* set on every instruction emitted through this helper */
+}
+
+static void emit_mov(struct r500_pfs_compile_state *cs, int counter, struct prog_instruction *fpi, GLuint src_reg, GLuint swizzle, GLuint dest) { /* Emit a swizzled copy of hw register src_reg into dest at slot 'counter'; 'swizzle' is a raw Mesa swizzle word. */
+	PROG_CODE;
+	/* The r3xx shader uses MAD to implement MOV. We are using CMP, since
+	 * it is technically more accurate and recommended by ATI/AMD. */
+	emit_alu(cs, counter, fpi);
+	code->inst[counter].inst1 = R500_RGB_ADDR0(src_reg);
+	code->inst[counter].inst2 = R500_ALPHA_ADDR0(src_reg);
+	/* (De)mangle the swizzle from Mesa to R500. */
+	swizzle = make_rgba_swizzle(swizzle); /* the source's negate flag is not part of this word, so it is not applied */
+	/* 0x1FF is 9 bits, size of an RGB swizzle. */
+	code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+		| MAKE_SWIZ_RGB_A((swizzle & 0x1ff))
+		| R500_ALU_RGB_SEL_B_SRC0
+		| MAKE_SWIZ_RGB_B((swizzle & 0x1ff))
+		| R500_ALU_RGB_OMOD_DISABLE;
+	code->inst[counter].inst4 |= R500_ALPHA_OP_CMP /* |= keeps R500_ALPHA_W_OMASK if emit_alu set it */
+		| R500_ALPHA_ADDRD(dest)
+		| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(GET_SWZ(swizzle, 3)) /* fourth selector of the converted word */
+		| R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(GET_SWZ(swizzle, 3))
+		| R500_ALPHA_OMOD_DISABLE;
+	code->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
+		| R500_ALU_RGBA_ADDRD(dest)
+		| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+		| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO); /* operand C is the constant zero for both RGB and alpha */
+}
+
+static void emit_mad(struct r500_pfs_compile_state *cs, int counter, struct prog_instruction *fpi, int one, int two, int three) { /* Emit a MAD at slot 'counter'. one/two/three choose operands A/B/C: 0..2 index fpi->SrcReg, R500_SWIZZLE_ZERO/ONE give a literal. */
+	PROG_CODE;
+	/* Note: This code was all Corbin's. Corbin is a rather hackish coder.
+	 * If you can make it pretty or fast, please do so! */
+	emit_alu(cs, counter, fpi);
+	/* Common MAD stuff */
+	code->inst[counter].inst4 |= R500_ALPHA_OP_MAD
+		| R500_ALPHA_ADDRD(make_dest(cs, fpi->DstReg));
+	code->inst[counter].inst5 |= R500_ALU_RGBA_OP_MAD
+		| R500_ALU_RGBA_ADDRD(make_dest(cs, fpi->DstReg));
+	switch (one) { /* operand A, read through source address 0 */
+		case 0:
+		case 1:
+		case 2:
+			code->inst[counter].inst1 |= R500_RGB_ADDR0(make_src(cs, fpi->SrcReg[one]));
+			code->inst[counter].inst2 |= R500_ALPHA_ADDR0(make_src(cs, fpi->SrcReg[one]));
+			code->inst[counter].inst3 |= R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[one]));
+			code->inst[counter].inst4 |= R500_ALPHA_SEL_A_SRC0
+				| MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[one]));
+			break;
+		case R500_SWIZZLE_ZERO: /* literal 0: only the swizzle is set, no source address */
+			code->inst[counter].inst3 |= MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ZERO);
+			code->inst[counter].inst4 |= MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ZERO);
+			break;
+		case R500_SWIZZLE_ONE: /* literal 1: only the swizzle is set, no source address */
+			code->inst[counter].inst3 |= MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE);
+			code->inst[counter].inst4 |= MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE);
+			break;
+		default:
+			ERROR("Bad src index in emit_mad: %d\n", one);
+			break;
+	}
+	switch (two) { /* operand B, read through source address 1 */
+		case 0:
+		case 1:
+		case 2:
+			code->inst[counter].inst1 |= R500_RGB_ADDR1(make_src(cs, fpi->SrcReg[two]));
+			code->inst[counter].inst2 |= R500_ALPHA_ADDR1(make_src(cs, fpi->SrcReg[two]));
+			code->inst[counter].inst3 |= R500_ALU_RGB_SEL_B_SRC1
+				| MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[two]));
+			code->inst[counter].inst4 |= R500_ALPHA_SEL_B_SRC1
+				| MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[two]));
+			break;
+		case R500_SWIZZLE_ZERO:
+			code->inst[counter].inst3 |= MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ZERO);
+			code->inst[counter].inst4 |= MAKE_SWIZ_ALPHA_B(R500_SWIZZLE_ZERO);
+			break;
+		case R500_SWIZZLE_ONE:
+			code->inst[counter].inst3 |= MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ONE);
+			code->inst[counter].inst4 |= MAKE_SWIZ_ALPHA_B(R500_SWIZZLE_ONE);
+			break;
+		default:
+			ERROR("Bad src index in emit_mad: %d\n", two);
+			break;
+	}
+	switch (three) { /* operand C, read through source address 2; its selects and swizzles all live in inst5 */
+		case 0:
+		case 1:
+		case 2:
+			code->inst[counter].inst1 |= R500_RGB_ADDR2(make_src(cs, fpi->SrcReg[three]));
+			code->inst[counter].inst2 |= R500_ALPHA_ADDR2(make_src(cs, fpi->SrcReg[three]));
+			code->inst[counter].inst5 |= R500_ALU_RGBA_SEL_C_SRC2
+				| MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[three]))
+				| R500_ALU_RGBA_ALPHA_SEL_C_SRC2
+				| MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[three]));
+			break;
+		case R500_SWIZZLE_ZERO:
+			code->inst[counter].inst5 |= MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+				| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
+			break;
+		case R500_SWIZZLE_ONE:
+			code->inst[counter].inst5 |= MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ONE)
+			| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ONE);
+			break;
+		default:
+			ERROR("Bad src index in emit_mad: %d\n", three);
+			break;
+	}
+}
+
+static void emit_sop(struct r500_pfs_compile_state *cs, int counter, struct prog_instruction *fpi, int opcode, GLuint src, GLuint swiz, GLuint dest) { /* Emit a one-operand op (COS/EX2/LG2/RCP/RSQ/SIN) at slot 'counter', reading hw register src with alpha swizzle swiz. */
+	PROG_CODE;
+	emit_alu(cs, counter, fpi);
+	code->inst[counter].inst1 = R500_RGB_ADDR0(src);
+	code->inst[counter].inst2 = R500_ALPHA_ADDR0(src);
+	code->inst[counter].inst4 |= R500_ALPHA_ADDRD(dest) /* |= keeps R500_ALPHA_W_OMASK if emit_alu set it */
+		| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(swiz);
+	code->inst[counter].inst5 = R500_ALU_RGBA_OP_SOP /* RGB opcode is SOP; the actual operation is selected in inst4 below */
+		| R500_ALU_RGBA_ADDRD(dest);
+	switch (opcode) { /* Mesa opcode to hw alpha opcode */
+		case OPCODE_COS:
+			code->inst[counter].inst4 |= R500_ALPHA_OP_COS;
+			break;
+		case OPCODE_EX2:
+			code->inst[counter].inst4 |= R500_ALPHA_OP_EX2;
+			break;
+		case OPCODE_LG2:
+			code->inst[counter].inst4 |= R500_ALPHA_OP_LN2; /* Mesa's LG2 maps to the hw opcode named LN2 */
+			break;
+		case OPCODE_RCP:
+			code->inst[counter].inst4 |= R500_ALPHA_OP_RCP;
+			break;
+		case OPCODE_RSQ:
+			code->inst[counter].inst4 |= R500_ALPHA_OP_RSQ;
+			break;
+		case OPCODE_SIN:
+			code->inst[counter].inst4 |= R500_ALPHA_OP_SIN;
+			break;
+		default:
+			ERROR("Bad opcode in emit_sop: %d\n", opcode); /* the slot is left without an alpha opcode; fp->error is set */
+			break;
+	}
+}
+
+static int do_inst(struct r500_pfs_compile_state *cs, struct prog_instruction *fpi, int counter) {
+	PROG_CODE;
+	GLuint src[3], dest = 0;
+	int temp_swiz = 0;
+
+	if (fpi->Opcode != OPCODE_KIL) {
+		dest = make_dest(cs, fpi->DstReg);
+	}
+
+	switch (fpi->Opcode) {
+		case OPCODE_ABS:
+			emit_mov(cs, counter, fpi, make_src(cs, fpi->SrcReg[0]), fpi->SrcReg[0].Swizzle, dest);
+			code->inst[counter].inst3 |= R500_ALU_RGB_MOD_A_ABS
+				| R500_ALU_RGB_MOD_B_ABS;
+			code->inst[counter].inst4 |= R500_ALPHA_MOD_A_ABS
+				| R500_ALPHA_MOD_B_ABS;
+			break;
+		case OPCODE_ADD:
+			/* Variation on MAD: 1*src0+src1 */
+			emit_mad(cs, counter, fpi, R500_SWIZZLE_ONE, 0, 1);
+			break;
+		case OPCODE_CMP:
+			/* This inst's selects need to be swapped as follows:
+				* 0 -> C ; 1 -> B ; 2 -> A */
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			src[1] = make_src(cs, fpi->SrcReg[1]);
+			src[2] = make_src(cs, fpi->SrcReg[2]);
+			emit_alu(cs, counter, fpi);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(src[2])
+				| R500_RGB_ADDR1(src[1]) | R500_RGB_ADDR2(src[0]);
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[2])
+				| R500_ALPHA_ADDR1(src[1]) | R500_ALPHA_ADDR2(src[0]);
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[2]))
+				| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+			code->inst[counter].inst4 |= R500_ALPHA_OP_CMP
+				| R500_ALPHA_ADDRD(dest)
+				| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[2]))
+				| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
+				| R500_ALU_RGBA_ADDRD(dest)
+				| R500_ALU_RGBA_SEL_C_SRC2
+				| MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[0]))
+				| R500_ALU_RGBA_ALPHA_SEL_C_SRC2
+				| MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[0]));
+			break;
+		case OPCODE_COS:
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			src[1] = emit_const4fv(cs, RCP_2PI);
+			code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
+				| (R500_WRITEMASK_ARGB << 11);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+				| R500_RGB_ADDR1(src[1]);
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+				| R500_ALPHA_ADDR1(src[1]);
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB)
+				| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB);
+			code->inst[counter].inst4 = R500_ALPHA_OP_MAD
+				| R500_ALPHA_ADDRD(get_temp(cs, 0))
+				| R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A
+				| R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+				| R500_ALU_RGBA_ADDRD(get_temp(cs, 0))
+				| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+				| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
+			counter++;
+			code->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0));
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0));
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB);
+			code->inst[counter].inst4 = R500_ALPHA_OP_FRC
+				| R500_ALPHA_ADDRD(get_temp(cs, 1))
+				| R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A;
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC
+				| R500_ALU_RGBA_ADDRD(get_temp(cs, 1));
+			counter++;
+			emit_sop(cs, counter, fpi, OPCODE_COS, get_temp(cs, 1), make_sop_swizzle(fpi->SrcReg[0]), dest);
+			break;
+		case OPCODE_DP3:
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			src[1] = make_src(cs, fpi->SrcReg[1]);
+			emit_alu(cs, counter, fpi);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+				| R500_RGB_ADDR1(src[1]);
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+				| R500_ALPHA_ADDR1(src[1]);
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+				| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+			code->inst[counter].inst4 |= R500_ALPHA_OP_DP
+				| R500_ALPHA_ADDRD(dest)
+				| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+				| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_DP3
+				| R500_ALU_RGBA_ADDRD(dest);
+			break;
+		case OPCODE_DP4:
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			src[1] = make_src(cs, fpi->SrcReg[1]);
+			/* Based on DP3 */
+			emit_alu(cs, counter, fpi);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+				| R500_RGB_ADDR1(src[1]);
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+				| R500_ALPHA_ADDR1(src[1]);
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+				| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+			code->inst[counter].inst4 |= R500_ALPHA_OP_DP
+				| R500_ALPHA_ADDRD(dest)
+				| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+				| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_DP4
+				| R500_ALU_RGBA_ADDRD(dest);
+			break;
+		case OPCODE_DPH:
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			src[1] = make_src(cs, fpi->SrcReg[1]);
+			/* Based on DP3 */
+			emit_alu(cs, counter, fpi);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+				| R500_RGB_ADDR1(src[1]);
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+				| R500_ALPHA_ADDR1(src[1]);
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+				| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+			code->inst[counter].inst4 |= R500_ALPHA_OP_DP
+				| R500_ALPHA_ADDRD(dest)
+				| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
+				| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_DP4
+				| R500_ALU_RGBA_ADDRD(dest);
+			break;
+		case OPCODE_DST:
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			src[1] = make_src(cs, fpi->SrcReg[1]);
+			/* [1, src0.y*src1.y, src0.z, src1.w]
+				* So basically MUL with lotsa swizzling. */
+			emit_alu(cs, counter, fpi);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+				| R500_RGB_ADDR1(src[1]);
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+				| R500_ALPHA_ADDR1(src[1]);
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| R500_ALU_RGB_SEL_B_SRC1;
+			/* Select [1, y, z, 1] */
+			temp_swiz = (make_rgb_swizzle(fpi->SrcReg[0]) & ~0x7) | R500_SWIZZLE_ONE;
+			code->inst[counter].inst3 |= MAKE_SWIZ_RGB_A(temp_swiz);
+			/* Select [1, y, 1, w] */
+			temp_swiz = (make_rgb_swizzle(fpi->SrcReg[0]) & ~0x1c7) | R500_SWIZZLE_ONE | (R500_SWIZZLE_ONE << 6);
+			code->inst[counter].inst3 |= MAKE_SWIZ_RGB_B(temp_swiz);
+			code->inst[counter].inst4 |= R500_ALPHA_OP_MAD
+				| R500_ALPHA_ADDRD(dest)
+				| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
+				| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+				| R500_ALU_RGBA_ADDRD(dest)
+				| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+				| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
+			break;
+		case OPCODE_EX2:
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			emit_sop(cs, counter, fpi, OPCODE_EX2, src[0], make_sop_swizzle(fpi->SrcReg[0]), dest);
+			break;
+		case OPCODE_FLR:
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			code->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]);
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]);
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]));
+			code->inst[counter].inst4 |= R500_ALPHA_OP_FRC
+				| R500_ALPHA_ADDRD(get_temp(cs, 0))
+				| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]));
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC
+				| R500_ALU_RGBA_ADDRD(get_temp(cs, 0));
+			counter++;
+			emit_alu(cs, counter, fpi);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+				| R500_RGB_ADDR1(get_temp(cs, 0));
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+				| R500_ALPHA_ADDR1(get_temp(cs, 0));
+			code->inst[counter].inst3 = MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE)
+				| R500_ALU_RGB_SEL_B_SRC0 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[0]));
+			code->inst[counter].inst4 = R500_ALPHA_OP_MAD
+				| R500_ALPHA_ADDRD(dest)
+				| R500_ALPHA_SWIZ_A_A
+				| R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[0]));
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+				| R500_ALU_RGBA_ADDRD(dest)
+				| R500_ALU_RGBA_SEL_C_SRC1
+				| MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[0]))
+				| R500_ALU_RGBA_ALPHA_SEL_C_SRC1
+				| MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[0]))
+				| R500_ALU_RGBA_MOD_C_NEG;
+			break;
+		case OPCODE_FRC:
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			emit_alu(cs, counter, fpi);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]);
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]);
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]));
+			code->inst[counter].inst4 |= R500_ALPHA_OP_FRC
+				| R500_ALPHA_ADDRD(dest)
+				| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]));
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC
+				| R500_ALU_RGBA_ADDRD(dest);
+			break;
+		case OPCODE_LG2:
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			emit_sop(cs, counter, fpi, OPCODE_LG2, src[0], make_sop_swizzle(fpi->SrcReg[0]), dest);
+			break;
+		case OPCODE_LIT:
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			src[1] = emit_const4fv(cs, LIT);
+			/* First inst: MAX temp, input, [0, 0, 0, -128]
+				* Write: RG, A  */
+			code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
+				| (R500_WRITEMASK_ARG << 11);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) | R500_RGB_ADDR1(src[1]);
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) | R500_ALPHA_ADDR1(src[1]);
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+				| MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ZERO);
+			code->inst[counter].inst4 = R500_ALPHA_OP_MAX
+				| R500_ALPHA_ADDRD(get_temp(cs, 0))
+				| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+				| R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAX
+				| R500_ALU_RGBA_ADDRD(get_temp(cs, 0));
+			counter++;
+			/* Second inst: MIN temp, temp, [x, x, x, 128]
+				* Write: A */
+			code->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_A << 11);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0)) | R500_RGB_ADDR1(src[1]);
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0)) | R500_ALPHA_ADDR1(src[1]);
+			/* code->inst[counter].inst3; */
+			code->inst[counter].inst4 = R500_ALPHA_OP_MAX
+				| R500_ALPHA_ADDRD(dest)
+				| R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A
+				| R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAX
+				| R500_ALU_RGBA_ADDRD(dest);
+			counter++;
+			/* Third-fifth insts: POW temp, temp.y, temp.w
+				* Write: B */
+			emit_sop(cs, counter, fpi, OPCODE_LG2, get_temp(cs, 0), SWIZZLE_Y, get_temp(cs, 1));
+			code->inst[counter].inst0 |= (R500_WRITEMASK_ARGB << 11);
+			counter++;
+			code->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 1))
+				| R500_RGB_ADDR1(get_temp(cs, 0));
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 1))
+				| R500_ALPHA_ADDR1(get_temp(cs, 0));
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB)
+				| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB);
+			code->inst[counter].inst4 = R500_ALPHA_OP_MAD
+				| R500_ALPHA_ADDRD(get_temp(cs, 1))
+				| R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A
+				| R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+				| R500_ALU_RGBA_ADDRD(get_temp(cs, 1))
+				| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+				| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
+			counter++;
+			emit_sop(cs, counter, fpi, OPCODE_EX2, get_temp(cs, 1), SWIZZLE_W, get_temp(cs, 0));
+			code->inst[counter].inst0 |= (R500_WRITEMASK_B << 11);
+			counter++;
+			/* Sixth inst: CMP dest, temp.xxxx, temp.[1, x, z, 1], temp.[1, x, 0, 1];
+				* Write: ARGB
+				* This inst's selects need to be swapped as follows:
+				* 0 -> C ; 1 -> B ; 2 -> A */
+			emit_alu(cs, counter, fpi);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0));
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0));
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| R500_ALU_RGB_R_SWIZ_A_1
+				| R500_ALU_RGB_G_SWIZ_A_R
+				| R500_ALU_RGB_B_SWIZ_A_B
+				| R500_ALU_RGB_SEL_B_SRC0
+				| R500_ALU_RGB_R_SWIZ_B_1
+				| R500_ALU_RGB_G_SWIZ_B_R
+				| R500_ALU_RGB_B_SWIZ_B_0;
+			code->inst[counter].inst4 |= R500_ALPHA_OP_CMP
+				| R500_ALPHA_ADDRD(dest)
+				| R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_1
+				| R500_ALPHA_SEL_B_SRC0 | R500_ALPHA_SWIZ_B_1;
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
+				| R500_ALU_RGBA_ADDRD(dest)
+				| R500_ALU_RGBA_SEL_C_SRC0
+				| R500_ALU_RGBA_ALPHA_SEL_C_SRC0
+				| R500_ALU_RGBA_R_SWIZ_R
+				| R500_ALU_RGBA_G_SWIZ_R
+				| R500_ALU_RGBA_B_SWIZ_R
+				| R500_ALU_RGBA_A_SWIZ_R;
+			break;
+		case OPCODE_LRP:
+			/* src0 * src1 + INV(src0) * src2
+				* 1) MUL src0, src1, temp
+				* 2) PRE 1-src0; MAD srcp, src2, temp */
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			src[1] = make_src(cs, fpi->SrcReg[1]);
+			src[2] = make_src(cs, fpi->SrcReg[2]);
+			code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
+				| R500_INST_NOP | (R500_WRITEMASK_ARGB << 11);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+				| R500_RGB_ADDR1(src[1]);
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+				| R500_ALPHA_ADDR1(src[1]);
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+				| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+			code->inst[counter].inst4 = R500_ALPHA_OP_MAD
+				| R500_ALPHA_ADDRD(get_temp(cs, 0))
+				| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+				| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+				| R500_ALU_RGBA_ADDRD(get_temp(cs, 0))
+				| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+				| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
+			counter++;
+			emit_alu(cs, counter, fpi);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+				| R500_RGB_ADDR1(src[2])
+				| R500_RGB_ADDR2(get_temp(cs, 0))
+				| R500_RGB_SRCP_OP_1_MINUS_RGB0;
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+				| R500_ALPHA_ADDR1(src[2])
+				| R500_ALPHA_ADDR2(get_temp(cs, 0))
+				| R500_ALPHA_SRCP_OP_1_MINUS_A0;
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRCP
+				| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+				| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB);
+			code->inst[counter].inst4 |= R500_ALPHA_OP_MAD
+				| R500_ALPHA_ADDRD(dest)
+				| R500_ALPHA_SEL_A_SRCP | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+				| R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+				| R500_ALU_RGBA_ADDRD(dest)
+				| R500_ALU_RGBA_SEL_C_SRC2 | MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[2]))
+				| R500_ALU_RGBA_ALPHA_SEL_C_SRC2
+				| MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[2]));
+			break;
+		case OPCODE_MAD:
+			emit_mad(cs, counter, fpi, 0, 1, 2);
+			break;
+		case OPCODE_MAX:
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			src[1] = make_src(cs, fpi->SrcReg[1]);
+			emit_alu(cs, counter, fpi);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) | R500_RGB_ADDR1(src[1]);
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) | R500_ALPHA_ADDR1(src[1]);
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+				| R500_ALU_RGB_SEL_B_SRC1
+				| MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+			code->inst[counter].inst4 |= R500_ALPHA_OP_MAX
+				| R500_ALPHA_ADDRD(dest)
+				| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+				| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAX
+				| R500_ALU_RGBA_ADDRD(dest);
+			break;
+		case OPCODE_MIN:
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			src[1] = make_src(cs, fpi->SrcReg[1]);
+			emit_alu(cs, counter, fpi);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) | R500_RGB_ADDR1(src[1]);
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) | R500_ALPHA_ADDR1(src[1]);
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+				| R500_ALU_RGB_SEL_B_SRC1
+				| MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+			code->inst[counter].inst4 |= R500_ALPHA_OP_MIN
+				| R500_ALPHA_ADDRD(dest)
+				| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+				| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_MIN
+				| R500_ALU_RGBA_ADDRD(dest);
+			break;
+		case OPCODE_MOV:
+			emit_mov(cs, counter, fpi, make_src(cs, fpi->SrcReg[0]), fpi->SrcReg[0].Swizzle, dest);
+			break;
+		case OPCODE_MUL:
+			/* Variation on MAD: src0*src1+0 */
+			emit_mad(cs, counter, fpi, 0, 1, R500_SWIZZLE_ZERO);
+			break;
+		case OPCODE_POW:
+			/* POW(a,b) = EX2(LN2(a)*b) */
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			src[1] = make_src(cs, fpi->SrcReg[1]);
+			emit_sop(cs, counter, fpi, OPCODE_LG2, src[0], make_sop_swizzle(fpi->SrcReg[0]), get_temp(cs, 0));
+			code->inst[counter].inst0 |= (R500_WRITEMASK_ARGB << 11);
+			counter++;
+			code->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0))
+				| R500_RGB_ADDR1(src[1]);
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0))
+				| R500_ALPHA_ADDR1(src[1]);
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+				| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+			code->inst[counter].inst4 = R500_ALPHA_OP_MAD
+				| R500_ALPHA_ADDRD(get_temp(cs, 1))
+				| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+				| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+				| R500_ALU_RGBA_ADDRD(get_temp(cs, 1))
+				| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+				| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
+			counter++;
+			emit_sop(cs, counter, fpi, OPCODE_EX2, get_temp(cs, 1), make_sop_swizzle(fpi->SrcReg[0]), dest);
+			break;
+		case OPCODE_RCP:
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			emit_sop(cs, counter, fpi, OPCODE_RCP, src[0], make_sop_swizzle(fpi->SrcReg[0]), dest);
+			break;
+		case OPCODE_RSQ:
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			emit_sop(cs, counter, fpi, OPCODE_RSQ, src[0], make_sop_swizzle(fpi->SrcReg[0]), dest);
+			break;
+		case OPCODE_SCS:
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			src[1] = emit_const4fv(cs, RCP_2PI);
+			code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
+				| (R500_WRITEMASK_ARGB << 11);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+				| R500_RGB_ADDR1(src[1]);
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+				| R500_ALPHA_ADDR1(src[1]);
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB)
+				| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB);
+			code->inst[counter].inst4 = R500_ALPHA_OP_MAD
+				| R500_ALPHA_ADDRD(get_temp(cs, 0))
+				| R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A
+				| R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+				| R500_ALU_RGBA_ADDRD(get_temp(cs, 0))
+				| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+				| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
+			counter++;
+			code->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0));
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0));
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB);
+			code->inst[counter].inst4 = R500_ALPHA_OP_FRC
+				| R500_ALPHA_ADDRD(get_temp(cs, 1))
+				| R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A;
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC
+				| R500_ALU_RGBA_ADDRD(get_temp(cs, 1));
+			counter++;
+			/* Do a cosine, then a sine, masking out the channels we want to protect. */
+			/* Cosine only goes in R (x) channel. */
+			fpi->DstReg.WriteMask = 0x1;
+			emit_sop(cs, counter, fpi, OPCODE_COS, get_temp(cs, 1), make_sop_swizzle(fpi->SrcReg[0]), dest);
+			counter++;
+			/* Sine only goes in G (y) channel. */
+			fpi->DstReg.WriteMask = 0x2;
+			emit_sop(cs, counter, fpi, OPCODE_SIN, get_temp(cs, 1), make_sop_swizzle(fpi->SrcReg[0]), dest);
+			break;
+		case OPCODE_SGE:
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			src[1] = make_src(cs, fpi->SrcReg[1]);
+			code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
+				| (R500_WRITEMASK_ARGB << 11);
+			code->inst[counter].inst1 = R500_RGB_ADDR1(src[0])
+				| R500_RGB_ADDR2(src[1]);
+			code->inst[counter].inst2 = R500_ALPHA_ADDR1(src[0])
+				| R500_ALPHA_ADDR2(src[1]);
+			code->inst[counter].inst3 = /* 1 */
+				MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE)
+				| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[0]));
+			code->inst[counter].inst4 = R500_ALPHA_OP_MAD
+				| R500_ALPHA_ADDRD(get_temp(cs, 0))
+				| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
+				| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[0]));
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+				| R500_ALU_RGBA_ADDRD(get_temp(cs, 0))
+				| R500_ALU_RGBA_SEL_C_SRC2
+				| MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[1]))
+				| R500_ALU_RGBA_MOD_C_NEG
+				| R500_ALU_RGBA_ALPHA_SEL_C_SRC2
+				| MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[1]))
+				| R500_ALU_RGBA_ALPHA_MOD_C_NEG;
+			counter++;
+			/* This inst's selects need to be swapped as follows:
+				* 0 -> C ; 1 -> B ; 2 -> A */
+			emit_alu(cs, counter, fpi);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0));
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0));
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE)
+				| R500_ALU_RGB_SEL_B_SRC0
+				| MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ZERO);
+			code->inst[counter].inst4 |= R500_ALPHA_OP_CMP
+				| R500_ALPHA_ADDRD(dest)
+				| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
+				| R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(R500_SWIZZLE_ZERO);
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
+				| R500_ALU_RGBA_ADDRD(dest)
+				| R500_ALU_RGBA_SEL_C_SRC0
+				| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_RGB)
+				| R500_ALU_RGBA_ALPHA_SEL_C_SRC0
+				| R500_ALU_RGBA_A_SWIZ_A;
+			break;
+		case OPCODE_SIN:
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			src[1] = emit_const4fv(cs, RCP_2PI);
+			code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
+				| (R500_WRITEMASK_ARGB << 11);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+				| R500_RGB_ADDR1(src[1]);
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+				| R500_ALPHA_ADDR1(src[1]);
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB)
+				| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB);
+			code->inst[counter].inst4 = R500_ALPHA_OP_MAD
+				| R500_ALPHA_ADDRD(get_temp(cs, 0))
+				| R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A
+				| R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+				| R500_ALU_RGBA_ADDRD(get_temp(cs, 0))
+				| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+				| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
+			counter++;
+			code->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0));
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0));
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB);
+			code->inst[counter].inst4 = R500_ALPHA_OP_FRC
+				| R500_ALPHA_ADDRD(get_temp(cs, 1))
+				| R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A;
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC
+				| R500_ALU_RGBA_ADDRD(get_temp(cs, 1));
+			counter++;
+			emit_sop(cs, counter, fpi, OPCODE_SIN, get_temp(cs, 1), make_sop_swizzle(fpi->SrcReg[0]), dest);
+			break;
+		case OPCODE_SLT:
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			src[1] = make_src(cs, fpi->SrcReg[1]);
+			code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
+				| (R500_WRITEMASK_ARGB << 11);
+			code->inst[counter].inst1 = R500_RGB_ADDR1(src[0])
+				| R500_RGB_ADDR2(src[1]);
+			code->inst[counter].inst2 = R500_ALPHA_ADDR1(src[0])
+				| R500_ALPHA_ADDR2(src[1]);
+			code->inst[counter].inst3 = /* 1 */
+				MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE)
+				| R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[0]));
+			code->inst[counter].inst4 = R500_ALPHA_OP_MAD
+				| R500_ALPHA_ADDRD(get_temp(cs, 0))
+				| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
+				| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[0]));
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+				| R500_ALU_RGBA_ADDRD(get_temp(cs, 0))
+				| R500_ALU_RGBA_SEL_C_SRC2
+				| MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[1]))
+				| R500_ALU_RGBA_MOD_C_NEG
+				| R500_ALU_RGBA_ALPHA_SEL_C_SRC2
+				| MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[1]))
+				| R500_ALU_RGBA_ALPHA_MOD_C_NEG;
+			counter++;
+			/* This inst's selects need to be swapped as follows:
+				* 0 -> C ; 1 -> B ; 2 -> A */
+			emit_alu(cs, counter, fpi);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0));
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0));
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ZERO)
+				| R500_ALU_RGB_SEL_B_SRC0
+				| MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ONE);
+			code->inst[counter].inst4 |= R500_ALPHA_OP_CMP
+				| R500_ALPHA_ADDRD(dest)
+				| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ZERO)
+				| R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(R500_SWIZZLE_ONE);
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
+				| R500_ALU_RGBA_ADDRD(dest)
+				| R500_ALU_RGBA_SEL_C_SRC0
+				| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_RGB)
+				| R500_ALU_RGBA_ALPHA_SEL_C_SRC0
+				| R500_ALU_RGBA_A_SWIZ_A;
+			break;
+		case OPCODE_SUB:
+			/* Variation on MAD: 1*src0-src1 */
+			fpi->SrcReg[1].NegateBase = 0xF; /* NEG_XYZW */
+			emit_mad(cs, counter, fpi, R500_SWIZZLE_ONE, 0, 1);
+			break;
+		case OPCODE_SWZ:
+			/* TODO: The rarer negation masks! */
+			emit_mov(cs, counter, fpi, make_src(cs, fpi->SrcReg[0]), fpi->SrcReg[0].Swizzle, dest);
+			break;
+		case OPCODE_XPD:
+			/* src0 * src1 - src1 * src0
+				* 1) MUL temp.xyz, src0.yzx, src1.zxy
+				* 2) MAD src0.zxy, src1.yzx, -temp.xyz */
+			src[0] = make_src(cs, fpi->SrcReg[0]);
+			src[1] = make_src(cs, fpi->SrcReg[1]);
+			code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
+				| (R500_WRITEMASK_RGB << 11);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+				| R500_RGB_ADDR1(src[1]);
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+				| R500_ALPHA_ADDR1(src[1]);
+			/* Select [y, z, x] */
+			temp_swiz = make_rgb_swizzle(fpi->SrcReg[0]);
+			temp_swiz = (GET_SWZ(temp_swiz, 1) << 0) | (GET_SWZ(temp_swiz, 2) << 3) | (GET_SWZ(temp_swiz, 0) << 6);
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(temp_swiz);
+			/* Select [z, x, y] */
+			temp_swiz = make_rgb_swizzle(fpi->SrcReg[1]);
+			temp_swiz = (GET_SWZ(temp_swiz, 2) << 0) | (GET_SWZ(temp_swiz, 0) << 3) | (GET_SWZ(temp_swiz, 1) << 6);
+			code->inst[counter].inst3 |= R500_ALU_RGB_SEL_B_SRC1
+				| MAKE_SWIZ_RGB_B(temp_swiz);
+			code->inst[counter].inst4 = R500_ALPHA_OP_MAD
+				| R500_ALPHA_ADDRD(get_temp(cs, 0))
+				| R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+				| R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+				| R500_ALU_RGBA_ADDRD(get_temp(cs, 0))
+				| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+				| MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
+			counter++;
+			emit_alu(cs, counter, fpi);
+			code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+				| R500_RGB_ADDR1(src[1])
+				| R500_RGB_ADDR2(get_temp(cs, 0));
+			code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+				| R500_ALPHA_ADDR1(src[1])
+				| R500_ALPHA_ADDR2(get_temp(cs, 0));
+			/* Select [z, x, y] */
+			temp_swiz = make_rgb_swizzle(fpi->SrcReg[0]);
+			temp_swiz = (GET_SWZ(temp_swiz, 2) << 0) | (GET_SWZ(temp_swiz, 0) << 3) | (GET_SWZ(temp_swiz, 1) << 6);
+			code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+				| MAKE_SWIZ_RGB_A(temp_swiz);
+			/* Select [y, z, x] */
+			temp_swiz = make_rgb_swizzle(fpi->SrcReg[1]);
+			temp_swiz = (GET_SWZ(temp_swiz, 1) << 0) | (GET_SWZ(temp_swiz, 2) << 3) | (GET_SWZ(temp_swiz, 0) << 6);
+			code->inst[counter].inst3 |= R500_ALU_RGB_SEL_B_SRC1
+				| MAKE_SWIZ_RGB_B(temp_swiz);
+			code->inst[counter].inst4 |= R500_ALPHA_OP_MAD
+				| R500_ALPHA_ADDRD(dest)
+				| R500_ALPHA_SWIZ_A_1
+				| R500_ALPHA_SWIZ_B_1;
+			code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+				| R500_ALU_RGBA_ADDRD(dest)
+				| R500_ALU_RGBA_SEL_C_SRC2
+				| MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_RGB)
+				| R500_ALU_RGBA_MOD_C_NEG
+				| R500_ALU_RGBA_A_SWIZ_0;
+			break;
+		case OPCODE_KIL:
+		case OPCODE_TEX:
+		case OPCODE_TXB:
+		case OPCODE_TXP:
+			emit_tex(cs, fpi, dest, counter);
+				if (fpi->DstReg.File == PROGRAM_OUTPUT)
+					counter++;
+			break;
+		default:
+			ERROR("unknown fpi->Opcode %s\n", _mesa_opcode_string(fpi->Opcode));
+			break;
+	}
+
+	/* Finishing touches */
+	if (fpi->SaturateMode == SATURATE_ZERO_ONE) {
+		code->inst[counter].inst0 |= R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP;
+	}
+
+	counter++;
+
+	return counter;
+}
+
+/*
+ * Translate every instruction of every clause held by the compiler into
+ * hardware instructions by calling do_inst() in program order.
+ *
+ * cs: compile state; cs->compiler supplies the clauses, and PROG_CODE
+ *     brings the output `code` into scope.
+ *
+ * On success the number of emitted slots is stored in cs->nrslots,
+ * code->max_temp_idx is incremented by one and GL_TRUE is returned.
+ * GL_FALSE is returned as soon as the fragment program's error flag is
+ * found set after an instruction has been processed.
+ */
+static GLboolean parse_program(struct r500_pfs_compile_state *cs)
+{
+	PROG_CODE;
+	int clauseidx, counter = 0;
+
+	for (clauseidx = 0; clauseidx < cs->compiler->compiler.NumClauses; clauseidx++) {
+		struct radeon_clause* clause = &cs->compiler->compiler.Clauses[clauseidx];
+		struct prog_instruction* fpi;
+
+		int ip;
+
+		for (ip = 0; ip < clause->NumInstructions; ip++) {
+			fpi = clause->Instructions + ip;
+			/* do_inst() returns the index of the next free slot. */
+			counter = do_inst(cs, fpi, counter);
+
+			if (cs->compiler->fp->error)
+				return GL_FALSE;
+		}
+	}
+
+	/* Finish him! (If it's an ALU/OUT instruction...)
+	 * The low two bits of inst0 are compared against 1 here; presumably
+	 * that is the OUT instruction type -- TODO confirm against r300_reg.h.
+	 * A program that emitted nothing has no final instruction, so the
+	 * counter check keeps us from touching code->inst[-1]. */
+	if (counter > 0 && (code->inst[counter-1].inst0 & 0x3) == 1) {
+		code->inst[counter-1].inst0 |= R500_INST_LAST;
+	} else {
+		/* We still need to put an output inst, right? */
+		WARN_ONCE("Final FP instruction is not an OUT.\n");
+	}
+
+	cs->nrslots = counter;
+
+	code->max_temp_idx++;
+
+	return GL_TRUE;
+}
+
+/*
+ * Reset the per-compile tracking data and assign hardware registers to
+ * the fragment program inputs before any instruction is emitted.
+ *
+ * cs: compile state; cs->compiler must already point at the compiler
+ *     being run.
+ *
+ * Inputs are handed consecutive registers, starting at 0, in this order:
+ * texture coordinates, window position, primary colour, secondary colour.
+ * Any other input that is read is mapped to register 0 with a warning.
+ * Afterwards the clauses are scanned for the highest temporary index
+ * used as a source operand in order to size cs->temp_in_use and
+ * code->max_temp_idx.
+ */
+static void init_program(struct r500_pfs_compile_state *cs)
+{
+	PROG_CODE;
+	struct gl_fragment_program *mp = &cs->compiler->fp->mesa_program;
+	struct prog_instruction *fpi;
+	GLuint InputsRead = mp->Base.InputsRead;
+	GLuint temps_used = 0;
+	int i, j;
+	int clauseidx;
+
+	/* New compile, reset tracking data */
+	cs->compiler->fp->optimization =
+	    driQueryOptioni(&cs->compiler->r300->radeon.optionCache, "fp_optimization");
+	cs->compiler->fp->translated = GL_FALSE;
+	cs->compiler->fp->error = GL_FALSE;
+	code->const_nr = 0;
+	/* Size of pixel stack, plus 1. */
+	code->max_temp_idx = 1;
+	/* Temp register offset. */
+	code->temp_reg_offset = 0;
+	/* Whether or not we perform any depth writing. */
+	cs->compiler->fp->writes_depth = GL_FALSE;
+
+	/* Mark every source of every ALU slot as a constant source. */
+	for (i = 0; i < PFS_MAX_ALU_INST; i++) {
+		for (j = 0; j < 3; j++) {
+			cs->slot[i].vsrc[j] = SRC_CONST;
+			cs->slot[i].ssrc[j] = SRC_CONST;
+		}
+	}
+
+	/* Work out what temps the Mesa inputs correspond to, this must match
+	 * what setup_rs_unit does, which shouldn't be a problem as rs_unit
+	 * configures itself based on the fragprog's InputsRead
+	 *
+	 * NOTE: this depends on get_hw_temp() allocating registers in order,
+	 * starting from register 0, so we're just going to do that instead.
+	 */
+
+	/* Texcoords come first */
+	for (i = 0; i < cs->compiler->fp->ctx->Const.MaxTextureUnits; i++) {
+		if (InputsRead & (FRAG_BIT_TEX0 << i)) {
+			cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0;
+			cs->inputs[FRAG_ATTRIB_TEX0 + i].reg =
+				code->temp_reg_offset;
+			code->temp_reg_offset++;
+		}
+	}
+	InputsRead &= ~FRAG_BITS_TEX_ANY;
+
+	/* fragment position treated as a texcoord */
+	if (InputsRead & FRAG_BIT_WPOS) {
+		cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
+		cs->inputs[FRAG_ATTRIB_WPOS].reg =
+			code->temp_reg_offset;
+		code->temp_reg_offset++;
+	}
+	InputsRead &= ~FRAG_BIT_WPOS;
+
+	/* Then primary colour */
+	if (InputsRead & FRAG_BIT_COL0) {
+		cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
+		cs->inputs[FRAG_ATTRIB_COL0].reg =
+			code->temp_reg_offset;
+		code->temp_reg_offset++;
+	}
+	InputsRead &= ~FRAG_BIT_COL0;
+
+	/* Secondary color */
+	if (InputsRead & FRAG_BIT_COL1) {
+		cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
+		cs->inputs[FRAG_ATTRIB_COL1].reg =
+			code->temp_reg_offset;
+		code->temp_reg_offset++;
+	}
+	InputsRead &= ~FRAG_BIT_COL1;
+
+	/* Anything else */
+	if (InputsRead) {
+		WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead);
+		/* force read from hwreg 0 for now */
+		for (i = 0; i < 32; i++)
+			/* Unsigned one: shifting a signed 1 into bit 31 is
+			 * undefined behaviour. */
+			if (InputsRead & (1u << i))
+				cs->inputs[i].reg = 0;
+	}
+
+	/* Find one past the highest temporary index read by any instruction.
+	 * NOTE(review): only source operands are scanned, so a temporary that
+	 * is written but never read is not counted -- confirm this is
+	 * intended. */
+	for (clauseidx = 0; clauseidx < cs->compiler->compiler.NumClauses; ++clauseidx) {
+		struct radeon_clause* clause = &cs->compiler->compiler.Clauses[clauseidx];
+		int ip;
+
+		for (ip = 0; ip < clause->NumInstructions; ip++) {
+			fpi = clause->Instructions + ip;
+			for (i = 0; i < 3; i++) {
+				if (fpi->SrcReg[i].File == PROGRAM_TEMPORARY) {
+					if (fpi->SrcReg[i].Index >= temps_used)
+						temps_used = fpi->SrcReg[i].Index + 1;
+				}
+			}
+		}
+	}
+
+
+	cs->temp_in_use = temps_used + 1;
+
+	/* Input registers come first, the program's temporaries follow. */
+	code->max_temp_idx = code->temp_reg_offset + cs->temp_in_use;
+
+	if (RADEON_DEBUG & DEBUG_PIXEL)
+		fprintf(stderr, "FP temp indices: code->max_temp_idx: %d cs->temp_in_use: %d\n", code->max_temp_idx, cs->temp_in_use);
+}
+
+/*
+ * Fill the program with a fixed two-instruction fallback shader and set
+ * cs->nrslots to 2.
+ *
+ * Judging by the register macro names, slot 0 is a texture load from
+ * texture unit 0 (source and destination address 0) and slot 1 is the
+ * final output instruction, a MAD computing src0 * 1 + 0.
+ */
+static void dumb_shader(struct r500_pfs_compile_state *cs)
+{
+	PROG_CODE;
+
+	/* Slot 0: TEX instruction writing all four channels, clamped. */
+	code->inst[0].inst0 =
+		R500_INST_TYPE_TEX | R500_INST_TEX_SEM_WAIT |
+		R500_INST_RGB_WMASK_R | R500_INST_RGB_WMASK_G | R500_INST_RGB_WMASK_B |
+		R500_INST_ALPHA_WMASK |
+		R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP;
+	code->inst[0].inst1 =
+		R500_TEX_ID(0) | R500_TEX_INST_LD |
+		R500_TEX_SEM_ACQUIRE | R500_TEX_IGNORE_UNCOVERED;
+	/* Source coordinate swizzle, then destination swizzle. */
+	code->inst[0].inst2 =
+		R500_TEX_SRC_ADDR(0) |
+		R500_TEX_SRC_S_SWIZ_R | R500_TEX_SRC_T_SWIZ_G |
+		R500_TEX_DST_ADDR(0) |
+		R500_TEX_DST_R_SWIZ_R | R500_TEX_DST_G_SWIZ_G |
+		R500_TEX_DST_B_SWIZ_B | R500_TEX_DST_A_SWIZ_A;
+	/* DX and DY fields, every component swizzled to R. */
+	code->inst[0].inst3 =
+		R500_DX_ADDR(0) |
+		R500_DX_S_SWIZ_R | R500_DX_T_SWIZ_R |
+		R500_DX_R_SWIZ_R | R500_DX_Q_SWIZ_R |
+		R500_DY_ADDR(0) |
+		R500_DY_S_SWIZ_R | R500_DY_T_SWIZ_R |
+		R500_DY_R_SWIZ_R | R500_DY_Q_SWIZ_R;
+	code->inst[0].inst4 = 0;
+	code->inst[0].inst5 = 0;
+
+	/* Slot 1: OUT instruction, flagged as the last one of the program. */
+	code->inst[1].inst0 = R500_INST_TYPE_OUT
+		| R500_INST_TEX_SEM_WAIT
+		| R500_INST_LAST
+		| R500_INST_RGB_OMASK_R | R500_INST_RGB_OMASK_G | R500_INST_RGB_OMASK_B
+		| R500_INST_ALPHA_OMASK;
+	code->inst[1].inst1 = R500_RGB_ADDR0(0)
+		| R500_RGB_ADDR1(0) | R500_RGB_ADDR1_CONST
+		| R500_RGB_ADDR2(0) | R500_RGB_ADDR2_CONST
+		| R500_RGB_SRCP_OP_1_MINUS_2RGB0;
+	code->inst[1].inst2 = R500_ALPHA_ADDR0(0)
+		| R500_ALPHA_ADDR1(0) | R500_ALPHA_ADDR1_CONST
+		| R500_ALPHA_ADDR2(0) | R500_ALPHA_ADDR2_CONST
+		| R500_ALPHA_SRCP_OP_1_MINUS_2A0;
+	/* RGB operand A = src0.rgb, operand B = (1, 1, 1). */
+	code->inst[1].inst3 = R500_ALU_RGB_SEL_A_SRC0
+		| R500_ALU_RGB_R_SWIZ_A_R
+		| R500_ALU_RGB_G_SWIZ_A_G
+		| R500_ALU_RGB_B_SWIZ_A_B
+		| R500_ALU_RGB_SEL_B_SRC0
+		| R500_ALU_RGB_R_SWIZ_B_1
+		| R500_ALU_RGB_G_SWIZ_B_1
+		| R500_ALU_RGB_B_SWIZ_B_1;
+	/* Alpha operand A = alpha, operand B = 1. */
+	code->inst[1].inst4 = R500_ALPHA_OP_MAD
+		| R500_ALPHA_SWIZ_A_A
+		| R500_ALPHA_SWIZ_B_1;
+	/* Operand C = 0 on all four channels. */
+	code->inst[1].inst5 = R500_ALU_RGBA_OP_MAD
+		| R500_ALU_RGBA_R_SWIZ_0
+		| R500_ALU_RGBA_G_SWIZ_0
+		| R500_ALU_RGBA_B_SWIZ_0
+		| R500_ALU_RGBA_A_SWIZ_0;
+
+	cs->nrslots = 2;
+}
+
+/*
+ * Emit hardware code for the fragment program held by `compiler`.
+ *
+ * A zeroed compile state is set up with init_program() and the program
+ * is then translated by parse_program().
+ *
+ * Returns GL_TRUE on success, with compiler->code->inst_offset set to 0
+ * and inst_end set to the index of the last emitted slot. Returns
+ * GL_FALSE when parse_program() fails; inst_offset and inst_end are not
+ * written in that case.
+ */
+GLboolean r500FragmentProgramEmit(struct r500_fragment_program_compiler *compiler)
+{
+	struct r500_pfs_compile_state cs;
+	struct r500_fragment_program_code *code = compiler->code;
+
+	_mesa_memset(&cs, 0, sizeof(cs));
+	cs.compiler = compiler;
+	init_program(&cs);
+
+	if (!parse_program(&cs)) {
+#if 0
+		/* Disabled fallback: substitute the fixed two-slot shader.
+		 * dumb_shader() takes the compile state, so pass &cs. */
+		ERROR("Huh. Couldn't parse program. There should be additional errors explaining why.\nUsing dumb shader...\n");
+		dumb_shader(&cs);
+		code->inst_offset = 0;
+		code->inst_end = cs.nrslots - 1;
+#endif
+		return GL_FALSE;
+	}
+
+	code->inst_offset = 0;
+	code->inst_end = cs.nrslots - 1;
+
+	return GL_TRUE;
+}
diff --git a/src/mesa/drivers/dri/r300/radeon_context.h b/src/mesa/drivers/dri/r300/radeon_context.h
index 38d89306016..7458d63723f 100644
--- a/src/mesa/drivers/dri/r300/radeon_context.h
+++ b/src/mesa/drivers/dri/r300/radeon_context.h
@@ -53,16 +53,6 @@ struct radeon_context;
 typedef struct radeon_context radeonContextRec;
 typedef struct radeon_context *radeonContextPtr;
 
-#define TEX_0   0x1
-#define TEX_1   0x2
-#define TEX_2	0x4
-#define TEX_3	0x8
-#define TEX_4	0x10
-#define TEX_5	0x20
-#define TEX_6	0x40
-#define TEX_7	0x80
-#define TEX_ALL 0xff
-
 /* Rasterizing fallbacks */
 /* See correponding strings in r200_swtcl.c */
 #define RADEON_FALLBACK_TEXTURE		0x0001
diff --git a/src/mesa/drivers/dri/r300/radeon_program.c b/src/mesa/drivers/dri/r300/radeon_program.c
new file mode 100644
index 00000000000..c8f40e81893
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_program.c
@@ -0,0 +1,281 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_program.h"
+
+#include "shader/prog_print.h"
+
+/**
+ * Prepare \p compiler for compiling \p source.
+ *
+ * The structure is zeroed and then receives exactly one CLAUSE_MIXED
+ * clause holding a private copy of every source instruction that
+ * precedes the first OPCODE_END.
+ */
+void radeonCompilerInit(
+	struct radeon_compiler *compiler,
+	GLcontext *ctx,
+	struct gl_program *source)
+{
+	struct radeon_clause *mixed;
+	int count = 0;
+
+	_mesa_memset(compiler, 0, sizeof(*compiler));
+	compiler->Source = source;
+	compiler->Ctx = ctx;
+	compiler->NumTemporaries = source->NumTemporaries;
+
+	/* Count the instructions in front of the END marker. */
+	while (source->Instructions[count].Opcode != OPCODE_END)
+		count++;
+
+	mixed = radeonCompilerInsertClause(compiler, 0, CLAUSE_MIXED);
+	mixed->NumInstructions = count;
+	mixed->ReservedInstructions = count;
+	mixed->Instructions = _mesa_alloc_instructions(count);
+	_mesa_copy_instructions(mixed->Instructions, source->Instructions, count);
+}
+
+
+/**
+ * Release everything the compiler structure refers to by erasing all of
+ * its clauses. The structure itself stays allocated.
+ */
+void radeonCompilerCleanup(struct radeon_compiler *compiler)
+{
+	const int allClauses = compiler->NumClauses;
+
+	radeonCompilerEraseClauses(compiler, 0, allClauses);
+}
+
+
+/**
+ * Hand out the next unused temporary register index.
+ *
+ * \return the new index. Once 256 temporaries are in use a problem is
+ * reported through the GL context and 0 is returned instead.
+ */
+int radeonCompilerAllocateTemporary(struct radeon_compiler *compiler)
+{
+	int index;
+
+	if (compiler->NumTemporaries < 256) {
+		index = compiler->NumTemporaries;
+		compiler->NumTemporaries = index + 1;
+		return index;
+	}
+
+	_mesa_problem(compiler->Ctx, "radeonCompiler: Too many temporaries");
+	return 0;
+}
+
+
+/**
+ * Map a CLAUSE_XXX value to its name for debug output.
+ * Values that are not recognized yield "CLAUSE_UNKNOWN".
+ */
+static const char* clausename(int type)
+{
+	if (type == CLAUSE_MIXED)
+		return "CLAUSE_MIXED";
+	if (type == CLAUSE_ALU)
+		return "CLAUSE_ALU";
+	if (type == CLAUSE_TEX)
+		return "CLAUSE_TEX";
+
+	return "CLAUSE_UNKNOWN";
+}
+
+
+/**
+ * Print every clause of the compiler, followed by its instructions,
+ * for debugging. Clauses and instructions are numbered starting at 1.
+ */
+void radeonCompilerDump(struct radeon_compiler *compiler)
+{
+	int clauseIdx;
+
+	for (clauseIdx = 0; clauseIdx < compiler->NumClauses; clauseIdx++) {
+		struct radeon_clause *cl = compiler->Clauses + clauseIdx;
+		int instIdx = 0;
+
+		_mesa_printf("%2i: %s\n", clauseIdx + 1, clausename(cl->Type));
+
+		while (instIdx < cl->NumInstructions) {
+			_mesa_printf("%4i: ", instIdx + 1);
+			_mesa_print_instruction(cl->Instructions + instIdx);
+			instIdx++;
+		}
+	}
+}
+
+
+/**
+ * Insert a new, zero-initialized clause into the compiler.
+ *
+ * The clause array is reallocated on every call, so clause pointers
+ * obtained earlier become invalid.
+ *
+ * \p position index of the new clause; later clauses are moved
+ * \p type of the new clause; one of CLAUSE_XXX
+ * \return a pointer to the new clause
+ */
+struct radeon_clause* radeonCompilerInsertClause(
+	struct radeon_compiler *compiler,
+	int position, int type)
+{
+	const int oldCount = compiler->NumClauses;
+	struct radeon_clause *previous = compiler->Clauses;
+	struct radeon_clause *grown;
+	struct radeon_clause *slot;
+
+	assert(position >= 0 && position <= compiler->NumClauses);
+
+	grown = (struct radeon_clause *)
+		_mesa_malloc((oldCount + 1) * sizeof(struct radeon_clause));
+	compiler->Clauses = grown;
+	slot = grown + position;
+
+	if (previous) {
+		/* Entries before the gap keep their index, the rest move up by one. */
+		_mesa_memcpy(grown, previous,
+			position * sizeof(struct radeon_clause));
+		_mesa_memcpy(slot + 1, previous + position,
+			(oldCount - position) * sizeof(struct radeon_clause));
+		_mesa_free(previous);
+	}
+	compiler->NumClauses = oldCount + 1;
+
+	_mesa_memset(slot, 0, sizeof(*slot));
+	slot->Type = type;
+
+	return slot;
+}
+
+
+/**
+ * Remove the clauses in the half-open range [start, end) and free their
+ * instructions.
+ *
+ * Unless the range is empty the clause array is replaced, so clause
+ * pointers obtained earlier become invalid. Erasing every clause leaves
+ * the compiler with no clause array at all.
+ */
+void radeonCompilerEraseClauses(
+	struct radeon_compiler *compiler,
+	int start, int end)
+{
+	struct radeon_clause *previous = compiler->Clauses;
+	const int total = compiler->NumClauses;
+	int idx;
+
+	assert(0 <= start);
+	assert(start <= end);
+	assert(end <= compiler->NumClauses);
+
+	if (start == end)
+		return;
+
+	for (idx = start; idx < end; idx++)
+		_mesa_free_instructions(previous[idx].Instructions, previous[idx].NumInstructions);
+
+	if (start <= 0 && end >= total) {
+		/* Nothing survives. */
+		compiler->Clauses = 0;
+		compiler->NumClauses = 0;
+	} else {
+		struct radeon_clause *kept = (struct radeon_clause*)
+			_mesa_malloc((total + start - end) * sizeof(struct radeon_clause));
+
+		compiler->Clauses = kept;
+		_mesa_memcpy(kept, previous,
+			start * sizeof(struct radeon_clause));
+		_mesa_memcpy(kept + start, previous + end,
+			(total - end) * sizeof(struct radeon_clause));
+		compiler->NumClauses = total - (end - start);
+	}
+
+	_mesa_free(previous);
+}
+
+
+/**
+ * Open a gap of \p count instructions at \p position in \p clause,
+ * initialize the gap via _mesa_init_instructions() and return a pointer
+ * to its first instruction.
+ *
+ * When the reserved space is too small the array is reallocated to at
+ * least twice the old reservation, so instruction pointers into this
+ * clause obtained earlier may become invalid.
+ */
+struct prog_instruction* radeonClauseInsertInstructions(
+	struct radeon_compiler *compiler,
+	struct radeon_clause *clause,
+	int position, int count)
+{
+	const int oldCount = clause->NumInstructions;
+	const int newCount = oldCount + count;
+	const int tail = oldCount - position;
+	struct prog_instruction *gap;
+
+	assert(position >= 0 && position <= clause->NumInstructions);
+
+	if (newCount > clause->ReservedInstructions) {
+		struct prog_instruction *previous = clause->Instructions;
+		int capacity = clause->ReservedInstructions * 2;
+
+		if (newCount > capacity)
+			capacity = newCount;
+		clause->ReservedInstructions = capacity;
+
+		clause->Instructions = (struct prog_instruction*)
+			_mesa_malloc(capacity * sizeof(struct prog_instruction));
+
+		if (previous) {
+			/* Head stays in place, tail lands behind the gap. */
+			_mesa_memcpy(clause->Instructions, previous,
+				position * sizeof(struct prog_instruction));
+			_mesa_memcpy(clause->Instructions + position + count, previous + position,
+				tail * sizeof(struct prog_instruction));
+
+			_mesa_free(previous);
+		}
+	} else {
+		/* Enough room already: shift the tail within the same array. */
+		memmove(clause->Instructions + position + count, clause->Instructions + position,
+			tail * sizeof(struct prog_instruction));
+	}
+
+	gap = clause->Instructions + position;
+	clause->NumInstructions = newCount;
+	_mesa_init_instructions(gap, count);
+	return gap;
+}
+
+
+/**
+ * Rebuild the given clause by passing each of its instructions through
+ * a list of transformations:
+ *  1. The clause is emptied; its old contents are kept aside.
+ *  2. Each old instruction is offered to the transformations in order.
+ *  3. The first transformation returning GL_TRUE is assumed to have
+ *     emitted replacement instruction(s) into the clause; if none does,
+ *     the instruction is copied over unchanged.
+ * The old instructions are freed at the end.
+ *
+ * \note The transformation is currently not recursive; in other words,
+ * instructions emitted by transformations are not transformed.
+ *
+ * \note The transform is called 'local' because it can only look at
+ * one instruction at a time.
+ */
+void radeonClauseLocalTransform(
+	struct radeon_compiler *compiler,
+	struct radeon_clause *clause,
+	int num_transformations,
+	struct radeon_program_transformation* transformations)
+{
+	struct radeon_clause original = *clause;
+	struct radeon_program_transform_context context;
+	int ip;
+
+	clause->Instructions = 0;
+	clause->NumInstructions = 0;
+	clause->ReservedInstructions = 0;
+
+	context.compiler = compiler;
+	context.dest = clause;
+	context.src = &original;
+
+	for (ip = 0; ip < original.NumInstructions; ip++) {
+		struct prog_instruction *current = &original.Instructions[ip];
+		GLboolean handled = GL_FALSE;
+		int t;
+
+		for (t = 0; t < num_transformations && !handled; t++) {
+			if (transformations[t].function(&context, current, transformations[t].userData))
+				handled = GL_TRUE;
+		}
+
+		if (handled)
+			continue;
+
+		/* Nobody claimed the instruction: carry it over verbatim. */
+		_mesa_copy_instructions(
+			radeonClauseInsertInstructions(compiler, clause, clause->NumInstructions, 1),
+			current, 1);
+	}
+
+	_mesa_free_instructions(original.Instructions, original.NumInstructions);
+}
diff --git a/src/mesa/drivers/dri/r300/radeon_program.h b/src/mesa/drivers/dri/r300/radeon_program.h
new file mode 100644
index 00000000000..25e70505b16
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_program.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RADEON_PROGRAM_H_
+#define __RADEON_PROGRAM_H_
+
+#include "glheader.h"
+#include "macros.h"
+#include "enums.h"
+#include "shader/program.h"
+#include "shader/prog_instruction.h"
+
+
+/** Clause kinds, stored in radeon_clause::Type. */
+enum {
+	CLAUSE_MIXED = 0,
+	CLAUSE_ALU,
+	CLAUSE_TEX
+};
+
+/** Additional register file value, placed right after Mesa's own files. */
+enum {
+	PROGRAM_BUILTIN = PROGRAM_FILE_MAX /**< not a real register, but a special swizzle constant */
+};
+
+/* Swizzles that select the constant zero / one in all four components. */
+#define SWIZZLE_0000 MAKE_SWIZZLE4(SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO)
+#define SWIZZLE_1111 MAKE_SWIZZLE4(SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE)
+
+/**
+ * A clause is simply a sequence of instructions that are executed
+ * in order.
+ */
+struct radeon_clause {
+	/**
+	 * Type of this clause, one of CLAUSE_XXX.
+	 *
+	 * The field must be unsigned: a signed two-bit field only holds
+	 * -2..1, so CLAUSE_TEX (2) would not read back as itself.
+	 */
+	unsigned int Type : 2;
+
+	/**
+	 * Pointer to an array of instructions.
+	 * Only the first NumInstructions entries are meaningful; do not rely
+	 * on an OPCODE_END terminator, radeonCompilerInit copies the source
+	 * program without it.
+	 */
+	struct prog_instruction *Instructions;
+
+	/**
+	 * Number of instructions in this clause.
+	 */
+	int NumInstructions;
+
+	/**
+	 * Number of instructions the Instructions array has room for;
+	 * never smaller than NumInstructions.
+	 */
+	int ReservedInstructions;
+};
+
+/**
+ * A compile object, holding the current intermediate state during compilation.
+ *
+ * Set up with radeonCompilerInit and released with radeonCompilerCleanup.
+ */
+struct radeon_compiler {
+	/** Program being compiled, as passed to radeonCompilerInit. */
+	struct gl_program *Source;
+	/** GL context, used to report problems. */
+	GLcontext* Ctx;
+
+	/**
+	 * Number of clauses in this program.
+	 */
+	int NumClauses;
+
+	/**
+	 * Pointer to an array of NumClauses clauses.
+	 */
+	struct radeon_clause *Clauses;
+
+	/**
+	 * Number of registers in use in the PROGRAM_TEMPORARY file.
+	 * Starts at the source program's count and grows with every
+	 * radeonCompilerAllocateTemporary call.
+	 */
+	int NumTemporaries;
+};
+
+/* Compiler lifetime: initialize from a source program, release its clauses. */
+void radeonCompilerInit(
+	struct radeon_compiler *compiler,
+	GLcontext *ctx,
+	struct gl_program *source);
+void radeonCompilerCleanup(struct radeon_compiler *compiler);
+/* Returns the index of a fresh temporary register. */
+int radeonCompilerAllocateTemporary(struct radeon_compiler *compiler);
+/* Prints all clauses and their instructions for debugging. */
+void radeonCompilerDump(struct radeon_compiler *compiler);
+
+/* Clause list editing; both calls may replace the compiler's clause array. */
+struct radeon_clause *radeonCompilerInsertClause(
+	struct radeon_compiler *compiler,
+	int position,
+	int type);
+/* Removes the clauses in [start, end). */
+void radeonCompilerEraseClauses(
+	struct radeon_compiler *compiler,
+	int start,
+	int end);
+
+/* Inserts count freshly initialized instructions at position and returns the first. */
+struct prog_instruction* radeonClauseInsertInstructions(
+	struct radeon_compiler *compiler,
+	struct radeon_clause *clause,
+	int position, int count);
+
+/**
+ * State handed to every transformation function while
+ * radeonClauseLocalTransform rewrites a clause.
+ */
+struct radeon_program_transform_context {
+	/** Compiler that owns the clause being transformed. */
+	struct radeon_compiler *compiler;
+
+	/**
+	 * Destination clause where new instructions must be written.
+	 */
+	struct radeon_clause *dest;
+
+	/**
+	 * Original clause that is currently being transformed.
+	 */
+	struct radeon_clause *src;
+};
+
+/**
+ * A transformation that can be passed to \ref radeonClauseLocalTransform.
+ *
+ * The function will be called once for each instruction.
+ * It has to either emit the appropriate transformed code for the instruction
+ * and return GL_TRUE, or return GL_FALSE if it doesn't understand the
+ * instruction.
+ *
+ * The function gets passed the userData as last parameter.
+ */
+struct radeon_program_transformation {
+	/** Callback: (context, instruction to transform, userData). */
+	GLboolean (*function)(
+		struct radeon_program_transform_context*,
+		struct prog_instruction*,
+		void*);
+	/** Opaque pointer forwarded unchanged to \c function. */
+	void *userData;
+};
+
+/**
+ * Rewrite \p clause by offering each of its instructions to the given
+ * transformations in order; unclaimed instructions are copied verbatim.
+ */
+void radeonClauseLocalTransform(
+	struct radeon_compiler *compiler,
+	struct radeon_clause *clause,
+	int num_transformations,
+	struct radeon_program_transformation* transformations);
+
+#endif
diff --git a/src/mesa/drivers/dri/r300/radeon_program_alu.c b/src/mesa/drivers/dri/r300/radeon_program_alu.c
new file mode 100644
index 00000000000..7fe940a7d7f
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_program_alu.c
@@ -0,0 +1,284 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * @file
+ *
+ * Shareable transformations that transform "special" ALU instructions
+ * into ALU instructions that are supported by hardware.
+ *
+ */
+
+#include "radeon_program_alu.h"
+
+
+/**
+ * Append a one-operand instruction to the destination clause and
+ * return a pointer to it.
+ */
+static struct prog_instruction *emit1(struct radeon_program_transform_context* ctx,
+	gl_inst_opcode Opcode, struct prog_dst_register DstReg,
+	struct prog_src_register SrcReg)
+{
+	struct radeon_clause *out = ctx->dest;
+	struct prog_instruction *appended =
+		radeonClauseInsertInstructions(ctx->compiler, out, out->NumInstructions, 1);
+
+	appended->SrcReg[0] = SrcReg;
+	appended->DstReg = DstReg;
+	appended->Opcode = Opcode;
+
+	return appended;
+}
+
+/**
+ * Append a two-operand instruction to the destination clause and
+ * return a pointer to it.
+ */
+static struct prog_instruction *emit2(struct radeon_program_transform_context* ctx,
+	gl_inst_opcode Opcode, struct prog_dst_register DstReg,
+	struct prog_src_register SrcReg0, struct prog_src_register SrcReg1)
+{
+	/* Same as the one-operand form, plus the second source. */
+	struct prog_instruction *appended = emit1(ctx, Opcode, DstReg, SrcReg0);
+
+	appended->SrcReg[1] = SrcReg1;
+	return appended;
+}
+
+/**
+ * Append a three-operand instruction to the destination clause and
+ * return a pointer to it.
+ */
+static struct prog_instruction *emit3(struct radeon_program_transform_context* ctx,
+	gl_inst_opcode Opcode, struct prog_dst_register DstReg,
+	struct prog_src_register SrcReg0, struct prog_src_register SrcReg1,
+	struct prog_src_register SrcReg2)
+{
+	/* Same as the two-operand form, plus the third source. */
+	struct prog_instruction *appended = emit2(ctx, Opcode, DstReg, SrcReg0, SrcReg1);
+
+	appended->SrcReg[2] = SrcReg2;
+	return appended;
+}
+
+/**
+ * Replace the three-bit swizzle selector of one coordinate (0..3) of
+ * \p SrcReg with \p swz, leaving the other coordinates untouched.
+ */
+static void set_swizzle(struct prog_src_register *SrcReg, int coordinate, int swz)
+{
+	const int shift = 3 * coordinate;
+	const int others = SrcReg->Swizzle & ~(7 << shift);
+
+	SrcReg->Swizzle = others | (swz << shift);
+}
+
+/**
+ * Set the NegateBase bit of one coordinate of \p SrcReg to \p negate
+ * (0 or 1), leaving the other coordinates untouched.
+ */
+static void set_negate_base(struct prog_src_register *SrcReg, int coordinate, int negate)
+{
+	const int others = SrcReg->NegateBase & ~(1 << coordinate);
+
+	SrcReg->NegateBase = others | (negate << coordinate);
+}
+
+/**
+ * Build a destination register for the given file and index that
+ * writes all four components unconditionally.
+ */
+static struct prog_dst_register dstreg(int file, int index)
+{
+	struct prog_dst_register reg;
+
+	/* no condition code test */
+	reg.CondMask = COND_TR;
+	reg.CondSwizzle = SWIZZLE_NOOP;
+	reg.CondSrc = 0;
+	reg.pad = 0;
+
+	reg.WriteMask = WRITEMASK_XYZW;
+	reg.File = file;
+	reg.Index = index;
+
+	return reg;
+}
+
+/* Source operand reading the constant 0 in every component (PROGRAM_BUILTIN file). */
+static const struct prog_src_register builtin_zero = {
+	.File = PROGRAM_BUILTIN,
+	.Index = 0,
+	.Swizzle = SWIZZLE_0000
+};
+/* Source operand reading the constant 1 in every component (PROGRAM_BUILTIN file). */
+static const struct prog_src_register builtin_one = {
+	.File = PROGRAM_BUILTIN,
+	.Index = 0,
+	.Swizzle = SWIZZLE_1111
+};
+/* Template with identity swizzle and all other fields zero; srcreg() starts from it. */
+static const struct prog_src_register srcreg_undefined = {
+	.File = PROGRAM_UNDEFINED,
+	.Index = 0,
+	.Swizzle = SWIZZLE_NOOP
+};
+
+/**
+ * Build a plain source operand (identity swizzle, no modifiers) for the
+ * given register file and index.
+ */
+static struct prog_src_register srcreg(int file, int index)
+{
+	struct prog_src_register reg = srcreg_undefined;
+
+	reg.Index = index;
+	reg.File = file;
+	return reg;
+}
+
+/**
+ * Return a copy of \p reg with its NegateAbs flag flipped.
+ */
+static struct prog_src_register negate(struct prog_src_register reg)
+{
+	reg.NegateAbs = reg.NegateAbs ? 0 : 1;
+	return reg;
+}
+
+/**
+ * Return a copy of \p reg whose components are re-selected: result
+ * component i takes whatever \p reg already selects for the coordinate
+ * given by x/y/z/w, i.e. the new swizzle is composed with the old one.
+ */
+static struct prog_src_register swizzle(struct prog_src_register reg, GLuint x, GLuint y, GLuint z, GLuint w)
+{
+	const GLuint selX = GET_SWZ(reg.Swizzle, x);
+	const GLuint selY = GET_SWZ(reg.Swizzle, y);
+	const GLuint selZ = GET_SWZ(reg.Swizzle, z);
+	const GLuint selW = GET_SWZ(reg.Swizzle, w);
+
+	reg.Swizzle = MAKE_SWIZZLE4(selX, selY, selZ, selW);
+	return reg;
+}
+
+/**
+ * Return a copy of \p reg that replicates its first component to all four.
+ */
+static struct prog_src_register scalar(struct prog_src_register reg)
+{
+	const GLuint first = SWIZZLE_X;
+
+	return swizzle(reg, first, first, first, first);
+}
+
+/**
+ * ABS dst, a  ->  MOV dst, |a|
+ * Any negation on the operand is dropped.
+ */
+static void transform_ABS(struct radeon_program_transform_context* ctx,
+	struct prog_instruction* inst)
+{
+	struct prog_src_register operand = inst->SrcReg[0];
+
+	operand.NegateAbs = 0;
+	operand.NegateBase = 0;
+	operand.Abs = 1;
+
+	emit1(ctx, OPCODE_MOV, inst->DstReg, operand);
+}
+
+/**
+ * DPH dst, a, b  ->  DP4 dst, a', b
+ * where a' is \c a with its fourth component forced to the constant one
+ * (never negated).
+ *
+ * A NegateAbs flag on \c a would also negate that constant, so it is
+ * removed first: without Abs it is folded into the per-component
+ * NegateBase mask; with Abs the fully modified operand is copied into a
+ * temporary, which then serves as the first operand (presumably because
+ * -|x| cannot be expressed through NegateBase alone).
+ */
+static void transform_DPH(struct radeon_program_transform_context* ctx,
+	struct prog_instruction* inst)
+{
+	struct prog_src_register src0 = inst->SrcReg[0];
+	if (src0.NegateAbs) {
+		if (src0.Abs) {
+			int tempreg = radeonCompilerAllocateTemporary(ctx->compiler);
+			emit1(ctx, OPCODE_MOV, dstreg(PROGRAM_TEMPORARY, tempreg), src0);
+			/* Read back the temporary; re-reading the original register
+			 * would lose the swizzle, abs and negate just applied. */
+			src0 = srcreg(PROGRAM_TEMPORARY, tempreg);
+		} else {
+			src0.NegateAbs = 0;
+			src0.NegateBase ^= NEGATE_XYZW;
+		}
+	}
+	set_swizzle(&src0, 3, SWIZZLE_ONE);
+	set_negate_base(&src0, 3, 0);
+	emit2(ctx, OPCODE_DP4, inst->DstReg, src0, inst->SrcReg[1]);
+}
+
+/**
+ * FLR dst, a  ->  FRC tmp, a;  ADD dst, a, -tmp
+ */
+static void transform_FLR(struct radeon_program_transform_context* ctx,
+	struct prog_instruction* inst)
+{
+	const int tmp = radeonCompilerAllocateTemporary(ctx->compiler);
+	const struct prog_src_register fraction = srcreg(PROGRAM_TEMPORARY, tmp);
+
+	emit1(ctx, OPCODE_FRC, dstreg(PROGRAM_TEMPORARY, tmp), inst->SrcReg[0]);
+	emit2(ctx, OPCODE_ADD, inst->DstReg, inst->SrcReg[0], negate(fraction));
+}
+
+/**
+ * POW dst, a, b  ->  LG2 tmp.w, a.x;  MUL tmp.w, tmp.w, b.x;  EX2 dst, tmp.w
+ */
+static void transform_POW(struct radeon_program_transform_context* ctx,
+	struct prog_instruction* inst)
+{
+	const int tmp = radeonCompilerAllocateTemporary(ctx->compiler);
+	struct prog_src_register readW = srcreg(PROGRAM_TEMPORARY, tmp);
+	struct prog_dst_register writeW = dstreg(PROGRAM_TEMPORARY, tmp);
+
+	/* The intermediate value lives in the temporary's w component only. */
+	readW.Swizzle = SWIZZLE_WWWW;
+	writeW.WriteMask = WRITEMASK_W;
+
+	emit1(ctx, OPCODE_LG2, writeW, scalar(inst->SrcReg[0]));
+	emit2(ctx, OPCODE_MUL, writeW, readW, scalar(inst->SrcReg[1]));
+	emit1(ctx, OPCODE_EX2, inst->DstReg, readW);
+}
+
+/**
+ * SGE dst, a, b  ->  ADD tmp, a, -b;  CMP dst, tmp, 0, 1
+ */
+static void transform_SGE(struct radeon_program_transform_context* ctx,
+	struct prog_instruction* inst)
+{
+	const int tmp = radeonCompilerAllocateTemporary(ctx->compiler);
+	const struct prog_src_register difference = srcreg(PROGRAM_TEMPORARY, tmp);
+
+	emit2(ctx, OPCODE_ADD, dstreg(PROGRAM_TEMPORARY, tmp), inst->SrcReg[0], negate(inst->SrcReg[1]));
+	emit3(ctx, OPCODE_CMP, inst->DstReg, difference, builtin_zero, builtin_one);
+}
+
+/**
+ * SLT dst, a, b  ->  ADD tmp, a, -b;  CMP dst, tmp, 1, 0
+ */
+static void transform_SLT(struct radeon_program_transform_context* ctx,
+	struct prog_instruction* inst)
+{
+	const int tmp = radeonCompilerAllocateTemporary(ctx->compiler);
+	const struct prog_src_register difference = srcreg(PROGRAM_TEMPORARY, tmp);
+
+	emit2(ctx, OPCODE_ADD, dstreg(PROGRAM_TEMPORARY, tmp), inst->SrcReg[0], negate(inst->SrcReg[1]));
+	emit3(ctx, OPCODE_CMP, inst->DstReg, difference, builtin_one, builtin_zero);
+}
+
+/**
+ * SUB dst, a, b  ->  ADD dst, a, -b
+ */
+static void transform_SUB(struct radeon_program_transform_context* ctx,
+	struct prog_instruction* inst)
+{
+	const struct prog_src_register subtrahend = negate(inst->SrcReg[1]);
+
+	emit2(ctx, OPCODE_ADD, inst->DstReg, inst->SrcReg[0], subtrahend);
+}
+
+/**
+ * SWZ dst, a  ->  MOV dst, a  (the operand keeps its swizzle and modifiers)
+ */
+static void transform_SWZ(struct radeon_program_transform_context* ctx,
+	struct prog_instruction* inst)
+{
+	const struct prog_src_register operand = inst->SrcReg[0];
+
+	emit1(ctx, OPCODE_MOV, inst->DstReg, operand);
+}
+
+/**
+ * XPD dst, a, b  ->  MUL tmp, a.zxyw, b.yzxw;  MAD dst, a.yzxw, b.zxyw, -tmp
+ */
+static void transform_XPD(struct radeon_program_transform_context* ctx,
+	struct prog_instruction* inst)
+{
+	const int tmp = radeonCompilerAllocateTemporary(ctx->compiler);
+	const struct prog_src_register a = inst->SrcReg[0];
+	const struct prog_src_register b = inst->SrcReg[1];
+
+	/* the product that gets subtracted */
+	emit2(ctx, OPCODE_MUL, dstreg(PROGRAM_TEMPORARY, tmp),
+		swizzle(a, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W),
+		swizzle(b, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W));
+
+	/* the other product, minus the first one */
+	emit3(ctx, OPCODE_MAD, inst->DstReg,
+		swizzle(a, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W),
+		swizzle(b, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W),
+		negate(srcreg(PROGRAM_TEMPORARY, tmp)));
+}
+
+
+/**
+ * Transformation callback for @ref radeonClauseLocalTransform; the
+ * userData argument is ignored.
+ *
+ * Rewrites these ALU instructions:
+ *  ABS, DPH, FLR, POW, SGE, SLT, SUB, SWZ, XPD
+ * in terms of:
+ *  MOV, ADD, MUL, MAD, FRC, DP4, LG2, EX2, CMP
+ *
+ * @return GL_TRUE if the instruction was one of the above and replacement
+ * code has been emitted, GL_FALSE if it was left alone.
+ *
+ * @note should be applicable to R300 and R500 fragment programs.
+ *
+ * @todo add LIT here as well?
+ */
+GLboolean radeonTransformALU(
+	struct radeon_program_transform_context* ctx,
+	struct prog_instruction* inst,
+	void* unused)
+{
+	GLboolean handled = GL_TRUE;
+
+	switch(inst->Opcode) {
+	case OPCODE_ABS:
+		transform_ABS(ctx, inst);
+		break;
+	case OPCODE_DPH:
+		transform_DPH(ctx, inst);
+		break;
+	case OPCODE_FLR:
+		transform_FLR(ctx, inst);
+		break;
+	case OPCODE_POW:
+		transform_POW(ctx, inst);
+		break;
+	case OPCODE_SGE:
+		transform_SGE(ctx, inst);
+		break;
+	case OPCODE_SLT:
+		transform_SLT(ctx, inst);
+		break;
+	case OPCODE_SUB:
+		transform_SUB(ctx, inst);
+		break;
+	case OPCODE_SWZ:
+		transform_SWZ(ctx, inst);
+		break;
+	case OPCODE_XPD:
+		transform_XPD(ctx, inst);
+		break;
+	default:
+		handled = GL_FALSE;
+		break;
+	}
+
+	return handled;
+}
diff --git a/src/mesa/drivers/dri/r300/radeon_program_alu.h b/src/mesa/drivers/dri/r300/radeon_program_alu.h
new file mode 100644
index 00000000000..940459624fc
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_program_alu.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RADEON_PROGRAM_ALU_H_
+#define __RADEON_PROGRAM_ALU_H_
+
+#include "radeon_program.h"
+
+/**
+ * Transformation callback that replaces "special" ALU instructions with
+ * ones supported by hardware. Arguments are the transform context, the
+ * instruction to look at, and an unused user-data pointer.
+ * Returns GL_TRUE when the instruction was replaced.
+ */
+GLboolean radeonTransformALU(
+	struct radeon_program_transform_context*,
+	struct prog_instruction*,
+	void*);
+
+#endif /* __RADEON_PROGRAM_ALU_H_ */
diff --git a/src/mesa/drivers/dri/r300/radeon_span.c b/src/mesa/drivers/dri/r300/radeon_span.c
index eae09d6b35e..f1bc56ea6a4 100644
--- a/src/mesa/drivers/dri/r300/radeon_span.c
+++ b/src/mesa/drivers/dri/r300/radeon_span.c
@@ -282,6 +282,30 @@ static void radeonSpanRenderStart(GLcontext * ctx)
 #endif
 	LOCK_HARDWARE(rmesa);
 	radeonWaitForIdleLocked(rmesa);
+
+	/* Read the first pixel in the frame buffer.  This should
+	 * be a noop, right?  In fact without this conform fails as reading
+	 * from the framebuffer sometimes produces old results -- the
+	 * on-card read cache gets mixed up and doesn't notice that the
+	 * framebuffer has been updated.
+	 *
+	 * Note that we should probably be reading some otherwise unused
+	 * region of VRAM, otherwise we might get incorrect results when
+	 * reading pixels from the top left of the screen.
+	 *
+	 * I found this problem on an R420 with glean's texCube test.
+	 * Note that the R200 span code also *writes* the first pixel in the
+	 * framebuffer, but I've found this to be unnecessary.
+	 *  -- Nicolai Hähnle, June 2008
+	 */
+	{
+		int p;
+		driRenderbuffer *drb =
+			(driRenderbuffer *) ctx->WinSysDrawBuffer->_ColorDrawBuffers[0];
+		volatile int *buf =
+			(volatile int *)(rmesa->dri.screen->pFB + drb->offset);
+		p = *buf;
+	}
 }
 
 static void radeonSpanRenderFinish(GLcontext * ctx)
author	Eric Anholt <[email protected]>	2008-06-18 14:07:38 -0700
committer	Eric Anholt <[email protected]>	2008-06-18 14:07:38 -0700
commit	654258a4fe5e7114022c6e02f2844fc469fcc6f3 (patch)
tree	89d285becb87659ab61ee0ceeb35c76726ae93d2 /src/mesa/drivers/dri/r300
parent	64adeb163d7da6d75b5664cd2ee3783cadaf63d8 (diff)
parent	cf29ab3ba075905cca786b52617d7dc993f58033 (diff)