aboutsummaryrefslogtreecommitdiffstats
path: root/src/mesa/drivers/dri/r300
diff options
context:
space:
mode:
authorEric Anholt <[email protected]>2008-06-18 14:07:38 -0700
committerEric Anholt <[email protected]>2008-06-18 14:07:38 -0700
commit654258a4fe5e7114022c6e02f2844fc469fcc6f3 (patch)
tree89d285becb87659ab61ee0ceeb35c76726ae93d2 /src/mesa/drivers/dri/r300
parent64adeb163d7da6d75b5664cd2ee3783cadaf63d8 (diff)
parentcf29ab3ba075905cca786b52617d7dc993f58033 (diff)
Merge commit 'origin/master' into drm-gem
Diffstat (limited to 'src/mesa/drivers/dri/r300')
-rw-r--r--src/mesa/drivers/dri/r300/Makefile5
-rw-r--r--src/mesa/drivers/dri/r300/r300_context.c2
-rw-r--r--src/mesa/drivers/dri/r300/r300_context.h185
-rw-r--r--src/mesa/drivers/dri/r300/r300_fragprog.c2367
-rw-r--r--src/mesa/drivers/dri/r300/r300_fragprog.h46
-rw-r--r--src/mesa/drivers/dri/r300/r300_fragprog_emit.c2058
-rw-r--r--src/mesa/drivers/dri/r300/r300_ioctl.c17
-rw-r--r--src/mesa/drivers/dri/r300/r300_reg.h46
-rw-r--r--src/mesa/drivers/dri/r300/r300_render.c1
-rw-r--r--src/mesa/drivers/dri/r300/r300_shader.c1
-rw-r--r--src/mesa/drivers/dri/r300/r300_state.c178
-rw-r--r--src/mesa/drivers/dri/r300/r300_tex.c216
-rw-r--r--src/mesa/drivers/dri/r300/r300_texmem.c6
-rw-r--r--src/mesa/drivers/dri/r300/r300_texstate.c239
-rw-r--r--src/mesa/drivers/dri/r300/r500_fragprog.c1672
-rw-r--r--src/mesa/drivers/dri/r300/r500_fragprog.h13
-rw-r--r--src/mesa/drivers/dri/r300/r500_fragprog_emit.c1533
-rw-r--r--src/mesa/drivers/dri/r300/radeon_context.h10
-rw-r--r--src/mesa/drivers/dri/r300/radeon_program.c281
-rw-r--r--src/mesa/drivers/dri/r300/radeon_program.h164
-rw-r--r--src/mesa/drivers/dri/r300/radeon_program_alu.c284
-rw-r--r--src/mesa/drivers/dri/r300/radeon_program_alu.h38
-rw-r--r--src/mesa/drivers/dri/r300/radeon_span.c24
23 files changed, 5398 insertions, 3988 deletions
diff --git a/src/mesa/drivers/dri/r300/Makefile b/src/mesa/drivers/dri/r300/Makefile
index 5b2bd0bc2b0..d52b2b4c36d 100644
--- a/src/mesa/drivers/dri/r300/Makefile
+++ b/src/mesa/drivers/dri/r300/Makefile
@@ -28,7 +28,6 @@ DRIVER_SOURCES = \
radeon_span.c \
radeon_state.c \
r300_mem.c \
- \
r300_context.c \
r300_ioctl.c \
r300_cmdbuf.c \
@@ -37,9 +36,13 @@ DRIVER_SOURCES = \
r300_texmem.c \
r300_tex.c \
r300_texstate.c \
+ radeon_program.c \
+ radeon_program_alu.c \
r300_vertprog.c \
r300_fragprog.c \
+ r300_fragprog_emit.c \
r500_fragprog.c \
+ r500_fragprog_emit.c \
r300_shader.c \
r300_emit.c \
r300_swtcl.c \
diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c
index 31cc00a081b..063d4e575ef 100644
--- a/src/mesa/drivers/dri/r300/r300_context.c
+++ b/src/mesa/drivers/dri/r300/r300_context.c
@@ -98,6 +98,7 @@ const struct dri_extension card_extensions[] = {
{"GL_ARB_fragment_program", NULL},
{"GL_ARB_multisample", GL_ARB_multisample_functions},
{"GL_ARB_multitexture", NULL},
+ {"GL_ARB_shadow", NULL},
{"GL_ARB_texture_border_clamp", NULL},
{"GL_ARB_texture_compression", GL_ARB_texture_compression_functions},
{"GL_ARB_texture_cube_map", NULL},
@@ -116,6 +117,7 @@ const struct dri_extension card_extensions[] = {
{"GL_EXT_multi_draw_arrays", GL_EXT_multi_draw_arrays_functions},
{"GL_EXT_gpu_program_parameters", GL_EXT_gpu_program_parameters_functions},
{"GL_EXT_secondary_color", GL_EXT_secondary_color_functions},
+ {"GL_EXT_shadow_funcs", NULL},
{"GL_EXT_stencil_two_side", GL_EXT_stencil_two_side_functions},
{"GL_EXT_stencil_wrap", NULL},
{"GL_EXT_texture_edge_clamp", NULL},
diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h
index a5ec5ee46e2..6279a67ab16 100644
--- a/src/mesa/drivers/dri/r300/r300_context.h
+++ b/src/mesa/drivers/dri/r300/r300_context.h
@@ -73,7 +73,6 @@ typedef struct r300_context *r300ContextPtr;
}
#include "r300_vertprog.h"
-#include "r300_fragprog.h"
#include "r500_fragprog.h"
/**
@@ -179,13 +178,6 @@ struct r300_tex_obj {
GLuint bufAddr; /* Offset to start of locally
shared texture block */
- GLuint dirty_state; /* Flags (1 per texunit) for
- whether or not this texobj
- has dirty hardware state
- (pp_*) that needs to be
- brought into the
- texunit. */
-
drm_radeon_tex_image_t image[6][RADEON_MAX_TEXTURE_LEVELS];
/* Six, for the cube faces */
@@ -581,9 +573,7 @@ struct r300_depthbuffer_state {
};
struct r300_stencilbuffer_state {
- GLuint clear;
GLboolean hw_stencil;
-
};
/* Vertex shader state */
@@ -663,96 +653,40 @@ struct r300_vertex_program_cont {
#define PFS_NUM_TEMP_REGS 32
#define PFS_NUM_CONST_REGS 16
-/* Mapping Mesa registers to R300 temporaries */
-struct reg_acc {
- int reg; /* Assigned hw temp */
- unsigned int refcount; /* Number of uses by mesa program */
-};
+struct r300_pfs_compile_state;
-/**
- * Describe the current lifetime information for an R300 temporary
- */
-struct reg_lifetime {
- /* Index of the first slot where this register is free in the sense
- that it can be used as a new destination register.
- This is -1 if the register has been assigned to a Mesa register
- and the last access to the register has not yet been emitted */
- int free;
-
- /* Index of the first slot where this register is currently reserved.
- This is used to stop e.g. a scalar operation from being moved
- before the allocation time of a register that was first allocated
- for a vector operation. */
- int reserved;
-
- /* Index of the first slot in which the register can be used as a
- source without losing the value that is written by the last
- emitted instruction that writes to the register */
- int vector_valid;
- int scalar_valid;
-
- /* Index to the slot where the register was last read.
- This is also the first slot in which the register may be written again */
- int vector_lastread;
- int scalar_lastread;
-};
/**
- * Store usage information about an ALU instruction slot during the
- * compilation of a fragment program.
+ * Stores state that influences the compilation of a fragment program.
*/
-#define SLOT_SRC_VECTOR (1<<0)
-#define SLOT_SRC_SCALAR (1<<3)
-#define SLOT_SRC_BOTH (SLOT_SRC_VECTOR | SLOT_SRC_SCALAR)
-#define SLOT_OP_VECTOR (1<<16)
-#define SLOT_OP_SCALAR (1<<17)
-#define SLOT_OP_BOTH (SLOT_OP_VECTOR | SLOT_OP_SCALAR)
-
-struct r300_pfs_compile_slot {
- /* Bitmask indicating which parts of the slot are used, using SLOT_ constants
- defined above */
- unsigned int used;
-
- /* Selected sources */
- int vsrc[3];
- int ssrc[3];
+struct r300_fragment_program_external_state {
+ struct {
+ /**
+ * If the sampler is used as a shadow sampler,
+ * this field is:
+ * 0 - GL_LUMINANCE
+ * 1 - GL_INTENSITY
+ * 2 - GL_ALPHA
+ * depending on the depth texture mode.
+ */
+ GLuint depth_texture_mode : 2;
+
+ /**
+ * If the sampler is used as a shadow sampler,
+ * this field is (texture_compare_func - GL_NEVER).
+ * [e.g. if compare function is GL_LEQUAL, this field is 3]
+ *
+ * Otherwise, this field is 0.
+ */
+ GLuint texture_compare_func : 3;
+ } unit[16];
};
-/**
- * Store information during compilation of fragment programs.
- */
-struct r300_pfs_compile_state {
- int nrslots; /* number of ALU slots used so far */
-
- /* Track which (parts of) slots are already filled with instructions */
- struct r300_pfs_compile_slot slot[PFS_MAX_ALU_INST];
-
- /* Track the validity of R300 temporaries */
- struct reg_lifetime hwtemps[PFS_NUM_TEMP_REGS];
-
- /* Used to map Mesa's inputs/temps onto hardware temps */
- int temp_in_use;
- struct reg_acc temps[PFS_NUM_TEMP_REGS];
- struct reg_acc inputs[32]; /* don't actually need 32... */
-
- /* Track usage of hardware temps, for register allocation,
- * indirection detection, etc. */
- GLuint used_in_node;
- GLuint dest_in_node;
-};
/**
- * Store everything about a fragment program that is needed
- * to render with that program.
+ * Stores an R300 fragment program in its compiled-to-hardware form.
*/
-struct r300_fragment_program {
- struct gl_fragment_program mesa_program;
-
- GLcontext *ctx;
- GLboolean translated;
- GLboolean error;
- struct r300_pfs_compile_state *cs;
-
+struct r300_fragment_program_code {
struct {
int length;
GLuint inst[PFS_MAX_TEX_INST];
@@ -793,19 +727,51 @@ struct r300_fragment_program {
int const_nr;
int max_temp_idx;
-
- GLboolean WritesDepth;
- GLuint optimization;
};
-struct r500_fragment_program {
+/**
+ * Store everything about a fragment program that is needed
+ * to render with that program.
+ */
+struct r300_fragment_program {
struct gl_fragment_program mesa_program;
- GLcontext *ctx;
GLboolean translated;
GLboolean error;
- struct r300_pfs_compile_state *cs;
+ struct r300_fragment_program_external_state state;
+ struct r300_fragment_program_code code;
+
+ GLboolean WritesDepth;
+ GLuint optimization;
+};
+
+struct r500_pfs_compile_state;
+
+struct r500_fragment_program_external_state {
+ struct {
+ /**
+ * If the sampler is used as a shadow sampler,
+ * this field is:
+ * 0 - GL_LUMINANCE
+ * 1 - GL_INTENSITY
+ * 2 - GL_ALPHA
+ * depending on the depth texture mode.
+ */
+ GLuint depth_texture_mode : 2;
+
+ /**
+ * If the sampler is used as a shadow sampler,
+ * this field is (texture_compare_func - GL_NEVER).
+ * [e.g. if compare function is GL_LEQUAL, this field is 3]
+ *
+ * Otherwise, this field is 0.
+ */
+ GLuint texture_compare_func : 3;
+ } unit[16];
+};
+
+struct r500_fragment_program_code {
struct {
GLuint inst0;
GLuint inst1;
@@ -822,17 +788,28 @@ struct r500_fragment_program {
int inst_end;
/* Hardware constants.
- * Contains a pointer to the value. The destination of the pointer
- * is supposed to be updated when GL state changes.
- * Typically, this is either a pointer into
- * gl_program_parameter_list::ParameterValues, or a pointer to a
- * global constant (e.g. for sin/cos-approximation)
- */
+ * Contains a pointer to the value. The destination of the pointer
+ * is supposed to be updated when GL state changes.
+ * Typically, this is either a pointer into
+ * gl_program_parameter_list::ParameterValues, or a pointer to a
+ * global constant (e.g. for sin/cos-approximation)
+ */
const GLfloat *constant[PFS_NUM_CONST_REGS];
int const_nr;
int max_temp_idx;
+};
+struct r500_fragment_program {
+ struct gl_fragment_program mesa_program;
+
+ GLcontext *ctx;
+ GLboolean translated;
+ GLboolean error;
+
+ struct r500_fragment_program_external_state state;
+ struct r500_fragment_program_code code;
+
GLboolean writes_depth;
GLuint optimization;
@@ -849,7 +826,6 @@ struct r300_state {
struct r300_texture_state texture;
int sw_tcl_inputs[VERT_ATTRIB_MAX];
struct r300_vertex_shader_state vertex_shader;
- struct r300_pfs_compile_state pfs_compile;
struct r300_dma_region aos[R300_MAX_AOS_ARRAYS];
int aos_count;
@@ -949,6 +925,7 @@ struct r300_context {
driTextureObject swapped;
int texture_depth;
float initialMaxAnisotropy;
+ float LODBias;
/* Clientdata textures;
*/
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c
index 54b80d20a16..6d24d266fec 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.c
@@ -28,16 +28,14 @@
/**
* \file
*
- * \author Ben Skeggs <[email protected]>
+ * Fragment program compiler. Perform transformations on the intermediate
+ * \ref radeon_program representation (which is essentially the Mesa
+ * program representation plus the notion of clauses) until the program
+ * is in a form where we can translate it more or less directly into
+ * machine-readable form.
*
+ * \author Ben Skeggs <[email protected]>
* \author Jerome Glisse <[email protected]>
- *
- * \todo Depth write, WPOS/FOGC inputs
- *
- * \todo FogOption
- *
- * \todo Verify results of opcodes for accuracy, I've only checked them in
- * specific cases.
*/
#include "glheader.h"
@@ -49,1967 +47,225 @@
#include "r300_context.h"
#include "r300_fragprog.h"
-#include "r300_reg.h"
#include "r300_state.h"
-/*
- * Usefull macros and values
- */
-#define ERROR(fmt, args...) do { \
- fprintf(stderr, "%s::%s(): " fmt "\n", \
- __FILE__, __FUNCTION__, ##args); \
- fp->error = GL_TRUE; \
- } while(0)
-
-#define PFS_INVAL 0xFFFFFFFF
-#define COMPILE_STATE struct r300_pfs_compile_state *cs = fp->cs
-
-#define SWIZZLE_XYZ 0
-#define SWIZZLE_XXX 1
-#define SWIZZLE_YYY 2
-#define SWIZZLE_ZZZ 3
-#define SWIZZLE_WWW 4
-#define SWIZZLE_YZX 5
-#define SWIZZLE_ZXY 6
-#define SWIZZLE_WZY 7
-#define SWIZZLE_111 8
-#define SWIZZLE_000 9
-#define SWIZZLE_HHH 10
-
-#define swizzle(r, x, y, z, w) do_swizzle(fp, r, \
- ((SWIZZLE_##x<<0)| \
- (SWIZZLE_##y<<3)| \
- (SWIZZLE_##z<<6)| \
- (SWIZZLE_##w<<9)), \
- 0)
-
-#define REG_TYPE_INPUT 0
-#define REG_TYPE_OUTPUT 1
-#define REG_TYPE_TEMP 2
-#define REG_TYPE_CONST 3
-
-#define REG_TYPE_SHIFT 0
-#define REG_INDEX_SHIFT 2
-#define REG_VSWZ_SHIFT 8
-#define REG_SSWZ_SHIFT 13
-#define REG_NEGV_SHIFT 18
-#define REG_NEGS_SHIFT 19
-#define REG_ABS_SHIFT 20
-#define REG_NO_USE_SHIFT 21 // Hack for refcounting
-#define REG_VALID_SHIFT 22 // Does the register contain a defined value?
-#define REG_BUILTIN_SHIFT 23 // Is it a builtin (like all zero/all one)?
-
-#define REG_TYPE_MASK (0x03 << REG_TYPE_SHIFT)
-#define REG_INDEX_MASK (0x3F << REG_INDEX_SHIFT)
-#define REG_VSWZ_MASK (0x1F << REG_VSWZ_SHIFT)
-#define REG_SSWZ_MASK (0x1F << REG_SSWZ_SHIFT)
-#define REG_NEGV_MASK (0x01 << REG_NEGV_SHIFT)
-#define REG_NEGS_MASK (0x01 << REG_NEGS_SHIFT)
-#define REG_ABS_MASK (0x01 << REG_ABS_SHIFT)
-#define REG_NO_USE_MASK (0x01 << REG_NO_USE_SHIFT)
-#define REG_VALID_MASK (0x01 << REG_VALID_SHIFT)
-#define REG_BUILTIN_MASK (0x01 << REG_BUILTIN_SHIFT)
-
-#define REG(type, index, vswz, sswz, nouse, valid, builtin) \
- (((type << REG_TYPE_SHIFT) & REG_TYPE_MASK) | \
- ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK) | \
- ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) | \
- ((valid << REG_VALID_SHIFT) & REG_VALID_MASK) | \
- ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) | \
- ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) | \
- ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
-#define REG_GET_TYPE(reg) \
- ((reg & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
-#define REG_GET_INDEX(reg) \
- ((reg & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
-#define REG_GET_VSWZ(reg) \
- ((reg & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
-#define REG_GET_SSWZ(reg) \
- ((reg & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
-#define REG_GET_NO_USE(reg) \
- ((reg & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
-#define REG_GET_VALID(reg) \
- ((reg & REG_VALID_MASK) >> REG_VALID_SHIFT)
-#define REG_GET_BUILTIN(reg) \
- ((reg & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)
-#define REG_SET_TYPE(reg, type) \
- reg = ((reg & ~REG_TYPE_MASK) | \
- ((type << REG_TYPE_SHIFT) & REG_TYPE_MASK))
-#define REG_SET_INDEX(reg, index) \
- reg = ((reg & ~REG_INDEX_MASK) | \
- ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK))
-#define REG_SET_VSWZ(reg, vswz) \
- reg = ((reg & ~REG_VSWZ_MASK) | \
- ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK))
-#define REG_SET_SSWZ(reg, sswz) \
- reg = ((reg & ~REG_SSWZ_MASK) | \
- ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
-#define REG_SET_NO_USE(reg, nouse) \
- reg = ((reg & ~REG_NO_USE_MASK) | \
- ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK))
-#define REG_SET_VALID(reg, valid) \
- reg = ((reg & ~REG_VALID_MASK) | \
- ((valid << REG_VALID_SHIFT) & REG_VALID_MASK))
-#define REG_SET_BUILTIN(reg, builtin) \
- reg = ((reg & ~REG_BUILTIN_MASK) | \
- ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK))
-#define REG_ABS(reg) \
- reg = (reg | REG_ABS_MASK)
-#define REG_NEGV(reg) \
- reg = (reg | REG_NEGV_MASK)
-#define REG_NEGS(reg) \
- reg = (reg | REG_NEGS_MASK)
-
-/*
- * Datas structures for fragment program generation
- */
-
-/* description of r300 native hw instructions */
-static const struct {
- const char *name;
- int argc;
- int v_op;
- int s_op;
-} r300_fpop[] = {
- /* *INDENT-OFF* */
- {"MAD", 3, R300_ALU_OUTC_MAD, R300_ALU_OUTA_MAD},
- {"DP3", 2, R300_ALU_OUTC_DP3, R300_ALU_OUTA_DP4},
- {"DP4", 2, R300_ALU_OUTC_DP4, R300_ALU_OUTA_DP4},
- {"MIN", 2, R300_ALU_OUTC_MIN, R300_ALU_OUTA_MIN},
- {"MAX", 2, R300_ALU_OUTC_MAX, R300_ALU_OUTA_MAX},
- {"CMP", 3, R300_ALU_OUTC_CMP, R300_ALU_OUTA_CMP},
- {"FRC", 1, R300_ALU_OUTC_FRC, R300_ALU_OUTA_FRC},
- {"EX2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_EX2},
- {"LG2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_LG2},
- {"RCP", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RCP},
- {"RSQ", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RSQ},
- {"REPL_ALPHA", 1, R300_ALU_OUTC_REPL_ALPHA, PFS_INVAL},
- {"CMPH", 3, R300_ALU_OUTC_CMPH, PFS_INVAL},
- /* *INDENT-ON* */
-};
-
-/* vector swizzles r300 can support natively, with a couple of
- * cases we handle specially
- *
- * REG_VSWZ/REG_SSWZ is an index into this table
- */
-
-/* mapping from SWIZZLE_* to r300 native values for scalar insns */
-#define SWIZZLE_HALF 6
-
-#define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
- SWIZZLE_##y, \
- SWIZZLE_##z, \
- SWIZZLE_ZERO))
-/* native swizzles */
-static const struct r300_pfs_swizzle {
- GLuint hash; /* swizzle value this matches */
- GLuint base; /* base value for hw swizzle */
- GLuint stride; /* difference in base between arg0/1/2 */
- GLuint flags;
-} v_swiz[] = {
- /* *INDENT-OFF* */
- {MAKE_SWZ3(X, Y, Z), R300_ALU_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR},
- {MAKE_SWZ3(X, X, X), R300_ALU_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR},
- {MAKE_SWZ3(Y, Y, Y), R300_ALU_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR},
- {MAKE_SWZ3(Z, Z, Z), R300_ALU_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR},
- {MAKE_SWZ3(W, W, W), R300_ALU_ARGC_SRC0A, 1, SLOT_SRC_SCALAR},
- {MAKE_SWZ3(Y, Z, X), R300_ALU_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR},
- {MAKE_SWZ3(Z, X, Y), R300_ALU_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR},
- {MAKE_SWZ3(W, Z, Y), R300_ALU_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH},
- {MAKE_SWZ3(ONE, ONE, ONE), R300_ALU_ARGC_ONE, 0, 0},
- {MAKE_SWZ3(ZERO, ZERO, ZERO), R300_ALU_ARGC_ZERO, 0, 0},
- {MAKE_SWZ3(HALF, HALF, HALF), R300_ALU_ARGC_HALF, 0, 0},
- {PFS_INVAL, 0, 0, 0},
- /* *INDENT-ON* */
-};
-
-/* used during matching of non-native swizzles */
-#define SWZ_X_MASK (7 << 0)
-#define SWZ_Y_MASK (7 << 3)
-#define SWZ_Z_MASK (7 << 6)
-#define SWZ_W_MASK (7 << 9)
-static const struct {
- GLuint hash; /* used to mask matching swizzle components */
- int mask; /* actual outmask */
- int count; /* count of components matched */
-} s_mask[] = {
- /* *INDENT-OFF* */
- {SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK, 1 | 2 | 4, 3},
- {SWZ_X_MASK | SWZ_Y_MASK, 1 | 2, 2},
- {SWZ_X_MASK | SWZ_Z_MASK, 1 | 4, 2},
- {SWZ_Y_MASK | SWZ_Z_MASK, 2 | 4, 2},
- {SWZ_X_MASK, 1, 1},
- {SWZ_Y_MASK, 2, 1},
- {SWZ_Z_MASK, 4, 1},
- {PFS_INVAL, PFS_INVAL, PFS_INVAL}
- /* *INDENT-ON* */
-};
-
-static const struct {
- int base; /* hw value of swizzle */
- int stride; /* difference between SRC0/1/2 */
- GLuint flags;
-} s_swiz[] = {
- /* *INDENT-OFF* */
- {R300_ALU_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR},
- {R300_ALU_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR},
- {R300_ALU_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR},
- {R300_ALU_ARGA_SRC0A, 1, SLOT_SRC_SCALAR},
- {R300_ALU_ARGA_ZERO, 0, 0},
- {R300_ALU_ARGA_ONE, 0, 0},
- {R300_ALU_ARGA_HALF, 0, 0}
- /* *INDENT-ON* */
-};
-
-/* boiler-plate reg, for convenience */
-static const GLuint undef = REG(REG_TYPE_TEMP,
- 0,
- SWIZZLE_XYZ,
- SWIZZLE_W,
- GL_FALSE,
- GL_FALSE,
- GL_FALSE);
-
-/* constant one source */
-static const GLuint pfs_one = REG(REG_TYPE_CONST,
- 0,
- SWIZZLE_111,
- SWIZZLE_ONE,
- GL_FALSE,
- GL_TRUE,
- GL_TRUE);
-
-/* constant half source */
-static const GLuint pfs_half = REG(REG_TYPE_CONST,
- 0,
- SWIZZLE_HHH,
- SWIZZLE_HALF,
- GL_FALSE,
- GL_TRUE,
- GL_TRUE);
-
-/* constant zero source */
-static const GLuint pfs_zero = REG(REG_TYPE_CONST,
- 0,
- SWIZZLE_000,
- SWIZZLE_ZERO,
- GL_FALSE,
- GL_TRUE,
- GL_TRUE);
+#include "radeon_program_alu.h"
-/*
- * Common functions prototypes
- */
-static void dump_program(struct r300_fragment_program *fp);
-static void emit_arith(struct r300_fragment_program *fp, int op,
- GLuint dest, int mask,
- GLuint src0, GLuint src1, GLuint src2, int flags);
-/**
- * Get an R300 temporary that can be written to in the given slot.
- */
-static int get_hw_temp(struct r300_fragment_program *fp, int slot)
+static void reset_srcreg(struct prog_src_register* reg)
{
- COMPILE_STATE;
- int r;
-
- for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
- if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot)
- break;
- }
-
- if (r >= PFS_NUM_TEMP_REGS) {
- ERROR("Out of hardware temps\n");
- return 0;
- }
- // Reserved is used to avoid the following scenario:
- // R300 temporary X is first assigned to Mesa temporary Y during vector ops
- // R300 temporary X is then assigned to Mesa temporary Z for further vector ops
- // Then scalar ops on Mesa temporary Z are emitted and move back in time
- // to overwrite the value of temporary Y.
- // End scenario.
- cs->hwtemps[r].reserved = cs->hwtemps[r].free;
- cs->hwtemps[r].free = -1;
-
- // Reset to some value that won't mess things up when the user
- // tries to read from a temporary that hasn't been assigned a value yet.
- // In the normal case, vector_valid and scalar_valid should be set to
- // a sane value by the first emit that writes to this temporary.
- cs->hwtemps[r].vector_valid = 0;
- cs->hwtemps[r].scalar_valid = 0;
-
- if (r > fp->max_temp_idx)
- fp->max_temp_idx = r;
-
- return r;
+ _mesa_bzero(reg, sizeof(*reg));
+ reg->Swizzle = SWIZZLE_NOOP;
}
/**
- * Get an R300 temporary that will act as a TEX destination register.
- */
-static int get_hw_temp_tex(struct r300_fragment_program *fp)
-{
- COMPILE_STATE;
- int r;
-
- for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
- if (cs->used_in_node & (1 << r))
- continue;
-
- // Note: Be very careful here
- if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0)
- break;
- }
-
- if (r >= PFS_NUM_TEMP_REGS)
- return get_hw_temp(fp, 0); /* Will cause an indirection */
-
- cs->hwtemps[r].reserved = cs->hwtemps[r].free;
- cs->hwtemps[r].free = -1;
-
- // Reset to some value that won't mess things up when the user
- // tries to read from a temporary that hasn't been assigned a value yet.
- // In the normal case, vector_valid and scalar_valid should be set to
- // a sane value by the first emit that writes to this temporary.
- cs->hwtemps[r].vector_valid = cs->nrslots;
- cs->hwtemps[r].scalar_valid = cs->nrslots;
-
- if (r > fp->max_temp_idx)
- fp->max_temp_idx = r;
-
- return r;
-}
-
-/**
- * Mark the given hardware register as free.
- */
-static void free_hw_temp(struct r300_fragment_program *fp, int idx)
-{
- COMPILE_STATE;
-
- // Be very careful here. Consider sequences like
- // MAD r0, r1,r2,r3
- // TEX r4, ...
- // The TEX instruction may be moved in front of the MAD instruction
- // due to the way nodes work. We don't want to alias r1 and r4 in
- // this case.
- // I'm certain the register allocation could be further sanitized,
- // but it's tricky because of stuff that can happen inside emit_tex
- // and emit_arith.
- cs->hwtemps[idx].free = cs->nrslots + 1;
-}
-
-/**
- * Create a new Mesa temporary register.
- */
-static GLuint get_temp_reg(struct r300_fragment_program *fp)
-{
- COMPILE_STATE;
- GLuint r = undef;
- GLuint index;
-
- index = ffs(~cs->temp_in_use);
- if (!index) {
- ERROR("Out of program temps\n");
- return r;
- }
-
- cs->temp_in_use |= (1 << --index);
- cs->temps[index].refcount = 0xFFFFFFFF;
- cs->temps[index].reg = -1;
-
- REG_SET_TYPE(r, REG_TYPE_TEMP);
- REG_SET_INDEX(r, index);
- REG_SET_VALID(r, GL_TRUE);
- return r;
-}
-
-/**
- * Create a new Mesa temporary register that will act as the destination
- * register for a texture read.
- */
-static GLuint get_temp_reg_tex(struct r300_fragment_program *fp)
-{
- COMPILE_STATE;
- GLuint r = undef;
- GLuint index;
-
- index = ffs(~cs->temp_in_use);
- if (!index) {
- ERROR("Out of program temps\n");
- return r;
- }
-
- cs->temp_in_use |= (1 << --index);
- cs->temps[index].refcount = 0xFFFFFFFF;
- cs->temps[index].reg = get_hw_temp_tex(fp);
-
- REG_SET_TYPE(r, REG_TYPE_TEMP);
- REG_SET_INDEX(r, index);
- REG_SET_VALID(r, GL_TRUE);
- return r;
-}
-
-/**
- * Free a Mesa temporary and the associated R300 temporary.
- */
-static void free_temp(struct r300_fragment_program *fp, GLuint r)
-{
- COMPILE_STATE;
- GLuint index = REG_GET_INDEX(r);
-
- if (!(cs->temp_in_use & (1 << index)))
- return;
-
- if (REG_GET_TYPE(r) == REG_TYPE_TEMP) {
- free_hw_temp(fp, cs->temps[index].reg);
- cs->temps[index].reg = -1;
- cs->temp_in_use &= ~(1 << index);
- } else if (REG_GET_TYPE(r) == REG_TYPE_INPUT) {
- free_hw_temp(fp, cs->inputs[index].reg);
- cs->inputs[index].reg = -1;
- }
-}
-
-/**
- * Emit a hardware constant/parameter.
+ * Transform TEX, TXP, TXB, and KIL instructions in the following way:
+ * - premultiply texture coordinates for RECT
+ * - extract operand swizzles
+ * - introduce a temporary register when write masks are needed
*
- * \p cp Stable pointer to an array of 4 floats.
- * The pointer must be stable in the sense that it remains to be valid
- * and hold the contents of the constant/parameter throughout the lifetime
- * of the fragment program (actually, up until the next time the fragment
- * program is translated).
+ * \todo If/when r5xx uses the radeon_program architecture, this can probably
+ * be reused.
*/
-static GLuint emit_const4fv(struct r300_fragment_program *fp,
- const GLfloat * cp)
-{
- GLuint reg = undef;
- int index;
-
- for (index = 0; index < fp->const_nr; ++index) {
- if (fp->constant[index] == cp)
- break;
- }
-
- if (index >= fp->const_nr) {
- if (index >= PFS_NUM_CONST_REGS) {
- ERROR("Out of hw constants!\n");
- return reg;
- }
-
- fp->const_nr++;
- fp->constant[index] = cp;
- }
-
- REG_SET_TYPE(reg, REG_TYPE_CONST);
- REG_SET_INDEX(reg, index);
- REG_SET_VALID(reg, GL_TRUE);
- return reg;
-}
-
-static inline GLuint negate(GLuint r)
-{
- REG_NEGS(r);
- REG_NEGV(r);
- return r;
-}
-
-/* Hack, to prevent clobbering sources used multiple times when
- * emulating non-native instructions
- */
-static inline GLuint keep(GLuint r)
-{
- REG_SET_NO_USE(r, GL_TRUE);
- return r;
-}
-
-static inline GLuint absolute(GLuint r)
-{
- REG_ABS(r);
- return r;
-}
-
-static int swz_native(struct r300_fragment_program *fp,
- GLuint src, GLuint * r, GLuint arbneg)
+static GLboolean transform_TEX(
+ struct radeon_program_transform_context* context,
+ struct prog_instruction* orig_inst, void* data)
{
- /* Native swizzle, handle negation */
- src = (src & ~REG_NEGS_MASK) | (((arbneg >> 3) & 1) << REG_NEGS_SHIFT);
-
- if ((arbneg & 0x7) == 0x0) {
- src = src & ~REG_NEGV_MASK;
- *r = src;
- } else if ((arbneg & 0x7) == 0x7) {
- src |= REG_NEGV_MASK;
- *r = src;
- } else {
- if (!REG_GET_VALID(*r))
- *r = get_temp_reg(fp);
- src |= REG_NEGV_MASK;
- emit_arith(fp,
- PFS_OP_MAD,
- *r, arbneg & 0x7, keep(src), pfs_one, pfs_zero, 0);
- src = src & ~REG_NEGV_MASK;
- emit_arith(fp,
- PFS_OP_MAD,
- *r,
- (arbneg ^ 0x7) | WRITEMASK_W,
- src, pfs_one, pfs_zero, 0);
- }
-
- return 3;
-}
-
-static int swz_emit_partial(struct r300_fragment_program *fp,
- GLuint src,
- GLuint * r, int mask, int mc, GLuint arbneg)
-{
- GLuint tmp;
- GLuint wmask = 0;
-
- if (!REG_GET_VALID(*r))
- *r = get_temp_reg(fp);
-
- /* A partial match, VSWZ/mask define what parts of the
- * desired swizzle we match
- */
- if (mc + s_mask[mask].count == 3) {
- wmask = WRITEMASK_W;
- src |= ((arbneg >> 3) & 1) << REG_NEGS_SHIFT;
- }
-
- tmp = arbneg & s_mask[mask].mask;
- if (tmp) {
- tmp = tmp ^ s_mask[mask].mask;
- if (tmp) {
- emit_arith(fp,
- PFS_OP_MAD,
- *r,
- arbneg & s_mask[mask].mask,
- keep(src) | REG_NEGV_MASK,
- pfs_one, pfs_zero, 0);
- if (!wmask) {
- REG_SET_NO_USE(src, GL_TRUE);
- } else {
- REG_SET_NO_USE(src, GL_FALSE);
- }
- emit_arith(fp,
- PFS_OP_MAD,
- *r, tmp | wmask, src, pfs_one, pfs_zero, 0);
- } else {
- if (!wmask) {
- REG_SET_NO_USE(src, GL_TRUE);
- } else {
- REG_SET_NO_USE(src, GL_FALSE);
- }
- emit_arith(fp,
- PFS_OP_MAD,
- *r,
- (arbneg & s_mask[mask].mask) | wmask,
- src | REG_NEGV_MASK, pfs_one, pfs_zero, 0);
- }
- } else {
- if (!wmask) {
- REG_SET_NO_USE(src, GL_TRUE);
- } else {
- REG_SET_NO_USE(src, GL_FALSE);
- }
- emit_arith(fp, PFS_OP_MAD,
- *r,
- s_mask[mask].mask | wmask,
- src, pfs_one, pfs_zero, 0);
- }
-
- return s_mask[mask].count;
-}
-
-static GLuint do_swizzle(struct r300_fragment_program *fp,
- GLuint src, GLuint arbswz, GLuint arbneg)
-{
- GLuint r = undef;
- GLuint vswz;
- int c_mask = 0;
- int v_match = 0;
-
- /* If swizzling from something without an XYZW native swizzle,
- * emit result to a temp, and do new swizzle from the temp.
- */
-#if 0
- if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
- GLuint temp = get_temp_reg(fp);
- emit_arith(fp,
- PFS_OP_MAD,
- temp, WRITEMASK_XYZW, src, pfs_one, pfs_zero, 0);
- src = temp;
- }
-#endif
-
- if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
- GLuint vsrcswz =
- (v_swiz[REG_GET_VSWZ(src)].
- hash & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK)) |
- REG_GET_SSWZ(src) << 9;
- GLint i;
-
- GLuint newswz = 0;
- GLuint offset;
- for (i = 0; i < 4; ++i) {
- offset = GET_SWZ(arbswz, i);
-
- newswz |=
- (offset <= 3) ? GET_SWZ(vsrcswz,
- offset) << i *
- 3 : offset << i * 3;
- }
-
- arbswz = newswz & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK);
- REG_SET_SSWZ(src, GET_SWZ(newswz, 3));
- } else {
- /* set scalar swizzling */
- REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
-
- }
- do {
- vswz = REG_GET_VSWZ(src);
- do {
- int chash;
-
- REG_SET_VSWZ(src, vswz);
- chash = v_swiz[REG_GET_VSWZ(src)].hash &
- s_mask[c_mask].hash;
-
- if (chash == (arbswz & s_mask[c_mask].hash)) {
- if (s_mask[c_mask].count == 3) {
- v_match += swz_native(fp,
- src, &r, arbneg);
- } else {
- v_match += swz_emit_partial(fp,
- src,
- &r,
- c_mask,
- v_match,
- arbneg);
- }
-
- if (v_match == 3)
- return r;
-
- /* Fill with something invalid.. all 0's was
- * wrong before, matched SWIZZLE_X. So all
- * 1's will be okay for now
- */
- arbswz |= (PFS_INVAL & s_mask[c_mask].hash);
- }
- } while (v_swiz[++vswz].hash != PFS_INVAL);
- REG_SET_VSWZ(src, SWIZZLE_XYZ);
- } while (s_mask[++c_mask].hash != PFS_INVAL);
-
- ERROR("should NEVER get here\n");
- return r;
-}
-
-static GLuint t_src(struct r300_fragment_program *fp,
- struct prog_src_register fpsrc)
-{
- GLuint r = undef;
-
- switch (fpsrc.File) {
- case PROGRAM_TEMPORARY:
- REG_SET_INDEX(r, fpsrc.Index);
- REG_SET_VALID(r, GL_TRUE);
- REG_SET_TYPE(r, REG_TYPE_TEMP);
- break;
- case PROGRAM_INPUT:
- REG_SET_INDEX(r, fpsrc.Index);
- REG_SET_VALID(r, GL_TRUE);
- REG_SET_TYPE(r, REG_TYPE_INPUT);
- break;
- case PROGRAM_LOCAL_PARAM:
- r = emit_const4fv(fp,
- fp->mesa_program.Base.LocalParams[fpsrc.
- Index]);
- break;
- case PROGRAM_ENV_PARAM:
- r = emit_const4fv(fp,
- fp->ctx->FragmentProgram.Parameters[fpsrc.
- Index]);
- break;
- case PROGRAM_STATE_VAR:
- case PROGRAM_NAMED_PARAM:
- case PROGRAM_CONSTANT:
- r = emit_const4fv(fp,
- fp->mesa_program.Base.Parameters->
- ParameterValues[fpsrc.Index]);
- break;
- default:
- ERROR("unknown SrcReg->File %x\n", fpsrc.File);
- return r;
- }
-
- /* no point swizzling ONE/ZERO/HALF constants... */
- if (REG_GET_VSWZ(r) < SWIZZLE_111 || REG_GET_SSWZ(r) < SWIZZLE_ZERO)
- r = do_swizzle(fp, r, fpsrc.Swizzle, fpsrc.NegateBase);
- return r;
-}
-
-static GLuint t_scalar_src(struct r300_fragment_program *fp,
- struct prog_src_register fpsrc)
-{
- struct prog_src_register src = fpsrc;
- int sc = GET_SWZ(fpsrc.Swizzle, 0); /* X */
-
- src.Swizzle = ((sc << 0) | (sc << 3) | (sc << 6) | (sc << 9));
-
- return t_src(fp, src);
-}
-
-static GLuint t_dst(struct r300_fragment_program *fp,
- struct prog_dst_register dest)
-{
- GLuint r = undef;
-
- switch (dest.File) {
- case PROGRAM_TEMPORARY:
- REG_SET_INDEX(r, dest.Index);
- REG_SET_VALID(r, GL_TRUE);
- REG_SET_TYPE(r, REG_TYPE_TEMP);
- return r;
- case PROGRAM_OUTPUT:
- REG_SET_TYPE(r, REG_TYPE_OUTPUT);
- switch (dest.Index) {
- case FRAG_RESULT_COLR:
- case FRAG_RESULT_DEPR:
- REG_SET_INDEX(r, dest.Index);
- REG_SET_VALID(r, GL_TRUE);
- return r;
- default:
- ERROR("Bad DstReg->Index 0x%x\n", dest.Index);
- return r;
- }
- default:
- ERROR("Bad DstReg->File 0x%x\n", dest.File);
- return r;
- }
-}
-
-static int t_hw_src(struct r300_fragment_program *fp, GLuint src, GLboolean tex)
-{
- COMPILE_STATE;
- int idx;
- int index = REG_GET_INDEX(src);
-
- switch (REG_GET_TYPE(src)) {
- case REG_TYPE_TEMP:
- /* NOTE: if reg==-1 here, a source is being read that
- * hasn't been written to. Undefined results.
- */
- if (cs->temps[index].reg == -1)
- cs->temps[index].reg = get_hw_temp(fp, cs->nrslots);
-
- idx = cs->temps[index].reg;
-
- if (!REG_GET_NO_USE(src) && (--cs->temps[index].refcount == 0))
- free_temp(fp, src);
- break;
- case REG_TYPE_INPUT:
- idx = cs->inputs[index].reg;
-
- if (!REG_GET_NO_USE(src) && (--cs->inputs[index].refcount == 0))
- free_hw_temp(fp, cs->inputs[index].reg);
- break;
- case REG_TYPE_CONST:
- return (index | SRC_CONST);
- default:
- ERROR("Invalid type for source reg\n");
- return (0 | SRC_CONST);
- }
+ struct r300_fragment_program_compiler *compiler =
+ (struct r300_fragment_program_compiler*)data;
+ struct prog_instruction inst = *orig_inst;
+ struct prog_instruction* tgt;
+ GLboolean destredirect = GL_FALSE;
+
+ if (inst.Opcode != OPCODE_TEX &&
+ inst.Opcode != OPCODE_TXB &&
+ inst.Opcode != OPCODE_TXP &&
+ inst.Opcode != OPCODE_KIL)
+ return GL_FALSE;
- if (!tex)
- cs->used_in_node |= (1 << idx);
+ if (inst.Opcode != OPCODE_KIL &&
+ compiler->fp->mesa_program.Base.ShadowSamplers & (1 << inst.TexSrcUnit)) {
+ GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func;
- return idx;
-}
+ if (comparefunc == GL_NEVER || comparefunc == GL_ALWAYS) {
+ tgt = radeonClauseInsertInstructions(context->compiler, context->dest,
+ context->dest->NumInstructions, 1);
-static int t_hw_dst(struct r300_fragment_program *fp,
- GLuint dest, GLboolean tex, int slot)
-{
- COMPILE_STATE;
- int idx;
- GLuint index = REG_GET_INDEX(dest);
- assert(REG_GET_VALID(dest));
-
- switch (REG_GET_TYPE(dest)) {
- case REG_TYPE_TEMP:
- if (cs->temps[REG_GET_INDEX(dest)].reg == -1) {
- if (!tex) {
- cs->temps[index].reg = get_hw_temp(fp, slot);
- } else {
- cs->temps[index].reg = get_hw_temp_tex(fp);
- }
- }
- idx = cs->temps[index].reg;
-
- if (!REG_GET_NO_USE(dest) && (--cs->temps[index].refcount == 0))
- free_temp(fp, dest);
-
- cs->dest_in_node |= (1 << idx);
- cs->used_in_node |= (1 << idx);
- break;
- case REG_TYPE_OUTPUT:
- switch (index) {
- case FRAG_RESULT_COLR:
- fp->node[fp->cur_node].flags |=
- R300_RGBA_OUT;
- break;
- case FRAG_RESULT_DEPR:
- fp->WritesDepth = GL_TRUE;
- fp->node[fp->cur_node].flags |=
- R300_W_OUT;
- break;
+ tgt->Opcode = OPCODE_MOV;
+ tgt->DstReg = inst.DstReg;
+ tgt->SrcReg[0].File = PROGRAM_BUILTIN;
+ tgt->SrcReg[0].Swizzle = comparefunc == GL_ALWAYS ? SWIZZLE_1111 : SWIZZLE_0000;
+ return GL_TRUE;
}
- return index;
- break;
- default:
- ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
- return 0;
- }
-
- return idx;
-}
-
-static void emit_nop(struct r300_fragment_program *fp)
-{
- COMPILE_STATE;
- if (cs->nrslots >= PFS_MAX_ALU_INST) {
- ERROR("Out of ALU instruction slots\n");
- return;
+ inst.DstReg.File = PROGRAM_TEMPORARY;
+ inst.DstReg.Index = radeonCompilerAllocateTemporary(context->compiler);
+ inst.DstReg.WriteMask = WRITEMASK_XYZW;
}
- fp->alu.inst[cs->nrslots].inst0 = NOP_INST0;
- fp->alu.inst[cs->nrslots].inst1 = NOP_INST1;
- fp->alu.inst[cs->nrslots].inst2 = NOP_INST2;
- fp->alu.inst[cs->nrslots].inst3 = NOP_INST3;
- cs->nrslots++;
-}
-static void emit_tex(struct r300_fragment_program *fp,
- struct prog_instruction *fpi, int opcode)
-{
- COMPILE_STATE;
- GLuint coord = t_src(fp, fpi->SrcReg[0]);
- GLuint dest = undef, rdest = undef;
- GLuint din, uin;
- int unit = fpi->TexSrcUnit;
- int hwsrc, hwdest;
- GLuint tempreg = 0;
-
- /**
- * Hardware uses [0..1]x[0..1] range for rectangle textures
+ /* Hardware uses [0..1]x[0..1] range for rectangle textures
* instead of [0..Width]x[0..Height].
* Add a scaling instruction.
- *
- * \todo Refactor this once we have proper rewriting/optimization
- * support for programs.
*/
- if (opcode != R300_TEX_OP_KIL && fpi->TexSrcTarget == TEXTURE_RECT_INDEX) {
+ if (inst.Opcode != OPCODE_KIL && inst.TexSrcTarget == TEXTURE_RECT_INDEX) {
gl_state_index tokens[STATE_LENGTH] = {
STATE_INTERNAL, STATE_R300_TEXRECT_FACTOR, 0, 0,
0
};
+
+ int tempreg = radeonCompilerAllocateTemporary(context->compiler);
int factor_index;
- GLuint factorreg;
- tokens[2] = unit;
+ tokens[2] = inst.TexSrcUnit;
factor_index =
- _mesa_add_state_reference(fp->mesa_program.Base.
- Parameters, tokens);
- factorreg =
- emit_const4fv(fp,
- fp->mesa_program.Base.Parameters->
- ParameterValues[factor_index]);
- tempreg = keep(get_temp_reg(fp));
-
- emit_arith(fp, PFS_OP_MAD, tempreg, WRITEMASK_XYZW,
- coord, factorreg, pfs_zero, 0);
-
- coord = tempreg;
- }
+ _mesa_add_state_reference(
+ compiler->fp->mesa_program.Base.Parameters, tokens);
- /* Texture operations do not support swizzles etc. in hardware,
- * so emit an additional arithmetic operation if necessary.
- */
- if (REG_GET_VSWZ(coord) != SWIZZLE_XYZ ||
- REG_GET_SSWZ(coord) != SWIZZLE_W ||
- coord & (REG_NEGV_MASK | REG_NEGS_MASK | REG_ABS_MASK)) {
- assert(tempreg == 0);
- tempreg = keep(get_temp_reg(fp));
- emit_arith(fp, PFS_OP_MAD, tempreg, WRITEMASK_XYZW,
- coord, pfs_one, pfs_zero, 0);
- coord = tempreg;
- }
-
- /* Ensure correct node indirection */
- uin = cs->used_in_node;
- din = cs->dest_in_node;
-
- /* Resolve source/dest to hardware registers */
- hwsrc = t_hw_src(fp, coord, GL_TRUE);
+ tgt = radeonClauseInsertInstructions(context->compiler, context->dest,
+ context->dest->NumInstructions, 1);
- if (opcode != R300_TEX_OP_KIL) {
- dest = t_dst(fp, fpi->DstReg);
+ tgt->Opcode = OPCODE_MUL;
+ tgt->DstReg.File = PROGRAM_TEMPORARY;
+ tgt->DstReg.Index = tempreg;
+ tgt->SrcReg[0] = inst.SrcReg[0];
+ tgt->SrcReg[1].File = PROGRAM_STATE_VAR;
+ tgt->SrcReg[1].Index = factor_index;
- /* r300 doesn't seem to be able to do TEX->output reg */
- if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
- rdest = dest;
- dest = get_temp_reg_tex(fp);
- } else if (fpi->DstReg.WriteMask != WRITEMASK_XYZW) {
- /* in case write mask isn't XYZW */
- rdest = dest;
- dest = get_temp_reg_tex(fp);
- }
- hwdest =
- t_hw_dst(fp, dest, GL_TRUE,
- fp->node[fp->cur_node].alu_offset);
-
- /* Use a temp that hasn't been used in this node, rather
- * than causing an indirection
- */
- if (uin & (1 << hwdest)) {
- free_hw_temp(fp, hwdest);
- hwdest = get_hw_temp_tex(fp);
- cs->temps[REG_GET_INDEX(dest)].reg = hwdest;
- }
- } else {
- hwdest = 0;
- unit = 0;
+ reset_srcreg(&inst.SrcReg[0]);
+ inst.SrcReg[0].File = PROGRAM_TEMPORARY;
+ inst.SrcReg[0].Index = tempreg;
}
- /* Indirection if source has been written in this node, or if the
- * dest has been read/written in this node
+ /* Texture operations do not support swizzles etc. in hardware,
+ * so emit an additional arithmetic operation if necessary.
*/
- if ((REG_GET_TYPE(coord) != REG_TYPE_CONST &&
- (din & (1 << hwsrc))) || (uin & (1 << hwdest))) {
-
- /* Finish off current node */
- if (fp->node[fp->cur_node].alu_offset == cs->nrslots)
- emit_nop(fp);
-
- fp->node[fp->cur_node].alu_end =
- cs->nrslots - fp->node[fp->cur_node].alu_offset - 1;
- assert(fp->node[fp->cur_node].alu_end >= 0);
-
- if (++fp->cur_node >= PFS_MAX_TEX_INDIRECT) {
- ERROR("too many levels of texture indirection\n");
- return;
- }
-
- /* Start new node */
- fp->node[fp->cur_node].tex_offset = fp->tex.length;
- fp->node[fp->cur_node].alu_offset = cs->nrslots;
- fp->node[fp->cur_node].tex_end = -1;
- fp->node[fp->cur_node].alu_end = -1;
- fp->node[fp->cur_node].flags = 0;
- cs->used_in_node = 0;
- cs->dest_in_node = 0;
- }
-
- if (fp->cur_node == 0)
- fp->first_node_has_tex = 1;
-
- fp->tex.inst[fp->tex.length++] = 0 | (hwsrc << R300_SRC_ADDR_SHIFT)
- | (hwdest << R300_DST_ADDR_SHIFT)
- | (unit << R300_TEX_ID_SHIFT)
- | (opcode << R300_TEX_INST_SHIFT);
-
- cs->dest_in_node |= (1 << hwdest);
- if (REG_GET_TYPE(coord) != REG_TYPE_CONST)
- cs->used_in_node |= (1 << hwsrc);
+ if (inst.SrcReg[0].Swizzle != SWIZZLE_NOOP ||
+ inst.SrcReg[0].Abs || inst.SrcReg[0].NegateBase || inst.SrcReg[0].NegateAbs) {
+ int tempreg = radeonCompilerAllocateTemporary(context->compiler);
- fp->node[fp->cur_node].tex_end++;
+ tgt = radeonClauseInsertInstructions(context->compiler, context->dest,
+ context->dest->NumInstructions, 1);
- /* Copy from temp to output if needed */
- if (REG_GET_VALID(rdest)) {
- emit_arith(fp, PFS_OP_MAD, rdest, fpi->DstReg.WriteMask, dest,
- pfs_one, pfs_zero, 0);
- free_temp(fp, dest);
- }
-
- /* Free temp register */
- if (tempreg != 0)
- free_temp(fp, tempreg);
-}
-
-/**
- * Returns the first slot where we could possibly allow writing to dest,
- * according to register allocation.
- */
-static int get_earliest_allowed_write(struct r300_fragment_program *fp,
- GLuint dest, int mask)
-{
- COMPILE_STATE;
- int idx;
- int pos;
- GLuint index = REG_GET_INDEX(dest);
- assert(REG_GET_VALID(dest));
-
- switch (REG_GET_TYPE(dest)) {
- case REG_TYPE_TEMP:
- if (cs->temps[index].reg == -1)
- return 0;
-
- idx = cs->temps[index].reg;
- break;
- case REG_TYPE_OUTPUT:
- return 0;
- default:
- ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
- return 0;
- }
+ tgt->Opcode = OPCODE_MOV;
+ tgt->DstReg.File = PROGRAM_TEMPORARY;
+ tgt->DstReg.Index = tempreg;
+ tgt->SrcReg[0] = inst.SrcReg[0];
- pos = cs->hwtemps[idx].reserved;
- if (mask & WRITEMASK_XYZ) {
- if (pos < cs->hwtemps[idx].vector_lastread)
- pos = cs->hwtemps[idx].vector_lastread;
- }
- if (mask & WRITEMASK_W) {
- if (pos < cs->hwtemps[idx].scalar_lastread)
- pos = cs->hwtemps[idx].scalar_lastread;
+ reset_srcreg(&inst.SrcReg[0]);
+ inst.SrcReg[0].File = PROGRAM_TEMPORARY;
+ inst.SrcReg[0].Index = tempreg;
}
- return pos;
-}
+ if (inst.Opcode != OPCODE_KIL) {
+ if (inst.DstReg.File != PROGRAM_TEMPORARY ||
+ inst.DstReg.WriteMask != WRITEMASK_XYZW) {
+ int tempreg = radeonCompilerAllocateTemporary(context->compiler);
-/**
- * Allocates a slot for an ALU instruction that can consist of
- * a vertex part or a scalar part or both.
- *
- * Sources from src (src[0] to src[argc-1]) are added to the slot in the
- * appropriate position (vector and/or scalar), and their positions are
- * recorded in the srcpos array.
- *
- * This function emits instruction code for the source fetch and the
- * argument selection. It does not emit instruction code for the
- * opcode or the destination selection.
- *
- * @return the index of the slot
- */
-static int find_and_prepare_slot(struct r300_fragment_program *fp,
- GLboolean emit_vop,
- GLboolean emit_sop,
- int argc, GLuint * src, GLuint dest, int mask)
-{
- COMPILE_STATE;
- int hwsrc[3];
- int srcpos[3];
- unsigned int used;
- int tempused;
- int tempvsrc[3];
- int tempssrc[3];
- int pos;
- int regnr;
- int i, j;
-
- // Determine instruction slots, whether sources are required on
- // vector or scalar side, and the smallest slot number where
- // all source registers are available
- used = 0;
- if (emit_vop)
- used |= SLOT_OP_VECTOR;
- if (emit_sop)
- used |= SLOT_OP_SCALAR;
-
- pos = get_earliest_allowed_write(fp, dest, mask);
-
- if (fp->node[fp->cur_node].alu_offset > pos)
- pos = fp->node[fp->cur_node].alu_offset;
- for (i = 0; i < argc; ++i) {
- if (!REG_GET_BUILTIN(src[i])) {
- if (emit_vop)
- used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i;
- if (emit_sop)
- used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
- }
-
- hwsrc[i] = t_hw_src(fp, src[i], GL_FALSE); /* Note: sideeffects wrt refcounting! */
- regnr = hwsrc[i] & 31;
-
- if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
- if (used & (SLOT_SRC_VECTOR << i)) {
- if (cs->hwtemps[regnr].vector_valid > pos)
- pos = cs->hwtemps[regnr].vector_valid;
- }
- if (used & (SLOT_SRC_SCALAR << i)) {
- if (cs->hwtemps[regnr].scalar_valid > pos)
- pos = cs->hwtemps[regnr].scalar_valid;
- }
+ inst.DstReg.File = PROGRAM_TEMPORARY;
+ inst.DstReg.Index = tempreg;
+ inst.DstReg.WriteMask = WRITEMASK_XYZW;
+ destredirect = GL_TRUE;
}
}
- // Find a slot that fits
- for (;; ++pos) {
- if (cs->slot[pos].used & used & SLOT_OP_BOTH)
- continue;
-
- if (pos >= cs->nrslots) {
- if (cs->nrslots >= PFS_MAX_ALU_INST) {
- ERROR("Out of ALU instruction slots\n");
- return -1;
- }
-
- fp->alu.inst[pos].inst0 = NOP_INST0;
- fp->alu.inst[pos].inst1 = NOP_INST1;
- fp->alu.inst[pos].inst2 = NOP_INST2;
- fp->alu.inst[pos].inst3 = NOP_INST3;
-
- cs->nrslots++;
- }
- // Note: When we need both parts (vector and scalar) of a source,
- // we always try to put them into the same position. This makes the
- // code easier to read, and it is optimal (i.e. one doesn't gain
- // anything by splitting the parts).
- // It also avoids headaches with swizzles that access both parts (i.e WXY)
- tempused = cs->slot[pos].used;
- for (i = 0; i < 3; ++i) {
- tempvsrc[i] = cs->slot[pos].vsrc[i];
- tempssrc[i] = cs->slot[pos].ssrc[i];
- }
-
- for (i = 0; i < argc; ++i) {
- int flags = (used >> i) & SLOT_SRC_BOTH;
-
- if (!flags) {
- srcpos[i] = 0;
- continue;
- }
-
- for (j = 0; j < 3; ++j) {
- if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
- if (tempvsrc[j] != hwsrc[i])
- continue;
- }
-
- if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
- if (tempssrc[j] != hwsrc[i])
- continue;
- }
-
- break;
- }
-
- if (j == 3)
- break;
-
- srcpos[i] = j;
- tempused |= flags << j;
- if (flags & SLOT_SRC_VECTOR)
- tempvsrc[j] = hwsrc[i];
- if (flags & SLOT_SRC_SCALAR)
- tempssrc[j] = hwsrc[i];
- }
-
- if (i == argc)
- break;
- }
-
- // Found a slot, reserve it
- cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
- for (i = 0; i < 3; ++i) {
- cs->slot[pos].vsrc[i] = tempvsrc[i];
- cs->slot[pos].ssrc[i] = tempssrc[i];
- }
-
- for (i = 0; i < argc; ++i) {
- if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
- int regnr = hwsrc[i] & 31;
-
- if (used & (SLOT_SRC_VECTOR << i)) {
- if (cs->hwtemps[regnr].vector_lastread < pos)
- cs->hwtemps[regnr].vector_lastread =
- pos;
- }
- if (used & (SLOT_SRC_SCALAR << i)) {
- if (cs->hwtemps[regnr].scalar_lastread < pos)
- cs->hwtemps[regnr].scalar_lastread =
- pos;
- }
- }
- }
-
- // Emit the source fetch code
- fp->alu.inst[pos].inst1 &= ~R300_ALU_SRC_MASK;
- fp->alu.inst[pos].inst1 |=
- ((cs->slot[pos].vsrc[0] << R300_ALU_SRC0C_SHIFT) |
- (cs->slot[pos].vsrc[1] << R300_ALU_SRC1C_SHIFT) |
- (cs->slot[pos].vsrc[2] << R300_ALU_SRC2C_SHIFT));
-
- fp->alu.inst[pos].inst3 &= ~R300_ALU_SRC_MASK;
- fp->alu.inst[pos].inst3 |=
- ((cs->slot[pos].ssrc[0] << R300_ALU_SRC0A_SHIFT) |
- (cs->slot[pos].ssrc[1] << R300_ALU_SRC1A_SHIFT) |
- (cs->slot[pos].ssrc[2] << R300_ALU_SRC2A_SHIFT));
-
- // Emit the argument selection code
- if (emit_vop) {
- int swz[3];
-
- for (i = 0; i < 3; ++i) {
- if (i < argc) {
- swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
- (srcpos[i] *
- v_swiz[REG_GET_VSWZ(src[i])].
- stride)) | ((src[i] & REG_NEGV_MASK)
- ? ARG_NEG : 0) | ((src[i]
- &
- REG_ABS_MASK)
- ?
- ARG_ABS
- : 0);
- } else {
- swz[i] = R300_ALU_ARGC_ZERO;
- }
- }
-
- fp->alu.inst[pos].inst0 &=
- ~(R300_ALU_ARG0C_MASK | R300_ALU_ARG1C_MASK |
- R300_ALU_ARG2C_MASK);
- fp->alu.inst[pos].inst0 |=
- (swz[0] << R300_ALU_ARG0C_SHIFT) | (swz[1] <<
- R300_ALU_ARG1C_SHIFT)
- | (swz[2] << R300_ALU_ARG2C_SHIFT);
- }
-
- if (emit_sop) {
- int swz[3];
-
- for (i = 0; i < 3; ++i) {
- if (i < argc) {
- swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
- (srcpos[i] *
- s_swiz[REG_GET_SSWZ(src[i])].
- stride)) | ((src[i] & REG_NEGV_MASK)
- ? ARG_NEG : 0) | ((src[i]
- &
- REG_ABS_MASK)
- ?
- ARG_ABS
- : 0);
- } else {
- swz[i] = R300_ALU_ARGA_ZERO;
- }
- }
-
- fp->alu.inst[pos].inst2 &=
- ~(R300_ALU_ARG0A_MASK | R300_ALU_ARG1A_MASK |
- R300_ALU_ARG2A_MASK);
- fp->alu.inst[pos].inst2 |=
- (swz[0] << R300_ALU_ARG0A_SHIFT) | (swz[1] <<
- R300_ALU_ARG1A_SHIFT)
- | (swz[2] << R300_ALU_ARG2A_SHIFT);
- }
-
- return pos;
-}
-
-/**
- * Append an ALU instruction to the instruction list.
- */
-static void emit_arith(struct r300_fragment_program *fp,
- int op,
- GLuint dest,
- int mask,
- GLuint src0, GLuint src1, GLuint src2, int flags)
-{
- COMPILE_STATE;
- GLuint src[3] = { src0, src1, src2 };
- int hwdest;
- GLboolean emit_vop, emit_sop;
- int vop, sop, argc;
- int pos;
-
- vop = r300_fpop[op].v_op;
- sop = r300_fpop[op].s_op;
- argc = r300_fpop[op].argc;
-
- if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
- REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
- if (mask & WRITEMASK_Z) {
- mask = WRITEMASK_W;
- } else {
- return;
- }
- }
-
- emit_vop = GL_FALSE;
- emit_sop = GL_FALSE;
- if ((mask & WRITEMASK_XYZ) || vop == R300_ALU_OUTC_DP3)
- emit_vop = GL_TRUE;
- if ((mask & WRITEMASK_W) || vop == R300_ALU_OUTC_REPL_ALPHA)
- emit_sop = GL_TRUE;
-
- pos =
- find_and_prepare_slot(fp, emit_vop, emit_sop, argc, src, dest,
- mask);
- if (pos < 0)
- return;
-
- hwdest = t_hw_dst(fp, dest, GL_FALSE, pos); /* Note: Side effects wrt register allocation */
-
- if (flags & PFS_FLAG_SAT) {
- vop |= R300_ALU_OUTC_CLAMP;
- sop |= R300_ALU_OUTA_CLAMP;
- }
-
- /* Throw the pieces together and get ALU/1 */
- if (emit_vop) {
- fp->alu.inst[pos].inst0 |= vop;
-
- fp->alu.inst[pos].inst1 |= hwdest << R300_ALU_DSTC_SHIFT;
-
- if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
- if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
- fp->alu.inst[pos].inst1 |=
- (mask & WRITEMASK_XYZ) <<
- R300_ALU_DSTC_OUTPUT_MASK_SHIFT;
- } else
- assert(0);
+ tgt = radeonClauseInsertInstructions(context->compiler, context->dest,
+ context->dest->NumInstructions, 1);
+ _mesa_copy_instructions(tgt, &inst, 1);
+
+ if (inst.Opcode != OPCODE_KIL &&
+ compiler->fp->mesa_program.Base.ShadowSamplers & (1 << inst.TexSrcUnit)) {
+ GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func;
+ GLuint depthmode = compiler->fp->state.unit[inst.TexSrcUnit].depth_texture_mode;
+
+ tgt = radeonClauseInsertInstructions(context->compiler, context->dest,
+ context->dest->NumInstructions, 2);
+
+ tgt[0].Opcode = OPCODE_ADD;
+ tgt[0].DstReg = inst.DstReg;
+ tgt[0].DstReg.WriteMask = orig_inst->DstReg.WriteMask;
+ tgt[0].SrcReg[0].File = PROGRAM_TEMPORARY;
+ tgt[0].SrcReg[0].Index = inst.DstReg.Index;
+ if (depthmode == 0) /* GL_LUMINANCE */
+ tgt[0].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z);
+ else if (depthmode == 2) /* GL_ALPHA */
+ tgt[0].SrcReg[0].Swizzle = SWIZZLE_WWWW;
+ tgt[0].SrcReg[1] = inst.SrcReg[0];
+ tgt[0].SrcReg[1].Swizzle = SWIZZLE_ZZZZ;
+
+ /* Recall that SrcReg[0] is tex, SrcReg[2] is r and:
+ * r < tex <=> -tex+r < 0
+ * r >= tex <=> not (-tex+r < 0 */
+ if (comparefunc == GL_LESS || comparefunc == GL_GEQUAL)
+ tgt[0].SrcReg[0].NegateBase = tgt[0].SrcReg[0].NegateBase ^ NEGATE_XYZW;
+ else
+ tgt[0].SrcReg[1].NegateBase = tgt[0].SrcReg[1].NegateBase ^ NEGATE_XYZW;
+
+ tgt[1].Opcode = OPCODE_CMP;
+ tgt[1].DstReg = orig_inst->DstReg;
+ tgt[1].SrcReg[0].File = PROGRAM_TEMPORARY;
+ tgt[1].SrcReg[0].Index = tgt[0].DstReg.Index;
+ tgt[1].SrcReg[1].File = PROGRAM_BUILTIN;
+ tgt[1].SrcReg[2].File = PROGRAM_BUILTIN;
+
+ if (comparefunc == GL_LESS || comparefunc == GL_GREATER) {
+ tgt[1].SrcReg[1].Swizzle = SWIZZLE_1111;
+ tgt[1].SrcReg[2].Swizzle = SWIZZLE_0000;
} else {
- fp->alu.inst[pos].inst1 |=
- (mask & WRITEMASK_XYZ) <<
- R300_ALU_DSTC_REG_MASK_SHIFT;
-
- cs->hwtemps[hwdest].vector_valid = pos + 1;
+ tgt[1].SrcReg[1].Swizzle = SWIZZLE_0000;
+ tgt[1].SrcReg[2].Swizzle = SWIZZLE_1111;
}
- }
+ } else if (destredirect) {
+ tgt = radeonClauseInsertInstructions(context->compiler, context->dest,
+ context->dest->NumInstructions, 1);
- /* And now ALU/3 */
- if (emit_sop) {
- fp->alu.inst[pos].inst2 |= sop;
-
- if (mask & WRITEMASK_W) {
- if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
- if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
- fp->alu.inst[pos].inst3 |=
- (hwdest << R300_ALU_DSTA_SHIFT) |
- R300_ALU_DSTA_OUTPUT;
- } else if (REG_GET_INDEX(dest) ==
- FRAG_RESULT_DEPR) {
- fp->alu.inst[pos].inst3 |=
- R300_ALU_DSTA_DEPTH;
- } else
- assert(0);
- } else {
- fp->alu.inst[pos].inst3 |=
- (hwdest << R300_ALU_DSTA_SHIFT) |
- R300_ALU_DSTA_REG;
-
- cs->hwtemps[hwdest].scalar_valid = pos + 1;
- }
- }
+ tgt->Opcode = OPCODE_MOV;
+ tgt->DstReg = orig_inst->DstReg;
+ tgt->SrcReg[0].File = PROGRAM_TEMPORARY;
+ tgt->SrcReg[0].Index = inst.DstReg.Index;
}
- return;
+ return GL_TRUE;
}
-#if 0
-static GLuint get_attrib(struct r300_fragment_program *fp, GLuint attr)
+
+static void update_params(r300ContextPtr r300, struct r300_fragment_program *fp)
{
struct gl_fragment_program *mp = &fp->mesa_program;
- GLuint r = undef;
- if (!(mp->Base.InputsRead & (1 << attr))) {
- ERROR("Attribute %d was not provided!\n", attr);
- return undef;
- }
-
- REG_SET_TYPE(r, REG_TYPE_INPUT);
- REG_SET_INDEX(r, attr);
- REG_SET_VALID(r, GL_TRUE);
- return r;
+ /* Ask Mesa nicely to fill in ParameterValues for us */
+ if (mp->Base.Parameters)
+ _mesa_load_state_parameters(r300->radeon.glCtx, mp->Base.Parameters);
}
-#endif
-
-static GLfloat SinCosConsts[2][4] = {
- {
- 1.273239545, // 4/PI
- -0.405284735, // -4/(PI*PI)
- 3.141592654, // PI
- 0.2225 // weight
- },
- {
- 0.75,
- 0.0,
- 0.159154943, // 1/(2*PI)
- 6.283185307 // 2*PI
- }
-};
+
/**
- * Emit a LIT instruction.
- * \p flags may be PFS_FLAG_SAT
+ * Transform the program to support fragment.position.
*
- * Definition of LIT (from ARB_fragment_program):
- * tmp = VectorLoad(op0);
- * if (tmp.x < 0) tmp.x = 0;
- * if (tmp.y < 0) tmp.y = 0;
- * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
- * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
- * result.x = 1.0;
- * result.y = tmp.x;
- * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
- * result.w = 1.0;
+ * Introduce a small fragment at the start of the program that will be
+ * the only code that directly reads the FRAG_ATTRIB_WPOS input.
+ * All other code pieces that reference that input will be rewritten
+ * to read from a newly allocated temporary.
*
- * The longest path of computation is the one leading to result.z,
- * consisting of 5 operations. This implementation of LIT takes
- * 5 slots. So unless there's some special undocumented opcode,
- * this implementation is potentially optimal. Unfortunately,
- * emit_arith is a bit too conservative because it doesn't understand
- * partial writes to the vector component.
+ * \todo if/when r5xx supports the radeon_program architecture, this is a
+ * likely candidate for code sharing.
*/
-static const GLfloat LitConst[4] =
- { 127.999999, 127.999999, 127.999999, -127.999999 };
-
-static void emit_lit(struct r300_fragment_program *fp,
- GLuint dest, int mask, GLuint src, int flags)
+static void insert_WPOS_trailer(struct r300_fragment_program_compiler *compiler)
{
- COMPILE_STATE;
- GLuint cnst;
- int needTemporary;
- GLuint temp;
-
- cnst = emit_const4fv(fp, LitConst);
-
- needTemporary = 0;
- if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) {
- needTemporary = 1;
- } else if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
- // LIT is typically followed by DP3/DP4, so there's no point
- // in creating special code for this case
- needTemporary = 1;
- }
-
- if (needTemporary) {
- temp = keep(get_temp_reg(fp));
- } else {
- temp = keep(dest);
- }
-
- // Note: The order of emit_arith inside the slots is relevant,
- // because emit_arith only looks at scalar vs. vector when resolving
- // dependencies, and it does not consider individual vector components,
- // so swizzling between the two parts can create fake dependencies.
-
- // First slot
- emit_arith(fp, PFS_OP_MAX, temp, WRITEMASK_XY,
- keep(src), pfs_zero, undef, 0);
- emit_arith(fp, PFS_OP_MAX, temp, WRITEMASK_W, src, cnst, undef, 0);
-
- // Second slot
- emit_arith(fp, PFS_OP_MIN, temp, WRITEMASK_Z,
- swizzle(temp, W, W, W, W), cnst, undef, 0);
- emit_arith(fp, PFS_OP_LG2, temp, WRITEMASK_W,
- swizzle(temp, Y, Y, Y, Y), undef, undef, 0);
-
- // Third slot
- // If desired, we saturate the y result here.
- // This does not affect the use as a condition variable in the CMP later
- emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_W,
- temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0);
- emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_Y,
- swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags);
-
- // Fourth slot
- emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_X,
- pfs_one, pfs_one, pfs_zero, 0);
- emit_arith(fp, PFS_OP_EX2, temp, WRITEMASK_W, temp, undef, undef, 0);
-
- // Fifth slot
- emit_arith(fp, PFS_OP_CMP, temp, WRITEMASK_Z,
- pfs_zero, swizzle(temp, W, W, W, W),
- negate(swizzle(temp, Y, Y, Y, Y)), flags);
- emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_W, pfs_one, pfs_one,
- pfs_zero, 0);
-
- if (needTemporary) {
- emit_arith(fp, PFS_OP_MAD, dest, mask,
- temp, pfs_one, pfs_zero, flags);
- free_temp(fp, temp);
- } else {
- // Decrease refcount of the destination
- t_hw_dst(fp, dest, GL_FALSE, cs->nrslots);
- }
-}
-
-static GLboolean parse_program(struct r300_fragment_program *fp)
-{
- struct gl_fragment_program *mp = &fp->mesa_program;
- const struct prog_instruction *inst = mp->Base.Instructions;
- struct prog_instruction *fpi;
- GLuint src[3], dest, temp[2];
- int flags, mask = 0;
- int const_sin[2];
+ GLuint InputsRead = compiler->fp->mesa_program.Base.InputsRead;
- if (!inst || inst[0].Opcode == OPCODE_END) {
- ERROR("empty program?\n");
- return GL_FALSE;
- }
-
- for (fpi = mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
- if (fpi->SaturateMode == SATURATE_ZERO_ONE)
- flags = PFS_FLAG_SAT;
- else
- flags = 0;
-
- if (fpi->Opcode != OPCODE_KIL) {
- dest = t_dst(fp, fpi->DstReg);
- mask = fpi->DstReg.WriteMask;
- }
-
- switch (fpi->Opcode) {
- case OPCODE_ABS:
- src[0] = t_src(fp, fpi->SrcReg[0]);
- emit_arith(fp, PFS_OP_MAD, dest, mask,
- absolute(src[0]), pfs_one, pfs_zero, flags);
- break;
- case OPCODE_ADD:
- src[0] = t_src(fp, fpi->SrcReg[0]);
- src[1] = t_src(fp, fpi->SrcReg[1]);
- emit_arith(fp, PFS_OP_MAD, dest, mask,
- src[0], pfs_one, src[1], flags);
- break;
- case OPCODE_CMP:
- src[0] = t_src(fp, fpi->SrcReg[0]);
- src[1] = t_src(fp, fpi->SrcReg[1]);
- src[2] = t_src(fp, fpi->SrcReg[2]);
- /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
- * r300 - if src2.c < 0.0 ? src1.c : src0.c
- */
- emit_arith(fp, PFS_OP_CMP, dest, mask,
- src[2], src[1], src[0], flags);
- break;
- case OPCODE_COS:
- /*
- * cos using a parabola (see SIN):
- * cos(x):
- * x = (x/(2*PI))+0.75
- * x = frac(x)
- * x = (x*2*PI)-PI
- * result = sin(x)
- */
- temp[0] = get_temp_reg(fp);
- const_sin[0] = emit_const4fv(fp, SinCosConsts[0]);
- const_sin[1] = emit_const4fv(fp, SinCosConsts[1]);
- src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
-
- /* add 0.5*PI and do range reduction */
-
- emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
- swizzle(src[0], X, X, X, X),
- swizzle(const_sin[1], Z, Z, Z, Z),
- swizzle(const_sin[1], X, X, X, X), 0);
-
- emit_arith(fp, PFS_OP_FRC, temp[0], WRITEMASK_X,
- swizzle(temp[0], X, X, X, X),
- undef, undef, 0);
-
- emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W), //2*PI
- negate(swizzle(const_sin[0], Z, Z, Z, Z)), //-PI
- 0);
-
- /* SIN */
-
- emit_arith(fp, PFS_OP_MAD, temp[0],
- WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
- Z, Z, Z,
- Z),
- const_sin[0], pfs_zero, 0);
-
- emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
- swizzle(temp[0], Y, Y, Y, Y),
- absolute(swizzle(temp[0], Z, Z, Z, Z)),
- swizzle(temp[0], X, X, X, X), 0);
-
- emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
- swizzle(temp[0], X, X, X, X),
- absolute(swizzle(temp[0], X, X, X, X)),
- negate(swizzle(temp[0], X, X, X, X)), 0);
-
- emit_arith(fp, PFS_OP_MAD, dest, mask,
- swizzle(temp[0], Y, Y, Y, Y),
- swizzle(const_sin[0], W, W, W, W),
- swizzle(temp[0], X, X, X, X), flags);
-
- free_temp(fp, temp[0]);
- break;
- case OPCODE_DP3:
- src[0] = t_src(fp, fpi->SrcReg[0]);
- src[1] = t_src(fp, fpi->SrcReg[1]);
- emit_arith(fp, PFS_OP_DP3, dest, mask,
- src[0], src[1], undef, flags);
- break;
- case OPCODE_DP4:
- src[0] = t_src(fp, fpi->SrcReg[0]);
- src[1] = t_src(fp, fpi->SrcReg[1]);
- emit_arith(fp, PFS_OP_DP4, dest, mask,
- src[0], src[1], undef, flags);
- break;
- case OPCODE_DPH:
- src[0] = t_src(fp, fpi->SrcReg[0]);
- src[1] = t_src(fp, fpi->SrcReg[1]);
- /* src0.xyz1 -> temp
- * DP4 dest, temp, src1
- */
-#if 0
- temp[0] = get_temp_reg(fp);
- src[0].s_swz = SWIZZLE_ONE;
- emit_arith(fp, PFS_OP_MAD, temp[0], mask,
- src[0], pfs_one, pfs_zero, 0);
- emit_arith(fp, PFS_OP_DP4, dest, mask,
- temp[0], src[1], undef, flags);
- free_temp(fp, temp[0]);
-#else
- emit_arith(fp, PFS_OP_DP4, dest, mask,
- swizzle(src[0], X, Y, Z, ONE), src[1],
- undef, flags);
-#endif
- break;
- case OPCODE_DST:
- src[0] = t_src(fp, fpi->SrcReg[0]);
- src[1] = t_src(fp, fpi->SrcReg[1]);
- /* dest.y = src0.y * src1.y */
- if (mask & WRITEMASK_Y)
- emit_arith(fp, PFS_OP_MAD, dest, WRITEMASK_Y,
- keep(src[0]), keep(src[1]),
- pfs_zero, flags);
- /* dest.z = src0.z */
- if (mask & WRITEMASK_Z)
- emit_arith(fp, PFS_OP_MAD, dest, WRITEMASK_Z,
- src[0], pfs_one, pfs_zero, flags);
- /* result.x = 1.0
- * result.w = src1.w */
- if (mask & WRITEMASK_XW) {
- REG_SET_VSWZ(src[1], SWIZZLE_111); /*Cheat */
- emit_arith(fp, PFS_OP_MAD, dest,
- mask & WRITEMASK_XW,
- src[1], pfs_one, pfs_zero, flags);
- }
- break;
- case OPCODE_EX2:
- src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
- emit_arith(fp, PFS_OP_EX2, dest, mask,
- src[0], undef, undef, flags);
- break;
- case OPCODE_FLR:
- src[0] = t_src(fp, fpi->SrcReg[0]);
- temp[0] = get_temp_reg(fp);
- /* FRC temp, src0
- * MAD dest, src0, 1.0, -temp
- */
- emit_arith(fp, PFS_OP_FRC, temp[0], mask,
- keep(src[0]), undef, undef, 0);
- emit_arith(fp, PFS_OP_MAD, dest, mask,
- src[0], pfs_one, negate(temp[0]), flags);
- free_temp(fp, temp[0]);
- break;
- case OPCODE_FRC:
- src[0] = t_src(fp, fpi->SrcReg[0]);
- emit_arith(fp, PFS_OP_FRC, dest, mask,
- src[0], undef, undef, flags);
- break;
- case OPCODE_KIL:
- emit_tex(fp, fpi, R300_TEX_OP_KIL);
- break;
- case OPCODE_LG2:
- src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
- emit_arith(fp, PFS_OP_LG2, dest, mask,
- src[0], undef, undef, flags);
- break;
- case OPCODE_LIT:
- src[0] = t_src(fp, fpi->SrcReg[0]);
- emit_lit(fp, dest, mask, src[0], flags);
- break;
- case OPCODE_LRP:
- src[0] = t_src(fp, fpi->SrcReg[0]);
- src[1] = t_src(fp, fpi->SrcReg[1]);
- src[2] = t_src(fp, fpi->SrcReg[2]);
- /* result = tmp0tmp1 + (1 - tmp0)tmp2
- * = tmp0tmp1 + tmp2 + (-tmp0)tmp2
- * MAD temp, -tmp0, tmp2, tmp2
- * MAD result, tmp0, tmp1, temp
- */
- temp[0] = get_temp_reg(fp);
- emit_arith(fp, PFS_OP_MAD, temp[0], mask,
- negate(keep(src[0])), keep(src[2]), src[2],
- 0);
- emit_arith(fp, PFS_OP_MAD, dest, mask,
- src[0], src[1], temp[0], flags);
- free_temp(fp, temp[0]);
- break;
- case OPCODE_MAD:
- src[0] = t_src(fp, fpi->SrcReg[0]);
- src[1] = t_src(fp, fpi->SrcReg[1]);
- src[2] = t_src(fp, fpi->SrcReg[2]);
- emit_arith(fp, PFS_OP_MAD, dest, mask,
- src[0], src[1], src[2], flags);
- break;
- case OPCODE_MAX:
- src[0] = t_src(fp, fpi->SrcReg[0]);
- src[1] = t_src(fp, fpi->SrcReg[1]);
- emit_arith(fp, PFS_OP_MAX, dest, mask,
- src[0], src[1], undef, flags);
- break;
- case OPCODE_MIN:
- src[0] = t_src(fp, fpi->SrcReg[0]);
- src[1] = t_src(fp, fpi->SrcReg[1]);
- emit_arith(fp, PFS_OP_MIN, dest, mask,
- src[0], src[1], undef, flags);
- break;
- case OPCODE_MOV:
- case OPCODE_SWZ:
- src[0] = t_src(fp, fpi->SrcReg[0]);
- emit_arith(fp, PFS_OP_MAD, dest, mask,
- src[0], pfs_one, pfs_zero, flags);
- break;
- case OPCODE_MUL:
- src[0] = t_src(fp, fpi->SrcReg[0]);
- src[1] = t_src(fp, fpi->SrcReg[1]);
- emit_arith(fp, PFS_OP_MAD, dest, mask,
- src[0], src[1], pfs_zero, flags);
- break;
- case OPCODE_POW:
- src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
- src[1] = t_scalar_src(fp, fpi->SrcReg[1]);
- temp[0] = get_temp_reg(fp);
- emit_arith(fp, PFS_OP_LG2, temp[0], WRITEMASK_W,
- src[0], undef, undef, 0);
- emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_W,
- temp[0], src[1], pfs_zero, 0);
- emit_arith(fp, PFS_OP_EX2, dest, fpi->DstReg.WriteMask,
- temp[0], undef, undef, 0);
- free_temp(fp, temp[0]);
- break;
- case OPCODE_RCP:
- src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
- emit_arith(fp, PFS_OP_RCP, dest, mask,
- src[0], undef, undef, flags);
- break;
- case OPCODE_RSQ:
- src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
- emit_arith(fp, PFS_OP_RSQ, dest, mask,
- absolute(src[0]), pfs_zero, pfs_zero, flags);
- break;
- case OPCODE_SCS:
- /*
- * scs using a parabola :
- * scs(x):
- * result.x = sin(-abs(x)+0.5*PI) (cos)
- * result.y = sin(x) (sin)
- *
- */
- temp[0] = get_temp_reg(fp);
- temp[1] = get_temp_reg(fp);
- const_sin[0] = emit_const4fv(fp, SinCosConsts[0]);
- const_sin[1] = emit_const4fv(fp, SinCosConsts[1]);
- src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
-
- /* x = -abs(x)+0.5*PI */
- emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(const_sin[0], Z, Z, Z, Z), //PI
- pfs_half,
- negate(abs
- (swizzle(keep(src[0]), X, X, X, X))),
- 0);
-
- /* C*x (sin) */
- emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_W,
- swizzle(const_sin[0], Y, Y, Y, Y),
- swizzle(keep(src[0]), X, X, X, X),
- pfs_zero, 0);
-
- /* B*x, C*x (cos) */
- emit_arith(fp, PFS_OP_MAD, temp[0],
- WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
- Z, Z, Z,
- Z),
- const_sin[0], pfs_zero, 0);
-
- /* B*x (sin) */
- emit_arith(fp, PFS_OP_MAD, temp[1], WRITEMASK_W,
- swizzle(const_sin[0], X, X, X, X),
- keep(src[0]), pfs_zero, 0);
-
- /* y = B*x + C*x*abs(x) (sin) */
- emit_arith(fp, PFS_OP_MAD, temp[1], WRITEMASK_Z,
- absolute(src[0]),
- swizzle(temp[0], W, W, W, W),
- swizzle(temp[1], W, W, W, W), 0);
-
- /* y = B*x + C*x*abs(x) (cos) */
- emit_arith(fp, PFS_OP_MAD, temp[1], WRITEMASK_W,
- swizzle(temp[0], Y, Y, Y, Y),
- absolute(swizzle(temp[0], Z, Z, Z, Z)),
- swizzle(temp[0], X, X, X, X), 0);
-
- /* y*abs(y) - y (cos), y*abs(y) - y (sin) */
- emit_arith(fp, PFS_OP_MAD, temp[0],
- WRITEMASK_X | WRITEMASK_Y, swizzle(temp[1],
- W, Z, Y,
- X),
- absolute(swizzle(temp[1], W, Z, Y, X)),
- negate(swizzle(temp[1], W, Z, Y, X)), 0);
-
- /* dest.xy = mad(temp.xy, P, temp2.wz) */
- emit_arith(fp, PFS_OP_MAD, dest,
- mask & (WRITEMASK_X | WRITEMASK_Y), temp[0],
- swizzle(const_sin[0], W, W, W, W),
- swizzle(temp[1], W, Z, Y, X), flags);
-
- free_temp(fp, temp[0]);
- free_temp(fp, temp[1]);
- break;
- case OPCODE_SGE:
- src[0] = t_src(fp, fpi->SrcReg[0]);
- src[1] = t_src(fp, fpi->SrcReg[1]);
- temp[0] = get_temp_reg(fp);
- /* temp = src0 - src1
- * dest.c = (temp.c < 0.0) ? 0 : 1
- */
- emit_arith(fp, PFS_OP_MAD, temp[0], mask,
- src[0], pfs_one, negate(src[1]), 0);
- emit_arith(fp, PFS_OP_CMP, dest, mask,
- pfs_one, pfs_zero, temp[0], 0);
- free_temp(fp, temp[0]);
- break;
- case OPCODE_SIN:
- /*
- * using a parabola:
- * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
- * extra precision is obtained by weighting against
- * itself squared.
- */
-
- temp[0] = get_temp_reg(fp);
- const_sin[0] = emit_const4fv(fp, SinCosConsts[0]);
- const_sin[1] = emit_const4fv(fp, SinCosConsts[1]);
- src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
-
- /* do range reduction */
-
- emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
- swizzle(keep(src[0]), X, X, X, X),
- swizzle(const_sin[1], Z, Z, Z, Z),
- pfs_half, 0);
-
- emit_arith(fp, PFS_OP_FRC, temp[0], WRITEMASK_X,
- swizzle(temp[0], X, X, X, X),
- undef, undef, 0);
-
- emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W), //2*PI
- negate(swizzle(const_sin[0], Z, Z, Z, Z)), //PI
- 0);
-
- /* SIN */
-
- emit_arith(fp, PFS_OP_MAD, temp[0],
- WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
- Z, Z, Z,
- Z),
- const_sin[0], pfs_zero, 0);
-
- emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
- swizzle(temp[0], Y, Y, Y, Y),
- absolute(swizzle(temp[0], Z, Z, Z, Z)),
- swizzle(temp[0], X, X, X, X), 0);
-
- emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
- swizzle(temp[0], X, X, X, X),
- absolute(swizzle(temp[0], X, X, X, X)),
- negate(swizzle(temp[0], X, X, X, X)), 0);
-
- emit_arith(fp, PFS_OP_MAD, dest, mask,
- swizzle(temp[0], Y, Y, Y, Y),
- swizzle(const_sin[0], W, W, W, W),
- swizzle(temp[0], X, X, X, X), flags);
-
- free_temp(fp, temp[0]);
- break;
- case OPCODE_SLT:
- src[0] = t_src(fp, fpi->SrcReg[0]);
- src[1] = t_src(fp, fpi->SrcReg[1]);
- temp[0] = get_temp_reg(fp);
- /* temp = src0 - src1
- * dest.c = (temp.c < 0.0) ? 1 : 0
- */
- emit_arith(fp, PFS_OP_MAD, temp[0], mask,
- src[0], pfs_one, negate(src[1]), 0);
- emit_arith(fp, PFS_OP_CMP, dest, mask,
- pfs_zero, pfs_one, temp[0], 0);
- free_temp(fp, temp[0]);
- break;
- case OPCODE_SUB:
- src[0] = t_src(fp, fpi->SrcReg[0]);
- src[1] = t_src(fp, fpi->SrcReg[1]);
- emit_arith(fp, PFS_OP_MAD, dest, mask,
- src[0], pfs_one, negate(src[1]), flags);
- break;
- case OPCODE_TEX:
- emit_tex(fp, fpi, R300_TEX_OP_LD);
- break;
- case OPCODE_TXB:
- emit_tex(fp, fpi, R300_TEX_OP_TXB);
- break;
- case OPCODE_TXP:
- emit_tex(fp, fpi, R300_TEX_OP_TXP);
- break;
- case OPCODE_XPD:{
- src[0] = t_src(fp, fpi->SrcReg[0]);
- src[1] = t_src(fp, fpi->SrcReg[1]);
- temp[0] = get_temp_reg(fp);
- /* temp = src0.zxy * src1.yzx */
- emit_arith(fp, PFS_OP_MAD, temp[0],
- WRITEMASK_XYZ, swizzle(keep(src[0]),
- Z, X, Y, W),
- swizzle(keep(src[1]), Y, Z, X, W),
- pfs_zero, 0);
- /* dest.xyz = src0.yzx * src1.zxy - temp
- * dest.w = undefined
- * */
- emit_arith(fp, PFS_OP_MAD, dest,
- mask & WRITEMASK_XYZ, swizzle(src[0],
- Y, Z,
- X, W),
- swizzle(src[1], Z, X, Y, W),
- negate(temp[0]), flags);
- /* cleanup */
- free_temp(fp, temp[0]);
- break;
- }
- default:
- ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
- break;
- }
-
- if (fp->error)
- return GL_FALSE;
-
- }
-
- return GL_TRUE;
-}
+ if (!(InputsRead & FRAG_BIT_WPOS))
+ return;
-static void insert_wpos(struct gl_program *prog)
-{
static gl_state_index tokens[STATE_LENGTH] = {
STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
};
struct prog_instruction *fpi;
GLuint window_index;
int i = 0;
- GLuint tempregi = prog->NumTemporaries;
- /* should do something else if no temps left... */
- prog->NumTemporaries++;
+ GLuint tempregi = radeonCompilerAllocateTemporary(&compiler->compiler);
- fpi = _mesa_alloc_instructions(prog->NumInstructions + 3);
- _mesa_init_instructions(fpi, prog->NumInstructions + 3);
+ fpi = radeonClauseInsertInstructions(&compiler->compiler, &compiler->compiler.Clauses[0], 0, 3);
/* perspective divide */
fpi[i].Opcode = OPCODE_RCP;
@@ -2041,7 +297,7 @@ static void insert_wpos(struct gl_program *prog)
i++;
/* viewport transformation */
- window_index = _mesa_add_state_reference(prog->Parameters, tokens);
+ window_index = _mesa_add_state_reference(compiler->fp->mesa_program.Base.Parameters, tokens);
fpi[i].Opcode = OPCODE_MAD;
@@ -2066,203 +322,114 @@ static void insert_wpos(struct gl_program *prog)
MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
i++;
- _mesa_copy_instructions(&fpi[i], prog->Instructions,
- prog->NumInstructions);
-
- free(prog->Instructions);
-
- prog->Instructions = fpi;
-
- prog->NumInstructions += i;
- fpi = &prog->Instructions[prog->NumInstructions - 1];
-
- assert(fpi->Opcode == OPCODE_END);
-
- for (fpi = &prog->Instructions[3]; fpi->Opcode != OPCODE_END; fpi++) {
- for (i = 0; i < 3; i++)
- if (fpi->SrcReg[i].File == PROGRAM_INPUT &&
- fpi->SrcReg[i].Index == FRAG_ATTRIB_WPOS) {
- fpi->SrcReg[i].File = PROGRAM_TEMPORARY;
- fpi->SrcReg[i].Index = tempregi;
+ for (; i < compiler->compiler.Clauses[0].NumInstructions; ++i) {
+ int reg;
+ for (reg = 0; reg < 3; reg++) {
+ if (fpi[i].SrcReg[reg].File == PROGRAM_INPUT &&
+ fpi[i].SrcReg[reg].Index == FRAG_ATTRIB_WPOS) {
+ fpi[i].SrcReg[reg].File = PROGRAM_TEMPORARY;
+ fpi[i].SrcReg[reg].Index = tempregi;
}
- }
-}
-
-/* - Init structures
- * - Determine what hwregs each input corresponds to
- */
-static void init_program(r300ContextPtr r300, struct r300_fragment_program *fp)
-{
- struct r300_pfs_compile_state *cs = NULL;
- struct gl_fragment_program *mp = &fp->mesa_program;
- struct prog_instruction *fpi;
- GLuint InputsRead = mp->Base.InputsRead;
- GLuint temps_used = 0; /* for fp->temps[] */
- int i, j;
-
- /* New compile, reset tracking data */
- fp->optimization =
- driQueryOptioni(&r300->radeon.optionCache, "fp_optimization");
- fp->translated = GL_FALSE;
- fp->error = GL_FALSE;
- fp->cs = cs = &(R300_CONTEXT(fp->ctx)->state.pfs_compile);
- fp->WritesDepth = GL_FALSE;
- fp->tex.length = 0;
- fp->cur_node = 0;
- fp->first_node_has_tex = 0;
- fp->const_nr = 0;
- fp->max_temp_idx = 0;
- fp->node[0].alu_end = -1;
- fp->node[0].tex_end = -1;
-
- _mesa_memset(cs, 0, sizeof(*fp->cs));
- for (i = 0; i < PFS_MAX_ALU_INST; i++) {
- for (j = 0; j < 3; j++) {
- cs->slot[i].vsrc[j] = SRC_CONST;
- cs->slot[i].ssrc[j] = SRC_CONST;
}
}
+}
- /* Work out what temps the Mesa inputs correspond to, this must match
- * what setup_rs_unit does, which shouldn't be a problem as rs_unit
- * configures itself based on the fragprog's InputsRead
- *
- * NOTE: this depends on get_hw_temp() allocating registers in order,
- * starting from register 0.
- */
- /* Texcoords come first */
- for (i = 0; i < fp->ctx->Const.MaxTextureUnits; i++) {
- if (InputsRead & (FRAG_BIT_TEX0 << i)) {
- cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0;
- cs->inputs[FRAG_ATTRIB_TEX0 + i].reg =
- get_hw_temp(fp, 0);
- }
+static GLuint build_dtm(GLuint depthmode)
+{
+ switch(depthmode) {
+ default:
+ case GL_LUMINANCE: return 0;
+ case GL_INTENSITY: return 1;
+ case GL_ALPHA: return 2;
}
- InputsRead &= ~FRAG_BITS_TEX_ANY;
+}
- /* fragment position treated as a texcoord */
- if (InputsRead & FRAG_BIT_WPOS) {
- cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
- cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(fp, 0);
- insert_wpos(&mp->Base);
- }
- InputsRead &= ~FRAG_BIT_WPOS;
+static GLuint build_func(GLuint comparefunc)
+{
+ return comparefunc - GL_NEVER;
+}
- /* Then primary colour */
- if (InputsRead & FRAG_BIT_COL0) {
- cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
- cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(fp, 0);
- }
- InputsRead &= ~FRAG_BIT_COL0;
- /* Secondary color */
- if (InputsRead & FRAG_BIT_COL1) {
- cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
- cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(fp, 0);
- }
- InputsRead &= ~FRAG_BIT_COL1;
-
- /* Anything else */
- if (InputsRead) {
- WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead);
- /* force read from hwreg 0 for now */
- for (i = 0; i < 32; i++)
- if (InputsRead & (1 << i))
- cs->inputs[i].reg = 0;
- }
+/**
+ * Collect all external state that is relevant for compiling the given
+ * fragment program.
+ */
+static void build_state(
+ r300ContextPtr r300,
+ struct r300_fragment_program *fp,
+ struct r300_fragment_program_external_state *state)
+{
+ int unit;
- /* Pre-parse the mesa program, grabbing refcounts on input/temp regs.
- * That way, we can free up the reg when it's no longer needed
- */
- if (!mp->Base.Instructions) {
- ERROR("No instructions found in program\n");
- return;
- }
+ _mesa_bzero(state, sizeof(*state));
- for (fpi = mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
- int idx;
-
- for (i = 0; i < 3; i++) {
- idx = fpi->SrcReg[i].Index;
- switch (fpi->SrcReg[i].File) {
- case PROGRAM_TEMPORARY:
- if (!(temps_used & (1 << idx))) {
- cs->temps[idx].reg = -1;
- cs->temps[idx].refcount = 1;
- temps_used |= (1 << idx);
- } else
- cs->temps[idx].refcount++;
- break;
- case PROGRAM_INPUT:
- cs->inputs[idx].refcount++;
- break;
- default:
- break;
- }
- }
+ for(unit = 0; unit < 16; ++unit) {
+ if (fp->mesa_program.Base.ShadowSamplers & (1 << unit)) {
+ struct gl_texture_object* tex = r300->radeon.glCtx->Texture.Unit[unit]._Current;
- idx = fpi->DstReg.Index;
- if (fpi->DstReg.File == PROGRAM_TEMPORARY) {
- if (!(temps_used & (1 << idx))) {
- cs->temps[idx].reg = -1;
- cs->temps[idx].refcount = 1;
- temps_used |= (1 << idx);
- } else
- cs->temps[idx].refcount++;
+ state->unit[unit].depth_texture_mode = build_dtm(tex->DepthMode);
+ state->unit[unit].texture_compare_func = build_func(tex->CompareFunc);
}
}
- cs->temp_in_use = temps_used;
}
-static void update_params(struct r300_fragment_program *fp)
-{
- struct gl_fragment_program *mp = &fp->mesa_program;
-
- /* Ask Mesa nicely to fill in ParameterValues for us */
- if (mp->Base.Parameters)
- _mesa_load_state_parameters(fp->ctx, mp->Base.Parameters);
-}
void r300TranslateFragmentShader(r300ContextPtr r300,
struct r300_fragment_program *fp)
{
+ struct r300_fragment_program_external_state state;
- struct r300_pfs_compile_state *cs = NULL;
+ build_state(r300, fp, &state);
+ if (_mesa_memcmp(&fp->state, &state, sizeof(state))) {
+ /* TODO: cache compiled programs */
+ fp->translated = GL_FALSE;
+ _mesa_memcpy(&fp->state, &state, sizeof(state));
+ }
if (!fp->translated) {
+ struct r300_fragment_program_compiler compiler;
+
+ compiler.r300 = r300;
+ compiler.fp = fp;
+ compiler.code = &fp->code;
- init_program(r300, fp);
- cs = fp->cs;
+ radeonCompilerInit(&compiler.compiler, r300->radeon.glCtx, &fp->mesa_program.Base);
- if (parse_program(fp) == GL_FALSE) {
- dump_program(fp);
- return;
+ insert_WPOS_trailer(&compiler);
+
+ struct radeon_program_transformation transformations[] = {
+ { &transform_TEX, &compiler },
+ { &radeonTransformALU, 0 }
+ };
+ radeonClauseLocalTransform(&compiler.compiler,
+ &compiler.compiler.Clauses[0],
+ 2, transformations);
+
+ if (RADEON_DEBUG & DEBUG_PIXEL) {
+ _mesa_printf("Compiler state after transformations:\n");
+ radeonCompilerDump(&compiler.compiler);
}
- /* Finish off */
- fp->node[fp->cur_node].alu_end =
- cs->nrslots - fp->node[fp->cur_node].alu_offset - 1;
- if (fp->node[fp->cur_node].tex_end < 0)
- fp->node[fp->cur_node].tex_end = 0;
- fp->alu_offset = 0;
- fp->alu_end = cs->nrslots - 1;
- fp->tex_offset = 0;
- fp->tex_end = fp->tex.length ? fp->tex.length - 1 : 0;
- assert(fp->node[fp->cur_node].alu_end >= 0);
- assert(fp->alu_end >= 0);
-
- fp->translated = GL_TRUE;
- if (RADEON_DEBUG & DEBUG_PIXEL)
- dump_program(fp);
- r300UpdateStateParameters(fp->ctx, _NEW_PROGRAM);
+ if (!r300FragmentProgramEmit(&compiler))
+ fp->error = GL_TRUE;
+
+ radeonCompilerCleanup(&compiler.compiler);
+
+ if (!fp->error)
+ fp->translated = GL_TRUE;
+ if (fp->error || (RADEON_DEBUG & DEBUG_PIXEL))
+ r300FragmentProgramDump(fp, &fp->code);
+ r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM);
}
- update_params(fp);
+ update_params(r300, fp);
}
/* just some random things... */
-static void dump_program(struct r300_fragment_program *fp)
+void r300FragmentProgramDump(
+ struct r300_fragment_program *fp,
+ struct r300_fragment_program_code *code)
{
int n, i, j;
static int pc = 0;
@@ -2277,21 +444,21 @@ static void dump_program(struct r300_fragment_program *fp)
fprintf(stderr, "Hardware program\n");
fprintf(stderr, "----------------\n");
- for (n = 0; n < (fp->cur_node + 1); n++) {
+ for (n = 0; n < (code->cur_node + 1); n++) {
fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "
"alu_end: %d, tex_end: %d\n", n,
- fp->node[n].alu_offset,
- fp->node[n].tex_offset,
- fp->node[n].alu_end, fp->node[n].tex_end);
+ code->node[n].alu_offset,
+ code->node[n].tex_offset,
+ code->node[n].alu_end, code->node[n].tex_end);
- if (fp->tex.length) {
+ if (code->tex.length) {
fprintf(stderr, " TEX:\n");
- for (i = fp->node[n].tex_offset;
- i <= fp->node[n].tex_offset + fp->node[n].tex_end;
+ for (i = code->node[n].tex_offset;
+ i <= code->node[n].tex_offset + code->node[n].tex_end;
++i) {
const char *instr;
- switch ((fp->tex.
+ switch ((code->tex.
inst[i] >> R300_TEX_INST_SHIFT) &
15) {
case R300_TEX_OP_LD:
@@ -2313,20 +480,20 @@ static void dump_program(struct r300_fragment_program *fp)
fprintf(stderr,
" %s t%i, %c%i, texture[%i] (%08x)\n",
instr,
- (fp->tex.
+ (code->tex.
inst[i] >> R300_DST_ADDR_SHIFT) & 31,
't',
- (fp->tex.
+ (code->tex.
inst[i] >> R300_SRC_ADDR_SHIFT) & 31,
- (fp->tex.
+ (code->tex.
inst[i] & R300_TEX_ID_MASK) >>
R300_TEX_ID_SHIFT,
- fp->tex.inst[i]);
+ code->tex.inst[i]);
}
}
- for (i = fp->node[n].alu_offset;
- i <= fp->node[n].alu_offset + fp->node[n].alu_end; ++i) {
+ for (i = code->node[n].alu_offset;
+ i <= code->node[n].alu_offset + code->node[n].alu_end; ++i) {
char srcc[3][10], dstc[20];
char srca[3][10], dsta[20];
char argc[3][20];
@@ -2334,8 +501,8 @@ static void dump_program(struct r300_fragment_program *fp)
char flags[5], tmp[10];
for (j = 0; j < 3; ++j) {
- int regc = fp->alu.inst[i].inst1 >> (j * 6);
- int rega = fp->alu.inst[i].inst3 >> (j * 6);
+ int regc = code->alu.inst[i].inst1 >> (j * 6);
+ int rega = code->alu.inst[i].inst3 >> (j * 6);
sprintf(srcc[j], "%c%i",
(regc & 32) ? 'c' : 't', regc & 31);
@@ -2345,46 +512,46 @@ static void dump_program(struct r300_fragment_program *fp)
dstc[0] = 0;
sprintf(flags, "%s%s%s",
- (fp->alu.inst[i].
+ (code->alu.inst[i].
inst1 & R300_ALU_DSTC_REG_X) ? "x" : "",
- (fp->alu.inst[i].
+ (code->alu.inst[i].
inst1 & R300_ALU_DSTC_REG_Y) ? "y" : "",
- (fp->alu.inst[i].
+ (code->alu.inst[i].
inst1 & R300_ALU_DSTC_REG_Z) ? "z" : "");
if (flags[0] != 0) {
sprintf(dstc, "t%i.%s ",
- (fp->alu.inst[i].
+ (code->alu.inst[i].
inst1 >> R300_ALU_DSTC_SHIFT) & 31,
flags);
}
sprintf(flags, "%s%s%s",
- (fp->alu.inst[i].
+ (code->alu.inst[i].
inst1 & R300_ALU_DSTC_OUTPUT_X) ? "x" : "",
- (fp->alu.inst[i].
+ (code->alu.inst[i].
inst1 & R300_ALU_DSTC_OUTPUT_Y) ? "y" : "",
- (fp->alu.inst[i].
+ (code->alu.inst[i].
inst1 & R300_ALU_DSTC_OUTPUT_Z) ? "z" : "");
if (flags[0] != 0) {
sprintf(tmp, "o%i.%s",
- (fp->alu.inst[i].
+ (code->alu.inst[i].
inst1 >> R300_ALU_DSTC_SHIFT) & 31,
flags);
strcat(dstc, tmp);
}
dsta[0] = 0;
- if (fp->alu.inst[i].inst3 & R300_ALU_DSTA_REG) {
+ if (code->alu.inst[i].inst3 & R300_ALU_DSTA_REG) {
sprintf(dsta, "t%i.w ",
- (fp->alu.inst[i].
+ (code->alu.inst[i].
inst3 >> R300_ALU_DSTA_SHIFT) & 31);
}
- if (fp->alu.inst[i].inst3 & R300_ALU_DSTA_OUTPUT) {
+ if (code->alu.inst[i].inst3 & R300_ALU_DSTA_OUTPUT) {
sprintf(tmp, "o%i.w ",
- (fp->alu.inst[i].
+ (code->alu.inst[i].
inst3 >> R300_ALU_DSTA_SHIFT) & 31);
strcat(dsta, tmp);
}
- if (fp->alu.inst[i].inst3 & R300_ALU_DSTA_DEPTH) {
+ if (code->alu.inst[i].inst3 & R300_ALU_DSTA_DEPTH) {
strcat(dsta, "Z");
}
@@ -2392,12 +559,12 @@ static void dump_program(struct r300_fragment_program *fp)
"%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
" w: %3s %3s %3s -> %-20s (%08x)\n", i,
srcc[0], srcc[1], srcc[2], dstc,
- fp->alu.inst[i].inst1, srca[0], srca[1],
- srca[2], dsta, fp->alu.inst[i].inst3);
+ code->alu.inst[i].inst1, srca[0], srca[1],
+ srca[2], dsta, code->alu.inst[i].inst3);
for (j = 0; j < 3; ++j) {
- int regc = fp->alu.inst[i].inst0 >> (j * 7);
- int rega = fp->alu.inst[i].inst2 >> (j * 7);
+ int regc = code->alu.inst[i].inst0 >> (j * 7);
+ int rega = code->alu.inst[i].inst2 >> (j * 7);
int d;
char buf[20];
@@ -2479,8 +646,8 @@ static void dump_program(struct r300_fragment_program *fp)
fprintf(stderr, " xyz: %8s %8s %8s op: %08x\n"
" w: %8s %8s %8s op: %08x\n",
argc[0], argc[1], argc[2],
- fp->alu.inst[i].inst0, arga[0], arga[1],
- arga[2], fp->alu.inst[i].inst2);
+ code->alu.inst[i].inst0, arga[0], arga[1],
+ arga[2], code->alu.inst[i].inst2);
}
}
}
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.h b/src/mesa/drivers/dri/r300/r300_fragprog.h
index 573aacf19a1..7c1e210b044 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.h
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.h
@@ -40,12 +40,7 @@
#include "shader/prog_instruction.h"
#include "r300_context.h"
-
-typedef struct r300_fragment_program_swizzle {
- GLuint length;
- GLuint src[4];
- GLuint inst[8];
-} r300_fragment_program_swizzle_t;
+#include "radeon_program.h"
/* supported hw opcodes */
#define PFS_OP_MAD 0
@@ -74,25 +69,6 @@ typedef struct r300_fragment_program_swizzle {
#define SRC_MASK (63 << 0)
#define SRC_STRIDE 6
-#define NOP_INST0 ( \
- (R300_ALU_OUTC_MAD) | \
- (R300_ALU_ARGC_ZERO << R300_ALU_ARG0C_SHIFT) | \
- (R300_ALU_ARGC_ZERO << R300_ALU_ARG1C_SHIFT) | \
- (R300_ALU_ARGC_ZERO << R300_ALU_ARG2C_SHIFT))
-#define NOP_INST1 ( \
- ((0 | SRC_CONST) << R300_ALU_SRC0C_SHIFT) | \
- ((0 | SRC_CONST) << R300_ALU_SRC1C_SHIFT) | \
- ((0 | SRC_CONST) << R300_ALU_SRC2C_SHIFT))
-#define NOP_INST2 ( \
- (R300_ALU_OUTA_MAD) | \
- (R300_ALU_ARGA_ZERO << R300_ALU_ARG0A_SHIFT) | \
- (R300_ALU_ARGA_ZERO << R300_ALU_ARG1A_SHIFT) | \
- (R300_ALU_ARGA_ZERO << R300_ALU_ARG2A_SHIFT))
-#define NOP_INST3 ( \
- ((0 | SRC_CONST) << R300_ALU_SRC0A_SHIFT) | \
- ((0 | SRC_CONST) << R300_ALU_SRC1A_SHIFT) | \
- ((0 | SRC_CONST) << R300_ALU_SRC2A_SHIFT))
-
#define DRI_CONF_FP_OPTIMIZATION_SPEED 0
#define DRI_CONF_FP_OPTIMIZATION_QUALITY 1
@@ -161,4 +137,24 @@ struct r300_fragment_program;
extern void r300TranslateFragmentShader(r300ContextPtr r300,
struct r300_fragment_program *fp);
+
+/**
+ * Used internally by the r300 fragment program code to store compile-time
+ * only data.
+ */
+struct r300_fragment_program_compiler {
+ r300ContextPtr r300;
+ struct r300_fragment_program *fp;
+ struct r300_fragment_program_code *code;
+ struct radeon_compiler compiler;
+};
+
+extern void r300FPTransformTextures(struct r300_fragment_program_compiler *compiler);
+extern GLboolean r300FragmentProgramEmit(struct r300_fragment_program_compiler *compiler);
+
+
+extern void r300FragmentProgramDump(
+ struct r300_fragment_program *fp,
+ struct r300_fragment_program_code *code);
+
#endif
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c
new file mode 100644
index 00000000000..9ba29feb40b
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c
@@ -0,0 +1,2058 @@
+/*
+ * Copyright (C) 2005 Ben Skeggs.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * \file
+ *
+ * Emit the r300_fragment_program_code that can be understood by the hardware.
+ * Input is a pre-transformed radeon_program.
+ *
+ * \author Ben Skeggs <[email protected]>
+ *
+ * \author Jerome Glisse <[email protected]>
+ *
+ * \todo FogOption
+ *
+ * \todo Verify results of opcodes for accuracy, I've only checked them in
+ * specific cases.
+ */
+
+#include "glheader.h"
+#include "macros.h"
+#include "enums.h"
+#include "shader/prog_instruction.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
+
+#include "r300_context.h"
+#include "r300_fragprog.h"
+#include "r300_reg.h"
+#include "r300_state.h"
+
+/* Mapping Mesa registers to R300 temporaries */
+struct reg_acc {
+ int reg; /* Assigned hw temp */
+ unsigned int refcount; /* Number of uses by mesa program */
+};
+
+/**
+ * Describe the current lifetime information for an R300 temporary
+ */
+struct reg_lifetime {
+ /* Index of the first slot where this register is free in the sense
+ that it can be used as a new destination register.
+ This is -1 if the register has been assigned to a Mesa register
+ and the last access to the register has not yet been emitted */
+ int free;
+
+ /* Index of the first slot where this register is currently reserved.
+ This is used to stop e.g. a scalar operation from being moved
+ before the allocation time of a register that was first allocated
+ for a vector operation. */
+ int reserved;
+
+ /* Index of the first slot in which the register can be used as a
+ source without losing the value that is written by the last
+ emitted instruction that writes to the register */
+ int vector_valid;
+ int scalar_valid;
+
+ /* Index to the slot where the register was last read.
+ This is also the first slot in which the register may be written again */
+ int vector_lastread;
+ int scalar_lastread;
+};
+
+/**
+ * Store usage information about an ALU instruction slot during the
+ * compilation of a fragment program.
+ */
+#define SLOT_SRC_VECTOR (1<<0)
+#define SLOT_SRC_SCALAR (1<<3)
+#define SLOT_SRC_BOTH (SLOT_SRC_VECTOR | SLOT_SRC_SCALAR)
+#define SLOT_OP_VECTOR (1<<16)
+#define SLOT_OP_SCALAR (1<<17)
+#define SLOT_OP_BOTH (SLOT_OP_VECTOR | SLOT_OP_SCALAR)
+
+struct r300_pfs_compile_slot {
+ /* Bitmask indicating which parts of the slot are used, using SLOT_ constants
+ defined above */
+ unsigned int used;
+
+ /* Selected sources */
+ int vsrc[3];
+ int ssrc[3];
+};
+
+/**
+ * Store information during compilation of fragment programs.
+ */
+struct r300_pfs_compile_state {
+ struct r300_fragment_program_compiler *compiler;
+
+ int nrslots; /* number of ALU slots used so far */
+
+ /* Track which (parts of) slots are already filled with instructions */
+ struct r300_pfs_compile_slot slot[PFS_MAX_ALU_INST];
+
+ /* Track the validity of R300 temporaries */
+ struct reg_lifetime hwtemps[PFS_NUM_TEMP_REGS];
+
+ /* Used to map Mesa's inputs/temps onto hardware temps */
+ int temp_in_use;
+ struct reg_acc temps[PFS_NUM_TEMP_REGS];
+ struct reg_acc inputs[32]; /* don't actually need 32... */
+
+ /* Track usage of hardware temps, for register allocation,
+ * indirection detection, etc. */
+ GLuint used_in_node;
+ GLuint dest_in_node;
+};
+
+
+/*
+ * Usefull macros and values
+ */
+#define ERROR(fmt, args...) do { \
+ fprintf(stderr, "%s::%s(): " fmt "\n", \
+ __FILE__, __FUNCTION__, ##args); \
+ fp->error = GL_TRUE; \
+ } while(0)
+
+#define PFS_INVAL 0xFFFFFFFF
+#define COMPILE_STATE \
+ struct r300_fragment_program *fp = cs->compiler->fp; \
+ struct r300_fragment_program_code *code = cs->compiler->code; \
+ (void)code; (void)fp
+
+#define SWIZZLE_XYZ 0
+#define SWIZZLE_XXX 1
+#define SWIZZLE_YYY 2
+#define SWIZZLE_ZZZ 3
+#define SWIZZLE_WWW 4
+#define SWIZZLE_YZX 5
+#define SWIZZLE_ZXY 6
+#define SWIZZLE_WZY 7
+#define SWIZZLE_111 8
+#define SWIZZLE_000 9
+#define SWIZZLE_HHH 10
+
+#define swizzle(r, x, y, z, w) do_swizzle(cs, r, \
+ ((SWIZZLE_##x<<0)| \
+ (SWIZZLE_##y<<3)| \
+ (SWIZZLE_##z<<6)| \
+ (SWIZZLE_##w<<9)), \
+ 0)
+
+#define REG_TYPE_INPUT 0
+#define REG_TYPE_OUTPUT 1
+#define REG_TYPE_TEMP 2
+#define REG_TYPE_CONST 3
+
+#define REG_TYPE_SHIFT 0
+#define REG_INDEX_SHIFT 2
+#define REG_VSWZ_SHIFT 8
+#define REG_SSWZ_SHIFT 13
+#define REG_NEGV_SHIFT 18
+#define REG_NEGS_SHIFT 19
+#define REG_ABS_SHIFT 20
+#define REG_NO_USE_SHIFT 21 // Hack for refcounting
+#define REG_VALID_SHIFT 22 // Does the register contain a defined value?
+#define REG_BUILTIN_SHIFT 23 // Is it a builtin (like all zero/all one)?
+
+#define REG_TYPE_MASK (0x03 << REG_TYPE_SHIFT)
+#define REG_INDEX_MASK (0x3F << REG_INDEX_SHIFT)
+#define REG_VSWZ_MASK (0x1F << REG_VSWZ_SHIFT)
+#define REG_SSWZ_MASK (0x1F << REG_SSWZ_SHIFT)
+#define REG_NEGV_MASK (0x01 << REG_NEGV_SHIFT)
+#define REG_NEGS_MASK (0x01 << REG_NEGS_SHIFT)
+#define REG_ABS_MASK (0x01 << REG_ABS_SHIFT)
+#define REG_NO_USE_MASK (0x01 << REG_NO_USE_SHIFT)
+#define REG_VALID_MASK (0x01 << REG_VALID_SHIFT)
+#define REG_BUILTIN_MASK (0x01 << REG_BUILTIN_SHIFT)
+
+#define REG(type, index, vswz, sswz, nouse, valid, builtin) \
+ (((type << REG_TYPE_SHIFT) & REG_TYPE_MASK) | \
+ ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK) | \
+ ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) | \
+ ((valid << REG_VALID_SHIFT) & REG_VALID_MASK) | \
+ ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) | \
+ ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) | \
+ ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
+#define REG_GET_TYPE(reg) \
+ ((reg & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
+#define REG_GET_INDEX(reg) \
+ ((reg & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
+#define REG_GET_VSWZ(reg) \
+ ((reg & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
+#define REG_GET_SSWZ(reg) \
+ ((reg & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
+#define REG_GET_NO_USE(reg) \
+ ((reg & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
+#define REG_GET_VALID(reg) \
+ ((reg & REG_VALID_MASK) >> REG_VALID_SHIFT)
+#define REG_GET_BUILTIN(reg) \
+ ((reg & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)
+#define REG_SET_TYPE(reg, type) \
+ reg = ((reg & ~REG_TYPE_MASK) | \
+ ((type << REG_TYPE_SHIFT) & REG_TYPE_MASK))
+#define REG_SET_INDEX(reg, index) \
+ reg = ((reg & ~REG_INDEX_MASK) | \
+ ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK))
+#define REG_SET_VSWZ(reg, vswz) \
+ reg = ((reg & ~REG_VSWZ_MASK) | \
+ ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK))
+#define REG_SET_SSWZ(reg, sswz) \
+ reg = ((reg & ~REG_SSWZ_MASK) | \
+ ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
+#define REG_SET_NO_USE(reg, nouse) \
+ reg = ((reg & ~REG_NO_USE_MASK) | \
+ ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK))
+#define REG_SET_VALID(reg, valid) \
+ reg = ((reg & ~REG_VALID_MASK) | \
+ ((valid << REG_VALID_SHIFT) & REG_VALID_MASK))
+#define REG_SET_BUILTIN(reg, builtin) \
+ reg = ((reg & ~REG_BUILTIN_MASK) | \
+ ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK))
+#define REG_ABS(reg) \
+ reg = (reg | REG_ABS_MASK)
+#define REG_NEGV(reg) \
+ reg = (reg | REG_NEGV_MASK)
+#define REG_NEGS(reg) \
+ reg = (reg | REG_NEGS_MASK)
+
+#define NOP_INST0 ( \
+ (R300_ALU_OUTC_MAD) | \
+ (R300_ALU_ARGC_ZERO << R300_ALU_ARG0C_SHIFT) | \
+ (R300_ALU_ARGC_ZERO << R300_ALU_ARG1C_SHIFT) | \
+ (R300_ALU_ARGC_ZERO << R300_ALU_ARG2C_SHIFT))
+#define NOP_INST1 ( \
+ ((0 | SRC_CONST) << R300_ALU_SRC0C_SHIFT) | \
+ ((0 | SRC_CONST) << R300_ALU_SRC1C_SHIFT) | \
+ ((0 | SRC_CONST) << R300_ALU_SRC2C_SHIFT))
+#define NOP_INST2 ( \
+ (R300_ALU_OUTA_MAD) | \
+ (R300_ALU_ARGA_ZERO << R300_ALU_ARG0A_SHIFT) | \
+ (R300_ALU_ARGA_ZERO << R300_ALU_ARG1A_SHIFT) | \
+ (R300_ALU_ARGA_ZERO << R300_ALU_ARG2A_SHIFT))
+#define NOP_INST3 ( \
+ ((0 | SRC_CONST) << R300_ALU_SRC0A_SHIFT) | \
+ ((0 | SRC_CONST) << R300_ALU_SRC1A_SHIFT) | \
+ ((0 | SRC_CONST) << R300_ALU_SRC2A_SHIFT))
+
+
+/*
+ * Datas structures for fragment program generation
+ */
+
+/* description of r300 native hw instructions */
+static const struct {
+ const char *name;
+ int argc;
+ int v_op;
+ int s_op;
+} r300_fpop[] = {
+ /* *INDENT-OFF* */
+ {"MAD", 3, R300_ALU_OUTC_MAD, R300_ALU_OUTA_MAD},
+ {"DP3", 2, R300_ALU_OUTC_DP3, R300_ALU_OUTA_DP4},
+ {"DP4", 2, R300_ALU_OUTC_DP4, R300_ALU_OUTA_DP4},
+ {"MIN", 2, R300_ALU_OUTC_MIN, R300_ALU_OUTA_MIN},
+ {"MAX", 2, R300_ALU_OUTC_MAX, R300_ALU_OUTA_MAX},
+ {"CMP", 3, R300_ALU_OUTC_CMP, R300_ALU_OUTA_CMP},
+ {"FRC", 1, R300_ALU_OUTC_FRC, R300_ALU_OUTA_FRC},
+ {"EX2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_EX2},
+ {"LG2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_LG2},
+ {"RCP", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RCP},
+ {"RSQ", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RSQ},
+ {"REPL_ALPHA", 1, R300_ALU_OUTC_REPL_ALPHA, PFS_INVAL},
+ {"CMPH", 3, R300_ALU_OUTC_CMPH, PFS_INVAL},
+ /* *INDENT-ON* */
+};
+
+/* vector swizzles r300 can support natively, with a couple of
+ * cases we handle specially
+ *
+ * REG_VSWZ/REG_SSWZ is an index into this table
+ */
+
+/* mapping from SWIZZLE_* to r300 native values for scalar insns */
+#define SWIZZLE_HALF 6
+
+#define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
+ SWIZZLE_##y, \
+ SWIZZLE_##z, \
+ SWIZZLE_ZERO))
+/* native swizzles */
+static const struct r300_pfs_swizzle {
+ GLuint hash; /* swizzle value this matches */
+ GLuint base; /* base value for hw swizzle */
+ GLuint stride; /* difference in base between arg0/1/2 */
+ GLuint flags;
+} v_swiz[] = {
+ /* *INDENT-OFF* */
+ {MAKE_SWZ3(X, Y, Z), R300_ALU_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR},
+ {MAKE_SWZ3(X, X, X), R300_ALU_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR},
+ {MAKE_SWZ3(Y, Y, Y), R300_ALU_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR},
+ {MAKE_SWZ3(Z, Z, Z), R300_ALU_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR},
+ {MAKE_SWZ3(W, W, W), R300_ALU_ARGC_SRC0A, 1, SLOT_SRC_SCALAR},
+ {MAKE_SWZ3(Y, Z, X), R300_ALU_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR},
+ {MAKE_SWZ3(Z, X, Y), R300_ALU_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR},
+ {MAKE_SWZ3(W, Z, Y), R300_ALU_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH},
+ {MAKE_SWZ3(ONE, ONE, ONE), R300_ALU_ARGC_ONE, 0, 0},
+ {MAKE_SWZ3(ZERO, ZERO, ZERO), R300_ALU_ARGC_ZERO, 0, 0},
+ {MAKE_SWZ3(HALF, HALF, HALF), R300_ALU_ARGC_HALF, 0, 0},
+ {PFS_INVAL, 0, 0, 0},
+ /* *INDENT-ON* */
+};
+
+/* used during matching of non-native swizzles */
+#define SWZ_X_MASK (7 << 0)
+#define SWZ_Y_MASK (7 << 3)
+#define SWZ_Z_MASK (7 << 6)
+#define SWZ_W_MASK (7 << 9)
+static const struct {
+ GLuint hash; /* used to mask matching swizzle components */
+ int mask; /* actual outmask */
+ int count; /* count of components matched */
+} s_mask[] = {
+ /* *INDENT-OFF* */
+ {SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK, 1 | 2 | 4, 3},
+ {SWZ_X_MASK | SWZ_Y_MASK, 1 | 2, 2},
+ {SWZ_X_MASK | SWZ_Z_MASK, 1 | 4, 2},
+ {SWZ_Y_MASK | SWZ_Z_MASK, 2 | 4, 2},
+ {SWZ_X_MASK, 1, 1},
+ {SWZ_Y_MASK, 2, 1},
+ {SWZ_Z_MASK, 4, 1},
+ {PFS_INVAL, PFS_INVAL, PFS_INVAL}
+ /* *INDENT-ON* */
+};
+
+static const struct {
+ int base; /* hw value of swizzle */
+ int stride; /* difference between SRC0/1/2 */
+ GLuint flags;
+} s_swiz[] = {
+ /* *INDENT-OFF* */
+ {R300_ALU_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR},
+ {R300_ALU_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR},
+ {R300_ALU_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR},
+ {R300_ALU_ARGA_SRC0A, 1, SLOT_SRC_SCALAR},
+ {R300_ALU_ARGA_ZERO, 0, 0},
+ {R300_ALU_ARGA_ONE, 0, 0},
+ {R300_ALU_ARGA_HALF, 0, 0}
+ /* *INDENT-ON* */
+};
+
+/* boiler-plate reg, for convenience */
+static const GLuint undef = REG(REG_TYPE_TEMP,
+ 0,
+ SWIZZLE_XYZ,
+ SWIZZLE_W,
+ GL_FALSE,
+ GL_FALSE,
+ GL_FALSE);
+
+/* constant one source */
+static const GLuint pfs_one = REG(REG_TYPE_CONST,
+ 0,
+ SWIZZLE_111,
+ SWIZZLE_ONE,
+ GL_FALSE,
+ GL_TRUE,
+ GL_TRUE);
+
+/* constant half source */
+static const GLuint pfs_half = REG(REG_TYPE_CONST,
+ 0,
+ SWIZZLE_HHH,
+ SWIZZLE_HALF,
+ GL_FALSE,
+ GL_TRUE,
+ GL_TRUE);
+
+/* constant zero source */
+static const GLuint pfs_zero = REG(REG_TYPE_CONST,
+ 0,
+ SWIZZLE_000,
+ SWIZZLE_ZERO,
+ GL_FALSE,
+ GL_TRUE,
+ GL_TRUE);
+
+/*
+ * Common functions prototypes
+ */
+static void emit_arith(struct r300_pfs_compile_state *cs, int op,
+ GLuint dest, int mask,
+ GLuint src0, GLuint src1, GLuint src2, int flags);
+
+/**
+ * Get an R300 temporary that can be written to in the given slot.
+ */
+static int get_hw_temp(struct r300_pfs_compile_state *cs, int slot)
+{
+ COMPILE_STATE;
+ int r;
+
+ for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
+ if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot)
+ break;
+ }
+
+ if (r >= PFS_NUM_TEMP_REGS) {
+ ERROR("Out of hardware temps\n");
+ return 0;
+ }
+ // Reserved is used to avoid the following scenario:
+ // R300 temporary X is first assigned to Mesa temporary Y during vector ops
+ // R300 temporary X is then assigned to Mesa temporary Z for further vector ops
+ // Then scalar ops on Mesa temporary Z are emitted and move back in time
+ // to overwrite the value of temporary Y.
+ // End scenario.
+ cs->hwtemps[r].reserved = cs->hwtemps[r].free;
+ cs->hwtemps[r].free = -1;
+
+ // Reset to some value that won't mess things up when the user
+ // tries to read from a temporary that hasn't been assigned a value yet.
+ // In the normal case, vector_valid and scalar_valid should be set to
+ // a sane value by the first emit that writes to this temporary.
+ cs->hwtemps[r].vector_valid = 0;
+ cs->hwtemps[r].scalar_valid = 0;
+
+ if (r > code->max_temp_idx)
+ code->max_temp_idx = r;
+
+ return r;
+}
+
+/**
+ * Get an R300 temporary that will act as a TEX destination register.
+ */
+static int get_hw_temp_tex(struct r300_pfs_compile_state *cs)
+{
+ COMPILE_STATE;
+ int r;
+
+ for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
+ if (cs->used_in_node & (1 << r))
+ continue;
+
+ // Note: Be very careful here
+ if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0)
+ break;
+ }
+
+ if (r >= PFS_NUM_TEMP_REGS)
+ return get_hw_temp(cs, 0); /* Will cause an indirection */
+
+ cs->hwtemps[r].reserved = cs->hwtemps[r].free;
+ cs->hwtemps[r].free = -1;
+
+ // Reset to some value that won't mess things up when the user
+ // tries to read from a temporary that hasn't been assigned a value yet.
+ // In the normal case, vector_valid and scalar_valid should be set to
+ // a sane value by the first emit that writes to this temporary.
+ cs->hwtemps[r].vector_valid = cs->nrslots;
+ cs->hwtemps[r].scalar_valid = cs->nrslots;
+
+ if (r > code->max_temp_idx)
+ code->max_temp_idx = r;
+
+ return r;
+}
+
+/**
+ * Mark the given hardware register as free.
+ */
+static void free_hw_temp(struct r300_pfs_compile_state *cs, int idx)
+{
+ // Be very careful here. Consider sequences like
+ // MAD r0, r1,r2,r3
+ // TEX r4, ...
+ // The TEX instruction may be moved in front of the MAD instruction
+ // due to the way nodes work. We don't want to alias r1 and r4 in
+ // this case.
+ // I'm certain the register allocation could be further sanitized,
+ // but it's tricky because of stuff that can happen inside emit_tex
+ // and emit_arith.
+ cs->hwtemps[idx].free = cs->nrslots + 1;
+}
+
+/**
+ * Create a new Mesa temporary register.
+ */
+static GLuint get_temp_reg(struct r300_pfs_compile_state *cs)
+{
+ COMPILE_STATE;
+ GLuint r = undef;
+ GLuint index;
+
+ index = ffs(~cs->temp_in_use);
+ if (!index) {
+ ERROR("Out of program temps\n");
+ return r;
+ }
+
+ cs->temp_in_use |= (1 << --index);
+ cs->temps[index].refcount = 0xFFFFFFFF;
+ cs->temps[index].reg = -1;
+
+ REG_SET_TYPE(r, REG_TYPE_TEMP);
+ REG_SET_INDEX(r, index);
+ REG_SET_VALID(r, GL_TRUE);
+ return r;
+}
+
+/**
+ * Free a Mesa temporary and the associated R300 temporary.
+ */
+static void free_temp(struct r300_pfs_compile_state *cs, GLuint r)
+{
+ GLuint index = REG_GET_INDEX(r);
+
+ if (!(cs->temp_in_use & (1 << index)))
+ return;
+
+ if (REG_GET_TYPE(r) == REG_TYPE_TEMP) {
+ free_hw_temp(cs, cs->temps[index].reg);
+ cs->temps[index].reg = -1;
+ cs->temp_in_use &= ~(1 << index);
+ } else if (REG_GET_TYPE(r) == REG_TYPE_INPUT) {
+ free_hw_temp(cs, cs->inputs[index].reg);
+ cs->inputs[index].reg = -1;
+ }
+}
+
+/**
+ * Emit a hardware constant/parameter.
+ *
+ * \p cp Stable pointer to an array of 4 floats.
+ * The pointer must be stable in the sense that it remains to be valid
+ * and hold the contents of the constant/parameter throughout the lifetime
+ * of the fragment program (actually, up until the next time the fragment
+ * program is translated).
+ */
+static GLuint emit_const4fv(struct r300_pfs_compile_state *cs,
+ const GLfloat * cp)
+{
+ COMPILE_STATE;
+ GLuint reg = undef;
+ int index;
+
+ for (index = 0; index < code->const_nr; ++index) {
+ if (code->constant[index] == cp)
+ break;
+ }
+
+ if (index >= code->const_nr) {
+ if (index >= PFS_NUM_CONST_REGS) {
+ ERROR("Out of hw constants!\n");
+ return reg;
+ }
+
+ code->const_nr++;
+ code->constant[index] = cp;
+ }
+
+ REG_SET_TYPE(reg, REG_TYPE_CONST);
+ REG_SET_INDEX(reg, index);
+ REG_SET_VALID(reg, GL_TRUE);
+ return reg;
+}
+
+static inline GLuint negate(GLuint r)
+{
+ REG_NEGS(r);
+ REG_NEGV(r);
+ return r;
+}
+
+/* Hack, to prevent clobbering sources used multiple times when
+ * emulating non-native instructions
+ */
+static inline GLuint keep(GLuint r)
+{
+ REG_SET_NO_USE(r, GL_TRUE);
+ return r;
+}
+
+static inline GLuint absolute(GLuint r)
+{
+ REG_ABS(r);
+ return r;
+}
+
+static int swz_native(struct r300_pfs_compile_state *cs,
+ GLuint src, GLuint * r, GLuint arbneg)
+{
+ COMPILE_STATE;
+
+ /* Native swizzle, handle negation */
+ src = (src & ~REG_NEGS_MASK) | (((arbneg >> 3) & 1) << REG_NEGS_SHIFT);
+
+ if ((arbneg & 0x7) == 0x0) {
+ src = src & ~REG_NEGV_MASK;
+ *r = src;
+ } else if ((arbneg & 0x7) == 0x7) {
+ src |= REG_NEGV_MASK;
+ *r = src;
+ } else {
+ if (!REG_GET_VALID(*r))
+ *r = get_temp_reg(cs);
+ src |= REG_NEGV_MASK;
+ emit_arith(cs,
+ PFS_OP_MAD,
+ *r, arbneg & 0x7, keep(src), pfs_one, pfs_zero, 0);
+ src = src & ~REG_NEGV_MASK;
+ emit_arith(cs,
+ PFS_OP_MAD,
+ *r,
+ (arbneg ^ 0x7) | WRITEMASK_W,
+ src, pfs_one, pfs_zero, 0);
+ }
+
+ return 3;
+}
+
+static int swz_emit_partial(struct r300_pfs_compile_state *cs,
+ GLuint src,
+ GLuint * r, int mask, int mc, GLuint arbneg)
+{
+ COMPILE_STATE;
+ GLuint tmp;
+ GLuint wmask = 0;
+
+ if (!REG_GET_VALID(*r))
+ *r = get_temp_reg(cs);
+
+ /* A partial match, VSWZ/mask define what parts of the
+ * desired swizzle we match
+ */
+ if (mc + s_mask[mask].count == 3) {
+ wmask = WRITEMASK_W;
+ src |= ((arbneg >> 3) & 1) << REG_NEGS_SHIFT;
+ }
+
+ tmp = arbneg & s_mask[mask].mask;
+ if (tmp) {
+ tmp = tmp ^ s_mask[mask].mask;
+ if (tmp) {
+ emit_arith(cs,
+ PFS_OP_MAD,
+ *r,
+ arbneg & s_mask[mask].mask,
+ keep(src) | REG_NEGV_MASK,
+ pfs_one, pfs_zero, 0);
+ if (!wmask) {
+ REG_SET_NO_USE(src, GL_TRUE);
+ } else {
+ REG_SET_NO_USE(src, GL_FALSE);
+ }
+ emit_arith(cs,
+ PFS_OP_MAD,
+ *r, tmp | wmask, src, pfs_one, pfs_zero, 0);
+ } else {
+ if (!wmask) {
+ REG_SET_NO_USE(src, GL_TRUE);
+ } else {
+ REG_SET_NO_USE(src, GL_FALSE);
+ }
+ emit_arith(cs,
+ PFS_OP_MAD,
+ *r,
+ (arbneg & s_mask[mask].mask) | wmask,
+ src | REG_NEGV_MASK, pfs_one, pfs_zero, 0);
+ }
+ } else {
+ if (!wmask) {
+ REG_SET_NO_USE(src, GL_TRUE);
+ } else {
+ REG_SET_NO_USE(src, GL_FALSE);
+ }
+ emit_arith(cs, PFS_OP_MAD,
+ *r,
+ s_mask[mask].mask | wmask,
+ src, pfs_one, pfs_zero, 0);
+ }
+
+ return s_mask[mask].count;
+}
+
+static GLuint do_swizzle(struct r300_pfs_compile_state *cs,
+ GLuint src, GLuint arbswz, GLuint arbneg)
+{
+ COMPILE_STATE;
+ GLuint r = undef;
+ GLuint vswz;
+ int c_mask = 0;
+ int v_match = 0;
+
+ /* If swizzling from something without an XYZW native swizzle,
+ * emit result to a temp, and do new swizzle from the temp.
+ */
+#if 0
+ if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
+ GLuint temp = get_temp_reg(fp);
+ emit_arith(fp,
+ PFS_OP_MAD,
+ temp, WRITEMASK_XYZW, src, pfs_one, pfs_zero, 0);
+ src = temp;
+ }
+#endif
+
+ if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
+ GLuint vsrcswz =
+ (v_swiz[REG_GET_VSWZ(src)].
+ hash & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK)) |
+ REG_GET_SSWZ(src) << 9;
+ GLint i;
+
+ GLuint newswz = 0;
+ GLuint offset;
+ for (i = 0; i < 4; ++i) {
+ offset = GET_SWZ(arbswz, i);
+
+ newswz |=
+ (offset <= 3) ? GET_SWZ(vsrcswz,
+ offset) << i *
+ 3 : offset << i * 3;
+ }
+
+ arbswz = newswz & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK);
+ REG_SET_SSWZ(src, GET_SWZ(newswz, 3));
+ } else {
+ /* set scalar swizzling */
+ REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
+
+ }
+ do {
+ vswz = REG_GET_VSWZ(src);
+ do {
+ int chash;
+
+ REG_SET_VSWZ(src, vswz);
+ chash = v_swiz[REG_GET_VSWZ(src)].hash &
+ s_mask[c_mask].hash;
+
+ if (chash == (arbswz & s_mask[c_mask].hash)) {
+ if (s_mask[c_mask].count == 3) {
+ v_match += swz_native(cs,
+ src, &r, arbneg);
+ } else {
+ v_match += swz_emit_partial(cs,
+ src,
+ &r,
+ c_mask,
+ v_match,
+ arbneg);
+ }
+
+ if (v_match == 3)
+ return r;
+
+ /* Fill with something invalid.. all 0's was
+ * wrong before, matched SWIZZLE_X. So all
+ * 1's will be okay for now
+ */
+ arbswz |= (PFS_INVAL & s_mask[c_mask].hash);
+ }
+ } while (v_swiz[++vswz].hash != PFS_INVAL);
+ REG_SET_VSWZ(src, SWIZZLE_XYZ);
+ } while (s_mask[++c_mask].hash != PFS_INVAL);
+
+ ERROR("should NEVER get here\n");
+ return r;
+}
+
+static GLuint t_src(struct r300_pfs_compile_state *cs,
+ struct prog_src_register fpsrc)
+{
+ COMPILE_STATE;
+ GLuint r = undef;
+
+ switch (fpsrc.File) {
+ case PROGRAM_TEMPORARY:
+ REG_SET_INDEX(r, fpsrc.Index);
+ REG_SET_VALID(r, GL_TRUE);
+ REG_SET_TYPE(r, REG_TYPE_TEMP);
+ break;
+ case PROGRAM_INPUT:
+ REG_SET_INDEX(r, fpsrc.Index);
+ REG_SET_VALID(r, GL_TRUE);
+ REG_SET_TYPE(r, REG_TYPE_INPUT);
+ break;
+ case PROGRAM_LOCAL_PARAM:
+ r = emit_const4fv(cs,
+ fp->mesa_program.Base.LocalParams[fpsrc.
+ Index]);
+ break;
+ case PROGRAM_ENV_PARAM:
+ r = emit_const4fv(cs,
+ cs->compiler->r300->radeon.glCtx->FragmentProgram.Parameters[fpsrc.Index]);
+ break;
+ case PROGRAM_STATE_VAR:
+ case PROGRAM_NAMED_PARAM:
+ case PROGRAM_CONSTANT:
+ r = emit_const4fv(cs,
+ fp->mesa_program.Base.Parameters->
+ ParameterValues[fpsrc.Index]);
+ break;
+ case PROGRAM_BUILTIN:
+ switch(fpsrc.Swizzle) {
+ case SWIZZLE_1111: r = pfs_one; break;
+ case SWIZZLE_0000: r = pfs_zero; break;
+ default:
+ ERROR("bad PROGRAM_BUILTIN swizzle %u\n", fpsrc.Swizzle);
+ break;
+ }
+ break;
+ default:
+ ERROR("unknown SrcReg->File %x\n", fpsrc.File);
+ return r;
+ }
+
+ /* no point swizzling ONE/ZERO/HALF constants... */
+ if (REG_GET_VSWZ(r) < SWIZZLE_111 || REG_GET_SSWZ(r) < SWIZZLE_ZERO)
+ r = do_swizzle(cs, r, fpsrc.Swizzle, fpsrc.NegateBase);
+ if (fpsrc.Abs)
+ r = absolute(r);
+ if (fpsrc.NegateAbs)
+ r = negate(r);
+ return r;
+}
+
+static GLuint t_scalar_src(struct r300_pfs_compile_state *cs,
+ struct prog_src_register fpsrc)
+{
+ struct prog_src_register src = fpsrc;
+ int sc = GET_SWZ(fpsrc.Swizzle, 0); /* X */
+
+ src.Swizzle = ((sc << 0) | (sc << 3) | (sc << 6) | (sc << 9));
+
+ return t_src(cs, src);
+}
+
+static GLuint t_dst(struct r300_pfs_compile_state *cs,
+ struct prog_dst_register dest)
+{
+ COMPILE_STATE;
+ GLuint r = undef;
+
+ switch (dest.File) {
+ case PROGRAM_TEMPORARY:
+ REG_SET_INDEX(r, dest.Index);
+ REG_SET_VALID(r, GL_TRUE);
+ REG_SET_TYPE(r, REG_TYPE_TEMP);
+ return r;
+ case PROGRAM_OUTPUT:
+ REG_SET_TYPE(r, REG_TYPE_OUTPUT);
+ switch (dest.Index) {
+ case FRAG_RESULT_COLR:
+ case FRAG_RESULT_DEPR:
+ REG_SET_INDEX(r, dest.Index);
+ REG_SET_VALID(r, GL_TRUE);
+ return r;
+ default:
+ ERROR("Bad DstReg->Index 0x%x\n", dest.Index);
+ return r;
+ }
+ default:
+ ERROR("Bad DstReg->File 0x%x\n", dest.File);
+ return r;
+ }
+}
+
+static int t_hw_src(struct r300_pfs_compile_state *cs, GLuint src, GLboolean tex)
+{
+ COMPILE_STATE;
+ int idx;
+ int index = REG_GET_INDEX(src);
+
+ switch (REG_GET_TYPE(src)) {
+ case REG_TYPE_TEMP:
+ /* NOTE: if reg==-1 here, a source is being read that
+ * hasn't been written to. Undefined results.
+ */
+ if (cs->temps[index].reg == -1)
+ cs->temps[index].reg = get_hw_temp(cs, cs->nrslots);
+
+ idx = cs->temps[index].reg;
+
+ if (!REG_GET_NO_USE(src) && (--cs->temps[index].refcount == 0))
+ free_temp(cs, src);
+ break;
+ case REG_TYPE_INPUT:
+ idx = cs->inputs[index].reg;
+
+ if (!REG_GET_NO_USE(src) && (--cs->inputs[index].refcount == 0))
+ free_hw_temp(cs, cs->inputs[index].reg);
+ break;
+ case REG_TYPE_CONST:
+ return (index | SRC_CONST);
+ default:
+ ERROR("Invalid type for source reg\n");
+ return (0 | SRC_CONST);
+ }
+
+ if (!tex)
+ cs->used_in_node |= (1 << idx);
+
+ return idx;
+}
+
+static int t_hw_dst(struct r300_pfs_compile_state *cs,
+ GLuint dest, GLboolean tex, int slot)
+{
+ COMPILE_STATE;
+ int idx;
+ GLuint index = REG_GET_INDEX(dest);
+ assert(REG_GET_VALID(dest));
+
+ switch (REG_GET_TYPE(dest)) {
+ case REG_TYPE_TEMP:
+ if (cs->temps[REG_GET_INDEX(dest)].reg == -1) {
+ if (!tex) {
+ cs->temps[index].reg = get_hw_temp(cs, slot);
+ } else {
+ cs->temps[index].reg = get_hw_temp_tex(cs);
+ }
+ }
+ idx = cs->temps[index].reg;
+
+ if (!REG_GET_NO_USE(dest) && (--cs->temps[index].refcount == 0))
+ free_temp(cs, dest);
+
+ cs->dest_in_node |= (1 << idx);
+ cs->used_in_node |= (1 << idx);
+ break;
+ case REG_TYPE_OUTPUT:
+ switch (index) {
+ case FRAG_RESULT_COLR:
+ code->node[code->cur_node].flags |= R300_RGBA_OUT;
+ break;
+ case FRAG_RESULT_DEPR:
+ fp->WritesDepth = GL_TRUE;
+ code->node[code->cur_node].flags |= R300_W_OUT;
+ break;
+ }
+ return index;
+ break;
+ default:
+ ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
+ return 0;
+ }
+
+ return idx;
+}
+
+static void emit_nop(struct r300_pfs_compile_state *cs)
+{
+ COMPILE_STATE;
+
+ if (cs->nrslots >= PFS_MAX_ALU_INST) {
+ ERROR("Out of ALU instruction slots\n");
+ return;
+ }
+
+ code->alu.inst[cs->nrslots].inst0 = NOP_INST0;
+ code->alu.inst[cs->nrslots].inst1 = NOP_INST1;
+ code->alu.inst[cs->nrslots].inst2 = NOP_INST2;
+ code->alu.inst[cs->nrslots].inst3 = NOP_INST3;
+ cs->nrslots++;
+}
+
+static void emit_tex(struct r300_pfs_compile_state *cs,
+ struct prog_instruction *fpi, int opcode)
+{
+ COMPILE_STATE;
+ GLuint coord = t_src(cs, fpi->SrcReg[0]);
+ GLuint dest = undef;
+ GLuint din, uin;
+ int unit = fpi->TexSrcUnit;
+ int hwsrc, hwdest;
+
+ /* Ensure correct node indirection */
+ uin = cs->used_in_node;
+ din = cs->dest_in_node;
+
+ /* Resolve source/dest to hardware registers */
+ hwsrc = t_hw_src(cs, coord, GL_TRUE);
+
+ if (opcode != R300_TEX_OP_KIL) {
+ dest = t_dst(cs, fpi->DstReg);
+
+ hwdest =
+ t_hw_dst(cs, dest, GL_TRUE,
+ code->node[code->cur_node].alu_offset);
+
+ /* Use a temp that hasn't been used in this node, rather
+ * than causing an indirection
+ */
+ if (uin & (1 << hwdest)) {
+ free_hw_temp(cs, hwdest);
+ hwdest = get_hw_temp_tex(cs);
+ cs->temps[REG_GET_INDEX(dest)].reg = hwdest;
+ }
+ } else {
+ hwdest = 0;
+ unit = 0;
+ }
+
+ /* Indirection if source has been written in this node, or if the
+ * dest has been read/written in this node
+ */
+ if ((REG_GET_TYPE(coord) != REG_TYPE_CONST &&
+ (din & (1 << hwsrc))) || (uin & (1 << hwdest))) {
+
+ /* Finish off current node */
+ if (code->node[code->cur_node].alu_offset == cs->nrslots)
+ emit_nop(cs);
+
+ code->node[code->cur_node].alu_end =
+ cs->nrslots - code->node[code->cur_node].alu_offset - 1;
+ assert(code->node[code->cur_node].alu_end >= 0);
+
+ if (++code->cur_node >= PFS_MAX_TEX_INDIRECT) {
+ ERROR("too many levels of texture indirection\n");
+ return;
+ }
+
+ /* Start new node */
+ code->node[code->cur_node].tex_offset = code->tex.length;
+ code->node[code->cur_node].alu_offset = cs->nrslots;
+ code->node[code->cur_node].tex_end = -1;
+ code->node[code->cur_node].alu_end = -1;
+ code->node[code->cur_node].flags = 0;
+ cs->used_in_node = 0;
+ cs->dest_in_node = 0;
+ }
+
+ if (code->cur_node == 0)
+ code->first_node_has_tex = 1;
+
+ code->tex.inst[code->tex.length++] = 0 | (hwsrc << R300_SRC_ADDR_SHIFT)
+ | (hwdest << R300_DST_ADDR_SHIFT)
+ | (unit << R300_TEX_ID_SHIFT)
+ | (opcode << R300_TEX_INST_SHIFT);
+
+ cs->dest_in_node |= (1 << hwdest);
+ if (REG_GET_TYPE(coord) != REG_TYPE_CONST)
+ cs->used_in_node |= (1 << hwsrc);
+
+ code->node[code->cur_node].tex_end++;
+}
+
+/**
+ * Returns the first slot where we could possibly allow writing to dest,
+ * according to register allocation.
+ */
+static int get_earliest_allowed_write(struct r300_pfs_compile_state *cs,
+ GLuint dest, int mask)
+{
+ COMPILE_STATE;
+ int idx;
+ int pos;
+ GLuint index = REG_GET_INDEX(dest);
+ assert(REG_GET_VALID(dest));
+
+ switch (REG_GET_TYPE(dest)) {
+ case REG_TYPE_TEMP:
+ if (cs->temps[index].reg == -1)
+ return 0;
+
+ idx = cs->temps[index].reg;
+ break;
+ case REG_TYPE_OUTPUT:
+ return 0;
+ default:
+ ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
+ return 0;
+ }
+
+ pos = cs->hwtemps[idx].reserved;
+ if (mask & WRITEMASK_XYZ) {
+ if (pos < cs->hwtemps[idx].vector_lastread)
+ pos = cs->hwtemps[idx].vector_lastread;
+ }
+ if (mask & WRITEMASK_W) {
+ if (pos < cs->hwtemps[idx].scalar_lastread)
+ pos = cs->hwtemps[idx].scalar_lastread;
+ }
+
+ return pos;
+}
+
+/**
+ * Allocates a slot for an ALU instruction that can consist of
+ * a vertex part or a scalar part or both.
+ *
+ * Sources from src (src[0] to src[argc-1]) are added to the slot in the
+ * appropriate position (vector and/or scalar), and their positions are
+ * recorded in the srcpos array.
+ *
+ * This function emits instruction code for the source fetch and the
+ * argument selection. It does not emit instruction code for the
+ * opcode or the destination selection.
+ *
+ * @return the index of the slot
+ */
+static int find_and_prepare_slot(struct r300_pfs_compile_state *cs,
+ GLboolean emit_vop,
+ GLboolean emit_sop,
+ int argc, GLuint * src, GLuint dest, int mask)
+{
+ COMPILE_STATE;
+ int hwsrc[3];
+ int srcpos[3];
+ unsigned int used;
+ int tempused;
+ int tempvsrc[3];
+ int tempssrc[3];
+ int pos;
+ int regnr;
+ int i, j;
+
+ // Determine instruction slots, whether sources are required on
+ // vector or scalar side, and the smallest slot number where
+ // all source registers are available
+ used = 0;
+ if (emit_vop)
+ used |= SLOT_OP_VECTOR;
+ if (emit_sop)
+ used |= SLOT_OP_SCALAR;
+
+ pos = get_earliest_allowed_write(cs, dest, mask);
+
+ if (code->node[code->cur_node].alu_offset > pos)
+ pos = code->node[code->cur_node].alu_offset;
+ for (i = 0; i < argc; ++i) {
+ if (!REG_GET_BUILTIN(src[i])) {
+ if (emit_vop)
+ used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i;
+ if (emit_sop)
+ used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
+ }
+
+ hwsrc[i] = t_hw_src(cs, src[i], GL_FALSE); /* Note: sideeffects wrt refcounting! */
+ regnr = hwsrc[i] & 31;
+
+ if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
+ if (used & (SLOT_SRC_VECTOR << i)) {
+ if (cs->hwtemps[regnr].vector_valid > pos)
+ pos = cs->hwtemps[regnr].vector_valid;
+ }
+ if (used & (SLOT_SRC_SCALAR << i)) {
+ if (cs->hwtemps[regnr].scalar_valid > pos)
+ pos = cs->hwtemps[regnr].scalar_valid;
+ }
+ }
+ }
+
+ // Find a slot that fits
+ for (;; ++pos) {
+ if (cs->slot[pos].used & used & SLOT_OP_BOTH)
+ continue;
+
+ if (pos >= cs->nrslots) {
+ if (cs->nrslots >= PFS_MAX_ALU_INST) {
+ ERROR("Out of ALU instruction slots\n");
+ return -1;
+ }
+
+ code->alu.inst[pos].inst0 = NOP_INST0;
+ code->alu.inst[pos].inst1 = NOP_INST1;
+ code->alu.inst[pos].inst2 = NOP_INST2;
+ code->alu.inst[pos].inst3 = NOP_INST3;
+
+ cs->nrslots++;
+ }
+ // Note: When we need both parts (vector and scalar) of a source,
+ // we always try to put them into the same position. This makes the
+ // code easier to read, and it is optimal (i.e. one doesn't gain
+ // anything by splitting the parts).
+ // It also avoids headaches with swizzles that access both parts (i.e WXY)
+ tempused = cs->slot[pos].used;
+ for (i = 0; i < 3; ++i) {
+ tempvsrc[i] = cs->slot[pos].vsrc[i];
+ tempssrc[i] = cs->slot[pos].ssrc[i];
+ }
+
+ for (i = 0; i < argc; ++i) {
+ int flags = (used >> i) & SLOT_SRC_BOTH;
+
+ if (!flags) {
+ srcpos[i] = 0;
+ continue;
+ }
+
+ for (j = 0; j < 3; ++j) {
+ if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
+ if (tempvsrc[j] != hwsrc[i])
+ continue;
+ }
+
+ if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
+ if (tempssrc[j] != hwsrc[i])
+ continue;
+ }
+
+ break;
+ }
+
+ if (j == 3)
+ break;
+
+ srcpos[i] = j;
+ tempused |= flags << j;
+ if (flags & SLOT_SRC_VECTOR)
+ tempvsrc[j] = hwsrc[i];
+ if (flags & SLOT_SRC_SCALAR)
+ tempssrc[j] = hwsrc[i];
+ }
+
+ if (i == argc)
+ break;
+ }
+
+ // Found a slot, reserve it
+ cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
+ for (i = 0; i < 3; ++i) {
+ cs->slot[pos].vsrc[i] = tempvsrc[i];
+ cs->slot[pos].ssrc[i] = tempssrc[i];
+ }
+
+ for (i = 0; i < argc; ++i) {
+ if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
+ int regnr = hwsrc[i] & 31;
+
+ if (used & (SLOT_SRC_VECTOR << i)) {
+ if (cs->hwtemps[regnr].vector_lastread < pos)
+ cs->hwtemps[regnr].vector_lastread =
+ pos;
+ }
+ if (used & (SLOT_SRC_SCALAR << i)) {
+ if (cs->hwtemps[regnr].scalar_lastread < pos)
+ cs->hwtemps[regnr].scalar_lastread =
+ pos;
+ }
+ }
+ }
+
+ // Emit the source fetch code
+ code->alu.inst[pos].inst1 &= ~R300_ALU_SRC_MASK;
+ code->alu.inst[pos].inst1 |=
+ ((cs->slot[pos].vsrc[0] << R300_ALU_SRC0C_SHIFT) |
+ (cs->slot[pos].vsrc[1] << R300_ALU_SRC1C_SHIFT) |
+ (cs->slot[pos].vsrc[2] << R300_ALU_SRC2C_SHIFT));
+
+ code->alu.inst[pos].inst3 &= ~R300_ALU_SRC_MASK;
+ code->alu.inst[pos].inst3 |=
+ ((cs->slot[pos].ssrc[0] << R300_ALU_SRC0A_SHIFT) |
+ (cs->slot[pos].ssrc[1] << R300_ALU_SRC1A_SHIFT) |
+ (cs->slot[pos].ssrc[2] << R300_ALU_SRC2A_SHIFT));
+
+ // Emit the argument selection code
+ if (emit_vop) {
+ int swz[3];
+
+ for (i = 0; i < 3; ++i) {
+ if (i < argc) {
+ swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
+ (srcpos[i] *
+ v_swiz[REG_GET_VSWZ(src[i])].
+ stride)) | ((src[i] & REG_NEGV_MASK)
+ ? ARG_NEG : 0) | ((src[i]
+ &
+ REG_ABS_MASK)
+ ?
+ ARG_ABS
+ : 0);
+ } else {
+ swz[i] = R300_ALU_ARGC_ZERO;
+ }
+ }
+
+ code->alu.inst[pos].inst0 &=
+ ~(R300_ALU_ARG0C_MASK | R300_ALU_ARG1C_MASK |
+ R300_ALU_ARG2C_MASK);
+ code->alu.inst[pos].inst0 |=
+ (swz[0] << R300_ALU_ARG0C_SHIFT) | (swz[1] <<
+ R300_ALU_ARG1C_SHIFT)
+ | (swz[2] << R300_ALU_ARG2C_SHIFT);
+ }
+
+ if (emit_sop) {
+ int swz[3];
+
+ for (i = 0; i < 3; ++i) {
+ if (i < argc) {
+ swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
+ (srcpos[i] *
+ s_swiz[REG_GET_SSWZ(src[i])].
+ stride)) | ((src[i] & REG_NEGS_MASK)
+ ? ARG_NEG : 0) | ((src[i]
+ &
+ REG_ABS_MASK)
+ ?
+ ARG_ABS
+ : 0);
+ } else {
+ swz[i] = R300_ALU_ARGA_ZERO;
+ }
+ }
+
+ code->alu.inst[pos].inst2 &=
+ ~(R300_ALU_ARG0A_MASK | R300_ALU_ARG1A_MASK |
+ R300_ALU_ARG2A_MASK);
+ code->alu.inst[pos].inst2 |=
+ (swz[0] << R300_ALU_ARG0A_SHIFT) | (swz[1] <<
+ R300_ALU_ARG1A_SHIFT)
+ | (swz[2] << R300_ALU_ARG2A_SHIFT);
+ }
+
+ return pos;
+}
+
+/**
+ * Append an ALU instruction to the instruction list.
+ */
+static void emit_arith(struct r300_pfs_compile_state *cs,
+ int op,
+ GLuint dest,
+ int mask,
+ GLuint src0, GLuint src1, GLuint src2, int flags)
+{
+ COMPILE_STATE;
+ GLuint src[3] = { src0, src1, src2 };
+ int hwdest;
+ GLboolean emit_vop, emit_sop;
+ int vop, sop, argc;
+ int pos;
+
+ vop = r300_fpop[op].v_op;
+ sop = r300_fpop[op].s_op;
+ argc = r300_fpop[op].argc;
+
+ if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
+ REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
+ if (mask & WRITEMASK_Z) {
+ mask = WRITEMASK_W;
+ } else {
+ return;
+ }
+ }
+
+ emit_vop = GL_FALSE;
+ emit_sop = GL_FALSE;
+ if ((mask & WRITEMASK_XYZ) || vop == R300_ALU_OUTC_DP3)
+ emit_vop = GL_TRUE;
+ if ((mask & WRITEMASK_W) || vop == R300_ALU_OUTC_REPL_ALPHA)
+ emit_sop = GL_TRUE;
+
+ pos =
+ find_and_prepare_slot(cs, emit_vop, emit_sop, argc, src, dest,
+ mask);
+ if (pos < 0)
+ return;
+
+ hwdest = t_hw_dst(cs, dest, GL_FALSE, pos); /* Note: Side effects wrt register allocation */
+
+ if (flags & PFS_FLAG_SAT) {
+ vop |= R300_ALU_OUTC_CLAMP;
+ sop |= R300_ALU_OUTA_CLAMP;
+ }
+
+ /* Throw the pieces together and get ALU/1 */
+ if (emit_vop) {
+ code->alu.inst[pos].inst0 |= vop;
+
+ code->alu.inst[pos].inst1 |= hwdest << R300_ALU_DSTC_SHIFT;
+
+ if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
+ if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
+ code->alu.inst[pos].inst1 |=
+ (mask & WRITEMASK_XYZ) <<
+ R300_ALU_DSTC_OUTPUT_MASK_SHIFT;
+ } else
+ assert(0);
+ } else {
+ code->alu.inst[pos].inst1 |=
+ (mask & WRITEMASK_XYZ) <<
+ R300_ALU_DSTC_REG_MASK_SHIFT;
+
+ cs->hwtemps[hwdest].vector_valid = pos + 1;
+ }
+ }
+
+ /* And now ALU/3 */
+ if (emit_sop) {
+ code->alu.inst[pos].inst2 |= sop;
+
+ if (mask & WRITEMASK_W) {
+ if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
+ if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
+ code->alu.inst[pos].inst3 |=
+ (hwdest << R300_ALU_DSTA_SHIFT) |
+ R300_ALU_DSTA_OUTPUT;
+ } else if (REG_GET_INDEX(dest) ==
+ FRAG_RESULT_DEPR) {
+ code->alu.inst[pos].inst3 |=
+ R300_ALU_DSTA_DEPTH;
+ } else
+ assert(0);
+ } else {
+ code->alu.inst[pos].inst3 |=
+ (hwdest << R300_ALU_DSTA_SHIFT) |
+ R300_ALU_DSTA_REG;
+
+ cs->hwtemps[hwdest].scalar_valid = pos + 1;
+ }
+ }
+ }
+
+ return;
+}
+
+static GLfloat SinCosConsts[2][4] = {
+ {
+ 1.273239545, // 4/PI
+ -0.405284735, // -4/(PI*PI)
+ 3.141592654, // PI
+ 0.2225 // weight
+ },
+ {
+ 0.75,
+ 0.0,
+ 0.159154943, // 1/(2*PI)
+ 6.283185307 // 2*PI
+ }
+};
+
+/**
+ * Emit a LIT instruction.
+ * \p flags may be PFS_FLAG_SAT
+ *
+ * Definition of LIT (from ARB_fragment_program):
+ * tmp = VectorLoad(op0);
+ * if (tmp.x < 0) tmp.x = 0;
+ * if (tmp.y < 0) tmp.y = 0;
+ * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
+ * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
+ * result.x = 1.0;
+ * result.y = tmp.x;
+ * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
+ * result.w = 1.0;
+ *
+ * The longest path of computation is the one leading to result.z,
+ * consisting of 5 operations. This implementation of LIT takes
+ * 5 slots. So unless there's some special undocumented opcode,
+ * this implementation is potentially optimal. Unfortunately,
+ * emit_arith is a bit too conservative because it doesn't understand
+ * partial writes to the vector component.
+ */
+static const GLfloat LitConst[4] =
+ { 127.999999, 127.999999, 127.999999, -127.999999 };
+
+static void emit_lit(struct r300_pfs_compile_state *cs,
+ GLuint dest, int mask, GLuint src, int flags)
+{
+ COMPILE_STATE;
+ GLuint cnst;
+ int needTemporary;
+ GLuint temp;
+
+ cnst = emit_const4fv(cs, LitConst);
+
+ needTemporary = 0;
+ if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) {
+ needTemporary = 1;
+ } else if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
+ // LIT is typically followed by DP3/DP4, so there's no point
+ // in creating special code for this case
+ needTemporary = 1;
+ }
+
+ if (needTemporary) {
+ temp = keep(get_temp_reg(cs));
+ } else {
+ temp = keep(dest);
+ }
+
+ // Note: The order of emit_arith inside the slots is relevant,
+ // because emit_arith only looks at scalar vs. vector when resolving
+ // dependencies, and it does not consider individual vector components,
+ // so swizzling between the two parts can create fake dependencies.
+
+ // First slot
+ emit_arith(cs, PFS_OP_MAX, temp, WRITEMASK_XY,
+ keep(src), pfs_zero, undef, 0);
+ emit_arith(cs, PFS_OP_MAX, temp, WRITEMASK_W, src, cnst, undef, 0);
+
+ // Second slot
+ emit_arith(cs, PFS_OP_MIN, temp, WRITEMASK_Z,
+ swizzle(temp, W, W, W, W), cnst, undef, 0);
+ emit_arith(cs, PFS_OP_LG2, temp, WRITEMASK_W,
+ swizzle(temp, Y, Y, Y, Y), undef, undef, 0);
+
+ // Third slot
+ // If desired, we saturate the y result here.
+ // This does not affect the use as a condition variable in the CMP later
+ emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_W,
+ temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0);
+ emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_Y,
+ swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags);
+
+ // Fourth slot
+ emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_X,
+ pfs_one, pfs_one, pfs_zero, 0);
+ emit_arith(cs, PFS_OP_EX2, temp, WRITEMASK_W, temp, undef, undef, 0);
+
+ // Fifth slot
+ emit_arith(cs, PFS_OP_CMP, temp, WRITEMASK_Z,
+ pfs_zero, swizzle(temp, W, W, W, W),
+ negate(swizzle(temp, Y, Y, Y, Y)), flags);
+ emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_W, pfs_one, pfs_one,
+ pfs_zero, 0);
+
+ if (needTemporary) {
+ emit_arith(cs, PFS_OP_MAD, dest, mask,
+ temp, pfs_one, pfs_zero, flags);
+ free_temp(cs, temp);
+ } else {
+ // Decrease refcount of the destination
+ t_hw_dst(cs, dest, GL_FALSE, cs->nrslots);
+ }
+}
+
+static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_instruction *fpi)
+{
+ COMPILE_STATE;
+ GLuint src[3], dest, temp[2];
+ int flags, mask = 0;
+ int const_sin[2];
+
+ if (fpi->SaturateMode == SATURATE_ZERO_ONE)
+ flags = PFS_FLAG_SAT;
+ else
+ flags = 0;
+
+ if (fpi->Opcode != OPCODE_KIL) {
+ dest = t_dst(cs, fpi->DstReg);
+ mask = fpi->DstReg.WriteMask;
+ }
+
+ switch (fpi->Opcode) {
+ case OPCODE_ADD:
+ src[0] = t_src(cs, fpi->SrcReg[0]);
+ src[1] = t_src(cs, fpi->SrcReg[1]);
+ emit_arith(cs, PFS_OP_MAD, dest, mask,
+ src[0], pfs_one, src[1], flags);
+ break;
+ case OPCODE_CMP:
+ src[0] = t_src(cs, fpi->SrcReg[0]);
+ src[1] = t_src(cs, fpi->SrcReg[1]);
+ src[2] = t_src(cs, fpi->SrcReg[2]);
+ /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
+ * r300 - if src2.c < 0.0 ? src1.c : src0.c
+ */
+ emit_arith(cs, PFS_OP_CMP, dest, mask,
+ src[2], src[1], src[0], flags);
+ break;
+ case OPCODE_COS:
+ /*
+ * cos using a parabola (see SIN):
+ * cos(x):
+ * x = (x/(2*PI))+0.75
+ * x = frac(x)
+ * x = (x*2*PI)-PI
+ * result = sin(x)
+ */
+ temp[0] = get_temp_reg(cs);
+ const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
+ const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
+ src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
+
+ /* add 0.5*PI and do range reduction */
+
+ emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
+ swizzle(src[0], X, X, X, X),
+ swizzle(const_sin[1], Z, Z, Z, Z),
+ swizzle(const_sin[1], X, X, X, X), 0);
+
+ emit_arith(cs, PFS_OP_FRC, temp[0], WRITEMASK_X,
+ swizzle(temp[0], X, X, X, X),
+ undef, undef, 0);
+
+ emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W), //2*PI
+ negate(swizzle(const_sin[0], Z, Z, Z, Z)), //-PI
+ 0);
+
+ /* SIN */
+
+ emit_arith(cs, PFS_OP_MAD, temp[0],
+ WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
+ Z, Z, Z,
+ Z),
+ const_sin[0], pfs_zero, 0);
+
+ emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
+ swizzle(temp[0], Y, Y, Y, Y),
+ absolute(swizzle(temp[0], Z, Z, Z, Z)),
+ swizzle(temp[0], X, X, X, X), 0);
+
+ emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Y,
+ swizzle(temp[0], X, X, X, X),
+ absolute(swizzle(temp[0], X, X, X, X)),
+ negate(swizzle(temp[0], X, X, X, X)), 0);
+
+ emit_arith(cs, PFS_OP_MAD, dest, mask,
+ swizzle(temp[0], Y, Y, Y, Y),
+ swizzle(const_sin[0], W, W, W, W),
+ swizzle(temp[0], X, X, X, X), flags);
+
+ free_temp(cs, temp[0]);
+ break;
+ case OPCODE_DP3:
+ src[0] = t_src(cs, fpi->SrcReg[0]);
+ src[1] = t_src(cs, fpi->SrcReg[1]);
+ emit_arith(cs, PFS_OP_DP3, dest, mask,
+ src[0], src[1], undef, flags);
+ break;
+ case OPCODE_DP4:
+ src[0] = t_src(cs, fpi->SrcReg[0]);
+ src[1] = t_src(cs, fpi->SrcReg[1]);
+ emit_arith(cs, PFS_OP_DP4, dest, mask,
+ src[0], src[1], undef, flags);
+ break;
+ case OPCODE_DST:
+ src[0] = t_src(cs, fpi->SrcReg[0]);
+ src[1] = t_src(cs, fpi->SrcReg[1]);
+ /* dest.y = src0.y * src1.y */
+ if (mask & WRITEMASK_Y)
+ emit_arith(cs, PFS_OP_MAD, dest, WRITEMASK_Y,
+ keep(src[0]), keep(src[1]),
+ pfs_zero, flags);
+ /* dest.z = src0.z */
+ if (mask & WRITEMASK_Z)
+ emit_arith(cs, PFS_OP_MAD, dest, WRITEMASK_Z,
+ src[0], pfs_one, pfs_zero, flags);
+ /* result.x = 1.0
+ * result.w = src1.w */
+ if (mask & WRITEMASK_XW) {
+ REG_SET_VSWZ(src[1], SWIZZLE_111); /*Cheat */
+ emit_arith(cs, PFS_OP_MAD, dest,
+ mask & WRITEMASK_XW,
+ src[1], pfs_one, pfs_zero, flags);
+ }
+ break;
+ case OPCODE_EX2:
+ src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
+ emit_arith(cs, PFS_OP_EX2, dest, mask,
+ src[0], undef, undef, flags);
+ break;
+ case OPCODE_FRC:
+ src[0] = t_src(cs, fpi->SrcReg[0]);
+ emit_arith(cs, PFS_OP_FRC, dest, mask,
+ src[0], undef, undef, flags);
+ break;
+ case OPCODE_KIL:
+ emit_tex(cs, fpi, R300_TEX_OP_KIL);
+ break;
+ case OPCODE_LG2:
+ src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
+ emit_arith(cs, PFS_OP_LG2, dest, mask,
+ src[0], undef, undef, flags);
+ break;
+ case OPCODE_LIT:
+ src[0] = t_src(cs, fpi->SrcReg[0]);
+ emit_lit(cs, dest, mask, src[0], flags);
+ break;
+ case OPCODE_LRP:
+ src[0] = t_src(cs, fpi->SrcReg[0]);
+ src[1] = t_src(cs, fpi->SrcReg[1]);
+ src[2] = t_src(cs, fpi->SrcReg[2]);
+ /* result = tmp0tmp1 + (1 - tmp0)tmp2
+ * = tmp0tmp1 + tmp2 + (-tmp0)tmp2
+ * MAD temp, -tmp0, tmp2, tmp2
+ * MAD result, tmp0, tmp1, temp
+ */
+ temp[0] = get_temp_reg(cs);
+ emit_arith(cs, PFS_OP_MAD, temp[0], mask,
+ negate(keep(src[0])), keep(src[2]), src[2],
+ 0);
+ emit_arith(cs, PFS_OP_MAD, dest, mask,
+ src[0], src[1], temp[0], flags);
+ free_temp(cs, temp[0]);
+ break;
+ case OPCODE_MAD:
+ src[0] = t_src(cs, fpi->SrcReg[0]);
+ src[1] = t_src(cs, fpi->SrcReg[1]);
+ src[2] = t_src(cs, fpi->SrcReg[2]);
+ emit_arith(cs, PFS_OP_MAD, dest, mask,
+ src[0], src[1], src[2], flags);
+ break;
+ case OPCODE_MAX:
+ src[0] = t_src(cs, fpi->SrcReg[0]);
+ src[1] = t_src(cs, fpi->SrcReg[1]);
+ emit_arith(cs, PFS_OP_MAX, dest, mask,
+ src[0], src[1], undef, flags);
+ break;
+ case OPCODE_MIN:
+ src[0] = t_src(cs, fpi->SrcReg[0]);
+ src[1] = t_src(cs, fpi->SrcReg[1]);
+ emit_arith(cs, PFS_OP_MIN, dest, mask,
+ src[0], src[1], undef, flags);
+ break;
+ case OPCODE_MOV:
+ src[0] = t_src(cs, fpi->SrcReg[0]);
+ emit_arith(cs, PFS_OP_MAD, dest, mask,
+ src[0], pfs_one, pfs_zero, flags);
+ break;
+ case OPCODE_MUL:
+ src[0] = t_src(cs, fpi->SrcReg[0]);
+ src[1] = t_src(cs, fpi->SrcReg[1]);
+ emit_arith(cs, PFS_OP_MAD, dest, mask,
+ src[0], src[1], pfs_zero, flags);
+ break;
+ case OPCODE_RCP:
+ src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
+ emit_arith(cs, PFS_OP_RCP, dest, mask,
+ src[0], undef, undef, flags);
+ break;
+ case OPCODE_RSQ:
+ src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
+ emit_arith(cs, PFS_OP_RSQ, dest, mask,
+ absolute(src[0]), pfs_zero, pfs_zero, flags);
+ break;
+ case OPCODE_SCS:
+ /*
+ * scs using a parabola :
+ * scs(x):
+ * result.x = sin(-abs(x)+0.5*PI) (cos)
+ * result.y = sin(x) (sin)
+ *
+ */
+ temp[0] = get_temp_reg(cs);
+ temp[1] = get_temp_reg(cs);
+ const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
+ const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
+ src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
+
+ /* x = -abs(x)+0.5*PI */
+ emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(const_sin[0], Z, Z, Z, Z), //PI
+ pfs_half,
+ negate(abs
+ (swizzle(keep(src[0]), X, X, X, X))),
+ 0);
+
+ /* C*x (sin) */
+ emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_W,
+ swizzle(const_sin[0], Y, Y, Y, Y),
+ swizzle(keep(src[0]), X, X, X, X),
+ pfs_zero, 0);
+
+ /* B*x, C*x (cos) */
+ emit_arith(cs, PFS_OP_MAD, temp[0],
+ WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
+ Z, Z, Z,
+ Z),
+ const_sin[0], pfs_zero, 0);
+
+ /* B*x (sin) */
+ emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_W,
+ swizzle(const_sin[0], X, X, X, X),
+ keep(src[0]), pfs_zero, 0);
+
+ /* y = B*x + C*x*abs(x) (sin) */
+ emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_Z,
+ absolute(src[0]),
+ swizzle(temp[0], W, W, W, W),
+ swizzle(temp[1], W, W, W, W), 0);
+
+ /* y = B*x + C*x*abs(x) (cos) */
+ emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_W,
+ swizzle(temp[0], Y, Y, Y, Y),
+ absolute(swizzle(temp[0], Z, Z, Z, Z)),
+ swizzle(temp[0], X, X, X, X), 0);
+
+ /* y*abs(y) - y (cos), y*abs(y) - y (sin) */
+ emit_arith(cs, PFS_OP_MAD, temp[0],
+ WRITEMASK_X | WRITEMASK_Y, swizzle(temp[1],
+ W, Z, Y,
+ X),
+ absolute(swizzle(temp[1], W, Z, Y, X)),
+ negate(swizzle(temp[1], W, Z, Y, X)), 0);
+
+ /* dest.xy = mad(temp.xy, P, temp2.wz) */
+ emit_arith(cs, PFS_OP_MAD, dest,
+ mask & (WRITEMASK_X | WRITEMASK_Y), temp[0],
+ swizzle(const_sin[0], W, W, W, W),
+ swizzle(temp[1], W, Z, Y, X), flags);
+
+ free_temp(cs, temp[0]);
+ free_temp(cs, temp[1]);
+ break;
+ case OPCODE_SIN:
+ /*
+ * using a parabola:
+ * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
+ * extra precision is obtained by weighting against
+ * itself squared.
+ */
+
+ temp[0] = get_temp_reg(cs);
+ const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
+ const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
+ src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
+
+ /* do range reduction */
+
+ emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
+ swizzle(keep(src[0]), X, X, X, X),
+ swizzle(const_sin[1], Z, Z, Z, Z),
+ pfs_half, 0);
+
+ emit_arith(cs, PFS_OP_FRC, temp[0], WRITEMASK_X,
+ swizzle(temp[0], X, X, X, X),
+ undef, undef, 0);
+
+ emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W), //2*PI
+ negate(swizzle(const_sin[0], Z, Z, Z, Z)), //PI
+ 0);
+
+ /* SIN */
+
+ emit_arith(cs, PFS_OP_MAD, temp[0],
+ WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
+ Z, Z, Z,
+ Z),
+ const_sin[0], pfs_zero, 0);
+
+ emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
+ swizzle(temp[0], Y, Y, Y, Y),
+ absolute(swizzle(temp[0], Z, Z, Z, Z)),
+ swizzle(temp[0], X, X, X, X), 0);
+
+ emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Y,
+ swizzle(temp[0], X, X, X, X),
+ absolute(swizzle(temp[0], X, X, X, X)),
+ negate(swizzle(temp[0], X, X, X, X)), 0);
+
+ emit_arith(cs, PFS_OP_MAD, dest, mask,
+ swizzle(temp[0], Y, Y, Y, Y),
+ swizzle(const_sin[0], W, W, W, W),
+ swizzle(temp[0], X, X, X, X), flags);
+
+ free_temp(cs, temp[0]);
+ break;
+ case OPCODE_TEX:
+ emit_tex(cs, fpi, R300_TEX_OP_LD);
+ break;
+ case OPCODE_TXB:
+ emit_tex(cs, fpi, R300_TEX_OP_TXB);
+ break;
+ case OPCODE_TXP:
+ emit_tex(cs, fpi, R300_TEX_OP_TXP);
+ break;
+ default:
+ ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
+ break;
+ }
+}
+
+static GLboolean parse_program(struct r300_pfs_compile_state *cs)
+{
+ COMPILE_STATE;
+ int clauseidx;
+
+ for (clauseidx = 0; clauseidx < cs->compiler->compiler.NumClauses; ++clauseidx) {
+ struct radeon_clause* clause = &cs->compiler->compiler.Clauses[clauseidx];
+ int ip;
+
+ for(ip = 0; ip < clause->NumInstructions; ++ip) {
+ emit_instruction(cs, clause->Instructions + ip);
+
+ if (fp->error)
+ return GL_FALSE;
+ }
+ }
+
+ return GL_TRUE;
+}
+
+
+/* - Init structures
+ * - Determine what hwregs each input corresponds to
+ */
+static void init_program(struct r300_pfs_compile_state *cs)
+{
+ COMPILE_STATE;
+ struct gl_fragment_program *mp = &fp->mesa_program;
+ GLuint InputsRead = mp->Base.InputsRead;
+ GLuint temps_used = 0; /* for fp->temps[] */
+ int i, j;
+
+ /* New compile, reset tracking data */
+ fp->optimization =
+ driQueryOptioni(&cs->compiler->r300->radeon.optionCache, "fp_optimization");
+ fp->translated = GL_FALSE;
+ fp->error = GL_FALSE;
+ fp->WritesDepth = GL_FALSE;
+ code->tex.length = 0;
+ code->cur_node = 0;
+ code->first_node_has_tex = 0;
+ code->const_nr = 0;
+ code->max_temp_idx = 0;
+ code->node[0].alu_end = -1;
+ code->node[0].tex_end = -1;
+
+ for (i = 0; i < PFS_MAX_ALU_INST; i++) {
+ for (j = 0; j < 3; j++) {
+ cs->slot[i].vsrc[j] = SRC_CONST;
+ cs->slot[i].ssrc[j] = SRC_CONST;
+ }
+ }
+
+ /* Work out what temps the Mesa inputs correspond to, this must match
+ * what setup_rs_unit does, which shouldn't be a problem as rs_unit
+ * configures itself based on the fragprog's InputsRead
+ *
+ * NOTE: this depends on get_hw_temp() allocating registers in order,
+ * starting from register 0.
+ */
+
+ /* Texcoords come first */
+ for (i = 0; i < cs->compiler->r300->radeon.glCtx->Const.MaxTextureUnits; i++) {
+ if (InputsRead & (FRAG_BIT_TEX0 << i)) {
+ cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0;
+ cs->inputs[FRAG_ATTRIB_TEX0 + i].reg =
+ get_hw_temp(cs, 0);
+ }
+ }
+ InputsRead &= ~FRAG_BITS_TEX_ANY;
+
+ /* fragment position treated as a texcoord */
+ if (InputsRead & FRAG_BIT_WPOS) {
+ cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
+ cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(cs, 0);
+ }
+ InputsRead &= ~FRAG_BIT_WPOS;
+
+ /* Then primary colour */
+ if (InputsRead & FRAG_BIT_COL0) {
+ cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
+ cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(cs, 0);
+ }
+ InputsRead &= ~FRAG_BIT_COL0;
+
+ /* Secondary color */
+ if (InputsRead & FRAG_BIT_COL1) {
+ cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
+ cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(cs, 0);
+ }
+ InputsRead &= ~FRAG_BIT_COL1;
+
+ /* Anything else */
+ if (InputsRead) {
+ WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead);
+ /* force read from hwreg 0 for now */
+ for (i = 0; i < 32; i++)
+ if (InputsRead & (1 << i))
+ cs->inputs[i].reg = 0;
+ }
+
+ /* Pre-parse the program, grabbing refcounts on input/temp regs.
+ * That way, we can free up the reg when it's no longer needed
+ */
+ for (i = 0; i < cs->compiler->compiler.Clauses[0].NumInstructions; ++i) {
+ struct prog_instruction *fpi = cs->compiler->compiler.Clauses[0].Instructions + i;
+ int idx;
+
+ for (j = 0; j < 3; j++) {
+ idx = fpi->SrcReg[j].Index;
+ switch (fpi->SrcReg[j].File) {
+ case PROGRAM_TEMPORARY:
+ if (!(temps_used & (1 << idx))) {
+ cs->temps[idx].reg = -1;
+ cs->temps[idx].refcount = 1;
+ temps_used |= (1 << idx);
+ } else
+ cs->temps[idx].refcount++;
+ break;
+ case PROGRAM_INPUT:
+ cs->inputs[idx].refcount++;
+ break;
+ default:
+ break;
+ }
+ }
+
+ idx = fpi->DstReg.Index;
+ if (fpi->DstReg.File == PROGRAM_TEMPORARY) {
+ if (!(temps_used & (1 << idx))) {
+ cs->temps[idx].reg = -1;
+ cs->temps[idx].refcount = 1;
+ temps_used |= (1 << idx);
+ } else
+ cs->temps[idx].refcount++;
+ }
+ }
+ cs->temp_in_use = temps_used;
+}
+
+
+/**
+ * Final compilation step: Turn the intermediate radeon_program into
+ * machine-readable instructions.
+ */
+GLboolean r300FragmentProgramEmit(struct r300_fragment_program_compiler *compiler)
+{
+ struct r300_pfs_compile_state cs;
+ struct r300_fragment_program_code *code = compiler->code;
+
+ _mesa_memset(&cs, 0, sizeof(cs));
+ cs.compiler = compiler;
+ init_program(&cs);
+
+ if (!parse_program(&cs))
+ return GL_FALSE;
+
+ /* Finish off */
+ code->node[code->cur_node].alu_end =
+ cs.nrslots - code->node[code->cur_node].alu_offset - 1;
+ if (code->node[code->cur_node].tex_end < 0)
+ code->node[code->cur_node].tex_end = 0;
+ code->alu_offset = 0;
+ code->alu_end = cs.nrslots - 1;
+ code->tex_offset = 0;
+ code->tex_end = code->tex.length ? code->tex.length - 1 : 0;
+ assert(code->node[code->cur_node].alu_end >= 0);
+ assert(code->alu_end >= 0);
+
+ return GL_TRUE;
+}
+
diff --git a/src/mesa/drivers/dri/r300/r300_ioctl.c b/src/mesa/drivers/dri/r300/r300_ioctl.c
index b0225453d3d..71821a01ea0 100644
--- a/src/mesa/drivers/dri/r300/r300_ioctl.c
+++ b/src/mesa/drivers/dri/r300/r300_ioctl.c
@@ -54,6 +54,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#include "r300_vertprog.h"
#include "radeon_reg.h"
#include "r300_emit.h"
+#include "r300_fragprog.h"
#include "vblank.h"
@@ -130,8 +131,6 @@ static void r300ClearBuffer(r300ContextPtr r300, int flags, int buffer)
t1 |= R300_Z_ENABLE | R300_Z_WRITE_ENABLE;
t2 |=
(R300_ZS_ALWAYS << R300_Z_FUNC_SHIFT);
- } else { //XXX
- t1 |= R300_STENCIL_FRONT_BACK; // disable
}
if (flags & CLEARBUFFER_STENCIL) {
@@ -144,20 +143,13 @@ static void r300ClearBuffer(r300ContextPtr r300, int flags, int buffer)
(R300_ZS_REPLACE <<
R300_S_FRONT_ZPASS_OP_SHIFT) |
(R300_ZS_REPLACE <<
- R300_S_FRONT_ZFAIL_OP_SHIFT) |
- (R300_ZS_ALWAYS <<
- R300_S_BACK_FUNC_SHIFT) |
- (R300_ZS_REPLACE <<
- R300_S_BACK_SFAIL_OP_SHIFT) |
- (R300_ZS_REPLACE <<
- R300_S_BACK_ZPASS_OP_SHIFT) |
- (R300_ZS_REPLACE <<
- R300_S_BACK_ZFAIL_OP_SHIFT);
+ R300_S_FRONT_ZFAIL_OP_SHIFT);
}
e32(t1);
e32(t2);
- e32(r300->state.stencil.clear);
+ e32(((ctx->Stencil.WriteMask[0] & R300_STENCILREF_MASK) << R300_STENCILWRITEMASK_SHIFT) |
+ (ctx->Stencil.Clear & R300_STENCILREF_MASK));
}
cmd2 = (drm_r300_cmd_header_t *) r300AllocCmdBuf(r300, 9, __FUNCTION__);
@@ -307,7 +299,6 @@ static void r300EmitClearState(GLcontext * ctx)
reg_start(R300_RS_INST_0, 0);
e32(R300_RS_INST_COL_CN_WRITE);
} else {
-
R300_STATECHANGE(r300, ri);
reg_start(R500_RS_IP_0, 7);
for (i = 0; i < 8; ++i) {
diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h
index 21e1dc29de7..8b00f9958cc 100644
--- a/src/mesa/drivers/dri/r300/r300_reg.h
+++ b/src/mesa/drivers/dri/r300/r300_reg.h
@@ -1371,29 +1371,22 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
# define R300_TX_MAG_FILTER_4 (0 << 9)
# define R300_TX_MAG_FILTER_NEAREST (1 << 9)
# define R300_TX_MAG_FILTER_LINEAR (2 << 9)
+# define R300_TX_MAG_FILTER_ANISO (3 << 9)
# define R300_TX_MAG_FILTER_MASK (3 << 9)
# define R300_TX_MIN_FILTER_NEAREST (1 << 11)
# define R300_TX_MIN_FILTER_LINEAR (2 << 11)
-# define R300_TX_MIN_FILTER_NEAREST_MIP_NEAREST (5 << 11) /* TODO: use spec */
-# define R300_TX_MIN_FILTER_NEAREST_MIP_LINEAR (9 << 11) /* TODO: use spec */
-# define R300_TX_MIN_FILTER_LINEAR_MIP_NEAREST (6 << 11) /* TODO: use spec */
-# define R300_TX_MIN_FILTER_LINEAR_MIP_LINEAR (10 << 11) /* TODO: use spec */
-
-/* NOTE: NEAREST doesnt seem to exist.
- * Im not seting MAG_FILTER_MASK and (3 << 11) on for all
- * anisotropy modes because that would void selected mag filter
- */
-# define R300_TX_MIN_FILTER_ANISO_NEAREST (0 << 13)
-# define R300_TX_MIN_FILTER_ANISO_LINEAR (0 << 13)
-# define R300_TX_MIN_FILTER_ANISO_NEAREST_MIP_NEAREST (1 << 13)
-# define R300_TX_MIN_FILTER_ANISO_NEAREST_MIP_LINEAR (2 << 13)
-# define R300_TX_MIN_FILTER_MASK ( (15 << 11) | (3 << 13) )
-# define R300_TX_MAX_ANISO_1_TO_1 (0 << 21)
-# define R300_TX_MAX_ANISO_2_TO_1 (2 << 21)
-# define R300_TX_MAX_ANISO_4_TO_1 (4 << 21)
-# define R300_TX_MAX_ANISO_8_TO_1 (6 << 21)
-# define R300_TX_MAX_ANISO_16_TO_1 (8 << 21)
-# define R300_TX_MAX_ANISO_MASK (14 << 21)
+# define R300_TX_MIN_FILTER_ANISO (3 << 11)
+# define R300_TX_MIN_FILTER_MASK (3 << 11)
+# define R300_TX_MIN_FILTER_MIP_NONE (0 << 13)
+# define R300_TX_MIN_FILTER_MIP_NEAREST (1 << 13)
+# define R300_TX_MIN_FILTER_MIP_LINEAR (2 << 13)
+# define R300_TX_MIN_FILTER_MIP_MASK (3 << 13)
+# define R300_TX_MAX_ANISO_1_TO_1 (0 << 21)
+# define R300_TX_MAX_ANISO_2_TO_1 (1 << 21)
+# define R300_TX_MAX_ANISO_4_TO_1 (2 << 21)
+# define R300_TX_MAX_ANISO_8_TO_1 (3 << 21)
+# define R300_TX_MAX_ANISO_16_TO_1 (4 << 21)
+# define R300_TX_MAX_ANISO_MASK (7 << 21)
#define R300_TX_FILTER1_0 0x4440
# define R300_CHROMA_KEY_MODE_DISABLE 0
@@ -1401,7 +1394,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
# define R300_CHROMA_KEY_BLEND 2
# define R300_MC_ROUND_NORMAL (0<<2)
# define R300_MC_ROUND_MPEG4 (1<<2)
-# define R300_LOD_BIAS_MASK 0x1fff
+# define R300_LOD_BIAS_SHIFT 3
+# define R300_LOD_BIAS_MASK 0x1ff8
# define R300_EDGE_ANISO_EDGE_DIAG (0<<13)
# define R300_EDGE_ANISO_EDGE_ONLY (1<<13)
# define R300_MC_COORD_TRUNCATE_DISABLE (0<<14)
@@ -1432,9 +1426,9 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
They are given meanings as R, G, B and Alpha by the swizzle
specification */
# define R300_TX_FORMAT_X8 0x0
-# define R500_TX_FORMAT_X1 0x0 // bit set in format 2
+# define R500_TX_FORMAT_X1 0x0 // bit set in format 2
# define R300_TX_FORMAT_X16 0x1
-# define R500_TX_FORMAT_X1_REV 0x0 // bit set in format 2
+# define R500_TX_FORMAT_X1_REV 0x0 // bit set in format 2
# define R300_TX_FORMAT_Y4X4 0x2
# define R300_TX_FORMAT_Y8X8 0x3
# define R300_TX_FORMAT_Y16X16 0x4
@@ -2238,7 +2232,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
# define R300_RB3D_AARESOLVE_CTL_AARESOLVE_GAMMA_22 (1 << 1)
# define R300_RB3D_AARESOLVE_CTL_AARESOLVE_ALPHA_SAMPLE0 (0 << 2)
# define R300_RB3D_AARESOLVE_CTL_AARESOLVE_ALPHA_AVERAGE (1 << 2)
-
+
/* Discard src pixels less than or equal to threshold. */
#define R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD 0x4ea0
@@ -3179,7 +3173,7 @@ enum {
* 2 to end: Up to 16380 dwords of vertex data.
*/
#define R300_PACKET3_3D_DRAW_INDX 0x00002A00
-
+
/* Specify the full set of vertex arrays as (address, stride).
* The first parameter is the number of vertex arrays specified.
@@ -3209,7 +3203,7 @@ enum {
/* Same as R300_PACKET3_3D_DRAW_INDX but without VAP_VTX_FMT */
#define R300_PACKET3_3D_DRAW_INDX_2 0x00003600
-/* Clears a portion of hierachical Z RAM
+/* Clears a portion of hierachical Z RAM
* 3 dword parameters
* 0. START
* 1. COUNT: 13:0 (max is 0x3FFF)
diff --git a/src/mesa/drivers/dri/r300/r300_render.c b/src/mesa/drivers/dri/r300/r300_render.c
index fc07105c560..8f74f9d785e 100644
--- a/src/mesa/drivers/dri/r300/r300_render.c
+++ b/src/mesa/drivers/dri/r300/r300_render.c
@@ -74,6 +74,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
#include "r300_reg.h"
#include "r300_tex.h"
#include "r300_emit.h"
+#include "r300_fragprog.h"
extern int future_hw_tcl_on;
/**
diff --git a/src/mesa/drivers/dri/r300/r300_shader.c b/src/mesa/drivers/dri/r300/r300_shader.c
index 5c8fd8a5e58..f30fd986e0f 100644
--- a/src/mesa/drivers/dri/r300/r300_shader.c
+++ b/src/mesa/drivers/dri/r300/r300_shader.c
@@ -28,7 +28,6 @@ static struct gl_program *r300NewProgram(GLcontext * ctx, GLenum target,
target, id);
} else {
r300_fp = CALLOC_STRUCT(r300_fragment_program);
- r300_fp->ctx = ctx;
return _mesa_init_fragment_program(ctx, &r300_fp->mesa_program,
target, id);
}
diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c
index 550f7108542..55d3d55e900 100644
--- a/src/mesa/drivers/dri/r300/r300_state.c
+++ b/src/mesa/drivers/dri/r300/r300_state.c
@@ -60,6 +60,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#include "r300_state.h"
#include "r300_reg.h"
#include "r300_emit.h"
+#include "r300_fragprog.h"
#include "r300_tex.h"
#include "drirenderbuffer.h"
@@ -525,24 +526,15 @@ static void r300SetDepthState(GLcontext * ctx)
r300ContextPtr r300 = R300_CONTEXT(ctx);
R300_STATECHANGE(r300, zs);
- r300->hw.zs.cmd[R300_ZS_CNTL_0] &= R300_STENCIL_ENABLE; // XXX
- r300->hw.zs.cmd[R300_ZS_CNTL_1] &=
- ~(R300_ZS_MASK << R300_Z_FUNC_SHIFT);
+ r300->hw.zs.cmd[R300_ZS_CNTL_0] &= R300_STENCIL_ENABLE|R300_STENCIL_FRONT_BACK;
+ r300->hw.zs.cmd[R300_ZS_CNTL_1] &= ~(R300_ZS_MASK << R300_Z_FUNC_SHIFT);
- if (ctx->Depth.Test && ctx->Depth.Func != GL_NEVER) {
+ if (ctx->Depth.Test) {
+ r300->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_Z_ENABLE;
if (ctx->Depth.Mask)
- r300->hw.zs.cmd[R300_ZS_CNTL_0] |=
- R300_Z_ENABLE | R300_Z_WRITE_ENABLE | R300_STENCIL_FRONT_BACK; // XXX
- else
- r300->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_Z_ENABLE | R300_STENCIL_FRONT_BACK; // XXX
-
- r300->hw.zs.cmd[R300_ZS_CNTL_1] |=
- translate_func(ctx->Depth.
- Func) << R300_Z_FUNC_SHIFT;
- } else {
- r300->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_STENCIL_FRONT_BACK; // XXX
+ r300->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_Z_WRITE_ENABLE;
r300->hw.zs.cmd[R300_ZS_CNTL_1] |=
- translate_func(GL_NEVER) << R300_Z_FUNC_SHIFT;
+ translate_func(ctx->Depth.Func) << R300_Z_FUNC_SHIFT;
}
r300SetEarlyZState(ctx);
@@ -925,7 +917,7 @@ static void r300StencilFuncSeparate(GLcontext * ctx, GLenum face,
GLuint flag;
R300_STATECHANGE(rmesa, zs);
-
+ rmesa->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_STENCIL_FRONT_BACK;
rmesa->hw.zs.cmd[R300_ZS_CNTL_1] &= ~((R300_ZS_MASK <<
R300_S_FRONT_FUNC_SHIFT)
| (R300_ZS_MASK <<
@@ -1000,17 +992,6 @@ static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
}
}
-static void r300ClearStencil(GLcontext * ctx, GLint s)
-{
- r300ContextPtr rmesa = R300_CONTEXT(ctx);
-
- rmesa->state.stencil.clear =
- ((GLuint) (ctx->Stencil.Clear & R300_STENCILREF_MASK) |
- (R300_STENCILREF_MASK << R300_STENCILMASK_SHIFT) |
- ((ctx->Stencil.WriteMask[0] & R300_STENCILREF_MASK) <<
- R300_STENCILMASK_SHIFT));
-}
-
/* =============================================================
* Window position and viewport transformation
*/
@@ -1284,7 +1265,7 @@ static unsigned long gen_fixed_filter(unsigned long f)
return f;
mag = f & R300_TX_MAG_FILTER_MASK;
- min = f & R300_TX_MIN_FILTER_MASK;
+ min = f & (R300_TX_MIN_FILTER_MASK|R300_TX_MIN_FILTER_MIP_MASK);
/* TODO: Check for anisto filters too */
if ((mag != R300_TX_MAG_FILTER_NEAREST)
@@ -1328,18 +1309,19 @@ static void r300SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
int i;
struct r300_fragment_program *fp = (struct r300_fragment_program *)
(char *)ctx->FragmentProgram._Current;
+ struct r300_fragment_program_code *code = &fp->code;
R300_STATECHANGE(r300, fpt);
- for (i = 0; i < fp->tex.length; i++) {
+ for (i = 0; i < code->tex.length; i++) {
int unit;
int opcode;
unsigned long val;
- unit = fp->tex.inst[i] >> R300_TEX_ID_SHIFT;
+ unit = code->tex.inst[i] >> R300_TEX_ID_SHIFT;
unit &= 15;
- val = fp->tex.inst[i];
+ val = code->tex.inst[i];
val &= ~R300_TEX_ID_MASK;
opcode =
@@ -1361,7 +1343,7 @@ static void r300SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
}
r300->hw.fpt.cmd[R300_FPT_CMD_0] =
- cmdpacket0(R300_US_TEX_INST_0, fp->tex.length);
+ cmdpacket0(R300_US_TEX_INST_0, code->tex.length);
}
static void r500SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
@@ -1369,14 +1351,15 @@ static void r500SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
int i;
struct r500_fragment_program *fp = (struct r500_fragment_program *)
(char *)ctx->FragmentProgram._Current;
+ struct r500_fragment_program_code *code = &fp->code;
/* find all the texture instructions and relocate the texture units */
- for (i = 0; i < fp->inst_end + 1; i++) {
- if ((fp->inst[i].inst0 & 0x3) == R500_INST_TYPE_TEX) {
+ for (i = 0; i < code->inst_end + 1; i++) {
+ if ((code->inst[i].inst0 & 0x3) == R500_INST_TYPE_TEX) {
uint32_t val;
int unit, opcode, new_unit;
- val = fp->inst[i].inst1;
+ val = code->inst[i].inst1;
unit = (val >> 16) & 0xf;
@@ -1393,11 +1376,23 @@ static void r500SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
}
}
val |= R500_TEX_ID(new_unit);
- fp->inst[i].inst1 = val;
+ code->inst[i].inst1 = val;
}
}
}
+static GLuint r300CalculateTexLodBias(GLfloat bias)
+{
+ GLuint b;
+ b = (unsigned int)fabsf(ceilf(bias*31));
+ if (signbit(bias)) {
+ b ^= 0x3ff; /* 10 bits */
+ }
+ b <<= 3;
+ b &= R300_LOD_BIAS_MASK;
+ return b;
+}
+
static void r300SetupTextures(GLcontext * ctx)
{
int i, mtu;
@@ -1461,8 +1456,8 @@ static void r300SetupTextures(GLcontext * ctx)
r300->hw.tex.filter.cmd[R300_TEX_VALUE_0 +
hw_tmu] =
gen_fixed_filter(t->filter) | (hw_tmu << 28);
- /* Currently disabled! */
- r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] = 0x0; //0x20501f80;
+ r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] = t->filter_1
+ | r300CalculateTexLodBias(r300->LODBias);
r300->hw.tex.size.cmd[R300_TEX_VALUE_0 + hw_tmu] =
t->size;
r300->hw.tex.format.cmd[R300_TEX_VALUE_0 +
@@ -2426,6 +2421,7 @@ static void r300SetupPixelShader(r300ContextPtr rmesa)
GLcontext *ctx = rmesa->radeon.glCtx;
struct r300_fragment_program *fp = (struct r300_fragment_program *)
(char *)ctx->FragmentProgram._Current;
+ struct r300_fragment_program_code *code;
int i, k;
if (!fp) /* should only happenen once, just after context is created */
@@ -2437,62 +2433,63 @@ static void r300SetupPixelShader(r300ContextPtr rmesa)
__FUNCTION__);
return;
}
+ code = &fp->code;
r300SetupTextures(ctx);
R300_STATECHANGE(rmesa, fpi[0]);
- rmesa->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_INST_0, fp->alu_end + 1);
- for (i = 0; i <= fp->alu_end; i++) {
- rmesa->hw.fpi[0].cmd[R300_FPI_INSTR_0 + i] = fp->alu.inst[i].inst0;
+ rmesa->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_INST_0, code->alu_end + 1);
+ for (i = 0; i <= code->alu_end; i++) {
+ rmesa->hw.fpi[0].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst0;
}
R300_STATECHANGE(rmesa, fpi[1]);
- rmesa->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_ADDR_0, fp->alu_end + 1);
- for (i = 0; i <= fp->alu_end; i++) {
- rmesa->hw.fpi[1].cmd[R300_FPI_INSTR_0 + i] = fp->alu.inst[i].inst1;
+ rmesa->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_ADDR_0, code->alu_end + 1);
+ for (i = 0; i <= code->alu_end; i++) {
+ rmesa->hw.fpi[1].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst1;
}
R300_STATECHANGE(rmesa, fpi[2]);
- rmesa->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_INST_0, fp->alu_end + 1);
- for (i = 0; i <= fp->alu_end; i++) {
- rmesa->hw.fpi[2].cmd[R300_FPI_INSTR_0 + i] = fp->alu.inst[i].inst2;
+ rmesa->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_INST_0, code->alu_end + 1);
+ for (i = 0; i <= code->alu_end; i++) {
+ rmesa->hw.fpi[2].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst2;
}
R300_STATECHANGE(rmesa, fpi[3]);
- rmesa->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_ADDR_0, fp->alu_end + 1);
- for (i = 0; i <= fp->alu_end; i++) {
- rmesa->hw.fpi[3].cmd[R300_FPI_INSTR_0 + i] = fp->alu.inst[i].inst3;
+ rmesa->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_ADDR_0, code->alu_end + 1);
+ for (i = 0; i <= code->alu_end; i++) {
+ rmesa->hw.fpi[3].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst3;
}
R300_STATECHANGE(rmesa, fp);
- rmesa->hw.fp.cmd[R300_FP_CNTL0] = fp->cur_node | (fp->first_node_has_tex << 3);
- rmesa->hw.fp.cmd[R300_FP_CNTL1] = fp->max_temp_idx;
+ rmesa->hw.fp.cmd[R300_FP_CNTL0] = code->cur_node | (code->first_node_has_tex << 3);
+ rmesa->hw.fp.cmd[R300_FP_CNTL1] = code->max_temp_idx;
rmesa->hw.fp.cmd[R300_FP_CNTL2] =
- (fp->alu_offset << R300_PFS_CNTL_ALU_OFFSET_SHIFT) |
- (fp->alu_end << R300_PFS_CNTL_ALU_END_SHIFT) |
- (fp->tex_offset << R300_PFS_CNTL_TEX_OFFSET_SHIFT) |
- (fp->tex_end << R300_PFS_CNTL_TEX_END_SHIFT);
+ (code->alu_offset << R300_PFS_CNTL_ALU_OFFSET_SHIFT) |
+ (code->alu_end << R300_PFS_CNTL_ALU_END_SHIFT) |
+ (code->tex_offset << R300_PFS_CNTL_TEX_OFFSET_SHIFT) |
+ (code->tex_end << R300_PFS_CNTL_TEX_END_SHIFT);
/* I just want to say, the way these nodes are stored.. weird.. */
- for (i = 0, k = (4 - (fp->cur_node + 1)); i < 4; i++, k++) {
- if (i < (fp->cur_node + 1)) {
+ for (i = 0, k = (4 - (code->cur_node + 1)); i < 4; i++, k++) {
+ if (i < (code->cur_node + 1)) {
rmesa->hw.fp.cmd[R300_FP_NODE0 + k] =
- (fp->node[i].alu_offset << R300_ALU_START_SHIFT) |
- (fp->node[i].alu_end << R300_ALU_SIZE_SHIFT) |
- (fp->node[i].tex_offset << R300_TEX_START_SHIFT) |
- (fp->node[i].tex_end << R300_TEX_SIZE_SHIFT) |
- fp->node[i].flags;
+ (code->node[i].alu_offset << R300_ALU_START_SHIFT) |
+ (code->node[i].alu_end << R300_ALU_SIZE_SHIFT) |
+ (code->node[i].tex_offset << R300_TEX_START_SHIFT) |
+ (code->node[i].tex_end << R300_TEX_SIZE_SHIFT) |
+ code->node[i].flags;
} else {
rmesa->hw.fp.cmd[R300_FP_NODE0 + (3 - i)] = 0;
}
}
R300_STATECHANGE(rmesa, fpp);
- rmesa->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(R300_PFS_PARAM_0_X, fp->const_nr * 4);
- for (i = 0; i < fp->const_nr; i++) {
- rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat24(fp->constant[i][0]);
- rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat24(fp->constant[i][1]);
- rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat24(fp->constant[i][2]);
- rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat24(fp->constant[i][3]);
+ rmesa->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(R300_PFS_PARAM_0_X, code->const_nr * 4);
+ for (i = 0; i < code->const_nr; i++) {
+ rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat24(code->constant[i][0]);
+ rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat24(code->constant[i][1]);
+ rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat24(code->constant[i][2]);
+ rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat24(code->constant[i][3]);
}
}
@@ -2516,6 +2513,7 @@ static void r500SetupPixelShader(r300ContextPtr rmesa)
struct r500_fragment_program *fp = (struct r500_fragment_program *)
(char *)ctx->FragmentProgram._Current;
int i;
+ struct r500_fragment_program_code *code;
if (!fp) /* should only happenen once, just after context is created */
return;
@@ -2529,42 +2527,43 @@ static void r500SetupPixelShader(r300ContextPtr rmesa)
__FUNCTION__);
return;
}
+ code = &fp->code;
r300SetupTextures(ctx);
R300_STATECHANGE(rmesa, fp);
- rmesa->hw.fp.cmd[R500_FP_PIXSIZE] = fp->max_temp_idx;
+ rmesa->hw.fp.cmd[R500_FP_PIXSIZE] = code->max_temp_idx;
rmesa->hw.fp.cmd[R500_FP_CODE_ADDR] =
- R500_US_CODE_START_ADDR(fp->inst_offset) |
- R500_US_CODE_END_ADDR(fp->inst_end);
+ R500_US_CODE_START_ADDR(code->inst_offset) |
+ R500_US_CODE_END_ADDR(code->inst_end);
rmesa->hw.fp.cmd[R500_FP_CODE_RANGE] =
- R500_US_CODE_RANGE_ADDR(fp->inst_offset) |
- R500_US_CODE_RANGE_SIZE(fp->inst_end);
+ R500_US_CODE_RANGE_ADDR(code->inst_offset) |
+ R500_US_CODE_RANGE_SIZE(code->inst_end);
rmesa->hw.fp.cmd[R500_FP_CODE_OFFSET] =
R500_US_CODE_OFFSET_ADDR(0); /* FIXME when we add flow control */
R300_STATECHANGE(rmesa, r500fp);
/* Emit our shader... */
- for (i = 0; i < fp->inst_end+1; i++) {
- rmesa->hw.r500fp.cmd[i*6+1] = fp->inst[i].inst0;
- rmesa->hw.r500fp.cmd[i*6+2] = fp->inst[i].inst1;
- rmesa->hw.r500fp.cmd[i*6+3] = fp->inst[i].inst2;
- rmesa->hw.r500fp.cmd[i*6+4] = fp->inst[i].inst3;
- rmesa->hw.r500fp.cmd[i*6+5] = fp->inst[i].inst4;
- rmesa->hw.r500fp.cmd[i*6+6] = fp->inst[i].inst5;
+ for (i = 0; i < code->inst_end+1; i++) {
+ rmesa->hw.r500fp.cmd[i*6+1] = code->inst[i].inst0;
+ rmesa->hw.r500fp.cmd[i*6+2] = code->inst[i].inst1;
+ rmesa->hw.r500fp.cmd[i*6+3] = code->inst[i].inst2;
+ rmesa->hw.r500fp.cmd[i*6+4] = code->inst[i].inst3;
+ rmesa->hw.r500fp.cmd[i*6+5] = code->inst[i].inst4;
+ rmesa->hw.r500fp.cmd[i*6+6] = code->inst[i].inst5;
}
- bump_r500fp_count(rmesa->hw.r500fp.cmd, (fp->inst_end + 1) * 6);
+ bump_r500fp_count(rmesa->hw.r500fp.cmd, (code->inst_end + 1) * 6);
R300_STATECHANGE(rmesa, r500fp_const);
- for (i = 0; i < fp->const_nr; i++) {
- rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat32(fp->constant[i][0]);
- rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat32(fp->constant[i][1]);
- rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat32(fp->constant[i][2]);
- rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat32(fp->constant[i][3]);
+ for (i = 0; i < code->const_nr; i++) {
+ rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat32(code->constant[i][0]);
+ rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat32(code->constant[i][1]);
+ rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat32(code->constant[i][2]);
+ rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat32(code->constant[i][3]);
}
- bump_r500fp_const_count(rmesa->hw.r500fp_const.cmd, fp->const_nr * 4);
+ bump_r500fp_const_count(rmesa->hw.r500fp_const.cmd, code->const_nr * 4);
}
@@ -2637,12 +2636,10 @@ void r300InitState(r300ContextPtr r300)
case 16:
r300->state.depth.scale = 1.0 / (GLfloat) 0xffff;
depth_fmt = R300_DEPTHFORMAT_16BIT_INT_Z;
- r300->state.stencil.clear = 0x00000000;
break;
case 24:
r300->state.depth.scale = 1.0 / (GLfloat) 0xffffff;
depth_fmt = R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
- r300->state.stencil.clear = 0x00ff0000;
break;
default:
fprintf(stderr, "Error: Unsupported depth %d... exiting\n",
@@ -2706,7 +2703,6 @@ void r300InitStateFuncs(struct dd_function_table *functions)
functions->ShadeModel = r300ShadeModel;
/* Stencil related */
- functions->ClearStencil = r300ClearStencil;
functions->StencilFuncSeparate = r300StencilFuncSeparate;
functions->StencilMaskSeparate = r300StencilMaskSeparate;
functions->StencilOpSeparate = r300StencilOpSeparate;
diff --git a/src/mesa/drivers/dri/r300/r300_tex.c b/src/mesa/drivers/dri/r300/r300_tex.c
index 0b4acec0448..5f54bcad9a3 100644
--- a/src/mesa/drivers/dri/r300/r300_tex.c
+++ b/src/mesa/drivers/dri/r300/r300_tex.c
@@ -160,21 +160,18 @@ static void r300SetTexWrap(r300TexObjPtr t, GLenum swrap, GLenum twrap,
t->filter |= hw_qwrap << R300_TX_WRAP_Q_SHIFT;
}
-static void r300SetTexMaxAnisotropy(r300TexObjPtr t, GLfloat max)
+static GLuint aniso_filter(GLfloat anisotropy)
{
-
- t->filter &= ~R300_TX_MAX_ANISO_MASK;
-
- if (max <= 1.0) {
- t->filter |= R300_TX_MAX_ANISO_1_TO_1;
- } else if (max <= 2.0) {
- t->filter |= R300_TX_MAX_ANISO_2_TO_1;
- } else if (max <= 4.0) {
- t->filter |= R300_TX_MAX_ANISO_4_TO_1;
- } else if (max <= 8.0) {
- t->filter |= R300_TX_MAX_ANISO_8_TO_1;
+ if (anisotropy >= 16.0) {
+ return R300_TX_MAX_ANISO_16_TO_1;
+ } else if (anisotropy >= 8.0) {
+ return R300_TX_MAX_ANISO_8_TO_1;
+ } else if (anisotropy >= 4.0) {
+ return R300_TX_MAX_ANISO_4_TO_1;
+ } else if (anisotropy >= 2.0) {
+ return R300_TX_MAX_ANISO_2_TO_1;
} else {
- t->filter |= R300_TX_MAX_ANISO_16_TO_1;
+ return R300_TX_MAX_ANISO_1_TO_1;
}
}
@@ -184,54 +181,47 @@ static void r300SetTexMaxAnisotropy(r300TexObjPtr t, GLfloat max)
* \param t Texture whose filter modes are to be set
* \param minf Texture minification mode
* \param magf Texture magnification mode
+ * \param anisotropy Maximum anisotropy level
*/
-
-static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf)
+static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf, GLfloat anisotropy)
{
- GLuint anisotropy = (t->filter & R300_TX_MAX_ANISO_MASK);
+ t->filter &= ~(R300_TX_MIN_FILTER_MASK | R300_TX_MIN_FILTER_MIP_MASK | R300_TX_MAG_FILTER_MASK | R300_TX_MAX_ANISO_MASK);
+ t->filter_1 &= ~R300_EDGE_ANISO_EDGE_ONLY;
- t->filter &= ~(R300_TX_MIN_FILTER_MASK | R300_TX_MAG_FILTER_MASK);
+ /* Note that EXT_texture_filter_anisotropic is extremely vague about
+ * how anisotropic filtering interacts with the "normal" filter modes.
+ * When anisotropic filtering is enabled, we override min and mag
+ * filter settings completely. This includes driconf's settings.
+ */
+ if (anisotropy >= 2.0 && (minf != GL_NEAREST) && (magf != GL_NEAREST)) {
+ t->filter |= R300_TX_MAG_FILTER_ANISO
+ | R300_TX_MIN_FILTER_ANISO
+ | R300_TX_MIN_FILTER_MIP_LINEAR
+ | aniso_filter(anisotropy);
+ if (RADEON_DEBUG & DEBUG_TEXTURE)
+ fprintf(stderr, "Using maximum anisotropy of %f\n", anisotropy);
+ return;
+ }
- if (anisotropy == R300_TX_MAX_ANISO_1_TO_1) {
- switch (minf) {
- case GL_NEAREST:
- t->filter |= R300_TX_MIN_FILTER_NEAREST;
- break;
- case GL_LINEAR:
- t->filter |= R300_TX_MIN_FILTER_LINEAR;
- break;
- case GL_NEAREST_MIPMAP_NEAREST:
- t->filter |= R300_TX_MIN_FILTER_NEAREST_MIP_NEAREST;
- break;
- case GL_NEAREST_MIPMAP_LINEAR:
- t->filter |= R300_TX_MIN_FILTER_NEAREST_MIP_LINEAR;
- break;
- case GL_LINEAR_MIPMAP_NEAREST:
- t->filter |= R300_TX_MIN_FILTER_LINEAR_MIP_NEAREST;
- break;
- case GL_LINEAR_MIPMAP_LINEAR:
- t->filter |= R300_TX_MIN_FILTER_LINEAR_MIP_LINEAR;
- break;
- }
- } else {
- switch (minf) {
- case GL_NEAREST:
- t->filter |= R300_TX_MIN_FILTER_ANISO_NEAREST;
- break;
- case GL_LINEAR:
- t->filter |= R300_TX_MIN_FILTER_ANISO_LINEAR;
- break;
- case GL_NEAREST_MIPMAP_NEAREST:
- case GL_LINEAR_MIPMAP_NEAREST:
- t->filter |=
- R300_TX_MIN_FILTER_ANISO_NEAREST_MIP_NEAREST;
- break;
- case GL_NEAREST_MIPMAP_LINEAR:
- case GL_LINEAR_MIPMAP_LINEAR:
- t->filter |=
- R300_TX_MIN_FILTER_ANISO_NEAREST_MIP_LINEAR;
- break;
- }
+ switch (minf) {
+ case GL_NEAREST:
+ t->filter |= R300_TX_MIN_FILTER_NEAREST;
+ break;
+ case GL_LINEAR:
+ t->filter |= R300_TX_MIN_FILTER_LINEAR;
+ break;
+ case GL_NEAREST_MIPMAP_NEAREST:
+ t->filter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_NEAREST;
+ break;
+ case GL_NEAREST_MIPMAP_LINEAR:
+ t->filter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_LINEAR;
+ break;
+ case GL_LINEAR_MIPMAP_NEAREST:
+ t->filter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_NEAREST;
+ break;
+ case GL_LINEAR_MIPMAP_LINEAR:
+ t->filter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_LINEAR;
+ break;
}
/* Note we don't have 3D mipmaps so only use the mag filter setting
@@ -252,6 +242,20 @@ static void r300SetTexBorderColor(r300TexObjPtr t, GLubyte c[4])
t->pp_border_color = PACK_COLOR_8888(c[3], c[0], c[1], c[2]);
}
+static void r300SetTexLodBias(r300TexObjPtr t, GLfloat bias)
+{
+ GLuint b;
+ b = (unsigned int)fabsf(ceilf(bias*31));
+ if (signbit(bias)) {
+ b ^= 0x3ff; /* 10 bits */
+ }
+ b <<= 3;
+ b &= R300_LOD_BIAS_MASK;
+
+ t->filter_1 &= ~R300_LOD_BIAS_MASK;
+ t->filter_1 |= b;
+}
+
/**
* Allocate space for and load the mesa images into the texture memory block.
* This will happen before drawing with a new texture, or drawing with a
@@ -278,8 +282,7 @@ static r300TexObjPtr r300AllocTexObj(struct gl_texture_object *texObj)
make_empty_list(&t->base);
r300SetTexWrap(t, texObj->WrapS, texObj->WrapT, texObj->WrapR);
- r300SetTexMaxAnisotropy(t, texObj->MaxAnisotropy);
- r300SetTexFilter(t, texObj->MinFilter, texObj->MagFilter);
+ r300SetTexFilter(t, texObj->MinFilter, texObj->MagFilter, texObj->MaxAnisotropy);
r300SetTexBorderColor(t, texObj->_BorderChan);
}
@@ -976,9 +979,38 @@ r300TexSubImage3D(GLcontext * ctx, GLenum target, GLint level,
t->dirty_images[0] |= (1 << level);
}
+/* This feels like a prime target for code reuse, so I'm putting it here
+ * instead of inlining it in TexEnv. */
+static GLenum r300TexUnitTarget(struct gl_texture_unit *unit) {
+ if (unit->_ReallyEnabled & (TEXTURE_RECT_BIT)) {
+ return GL_TEXTURE_RECTANGLE_NV;
+ } else if (unit->_ReallyEnabled & (TEXTURE_1D_BIT)) {
+ return GL_TEXTURE_1D;
+ } else if (unit->_ReallyEnabled & (TEXTURE_2D_BIT)) {
+ return GL_TEXTURE_2D;
+ } else if (unit->_ReallyEnabled & (TEXTURE_3D_BIT)) {
+ return GL_TEXTURE_3D;
+ } else if (unit->_ReallyEnabled & (TEXTURE_CUBE_BIT)) {
+ return GL_TEXTURE_CUBE_MAP;
+ }
+ if (unit->Enabled & (TEXTURE_RECT_BIT)) {
+ return GL_TEXTURE_RECTANGLE_NV;
+ } else if (unit->Enabled & (TEXTURE_1D_BIT)) {
+ return GL_TEXTURE_1D;
+ } else if (unit->Enabled & (TEXTURE_2D_BIT)) {
+ return GL_TEXTURE_2D;
+ } else if (unit->Enabled & (TEXTURE_3D_BIT)) {
+ return GL_TEXTURE_3D;
+ } else if (unit->Enabled & (TEXTURE_CUBE_BIT)) {
+ return GL_TEXTURE_CUBE_MAP;
+ }
+ return 0;
+}
+
static void r300TexEnv(GLcontext * ctx, GLenum target,
GLenum pname, const GLfloat * param)
{
+ r300ContextPtr rmesa = R300_CONTEXT(ctx);
if (RADEON_DEBUG & DEBUG_STATE) {
fprintf(stderr, "%s( %s )\n",
__FUNCTION__, _mesa_lookup_enum_by_nr(pname));
@@ -989,41 +1021,24 @@ static void r300TexEnv(GLcontext * ctx, GLenum target,
* between them according to _ReallyEnabled.
*/
switch (pname) {
- case GL_TEXTURE_LOD_BIAS_EXT:{
-#if 0 /* Needs to be relocated in order to make sure we got the right tmu */
- GLfloat bias, min;
- GLuint b;
-
- /* The R300's LOD bias is a signed 2's complement value with a
- * range of -16.0 <= bias < 16.0.
- *
- * NOTE: Add a small bias to the bias for conform mipsel.c test.
- */
- bias = *param + .01;
- min =
- driQueryOptionb(&rmesa->radeon.optionCache,
- "no_neg_lod_bias") ? 0.0 : -16.0;
- bias = CLAMP(bias, min, 16.0);
-
- /* 0.0 - 16.0 == 0x0 - 0x1000 */
- /* 0.0 - -16.0 == 0x1001 - 0x1fff */
- b = 0x1000 / 16.0 * bias;
- b &= R300_LOD_BIAS_MASK;
-
- if (b !=
- (rmesa->hw.tex.unknown1.
- cmd[R300_TEX_VALUE_0 +
- unit] & R300_LOD_BIAS_MASK)) {
- R300_STATECHANGE(rmesa, tex.unknown1);
- rmesa->hw.tex.unknown1.cmd[R300_TEX_VALUE_0 +
- unit] &=
- ~R300_LOD_BIAS_MASK;
- rmesa->hw.tex.unknown1.cmd[R300_TEX_VALUE_0 +
- unit] |= b;
- }
-#endif
- break;
- }
+ case GL_TEXTURE_LOD_BIAS_EXT: {
+ /* Needs to be relocated in order to make sure we got the right tmu */
+ GLfloat bias, min;
+
+ /* The R300's LOD bias is a signed 2's complement value with a
+ * range of -16.0 <= bias < 16.0.
+ *
+ * NOTE: Add a small bias to the bias for conform mipsel.c test.
+ */
+ bias = *param + .01;
+ min = driQueryOptionb(&rmesa->radeon.optionCache,
+ "no_neg_lod_bias") ? 0.0 : -16.0;
+ bias = CLAMP(bias, min, 16.0);
+
+ rmesa->LODBias = bias;
+
+ break;
+ }
default:
return;
@@ -1050,8 +1065,7 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
case GL_TEXTURE_MIN_FILTER:
case GL_TEXTURE_MAG_FILTER:
case GL_TEXTURE_MAX_ANISOTROPY_EXT:
- r300SetTexMaxAnisotropy(t, texObj->MaxAnisotropy);
- r300SetTexFilter(t, texObj->MinFilter, texObj->MagFilter);
+ r300SetTexFilter(t, texObj->MinFilter, texObj->MagFilter, texObj->MaxAnisotropy);
break;
case GL_TEXTURE_WRAP_S:
@@ -1077,7 +1091,7 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
break;
case GL_DEPTH_TEXTURE_MODE:
- if (texObj->Image[0][texObj->BaseLevel]->TexFormat->BaseFormat
+ if (texObj->Image[0][texObj->BaseLevel]->TexFormat->BaseFormat
== GL_DEPTH_COMPONENT) {
r300SetDepthTexMode(texObj);
break;
@@ -1092,10 +1106,6 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
default:
return;
}
-
- /* Mark this texobj as dirty (one bit per tex unit)
- */
- t->dirty_state = TEX_ALL;
}
static void r300BindTexture(GLcontext * ctx, GLenum target,
@@ -1157,6 +1167,10 @@ static struct gl_texture_object *r300NewTextureObject(GLcontext * ctx,
return NULL;
obj->MaxAnisotropy = rmesa->initialMaxAnisotropy;
+ /* Attempt to fill LOD bias, if previously set.
+ * Should start at 0.0, which won't affect the HW. */
+ obj->LodBias = rmesa->LODBias;
+
r300AllocTexObj(obj);
return obj;
}
diff --git a/src/mesa/drivers/dri/r300/r300_texmem.c b/src/mesa/drivers/dri/r300/r300_texmem.c
index 723601ac4a6..69847a4022d 100644
--- a/src/mesa/drivers/dri/r300/r300_texmem.c
+++ b/src/mesa/drivers/dri/r300/r300_texmem.c
@@ -349,7 +349,7 @@ static void r300UploadSubImage(r300ContextPtr rmesa, r300TexObjPtr t,
imageWidth = texImage->Width;
imageHeight = texImage->Height;
- offset = t->bufAddr + t->base.totalSize / 6 * face;
+ offset = t->bufAddr;
if (RADEON_DEBUG & (DEBUG_TEXTURE | DEBUG_IOCTL)) {
GLint imageX = 0;
@@ -534,10 +534,6 @@ int r300UploadTexImages(r300ContextPtr rmesa, r300TexObjPtr t, GLuint face)
/* hope it's safe to add that here... */
t->offset |= t->tile_bits;
}
-
- /* Mark this texobj as dirty on all units:
- */
- t->dirty_state = TEX_ALL;
}
/* Let the world know we've used this memory recently.
diff --git a/src/mesa/drivers/dri/r300/r300_texstate.c b/src/mesa/drivers/dri/r300/r300_texstate.c
index 78fa75228e7..bdd20b18e44 100644
--- a/src/mesa/drivers/dri/r300/r300_texstate.c
+++ b/src/mesa/drivers/dri/r300/r300_texstate.c
@@ -127,18 +127,18 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
{
static const GLuint formats[3][3] = {
{
- R300_EASY_TX_FORMAT(X, X, X, X, X16),
R300_EASY_TX_FORMAT(X, X, X, ONE, X16),
+ R300_EASY_TX_FORMAT(X, X, X, X, X16),
R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, X16),
},
{
- R300_EASY_TX_FORMAT(X, X, X, X, X24_Y8),
R300_EASY_TX_FORMAT(X, X, X, ONE, X24_Y8),
+ R300_EASY_TX_FORMAT(X, X, X, X, X24_Y8),
R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, X24_Y8),
},
{
- R300_EASY_TX_FORMAT(X, X, X, X, X32),
R300_EASY_TX_FORMAT(X, X, X, ONE, X32),
+ R300_EASY_TX_FORMAT(X, X, X, X, X32),
R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, X32),
},
};
@@ -190,6 +190,112 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
/**
+ * Compute sizes and fill in offset and blit information for the given
+ * image (determined by \p face and \p level).
+ *
+ * \param curOffset points to the offset at which the image is to be stored
+ * and is updated by this function according to the size of the image.
+ */
+static void compute_tex_image_offset(
+ struct gl_texture_object *tObj,
+ GLuint face,
+ GLint level,
+ GLint* curOffset)
+{
+ r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+ const struct gl_texture_image* texImage;
+ GLuint blitWidth = R300_BLIT_WIDTH_BYTES;
+ GLuint texelBytes;
+ GLuint size;
+
+ texImage = tObj->Image[0][level + t->base.firstLevel];
+ if (!texImage)
+ return;
+
+ texelBytes = texImage->TexFormat->TexelBytes;
+
+ /* find image size in bytes */
+ if (texImage->IsCompressed) {
+ if ((t->format & R300_TX_FORMAT_DXT1) ==
+ R300_TX_FORMAT_DXT1) {
+ // fprintf(stderr,"DXT 1 %d %08X\n", texImage->Width, t->format);
+ if ((texImage->Width + 3) < 8) /* width one block */
+ size = texImage->CompressedSize * 4;
+ else if ((texImage->Width + 3) < 16)
+ size = texImage->CompressedSize * 2;
+ else
+ size = texImage->CompressedSize;
+ } else {
+ /* DXT3/5, 16 bytes per block */
+ WARN_ONCE
+ ("DXT 3/5 suffers from multitexturing problems!\n");
+ // fprintf(stderr,"DXT 3/5 %d\n", texImage->Width);
+ if ((texImage->Width + 3) < 8)
+ size = texImage->CompressedSize * 2;
+ else
+ size = texImage->CompressedSize;
+ }
+ } else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
+ size =
+ ((texImage->Width * texelBytes +
+ 63) & ~63) * texImage->Height;
+ blitWidth = 64 / texelBytes;
+ } else if (t->tile_bits & R300_TXO_MICRO_TILE) {
+ /* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
+ though the actual offset may be different (if texture is less than
+ 32 bytes width) to the untiled case */
+ int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
+ size =
+ (w * ((texImage->Height + 1) / 2)) *
+ texImage->Depth;
+ blitWidth = MAX2(texImage->Width, 64 / texelBytes);
+ } else {
+ int w = (texImage->Width * texelBytes + 31) & ~31;
+ size = w * texImage->Height * texImage->Depth;
+ blitWidth = MAX2(texImage->Width, 64 / texelBytes);
+ }
+ assert(size > 0);
+
+ if (RADEON_DEBUG & DEBUG_TEXTURE)
+ fprintf(stderr, "w=%d h=%d d=%d tb=%d intFormat=%d\n",
+ texImage->Width, texImage->Height,
+ texImage->Depth,
+ texImage->TexFormat->TexelBytes,
+ texImage->InternalFormat);
+
+ /* All images are aligned to a 32-byte offset */
+ *curOffset = (*curOffset + 0x1f) & ~0x1f;
+
+ if (texelBytes) {
+ /* fix x and y coords up later together with offset */
+ t->image[face][level].x = *curOffset;
+ t->image[face][level].y = 0;
+ t->image[face][level].width =
+ MIN2(size / texelBytes, blitWidth);
+ t->image[face][level].height =
+ (size / texelBytes) / t->image[face][level].width;
+ } else {
+ t->image[face][level].x = *curOffset % R300_BLIT_WIDTH_BYTES;
+ t->image[face][level].y = *curOffset / R300_BLIT_WIDTH_BYTES;
+ t->image[face][level].width =
+ MIN2(size, R300_BLIT_WIDTH_BYTES);
+ t->image[face][level].height = size / t->image[face][level].width;
+ }
+
+ if (RADEON_DEBUG & DEBUG_TEXTURE)
+ fprintf(stderr,
+ "level %d, face %d: %dx%d x=%d y=%d w=%d h=%d size=%d at %d\n",
+ level, face, texImage->Width, texImage->Height,
+ t->image[face][level].x, t->image[face][level].y,
+ t->image[face][level].width, t->image[face][level].height,
+ size, *curOffset);
+
+ *curOffset += size;
+}
+
+
+
+/**
* This function computes the number of bytes of storage needed for
* the given texture object (all mipmap levels, all cube faces).
* The \c image[face][level].x/y/width/height parameters for upload/blitting
@@ -206,7 +312,7 @@ static void r300SetTexImages(r300ContextPtr rmesa,
r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
const struct gl_texture_image *baseImage =
tObj->Image[0][tObj->BaseLevel];
- GLint curOffset, blitWidth;
+ GLint curOffset;
GLint i, texelBytes;
GLint numLevels;
GLint log2Width, log2Height, log2Depth;
@@ -245,8 +351,6 @@ static void r300SetTexImages(r300ContextPtr rmesa,
* The idea is that we lay out the mipmap levels within a block of
* memory organized as a rectangle of width BLIT_WIDTH_BYTES.
*/
- curOffset = 0;
- blitWidth = R300_BLIT_WIDTH_BYTES;
t->tile_bits = 0;
/* figure out if this texture is suitable for tiling. */
@@ -276,94 +380,20 @@ static void r300SetTexImages(r300ContextPtr rmesa,
}
#endif
- for (i = 0; i < numLevels; i++) {
- const struct gl_texture_image *texImage;
- GLuint size;
-
- texImage = tObj->Image[0][i + t->base.firstLevel];
- if (!texImage)
- break;
-
- /* find image size in bytes */
- if (texImage->IsCompressed) {
- if ((t->format & R300_TX_FORMAT_DXT1) ==
- R300_TX_FORMAT_DXT1) {
- // fprintf(stderr,"DXT 1 %d %08X\n", texImage->Width, t->format);
- if ((texImage->Width + 3) < 8) /* width one block */
- size = texImage->CompressedSize * 4;
- else if ((texImage->Width + 3) < 16)
- size = texImage->CompressedSize * 2;
- else
- size = texImage->CompressedSize;
- } else {
- /* DXT3/5, 16 bytes per block */
- WARN_ONCE
- ("DXT 3/5 suffers from multitexturing problems!\n");
- // fprintf(stderr,"DXT 3/5 %d\n", texImage->Width);
- if ((texImage->Width + 3) < 8)
- size = texImage->CompressedSize * 2;
- else
- size = texImage->CompressedSize;
- }
- } else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
- size =
- ((texImage->Width * texelBytes +
- 63) & ~63) * texImage->Height;
- blitWidth = 64 / texelBytes;
- } else if (t->tile_bits & R300_TXO_MICRO_TILE) {
- /* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
- though the actual offset may be different (if texture is less than
- 32 bytes width) to the untiled case */
- int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
- size =
- (w * ((texImage->Height + 1) / 2)) *
- texImage->Depth;
- blitWidth = MAX2(texImage->Width, 64 / texelBytes);
- } else {
- int w = (texImage->Width * texelBytes + 31) & ~31;
- size = w * texImage->Height * texImage->Depth;
- blitWidth = MAX2(texImage->Width, 64 / texelBytes);
- }
- assert(size > 0);
-
- if (RADEON_DEBUG & DEBUG_TEXTURE)
- fprintf(stderr, "w=%d h=%d d=%d tb=%d intFormat=%d\n",
- texImage->Width, texImage->Height,
- texImage->Depth,
- texImage->TexFormat->TexelBytes,
- texImage->InternalFormat);
-
- /* Align to 32-byte offset. It is faster to do this unconditionally
- * (no branch penalty).
- */
+ curOffset = 0;
- curOffset = (curOffset + 0x1f) & ~0x1f;
+ if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
+ ASSERT(log2Width == log2Height);
+ t->format |= R300_TX_FORMAT_CUBIC_MAP;
- if (texelBytes) {
- /* fix x and y coords up later together with offset */
- t->image[0][i].x = curOffset;
- t->image[0][i].y = 0;
- t->image[0][i].width =
- MIN2(size / texelBytes, blitWidth);
- t->image[0][i].height =
- (size / texelBytes) / t->image[0][i].width;
- } else {
- t->image[0][i].x = curOffset % R300_BLIT_WIDTH_BYTES;
- t->image[0][i].y = curOffset / R300_BLIT_WIDTH_BYTES;
- t->image[0][i].width =
- MIN2(size, R300_BLIT_WIDTH_BYTES);
- t->image[0][i].height = size / t->image[0][i].width;
+ for(i = 0; i < numLevels; i++) {
+ GLuint face;
+ for(face = 0; face < 6; face++)
+ compute_tex_image_offset(tObj, face, i, &curOffset);
}
-
- if (RADEON_DEBUG & DEBUG_TEXTURE)
- fprintf(stderr,
- "level %d: %dx%d x=%d y=%d w=%d h=%d size=%d at %d\n",
- i, texImage->Width, texImage->Height,
- t->image[0][i].x, t->image[0][i].y,
- t->image[0][i].width, t->image[0][i].height,
- size, curOffset);
-
- curOffset += size;
+ } else {
+ for (i = 0; i < numLevels; i++)
+ compute_tex_image_offset(tObj, 0, i, &curOffset);
}
/* Align the total size of texture memory block.
@@ -371,26 +401,6 @@ static void r300SetTexImages(r300ContextPtr rmesa,
t->base.totalSize =
(curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
- /* Setup remaining cube face blits, if needed */
- if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
- GLuint face;
- for (face = 1; face < 6; face++) {
- for (i = 0; i < numLevels; i++) {
- t->image[face][i].x = t->image[0][i].x;
- t->image[face][i].y = t->image[0][i].y;
- t->image[face][i].width = t->image[0][i].width;
- t->image[face][i].height =
- t->image[0][i].height;
- }
- }
- t->base.totalSize *= 6; /* total texmem needed */
- }
-
- if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
- ASSERT(log2Width == log2Height);
- t->format |= R300_TX_FORMAT_CUBIC_MAP;
- }
-
t->size =
(((tObj->Image[0][t->base.firstLevel]->Width -
1) << R300_TX_WIDTHMASK_SHIFT)
@@ -408,7 +418,7 @@ static void r300SetTexImages(r300ContextPtr rmesa,
t->pitch |=
(tObj->Image[0][t->base.firstLevel]->Width + 63) & ~(63);
} else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
- unsigned int align = blitWidth - 1;
+ unsigned int align = (64 / texelBytes) - 1;
t->pitch |= ((tObj->Image[0][t->base.firstLevel]->Width *
texelBytes) + 63) & ~(63);
t->size |= R300_TX_SIZE_TXPITCH_EN;
@@ -428,10 +438,6 @@ static void r300SetTexImages(r300ContextPtr rmesa,
if (tObj->Image[0][t->base.firstLevel]->Height > 2048)
t->pitch_reg |= R500_TXHEIGHT_BIT11;
}
-
- t->dirty_state = TEX_ALL;
-
- /* FYI: r300UploadTexImages( rmesa, t ) used to be called here */
}
/* ================================================================
@@ -568,7 +574,6 @@ static GLboolean r300UpdateTexture(GLcontext * ctx, int unit)
rmesa->state.texture.unit[unit].texobj = t;
t->base.bound |= (1 << unit);
- t->dirty_state |= 1 << unit;
driUpdateTextureLRU((driTextureObject *) t); /* XXX: should be locked! */
}
diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.c b/src/mesa/drivers/dri/r300/r500_fragprog.c
index b967aa2d737..5d72ec2784f 100644
--- a/src/mesa/drivers/dri/r300/r500_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r500_fragprog.c
@@ -1,8 +1,5 @@
/*
- * Copyright (C) 2005 Ben Skeggs.
- *
* Copyright 2008 Corbin Simpson <[email protected]>
- * Adaptation and modification for ATI/AMD Radeon R500 GPU chipsets.
*
* All Rights Reserved.
*
@@ -28,1439 +25,326 @@
*
*/
+#include "r500_fragprog.h"
+
+static void reset_srcreg(struct prog_src_register* reg)
+{
+ _mesa_bzero(reg, sizeof(*reg));
+ reg->Swizzle = SWIZZLE_NOOP;
+}
+
/**
- * \file
- *
- * \author Ben Skeggs <[email protected]>
- *
- * \author Jerome Glisse <[email protected]>
- *
- * \author Corbin Simpson <[email protected]>
+ * Transform TEX, TXP, TXB, and KIL instructions in the following way:
+ * - premultiply texture coordinates for RECT
+ * - extract operand swizzles
+ * - introduce a temporary register when write masks are needed
*
- * \todo Depth write, WPOS/FOGC inputs
- *
- * \todo FogOption
- *
- * \todo Verify results of opcodes for accuracy, I've only checked them in
- * specific cases.
*/
+static GLboolean transform_TEX(
+ struct radeon_program_transform_context* context,
+ struct prog_instruction* orig_inst, void* data)
+{
+ struct r500_fragment_program_compiler *compiler =
+ (struct r500_fragment_program_compiler*)data;
+ struct prog_instruction inst = *orig_inst;
+ struct prog_instruction* tgt;
+ GLboolean destredirect = GL_FALSE;
+
+ if (inst.Opcode != OPCODE_TEX &&
+ inst.Opcode != OPCODE_TXB &&
+ inst.Opcode != OPCODE_TXP &&
+ inst.Opcode != OPCODE_KIL)
+ return GL_FALSE;
-#include "glheader.h"
-#include "macros.h"
-#include "enums.h"
-#include "shader/prog_instruction.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-
-#include "r300_context.h"
-#include "r500_fragprog.h"
-#include "r300_reg.h"
-#include "r300_state.h"
+ /* ARB_shadow & EXT_shadow_funcs */
+ if (inst.Opcode != OPCODE_KIL &&
+ compiler->fp->mesa_program.Base.ShadowSamplers & (1 << inst.TexSrcUnit)) {
+ GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func;
+
+ if (comparefunc == GL_NEVER || comparefunc == GL_ALWAYS) {
+ tgt = radeonClauseInsertInstructions(context->compiler, context->dest,
+ context->dest->NumInstructions, 1);
+
+ tgt->Opcode = OPCODE_MOV;
+ tgt->DstReg.File = inst.DstReg.File;
+ tgt->DstReg.Index = inst.DstReg.Index;
+ tgt->DstReg.WriteMask = inst.DstReg.WriteMask;
+ tgt->SrcReg[0].File = PROGRAM_BUILTIN;
+ tgt->SrcReg[0].Swizzle = comparefunc == GL_ALWAYS ? SWIZZLE_1111 : SWIZZLE_0000;
+ return GL_TRUE;
+ }
-/*
- * Useful macros and values
- */
-#define ERROR(fmt, args...) do { \
- fprintf(stderr, "%s::%s(): " fmt "\n", \
- __FILE__, __FUNCTION__, ##args); \
- fp->error = GL_TRUE; \
- } while(0)
-
-#define COMPILE_STATE struct r300_pfs_compile_state *cs = fp->cs
-
-#define R500_US_NUM_TEMP_REGS 128
-#define R500_US_NUM_CONST_REGS 256
-
-/* "Register" flags */
-#define REG_CONSTANT (1 << 8)
-#define REG_SRC_REL (1 << 9)
-#define REG_DEST_REL (1 << 7)
-
-/* Swizzle tools */
-#define R500_SWIZZLE_ZERO 4
-#define R500_SWIZZLE_HALF 5
-#define R500_SWIZZLE_ONE 6
-#define R500_SWIZ_RGB_ZERO ((4 << 0) | (4 << 3) | (4 << 6))
-#define R500_SWIZ_RGB_ONE ((6 << 0) | (6 << 3) | (6 << 6))
-#define R500_SWIZ_RGB_RGB ((0 << 0) | (1 << 3) | (2 << 6))
-#define R500_SWIZ_MOD_NEG 1
-#define R500_SWIZ_MOD_ABS 2
-#define R500_SWIZ_MOD_NEG_ABS 3
-/* Swizzles for inst2 */
-#define MAKE_SWIZ_TEX_STRQ(x) (x << 8)
-#define MAKE_SWIZ_TEX_RGBA(x) (x << 24)
-/* Swizzles for inst3 */
-#define MAKE_SWIZ_RGB_A(x) (x << 2)
-#define MAKE_SWIZ_RGB_B(x) (x << 15)
-/* Swizzles for inst4 */
-#define MAKE_SWIZ_ALPHA_A(x) (x << 14)
-#define MAKE_SWIZ_ALPHA_B(x) (x << 21)
-/* Swizzle for inst5 */
-#define MAKE_SWIZ_RGBA_C(x) (x << 14)
-#define MAKE_SWIZ_ALPHA_C(x) (x << 27)
-
-/* Writemasks */
-#define R500_WRITEMASK_G 0x2
-#define R500_WRITEMASK_B 0x4
-#define R500_WRITEMASK_RGB 0x7
-#define R500_WRITEMASK_A 0x8
-#define R500_WRITEMASK_AR 0x9
-#define R500_WRITEMASK_AG 0xA
-#define R500_WRITEMASK_ARG 0xB
-#define R500_WRITEMASK_AB 0xC
-#define R500_WRITEMASK_ARGB 0xF
-
-/* 1/(2pi), needed for quick modulus in trig insts
- * Thanks to glisse for pointing out how to do it! */
-static const GLfloat RCP_2PI[] = {0.15915494309189535,
- 0.15915494309189535,
- 0.15915494309189535,
- 0.15915494309189535};
-
-static const GLfloat LIT[] = {127.999999,
- 127.999999,
- 127.999999,
- -127.999999};
-
-static void dump_program(struct r500_fragment_program *fp);
-
-static inline GLuint make_rgb_swizzle(struct prog_src_register src) {
- GLuint swiz = 0x0;
- GLuint temp;
- /* This could be optimized, but it should be plenty fast already. */
- int i;
- for (i = 0; i < 3; i++) {
- temp = GET_SWZ(src.Swizzle, i);
- /* Fix SWIZZLE_ONE */
- if (temp == 5) temp++;
- swiz |= temp << i*3;
+ inst.DstReg.File = PROGRAM_TEMPORARY;
+ inst.DstReg.Index = radeonCompilerAllocateTemporary(context->compiler);
+ inst.DstReg.WriteMask = WRITEMASK_XYZW;
}
- if (src.NegateBase)
- swiz |= (R500_SWIZ_MOD_NEG << 9);
- return swiz;
-}
-static inline GLuint make_rgba_swizzle(GLuint src) {
- GLuint swiz = 0x0;
- GLuint temp;
- int i;
- for (i = 0; i < 4; i++) {
- temp = GET_SWZ(src, i);
- /* Fix SWIZZLE_ONE */
- if (temp == 5) temp++;
- swiz |= temp << i*3;
+ tgt = radeonClauseInsertInstructions(context->compiler, context->dest,
+ context->dest->NumInstructions, 1);
+ _mesa_copy_instructions(tgt, &inst, 1);
+
+ if (inst.Opcode != OPCODE_KIL &&
+ compiler->fp->mesa_program.Base.ShadowSamplers & (1 << inst.TexSrcUnit)) {
+ GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func;
+ GLuint depthmode = compiler->fp->state.unit[inst.TexSrcUnit].depth_texture_mode;
+
+ tgt = radeonClauseInsertInstructions(context->compiler, context->dest,
+ context->dest->NumInstructions, 2);
+
+ tgt[0].Opcode = OPCODE_MAD;
+ tgt[0].DstReg = inst.DstReg;
+ tgt[0].DstReg.WriteMask = orig_inst->DstReg.WriteMask;
+ tgt[0].SrcReg[0].File = PROGRAM_TEMPORARY;
+ tgt[0].SrcReg[0].Index = inst.DstReg.Index;
+ if (depthmode == 0) /* GL_LUMINANCE */
+ tgt[0].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z);
+ else if (depthmode == 2) /* GL_ALPHA */
+ tgt[0].SrcReg[0].Swizzle = SWIZZLE_WWWW;
+ tgt[0].SrcReg[1].File = PROGRAM_BUILTIN;
+ tgt[0].SrcReg[1].Swizzle = SWIZZLE_1111;
+ tgt[0].SrcReg[2] = inst.SrcReg[0];
+ tgt[0].SrcReg[2].Swizzle = SWIZZLE_ZZZZ;
+
+ /* Recall that SrcReg[0] is tex, SrcReg[2] is r and:
+ * r < tex <=> -tex+r < 0
+ * r >= tex <=> not (-tex+r < 0 */
+ if (comparefunc == GL_LESS || comparefunc == GL_GEQUAL)
+ tgt[0].SrcReg[0].NegateBase = tgt[0].SrcReg[0].NegateBase ^ NEGATE_XYZW;
+ else
+ tgt[0].SrcReg[2].NegateBase = tgt[0].SrcReg[2].NegateBase ^ NEGATE_XYZW;
+
+ tgt[1].Opcode = OPCODE_CMP;
+ tgt[1].DstReg = orig_inst->DstReg;
+ tgt[1].SrcReg[0].File = PROGRAM_TEMPORARY;
+ tgt[1].SrcReg[0].Index = tgt[0].DstReg.Index;
+ tgt[1].SrcReg[1].File = PROGRAM_BUILTIN;
+ tgt[1].SrcReg[2].File = PROGRAM_BUILTIN;
+
+ if (comparefunc == GL_LESS || comparefunc == GL_GREATER) {
+ tgt[1].SrcReg[1].Swizzle = SWIZZLE_1111;
+ tgt[1].SrcReg[2].Swizzle = SWIZZLE_0000;
+ } else {
+ tgt[1].SrcReg[1].Swizzle = SWIZZLE_0000;
+ tgt[1].SrcReg[2].Swizzle = SWIZZLE_1111;
+ }
+ } else if (destredirect) {
+ tgt = radeonClauseInsertInstructions(context->compiler, context->dest,
+ context->dest->NumInstructions, 1);
+
+ tgt->Opcode = OPCODE_MAD;
+ tgt->DstReg = orig_inst->DstReg;
+ tgt->SrcReg[0].File = PROGRAM_TEMPORARY;
+ tgt->SrcReg[0].Index = inst.DstReg.Index;
+ tgt->SrcReg[1].File = PROGRAM_BUILTIN;
+ tgt->SrcReg[1].Swizzle = SWIZZLE_1111;
+ tgt->SrcReg[2].File = PROGRAM_BUILTIN;
+ tgt->SrcReg[2].Swizzle = SWIZZLE_0000;
}
- return swiz;
-}
-static inline GLuint make_alpha_swizzle(struct prog_src_register src) {
- GLuint swiz = GET_SWZ(src.Swizzle, 3);
-
- if (swiz == 5) swiz++;
-
- if (src.NegateBase)
- swiz |= (R500_SWIZ_MOD_NEG << 3);
-
- return swiz;
+ return GL_TRUE;
}
-static inline GLuint make_sop_swizzle(struct prog_src_register src) {
- GLuint swiz = GET_SWZ(src.Swizzle, 0);
- if (swiz == 5) swiz++;
- return swiz;
-}
+static void update_params(r300ContextPtr r300, struct r500_fragment_program *fp)
+{
+ struct gl_fragment_program *mp = &fp->mesa_program;
-static inline GLuint make_strq_swizzle(struct prog_src_register src) {
- GLuint swiz = 0x0, temp = 0x0;
- int i;
- for (i = 0; i < 4; i++) {
- temp = GET_SWZ(src.Swizzle, i) & 0x3;
- swiz |= temp << i*2;
- }
- return swiz;
+ /* Ask Mesa nicely to fill in ParameterValues for us */
+ if (mp->Base.Parameters)
+ _mesa_load_state_parameters(r300->radeon.glCtx, mp->Base.Parameters);
}
-static int get_temp(struct r500_fragment_program *fp, int slot) {
-
- COMPILE_STATE;
-
- int r = fp->temp_reg_offset + cs->temp_in_use + slot;
- if (r > R500_US_NUM_TEMP_REGS) {
- ERROR("Too many temporary registers requested, can't compile!\n");
- }
-
- return r;
-}
-
-/* Borrowed verbatim from r300_fragprog since it hasn't changed. */
-static GLuint emit_const4fv(struct r500_fragment_program *fp,
- const GLfloat * cp)
+/**
+ * Transform the program to support fragment.position.
+ *
+ * Introduce a small fragment at the start of the program that will be
+ * the only code that directly reads the FRAG_ATTRIB_WPOS input.
+ * All other code pieces that reference that input will be rewritten
+ * to read from a newly allocated temporary.
+ *
+ * \todo if/when r5xx supports the radeon_program architecture, this is a
+ * likely candidate for code sharing.
+ */
+static void insert_WPOS_trailer(struct r500_fragment_program_compiler *compiler)
{
- GLuint reg = 0x0;
- int index;
+ GLuint InputsRead = compiler->fp->mesa_program.Base.InputsRead;
- for (index = 0; index < fp->const_nr; ++index) {
- if (fp->constant[index] == cp)
- break;
- }
+ if (!(InputsRead & FRAG_BIT_WPOS))
+ return;
- if (index >= fp->const_nr) {
- if (index >= R500_US_NUM_CONST_REGS) {
- ERROR("Out of hw constants!\n");
- return reg;
+ static gl_state_index tokens[STATE_LENGTH] = {
+ STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
+ };
+ struct prog_instruction *fpi;
+ GLuint window_index;
+ int i = 0;
+ GLuint tempregi = radeonCompilerAllocateTemporary(&compiler->compiler);
+
+ fpi = radeonClauseInsertInstructions(&compiler->compiler, &compiler->compiler.Clauses[0], 0, 3);
+
+ /* perspective divide */
+ fpi[i].Opcode = OPCODE_RCP;
+
+ fpi[i].DstReg.File = PROGRAM_TEMPORARY;
+ fpi[i].DstReg.Index = tempregi;
+ fpi[i].DstReg.WriteMask = WRITEMASK_W;
+ fpi[i].DstReg.CondMask = COND_TR;
+
+ fpi[i].SrcReg[0].File = PROGRAM_INPUT;
+ fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
+ fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
+ i++;
+
+ fpi[i].Opcode = OPCODE_MUL;
+
+ fpi[i].DstReg.File = PROGRAM_TEMPORARY;
+ fpi[i].DstReg.Index = tempregi;
+ fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
+ fpi[i].DstReg.CondMask = COND_TR;
+
+ fpi[i].SrcReg[0].File = PROGRAM_INPUT;
+ fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
+ fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
+
+ fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
+ fpi[i].SrcReg[1].Index = tempregi;
+ fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
+ i++;
+
+ /* viewport transformation */
+ window_index = _mesa_add_state_reference(compiler->fp->mesa_program.Base.Parameters, tokens);
+
+ fpi[i].Opcode = OPCODE_MAD;
+
+ fpi[i].DstReg.File = PROGRAM_TEMPORARY;
+ fpi[i].DstReg.Index = tempregi;
+ fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
+ fpi[i].DstReg.CondMask = COND_TR;
+
+ fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
+ fpi[i].SrcReg[0].Index = tempregi;
+ fpi[i].SrcReg[0].Swizzle =
+ MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+
+ fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
+ fpi[i].SrcReg[1].Index = window_index;
+ fpi[i].SrcReg[1].Swizzle =
+ MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+
+ fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
+ fpi[i].SrcReg[2].Index = window_index;
+ fpi[i].SrcReg[2].Swizzle =
+ MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+ i++;
+
+ for (; i < compiler->compiler.Clauses[0].NumInstructions; ++i) {
+ int reg;
+ for (reg = 0; reg < 3; reg++) {
+ if (fpi[i].SrcReg[reg].File == PROGRAM_INPUT &&
+ fpi[i].SrcReg[reg].Index == FRAG_ATTRIB_WPOS) {
+ fpi[i].SrcReg[reg].File = PROGRAM_TEMPORARY;
+ fpi[i].SrcReg[reg].Index = tempregi;
+ }
}
-
- fp->const_nr++;
- fp->constant[index] = cp;
- }
-
- reg = index | REG_CONSTANT;
- return reg;
-}
-
-static GLuint make_src(struct r500_fragment_program *fp, struct prog_src_register src) {
- COMPILE_STATE;
- GLuint reg;
- switch (src.File) {
- case PROGRAM_TEMPORARY:
- reg = src.Index + fp->temp_reg_offset;
- break;
- case PROGRAM_INPUT:
- reg = cs->inputs[src.Index].reg;
- break;
- case PROGRAM_LOCAL_PARAM:
- reg = emit_const4fv(fp,
- fp->mesa_program.Base.LocalParams[src.
- Index]);
- break;
- case PROGRAM_ENV_PARAM:
- reg = emit_const4fv(fp,
- fp->ctx->FragmentProgram.Parameters[src.
- Index]);
- break;
- case PROGRAM_STATE_VAR:
- case PROGRAM_NAMED_PARAM:
- case PROGRAM_CONSTANT:
- reg = emit_const4fv(fp, fp->mesa_program.Base.Parameters->
- ParameterValues[src.Index]);
- break;
- default:
- ERROR("Can't handle src.File %x\n", src.File);
- reg = 0x0;
- break;
}
- return reg;
}
-static GLuint make_dest(struct r500_fragment_program *fp, struct prog_dst_register dest) {
- GLuint reg;
- switch (dest.File) {
- case PROGRAM_TEMPORARY:
- reg = dest.Index + fp->temp_reg_offset;
- break;
- case PROGRAM_OUTPUT:
- /* Eventually we may need to handle multiple
- * rendering targets... */
- reg = dest.Index;
- break;
- default:
- ERROR("Can't handle dest.File %x\n", dest.File);
- reg = 0x0;
- break;
- }
- return reg;
-}
-static void emit_tex(struct r500_fragment_program *fp,
- struct prog_instruction *fpi, int dest, int counter)
+static GLuint build_dtm(GLuint depthmode)
{
- int hwsrc, hwdest;
- GLuint mask;
-
- mask = fpi->DstReg.WriteMask << 11;
- hwsrc = make_src(fp, fpi->SrcReg[0]);
-
- if (fpi->DstReg.File == PROGRAM_OUTPUT) {
- hwdest = get_temp(fp, 0);
- } else {
- hwdest = dest;
- }
-
- fp->inst[counter].inst0 = R500_INST_TYPE_TEX | mask
- | R500_INST_TEX_SEM_WAIT;
-
- fp->inst[counter].inst1 = R500_TEX_ID(fpi->TexSrcUnit)
- | R500_TEX_SEM_ACQUIRE | R500_TEX_IGNORE_UNCOVERED;
-
- if (fpi->TexSrcTarget == TEXTURE_RECT_INDEX)
- fp->inst[counter].inst1 |= R500_TEX_UNSCALED;
-
- switch (fpi->Opcode) {
- case OPCODE_KIL:
- fp->inst[counter].inst1 |= R500_TEX_INST_TEXKILL;
- break;
- case OPCODE_TEX:
- fp->inst[counter].inst1 |= R500_TEX_INST_LD;
- break;
- case OPCODE_TXB:
- fp->inst[counter].inst1 |= R500_TEX_INST_LODBIAS;
- break;
- case OPCODE_TXP:
- fp->inst[counter].inst1 |= R500_TEX_INST_PROJ;
- break;
+ switch(depthmode) {
default:
- ERROR("emit_tex can't handle opcode %x\n", fpi->Opcode);
+ case GL_LUMINANCE: return 0;
+ case GL_INTENSITY: return 1;
+ case GL_ALPHA: return 2;
}
-
- fp->inst[counter].inst2 = R500_TEX_SRC_ADDR(hwsrc)
- | MAKE_SWIZ_TEX_STRQ(make_strq_swizzle(fpi->SrcReg[0]))
- /* | R500_TEX_SRC_S_SWIZ_R | R500_TEX_SRC_T_SWIZ_G
- | R500_TEX_SRC_R_SWIZ_B | R500_TEX_SRC_Q_SWIZ_A */
- | R500_TEX_DST_ADDR(hwdest)
- | R500_TEX_DST_R_SWIZ_R | R500_TEX_DST_G_SWIZ_G
- | R500_TEX_DST_B_SWIZ_B | R500_TEX_DST_A_SWIZ_A;
-
- fp->inst[counter].inst3 = 0x0;
- fp->inst[counter].inst4 = 0x0;
- fp->inst[counter].inst5 = 0x0;
-
- if (fpi->DstReg.File == PROGRAM_OUTPUT) {
- counter++;
- fp->inst[counter].inst0 = R500_INST_TYPE_OUT
- | R500_INST_TEX_SEM_WAIT | (mask << 4);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(fp, 0));
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(fp, 0));
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB)
- | R500_ALU_RGB_SEL_B_SRC0
- | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB)
- | R500_ALU_RGB_OMOD_DISABLE;
- fp->inst[counter].inst4 = R500_ALPHA_OP_CMP
- | R500_ALPHA_ADDRD(dest)
- | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_ALPHA_SWIZ_A_A)
- | R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(R500_ALPHA_SWIZ_A_A)
- | R500_ALPHA_OMOD_DISABLE;
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
- | R500_ALU_RGBA_ADDRD(dest)
- | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
- | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
- }
-}
-
-static void emit_alu(struct r500_fragment_program *fp, int counter, struct prog_instruction *fpi) {
- /* Ideally, we shouldn't have to explicitly clear memory here! */
- fp->inst[counter].inst0 = 0x0;
- fp->inst[counter].inst1 = 0x0;
- fp->inst[counter].inst2 = 0x0;
- fp->inst[counter].inst3 = 0x0;
- fp->inst[counter].inst4 = 0x0;
- fp->inst[counter].inst5 = 0x0;
-
- if (fpi->DstReg.File == PROGRAM_OUTPUT) {
- fp->inst[counter].inst0 = R500_INST_TYPE_OUT;
-
- if (fpi->DstReg.Index == FRAG_RESULT_COLR)
- fp->inst[counter].inst0 |= (fpi->DstReg.WriteMask << 15);
-
- if (fpi->DstReg.Index == FRAG_RESULT_DEPR) {
- fp->inst[counter].inst4 |= R500_ALPHA_W_OMASK;
- /* Notify the state emission! */
- fp->writes_depth = GL_TRUE;
- }
- } else {
- fp->inst[counter].inst0 = R500_INST_TYPE_ALU
- /* pixel_mask */
- | (fpi->DstReg.WriteMask << 11);
- }
-
- fp->inst[counter].inst0 |= R500_INST_TEX_SEM_WAIT;
}
-static void emit_mov(struct r500_fragment_program *fp, int counter, struct prog_instruction *fpi, GLuint src_reg, GLuint swizzle, GLuint dest) {
- /* The r3xx shader uses MAD to implement MOV. We are using CMP, since
- * it is technically more accurate and recommended by ATI/AMD. */
- emit_alu(fp, counter, fpi);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(src_reg);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src_reg);
- /* (De)mangle the swizzle from Mesa to R500. */
- swizzle = make_rgba_swizzle(swizzle);
- /* 0x1FF is 9 bits, size of an RGB swizzle. */
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A((swizzle & 0x1ff))
- | R500_ALU_RGB_SEL_B_SRC0
- | MAKE_SWIZ_RGB_B((swizzle & 0x1ff))
- | R500_ALU_RGB_OMOD_DISABLE;
- fp->inst[counter].inst4 |= R500_ALPHA_OP_CMP
- | R500_ALPHA_ADDRD(dest)
- | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(GET_SWZ(swizzle, 3))
- | R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(GET_SWZ(swizzle, 3))
- | R500_ALPHA_OMOD_DISABLE;
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
- | R500_ALU_RGBA_ADDRD(dest)
- | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
- | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
-}
-
-static void emit_mad(struct r500_fragment_program *fp, int counter, struct prog_instruction *fpi, int one, int two, int three) {
- /* Note: This code was all Corbin's. Corbin is a rather hackish coder.
- * If you can make it pretty or fast, please do so! */
- emit_alu(fp, counter, fpi);
- /* Common MAD stuff */
- fp->inst[counter].inst4 |= R500_ALPHA_OP_MAD
- | R500_ALPHA_ADDRD(make_dest(fp, fpi->DstReg));
- fp->inst[counter].inst5 |= R500_ALU_RGBA_OP_MAD
- | R500_ALU_RGBA_ADDRD(make_dest(fp, fpi->DstReg));
- switch (one) {
- case 0:
- case 1:
- case 2:
- fp->inst[counter].inst1 |= R500_RGB_ADDR0(make_src(fp, fpi->SrcReg[one]));
- fp->inst[counter].inst2 |= R500_ALPHA_ADDR0(make_src(fp, fpi->SrcReg[one]));
- fp->inst[counter].inst3 |= R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[one]));
- fp->inst[counter].inst4 |= R500_ALPHA_SEL_A_SRC0
- | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[one]));
- break;
- case R500_SWIZZLE_ZERO:
- fp->inst[counter].inst3 |= MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ZERO);
- fp->inst[counter].inst4 |= MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ZERO);
- break;
- case R500_SWIZZLE_ONE:
- fp->inst[counter].inst3 |= MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE);
- fp->inst[counter].inst4 |= MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE);
- break;
- default:
- ERROR("Bad src index in emit_mad: %d\n", one);
- break;
- }
- switch (two) {
- case 0:
- case 1:
- case 2:
- fp->inst[counter].inst1 |= R500_RGB_ADDR1(make_src(fp, fpi->SrcReg[two]));
- fp->inst[counter].inst2 |= R500_ALPHA_ADDR1(make_src(fp, fpi->SrcReg[two]));
- fp->inst[counter].inst3 |= R500_ALU_RGB_SEL_B_SRC1
- | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[two]));
- fp->inst[counter].inst4 |= R500_ALPHA_SEL_B_SRC1
- | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[two]));
- break;
- case R500_SWIZZLE_ZERO:
- fp->inst[counter].inst3 |= MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ZERO);
- fp->inst[counter].inst4 |= MAKE_SWIZ_ALPHA_B(R500_SWIZZLE_ZERO);
- break;
- case R500_SWIZZLE_ONE:
- fp->inst[counter].inst3 |= MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ONE);
- fp->inst[counter].inst4 |= MAKE_SWIZ_ALPHA_B(R500_SWIZZLE_ONE);
- break;
- default:
- ERROR("Bad src index in emit_mad: %d\n", two);
- break;
- }
- switch (three) {
- case 0:
- case 1:
- case 2:
- fp->inst[counter].inst1 |= R500_RGB_ADDR2(make_src(fp, fpi->SrcReg[three]));
- fp->inst[counter].inst2 |= R500_ALPHA_ADDR2(make_src(fp, fpi->SrcReg[three]));
- fp->inst[counter].inst5 |= R500_ALU_RGBA_SEL_C_SRC2
- | MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[three]))
- | R500_ALU_RGBA_ALPHA_SEL_C_SRC2
- | MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[three]));
- break;
- case R500_SWIZZLE_ZERO:
- fp->inst[counter].inst5 |= MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
- | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
- break;
- case R500_SWIZZLE_ONE:
- fp->inst[counter].inst5 |= MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ONE)
- | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ONE);
- break;
- default:
- ERROR("Bad src index in emit_mad: %d\n", three);
- break;
- }
+static GLuint build_func(GLuint comparefunc)
+{
+ return comparefunc - GL_NEVER;
}
-static void emit_sop(struct r500_fragment_program *fp, int counter, struct prog_instruction *fpi, int opcode, GLuint src, GLuint swiz, GLuint dest) {
- emit_alu(fp, counter, fpi);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(src);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src);
- fp->inst[counter].inst4 |= R500_ALPHA_ADDRD(dest)
- | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(swiz);
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_SOP
- | R500_ALU_RGBA_ADDRD(dest);
- switch (opcode) {
- case OPCODE_COS:
- fp->inst[counter].inst4 |= R500_ALPHA_OP_COS;
- break;
- case OPCODE_EX2:
- fp->inst[counter].inst4 |= R500_ALPHA_OP_EX2;
- break;
- case OPCODE_LG2:
- fp->inst[counter].inst4 |= R500_ALPHA_OP_LN2;
- break;
- case OPCODE_RCP:
- fp->inst[counter].inst4 |= R500_ALPHA_OP_RCP;
- break;
- case OPCODE_RSQ:
- fp->inst[counter].inst4 |= R500_ALPHA_OP_RSQ;
- break;
- case OPCODE_SIN:
- fp->inst[counter].inst4 |= R500_ALPHA_OP_SIN;
- break;
- default:
- ERROR("Bad opcode in emit_sop: %d\n", opcode);
- break;
- }
-}
-static GLboolean parse_program(struct r500_fragment_program *fp)
+/**
+ * Collect all external state that is relevant for compiling the given
+ * fragment program.
+ */
+static void build_state(
+ r300ContextPtr r300,
+ struct r500_fragment_program *fp,
+ struct r500_fragment_program_external_state *state)
{
- struct gl_fragment_program *mp = &fp->mesa_program;
- const struct prog_instruction *inst = mp->Base.Instructions;
- struct prog_instruction *fpi;
- GLuint src[3], dest = 0;
- int temp_swiz, counter = 0;
-
- if (!inst || inst[0].Opcode == OPCODE_END) {
- ERROR("The program is empty!\n");
- return GL_FALSE;
- }
+ int unit;
- for (fpi = mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
-
- if (fpi->Opcode != OPCODE_KIL) {
- dest = make_dest(fp, fpi->DstReg);
- }
+ _mesa_bzero(state, sizeof(*state));
- switch (fpi->Opcode) {
- case OPCODE_ABS:
- emit_mov(fp, counter, fpi, make_src(fp, fpi->SrcReg[0]), fpi->SrcReg[0].Swizzle, dest);
- fp->inst[counter].inst3 |= R500_ALU_RGB_MOD_A_ABS
- | R500_ALU_RGB_MOD_B_ABS;
- fp->inst[counter].inst4 |= R500_ALPHA_MOD_A_ABS
- | R500_ALPHA_MOD_B_ABS;
- break;
- case OPCODE_ADD:
- /* Variation on MAD: 1*src0+src1 */
- emit_mad(fp, counter, fpi, R500_SWIZZLE_ONE, 0, 1);
- break;
- case OPCODE_CMP:
- /* This inst's selects need to be swapped as follows:
- * 0 -> C ; 1 -> B ; 2 -> A */
- src[0] = make_src(fp, fpi->SrcReg[0]);
- src[1] = make_src(fp, fpi->SrcReg[1]);
- src[2] = make_src(fp, fpi->SrcReg[2]);
- emit_alu(fp, counter, fpi);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(src[2])
- | R500_RGB_ADDR1(src[1]) | R500_RGB_ADDR2(src[0]);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[2])
- | R500_ALPHA_ADDR1(src[1]) | R500_ALPHA_ADDR2(src[0]);
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[2]))
- | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
- fp->inst[counter].inst4 |= R500_ALPHA_OP_CMP
- | R500_ALPHA_ADDRD(dest)
- | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[2]))
- | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
- | R500_ALU_RGBA_ADDRD(dest)
- | R500_ALU_RGBA_SEL_C_SRC2
- | MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[0]))
- | R500_ALU_RGBA_ALPHA_SEL_C_SRC2
- | MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[0]));
- break;
- case OPCODE_COS:
- src[0] = make_src(fp, fpi->SrcReg[0]);
- src[1] = emit_const4fv(fp, RCP_2PI);
- fp->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
- | (R500_WRITEMASK_ARGB << 11);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
- | R500_RGB_ADDR1(src[1]);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
- | R500_ALPHA_ADDR1(src[1]);
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB)
- | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB);
- fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
- | R500_ALPHA_ADDRD(get_temp(fp, 0))
- | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A
- | R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
- | R500_ALU_RGBA_ADDRD(get_temp(fp, 0))
- | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
- | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
- counter++;
- fp->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(fp, 0));
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(fp, 0));
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB);
- fp->inst[counter].inst4 = R500_ALPHA_OP_FRC
- | R500_ALPHA_ADDRD(get_temp(fp, 1))
- | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A;
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC
- | R500_ALU_RGBA_ADDRD(get_temp(fp, 1));
- counter++;
- emit_sop(fp, counter, fpi, OPCODE_COS, get_temp(fp, 1), make_sop_swizzle(fpi->SrcReg[0]), dest);
- break;
- case OPCODE_DP3:
- src[0] = make_src(fp, fpi->SrcReg[0]);
- src[1] = make_src(fp, fpi->SrcReg[1]);
- emit_alu(fp, counter, fpi);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
- | R500_RGB_ADDR1(src[1]);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
- | R500_ALPHA_ADDR1(src[1]);
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
- | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
- fp->inst[counter].inst4 |= R500_ALPHA_OP_DP
- | R500_ALPHA_ADDRD(dest)
- | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
- | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_DP3
- | R500_ALU_RGBA_ADDRD(dest);
- break;
- case OPCODE_DP4:
- src[0] = make_src(fp, fpi->SrcReg[0]);
- src[1] = make_src(fp, fpi->SrcReg[1]);
- /* Based on DP3 */
- emit_alu(fp, counter, fpi);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
- | R500_RGB_ADDR1(src[1]);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
- | R500_ALPHA_ADDR1(src[1]);
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
- | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
- fp->inst[counter].inst4 |= R500_ALPHA_OP_DP
- | R500_ALPHA_ADDRD(dest)
- | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
- | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_DP4
- | R500_ALU_RGBA_ADDRD(dest);
- break;
- case OPCODE_DPH:
- src[0] = make_src(fp, fpi->SrcReg[0]);
- src[1] = make_src(fp, fpi->SrcReg[1]);
- /* Based on DP3 */
- emit_alu(fp, counter, fpi);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
- | R500_RGB_ADDR1(src[1]);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
- | R500_ALPHA_ADDR1(src[1]);
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
- | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
- fp->inst[counter].inst4 |= R500_ALPHA_OP_DP
- | R500_ALPHA_ADDRD(dest)
- | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
- | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_DP4
- | R500_ALU_RGBA_ADDRD(dest);
- break;
- case OPCODE_DST:
- src[0] = make_src(fp, fpi->SrcReg[0]);
- src[1] = make_src(fp, fpi->SrcReg[1]);
- /* [1, src0.y*src1.y, src0.z, src1.w]
- * So basically MUL with lotsa swizzling. */
- emit_alu(fp, counter, fpi);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
- | R500_RGB_ADDR1(src[1]);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
- | R500_ALPHA_ADDR1(src[1]);
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | R500_ALU_RGB_SEL_B_SRC1;
- /* Select [1, y, z, 1] */
- temp_swiz = (make_rgb_swizzle(fpi->SrcReg[0]) & ~0x7) | R500_SWIZZLE_ONE;
- fp->inst[counter].inst3 |= MAKE_SWIZ_RGB_A(temp_swiz);
- /* Select [1, y, 1, w] */
- temp_swiz = (make_rgb_swizzle(fpi->SrcReg[0]) & ~0x1c7) | R500_SWIZZLE_ONE | (R500_SWIZZLE_ONE << 6);
- fp->inst[counter].inst3 |= MAKE_SWIZ_RGB_B(temp_swiz);
- fp->inst[counter].inst4 |= R500_ALPHA_OP_MAD
- | R500_ALPHA_ADDRD(dest)
- | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
- | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
- | R500_ALU_RGBA_ADDRD(dest)
- | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
- | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
- break;
- case OPCODE_EX2:
- src[0] = make_src(fp, fpi->SrcReg[0]);
- emit_sop(fp, counter, fpi, OPCODE_EX2, src[0], make_sop_swizzle(fpi->SrcReg[0]), dest);
- break;
- case OPCODE_FLR:
- src[0] = make_src(fp, fpi->SrcReg[0]);
- fp->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0]);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]);
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]));
- fp->inst[counter].inst4 |= R500_ALPHA_OP_FRC
- | R500_ALPHA_ADDRD(get_temp(fp, 0))
- | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]));
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC
- | R500_ALU_RGBA_ADDRD(get_temp(fp, 0));
- counter++;
- emit_alu(fp, counter, fpi);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
- | R500_RGB_ADDR1(get_temp(fp, 0));
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
- | R500_ALPHA_ADDR1(get_temp(fp, 0));
- fp->inst[counter].inst3 = MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE)
- | R500_ALU_RGB_SEL_B_SRC0 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[0]));
- fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
- | R500_ALPHA_ADDRD(dest)
- | R500_ALPHA_SWIZ_A_A
- | R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[0]));
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
- | R500_ALU_RGBA_ADDRD(dest)
- | R500_ALU_RGBA_SEL_C_SRC1
- | MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[0]))
- | R500_ALU_RGBA_ALPHA_SEL_C_SRC1
- | MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[0]))
- | R500_ALU_RGBA_MOD_C_NEG;
- break;
- case OPCODE_FRC:
- src[0] = make_src(fp, fpi->SrcReg[0]);
- emit_alu(fp, counter, fpi);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0]);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]);
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]));
- fp->inst[counter].inst4 |= R500_ALPHA_OP_FRC
- | R500_ALPHA_ADDRD(dest)
- | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]));
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC
- | R500_ALU_RGBA_ADDRD(dest);
- break;
- case OPCODE_LG2:
- src[0] = make_src(fp, fpi->SrcReg[0]);
- emit_sop(fp, counter, fpi, OPCODE_LG2, src[0], make_sop_swizzle(fpi->SrcReg[0]), dest);
- break;
- case OPCODE_LIT:
- /* To be honest, I have no idea how I came up with the following.
- * All I know is that it's based on the r3xx stuff, and was
- * concieved with the help of NyQuil. Mmm, MyQuil. */
-
- /* First instruction */
- src[0] = make_src(fp, fpi->SrcReg[0]);
- src[1] = emit_const4fv(fp, LIT);
- fp->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
- | (R500_WRITEMASK_ARG << 11);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) | R500_RGB_ADDR1(src[1]);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) | R500_ALPHA_ADDR1(src[1]);
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
- | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ZERO);
- fp->inst[counter].inst4 = R500_ALPHA_OP_MAX
- | R500_ALPHA_ADDRD(get_temp(fp, 0))
- | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
- | R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAX
- | R500_ALU_RGBA_ADDRD(get_temp(fp, 0));
- counter++;
- /* Second instruction */
- fp->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_AB << 11);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(fp, 0)) | R500_RGB_ADDR1(src[1]);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(fp, 0));
- /* Select [w, w, w, y] */
- temp_swiz = 3 | (3 << 3) | (3 << 6);
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(temp_swiz)
- | R500_ALU_RGB_SEL_B_SRC1
- | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB);
- fp->inst[counter].inst4 = R500_ALPHA_OP_LN2
- | R500_ALPHA_ADDRD(get_temp(fp, 0))
- | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_G;
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MIN
- | R500_ALU_RGBA_ADDRD(get_temp(fp, 0));
- counter++;
- /* Third instruction */
- fp->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_AG << 11);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(fp, 0));
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(fp, 0));
- /* Select [x, x, x, z] */
- temp_swiz = 0;
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(temp_swiz)
- | R500_ALU_RGB_SEL_B_SRC0
- | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ONE);
- fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
- | R500_ALPHA_ADDRD(get_temp(fp, 1))
- | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A
- | R500_ALPHA_SEL_B_SRC0 | R500_ALPHA_SWIZ_B_B;
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
- | R500_ALU_RGBA_ADDRD(get_temp(fp, 1))
- | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
- | R500_ALU_RGBA_A_SWIZ_0;
- counter++;
- /* Fourth instruction */
- fp->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_AR << 11);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(fp, 0));
- fp->inst[counter].inst3 = MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE)
- | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ONE);
- fp->inst[counter].inst4 = R500_ALPHA_OP_EX2
- | R500_ALPHA_ADDRD(get_temp(fp, 0))
- | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A;
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
- | R500_ALU_RGBA_ADDRD(get_temp(fp, 0))
- | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
- | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
- counter++;
- /* Fifth instruction */
- fp->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_AB << 11);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(fp, 0));
- /* Select [w, w, w] */
- temp_swiz = 3 | (3 << 3) | (3 << 6);
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ZERO)
- | R500_ALU_RGB_SEL_B_SRC0
- | MAKE_SWIZ_RGB_B(temp_swiz);
- fp->inst[counter].inst4 |= R500_ALPHA_OP_MAD
- | R500_ALPHA_ADDRD(get_temp(fp, 0))
- | R500_ALPHA_SWIZ_A_1
- | R500_ALPHA_SWIZ_B_1;
- /* Select [-y, -y, -y] */
- temp_swiz = 1 | (1 << 3) | (1 << 6);
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
- | R500_ALU_RGBA_ADDRD(get_temp(fp, 0))
- | MAKE_SWIZ_RGBA_C(temp_swiz)
- | R500_ALU_RGBA_MOD_C_NEG
- | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
- counter++;
- /* Final instruction */
- emit_mov(fp, counter, fpi, get_temp(fp, 0), SWIZZLE_NOOP, dest);
- break;
- case OPCODE_LRP:
- /* src0 * src1 + INV(src0) * src2
- * 1) MUL src0, src1, temp
- * 2) PRE 1-src0; MAD srcp, src2, temp */
- src[0] = make_src(fp, fpi->SrcReg[0]);
- src[1] = make_src(fp, fpi->SrcReg[1]);
- src[2] = make_src(fp, fpi->SrcReg[2]);
- fp->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
- | R500_INST_NOP | (R500_WRITEMASK_ARGB << 11);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
- | R500_RGB_ADDR1(src[1]);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
- | R500_ALPHA_ADDR1(src[1]);
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
- | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
- fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
- | R500_ALPHA_ADDRD(get_temp(fp, 0))
- | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
- | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
- | R500_ALU_RGBA_ADDRD(get_temp(fp, 0))
- | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
- | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
- counter++;
- emit_alu(fp, counter, fpi);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
- | R500_RGB_ADDR1(src[2])
- | R500_RGB_ADDR2(get_temp(fp, 0))
- | R500_RGB_SRCP_OP_1_MINUS_RGB0;
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
- | R500_ALPHA_ADDR1(src[2])
- | R500_ALPHA_ADDR2(get_temp(fp, 0))
- | R500_ALPHA_SRCP_OP_1_MINUS_A0;
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRCP
- | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
- | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB);
- fp->inst[counter].inst4 |= R500_ALPHA_OP_MAD
- | R500_ALPHA_ADDRD(dest)
- | R500_ALPHA_SEL_A_SRCP | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
- | R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
- | R500_ALU_RGBA_ADDRD(dest)
- | R500_ALU_RGBA_SEL_C_SRC2 | MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[2]))
- | R500_ALU_RGBA_ALPHA_SEL_C_SRC2
- | MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[2]));
- break;
- case OPCODE_MAD:
- emit_mad(fp, counter, fpi, 0, 1, 2);
- break;
- case OPCODE_MAX:
- src[0] = make_src(fp, fpi->SrcReg[0]);
- src[1] = make_src(fp, fpi->SrcReg[1]);
- emit_alu(fp, counter, fpi);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) | R500_RGB_ADDR1(src[1]);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) | R500_ALPHA_ADDR1(src[1]);
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
- | R500_ALU_RGB_SEL_B_SRC1
- | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
- fp->inst[counter].inst4 |= R500_ALPHA_OP_MAX
- | R500_ALPHA_ADDRD(dest)
- | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
- | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAX
- | R500_ALU_RGBA_ADDRD(dest);
- break;
- case OPCODE_MIN:
- src[0] = make_src(fp, fpi->SrcReg[0]);
- src[1] = make_src(fp, fpi->SrcReg[1]);
- emit_alu(fp, counter, fpi);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) | R500_RGB_ADDR1(src[1]);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) | R500_ALPHA_ADDR1(src[1]);
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
- | R500_ALU_RGB_SEL_B_SRC1
- | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
- fp->inst[counter].inst4 |= R500_ALPHA_OP_MIN
- | R500_ALPHA_ADDRD(dest)
- | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
- | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MIN
- | R500_ALU_RGBA_ADDRD(dest);
- break;
- case OPCODE_MOV:
- emit_mov(fp, counter, fpi, make_src(fp, fpi->SrcReg[0]), fpi->SrcReg[0].Swizzle, dest);
- break;
- case OPCODE_MUL:
- /* Variation on MAD: src0*src1+0 */
- emit_mad(fp, counter, fpi, 0, 1, R500_SWIZZLE_ZERO);
- break;
- case OPCODE_POW:
- /* POW(a,b) = EX2(LN2(a)*b) */
- src[0] = make_src(fp, fpi->SrcReg[0]);
- src[1] = make_src(fp, fpi->SrcReg[1]);
- emit_sop(fp, counter, fpi, OPCODE_LG2, src[0], make_sop_swizzle(fpi->SrcReg[0]), get_temp(fp, 0));
- fp->inst[counter].inst0 |= (R500_WRITEMASK_ARGB << 11);
- counter++;
- fp->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(fp, 0))
- | R500_RGB_ADDR1(src[1]);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(fp, 0))
- | R500_ALPHA_ADDR1(src[1]);
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
- | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
- fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
- | R500_ALPHA_ADDRD(get_temp(fp, 1))
- | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
- | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
- | R500_ALU_RGBA_ADDRD(get_temp(fp, 1))
- | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
- | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
- counter++;
- emit_sop(fp, counter, fpi, OPCODE_EX2, get_temp(fp, 1), make_sop_swizzle(fpi->SrcReg[0]), dest);
- break;
- case OPCODE_RCP:
- src[0] = make_src(fp, fpi->SrcReg[0]);
- emit_sop(fp, counter, fpi, OPCODE_RCP, src[0], make_sop_swizzle(fpi->SrcReg[0]), dest);
- break;
- case OPCODE_RSQ:
- src[0] = make_src(fp, fpi->SrcReg[0]);
- emit_sop(fp, counter, fpi, OPCODE_RSQ, src[0], make_sop_swizzle(fpi->SrcReg[0]), dest);
- break;
- case OPCODE_SCS:
- src[0] = make_src(fp, fpi->SrcReg[0]);
- src[1] = emit_const4fv(fp, RCP_2PI);
- fp->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
- | (R500_WRITEMASK_ARGB << 11);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
- | R500_RGB_ADDR1(src[1]);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
- | R500_ALPHA_ADDR1(src[1]);
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB)
- | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB);
- fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
- | R500_ALPHA_ADDRD(get_temp(fp, 0))
- | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A
- | R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
- | R500_ALU_RGBA_ADDRD(get_temp(fp, 0))
- | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
- | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
- counter++;
- fp->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(fp, 0));
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(fp, 0));
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB);
- fp->inst[counter].inst4 = R500_ALPHA_OP_FRC
- | R500_ALPHA_ADDRD(get_temp(fp, 1))
- | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A;
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC
- | R500_ALU_RGBA_ADDRD(get_temp(fp, 1));
- counter++;
- /* Do a cosine, then a sine, masking out the channels we want to protect. */
- /* Cosine only goes in R (x) channel. */
- fpi->DstReg.WriteMask = 0x1;
- emit_sop(fp, counter, fpi, OPCODE_COS, get_temp(fp, 1), make_sop_swizzle(fpi->SrcReg[0]), dest);
- counter++;
- /* Sine only goes in G (y) channel. */
- fpi->DstReg.WriteMask = 0x2;
- emit_sop(fp, counter, fpi, OPCODE_SIN, get_temp(fp, 1), make_sop_swizzle(fpi->SrcReg[0]), dest);
- break;
- case OPCODE_SGE:
- src[0] = make_src(fp, fpi->SrcReg[0]);
- src[1] = make_src(fp, fpi->SrcReg[1]);
- fp->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
- | (R500_WRITEMASK_ARGB << 11);
- fp->inst[counter].inst1 = R500_RGB_ADDR1(src[0])
- | R500_RGB_ADDR2(src[1]);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR1(src[0])
- | R500_ALPHA_ADDR2(src[1]);
- fp->inst[counter].inst3 = /* 1 */
- MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE)
- | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[0]));
- fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
- | R500_ALPHA_ADDRD(get_temp(fp, 0))
- | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
- | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[0]));
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
- | R500_ALU_RGBA_ADDRD(get_temp(fp, 0))
- | R500_ALU_RGBA_SEL_C_SRC2
- | MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[1]))
- | R500_ALU_RGBA_MOD_C_NEG
- | R500_ALU_RGBA_ALPHA_SEL_C_SRC2
- | MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[1]))
- | R500_ALU_RGBA_ALPHA_MOD_C_NEG;
- counter++;
- /* This inst's selects need to be swapped as follows:
- * 0 -> C ; 1 -> B ; 2 -> A */
- emit_alu(fp, counter, fpi);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(fp, 0));
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(fp, 0));
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE)
- | R500_ALU_RGB_SEL_B_SRC0
- | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ZERO);
- fp->inst[counter].inst4 |= R500_ALPHA_OP_CMP
- | R500_ALPHA_ADDRD(dest)
- | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
- | R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(R500_SWIZZLE_ZERO);
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
- | R500_ALU_RGBA_ADDRD(dest)
- | R500_ALU_RGBA_SEL_C_SRC0
- | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_RGB)
- | R500_ALU_RGBA_ALPHA_SEL_C_SRC0
- | R500_ALU_RGBA_A_SWIZ_A;
- break;
- case OPCODE_SIN:
- src[0] = make_src(fp, fpi->SrcReg[0]);
- src[1] = emit_const4fv(fp, RCP_2PI);
- fp->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
- | (R500_WRITEMASK_ARGB << 11);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
- | R500_RGB_ADDR1(src[1]);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
- | R500_ALPHA_ADDR1(src[1]);
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB)
- | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB);
- fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
- | R500_ALPHA_ADDRD(get_temp(fp, 0))
- | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A
- | R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
- | R500_ALU_RGBA_ADDRD(get_temp(fp, 0))
- | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
- | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
- counter++;
- fp->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(fp, 0));
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(fp, 0));
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB);
- fp->inst[counter].inst4 = R500_ALPHA_OP_FRC
- | R500_ALPHA_ADDRD(get_temp(fp, 1))
- | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A;
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC
- | R500_ALU_RGBA_ADDRD(get_temp(fp, 1));
- counter++;
- emit_sop(fp, counter, fpi, OPCODE_SIN, get_temp(fp, 1), make_sop_swizzle(fpi->SrcReg[0]), dest);
- break;
- case OPCODE_SLT:
- src[0] = make_src(fp, fpi->SrcReg[0]);
- src[1] = make_src(fp, fpi->SrcReg[1]);
- fp->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
- | (R500_WRITEMASK_ARGB << 11);
- fp->inst[counter].inst1 = R500_RGB_ADDR1(src[0])
- | R500_RGB_ADDR2(src[1]);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR1(src[0])
- | R500_ALPHA_ADDR2(src[1]);
- fp->inst[counter].inst3 = /* 1 */
- MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE)
- | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[0]));
- fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
- | R500_ALPHA_ADDRD(get_temp(fp, 0))
- | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
- | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[0]));
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
- | R500_ALU_RGBA_ADDRD(get_temp(fp, 0))
- | R500_ALU_RGBA_SEL_C_SRC2
- | MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[1]))
- | R500_ALU_RGBA_MOD_C_NEG
- | R500_ALU_RGBA_ALPHA_SEL_C_SRC2
- | MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[1]))
- | R500_ALU_RGBA_ALPHA_MOD_C_NEG;
- counter++;
- /* This inst's selects need to be swapped as follows:
- * 0 -> C ; 1 -> B ; 2 -> A */
- emit_alu(fp, counter, fpi);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(fp, 0));
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(fp, 0));
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ZERO)
- | R500_ALU_RGB_SEL_B_SRC0
- | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ONE);
- fp->inst[counter].inst4 |= R500_ALPHA_OP_CMP
- | R500_ALPHA_ADDRD(dest)
- | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ZERO)
- | R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(R500_SWIZZLE_ONE);
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
- | R500_ALU_RGBA_ADDRD(dest)
- | R500_ALU_RGBA_SEL_C_SRC0
- | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_RGB)
- | R500_ALU_RGBA_ALPHA_SEL_C_SRC0
- | R500_ALU_RGBA_A_SWIZ_A;
- break;
- case OPCODE_SUB:
- /* Variation on MAD: 1*src0-src1 */
- fpi->SrcReg[1].NegateBase = 0xF; /* NEG_XYZW */
- emit_mad(fp, counter, fpi, R500_SWIZZLE_ONE, 0, 1);
- break;
- case OPCODE_SWZ:
- /* TODO: The rarer negation masks! */
- emit_mov(fp, counter, fpi, make_src(fp, fpi->SrcReg[0]), fpi->SrcReg[0].Swizzle, dest);
- break;
- case OPCODE_XPD:
- /* src0 * src1 - src1 * src0
- * 1) MUL temp.xyz, src0.yzx, src1.zxy
- * 2) MAD src0.zxy, src1.yzx, -temp.xyz */
- src[0] = make_src(fp, fpi->SrcReg[0]);
- src[1] = make_src(fp, fpi->SrcReg[1]);
- fp->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
- | (R500_WRITEMASK_RGB << 11);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
- | R500_RGB_ADDR1(src[1]);
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
- | R500_ALPHA_ADDR1(src[1]);
- /* Select [y, z, x] */
- temp_swiz = make_rgb_swizzle(fpi->SrcReg[0]);
- temp_swiz = (GET_SWZ(temp_swiz, 1) << 0) | (GET_SWZ(temp_swiz, 2) << 3) | (GET_SWZ(temp_swiz, 0) << 6);
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(temp_swiz);
- /* Select [z, x, y] */
- temp_swiz = make_rgb_swizzle(fpi->SrcReg[1]);
- temp_swiz = (GET_SWZ(temp_swiz, 2) << 0) | (GET_SWZ(temp_swiz, 0) << 3) | (GET_SWZ(temp_swiz, 1) << 6);
- fp->inst[counter].inst3 |= R500_ALU_RGB_SEL_B_SRC1
- | MAKE_SWIZ_RGB_B(temp_swiz);
- fp->inst[counter].inst4 = R500_ALPHA_OP_MAD
- | R500_ALPHA_ADDRD(get_temp(fp, 0))
- | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
- | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
- | R500_ALU_RGBA_ADDRD(get_temp(fp, 0))
- | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
- | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
- counter++;
- emit_alu(fp, counter, fpi);
- fp->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
- | R500_RGB_ADDR1(src[1])
- | R500_RGB_ADDR2(get_temp(fp, 0));
- fp->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
- | R500_ALPHA_ADDR1(src[1])
- | R500_ALPHA_ADDR2(get_temp(fp, 0));
- /* Select [z, x, y] */
- temp_swiz = make_rgb_swizzle(fpi->SrcReg[0]);
- temp_swiz = (GET_SWZ(temp_swiz, 2) << 0) | (GET_SWZ(temp_swiz, 0) << 3) | (GET_SWZ(temp_swiz, 1) << 6);
- fp->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
- | MAKE_SWIZ_RGB_A(temp_swiz);
- /* Select [y, z, x] */
- temp_swiz = make_rgb_swizzle(fpi->SrcReg[1]);
- temp_swiz = (GET_SWZ(temp_swiz, 1) << 0) | (GET_SWZ(temp_swiz, 2) << 3) | (GET_SWZ(temp_swiz, 0) << 6);
- fp->inst[counter].inst3 |= R500_ALU_RGB_SEL_B_SRC1
- | MAKE_SWIZ_RGB_B(temp_swiz);
- fp->inst[counter].inst4 |= R500_ALPHA_OP_MAD
- | R500_ALPHA_ADDRD(dest)
- | R500_ALPHA_SWIZ_A_1
- | R500_ALPHA_SWIZ_B_1;
- fp->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
- | R500_ALU_RGBA_ADDRD(dest)
- | R500_ALU_RGBA_SEL_C_SRC2
- | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_RGB)
- | R500_ALU_RGBA_MOD_C_NEG
- | R500_ALU_RGBA_A_SWIZ_0;
- break;
- case OPCODE_KIL:
- case OPCODE_TEX:
- case OPCODE_TXB:
- case OPCODE_TXP:
- emit_tex(fp, fpi, dest, counter);
- if (fpi->DstReg.File == PROGRAM_OUTPUT)
- counter++;
- break;
- default:
- ERROR("unknown fpi->Opcode %s\n", _mesa_opcode_string(fpi->Opcode));
- break;
- }
+ for(unit = 0; unit < 16; ++unit) {
+ if (fp->mesa_program.Base.ShadowSamplers & (1 << unit)) {
+ struct gl_texture_object* tex = r300->radeon.glCtx->Texture.Unit[unit]._Current;
- /* Finishing touches */
- if (fpi->SaturateMode == SATURATE_ZERO_ONE) {
- fp->inst[counter].inst0 |= R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP;
+ state->unit[unit].depth_texture_mode = build_dtm(tex->DepthMode);
+ state->unit[unit].texture_compare_func = build_func(tex->CompareFunc);
}
-
- counter++;
-
- if (fp->error)
- return GL_FALSE;
-
}
-
- /* Finish him! (If it's an ALU/OUT instruction...) */
- if ((fp->inst[counter-1].inst0 & 0x3) == 1) {
- fp->inst[counter-1].inst0 |= R500_INST_LAST;
- } else {
- /* We still need to put an output inst, right? */
- WARN_ONCE("Final FP instruction is not an OUT.\n");
- }
-
- fp->cs->nrslots = counter;
-
- fp->max_temp_idx++;
-
- return GL_TRUE;
}
-static void init_program(r300ContextPtr r300, struct r500_fragment_program *fp)
-{
- struct r300_pfs_compile_state *cs = NULL;
- struct gl_fragment_program *mp = &fp->mesa_program;
- struct prog_instruction *fpi;
- GLuint InputsRead = mp->Base.InputsRead;
- GLuint temps_used = 0;
- int i, j;
-
- /* New compile, reset tracking data */
- fp->optimization =
- driQueryOptioni(&r300->radeon.optionCache, "fp_optimization");
- fp->translated = GL_FALSE;
- fp->error = GL_FALSE;
- fp->cs = cs = &(R300_CONTEXT(fp->ctx)->state.pfs_compile);
- fp->const_nr = 0;
- /* Size of pixel stack, plus 1. */
- fp->max_temp_idx = 1;
- /* Temp register offset. */
- fp->temp_reg_offset = 0;
- /* Whether or not we perform any depth writing. */
- fp->writes_depth = GL_FALSE;
-
- _mesa_memset(cs, 0, sizeof(*fp->cs));
- for (i = 0; i < PFS_MAX_ALU_INST; i++) {
- for (j = 0; j < 3; j++) {
- cs->slot[i].vsrc[j] = SRC_CONST;
- cs->slot[i].ssrc[j] = SRC_CONST;
- }
- }
-
- /* Work out what temps the Mesa inputs correspond to, this must match
- * what setup_rs_unit does, which shouldn't be a problem as rs_unit
- * configures itself based on the fragprog's InputsRead
- *
- * NOTE: this depends on get_hw_temp() allocating registers in order,
- * starting from register 0, so we're just going to do that instead.
- */
-
- /* Texcoords come first */
- for (i = 0; i < fp->ctx->Const.MaxTextureUnits; i++) {
- if (InputsRead & (FRAG_BIT_TEX0 << i)) {
- cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0;
- cs->inputs[FRAG_ATTRIB_TEX0 + i].reg =
- fp->temp_reg_offset;
- fp->temp_reg_offset++;
- }
- }
- InputsRead &= ~FRAG_BITS_TEX_ANY;
-
- /* fragment position treated as a texcoord */
- if (InputsRead & FRAG_BIT_WPOS) {
- cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
- cs->inputs[FRAG_ATTRIB_WPOS].reg =
- fp->temp_reg_offset;
- fp->temp_reg_offset++;
- }
- InputsRead &= ~FRAG_BIT_WPOS;
-
- /* Then primary colour */
- if (InputsRead & FRAG_BIT_COL0) {
- cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
- cs->inputs[FRAG_ATTRIB_COL0].reg =
- fp->temp_reg_offset;
- fp->temp_reg_offset++;
- }
- InputsRead &= ~FRAG_BIT_COL0;
-
- /* Secondary color */
- if (InputsRead & FRAG_BIT_COL1) {
- cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
- cs->inputs[FRAG_ATTRIB_COL1].reg =
- fp->temp_reg_offset;
- fp->temp_reg_offset++;
- }
- InputsRead &= ~FRAG_BIT_COL1;
-
- /* Anything else */
- if (InputsRead) {
- WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead);
- /* force read from hwreg 0 for now */
- for (i = 0; i < 32; i++)
- if (InputsRead & (1 << i))
- cs->inputs[i].reg = 0;
- }
+static void dump_program(struct r500_fragment_program_code *code);
- if (!mp->Base.Instructions) {
- ERROR("No instructions found in program, going to go die now.\n");
- return;
- }
+void r500TranslateFragmentShader(r300ContextPtr r300,
+ struct r500_fragment_program *fp)
+{
+ struct r500_fragment_program_external_state state;
- for (fpi = mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
- for (i = 0; i < 3; i++) {
- if (fpi->SrcReg[i].File == PROGRAM_TEMPORARY) {
- if (fpi->SrcReg[i].Index >= temps_used)
- temps_used = fpi->SrcReg[i].Index + 1;
- }
- }
+ build_state(r300, fp, &state);
+ if (_mesa_memcmp(&fp->state, &state, sizeof(state))) {
+ /* TODO: cache compiled programs */
+ fp->translated = GL_FALSE;
+ _mesa_memcpy(&fp->state, &state, sizeof(state));
}
- cs->temp_in_use = temps_used + 1;
-
- fp->max_temp_idx = fp->temp_reg_offset + cs->temp_in_use;
-
- if (RADEON_DEBUG & DEBUG_PIXEL)
- fprintf(stderr, "FP temp indices: fp->max_temp_idx: %d cs->temp_in_use: %d\n", fp->max_temp_idx, cs->temp_in_use);
-}
+ if (!fp->translated) {
+ struct r500_fragment_program_compiler compiler;
-static void update_params(struct r500_fragment_program *fp)
-{
- struct gl_fragment_program *mp = &fp->mesa_program;
+ compiler.r300 = r300;
+ compiler.fp = fp;
+ compiler.code = &fp->code;
- /* Ask Mesa nicely to fill in ParameterValues for us */
- if (mp->Base.Parameters)
- _mesa_load_state_parameters(fp->ctx, mp->Base.Parameters);
-}
+ radeonCompilerInit(&compiler.compiler, r300->radeon.glCtx, &fp->mesa_program.Base);
-static void dumb_shader(struct r500_fragment_program *fp)
-{
- fp->inst[0].inst0 = R500_INST_TYPE_TEX
- | R500_INST_TEX_SEM_WAIT
- | R500_INST_RGB_WMASK_R
- | R500_INST_RGB_WMASK_G
- | R500_INST_RGB_WMASK_B
- | R500_INST_ALPHA_WMASK
- | R500_INST_RGB_CLAMP
- | R500_INST_ALPHA_CLAMP;
- fp->inst[0].inst1 = R500_TEX_ID(0)
- | R500_TEX_INST_LD
- | R500_TEX_SEM_ACQUIRE
- | R500_TEX_IGNORE_UNCOVERED;
- fp->inst[0].inst2 = R500_TEX_SRC_ADDR(0)
- | R500_TEX_SRC_S_SWIZ_R
- | R500_TEX_SRC_T_SWIZ_G
- | R500_TEX_DST_ADDR(0)
- | R500_TEX_DST_R_SWIZ_R
- | R500_TEX_DST_G_SWIZ_G
- | R500_TEX_DST_B_SWIZ_B
- | R500_TEX_DST_A_SWIZ_A;
- fp->inst[0].inst3 = R500_DX_ADDR(0)
- | R500_DX_S_SWIZ_R
- | R500_DX_T_SWIZ_R
- | R500_DX_R_SWIZ_R
- | R500_DX_Q_SWIZ_R
- | R500_DY_ADDR(0)
- | R500_DY_S_SWIZ_R
- | R500_DY_T_SWIZ_R
- | R500_DY_R_SWIZ_R
- | R500_DY_Q_SWIZ_R;
- fp->inst[0].inst4 = 0x0;
- fp->inst[0].inst5 = 0x0;
-
- fp->inst[1].inst0 = R500_INST_TYPE_OUT |
- R500_INST_TEX_SEM_WAIT |
- R500_INST_LAST |
- R500_INST_RGB_OMASK_R |
- R500_INST_RGB_OMASK_G |
- R500_INST_RGB_OMASK_B |
- R500_INST_ALPHA_OMASK;
- fp->inst[1].inst1 = R500_RGB_ADDR0(0) |
- R500_RGB_ADDR1(0) |
- R500_RGB_ADDR1_CONST |
- R500_RGB_ADDR2(0) |
- R500_RGB_ADDR2_CONST |
- R500_RGB_SRCP_OP_1_MINUS_2RGB0;
- fp->inst[1].inst2 = R500_ALPHA_ADDR0(0) |
- R500_ALPHA_ADDR1(0) |
- R500_ALPHA_ADDR1_CONST |
- R500_ALPHA_ADDR2(0) |
- R500_ALPHA_ADDR2_CONST |
- R500_ALPHA_SRCP_OP_1_MINUS_2A0;
- fp->inst[1].inst3 = R500_ALU_RGB_SEL_A_SRC0 |
- R500_ALU_RGB_R_SWIZ_A_R |
- R500_ALU_RGB_G_SWIZ_A_G |
- R500_ALU_RGB_B_SWIZ_A_B |
- R500_ALU_RGB_SEL_B_SRC0 |
- R500_ALU_RGB_R_SWIZ_B_1 |
- R500_ALU_RGB_B_SWIZ_B_1 |
- R500_ALU_RGB_G_SWIZ_B_1;
- fp->inst[1].inst4 = R500_ALPHA_OP_MAD |
- R500_ALPHA_SWIZ_A_A |
- R500_ALPHA_SWIZ_B_1;
- fp->inst[1].inst5 = R500_ALU_RGBA_OP_MAD |
- R500_ALU_RGBA_R_SWIZ_0 |
- R500_ALU_RGBA_G_SWIZ_0 |
- R500_ALU_RGBA_B_SWIZ_0 |
- R500_ALU_RGBA_A_SWIZ_0;
-
- fp->cs->nrslots = 2;
- fp->translated = GL_TRUE;
-}
+ insert_WPOS_trailer(&compiler);
-void r500TranslateFragmentShader(r300ContextPtr r300,
- struct r500_fragment_program *fp)
-{
+ struct radeon_program_transformation transformations[1] = {
+ { &transform_TEX, &compiler }
+ };
+ radeonClauseLocalTransform(&compiler.compiler,
+ &compiler.compiler.Clauses[0],
+ 1, transformations);
- struct r300_pfs_compile_state *cs = NULL;
+ if (RADEON_DEBUG & DEBUG_PIXEL) {
+ _mesa_printf("Compiler state after transformations:\n");
+ radeonCompilerDump(&compiler.compiler);
+ }
- if (!fp->translated) {
+ fp->translated = r500FragmentProgramEmit(&compiler);
- init_program(r300, fp);
- cs = fp->cs;
+ radeonCompilerCleanup(&compiler.compiler);
- if (parse_program(fp) == GL_FALSE) {
- ERROR("Huh. Couldn't parse program. There should be additional errors explaining why.\nUsing dumb shader...\n");
- dumb_shader(fp);
- fp->inst_offset = 0;
- fp->inst_end = cs->nrslots - 1;
- return;
- }
- fp->inst_offset = 0;
- fp->inst_end = cs->nrslots - 1;
+ r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM);
- fp->translated = GL_TRUE;
if (RADEON_DEBUG & DEBUG_PIXEL) {
fprintf(stderr, "Mesa program:\n");
fprintf(stderr, "-------------\n");
_mesa_print_program(&fp->mesa_program.Base);
fflush(stdout);
- dump_program(fp);
+ if (fp->translated)
+ dump_program(&fp->code);
}
-
- r300UpdateStateParameters(fp->ctx, _NEW_PROGRAM);
}
- update_params(fp);
+ update_params(r300, fp);
}
@@ -1561,7 +445,7 @@ static char *to_texop(int val)
return NULL;
}
-static void dump_program(struct r500_fragment_program *fp)
+static void dump_program(struct r500_fragment_program_code *code)
{
fprintf(stderr, "R500 Fragment Program:\n--------\n");
@@ -1571,18 +455,18 @@ static void dump_program(struct r500_fragment_program *fp)
uint32_t inst0;
char *str = NULL;
- if (fp->const_nr) {
+ if (code->const_nr) {
fprintf(stderr, "--------\nConstants:\n");
- for (n = 0; n < fp->const_nr; n++) {
+ for (n = 0; n < code->const_nr; n++) {
fprintf(stderr, "Constant %d: %f %f\n\t %f %f\n", n,
- fp->constant[n][0], fp->constant[n][1], fp->constant[n][2],
- fp->constant[n][3]);
+ code->constant[n][0], code->constant[n][1], code->constant[n][2],
+ code->constant[n][3]);
}
fprintf(stderr, "--------\n");
}
- for (n = 0; n < fp->inst_end+1; n++) {
- inst0 = inst = fp->inst[n].inst0;
+ for (n = 0; n < code->inst_end+1; n++) {
+ inst0 = inst = code->inst[n].inst0;
fprintf(stderr,"%d\t0:CMN_INST 0x%08x:", n, inst);
switch(inst & 0x3) {
case R500_INST_TYPE_ALU: str = "ALU"; break;
@@ -1601,8 +485,8 @@ static void dump_program(struct r500_fragment_program *fp)
switch(inst0 & 0x3) {
case 0:
case 1:
- fprintf(stderr,"\t1:RGB_ADDR 0x%08x:", fp->inst[n].inst1);
- inst = fp->inst[n].inst1;
+ fprintf(stderr,"\t1:RGB_ADDR 0x%08x:", code->inst[n].inst1);
+ inst = code->inst[n].inst1;
fprintf(stderr,"Addr0: %d%c, Addr1: %d%c, Addr2: %d%c, srcp:%d\n",
inst & 0xff, (inst & (1<<8)) ? 'c' : 't',
@@ -1610,15 +494,15 @@ static void dump_program(struct r500_fragment_program *fp)
(inst >> 20) & 0xff, (inst & (1<<28)) ? 'c' : 't',
(inst >> 30));
- fprintf(stderr,"\t2:ALPHA_ADDR 0x%08x:", fp->inst[n].inst2);
- inst = fp->inst[n].inst2;
+ fprintf(stderr,"\t2:ALPHA_ADDR 0x%08x:", code->inst[n].inst2);
+ inst = code->inst[n].inst2;
fprintf(stderr,"Addr0: %d%c, Addr1: %d%c, Addr2: %d%c, srcp:%d\n",
inst & 0xff, (inst & (1<<8)) ? 'c' : 't',
(inst >> 10) & 0xff, (inst & (1<<18)) ? 'c' : 't',
(inst >> 20) & 0xff, (inst & (1<<28)) ? 'c' : 't',
(inst >> 30));
- fprintf(stderr,"\t3 RGB_INST: 0x%08x:", fp->inst[n].inst3);
- inst = fp->inst[n].inst3;
+ fprintf(stderr,"\t3 RGB_INST: 0x%08x:", code->inst[n].inst3);
+ inst = code->inst[n].inst3;
fprintf(stderr,"rgb_A_src:%d %s/%s/%s %d rgb_B_src:%d %s/%s/%s %d\n",
(inst) & 0x3, toswiz((inst >> 2) & 0x7), toswiz((inst >> 5) & 0x7), toswiz((inst >> 8) & 0x7),
(inst >> 11) & 0x3,
@@ -1626,16 +510,16 @@ static void dump_program(struct r500_fragment_program *fp)
(inst >> 24) & 0x3);
- fprintf(stderr,"\t4 ALPHA_INST:0x%08x:", fp->inst[n].inst4);
- inst = fp->inst[n].inst4;
+ fprintf(stderr,"\t4 ALPHA_INST:0x%08x:", code->inst[n].inst4);
+ inst = code->inst[n].inst4;
fprintf(stderr,"%s dest:%d%s alp_A_src:%d %s %d alp_B_src:%d %s %d w:%d\n", to_alpha_op(inst & 0xf),
(inst >> 4) & 0x7f, inst & (1<<11) ? "(rel)":"",
(inst >> 12) & 0x3, toswiz((inst >> 14) & 0x7), (inst >> 17) & 0x3,
(inst >> 19) & 0x3, toswiz((inst >> 21) & 0x7), (inst >> 24) & 0x3,
(inst >> 31) & 0x1);
- fprintf(stderr,"\t5 RGBA_INST: 0x%08x:", fp->inst[n].inst5);
- inst = fp->inst[n].inst5;
+ fprintf(stderr,"\t5 RGBA_INST: 0x%08x:", code->inst[n].inst5);
+ inst = code->inst[n].inst5;
fprintf(stderr,"%s dest:%d%s rgb_C_src:%d %s/%s/%s %d alp_C_src:%d %s %d\n", toop(inst & 0xf),
(inst >> 4) & 0x7f, inst & (1<<11) ? "(rel)":"",
(inst >> 12) & 0x3, toswiz((inst >> 14) & 0x7), toswiz((inst >> 17) & 0x7), toswiz((inst >> 20) & 0x7),
@@ -1645,11 +529,11 @@ static void dump_program(struct r500_fragment_program *fp)
case 2:
break;
case 3:
- inst = fp->inst[n].inst1;
+ inst = code->inst[n].inst1;
fprintf(stderr,"\t1:TEX_INST: 0x%08x: id: %d op:%s, %s, %s %s\n", inst, (inst >> 16) & 0xf,
to_texop((inst >> 22) & 0x7), (inst & (1<<25)) ? "ACQ" : "",
(inst & (1<<26)) ? "IGNUNC" : "", (inst & (1<<27)) ? "UNSCALED" : "SCALED");
- inst = fp->inst[n].inst2;
+ inst = code->inst[n].inst2;
fprintf(stderr,"\t2:TEX_ADDR: 0x%08x: src: %d%s %s/%s/%s/%s dst: %d%s %s/%s/%s/%s\n", inst,
inst & 127, inst & (1<<7) ? "(rel)" : "",
toswiz((inst >> 8) & 0x3), toswiz((inst >> 10) & 0x3),
@@ -1658,7 +542,7 @@ static void dump_program(struct r500_fragment_program *fp)
toswiz((inst >> 24) & 0x3), toswiz((inst >> 26) & 0x3),
toswiz((inst >> 28) & 0x3), toswiz((inst >> 30) & 0x3));
- fprintf(stderr,"\t3:TEX_DXDY: 0x%08x\n", fp->inst[n].inst3);
+ fprintf(stderr,"\t3:TEX_DXDY: 0x%08x\n", code->inst[n].inst3);
break;
}
fprintf(stderr,"\n");
diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.h b/src/mesa/drivers/dri/r300/r500_fragprog.h
index 5dd2def1c40..ff6a9002c14 100644
--- a/src/mesa/drivers/dri/r300/r500_fragprog.h
+++ b/src/mesa/drivers/dri/r300/r500_fragprog.h
@@ -36,10 +36,14 @@
#include "glheader.h"
#include "macros.h"
#include "enums.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
#include "shader/program.h"
#include "shader/prog_instruction.h"
#include "r300_context.h"
+#include "r300_state.h"
+#include "radeon_program.h"
/* supported hw opcodes */
#define PFS_OP_MAD 0
@@ -76,4 +80,13 @@ struct r500_fragment_program;
extern void r500TranslateFragmentShader(r300ContextPtr r300,
struct r500_fragment_program *fp);
+struct r500_fragment_program_compiler {
+ r300ContextPtr r300;
+ struct r500_fragment_program *fp;
+ struct r500_fragment_program_code *code;
+ struct radeon_compiler compiler;
+};
+
+extern GLboolean r500FragmentProgramEmit(struct r500_fragment_program_compiler *compiler);
+
#endif
diff --git a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c
new file mode 100644
index 00000000000..e1ad342690b
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c
@@ -0,0 +1,1533 @@
+/*
+ * Copyright (C) 2005 Ben Skeggs.
+ *
+ * Copyright 2008 Corbin Simpson <[email protected]>
+ * Adaptation and modification for ATI/AMD Radeon R500 GPU chipsets.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * \file
+ *
+ * \author Ben Skeggs <[email protected]>
+ *
+ * \author Jerome Glisse <[email protected]>
+ *
+ * \author Corbin Simpson <[email protected]>
+ *
+ * \todo Depth write, WPOS/FOGC inputs
+ *
+ * \todo FogOption
+ *
+ */
+
+#include "glheader.h"
+#include "macros.h"
+#include "enums.h"
+#include "shader/prog_instruction.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
+
+#include "r300_context.h"
+#include "r500_fragprog.h"
+#include "r300_reg.h"
+#include "r300_state.h"
+
+/* Mapping Mesa registers to R500 temporaries */
+struct reg_acc {
+ int reg; /* Assigned hw temp */
+ unsigned int refcount; /* Number of uses by mesa program */
+};
+
+/**
+ * Describe the current lifetime information for an R300 temporary
+ */
+struct reg_lifetime {
+ /* Index of the first slot where this register is free in the sense
+ that it can be used as a new destination register.
+ This is -1 if the register has been assigned to a Mesa register
+ and the last access to the register has not yet been emitted */
+ int free;
+
+ /* Index of the first slot where this register is currently reserved.
+ This is used to stop e.g. a scalar operation from being moved
+ before the allocation time of a register that was first allocated
+ for a vector operation. */
+ int reserved;
+
+ /* Index of the first slot in which the register can be used as a
+ source without losing the value that is written by the last
+ emitted instruction that writes to the register */
+ int vector_valid;
+ int scalar_valid;
+
+ /* Index to the slot where the register was last read.
+ This is also the first slot in which the register may be written again */
+ int vector_lastread;
+ int scalar_lastread;
+};
+
+/**
+ * Store usage information about an ALU instruction slot during the
+ * compilation of a fragment program.
+ */
+#define SLOT_SRC_VECTOR (1<<0)
+#define SLOT_SRC_SCALAR (1<<3)
+#define SLOT_SRC_BOTH (SLOT_SRC_VECTOR | SLOT_SRC_SCALAR)
+#define SLOT_OP_VECTOR (1<<16)
+#define SLOT_OP_SCALAR (1<<17)
+#define SLOT_OP_BOTH (SLOT_OP_VECTOR | SLOT_OP_SCALAR)
+
+struct r500_pfs_compile_slot {
+ /* Bitmask indicating which parts of the slot are used, using SLOT_ constants
+ defined above */
+ unsigned int used;
+
+ /* Selected sources */
+ int vsrc[3];
+ int ssrc[3];
+};
+
+/**
+ * Store information during compilation of fragment programs.
+ */
+struct r500_pfs_compile_state {
+ struct r500_fragment_program_compiler *compiler;
+
+ /* number of ALU slots used so far */
+ int nrslots;
+
+ /* Track which (parts of) slots are already filled with instructions */
+ struct r500_pfs_compile_slot slot[PFS_MAX_ALU_INST];
+
+ /* Track the validity of R300 temporaries */
+ struct reg_lifetime hwtemps[PFS_NUM_TEMP_REGS];
+
+ /* Used to map Mesa's inputs/temps onto hardware temps */
+ int temp_in_use;
+ struct reg_acc temps[PFS_NUM_TEMP_REGS];
+ struct reg_acc inputs[32]; /* don't actually need 32... */
+
+ /* Track usage of hardware temps, for register allocation,
+ * indirection detection, etc. */
+ GLuint used_in_node;
+ GLuint dest_in_node;
+};
+
+/*
+ * Useful macros and values
+ */
+#define ERROR(fmt, args...) do { \
+ fprintf(stderr, "%s::%s(): " fmt "\n", \
+ __FILE__, __FUNCTION__, ##args); \
+ cs->compiler->fp->error = GL_TRUE; \
+ } while(0)
+
+#define PROG_CODE struct r500_fragment_program_code *code = cs->compiler->code
+
+#define R500_US_NUM_TEMP_REGS 128
+#define R500_US_NUM_CONST_REGS 256
+
+/* "Register" flags */
+#define REG_CONSTANT (1 << 8)
+#define REG_SRC_REL (1 << 9)
+#define REG_DEST_REL (1 << 7)
+
+/* Swizzle tools */
+#define R500_SWIZZLE_ZERO 4
+#define R500_SWIZZLE_HALF 5
+#define R500_SWIZZLE_ONE 6
+#define R500_SWIZ_RGB_ZERO ((4 << 0) | (4 << 3) | (4 << 6))
+#define R500_SWIZ_RGB_ONE ((6 << 0) | (6 << 3) | (6 << 6))
+#define R500_SWIZ_RGB_RGB ((0 << 0) | (1 << 3) | (2 << 6))
+#define R500_SWIZ_MOD_NEG 1
+#define R500_SWIZ_MOD_ABS 2
+#define R500_SWIZ_MOD_NEG_ABS 3
+/* Swizzles for inst2 */
+#define MAKE_SWIZ_TEX_STRQ(x) (x << 8)
+#define MAKE_SWIZ_TEX_RGBA(x) (x << 24)
+/* Swizzles for inst3 */
+#define MAKE_SWIZ_RGB_A(x) (x << 2)
+#define MAKE_SWIZ_RGB_B(x) (x << 15)
+/* Swizzles for inst4 */
+#define MAKE_SWIZ_ALPHA_A(x) (x << 14)
+#define MAKE_SWIZ_ALPHA_B(x) (x << 21)
+/* Swizzle for inst5 */
+#define MAKE_SWIZ_RGBA_C(x) (x << 14)
+#define MAKE_SWIZ_ALPHA_C(x) (x << 27)
+
+/* Writemasks */
+#define R500_WRITEMASK_G 0x2
+#define R500_WRITEMASK_B 0x4
+#define R500_WRITEMASK_RGB 0x7
+#define R500_WRITEMASK_A 0x8
+#define R500_WRITEMASK_AR 0x9
+#define R500_WRITEMASK_AG 0xA
+#define R500_WRITEMASK_ARG 0xB
+#define R500_WRITEMASK_AB 0xC
+#define R500_WRITEMASK_ARGB 0xF
+
+/* 1/(2pi), needed for quick modulus in trig insts
+ * Thanks to glisse for pointing out how to do it! */
+static const GLfloat RCP_2PI[] = {0.15915494309189535,
+ 0.15915494309189535,
+ 0.15915494309189535,
+ 0.15915494309189535};
+
+static const GLfloat LIT[] = {127.999999,
+ 127.999999,
+ 127.999999,
+ -127.999999};
+
+static inline GLuint make_rgb_swizzle(struct prog_src_register src) {
+ GLuint swiz = 0x0;
+ GLuint temp;
+ /* This could be optimized, but it should be plenty fast already. */
+ int i;
+ for (i = 0; i < 3; i++) {
+ temp = GET_SWZ(src.Swizzle, i);
+ /* Fix SWIZZLE_ONE */
+ if (temp == 5) temp++;
+ swiz |= temp << i*3;
+ }
+ if (src.NegateBase)
+ swiz |= (R500_SWIZ_MOD_NEG << 9);
+ return swiz;
+}
+
+static inline GLuint make_rgba_swizzle(GLuint src) {
+ GLuint swiz = 0x0;
+ GLuint temp;
+ int i;
+ for (i = 0; i < 4; i++) {
+ temp = GET_SWZ(src, i);
+ /* Fix SWIZZLE_ONE */
+ if (temp == 5) temp++;
+ swiz |= temp << i*3;
+ }
+ return swiz;
+}
+
+static inline GLuint make_alpha_swizzle(struct prog_src_register src) {
+ GLuint swiz = GET_SWZ(src.Swizzle, 3);
+
+ if (swiz == 5) swiz++;
+
+ if (src.NegateBase)
+ swiz |= (R500_SWIZ_MOD_NEG << 3);
+
+ return swiz;
+}
+
+static inline GLuint make_sop_swizzle(struct prog_src_register src) {
+ GLuint swiz = GET_SWZ(src.Swizzle, 0);
+
+ if (swiz == 5) swiz++;
+ return swiz;
+}
+
+static inline GLuint make_strq_swizzle(struct prog_src_register src) {
+ GLuint swiz = 0x0, temp = 0x0;
+ int i;
+ for (i = 0; i < 4; i++) {
+ temp = GET_SWZ(src.Swizzle, i) & 0x3;
+ swiz |= temp << i*2;
+ }
+ return swiz;
+}
+
+static int get_temp(struct r500_pfs_compile_state *cs, int slot) {
+
+ PROG_CODE;
+
+ int r = code->temp_reg_offset + cs->temp_in_use + slot;
+
+ if (r > R500_US_NUM_TEMP_REGS) {
+ ERROR("Too many temporary registers requested, can't compile!\n");
+ }
+
+ return r;
+}
+
+/* Borrowed verbatim from r300_fragprog since it hasn't changed. */
+static GLuint emit_const4fv(struct r500_pfs_compile_state *cs,
+ const GLfloat * cp)
+{
+ PROG_CODE;
+
+ GLuint reg = 0x0;
+ int index;
+
+ for (index = 0; index < code->const_nr; ++index) {
+ if (code->constant[index] == cp)
+ break;
+ }
+
+ if (index >= code->const_nr) {
+ if (index >= R500_US_NUM_CONST_REGS) {
+ ERROR("Out of hw constants!\n");
+ return reg;
+ }
+
+ code->const_nr++;
+ code->constant[index] = cp;
+ }
+
+ reg = index | REG_CONSTANT;
+ return reg;
+}
+
+static GLuint make_src(struct r500_pfs_compile_state *cs, struct prog_src_register src) {
+ PROG_CODE;
+ GLuint reg;
+ switch (src.File) {
+ case PROGRAM_TEMPORARY:
+ reg = src.Index + code->temp_reg_offset;
+ break;
+ case PROGRAM_INPUT:
+ reg = cs->inputs[src.Index].reg;
+ break;
+ case PROGRAM_LOCAL_PARAM:
+ reg = emit_const4fv(cs,
+ cs->compiler->fp->mesa_program.Base.LocalParams[src.Index]);
+ break;
+ case PROGRAM_ENV_PARAM:
+ reg = emit_const4fv(cs,
+ cs->compiler->compiler.Ctx->FragmentProgram.Parameters[src.Index]);
+ break;
+ case PROGRAM_STATE_VAR:
+ case PROGRAM_NAMED_PARAM:
+ case PROGRAM_CONSTANT:
+ reg = emit_const4fv(cs,
+ cs->compiler->fp->mesa_program.Base.Parameters->ParameterValues[src.Index]);
+ break;
+ case PROGRAM_BUILTIN:
+ reg = 0x0;
+ break;
+ default:
+ ERROR("Can't handle src.File %x\n", src.File);
+ reg = 0x0;
+ break;
+ }
+ return reg;
+}
+
+static GLuint make_dest(struct r500_pfs_compile_state *cs, struct prog_dst_register dest) {
+ PROG_CODE;
+ GLuint reg;
+ switch (dest.File) {
+ case PROGRAM_TEMPORARY:
+ reg = dest.Index + code->temp_reg_offset;
+ break;
+ case PROGRAM_OUTPUT:
+ /* Eventually we may need to handle multiple
+ * rendering targets... */
+ reg = dest.Index;
+ break;
+ case PROGRAM_BUILTIN:
+ reg = 0x0;
+ break;
+ default:
+ ERROR("Can't handle dest.File %x\n", dest.File);
+ reg = 0x0;
+ break;
+ }
+ return reg;
+}
+
+static void emit_tex(struct r500_pfs_compile_state *cs,
+ struct prog_instruction *fpi, int dest, int counter)
+{
+ PROG_CODE;
+ int hwsrc, hwdest;
+ GLuint mask;
+
+ mask = fpi->DstReg.WriteMask << 11;
+ hwsrc = make_src(cs, fpi->SrcReg[0]);
+
+ if (fpi->DstReg.File == PROGRAM_OUTPUT) {
+ hwdest = get_temp(cs, 0);
+ } else {
+ hwdest = dest;
+ }
+
+ code->inst[counter].inst0 = R500_INST_TYPE_TEX | mask
+ | R500_INST_TEX_SEM_WAIT;
+
+ code->inst[counter].inst1 = R500_TEX_ID(fpi->TexSrcUnit)
+ | R500_TEX_SEM_ACQUIRE | R500_TEX_IGNORE_UNCOVERED;
+
+ if (fpi->TexSrcTarget == TEXTURE_RECT_INDEX)
+ code->inst[counter].inst1 |= R500_TEX_UNSCALED;
+
+ switch (fpi->Opcode) {
+ case OPCODE_KIL:
+ code->inst[counter].inst1 |= R500_TEX_INST_TEXKILL;
+ break;
+ case OPCODE_TEX:
+ code->inst[counter].inst1 |= R500_TEX_INST_LD;
+ break;
+ case OPCODE_TXB:
+ code->inst[counter].inst1 |= R500_TEX_INST_LODBIAS;
+ break;
+ case OPCODE_TXP:
+ code->inst[counter].inst1 |= R500_TEX_INST_PROJ;
+ break;
+ default:
+ ERROR("emit_tex can't handle opcode %x\n", fpi->Opcode);
+ }
+
+ code->inst[counter].inst2 = R500_TEX_SRC_ADDR(hwsrc)
+ | MAKE_SWIZ_TEX_STRQ(make_strq_swizzle(fpi->SrcReg[0]))
+ /* | R500_TEX_SRC_S_SWIZ_R | R500_TEX_SRC_T_SWIZ_G
+ | R500_TEX_SRC_R_SWIZ_B | R500_TEX_SRC_Q_SWIZ_A */
+ | R500_TEX_DST_ADDR(hwdest)
+ | R500_TEX_DST_R_SWIZ_R | R500_TEX_DST_G_SWIZ_G
+ | R500_TEX_DST_B_SWIZ_B | R500_TEX_DST_A_SWIZ_A;
+
+ code->inst[counter].inst3 = 0x0;
+ code->inst[counter].inst4 = 0x0;
+ code->inst[counter].inst5 = 0x0;
+
+ if (fpi->DstReg.File == PROGRAM_OUTPUT) {
+ counter++;
+ code->inst[counter].inst0 = R500_INST_TYPE_OUT
+ | R500_INST_TEX_SEM_WAIT | (mask << 4);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0));
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0));
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB)
+ | R500_ALU_RGB_SEL_B_SRC0
+ | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB)
+ | R500_ALU_RGB_OMOD_DISABLE;
+ code->inst[counter].inst4 = R500_ALPHA_OP_CMP
+ | R500_ALPHA_ADDRD(dest)
+ | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_ALPHA_SWIZ_A_A)
+ | R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(R500_ALPHA_SWIZ_A_A)
+ | R500_ALPHA_OMOD_DISABLE;
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
+ | R500_ALU_RGBA_ADDRD(dest)
+ | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+ | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
+ }
+}
+
+static void emit_alu(struct r500_pfs_compile_state *cs, int counter, struct prog_instruction *fpi) {
+ PROG_CODE;
+ /* Ideally, we shouldn't have to explicitly clear memory here! */
+ code->inst[counter].inst0 = 0x0;
+ code->inst[counter].inst1 = 0x0;
+ code->inst[counter].inst2 = 0x0;
+ code->inst[counter].inst3 = 0x0;
+ code->inst[counter].inst4 = 0x0;
+ code->inst[counter].inst5 = 0x0;
+
+ if (fpi->DstReg.File == PROGRAM_OUTPUT) {
+ code->inst[counter].inst0 = R500_INST_TYPE_OUT;
+
+ if (fpi->DstReg.Index == FRAG_RESULT_COLR)
+ code->inst[counter].inst0 |= (fpi->DstReg.WriteMask << 15);
+
+ if (fpi->DstReg.Index == FRAG_RESULT_DEPR) {
+ code->inst[counter].inst4 |= R500_ALPHA_W_OMASK;
+ /* Notify the state emission! */
+ cs->compiler->fp->writes_depth = GL_TRUE;
+ }
+ } else {
+ code->inst[counter].inst0 = R500_INST_TYPE_ALU
+ /* pixel_mask */
+ | (fpi->DstReg.WriteMask << 11);
+ }
+
+ code->inst[counter].inst0 |= R500_INST_TEX_SEM_WAIT;
+}
+
+static void emit_mov(struct r500_pfs_compile_state *cs, int counter, struct prog_instruction *fpi, GLuint src_reg, GLuint swizzle, GLuint dest) {
+ PROG_CODE;
+ /* The r3xx shader uses MAD to implement MOV. We are using CMP, since
+ * it is technically more accurate and recommended by ATI/AMD. */
+ emit_alu(cs, counter, fpi);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(src_reg);
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(src_reg);
+ /* (De)mangle the swizzle from Mesa to R500. */
+ swizzle = make_rgba_swizzle(swizzle);
+ /* 0x1FF is 9 bits, size of an RGB swizzle. */
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A((swizzle & 0x1ff))
+ | R500_ALU_RGB_SEL_B_SRC0
+ | MAKE_SWIZ_RGB_B((swizzle & 0x1ff))
+ | R500_ALU_RGB_OMOD_DISABLE;
+ code->inst[counter].inst4 |= R500_ALPHA_OP_CMP
+ | R500_ALPHA_ADDRD(dest)
+ | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(GET_SWZ(swizzle, 3))
+ | R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(GET_SWZ(swizzle, 3))
+ | R500_ALPHA_OMOD_DISABLE;
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
+ | R500_ALU_RGBA_ADDRD(dest)
+ | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+ | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
+}
+
+static void emit_mad(struct r500_pfs_compile_state *cs, int counter, struct prog_instruction *fpi, int one, int two, int three) {
+ PROG_CODE;
+ /* Note: This code was all Corbin's. Corbin is a rather hackish coder.
+ * If you can make it pretty or fast, please do so! */
+ emit_alu(cs, counter, fpi);
+ /* Common MAD stuff */
+ code->inst[counter].inst4 |= R500_ALPHA_OP_MAD
+ | R500_ALPHA_ADDRD(make_dest(cs, fpi->DstReg));
+ code->inst[counter].inst5 |= R500_ALU_RGBA_OP_MAD
+ | R500_ALU_RGBA_ADDRD(make_dest(cs, fpi->DstReg));
+ switch (one) {
+ case 0:
+ case 1:
+ case 2:
+ code->inst[counter].inst1 |= R500_RGB_ADDR0(make_src(cs, fpi->SrcReg[one]));
+ code->inst[counter].inst2 |= R500_ALPHA_ADDR0(make_src(cs, fpi->SrcReg[one]));
+ code->inst[counter].inst3 |= R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[one]));
+ code->inst[counter].inst4 |= R500_ALPHA_SEL_A_SRC0
+ | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[one]));
+ break;
+ case R500_SWIZZLE_ZERO:
+ code->inst[counter].inst3 |= MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ZERO);
+ code->inst[counter].inst4 |= MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ZERO);
+ break;
+ case R500_SWIZZLE_ONE:
+ code->inst[counter].inst3 |= MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE);
+ code->inst[counter].inst4 |= MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE);
+ break;
+ default:
+ ERROR("Bad src index in emit_mad: %d\n", one);
+ break;
+ }
+ switch (two) {
+ case 0:
+ case 1:
+ case 2:
+ code->inst[counter].inst1 |= R500_RGB_ADDR1(make_src(cs, fpi->SrcReg[two]));
+ code->inst[counter].inst2 |= R500_ALPHA_ADDR1(make_src(cs, fpi->SrcReg[two]));
+ code->inst[counter].inst3 |= R500_ALU_RGB_SEL_B_SRC1
+ | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[two]));
+ code->inst[counter].inst4 |= R500_ALPHA_SEL_B_SRC1
+ | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[two]));
+ break;
+ case R500_SWIZZLE_ZERO:
+ code->inst[counter].inst3 |= MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ZERO);
+ code->inst[counter].inst4 |= MAKE_SWIZ_ALPHA_B(R500_SWIZZLE_ZERO);
+ break;
+ case R500_SWIZZLE_ONE:
+ code->inst[counter].inst3 |= MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ONE);
+ code->inst[counter].inst4 |= MAKE_SWIZ_ALPHA_B(R500_SWIZZLE_ONE);
+ break;
+ default:
+ ERROR("Bad src index in emit_mad: %d\n", two);
+ break;
+ }
+ switch (three) {
+ case 0:
+ case 1:
+ case 2:
+ code->inst[counter].inst1 |= R500_RGB_ADDR2(make_src(cs, fpi->SrcReg[three]));
+ code->inst[counter].inst2 |= R500_ALPHA_ADDR2(make_src(cs, fpi->SrcReg[three]));
+ code->inst[counter].inst5 |= R500_ALU_RGBA_SEL_C_SRC2
+ | MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[three]))
+ | R500_ALU_RGBA_ALPHA_SEL_C_SRC2
+ | MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[three]));
+ break;
+ case R500_SWIZZLE_ZERO:
+ code->inst[counter].inst5 |= MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+ | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
+ break;
+ case R500_SWIZZLE_ONE:
+ code->inst[counter].inst5 |= MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ONE)
+ | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ONE);
+ break;
+ default:
+ ERROR("Bad src index in emit_mad: %d\n", three);
+ break;
+ }
+}
+
+static void emit_sop(struct r500_pfs_compile_state *cs, int counter, struct prog_instruction *fpi, int opcode, GLuint src, GLuint swiz, GLuint dest) {
+ PROG_CODE;
+ emit_alu(cs, counter, fpi);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(src);
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(src);
+ code->inst[counter].inst4 |= R500_ALPHA_ADDRD(dest)
+ | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(swiz);
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_SOP
+ | R500_ALU_RGBA_ADDRD(dest);
+ switch (opcode) {
+ case OPCODE_COS:
+ code->inst[counter].inst4 |= R500_ALPHA_OP_COS;
+ break;
+ case OPCODE_EX2:
+ code->inst[counter].inst4 |= R500_ALPHA_OP_EX2;
+ break;
+ case OPCODE_LG2:
+ code->inst[counter].inst4 |= R500_ALPHA_OP_LN2;
+ break;
+ case OPCODE_RCP:
+ code->inst[counter].inst4 |= R500_ALPHA_OP_RCP;
+ break;
+ case OPCODE_RSQ:
+ code->inst[counter].inst4 |= R500_ALPHA_OP_RSQ;
+ break;
+ case OPCODE_SIN:
+ code->inst[counter].inst4 |= R500_ALPHA_OP_SIN;
+ break;
+ default:
+ ERROR("Bad opcode in emit_sop: %d\n", opcode);
+ break;
+ }
+}
+
+static int do_inst(struct r500_pfs_compile_state *cs, struct prog_instruction *fpi, int counter) {
+ PROG_CODE;
+ GLuint src[3], dest = 0;
+ int temp_swiz = 0;
+
+ if (fpi->Opcode != OPCODE_KIL) {
+ dest = make_dest(cs, fpi->DstReg);
+ }
+
+ switch (fpi->Opcode) {
+ case OPCODE_ABS:
+ emit_mov(cs, counter, fpi, make_src(cs, fpi->SrcReg[0]), fpi->SrcReg[0].Swizzle, dest);
+ code->inst[counter].inst3 |= R500_ALU_RGB_MOD_A_ABS
+ | R500_ALU_RGB_MOD_B_ABS;
+ code->inst[counter].inst4 |= R500_ALPHA_MOD_A_ABS
+ | R500_ALPHA_MOD_B_ABS;
+ break;
+ case OPCODE_ADD:
+ /* Variation on MAD: 1*src0+src1 */
+ emit_mad(cs, counter, fpi, R500_SWIZZLE_ONE, 0, 1);
+ break;
+ case OPCODE_CMP:
+ /* This inst's selects need to be swapped as follows:
+ * 0 -> C ; 1 -> B ; 2 -> A */
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ src[1] = make_src(cs, fpi->SrcReg[1]);
+ src[2] = make_src(cs, fpi->SrcReg[2]);
+ emit_alu(cs, counter, fpi);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(src[2])
+ | R500_RGB_ADDR1(src[1]) | R500_RGB_ADDR2(src[0]);
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[2])
+ | R500_ALPHA_ADDR1(src[1]) | R500_ALPHA_ADDR2(src[0]);
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[2]))
+ | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+ code->inst[counter].inst4 |= R500_ALPHA_OP_CMP
+ | R500_ALPHA_ADDRD(dest)
+ | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[2]))
+ | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
+ | R500_ALU_RGBA_ADDRD(dest)
+ | R500_ALU_RGBA_SEL_C_SRC2
+ | MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[0]))
+ | R500_ALU_RGBA_ALPHA_SEL_C_SRC2
+ | MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[0]));
+ break;
+ case OPCODE_COS:
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ src[1] = emit_const4fv(cs, RCP_2PI);
+ code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
+ | (R500_WRITEMASK_ARGB << 11);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+ | R500_RGB_ADDR1(src[1]);
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+ | R500_ALPHA_ADDR1(src[1]);
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB)
+ | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB);
+ code->inst[counter].inst4 = R500_ALPHA_OP_MAD
+ | R500_ALPHA_ADDRD(get_temp(cs, 0))
+ | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A
+ | R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+ | R500_ALU_RGBA_ADDRD(get_temp(cs, 0))
+ | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+ | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
+ counter++;
+ code->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0));
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0));
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB);
+ code->inst[counter].inst4 = R500_ALPHA_OP_FRC
+ | R500_ALPHA_ADDRD(get_temp(cs, 1))
+ | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A;
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC
+ | R500_ALU_RGBA_ADDRD(get_temp(cs, 1));
+ counter++;
+ emit_sop(cs, counter, fpi, OPCODE_COS, get_temp(cs, 1), make_sop_swizzle(fpi->SrcReg[0]), dest);
+ break;
+ case OPCODE_DP3:
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ src[1] = make_src(cs, fpi->SrcReg[1]);
+ emit_alu(cs, counter, fpi);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+ | R500_RGB_ADDR1(src[1]);
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+ | R500_ALPHA_ADDR1(src[1]);
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+ | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+ code->inst[counter].inst4 |= R500_ALPHA_OP_DP
+ | R500_ALPHA_ADDRD(dest)
+ | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+ | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_DP3
+ | R500_ALU_RGBA_ADDRD(dest);
+ break;
+ case OPCODE_DP4:
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ src[1] = make_src(cs, fpi->SrcReg[1]);
+ /* Based on DP3 */
+ emit_alu(cs, counter, fpi);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+ | R500_RGB_ADDR1(src[1]);
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+ | R500_ALPHA_ADDR1(src[1]);
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+ | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+ code->inst[counter].inst4 |= R500_ALPHA_OP_DP
+ | R500_ALPHA_ADDRD(dest)
+ | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+ | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_DP4
+ | R500_ALU_RGBA_ADDRD(dest);
+ break;
+ case OPCODE_DPH:
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ src[1] = make_src(cs, fpi->SrcReg[1]);
+ /* Based on DP3 */
+ emit_alu(cs, counter, fpi);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+ | R500_RGB_ADDR1(src[1]);
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+ | R500_ALPHA_ADDR1(src[1]);
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+ | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+ code->inst[counter].inst4 |= R500_ALPHA_OP_DP
+ | R500_ALPHA_ADDRD(dest)
+ | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
+ | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_DP4
+ | R500_ALU_RGBA_ADDRD(dest);
+ break;
+ case OPCODE_DST:
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ src[1] = make_src(cs, fpi->SrcReg[1]);
+ /* [1, src0.y*src1.y, src0.z, src1.w]
+ * So basically MUL with lotsa swizzling. */
+ emit_alu(cs, counter, fpi);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+ | R500_RGB_ADDR1(src[1]);
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+ | R500_ALPHA_ADDR1(src[1]);
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | R500_ALU_RGB_SEL_B_SRC1;
+ /* Select [1, y, z, 1] */
+ temp_swiz = (make_rgb_swizzle(fpi->SrcReg[0]) & ~0x7) | R500_SWIZZLE_ONE;
+ code->inst[counter].inst3 |= MAKE_SWIZ_RGB_A(temp_swiz);
+ /* Select [1, y, 1, w] */
+ temp_swiz = (make_rgb_swizzle(fpi->SrcReg[0]) & ~0x1c7) | R500_SWIZZLE_ONE | (R500_SWIZZLE_ONE << 6);
+ code->inst[counter].inst3 |= MAKE_SWIZ_RGB_B(temp_swiz);
+ code->inst[counter].inst4 |= R500_ALPHA_OP_MAD
+ | R500_ALPHA_ADDRD(dest)
+ | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
+ | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+ | R500_ALU_RGBA_ADDRD(dest)
+ | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+ | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
+ break;
+ case OPCODE_EX2:
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ emit_sop(cs, counter, fpi, OPCODE_EX2, src[0], make_sop_swizzle(fpi->SrcReg[0]), dest);
+ break;
+ case OPCODE_FLR:
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ code->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]);
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]);
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]));
+ code->inst[counter].inst4 |= R500_ALPHA_OP_FRC
+ | R500_ALPHA_ADDRD(get_temp(cs, 0))
+ | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]));
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC
+ | R500_ALU_RGBA_ADDRD(get_temp(cs, 0));
+ counter++;
+ emit_alu(cs, counter, fpi);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+ | R500_RGB_ADDR1(get_temp(cs, 0));
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+ | R500_ALPHA_ADDR1(get_temp(cs, 0));
+ code->inst[counter].inst3 = MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE)
+ | R500_ALU_RGB_SEL_B_SRC0 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[0]));
+ code->inst[counter].inst4 = R500_ALPHA_OP_MAD
+ | R500_ALPHA_ADDRD(dest)
+ | R500_ALPHA_SWIZ_A_A
+ | R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[0]));
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+ | R500_ALU_RGBA_ADDRD(dest)
+ | R500_ALU_RGBA_SEL_C_SRC1
+ | MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[0]))
+ | R500_ALU_RGBA_ALPHA_SEL_C_SRC1
+ | MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[0]))
+ | R500_ALU_RGBA_MOD_C_NEG;
+ break;
+ case OPCODE_FRC:
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ emit_alu(cs, counter, fpi);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]);
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]);
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]));
+ code->inst[counter].inst4 |= R500_ALPHA_OP_FRC
+ | R500_ALPHA_ADDRD(dest)
+ | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]));
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC
+ | R500_ALU_RGBA_ADDRD(dest);
+ break;
+ case OPCODE_LG2:
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ emit_sop(cs, counter, fpi, OPCODE_LG2, src[0], make_sop_swizzle(fpi->SrcReg[0]), dest);
+ break;
+ case OPCODE_LIT:
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ src[1] = emit_const4fv(cs, LIT);
+ /* First inst: MAX temp, input, [0, 0, 0, -128]
+ * Write: RG, A */
+ code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
+ | (R500_WRITEMASK_ARG << 11);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) | R500_RGB_ADDR1(src[1]);
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) | R500_ALPHA_ADDR1(src[1]);
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+ | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ZERO);
+ code->inst[counter].inst4 = R500_ALPHA_OP_MAX
+ | R500_ALPHA_ADDRD(get_temp(cs, 0))
+ | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+ | R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAX
+ | R500_ALU_RGBA_ADDRD(get_temp(cs, 0));
+ counter++;
+ /* Second inst: MIN temp, temp, [x, x, x, 128]
+ * Write: A */
+ code->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_A << 11);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0)) | R500_RGB_ADDR1(src[1]);
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0)) | R500_ALPHA_ADDR1(src[1]);
+ /* code->inst[counter].inst3; */
+ code->inst[counter].inst4 = R500_ALPHA_OP_MAX
+ | R500_ALPHA_ADDRD(dest)
+ | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A
+ | R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAX
+ | R500_ALU_RGBA_ADDRD(dest);
+ counter++;
+ /* Third-fifth insts: POW temp, temp.y, temp.w
+ * Write: B */
+ emit_sop(cs, counter, fpi, OPCODE_LG2, get_temp(cs, 0), SWIZZLE_Y, get_temp(cs, 1));
+ code->inst[counter].inst0 |= (R500_WRITEMASK_ARGB << 11);
+ counter++;
+ code->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 1))
+ | R500_RGB_ADDR1(get_temp(cs, 0));
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 1))
+ | R500_ALPHA_ADDR1(get_temp(cs, 0));
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB)
+ | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB);
+ code->inst[counter].inst4 = R500_ALPHA_OP_MAD
+ | R500_ALPHA_ADDRD(get_temp(cs, 1))
+ | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A
+ | R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+ | R500_ALU_RGBA_ADDRD(get_temp(cs, 1))
+ | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+ | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
+ counter++;
+ emit_sop(cs, counter, fpi, OPCODE_EX2, get_temp(cs, 1), SWIZZLE_W, get_temp(cs, 0));
+ code->inst[counter].inst0 |= (R500_WRITEMASK_B << 11);
+ counter++;
+ /* Sixth inst: CMP dest, temp.xxxx, temp.[1, x, z, 1], temp.[1, x, 0, 1];
+ * Write: ARGB
+ * This inst's selects need to be swapped as follows:
+ * 0 -> C ; 1 -> B ; 2 -> A */
+ emit_alu(cs, counter, fpi);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0));
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0));
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | R500_ALU_RGB_R_SWIZ_A_1
+ | R500_ALU_RGB_G_SWIZ_A_R
+ | R500_ALU_RGB_B_SWIZ_A_B
+ | R500_ALU_RGB_SEL_B_SRC0
+ | R500_ALU_RGB_R_SWIZ_B_1
+ | R500_ALU_RGB_G_SWIZ_B_R
+ | R500_ALU_RGB_B_SWIZ_B_0;
+ code->inst[counter].inst4 |= R500_ALPHA_OP_CMP
+ | R500_ALPHA_ADDRD(dest)
+ | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_1
+ | R500_ALPHA_SEL_B_SRC0 | R500_ALPHA_SWIZ_B_1;
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
+ | R500_ALU_RGBA_ADDRD(dest)
+ | R500_ALU_RGBA_SEL_C_SRC0
+ | R500_ALU_RGBA_ALPHA_SEL_C_SRC0
+ | R500_ALU_RGBA_R_SWIZ_R
+ | R500_ALU_RGBA_G_SWIZ_R
+ | R500_ALU_RGBA_B_SWIZ_R
+ | R500_ALU_RGBA_A_SWIZ_R;
+ break;
+ case OPCODE_LRP:
+ /* src0 * src1 + INV(src0) * src2
+ * 1) MUL src0, src1, temp
+ * 2) PRE 1-src0; MAD srcp, src2, temp */
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ src[1] = make_src(cs, fpi->SrcReg[1]);
+ src[2] = make_src(cs, fpi->SrcReg[2]);
+ code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
+ | R500_INST_NOP | (R500_WRITEMASK_ARGB << 11);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+ | R500_RGB_ADDR1(src[1]);
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+ | R500_ALPHA_ADDR1(src[1]);
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+ | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+ code->inst[counter].inst4 = R500_ALPHA_OP_MAD
+ | R500_ALPHA_ADDRD(get_temp(cs, 0))
+ | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+ | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+ | R500_ALU_RGBA_ADDRD(get_temp(cs, 0))
+ | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+ | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
+ counter++;
+ emit_alu(cs, counter, fpi);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+ | R500_RGB_ADDR1(src[2])
+ | R500_RGB_ADDR2(get_temp(cs, 0))
+ | R500_RGB_SRCP_OP_1_MINUS_RGB0;
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+ | R500_ALPHA_ADDR1(src[2])
+ | R500_ALPHA_ADDR2(get_temp(cs, 0))
+ | R500_ALPHA_SRCP_OP_1_MINUS_A0;
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRCP
+ | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+ | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB);
+ code->inst[counter].inst4 |= R500_ALPHA_OP_MAD
+ | R500_ALPHA_ADDRD(dest)
+ | R500_ALPHA_SEL_A_SRCP | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+ | R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+ | R500_ALU_RGBA_ADDRD(dest)
+ | R500_ALU_RGBA_SEL_C_SRC2 | MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[2]))
+ | R500_ALU_RGBA_ALPHA_SEL_C_SRC2
+ | MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[2]));
+ break;
+ case OPCODE_MAD:
+ emit_mad(cs, counter, fpi, 0, 1, 2);
+ break;
+ case OPCODE_MAX:
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ src[1] = make_src(cs, fpi->SrcReg[1]);
+ emit_alu(cs, counter, fpi);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) | R500_RGB_ADDR1(src[1]);
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) | R500_ALPHA_ADDR1(src[1]);
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+ | R500_ALU_RGB_SEL_B_SRC1
+ | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+ code->inst[counter].inst4 |= R500_ALPHA_OP_MAX
+ | R500_ALPHA_ADDRD(dest)
+ | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+ | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAX
+ | R500_ALU_RGBA_ADDRD(dest);
+ break;
+ case OPCODE_MIN:
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ src[1] = make_src(cs, fpi->SrcReg[1]);
+ emit_alu(cs, counter, fpi);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) | R500_RGB_ADDR1(src[1]);
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) | R500_ALPHA_ADDR1(src[1]);
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+ | R500_ALU_RGB_SEL_B_SRC1
+ | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+ code->inst[counter].inst4 |= R500_ALPHA_OP_MIN
+ | R500_ALPHA_ADDRD(dest)
+ | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+ | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_MIN
+ | R500_ALU_RGBA_ADDRD(dest);
+ break;
+ case OPCODE_MOV:
+ emit_mov(cs, counter, fpi, make_src(cs, fpi->SrcReg[0]), fpi->SrcReg[0].Swizzle, dest);
+ break;
+ case OPCODE_MUL:
+ /* Variation on MAD: src0*src1+0 */
+ emit_mad(cs, counter, fpi, 0, 1, R500_SWIZZLE_ZERO);
+ break;
+ case OPCODE_POW:
+ /* POW(a,b) = EX2(LN2(a)*b) */
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ src[1] = make_src(cs, fpi->SrcReg[1]);
+ emit_sop(cs, counter, fpi, OPCODE_LG2, src[0], make_sop_swizzle(fpi->SrcReg[0]), get_temp(cs, 0));
+ code->inst[counter].inst0 |= (R500_WRITEMASK_ARGB << 11);
+ counter++;
+ code->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0))
+ | R500_RGB_ADDR1(src[1]);
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0))
+ | R500_ALPHA_ADDR1(src[1]);
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0]))
+ | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1]));
+ code->inst[counter].inst4 = R500_ALPHA_OP_MAD
+ | R500_ALPHA_ADDRD(get_temp(cs, 1))
+ | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+ | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+ | R500_ALU_RGBA_ADDRD(get_temp(cs, 1))
+ | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+ | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
+ counter++;
+ emit_sop(cs, counter, fpi, OPCODE_EX2, get_temp(cs, 1), make_sop_swizzle(fpi->SrcReg[0]), dest);
+ break;
+ case OPCODE_RCP:
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ emit_sop(cs, counter, fpi, OPCODE_RCP, src[0], make_sop_swizzle(fpi->SrcReg[0]), dest);
+ break;
+ case OPCODE_RSQ:
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ emit_sop(cs, counter, fpi, OPCODE_RSQ, src[0], make_sop_swizzle(fpi->SrcReg[0]), dest);
+ break;
+ case OPCODE_SCS:
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ src[1] = emit_const4fv(cs, RCP_2PI);
+ code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
+ | (R500_WRITEMASK_ARGB << 11);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+ | R500_RGB_ADDR1(src[1]);
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+ | R500_ALPHA_ADDR1(src[1]);
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB)
+ | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB);
+ code->inst[counter].inst4 = R500_ALPHA_OP_MAD
+ | R500_ALPHA_ADDRD(get_temp(cs, 0))
+ | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A
+ | R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+ | R500_ALU_RGBA_ADDRD(get_temp(cs, 0))
+ | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+ | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
+ counter++;
+ code->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0));
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0));
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB);
+ code->inst[counter].inst4 = R500_ALPHA_OP_FRC
+ | R500_ALPHA_ADDRD(get_temp(cs, 1))
+ | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A;
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC
+ | R500_ALU_RGBA_ADDRD(get_temp(cs, 1));
+ counter++;
+ /* Do a cosine, then a sine, masking out the channels we want to protect. */
+ /* Cosine only goes in R (x) channel. */
+ fpi->DstReg.WriteMask = 0x1;
+ emit_sop(cs, counter, fpi, OPCODE_COS, get_temp(cs, 1), make_sop_swizzle(fpi->SrcReg[0]), dest);
+ counter++;
+ /* Sine only goes in G (y) channel. */
+ fpi->DstReg.WriteMask = 0x2;
+ emit_sop(cs, counter, fpi, OPCODE_SIN, get_temp(cs, 1), make_sop_swizzle(fpi->SrcReg[0]), dest);
+ break;
+ case OPCODE_SGE:
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ src[1] = make_src(cs, fpi->SrcReg[1]);
+ code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
+ | (R500_WRITEMASK_ARGB << 11);
+ code->inst[counter].inst1 = R500_RGB_ADDR1(src[0])
+ | R500_RGB_ADDR2(src[1]);
+ code->inst[counter].inst2 = R500_ALPHA_ADDR1(src[0])
+ | R500_ALPHA_ADDR2(src[1]);
+ code->inst[counter].inst3 = /* 1 */
+ MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE)
+ | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[0]));
+ code->inst[counter].inst4 = R500_ALPHA_OP_MAD
+ | R500_ALPHA_ADDRD(get_temp(cs, 0))
+ | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
+ | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[0]));
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+ | R500_ALU_RGBA_ADDRD(get_temp(cs, 0))
+ | R500_ALU_RGBA_SEL_C_SRC2
+ | MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[1]))
+ | R500_ALU_RGBA_MOD_C_NEG
+ | R500_ALU_RGBA_ALPHA_SEL_C_SRC2
+ | MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[1]))
+ | R500_ALU_RGBA_ALPHA_MOD_C_NEG;
+ counter++;
+ /* This inst's selects need to be swapped as follows:
+ * 0 -> C ; 1 -> B ; 2 -> A */
+ emit_alu(cs, counter, fpi);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0));
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0));
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE)
+ | R500_ALU_RGB_SEL_B_SRC0
+ | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ZERO);
+ code->inst[counter].inst4 |= R500_ALPHA_OP_CMP
+ | R500_ALPHA_ADDRD(dest)
+ | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
+ | R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(R500_SWIZZLE_ZERO);
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
+ | R500_ALU_RGBA_ADDRD(dest)
+ | R500_ALU_RGBA_SEL_C_SRC0
+ | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_RGB)
+ | R500_ALU_RGBA_ALPHA_SEL_C_SRC0
+ | R500_ALU_RGBA_A_SWIZ_A;
+ break;
+ case OPCODE_SIN:
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ src[1] = emit_const4fv(cs, RCP_2PI);
+ code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
+ | (R500_WRITEMASK_ARGB << 11);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+ | R500_RGB_ADDR1(src[1]);
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+ | R500_ALPHA_ADDR1(src[1]);
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB)
+ | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB);
+ code->inst[counter].inst4 = R500_ALPHA_OP_MAD
+ | R500_ALPHA_ADDRD(get_temp(cs, 0))
+ | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A
+ | R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A;
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+ | R500_ALU_RGBA_ADDRD(get_temp(cs, 0))
+ | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+ | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
+ counter++;
+ code->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0));
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0));
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB);
+ code->inst[counter].inst4 = R500_ALPHA_OP_FRC
+ | R500_ALPHA_ADDRD(get_temp(cs, 1))
+ | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A;
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC
+ | R500_ALU_RGBA_ADDRD(get_temp(cs, 1));
+ counter++;
+ emit_sop(cs, counter, fpi, OPCODE_SIN, get_temp(cs, 1), make_sop_swizzle(fpi->SrcReg[0]), dest);
+ break;
+ case OPCODE_SLT:
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ src[1] = make_src(cs, fpi->SrcReg[1]);
+ code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
+ | (R500_WRITEMASK_ARGB << 11);
+ code->inst[counter].inst1 = R500_RGB_ADDR1(src[0])
+ | R500_RGB_ADDR2(src[1]);
+ code->inst[counter].inst2 = R500_ALPHA_ADDR1(src[0])
+ | R500_ALPHA_ADDR2(src[1]);
+ code->inst[counter].inst3 = /* 1 */
+ MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE)
+ | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[0]));
+ code->inst[counter].inst4 = R500_ALPHA_OP_MAD
+ | R500_ALPHA_ADDRD(get_temp(cs, 0))
+ | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE)
+ | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[0]));
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+ | R500_ALU_RGBA_ADDRD(get_temp(cs, 0))
+ | R500_ALU_RGBA_SEL_C_SRC2
+ | MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[1]))
+ | R500_ALU_RGBA_MOD_C_NEG
+ | R500_ALU_RGBA_ALPHA_SEL_C_SRC2
+ | MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[1]))
+ | R500_ALU_RGBA_ALPHA_MOD_C_NEG;
+ counter++;
+ /* This inst's selects need to be swapped as follows:
+ * 0 -> C ; 1 -> B ; 2 -> A */
+ emit_alu(cs, counter, fpi);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0));
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0));
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ZERO)
+ | R500_ALU_RGB_SEL_B_SRC0
+ | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ONE);
+ code->inst[counter].inst4 |= R500_ALPHA_OP_CMP
+ | R500_ALPHA_ADDRD(dest)
+ | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ZERO)
+ | R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(R500_SWIZZLE_ONE);
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP
+ | R500_ALU_RGBA_ADDRD(dest)
+ | R500_ALU_RGBA_SEL_C_SRC0
+ | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_RGB)
+ | R500_ALU_RGBA_ALPHA_SEL_C_SRC0
+ | R500_ALU_RGBA_A_SWIZ_A;
+ break;
+ case OPCODE_SUB:
+ /* Variation on MAD: 1*src0-src1 */
+ fpi->SrcReg[1].NegateBase = 0xF; /* NEG_XYZW */
+ emit_mad(cs, counter, fpi, R500_SWIZZLE_ONE, 0, 1);
+ break;
+ case OPCODE_SWZ:
+ /* TODO: The rarer negation masks! */
+ emit_mov(cs, counter, fpi, make_src(cs, fpi->SrcReg[0]), fpi->SrcReg[0].Swizzle, dest);
+ break;
+ case OPCODE_XPD:
+ /* src0 * src1 - src1 * src0
+ * 1) MUL temp.xyz, src0.yzx, src1.zxy
+ * 2) MAD src0.zxy, src1.yzx, -temp.xyz */
+ src[0] = make_src(cs, fpi->SrcReg[0]);
+ src[1] = make_src(cs, fpi->SrcReg[1]);
+ code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT
+ | (R500_WRITEMASK_RGB << 11);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+ | R500_RGB_ADDR1(src[1]);
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+ | R500_ALPHA_ADDR1(src[1]);
+ /* Select [y, z, x] */
+ temp_swiz = make_rgb_swizzle(fpi->SrcReg[0]);
+ temp_swiz = (GET_SWZ(temp_swiz, 1) << 0) | (GET_SWZ(temp_swiz, 2) << 3) | (GET_SWZ(temp_swiz, 0) << 6);
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(temp_swiz);
+ /* Select [z, x, y] */
+ temp_swiz = make_rgb_swizzle(fpi->SrcReg[1]);
+ temp_swiz = (GET_SWZ(temp_swiz, 2) << 0) | (GET_SWZ(temp_swiz, 0) << 3) | (GET_SWZ(temp_swiz, 1) << 6);
+ code->inst[counter].inst3 |= R500_ALU_RGB_SEL_B_SRC1
+ | MAKE_SWIZ_RGB_B(temp_swiz);
+ code->inst[counter].inst4 = R500_ALPHA_OP_MAD
+ | R500_ALPHA_ADDRD(get_temp(cs, 0))
+ | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0]))
+ | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1]));
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+ | R500_ALU_RGBA_ADDRD(get_temp(cs, 0))
+ | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO)
+ | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO);
+ counter++;
+ emit_alu(cs, counter, fpi);
+ code->inst[counter].inst1 = R500_RGB_ADDR0(src[0])
+ | R500_RGB_ADDR1(src[1])
+ | R500_RGB_ADDR2(get_temp(cs, 0));
+ code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0])
+ | R500_ALPHA_ADDR1(src[1])
+ | R500_ALPHA_ADDR2(get_temp(cs, 0));
+ /* Select [z, x, y] */
+ temp_swiz = make_rgb_swizzle(fpi->SrcReg[0]);
+ temp_swiz = (GET_SWZ(temp_swiz, 2) << 0) | (GET_SWZ(temp_swiz, 0) << 3) | (GET_SWZ(temp_swiz, 1) << 6);
+ code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0
+ | MAKE_SWIZ_RGB_A(temp_swiz);
+ /* Select [y, z, x] */
+ temp_swiz = make_rgb_swizzle(fpi->SrcReg[1]);
+ temp_swiz = (GET_SWZ(temp_swiz, 1) << 0) | (GET_SWZ(temp_swiz, 2) << 3) | (GET_SWZ(temp_swiz, 0) << 6);
+ code->inst[counter].inst3 |= R500_ALU_RGB_SEL_B_SRC1
+ | MAKE_SWIZ_RGB_B(temp_swiz);
+ code->inst[counter].inst4 |= R500_ALPHA_OP_MAD
+ | R500_ALPHA_ADDRD(dest)
+ | R500_ALPHA_SWIZ_A_1
+ | R500_ALPHA_SWIZ_B_1;
+ code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD
+ | R500_ALU_RGBA_ADDRD(dest)
+ | R500_ALU_RGBA_SEL_C_SRC2
+ | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_RGB)
+ | R500_ALU_RGBA_MOD_C_NEG
+ | R500_ALU_RGBA_A_SWIZ_0;
+ break;
+ case OPCODE_KIL:
+ case OPCODE_TEX:
+ case OPCODE_TXB:
+ case OPCODE_TXP:
+ emit_tex(cs, fpi, dest, counter);
+ if (fpi->DstReg.File == PROGRAM_OUTPUT)
+ counter++;
+ break;
+ default:
+ ERROR("unknown fpi->Opcode %s\n", _mesa_opcode_string(fpi->Opcode));
+ break;
+ }
+
+ /* Finishing touches */
+ if (fpi->SaturateMode == SATURATE_ZERO_ONE) {
+ code->inst[counter].inst0 |= R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP;
+ }
+
+ counter++;
+
+ return counter;
+}
+
+static GLboolean parse_program(struct r500_pfs_compile_state *cs)
+{
+ PROG_CODE;
+ int clauseidx, counter = 0;
+
+ for (clauseidx = 0; clauseidx < cs->compiler->compiler.NumClauses; clauseidx++) {
+ struct radeon_clause* clause = &cs->compiler->compiler.Clauses[clauseidx];
+ struct prog_instruction* fpi;
+
+ int ip;
+
+ for (ip = 0; ip < clause->NumInstructions; ip++) {
+ fpi = clause->Instructions + ip;
+ counter = do_inst(cs, fpi, counter);
+
+ if (cs->compiler->fp->error)
+ return GL_FALSE;
+ }
+ }
+
+ /* Finish him! (If it's an ALU/OUT instruction...) */
+ if ((code->inst[counter-1].inst0 & 0x3) == 1) {
+ code->inst[counter-1].inst0 |= R500_INST_LAST;
+ } else {
+ /* We still need to put an output inst, right? */
+ WARN_ONCE("Final FP instruction is not an OUT.\n");
+ }
+
+ cs->nrslots = counter;
+
+ code->max_temp_idx++;
+
+ return GL_TRUE;
+}
+
+static void init_program(struct r500_pfs_compile_state *cs)
+{
+ PROG_CODE;
+ struct gl_fragment_program *mp = &cs->compiler->fp->mesa_program;
+ struct prog_instruction *fpi;
+ GLuint InputsRead = mp->Base.InputsRead;
+ GLuint temps_used = 0;
+ int i, j;
+
+ /* New compile, reset tracking data */
+ cs->compiler->fp->optimization =
+ driQueryOptioni(&cs->compiler->r300->radeon.optionCache, "fp_optimization");
+ cs->compiler->fp->translated = GL_FALSE;
+ cs->compiler->fp->error = GL_FALSE;
+ code->const_nr = 0;
+ /* Size of pixel stack, plus 1. */
+ code->max_temp_idx = 1;
+ /* Temp register offset. */
+ code->temp_reg_offset = 0;
+ /* Whether or not we perform any depth writing. */
+ cs->compiler->fp->writes_depth = GL_FALSE;
+
+ for (i = 0; i < PFS_MAX_ALU_INST; i++) {
+ for (j = 0; j < 3; j++) {
+ cs->slot[i].vsrc[j] = SRC_CONST;
+ cs->slot[i].ssrc[j] = SRC_CONST;
+ }
+ }
+
+ /* Work out what temps the Mesa inputs correspond to, this must match
+ * what setup_rs_unit does, which shouldn't be a problem as rs_unit
+ * configures itself based on the fragprog's InputsRead
+ *
+ * NOTE: this depends on get_hw_temp() allocating registers in order,
+ * starting from register 0, so we're just going to do that instead.
+ */
+
+ /* Texcoords come first */
+ for (i = 0; i < cs->compiler->fp->ctx->Const.MaxTextureUnits; i++) {
+ if (InputsRead & (FRAG_BIT_TEX0 << i)) {
+ cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0;
+ cs->inputs[FRAG_ATTRIB_TEX0 + i].reg =
+ code->temp_reg_offset;
+ code->temp_reg_offset++;
+ }
+ }
+ InputsRead &= ~FRAG_BITS_TEX_ANY;
+
+ /* fragment position treated as a texcoord */
+ if (InputsRead & FRAG_BIT_WPOS) {
+ cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
+ cs->inputs[FRAG_ATTRIB_WPOS].reg =
+ code->temp_reg_offset;
+ code->temp_reg_offset++;
+ }
+ InputsRead &= ~FRAG_BIT_WPOS;
+
+ /* Then primary colour */
+ if (InputsRead & FRAG_BIT_COL0) {
+ cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
+ cs->inputs[FRAG_ATTRIB_COL0].reg =
+ code->temp_reg_offset;
+ code->temp_reg_offset++;
+ }
+ InputsRead &= ~FRAG_BIT_COL0;
+
+ /* Secondary color */
+ if (InputsRead & FRAG_BIT_COL1) {
+ cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
+ cs->inputs[FRAG_ATTRIB_COL1].reg =
+ code->temp_reg_offset;
+ code->temp_reg_offset++;
+ }
+ InputsRead &= ~FRAG_BIT_COL1;
+
+ /* Anything else */
+ if (InputsRead) {
+ WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead);
+ /* force read from hwreg 0 for now */
+ for (i = 0; i < 32; i++)
+ if (InputsRead & (1 << i))
+ cs->inputs[i].reg = 0;
+ }
+
+ int clauseidx;
+
+ for (clauseidx = 0; clauseidx < cs->compiler->compiler.NumClauses; ++clauseidx) {
+ struct radeon_clause* clause = &cs->compiler->compiler.Clauses[clauseidx];
+ int ip;
+
+ for (ip = 0; ip < clause->NumInstructions; ip++) {
+ fpi = clause->Instructions + ip;
+ for (i = 0; i < 3; i++) {
+ if (fpi->SrcReg[i].File == PROGRAM_TEMPORARY) {
+ if (fpi->SrcReg[i].Index >= temps_used)
+ temps_used = fpi->SrcReg[i].Index + 1;
+ }
+ }
+ }
+ }
+
+
+ cs->temp_in_use = temps_used + 1;
+
+ code->max_temp_idx = code->temp_reg_offset + cs->temp_in_use;
+
+ if (RADEON_DEBUG & DEBUG_PIXEL)
+ fprintf(stderr, "FP temp indices: code->max_temp_idx: %d cs->temp_in_use: %d\n", code->max_temp_idx, cs->temp_in_use);
+}
+
+static void dumb_shader(struct r500_pfs_compile_state *cs)
+{
+ PROG_CODE;
+ code->inst[0].inst0 = R500_INST_TYPE_TEX
+ | R500_INST_TEX_SEM_WAIT
+ | R500_INST_RGB_WMASK_R
+ | R500_INST_RGB_WMASK_G
+ | R500_INST_RGB_WMASK_B
+ | R500_INST_ALPHA_WMASK
+ | R500_INST_RGB_CLAMP
+ | R500_INST_ALPHA_CLAMP;
+ code->inst[0].inst1 = R500_TEX_ID(0)
+ | R500_TEX_INST_LD
+ | R500_TEX_SEM_ACQUIRE
+ | R500_TEX_IGNORE_UNCOVERED;
+ code->inst[0].inst2 = R500_TEX_SRC_ADDR(0)
+ | R500_TEX_SRC_S_SWIZ_R
+ | R500_TEX_SRC_T_SWIZ_G
+ | R500_TEX_DST_ADDR(0)
+ | R500_TEX_DST_R_SWIZ_R
+ | R500_TEX_DST_G_SWIZ_G
+ | R500_TEX_DST_B_SWIZ_B
+ | R500_TEX_DST_A_SWIZ_A;
+ code->inst[0].inst3 = R500_DX_ADDR(0)
+ | R500_DX_S_SWIZ_R
+ | R500_DX_T_SWIZ_R
+ | R500_DX_R_SWIZ_R
+ | R500_DX_Q_SWIZ_R
+ | R500_DY_ADDR(0)
+ | R500_DY_S_SWIZ_R
+ | R500_DY_T_SWIZ_R
+ | R500_DY_R_SWIZ_R
+ | R500_DY_Q_SWIZ_R;
+ code->inst[0].inst4 = 0x0;
+ code->inst[0].inst5 = 0x0;
+
+ code->inst[1].inst0 = R500_INST_TYPE_OUT |
+ R500_INST_TEX_SEM_WAIT |
+ R500_INST_LAST |
+ R500_INST_RGB_OMASK_R |
+ R500_INST_RGB_OMASK_G |
+ R500_INST_RGB_OMASK_B |
+ R500_INST_ALPHA_OMASK;
+ code->inst[1].inst1 = R500_RGB_ADDR0(0) |
+ R500_RGB_ADDR1(0) |
+ R500_RGB_ADDR1_CONST |
+ R500_RGB_ADDR2(0) |
+ R500_RGB_ADDR2_CONST |
+ R500_RGB_SRCP_OP_1_MINUS_2RGB0;
+ code->inst[1].inst2 = R500_ALPHA_ADDR0(0) |
+ R500_ALPHA_ADDR1(0) |
+ R500_ALPHA_ADDR1_CONST |
+ R500_ALPHA_ADDR2(0) |
+ R500_ALPHA_ADDR2_CONST |
+ R500_ALPHA_SRCP_OP_1_MINUS_2A0;
+ code->inst[1].inst3 = R500_ALU_RGB_SEL_A_SRC0 |
+ R500_ALU_RGB_R_SWIZ_A_R |
+ R500_ALU_RGB_G_SWIZ_A_G |
+ R500_ALU_RGB_B_SWIZ_A_B |
+ R500_ALU_RGB_SEL_B_SRC0 |
+ R500_ALU_RGB_R_SWIZ_B_1 |
+ R500_ALU_RGB_B_SWIZ_B_1 |
+ R500_ALU_RGB_G_SWIZ_B_1;
+ code->inst[1].inst4 = R500_ALPHA_OP_MAD |
+ R500_ALPHA_SWIZ_A_A |
+ R500_ALPHA_SWIZ_B_1;
+ code->inst[1].inst5 = R500_ALU_RGBA_OP_MAD |
+ R500_ALU_RGBA_R_SWIZ_0 |
+ R500_ALU_RGBA_G_SWIZ_0 |
+ R500_ALU_RGBA_B_SWIZ_0 |
+ R500_ALU_RGBA_A_SWIZ_0;
+
+ cs->nrslots = 2;
+}
+
+GLboolean r500FragmentProgramEmit(struct r500_fragment_program_compiler *compiler)
+{
+ struct r500_pfs_compile_state cs;
+ struct r500_fragment_program_code *code = compiler->code;
+
+ _mesa_memset(&cs, 0, sizeof(cs));
+ cs.compiler = compiler;
+ init_program(&cs);
+
+ if (!parse_program(&cs)) {
+#if 0
+ ERROR("Huh. Couldn't parse program. There should be additional errors explaining why.\nUsing dumb shader...\n");
+ dumb_shader(fp);
+ code->inst_offset = 0;
+ code->inst_end = cs.nrslots - 1;
+#endif
+ return GL_FALSE;
+ }
+
+ code->inst_offset = 0;
+ code->inst_end = cs.nrslots - 1;
+
+ return GL_TRUE;
+}
diff --git a/src/mesa/drivers/dri/r300/radeon_context.h b/src/mesa/drivers/dri/r300/radeon_context.h
index 38d89306016..7458d63723f 100644
--- a/src/mesa/drivers/dri/r300/radeon_context.h
+++ b/src/mesa/drivers/dri/r300/radeon_context.h
@@ -53,16 +53,6 @@ struct radeon_context;
typedef struct radeon_context radeonContextRec;
typedef struct radeon_context *radeonContextPtr;
-#define TEX_0 0x1
-#define TEX_1 0x2
-#define TEX_2 0x4
-#define TEX_3 0x8
-#define TEX_4 0x10
-#define TEX_5 0x20
-#define TEX_6 0x40
-#define TEX_7 0x80
-#define TEX_ALL 0xff
-
/* Rasterizing fallbacks */
/* See correponding strings in r200_swtcl.c */
#define RADEON_FALLBACK_TEXTURE 0x0001
diff --git a/src/mesa/drivers/dri/r300/radeon_program.c b/src/mesa/drivers/dri/r300/radeon_program.c
new file mode 100644
index 00000000000..c8f40e81893
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_program.c
@@ -0,0 +1,281 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_program.h"
+
+#include "shader/prog_print.h"
+
+/**
+ * Initialize a compiler structure with a single mixed clause
+ * containing all instructions from the source program.
+ */
+void radeonCompilerInit(
+ struct radeon_compiler *compiler,
+ GLcontext *ctx,
+ struct gl_program *source)
+{
+ struct radeon_clause* clause;
+
+ _mesa_memset(compiler, 0, sizeof(*compiler));
+ compiler->Source = source;
+ compiler->Ctx = ctx;
+
+ compiler->NumTemporaries = source->NumTemporaries;
+
+ clause = radeonCompilerInsertClause(compiler, 0, CLAUSE_MIXED);
+ clause->NumInstructions = 0;
+ while(source->Instructions[clause->NumInstructions].Opcode != OPCODE_END)
+ clause->NumInstructions++;
+ clause->ReservedInstructions = clause->NumInstructions;
+ clause->Instructions = _mesa_alloc_instructions(clause->NumInstructions);
+ _mesa_copy_instructions(clause->Instructions, source->Instructions, clause->NumInstructions);
+}
+
+
+/**
+ * Free all data that is referenced by the compiler structure.
+ * However, the compiler structure itself is not freed.
+ */
+void radeonCompilerCleanup(struct radeon_compiler *compiler)
+{
+ radeonCompilerEraseClauses(compiler, 0, compiler->NumClauses);
+}
+
+
+/**
+ * Allocate and return a unique temporary register.
+ */
+int radeonCompilerAllocateTemporary(struct radeon_compiler *compiler)
+{
+ if (compiler->NumTemporaries >= 256) {
+ _mesa_problem(compiler->Ctx, "radeonCompiler: Too many temporaries");
+ return 0;
+ }
+
+ return compiler->NumTemporaries++;
+}
+
+
+static const char* clausename(int type)
+{
+ switch(type) {
+ case CLAUSE_MIXED: return "CLAUSE_MIXED";
+ case CLAUSE_ALU: return "CLAUSE_ALU";
+ case CLAUSE_TEX: return "CLAUSE_TEX";
+ default: return "CLAUSE_UNKNOWN";
+ }
+}
+
+
+/**
+ * Dump the current compiler state to the console for debugging.
+ */
+void radeonCompilerDump(struct radeon_compiler *compiler)
+{
+ int i;
+ for(i = 0; i < compiler->NumClauses; ++i) {
+ struct radeon_clause *clause = &compiler->Clauses[i];
+ int j;
+
+ _mesa_printf("%2i: %s\n", i+1, clausename(clause->Type));
+
+ for(j = 0; j < clause->NumInstructions; ++j) {
+ _mesa_printf("%4i: ", j+1);
+ _mesa_print_instruction(&clause->Instructions[j]);
+ }
+ }
+}
+
+
+/**
+ * \p position index of the new clause; later clauses are moved
+ * \p type of the new clause; one of CLAUSE_XXX
+ * \return a pointer to the new clause
+ */
+struct radeon_clause* radeonCompilerInsertClause(
+ struct radeon_compiler *compiler,
+ int position, int type)
+{
+ struct radeon_clause* oldClauses = compiler->Clauses;
+ struct radeon_clause* clause;
+
+ assert(position >= 0 && position <= compiler->NumClauses);
+
+ compiler->Clauses = (struct radeon_clause *)
+ _mesa_malloc((compiler->NumClauses+1) * sizeof(struct radeon_clause));
+ if (oldClauses) {
+ _mesa_memcpy(compiler->Clauses, oldClauses,
+ position*sizeof(struct radeon_clause));
+ _mesa_memcpy(compiler->Clauses+position+1, oldClauses+position,
+ (compiler->NumClauses - position) * sizeof(struct radeon_clause));
+ _mesa_free(oldClauses);
+ }
+ compiler->NumClauses++;
+
+ clause = compiler->Clauses + position;
+ _mesa_memset(clause, 0, sizeof(*clause));
+ clause->Type = type;
+
+ return clause;
+}
+
+
+/**
+ * Remove clauses in the range [start, end)
+ */
+void radeonCompilerEraseClauses(
+ struct radeon_compiler *compiler,
+ int start, int end)
+{
+ struct radeon_clause* oldClauses = compiler->Clauses;
+ int i;
+
+ assert(0 <= start);
+ assert(start <= end);
+ assert(end <= compiler->NumClauses);
+
+ if (end == start)
+ return;
+
+ for(i = start; i < end; ++i) {
+ struct radeon_clause* clause = oldClauses + i;
+ _mesa_free_instructions(clause->Instructions, clause->NumInstructions);
+ }
+
+ if (start > 0 || end < compiler->NumClauses) {
+ compiler->Clauses = (struct radeon_clause*)
+ _mesa_malloc((compiler->NumClauses+start-end) * sizeof(struct radeon_clause));
+ _mesa_memcpy(compiler->Clauses, oldClauses,
+ start * sizeof(struct radeon_clause));
+ _mesa_memcpy(compiler->Clauses + start, oldClauses + end,
+ (compiler->NumClauses - end) * sizeof(struct radeon_clause));
+ compiler->NumClauses -= end - start;
+ } else {
+ compiler->Clauses = 0;
+ compiler->NumClauses = 0;
+ }
+
+ _mesa_free(oldClauses);
+}
+
+
+/**
+ * Insert new instructions at the given position, initialize them as NOPs
+ * and return a pointer to the first new instruction.
+ */
+struct prog_instruction* radeonClauseInsertInstructions(
+ struct radeon_compiler *compiler,
+ struct radeon_clause *clause,
+ int position, int count)
+{
+ int newNumInstructions = clause->NumInstructions + count;
+
+ assert(position >= 0 && position <= clause->NumInstructions);
+
+ if (newNumInstructions <= clause->ReservedInstructions) {
+ memmove(clause->Instructions + position + count, clause->Instructions + position,
+ (clause->NumInstructions - position) * sizeof(struct prog_instruction));
+ } else {
+ struct prog_instruction *oldInstructions = clause->Instructions;
+
+ clause->ReservedInstructions *= 2;
+ if (newNumInstructions > clause->ReservedInstructions)
+ clause->ReservedInstructions = newNumInstructions;
+
+ clause->Instructions = (struct prog_instruction*)
+ _mesa_malloc(clause->ReservedInstructions * sizeof(struct prog_instruction));
+
+ if (oldInstructions) {
+ _mesa_memcpy(clause->Instructions, oldInstructions,
+ position * sizeof(struct prog_instruction));
+ _mesa_memcpy(clause->Instructions + position + count, oldInstructions + position,
+ (clause->NumInstructions - position) * sizeof(struct prog_instruction));
+
+ _mesa_free(oldInstructions);
+ }
+ }
+
+ clause->NumInstructions = newNumInstructions;
+ _mesa_init_instructions(clause->Instructions + position, count);
+ return clause->Instructions + position;
+}
+
+
+/**
+ * Transform the given clause in the following way:
+ * 1. Replace it with an empty clause
+ * 2. For every instruction in the original clause, try the given
+ * transformations in order.
+ * 3. If one of the transformations returns GL_TRUE, assume that it
+ * has emitted the appropriate instruction(s) into the new clause;
+ * otherwise, copy the instruction verbatim.
+ *
+ * \note The transformation is currently not recursive; in other words,
+ * instructions emitted by transformations are not transformed.
+ *
+ * \note The transform is called 'local' because it can only look at
+ * one instruction at a time.
+ */
+void radeonClauseLocalTransform(
+ struct radeon_compiler *compiler,
+ struct radeon_clause *clause,
+ int num_transformations,
+ struct radeon_program_transformation* transformations)
+{
+ struct radeon_program_transform_context context;
+ struct radeon_clause source;
+ int ip;
+
+ source = *clause;
+ clause->Instructions = 0;
+ clause->NumInstructions = 0;
+ clause->ReservedInstructions = 0;
+
+ context.compiler = compiler;
+ context.dest = clause;
+ context.src = &source;
+
+ for(ip = 0; ip < source.NumInstructions; ++ip) {
+ struct prog_instruction *instr = source.Instructions + ip;
+ int i;
+
+ for(i = 0; i < num_transformations; ++i) {
+ struct radeon_program_transformation* t = transformations + i;
+
+ if (t->function(&context, instr, t->userData))
+ break;
+ }
+
+ if (i >= num_transformations) {
+ struct prog_instruction *tgt =
+ radeonClauseInsertInstructions(compiler, clause, clause->NumInstructions, 1);
+ _mesa_copy_instructions(tgt, instr, 1);
+ }
+ }
+
+ _mesa_free_instructions(source.Instructions, source.NumInstructions);
+}
diff --git a/src/mesa/drivers/dri/r300/radeon_program.h b/src/mesa/drivers/dri/r300/radeon_program.h
new file mode 100644
index 00000000000..25e70505b16
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_program.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RADEON_PROGRAM_H_
+#define __RADEON_PROGRAM_H_
+
+#include "glheader.h"
+#include "macros.h"
+#include "enums.h"
+#include "shader/program.h"
+#include "shader/prog_instruction.h"
+
+
+enum {
+ CLAUSE_MIXED = 0,
+ CLAUSE_ALU,
+ CLAUSE_TEX
+};
+
+enum {
+ PROGRAM_BUILTIN = PROGRAM_FILE_MAX /**< not a real register, but a special swizzle constant */
+};
+
+#define SWIZZLE_0000 MAKE_SWIZZLE4(SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO)
+#define SWIZZLE_1111 MAKE_SWIZZLE4(SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE)
+
+/**
+ * A clause is simply a sequence of instructions that are executed
+ * in order.
+ */
+struct radeon_clause {
+ /**
+ * Type of this clause, one of CLAUSE_XXX.
+ */
+ int Type : 2;
+
+ /**
+ * Pointer to an array of instructions.
+ * The array is terminated by an OPCODE_END instruction.
+ */
+ struct prog_instruction *Instructions;
+
+ /**
+ * Number of instructions in this clause.
+ */
+ int NumInstructions;
+
+ /**
+ * Space reserved for instructions in this clause.
+ */
+ int ReservedInstructions;
+};
+
+/**
+ * A compile object, holding the current intermediate state during compilation.
+ */
+struct radeon_compiler {
+ struct gl_program *Source;
+ GLcontext* Ctx;
+
+ /**
+ * Number of clauses in this program.
+ */
+ int NumClauses;
+
+ /**
+ * Pointer to an array of NumClauses clauses.
+ */
+ struct radeon_clause *Clauses;
+
+ /**
+ * Number of registers in the PROGRAM_TEMPORARIES file.
+ */
+ int NumTemporaries;
+};
+
+void radeonCompilerInit(
+ struct radeon_compiler *compiler,
+ GLcontext *ctx,
+ struct gl_program *source);
+void radeonCompilerCleanup(struct radeon_compiler *compiler);
+int radeonCompilerAllocateTemporary(struct radeon_compiler *compiler);
+void radeonCompilerDump(struct radeon_compiler *compiler);
+
+struct radeon_clause *radeonCompilerInsertClause(
+ struct radeon_compiler *compiler,
+ int position,
+ int type);
+void radeonCompilerEraseClauses(
+ struct radeon_compiler *compiler,
+ int start,
+ int end);
+
+struct prog_instruction* radeonClauseInsertInstructions(
+ struct radeon_compiler *compiler,
+ struct radeon_clause *clause,
+ int position, int count);
+
+/**
+ *
+ */
+struct radeon_program_transform_context {
+ struct radeon_compiler *compiler;
+
+ /**
+ * Destination clause where new instructions must be written.
+ */
+ struct radeon_clause *dest;
+
+ /**
+ * Original clause that is currently being transformed.
+ */
+ struct radeon_clause *src;
+};
+
+/**
+ * A transformation that can be passed to \ref radeonClauseLinearTransform.
+ *
+ * The function will be called once for each instruction.
+ * It has to either emit the appropriate transformed code for the instruction
+ * and return GL_TRUE, or return GL_FALSE if it doesn't understand the
+ * instruction.
+ *
+ * The function gets passed the userData as last parameter.
+ */
+struct radeon_program_transformation {
+ GLboolean (*function)(
+ struct radeon_program_transform_context*,
+ struct prog_instruction*,
+ void*);
+ void *userData;
+};
+
+void radeonClauseLocalTransform(
+ struct radeon_compiler *compiler,
+ struct radeon_clause *clause,
+ int num_transformations,
+ struct radeon_program_transformation* transformations);
+
+#endif
diff --git a/src/mesa/drivers/dri/r300/radeon_program_alu.c b/src/mesa/drivers/dri/r300/radeon_program_alu.c
new file mode 100644
index 00000000000..7fe940a7d7f
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_program_alu.c
@@ -0,0 +1,284 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * @file
+ *
+ * Shareable transformations that transform "special" ALU instructions
+ * into ALU instructions that are supported by hardware.
+ *
+ */
+
+#include "radeon_program_alu.h"
+
+
+static struct prog_instruction *emit1(struct radeon_program_transform_context* ctx,
+ gl_inst_opcode Opcode, struct prog_dst_register DstReg,
+ struct prog_src_register SrcReg)
+{
+ struct prog_instruction *fpi =
+ radeonClauseInsertInstructions(ctx->compiler, ctx->dest,
+ ctx->dest->NumInstructions, 1);
+
+ fpi->Opcode = Opcode;
+ fpi->DstReg = DstReg;
+ fpi->SrcReg[0] = SrcReg;
+ return fpi;
+}
+
+static struct prog_instruction *emit2(struct radeon_program_transform_context* ctx,
+ gl_inst_opcode Opcode, struct prog_dst_register DstReg,
+ struct prog_src_register SrcReg0, struct prog_src_register SrcReg1)
+{
+ struct prog_instruction *fpi =
+ radeonClauseInsertInstructions(ctx->compiler, ctx->dest,
+ ctx->dest->NumInstructions, 1);
+
+ fpi->Opcode = Opcode;
+ fpi->DstReg = DstReg;
+ fpi->SrcReg[0] = SrcReg0;
+ fpi->SrcReg[1] = SrcReg1;
+ return fpi;
+}
+
+static struct prog_instruction *emit3(struct radeon_program_transform_context* ctx,
+ gl_inst_opcode Opcode, struct prog_dst_register DstReg,
+ struct prog_src_register SrcReg0, struct prog_src_register SrcReg1,
+ struct prog_src_register SrcReg2)
+{
+ struct prog_instruction *fpi =
+ radeonClauseInsertInstructions(ctx->compiler, ctx->dest,
+ ctx->dest->NumInstructions, 1);
+
+ fpi->Opcode = Opcode;
+ fpi->DstReg = DstReg;
+ fpi->SrcReg[0] = SrcReg0;
+ fpi->SrcReg[1] = SrcReg1;
+ fpi->SrcReg[2] = SrcReg2;
+ return fpi;
+}
+
+static void set_swizzle(struct prog_src_register *SrcReg, int coordinate, int swz)
+{
+ SrcReg->Swizzle &= ~(7 << (3*coordinate));
+ SrcReg->Swizzle |= swz << (3*coordinate);
+}
+
+static void set_negate_base(struct prog_src_register *SrcReg, int coordinate, int negate)
+{
+ SrcReg->NegateBase &= ~(1 << coordinate);
+ SrcReg->NegateBase |= (negate << coordinate);
+}
+
+static struct prog_dst_register dstreg(int file, int index)
+{
+ struct prog_dst_register dst;
+ dst.File = file;
+ dst.Index = index;
+ dst.WriteMask = WRITEMASK_XYZW;
+ dst.CondMask = COND_TR;
+ dst.CondSwizzle = SWIZZLE_NOOP;
+ dst.CondSrc = 0;
+ dst.pad = 0;
+ return dst;
+}
+
+static const struct prog_src_register builtin_zero = {
+ .File = PROGRAM_BUILTIN,
+ .Index = 0,
+ .Swizzle = SWIZZLE_0000
+};
+static const struct prog_src_register builtin_one = {
+ .File = PROGRAM_BUILTIN,
+ .Index = 0,
+ .Swizzle = SWIZZLE_1111
+};
+static const struct prog_src_register srcreg_undefined = {
+ .File = PROGRAM_UNDEFINED,
+ .Index = 0,
+ .Swizzle = SWIZZLE_NOOP
+};
+
+static struct prog_src_register srcreg(int file, int index)
+{
+ struct prog_src_register src = srcreg_undefined;
+ src.File = file;
+ src.Index = index;
+ return src;
+}
+
+static struct prog_src_register negate(struct prog_src_register reg)
+{
+ struct prog_src_register newreg = reg;
+ newreg.NegateAbs = !newreg.NegateAbs;
+ return newreg;
+}
+
+static struct prog_src_register swizzle(struct prog_src_register reg, GLuint x, GLuint y, GLuint z, GLuint w)
+{
+ struct prog_src_register swizzled = reg;
+ swizzled.Swizzle = MAKE_SWIZZLE4(
+ GET_SWZ(reg.Swizzle, x),
+ GET_SWZ(reg.Swizzle, y),
+ GET_SWZ(reg.Swizzle, z),
+ GET_SWZ(reg.Swizzle, w));
+ return swizzled;
+}
+
+static struct prog_src_register scalar(struct prog_src_register reg)
+{
+ return swizzle(reg, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X);
+}
+
+static void transform_ABS(struct radeon_program_transform_context* ctx,
+ struct prog_instruction* inst)
+{
+ struct prog_src_register src = inst->SrcReg[0];
+ src.Abs = 1;
+ src.NegateBase = 0;
+ src.NegateAbs = 0;
+ emit1(ctx, OPCODE_MOV, inst->DstReg, src);
+}
+
+static void transform_DPH(struct radeon_program_transform_context* ctx,
+ struct prog_instruction* inst)
+{
+ struct prog_src_register src0 = inst->SrcReg[0];
+ if (src0.NegateAbs) {
+ if (src0.Abs) {
+ int tempreg = radeonCompilerAllocateTemporary(ctx->compiler);
+ emit1(ctx, OPCODE_MOV, dstreg(PROGRAM_TEMPORARY, tempreg), src0);
+ src0 = srcreg(src0.File, src0.Index);
+ } else {
+ src0.NegateAbs = 0;
+ src0.NegateBase ^= NEGATE_XYZW;
+ }
+ }
+ set_swizzle(&src0, 3, SWIZZLE_ONE);
+ set_negate_base(&src0, 3, 0);
+ emit2(ctx, OPCODE_DP4, inst->DstReg, src0, inst->SrcReg[1]);
+}
+
+static void transform_FLR(struct radeon_program_transform_context* ctx,
+ struct prog_instruction* inst)
+{
+ int tempreg = radeonCompilerAllocateTemporary(ctx->compiler);
+ emit1(ctx, OPCODE_FRC, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0]);
+ emit2(ctx, OPCODE_ADD, inst->DstReg, inst->SrcReg[0], negate(srcreg(PROGRAM_TEMPORARY, tempreg)));
+}
+
+static void transform_POW(struct radeon_program_transform_context* ctx,
+ struct prog_instruction* inst)
+{
+ int tempreg = radeonCompilerAllocateTemporary(ctx->compiler);
+ struct prog_dst_register tempdst = dstreg(PROGRAM_TEMPORARY, tempreg);
+ struct prog_src_register tempsrc = srcreg(PROGRAM_TEMPORARY, tempreg);
+ tempdst.WriteMask = WRITEMASK_W;
+ tempsrc.Swizzle = SWIZZLE_WWWW;
+
+ emit1(ctx, OPCODE_LG2, tempdst, scalar(inst->SrcReg[0]));
+ emit2(ctx, OPCODE_MUL, tempdst, tempsrc, scalar(inst->SrcReg[1]));
+ emit1(ctx, OPCODE_EX2, inst->DstReg, tempsrc);
+}
+
+static void transform_SGE(struct radeon_program_transform_context* ctx,
+ struct prog_instruction* inst)
+{
+ int tempreg = radeonCompilerAllocateTemporary(ctx->compiler);
+
+ emit2(ctx, OPCODE_ADD, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0], negate(inst->SrcReg[1]));
+ emit3(ctx, OPCODE_CMP, inst->DstReg, srcreg(PROGRAM_TEMPORARY, tempreg), builtin_zero, builtin_one);
+}
+
+static void transform_SLT(struct radeon_program_transform_context* ctx,
+ struct prog_instruction* inst)
+{
+ int tempreg = radeonCompilerAllocateTemporary(ctx->compiler);
+
+ emit2(ctx, OPCODE_ADD, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0], negate(inst->SrcReg[1]));
+ emit3(ctx, OPCODE_CMP, inst->DstReg, srcreg(PROGRAM_TEMPORARY, tempreg), builtin_one, builtin_zero);
+}
+
+static void transform_SUB(struct radeon_program_transform_context* ctx,
+ struct prog_instruction* inst)
+{
+ emit2(ctx, OPCODE_ADD, inst->DstReg, inst->SrcReg[0], negate(inst->SrcReg[1]));
+}
+
+static void transform_SWZ(struct radeon_program_transform_context* ctx,
+ struct prog_instruction* inst)
+{
+ emit1(ctx, OPCODE_MOV, inst->DstReg, inst->SrcReg[0]);
+}
+
+static void transform_XPD(struct radeon_program_transform_context* ctx,
+ struct prog_instruction* inst)
+{
+ int tempreg = radeonCompilerAllocateTemporary(ctx->compiler);
+
+ emit2(ctx, OPCODE_MUL, dstreg(PROGRAM_TEMPORARY, tempreg),
+ swizzle(inst->SrcReg[0], SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W),
+ swizzle(inst->SrcReg[1], SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W));
+ emit3(ctx, OPCODE_MAD, inst->DstReg,
+ swizzle(inst->SrcReg[0], SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W),
+ swizzle(inst->SrcReg[1], SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W),
+ negate(srcreg(PROGRAM_TEMPORARY, tempreg)));
+}
+
+
+/**
+ * Can be used as a transformation for @ref radeonClauseLocalTransform,
+ * no userData necessary.
+ *
+ * Eliminates the following ALU instructions:
+ * ABS, DPH, FLR, POW, SGE, SLT, SUB, SWZ, XPD
+ * using:
+ * MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
+ *
+ * @note should be applicable to R300 and R500 fragment programs.
+ *
+ * @todo add LIT here as well?
+ */
+GLboolean radeonTransformALU(
+ struct radeon_program_transform_context* ctx,
+ struct prog_instruction* inst,
+ void* unused)
+{
+ switch(inst->Opcode) {
+ case OPCODE_ABS: transform_ABS(ctx, inst); return GL_TRUE;
+ case OPCODE_DPH: transform_DPH(ctx, inst); return GL_TRUE;
+ case OPCODE_FLR: transform_FLR(ctx, inst); return GL_TRUE;
+ case OPCODE_POW: transform_POW(ctx, inst); return GL_TRUE;
+ case OPCODE_SGE: transform_SGE(ctx, inst); return GL_TRUE;
+ case OPCODE_SLT: transform_SLT(ctx, inst); return GL_TRUE;
+ case OPCODE_SUB: transform_SUB(ctx, inst); return GL_TRUE;
+ case OPCODE_SWZ: transform_SWZ(ctx, inst); return GL_TRUE;
+ case OPCODE_XPD: transform_XPD(ctx, inst); return GL_TRUE;
+ default:
+ return GL_FALSE;
+ }
+}
diff --git a/src/mesa/drivers/dri/r300/radeon_program_alu.h b/src/mesa/drivers/dri/r300/radeon_program_alu.h
new file mode 100644
index 00000000000..940459624fc
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_program_alu.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RADEON_PROGRAM_ALU_H_
+#define __RADEON_PROGRAM_ALU_H_
+
+#include "radeon_program.h"
+
+GLboolean radeonTransformALU(
+ struct radeon_program_transform_context*,
+ struct prog_instruction*,
+ void*);
+
+#endif /* __RADEON_PROGRAM_ALU_H_ */
diff --git a/src/mesa/drivers/dri/r300/radeon_span.c b/src/mesa/drivers/dri/r300/radeon_span.c
index eae09d6b35e..f1bc56ea6a4 100644
--- a/src/mesa/drivers/dri/r300/radeon_span.c
+++ b/src/mesa/drivers/dri/r300/radeon_span.c
@@ -282,6 +282,30 @@ static void radeonSpanRenderStart(GLcontext * ctx)
#endif
LOCK_HARDWARE(rmesa);
radeonWaitForIdleLocked(rmesa);
+
+ /* Read the first pixel in the frame buffer. This should
+ * be a noop, right? In fact without this conform fails as reading
+ * from the framebuffer sometimes produces old results -- the
+ * on-card read cache gets mixed up and doesn't notice that the
+ * framebuffer has been updated.
+ *
+ * Note that we should probably be reading some otherwise unused
+ * region of VRAM, otherwise we might get incorrect results when
+ * reading pixels from the top left of the screen.
+ *
+ * I found this problem on an R420 with glean's texCube test.
+ * Note that the R200 span code also *writes* the first pixel in the
+ * framebuffer, but I've found this to be unnecessary.
+ * -- Nicolai Hähnle, June 2008
+ */
+ {
+ int p;
+ driRenderbuffer *drb =
+ (driRenderbuffer *) ctx->WinSysDrawBuffer->_ColorDrawBuffers[0];
+ volatile int *buf =
+ (volatile int *)(rmesa->dri.screen->pFB + drb->offset);
+ p = *buf;
+ }
}
static void radeonSpanRenderFinish(GLcontext * ctx)