summaryrefslogtreecommitdiffstats
path: root/src/gallium
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium')
-rw-r--r--src/gallium/Makefile1
-rw-r--r--src/gallium/auxiliary/cso_cache/cso_hash.c6
-rw-r--r--src/gallium/auxiliary/cso_cache/cso_hash.h6
-rw-r--r--src/gallium/auxiliary/draw/draw_private.h2
-rw-r--r--src/gallium/auxiliary/draw/draw_vs_aos_io.c2
-rw-r--r--src/gallium/auxiliary/draw/draw_vs_exec.c17
-rw-r--r--src/gallium/auxiliary/draw/draw_vs_sse.c2
-rw-r--r--src/gallium/auxiliary/gallivm/gallivm_cpu.cpp5
-rw-r--r--src/gallium/auxiliary/gallivm/instructionssoa.cpp245
-rw-r--r--src/gallium/auxiliary/gallivm/instructionssoa.h1
-rw-r--r--src/gallium/auxiliary/gallivm/storagesoa.cpp53
-rw-r--r--src/gallium/auxiliary/gallivm/storagesoa.h7
-rw-r--r--src/gallium/auxiliary/gallivm/tgsitollvm.cpp10
-rw-r--r--src/gallium/auxiliary/rtasm/rtasm_ppc.c365
-rw-r--r--src/gallium/auxiliary/rtasm/rtasm_ppc.h181
-rw-r--r--src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c571
-rw-r--r--src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h87
-rw-r--r--src/gallium/auxiliary/rtasm/rtasm_x86sse.c66
-rw-r--r--src/gallium/auxiliary/rtasm/rtasm_x86sse.h11
-rw-r--r--src/gallium/auxiliary/tgsi/tgsi_build.c24
-rw-r--r--src/gallium/auxiliary/tgsi/tgsi_exec.c1
-rw-r--r--src/gallium/auxiliary/tgsi/tgsi_parse.c54
-rw-r--r--src/gallium/auxiliary/tgsi/tgsi_sse2.c293
-rw-r--r--src/gallium/auxiliary/util/Makefile1
-rw-r--r--src/gallium/auxiliary/util/SConscript5
-rw-r--r--src/gallium/auxiliary/util/p_debug.c14
-rw-r--r--src/gallium/auxiliary/util/u_blit.c1
-rw-r--r--src/gallium/auxiliary/util/u_gen_mipmap.c1
-rw-r--r--src/gallium/auxiliary/util/u_keymap.c309
-rw-r--r--src/gallium/auxiliary/util/u_keymap.h68
-rw-r--r--src/gallium/auxiliary/util/u_math.c21
-rw-r--r--src/gallium/auxiliary/util/u_math.h112
-rw-r--r--src/gallium/auxiliary/util/u_sse.h77
-rw-r--r--src/gallium/drivers/cell/common.h32
-rw-r--r--src/gallium/drivers/cell/ppu/cell_context.c11
-rw-r--r--src/gallium/drivers/cell/ppu/cell_context.h39
-rw-r--r--src/gallium/drivers/cell/ppu/cell_gen_fp.c1454
-rw-r--r--src/gallium/drivers/cell/ppu/cell_gen_fragment.c1548
-rw-r--r--src/gallium/drivers/cell/ppu/cell_pipe_state.c57
-rw-r--r--src/gallium/drivers/cell/ppu/cell_render.c1
-rw-r--r--src/gallium/drivers/cell/ppu/cell_screen.c14
-rw-r--r--src/gallium/drivers/cell/ppu/cell_spu.c8
-rw-r--r--src/gallium/drivers/cell/ppu/cell_state.h5
-rw-r--r--src/gallium/drivers/cell/ppu/cell_state_emit.c114
-rw-r--r--src/gallium/drivers/cell/ppu/cell_state_shader.c5
-rw-r--r--src/gallium/drivers/cell/ppu/cell_vbuf.c1
-rw-r--r--src/gallium/drivers/cell/ppu/cell_vertex_fetch.c30
-rw-r--r--src/gallium/drivers/cell/spu/.gitignore1
-rw-r--r--src/gallium/drivers/cell/spu/Makefile4
-rw-r--r--src/gallium/drivers/cell/spu/spu_command.c621
-rw-r--r--src/gallium/drivers/cell/spu/spu_command.h7
-rw-r--r--src/gallium/drivers/cell/spu/spu_debug.h60
-rw-r--r--src/gallium/drivers/cell/spu/spu_funcs.c176
-rw-r--r--src/gallium/drivers/cell/spu/spu_funcs.h35
-rw-r--r--src/gallium/drivers/cell/spu/spu_main.c620
-rw-r--r--src/gallium/drivers/cell/spu/spu_main.h19
-rw-r--r--src/gallium/drivers/cell/spu/spu_per_fragment_op.c167
-rw-r--r--src/gallium/drivers/cell/spu/spu_per_fragment_op.h3
-rw-r--r--src/gallium/drivers/cell/spu/spu_render.c4
-rw-r--r--src/gallium/drivers/cell/spu/spu_tile.c37
-rw-r--r--src/gallium/drivers/cell/spu/spu_tile.h6
-rw-r--r--src/gallium/drivers/cell/spu/spu_tri.c149
-rw-r--r--src/gallium/drivers/cell/spu/spu_tri.h2
-rw-r--r--src/gallium/drivers/softpipe/sp_fs_exec.c29
-rw-r--r--src/gallium/drivers/softpipe/sp_fs_sse.c2
-rw-r--r--src/gallium/include/pipe/p_compiler.h2
-rw-r--r--src/gallium/include/pipe/p_config.h8
-rw-r--r--src/gallium/include/pipe/p_screen.h2
-rw-r--r--src/gallium/include/pipe/p_shader_tokens.h41
-rw-r--r--src/gallium/include/pipe/p_state.h2
-rw-r--r--src/gallium/include/pipe/p_thread.h2
-rw-r--r--src/gallium/winsys/drm/intel/dri/intel_screen.c382
-rw-r--r--src/gallium/winsys/drm/intel/dri/intel_swapbuffers.c1
-rw-r--r--src/gallium/winsys/drm/intel/egl/intel_device.c2
-rw-r--r--src/gallium/winsys/egl_xlib/egl_xlib.c2
-rw-r--r--src/gallium/winsys/xlib/xm_winsys.c4
76 files changed, 6638 insertions, 1690 deletions
diff --git a/src/gallium/Makefile b/src/gallium/Makefile
index aa77021daf3..36bd3623e7f 100644
--- a/src/gallium/Makefile
+++ b/src/gallium/Makefile
@@ -3,6 +3,7 @@ include $(TOP)/configs/current
SUBDIRS = auxiliary drivers
+# Note winsys/ needs to be built after src/mesa
default: subdirs
diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.c b/src/gallium/auxiliary/cso_cache/cso_hash.c
index 7f0044c5a7f..4e7664f9bf0 100644
--- a/src/gallium/auxiliary/cso_cache/cso_hash.c
+++ b/src/gallium/auxiliary/cso_cache/cso_hash.c
@@ -431,3 +431,9 @@ struct cso_hash_iter cso_hash_erase(struct cso_hash *hash, struct cso_hash_iter
--hash->data.d->size;
return ret;
}
+
+boolean cso_hash_contains(struct cso_hash *hash, unsigned key)
+{
+ struct cso_node **node = cso_hash_find_node(hash, key);
+ return (*node != hash->data.e);
+}
diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.h b/src/gallium/auxiliary/cso_cache/cso_hash.h
index 85f3e276c6a..5891c325fa5 100644
--- a/src/gallium/auxiliary/cso_cache/cso_hash.h
+++ b/src/gallium/auxiliary/cso_cache/cso_hash.h
@@ -44,6 +44,7 @@
#ifndef CSO_HASH_H
#define CSO_HASH_H
+#include "pipe/p_compiler.h"
#ifdef __cplusplus
extern "C" {
@@ -95,6 +96,11 @@ struct cso_hash_iter cso_hash_first_node(struct cso_hash *hash);
*/
struct cso_hash_iter cso_hash_find(struct cso_hash *hash, unsigned key);
+/**
+ * Returns true if a value with the given key exists in the hash
+ */
+boolean cso_hash_contains(struct cso_hash *hash, unsigned key);
+
int cso_hash_iter_is_null(struct cso_hash_iter iter);
unsigned cso_hash_iter_key(struct cso_hash_iter iter);
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 37c4c87f87f..5d531146c5f 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -196,7 +196,7 @@ struct draw_context
const float (*aligned_constants)[4];
- float (*aligned_constant_storage)[4];
+ const float (*aligned_constant_storage)[4];
unsigned const_storage_size;
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
index 39f75b50b76..dd79bc799aa 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
@@ -338,7 +338,7 @@ static void emit_store_R32G32B32A32( struct aos_compilation *cp,
struct x86_reg dst_ptr,
struct x86_reg dataXMM )
{
- sse_movups(cp->func, dst_ptr, dataXMM);
+ sse_movaps(cp->func, dst_ptr, dataXMM);
}
static void emit_store_R32G32B32( struct aos_compilation *cp,
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 44563803f90..79a19d6be2b 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -46,6 +46,7 @@
struct exec_vertex_shader {
struct draw_vertex_shader base;
struct tgsi_exec_machine *machine;
+ const struct tgsi_token *machine_tokens;
};
static struct exec_vertex_shader *exec_vertex_shader( struct draw_vertex_shader *vs )
@@ -62,12 +63,16 @@ vs_exec_prepare( struct draw_vertex_shader *shader,
{
struct exec_vertex_shader *evs = exec_vertex_shader(shader);
- /* specify the vertex program to interpret/execute */
- tgsi_exec_machine_bind_shader(evs->machine,
- shader->state.tokens,
- PIPE_MAX_SAMPLERS,
- NULL /*samplers*/ );
-
+ /* Specify the vertex program to interpret/execute.
+ * Avoid rebinding when possible.
+ */
+ if (evs->machine_tokens != shader->state.tokens) {
+ tgsi_exec_machine_bind_shader(evs->machine,
+ shader->state.tokens,
+ PIPE_MAX_SAMPLERS,
+ NULL /*samplers*/ );
+ evs->machine_tokens = shader->state.tokens;
+ }
}
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index 0efabd9de8b..b11ae316627 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -37,7 +37,7 @@
#include "draw_vs.h"
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
#include "pipe/p_shader_tokens.h"
diff --git a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
index 3a4a41e5445..3a2f2878a30 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
@@ -158,8 +158,8 @@ void gallivm_cpu_jit_compile(struct gallivm_cpu_engine *cpu, struct gallivm_prog
llvm::ExistingModuleProvider *mp = new llvm::ExistingModuleProvider(mod);
llvm::ExecutionEngine *ee = cpu->engine;
assert(ee);
- /*FIXME : remove */
- ee->DisableLazyCompilation();
+ /*FIXME : why was this disabled ? we need it for pow/sqrt/... */
+ ee->DisableLazyCompilation(false);
ee->addModuleProvider(mp);
llvm::Function *func = func_for_shader(prog);
@@ -202,7 +202,6 @@ int gallivm_cpu_vs_exec(struct gallivm_prog *prog,
unsigned int i, j;
unsigned slot;
vertex_shader_runner runner = reinterpret_cast<vertex_shader_runner>(prog->function);
-
assert(runner);
for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
index a6580725512..d5600fd22da 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
@@ -90,68 +90,11 @@ llvm::Value * InstructionsSoa::vectorFromVals(llvm::Value *x, llvm::Value *y,
return res;
}
-std::vector<llvm::Value*> InstructionsSoa::arl(const std::vector<llvm::Value*> in)
-{
- std::vector<llvm::Value*> res(4);
-
- //Extract x's
- llvm::Value *x1 = m_builder.CreateExtractElement(in[0],
- m_storage->constantInt(0),
- name("extractX"));
- //cast it to an unsigned int
- x1 = m_builder.CreateFPToUI(x1, IntegerType::get(32), name("x1IntCast"));
-
- res[0] = x1;//vectorFromVals(x1, x2, x3, x4);
- //only x is valid. the others shouldn't be necessary
- /*
- res[1] = Constant::getNullValue(m_floatVecType);
- res[2] = Constant::getNullValue(m_floatVecType);
- res[3] = Constant::getNullValue(m_floatVecType);
- */
-
- return res;
-}
-
-
-std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1,
- const std::vector<llvm::Value*> in2)
-{
- std::vector<llvm::Value*> res(4);
-
- res[0] = m_builder.CreateAdd(in1[0], in2[0], name("addx"));
- res[1] = m_builder.CreateAdd(in1[1], in2[1], name("addy"));
- res[2] = m_builder.CreateAdd(in1[2], in2[2], name("addz"));
- res[3] = m_builder.CreateAdd(in1[3], in2[3], name("addw"));
-
- return res;
-}
-
-std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1,
- const std::vector<llvm::Value*> in2)
-{
- std::vector<llvm::Value*> res(4);
-
- res[0] = m_builder.CreateMul(in1[0], in2[0], name("mulx"));
- res[1] = m_builder.CreateMul(in1[1], in2[1], name("muly"));
- res[2] = m_builder.CreateMul(in1[2], in2[2], name("mulz"));
- res[3] = m_builder.CreateMul(in1[3], in2[3], name("mulw"));
-
- return res;
-}
-
void InstructionsSoa::end()
{
m_builder.CreateRetVoid();
}
-std::vector<llvm::Value*> InstructionsSoa::madd(const std::vector<llvm::Value*> in1,
- const std::vector<llvm::Value*> in2,
- const std::vector<llvm::Value*> in3)
-{
- std::vector<llvm::Value*> res = mul(in1, in2);
- return add(res, in3);
-}
-
std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector)
{
std::vector<llvm::Value*> res(4);
@@ -171,6 +114,11 @@ std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector)
return res;
}
+llvm::IRBuilder<>* InstructionsSoa::getIRBuilder()
+{
+ return &m_builder;
+}
+
void InstructionsSoa::createFunctionMap()
{
m_functionsMap[TGSI_OPCODE_ABS] = "abs";
@@ -274,6 +222,41 @@ std::vector<llvm::Value*> InstructionsSoa::abs(const std::vector<llvm::Value*> i
return callBuiltin(func, in1);
}
+std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1,
+ const std::vector<llvm::Value*> in2)
+{
+ std::vector<llvm::Value*> res(4);
+
+ res[0] = m_builder.CreateAdd(in1[0], in2[0], name("addx"));
+ res[1] = m_builder.CreateAdd(in1[1], in2[1], name("addy"));
+ res[2] = m_builder.CreateAdd(in1[2], in2[2], name("addz"));
+ res[3] = m_builder.CreateAdd(in1[3], in2[3], name("addw"));
+
+ return res;
+}
+
+std::vector<llvm::Value*> InstructionsSoa::arl(const std::vector<llvm::Value*> in)
+{
+ std::vector<llvm::Value*> res(4);
+
+ //Extract x's
+ llvm::Value *x1 = m_builder.CreateExtractElement(in[0],
+ m_storage->constantInt(0),
+ name("extractX"));
+ //cast it to an unsigned int
+ x1 = m_builder.CreateFPToUI(x1, IntegerType::get(32), name("x1IntCast"));
+
+ res[0] = x1;//vectorFromVals(x1, x2, x3, x4);
+ //only x is valid. the others shouldn't be necessary
+ /*
+ res[1] = Constant::getNullValue(m_floatVecType);
+ res[2] = Constant::getNullValue(m_floatVecType);
+ res[3] = Constant::getNullValue(m_floatVecType);
+ */
+
+ return res;
+}
+
std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> in1,
const std::vector<llvm::Value*> in2)
{
@@ -281,6 +264,59 @@ std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> i
return callBuiltin(func, in1, in2);
}
+std::vector<llvm::Value*> InstructionsSoa::lit(const std::vector<llvm::Value*> in)
+{
+ llvm::Function *func = function(TGSI_OPCODE_LIT);
+ return callBuiltin(func, in);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::madd(const std::vector<llvm::Value*> in1,
+ const std::vector<llvm::Value*> in2,
+ const std::vector<llvm::Value*> in3)
+{
+ std::vector<llvm::Value*> res = mul(in1, in2);
+ return add(res, in3);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::max(const std::vector<llvm::Value*> in1,
+ const std::vector<llvm::Value*> in2)
+{
+ llvm::Function *func = function(TGSI_OPCODE_MAX);
+ return callBuiltin(func, in1, in2);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::min(const std::vector<llvm::Value*> in1,
+ const std::vector<llvm::Value*> in2)
+{
+ llvm::Function *func = function(TGSI_OPCODE_MIN);
+ return callBuiltin(func, in1, in2);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1,
+ const std::vector<llvm::Value*> in2)
+{
+ std::vector<llvm::Value*> res(4);
+
+ res[0] = m_builder.CreateMul(in1[0], in2[0], name("mulx"));
+ res[1] = m_builder.CreateMul(in1[1], in2[1], name("muly"));
+ res[2] = m_builder.CreateMul(in1[2], in2[2], name("mulz"));
+ res[3] = m_builder.CreateMul(in1[3], in2[3], name("mulw"));
+
+ return res;
+}
+
+std::vector<llvm::Value*> InstructionsSoa::pow(const std::vector<llvm::Value*> in1,
+ const std::vector<llvm::Value*> in2)
+{
+ llvm::Function *func = function(TGSI_OPCODE_POWER);
+ return callBuiltin(func, in1, in2);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::rsq(const std::vector<llvm::Value*> in)
+{
+ llvm::Function *func = function(TGSI_OPCODE_RSQ);
+ return callBuiltin(func, in);
+}
std::vector<llvm::Value*> InstructionsSoa::slt(const std::vector<llvm::Value*> in1,
const std::vector<llvm::Value*> in2)
@@ -289,6 +325,37 @@ std::vector<llvm::Value*> InstructionsSoa::slt(const std::vector<llvm::Value*> i
return callBuiltin(func, in1, in2);
}
+std::vector<llvm::Value*> InstructionsSoa::sub(const std::vector<llvm::Value*> in1,
+ const std::vector<llvm::Value*> in2)
+{
+ std::vector<llvm::Value*> res(4);
+
+ res[0] = m_builder.CreateSub(in1[0], in2[0], name("subx"));
+ res[1] = m_builder.CreateSub(in1[1], in2[1], name("suby"));
+ res[2] = m_builder.CreateSub(in1[2], in2[2], name("subz"));
+ res[3] = m_builder.CreateSub(in1[3], in2[3], name("subw"));
+
+ return res;
+}
+
+void checkFunction(Function *func)
+{
+ for (Function::const_iterator BI = func->begin(), BE = func->end();
+ BI != BE; ++BI) {
+ const BasicBlock &BB = *BI;
+ for (BasicBlock::const_iterator II = BB.begin(), IE = BB.end();
+ II != IE; ++II) {
+ const Instruction &I = *II;
+ std::cout<< "Instr = "<<I;
+ for (unsigned op = 0, E = I.getNumOperands(); op != E; ++op) {
+ const Value *Op = I.getOperand(op);
+ std::cout<< "\top = "<<Op<<"("<<op<<")"<<std::endl;
+ //I->setOperand(op, V);
+ }
+ }
+ }
+}
+
llvm::Value * InstructionsSoa::allocaTemp()
{
VectorType *vector = VectorType::get(Type::FloatTy, 4);
@@ -408,46 +475,6 @@ std::vector<Value*> InstructionsSoa::callBuiltin(llvm::Function *func, const std
return allocaToResult(allocaPtr);
}
-std::vector<llvm::Value*> InstructionsSoa::pow(const std::vector<llvm::Value*> in1,
- const std::vector<llvm::Value*> in2)
-{
- llvm::Function *func = function(TGSI_OPCODE_POWER);
- return callBuiltin(func, in1, in2);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::min(const std::vector<llvm::Value*> in1,
- const std::vector<llvm::Value*> in2)
-{
- llvm::Function *func = function(TGSI_OPCODE_MIN);
- return callBuiltin(func, in1, in2);
-}
-
-
-std::vector<llvm::Value*> InstructionsSoa::max(const std::vector<llvm::Value*> in1,
- const std::vector<llvm::Value*> in2)
-{
- llvm::Function *func = function(TGSI_OPCODE_MAX);
- return callBuiltin(func, in1, in2);
-}
-
-void checkFunction(Function *func)
-{
- for (Function::const_iterator BI = func->begin(), BE = func->end();
- BI != BE; ++BI) {
- const BasicBlock &BB = *BI;
- for (BasicBlock::const_iterator II = BB.begin(), IE = BB.end();
- II != IE; ++II) {
- const Instruction &I = *II;
- std::cout<< "Instr = "<<I;
- for (unsigned op = 0, E = I.getNumOperands(); op != E; ++op) {
- const Value *Op = I.getOperand(op);
- std::cout<< "\top = "<<Op<<"("<<op<<")"<<std::endl;
- //I->setOperand(op, V);
- }
- }
- }
-}
-
void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op)
{
assert(originalFunc);
@@ -492,28 +519,4 @@ void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op)
}
}
-std::vector<llvm::Value*> InstructionsSoa::sub(const std::vector<llvm::Value*> in1,
- const std::vector<llvm::Value*> in2)
-{
- std::vector<llvm::Value*> res(4);
-
- res[0] = m_builder.CreateSub(in1[0], in2[0], name("subx"));
- res[1] = m_builder.CreateSub(in1[1], in2[1], name("suby"));
- res[2] = m_builder.CreateSub(in1[2], in2[2], name("subz"));
- res[3] = m_builder.CreateSub(in1[3], in2[3], name("subw"));
-
- return res;
-}
-
-std::vector<llvm::Value*> InstructionsSoa::lit(const std::vector<llvm::Value*> in)
-{
- llvm::Function *func = function(TGSI_OPCODE_LIT);
- return callBuiltin(func, in);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::rsq(const std::vector<llvm::Value*> in)
-{
- llvm::Function *func = function(TGSI_OPCODE_RSQ);
- return callBuiltin(func, in);
-}
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.h b/src/gallium/auxiliary/gallivm/instructionssoa.h
index 3817fdc904b..d6831e0a6b9 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.h
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.h
@@ -76,6 +76,7 @@ public:
void end();
std::vector<llvm::Value*> extractVector(llvm::Value *vector);
+ llvm::IRBuilder<>* getIRBuilder();
private:
const char * name(const char *prefix) const;
llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y,
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.cpp b/src/gallium/auxiliary/gallivm/storagesoa.cpp
index 78d754371f0..4fc075cf6d4 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.cpp
+++ b/src/gallium/auxiliary/gallivm/storagesoa.cpp
@@ -93,7 +93,7 @@ void StorageSoa::declareImmediates()
std::vector<float> vals(4);
std::vector<Constant*> channelArray;
- vals[0] = vec[0]; vals[1] = vec[0]; vals[2] = vec[0]; vals[3] = vec[0];
+ vals[0] = vec[0]; vals[1] = vec[1]; vals[2] = vec[2]; vals[3] = vec[3];
llvm::Constant *xChannel = createConstGlobalVector(vals);
vals[0] = vec[1]; vals[1] = vec[1]; vals[2] = vec[1]; vals[3] = vec[1];
@@ -144,22 +144,43 @@ std::vector<llvm::Value*> StorageSoa::inputElement(llvm::Value *idx)
return res;
}
-std::vector<llvm::Value*> StorageSoa::constElement(llvm::Value *idx)
+llvm::Value* StorageSoa::unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value* vector, int cc)
{
- std::vector<llvm::Value*> res(4);
+ std::vector<llvm::Value*> x(4);
+ x[0] = m_builder->CreateExtractElement(vector,
+ constantInt(cc),
+ name("x"));
+
+ VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
+ Constant *constVector = Constant::getNullValue(vectorType);
+ Value *res = m_builder->CreateInsertElement(constVector, x[0],
+ constantInt(0),
+ name("vecx"));
+ res = m_builder->CreateInsertElement(res, x[0], constantInt(1),
+ name("vecxx"));
+ res = m_builder->CreateInsertElement(res, x[0], constantInt(2),
+ name("vecxxx"));
+ res = m_builder->CreateInsertElement(res, x[0], constantInt(3),
+ name("vecxxxx"));
+ return res;
+}
+
+std::vector<llvm::Value*> StorageSoa::constElement(llvm::IRBuilder<>* m_builder, llvm::Value *idx)
+{
+ llvm::Value* res;
+ std::vector<llvm::Value*> res2(4);
llvm::Value *xChannel, *yChannel, *zChannel, *wChannel;
xChannel = elementPointer(m_consts, idx, 0);
- yChannel = elementPointer(m_consts, idx, 1);
- zChannel = elementPointer(m_consts, idx, 2);
- wChannel = elementPointer(m_consts, idx, 3);
- res[0] = alignedArrayLoad(xChannel);
- res[1] = alignedArrayLoad(yChannel);
- res[2] = alignedArrayLoad(zChannel);
- res[3] = alignedArrayLoad(wChannel);
+ res = alignedArrayLoad(xChannel);
- return res;
+ res2[0]=unpackConstElement(m_builder, res,0);
+ res2[1]=unpackConstElement(m_builder, res,1);
+ res2[2]=unpackConstElement(m_builder, res,2);
+ res2[3]=unpackConstElement(m_builder, res,3);
+
+ return res2;
}
std::vector<llvm::Value*> StorageSoa::outputElement(llvm::Value *idx)
@@ -260,6 +281,12 @@ llvm::Module * StorageSoa::currentModule() const
return m_block->getParent()->getParent();
}
+llvm::Constant * StorageSoa::createConstGlobalFloat(const float val)
+{
+ Constant*c = ConstantFP::get(APFloat(val));
+ return c;
+}
+
llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &vec)
{
VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
@@ -278,7 +305,7 @@ llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &v
}
std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, int swizzle,
- llvm::Value *indIdx)
+ llvm::IRBuilder<>* m_builder,llvm::Value *indIdx)
{
std::vector<llvm::Value*> val(4);
@@ -302,7 +329,7 @@ std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, in
val = tempElement(realIndex);
break;
case TGSI_FILE_CONSTANT:
- val = constElement(realIndex);
+ val = constElement(m_builder, realIndex);
break;
case TGSI_FILE_IMMEDIATE:
val = immediateElement(realIndex);
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.h b/src/gallium/auxiliary/gallivm/storagesoa.h
index ae2fc7c6aee..f21ca6ec433 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.h
+++ b/src/gallium/auxiliary/gallivm/storagesoa.h
@@ -29,6 +29,7 @@
#define STORAGESOA_H
#include <pipe/p_shader_tokens.h>
+#include <llvm/Support/IRBuilder.h>
#include <vector>
#include <list>
@@ -56,7 +57,7 @@ public:
std::vector<llvm::Value*> load(enum tgsi_file_type type, int idx, int swizzle,
- llvm::Value *indIdx =0);
+ llvm::IRBuilder<>* m_builder, llvm::Value *indIdx =0);
void store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val,
int mask);
@@ -76,10 +77,12 @@ private:
const char *name(const char *prefix) const;
llvm::Value *alignedArrayLoad(llvm::Value *val);
llvm::Module *currentModule() const;
+ llvm::Constant *createConstGlobalFloat(const float val);
llvm::Constant *createConstGlobalVector(const std::vector<float> &vec);
std::vector<llvm::Value*> inputElement(llvm::Value *indIdx);
- std::vector<llvm::Value*> constElement(llvm::Value *indIdx);
+ llvm::Value* unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx, int cc);
+ std::vector<llvm::Value*> constElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx);
std::vector<llvm::Value*> outputElement(llvm::Value *indIdx);
std::vector<llvm::Value*> tempElement(llvm::Value *indIdx);
std::vector<llvm::Value*> immediateElement(llvm::Value *indIdx);
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
index 7292c0e366c..1191a6cae97 100644
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
@@ -52,7 +52,7 @@ static inline FunctionType *vertexShaderFunctionType()
// pass are castable to the following:
// [4 x <4 x float>] inputs,
// [4 x <4 x float>] output,
- // [4 x [4 x float]] consts,
+ // [4 x [1 x float]] consts,
// [4 x <4 x float>] temps
std::vector<const Type*> funcArgs;
@@ -61,7 +61,7 @@ static inline FunctionType *vertexShaderFunctionType()
PointerType *vectorArrayPtr = PointerType::get(vectorArray, 0);
ArrayType *floatArray = ArrayType::get(Type::FloatTy, 4);
- ArrayType *constsArray = ArrayType::get(floatArray, 4);
+ ArrayType *constsArray = ArrayType::get(floatArray, 1);
PointerType *constsArrayPtr = PointerType::get(constsArray, 0);
funcArgs.push_back(vectorArrayPtr);//inputs
@@ -246,6 +246,7 @@ translate_instruction(llvm::Module *module,
val = storage->constElement(src->SrcRegister.Index, indIdx);
} else if (src->SrcRegister.File == TGSI_FILE_INPUT) {
val = storage->inputElement(src->SrcRegister.Index, indIdx);
+ // FIXME we should not be generating elements for temporaries, this creates useless memory writes
} else if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) {
val = storage->tempElement(src->SrcRegister.Index);
} else if (src->SrcRegister.File == TGSI_FILE_OUTPUT) {
@@ -676,6 +677,7 @@ translate_instruction(llvm::Module *module,
if (dst->DstRegister.File == TGSI_FILE_OUTPUT) {
storage->setOutputElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+ // FIXME we should not be generating elements for temporaries, this creates useless memory writes
} else if (dst->DstRegister.File == TGSI_FILE_TEMPORARY) {
storage->setTempElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
} else if (dst->DstRegister.File == TGSI_FILE_ADDRESS) {
@@ -707,9 +709,8 @@ translate_instructionir(llvm::Module *module,
if (src->SrcRegister.Indirect) {
indIdx = storage->addrElement(src->SrcRegisterInd.Index);
}
-
val = storage->load((enum tgsi_file_type)src->SrcRegister.File,
- src->SrcRegister.Index, swizzle, indIdx);
+ src->SrcRegister.Index, swizzle, instr->getIRBuilder(), indIdx);
inputs[i] = val;
}
@@ -1025,7 +1026,6 @@ translate_instructionir(llvm::Module *module,
/* store results */
for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) {
struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
-
storage->store((enum tgsi_file_type)dst->DstRegister.File,
dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
}
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
new file mode 100644
index 00000000000..534a23568d5
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -0,0 +1,365 @@
+/**************************************************************************
+ *
+ * Copyright (C) 2008 Tungsten Graphics, Inc. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * PPC code generation.
+ * \author Brian Paul
+ */
+
+
+#include "util/u_memory.h"
+#include "pipe/p_debug.h"
+#include "rtasm_ppc.h"
+
+
+void
+ppc_init_func(struct ppc_function *p, unsigned max_inst)
+{
+ p->store = align_malloc(max_inst * PPC_INST_SIZE, 16);
+ p->num_inst = 0;
+ p->max_inst = max_inst;
+ p->vec_used = ~0;
+}
+
+
+void
+ppc_release_func(struct ppc_function *p)
+{
+ assert(p->num_inst <= p->max_inst);
+ if (p->store != NULL) {
+ align_free(p->store);
+ }
+ p->store = NULL;
+}
+
+
+/**
+ * Alloate a vector register.
+ * \return register index or -1 if none left.
+ */
+int
+ppc_allocate_vec_register(struct ppc_function *p, int reg)
+{
+ unsigned i;
+ for (i = 0; i < PPC_NUM_VEC_REGS; i++) {
+ const uint64_t mask = 1 << i;
+ if ((p->vec_used & mask) != 0) {
+ p->vec_used &= ~mask;
+ return i;
+ }
+ }
+
+ return -1;
+}
+
+
+/**
+ * Mark the given vector register as "unallocated".
+ */
+void
+ppc_release_vec_register(struct ppc_function *p, int reg)
+{
+ assert(reg < PPC_NUM_VEC_REGS);
+ assert((p->vec_used & (1 << reg)) == 0);
+
+ p->vec_used |= (1 << reg);
+}
+
+
+
+union vx_inst {
+ uint32_t bits;
+ struct {
+ unsigned op:6;
+ unsigned vD:5;
+ unsigned vA:5;
+ unsigned vB:5;
+ unsigned op2:11;
+ } inst;
+};
+
+union vxr_inst {
+ uint32_t bits;
+ struct {
+ unsigned op:6;
+ unsigned vD:5;
+ unsigned vA:5;
+ unsigned vB:5;
+ unsigned rC:1;
+ unsigned op2:10;
+ } inst;
+};
+
+union va_inst {
+ uint32_t bits;
+ struct {
+ unsigned op:6;
+ unsigned vD:5;
+ unsigned vA:5;
+ unsigned vB:5;
+ unsigned vC:5;
+ unsigned op2:6;
+ } inst;
+};
+
+
+static inline void
+emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+{
+ union vx_inst inst;
+ inst.inst.op = 4;
+ inst.inst.vD = vD;
+ inst.inst.vA = vA;
+ inst.inst.vB = vB;
+ inst.inst.op2 = op2;
+ p->store[p->num_inst++] = inst.bits;
+ assert(p->num_inst <= p->max_inst);
+};
+
+static inline void
+emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+{
+ union vxr_inst inst;
+ inst.inst.op = 4;
+ inst.inst.vD = vD;
+ inst.inst.vA = vA;
+ inst.inst.vB = vB;
+ inst.inst.rC = 0;
+ inst.inst.op2 = op2;
+ p->store[p->num_inst++] = inst.bits;
+ assert(p->num_inst <= p->max_inst);
+};
+
+static inline void
+emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
+{
+ union va_inst inst;
+ inst.inst.op = 4;
+ inst.inst.vD = vD;
+ inst.inst.vA = vA;
+ inst.inst.vB = vB;
+ inst.inst.vC = vC;
+ inst.inst.op2 = op2;
+ p->store[p->num_inst++] = inst.bits;
+ assert(p->num_inst <= p->max_inst);
+};
+
+
+
+/**
+ ** float vector arithmetic
+ **/
+
+/** vector float add */
+void
+ppc_vaddfp(struct ppc_function *p,uint vD, uint vA, uint vB)
+{
+ emit_vx(p, 10, vD, vA, vB);
+}
+
+/** vector float substract */
+void
+ppc_vsubfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vx(p, 74, vD, vA, vB);
+}
+
+/** vector float min */
+void
+ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vx(p, 1098, vD, vA, vB);
+}
+
+/** vector float max */
+void
+ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vx(p, 1034, vD, vA, vB);
+}
+
+/** vector float mult add */
+void
+ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+ emit_va(p, 46, vD, vA, vB, vC);
+}
+
+/** vector float compare greater than */
+void
+ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vxr(p, 710, vD, vA, vB);
+}
+
+/** vector float compare greater than or equal to */
+void
+ppc_vcmpgefpx(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vxr(p, 454, vD, vA, vB);
+}
+
+/** vector float compare equal */
+void
+ppc_vcmpeqfpx(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vxr(p, 198, vD, vA, vB);
+}
+
+/** vector float 2^x */
+void
+ppc_vexptefp(struct ppc_function *p, uint vD, uint vB)
+{
+ emit_vx(p, 394, vD, 0, vB);
+}
+
+/** vector float log2(x) */
+void
+ppc_vlogefp(struct ppc_function *p, uint vD, uint vB)
+{
+ emit_vx(p, 458, vD, 0, vB);
+}
+
+/** vector float reciprocol */
+void
+ppc_vrefp(struct ppc_function *p, uint vD, uint vB)
+{
+ emit_vx(p, 266, vD, 0, vB);
+}
+
+/** vector float reciprocol sqrt estimate */
+void
+ppc_vrsqrtefp(struct ppc_function *p, uint vD, uint vB)
+{
+ emit_vx(p, 330, vD, 0, vB);
+}
+
+/** vector float round to negative infinity */
+void
+ppc_vrfim(struct ppc_function *p, uint vD, uint vB)
+{
+ emit_vx(p, 714, vD, 0, vB);
+}
+
+/** vector float round to positive infinity */
+void
+ppc_vrfip(struct ppc_function *p, uint vD, uint vB)
+{
+ emit_vx(p, 650, vD, 0, vB);
+}
+
+/** vector float round to nearest int */
+void
+ppc_vrfin(struct ppc_function *p, uint vD, uint vB)
+{
+ emit_vx(p, 522, vD, 0, vB);
+}
+
+/** vector float round to int toward zero */
+void
+ppc_vrfiz(struct ppc_function *p, uint vD, uint vB)
+{
+ emit_vx(p, 586, vD, 0, vB);
+}
+
+
+
+/**
+ ** bitwise operations
+ **/
+
+
+/** vector and */
+void
+ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vx(p, 1028, vD, vA, vB);
+}
+
+/** vector and complement */
+void
+ppc_vandc(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vx(p, 1092, vD, vA, vB);
+}
+
+/** vector or */
+void
+ppc_vor(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vx(p, 1156, vD, vA, vB);
+}
+
+/** vector nor */
+void
+ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vx(p, 1284, vD, vA, vB);
+}
+
+/** vector xor */
+void
+ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vx(p, 1220, vD, vA, vB);
+}
+
+
+/**
+ ** Vector shuffle / select / splat / etc
+ **/
+
+/** vector permute */
+void
+ppc_vperm(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+ emit_va(p, 43, vD, vA, vB, vC);
+}
+
+/** vector select */
+void
+ppc_vsel(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+ emit_va(p, 42, vD, vA, vB, vC);
+}
+
+/** vector splat byte */
+void
+ppc_vspltb(struct ppc_function *p, uint vD, uint vB, uint imm)
+{
+ emit_vx(p, 42, vD, imm, vB);
+}
+
+/** vector splat half word */
+void
+ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm)
+{
+ emit_vx(p, 588, vD, imm, vB);
+}
+
+/** vector splat word */
+void
+ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm)
+{
+ emit_vx(p, 652, vD, imm, vB);
+}
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
new file mode 100644
index 00000000000..ed14e943df6
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -0,0 +1,181 @@
+/**************************************************************************
+ *
+ * Copyright (C) 2008 Tungsten Graphics, Inc. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * PPC code generation.
+ * \author Brian Paul
+ */
+
+
+#ifndef RTASM_PPC_H
+#define RTASM_PPC_H
+
+
+#include "pipe/p_compiler.h"
+
+
+#define PPC_INST_SIZE 4 /**< 4 bytes / instruction */
+
+#define PPC_NUM_VEC_REGS 32
+
+
+struct ppc_function
+{
+ uint32_t *store; /**< instruction buffer */
+ uint num_inst;
+ uint max_inst;
+ uint32_t vec_used; /** used/free vector registers bitmask */
+ uint32_t reg_used; /** used/free general-purpose registers bitmask */
+};
+
+
+
+extern void ppc_init_func(struct ppc_function *p, unsigned max_inst);
+extern void ppc_release_func(struct ppc_function *p);
+
+extern int ppc_allocate_vec_register(struct ppc_function *p, int reg);
+extern void ppc_release_vec_register(struct ppc_function *p, int reg);
+
+
+/**
+ ** float vector arithmetic
+ **/
+
+/** vector float add */
+extern void
+ppc_vaddfp(struct ppc_function *p,uint vD, uint vA, uint vB);
+
+/** vector float substract */
+extern void
+ppc_vsubfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float min */
+extern void
+ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float max */
+extern void
+ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float mult add */
+extern void
+ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector float compare greater than */
+extern void
+ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float compare greater than or equal to */
+extern void
+ppc_vcmpgefpx(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float compare equal */
+extern void
+ppc_vcmpeqfpx(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float 2^x */
+extern void
+ppc_vexptefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float log2(x) */
+extern void
+ppc_vlogefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float reciprocol */
+extern void
+ppc_vrefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float reciprocol sqrt estimate */
+extern void
+ppc_vrsqrtefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to negative infinity */
+extern void
+ppc_vrfim(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to positive infinity */
+extern void
+ppc_vrfip(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to nearest int */
+extern void
+ppc_vrfin(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to int toward zero */
+extern void
+ppc_vrfiz(struct ppc_function *p, uint vD, uint vB);
+
+
+
+/**
+ ** bitwise operations
+ **/
+
+
+/** vector and */
+extern void
+ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector and complement */
+extern void
+ppc_vandc(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector or */
+extern void
+ppc_vor(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector nor */
+extern void
+ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector xor */
+extern void
+ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+
+/**
+ ** Vector shuffle / select / splat / etc
+ **/
+
+/** vector permute */
+extern void
+ppc_vperm(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector select */
+extern void
+ppc_vsel(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector splat byte */
+extern void
+ppc_vspltb(struct ppc_function *p, uint vD, uint vB, uint imm);
+
+/** vector splat half word */
+extern void
+ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm);
+
+/** vector splat word */
+extern void
+ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm);
+
+
+#endif /* RTASM_PPC_H */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index a04cc6c4ff7..cc35f0ba5b0 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -27,12 +27,16 @@
* Real-time assembly generation interface for Cell B.E. SPEs.
*
* \author Ian Romanick <[email protected]>
+ * \author Brian Paul
*/
+
+#include <stdio.h>
#include "pipe/p_compiler.h"
#include "util/u_memory.h"
#include "rtasm_ppc_spe.h"
+
#ifdef GALLIUM_CELL
/**
* SPE instruction types
@@ -143,8 +147,46 @@ union spe_inst_RI18 {
/*@}*/
+static void
+indent(const struct spe_function *p)
+{
+ int i;
+ for (i = 0; i < p->indent; i++) {
+ putchar(' ');
+ }
+}
+
+
+static const char *
+rem_prefix(const char *longname)
+{
+ return longname + 4;
+}
+
+
+static const char *
+reg_name(int reg)
+{
+ switch (reg) {
+ case SPE_REG_SP:
+ return "$sp";
+ case SPE_REG_RA:
+ return "$lr";
+ default:
+ {
+ /* cycle through four buffers to handle multiple calls per printf */
+ static char buf[4][10];
+ static int b = 0;
+ b = (b + 1) % 4;
+ sprintf(buf[b], "$%d", reg);
+ return buf[b];
+ }
+ }
+}
+
+
static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
- unsigned rA, unsigned rB)
+ unsigned rA, unsigned rB, const char *name)
{
union spe_inst_RR inst;
inst.inst.op = op;
@@ -153,11 +195,16 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.rT = rT;
p->store[p->num_inst++] = inst.bits;
assert(p->num_inst <= p->max_inst);
+ if (p->print) {
+ indent(p);
+ printf("%s\t%s, %s, %s\n",
+ rem_prefix(name), reg_name(rT), reg_name(rA), reg_name(rB));
+ }
}
static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
- unsigned rA, unsigned rB, unsigned rC)
+ unsigned rA, unsigned rB, unsigned rC, const char *name)
{
union spe_inst_RRR inst;
inst.inst.op = op;
@@ -167,11 +214,16 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.rC = rC;
p->store[p->num_inst++] = inst.bits;
assert(p->num_inst <= p->max_inst);
+ if (p->print) {
+ indent(p);
+ printf("%s\t%s, %s, %s, %s\n", rem_prefix(name), reg_name(rT),
+ reg_name(rA), reg_name(rB), reg_name(rC));
+ }
}
static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
- unsigned rA, int imm)
+ unsigned rA, int imm, const char *name)
{
union spe_inst_RI7 inst;
inst.inst.op = op;
@@ -180,12 +232,17 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.rT = rT;
p->store[p->num_inst++] = inst.bits;
assert(p->num_inst <= p->max_inst);
+ if (p->print) {
+ indent(p);
+ printf("%s\t%s, %s, 0x%x\n",
+ rem_prefix(name), reg_name(rT), reg_name(rA), imm);
+ }
}
static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
- unsigned rA, int imm)
+ unsigned rA, int imm, const char *name)
{
union spe_inst_RI8 inst;
inst.inst.op = op;
@@ -194,12 +251,17 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.rT = rT;
p->store[p->num_inst++] = inst.bits;
assert(p->num_inst <= p->max_inst);
+ if (p->print) {
+ indent(p);
+ printf("%s\t%s, %s, 0x%x\n",
+ rem_prefix(name), reg_name(rT), reg_name(rA), imm);
+ }
}
static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
- unsigned rA, int imm)
+ unsigned rA, int imm, const char *name)
{
union spe_inst_RI10 inst;
inst.inst.op = op;
@@ -208,11 +270,16 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.rT = rT;
p->store[p->num_inst++] = inst.bits;
assert(p->num_inst <= p->max_inst);
+ if (p->print) {
+ indent(p);
+ printf("%s\t%s, %s, 0x%x\n",
+ rem_prefix(name), reg_name(rT), reg_name(rA), imm);
+ }
}
static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
- int imm)
+ int imm, const char *name)
{
union spe_inst_RI16 inst;
inst.inst.op = op;
@@ -220,11 +287,15 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.rT = rT;
p->store[p->num_inst++] = inst.bits;
assert(p->num_inst <= p->max_inst);
+ if (p->print) {
+ indent(p);
+ printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
+ }
}
static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
- int imm)
+ int imm, const char *name)
{
union spe_inst_RI18 inst;
inst.inst.op = op;
@@ -232,6 +303,10 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.rT = rT;
p->store[p->num_inst++] = inst.bits;
assert(p->num_inst <= p->max_inst);
+ if (p->print) {
+ indent(p);
+ printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
+ }
}
@@ -240,80 +315,91 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
#define EMIT_(_name, _op) \
void _name (struct spe_function *p, unsigned rT) \
{ \
- emit_RR(p, _op, rT, 0, 0); \
+ emit_RR(p, _op, rT, 0, 0, __FUNCTION__); \
}
#define EMIT_R(_name, _op) \
void _name (struct spe_function *p, unsigned rT, unsigned rA) \
{ \
- emit_RR(p, _op, rT, rA, 0); \
+ emit_RR(p, _op, rT, rA, 0, __FUNCTION__); \
}
#define EMIT_RR(_name, _op) \
void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) \
{ \
- emit_RR(p, _op, rT, rA, rB); \
+ emit_RR(p, _op, rT, rA, rB, __FUNCTION__); \
}
#define EMIT_RRR(_name, _op) \
void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB, unsigned rC) \
{ \
- emit_RRR(p, _op, rT, rA, rB, rC); \
+ emit_RRR(p, _op, rT, rA, rB, rC, __FUNCTION__); \
}
#define EMIT_RI7(_name, _op) \
void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
{ \
- emit_RI7(p, _op, rT, rA, imm); \
+ emit_RI7(p, _op, rT, rA, imm, __FUNCTION__); \
}
#define EMIT_RI8(_name, _op, bias) \
void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
{ \
- emit_RI8(p, _op, rT, rA, bias - imm); \
+ emit_RI8(p, _op, rT, rA, bias - imm, __FUNCTION__); \
}
#define EMIT_RI10(_name, _op) \
void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
{ \
- emit_RI10(p, _op, rT, rA, imm); \
+ emit_RI10(p, _op, rT, rA, imm, __FUNCTION__); \
}
#define EMIT_RI16(_name, _op) \
void _name (struct spe_function *p, unsigned rT, int imm) \
{ \
- emit_RI16(p, _op, rT, imm); \
+ emit_RI16(p, _op, rT, imm, __FUNCTION__); \
}
#define EMIT_RI18(_name, _op) \
void _name (struct spe_function *p, unsigned rT, int imm) \
{ \
- emit_RI18(p, _op, rT, imm); \
+ emit_RI18(p, _op, rT, imm, __FUNCTION__); \
}
#define EMIT_I16(_name, _op) \
void _name (struct spe_function *p, int imm) \
{ \
- emit_RI16(p, _op, 0, imm); \
+ emit_RI16(p, _op, 0, imm, __FUNCTION__); \
}
#include "rtasm_ppc_spe.h"
+
/**
* Initialize an spe_function.
* \param code_size size of instruction buffer to allocate, in bytes.
*/
void spe_init_func(struct spe_function *p, unsigned code_size)
{
+ unsigned int i;
+
p->store = align_malloc(code_size, 16);
p->num_inst = 0;
p->max_inst = code_size / SPE_INST_SIZE;
+ p->set_count = 0;
+ memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0]));
+
/* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
*/
- p->regs[0] = ~7;
- p->regs[1] = (1U << (80 - 64)) - 1;
+ p->regs[0] = p->regs[1] = p->regs[2] = 1;
+ for (i = 80; i <= 127; i++) {
+ p->regs[i] = 1;
+ }
+
+ p->print = false;
+ p->indent = 0;
}
@@ -327,20 +413,23 @@ void spe_release_func(struct spe_function *p)
}
+/** Return current code size in bytes. */
+unsigned spe_code_size(const struct spe_function *p)
+{
+ return p->num_inst * SPE_INST_SIZE;
+}
+
+
/**
- * Alloate a SPE register.
+ * Allocate a SPE register.
* \return register index or -1 if none left.
*/
int spe_allocate_available_register(struct spe_function *p)
{
unsigned i;
for (i = 0; i < SPE_NUM_REGS; i++) {
- const uint64_t mask = (1ULL << (i % 64));
- const unsigned idx = i / 64;
-
- assert(idx < 2);
- if ((p->regs[idx] & mask) != 0) {
- p->regs[idx] &= ~mask;
+ if (p->regs[i] == 0) {
+ p->regs[i] = 1;
return i;
}
}
@@ -354,31 +443,150 @@ int spe_allocate_available_register(struct spe_function *p)
*/
int spe_allocate_register(struct spe_function *p, int reg)
{
- const unsigned idx = reg / 64;
- const unsigned bit = reg % 64;
-
assert(reg < SPE_NUM_REGS);
- assert((p->regs[idx] & (1ULL << bit)) != 0);
-
- p->regs[idx] &= ~(1ULL << bit);
+ assert(p->regs[reg] == 0);
+ p->regs[reg] = 1;
return reg;
}
/**
- * Mark the given SPE register as "unallocated".
+ * Mark the given SPE register as "unallocated". Note that this should
+ * only be used on registers allocated in the current register set; an
+ * assertion will fail if an attempt is made to deallocate a register
+ * allocated in an earlier register set.
*/
void spe_release_register(struct spe_function *p, int reg)
{
- const unsigned idx = reg / 64;
- const unsigned bit = reg % 64;
+ assert(reg < SPE_NUM_REGS);
+ assert(p->regs[reg] == 1);
- assert(idx < 2);
+ p->regs[reg] = 0;
+}
- assert(reg < SPE_NUM_REGS);
- assert((p->regs[idx] & (1ULL << bit)) == 0);
+/**
+ * Start a new set of registers. This can be called if
+ * it will be difficult later to determine exactly what
+ * registers were actually allocated during a code generation
+ * sequence, and you really just want to deallocate all of them.
+ */
+void spe_allocate_register_set(struct spe_function *p)
+{
+ unsigned int i;
+
+ /* Keep track of the set count. If it ever wraps around to 0,
+ * we're in trouble.
+ */
+ p->set_count++;
+ assert(p->set_count > 0);
+
+ /* Increment the allocation count of all registers currently
+ * allocated. Then any registers that are allocated in this set
+ * will be the only ones with a count of 1; they'll all be released
+ * when the register set is released.
+ */
+ for (i = 0; i < SPE_NUM_REGS; i++) {
+ if (p->regs[i] > 0)
+ p->regs[i]++;
+ }
+}
+
+void spe_release_register_set(struct spe_function *p)
+{
+ unsigned int i;
+
+ /* If the set count drops below zero, we're in trouble. */
+ assert(p->set_count > 0);
+ p->set_count--;
+
+ /* Drop the allocation level of all registers. Any allocated
+ * during this register set will drop to 0 and then become
+ * available.
+ */
+ for (i = 0; i < SPE_NUM_REGS; i++) {
+ if (p->regs[i] > 0)
+ p->regs[i]--;
+ }
+}
+
+
+unsigned
+spe_get_registers_used(const struct spe_function *p, ubyte used[])
+{
+ unsigned i, num = 0;
+ /* only count registers in the range available to callers */
+ for (i = 2; i < 80; i++) {
+ if (p->regs[i]) {
+ used[num++] = i;
+ }
+ }
+ return num;
+}
+
+
+void
+spe_print_code(struct spe_function *p, boolean enable)
+{
+ p->print = enable;
+}
+
+
+void
+spe_indent(struct spe_function *p, int spaces)
+{
+ p->indent += spaces;
+}
+
+
+void
+spe_comment(struct spe_function *p, int rel_indent, const char *s)
+{
+ if (p->print) {
+ p->indent += rel_indent;
+ indent(p);
+ p->indent -= rel_indent;
+ printf("# %s\n", s);
+ }
+}
+
+
+/**
+ * Load quad word.
+ * NOTE: imm is in bytes and the least significant 4 bits must be zero!
+ */
+void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+{
+ const boolean pSave = p->print;
+
+ p->print = FALSE;
+ assert(offset % 4 == 0);
+ emit_RI10(p, 0x034, rT, rA, offset >> 4, "spe_lqd");
+ p->print = pSave;
+
+ if (p->print) {
+ indent(p);
+ printf("lqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA));
+ }
+}
+
+
+/**
+ * Store quad word.
+ * NOTE: imm is in bytes and the least significant 4 bits must be zero!
+ */
+void spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+{
+ const boolean pSave = p->print;
- p->regs[idx] |= (1ULL << bit);
+ p->print = FALSE;
+ assert(offset % 4 == 0);
+ emit_RI10(p, 0x024, rT, rA, offset >> 4, "spe_stqd");
+ p->print = pSave;
+
+ if (p->print) {
+ indent(p);
+ printf("stqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA));
+ }
}
@@ -392,51 +600,51 @@ void spe_release_register(struct spe_function *p, int reg)
/** Branch Indirect to address in rA */
void spe_bi(struct spe_function *p, unsigned rA, int d, int e)
{
- emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4));
+ emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
}
/** Interupt Return */
void spe_iret(struct spe_function *p, unsigned rA, int d, int e)
{
- emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4));
+ emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
}
/** Branch indirect and set link on external data */
void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d,
int e)
{
- emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4));
+ emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
}
/** Branch indirect and set link. Save PC in rT, jump to rA. */
void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d,
int e)
{
- emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4));
+ emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
}
/** Branch indirect if zero word. If rT.word[0]==0, jump to rA. */
void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
{
- emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4));
+ emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
}
/** Branch indirect if non-zero word. If rT.word[0]!=0, jump to rA. */
void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
{
- emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4));
+ emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
}
/** Branch indirect if zero halfword. If rT.halfword[1]==0, jump to rA. */
void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
{
- emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4));
+ emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
}
/** Branch indirect if non-zero halfword. If rT.halfword[1]!=0, jump to rA. */
void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
{
- emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4));
+ emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
}
@@ -505,30 +713,223 @@ spe_load_int(struct spe_function *p, unsigned rT, int i)
}
else {
spe_ilhu(p, rT, i >> 16);
- spe_iohl(p, rT, i & 0xffff);
+ if (i & 0xffff)
+ spe_iohl(p, rT, i & 0xffff);
+ }
+}
+
+void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
+{
+ /* If the whole value is in the lower 18 bits, use ila, which
+ * doesn't sign-extend. Otherwise, if the two halfwords of
+ * the constant are identical, use ilh. Otherwise, if every byte of
+ * the desired value is 0x00 or 0xff, we can use Form Select Mask for
+ * Bytes Immediate (fsmbi) to load the value in a single instruction.
+ * Otherwise, in the general case, we have to use ilhu followed by iohl.
+ */
+ if ((ui & 0x3ffff) == ui) {
+ spe_ila(p, rT, ui);
+ }
+ else if ((ui >> 16) == (ui & 0xffff)) {
+ spe_ilh(p, rT, ui & 0xffff);
+ }
+ else if (
+ ((ui & 0x000000ff) == 0 || (ui & 0x000000ff) == 0x000000ff) &&
+ ((ui & 0x0000ff00) == 0 || (ui & 0x0000ff00) == 0x0000ff00) &&
+ ((ui & 0x00ff0000) == 0 || (ui & 0x00ff0000) == 0x00ff0000) &&
+ ((ui & 0xff000000) == 0 || (ui & 0xff000000) == 0xff000000)
+ ) {
+ unsigned int mask = 0;
+ /* fsmbi duplicates each bit in the given mask eight times,
+ * using a 16-bit value to initialize a 16-byte quadword.
+ * Each 4-bit nybble of the mask corresponds to a full word
+ * of the result; look at the value and figure out the mask
+ * (replicated for each word in the quadword), and then
+ * form the "select mask" to get the value.
+ */
+ if ((ui & 0x000000ff) == 0x000000ff) mask |= 0x1111;
+ if ((ui & 0x0000ff00) == 0x0000ff00) mask |= 0x2222;
+ if ((ui & 0x00ff0000) == 0x00ff0000) mask |= 0x4444;
+ if ((ui & 0xff000000) == 0xff000000) mask |= 0x8888;
+ spe_fsmbi(p, rT, mask);
+ }
+ else {
+ /* The general case: this usually uses two instructions, but
+ * may use only one if the low-order 16 bits of each word are 0.
+ */
+ spe_ilhu(p, rT, ui >> 16);
+ if (ui & 0xffff)
+ spe_iohl(p, rT, ui & 0xffff);
+ }
+}
+
+/**
+ * This function is constructed identically to spe_sor_uint() below.
+ * Changes to one should be made in the other.
+ */
+void
+spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+ /* If we can, emit a single instruction, either And Byte Immediate
+ * (which uses the same constant across each byte), And Halfword Immediate
+ * (which sign-extends a 10-bit immediate to 16 bits and uses that
+ * across each halfword), or And Word Immediate (which sign-extends
+ * a 10-bit immediate to 32 bits).
+ *
+ * Otherwise, we'll need to use a temporary register.
+ */
+ unsigned int tmp;
+
+ /* If the upper 23 bits are all 0s or all 1s, sign extension
+ * will work and we can use And Word Immediate
+ */
+ tmp = ui & 0xfffffe00;
+ if (tmp == 0xfffffe00 || tmp == 0) {
+ spe_andi(p, rT, rA, ui & 0x000003ff);
+ return;
+ }
+
+ /* If the ui field is symmetric along halfword boundaries and
+ * the upper 7 bits of each halfword are all 0s or 1s, we
+ * can use And Halfword Immediate
+ */
+ tmp = ui & 0xfe00fe00;
+ if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+ spe_andhi(p, rT, rA, ui & 0x000003ff);
+ return;
+ }
+
+ /* If the ui field is symmetric in each byte, then we can use
+ * the And Byte Immediate instruction.
+ */
+ tmp = ui & 0x000000ff;
+ if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+ spe_andbi(p, rT, rA, tmp);
+ return;
+ }
+
+ /* Otherwise, we'll have to use a temporary register. */
+ unsigned int tmp_reg = spe_allocate_available_register(p);
+ spe_load_uint(p, tmp_reg, ui);
+ spe_and(p, rT, rA, tmp_reg);
+ spe_release_register(p, tmp_reg);
+}
+
+
+/**
+ * This function is constructed identically to spe_and_uint() above.
+ * Changes to one should be made in the other.
+ */
+void
+spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+ /* If we can, emit a single instruction, either Exclusive Or Byte
+ * Immediate (which uses the same constant across each byte), Exclusive
+ * Or Halfword Immediate (which sign-extends a 10-bit immediate to
+ * 16 bits and uses that across each halfword), or Exclusive Or Word
+ * Immediate (which sign-extends a 10-bit immediate to 32 bits).
+ *
+ * Otherwise, we'll need to use a temporary register.
+ */
+ unsigned int tmp;
+
+ /* If the upper 23 bits are all 0s or all 1s, sign extension
+ * will work and we can use Exclusive Or Word Immediate
+ */
+ tmp = ui & 0xfffffe00;
+ if (tmp == 0xfffffe00 || tmp == 0) {
+ spe_xori(p, rT, rA, ui & 0x000003ff);
+ return;
+ }
+
+ /* If the ui field is symmetric along halfword boundaries and
+ * the upper 7 bits of each halfword are all 0s or 1s, we
+ * can use Exclusive Or Halfword Immediate
+ */
+ tmp = ui & 0xfe00fe00;
+ if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+ spe_xorhi(p, rT, rA, ui & 0x000003ff);
+ return;
+ }
+
+ /* If the ui field is symmetric in each byte, then we can use
+ * the Exclusive Or Byte Immediate instruction.
+ */
+ tmp = ui & 0x000000ff;
+ if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+ spe_xorbi(p, rT, rA, tmp);
+ return;
+ }
+
+ /* Otherwise, we'll have to use a temporary register. */
+ unsigned int tmp_reg = spe_allocate_available_register(p);
+ spe_load_uint(p, tmp_reg, ui);
+ spe_xor(p, rT, rA, tmp_reg);
+ spe_release_register(p, tmp_reg);
+}
+
+void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+ /* If the comparison value is 9 bits or less, it fits inside a
+ * Compare Equal Word Immediate instruction.
+ */
+ if ((ui & 0x000001ff) == ui) {
+ spe_ceqi(p, rT, rA, ui);
+ }
+ /* Otherwise, we're going to have to load a word first. */
+ else {
+ unsigned int tmp_reg = spe_allocate_available_register(p);
+ spe_load_uint(p, tmp_reg, ui);
+ spe_ceq(p, rT, rA, tmp_reg);
+ spe_release_register(p, tmp_reg);
}
}
+void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+ /* If the comparison value is 10 bits or less, it fits inside a
+ * Compare Logical Greater Than Word Immediate instruction.
+ */
+ if ((ui & 0x000003ff) == ui) {
+ spe_clgti(p, rT, rA, ui);
+ }
+ /* Otherwise, we're going to have to load a word first. */
+ else {
+ unsigned int tmp_reg = spe_allocate_available_register(p);
+ spe_load_uint(p, tmp_reg, ui);
+ spe_clgt(p, rT, rA, tmp_reg);
+ spe_release_register(p, tmp_reg);
+ }
+}
void
spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
{
- spe_ila(p, rT, 66051);
+ /* Duplicate bytes 0, 1, 2, and 3 across the whole register */
+ spe_ila(p, rT, 0x00010203);
spe_shufb(p, rT, rA, rA, rT);
}
void
-spe_complement(struct spe_function *p, unsigned rT)
+spe_complement(struct spe_function *p, unsigned rT, unsigned rA)
{
- spe_nor(p, rT, rT, rT);
+ spe_nor(p, rT, rA, rA);
}
void
spe_move(struct spe_function *p, unsigned rT, unsigned rA)
{
- spe_ori(p, rT, rA, 0);
+ /* Use different instructions depending on the instruction address
+ * to take advantage of the dual pipelines.
+ */
+ if (p->num_inst & 1)
+ spe_shlqbyi(p, rT, rA, 0); /* odd pipe */
+ else
+ spe_ori(p, rT, rA, 0); /* even pipe */
}
@@ -539,4 +940,70 @@ spe_zero(struct spe_function *p, unsigned rT)
}
+void
+spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
+{
+ assert(word >= 0);
+ assert(word <= 3);
+
+ if (word == 0) {
+ int tmp1 = rT;
+ spe_ila(p, tmp1, 66051);
+ spe_shufb(p, rT, rA, rA, tmp1);
+ }
+ else {
+ /* XXX review this, we may not need the rotqbyi instruction */
+ int tmp1 = rT;
+ int tmp2 = spe_allocate_available_register(p);
+
+ spe_ila(p, tmp1, 66051);
+ spe_rotqbyi(p, tmp2, rA, 4 * word);
+ spe_shufb(p, rT, tmp2, tmp2, tmp1);
+
+ spe_release_register(p, tmp2);
+ }
+}
+
+/**
+ * For each 32-bit float element of rA and rB, choose the smaller of the
+ * two, compositing them into the rT register.
+ *
+ * The Float Compare Greater Than (fcgt) instruction will put 1s into
+ * compare_reg where rA > rB, and 0s where rA <= rB.
+ *
+ * Then the Select Bits (selb) instruction will take bits from rA where
+ * compare_reg is 0, and from rB where compare_reg is 1; i.e., from rA
+ * where rA <= rB and from rB where rB > rA, which is exactly the
+ * "min" operation.
+ *
+ * The compare_reg could in many cases be the same as rT, unless
+ * rT == rA || rt == rB. But since this is common in constructions
+ * like "x = min(x, a)", we always allocate a new register to be safe.
+ */
+void
+spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
+{
+ unsigned int compare_reg = spe_allocate_available_register(p);
+ spe_fcgt(p, compare_reg, rA, rB);
+ spe_selb(p, rT, rA, rB, compare_reg);
+ spe_release_register(p, compare_reg);
+}
+
+/**
+ * For each 32-bit float element of rA and rB, choose the greater of the
+ * two, compositing them into the rT register.
+ *
+ * The logic is similar to that of spe_float_min() above; the only
+ * difference is that the registers on spe_selb() have been reversed,
+ * so that the larger of the two is selected instead of the smaller.
+ */
+void
+spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
+{
+ unsigned int compare_reg = spe_allocate_available_register(p);
+ spe_fcgt(p, compare_reg, rA, rB);
+ spe_selb(p, rT, rB, rA, compare_reg);
+ spe_release_register(p, compare_reg);
+}
+
#endif /* GALLIUM_CELL */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index d95e5aace34..47dadb343c2 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -28,6 +28,7 @@
* For details, see /opt/cell/sdk/docs/arch/SPU_ISA_v1.2_27Jan2007_pub.pdf
*
* \author Ian Romanick <[email protected]>
+ * \author Brian Paul
*/
#ifndef RTASM_PPC_SPE_H
@@ -39,10 +40,10 @@
/** number of general-purpose SIMD registers */
#define SPE_NUM_REGS 128
-/** Return Address register */
+/** Return Address register (aka $lr / Link Register) */
#define SPE_REG_RA 0
-/** Stack Pointer register */
+/** Stack Pointer register (aka $sp) */
#define SPE_REG_SP 1
@@ -52,25 +53,49 @@ struct spe_function
uint num_inst;
uint max_inst;
- /**
- * Mask of used / unused registers
- *
- * Each set bit corresponds to an available register. Each cleared bit
- * corresponds to an allocated register.
+ /**
+ * The "set count" reflects the number of nested register sets
+ * are allowed. In the unlikely case that we exceed the set count,
+ * register allocation will start to be confused, which is critical
+ * enough that we check for it.
+ */
+ unsigned char set_count;
+
+ /**
+ * Flags for used and unused registers. Each byte corresponds to a
+ * register; a 0 in that byte means that the register is available.
+ * A value of 1 means that the register was allocated in the current
+ * register set. Any other value N means that the register was allocated
+ * N register sets ago.
*
* \sa
* spe_allocate_register, spe_allocate_available_register,
- * spe_release_register
+ * spe_allocate_register_set, spe_release_register_set, spe_release_register,
*/
- uint64_t regs[SPE_NUM_REGS / 64];
+ unsigned char regs[SPE_NUM_REGS];
+
+ boolean print; /**< print/dump instructions as they're emitted? */
+ int indent; /**< number of spaces to indent */
};
+
extern void spe_init_func(struct spe_function *p, unsigned code_size);
extern void spe_release_func(struct spe_function *p);
+extern unsigned spe_code_size(const struct spe_function *p);
extern int spe_allocate_available_register(struct spe_function *p);
extern int spe_allocate_register(struct spe_function *p, int reg);
extern void spe_release_register(struct spe_function *p, int reg);
+extern void spe_allocate_register_set(struct spe_function *p);
+extern void spe_release_register_set(struct spe_function *p);
+
+extern unsigned
+spe_get_registers_used(const struct spe_function *p, ubyte used[]);
+
+extern void spe_print_code(struct spe_function *p, boolean enable);
+extern void spe_indent(struct spe_function *p, int spaces);
+extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
+
#endif /* RTASM_PPC_SPE_H */
@@ -106,11 +131,9 @@ extern void spe_release_register(struct spe_function *p, int reg);
/* Memory load / store instructions
*/
-EMIT_RI10(spe_lqd, 0x034);
EMIT_RR (spe_lqx, 0x1c4);
EMIT_RI16(spe_lqa, 0x061);
EMIT_RI16(spe_lqr, 0x067);
-EMIT_RI10(spe_stqd, 0x024);
EMIT_RR (spe_stqx, 0x144);
EMIT_RI16(spe_stqa, 0x041);
EMIT_RI16(spe_stqr, 0x047);
@@ -268,6 +291,12 @@ EMIT_RI16(spe_brz, 0x040);
EMIT_RI16(spe_brhnz, 0x046);
EMIT_RI16(spe_brhz, 0x044);
+extern void
+spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
+
+extern void
+spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
+
extern void spe_bi(struct spe_function *p, unsigned rA, int d, int e);
extern void spe_iret(struct spe_function *p, unsigned rA, int d, int e);
extern void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA,
@@ -292,13 +321,33 @@ spe_load_float(struct spe_function *p, unsigned rT, float x);
extern void
spe_load_int(struct spe_function *p, unsigned rT, int i);
+/** Load/splat immediate unsigned int into rT. */
+extern void
+spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
+
+/** And immediate value into rT. */
+extern void
+spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Xor immediate value into rT. */
+extern void
+spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare equal with immediate value. */
+extern void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare greater with immediate value. */
+extern void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
/** Replicate word 0 of rA across rT. */
extern void
spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
-/** Complement/invert all bits in rT. */
+/** rT = complement_all_bits(rA). */
extern void
-spe_complement(struct spe_function *p, unsigned rT);
+spe_complement(struct spe_function *p, unsigned rT, unsigned rA);
/** rT = rA. */
extern void
@@ -308,6 +357,18 @@ spe_move(struct spe_function *p, unsigned rT, unsigned rA);
extern void
spe_zero(struct spe_function *p, unsigned rT);
+/** rT = splat(rA, word) */
+extern void
+spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word);
+
+/** rT = float min(rA, rB) */
+extern void
+spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+
+/** rT = float max(rA, rB) */
+extern void
+spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+
/* Floating-point instructions
*/
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index ad9d8f8ced9..99ee74cf14b 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -240,7 +240,8 @@ static void emit_modrm( struct x86_function *p,
/* Oh-oh we've stumbled into the SIB thing.
*/
if (regmem.file == file_REG32 &&
- regmem.idx == reg_SP) {
+ regmem.idx == reg_SP &&
+ regmem.mod != mod_REG) {
emit_1ub(p, 0x24); /* simplistic! */
}
@@ -439,25 +440,70 @@ void x86_call( struct x86_function *p, struct x86_reg reg)
}
-/* michal:
- * Temporary. As I need immediate operands, and dont want to mess with the codegen,
- * I load the immediate into general purpose register and use it.
- */
void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
{
DUMP_RI( dst, imm );
+ assert(dst.file == file_REG32);
assert(dst.mod == mod_REG);
emit_1ub(p, 0xb8 + dst.idx);
emit_1i(p, imm);
}
-void x86_add_reg_imm8( struct x86_function *p, struct x86_reg dst, ubyte imm )
+/**
+ * Immediate group 1 instructions.
+ */
+static INLINE void
+x86_group1_imm( struct x86_function *p,
+ unsigned op, struct x86_reg dst, int imm )
{
- DUMP_RI( dst, imm );
+ assert(dst.file == file_REG32);
assert(dst.mod == mod_REG);
- emit_1ub(p, 0x80);
- emit_modrm_noreg(p, 0, dst);
- emit_1ub(p, imm);
+ if(-0x80 <= imm && imm < 0x80) {
+ emit_1ub(p, 0x83);
+ emit_modrm_noreg(p, op, dst);
+ emit_1b(p, (char)imm);
+ }
+ else {
+ emit_1ub(p, 0x81);
+ emit_modrm_noreg(p, op, dst);
+ emit_1i(p, imm);
+ }
+}
+
+void x86_add_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+ DUMP_RI( dst, imm );
+ x86_group1_imm(p, 0, dst, imm);
+}
+
+void x86_or_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+ DUMP_RI( dst, imm );
+ x86_group1_imm(p, 1, dst, imm);
+}
+
+void x86_and_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+ DUMP_RI( dst, imm );
+ x86_group1_imm(p, 4, dst, imm);
+}
+
+void x86_sub_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+ DUMP_RI( dst, imm );
+ x86_group1_imm(p, 5, dst, imm);
+}
+
+void x86_xor_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+ DUMP_RI( dst, imm );
+ x86_group1_imm(p, 6, dst, imm);
+}
+
+void x86_cmp_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+ DUMP_RI( dst, imm );
+ x86_group1_imm(p, 7, dst, imm);
}
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index af79f07dd39..1b5eaaca850 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -152,12 +152,13 @@ void x86_jmp( struct x86_function *p, int label );
/* void x86_call( struct x86_function *p, void (*label)() ); */
void x86_call( struct x86_function *p, struct x86_reg reg);
-/* michal:
- * Temporary. As I need immediate operands, and dont want to mess with the codegen,
- * I load the immediate into general purpose register and use it.
- */
void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm );
-void x86_add_reg_imm8( struct x86_function *p, struct x86_reg dst, ubyte imm );
+void x86_add_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_or_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_and_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_sub_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_xor_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_cmp_imm( struct x86_function *p, struct x86_reg dst, int imm );
/* Macro for sse_shufps() and sse2_pshufd():
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index 74614d36884..38fcaf88293 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -793,10 +793,14 @@ tgsi_default_instruction_ext_nv( void )
return instruction_ext_nv;
}
-union token_u32
+
+/** test for inequality of 32-bit values pointed to by a and b */
+static INLINE boolean
+compare32(const void *a, const void *b)
{
- unsigned u32;
-};
+ return *((uint32_t *) a) != *((uint32_t *) b);
+}
+
unsigned
tgsi_compare_instruction_ext_nv(
@@ -805,7 +809,7 @@ tgsi_compare_instruction_ext_nv(
{
a.Padding = b.Padding = 0;
a.Extended = b.Extended = 0;
- return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+ return compare32(&a, &b);
}
struct tgsi_instruction_ext_nv
@@ -864,7 +868,7 @@ tgsi_compare_instruction_ext_label(
{
a.Padding = b.Padding = 0;
a.Extended = b.Extended = 0;
- return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+ return compare32(&a, &b);
}
struct tgsi_instruction_ext_label
@@ -905,7 +909,7 @@ tgsi_compare_instruction_ext_texture(
{
a.Padding = b.Padding = 0;
a.Extended = b.Extended = 0;
- return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+ return compare32(&a, &b);
}
struct tgsi_instruction_ext_texture
@@ -1027,7 +1031,7 @@ tgsi_compare_src_register_ext_swz(
{
a.Padding = b.Padding = 0;
a.Extended = b.Extended = 0;
- return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+ return compare32(&a, &b);
}
struct tgsi_src_register_ext_swz
@@ -1095,7 +1099,7 @@ tgsi_compare_src_register_ext_mod(
{
a.Padding = b.Padding = 0;
a.Extended = b.Extended = 0;
- return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+ return compare32(&a, &b);
}
struct tgsi_src_register_ext_mod
@@ -1241,7 +1245,7 @@ tgsi_compare_dst_register_ext_concode(
{
a.Padding = b.Padding = 0;
a.Extended = b.Extended = 0;
- return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+ return compare32(&a, &b);
}
struct tgsi_dst_register_ext_concode
@@ -1299,7 +1303,7 @@ tgsi_compare_dst_register_ext_modulate(
{
a.Padding = b.Padding = 0;
a.Extended = b.Extended = 0;
- return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+ return compare32(&a, &b);
}
struct tgsi_dst_register_ext_modulate
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index df002939c6b..1a5294eabc3 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -1674,6 +1674,7 @@ exec_declaration(
break;
default:
+ eval = NULL;
assert( 0 );
}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index 3757486ba9b..2cd56e413a5 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -88,16 +88,33 @@ tgsi_parse_end_of_tokens(
1 + ctx->FullHeader.Header.HeaderSize + ctx->FullHeader.Header.BodySize;
}
+
+/**
+ * This function is used to avoid and work-around type punning/aliasing
+ * warnings. The warnings seem harmless on x86 but on PPC they cause
+ * real failures.
+ */
+static INLINE void
+copy_token(void *dst, const void *src)
+{
+ memcpy(dst, src, 4);
+}
+
+
+/**
+ * Get next 4-byte token, return it at address specified by 'token'
+ */
static void
next_token(
struct tgsi_parse_context *ctx,
void *token )
{
assert( !tgsi_parse_end_of_tokens( ctx ) );
-
- *(struct tgsi_token *) token = ctx->Tokens[ctx->Position++];
+ copy_token(token, &ctx->Tokens[ctx->Position]);
+ ctx->Position++;
}
+
void
tgsi_parse_token(
struct tgsi_parse_context *ctx )
@@ -116,7 +133,7 @@ tgsi_parse_token(
struct tgsi_full_declaration *decl = &ctx->FullToken.FullDeclaration;
*decl = tgsi_default_full_declaration();
- decl->Declaration = *(struct tgsi_declaration *) &token;
+ copy_token(&decl->Declaration, &token);
next_token( ctx, &decl->DeclarationRange );
@@ -132,8 +149,7 @@ tgsi_parse_token(
struct tgsi_full_immediate *imm = &ctx->FullToken.FullImmediate;
*imm = tgsi_default_full_immediate();
- imm->Immediate = *(struct tgsi_immediate *) &token;
-
+ copy_token(&imm->Immediate, &token);
assert( !imm->Immediate.Extended );
switch (imm->Immediate.DataType) {
@@ -158,8 +174,7 @@ tgsi_parse_token(
unsigned extended;
*inst = tgsi_default_full_instruction();
- inst->Instruction = *(struct tgsi_instruction *) &token;
-
+ copy_token(&inst->Instruction, &token);
extended = inst->Instruction.Extended;
while( extended ) {
@@ -169,18 +184,15 @@ tgsi_parse_token(
switch( token.Type ) {
case TGSI_INSTRUCTION_EXT_TYPE_NV:
- inst->InstructionExtNv =
- *(struct tgsi_instruction_ext_nv *) &token;
+ copy_token(&inst->InstructionExtNv, &token);
break;
case TGSI_INSTRUCTION_EXT_TYPE_LABEL:
- inst->InstructionExtLabel =
- *(struct tgsi_instruction_ext_label *) &token;
+ copy_token(&inst->InstructionExtLabel, &token);
break;
case TGSI_INSTRUCTION_EXT_TYPE_TEXTURE:
- inst->InstructionExtTexture =
- *(struct tgsi_instruction_ext_texture *) &token;
+ copy_token(&inst->InstructionExtTexture, &token);
break;
default:
@@ -212,13 +224,13 @@ tgsi_parse_token(
switch( token.Type ) {
case TGSI_DST_REGISTER_EXT_TYPE_CONDCODE:
- inst->FullDstRegisters[i].DstRegisterExtConcode =
- *(struct tgsi_dst_register_ext_concode *) &token;
+ copy_token(&inst->FullDstRegisters[i].DstRegisterExtConcode,
+ &token);
break;
case TGSI_DST_REGISTER_EXT_TYPE_MODULATE:
- inst->FullDstRegisters[i].DstRegisterExtModulate =
- *(struct tgsi_dst_register_ext_modulate *) &token;
+ copy_token(&inst->FullDstRegisters[i].DstRegisterExtModulate,
+ &token);
break;
default:
@@ -245,13 +257,13 @@ tgsi_parse_token(
switch( token.Type ) {
case TGSI_SRC_REGISTER_EXT_TYPE_SWZ:
- inst->FullSrcRegisters[i].SrcRegisterExtSwz =
- *(struct tgsi_src_register_ext_swz *) &token;
+ copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtSwz,
+ &token);
break;
case TGSI_SRC_REGISTER_EXT_TYPE_MOD:
- inst->FullSrcRegisters[i].SrcRegisterExtMod =
- *(struct tgsi_src_register_ext_mod *) &token;
+ copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtMod,
+ &token);
break;
default:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 4681b29f52b..f79170b9d65 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -25,9 +25,14 @@
*
**************************************************************************/
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
+
#include "pipe/p_debug.h"
#include "pipe/p_shader_tokens.h"
#include "util/u_math.h"
+#include "util/u_sse.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"
#include "tgsi_exec.h"
@@ -35,8 +40,6 @@
#include "rtasm/rtasm_x86sse.h"
-#ifdef PIPE_ARCH_X86
-
/* for 1/sqrt()
*
* This costs about 100fps (close to 10%) in gears:
@@ -480,10 +483,31 @@ emit_coef_dady(
* Function call helpers.
*/
+/**
+ * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
+ * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
+ * that the stack pointer is 16 byte aligned, as expected.
+ */
static void
-emit_push_gp(
- struct x86_function *func )
+emit_func_call_dst(
+ struct x86_function *func,
+ unsigned xmm_save,
+ unsigned xmm_dst,
+ void (PIPE_CDECL *code)() )
{
+ struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
+ unsigned i, n, xmm;
+ unsigned xmm_mask;
+
+ /* Bitmask of the xmm registers to save */
+ xmm_mask = (1 << xmm_save) - 1;
+ xmm_mask &= ~(1 << xmm_dst);
+
+ sse_movaps(
+ func,
+ get_temp( TEMP_R0, 0 ),
+ make_xmm( xmm_dst ) );
+
x86_push(
func,
x86_make_reg( file_REG32, reg_AX) );
@@ -493,12 +517,49 @@ emit_push_gp(
x86_push(
func,
x86_make_reg( file_REG32, reg_DX) );
-}
+
+ for(i = 0, n = 0; i < 8; ++i)
+ if(xmm_mask & (1 << i))
+ ++n;
+
+ x86_sub_imm(
+ func,
+ x86_make_reg( file_REG32, reg_SP ),
+ n*16);
+
+ for(i = 0, n = 0; i < 8; ++i)
+ if(xmm_mask & (1 << i)) {
+ sse_movups(
+ func,
+ x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
+ make_xmm( xmm ) );
+ ++n;
+ }
+
+ x86_lea(
+ func,
+ ecx,
+ get_temp( TEMP_R0, 0 ) );
+
+ x86_push( func, ecx );
+ x86_mov_reg_imm( func, ecx, (unsigned long) code );
+ x86_call( func, ecx );
+ x86_pop(func, ecx );
+
+ for(i = 0, n = 0; i < 8; ++i)
+ if(xmm_mask & (1 << i)) {
+ sse_movups(
+ func,
+ make_xmm( xmm ),
+ x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
+ ++n;
+ }
+
+ x86_add_imm(
+ func,
+ x86_make_reg( file_REG32, reg_SP ),
+ n*16);
-static void
-x86_pop_gp(
- struct x86_function *func )
-{
/* Restore GP registers in a reverse order.
*/
x86_pop(
@@ -510,39 +571,6 @@ x86_pop_gp(
x86_pop(
func,
x86_make_reg( file_REG32, reg_AX) );
-}
-
-static void
-emit_func_call_dst(
- struct x86_function *func,
- unsigned xmm_dst,
- void (PIPE_CDECL *code)() )
-{
- sse_movaps(
- func,
- get_temp( TEMP_R0, 0 ),
- make_xmm( xmm_dst ) );
-
- emit_push_gp(
- func );
-
- {
- struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-
- x86_lea(
- func,
- ecx,
- get_temp( TEMP_R0, 0 ) );
-
- x86_push( func, ecx );
- x86_mov_reg_imm( func, ecx, (unsigned long) code );
- x86_call( func, ecx );
- x86_pop(func, ecx );
- }
-
-
- x86_pop_gp(
- func );
sse_movaps(
func,
@@ -553,6 +581,7 @@ emit_func_call_dst(
static void
emit_func_call_dst_src(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst,
unsigned xmm_src,
void (PIPE_CDECL *code)() )
@@ -564,10 +593,111 @@ emit_func_call_dst_src(
emit_func_call_dst(
func,
+ xmm_save,
xmm_dst,
code );
}
+/*
+ * Fast SSE2 implementation of special math functions.
+ */
+
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+#define EXP_POLY_DEGREE 3
+#define LOG_POLY_DEGREE 5
+
+/**
+ * See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128
+exp2f4(__m128 x)
+{
+ __m128i ipart;
+ __m128 fpart, expipart, expfpart;
+
+ x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
+ x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
+
+ /* ipart = int(x - 0.5) */
+ ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
+
+ /* fpart = x - ipart */
+ fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
+
+ /* expipart = (float) (1 << ipart) */
+ expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
+
+ /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
+#if EXP_POLY_DEGREE == 5
+ expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
+#elif EXP_POLY_DEGREE == 4
+ expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
+#elif EXP_POLY_DEGREE == 3
+ expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
+#elif EXP_POLY_DEGREE == 2
+ expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
+#else
+#error
+#endif
+
+ return _mm_mul_ps(expipart, expfpart);
+}
+
+/**
+ * See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128
+log2f4(__m128 x)
+{
+ __m128i expmask = _mm_set1_epi32(0x7f800000);
+ __m128i mantmask = _mm_set1_epi32(0x007fffff);
+ __m128 one = _mm_set1_ps(1.0f);
+
+ __m128i i = _mm_castps_si128(x);
+
+ /* exp = (float) exponent(x) */
+ __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
+
+ /* mant = (float) mantissa(x) */
+ __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
+
+ __m128 logmant;
+
+ /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
+ * These coefficients can be generate with
+ * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
+ */
+#if LOG_POLY_DEGREE == 6
+ logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
+#elif LOG_POLY_DEGREE == 5
+ logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+ logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+ logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+#else
+#error
+#endif
+
+ /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
+ logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
+
+ return _mm_add_ps(logmant, exp);
+}
+
+static INLINE __m128
+powf4(__m128 x, __m128 y)
+{
+ return exp2f4(_mm_mul_ps(log2f4(x), y));
+}
+
+
/**
* Low-level instruction translators.
*/
@@ -610,38 +740,35 @@ cos4f(
static void
emit_cos(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst )
{
emit_func_call_dst(
func,
+ xmm_save,
xmm_dst,
cos4f );
}
static void PIPE_CDECL
+#if defined(PIPE_CC_GCC)
+__attribute__((force_align_arg_pointer))
+#endif
ex24f(
float *store )
{
-#if FAST_MATH
- store[0] = util_fast_exp2( store[0] );
- store[1] = util_fast_exp2( store[1] );
- store[2] = util_fast_exp2( store[2] );
- store[3] = util_fast_exp2( store[3] );
-#else
- store[0] = powf( 2.0f, store[0] );
- store[1] = powf( 2.0f, store[1] );
- store[2] = powf( 2.0f, store[2] );
- store[3] = powf( 2.0f, store[3] );
-#endif
+ _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
}
static void
emit_ex2(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst )
{
emit_func_call_dst(
func,
+ xmm_save,
xmm_dst,
ex24f );
}
@@ -670,10 +797,12 @@ flr4f(
static void
emit_flr(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst )
{
emit_func_call_dst(
func,
+ xmm_save,
xmm_dst,
flr4f );
}
@@ -691,31 +820,35 @@ frc4f(
static void
emit_frc(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst )
{
emit_func_call_dst(
func,
+ xmm_save,
xmm_dst,
frc4f );
}
static void PIPE_CDECL
+#if defined(PIPE_CC_GCC)
+__attribute__((force_align_arg_pointer))
+#endif
lg24f(
float *store )
{
- store[0] = util_fast_log2( store[0] );
- store[1] = util_fast_log2( store[1] );
- store[2] = util_fast_log2( store[2] );
- store[3] = util_fast_log2( store[3] );
+ _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
}
static void
emit_lg2(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst )
{
emit_func_call_dst(
func,
+ xmm_save,
xmm_dst,
lg24f );
}
@@ -757,14 +890,14 @@ emit_neg(
}
static void PIPE_CDECL
+#if defined(PIPE_CC_GCC)
+__attribute__((force_align_arg_pointer))
+#endif
pow4f(
float *store )
{
-#if FAST_MATH
- store[0] = util_fast_pow( store[0], store[4] );
- store[1] = util_fast_pow( store[1], store[5] );
- store[2] = util_fast_pow( store[2], store[6] );
- store[3] = util_fast_pow( store[3], store[7] );
+#if 1
+ _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
#else
store[0] = powf( store[0], store[4] );
store[1] = powf( store[1], store[5] );
@@ -776,11 +909,13 @@ pow4f(
static void
emit_pow(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst,
unsigned xmm_src )
{
emit_func_call_dst_src(
func,
+ xmm_save,
xmm_dst,
xmm_src,
pow4f );
@@ -873,10 +1008,12 @@ sin4f(
static void
emit_sin (struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst)
{
emit_func_call_dst(
func,
+ xmm_save,
xmm_dst,
sin4f );
}
@@ -1296,7 +1433,7 @@ emit_instruction(
get_temp(
TGSI_EXEC_TEMP_MINUS_128_I,
TGSI_EXEC_TEMP_MINUS_128_C ) );
- emit_pow( func, 1, 2 );
+ emit_pow( func, 3, 1, 2 );
FETCH( func, *inst, 0, 0, CHAN_X );
sse_xorps(
func,
@@ -1342,11 +1479,11 @@ emit_instruction(
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
emit_MOV( func, 1, 0 );
- emit_flr( func, 1 );
+ emit_flr( func, 2, 1 );
/* dst.x = ex2(floor(src.x)) */
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
emit_MOV( func, 2, 1 );
- emit_ex2( func, 2 );
+ emit_ex2( func, 3, 2 );
STORE( func, *inst, 2, 0, CHAN_X );
}
/* dst.y = src.x - floor(src.x) */
@@ -1358,7 +1495,7 @@ emit_instruction(
}
/* dst.z = ex2(src.x) */
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
- emit_ex2( func, 0 );
+ emit_ex2( func, 3, 0 );
STORE( func, *inst, 0, 0, CHAN_Z );
}
}
@@ -1376,21 +1513,21 @@ emit_instruction(
FETCH( func, *inst, 0, 0, CHAN_X );
emit_abs( func, 0 );
emit_MOV( func, 1, 0 );
- emit_lg2( func, 1 );
+ emit_lg2( func, 2, 1 );
/* dst.z = lg2(abs(src.x)) */
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
STORE( func, *inst, 1, 0, CHAN_Z );
}
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
- emit_flr( func, 1 );
+ emit_flr( func, 2, 1 );
/* dst.x = floor(lg2(abs(src.x))) */
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
STORE( func, *inst, 1, 0, CHAN_X );
}
/* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
- emit_ex2( func, 1 );
+ emit_ex2( func, 2, 1 );
emit_rcp( func, 1, 1 );
emit_mul( func, 0, 1 );
STORE( func, *inst, 0, 0, CHAN_Y );
@@ -1580,7 +1717,7 @@ emit_instruction(
/* TGSI_OPCODE_FRC */
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( func, *inst, 0, 0, chan_index );
- emit_frc( func, 0 );
+ emit_frc( func, 0, 0 );
STORE( func, *inst, 0, 0, chan_index );
}
break;
@@ -1593,7 +1730,7 @@ emit_instruction(
/* TGSI_OPCODE_FLR */
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( func, *inst, 0, 0, chan_index );
- emit_flr( func, 0 );
+ emit_flr( func, 0, 0 );
STORE( func, *inst, 0, 0, chan_index );
}
break;
@@ -1605,7 +1742,7 @@ emit_instruction(
case TGSI_OPCODE_EXPBASE2:
/* TGSI_OPCODE_EX2 */
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_ex2( func, 0 );
+ emit_ex2( func, 0, 0 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( func, *inst, 0, 0, chan_index );
}
@@ -1614,7 +1751,7 @@ emit_instruction(
case TGSI_OPCODE_LOGBASE2:
/* TGSI_OPCODE_LG2 */
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_lg2( func, 0 );
+ emit_lg2( func, 0, 0 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( func, *inst, 0, 0, chan_index );
}
@@ -1624,7 +1761,7 @@ emit_instruction(
/* TGSI_OPCODE_POW */
FETCH( func, *inst, 0, 0, CHAN_X );
FETCH( func, *inst, 1, 1, CHAN_X );
- emit_pow( func, 0, 1 );
+ emit_pow( func, 0, 0, 1 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( func, *inst, 0, 0, chan_index );
}
@@ -1715,7 +1852,7 @@ emit_instruction(
case TGSI_OPCODE_COS:
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_cos( func, 0 );
+ emit_cos( func, 0, 0 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( func, *inst, 0, 0, chan_index );
}
@@ -1774,7 +1911,7 @@ emit_instruction(
case TGSI_OPCODE_SIN:
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_sin( func, 0 );
+ emit_sin( func, 0, 0 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( func, *inst, 0, 0, chan_index );
}
@@ -1868,12 +2005,12 @@ emit_instruction(
case TGSI_OPCODE_SCS:
IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_cos( func, 0 );
+ emit_cos( func, 0, 0 );
STORE( func, *inst, 0, 0, CHAN_X );
}
IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_sin( func, 0 );
+ emit_sin( func, 0, 0 );
STORE( func, *inst, 0, 0, CHAN_Y );
}
IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
diff --git a/src/gallium/auxiliary/util/Makefile b/src/gallium/auxiliary/util/Makefile
index d3951e4e7d7..b3d1045a8f4 100644
--- a/src/gallium/auxiliary/util/Makefile
+++ b/src/gallium/auxiliary/util/Makefile
@@ -10,6 +10,7 @@ C_SOURCES = \
u_gen_mipmap.c \
u_handle_table.c \
u_hash_table.c \
+ u_keymap.c \
u_math.c \
u_mm.c \
u_rect.c \
diff --git a/src/gallium/auxiliary/util/SConscript b/src/gallium/auxiliary/util/SConscript
index e65c17b1cc8..8a04955a16e 100644
--- a/src/gallium/auxiliary/util/SConscript
+++ b/src/gallium/auxiliary/util/SConscript
@@ -11,13 +11,14 @@ util = env.ConvenienceLibrary(
'u_gen_mipmap.c',
'u_handle_table.c',
'u_hash_table.c',
+ 'u_keymap.c',
'u_math.c',
'u_mm.c',
'u_rect.c',
'u_simple_shaders.c',
'u_snprintf.c',
- 'u_stream_stdc.c',
- 'u_stream_wd.c',
+ 'u_stream_stdc.c',
+ 'u_stream_wd.c',
'u_tile.c',
'u_time.c',
])
diff --git a/src/gallium/auxiliary/util/p_debug.c b/src/gallium/auxiliary/util/p_debug.c
index b6cff281e6d..3ed8bdfdf33 100644
--- a/src/gallium/auxiliary/util/p_debug.c
+++ b/src/gallium/auxiliary/util/p_debug.c
@@ -306,6 +306,13 @@ debug_get_flags_option(const char *name,
str = _debug_get_option(name);
if(!str)
result = dfault;
+ else if (!util_strcmp(str, "help")) {
+ result = dfault;
+ while (flags->name) {
+ debug_printf("%s: help for %s: %s [0x%lx]\n", __FUNCTION__, name, flags->name, flags->value);
+ flags++;
+ }
+ }
else {
result = 0;
while( flags->name ) {
@@ -315,7 +322,12 @@ debug_get_flags_option(const char *name,
}
}
- debug_printf("%s: %s = 0x%lx\n", __FUNCTION__, name, result);
+ if (str) {
+ debug_printf("%s: %s = 0x%lx (%s)\n", __FUNCTION__, name, result, str);
+ }
+ else {
+ debug_printf("%s: %s = 0x%lx\n", __FUNCTION__, name, result);
+ }
return result;
}
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index 9adf72944e6..d28201ac8dc 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -104,6 +104,7 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
ctx->rasterizer.cull_mode = PIPE_WINDING_NONE;
ctx->rasterizer.bypass_clipping = 1;
/*ctx->rasterizer.bypass_vs = 1;*/
+ ctx->rasterizer.gl_rasterization_rules = 1;
/* samplers */
memset(&ctx->sampler, 0, sizeof(ctx->sampler));
diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c
index b19a649bbcd..9d305ad763a 100644
--- a/src/gallium/auxiliary/util/u_gen_mipmap.c
+++ b/src/gallium/auxiliary/util/u_gen_mipmap.c
@@ -725,6 +725,7 @@ util_create_gen_mipmap(struct pipe_context *pipe,
ctx->rasterizer.cull_mode = PIPE_WINDING_NONE;
ctx->rasterizer.bypass_clipping = 1;
/*ctx->rasterizer.bypass_vs = 1;*/
+ ctx->rasterizer.gl_rasterization_rules = 1;
/* sampler state */
memset(&ctx->sampler, 0, sizeof(ctx->sampler));
diff --git a/src/gallium/auxiliary/util/u_keymap.c b/src/gallium/auxiliary/util/u_keymap.c
new file mode 100644
index 00000000000..01b17ddb1b3
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_keymap.c
@@ -0,0 +1,309 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Key lookup/associative container.
+ *
+ * Like Jose's u_hash_table, based on CSO cache code for now.
+ *
+ * Author: Brian Paul
+ */
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
+#include "pipe/p_error.h"
+
+#include "cso_cache/cso_hash.h"
+
+#include "util/u_memory.h"
+#include "util/u_keymap.h"
+
+
+struct keymap
+{
+ struct cso_hash *cso;
+ unsigned key_size;
+ unsigned max_entries; /* XXX not obeyed net */
+ unsigned num_entries;
+ keymap_delete_func delete_func;
+};
+
+
+struct keymap_item
+{
+ void *key, *value;
+};
+
+
+/**
+ * This the default key-delete function used when the client doesn't
+ * provide one.
+ */
+static void
+default_delete_func(const struct keymap *map,
+ const void *key, void *data, void *user)
+{
+ FREE((void*) data);
+}
+
+
+static INLINE struct keymap_item *
+hash_table_item(struct cso_hash_iter iter)
+{
+ return (struct keymap_item *) cso_hash_iter_data(iter);
+}
+
+
+/**
+ * Return 4-byte hash key for a block of bytes.
+ */
+static unsigned
+hash(const void *key, unsigned keySize)
+{
+ unsigned i, hash;
+
+ keySize /= 4; /* convert from bytes to uints */
+
+ hash = 0;
+ for (i = 0; i < keySize; i++) {
+ hash ^= (i + 1) * ((const unsigned *) key)[i];
+ }
+
+ /*hash = hash ^ (hash >> 11) ^ (hash >> 22);*/
+
+ return hash;
+}
+
+
+/**
+ * Create a new map.
+ * \param keySize size of the keys in bytes
+ * \param maxEntries max number of entries to allow (~0 = infinity)
+ * \param deleteFunc optional callback to call when entries
+ * are deleted/replaced
+ */
+struct keymap *
+util_new_keymap(unsigned keySize, unsigned maxEntries,
+ keymap_delete_func deleteFunc)
+{
+ struct keymap *map = MALLOC_STRUCT(keymap);
+ if (!map)
+ return NULL;
+
+ map->cso = cso_hash_create();
+ if (!map->cso) {
+ FREE(map);
+ return NULL;
+ }
+
+ map->max_entries = maxEntries;
+ map->num_entries = 0;
+ map->key_size = keySize;
+ map->delete_func = deleteFunc ? deleteFunc : default_delete_func;
+
+ return map;
+}
+
+
+/**
+ * Delete/free a keymap and all entries. The deleteFunc that was given at
+ * create time will be called for each entry.
+ * \param user user-provided pointer passed through to the delete callback
+ */
+void
+util_delete_keymap(struct keymap *map, void *user)
+{
+ util_keymap_remove_all(map, user);
+ cso_hash_delete(map->cso);
+ FREE(map);
+}
+
+
+static INLINE struct cso_hash_iter
+hash_table_find_iter(const struct keymap *map, const void *key,
+ unsigned key_hash)
+{
+ struct cso_hash_iter iter;
+ struct keymap_item *item;
+
+ iter = cso_hash_find(map->cso, key_hash);
+ while (!cso_hash_iter_is_null(iter)) {
+ item = (struct keymap_item *) cso_hash_iter_data(iter);
+ if (!memcmp(item->key, key, map->key_size))
+ break;
+ iter = cso_hash_iter_next(iter);
+ }
+
+ return iter;
+}
+
+
+static INLINE struct keymap_item *
+hash_table_find_item(const struct keymap *map, const void *key,
+ unsigned key_hash)
+{
+ struct cso_hash_iter iter = hash_table_find_iter(map, key, key_hash);
+ if (cso_hash_iter_is_null(iter)) {
+ return NULL;
+ }
+ else {
+ return hash_table_item(iter);
+ }
+}
+
+
+/**
+ * Insert a new key + data pointer into the table.
+ * Note: we create a copy of the key, but not the data!
+ * If the key is already present in the table, replace the existing
+ * entry (calling the delete callback on the previous entry).
+ * If the maximum capacity of the map is reached an old entry
+ * will be deleted (the delete callback will be called).
+ */
+boolean
+util_keymap_insert(struct keymap *map, const void *key,
+ const void *data, void *user)
+{
+ unsigned key_hash;
+ struct keymap_item *item;
+ struct cso_hash_iter iter;
+
+ assert(map);
+
+ key_hash = hash(key, map->key_size);
+
+ item = hash_table_find_item(map, key, key_hash);
+ if (item) {
+ /* call delete callback for old entry/item */
+ map->delete_func(map, item->key, item->value, user);
+ item->value = (void *) data;
+ return TRUE;
+ }
+
+ item = MALLOC_STRUCT(keymap_item);
+ if (!item)
+ return FALSE;
+
+ item->key = mem_dup(key, map->key_size);
+ item->value = (void *) data;
+
+ iter = cso_hash_insert(map->cso, key_hash, item);
+ if (cso_hash_iter_is_null(iter)) {
+ FREE(item);
+ return FALSE;
+ }
+
+ map->num_entries++;
+
+ return TRUE;
+}
+
+
+/**
+ * Look up a key in the map and return the associated data pointer.
+ */
+const void *
+util_keymap_lookup(const struct keymap *map, const void *key)
+{
+ unsigned key_hash;
+ struct keymap_item *item;
+
+ assert(map);
+
+ key_hash = hash(key, map->key_size);
+
+ item = hash_table_find_item(map, key, key_hash);
+ if (!item)
+ return NULL;
+
+ return item->value;
+}
+
+
+/**
+ * Remove an entry from the map.
+ * The delete callback will be called if the given key/entry is found.
+ * \param user passed to the delete callback as the last param.
+ */
+void
+util_keymap_remove(struct keymap *map, const void *key, void *user)
+{
+ unsigned key_hash;
+ struct cso_hash_iter iter;
+ struct keymap_item *item;
+
+ assert(map);
+
+ key_hash = hash(key, map->key_size);
+
+ iter = hash_table_find_iter(map, key, key_hash);
+ if (cso_hash_iter_is_null(iter))
+ return;
+
+ item = hash_table_item(iter);
+ assert(item);
+ map->delete_func(map, item->key, item->value, user);
+ FREE(item->key);
+ FREE(item);
+
+ map->num_entries--;
+
+ cso_hash_erase(map->cso, iter);
+}
+
+
+/**
+ * Remove all entries from the map, calling the delete callback for each.
+ * \param user passed to the delete callback as the last param.
+ */
+void
+util_keymap_remove_all(struct keymap *map, void *user)
+{
+ struct cso_hash_iter iter;
+ struct keymap_item *item;
+
+ assert(map);
+
+ iter = cso_hash_first_node(map->cso);
+ while (!cso_hash_iter_is_null(iter)) {
+ item = (struct keymap_item *)
+ cso_hash_take(map->cso, cso_hash_iter_key(iter));
+ map->delete_func(map, item->key, item->value, user);
+ FREE(item->key);
+ FREE(item);
+ iter = cso_hash_first_node(map->cso);
+ }
+}
+
+
+extern void
+util_keymap_info(const struct keymap *map)
+{
+ debug_printf("Keymap %p: %u of max %u entries\n",
+ (void *) map, map->num_entries, map->max_entries);
+}
diff --git a/src/gallium/auxiliary/util/u_keymap.h b/src/gallium/auxiliary/util/u_keymap.h
new file mode 100644
index 00000000000..8d60a76fc3c
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_keymap.h
@@ -0,0 +1,68 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef U_KEYMAP_H
+#define U_KEYMAP_H
+
+#include "pipe/p_compiler.h"
+
+
+/** opaque keymap type */
+struct keymap;
+
+
+/** Delete/callback function type */
+typedef void (*keymap_delete_func)(const struct keymap *map,
+ const void *key, void *data,
+ void *user);
+
+
+extern struct keymap *
+util_new_keymap(unsigned keySize, unsigned maxEntries,
+ keymap_delete_func deleteFunc);
+
+extern void
+util_delete_keymap(struct keymap *map, void *user);
+
+extern boolean
+util_keymap_insert(struct keymap *map, const void *key,
+ const void *data, void *user);
+
+extern const void *
+util_keymap_lookup(const struct keymap *map, const void *key);
+
+extern void
+util_keymap_remove(struct keymap *map, const void *key, void *user);
+
+extern void
+util_keymap_remove_all(struct keymap *map, void *user);
+
+extern void
+util_keymap_info(const struct keymap *map);
+
+
+#endif /* U_KEYMAP_H */
diff --git a/src/gallium/auxiliary/util/u_math.c b/src/gallium/auxiliary/util/u_math.c
index 0729114d6a1..5b3cab4642a 100644
--- a/src/gallium/auxiliary/util/u_math.c
+++ b/src/gallium/auxiliary/util/u_math.c
@@ -30,7 +30,7 @@
#include "util/u_math.h"
-
+/** 2^x, for x in [-1.0, 1.0[ */
float pow2_table[POW2_TABLE_SIZE];
@@ -38,9 +38,21 @@ static void
init_pow2_table(void)
{
int i;
- for (i = 0; i < POW2_TABLE_SIZE; i++) {
- pow2_table[i] = (float) pow(2.0, i / POW2_TABLE_SCALE);
- }
+ for (i = 0; i < POW2_TABLE_SIZE; i++)
+ pow2_table[i] = (float) pow(2.0, (i - POW2_TABLE_OFFSET) / POW2_TABLE_SCALE);
+}
+
+
+/** log2(x), for x in [1.0, 2.0[ */
+float log2_table[LOG2_TABLE_SIZE];
+
+
+static void
+init_log2_table(void)
+{
+ unsigned i;
+ for (i = 0; i < LOG2_TABLE_SIZE; i++)
+ log2_table[i] = (float) log2(1.0 + i * (1.0 / LOG2_TABLE_SIZE));
}
@@ -53,6 +65,7 @@ util_init_math(void)
static boolean initialized = FALSE;
if (!initialized) {
init_pow2_table();
+ init_log2_table();
initialized = TRUE;
}
}
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 196aeb28fa4..be7303e5503 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -174,8 +174,10 @@ static INLINE float logf( float f )
-#define POW2_TABLE_SIZE 256
-#define POW2_TABLE_SCALE ((float) (POW2_TABLE_SIZE-1))
+#define POW2_TABLE_SIZE_LOG2 9
+#define POW2_TABLE_SIZE (1 << POW2_TABLE_SIZE_LOG2)
+#define POW2_TABLE_OFFSET (POW2_TABLE_SIZE/2)
+#define POW2_TABLE_SCALE ((float)(POW2_TABLE_SIZE/2))
extern float pow2_table[POW2_TABLE_SIZE];
@@ -186,98 +188,78 @@ util_init_math(void);
union fi {
float f;
- int i;
- unsigned ui;
+ int32_t i;
+ uint32_t ui;
};
/**
- * Fast approximation to exp(x).
- * Compute with base 2 exponents: exp(x) = exp2(log2(e) * x)
- * Note: log2(e) is a constant, k = 1.44269
- * So, exp(x) = exp2(k * x);
+ * Fast version of 2^x
* Identity: exp2(a + b) = exp2(a) * exp2(b)
- * Let ipart = int(k*x)
- * Let fpart = k*x - ipart;
- * So, exp2(k*x) = exp2(ipart) * exp2(fpart)
+ * Let ipart = int(x)
+ * Let fpart = x - ipart;
+ * So, exp2(x) = exp2(ipart) * exp2(fpart)
* Compute exp2(ipart) with i << ipart
* Compute exp2(fpart) with lookup table.
*/
static INLINE float
-util_fast_exp(float x)
+util_fast_exp2(float x)
{
- if (x >= 0.0f) {
- float k = 1.44269f; /* = log2(e) */
- float kx = k * x;
- int ipart = (int) kx;
- float fpart = kx - (float) ipart;
- float y = (float) (1 << ipart)
- * pow2_table[(int) (fpart * POW2_TABLE_SCALE)];
- return y;
- }
- else {
- /* exp(-x) = 1.0 / exp(x) */
- float k = -1.44269f;
- float kx = k * x;
- int ipart = (int) kx;
- float fpart = kx - (float) ipart;
- float y = (float) (1 << ipart)
- * pow2_table[(int) (fpart * POW2_TABLE_SCALE)];
- return 1.0f / y;
- }
+ int32_t ipart;
+ float fpart, mpart;
+ union fi epart;
+
+ if(x > 129.00000f)
+ return 3.402823466e+38f;
+
+ if(x < -126.99999f)
+ return 0.0f;
+
+ ipart = (int32_t) x;
+ fpart = x - (float) ipart;
+
+ /* same as
+ * epart.f = (float) (1 << ipart)
+ * but faster and without integer overflow for ipart > 31 */
+ epart.i = (ipart + 127 ) << 23;
+
+ mpart = pow2_table[POW2_TABLE_OFFSET + (int)(fpart * POW2_TABLE_SCALE)];
+
+ return epart.f * mpart;
}
/**
- * Fast version of 2^x
- * XXX the above function could be implemented in terms of this one.
+ * Fast approximation to exp(x).
*/
static INLINE float
-util_fast_exp2(float x)
+util_fast_exp(float x)
{
- if (x >= 0.0f) {
- int ipart = (int) x;
- float fpart = x - (float) ipart;
- float y = (float) (1 << ipart)
- * pow2_table[(int) (fpart * POW2_TABLE_SCALE)];
- return y;
- }
- else {
- /* exp(-x) = 1.0 / exp(x) */
- int ipart = (int) -x;
- float fpart = -x - (float) ipart;
- float y = (float) (1 << ipart)
- * pow2_table[(int) (fpart * POW2_TABLE_SCALE)];
- return 1.0f / y;
- }
+ const float k = 1.44269f; /* = log2(e) */
+ return util_fast_exp2(k * x);
}
-/**
- * Based on code from http://www.flipcode.com/totd/
- */
+#define LOG2_TABLE_SIZE_LOG2 8
+#define LOG2_TABLE_SIZE (1 << LOG2_TABLE_SIZE_LOG2)
+extern float log2_table[LOG2_TABLE_SIZE];
+
+
static INLINE float
-util_fast_log2(float val)
+util_fast_log2(float x)
{
union fi num;
- int log_2;
- num.f = val;
- log_2 = ((num.i >> 23) & 255) - 128;
- num.i &= ~(255 << 23);
- num.i += 127 << 23;
- num.f = ((-1.0f/3) * num.f + 2) * num.f - 2.0f/3;
- return num.f + log_2;
+ float epart, mpart;
+ num.f = x;
+ epart = (float)(((num.i & 0x7f800000) >> 23) - 127);
+ mpart = log2_table[(num.i & 0x007fffff) >> (23 - LOG2_TABLE_SIZE_LOG2)];
+ return epart + mpart;
}
static INLINE float
util_fast_pow(float x, float y)
{
- /* XXX these tests may need adjustment */
- if (y >= 3.0f && (-0.02f <= x && x <= 0.02f))
- return 0.0f;
- if (y >= 50.0f && (-0.9f <= x && x <= 0.9f))
- return 0.0f;
return util_fast_exp2(util_fast_log2(x) * y);
}
diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h
new file mode 100644
index 00000000000..e2a8491e62c
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -0,0 +1,77 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * SSE intrinsics portability header.
+ *
+ * Although the SSE intrinsics are support by all modern x86 and x86-64
+ * compilers, there are some intrisincs missing in some implementations
+ * (especially older MSVC versions). This header abstracts that away.
+ */
+
+#ifndef U_SSE_H_
+#define U_SSE_H_
+
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_SSE)
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+
+/* MSVC before VC8 does not support the _mm_castxxx_yyy */
+#if defined(_MSC_VER) && _MSC_VER < 1500
+
+union __declspec(align(16)) m128_types {
+ __m128 m128;
+ __m128i m128i;
+ __m128d m128d;
+};
+
+static __inline __m128
+_mm_castsi128_ps(__m128i a)
+{
+ union m128_types u;
+ u.m128i = a;
+ return u.m128;
+}
+
+static __inline __m128i
+_mm_castps_si128(__m128 a)
+{
+ union m128_types u;
+ u.m128 = a;
+ return u.m128i;
+}
+
+#endif /* defined(_MSC_VER) && _MSC_VER < 1500 */
+
+#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
+
+#endif /* U_SSE_H_ */
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index cb0631baf52..5dc756023fb 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -94,6 +94,7 @@
#define CELL_CMD_STATE_BIND_VS 18
#define CELL_CMD_STATE_FRAGMENT_PROGRAM 19
#define CELL_CMD_STATE_ATTRIB_FETCH 20
+#define CELL_CMD_STATE_FS_CONSTANTS 21
#define CELL_CMD_VS_EXECUTE 22
#define CELL_CMD_FLUSH_BUFFER_RANGE 23
@@ -104,11 +105,11 @@
#define CELL_BUFFER_STATUS_FREE 10
#define CELL_BUFFER_STATUS_USED 20
-
-#define CELL_DEBUG_CHECKER (1 << 0)
-#define CELL_DEBUG_SYNC (1 << 1)
-
-
+#define CELL_DEBUG_CHECKER (1 << 0)
+#define CELL_DEBUG_ASM (1 << 1)
+#define CELL_DEBUG_SYNC (1 << 2)
+#define CELL_DEBUG_FRAGMENT_OPS (1 << 3)
+#define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4)
/** Max instructions for doing per-fragment operations */
#define SPU_MAX_FRAGMENT_OPS_INSTS 64
@@ -127,10 +128,10 @@ struct cell_command_fragment_ops
/** Max instructions for fragment programs */
-#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 128
+#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 512
/**
- * Command to send a fragment progra to SPUs.
+ * Command to send a fragment program to SPUs.
*/
struct cell_command_fragment_program
{
@@ -227,6 +228,7 @@ struct cell_command_render
float xmin, ymin, xmax, ymax; /* XXX another dummy field */
uint min_index;
boolean inline_verts;
+ uint front_winding; /* the rasterizer needs to be able to determine facing to apply front/back-facing stencil */
};
@@ -267,6 +269,20 @@ struct cell_command
} ALIGN16_ATTRIB;
+#define MAX_SPU_FUNCTIONS 12
+/**
+ * Used to tell the PPU about the address of particular functions in the
+ * SPU's address space.
+ */
+struct cell_spu_function_info
+{
+ uint num;
+ char names[MAX_SPU_FUNCTIONS][16];
+ uint addrs[MAX_SPU_FUNCTIONS];
+ char pad[12]; /**< Pad struct to multiple of 16 bytes (256 currently) */
+};
+
+
/** This is the object passed to spe_create_thread() */
struct cell_init_info
{
@@ -278,6 +294,8 @@ struct cell_init_info
/** Buffers for command batches, vertex/index data */
ubyte *buffers[CELL_NUM_BUFFERS];
uint *buffer_status; /**< points at cell_context->buffer_status */
+
+ struct cell_spu_function_info *spu_functions;
} ALIGN16_ATTRIB;
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 71f1a3049d1..30ce6f97624 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -62,6 +62,8 @@ cell_destroy_context( struct pipe_context *pipe )
{
struct cell_context *cell = cell_context(pipe);
+ util_delete_keymap(cell->fragment_ops_cache, NULL);
+
cell_spu_exit(cell);
align_free(cell);
@@ -85,13 +87,14 @@ cell_draw_create(struct cell_context *cell)
}
-#ifdef DEBUG
static const struct debug_named_value cell_debug_flags[] = {
{"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */
+ {"asm", CELL_DEBUG_ASM}, /**< dump SPU asm code */
{"sync", CELL_DEBUG_SYNC}, /**< SPUs do synchronous DMA */
+ {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/
+ {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/
{NULL, 0}
};
-#endif
struct pipe_context *
@@ -130,6 +133,10 @@ cell_create_context(struct pipe_screen *screen,
cell->draw = cell_draw_create(cell);
+ /* Create cache of fragment ops generated code */
+ cell->fragment_ops_cache =
+ util_new_keymap(sizeof(struct cell_fragment_ops_key), ~0, NULL);
+
cell_init_vbuf(cell);
draw_set_rasterize_stage(cell->draw, cell->vbuf);
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 14914b9c6f8..80a9b3d7e13 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -38,6 +38,7 @@
#include "cell/common.h"
#include "rtasm/rtasm_ppc_spe.h"
#include "tgsi/tgsi_scan.h"
+#include "util/u_keymap.h"
struct cell_vbuf_render;
@@ -67,31 +68,15 @@ struct cell_fragment_shader_state
/**
- * Cell blend state atom, subclass of pipe_blend_state.
+ * Key for mapping per-fragment state to cached SPU machine code.
+ * keymap(cell_fragment_ops_key) => cell_command_fragment_ops
*/
-struct cell_blend_state
+struct cell_fragment_ops_key
{
- struct pipe_blend_state base;
-
- /**
- * Generated code to perform alpha blending
- */
- struct spe_function code;
-};
-
-
-/**
- * Cell depth/stencil/alpha state atom, subclass of
- * pipe_depth_stencil_alpha_state.
- */
-struct cell_depth_stencil_alpha_state
-{
- struct pipe_depth_stencil_alpha_state base;
-
- /**
- * Generated code to perform alpha, stencil, and depth testing on the SPE
- */
- struct spe_function code;
+ struct pipe_blend_state blend;
+ struct pipe_depth_stencil_alpha_state dsa;
+ enum pipe_format color_format;
+ enum pipe_format zs_format;
};
@@ -104,10 +89,10 @@ struct cell_context
struct cell_winsys *winsys;
- const struct cell_blend_state *blend;
+ const struct pipe_blend_state *blend;
const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
uint num_samplers;
- const struct cell_depth_stencil_alpha_state *depth_stencil;
+ const struct pipe_depth_stencil_alpha_state *depth_stencil;
const struct pipe_rasterizer_state *rasterizer;
const struct cell_vertex_shader_state *vs;
const struct cell_fragment_shader_state *fs;
@@ -136,6 +121,9 @@ struct cell_context
uint dirty;
+ /** Cache of code generated for per-fragment ops */
+ struct keymap *fragment_ops_cache;
+
/** The primitive drawing context */
struct draw_context *draw;
struct draw_stage *render_stage;
@@ -149,6 +137,7 @@ struct cell_context
/** Mapped constant buffers */
void *mapped_constants[PIPE_SHADER_TYPES];
+ struct cell_spu_function_info spu_functions ALIGN16_ATTRIB;
uint num_spus;
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 6ffe94eb14a..c8125a8a052 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -37,7 +37,7 @@
* \author Brian Paul
*/
-
+#include <math.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_shader_tokens.h"
@@ -51,25 +51,40 @@
#include "cell_gen_fp.h"
-/** Set to 1 to enable debug/disassembly printfs */
-#define DISASSEM 01
+#define MAX_TEMPS 16
+#define MAX_IMMED 8
+#define CHAN_X 0
+#define CHAN_Y 1
+#define CHAN_Z 2
+#define CHAN_W 3
/**
* Context needed during code generation.
*/
struct codegen
{
+ struct cell_context *cell;
int inputs_reg; /**< 1st function parameter */
int outputs_reg; /**< 2nd function parameter */
int constants_reg; /**< 3rd function parameter */
- int temp_regs[8][4]; /**< maps TGSI temps to SPE registers */
+ int temp_regs[MAX_TEMPS][4]; /**< maps TGSI temps to SPE registers */
+ int imm_regs[MAX_IMMED][4]; /**< maps TGSI immediates to SPE registers */
+
+ int num_imm; /**< number of immediates */
int one_reg; /**< register containing {1.0, 1.0, 1.0, 1.0} */
/** Per-instruction temps / intermediate temps */
int num_itemps;
- int itemps[3];
+ int itemps[10];
+
+ /** Current IF/ELSE/ENDIF nesting level */
+ int if_nesting;
+ /** Index of execution mask register */
+ int exec_mask_reg;
+
+ int frame_size; /**< Stack frame size, in words */
struct spe_function *f;
boolean error;
@@ -112,19 +127,47 @@ get_const_one_reg(struct codegen *gen)
{
if (gen->one_reg <= 0) {
gen->one_reg = spe_allocate_available_register(gen->f);
- }
- /* one = {1.0, 1.0, 1.0, 1.0} */
- spe_load_float(gen->f, gen->one_reg, 1.0f);
-#if DISASSEM
- printf("il\tr%d, 1.0f\n", gen->one_reg);
-#endif
+ spe_indent(gen->f, 4);
+ spe_comment(gen->f, -4, "INIT CONSTANT 1.0:");
+
+ /* one = {1.0, 1.0, 1.0, 1.0} */
+ spe_load_float(gen->f, gen->one_reg, 1.0f);
+
+ spe_indent(gen->f, -4);
+ }
return gen->one_reg;
}
/**
+ * Return index of the pixel execution mask.
+ * The register is allocated an initialized upon the first call.
+ *
+ * The pixel execution mask controls which pixels in a quad are
+ * modified, according to surrounding conditionals, loops, etc.
+ */
+static int
+get_exec_mask_reg(struct codegen *gen)
+{
+ if (gen->exec_mask_reg <= 0) {
+ gen->exec_mask_reg = spe_allocate_available_register(gen->f);
+
+ spe_indent(gen->f, 4);
+ spe_comment(gen->f, -4, "INIT EXEC MASK = ~0:");
+
+ /* exec_mask = {~0, ~0, ~0, ~0} */
+ spe_load_int(gen->f, gen->exec_mask_reg, ~0);
+
+ spe_indent(gen->f, -4);
+ }
+
+ return gen->exec_mask_reg;
+}
+
+
+/**
* Return the index of the SPU temporary containing the named TGSI
* source register. If the TGSI register is a TGSI_FILE_TEMPORARY we
* just return the corresponding SPE register. If the TGIS register
@@ -136,35 +179,98 @@ get_src_reg(struct codegen *gen,
int channel,
const struct tgsi_full_src_register *src)
{
- int reg;
+ int reg = -1;
+ int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel);
+ boolean reg_is_itemp = FALSE;
+ uint sign_op;
+
+ assert(swizzle >= TGSI_SWIZZLE_X);
+ assert(swizzle <= TGSI_EXTSWIZZLE_ONE);
- /* XXX need to examine src swizzle info here.
- * That will involve changing the channel var...
+ if (swizzle == TGSI_EXTSWIZZLE_ONE) {
+ /* Load const one float and early out */
+ reg = get_const_one_reg(gen);
+ }
+ else if (swizzle == TGSI_EXTSWIZZLE_ZERO) {
+ /* Load const zero float and early out */
+ reg = get_itemp(gen);
+ spe_xor(gen->f, reg, reg, reg);
+ }
+ else {
+ assert(swizzle < 4);
+
+ switch (src->SrcRegister.File) {
+ case TGSI_FILE_TEMPORARY:
+ reg = gen->temp_regs[src->SrcRegister.Index][swizzle];
+ break;
+ case TGSI_FILE_INPUT:
+ {
+ /* offset is measured in quadwords, not bytes */
+ int offset = src->SrcRegister.Index * 4 + swizzle;
+ reg = get_itemp(gen);
+ reg_is_itemp = TRUE;
+ /* Load: reg = memory[(machine_reg) + offset] */
+ spe_lqd(gen->f, reg, gen->inputs_reg, offset * 16);
+ }
+ break;
+ case TGSI_FILE_IMMEDIATE:
+ reg = gen->imm_regs[src->SrcRegister.Index][swizzle];
+ break;
+ case TGSI_FILE_CONSTANT:
+ {
+ /* offset is measured in quadwords, not bytes */
+ int offset = src->SrcRegister.Index * 4 + swizzle;
+ reg = get_itemp(gen);
+ reg_is_itemp = TRUE;
+ /* Load: reg = memory[(machine_reg) + offset] */
+ spe_lqd(gen->f, reg, gen->constants_reg, offset * 16);
+ }
+ break;
+ case TGSI_FILE_SAMPLER:
+ {
+ reg = 3; /* XXX total hack */
+ }
+ break;
+ default:
+ assert(0);
+ }
+ }
+
+ /*
+ * Handle absolute value, negate or set-negative of src register.
*/
+ sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel);
+ if (sign_op != TGSI_UTIL_SIGN_KEEP) {
+ /*
+ * All sign ops are done by manipulating bit 31, the IEEE float sign bit.
+ */
+ const int bit31mask_reg = get_itemp(gen);
+ int result_reg;
+
+ if (reg_is_itemp) {
+ /* re-use 'reg' for the result */
+ result_reg = reg;
+ }
+ else {
+ /* alloc a new reg for the result */
+ result_reg = get_itemp(gen);
+ }
+ /* mask with bit 31 set, the rest cleared */
+ spe_load_int(gen->f, bit31mask_reg, (1 << 31));
- switch (src->SrcRegister.File) {
- case TGSI_FILE_TEMPORARY:
- reg = gen->temp_regs[src->SrcRegister.Index][channel];
- break;
- case TGSI_FILE_INPUT:
- {
- /* offset is measured in quadwords, not bytes */
- int offset = src->SrcRegister.Index * 4 + channel;
- reg = get_itemp(gen);
- /* Load: reg = memory[(machine_reg) + offset] */
- spe_lqd(gen->f, reg, gen->inputs_reg, offset);
-#if DISASSEM
- printf("lqd\tr%d, r%d + %d\n", reg, gen->inputs_reg, offset);
-#endif
+ if (sign_op == TGSI_UTIL_SIGN_CLEAR) {
+ spe_andc(gen->f, result_reg, reg, bit31mask_reg);
}
- break;
- case TGSI_FILE_IMMEDIATE:
- /* xxx fall-through for now / fix */
- case TGSI_FILE_CONSTANT:
- /* xxx fall-through for now / fix */
- default:
- assert(0);
+ else if (sign_op == TGSI_UTIL_SIGN_SET) {
+ spe_and(gen->f, result_reg, reg, bit31mask_reg);
+ }
+ else {
+ assert(sign_op == TGSI_UTIL_SIGN_TOGGLE);
+ spe_xor(gen->f, result_reg, reg, bit31mask_reg);
+ }
+
+ reg = result_reg;
}
return reg;
@@ -183,11 +289,14 @@ get_dst_reg(struct codegen *gen,
int channel,
const struct tgsi_full_dst_register *dest)
{
- int reg;
+ int reg = -1;
switch (dest->DstRegister.File) {
case TGSI_FILE_TEMPORARY:
- reg = gen->temp_regs[dest->DstRegister.Index][channel];
+ if (gen->if_nesting > 0)
+ reg = get_itemp(gen);
+ else
+ reg = gen->temp_regs[dest->DstRegister.Index][channel];
break;
case TGSI_FILE_OUTPUT:
reg = get_itemp(gen);
@@ -213,17 +322,41 @@ store_dest_reg(struct codegen *gen,
{
switch (dest->DstRegister.File) {
case TGSI_FILE_TEMPORARY:
- /* no-op */
+ if (gen->if_nesting > 0) {
+ int d_reg = gen->temp_regs[dest->DstRegister.Index][channel];
+ int exec_reg = get_exec_mask_reg(gen);
+ /* Mix d with new value according to exec mask:
+ * d[i] = mask_reg[i] ? value_reg : d_reg
+ */
+ spe_selb(gen->f, d_reg, d_reg, value_reg, exec_reg);
+ }
+ else {
+ /* we're not inside a condition or loop: do nothing special */
+
+ }
break;
case TGSI_FILE_OUTPUT:
{
/* offset is measured in quadwords, not bytes */
int offset = dest->DstRegister.Index * 4 + channel;
- /* Store: memory[(machine_reg) + offset] = reg */
- spe_stqd(gen->f, value_reg, gen->outputs_reg, offset);
-#if DISASSEM
- printf("stqd\tr%d, r%d + %d\n", value_reg, gen->outputs_reg, offset);
-#endif
+ if (gen->if_nesting > 0) {
+ int exec_reg = get_exec_mask_reg(gen);
+ int curval_reg = get_itemp(gen);
+ /* First read the current value from memory:
+ * Load: curval = memory[(machine_reg) + offset]
+ */
+ spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset * 16);
+ /* Mix curval with newvalue according to exec mask:
+ * d[i] = mask_reg[i] ? value_reg : d_reg
+ */
+ spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg);
+ /* Store: memory[(machine_reg) + offset] = curval */
+ spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset * 16);
+ }
+ else {
+ /* Store: memory[(machine_reg) + offset] = reg */
+ spe_stqd(gen->f, value_reg, gen->outputs_reg, offset * 16);
+ }
}
break;
default:
@@ -232,27 +365,63 @@ store_dest_reg(struct codegen *gen,
}
+
+static void
+emit_prologue(struct codegen *gen)
+{
+ gen->frame_size = 256+128; /* XXX temporary */
+
+ spe_comment(gen->f, -4, "Function prologue:");
+
+ /* save $lr on stack # stqd $lr,16($sp) */
+ spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
+
+ /* save stack pointer # stqd $sp,-frameSize($sp) */
+ spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+
+ /* adjust stack pointer # ai $sp,$sp,-frameSize */
+ spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+}
+
+
+static void
+emit_epilogue(struct codegen *gen)
+{
+ spe_comment(gen->f, -4, "Function epilogue:");
+
+ /* restore stack pointer # ai $sp,$sp,frameSize */
+ spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, gen->frame_size);
+
+ /* restore $lr # lqd $lr,16($sp) */
+ spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
+
+ /* return from function call */
+ spe_bi(gen->f, SPE_REG_RA, 0, 0);
+}
+
+
static boolean
emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- int ch;
+ int ch, src_reg[4], dst_reg[4];
+ spe_comment(gen->f, -4, "MOV:");
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ src_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ dst_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ }
+ }
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int dst_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
/* XXX we don't always need to actually emit a mov instruction here */
- spe_move(gen->f, dst_reg, src_reg);
-#if DISASSEM
- printf("mov\tr%d, r%d\n", dst_reg, src_reg);
-#endif
- store_dest_reg(gen, dst_reg, ch, &inst->FullDstRegisters[0]);
+ spe_move(gen->f, dst_reg[ch], src_reg[ch]);
+ store_dest_reg(gen, dst_reg[ch], ch, &inst->FullDstRegisters[0]);
free_itemps(gen);
}
}
return true;
}
-
/**
* Emit addition instructions. Recall that a single TGSI_OPCODE_ADD
* becomes (up to) four SPU "fa" instructions because we're doing SOA
@@ -261,49 +430,200 @@ emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
static boolean
emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- int ch;
- /* Loop over Red/Green/Blue/Alpha channels */
+ int ch, s1_reg[4], s2_reg[4], d_reg[4];
+
+ spe_comment(gen->f, -4, "ADD:");
+ /* Loop over Red/Green/Blue/Alpha channels, fetch src operands */
for (ch = 0; ch < 4; ch++) {
/* If the dest R, G, B or A writemask is enabled... */
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- /* get indexes of the two src, one dest SPE registers */
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ }
+ }
+ /* Loop over Red/Green/Blue/Alpha channels, do the add, store results */
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ /* Emit actual SPE instruction: d = s1 + s2 */
+ spe_fa(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+ /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+ /* Free any intermediate temps we allocated */
+ free_itemps(gen);
+ }
+ }
+ return true;
+}
+
+/**
+ * Emit subtract. See emit_ADD for comments.
+ */
+static boolean
+emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch, s1_reg[4], s2_reg[4], d_reg[4];
+ spe_comment(gen->f, -4, "SUB:");
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ }
+ }
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ /* d = s1 - s2 */
+ spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+ return true;
+}
+
+/**
+ * Emit multiply add. See emit_ADD for comments.
+ */
+static boolean
+emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4];
+ spe_comment(gen->f, -4, "MAD:");
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ }
+ }
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ /* d = s1 * s2 + s3 */
+ spe_fma(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch], s3_reg[ch]);
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+ return true;
+}
+
+
+/**
+ * Emit linear interpolate. See emit_ADD for comments.
+ */
+static boolean
+emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+ spe_comment(gen->f, -4, "LERP:");
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ /* d = s3 + s1(s2 - s3) */
+ spe_fs(gen->f, d_reg, s2_reg, s3_reg);
+ spe_fma(gen->f, d_reg, d_reg, s1_reg, s3_reg);
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+ return true;
+}
- /* Emit actual SPE instruction: d = s1 + s2 */
- spe_fa(gen->f, d_reg, s1_reg, s2_reg);
-#if DISASSEM
- printf("fa\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
-#endif
+/**
+ * Emit multiply. See emit_ADD for comments.
+ */
+static boolean
+emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch, s1_reg[4], s2_reg[4], d_reg[4];
+ spe_comment(gen->f, -4, "MUL:");
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ }
+ }
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ /* d = s1 * s2 */
+ spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+ return true;
+}
- /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
+/**
+ * Emit reciprocal. See emit_ADD for comments.
+ */
+static boolean
+emit_RCP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+ spe_comment(gen->f, -4, "RCP:");
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ /* d = 1/s1 */
+ spe_frest(gen->f, d_reg, s1_reg);
+ spe_fi(gen->f, d_reg, s1_reg, d_reg);
store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- /* Free any intermediate temps we allocated */
free_itemps(gen);
}
}
return true;
}
+/**
+ * Emit reciprocal sqrt. See emit_ADD for comments.
+ */
+static boolean
+emit_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+ spe_comment(gen->f, -4, "RSQ:");
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ /* d = 1/s1 */
+ spe_frsqest(gen->f, d_reg, s1_reg);
+ spe_fi(gen->f, d_reg, s1_reg, d_reg);
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+ return true;
+}
/**
- * Emit multiply. See emit_ADD for comments.
+ * Emit absolute value. See emit_ADD for comments.
*/
static boolean
-emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
+emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
int ch;
+ spe_comment(gen->f, -4, "ABS:");
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- /* d = s1 * s2 */
- spe_fm(gen->f, d_reg, s1_reg, s2_reg);
-#if DISASSEM
- printf("fm\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
-#endif
+ const int bit31mask_reg = get_itemp(gen);
+
+ /* mask with bit 31 set, the rest cleared */
+ spe_load_int(gen->f, bit31mask_reg, (1 << 31));
+
+ /* d = sign bit cleared in s1 */
+ spe_andc(gen->f, d_reg, s1_reg, bit31mask_reg);
+
store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
free_itemps(gen);
}
@@ -311,6 +631,190 @@ emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
return true;
}
+/**
+ * Emit 3 component dot product. See emit_ADD for comments.
+ */
+static boolean
+emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+ int s1x_reg, s1y_reg, s1z_reg;
+ int s2x_reg, s2y_reg, s2z_reg;
+ int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen);
+
+ spe_comment(gen->f, -4, "DP3:");
+
+ s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+ s2x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+ s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+ s2y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+ s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+ s2z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+
+ /* t0 = x0 * x1 */
+ spe_fm(gen->f, t0_reg, s1x_reg, s2x_reg);
+
+ /* t1 = y0 * y1 */
+ spe_fm(gen->f, t1_reg, s1y_reg, s2y_reg);
+
+ /* t0 = z0 * z1 + t0 */
+ spe_fma(gen->f, t0_reg, s1z_reg, s2z_reg, t0_reg);
+
+ /* t0 = t0 + t1 */
+ spe_fa(gen->f, t0_reg, t0_reg, t1_reg);
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ spe_move(gen->f, d_reg, t0_reg);
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ }
+ }
+
+ free_itemps(gen);
+ return true;
+}
+
+/**
+ * Emit 4 component dot product. See emit_ADD for comments.
+ */
+static boolean
+emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+ spe_comment(gen->f, -4, "DP4:");
+
+ int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+ int tmp_reg = get_itemp(gen);
+
+ /* t = x0 * x1 */
+ spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+
+ s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+ s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+ /* t = y0 * y1 + t */
+ spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+ s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+ s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+ /* t = z0 * z1 + t */
+ spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+ s1_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]);
+ s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
+ /* t = w0 * w1 + t */
+ spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ spe_move(gen->f, d_reg, tmp_reg);
+ store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
+ }
+ }
+
+ free_itemps(gen);
+ return true;
+}
+
+/**
+ * Emit homogeneous dot product. See emit_ADD for comments.
+ */
+static boolean
+emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+ spe_comment(gen->f, -4, "DPH:");
+
+ int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+ int tmp_reg = get_itemp(gen);
+
+ /* t = x0 * x1 */
+ spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+
+ s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+ s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+ /* t = y0 * y1 + t */
+ spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+ s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+ s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+ /* t = z0 * z1 + t */
+ spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+ s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
+ /* t = w1 + t */
+ spe_fa(gen->f, tmp_reg, s2_reg, tmp_reg);
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ spe_move(gen->f, d_reg, tmp_reg);
+ store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
+ }
+ }
+
+ free_itemps(gen);
+ return true;
+}
+
+/**
+ * Emit cross product. See emit_ADD for comments.
+ */
+static boolean
+emit_XPD(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ spe_comment(gen->f, -4, "XPD:");
+
+ int s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+ int tmp_reg = get_itemp(gen);
+
+ /* t = z0 * y1 */
+ spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+
+ s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+ s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+ /* t = y0 * z1 - t */
+ spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_X)) {
+ store_dest_reg(gen, tmp_reg, CHAN_X, &inst->FullDstRegisters[0]);
+ }
+
+ s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+ s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+ /* t = x0 * z1 */
+ spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+
+ s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+ s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+ /* t = z0 * x1 - t */
+ spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Y)) {
+ store_dest_reg(gen, tmp_reg, CHAN_Y, &inst->FullDstRegisters[0]);
+ }
+
+ s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+ s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+ /* t = y0 * x1 */
+ spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+
+ s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+ s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+ /* t = x0 * y1 - t */
+ spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Z)) {
+ store_dest_reg(gen, tmp_reg, CHAN_Z, &inst->FullDstRegisters[0]);
+ }
+
+ free_itemps(gen);
+ return true;
+}
/**
* Emit set-if-greater-than.
@@ -323,6 +827,8 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
int ch;
+ spe_comment(gen->f, -4, "SGT:");
+
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
@@ -331,17 +837,410 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
/* d = (s1 > s2) */
spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
-#if DISASSEM
- printf("fcgt\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
-#endif
/* convert d from 0x0/0xffffffff to 0.0/1.0 */
/* d = d & one_reg */
spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
-#if DISASSEM
- printf("and\tr%d, r%d, r%d\n", d_reg, d_reg, get_const_one_reg(gen));
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Emit set-if_less-then. See emit_SGT for comments.
+ */
+static boolean
+emit_SLT(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, "SLT:");
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+ /* d = (s1 < s2) */
+ spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
+
+ /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+ /* d = d & one_reg */
+ spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Emit set-if_greater-then-or-equal. See emit_SGT for comments.
+ */
+static boolean
+emit_SGE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, "SGE:");
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+ /* d = (s1 >= s2) */
+ spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
+
+ /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+ /* d = ~d & one_reg */
+ spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg);
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Emit set-if_less-then-or-equal. See emit_SGT for comments.
+ */
+static boolean
+emit_SLE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, "SLE:");
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+ /* d = (s1 <= s2) */
+ spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
+
+ /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+ /* d = ~d & one_reg */
+ spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg);
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Emit set-if_equal. See emit_SGT for comments.
+ */
+static boolean
+emit_SEQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, "SEQ:");
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+ /* d = (s1 == s2) */
+ spe_fceq(gen->f, d_reg, s1_reg, s2_reg);
+
+ /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+ /* d = d & one_reg */
+ spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Emit set-if_not_equal. See emit_SGT for comments.
+ */
+static boolean
+emit_SNE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, "SNE:");
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+ /* d = (s1 != s2) */
+ spe_fceq(gen->f, d_reg, s1_reg, s2_reg);
+ spe_nor(gen->f, d_reg, d_reg, d_reg);
+
+ /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+ /* d = d & one_reg */
+ spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Emit compare. See emit_SGT for comments.
+ */
+static boolean
+emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, "CMP:");
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ int zero_reg = get_itemp(gen);
+
+ spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
+
+ /* d = (s1 < 0) ? s2 : s3 */
+ spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
+ spe_selb(gen->f, d_reg, s3_reg, s2_reg, d_reg);
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Emit trunc.
+ * Convert float to signed int
+ * Convert signed int to float
+ */
+static boolean
+emit_TRUNC(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, "TRUNC:");
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+ /* Convert float to int */
+ spe_cflts(gen->f, d_reg, s1_reg, 0);
+
+ /* Convert int to float */
+ spe_csflt(gen->f, d_reg, d_reg, 0);
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Emit floor.
+ * If negative int subtract one
+ * Convert float to signed int
+ * Convert signed int to float
+ */
+static boolean
+emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, "FLR:");
+
+ int zero_reg = get_itemp(gen);
+ spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ int tmp_reg = get_itemp(gen);
+
+ /* If negative, subtract 1.0 */
+ spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
+ spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), d_reg);
+ spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
+
+ /* Convert float to int */
+ spe_cflts(gen->f, d_reg, d_reg, 0);
+
+ /* Convert int to float */
+ spe_csflt(gen->f, d_reg, d_reg, 0);
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Emit frac.
+ * Input - FLR(Input)
+ */
+static boolean
+emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, "FLR:");
+
+ int zero_reg = get_itemp(gen);
+ spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ int tmp_reg = get_itemp(gen);
+
+ /* If negative, subtract 1.0 */
+ spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
+ spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), d_reg);
+ spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
+
+ /* Convert float to int */
+ spe_cflts(gen->f, d_reg, d_reg, 0);
+
+ /* Convert int to float */
+ spe_csflt(gen->f, d_reg, d_reg, 0);
+
+ /* d = s1 - FLR(s1) */
+ spe_fs(gen->f, d_reg, s1_reg, d_reg);
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+
+#if 0
+static void
+print_functions(struct cell_context *cell)
+{
+ struct cell_spu_function_info *funcs = &cell->spu_functions;
+ uint i;
+ for (i = 0; i < funcs->num; i++) {
+ printf("SPU func %u: %s at %u\n",
+ i, funcs->names[i], funcs->addrs[i]);
+ }
+}
#endif
+
+static uint
+lookup_function(struct cell_context *cell, const char *funcname)
+{
+ const struct cell_spu_function_info *funcs = &cell->spu_functions;
+ uint i, addr = 0;
+ for (i = 0; i < funcs->num; i++) {
+ if (strcmp(funcs->names[i], funcname) == 0) {
+ addr = funcs->addrs[i];
+ }
+ }
+ assert(addr && "spu function not found");
+ return addr / 4; /* discard 2 least significant bits */
+}
+
+
+/**
+ * Emit code to call a SPU function.
+ * Used to implement instructions like SIN/COS/POW/TEX/etc.
+ */
+static boolean
+emit_function_call(struct codegen *gen,
+ const struct tgsi_full_instruction *inst,
+ char *funcname, uint num_args)
+{
+ const uint addr = lookup_function(gen->cell, funcname);
+ char comment[100];
+ int ch;
+
+ assert(num_args <= 3);
+
+ snprintf(comment, sizeof(comment), "CALL %s:", funcname);
+ spe_comment(gen->f, -4, comment);
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s_regs[3], d_reg;
+ ubyte usedRegs[SPE_NUM_REGS];
+ uint a, i, numUsed;
+
+ for (a = 0; a < num_args; a++) {
+ s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
+ }
+ d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+ numUsed = spe_get_registers_used(gen->f, usedRegs);
+ assert(numUsed < gen->frame_size / 16 - 32);
+
+ /* save registers to stack */
+ for (i = 0; i < numUsed; i++) {
+ uint reg = usedRegs[i];
+ int offset = 2 + i;
+ spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+ }
+
+ /* setup function arguments */
+ for (a = 0; a < num_args; a++) {
+ spe_move(gen->f, 3 + a, s_regs[a]);
+ }
+
+ /* branch to function, save return addr */
+ spe_brasl(gen->f, SPE_REG_RA, addr);
+
+ /* save function's return value */
+ spe_move(gen->f, d_reg, 3);
+
+ /* restore registers from stack */
+ for (i = 0; i < numUsed; i++) {
+ uint reg = usedRegs[i];
+ if (reg != d_reg) {
+ int offset = 2 + i;
+ spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+ }
+ }
+
store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
free_itemps(gen);
}
@@ -351,6 +1250,222 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
}
+static boolean
+emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ const uint addr = lookup_function(gen->cell, "spu_txp");
+ int ch;
+ int coord_regs[4], d_regs[4];
+
+ spe_comment(gen->f, -4, "CALL txp:");
+
+ /* get src/dst reg info */
+ for (ch = 0; ch < 4; ch++) {
+ coord_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ d_regs[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ }
+
+ {
+ ubyte usedRegs[SPE_NUM_REGS];
+ uint i, numUsed;
+
+ numUsed = spe_get_registers_used(gen->f, usedRegs);
+ assert(numUsed < gen->frame_size / 16 - 32);
+
+ /* save registers to stack */
+ for (i = 0; i < numUsed; i++) {
+ uint reg = usedRegs[i];
+ int offset = 2 + i;
+ spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+ }
+
+ /* setup function arguments */
+ for (i = 0; i < 4; i++) {
+ spe_move(gen->f, 3 + i, coord_regs[i]);
+ }
+
+ /* branch to function, save return addr */
+ spe_brasl(gen->f, SPE_REG_RA, addr);
+
+ /* save function's return values (four pixel's colors) */
+ for (i = 0; i < 4; i++) {
+ spe_move(gen->f, d_regs[i], 3 + i);
+ }
+
+ /* restore registers from stack */
+ for (i = 0; i < numUsed; i++) {
+ uint reg = usedRegs[i];
+ if (reg != d_regs[0] &&
+ reg != d_regs[1] &&
+ reg != d_regs[2] &&
+ reg != d_regs[3]) {
+ int offset = 2 + i;
+ spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+ }
+ }
+ }
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ store_dest_reg(gen, d_regs[ch], ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return TRUE;
+}
+
+
+/**
+ * Emit max. See emit_SGT for comments.
+ */
+static boolean
+emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, "MAX:");
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ int tmp_reg = get_itemp(gen);
+
+ /* d = (s1 > s2) ? s1 : s2 */
+ spe_fcgt(gen->f, tmp_reg, s1_reg, s2_reg);
+ spe_selb(gen->f, d_reg, s2_reg, s1_reg, tmp_reg);
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Emit max. See emit_SGT for comments.
+ */
+static boolean
+emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, "MIN:");
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ int tmp_reg = get_itemp(gen);
+
+ /* d = (s2 > s1) ? s1 : s2 */
+ spe_fcgt(gen->f, tmp_reg, s2_reg, s1_reg);
+ spe_selb(gen->f, d_reg, s2_reg, s1_reg, tmp_reg);
+
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+static boolean
+emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ const int channel = 0;
+ const int exec_reg = get_exec_mask_reg(gen);
+
+ spe_comment(gen->f, -4, "IF:");
+
+ /* update execution mask with the predicate register */
+ int tmp_reg = get_itemp(gen);
+ int s1_reg = get_src_reg(gen, channel, &inst->FullSrcRegisters[0]);
+
+ /* tmp = (s1_reg == 0) */
+ spe_ceqi(gen->f, tmp_reg, s1_reg, 0);
+ /* tmp = !tmp */
+ spe_complement(gen->f, tmp_reg, tmp_reg);
+ /* exec_mask = exec_mask & tmp */
+ spe_and(gen->f, exec_reg, exec_reg, tmp_reg);
+
+ gen->if_nesting++;
+
+ free_itemps(gen);
+
+ return true;
+}
+
+
+static boolean
+emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ const int exec_reg = get_exec_mask_reg(gen);
+
+ spe_comment(gen->f, -4, "ELSE:");
+
+ /* exec_mask = !exec_mask */
+ spe_complement(gen->f, exec_reg, exec_reg);
+
+ return true;
+}
+
+
+static boolean
+emit_ENDIF(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ const int exec_reg = get_exec_mask_reg(gen);
+
+ spe_comment(gen->f, -4, "ENDIF:");
+
+ /* XXX todo: pop execution mask */
+
+ spe_load_int(gen->f, exec_reg, ~0x0);
+
+ gen->if_nesting--;
+ return true;
+}
+
+
+static boolean
+emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst,
+ boolean ddx)
+{
+ int ch;
+
+ spe_comment(gen->f, -4, ddx ? "DDX:" : "DDY:");
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int s_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+ int t1_reg = get_itemp(gen);
+ int t2_reg = get_itemp(gen);
+
+ spe_splat_word(gen->f, t1_reg, s_reg, 0); /* upper-left pixel */
+ if (ddx) {
+ spe_splat_word(gen->f, t2_reg, s_reg, 1); /* upper-right pixel */
+ }
+ else {
+ spe_splat_word(gen->f, t2_reg, s_reg, 2); /* lower-left pixel */
+ }
+ spe_fs(gen->f, d_reg, t2_reg, t1_reg);
+
+ free_itemps(gen);
+ }
+ }
+
+ return true;
+}
+
+
+
+
/**
* Emit END instruction.
* We just return from the shader function at this point.
@@ -361,11 +1476,8 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
static boolean
emit_END(struct codegen *gen)
{
- /* return from function call */
- spe_bi(gen->f, SPE_REG_RA, 0, 0);
-#if DISASSEM
- printf("bi\trRA\n");
-#endif
+ spe_comment(gen->f, -4, "END:");
+ emit_epilogue(gen);
return true;
}
@@ -379,19 +1491,93 @@ emit_instruction(struct codegen *gen,
{
switch (inst->Instruction.Opcode) {
case TGSI_OPCODE_MOV:
+ case TGSI_OPCODE_SWZ:
return emit_MOV(gen, inst);
case TGSI_OPCODE_MUL:
return emit_MUL(gen, inst);
case TGSI_OPCODE_ADD:
return emit_ADD(gen, inst);
+ case TGSI_OPCODE_SUB:
+ return emit_SUB(gen, inst);
+ case TGSI_OPCODE_MAD:
+ return emit_MAD(gen, inst);
+ case TGSI_OPCODE_LERP:
+ return emit_LERP(gen, inst);
+ case TGSI_OPCODE_DP3:
+ return emit_DP3(gen, inst);
+ case TGSI_OPCODE_DP4:
+ return emit_DP4(gen, inst);
+ case TGSI_OPCODE_DPH:
+ return emit_DPH(gen, inst);
+ case TGSI_OPCODE_XPD:
+ return emit_XPD(gen, inst);
+ case TGSI_OPCODE_RCP:
+ return emit_RCP(gen, inst);
+ case TGSI_OPCODE_RSQ:
+ return emit_RSQ(gen, inst);
+ case TGSI_OPCODE_ABS:
+ return emit_ABS(gen, inst);
case TGSI_OPCODE_SGT:
return emit_SGT(gen, inst);
+ case TGSI_OPCODE_SLT:
+ return emit_SLT(gen, inst);
+ case TGSI_OPCODE_SGE:
+ return emit_SGE(gen, inst);
+ case TGSI_OPCODE_SLE:
+ return emit_SLE(gen, inst);
+ case TGSI_OPCODE_SEQ:
+ return emit_SEQ(gen, inst);
+ case TGSI_OPCODE_SNE:
+ return emit_SNE(gen, inst);
+ case TGSI_OPCODE_CMP:
+ return emit_CMP(gen, inst);
+ case TGSI_OPCODE_MAX:
+ return emit_MAX(gen, inst);
+ case TGSI_OPCODE_MIN:
+ return emit_MIN(gen, inst);
+ case TGSI_OPCODE_TRUNC:
+ return emit_TRUNC(gen, inst);
+ case TGSI_OPCODE_FLR:
+ return emit_FLR(gen, inst);
+ case TGSI_OPCODE_FRC:
+ return emit_FRC(gen, inst);
case TGSI_OPCODE_END:
return emit_END(gen);
+ case TGSI_OPCODE_COS:
+ return emit_function_call(gen, inst, "spu_cos", 1);
+ case TGSI_OPCODE_SIN:
+ return emit_function_call(gen, inst, "spu_sin", 1);
+ case TGSI_OPCODE_POW:
+ return emit_function_call(gen, inst, "spu_pow", 2);
+ case TGSI_OPCODE_EXPBASE2:
+ return emit_function_call(gen, inst, "spu_exp2", 1);
+ case TGSI_OPCODE_LOGBASE2:
+ return emit_function_call(gen, inst, "spu_log2", 1);
+ case TGSI_OPCODE_TEX:
+ /* fall-through for now */
+ case TGSI_OPCODE_TXD:
+ /* fall-through for now */
+ case TGSI_OPCODE_TXP:
+ return emit_TXP(gen, inst);
+
+ case TGSI_OPCODE_IF:
+ return emit_IF(gen, inst);
+ case TGSI_OPCODE_ELSE:
+ return emit_ELSE(gen, inst);
+ case TGSI_OPCODE_ENDIF:
+ return emit_ENDIF(gen, inst);
+
+ case TGSI_OPCODE_DDX:
+ return emit_DDX_DDY(gen, inst, true);
+ case TGSI_OPCODE_DDY:
+ return emit_DDX_DDY(gen, inst, false);
+
/* XXX lots more cases to do... */
default:
+ fprintf(stderr, "Cell: unimplemented TGSI instruction %d!\n",
+ inst->Instruction.Opcode);
return false;
}
@@ -401,48 +1587,93 @@ emit_instruction(struct codegen *gen,
/**
+ * Emit code for a TGSI immediate value (vector of four floats).
+ * This involves register allocation and initialization.
+ * XXX the initialization should be done by a "prepare" stage, not
+ * per quad execution!
+ */
+static boolean
+emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed)
+{
+ int ch;
+
+ assert(gen->num_imm < MAX_TEMPS);
+
+ spe_comment(gen->f, -4, "IMMEDIATE:");
+
+ for (ch = 0; ch < 4; ch++) {
+ float val = immed->u.ImmediateFloat32[ch].Float;
+
+ if (ch > 0 && val == immed->u.ImmediateFloat32[ch - 1].Float) {
+ /* re-use previous register */
+ gen->imm_regs[gen->num_imm][ch] = gen->imm_regs[gen->num_imm][ch - 1];
+ }
+ else {
+ int reg = spe_allocate_available_register(gen->f);
+
+ if (reg < 0)
+ return false;
+
+ /* update immediate map */
+ gen->imm_regs[gen->num_imm][ch] = reg;
+
+ /* emit initializer instruction */
+ spe_load_float(gen->f, reg, val);
+ }
+ }
+
+ gen->num_imm++;
+
+ return true;
+}
+
+
+
+/**
* Emit "code" for a TGSI declaration.
* We only care about TGSI TEMPORARY register declarations at this time.
* For each TGSI TEMPORARY we allocate four SPE registers.
*/
-static void
-emit_declaration(struct codegen *gen, const struct tgsi_full_declaration *decl)
+static boolean
+emit_declaration(struct cell_context *cell,
+ struct codegen *gen, const struct tgsi_full_declaration *decl)
{
int i, ch;
switch (decl->Declaration.File) {
case TGSI_FILE_TEMPORARY:
-#if DISASSEM
- printf("Declare temp reg %d .. %d\n",
- decl->DeclarationRange.First,
- decl->DeclarationRange.Last);
-#endif
for (i = decl->DeclarationRange.First;
i <= decl->DeclarationRange.Last;
i++) {
+ assert(i < MAX_TEMPS);
for (ch = 0; ch < 4; ch++) {
gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f);
+ if (gen->temp_regs[i][ch] < 0)
+ return false; /* out of regs */
}
/* XXX if we run out of SPE registers, we need to spill
* to SPU memory. someday...
*/
-#if DISASSEM
- printf(" SPE regs: %d %d %d %d\n",
- gen->temp_regs[i][0],
- gen->temp_regs[i][1],
- gen->temp_regs[i][2],
- gen->temp_regs[i][3]);
-#endif
+ {
+ char buf[100];
+ sprintf(buf, "TGSI temp[%d] maps to SPU regs [$%d $%d $%d $%d]", i,
+ gen->temp_regs[i][0], gen->temp_regs[i][1],
+ gen->temp_regs[i][2], gen->temp_regs[i][3]);
+ spe_comment(gen->f, -4, buf);
+ }
}
break;
default:
; /* ignore */
}
+
+ return true;
}
+
/**
* Translate TGSI shader code to SPE instructions. This is done when
* the state tracker gives us a new shader (via pipe->create_fs_state()).
@@ -460,6 +1691,7 @@ cell_gen_fragment_program(struct cell_context *cell,
struct codegen gen;
memset(&gen, 0, sizeof(gen));
+ gen.cell = cell;
gen.f = f;
/* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
@@ -472,50 +1704,50 @@ cell_gen_fragment_program(struct cell_context *cell,
spe_allocate_register(f, gen.outputs_reg);
spe_allocate_register(f, gen.constants_reg);
-#if DISASSEM
- printf("Begin %s\n", __FUNCTION__);
- tgsi_dump(tokens, 0);
-#endif
+ if (cell->debug_flags & CELL_DEBUG_ASM) {
+ spe_print_code(f, true);
+ spe_indent(f, 8);
+ printf("Begin %s\n", __FUNCTION__);
+ tgsi_dump(tokens, 0);
+ }
tgsi_parse_init(&parse, tokens);
+ emit_prologue(&gen);
+
while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) {
tgsi_parse_token(&parse);
switch (parse.FullToken.Token.Type) {
case TGSI_TOKEN_TYPE_IMMEDIATE:
-#if 0
- if (!note_immediate(&gen, &parse.FullToken.FullImmediate ))
- goto fail;
-#endif
+ if (!emit_immediate(&gen, &parse.FullToken.FullImmediate))
+ gen.error = true;
break;
case TGSI_TOKEN_TYPE_DECLARATION:
- emit_declaration(&gen, &parse.FullToken.FullDeclaration);
+ if (!emit_declaration(cell, &gen, &parse.FullToken.FullDeclaration))
+ gen.error = true;
break;
case TGSI_TOKEN_TYPE_INSTRUCTION:
- if (!emit_instruction(&gen, &parse.FullToken.FullInstruction )) {
+ if (!emit_instruction(&gen, &parse.FullToken.FullInstruction))
gen.error = true;
- }
break;
default:
assert(0);
-
}
}
-
if (gen.error) {
/* terminate the SPE code */
return emit_END(&gen);
}
-#if DISASSEM
- printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst);
- printf("End %s\n", __FUNCTION__);
-#endif
+ if (cell->debug_flags & CELL_DEBUG_ASM) {
+ printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst);
+ printf("End %s\n", __FUNCTION__);
+ }
tgsi_parse_free( &parse );
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 06219d4e980..de170d1036c 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -54,12 +54,17 @@
* \param ifragZ_reg register containing integer fragment Z values (in)
* \param ifbZ_reg register containing integer frame buffer Z values (in/out)
* \param zmask_reg register containing result of Z test/comparison (out)
+ *
+ * Returns true if the Z-buffer needs to be updated.
*/
-static void
-gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
- struct spe_function *f,
+static boolean
+gen_depth_test(struct spe_function *f,
+ const struct pipe_depth_stencil_alpha_state *dsa,
int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
{
+ /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
+ * quantities. This only makes a difference for 32-bit Z values though.
+ */
ASSERT(dsa->depth.enabled);
switch (dsa->depth.func) {
@@ -79,28 +84,28 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
case PIPE_FUNC_GREATER:
/* zmask = (ifragZ > ref) */
- spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+ spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
/* mask = (mask & zmask) */
spe_and(f, mask_reg, mask_reg, zmask_reg);
break;
case PIPE_FUNC_LESS:
/* zmask = (ref > ifragZ) */
- spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+ spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
/* mask = (mask & zmask) */
spe_and(f, mask_reg, mask_reg, zmask_reg);
break;
case PIPE_FUNC_LEQUAL:
/* zmask = (ifragZ > ref) */
- spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+ spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
/* mask = (mask & ~zmask) */
spe_andc(f, mask_reg, mask_reg, zmask_reg);
break;
case PIPE_FUNC_GEQUAL:
/* zmask = (ref > ifragZ) */
- spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+ spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
/* mask = (mask & ~zmask) */
spe_andc(f, mask_reg, mask_reg, zmask_reg);
break;
@@ -129,7 +134,10 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
* framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
*/
spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
+ return true;
}
+
+ return false;
}
@@ -229,7 +237,39 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
spe_release_register(f, amask_reg);
}
+/* This pair of functions is used inline to allocate and deallocate
+ * optional constant registers. Once a constant is discovered to be
+ * needed, we will likely need it again, so we don't want to deallocate
+ * it and have to allocate and load it again unnecessarily.
+ */
+static inline void
+setup_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int *r)
+{
+ if (*is_already_set) return;
+ *r = spe_allocate_available_register(f);
+}
+
+static inline void
+release_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+{
+ if (!*is_already_set) return;
+ spe_release_register(f, r);
+ *is_already_set = false;
+}
+
+static inline void
+setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+{
+ if (*is_already_set) return;
+ setup_optional_register(f, is_already_set, r);
+ spe_load_float(f, *r, value);
+}
+static inline void
+release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+{
+ release_optional_register(f, is_already_set, r);
+}
/**
* Generate SPE code to implement the given blend mode for a quad of pixels.
@@ -242,6 +282,7 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
*/
static void
gen_blend(const struct pipe_blend_state *blend,
+ const struct pipe_blend_color *blend_color,
struct spe_function *f,
enum pipe_format color_format,
int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg,
@@ -262,10 +303,17 @@ gen_blend(const struct pipe_blend_state *blend,
int fbB_reg = spe_allocate_available_register(f);
int fbA_reg = spe_allocate_available_register(f);
- int one_reg = spe_allocate_available_register(f);
int tmp_reg = spe_allocate_available_register(f);
- boolean one_reg_set = false; /* avoid setting one_reg more than once */
+ /* Optional constant registers we might or might not end up using;
+ * if we do use them, make sure we only allocate them once by
+ * keeping a flag on each one.
+ */
+ boolean one_reg_set = false;
+ unsigned int one_reg;
+ boolean constR_reg_set = false, constG_reg_set = false,
+ constB_reg_set = false, constA_reg_set = false;
+ unsigned int constR_reg, constG_reg, constB_reg, constA_reg;
ASSERT(blend->blend_enable);
@@ -346,127 +394,449 @@ gen_blend(const struct pipe_blend_state *blend,
spe_release_register(f, mask_reg);
}
-
/*
- * Compute Src RGB terms
+ * Compute Src RGB terms. We're actually looking for the value
+ * of (the appropriate RGB factors) * (the incoming source RGB color),
+ * because in some cases (like PIPE_BLENDFACTOR_ONE and
+ * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math.
*/
switch (blend->rgb_src_factor) {
case PIPE_BLENDFACTOR_ONE:
+ /* factors = (1,1,1), so term = (R,G,B) */
spe_move(f, term1R_reg, fragR_reg);
spe_move(f, term1G_reg, fragG_reg);
spe_move(f, term1B_reg, fragB_reg);
break;
case PIPE_BLENDFACTOR_ZERO:
- spe_zero(f, term1R_reg);
- spe_zero(f, term1G_reg);
- spe_zero(f, term1B_reg);
+ /* factors = (0,0,0), so term = (0,0,0) */
+ spe_load_float(f, term1R_reg, 0.0f);
+ spe_load_float(f, term1G_reg, 0.0f);
+ spe_load_float(f, term1B_reg, 0.0f);
break;
case PIPE_BLENDFACTOR_SRC_COLOR:
+ /* factors = (R,G,B), so term = (R*R, G*G, B*B) */
spe_fm(f, term1R_reg, fragR_reg, fragR_reg);
spe_fm(f, term1G_reg, fragG_reg, fragG_reg);
spe_fm(f, term1B_reg, fragB_reg, fragB_reg);
break;
case PIPE_BLENDFACTOR_SRC_ALPHA:
+ /* factors = (A,A,A), so term = (R*A, G*A, B*A) */
spe_fm(f, term1R_reg, fragR_reg, fragA_reg);
spe_fm(f, term1G_reg, fragG_reg, fragA_reg);
spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
break;
- /* XXX more cases */
+ case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+ /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B))
+ * or in other words term = (R-R*R, G-G*G, B-B*B)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg);
+ spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg);
+ spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg);
+ break;
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */
+ spe_fm(f, term1R_reg, fragR_reg, fbR_reg);
+ spe_fm(f, term1G_reg, fragG_reg, fbG_reg);
+ spe_fm(f, term1B_reg, fragB_reg, fbB_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_DST_COLOR:
+ /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb))
+ * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg);
+ spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg);
+ spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+ /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A))
+ * or term = (R-R*A,G-G*A,B-B*A)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg);
+ spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg);
+ spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg);
+ break;
+ case PIPE_BLENDFACTOR_DST_ALPHA:
+ /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */
+ spe_fm(f, term1R_reg, fragR_reg, fbA_reg);
+ spe_fm(f, term1G_reg, fragG_reg, fbA_reg);
+ spe_fm(f, term1B_reg, fragB_reg, fbA_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+ /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb))
+ * or term = (R-R*Afb,G-G*Afb,b-B*Afb)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg);
+ spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg);
+ spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg);
+ break;
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ /* We need the optional constant color registers */
+ setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
+ spe_fm(f, term1R_reg, fragR_reg, constR_reg);
+ spe_fm(f, term1G_reg, fragG_reg, constG_reg);
+ spe_fm(f, term1B_reg, fragB_reg, constB_reg);
+ break;
+ case PIPE_BLENDFACTOR_CONST_ALPHA:
+ /* we'll need the optional constant alpha register */
+ setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
+ spe_fm(f, term1R_reg, fragR_reg, constA_reg);
+ spe_fm(f, term1G_reg, fragG_reg, constA_reg);
+ spe_fm(f, term1B_reg, fragB_reg, constA_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+ /* We need the optional constant color registers */
+ setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc))
+ * or term = (R-R*Rc, G-G*Gc, B-B*Bc)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg);
+ spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg);
+ spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+ /* We need the optional constant color registers */
+ setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac))
+ * or term = (R-R*Ac,G-G*Ac,B-B*Ac)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg);
+ spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg);
+ spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg);
+ break;
+ case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+ /* We'll need the optional {1,1,1,1} register */
+ setup_const_register(f, &one_reg_set, &one_reg, 1.0f);
+ /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so
+ * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
+ * We could expand the term (as a*min(b,c) == min(a*b,a*c)
+ * as long as a is positive), but then we'd have to do three
+ * spe_float_min() functions instead of one, so this is simpler.
+ */
+ /* tmp = 1 - Afb */
+ spe_fs(f, tmp_reg, one_reg, fbA_reg);
+ /* tmp = min(A,tmp) */
+ spe_float_min(f, tmp_reg, fragA_reg, tmp_reg);
+ /* term = R*tmp */
+ spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+ spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+ spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+ break;
+
+ /* These are special D3D cases involving a second color output
+ * from the fragment shader. I'm not sure we can support them
+ * yet... XXX
+ */
+ case PIPE_BLENDFACTOR_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_SRC1_ALPHA:
+ case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
default:
ASSERT(0);
}
/*
- * Compute Src Alpha term
+ * Compute Src Alpha term. Like the above, we're looking for
+ * the full term A*factor, not just the factor itself, because
+ * in many cases we can avoid doing unnecessary multiplies.
*/
switch (blend->alpha_src_factor) {
+ case PIPE_BLENDFACTOR_ZERO:
+ /* factor = 0, so term = 0 */
+ spe_load_float(f, term1A_reg, 0.0f);
+ break;
+
+ case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */
case PIPE_BLENDFACTOR_ONE:
+ /* factor = 1, so term = A */
spe_move(f, term1A_reg, fragA_reg);
break;
+
case PIPE_BLENDFACTOR_SRC_COLOR:
+ /* factor = A, so term = A*A */
spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
break;
case PIPE_BLENDFACTOR_SRC_ALPHA:
spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
break;
- /* XXX more cases */
+
+ case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+ /* factor = 1-A, so term = A*(1-A) = A-A*A */
+ /* fnms(a,b,c,d) computes a = d - b*c */
+ spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ /* factor = Afb, so term = A*Afb */
+ spe_fm(f, term1A_reg, fragA_reg, fbA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_INV_DST_COLOR:
+ /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */
+ /* fnms(a,b,c,d) computes a = d - b*c */
+ spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ /* We need the optional constA_reg register */
+ setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ /* factor = Ac, so term = A*Ac */
+ spe_fm(f, term1A_reg, fragA_reg, constA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+ /* We need the optional constA_reg register */
+ setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */
+ /* fnms(a,b,c,d) computes a = d - b*c */
+ spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg);
+ break;
+
+ /* These are special D3D cases involving a second color output
+ * from the fragment shader. I'm not sure we can support them
+ * yet... XXX
+ */
+ case PIPE_BLENDFACTOR_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_SRC1_ALPHA:
+ case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
default:
ASSERT(0);
}
/*
- * Compute Dest RGB terms
+ * Compute Dest RGB term. Like the above, we're looking for
+ * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because
+ * in many cases we can avoid doing unnecessary multiplies.
*/
switch (blend->rgb_dst_factor) {
case PIPE_BLENDFACTOR_ONE:
+ /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */
spe_move(f, term2R_reg, fbR_reg);
spe_move(f, term2G_reg, fbG_reg);
spe_move(f, term2B_reg, fbB_reg);
break;
case PIPE_BLENDFACTOR_ZERO:
- spe_zero(f, term2R_reg);
- spe_zero(f, term2G_reg);
- spe_zero(f, term2B_reg);
+ /* factor s= (0,0,0), so term = (0,0,0) */
+ spe_load_float(f, term2R_reg, 0.0f);
+ spe_load_float(f, term2G_reg, 0.0f);
+ spe_load_float(f, term2B_reg, 0.0f);
break;
case PIPE_BLENDFACTOR_SRC_COLOR:
+ /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */
spe_fm(f, term2R_reg, fbR_reg, fragR_reg);
spe_fm(f, term2G_reg, fbG_reg, fragG_reg);
spe_fm(f, term2B_reg, fbB_reg, fragB_reg);
break;
+ case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+ /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B))
+ * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg);
+ spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg);
+ spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg);
+ break;
case PIPE_BLENDFACTOR_SRC_ALPHA:
+ /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */
spe_fm(f, term2R_reg, fbR_reg, fragA_reg);
spe_fm(f, term2G_reg, fbG_reg, fragA_reg);
spe_fm(f, term2B_reg, fbB_reg, fragA_reg);
break;
case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
- /* one = {1.0, 1.0, 1.0, 1.0} */
- if (!one_reg_set) {
- spe_load_float(f, one_reg, 1.0f);
- one_reg_set = true;
- }
- /* tmp = one - fragA */
- spe_fs(f, tmp_reg, one_reg, fragA_reg);
- /* term = fb * tmp */
- spe_fm(f, term2R_reg, fbR_reg, tmp_reg);
- spe_fm(f, term2G_reg, fbG_reg, tmp_reg);
- spe_fm(f, term2B_reg, fbB_reg, tmp_reg);
- break;
- /* XXX more cases */
+ /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */
+ /* fnms(a,b,c,d) computes a = d - b*c */
+ spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg);
+ spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg);
+ spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg);
+ break;
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */
+ spe_fm(f, term2R_reg, fbR_reg, fbR_reg);
+ spe_fm(f, term2G_reg, fbG_reg, fbG_reg);
+ spe_fm(f, term2B_reg, fbB_reg, fbB_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_DST_COLOR:
+ /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb))
+ * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg);
+ spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg);
+ spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_DST_ALPHA:
+ /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */
+ spe_fm(f, term2R_reg, fbR_reg, fbA_reg);
+ spe_fm(f, term2G_reg, fbG_reg, fbA_reg);
+ spe_fm(f, term2B_reg, fbB_reg, fbA_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+ /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb))
+ * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg);
+ spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg);
+ spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg);
+ break;
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ /* We need the optional constant color registers */
+ setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */
+ spe_fm(f, term2R_reg, fbR_reg, constR_reg);
+ spe_fm(f, term2G_reg, fbG_reg, constG_reg);
+ spe_fm(f, term2B_reg, fbB_reg, constB_reg);
+ break;
+ case PIPE_BLENDFACTOR_CONST_ALPHA:
+ /* we'll need the optional constant alpha register */
+ setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */
+ spe_fm(f, term2R_reg, fbR_reg, constA_reg);
+ spe_fm(f, term2G_reg, fbG_reg, constA_reg);
+ spe_fm(f, term2B_reg, fbB_reg, constA_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+ /* We need the optional constant color registers */
+ setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc))
+ * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg);
+ spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg);
+ spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg);
+ break;
+ case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+ /* We need the optional constant color registers */
+ setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac))
+ * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac)
+ * fnms(a,b,c,d) computes a = d - b*c
+ */
+ spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg);
+ spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg);
+ spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg);
+ break;
+ case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest RGB */
+ ASSERT(0);
+ break;
+
+ /* These are special D3D cases involving a second color output
+ * from the fragment shader. I'm not sure we can support them
+ * yet... XXX
+ */
+ case PIPE_BLENDFACTOR_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_SRC1_ALPHA:
+ case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
default:
ASSERT(0);
}
/*
- * Compute Dest Alpha term
+ * Compute Dest Alpha term. Like the above, we're looking for
+ * the full term Afb*factor, not just the factor itself, because
+ * in many cases we can avoid doing unnecessary multiplies.
*/
switch (blend->alpha_dst_factor) {
case PIPE_BLENDFACTOR_ONE:
+ /* factor = 1, so term = Afb */
spe_move(f, term2A_reg, fbA_reg);
break;
case PIPE_BLENDFACTOR_ZERO:
- spe_zero(f, term2A_reg);
+ /* factor = 0, so term = 0 */
+ spe_load_float(f, term2A_reg, 0.0f);
break;
- case PIPE_BLENDFACTOR_SRC_ALPHA:
+
+ case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_SRC_COLOR:
+ /* factor = A, so term = Afb*A */
spe_fm(f, term2A_reg, fbA_reg, fragA_reg);
break;
- case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
- /* one = {1.0, 1.0, 1.0, 1.0} */
- if (!one_reg_set) {
- spe_load_float(f, one_reg, 1.0f);
- one_reg_set = true;
- }
- /* tmp = one - fragA */
- spe_fs(f, tmp_reg, one_reg, fragA_reg);
- /* termA = fbA * tmp */
- spe_fm(f, term2A_reg, fbA_reg, tmp_reg);
+
+ case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+ /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */
+ /* fnms(a,b,c,d) computes a = d - b*c */
+ spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ /* factor = Afb, so term = Afb*Afb */
+ spe_fm(f, term2A_reg, fbA_reg, fbA_reg);
break;
- /* XXX more cases */
+
+ case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_INV_DST_COLOR:
+ /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */
+ /* fnms(a,b,c,d) computes a = d - b*c */
+ spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ /* We need the optional constA_reg register */
+ setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ /* factor = Ac, so term = Afb*Ac */
+ spe_fm(f, term2A_reg, fbA_reg, constA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
+ case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+ /* We need the optional constA_reg register */
+ setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */
+ /* fnms(a,b,c,d) computes a = d - b*c */
+ spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg);
+ break;
+
+ case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */
+ ASSERT(0);
+ break;
+
+ /* These are special D3D cases involving a second color output
+ * from the fragment shader. I'm not sure we can support them
+ * yet... XXX
+ */
+ case PIPE_BLENDFACTOR_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_SRC1_ALPHA:
+ case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
default:
ASSERT(0);
}
/*
- * Combine Src/Dest RGB terms
+ * Combine Src/Dest RGB terms as per the blend equation.
*/
switch (blend->rgb_func) {
case PIPE_BLEND_ADD:
@@ -479,7 +849,21 @@ gen_blend(const struct pipe_blend_state *blend,
spe_fs(f, fragG_reg, term1G_reg, term2G_reg);
spe_fs(f, fragB_reg, term1B_reg, term2B_reg);
break;
- /* XXX more cases */
+ case PIPE_BLEND_REVERSE_SUBTRACT:
+ spe_fs(f, fragR_reg, term2R_reg, term1R_reg);
+ spe_fs(f, fragG_reg, term2G_reg, term1G_reg);
+ spe_fs(f, fragB_reg, term2B_reg, term1B_reg);
+ break;
+ case PIPE_BLEND_MIN:
+ spe_float_min(f, fragR_reg, term1R_reg, term2R_reg);
+ spe_float_min(f, fragG_reg, term1G_reg, term2G_reg);
+ spe_float_min(f, fragB_reg, term1B_reg, term2B_reg);
+ break;
+ case PIPE_BLEND_MAX:
+ spe_float_max(f, fragR_reg, term1R_reg, term2R_reg);
+ spe_float_max(f, fragG_reg, term1G_reg, term2G_reg);
+ spe_float_max(f, fragB_reg, term1B_reg, term2B_reg);
+ break;
default:
ASSERT(0);
}
@@ -494,7 +878,15 @@ gen_blend(const struct pipe_blend_state *blend,
case PIPE_BLEND_SUBTRACT:
spe_fs(f, fragA_reg, term1A_reg, term2A_reg);
break;
- /* XXX more cases */
+ case PIPE_BLEND_REVERSE_SUBTRACT:
+ spe_fs(f, fragA_reg, term2A_reg, term1A_reg);
+ break;
+ case PIPE_BLEND_MIN:
+ spe_float_min(f, fragA_reg, term1A_reg, term2A_reg);
+ break;
+ case PIPE_BLEND_MAX:
+ spe_float_max(f, fragA_reg, term1A_reg, term2A_reg);
+ break;
default:
ASSERT(0);
}
@@ -514,8 +906,14 @@ gen_blend(const struct pipe_blend_state *blend,
spe_release_register(f, fbB_reg);
spe_release_register(f, fbA_reg);
- spe_release_register(f, one_reg);
spe_release_register(f, tmp_reg);
+
+ /* Free any optional registers that actually got used */
+ release_const_register(f, &one_reg_set, one_reg);
+ release_const_register(f, &constR_reg_set, constR_reg);
+ release_const_register(f, &constG_reg_set, constG_reg);
+ release_const_register(f, &constB_reg_set, constB_reg);
+ release_const_register(f, &constA_reg_set, constA_reg);
}
@@ -524,24 +922,74 @@ gen_logicop(const struct pipe_blend_state *blend,
struct spe_function *f,
int fragRGBA_reg, int fbRGBA_reg)
{
- /* XXX to-do */
- /* operate on 32-bit packed pixels, not float colors */
-}
-
-
-static void
-gen_colormask(uint colormask,
- struct spe_function *f,
- int fragRGBA_reg, int fbRGBA_reg)
-{
- /* XXX to-do */
- /* operate on 32-bit packed pixels, not float colors */
+ /* We've got four 32-bit RGBA packed pixels in each of
+ * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
+ * reds, greens, blues, and alphas.
+ * */
+ ASSERT(blend->logicop_enable);
+
+ switch(blend->logicop_func) {
+ case PIPE_LOGICOP_CLEAR: /* 0 */
+ spe_zero(f, fragRGBA_reg);
+ break;
+ case PIPE_LOGICOP_NOR: /* ~(s | d) */
+ spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */
+ /* andc R, A, B computes R = A & ~B */
+ spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
+ break;
+ case PIPE_LOGICOP_COPY_INVERTED: /* ~s */
+ spe_complement(f, fragRGBA_reg, fragRGBA_reg);
+ break;
+ case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */
+ /* andc R, A, B computes R = A & ~B */
+ spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_INVERT: /* ~d */
+ /* Note that (A nor A) == ~(A|A) == ~A */
+ spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_XOR: /* s ^ d */
+ spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_NAND: /* ~(s & d) */
+ spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_AND: /* s & d */
+ spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */
+ spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ spe_complement(f, fragRGBA_reg, fragRGBA_reg);
+ break;
+ case PIPE_LOGICOP_NOOP: /* d */
+ spe_move(f, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */
+ /* orc R, A, B computes R = A | ~B */
+ spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
+ break;
+ case PIPE_LOGICOP_COPY: /* s */
+ break;
+ case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */
+ /* orc R, A, B computes R = A | ~B */
+ spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_OR: /* s | d */
+ spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+ break;
+ case PIPE_LOGICOP_SET: /* 1 */
+ spe_load_int(f, fragRGBA_reg, 0xffffffff);
+ break;
+ default:
+ ASSERT(0);
+ }
}
-
/**
- * Generate code to pack a quad of float colors into a four 32-bit integers.
+ * Generate code to pack a quad of float colors into four 32-bit integers.
*
* \param f SPE function to append instruction onto.
* \param color_format the dest color packing format
@@ -557,13 +1005,16 @@ gen_pack_colors(struct spe_function *f,
int r_reg, int g_reg, int b_reg, int a_reg,
int rgba_reg)
{
+ int rg_reg = spe_allocate_available_register(f);
+ int ba_reg = spe_allocate_available_register(f);
+
/* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
spe_cfltu(f, r_reg, r_reg, 32);
spe_cfltu(f, g_reg, g_reg, 32);
spe_cfltu(f, b_reg, b_reg, 32);
spe_cfltu(f, a_reg, a_reg, 32);
- /* Shift the most significant bytes to least the significant positions.
+ /* Shift the most significant bytes to the least significant positions.
* I.e.: reg = reg >> 24
*/
spe_rotmi(f, r_reg, r_reg, -24);
@@ -595,12 +1046,719 @@ gen_pack_colors(struct spe_function *f,
* OR-ing all those together gives us four packed colors:
* RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
*/
- spe_or(f, rgba_reg, r_reg, g_reg);
- spe_or(f, rgba_reg, rgba_reg, b_reg);
- spe_or(f, rgba_reg, rgba_reg, a_reg);
+ spe_or(f, rg_reg, r_reg, g_reg);
+ spe_or(f, ba_reg, a_reg, b_reg);
+ spe_or(f, rgba_reg, rg_reg, ba_reg);
+
+ spe_release_register(f, rg_reg);
+ spe_release_register(f, ba_reg);
+}
+
+static void
+gen_colormask(struct spe_function *f,
+ uint colormask,
+ enum pipe_format color_format,
+ int fragRGBA_reg, int fbRGBA_reg)
+{
+ /* We've got four 32-bit RGBA packed pixels in each of
+ * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
+ * reds, greens, blues, and alphas. Further, the pixels
+ * are packed according to the given color format, not
+ * necessarily RGBA...
+ */
+ unsigned int r_mask;
+ unsigned int g_mask;
+ unsigned int b_mask;
+ unsigned int a_mask;
+
+ /* Calculate exactly where the bits for any particular color
+ * end up, so we can mask them correctly.
+ */
+ switch(color_format) {
+ case PIPE_FORMAT_A8R8G8B8_UNORM:
+ /* ARGB */
+ a_mask = 0xff000000;
+ r_mask = 0x00ff0000;
+ g_mask = 0x0000ff00;
+ b_mask = 0x000000ff;
+ break;
+ case PIPE_FORMAT_B8G8R8A8_UNORM:
+ /* BGRA */
+ b_mask = 0xff000000;
+ g_mask = 0x00ff0000;
+ r_mask = 0x0000ff00;
+ a_mask = 0x000000ff;
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ /* For each R, G, B, and A component we're supposed to mask out,
+ * clear its bits. Then our mask operation later will work
+ * as expected.
+ */
+ if (!(colormask & PIPE_MASK_R)) {
+ r_mask = 0;
+ }
+ if (!(colormask & PIPE_MASK_G)) {
+ g_mask = 0;
+ }
+ if (!(colormask & PIPE_MASK_B)) {
+ b_mask = 0;
+ }
+ if (!(colormask & PIPE_MASK_A)) {
+ a_mask = 0;
+ }
+
+ /* Get a temporary register to hold the mask that will be applied to the fragment */
+ int colormask_reg = spe_allocate_available_register(f);
+
+ /* The actual mask we're going to use is an OR of the remaining R, G, B, and A
+ * masks. Load the result value into our temporary register.
+ */
+ spe_load_uint(f, colormask_reg, r_mask | g_mask | b_mask | a_mask);
+
+ /* Use the mask register to select between the fragment color
+ * values and the frame buffer color values. Wherever the
+ * mask has a 0 bit, the current frame buffer color should override
+ * the fragment color. Wherever the mask has a 1 bit, the
+ * fragment color should persevere. The Select Bits (selb rt, rA, rB, rM)
+ * instruction will select bits from its first operand rA wherever the
+ * the mask bits rM are 0, and from its second operand rB wherever the
+ * mask bits rM are 1. That means that the frame buffer color is the
+ * first operand, and the fragment color the second.
+ */
+ spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg);
+
+ /* Release the temporary register and we're done */
+ spe_release_register(f, colormask_reg);
+}
+
+/* This function is annoyingly similar to gen_depth_test(), above, except
+ * that instead of comparing two varying values (i.e. fragment and buffer),
+ * we're comparing a varying value with a static value. As such, we have
+ * access to the Compare Immediate instructions where we don't in
+ * gen_depth_test(), which is what makes us very different.
+ *
+ * The return value in the stencil_pass_reg is a bitmask of valid
+ * fragments that also passed the stencil test. The bitmask of valid
+ * fragments that failed would be found in (mask_reg & ~stencil_pass_reg).
+ */
+static void
+gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
+ unsigned int mask_reg, unsigned int fbS_reg,
+ unsigned int stencil_pass_reg)
+{
+ /* Generate code that puts the set of passing fragments into the stencil_pass_reg
+ * register, taking into account whether each fragment was active to begin with.
+ */
+ switch (state->func) {
+ case PIPE_FUNC_EQUAL:
+ /* stencil_pass = mask & (s == reference) */
+ spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+ /* stencil_fail = mask & ~stencil_pass */
+ break;
+
+ case PIPE_FUNC_NOTEQUAL:
+ /* stencil_pass = mask & ~(s == reference) */
+ spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+ break;
+
+ case PIPE_FUNC_GREATER:
+ /* stencil_pass = mask & (s > reference) */
+ spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+ break;
+
+ case PIPE_FUNC_LESS: {
+ /* stencil_pass = mask & (reference > s) */
+ /* There's no convenient Compare Less Than Immediate instruction, so
+ * we'll have to do this one the harder way, by loading a register and
+ * comparing directly. Compare Logical Greater Than Word (clgt)
+ * treats its operands as unsigned - no sign extension.
+ */
+ unsigned int tmp_reg = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->ref_value);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+ spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ break;
+ }
+
+ case PIPE_FUNC_LEQUAL:
+ /* stencil_pass = mask & (s <= reference) = mask & ~(s > reference) */
+ spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+ break;
+
+ case PIPE_FUNC_GEQUAL: {
+ /* stencil_pass = mask & (s >= reference) = mask & ~(reference > s) */
+ /* As above, we have to do this by loading a register */
+ unsigned int tmp_reg = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->ref_value);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+ spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ break;
+ }
+
+ case PIPE_FUNC_NEVER:
+ /* stencil_pass = mask & 0 = 0 */
+ spe_load_uint(f, stencil_pass_reg, 0);
+ spe_move(f, stencil_pass_reg, mask_reg); /* zmask = mask */
+ break;
+
+ case PIPE_FUNC_ALWAYS:
+ /* stencil_pass = mask & 1 = mask */
+ spe_move(f, stencil_pass_reg, mask_reg);
+ break;
+ }
+
+ /* The fragments that passed the stencil test are now in stencil_pass_reg.
+ * The fragments that failed would be (mask_reg & ~stencil_pass_reg).
+ */
+}
+
+/* This function generates code that calculates a set of new stencil values
+ * given the earlier values and the operation to apply. It does not
+ * apply any tests. It is intended to be called up to 3 times
+ * (for the stencil fail operation, for the stencil pass-z fail operation,
+ * and for the stencil pass-z pass operation) to collect up to three
+ * possible sets of values, and for the caller to combine them based
+ * on the result of the tests.
+ *
+ * stencil_max_value should be (2^n - 1) where n is the number of bits
+ * in the stencil buffer - in other words, it should be usable as a mask.
+ */
+static void
+gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
+ unsigned int stencil_ref_value, unsigned int stencil_max_value,
+ unsigned int fbS_reg, unsigned int newS_reg)
+{
+ /* The code below assumes that newS_reg and fbS_reg are not the same
+ * register; if they can be, the calculations below will have to use
+ * an additional temporary register. For now, mark the assumption
+ * with an assertion that will fail if they are the same.
+ */
+ ASSERT(fbS_reg != newS_reg);
+
+ /* The code also assumes the the stencil_max_value is of the form
+ * 2^n-1 and can therefore be used as a mask for the valid bits in
+ * addition to a maximum. Make sure this is the case as well.
+ * The clever math below exploits the fact that incrementing a
+ * binary number serves to flip all the bits of a number starting at
+ * the LSB and continuing to (and including) the first zero bit
+ * found. That means that a number and its increment will always
+ * have at least one bit in common (the high order bit, if nothing
+ * else) *unless* the number is zero, *or* the number is of a form
+ * consisting of some number of 1s in the low-order bits followed
+ * by nothing but 0s in the high-order bits. The latter case
+ * implies it's of the form 2^n-1.
+ */
+ ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0);
+
+ switch(stencil_op) {
+ case PIPE_STENCIL_OP_KEEP:
+ /* newS = S */
+ spe_move(f, newS_reg, fbS_reg);
+ break;
+
+ case PIPE_STENCIL_OP_ZERO:
+ /* newS = 0 */
+ spe_zero(f, newS_reg);
+ break;
+
+ case PIPE_STENCIL_OP_REPLACE:
+ /* newS = stencil reference value */
+ spe_load_uint(f, newS_reg, stencil_ref_value);
+ break;
+
+ case PIPE_STENCIL_OP_INCR: {
+ /* newS = (s == max ? max : s + 1) */
+ unsigned int equals_reg = spe_allocate_available_register(f);
+
+ spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value);
+ /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
+ spe_ai(f, newS_reg, fbS_reg, 1);
+ /* Select from the current value or the new value based on the equality test */
+ spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+
+ spe_release_register(f, equals_reg);
+ break;
+ }
+ case PIPE_STENCIL_OP_DECR: {
+ /* newS = (s == 0 ? 0 : s - 1) */
+ unsigned int equals_reg = spe_allocate_available_register(f);
+
+ spe_compare_equal_uint(f, equals_reg, fbS_reg, 0);
+ /* Add Word Immediate with a (-1) value works */
+ spe_ai(f, newS_reg, fbS_reg, -1);
+ /* Select from the current value or the new value based on the equality test */
+ spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+
+ spe_release_register(f, equals_reg);
+ break;
+ }
+ case PIPE_STENCIL_OP_INCR_WRAP:
+ /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can
+ * do a normal add and mask off the correct bits
+ */
+ spe_ai(f, newS_reg, fbS_reg, 1);
+ spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+ break;
+
+ case PIPE_STENCIL_OP_DECR_WRAP:
+ /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */
+ spe_ai(f, newS_reg, fbS_reg, -1);
+ spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+ break;
+
+ case PIPE_STENCIL_OP_INVERT:
+ /* newS = ~s. We take advantage of the mask/max value to invert only
+ * the valid bits for the field so we don't have to do an extra "and".
+ */
+ spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value);
+ break;
+
+ default:
+ ASSERT(0);
+ }
}
+/* This function generates code to get all the necessary possible
+ * stencil values. For each of the output registers (fail_reg,
+ * zfail_reg, and zpass_reg), it either allocates a new register
+ * and calculates a new set of values based on the stencil operation,
+ * or it reuses a register allocation and calculation done for an
+ * earlier (matching) operation, or it reuses the fbS_reg register
+ * (if the stencil operation is KEEP, which doesn't change the
+ * stencil buffer).
+ *
+ * Since this function allocates a variable number of registers,
+ * to avoid incurring complex logic to free them, they should
+ * be allocated after a spe_allocate_register_set() call
+ * and released by the corresponding spe_release_register_set() call.
+ */
+static void
+gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_alpha_state *dsa,
+ unsigned int fbS_reg,
+ unsigned int *fail_reg, unsigned int *zfail_reg,
+ unsigned int *zpass_reg, unsigned int *back_fail_reg,
+ unsigned int *back_zfail_reg, unsigned int *back_zpass_reg)
+{
+ unsigned zfail_op, back_zfail_op;
+
+ /* Stenciling had better be enabled here */
+ ASSERT(dsa->stencil[0].enabled);
+
+ /* If the depth test is not enabled, it is treated as though it always
+ * passes. In particular, that means that the "zfail_op" (and the backfacing
+ * counterpart, if active) are not considered - a failing stencil test will
+ * trigger the "fail_op", and a passing stencil test will trigger the
+ * "zpass_op".
+ *
+ * By overriding the operations in this case to be PIPE_STENCIL_OP_KEEP,
+ * we keep them from being calculated.
+ */
+ if (dsa->depth.enabled) {
+ zfail_op = dsa->stencil[0].zfail_op;
+ back_zfail_op = dsa->stencil[1].zfail_op;
+ }
+ else {
+ zfail_op = PIPE_STENCIL_OP_KEEP;
+ back_zfail_op = PIPE_STENCIL_OP_KEEP;
+ }
+
+ /* One-sided or front-facing stencil */
+ if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP) {
+ *fail_reg = fbS_reg;
+ }
+ else {
+ *fail_reg = spe_allocate_available_register(f);
+ gen_stencil_values(f, dsa->stencil[0].fail_op, dsa->stencil[0].ref_value,
+ 0xff, fbS_reg, *fail_reg);
+ }
+
+ if (zfail_op == PIPE_STENCIL_OP_KEEP) {
+ *zfail_reg = fbS_reg;
+ }
+ else if (zfail_op == dsa->stencil[0].fail_op) {
+ *zfail_reg = *fail_reg;
+ }
+ else {
+ *zfail_reg = spe_allocate_available_register(f);
+ gen_stencil_values(f, dsa->stencil[0].zfail_op, dsa->stencil[0].ref_value,
+ 0xff, fbS_reg, *zfail_reg);
+ }
+
+ if (dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP) {
+ *zpass_reg = fbS_reg;
+ }
+ else if (dsa->stencil[0].zpass_op == dsa->stencil[0].fail_op) {
+ *zpass_reg = *fail_reg;
+ }
+ else if (dsa->stencil[0].zpass_op == zfail_op) {
+ *zpass_reg = *zfail_reg;
+ }
+ else {
+ *zpass_reg = spe_allocate_available_register(f);
+ gen_stencil_values(f, dsa->stencil[0].zpass_op, dsa->stencil[0].ref_value,
+ 0xff, fbS_reg, *zpass_reg);
+ }
+
+ /* If two-sided stencil is enabled, we have more work to do. */
+ if (!dsa->stencil[1].enabled) {
+ /* This just flags that the registers need not be deallocated later */
+ *back_fail_reg = fbS_reg;
+ *back_zfail_reg = fbS_reg;
+ *back_zpass_reg = fbS_reg;
+ }
+ else {
+ /* Same calculations as above, but for the back stencil */
+ if (dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP) {
+ *back_fail_reg = fbS_reg;
+ }
+ else if (dsa->stencil[1].fail_op == dsa->stencil[0].fail_op) {
+ *back_fail_reg = *fail_reg;
+ }
+ else if (dsa->stencil[1].fail_op == zfail_op) {
+ *back_fail_reg = *zfail_reg;
+ }
+ else if (dsa->stencil[1].fail_op == dsa->stencil[0].zpass_op) {
+ *back_fail_reg = *zpass_reg;
+ }
+ else {
+ *back_fail_reg = spe_allocate_available_register(f);
+ gen_stencil_values(f, dsa->stencil[1].fail_op, dsa->stencil[1].ref_value,
+ 0xff, fbS_reg, *back_fail_reg);
+ }
+
+ if (back_zfail_op == PIPE_STENCIL_OP_KEEP) {
+ *back_zfail_reg = fbS_reg;
+ }
+ else if (back_zfail_op == dsa->stencil[0].fail_op) {
+ *back_zfail_reg = *fail_reg;
+ }
+ else if (back_zfail_op == zfail_op) {
+ *back_zfail_reg = *zfail_reg;
+ }
+ else if (back_zfail_op == dsa->stencil[0].zpass_op) {
+ *back_zfail_reg = *zpass_reg;
+ }
+ else if (back_zfail_op == dsa->stencil[1].fail_op) {
+ *back_zfail_reg = *back_fail_reg;
+ }
+ else {
+ *back_zfail_reg = spe_allocate_available_register(f);
+ gen_stencil_values(f, dsa->stencil[1].zfail_op, dsa->stencil[1].ref_value,
+ 0xff, fbS_reg, *back_zfail_reg);
+ }
+
+ if (dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
+ *back_zpass_reg = fbS_reg;
+ }
+ else if (dsa->stencil[1].zpass_op == dsa->stencil[0].fail_op) {
+ *back_zpass_reg = *fail_reg;
+ }
+ else if (dsa->stencil[1].zpass_op == zfail_op) {
+ *back_zpass_reg = *zfail_reg;
+ }
+ else if (dsa->stencil[1].zpass_op == dsa->stencil[0].zpass_op) {
+ *back_zpass_reg = *zpass_reg;
+ }
+ else if (dsa->stencil[1].zpass_op == dsa->stencil[1].fail_op) {
+ *back_zpass_reg = *back_fail_reg;
+ }
+ else if (dsa->stencil[1].zpass_op == back_zfail_op) {
+ *back_zpass_reg = *back_zfail_reg;
+ }
+ else {
+ *back_zfail_reg = spe_allocate_available_register(f);
+ gen_stencil_values(f, dsa->stencil[1].zpass_op, dsa->stencil[1].ref_value,
+ 0xff, fbS_reg, *back_zpass_reg);
+ }
+ } /* End of calculations for back-facing stencil */
+}
+
+static boolean
+gen_stencil_depth_test(struct spe_function *f,
+ const struct pipe_depth_stencil_alpha_state *dsa,
+ const int const facing_reg,
+ const int mask_reg, const int fragZ_reg,
+ const int fbZ_reg, const int fbS_reg)
+{
+ /* True if we've generated code that could require writeback to the
+ * depth and/or stencil buffers
+ */
+ boolean modified_buffers = false;
+
+ boolean need_to_calculate_stencil_values;
+ boolean need_to_writemask_stencil_values;
+
+ /* Registers. We may or may not actually allocate these, depending
+ * on whether the state values indicate that we need them.
+ */
+ unsigned int stencil_pass_reg, stencil_fail_reg;
+ unsigned int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values;
+ unsigned int stencil_writemask_reg;
+ unsigned int zmask_reg;
+ unsigned int newS_reg;
+
+ /* Stenciling is quite complex: up to six different configurable stencil
+ * operations/calculations can be required (three each for front-facing
+ * and back-facing fragments). Many of those operations will likely
+ * be identical, so there's good reason to try to avoid calculating
+ * the same values more than once (which unfortunately makes the code less
+ * straightforward).
+ *
+ * To make register management easier, we start a new
+ * register set; we can release all the registers in the set at
+ * once, and avoid having to keep track of exactly which registers
+ * we allocate. We can still allocate and free registers as
+ * desired (if we know we no longer need a register), but we don't
+ * have to spend the complexity to track the more difficult variant
+ * register usage scenarios.
+ */
+ spe_allocate_register_set(f);
+
+ /* Calculate the writemask. If the writemask is trivial (either
+ * all 0s, meaning that we don't need to calculate any stencil values
+ * because they're not going to change the stencil anyway, or all 1s,
+ * meaning that we have to calculate the stencil values but do not
+ * need to mask them), we can avoid generating code. Don't forget
+ * that we need to consider backfacing stencil, if enabled.
+ */
+ if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
+ /* Trivial: don't need to calculate stencil values, and don't need to
+ * write them back to the framebuffer.
+ */
+ need_to_calculate_stencil_values = false;
+ need_to_writemask_stencil_values = false;
+ }
+ else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
+ /* Still trivial, but a little less so. We need to write the stencil
+ * values, but we don't need to mask them.
+ */
+ need_to_calculate_stencil_values = true;
+ need_to_writemask_stencil_values = false;
+ }
+ else {
+ /* The general case: calculate, mask, and write */
+ need_to_calculate_stencil_values = true;
+ need_to_writemask_stencil_values = true;
+
+ /* While we're here, generate code that calculates what the
+ * writemask should be. If backface stenciling is enabled,
+ * and the backface writemask is not the same as the frontface
+ * writemask, we'll have to generate code that merges the
+ * two masks into a single effective mask based on fragment facing.
+ */
+ stencil_writemask_reg = spe_allocate_available_register(f);
+ spe_load_uint(f, stencil_writemask_reg, dsa->stencil[0].write_mask);
+ if (dsa->stencil[1].enabled && dsa->stencil[0].write_mask != dsa->stencil[1].write_mask) {
+ unsigned int back_write_mask_reg = spe_allocate_available_register(f);
+ spe_load_uint(f, back_write_mask_reg, dsa->stencil[1].write_mask);
+ spe_selb(f, stencil_writemask_reg, stencil_writemask_reg, back_write_mask_reg, facing_reg);
+ spe_release_register(f, back_write_mask_reg);
+ }
+ }
+
+ /* At least one-sided stenciling must be on. Generate code that
+ * runs the stencil test on the basic/front-facing stencil, leaving
+ * the mask of passing stencil bits in stencil_pass_reg. This mask will
+ * be used both to mask the set of active pixels, and also to
+ * determine how the stencil buffer changes.
+ *
+ * This test will *not* change the value in mask_reg (because we don't
+ * yet know whether to apply the two-sided stencil or one-sided stencil).
+ */
+ stencil_pass_reg = spe_allocate_available_register(f);
+ gen_stencil_test(f, &dsa->stencil[0], mask_reg, fbS_reg, stencil_pass_reg);
+
+ /* If two-sided stenciling is on, generate code to run the stencil
+ * test on the backfacing stencil as well, and combine the two results
+ * into the one correct result based on facing.
+ */
+ if (dsa->stencil[1].enabled) {
+ unsigned int temp_reg = spe_allocate_available_register(f);
+ gen_stencil_test(f, &dsa->stencil[1], mask_reg, fbS_reg, temp_reg);
+ spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
+ spe_release_register(f, temp_reg);
+ }
+
+ /* Generate code that, given the mask of valid fragments and the
+ * mask of valid fragments that passed the stencil test, computes
+ * the mask of valid fragments that failed the stencil test. We
+ * have to do this before we run a depth test (because the
+ * depth test should not be performed on fragments that failed the
+ * stencil test, and because the depth test will update the
+ * mask of valid fragments based on the results of the depth test).
+ */
+ stencil_fail_reg = spe_allocate_available_register(f);
+ spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg);
+ /* Now remove the stenciled-out pixels from the valid fragment mask,
+ * so we can later use the valid fragment mask in the depth test.
+ */
+ spe_and(f, mask_reg, mask_reg, stencil_pass_reg);
+
+ /* We may not need to calculate stencil values, if the writemask is off */
+ if (need_to_calculate_stencil_values) {
+ unsigned int back_stencil_fail_values, back_stencil_pass_depth_fail_values, back_stencil_pass_depth_pass_values;
+ unsigned int front_stencil_fail_values, front_stencil_pass_depth_fail_values, front_stencil_pass_depth_pass_values;
+
+ /* Generate code that calculates exactly which stencil values we need,
+ * without calculating the same value twice (say, if two different
+ * stencil ops have the same value). This code will work for one-sided
+ * and two-sided stenciling (so that we take into account that operations
+ * may match between front and back stencils), and will also take into
+ * account whether the depth test is enabled (if the depth test is off,
+ * we don't need any of the zfail results, because the depth test always
+ * is considered to pass if it is disabled). Any register value that
+ * does not need to be calculated will come back with the same value
+ * that's in fbS_reg.
+ *
+ * This function will allocate a variant number of registers that
+ * will be released as part of the register set.
+ */
+ gen_get_stencil_values(f, dsa, fbS_reg,
+ &front_stencil_fail_values, &front_stencil_pass_depth_fail_values,
+ &front_stencil_pass_depth_pass_values, &back_stencil_fail_values,
+ &back_stencil_pass_depth_fail_values, &back_stencil_pass_depth_pass_values);
+
+ /* Tricky, tricky, tricky - the things we do to create optimal
+ * code...
+ *
+ * The various stencil values registers may overlap with each other
+ * and with fbS_reg arbitrarily (as any particular operation is
+ * only calculated once and stored in one register, no matter
+ * how many times it is used). So we can't change the values
+ * within those registers directly - if we change a value in a
+ * register that's being referenced by two different calculations,
+ * we've just unwittingly changed the second value as well...
+ *
+ * Avoid this by allocating new registers to hold the results
+ * (there may be 2, if the depth test is off, or 3, if it is on).
+ * These will be released as part of the register set.
+ */
+ if (!dsa->stencil[1].enabled) {
+ /* The easy case: if two-sided stenciling is *not* enabled, we
+ * just use the front-sided values.
+ */
+ stencil_fail_values = front_stencil_fail_values;
+ stencil_pass_depth_fail_values = front_stencil_pass_depth_fail_values;
+ stencil_pass_depth_pass_values = front_stencil_pass_depth_pass_values;
+ }
+ else { /* two-sided stencil enabled */
+ /* Allocate new registers for the needed merged values */
+ stencil_fail_values = spe_allocate_available_register(f);
+ spe_selb(f, stencil_fail_values, front_stencil_fail_values, back_stencil_fail_values, facing_reg);
+ if (dsa->depth.enabled) {
+ stencil_pass_depth_fail_values = spe_allocate_available_register(f);
+ spe_selb(f, stencil_pass_depth_fail_values, front_stencil_pass_depth_fail_values, back_stencil_pass_depth_fail_values, facing_reg);
+ }
+ else {
+ stencil_pass_depth_fail_values = fbS_reg;
+ }
+ stencil_pass_depth_pass_values = spe_allocate_available_register(f);
+ spe_selb(f, stencil_pass_depth_pass_values, front_stencil_pass_depth_pass_values, back_stencil_pass_depth_pass_values, facing_reg);
+ }
+ }
+
+ /* We now have all the stencil values we need. We also need
+ * the results of the depth test to figure out which
+ * stencil values will become the new stencil values. (Even if
+ * we aren't actually calculating stencil values, we need to apply
+ * the depth test if it's enabled.)
+ *
+ * The code generated by gen_depth_test() returns the results of the
+ * test in the given register, but also alters the mask_reg based
+ * on the results of the test.
+ */
+ if (dsa->depth.enabled) {
+ zmask_reg = spe_allocate_available_register(f);
+ modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+ }
+
+ if (need_to_calculate_stencil_values) {
+ /* If we need to writemask the stencil values before going into
+ * the stencil buffer, we'll have to use a new register to
+ * hold the new values. If not, we can just keep using the
+ * current register.
+ */
+ if (need_to_writemask_stencil_values) {
+ newS_reg = spe_allocate_available_register(f);
+ spe_move(f, newS_reg, fbS_reg);
+ modified_buffers = true;
+ }
+ else {
+ newS_reg = fbS_reg;
+ }
+
+ /* Merge in the selected stencil fail values */
+ if (stencil_fail_values != fbS_reg) {
+ spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
+ }
+
+ /* Same for the stencil pass/depth fail values. If this calculation
+ * is not needed (say, if depth test is off), then the
+ * stencil_pass_depth_fail_values register will be equal to fbS_reg
+ * and we'll skip the calculation.
+ */
+ if (stencil_pass_depth_fail_values != fbS_reg) {
+ /* We don't actually have a stencil pass/depth fail mask yet.
+ * Calculate it here from the stencil passing mask and the
+ * depth passing mask. Note that zmask_reg *must* have been
+ * set above if we're here.
+ */
+ unsigned int stencil_pass_depth_fail_mask = spe_allocate_available_register(f);
+ spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
+
+ spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, stencil_pass_depth_fail_mask);
+
+ spe_release_register(f, stencil_pass_depth_fail_mask);
+ }
+
+ /* Same for the stencil pass/depth pass mask */
+ if (stencil_pass_depth_pass_values != fbS_reg) {
+ unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
+ spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
+
+ spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
+ spe_release_register(f, stencil_pass_depth_pass_mask);
+ }
+
+ /* Almost done. If we need to writemask, do it now, leaving the
+ * results in the fbS_reg register passed in. If we don't need
+ * to writemask, then the results are *already* in the fbS_reg,
+ * so there's nothing more to do.
+ */
+
+ if (need_to_writemask_stencil_values) {
+ /* The Select Bytes command makes a fine writemask. Where
+ * the mask is 0, the first (original) values are retained,
+ * effectively masking out changes. Where the mask is 1, the
+ * second (new) values are retained, incorporating changes.
+ */
+ spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg);
+ }
+ } /* done calculating stencil values */
+
+ /* The stencil and/or depth values have been applied, and the
+ * mask_reg, fbS_reg, and fbZ_reg values have been updated.
+ * We're all done, except that we've allocated a fair number
+ * of registers that we didn't bother tracking. Release all
+ * those registers as part of the register set, and go home.
+ */
+ spe_release_register_set(f);
+
+ /* Return true if we could have modified the stencil and/or
+ * depth buffers.
+ */
+ return modified_buffers;
+}
/**
@@ -626,9 +1784,9 @@ gen_pack_colors(struct spe_function *f,
void
cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
{
- const struct pipe_depth_stencil_alpha_state *dsa =
- &cell->depth_stencil->base;
- const struct pipe_blend_state *blend = &cell->blend->base;
+ const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil;
+ const struct pipe_blend_state *blend = cell->blend;
+ const struct pipe_blend_color *blend_color = &cell->blend_color;
const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
/* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
@@ -642,6 +1800,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
const int fragB_reg = 10; /* vector float */
const int fragA_reg = 11; /* vector float */
const int mask_reg = 12; /* vector uint */
+ const int facing_reg = 13; /* uint */
/* offset of quad from start of tile
* XXX assuming 4-byte pixels for color AND Z/stencil!!!!
@@ -652,6 +1811,13 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
int fbZS_reg; /**< framebuffer's combined z/stencil values for quad */
spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+
+ if (cell->debug_flags & CELL_DEBUG_ASM) {
+ spe_print_code(f, true);
+ spe_indent(f, 8);
+ spe_comment(f, -4, "Begin per-fragment ops");
+ }
+
spe_allocate_register(f, x_reg);
spe_allocate_register(f, y_reg);
spe_allocate_register(f, color_tile_reg);
@@ -662,6 +1828,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
spe_allocate_register(f, fragB_reg);
spe_allocate_register(f, fragA_reg);
spe_allocate_register(f, mask_reg);
+ spe_allocate_register(f, facing_reg);
quad_offset_reg = spe_allocate_available_register(f);
fbRGBA_reg = spe_allocate_available_register(f);
@@ -674,8 +1841,9 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
ASSERT(TILE_SIZE == 32);
- spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */
+ spe_comment(f, 0, "Compute quad offset within tile");
spe_rotmi(f, y2_reg, y_reg, -1); /* y2 = y / 2 */
+ spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */
spe_shli(f, y2_reg, y2_reg, 4); /* y2 *= 16 */
spe_a(f, quad_offset_reg, y2_reg, x2_reg); /* offset = y2 + x2 */
spe_shli(f, quad_offset_reg, quad_offset_reg, 4); /* offset *= 16 */
@@ -684,139 +1852,198 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
spe_release_register(f, y2_reg);
}
-
if (dsa->alpha.enabled) {
gen_alpha_test(dsa, f, mask_reg, fragA_reg);
}
+ /* If we need the stencil buffers (because one- or two-sided stencil is
+ * enabled) or the depth buffer (because the depth test is enabled),
+ * go grab them. Note that if either one- or two-sided stencil is
+ * enabled, dsa->stencil[0].enabled will be true.
+ */
if (dsa->depth.enabled || dsa->stencil[0].enabled) {
const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
boolean write_depth_stencil;
- int fbZ_reg = spe_allocate_available_register(f); /* Z values */
- int fbS_reg = spe_allocate_available_register(f); /* Stencil values */
+ /* We may or may not need to allocate a register for Z or stencil values */
+ boolean fbS_reg_set = false, fbZ_reg_set = false;
+ unsigned int fbS_reg, fbZ_reg = 0;
+
+ spe_comment(f, 0, "Fetch quad's Z/stencil values from tile");
/* fetch quad of depth/stencil values from tile at (x,y) */
/* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+ /* XXX Not sure this is allowed if we've only got a 16-bit Z buffer... */
spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
- if (dsa->depth.enabled) {
- /* Extract Z bits from fbZS_reg into fbZ_reg */
- if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
- zs_format == PIPE_FORMAT_X8Z24_UNORM) {
- int mask_reg = spe_allocate_available_register(f);
- spe_fsmbi(f, mask_reg, 0x7777); /* mask[0,1,2,3] = 0x00ffffff */
- spe_and(f, fbZ_reg, fbZS_reg, mask_reg); /* fbZ = fbZS & mask */
- spe_release_register(f, mask_reg);
- /* OK, fbZ_reg has four 24-bit Z values now */
- }
- else {
- /* XXX handle other z/stencil formats */
- ASSERT(0);
- }
-
- /* Convert fragZ values from float[4] to uint[4] */
- if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
- zs_format == PIPE_FORMAT_X8Z24_UNORM ||
- zs_format == PIPE_FORMAT_Z24S8_UNORM ||
- zs_format == PIPE_FORMAT_Z24X8_UNORM) {
- /* 24-bit Z values */
- int scale_reg = spe_allocate_available_register(f);
-
- /* scale_reg[0,1,2,3] = float(2^24-1) */
- spe_load_float(f, scale_reg, (float) 0xffffff);
-
- /* XXX these two instructions might be combined */
- spe_fm(f, fragZ_reg, fragZ_reg, scale_reg); /* fragZ *= scale */
- spe_cfltu(f, fragZ_reg, fragZ_reg, 0); /* fragZ = (int) fragZ */
-
- spe_release_register(f, scale_reg);
- }
- else {
- /* XXX handle 16-bit Z format */
- ASSERT(0);
- }
+ /* From the Z/stencil buffer format, pull out the bits we need for
+ * Z and/or stencil. We'll also convert the incoming fragment Z
+ * value in fragZ_reg from a floating point value in [0.0..1.0] to
+ * an unsigned integer value with the appropriate resolution.
+ */
+ switch(zs_format) {
+
+ case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
+ case PIPE_FORMAT_X8Z24_UNORM:
+ if (dsa->depth.enabled) {
+ /* We need the Z part at least */
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ /* four 24-bit Z values in the low-order bits */
+ spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 24-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+ }
+ if (dsa->stencil[0].enabled) {
+ setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+ /* four 8-bit Z values in the high-order bits */
+ spe_rotmi(f, fbS_reg, fbZS_reg, -24);
+ }
+ break;
+
+ case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
+ case PIPE_FORMAT_Z24X8_UNORM:
+ if (dsa->depth.enabled) {
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ /* shift by 8 to get the upper 24-bit values */
+ spe_rotmi(f, fbS_reg, fbZS_reg, -8);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 24-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+ }
+ if (dsa->stencil[0].enabled) {
+ setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+ /* 8-bit stencil in the low-order bits - mask them out */
+ spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
+ }
+ break;
+
+ case PIPE_FORMAT_Z32_UNORM:
+ if (dsa->depth.enabled) {
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ /* Copy over 4 32-bit values */
+ spe_move(f, fbZ_reg, fbZS_reg);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 32-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ }
+ /* No stencil, so can't do anything there */
+ break;
+
+ case PIPE_FORMAT_Z16_UNORM:
+ if (dsa->depth.enabled) {
+ /* XXX Not sure this is correct, but it was here before, so we're
+ * going with it for now
+ */
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ /* Copy over 4 32-bit values */
+ spe_move(f, fbZ_reg, fbZS_reg);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 16-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
+ }
+ /* No stencil */
+ break;
+
+ default:
+ ASSERT(0); /* invalid format */
}
+ /* If stencil is enabled, use the stencil-specific code
+ * generator to generate both the stencil and depth (if needed)
+ * tests. Otherwise, if only depth is enabled, generate
+ * a quick depth test. The test generators themselves will
+ * report back whether the depth/stencil buffer has to be
+ * written back.
+ */
if (dsa->stencil[0].enabled) {
- /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */
- if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
- zs_format == PIPE_FORMAT_X8Z24_UNORM) {
- /* XXX extract with a shift */
- ASSERT(0);
- }
- else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
- zs_format == PIPE_FORMAT_Z24X8_UNORM) {
- /* XXX extract with a mask */
- ASSERT(0);
- }
- }
-
+ /* This will perform the stencil and depth tests, and update
+ * the mask_reg, fbZ_reg, and fbS_reg as required by the
+ * tests.
+ */
+ ASSERT(fbS_reg_set);
+ ASSERT(fbZ_reg_set);
+ spe_comment(f, 0, "Perform stencil test");
- if (dsa->stencil[0].enabled) {
- /* XXX this may involve depth testing too */
- // gen_stencil_test(dsa, f, ... );
- ASSERT(0);
+ write_depth_stencil = gen_stencil_depth_test(f, dsa, facing_reg, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
}
else if (dsa->depth.enabled) {
int zmask_reg = spe_allocate_available_register(f);
- gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+ spe_comment(f, 0, "Perform depth test");
+ write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
spe_release_register(f, zmask_reg);
}
-
- /* do we need to write Z and/or Stencil back into framebuffer? */
- write_depth_stencil = (dsa->depth.writemask |
- dsa->stencil[0].write_mask |
- dsa->stencil[1].write_mask);
+ else {
+ write_depth_stencil = false;
+ }
if (write_depth_stencil) {
/* Merge latest Z and Stencil values into fbZS_reg.
* fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
* fbS_reg has four 8-bit Z values in bits [7..0].
*/
+ spe_comment(f, 0, "Store quad's depth/stencil values in tile");
if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
zs_format == PIPE_FORMAT_X8Z24_UNORM) {
- spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
- spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+ if (fbS_reg_set) {
+ spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+ spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+ }
+ else {
+ spe_move(f, fbZS_reg, fbZ_reg);
+ }
+ }
+ else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+ zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+ spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
+ if (fbS_reg_set) {
+ spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+ }
}
- else if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
- zs_format == PIPE_FORMAT_X8Z24_UNORM) {
- /* XXX to do */
- ASSERT(0);
+ else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
+ spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
}
else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
- /* XXX to do */
- ASSERT(0);
+ spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
}
else if (zs_format == PIPE_FORMAT_S8_UNORM) {
- /* XXX to do */
- ASSERT(0);
+ ASSERT(0); /* XXX to do */
}
else {
- /* bad zs_format */
- ASSERT(0);
+ ASSERT(0); /* bad zs_format */
}
/* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
}
- spe_release_register(f, fbZ_reg);
- spe_release_register(f, fbS_reg);
+ release_optional_register(f, &fbZ_reg_set, fbZ_reg);
+ release_optional_register(f, &fbS_reg_set, fbS_reg);
}
-
/* Get framebuffer quad/colors. We'll need these for blending,
* color masking, and to obey the quad/pixel mask.
* Load: fbRGBA_reg = memory[color_tile + quad_offset]
* Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
* we could skip this load.
*/
+ spe_comment(f, 0, "Fetch quad colors from tile");
spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
-
if (blend->blend_enable) {
- gen_blend(blend, f, color_format,
+ spe_comment(f, 0, "Perform blending");
+ gen_blend(blend, blend_color, f, color_format,
fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
}
@@ -829,19 +2056,21 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
int rgba_reg = spe_allocate_available_register(f);
/* Pack four float colors as four 32-bit int colors */
+ spe_comment(f, 0, "Convert float quad colors to packed int framebuffer colors");
gen_pack_colors(f, color_format,
fragR_reg, fragG_reg, fragB_reg, fragA_reg,
rgba_reg);
if (blend->logicop_enable) {
+ spe_comment(f, 0, "Compute logic op");
gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
}
- if (blend->colormask != 0xf) {
- gen_colormask(blend->colormask, f, rgba_reg, fbRGBA_reg);
+ if (blend->colormask != PIPE_MASK_RGBA) {
+ spe_comment(f, 0, "Compute color mask");
+ gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg);
}
-
/* Mix fragment colors with framebuffer colors using the quad/pixel mask:
* if (mask[i])
* rgba[i] = rgba[i];
@@ -853,6 +2082,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
/* Store updated quad in tile:
* memory[color_tile + quad_offset] = rgba_reg;
*/
+ spe_comment(f, 0, "Store quad colors into color tile");
spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
spe_release_register(f, rgba_reg);
@@ -862,9 +2092,11 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
spe_bi(f, SPE_REG_RA, 0, 0); /* return from function call */
-
spe_release_register(f, fbRGBA_reg);
spe_release_register(f, fbZS_reg);
spe_release_register(f, quad_offset_reg);
-}
+ if (cell->debug_flags & CELL_DEBUG_ASM) {
+ spe_comment(f, -4, "End per-fragment ops");
+ }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index 475c6ef0ce6..8c55b8e0933 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -37,7 +37,6 @@
#include "cell_flush.h"
#include "cell_state.h"
#include "cell_texture.h"
-#include "cell_state_per_fragment.h"
@@ -45,24 +44,18 @@ static void *
cell_create_blend_state(struct pipe_context *pipe,
const struct pipe_blend_state *blend)
{
- struct cell_blend_state *cb = MALLOC(sizeof(struct cell_blend_state));
-
- (void) memcpy(cb, blend, sizeof(*blend));
-#if 0
- cell_generate_alpha_blend(cb);
-#endif
- return cb;
+ return mem_dup(blend, sizeof(*blend));
}
static void
-cell_bind_blend_state(struct pipe_context *pipe, void *state)
+cell_bind_blend_state(struct pipe_context *pipe, void *blend)
{
struct cell_context *cell = cell_context(pipe);
draw_flush(cell->draw);
- cell->blend = (struct cell_blend_state *) state;
+ cell->blend = (struct pipe_blend_state *) blend;
cell->dirty |= CELL_NEW_BLEND;
}
@@ -70,10 +63,7 @@ cell_bind_blend_state(struct pipe_context *pipe, void *state)
static void
cell_delete_blend_state(struct pipe_context *pipe, void *blend)
{
- struct cell_blend_state *cb = (struct cell_blend_state *) blend;
-
- spe_release_func(& cb->code);
- FREE(cb);
+ FREE(blend);
}
@@ -95,41 +85,29 @@ cell_set_blend_color(struct pipe_context *pipe,
static void *
cell_create_depth_stencil_alpha_state(struct pipe_context *pipe,
- const struct pipe_depth_stencil_alpha_state *depth_stencil)
+ const struct pipe_depth_stencil_alpha_state *dsa)
{
- struct cell_depth_stencil_alpha_state *cdsa =
- MALLOC(sizeof(struct cell_depth_stencil_alpha_state));
-
- (void) memcpy(cdsa, depth_stencil, sizeof(*depth_stencil));
-#if 0
- cell_generate_depth_stencil_test(cdsa);
-#endif
- return cdsa;
+ return mem_dup(dsa, sizeof(*dsa));
}
static void
cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe,
- void *depth_stencil)
+ void *dsa)
{
struct cell_context *cell = cell_context(pipe);
draw_flush(cell->draw);
- cell->depth_stencil =
- (struct cell_depth_stencil_alpha_state *) depth_stencil;
+ cell->depth_stencil = (struct pipe_depth_stencil_alpha_state *) dsa;
cell->dirty |= CELL_NEW_DEPTH_STENCIL;
}
static void
-cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *depth)
+cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *dsa)
{
- struct cell_depth_stencil_alpha_state *cdsa =
- (struct cell_depth_stencil_alpha_state *) depth;
-
- spe_release_func(& cdsa->code);
- FREE(cdsa);
+ FREE(dsa);
}
@@ -191,24 +169,23 @@ cell_set_polygon_stipple( struct pipe_context *pipe,
static void *
cell_create_rasterizer_state(struct pipe_context *pipe,
- const struct pipe_rasterizer_state *setup)
+ const struct pipe_rasterizer_state *rasterizer)
{
- struct pipe_rasterizer_state *state
- = MALLOC(sizeof(struct pipe_rasterizer_state));
- memcpy(state, setup, sizeof(struct pipe_rasterizer_state));
- return state;
+ return mem_dup(rasterizer, sizeof(*rasterizer));
}
static void
-cell_bind_rasterizer_state(struct pipe_context *pipe, void *setup)
+cell_bind_rasterizer_state(struct pipe_context *pipe, void *rast)
{
+ struct pipe_rasterizer_state *rasterizer =
+ (struct pipe_rasterizer_state *) rast;
struct cell_context *cell = cell_context(pipe);
/* pass-through to draw module */
- draw_set_rasterizer_state(cell->draw, setup);
+ draw_set_rasterizer_state(cell->draw, rasterizer);
- cell->rasterizer = (struct pipe_rasterizer_state *)setup;
+ cell->rasterizer = rasterizer;
cell->dirty |= CELL_NEW_RASTERIZER;
}
diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c
index dd25ae880e5..79cb8df82fa 100644
--- a/src/gallium/drivers/cell/ppu/cell_render.c
+++ b/src/gallium/drivers/cell/ppu/cell_render.c
@@ -152,6 +152,7 @@ cell_flush_prim_buffer(struct cell_context *cell)
struct cell_command_render *render = &cell_global.command[i].render;
render->prim_type = PIPE_PRIM_TRIANGLES;
render->num_verts = cell->prim_buffer.num_verts;
+ render->front_winding = cell->rasterizer->front_winding;
render->vertex_size = cell->vertex_info->size * 4;
render->xmin = cell->prim_buffer.xmin;
render->ymin = cell->prim_buffer.ymin;
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
index 139b3719b62..47ba6fa2909 100644
--- a/src/gallium/drivers/cell/ppu/cell_screen.c
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -58,9 +58,9 @@ cell_get_param(struct pipe_screen *screen, int param)
case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
return CELL_MAX_SAMPLERS;
case PIPE_CAP_NPOT_TEXTURES:
- return 0;
+ return 1;
case PIPE_CAP_TWO_SIDED_STENCIL:
- return 0;
+ return 1;
case PIPE_CAP_GLSL:
return 1;
case PIPE_CAP_S3TC:
@@ -68,13 +68,13 @@ cell_get_param(struct pipe_screen *screen, int param)
case PIPE_CAP_ANISOTROPIC_FILTER:
return 0;
case PIPE_CAP_POINT_SPRITE:
- return 0;
+ return 1;
case PIPE_CAP_MAX_RENDER_TARGETS:
return 1;
case PIPE_CAP_OCCLUSION_QUERY:
- return 0;
+ return 1;
case PIPE_CAP_TEXTURE_SHADOW_MAP:
- return 0;
+ return 10;
case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
return 12; /* max 2Kx2K */
case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
@@ -82,7 +82,7 @@ cell_get_param(struct pipe_screen *screen, int param)
case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
return 12; /* max 2Kx2K */
default:
- return 0;
+ return 10;
}
}
@@ -108,7 +108,7 @@ cell_get_paramf(struct pipe_screen *screen, int param)
return 16.0; /* arbitrary */
default:
- return 0;
+ return 10;
}
}
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c
index 9508227e298..df020c4146d 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.c
+++ b/src/gallium/drivers/cell/ppu/cell_spu.c
@@ -36,6 +36,7 @@
#include "cell_spu.h"
#include "pipe/p_format.h"
#include "pipe/p_state.h"
+#include "util/u_memory.h"
#include "cell/common.h"
@@ -131,6 +132,11 @@ cell_start_spus(struct cell_context *cell)
ASSERT_ALIGN16(&cell_global.inits[0]);
ASSERT_ALIGN16(&cell_global.inits[1]);
+ /*
+ * Initialize the global 'inits' structure for each SPU.
+ * A pointer to the init struct will be passed to each SPU.
+ * The SPUs will then each grab their init info with mfc_get().
+ */
for (i = 0; i < cell->num_spus; i++) {
cell_global.inits[i].id = i;
cell_global.inits[i].num_spus = cell->num_spus;
@@ -141,6 +147,8 @@ cell_start_spus(struct cell_context *cell)
}
cell_global.inits[i].buffer_status = &cell->buffer_status[0][0][0];
+ cell_global.inits[i].spu_functions = &cell->spu_functions;
+
cell_global.spe_contexts[i] = spe_context_create(0, NULL);
if (!cell_global.spe_contexts[i]) {
fprintf(stderr, "spe_context_create() failed\n");
diff --git a/src/gallium/drivers/cell/ppu/cell_state.h b/src/gallium/drivers/cell/ppu/cell_state.h
index a7771a55a31..b193170f9ce 100644
--- a/src/gallium/drivers/cell/ppu/cell_state.h
+++ b/src/gallium/drivers/cell/ppu/cell_state.h
@@ -44,8 +44,9 @@
#define CELL_NEW_TEXTURE 0x800
#define CELL_NEW_VERTEX 0x1000
#define CELL_NEW_VS 0x2000
-#define CELL_NEW_CONSTANTS 0x4000
-#define CELL_NEW_VERTEX_INFO 0x8000
+#define CELL_NEW_VS_CONSTANTS 0x4000
+#define CELL_NEW_FS_CONSTANTS 0x8000
+#define CELL_NEW_VERTEX_INFO 0x10000
extern void
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 2da3097983c..cbfa393cfba 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -25,18 +25,91 @@
*
**************************************************************************/
+#include "pipe/p_inlines.h"
#include "util/u_memory.h"
#include "cell_context.h"
#include "cell_gen_fragment.h"
#include "cell_state.h"
#include "cell_state_emit.h"
-#include "cell_state_per_fragment.h"
#include "cell_batch.h"
#include "cell_texture.h"
#include "draw/draw_context.h"
#include "draw/draw_private.h"
+/**
+ * Find/create a cell_command_fragment_ops object corresponding to the
+ * current blend/stencil/z/colormask/etc. state.
+ */
+static struct cell_command_fragment_ops *
+lookup_fragment_ops(struct cell_context *cell)
+{
+ struct cell_fragment_ops_key key;
+ struct cell_command_fragment_ops *ops;
+
+ /*
+ * Build key
+ */
+ memset(&key, 0, sizeof(key));
+ key.blend = *cell->blend;
+ key.dsa = *cell->depth_stencil;
+
+ if (cell->framebuffer.cbufs[0])
+ key.color_format = cell->framebuffer.cbufs[0]->format;
+ else
+ key.color_format = PIPE_FORMAT_NONE;
+
+ if (cell->framebuffer.zsbuf)
+ key.zs_format = cell->framebuffer.zsbuf->format;
+ else
+ key.zs_format = PIPE_FORMAT_NONE;
+
+ /*
+ * Look up key in cache.
+ */
+ ops = (struct cell_command_fragment_ops *)
+ util_keymap_lookup(cell->fragment_ops_cache, &key);
+
+ /*
+ * If not found, create/save new fragment ops command.
+ */
+ if (!ops) {
+ struct spe_function spe_code;
+
+ if (0)
+ debug_printf("**** Create New Fragment Ops\n");
+
+ /* Prepare the buffer that will hold the generated code. */
+ spe_init_func(&spe_code, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+
+ /* generate new code */
+ cell_gen_fragment_function(cell, &spe_code);
+
+ /* alloc new fragment ops command */
+ ops = CALLOC_STRUCT(cell_command_fragment_ops);
+
+ /* populate the new cell_command_fragment_ops object */
+ ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
+ memcpy(ops->code, spe_code.store, spe_code_size(&spe_code));
+ ops->dsa = *cell->depth_stencil;
+ ops->blend = *cell->blend;
+
+ /* insert cell_command_fragment_ops object into keymap/cache */
+ util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL);
+
+ /* release rtasm buffer */
+ spe_release_func(&spe_code);
+ }
+ else {
+ if (0)
+ debug_printf("**** Re-use Fragment Ops\n");
+ }
+
+ return ops;
+}
+
+
+
static void
emit_state_cmd(struct cell_context *cell, uint cmd,
const void *state, uint state_size)
@@ -90,26 +163,31 @@ cell_emit_state(struct cell_context *cell)
}
}
+ if (cell->dirty & (CELL_NEW_FS_CONSTANTS)) {
+ const uint shader = PIPE_SHADER_FRAGMENT;
+ const uint num_const = cell->constants[shader].size / sizeof(float);
+ uint i, j;
+ float *buf = cell_batch_alloc(cell, 16 + num_const * sizeof(float));
+ uint64_t *ibuf = (uint64_t *) buf;
+ const float *constants = pipe_buffer_map(cell->pipe.screen,
+ cell->constants[shader].buffer,
+ PIPE_BUFFER_USAGE_CPU_READ);
+ ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS;
+ ibuf[1] = num_const;
+ j = 4;
+ for (i = 0; i < num_const; i++) {
+ buf[j++] = constants[i];
+ }
+ pipe_buffer_unmap(cell->pipe.screen, cell->constants[shader].buffer);
+ }
+
if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
CELL_NEW_DEPTH_STENCIL |
CELL_NEW_BLEND)) {
- /* XXX we don't want to always do codegen here. We should have
- * a hash/lookup table to cache previous results...
- */
- struct cell_command_fragment_ops *fops
- = cell_batch_alloc(cell, sizeof(*fops));
- struct spe_function spe_code;
-
- /* generate new code */
- cell_gen_fragment_function(cell, &spe_code);
- /* put the new code into the batch buffer */
- fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
- memcpy(&fops->code, spe_code.store,
- SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
- fops->dsa = cell->depth_stencil->base;
- fops->blend = cell->blend->base;
- /* free codegen buffer */
- spe_release_func(&spe_code);
+ struct cell_command_fragment_ops *fops, *fops_cmd;
+ fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd));
+ fops = lookup_fragment_ops(cell);
+ memcpy(fops_cmd, fops, sizeof(*fops));
}
if (cell->dirty & CELL_NEW_SAMPLER) {
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index 3a0d066da2a..54a17eaf2b7 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -197,7 +197,10 @@ cell_set_constant_buffer(struct pipe_context *pipe,
buf->buffer);
cell->constants[shader].size = buf->size;
- cell->dirty |= CELL_NEW_CONSTANTS;
+ if (shader == PIPE_SHADER_VERTEX)
+ cell->dirty |= CELL_NEW_VS_CONSTANTS;
+ else if (shader == PIPE_SHADER_FRAGMENT)
+ cell->dirty |= CELL_NEW_FS_CONSTANTS;
}
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index aa63435b934..578ddf62dcb 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -214,6 +214,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
render->opcode = CELL_CMD_RENDER;
render->prim_type = cvbr->prim;
+ render->front_winding = cell->rasterizer->front_winding;
render->num_indexes = nr_indices;
render->min_index = min_index;
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index 566df7f59e3..18969005b02 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -73,8 +73,8 @@ emit_matrix_transpose(struct spe_function *p,
int col3;
- spe_lqd(p, shuf_hi, shuf_ptr, 3);
- spe_lqd(p, shuf_lo, shuf_ptr, 4);
+ spe_lqd(p, shuf_hi, shuf_ptr, 3*16);
+ spe_lqd(p, shuf_lo, shuf_ptr, 4*16);
spe_shufb(p, t1, row0, row2, shuf_hi);
spe_shufb(p, t2, row0, row2, shuf_lo);
@@ -122,13 +122,13 @@ emit_matrix_transpose(struct spe_function *p,
*/
switch (count) {
case 4:
- spe_stqd(p, col3, dest_ptr, 3);
+ spe_stqd(p, col3, dest_ptr, 3 * 16);
case 3:
- spe_stqd(p, col2, dest_ptr, 2);
+ spe_stqd(p, col2, dest_ptr, 2 * 16);
case 2:
- spe_stqd(p, col1, dest_ptr, 1);
+ spe_stqd(p, col1, dest_ptr, 1 * 16);
case 1:
- spe_stqd(p, col0, dest_ptr, 0);
+ spe_stqd(p, col0, dest_ptr, 0 * 16);
}
@@ -166,17 +166,17 @@ emit_fetch(struct spe_function *p,
float scale_signed = 0.0;
float scale_unsigned = 0.0;
- spe_lqd(p, v0, in_ptr, 0 + offset[0]);
- spe_lqd(p, v1, in_ptr, 1 + offset[0]);
- spe_lqd(p, v2, in_ptr, 2 + offset[0]);
- spe_lqd(p, v3, in_ptr, 3 + offset[0]);
+ spe_lqd(p, v0, in_ptr, (0 + offset[0]) * 16);
+ spe_lqd(p, v1, in_ptr, (1 + offset[0]) * 16);
+ spe_lqd(p, v2, in_ptr, (2 + offset[0]) * 16);
+ spe_lqd(p, v3, in_ptr, (3 + offset[0]) * 16);
offset[0] += 4;
switch (bytes) {
case 1:
scale_signed = 1.0f / 127.0f;
scale_unsigned = 1.0f / 255.0f;
- spe_lqd(p, tmp, shuf_ptr, 1);
+ spe_lqd(p, tmp, shuf_ptr, 1 * 16);
spe_shufb(p, v0, v0, v0, tmp);
spe_shufb(p, v1, v1, v1, tmp);
spe_shufb(p, v2, v2, v2, tmp);
@@ -185,7 +185,7 @@ emit_fetch(struct spe_function *p,
case 2:
scale_signed = 1.0f / 32767.0f;
scale_unsigned = 1.0f / 65535.0f;
- spe_lqd(p, tmp, shuf_ptr, 2);
+ spe_lqd(p, tmp, shuf_ptr, 2 * 16);
spe_shufb(p, v0, v0, v0, tmp);
spe_shufb(p, v1, v1, v1, tmp);
spe_shufb(p, v2, v2, v2, tmp);
@@ -241,11 +241,11 @@ emit_fetch(struct spe_function *p,
switch (count) {
case 1:
- spe_stqd(p, float_zero, out_ptr, 1);
+ spe_stqd(p, float_zero, out_ptr, 1 * 16);
case 2:
- spe_stqd(p, float_zero, out_ptr, 2);
+ spe_stqd(p, float_zero, out_ptr, 2 * 16);
case 3:
- spe_stqd(p, float_one, out_ptr, 3);
+ spe_stqd(p, float_one, out_ptr, 3 * 16);
}
if (float_zero != -1) {
diff --git a/src/gallium/drivers/cell/spu/.gitignore b/src/gallium/drivers/cell/spu/.gitignore
new file mode 100644
index 00000000000..2be9a2d3242
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/.gitignore
@@ -0,0 +1 @@
+g3d_spu
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index 1ae0dfb8c10..116453b79c5 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -16,8 +16,10 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
SOURCES = \
- spu_main.c \
+ spu_command.c \
spu_dcache.c \
+ spu_funcs.c \
+ spu_main.c \
spu_per_fragment_op.c \
spu_render.c \
spu_texture.c \
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
new file mode 100644
index 00000000000..91a4c137e7c
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -0,0 +1,621 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * SPU command processing code
+ */
+
+
+#include <stdio.h>
+#include <libmisc.h>
+
+#include "pipe/p_defines.h"
+
+#include "spu_command.h"
+#include "spu_main.h"
+#include "spu_render.h"
+#include "spu_per_fragment_op.h"
+#include "spu_texture.h"
+#include "spu_tile.h"
+#include "spu_vertex_shader.h"
+#include "spu_dcache.h"
+#include "spu_debug.h"
+#include "cell/common.h"
+
+
+struct spu_vs_context draw;
+
+
+/**
+ * Buffers containing dynamically generated SPU code:
+ */
+static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
+ ALIGN16_ATTRIB;
+
+
+
+/**
+ * Tell the PPU that this SPU has finished copying a buffer to
+ * local store and that it may be reused by the PPU.
+ * This is done by writting a 16-byte batch-buffer-status block back into
+ * main memory (in cell_context->buffer_status[]).
+ */
+static void
+release_buffer(uint buffer)
+{
+ /* Evidently, using less than a 16-byte status doesn't work reliably */
+ static const uint status[4] ALIGN16_ATTRIB
+ = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
+
+ const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
+ uint *dst = spu.init.buffer_status + index;
+
+ ASSERT(buffer < CELL_NUM_BUFFERS);
+
+ mfc_put((void *) &status, /* src in local memory */
+ (unsigned int) dst, /* dst in main memory */
+ sizeof(status), /* size */
+ TAG_MISC, /* tag is unimportant */
+ 0, /* tid */
+ 0 /* rid */);
+}
+
+
+static void
+cmd_clear_surface(const struct cell_command_clear_surface *clear)
+{
+ DEBUG_PRINTF("CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
+
+ if (clear->surface == 0) {
+ spu.fb.color_clear_value = clear->value;
+ if (spu.init.debug_flags & CELL_DEBUG_CHECKER) {
+ uint x = (spu.init.id << 4) | (spu.init.id << 12) |
+ (spu.init.id << 20) | (spu.init.id << 28);
+ spu.fb.color_clear_value ^= x;
+ }
+ }
+ else {
+ spu.fb.depth_clear_value = clear->value;
+ }
+
+#define CLEAR_OPT 1
+#if CLEAR_OPT
+
+ /* Simply set all tiles' status to CLEAR.
+ * When we actually begin rendering into a tile, we'll initialize it to
+ * the clear value. If any tiles go untouched during the frame,
+ * really_clear_tiles() will set them to the clear value.
+ */
+ if (clear->surface == 0) {
+ memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
+ }
+ else {
+ memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
+ }
+
+#else
+
+ /*
+ * This path clears the whole framebuffer to the clear color right now.
+ */
+
+ /*
+ printf("SPU: %s num=%d w=%d h=%d\n",
+ __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles);
+ */
+
+ /* init a single tile to the clear value */
+ if (clear->surface == 0) {
+ clear_c_tile(&spu.ctile);
+ }
+ else {
+ clear_z_tile(&spu.ztile);
+ }
+
+ /* walk over my tiles, writing the 'clear' tile's data */
+ {
+ const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+ uint i;
+ for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+ uint tx = i % spu.fb.width_tiles;
+ uint ty = i / spu.fb.width_tiles;
+ if (clear->surface == 0)
+ put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
+ else
+ put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
+ }
+ }
+
+ if (spu.init.debug_flags & CELL_DEBUG_SYNC) {
+ wait_on_mask(1 << TAG_SURFACE_CLEAR);
+ }
+
+#endif /* CLEAR_OPT */
+
+ DEBUG_PRINTF("CLEAR SURF done\n");
+}
+
+
+static void
+cmd_release_verts(const struct cell_command_release_verts *release)
+{
+ DEBUG_PRINTF("RELEASE VERTS %u\n", release->vertex_buf);
+ ASSERT(release->vertex_buf != ~0U);
+ release_buffer(release->vertex_buf);
+}
+
+
+/**
+ * Process a CELL_CMD_STATE_FRAGMENT_OPS command.
+ * This involves installing new fragment ops SPU code.
+ * If this function is never called, we'll use a regular C fallback function
+ * for fragment processing.
+ */
+static void
+cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
+{
+ static int warned = 0;
+
+ DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n");
+ /* Copy SPU code from batch buffer to spu buffer */
+ memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+ /* Copy state info (for fallback case only) */
+ memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
+ memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
+
+ /* Parity twist! For now, always use the fallback code by default,
+ * only switching to codegen when specifically requested. This
+ * allows us to develop freely without risking taking down the
+ * branch.
+ *
+ * Later, the parity of this check will be reversed, so that
+ * codegen is *always* used, unless we specifically indicate that
+ * we don't want it.
+ *
+ * Eventually, the option will be removed completely, because in
+ * final code we'll always use codegen and won't even provide the
+ * raw state records that the fallback code requires.
+ */
+ if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
+ spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
+ }
+ else {
+ /* otherwise, the default fallback code remains in place */
+ if (!warned) {
+ fprintf(stderr, "Cell Warning: using fallback per-fragment code\n");
+ warned = 1;
+ }
+ }
+
+ spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
+ spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
+}
+
+
+static void
+cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
+{
+ DEBUG_PRINTF("CMD_STATE_FRAGMENT_PROGRAM\n");
+ /* Copy SPU code from batch buffer to spu buffer */
+ memcpy(spu.fragment_program_code, fp->code,
+ SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
+#if 01
+ /* Point function pointer at new code */
+ spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code;
+#endif
+}
+
+
+static uint
+cmd_state_fs_constants(const uint64_t *buffer, uint pos)
+{
+ const uint num_const = buffer[pos + 1];
+ const float *constants = (const float *) &buffer[pos + 2];
+ uint i;
+
+ DEBUG_PRINTF("CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
+
+ /* Expand each float to float[4] for SOA execution */
+ for (i = 0; i < num_const; i++) {
+ spu.constants[i] = spu_splats(constants[i]);
+ }
+
+ /* return new buffer pos (in 8-byte words) */
+ return pos + 2 + num_const / 2;
+}
+
+
+static void
+cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
+{
+ DEBUG_PRINTF("FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n",
+ cmd->width,
+ cmd->height,
+ cmd->color_start,
+ cmd->color_format,
+ cmd->depth_format);
+
+ ASSERT_ALIGN16(cmd->color_start);
+ ASSERT_ALIGN16(cmd->depth_start);
+
+ spu.fb.color_start = cmd->color_start;
+ spu.fb.depth_start = cmd->depth_start;
+ spu.fb.color_format = cmd->color_format;
+ spu.fb.depth_format = cmd->depth_format;
+ spu.fb.width = cmd->width;
+ spu.fb.height = cmd->height;
+ spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE;
+ spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE;
+
+ switch (spu.fb.depth_format) {
+ case PIPE_FORMAT_Z32_UNORM:
+ spu.fb.zsize = 4;
+ spu.fb.zscale = (float) 0xffffffffu;
+ break;
+ case PIPE_FORMAT_Z24S8_UNORM:
+ case PIPE_FORMAT_S8Z24_UNORM:
+ case PIPE_FORMAT_Z24X8_UNORM:
+ case PIPE_FORMAT_X8Z24_UNORM:
+ spu.fb.zsize = 4;
+ spu.fb.zscale = (float) 0x00ffffffu;
+ break;
+ case PIPE_FORMAT_Z16_UNORM:
+ spu.fb.zsize = 2;
+ spu.fb.zscale = (float) 0xffffu;
+ break;
+ default:
+ spu.fb.zsize = 0;
+ break;
+ }
+}
+
+
+static void
+cmd_state_sampler(const struct cell_command_sampler *sampler)
+{
+ DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
+
+ spu.sampler[sampler->unit] = sampler->state;
+ if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
+ spu.sample_texture[sampler->unit] = sample_texture_bilinear;
+ else
+ spu.sample_texture[sampler->unit] = sample_texture_nearest;
+}
+
+
+static void
+cmd_state_texture(const struct cell_command_texture *texture)
+{
+ const uint unit = texture->unit;
+ const uint width = texture->width;
+ const uint height = texture->height;
+
+ DEBUG_PRINTF("TEXTURE [%u] at %p size %u x %u\n",
+ texture->unit, texture->start,
+ texture->width, texture->height);
+
+ spu.texture[unit].start = texture->start;
+ spu.texture[unit].width = width;
+ spu.texture[unit].height = height;
+
+ spu.texture[unit].tiles_per_row = width / TILE_SIZE;
+
+ spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
+ spu.texture[unit].tex_size_mask = (vector unsigned int)
+ { width - 1, height - 1, 0, 0 };
+ spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
+ spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
+}
+
+
+static void
+cmd_state_vertex_info(const struct vertex_info *vinfo)
+{
+ DEBUG_PRINTF("VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
+ ASSERT(vinfo->num_attribs >= 1);
+ ASSERT(vinfo->num_attribs <= 8);
+ memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
+}
+
+
+static void
+cmd_state_vs_array_info(const struct cell_array_info *vs_info)
+{
+ const unsigned attr = vs_info->attr;
+
+ ASSERT(attr < PIPE_MAX_ATTRIBS);
+ draw.vertex_fetch.src_ptr[attr] = vs_info->base;
+ draw.vertex_fetch.pitch[attr] = vs_info->pitch;
+ draw.vertex_fetch.size[attr] = vs_info->size;
+ draw.vertex_fetch.code_offset[attr] = vs_info->function_offset;
+ draw.vertex_fetch.dirty = 1;
+}
+
+
+static void
+cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
+{
+ mfc_get(attribute_fetch_code_buffer,
+ (unsigned int) code->base, /* src */
+ code->size,
+ TAG_BATCH_BUFFER,
+ 0, /* tid */
+ 0 /* rid */);
+ wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+ draw.vertex_fetch.code = attribute_fetch_code_buffer;
+}
+
+
+static void
+cmd_finish(void)
+{
+ DEBUG_PRINTF("FINISH\n");
+ really_clear_tiles(0);
+ /* wait for all outstanding DMAs to finish */
+ mfc_write_tag_mask(~0);
+ mfc_read_tag_status_all();
+ /* send mbox message to PPU */
+ spu_write_out_mbox(CELL_CMD_FINISH);
+}
+
+
+/**
+ * Execute a batch of commands which was sent to us by the PPU.
+ * See the cell_emit_state.c code to see where the commands come from.
+ *
+ * The opcode param encodes the location of the buffer and its size.
+ */
+static void
+cmd_batch(uint opcode)
+{
+ const uint buf = (opcode >> 8) & 0xff;
+ uint size = (opcode >> 16);
+ uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB;
+ const unsigned usize = size / sizeof(buffer[0]);
+ uint pos;
+
+ DEBUG_PRINTF("BATCH buffer %u, len %u, from %p\n",
+ buf, size, spu.init.buffers[buf]);
+
+ ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
+
+ ASSERT_ALIGN16(spu.init.buffers[buf]);
+
+ size = ROUNDUP16(size);
+
+ ASSERT_ALIGN16(spu.init.buffers[buf]);
+
+ mfc_get(buffer, /* dest */
+ (unsigned int) spu.init.buffers[buf], /* src */
+ size,
+ TAG_BATCH_BUFFER,
+ 0, /* tid */
+ 0 /* rid */);
+ wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+ /* Tell PPU we're done copying the buffer to local store */
+ DEBUG_PRINTF("release batch buf %u\n", buf);
+ release_buffer(buf);
+
+ /*
+ * Loop over commands in the batch buffer
+ */
+ for (pos = 0; pos < usize; /* no incr */) {
+ switch (buffer[pos]) {
+ /*
+ * rendering commands
+ */
+ case CELL_CMD_CLEAR_SURFACE:
+ {
+ struct cell_command_clear_surface *clr
+ = (struct cell_command_clear_surface *) &buffer[pos];
+ cmd_clear_surface(clr);
+ pos += sizeof(*clr) / 8;
+ }
+ break;
+ case CELL_CMD_RENDER:
+ {
+ struct cell_command_render *render
+ = (struct cell_command_render *) &buffer[pos];
+ uint pos_incr;
+ cmd_render(render, &pos_incr);
+ pos += pos_incr;
+ }
+ break;
+ /*
+ * state-update commands
+ */
+ case CELL_CMD_STATE_FRAMEBUFFER:
+ {
+ struct cell_command_framebuffer *fb
+ = (struct cell_command_framebuffer *) &buffer[pos];
+ cmd_state_framebuffer(fb);
+ pos += sizeof(*fb) / 8;
+ }
+ break;
+ case CELL_CMD_STATE_FRAGMENT_OPS:
+ {
+ struct cell_command_fragment_ops *fops
+ = (struct cell_command_fragment_ops *) &buffer[pos];
+ cmd_state_fragment_ops(fops);
+ pos += sizeof(*fops) / 8;
+ }
+ break;
+ case CELL_CMD_STATE_FRAGMENT_PROGRAM:
+ {
+ struct cell_command_fragment_program *fp
+ = (struct cell_command_fragment_program *) &buffer[pos];
+ cmd_state_fragment_program(fp);
+ pos += sizeof(*fp) / 8;
+ }
+ break;
+ case CELL_CMD_STATE_FS_CONSTANTS:
+ pos = cmd_state_fs_constants(buffer, pos);
+ break;
+ case CELL_CMD_STATE_SAMPLER:
+ {
+ struct cell_command_sampler *sampler
+ = (struct cell_command_sampler *) &buffer[pos];
+ cmd_state_sampler(sampler);
+ pos += sizeof(*sampler) / 8;
+ }
+ break;
+ case CELL_CMD_STATE_TEXTURE:
+ {
+ struct cell_command_texture *texture
+ = (struct cell_command_texture *) &buffer[pos];
+ cmd_state_texture(texture);
+ pos += sizeof(*texture) / 8;
+ }
+ break;
+ case CELL_CMD_STATE_VERTEX_INFO:
+ cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
+ pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8);
+ break;
+ case CELL_CMD_STATE_VIEWPORT:
+ (void) memcpy(& draw.viewport, &buffer[pos+1],
+ sizeof(struct pipe_viewport_state));
+ pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
+ break;
+ case CELL_CMD_STATE_UNIFORMS:
+ draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1];
+ pos += 2;
+ break;
+ case CELL_CMD_STATE_VS_ARRAY_INFO:
+ cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
+ pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
+ break;
+ case CELL_CMD_STATE_BIND_VS:
+#if 0
+ spu_bind_vertex_shader(&draw,
+ (struct cell_shader_info *) &buffer[pos+1]);
+#endif
+ pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
+ break;
+ case CELL_CMD_STATE_ATTRIB_FETCH:
+ cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
+ &buffer[pos+1]);
+ pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
+ break;
+ /*
+ * misc commands
+ */
+ case CELL_CMD_FINISH:
+ cmd_finish();
+ pos += 1;
+ break;
+ case CELL_CMD_RELEASE_VERTS:
+ {
+ struct cell_command_release_verts *release
+ = (struct cell_command_release_verts *) &buffer[pos];
+ cmd_release_verts(release);
+ pos += sizeof(*release) / 8;
+ }
+ break;
+ case CELL_CMD_FLUSH_BUFFER_RANGE: {
+ struct cell_buffer_range *br = (struct cell_buffer_range *)
+ &buffer[pos+1];
+
+ spu_dcache_mark_dirty((unsigned) br->base, br->size);
+ pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8);
+ break;
+ }
+ default:
+ printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
+ ASSERT(0);
+ break;
+ }
+ }
+
+ DEBUG_PRINTF("BATCH complete\n");
+}
+
+
+
+/**
+ * Main loop for SPEs: Get a command, execute it, repeat.
+ */
+void
+command_loop(void)
+{
+ struct cell_command cmd;
+ int exitFlag = 0;
+
+ DEBUG_PRINTF("Enter command loop\n");
+
+ ASSERT((sizeof(struct cell_command) & 0xf) == 0);
+ ASSERT_ALIGN16(&cmd);
+
+ while (!exitFlag) {
+ unsigned opcode;
+ int tag = 0;
+
+ DEBUG_PRINTF("Wait for cmd...\n");
+
+ /* read/wait from mailbox */
+ opcode = (unsigned int) spu_read_in_mbox();
+
+ DEBUG_PRINTF("got cmd 0x%x\n", opcode);
+
+ /* command payload */
+ mfc_get(&cmd, /* dest */
+ (unsigned int) spu.init.cmd, /* src */
+ sizeof(struct cell_command), /* bytes */
+ tag,
+ 0, /* tid */
+ 0 /* rid */);
+ wait_on_mask( 1 << tag );
+
+ /*
+ * NOTE: most commands should be contained in a batch buffer
+ */
+
+ switch (opcode & CELL_CMD_OPCODE_MASK) {
+ case CELL_CMD_EXIT:
+ DEBUG_PRINTF("EXIT\n");
+ exitFlag = 1;
+ break;
+ case CELL_CMD_VS_EXECUTE:
+#if 0
+ spu_execute_vertex_shader(&draw, &cmd.vs);
+#endif
+ break;
+ case CELL_CMD_BATCH:
+ cmd_batch(opcode);
+ break;
+ default:
+ printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK);
+ }
+
+ }
+
+ DEBUG_PRINTF("Exit command loop\n");
+
+ spu_dcache_report();
+}
diff --git a/src/gallium/drivers/cell/spu/spu_command.h b/src/gallium/drivers/cell/spu/spu_command.h
new file mode 100644
index 00000000000..853e9aa5498
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_command.h
@@ -0,0 +1,7 @@
+
+
+
+extern void
+command_loop(void);
+
+
diff --git a/src/gallium/drivers/cell/spu/spu_debug.h b/src/gallium/drivers/cell/spu/spu_debug.h
new file mode 100644
index 00000000000..eeec0526558
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_debug.h
@@ -0,0 +1,60 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef SPU_DEBUG_H
+#define SPU_DEBUG_H
+
+
+/* Set to 0 to disable all extraneous debugging code */
+#define DEBUG 1
+
+#if DEBUG
+extern boolean Debug;
+extern boolean force_fragment_ops_fallback;
+
+/* These debug macros use the unusual construction ", ##__VA_ARGS__"
+ * which expands to the expected comma + args if variadic arguments
+ * are supplied, but swallows the comma if there are no variadic
+ * arguments (which avoids syntax errors that would otherwise occur).
+ */
+#define DEBUG_PRINTF(format,...) \
+ if (Debug) \
+ printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+#define D_PRINTF(flag, format,...) \
+ if (spu.init.debug_flags & (flag)) \
+ printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+
+#else
+
+#define DEBUG_PRINTF(...)
+#define D_PRINTF(...)
+
+#endif
+
+
+#endif /* SPU_DEBUG_H */
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
new file mode 100644
index 00000000000..c7bcb3de9dd
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -0,0 +1,176 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * SPU functions accessed by shaders.
+ *
+ * Authors: Brian Paul
+ */
+
+
+#include <string.h>
+#include <libmisc.h>
+#include <math.h>
+#include <cos14_v.h>
+#include <sin14_v.h>
+#include <transpose_matrix4x4.h>
+
+#include "cell/common.h"
+#include "spu_main.h"
+#include "spu_funcs.h"
+
+
+/** For "return"-ing four vectors */
+struct vec_4x4
+{
+ vector float v[4];
+};
+
+
+static vector float
+spu_cos(vector float x)
+{
+ return _cos14_v(x);
+}
+
+static vector float
+spu_sin(vector float x)
+{
+ return _sin14_v(x);
+}
+
+static vector float
+spu_pow(vector float x, vector float y)
+{
+ float z0 = powf(spu_extract(x,0), spu_extract(y,0));
+ float z1 = powf(spu_extract(x,1), spu_extract(y,1));
+ float z2 = powf(spu_extract(x,2), spu_extract(y,2));
+ float z3 = powf(spu_extract(x,3), spu_extract(y,3));
+ return (vector float) {z0, z1, z2, z3};
+}
+
+static vector float
+spu_exp2(vector float x)
+{
+ float z0 = powf(2.0f, spu_extract(x,0));
+ float z1 = powf(2.0f, spu_extract(x,1));
+ float z2 = powf(2.0f, spu_extract(x,2));
+ float z3 = powf(2.0f, spu_extract(x,3));
+ return (vector float) {z0, z1, z2, z3};
+}
+
+static vector float
+spu_log2(vector float x)
+{
+ /*
+ * log_base_2(x) = log(x) / log(2)
+ * 1.442695 = 1/log(2).
+ */
+ static const vector float k = {1.442695F, 1.442695F, 1.442695F, 1.442695F};
+ float z0 = logf(spu_extract(x,0));
+ float z1 = logf(spu_extract(x,1));
+ float z2 = logf(spu_extract(x,2));
+ float z3 = logf(spu_extract(x,3));
+ vector float v = (vector float) {z0, z1, z2, z3};
+ return spu_mul(v, k);
+}
+
+static struct vec_4x4
+spu_txp(vector float s, vector float t, vector float r, vector float q)
+{
+ const uint unit = 0;
+ struct vec_4x4 colors;
+ vector float coords[4];
+
+ coords[0] = s;
+ coords[1] = t;
+ coords[2] = r;
+ coords[3] = q;
+ _transpose_matrix4x4(coords, coords);
+
+ /* get four texture samples */
+ colors.v[0] = spu.sample_texture[unit](unit, coords[0]);
+ colors.v[1] = spu.sample_texture[unit](unit, coords[1]);
+ colors.v[2] = spu.sample_texture[unit](unit, coords[2]);
+ colors.v[3] = spu.sample_texture[unit](unit, coords[3]);
+
+ _transpose_matrix4x4(colors.v, colors.v);
+ return colors;
+}
+
+
+/**
+ * Add named function to list of "exported" functions that will be
+ * made available to the PPU-hosted code generator.
+ */
+static void
+export_func(struct cell_spu_function_info *spu_functions,
+ const char *name, void *addr)
+{
+ uint n = spu_functions->num;
+ ASSERT(strlen(name) < 16);
+ strcpy(spu_functions->names[n], name);
+ spu_functions->addrs[n] = (uint) addr;
+ spu_functions->num++;
+ ASSERT(spu_functions->num <= 16);
+}
+
+
+/**
+ * Return info about the SPU's function to the PPU / main memory.
+ * The PPU needs to know the address of some SPU-side functions so
+ * that we can generate shader code with function calls.
+ */
+void
+return_function_info(void)
+{
+ struct cell_spu_function_info funcs ALIGN16_ATTRIB;
+ int tag = TAG_MISC;
+
+ ASSERT(sizeof(funcs) == 256); /* must be multiple of 16 bytes */
+
+ funcs.num = 0;
+ export_func(&funcs, "spu_cos", &spu_cos);
+ export_func(&funcs, "spu_sin", &spu_sin);
+ export_func(&funcs, "spu_pow", &spu_pow);
+ export_func(&funcs, "spu_exp2", &spu_exp2);
+ export_func(&funcs, "spu_log2", &spu_log2);
+ export_func(&funcs, "spu_txp", &spu_txp);
+
+ /* Send the function info back to the PPU / main memory */
+ mfc_put((void *) &funcs, /* src in local store */
+ (unsigned int) spu.init.spu_functions, /* dst in main memory */
+ sizeof(funcs), /* bytes */
+ tag,
+ 0, /* tid */
+ 0 /* rid */);
+ wait_on_mask(1 << tag);
+}
+
+
+
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.h b/src/gallium/drivers/cell/spu/spu_funcs.h
new file mode 100644
index 00000000000..3adb6ae99f9
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_funcs.h
@@ -0,0 +1,35 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef SPU_FUNCS_H
+#define SPU_FUNCS_H
+
+extern void
+return_function_info(void);
+
+#endif
+
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 78260c4259c..4becd0f92a4 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -32,16 +32,16 @@
#include <stdio.h>
#include <libmisc.h>
+#include "pipe/p_defines.h"
+
+#include "spu_funcs.h"
+#include "spu_command.h"
#include "spu_main.h"
-#include "spu_render.h"
#include "spu_per_fragment_op.h"
#include "spu_texture.h"
-#include "spu_tile.h"
//#include "spu_test.h"
-#include "spu_vertex_shader.h"
-#include "spu_dcache.h"
+#include "spu_debug.h"
#include "cell/common.h"
-#include "pipe/p_defines.h"
/*
@@ -50,599 +50,13 @@ helpful headers:
/opt/cell/sdk/usr/include/libmisc.h
*/
-boolean Debug = FALSE;
-
struct spu_global spu;
-struct spu_vs_context draw;
-
-
-/**
- * Buffers containing dynamically generated SPU code:
- */
-static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
- ALIGN16_ATTRIB;
-
-
-
-/**
- * Tell the PPU that this SPU has finished copying a buffer to
- * local store and that it may be reused by the PPU.
- * This is done by writting a 16-byte batch-buffer-status block back into
- * main memory (in cell_context->buffer_status[]).
- */
-static void
-release_buffer(uint buffer)
-{
- /* Evidently, using less than a 16-byte status doesn't work reliably */
- static const uint status[4] ALIGN16_ATTRIB
- = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
-
- const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
- uint *dst = spu.init.buffer_status + index;
-
- ASSERT(buffer < CELL_NUM_BUFFERS);
-
- mfc_put((void *) &status, /* src in local memory */
- (unsigned int) dst, /* dst in main memory */
- sizeof(status), /* size */
- TAG_MISC, /* tag is unimportant */
- 0, /* tid */
- 0 /* rid */);
-}
-
-/**
- * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled
- * tiles back to the main framebuffer.
- */
-static void
-really_clear_tiles(uint surfaceIndex)
-{
- const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
- uint i;
-
- if (surfaceIndex == 0) {
- clear_c_tile(&spu.ctile);
-
- for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
- uint tx = i % spu.fb.width_tiles;
- uint ty = i / spu.fb.width_tiles;
- if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) {
- put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
- }
- }
- }
- else {
- clear_z_tile(&spu.ztile);
-
- for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
- uint tx = i % spu.fb.width_tiles;
- uint ty = i / spu.fb.width_tiles;
- if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR)
- put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 1);
- }
- }
-
-#if 0
- wait_on_mask(1 << TAG_SURFACE_CLEAR);
-#endif
-}
-
-
-static void
-cmd_clear_surface(const struct cell_command_clear_surface *clear)
-{
- if (Debug)
- printf("SPU %u: CLEAR SURF %u to 0x%08x\n", spu.init.id,
- clear->surface, clear->value);
-
- if (clear->surface == 0) {
- spu.fb.color_clear_value = clear->value;
- if (spu.init.debug_flags & CELL_DEBUG_CHECKER) {
- uint x = (spu.init.id << 4) | (spu.init.id << 12) |
- (spu.init.id << 20) | (spu.init.id << 28);
- spu.fb.color_clear_value ^= x;
- }
- }
- else {
- spu.fb.depth_clear_value = clear->value;
- }
-
-#define CLEAR_OPT 1
-#if CLEAR_OPT
-
- /* Simply set all tiles' status to CLEAR.
- * When we actually begin rendering into a tile, we'll initialize it to
- * the clear value. If any tiles go untouched during the frame,
- * really_clear_tiles() will set them to the clear value.
- */
- if (clear->surface == 0) {
- memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
- }
- else {
- memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
- }
-
-#else
-
- /*
- * This path clears the whole framebuffer to the clear color right now.
- */
-
- /*
- printf("SPU: %s num=%d w=%d h=%d\n",
- __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles);
- */
-
- /* init a single tile to the clear value */
- if (clear->surface == 0) {
- clear_c_tile(&spu.ctile);
- }
- else {
- clear_z_tile(&spu.ztile);
- }
-
- /* walk over my tiles, writing the 'clear' tile's data */
- {
- const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
- uint i;
- for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
- uint tx = i % spu.fb.width_tiles;
- uint ty = i / spu.fb.width_tiles;
- if (clear->surface == 0)
- put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
- else
- put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
- }
- }
-
- if (spu.init.debug_flags & CELL_DEBUG_SYNC) {
- wait_on_mask(1 << TAG_SURFACE_CLEAR);
- }
-
-#endif /* CLEAR_OPT */
-
- if (Debug)
- printf("SPU %u: CLEAR SURF done\n", spu.init.id);
-}
-
-
-static void
-cmd_release_verts(const struct cell_command_release_verts *release)
-{
- if (Debug)
- printf("SPU %u: RELEASE VERTS %u\n",
- spu.init.id, release->vertex_buf);
- ASSERT(release->vertex_buf != ~0U);
- release_buffer(release->vertex_buf);
-}
-
-
-/**
- * Process a CELL_CMD_STATE_FRAGMENT_OPS command.
- * This involves installing new fragment ops SPU code.
- * If this function is never called, we'll use a regular C fallback function
- * for fragment processing.
- */
-static void
-cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
-{
- if (Debug)
- printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id);
- /* Copy SPU code from batch buffer to spu buffer */
- memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
- /* Copy state info (for fallback case only) */
- memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
- memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
-
- /* Point function pointer at new code */
- spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
-
- spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
- spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
-}
-
-
-static void
-cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
-{
- if (Debug)
- printf("SPU %u: CMD_STATE_FRAGMENT_PROGRAM\n", spu.init.id);
- /* Copy SPU code from batch buffer to spu buffer */
- memcpy(spu.fragment_program_code, fp->code,
- SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
-#if 01
- /* Point function pointer at new code */
- spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code;
-#endif
-}
-
-
-static void
-cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
-{
- if (Debug)
- printf("SPU %u: FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n",
- spu.init.id,
- cmd->width,
- cmd->height,
- cmd->color_start,
- cmd->color_format,
- cmd->depth_format);
-
- ASSERT_ALIGN16(cmd->color_start);
- ASSERT_ALIGN16(cmd->depth_start);
-
- spu.fb.color_start = cmd->color_start;
- spu.fb.depth_start = cmd->depth_start;
- spu.fb.color_format = cmd->color_format;
- spu.fb.depth_format = cmd->depth_format;
- spu.fb.width = cmd->width;
- spu.fb.height = cmd->height;
- spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE;
- spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE;
-
- switch (spu.fb.depth_format) {
- case PIPE_FORMAT_Z32_UNORM:
- spu.fb.zsize = 4;
- spu.fb.zscale = (float) 0xffffffffu;
- break;
- case PIPE_FORMAT_Z24S8_UNORM:
- case PIPE_FORMAT_S8Z24_UNORM:
- case PIPE_FORMAT_Z24X8_UNORM:
- case PIPE_FORMAT_X8Z24_UNORM:
- spu.fb.zsize = 4;
- spu.fb.zscale = (float) 0x00ffffffu;
- break;
- case PIPE_FORMAT_Z16_UNORM:
- spu.fb.zsize = 2;
- spu.fb.zscale = (float) 0xffffu;
- break;
- default:
- spu.fb.zsize = 0;
- break;
- }
-}
-
-
-static void
-cmd_state_sampler(const struct cell_command_sampler *sampler)
-{
- if (Debug)
- printf("SPU %u: SAMPLER [%u]\n",
- spu.init.id, sampler->unit);
-
- spu.sampler[sampler->unit] = sampler->state;
- if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
- spu.sample_texture[sampler->unit] = sample_texture_bilinear;
- else
- spu.sample_texture[sampler->unit] = sample_texture_nearest;
-}
-
-
-static void
-cmd_state_texture(const struct cell_command_texture *texture)
-{
- const uint unit = texture->unit;
- const uint width = texture->width;
- const uint height = texture->height;
-
- if (Debug) {
- printf("SPU %u: TEXTURE [%u] at %p size %u x %u\n", spu.init.id,
- texture->unit, texture->start,
- texture->width, texture->height);
- }
-
- spu.texture[unit].start = texture->start;
- spu.texture[unit].width = width;
- spu.texture[unit].height = height;
-
- spu.texture[unit].tiles_per_row = width / TILE_SIZE;
-
- spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
- spu.texture[unit].tex_size_mask = (vector unsigned int)
- { width - 1, height - 1, 0, 0 };
- spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
- spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
-}
-
-
-static void
-cmd_state_vertex_info(const struct vertex_info *vinfo)
-{
- if (Debug) {
- printf("SPU %u: VERTEX_INFO num_attribs=%u\n", spu.init.id,
- vinfo->num_attribs);
- }
- ASSERT(vinfo->num_attribs >= 1);
- ASSERT(vinfo->num_attribs <= 8);
- memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
-}
-
-
-static void
-cmd_state_vs_array_info(const struct cell_array_info *vs_info)
-{
- const unsigned attr = vs_info->attr;
-
- ASSERT(attr < PIPE_MAX_ATTRIBS);
- draw.vertex_fetch.src_ptr[attr] = vs_info->base;
- draw.vertex_fetch.pitch[attr] = vs_info->pitch;
- draw.vertex_fetch.size[attr] = vs_info->size;
- draw.vertex_fetch.code_offset[attr] = vs_info->function_offset;
- draw.vertex_fetch.dirty = 1;
-}
-
-
-static void
-cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
-{
- mfc_get(attribute_fetch_code_buffer,
- (unsigned int) code->base, /* src */
- code->size,
- TAG_BATCH_BUFFER,
- 0, /* tid */
- 0 /* rid */);
- wait_on_mask(1 << TAG_BATCH_BUFFER);
-
- draw.vertex_fetch.code = attribute_fetch_code_buffer;
-}
-
-
-static void
-cmd_finish(void)
-{
- if (Debug)
- printf("SPU %u: FINISH\n", spu.init.id);
- really_clear_tiles(0);
- /* wait for all outstanding DMAs to finish */
- mfc_write_tag_mask(~0);
- mfc_read_tag_status_all();
- /* send mbox message to PPU */
- spu_write_out_mbox(CELL_CMD_FINISH);
-}
-
-
-/**
- * Execute a batch of commands which was sent to us by the PPU.
- * See the cell_emit_state.c code to see where the commands come from.
- *
- * The opcode param encodes the location of the buffer and its size.
- */
-static void
-cmd_batch(uint opcode)
-{
- const uint buf = (opcode >> 8) & 0xff;
- uint size = (opcode >> 16);
- uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB;
- const unsigned usize = size / sizeof(buffer[0]);
- uint pos;
-
- if (Debug)
- printf("SPU %u: BATCH buffer %u, len %u, from %p\n",
- spu.init.id, buf, size, spu.init.buffers[buf]);
-
- ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
-
- ASSERT_ALIGN16(spu.init.buffers[buf]);
-
- size = ROUNDUP16(size);
-
- ASSERT_ALIGN16(spu.init.buffers[buf]);
-
- mfc_get(buffer, /* dest */
- (unsigned int) spu.init.buffers[buf], /* src */
- size,
- TAG_BATCH_BUFFER,
- 0, /* tid */
- 0 /* rid */);
- wait_on_mask(1 << TAG_BATCH_BUFFER);
-
- /* Tell PPU we're done copying the buffer to local store */
- if (Debug)
- printf("SPU %u: release batch buf %u\n", spu.init.id, buf);
- release_buffer(buf);
-
- /*
- * Loop over commands in the batch buffer
- */
- for (pos = 0; pos < usize; /* no incr */) {
- switch (buffer[pos]) {
- /*
- * rendering commands
- */
- case CELL_CMD_CLEAR_SURFACE:
- {
- struct cell_command_clear_surface *clr
- = (struct cell_command_clear_surface *) &buffer[pos];
- cmd_clear_surface(clr);
- pos += sizeof(*clr) / 8;
- }
- break;
- case CELL_CMD_RENDER:
- {
- struct cell_command_render *render
- = (struct cell_command_render *) &buffer[pos];
- uint pos_incr;
- cmd_render(render, &pos_incr);
- pos += pos_incr;
- }
- break;
- /*
- * state-update commands
- */
- case CELL_CMD_STATE_FRAMEBUFFER:
- {
- struct cell_command_framebuffer *fb
- = (struct cell_command_framebuffer *) &buffer[pos];
- cmd_state_framebuffer(fb);
- pos += sizeof(*fb) / 8;
- }
- break;
- case CELL_CMD_STATE_FRAGMENT_OPS:
- {
- struct cell_command_fragment_ops *fops
- = (struct cell_command_fragment_ops *) &buffer[pos];
- cmd_state_fragment_ops(fops);
- pos += sizeof(*fops) / 8;
- }
- break;
- case CELL_CMD_STATE_FRAGMENT_PROGRAM:
- {
- struct cell_command_fragment_program *fp
- = (struct cell_command_fragment_program *) &buffer[pos];
- cmd_state_fragment_program(fp);
- pos += sizeof(*fp) / 8;
- }
- break;
- case CELL_CMD_STATE_SAMPLER:
- {
- struct cell_command_sampler *sampler
- = (struct cell_command_sampler *) &buffer[pos];
- cmd_state_sampler(sampler);
- pos += sizeof(*sampler) / 8;
- }
- break;
- case CELL_CMD_STATE_TEXTURE:
- {
- struct cell_command_texture *texture
- = (struct cell_command_texture *) &buffer[pos];
- cmd_state_texture(texture);
- pos += sizeof(*texture) / 8;
- }
- break;
- case CELL_CMD_STATE_VERTEX_INFO:
- cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
- pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8);
- break;
- case CELL_CMD_STATE_VIEWPORT:
- (void) memcpy(& draw.viewport, &buffer[pos+1],
- sizeof(struct pipe_viewport_state));
- pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
- break;
- case CELL_CMD_STATE_UNIFORMS:
- draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1];
- pos += 2;
- break;
- case CELL_CMD_STATE_VS_ARRAY_INFO:
- cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
- pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
- break;
- case CELL_CMD_STATE_BIND_VS:
-#if 0
- spu_bind_vertex_shader(&draw,
- (struct cell_shader_info *) &buffer[pos+1]);
-#endif
- pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
- break;
- case CELL_CMD_STATE_ATTRIB_FETCH:
- cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
- &buffer[pos+1]);
- pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
- break;
- /*
- * misc commands
- */
- case CELL_CMD_FINISH:
- cmd_finish();
- pos += 1;
- break;
- case CELL_CMD_RELEASE_VERTS:
- {
- struct cell_command_release_verts *release
- = (struct cell_command_release_verts *) &buffer[pos];
- cmd_release_verts(release);
- pos += sizeof(*release) / 8;
- }
- break;
- case CELL_CMD_FLUSH_BUFFER_RANGE: {
- struct cell_buffer_range *br = (struct cell_buffer_range *)
- &buffer[pos+1];
-
- spu_dcache_mark_dirty((unsigned) br->base, br->size);
- pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8);
- break;
- }
- default:
- printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
- ASSERT(0);
- break;
- }
- }
-
- if (Debug)
- printf("SPU %u: BATCH complete\n", spu.init.id);
-}
-
-
-/**
- * Temporary/simple main loop for SPEs: Get a command, execute it, repeat.
- */
-static void
-main_loop(void)
-{
- struct cell_command cmd;
- int exitFlag = 0;
-
- if (Debug)
- printf("SPU %u: Enter main loop\n", spu.init.id);
-
- ASSERT((sizeof(struct cell_command) & 0xf) == 0);
- ASSERT_ALIGN16(&cmd);
-
- while (!exitFlag) {
- unsigned opcode;
- int tag = 0;
-
- if (Debug)
- printf("SPU %u: Wait for cmd...\n", spu.init.id);
-
- /* read/wait from mailbox */
- opcode = (unsigned int) spu_read_in_mbox();
-
- if (Debug)
- printf("SPU %u: got cmd 0x%x\n", spu.init.id, opcode);
-
- /* command payload */
- mfc_get(&cmd, /* dest */
- (unsigned int) spu.init.cmd, /* src */
- sizeof(struct cell_command), /* bytes */
- tag,
- 0, /* tid */
- 0 /* rid */);
- wait_on_mask( 1 << tag );
-
- /*
- * NOTE: most commands should be contained in a batch buffer
- */
-
- switch (opcode & CELL_CMD_OPCODE_MASK) {
- case CELL_CMD_EXIT:
- if (Debug)
- printf("SPU %u: EXIT\n", spu.init.id);
- exitFlag = 1;
- break;
- case CELL_CMD_VS_EXECUTE:
-#if 0
- spu_execute_vertex_shader(&draw, &cmd.vs);
+#if DEBUG
+boolean Debug = FALSE;
+boolean force_fragment_ops_fallback = TRUE;
#endif
- break;
- case CELL_CMD_BATCH:
- cmd_batch(opcode);
- break;
- default:
- printf("Bad opcode!\n");
- }
-
- }
-
- if (Debug)
- printf("SPU %u: Exit main loop\n", spu.init.id);
-
- spu_dcache_report();
-}
-
static void
@@ -653,7 +67,8 @@ one_time_init(void)
invalidate_tex_cache();
/* Install default/fallback fragment processing function.
- * This will normally be overriden by a code-gen'd function.
+ * This will normally be overriden by a code-gen'd function
+ * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set.
*/
spu.fragment_ops = spu_fallback_fragment_ops;
}
@@ -682,12 +97,15 @@ main(main_param_t speid, main_param_t argp)
ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
ASSERT(sizeof(struct cell_command_render) % 8 == 0);
+ ASSERT(((unsigned long) &spu.fragment_ops_code) % 8 == 0);
+ ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0);
one_time_init();
- if (Debug)
- printf("SPU: main() speid=%lu\n", (unsigned long) speid);
+ DEBUG_PRINTF("main() speid=%lu\n", (unsigned long) speid);
+ D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n");
+ /* get initialization data */
mfc_get(&spu.init, /* dest */
(unsigned int) argp, /* src */
sizeof(struct cell_init_info), /* bytes */
@@ -696,12 +114,16 @@ main(main_param_t speid, main_param_t argp)
0 /* rid */);
wait_on_mask( 1 << tag );
+ if (spu.init.id == 0) {
+ return_function_info();
+ }
+
#if 0
if (spu.init.id==0)
- spu_test_misc();
+ spu_test_misc(spu.init.id);
#endif
- main_loop();
+ command_loop();
return 0;
}
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 2c7b6258402..82c9c69a3a8 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -41,6 +41,9 @@
#define MAX_HEIGHT 1024
+#define CELL_MAX_CONSTANTS 32 /**< number of float[4] constants */
+
+
/**
* A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels.
* The data may be addressed through several different types.
@@ -73,7 +76,8 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
vector float fragGreen,
vector float fragBlue,
vector float fragAlpha,
- vector unsigned int mask);
+ vector unsigned int mask,
+ uint facing);
/** Function for running fragment program */
typedef void (*spu_fragment_program_func)(vector float *inputs,
@@ -143,22 +147,21 @@ struct spu_global
ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
- /** Current fragment ops machine code */
- uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS];
+ /** Current fragment ops machine code, at 8-byte boundary */
+ uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
/** Current fragment ops function */
spu_fragment_ops_func fragment_ops;
- /** Current fragment program machine code */
- uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
+ /** Current fragment program machine code, at 8-byte boundary */
+ uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN8_ATTRIB;
/** Current fragment ops function */
spu_fragment_program_func fragment_program;
/** Current texture sampler function */
spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
- /** Fragment program constants (XXX preliminary/used) */
-#define MAX_CONSTANTS 32
- vector float constants[MAX_CONSTANTS];
+ /** Fragment program constants */
+ vector float constants[4 * CELL_MAX_CONSTANTS];
} ALIGN16_ATTRIB;
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index 03dd547845b..d252fa6dc18 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -57,12 +57,16 @@ spu_fallback_fragment_ops(uint x, uint y,
vector float fragG,
vector float fragB,
vector float fragA,
- vector unsigned int mask)
+ vector unsigned int mask,
+ uint facing)
{
vector float frag_aos[4];
- unsigned int c0, c1, c2, c3;
+ unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
+ unsigned int fragc0, fragc1, fragc2, fragc3; /* fragment colors */
- /* do alpha test */
+ /*
+ * Do alpha test
+ */
if (spu.depth_stencil_alpha.alpha.enabled) {
vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref);
vector unsigned int amask;
@@ -102,7 +106,10 @@ spu_fallback_fragment_ops(uint x, uint y,
mask = spu_and(mask, amask);
}
- /* Z and/or stencil testing... */
+
+ /*
+ * Z and/or stencil testing...
+ */
if (spu.depth_stencil_alpha.depth.enabled ||
spu.depth_stencil_alpha.stencil[0].enabled) {
@@ -178,6 +185,32 @@ spu_fallback_fragment_ops(uint x, uint y,
}
}
+
+ /*
+ * If we'll need the current framebuffer/tile colors for blending
+ * or logicop or colormask, fetch them now.
+ */
+ if (spu.blend.blend_enable ||
+ spu.blend.logicop_enable ||
+ spu.blend.colormask != 0xf) {
+
+#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */
+ fbc0 = colorTile->ui[y][x*2+0];
+ fbc1 = colorTile->ui[y][x*2+1];
+ fbc2 = colorTile->ui[y][x*2+2];
+ fbc3 = colorTile->ui[y][x*2+3];
+#else
+ fbc0 = colorTile->ui[y+0][x+0];
+ fbc1 = colorTile->ui[y+0][x+1];
+ fbc2 = colorTile->ui[y+1][x+0];
+ fbc3 = colorTile->ui[y+1][x+1];
+#endif
+ }
+
+
+ /*
+ * Do blending
+ */
if (spu.blend.blend_enable) {
/* blending terms, misc regs */
vector float term1r, term1g, term1b, term1a;
@@ -186,39 +219,26 @@ spu_fallback_fragment_ops(uint x, uint y,
vector float fbRGBA[4]; /* current framebuffer colors */
- /* get colors from framebuffer/tile */
+ /* convert framebuffer colors from packed int to vector float */
{
- vector float fc[4];
- uint c0, c1, c2, c3;
-
-#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */
- c0 = colorTile->ui[y][x*2+0];
- c1 = colorTile->ui[y][x*2+1];
- c2 = colorTile->ui[y][x*2+2];
- c3 = colorTile->ui[y][x*2+3];
-#else
- c0 = colorTile->ui[y+0][x+0];
- c1 = colorTile->ui[y+0][x+1];
- c2 = colorTile->ui[y+1][x+0];
- c3 = colorTile->ui[y+1][x+1];
-#endif
+ vector float temp[4]; /* float colors in AOS form */
switch (spu.fb.color_format) {
case PIPE_FORMAT_B8G8R8A8_UNORM:
- fc[0] = spu_unpack_B8G8R8A8(c0);
- fc[1] = spu_unpack_B8G8R8A8(c1);
- fc[2] = spu_unpack_B8G8R8A8(c2);
- fc[3] = spu_unpack_B8G8R8A8(c3);
+ temp[0] = spu_unpack_B8G8R8A8(fbc0);
+ temp[1] = spu_unpack_B8G8R8A8(fbc1);
+ temp[2] = spu_unpack_B8G8R8A8(fbc2);
+ temp[3] = spu_unpack_B8G8R8A8(fbc3);
break;
case PIPE_FORMAT_A8R8G8B8_UNORM:
- fc[0] = spu_unpack_A8R8G8B8(c0);
- fc[1] = spu_unpack_A8R8G8B8(c1);
- fc[2] = spu_unpack_A8R8G8B8(c2);
- fc[3] = spu_unpack_A8R8G8B8(c3);
+ temp[0] = spu_unpack_A8R8G8B8(fbc0);
+ temp[1] = spu_unpack_A8R8G8B8(fbc1);
+ temp[2] = spu_unpack_A8R8G8B8(fbc2);
+ temp[3] = spu_unpack_A8R8G8B8(fbc3);
break;
default:
ASSERT(0);
}
- _transpose_matrix4x4(fbRGBA, fc);
+ _transpose_matrix4x4(fbRGBA, temp); /* fbRGBA = transpose(temp) */
}
/*
@@ -384,21 +404,20 @@ spu_fallback_fragment_ops(uint x, uint y,
#endif
/*
- * Pack float colors into 32-bit RGBA words.
+ * Pack fragment float colors into 32-bit RGBA words.
*/
switch (spu.fb.color_format) {
case PIPE_FORMAT_A8R8G8B8_UNORM:
- c0 = spu_pack_A8R8G8B8(frag_aos[0]);
- c1 = spu_pack_A8R8G8B8(frag_aos[1]);
- c2 = spu_pack_A8R8G8B8(frag_aos[2]);
- c3 = spu_pack_A8R8G8B8(frag_aos[3]);
+ fragc0 = spu_pack_A8R8G8B8(frag_aos[0]);
+ fragc1 = spu_pack_A8R8G8B8(frag_aos[1]);
+ fragc2 = spu_pack_A8R8G8B8(frag_aos[2]);
+ fragc3 = spu_pack_A8R8G8B8(frag_aos[3]);
break;
-
case PIPE_FORMAT_B8G8R8A8_UNORM:
- c0 = spu_pack_B8G8R8A8(frag_aos[0]);
- c1 = spu_pack_B8G8R8A8(frag_aos[1]);
- c2 = spu_pack_B8G8R8A8(frag_aos[2]);
- c3 = spu_pack_B8G8R8A8(frag_aos[3]);
+ fragc0 = spu_pack_B8G8R8A8(frag_aos[0]);
+ fragc1 = spu_pack_B8G8R8A8(frag_aos[1]);
+ fragc2 = spu_pack_B8G8R8A8(frag_aos[2]);
+ fragc3 = spu_pack_B8G8R8A8(frag_aos[3]);
break;
default:
fprintf(stderr, "SPU: Bad pixel format in spu_default_fragment_ops\n");
@@ -407,20 +426,57 @@ spu_fallback_fragment_ops(uint x, uint y,
/*
- * Color masking
+ * Do color masking
*/
if (spu.blend.colormask != 0xf) {
- /* XXX to do */
- /* apply color mask to 32-bit packed colors */
+ uint cmask = 0x0; /* each byte corresponds to a color channel */
+
+ /* Form bitmask depending on color buffer format and colormask bits */
+ switch (spu.fb.color_format) {
+ case PIPE_FORMAT_A8R8G8B8_UNORM:
+ if (spu.blend.colormask & PIPE_MASK_R)
+ cmask |= 0x00ff0000; /* red */
+ if (spu.blend.colormask & PIPE_MASK_G)
+ cmask |= 0x0000ff00; /* green */
+ if (spu.blend.colormask & PIPE_MASK_B)
+ cmask |= 0x000000ff; /* blue */
+ if (spu.blend.colormask & PIPE_MASK_A)
+ cmask |= 0xff000000; /* alpha */
+ break;
+ case PIPE_FORMAT_B8G8R8A8_UNORM:
+ if (spu.blend.colormask & PIPE_MASK_R)
+ cmask |= 0x0000ff00; /* red */
+ if (spu.blend.colormask & PIPE_MASK_G)
+ cmask |= 0x00ff0000; /* green */
+ if (spu.blend.colormask & PIPE_MASK_B)
+ cmask |= 0xff000000; /* blue */
+ if (spu.blend.colormask & PIPE_MASK_A)
+ cmask |= 0x000000ff; /* alpha */
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ /*
+ * Apply color mask to the 32-bit packed colors.
+ * if (cmask[i])
+ * frag color[i] = frag color[i];
+ * else
+ * frag color[i] = framebuffer color[i];
+ */
+ fragc0 = (fragc0 & cmask) | (fbc0 & ~cmask);
+ fragc1 = (fragc1 & cmask) | (fbc1 & ~cmask);
+ fragc2 = (fragc2 & cmask) | (fbc2 & ~cmask);
+ fragc3 = (fragc3 & cmask) | (fbc3 & ~cmask);
}
/*
- * Logic Ops
+ * Do logic ops
*/
if (spu.blend.logicop_enable) {
/* XXX to do */
- /* apply logicop to 32-bit packed colors */
+ /* apply logicop to 32-bit packed colors (fragcx and fbcx) */
}
@@ -431,45 +487,46 @@ spu_fallback_fragment_ops(uint x, uint y,
spu.cur_ctile_status = TILE_STATUS_DIRTY;
}
else {
+ /* write no fragments */
return;
}
/*
- * Write new quad colors to the framebuffer/tile.
+ * Write new fragment/quad colors to the framebuffer/tile.
* Only write pixels where the corresponding mask word is set.
*/
#if LINEAR_QUAD_LAYOUT
/*
* Quad layout:
* +--+--+--+--+
- * |p0|p1|p2|p3|
+ * |p0|p1|p2|p3|...
* +--+--+--+--+
*/
if (spu_extract(mask, 0))
- colorTile->ui[y][x*2] = c0;
+ colorTile->ui[y][x*2] = fragc0;
if (spu_extract(mask, 1))
- colorTile->ui[y][x*2+1] = c1;
+ colorTile->ui[y][x*2+1] = fragc1;
if (spu_extract(mask, 2))
- colorTile->ui[y][x*2+2] = c2;
+ colorTile->ui[y][x*2+2] = fragc2;
if (spu_extract(mask, 3))
- colorTile->ui[y][x*2+3] = c3;
+ colorTile->ui[y][x*2+3] = fragc3;
#else
/*
* Quad layout:
* +--+--+
- * |p0|p1|
+ * |p0|p1|...
* +--+--+
- * |p2|p3|
+ * |p2|p3|...
* +--+--+
*/
if (spu_extract(mask, 0))
- colorTile->ui[y+0][x+0] = c0;
+ colorTile->ui[y+0][x+0] = fragc0;
if (spu_extract(mask, 1))
- colorTile->ui[y+0][x+1] = c1;
+ colorTile->ui[y+0][x+1] = fragc1;
if (spu_extract(mask, 2))
- colorTile->ui[y+1][x+0] = c2;
+ colorTile->ui[y+1][x+0] = fragc2;
if (spu_extract(mask, 3))
- colorTile->ui[y+1][x+1] = c3;
+ colorTile->ui[y+1][x+1] = fragc3;
#endif
}
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
index f817abf0463..a61689c83ab 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -38,7 +38,8 @@ spu_fallback_fragment_ops(uint x, uint y,
vector float fragGreen,
vector float fragBlue,
vector float fragAlpha,
- vector unsigned int mask);
+ vector unsigned int mask,
+ uint facing);
#endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 305dc988810..82dbeb26b76 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -279,7 +279,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
- drawn += tri_draw(v0, v1, v2, tx, ty);
+ drawn += tri_draw(v0, v1, v2, tx, ty, render->front_winding);
}
//printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
@@ -297,5 +297,3 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
printf("SPU %u: RENDER done\n",
spu.init.id);
}
-
-
diff --git a/src/gallium/drivers/cell/spu/spu_tile.c b/src/gallium/drivers/cell/spu/spu_tile.c
index 216a33126b7..6905015a483 100644
--- a/src/gallium/drivers/cell/spu/spu_tile.c
+++ b/src/gallium/drivers/cell/spu/spu_tile.c
@@ -87,3 +87,40 @@ put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf)
0 /* rid */);
}
+
+/**
+ * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled
+ * tiles back to the main framebuffer.
+ */
+void
+really_clear_tiles(uint surfaceIndex)
+{
+ const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+ uint i;
+
+ if (surfaceIndex == 0) {
+ clear_c_tile(&spu.ctile);
+
+ for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+ uint tx = i % spu.fb.width_tiles;
+ uint ty = i / spu.fb.width_tiles;
+ if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) {
+ put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
+ }
+ }
+ }
+ else {
+ clear_z_tile(&spu.ztile);
+
+ for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+ uint tx = i % spu.fb.width_tiles;
+ uint ty = i / spu.fb.width_tiles;
+ if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR)
+ put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 1);
+ }
+ }
+
+#if 0
+ wait_on_mask(1 << TAG_SURFACE_CLEAR);
+#endif
+}
diff --git a/src/gallium/drivers/cell/spu/spu_tile.h b/src/gallium/drivers/cell/spu/spu_tile.h
index 1b5491112db..7bfb52be8f3 100644
--- a/src/gallium/drivers/cell/spu/spu_tile.h
+++ b/src/gallium/drivers/cell/spu/spu_tile.h
@@ -36,12 +36,14 @@
-void
+extern void
get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf);
-void
+extern void
put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf);
+extern void
+really_clear_tiles(uint surfaceIndex);
static INLINE void
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 8b938781920..87991c31368 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -118,6 +118,8 @@ struct setup_stage {
float oneoverarea;
+ uint facing;
+
uint tx, ty;
int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
@@ -241,6 +243,19 @@ eval_coeff(uint slot, float x, float y, vector float result[4])
}
+/**
+ * As above, but return 4 vectors in SOA format.
+ * XXX this will all be re-written someday.
+ */
+static INLINE void
+eval_coeff_soa(uint slot, float x, float y, vector float result[4])
+{
+ eval_coeff(slot, x, y, result);
+ _transpose_matrix4x4(result, result);
+}
+
+
+
static INLINE vector float
eval_z(float x, float y)
{
@@ -261,20 +276,23 @@ eval_z(float x, float y)
* overall.
*/
static INLINE void
-emit_quad( int x, int y, mask_t mask )
+emit_quad( int x, int y, mask_t mask)
{
/* If any bits in mask are set... */
if (spu_extract(spu_orx(mask), 0)) {
const int ix = x - setup.cliprect_minx;
const int iy = y - setup.cliprect_miny;
- vector float colors[4];
spu.cur_ctile_status = TILE_STATUS_DIRTY;
spu.cur_ztile_status = TILE_STATUS_DIRTY;
- if (spu.texture[0].start) {
- /* texture mapping */
+ if (0/*spu.texture[0].start*/) {
+ /*
+ * Temporary texture mapping path
+ * This will go away when fragment programs support TEX inst.
+ */
const uint unit = 0;
+ vector float colors[4];
vector float texcoords[4];
eval_coeff(2, (float) x, (float) y, texcoords);
@@ -311,70 +329,62 @@ emit_quad( int x, int y, mask_t mask )
colors[3] = spu_mul(colors[3], colors1[3]);
}
+ {
+ /* Convert fragment data from AoS to SoA format.
+ * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
+ * This is temporary!
+ */
+ vector float soa_frag[4];
+ _transpose_matrix4x4(soa_frag, colors);
+
+ vector float fragZ = eval_z((float) x, (float) y);
+
+ /* Do all per-fragment/quad operations here, including:
+ * alpha test, z test, stencil test, blend and framebuffer writing.
+ */
+ spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
+ fragZ,
+ soa_frag[0], soa_frag[1],
+ soa_frag[2], soa_frag[3],
+ mask,
+ setup.facing);
+ }
+
}
else {
- /* simple shading */
-#if 0
- eval_coeff(1, (float) x, (float) y, colors);
+ /*
+ * Run fragment shader, execute per-fragment ops, update fb/tile.
+ */
+ vector float inputs[4*4], outputs[2*4];
+ vector float fragZ = eval_z((float) x, (float) y);
+ /* setup inputs */
+#if 0
+ eval_coeff_soa(1, (float) x, (float) y, inputs);
#else
- /* XXX new fragment program code */
-
- if (spu.fragment_program) {
- vector float inputs[4*4], outputs[2*4];
-
- /* setup inputs */
- eval_coeff(1, (float) x, (float) y, inputs);
-
- /* Execute the current fragment program */
- spu.fragment_program(inputs, outputs, spu.constants);
-
- /* Copy outputs */
- colors[0] = outputs[0*4+0];
- colors[1] = outputs[0*4+1];
- colors[2] = outputs[0*4+2];
- colors[3] = outputs[0*4+3];
-
- if (0 && spu.init.id==0 && y == 48) {
- printf("colors[0] = %f %f %f %f\n",
- spu_extract(colors[0], 0),
- spu_extract(colors[0], 1),
- spu_extract(colors[0], 2),
- spu_extract(colors[0], 3));
- printf("colors[1] = %f %f %f %f\n",
- spu_extract(colors[1], 0),
- spu_extract(colors[1], 1),
- spu_extract(colors[1], 2),
- spu_extract(colors[1], 3));
- }
-
+ uint i;
+ for (i = 0; i < spu.vertex_info.num_attribs; i++) {
+ eval_coeff_soa(i+1, (float) x, (float) y, inputs + i * 4);
}
#endif
- }
-
-
- {
- /* Convert fragment data from AoS to SoA format.
- * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
- * This is temporary!
- */
- vector float soa_frag[4];
- _transpose_matrix4x4(soa_frag, colors);
+ ASSERT(spu.fragment_program);
+ ASSERT(spu.fragment_ops);
- float4 fragZ;
+ /* Execute the current fragment program */
+ spu.fragment_program(inputs, outputs, spu.constants);
- fragZ.v = eval_z((float) x, (float) y);
-
- /* Do all per-fragment/quad operations here, including:
- * alpha test, z test, stencil test, blend and framebuffer writing.
+ /* Execute per-fragment/quad operations, including:
+ * alpha test, z test, stencil test, blend and framebuffer writing.
*/
spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
- fragZ.v,
- soa_frag[0], soa_frag[1],
- soa_frag[2], soa_frag[3],
- mask);
+ fragZ,
+ outputs[0*4+0],
+ outputs[0*4+1],
+ outputs[0*4+2],
+ outputs[0*4+3],
+ mask,
+ setup.facing);
}
-
}
}
@@ -477,7 +487,7 @@ static void flush_spans( void )
*/
for (x = block(minleft); x <= block(maxright); x += 2) {
#if 1
- emit_quad( x, setup.span.y, calculate_mask( x ) );
+ emit_quad( x, setup.span.y, calculate_mask( x ));
#endif
}
@@ -896,13 +906,28 @@ static void subtriangle( struct edge *eleft,
eright->sy += lines;
}
+static float
+determinant( const float *v0,
+ const float *v1,
+ const float *v2 )
+{
+ /* edge vectors e = v0 - v2, f = v1 - v2 */
+ const float ex = v0[0] - v2[0];
+ const float ey = v0[1] - v2[1];
+ const float fx = v1[0] - v2[0];
+ const float fy = v1[1] - v2[1];
+
+ /* det = cross(e,f).z */
+ return ex * fy - ey * fx;
+}
+
/**
* Draw triangle into tile at (tx, ty) (tile coords)
* The tile data should have already been fetched.
*/
boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding)
{
setup.tx = tx;
setup.ty = ty;
@@ -913,6 +938,12 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
+ /* Before we sort vertices, determine the facing of the triangle,
+ * which will be needed for front/back-face stencil application
+ */
+ float det = determinant(v0, v1, v2);
+ setup.facing = (det > 0.0) ^ (front_winding == PIPE_WINDING_CW);
+
if (!setup_sort_vertices((struct vertex_header *) v0,
(struct vertex_header *) v1,
(struct vertex_header *) v2)) {
diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h
index aa694dd7c93..abc3d35160e 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.h
+++ b/src/gallium/drivers/cell/spu/spu_tri.h
@@ -31,7 +31,7 @@
extern boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding);
#endif /* SPU_TRI_H */
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 701ee4c72f2..4fea71c3140 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -39,11 +39,20 @@
#include "tgsi/tgsi_exec.h"
#include "tgsi/tgsi_parse.h"
-struct sp_exec_fragment_shader {
+struct sp_exec_fragment_shader
+{
struct sp_fragment_shader base;
+ const struct tgsi_token *machine_tokens;
};
+/** cast wrapper */
+static INLINE struct sp_exec_fragment_shader *
+sp_exec_fragment_shader(const struct sp_fragment_shader *base)
+{
+ return (struct sp_exec_fragment_shader *) base;
+}
+
/**
* Compute quad X,Y,Z,W for the four fragments in a quad.
@@ -86,10 +95,20 @@ exec_prepare( const struct sp_fragment_shader *base,
struct tgsi_exec_machine *machine,
struct tgsi_sampler *samplers )
{
- tgsi_exec_machine_bind_shader( machine,
- base->shader.tokens,
- PIPE_MAX_SAMPLERS,
- samplers );
+ struct sp_exec_fragment_shader *spefs =
+ sp_exec_fragment_shader(base);
+
+ /*
+ * Bind tokens/shader to the interpreter's machine state.
+ * Avoid redundant binding.
+ */
+ if (spefs->machine_tokens != base->shader.tokens) {
+ tgsi_exec_machine_bind_shader( machine,
+ base->shader.tokens,
+ PIPE_MAX_SAMPLERS,
+ samplers );
+ spefs->machine_tokens = base->shader.tokens;
+ }
}
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index 496ed43df26..0111469405f 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -40,7 +40,7 @@
#include "tgsi/tgsi_sse2.h"
-#ifdef PIPE_ARCH_X86
+#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
#include "rtasm/rtasm_x86sse.h"
diff --git a/src/gallium/include/pipe/p_compiler.h b/src/gallium/include/pipe/p_compiler.h
index 4d64c74a4aa..7bcebd3d6b6 100644
--- a/src/gallium/include/pipe/p_compiler.h
+++ b/src/gallium/include/pipe/p_compiler.h
@@ -144,10 +144,12 @@ typedef unsigned char boolean;
#define ALIGN16_DECL(TYPE, NAME, SIZE) TYPE NAME##___aligned[SIZE] __attribute__(( aligned( 16 ) ))
#define ALIGN16_ASSIGN(NAME) NAME##___aligned
#define ALIGN16_ATTRIB __attribute__(( aligned( 16 ) ))
+#define ALIGN8_ATTRIB __attribute__(( aligned( 8 ) ))
#else
#define ALIGN16_DECL(TYPE, NAME, SIZE) TYPE NAME##___unaligned[SIZE + 1]
#define ALIGN16_ASSIGN(NAME) align16(NAME##___unaligned)
#define ALIGN16_ATTRIB
+#define ALIGN8_ATTRIB
#endif
diff --git a/src/gallium/include/pipe/p_config.h b/src/gallium/include/pipe/p_config.h
index af3746c0265..ef05547819d 100644
--- a/src/gallium/include/pipe/p_config.h
+++ b/src/gallium/include/pipe/p_config.h
@@ -85,6 +85,14 @@
#define PIPE_ARCH_X86_64
#endif
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+#if defined(PIPE_CC_GCC) && !defined(__SSE2__)
+/* #warning SSE2 support requires -msse -msse2 compiler options */
+#else
+#define PIPE_ARCH_SSE
+#endif
+#endif
+
#if 0 /* FIXME */
#define PIPE_ARCH_PPC
#endif
diff --git a/src/gallium/include/pipe/p_screen.h b/src/gallium/include/pipe/p_screen.h
index b15affef7a5..3bedc752947 100644
--- a/src/gallium/include/pipe/p_screen.h
+++ b/src/gallium/include/pipe/p_screen.h
@@ -26,6 +26,8 @@
**************************************************************************/
/**
+ * @file
+ *
* Screen, Adapter or GPU
*
* These are driver functions/facilities that are context independent.
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index a562e3a6b14..78c20de3e27 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -1,4 +1,31 @@
-#if !defined TGSI_TOKEN_H
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef TGSI_TOKEN_H
#define TGSI_TOKEN_H
#ifdef __cplusplus
@@ -396,7 +423,7 @@ struct tgsi_immediate_float32
#define TGSI_SAT_ZERO_ONE 1 /* clamp to [0,1] */
#define TGSI_SAT_MINUS_PLUS_ONE 2 /* clamp to [-1,1] */
-/*
+/**
* Opcode is the operation code to execute. A given operation defines the
* semantics how the source registers (if any) are interpreted and what is
* written to the destination registers (if any) as a result of execution.
@@ -481,7 +508,7 @@ struct tgsi_instruction_ext
#define TGSI_SWIZZLE_Z 2
#define TGSI_SWIZZLE_W 3
-/*
+/**
* Precision controls the precision at which the operation should be executed.
*
* CondDstUpdate enables condition code register writes. When this field is
@@ -548,7 +575,7 @@ struct tgsi_instruction_ext_predicate
unsigned Extended : 1; /* BOOL */
};
-/*
+/**
* File specifies the register array to access.
*
* Index specifies the element number of a register in the register file.
@@ -580,7 +607,7 @@ struct tgsi_src_register
unsigned Extended : 1; /* BOOL */
};
-/*
+/**
* If tgsi_src_register::Extended is TRUE, tgsi_src_register_ext follows.
*
* Then, if tgsi_src_register::Indirect is TRUE, another tgsi_src_register
@@ -599,7 +626,7 @@ struct tgsi_src_register_ext
unsigned Extended : 1; /* BOOL */
};
-/*
+/**
* If tgsi_src_register_ext::Type is TGSI_SRC_REGISTER_EXT_TYPE_SWZ,
* it should be cast to tgsi_src_register_ext_swz.
*
@@ -617,7 +644,7 @@ struct tgsi_src_register_ext
#define TGSI_EXTSWIZZLE_ZERO 4
#define TGSI_EXTSWIZZLE_ONE 5
-/*
+/**
* ExtSwizzleX, ExtSwizzleY, ExtSwizzleZ and ExtSwizzleW swizzle the source
* register in an extended manner.
*
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index da783389dae..342f17260a7 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -27,6 +27,8 @@
/**
+ * @file
+ *
* Abstract graphics pipe state objects.
*
* Basic notes:
diff --git a/src/gallium/include/pipe/p_thread.h b/src/gallium/include/pipe/p_thread.h
index e01d5a602b8..8af3cd958b0 100644
--- a/src/gallium/include/pipe/p_thread.h
+++ b/src/gallium/include/pipe/p_thread.h
@@ -25,6 +25,8 @@
/**
+ * @file
+ *
* Thread, mutex, condition var and thread-specific data functions.
*/
diff --git a/src/gallium/winsys/drm/intel/dri/intel_screen.c b/src/gallium/winsys/drm/intel/dri/intel_screen.c
index 3a486481f56..ed753689829 100644
--- a/src/gallium/winsys/drm/intel/dri/intel_screen.c
+++ b/src/gallium/winsys/drm/intel/dri/intel_screen.c
@@ -113,7 +113,118 @@ static PFNGLXCREATECONTEXTMODES create_context_modes = NULL;
extern const struct dri_extension card_extensions[];
+static GLboolean
+intel_get_param(__DRIscreenPrivate *psp, int param, int *value)
+{
+ int ret;
+ struct drm_i915_getparam gp;
+
+ gp.param = param;
+ gp.value = value;
+
+ ret = drmCommandWriteRead(psp->fd, DRM_I915_GETPARAM, &gp, sizeof(gp));
+ if (ret) {
+ fprintf(stderr, "drm_i915_getparam: %d\n", ret);
+ return GL_FALSE;
+ }
+
+ return GL_TRUE;
+}
+
+static void
+intelSetTexOffset(__DRIcontext *pDRICtx, int texname,
+ unsigned long long offset, int depth, uint pitch)
+{
+ abort();
+#if 0
+ struct intel_context *intel = (struct intel_context*)
+ ((__DRIcontextPrivate*)pDRICtx->private)->driverPrivate;
+ struct gl_texture_object *tObj = _mesa_lookup_texture(&intel->ctx, texname);
+ struct st_texture_object *stObj = st_texture_object(tObj);
+
+ if (!stObj)
+ return;
+
+ if (stObj->pt)
+ st->pipe->texture_release(intel->st->pipe, &stObj->pt);
+
+ stObj->imageOverride = GL_TRUE;
+ stObj->depthOverride = depth;
+ stObj->pitchOverride = pitch;
+
+ if (offset)
+ stObj->textureOffset = offset;
+#endif
+}
+
+
+#if 0
+static void
+intelHandleDrawableConfig(__DRIdrawablePrivate *dPriv,
+ __DRIcontextPrivate *pcp,
+ __DRIDrawableConfigEvent *event)
+{
+ (void) dPriv;
+ (void) pcp;
+ (void) event;
+}
+#endif
+
+#if 0
+static void
+intelHandleBufferAttach(__DRIdrawablePrivate *dPriv,
+ __DRIcontextPrivate *pcp,
+ __DRIBufferAttachEvent *ba)
+{
+ struct intel_screen *intelScreen = intel_screen(dPriv->driScreenPriv);
+
+ switch (ba->buffer.attachment) {
+ case DRI_DRAWABLE_BUFFER_FRONT_LEFT:
+ intelScreen->front.width = dPriv->w;
+ intelScreen->front.height = dPriv->h;
+ intelScreen->front.cpp = ba->buffer.cpp;
+ intelScreen->front.pitch = ba->buffer.pitch;
+ driGenBuffers(intelScreen->base.staticPool, "front", 1, &intelScreen->front.buffer, 0, 0, 0);
+ driBOSetReferenced(intelScreen->front.buffer, ba->buffer.handle);
+ break;
+
+ case DRI_DRAWABLE_BUFFER_BACK_LEFT:
+ case DRI_DRAWABLE_BUFFER_DEPTH:
+ case DRI_DRAWABLE_BUFFER_STENCIL:
+ case DRI_DRAWABLE_BUFFER_ACCUM:
+ /* anything ?? */
+ break;
+
+ default:
+ fprintf(stderr, "unhandled buffer attach event, attachment type %d\n",
+ ba->buffer.attachment);
+ return;
+ }
+}
+#endif
+
+static const __DRItexOffsetExtension intelTexOffsetExtension = {
+ { __DRI_TEX_OFFSET },
+ intelSetTexOffset,
+};
+
+#if 0
+static const __DRItexBufferExtension intelTexBufferExtension = {
+ { __DRI_TEX_BUFFER, __DRI_TEX_BUFFER_VERSION },
+ intelSetTexBuffer,
+};
+#endif
+static const __DRIextension *intelScreenExtensions[] = {
+ &driReadDrawableExtension,
+ &driCopySubBufferExtension.base,
+ &driSwapControlExtension.base,
+ &driFrameTrackingExtension.base,
+ &driMediaStreamCounterExtension.base,
+ &intelTexOffsetExtension.base,
+// &intelTexBufferExtension.base,
+ NULL
+};
static void
@@ -232,7 +343,8 @@ intelCreatePools(__DRIscreenPrivate * sPriv)
intelScreen->havePools = GL_TRUE;
- intelUpdateScreenRotation(sPriv, intelScreen->sarea);
+ if (intelScreen->sarea)
+ intelUpdateScreenRotation(sPriv, intelScreen->sarea);
return GL_TRUE;
}
@@ -265,11 +377,6 @@ intelInitDriver(__DRIscreenPrivate * sPriv)
struct intel_screen *intelScreen;
I830DRIPtr gDRIPriv = (I830DRIPtr) sPriv->pDevPriv;
- PFNGLXSCRENABLEEXTENSIONPROC glx_enable_extension =
- (PFNGLXSCRENABLEEXTENSIONPROC) (*dri_interface->
- getProcAddress("glxEnableExtension"));
- void *const psc = sPriv->psc->screenConfigs;
-
if (sPriv->devPrivSize != sizeof(I830DRIRec)) {
fprintf(stderr,
"\nERROR! sizeof(I830DRIRec) does not match passed size from device driver\n");
@@ -286,28 +393,19 @@ intelInitDriver(__DRIscreenPrivate * sPriv)
__driConfigOptions, __driNConfigOptions);
sPriv->private = (void *) intelScreen;
-
intelScreen->sarea = (drmI830Sarea *) (((GLubyte *) sPriv->pSAREA) +
- gDRIPriv->sarea_priv_offset);
- intelScreen->deviceID = gDRIPriv->deviceID;
- intelScreen->front.cpp = gDRIPriv->cpp;
- intelScreen->drmMinor = sPriv->drmMinor;
+ gDRIPriv->sarea_priv_offset);
- assert(gDRIPriv->bitsPerPixel == 16 ||
- gDRIPriv->bitsPerPixel == 32);
+ intelScreen->deviceID = gDRIPriv->deviceID;
+ intelScreen->front.cpp = gDRIPriv->cpp;
+ intelScreen->drmMinor = sPriv->drm_version.minor;
intelUpdateScreenRotation(sPriv, intelScreen->sarea);
if (0)
intelPrintDRIInfo(intelScreen, sPriv, gDRIPriv);
- if (glx_enable_extension != NULL) {
- (*glx_enable_extension) (psc, "GLX_SGI_swap_control");
- (*glx_enable_extension) (psc, "GLX_SGI_video_sync");
- (*glx_enable_extension) (psc, "GLX_MESA_swap_control");
- (*glx_enable_extension) (psc, "GLX_MESA_swap_frame_usage");
- (*glx_enable_extension) (psc, "GLX_SGI_make_current_read");
- }
+ sPriv->extensions = intelScreenExtensions;
intelScreen->base.base.flush_frontbuffer = intel_flush_frontbuffer;
intelScreen->base.base.get_name = intel_get_name;
@@ -387,7 +485,7 @@ intelDestroyBuffer(__DRIdrawablePrivate * driDrawPriv)
{
struct intel_framebuffer *intelfb = intel_framebuffer(driDrawPriv);
assert(intelfb->stfb);
- st_unreference_framebuffer(&intelfb->stfb);
+ st_unreference_framebuffer(intelfb->stfb);
free(intelfb);
}
@@ -406,65 +504,19 @@ intelGetSwapInfo(__DRIdrawablePrivate * dPriv, __DRIswapInfo * sInfo)
return 0;
}
-
-static void
-intelSetTexOffset(__DRIcontext *pDRICtx, int texname,
- unsigned long long offset, int depth, uint pitch)
-{
- abort();
-#if 0
- struct intel_context *intel = (struct intel_context*)
- ((__DRIcontextPrivate*)pDRICtx->private)->driverPrivate;
- struct gl_texture_object *tObj = _mesa_lookup_texture(&intel->ctx, texname);
- struct st_texture_object *stObj = st_texture_object(tObj);
-
- if (!stObj)
- return;
-
- if (stObj->pt)
- st->pipe->texture_release(intel->st->pipe, &stObj->pt);
-
- stObj->imageOverride = GL_TRUE;
- stObj->depthOverride = depth;
- stObj->pitchOverride = pitch;
-
- if (offset)
- stObj->textureOffset = offset;
-#endif
-}
-
-
-static const struct __DriverAPIRec intelAPI = {
- .InitDriver = intelInitDriver,
- .DestroyScreen = intelDestroyScreen,
- .CreateContext = intelCreateContext,
- .DestroyContext = intelDestroyContext,
- .CreateBuffer = intelCreateBuffer,
- .DestroyBuffer = intelDestroyBuffer,
- .SwapBuffers = intelSwapBuffers,
- .MakeCurrent = intelMakeCurrent,
- .UnbindContext = intelUnbindContext,
- .GetSwapInfo = intelGetSwapInfo,
- .GetMSC = driGetMSC32,
- .WaitForMSC = driWaitForMSC32,
- .WaitForSBC = NULL,
- .SwapBuffersMSC = NULL,
- .CopySubBuffer = intelCopySubBuffer,
- .setTexOffset = intelSetTexOffset,
-};
-
-
-static __GLcontextModes *
-intelFillInModes(unsigned pixel_bits, unsigned depth_bits,
- unsigned stencil_bits, boolean have_back_buffer)
+static __DRIconfig **
+intelFillInModes(__DRIscreenPrivate *psp,
+ unsigned pixel_bits, unsigned depth_bits,
+ unsigned stencil_bits, GLboolean have_back_buffer)
{
- __GLcontextModes *modes;
+ __DRIconfig **configs;
__GLcontextModes *m;
unsigned num_modes;
unsigned depth_buffer_factor;
unsigned back_buffer_factor;
GLenum fb_format;
GLenum fb_type;
+ int i;
/* GLX_SWAP_COPY_OML is only supported because the Intel driver doesn't
* support pageflipping at all.
@@ -508,100 +560,144 @@ intelFillInModes(unsigned pixel_bits, unsigned depth_bits,
fb_type = GL_UNSIGNED_INT_8_8_8_8_REV;
}
- modes =
- (*dri_interface->createContextModes) (num_modes,
- sizeof(__GLcontextModes));
- m = modes;
- if (!driFillInModes(&m, fb_format, fb_type,
- depth_bits_array, stencil_bits_array,
- depth_buffer_factor, back_buffer_modes,
- back_buffer_factor, msaa_samples_array, 1, GLX_TRUE_COLOR)) {
- fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
- __LINE__);
- return NULL;
- }
- if (!driFillInModes(&m, fb_format, fb_type,
- depth_bits_array, stencil_bits_array,
- depth_buffer_factor, back_buffer_modes,
- back_buffer_factor, msaa_samples_array, 1, GLX_DIRECT_COLOR)) {
- fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
+ configs = driCreateConfigs(fb_format, fb_type,
+ depth_bits_array, stencil_bits_array,
+ depth_buffer_factor, back_buffer_modes,
+ back_buffer_factor, msaa_samples_array, 1);
+ if (configs == NULL) {
+ fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
__LINE__);
return NULL;
}
/* Mark the visual as slow if there are "fake" stencil bits.
*/
- for (m = modes; m != NULL; m = m->next) {
+ for (i = 0; configs[i]; i++) {
+ m = &configs[i]->modes;
if ((m->stencilBits != 0) && (m->stencilBits != stencil_bits)) {
m->visualRating = GLX_SLOW_CONFIG;
}
}
- return modes;
+ return configs;
}
-
/**
- * This is the bootstrap function for the driver. libGL supplies all of the
- * requisite information about the system, and the driver initializes itself.
- * This routine also fills in the linked list pointed to by \c driver_modes
- * with the \c __GLcontextModes that the driver can support for windows or
- * pbuffers.
+ * This is the driver specific part of the createNewScreen entry point.
+ *
+ * \todo maybe fold this into intelInitDriver
*
- * \return A pointer to a \c __DRIscreenPrivate on success, or \c NULL on
- * failure.
+ * \return the __GLcontextModes supported by this driver
*/
-PUBLIC void *
-__driCreateNewScreen_20050727(__DRInativeDisplay * dpy, int scrn,
- __DRIscreen * psc,
- const __GLcontextModes * modes,
- const __DRIversion * ddx_version,
- const __DRIversion * dri_version,
- const __DRIversion * drm_version,
- const __DRIframebuffer * frame_buffer,
- drmAddress pSAREA, int fd,
- int internal_api_version,
- const __DRIinterfaceMethods * interface,
- __GLcontextModes ** driver_modes)
+static const __DRIconfig **intelInitScreen(__DRIscreenPrivate *psp)
{
- __DRIscreenPrivate *psp;
- static const __DRIversion ddx_expected = { 1, 7, 0 };
+#ifdef I915
+ static const __DRIversion ddx_expected = { 1, 5, 0 };
+#else
+ static const __DRIversion ddx_expected = { 1, 6, 0 };
+#endif
static const __DRIversion dri_expected = { 4, 0, 0 };
- static const __DRIversion drm_expected = { 1, 7, 0 };
-
- dri_interface = interface;
+ static const __DRIversion drm_expected = { 1, 5, 0 };
+ I830DRIPtr dri_priv = (I830DRIPtr) psp->pDevPriv;
if (!driCheckDriDdxDrmVersions2("i915",
- dri_version, &dri_expected,
- ddx_version, &ddx_expected,
- drm_version, &drm_expected)) {
+ &psp->dri_version, &dri_expected,
+ &psp->ddx_version, &ddx_expected,
+ &psp->drm_version, &drm_expected)) {
return NULL;
}
- psp = __driUtilCreateNewScreen(dpy, scrn, psc, NULL,
- ddx_version, dri_version, drm_version,
- frame_buffer, pSAREA, fd,
- internal_api_version, &intelAPI);
-
- if (psp != NULL) {
- I830DRIPtr dri_priv = (I830DRIPtr) psp->pDevPriv;
- *driver_modes = intelFillInModes(dri_priv->cpp * 8,
- (dri_priv->cpp == 2) ? 16 : 24,
- (dri_priv->cpp == 2) ? 0 : 8, 1);
-
- /* Calling driInitExtensions here, with a NULL context pointer,
- * does not actually enable the extensions. It just makes sure
- * that all the dispatch offsets for all the extensions that
- * *might* be enables are known. This is needed because the
- * dispatch offsets need to be known when _mesa_context_create
- * is called, but we can't enable the extensions until we have a
- * context pointer.
- *
- * Hello chicken. Hello egg. How are you two today?
- */
- driInitExtensions(NULL, card_extensions, GL_FALSE);
+ /* Calling driInitExtensions here, with a NULL context pointer,
+ * does not actually enable the extensions. It just makes sure
+ * that all the dispatch offsets for all the extensions that
+ * *might* be enables are known. This is needed because the
+ * dispatch offsets need to be known when _mesa_context_create is
+ * called, but we can't enable the extensions until we have a
+ * context pointer.
+ *
+ * Hello chicken. Hello egg. How are you two today?
+ */
+ driInitExtensions( NULL, card_extensions, GL_FALSE );
+ //intelInitExtensions(NULL, GL_TRUE);
+
+ if (!intelInitDriver(psp))
+ return NULL;
+
+ psp->extensions = intelScreenExtensions;
+
+ return (const __DRIconfig **)
+ intelFillInModes(psp, dri_priv->cpp * 8,
+ (dri_priv->cpp == 2) ? 16 : 24,
+ (dri_priv->cpp == 2) ? 0 : 8, 1);
+}
+
+/**
+ * This is the driver specific part of the createNewScreen entry point.
+ *
+ * \return the __GLcontextModes supported by this driver
+ */
+static const
+__DRIconfig **intelInitScreen2(__DRIscreenPrivate *psp)
+{
+ struct intel_screen *intelScreen;
+
+ /* Calling driInitExtensions here, with a NULL context pointer,
+ * does not actually enable the extensions. It just makes sure
+ * that all the dispatch offsets for all the extensions that
+ * *might* be enables are known. This is needed because the
+ * dispatch offsets need to be known when _mesa_context_create is
+ * called, but we can't enable the extensions until we have a
+ * context pointer.
+ *
+ * Hello chicken. Hello egg. How are you two today?
+ */
+ //intelInitExtensions(NULL, GL_TRUE);
+
+ /* Allocate the private area */
+ intelScreen = CALLOC_STRUCT(intel_screen);
+ if (!intelScreen) {
+ fprintf(stderr, "\nERROR! Allocating private area failed\n");
+ return GL_FALSE;
}
+ /* parse information in __driConfigOptions */
+ driParseOptionInfo(&intelScreen->optionCache,
+ __driConfigOptions, __driNConfigOptions);
+
+ psp->private = (void *) intelScreen;
+
+ intelScreen->drmMinor = psp->drm_version.minor;
- return (void *) psp;
+ /* Determine chipset ID? */
+ if (!intel_get_param(psp, I915_PARAM_CHIPSET_ID,
+ &intelScreen->deviceID))
+ return GL_FALSE;
+
+ psp->extensions = intelScreenExtensions;
+
+ intel_be_init_device(&intelScreen->base, psp->fd, intelScreen->deviceID);
+ intelScreen->base.base.flush_frontbuffer = intel_flush_frontbuffer;
+ intelScreen->base.base.get_name = intel_get_name;
+
+ return driConcatConfigs(intelFillInModes(psp, 16, 16, 0, 1),
+ intelFillInModes(psp, 32, 24, 8, 1));
}
+const struct __DriverAPIRec driDriverAPI = {
+ .InitScreen = intelInitScreen,
+ .DestroyScreen = intelDestroyScreen,
+ .CreateContext = intelCreateContext,
+ .DestroyContext = intelDestroyContext,
+ .CreateBuffer = intelCreateBuffer,
+ .DestroyBuffer = intelDestroyBuffer,
+ .SwapBuffers = intelSwapBuffers,
+ .MakeCurrent = intelMakeCurrent,
+ .UnbindContext = intelUnbindContext,
+ .GetSwapInfo = intelGetSwapInfo,
+ .GetDrawableMSC = driDrawableGetMSC32,
+ .WaitForMSC = driWaitForMSC32,
+ .CopySubBuffer = intelCopySubBuffer,
+
+ //.InitScreen2 = intelInitScreen2,
+ //.HandleDrawableConfig = intelHandleDrawableConfig,
+ //.HandleBufferAttach = intelHandleBufferAttach,
+};
diff --git a/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.c b/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.c
index 8a18bfd9a43..34ad7eebe1c 100644
--- a/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.c
+++ b/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.c
@@ -94,7 +94,6 @@ intelDisplaySurface(__DRIdrawablePrivate *dPriv,
int i;
ASSERT(surf->buffer);
- ASSERT(surf->cpp == cpp);
DBG(SWAP, "screen pitch %d src surface pitch %d\n",
pitch, surf->stride);
diff --git a/src/gallium/winsys/drm/intel/egl/intel_device.c b/src/gallium/winsys/drm/intel/egl/intel_device.c
index b9649cbec71..1964745c994 100644
--- a/src/gallium/winsys/drm/intel/egl/intel_device.c
+++ b/src/gallium/winsys/drm/intel/egl/intel_device.c
@@ -131,7 +131,7 @@ intel_destroy_drawable(struct egl_drm_drawable *drawable)
drawable->priv = NULL;
assert(intelfb->stfb);
- st_unreference_framebuffer(&intelfb->stfb);
+ st_unreference_framebuffer(intelfb->stfb);
free(intelfb);
return TRUE;
}
diff --git a/src/gallium/winsys/egl_xlib/egl_xlib.c b/src/gallium/winsys/egl_xlib/egl_xlib.c
index e9f821d2764..477d766925c 100644
--- a/src/gallium/winsys/egl_xlib/egl_xlib.c
+++ b/src/gallium/winsys/egl_xlib/egl_xlib.c
@@ -537,7 +537,7 @@ xlib_eglDestroySurface(_EGLDriver *drv, EGLDisplay dpy, EGLSurface surface)
}
else {
XFreeGC(surf->Dpy, surf->Gc);
- st_unreference_framebuffer(&surf->Framebuffer);
+ st_unreference_framebuffer(surf->Framebuffer);
free(surf);
}
return EGL_TRUE;
diff --git a/src/gallium/winsys/xlib/xm_winsys.c b/src/gallium/winsys/xlib/xm_winsys.c
index 3334af175bc..acb5ad8f714 100644
--- a/src/gallium/winsys/xlib/xm_winsys.c
+++ b/src/gallium/winsys/xlib/xm_winsys.c
@@ -352,7 +352,7 @@ xmesa_display_surface_tiled(XMesaBuffer b, const struct pipe_surface *surf)
/* offset in pixels */
offset *= TILE_SIZE * TILE_SIZE;
- if (XSHM_ENABLED(xm_buf)) {
+ if (0 && XSHM_ENABLED(xm_buf)) {
ximage->data = (char *) xm_buf->data + 4 * offset;
/* make copy of tile data */
memcpy(tmpTile, (uint *) ximage->data, sizeof(tmpTile));
@@ -365,7 +365,7 @@ xmesa_display_surface_tiled(XMesaBuffer b, const struct pipe_surface *surf)
#endif
}
else {
- /* twiddel from ximage buffer to temp tile */
+ /* twiddle from ximage buffer to temp tile */
twiddle_tile((uint *) xm_buf->data + offset, tmpTile);
/* display temp tile data */
ximage->data = (char *) tmpTile;