summaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/cell
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers/cell')
-rw-r--r--src/gallium/drivers/cell/common.h70
-rw-r--r--src/gallium/drivers/cell/ppu/Makefile1
-rw-r--r--src/gallium/drivers/cell/ppu/cell_batch.c43
-rw-r--r--src/gallium/drivers/cell/ppu/cell_context.c23
-rw-r--r--src/gallium/drivers/cell/ppu/cell_context.h26
-rw-r--r--src/gallium/drivers/cell/ppu/cell_fence.c158
-rw-r--r--src/gallium/drivers/cell/ppu/cell_fence.h (renamed from src/gallium/drivers/cell/spu/spu_debug.h)43
-rw-r--r--src/gallium/drivers/cell/ppu/cell_gen_fp.c125
-rw-r--r--src/gallium/drivers/cell/ppu/cell_pipe_state.c41
-rw-r--r--src/gallium/drivers/cell/ppu/cell_spu.c38
-rw-r--r--src/gallium/drivers/cell/ppu/cell_spu.h12
-rw-r--r--src/gallium/drivers/cell/ppu/cell_state_emit.c68
-rw-r--r--src/gallium/drivers/cell/ppu/cell_state_shader.c2
-rw-r--r--src/gallium/drivers/cell/ppu/cell_texture.c27
-rw-r--r--src/gallium/drivers/cell/ppu/cell_texture.h5
-rw-r--r--src/gallium/drivers/cell/ppu/cell_vbuf.c7
-rw-r--r--src/gallium/drivers/cell/spu/spu_command.c152
-rw-r--r--src/gallium/drivers/cell/spu/spu_dcache.c4
-rw-r--r--src/gallium/drivers/cell/spu/spu_funcs.c34
-rw-r--r--src/gallium/drivers/cell/spu/spu_main.c9
-rw-r--r--src/gallium/drivers/cell/spu/spu_main.h50
-rw-r--r--src/gallium/drivers/cell/spu/spu_per_fragment_op.c122
-rw-r--r--src/gallium/drivers/cell/spu/spu_render.c34
-rw-r--r--src/gallium/drivers/cell/spu/spu_texture.c208
-rw-r--r--src/gallium/drivers/cell/spu/spu_texture.h34
-rw-r--r--src/gallium/drivers/cell/spu/spu_tri.c162
-rw-r--r--src/gallium/drivers/cell/spu/spu_tri.h2
27 files changed, 1055 insertions, 445 deletions
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index b0169b8e329..23fb0b0831d 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -64,10 +64,13 @@
#define ROUNDUP16(k) (((k) + 0xf) & ~0xf)
-#define CELL_MAX_SPUS 6
+#define CELL_MAX_SPUS 8
#define CELL_MAX_SAMPLERS 4
#define CELL_MAX_TEXTURE_LEVELS 12 /* 2k x 2k */
+#define CELL_MAX_CONSTANTS 32 /**< number of float[4] constants */
+#define CELL_MAX_WIDTH 1024 /**< max framebuffer width */
+#define CELL_MAX_HEIGHT 1024 /**< max framebuffer width */
#define TILE_SIZE 32
@@ -96,34 +99,67 @@
#define CELL_CMD_STATE_FRAGMENT_PROGRAM 19
#define CELL_CMD_STATE_ATTRIB_FETCH 20
#define CELL_CMD_STATE_FS_CONSTANTS 21
-#define CELL_CMD_VS_EXECUTE 22
-#define CELL_CMD_FLUSH_BUFFER_RANGE 23
+#define CELL_CMD_STATE_RASTERIZER 22
+#define CELL_CMD_VS_EXECUTE 23
+#define CELL_CMD_FLUSH_BUFFER_RANGE 24
+#define CELL_CMD_FENCE 25
+/** Command/batch buffers */
#define CELL_NUM_BUFFERS 4
#define CELL_BUFFER_SIZE (4*1024) /**< 16KB would be the max */
#define CELL_BUFFER_STATUS_FREE 10
#define CELL_BUFFER_STATUS_USED 20
+/** Debug flags */
#define CELL_DEBUG_CHECKER (1 << 0)
#define CELL_DEBUG_ASM (1 << 1)
#define CELL_DEBUG_SYNC (1 << 2)
#define CELL_DEBUG_FRAGMENT_OPS (1 << 3)
#define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4)
+#define CELL_DEBUG_CMD (1 << 5)
+#define CELL_DEBUG_CACHE (1 << 6)
/** Max instructions for doing per-fragment operations */
#define SPU_MAX_FRAGMENT_OPS_INSTS 64
+
+#define CELL_FENCE_IDLE 0
+#define CELL_FENCE_EMITTED 1
+#define CELL_FENCE_SIGNALLED 2
+
+struct cell_fence
+{
+ /** There's a 16-byte status qword per SPU */
+ volatile uint status[CELL_MAX_SPUS][4];
+};
+
+
+/**
+ * Fence command sent to SPUs. In response, the SPUs will write
+ * CELL_FENCE_STATUS_SIGNALLED back to the fence status word in main memory.
+ */
+struct cell_command_fence
+{
+ uint64_t opcode; /**< CELL_CMD_FENCE */
+ struct cell_fence *fence;
+};
+
+
/**
* Command to specify per-fragment operations state and generated code.
+ * Note that the dsa, blend, blend_color fields are really only needed
+ * for the fallback/C per-pixel code. They're not used when we generate
+ * dynamic SPU fragment code (which is the normal case).
*/
struct cell_command_fragment_ops
{
uint64_t opcode; /**< CELL_CMD_STATE_FRAGMENT_OPS */
struct pipe_depth_stencil_alpha_state dsa;
struct pipe_blend_state blend;
+ struct pipe_blend_color blend_color;
unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS];
};
@@ -147,7 +183,7 @@ struct cell_command_fragment_program
*/
struct cell_command_framebuffer
{
- uint64_t opcode; /**< CELL_CMD_FRAMEBUFFER */
+ uint64_t opcode; /**< CELL_CMD_STATE_FRAMEBUFFER */
int width, height;
void *color_start, *depth_start;
enum pipe_format color_format, depth_format;
@@ -155,6 +191,16 @@ struct cell_command_framebuffer
/**
+ * Tell SPUs about rasterizer state.
+ */
+struct cell_command_rasterizer
+{
+ uint64_t opcode; /**< CELL_CMD_STATE_RASTERIZER */
+ struct pipe_rasterizer_state rasterizer;
+};
+
+
+/**
* Clear framebuffer to the given value/color.
*/
struct cell_command_clear_surface
@@ -229,7 +275,6 @@ struct cell_command_render
float xmin, ymin, xmax, ymax; /* XXX another dummy field */
uint min_index;
boolean inline_verts;
- uint front_winding; /* the rasterizer needs to be able to determine facing to apply front/back-facing stencil */
};
@@ -260,19 +305,6 @@ struct cell_command_texture
};
-/** XXX unions don't seem to work */
-/* XXX this should go away; all commands should be placed in batch buffers */
-struct cell_command
-{
-#if 0
- struct cell_command_framebuffer fb;
- struct cell_command_clear_surface clear;
- struct cell_command_render render;
-#endif
- struct cell_command_vs vs;
-} ALIGN16_ATTRIB;
-
-
#define MAX_SPU_FUNCTIONS 12
/**
* Used to tell the PPU about the address of particular functions in the
@@ -293,7 +325,7 @@ struct cell_init_info
unsigned id;
unsigned num_spus;
unsigned debug_flags; /**< mask of CELL_DEBUG_x flags */
- struct cell_command *cmd;
+ float inv_timebase; /**< 1.0/timebase, for perf measurement */
/** Buffers for command batches, vertex/index data */
ubyte *buffers[CELL_NUM_BUFFERS];
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index b28f4c5c31f..9358a47284c 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -24,6 +24,7 @@ SOURCES = \
cell_clear.c \
cell_context.c \
cell_draw_arrays.c \
+ cell_fence.c \
cell_flush.c \
cell_gen_fragment.c \
cell_gen_fp.c \
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c
index 16882c01295..448b723d85a 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.c
+++ b/src/gallium/drivers/cell/ppu/cell_batch.c
@@ -28,6 +28,7 @@
#include "cell_context.h"
#include "cell_batch.h"
+#include "cell_fence.h"
#include "cell_spu.h"
@@ -42,7 +43,9 @@
uint
cell_get_empty_buffer(struct cell_context *cell)
{
- uint buf = 0, tries = 0;
+ static uint prev_buffer = 0;
+ uint buf = (prev_buffer + 1) % CELL_NUM_BUFFERS;
+ uint tries = 0;
/* Find a buffer that's marked as free by all SPUs */
while (1) {
@@ -58,8 +61,13 @@ cell_get_empty_buffer(struct cell_context *cell)
cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED;
}
/*
- printf("PPU: ALLOC BUFFER %u\n", buf);
+ printf("PPU: ALLOC BUFFER %u, %u tries\n", buf, tries);
*/
+ prev_buffer = buf;
+
+ /* release tex buffer associated w/ prev use of this batch buf */
+ cell_free_fenced_buffers(cell, &cell->fenced_buffers[buf]);
+
return buf;
}
}
@@ -82,6 +90,26 @@ cell_get_empty_buffer(struct cell_context *cell)
/**
+ * Append a fence command to the current batch buffer.
+ * Note that we're sure there's always room for this because of the
+ * adjusted size check in cell_batch_free_space().
+ */
+static void
+emit_fence(struct cell_context *cell)
+{
+ const uint batch = cell->cur_batch;
+ const uint size = cell->buffer_size[batch];
+ struct cell_command_fence *fence_cmd;
+
+ ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE);
+
+ fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size);
+ fence_cmd->opcode = CELL_CMD_FENCE;
+ fence_cmd->fence = &cell->fenced_buffers[batch].fence;
+}
+
+
+/**
* Flush the current batch buffer to the SPUs.
* An empty buffer will be found and set as the new current batch buffer
* for subsequent commands/data.
@@ -99,6 +127,12 @@ cell_batch_flush(struct cell_context *cell)
if (size == 0)
return;
+ /* Before we use this batch buffer, make sure any fenced texture buffers
+ * are released.
+ */
+ if (cell->fenced_buffers[batch].head)
+ emit_fence(cell);
+
flushing = TRUE;
assert(batch < CELL_NUM_BUFFERS);
@@ -139,6 +173,7 @@ uint
cell_batch_free_space(const struct cell_context *cell)
{
uint free = CELL_BUFFER_SIZE - cell->buffer_size[cell->cur_batch];
+ free -= sizeof(struct cell_command_fence);
return free;
}
@@ -169,7 +204,7 @@ cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
size = cell->buffer_size[cell->cur_batch];
- if (size + bytes > CELL_BUFFER_SIZE) {
+ if (bytes > cell_batch_free_space(cell)) {
cell_batch_flush(cell);
size = 0;
}
@@ -223,7 +258,7 @@ cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
padbytes = (alignment - (size % alignment)) % alignment;
- if (padbytes + size + bytes > CELL_BUFFER_SIZE) {
+ if (padbytes + bytes > cell_batch_free_space(cell)) {
cell_batch_flush(cell);
size = 0;
}
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index b66aa9c9d99..22d552d8e3d 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -47,6 +47,7 @@
#include "cell_clear.h"
#include "cell_context.h"
#include "cell_draw_arrays.h"
+#include "cell_fence.h"
#include "cell_flush.h"
#include "cell_state.h"
#include "cell_surface.h"
@@ -93,6 +94,8 @@ static const struct debug_named_value cell_debug_flags[] = {
{"sync", CELL_DEBUG_SYNC}, /**< SPUs do synchronous DMA */
{"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/
{"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/
+ {"cmd", CELL_DEBUG_CMD}, /**< SPUs dump command buffer info */
+ {"cache", CELL_DEBUG_CACHE}, /**< report texture cache stats on exit */
{NULL, 0}
};
@@ -102,6 +105,7 @@ cell_create_context(struct pipe_screen *screen,
struct cell_winsys *cws)
{
struct cell_context *cell;
+ uint i;
/* some fields need to be 16-byte aligned, so align the whole object */
cell = (struct cell_context*) align_malloc(sizeof(struct cell_context), 16);
@@ -149,13 +153,24 @@ cell_create_context(struct pipe_screen *screen,
cell_debug_flags,
0 );
+ for (i = 0; i < CELL_NUM_BUFFERS; i++)
+ cell_fence_init(&cell->fenced_buffers[i].fence);
+
+
/*
* SPU stuff
*/
- cell->num_spus = 6;
- /* XXX is this in SDK 3.0 only?
- cell->num_spus = spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES, -1);
- */
+ /* This call only works with SDK 3.0. Anyone still using 2.1??? */
+ cell->num_cells = spe_cpu_info_get(SPE_COUNT_PHYSICAL_CPU_NODES, -1);
+ cell->num_spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, 0);
+ if (cell->debug_flags) {
+ printf("Cell: found %d Cell(s) with %u SPUs\n",
+ cell->num_cells, cell->num_spus);
+ }
+ if (getenv("CELL_NUM_SPUS")) {
+ cell->num_spus = atoi(getenv("CELL_NUM_SPUS"));
+ assert(cell->num_spus > 0);
+ }
cell_start_spus(cell);
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 80a9b3d7e13..4491ae8cdf1 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -74,12 +74,26 @@ struct cell_fragment_shader_state
struct cell_fragment_ops_key
{
struct pipe_blend_state blend;
+ struct pipe_blend_color blend_color;
struct pipe_depth_stencil_alpha_state dsa;
enum pipe_format color_format;
enum pipe_format zs_format;
};
+struct cell_buffer_node;
+
+/**
+ * Fenced buffer list. List of buffers which can be unreferenced after
+ * the fence has been executed/signalled.
+ */
+struct cell_buffer_list
+{
+ struct cell_fence fence;
+ struct cell_buffer_node *head;
+};
+
+
/**
* Per-context state, subclass of pipe_context.
*/
@@ -120,6 +134,8 @@ struct cell_context
uint *tex_map;
uint dirty;
+ uint dirty_textures; /* bitmask of texture units */
+ uint dirty_samplers; /* bitmask of sampler units */
/** Cache of code generated for per-fragment ops */
struct keymap *fragment_ops_cache;
@@ -139,7 +155,7 @@ struct cell_context
struct cell_spu_function_info spu_functions ALIGN16_ATTRIB;
- uint num_spus;
+ uint num_cells, num_spus;
/** Buffers for command batches, vertex/index data */
uint buffer_size[CELL_NUM_BUFFERS];
@@ -151,6 +167,14 @@ struct cell_context
uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB;
+ /** Associated with each command/batch buffer is a list of pipe_buffers
+ * that are fenced. When the last command in a buffer is executed, the
+ * fence will be signalled, indicating that any pipe_buffers preceeding
+ * that fence can be unreferenced (and probably freed).
+ */
+ struct cell_buffer_list fenced_buffers[CELL_NUM_BUFFERS];
+
+
struct spe_function attrib_fetch;
unsigned attrib_fetch_offsets[PIPE_MAX_ATTRIBS];
diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c
new file mode 100644
index 00000000000..ffb3bea12b9
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_fence.c
@@ -0,0 +1,158 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <unistd.h>
+#include "util/u_memory.h"
+#include "pipe/p_inlines.h"
+#include "cell_context.h"
+#include "cell_batch.h"
+#include "cell_fence.h"
+#include "cell_texture.h"
+
+
+void
+cell_fence_init(struct cell_fence *fence)
+{
+ uint i;
+ for (i = 0; i < CELL_MAX_SPUS; i++) {
+ fence->status[i][0] = CELL_FENCE_IDLE;
+ }
+}
+
+
+boolean
+cell_fence_signalled(const struct cell_context *cell,
+ const struct cell_fence *fence)
+{
+ uint i;
+ for (i = 0; i < cell->num_spus; i++) {
+ //ASSERT(fence->status[i][0] != CELL_FENCE_IDLE);
+ if (fence->status[i][0] == CELL_FENCE_EMITTED)
+ return FALSE;
+ }
+ return TRUE;
+}
+
+
+void
+cell_fence_finish(const struct cell_context *cell,
+ const struct cell_fence *fence)
+{
+ while (!cell_fence_signalled(cell, fence)) {
+ usleep(10);
+ }
+}
+
+
+
+
+struct cell_buffer_node
+{
+ struct pipe_buffer *buffer;
+ struct cell_buffer_node *next;
+};
+
+
+static void
+cell_add_buffer_to_list(struct cell_context *cell,
+ struct cell_buffer_list *list,
+ struct pipe_buffer *buffer)
+{
+ struct pipe_screen *ps = cell->pipe.screen;
+ struct cell_buffer_node *node = CALLOC_STRUCT(cell_buffer_node);
+ /* create new list node which references the buffer, insert at head */
+ if (node) {
+ pipe_buffer_reference(ps, &node->buffer, buffer);
+ node->next = list->head;
+ list->head = node;
+ }
+}
+
+
+/**
+ * Wait for completion of the given fence, then unreference any buffers
+ * on the list.
+ * This typically unrefs/frees texture buffers after any rendering which uses
+ * them has completed.
+ */
+void
+cell_free_fenced_buffers(struct cell_context *cell,
+ struct cell_buffer_list *list)
+{
+ if (list->head) {
+ struct pipe_screen *ps = cell->pipe.screen;
+ struct cell_buffer_node *node;
+
+ cell_fence_finish(cell, &list->fence);
+
+ /* traverse the list, unreferencing buffers, freeing nodes */
+ node = list->head;
+ while (node) {
+ struct cell_buffer_node *next = node->next;
+ assert(node->buffer);
+ pipe_buffer_unmap(ps, node->buffer);
+#if 0
+ printf("Unref buffer %p\n", node->buffer);
+ if (node->buffer->refcount == 1)
+ printf(" Delete!\n");
+#endif
+ pipe_buffer_reference(ps, &node->buffer, NULL);
+ FREE(node);
+ node = next;
+ }
+ list->head = NULL;
+ }
+}
+
+
+/**
+ * This should be called for each render command.
+ * Any texture buffers that are current bound will be added to a fenced
+ * list to be freed later when the fence is executed/signalled.
+ */
+void
+cell_add_fenced_textures(struct cell_context *cell)
+{
+ struct cell_buffer_list *list = &cell->fenced_buffers[cell->cur_batch];
+ uint i;
+
+ for (i = 0; i < cell->num_textures; i++) {
+ struct cell_texture *ct = cell->texture[i];
+ if (ct) {
+ uint level;
+ for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+ if (ct->tiled_buffer[level]) {
+#if 0
+ printf("Adding texture %p buffer %p to list\n",
+ ct, ct->tiled_buffer[level]);
+#endif
+ cell_add_buffer_to_list(cell, list, ct->tiled_buffer[level]);
+ }
+ }
+ }
+ }
+}
diff --git a/src/gallium/drivers/cell/spu/spu_debug.h b/src/gallium/drivers/cell/ppu/cell_fence.h
index eeec0526558..536b4ba411a 100644
--- a/src/gallium/drivers/cell/spu/spu_debug.h
+++ b/src/gallium/drivers/cell/ppu/cell_fence.h
@@ -26,35 +26,32 @@
**************************************************************************/
-#ifndef SPU_DEBUG_H
-#define SPU_DEBUG_H
+#ifndef CELL_FENCE_H
+#define CELL_FENCE_H
-/* Set to 0 to disable all extraneous debugging code */
-#define DEBUG 1
+extern void
+cell_fence_init(struct cell_fence *fence);
-#if DEBUG
-extern boolean Debug;
-extern boolean force_fragment_ops_fallback;
-/* These debug macros use the unusual construction ", ##__VA_ARGS__"
- * which expands to the expected comma + args if variadic arguments
- * are supplied, but swallows the comma if there are no variadic
- * arguments (which avoids syntax errors that would otherwise occur).
- */
-#define DEBUG_PRINTF(format,...) \
- if (Debug) \
- printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
-#define D_PRINTF(flag, format,...) \
- if (spu.init.debug_flags & (flag)) \
- printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+extern boolean
+cell_fence_signalled(const struct cell_context *cell,
+ const struct cell_fence *fence);
-#else
-#define DEBUG_PRINTF(...)
-#define D_PRINTF(...)
+extern void
+cell_fence_finish(const struct cell_context *cell,
+ const struct cell_fence *fence);
-#endif
-#endif /* SPU_DEBUG_H */
+extern void
+cell_free_fenced_buffers(struct cell_context *cell,
+ struct cell_buffer_list *list);
+
+
+extern void
+cell_add_fenced_textures(struct cell_context *cell);
+
+
+#endif /* CELL_FENCE_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 3dfd5f673dd..d4d644d6e81 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -84,6 +84,9 @@ struct codegen
/** Index of execution mask register */
int exec_mask_reg;
+ /** KIL mask: indicates which fragments have been killed */
+ int kill_mask_reg;
+
int frame_size; /**< Stack frame size, in words */
struct spe_function *f;
@@ -346,6 +349,22 @@ store_dest_reg(struct codegen *gen,
int value_reg, int channel,
const struct tgsi_full_dst_register *dest)
{
+ /*
+ * XXX need to implement dst reg clamping/saturation
+ */
+#if 0
+ switch (inst->Instruction.Saturate) {
+ case TGSI_SAT_NONE:
+ break;
+ case TGSI_SAT_ZERO_ONE:
+ break;
+ case TGSI_SAT_MINUS_PLUS_ONE:
+ break;
+ default:
+ assert( 0 );
+ }
+#endif
+
switch (dest->DstRegister.File) {
case TGSI_FILE_TEMPORARY:
if (gen->if_nesting > 0) {
@@ -431,8 +450,21 @@ emit_prologue(struct codegen *gen)
static void
emit_epilogue(struct codegen *gen)
{
+ const int return_reg = 3;
+
spe_comment(gen->f, -4, "Function epilogue:");
+ spe_comment(gen->f, 0, "return the killed mask");
+ if (gen->kill_mask_reg > 0) {
+ /* shader called KIL, return the "alive" mask */
+ spe_move(gen->f, return_reg, gen->kill_mask_reg);
+ }
+ else {
+ /* return {0,0,0,0} */
+ spe_load_uint(gen->f, return_reg, 0);
+ }
+
+ spe_comment(gen->f, 0, "restore stack and return");
if (gen->frame_size >= 512) {
/* offset is too large for ai instruction */
int offset_reg = spe_allocate_available_register(gen->f);
@@ -1337,16 +1369,33 @@ emit_function_call(struct codegen *gen,
static boolean
-emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+emit_TEX(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- const uint addr = lookup_function(gen->cell, "spu_txp");
+ const uint target = inst->InstructionExtTexture.Texture;
const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+ uint addr;
int ch;
int coord_regs[4], d_regs[4];
+ switch (target) {
+ case TGSI_TEXTURE_1D:
+ case TGSI_TEXTURE_2D:
+ addr = lookup_function(gen->cell, "spu_tex_2d");
+ break;
+ case TGSI_TEXTURE_3D:
+ addr = lookup_function(gen->cell, "spu_tex_3d");
+ break;
+ case TGSI_TEXTURE_CUBE:
+ addr = lookup_function(gen->cell, "spu_tex_cube");
+ break;
+ default:
+ ASSERT(0 && "unsupported texture target");
+ return FALSE;
+ }
+
assert(inst->FullSrcRegisters[1].SrcRegister.File == TGSI_FILE_SAMPLER);
- spe_comment(gen->f, -4, "CALL txp:");
+ spe_comment(gen->f, -4, "CALL tex:");
/* get src/dst reg info */
for (ch = 0; ch < 4; ch++) {
@@ -1368,7 +1417,7 @@ emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
}
- /* setup function arguments */
+ /* setup function arguments (XXX depends on target) */
for (i = 0; i < 4; i++) {
spe_move(gen->f, 3 + i, coord_regs[i]);
}
@@ -1407,6 +1456,68 @@ emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
/**
+ * KILL if any of src reg values are less than zero.
+ */
+static boolean
+emit_KIL(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+ int ch;
+ int s_regs[4], kil_reg = -1, cmp_reg, zero_reg;
+
+ spe_comment(gen->f, -4, "CALL kil:");
+
+ /* zero = {0,0,0,0} */
+ zero_reg = get_itemp(gen);
+ spe_load_uint(gen->f, zero_reg, 0);
+
+ cmp_reg = get_itemp(gen);
+
+ /* get src regs */
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ s_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ }
+ }
+
+ /* test if any src regs are < 0 */
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ if (kil_reg >= 0) {
+ /* cmp = 0 > src ? : ~0 : 0 */
+ spe_fcgt(gen->f, cmp_reg, zero_reg, s_regs[ch]);
+ /* kil = kil | cmp */
+ spe_or(gen->f, kil_reg, kil_reg, cmp_reg);
+ }
+ else {
+ kil_reg = get_itemp(gen);
+ /* kil = 0 > src ? : ~0 : 0 */
+ spe_fcgt(gen->f, kil_reg, zero_reg, s_regs[ch]);
+ }
+ }
+ }
+
+ if (gen->if_nesting) {
+ /* may have been a conditional kil */
+ spe_and(gen->f, kil_reg, kil_reg, gen->exec_mask_reg);
+ }
+
+ /* allocate the kill mask reg if needed */
+ if (gen->kill_mask_reg <= 0) {
+ gen->kill_mask_reg = spe_allocate_available_register(gen->f);
+ spe_move(gen->f, gen->kill_mask_reg, kil_reg);
+ }
+ else {
+ spe_or(gen->f, gen->kill_mask_reg, gen->kill_mask_reg, kil_reg);
+ }
+
+ free_itemps(gen);
+
+ return TRUE;
+}
+
+
+
+/**
* Emit max. See emit_SGT for comments.
*/
static boolean
@@ -1674,8 +1785,12 @@ emit_instruction(struct codegen *gen,
/* fall-through for now */
case TGSI_OPCODE_TXB:
/* fall-through for now */
+ case TGSI_OPCODE_TXL:
+ /* fall-through for now */
case TGSI_OPCODE_TXP:
- return emit_TXP(gen, inst);
+ return emit_TEX(gen, inst);
+ case TGSI_OPCODE_KIL:
+ return emit_KIL(gen, inst);
case TGSI_OPCODE_IF:
return emit_IF(gen, inst);
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index 2e3086c4fae..825110c62b7 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -212,17 +212,24 @@ cell_bind_sampler_states(struct pipe_context *pipe,
unsigned num, void **samplers)
{
struct cell_context *cell = cell_context(pipe);
+ uint i, changed = 0x0;
assert(num <= CELL_MAX_SAMPLERS);
draw_flush(cell->draw);
- memcpy(cell->sampler, samplers, num * sizeof(void *));
- memset(&cell->sampler[num], 0, (CELL_MAX_SAMPLERS - num) *
- sizeof(void *));
- cell->num_samplers = num;
+ for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
+ struct pipe_sampler_state *new_samp = i < num ? samplers[i] : NULL;
+ if (cell->sampler[i] != new_samp) {
+ cell->sampler[i] = new_samp;
+ changed |= (1 << i);
+ }
+ }
- cell->dirty |= CELL_NEW_SAMPLER;
+ if (changed) {
+ cell->dirty |= CELL_NEW_SAMPLER;
+ cell->dirty_samplers |= changed;
+ }
}
@@ -240,25 +247,25 @@ cell_set_sampler_textures(struct pipe_context *pipe,
unsigned num, struct pipe_texture **texture)
{
struct cell_context *cell = cell_context(pipe);
- uint i;
+ uint i, changed = 0x0;
assert(num <= CELL_MAX_SAMPLERS);
- /* Check for no-op */
- if (num == cell->num_textures &&
- !memcmp(cell->texture, texture, num * sizeof(struct pipe_texture *)))
- return;
-
- draw_flush(cell->draw);
-
for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
- struct pipe_texture *tex = i < num ? texture[i] : NULL;
-
- pipe_texture_reference((struct pipe_texture **) &cell->texture[i], tex);
+ struct pipe_texture *new_tex = i < num ? texture[i] : NULL;
+ if ((struct pipe_texture *) cell->texture[i] != new_tex) {
+ pipe_texture_reference((struct pipe_texture **) &cell->texture[i],
+ new_tex);
+ changed |= (1 << i);
+ }
}
+
cell->num_textures = num;
- cell->dirty |= CELL_NEW_TEXTURE;
+ if (changed) {
+ cell->dirty |= CELL_NEW_TEXTURE;
+ cell->dirty_textures |= changed;
+ }
}
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c
index df020c4146d..28e5e6d706d 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.c
+++ b/src/gallium/drivers/cell/ppu/cell_spu.c
@@ -53,6 +53,35 @@ struct cell_global_info cell_global;
/**
+ * Scan /proc/cpuinfo to determine the timebase for the system.
+ * This is used by the SPUs to convert 'decrementer' ticks to seconds.
+ * There may be a better way to get this value...
+ */
+static unsigned
+get_timebase(void)
+{
+ FILE *f = fopen("/proc/cpuinfo", "r");
+ unsigned timebase;
+
+ assert(f);
+ while (!feof(f)) {
+ char line[80];
+ fgets(line, sizeof(line), f);
+ if (strncmp(line, "timebase", 8) == 0) {
+ char *colon = strchr(line, ':');
+ if (colon) {
+ timebase = atoi(colon + 2);
+ break;
+ }
+ }
+ }
+ fclose(f);
+
+ return timebase;
+}
+
+
+/**
* Write a 1-word message to the given SPE mailbox.
*/
void
@@ -115,6 +144,7 @@ cell_start_spus(struct cell_context *cell)
{
static boolean one_time_init = FALSE;
uint i, j;
+ uint timebase = get_timebase();
if (one_time_init) {
fprintf(stderr, "PPU: Multiple rendering contexts not yet supported "
@@ -124,10 +154,7 @@ cell_start_spus(struct cell_context *cell)
one_time_init = TRUE;
- assert(cell->num_spus <= MAX_SPUS);
-
- ASSERT_ALIGN16(&cell_global.command[0]);
- ASSERT_ALIGN16(&cell_global.command[1]);
+ assert(cell->num_spus <= CELL_MAX_SPUS);
ASSERT_ALIGN16(&cell_global.inits[0]);
ASSERT_ALIGN16(&cell_global.inits[1]);
@@ -141,7 +168,8 @@ cell_start_spus(struct cell_context *cell)
cell_global.inits[i].id = i;
cell_global.inits[i].num_spus = cell->num_spus;
cell_global.inits[i].debug_flags = cell->debug_flags;
- cell_global.inits[i].cmd = &cell_global.command[i];
+ cell_global.inits[i].inv_timebase = 1000.0f / timebase;
+
for (j = 0; j < CELL_NUM_BUFFERS; j++) {
cell_global.inits[i].buffers[j] = cell->buffer[j];
}
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h
index 137f26612e4..b633880c256 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.h
+++ b/src/gallium/drivers/cell/ppu/cell_spu.h
@@ -31,13 +31,12 @@
#include <libspe2.h>
#include <libmisc.h>
+#include <pthread.h>
#include "cell/common.h"
#include "cell_context.h"
-#define MAX_SPUS 8
-
/**
* Global vars, for now anyway.
*/
@@ -46,14 +45,13 @@ struct cell_global_info
/**
* SPU/SPE handles, etc
*/
- spe_context_ptr_t spe_contexts[MAX_SPUS];
- pthread_t spe_threads[MAX_SPUS];
+ spe_context_ptr_t spe_contexts[CELL_MAX_SPUS];
+ pthread_t spe_threads[CELL_MAX_SPUS];
/**
- * Data sent to SPUs
+ * Data sent to SPUs at start-up
*/
- struct cell_init_info inits[MAX_SPUS];
- struct cell_command command[MAX_SPUS];
+ struct cell_init_info inits[CELL_MAX_SPUS];
};
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index bb694aa1073..dd2d7f7d1ef 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -52,6 +52,7 @@ lookup_fragment_ops(struct cell_context *cell)
*/
memset(&key, 0, sizeof(key));
key.blend = *cell->blend;
+ key.blend_color = cell->blend_color;
key.dsa = *cell->depth_stencil;
if (cell->framebuffer.cbufs[0])
@@ -146,6 +147,13 @@ cell_emit_state(struct cell_context *cell)
#endif
}
+ if (cell->dirty & (CELL_NEW_RASTERIZER)) {
+ struct cell_command_rasterizer *rast =
+ cell_batch_alloc(cell, sizeof(*rast));
+ rast->opcode = CELL_CMD_STATE_RASTERIZER;
+ rast->rasterizer = *cell->rasterizer;
+ }
+
if (cell->dirty & (CELL_NEW_FS)) {
/* Send new fragment program to SPUs */
struct cell_command_fragment_program *fp
@@ -193,44 +201,50 @@ cell_emit_state(struct cell_context *cell)
if (cell->dirty & CELL_NEW_SAMPLER) {
uint i;
for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
- if (cell->sampler[i]) {
- struct cell_command_sampler *sampler
- = cell_batch_alloc(cell, sizeof(*sampler));
- sampler->opcode = CELL_CMD_STATE_SAMPLER;
- sampler->unit = i;
- sampler->state = *cell->sampler[i];
+ if (cell->dirty_samplers & (1 << i)) {
+ if (cell->sampler[i]) {
+ struct cell_command_sampler *sampler
+ = cell_batch_alloc(cell, sizeof(*sampler));
+ sampler->opcode = CELL_CMD_STATE_SAMPLER;
+ sampler->unit = i;
+ sampler->state = *cell->sampler[i];
+ }
}
}
+ cell->dirty_samplers = 0x0;
}
if (cell->dirty & CELL_NEW_TEXTURE) {
uint i;
for (i = 0;i < CELL_MAX_SAMPLERS; i++) {
- struct cell_command_texture *texture
- = cell_batch_alloc(cell, sizeof(*texture));
- texture->opcode = CELL_CMD_STATE_TEXTURE;
- texture->unit = i;
- if (cell->texture[i]) {
- uint level;
- for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
- texture->start[level] = cell->texture[i]->tiled_data[level];
- texture->width[level] = cell->texture[i]->base.width[level];
- texture->height[level] = cell->texture[i]->base.height[level];
- texture->depth[level] = cell->texture[i]->base.depth[level];
+ if (cell->dirty_textures & (1 << i)) {
+ struct cell_command_texture *texture
+ = cell_batch_alloc(cell, sizeof(*texture));
+ texture->opcode = CELL_CMD_STATE_TEXTURE;
+ texture->unit = i;
+ if (cell->texture[i]) {
+ uint level;
+ for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+ texture->start[level] = cell->texture[i]->tiled_mapped[level];
+ texture->width[level] = cell->texture[i]->base.width[level];
+ texture->height[level] = cell->texture[i]->base.height[level];
+ texture->depth[level] = cell->texture[i]->base.depth[level];
+ }
+ texture->target = cell->texture[i]->base.target;
}
- texture->target = cell->texture[i]->base.target;
- }
- else {
- uint level;
- for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
- texture->start[level] = NULL;
- texture->width[level] = 0;
- texture->height[level] = 0;
- texture->depth[level] = 0;
+ else {
+ uint level;
+ for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+ texture->start[level] = NULL;
+ texture->width[level] = 0;
+ texture->height[level] = 0;
+ texture->depth[level] = 0;
+ }
+ texture->target = 0;
}
- texture->target = 0;
}
}
+ cell->dirty_textures = 0x0;
}
if (cell->dirty & CELL_NEW_VERTEX_INFO) {
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index 54a17eaf2b7..cda39f8d592 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -191,6 +191,8 @@ cell_set_constant_buffer(struct pipe_context *pipe,
assert(shader < PIPE_SHADER_TYPES);
assert(index == 0);
+ draw_flush(cell->draw);
+
/* note: reference counting */
winsys_buffer_reference(ws,
&cell->constants[shader].buffer,
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 230e1925733..9ac2f3bbb96 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -136,6 +136,9 @@ cell_texture_release(struct pipe_screen *screen,
__FUNCTION__, (void *) *pt, (*pt)->refcount - 1);
*/
if (--(*pt)->refcount <= 0) {
+ /* Delete this texture now.
+ * But note that the underlying pipe_buffer may linger...
+ */
struct cell_texture *ct = cell_texture(*pt);
uint i;
@@ -146,8 +149,12 @@ cell_texture_release(struct pipe_screen *screen,
pipe_buffer_reference(screen, &ct->buffer, NULL);
for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
- if (ct->tiled_data[i]) {
- align_free(ct->tiled_data[i]);
+ /* Unreference the tiled image buffer.
+ * It may not actually be deleted until a fence is hit.
+ */
+ if (ct->tiled_buffer[i]) {
+ ct->tiled_mapped[i] = NULL;
+ winsys_buffer_reference(screen->winsys, &ct->tiled_buffer[i], NULL);
}
}
@@ -228,12 +235,18 @@ cell_twiddle_texture(struct pipe_screen *screen,
int offset = bufWidth * bufHeight * 4 * surface->face;
uint *dst;
- if (!ct->tiled_data[level]) {
- ct->tiled_data[level] =
- align_malloc(bufWidth * bufHeight * 4 * numFaces, 16);
+ if (!ct->tiled_buffer[level]) {
+ /* allocate buffer for tiled data now */
+ struct pipe_winsys *ws = screen->winsys;
+ uint bytes = bufWidth * bufHeight * 4 * numFaces;
+ ct->tiled_buffer[level] = ws->buffer_create(ws, 16,
+ PIPE_BUFFER_USAGE_PIXEL,
+ bytes);
+ /* and map it */
+ ct->tiled_mapped[level] = ws->buffer_map(ws, ct->tiled_buffer[level],
+ PIPE_BUFFER_USAGE_GPU_READ);
}
-
- dst = (uint *) ((ubyte *) ct->tiled_data[level] + offset);
+ dst = (uint *) ((ubyte *) ct->tiled_mapped[level] + offset);
twiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst,
surface->stride, src);
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
index a0757091b07..2f5fe0dd1b7 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.h
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -48,7 +48,10 @@ struct cell_texture
struct pipe_buffer *buffer;
unsigned long buffer_size;
- void *tiled_data[CELL_MAX_TEXTURE_LEVELS]; /* XXX this may be temporary */ /*ALIGN16*/
+ /** Texture data in tiled layout is held here */
+ struct pipe_buffer *tiled_buffer[CELL_MAX_TEXTURE_LEVELS];
+ /** Mapped, tiled texture data */
+ void *tiled_mapped[CELL_MAX_TEXTURE_LEVELS];
};
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index 578ddf62dcb..65ba51b6bb2 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -38,6 +38,7 @@
#include "cell_batch.h"
#include "cell_context.h"
+#include "cell_fence.h"
#include "cell_flush.h"
#include "cell_spu.h"
#include "cell_vbuf.h"
@@ -108,6 +109,11 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices,
__FUNCTION__, cvbr->vertex_buf, vertices_used);
*/
+ /* Make sure texture buffers aren't released until we're done rendering
+ * with them.
+ */
+ cell_add_fenced_textures(cell);
+
/* Tell SPUs they can release the vert buf */
if (cvbr->vertex_buf != ~0U) {
struct cell_command_release_verts *release
@@ -214,7 +220,6 @@ cell_vbuf_draw(struct vbuf_render *vbr,
render->opcode = CELL_CMD_RENDER;
render->prim_type = cvbr->prim;
- render->front_winding = cell->rasterizer->front_winding;
render->num_indexes = nr_indices;
render->min_index = min_index;
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index c28677ebf87..a6ed29ea631 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -44,7 +44,6 @@
#include "spu_tile.h"
#include "spu_vertex_shader.h"
#include "spu_dcache.h"
-#include "spu_debug.h"
#include "cell/common.h"
@@ -77,9 +76,10 @@ static void
release_buffer(uint buffer)
{
/* Evidently, using less than a 16-byte status doesn't work reliably */
- static const uint status[4] ALIGN16_ATTRIB
- = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
-
+ static const vector unsigned int status = {CELL_BUFFER_STATUS_FREE,
+ CELL_BUFFER_STATUS_FREE,
+ CELL_BUFFER_STATUS_FREE,
+ CELL_BUFFER_STATUS_FREE};
const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
uint *dst = spu.init.buffer_status + index;
@@ -94,10 +94,33 @@ release_buffer(uint buffer)
}
+/**
+ * Write CELL_FENCE_SIGNALLED back to the fence status qword in main memory.
+ * There's a qword of status per SPU.
+ */
+static void
+cmd_fence(struct cell_command_fence *fence_cmd)
+{
+ static const vector unsigned int status = {CELL_FENCE_SIGNALLED,
+ CELL_FENCE_SIGNALLED,
+ CELL_FENCE_SIGNALLED,
+ CELL_FENCE_SIGNALLED};
+ uint *dst = (uint *) fence_cmd->fence;
+ dst += 4 * spu.init.id; /* main store/memory address, not local store */
+
+ mfc_put((void *) &status, /* src in local memory */
+ (unsigned int) dst, /* dst in main memory */
+ sizeof(status), /* size */
+ TAG_FENCE, /* tag */
+ 0, /* tid */
+ 0 /* rid */);
+}
+
+
static void
cmd_clear_surface(const struct cell_command_clear_surface *clear)
{
- DEBUG_PRINTF("CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
+ D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
if (clear->surface == 0) {
spu.fb.color_clear_value = clear->value;
@@ -165,14 +188,14 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
#endif /* CLEAR_OPT */
- DEBUG_PRINTF("CLEAR SURF done\n");
+ D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF done\n");
}
static void
cmd_release_verts(const struct cell_command_release_verts *release)
{
- DEBUG_PRINTF("RELEASE VERTS %u\n", release->vertex_buf);
+ D_PRINTF(CELL_DEBUG_CMD, "RELEASE VERTS %u\n", release->vertex_buf);
ASSERT(release->vertex_buf != ~0U);
release_buffer(release->vertex_buf);
}
@@ -189,12 +212,13 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
{
static int warned = 0;
- DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n");
+ D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n");
/* Copy SPU code from batch buffer to spu buffer */
memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
/* Copy state info (for fallback case only) */
memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
+ memcpy(&spu.blend_color, &fops->blend_color, sizeof(fops->blend_color));
/* Parity twist! For now, always use the fallback code by default,
* only switching to codegen when specifically requested. This
@@ -228,7 +252,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
static void
cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
{
- DEBUG_PRINTF("CMD_STATE_FRAGMENT_PROGRAM\n");
+ D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_PROGRAM\n");
/* Copy SPU code from batch buffer to spu buffer */
memcpy(spu.fragment_program_code, fp->code,
SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
@@ -246,10 +270,11 @@ cmd_state_fs_constants(const uint64_t *buffer, uint pos)
const float *constants = (const float *) &buffer[pos + 2];
uint i;
- DEBUG_PRINTF("CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
+ D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
/* Expand each float to float[4] for SOA execution */
for (i = 0; i < num_const; i++) {
+ D_PRINTF(CELL_DEBUG_CMD, " const[%u] = %f\n", i, constants[i]);
spu.constants[i] = spu_splats(constants[i]);
}
@@ -261,7 +286,7 @@ cmd_state_fs_constants(const uint64_t *buffer, uint pos)
static void
cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
{
- DEBUG_PRINTF("FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n",
+ D_PRINTF(CELL_DEBUG_CMD, "FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n",
cmd->width,
cmd->height,
cmd->color_start,
@@ -309,8 +334,7 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
*/
static void
update_tex_masks(struct spu_texture *texture,
- const struct pipe_sampler_state *sampler,
- uint unit)
+ const struct pipe_sampler_state *sampler)
{
uint i;
@@ -337,11 +361,6 @@ update_tex_masks(struct spu_texture *texture,
texture->level[i].scale_t = spu_splats(1.0f);
}
}
-
- /* XXX temporary hack */
- if (texture->target == PIPE_TEXTURE_CUBE) {
- spu.sample_texture4[unit] = sample_texture4_cube;
- }
}
@@ -350,18 +369,18 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
{
uint unit = sampler->unit;
- DEBUG_PRINTF("SAMPLER [%u]\n", unit);
+ D_PRINTF(CELL_DEBUG_CMD, "SAMPLER [%u]\n", unit);
spu.sampler[unit] = sampler->state;
switch (spu.sampler[unit].min_img_filter) {
case PIPE_TEX_FILTER_LINEAR:
- spu.min_sample_texture4[unit] = sample_texture4_bilinear;
+ spu.min_sample_texture_2d[unit] = sample_texture_2d_bilinear;
break;
case PIPE_TEX_FILTER_ANISO:
/* fall-through, for now */
case PIPE_TEX_FILTER_NEAREST:
- spu.min_sample_texture4[unit] = sample_texture4_nearest;
+ spu.min_sample_texture_2d[unit] = sample_texture_2d_nearest;
break;
default:
ASSERT(0);
@@ -369,12 +388,12 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
switch (spu.sampler[sampler->unit].mag_img_filter) {
case PIPE_TEX_FILTER_LINEAR:
- spu.mag_sample_texture4[unit] = sample_texture4_bilinear;
+ spu.mag_sample_texture_2d[unit] = sample_texture_2d_bilinear;
break;
case PIPE_TEX_FILTER_ANISO:
/* fall-through, for now */
case PIPE_TEX_FILTER_NEAREST:
- spu.mag_sample_texture4[unit] = sample_texture4_nearest;
+ spu.mag_sample_texture_2d[unit] = sample_texture_2d_nearest;
break;
default:
ASSERT(0);
@@ -383,16 +402,16 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
switch (spu.sampler[sampler->unit].min_mip_filter) {
case PIPE_TEX_MIPFILTER_NEAREST:
case PIPE_TEX_MIPFILTER_LINEAR:
- spu.sample_texture4[unit] = sample_texture4_lod;
+ spu.sample_texture_2d[unit] = sample_texture_2d_lod;
break;
case PIPE_TEX_MIPFILTER_NONE:
- spu.sample_texture4[unit] = spu.mag_sample_texture4[unit];
+ spu.sample_texture_2d[unit] = spu.mag_sample_texture_2d[unit];
break;
default:
ASSERT(0);
}
- update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
+ update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
}
@@ -402,9 +421,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
const uint unit = texture->unit;
uint i;
- //if (spu.init.id==0) Debug=1;
-
- DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit);
+ D_PRINTF(CELL_DEBUG_CMD, "TEXTURE [%u]\n", texture->unit);
spu.texture[unit].max_level = 0;
spu.texture[unit].target = texture->target;
@@ -414,7 +431,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
uint height = texture->height[i];
uint depth = texture->depth[i];
- DEBUG_PRINTF(" LEVEL %u: at %p size[0] %u x %u\n", i,
+ D_PRINTF(CELL_DEBUG_CMD, " LEVEL %u: at %p size[0] %u x %u\n", i,
texture->start[i], texture->width[i], texture->height[i]);
spu.texture[unit].level[i].start = texture->start[i];
@@ -435,16 +452,14 @@ cmd_state_texture(const struct cell_command_texture *texture)
spu.texture[unit].max_level = i;
}
- update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
-
- //Debug=0;
+ update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
}
static void
cmd_state_vertex_info(const struct vertex_info *vinfo)
{
- DEBUG_PRINTF("VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
+ D_PRINTF(CELL_DEBUG_CMD, "VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
ASSERT(vinfo->num_attribs >= 1);
ASSERT(vinfo->num_attribs <= 8);
memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
@@ -483,7 +498,7 @@ cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
static void
cmd_finish(void)
{
- DEBUG_PRINTF("FINISH\n");
+ D_PRINTF(CELL_DEBUG_CMD, "FINISH\n");
really_clear_tiles(0);
/* wait for all outstanding DMAs to finish */
mfc_write_tag_mask(~0);
@@ -508,7 +523,7 @@ cmd_batch(uint opcode)
const unsigned usize = size / sizeof(buffer[0]);
uint pos;
- DEBUG_PRINTF("BATCH buffer %u, len %u, from %p\n",
+ D_PRINTF(CELL_DEBUG_CMD, "BATCH buffer %u, len %u, from %p\n",
buf, size, spu.init.buffers[buf]);
ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
@@ -528,7 +543,7 @@ cmd_batch(uint opcode)
wait_on_mask(1 << TAG_BATCH_BUFFER);
/* Tell PPU we're done copying the buffer to local store */
- DEBUG_PRINTF("release batch buf %u\n", buf);
+ D_PRINTF(CELL_DEBUG_CMD, "release batch buf %u\n", buf);
release_buffer(buf);
/*
@@ -586,6 +601,14 @@ cmd_batch(uint opcode)
case CELL_CMD_STATE_FS_CONSTANTS:
pos = cmd_state_fs_constants(buffer, pos);
break;
+ case CELL_CMD_STATE_RASTERIZER:
+ {
+ struct cell_command_rasterizer *rast =
+ (struct cell_command_rasterizer *) &buffer[pos];
+ spu.rasterizer = rast->rasterizer;
+ pos += sizeof(*rast) / 8;
+ }
+ break;
case CELL_CMD_STATE_SAMPLER:
{
struct cell_command_sampler *sampler
@@ -638,6 +661,14 @@ cmd_batch(uint opcode)
cmd_finish();
pos += 1;
break;
+ case CELL_CMD_FENCE:
+ {
+ struct cell_command_fence *fence_cmd =
+ (struct cell_command_fence *) &buffer[pos];
+ cmd_fence(fence_cmd);
+ pos += sizeof(*fence_cmd) / 8;
+ }
+ break;
case CELL_CMD_RELEASE_VERTS:
{
struct cell_command_release_verts *release
@@ -661,10 +692,12 @@ cmd_batch(uint opcode)
}
}
- DEBUG_PRINTF("BATCH complete\n");
+ D_PRINTF(CELL_DEBUG_CMD, "BATCH complete\n");
}
+#define PERF 0
+
/**
* Main loop for SPEs: Get a command, execute it, repeat.
@@ -672,41 +705,29 @@ cmd_batch(uint opcode)
void
command_loop(void)
{
- struct cell_command cmd;
int exitFlag = 0;
+ uint t0, t1;
- DEBUG_PRINTF("Enter command loop\n");
-
- ASSERT((sizeof(struct cell_command) & 0xf) == 0);
- ASSERT_ALIGN16(&cmd);
+ D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n");
while (!exitFlag) {
unsigned opcode;
- int tag = 0;
- DEBUG_PRINTF("Wait for cmd...\n");
+ D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n");
+
+ if (PERF)
+ spu_write_decrementer(~0);
/* read/wait from mailbox */
opcode = (unsigned int) spu_read_in_mbox();
+ D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", opcode);
- DEBUG_PRINTF("got cmd 0x%x\n", opcode);
-
- /* command payload */
- mfc_get(&cmd, /* dest */
- (unsigned int) spu.init.cmd, /* src */
- sizeof(struct cell_command), /* bytes */
- tag,
- 0, /* tid */
- 0 /* rid */);
- wait_on_mask( 1 << tag );
-
- /*
- * NOTE: most commands should be contained in a batch buffer
- */
+ if (PERF)
+ t0 = spu_read_decrementer();
switch (opcode & CELL_CMD_OPCODE_MASK) {
case CELL_CMD_EXIT:
- DEBUG_PRINTF("EXIT\n");
+ D_PRINTF(CELL_DEBUG_CMD, "EXIT\n");
exitFlag = 1;
break;
case CELL_CMD_VS_EXECUTE:
@@ -721,9 +742,16 @@ command_loop(void)
printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK);
}
+ if (PERF) {
+ t1 = spu_read_decrementer();
+ printf("wait mbox time: %gms batch time: %gms\n",
+ (~0u - t0) * spu.init.inv_timebase,
+ (t0 - t1) * spu.init.inv_timebase);
+ }
}
- DEBUG_PRINTF("Exit command loop\n");
+ D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n");
- spu_dcache_report();
+ if (spu.init.debug_flags & CELL_DEBUG_CACHE)
+ spu_dcache_report();
}
diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c
index 167404cdc54..a6d67634fd8 100644
--- a/src/gallium/drivers/cell/spu/spu_dcache.c
+++ b/src/gallium/drivers/cell/spu/spu_dcache.c
@@ -36,7 +36,9 @@
#define CACHE_SET_TAGID(set) (((set) & 0x03) + TAG_DCACHE0)
#define CACHE_LOG2NNWAY 2
#define CACHE_LOG2NSETS 6
-/*#define CACHE_STATS 1*/
+#ifdef DEBUG
+#define CACHE_STATS 1
+#endif
#include <cache-api.h>
/* Yes folks, this is ugly.
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 5c3ee305d48..3534b35000d 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -43,6 +43,7 @@
#include "cell/common.h"
#include "spu_main.h"
#include "spu_funcs.h"
+#include "spu_texture.h"
/** For "return"-ing four vectors */
@@ -102,11 +103,34 @@ spu_log2(vector float x)
static struct vec_4x4
-spu_txp(vector float s, vector float t, vector float r, vector float q,
- unsigned unit)
+spu_tex_2d(vector float s, vector float t, vector float r, vector float q,
+ unsigned unit)
{
struct vec_4x4 colors;
- spu.sample_texture4[unit](s, t, r, q, unit, 0, 0, colors.v);
+ (void) r;
+ (void) q;
+ spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v);
+ return colors;
+}
+
+static struct vec_4x4
+spu_tex_3d(vector float s, vector float t, vector float r, vector float q,
+ unsigned unit)
+{
+ struct vec_4x4 colors;
+ (void) r;
+ (void) q;
+ spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v);
+ return colors;
+}
+
+static struct vec_4x4
+spu_tex_cube(vector float s, vector float t, vector float r, vector float q,
+ unsigned unit)
+{
+ struct vec_4x4 colors;
+ (void) q;
+ sample_texture_cube(s, t, r, unit, colors.v);
return colors;
}
@@ -147,7 +171,9 @@ return_function_info(void)
export_func(&funcs, "spu_pow", &spu_pow);
export_func(&funcs, "spu_exp2", &spu_exp2);
export_func(&funcs, "spu_log2", &spu_log2);
- export_func(&funcs, "spu_txp", &spu_txp);
+ export_func(&funcs, "spu_tex_2d", &spu_tex_2d);
+ export_func(&funcs, "spu_tex_3d", &spu_tex_3d);
+ export_func(&funcs, "spu_tex_cube", &spu_tex_cube);
/* Send the function info back to the PPU / main memory */
mfc_put((void *) &funcs, /* src in local store */
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 4becd0f92a4..c8bb2519050 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -40,7 +40,6 @@
#include "spu_per_fragment_op.h"
#include "spu_texture.h"
//#include "spu_test.h"
-#include "spu_debug.h"
#include "cell/common.h"
@@ -53,12 +52,6 @@ helpful headers:
struct spu_global spu;
-#if DEBUG
-boolean Debug = FALSE;
-boolean force_fragment_ops_fallback = TRUE;
-#endif
-
-
static void
one_time_init(void)
{
@@ -102,7 +95,7 @@ main(main_param_t speid, main_param_t argp)
one_time_init();
- DEBUG_PRINTF("main() speid=%lu\n", (unsigned long) speid);
+ D_PRINTF(CELL_DEBUG_CMD, "main() speid=%lu\n", (unsigned long) speid);
D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n");
/* get initialization data */
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index eff43b870ca..668af10be25 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -36,12 +36,18 @@
#include "pipe/p_state.h"
-
-#define MAX_WIDTH 1024
-#define MAX_HEIGHT 1024
-
-
-#define CELL_MAX_CONSTANTS 32 /**< number of float[4] constants */
+#if DEBUG
+/* These debug macros use the unusual construction ", ##__VA_ARGS__"
+ * which expands to the expected comma + args if variadic arguments
+ * are supplied, but swallows the comma if there are no variadic
+ * arguments (which avoids syntax errors that would otherwise occur).
+ */
+#define D_PRINTF(flag, format,...) \
+ if (spu.init.debug_flags & (flag)) \
+ printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+#else
+#define D_PRINTF(...)
+#endif
/**
@@ -64,12 +70,10 @@ typedef union {
/** Function for sampling textures */
-typedef void (*spu_sample_texture4_func)(vector float s,
- vector float t,
- vector float r,
- vector float q,
- uint unit, uint level, uint face,
- vector float colors[4]);
+typedef void (*spu_sample_texture_2d_func)(vector float s,
+ vector float t,
+ uint unit, uint level, uint face,
+ vector float colors[4]);
/** Function for performing per-fragment ops */
@@ -85,9 +89,9 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
uint facing);
/** Function for running fragment program */
-typedef void (*spu_fragment_program_func)(vector float *inputs,
- vector float *outputs,
- vector float *constants);
+typedef vector unsigned int (*spu_fragment_program_func)(vector float *inputs,
+ vector float *outputs,
+ vector float *constants);
struct spu_framebuffer
@@ -145,7 +149,9 @@ struct spu_global
struct spu_framebuffer fb;
struct pipe_depth_stencil_alpha_state depth_stencil_alpha;
struct pipe_blend_state blend;
+ struct pipe_blend_color blend_color;
struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
+ struct pipe_rasterizer_state rasterizer;
struct spu_texture texture[PIPE_MAX_SAMPLERS];
struct vertex_info vertex_info;
@@ -161,8 +167,8 @@ struct spu_global
ubyte cur_ctile_status, cur_ztile_status;
/** Status of all tiles in framebuffer */
- ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
- ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+ ubyte ctile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+ ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
/** Current fragment ops machine code, at 8-byte boundary */
uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
@@ -175,9 +181,9 @@ struct spu_global
spu_fragment_program_func fragment_program;
/** Current texture sampler function */
- spu_sample_texture4_func sample_texture4[CELL_MAX_SAMPLERS];
- spu_sample_texture4_func min_sample_texture4[CELL_MAX_SAMPLERS];
- spu_sample_texture4_func mag_sample_texture4[CELL_MAX_SAMPLERS];
+ spu_sample_texture_2d_func sample_texture_2d[CELL_MAX_SAMPLERS];
+ spu_sample_texture_2d_func min_sample_texture_2d[CELL_MAX_SAMPLERS];
+ spu_sample_texture_2d_func mag_sample_texture_2d[CELL_MAX_SAMPLERS];
/** Fragment program constants */
vector float constants[4 * CELL_MAX_CONSTANTS];
@@ -186,8 +192,6 @@ struct spu_global
extern struct spu_global spu;
-extern boolean Debug;
-
@@ -206,7 +210,7 @@ extern boolean Debug;
#define TAG_DCACHE1 21
#define TAG_DCACHE2 22
#define TAG_DCACHE3 23
-
+#define TAG_FENCE 24
static INLINE void
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index d252fa6dc18..f8ffc704926 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -40,6 +40,24 @@
#define LINEAR_QUAD_LAYOUT 1
+static INLINE vector float
+spu_min(vector float a, vector float b)
+{
+ vector unsigned int m;
+ m = spu_cmpgt(a, b); /* m = a > b ? ~0 : 0 */
+ return spu_sel(a, b, m);
+}
+
+
+static INLINE vector float
+spu_max(vector float a, vector float b)
+{
+ vector unsigned int m;
+ m = spu_cmpgt(a, b); /* m = a > b ? ~0 : 0 */
+ return spu_sel(b, a, m);
+}
+
+
/**
* Called by rasterizer for each quad after the shader has run. Do
* all the per-fragment operations including alpha test, z test,
@@ -242,7 +260,7 @@ spu_fallback_fragment_ops(uint x, uint y,
}
/*
- * Compute Src RGB terms
+ * Compute Src RGB terms (fragment color * factor)
*/
switch (spu.blend.rgb_src_factor) {
case PIPE_BLENDFACTOR_ONE:
@@ -265,13 +283,33 @@ spu_fallback_fragment_ops(uint x, uint y,
term1g = spu_mul(fragG, fragA);
term1b = spu_mul(fragB, fragA);
break;
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ term1r = spu_mul(fragR, fbRGBA[0]);
+ term1g = spu_mul(fragG, fbRGBA[1]);
+ term1b = spu_mul(fragB, fbRGBA[1]);
+ break;
+ case PIPE_BLENDFACTOR_DST_ALPHA:
+ term1r = spu_mul(fragR, fbRGBA[3]);
+ term1g = spu_mul(fragG, fbRGBA[3]);
+ term1b = spu_mul(fragB, fbRGBA[3]);
+ break;
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[0]));
+ term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[1]));
+ term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[2]));
+ break;
+ case PIPE_BLENDFACTOR_CONST_ALPHA:
+ term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[3]));
+ term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[3]));
+ term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[3]));
+ break;
/* XXX more cases */
default:
ASSERT(0);
}
/*
- * Compute Src Alpha term
+ * Compute Src Alpha term (fragment alpha * factor)
*/
switch (spu.blend.alpha_src_factor) {
case PIPE_BLENDFACTOR_ONE:
@@ -283,19 +321,29 @@ spu_fallback_fragment_ops(uint x, uint y,
case PIPE_BLENDFACTOR_SRC_ALPHA:
term1a = spu_mul(fragA, fragA);
break;
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ /* fall-through */
+ case PIPE_BLENDFACTOR_DST_ALPHA:
+ term1a = spu_mul(fragA, fbRGBA[3]);
+ break;
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ /* fall-through */
+ case PIPE_BLENDFACTOR_CONST_ALPHA:
+ term1a = spu_mul(fragR, spu_splats(spu.blend_color.color[3]));
+ break;
/* XXX more cases */
default:
ASSERT(0);
}
/*
- * Compute Dest RGB terms
+ * Compute Dest RGB terms (framebuffer color * factor)
*/
switch (spu.blend.rgb_dst_factor) {
case PIPE_BLENDFACTOR_ONE:
- term2r = fragR;
- term2g = fragG;
- term2b = fragB;
+ term2r = fbRGBA[0];
+ term2g = fbRGBA[1];
+ term2b = fbRGBA[2];
break;
case PIPE_BLENDFACTOR_ZERO:
term2r =
@@ -319,17 +367,37 @@ spu_fallback_fragment_ops(uint x, uint y,
term2g = spu_mul(fbRGBA[1], tmp);
term2b = spu_mul(fbRGBA[2], tmp);
break;
- /* XXX more cases */
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ term2r = spu_mul(fbRGBA[0], fbRGBA[0]);
+ term2g = spu_mul(fbRGBA[1], fbRGBA[1]);
+ term2b = spu_mul(fbRGBA[2], fbRGBA[2]);
+ break;
+ case PIPE_BLENDFACTOR_DST_ALPHA:
+ term2r = spu_mul(fbRGBA[0], fbRGBA[3]);
+ term2g = spu_mul(fbRGBA[1], fbRGBA[3]);
+ term2b = spu_mul(fbRGBA[2], fbRGBA[3]);
+ break;
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[0]));
+ term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[1]));
+ term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[2]));
+ break;
+ case PIPE_BLENDFACTOR_CONST_ALPHA:
+ term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[3]));
+ term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[3]));
+ term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[3]));
+ break;
+ /* XXX more cases */
default:
ASSERT(0);
}
/*
- * Compute Dest Alpha term
+ * Compute Dest Alpha term (framebuffer alpha * factor)
*/
switch (spu.blend.alpha_dst_factor) {
case PIPE_BLENDFACTOR_ONE:
- term2a = fragA;
+ term2a = fbRGBA[3];
break;
case PIPE_BLENDFACTOR_SRC_COLOR:
term2a = spu_splats(0.0f);
@@ -342,6 +410,16 @@ spu_fallback_fragment_ops(uint x, uint y,
tmp = spu_sub(one, fragA);
term2a = spu_mul(fbRGBA[3], tmp);
break;
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ /* fall-through */
+ case PIPE_BLENDFACTOR_DST_ALPHA:
+ term2a = spu_mul(fbRGBA[3], fbRGBA[3]);
+ break;
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ /* fall-through */
+ case PIPE_BLENDFACTOR_CONST_ALPHA:
+ term2a = spu_mul(fbRGBA[3], spu_splats(spu.blend_color.color[3]));
+ break;
/* XXX more cases */
default:
ASSERT(0);
@@ -361,7 +439,21 @@ spu_fallback_fragment_ops(uint x, uint y,
fragG = spu_sub(term1g, term2g);
fragB = spu_sub(term1b, term2b);
break;
- /* XXX more cases */
+ case PIPE_BLEND_REVERSE_SUBTRACT:
+ fragR = spu_sub(term2r, term1r);
+ fragG = spu_sub(term2g, term1g);
+ fragB = spu_sub(term2b, term1b);
+ break;
+ case PIPE_BLEND_MIN:
+ fragR = spu_min(term1r, term2r);
+ fragG = spu_min(term1g, term2g);
+ fragB = spu_min(term1b, term2b);
+ break;
+ case PIPE_BLEND_MAX:
+ fragR = spu_max(term1r, term2r);
+ fragG = spu_max(term1g, term2g);
+ fragB = spu_max(term1b, term2b);
+ break;
default:
ASSERT(0);
}
@@ -376,7 +468,15 @@ spu_fallback_fragment_ops(uint x, uint y,
case PIPE_BLEND_SUBTRACT:
fragA = spu_sub(term1a, term2a);
break;
- /* XXX more cases */
+ case PIPE_BLEND_REVERSE_SUBTRACT:
+ fragA = spu_sub(term2a, term1a);
+ break;
+ case PIPE_BLEND_MIN:
+ fragA = spu_min(term1a, term2a);
+ break;
+ case PIPE_BLEND_MAX:
+ fragA = spu_max(term1a, term2a);
+ break;
default:
ASSERT(0);
}
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 82dbeb26b76..5515bb55c95 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -175,22 +175,14 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
const ubyte *vertices;
const ushort *indexes;
uint i, j;
+ uint num_tiles;
-
- if (Debug) {
- printf("SPU %u: RENDER prim %u, num_vert=%u num_ind=%u "
- "inline_vert=%u\n",
- spu.init.id,
- render->prim_type,
- render->num_verts,
- render->num_indexes,
- render->inline_verts);
-
- /*
- printf(" bound: %g, %g .. %g, %g\n",
- render->xmin, render->ymin, render->xmax, render->ymax);
- */
- }
+ D_PRINTF(CELL_DEBUG_CMD,
+ "RENDER prim=%u num_vert=%u num_ind=%u inline_vert=%u\n",
+ render->prim_type,
+ render->num_verts,
+ render->num_indexes,
+ render->inline_verts);
ASSERT(sizeof(*render) % 4 == 0);
ASSERT(total_vertex_bytes % 16 == 0);
@@ -251,6 +243,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
+ num_tiles = 0;
+
/**
** loop over tiles, rendering tris
**/
@@ -264,6 +258,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
if (!my_tile(tx, ty))
continue;
+ num_tiles++;
+
spu.cur_ctile_status = spu.ctile_status[ty][tx];
spu.cur_ztile_status = spu.ztile_status[ty][tx];
@@ -279,7 +275,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
- drawn += tri_draw(v0, v1, v2, tx, ty, render->front_winding);
+ drawn += tri_draw(v0, v1, v2, tx, ty);
}
//printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
@@ -293,7 +289,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
spu.ztile_status[ty][tx] = spu.cur_ztile_status;
}
- if (Debug)
- printf("SPU %u: RENDER done\n",
- spu.init.id);
+ D_PRINTF(CELL_DEBUG_CMD,
+ "RENDER done (%u tiles hit)\n",
+ num_tiles);
}
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 42eb06a3625..69784c89788 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -72,10 +72,10 @@ invalidate_tex_cache(void)
* a time.
*/
static void
-get_four_texels(uint unit, uint level, uint face, vec_int4 x, vec_int4 y,
+get_four_texels(const struct spu_texture_level *tlevel, uint face,
+ vec_int4 x, vec_int4 y,
vec_uint4 *texels)
{
- const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
unsigned texture_ea = (uintptr_t) tlevel->start;
const vec_int4 tile_x = spu_rlmask(x, -5); /* tile_x = x / 32 */
const vec_int4 tile_y = spu_rlmask(y, -5); /* tile_y = y / 32 */
@@ -126,10 +126,9 @@ spu_clamp(vector signed int vec, vector signed int max)
* \param colors returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
*/
void
-sample_texture4_nearest(vector float s, vector float t,
- vector float r, vector float q,
- uint unit, uint level, uint face,
- vector float colors[4])
+sample_texture_2d_nearest(vector float s, vector float t,
+ uint unit, uint level, uint face,
+ vector float colors[4])
{
const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
vector float ss = spu_mul(s, tlevel->scale_s);
@@ -146,7 +145,7 @@ sample_texture4_nearest(vector float s, vector float t,
is = spu_clamp(is, tlevel->max_s);
it = spu_clamp(it, tlevel->max_t);
- get_four_texels(unit, level, face, is, it, texels);
+ get_four_texels(tlevel, face, is, it, texels);
/* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
spu_unpack_A8R8G8B8_transpose4(texels, colors);
@@ -158,10 +157,9 @@ sample_texture4_nearest(vector float s, vector float t,
* \param colors returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
*/
void
-sample_texture4_bilinear(vector float s, vector float t,
- vector float r, vector float q,
- uint unit, uint level, uint face,
- vector float colors[4])
+sample_texture_2d_bilinear(vector float s, vector float t,
+ uint unit, uint level, uint face,
+ vector float colors[4])
{
const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
@@ -190,14 +188,10 @@ sample_texture4_bilinear(vector float s, vector float t,
/* get packed int texels */
vector unsigned int texels[16];
- get_four_texels(unit, level, face, is0, it0, texels + 0); /* upper-left */
- get_four_texels(unit, level, face, is1, it0, texels + 4); /* upper-right */
- get_four_texels(unit, level, face, is0, it1, texels + 8); /* lower-left */
- get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
-
- /* XXX possibly rework following code to compute the weighted sample
- * colors with integer arithmetic for fewer int->float conversions.
- */
+ get_four_texels(tlevel, face, is0, it0, texels + 0); /* upper-left */
+ get_four_texels(tlevel, face, is1, it0, texels + 4); /* upper-right */
+ get_four_texels(tlevel, face, is0, it1, texels + 8); /* lower-left */
+ get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */
/* convert packed int texels to float colors */
vector float ftexels[16];
@@ -305,13 +299,13 @@ transpose(vector unsigned int *mOut0,
/**
- * Bilinear filtering, using int intead of float arithmetic
+ * Bilinear filtering, using int instead of float arithmetic for computing
+ * sample weights.
*/
void
-sample_texture4_bilinear_2(vector float s, vector float t,
- vector float r, vector float q,
- uint unit, uint level, uint face,
- vector float colors[4])
+sample_texture_2d_bilinear_int(vector float s, vector float t,
+ uint unit, uint level, uint face,
+ vector float colors[4])
{
const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
@@ -320,19 +314,19 @@ sample_texture4_bilinear_2(vector float s, vector float t,
vector float ss = spu_madd(s, tlevel->scale_s, half);
vector float tt = spu_madd(t, tlevel->scale_t, half);
- /* convert float coords to fixed-pt coords with 8 fraction bits */
- vector signed int is = spu_convts(ss, 8);
- vector signed int it = spu_convts(tt, 8);
+ /* convert float coords to fixed-pt coords with 7 fraction bits */
+ vector signed int is = spu_convts(ss, 7); /* XXX really need floor() here */
+ vector signed int it = spu_convts(tt, 7); /* XXX really need floor() here */
- /* compute integer texel weights in [0, 255] */
- vector signed int sWeights0 = spu_and(is, 255);
- vector signed int tWeights0 = spu_and(it, 255);
- vector signed int sWeights1 = spu_sub(255, sWeights0);
- vector signed int tWeights1 = spu_sub(255, tWeights0);
+ /* compute integer texel weights in [0, 127] */
+ vector signed int sWeights0 = spu_and(is, 127);
+ vector signed int tWeights0 = spu_and(it, 127);
+ vector signed int sWeights1 = spu_sub(127, sWeights0);
+ vector signed int tWeights1 = spu_sub(127, tWeights0);
- /* texel coords: is0 = is / 256, it0 = is / 256 */
- vector signed int is0 = spu_rlmask(is, -8);
- vector signed int it0 = spu_rlmask(it, -8);
+ /* texel coords: is0 = is / 128, it0 = is / 128 */
+ vector signed int is0 = spu_rlmask(is, -7);
+ vector signed int it0 = spu_rlmask(it, -7);
/* texel coords: i1 = is0 + 1, it1 = it0 + 1 */
vector signed int is1 = spu_add(is0, 1);
@@ -352,10 +346,10 @@ sample_texture4_bilinear_2(vector float s, vector float t,
/* get packed int texels */
vector unsigned int texels[16];
- get_four_texels(unit, level, face, is0, it0, texels + 0); /* upper-left */
- get_four_texels(unit, level, face, is1, it0, texels + 4); /* upper-right */
- get_four_texels(unit, level, face, is0, it1, texels + 8); /* lower-left */
- get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
+ get_four_texels(tlevel, face, is0, it0, texels + 0); /* upper-left */
+ get_four_texels(tlevel, face, is1, it0, texels + 4); /* upper-right */
+ get_four_texels(tlevel, face, is0, it1, texels + 8); /* lower-left */
+ get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */
/* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
{
@@ -383,36 +377,36 @@ sample_texture4_bilinear_2(vector float s, vector float t,
vector unsigned int c0, c1, c2, c3, cSum;
/* red */
- c0 = (vector unsigned int) si_mpyu((qword) texel0, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
- c1 = (vector unsigned int) si_mpyu((qword) texel4, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
- c2 = (vector unsigned int) si_mpyu((qword) texel8, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
- c3 = (vector unsigned int) si_mpyu((qword) texel12, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+ c0 = (vector unsigned int) si_mpy((qword) texel0, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+ c1 = (vector unsigned int) si_mpy((qword) texel4, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+ c2 = (vector unsigned int) si_mpy((qword) texel8, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+ c3 = (vector unsigned int) si_mpy((qword) texel12, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
- colors[0] = spu_convtf(cSum, 24);
+ colors[0] = spu_convtf(cSum, 22);
/* green */
- c0 = (vector unsigned int) si_mpyu((qword) texel1, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
- c1 = (vector unsigned int) si_mpyu((qword) texel5, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
- c2 = (vector unsigned int) si_mpyu((qword) texel9, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
- c3 = (vector unsigned int) si_mpyu((qword) texel13, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+ c0 = (vector unsigned int) si_mpy((qword) texel1, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+ c1 = (vector unsigned int) si_mpy((qword) texel5, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+ c2 = (vector unsigned int) si_mpy((qword) texel9, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+ c3 = (vector unsigned int) si_mpy((qword) texel13, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
- colors[1] = spu_convtf(cSum, 24);
+ colors[1] = spu_convtf(cSum, 22);
/* blue */
- c0 = (vector unsigned int) si_mpyu((qword) texel2, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
- c1 = (vector unsigned int) si_mpyu((qword) texel6, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
- c2 = (vector unsigned int) si_mpyu((qword) texel10, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
- c3 = (vector unsigned int) si_mpyu((qword) texel14, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+ c0 = (vector unsigned int) si_mpy((qword) texel2, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+ c1 = (vector unsigned int) si_mpy((qword) texel6, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+ c2 = (vector unsigned int) si_mpy((qword) texel10, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+ c3 = (vector unsigned int) si_mpy((qword) texel14, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
- colors[2] = spu_convtf(cSum, 24);
+ colors[2] = spu_convtf(cSum, 22);
/* alpha */
- c0 = (vector unsigned int) si_mpyu((qword) texel3, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
- c1 = (vector unsigned int) si_mpyu((qword) texel7, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
- c2 = (vector unsigned int) si_mpyu((qword) texel11, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
- c3 = (vector unsigned int) si_mpyu((qword) texel15, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+ c0 = (vector unsigned int) si_mpy((qword) texel3, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+ c1 = (vector unsigned int) si_mpy((qword) texel7, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+ c2 = (vector unsigned int) si_mpy((qword) texel11, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+ c3 = (vector unsigned int) si_mpy((qword) texel15, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
- colors[3] = spu_convtf(cSum, 24);
+ colors[3] = spu_convtf(cSum, 22);
}
@@ -420,8 +414,8 @@ sample_texture4_bilinear_2(vector float s, vector float t,
/**
* Compute level of detail factor from texcoords.
*/
-static float
-compute_lambda(uint unit, vector float s, vector float t)
+static INLINE float
+compute_lambda_2d(uint unit, vector float s, vector float t)
{
uint baseLevel = 0;
float width = spu.texture[unit].level[baseLevel].width;
@@ -430,30 +424,60 @@ compute_lambda(uint unit, vector float s, vector float t)
float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0));
float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0));
float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0));
+#if 0
+ /* ideal value */
float x = dsdx * dsdx + dtdx * dtdx;
float y = dsdy * dsdy + dtdy * dtdy;
float rho = x > y ? x : y;
rho = sqrtf(rho);
- float lambda = logf(rho) * 1.442695f;
+#else
+ /* approximation */
+ dsdx = fabsf(dsdx);
+ dsdy = fabsf(dsdy);
+ dtdx = fabsf(dtdx);
+ dtdy = fabsf(dtdy);
+ float rho = (dsdx + dsdy + dtdx + dtdy) * 0.5;
+#endif
+ float lambda = logf(rho) * 1.442695f; /* compute logbase2(rho) */
return lambda;
}
+/**
+ * Blend two sets of colors according to weight.
+ */
+static void
+blend_colors(vector float c0[4], const vector float c1[4], float weight)
+{
+ vector float t = spu_splats(weight);
+ vector float dc0 = spu_sub(c1[0], c0[0]);
+ vector float dc1 = spu_sub(c1[1], c0[1]);
+ vector float dc2 = spu_sub(c1[2], c0[2]);
+ vector float dc3 = spu_sub(c1[3], c0[3]);
+ c0[0] = spu_madd(dc0, t, c0[0]);
+ c0[1] = spu_madd(dc1, t, c0[1]);
+ c0[2] = spu_madd(dc2, t, c0[2]);
+ c0[3] = spu_madd(dc3, t, c0[3]);
+}
+
/**
- * Texture sampling with level of detail selection.
+ * Texture sampling with level of detail selection and possibly mipmap
+ * interpolation.
*/
void
-sample_texture4_lod(vector float s, vector float t,
- vector float r, vector float q,
- uint unit, uint level_ignored, uint face,
- vector float colors[4])
+sample_texture_2d_lod(vector float s, vector float t,
+ uint unit, uint level_ignored, uint face,
+ vector float colors[4])
{
/*
* Note that we're computing a lambda/lod here that's used for all
* four pixels in the quad.
*/
- float lambda = compute_lambda(unit, s, t);
+ float lambda = compute_lambda_2d(unit, s, t);
+
+ (void) face;
+ (void) level_ignored;
/* apply lod bias */
lambda += spu.sampler[unit].lod_bias;
@@ -466,15 +490,34 @@ sample_texture4_lod(vector float s, vector float t,
if (lambda <= 0.0f) {
/* magnify */
- spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, 0, colors);
+ spu.mag_sample_texture_2d[unit](s, t, unit, 0, face, colors);
}
else {
/* minify */
- int level = (int) (lambda + 0.5f);
- if (level > (int) spu.texture[unit].max_level)
- level = spu.texture[unit].max_level;
- spu.min_sample_texture4[unit](s, t, r, q, unit, level, 0, colors);
- /* XXX to do: mipmap level interpolation */
+ if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
+ /* sample two mipmap levels and interpolate */
+ int level = (int) lambda;
+ if (level > (int) spu.texture[unit].max_level)
+ level = spu.texture[unit].max_level;
+ spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors);
+ if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
+ /* sample second mipmap level */
+ float weight = lambda - (float) level;
+ level++;
+ if (level <= (int) spu.texture[unit].max_level) {
+ vector float colors2[4];
+ spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors2);
+ blend_colors(colors, colors2, weight);
+ }
+ }
+ }
+ else {
+ /* sample one mipmap level */
+ int level = (int) (lambda + 0.5f);
+ if (level > (int) spu.texture[unit].max_level)
+ level = spu.texture[unit].max_level;
+ spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors);
+ }
}
}
@@ -552,16 +595,13 @@ choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
void
-sample_texture4_cube(vector float s, vector float t,
- vector float r, vector float q,
- uint unit, uint level, uint face_ignored,
- vector float colors[4])
+sample_texture_cube(vector float s, vector float t, vector float r,
+ uint unit, vector float colors[4])
{
- static const vector float zero = {0.0f, 0.0f, 0.0f, 0.0f};
- uint p, faces[4];
+ uint p, faces[4], level = 0;
float newS[4], newT[4];
- /* Compute cube face referenced by the four sets of texcoords.
+ /* Compute cube faces referenced by the four sets of texcoords.
* XXX we should SIMD-ize this.
*/
for (p = 0; p < 4; p++) {
@@ -577,15 +617,15 @@ sample_texture4_cube(vector float s, vector float t,
/* GOOD! All four texcoords refer to the same cube face */
s = (vector float) {newS[0], newS[1], newS[2], newS[3]};
t = (vector float) {newT[0], newT[1], newT[2], newT[3]};
- sample_texture4_nearest(s, t, zero, zero, unit, level, faces[0], colors);
+ spu.sample_texture_2d[unit](s, t, unit, level, faces[0], colors);
}
else {
/* BAD! The four texcoords refer to different faces */
for (p = 0; p < 4; p++) {
vector float c[4];
- sample_texture4_nearest(spu_splats(newS[p]), spu_splats(newT[p]),
- zero, zero, unit, level, faces[p], c);
+ spu.sample_texture_2d[unit](spu_splats(newS[p]), spu_splats(newT[p]),
+ unit, level, faces[p], c);
float red = spu_extract(c[0], p);
float green = spu_extract(c[1], p);
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index 387484c3ad3..7b75b007b5a 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -37,37 +37,31 @@ invalidate_tex_cache(void);
extern void
-sample_texture4_nearest(vector float s, vector float t,
- vector float r, vector float q,
- uint unit, uint level, uint face,
- vector float colors[4]);
+sample_texture_2d_nearest(vector float s, vector float t,
+ uint unit, uint level, uint face,
+ vector float colors[4]);
extern void
-sample_texture4_bilinear(vector float s, vector float t,
- vector float r, vector float q,
- uint unit, uint level, uint face,
- vector float colors[4]);
-
-extern void
-sample_texture4_bilinear_2(vector float s, vector float t,
- vector float r, vector float q,
+sample_texture_2d_bilinear(vector float s, vector float t,
uint unit, uint level, uint face,
vector float colors[4]);
+extern void
+sample_texture_2d_bilinear_int(vector float s, vector float t,
+ uint unit, uint level, uint face,
+ vector float colors[4]);
+
extern void
-sample_texture4_lod(vector float s, vector float t,
- vector float r, vector float q,
- uint unit, uint level, uint face,
- vector float colors[4]);
+sample_texture_2d_lod(vector float s, vector float t,
+ uint unit, uint level, uint face,
+ vector float colors[4]);
extern void
-sample_texture4_cube(vector float s, vector float t,
- vector float r, vector float q,
- uint unit, uint level_ignored, uint face_ignored,
- vector float colors[4]);
+sample_texture_cube(vector float s, vector float t, vector float r,
+ uint unit, vector float colors[4]);
#endif /* SPU_TEXTURE_H */
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 03f094373df..4caf7d6b613 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -43,11 +43,6 @@
/** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
typedef vector unsigned int mask_t;
-typedef union
-{
- vector float v;
- float f[4];
-} float4;
/**
@@ -91,9 +86,9 @@ struct edge {
struct interp_coef
{
- float4 a0;
- float4 dadx;
- float4 dady;
+ vector float a0;
+ vector float dadx;
+ vector float dady;
};
@@ -116,7 +111,7 @@ struct setup_stage {
struct edge etop;
struct edge emaj;
- float oneOverArea;
+ float oneOverArea; /* XXX maybe make into vector? */
uint facing;
@@ -152,14 +147,14 @@ eval_coeff(uint slot, float x, float y, vector float w, vector float result[4])
result[QUAD_TOP_LEFT] =
result[QUAD_TOP_RIGHT] =
result[QUAD_BOTTOM_LEFT] =
- result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v;
+ result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0;
break;
case INTERP_LINEAR:
{
- vector float dadx = setup.coef[slot].dadx.v;
- vector float dady = setup.coef[slot].dady.v;
+ vector float dadx = setup.coef[slot].dadx;
+ vector float dady = setup.coef[slot].dady;
vector float topLeft =
- spu_add(setup.coef[slot].a0.v,
+ spu_add(setup.coef[slot].a0,
spu_add(spu_mul(spu_splats(x), dadx),
spu_mul(spu_splats(y), dady)));
@@ -171,10 +166,10 @@ eval_coeff(uint slot, float x, float y, vector float w, vector float result[4])
break;
case INTERP_PERSPECTIVE:
{
- vector float dadx = setup.coef[slot].dadx.v;
- vector float dady = setup.coef[slot].dady.v;
+ vector float dadx = setup.coef[slot].dadx;
+ vector float dady = setup.coef[slot].dady;
vector float topLeft =
- spu_add(setup.coef[slot].a0.v,
+ spu_add(setup.coef[slot].a0,
spu_add(spu_mul(spu_splats(x), dadx),
spu_mul(spu_splats(y), dady)));
@@ -212,9 +207,9 @@ static INLINE vector float
eval_z(float x, float y)
{
const uint slot = 0;
- const float dzdx = setup.coef[slot].dadx.f[2];
- const float dzdy = setup.coef[slot].dady.f[2];
- const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy;
+ const float dzdx = spu_extract(setup.coef[slot].dadx, 2);
+ const float dzdy = spu_extract(setup.coef[slot].dady, 2);
+ const float topLeft = spu_extract(setup.coef[slot].a0, 2) + x * dzdx + y * dzdy;
const vector float topLeftv = spu_splats(topLeft);
const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
return spu_add(topLeftv, derivs);
@@ -226,9 +221,9 @@ static INLINE vector float
eval_w(float x, float y)
{
const uint slot = 0;
- const float dwdx = setup.coef[slot].dadx.f[3];
- const float dwdy = setup.coef[slot].dady.f[3];
- const float topLeft = setup.coef[slot].a0.f[3] + x * dwdx + y * dwdy;
+ const float dwdx = spu_extract(setup.coef[slot].dadx, 3);
+ const float dwdy = spu_extract(setup.coef[slot].dady, 3);
+ const float topLeft = spu_extract(setup.coef[slot].a0, 3) + x * dwdx + y * dwdy;
const vector float topLeftv = spu_splats(topLeft);
const vector float derivs = (vector float) { 0.0, dwdx, dwdy, dwdx + dwdy };
return spu_add(topLeftv, derivs);
@@ -259,6 +254,7 @@ emit_quad( int x, int y, mask_t mask)
vector float inputs[4*4], outputs[2*4];
vector float fragZ = eval_z((float) x, (float) y);
vector float fragW = eval_w((float) x, (float) y);
+ vector unsigned int kill_mask;
/* setup inputs */
#if 0
@@ -273,7 +269,9 @@ emit_quad( int x, int y, mask_t mask)
ASSERT(spu.fragment_ops);
/* Execute the current fragment program */
- spu.fragment_program(inputs, outputs, spu.constants);
+ kill_mask = spu.fragment_program(inputs, outputs, spu.constants);
+
+ mask = spu_andc(mask, kill_mask);
/* Execute per-fragment/quad operations, including:
* alpha test, z test, stencil test, blend and framebuffer writing.
@@ -404,30 +402,41 @@ flush_spans(void)
static void
print_vertex(const struct vertex_header *v)
{
- int i;
- fprintf(stderr, "Vertex: (%p)\n", v);
- for (i = 0; i < setup.quad.nr_attrs; i++) {
- fprintf(stderr, " %d: %f %f %f %f\n", i,
- v->data[i][0], v->data[i][1], v->data[i][2], v->data[i][3]);
+ uint i;
+ fprintf(stderr, " Vertex: (%p)\n", v);
+ for (i = 0; i < spu.vertex_info.num_attribs; i++) {
+ fprintf(stderr, " %d: %f %f %f %f\n", i,
+ spu_extract(v->data[i], 0),
+ spu_extract(v->data[i], 1),
+ spu_extract(v->data[i], 2),
+ spu_extract(v->data[i], 3));
}
}
#endif
+/**
+ * Sort vertices from top to bottom.
+ * Compute area and determine front vs. back facing.
+ * Do coarse clip test against tile bounds
+ * \return FALSE if tri is totally outside tile, TRUE otherwise
+ */
static boolean
setup_sort_vertices(const struct vertex_header *v0,
const struct vertex_header *v1,
const struct vertex_header *v2)
{
+ float area, sign;
+
#if DEBUG_VERTS
- fprintf(stderr, "Triangle:\n");
- print_vertex(v0);
- print_vertex(v1);
- print_vertex(v2);
+ if (spu.init.id==0) {
+ fprintf(stderr, "SPU %u: Triangle:\n", spu.init.id);
+ print_vertex(v0);
+ print_vertex(v1);
+ print_vertex(v2);
+ }
#endif
- setup.vprovoke = v2;
-
/* determine bottom to top order of vertices */
{
float y0 = spu_extract(v0->data[0], 1);
@@ -439,18 +448,21 @@ setup_sort_vertices(const struct vertex_header *v0,
setup.vmin = v0;
setup.vmid = v1;
setup.vmax = v2;
+ sign = -1.0f;
}
else if (y2 <= y0) {
/* y2<=y0<=y1 */
setup.vmin = v2;
setup.vmid = v0;
setup.vmax = v1;
+ sign = -1.0f;
}
else {
/* y0<=y2<=y1 */
setup.vmin = v0;
setup.vmid = v2;
setup.vmax = v1;
+ sign = 1.0f;
}
}
else {
@@ -459,18 +471,21 @@ setup_sort_vertices(const struct vertex_header *v0,
setup.vmin = v1;
setup.vmid = v0;
setup.vmax = v2;
+ sign = 1.0f;
}
else if (y2 <= y1) {
/* y2<=y1<=y0 */
setup.vmin = v2;
setup.vmid = v1;
setup.vmax = v0;
+ sign = 1.0f;
}
else {
/* y1<=y2<=y0 */
setup.vmin = v1;
setup.vmid = v2;
setup.vmax = v0;
+ sign = -1.0f;
}
}
}
@@ -499,31 +514,16 @@ setup_sort_vertices(const struct vertex_header *v0,
/*
* Compute triangle's area. Use 1/area to compute partial
* derivatives of attributes later.
- *
- * The area will be the same as prim->det, but the sign may be
- * different depending on how the vertices get sorted above.
- *
- * To determine whether the primitive is front or back facing we
- * use the prim->det value because its sign is correct.
*/
- {
- const float area = (setup.emaj.dx * setup.ebot.dy -
- setup.ebot.dx * setup.emaj.dy);
-
- setup.oneOverArea = 1.0f / area;
- /*
- _mesa_printf("%s one-over-area %f area %f det %f\n",
- __FUNCTION__, setup.oneOverArea, area, prim->det );
- */
- }
+ area = setup.emaj.dx * setup.ebot.dy - setup.ebot.dx * setup.emaj.dy;
-#if 0
- /* We need to know if this is a front or back-facing triangle for:
- * - the GLSL gl_FrontFacing fragment attribute (bool)
- * - two-sided stencil test
- */
- setup.quad.facing = (prim->det > 0.0) ^ (setup.softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
-#endif
+ setup.oneOverArea = 1.0f / area;
+
+ /* The product of area * sign indicates front/back orientation (0/1) */
+ setup.facing = (area * sign > 0.0f)
+ ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
+
+ setup.vprovoke = v2;
return TRUE;
}
@@ -538,9 +538,9 @@ setup_sort_vertices(const struct vertex_header *v0,
static INLINE void
const_coeff4(uint slot)
{
- setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
- setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
- setup.coef[slot].a0.v = setup.vprovoke->data[slot];
+ setup.coef[slot].dadx = (vector float) {0.0, 0.0, 0.0, 0.0};
+ setup.coef[slot].dady = (vector float) {0.0, 0.0, 0.0, 0.0};
+ setup.coef[slot].a0 = setup.vprovoke->data[slot];
}
@@ -564,13 +564,13 @@ tri_linear_coeff4(uint slot)
vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
spu_mul(majda, spu_splats(setup.ebot.dx)));
- setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneOverArea));
- setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneOverArea));
+ setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
+ setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
- vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
- vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
+ vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
+ vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
- setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
+ setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
}
@@ -608,13 +608,13 @@ tri_persp_coeff4(uint slot)
vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
spu_mul(majda, spu_splats(setup.ebot.dx)));
- setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneOverArea));
- setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneOverArea));
+ setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
+ setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
- vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
- vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
+ vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
+ vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
- setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
+ setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
}
@@ -750,27 +750,13 @@ subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
}
-static float
-determinant(const float *v0, const float *v1, const float *v2)
-{
- /* edge vectors e = v0 - v2, f = v1 - v2 */
- const float ex = v0[0] - v2[0];
- const float ey = v0[1] - v2[1];
- const float fx = v1[0] - v2[0];
- const float fy = v1[1] - v2[1];
-
- /* det = cross(e,f).z */
- return ex * fy - ey * fx;
-}
-
-
/**
* Draw triangle into tile at (tx, ty) (tile coords)
* The tile data should have already been fetched.
*/
boolean
tri_draw(const float *v0, const float *v1, const float *v2,
- uint tx, uint ty, uint front_winding)
+ uint tx, uint ty)
{
setup.tx = tx;
setup.ty = ty;
@@ -781,12 +767,6 @@ tri_draw(const float *v0, const float *v1, const float *v2,
setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
- /* Before we sort vertices, determine the facing of the triangle,
- * which will be needed for front/back-face stencil application
- */
- float det = determinant(v0, v1, v2);
- setup.facing = (det > 0.0) ^ (front_winding == PIPE_WINDING_CW);
-
if (!setup_sort_vertices((struct vertex_header *) v0,
(struct vertex_header *) v1,
(struct vertex_header *) v2)) {
diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h
index abc3d35160e..aa694dd7c93 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.h
+++ b/src/gallium/drivers/cell/spu/spu_tri.h
@@ -31,7 +31,7 @@
extern boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding);
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
#endif /* SPU_TRI_H */