diff options
author | Brian Paul <[email protected]> | 2009-02-10 16:44:02 -0700 |
---|---|---|
committer | Brian Paul <[email protected]> | 2009-02-10 16:44:02 -0700 |
commit | 5340b6dff73a0a23531ce2a5f28fba8303adab6e (patch) | |
tree | b141fc3648568dd8b941c966059e6ed32a8bd0ad /src/gallium/drivers/cell | |
parent | 9fd26daec24f21dbe17afcb2e2ab272667ee9a69 (diff) | |
parent | ee4c921b65fb76998711f3c40330505cbc49a0e0 (diff) |
Merge commit 'origin/gallium-master-merge'
This is the big merge of the gallium-0.2 branch into master.
gallium-master-merge was just the staging area for it.
Both gallium-0.2 and gallium-master-merge are considered closed now.
Conflicts:
progs/demos/Makefile
src/mesa/main/state.c
src/mesa/main/texenvprogram.c
Diffstat (limited to 'src/gallium/drivers/cell')
72 files changed, 18885 insertions, 0 deletions
diff --git a/src/gallium/drivers/cell/Makefile b/src/gallium/drivers/cell/Makefile new file mode 100644 index 00000000000..47aef7b05f6 --- /dev/null +++ b/src/gallium/drivers/cell/Makefile @@ -0,0 +1,12 @@ +# Cell Gallium driver Makefile + + +default: + ( cd spu ; make ) + ( cd ppu ; make ) + + + +clean: + ( cd spu ; make clean ) + ( cd ppu ; make clean ) diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h new file mode 100644 index 00000000000..1f6860da119 --- /dev/null +++ b/src/gallium/drivers/cell/common.h @@ -0,0 +1,376 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Types and tokens which are common to the SPU and PPU code. + */ + + +#ifndef CELL_COMMON_H +#define CELL_COMMON_H + +#include "pipe/p_compiler.h" +#include "pipe/p_format.h" +#include "pipe/p_state.h" + + +/** The standard assert macro doesn't seem to work reliably */ +#define ASSERT(x) \ + if (!(x)) { \ + ubyte *p = NULL; \ + fprintf(stderr, "%s:%d: %s(): assertion %s failed.\n", \ + __FILE__, __LINE__, __FUNCTION__, #x); \ + *p = 0; \ + exit(1); \ + } + + + +#define JOIN(x, y) JOIN_AGAIN(x, y) +#define JOIN_AGAIN(x, y) x ## y + +#define STATIC_ASSERT(e) \ +{typedef char JOIN(assertion_failed_at_line_, __LINE__) [(e) ? 1 : -1];} + + + +/** for sanity checking */ +#define ASSERT_ALIGN16(ptr) \ + ASSERT((((unsigned long) (ptr)) & 0xf) == 0); + + +/** round up value to next multiple of 4 */ +#define ROUNDUP4(k) (((k) + 0x3) & ~0x3) + +/** round up value to next multiple of 8 */ +#define ROUNDUP8(k) (((k) + 0x7) & ~0x7) + +/** round up value to next multiple of 16 */ +#define ROUNDUP16(k) (((k) + 0xf) & ~0xf) + + +#define CELL_MAX_SPUS 8 + +#define CELL_MAX_SAMPLERS 4 +#define CELL_MAX_TEXTURE_LEVELS 12 /* 2k x 2k */ +#define CELL_MAX_CONSTANTS 32 /**< number of float[4] constants */ +#define CELL_MAX_WIDTH 1024 /**< max framebuffer width */ +#define CELL_MAX_HEIGHT 1024 /**< max framebuffer width */ + +#define TILE_SIZE 32 + + +/** + * The low byte of a mailbox word contains the command opcode. + * Remaining higher bytes are command specific. + */ +#define CELL_CMD_OPCODE_MASK 0xff + +#define CELL_CMD_EXIT 1 +#define CELL_CMD_CLEAR_SURFACE 2 +#define CELL_CMD_FINISH 3 +#define CELL_CMD_RENDER 4 +#define CELL_CMD_BATCH 5 +#define CELL_CMD_RELEASE_VERTS 6 +#define CELL_CMD_STATE_FRAMEBUFFER 10 +#define CELL_CMD_STATE_FRAGMENT_OPS 11 +#define CELL_CMD_STATE_SAMPLER 12 +#define CELL_CMD_STATE_TEXTURE 13 +#define CELL_CMD_STATE_VERTEX_INFO 14 +#define CELL_CMD_STATE_VIEWPORT 15 +#define CELL_CMD_STATE_UNIFORMS 16 +#define CELL_CMD_STATE_VS_ARRAY_INFO 17 +#define CELL_CMD_STATE_BIND_VS 18 +#define CELL_CMD_STATE_FRAGMENT_PROGRAM 19 +#define CELL_CMD_STATE_ATTRIB_FETCH 20 +#define CELL_CMD_STATE_FS_CONSTANTS 21 +#define CELL_CMD_STATE_RASTERIZER 22 +#define CELL_CMD_VS_EXECUTE 23 +#define CELL_CMD_FLUSH_BUFFER_RANGE 24 +#define CELL_CMD_FENCE 25 + + +/** Command/batch buffers */ +#define CELL_NUM_BUFFERS 4 +#define CELL_BUFFER_SIZE (4*1024) /**< 16KB would be the max */ + +#define CELL_BUFFER_STATUS_FREE 10 +#define CELL_BUFFER_STATUS_USED 20 + +/** Debug flags */ +#define CELL_DEBUG_CHECKER (1 << 0) +#define CELL_DEBUG_ASM (1 << 1) +#define CELL_DEBUG_SYNC (1 << 2) +#define CELL_DEBUG_FRAGMENT_OPS (1 << 3) +#define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4) +#define CELL_DEBUG_CMD (1 << 5) +#define CELL_DEBUG_CACHE (1 << 6) + +#define CELL_FENCE_IDLE 0 +#define CELL_FENCE_EMITTED 1 +#define CELL_FENCE_SIGNALLED 2 + +#define CELL_FACING_FRONT 0 +#define CELL_FACING_BACK 1 + +struct cell_fence +{ + /** There's a 16-byte status qword per SPU */ + volatile uint status[CELL_MAX_SPUS][4]; +}; + +#ifdef __SPU__ +typedef vector unsigned int opcode_t; +#else +typedef unsigned int opcode_t[4]; +#endif + +/** + * Fence command sent to SPUs. In response, the SPUs will write + * CELL_FENCE_STATUS_SIGNALLED back to the fence status word in main memory. + */ +struct cell_command_fence +{ + opcode_t opcode; /**< CELL_CMD_FENCE */ + struct cell_fence *fence; + uint32_t pad_[3]; +}; + + +/** + * Command to specify per-fragment operations state and generated code. + * Note that this is a variant-length structure, allocated with as + * much memory as needed to hold the generated code; the "code" + * field *must* be the last field in the structure. Also, the entire + * length of the structure (including the variant code field) must be + * a multiple of 8 bytes; we require that this structure itself be + * a multiple of 8 bytes, and that the generated code also be a multiple + * of 8 bytes. + * + * Also note that the dsa, blend, blend_color fields are really only needed + * for the fallback/C per-pixel code. They're not used when we generate + * dynamic SPU fragment code (which is the normal case), and will eventually + * be removed from this structure. + */ +struct cell_command_fragment_ops +{ + opcode_t opcode; /**< CELL_CMD_STATE_FRAGMENT_OPS */ + + /* Fields for the fallback case */ + struct pipe_depth_stencil_alpha_state dsa; + struct pipe_blend_state blend; + struct pipe_blend_color blend_color; + + /* Fields for the generated SPU code */ + unsigned total_code_size; + unsigned front_code_index; + unsigned back_code_index; + /* this field has variant length, and must be the last field in + * the structure + */ + unsigned code[0]; +}; + + +/** Max instructions for fragment programs */ +#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 512 + +/** + * Command to send a fragment program to SPUs. + */ +struct cell_command_fragment_program +{ + opcode_t opcode; /**< CELL_CMD_STATE_FRAGMENT_PROGRAM */ + uint num_inst; /**< Number of instructions */ + uint32_t pad[3]; + unsigned code[SPU_MAX_FRAGMENT_PROGRAM_INSTS]; +}; + + +/** + * Tell SPUs about the framebuffer size, location + */ +struct cell_command_framebuffer +{ + opcode_t opcode; /**< CELL_CMD_STATE_FRAMEBUFFER */ + int width, height; + void *color_start, *depth_start; + enum pipe_format color_format, depth_format; + uint32_t pad_[2]; +}; + + +/** + * Tell SPUs about rasterizer state. + */ +struct cell_command_rasterizer +{ + opcode_t opcode; /**< CELL_CMD_STATE_RASTERIZER */ + struct pipe_rasterizer_state rasterizer; +}; + + +/** + * Clear framebuffer to the given value/color. + */ +struct cell_command_clear_surface +{ + opcode_t opcode; /**< CELL_CMD_CLEAR_SURFACE */ + uint surface; /**< Temporary: 0=color, 1=Z */ + uint value; + uint32_t pad[2]; +}; + + +/** + * Array info used by the vertex shader's vertex puller. + */ +struct cell_array_info +{ + uint64_t base; /**< Base address of the 0th element. */ + uint attr; /**< Attribute that this state is for. */ + uint pitch; /**< Byte pitch from one entry to the next. */ + uint size; + uint function_offset; +}; + + +struct cell_attribute_fetch_code +{ + uint64_t base; + uint size; +}; + + +struct cell_buffer_range +{ + uint64_t base; + unsigned size; +}; + + +struct cell_shader_info +{ + uint64_t declarations; + uint64_t instructions; + uint64_t immediates; + + unsigned num_outputs; + unsigned num_declarations; + unsigned num_instructions; + unsigned num_immediates; +}; + + +#define SPU_VERTS_PER_BATCH 64 +struct cell_command_vs +{ + opcode_t opcode; /**< CELL_CMD_VS_EXECUTE */ + uint64_t vOut[SPU_VERTS_PER_BATCH]; + unsigned num_elts; + unsigned elts[SPU_VERTS_PER_BATCH]; + float plane[12][4]; + unsigned nr_planes; + unsigned nr_attrs; +}; + + +struct cell_command_render +{ + opcode_t opcode; /**< CELL_CMD_RENDER */ + uint prim_type; /**< PIPE_PRIM_x */ + uint num_verts; + uint vertex_size; /**< bytes per vertex */ + uint num_indexes; + uint vertex_buf; /**< which cell->buffer[] contains the vertex data */ + float xmin, ymin, xmax, ymax; /* XXX another dummy field */ + uint min_index; + boolean inline_verts; + uint32_t pad_[1]; +}; + + +struct cell_command_release_verts +{ + opcode_t opcode; /**< CELL_CMD_RELEASE_VERTS */ + uint vertex_buf; /**< in [0, CELL_NUM_BUFFERS-1] */ + uint32_t pad_[3]; +}; + + +struct cell_command_sampler +{ + opcode_t opcode; /**< CELL_CMD_STATE_SAMPLER */ + uint unit; + struct pipe_sampler_state state; + uint32_t pad_[1]; +}; + + +struct cell_command_texture +{ + opcode_t opcode; /**< CELL_CMD_STATE_TEXTURE */ + uint target; /**< PIPE_TEXTURE_x */ + uint unit; + void *start[CELL_MAX_TEXTURE_LEVELS]; /**< Address in main memory */ + ushort width[CELL_MAX_TEXTURE_LEVELS]; + ushort height[CELL_MAX_TEXTURE_LEVELS]; + ushort depth[CELL_MAX_TEXTURE_LEVELS]; +}; + + +#define MAX_SPU_FUNCTIONS 12 +/** + * Used to tell the PPU about the address of particular functions in the + * SPU's address space. + */ +struct cell_spu_function_info +{ + uint num; + char names[MAX_SPU_FUNCTIONS][16]; + uint addrs[MAX_SPU_FUNCTIONS]; + char pad[12]; /**< Pad struct to multiple of 16 bytes (256 currently) */ +}; + + +/** This is the object passed to spe_create_thread() */ +struct cell_init_info +{ + unsigned id; + unsigned num_spus; + unsigned debug_flags; /**< mask of CELL_DEBUG_x flags */ + float inv_timebase; /**< 1.0/timebase, for perf measurement */ + + /** Buffers for command batches, vertex/index data */ + ubyte *buffers[CELL_NUM_BUFFERS]; + uint *buffer_status; /**< points at cell_context->buffer_status */ + + struct cell_spu_function_info *spu_functions; +} ALIGN16_ATTRIB; + + +#endif /* CELL_COMMON_H */ diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile new file mode 100644 index 00000000000..c92f8e5cba2 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/Makefile @@ -0,0 +1,86 @@ +# Gallium3D Cell driver: PPU code + +# This makefile builds the libcell.a library which gets pulled into +# the main libGL.so library + + +TOP = ../../../../.. +include $(TOP)/configs/current + + +# This is the "top-level" cell PPU driver code, will get pulled into libGL.so +# by the winsys Makefile. +CELL_LIB = ../libcell.a + + +# This is the SPU code. We'd like to be able to put this into the libcell.a +# archive with the PPU code, but nesting .a libs doesn't seem to work. +# So, it's pulled into libGL.so in gallium/winsys/xlib/Makefile +SPU_CODE_MODULE = ../spu/g3d_spu.a + + +SOURCES = \ + cell_batch.c \ + cell_clear.c \ + cell_context.c \ + cell_draw_arrays.c \ + cell_fence.c \ + cell_flush.c \ + cell_gen_fragment.c \ + cell_gen_fp.c \ + cell_state_derived.c \ + cell_state_emit.c \ + cell_state_shader.c \ + cell_pipe_state.c \ + cell_screen.c \ + cell_state_vertex.c \ + cell_spu.c \ + cell_surface.c \ + cell_texture.c \ + cell_vbuf.c \ + cell_vertex_fetch.c \ + cell_vertex_shader.c + + +OBJECTS = $(SOURCES:.c=.o) \ + +INCLUDE_DIRS = \ + -I$(TOP)/src/mesa \ + -I$(TOP)/src/gallium/include \ + -I$(TOP)/src/gallium/auxiliary \ + -I$(TOP)/src/gallium/drivers + +.c.o: + $(CC) -c $(INCLUDE_DIRS) $(CFLAGS) $< -o $@ + + +.c.s: + $(CC) -S $(INCLUDE_DIRS) $(CFLAGS) $< -o $@ + + +default: $(CELL_LIB) + + +$(CELL_LIB): $(OBJECTS) $(SPU_CODE_MODULE) +# ar -ru $(CELL_LIB) $(OBJECTS) $(SPU_CODE_MODULE) # doesn't work + ar -ru $(CELL_LIB) $(OBJECTS) + +#$(PROG): $(PPU_OBJECTS) +# $(CC) -o $(PROG) $(PPU_OBJECTS) $(SPU_CODE_MODULE) $(PPU_LFLAGS) + + + +clean: + rm -f *.o *~ $(CELL_LIB) + + + +depend: $(SOURCES) + rm -f depend + touch depend + $(MKDEP) $(MKDEP_OPTIONS) $(INCLUDE_DIRS) $(SOURCES) 2> /dev/null + +include depend + + + diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c new file mode 100644 index 00000000000..fe144f8b849 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_batch.c @@ -0,0 +1,260 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "cell_context.h" +#include "cell_batch.h" +#include "cell_fence.h" +#include "cell_spu.h" + + + +/** + * Search the buffer pool for an empty/free buffer and return its index. + * Buffers are used for storing vertex data, state and commands which + * will be sent to the SPUs. + * If no empty buffers are available, wait for one. + * \return buffer index in [0, CELL_NUM_BUFFERS-1] + */ +uint +cell_get_empty_buffer(struct cell_context *cell) +{ + static uint prev_buffer = 0; + uint buf = (prev_buffer + 1) % CELL_NUM_BUFFERS; + uint tries = 0; + + /* Find a buffer that's marked as free by all SPUs */ + while (1) { + uint spu, num_free = 0; + + for (spu = 0; spu < cell->num_spus; spu++) { + if (cell->buffer_status[spu][buf][0] == CELL_BUFFER_STATUS_FREE) { + num_free++; + + if (num_free == cell->num_spus) { + /* found a free buffer, now mark status as used */ + for (spu = 0; spu < cell->num_spus; spu++) { + cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED; + } + /* + printf("PPU: ALLOC BUFFER %u, %u tries\n", buf, tries); + */ + prev_buffer = buf; + + /* release tex buffer associated w/ prev use of this batch buf */ + cell_free_fenced_buffers(cell, &cell->fenced_buffers[buf]); + + return buf; + } + } + else { + break; + } + } + + /* try next buf */ + buf = (buf + 1) % CELL_NUM_BUFFERS; + + tries++; + if (tries == 100) { + /* + printf("PPU WAITING for buffer...\n"); + */ + } + } +} + + +/** + * Append a fence command to the current batch buffer. + * Note that we're sure there's always room for this because of the + * adjusted size check in cell_batch_free_space(). + */ +static void +emit_fence(struct cell_context *cell) +{ + const uint batch = cell->cur_batch; + const uint size = cell->buffer_size[batch]; + struct cell_command_fence *fence_cmd; + struct cell_fence *fence = &cell->fenced_buffers[batch].fence; + uint i; + + /* set fence status to emitted, not yet signalled */ + for (i = 0; i < cell->num_spus; i++) { + fence->status[i][0] = CELL_FENCE_EMITTED; + } + + STATIC_ASSERT(sizeof(struct cell_command_fence) % 16 == 0); + ASSERT(size % 16 == 0); + ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE); + + fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size); + fence_cmd->opcode[0] = CELL_CMD_FENCE; + fence_cmd->fence = fence; + + /* update batch buffer size */ + cell->buffer_size[batch] = size + sizeof(struct cell_command_fence); +} + + +/** + * Flush the current batch buffer to the SPUs. + * An empty buffer will be found and set as the new current batch buffer + * for subsequent commands/data. + */ +void +cell_batch_flush(struct cell_context *cell) +{ + static boolean flushing = FALSE; + uint batch = cell->cur_batch; + uint size = cell->buffer_size[batch]; + uint spu, cmd_word; + + assert(!flushing); + + if (size == 0) + return; + + /* Before we use this batch buffer, make sure any fenced texture buffers + * are released. + */ + if (cell->fenced_buffers[batch].head) { + emit_fence(cell); + size = cell->buffer_size[batch]; + } + + flushing = TRUE; + + assert(batch < CELL_NUM_BUFFERS); + + /* + printf("cell_batch_dispatch: buf %u at %p, size %u\n", + batch, &cell->buffer[batch][0], size); + */ + + /* + * Build "BATCH" command and send to all SPUs. + */ + cmd_word = CELL_CMD_BATCH | (batch << 8) | (size << 16); + + for (spu = 0; spu < cell->num_spus; spu++) { + assert(cell->buffer_status[spu][batch][0] == CELL_BUFFER_STATUS_USED); + send_mbox_message(cell_global.spe_contexts[spu], cmd_word); + } + + /* When the SPUs are done copying the buffer into their locals stores + * they'll write a BUFFER_STATUS_FREE message into the buffer_status[] + * array indicating that the PPU can re-use the buffer. + */ + + batch = cell_get_empty_buffer(cell); + + cell->buffer_size[batch] = 0; /* empty */ + cell->cur_batch = batch; + + flushing = FALSE; +} + + +/** + * Return the number of bytes free in the current batch buffer. + */ +uint +cell_batch_free_space(const struct cell_context *cell) +{ + uint free = CELL_BUFFER_SIZE - cell->buffer_size[cell->cur_batch]; + free -= sizeof(struct cell_command_fence); + return free; +} + + +/** + * Allocate space in the current batch buffer for 'bytes' space. + * Bytes must be a multiple of 16 bytes. Allocation will be 16 byte aligned. + * \return address in batch buffer to put data + */ +void * +cell_batch_alloc16(struct cell_context *cell, uint bytes) +{ + void *pos; + uint size; + + ASSERT(bytes % 16 == 0); + ASSERT(bytes <= CELL_BUFFER_SIZE); + ASSERT(cell->cur_batch >= 0); + +#ifdef ASSERT + { + uint spu; + for (spu = 0; spu < cell->num_spus; spu++) { + ASSERT(cell->buffer_status[spu][cell->cur_batch][0] + == CELL_BUFFER_STATUS_USED); + } + } +#endif + + size = cell->buffer_size[cell->cur_batch]; + + if (bytes > cell_batch_free_space(cell)) { + cell_batch_flush(cell); + size = 0; + } + + ASSERT(size % 16 == 0); + ASSERT(size + bytes <= CELL_BUFFER_SIZE); + + pos = (void *) (cell->buffer[cell->cur_batch] + size); + + cell->buffer_size[cell->cur_batch] = size + bytes; + + return pos; +} + + +/** + * One-time init of batch buffers. + */ +void +cell_init_batch_buffers(struct cell_context *cell) +{ + uint spu, buf; + + /* init command, vertex/index buffer info */ + for (buf = 0; buf < CELL_NUM_BUFFERS; buf++) { + cell->buffer_size[buf] = 0; + + /* init batch buffer status values, + * mark 0th buffer as used, rest as free. + */ + for (spu = 0; spu < cell->num_spus; spu++) { + if (buf == 0) + cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED; + else + cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_FREE; + } + } +} diff --git a/src/gallium/drivers/cell/ppu/cell_batch.h b/src/gallium/drivers/cell/ppu/cell_batch.h new file mode 100644 index 00000000000..290136031a1 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_batch.h @@ -0,0 +1,54 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_BATCH_H +#define CELL_BATCH_H + +#include "pipe/p_compiler.h" + + +struct cell_context; + + +extern uint +cell_get_empty_buffer(struct cell_context *cell); + +extern void +cell_batch_flush(struct cell_context *cell); + +extern uint +cell_batch_free_space(const struct cell_context *cell); + +extern void * +cell_batch_alloc16(struct cell_context *cell, uint bytes); + +extern void +cell_init_batch_buffers(struct cell_context *cell); + + +#endif /* CELL_BATCH_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c new file mode 100644 index 00000000000..c2e276988ca --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_clear.c @@ -0,0 +1,123 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Authors + * Brian Paul + */ + +#include <stdio.h> +#include <assert.h> +#include <stdint.h> +#include "pipe/p_inlines.h" +#include "util/u_memory.h" +#include "util/u_pack_color.h" +#include "cell/common.h" +#include "cell_clear.h" +#include "cell_context.h" +#include "cell_batch.h" +#include "cell_flush.h" +#include "cell_spu.h" +#include "cell_state.h" + + +/** + * Convert packed pixel from one format to another. + */ +static unsigned +convert_color(enum pipe_format srcFormat, unsigned srcColor, + enum pipe_format dstFormat) +{ + ubyte r, g, b, a; + unsigned dstColor; + + util_unpack_color_ub(srcFormat, &srcColor, &r, &g, &b, &a); + util_pack_color_ub(r, g, b, a, dstFormat, &dstColor); + + return dstColor; +} + + + +/** + * Called via pipe->clear() + */ +void +cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue) +{ + struct pipe_screen *screen = pipe->screen; + struct cell_context *cell = cell_context(pipe); + uint surfIndex; + + if (cell->dirty) + cell_update_derived(cell); + + + if (!cell->cbuf_map[0]) + cell->cbuf_map[0] = screen->surface_map(screen, ps, + PIPE_BUFFER_USAGE_GPU_WRITE); + + if (ps == cell->framebuffer.zsbuf) { + /* clear z/stencil buffer */ + surfIndex = 1; + } + else { + /* clear color buffer */ + surfIndex = 0; + + if (ps->format != PIPE_FORMAT_A8R8G8B8_UNORM) { + clearValue = convert_color(PIPE_FORMAT_A8R8G8B8_UNORM, clearValue, + ps->format); + } + } + + + /* Build a CLEAR command and place it in the current batch buffer */ + { + STATIC_ASSERT(sizeof(struct cell_command_clear_surface) % 16 == 0); + struct cell_command_clear_surface *clr + = (struct cell_command_clear_surface *) + cell_batch_alloc16(cell, sizeof(*clr)); + clr->opcode[0] = CELL_CMD_CLEAR_SURFACE; + clr->surface = surfIndex; + clr->value = clearValue; + } + + /* Technically, the surface's contents are now known and cleared, + * so we could set the status to PIPE_SURFACE_STATUS_CLEAR. But + * it turns out it's quite painful to recognize when any particular + * surface goes from PIPE_SURFACE_STATUS_CLEAR to + * PIPE_SURFACE_STATUS_DEFINED (i.e. with known contents), because + * the drawing commands could be operating on numerous draw buffers, + * which we'd have to iterate through to set all their stati... + * For now, we cheat a bit and set the surface's status to DEFINED + * right here. Later we should revisit this and set the status to + * CLEAR here, and find a better place to set the status to DEFINED. + */ + ps->status = PIPE_SURFACE_STATUS_DEFINED; +} diff --git a/src/gallium/drivers/cell/ppu/cell_clear.h b/src/gallium/drivers/cell/ppu/cell_clear.h new file mode 100644 index 00000000000..ff47d43f4cd --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_clear.h @@ -0,0 +1,43 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_CLEAR_H +#define CELL_CLEAR_H + + +struct pipe_context; +struct pipe_surface; + + +extern void +cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue); + + + +#endif /* CELL_CLEAR_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c new file mode 100644 index 00000000000..ae82ded334a --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_context.c @@ -0,0 +1,183 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Authors + * Brian Paul + */ + + +#include <stdio.h> + +#include "pipe/p_defines.h" +#include "pipe/p_format.h" +#include "util/u_memory.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_screen.h" + +#include "draw/draw_context.h" +#include "draw/draw_private.h" + +#include "cell/common.h" +#include "cell_batch.h" +#include "cell_clear.h" +#include "cell_context.h" +#include "cell_draw_arrays.h" +#include "cell_fence.h" +#include "cell_flush.h" +#include "cell_state.h" +#include "cell_surface.h" +#include "cell_spu.h" +#include "cell_pipe_state.h" +#include "cell_texture.h" +#include "cell_vbuf.h" + + + +static void +cell_destroy_context( struct pipe_context *pipe ) +{ + struct cell_context *cell = cell_context(pipe); + + util_delete_keymap(cell->fragment_ops_cache, NULL); + + cell_spu_exit(cell); + + align_free(cell); +} + + +static struct draw_context * +cell_draw_create(struct cell_context *cell) +{ + struct draw_context *draw = draw_create(); + +#if 0 /* broken */ + if (getenv("GALLIUM_CELL_VS")) { + /* plug in SPU-based vertex transformation code */ + draw->shader_queue_flush = cell_vertex_shader_queue_flush; + draw->driver_private = cell; + } +#endif + + return draw; +} + + +static const struct debug_named_value cell_debug_flags[] = { + {"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */ + {"asm", CELL_DEBUG_ASM}, /**< dump SPU asm code */ + {"sync", CELL_DEBUG_SYNC}, /**< SPUs do synchronous DMA */ + {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/ + {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/ + {"cmd", CELL_DEBUG_CMD}, /**< SPUs dump command buffer info */ + {"cache", CELL_DEBUG_CACHE}, /**< report texture cache stats on exit */ + {NULL, 0} +}; + + +struct pipe_context * +cell_create_context(struct pipe_screen *screen, + struct cell_winsys *cws) +{ + struct cell_context *cell; + uint i; + + /* some fields need to be 16-byte aligned, so align the whole object */ + cell = (struct cell_context*) align_malloc(sizeof(struct cell_context), 16); + if (!cell) + return NULL; + + memset(cell, 0, sizeof(*cell)); + + cell->winsys = cws; + cell->pipe.winsys = screen->winsys; + cell->pipe.screen = screen; + cell->pipe.destroy = cell_destroy_context; + + cell->pipe.clear = cell_clear_surface; + cell->pipe.flush = cell_flush; + +#if 0 + cell->pipe.begin_query = cell_begin_query; + cell->pipe.end_query = cell_end_query; + cell->pipe.wait_query = cell_wait_query; +#endif + + cell_init_draw_functions(cell); + cell_init_state_functions(cell); + cell_init_shader_functions(cell); + cell_init_surface_functions(cell); + cell_init_vertex_functions(cell); + + cell->draw = cell_draw_create(cell); + + /* Create cache of fragment ops generated code */ + cell->fragment_ops_cache = + util_new_keymap(sizeof(struct cell_fragment_ops_key), ~0, NULL); + + cell_init_vbuf(cell); + + draw_set_rasterize_stage(cell->draw, cell->vbuf); + + /* convert all points/lines to tris for the time being */ + draw_wide_point_threshold(cell->draw, 0.0); + draw_wide_line_threshold(cell->draw, 0.0); + + /* get env vars or read config file to get debug flags */ + cell->debug_flags = debug_get_flags_option("CELL_DEBUG", + cell_debug_flags, + 0 ); + + for (i = 0; i < CELL_NUM_BUFFERS; i++) + cell_fence_init(&cell->fenced_buffers[i].fence); + + + /* + * SPU stuff + */ + /* This call only works with SDK 3.0. Anyone still using 2.1??? */ + cell->num_cells = spe_cpu_info_get(SPE_COUNT_PHYSICAL_CPU_NODES, -1); + cell->num_spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1); + if (cell->debug_flags) { + printf("Cell: found %d Cell(s) with %u SPUs\n", + cell->num_cells, cell->num_spus); + } + if (getenv("CELL_NUM_SPUS")) { + cell->num_spus = atoi(getenv("CELL_NUM_SPUS")); + assert(cell->num_spus > 0); + } + + cell_start_spus(cell); + + cell_init_batch_buffers(cell); + + /* make sure SPU initializations are done before proceeding */ + cell_flush_int(cell, CELL_FLUSH_WAIT); + + return &cell->pipe; +} diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h new file mode 100644 index 00000000000..eb1397bb3fa --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_context.h @@ -0,0 +1,205 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_CONTEXT_H +#define CELL_CONTEXT_H + + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "draw/draw_vertex.h" +#include "draw/draw_vbuf.h" +#include "cell_winsys.h" +#include "cell/common.h" +#include "rtasm/rtasm_ppc_spe.h" +#include "tgsi/tgsi_scan.h" +#include "util/u_keymap.h" + + +struct cell_vbuf_render; + + +/** + * Cell vertex shader state, subclass of pipe_shader_state. + */ +struct cell_vertex_shader_state +{ + struct pipe_shader_state shader; + struct tgsi_shader_info info; + void *draw_data; +}; + + +/** + * Cell fragment shader state, subclass of pipe_shader_state. + */ +struct cell_fragment_shader_state +{ + struct pipe_shader_state shader; + struct tgsi_shader_info info; + struct spe_function code; + void *data; +}; + + +/** + * Key for mapping per-fragment state to cached SPU machine code. + * keymap(cell_fragment_ops_key) => cell_command_fragment_ops + */ +struct cell_fragment_ops_key +{ + struct pipe_blend_state blend; + struct pipe_blend_color blend_color; + struct pipe_depth_stencil_alpha_state dsa; + enum pipe_format color_format; + enum pipe_format zs_format; +}; + + +struct cell_buffer_node; + +/** + * Fenced buffer list. List of buffers which can be unreferenced after + * the fence has been executed/signalled. + */ +struct cell_buffer_list +{ + struct cell_fence fence ALIGN16_ATTRIB; + struct cell_buffer_node *head; +}; + + +/** + * Per-context state, subclass of pipe_context. + */ +struct cell_context +{ + struct pipe_context pipe; + + struct cell_winsys *winsys; + + const struct pipe_blend_state *blend; + const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS]; + uint num_samplers; + const struct pipe_depth_stencil_alpha_state *depth_stencil; + const struct pipe_rasterizer_state *rasterizer; + const struct cell_vertex_shader_state *vs; + const struct cell_fragment_shader_state *fs; + + struct spe_function logic_op; + + struct pipe_blend_color blend_color; + struct pipe_clip_state clip; + struct pipe_constant_buffer constants[2]; + struct pipe_framebuffer_state framebuffer; + struct pipe_poly_stipple poly_stipple; + struct pipe_scissor_state scissor; + struct cell_texture *texture[PIPE_MAX_SAMPLERS]; + uint num_textures; + struct pipe_viewport_state viewport; + struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; + uint num_vertex_buffers; + struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS]; + uint num_vertex_elements; + + ubyte *cbuf_map[PIPE_MAX_COLOR_BUFS]; + ubyte *zsbuf_map; + + struct pipe_surface *tex_surf; + uint *tex_map; + + uint dirty; + uint dirty_textures; /* bitmask of texture units */ + uint dirty_samplers; /* bitmask of sampler units */ + + /** Cache of code generated for per-fragment ops */ + struct keymap *fragment_ops_cache; + + /** The primitive drawing context */ + struct draw_context *draw; + struct draw_stage *render_stage; + + /** For post-transformed vertex buffering: */ + struct cell_vbuf_render *vbuf_render; + struct draw_stage *vbuf; + + struct vertex_info vertex_info; + + /** Mapped constant buffers */ + void *mapped_constants[PIPE_SHADER_TYPES]; + + struct cell_spu_function_info spu_functions ALIGN16_ATTRIB; + + uint num_cells, num_spus; + + /** Buffers for command batches, vertex/index data */ + uint buffer_size[CELL_NUM_BUFFERS]; + ubyte buffer[CELL_NUM_BUFFERS][CELL_BUFFER_SIZE] ALIGN16_ATTRIB; + + int cur_batch; /**< which buffer is being filled w/ commands */ + + /** [4] to ensure 16-byte alignment for each status word */ + uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB; + + + /** Associated with each command/batch buffer is a list of pipe_buffers + * that are fenced. When the last command in a buffer is executed, the + * fence will be signalled, indicating that any pipe_buffers preceeding + * that fence can be unreferenced (and probably freed). + */ + struct cell_buffer_list fenced_buffers[CELL_NUM_BUFFERS]; + + + struct spe_function attrib_fetch; + unsigned attrib_fetch_offsets[PIPE_MAX_ATTRIBS]; + + unsigned debug_flags; +}; + + + + +static INLINE struct cell_context * +cell_context(struct pipe_context *pipe) +{ + return (struct cell_context *) pipe; +} + + +extern struct pipe_context * +cell_create_context(struct pipe_screen *screen, struct cell_winsys *cws); + +extern void +cell_vertex_shader_queue_flush(struct draw_context *draw); + + +/* XXX find a better home for this */ +extern void cell_update_vertex_fetch(struct draw_context *draw); + + +#endif /* CELL_CONTEXT_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c new file mode 100644 index 00000000000..644496db40c --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c @@ -0,0 +1,191 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Author: + * Brian Paul + * Keith Whitwell + */ + + +#include "pipe/p_defines.h" +#include "pipe/p_context.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_inlines.h" + +#include "cell_context.h" +#include "cell_draw_arrays.h" +#include "cell_state.h" +#include "cell_flush.h" + +#include "draw/draw_context.h" + + + +static void +cell_map_constant_buffers(struct cell_context *sp) +{ + struct pipe_winsys *ws = sp->pipe.winsys; + uint i; + for (i = 0; i < 2; i++) { + if (sp->constants[i].buffer && sp->constants[i].buffer->size) { + sp->mapped_constants[i] = ws->buffer_map(ws, sp->constants[i].buffer, + PIPE_BUFFER_USAGE_CPU_READ); + cell_flush_buffer_range(sp, sp->mapped_constants[i], + sp->constants[i].buffer->size); + } + } + + draw_set_mapped_constant_buffer(sp->draw, + sp->mapped_constants[PIPE_SHADER_VERTEX], + sp->constants[PIPE_SHADER_VERTEX].buffer->size); +} + +static void +cell_unmap_constant_buffers(struct cell_context *sp) +{ + struct pipe_winsys *ws = sp->pipe.winsys; + uint i; + for (i = 0; i < 2; i++) { + if (sp->constants[i].buffer && sp->constants[i].buffer->size) + ws->buffer_unmap(ws, sp->constants[i].buffer); + sp->mapped_constants[i] = NULL; + } +} + + + +/** + * Draw vertex arrays, with optional indexing. + * Basically, map the vertex buffers (and drawing surfaces), then hand off + * the drawing to the 'draw' module. + * + * XXX should the element buffer be specified/bound with a separate function? + */ +static boolean +cell_draw_range_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned min_index, + unsigned max_index, + unsigned mode, unsigned start, unsigned count) +{ + struct cell_context *sp = cell_context(pipe); + struct draw_context *draw = sp->draw; + unsigned i; + + if (sp->dirty) + cell_update_derived( sp ); + +#if 0 + cell_map_surfaces(sp); +#endif + cell_map_constant_buffers(sp); + + /* + * Map vertex buffers + */ + for (i = 0; i < sp->num_vertex_buffers; i++) { + void *buf = pipe_buffer_map(pipe->screen, + sp->vertex_buffer[i].buffer, + PIPE_BUFFER_USAGE_CPU_READ); + cell_flush_buffer_range(sp, buf, sp->vertex_buffer[i].buffer->size); + draw_set_mapped_vertex_buffer(draw, i, buf); + } + /* Map index buffer, if present */ + if (indexBuffer) { + void *mapped_indexes = pipe_buffer_map(pipe->screen, + indexBuffer, + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_element_buffer(draw, indexSize, mapped_indexes); + } + else { + /* no index/element buffer */ + draw_set_mapped_element_buffer(draw, 0, NULL); + } + + + /* draw! */ + draw_arrays(draw, mode, start, count); + + /* + * unmap vertex/index buffers - will cause draw module to flush + */ + for (i = 0; i < sp->num_vertex_buffers; i++) { + draw_set_mapped_vertex_buffer(draw, i, NULL); + pipe_buffer_unmap(pipe->screen, sp->vertex_buffer[i].buffer); + } + if (indexBuffer) { + draw_set_mapped_element_buffer(draw, 0, NULL); + pipe_buffer_unmap(pipe->screen, indexBuffer); + } + + /* Note: leave drawing surfaces mapped */ + cell_unmap_constant_buffers(sp); + + return TRUE; +} + + +static boolean +cell_draw_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned mode, unsigned start, unsigned count) +{ + return cell_draw_range_elements( pipe, indexBuffer, + indexSize, + 0, 0xffffffff, + mode, start, count ); +} + + +static boolean +cell_draw_arrays(struct pipe_context *pipe, unsigned mode, + unsigned start, unsigned count) +{ + return cell_draw_elements(pipe, NULL, 0, mode, start, count); +} + + +static void +cell_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags) +{ + struct cell_context *cell = cell_context(pipe); + draw_set_edgeflags(cell->draw, edgeflags); +} + + + +void +cell_init_draw_functions(struct cell_context *cell) +{ + cell->pipe.draw_arrays = cell_draw_arrays; + cell->pipe.draw_elements = cell_draw_elements; + cell->pipe.draw_range_elements = cell_draw_range_elements; + cell->pipe.set_edgeflags = cell_set_edgeflags; +} + diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.h b/src/gallium/drivers/cell/ppu/cell_draw_arrays.h new file mode 100644 index 00000000000..148873aa675 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.h @@ -0,0 +1,36 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef CELL_DRAW_ARRAYS_H +#define CELL_DRAW_ARRAYS_H + + +extern void +cell_init_draw_functions(struct cell_context *cell); + + +#endif /* CELL_DRAW_ARRAYS_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c new file mode 100644 index 00000000000..867b5dcaa09 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_fence.c @@ -0,0 +1,168 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include <unistd.h> +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "cell_context.h" +#include "cell_batch.h" +#include "cell_fence.h" +#include "cell_texture.h" + + +void +cell_fence_init(struct cell_fence *fence) +{ + uint i; + ASSERT_ALIGN16(fence->status); + for (i = 0; i < CELL_MAX_SPUS; i++) { + fence->status[i][0] = CELL_FENCE_IDLE; + } +} + + +boolean +cell_fence_signalled(const struct cell_context *cell, + const struct cell_fence *fence) +{ + uint i; + for (i = 0; i < cell->num_spus; i++) { + if (fence->status[i][0] != CELL_FENCE_SIGNALLED) + return FALSE; + /*assert(fence->status[i][0] == CELL_FENCE_EMITTED);*/ + } + return TRUE; +} + + +void +cell_fence_finish(const struct cell_context *cell, + const struct cell_fence *fence) +{ + while (!cell_fence_signalled(cell, fence)) { + usleep(10); + } + +#ifdef DEBUG + { + uint i; + for (i = 0; i < cell->num_spus; i++) { + assert(fence->status[i][0] == CELL_FENCE_SIGNALLED); + } + } +#endif +} + + + + +struct cell_buffer_node +{ + struct pipe_buffer *buffer; + struct cell_buffer_node *next; +}; + + +static void +cell_add_buffer_to_list(struct cell_context *cell, + struct cell_buffer_list *list, + struct pipe_buffer *buffer) +{ + struct pipe_screen *ps = cell->pipe.screen; + struct cell_buffer_node *node = CALLOC_STRUCT(cell_buffer_node); + /* create new list node which references the buffer, insert at head */ + if (node) { + pipe_buffer_reference(ps, &node->buffer, buffer); + node->next = list->head; + list->head = node; + } +} + + +/** + * Wait for completion of the given fence, then unreference any buffers + * on the list. + * This typically unrefs/frees texture buffers after any rendering which uses + * them has completed. + */ +void +cell_free_fenced_buffers(struct cell_context *cell, + struct cell_buffer_list *list) +{ + if (list->head) { + struct pipe_screen *ps = cell->pipe.screen; + struct cell_buffer_node *node; + + cell_fence_finish(cell, &list->fence); + + /* traverse the list, unreferencing buffers, freeing nodes */ + node = list->head; + while (node) { + struct cell_buffer_node *next = node->next; + assert(node->buffer); + pipe_buffer_unmap(ps, node->buffer); +#if 0 + printf("Unref buffer %p\n", node->buffer); + if (node->buffer->refcount == 1) + printf(" Delete!\n"); +#endif + pipe_buffer_reference(ps, &node->buffer, NULL); + FREE(node); + node = next; + } + list->head = NULL; + } +} + + +/** + * This should be called for each render command. + * Any texture buffers that are current bound will be added to a fenced + * list to be freed later when the fence is executed/signalled. + */ +void +cell_add_fenced_textures(struct cell_context *cell) +{ + struct cell_buffer_list *list = &cell->fenced_buffers[cell->cur_batch]; + uint i; + + for (i = 0; i < cell->num_textures; i++) { + struct cell_texture *ct = cell->texture[i]; + if (ct) { + uint level; + for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) { + if (ct->tiled_buffer[level]) { +#if 0 + printf("Adding texture %p buffer %p to list\n", + ct, ct->tiled_buffer[level]); +#endif + cell_add_buffer_to_list(cell, list, ct->tiled_buffer[level]); + } + } + } + } +} diff --git a/src/gallium/drivers/cell/ppu/cell_fence.h b/src/gallium/drivers/cell/ppu/cell_fence.h new file mode 100644 index 00000000000..536b4ba411a --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_fence.h @@ -0,0 +1,57 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_FENCE_H +#define CELL_FENCE_H + + +extern void +cell_fence_init(struct cell_fence *fence); + + +extern boolean +cell_fence_signalled(const struct cell_context *cell, + const struct cell_fence *fence); + + +extern void +cell_fence_finish(const struct cell_context *cell, + const struct cell_fence *fence); + + + +extern void +cell_free_fenced_buffers(struct cell_context *cell, + struct cell_buffer_list *list); + + +extern void +cell_add_fenced_textures(struct cell_context *cell); + + +#endif /* CELL_FENCE_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c new file mode 100644 index 00000000000..8275c9dc9c7 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_flush.c @@ -0,0 +1,112 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "cell_context.h" +#include "cell_batch.h" +#include "cell_flush.h" +#include "cell_spu.h" +#include "cell_render.h" +#include "draw/draw_context.h" + + +/** + * Called via pipe->flush() + */ +void +cell_flush(struct pipe_context *pipe, unsigned flags, + struct pipe_fence_handle **fence) +{ + struct cell_context *cell = cell_context(pipe); + + if (fence) { + *fence = NULL; + /* XXX: Implement real fencing */ + flags |= CELL_FLUSH_WAIT; + } + + if (flags & (PIPE_FLUSH_SWAPBUFFERS | PIPE_FLUSH_RENDER_CACHE)) + flags |= CELL_FLUSH_WAIT; + + draw_flush( cell->draw ); + cell_flush_int(cell, flags); +} + + +/** + * Cell internal flush function. Send the current batch buffer to all SPUs. + * If flags & CELL_FLUSH_WAIT, do not return until the SPUs are idle. + * \param flags bitmask of flags CELL_FLUSH_WAIT, or zero + */ +void +cell_flush_int(struct cell_context *cell, unsigned flags) +{ + static boolean flushing = FALSE; /* recursion catcher */ + uint i; + + ASSERT(!flushing); + flushing = TRUE; + + if (flags & CELL_FLUSH_WAIT) { + STATIC_ASSERT(sizeof(opcode_t) % 16 == 0); + opcode_t *cmd = (opcode_t*) cell_batch_alloc16(cell, sizeof(opcode_t)); + *cmd[0] = CELL_CMD_FINISH; + } + + cell_batch_flush(cell); + +#if 0 + /* Send CMD_FINISH to all SPUs */ + for (i = 0; i < cell->num_spus; i++) { + send_mbox_message(cell_global.spe_contexts[i], CELL_CMD_FINISH); + } +#endif + + if (flags & CELL_FLUSH_WAIT) { + /* Wait for ack */ + for (i = 0; i < cell->num_spus; i++) { + uint k = wait_mbox_message(cell_global.spe_contexts[i]); + assert(k == CELL_CMD_FINISH); + } + } + + flushing = FALSE; +} + + +void +cell_flush_buffer_range(struct cell_context *cell, void *ptr, + unsigned size) +{ + STATIC_ASSERT((sizeof(opcode_t) + sizeof(struct cell_buffer_range)) % 16 == 0); + uint32_t *batch = (uint32_t*)cell_batch_alloc16(cell, + sizeof(opcode_t) + sizeof(struct cell_buffer_range)); + struct cell_buffer_range *br = (struct cell_buffer_range *) &batch[4]; + batch[0] = CELL_CMD_FLUSH_BUFFER_RANGE; + br->base = (uintptr_t) ptr; + br->size = size; +} diff --git a/src/gallium/drivers/cell/ppu/cell_flush.h b/src/gallium/drivers/cell/ppu/cell_flush.h new file mode 100644 index 00000000000..509ae6239ac --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_flush.h @@ -0,0 +1,45 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_FLUSH +#define CELL_FLUSH + +#define CELL_FLUSH_WAIT 0x80000000 + +extern void +cell_flush(struct pipe_context *pipe, unsigned flags, + struct pipe_fence_handle **fence); + +extern void +cell_flush_int(struct cell_context *cell, unsigned flags); + +extern void +cell_flush_buffer_range(struct cell_context *cell, void *ptr, + unsigned size); + +#endif diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c new file mode 100644 index 00000000000..5a889a6119d --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c @@ -0,0 +1,2046 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * Copyright 2009 VMware, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + + +/** + * Generate SPU fragment program/shader code. + * + * Note that we generate SOA-style code here. So each TGSI instruction + * operates on four pixels (and is translated into four SPU instructions, + * generally speaking). + * + * \author Brian Paul + */ + +#include <math.h> +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "tgsi/tgsi_exec.h" +#include "tgsi/tgsi_dump.h" +#include "rtasm/rtasm_ppc_spe.h" +#include "util/u_memory.h" +#include "cell_context.h" +#include "cell_gen_fp.h" + + +#define MAX_TEMPS 16 +#define MAX_IMMED 8 + +#define CHAN_X 0 +#define CHAN_Y 1 +#define CHAN_Z 2 +#define CHAN_W 3 + +/** + * Context needed during code generation. + */ +struct codegen +{ + struct cell_context *cell; + int inputs_reg; /**< 1st function parameter */ + int outputs_reg; /**< 2nd function parameter */ + int constants_reg; /**< 3rd function parameter */ + int temp_regs[MAX_TEMPS][4]; /**< maps TGSI temps to SPE registers */ + int imm_regs[MAX_IMMED][4]; /**< maps TGSI immediates to SPE registers */ + + int num_imm; /**< number of immediates */ + + int one_reg; /**< register containing {1.0, 1.0, 1.0, 1.0} */ + + int addr_reg; /**< address register, integer values */ + + /** Per-instruction temps / intermediate temps */ + int num_itemps; + int itemps[12]; + + /** Current IF/ELSE/ENDIF nesting level */ + int if_nesting; + /** Current BGNLOOP/ENDLOOP nesting level */ + int loop_nesting; + /** Location of start of current loop */ + int loop_start; + + /** Index of if/conditional mask register */ + int cond_mask_reg; + /** Index of loop mask register */ + int loop_mask_reg; + + /** Index of master execution mask register */ + int exec_mask_reg; + + /** KIL mask: indicates which fragments have been killed */ + int kill_mask_reg; + + int frame_size; /**< Stack frame size, in words */ + + struct spe_function *f; + boolean error; +}; + + +/** + * Allocate an intermediate temporary register. + */ +static int +get_itemp(struct codegen *gen) +{ + int t = spe_allocate_available_register(gen->f); + assert(gen->num_itemps < Elements(gen->itemps)); + gen->itemps[gen->num_itemps++] = t; + return t; +} + +/** + * Free all intermediate temporary registers. To be called after each + * instruction has been emitted. + */ +static void +free_itemps(struct codegen *gen) +{ + int i; + for (i = 0; i < gen->num_itemps; i++) { + spe_release_register(gen->f, gen->itemps[i]); + } + gen->num_itemps = 0; +} + + +/** + * Return index of an SPE register containing {1.0, 1.0, 1.0, 1.0}. + * The register is allocated and initialized upon the first call. + */ +static int +get_const_one_reg(struct codegen *gen) +{ + if (gen->one_reg <= 0) { + gen->one_reg = spe_allocate_available_register(gen->f); + + spe_indent(gen->f, 4); + spe_comment(gen->f, -4, "init constant reg = 1.0:"); + + /* one = {1.0, 1.0, 1.0, 1.0} */ + spe_load_float(gen->f, gen->one_reg, 1.0f); + + spe_indent(gen->f, -4); + } + + return gen->one_reg; +} + + +/** + * Return index of the address register. + * Used for indirect register loads/stores. + */ +static int +get_address_reg(struct codegen *gen) +{ + if (gen->addr_reg <= 0) { + gen->addr_reg = spe_allocate_available_register(gen->f); + + spe_indent(gen->f, 4); + spe_comment(gen->f, -4, "init address reg = 0:"); + + /* init addr = {0, 0, 0, 0} */ + spe_zero(gen->f, gen->addr_reg); + + spe_indent(gen->f, -4); + } + + return gen->addr_reg; +} + + +/** + * Return index of the master execution mask. + * The register is allocated an initialized upon the first call. + * + * The master execution mask controls which pixels in a quad are + * modified, according to surrounding conditionals, loops, etc. + */ +static int +get_exec_mask_reg(struct codegen *gen) +{ + if (gen->exec_mask_reg <= 0) { + gen->exec_mask_reg = spe_allocate_available_register(gen->f); + + /* XXX this may not be needed */ + spe_comment(gen->f, 0*-4, "initialize master execution mask = ~0"); + spe_load_int(gen->f, gen->exec_mask_reg, ~0); + } + + return gen->exec_mask_reg; +} + + +/** Return index of the conditional (if/else) execution mask register */ +static int +get_cond_mask_reg(struct codegen *gen) +{ + if (gen->cond_mask_reg <= 0) { + gen->cond_mask_reg = spe_allocate_available_register(gen->f); + } + + return gen->cond_mask_reg; +} + + +/** Return index of the loop execution mask register */ +static int +get_loop_mask_reg(struct codegen *gen) +{ + if (gen->loop_mask_reg <= 0) { + gen->loop_mask_reg = spe_allocate_available_register(gen->f); + } + + return gen->loop_mask_reg; +} + + + +static boolean +is_register_src(struct codegen *gen, int channel, + const struct tgsi_full_src_register *src) +{ + int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel); + int sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel); + + if (swizzle > TGSI_SWIZZLE_W || sign_op != TGSI_UTIL_SIGN_KEEP) { + return FALSE; + } + if (src->SrcRegister.File == TGSI_FILE_TEMPORARY || + src->SrcRegister.File == TGSI_FILE_IMMEDIATE) { + return TRUE; + } + return FALSE; +} + + +static boolean +is_memory_dst(struct codegen *gen, int channel, + const struct tgsi_full_dst_register *dst) +{ + if (dst->DstRegister.File == TGSI_FILE_OUTPUT) { + return TRUE; + } + else { + return FALSE; + } +} + + +/** + * Return the index of the SPU temporary containing the named TGSI + * source register. If the TGSI register is a TGSI_FILE_TEMPORARY we + * just return the corresponding SPE register. If the TGIS register + * is TGSI_FILE_INPUT/CONSTANT/IMMEDIATE we allocate a new SPE register + * and emit an SPE load instruction. + */ +static int +get_src_reg(struct codegen *gen, + int channel, + const struct tgsi_full_src_register *src) +{ + int reg = -1; + int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel); + boolean reg_is_itemp = FALSE; + uint sign_op; + + assert(swizzle >= TGSI_SWIZZLE_X); + assert(swizzle <= TGSI_EXTSWIZZLE_ONE); + + if (swizzle == TGSI_EXTSWIZZLE_ONE) { + /* Load const one float and early out */ + reg = get_const_one_reg(gen); + } + else if (swizzle == TGSI_EXTSWIZZLE_ZERO) { + /* Load const zero float and early out */ + reg = get_itemp(gen); + spe_xor(gen->f, reg, reg, reg); + } + else { + int index = src->SrcRegister.Index; + + assert(swizzle < 4); + + if (src->SrcRegister.Indirect) { + /* XXX unfinished */ + } + + switch (src->SrcRegister.File) { + case TGSI_FILE_TEMPORARY: + reg = gen->temp_regs[index][swizzle]; + break; + case TGSI_FILE_INPUT: + { + /* offset is measured in quadwords, not bytes */ + int offset = index * 4 + swizzle; + reg = get_itemp(gen); + reg_is_itemp = TRUE; + /* Load: reg = memory[(machine_reg) + offset] */ + spe_lqd(gen->f, reg, gen->inputs_reg, offset * 16); + } + break; + case TGSI_FILE_IMMEDIATE: + reg = gen->imm_regs[index][swizzle]; + break; + case TGSI_FILE_CONSTANT: + { + /* offset is measured in quadwords, not bytes */ + int offset = index * 4 + swizzle; + reg = get_itemp(gen); + reg_is_itemp = TRUE; + /* Load: reg = memory[(machine_reg) + offset] */ + spe_lqd(gen->f, reg, gen->constants_reg, offset * 16); + } + break; + default: + assert(0); + } + } + + /* + * Handle absolute value, negate or set-negative of src register. + */ + sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel); + if (sign_op != TGSI_UTIL_SIGN_KEEP) { + /* + * All sign ops are done by manipulating bit 31, the IEEE float sign bit. + */ + const int bit31mask_reg = get_itemp(gen); + int result_reg; + + if (reg_is_itemp) { + /* re-use 'reg' for the result */ + result_reg = reg; + } + else { + /* alloc a new reg for the result */ + result_reg = get_itemp(gen); + } + + /* mask with bit 31 set, the rest cleared */ + spe_load_uint(gen->f, bit31mask_reg, (1 << 31)); + + if (sign_op == TGSI_UTIL_SIGN_CLEAR) { + spe_andc(gen->f, result_reg, reg, bit31mask_reg); + } + else if (sign_op == TGSI_UTIL_SIGN_SET) { + spe_and(gen->f, result_reg, reg, bit31mask_reg); + } + else { + assert(sign_op == TGSI_UTIL_SIGN_TOGGLE); + spe_xor(gen->f, result_reg, reg, bit31mask_reg); + } + + reg = result_reg; + } + + return reg; +} + + +/** + * Return the index of an SPE register to use for the given TGSI register. + * If the TGSI register is TGSI_FILE_TEMPORARAY, the index of the + * corresponding SPE register is returned. If the TGSI register is + * TGSI_FILE_OUTPUT we allocate an intermediate temporary register. + * See store_dest_reg() below... + */ +static int +get_dst_reg(struct codegen *gen, + int channel, + const struct tgsi_full_dst_register *dest) +{ + int reg = -1; + + switch (dest->DstRegister.File) { + case TGSI_FILE_TEMPORARY: + if (gen->if_nesting > 0 || gen->loop_nesting > 0) + reg = get_itemp(gen); + else + reg = gen->temp_regs[dest->DstRegister.Index][channel]; + break; + case TGSI_FILE_OUTPUT: + reg = get_itemp(gen); + break; + default: + assert(0); + } + + return reg; +} + + +/** + * When a TGSI instruction is writing to an output register, this + * function emits the SPE store instruction to store the value_reg. + * \param value_reg the SPE register containing the value to store. + * This would have been returned by get_dst_reg(). + */ +static void +store_dest_reg(struct codegen *gen, + int value_reg, int channel, + const struct tgsi_full_dst_register *dest) +{ + /* + * XXX need to implement dst reg clamping/saturation + */ +#if 0 + switch (inst->Instruction.Saturate) { + case TGSI_SAT_NONE: + break; + case TGSI_SAT_ZERO_ONE: + break; + case TGSI_SAT_MINUS_PLUS_ONE: + break; + default: + assert( 0 ); + } +#endif + + switch (dest->DstRegister.File) { + case TGSI_FILE_TEMPORARY: + if (gen->if_nesting > 0 || gen->loop_nesting > 0) { + int d_reg = gen->temp_regs[dest->DstRegister.Index][channel]; + int exec_reg = get_exec_mask_reg(gen); + /* Mix d with new value according to exec mask: + * d[i] = mask_reg[i] ? value_reg : d_reg + */ + spe_selb(gen->f, d_reg, d_reg, value_reg, exec_reg); + } + else { + /* we're not inside a condition or loop: do nothing special */ + + } + break; + case TGSI_FILE_OUTPUT: + { + /* offset is measured in quadwords, not bytes */ + int offset = dest->DstRegister.Index * 4 + channel; + if (gen->if_nesting > 0 || gen->loop_nesting > 0) { + int exec_reg = get_exec_mask_reg(gen); + int curval_reg = get_itemp(gen); + /* First read the current value from memory: + * Load: curval = memory[(machine_reg) + offset] + */ + spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset * 16); + /* Mix curval with newvalue according to exec mask: + * d[i] = mask_reg[i] ? value_reg : d_reg + */ + spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg); + /* Store: memory[(machine_reg) + offset] = curval */ + spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset * 16); + } + else { + /* Store: memory[(machine_reg) + offset] = reg */ + spe_stqd(gen->f, value_reg, gen->outputs_reg, offset * 16); + } + } + break; + default: + assert(0); + } +} + + + +static void +emit_prologue(struct codegen *gen) +{ + gen->frame_size = 1024; /* XXX temporary, should be dynamic */ + + spe_comment(gen->f, 0, "Function prologue:"); + + /* save $lr on stack # stqd $lr,16($sp) */ + spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16); + + if (gen->frame_size >= 512) { + /* offset is too large for ai instruction */ + int offset_reg = spe_allocate_available_register(gen->f); + int sp_reg = spe_allocate_available_register(gen->f); + /* offset = -framesize */ + spe_load_int(gen->f, offset_reg, -gen->frame_size); + /* sp = $sp */ + spe_move(gen->f, sp_reg, SPE_REG_SP); + /* $sp = $sp + offset_reg */ + spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg); + /* save $sp in stack frame */ + spe_stqd(gen->f, sp_reg, SPE_REG_SP, 0); + /* clean up */ + spe_release_register(gen->f, offset_reg); + spe_release_register(gen->f, sp_reg); + } + else { + /* save stack pointer # stqd $sp,-frameSize($sp) */ + spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size); + + /* adjust stack pointer # ai $sp,$sp,-frameSize */ + spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size); + } +} + + +static void +emit_epilogue(struct codegen *gen) +{ + const int return_reg = 3; + + spe_comment(gen->f, 0, "Function epilogue:"); + + spe_comment(gen->f, 0, "return the killed mask"); + if (gen->kill_mask_reg > 0) { + /* shader called KIL, return the "alive" mask */ + spe_move(gen->f, return_reg, gen->kill_mask_reg); + } + else { + /* return {0,0,0,0} */ + spe_load_uint(gen->f, return_reg, 0); + } + + spe_comment(gen->f, 0, "restore stack and return"); + if (gen->frame_size >= 512) { + /* offset is too large for ai instruction */ + int offset_reg = spe_allocate_available_register(gen->f); + /* offset = framesize */ + spe_load_int(gen->f, offset_reg, gen->frame_size); + /* $sp = $sp + offset */ + spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg); + /* clean up */ + spe_release_register(gen->f, offset_reg); + } + else { + /* restore stack pointer # ai $sp,$sp,frameSize */ + spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, gen->frame_size); + } + + /* restore $lr # lqd $lr,16($sp) */ + spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16); + + /* return from function call */ + spe_bi(gen->f, SPE_REG_RA, 0, 0); +} + + +#define FOR_EACH_ENABLED_CHANNEL(inst, ch) \ + for (ch = 0; ch < 4; ch++) \ + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) + + +static boolean +emit_ARL(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch = 0, src_reg, addr_reg; + + src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + addr_reg = get_address_reg(gen); + + /* convert float to int */ + spe_cflts(gen->f, addr_reg, src_reg, 0); + + free_itemps(gen); + + return TRUE; +} + + +static boolean +emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, src_reg[4], dst_reg[4]; + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + src_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + dst_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + if (is_register_src(gen, ch, &inst->FullSrcRegisters[0]) && + is_memory_dst(gen, ch, &inst->FullDstRegisters[0])) { + /* special-case: register to memory store */ + store_dest_reg(gen, src_reg[ch], ch, &inst->FullDstRegisters[0]); + } + else { + spe_move(gen->f, dst_reg[ch], src_reg[ch]); + store_dest_reg(gen, dst_reg[ch], ch, &inst->FullDstRegisters[0]); + } + } + + free_itemps(gen); + + return TRUE; +} + +/** + * Emit binary operation + */ +static boolean +emit_binop(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], s2_reg[4], d_reg[4]; + + /* Loop over Red/Green/Blue/Alpha channels, fetch src operands */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + + /* Loop over Red/Green/Blue/Alpha channels, do the op, store results */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + /* Emit actual SPE instruction: d = s1 + s2 */ + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ADD: + spe_fa(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + break; + case TGSI_OPCODE_SUB: + spe_fs(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + break; + case TGSI_OPCODE_MUL: + spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + break; + default: + ; + } + } + + /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + + /* Free any intermediate temps we allocated */ + free_itemps(gen); + + return TRUE; +} + + +/** + * Emit multiply add. See emit_ADD for comments. + */ +static boolean +emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4]; + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fma(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch], s3_reg[ch]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + free_itemps(gen); + return TRUE; +} + + +/** + * Emit linear interpolate. See emit_ADD for comments. + */ +static boolean +emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4], tmp_reg[4]; + + /* setup/get src/dst/temp regs */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + tmp_reg[ch] = get_itemp(gen); + } + + /* d = s3 + s1(s2 - s3) */ + /* do all subtracts, then all fma, then all stores to better pipeline */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fs(gen->f, tmp_reg[ch], s2_reg[ch], s3_reg[ch]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fma(gen->f, d_reg[ch], tmp_reg[ch], s1_reg[ch], s3_reg[ch]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + free_itemps(gen); + return TRUE; +} + + + +/** + * Emit reciprocal or recip sqrt. + */ +static boolean +emit_RCP_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], d_reg[4], tmp_reg[4]; + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + tmp_reg[ch] = get_itemp(gen); + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + if (inst->Instruction.Opcode == TGSI_OPCODE_RCP) { + /* tmp = 1/s1 */ + spe_frest(gen->f, tmp_reg[ch], s1_reg[ch]); + } + else { + /* tmp = 1/sqrt(s1) */ + spe_frsqest(gen->f, tmp_reg[ch], s1_reg[ch]); + } + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + /* d = float_interp(s1, tmp) */ + spe_fi(gen->f, d_reg[ch], s1_reg[ch], tmp_reg[ch]); + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + + +/** + * Emit absolute value. See emit_ADD for comments. + */ +static boolean +emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], d_reg[4]; + const int bit31mask_reg = get_itemp(gen); + + /* mask with bit 31 set, the rest cleared */ + spe_load_uint(gen->f, bit31mask_reg, (1 << 31)); + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + + /* d = sign bit cleared in s1 */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_andc(gen->f, d_reg[ch], s1_reg[ch], bit31mask_reg); + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + +/** + * Emit 3 component dot product. See emit_ADD for comments. + */ +static boolean +emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + int s1x_reg, s1y_reg, s1z_reg; + int s2x_reg, s2y_reg, s2z_reg; + int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen); + + s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + s2x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s2y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + s2z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + + /* t0 = x0 * x1 */ + spe_fm(gen->f, t0_reg, s1x_reg, s2x_reg); + + /* t1 = y0 * y1 */ + spe_fm(gen->f, t1_reg, s1y_reg, s2y_reg); + + /* t0 = z0 * z1 + t0 */ + spe_fma(gen->f, t0_reg, s1z_reg, s2z_reg, t0_reg); + + /* t0 = t0 + t1 */ + spe_fa(gen->f, t0_reg, t0_reg, t1_reg); + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + spe_move(gen->f, d_reg, t0_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + +/** + * Emit 4 component dot product. See emit_ADD for comments. + */ +static boolean +emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + int s0x_reg, s0y_reg, s0z_reg, s0w_reg; + int s1x_reg, s1y_reg, s1z_reg, s1w_reg; + int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen); + + s0x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + s0y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + s0z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + s0w_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]); + s1w_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]); + + /* t0 = x0 * x1 */ + spe_fm(gen->f, t0_reg, s0x_reg, s1x_reg); + + /* t1 = y0 * y1 */ + spe_fm(gen->f, t1_reg, s0y_reg, s1y_reg); + + /* t0 = z0 * z1 + t0 */ + spe_fma(gen->f, t0_reg, s0z_reg, s1z_reg, t0_reg); + + /* t1 = w0 * w1 + t1 */ + spe_fma(gen->f, t1_reg, s0w_reg, s1w_reg, t1_reg); + + /* t0 = t0 + t1 */ + spe_fa(gen->f, t0_reg, t0_reg, t1_reg); + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + spe_move(gen->f, d_reg, t0_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + +/** + * Emit homogeneous dot product. See emit_ADD for comments. + */ +static boolean +emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + /* XXX rewrite this function to look more like DP3/DP4 */ + int ch; + int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + int tmp_reg = get_itemp(gen); + + /* t = x0 * x1 */ + spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + + s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + /* t = y0 * y1 + t */ + spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + /* t = z0 * z1 + t */ + spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]); + /* t = w1 + t */ + spe_fa(gen->f, tmp_reg, s2_reg, tmp_reg); + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + spe_move(gen->f, d_reg, tmp_reg); + store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + +/** + * Emit 3-component vector normalize. + */ +static boolean +emit_NRM3(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + int src_reg[3]; + int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen); + + src_reg[0] = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + src_reg[1] = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + src_reg[2] = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + + /* t0 = x * x */ + spe_fm(gen->f, t0_reg, src_reg[0], src_reg[0]); + + /* t1 = y * y */ + spe_fm(gen->f, t1_reg, src_reg[1], src_reg[1]); + + /* t0 = z * z + t0 */ + spe_fma(gen->f, t0_reg, src_reg[2], src_reg[2], t0_reg); + + /* t0 = t0 + t1 */ + spe_fa(gen->f, t0_reg, t0_reg, t1_reg); + + /* t1 = 1.0 / sqrt(t0) */ + spe_frsqest(gen->f, t1_reg, t0_reg); + spe_fi(gen->f, t1_reg, t0_reg, t1_reg); + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + /* dst = src[ch] * t1 */ + spe_fm(gen->f, d_reg, src_reg[ch], t1_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + + +/** + * Emit cross product. See emit_ADD for comments. + */ +static boolean +emit_XPD(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + int tmp_reg = get_itemp(gen); + + /* t = z0 * y1 */ + spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + + s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + /* t = y0 * z1 - t */ + spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_X)) { + store_dest_reg(gen, tmp_reg, CHAN_X, &inst->FullDstRegisters[0]); + } + + s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + /* t = x0 * z1 */ + spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + + s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + /* t = z0 * x1 - t */ + spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Y)) { + store_dest_reg(gen, tmp_reg, CHAN_Y, &inst->FullDstRegisters[0]); + } + + s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + /* t = y0 * x1 */ + spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + + s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + /* t = x0 * y1 - t */ + spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Z)) { + store_dest_reg(gen, tmp_reg, CHAN_Z, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + + +/** + * Emit inequality instruction. + * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as + * the result but OpenGL/TGSI needs 0.0 and 1.0 results. + * We can easily convert 0x0/0xffffffff to 0.0/1.0 with a bitwise AND. + */ +static boolean +emit_inequality(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], s2_reg[4], d_reg[4], one_reg; + bool complement = FALSE; + + one_reg = get_const_one_reg(gen); + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_SGT: + spe_fcgt(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + break; + case TGSI_OPCODE_SLT: + spe_fcgt(gen->f, d_reg[ch], s2_reg[ch], s1_reg[ch]); + break; + case TGSI_OPCODE_SGE: + spe_fcgt(gen->f, d_reg[ch], s2_reg[ch], s1_reg[ch]); + complement = TRUE; + break; + case TGSI_OPCODE_SLE: + spe_fcgt(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + complement = TRUE; + break; + case TGSI_OPCODE_SEQ: + spe_fceq(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + break; + case TGSI_OPCODE_SNE: + spe_fceq(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + complement = TRUE; + break; + default: + assert(0); + } + } + + /* convert d from 0x0/0xffffffff to 0.0/1.0 */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + /* d = d & one_reg */ + if (complement) + spe_andc(gen->f, d_reg[ch], one_reg, d_reg[ch]); + else + spe_and(gen->f, d_reg[ch], one_reg, d_reg[ch]); + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + + +/** + * Emit compare. + */ +static boolean +emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + int zero_reg = get_itemp(gen); + + spe_zero(gen->f, zero_reg); + + /* d = (s1 < 0) ? s2 : s3 */ + spe_fcgt(gen->f, d_reg, zero_reg, s1_reg); + spe_selb(gen->f, d_reg, s3_reg, s2_reg, d_reg); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + + return TRUE; +} + +/** + * Emit trunc. + * Convert float to signed int + * Convert signed int to float + */ +static boolean +emit_TRUNC(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], d_reg[4]; + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + + /* Convert float to int */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_cflts(gen->f, d_reg[ch], s1_reg[ch], 0); + } + + /* Convert int to float */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_csflt(gen->f, d_reg[ch], d_reg[ch], 0); + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + + +/** + * Emit floor. + * If negative int subtract one + * Convert float to signed int + * Convert signed int to float + */ +static boolean +emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], d_reg[4], tmp_reg[4], zero_reg, one_reg; + + zero_reg = get_itemp(gen); + spe_zero(gen->f, zero_reg); + one_reg = get_const_one_reg(gen); + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + tmp_reg[ch] = get_itemp(gen); + } + + /* If negative, subtract 1.0 */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fcgt(gen->f, tmp_reg[ch], zero_reg, s1_reg[ch]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_selb(gen->f, tmp_reg[ch], zero_reg, one_reg, tmp_reg[ch]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fs(gen->f, tmp_reg[ch], s1_reg[ch], tmp_reg[ch]); + } + + /* Convert float to int */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_cflts(gen->f, tmp_reg[ch], tmp_reg[ch], 0); + } + + /* Convert int to float */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_csflt(gen->f, d_reg[ch], tmp_reg[ch], 0); + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + + +/** + * Compute frac = Input - FLR(Input) + */ +static boolean +emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], d_reg[4], tmp_reg[4], zero_reg, one_reg; + + zero_reg = get_itemp(gen); + spe_zero(gen->f, zero_reg); + one_reg = get_const_one_reg(gen); + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + tmp_reg[ch] = get_itemp(gen); + } + + /* If negative, subtract 1.0 */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fcgt(gen->f, tmp_reg[ch], zero_reg, s1_reg[ch]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_selb(gen->f, tmp_reg[ch], zero_reg, one_reg, tmp_reg[ch]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fs(gen->f, tmp_reg[ch], s1_reg[ch], tmp_reg[ch]); + } + + /* Convert float to int */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_cflts(gen->f, tmp_reg[ch], tmp_reg[ch], 0); + } + + /* Convert int to float */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_csflt(gen->f, tmp_reg[ch], tmp_reg[ch], 0); + } + + /* d = s1 - FLR(s1) */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fs(gen->f, d_reg[ch], s1_reg[ch], tmp_reg[ch]); + } + + /* store result */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + + +#if 0 +static void +print_functions(struct cell_context *cell) +{ + struct cell_spu_function_info *funcs = &cell->spu_functions; + uint i; + for (i = 0; i < funcs->num; i++) { + printf("SPU func %u: %s at %u\n", + i, funcs->names[i], funcs->addrs[i]); + } +} +#endif + + +static uint +lookup_function(struct cell_context *cell, const char *funcname) +{ + const struct cell_spu_function_info *funcs = &cell->spu_functions; + uint i, addr = 0; + for (i = 0; i < funcs->num; i++) { + if (strcmp(funcs->names[i], funcname) == 0) { + addr = funcs->addrs[i]; + } + } + assert(addr && "spu function not found"); + return addr / 4; /* discard 2 least significant bits */ +} + + +/** + * Emit code to call a SPU function. + * Used to implement instructions like SIN/COS/POW/TEX/etc. + * If scalar, only the X components of the src regs are used, and the + * result is replicated across the dest register's XYZW components. + */ +static boolean +emit_function_call(struct codegen *gen, + const struct tgsi_full_instruction *inst, + char *funcname, uint num_args, boolean scalar) +{ + const uint addr = lookup_function(gen->cell, funcname); + char comment[100]; + int s_regs[3]; + int func_called = FALSE; + uint a, ch; + int retval_reg = -1; + + assert(num_args <= 3); + + snprintf(comment, sizeof(comment), "CALL %s:", funcname); + spe_comment(gen->f, -4, comment); + + if (scalar) { + for (a = 0; a < num_args; a++) { + s_regs[a] = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[a]); + } + /* we'll call the function, put the return value in this register, + * then replicate it across all write-enabled components in d_reg. + */ + retval_reg = spe_allocate_available_register(gen->f); + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + int d_reg; + ubyte usedRegs[SPE_NUM_REGS]; + uint i, numUsed; + + if (!scalar) { + for (a = 0; a < num_args; a++) { + s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]); + } + } + + d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + if (!scalar || !func_called) { + /* for a scalar function, we'll really only call the function once */ + + numUsed = spe_get_registers_used(gen->f, usedRegs); + assert(numUsed < gen->frame_size / 16 - 2); + + /* save registers to stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + int offset = 2 + i; + spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } + + /* setup function arguments */ + for (a = 0; a < num_args; a++) { + spe_move(gen->f, 3 + a, s_regs[a]); + } + + /* branch to function, save return addr */ + spe_brasl(gen->f, SPE_REG_RA, addr); + + /* save function's return value */ + if (scalar) + spe_move(gen->f, retval_reg, 3); + else + spe_move(gen->f, d_reg, 3); + + /* restore registers from stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + if (reg != d_reg && reg != retval_reg) { + int offset = 2 + i; + spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } + } + + func_called = TRUE; + } + + if (scalar) { + spe_move(gen->f, d_reg, retval_reg); + } + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + + if (scalar) { + spe_release_register(gen->f, retval_reg); + } + + return TRUE; +} + + +static boolean +emit_TEX(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + const uint target = inst->InstructionExtTexture.Texture; + const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index; + uint addr; + int ch; + int coord_regs[4], d_regs[4]; + + switch (target) { + case TGSI_TEXTURE_1D: + case TGSI_TEXTURE_2D: + addr = lookup_function(gen->cell, "spu_tex_2d"); + break; + case TGSI_TEXTURE_3D: + addr = lookup_function(gen->cell, "spu_tex_3d"); + break; + case TGSI_TEXTURE_CUBE: + addr = lookup_function(gen->cell, "spu_tex_cube"); + break; + default: + ASSERT(0 && "unsupported texture target"); + return FALSE; + } + + assert(inst->FullSrcRegisters[1].SrcRegister.File == TGSI_FILE_SAMPLER); + + spe_comment(gen->f, -4, "CALL tex:"); + + /* get src/dst reg info */ + for (ch = 0; ch < 4; ch++) { + coord_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + d_regs[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + + { + ubyte usedRegs[SPE_NUM_REGS]; + uint i, numUsed; + + numUsed = spe_get_registers_used(gen->f, usedRegs); + assert(numUsed < gen->frame_size / 16 - 2); + + /* save registers to stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + int offset = 2 + i; + spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } + + /* setup function arguments (XXX depends on target) */ + for (i = 0; i < 4; i++) { + spe_move(gen->f, 3 + i, coord_regs[i]); + } + spe_load_uint(gen->f, 7, unit); /* sampler unit */ + + /* branch to function, save return addr */ + spe_brasl(gen->f, SPE_REG_RA, addr); + + /* save function's return values (four pixel's colors) */ + for (i = 0; i < 4; i++) { + spe_move(gen->f, d_regs[i], 3 + i); + } + + /* restore registers from stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + if (reg != d_regs[0] && + reg != d_regs[1] && + reg != d_regs[2] && + reg != d_regs[3]) { + int offset = 2 + i; + spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } + } + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_regs[ch], ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + + return TRUE; +} + + +/** + * KILL if any of src reg values are less than zero. + */ +static boolean +emit_KIL(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + int s_regs[4], kil_reg = -1, cmp_reg, zero_reg; + + spe_comment(gen->f, -4, "CALL kil:"); + + /* zero = {0,0,0,0} */ + zero_reg = get_itemp(gen); + spe_zero(gen->f, zero_reg); + + cmp_reg = get_itemp(gen); + + /* get src regs */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + } + + /* test if any src regs are < 0 */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + if (kil_reg >= 0) { + /* cmp = 0 > src ? : ~0 : 0 */ + spe_fcgt(gen->f, cmp_reg, zero_reg, s_regs[ch]); + /* kil = kil | cmp */ + spe_or(gen->f, kil_reg, kil_reg, cmp_reg); + } + else { + kil_reg = get_itemp(gen); + /* kil = 0 > src ? : ~0 : 0 */ + spe_fcgt(gen->f, kil_reg, zero_reg, s_regs[ch]); + } + } + + if (gen->if_nesting || gen->loop_nesting) { + /* may have been a conditional kil */ + spe_and(gen->f, kil_reg, kil_reg, gen->exec_mask_reg); + } + + /* allocate the kill mask reg if needed */ + if (gen->kill_mask_reg <= 0) { + gen->kill_mask_reg = spe_allocate_available_register(gen->f); + spe_move(gen->f, gen->kill_mask_reg, kil_reg); + } + else { + spe_or(gen->f, gen->kill_mask_reg, gen->kill_mask_reg, kil_reg); + } + + free_itemps(gen); + + return TRUE; +} + + + +/** + * Emit min or max. + */ +static boolean +emit_MIN_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s0_reg[4], s1_reg[4], d_reg[4], tmp_reg[4]; + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + tmp_reg[ch] = get_itemp(gen); + } + + /* d = (s0 > s1) ? s0 : s1 */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + if (inst->Instruction.Opcode == TGSI_OPCODE_MAX) + spe_fcgt(gen->f, tmp_reg[ch], s0_reg[ch], s1_reg[ch]); + else + spe_fcgt(gen->f, tmp_reg[ch], s1_reg[ch], s0_reg[ch]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]); + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + + +/** + * Emit code to update the execution mask. + * This needs to be done whenever the execution status of a conditional + * or loop is changed. + */ +static void +emit_update_exec_mask(struct codegen *gen) +{ + const int exec_reg = get_exec_mask_reg(gen); + const int cond_reg = gen->cond_mask_reg; + const int loop_reg = gen->loop_mask_reg; + + spe_comment(gen->f, 0, "Update master execution mask"); + + if (gen->if_nesting > 0 && gen->loop_nesting > 0) { + /* exec_mask = cond_mask & loop_mask */ + assert(cond_reg > 0); + assert(loop_reg > 0); + spe_and(gen->f, exec_reg, cond_reg, loop_reg); + } + else if (gen->if_nesting > 0) { + assert(cond_reg > 0); + spe_move(gen->f, exec_reg, cond_reg); + } + else if (gen->loop_nesting > 0) { + assert(loop_reg > 0); + spe_move(gen->f, exec_reg, loop_reg); + } + else { + spe_load_int(gen->f, exec_reg, ~0x0); + } +} + + +static boolean +emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + const int channel = 0; + int cond_reg; + + cond_reg = get_cond_mask_reg(gen); + + /* XXX push cond exec mask */ + + spe_comment(gen->f, 0, "init conditional exec mask = ~0:"); + spe_load_int(gen->f, cond_reg, ~0); + + /* update conditional execution mask with the predicate register */ + int tmp_reg = get_itemp(gen); + int s1_reg = get_src_reg(gen, channel, &inst->FullSrcRegisters[0]); + + /* tmp = (s1_reg == 0) */ + spe_ceqi(gen->f, tmp_reg, s1_reg, 0); + /* tmp = !tmp */ + spe_complement(gen->f, tmp_reg, tmp_reg); + /* cond_mask = cond_mask & tmp */ + spe_and(gen->f, cond_reg, cond_reg, tmp_reg); + + gen->if_nesting++; + + /* update the master execution mask */ + emit_update_exec_mask(gen); + + free_itemps(gen); + + return TRUE; +} + + +static boolean +emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + const int cond_reg = get_cond_mask_reg(gen); + + spe_comment(gen->f, 0, "cond exec mask = !cond exec mask"); + spe_complement(gen->f, cond_reg, cond_reg); + emit_update_exec_mask(gen); + + return TRUE; +} + + +static boolean +emit_ENDIF(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + /* XXX todo: pop cond exec mask */ + + gen->if_nesting--; + + emit_update_exec_mask(gen); + + return TRUE; +} + + +static boolean +emit_BGNLOOP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int exec_reg, loop_reg; + + exec_reg = get_exec_mask_reg(gen); + loop_reg = get_loop_mask_reg(gen); + + /* XXX push loop_exec mask */ + + spe_comment(gen->f, 0*-4, "initialize loop exec mask = ~0"); + spe_load_int(gen->f, loop_reg, ~0x0); + + gen->loop_nesting++; + gen->loop_start = spe_code_size(gen->f); /* in bytes */ + + return TRUE; +} + + +static boolean +emit_ENDLOOP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + const int loop_reg = get_loop_mask_reg(gen); + const int tmp_reg = get_itemp(gen); + int offset; + + /* tmp_reg = exec[0] | exec[1] | exec[2] | exec[3] */ + spe_orx(gen->f, tmp_reg, loop_reg); + + offset = gen->loop_start - spe_code_size(gen->f); /* in bytes */ + + /* branch back to top of loop if tmp_reg != 0 */ + spe_brnz(gen->f, tmp_reg, offset / 4); + + /* XXX pop loop_exec mask */ + + gen->loop_nesting--; + + emit_update_exec_mask(gen); + + return TRUE; +} + + +static boolean +emit_BRK(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + const int exec_reg = get_exec_mask_reg(gen); + const int loop_reg = get_loop_mask_reg(gen); + + assert(gen->loop_nesting > 0); + + spe_comment(gen->f, 0, "loop exec mask &= ~master exec mask"); + spe_andc(gen->f, loop_reg, loop_reg, exec_reg); + + emit_update_exec_mask(gen); + + return TRUE; +} + + +static boolean +emit_CONT(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + assert(gen->loop_nesting > 0); + + return TRUE; +} + + +static boolean +emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst, + boolean ddx) +{ + int ch; + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + int s_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + int t1_reg = get_itemp(gen); + int t2_reg = get_itemp(gen); + + spe_splat_word(gen->f, t1_reg, s_reg, 0); /* upper-left pixel */ + if (ddx) { + spe_splat_word(gen->f, t2_reg, s_reg, 1); /* upper-right pixel */ + } + else { + spe_splat_word(gen->f, t2_reg, s_reg, 2); /* lower-left pixel */ + } + spe_fs(gen->f, d_reg, t2_reg, t1_reg); + + free_itemps(gen); + } + + return TRUE; +} + + + + +/** + * Emit END instruction. + * We just return from the shader function at this point. + * + * Note that there may be more code after this that would be + * called by TGSI_OPCODE_CALL. + */ +static boolean +emit_END(struct codegen *gen) +{ + emit_epilogue(gen); + return TRUE; +} + + +/** + * Emit code for the given instruction. Just a big switch stmt. + */ +static boolean +emit_instruction(struct codegen *gen, + const struct tgsi_full_instruction *inst) +{ + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ARL: + return emit_ARL(gen, inst); + case TGSI_OPCODE_MOV: + case TGSI_OPCODE_SWZ: + return emit_MOV(gen, inst); + case TGSI_OPCODE_ADD: + case TGSI_OPCODE_SUB: + case TGSI_OPCODE_MUL: + return emit_binop(gen, inst); + case TGSI_OPCODE_MAD: + return emit_MAD(gen, inst); + case TGSI_OPCODE_LERP: + return emit_LERP(gen, inst); + case TGSI_OPCODE_DP3: + return emit_DP3(gen, inst); + case TGSI_OPCODE_DP4: + return emit_DP4(gen, inst); + case TGSI_OPCODE_DPH: + return emit_DPH(gen, inst); + case TGSI_OPCODE_NRM: + return emit_NRM3(gen, inst); + case TGSI_OPCODE_XPD: + return emit_XPD(gen, inst); + case TGSI_OPCODE_RCP: + case TGSI_OPCODE_RSQ: + return emit_RCP_RSQ(gen, inst); + case TGSI_OPCODE_ABS: + return emit_ABS(gen, inst); + case TGSI_OPCODE_SGT: + case TGSI_OPCODE_SLT: + case TGSI_OPCODE_SGE: + case TGSI_OPCODE_SLE: + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_SNE: + return emit_inequality(gen, inst); + case TGSI_OPCODE_CMP: + return emit_CMP(gen, inst); + case TGSI_OPCODE_MIN: + case TGSI_OPCODE_MAX: + return emit_MIN_MAX(gen, inst); + case TGSI_OPCODE_TRUNC: + return emit_TRUNC(gen, inst); + case TGSI_OPCODE_FLR: + return emit_FLR(gen, inst); + case TGSI_OPCODE_FRC: + return emit_FRC(gen, inst); + case TGSI_OPCODE_END: + return emit_END(gen); + + case TGSI_OPCODE_COS: + return emit_function_call(gen, inst, "spu_cos", 1, TRUE); + case TGSI_OPCODE_SIN: + return emit_function_call(gen, inst, "spu_sin", 1, TRUE); + case TGSI_OPCODE_POW: + return emit_function_call(gen, inst, "spu_pow", 2, TRUE); + case TGSI_OPCODE_EXPBASE2: + return emit_function_call(gen, inst, "spu_exp2", 1, TRUE); + case TGSI_OPCODE_LOGBASE2: + return emit_function_call(gen, inst, "spu_log2", 1, TRUE); + case TGSI_OPCODE_TEX: + /* fall-through for now */ + case TGSI_OPCODE_TXD: + /* fall-through for now */ + case TGSI_OPCODE_TXB: + /* fall-through for now */ + case TGSI_OPCODE_TXL: + /* fall-through for now */ + case TGSI_OPCODE_TXP: + return emit_TEX(gen, inst); + case TGSI_OPCODE_KIL: + return emit_KIL(gen, inst); + + case TGSI_OPCODE_IF: + return emit_IF(gen, inst); + case TGSI_OPCODE_ELSE: + return emit_ELSE(gen, inst); + case TGSI_OPCODE_ENDIF: + return emit_ENDIF(gen, inst); + + case TGSI_OPCODE_BGNLOOP2: + return emit_BGNLOOP(gen, inst); + case TGSI_OPCODE_ENDLOOP2: + return emit_ENDLOOP(gen, inst); + case TGSI_OPCODE_BRK: + return emit_BRK(gen, inst); + case TGSI_OPCODE_CONT: + return emit_CONT(gen, inst); + + case TGSI_OPCODE_DDX: + return emit_DDX_DDY(gen, inst, TRUE); + case TGSI_OPCODE_DDY: + return emit_DDX_DDY(gen, inst, FALSE); + + /* XXX lots more cases to do... */ + + default: + fprintf(stderr, "Cell: unimplemented TGSI instruction %d!\n", + inst->Instruction.Opcode); + return FALSE; + } + + return TRUE; +} + + + +/** + * Emit code for a TGSI immediate value (vector of four floats). + * This involves register allocation and initialization. + * XXX the initialization should be done by a "prepare" stage, not + * per quad execution! + */ +static boolean +emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed) +{ + int ch; + + assert(gen->num_imm < MAX_TEMPS); + + for (ch = 0; ch < 4; ch++) { + float val = immed->u.ImmediateFloat32[ch].Float; + + if (ch > 0 && val == immed->u.ImmediateFloat32[ch - 1].Float) { + /* re-use previous register */ + gen->imm_regs[gen->num_imm][ch] = gen->imm_regs[gen->num_imm][ch - 1]; + } + else { + char str[100]; + int reg = spe_allocate_available_register(gen->f); + + if (reg < 0) + return FALSE; + + sprintf(str, "init $%d = %f", reg, val); + spe_comment(gen->f, 0, str); + + /* update immediate map */ + gen->imm_regs[gen->num_imm][ch] = reg; + + /* emit initializer instruction */ + spe_load_float(gen->f, reg, val); + } + } + + gen->num_imm++; + + return TRUE; +} + + + +/** + * Emit "code" for a TGSI declaration. + * We only care about TGSI TEMPORARY register declarations at this time. + * For each TGSI TEMPORARY we allocate four SPE registers. + */ +static boolean +emit_declaration(struct cell_context *cell, + struct codegen *gen, const struct tgsi_full_declaration *decl) +{ + int i, ch; + + switch (decl->Declaration.File) { + case TGSI_FILE_TEMPORARY: + for (i = decl->DeclarationRange.First; + i <= decl->DeclarationRange.Last; + i++) { + assert(i < MAX_TEMPS); + for (ch = 0; ch < 4; ch++) { + gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f); + if (gen->temp_regs[i][ch] < 0) + return FALSE; /* out of regs */ + } + + /* XXX if we run out of SPE registers, we need to spill + * to SPU memory. someday... + */ + + { + char buf[100]; + sprintf(buf, "TGSI temp[%d] maps to SPU regs [$%d $%d $%d $%d]", i, + gen->temp_regs[i][0], gen->temp_regs[i][1], + gen->temp_regs[i][2], gen->temp_regs[i][3]); + spe_comment(gen->f, 0, buf); + } + } + break; + default: + ; /* ignore */ + } + + return TRUE; +} + + + +/** + * Translate TGSI shader code to SPE instructions. This is done when + * the state tracker gives us a new shader (via pipe->create_fs_state()). + * + * \param cell the rendering context (in) + * \param tokens the TGSI shader (in) + * \param f the generated function (out) + */ +boolean +cell_gen_fragment_program(struct cell_context *cell, + const struct tgsi_token *tokens, + struct spe_function *f) +{ + struct tgsi_parse_context parse; + struct codegen gen; + uint ic = 0; + + memset(&gen, 0, sizeof(gen)); + gen.cell = cell; + gen.f = f; + + /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */ + gen.inputs_reg = 3; /* pointer to inputs array */ + gen.outputs_reg = 4; /* pointer to outputs array */ + gen.constants_reg = 5; /* pointer to constants array */ + + spe_init_func(f, SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE); + spe_allocate_register(f, gen.inputs_reg); + spe_allocate_register(f, gen.outputs_reg); + spe_allocate_register(f, gen.constants_reg); + + if (cell->debug_flags & CELL_DEBUG_ASM) { + spe_print_code(f, TRUE); + spe_indent(f, 2*8); + printf("Begin %s\n", __FUNCTION__); + tgsi_dump(tokens, 0); + } + + tgsi_parse_init(&parse, tokens); + + emit_prologue(&gen); + + while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: + if (f->print) { + _debug_printf(" # "); + tgsi_dump_immediate(&parse.FullToken.FullImmediate); + } + if (!emit_immediate(&gen, &parse.FullToken.FullImmediate)) + gen.error = TRUE; + break; + + case TGSI_TOKEN_TYPE_DECLARATION: + if (f->print) { + _debug_printf(" # "); + tgsi_dump_declaration(&parse.FullToken.FullDeclaration); + } + if (!emit_declaration(cell, &gen, &parse.FullToken.FullDeclaration)) + gen.error = TRUE; + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + if (f->print) { + _debug_printf(" # "); + ic++; + tgsi_dump_instruction(&parse.FullToken.FullInstruction, ic); + } + if (!emit_instruction(&gen, &parse.FullToken.FullInstruction)) + gen.error = TRUE; + break; + + default: + assert(0); + } + } + + if (gen.error) { + /* terminate the SPE code */ + return emit_END(&gen); + } + + if (cell->debug_flags & CELL_DEBUG_ASM) { + printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst); + printf("End %s\n", __FUNCTION__); + } + + tgsi_parse_free( &parse ); + + return !gen.error; +} diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.h b/src/gallium/drivers/cell/ppu/cell_gen_fp.h new file mode 100644 index 00000000000..99faea70462 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.h @@ -0,0 +1,42 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + + +#ifndef CELL_GEN_FP_H +#define CELL_GEN_FP_H + + + +extern boolean +cell_gen_fragment_program(struct cell_context *cell, + const struct tgsi_token *tokens, + struct spe_function *f); + + +#endif /* CELL_GEN_FP_H */ + diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c new file mode 100644 index 00000000000..66d4b3b6a31 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -0,0 +1,2181 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * Copyright 2009 VMware, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Generate SPU per-fragment code (actually per-quad code). + * \author Brian Paul + * \author Bob Ellison + */ + + +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "rtasm/rtasm_ppc_spe.h" +#include "cell_context.h" +#include "cell_gen_fragment.h" + + + +/** Do extra optimizations? */ +#define OPTIMIZATIONS 1 + + +/** + * Generate SPE code to perform Z/depth testing. + * + * \param dsa Gallium depth/stencil/alpha state to gen code for + * \param f SPE function to append instruction onto. + * \param mask_reg register containing quad/pixel "alive" mask (in/out) + * \param ifragZ_reg register containing integer fragment Z values (in) + * \param ifbZ_reg register containing integer frame buffer Z values (in/out) + * \param zmask_reg register containing result of Z test/comparison (out) + * + * Returns TRUE if the Z-buffer needs to be updated. + */ +static boolean +gen_depth_test(struct spe_function *f, + const struct pipe_depth_stencil_alpha_state *dsa, + int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg) +{ + /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_ + * quantities. This only makes a difference for 32-bit Z values though. + */ + ASSERT(dsa->depth.enabled); + + switch (dsa->depth.func) { + case PIPE_FUNC_EQUAL: + /* zmask = (ifragZ == ref) */ + spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg); + /* mask = (mask & zmask) */ + spe_and(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_NOTEQUAL: + /* zmask = (ifragZ == ref) */ + spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg); + /* mask = (mask & ~zmask) */ + spe_andc(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_GREATER: + /* zmask = (ifragZ > ref) */ + spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); + /* mask = (mask & zmask) */ + spe_and(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_LESS: + /* zmask = (ref > ifragZ) */ + spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); + /* mask = (mask & zmask) */ + spe_and(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_LEQUAL: + /* zmask = (ifragZ > ref) */ + spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); + /* mask = (mask & ~zmask) */ + spe_andc(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_GEQUAL: + /* zmask = (ref > ifragZ) */ + spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); + /* mask = (mask & ~zmask) */ + spe_andc(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_NEVER: + spe_il(f, mask_reg, 0); /* mask = {0,0,0,0} */ + spe_move(f, zmask_reg, mask_reg); /* zmask = mask */ + break; + + case PIPE_FUNC_ALWAYS: + /* mask unchanged */ + spe_il(f, zmask_reg, ~0); /* zmask = {~0,~0,~0,~0} */ + break; + + default: + ASSERT(0); + break; + } + + if (dsa->depth.writemask) { + /* + * If (ztest passed) { + * framebufferZ = fragmentZ; + * } + * OR, + * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ; + */ + spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg); + return TRUE; + } + + return FALSE; +} + + +/** + * Generate SPE code to perform alpha testing. + * + * \param dsa Gallium depth/stencil/alpha state to gen code for + * \param f SPE function to append instruction onto. + * \param mask_reg register containing quad/pixel "alive" mask (in/out) + * \param fragA_reg register containing four fragment alpha values (in) + */ +static void +gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa, + struct spe_function *f, int mask_reg, int fragA_reg) +{ + int ref_reg = spe_allocate_available_register(f); + int amask_reg = spe_allocate_available_register(f); + + ASSERT(dsa->alpha.enabled); + + if ((dsa->alpha.func != PIPE_FUNC_NEVER) && + (dsa->alpha.func != PIPE_FUNC_ALWAYS)) { + /* load/splat the alpha reference float value */ + spe_load_float(f, ref_reg, dsa->alpha.ref_value); + } + + /* emit code to do the alpha comparison, updating 'mask' */ + switch (dsa->alpha.func) { + case PIPE_FUNC_EQUAL: + /* amask = (fragA == ref) */ + spe_fceq(f, amask_reg, fragA_reg, ref_reg); + /* mask = (mask & amask) */ + spe_and(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_NOTEQUAL: + /* amask = (fragA == ref) */ + spe_fceq(f, amask_reg, fragA_reg, ref_reg); + /* mask = (mask & ~amask) */ + spe_andc(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_GREATER: + /* amask = (fragA > ref) */ + spe_fcgt(f, amask_reg, fragA_reg, ref_reg); + /* mask = (mask & amask) */ + spe_and(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_LESS: + /* amask = (ref > fragA) */ + spe_fcgt(f, amask_reg, ref_reg, fragA_reg); + /* mask = (mask & amask) */ + spe_and(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_LEQUAL: + /* amask = (fragA > ref) */ + spe_fcgt(f, amask_reg, fragA_reg, ref_reg); + /* mask = (mask & ~amask) */ + spe_andc(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_GEQUAL: + /* amask = (ref > fragA) */ + spe_fcgt(f, amask_reg, ref_reg, fragA_reg); + /* mask = (mask & ~amask) */ + spe_andc(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_NEVER: + spe_il(f, mask_reg, 0); /* mask = [0,0,0,0] */ + break; + + case PIPE_FUNC_ALWAYS: + /* no-op, mask unchanged */ + break; + + default: + ASSERT(0); + break; + } + +#if OPTIMIZATIONS + /* if mask == {0,0,0,0} we're all done, return */ + { + /* re-use amask reg here */ + int tmp_reg = amask_reg; + /* tmp[0] = (mask[0] | mask[1] | mask[2] | mask[3]) */ + spe_orx(f, tmp_reg, mask_reg); + /* if tmp[0] == 0 then return from function call */ + spe_biz(f, tmp_reg, SPE_REG_RA, 0, 0); + } +#endif + + spe_release_register(f, ref_reg); + spe_release_register(f, amask_reg); +} + + +/** + * This pair of functions is used inline to allocate and deallocate + * optional constant registers. Once a constant is discovered to be + * needed, we will likely need it again, so we don't want to deallocate + * it and have to allocate and load it again unnecessarily. + */ +static INLINE void +setup_optional_register(struct spe_function *f, + int *r) +{ + if (*r < 0) + *r = spe_allocate_available_register(f); +} + +static INLINE void +release_optional_register(struct spe_function *f, + int r) +{ + if (r >= 0) + spe_release_register(f, r); +} + +static INLINE void +setup_const_register(struct spe_function *f, + int *r, + float value) +{ + if (*r >= 0) + return; + setup_optional_register(f, r); + spe_load_float(f, *r, value); +} + +static INLINE void +release_const_register(struct spe_function *f, + int r) +{ + release_optional_register(f, r); +} + + + +/** + * Unpack/convert framebuffer colors from four 32-bit packed colors + * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA). + * Each 8-bit color component is expanded into a float in [0.0, 1.0]. + */ +static void +unpack_colors(struct spe_function *f, + enum pipe_format color_format, + int fbRGBA_reg, + int fbR_reg, int fbG_reg, int fbB_reg, int fbA_reg) +{ + int mask0_reg = spe_allocate_available_register(f); + int mask1_reg = spe_allocate_available_register(f); + int mask2_reg = spe_allocate_available_register(f); + int mask3_reg = spe_allocate_available_register(f); + + spe_load_int(f, mask0_reg, 0xff); + spe_load_int(f, mask1_reg, 0xff00); + spe_load_int(f, mask2_reg, 0xff0000); + spe_load_int(f, mask3_reg, 0xff000000); + + spe_comment(f, 0, "Unpack framebuffer colors, convert to floats"); + + switch (color_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + /* fbB = fbRGBA & mask */ + spe_and(f, fbB_reg, fbRGBA_reg, mask0_reg); + + /* fbG = fbRGBA & mask */ + spe_and(f, fbG_reg, fbRGBA_reg, mask1_reg); + + /* fbR = fbRGBA & mask */ + spe_and(f, fbR_reg, fbRGBA_reg, mask2_reg); + + /* fbA = fbRGBA & mask */ + spe_and(f, fbA_reg, fbRGBA_reg, mask3_reg); + + /* fbG = fbG >> 8 */ + spe_roti(f, fbG_reg, fbG_reg, -8); + + /* fbR = fbR >> 16 */ + spe_roti(f, fbR_reg, fbR_reg, -16); + + /* fbA = fbA >> 24 */ + spe_roti(f, fbA_reg, fbA_reg, -24); + break; + + case PIPE_FORMAT_B8G8R8A8_UNORM: + /* fbA = fbRGBA & mask */ + spe_and(f, fbA_reg, fbRGBA_reg, mask0_reg); + + /* fbR = fbRGBA & mask */ + spe_and(f, fbR_reg, fbRGBA_reg, mask1_reg); + + /* fbG = fbRGBA & mask */ + spe_and(f, fbG_reg, fbRGBA_reg, mask2_reg); + + /* fbB = fbRGBA & mask */ + spe_and(f, fbB_reg, fbRGBA_reg, mask3_reg); + + /* fbR = fbR >> 8 */ + spe_roti(f, fbR_reg, fbR_reg, -8); + + /* fbG = fbG >> 16 */ + spe_roti(f, fbG_reg, fbG_reg, -16); + + /* fbB = fbB >> 24 */ + spe_roti(f, fbB_reg, fbB_reg, -24); + break; + + default: + ASSERT(0); + } + + /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */ + spe_cuflt(f, fbR_reg, fbR_reg, 8); + spe_cuflt(f, fbG_reg, fbG_reg, 8); + spe_cuflt(f, fbB_reg, fbB_reg, 8); + spe_cuflt(f, fbA_reg, fbA_reg, 8); + + spe_release_register(f, mask0_reg); + spe_release_register(f, mask1_reg); + spe_release_register(f, mask2_reg); + spe_release_register(f, mask3_reg); +} + + +/** + * Generate SPE code to implement the given blend mode for a quad of pixels. + * \param f SPE function to append instruction onto. + * \param fragR_reg register with fragment red values (float) (in/out) + * \param fragG_reg register with fragment green values (float) (in/out) + * \param fragB_reg register with fragment blue values (float) (in/out) + * \param fragA_reg register with fragment alpha values (float) (in/out) + * \param fbRGBA_reg register with packed framebuffer colors (integer) (in) + */ +static void +gen_blend(const struct pipe_blend_state *blend, + const struct pipe_blend_color *blend_color, + struct spe_function *f, + enum pipe_format color_format, + int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg, + int fbRGBA_reg) +{ + int term1R_reg = spe_allocate_available_register(f); + int term1G_reg = spe_allocate_available_register(f); + int term1B_reg = spe_allocate_available_register(f); + int term1A_reg = spe_allocate_available_register(f); + + int term2R_reg = spe_allocate_available_register(f); + int term2G_reg = spe_allocate_available_register(f); + int term2B_reg = spe_allocate_available_register(f); + int term2A_reg = spe_allocate_available_register(f); + + int fbR_reg = spe_allocate_available_register(f); + int fbG_reg = spe_allocate_available_register(f); + int fbB_reg = spe_allocate_available_register(f); + int fbA_reg = spe_allocate_available_register(f); + + int tmp_reg = spe_allocate_available_register(f); + + /* Optional constant registers we might or might not end up using; + * if we do use them, make sure we only allocate them once by + * keeping a flag on each one. + */ + int one_reg = -1; + int constR_reg = -1, constG_reg = -1, constB_reg = -1, constA_reg = -1; + + ASSERT(blend->blend_enable); + + /* packed RGBA -> float colors */ + unpack_colors(f, color_format, fbRGBA_reg, + fbR_reg, fbG_reg, fbB_reg, fbA_reg); + + /* + * Compute Src RGB terms. We're actually looking for the value + * of (the appropriate RGB factors) * (the incoming source RGB color), + * because in some cases (like PIPE_BLENDFACTOR_ONE and + * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math. + */ + switch (blend->rgb_src_factor) { + case PIPE_BLENDFACTOR_ONE: + /* factors = (1,1,1), so term = (R,G,B) */ + spe_move(f, term1R_reg, fragR_reg); + spe_move(f, term1G_reg, fragG_reg); + spe_move(f, term1B_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_ZERO: + /* factors = (0,0,0), so term = (0,0,0) */ + spe_load_float(f, term1R_reg, 0.0f); + spe_load_float(f, term1G_reg, 0.0f); + spe_load_float(f, term1B_reg, 0.0f); + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + /* factors = (R,G,B), so term = (R*R, G*G, B*B) */ + spe_fm(f, term1R_reg, fragR_reg, fragR_reg); + spe_fm(f, term1G_reg, fragG_reg, fragG_reg); + spe_fm(f, term1B_reg, fragB_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + /* factors = (A,A,A), so term = (R*A, G*A, B*A) */ + spe_fm(f, term1R_reg, fragR_reg, fragA_reg); + spe_fm(f, term1G_reg, fragG_reg, fragA_reg); + spe_fm(f, term1B_reg, fragB_reg, fragA_reg); + break; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B)) + * or in other words term = (R-R*R, G-G*G, B-B*B) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */ + spe_fm(f, term1R_reg, fragR_reg, fbR_reg); + spe_fm(f, term1G_reg, fragG_reg, fbG_reg); + spe_fm(f, term1B_reg, fragB_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb)) + * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A)) + * or term = (R-R*A,G-G*A,B-B*A) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_DST_ALPHA: + /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */ + spe_fm(f, term1R_reg, fragR_reg, fbA_reg); + spe_fm(f, term1G_reg, fragG_reg, fbA_reg); + spe_fm(f, term1B_reg, fragB_reg, fbA_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb)) + * or term = (R-R*Afb,G-G*Afb,b-B*Afb) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg, blend_color->color[2]); + /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */ + spe_fm(f, term1R_reg, fragR_reg, constR_reg); + spe_fm(f, term1G_reg, fragG_reg, constG_reg); + spe_fm(f, term1B_reg, fragB_reg, constB_reg); + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + /* we'll need the optional constant alpha register */ + setup_const_register(f, &constA_reg, blend_color->color[3]); + /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */ + spe_fm(f, term1R_reg, fragR_reg, constA_reg); + spe_fm(f, term1G_reg, fragG_reg, constA_reg); + spe_fm(f, term1B_reg, fragB_reg, constA_reg); + break; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg, blend_color->color[2]); + /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc)) + * or term = (R-R*Rc, G-G*Gc, B-B*Bc) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg, blend_color->color[2]); + /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac)) + * or term = (R-R*Ac,G-G*Ac,B-B*Ac) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + /* We'll need the optional {1,1,1,1} register */ + setup_const_register(f, &one_reg, 1.0f); + /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so + * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb)) + * We could expand the term (as a*min(b,c) == min(a*b,a*c) + * as long as a is positive), but then we'd have to do three + * spe_float_min() functions instead of one, so this is simpler. + */ + /* tmp = 1 - Afb */ + spe_fs(f, tmp_reg, one_reg, fbA_reg); + /* tmp = min(A,tmp) */ + spe_float_min(f, tmp_reg, fragA_reg, tmp_reg); + /* term = R*tmp */ + spe_fm(f, term1R_reg, fragR_reg, tmp_reg); + spe_fm(f, term1G_reg, fragG_reg, tmp_reg); + spe_fm(f, term1B_reg, fragB_reg, tmp_reg); + break; + + /* These are special D3D cases involving a second color output + * from the fragment shader. I'm not sure we can support them + * yet... XXX + */ + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + + default: + ASSERT(0); + } + + /* + * Compute Src Alpha term. Like the above, we're looking for + * the full term A*factor, not just the factor itself, because + * in many cases we can avoid doing unnecessary multiplies. + */ + switch (blend->alpha_src_factor) { + case PIPE_BLENDFACTOR_ZERO: + /* factor = 0, so term = 0 */ + spe_load_float(f, term1A_reg, 0.0f); + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */ + case PIPE_BLENDFACTOR_ONE: + /* factor = 1, so term = A */ + spe_move(f, term1A_reg, fragA_reg); + break; + + case PIPE_BLENDFACTOR_SRC_COLOR: + /* factor = A, so term = A*A */ + spe_fm(f, term1A_reg, fragA_reg, fragA_reg); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + spe_fm(f, term1A_reg, fragA_reg, fragA_reg); + break; + + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* factor = 1-A, so term = A*(1-A) = A-A*A */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg); + break; + + case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_DST_COLOR: + /* factor = Afb, so term = A*Afb */ + spe_fm(f, term1A_reg, fragA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg); + break; + + case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_CONST_COLOR: + /* We need the optional constA_reg register */ + setup_const_register(f, &constA_reg, blend_color->color[3]); + /* factor = Ac, so term = A*Ac */ + spe_fm(f, term1A_reg, fragA_reg, constA_reg); + break; + + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + /* We need the optional constA_reg register */ + setup_const_register(f, &constA_reg, blend_color->color[3]); + /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg); + break; + + /* These are special D3D cases involving a second color output + * from the fragment shader. I'm not sure we can support them + * yet... XXX + */ + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + default: + ASSERT(0); + } + + /* + * Compute Dest RGB term. Like the above, we're looking for + * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because + * in many cases we can avoid doing unnecessary multiplies. + */ + switch (blend->rgb_dst_factor) { + case PIPE_BLENDFACTOR_ONE: + /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */ + spe_move(f, term2R_reg, fbR_reg); + spe_move(f, term2G_reg, fbG_reg); + spe_move(f, term2B_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_ZERO: + /* factor s= (0,0,0), so term = (0,0,0) */ + spe_load_float(f, term2R_reg, 0.0f); + spe_load_float(f, term2G_reg, 0.0f); + spe_load_float(f, term2B_reg, 0.0f); + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */ + spe_fm(f, term2R_reg, fbR_reg, fragR_reg); + spe_fm(f, term2G_reg, fbG_reg, fragG_reg); + spe_fm(f, term2B_reg, fbB_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B)) + * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg); + spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg); + spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */ + spe_fm(f, term2R_reg, fbR_reg, fragA_reg); + spe_fm(f, term2G_reg, fbG_reg, fragA_reg); + spe_fm(f, term2B_reg, fbB_reg, fragA_reg); + break; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */ + spe_fm(f, term2R_reg, fbR_reg, fbR_reg); + spe_fm(f, term2G_reg, fbG_reg, fbG_reg); + spe_fm(f, term2B_reg, fbB_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb)) + * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg); + break; + + case PIPE_BLENDFACTOR_DST_ALPHA: + /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */ + spe_fm(f, term2R_reg, fbR_reg, fbA_reg); + spe_fm(f, term2G_reg, fbG_reg, fbA_reg); + spe_fm(f, term2B_reg, fbB_reg, fbA_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb)) + * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg, blend_color->color[2]); + /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */ + spe_fm(f, term2R_reg, fbR_reg, constR_reg); + spe_fm(f, term2G_reg, fbG_reg, constG_reg); + spe_fm(f, term2B_reg, fbB_reg, constB_reg); + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + /* we'll need the optional constant alpha register */ + setup_const_register(f, &constA_reg, blend_color->color[3]); + /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */ + spe_fm(f, term2R_reg, fbR_reg, constA_reg); + spe_fm(f, term2G_reg, fbG_reg, constA_reg); + spe_fm(f, term2B_reg, fbB_reg, constA_reg); + break; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg, blend_color->color[2]); + /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc)) + * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg, blend_color->color[2]); + /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac)) + * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest RGB */ + ASSERT(0); + break; + + /* These are special D3D cases involving a second color output + * from the fragment shader. I'm not sure we can support them + * yet... XXX + */ + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + + default: + ASSERT(0); + } + + /* + * Compute Dest Alpha term. Like the above, we're looking for + * the full term Afb*factor, not just the factor itself, because + * in many cases we can avoid doing unnecessary multiplies. + */ + switch (blend->alpha_dst_factor) { + case PIPE_BLENDFACTOR_ONE: + /* factor = 1, so term = Afb */ + spe_move(f, term2A_reg, fbA_reg); + break; + case PIPE_BLENDFACTOR_ZERO: + /* factor = 0, so term = 0 */ + spe_load_float(f, term2A_reg, 0.0f); + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_SRC_COLOR: + /* factor = A, so term = Afb*A */ + spe_fm(f, term2A_reg, fbA_reg, fragA_reg); + break; + + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_DST_COLOR: + /* factor = Afb, so term = Afb*Afb */ + spe_fm(f, term2A_reg, fbA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_CONST_COLOR: + /* We need the optional constA_reg register */ + setup_const_register(f, &constA_reg, blend_color->color[3]); + /* factor = Ac, so term = Afb*Ac */ + spe_fm(f, term2A_reg, fbA_reg, constA_reg); + break; + + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + /* We need the optional constA_reg register */ + setup_const_register(f, &constA_reg, blend_color->color[3]); + /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */ + ASSERT(0); + break; + + /* These are special D3D cases involving a second color output + * from the fragment shader. I'm not sure we can support them + * yet... XXX + */ + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + default: + ASSERT(0); + } + + /* + * Combine Src/Dest RGB terms as per the blend equation. + */ + switch (blend->rgb_func) { + case PIPE_BLEND_ADD: + spe_fa(f, fragR_reg, term1R_reg, term2R_reg); + spe_fa(f, fragG_reg, term1G_reg, term2G_reg); + spe_fa(f, fragB_reg, term1B_reg, term2B_reg); + break; + case PIPE_BLEND_SUBTRACT: + spe_fs(f, fragR_reg, term1R_reg, term2R_reg); + spe_fs(f, fragG_reg, term1G_reg, term2G_reg); + spe_fs(f, fragB_reg, term1B_reg, term2B_reg); + break; + case PIPE_BLEND_REVERSE_SUBTRACT: + spe_fs(f, fragR_reg, term2R_reg, term1R_reg); + spe_fs(f, fragG_reg, term2G_reg, term1G_reg); + spe_fs(f, fragB_reg, term2B_reg, term1B_reg); + break; + case PIPE_BLEND_MIN: + spe_float_min(f, fragR_reg, term1R_reg, term2R_reg); + spe_float_min(f, fragG_reg, term1G_reg, term2G_reg); + spe_float_min(f, fragB_reg, term1B_reg, term2B_reg); + break; + case PIPE_BLEND_MAX: + spe_float_max(f, fragR_reg, term1R_reg, term2R_reg); + spe_float_max(f, fragG_reg, term1G_reg, term2G_reg); + spe_float_max(f, fragB_reg, term1B_reg, term2B_reg); + break; + default: + ASSERT(0); + } + + /* + * Combine Src/Dest A term + */ + switch (blend->alpha_func) { + case PIPE_BLEND_ADD: + spe_fa(f, fragA_reg, term1A_reg, term2A_reg); + break; + case PIPE_BLEND_SUBTRACT: + spe_fs(f, fragA_reg, term1A_reg, term2A_reg); + break; + case PIPE_BLEND_REVERSE_SUBTRACT: + spe_fs(f, fragA_reg, term2A_reg, term1A_reg); + break; + case PIPE_BLEND_MIN: + spe_float_min(f, fragA_reg, term1A_reg, term2A_reg); + break; + case PIPE_BLEND_MAX: + spe_float_max(f, fragA_reg, term1A_reg, term2A_reg); + break; + default: + ASSERT(0); + } + + spe_release_register(f, term1R_reg); + spe_release_register(f, term1G_reg); + spe_release_register(f, term1B_reg); + spe_release_register(f, term1A_reg); + + spe_release_register(f, term2R_reg); + spe_release_register(f, term2G_reg); + spe_release_register(f, term2B_reg); + spe_release_register(f, term2A_reg); + + spe_release_register(f, fbR_reg); + spe_release_register(f, fbG_reg); + spe_release_register(f, fbB_reg); + spe_release_register(f, fbA_reg); + + spe_release_register(f, tmp_reg); + + /* Free any optional registers that actually got used */ + release_const_register(f, one_reg); + release_const_register(f, constR_reg); + release_const_register(f, constG_reg); + release_const_register(f, constB_reg); + release_const_register(f, constA_reg); +} + + +static void +gen_logicop(const struct pipe_blend_state *blend, + struct spe_function *f, + int fragRGBA_reg, int fbRGBA_reg) +{ + /* We've got four 32-bit RGBA packed pixels in each of + * fragRGBA_reg and fbRGBA_reg, not sets of floating-point + * reds, greens, blues, and alphas. + * */ + ASSERT(blend->logicop_enable); + + switch(blend->logicop_func) { + case PIPE_LOGICOP_CLEAR: /* 0 */ + spe_zero(f, fragRGBA_reg); + break; + case PIPE_LOGICOP_NOR: /* ~(s | d) */ + spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */ + /* andc R, A, B computes R = A & ~B */ + spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg); + break; + case PIPE_LOGICOP_COPY_INVERTED: /* ~s */ + spe_complement(f, fragRGBA_reg, fragRGBA_reg); + break; + case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */ + /* andc R, A, B computes R = A & ~B */ + spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_INVERT: /* ~d */ + /* Note that (A nor A) == ~(A|A) == ~A */ + spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_XOR: /* s ^ d */ + spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_NAND: /* ~(s & d) */ + spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_AND: /* s & d */ + spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */ + spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + spe_complement(f, fragRGBA_reg, fragRGBA_reg); + break; + case PIPE_LOGICOP_NOOP: /* d */ + spe_move(f, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */ + /* orc R, A, B computes R = A | ~B */ + spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg); + break; + case PIPE_LOGICOP_COPY: /* s */ + break; + case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */ + /* orc R, A, B computes R = A | ~B */ + spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_OR: /* s | d */ + spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_SET: /* 1 */ + spe_load_int(f, fragRGBA_reg, 0xffffffff); + break; + default: + ASSERT(0); + } +} + + +/** + * Generate code to pack a quad of float colors into four 32-bit integers. + * + * \param f SPE function to append instruction onto. + * \param color_format the dest color packing format + * \param r_reg register containing four red values (in/clobbered) + * \param g_reg register containing four green values (in/clobbered) + * \param b_reg register containing four blue values (in/clobbered) + * \param a_reg register containing four alpha values (in/clobbered) + * \param rgba_reg register to store the packed RGBA colors (out) + */ +static void +gen_pack_colors(struct spe_function *f, + enum pipe_format color_format, + int r_reg, int g_reg, int b_reg, int a_reg, + int rgba_reg) +{ + int rg_reg = spe_allocate_available_register(f); + int ba_reg = spe_allocate_available_register(f); + + /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */ + spe_cfltu(f, r_reg, r_reg, 32); + spe_cfltu(f, g_reg, g_reg, 32); + spe_cfltu(f, b_reg, b_reg, 32); + spe_cfltu(f, a_reg, a_reg, 32); + + /* Shift the most significant bytes to the least significant positions. + * I.e.: reg = reg >> 24 + */ + spe_rotmi(f, r_reg, r_reg, -24); + spe_rotmi(f, g_reg, g_reg, -24); + spe_rotmi(f, b_reg, b_reg, -24); + spe_rotmi(f, a_reg, a_reg, -24); + + /* Shift the color bytes according to the surface format */ + if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) { + spe_roti(f, g_reg, g_reg, 8); /* green <<= 8 */ + spe_roti(f, r_reg, r_reg, 16); /* red <<= 16 */ + spe_roti(f, a_reg, a_reg, 24); /* alpha <<= 24 */ + } + else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) { + spe_roti(f, r_reg, r_reg, 8); /* red <<= 8 */ + spe_roti(f, g_reg, g_reg, 16); /* green <<= 16 */ + spe_roti(f, b_reg, b_reg, 24); /* blue <<= 24 */ + } + else { + ASSERT(0); + } + + /* Merge red, green, blue, alpha registers to make packed RGBA colors. + * Eg: after shifting according to color_format we might have: + * R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000} + * G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600} + * B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099} + * A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000} + * OR-ing all those together gives us four packed colors: + * RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699} + */ + spe_or(f, rg_reg, r_reg, g_reg); + spe_or(f, ba_reg, a_reg, b_reg); + spe_or(f, rgba_reg, rg_reg, ba_reg); + + spe_release_register(f, rg_reg); + spe_release_register(f, ba_reg); +} + + +static void +gen_colormask(struct spe_function *f, + uint colormask, + enum pipe_format color_format, + int fragRGBA_reg, int fbRGBA_reg) +{ + /* We've got four 32-bit RGBA packed pixels in each of + * fragRGBA_reg and fbRGBA_reg, not sets of floating-point + * reds, greens, blues, and alphas. Further, the pixels + * are packed according to the given color format, not + * necessarily RGBA... + */ + uint r_mask; + uint g_mask; + uint b_mask; + uint a_mask; + + /* Calculate exactly where the bits for any particular color + * end up, so we can mask them correctly. + */ + switch(color_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + /* ARGB */ + a_mask = 0xff000000; + r_mask = 0x00ff0000; + g_mask = 0x0000ff00; + b_mask = 0x000000ff; + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + /* BGRA */ + b_mask = 0xff000000; + g_mask = 0x00ff0000; + r_mask = 0x0000ff00; + a_mask = 0x000000ff; + break; + default: + ASSERT(0); + } + + /* For each R, G, B, and A component we're supposed to mask out, + * clear its bits. Then our mask operation later will work + * as expected. + */ + if (!(colormask & PIPE_MASK_R)) { + r_mask = 0; + } + if (!(colormask & PIPE_MASK_G)) { + g_mask = 0; + } + if (!(colormask & PIPE_MASK_B)) { + b_mask = 0; + } + if (!(colormask & PIPE_MASK_A)) { + a_mask = 0; + } + + /* Get a temporary register to hold the mask that will be applied + * to the fragment + */ + int colormask_reg = spe_allocate_available_register(f); + + /* The actual mask we're going to use is an OR of the remaining R, G, B, + * and A masks. Load the result value into our temporary register. + */ + spe_load_uint(f, colormask_reg, r_mask | g_mask | b_mask | a_mask); + + /* Use the mask register to select between the fragment color + * values and the frame buffer color values. Wherever the + * mask has a 0 bit, the current frame buffer color should override + * the fragment color. Wherever the mask has a 1 bit, the + * fragment color should persevere. The Select Bits (selb rt, rA, rB, rM) + * instruction will select bits from its first operand rA wherever the + * the mask bits rM are 0, and from its second operand rB wherever the + * mask bits rM are 1. That means that the frame buffer color is the + * first operand, and the fragment color the second. + */ + spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg); + + /* Release the temporary register and we're done */ + spe_release_register(f, colormask_reg); +} + + +/** + * This function is annoyingly similar to gen_depth_test(), above, except + * that instead of comparing two varying values (i.e. fragment and buffer), + * we're comparing a varying value with a static value. As such, we have + * access to the Compare Immediate instructions where we don't in + * gen_depth_test(), which is what makes us very different. + * + * There's some added complexity if there's a non-trivial state->mask + * value; then stencil and reference both must be masked + * + * The return value in the stencil_pass_reg is a bitmask of valid + * fragments that also passed the stencil test. The bitmask of valid + * fragments that failed would be found in + * (fragment_mask_reg & ~stencil_pass_reg). + */ +static void +gen_stencil_test(struct spe_function *f, + const struct pipe_stencil_state *state, + uint stencil_max_value, + int fragment_mask_reg, + int fbS_reg, + int stencil_pass_reg) +{ + /* Generate code that puts the set of passing fragments into the + * stencil_pass_reg register, taking into account whether each fragment + * was active to begin with. + */ + switch (state->func) { + case PIPE_FUNC_EQUAL: + if (state->valuemask == stencil_max_value) { + /* stencil_pass = fragment_mask & (s == reference) */ + spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ((s&mask) == (reference&mask)) */ + uint tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask); + spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, + state->valuemask & state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } + break; + + case PIPE_FUNC_NOTEQUAL: + if (state->valuemask == stencil_max_value) { + /* stencil_pass = fragment_mask & ~(s == reference) */ + spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ~((s&mask) == (reference&mask)) */ + int tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask); + spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, + state->valuemask & state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } + break; + + case PIPE_FUNC_LESS: + if (state->valuemask == stencil_max_value) { + /* stencil_pass = fragment_mask & (reference < s) */ + spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ((reference&mask) < (s & mask)) */ + int tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask); + spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, + state->valuemask & state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } + break; + + case PIPE_FUNC_GREATER: + if (state->valuemask == stencil_max_value) { + /* stencil_pass = fragment_mask & (reference > s) */ + /* There's no convenient Compare Less Than Immediate instruction, so + * we'll have to do this one the harder way, by loading a register and + * comparing directly. Compare Logical Greater Than Word (clgt) + * treats its operands as unsigned - no sign extension. + */ + int tmp_reg = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->ref_value); + spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + } + else { + /* stencil_pass = fragment_mask & ((reference&mask) > (s&mask)) */ + int tmp_reg = spe_allocate_available_register(f); + int tmp_masked_stencil = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->valuemask & state->ref_value); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask); + spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + spe_release_register(f, tmp_masked_stencil); + } + break; + + case PIPE_FUNC_GEQUAL: + if (state->valuemask == stencil_max_value) { + /* stencil_pass = fragment_mask & (reference >= s) + * = fragment_mask & ~(s > reference) */ + spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, + state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ~((s&mask) > (reference&mask)) */ + int tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask); + spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, + state->valuemask & state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } + break; + + case PIPE_FUNC_LEQUAL: + if (state->valuemask == stencil_max_value) { + /* stencil_pass = fragment_mask & (reference <= s) ] + * = fragment_mask & ~(reference > s) */ + /* As above, we have to do this by loading a register */ + int tmp_reg = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->ref_value); + spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + } + else { + /* stencil_pass = fragment_mask & ~((reference&mask) > (s&mask)) */ + int tmp_reg = spe_allocate_available_register(f); + int tmp_masked_stencil = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->ref_value & state->valuemask); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask); + spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + spe_release_register(f, tmp_masked_stencil); + } + break; + + case PIPE_FUNC_NEVER: + /* stencil_pass = fragment_mask & 0 = 0 */ + spe_load_uint(f, stencil_pass_reg, 0); + break; + + case PIPE_FUNC_ALWAYS: + /* stencil_pass = fragment_mask & 1 = fragment_mask */ + spe_move(f, stencil_pass_reg, fragment_mask_reg); + break; + } + + /* The fragments that passed the stencil test are now in stencil_pass_reg. + * The fragments that failed would be (fragment_mask_reg & ~stencil_pass_reg). + */ +} + + +/** + * This function generates code that calculates a set of new stencil values + * given the earlier values and the operation to apply. It does not + * apply any tests. It is intended to be called up to 3 times + * (for the stencil fail operation, for the stencil pass-z fail operation, + * and for the stencil pass-z pass operation) to collect up to three + * possible sets of values, and for the caller to combine them based + * on the result of the tests. + * + * stencil_max_value should be (2^n - 1) where n is the number of bits + * in the stencil buffer - in other words, it should be usable as a mask. + */ +static void +gen_stencil_values(struct spe_function *f, + uint stencil_op, + uint stencil_ref_value, + uint stencil_max_value, + int fbS_reg, + int newS_reg) +{ + /* The code below assumes that newS_reg and fbS_reg are not the same + * register; if they can be, the calculations below will have to use + * an additional temporary register. For now, mark the assumption + * with an assertion that will fail if they are the same. + */ + ASSERT(fbS_reg != newS_reg); + + /* The code also assumes the the stencil_max_value is of the form + * 2^n-1 and can therefore be used as a mask for the valid bits in + * addition to a maximum. Make sure this is the case as well. + * The clever math below exploits the fact that incrementing a + * binary number serves to flip all the bits of a number starting at + * the LSB and continuing to (and including) the first zero bit + * found. That means that a number and its increment will always + * have at least one bit in common (the high order bit, if nothing + * else) *unless* the number is zero, *or* the number is of a form + * consisting of some number of 1s in the low-order bits followed + * by nothing but 0s in the high-order bits. The latter case + * implies it's of the form 2^n-1. + */ + ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0); + + switch(stencil_op) { + case PIPE_STENCIL_OP_KEEP: + /* newS = S */ + spe_move(f, newS_reg, fbS_reg); + break; + + case PIPE_STENCIL_OP_ZERO: + /* newS = 0 */ + spe_zero(f, newS_reg); + break; + + case PIPE_STENCIL_OP_REPLACE: + /* newS = stencil reference value */ + spe_load_uint(f, newS_reg, stencil_ref_value); + break; + + case PIPE_STENCIL_OP_INCR: { + /* newS = (s == max ? max : s + 1) */ + int equals_reg = spe_allocate_available_register(f); + + spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value); + /* Add Word Immediate computes rT = rA + 10-bit signed immediate */ + spe_ai(f, newS_reg, fbS_reg, 1); + /* Select from the current value or the new value based on the equality test */ + spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg); + + spe_release_register(f, equals_reg); + break; + } + case PIPE_STENCIL_OP_DECR: { + /* newS = (s == 0 ? 0 : s - 1) */ + int equals_reg = spe_allocate_available_register(f); + + spe_compare_equal_uint(f, equals_reg, fbS_reg, 0); + /* Add Word Immediate with a (-1) value works */ + spe_ai(f, newS_reg, fbS_reg, -1); + /* Select from the current value or the new value based on the equality test */ + spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg); + + spe_release_register(f, equals_reg); + break; + } + case PIPE_STENCIL_OP_INCR_WRAP: + /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can + * do a normal add and mask off the correct bits + */ + spe_ai(f, newS_reg, fbS_reg, 1); + spe_and_uint(f, newS_reg, newS_reg, stencil_max_value); + break; + + case PIPE_STENCIL_OP_DECR_WRAP: + /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */ + spe_ai(f, newS_reg, fbS_reg, -1); + spe_and_uint(f, newS_reg, newS_reg, stencil_max_value); + break; + + case PIPE_STENCIL_OP_INVERT: + /* newS = ~s. We take advantage of the mask/max value to invert only + * the valid bits for the field so we don't have to do an extra "and". + */ + spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value); + break; + + default: + ASSERT(0); + } +} + + +/** + * This function generates code to get all the necessary possible + * stencil values. For each of the output registers (fail_reg, + * zfail_reg, and zpass_reg), it either allocates a new register + * and calculates a new set of values based on the stencil operation, + * or it reuses a register allocation and calculation done for an + * earlier (matching) operation, or it reuses the fbS_reg register + * (if the stencil operation is KEEP, which doesn't change the + * stencil buffer). + * + * Since this function allocates a variable number of registers, + * to avoid incurring complex logic to free them, they should + * be allocated after a spe_allocate_register_set() call + * and released by the corresponding spe_release_register_set() call. + */ +static void +gen_get_stencil_values(struct spe_function *f, + const struct pipe_stencil_state *stencil, + const uint depth_enabled, + int fbS_reg, + int *fail_reg, + int *zfail_reg, + int *zpass_reg) +{ + uint zfail_op; + + /* Stenciling had better be enabled here */ + ASSERT(stencil->enabled); + + /* If the depth test is not enabled, it is treated as though it always + * passes, which means that the zfail_op is not considered - a + * failing stencil test triggers the fail_op, and a passing one + * triggers the zpass_op + * + * As an optimization, override calculation of the zfail_op values + * if they aren't going to be used. By setting the value of + * the operation to PIPE_STENCIL_OP_KEEP, its value will be assumed + * to match the incoming stencil values, and no calculation will + * be done. + */ + if (depth_enabled) { + zfail_op = stencil->zfail_op; + } + else { + zfail_op = PIPE_STENCIL_OP_KEEP; + } + + /* One-sided or front-facing stencil */ + if (stencil->fail_op == PIPE_STENCIL_OP_KEEP) { + *fail_reg = fbS_reg; + } + else { + *fail_reg = spe_allocate_available_register(f); + gen_stencil_values(f, stencil->fail_op, stencil->ref_value, + 0xff, fbS_reg, *fail_reg); + } + + /* Check the possibly overridden value, not the structure value */ + if (zfail_op == PIPE_STENCIL_OP_KEEP) { + *zfail_reg = fbS_reg; + } + else if (zfail_op == stencil->fail_op) { + *zfail_reg = *fail_reg; + } + else { + *zfail_reg = spe_allocate_available_register(f); + gen_stencil_values(f, stencil->zfail_op, stencil->ref_value, + 0xff, fbS_reg, *zfail_reg); + } + + if (stencil->zpass_op == PIPE_STENCIL_OP_KEEP) { + *zpass_reg = fbS_reg; + } + else if (stencil->zpass_op == stencil->fail_op) { + *zpass_reg = *fail_reg; + } + else if (stencil->zpass_op == zfail_op) { + *zpass_reg = *zfail_reg; + } + else { + *zpass_reg = spe_allocate_available_register(f); + gen_stencil_values(f, stencil->zpass_op, stencil->ref_value, + 0xff, fbS_reg, *zpass_reg); + } +} + +/** + * Note that fbZ_reg may *not* be set on entry, if in fact + * the depth test is not enabled. This function must not use + * the register if depth is not enabled. + */ +static boolean +gen_stencil_depth_test(struct spe_function *f, + const struct pipe_depth_stencil_alpha_state *dsa, + const uint facing, + const int mask_reg, const int fragZ_reg, + const int fbZ_reg, const int fbS_reg) +{ + /* True if we've generated code that could require writeback to the + * depth and/or stencil buffers + */ + boolean modified_buffers = FALSE; + + boolean need_to_calculate_stencil_values; + boolean need_to_writemask_stencil_values; + + struct pipe_stencil_state *stencil; + + /* Registers. We may or may not actually allocate these, depending + * on whether the state values indicate that we need them. + */ + int stencil_pass_reg, stencil_fail_reg; + int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values; + int stencil_writemask_reg; + int zmask_reg; + int newS_reg; + + /* Stenciling is quite complex: up to six different configurable stencil + * operations/calculations can be required (three each for front-facing + * and back-facing fragments). Many of those operations will likely + * be identical, so there's good reason to try to avoid calculating + * the same values more than once (which unfortunately makes the code less + * straightforward). + * + * To make register management easier, we start a new + * register set; we can release all the registers in the set at + * once, and avoid having to keep track of exactly which registers + * we allocate. We can still allocate and free registers as + * desired (if we know we no longer need a register), but we don't + * have to spend the complexity to track the more difficult variant + * register usage scenarios. + */ + spe_comment(f, 0, "Allocating stencil register set"); + spe_allocate_register_set(f); + + /* The facing we're given is the fragment facing; it doesn't + * exactly match the stencil facing. If stencil is enabled, + * but two-sided stencil is *not* enabled, we use the same + * stencil settings for both front- and back-facing fragments. + * We only use the "back-facing" stencil for backfacing fragments + * if two-sided stenciling is enabled. + */ + if (facing == CELL_FACING_BACK && dsa->stencil[1].enabled) { + stencil = &dsa->stencil[1]; + } + else { + stencil = &dsa->stencil[0]; + } + + /* Calculate the writemask. If the writemask is trivial (either + * all 0s, meaning that we don't need to calculate any stencil values + * because they're not going to change the stencil anyway, or all 1s, + * meaning that we have to calculate the stencil values but do not + * need to mask them), we can avoid generating code. Don't forget + * that we need to consider backfacing stencil, if enabled. + * + * Note that if the backface stencil is *not* enabled, the backface + * stencil will have the same values as the frontface stencil. + */ + if (stencil->fail_op == PIPE_STENCIL_OP_KEEP && + stencil->zfail_op == PIPE_STENCIL_OP_KEEP && + stencil->zpass_op == PIPE_STENCIL_OP_KEEP) { + need_to_calculate_stencil_values = FALSE; + need_to_writemask_stencil_values = FALSE; + } + else if (stencil->writemask == 0x0) { + /* All changes are writemasked out, so no need to calculate + * what those changes might be, and no need to write anything back. + */ + need_to_calculate_stencil_values = FALSE; + need_to_writemask_stencil_values = FALSE; + } + else if (stencil->writemask == 0xff) { + /* Still trivial, but a little less so. We need to write the stencil + * values, but we don't need to mask them. + */ + need_to_calculate_stencil_values = TRUE; + need_to_writemask_stencil_values = FALSE; + } + else { + /* The general case: calculate, mask, and write */ + need_to_calculate_stencil_values = TRUE; + need_to_writemask_stencil_values = TRUE; + + /* While we're here, generate code that calculates what the + * writemask should be. If backface stenciling is enabled, + * and the backface writemask is not the same as the frontface + * writemask, we'll have to generate code that merges the + * two masks into a single effective mask based on fragment facing. + */ + spe_comment(f, 0, "Computing stencil writemask"); + stencil_writemask_reg = spe_allocate_available_register(f); + spe_load_uint(f, stencil_writemask_reg, dsa->stencil[facing].writemask); + } + + /* At least one-sided stenciling must be on. Generate code that + * runs the stencil test on the basic/front-facing stencil, leaving + * the mask of passing stencil bits in stencil_pass_reg. This mask will + * be used both to mask the set of active pixels, and also to + * determine how the stencil buffer changes. + * + * This test will *not* change the value in mask_reg (because we don't + * yet know whether to apply the two-sided stencil or one-sided stencil). + */ + spe_comment(f, 0, "Running basic stencil test"); + stencil_pass_reg = spe_allocate_available_register(f); + gen_stencil_test(f, stencil, 0xff, mask_reg, fbS_reg, stencil_pass_reg); + + /* Generate code that, given the mask of valid fragments and the + * mask of valid fragments that passed the stencil test, computes + * the mask of valid fragments that failed the stencil test. We + * have to do this before we run a depth test (because the + * depth test should not be performed on fragments that failed the + * stencil test, and because the depth test will update the + * mask of valid fragments based on the results of the depth test). + */ + spe_comment(f, 0, "Computing stencil fail mask and updating fragment mask"); + stencil_fail_reg = spe_allocate_available_register(f); + spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg); + /* Now remove the stenciled-out pixels from the valid fragment mask, + * so we can later use the valid fragment mask in the depth test. + */ + spe_and(f, mask_reg, mask_reg, stencil_pass_reg); + + /* We may not need to calculate stencil values, if the writemask is off */ + if (need_to_calculate_stencil_values) { + /* Generate code that calculates exactly which stencil values we need, + * without calculating the same value twice (say, if two different + * stencil ops have the same value). This code will work for one-sided + * and two-sided stenciling (so that we take into account that operations + * may match between front and back stencils), and will also take into + * account whether the depth test is enabled (if the depth test is off, + * we don't need any of the zfail results, because the depth test always + * is considered to pass if it is disabled). Any register value that + * does not need to be calculated will come back with the same value + * that's in fbS_reg. + * + * This function will allocate a variant number of registers that + * will be released as part of the register set. + */ + spe_comment(f, 0, facing == CELL_FACING_FRONT + ? "Computing front-facing stencil values" + : "Computing back-facing stencil values"); + gen_get_stencil_values(f, stencil, dsa->depth.enabled, fbS_reg, + &stencil_fail_values, &stencil_pass_depth_fail_values, + &stencil_pass_depth_pass_values); + } + + /* We now have all the stencil values we need. We also need + * the results of the depth test to figure out which + * stencil values will become the new stencil values. (Even if + * we aren't actually calculating stencil values, we need to apply + * the depth test if it's enabled.) + * + * The code generated by gen_depth_test() returns the results of the + * test in the given register, but also alters the mask_reg based + * on the results of the test. + */ + if (dsa->depth.enabled) { + spe_comment(f, 0, "Running stencil depth test"); + zmask_reg = spe_allocate_available_register(f); + modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, + fbZ_reg, zmask_reg); + } + + if (need_to_calculate_stencil_values) { + + /* If we need to writemask the stencil values before going into + * the stencil buffer, we'll have to use a new register to + * hold the new values. If not, we can just keep using the + * current register. + */ + if (need_to_writemask_stencil_values) { + newS_reg = spe_allocate_available_register(f); + spe_comment(f, 0, "Saving current stencil values for writemasking"); + spe_move(f, newS_reg, fbS_reg); + } + else { + newS_reg = fbS_reg; + } + + /* Merge in the selected stencil fail values */ + if (stencil_fail_values != fbS_reg) { + spe_comment(f, 0, "Loading stencil fail values"); + spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg); + modified_buffers = TRUE; + } + + /* Same for the stencil pass/depth fail values. If this calculation + * is not needed (say, if depth test is off), then the + * stencil_pass_depth_fail_values register will be equal to fbS_reg + * and we'll skip the calculation. + */ + if (stencil_pass_depth_fail_values != fbS_reg) { + /* We don't actually have a stencil pass/depth fail mask yet. + * Calculate it here from the stencil passing mask and the + * depth passing mask. Note that zmask_reg *must* have been + * set above if we're here. + */ + uint stencil_pass_depth_fail_mask = + spe_allocate_available_register(f); + + spe_comment(f, 0, "Loading stencil pass/depth fail values"); + spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg); + + spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, + stencil_pass_depth_fail_mask); + + spe_release_register(f, stencil_pass_depth_fail_mask); + modified_buffers = TRUE; + } + + /* Same for the stencil pass/depth pass mask. Note that we + * *can* get here with zmask_reg being unset (if the depth + * test is off but the stencil test is on). In this case, + * we assume the depth test passes, and don't need to mask + * the stencil pass mask with the Z mask. + */ + if (stencil_pass_depth_pass_values != fbS_reg) { + if (dsa->depth.enabled) { + uint stencil_pass_depth_pass_mask = spe_allocate_available_register(f); + /* We'll need a separate register */ + spe_comment(f, 0, "Loading stencil pass/depth pass values"); + spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg); + spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask); + spe_release_register(f, stencil_pass_depth_pass_mask); + } + else { + /* We can use the same stencil-pass register */ + spe_comment(f, 0, "Loading stencil pass values"); + spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_reg); + } + modified_buffers = TRUE; + } + + /* Almost done. If we need to writemask, do it now, leaving the + * results in the fbS_reg register passed in. If we don't need + * to writemask, then the results are *already* in the fbS_reg, + * so there's nothing more to do. + */ + + if (need_to_writemask_stencil_values && modified_buffers) { + /* The Select Bytes command makes a fine writemask. Where + * the mask is 0, the first (original) values are retained, + * effectively masking out changes. Where the mask is 1, the + * second (new) values are retained, incorporating changes. + */ + spe_comment(f, 0, "Writemasking new stencil values"); + spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg); + } + + } /* done calculating stencil values */ + + /* The stencil and/or depth values have been applied, and the + * mask_reg, fbS_reg, and fbZ_reg values have been updated. + * We're all done, except that we've allocated a fair number + * of registers that we didn't bother tracking. Release all + * those registers as part of the register set, and go home. + */ + spe_comment(f, 0, "Releasing stencil register set"); + spe_release_register_set(f); + + /* Return TRUE if we could have modified the stencil and/or + * depth buffers. + */ + return modified_buffers; +} + + +/** + * Generate depth and/or stencil test code. + * \param cell context + * \param dsa depth/stencil/alpha state + * \param f spe function to emit + * \param facing either CELL_FACING_FRONT or CELL_FACING_BACK + * \param mask_reg register containing the pixel alive/dead mask + * \param depth_tile_reg register containing address of z/stencil tile + * \param quad_offset_reg offset to quad from start of tile + * \param fragZ_reg register containg fragment Z values + */ +static void +gen_depth_stencil(struct cell_context *cell, + const struct pipe_depth_stencil_alpha_state *dsa, + struct spe_function *f, + uint facing, + int mask_reg, + int depth_tile_reg, + int quad_offset_reg, + int fragZ_reg) + +{ + const enum pipe_format zs_format = cell->framebuffer.zsbuf->format; + boolean write_depth_stencil; + + /* framebuffer's combined z/stencil values register */ + int fbZS_reg = spe_allocate_available_register(f); + + /* Framebufer Z values register */ + int fbZ_reg = spe_allocate_available_register(f); + + /* Framebuffer stencil values register (may not be used) */ + int fbS_reg = spe_allocate_available_register(f); + + /* 24-bit mask register (may not be used) */ + int zmask_reg = spe_allocate_available_register(f); + + /** + * The following code: + * 1. fetch quad of packed Z/S values from the framebuffer tile. + * 2. extract the separate the Z and S values from packed values + * 3. convert fragment Z values from float in [0,1] to 32/24/16-bit ints + * + * The instructions for doing this are interleaved for better performance. + */ + spe_comment(f, 0, "Fetch Z/stencil quad from tile"); + + switch(zs_format) { + case PIPE_FORMAT_S8Z24_UNORM: /* fall through */ + case PIPE_FORMAT_X8Z24_UNORM: + /* prepare mask to extract Z vals from ZS vals */ + spe_load_uint(f, zmask_reg, 0x00ffffff); + + /* convert fragment Z from [0,1] to 32-bit ints */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + + /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ + spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); + + /* right shift 32-bit fragment Z to 24 bits */ + spe_rotmi(f, fragZ_reg, fragZ_reg, -8); + + /* extract 24-bit Z values from ZS values by masking */ + spe_and(f, fbZ_reg, fbZS_reg, zmask_reg); + + /* extract 8-bit stencil values by shifting */ + spe_rotmi(f, fbS_reg, fbZS_reg, -24); + break; + + case PIPE_FORMAT_Z24S8_UNORM: /* fall through */ + case PIPE_FORMAT_Z24X8_UNORM: + /* convert fragment Z from [0,1] to 32-bit ints */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + + /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ + spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); + + /* right shift 32-bit fragment Z to 24 bits */ + spe_rotmi(f, fragZ_reg, fragZ_reg, -8); + + /* extract 24-bit Z values from ZS values by shifting */ + spe_rotmi(f, fbZ_reg, fbZS_reg, -8); + + /* extract 8-bit stencil values by masking */ + spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff); + break; + + case PIPE_FORMAT_Z32_UNORM: + /* Load: fbZ_reg = memory[depth_tile_reg + offset_reg] */ + spe_lqx(f, fbZ_reg, depth_tile_reg, quad_offset_reg); + + /* convert fragment Z from [0,1] to 32-bit ints */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + + /* No stencil, so can't do anything there */ + break; + + case PIPE_FORMAT_Z16_UNORM: + /* XXX This code for 16bpp Z is broken! */ + + /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ + spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); + + /* Copy over 4 32-bit values */ + spe_move(f, fbZ_reg, fbZS_reg); + + /* convert Z from [0,1] to 16-bit ints */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + spe_rotmi(f, fragZ_reg, fragZ_reg, -16); + /* No stencil */ + break; + + default: + ASSERT(0); /* invalid format */ + } + + /* If stencil is enabled, use the stencil-specific code + * generator to generate both the stencil and depth (if needed) + * tests. Otherwise, if only depth is enabled, generate + * a quick depth test. The test generators themselves will + * report back whether the depth/stencil buffer has to be + * written back. + */ + if (dsa->stencil[0].enabled) { + /* This will perform the stencil and depth tests, and update + * the mask_reg, fbZ_reg, and fbS_reg as required by the + * tests. + */ + ASSERT(fbS_reg >= 0); + spe_comment(f, 0, "Perform stencil test"); + + /* Note that fbZ_reg may not be set on entry, if stenciling + * is enabled but there's no Z-buffer. The + * gen_stencil_depth_test() function must ignore the + * fbZ_reg register if depth is not enabled. + */ + write_depth_stencil = gen_stencil_depth_test(f, dsa, facing, + mask_reg, fragZ_reg, + fbZ_reg, fbS_reg); + } + else if (dsa->depth.enabled) { + int zmask_reg = spe_allocate_available_register(f); + ASSERT(fbZ_reg >= 0); + spe_comment(f, 0, "Perform depth test"); + write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, + fbZ_reg, zmask_reg); + spe_release_register(f, zmask_reg); + } + else { + write_depth_stencil = FALSE; + } + + if (write_depth_stencil) { + /* Merge latest Z and Stencil values into fbZS_reg. + * fbZ_reg has four Z vals in bits [23..0] or bits [15..0]. + * fbS_reg has four 8-bit Z values in bits [7..0]. + */ + spe_comment(f, 0, "Store quad's depth/stencil values in tile"); + if (zs_format == PIPE_FORMAT_S8Z24_UNORM || + zs_format == PIPE_FORMAT_X8Z24_UNORM) { + spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ + spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ + } + else if (zs_format == PIPE_FORMAT_Z24S8_UNORM || + zs_format == PIPE_FORMAT_Z24X8_UNORM) { + spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */ + spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ + } + else if (zs_format == PIPE_FORMAT_Z32_UNORM) { + spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ + } + else if (zs_format == PIPE_FORMAT_Z16_UNORM) { + spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ + } + else if (zs_format == PIPE_FORMAT_S8_UNORM) { + ASSERT(0); /* XXX to do */ + } + else { + ASSERT(0); /* bad zs_format */ + } + + /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */ + spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); + } + + /* Don't need these any more */ + spe_release_register(f, fbZS_reg); + spe_release_register(f, fbZ_reg); + spe_release_register(f, fbS_reg); + spe_release_register(f, zmask_reg); +} + + + +/** + * Generate SPE code to implement the fragment operations (alpha test, + * depth test, stencil test, blending, colormask, and final + * framebuffer write) as specified by the current context state. + * + * Logically, this code will be called after running the fragment + * shader. But under some circumstances we could run some of this + * code before the fragment shader to cull fragments/quads that are + * totally occluded/discarded. + * + * XXX we only support PIPE_FORMAT_Z24S8_UNORM z/stencil buffer right now. + * + * See the spu_default_fragment_ops() function to see how the per-fragment + * operations would be done with ordinary C code. + * The code we generate here though has no branches, is SIMD, etc and + * should be much faster. + * + * \param cell the rendering context (in) + * \param facing whether the generated code is for front-facing or + * back-facing fragments + * \param f the generated function (in/out); on input, the function + * must already have been initialized. On exit, whatever + * instructions within the generated function have had + * the fragment ops appended. + */ +void +cell_gen_fragment_function(struct cell_context *cell, + const uint facing, + struct spe_function *f) +{ + const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil; + const struct pipe_blend_state *blend = cell->blend; + const struct pipe_blend_color *blend_color = &cell->blend_color; + const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format; + + /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */ + const int x_reg = 3; /* uint */ + const int y_reg = 4; /* uint */ + const int color_tile_reg = 5; /* tile_t * */ + const int depth_tile_reg = 6; /* tile_t * */ + const int fragZ_reg = 7; /* vector float */ + const int fragR_reg = 8; /* vector float */ + const int fragG_reg = 9; /* vector float */ + const int fragB_reg = 10; /* vector float */ + const int fragA_reg = 11; /* vector float */ + const int mask_reg = 12; /* vector uint */ + + ASSERT(facing == CELL_FACING_FRONT || facing == CELL_FACING_BACK); + + /* offset of quad from start of tile + * XXX assuming 4-byte pixels for color AND Z/stencil!!!! + */ + int quad_offset_reg; + + int fbRGBA_reg; /**< framebuffer's RGBA colors for quad */ + + if (cell->debug_flags & CELL_DEBUG_ASM) { + spe_print_code(f, TRUE); + spe_indent(f, 8); + spe_comment(f, -4, facing == CELL_FACING_FRONT + ? "Begin front-facing per-fragment ops" + : "Begin back-facing per-fragment ops"); + } + + spe_allocate_register(f, x_reg); + spe_allocate_register(f, y_reg); + spe_allocate_register(f, color_tile_reg); + spe_allocate_register(f, depth_tile_reg); + spe_allocate_register(f, fragZ_reg); + spe_allocate_register(f, fragR_reg); + spe_allocate_register(f, fragG_reg); + spe_allocate_register(f, fragB_reg); + spe_allocate_register(f, fragA_reg); + spe_allocate_register(f, mask_reg); + + quad_offset_reg = spe_allocate_available_register(f); + fbRGBA_reg = spe_allocate_available_register(f); + + /* compute offset of quad from start of tile, in bytes */ + { + int x2_reg = spe_allocate_available_register(f); + int y2_reg = spe_allocate_available_register(f); + + ASSERT(TILE_SIZE == 32); + + spe_comment(f, 0, "Compute quad offset within tile"); + spe_rotmi(f, y2_reg, y_reg, -1); /* y2 = y / 2 */ + spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */ + spe_shli(f, y2_reg, y2_reg, 4); /* y2 *= 16 */ + spe_a(f, quad_offset_reg, y2_reg, x2_reg); /* offset = y2 + x2 */ + spe_shli(f, quad_offset_reg, quad_offset_reg, 4); /* offset *= 16 */ + + spe_release_register(f, x2_reg); + spe_release_register(f, y2_reg); + } + + /* Generate the alpha test, if needed. */ + if (dsa->alpha.enabled) { + gen_alpha_test(dsa, f, mask_reg, fragA_reg); + } + + /* generate depth and/or stencil test code */ + if (dsa->depth.enabled || dsa->stencil[0].enabled) { + gen_depth_stencil(cell, dsa, f, + facing, + mask_reg, + depth_tile_reg, + quad_offset_reg, + fragZ_reg); + } + + /* Get framebuffer quad/colors. We'll need these for blending, + * color masking, and to obey the quad/pixel mask. + * Load: fbRGBA_reg = memory[color_tile + quad_offset] + * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking + * we could skip this load. + */ + spe_comment(f, 0, "Fetch quad colors from tile"); + spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg); + + if (blend->blend_enable) { + spe_comment(f, 0, "Perform blending"); + gen_blend(blend, blend_color, f, color_format, + fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg); + } + + /* + * Write fragment colors to framebuffer/tile. + * This involves converting the fragment colors from float[4] to the + * tile's specific format and obeying the quad/pixel mask. + */ + { + int rgba_reg = spe_allocate_available_register(f); + + /* Pack four float colors as four 32-bit int colors */ + spe_comment(f, 0, "Convert float quad colors to packed int framebuffer colors"); + gen_pack_colors(f, color_format, + fragR_reg, fragG_reg, fragB_reg, fragA_reg, + rgba_reg); + + if (blend->logicop_enable) { + spe_comment(f, 0, "Compute logic op"); + gen_logicop(blend, f, rgba_reg, fbRGBA_reg); + } + + if (blend->colormask != PIPE_MASK_RGBA) { + spe_comment(f, 0, "Compute color mask"); + gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg); + } + + /* Mix fragment colors with framebuffer colors using the quad/pixel mask: + * if (mask[i]) + * rgba[i] = rgba[i]; + * else + * rgba[i] = framebuffer[i]; + */ + spe_selb(f, rgba_reg, fbRGBA_reg, rgba_reg, mask_reg); + + /* Store updated quad in tile: + * memory[color_tile + quad_offset] = rgba_reg; + */ + spe_comment(f, 0, "Store quad colors into color tile"); + spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg); + + spe_release_register(f, rgba_reg); + } + + //printf("gen_fragment_ops nr instructions: %u\n", f->num_inst); + + spe_bi(f, SPE_REG_RA, 0, 0); /* return from function call */ + + spe_release_register(f, fbRGBA_reg); + spe_release_register(f, quad_offset_reg); + + if (cell->debug_flags & CELL_DEBUG_ASM) { + char buffer[1024]; + sprintf(buffer, "End %s-facing per-fragment ops: %d instructions", + facing == CELL_FACING_FRONT ? "front" : "back", f->num_inst); + spe_comment(f, -4, buffer); + } +} diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h new file mode 100644 index 00000000000..21b35d1fafe --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h @@ -0,0 +1,38 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_GEN_FRAGMENT_H +#define CELL_GEN_FRAGMENT_H + + +extern void +cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct spe_function *f); + + +#endif /* CELL_GEN_FRAGMENT_H */ + diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c new file mode 100644 index 00000000000..ca358ed031e --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c @@ -0,0 +1,358 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: + * Keith Whitwell <[email protected]> + * Brian Paul + */ + +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "draw/draw_context.h" +#include "cell_context.h" +#include "cell_flush.h" +#include "cell_pipe_state.h" +#include "cell_state.h" +#include "cell_texture.h" + + + +static void * +cell_create_blend_state(struct pipe_context *pipe, + const struct pipe_blend_state *blend) +{ + return mem_dup(blend, sizeof(*blend)); +} + + +static void +cell_bind_blend_state(struct pipe_context *pipe, void *blend) +{ + struct cell_context *cell = cell_context(pipe); + + draw_flush(cell->draw); + + cell->blend = (struct pipe_blend_state *) blend; + cell->dirty |= CELL_NEW_BLEND; +} + + +static void +cell_delete_blend_state(struct pipe_context *pipe, void *blend) +{ + FREE(blend); +} + + +static void +cell_set_blend_color(struct pipe_context *pipe, + const struct pipe_blend_color *blend_color) +{ + struct cell_context *cell = cell_context(pipe); + + draw_flush(cell->draw); + + cell->blend_color = *blend_color; + + cell->dirty |= CELL_NEW_BLEND; +} + + + + +static void * +cell_create_depth_stencil_alpha_state(struct pipe_context *pipe, + const struct pipe_depth_stencil_alpha_state *dsa) +{ + return mem_dup(dsa, sizeof(*dsa)); +} + + +static void +cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe, + void *dsa) +{ + struct cell_context *cell = cell_context(pipe); + + draw_flush(cell->draw); + + cell->depth_stencil = (struct pipe_depth_stencil_alpha_state *) dsa; + cell->dirty |= CELL_NEW_DEPTH_STENCIL; +} + + +static void +cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *dsa) +{ + FREE(dsa); +} + + +static void +cell_set_clip_state(struct pipe_context *pipe, + const struct pipe_clip_state *clip) +{ + struct cell_context *cell = cell_context(pipe); + + /* pass the clip state to the draw module */ + draw_set_clip_state(cell->draw, clip); +} + + + +/* Called when driver state tracker notices changes to the viewport + * matrix: + */ +static void +cell_set_viewport_state( struct pipe_context *pipe, + const struct pipe_viewport_state *viewport ) +{ + struct cell_context *cell = cell_context(pipe); + + cell->viewport = *viewport; /* struct copy */ + cell->dirty |= CELL_NEW_VIEWPORT; + + /* pass the viewport info to the draw module */ + draw_set_viewport_state(cell->draw, viewport); + + /* Using tnl/ and vf/ modules is temporary while getting started. + * Full pipe will have vertex shader, vertex fetch of its own. + */ +} + + +static void +cell_set_scissor_state( struct pipe_context *pipe, + const struct pipe_scissor_state *scissor ) +{ + struct cell_context *cell = cell_context(pipe); + + memcpy( &cell->scissor, scissor, sizeof(*scissor) ); + cell->dirty |= CELL_NEW_SCISSOR; +} + + +static void +cell_set_polygon_stipple( struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple ) +{ + struct cell_context *cell = cell_context(pipe); + + memcpy( &cell->poly_stipple, stipple, sizeof(*stipple) ); + cell->dirty |= CELL_NEW_STIPPLE; +} + + + +static void * +cell_create_rasterizer_state(struct pipe_context *pipe, + const struct pipe_rasterizer_state *rasterizer) +{ + return mem_dup(rasterizer, sizeof(*rasterizer)); +} + + +static void +cell_bind_rasterizer_state(struct pipe_context *pipe, void *rast) +{ + struct pipe_rasterizer_state *rasterizer = + (struct pipe_rasterizer_state *) rast; + struct cell_context *cell = cell_context(pipe); + + /* pass-through to draw module */ + draw_set_rasterizer_state(cell->draw, rasterizer); + + cell->rasterizer = rasterizer; + + cell->dirty |= CELL_NEW_RASTERIZER; +} + + +static void +cell_delete_rasterizer_state(struct pipe_context *pipe, void *rasterizer) +{ + FREE(rasterizer); +} + + + +static void * +cell_create_sampler_state(struct pipe_context *pipe, + const struct pipe_sampler_state *sampler) +{ + return mem_dup(sampler, sizeof(*sampler)); +} + + +static void +cell_bind_sampler_states(struct pipe_context *pipe, + unsigned num, void **samplers) +{ + struct cell_context *cell = cell_context(pipe); + uint i, changed = 0x0; + + assert(num <= CELL_MAX_SAMPLERS); + + draw_flush(cell->draw); + + for (i = 0; i < CELL_MAX_SAMPLERS; i++) { + struct pipe_sampler_state *new_samp = i < num ? samplers[i] : NULL; + if (cell->sampler[i] != new_samp) { + cell->sampler[i] = new_samp; + changed |= (1 << i); + } + } + + if (changed) { + cell->dirty |= CELL_NEW_SAMPLER; + cell->dirty_samplers |= changed; + } +} + + +static void +cell_delete_sampler_state(struct pipe_context *pipe, + void *sampler) +{ + FREE( sampler ); +} + + + +static void +cell_set_sampler_textures(struct pipe_context *pipe, + unsigned num, struct pipe_texture **texture) +{ + struct cell_context *cell = cell_context(pipe); + uint i, changed = 0x0; + + assert(num <= CELL_MAX_SAMPLERS); + + for (i = 0; i < CELL_MAX_SAMPLERS; i++) { + struct pipe_texture *new_tex = i < num ? texture[i] : NULL; + if ((struct pipe_texture *) cell->texture[i] != new_tex) { + pipe_texture_reference((struct pipe_texture **) &cell->texture[i], + new_tex); + changed |= (1 << i); + } + } + + cell->num_textures = num; + + if (changed) { + cell->dirty |= CELL_NEW_TEXTURE; + cell->dirty_textures |= changed; + } +} + + + +static void +cell_set_framebuffer_state(struct pipe_context *pipe, + const struct pipe_framebuffer_state *fb) +{ + struct cell_context *cell = cell_context(pipe); + + if (1 /*memcmp(&cell->framebuffer, fb, sizeof(*fb))*/) { + struct pipe_surface *csurf = fb->cbufs[0]; + struct pipe_surface *zsurf = fb->zsbuf; + uint i; + uint flags = (PIPE_BUFFER_USAGE_GPU_WRITE | + PIPE_BUFFER_USAGE_GPU_READ); + + /* unmap old surfaces */ + for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) { + if (cell->framebuffer.cbufs[i] && cell->cbuf_map[i]) { + pipe_surface_unmap(cell->framebuffer.cbufs[i]); + cell->cbuf_map[i] = NULL; + } + } + + if (cell->framebuffer.zsbuf && cell->zsbuf_map) { + pipe_surface_unmap(cell->framebuffer.zsbuf); + cell->zsbuf_map = NULL; + } + + /* Finish any pending rendering to the current surface before + * installing a new surface! + */ + cell_flush_int(cell, CELL_FLUSH_WAIT); + + /* update my state + * (this is also where old surfaces will finally get freed) + */ + cell->framebuffer.width = fb->width; + cell->framebuffer.height = fb->height; + cell->framebuffer.nr_cbufs = fb->nr_cbufs; + for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) { + pipe_surface_reference(&cell->framebuffer.cbufs[i], fb->cbufs[i]); + } + pipe_surface_reference(&cell->framebuffer.zsbuf, fb->zsbuf); + + /* map new surfaces */ + if (csurf) + cell->cbuf_map[0] = pipe_surface_map(csurf, flags); + + if (zsurf) + cell->zsbuf_map = pipe_surface_map(zsurf, flags); + + cell->dirty |= CELL_NEW_FRAMEBUFFER; + } +} + + + +void +cell_init_state_functions(struct cell_context *cell) +{ + cell->pipe.create_blend_state = cell_create_blend_state; + cell->pipe.bind_blend_state = cell_bind_blend_state; + cell->pipe.delete_blend_state = cell_delete_blend_state; + + cell->pipe.create_sampler_state = cell_create_sampler_state; + cell->pipe.bind_sampler_states = cell_bind_sampler_states; + cell->pipe.delete_sampler_state = cell_delete_sampler_state; + + cell->pipe.set_sampler_textures = cell_set_sampler_textures; + + cell->pipe.create_depth_stencil_alpha_state = cell_create_depth_stencil_alpha_state; + cell->pipe.bind_depth_stencil_alpha_state = cell_bind_depth_stencil_alpha_state; + cell->pipe.delete_depth_stencil_alpha_state = cell_delete_depth_stencil_alpha_state; + + cell->pipe.create_rasterizer_state = cell_create_rasterizer_state; + cell->pipe.bind_rasterizer_state = cell_bind_rasterizer_state; + cell->pipe.delete_rasterizer_state = cell_delete_rasterizer_state; + + cell->pipe.set_blend_color = cell_set_blend_color; + cell->pipe.set_clip_state = cell_set_clip_state; + + cell->pipe.set_framebuffer_state = cell_set_framebuffer_state; + + cell->pipe.set_polygon_stipple = cell_set_polygon_stipple; + cell->pipe.set_scissor_state = cell_set_scissor_state; + cell->pipe.set_viewport_state = cell_set_viewport_state; +} diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.h b/src/gallium/drivers/cell/ppu/cell_pipe_state.h new file mode 100644 index 00000000000..1889bd52ff5 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.h @@ -0,0 +1,39 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_PIPE_STATE_H +#define CELL_PIPE_STATE_H + + +struct cell_context; + +extern void +cell_init_state_functions(struct cell_context *cell); + + +#endif /* CELL_PIPE_STATE_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c new file mode 100644 index 00000000000..79cb8df82fa --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_render.c @@ -0,0 +1,211 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \brief Last stage of 'draw' pipeline: send tris to SPUs. + * \author Brian Paul + */ + +#include "cell_context.h" +#include "cell_render.h" +#include "cell_spu.h" +#include "util/u_memory.h" +#include "draw/draw_private.h" + + +struct render_stage { + struct draw_stage stage; /**< This must be first (base class) */ + + struct cell_context *cell; +}; + + +static INLINE struct render_stage * +render_stage(struct draw_stage *stage) +{ + return (struct render_stage *) stage; +} + + +static void render_begin( struct draw_stage *stage ) +{ +#if 0 + struct render_stage *render = render_stage(stage); + struct cell_context *sp = render->cell; + const struct pipe_shader_state *fs = &render->cell->fs->shader; + render->quad.nr_attrs = render->cell->nr_frag_attrs; + + render->firstFpInput = fs->input_semantic_name[0]; + + sp->quad.first->begin(sp->quad.first); +#endif +} + + +static void render_end( struct draw_stage *stage ) +{ +} + + +static void reset_stipple_counter( struct draw_stage *stage ) +{ + struct render_stage *render = render_stage(stage); + /*render->cell->line_stipple_counter = 0;*/ +} + + +static void +render_point(struct draw_stage *stage, struct prim_header *prim) +{ +} + + +static void +render_line(struct draw_stage *stage, struct prim_header *prim) +{ +} + + +/** Write a vertex into the prim buffer */ +static void +save_vertex(struct cell_prim_buffer *buf, uint pos, + const struct vertex_header *vert) +{ + uint attr, j; + + for (attr = 0; attr < 2; attr++) { + for (j = 0; j < 4; j++) { + buf->vertex[pos][attr][j] = vert->data[attr][j]; + } + } + + /* update bounding box */ + if (vert->data[0][0] < buf->xmin) + buf->xmin = vert->data[0][0]; + if (vert->data[0][0] > buf->xmax) + buf->xmax = vert->data[0][0]; + if (vert->data[0][1] < buf->ymin) + buf->ymin = vert->data[0][1]; + if (vert->data[0][1] > buf->ymax) + buf->ymax = vert->data[0][1]; +} + + +static void +render_tri(struct draw_stage *stage, struct prim_header *prim) +{ + struct render_stage *rs = render_stage(stage); + struct cell_context *cell = rs->cell; + struct cell_prim_buffer *buf = &cell->prim_buffer; + uint i; + + if (buf->num_verts + 3 > CELL_MAX_VERTS) { + cell_flush_prim_buffer(cell); + } + + i = buf->num_verts; + assert(i+2 <= CELL_MAX_VERTS); + save_vertex(buf, i+0, prim->v[0]); + save_vertex(buf, i+1, prim->v[1]); + save_vertex(buf, i+2, prim->v[2]); + buf->num_verts += 3; +} + + +/** + * Send the a RENDER command to all SPUs to have them render the prims + * in the current prim_buffer. + */ +void +cell_flush_prim_buffer(struct cell_context *cell) +{ + uint i; + + if (cell->prim_buffer.num_verts == 0) + return; + + for (i = 0; i < cell->num_spus; i++) { + struct cell_command_render *render = &cell_global.command[i].render; + render->prim_type = PIPE_PRIM_TRIANGLES; + render->num_verts = cell->prim_buffer.num_verts; + render->front_winding = cell->rasterizer->front_winding; + render->vertex_size = cell->vertex_info->size * 4; + render->xmin = cell->prim_buffer.xmin; + render->ymin = cell->prim_buffer.ymin; + render->xmax = cell->prim_buffer.xmax; + render->ymax = cell->prim_buffer.ymax; + render->vertex_data = &cell->prim_buffer.vertex; + ASSERT_ALIGN16(render->vertex_data); + send_mbox_message(cell_global.spe_contexts[i], CELL_CMD_RENDER); + } + + cell->prim_buffer.num_verts = 0; + + cell->prim_buffer.xmin = 1e100; + cell->prim_buffer.ymin = 1e100; + cell->prim_buffer.xmax = -1e100; + cell->prim_buffer.ymax = -1e100; + + /* XXX temporary, need to double-buffer the prim buffer until we get + * a real command buffer/list system. + */ + cell_flush(&cell->pipe, 0x0); +} + + + +static void render_destroy( struct draw_stage *stage ) +{ + FREE( stage ); +} + + +/** + * Create a new draw/render stage. This will be plugged into the + * draw module as the last pipeline stage. + */ +struct draw_stage *cell_draw_render_stage( struct cell_context *cell ) +{ + struct render_stage *render = CALLOC_STRUCT(render_stage); + + render->cell = cell; + render->stage.draw = cell->draw; + render->stage.begin = render_begin; + render->stage.point = render_point; + render->stage.line = render_line; + render->stage.tri = render_tri; + render->stage.end = render_end; + render->stage.reset_stipple_counter = reset_stipple_counter; + render->stage.destroy = render_destroy; + + /* + render->quad.coef = render->coef; + render->quad.posCoef = &render->posCoef; + */ + + return &render->stage; +} diff --git a/src/gallium/drivers/cell/ppu/cell_render.h b/src/gallium/drivers/cell/ppu/cell_render.h new file mode 100644 index 00000000000..826dcbafeba --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_render.h @@ -0,0 +1,39 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef CELL_RENDER_H +#define CELL_RENDER_H + +struct cell_context; +struct draw_stage; + +extern void +cell_flush_prim_buffer(struct cell_context *cell); + +extern struct draw_stage *cell_draw_render_stage( struct cell_context *cell ); + +#endif /* CELL_RENDER_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c new file mode 100644 index 00000000000..512d85d3525 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_screen.c @@ -0,0 +1,176 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "util/u_memory.h" +#include "util/u_simple_screen.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_defines.h" +#include "pipe/p_screen.h" + +#include "cell/common.h" +#include "cell_screen.h" +#include "cell_texture.h" +#include "cell_winsys.h" + + +static const char * +cell_get_vendor(struct pipe_screen *screen) +{ + return "Tungsten Graphics, Inc."; +} + + +static const char * +cell_get_name(struct pipe_screen *screen) +{ + return "Cell"; +} + + +static int +cell_get_param(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: + return CELL_MAX_SAMPLERS; + case PIPE_CAP_NPOT_TEXTURES: + return 1; + case PIPE_CAP_TWO_SIDED_STENCIL: + return 1; + case PIPE_CAP_GLSL: + return 1; + case PIPE_CAP_S3TC: + return 0; + case PIPE_CAP_ANISOTROPIC_FILTER: + return 0; + case PIPE_CAP_POINT_SPRITE: + return 1; + case PIPE_CAP_MAX_RENDER_TARGETS: + return 1; + case PIPE_CAP_OCCLUSION_QUERY: + return 1; + case PIPE_CAP_TEXTURE_SHADOW_MAP: + return 10; + case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: + return CELL_MAX_TEXTURE_LEVELS; + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + return 8; /* max 128x128x128 */ + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return CELL_MAX_TEXTURE_LEVELS; + case PIPE_CAP_TEXTURE_MIRROR_REPEAT: + return 1; /* XXX not really true */ + case PIPE_CAP_TEXTURE_MIRROR_CLAMP: + return 0; /* XXX to do */ + default: + return 0; + } +} + + +static float +cell_get_paramf(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_LINE_WIDTH: + /* fall-through */ + case PIPE_CAP_MAX_LINE_WIDTH_AA: + return 255.0; /* arbitrary */ + + case PIPE_CAP_MAX_POINT_WIDTH: + /* fall-through */ + case PIPE_CAP_MAX_POINT_WIDTH_AA: + return 255.0; /* arbitrary */ + + case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: + return 0.0; + + case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: + return 16.0; /* arbitrary */ + + default: + return 0; + } +} + + +static boolean +cell_is_format_supported( struct pipe_screen *screen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned tex_usage, + unsigned geom_flags ) +{ + /* cell supports most formats, XXX for now anyway */ + if (format == PIPE_FORMAT_DXT5_RGBA || + format == PIPE_FORMAT_R8G8B8A8_SRGB) + return FALSE; + else + return TRUE; +} + + +static void +cell_destroy_screen( struct pipe_screen *screen ) +{ + struct pipe_winsys *winsys = screen->winsys; + + if(winsys->destroy) + winsys->destroy(winsys); + + FREE(screen); +} + + +/** + * Create a new pipe_screen object + * Note: we're not presently subclassing pipe_screen (no cell_screen) but + * that would be the place to put SPU thread/context info... + */ +struct pipe_screen * +cell_create_screen(struct pipe_winsys *winsys) +{ + struct pipe_screen *screen = CALLOC_STRUCT(pipe_screen); + + if (!screen) + return NULL; + + screen->winsys = winsys; + + screen->destroy = cell_destroy_screen; + + screen->get_name = cell_get_name; + screen->get_vendor = cell_get_vendor; + screen->get_param = cell_get_param; + screen->get_paramf = cell_get_paramf; + screen->is_format_supported = cell_is_format_supported; + + cell_init_screen_texture_funcs(screen); + u_simple_screen_init(screen); + + return screen; +} diff --git a/src/gallium/drivers/cell/ppu/cell_screen.h b/src/gallium/drivers/cell/ppu/cell_screen.h new file mode 100644 index 00000000000..c7e15889d66 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_screen.h @@ -0,0 +1,41 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_SCREEN_H +#define CELL_SCREEN_H + + +struct pipe_screen; +struct pipe_winsys; + + +extern struct pipe_screen * +cell_create_screen(struct pipe_winsys *winsys); + + +#endif /* CELL_SCREEN_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c new file mode 100644 index 00000000000..28e5e6d706d --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_spu.c @@ -0,0 +1,219 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * Utility/wrappers for communicating with the SPUs. + */ + + +#include <pthread.h> + +#include "cell_spu.h" +#include "pipe/p_format.h" +#include "pipe/p_state.h" +#include "util/u_memory.h" +#include "cell/common.h" + + +/* +helpful headers: +/opt/ibm/cell-sdk/prototype/src/include/ppu/cbe_mfc.h +*/ + + +/** + * Cell/SPU info that's not per-context. + */ +struct cell_global_info cell_global; + + +/** + * Scan /proc/cpuinfo to determine the timebase for the system. + * This is used by the SPUs to convert 'decrementer' ticks to seconds. + * There may be a better way to get this value... + */ +static unsigned +get_timebase(void) +{ + FILE *f = fopen("/proc/cpuinfo", "r"); + unsigned timebase; + + assert(f); + while (!feof(f)) { + char line[80]; + fgets(line, sizeof(line), f); + if (strncmp(line, "timebase", 8) == 0) { + char *colon = strchr(line, ':'); + if (colon) { + timebase = atoi(colon + 2); + break; + } + } + } + fclose(f); + + return timebase; +} + + +/** + * Write a 1-word message to the given SPE mailbox. + */ +void +send_mbox_message(spe_context_ptr_t ctx, unsigned int msg) +{ + spe_in_mbox_write(ctx, &msg, 1, SPE_MBOX_ALL_BLOCKING); +} + + +/** + * Wait for a 1-word message to arrive in given mailbox. + */ +uint +wait_mbox_message(spe_context_ptr_t ctx) +{ + do { + unsigned data; + int count = spe_out_mbox_read(ctx, &data, 1); + + if (count == 1) { + return data; + } + + if (count < 0) { + /* error */ ; + } + } while (1); +} + + +/** + * Called by pthread_create() to spawn an SPU thread. + */ +static void * +cell_thread_function(void *arg) +{ + struct cell_init_info *init = (struct cell_init_info *) arg; + unsigned entry = SPE_DEFAULT_ENTRY; + + ASSERT_ALIGN16(init); + + if (spe_context_run(cell_global.spe_contexts[init->id], &entry, 0, + init, NULL, NULL) < 0) { + fprintf(stderr, "spe_context_run() failed\n"); + exit(1); + } + + pthread_exit(NULL); +} + + +/** + * Create the SPU threads. This is done once during driver initialization. + * This involves setting the the "init" message which is sent to each SPU. + * The init message specifies an SPU id, total number of SPUs, location + * and number of batch buffers, etc. + */ +void +cell_start_spus(struct cell_context *cell) +{ + static boolean one_time_init = FALSE; + uint i, j; + uint timebase = get_timebase(); + + if (one_time_init) { + fprintf(stderr, "PPU: Multiple rendering contexts not yet supported " + "on Cell.\n"); + abort(); + } + + one_time_init = TRUE; + + assert(cell->num_spus <= CELL_MAX_SPUS); + + ASSERT_ALIGN16(&cell_global.inits[0]); + ASSERT_ALIGN16(&cell_global.inits[1]); + + /* + * Initialize the global 'inits' structure for each SPU. + * A pointer to the init struct will be passed to each SPU. + * The SPUs will then each grab their init info with mfc_get(). + */ + for (i = 0; i < cell->num_spus; i++) { + cell_global.inits[i].id = i; + cell_global.inits[i].num_spus = cell->num_spus; + cell_global.inits[i].debug_flags = cell->debug_flags; + cell_global.inits[i].inv_timebase = 1000.0f / timebase; + + for (j = 0; j < CELL_NUM_BUFFERS; j++) { + cell_global.inits[i].buffers[j] = cell->buffer[j]; + } + cell_global.inits[i].buffer_status = &cell->buffer_status[0][0][0]; + + cell_global.inits[i].spu_functions = &cell->spu_functions; + + cell_global.spe_contexts[i] = spe_context_create(0, NULL); + if (!cell_global.spe_contexts[i]) { + fprintf(stderr, "spe_context_create() failed\n"); + exit(1); + } + + if (spe_program_load(cell_global.spe_contexts[i], &g3d_spu)) { + fprintf(stderr, "spe_program_load() failed\n"); + exit(1); + } + + pthread_create(&cell_global.spe_threads[i], /* returned thread handle */ + NULL, /* pthread attribs */ + &cell_thread_function, /* start routine */ + &cell_global.inits[i]); /* thread argument */ + } +} + + +/** + * Tell all the SPUs to stop/exit. + * This is done when the driver's exiting / cleaning up. + */ +void +cell_spu_exit(struct cell_context *cell) +{ + uint i; + + for (i = 0; i < cell->num_spus; i++) { + send_mbox_message(cell_global.spe_contexts[i], CELL_CMD_EXIT); + } + + /* wait for threads to exit */ + for (i = 0; i < cell->num_spus; i++) { + void *value; + pthread_join(cell_global.spe_threads[i], &value); + cell_global.spe_threads[i] = 0; + cell_global.spe_contexts[i] = 0; + } +} diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h new file mode 100644 index 00000000000..c93958a9ed5 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_spu.h @@ -0,0 +1,79 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef CELL_SPU +#define CELL_SPU + + +#include <libspe2.h> +#include <pthread.h> +#include "cell/common.h" + +#include "cell_context.h" + + +/** + * Global vars, for now anyway. + */ +struct cell_global_info +{ + /** + * SPU/SPE handles, etc + */ + spe_context_ptr_t spe_contexts[CELL_MAX_SPUS]; + pthread_t spe_threads[CELL_MAX_SPUS]; + + /** + * Data sent to SPUs at start-up + */ + struct cell_init_info inits[CELL_MAX_SPUS]; +}; + + +extern struct cell_global_info cell_global; + + +/** This is the handle for the actual SPE code */ +extern spe_program_handle_t g3d_spu; + + +extern void +send_mbox_message(spe_context_ptr_t ctx, unsigned int msg); + +extern uint +wait_mbox_message(spe_context_ptr_t ctx); + + +extern void +cell_start_spus(struct cell_context *cell); + + +extern void +cell_spu_exit(struct cell_context *cell); + + +#endif /* CELL_SPU */ diff --git a/src/gallium/drivers/cell/ppu/cell_state.h b/src/gallium/drivers/cell/ppu/cell_state.h new file mode 100644 index 00000000000..b193170f9ce --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state.h @@ -0,0 +1,65 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_STATE_H +#define CELL_STATE_H + + +#define CELL_NEW_VIEWPORT 0x1 +#define CELL_NEW_RASTERIZER 0x2 +#define CELL_NEW_FS 0x4 +#define CELL_NEW_BLEND 0x8 +#define CELL_NEW_CLIP 0x10 +#define CELL_NEW_SCISSOR 0x20 +#define CELL_NEW_STIPPLE 0x40 +#define CELL_NEW_FRAMEBUFFER 0x80 +#define CELL_NEW_ALPHA_TEST 0x100 +#define CELL_NEW_DEPTH_STENCIL 0x200 +#define CELL_NEW_SAMPLER 0x400 +#define CELL_NEW_TEXTURE 0x800 +#define CELL_NEW_VERTEX 0x1000 +#define CELL_NEW_VS 0x2000 +#define CELL_NEW_VS_CONSTANTS 0x4000 +#define CELL_NEW_FS_CONSTANTS 0x8000 +#define CELL_NEW_VERTEX_INFO 0x10000 + + +extern void +cell_update_derived( struct cell_context *softpipe ); + + +extern void +cell_init_shader_functions(struct cell_context *cell); + + +extern void +cell_init_vertex_functions(struct cell_context *cell); + + +#endif /* CELL_STATE_H */ + diff --git a/src/gallium/drivers/cell/ppu/cell_state_derived.c b/src/gallium/drivers/cell/ppu/cell_state_derived.c new file mode 100644 index 00000000000..efc4f78364b --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state_derived.c @@ -0,0 +1,170 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_memory.h" +#include "pipe/p_shader_tokens.h" +#include "draw/draw_context.h" +#include "draw/draw_vertex.h" +#include "cell_context.h" +#include "cell_batch.h" +#include "cell_state.h" +#include "cell_state_emit.h" + + +/** + * Determine how to map vertex program outputs to fragment program inputs. + * Basically, this will be used when computing the triangle interpolation + * coefficients from the post-transform vertex attributes. + */ +static void +calculate_vertex_layout( struct cell_context *cell ) +{ + const struct cell_fragment_shader_state *fs = cell->fs; + const enum interp_mode colorInterp + = cell->rasterizer->flatshade ? INTERP_CONSTANT : INTERP_LINEAR; + struct vertex_info *vinfo = &cell->vertex_info; + uint i; + int src; + +#if 0 + if (cell->vbuf) { + /* if using the post-transform vertex buffer, tell draw_vbuf to + * simply emit the whole post-xform vertex as-is: + */ + struct vertex_info *vinfo_vbuf = &cell->vertex_info_vbuf; + vinfo_vbuf->num_attribs = 0; + draw_emit_vertex_attr(vinfo_vbuf, EMIT_ALL, INTERP_NONE, 0); + vinfo_vbuf->size = 4 * vs->num_outputs + sizeof(struct vertex_header)/4; + } +#endif + + /* reset vinfo */ + vinfo->num_attribs = 0; + + /* we always want to emit vertex pos */ + src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_POSITION, 0); + assert(src >= 0); + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_POS, src); + + + /* + * Loop over fragment shader inputs, searching for the matching output + * from the vertex shader. + */ + for (i = 0; i < fs->info.num_inputs; i++) { + switch (fs->info.input_semantic_name[i]) { + case TGSI_SEMANTIC_POSITION: + /* already done above */ + break; + + case TGSI_SEMANTIC_COLOR: + src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_COLOR, + fs->info.input_semantic_index[i]); + assert(src >= 0); + draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src); + break; + + case TGSI_SEMANTIC_FOG: + src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_FOG, 0); +#if 1 + if (src < 0) /* XXX temp hack, try demos/fogcoord.c with this */ + src = 0; +#endif + assert(src >= 0); + draw_emit_vertex_attr(vinfo, EMIT_1F, INTERP_PERSPECTIVE, src); + break; + + case TGSI_SEMANTIC_GENERIC: + /* this includes texcoords and varying vars */ + src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_GENERIC, + fs->info.input_semantic_index[i]); + assert(src >= 0); + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src); + break; + + default: + assert(0); + } + } + + draw_compute_vertex_size(vinfo); + + /* XXX only signal this if format really changes */ + cell->dirty |= CELL_NEW_VERTEX_INFO; +} + + +#if 0 +/** + * Recompute cliprect from scissor bounds, scissor enable and surface size. + */ +static void +compute_cliprect(struct cell_context *sp) +{ + uint surfWidth = sp->framebuffer.width; + uint surfHeight = sp->framebuffer.height; + + if (sp->rasterizer->scissor) { + /* clip to scissor rect */ + sp->cliprect.minx = MAX2(sp->scissor.minx, 0); + sp->cliprect.miny = MAX2(sp->scissor.miny, 0); + sp->cliprect.maxx = MIN2(sp->scissor.maxx, surfWidth); + sp->cliprect.maxy = MIN2(sp->scissor.maxy, surfHeight); + } + else { + /* clip to surface bounds */ + sp->cliprect.minx = 0; + sp->cliprect.miny = 0; + sp->cliprect.maxx = surfWidth; + sp->cliprect.maxy = surfHeight; + } +} +#endif + + + +/** + * Update derived state, send current state to SPUs prior to rendering. + */ +void cell_update_derived( struct cell_context *cell ) +{ + if (cell->dirty & (CELL_NEW_RASTERIZER | + CELL_NEW_FS | + CELL_NEW_VS)) + calculate_vertex_layout( cell ); + +#if 0 + if (cell->dirty & (CELL_NEW_SCISSOR | + CELL_NEW_DEPTH_STENCIL_ALPHA | + CELL_NEW_FRAMEBUFFER)) + compute_cliprect(cell); +#endif + + cell_emit_state(cell); + + cell->dirty = 0; +} diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c new file mode 100644 index 00000000000..ff529fe22cb --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -0,0 +1,340 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_inlines.h" +#include "util/u_memory.h" +#include "cell_context.h" +#include "cell_gen_fragment.h" +#include "cell_state.h" +#include "cell_state_emit.h" +#include "cell_batch.h" +#include "cell_texture.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" + + +/** + * Find/create a cell_command_fragment_ops object corresponding to the + * current blend/stencil/z/colormask/etc. state. + */ +static struct cell_command_fragment_ops * +lookup_fragment_ops(struct cell_context *cell) +{ + struct cell_fragment_ops_key key; + struct cell_command_fragment_ops *ops; + + /* + * Build key + */ + memset(&key, 0, sizeof(key)); + key.blend = *cell->blend; + key.blend_color = cell->blend_color; + key.dsa = *cell->depth_stencil; + + if (cell->framebuffer.cbufs[0]) + key.color_format = cell->framebuffer.cbufs[0]->format; + else + key.color_format = PIPE_FORMAT_NONE; + + if (cell->framebuffer.zsbuf) + key.zs_format = cell->framebuffer.zsbuf->format; + else + key.zs_format = PIPE_FORMAT_NONE; + + /* + * Look up key in cache. + */ + ops = (struct cell_command_fragment_ops *) + util_keymap_lookup(cell->fragment_ops_cache, &key); + + /* + * If not found, create/save new fragment ops command. + */ + if (!ops) { + struct spe_function spe_code_front, spe_code_back; + unsigned int facing_dependent, total_code_size; + + if (0) + debug_printf("**** Create New Fragment Ops\n"); + + /* Prepare the buffer that will hold the generated code. The + * "0" passed in for the size means that the SPE code will + * use a default size. + */ + spe_init_func(&spe_code_front, 0); + spe_init_func(&spe_code_back, 0); + + /* Generate new code. Always generate new code for both front-facing + * and back-facing fragments, even if it's the same code in both + * cases. + */ + cell_gen_fragment_function(cell, CELL_FACING_FRONT, &spe_code_front); + cell_gen_fragment_function(cell, CELL_FACING_BACK, &spe_code_back); + + /* Make sure the code is a multiple of 8 bytes long; this is + * required to ensure that the dual pipe instruction alignment + * is correct. It's also important for the SPU unpacking, + * which assumes 8-byte boundaries. + */ + unsigned int front_code_size = spe_code_size(&spe_code_front); + while (front_code_size % 8 != 0) { + spe_lnop(&spe_code_front); + front_code_size = spe_code_size(&spe_code_front); + } + unsigned int back_code_size = spe_code_size(&spe_code_back); + while (back_code_size % 8 != 0) { + spe_lnop(&spe_code_back); + back_code_size = spe_code_size(&spe_code_back); + } + + /* Determine whether the code we generated is facing-dependent, by + * determining whether the generated code is different for the front- + * and back-facing fragments. + */ + if (front_code_size == back_code_size && memcmp(spe_code_front.store, spe_code_back.store, front_code_size) == 0) { + /* Code is identical; only need one copy. */ + facing_dependent = 0; + total_code_size = front_code_size; + } + else { + /* Code is different for front-facing and back-facing fragments. + * Need to send both copies. + */ + facing_dependent = 1; + total_code_size = front_code_size + back_code_size; + } + + /* alloc new fragment ops command. Note that this structure + * has variant length based on the total code size required. + */ + ops = CALLOC_VARIANT_LENGTH_STRUCT(cell_command_fragment_ops, total_code_size); + /* populate the new cell_command_fragment_ops object */ + ops->opcode[0] = CELL_CMD_STATE_FRAGMENT_OPS; + ops->total_code_size = total_code_size; + ops->front_code_index = 0; + memcpy(ops->code, spe_code_front.store, front_code_size); + if (facing_dependent) { + /* We have separate front- and back-facing code. Append the + * back-facing code to the buffer. Be careful because the code + * size is in bytes, but the buffer is of unsigned elements. + */ + ops->back_code_index = front_code_size / sizeof(spe_code_front.store[0]); + memcpy(ops->code + ops->back_code_index, spe_code_back.store, back_code_size); + } + else { + /* Use the same code for front- and back-facing fragments */ + ops->back_code_index = ops->front_code_index; + } + + /* Set the fields for the fallback case. Note that these fields + * (and the whole fallback case) will eventually go away. + */ + ops->dsa = *cell->depth_stencil; + ops->blend = *cell->blend; + ops->blend_color = cell->blend_color; + + /* insert cell_command_fragment_ops object into keymap/cache */ + util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL); + + /* release rtasm buffer */ + spe_release_func(&spe_code_front); + spe_release_func(&spe_code_back); + } + else { + if (0) + debug_printf("**** Re-use Fragment Ops\n"); + } + + return ops; +} + + + +static void +emit_state_cmd(struct cell_context *cell, uint cmd, + const void *state, uint state_size) +{ + uint32_t *dst = (uint32_t *) + cell_batch_alloc16(cell, ROUNDUP16(sizeof(opcode_t) + state_size)); + *dst = cmd; + memcpy(dst + 4, state, state_size); +} + + +/** + * For state marked as 'dirty', construct a state-update command block + * and insert it into the current batch buffer. + */ +void +cell_emit_state(struct cell_context *cell) +{ + if (cell->dirty & CELL_NEW_FRAMEBUFFER) { + struct pipe_surface *cbuf = cell->framebuffer.cbufs[0]; + struct pipe_surface *zbuf = cell->framebuffer.zsbuf; + STATIC_ASSERT(sizeof(struct cell_command_framebuffer) % 16 == 0); + struct cell_command_framebuffer *fb + = cell_batch_alloc16(cell, sizeof(*fb)); + fb->opcode[0] = CELL_CMD_STATE_FRAMEBUFFER; + fb->color_start = cell->cbuf_map[0]; + fb->color_format = cbuf->format; + fb->depth_start = cell->zsbuf_map; + fb->depth_format = zbuf ? zbuf->format : PIPE_FORMAT_NONE; + fb->width = cell->framebuffer.width; + fb->height = cell->framebuffer.height; +#if 0 + printf("EMIT color format %s\n", pf_name(fb->color_format)); + printf("EMIT depth format %s\n", pf_name(fb->depth_format)); +#endif + } + + if (cell->dirty & (CELL_NEW_RASTERIZER)) { + STATIC_ASSERT(sizeof(struct cell_command_rasterizer) % 16 == 0); + struct cell_command_rasterizer *rast = + cell_batch_alloc16(cell, sizeof(*rast)); + rast->opcode[0] = CELL_CMD_STATE_RASTERIZER; + rast->rasterizer = *cell->rasterizer; + } + + if (cell->dirty & (CELL_NEW_FS)) { + /* Send new fragment program to SPUs */ + STATIC_ASSERT(sizeof(struct cell_command_fragment_program) % 16 == 0); + struct cell_command_fragment_program *fp + = cell_batch_alloc16(cell, sizeof(*fp)); + fp->opcode[0] = CELL_CMD_STATE_FRAGMENT_PROGRAM; + fp->num_inst = cell->fs->code.num_inst; + memcpy(&fp->code, cell->fs->code.store, + SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE); + if (0) { + int i; + printf("PPU Emit CELL_CMD_STATE_FRAGMENT_PROGRAM:\n"); + for (i = 0; i < fp->num_inst; i++) { + printf(" %3d: 0x%08x\n", i, fp->code[i]); + } + } + } + + if (cell->dirty & (CELL_NEW_FS_CONSTANTS)) { + const uint shader = PIPE_SHADER_FRAGMENT; + const uint num_const = cell->constants[shader].buffer->size / sizeof(float); + uint i, j; + float *buf = cell_batch_alloc16(cell, ROUNDUP16(32 + num_const * sizeof(float))); + uint32_t *ibuf = (uint32_t *) buf; + const float *constants = pipe_buffer_map(cell->pipe.screen, + cell->constants[shader].buffer, + PIPE_BUFFER_USAGE_CPU_READ); + ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS; + ibuf[4] = num_const; + j = 8; + for (i = 0; i < num_const; i++) { + buf[j++] = constants[i]; + } + pipe_buffer_unmap(cell->pipe.screen, cell->constants[shader].buffer); + } + + if (cell->dirty & (CELL_NEW_FRAMEBUFFER | + CELL_NEW_DEPTH_STENCIL | + CELL_NEW_BLEND)) { + struct cell_command_fragment_ops *fops, *fops_cmd; + /* Note that cell_command_fragment_ops is a variant-sized record */ + fops = lookup_fragment_ops(cell); + fops_cmd = cell_batch_alloc16(cell, ROUNDUP16(sizeof(*fops_cmd) + fops->total_code_size)); + memcpy(fops_cmd, fops, sizeof(*fops) + fops->total_code_size); + } + + if (cell->dirty & CELL_NEW_SAMPLER) { + uint i; + for (i = 0; i < CELL_MAX_SAMPLERS; i++) { + if (cell->dirty_samplers & (1 << i)) { + if (cell->sampler[i]) { + STATIC_ASSERT(sizeof(struct cell_command_sampler) % 16 == 0); + struct cell_command_sampler *sampler + = cell_batch_alloc16(cell, sizeof(*sampler)); + sampler->opcode[0] = CELL_CMD_STATE_SAMPLER; + sampler->unit = i; + sampler->state = *cell->sampler[i]; + } + } + } + cell->dirty_samplers = 0x0; + } + + if (cell->dirty & CELL_NEW_TEXTURE) { + uint i; + for (i = 0;i < CELL_MAX_SAMPLERS; i++) { + if (cell->dirty_textures & (1 << i)) { + STATIC_ASSERT(sizeof(struct cell_command_texture) % 16 == 0); + struct cell_command_texture *texture + = (struct cell_command_texture *)cell_batch_alloc16(cell, sizeof(*texture)); + texture->opcode[0] = CELL_CMD_STATE_TEXTURE; + texture->unit = i; + if (cell->texture[i]) { + uint level; + for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) { + texture->start[level] = cell->texture[i]->tiled_mapped[level]; + texture->width[level] = cell->texture[i]->base.width[level]; + texture->height[level] = cell->texture[i]->base.height[level]; + texture->depth[level] = cell->texture[i]->base.depth[level]; + } + texture->target = cell->texture[i]->base.target; + } + else { + uint level; + for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) { + texture->start[level] = NULL; + texture->width[level] = 0; + texture->height[level] = 0; + texture->depth[level] = 0; + } + texture->target = 0; + } + } + } + cell->dirty_textures = 0x0; + } + + if (cell->dirty & CELL_NEW_VERTEX_INFO) { + emit_state_cmd(cell, CELL_CMD_STATE_VERTEX_INFO, + &cell->vertex_info, sizeof(struct vertex_info)); + } + +#if 0 + if (cell->dirty & CELL_NEW_VS) { + const struct draw_context *const draw = cell->draw; + struct cell_shader_info info; + + info.num_outputs = draw_num_vs_outputs(draw); + info.declarations = (uintptr_t) draw->vs.machine.Declarations; + info.num_declarations = draw->vs.machine.NumDeclarations; + info.instructions = (uintptr_t) draw->vs.machine.Instructions; + info.num_instructions = draw->vs.machine.NumInstructions; + info.immediates = (uintptr_t) draw->vs.machine.Imms; + info.num_immediates = draw->vs.machine.ImmLimit / 4; + + emit_state_cmd(cell, CELL_CMD_STATE_BIND_VS, &info, sizeof(info)); + } +#endif +} diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.h b/src/gallium/drivers/cell/ppu/cell_state_emit.h new file mode 100644 index 00000000000..59f8affe8d3 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.h @@ -0,0 +1,36 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef CELL_STATE_EMIT_H +#define CELL_STATE_EMIT_H + + +extern void +cell_emit_state(struct cell_context *cell); + + +#endif /* CELL_STATE_EMIT_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c new file mode 100644 index 00000000000..d97c22b2efe --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c @@ -0,0 +1,1430 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * \file + * Generate code to perform all per-fragment operations. + * + * Code generated by these functions perform both alpha, depth, and stencil + * testing as well as alpha blending. + * + * \note + * Occlusion query is not supported, but this is the right place to add that + * support. + * + * \author Ian Romanick <[email protected]> + */ + +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "cell_context.h" + +#include "rtasm/rtasm_ppc_spe.h" + + +/** + * Generate code to perform alpha testing. + * + * The code generated by this function uses the register specificed by + * \c mask as both an input and an output. + * + * \param dsa Current alpha-test state + * \param f Function to which code should be appended + * \param mask Index of register containing active fragment mask + * \param alphas Index of register containing per-fragment alpha values + * + * \note Emits a maximum of 6 instructions. + */ +static void +emit_alpha_test(struct pipe_depth_stencil_alpha_state *dsa, + struct spe_function *f, int mask, int alphas) +{ + /* If the alpha function is either NEVER or ALWAYS, there is no need to + * load the reference value into a register. ALWAYS is a fairly common + * case, and this optimization saves 2 instructions. + */ + if (dsa->alpha.enabled + && (dsa->alpha.func != PIPE_FUNC_NEVER) + && (dsa->alpha.func != PIPE_FUNC_ALWAYS)) { + int ref = spe_allocate_available_register(f); + int tmp_a = spe_allocate_available_register(f); + int tmp_b = spe_allocate_available_register(f); + union { + float f; + unsigned u; + } ref_val; + boolean complement = FALSE; + + ref_val.f = dsa->alpha.ref; + + spe_il(f, ref, ref_val.u & 0x0000ffff); + spe_ilh(f, ref, ref_val.u >> 16); + + switch (dsa->alpha.func) { + case PIPE_FUNC_NOTEQUAL: + complement = TRUE; + /* FALLTHROUGH */ + + case PIPE_FUNC_EQUAL: + spe_fceq(f, tmp_a, ref, alphas); + break; + + case PIPE_FUNC_LEQUAL: + complement = TRUE; + /* FALLTHROUGH */ + + case PIPE_FUNC_GREATER: + spe_fcgt(f, tmp_a, ref, alphas); + break; + + case PIPE_FUNC_LESS: + complement = TRUE; + /* FALLTHROUGH */ + + case PIPE_FUNC_GEQUAL: + spe_fcgt(f, tmp_a, ref, alphas); + spe_fceq(f, tmp_b, ref, alphas); + spe_or(f, tmp_a, tmp_b, tmp_a); + break; + + case PIPE_FUNC_ALWAYS: + case PIPE_FUNC_NEVER: + default: + assert(0); + break; + } + + if (complement) { + spe_andc(f, mask, mask, tmp_a); + } else { + spe_and(f, mask, mask, tmp_a); + } + + spe_release_register(f, ref); + spe_release_register(f, tmp_a); + spe_release_register(f, tmp_b); + } else if (dsa->alpha.enabled && (dsa->alpha.func == PIPE_FUNC_NEVER)) { + spe_il(f, mask, 0); + } +} + + +/** + * Generate code to perform Z testing. Four Z values are tested at once. + * \param dsa Current depth-test state + * \param f Function to which code should be appended + * \param mask Index of register to contain depth-pass mask + * \param stored Index of register containing values from depth buffer + * \param calculated Index of register containing per-fragment depth values + * + * \return + * If the calculated depth comparison mask is the actual mask, \c FALSE is + * returned. If the calculated depth comparison mask is the compliment of + * the actual mask, \c TRUE is returned. + * + * \note Emits a maximum of 3 instructions. + */ +static boolean +emit_depth_test(struct pipe_depth_stencil_alpha_state *dsa, + struct spe_function *f, int mask, int stored, int calculated) +{ + unsigned func = (dsa->depth.enabled) + ? dsa->depth.func : PIPE_FUNC_ALWAYS; + int tmp = spe_allocate_available_register(f); + boolean compliment = FALSE; + + switch (func) { + case PIPE_FUNC_NEVER: + spe_il(f, mask, 0); + break; + + case PIPE_FUNC_NOTEQUAL: + compliment = TRUE; + /* FALLTHROUGH */ + case PIPE_FUNC_EQUAL: + spe_ceq(f, mask, calculated, stored); + break; + + case PIPE_FUNC_LEQUAL: + compliment = TRUE; + /* FALLTHROUGH */ + case PIPE_FUNC_GREATER: + spe_clgt(f, mask, calculated, stored); + break; + + case PIPE_FUNC_LESS: + compliment = TRUE; + /* FALLTHROUGH */ + case PIPE_FUNC_GEQUAL: + spe_clgt(f, mask, calculated, stored); + spe_ceq(f, tmp, calculated, stored); + spe_or(f, mask, mask, tmp); + break; + + case PIPE_FUNC_ALWAYS: + spe_il(f, mask, ~0); + break; + + default: + assert(0); + break; + } + + spe_release_register(f, tmp); + return compliment; +} + + +/** + * Generate code to apply the stencil operation (after testing). + * \note Emits a maximum of 5 instructions. + * + * \warning + * Since \c out and \c in might be the same register, this routine cannot + * generate code that uses \c out as a temporary. + */ +static void +emit_stencil_op(struct spe_function *f, + int out, int in, int mask, unsigned op, unsigned ref) +{ + const int clamp = spe_allocate_available_register(f); + const int clamp_mask = spe_allocate_available_register(f); + const int result = spe_allocate_available_register(f); + + switch(op) { + case PIPE_STENCIL_OP_KEEP: + assert(0); + case PIPE_STENCIL_OP_ZERO: + spe_il(f, result, 0); + break; + case PIPE_STENCIL_OP_REPLACE: + spe_il(f, result, ref); + break; + case PIPE_STENCIL_OP_INCR: + /* clamp = [0xff, 0xff, 0xff, 0xff] */ + spe_il(f, clamp, 0x0ff); + /* result[i] = in[i] + 1 */ + spe_ai(f, result, in, 1); + /* clamp_mask[i] = (result[i] > 0xff) */ + spe_clgti(f, clamp_mask, result, 0x0ff); + /* result[i] = clamp_mask[i] ? clamp[i] : result[i] */ + spe_selb(f, result, result, clamp, clamp_mask); + break; + case PIPE_STENCIL_OP_DECR: + spe_il(f, clamp, 0); + spe_ai(f, result, in, -1); + + /* If "(s-1) < 0" in signed arithemtic, then "(s-1) > MAX" in unsigned + * arithmetic. + */ + spe_clgti(f, clamp_mask, result, 0x0ff); + spe_selb(f, result, result, clamp, clamp_mask); + break; + case PIPE_STENCIL_OP_INCR_WRAP: + spe_ai(f, result, in, 1); + break; + case PIPE_STENCIL_OP_DECR_WRAP: + spe_ai(f, result, in, -1); + break; + case PIPE_STENCIL_OP_INVERT: + spe_nor(f, result, in, in); + break; + default: + assert(0); + } + + spe_selb(f, out, in, result, mask); + + spe_release_register(f, result); + spe_release_register(f, clamp_mask); + spe_release_register(f, clamp); +} + + +/** + * Generate code to do stencil test. Four pixels are tested at once. + * \param dsa Depth / stencil test state + * \param face 0 for front face, 1 for back face + * \param f Function to append instructions to + * \param mask Register containing mask of fragments passing the + * alpha test + * \param depth_mask Register containing mask of fragments passing the + * depth test + * \param depth_compliment Is \c depth_mask the compliment of the actual mask? + * \param stencil Register containing values from stencil buffer + * \param depth_pass Register to store mask of fragments passing stencil test + * and depth test + * + * \note + * Emits a maximum of 10 + (3 * 5) = 25 instructions. + */ +static int +emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa, + unsigned face, + struct spe_function *f, + int mask, + int depth_mask, + boolean depth_complement, + int stencil, + int depth_pass) +{ + int stencil_fail = spe_allocate_available_register(f); + int depth_fail = spe_allocate_available_register(f); + int stencil_mask = spe_allocate_available_register(f); + int stencil_pass = spe_allocate_available_register(f); + int face_stencil = spe_allocate_available_register(f); + int stencil_src = stencil; + const unsigned ref = (dsa->stencil[face].ref_value + & dsa->stencil[face].valuemask); + boolean complement = FALSE; + int stored; + int tmp = spe_allocate_available_register(f); + + + if ((dsa->stencil[face].func != PIPE_FUNC_NEVER) + && (dsa->stencil[face].func != PIPE_FUNC_ALWAYS) + && (dsa->stencil[face].valuemask != 0x0ff)) { + stored = spe_allocate_available_register(f); + spe_andi(f, stored, stencil, dsa->stencil[face].valuemask); + } else { + stored = stencil; + } + + + switch (dsa->stencil[face].func) { + case PIPE_FUNC_NEVER: + spe_il(f, stencil_mask, 0); /* stencil_mask[0..3] = [0,0,0,0] */ + break; + + case PIPE_FUNC_NOTEQUAL: + complement = TRUE; + /* FALLTHROUGH */ + case PIPE_FUNC_EQUAL: + /* stencil_mask[i] = (stored[i] == ref) */ + spe_ceqi(f, stencil_mask, stored, ref); + break; + + case PIPE_FUNC_LEQUAL: + complement = TRUE; + /* FALLTHROUGH */ + case PIPE_FUNC_GREATER: + complement = TRUE; + /* stencil_mask[i] = (stored[i] > ref) */ + spe_clgti(f, stencil_mask, stored, ref); + break; + + case PIPE_FUNC_LESS: + complement = TRUE; + /* FALLTHROUGH */ + case PIPE_FUNC_GEQUAL: + /* stencil_mask[i] = (stored[i] > ref) */ + spe_clgti(f, stencil_mask, stored, ref); + /* tmp[i] = (stored[i] == ref) */ + spe_ceqi(f, tmp, stored, ref); + /* stencil_mask[i] = stencil_mask[i] | tmp[i] */ + spe_or(f, stencil_mask, stencil_mask, tmp); + break; + + case PIPE_FUNC_ALWAYS: + /* See comment below. */ + break; + + default: + assert(0); + break; + } + + if (stored != stencil) { + spe_release_register(f, stored); + } + spe_release_register(f, tmp); + + + /* ALWAYS is a very common stencil-test, so some effort is applied to + * optimize that case. The stencil-pass mask is the same as the input + * fragment mask. This makes the stencil-test (above) a no-op, and the + * input fragment mask can be "renamed" the stencil-pass mask. + */ + if (dsa->stencil[face].func == PIPE_FUNC_ALWAYS) { + spe_release_register(f, stencil_pass); + stencil_pass = mask; + } else { + if (complement) { + spe_andc(f, stencil_pass, mask, stencil_mask); + } else { + spe_and(f, stencil_pass, mask, stencil_mask); + } + } + + if (depth_complement) { + spe_andc(f, depth_pass, stencil_pass, depth_mask); + } else { + spe_and(f, depth_pass, stencil_pass, depth_mask); + } + + + /* Conditionally emit code to update the stencil value under various + * condititons. Note that there is no need to generate code under the + * following circumstances: + * + * - Stencil write mask is zero. + * - For stencil-fail if the stencil test is ALWAYS + * - For depth-fail if the stencil test is NEVER + * - For depth-pass if the stencil test is NEVER + * - Any of the 3 conditions if the operation is KEEP + */ + if (dsa->stencil[face].writemask != 0) { + if ((dsa->stencil[face].func != PIPE_FUNC_ALWAYS) + && (dsa->stencil[face].fail_op != PIPE_STENCIL_OP_KEEP)) { + if (complement) { + spe_and(f, stencil_fail, mask, stencil_mask); + } else { + spe_andc(f, stencil_fail, mask, stencil_mask); + } + + emit_stencil_op(f, face_stencil, stencil_src, stencil_fail, + dsa->stencil[face].fail_op, + dsa->stencil[face].ref_value); + + stencil_src = face_stencil; + } + + if ((dsa->stencil[face].func != PIPE_FUNC_NEVER) + && (dsa->stencil[face].zfail_op != PIPE_STENCIL_OP_KEEP)) { + if (depth_complement) { + spe_and(f, depth_fail, stencil_pass, depth_mask); + } else { + spe_andc(f, depth_fail, stencil_pass, depth_mask); + } + + emit_stencil_op(f, face_stencil, stencil_src, depth_fail, + dsa->stencil[face].zfail_op, + dsa->stencil[face].ref_value); + stencil_src = face_stencil; + } + + if ((dsa->stencil[face].func != PIPE_FUNC_NEVER) + && (dsa->stencil[face].zpass_op != PIPE_STENCIL_OP_KEEP)) { + emit_stencil_op(f, face_stencil, stencil_src, depth_pass, + dsa->stencil[face].zpass_op, + dsa->stencil[face].ref_value); + stencil_src = face_stencil; + } + } + + spe_release_register(f, stencil_fail); + spe_release_register(f, depth_fail); + spe_release_register(f, stencil_mask); + if (stencil_pass != mask) { + spe_release_register(f, stencil_pass); + } + + /* If all of the stencil operations were KEEP or the stencil write mask was + * zero, "stencil_src" will still be set to "stencil". In this case + * release the "face_stencil" register. Otherwise apply the stencil write + * mask to select bits from the calculated stencil value and the previous + * stencil value. + */ + if (stencil_src == stencil) { + spe_release_register(f, face_stencil); + } else if (dsa->stencil[face].writemask != 0x0ff) { + int tmp = spe_allocate_available_register(f); + + spe_il(f, tmp, dsa->stencil[face].writemask); + spe_selb(f, stencil_src, stencil, stencil_src, tmp); + + spe_release_register(f, tmp); + } + + return stencil_src; +} + + +void +cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa) +{ + struct pipe_depth_stencil_alpha_state *const dsa = &cdsa->base; + struct spe_function *const f = &cdsa->code; + + /* This code generates a maximum of 6 (alpha test) + 3 (depth test) + * + 25 (front stencil) + 25 (back stencil) + 4 = 63 instructions. Round + * up to 64 to make it a happy power-of-two. + */ + spe_init_func(f, SPE_INST_SIZE * 64); + + + /* Allocate registers for the function's input parameters. Cleverly (and + * clever code is usually dangerous, but I couldn't resist) the generated + * function returns a structure. Returned structures start with register + * 3, and the structure fields are ordered to match up exactly with the + * input parameters. + */ + int mask = spe_allocate_register(f, 3); + int depth = spe_allocate_register(f, 4); + int stencil = spe_allocate_register(f, 5); + int zvals = spe_allocate_register(f, 6); + int frag_a = spe_allocate_register(f, 7); + int facing = spe_allocate_register(f, 8); + + int depth_mask = spe_allocate_available_register(f); + + boolean depth_complement; + + + emit_alpha_test(dsa, f, mask, frag_a); + + depth_complement = emit_depth_test(dsa, f, depth_mask, depth, zvals); + + if (dsa->stencil[0].enabled) { + const int front_depth_pass = spe_allocate_available_register(f); + int front_stencil = emit_stencil_test(dsa, 0, f, mask, + depth_mask, depth_complement, + stencil, front_depth_pass); + + if (dsa->stencil[1].enabled) { + const int back_depth_pass = spe_allocate_available_register(f); + int back_stencil = emit_stencil_test(dsa, 1, f, mask, + depth_mask, depth_complement, + stencil, back_depth_pass); + + /* If the front facing stencil value and the back facing stencil + * value are stored in the same register, there is no need to select + * a value based on the facing. This can happen if the stencil value + * was not modified due to the write masks being zero, the stencil + * operations being KEEP, etc. + */ + if (front_stencil != back_stencil) { + spe_selb(f, stencil, back_stencil, front_stencil, facing); + } + + if (back_stencil != stencil) { + spe_release_register(f, back_stencil); + } + + if (front_stencil != stencil) { + spe_release_register(f, front_stencil); + } + + spe_selb(f, mask, back_depth_pass, front_depth_pass, facing); + + spe_release_register(f, back_depth_pass); + } else { + if (front_stencil != stencil) { + spe_or(f, stencil, front_stencil, front_stencil); + spe_release_register(f, front_stencil); + } + spe_or(f, mask, front_depth_pass, front_depth_pass); + } + + spe_release_register(f, front_depth_pass); + } else if (dsa->depth.enabled) { + if (depth_complement) { + spe_andc(f, mask, mask, depth_mask); + } else { + spe_and(f, mask, mask, depth_mask); + } + } + + if (dsa->depth.writemask) { + spe_selb(f, depth, depth, zvals, mask); + } + + spe_bi(f, 0, 0, 0); /* return from function call */ + + +#if 0 + { + const uint32_t *p = f->store; + unsigned i; + + printf("# alpha (%sabled)\n", + (dsa->alpha.enabled) ? "en" : "dis"); + printf("# func: %u\n", dsa->alpha.func); + printf("# ref: %.2f\n", dsa->alpha.ref); + + printf("# depth (%sabled)\n", + (dsa->depth.enabled) ? "en" : "dis"); + printf("# func: %u\n", dsa->depth.func); + + for (i = 0; i < 2; i++) { + printf("# %s stencil (%sabled)\n", + (i == 0) ? "front" : "back", + (dsa->stencil[i].enabled) ? "en" : "dis"); + + printf("# func: %u\n", dsa->stencil[i].func); + printf("# op (sf, zf, zp): %u %u %u\n", + dsa->stencil[i].fail_op, + dsa->stencil[i].zfail_op, + dsa->stencil[i].zpass_op); + printf("# ref value / value mask / write mask: %02x %02x %02x\n", + dsa->stencil[i].ref_value, + dsa->stencil[i].valuemask, + dsa->stencil[i].writemask); + } + + printf("\t.text\n"); + for (/* empty */; p < f->csr; p++) { + printf("\t.long\t0x%04x\n", *p); + } + fflush(stdout); + } +#endif +} + + +/** + * \note Emits a maximum of 3 instructions + */ +static int +emit_alpha_factor_calculation(struct spe_function *f, + unsigned factor, + int src_alpha, int dst_alpha, int const_alpha) +{ + int factor_reg; + int tmp; + + + switch (factor) { + case PIPE_BLENDFACTOR_ONE: + factor_reg = -1; + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA: + factor_reg = spe_allocate_available_register(f); + + spe_or(f, factor_reg, src_alpha, src_alpha); + break; + + case PIPE_BLENDFACTOR_DST_ALPHA: + factor_reg = dst_alpha; + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + factor_reg = -1; + break; + + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + factor_reg = spe_allocate_available_register(f); + + tmp = spe_allocate_available_register(f); + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + spe_fs(f, factor_reg, tmp, const_alpha); + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_CONST_ALPHA: + factor_reg = const_alpha; + break; + + case PIPE_BLENDFACTOR_ZERO: + factor_reg = -1; + break; + + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + tmp = spe_allocate_available_register(f); + factor_reg = spe_allocate_available_register(f); + + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + spe_fs(f, factor_reg, tmp, src_alpha); + + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + tmp = spe_allocate_available_register(f); + factor_reg = spe_allocate_available_register(f); + + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + spe_fs(f, factor_reg, tmp, dst_alpha); + + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + default: + assert(0); + factor_reg = -1; + break; + } + + return factor_reg; +} + + +/** + * \note Emits a maximum of 6 instructions + */ +static void +emit_color_factor_calculation(struct spe_function *f, + unsigned sF, unsigned mask, + const int *src, + const int *dst, + const int *const_color, + int *factor) +{ + int tmp; + unsigned i; + + + factor[0] = -1; + factor[1] = -1; + factor[2] = -1; + factor[3] = -1; + + switch (sF) { + case PIPE_BLENDFACTOR_ONE: + break; + + case PIPE_BLENDFACTOR_SRC_COLOR: + for (i = 0; i < 3; ++i) { + if ((mask & (1U << i)) != 0) { + factor[i] = spe_allocate_available_register(f); + spe_or(f, factor[i], src[i], src[i]); + } + } + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA: + factor[0] = spe_allocate_available_register(f); + factor[1] = factor[0]; + factor[2] = factor[0]; + + spe_or(f, factor[0], src[3], src[3]); + break; + + case PIPE_BLENDFACTOR_DST_ALPHA: + factor[0] = dst[3]; + factor[1] = dst[3]; + factor[2] = dst[3]; + break; + + case PIPE_BLENDFACTOR_DST_COLOR: + factor[0] = dst[0]; + factor[1] = dst[1]; + factor[2] = dst[2]; + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + tmp = spe_allocate_available_register(f); + factor[0] = spe_allocate_available_register(f); + factor[1] = factor[0]; + factor[2] = factor[0]; + + /* Alpha saturate means min(As, 1-Ad). + */ + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + spe_fs(f, tmp, tmp, dst[3]); + spe_fcgt(f, factor[0], tmp, src[3]); + spe_selb(f, factor[0], src[3], tmp, factor[0]); + + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + tmp = spe_allocate_available_register(f); + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + + for (i = 0; i < 3; i++) { + factor[i] = spe_allocate_available_register(f); + + spe_fs(f, factor[i], tmp, const_color[i]); + } + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_CONST_COLOR: + for (i = 0; i < 3; i++) { + factor[i] = const_color[i]; + } + break; + + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + factor[0] = spe_allocate_available_register(f); + factor[1] = factor[0]; + factor[2] = factor[0]; + + tmp = spe_allocate_available_register(f); + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + spe_fs(f, factor[0], tmp, const_color[3]); + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_CONST_ALPHA: + factor[0] = const_color[3]; + factor[1] = factor[0]; + factor[2] = factor[0]; + break; + + case PIPE_BLENDFACTOR_ZERO: + break; + + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + tmp = spe_allocate_available_register(f); + + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + + for (i = 0; i < 3; ++i) { + if ((mask & (1U << i)) != 0) { + factor[i] = spe_allocate_available_register(f); + spe_fs(f, factor[i], tmp, src[i]); + } + } + + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + tmp = spe_allocate_available_register(f); + factor[0] = spe_allocate_available_register(f); + factor[1] = factor[0]; + factor[2] = factor[0]; + + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + spe_fs(f, factor[0], tmp, src[3]); + + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + tmp = spe_allocate_available_register(f); + factor[0] = spe_allocate_available_register(f); + factor[1] = factor[0]; + factor[2] = factor[0]; + + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + spe_fs(f, factor[0], tmp, dst[3]); + + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_INV_DST_COLOR: + tmp = spe_allocate_available_register(f); + + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + + for (i = 0; i < 3; ++i) { + if ((mask & (1U << i)) != 0) { + factor[i] = spe_allocate_available_register(f); + spe_fs(f, factor[i], tmp, dst[i]); + } + } + + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + default: + assert(0); + } +} + + +static void +emit_blend_calculation(struct spe_function *f, + unsigned func, unsigned sF, unsigned dF, + int src, int src_factor, int dst, int dst_factor) +{ + int tmp = spe_allocate_available_register(f); + + switch (func) { + case PIPE_BLEND_ADD: + if (sF == PIPE_BLENDFACTOR_ONE) { + if (dF == PIPE_BLENDFACTOR_ZERO) { + /* Do nothing. */ + } else if (dF == PIPE_BLENDFACTOR_ONE) { + spe_fa(f, src, src, dst); + } + } else if (sF == PIPE_BLENDFACTOR_ZERO) { + if (dF == PIPE_BLENDFACTOR_ZERO) { + spe_il(f, src, 0); + } else if (dF == PIPE_BLENDFACTOR_ONE) { + spe_or(f, src, dst, dst); + } else { + spe_fm(f, src, dst, dst_factor); + } + } else if (dF == PIPE_BLENDFACTOR_ZERO) { + spe_fm(f, src, src, src_factor); + } else { + spe_fm(f, tmp, dst, dst_factor); + spe_fma(f, src, src, src_factor, tmp); + } + break; + + case PIPE_BLEND_SUBTRACT: + if (sF == PIPE_BLENDFACTOR_ONE) { + if (dF == PIPE_BLENDFACTOR_ZERO) { + /* Do nothing. */ + } else if (dF == PIPE_BLENDFACTOR_ONE) { + spe_fs(f, src, src, dst); + } + } else if (sF == PIPE_BLENDFACTOR_ZERO) { + if (dF == PIPE_BLENDFACTOR_ZERO) { + spe_il(f, src, 0); + } else if (dF == PIPE_BLENDFACTOR_ONE) { + spe_il(f, tmp, 0); + spe_fs(f, src, tmp, dst); + } else { + spe_fm(f, src, dst, dst_factor); + } + } else if (dF == PIPE_BLENDFACTOR_ZERO) { + spe_fm(f, src, src, src_factor); + } else { + spe_fm(f, tmp, dst, dst_factor); + spe_fms(f, src, src, src_factor, tmp); + } + break; + + case PIPE_BLEND_REVERSE_SUBTRACT: + if (sF == PIPE_BLENDFACTOR_ONE) { + if (dF == PIPE_BLENDFACTOR_ZERO) { + spe_il(f, tmp, 0); + spe_fs(f, src, tmp, src); + } else if (dF == PIPE_BLENDFACTOR_ONE) { + spe_fs(f, src, dst, src); + } + } else if (sF == PIPE_BLENDFACTOR_ZERO) { + if (dF == PIPE_BLENDFACTOR_ZERO) { + spe_il(f, src, 0); + } else if (dF == PIPE_BLENDFACTOR_ONE) { + spe_or(f, src, dst, dst); + } else { + spe_fm(f, src, dst, dst_factor); + } + } else if (dF == PIPE_BLENDFACTOR_ZERO) { + spe_fm(f, src, src, src_factor); + } else { + spe_fm(f, tmp, src, src_factor); + spe_fms(f, src, src, dst_factor, tmp); + } + break; + + case PIPE_BLEND_MIN: + spe_cgt(f, tmp, src, dst); + spe_selb(f, src, src, dst, tmp); + break; + + case PIPE_BLEND_MAX: + spe_cgt(f, tmp, src, dst); + spe_selb(f, src, dst, src, tmp); + break; + + default: + assert(0); + } + + spe_release_register(f, tmp); +} + + +/** + * Generate code to perform alpha blending on the SPE + */ +void +cell_generate_alpha_blend(struct cell_blend_state *cb) +{ + struct pipe_blend_state *const b = &cb->base; + struct spe_function *const f = &cb->code; + + /* This code generates a maximum of 3 (source alpha factor) + * + 3 (destination alpha factor) + (3 * 6) (source color factor) + * + (3 * 6) (destination color factor) + (4 * 2) (blend equation) + * + 4 (fragment mask) + 1 (return) = 55 instlructions. Round up to 64 to + * make it a happy power-of-two. + */ + spe_init_func(f, SPE_INST_SIZE * 64); + + + const int frag[4] = { + spe_allocate_register(f, 3), + spe_allocate_register(f, 4), + spe_allocate_register(f, 5), + spe_allocate_register(f, 6), + }; + const int pixel[4] = { + spe_allocate_register(f, 7), + spe_allocate_register(f, 8), + spe_allocate_register(f, 9), + spe_allocate_register(f, 10), + }; + const int const_color[4] = { + spe_allocate_register(f, 11), + spe_allocate_register(f, 12), + spe_allocate_register(f, 13), + spe_allocate_register(f, 14), + }; + unsigned func[4]; + unsigned sF[4]; + unsigned dF[4]; + unsigned i; + int src_factor[4]; + int dst_factor[4]; + + + /* Does the selected blend mode make use of the source / destination + * color (RGB) blend factors? + */ + boolean need_color_factor = b->blend_enable + && (b->rgb_func != PIPE_BLEND_MIN) + && (b->rgb_func != PIPE_BLEND_MAX); + + /* Does the selected blend mode make use of the source / destination + * alpha blend factors? + */ + boolean need_alpha_factor = b->blend_enable + && (b->alpha_func != PIPE_BLEND_MIN) + && (b->alpha_func != PIPE_BLEND_MAX); + + + if (b->blend_enable) { + sF[0] = b->rgb_src_factor; + sF[1] = sF[0]; + sF[2] = sF[0]; + switch (b->alpha_src_factor & 0x0f) { + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + sF[3] = PIPE_BLENDFACTOR_ONE; + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + case PIPE_BLENDFACTOR_DST_COLOR: + case PIPE_BLENDFACTOR_CONST_COLOR: + case PIPE_BLENDFACTOR_SRC1_COLOR: + sF[3] = b->alpha_src_factor + 1; + break; + default: + sF[3] = b->alpha_src_factor; + } + + dF[0] = b->rgb_dst_factor; + dF[1] = dF[0]; + dF[2] = dF[0]; + switch (b->alpha_dst_factor & 0x0f) { + case PIPE_BLENDFACTOR_SRC_COLOR: + case PIPE_BLENDFACTOR_DST_COLOR: + case PIPE_BLENDFACTOR_CONST_COLOR: + case PIPE_BLENDFACTOR_SRC1_COLOR: + dF[3] = b->alpha_dst_factor + 1; + break; + default: + dF[3] = b->alpha_dst_factor; + } + + func[0] = b->rgb_func; + func[1] = func[0]; + func[2] = func[0]; + func[3] = b->alpha_func; + } else { + sF[0] = PIPE_BLENDFACTOR_ONE; + sF[1] = PIPE_BLENDFACTOR_ONE; + sF[2] = PIPE_BLENDFACTOR_ONE; + sF[3] = PIPE_BLENDFACTOR_ONE; + dF[0] = PIPE_BLENDFACTOR_ZERO; + dF[1] = PIPE_BLENDFACTOR_ZERO; + dF[2] = PIPE_BLENDFACTOR_ZERO; + dF[3] = PIPE_BLENDFACTOR_ZERO; + + func[0] = PIPE_BLEND_ADD; + func[1] = PIPE_BLEND_ADD; + func[2] = PIPE_BLEND_ADD; + func[3] = PIPE_BLEND_ADD; + } + + + /* If alpha writing is enabled and the alpha blend mode requires use of + * the alpha factor, calculate the alpha factor. + */ + if (((b->colormask & 8) != 0) && need_alpha_factor) { + src_factor[3] = emit_alpha_factor_calculation(f, sF[3], const_color[3], + frag[3], pixel[3]); + + /* If the alpha destination blend factor is the same as the alpha source + * blend factor, re-use the previously calculated value. + */ + dst_factor[3] = (dF[3] == sF[3]) + ? src_factor[3] + : emit_alpha_factor_calculation(f, dF[3], const_color[3], + frag[3], pixel[3]); + } + + + if (sF[0] == sF[3]) { + src_factor[0] = src_factor[3]; + src_factor[1] = src_factor[3]; + src_factor[2] = src_factor[3]; + } else if (sF[0] == dF[3]) { + src_factor[0] = dst_factor[3]; + src_factor[1] = dst_factor[3]; + src_factor[2] = dst_factor[3]; + } else if (need_color_factor) { + emit_color_factor_calculation(f, + b->rgb_src_factor, + b->colormask, + frag, pixel, const_color, src_factor); + } + + + if (dF[0] == sF[3]) { + dst_factor[0] = src_factor[3]; + dst_factor[1] = src_factor[3]; + dst_factor[2] = src_factor[3]; + } else if (dF[0] == dF[3]) { + dst_factor[0] = dst_factor[3]; + dst_factor[1] = dst_factor[3]; + dst_factor[2] = dst_factor[3]; + } else if (dF[0] == sF[0]) { + dst_factor[0] = src_factor[0]; + dst_factor[1] = src_factor[1]; + dst_factor[2] = src_factor[2]; + } else if (need_color_factor) { + emit_color_factor_calculation(f, + b->rgb_dst_factor, + b->colormask, + frag, pixel, const_color, dst_factor); + } + + + + for (i = 0; i < 4; ++i) { + if ((b->colormask & (1U << i)) != 0) { + emit_blend_calculation(f, + func[i], sF[i], dF[i], + frag[i], src_factor[i], + pixel[i], dst_factor[i]); + } + } + + spe_bi(f, 0, 0, 0); + +#if 0 + { + const uint32_t *p = f->store; + + printf("# %u instructions\n", f->csr - f->store); + printf("# blend (%sabled)\n", + (cb->base.blend_enable) ? "en" : "dis"); + printf("# RGB func / sf / df: %u %u %u\n", + cb->base.rgb_func, + cb->base.rgb_src_factor, + cb->base.rgb_dst_factor); + printf("# ALP func / sf / df: %u %u %u\n", + cb->base.alpha_func, + cb->base.alpha_src_factor, + cb->base.alpha_dst_factor); + + printf("\t.text\n"); + for (/* empty */; p < f->csr; p++) { + printf("\t.long\t0x%04x\n", *p); + } + fflush(stdout); + } +#endif +} + + +static int +PC_OFFSET(const struct spe_function *f, const void *d) +{ + const intptr_t pc = (intptr_t) &f->store[f->num_inst]; + const intptr_t ea = ~0x0f & (intptr_t) d; + + return (ea - pc) >> 2; +} + + +/** + * Generate code to perform color conversion and logic op + * + * \bug + * The code generated by this function should also perform dithering. + * + * \bug + * The code generated by this function should also perform color-write + * masking. + * + * \bug + * Only two framebuffer formats are supported at this time. + */ +void +cell_generate_logic_op(struct spe_function *f, + const struct pipe_blend_state *blend, + struct pipe_surface *surf) +{ + const unsigned logic_op = (blend->logicop_enable) + ? blend->logicop_func : PIPE_LOGICOP_COPY; + + /* This code generates a maximum of 37 instructions. An additional 32 + * bytes (equiv. to 8 instructions) are needed for data storage. Round up + * to 64 to make it a happy power-of-two. + */ + spe_init_func(f, SPE_INST_SIZE * 64); + + + /* Pixel colors in framebuffer format in AoS layout. + */ + const int pixel[4] = { + spe_allocate_register(f, 3), + spe_allocate_register(f, 4), + spe_allocate_register(f, 5), + spe_allocate_register(f, 6), + }; + + /* Fragment colors stored as floats in SoA layout. + */ + const int frag[4] = { + spe_allocate_register(f, 7), + spe_allocate_register(f, 8), + spe_allocate_register(f, 9), + spe_allocate_register(f, 10), + }; + + const int mask = spe_allocate_register(f, 11); + + + /* Short-circuit the noop and invert cases. + */ + if ((logic_op == PIPE_LOGICOP_NOOP) || (blend->colormask == 0)) { + spe_bi(f, 0, 0, 0); + return; + } else if (logic_op == PIPE_LOGICOP_INVERT) { + spe_nor(f, pixel[0], pixel[0], pixel[0]); + spe_nor(f, pixel[1], pixel[1], pixel[1]); + spe_nor(f, pixel[2], pixel[2], pixel[2]); + spe_nor(f, pixel[3], pixel[3], pixel[3]); + spe_bi(f, 0, 0, 0); + return; + } + + + const int tmp[4] = { + spe_allocate_available_register(f), + spe_allocate_available_register(f), + spe_allocate_available_register(f), + spe_allocate_available_register(f), + }; + + const int shuf_xpose_hi = spe_allocate_available_register(f); + const int shuf_xpose_lo = spe_allocate_available_register(f); + const int shuf_color = spe_allocate_available_register(f); + + + /* Pointer to the begining of the function's private data area. + */ + uint32_t *const data = ((uint32_t *) f->store) + (64 - 8); + + + /* Convert fragment colors to framebuffer format in AoS layout. + */ + switch (surf->format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + data[0] = 0x00010203; + data[1] = 0x10111213; + data[2] = 0x04050607; + data[3] = 0x14151617; + data[4] = 0x0c000408; + data[5] = 0x80808080; + data[6] = 0x80808080; + data[7] = 0x80808080; + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + data[0] = 0x03020100; + data[1] = 0x13121110; + data[2] = 0x07060504; + data[3] = 0x17161514; + data[4] = 0x0804000c; + data[5] = 0x80808080; + data[6] = 0x80808080; + data[7] = 0x80808080; + break; + default: + fprintf(stderr, "CELL: Bad pixel format in cell_generate_logic_op()"); + ASSERT(0); + } + + spe_ilh(f, tmp[0], 0x0808); + spe_lqr(f, shuf_xpose_hi, PC_OFFSET(f, data+0)); + spe_lqr(f, shuf_color, PC_OFFSET(f, data+4)); + spe_a(f, shuf_xpose_lo, shuf_xpose_hi, tmp[0]); + + spe_shufb(f, tmp[0], frag[0], frag[2], shuf_xpose_hi); + spe_shufb(f, tmp[1], frag[0], frag[2], shuf_xpose_lo); + spe_shufb(f, tmp[2], frag[1], frag[3], shuf_xpose_hi); + spe_shufb(f, tmp[3], frag[1], frag[3], shuf_xpose_lo); + + spe_shufb(f, frag[0], tmp[0], tmp[2], shuf_xpose_hi); + spe_shufb(f, frag[1], tmp[0], tmp[2], shuf_xpose_lo); + spe_shufb(f, frag[2], tmp[1], tmp[3], shuf_xpose_hi); + spe_shufb(f, frag[3], tmp[1], tmp[3], shuf_xpose_lo); + + spe_cfltu(f, frag[0], frag[0], 32); + spe_cfltu(f, frag[1], frag[1], 32); + spe_cfltu(f, frag[2], frag[2], 32); + spe_cfltu(f, frag[3], frag[3], 32); + + spe_shufb(f, frag[0], frag[0], pixel[0], shuf_color); + spe_shufb(f, frag[1], frag[1], pixel[1], shuf_color); + spe_shufb(f, frag[2], frag[2], pixel[2], shuf_color); + spe_shufb(f, frag[3], frag[3], pixel[3], shuf_color); + + + /* If logic op is enabled, perform the requested logical operation on the + * converted fragment colors and the pixel colors. + */ + switch (logic_op) { + case PIPE_LOGICOP_CLEAR: + spe_il(f, frag[0], 0); + spe_il(f, frag[1], 0); + spe_il(f, frag[2], 0); + spe_il(f, frag[3], 0); + break; + case PIPE_LOGICOP_NOR: + spe_nor(f, frag[0], frag[0], pixel[0]); + spe_nor(f, frag[1], frag[1], pixel[1]); + spe_nor(f, frag[2], frag[2], pixel[2]); + spe_nor(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_AND_INVERTED: + spe_andc(f, frag[0], pixel[0], frag[0]); + spe_andc(f, frag[1], pixel[1], frag[1]); + spe_andc(f, frag[2], pixel[2], frag[2]); + spe_andc(f, frag[3], pixel[3], frag[3]); + break; + case PIPE_LOGICOP_COPY_INVERTED: + spe_nor(f, frag[0], frag[0], frag[0]); + spe_nor(f, frag[1], frag[1], frag[1]); + spe_nor(f, frag[2], frag[2], frag[2]); + spe_nor(f, frag[3], frag[3], frag[3]); + break; + case PIPE_LOGICOP_AND_REVERSE: + spe_andc(f, frag[0], frag[0], pixel[0]); + spe_andc(f, frag[1], frag[1], pixel[1]); + spe_andc(f, frag[2], frag[2], pixel[2]); + spe_andc(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_XOR: + spe_xor(f, frag[0], frag[0], pixel[0]); + spe_xor(f, frag[1], frag[1], pixel[1]); + spe_xor(f, frag[2], frag[2], pixel[2]); + spe_xor(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_NAND: + spe_nand(f, frag[0], frag[0], pixel[0]); + spe_nand(f, frag[1], frag[1], pixel[1]); + spe_nand(f, frag[2], frag[2], pixel[2]); + spe_nand(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_AND: + spe_and(f, frag[0], frag[0], pixel[0]); + spe_and(f, frag[1], frag[1], pixel[1]); + spe_and(f, frag[2], frag[2], pixel[2]); + spe_and(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_EQUIV: + spe_eqv(f, frag[0], frag[0], pixel[0]); + spe_eqv(f, frag[1], frag[1], pixel[1]); + spe_eqv(f, frag[2], frag[2], pixel[2]); + spe_eqv(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_OR_INVERTED: + spe_orc(f, frag[0], pixel[0], frag[0]); + spe_orc(f, frag[1], pixel[1], frag[1]); + spe_orc(f, frag[2], pixel[2], frag[2]); + spe_orc(f, frag[3], pixel[3], frag[3]); + break; + case PIPE_LOGICOP_COPY: + break; + case PIPE_LOGICOP_OR_REVERSE: + spe_orc(f, frag[0], frag[0], pixel[0]); + spe_orc(f, frag[1], frag[1], pixel[1]); + spe_orc(f, frag[2], frag[2], pixel[2]); + spe_orc(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_OR: + spe_or(f, frag[0], frag[0], pixel[0]); + spe_or(f, frag[1], frag[1], pixel[1]); + spe_or(f, frag[2], frag[2], pixel[2]); + spe_or(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_SET: + spe_il(f, frag[0], ~0); + spe_il(f, frag[1], ~0); + spe_il(f, frag[2], ~0); + spe_il(f, frag[3], ~0); + break; + + /* These two cases are short-circuited above. + */ + case PIPE_LOGICOP_INVERT: + case PIPE_LOGICOP_NOOP: + default: + assert(0); + } + + + /* Apply fragment mask. + */ + spe_ilh(f, tmp[0], 0x0000); + spe_ilh(f, tmp[1], 0x0404); + spe_ilh(f, tmp[2], 0x0808); + spe_ilh(f, tmp[3], 0x0c0c); + + spe_shufb(f, tmp[0], mask, mask, tmp[0]); + spe_shufb(f, tmp[1], mask, mask, tmp[1]); + spe_shufb(f, tmp[2], mask, mask, tmp[2]); + spe_shufb(f, tmp[3], mask, mask, tmp[3]); + + spe_selb(f, pixel[0], pixel[0], frag[0], tmp[0]); + spe_selb(f, pixel[1], pixel[1], frag[1], tmp[1]); + spe_selb(f, pixel[2], pixel[2], frag[2], tmp[2]); + spe_selb(f, pixel[3], pixel[3], frag[3], tmp[3]); + + spe_bi(f, 0, 0, 0); + +#if 0 + { + const uint32_t *p = f->store; + unsigned i; + + printf("# %u instructions\n", f->csr - f->store); + + printf("\t.text\n"); + for (i = 0; i < 64; i++) { + printf("\t.long\t0x%04x\n", p[i]); + } + fflush(stdout); + } +#endif +} diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h new file mode 100644 index 00000000000..a8267a51331 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h @@ -0,0 +1,39 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef CELL_STATE_PER_FRAGMENT_H +#define CELL_STATE_PER_FRAGMENT_H + +extern void +cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa); + +extern void +cell_generate_alpha_blend(struct cell_blend_state *cb); + +extern void +cell_generate_logic_op(struct spe_function *f, + const struct pipe_blend_state *blend, + struct pipe_surface *surf); + +#endif /* CELL_STATE_PER_FRAGMENT_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c new file mode 100644 index 00000000000..bf517ea5635 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c @@ -0,0 +1,219 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "draw/draw_context.h" +#include "tgsi/tgsi_parse.h" + +#include "cell_context.h" +#include "cell_state.h" +#include "cell_gen_fp.h" + + +/** cast wrapper */ +static INLINE struct cell_fragment_shader_state * +cell_fragment_shader_state(void *shader) +{ + return (struct cell_fragment_shader_state *) shader; +} + + +/** cast wrapper */ +static INLINE struct cell_vertex_shader_state * +cell_vertex_shader_state(void *shader) +{ + return (struct cell_vertex_shader_state *) shader; +} + + +/** + * Create fragment shader state. + * Called via pipe->create_fs_state() + */ +static void * +cell_create_fs_state(struct pipe_context *pipe, + const struct pipe_shader_state *templ) +{ + struct cell_context *cell = cell_context(pipe); + struct cell_fragment_shader_state *cfs; + + cfs = CALLOC_STRUCT(cell_fragment_shader_state); + if (!cfs) + return NULL; + + cfs->shader.tokens = tgsi_dup_tokens(templ->tokens); + if (!cfs->shader.tokens) { + FREE(cfs); + return NULL; + } + + tgsi_scan_shader(templ->tokens, &cfs->info); + + cell_gen_fragment_program(cell, cfs->shader.tokens, &cfs->code); + + return cfs; +} + + +/** + * Called via pipe->bind_fs_state() + */ +static void +cell_bind_fs_state(struct pipe_context *pipe, void *fs) +{ + struct cell_context *cell = cell_context(pipe); + + cell->fs = cell_fragment_shader_state(fs); + + cell->dirty |= CELL_NEW_FS; +} + + +/** + * Called via pipe->delete_fs_state() + */ +static void +cell_delete_fs_state(struct pipe_context *pipe, void *fs) +{ + struct cell_fragment_shader_state *cfs = cell_fragment_shader_state(fs); + + spe_release_func(&cfs->code); + + FREE((void *) cfs->shader.tokens); + FREE(cfs); +} + + +/** + * Create vertex shader state. + * Called via pipe->create_vs_state() + */ +static void * +cell_create_vs_state(struct pipe_context *pipe, + const struct pipe_shader_state *templ) +{ + struct cell_context *cell = cell_context(pipe); + struct cell_vertex_shader_state *cvs; + + cvs = CALLOC_STRUCT(cell_vertex_shader_state); + if (!cvs) + return NULL; + + cvs->shader.tokens = tgsi_dup_tokens(templ->tokens); + if (!cvs->shader.tokens) { + FREE(cvs); + return NULL; + } + + tgsi_scan_shader(templ->tokens, &cvs->info); + + cvs->draw_data = draw_create_vertex_shader(cell->draw, &cvs->shader); + if (cvs->draw_data == NULL) { + FREE( (void *) cvs->shader.tokens ); + FREE( cvs ); + return NULL; + } + + return cvs; +} + + +/** + * Called via pipe->bind_vs_state() + */ +static void +cell_bind_vs_state(struct pipe_context *pipe, void *vs) +{ + struct cell_context *cell = cell_context(pipe); + + cell->vs = cell_vertex_shader_state(vs); + + draw_bind_vertex_shader(cell->draw, + (cell->vs ? cell->vs->draw_data : NULL)); + + cell->dirty |= CELL_NEW_VS; +} + + +/** + * Called via pipe->delete_vs_state() + */ +static void +cell_delete_vs_state(struct pipe_context *pipe, void *vs) +{ + struct cell_context *cell = cell_context(pipe); + struct cell_vertex_shader_state *cvs = cell_vertex_shader_state(vs); + + draw_delete_vertex_shader(cell->draw, cvs->draw_data); + FREE( (void *) cvs->shader.tokens ); + FREE( cvs ); +} + + +/** + * Called via pipe->set_constant_buffer() + */ +static void +cell_set_constant_buffer(struct pipe_context *pipe, + uint shader, uint index, + const struct pipe_constant_buffer *buf) +{ + struct cell_context *cell = cell_context(pipe); + + assert(shader < PIPE_SHADER_TYPES); + assert(index == 0); + + draw_flush(cell->draw); + + /* note: reference counting */ + pipe_buffer_reference(pipe->screen, + &cell->constants[shader].buffer, + buf->buffer); + + if (shader == PIPE_SHADER_VERTEX) + cell->dirty |= CELL_NEW_VS_CONSTANTS; + else if (shader == PIPE_SHADER_FRAGMENT) + cell->dirty |= CELL_NEW_FS_CONSTANTS; +} + + +void +cell_init_shader_functions(struct cell_context *cell) +{ + cell->pipe.create_fs_state = cell_create_fs_state; + cell->pipe.bind_fs_state = cell_bind_fs_state; + cell->pipe.delete_fs_state = cell_delete_fs_state; + + cell->pipe.create_vs_state = cell_create_vs_state; + cell->pipe.bind_vs_state = cell_bind_vs_state; + cell->pipe.delete_vs_state = cell_delete_vs_state; + + cell->pipe.set_constant_buffer = cell_set_constant_buffer; +} diff --git a/src/gallium/drivers/cell/ppu/cell_state_vertex.c b/src/gallium/drivers/cell/ppu/cell_state_vertex.c new file mode 100644 index 00000000000..fbe55c84721 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state_vertex.c @@ -0,0 +1,79 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <[email protected]> + */ + + +#include "cell_context.h" +#include "cell_state.h" + +#include "draw/draw_context.h" + + +static void +cell_set_vertex_elements(struct pipe_context *pipe, + unsigned count, + const struct pipe_vertex_element *elements) +{ + struct cell_context *cell = cell_context(pipe); + + assert(count <= PIPE_MAX_ATTRIBS); + + memcpy(cell->vertex_element, elements, count * sizeof(elements[0])); + cell->num_vertex_elements = count; + + cell->dirty |= CELL_NEW_VERTEX; + + draw_set_vertex_elements(cell->draw, count, elements); +} + + +static void +cell_set_vertex_buffers(struct pipe_context *pipe, + unsigned count, + const struct pipe_vertex_buffer *buffers) +{ + struct cell_context *cell = cell_context(pipe); + + assert(count <= PIPE_MAX_ATTRIBS); + + memcpy(cell->vertex_buffer, buffers, count * sizeof(buffers[0])); + cell->num_vertex_buffers = count; + + cell->dirty |= CELL_NEW_VERTEX; + + draw_set_vertex_buffers(cell->draw, count, buffers); +} + + +void +cell_init_vertex_functions(struct cell_context *cell) +{ + cell->pipe.set_vertex_buffers = cell_set_vertex_buffers; + cell->pipe.set_vertex_elements = cell_set_vertex_elements; +} diff --git a/src/gallium/drivers/cell/ppu/cell_surface.c b/src/gallium/drivers/cell/ppu/cell_surface.c new file mode 100644 index 00000000000..c9203fee087 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_surface.c @@ -0,0 +1,38 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_rect.h" +#include "cell_context.h" +#include "cell_surface.h" + + +void +cell_init_surface_functions(struct cell_context *cell) +{ + cell->pipe.surface_copy = util_surface_copy; + cell->pipe.surface_fill = util_surface_fill; +} diff --git a/src/gallium/drivers/cell/ppu/cell_surface.h b/src/gallium/drivers/cell/ppu/cell_surface.h new file mode 100644 index 00000000000..9e58f329443 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_surface.h @@ -0,0 +1,42 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <[email protected]> + */ + +#ifndef CELL_SURFACE_H +#define CELL_SURFACE_H + + +struct cell_context; + + +extern void +cell_init_surface_functions(struct cell_context *cell); + + +#endif /* SP_SURFACE_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c new file mode 100644 index 00000000000..9ba995ab7d3 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_texture.c @@ -0,0 +1,531 @@ +/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + * Michel Dänzer <[email protected]> + * Brian Paul + */ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "cell_context.h" +#include "cell_state.h" +#include "cell_texture.h" + + + +static unsigned +minify(unsigned d) +{ + return MAX2(1, d>>1); +} + + +static void +cell_texture_layout(struct cell_texture *ct) +{ + struct pipe_texture *pt = &ct->base; + unsigned level; + unsigned width = pt->width[0]; + unsigned height = pt->height[0]; + unsigned depth = pt->depth[0]; + + ct->buffer_size = 0; + + for ( level = 0 ; level <= pt->last_level ; level++ ) { + unsigned size; + unsigned w_tile, h_tile; + + assert(level < CELL_MAX_TEXTURE_LEVELS); + + /* width, height, rounded up to tile size */ + w_tile = align(width, TILE_SIZE); + h_tile = align(height, TILE_SIZE); + + pt->width[level] = width; + pt->height[level] = height; + pt->depth[level] = depth; + pt->nblocksx[level] = pf_get_nblocksx(&pt->block, w_tile); + pt->nblocksy[level] = pf_get_nblocksy(&pt->block, h_tile); + + ct->stride[level] = pt->nblocksx[level] * pt->block.size; + + ct->level_offset[level] = ct->buffer_size; + + size = pt->nblocksx[level] * pt->nblocksy[level] * pt->block.size; + if (pt->target == PIPE_TEXTURE_CUBE) + size *= 6; + else + size *= depth; + + ct->buffer_size += size; + + width = minify(width); + height = minify(height); + depth = minify(depth); + } +} + + +static struct pipe_texture * +cell_texture_create(struct pipe_screen *screen, + const struct pipe_texture *templat) +{ + struct pipe_winsys *ws = screen->winsys; + struct cell_texture *ct = CALLOC_STRUCT(cell_texture); + if (!ct) + return NULL; + + ct->base = *templat; + ct->base.refcount = 1; + ct->base.screen = screen; + + cell_texture_layout(ct); + + ct->buffer = ws->buffer_create(ws, 32, PIPE_BUFFER_USAGE_PIXEL, + ct->buffer_size); + + if (!ct->buffer) { + FREE(ct); + return NULL; + } + + return &ct->base; +} + + +static void +cell_texture_release(struct pipe_screen *screen, + struct pipe_texture **pt) +{ + if (!*pt) + return; + + /* + DBG("%s %p refcount will be %d\n", + __FUNCTION__, (void *) *pt, (*pt)->refcount - 1); + */ + if (--(*pt)->refcount <= 0) { + /* Delete this texture now. + * But note that the underlying pipe_buffer may linger... + */ + struct cell_texture *ct = cell_texture(*pt); + uint i; + + /* + DBG("%s deleting %p\n", __FUNCTION__, (void *) ct); + */ + + pipe_buffer_reference(screen, &ct->buffer, NULL); + + for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) { + /* Unreference the tiled image buffer. + * It may not actually be deleted until a fence is hit. + */ + if (ct->tiled_buffer[i]) { + ct->tiled_mapped[i] = NULL; + pipe_buffer_reference(screen, &ct->tiled_buffer[i], NULL); + } + } + + FREE(ct); + } + *pt = NULL; +} + + + +/** + * Convert image from linear layout to tiled layout. 4-byte pixels. + */ +static void +twiddle_image_uint(uint w, uint h, uint tile_size, uint *dst, + uint src_stride, const uint *src) +{ + const uint tile_size2 = tile_size * tile_size; + const uint h_t = (h + tile_size - 1) / tile_size; + const uint w_t = (w + tile_size - 1) / tile_size; + + uint it, jt; /* tile counters */ + uint i, j; /* intra-tile counters */ + + src_stride /= 4; /* convert from bytes to pixels */ + + /* loop over dest tiles */ + for (it = 0; it < h_t; it++) { + for (jt = 0; jt < w_t; jt++) { + /* start of dest tile: */ + uint *tdst = dst + (it * w_t + jt) * tile_size2; + + /* compute size of this tile (may be smaller than tile_size) */ + /* XXX note: a compiler bug was found here. That's why the code + * looks as it does. + */ + uint tile_width = w - jt * tile_size; + tile_width = MIN2(tile_width, tile_size); + uint tile_height = h - it * tile_size; + tile_height = MIN2(tile_height, tile_size); + + /* loop over texels in the tile */ + for (i = 0; i < tile_height; i++) { + for (j = 0; j < tile_width; j++) { + const uint srci = it * tile_size + i; + const uint srcj = jt * tile_size + j; + ASSERT(srci < h); + ASSERT(srcj < w); + tdst[i * tile_size + j] = src[srci * src_stride + srcj]; + } + } + } + } +} + + +/** + * For Cell. Basically, rearrange the pixels/quads from this layout: + * +--+--+--+--+ + * |p0|p1|p2|p3|.... + * +--+--+--+--+ + * + * to this layout: + * +--+--+ + * |p0|p1|.... + * +--+--+ + * |p2|p3| + * +--+--+ + */ +static void +twiddle_tile(const uint *tileIn, uint *tileOut) +{ + int y, x; + + for (y = 0; y < TILE_SIZE; y+=2) { + for (x = 0; x < TILE_SIZE; x+=2) { + int k = 4 * (y/2 * TILE_SIZE/2 + x/2); + tileOut[y * TILE_SIZE + (x + 0)] = tileIn[k]; + tileOut[y * TILE_SIZE + (x + 1)] = tileIn[k+1]; + tileOut[(y + 1) * TILE_SIZE + (x + 0)] = tileIn[k+2]; + tileOut[(y + 1) * TILE_SIZE + (x + 1)] = tileIn[k+3]; + } + } +} + + +/** + * Convert image from tiled layout to linear layout. 4-byte pixels. + */ +static void +untwiddle_image_uint(uint w, uint h, uint tile_size, uint *dst, + uint dst_stride, const uint *src) +{ + const uint tile_size2 = tile_size * tile_size; + const uint h_t = (h + tile_size - 1) / tile_size; + const uint w_t = (w + tile_size - 1) / tile_size; + uint *tile_buf; + uint it, jt; /* tile counters */ + uint i, j; /* intra-tile counters */ + + dst_stride /= 4; /* convert from bytes to pixels */ + + tile_buf = align_malloc(tile_size * tile_size * 4, 16); + + /* loop over src tiles */ + for (it = 0; it < h_t; it++) { + for (jt = 0; jt < w_t; jt++) { + /* start of src tile: */ + const uint *tsrc = src + (it * w_t + jt) * tile_size2; + + twiddle_tile(tsrc, tile_buf); + tsrc = tile_buf; + + /* compute size of this tile (may be smaller than tile_size) */ + /* XXX note: a compiler bug was found here. That's why the code + * looks as it does. + */ + uint tile_width = w - jt * tile_size; + tile_width = MIN2(tile_width, tile_size); + uint tile_height = h - it * tile_size; + tile_height = MIN2(tile_height, tile_size); + + /* loop over texels in the tile */ + for (i = 0; i < tile_height; i++) { + for (j = 0; j < tile_width; j++) { + uint dsti = it * tile_size + i; + uint dstj = jt * tile_size + j; + ASSERT(dsti < h); + ASSERT(dstj < w); + dst[dsti * dst_stride + dstj] = tsrc[i * tile_size + j]; + } + } + } + } + + align_free(tile_buf); +} + + +/** + * Convert linear texture image data to tiled format for SPU usage. + */ +static void +cell_twiddle_texture(struct pipe_screen *screen, + struct pipe_surface *surface) +{ + struct cell_texture *ct = cell_texture(surface->texture); + const uint level = surface->level; + const uint texWidth = ct->base.width[level]; + const uint texHeight = ct->base.height[level]; + const uint bufWidth = align(texWidth, TILE_SIZE); + const uint bufHeight = align(texHeight, TILE_SIZE); + const void *map = screen->surface_map(screen, surface, PIPE_BUFFER_USAGE_CPU_READ); + const uint *src = (const uint *) map; + + switch (ct->base.format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_B8G8R8A8_UNORM: + case PIPE_FORMAT_S8Z24_UNORM: + { + int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1; + int offset = bufWidth * bufHeight * 4 * surface->face; + uint *dst; + + if (!ct->tiled_buffer[level]) { + /* allocate buffer for tiled data now */ + struct pipe_winsys *ws = screen->winsys; + uint bytes = bufWidth * bufHeight * 4 * numFaces; + ct->tiled_buffer[level] = + ws->buffer_create(ws, 16, PIPE_BUFFER_USAGE_PIXEL, bytes); + /* and map it */ + ct->tiled_mapped[level] = + ws->buffer_map(ws, ct->tiled_buffer[level], + PIPE_BUFFER_USAGE_GPU_READ); + } + dst = (uint *) ((ubyte *) ct->tiled_mapped[level] + offset); + + twiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst, + surface->stride, src); + } + break; + default: + printf("Cell: twiddle unsupported texture format %s\n", + pf_name(ct->base.format)); + } + + screen->surface_unmap(screen, surface); +} + + +/** + * Convert SPU tiled texture image data to linear format for app usage. + */ +static void +cell_untwiddle_texture(struct pipe_screen *screen, + struct pipe_surface *surface) +{ + struct cell_texture *ct = cell_texture(surface->texture); + const uint level = surface->level; + const uint texWidth = ct->base.width[level]; + const uint texHeight = ct->base.height[level]; + const void *map = screen->surface_map(screen, surface, PIPE_BUFFER_USAGE_CPU_READ); + const uint *src = (const uint *) ((const ubyte *) map + surface->offset); + + switch (ct->base.format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_B8G8R8A8_UNORM: + case PIPE_FORMAT_S8Z24_UNORM: + { + int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1; + int offset = surface->stride * texHeight * 4 * surface->face; + uint *dst; + + if (!ct->untiled_data[level]) { + ct->untiled_data[level] = + align_malloc(surface->stride * texHeight * 4 * numFaces, 16); + } + + dst = (uint *) ((ubyte *) ct->untiled_data[level] + offset); + + untwiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst, + surface->stride, src); + } + break; + default: + { + ct->untiled_data[level] = NULL; + printf("Cell: untwiddle unsupported texture format %s\n", + pf_name(ct->base.format)); + } + } + + screen->surface_unmap(screen, surface); +} + + +static struct pipe_surface * +cell_get_tex_surface(struct pipe_screen *screen, + struct pipe_texture *pt, + unsigned face, unsigned level, unsigned zslice, + unsigned usage) +{ + struct cell_texture *ct = cell_texture(pt); + struct pipe_surface *ps; + + ps = CALLOC_STRUCT(pipe_surface); + if (ps) { + ps->refcount = 1; + pipe_texture_reference(&ps->texture, pt); + ps->format = pt->format; + ps->block = pt->block; + ps->width = pt->width[level]; + ps->height = pt->height[level]; + ps->nblocksx = pt->nblocksx[level]; + ps->nblocksy = pt->nblocksy[level]; + ps->stride = ct->stride[level]; + ps->offset = ct->level_offset[level]; + ps->usage = usage; + + /* XXX may need to override usage flags (see sp_texture.c) */ + + pipe_texture_reference(&ps->texture, pt); + ps->face = face; + ps->level = level; + ps->zslice = zslice; + + if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) { + ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) * + ps->nblocksy * + ps->stride; + } + else { + assert(face == 0); + assert(zslice == 0); + } + + if (ps->usage & PIPE_BUFFER_USAGE_CPU_READ) { + /* convert from tiled to linear layout */ + cell_untwiddle_texture(screen, ps); + } + } + return ps; +} + + +static void +cell_tex_surface_release(struct pipe_screen *screen, + struct pipe_surface **s) +{ + struct cell_texture *ct = cell_texture((*s)->texture); + const uint level = (*s)->level; + struct pipe_surface *surf = *s; + + if ((surf->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level])) + { + align_free(ct->untiled_data[level]); + ct->untiled_data[level] = NULL; + } + + if ((ct->base.tex_usage & PIPE_TEXTURE_USAGE_SAMPLER) && + (surf->usage & PIPE_BUFFER_USAGE_CPU_WRITE)) { + /* convert from linear to tiled layout */ + cell_twiddle_texture(screen, surf); + } + + /* XXX if done rendering to teximage, re-tile */ + + if (--surf->refcount == 0) { + pipe_texture_reference(&surf->texture, NULL); + FREE(surf); + } + *s = NULL; +} + + +static void * +cell_surface_map(struct pipe_screen *screen, + struct pipe_surface *surface, + unsigned flags) +{ + ubyte *map; + struct cell_texture *ct = cell_texture(surface->texture); + const uint level = surface->level; + + assert(ct); + +#if 0 + if (flags & ~surface->usage) { + assert(0); + return NULL; + } +#endif + + map = pipe_buffer_map( screen, ct->buffer, flags ); + if (map == NULL) { + return NULL; + } + else { + if ((surface->usage & PIPE_BUFFER_USAGE_CPU_READ) && + (ct->untiled_data[level])) { + return (void *) ((ubyte *) ct->untiled_data[level] + surface->offset); + } + else { + return (void *) (map + surface->offset); + } + } +} + + +static void +cell_surface_unmap(struct pipe_screen *screen, + struct pipe_surface *surface) +{ + struct cell_texture *ct = cell_texture(surface->texture); + + assert(ct); + + pipe_buffer_unmap( screen, ct->buffer ); +} + + + +void +cell_init_screen_texture_funcs(struct pipe_screen *screen) +{ + screen->texture_create = cell_texture_create; + screen->texture_release = cell_texture_release; + + screen->get_tex_surface = cell_get_tex_surface; + screen->tex_surface_release = cell_tex_surface_release; + + screen->surface_map = cell_surface_map; + screen->surface_unmap = cell_surface_unmap; +} diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h new file mode 100644 index 00000000000..7018b0c9bf7 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_texture.h @@ -0,0 +1,72 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef CELL_TEXTURE_H +#define CELL_TEXTURE_H + + +struct cell_context; +struct pipe_texture; + + +/** + * Subclass of pipe_texture + */ +struct cell_texture +{ + struct pipe_texture base; + + unsigned long level_offset[CELL_MAX_TEXTURE_LEVELS]; + unsigned long stride[CELL_MAX_TEXTURE_LEVELS]; + + /* The data is held here: + */ + struct pipe_buffer *buffer; + unsigned long buffer_size; + + /** Texture data in tiled layout is held here */ + struct pipe_buffer *tiled_buffer[CELL_MAX_TEXTURE_LEVELS]; + /** Mapped, tiled texture data */ + void *tiled_mapped[CELL_MAX_TEXTURE_LEVELS]; + void *untiled_data[CELL_MAX_TEXTURE_LEVELS]; +}; + + +/** cast wrapper */ +static INLINE struct cell_texture * +cell_texture(struct pipe_texture *pt) +{ + return (struct cell_texture *) pt; +} + + + +extern void +cell_init_screen_texture_funcs(struct pipe_screen *screen); + + +#endif /* CELL_TEXTURE_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c new file mode 100644 index 00000000000..ab54e796895 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c @@ -0,0 +1,309 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Vertex buffer code. The draw module transforms vertices to window + * coords, etc. and emits the vertices into buffer supplied by this module. + * When a vertex buffer is full, or we flush, we'll send the vertex data + * to the SPUs. + * + * Authors + * Brian Paul + */ + + +#include "cell_batch.h" +#include "cell_context.h" +#include "cell_fence.h" +#include "cell_flush.h" +#include "cell_spu.h" +#include "cell_vbuf.h" +#include "draw/draw_vbuf.h" +#include "util/u_memory.h" + + +/** Allow vertex data to be inlined after RENDER command */ +#define ALLOW_INLINE_VERTS 1 + + +/** + * Subclass of vbuf_render because we need a cell_context pointer in + * a few places. + */ +struct cell_vbuf_render +{ + struct vbuf_render base; + struct cell_context *cell; + uint prim; /**< PIPE_PRIM_x */ + uint vertex_size; /**< in bytes */ + void *vertex_buffer; /**< just for debug, really */ + uint vertex_buf; /**< in [0, CELL_NUM_BUFFERS-1] */ +}; + + +/** cast wrapper */ +static struct cell_vbuf_render * +cell_vbuf_render(struct vbuf_render *vbr) +{ + return (struct cell_vbuf_render *) vbr; +} + + + +static const struct vertex_info * +cell_vbuf_get_vertex_info(struct vbuf_render *vbr) +{ + struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); + return &cvbr->cell->vertex_info; +} + + +static void * +cell_vbuf_allocate_vertices(struct vbuf_render *vbr, + ushort vertex_size, ushort nr_vertices) +{ + struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); + /*printf("Alloc verts %u * %u\n", vertex_size, nr_vertices);*/ + + assert(cvbr->vertex_buf == ~0); + cvbr->vertex_buf = cell_get_empty_buffer(cvbr->cell); + cvbr->vertex_buffer = cvbr->cell->buffer[cvbr->vertex_buf]; + cvbr->vertex_size = vertex_size; + return cvbr->vertex_buffer; +} + + +static void +cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices, + unsigned vertex_size, unsigned vertices_used) +{ + struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); + struct cell_context *cell = cvbr->cell; + + /* + printf("%s vertex_buf = %u count = %u\n", + __FUNCTION__, cvbr->vertex_buf, vertices_used); + */ + + /* Make sure texture buffers aren't released until we're done rendering + * with them. + */ + cell_add_fenced_textures(cell); + + /* Tell SPUs they can release the vert buf */ + if (cvbr->vertex_buf != ~0U) { + STATIC_ASSERT(sizeof(struct cell_command_release_verts) % 16 == 0); + struct cell_command_release_verts *release + = (struct cell_command_release_verts *) + cell_batch_alloc16(cell, sizeof(struct cell_command_release_verts)); + release->opcode[0] = CELL_CMD_RELEASE_VERTS; + release->vertex_buf = cvbr->vertex_buf; + } + + cvbr->vertex_buf = ~0; + cell_flush_int(cell, 0x0); + + assert(vertices == cvbr->vertex_buffer); + cvbr->vertex_buffer = NULL; +} + + + +static boolean +cell_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim) +{ + struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); + cvbr->prim = prim; + /*printf("cell_set_prim %u\n", prim);*/ + return TRUE; +} + + +static void +cell_vbuf_draw(struct vbuf_render *vbr, + const ushort *indices, + uint nr_indices) +{ + struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); + struct cell_context *cell = cvbr->cell; + float xmin, ymin, xmax, ymax; + uint i; + uint nr_vertices = 0, min_index = ~0; + const void *vertices = cvbr->vertex_buffer; + const uint vertex_size = cvbr->vertex_size; + + for (i = 0; i < nr_indices; i++) { + if (indices[i] > nr_vertices) + nr_vertices = indices[i]; + if (indices[i] < min_index) + min_index = indices[i]; + } + nr_vertices++; + +#if 0 + /*if (min_index > 0)*/ + printf("%s min_index = %u\n", __FUNCTION__, min_index); +#endif + +#if 0 + printf("cell_vbuf_draw() nr_indices = %u nr_verts = %u\n", + nr_indices, nr_vertices); + printf(" "); + for (i = 0; i < nr_indices; i += 3) { + printf("%u %u %u, ", indices[i+0], indices[i+1], indices[i+2]); + } + printf("\n"); +#elif 0 + printf("cell_vbuf_draw() nr_indices = %u nr_verts = %u indexes = [%u %u %u ...]\n", + nr_indices, nr_vertices, + indices[0], indices[1], indices[2]); + printf("ind space = %u, vert space = %u, space = %u\n", + nr_indices * 2, + nr_vertices * 4 * cell->vertex_info.size, + cell_batch_free_space(cell)); +#endif + + /* compute x/y bounding box */ + xmin = ymin = 1e50; + xmax = ymax = -1e50; + for (i = min_index; i < nr_vertices; i++) { + const float *v = (float *) ((ubyte *) vertices + i * vertex_size); + if (v[0] < xmin) + xmin = v[0]; + if (v[0] > xmax) + xmax = v[0]; + if (v[1] < ymin) + ymin = v[1]; + if (v[1] > ymax) + ymax = v[1]; + } +#if 0 + printf("PPU Bounds %g, %g .. %g, %g\n", xmin, ymin, xmax, ymax); + fflush(stdout); +#endif + + if (cvbr->prim != PIPE_PRIM_TRIANGLES) + return; /* only render tris for now */ + + /* build/insert batch RENDER command */ + { + const uint index_bytes = ROUNDUP16(nr_indices * 2); + const uint vertex_bytes = ROUNDUP16(nr_vertices * 4 * cell->vertex_info.size); + STATIC_ASSERT(sizeof(struct cell_command_render) % 16 == 0); + const uint batch_size = sizeof(struct cell_command_render) + index_bytes; + + struct cell_command_render *render + = (struct cell_command_render *) + cell_batch_alloc16(cell, batch_size); + + render->opcode[0] = CELL_CMD_RENDER; + render->prim_type = cvbr->prim; + + render->num_indexes = nr_indices; + render->min_index = min_index; + + /* append indices after render command */ + memcpy(render + 1, indices, nr_indices * 2); + + /* if there's room, append vertices after the indices, else leave + * vertices in the original/separate buffer. + */ + render->vertex_size = 4 * cell->vertex_info.size; + render->num_verts = nr_vertices; + if (ALLOW_INLINE_VERTS && + min_index == 0 && + vertex_bytes + 16 <= cell_batch_free_space(cell)) { + /* vertex data inlined, after indices, at 16-byte boundary */ + void *dst = cell_batch_alloc16(cell, vertex_bytes); + memcpy(dst, vertices, vertex_bytes); + render->inline_verts = TRUE; + render->vertex_buf = ~0; + } + else { + /* vertex data in separate buffer */ + render->inline_verts = FALSE; + ASSERT(cvbr->vertex_buf >= 0); + render->vertex_buf = cvbr->vertex_buf; + } + + render->xmin = xmin; + render->ymin = ymin; + render->xmax = xmax; + render->ymax = ymax; + } + +#if 0 + /* helpful for debug */ + cell_flush_int(cell, CELL_FLUSH_WAIT); +#endif +} + + +static void +cell_vbuf_destroy(struct vbuf_render *vbr) +{ + struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); + cvbr->cell->vbuf_render = NULL; + FREE(cvbr); +} + + +/** + * Initialize the post-transform vertex buffer information for the given + * context. + */ +void +cell_init_vbuf(struct cell_context *cell) +{ + assert(cell->draw); + + cell->vbuf_render = CALLOC_STRUCT(cell_vbuf_render); + + /* The max number of indexes is what can fix into a batch buffer, + * minus the render and release-verts commands. + */ + cell->vbuf_render->base.max_indices + = (CELL_BUFFER_SIZE + - sizeof(struct cell_command_render) + - sizeof(struct cell_command_release_verts)) + / sizeof(ushort); + cell->vbuf_render->base.max_vertex_buffer_bytes = CELL_BUFFER_SIZE; + + cell->vbuf_render->base.get_vertex_info = cell_vbuf_get_vertex_info; + cell->vbuf_render->base.allocate_vertices = cell_vbuf_allocate_vertices; + cell->vbuf_render->base.set_primitive = cell_vbuf_set_primitive; + cell->vbuf_render->base.draw = cell_vbuf_draw; + cell->vbuf_render->base.release_vertices = cell_vbuf_release_vertices; + cell->vbuf_render->base.destroy = cell_vbuf_destroy; + + cell->vbuf_render->cell = cell; +#if 1 + cell->vbuf_render->vertex_buf = ~0; +#endif + + cell->vbuf = draw_vbuf_stage(cell->draw, &cell->vbuf_render->base); +} diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.h b/src/gallium/drivers/cell/ppu/cell_vbuf.h new file mode 100644 index 00000000000..d265cbf7701 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_vbuf.h @@ -0,0 +1,38 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef CELL_VBUF_H +#define CELL_VBUF_H + + +struct cell_context; + +extern void +cell_init_vbuf(struct cell_context *cell); + + +#endif /* CELL_VBUF_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c new file mode 100644 index 00000000000..9cba537d9eb --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c @@ -0,0 +1,346 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <inttypes.h> +#include "pipe/p_defines.h" +#include "pipe/p_context.h" +#include "pipe/p_format.h" + +#include "../auxiliary/draw/draw_context.h" +#include "../auxiliary/draw/draw_private.h" + +#include "cell_context.h" +#include "rtasm/rtasm_ppc_spe.h" + + +/** + * Emit a 4x4 matrix transpose operation + * + * \param p Function that the transpose operation is to be appended to + * \param row0 Register containing row 0 of the source matrix + * \param row1 Register containing row 1 of the source matrix + * \param row2 Register containing row 2 of the source matrix + * \param row3 Register containing row 3 of the source matrix + * \param dest_ptr Register containing the address of the destination matrix + * \param shuf_ptr Register containing the address of the shuffled data + * \param count Number of colums to actually be written to the destination + * + * \note + * This function assumes that the registers named by \c row0, \c row1, + * \c row2, and \c row3 are scratch and can be modified by the generated code. + * Furthermore, these registers will be released, via calls to + * \c release_register, by this function. + * + * \note + * This function requires that four temporary are available on entry. + */ +static void +emit_matrix_transpose(struct spe_function *p, + unsigned row0, unsigned row1, unsigned row2, + unsigned row3, unsigned dest_ptr, + unsigned shuf_ptr, unsigned count) +{ + int shuf_hi = spe_allocate_available_register(p); + int shuf_lo = spe_allocate_available_register(p); + int t1 = spe_allocate_available_register(p); + int t2 = spe_allocate_available_register(p); + int t3; + int t4; + int col0; + int col1; + int col2; + int col3; + + + spe_lqd(p, shuf_hi, shuf_ptr, 3*16); + spe_lqd(p, shuf_lo, shuf_ptr, 4*16); + spe_shufb(p, t1, row0, row2, shuf_hi); + spe_shufb(p, t2, row0, row2, shuf_lo); + + + /* row0 and row2 are now no longer needed. Re-use those registers as + * temporaries. + */ + t3 = row0; + t4 = row2; + + spe_shufb(p, t3, row1, row3, shuf_hi); + spe_shufb(p, t4, row1, row3, shuf_lo); + + + /* row1 and row3 are now no longer needed. Re-use those registers as + * temporaries. + */ + col0 = row1; + col1 = row3; + + spe_shufb(p, col0, t1, t3, shuf_hi); + if (count > 1) { + spe_shufb(p, col1, t1, t3, shuf_lo); + } + + /* t1 and t3 are now no longer needed. Re-use those registers as + * temporaries. + */ + col2 = t1; + col3 = t3; + + if (count > 2) { + spe_shufb(p, col2, t2, t4, shuf_hi); + } + + if (count > 3) { + spe_shufb(p, col3, t2, t4, shuf_lo); + } + + + /* Store the results. Remember that the stqd instruction is encoded using + * the qword offset (stand-alone assemblers to the byte-offset to + * qword-offset conversion for you), so the byte-offset needs be divided by + * 16. + */ + switch (count) { + case 4: + spe_stqd(p, col3, dest_ptr, 3 * 16); + case 3: + spe_stqd(p, col2, dest_ptr, 2 * 16); + case 2: + spe_stqd(p, col1, dest_ptr, 1 * 16); + case 1: + spe_stqd(p, col0, dest_ptr, 0 * 16); + } + + + /* Release all of the temporary registers used. + */ + spe_release_register(p, col0); + spe_release_register(p, col1); + spe_release_register(p, col2); + spe_release_register(p, col3); + spe_release_register(p, shuf_hi); + spe_release_register(p, shuf_lo); + spe_release_register(p, t2); + spe_release_register(p, t4); +} + + +#if 0 +/* This appears to not be used currently */ +static void +emit_fetch(struct spe_function *p, + unsigned in_ptr, unsigned *offset, + unsigned out_ptr, unsigned shuf_ptr, + enum pipe_format format) +{ + const unsigned count = (pf_size_x(format) != 0) + (pf_size_y(format) != 0) + + (pf_size_z(format) != 0) + (pf_size_w(format) != 0); + const unsigned type = pf_type(format); + const unsigned bytes = pf_size_x(format); + + int v0 = spe_allocate_available_register(p); + int v1 = spe_allocate_available_register(p); + int v2 = spe_allocate_available_register(p); + int v3 = spe_allocate_available_register(p); + int tmp = spe_allocate_available_register(p); + int float_zero = -1; + int float_one = -1; + float scale_signed = 0.0; + float scale_unsigned = 0.0; + + spe_lqd(p, v0, in_ptr, (0 + offset[0]) * 16); + spe_lqd(p, v1, in_ptr, (1 + offset[0]) * 16); + spe_lqd(p, v2, in_ptr, (2 + offset[0]) * 16); + spe_lqd(p, v3, in_ptr, (3 + offset[0]) * 16); + offset[0] += 4; + + switch (bytes) { + case 1: + scale_signed = 1.0f / 127.0f; + scale_unsigned = 1.0f / 255.0f; + spe_lqd(p, tmp, shuf_ptr, 1 * 16); + spe_shufb(p, v0, v0, v0, tmp); + spe_shufb(p, v1, v1, v1, tmp); + spe_shufb(p, v2, v2, v2, tmp); + spe_shufb(p, v3, v3, v3, tmp); + break; + case 2: + scale_signed = 1.0f / 32767.0f; + scale_unsigned = 1.0f / 65535.0f; + spe_lqd(p, tmp, shuf_ptr, 2 * 16); + spe_shufb(p, v0, v0, v0, tmp); + spe_shufb(p, v1, v1, v1, tmp); + spe_shufb(p, v2, v2, v2, tmp); + spe_shufb(p, v3, v3, v3, tmp); + break; + case 4: + scale_signed = 1.0f / 2147483647.0f; + scale_unsigned = 1.0f / 4294967295.0f; + break; + default: + assert(0); + break; + } + + switch (type) { + case PIPE_FORMAT_TYPE_FLOAT: + break; + case PIPE_FORMAT_TYPE_UNORM: + spe_ilhu(p, tmp, ((unsigned) scale_unsigned) >> 16); + spe_iohl(p, tmp, ((unsigned) scale_unsigned) & 0x0ffff); + spe_cuflt(p, v0, v0, 0); + spe_fm(p, v0, v0, tmp); + break; + case PIPE_FORMAT_TYPE_SNORM: + spe_ilhu(p, tmp, ((unsigned) scale_signed) >> 16); + spe_iohl(p, tmp, ((unsigned) scale_signed) & 0x0ffff); + spe_csflt(p, v0, v0, 0); + spe_fm(p, v0, v0, tmp); + break; + case PIPE_FORMAT_TYPE_USCALED: + spe_cuflt(p, v0, v0, 0); + break; + case PIPE_FORMAT_TYPE_SSCALED: + spe_csflt(p, v0, v0, 0); + break; + } + + + if (count < 4) { + float_one = spe_allocate_available_register(p); + spe_il(p, float_one, 1); + spe_cuflt(p, float_one, float_one, 0); + + if (count < 3) { + float_zero = spe_allocate_available_register(p); + spe_il(p, float_zero, 0); + } + } + + spe_release_register(p, tmp); + + emit_matrix_transpose(p, v0, v1, v2, v3, out_ptr, shuf_ptr, count); + + switch (count) { + case 1: + spe_stqd(p, float_zero, out_ptr, 1 * 16); + case 2: + spe_stqd(p, float_zero, out_ptr, 2 * 16); + case 3: + spe_stqd(p, float_one, out_ptr, 3 * 16); + } + + if (float_zero != -1) { + spe_release_register(p, float_zero); + } + + if (float_one != -1) { + spe_release_register(p, float_one); + } +} +#endif + + +void cell_update_vertex_fetch(struct draw_context *draw) +{ +#if 0 + struct cell_context *const cell = + (struct cell_context *) draw->driver_private; + struct spe_function *p = &cell->attrib_fetch; + unsigned function_index[PIPE_MAX_ATTRIBS]; + unsigned unique_attr_formats; + int out_ptr; + int in_ptr; + int shuf_ptr; + unsigned i; + unsigned j; + + + /* Determine how many unique input attribute formats there are. At the + * same time, store the index of the lowest numbered attribute that has + * the same format as any non-unique format. + */ + unique_attr_formats = 1; + function_index[0] = 0; + for (i = 1; i < draw->vertex_fetch.nr_attrs; i++) { + const enum pipe_format curr_fmt = draw->vertex_element[i].src_format; + + for (j = 0; j < i; j++) { + if (curr_fmt == draw->vertex_element[j].src_format) { + break; + } + } + + if (j == i) { + unique_attr_formats++; + } + + function_index[i] = j; + } + + + /* Each fetch function can be a maximum of 34 instructions (note: this is + * actually a slight over-estimate). + */ + spe_init_func(p, 34 * SPE_INST_SIZE * unique_attr_formats); + + + /* Allocate registers for the function's input parameters. + */ + out_ptr = spe_allocate_register(p, 3); + in_ptr = spe_allocate_register(p, 4); + shuf_ptr = spe_allocate_register(p, 5); + + + /* Generate code for the individual attribute fetch functions. + */ + for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) { + unsigned offset; + + if (function_index[i] == i) { + cell->attrib_fetch_offsets[i] = (unsigned) ((void *) p->csr + - (void *) p->store); + + offset = 0; + emit_fetch(p, in_ptr, &offset, out_ptr, shuf_ptr, + draw->vertex_element[i].src_format); + spe_bi(p, 0, 0, 0); + + /* Round up to the next 16-byte boundary. + */ + if ((((unsigned) p->store) & 0x0f) != 0) { + const unsigned align = ((unsigned) p->store) & 0x0f; + p->store = (uint32_t *) (((void *) p->store) + align); + } + } else { + /* Use the same function entry-point as a previously seen attribute + * with the same format. + */ + cell->attrib_fetch_offsets[i] = + cell->attrib_fetch_offsets[function_index[i]]; + } + } +#else + assert(0); +#endif +} diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c new file mode 100644 index 00000000000..403cf6d50fc --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c @@ -0,0 +1,146 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * \file cell_vertex_shader.c + * Vertex shader interface routines for Cell. + * + * \author Ian Romanick <[email protected]> + */ + +#include "pipe/p_defines.h" +#include "pipe/p_context.h" +#include "pipe/internal/p_winsys_screen.h" +#include "util/u_math.h" + +#include "cell_context.h" +#include "cell_draw_arrays.h" +#include "cell_flush.h" +#include "cell_spu.h" +#include "cell_batch.h" + +#include "cell/common.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" + +/** + * Run the vertex shader on all vertices in the vertex queue. + * Called by the draw module when the vertx cache needs to be flushed. + */ +void +cell_vertex_shader_queue_flush(struct draw_context *draw) +{ +#if 0 + struct cell_context *const cell = + (struct cell_context *) draw->driver_private; + struct cell_command_vs *const vs = &cell_global.command[0].vs; + uint64_t *batch; + struct cell_array_info *array_info; + unsigned i, j; + struct cell_attribute_fetch_code *cf; + + assert(draw->vs.queue_nr != 0); + + /* XXX: do this on statechange: + */ + draw_update_vertex_fetch(draw); + cell_update_vertex_fetch(draw); + + + batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*cf)); + batch[0] = CELL_CMD_STATE_ATTRIB_FETCH; + cf = (struct cell_attribute_fetch_code *) (&batch[1]); + cf->base = (uint64_t) cell->attrib_fetch.store; + cf->size = ROUNDUP16((unsigned)((void *) cell->attrib_fetch.csr + - (void *) cell->attrib_fetch.store)); + + + for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) { + const enum pipe_format format = draw->vertex_element[i].src_format; + const unsigned count = ((pf_size_x(format) != 0) + + (pf_size_y(format) != 0) + + (pf_size_z(format) != 0) + + (pf_size_w(format) != 0)); + const unsigned size = pf_size_x(format) * count; + + batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*array_info)); + + batch[0] = CELL_CMD_STATE_VS_ARRAY_INFO; + + array_info = (struct cell_array_info *) &batch[1]; + assert(draw->vertex_fetch.src_ptr[i] != NULL); + array_info->base = (uintptr_t) draw->vertex_fetch.src_ptr[i]; + array_info->attr = i; + array_info->pitch = draw->vertex_fetch.pitch[i]; + array_info->size = size; + array_info->function_offset = cell->attrib_fetch_offsets[i]; + } + + batch = cell_batch_alloc(cell, sizeof(batch[0]) + + sizeof(struct pipe_viewport_state)); + batch[0] = CELL_CMD_STATE_VIEWPORT; + (void) memcpy(&batch[1], &draw->viewport, + sizeof(struct pipe_viewport_state)); + + { + uint64_t uniforms = (uintptr_t) draw->user.constants; + + batch = cell_batch_alloc(cell, 2 *sizeof(batch[0])); + batch[0] = CELL_CMD_STATE_UNIFORMS; + batch[1] = uniforms; + } + + cell_batch_flush(cell); + + vs->opcode = CELL_CMD_VS_EXECUTE; + vs->nr_attrs = draw->vertex_fetch.nr_attrs; + + (void) memcpy(vs->plane, draw->plane, sizeof(draw->plane)); + vs->nr_planes = draw->nr_planes; + + for (i = 0; i < draw->vs.queue_nr; i += SPU_VERTS_PER_BATCH) { + const unsigned n = MIN2(SPU_VERTS_PER_BATCH, draw->vs.queue_nr - i); + + for (j = 0; j < n; j++) { + vs->elts[j] = draw->vs.queue[i + j].elt; + vs->vOut[j] = (uintptr_t) draw->vs.queue[i + j].vertex; + } + + for (/* empty */; j < SPU_VERTS_PER_BATCH; j++) { + vs->elts[j] = vs->elts[0]; + vs->vOut[j] = (uintptr_t) draw->vs.queue[i + j].vertex; + } + + vs->num_elts = n; + send_mbox_message(cell_global.spe_contexts[0], CELL_CMD_VS_EXECUTE); + + cell_flush_int(cell, CELL_FLUSH_WAIT); + } + + draw->vs.post_nr = draw->vs.queue_nr; + draw->vs.queue_nr = 0; +#else + assert(0); +#endif +} diff --git a/src/gallium/drivers/cell/ppu/cell_winsys.h b/src/gallium/drivers/cell/ppu/cell_winsys.h new file mode 100644 index 00000000000..ae2af5696b5 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_winsys.h @@ -0,0 +1,50 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_WINSYS_H +#define CELL_WINSYS_H + +#include "pipe/p_compiler.h" + + +/** + * Very simple winsys at this time. + * Will probably eventually add SPU control info. + */ +struct cell_winsys +{ + uint preferredFormat; +}; + + +extern struct cell_winsys * +cell_get_winsys(uint format); + + + +#endif diff --git a/src/gallium/drivers/cell/spu/.gitignore b/src/gallium/drivers/cell/spu/.gitignore new file mode 100644 index 00000000000..2be9a2d3242 --- /dev/null +++ b/src/gallium/drivers/cell/spu/.gitignore @@ -0,0 +1 @@ +g3d_spu diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile new file mode 100644 index 00000000000..116453b79c5 --- /dev/null +++ b/src/gallium/drivers/cell/spu/Makefile @@ -0,0 +1,82 @@ +# Gallium3D Cell driver: SPU code + +# This makefile builds the g3d_spu.a file that's linked into the +# PPU code/library. + + +TOP = ../../../../.. +include $(TOP)/configs/current + + +PROG = g3d + +PROG_SPU = $(PROG)_spu +PROG_SPU_A = $(PROG)_spu.a +PROG_SPU_EMBED_O = $(PROG)_spu-embed.o + + +SOURCES = \ + spu_command.c \ + spu_dcache.c \ + spu_funcs.c \ + spu_main.c \ + spu_per_fragment_op.c \ + spu_render.c \ + spu_texture.c \ + spu_tile.c \ + spu_tri.c + +OLD_SOURCES = \ + spu_exec.c \ + spu_util.c \ + spu_vertex_fetch.c \ + spu_vertex_shader.c + + +SPU_OBJECTS = $(SOURCES:.c=.o) \ + +SPU_ASM_OUT = $(SOURCES:.c=.s) \ + +INCLUDE_DIRS = \ + -I$(TOP)/src/mesa \ + -I$(TOP)/src/gallium/include \ + -I$(TOP)/src/gallium/auxiliary \ + -I$(TOP)/src/gallium/drivers + + +.c.o: + $(SPU_CC) $(SPU_CFLAGS) -c $< + +.c.s: + $(SPU_CC) $(SPU_CFLAGS) -O3 -S $< + + +# The .a file will be linked into the main/PPU executable +default: $(PROG_SPU_A) + +$(PROG_SPU_A): $(PROG_SPU_EMBED_O) + $(SPU_AR) $(SPU_AR_FLAGS) $(PROG_SPU_A) $(PROG_SPU_EMBED_O) + +$(PROG_SPU_EMBED_O): $(PROG_SPU) + $(SPU_EMBED) $(SPU_EMBED_FLAGS) $(PROG_SPU) $(PROG_SPU) $(PROG_SPU_EMBED_O) + +$(PROG_SPU): $(SPU_OBJECTS) + $(SPU_CC) -o $(PROG_SPU) $(SPU_OBJECTS) $(SPU_LFLAGS) + + + +asmfiles: $(SPU_ASM_OUT) + + +clean: + rm -f *~ *.o *.a *.d *.s $(PROG_SPU) + + + +depend: $(SOURCES) + rm -f depend + touch depend + $(MKDEP) $(MKDEP_OPTIONS) $(INCLUDE_DIRS) $(SOURCES) 2> /dev/null + +include depend + diff --git a/src/gallium/drivers/cell/spu/spu_colorpack.h b/src/gallium/drivers/cell/spu/spu_colorpack.h new file mode 100644 index 00000000000..d7ce0055248 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_colorpack.h @@ -0,0 +1,145 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + + +#ifndef SPU_COLORPACK_H +#define SPU_COLORPACK_H + + +#include <transpose_matrix4x4.h> +#include <spu_intrinsics.h> + + +static INLINE unsigned int +spu_pack_R8G8B8A8(vector float rgba) +{ + vector unsigned int out = spu_convtu(rgba, 32); + + out = spu_shuffle(out, out, ((vector unsigned char) { + 0, 4, 8, 12, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }) ); + + return spu_extract(out, 0); +} + + +static INLINE unsigned int +spu_pack_A8R8G8B8(vector float rgba) +{ + vector unsigned int out = spu_convtu(rgba, 32); + out = spu_shuffle(out, out, ((vector unsigned char) { + 12, 0, 4, 8, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}) ); + return spu_extract(out, 0); +} + + +static INLINE unsigned int +spu_pack_B8G8R8A8(vector float rgba) +{ + vector unsigned int out = spu_convtu(rgba, 32); + out = spu_shuffle(out, out, ((vector unsigned char) { + 8, 4, 0, 12, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}) ); + return spu_extract(out, 0); +} + + +static INLINE unsigned int +spu_pack_color_shuffle(vector float rgba, vector unsigned char shuffle) +{ + vector unsigned int out = spu_convtu(rgba, 32); + out = spu_shuffle(out, out, shuffle); + return spu_extract(out, 0); +} + + +static INLINE vector float +spu_unpack_B8G8R8A8(uint color) +{ + vector unsigned int color_u4 = spu_splats(color); + color_u4 = spu_shuffle(color_u4, color_u4, + ((vector unsigned char) { + 2, 2, 2, 2, + 1, 1, 1, 1, + 0, 0, 0, 0, + 3, 3, 3, 3}) ); + return spu_convtf(color_u4, 32); +} + + +static INLINE vector float +spu_unpack_A8R8G8B8(uint color) +{ + vector unsigned int color_u4 = spu_splats(color); + color_u4 = spu_shuffle(color_u4, color_u4, + ((vector unsigned char) { + 1, 1, 1, 1, + 2, 2, 2, 2, + 3, 3, 3, 3, + 0, 0, 0, 0}) ); + return spu_convtf(color_u4, 32); +} + + +/** + * \param color_in - array of 32-bit packed ARGB colors + * \param color_out - returns float colors in RRRR, GGGG, BBBB, AAAA order + */ +static INLINE void +spu_unpack_A8R8G8B8_transpose4(const vector unsigned int color_in[4], + vector float color_out[4]) +{ + vector unsigned int c0; + + c0 = spu_shuffle(color_in[0], color_in[0], + ((vector unsigned char) { + 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0}) ); + color_out[0] = spu_convtf(c0, 32); + + c0 = spu_shuffle(color_in[1], color_in[1], + ((vector unsigned char) { + 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0}) ); + color_out[1] = spu_convtf(c0, 32); + + c0 = spu_shuffle(color_in[2], color_in[2], + ((vector unsigned char) { + 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0}) ); + color_out[2] = spu_convtf(c0, 32); + + c0 = spu_shuffle(color_in[3], color_in[3], + ((vector unsigned char) { + 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0}) ); + color_out[3] = spu_convtf(c0, 32); + + _transpose_matrix4x4(color_out, color_out); +} + + + +#endif /* SPU_COLORPACK_H */ diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c new file mode 100644 index 00000000000..5c0179d9546 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -0,0 +1,815 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * SPU command processing code + */ + + +#include <stdio.h> +#include <libmisc.h> + +#include "pipe/p_defines.h" + +#include "spu_command.h" +#include "spu_main.h" +#include "spu_render.h" +#include "spu_per_fragment_op.h" +#include "spu_texture.h" +#include "spu_tile.h" +#include "spu_vertex_shader.h" +#include "spu_dcache.h" +#include "cell/common.h" + + +struct spu_vs_context draw; + + +/** + * Buffers containing dynamically generated SPU code: + */ +static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS] + ALIGN16_ATTRIB; + + + +static INLINE int +align(int value, int alignment) +{ + return (value + alignment - 1) & ~(alignment - 1); +} + + + +/** + * Tell the PPU that this SPU has finished copying a buffer to + * local store and that it may be reused by the PPU. + * This is done by writting a 16-byte batch-buffer-status block back into + * main memory (in cell_context->buffer_status[]). + */ +static void +release_buffer(uint buffer) +{ + /* Evidently, using less than a 16-byte status doesn't work reliably */ + static const vector unsigned int status = {CELL_BUFFER_STATUS_FREE, + CELL_BUFFER_STATUS_FREE, + CELL_BUFFER_STATUS_FREE, + CELL_BUFFER_STATUS_FREE}; + const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer); + uint *dst = spu.init.buffer_status + index; + + ASSERT(buffer < CELL_NUM_BUFFERS); + + mfc_put((void *) &status, /* src in local memory */ + (unsigned int) dst, /* dst in main memory */ + sizeof(status), /* size */ + TAG_MISC, /* tag is unimportant */ + 0, /* tid */ + 0 /* rid */); +} + + +/** + * Write CELL_FENCE_SIGNALLED back to the fence status qword in main memory. + * There's a qword of status per SPU. + */ +static void +cmd_fence(struct cell_command_fence *fence_cmd) +{ + static const vector unsigned int status = {CELL_FENCE_SIGNALLED, + CELL_FENCE_SIGNALLED, + CELL_FENCE_SIGNALLED, + CELL_FENCE_SIGNALLED}; + uint *dst = (uint *) fence_cmd->fence; + dst += 4 * spu.init.id; /* main store/memory address, not local store */ + ASSERT_ALIGN16(dst); + mfc_put((void *) &status, /* src in local memory */ + (unsigned int) dst, /* dst in main memory */ + sizeof(status), /* size */ + TAG_FENCE, /* tag */ + 0, /* tid */ + 0 /* rid */); +} + + +static void +cmd_clear_surface(const struct cell_command_clear_surface *clear) +{ + D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value); + + if (clear->surface == 0) { + spu.fb.color_clear_value = clear->value; + if (spu.init.debug_flags & CELL_DEBUG_CHECKER) { + uint x = (spu.init.id << 4) | (spu.init.id << 12) | + (spu.init.id << 20) | (spu.init.id << 28); + spu.fb.color_clear_value ^= x; + } + } + else { + spu.fb.depth_clear_value = clear->value; + } + +#define CLEAR_OPT 1 +#if CLEAR_OPT + + /* Simply set all tiles' status to CLEAR. + * When we actually begin rendering into a tile, we'll initialize it to + * the clear value. If any tiles go untouched during the frame, + * really_clear_tiles() will set them to the clear value. + */ + if (clear->surface == 0) { + memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status)); + } + else { + memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status)); + } + +#else + + /* + * This path clears the whole framebuffer to the clear color right now. + */ + + /* + printf("SPU: %s num=%d w=%d h=%d\n", + __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles); + */ + + /* init a single tile to the clear value */ + if (clear->surface == 0) { + clear_c_tile(&spu.ctile); + } + else { + clear_z_tile(&spu.ztile); + } + + /* walk over my tiles, writing the 'clear' tile's data */ + { + const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles; + uint i; + for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { + uint tx = i % spu.fb.width_tiles; + uint ty = i / spu.fb.width_tiles; + if (clear->surface == 0) + put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0); + else + put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1); + } + } + + if (spu.init.debug_flags & CELL_DEBUG_SYNC) { + wait_on_mask(1 << TAG_SURFACE_CLEAR); + } + +#endif /* CLEAR_OPT */ + + D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF done\n"); +} + + +static void +cmd_release_verts(const struct cell_command_release_verts *release) +{ + D_PRINTF(CELL_DEBUG_CMD, "RELEASE VERTS %u\n", release->vertex_buf); + ASSERT(release->vertex_buf != ~0U); + release_buffer(release->vertex_buf); +} + + +/** + * Process a CELL_CMD_STATE_FRAGMENT_OPS command. + * This involves installing new fragment ops SPU code. + * If this function is never called, we'll use a regular C fallback function + * for fragment processing. + */ +static void +cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) +{ + D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n"); + + /* Copy state info (for fallback case only - this will eventually + * go away when the fallback case goes away) + */ + memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa)); + memcpy(&spu.blend, &fops->blend, sizeof(fops->blend)); + memcpy(&spu.blend_color, &fops->blend_color, sizeof(fops->blend_color)); + + /* Make sure the SPU knows which buffers it's expected to read when + * it's told to pull tiles. + */ + spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled); + + /* If we're forcing the fallback code to be used (for debug purposes), + * install that. Otherwise install the incoming SPU code. + */ + if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) != 0) { + static unsigned int warned = 0; + if (!warned) { + fprintf(stderr, "Cell Warning: using fallback per-fragment code\n"); + warned = 1; + } + /* The following two lines aren't really necessary if you + * know the debug flags won't change during a run, and if you + * know that the function pointers are initialized correctly. + * We set them here to allow a person to change the debug + * flags during a run (from inside a debugger). + */ + spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops; + spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops; + return; + } + + /* Make sure the SPU code buffer is large enough to hold the incoming code. + * Note that we *don't* use align_malloc() and align_free(), because + * those utility functions are *not* available in SPU code. + * */ + if (spu.fragment_ops_code_size < fops->total_code_size) { + if (spu.fragment_ops_code != NULL) { + free(spu.fragment_ops_code); + } + spu.fragment_ops_code_size = fops->total_code_size; + spu.fragment_ops_code = malloc(fops->total_code_size); + if (spu.fragment_ops_code == NULL) { + /* Whoops. */ + fprintf(stderr, "CELL Warning: failed to allocate fragment ops code (%d bytes) - using fallback\n", fops->total_code_size); + spu.fragment_ops_code = NULL; + spu.fragment_ops_code_size = 0; + spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops; + spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops; + return; + } + } + + /* Copy the SPU code from the command buffer to the spu buffer */ + memcpy(spu.fragment_ops_code, fops->code, fops->total_code_size); + + /* Set the pointers for the front-facing and back-facing fragments + * to the specified offsets within the code. Note that if the + * front-facing and back-facing code are the same, they'll have + * the same offset. + */ + spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->front_code_index]; + spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->back_code_index]; +} + +static void +cmd_state_fragment_program(const struct cell_command_fragment_program *fp) +{ + D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_PROGRAM\n"); + /* Copy SPU code from batch buffer to spu buffer */ + memcpy(spu.fragment_program_code, fp->code, + SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4); +#if 01 + /* Point function pointer at new code */ + spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code; +#endif +} + + +static uint +cmd_state_fs_constants(const qword *buffer, uint pos) +{ + const uint num_const = spu_extract((vector unsigned int)buffer[pos+1], 0); + const float *constants = (const float *) &buffer[pos+2]; + uint i; + + D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FS_CONSTANTS (%u)\n", num_const); + + /* Expand each float to float[4] for SOA execution */ + for (i = 0; i < num_const; i++) { + D_PRINTF(CELL_DEBUG_CMD, " const[%u] = %f\n", i, constants[i]); + spu.constants[i] = spu_splats(constants[i]); + } + + /* return new buffer pos (in 16-byte words) */ + return pos + 2 + (ROUNDUP16(num_const * sizeof(float)) / 16); +} + + +static void +cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) +{ + D_PRINTF(CELL_DEBUG_CMD, "FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n", + cmd->width, + cmd->height, + cmd->color_start, + cmd->color_format, + cmd->depth_format); + + ASSERT_ALIGN16(cmd->color_start); + ASSERT_ALIGN16(cmd->depth_start); + + spu.fb.color_start = cmd->color_start; + spu.fb.depth_start = cmd->depth_start; + spu.fb.color_format = cmd->color_format; + spu.fb.depth_format = cmd->depth_format; + spu.fb.width = cmd->width; + spu.fb.height = cmd->height; + spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE; + spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE; + + switch (spu.fb.depth_format) { + case PIPE_FORMAT_Z32_UNORM: + spu.fb.zsize = 4; + spu.fb.zscale = (float) 0xffffffffu; + break; + case PIPE_FORMAT_Z24S8_UNORM: + case PIPE_FORMAT_S8Z24_UNORM: + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: + spu.fb.zsize = 4; + spu.fb.zscale = (float) 0x00ffffffu; + break; + case PIPE_FORMAT_Z16_UNORM: + spu.fb.zsize = 2; + spu.fb.zscale = (float) 0xffffu; + break; + default: + spu.fb.zsize = 0; + break; + } +} + + +/** + * Tex texture mask_s/t and scale_s/t fields depend on the texture size and + * sampler wrap modes. + */ +static void +update_tex_masks(struct spu_texture *texture, + const struct pipe_sampler_state *sampler) +{ + uint i; + + for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) { + int width = texture->level[i].width; + int height = texture->level[i].height; + + if (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT) + texture->level[i].mask_s = spu_splats(width - 1); + else + texture->level[i].mask_s = spu_splats(~0); + + if (sampler->wrap_t == PIPE_TEX_WRAP_REPEAT) + texture->level[i].mask_t = spu_splats(height - 1); + else + texture->level[i].mask_t = spu_splats(~0); + + if (sampler->normalized_coords) { + texture->level[i].scale_s = spu_splats((float) width); + texture->level[i].scale_t = spu_splats((float) height); + } + else { + texture->level[i].scale_s = spu_splats(1.0f); + texture->level[i].scale_t = spu_splats(1.0f); + } + } +} + + +static void +cmd_state_sampler(const struct cell_command_sampler *sampler) +{ + uint unit = sampler->unit; + + D_PRINTF(CELL_DEBUG_CMD, "SAMPLER [%u]\n", unit); + + spu.sampler[unit] = sampler->state; + + switch (spu.sampler[unit].min_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + spu.min_sample_texture_2d[unit] = sample_texture_2d_bilinear; + break; + case PIPE_TEX_FILTER_ANISO: + /* fall-through, for now */ + case PIPE_TEX_FILTER_NEAREST: + spu.min_sample_texture_2d[unit] = sample_texture_2d_nearest; + break; + default: + ASSERT(0); + } + + switch (spu.sampler[sampler->unit].mag_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + spu.mag_sample_texture_2d[unit] = sample_texture_2d_bilinear; + break; + case PIPE_TEX_FILTER_ANISO: + /* fall-through, for now */ + case PIPE_TEX_FILTER_NEAREST: + spu.mag_sample_texture_2d[unit] = sample_texture_2d_nearest; + break; + default: + ASSERT(0); + } + + switch (spu.sampler[sampler->unit].min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + case PIPE_TEX_MIPFILTER_LINEAR: + spu.sample_texture_2d[unit] = sample_texture_2d_lod; + break; + case PIPE_TEX_MIPFILTER_NONE: + spu.sample_texture_2d[unit] = spu.mag_sample_texture_2d[unit]; + break; + default: + ASSERT(0); + } + + update_tex_masks(&spu.texture[unit], &spu.sampler[unit]); +} + + +static void +cmd_state_texture(const struct cell_command_texture *texture) +{ + const uint unit = texture->unit; + uint i; + + D_PRINTF(CELL_DEBUG_CMD, "TEXTURE [%u]\n", texture->unit); + + spu.texture[unit].max_level = 0; + spu.texture[unit].target = texture->target; + + for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) { + uint width = texture->width[i]; + uint height = texture->height[i]; + uint depth = texture->depth[i]; + + D_PRINTF(CELL_DEBUG_CMD, " LEVEL %u: at %p size[0] %u x %u\n", i, + texture->start[i], texture->width[i], texture->height[i]); + + spu.texture[unit].level[i].start = texture->start[i]; + spu.texture[unit].level[i].width = width; + spu.texture[unit].level[i].height = height; + spu.texture[unit].level[i].depth = depth; + + spu.texture[unit].level[i].tiles_per_row = + (width + TILE_SIZE - 1) / TILE_SIZE; + + spu.texture[unit].level[i].bytes_per_image = + 4 * align(width, TILE_SIZE) * align(height, TILE_SIZE) * depth; + + spu.texture[unit].level[i].max_s = spu_splats((int) width - 1); + spu.texture[unit].level[i].max_t = spu_splats((int) height - 1); + + if (texture->start[i]) + spu.texture[unit].max_level = i; + } + + update_tex_masks(&spu.texture[unit], &spu.sampler[unit]); +} + + +static void +cmd_state_vertex_info(const struct vertex_info *vinfo) +{ + D_PRINTF(CELL_DEBUG_CMD, "VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs); + ASSERT(vinfo->num_attribs >= 1); + ASSERT(vinfo->num_attribs <= 8); + memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo)); +} + + +static void +cmd_state_vs_array_info(const struct cell_array_info *vs_info) +{ + const unsigned attr = vs_info->attr; + + ASSERT(attr < PIPE_MAX_ATTRIBS); + draw.vertex_fetch.src_ptr[attr] = vs_info->base; + draw.vertex_fetch.pitch[attr] = vs_info->pitch; + draw.vertex_fetch.size[attr] = vs_info->size; + draw.vertex_fetch.code_offset[attr] = vs_info->function_offset; + draw.vertex_fetch.dirty = 1; +} + + +static void +cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code) +{ + mfc_get(attribute_fetch_code_buffer, + (unsigned int) code->base, /* src */ + code->size, + TAG_BATCH_BUFFER, + 0, /* tid */ + 0 /* rid */); + wait_on_mask(1 << TAG_BATCH_BUFFER); + + draw.vertex_fetch.code = attribute_fetch_code_buffer; +} + + +static void +cmd_finish(void) +{ + D_PRINTF(CELL_DEBUG_CMD, "FINISH\n"); + really_clear_tiles(0); + /* wait for all outstanding DMAs to finish */ + mfc_write_tag_mask(~0); + mfc_read_tag_status_all(); + /* send mbox message to PPU */ + spu_write_out_mbox(CELL_CMD_FINISH); +} + + +/** + * Execute a batch of commands which was sent to us by the PPU. + * See the cell_emit_state.c code to see where the commands come from. + * + * The opcode param encodes the location of the buffer and its size. + */ +static void +cmd_batch(uint opcode) +{ + const uint buf = (opcode >> 8) & 0xff; + uint size = (opcode >> 16); + qword buffer[CELL_BUFFER_SIZE / 16] ALIGN16_ATTRIB; + const unsigned usize = ROUNDUP16(size) / sizeof(buffer[0]); + uint pos; + + D_PRINTF(CELL_DEBUG_CMD, "BATCH buffer %u, len %u, from %p\n", + buf, size, spu.init.buffers[buf]); + + ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH); + + ASSERT_ALIGN16(spu.init.buffers[buf]); + + size = ROUNDUP16(size); + + ASSERT_ALIGN16(spu.init.buffers[buf]); + + mfc_get(buffer, /* dest */ + (unsigned int) spu.init.buffers[buf], /* src */ + size, + TAG_BATCH_BUFFER, + 0, /* tid */ + 0 /* rid */); + wait_on_mask(1 << TAG_BATCH_BUFFER); + + /* Tell PPU we're done copying the buffer to local store */ + D_PRINTF(CELL_DEBUG_CMD, "release batch buf %u\n", buf); + release_buffer(buf); + + /* + * Loop over commands in the batch buffer + */ + for (pos = 0; pos < usize; /* no incr */) { + switch (si_to_uint(buffer[pos])) { + /* + * rendering commands + */ + case CELL_CMD_CLEAR_SURFACE: + { + struct cell_command_clear_surface *clr + = (struct cell_command_clear_surface *) &buffer[pos]; + cmd_clear_surface(clr); + pos += sizeof(*clr) / 16; + } + break; + case CELL_CMD_RENDER: + { + struct cell_command_render *render + = (struct cell_command_render *) &buffer[pos]; + uint pos_incr; + cmd_render(render, &pos_incr); + pos += ((pos_incr+1)&~1) / 2; // should 'fix' cmd_render return + } + break; + /* + * state-update commands + */ + case CELL_CMD_STATE_FRAMEBUFFER: + { + struct cell_command_framebuffer *fb + = (struct cell_command_framebuffer *) &buffer[pos]; + cmd_state_framebuffer(fb); + pos += sizeof(*fb) / 16; + } + break; + case CELL_CMD_STATE_FRAGMENT_OPS: + { + struct cell_command_fragment_ops *fops + = (struct cell_command_fragment_ops *) &buffer[pos]; + cmd_state_fragment_ops(fops); + /* This is a variant-sized command */ + pos += ROUNDUP16(sizeof(*fops) + fops->total_code_size) / 16; + } + break; + case CELL_CMD_STATE_FRAGMENT_PROGRAM: + { + struct cell_command_fragment_program *fp + = (struct cell_command_fragment_program *) &buffer[pos]; + cmd_state_fragment_program(fp); + pos += sizeof(*fp) / 16; + } + break; + case CELL_CMD_STATE_FS_CONSTANTS: + pos = cmd_state_fs_constants(buffer, pos); + break; + case CELL_CMD_STATE_RASTERIZER: + { + struct cell_command_rasterizer *rast = + (struct cell_command_rasterizer *) &buffer[pos]; + spu.rasterizer = rast->rasterizer; + pos += sizeof(*rast) / 16; + } + break; + case CELL_CMD_STATE_SAMPLER: + { + struct cell_command_sampler *sampler + = (struct cell_command_sampler *) &buffer[pos]; + cmd_state_sampler(sampler); + pos += sizeof(*sampler) / 16; + } + break; + case CELL_CMD_STATE_TEXTURE: + { + struct cell_command_texture *texture + = (struct cell_command_texture *) &buffer[pos]; + cmd_state_texture(texture); + pos += sizeof(*texture) / 16; + } + break; + case CELL_CMD_STATE_VERTEX_INFO: + cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]); + pos += 1 + ROUNDUP16(sizeof(struct vertex_info)) / 16; + break; + case CELL_CMD_STATE_VIEWPORT: + (void) memcpy(& draw.viewport, &buffer[pos+1], + sizeof(struct pipe_viewport_state)); + pos += 1 + ROUNDUP16(sizeof(struct pipe_viewport_state)) / 16; + break; + case CELL_CMD_STATE_UNIFORMS: + draw.constants = (const float (*)[4]) (uintptr_t)spu_extract((vector unsigned int)buffer[pos+1],0); + pos += 2; + break; + case CELL_CMD_STATE_VS_ARRAY_INFO: + cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]); + pos += 1 + ROUNDUP16(sizeof(struct cell_array_info)) / 16; + break; + case CELL_CMD_STATE_BIND_VS: +#if 0 + spu_bind_vertex_shader(&draw, + (struct cell_shader_info *) &buffer[pos+1]); +#endif + pos += 1 + ROUNDUP16(sizeof(struct cell_shader_info)) / 16; + break; + case CELL_CMD_STATE_ATTRIB_FETCH: + cmd_state_attrib_fetch((struct cell_attribute_fetch_code *) + &buffer[pos+1]); + pos += 1 + ROUNDUP16(sizeof(struct cell_attribute_fetch_code)) / 16; + break; + /* + * misc commands + */ + case CELL_CMD_FINISH: + cmd_finish(); + pos += 1; + break; + case CELL_CMD_FENCE: + { + struct cell_command_fence *fence_cmd = + (struct cell_command_fence *) &buffer[pos]; + cmd_fence(fence_cmd); + pos += sizeof(*fence_cmd) / 16; + } + break; + case CELL_CMD_RELEASE_VERTS: + { + struct cell_command_release_verts *release + = (struct cell_command_release_verts *) &buffer[pos]; + cmd_release_verts(release); + pos += sizeof(*release) / 16; + } + break; + case CELL_CMD_FLUSH_BUFFER_RANGE: { + struct cell_buffer_range *br = (struct cell_buffer_range *) + &buffer[pos+1]; + + spu_dcache_mark_dirty((unsigned) br->base, br->size); + pos += 1 + ROUNDUP16(sizeof(struct cell_buffer_range)) / 16; + break; + } + default: + printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, si_to_uint(buffer[pos])); + ASSERT(0); + break; + } + } + + D_PRINTF(CELL_DEBUG_CMD, "BATCH complete\n"); +} + + +#define PERF 0 + + +/** + * Main loop for SPEs: Get a command, execute it, repeat. + */ +void +command_loop(void) +{ + int exitFlag = 0; + uint t0, t1; + + D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n"); + + while (!exitFlag) { + unsigned opcode; + + D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n"); + + if (PERF) + spu_write_decrementer(~0); + + /* read/wait from mailbox */ + opcode = (unsigned int) spu_read_in_mbox(); + D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", opcode); + + if (PERF) + t0 = spu_read_decrementer(); + + switch (opcode & CELL_CMD_OPCODE_MASK) { + case CELL_CMD_EXIT: + D_PRINTF(CELL_DEBUG_CMD, "EXIT\n"); + exitFlag = 1; + break; + case CELL_CMD_VS_EXECUTE: +#if 0 + spu_execute_vertex_shader(&draw, &cmd.vs); +#endif + break; + case CELL_CMD_BATCH: + cmd_batch(opcode); + break; + default: + printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK); + } + + if (PERF) { + t1 = spu_read_decrementer(); + printf("wait mbox time: %gms batch time: %gms\n", + (~0u - t0) * spu.init.inv_timebase, + (t0 - t1) * spu.init.inv_timebase); + } + } + + D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n"); + + if (spu.init.debug_flags & CELL_DEBUG_CACHE) + spu_dcache_report(); +} + +/* Initialize this module; we manage the fragment ops buffer here. */ +void +spu_command_init(void) +{ + /* Install default/fallback fragment processing function. + * This will normally be overriden by a code-gen'd function + * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set. + */ + spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops; + spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops; + + /* Set up the basic empty buffer for code-gen'ed fragment ops */ + spu.fragment_ops_code = NULL; + spu.fragment_ops_code_size = 0; +} + +void +spu_command_close(void) +{ + /* Deallocate the code-gen buffer for fragment ops, and reset the + * fragment ops functions to their initial setting (just to leave + * things in a good state). + */ + if (spu.fragment_ops_code != NULL) { + free(spu.fragment_ops_code); + } + spu_command_init(); +} diff --git a/src/gallium/drivers/cell/spu/spu_command.h b/src/gallium/drivers/cell/spu/spu_command.h new file mode 100644 index 00000000000..83dcdade288 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_command.h @@ -0,0 +1,35 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +extern void +command_loop(void); + +extern void +spu_command_init(void); + +extern void +spu_command_close(void); diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c new file mode 100644 index 00000000000..a6d67634fd8 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_dcache.c @@ -0,0 +1,145 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "cell/common.h" +#include "spu_main.h" +#include "spu_dcache.h" + +#define CACHELINE_LOG2SIZE 7 +#define LINE_SIZE (1U << 7) +#define ALIGN_MASK (~(LINE_SIZE - 1)) + +#define CACHE_NAME data +#define CACHED_TYPE qword +#define CACHE_TYPE CACHE_TYPE_RO +#define CACHE_SET_TAGID(set) (((set) & 0x03) + TAG_DCACHE0) +#define CACHE_LOG2NNWAY 2 +#define CACHE_LOG2NSETS 6 +#ifdef DEBUG +#define CACHE_STATS 1 +#endif +#include <cache-api.h> + +/* Yes folks, this is ugly. + */ +#undef CACHE_NWAY +#undef CACHE_NSETS +#define CACHE_NAME data +#define CACHE_NWAY 4 +#define CACHE_NSETS (1U << 6) + + +/** + * Fetch between arbitrary number of bytes from an unaligned address + * + * \param dst Destination data buffer + * \param ea Main memory effective address of source data + * \param size Number of bytes to read + * + * \warning + * As is hinted by the type of the \c dst pointer, this function writes + * multiples of 16-bytes. + */ +void +spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size) +{ + const int shift = ea & 0x0f; + const unsigned read_size = ROUNDUP16(size + shift); + const unsigned last_read = ROUNDUP16(ea + size); + const qword *const last_write = dst + (ROUNDUP16(size) / 16); + unsigned i; + + + if (shift == 0) { + /* Data is already aligned. Fetch directly into the destination buffer. + */ + for (i = 0; i < size; i += 16) { + *(dst++) = cache_rd(data, ea + i); + } + } else { + qword hi; + + + /* Please exercise extreme caution when modifying this code. This code + * must not read past the end of the page containing the source data, + * and it must not write more than ((size + 15) / 16) qwords to the + * destination buffer. + */ + ea &= ~0x0f; + hi = cache_rd(data, ea); + for (i = 16; i < read_size; i += 16) { + qword lo = cache_rd(data, ea + i); + + *(dst++) = si_or((qword) spu_slqwbyte(hi, shift), + (qword) spu_rlmaskqwbyte(lo, shift - 16)); + hi = lo; + } + + if (dst != last_write) { + *(dst++) = si_or((qword) spu_slqwbyte(hi, shift), si_il(0)); + } + } + + ASSERT((ea + i) == last_read); + ASSERT(dst == last_write); +} + + +/** + * Notify the cache that a range of main memory may have been modified + */ +void +spu_dcache_mark_dirty(unsigned ea, unsigned size) +{ + unsigned i; + const unsigned aligned_start = (ea & ALIGN_MASK); + const unsigned aligned_end = (ea + size + (LINE_SIZE - 1)) + & ALIGN_MASK; + + + for (i = 0; i < (CACHE_NWAY * CACHE_NSETS); i++) { + const unsigned entry = __cache_dir[i]; + const unsigned addr = entry & ~0x0f; + + __cache_dir[i] = ((addr >= aligned_start) && (addr < aligned_end)) + ? (entry & ~CACHELINE_VALID) : entry; + } +} + + +/** + * Print cache utilization report + */ +void +spu_dcache_report(void) +{ +#ifdef CACHE_STATS + if (spu.init.id == 0) { + printf("SPU 0: Texture cache report:\n"); + cache_pr_stats(data); + } +#endif +} + + diff --git a/src/gallium/drivers/cell/spu/spu_dcache.h b/src/gallium/drivers/cell/spu/spu_dcache.h new file mode 100644 index 00000000000..39a19eb31b5 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_dcache.h @@ -0,0 +1,37 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef SPU_DCACHE_H +#define SPU_DCACHE_H + +extern void +spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size); + +extern void +spu_dcache_mark_dirty(unsigned ea, unsigned size); + +extern void +spu_dcache_report(void); + +#endif /* SPU_DCACHE_H */ diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c new file mode 100644 index 00000000000..e27df2dfb38 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_exec.c @@ -0,0 +1,1933 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * TGSI interpretor/executor. + * + * Flow control information: + * + * Since we operate on 'quads' (4 pixels or 4 vertices in parallel) + * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special + * care since a condition may be true for some quad components but false + * for other components. + * + * We basically execute all statements (even if they're in the part of + * an IF/ELSE clause that's "not taken") and use a special mask to + * control writing to destination registers. This is the ExecMask. + * See store_dest(). + * + * The ExecMask is computed from three other masks (CondMask, LoopMask and + * ContMask) which are controlled by the flow control instructions (namely: + * (IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT). + * + * + * Authors: + * Michal Krol + * Brian Paul + */ + +#include <transpose_matrix4x4.h> +#include <simdmath/ceilf4.h> +#include <simdmath/cosf4.h> +#include <simdmath/divf4.h> +#include <simdmath/floorf4.h> +#include <simdmath/log2f4.h> +#include <simdmath/powf4.h> +#include <simdmath/sinf4.h> +#include <simdmath/sqrtf4.h> +#include <simdmath/truncf4.h> + +#include "pipe/p_compiler.h" +#include "pipe/p_state.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "spu_exec.h" +#include "spu_main.h" +#include "spu_vertex_shader.h" +#include "spu_dcache.h" +#include "cell/common.h" + +#define TILE_TOP_LEFT 0 +#define TILE_TOP_RIGHT 1 +#define TILE_BOTTOM_LEFT 2 +#define TILE_BOTTOM_RIGHT 3 + +/* + * Shorthand locations of various utility registers (_I = Index, _C = Channel) + */ +#define TEMP_0_I TGSI_EXEC_TEMP_00000000_I +#define TEMP_0_C TGSI_EXEC_TEMP_00000000_C +#define TEMP_7F_I TGSI_EXEC_TEMP_7FFFFFFF_I +#define TEMP_7F_C TGSI_EXEC_TEMP_7FFFFFFF_C +#define TEMP_80_I TGSI_EXEC_TEMP_80000000_I +#define TEMP_80_C TGSI_EXEC_TEMP_80000000_C +#define TEMP_FF_I TGSI_EXEC_TEMP_FFFFFFFF_I +#define TEMP_FF_C TGSI_EXEC_TEMP_FFFFFFFF_C +#define TEMP_1_I TGSI_EXEC_TEMP_ONE_I +#define TEMP_1_C TGSI_EXEC_TEMP_ONE_C +#define TEMP_2_I TGSI_EXEC_TEMP_TWO_I +#define TEMP_2_C TGSI_EXEC_TEMP_TWO_C +#define TEMP_128_I TGSI_EXEC_TEMP_128_I +#define TEMP_128_C TGSI_EXEC_TEMP_128_C +#define TEMP_M128_I TGSI_EXEC_TEMP_MINUS_128_I +#define TEMP_M128_C TGSI_EXEC_TEMP_MINUS_128_C +#define TEMP_KILMASK_I TGSI_EXEC_TEMP_KILMASK_I +#define TEMP_KILMASK_C TGSI_EXEC_TEMP_KILMASK_C +#define TEMP_OUTPUT_I TGSI_EXEC_TEMP_OUTPUT_I +#define TEMP_OUTPUT_C TGSI_EXEC_TEMP_OUTPUT_C +#define TEMP_PRIMITIVE_I TGSI_EXEC_TEMP_PRIMITIVE_I +#define TEMP_PRIMITIVE_C TGSI_EXEC_TEMP_PRIMITIVE_C +#define TEMP_R0 TGSI_EXEC_TEMP_R0 + +#define FOR_EACH_CHANNEL(CHAN)\ + for (CHAN = 0; CHAN < 4; CHAN++) + +#define IS_CHANNEL_ENABLED(INST, CHAN)\ + ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN))) + +#define IS_CHANNEL_ENABLED2(INST, CHAN)\ + ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN))) + +#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\ + FOR_EACH_CHANNEL( CHAN )\ + if (IS_CHANNEL_ENABLED( INST, CHAN )) + +#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\ + FOR_EACH_CHANNEL( CHAN )\ + if (IS_CHANNEL_ENABLED2( INST, CHAN )) + + +/** The execution mask depends on the conditional mask and the loop mask */ +#define UPDATE_EXEC_MASK(MACH) \ + MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask + + +#define CHAN_X 0 +#define CHAN_Y 1 +#define CHAN_Z 2 +#define CHAN_W 3 + + + +/** + * Initialize machine state by expanding tokens to full instructions, + * allocating temporary storage, setting up constants, etc. + * After this, we can call spu_exec_machine_run() many times. + */ +void +spu_exec_machine_init(struct spu_exec_machine *mach, + uint numSamplers, + struct spu_sampler *samplers, + unsigned processor) +{ + const qword zero = si_il(0); + const qword not_zero = si_il(~0); + + (void) numSamplers; + mach->Samplers = samplers; + mach->Processor = processor; + mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS]; + + /* Setup constants. */ + mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q = zero; + mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].q = not_zero; + mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].q = si_shli(not_zero, -1); + mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].q = si_shli(not_zero, 31); + + mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q = (qword) spu_splats(1.0f); + mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q = (qword) spu_splats(2.0f); + mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q = (qword) spu_splats(128.0f); + mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q = (qword) spu_splats(-128.0f); +} + + +static INLINE qword +micro_abs(qword src) +{ + return si_rotmi(si_shli(src, 1), -1); +} + +static INLINE qword +micro_ceil(qword src) +{ + return (qword) _ceilf4((vec_float4) src); +} + +static INLINE qword +micro_cos(qword src) +{ + return (qword) _cosf4((vec_float4) src); +} + +static const qword br_shuf = { + TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1, + TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3, + TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1, + TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3, + TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1, + TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3, + TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1, + TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3, +}; + +static const qword bl_shuf = { + TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1, + TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3, + TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1, + TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3, + TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1, + TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3, + TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1, + TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3, +}; + +static const qword tl_shuf = { + TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1, + TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3, + TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1, + TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3, + TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1, + TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3, + TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1, + TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3, +}; + +static qword +micro_ddx(qword src) +{ + qword bottom_right = si_shufb(src, src, br_shuf); + qword bottom_left = si_shufb(src, src, bl_shuf); + + return si_fs(bottom_right, bottom_left); +} + +static qword +micro_ddy(qword src) +{ + qword top_left = si_shufb(src, src, tl_shuf); + qword bottom_left = si_shufb(src, src, bl_shuf); + + return si_fs(top_left, bottom_left); +} + +static INLINE qword +micro_div(qword src0, qword src1) +{ + return (qword) _divf4((vec_float4) src0, (vec_float4) src1); +} + +static qword +micro_flr(qword src) +{ + return (qword) _floorf4((vec_float4) src); +} + +static qword +micro_frc(qword src) +{ + return si_fs(src, (qword) _floorf4((vec_float4) src)); +} + +static INLINE qword +micro_ge(qword src0, qword src1) +{ + return si_or(si_fceq(src0, src1), si_fcgt(src0, src1)); +} + +static qword +micro_lg2(qword src) +{ + return (qword) _log2f4((vec_float4) src); +} + +static INLINE qword +micro_lt(qword src0, qword src1) +{ + const qword tmp = si_or(si_fceq(src0, src1), si_fcgt(src0, src1)); + + return si_xori(tmp, 0xff); +} + +static INLINE qword +micro_max(qword src0, qword src1) +{ + return si_selb(src1, src0, si_fcgt(src0, src1)); +} + +static INLINE qword +micro_min(qword src0, qword src1) +{ + return si_selb(src0, src1, si_fcgt(src0, src1)); +} + +static qword +micro_neg(qword src) +{ + return si_xor(src, (qword) spu_splats(0x80000000)); +} + +static qword +micro_set_sign(qword src) +{ + return si_or(src, (qword) spu_splats(0x80000000)); +} + +static qword +micro_pow(qword src0, qword src1) +{ + return (qword) _powf4((vec_float4) src0, (vec_float4) src1); +} + +static qword +micro_rnd(qword src) +{ + const qword half = (qword) spu_splats(0.5f); + + /* May be able to use _roundf4. There may be some difference, though. + */ + return (qword) _floorf4((vec_float4) si_fa(src, half)); +} + +static INLINE qword +micro_ishr(qword src0, qword src1) +{ + return si_rotma(src0, si_sfi(src1, 0)); +} + +static qword +micro_trunc(qword src) +{ + return (qword) _truncf4((vec_float4) src); +} + +static qword +micro_sin(qword src) +{ + return (qword) _sinf4((vec_float4) src); +} + +static INLINE qword +micro_sqrt(qword src) +{ + return (qword) _sqrtf4((vec_float4) src); +} + +static void +fetch_src_file_channel( + const struct spu_exec_machine *mach, + const uint file, + const uint swizzle, + const union spu_exec_channel *index, + union spu_exec_channel *chan ) +{ + switch( swizzle ) { + case TGSI_EXTSWIZZLE_X: + case TGSI_EXTSWIZZLE_Y: + case TGSI_EXTSWIZZLE_Z: + case TGSI_EXTSWIZZLE_W: + switch( file ) { + case TGSI_FILE_CONSTANT: { + unsigned i; + + for (i = 0; i < 4; i++) { + const float *ptr = mach->Consts[index->i[i]]; + float tmp[4]; + + spu_dcache_fetch_unaligned((qword *) tmp, + (uintptr_t)(ptr + swizzle), + sizeof(float)); + + chan->f[i] = tmp[0]; + } + break; + } + + case TGSI_FILE_INPUT: + chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3]; + break; + + case TGSI_FILE_TEMPORARY: + chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3]; + break; + + case TGSI_FILE_IMMEDIATE: + ASSERT( index->i[0] < (int) mach->ImmLimit ); + ASSERT( index->i[1] < (int) mach->ImmLimit ); + ASSERT( index->i[2] < (int) mach->ImmLimit ); + ASSERT( index->i[3] < (int) mach->ImmLimit ); + + chan->f[0] = mach->Imms[index->i[0]][swizzle]; + chan->f[1] = mach->Imms[index->i[1]][swizzle]; + chan->f[2] = mach->Imms[index->i[2]][swizzle]; + chan->f[3] = mach->Imms[index->i[3]][swizzle]; + break; + + case TGSI_FILE_ADDRESS: + chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3]; + break; + + case TGSI_FILE_OUTPUT: + /* vertex/fragment output vars can be read too */ + chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3]; + break; + + default: + ASSERT( 0 ); + } + break; + + case TGSI_EXTSWIZZLE_ZERO: + *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]; + break; + + case TGSI_EXTSWIZZLE_ONE: + *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]; + break; + + default: + ASSERT( 0 ); + } +} + +static void +fetch_source( + const struct spu_exec_machine *mach, + union spu_exec_channel *chan, + const struct tgsi_full_src_register *reg, + const uint chan_index ) +{ + union spu_exec_channel index; + uint swizzle; + + index.i[0] = + index.i[1] = + index.i[2] = + index.i[3] = reg->SrcRegister.Index; + + if (reg->SrcRegister.Indirect) { + union spu_exec_channel index2; + union spu_exec_channel indir_index; + + index2.i[0] = + index2.i[1] = + index2.i[2] = + index2.i[3] = reg->SrcRegisterInd.Index; + + swizzle = tgsi_util_get_src_register_swizzle(®->SrcRegisterInd, + CHAN_X); + fetch_src_file_channel( + mach, + reg->SrcRegisterInd.File, + swizzle, + &index2, + &indir_index ); + + index.q = si_a(index.q, indir_index.q); + } + + if( reg->SrcRegister.Dimension ) { + switch( reg->SrcRegister.File ) { + case TGSI_FILE_INPUT: + index.q = si_mpyi(index.q, 17); + break; + case TGSI_FILE_CONSTANT: + index.q = si_shli(index.q, 12); + break; + default: + ASSERT( 0 ); + } + + index.i[0] += reg->SrcRegisterDim.Index; + index.i[1] += reg->SrcRegisterDim.Index; + index.i[2] += reg->SrcRegisterDim.Index; + index.i[3] += reg->SrcRegisterDim.Index; + + if (reg->SrcRegisterDim.Indirect) { + union spu_exec_channel index2; + union spu_exec_channel indir_index; + + index2.i[0] = + index2.i[1] = + index2.i[2] = + index2.i[3] = reg->SrcRegisterDimInd.Index; + + swizzle = tgsi_util_get_src_register_swizzle( ®->SrcRegisterDimInd, CHAN_X ); + fetch_src_file_channel( + mach, + reg->SrcRegisterDimInd.File, + swizzle, + &index2, + &indir_index ); + + index.q = si_a(index.q, indir_index.q); + } + } + + swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index ); + fetch_src_file_channel( + mach, + reg->SrcRegister.File, + swizzle, + &index, + chan ); + + switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) { + case TGSI_UTIL_SIGN_CLEAR: + chan->q = micro_abs(chan->q); + break; + + case TGSI_UTIL_SIGN_SET: + chan->q = micro_set_sign(chan->q); + break; + + case TGSI_UTIL_SIGN_TOGGLE: + chan->q = micro_neg(chan->q); + break; + + case TGSI_UTIL_SIGN_KEEP: + break; + } + + if (reg->SrcRegisterExtMod.Complement) { + chan->q = si_fs(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, chan->q); + } +} + +static void +store_dest( + struct spu_exec_machine *mach, + const union spu_exec_channel *chan, + const struct tgsi_full_dst_register *reg, + const struct tgsi_full_instruction *inst, + uint chan_index ) +{ + union spu_exec_channel *dst; + + switch( reg->DstRegister.File ) { + case TGSI_FILE_NULL: + return; + + case TGSI_FILE_OUTPUT: + dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] + + reg->DstRegister.Index].xyzw[chan_index]; + break; + + case TGSI_FILE_TEMPORARY: + dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index]; + break; + + case TGSI_FILE_ADDRESS: + dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index]; + break; + + default: + ASSERT( 0 ); + return; + } + + switch (inst->Instruction.Saturate) + { + case TGSI_SAT_NONE: + if (mach->ExecMask & 0x1) + dst->i[0] = chan->i[0]; + if (mach->ExecMask & 0x2) + dst->i[1] = chan->i[1]; + if (mach->ExecMask & 0x4) + dst->i[2] = chan->i[2]; + if (mach->ExecMask & 0x8) + dst->i[3] = chan->i[3]; + break; + + case TGSI_SAT_ZERO_ONE: + /* XXX need to obey ExecMask here */ + dst->q = micro_max(chan->q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q); + dst->q = micro_min(dst->q, mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q); + break; + + case TGSI_SAT_MINUS_PLUS_ONE: + ASSERT( 0 ); + break; + + default: + ASSERT( 0 ); + } +} + +#define FETCH(VAL,INDEX,CHAN)\ + fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN) + +#define STORE(VAL,INDEX,CHAN)\ + store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN ) + + +/** + * Execute ARB-style KIL which is predicated by a src register. + * Kill fragment if any of the four values is less than zero. + */ +static void +exec_kil(struct spu_exec_machine *mach, + const struct tgsi_full_instruction *inst) +{ + uint uniquemask; + uint chan_index; + uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */ + union spu_exec_channel r[1]; + + /* This mask stores component bits that were already tested. Note that + * we test if the value is less than zero, so 1.0 and 0.0 need not to be + * tested. */ + uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE); + + for (chan_index = 0; chan_index < 4; chan_index++) + { + uint swizzle; + uint i; + + /* unswizzle channel */ + swizzle = tgsi_util_get_full_src_register_extswizzle ( + &inst->FullSrcRegisters[0], + chan_index); + + /* check if the component has not been already tested */ + if (uniquemask & (1 << swizzle)) + continue; + uniquemask |= 1 << swizzle; + + FETCH(&r[0], 0, chan_index); + for (i = 0; i < 4; i++) + if (r[0].f[i] < 0.0f) + kilmask |= 1 << i; + } + + mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask; +} + +/** + * Execute NVIDIA-style KIL which is predicated by a condition code. + * Kill fragment if the condition code is TRUE. + */ +static void +exec_kilp(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) +{ + uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */ + + /* TODO: build kilmask from CC mask */ + + mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask; +} + +/* + * Fetch a texel using STR texture coordinates. + */ +static void +fetch_texel( struct spu_sampler *sampler, + const union spu_exec_channel *s, + const union spu_exec_channel *t, + const union spu_exec_channel *p, + float lodbias, /* XXX should be float[4] */ + union spu_exec_channel *r, + union spu_exec_channel *g, + union spu_exec_channel *b, + union spu_exec_channel *a ) +{ + qword rgba[4]; + qword out[4]; + + sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, + (float (*)[4]) rgba); + + _transpose_matrix4x4((vec_float4 *) out, (vec_float4 *) rgba); + r->q = out[0]; + g->q = out[1]; + b->q = out[2]; + a->q = out[3]; +} + + +static void +exec_tex(struct spu_exec_machine *mach, + const struct tgsi_full_instruction *inst, + boolean biasLod, boolean projected) +{ + const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index; + union spu_exec_channel r[8]; + uint chan_index; + float lodBias; + + /* printf("Sampler %u unit %u\n", sampler, unit); */ + + switch (inst->InstructionExtTexture.Texture) { + case TGSI_TEXTURE_1D: + + FETCH(&r[0], 0, CHAN_X); + + if (projected) { + FETCH(&r[1], 0, CHAN_W); + r[0].q = micro_div(r[0].q, r[1].q); + } + + if (biasLod) { + FETCH(&r[1], 0, CHAN_W); + lodBias = r[2].f[0]; + } + else + lodBias = 0.0; + + fetch_texel(&mach->Samplers[unit], + &r[0], NULL, NULL, lodBias, /* S, T, P, BIAS */ + &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */ + break; + + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 0, CHAN_Z); + + if (projected) { + FETCH(&r[3], 0, CHAN_W); + r[0].q = micro_div(r[0].q, r[3].q); + r[1].q = micro_div(r[1].q, r[3].q); + r[2].q = micro_div(r[2].q, r[3].q); + } + + if (biasLod) { + FETCH(&r[3], 0, CHAN_W); + lodBias = r[3].f[0]; + } + else + lodBias = 0.0; + + fetch_texel(&mach->Samplers[unit], + &r[0], &r[1], &r[2], lodBias, /* inputs */ + &r[0], &r[1], &r[2], &r[3]); /* outputs */ + break; + + case TGSI_TEXTURE_3D: + case TGSI_TEXTURE_CUBE: + + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 0, CHAN_Z); + + if (projected) { + FETCH(&r[3], 0, CHAN_W); + r[0].q = micro_div(r[0].q, r[3].q); + r[1].q = micro_div(r[1].q, r[3].q); + r[2].q = micro_div(r[2].q, r[3].q); + } + + if (biasLod) { + FETCH(&r[3], 0, CHAN_W); + lodBias = r[3].f[0]; + } + else + lodBias = 0.0; + + fetch_texel(&mach->Samplers[unit], + &r[0], &r[1], &r[2], lodBias, + &r[0], &r[1], &r[2], &r[3]); + break; + + default: + ASSERT (0); + } + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[chan_index], 0, chan_index ); + } +} + + + +static void +constant_interpolation( + struct spu_exec_machine *mach, + unsigned attrib, + unsigned chan ) +{ + unsigned i; + + for( i = 0; i < QUAD_SIZE; i++ ) { + mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan]; + } +} + +static void +linear_interpolation( + struct spu_exec_machine *mach, + unsigned attrib, + unsigned chan ) +{ + const float x = mach->QuadPos.xyzw[0].f[0]; + const float y = mach->QuadPos.xyzw[1].f[0]; + const float dadx = mach->InterpCoefs[attrib].dadx[chan]; + const float dady = mach->InterpCoefs[attrib].dady[chan]; + const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y; + mach->Inputs[attrib].xyzw[chan].f[0] = a0; + mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx; + mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady; + mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady; +} + +static void +perspective_interpolation( + struct spu_exec_machine *mach, + unsigned attrib, + unsigned chan ) +{ + const float x = mach->QuadPos.xyzw[0].f[0]; + const float y = mach->QuadPos.xyzw[1].f[0]; + const float dadx = mach->InterpCoefs[attrib].dadx[chan]; + const float dady = mach->InterpCoefs[attrib].dady[chan]; + const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y; + const float *w = mach->QuadPos.xyzw[3].f; + /* divide by W here */ + mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0]; + mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1]; + mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2]; + mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3]; +} + + +typedef void (* interpolation_func)( + struct spu_exec_machine *mach, + unsigned attrib, + unsigned chan ); + +static void +exec_declaration(struct spu_exec_machine *mach, + const struct tgsi_full_declaration *decl) +{ + if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) { + if( decl->Declaration.File == TGSI_FILE_INPUT ) { + unsigned first, last, mask; + interpolation_func interp; + + first = decl->DeclarationRange.First; + last = decl->DeclarationRange.Last; + mask = decl->Declaration.UsageMask; + + switch( decl->Declaration.Interpolate ) { + case TGSI_INTERPOLATE_CONSTANT: + interp = constant_interpolation; + break; + + case TGSI_INTERPOLATE_LINEAR: + interp = linear_interpolation; + break; + + case TGSI_INTERPOLATE_PERSPECTIVE: + interp = perspective_interpolation; + break; + + default: + ASSERT( 0 ); + } + + if( mask == TGSI_WRITEMASK_XYZW ) { + unsigned i, j; + + for( i = first; i <= last; i++ ) { + for( j = 0; j < NUM_CHANNELS; j++ ) { + interp( mach, i, j ); + } + } + } + else { + unsigned i, j; + + for( j = 0; j < NUM_CHANNELS; j++ ) { + if( mask & (1 << j) ) { + for( i = first; i <= last; i++ ) { + interp( mach, i, j ); + } + } + } + } + } + } +} + +static void +exec_instruction( + struct spu_exec_machine *mach, + const struct tgsi_full_instruction *inst, + int *pc ) +{ + uint chan_index; + union spu_exec_channel r[8]; + + (*pc)++; + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ARL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = si_cflts(r[0].q, 0); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_MOV: + case TGSI_OPCODE_SWZ: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_LIT: + if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + FETCH( &r[0], 0, CHAN_X ); + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { + r[0].q = micro_max(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q); + STORE( &r[0], 0, CHAN_Y ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + FETCH( &r[1], 0, CHAN_Y ); + r[1].q = micro_max(r[1].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q); + + FETCH( &r[2], 0, CHAN_W ); + r[2].q = micro_min(r[2].q, mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q); + r[2].q = micro_max(r[2].q, mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q); + r[1].q = micro_pow(r[1].q, r[2].q); + + /* r0 = (r0 > 0.0) ? r1 : 0.0 + */ + r[0].q = si_fcgt(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q); + r[0].q = si_selb(mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q, r[1].q, + r[0].q); + STORE( &r[0], 0, CHAN_Z ); + } + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_RCP: + /* TGSI_OPCODE_RECIP */ + FETCH( &r[0], 0, CHAN_X ); + r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_RSQ: + /* TGSI_OPCODE_RECIPSQRT */ + FETCH( &r[0], 0, CHAN_X ); + r[0].q = micro_sqrt(r[0].q); + r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_EXP: + ASSERT (0); + break; + + case TGSI_OPCODE_LOG: + ASSERT (0); + break; + + case TGSI_OPCODE_MUL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) + { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + r[0].q = si_fm(r[0].q, r[1].q); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_ADD: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + r[0].q = si_fa(r[0].q, r[1].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DP3: + /* TGSI_OPCODE_DOT3 */ + FETCH( &r[0], 0, CHAN_X ); + FETCH( &r[1], 1, CHAN_X ); + r[0].q = si_fm(r[0].q, r[1].q); + + FETCH( &r[1], 0, CHAN_Y ); + FETCH( &r[2], 1, CHAN_Y ); + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + + FETCH( &r[1], 0, CHAN_Z ); + FETCH( &r[2], 1, CHAN_Z ); + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DP4: + /* TGSI_OPCODE_DOT4 */ + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 1, CHAN_X); + + r[0].q = si_fm(r[0].q, r[1].q); + + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 1, CHAN_Y); + + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + FETCH(&r[1], 0, CHAN_Z); + FETCH(&r[2], 1, CHAN_Z); + + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + FETCH(&r[1], 0, CHAN_W); + FETCH(&r[2], 1, CHAN_W); + + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DST: + if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { + FETCH( &r[0], 0, CHAN_Y ); + FETCH( &r[1], 1, CHAN_Y); + r[0].q = si_fm(r[0].q, r[1].q); + STORE( &r[0], 0, CHAN_Y ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + FETCH( &r[0], 0, CHAN_Z ); + STORE( &r[0], 0, CHAN_Z ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { + FETCH( &r[0], 1, CHAN_W ); + STORE( &r[0], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_MIN: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + r[0].q = micro_min(r[0].q, r[1].q); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_MAX: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + r[0].q = micro_max(r[0].q, r[1].q); + + STORE(&r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SLT: + /* TGSI_OPCODE_SETLT */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + + r[0].q = micro_ge(r[0].q, r[1].q); + r[0].q = si_xori(r[0].q, 0xff); + + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SGE: + /* TGSI_OPCODE_SETGE */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + r[0].q = micro_ge(r[0].q, r[1].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_MAD: + /* TGSI_OPCODE_MADD */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + FETCH( &r[2], 2, chan_index ); + r[0].q = si_fma(r[0].q, r[1].q, r[2].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SUB: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + r[0].q = si_fs(r[0].q, r[1].q); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_LERP: + /* TGSI_OPCODE_LRP */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + FETCH(&r[2], 2, chan_index); + + r[1].q = si_fs(r[1].q, r[2].q); + r[0].q = si_fma(r[0].q, r[1].q, r[2].q); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_CND: + ASSERT (0); + break; + + case TGSI_OPCODE_CND0: + ASSERT (0); + break; + + case TGSI_OPCODE_DOT2ADD: + /* TGSI_OPCODE_DP2A */ + ASSERT (0); + break; + + case TGSI_OPCODE_INDEX: + ASSERT (0); + break; + + case TGSI_OPCODE_NEGATE: + ASSERT (0); + break; + + case TGSI_OPCODE_FRAC: + /* TGSI_OPCODE_FRC */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = micro_frc(r[0].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_CLAMP: + ASSERT (0); + break; + + case TGSI_OPCODE_FLOOR: + /* TGSI_OPCODE_FLR */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = micro_flr(r[0].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_ROUND: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = micro_rnd(r[0].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_EXPBASE2: + /* TGSI_OPCODE_EX2 */ + FETCH(&r[0], 0, CHAN_X); + + r[0].q = micro_pow(mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q, r[0].q); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_LOGBASE2: + /* TGSI_OPCODE_LG2 */ + FETCH( &r[0], 0, CHAN_X ); + r[0].q = micro_lg2(r[0].q); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_POWER: + /* TGSI_OPCODE_POW */ + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 1, CHAN_X); + + r[0].q = micro_pow(r[0].q, r[1].q); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_CROSSPRODUCT: + /* TGSI_OPCODE_XPD */ + FETCH(&r[0], 0, CHAN_Y); + FETCH(&r[1], 1, CHAN_Z); + FETCH(&r[3], 0, CHAN_Z); + FETCH(&r[4], 1, CHAN_Y); + + /* r2 = (r0 * r1) - (r3 * r5) + */ + r[2].q = si_fm(r[3].q, r[5].q); + r[2].q = si_fms(r[0].q, r[1].q, r[2].q); + + if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { + STORE( &r[2], 0, CHAN_X ); + } + + FETCH(&r[2], 1, CHAN_X); + FETCH(&r[5], 0, CHAN_X); + + /* r3 = (r3 * r2) - (r1 * r5) + */ + r[1].q = si_fm(r[1].q, r[5].q); + r[3].q = si_fms(r[3].q, r[2].q, r[1].q); + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { + STORE( &r[3], 0, CHAN_Y ); + } + + /* r5 = (r5 * r4) - (r0 * r2) + */ + r[0].q = si_fm(r[0].q, r[2].q); + r[5].q = si_fms(r[5].q, r[4].q, r[0].q); + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + STORE( &r[5], 0, CHAN_Z ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_MULTIPLYMATRIX: + ASSERT (0); + break; + + case TGSI_OPCODE_ABS: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + + r[0].q = micro_abs(r[0].q); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_RCC: + ASSERT (0); + break; + + case TGSI_OPCODE_DPH: + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 1, CHAN_X); + + r[0].q = si_fm(r[0].q, r[1].q); + + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 1, CHAN_Y); + + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + FETCH(&r[1], 0, CHAN_Z); + FETCH(&r[2], 1, CHAN_Z); + + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + FETCH(&r[1], 1, CHAN_W); + + r[0].q = si_fa(r[0].q, r[1].q); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_COS: + FETCH(&r[0], 0, CHAN_X); + + r[0].q = micro_cos(r[0].q); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DDX: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = micro_ddx(r[0].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DDY: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = micro_ddy(r[0].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_KILP: + exec_kilp (mach, inst); + break; + + case TGSI_OPCODE_KIL: + exec_kil (mach, inst); + break; + + case TGSI_OPCODE_PK2H: + ASSERT (0); + break; + + case TGSI_OPCODE_PK2US: + ASSERT (0); + break; + + case TGSI_OPCODE_PK4B: + ASSERT (0); + break; + + case TGSI_OPCODE_PK4UB: + ASSERT (0); + break; + + case TGSI_OPCODE_RFL: + ASSERT (0); + break; + + case TGSI_OPCODE_SEQ: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + + r[0].q = si_fceq(r[0].q, r[1].q); + + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SFL: + ASSERT (0); + break; + + case TGSI_OPCODE_SGT: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + r[0].q = si_fcgt(r[0].q, r[1].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SIN: + FETCH( &r[0], 0, CHAN_X ); + r[0].q = micro_sin(r[0].q); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SLE: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + + r[0].q = si_fcgt(r[0].q, r[1].q); + r[0].q = si_xori(r[0].q, 0xff); + + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SNE: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + + r[0].q = si_fceq(r[0].q, r[1].q); + r[0].q = si_xori(r[0].q, 0xff); + + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_STR: + ASSERT (0); + break; + + case TGSI_OPCODE_TEX: + /* simple texture lookup */ + /* src[0] = texcoord */ + /* src[1] = sampler unit */ + exec_tex(mach, inst, FALSE, FALSE); + break; + + case TGSI_OPCODE_TXB: + /* Texture lookup with lod bias */ + /* src[0] = texcoord (src[0].w = load bias) */ + /* src[1] = sampler unit */ + exec_tex(mach, inst, TRUE, FALSE); + break; + + case TGSI_OPCODE_TXD: + /* Texture lookup with explict partial derivatives */ + /* src[0] = texcoord */ + /* src[1] = d[strq]/dx */ + /* src[2] = d[strq]/dy */ + /* src[3] = sampler unit */ + ASSERT (0); + break; + + case TGSI_OPCODE_TXL: + /* Texture lookup with explit LOD */ + /* src[0] = texcoord (src[0].w = load bias) */ + /* src[1] = sampler unit */ + exec_tex(mach, inst, TRUE, FALSE); + break; + + case TGSI_OPCODE_TXP: + /* Texture lookup with projection */ + /* src[0] = texcoord (src[0].w = projection) */ + /* src[1] = sampler unit */ + exec_tex(mach, inst, TRUE, TRUE); + break; + + case TGSI_OPCODE_UP2H: + ASSERT (0); + break; + + case TGSI_OPCODE_UP2US: + ASSERT (0); + break; + + case TGSI_OPCODE_UP4B: + ASSERT (0); + break; + + case TGSI_OPCODE_UP4UB: + ASSERT (0); + break; + + case TGSI_OPCODE_X2D: + ASSERT (0); + break; + + case TGSI_OPCODE_ARA: + ASSERT (0); + break; + + case TGSI_OPCODE_ARR: + ASSERT (0); + break; + + case TGSI_OPCODE_BRA: + ASSERT (0); + break; + + case TGSI_OPCODE_CAL: + /* skip the call if no execution channels are enabled */ + if (mach->ExecMask) { + /* do the call */ + + /* push the Cond, Loop, Cont stacks */ + ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING); + mach->CondStack[mach->CondStackTop++] = mach->CondMask; + ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask; + ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + mach->ContStack[mach->ContStackTop++] = mach->ContMask; + + ASSERT(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING); + mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask; + + /* note that PC was already incremented above */ + mach->CallStack[mach->CallStackTop++] = *pc; + *pc = inst->InstructionExtLabel.Label; + } + break; + + case TGSI_OPCODE_RET: + mach->FuncMask &= ~mach->ExecMask; + UPDATE_EXEC_MASK(mach); + + if (mach->ExecMask == 0x0) { + /* really return now (otherwise, keep executing */ + + if (mach->CallStackTop == 0) { + /* returning from main() */ + *pc = -1; + return; + } + *pc = mach->CallStack[--mach->CallStackTop]; + + /* pop the Cond, Loop, Cont stacks */ + ASSERT(mach->CondStackTop > 0); + mach->CondMask = mach->CondStack[--mach->CondStackTop]; + ASSERT(mach->LoopStackTop > 0); + mach->LoopMask = mach->LoopStack[--mach->LoopStackTop]; + ASSERT(mach->ContStackTop > 0); + mach->ContMask = mach->ContStack[--mach->ContStackTop]; + ASSERT(mach->FuncStackTop > 0); + mach->FuncMask = mach->FuncStack[--mach->FuncStackTop]; + + UPDATE_EXEC_MASK(mach); + } + break; + + case TGSI_OPCODE_SSG: + ASSERT (0); + break; + + case TGSI_OPCODE_CMP: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + FETCH(&r[2], 2, chan_index); + + /* r0 = (r0 < 0.0) ? r1 : r2 + */ + r[3].q = si_xor(r[3].q, r[3].q); + r[0].q = micro_lt(r[0].q, r[3].q); + r[0].q = si_selb(r[1].q, r[2].q, r[0].q); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_SCS: + if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { + FETCH( &r[0], 0, CHAN_X ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) { + r[1].q = micro_cos(r[0].q); + STORE( &r[1], 0, CHAN_X ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { + r[1].q = micro_sin(r[0].q); + STORE( &r[1], 0, CHAN_Y ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { + STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_NRM: + ASSERT (0); + break; + + case TGSI_OPCODE_DIV: + ASSERT( 0 ); + break; + + case TGSI_OPCODE_DP2: + FETCH( &r[0], 0, CHAN_X ); + FETCH( &r[1], 1, CHAN_X ); + r[0].q = si_fm(r[0].q, r[1].q); + + FETCH( &r[1], 0, CHAN_Y ); + FETCH( &r[2], 1, CHAN_Y ); + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_IF: + /* push CondMask */ + ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING); + mach->CondStack[mach->CondStackTop++] = mach->CondMask; + FETCH( &r[0], 0, CHAN_X ); + /* update CondMask */ + if( ! r[0].u[0] ) { + mach->CondMask &= ~0x1; + } + if( ! r[0].u[1] ) { + mach->CondMask &= ~0x2; + } + if( ! r[0].u[2] ) { + mach->CondMask &= ~0x4; + } + if( ! r[0].u[3] ) { + mach->CondMask &= ~0x8; + } + UPDATE_EXEC_MASK(mach); + /* Todo: If CondMask==0, jump to ELSE */ + break; + + case TGSI_OPCODE_ELSE: + /* invert CondMask wrt previous mask */ + { + uint prevMask; + ASSERT(mach->CondStackTop > 0); + prevMask = mach->CondStack[mach->CondStackTop - 1]; + mach->CondMask = ~mach->CondMask & prevMask; + UPDATE_EXEC_MASK(mach); + /* Todo: If CondMask==0, jump to ENDIF */ + } + break; + + case TGSI_OPCODE_ENDIF: + /* pop CondMask */ + ASSERT(mach->CondStackTop > 0); + mach->CondMask = mach->CondStack[--mach->CondStackTop]; + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_END: + /* halt execution */ + *pc = -1; + break; + + case TGSI_OPCODE_REP: + ASSERT (0); + break; + + case TGSI_OPCODE_ENDREP: + ASSERT (0); + break; + + case TGSI_OPCODE_PUSHA: + ASSERT (0); + break; + + case TGSI_OPCODE_POPA: + ASSERT (0); + break; + + case TGSI_OPCODE_CEIL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = micro_ceil(r[0].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_I2F: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = si_csflt(r[0].q, 0); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_NOT: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = si_xorbi(r[0].q, 0xff); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_TRUNC: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = micro_trunc(r[0].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SHL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + + r[0].q = si_shl(r[0].q, r[1].q); + + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SHR: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + r[0].q = micro_ishr(r[0].q, r[1].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_AND: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + r[0].q = si_and(r[0].q, r[1].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_OR: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + r[0].q = si_or(r[0].q, r[1].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_MOD: + ASSERT (0); + break; + + case TGSI_OPCODE_XOR: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + r[0].q = si_xor(r[0].q, r[1].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SAD: + ASSERT (0); + break; + + case TGSI_OPCODE_TXF: + ASSERT (0); + break; + + case TGSI_OPCODE_TXQ: + ASSERT (0); + break; + + case TGSI_OPCODE_EMIT: + mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16; + mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++; + break; + + case TGSI_OPCODE_ENDPRIM: + mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++; + mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0; + break; + + case TGSI_OPCODE_LOOP: + /* fall-through (for now) */ + case TGSI_OPCODE_BGNLOOP2: + /* push LoopMask and ContMasks */ + ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask; + ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + mach->ContStack[mach->ContStackTop++] = mach->ContMask; + break; + + case TGSI_OPCODE_ENDLOOP: + /* fall-through (for now at least) */ + case TGSI_OPCODE_ENDLOOP2: + /* Restore ContMask, but don't pop */ + ASSERT(mach->ContStackTop > 0); + mach->ContMask = mach->ContStack[mach->ContStackTop - 1]; + if (mach->LoopMask) { + /* repeat loop: jump to instruction just past BGNLOOP */ + *pc = inst->InstructionExtLabel.Label + 1; + } + else { + /* exit loop: pop LoopMask */ + ASSERT(mach->LoopStackTop > 0); + mach->LoopMask = mach->LoopStack[--mach->LoopStackTop]; + /* pop ContMask */ + ASSERT(mach->ContStackTop > 0); + mach->ContMask = mach->ContStack[--mach->ContStackTop]; + } + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_BRK: + /* turn off loop channels for each enabled exec channel */ + mach->LoopMask &= ~mach->ExecMask; + /* Todo: if mach->LoopMask == 0, jump to end of loop */ + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_CONT: + /* turn off cont channels for each enabled exec channel */ + mach->ContMask &= ~mach->ExecMask; + /* Todo: if mach->LoopMask == 0, jump to end of loop */ + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_BGNSUB: + /* no-op */ + break; + + case TGSI_OPCODE_ENDSUB: + /* no-op */ + break; + + case TGSI_OPCODE_NOISE1: + ASSERT( 0 ); + break; + + case TGSI_OPCODE_NOISE2: + ASSERT( 0 ); + break; + + case TGSI_OPCODE_NOISE3: + ASSERT( 0 ); + break; + + case TGSI_OPCODE_NOISE4: + ASSERT( 0 ); + break; + + case TGSI_OPCODE_NOP: + break; + + default: + ASSERT( 0 ); + } +} + + +/** + * Run TGSI interpreter. + * \return bitmask of "alive" quad components + */ +uint +spu_exec_machine_run( struct spu_exec_machine *mach ) +{ + uint i; + int pc = 0; + + mach->CondMask = 0xf; + mach->LoopMask = 0xf; + mach->ContMask = 0xf; + mach->FuncMask = 0xf; + mach->ExecMask = 0xf; + + mach->CondStackTop = 0; /* temporarily subvert this ASSERTion */ + ASSERT(mach->CondStackTop == 0); + ASSERT(mach->LoopStackTop == 0); + ASSERT(mach->ContStackTop == 0); + ASSERT(mach->CallStackTop == 0); + + mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0; + mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0; + + if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) { + mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0; + mach->Primitives[0] = 0; + } + + + /* execute declarations (interpolants) */ + if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) { + for (i = 0; i < mach->NumDeclarations; i++) { + union { + struct tgsi_full_declaration decl; + qword buffer[ROUNDUP16(sizeof(struct tgsi_full_declaration)) / 16]; + } d ALIGN16_ATTRIB; + unsigned ea = (unsigned) (mach->Declarations + pc); + + spu_dcache_fetch_unaligned(d.buffer, ea, sizeof(d.decl)); + + exec_declaration( mach, &d.decl ); + } + } + + /* execute instructions, until pc is set to -1 */ + while (pc != -1) { + union { + struct tgsi_full_instruction inst; + qword buffer[ROUNDUP16(sizeof(struct tgsi_full_instruction)) / 16]; + } i ALIGN16_ATTRIB; + unsigned ea = (unsigned) (mach->Instructions + pc); + + spu_dcache_fetch_unaligned(i.buffer, ea, sizeof(i.inst)); + exec_instruction( mach, & i.inst, &pc ); + } + +#if 0 + /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */ + if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) { + /* + * Scale back depth component. + */ + for (i = 0; i < 4; i++) + mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF; + } +#endif + + return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0]; +} + + diff --git a/src/gallium/drivers/cell/spu/spu_exec.h b/src/gallium/drivers/cell/spu/spu_exec.h new file mode 100644 index 00000000000..86056799405 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_exec.h @@ -0,0 +1,172 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#if !defined SPU_EXEC_H +#define SPU_EXEC_H + +#include "pipe/p_compiler.h" +#include "tgsi/tgsi_exec.h" + +#if defined __cplusplus +extern "C" { +#endif + +/** + * Registers may be treated as float, signed int or unsigned int. + */ +union spu_exec_channel +{ + float f[QUAD_SIZE]; + int i[QUAD_SIZE]; + unsigned u[QUAD_SIZE]; + qword q; +}; + +/** + * A vector[RGBA] of channels[4 pixels] + */ +struct spu_exec_vector +{ + union spu_exec_channel xyzw[NUM_CHANNELS]; +}; + +/** + * For fragment programs, information for computing fragment input + * values from plane equation of the triangle/line. + */ +struct spu_interp_coef +{ + float a0[NUM_CHANNELS]; /* in an xyzw layout */ + float dadx[NUM_CHANNELS]; + float dady[NUM_CHANNELS]; +}; + + +struct softpipe_tile_cache; /**< Opaque to TGSI */ + +/** + * Information for sampling textures, which must be implemented + * by code outside the TGSI executor. + */ +struct spu_sampler +{ + const struct pipe_sampler_state *state; + struct pipe_texture *texture; + /** Get samples for four fragments in a quad */ + void (*get_samples)(struct spu_sampler *sampler, + const float s[QUAD_SIZE], + const float t[QUAD_SIZE], + const float p[QUAD_SIZE], + float lodbias, + float rgba[NUM_CHANNELS][QUAD_SIZE]); + void *pipe; /*XXX temporary*/ + struct softpipe_tile_cache *cache; +}; + + +/** + * Run-time virtual machine state for executing TGSI shader. + */ +struct spu_exec_machine +{ + /* + * 32 program temporaries + * 4 internal temporaries + * 1 address + */ + struct spu_exec_vector Temps[TGSI_EXEC_NUM_TEMPS + + TGSI_EXEC_NUM_TEMP_EXTRAS + 1] + ALIGN16_ATTRIB; + + struct spu_exec_vector *Addrs; + + struct spu_sampler *Samplers; + + float Imms[TGSI_EXEC_NUM_IMMEDIATES][4]; + unsigned ImmLimit; + float (*Consts)[4]; + struct spu_exec_vector *Inputs; + struct spu_exec_vector *Outputs; + unsigned Processor; + + /* GEOMETRY processor only. */ + unsigned *Primitives; + + /* FRAGMENT processor only. */ + const struct spu_interp_coef *InterpCoefs; + struct spu_exec_vector QuadPos; + + /* Conditional execution masks */ + uint CondMask; /**< For IF/ELSE/ENDIF */ + uint LoopMask; /**< For BGNLOOP/ENDLOOP */ + uint ContMask; /**< For loop CONT statements */ + uint FuncMask; /**< For function calls */ + uint ExecMask; /**< = CondMask & LoopMask */ + + /** Condition mask stack (for nested conditionals) */ + uint CondStack[TGSI_EXEC_MAX_COND_NESTING]; + int CondStackTop; + + /** Loop mask stack (for nested loops) */ + uint LoopStack[TGSI_EXEC_MAX_LOOP_NESTING]; + int LoopStackTop; + + /** Loop continue mask stack (see comments in tgsi_exec.c) */ + uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING]; + int ContStackTop; + + /** Function execution mask stack (for executing subroutine code) */ + uint FuncStack[TGSI_EXEC_MAX_CALL_NESTING]; + int FuncStackTop; + + /** Function call stack for saving/restoring the program counter */ + uint CallStack[TGSI_EXEC_MAX_CALL_NESTING]; + int CallStackTop; + + struct tgsi_full_instruction *Instructions; + uint NumInstructions; + + struct tgsi_full_declaration *Declarations; + uint NumDeclarations; +}; + + +extern void +spu_exec_machine_init(struct spu_exec_machine *mach, + uint numSamplers, + struct spu_sampler *samplers, + unsigned processor); + +extern uint +spu_exec_machine_run( struct spu_exec_machine *mach ); + + +#if defined __cplusplus +} /* extern "C" */ +#endif + +#endif /* SPU_EXEC_H */ diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c new file mode 100644 index 00000000000..ff3d609d258 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_funcs.c @@ -0,0 +1,173 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * SPU functions accessed by shaders. + * + * Authors: Brian Paul + */ + + +#include <string.h> +#include <libmisc.h> +#include <math.h> +#include <cos14_v.h> +#include <sin14_v.h> +#include <simdmath/exp2f4.h> +#include <simdmath/log2f4.h> +#include <simdmath/powf4.h> + +#include "cell/common.h" +#include "spu_main.h" +#include "spu_funcs.h" +#include "spu_texture.h" + + +/** For "return"-ing four vectors */ +struct vec_4x4 +{ + vector float v[4]; +}; + + +static vector float +spu_cos(vector float x) +{ + return _cos14_v(x); +} + +static vector float +spu_sin(vector float x) +{ + return _sin14_v(x); +} + +static vector float +spu_pow(vector float x, vector float y) +{ + return _powf4(x, y); +} + +static vector float +spu_exp2(vector float x) +{ + return _exp2f4(x); +} + +static vector float +spu_log2(vector float x) +{ + return _log2f4(x); +} + + +static struct vec_4x4 +spu_tex_2d(vector float s, vector float t, vector float r, vector float q, + unsigned unit) +{ + struct vec_4x4 colors; + (void) r; + (void) q; + spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v); + return colors; +} + +static struct vec_4x4 +spu_tex_3d(vector float s, vector float t, vector float r, vector float q, + unsigned unit) +{ + struct vec_4x4 colors; + (void) r; + (void) q; + spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v); + return colors; +} + +static struct vec_4x4 +spu_tex_cube(vector float s, vector float t, vector float r, vector float q, + unsigned unit) +{ + struct vec_4x4 colors; + (void) q; + sample_texture_cube(s, t, r, unit, colors.v); + return colors; +} + + +/** + * Add named function to list of "exported" functions that will be + * made available to the PPU-hosted code generator. + */ +static void +export_func(struct cell_spu_function_info *spu_functions, + const char *name, void *addr) +{ + uint n = spu_functions->num; + ASSERT(strlen(name) < 16); + strcpy(spu_functions->names[n], name); + spu_functions->addrs[n] = (uint) addr; + spu_functions->num++; + ASSERT(spu_functions->num <= 16); +} + + +/** + * Return info about the SPU's function to the PPU / main memory. + * The PPU needs to know the address of some SPU-side functions so + * that we can generate shader code with function calls. + */ +void +return_function_info(void) +{ + struct cell_spu_function_info funcs ALIGN16_ATTRIB; + int tag = TAG_MISC; + + ASSERT(sizeof(funcs) == 256); /* must be multiple of 16 bytes */ + + funcs.num = 0; + export_func(&funcs, "spu_cos", &spu_cos); + export_func(&funcs, "spu_sin", &spu_sin); + export_func(&funcs, "spu_pow", &spu_pow); + export_func(&funcs, "spu_exp2", &spu_exp2); + export_func(&funcs, "spu_log2", &spu_log2); + export_func(&funcs, "spu_tex_2d", &spu_tex_2d); + export_func(&funcs, "spu_tex_3d", &spu_tex_3d); + export_func(&funcs, "spu_tex_cube", &spu_tex_cube); + + /* Send the function info back to the PPU / main memory */ + mfc_put((void *) &funcs, /* src in local store */ + (unsigned int) spu.init.spu_functions, /* dst in main memory */ + sizeof(funcs), /* bytes */ + tag, + 0, /* tid */ + 0 /* rid */); + wait_on_mask(1 << tag); +} + + + diff --git a/src/gallium/drivers/cell/spu/spu_funcs.h b/src/gallium/drivers/cell/spu/spu_funcs.h new file mode 100644 index 00000000000..3adb6ae99f9 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_funcs.h @@ -0,0 +1,35 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef SPU_FUNCS_H +#define SPU_FUNCS_H + +extern void +return_function_info(void); + +#endif + diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c new file mode 100644 index 00000000000..97c86d194da --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -0,0 +1,117 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/* main() for Cell SPU code */ + + +#include <stdio.h> +#include <libmisc.h> + +#include "pipe/p_defines.h" + +#include "spu_funcs.h" +#include "spu_command.h" +#include "spu_main.h" +#include "spu_per_fragment_op.h" +#include "spu_texture.h" +//#include "spu_test.h" +#include "cell/common.h" + + +/* +helpful headers: +/usr/lib/gcc/spu/4.1.1/include/spu_mfcio.h +/opt/cell/sdk/usr/include/libmisc.h +*/ + +struct spu_global spu; + + +static void +one_time_init(void) +{ + memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status)); + memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status)); + invalidate_tex_cache(); +} + +/* In some versions of the SDK the SPE main takes 'unsigned long' as a + * parameter. In others it takes 'unsigned long long'. Use a define to + * select between the two. + */ +#ifdef SPU_MAIN_PARAM_LONG_LONG +typedef unsigned long long main_param_t; +#else +typedef unsigned long main_param_t; +#endif + +/** + * SPE entrypoint. + */ +int +main(main_param_t speid, main_param_t argp) +{ + int tag = 0; + + (void) speid; + + ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4); + ASSERT(sizeof(struct cell_command_render) % 8 == 0); + ASSERT(sizeof(struct cell_command_fragment_ops) % 8 == 0); + ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0); + + one_time_init(); + spu_command_init(); + + D_PRINTF(CELL_DEBUG_CMD, "main() speid=%lu\n", (unsigned long) speid); + D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n"); + + /* get initialization data */ + mfc_get(&spu.init, /* dest */ + (unsigned int) argp, /* src */ + sizeof(struct cell_init_info), /* bytes */ + tag, + 0, /* tid */ + 0 /* rid */); + wait_on_mask( 1 << tag ); + + if (spu.init.id == 0) { + return_function_info(); + } + +#if 0 + if (spu.init.id==0) + spu_test_misc(spu.init.id); +#endif + + command_loop(); + + spu_command_close(); + + return 0; +} diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h new file mode 100644 index 00000000000..33767e7c51d --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -0,0 +1,254 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef SPU_MAIN_H +#define SPU_MAIN_H + + +#include <spu_mfcio.h> + +#include "cell/common.h" +#include "draw/draw_vertex.h" +#include "pipe/p_state.h" + + +#if DEBUG +/* These debug macros use the unusual construction ", ##__VA_ARGS__" + * which expands to the expected comma + args if variadic arguments + * are supplied, but swallows the comma if there are no variadic + * arguments (which avoids syntax errors that would otherwise occur). + */ +#define D_PRINTF(flag, format,...) \ + if (spu.init.debug_flags & (flag)) \ + printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__) +#else +#define D_PRINTF(...) +#endif + + +/** + * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels. + * The data may be addressed through several different types. + */ +typedef union { + ushort us[TILE_SIZE][TILE_SIZE]; + uint ui[TILE_SIZE][TILE_SIZE]; + vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4]; + vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2]; +} tile_t; + + +#define TILE_STATUS_CLEAR 1 +#define TILE_STATUS_DEFINED 2 /**< defined in FB, but not in local store */ +#define TILE_STATUS_CLEAN 3 /**< in local store, but not changed */ +#define TILE_STATUS_DIRTY 4 /**< modified locally, but not put back yet */ +#define TILE_STATUS_GETTING 5 /**< mfc_get() called but not yet arrived */ + + +/** Function for sampling textures */ +typedef void (*spu_sample_texture_2d_func)(vector float s, + vector float t, + uint unit, uint level, uint face, + vector float colors[4]); + + +/** Function for performing per-fragment ops */ +typedef void (*spu_fragment_ops_func)(uint x, uint y, + tile_t *colorTile, + tile_t *depthStencilTile, + vector float fragZ, + vector float fragRed, + vector float fragGreen, + vector float fragBlue, + vector float fragAlpha, + vector unsigned int mask); + +/** Function for running fragment program */ +typedef vector unsigned int (*spu_fragment_program_func)(vector float *inputs, + vector float *outputs, + vector float *constants); + + +struct spu_framebuffer +{ + void *color_start; /**< addr of color surface in main memory */ + void *depth_start; /**< addr of depth surface in main memory */ + enum pipe_format color_format; + enum pipe_format depth_format; + uint width, height; /**< size in pixels */ + uint width_tiles, height_tiles; /**< width and height in tiles */ + + uint color_clear_value; + uint depth_clear_value; + + uint zsize; /**< 0, 2 or 4 bytes per Z */ + float zscale; /**< 65535.0, 2^24-1 or 2^32-1 */ +} ALIGN16_ATTRIB; + + +/** per-texture level info */ +struct spu_texture_level +{ + void *start; + ushort width, height, depth; + ushort tiles_per_row; + uint bytes_per_image; + /** texcoord scale factors */ + vector float scale_s, scale_t, scale_r; + /** texcoord masks (if REPEAT then size-1, else ~0) */ + vector signed int mask_s, mask_t, mask_r; + /** texcoord clamp limits */ + vector signed int max_s, max_t, max_r; +} ALIGN16_ATTRIB; + + +struct spu_texture +{ + struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS]; + uint max_level; + uint target; /**< PIPE_TEXTURE_x */ +} ALIGN16_ATTRIB; + + +/** + * All SPU global/context state will be in a singleton object of this type: + */ +struct spu_global +{ + /** One-time init/constant info */ + struct cell_init_info init; + + /* + * Current state + */ + struct spu_framebuffer fb; + struct pipe_depth_stencil_alpha_state depth_stencil_alpha; + struct pipe_blend_state blend; + struct pipe_blend_color blend_color; + struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS]; + struct pipe_rasterizer_state rasterizer; + struct spu_texture texture[PIPE_MAX_SAMPLERS]; + struct vertex_info vertex_info; + + /** Current color and Z tiles */ + tile_t ctile ALIGN16_ATTRIB; + tile_t ztile ALIGN16_ATTRIB; + + /** Read depth/stencil tiles? */ + boolean read_depth_stencil; + + /** Current tiles' status */ + ubyte cur_ctile_status, cur_ztile_status; + + /** Status of all tiles in framebuffer */ + ubyte ctile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; + ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; + + /** Current fragment ops machine code, at 8-byte boundary */ + uint *fragment_ops_code; + uint fragment_ops_code_size; + /** Current fragment ops functions, 0 = frontfacing, 1 = backfacing */ + spu_fragment_ops_func fragment_ops[2]; + + /** Current fragment program machine code, at 8-byte boundary */ + uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN8_ATTRIB; + /** Current fragment ops function */ + spu_fragment_program_func fragment_program; + + /** Current texture sampler function */ + spu_sample_texture_2d_func sample_texture_2d[CELL_MAX_SAMPLERS]; + spu_sample_texture_2d_func min_sample_texture_2d[CELL_MAX_SAMPLERS]; + spu_sample_texture_2d_func mag_sample_texture_2d[CELL_MAX_SAMPLERS]; + + /** Fragment program constants */ + vector float constants[4 * CELL_MAX_CONSTANTS]; + +} ALIGN16_ATTRIB; + + +extern struct spu_global spu; + + + +/* DMA TAGS */ + +#define TAG_SURFACE_CLEAR 10 +#define TAG_VERTEX_BUFFER 11 +#define TAG_READ_TILE_COLOR 12 +#define TAG_READ_TILE_Z 13 +#define TAG_WRITE_TILE_COLOR 14 +#define TAG_WRITE_TILE_Z 15 +#define TAG_INDEX_BUFFER 16 +#define TAG_BATCH_BUFFER 17 +#define TAG_MISC 18 +#define TAG_DCACHE0 20 +#define TAG_DCACHE1 21 +#define TAG_DCACHE2 22 +#define TAG_DCACHE3 23 +#define TAG_FENCE 24 + + +static INLINE void +wait_on_mask(unsigned tagMask) +{ + mfc_write_tag_mask( tagMask ); + /* wait for completion of _any_ DMAs specified by tagMask */ + mfc_read_tag_status_any(); +} + + +static INLINE void +wait_on_mask_all(unsigned tagMask) +{ + mfc_write_tag_mask( tagMask ); + /* wait for completion of _any_ DMAs specified by tagMask */ + mfc_read_tag_status_all(); +} + + + + + +static INLINE void +memset16(ushort *d, ushort value, uint count) +{ + uint i; + for (i = 0; i < count; i++) + d[i] = value; +} + + +static INLINE void +memset32(uint *d, uint value, uint count) +{ + uint i; + for (i = 0; i < count; i++) + d[i] = value; +} + + +#endif /* SPU_MAIN_H */ diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c new file mode 100644 index 00000000000..eba9f95cf1f --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c @@ -0,0 +1,631 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \author Brian Paul + */ + + +#include <transpose_matrix4x4.h> +#include "pipe/p_format.h" +#include "spu_main.h" +#include "spu_colorpack.h" +#include "spu_per_fragment_op.h" + + +#define LINEAR_QUAD_LAYOUT 1 + + +static INLINE vector float +spu_min(vector float a, vector float b) +{ + vector unsigned int m; + m = spu_cmpgt(a, b); /* m = a > b ? ~0 : 0 */ + return spu_sel(a, b, m); +} + + +static INLINE vector float +spu_max(vector float a, vector float b) +{ + vector unsigned int m; + m = spu_cmpgt(a, b); /* m = a > b ? ~0 : 0 */ + return spu_sel(b, a, m); +} + + +/** + * Called by rasterizer for each quad after the shader has run. Do + * all the per-fragment operations including alpha test, z test, + * stencil test, blend, colormask and logicops. This is a + * fallback/debug function. In reality we'll use a generated function + * produced by the PPU. But this function is useful for + * debug/validation. + */ +void +spu_fallback_fragment_ops(uint x, uint y, + tile_t *colorTile, + tile_t *depthStencilTile, + vector float fragZ, + vector float fragR, + vector float fragG, + vector float fragB, + vector float fragA, + vector unsigned int mask) +{ + vector float frag_aos[4]; + unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */ + unsigned int fragc0, fragc1, fragc2, fragc3; /* fragment colors */ + + /* + * Do alpha test + */ + if (spu.depth_stencil_alpha.alpha.enabled) { + vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref_value); + vector unsigned int amask; + + switch (spu.depth_stencil_alpha.alpha.func) { + case PIPE_FUNC_LESS: + amask = spu_cmpgt(ref, fragA); /* mask = (fragA < ref) */ + break; + case PIPE_FUNC_GREATER: + amask = spu_cmpgt(fragA, ref); /* mask = (fragA > ref) */ + break; + case PIPE_FUNC_GEQUAL: + amask = spu_cmpgt(ref, fragA); + amask = spu_nor(amask, amask); + break; + case PIPE_FUNC_LEQUAL: + amask = spu_cmpgt(fragA, ref); + amask = spu_nor(amask, amask); + break; + case PIPE_FUNC_EQUAL: + amask = spu_cmpeq(ref, fragA); + break; + case PIPE_FUNC_NOTEQUAL: + amask = spu_cmpeq(ref, fragA); + amask = spu_nor(amask, amask); + break; + case PIPE_FUNC_ALWAYS: + amask = spu_splats(0xffffffffU); + break; + case PIPE_FUNC_NEVER: + amask = spu_splats( 0x0U); + break; + default: + ; + } + + mask = spu_and(mask, amask); + } + + + /* + * Z and/or stencil testing... + */ + if (spu.depth_stencil_alpha.depth.enabled || + spu.depth_stencil_alpha.stencil[0].enabled) { + + /* get four Z/Stencil values from tile */ + vector unsigned int mask24 = spu_splats((unsigned int)0x00ffffffU); + vector unsigned int ifbZS = depthStencilTile->ui4[y/2][x/2]; + vector unsigned int ifbZ = spu_and(ifbZS, mask24); + vector unsigned int ifbS = spu_andc(ifbZS, mask24); + + if (spu.depth_stencil_alpha.stencil[0].enabled) { + /* do stencil test */ + ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM); + + } + else if (spu.depth_stencil_alpha.depth.enabled) { + /* do depth test */ + + ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM || + spu.fb.depth_format == PIPE_FORMAT_X8Z24_UNORM); + + vector unsigned int ifragZ; + vector unsigned int zmask; + + /* convert four fragZ from float to uint */ + fragZ = spu_mul(fragZ, spu_splats((float) 0xffffff)); + ifragZ = spu_convtu(fragZ, 0); + + /* do depth comparison, setting zmask with results */ + switch (spu.depth_stencil_alpha.depth.func) { + case PIPE_FUNC_LESS: + zmask = spu_cmpgt(ifbZ, ifragZ); /* mask = (ifragZ < ifbZ) */ + break; + case PIPE_FUNC_GREATER: + zmask = spu_cmpgt(ifragZ, ifbZ); /* mask = (ifbZ > ifragZ) */ + break; + case PIPE_FUNC_GEQUAL: + zmask = spu_cmpgt(ifbZ, ifragZ); + zmask = spu_nor(zmask, zmask); + break; + case PIPE_FUNC_LEQUAL: + zmask = spu_cmpgt(ifragZ, ifbZ); + zmask = spu_nor(zmask, zmask); + break; + case PIPE_FUNC_EQUAL: + zmask = spu_cmpeq(ifbZ, ifragZ); + break; + case PIPE_FUNC_NOTEQUAL: + zmask = spu_cmpeq(ifbZ, ifragZ); + zmask = spu_nor(zmask, zmask); + break; + case PIPE_FUNC_ALWAYS: + zmask = spu_splats(0xffffffffU); + break; + case PIPE_FUNC_NEVER: + zmask = spu_splats( 0x0U); + break; + default: + ; + } + + mask = spu_and(mask, zmask); + + /* merge framebuffer Z and fragment Z according to the mask */ + ifbZ = spu_or(spu_and(ifragZ, mask), + spu_andc(ifbZ, mask)); + } + + if (spu_extract(spu_orx(mask), 0)) { + /* put new fragment Z/Stencil values back into Z/Stencil tile */ + depthStencilTile->ui4[y/2][x/2] = spu_or(ifbZ, ifbS); + + spu.cur_ztile_status = TILE_STATUS_DIRTY; + } + } + + + /* + * If we'll need the current framebuffer/tile colors for blending + * or logicop or colormask, fetch them now. + */ + if (spu.blend.blend_enable || + spu.blend.logicop_enable || + spu.blend.colormask != 0xf) { + +#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */ + fbc0 = colorTile->ui[y][x*2+0]; + fbc1 = colorTile->ui[y][x*2+1]; + fbc2 = colorTile->ui[y][x*2+2]; + fbc3 = colorTile->ui[y][x*2+3]; +#else + fbc0 = colorTile->ui[y+0][x+0]; + fbc1 = colorTile->ui[y+0][x+1]; + fbc2 = colorTile->ui[y+1][x+0]; + fbc3 = colorTile->ui[y+1][x+1]; +#endif + } + + + /* + * Do blending + */ + if (spu.blend.blend_enable) { + /* blending terms, misc regs */ + vector float term1r, term1g, term1b, term1a; + vector float term2r, term2g, term2b, term2a; + vector float one, tmp; + + vector float fbRGBA[4]; /* current framebuffer colors */ + + /* convert framebuffer colors from packed int to vector float */ + { + vector float temp[4]; /* float colors in AOS form */ + switch (spu.fb.color_format) { + case PIPE_FORMAT_B8G8R8A8_UNORM: + temp[0] = spu_unpack_B8G8R8A8(fbc0); + temp[1] = spu_unpack_B8G8R8A8(fbc1); + temp[2] = spu_unpack_B8G8R8A8(fbc2); + temp[3] = spu_unpack_B8G8R8A8(fbc3); + break; + case PIPE_FORMAT_A8R8G8B8_UNORM: + temp[0] = spu_unpack_A8R8G8B8(fbc0); + temp[1] = spu_unpack_A8R8G8B8(fbc1); + temp[2] = spu_unpack_A8R8G8B8(fbc2); + temp[3] = spu_unpack_A8R8G8B8(fbc3); + break; + default: + ASSERT(0); + } + _transpose_matrix4x4(fbRGBA, temp); /* fbRGBA = transpose(temp) */ + } + + /* + * Compute Src RGB terms (fragment color * factor) + */ + switch (spu.blend.rgb_src_factor) { + case PIPE_BLENDFACTOR_ONE: + term1r = fragR; + term1g = fragG; + term1b = fragB; + break; + case PIPE_BLENDFACTOR_ZERO: + term1r = + term1g = + term1b = spu_splats(0.0f); + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + term1r = spu_mul(fragR, fragR); + term1g = spu_mul(fragG, fragG); + term1b = spu_mul(fragB, fragB); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + term1r = spu_mul(fragR, fragA); + term1g = spu_mul(fragG, fragA); + term1b = spu_mul(fragB, fragA); + break; + case PIPE_BLENDFACTOR_DST_COLOR: + term1r = spu_mul(fragR, fbRGBA[0]); + term1g = spu_mul(fragG, fbRGBA[1]); + term1b = spu_mul(fragB, fbRGBA[1]); + break; + case PIPE_BLENDFACTOR_DST_ALPHA: + term1r = spu_mul(fragR, fbRGBA[3]); + term1g = spu_mul(fragG, fbRGBA[3]); + term1b = spu_mul(fragB, fbRGBA[3]); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[0])); + term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[1])); + term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[2])); + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[3])); + term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[3])); + term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[3])); + break; + /* XXX more cases */ + default: + ASSERT(0); + } + + /* + * Compute Src Alpha term (fragment alpha * factor) + */ + switch (spu.blend.alpha_src_factor) { + case PIPE_BLENDFACTOR_ONE: + term1a = fragA; + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + term1a = spu_splats(0.0f); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + term1a = spu_mul(fragA, fragA); + break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_DST_ALPHA: + term1a = spu_mul(fragA, fbRGBA[3]); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_CONST_ALPHA: + term1a = spu_mul(fragR, spu_splats(spu.blend_color.color[3])); + break; + /* XXX more cases */ + default: + ASSERT(0); + } + + /* + * Compute Dest RGB terms (framebuffer color * factor) + */ + switch (spu.blend.rgb_dst_factor) { + case PIPE_BLENDFACTOR_ONE: + term2r = fbRGBA[0]; + term2g = fbRGBA[1]; + term2b = fbRGBA[2]; + break; + case PIPE_BLENDFACTOR_ZERO: + term2r = + term2g = + term2b = spu_splats(0.0f); + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + term2r = spu_mul(fbRGBA[0], fragR); + term2g = spu_mul(fbRGBA[1], fragG); + term2b = spu_mul(fbRGBA[2], fragB); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + term2r = spu_mul(fbRGBA[0], fragA); + term2g = spu_mul(fbRGBA[1], fragA); + term2b = spu_mul(fbRGBA[2], fragA); + break; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + one = spu_splats(1.0f); + tmp = spu_sub(one, fragA); + term2r = spu_mul(fbRGBA[0], tmp); + term2g = spu_mul(fbRGBA[1], tmp); + term2b = spu_mul(fbRGBA[2], tmp); + break; + case PIPE_BLENDFACTOR_DST_COLOR: + term2r = spu_mul(fbRGBA[0], fbRGBA[0]); + term2g = spu_mul(fbRGBA[1], fbRGBA[1]); + term2b = spu_mul(fbRGBA[2], fbRGBA[2]); + break; + case PIPE_BLENDFACTOR_DST_ALPHA: + term2r = spu_mul(fbRGBA[0], fbRGBA[3]); + term2g = spu_mul(fbRGBA[1], fbRGBA[3]); + term2b = spu_mul(fbRGBA[2], fbRGBA[3]); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[0])); + term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[1])); + term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[2])); + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[3])); + term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[3])); + term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[3])); + break; + /* XXX more cases */ + default: + ASSERT(0); + } + + /* + * Compute Dest Alpha term (framebuffer alpha * factor) + */ + switch (spu.blend.alpha_dst_factor) { + case PIPE_BLENDFACTOR_ONE: + term2a = fbRGBA[3]; + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + term2a = spu_splats(0.0f); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + term2a = spu_mul(fbRGBA[3], fragA); + break; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + one = spu_splats(1.0f); + tmp = spu_sub(one, fragA); + term2a = spu_mul(fbRGBA[3], tmp); + break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_DST_ALPHA: + term2a = spu_mul(fbRGBA[3], fbRGBA[3]); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_CONST_ALPHA: + term2a = spu_mul(fbRGBA[3], spu_splats(spu.blend_color.color[3])); + break; + /* XXX more cases */ + default: + ASSERT(0); + } + + /* + * Combine Src/Dest RGB terms + */ + switch (spu.blend.rgb_func) { + case PIPE_BLEND_ADD: + fragR = spu_add(term1r, term2r); + fragG = spu_add(term1g, term2g); + fragB = spu_add(term1b, term2b); + break; + case PIPE_BLEND_SUBTRACT: + fragR = spu_sub(term1r, term2r); + fragG = spu_sub(term1g, term2g); + fragB = spu_sub(term1b, term2b); + break; + case PIPE_BLEND_REVERSE_SUBTRACT: + fragR = spu_sub(term2r, term1r); + fragG = spu_sub(term2g, term1g); + fragB = spu_sub(term2b, term1b); + break; + case PIPE_BLEND_MIN: + fragR = spu_min(term1r, term2r); + fragG = spu_min(term1g, term2g); + fragB = spu_min(term1b, term2b); + break; + case PIPE_BLEND_MAX: + fragR = spu_max(term1r, term2r); + fragG = spu_max(term1g, term2g); + fragB = spu_max(term1b, term2b); + break; + default: + ASSERT(0); + } + + /* + * Combine Src/Dest A term + */ + switch (spu.blend.alpha_func) { + case PIPE_BLEND_ADD: + fragA = spu_add(term1a, term2a); + break; + case PIPE_BLEND_SUBTRACT: + fragA = spu_sub(term1a, term2a); + break; + case PIPE_BLEND_REVERSE_SUBTRACT: + fragA = spu_sub(term2a, term1a); + break; + case PIPE_BLEND_MIN: + fragA = spu_min(term1a, term2a); + break; + case PIPE_BLEND_MAX: + fragA = spu_max(term1a, term2a); + break; + default: + ASSERT(0); + } + } + + + /* + * Convert RRRR,GGGG,BBBB,AAAA to RGBA,RGBA,RGBA,RGBA. + */ +#if 0 + /* original code */ + { + vector float frag_soa[4]; + frag_soa[0] = fragR; + frag_soa[1] = fragG; + frag_soa[2] = fragB; + frag_soa[3] = fragA; + _transpose_matrix4x4(frag_aos, frag_soa); + } +#else + /* short-cut relying on function parameter layout: */ + _transpose_matrix4x4(frag_aos, &fragR); + (void) fragG; + (void) fragB; +#endif + + /* + * Pack fragment float colors into 32-bit RGBA words. + */ + switch (spu.fb.color_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + fragc0 = spu_pack_A8R8G8B8(frag_aos[0]); + fragc1 = spu_pack_A8R8G8B8(frag_aos[1]); + fragc2 = spu_pack_A8R8G8B8(frag_aos[2]); + fragc3 = spu_pack_A8R8G8B8(frag_aos[3]); + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + fragc0 = spu_pack_B8G8R8A8(frag_aos[0]); + fragc1 = spu_pack_B8G8R8A8(frag_aos[1]); + fragc2 = spu_pack_B8G8R8A8(frag_aos[2]); + fragc3 = spu_pack_B8G8R8A8(frag_aos[3]); + break; + default: + fprintf(stderr, "SPU: Bad pixel format in spu_default_fragment_ops\n"); + ASSERT(0); + } + + + /* + * Do color masking + */ + if (spu.blend.colormask != 0xf) { + uint cmask = 0x0; /* each byte corresponds to a color channel */ + + /* Form bitmask depending on color buffer format and colormask bits */ + switch (spu.fb.color_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + if (spu.blend.colormask & PIPE_MASK_R) + cmask |= 0x00ff0000; /* red */ + if (spu.blend.colormask & PIPE_MASK_G) + cmask |= 0x0000ff00; /* green */ + if (spu.blend.colormask & PIPE_MASK_B) + cmask |= 0x000000ff; /* blue */ + if (spu.blend.colormask & PIPE_MASK_A) + cmask |= 0xff000000; /* alpha */ + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + if (spu.blend.colormask & PIPE_MASK_R) + cmask |= 0x0000ff00; /* red */ + if (spu.blend.colormask & PIPE_MASK_G) + cmask |= 0x00ff0000; /* green */ + if (spu.blend.colormask & PIPE_MASK_B) + cmask |= 0xff000000; /* blue */ + if (spu.blend.colormask & PIPE_MASK_A) + cmask |= 0x000000ff; /* alpha */ + break; + default: + ASSERT(0); + } + + /* + * Apply color mask to the 32-bit packed colors. + * if (cmask[i]) + * frag color[i] = frag color[i]; + * else + * frag color[i] = framebuffer color[i]; + */ + fragc0 = (fragc0 & cmask) | (fbc0 & ~cmask); + fragc1 = (fragc1 & cmask) | (fbc1 & ~cmask); + fragc2 = (fragc2 & cmask) | (fbc2 & ~cmask); + fragc3 = (fragc3 & cmask) | (fbc3 & ~cmask); + } + + + /* + * Do logic ops + */ + if (spu.blend.logicop_enable) { + /* XXX to do */ + /* apply logicop to 32-bit packed colors (fragcx and fbcx) */ + } + + + /* + * If mask is non-zero, mark tile as dirty. + */ + if (spu_extract(spu_orx(mask), 0)) { + spu.cur_ctile_status = TILE_STATUS_DIRTY; + } + else { + /* write no fragments */ + return; + } + + + /* + * Write new fragment/quad colors to the framebuffer/tile. + * Only write pixels where the corresponding mask word is set. + */ +#if LINEAR_QUAD_LAYOUT + /* + * Quad layout: + * +--+--+--+--+ + * |p0|p1|p2|p3|... + * +--+--+--+--+ + */ + if (spu_extract(mask, 0)) + colorTile->ui[y][x*2] = fragc0; + if (spu_extract(mask, 1)) + colorTile->ui[y][x*2+1] = fragc1; + if (spu_extract(mask, 2)) + colorTile->ui[y][x*2+2] = fragc2; + if (spu_extract(mask, 3)) + colorTile->ui[y][x*2+3] = fragc3; +#else + /* + * Quad layout: + * +--+--+ + * |p0|p1|... + * +--+--+ + * |p2|p3|... + * +--+--+ + */ + if (spu_extract(mask, 0)) + colorTile->ui[y+0][x+0] = fragc0; + if (spu_extract(mask, 1)) + colorTile->ui[y+0][x+1] = fragc1; + if (spu_extract(mask, 2)) + colorTile->ui[y+1][x+0] = fragc2; + if (spu_extract(mask, 3)) + colorTile->ui[y+1][x+1] = fragc3; +#endif +} diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h new file mode 100644 index 00000000000..f817abf0463 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h @@ -0,0 +1,44 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef SPU_PER_FRAGMENT_OP +#define SPU_PER_FRAGMENT_OP + + +extern void +spu_fallback_fragment_ops(uint x, uint y, + tile_t *colorTile, + tile_t *depthStencilTile, + vector float fragZ, + vector float fragRed, + vector float fragGreen, + vector float fragBlue, + vector float fragAlpha, + vector unsigned int mask); + + +#endif /* SPU_PER_FRAGMENT_OP */ diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c new file mode 100644 index 00000000000..7c225e2f27c --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_render.c @@ -0,0 +1,295 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include <stdio.h> +#include <libmisc.h> +#include <spu_mfcio.h> + +#include "spu_main.h" +#include "spu_render.h" +#include "spu_tri.h" +#include "spu_tile.h" +#include "cell/common.h" +#include "util/u_memory.h" + + +/** + * Given a rendering command's bounding box (in pixels) compute the + * location of the corresponding screen tile bounding box. + */ +static INLINE void +tile_bounding_box(const struct cell_command_render *render, + uint *txmin, uint *tymin, + uint *box_num_tiles, uint *box_width_tiles) +{ +#if 0 + /* Debug: full-window bounding box */ + uint txmax = spu.fb.width_tiles - 1; + uint tymax = spu.fb.height_tiles - 1; + *txmin = 0; + *tymin = 0; + *box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles; + *box_width_tiles = spu.fb.width_tiles; + (void) render; + (void) txmax; + (void) tymax; +#else + uint txmax, tymax, box_height_tiles; + + *txmin = (uint) render->xmin / TILE_SIZE; + *tymin = (uint) render->ymin / TILE_SIZE; + txmax = (uint) render->xmax / TILE_SIZE; + tymax = (uint) render->ymax / TILE_SIZE; + if (txmax >= spu.fb.width_tiles) + txmax = spu.fb.width_tiles-1; + if (tymax >= spu.fb.height_tiles) + tymax = spu.fb.height_tiles-1; + *box_width_tiles = txmax - *txmin + 1; + box_height_tiles = tymax - *tymin + 1; + *box_num_tiles = *box_width_tiles * box_height_tiles; +#endif +#if 0 + printf("SPU %u: bounds: %g, %g ... %g, %g\n", spu.init.id, + render->xmin, render->ymin, render->xmax, render->ymax); + printf("SPU %u: tiles: %u, %u .. %u, %u\n", + spu.init.id, *txmin, *tymin, txmax, tymax); + ASSERT(render->xmin <= render->xmax); + ASSERT(render->ymin <= render->ymax); +#endif +} + + +/** Check if the tile at (tx,ty) belongs to this SPU */ +static INLINE boolean +my_tile(uint tx, uint ty) +{ + return (spu.fb.width_tiles * ty + tx) % spu.init.num_spus == spu.init.id; +} + + +/** + * Start fetching non-clear color/Z tiles from main memory + */ +static INLINE void +get_cz_tiles(uint tx, uint ty) +{ + if (spu.read_depth_stencil) { + if (spu.cur_ztile_status != TILE_STATUS_CLEAR) { + //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty); + get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1); + spu.cur_ztile_status = TILE_STATUS_GETTING; + } + } + + if (spu.cur_ctile_status != TILE_STATUS_CLEAR) { + //printf("SPU %u: getting C tile %u, %u\n", spu.init.id, tx, ty); + get_tile(tx, ty, &spu.ctile, TAG_READ_TILE_COLOR, 0); + spu.cur_ctile_status = TILE_STATUS_GETTING; + } +} + + +/** + * Start putting dirty color/Z tiles back to main memory + */ +static INLINE void +put_cz_tiles(uint tx, uint ty) +{ + if (spu.cur_ztile_status == TILE_STATUS_DIRTY) { + /* tile was modified and needs to be written back */ + //printf("SPU %u: put dirty Z tile %u, %u\n", spu.init.id, tx, ty); + put_tile(tx, ty, &spu.ztile, TAG_WRITE_TILE_Z, 1); + spu.cur_ztile_status = TILE_STATUS_DEFINED; + } + else if (spu.cur_ztile_status == TILE_STATUS_GETTING) { + /* tile was never used */ + spu.cur_ztile_status = TILE_STATUS_DEFINED; + //printf("SPU %u: put getting Z tile %u, %u\n", spu.init.id, tx, ty); + } + + if (spu.cur_ctile_status == TILE_STATUS_DIRTY) { + /* tile was modified and needs to be written back */ + //printf("SPU %u: put dirty C tile %u, %u\n", spu.init.id, tx, ty); + put_tile(tx, ty, &spu.ctile, TAG_WRITE_TILE_COLOR, 0); + spu.cur_ctile_status = TILE_STATUS_DEFINED; + } + else if (spu.cur_ctile_status == TILE_STATUS_GETTING) { + /* tile was never used */ + spu.cur_ctile_status = TILE_STATUS_DEFINED; + //printf("SPU %u: put getting C tile %u, %u\n", spu.init.id, tx, ty); + } +} + + +/** + * Wait for 'put' of color/z tiles to complete. + */ +static INLINE void +wait_put_cz_tiles(void) +{ + wait_on_mask(1 << TAG_WRITE_TILE_COLOR); + if (spu.read_depth_stencil) { + wait_on_mask(1 << TAG_WRITE_TILE_Z); + } +} + + +/** + * Render primitives + * \param pos_incr returns value indicating how may words to skip after + * this command in the batch buffer + */ +void +cmd_render(const struct cell_command_render *render, uint *pos_incr) +{ + /* we'll DMA into these buffers */ + ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB; + const uint vertex_size = render->vertex_size; /* in bytes */ + /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size; + uint index_bytes; + const ubyte *vertices; + const ushort *indexes; + uint i, j; + uint num_tiles; + + D_PRINTF(CELL_DEBUG_CMD, + "RENDER prim=%u num_vert=%u num_ind=%u inline_vert=%u\n", + render->prim_type, + render->num_verts, + render->num_indexes, + render->inline_verts); + + ASSERT(sizeof(*render) % 4 == 0); + ASSERT(total_vertex_bytes % 16 == 0); + ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES); + ASSERT(render->num_indexes % 3 == 0); + + + /* indexes are right after the render command in the batch buffer */ + indexes = (const ushort *) (render + 1); + index_bytes = ROUNDUP8(render->num_indexes * 2); + *pos_incr = index_bytes / 8 + sizeof(*render) / 8; + + + if (render->inline_verts) { + /* Vertices are after indexes in batch buffer at next 16-byte addr */ + vertices = (const ubyte *) render + (*pos_incr * 8); + vertices = (const ubyte *) align_pointer((void *) vertices, 16); + ASSERT_ALIGN16(vertices); + *pos_incr = ((vertices + total_vertex_bytes) - (ubyte *) render) / 8; + } + else { + /* Begin DMA fetch of vertex buffer */ + ubyte *src = spu.init.buffers[render->vertex_buf]; + ubyte *dest = vertex_data; + + /* skip vertex data we won't use */ +#if 01 + src += render->min_index * vertex_size; + dest += render->min_index * vertex_size; + total_vertex_bytes -= render->min_index * vertex_size; +#endif + ASSERT(total_vertex_bytes % 16 == 0); + ASSERT_ALIGN16(dest); + ASSERT_ALIGN16(src); + + mfc_get(dest, /* in vertex_data[] array */ + (unsigned int) src, /* src in main memory */ + total_vertex_bytes, /* size */ + TAG_VERTEX_BUFFER, + 0, /* tid */ + 0 /* rid */); + + vertices = vertex_data; + + wait_on_mask(1 << TAG_VERTEX_BUFFER); + } + + + /** + ** find tiles which intersect the prim bounding box + **/ + uint txmin, tymin, box_width_tiles, box_num_tiles; + tile_bounding_box(render, &txmin, &tymin, + &box_num_tiles, &box_width_tiles); + + + /* make sure any pending clears have completed */ + wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */ + + + num_tiles = 0; + + /** + ** loop over tiles, rendering tris + **/ + for (i = 0; i < box_num_tiles; i++) { + const uint tx = txmin + i % box_width_tiles; + const uint ty = tymin + i / box_width_tiles; + + ASSERT(tx < spu.fb.width_tiles); + ASSERT(ty < spu.fb.height_tiles); + + if (!my_tile(tx, ty)) + continue; + + num_tiles++; + + spu.cur_ctile_status = spu.ctile_status[ty][tx]; + spu.cur_ztile_status = spu.ztile_status[ty][tx]; + + get_cz_tiles(tx, ty); + + uint drawn = 0; + + /* loop over tris */ + for (j = 0; j < render->num_indexes; j += 3) { + const float *v0, *v1, *v2; + + v0 = (const float *) (vertices + indexes[j+0] * vertex_size); + v1 = (const float *) (vertices + indexes[j+1] * vertex_size); + v2 = (const float *) (vertices + indexes[j+2] * vertex_size); + + drawn += tri_draw(v0, v1, v2, tx, ty); + } + + //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3); + + /* write color/z tiles back to main framebuffer, if dirtied */ + put_cz_tiles(tx, ty); + + wait_put_cz_tiles(); /* XXX seems unnecessary... */ + + spu.ctile_status[ty][tx] = spu.cur_ctile_status; + spu.ztile_status[ty][tx] = spu.cur_ztile_status; + } + + D_PRINTF(CELL_DEBUG_CMD, + "RENDER done (%u tiles hit)\n", + num_tiles); +} diff --git a/src/gallium/drivers/cell/spu/spu_render.h b/src/gallium/drivers/cell/spu/spu_render.h new file mode 100644 index 00000000000..493434f0878 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_render.h @@ -0,0 +1,38 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef SPU_RENDER_H +#define SPU_RENDER_H + +#include "cell/common.h" + +extern void +cmd_render(const struct cell_command_render *render, uint *pos_incr); + +#endif /* SPU_RENDER_H */ + diff --git a/src/gallium/drivers/cell/spu/spu_shuffle.h b/src/gallium/drivers/cell/spu/spu_shuffle.h new file mode 100644 index 00000000000..74f2a0b6d2e --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_shuffle.h @@ -0,0 +1,186 @@ +#ifndef SPU_SHUFFLE_H +#define SPU_SHUFFLE_H + +/* + * Generate shuffle patterns with minimal fuss. + * + * Based on ideas from + * http://www.insomniacgames.com/tech/articles/0408/files/shuffles.pdf + * + * A-P indicates 0-15th position in first vector + * a-p indicates 0-15th position in second vector + * + * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + * |00|01|02|03|04|05|06|07|08|09|0a|0b|0c|0d|0e|0f| + * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + * | A| B| C| D| + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | A| B| C| D| E| F| G| H| + * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + * | A| B| C| D| E| F| G| H| I| J| K| L| M| N| O| P| + * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + * + * x or X indicates 0xff + * 8 indicates 0x80 + * 0 indicates 0x00 + * + * The macros SHUFFLE4() SHUFFLE8() and SHUFFLE16() provide a const vector + * unsigned char literal suitable for use with spu_shuffle(). + * + * The macros SHUFB4() SHUFB8() and SHUFB16() provide a const qword vector + * literal suitable for use with si_shufb(). + * + * + * For example : + * SHUFB4(A,A,A,A) + * expands to : + * ((const qword){0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3}) + * + * SHUFFLE8(A,B,a,b,C,c,8,8) + * expands to : + * ((const vector unsigned char){0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13, + * 0x04,0x05,0x14,0x15,0xe0,0xe0,0xe0,0xe0}) + * + */ + +#include <spu_intrinsics.h> + +#define SHUFFLE_PATTERN_4_A__ 0x00, 0x01, 0x02, 0x03 +#define SHUFFLE_PATTERN_4_B__ 0x04, 0x05, 0x06, 0x07 +#define SHUFFLE_PATTERN_4_C__ 0x08, 0x09, 0x0a, 0x0b +#define SHUFFLE_PATTERN_4_D__ 0x0c, 0x0d, 0x0e, 0x0f +#define SHUFFLE_PATTERN_4_a__ 0x10, 0x11, 0x12, 0x13 +#define SHUFFLE_PATTERN_4_b__ 0x14, 0x15, 0x16, 0x17 +#define SHUFFLE_PATTERN_4_c__ 0x18, 0x19, 0x1a, 0x1b +#define SHUFFLE_PATTERN_4_d__ 0x1c, 0x1d, 0x1e, 0x1f +#define SHUFFLE_PATTERN_4_X__ 0xc0, 0xc0, 0xc0, 0xc0 +#define SHUFFLE_PATTERN_4_x__ 0xc0, 0xc0, 0xc0, 0xc0 +#define SHUFFLE_PATTERN_4_0__ 0x80, 0x80, 0x80, 0x80 +#define SHUFFLE_PATTERN_4_8__ 0xe0, 0xe0, 0xe0, 0xe0 + +#define SHUFFLE_VECTOR_4__(A, B, C, D) \ + SHUFFLE_PATTERN_4_##A##__, \ + SHUFFLE_PATTERN_4_##B##__, \ + SHUFFLE_PATTERN_4_##C##__, \ + SHUFFLE_PATTERN_4_##D##__ + +#define SHUFFLE4(A, B, C, D) \ + ((const vector unsigned char){ \ + SHUFFLE_VECTOR_4__(A, B, C, D) \ + }) + +#define SHUFB4(A, B, C, D) \ + ((const qword){ \ + SHUFFLE_VECTOR_4__(A, B, C, D) \ + }) + + +#define SHUFFLE_PATTERN_8_A__ 0x00, 0x01 +#define SHUFFLE_PATTERN_8_B__ 0x02, 0x03 +#define SHUFFLE_PATTERN_8_C__ 0x04, 0x05 +#define SHUFFLE_PATTERN_8_D__ 0x06, 0x07 +#define SHUFFLE_PATTERN_8_E__ 0x08, 0x09 +#define SHUFFLE_PATTERN_8_F__ 0x0a, 0x0b +#define SHUFFLE_PATTERN_8_G__ 0x0c, 0x0d +#define SHUFFLE_PATTERN_8_H__ 0x0e, 0x0f +#define SHUFFLE_PATTERN_8_a__ 0x10, 0x11 +#define SHUFFLE_PATTERN_8_b__ 0x12, 0x13 +#define SHUFFLE_PATTERN_8_c__ 0x14, 0x15 +#define SHUFFLE_PATTERN_8_d__ 0x16, 0x17 +#define SHUFFLE_PATTERN_8_e__ 0x18, 0x19 +#define SHUFFLE_PATTERN_8_f__ 0x1a, 0x1b +#define SHUFFLE_PATTERN_8_g__ 0x1c, 0x1d +#define SHUFFLE_PATTERN_8_h__ 0x1e, 0x1f +#define SHUFFLE_PATTERN_8_X__ 0xc0, 0xc0 +#define SHUFFLE_PATTERN_8_x__ 0xc0, 0xc0 +#define SHUFFLE_PATTERN_8_0__ 0x80, 0x80 +#define SHUFFLE_PATTERN_8_8__ 0xe0, 0xe0 + + +#define SHUFFLE_VECTOR_8__(A, B, C, D, E, F, G, H) \ + SHUFFLE_PATTERN_8_##A##__, \ + SHUFFLE_PATTERN_8_##B##__, \ + SHUFFLE_PATTERN_8_##C##__, \ + SHUFFLE_PATTERN_8_##D##__, \ + SHUFFLE_PATTERN_8_##E##__, \ + SHUFFLE_PATTERN_8_##F##__, \ + SHUFFLE_PATTERN_8_##G##__, \ + SHUFFLE_PATTERN_8_##H##__ + +#define SHUFFLE8(A, B, C, D, E, F, G, H) \ + ((const vector unsigned char){ \ + SHUFFLE_VECTOR_8__(A, B, C, D, E, F, G, H) \ + }) + +#define SHUFB8(A, B, C, D, E, F, G, H) \ + ((const qword){ \ + SHUFFLE_VECTOR_8__(A, B, C, D, E, F, G, H) \ + }) + + +#define SHUFFLE_PATTERN_16_A__ 0x00 +#define SHUFFLE_PATTERN_16_B__ 0x01 +#define SHUFFLE_PATTERN_16_C__ 0x02 +#define SHUFFLE_PATTERN_16_D__ 0x03 +#define SHUFFLE_PATTERN_16_E__ 0x04 +#define SHUFFLE_PATTERN_16_F__ 0x05 +#define SHUFFLE_PATTERN_16_G__ 0x06 +#define SHUFFLE_PATTERN_16_H__ 0x07 +#define SHUFFLE_PATTERN_16_I__ 0x08 +#define SHUFFLE_PATTERN_16_J__ 0x09 +#define SHUFFLE_PATTERN_16_K__ 0x0a +#define SHUFFLE_PATTERN_16_L__ 0x0b +#define SHUFFLE_PATTERN_16_M__ 0x0c +#define SHUFFLE_PATTERN_16_N__ 0x0d +#define SHUFFLE_PATTERN_16_O__ 0x0e +#define SHUFFLE_PATTERN_16_P__ 0x0f +#define SHUFFLE_PATTERN_16_a__ 0x10 +#define SHUFFLE_PATTERN_16_b__ 0x11 +#define SHUFFLE_PATTERN_16_c__ 0x12 +#define SHUFFLE_PATTERN_16_d__ 0x13 +#define SHUFFLE_PATTERN_16_e__ 0x14 +#define SHUFFLE_PATTERN_16_f__ 0x15 +#define SHUFFLE_PATTERN_16_g__ 0x16 +#define SHUFFLE_PATTERN_16_h__ 0x17 +#define SHUFFLE_PATTERN_16_i__ 0x18 +#define SHUFFLE_PATTERN_16_j__ 0x19 +#define SHUFFLE_PATTERN_16_k__ 0x1a +#define SHUFFLE_PATTERN_16_l__ 0x1b +#define SHUFFLE_PATTERN_16_m__ 0x1c +#define SHUFFLE_PATTERN_16_n__ 0x1d +#define SHUFFLE_PATTERN_16_o__ 0x1e +#define SHUFFLE_PATTERN_16_p__ 0x1f +#define SHUFFLE_PATTERN_16_X__ 0xc0 +#define SHUFFLE_PATTERN_16_x__ 0xc0 +#define SHUFFLE_PATTERN_16_0__ 0x80 +#define SHUFFLE_PATTERN_16_8__ 0xe0 + +#define SHUFFLE_VECTOR_16__(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ + SHUFFLE_PATTERN_16_##A##__, \ + SHUFFLE_PATTERN_16_##B##__, \ + SHUFFLE_PATTERN_16_##C##__, \ + SHUFFLE_PATTERN_16_##D##__, \ + SHUFFLE_PATTERN_16_##E##__, \ + SHUFFLE_PATTERN_16_##F##__, \ + SHUFFLE_PATTERN_16_##G##__, \ + SHUFFLE_PATTERN_16_##H##__, \ + SHUFFLE_PATTERN_16_##I##__, \ + SHUFFLE_PATTERN_16_##J##__, \ + SHUFFLE_PATTERN_16_##K##__, \ + SHUFFLE_PATTERN_16_##L##__, \ + SHUFFLE_PATTERN_16_##M##__, \ + SHUFFLE_PATTERN_16_##N##__, \ + SHUFFLE_PATTERN_16_##O##__, \ + SHUFFLE_PATTERN_16_##P##__ + +#define SHUFFLE16(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ + ((const vector unsigned char){ \ + SHUFFLE_VECTOR_16__(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ + }) + +#define SHUFB16(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ + ((const qword){ \ + SHUFFLE_VECTOR_16__(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ + }) + +#endif diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c new file mode 100644 index 00000000000..69784c89788 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_texture.c @@ -0,0 +1,641 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include <math.h> + +#include "pipe/p_compiler.h" +#include "spu_main.h" +#include "spu_texture.h" +#include "spu_tile.h" +#include "spu_colorpack.h" +#include "spu_dcache.h" + + +/** + * Mark all tex cache entries as invalid. + */ +void +invalidate_tex_cache(void) +{ + uint lvl; + for (lvl = 0; lvl < CELL_MAX_TEXTURE_LEVELS; lvl++) { + uint unit = 0; + uint bytes = 4 * spu.texture[unit].level[lvl].width + * spu.texture[unit].level[lvl].height; + + if (spu.texture[unit].target == PIPE_TEXTURE_CUBE) + bytes *= 6; + else if (spu.texture[unit].target == PIPE_TEXTURE_3D) + bytes *= spu.texture[unit].level[lvl].depth; + + spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes); + } +} + + +/** + * Get four texels from locations (x[0], y[0]), (x[1], y[1]) ... + * + * NOTE: in the typical case of bilinear filtering, the four texels + * are in a 2x2 group so we could get by with just two dcache fetches + * (two side-by-side texels per fetch). But when bilinear filtering + * wraps around a texture edge, we'll probably need code like we have + * now. + * FURTHERMORE: since we're rasterizing a quad of 2x2 pixels at a time, + * it's quite likely that the four pixels in a quad will need some of the + * same texels. So look into doing texture fetches for four pixels at + * a time. + */ +static void +get_four_texels(const struct spu_texture_level *tlevel, uint face, + vec_int4 x, vec_int4 y, + vec_uint4 *texels) +{ + unsigned texture_ea = (uintptr_t) tlevel->start; + const vec_int4 tile_x = spu_rlmask(x, -5); /* tile_x = x / 32 */ + const vec_int4 tile_y = spu_rlmask(y, -5); /* tile_y = y / 32 */ + const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */ + const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */ + + const qword tiles_per_row = (qword) spu_splats(tlevel->tiles_per_row); + const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t)); + + qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x); + tile_offset = si_mpy((qword) tile_offset, tile_size); + + qword texel_offset = si_a(si_mpyui(offset_y, 32), offset_x); + texel_offset = si_mpyui(texel_offset, 4); + + vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset); + + texture_ea = texture_ea + face * tlevel->bytes_per_image; + + spu_dcache_fetch_unaligned((qword *) & texels[0], + texture_ea + spu_extract(offset, 0), 4); + spu_dcache_fetch_unaligned((qword *) & texels[1], + texture_ea + spu_extract(offset, 1), 4); + spu_dcache_fetch_unaligned((qword *) & texels[2], + texture_ea + spu_extract(offset, 2), 4); + spu_dcache_fetch_unaligned((qword *) & texels[3], + texture_ea + spu_extract(offset, 3), 4); +} + + +/** clamp vec to [0, max] */ +static INLINE vector signed int +spu_clamp(vector signed int vec, vector signed int max) +{ + static const vector signed int zero = {0,0,0,0}; + vector unsigned int c; + c = spu_cmpgt(vec, zero); /* c = vec > zero ? ~0 : 0 */ + vec = spu_sel(zero, vec, c); + c = spu_cmpgt(vec, max); /* c = vec > max ? ~0 : 0 */ + vec = spu_sel(vec, max, c); + return vec; +} + + + +/** + * Do nearest texture sampling for four pixels. + * \param colors returned colors in SOA format (rrrr, gggg, bbbb, aaaa). + */ +void +sample_texture_2d_nearest(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]) +{ + const struct spu_texture_level *tlevel = &spu.texture[unit].level[level]; + vector float ss = spu_mul(s, tlevel->scale_s); + vector float tt = spu_mul(t, tlevel->scale_t); + vector signed int is = spu_convts(ss, 0); + vector signed int it = spu_convts(tt, 0); + vec_uint4 texels[4]; + + /* PIPE_TEX_WRAP_REPEAT */ + is = spu_and(is, tlevel->mask_s); + it = spu_and(it, tlevel->mask_t); + + /* PIPE_TEX_WRAP_CLAMP */ + is = spu_clamp(is, tlevel->max_s); + it = spu_clamp(it, tlevel->max_t); + + get_four_texels(tlevel, face, is, it, texels); + + /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */ + spu_unpack_A8R8G8B8_transpose4(texels, colors); +} + + +/** + * Do bilinear texture sampling for four pixels. + * \param colors returned colors in SOA format (rrrr, gggg, bbbb, aaaa). + */ +void +sample_texture_2d_bilinear(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]) +{ + const struct spu_texture_level *tlevel = &spu.texture[unit].level[level]; + static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f}; + + vector float ss = spu_madd(s, tlevel->scale_s, half); + vector float tt = spu_madd(t, tlevel->scale_t, half); + + vector signed int is0 = spu_convts(ss, 0); + vector signed int it0 = spu_convts(tt, 0); + + /* is + 1, it + 1 */ + vector signed int is1 = spu_add(is0, 1); + vector signed int it1 = spu_add(it0, 1); + + /* PIPE_TEX_WRAP_REPEAT */ + is0 = spu_and(is0, tlevel->mask_s); + it0 = spu_and(it0, tlevel->mask_t); + is1 = spu_and(is1, tlevel->mask_s); + it1 = spu_and(it1, tlevel->mask_t); + + /* PIPE_TEX_WRAP_CLAMP */ + is0 = spu_clamp(is0, tlevel->max_s); + it0 = spu_clamp(it0, tlevel->max_t); + is1 = spu_clamp(is1, tlevel->max_s); + it1 = spu_clamp(it1, tlevel->max_t); + + /* get packed int texels */ + vector unsigned int texels[16]; + get_four_texels(tlevel, face, is0, it0, texels + 0); /* upper-left */ + get_four_texels(tlevel, face, is1, it0, texels + 4); /* upper-right */ + get_four_texels(tlevel, face, is0, it1, texels + 8); /* lower-left */ + get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */ + + /* convert packed int texels to float colors */ + vector float ftexels[16]; + spu_unpack_A8R8G8B8_transpose4(texels + 0, ftexels + 0); + spu_unpack_A8R8G8B8_transpose4(texels + 4, ftexels + 4); + spu_unpack_A8R8G8B8_transpose4(texels + 8, ftexels + 8); + spu_unpack_A8R8G8B8_transpose4(texels + 12, ftexels + 12); + + /* Compute weighting factors in [0,1] + * Multiply texcoord by 1024, AND with 1023, convert back to float. + */ + vector float ss1024 = spu_mul(ss, spu_splats(1024.0f)); + vector signed int iss1024 = spu_convts(ss1024, 0); + iss1024 = spu_and(iss1024, 1023); + vector float sWeights0 = spu_convtf(iss1024, 10); + + vector float tt1024 = spu_mul(tt, spu_splats(1024.0f)); + vector signed int itt1024 = spu_convts(tt1024, 0); + itt1024 = spu_and(itt1024, 1023); + vector float tWeights0 = spu_convtf(itt1024, 10); + + /* 1 - sWeight and 1 - tWeight */ + vector float sWeights1 = spu_sub(spu_splats(1.0f), sWeights0); + vector float tWeights1 = spu_sub(spu_splats(1.0f), tWeights0); + + /* reds, for four pixels */ + ftexels[ 0] = spu_mul(ftexels[ 0], spu_mul(sWeights1, tWeights1)); /*ul*/ + ftexels[ 4] = spu_mul(ftexels[ 4], spu_mul(sWeights0, tWeights1)); /*ur*/ + ftexels[ 8] = spu_mul(ftexels[ 8], spu_mul(sWeights1, tWeights0)); /*ll*/ + ftexels[12] = spu_mul(ftexels[12], spu_mul(sWeights0, tWeights0)); /*lr*/ + colors[0] = spu_add(spu_add(ftexels[0], ftexels[4]), + spu_add(ftexels[8], ftexels[12])); + + /* greens, for four pixels */ + ftexels[ 1] = spu_mul(ftexels[ 1], spu_mul(sWeights1, tWeights1)); /*ul*/ + ftexels[ 5] = spu_mul(ftexels[ 5], spu_mul(sWeights0, tWeights1)); /*ur*/ + ftexels[ 9] = spu_mul(ftexels[ 9], spu_mul(sWeights1, tWeights0)); /*ll*/ + ftexels[13] = spu_mul(ftexels[13], spu_mul(sWeights0, tWeights0)); /*lr*/ + colors[1] = spu_add(spu_add(ftexels[1], ftexels[5]), + spu_add(ftexels[9], ftexels[13])); + + /* blues, for four pixels */ + ftexels[ 2] = spu_mul(ftexels[ 2], spu_mul(sWeights1, tWeights1)); /*ul*/ + ftexels[ 6] = spu_mul(ftexels[ 6], spu_mul(sWeights0, tWeights1)); /*ur*/ + ftexels[10] = spu_mul(ftexels[10], spu_mul(sWeights1, tWeights0)); /*ll*/ + ftexels[14] = spu_mul(ftexels[14], spu_mul(sWeights0, tWeights0)); /*lr*/ + colors[2] = spu_add(spu_add(ftexels[2], ftexels[6]), + spu_add(ftexels[10], ftexels[14])); + + /* alphas, for four pixels */ + ftexels[ 3] = spu_mul(ftexels[ 3], spu_mul(sWeights1, tWeights1)); /*ul*/ + ftexels[ 7] = spu_mul(ftexels[ 7], spu_mul(sWeights0, tWeights1)); /*ur*/ + ftexels[11] = spu_mul(ftexels[11], spu_mul(sWeights1, tWeights0)); /*ll*/ + ftexels[15] = spu_mul(ftexels[15], spu_mul(sWeights0, tWeights0)); /*lr*/ + colors[3] = spu_add(spu_add(ftexels[3], ftexels[7]), + spu_add(ftexels[11], ftexels[15])); +} + + + +/** + * Adapted from /opt/cell/sdk/usr/spu/include/transpose_matrix4x4.h + */ +static INLINE void +transpose(vector unsigned int *mOut0, + vector unsigned int *mOut1, + vector unsigned int *mOut2, + vector unsigned int *mOut3, + vector unsigned int *mIn) +{ + vector unsigned int abcd, efgh, ijkl, mnop; /* input vectors */ + vector unsigned int aeim, bfjn, cgko, dhlp; /* output vectors */ + vector unsigned int aibj, ckdl, emfn, gohp; /* intermediate vectors */ + + vector unsigned char shufflehi = ((vector unsigned char) { + 0x00, 0x01, 0x02, 0x03, + 0x10, 0x11, 0x12, 0x13, + 0x04, 0x05, 0x06, 0x07, + 0x14, 0x15, 0x16, 0x17}); + vector unsigned char shufflelo = ((vector unsigned char) { + 0x08, 0x09, 0x0A, 0x0B, + 0x18, 0x19, 0x1A, 0x1B, + 0x0C, 0x0D, 0x0E, 0x0F, + 0x1C, 0x1D, 0x1E, 0x1F}); + abcd = *(mIn+0); + efgh = *(mIn+1); + ijkl = *(mIn+2); + mnop = *(mIn+3); + + aibj = spu_shuffle(abcd, ijkl, shufflehi); + ckdl = spu_shuffle(abcd, ijkl, shufflelo); + emfn = spu_shuffle(efgh, mnop, shufflehi); + gohp = spu_shuffle(efgh, mnop, shufflelo); + + aeim = spu_shuffle(aibj, emfn, shufflehi); + bfjn = spu_shuffle(aibj, emfn, shufflelo); + cgko = spu_shuffle(ckdl, gohp, shufflehi); + dhlp = spu_shuffle(ckdl, gohp, shufflelo); + + *mOut0 = aeim; + *mOut1 = bfjn; + *mOut2 = cgko; + *mOut3 = dhlp; +} + + +/** + * Bilinear filtering, using int instead of float arithmetic for computing + * sample weights. + */ +void +sample_texture_2d_bilinear_int(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]) +{ + const struct spu_texture_level *tlevel = &spu.texture[unit].level[level]; + static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f}; + + /* Scale texcoords by size of texture, and add half pixel bias */ + vector float ss = spu_madd(s, tlevel->scale_s, half); + vector float tt = spu_madd(t, tlevel->scale_t, half); + + /* convert float coords to fixed-pt coords with 7 fraction bits */ + vector signed int is = spu_convts(ss, 7); /* XXX really need floor() here */ + vector signed int it = spu_convts(tt, 7); /* XXX really need floor() here */ + + /* compute integer texel weights in [0, 127] */ + vector signed int sWeights0 = spu_and(is, 127); + vector signed int tWeights0 = spu_and(it, 127); + vector signed int sWeights1 = spu_sub(127, sWeights0); + vector signed int tWeights1 = spu_sub(127, tWeights0); + + /* texel coords: is0 = is / 128, it0 = is / 128 */ + vector signed int is0 = spu_rlmask(is, -7); + vector signed int it0 = spu_rlmask(it, -7); + + /* texel coords: i1 = is0 + 1, it1 = it0 + 1 */ + vector signed int is1 = spu_add(is0, 1); + vector signed int it1 = spu_add(it0, 1); + + /* PIPE_TEX_WRAP_REPEAT */ + is0 = spu_and(is0, tlevel->mask_s); + it0 = spu_and(it0, tlevel->mask_t); + is1 = spu_and(is1, tlevel->mask_s); + it1 = spu_and(it1, tlevel->mask_t); + + /* PIPE_TEX_WRAP_CLAMP */ + is0 = spu_clamp(is0, tlevel->max_s); + it0 = spu_clamp(it0, tlevel->max_t); + is1 = spu_clamp(is1, tlevel->max_s); + it1 = spu_clamp(it1, tlevel->max_t); + + /* get packed int texels */ + vector unsigned int texels[16]; + get_four_texels(tlevel, face, is0, it0, texels + 0); /* upper-left */ + get_four_texels(tlevel, face, is1, it0, texels + 4); /* upper-right */ + get_four_texels(tlevel, face, is0, it1, texels + 8); /* lower-left */ + get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */ + + /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */ + { + static const unsigned char ZERO = 0x80; + int i; + for (i = 0; i < 16; i++) { + texels[i] = spu_shuffle(texels[i], texels[i], + ((vector unsigned char) { + ZERO, ZERO, ZERO, 1, + ZERO, ZERO, ZERO, 2, + ZERO, ZERO, ZERO, 3, + ZERO, ZERO, ZERO, 0})); + } + } + + /* convert RGBA,RGBA,RGBA,RGBA to RRRR,GGGG,BBBB,AAAA */ + vector unsigned int texel0, texel1, texel2, texel3, texel4, texel5, texel6, texel7, + texel8, texel9, texel10, texel11, texel12, texel13, texel14, texel15; + transpose(&texel0, &texel1, &texel2, &texel3, texels + 0); + transpose(&texel4, &texel5, &texel6, &texel7, texels + 4); + transpose(&texel8, &texel9, &texel10, &texel11, texels + 8); + transpose(&texel12, &texel13, &texel14, &texel15, texels + 12); + + /* computed weighted colors */ + vector unsigned int c0, c1, c2, c3, cSum; + + /* red */ + c0 = (vector unsigned int) si_mpy((qword) texel0, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/ + c1 = (vector unsigned int) si_mpy((qword) texel4, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/ + c2 = (vector unsigned int) si_mpy((qword) texel8, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/ + c3 = (vector unsigned int) si_mpy((qword) texel12, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/ + cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3)); + colors[0] = spu_convtf(cSum, 22); + + /* green */ + c0 = (vector unsigned int) si_mpy((qword) texel1, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/ + c1 = (vector unsigned int) si_mpy((qword) texel5, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/ + c2 = (vector unsigned int) si_mpy((qword) texel9, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/ + c3 = (vector unsigned int) si_mpy((qword) texel13, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/ + cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3)); + colors[1] = spu_convtf(cSum, 22); + + /* blue */ + c0 = (vector unsigned int) si_mpy((qword) texel2, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/ + c1 = (vector unsigned int) si_mpy((qword) texel6, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/ + c2 = (vector unsigned int) si_mpy((qword) texel10, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/ + c3 = (vector unsigned int) si_mpy((qword) texel14, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/ + cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3)); + colors[2] = spu_convtf(cSum, 22); + + /* alpha */ + c0 = (vector unsigned int) si_mpy((qword) texel3, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/ + c1 = (vector unsigned int) si_mpy((qword) texel7, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/ + c2 = (vector unsigned int) si_mpy((qword) texel11, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/ + c3 = (vector unsigned int) si_mpy((qword) texel15, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/ + cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3)); + colors[3] = spu_convtf(cSum, 22); +} + + + +/** + * Compute level of detail factor from texcoords. + */ +static INLINE float +compute_lambda_2d(uint unit, vector float s, vector float t) +{ + uint baseLevel = 0; + float width = spu.texture[unit].level[baseLevel].width; + float height = spu.texture[unit].level[baseLevel].width; + float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0)); + float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0)); + float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0)); + float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0)); +#if 0 + /* ideal value */ + float x = dsdx * dsdx + dtdx * dtdx; + float y = dsdy * dsdy + dtdy * dtdy; + float rho = x > y ? x : y; + rho = sqrtf(rho); +#else + /* approximation */ + dsdx = fabsf(dsdx); + dsdy = fabsf(dsdy); + dtdx = fabsf(dtdx); + dtdy = fabsf(dtdy); + float rho = (dsdx + dsdy + dtdx + dtdy) * 0.5; +#endif + float lambda = logf(rho) * 1.442695f; /* compute logbase2(rho) */ + return lambda; +} + + +/** + * Blend two sets of colors according to weight. + */ +static void +blend_colors(vector float c0[4], const vector float c1[4], float weight) +{ + vector float t = spu_splats(weight); + vector float dc0 = spu_sub(c1[0], c0[0]); + vector float dc1 = spu_sub(c1[1], c0[1]); + vector float dc2 = spu_sub(c1[2], c0[2]); + vector float dc3 = spu_sub(c1[3], c0[3]); + c0[0] = spu_madd(dc0, t, c0[0]); + c0[1] = spu_madd(dc1, t, c0[1]); + c0[2] = spu_madd(dc2, t, c0[2]); + c0[3] = spu_madd(dc3, t, c0[3]); +} + + +/** + * Texture sampling with level of detail selection and possibly mipmap + * interpolation. + */ +void +sample_texture_2d_lod(vector float s, vector float t, + uint unit, uint level_ignored, uint face, + vector float colors[4]) +{ + /* + * Note that we're computing a lambda/lod here that's used for all + * four pixels in the quad. + */ + float lambda = compute_lambda_2d(unit, s, t); + + (void) face; + (void) level_ignored; + + /* apply lod bias */ + lambda += spu.sampler[unit].lod_bias; + + /* clamp */ + if (lambda < spu.sampler[unit].min_lod) + lambda = spu.sampler[unit].min_lod; + else if (lambda > spu.sampler[unit].max_lod) + lambda = spu.sampler[unit].max_lod; + + if (lambda <= 0.0f) { + /* magnify */ + spu.mag_sample_texture_2d[unit](s, t, unit, 0, face, colors); + } + else { + /* minify */ + if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) { + /* sample two mipmap levels and interpolate */ + int level = (int) lambda; + if (level > (int) spu.texture[unit].max_level) + level = spu.texture[unit].max_level; + spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors); + if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) { + /* sample second mipmap level */ + float weight = lambda - (float) level; + level++; + if (level <= (int) spu.texture[unit].max_level) { + vector float colors2[4]; + spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors2); + blend_colors(colors, colors2, weight); + } + } + } + else { + /* sample one mipmap level */ + int level = (int) (lambda + 0.5f); + if (level > (int) spu.texture[unit].max_level) + level = spu.texture[unit].max_level; + spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors); + } + } +} + + +/** XXX need a SIMD version of this */ +static unsigned +choose_cube_face(float rx, float ry, float rz, float *newS, float *newT) +{ + /* + major axis + direction target sc tc ma + ---------- ------------------------------- --- --- --- + +rx TEXTURE_CUBE_MAP_POSITIVE_X_EXT -rz -ry rx + -rx TEXTURE_CUBE_MAP_NEGATIVE_X_EXT +rz -ry rx + +ry TEXTURE_CUBE_MAP_POSITIVE_Y_EXT +rx +rz ry + -ry TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT +rx -rz ry + +rz TEXTURE_CUBE_MAP_POSITIVE_Z_EXT +rx -ry rz + -rz TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT -rx -ry rz + */ + const float arx = fabsf(rx); + const float ary = fabsf(ry); + const float arz = fabsf(rz); + unsigned face; + float sc, tc, ma; + + if (arx > ary && arx > arz) { + if (rx >= 0.0F) { + face = PIPE_TEX_FACE_POS_X; + sc = -rz; + tc = -ry; + ma = arx; + } + else { + face = PIPE_TEX_FACE_NEG_X; + sc = rz; + tc = -ry; + ma = arx; + } + } + else if (ary > arx && ary > arz) { + if (ry >= 0.0F) { + face = PIPE_TEX_FACE_POS_Y; + sc = rx; + tc = rz; + ma = ary; + } + else { + face = PIPE_TEX_FACE_NEG_Y; + sc = rx; + tc = -rz; + ma = ary; + } + } + else { + if (rz > 0.0F) { + face = PIPE_TEX_FACE_POS_Z; + sc = rx; + tc = -ry; + ma = arz; + } + else { + face = PIPE_TEX_FACE_NEG_Z; + sc = -rx; + tc = -ry; + ma = arz; + } + } + + *newS = (sc / ma + 1.0F) * 0.5F; + *newT = (tc / ma + 1.0F) * 0.5F; + + return face; +} + + + +void +sample_texture_cube(vector float s, vector float t, vector float r, + uint unit, vector float colors[4]) +{ + uint p, faces[4], level = 0; + float newS[4], newT[4]; + + /* Compute cube faces referenced by the four sets of texcoords. + * XXX we should SIMD-ize this. + */ + for (p = 0; p < 4; p++) { + float rx = spu_extract(s, p); + float ry = spu_extract(t, p); + float rz = spu_extract(r, p); + faces[p] = choose_cube_face(rx, ry, rz, &newS[p], &newT[p]); + } + + if (faces[0] == faces[1] && + faces[0] == faces[2] && + faces[0] == faces[3]) { + /* GOOD! All four texcoords refer to the same cube face */ + s = (vector float) {newS[0], newS[1], newS[2], newS[3]}; + t = (vector float) {newT[0], newT[1], newT[2], newT[3]}; + spu.sample_texture_2d[unit](s, t, unit, level, faces[0], colors); + } + else { + /* BAD! The four texcoords refer to different faces */ + for (p = 0; p < 4; p++) { + vector float c[4]; + + spu.sample_texture_2d[unit](spu_splats(newS[p]), spu_splats(newT[p]), + unit, level, faces[p], c); + + float red = spu_extract(c[0], p); + float green = spu_extract(c[1], p); + float blue = spu_extract(c[2], p); + float alpha = spu_extract(c[3], p); + + colors[0] = spu_insert(red, colors[0], p); + colors[1] = spu_insert(green, colors[1], p); + colors[2] = spu_insert(blue, colors[2], p); + colors[3] = spu_insert(alpha, colors[3], p); + } + } +} diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h new file mode 100644 index 00000000000..7b75b007b5a --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_texture.h @@ -0,0 +1,67 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef SPU_TEXTURE_H +#define SPU_TEXTURE_H + + +#include "pipe/p_compiler.h" + + +extern void +invalidate_tex_cache(void); + + +extern void +sample_texture_2d_nearest(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]); + + +extern void +sample_texture_2d_bilinear(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]); + +extern void +sample_texture_2d_bilinear_int(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]); + + +extern void +sample_texture_2d_lod(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]); + + +extern void +sample_texture_cube(vector float s, vector float t, vector float r, + uint unit, vector float colors[4]); + + +#endif /* SPU_TEXTURE_H */ diff --git a/src/gallium/drivers/cell/spu/spu_tile.c b/src/gallium/drivers/cell/spu/spu_tile.c new file mode 100644 index 00000000000..6905015a483 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_tile.c @@ -0,0 +1,126 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + + +#include "spu_tile.h" +#include "spu_main.h" + + +/** + * Get tile of color or Z values from main memory, put into SPU memory. + */ +void +get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf) +{ + const uint offset = ty * spu.fb.width_tiles + tx; + const uint bytesPerTile = TILE_SIZE * TILE_SIZE * (zBuf ? spu.fb.zsize : 4); + const ubyte *src = zBuf ? spu.fb.depth_start : spu.fb.color_start; + + src += offset * bytesPerTile; + + ASSERT(tx < spu.fb.width_tiles); + ASSERT(ty < spu.fb.height_tiles); + ASSERT_ALIGN16(tile); + /* + printf("get_tile: dest: %p src: 0x%x size: %d\n", + tile, (unsigned int) src, bytesPerTile); + */ + mfc_get(tile->ui, /* dest in local memory */ + (unsigned int) src, /* src in main memory */ + bytesPerTile, + tag, + 0, /* tid */ + 0 /* rid */); +} + + +/** + * Move tile of color or Z values from SPU memory to main memory. + */ +void +put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf) +{ + const uint offset = ty * spu.fb.width_tiles + tx; + const uint bytesPerTile = TILE_SIZE * TILE_SIZE * (zBuf ? spu.fb.zsize : 4); + ubyte *dst = zBuf ? spu.fb.depth_start : spu.fb.color_start; + + dst += offset * bytesPerTile; + + ASSERT(tx < spu.fb.width_tiles); + ASSERT(ty < spu.fb.height_tiles); + ASSERT_ALIGN16(tile); + /* + printf("SPU %u: put_tile: src: %p dst: 0x%x size: %d\n", + spu.init.id, + tile, (unsigned int) dst, bytesPerTile); + */ + mfc_put((void *) tile->ui, /* src in local memory */ + (unsigned int) dst, /* dst in main memory */ + bytesPerTile, + tag, + 0, /* tid */ + 0 /* rid */); +} + + +/** + * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled + * tiles back to the main framebuffer. + */ +void +really_clear_tiles(uint surfaceIndex) +{ + const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles; + uint i; + + if (surfaceIndex == 0) { + clear_c_tile(&spu.ctile); + + for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { + uint tx = i % spu.fb.width_tiles; + uint ty = i / spu.fb.width_tiles; + if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) { + put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0); + } + } + } + else { + clear_z_tile(&spu.ztile); + + for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { + uint tx = i % spu.fb.width_tiles; + uint ty = i / spu.fb.width_tiles; + if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR) + put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 1); + } + } + +#if 0 + wait_on_mask(1 << TAG_SURFACE_CLEAR); +#endif +} diff --git a/src/gallium/drivers/cell/spu/spu_tile.h b/src/gallium/drivers/cell/spu/spu_tile.h new file mode 100644 index 00000000000..7bfb52be8f3 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_tile.h @@ -0,0 +1,75 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef SPU_TILE_H +#define SPU_TILE_H + + +#include <libmisc.h> +#include <spu_mfcio.h> +#include "spu_main.h" +#include "cell/common.h" + + + +extern void +get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf); + +extern void +put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf); + +extern void +really_clear_tiles(uint surfaceIndex); + + +static INLINE void +clear_c_tile(tile_t *ctile) +{ + memset32((uint*) ctile->ui, + spu.fb.color_clear_value, + TILE_SIZE * TILE_SIZE); +} + + +static INLINE void +clear_z_tile(tile_t *ztile) +{ + if (spu.fb.zsize == 2) { + memset16((ushort*) ztile->us, + spu.fb.depth_clear_value, + TILE_SIZE * TILE_SIZE); + } + else { + ASSERT(spu.fb.zsize != 0); + memset32((uint*) ztile->ui, + spu.fb.depth_clear_value, + TILE_SIZE * TILE_SIZE); + } +} + + +#endif /* SPU_TILE_H */ diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c new file mode 100644 index 00000000000..0d9fcb99970 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -0,0 +1,809 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Triangle rendering within a tile. + */ + +#include <transpose_matrix4x4.h> +#include "pipe/p_compiler.h" +#include "pipe/p_format.h" +#include "util/u_math.h" +#include "spu_colorpack.h" +#include "spu_main.h" +#include "spu_shuffle.h" +#include "spu_texture.h" +#include "spu_tile.h" +#include "spu_tri.h" + + +/** Masks are uint[4] vectors with each element being 0 or 0xffffffff */ +typedef vector unsigned int mask_t; + + + +/** + * Simplified types taken from other parts of Gallium + */ +struct vertex_header { + vector float data[1]; +}; + + + +/* XXX fix this */ +#undef CEILF +#define CEILF(X) ((float) (int) ((X) + 0.99999f)) + + +#define QUAD_TOP_LEFT 0 +#define QUAD_TOP_RIGHT 1 +#define QUAD_BOTTOM_LEFT 2 +#define QUAD_BOTTOM_RIGHT 3 +#define MASK_TOP_LEFT (1 << QUAD_TOP_LEFT) +#define MASK_TOP_RIGHT (1 << QUAD_TOP_RIGHT) +#define MASK_BOTTOM_LEFT (1 << QUAD_BOTTOM_LEFT) +#define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT) +#define MASK_ALL 0xf + + +#define DEBUG_VERTS 0 + +/** + * Triangle edge info + */ +struct edge { + union { + struct { + float dx; /**< X(v1) - X(v0), used only during setup */ + float dy; /**< Y(v1) - Y(v0), used only during setup */ + }; + vec_float4 ds; /**< vector accessor for dx and dy */ + }; + float dxdy; /**< dx/dy */ + float sx, sy; /**< first sample point coord */ + int lines; /**< number of lines on this edge */ +}; + + +struct interp_coef +{ + vector float a0; + vector float dadx; + vector float dady; +}; + + +/** + * Triangle setup info (derived from draw_stage). + * Also used for line drawing (taking some liberties). + */ +struct setup_stage { + + /* Vertices are just an array of floats making up each attribute in + * turn. Currently fixed at 4 floats, but should change in time. + * Codegen will help cope with this. + */ + union { + struct { + const struct vertex_header *vmin; + const struct vertex_header *vmid; + const struct vertex_header *vmax; + const struct vertex_header *vprovoke; + }; + qword vertex_headers; + }; + + struct edge ebot; + struct edge etop; + struct edge emaj; + + float oneOverArea; /* XXX maybe make into vector? */ + + uint facing; + + uint tx, ty; /**< position of current tile (x, y) */ + + int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy; + + struct interp_coef coef[PIPE_MAX_SHADER_INPUTS]; + + struct { + vec_int4 quad; /**< [0] = row0, [1] = row1; {left[0],left[1],right[0],right[1]} */ + int y; + unsigned y_flags; + unsigned mask; /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */ + } span; +}; + + +static struct setup_stage setup; + + +/** + * Evaluate attribute coefficients (plane equations) to compute + * attribute values for the four fragments in a quad. + * Eg: four colors will be computed (in AoS format). + */ +static INLINE void +eval_coeff(uint slot, float x, float y, vector float w, vector float result[4]) +{ + switch (spu.vertex_info.attrib[slot].interp_mode) { + case INTERP_CONSTANT: + result[QUAD_TOP_LEFT] = + result[QUAD_TOP_RIGHT] = + result[QUAD_BOTTOM_LEFT] = + result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0; + break; + case INTERP_LINEAR: + { + vector float dadx = setup.coef[slot].dadx; + vector float dady = setup.coef[slot].dady; + vector float topLeft = + spu_add(setup.coef[slot].a0, + spu_add(spu_mul(spu_splats(x), dadx), + spu_mul(spu_splats(y), dady))); + + result[QUAD_TOP_LEFT] = topLeft; + result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx); + result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady); + result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady); + } + break; + case INTERP_PERSPECTIVE: + { + vector float dadx = setup.coef[slot].dadx; + vector float dady = setup.coef[slot].dady; + vector float topLeft = + spu_add(setup.coef[slot].a0, + spu_add(spu_mul(spu_splats(x), dadx), + spu_mul(spu_splats(y), dady))); + + vector float wInv = spu_re(w); /* 1.0 / w */ + + result[QUAD_TOP_LEFT] = spu_mul(topLeft, wInv); + result[QUAD_TOP_RIGHT] = spu_mul(spu_add(topLeft, dadx), wInv); + result[QUAD_BOTTOM_LEFT] = spu_mul(spu_add(topLeft, dady), wInv); + result[QUAD_BOTTOM_RIGHT] = spu_mul(spu_add(spu_add(topLeft, dadx), dady), wInv); + } + break; + case INTERP_POS: + case INTERP_NONE: + break; + default: + ASSERT(0); + } +} + + +/** + * As above, but return 4 vectors in SOA format. + * XXX this will all be re-written someday. + */ +static INLINE void +eval_coeff_soa(uint slot, float x, float y, vector float w, vector float result[4]) +{ + eval_coeff(slot, x, y, w, result); + _transpose_matrix4x4(result, result); +} + + +/** Evalute coefficients to get Z for four pixels in a quad */ +static INLINE vector float +eval_z(float x, float y) +{ + const uint slot = 0; + const float dzdx = spu_extract(setup.coef[slot].dadx, 2); + const float dzdy = spu_extract(setup.coef[slot].dady, 2); + const float topLeft = spu_extract(setup.coef[slot].a0, 2) + x * dzdx + y * dzdy; + const vector float topLeftv = spu_splats(topLeft); + const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy }; + return spu_add(topLeftv, derivs); +} + + +/** Evalute coefficients to get W for four pixels in a quad */ +static INLINE vector float +eval_w(float x, float y) +{ + const uint slot = 0; + const float dwdx = spu_extract(setup.coef[slot].dadx, 3); + const float dwdy = spu_extract(setup.coef[slot].dady, 3); + const float topLeft = spu_extract(setup.coef[slot].a0, 3) + x * dwdx + y * dwdy; + const vector float topLeftv = spu_splats(topLeft); + const vector float derivs = (vector float) { 0.0, dwdx, dwdy, dwdx + dwdy }; + return spu_add(topLeftv, derivs); +} + + +/** + * Emit a quad (pass to next stage). No clipping is done. + * Note: about 1/5 to 1/7 of the time, mask is zero and this function + * should be skipped. But adding the test for that slows things down + * overall. + */ +static INLINE void +emit_quad( int x, int y, mask_t mask) +{ + /* If any bits in mask are set... */ + if (spu_extract(spu_orx(mask), 0)) { + const int ix = x - setup.cliprect_minx; + const int iy = y - setup.cliprect_miny; + + spu.cur_ctile_status = TILE_STATUS_DIRTY; + spu.cur_ztile_status = TILE_STATUS_DIRTY; + + { + /* + * Run fragment shader, execute per-fragment ops, update fb/tile. + */ + vector float inputs[4*4], outputs[2*4]; + vector float fragZ = eval_z((float) x, (float) y); + vector float fragW = eval_w((float) x, (float) y); + vector unsigned int kill_mask; + + /* setup inputs */ +#if 0 + eval_coeff_soa(1, (float) x, (float) y, fragW, inputs); +#else + uint i; + for (i = 0; i < spu.vertex_info.num_attribs; i++) { + eval_coeff_soa(i+1, (float) x, (float) y, fragW, inputs + i * 4); + } +#endif + ASSERT(spu.fragment_program); + ASSERT(spu.fragment_ops); + + /* Execute the current fragment program */ + kill_mask = spu.fragment_program(inputs, outputs, spu.constants); + + mask = spu_andc(mask, kill_mask); + + /* Execute per-fragment/quad operations, including: + * alpha test, z test, stencil test, blend and framebuffer writing. + * Note that there are two different fragment operations functions + * that can be called, one for front-facing fragments, and one + * for back-facing fragments. (Often the two are the same; + * but in some cases, like two-sided stenciling, they can be + * very different.) So choose the correct function depending + * on the calculated facing. + */ + spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile, + fragZ, + outputs[0*4+0], + outputs[0*4+1], + outputs[0*4+2], + outputs[0*4+3], + mask); + } + } +} + + +/** + * Given an X or Y coordinate, return the block/quad coordinate that it + * belongs to. + */ +static INLINE int +block(int x) +{ + return x & ~1; +} + + +/** + * Render a horizontal span of quads + */ +static void +flush_spans(void) +{ + int minleft, maxright; + + const int l0 = spu_extract(setup.span.quad, 0); + const int l1 = spu_extract(setup.span.quad, 1); + const int r0 = spu_extract(setup.span.quad, 2); + const int r1 = spu_extract(setup.span.quad, 3); + + switch (setup.span.y_flags) { + case 0x3: + /* both odd and even lines written (both quad rows) */ + minleft = MIN2(l0, l1); + maxright = MAX2(r0, r1); + break; + + case 0x1: + /* only even line written (quad top row) */ + minleft = l0; + maxright = r0; + break; + + case 0x2: + /* only odd line written (quad bottom row) */ + minleft = l1; + maxright = r1; + break; + + default: + return; + } + + /* OK, we're very likely to need the tile data now. + * clear or finish waiting if needed. + */ + if (spu.cur_ctile_status == TILE_STATUS_GETTING) { + /* wait for mfc_get() to complete */ + //printf("SPU: %u: waiting for ctile\n", spu.init.id); + wait_on_mask(1 << TAG_READ_TILE_COLOR); + spu.cur_ctile_status = TILE_STATUS_CLEAN; + } + else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) { + //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty); + clear_c_tile(&spu.ctile); + spu.cur_ctile_status = TILE_STATUS_DIRTY; + } + ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED); + + if (spu.read_depth_stencil) { + if (spu.cur_ztile_status == TILE_STATUS_GETTING) { + /* wait for mfc_get() to complete */ + //printf("SPU: %u: waiting for ztile\n", spu.init.id); + wait_on_mask(1 << TAG_READ_TILE_Z); + spu.cur_ztile_status = TILE_STATUS_CLEAN; + } + else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) { + //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty); + clear_z_tile(&spu.ztile); + spu.cur_ztile_status = TILE_STATUS_DIRTY; + } + ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED); + } + + /* XXX this loop could be moved into the above switch cases... */ + + /* Setup for mask calculation */ + const vec_int4 quad_LlRr = setup.span.quad; + const vec_int4 quad_RrLl = spu_rlqwbyte(quad_LlRr, 8); + const vec_int4 quad_LLll = spu_shuffle(quad_LlRr, quad_LlRr, SHUFFLE4(A,A,B,B)); + const vec_int4 quad_RRrr = spu_shuffle(quad_RrLl, quad_RrLl, SHUFFLE4(A,A,B,B)); + + const vec_int4 twos = spu_splats(2); + + const int x = block(minleft); + vec_int4 xs = {x, x+1, x, x+1}; + + for (; spu_extract(xs, 0) <= block(maxright); xs += twos) { + /** + * Computes mask to indicate which pixels in the 2x2 quad are actually + * inside the triangle's bounds. + */ + + /* Calculate ({x,x+1,x,x+1} >= {l[0],l[0],l[1],l[1]}) */ + const mask_t gt_LLll_xs = spu_cmpgt(quad_LLll, xs); + const mask_t gte_xs_LLll = spu_nand(gt_LLll_xs, gt_LLll_xs); + + /* Calculate ({r[0],r[0],r[1],r[1]} > {x,x+1,x,x+1}) */ + const mask_t gt_RRrr_xs = spu_cmpgt(quad_RRrr, xs); + + /* Combine results to create mask */ + const mask_t mask = spu_and(gte_xs_LLll, gt_RRrr_xs); + + emit_quad(spu_extract(xs, 0), setup.span.y, mask); + } + + setup.span.y = 0; + setup.span.y_flags = 0; + /* Zero right elements */ + setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0)); +} + + +#if DEBUG_VERTS +static void +print_vertex(const struct vertex_header *v) +{ + uint i; + fprintf(stderr, " Vertex: (%p)\n", v); + for (i = 0; i < spu.vertex_info.num_attribs; i++) { + fprintf(stderr, " %d: %f %f %f %f\n", i, + spu_extract(v->data[i], 0), + spu_extract(v->data[i], 1), + spu_extract(v->data[i], 2), + spu_extract(v->data[i], 3)); + } +} +#endif + + +/** + * Sort vertices from top to bottom. + * Compute area and determine front vs. back facing. + * Do coarse clip test against tile bounds + * \return FALSE if tri is totally outside tile, TRUE otherwise + */ +static boolean +setup_sort_vertices(const struct vertex_header *v0, + const struct vertex_header *v1, + const struct vertex_header *v2) +{ + float area, sign; + +#if DEBUG_VERTS + if (spu.init.id==0) { + fprintf(stderr, "SPU %u: Triangle:\n", spu.init.id); + print_vertex(v0); + print_vertex(v1); + print_vertex(v2); + } +#endif + + /* determine bottom to top order of vertices */ + { + /* A table of shuffle patterns for putting vertex_header pointers into + correct order. Quite magical. */ + const vec_uchar16 sort_order_patterns[] = { + SHUFFLE4(A,B,C,C), + SHUFFLE4(C,A,B,C), + SHUFFLE4(A,C,B,C), + SHUFFLE4(B,C,A,C), + SHUFFLE4(B,A,C,C), + SHUFFLE4(C,B,A,C) }; + + /* The vertex_header pointers, packed for easy shuffling later */ + const vec_uint4 vs = {(unsigned)v0, (unsigned)v1, (unsigned)v2}; + + /* Collate y values into two vectors for comparison. + Using only one shuffle constant! ;) */ + const vec_float4 y_02_ = spu_shuffle(v0->data[0], v2->data[0], SHUFFLE4(0,B,b,C)); + const vec_float4 y_10_ = spu_shuffle(v1->data[0], v0->data[0], SHUFFLE4(0,B,b,C)); + const vec_float4 y_012 = spu_shuffle(y_02_, v1->data[0], SHUFFLE4(0,B,b,C)); + const vec_float4 y_120 = spu_shuffle(y_10_, v2->data[0], SHUFFLE4(0,B,b,C)); + + /* Perform comparison: {y0,y1,y2} > {y1,y2,y0} */ + const vec_uint4 compare = spu_cmpgt(y_012, y_120); + /* Compress the result of the comparison into 4 bits */ + const vec_uint4 gather = spu_gather(compare); + /* Subtract one to attain the index into the LUT. Magical. */ + const unsigned int index = spu_extract(gather, 0) - 1; + + /* Load the appropriate pattern and construct the desired vector. */ + setup.vertex_headers = (qword)spu_shuffle(vs, vs, sort_order_patterns[index]); + + /* Using the result of the comparison, set sign. + Very magical. */ + sign = ((si_to_uint(si_cntb((qword)gather)) == 2) ? 1.0f : -1.0f); + } + + /* Check if triangle is completely outside the tile bounds */ + if (spu_extract(setup.vmin->data[0], 1) > setup.cliprect_maxy) + return FALSE; + if (spu_extract(setup.vmax->data[0], 1) < setup.cliprect_miny) + return FALSE; + if (spu_extract(setup.vmin->data[0], 0) < setup.cliprect_minx && + spu_extract(setup.vmid->data[0], 0) < setup.cliprect_minx && + spu_extract(setup.vmax->data[0], 0) < setup.cliprect_minx) + return FALSE; + if (spu_extract(setup.vmin->data[0], 0) > setup.cliprect_maxx && + spu_extract(setup.vmid->data[0], 0) > setup.cliprect_maxx && + spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx) + return FALSE; + + setup.ebot.ds = spu_sub(setup.vmid->data[0], setup.vmin->data[0]); + setup.emaj.ds = spu_sub(setup.vmax->data[0], setup.vmin->data[0]); + setup.etop.ds = spu_sub(setup.vmax->data[0], setup.vmid->data[0]); + + /* + * Compute triangle's area. Use 1/area to compute partial + * derivatives of attributes later. + */ + area = setup.emaj.dx * setup.ebot.dy - setup.ebot.dx * setup.emaj.dy; + + setup.oneOverArea = 1.0f / area; + + /* The product of area * sign indicates front/back orientation (0/1). + * Just in case someone gets the bright idea of switching the front + * and back constants without noticing that we're assuming their + * values in this operation, also assert that the values are + * what we think they are. + */ + ASSERT(CELL_FACING_FRONT == 0); + ASSERT(CELL_FACING_BACK == 1); + setup.facing = (area * sign > 0.0f) + ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW); + + return TRUE; +} + + +/** + * Compute a0 for a constant-valued coefficient (GL_FLAT shading). + * The value value comes from vertex->data[slot]. + * The result will be put into setup.coef[slot].a0. + * \param slot which attribute slot + */ +static INLINE void +const_coeff4(uint slot) +{ + setup.coef[slot].dadx = (vector float) {0.0, 0.0, 0.0, 0.0}; + setup.coef[slot].dady = (vector float) {0.0, 0.0, 0.0, 0.0}; + setup.coef[slot].a0 = setup.vprovoke->data[slot]; +} + + +/** + * As above, but interp setup all four vector components. + */ +static INLINE void +tri_linear_coeff4(uint slot) +{ + const vector float vmin_d = setup.vmin->data[slot]; + const vector float vmid_d = setup.vmid->data[slot]; + const vector float vmax_d = setup.vmax->data[slot]; + const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f); + const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f); + + vector float botda = vmid_d - vmin_d; + vector float majda = vmax_d - vmin_d; + + vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda), + spu_mul(botda, spu_splats(setup.emaj.dy))); + vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda), + spu_mul(majda, spu_splats(setup.ebot.dx))); + + setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea)); + setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea)); + + vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx); + vector float tempy = spu_mul(setup.coef[slot].dady, yyyy); + + setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy)); +} + + +/** + * Compute a0, dadx and dady for a perspective-corrected interpolant, + * for a triangle. + * We basically multiply the vertex value by 1/w before computing + * the plane coefficients (a0, dadx, dady). + * Later, when we compute the value at a particular fragment position we'll + * divide the interpolated value by the interpolated W at that fragment. + */ +static void +tri_persp_coeff4(uint slot) +{ + const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f); + const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f); + + const vector float vmin_w = spu_splats(spu_extract(setup.vmin->data[0], 3)); + const vector float vmid_w = spu_splats(spu_extract(setup.vmid->data[0], 3)); + const vector float vmax_w = spu_splats(spu_extract(setup.vmax->data[0], 3)); + + vector float vmin_d = setup.vmin->data[slot]; + vector float vmid_d = setup.vmid->data[slot]; + vector float vmax_d = setup.vmax->data[slot]; + + vmin_d = spu_mul(vmin_d, vmin_w); + vmid_d = spu_mul(vmid_d, vmid_w); + vmax_d = spu_mul(vmax_d, vmax_w); + + vector float botda = vmid_d - vmin_d; + vector float majda = vmax_d - vmin_d; + + vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda), + spu_mul(botda, spu_splats(setup.emaj.dy))); + vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda), + spu_mul(majda, spu_splats(setup.ebot.dx))); + + setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea)); + setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea)); + + vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx); + vector float tempy = spu_mul(setup.coef[slot].dady, yyyy); + + setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy)); +} + + + +/** + * Compute the setup.coef[] array dadx, dady, a0 values. + * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized. + */ +static void +setup_tri_coefficients(void) +{ + uint i; + + for (i = 0; i < spu.vertex_info.num_attribs; i++) { + switch (spu.vertex_info.attrib[i].interp_mode) { + case INTERP_NONE: + break; + case INTERP_CONSTANT: + const_coeff4(i); + break; + case INTERP_POS: + /* fall-through */ + case INTERP_LINEAR: + tri_linear_coeff4(i); + break; + case INTERP_PERSPECTIVE: + tri_persp_coeff4(i); + break; + default: + ASSERT(0); + } + } +} + + +static void +setup_tri_edges(void) +{ + float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f; + float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f; + + float vmin_y = spu_extract(setup.vmin->data[0], 1) - 0.5f; + float vmid_y = spu_extract(setup.vmid->data[0], 1) - 0.5f; + float vmax_y = spu_extract(setup.vmax->data[0], 1) - 0.5f; + + setup.emaj.sy = CEILF(vmin_y); + setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy); + setup.emaj.dxdy = setup.emaj.dx / setup.emaj.dy; + setup.emaj.sx = vmin_x + (setup.emaj.sy - vmin_y) * setup.emaj.dxdy; + + setup.etop.sy = CEILF(vmid_y); + setup.etop.lines = (int) CEILF(vmax_y - setup.etop.sy); + setup.etop.dxdy = setup.etop.dx / setup.etop.dy; + setup.etop.sx = vmid_x + (setup.etop.sy - vmid_y) * setup.etop.dxdy; + + setup.ebot.sy = CEILF(vmin_y); + setup.ebot.lines = (int) CEILF(vmid_y - setup.ebot.sy); + setup.ebot.dxdy = setup.ebot.dx / setup.ebot.dy; + setup.ebot.sx = vmin_x + (setup.ebot.sy - vmin_y) * setup.ebot.dxdy; +} + + +/** + * Render the upper or lower half of a triangle. + * Scissoring/cliprect is applied here too. + */ +static void +subtriangle(struct edge *eleft, struct edge *eright, unsigned lines) +{ + const int minx = setup.cliprect_minx; + const int maxx = setup.cliprect_maxx; + const int miny = setup.cliprect_miny; + const int maxy = setup.cliprect_maxy; + int y, start_y, finish_y; + int sy = (int)eleft->sy; + + ASSERT((int)eleft->sy == (int) eright->sy); + + /* clip top/bottom */ + start_y = sy; + finish_y = sy + lines; + + if (start_y < miny) + start_y = miny; + + if (finish_y > maxy) + finish_y = maxy; + + start_y -= sy; + finish_y -= sy; + + /* + _mesa_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y); + */ + + for (y = start_y; y < finish_y; y++) { + + /* avoid accumulating adds as floats don't have the precision to + * accurately iterate large triangle edges that way. luckily we + * can just multiply these days. + * + * this is all drowned out by the attribute interpolation anyway. + */ + int left = (int)(eleft->sx + y * eleft->dxdy); + int right = (int)(eright->sx + y * eright->dxdy); + + /* clip left/right */ + if (left < minx) + left = minx; + if (right > maxx) + right = maxx; + + if (left < right) { + int _y = sy + y; + if (block(_y) != setup.span.y) { + flush_spans(); + setup.span.y = block(_y); + } + + int offset = _y&1; + vec_int4 quad_LlRr = {left, left, right, right}; + /* Store left and right in 0 or 1 row of quad based on offset */ + setup.span.quad = spu_sel(quad_LlRr, setup.span.quad, spu_maskw(5<<offset)); + setup.span.y_flags |= 1<<offset; + } + } + + + /* save the values so that emaj can be restarted: + */ + eleft->sx += lines * eleft->dxdy; + eright->sx += lines * eright->dxdy; + eleft->sy += lines; + eright->sy += lines; +} + + +/** + * Draw triangle into tile at (tx, ty) (tile coords) + * The tile data should have already been fetched. + */ +boolean +tri_draw(const float *v0, const float *v1, const float *v2, + uint tx, uint ty) +{ + setup.tx = tx; + setup.ty = ty; + + /* set clipping bounds to tile bounds */ + setup.cliprect_minx = tx * TILE_SIZE; + setup.cliprect_miny = ty * TILE_SIZE; + setup.cliprect_maxx = (tx + 1) * TILE_SIZE; + setup.cliprect_maxy = (ty + 1) * TILE_SIZE; + + if (!setup_sort_vertices((struct vertex_header *) v0, + (struct vertex_header *) v1, + (struct vertex_header *) v2)) { + return FALSE; /* totally clipped */ + } + + setup_tri_coefficients(); + setup_tri_edges(); + + setup.span.y = 0; + setup.span.y_flags = 0; + /* Zero right elements */ + setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0)); + + if (setup.oneOverArea < 0.0) { + /* emaj on left */ + subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines ); + subtriangle( &setup.emaj, &setup.etop, setup.etop.lines ); + } + else { + /* emaj on right */ + subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines ); + subtriangle( &setup.etop, &setup.emaj, setup.etop.lines ); + } + + flush_spans(); + + return TRUE; +} diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h new file mode 100644 index 00000000000..aa694dd7c93 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_tri.h @@ -0,0 +1,37 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef SPU_TRI_H +#define SPU_TRI_H + + +extern boolean +tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty); + + +#endif /* SPU_TRI_H */ diff --git a/src/gallium/drivers/cell/spu/spu_util.c b/src/gallium/drivers/cell/spu/spu_util.c new file mode 100644 index 00000000000..b8a0d4a265f --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_util.c @@ -0,0 +1,167 @@ + +#include "cell/common.h" +#include "pipe/p_shader_tokens.h" +#include "pipe/p_debug.h" +#include "tgsi/tgsi_parse.h" +//#include "tgsi_build.h" +#include "tgsi/tgsi_util.h" + +unsigned +tgsi_util_get_src_register_swizzle( + const struct tgsi_src_register *reg, + unsigned component ) +{ + switch( component ) { + case 0: + return reg->SwizzleX; + case 1: + return reg->SwizzleY; + case 2: + return reg->SwizzleZ; + case 3: + return reg->SwizzleW; + default: + ASSERT( 0 ); + } + return 0; +} + +unsigned +tgsi_util_get_src_register_extswizzle( + const struct tgsi_src_register_ext_swz *reg, + unsigned component ) +{ + switch( component ) { + case 0: + return reg->ExtSwizzleX; + case 1: + return reg->ExtSwizzleY; + case 2: + return reg->ExtSwizzleZ; + case 3: + return reg->ExtSwizzleW; + default: + ASSERT( 0 ); + } + return 0; +} + +unsigned +tgsi_util_get_full_src_register_extswizzle( + const struct tgsi_full_src_register *reg, + unsigned component ) +{ + unsigned swizzle; + + /* + * First, calculate the extended swizzle for a given channel. This will give + * us either a channel index into the simple swizzle or a constant 1 or 0. + */ + swizzle = tgsi_util_get_src_register_extswizzle( + ®->SrcRegisterExtSwz, + component ); + + ASSERT (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X); + ASSERT (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y); + ASSERT (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z); + ASSERT (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W); + ASSERT (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W); + ASSERT (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W); + + /* + * Second, calculate the simple swizzle for the unswizzled channel index. + * Leave the constants intact, they are not affected by the simple swizzle. + */ + if( swizzle <= TGSI_SWIZZLE_W ) { + swizzle = tgsi_util_get_src_register_swizzle( + ®->SrcRegister, + component ); + } + + return swizzle; +} + +unsigned +tgsi_util_get_src_register_extnegate( + const struct tgsi_src_register_ext_swz *reg, + unsigned component ) +{ + switch( component ) { + case 0: + return reg->NegateX; + case 1: + return reg->NegateY; + case 2: + return reg->NegateZ; + case 3: + return reg->NegateW; + default: + ASSERT( 0 ); + } + return 0; +} + +void +tgsi_util_set_src_register_extnegate( + struct tgsi_src_register_ext_swz *reg, + unsigned negate, + unsigned component ) +{ + switch( component ) { + case 0: + reg->NegateX = negate; + break; + case 1: + reg->NegateY = negate; + break; + case 2: + reg->NegateZ = negate; + break; + case 3: + reg->NegateW = negate; + break; + default: + ASSERT( 0 ); + } +} + +unsigned +tgsi_util_get_full_src_register_sign_mode( + const struct tgsi_full_src_register *reg, + unsigned component ) +{ + unsigned sign_mode; + + if( reg->SrcRegisterExtMod.Absolute ) { + /* Consider only the post-abs negation. */ + + if( reg->SrcRegisterExtMod.Negate ) { + sign_mode = TGSI_UTIL_SIGN_SET; + } + else { + sign_mode = TGSI_UTIL_SIGN_CLEAR; + } + } + else { + /* Accumulate the three negations. */ + + unsigned negate; + + negate = reg->SrcRegister.Negate; + if( tgsi_util_get_src_register_extnegate( ®->SrcRegisterExtSwz, component ) ) { + negate = !negate; + } + if( reg->SrcRegisterExtMod.Negate ) { + negate = !negate; + } + + if( negate ) { + sign_mode = TGSI_UTIL_SIGN_TOGGLE; + } + else { + sign_mode = TGSI_UTIL_SIGN_KEEP; + } + } + + return sign_mode; +} diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c new file mode 100644 index 00000000000..03375d84a57 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c @@ -0,0 +1,145 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <[email protected]> + * Ian Romanick <[email protected]> + */ + +#include "pipe/p_state.h" +#include "pipe/p_shader_tokens.h" +#include "spu_exec.h" +#include "spu_vertex_shader.h" +#include "spu_main.h" +#include "spu_dcache.h" + +typedef void (*spu_fetch_func)(qword *out, const qword *in, + const qword *shuffle_data); + + +static const qword fetch_shuffle_data[5] ALIGN16_ATTRIB = { + /* Shuffle used by CVT_64_FLOAT + */ + { + 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + }, + + /* Shuffle used by CVT_8_USCALED and CVT_8_SSCALED + */ + { + 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, + 0x02, 0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80, + }, + + /* Shuffle used by CVT_16_USCALED and CVT_16_SSCALED + */ + { + 0x00, 0x01, 0x80, 0x80, 0x02, 0x03, 0x80, 0x80, + 0x04, 0x05, 0x80, 0x80, 0x06, 0x07, 0x80, 0x80, + }, + + /* High value shuffle used by trans4x4. + */ + { + 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, + 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17 + }, + + /* Low value shuffle used by trans4x4. + */ + { + 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, + 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F + } +}; + + +/** + * Fetch vertex attributes for 'count' vertices. + */ +static void generic_vertex_fetch(struct spu_vs_context *draw, + struct spu_exec_machine *machine, + const unsigned *elts, + unsigned count) +{ + unsigned nr_attrs = draw->vertex_fetch.nr_attrs; + unsigned attr; + + ASSERT(count <= 4); + +#if DRAW_DBG + printf("SPU: %s count = %u, nr_attrs = %u\n", + __FUNCTION__, count, nr_attrs); +#endif + + /* loop over vertex attributes (vertex shader inputs) + */ + for (attr = 0; attr < nr_attrs; attr++) { + const unsigned pitch = draw->vertex_fetch.pitch[attr]; + const uint64_t src = draw->vertex_fetch.src_ptr[attr]; + const spu_fetch_func fetch = (spu_fetch_func) + (draw->vertex_fetch.code + draw->vertex_fetch.code_offset[attr]); + unsigned i; + unsigned idx; + const unsigned bytes_per_entry = draw->vertex_fetch.size[attr]; + const unsigned quads_per_entry = (bytes_per_entry + 15) / 16; + qword in[2 * 4] ALIGN16_ATTRIB; + + + /* Fetch four attributes for four vertices. + */ + idx = 0; + for (i = 0; i < count; i++) { + const uint64_t addr = src + (elts[i] * pitch); + +#if DRAW_DBG + printf("SPU: fetching = 0x%llx\n", addr); +#endif + + spu_dcache_fetch_unaligned(& in[idx], addr, bytes_per_entry); + idx += quads_per_entry; + } + + /* Be nice and zero out any missing vertices. + */ + (void) memset(& in[idx], 0, (8 - idx) * sizeof(qword)); + + + /* Convert all 4 vertices to vectors of float. + */ + (*fetch)(&machine->Inputs[attr].xyzw[0].q, in, fetch_shuffle_data); + } +} + + +void spu_update_vertex_fetch( struct spu_vs_context *draw ) +{ + draw->vertex_fetch.fetch_func = generic_vertex_fetch; +} diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c new file mode 100644 index 00000000000..fbe5b34d397 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c @@ -0,0 +1,244 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <[email protected]> + * Brian Paul + * Ian Romanick <[email protected]> + */ + +#include <spu_mfcio.h> + +#include "pipe/p_state.h" +#include "pipe/p_shader_tokens.h" +#include "util/u_math.h" +#include "draw/draw_private.h" +#include "draw/draw_context.h" +#include "cell/common.h" +#include "spu_vertex_shader.h" +#include "spu_exec.h" +#include "spu_main.h" + + +#define MAX_VERTEX_SIZE ((2 + PIPE_MAX_SHADER_OUTPUTS) * 4 * sizeof(float)) + + +#define CLIP_RIGHT_BIT 0x01 +#define CLIP_LEFT_BIT 0x02 +#define CLIP_TOP_BIT 0x04 +#define CLIP_BOTTOM_BIT 0x08 +#define CLIP_FAR_BIT 0x10 +#define CLIP_NEAR_BIT 0x20 + + +static INLINE float +dot4(const float *a, const float *b) +{ + return (a[0]*b[0] + + a[1]*b[1] + + a[2]*b[2] + + a[3]*b[3]); +} + +static INLINE unsigned +compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr) +{ + unsigned mask = 0; + unsigned i; + + /* Do the hardwired planes first: + */ + if (-clip[0] + clip[3] < 0) mask |= CLIP_RIGHT_BIT; + if ( clip[0] + clip[3] < 0) mask |= CLIP_LEFT_BIT; + if (-clip[1] + clip[3] < 0) mask |= CLIP_TOP_BIT; + if ( clip[1] + clip[3] < 0) mask |= CLIP_BOTTOM_BIT; + if (-clip[2] + clip[3] < 0) mask |= CLIP_FAR_BIT; + if ( clip[2] + clip[3] < 0) mask |= CLIP_NEAR_BIT; + + /* Followed by any remaining ones: + */ + for (i = 6; i < nr; i++) { + if (dot4(clip, plane[i]) < 0) + mask |= (1<<i); + } + + return mask; +} + + +/** + * Transform vertices with the current vertex program/shader + * Up to four vertices can be shaded at a time. + * \param vbuffer the input vertex data + * \param elts indexes of four input vertices + * \param count number of vertices to shade [1..4] + * \param vOut array of pointers to four output vertices + */ +static void +run_vertex_program(struct spu_vs_context *draw, + unsigned elts[4], unsigned count, + const uint64_t *vOut) +{ + struct spu_exec_machine *machine = &draw->machine; + unsigned int j; + + ALIGN16_DECL(struct spu_exec_vector, inputs, PIPE_MAX_ATTRIBS); + ALIGN16_DECL(struct spu_exec_vector, outputs, PIPE_MAX_ATTRIBS); + const float *scale = draw->viewport.scale; + const float *trans = draw->viewport.translate; + + ASSERT(count <= 4); + + machine->Processor = TGSI_PROCESSOR_VERTEX; + + ASSERT_ALIGN16(draw->constants); + machine->Consts = (float (*)[4]) draw->constants; + + machine->Inputs = ALIGN16_ASSIGN(inputs); + machine->Outputs = ALIGN16_ASSIGN(outputs); + + spu_vertex_fetch( draw, machine, elts, count ); + + /* run shader */ + spu_exec_machine_run( machine ); + + + /* store machine results */ + for (j = 0; j < count; j++) { + unsigned slot; + float x, y, z, w; + unsigned char buffer[sizeof(struct vertex_header) + + MAX_VERTEX_SIZE] ALIGN16_ATTRIB; + struct vertex_header *const tmpOut = + (struct vertex_header *) buffer; + const unsigned vert_size = ROUNDUP16(sizeof(struct vertex_header) + + (sizeof(float) * 4 + * draw->num_vs_outputs)); + + mfc_get(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0); + wait_on_mask(1 << TAG_VERTEX_BUFFER); + + + /* Handle attr[0] (position) specially: + * + * XXX: Computing the clipmask should be done in the vertex + * program as a set of DP4 instructions appended to the + * user-provided code. + */ + x = tmpOut->clip[0] = machine->Outputs[0].xyzw[0].f[j]; + y = tmpOut->clip[1] = machine->Outputs[0].xyzw[1].f[j]; + z = tmpOut->clip[2] = machine->Outputs[0].xyzw[2].f[j]; + w = tmpOut->clip[3] = machine->Outputs[0].xyzw[3].f[j]; + + tmpOut->clipmask = compute_clipmask(tmpOut->clip, draw->plane, + draw->nr_planes); + tmpOut->edgeflag = 1; + + /* divide by w */ + w = 1.0f / w; + x *= w; + y *= w; + z *= w; + + /* Viewport mapping */ + tmpOut->data[0][0] = x * scale[0] + trans[0]; + tmpOut->data[0][1] = y * scale[1] + trans[1]; + tmpOut->data[0][2] = z * scale[2] + trans[2]; + tmpOut->data[0][3] = w; + + /* Remaining attributes are packed into sequential post-transform + * vertex attrib slots. + */ + for (slot = 1; slot < draw->num_vs_outputs; slot++) { + tmpOut->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j]; + tmpOut->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j]; + tmpOut->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j]; + tmpOut->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j]; + } + + mfc_put(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0); + } /* loop over vertices */ +} + + +unsigned char immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32] + ALIGN16_ATTRIB; + + +void +spu_bind_vertex_shader(struct spu_vs_context *draw, + struct cell_shader_info *vs) +{ + const unsigned immediate_addr = vs->immediates; + const unsigned immediate_size = + ROUNDUP16((sizeof(float) * 4 * vs->num_immediates) + + (immediate_addr & 0x0f)); + + + mfc_get(immediates, immediate_addr & ~0x0f, immediate_size, + TAG_VERTEX_BUFFER, 0, 0); + + draw->machine.Instructions = (struct tgsi_full_instruction *) + vs->instructions; + draw->machine.NumInstructions = vs->num_instructions; + + draw->machine.Declarations = (struct tgsi_full_declaration *) + vs->declarations; + draw->machine.NumDeclarations = vs->num_declarations; + + draw->num_vs_outputs = vs->num_outputs; + + /* specify the shader to interpret/execute */ + spu_exec_machine_init(&draw->machine, + PIPE_MAX_SAMPLERS, + NULL /*samplers*/, + PIPE_SHADER_VERTEX); + + wait_on_mask(1 << TAG_VERTEX_BUFFER); + + (void) memcpy(& draw->machine.Imms, &immediates[immediate_addr & 0x0f], + sizeof(float) * 4 * vs->num_immediates); +} + + +void +spu_execute_vertex_shader(struct spu_vs_context *draw, + const struct cell_command_vs *vs) +{ + unsigned i; + + (void) memcpy(draw->plane, vs->plane, sizeof(float) * 4 * vs->nr_planes); + draw->nr_planes = vs->nr_planes; + draw->vertex_fetch.nr_attrs = vs->nr_attrs; + + for (i = 0; i < vs->num_elts; i += 4) { + const unsigned batch_size = MIN2(vs->num_elts - i, 4); + + run_vertex_program(draw, & vs->elts[i], batch_size, &vs->vOut[i]); + } +} diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.h b/src/gallium/drivers/cell/spu/spu_vertex_shader.h new file mode 100644 index 00000000000..4c74f5e74d5 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.h @@ -0,0 +1,66 @@ +#ifndef SPU_VERTEX_SHADER_H +#define SPU_VERTEX_SHADER_H + +#include "cell/common.h" +#include "pipe/p_format.h" +#include "spu_exec.h" + +struct spu_vs_context; + +typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw, + struct spu_exec_machine *machine, + const unsigned *elts, + unsigned count ); + +struct spu_vs_context { + struct pipe_viewport_state viewport; + + struct { + uint64_t src_ptr[PIPE_MAX_ATTRIBS]; + unsigned pitch[PIPE_MAX_ATTRIBS]; + unsigned size[PIPE_MAX_ATTRIBS]; + unsigned code_offset[PIPE_MAX_ATTRIBS]; + unsigned nr_attrs; + boolean dirty; + + spu_full_fetch_func fetch_func; + void *code; + } vertex_fetch; + + /* Clip derived state: + */ + float plane[12][4]; + unsigned nr_planes; + + struct spu_exec_machine machine; + const float (*constants)[4]; + + unsigned num_vs_outputs; +}; + +extern void spu_update_vertex_fetch(struct spu_vs_context *draw); + +static INLINE void spu_vertex_fetch(struct spu_vs_context *draw, + struct spu_exec_machine *machine, + const unsigned *elts, + unsigned count) +{ + if (draw->vertex_fetch.dirty) { + spu_update_vertex_fetch(draw); + draw->vertex_fetch.dirty = 0; + } + + (*draw->vertex_fetch.fetch_func)(draw, machine, elts, count); +} + +struct cell_command_vs; + +extern void +spu_bind_vertex_shader(struct spu_vs_context *draw, + struct cell_shader_info *vs); + +extern void +spu_execute_vertex_shader(struct spu_vs_context *draw, + const struct cell_command_vs *vs); + +#endif /* SPU_VERTEX_SHADER_H */ |