8 files changed, 194 insertions, 95 deletions
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index cf892206c66..f32ad5bfbe6 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -93,6 +93,7 @@
 #define CELL_CMD_STATE_BLEND         19
 #define CELL_CMD_STATE_ATTRIB_FETCH  20
 #define CELL_CMD_VS_EXECUTE          21
+#define CELL_CMD_FLUSH_BUFFER_RANGE  22
 
 
 #define CELL_NUM_BUFFERS 4
@@ -144,6 +145,13 @@ struct cell_attribute_fetch_code {
    uint size;
 };
 
+
+struct cell_buffer_range {   /* payload of CELL_CMD_FLUSH_BUFFER_RANGE */
+   uint64_t base;   /**< start address of the range, stored as 64 bits */
+   unsigned size;   /**< extent of the range (presumably bytes - confirm) */
+};
+
+
 struct cell_shader_info
 {
    uint64_t declarations;
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
index f12613649b9..cbd387f0142 100644
--- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
@@ -49,9 +49,12 @@ cell_map_constant_buffers(struct cell_context *sp)
    struct pipe_winsys *ws = sp->pipe.winsys;
    uint i;
    for (i = 0; i < 2; i++) {
-      if (sp->constants[i].size)
+      if (sp->constants[i].size) {
          sp->mapped_constants[i] = ws->buffer_map(ws, sp->constants[i].buffer,
                                                   PIPE_BUFFER_USAGE_CPU_READ);
+         cell_flush_buffer_range(sp, sp->mapped_constants[i], 
+                                 sp->constants[i].buffer->size);
+      }
    }
 
    draw_set_mapped_constant_buffer(sp->draw,
@@ -124,6 +127,7 @@ cell_draw_elements(struct pipe_context *pipe,
          void *buf = pipe->winsys->buffer_map(pipe->winsys,
                                               sp->vertex_buffer[i].buffer,
                                               PIPE_BUFFER_USAGE_CPU_READ);
+	 cell_flush_buffer_range(sp, buf, sp->vertex_buffer[i].buffer->size);
          draw_set_mapped_vertex_buffer(draw, i, buf);
       }
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c
index 20f27531fce..66a5627d844 100644
--- a/src/gallium/drivers/cell/ppu/cell_flush.c
+++ b/src/gallium/drivers/cell/ppu/cell_flush.c
@@ -82,3 +82,17 @@ cell_flush_int(struct pipe_context *pipe, unsigned flags)
 
    flushing = FALSE;
 }
+
+
+/** Append a CELL_CMD_FLUSH_BUFFER_RANGE command for the range (ptr, size). */
+void
+cell_flush_buffer_range(struct cell_context *cell, void *ptr, unsigned size)
+{
+   uint64_t cmd[1 + (ROUNDUP8(sizeof(struct cell_buffer_range)) / 8)];
+   struct cell_buffer_range *const range = (void *) (cmd + 1);
+
+   cmd[0] = CELL_CMD_FLUSH_BUFFER_RANGE;
+   range->size = size;
+   range->base = (uintptr_t) ptr;
+   cell_batch_append(cell, cmd, sizeof(cmd));
+}
diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c
new file mode 100644
index 00000000000..9e30e178804
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_dcache.c
@@ -0,0 +1,100 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "spu_main.h"
+#include "spu_dcache.h"
+
+#define CACHE_NAME            data
+#define CACHED_TYPE           qword
+#define CACHE_TYPE            CACHE_TYPE_RO
+#define CACHE_SET_TAGID(set)  TAG_VERTEX_BUFFER
+#define CACHE_LOG2NNWAY       2   /* 1 << 2 ways, mirrored by CACHE_NWAY */
+#define CACHE_LOG2NSETS       6   /* 1 << 6 sets, mirrored by CACHE_NSETS */
+#include <cache-api.h>
+
+/* Yes folks, this is ugly.  Re-define the cache parameters with plain values
+ * so that spu_dcache_mark_dirty() below can walk all NWAY * NSETS lines. */
+#undef CACHE_NWAY
+#undef CACHE_NSETS
+#define CACHE_NAME            data
+#define CACHE_NWAY            4
+#define CACHE_NSETS           (1U << 6)
+
+
+/**
+ * Fetch \c size bytes starting at the (possibly unaligned) address \c ea.
+ * Exactly ((size + 15) / 16) quadwords are written to \c dst.
+ */
+void
+spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size)
+{
+   const int shift = ea & 0x0f;
+   const unsigned aligned_start_ea = ea & ~0x0f;
+   /* Quadwords to write, and aligned quadwords holding requested bytes. */
+   const unsigned num_dst = (size + 15) / 16;
+   const unsigned num_src = (shift + size + 15) / 16;
+   unsigned i;
+
+   if (shift == 0) {
+      /* Data is already aligned.  Fetch directly into the destination buffer.
+       */
+      for (i = 0; i < num_dst; i++) {
+         dst[i] = cache_rd(data, aligned_start_ea + (i * 16));
+      }
+   } else if (num_dst != 0) {
+      qword tmp[2] ALIGN16_ATTRIB;
+
+      /* tmp[] is a two-entry ring: tmp[i & 1] holds aligned quadword i. */
+      tmp[0] = cache_rd(data, aligned_start_ea);
+      for (i = 0; i < num_dst; i++) {
+         const unsigned curr = i & 1;
+         const unsigned next = curr ^ 1;
+         qword hi = si_il(0);
+
+         /* Read the following quadword only if it holds requested bytes,
+          * addressing it by position (i + 1) rather than by ring slot. */
+         if ((i + 1) < num_src) {
+            tmp[next] = cache_rd(data, aligned_start_ea + ((i + 1) * 16));
+            hi = (qword) spu_rlmaskqwbyte(tmp[next], shift - 16);
+         }
+         dst[i] = si_or((qword) spu_slqwbyte(tmp[curr], shift), hi);
+      }
+   }
+}
+
+
+/** Invalidate cached data.  For now the whole cache is dropped. */
+void
+spu_dcache_mark_dirty(unsigned ea, unsigned size)
+{
+   const unsigned num_lines = CACHE_NWAY * CACHE_NSETS;
+   unsigned line;
+
+   /* The range arguments are deliberately unused. */
+   (void) ea;
+   (void) size;
+   for (line = 0; line < num_lines; line++) {
+      CACHELINE_CLEARVALID(line);
+   }
+}
diff --git a/src/gallium/drivers/cell/spu/spu_dcache.h b/src/gallium/drivers/cell/spu/spu_dcache.h
new file mode 100644
index 00000000000..7a06b8c25af
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_dcache.h
@@ -0,0 +1,34 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef SPU_DCACHE_H
+#define SPU_DCACHE_H
+
+/* Fetch 'size' bytes from the possibly unaligned address 'ea' into 'dst'. */
+extern void
+spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size);
+/* Invalidate cached data; the implementation drops every cache line. */
+extern void
+spu_dcache_mark_dirty(unsigned ea, unsigned size);
+#endif /* SPU_DCACHE_H */
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index 0eb5ea1a3f3..94ac6a28850 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -72,6 +72,7 @@
 #include "spu_exec.h"
 #include "spu_main.h"
 #include "spu_vertex_shader.h"
+#include "spu_dcache.h"
 
 #define TILE_TOP_LEFT     0
 #define TILE_TOP_RIGHT    1
@@ -352,19 +353,17 @@ fetch_src_file_channel(
    case TGSI_EXTSWIZZLE_W:
       switch( file ) {
       case TGSI_FILE_CONSTANT: {
-         unsigned char buffer[32] ALIGN16_ATTRIB;
          unsigned i;
 
          for (i = 0; i < 4; i++) {
             const float *ptr = mach->Consts[index->i[i]];
-            const uint64_t addr = (uint64_t)(uintptr_t) ptr;
-            const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
+            float tmp[4];
 
-            mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
-            wait_on_mask(1 << TAG_VERTEX_BUFFER);
+            spu_dcache_fetch_unaligned((qword *) tmp,
+                                       (uintptr_t)(ptr + swizzle),
+                                       sizeof(float));
 
-            (void) memcpy(& chan->f[i], &buffer[(addr & 0x0f) 
-                + (sizeof(float) * swizzle)], sizeof(float));
+            chan->f[i] = tmp[0];
          }
          break;
       }
@@ -1899,32 +1898,30 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
    /* execute declarations (interpolants) */
    if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
       for (i = 0; i < mach->NumDeclarations; i++) {
-	 uint8_t buffer[sizeof(struct tgsi_full_declaration) + 32] ALIGN16_ATTRIB;
-	 struct tgsi_full_declaration decl;
-	 unsigned long decl_addr = (unsigned long) (mach->Declarations+i);
-	 unsigned size = ((sizeof(decl) + (decl_addr & 0x0f) + 0x0f) & ~0x0f);
+         union {
+            struct tgsi_full_declaration decl;
+            qword buffer[2 * ((sizeof(struct tgsi_full_declaration) + 31) 
+                              / 32)];
+         } d ALIGN16_ATTRIB;   /* fetch buffer overlaid with the declaration */
+         unsigned ea = (unsigned) (mach->Declarations + i); /* i-th decl */
 
-	 mfc_get(buffer, decl_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
-	 wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
+         spu_dcache_fetch_unaligned(d.buffer, ea, sizeof(d.decl));
 
-	 memcpy(& decl, buffer + (decl_addr & 0x0f), sizeof(decl));
-	 exec_declaration( mach, &decl );
+         exec_declaration( mach, &d.decl );
       }
    }
 
    /* execute instructions, until pc is set to -1 */
    while (pc != -1) {
-      uint8_t buffer[sizeof(struct tgsi_full_instruction) + 32] ALIGN16_ATTRIB;
-      struct tgsi_full_instruction inst;
-      unsigned long inst_addr = (unsigned long) (mach->Instructions + pc);
-      unsigned size = ((sizeof(inst) + (inst_addr & 0x0f) + 0x0f) & ~0x0f);
-
-      assert(pc < mach->NumInstructions);
-      mfc_get(buffer, inst_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
-      wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
-
-      memcpy(& inst, buffer + (inst_addr & 0x0f), sizeof(inst));
-      exec_instruction( mach, & inst, &pc );
+      union {
+         struct tgsi_full_instruction inst;
+         qword buffer[2 * ((sizeof(struct tgsi_full_instruction) + 31) 
+                           / 32)];
+      } i ALIGN16_ATTRIB;
+      unsigned ea = (unsigned) (mach->Instructions + pc);
+
+      spu_dcache_fetch_unaligned(i.buffer, ea, sizeof(i.inst));
+      exec_instruction( mach, & i.inst, &pc );
    }
 
 #if 0
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index dbc3705c241..1136dba62d5 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -462,6 +462,14 @@ cmd_batch(uint opcode)
          pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
          break;
       }
+      case CELL_CMD_FLUSH_BUFFER_RANGE: {
+	 struct cell_buffer_range *br = (struct cell_buffer_range *)
+	     &buffer[pos+1];
+
+	 spu_dcache_mark_dirty((unsigned) br->base, br->size);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8);
+	 break;
+      }
       default:
          printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
          ASSERT(0);
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
index e5d9910ff30..f7e4e653e31 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
@@ -40,25 +40,7 @@
 #include "spu_exec.h"
 #include "spu_vertex_shader.h"
 #include "spu_main.h"
-
-#define CACHE_NAME            attribute
-#define CACHED_TYPE           qword
-#define CACHE_TYPE            CACHE_TYPE_RO
-#define CACHE_SET_TAGID(set)  TAG_VERTEX_BUFFER
-#define CACHE_LOG2NNWAY       2
-#define CACHE_LOG2NSETS       6
-#include <cache-api.h>
-
-/* Yes folks, this is ugly.
- */
-#undef CACHE_NWAY
-#undef CACHE_NSETS
-#define CACHE_NAME            attribute
-#define CACHE_NWAY            4
-#define CACHE_NSETS           (1U << 6)
-
-
-#define DRAW_DBG 0
+#include "spu_dcache.h"
 
 typedef void (*spu_fetch_func)(qword *out, const qword *in,
 			       const qword *shuffle_data);
@@ -103,44 +85,6 @@ static const qword fetch_shuffle_data[5] ALIGN16_ATTRIB = {
 
 
 /**
- * Fetch between 1 and 32 bytes from an unaligned address
- */
-static INLINE void
-fetch_unaligned(qword *dst, unsigned ea, unsigned size)
-{
-   qword tmp[4] ALIGN16_ATTRIB;
-   const int shift = ea & 0x0f;
-   const unsigned aligned_start_ea = ea & ~0x0f;
-   const unsigned aligned_end_ea = (ea + size) & ~0x0f;
-   const unsigned num_entries = ((aligned_end_ea - aligned_start_ea) / 16) + 1;
-   unsigned i;
-
-
-   if (shift == 0) {
-      /* Data is already aligned.  Fetch directly into the destination buffer.
-       */
-      for (i = 0; i < num_entries; i++) {
-	 dst[i] = cache_rd(attribute, (ea & ~0x0f) + (i * 16));
-      }
-   } else {
-      /* Fetch data from the cache to the local buffer.
-       */
-      for (i = 0; i < num_entries; i++) {
-	 tmp[i] = cache_rd(attribute, (ea & ~0x0f) + (i * 16));
-      }
-
-
-      /* Fix the alignment of the data and write to the destination buffer.
-       */
-      for (i = 0; i < ((size + 15) / 16); i++) {
-	 dst[i] = si_or((qword) spu_slqwbyte(tmp[i], shift),
-			(qword) spu_rlmaskqwbyte(tmp[i + 1], shift - 16));
-      }
-   }
-}
-
-
-/**
  * Fetch vertex attributes for 'count' vertices.
  */
 static void generic_vertex_fetch(struct spu_vs_context *draw,
@@ -182,7 +126,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
          printf("SPU: fetching = 0x%llx\n", addr);
 #endif
 
-         fetch_unaligned(& in[idx], addr, bytes_per_entry);
+         spu_dcache_fetch_unaligned(& in[idx], addr, bytes_per_entry);
          idx += quads_per_entry;
       }
 
@@ -200,15 +144,5 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
 
 void spu_update_vertex_fetch( struct spu_vs_context *draw )
 {
-   unsigned i;
-
-   
-   /* Invalidate the vertex cache.
-    */
-   for (i = 0; i < (CACHE_NWAY * CACHE_NSETS); i++) {
-      CACHELINE_CLEARVALID(i);
-   }
-
-
    draw->vertex_fetch.fetch_func = generic_vertex_fetch;
 }