author     Eric Anholt <[email protected]>    2013-01-23 17:05:10 -0800
committer  Eric Anholt <[email protected]>    2014-03-26 13:13:26 -0700
commit     3b579882903c577daa1af286a5e0bf5bc122a34d (patch)
tree       79013294b0c19a6d6ad98f6c4a3307bfce3a9bfa /src/mesa/drivers/dri/i965/intel_upload.c
parent     b1909b260f6c3855c8214319c602fc7adea7faf9 (diff)
i965: Massively simplify the intel_upload implementation.
The implementation kept a page-sized area for uploading data, and uploaded
chunks from that to a 64kb-sized streamed buffer.  This wasted cache footprint
(and extra state tracking to do so) when we want to just write our data into
the buffer immediately.

Instead, build it around an interface like brw_state_batch() that just gets
you a pointer to BO memory to upload your stuff immediately.

Improves OpenArena on HSW by 1.62209% +/- 0.355299% (n=61) and on BYT by
1.7916% +/- 0.415743% (n=31).

v2: Rebase on Mesa master, drop old prototypes.  Re-do performance comparison
    on a kernel that doesn't punish CPU efficiency improvements.

Reviewed-by: Kenneth Graunke <[email protected]>
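To make the new model concrete, here is a hypothetical caller sketch (not part
of this commit; stream_quad_indices() and its parameters are invented for
illustration, and the usual i965 driver headers are assumed).  It asks
intel_upload_space() for a pointer into the shared upload BO and writes index
data into it directly, instead of staging the data in a CPU-side buffer first:

/* Hypothetical sketch (not from this commit): build an index buffer
 * directly in upload space.  Assumes the i965 driver headers; the
 * caller-owned ib_bo/ib_offset start out as NULL/0.
 */
static void
stream_quad_indices(struct brw_context *brw, uint32_t num_quads,
                    drm_intel_bo **ib_bo, uint32_t *ib_offset)
{
   uint32_t size = num_quads * 6 * sizeof(uint16_t);
   uint16_t *map = intel_upload_space(brw, size, sizeof(uint16_t),
                                      ib_bo, ib_offset);

   /* Write straight into the CPU-mapped upload BO; the pointer is only
    * valid until the next intel_upload_space()/intel_upload_data() or
    * batch flush.
    */
   for (uint32_t q = 0; q < num_quads; q++) {
      uint16_t v = q * 4;
      *map++ = v;  *map++ = v + 1;  *map++ = v + 2;
      *map++ = v;  *map++ = v + 2;  *map++ = v + 3;
   }
}

On return, *ib_bo holds a reference to the shared upload BO and *ib_offset is
where the indices landed within it.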
Diffstat (limited to 'src/mesa/drivers/dri/i965/intel_upload.c')
-rw-r--r--  src/mesa/drivers/dri/i965/intel_upload.c | 167
1 file changed, 62 insertions(+), 105 deletions(-)
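For context when reading the diff below, the per-context upload bookkeeping
shrinks to roughly the two fields sketched here; the field names are taken
from the diff, while the surrounding declaration (which actually lives in
brw_context.h, not in this file) is only illustrative.  The old
buffer/buffer_offset/buffer_len staging fields go away entirely:

/* Illustrative sketch of the upload state the new code relies on; the
 * real declaration is in brw_context.h. */
struct {
   drm_intel_bo *bo;        /* shared streamed-upload BO, kept CPU-mapped */
   uint32_t next_offset;    /* next unallocated byte within the BO */
} upload;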
diff --git a/src/mesa/drivers/dri/i965/intel_upload.c b/src/mesa/drivers/dri/i965/intel_upload.c
index ec3109bd441..bb3f615d987 100644
--- a/src/mesa/drivers/dri/i965/intel_upload.c
+++ b/src/mesa/drivers/dri/i965/intel_upload.c
@@ -57,127 +57,84 @@ intel_upload_finish(struct brw_context *brw)
    if (!brw->upload.bo)
       return;
 
-   if (brw->upload.buffer_len) {
-      drm_intel_bo_subdata(brw->upload.bo,
-                           brw->upload.buffer_offset,
-                           brw->upload.buffer_len,
-                           brw->upload.buffer);
-      brw->upload.buffer_len = 0;
-   }
-
+   drm_intel_bo_unmap(brw->upload.bo);
    drm_intel_bo_unreference(brw->upload.bo);
    brw->upload.bo = NULL;
+   brw->upload.next_offset = 0;
 }
 
-static void
-wrap_buffers(struct brw_context *brw, GLuint size)
-{
-   intel_upload_finish(brw);
-
-   if (size < INTEL_UPLOAD_SIZE)
-      size = INTEL_UPLOAD_SIZE;
-
-   brw->upload.bo = drm_intel_bo_alloc(brw->bufmgr, "upload", size, 0);
-   brw->upload.offset = 0;
-}
-
-void
-intel_upload_data(struct brw_context *brw,
-                  const void *ptr, GLuint size, GLuint align,
-                  drm_intel_bo **return_bo,
-                  GLuint *return_offset)
-{
-   GLuint base, delta;
-
-   base = ALIGN_NPOT(brw->upload.offset, align);
-   if (brw->upload.bo == NULL || base + size > brw->upload.bo->size) {
-      wrap_buffers(brw, size);
-      base = 0;
-   }
-
-   drm_intel_bo_reference(brw->upload.bo);
-   *return_bo = brw->upload.bo;
-   *return_offset = base;
-
-   delta = base - brw->upload.offset;
-   if (brw->upload.buffer_len &&
-       brw->upload.buffer_len + delta + size > sizeof(brw->upload.buffer)) {
-      drm_intel_bo_subdata(brw->upload.bo,
-                           brw->upload.buffer_offset,
-                           brw->upload.buffer_len,
-                           brw->upload.buffer);
-      brw->upload.buffer_len = 0;
-   }
-
-   if (size < sizeof(brw->upload.buffer)) {
-      if (brw->upload.buffer_len == 0)
-         brw->upload.buffer_offset = base;
-      else
-         brw->upload.buffer_len += delta;
-
-      memcpy(brw->upload.buffer + brw->upload.buffer_len, ptr, size);
-      brw->upload.buffer_len += size;
-   } else {
-      drm_intel_bo_subdata(brw->upload.bo, base, size, ptr);
-   }
-
-   brw->upload.offset = base + size;
-}
-
+/**
+ * Interface for getting memory for uploading streamed data to the GPU
+ *
+ * In most cases, streamed data (for GPU state structures, for example) is
+ * uploaded through brw_state_batch(), since that interface allows relocations
+ * from the streamed space returned to other BOs.  However, that interface has
+ * the restriction that the amount of space allocated has to be "small" (see
+ * estimated_max_prim_size in brw_draw.c).
+ *
+ * This interface, on the other hand, is able to handle arbitrary sized
+ * allocation requests, though it will batch small allocations into the same
+ * BO for efficiency and reduced memory footprint.
+ *
+ * \note The returned pointer is valid only until intel_upload_finish(), which
+ * will happen at batch flush or the next
+ * intel_upload_space()/intel_upload_data().
+ *
+ * \param out_bo Pointer to a BO, which must point to a valid BO or NULL on
+ * entry, and will have a reference to the new BO containing the state on
+ * return.
+ *
+ * \param out_offset Offset within the buffer object that the data will land.
+ */
 void *
-intel_upload_map(struct brw_context *brw, GLuint size, GLuint align)
+intel_upload_space(struct brw_context *brw,
+                   uint32_t size,
+                   uint32_t alignment,
+                   drm_intel_bo **out_bo,
+                   uint32_t *out_offset)
 {
-   GLuint base, delta;
-   char *ptr;
+   uint32_t offset;
 
-   base = ALIGN_NPOT(brw->upload.offset, align);
-   if (brw->upload.bo == NULL || base + size > brw->upload.bo->size) {
-      wrap_buffers(brw, size);
-      base = 0;
+   offset = ALIGN_NPOT(brw->upload.next_offset, alignment);
+   if (brw->upload.bo && offset + size > brw->upload.bo->size) {
+      intel_upload_finish(brw);
+      offset = 0;
    }
 
-   delta = base - brw->upload.offset;
-   if (brw->upload.buffer_len &&
-       brw->upload.buffer_len + delta + size > sizeof(brw->upload.buffer)) {
-      drm_intel_bo_subdata(brw->upload.bo,
-                           brw->upload.buffer_offset,
-                           brw->upload.buffer_len,
-                           brw->upload.buffer);
-      brw->upload.buffer_len = 0;
+   if (!brw->upload.bo) {
+      brw->upload.bo = drm_intel_bo_alloc(brw->bufmgr, "streamed data",
+                                          MAX2(INTEL_UPLOAD_SIZE, size), 4096);
+      if (brw->has_llc)
+         drm_intel_bo_map(brw->upload.bo, true);
+      else
+         drm_intel_gem_bo_map_gtt(brw->upload.bo);
    }
 
-   if (size <= sizeof(brw->upload.buffer)) {
-      if (brw->upload.buffer_len == 0)
-         brw->upload.buffer_offset = base;
-      else
-         brw->upload.buffer_len += delta;
+   brw->upload.next_offset = offset + size;
 
-      ptr = brw->upload.buffer + brw->upload.buffer_len;
-      brw->upload.buffer_len += size;
-   } else {
-      ptr = malloc(size);
+   *out_offset = offset;
+   if (*out_bo != brw->upload.bo) {
+      drm_intel_bo_unreference(*out_bo);
+      *out_bo = brw->upload.bo;
+      drm_intel_bo_reference(brw->upload.bo);
    }
 
-   return ptr;
+   return brw->upload.bo->virtual + offset;
 }
 
+/**
+ * Handy interface to upload some data to temporary GPU memory quickly.
+ *
+ * References to this memory should not be retained across batch flushes.
+ */
 void
-intel_upload_unmap(struct brw_context *brw,
-                   const void *ptr, GLuint size, GLuint align,
-                   drm_intel_bo **return_bo,
-                   GLuint *return_offset)
+intel_upload_data(struct brw_context *brw,
+                  const void *data,
+                  uint32_t size,
+                  uint32_t alignment,
+                  drm_intel_bo **out_bo,
+                  uint32_t *out_offset)
 {
-   GLuint base;
-
-   base = ALIGN_NPOT(brw->upload.offset, align);
-   if (size > sizeof(brw->upload.buffer)) {
-      drm_intel_bo_subdata(brw->upload.bo, base, size, ptr);
-      free((void*)ptr);
-   }
-
-   drm_intel_bo_reference(brw->upload.bo);
-   *return_bo = brw->upload.bo;
-   *return_offset = base;
-
-   brw->upload.offset = base + size;
+   void *dst = intel_upload_space(brw, size, alignment, out_bo, out_offset);
+   memcpy(dst, data, size);
}
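
For completeness, a hypothetical use of the memcpy() convenience wrapper
(upload_constants() and its 64-byte alignment choice are invented for
illustration; the i965 driver headers are assumed).  Per the \param out_bo
comment above, the BO pointer handed in must be NULL or a valid BO, since
intel_upload_space() unreferences it when it gets replaced:

/* Hypothetical sketch: upload a block of constant data with the
 * one-call wrapper.  Assumes the i965 driver headers. */
static void
upload_constants(struct brw_context *brw, const void *data, uint32_t size)
{
   drm_intel_bo *bo = NULL;   /* must be NULL or a valid BO on entry */
   uint32_t offset = 0;

   intel_upload_data(brw, data, size, 64, &bo, &offset);

   /* bo now holds a reference to the shared upload BO and offset is
    * where the data landed; drop the reference once the relocation
    * pointing at it has been emitted.
    */
   drm_intel_bo_unreference(bo);
}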