intel: Improve teximage perf for Google Chrome paint rects (v3)

This patch reduces the time spent in glTexImage and glTexSubImage by over 5x on Sandybridge for the workload described below. It adds a new fast path for glTexImage2D and glTexSubImage2D, intel_texsubimage_tiled_memcpy, which is optimized for Google Chrome's paint rectangles. The fast path is implemented only for 2D GL_BGRA textures for chipsets with an LLC. === Performance Analysis === Workload description: Personalize your google.com page with a wallpaper. Start chromium with flags "--ignore-gpu-blacklist --enable-accelerated-painting --force-compositing-mode". Start recording with chrome://tracing. Visit google.com and wait for page to finish rendering. Measure the time spent by process CrGpuMain in GLES2DecoderImpl::HandleTexImage2D and HandleTexSubImage2D. System config: cpu: Sandybridge Mobile GT2+ (0x0126) kernel 3.4.9 x86_64 chromium 21.0.1180.89 (154005) Statistics: | N Median Avg Stddev --------------|------------------------- before (msec) | 8 472.5 463.75 72.6 after (msec) | 8 78.0 79.6 5.7 Arithmetic difference at 95.0% confidence: -384.1 +/- 55.2 msec -82.8% +/- 11.9% Ratio at 95.0% confidence: 5.81 +/- 0.119 v2: - Replace check for `intel->gen >= 6` with `intel->has_llc`, per danvet. - Fix typo in comment, s/throuh/through/. - Swap 'before' and 'after' rows in stat table. v3: - If the current batch references the bo, then flush batch before mapping the bo. Found by Chris. - Restrict supported texture images to level 0 of target GL_TEXTURE_2D. This avoids an arithmetic bug in calculating image offsets within the miptree, found by Paul. This restriction does not diminish this patch's benefit to Chrome OS performance. - Use fewer instructions for bit6 swizzling, suggested by Paul. - Remove erroneous comment about Y-tiling, for Paul. - Print perf_debug messages when flushing and stalling. - Update stats in commit message; run workload under a release build rather than a debug build. Note: This is a candidate for the 9.0 branch. 
Acked-by: Eric Anholt <[email protected]> CC: Stéphane Marchesin <[email protected]> Signed-off-by: Chad Versace <[email protected]>
author: Chad Versace <[email protected]> 2012-09-04 12:15:29 -0700
committer: Chad Versace <[email protected]> 2012-09-25 10:58:45 -0700
commit: 413c4914129cd26ca87960852d8c0264c0fb29e7 (patch)
tree: 722c87715533b2ac9fe9d57553d766436ff5be5a /src
parent: 581619f5a70c0e2ef6ac6d1e810b4f5a6e6416d4 (diff)
3 files changed, 186 insertions, 0 deletions
diff --git a/src/mesa/drivers/dri/intel/intel_tex.h b/src/mesa/drivers/dri/intel/intel_tex.h
index 88a7d55414a..777574ddd9b 100644
--- a/src/mesa/drivers/dri/intel/intel_tex.h
+++ b/src/mesa/drivers/dri/intel/intel_tex.h
@@ -85,4 +85,15 @@ bool intel_copy_texsubimage(struct intel_context *intel,
                             GLint x, GLint y,
                             GLsizei width, GLsizei height);
 
+bool
+intel_texsubimage_tiled_memcpy(struct gl_context *ctx,
+                               GLuint dims,
+                               struct gl_texture_image *texImage,
+                               GLint xoffset, GLint yoffset, GLint zoffset,
+                               GLsizei width, GLsizei height, GLsizei depth,
+                               GLenum format, GLenum type,
+                               const GLvoid *pixels,
+                               const struct gl_pixelstore_attrib *packing,
+                               bool for_glTexImage);
+
 #endif
diff --git a/src/mesa/drivers/dri/intel/intel_tex_image.c b/src/mesa/drivers/dri/intel/intel_tex_image.c
index a08a5a200b8..7b9638f666a 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_image.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_image.c
@@ -206,10 +206,22 @@ intelTexImage(struct gl_context * ctx,
               GLenum format, GLenum type, const void *pixels,
               const struct gl_pixelstore_attrib *unpack)
 {
+   bool ok;
+
    DBG("%s target %s level %d %dx%dx%d\n", __FUNCTION__,
        _mesa_lookup_enum_by_nr(texImage->TexObject->Target),
        texImage->Level, texImage->Width, texImage->Height, texImage->Depth);
 
+   ok = intel_texsubimage_tiled_memcpy(ctx, dims, texImage,
+                                       0, 0, 0, /*x,y,z offsets*/
+                                       texImage->Width,
+                                       texImage->Height,
+                                       texImage->Depth,
+                                       format, type, pixels, unpack,
+                                       true /*for_glTexImage*/);
+   if (ok)
+      return;
+
    /* Attempt to use the blitter for PBO image uploads.
     */
    if (dims <= 2 &&
diff --git a/src/mesa/drivers/dri/intel/intel_tex_subimage.c b/src/mesa/drivers/dri/intel/intel_tex_subimage.c
index ae4b3bca305..d3a873655fb 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_subimage.c
@@ -26,6 +26,7 @@
  * 
  **************************************************************************/
 
+#include "main/macros.h"
 #include "main/mtypes.h"
 #include "main/pbo.h"
 #include "main/texobj.h"
@@ -33,6 +34,7 @@
 #include "main/texcompress.h"
 #include "main/enums.h"
 
+#include "intel_batchbuffer.h"
 #include "intel_context.h"
 #include "intel_tex.h"
 #include "intel_mipmap_tree.h"
@@ -148,6 +150,157 @@ intel_blit_texsubimage(struct gl_context * ctx,
    return true;
 }
 
+/**
+ * \brief A fast path for glTexImage and glTexSubImage.
+ * \param for_glTexImage Was this called from glTexImage or glTexSubImage?
+ * \return true if the upload was done here; false if the caller must fall
+ *         back to another path (unsupported case or failed bo mapping).
+ *
+ * This fast path is taken when the hardware natively supports the texture
+ * format (such as GL_BGRA) and when the texture memory is X-tiled. It uploads
+ * the texture data by mapping the texture memory without a GTT fence, thus
+ * acquiring a tiled view of the memory, and then memcpy'ing successive
+ * subspans within each tile.
+ *
+ * This is a performance win over the conventional texture upload path because
+ * it avoids the performance penalty of writing through the write-combine
+ * buffer. In the conventional path, texstore.c:store_texsubimage(), the
+ * texture memory is mapped through a GTT fence, thus acquiring a linear view
+ * of the memory, then each row in the image is memcpy'd. Here each row's
+ * memcpy is replaced by a sequence of memcpy's over each bit6 swizzle span.
+ *
+ * This fast path's use case is Google Chrome's paint rectangles.  Chrome (as
+ * of version 21) renders each page as a tiling of 256x256 GL_BGRA textures.
+ * Each page's content is initially uploaded with glTexImage2D and damaged
+ * regions are updated with glTexSubImage2D. On some workloads, the
+ * performance gain of this fastpath on Sandybridge is over 5x.
+ */
+bool
+intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
+                               GLuint dims,
+                               struct gl_texture_image *texImage,
+                               GLint xoffset, GLint yoffset, GLint zoffset,
+                               GLsizei width, GLsizei height, GLsizei depth,
+                               GLenum format, GLenum type,
+                               const GLvoid *pixels,
+                               const struct gl_pixelstore_attrib *packing,
+                               bool for_glTexImage)
+{
+   struct intel_context *intel = intel_context(ctx);
+   struct intel_texture_image *image = intel_texture_image(texImage);
+
+   /* The miptree's buffer object; assigned once the miptree is validated. */
+   drm_intel_bo *bo;
+
+   int error = 0;
+
+   /* This fastpath is restricted to level 0 of a 2D BGRA texture; it could be
+    * generalized by varying the arithmetic loop below. NOTE(review): only
+    * packing->Alignment is checked, yet the loop reads pixels as one tightly
+    * packed pointer; confirm row-length/skip/PBO unpack state is excluded. */
+   if (!intel->has_llc ||
+       format != GL_BGRA ||
+       type != GL_UNSIGNED_BYTE ||
+       texImage->TexObject->Target != GL_TEXTURE_2D ||
+       texImage->Level != 0 ||
+       pixels == NULL ||
+       packing->Alignment > 4)
+      return false;
+
+   if (for_glTexImage)
+      ctx->Driver.AllocTextureImageBuffer(ctx, texImage);
+
+   if (!image->mt ||
+       image->mt->region->tiling != I915_TILING_X) {
+      /* No miptree, or not X-tiled: the loop below only handles X tiling. */
+      return false;
+   }
+
+   bo = image->mt->region->bo;
+
+   if (drm_intel_bo_references(intel->batch.bo, bo)) {
+      perf_debug("Flushing before mapping a referenced bo.\n");
+      intel_batchbuffer_flush(intel);
+   }
+
+   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
+      if (drm_intel_bo_busy(bo)) {
+         perf_debug("Mapping a busy BO, causing a stall on the GPU.\n");
+      }
+   }
+
+   error = drm_intel_bo_map(bo, true /*write_enable*/);
+   if (error || bo->virtual == NULL) {
+      DBG("%s: failed to map bo\n", __FUNCTION__);
+      return false;
+   }
+
+   /* Log only now: every check that can reject the fast path has passed and
+    * the bo is mapped, so the copy below will run.
+    */
+   DBG("%s: level=%d offset=(%d,%d) (w,h)=(%d,%d)\n",
+       __FUNCTION__, texImage->Level, xoffset, yoffset, width, height);
+
+   /* In the tiling algorithm below, some variables are in units of pixels,
+    * others are in units of bytes, and others (such as tile_height, a row
+    * count) are unitless. Most names carry a _pixels or _bytes suffix.
+    */
+
+   const uint32_t x_max_pixels = xoffset + width;
+   const uint32_t y_max_pixels = yoffset + height;
+
+   const uint32_t tile_size_bytes = 4096;
+
+   const uint32_t tile_width_bytes = 512;
+   const uint32_t tile_width_pixels = 128;
+
+   const uint32_t tile_height = 8; /* rows per tile: 4096 / 512 */
+
+   const uint32_t cpp = 4; /* chars per pixel of GL_BGRA */
+   const uint32_t swizzle_width_pixels = 16; /* 64 bytes at cpp 4 */
+
+   const uint32_t stride_bytes = image->mt->region->pitch * cpp;
+   const uint32_t width_tiles = stride_bytes / tile_width_bytes;
+
+   for (uint32_t y_pixels = yoffset; y_pixels < y_max_pixels; ++y_pixels) {
+      const uint32_t y_offset_bytes = (y_pixels / tile_height) * width_tiles * tile_size_bytes
+                                    + (y_pixels % tile_height) * tile_width_bytes;
+
+      for (uint32_t x_pixels = xoffset; x_pixels < x_max_pixels; x_pixels += swizzle_width_pixels) {
+         const uint32_t x_offset_bytes = (x_pixels / tile_width_pixels) * tile_size_bytes
+                                       + (x_pixels % tile_width_pixels) * cpp;
+
+         intptr_t offset_bytes = y_offset_bytes + x_offset_bytes;
+         if (intel->has_swizzling) {
+#if 0
+            /* Clear, unoptimized version (kept disabled for reference). */
+            bool bit6 = (offset_bytes >> 6) & 1;
+            bool bit9 = (offset_bytes >> 9) & 1;
+            bool bit10 = (offset_bytes >> 10) & 1;
+
+            if (bit9 ^ bit10)
+               offset_bytes ^= (1 << 6);
+#else
+            /* Optimized: XOR (bit9 ^ bit10) into bit6 without a branch. */
+            offset_bytes ^= ((offset_bytes >> 3) ^ (offset_bytes >> 4))
+                          & (1 << 6);
+#endif
+         }
+
+         const uint32_t swizzle_bound_pixels = ALIGN(x_pixels + 1, swizzle_width_pixels);
+         const uint32_t memcpy_bound_pixels = MIN2(x_max_pixels, swizzle_bound_pixels);
+         const uint32_t copy_size = cpp * (memcpy_bound_pixels - x_pixels);
+
+         memcpy(bo->virtual + offset_bytes, pixels, copy_size);
+         pixels += copy_size;
+         x_pixels -= (x_pixels % swizzle_width_pixels); /* realign to span start */
+      }
+   }
+
+   drm_intel_bo_unmap(bo);
+   return true;
+}
+
 static void
 intelTexSubImage(struct gl_context * ctx,
                  GLuint dims,
@@ -158,6 +311,16 @@ intelTexSubImage(struct gl_context * ctx,
                  const GLvoid * pixels,
                  const struct gl_pixelstore_attrib *packing)
 {
+   bool ok;
+
+   ok = intel_texsubimage_tiled_memcpy(ctx, dims, texImage,
+                                       xoffset, yoffset, zoffset,
+                                       width, height, depth,
+                                       format, type, pixels, packing,
+                                       false /*for_glTexImage*/);
+   if (ok)
+     return;
+
    /* The intel_blit_texsubimage() function only handles 2D images */
    if (dims != 2 || !intel_blit_texsubimage(ctx, texImage,
 			       xoffset, yoffset,
author	Chad Versace <[email protected]>	2012-09-04 12:15:29 -0700
committer	Chad Versace <[email protected]>	2012-09-25 10:58:45 -0700
commit	413c4914129cd26ca87960852d8c0264c0fb29e7 (patch)
tree	722c87715533b2ac9fe9d57553d766436ff5be5a /src
parent	581619f5a70c0e2ef6ac6d1e810b4f5a6e6416d4 (diff)