diff options
author | Matt Turner <[email protected]> | 2017-07-11 22:27:34 +0100 |
---|---|---|
committer | Kenneth Graunke <[email protected]> | 2017-07-22 19:34:42 -0700 |
commit | f37ede40badb7a654dc423e29d45c1e2be2d49ce (patch) | |
tree | 710aeee54f25b3f9b28b79b35d33bf003f335295 /src/mesa/drivers/dri/i965/brw_bufmgr.c | |
parent | bdae2ddff89004c199b71cb6a4a306dee616f7f9 (diff) |
i965/bufmgr: Use write-combine mappings where available
Write-combine mappings give much better performance on writes than
uncached access through the GTT.
Improves performance of GFXBench 4's gl_driver2 benchmark at 1024x768
on Apollolake by 3.6086% +/- 0.674193% (n=15).
v2: (by Ken) Rebase on lockless mappings, map_count deletion, valgrind
updates, potential for CPU/WC maps failing, and other changes.
v3: (by Ken and Chris Wilson)
(Ken): Rebase on set_domain -> gem_wait
(Chris): Fix up a failed CPU/WC mmapping with a GTT mapping
Not all objects will be mappable for direct access by the CPU
(either using WB/CPU or WC paths), for example, a dmabuf wrapping an
object on a foreign device or an object wrapping access to stolen
memory. Since either the physical pages are not known or even do not
exist, we need to use the mediated, indirect access via the GTT. (If
one day, the kernel does suddenly start providing mediated access
via a regular WB/WC mmapping, we no longer need the fallback.)
v4: Avoid falling back for MAP_RAW (Chris).
Reviewed-by: Kenneth Graunke <[email protected]>
Diffstat (limited to 'src/mesa/drivers/dri/i965/brw_bufmgr.c')
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_bufmgr.c | 91 |
1 files changed, 88 insertions, 3 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c index 5edf4626b4f..e9e4d3b9884 100644 --- a/src/mesa/drivers/dri/i965/brw_bufmgr.c +++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c @@ -121,6 +121,7 @@ struct brw_bufmgr { struct hash_table *handle_table; bool has_llc:1; + bool has_mmap_wc:1; bool bo_reuse:1; }; @@ -767,6 +768,52 @@ brw_bo_map_cpu(struct brw_context *brw, struct brw_bo *bo, unsigned flags) } static void * +brw_bo_map_wc(struct brw_context *brw, struct brw_bo *bo, unsigned flags) +{ + struct brw_bufmgr *bufmgr = bo->bufmgr; + + if (!bufmgr->has_mmap_wc) + return NULL; + + if (!bo->map_wc) { + struct drm_i915_gem_mmap mmap_arg; + void *map; + + DBG("brw_bo_map_wc: %d (%s)\n", bo->gem_handle, bo->name); + + memclear(mmap_arg); + mmap_arg.handle = bo->gem_handle; + mmap_arg.size = bo->size; + mmap_arg.flags = I915_MMAP_WC; + int ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg); + if (ret != 0) { + ret = -errno; + DBG("%s:%d: Error mapping buffer %d (%s): %s .\n", + __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno)); + return NULL; + } + + map = (void *) (uintptr_t) mmap_arg.addr_ptr; + VG_DEFINED(map, bo->size); + + if (p_atomic_cmpxchg(&bo->map_wc, NULL, map)) { + VG_NOACCESS(map, bo->size); + drm_munmap(map, bo->size); + } + } + assert(bo->map_wc); + + DBG("brw_bo_map_wc: %d (%s) -> %p\n", bo->gem_handle, bo->name, bo->map_wc); + print_flags(flags); + + if (!(flags & MAP_ASYNC)) { + bo_wait_with_stall_warning(brw, bo, "WC mapping"); + } + + return bo->map_wc; +} + +static void * brw_bo_map_gtt(struct brw_context *brw, struct brw_bo *bo, unsigned flags) { struct brw_bufmgr *bufmgr = bo->bufmgr; @@ -850,10 +897,32 @@ brw_bo_map(struct brw_context *brw, struct brw_bo *bo, unsigned flags) { if (bo->tiling_mode != I915_TILING_NONE && !(flags & MAP_RAW)) return brw_bo_map_gtt(brw, bo, flags); - else if (can_map_cpu(bo, flags)) - return brw_bo_map_cpu(brw, bo, flags); + + void *map; + + if 
(can_map_cpu(bo, flags)) + map = brw_bo_map_cpu(brw, bo, flags); else - return brw_bo_map_gtt(brw, bo, flags); + map = brw_bo_map_wc(brw, bo, flags); + + /* Allow the attempt to fail by falling back to the GTT where necessary. + * + * Not every buffer can be mmaped directly using the CPU (or WC), for + * example buffers that wrap stolen memory or are imported from other + * devices. For those, we have little choice but to use a GTT mmapping. + * However, if we use a slow GTT mmapping for reads where we expected fast + * access, that order of magnitude difference in throughput will be clearly + * expressed by angry users. + * + * We skip MAP_RAW because we want to avoid map_gtt's fence detiling. + */ + if (!map && !(flags & MAP_RAW)) { + perf_debug("Fallback GTT mapping for %s with access flags %x\n", + bo->name, flags); + map = brw_bo_map_gtt(brw, bo, flags); + } + + return map; } int @@ -1211,6 +1280,21 @@ brw_reg_read(struct brw_bufmgr *bufmgr, uint32_t offset, uint64_t *result) return ret; } +static int +gem_param(int fd, int name) +{ + drm_i915_getparam_t gp; + int v = -1; /* No param uses (yet) the sign bit, reserve it for errors */ + + memset(&gp, 0, sizeof(gp)); + gp.param = name; + gp.value = &v; + if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp)) + return -1; + + return v; +} + /** * Initializes the GEM buffer manager, which uses the kernel to allocate, map, * and manage map buffer objections. @@ -1243,6 +1327,7 @@ brw_bufmgr_init(struct gen_device_info *devinfo, int fd, int batch_size) } bufmgr->has_llc = devinfo->has_llc; + bufmgr->has_mmap_wc = gem_param(fd, I915_PARAM_MMAP_VERSION) > 0; init_cache_buckets(bufmgr); |