author     Lionel Landwerlin <[email protected]>   2018-02-21 19:15:46 +0000
committer  Lionel Landwerlin <[email protected]>   2018-03-22 20:14:22 +0000
commit     57a11550bc6195c404496e4278920ea63a343f08 (patch)
tree       86e49b4c8d3677007c85aad8ea98dd762ef81e66 /src/mesa/drivers
parent     c1900f5b0fb7a6f22a13f67e2645f3754b5df245 (diff)
i965: perf: query topology
With the introduction of asymmetric slices in CNL, we cannot rely on
the previous SUBSLICE_MASK getparam to tell userspace what subslices
are available.
We introduce a new uAPI in the kernel driver to report exactly which
parts of the GPU are fused off, and require this to be available on
Gen10+. Prior generations can continue to rely on GETPARAM on older
kernels.
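The query uAPI follows i915's two-step ioctl pattern: a first
DRM_IOCTL_I915_QUERY call returns the required buffer size in
item.length, and a second call fills the buffer. The following is a
minimal standalone sketch of that pattern, mirroring query_topology()
from the diff below; it assumes an already-open DRM fd, get_topology is
a hypothetical helper name, and error handling is trimmed.

#include <stdint.h>
#include <stdlib.h>
#include <xf86drm.h>
#include <drm/i915_drm.h>

static struct drm_i915_query_topology_info *
get_topology(int fd)
{
   struct drm_i915_query_item item = {
      .query_id = DRM_I915_QUERY_TOPOLOGY_INFO,
   };
   struct drm_i915_query query = {
      .num_items = 1,
      .items_ptr = (uintptr_t) &item,
   };

   /* First call: the kernel fills item.length with the required size. */
   if (drmIoctl(fd, DRM_IOCTL_I915_QUERY, &query) || item.length <= 0)
      return NULL;

   struct drm_i915_query_topology_info *topo = calloc(1, item.length);
   item.data_ptr = (uintptr_t) topo;

   /* Second call: the kernel copies the fused topology into our buffer. */
   if (drmIoctl(fd, DRM_IOCTL_I915_QUERY, &query)) {
      free(topo);
      return NULL;
   }

   return topo;
}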
This patch is quite a lot of code because we have to support lots of
different kernel versions, ranging from not providing any information
(for Haswell on 4.13 through 4.17), to being able to query through
GETPARAM (for gen8/9 on 4.13 through 4.17), to finally requiring 4.17
for Gen10+.
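Condensed from init_oa_sys_vars() in the diff below, the fallback chain
reads:

   if (!query_topology(brw)) {
      /* No i915 query uAPI (pre-4.17): fatal on Gen10+, which requires it. */
      if (devinfo->gen >= 10)
         return false;

      if (!getparam_topology(brw)) {
         /* No SLICE_MASK/SUBSLICE_MASK GETPARAM (pre-4.13): fatal on gen8/9. */
         if (devinfo->gen >= 8)
            return false;

         /* Haswell: fall back to the static gen_device_info values. */
      }
   }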
This change stores topology information in a unified way on
brw_context.topology from the various kernel APIs, and then generates
the appropriate values for the equations from that unified topology.
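As a worked example of the packing used for the $SubsliceMask equation
input (see compute_topology_builtins() in the diff below): subslice bits
are grouped 3 per slice, i.e. bit (s * 3 + ss) is set when subslice ss
of slice s is available. For a hypothetical asymmetric part where slice
0 exposes subslices {0,1,2} and slice 1 exposes only {0,1}:

   slice 0: bits 0-2 set   (1 << (0*3 + ss), ss in {0,1,2})
   slice 1: bits 3-4 set   (1 << (1*3 + ss), ss in {0,1})
   $SubsliceMask = 0b011111 = 0x1f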
v2: Move slice/subslice masks fields to gen_device_info (Rafael)
v3: Add a gen_device_info_subslice_available() helper (Lionel)
Signed-off-by: Lionel Landwerlin <[email protected]>
Acked-by: Rafael Antognolli <[email protected]>
Reviewed-by: Kenneth Graunke <[email protected]>
Diffstat (limited to 'src/mesa/drivers')
-rw-r--r--  src/mesa/drivers/dri/i965/brw_performance_query.c | 181
1 file changed, 110 insertions, 71 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c
index 12f797c1297..cca74001f19 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_query.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_query.c
@@ -1911,6 +1911,100 @@ init_oa_configs(struct brw_context *brw)
 }
 
 static bool
+query_topology(struct brw_context *brw)
+{
+   __DRIscreen *screen = brw->screen->driScrnPriv;
+   struct drm_i915_query_item item = {
+      .query_id = DRM_I915_QUERY_TOPOLOGY_INFO,
+   };
+   struct drm_i915_query query = {
+      .num_items = 1,
+      .items_ptr = (uintptr_t) &item,
+   };
+
+   if (drmIoctl(screen->fd, DRM_IOCTL_I915_QUERY, &query))
+      return false;
+
+   struct drm_i915_query_topology_info *topo_info =
+      (struct drm_i915_query_topology_info *) calloc(1, item.length);
+   item.data_ptr = (uintptr_t) topo_info;
+
+   if (drmIoctl(screen->fd, DRM_IOCTL_I915_QUERY, &query) ||
+       item.length <= 0)
+      return false;
+
+   gen_device_info_update_from_topology(&brw->screen->devinfo,
+                                        topo_info);
+
+   free(topo_info);
+
+   return true;
+}
+
+static bool
+getparam_topology(struct brw_context *brw)
+{
+   __DRIscreen *screen = brw->screen->driScrnPriv;
+   drm_i915_getparam_t gp;
+   int ret;
+
+   int slice_mask = 0;
+   gp.param = I915_PARAM_SLICE_MASK;
+   gp.value = &slice_mask;
+   ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
+   if (ret)
+      return false;
+
+   int subslice_mask = 0;
+   gp.param = I915_PARAM_SUBSLICE_MASK;
+   gp.value = &subslice_mask;
+   ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
+   if (ret)
+      return false;
+
+   gen_device_info_update_from_masks(&brw->screen->devinfo,
+                                     slice_mask,
+                                     subslice_mask,
+                                     brw->screen->eu_total);
+
+   return true;
+}
+
+static void
+compute_topology_builtins(struct brw_context *brw)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+
+   brw->perfquery.sys_vars.slice_mask = devinfo->slice_masks;
+   brw->perfquery.sys_vars.n_eu_slices = devinfo->num_slices;
+
+   for (int i = 0; i < sizeof(devinfo->subslice_masks[i]); i++) {
+      brw->perfquery.sys_vars.n_eu_sub_slices +=
+         _mesa_bitcount(devinfo->subslice_masks[i]);
+   }
+
+   for (int i = 0; i < sizeof(devinfo->eu_masks); i++)
+      brw->perfquery.sys_vars.n_eus += _mesa_bitcount(devinfo->eu_masks[i]);
+
+   brw->perfquery.sys_vars.eu_threads_count =
+      brw->perfquery.sys_vars.n_eus * devinfo->num_thread_per_eu;
+
+   /* At the moment the subslice mask builtin has groups of 3bits for each
+    * slice.
+    *
+    * Ideally equations would be updated to have a slice/subslice query
+    * function/operator.
+    */
+   brw->perfquery.sys_vars.subslice_mask = 0;
+   for (int s = 0; s < util_last_bit(devinfo->slice_masks); s++) {
+      for (int ss = 0; ss < (devinfo->subslice_slice_stride * 8); ss++) {
+         if (gen_device_info_subslice_available(devinfo, s, ss))
+            brw->perfquery.sys_vars.subslice_mask |= 1UL << (s * 3 + ss);
+      }
+   }
+}
+
+static bool
 init_oa_sys_vars(struct brw_context *brw)
 {
    const struct gen_device_info *devinfo = &brw->screen->devinfo;
@@ -1923,83 +2017,28 @@ init_oa_sys_vars(struct brw_context *brw)
    if (!read_sysfs_drm_device_file_uint64(brw, "gt_max_freq_mhz",
                                           &max_freq_mhz))
       return false;
 
-   brw->perfquery.sys_vars.gt_min_freq = min_freq_mhz * 1000000;
-   brw->perfquery.sys_vars.gt_max_freq = max_freq_mhz * 1000000;
-   brw->perfquery.sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
-
-   brw->perfquery.sys_vars.revision = intel_device_get_revision(screen->fd);
-   brw->perfquery.sys_vars.n_eu_slices = devinfo->num_slices;
-   /* Assuming uniform distribution of subslices per slices. */
-   brw->perfquery.sys_vars.n_eu_sub_slices = devinfo->num_subslices[0];
-
-   if (devinfo->is_haswell) {
-      brw->perfquery.sys_vars.slice_mask = 0;
-      brw->perfquery.sys_vars.subslice_mask = 0;
-
-      for (int s = 0; s < devinfo->num_slices; s++)
-         brw->perfquery.sys_vars.slice_mask |= 1U << s;
-      for (int ss = 0; ss < devinfo->num_subslices[0]; ss++)
-         brw->perfquery.sys_vars.subslice_mask |= 1U << ss;
-
-      if (devinfo->gt == 1) {
-         brw->perfquery.sys_vars.n_eus = 10;
-      } else if (devinfo->gt == 2) {
-         brw->perfquery.sys_vars.n_eus = 20;
-      } else if (devinfo->gt == 3) {
-         brw->perfquery.sys_vars.n_eus = 40;
-      } else
-         unreachable("not reached");
-   } else {
-      drm_i915_getparam_t gp;
-      int ret;
-      int slice_mask = 0;
-      int ss_mask = 0;
-      /* maximum number of slices */
-      int s_max = devinfo->num_slices;
-      /* maximum number of subslices per slice (assuming uniform subslices per
-       * slices)
-       */
-      int ss_max = devinfo->num_subslices[0];
-      uint64_t subslice_mask = 0;
-      int s;
-
-      gp.param = I915_PARAM_SLICE_MASK;
-      gp.value = &slice_mask;
-      ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
-      if (ret)
+   if (!query_topology(brw)) {
+      /* We need the i915 query uAPI on CNL+ (kernel 4.17+). */
+      if (devinfo->gen >= 10)
         return false;
 
-      gp.param = I915_PARAM_SUBSLICE_MASK;
-      gp.value = &ss_mask;
-      ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
-      if (ret)
-         return false;
+      if (!getparam_topology(brw)) {
+         /* We need the SLICE_MASK/SUBSLICE_MASK on gen8+ (kernel 4.13+). */
+         if (devinfo->gen >= 8)
+            return false;
 
-      brw->perfquery.sys_vars.n_eus = brw->screen->eu_total;
-      brw->perfquery.sys_vars.n_eu_slices = __builtin_popcount(slice_mask);
-      brw->perfquery.sys_vars.slice_mask = slice_mask;
-
-      /* Note: the _SUBSLICE_MASK param only reports a global subslice mask
-       * which applies to all slices.
-       *
-       * Note: some of the metrics we have (as described in XML) are
-       * conditional on a $SubsliceMask variable which is expected to also
-       * reflect the slice mask by packing together subslice masks for each
-       * slice in one value..
-       */
-      for (s = 0; s < s_max; s++) {
-         if (slice_mask & (1<<s)) {
-            subslice_mask |= ss_mask << (ss_max * s);
-         }
+         /* On Haswell, the values are already computed for us in
+          * gen_device_info.
+          */
       }
-
-      brw->perfquery.sys_vars.subslice_mask = subslice_mask;
-      brw->perfquery.sys_vars.n_eu_sub_slices =
-         __builtin_popcount(subslice_mask);
    }
 
-   brw->perfquery.sys_vars.eu_threads_count =
-      brw->perfquery.sys_vars.n_eus * devinfo->num_thread_per_eu;
+   memset(&brw->perfquery.sys_vars, 0, sizeof(brw->perfquery.sys_vars));
+   brw->perfquery.sys_vars.gt_min_freq = min_freq_mhz * 1000000;
+   brw->perfquery.sys_vars.gt_max_freq = max_freq_mhz * 1000000;
+   brw->perfquery.sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
+   brw->perfquery.sys_vars.revision = intel_device_get_revision(screen->fd);
+   compute_topology_builtins(brw);
 
    return true;
 }