| | | |
|---|---|---|
| author | Lionel Landwerlin <[email protected]> | 2018-02-21 19:15:46 +0000 |
| committer | Lionel Landwerlin <[email protected]> | 2018-03-22 20:14:22 +0000 |
| commit | 57a11550bc6195c404496e4278920ea63a343f08 (patch) | |
| tree | 86e49b4c8d3677007c85aad8ea98dd762ef81e66 | |
| parent | c1900f5b0fb7a6f22a13f67e2645f3754b5df245 (diff) | |
i965: perf: query topology
With the introduction of asymmetric slices in CNL, we cannot rely on
the previous SUBSLICE_MASK getparam to tell userspace what subslices
are available.
We introduce a new uAPI in the kernel driver to report exactly which
parts of the GPU are fused off, and we require this to be available on
Gen10+. Prior generations can continue to rely on GETPARAM on older
kernels.
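For reference, the new kernel interface is used with a two-call pattern: a first DRM_IOCTL_I915_QUERY fills in the required buffer length, and a second call fills a caller-allocated buffer. Below is a minimal standalone sketch of that pattern (not the patch itself; it assumes kernel 4.17+ uAPI headers available via libdrm, and the helper name and error handling are illustrative):

```c
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>
#include <i915_drm.h>

/* Two-call DRM_I915_QUERY_TOPOLOGY_INFO pattern: the first ioctl reports the
 * required size in item.length, the second fills the caller's buffer.
 * The caller owns (and must free) the returned buffer. */
static struct drm_i915_query_topology_info *
get_topology(int fd)
{
   struct drm_i915_query_item item = {
      .query_id = DRM_I915_QUERY_TOPOLOGY_INFO,
   };
   struct drm_i915_query query = {
      .num_items = 1,
      .items_ptr = (uintptr_t) &item,
   };

   /* First call: item.length is 0, the kernel writes back the needed size. */
   if (drmIoctl(fd, DRM_IOCTL_I915_QUERY, &query) || item.length <= 0)
      return NULL;

   struct drm_i915_query_topology_info *topo = calloc(1, item.length);
   if (!topo)
      return NULL;

   /* Second call: data_ptr points at a buffer of item.length bytes. */
   item.data_ptr = (uintptr_t) topo;
   if (drmIoctl(fd, DRM_IOCTL_I915_QUERY, &query)) {
      free(topo);
      return NULL;
   }

   return topo;
}
```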
This patch is quite a lot of code because we have to support lots of
different kernel versions, ranging from not providing any information
(for Haswell on 4.13 through 4.17), to being able to query through
GETPARAM (for gen8/9 on 4.13 through 4.17), to finally requiring 4.17
for Gen10+.
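The resulting fallback order is easiest to see in isolation. The sketch below mirrors the logic this patch adds to init_oa_sys_vars(); the function pointers stand in for the new query_topology()/getparam_topology() helpers and the gen checks match the requirements above:

```c
#include <stdbool.h>

/* Simplified sketch of the kernel-version fallback: try the new topology
 * query first, then the GETPARAM masks, and finally accept the static
 * Haswell values already present in gen_device_info. */
static bool
init_topology(int gen,
              bool (*query_topology)(void),
              bool (*getparam_topology)(void))
{
   if (query_topology())
      return true;      /* kernel 4.17+: DRM_I915_QUERY_TOPOLOGY_INFO */

   if (gen >= 10)
      return false;     /* Gen10+ has no reliable fallback */

   if (getparam_topology())
      return true;      /* kernel 4.13+: SLICE_MASK/SUBSLICE_MASK GETPARAMs */

   if (gen >= 8)
      return false;     /* gen8/9 need at least the GETPARAMs */

   /* Haswell: topology is already baked into gen_device_info. */
   return true;
}
```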
This change stores the topology information reported by the various
kernel APIs in a unified way on brw_context.topology, and then generates
the appropriate values for the equations from that unified topology.
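For the $SubsliceMask equation builtin in particular, the unified topology is repacked into groups of 3 bits per slice, setting a bit only when a subslice is reported as available. A hypothetical standalone version of the loop in compute_topology_builtins() (subslice_available() stands in for gen_device_info_subslice_available(); max_slices/max_subslices are illustrative bounds):

```c
#include <stdbool.h>
#include <stdint.h>

/* Repack the per-slice subslice availability into one builtin value:
 * bit (slice * 3 + subslice) is set iff that subslice is available. */
static uint64_t
pack_subslice_mask(int max_slices, int max_subslices,
                   bool (*subslice_available)(int slice, int subslice))
{
   uint64_t mask = 0;

   for (int s = 0; s < max_slices; s++) {
      for (int ss = 0; ss < max_subslices; ss++) {
         if (subslice_available(s, ss))
            mask |= UINT64_C(1) << (s * 3 + ss);
      }
   }

   return mask;
}
```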
v2: Move slice/subslice masks fields to gen_device_info (Rafael)
v3: Add a gen_device_info_subslice_available() helper (Lionel)
Signed-off-by: Lionel Landwerlin <[email protected]>
Acked-by: Rafael Antognolli <[email protected]>
Reviewed-by: Kenneth Graunke <[email protected]>
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | src/intel/dev/gen_device_info.h | 8 |
| -rw-r--r-- | src/mesa/drivers/dri/i965/brw_performance_query.c | 181 |
2 files changed, 118 insertions, 71 deletions
```diff
diff --git a/src/intel/dev/gen_device_info.h b/src/intel/dev/gen_device_info.h
index 4d08f0dfedd..40b72383420 100644
--- a/src/intel/dev/gen_device_info.h
+++ b/src/intel/dev/gen_device_info.h
@@ -247,6 +247,14 @@ struct gen_device_info
 #define gen_device_info_is_9lp(devinfo) \
    ((devinfo)->is_broxton || (devinfo)->is_geminilake)
 
+static inline bool
+gen_device_info_subslice_available(const struct gen_device_info *devinfo,
+                                   int slice, int subslice)
+{
+   return (devinfo->subslice_masks[slice * devinfo->subslice_slice_stride +
+                                   subslice / 8] & (1U << (subslice % 8))) != 0;
+}
+
 int gen_get_pci_device_id_override(void);
 int gen_device_name_to_pci_device_id(const char *name);
 bool gen_get_device_info(int devid, struct gen_device_info *devinfo);
diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c
index 12f797c1297..cca74001f19 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_query.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_query.c
@@ -1911,6 +1911,100 @@ init_oa_configs(struct brw_context *brw)
 }
 
 static bool
+query_topology(struct brw_context *brw)
+{
+   __DRIscreen *screen = brw->screen->driScrnPriv;
+   struct drm_i915_query_item item = {
+      .query_id = DRM_I915_QUERY_TOPOLOGY_INFO,
+   };
+   struct drm_i915_query query = {
+      .num_items = 1,
+      .items_ptr = (uintptr_t) &item,
+   };
+
+   if (drmIoctl(screen->fd, DRM_IOCTL_I915_QUERY, &query))
+      return false;
+
+   struct drm_i915_query_topology_info *topo_info =
+      (struct drm_i915_query_topology_info *) calloc(1, item.length);
+   item.data_ptr = (uintptr_t) topo_info;
+
+   if (drmIoctl(screen->fd, DRM_IOCTL_I915_QUERY, &query) ||
+       item.length <= 0)
+      return false;
+
+   gen_device_info_update_from_topology(&brw->screen->devinfo,
+                                        topo_info);
+
+   free(topo_info);
+
+   return true;
+}
+
+static bool
+getparam_topology(struct brw_context *brw)
+{
+   __DRIscreen *screen = brw->screen->driScrnPriv;
+   drm_i915_getparam_t gp;
+   int ret;
+
+   int slice_mask = 0;
+   gp.param = I915_PARAM_SLICE_MASK;
+   gp.value = &slice_mask;
+   ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
+   if (ret)
+      return false;
+
+   int subslice_mask = 0;
+   gp.param = I915_PARAM_SUBSLICE_MASK;
+   gp.value = &subslice_mask;
+   ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
+   if (ret)
+      return false;
+
+   gen_device_info_update_from_masks(&brw->screen->devinfo,
+                                     slice_mask,
+                                     subslice_mask,
+                                     brw->screen->eu_total);
+
+   return true;
+}
+
+static void
+compute_topology_builtins(struct brw_context *brw)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+
+   brw->perfquery.sys_vars.slice_mask = devinfo->slice_masks;
+   brw->perfquery.sys_vars.n_eu_slices = devinfo->num_slices;
+
+   for (int i = 0; i < sizeof(devinfo->subslice_masks[i]); i++) {
+      brw->perfquery.sys_vars.n_eu_sub_slices +=
+         _mesa_bitcount(devinfo->subslice_masks[i]);
+   }
+
+   for (int i = 0; i < sizeof(devinfo->eu_masks); i++)
+      brw->perfquery.sys_vars.n_eus += _mesa_bitcount(devinfo->eu_masks[i]);
+
+   brw->perfquery.sys_vars.eu_threads_count =
+      brw->perfquery.sys_vars.n_eus * devinfo->num_thread_per_eu;
+
+   /* At the moment the subslice mask builtin has groups of 3bits for each
+    * slice.
+    *
+    * Ideally equations would be updated to have a slice/subslice query
+    * function/operator.
+    */
+   brw->perfquery.sys_vars.subslice_mask = 0;
+   for (int s = 0; s < util_last_bit(devinfo->slice_masks); s++) {
+      for (int ss = 0; ss < (devinfo->subslice_slice_stride * 8); ss++) {
+         if (gen_device_info_subslice_available(devinfo, s, ss))
+            brw->perfquery.sys_vars.subslice_mask |= 1UL << (s * 3 + ss);
+      }
+   }
+}
+
+static bool
 init_oa_sys_vars(struct brw_context *brw)
 {
    const struct gen_device_info *devinfo = &brw->screen->devinfo;
@@ -1923,83 +2017,28 @@ init_oa_sys_vars(struct brw_context *brw)
    if (!read_sysfs_drm_device_file_uint64(brw, "gt_max_freq_mhz",
                                           &max_freq_mhz))
       return false;
 
-   brw->perfquery.sys_vars.gt_min_freq = min_freq_mhz * 1000000;
-   brw->perfquery.sys_vars.gt_max_freq = max_freq_mhz * 1000000;
-   brw->perfquery.sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
-
-   brw->perfquery.sys_vars.revision = intel_device_get_revision(screen->fd);
-   brw->perfquery.sys_vars.n_eu_slices = devinfo->num_slices;
-   /* Assuming uniform distribution of subslices per slices. */
-   brw->perfquery.sys_vars.n_eu_sub_slices = devinfo->num_subslices[0];
-
-   if (devinfo->is_haswell) {
-      brw->perfquery.sys_vars.slice_mask = 0;
-      brw->perfquery.sys_vars.subslice_mask = 0;
-
-      for (int s = 0; s < devinfo->num_slices; s++)
-         brw->perfquery.sys_vars.slice_mask |= 1U << s;
-      for (int ss = 0; ss < devinfo->num_subslices[0]; ss++)
-         brw->perfquery.sys_vars.subslice_mask |= 1U << ss;
-
-      if (devinfo->gt == 1) {
-         brw->perfquery.sys_vars.n_eus = 10;
-      } else if (devinfo->gt == 2) {
-         brw->perfquery.sys_vars.n_eus = 20;
-      } else if (devinfo->gt == 3) {
-         brw->perfquery.sys_vars.n_eus = 40;
-      } else
-         unreachable("not reached");
-   } else {
-      drm_i915_getparam_t gp;
-      int ret;
-      int slice_mask = 0;
-      int ss_mask = 0;
-      /* maximum number of slices */
-      int s_max = devinfo->num_slices;
-      /* maximum number of subslices per slice (assuming uniform subslices per
-       * slices)
-       */
-      int ss_max = devinfo->num_subslices[0];
-      uint64_t subslice_mask = 0;
-      int s;
-
-      gp.param = I915_PARAM_SLICE_MASK;
-      gp.value = &slice_mask;
-      ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
-      if (ret)
+   if (!query_topology(brw)) {
+      /* We need the i915 query uAPI on CNL+ (kernel 4.17+). */
+      if (devinfo->gen >= 10)
         return false;
 
-      gp.param = I915_PARAM_SUBSLICE_MASK;
-      gp.value = &ss_mask;
-      ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
-      if (ret)
-         return false;
+      if (!getparam_topology(brw)) {
+         /* We need the SLICE_MASK/SUBSLICE_MASK on gen8+ (kernel 4.13+). */
+         if (devinfo->gen >= 8)
+            return false;
-      brw->perfquery.sys_vars.n_eus = brw->screen->eu_total;
-      brw->perfquery.sys_vars.n_eu_slices = __builtin_popcount(slice_mask);
-      brw->perfquery.sys_vars.slice_mask = slice_mask;
-
-      /* Note: the _SUBSLICE_MASK param only reports a global subslice mask
-       * which applies to all slices.
-       *
-       * Note: some of the metrics we have (as described in XML) are
-       * conditional on a $SubsliceMask variable which is expected to also
-       * reflect the slice mask by packing together subslice masks for each
-       * slice in one value..
-       */
-      for (s = 0; s < s_max; s++) {
-         if (slice_mask & (1<<s)) {
-            subslice_mask |= ss_mask << (ss_max * s);
-         }
+         /* On Haswell, the values are already computed for us in
+          * gen_device_info.
+          */
       }
-
-      brw->perfquery.sys_vars.subslice_mask = subslice_mask;
-      brw->perfquery.sys_vars.n_eu_sub_slices =
-         __builtin_popcount(subslice_mask);
    }
 
-   brw->perfquery.sys_vars.eu_threads_count =
-      brw->perfquery.sys_vars.n_eus * devinfo->num_thread_per_eu;
+   memset(&brw->perfquery.sys_vars, 0, sizeof(brw->perfquery.sys_vars));
+   brw->perfquery.sys_vars.gt_min_freq = min_freq_mhz * 1000000;
+   brw->perfquery.sys_vars.gt_max_freq = max_freq_mhz * 1000000;
+   brw->perfquery.sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
+   brw->perfquery.sys_vars.revision = intel_device_get_revision(screen->fd);
+   compute_topology_builtins(brw);
 
    return true;
 }
```