winsys/amdgpu: start with smaller IBs, growing as necessary

This avoids allocating giant IBs from the outset, especially for CE and DMA. Since we now limit max_dw only by the size that the buffer happens to be (which, due to the buffer cache, can be even larger than the rounded-up size we request), the new function amdgpu_ib_max_submit_dwords controls when we submit an IB. With this change, we effectively never flush prematurely due to the CE IB, after an initial warm-up phase. v2: - clean up buffer_size calculation Reviewed-by: Marek Olšák <[email protected]>
author: Nicolai Hähnle <[email protected]> 2016-05-07 10:58:13 -0500
committer: Nicolai Hähnle <[email protected]> 2016-06-01 22:52:19 +0200
commit: 83a01cb4983fd4b8ee8402a0679bead2bc0094af (patch)
tree: f8ced215b942f8beaa8a8b529948c50eaee1c1d1
parent: f80c6abb9e591d788c7c8f5167dcd7cb744e8b4a (diff)
2 files changed, 71 insertions, 10 deletions
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index f070307e25e..781960c9600 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -336,11 +336,33 @@ static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs,
    return index;
 }
 
-static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib,
-                                 unsigned buffer_size)
+static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib)
 {
    struct pb_buffer *pb;
    uint8_t *mapped;
+   unsigned buffer_size;
+
+   /* Always create a buffer that is at least 4 times as large as the maximum
+    * IB size seen so far, rounded up to a power of two. Limit to 512k dwords,
+    * which is the largest power of two that fits into the size field of the
+    * INDIRECT_BUFFER packet.
+    */
+   buffer_size = 4 * MIN2(util_next_power_of_two(4 * ib->max_ib_size),
+                          512 * 1024);
+
+   switch (ib->ib_type) {
+   case IB_CONST_PREAMBLE:
+      buffer_size = MAX2(buffer_size, 4 * 1024);
+      break;
+   case IB_CONST:
+      buffer_size = MAX2(buffer_size, 16 * 1024 * 4);
+      break;
+   case IB_MAIN:
+      buffer_size = MAX2(buffer_size, 8 * 1024 * 4);
+      break;
+   default:
+      unreachable("unhandled IB type");
+   }
 
    pb = ws->base.buffer_create(&ws->base, buffer_size,
                                ws->info.gart_page_size,
@@ -364,6 +386,27 @@ static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib,
    return true;
 }
 
+static unsigned amdgpu_ib_max_submit_dwords(enum ib_type ib_type)
+{ /* Per-IB-type size limit, in dwords, checked by amdgpu_cs_check_space. */
+   switch (ib_type) {
+   case IB_MAIN:
+      /* Smaller submits mean the GPU gets busy sooner and there is less
+       * waiting for buffers and fences. Proof:
+       *   http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1
+       */
+      return 20 * 1024; /* 20k dwords */
+   case IB_CONST_PREAMBLE:
+   case IB_CONST:
+      /* There isn't really any reason to limit CE IB size beyond the natural
+       * limit implied by the main IB, except perhaps GTT size. Just return
+       * an extremely large value that we never get anywhere close to.
+       */
+      return 16 * 1024 * 1024; /* 16M dwords */
+   default:
+      unreachable("bad ib_type");
+   }
+}
+
 static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs,
                               enum ib_type ib_type)
 {
@@ -374,35 +417,36 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs,
     */
    struct amdgpu_ib *ib = NULL;
    struct amdgpu_cs_ib_info *info = &cs->csc->ib[ib_type];
-   unsigned buffer_size, ib_size;
+   unsigned ib_size = 0;
 
    switch (ib_type) {
    case IB_CONST_PREAMBLE:
       ib = &cs->const_preamble_ib;
-      buffer_size = 4 * 1024 * 4;
-      ib_size = 1024 * 4;
+      ib_size = 256 * 4;
       break;
    case IB_CONST:
       ib = &cs->const_ib;
-      buffer_size = 512 * 1024 * 4;
-      ib_size = 128 * 1024 * 4;
+      ib_size = 8 * 1024 * 4;
       break;
    case IB_MAIN:
       ib = &cs->main;
-      buffer_size = 128 * 1024 * 4;
-      ib_size = 20 * 1024 * 4;
+      ib_size = 4 * 1024 * 4;
       break;
    default:
       unreachable("unhandled IB type");
    }
 
+   ib_size = MAX2(ib_size,
+                  4 * MIN2(util_next_power_of_two(ib->max_ib_size),
+                           amdgpu_ib_max_submit_dwords(ib_type)));
+
    ib->base.cdw = 0;
    ib->base.buf = NULL;
 
    /* Allocate a new buffer for IBs if the current buffer is all used. */
    if (!ib->big_ib_buffer ||
        ib->used_ib_space + ib_size > ib->big_ib_buffer->size) {
-      if (!amdgpu_ib_new_buffer(aws, ib, buffer_size))
+      if (!amdgpu_ib_new_buffer(aws, ib))
          return false;
    }
 
@@ -412,6 +456,8 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs,
                         RADEON_USAGE_READ, 0, RADEON_PRIO_IB1);
 
    ib->base.buf = (uint32_t*)(ib->ib_mapped + ib->used_ib_space);
+
+   ib_size = ib->big_ib_buffer->size - ib->used_ib_space;
    ib->base.max_dw = ib_size / 4;
    return true;
 }
@@ -624,7 +670,17 @@ static boolean amdgpu_cs_validate(struct radeon_winsys_cs *rcs)
 
 static bool amdgpu_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
 {
+   struct amdgpu_ib *ib = amdgpu_ib(rcs);
+   struct amdgpu_cs *cs = amdgpu_cs_from_ib(ib); /* NOTE(review): not used below */
+   unsigned requested_size = rcs->cdw + dw; /* dwords in the IB if granted */
+   /* Returns true iff dw more dwords fit and stay under the submit limit. */
    assert(rcs->cdw <= rcs->max_dw);
+   /* Reject requests that would grow this IB past its per-type limit. */
+   if (requested_size > amdgpu_ib_max_submit_dwords(ib->ib_type))
+      return false;
+   /* Record the high-water mark; new IBs and buffers are sized from it. */
+   ib->max_ib_size = MAX2(ib->max_ib_size, requested_size);
+
    return rcs->max_dw - rcs->cdw >= dw;
 }
 
@@ -861,15 +917,19 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
       /* Set IB sizes. */
       cur->ib[IB_MAIN].size = cs->main.base.cdw;
       cs->main.used_ib_space += cs->main.base.cdw * 4;
+      cs->main.max_ib_size = MAX2(cs->main.max_ib_size, cs->main.base.cdw);
 
       if (cs->const_ib.ib_mapped) {
          cur->ib[IB_CONST].size = cs->const_ib.base.cdw;
          cs->const_ib.used_ib_space += cs->const_ib.base.cdw * 4;
+         cs->const_ib.max_ib_size = MAX2(cs->const_ib.max_ib_size, cs->const_ib.base.cdw);
       }
 
       if (cs->const_preamble_ib.ib_mapped) {
          cur->ib[IB_CONST_PREAMBLE].size = cs->const_preamble_ib.base.cdw;
          cs->const_preamble_ib.used_ib_space += cs->const_preamble_ib.base.cdw * 4;
+         cs->const_preamble_ib.max_ib_size =
+            MAX2(cs->const_preamble_ib.max_ib_size, cs->const_preamble_ib.base.cdw);
       }
 
       /* Create a fence. */
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
index 25bad07af3e..62811e9aa10 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -64,6 +64,7 @@ struct amdgpu_ib {
    struct pb_buffer        *big_ib_buffer;
    uint8_t                 *ib_mapped;
    unsigned                used_ib_space;
+   unsigned                max_ib_size;
    enum ib_type            ib_type;
 };
author	Nicolai Hähnle <[email protected]>	2016-05-07 10:58:13 -0500
committer	Nicolai Hähnle <[email protected]>	2016-06-01 22:52:19 +0200
commit	83a01cb4983fd4b8ee8402a0679bead2bc0094af (patch)
tree	f8ced215b942f8beaa8a8b529948c50eaee1c1d1
parent	f80c6abb9e591d788c7c8f5167dcd7cb744e8b4a (diff)