summaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/nv50/nv50_screen.c
diff options
context:
space:
mode:
authorMarcin Slusarz <[email protected]>2012-06-27 14:45:17 +0200
committerMarcin Slusarz <[email protected]>2012-06-28 00:01:02 +0200
commit1906d2b46b21a8e7496409e0639d8463ad86dcfe (patch)
tree24800adb306c232fe2253c536b6071292a04b293 /src/gallium/drivers/nv50/nv50_screen.c
parent0fceaee4fd8f745e71cbc4e1d24520ac7e11c2cd (diff)
nv50: dynamically allocate space for shader local storage
Fixes 21 piglit tests:

spec/glsl-1.10/execution/variable-indexing/
  fs-temp-array-mat4-index-col-row-wr
  vs-temp-array-mat4-index-col-row-wr
  vs-temp-array-mat4-index-row-wr

spec/glsl-1.20/execution/variable-indexing/
  fs-temp-array-mat3-index-col-row-rd
  fs-temp-array-mat3-index-row-rd
  fs-temp-array-mat4-col-row-wr
  fs-temp-array-mat4-index-col-row-rd
  fs-temp-array-mat4-index-col-row-wr
  fs-temp-array-mat4-index-row-rd
  fs-temp-array-mat4-index-row-wr
  vs-temp-array-mat3-index-col-row-rd
  vs-temp-array-mat3-index-col-row-wr
  vs-temp-array-mat3-index-row-rd
  vs-temp-array-mat3-index-row-wr
  vs-temp-array-mat4-col-row-wr
  vs-temp-array-mat4-index-col-row-rd
  vs-temp-array-mat4-index-col-row-wr
  vs-temp-array-mat4-index-col-wr
  vs-temp-array-mat4-index-row-rd
  vs-temp-array-mat4-index-row-wr
  vs-temp-array-mat4-index-wr

... and prevents a lot of GPU lockups.
Diffstat (limited to 'src/gallium/drivers/nv50/nv50_screen.c')
-rw-r--r--src/gallium/drivers/nv50/nv50_screen.c107
1 files changed, 88 insertions, 19 deletions
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index 873946f520d..8c30c96aa73 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -28,11 +28,21 @@
#include "nv50_screen.h"
#include "nouveau/nv_object.xml.h"
+#include <errno.h>
#ifndef NOUVEAU_GETPARAM_GRAPH_UNITS
# define NOUVEAU_GETPARAM_GRAPH_UNITS 13
#endif
+/* Warp count used as a multiplier when sizing the shader local storage
+ * (TLS) buffer; affected by LOCAL_WARPS_LOG_ALLOC / LOCAL_WARPS_NO_CLAMP */
+#define LOCAL_WARPS_ALLOC 32
+/* Warp count used as a multiplier when sizing the stack buffer;
+ * affected by STACK_WARPS_LOG_ALLOC / STACK_WARPS_NO_CLAMP */
+#define STACK_WARPS_ALLOC 32
+
+#define THREADS_IN_WARP 32 /* multiplied with LOCAL_WARPS_ALLOC when sizing the TLS buffer */
+
+#define ONE_TEMP_SIZE (4/*vector*/ * sizeof(float)) /* bytes per shader temporary (one 4-float vector) */
+
static boolean
nv50_screen_is_format_supported(struct pipe_screen *pscreen,
enum pipe_format format,
@@ -209,7 +219,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
case PIPE_SHADER_CAP_MAX_PREDS:
return 0;
case PIPE_SHADER_CAP_MAX_TEMPS:
- return NV50_CAP_MAX_PROGRAM_TEMPS;
+ return nv50_screen(pscreen)->max_tls_space / ONE_TEMP_SIZE;
case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
return 1;
case PIPE_SHADER_CAP_SUBROUTINES:
@@ -311,7 +321,7 @@ nv50_screen_fence_update(struct pipe_screen *pscreen)
}
static void
-nv50_screen_init_hwctx(struct nv50_screen *screen, unsigned tls_space)
+nv50_screen_init_hwctx(struct nv50_screen *screen)
{
struct nouveau_pushbuf *push = screen->base.pushbuf;
struct nv04_fifo *fifo;
@@ -411,7 +421,7 @@ nv50_screen_init_hwctx(struct nv50_screen *screen, unsigned tls_space)
BEGIN_NV04(push, NV50_3D(LOCAL_ADDRESS_HIGH), 3);
PUSH_DATAh(push, screen->tls_bo->offset);
PUSH_DATA (push, screen->tls_bo->offset);
- PUSH_DATA (push, util_logbase2(tls_space / 8));
+ PUSH_DATA (push, util_logbase2(screen->cur_tls_space / 8));
BEGIN_NV04(push, NV50_3D(STACK_ADDRESS_HIGH), 3);
PUSH_DATAh(push, screen->stack_bo->offset);
@@ -508,6 +518,60 @@ nv50_screen_init_hwctx(struct nv50_screen *screen, unsigned tls_space)
PUSH_KICK (push);
}
+/**
+ * Allocate the VRAM buffer that backs shader local (TLS) storage.
+ *
+ * The requested per-thread size is converted to a number of temporaries
+ * (ONE_TEMP_SIZE bytes each), rounded up to a power of two. The buffer is
+ * sized as that per-thread amount times the TP count (rounded up to a power
+ * of two), MPsInTP, LOCAL_WARPS_ALLOC and THREADS_IN_WARP.
+ *
+ * \param screen    screen that receives the new buffer in screen->tls_bo and
+ *                  the rounded per-thread size in screen->cur_tls_space
+ * \param tls_space requested per-thread local storage, in bytes
+ * \param tls_size  receives the total size of the buffer, in bytes
+ * \return 0 on success, otherwise the error code from nouveau_bo_new();
+ *         screen->cur_tls_space is left untouched on failure
+ */
+static int nv50_tls_alloc(struct nv50_screen *screen, unsigned tls_space,
+                          uint64_t *tls_size)
+{
+   struct nouveau_device *dev = screen->base.device;
+   /* Round the byte count up before converting to temporaries, so a request
+    * that is not a multiple of ONE_TEMP_SIZE is not truncated to fewer
+    * temporaries than it needs. */
+   unsigned temps = util_next_power_of_two(
+      (tls_space + ONE_TEMP_SIZE - 1) / ONE_TEMP_SIZE);
+   int ret;
+
+   if (nouveau_mesa_debug)
+      debug_printf("allocating space for %u temps\n", temps);
+
+   /* Widen before multiplying so the whole product is evaluated in 64 bits
+    * (NOTE(review): assumes TPs / MPsInTP are 32-bit fields - confirm). */
+   *tls_size = (uint64_t)temps * ONE_TEMP_SIZE *
+               util_next_power_of_two(screen->TPs) * screen->MPsInTP *
+               LOCAL_WARPS_ALLOC * THREADS_IN_WARP;
+
+   ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16,
+                        *tls_size, NULL, &screen->tls_bo);
+   if (ret) {
+      NOUVEAU_ERR("Failed to allocate local bo: %d\n", ret);
+      return ret;
+   }
+
+   /* Publish the new per-thread size only once the buffer really exists. */
+   screen->cur_tls_space = temps * ONE_TEMP_SIZE;
+
+   return 0;
+}
+
+/**
+ * Grow the shader local storage (TLS) buffer when a program needs more
+ * per-thread space than is currently allocated.
+ *
+ * \param screen    screen owning the TLS buffer
+ * \param tls_space required per-thread local storage, in bytes
+ * \return 0 if the current buffer is already large enough,
+ *         1 if a new buffer was allocated and its address was pushed,
+ *         -ENOMEM if tls_space exceeds screen->max_tls_space, or the error
+ *         from nv50_tls_alloc(); on any error the previous buffer and
+ *         cur_tls_space are kept
+ */
+int nv50_tls_realloc(struct nv50_screen *screen, unsigned tls_space)
+{
+   struct nouveau_pushbuf *push = screen->base.pushbuf;
+   struct nouveau_bo *old_bo;
+   unsigned old_space;
+   int ret;
+   uint64_t tls_size;
+
+   /* An equally sized buffer is already sufficient; do not reallocate. */
+   if (tls_space <= screen->cur_tls_space)
+      return 0;
+   if (tls_space > screen->max_tls_space) {
+      /* fixable by limiting number of warps (LOCAL_WARPS_LOG_ALLOC /
+       * LOCAL_WARPS_NO_CLAMP) */
+      NOUVEAU_ERR("Unsupported number of temporaries (%u > %u). Fixable if someone cares.\n",
+            (unsigned)(tls_space / ONE_TEMP_SIZE),
+            (unsigned)(screen->max_tls_space / ONE_TEMP_SIZE));
+      return -ENOMEM;
+   }
+
+   /* Keep the old buffer until the replacement exists, so a failed
+    * allocation does not leave the screen without a TLS buffer or with a
+    * cur_tls_space that no longer matches the buffer. */
+   old_bo = screen->tls_bo;
+   old_space = screen->cur_tls_space;
+   screen->tls_bo = NULL;
+
+   ret = nv50_tls_alloc(screen, tls_space, &tls_size);
+   if (ret) {
+      screen->tls_bo = old_bo;
+      screen->cur_tls_space = old_space;
+      return ret;
+   }
+
+   /* Drop our reference to the previous buffer now that it is replaced. */
+   nouveau_bo_ref(NULL, &old_bo);
+
+   BEGIN_NV04(push, NV50_3D(LOCAL_ADDRESS_HIGH), 3);
+   PUSH_DATAh(push, screen->tls_bo->offset);
+   PUSH_DATA (push, screen->tls_bo->offset);
+   PUSH_DATA (push, util_logbase2(screen->cur_tls_space / 8));
+
+   return 1;
+}
+
struct pipe_screen *
nv50_screen_create(struct nouveau_device *dev)
{
@@ -516,7 +580,7 @@ nv50_screen_create(struct nouveau_device *dev)
struct nouveau_object *chan;
uint64_t value;
uint32_t tesla_class;
- unsigned stack_size, max_warps, tls_space;
+ unsigned stack_size;
int ret;
screen = CALLOC_STRUCT(nv50_screen);
@@ -637,10 +701,11 @@ nv50_screen_create(struct nouveau_device *dev)
nouveau_getparam(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value);
- max_warps = util_bitcount(value & 0xffff);
- max_warps *= util_bitcount((value >> 24) & 0xf) * 32;
+ screen->TPs = util_bitcount(value & 0xffff);
+ screen->MPsInTP = util_bitcount((value >> 24) & 0xf);
- stack_size = max_warps * 64 * 8;
+ stack_size = util_next_power_of_two(screen->TPs) * screen->MPsInTP *
+ STACK_WARPS_ALLOC * 64 * 8;
ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, stack_size, NULL,
&screen->stack_bo);
@@ -649,20 +714,24 @@ nv50_screen_create(struct nouveau_device *dev)
goto fail;
}
- tls_space = NV50_CAP_MAX_PROGRAM_TEMPS * 16;
+ uint64_t size_of_one_temp = util_next_power_of_two(screen->TPs) *
+ screen->MPsInTP * LOCAL_WARPS_ALLOC * THREADS_IN_WARP *
+ ONE_TEMP_SIZE;
+ screen->max_tls_space = dev->vram_size / size_of_one_temp * ONE_TEMP_SIZE;
+ screen->max_tls_space /= 2; /* half of vram */
- screen->tls_size = tls_space * max_warps * 32;
+ /* hw can address max 64 KiB */
+ screen->max_tls_space = MIN2(screen->max_tls_space, 64 << 10);
- if (nouveau_mesa_debug)
- debug_printf("max_warps = %i, tls_size = %"PRIu64" KiB\n",
- max_warps, screen->tls_size >> 10);
-
- ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, screen->tls_size, NULL,
- &screen->tls_bo);
- if (ret) {
- NOUVEAU_ERR("Failed to allocate local bo: %d\n", ret);
+ uint64_t tls_size;
+ unsigned tls_space = 4/*temps*/ * ONE_TEMP_SIZE;
+ ret = nv50_tls_alloc(screen, tls_space, &tls_size);
+ if (ret)
goto fail;
- }
+
+ if (nouveau_mesa_debug)
+ debug_printf("TPs = %u, MPsInTP = %u, VRAM = %"PRIu64" MiB, tls_size = %"PRIu64" KiB\n",
+ screen->TPs, screen->MPsInTP, dev->vram_size >> 20, tls_size >> 10);
ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, 4 << 16, NULL,
&screen->uniforms);
@@ -684,7 +753,7 @@ nv50_screen_create(struct nouveau_device *dev)
if (!nv50_blitctx_create(screen))
goto fail;
- nv50_screen_init_hwctx(screen, tls_space);
+ nv50_screen_init_hwctx(screen);
nouveau_fence_new(&screen->base, &screen->base.fence.current, FALSE);