summaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers
diff options
context:
space:
mode:
authorSamuel Pitoiset <[email protected]>2016-01-14 18:24:53 +0100
committerSamuel Pitoiset <[email protected]>2016-04-01 22:26:24 +0200
commit3b246a71d7fe12c4b0670a9dadf566ea3eca1128 (patch)
treee51a947d61c73c614b9f423cdceca4fcf0e2a1ab /src/gallium/drivers
parent7797d5f7d9b367f96200093cbe166c4478eae65e (diff)
nvc0: add indirect compute support on Kepler
The grid size is stored as three 32-bit integers in the indirect buffer, but the launch descriptor packs griddim_y and griddim_z into a single 32-bit integer as (z << 16) | y. To make it work, the 16 high bits of griddim_y are overwritten by griddim_z. Signed-off-by: Samuel Pitoiset <[email protected]> Reviewed-by: Ilia Mirkin <[email protected]>
Diffstat (limited to 'src/gallium/drivers')
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nve4_compute.c111
1 files changed, 77 insertions, 34 deletions
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
index 04ede3e51e1..4d069df983e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
@@ -435,9 +435,7 @@ nve4_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask)
static void
nve4_compute_upload_input(struct nvc0_context *nvc0,
struct nve4_cp_launch_desc *desc,
- const void *input,
- const uint *block_layout,
- const uint *grid_layout)
+ const struct pipe_grid_info *info)
{
struct nvc0_screen *screen = nvc0->screen;
struct nouveau_pushbuf *push = nvc0->base.pushbuf;
@@ -455,7 +453,7 @@ nve4_compute_upload_input(struct nvc0_context *nvc0,
PUSH_DATA (push, 0x1);
BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (cp->parm_size / 4));
PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
- PUSH_DATAp(push, input, cp->parm_size / 4);
+ PUSH_DATAp(push, info->input, cp->parm_size / 4);
/* Bind user parameters coming from clover. */
/* TODO: This should be harmonized with uniform_bo. */
@@ -468,10 +466,25 @@ nve4_compute_upload_input(struct nvc0_context *nvc0,
BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
PUSH_DATA (push, 7 * 4);
PUSH_DATA (push, 0x1);
- BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7);
- PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
- PUSH_DATAp(push, block_layout, 3);
- PUSH_DATAp(push, grid_layout, 3);
+
+ if (unlikely(info->indirect)) {
+ struct nv04_resource *res = nv04_resource(info->indirect);
+ uint32_t offset = res->offset + info->indirect_offset;
+
+ nouveau_pushbuf_space(push, 16, 0, 1);
+ PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
+
+ BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7);
+ PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+ PUSH_DATAp(push, info->block, 3);
+ nouveau_pushbuf_data(push, res->bo, offset,
+ NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4);
+ } else {
+ BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7);
+ PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+ PUSH_DATAp(push, info->block, 3);
+ PUSH_DATAp(push, info->grid, 3);
+ }
PUSH_DATA (push, 0);
BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
@@ -491,23 +504,21 @@ nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
static void
nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
struct nve4_cp_launch_desc *desc,
- uint32_t label,
- const uint *block_layout,
- const uint *grid_layout)
+ const struct pipe_grid_info *info)
{
const struct nvc0_screen *screen = nvc0->screen;
const struct nvc0_program *cp = nvc0->compprog;
nve4_cp_launch_desc_init_default(desc);
- desc->entry = nvc0_program_symbol_offset(cp, label);
+ desc->entry = nvc0_program_symbol_offset(cp, info->pc);
- desc->griddim_x = grid_layout[0];
- desc->griddim_y = grid_layout[1];
- desc->griddim_z = grid_layout[2];
- desc->blockdim_x = block_layout[0];
- desc->blockdim_y = block_layout[1];
- desc->blockdim_z = block_layout[2];
+ desc->griddim_x = info->grid[0];
+ desc->griddim_y = info->grid[1];
+ desc->griddim_z = info->grid[2];
+ desc->blockdim_x = info->block[0];
+ desc->blockdim_y = info->block[1];
+ desc->blockdim_z = info->block[2];
desc->shared_size = align(cp->cp.smem_size, 0x100);
desc->local_size_p = align(cp->cp.lmem_size, 0x10);
@@ -566,30 +577,62 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
if (ret)
goto out;
- nve4_compute_setup_launch_desc(nvc0, desc, info->pc,
- info->block, info->grid);
+ nve4_compute_setup_launch_desc(nvc0, desc, info);
- nve4_compute_upload_input(nvc0, desc, info->input, info->block, info->grid);
+ nve4_compute_upload_input(nvc0, desc, info);
#ifdef DEBUG
if (debug_get_num_option("NV50_PROG_DEBUG", 0))
nve4_compute_dump_launch_desc(desc);
#endif
+ if (unlikely(info->indirect)) {
+ struct nv04_resource *res = nv04_resource(info->indirect);
+ uint32_t offset = res->offset + info->indirect_offset;
+
+ /* upload the descriptor */
+ BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, desc_gpuaddr);
+ PUSH_DATA (push, desc_gpuaddr);
+ BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+ PUSH_DATA (push, 256);
+ PUSH_DATA (push, 1);
+ BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));
+ PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+ PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
+
+ /* overwrite griddim_x and griddim_y as two 32-bit integers, even
+ * though griddim_y must be a 16-bit integer */
+ BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, desc_gpuaddr + 48);
+ PUSH_DATA (push, desc_gpuaddr + 48);
+ BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+ PUSH_DATA (push, 8);
+ PUSH_DATA (push, 1);
+
+ nouveau_pushbuf_space(push, 16, 0, 1);
+ PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
+
+ BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (8 / 4));
+ PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+ nouveau_pushbuf_data(push, res->bo, offset,
+ NVC0_IB_ENTRY_1_NO_PREFETCH | 2 * 4);
+
+ /* overwrite the 16 high bits of griddim_y with griddim_z because
+ * we need (z << 16) | y */
+ BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, desc_gpuaddr + 54);
+ PUSH_DATA (push, desc_gpuaddr + 54);
+ BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+ PUSH_DATA (push, 4);
+ PUSH_DATA (push, 1);
+ BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (4 / 4));
+ PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+ nouveau_pushbuf_data(push, res->bo, offset + 8,
+ NVC0_IB_ENTRY_1_NO_PREFETCH | 1 * 4);
+ }
+
/* upload descriptor and flush */
-#if 0
- BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
- PUSH_DATAh(push, desc_gpuaddr);
- PUSH_DATA (push, desc_gpuaddr);
- BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
- PUSH_DATA (push, 256);
- PUSH_DATA (push, 1);
- BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));
- PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
- PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
- BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
- PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB | NVE4_COMPUTE_FLUSH_CODE);
-#endif
BEGIN_NVC0(push, NVE4_CP(LAUNCH_DESC_ADDRESS), 1);
PUSH_DATA (push, desc_gpuaddr >> 8);
BEGIN_NVC0(push, NVE4_CP(LAUNCH), 1);