path: root/src/gallium/drivers/r600/evergreen_compute.c
author	Adam Rak <[email protected]>	2011-11-30 22:20:41 +0100
committer	Tom Stellard <[email protected]>	2012-06-01 11:28:10 -0400
commit	6a829a1b724ca0d960decee217d260b4de8a5463 (patch)
tree	0bc6a96e7a7151ba50aa391e94ca6f0adfc37376	/src/gallium/drivers/r600/evergreen_compute.c
parent	46a13b3b11d859e131399853c11ae2be0eb02f0a (diff)
r600g: compute support for evergreen
Tom Stellard:
  - Updated for gallium interface changes
  - Fixed a few bugs:
    + Set the loop counter
    + Calculate the correct number of pipes
  - Added hooks into the LLVM compiler
Diffstat (limited to 'src/gallium/drivers/r600/evergreen_compute.c')
-rw-r--r--	src/gallium/drivers/r600/evergreen_compute.c	814
1 file changed, 814 insertions, 0 deletions
diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
new file mode 100644
index 00000000000..7aeb4038ae1
--- /dev/null
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -0,0 +1,814 @@
+/*
+ * Copyright 2011 Adam Rak <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Adam Rak <[email protected]>
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "util/u_blitter.h"
+#include "util/u_double_list.h"
+#include "util/u_transfer.h"
+#include "util/u_surface.h"
+#include "util/u_pack_color.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "util/u_framebuffer.h"
+#include "pipebuffer/pb_buffer.h"
+#include "r600.h"
+#include "evergreend.h"
+#include "r600_resource.h"
+#include "r600_shader.h"
+#include "r600_pipe.h"
+#include "r600_formats.h"
+#include "evergreen_compute.h"
+#include "r600_hw_context_priv.h"
+#include "evergreen_compute_internal.h"
+#include "compute_memory_pool.h"
+#ifdef HAVE_OPENCL
+#include "llvm_wrapper.h"
+#endif
+
+/**
+RAT0 is for global binding writes
+VTX1 is for global binding reads
+
+for writing images: RAT1...
+for reading images: TEX2...
+  TEX2 and RAT1 are paired
+
+TEX2... consumes the same fetch resources that VTX2... would consume
+
+CONST0 and VTX0 are for parameters
+  CONST0 binds the smaller input parameter buffer and is used for constant
+  indexing; it is also constant cached
+  VTX0 is for indirect/non-constant indexing, or when the input is bigger
+  than the constant cache can handle
+
+RATs are limited to 12, so we can bind at most 11 textures for writing,
+because we reserve RAT0 for global bindings. With byte addressing enabled
+we should reserve another one too, which leaves at most 10 writable image
+bindings.
+
+from Nvidia OpenCL:
+  CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
+  CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
+
+so 10 for writing is enough; 176 is the max for reading according to the docs
+
+writable images should be listed first (< 10), so their id corresponds to
+RAT(id+1); writable images also consume TEX slots, and VTX slots too,
+because of linear indexing
+
+*/
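+
+/* A minimal sketch of the slot arithmetic described above. These helper
+ * functions are hypothetical (they are not part of this change); only the
+ * offsets follow from the layout in the comment: RAT0 and FETCH0/FETCH1
+ * are reserved, so user resource N lands on RAT(N+1) and fetch slot N+2.
+ */
+#if 0 /* illustration only */
+static unsigned rat_slot_for_writable_image(unsigned image_id)
+{
+	return image_id + 1; /* RAT0 is reserved for global bindings */
+}
+
+static unsigned fetch_slot_for_resource(unsigned resource_id)
+{
+	return resource_id + 2; /* FETCH0 = params, FETCH1 = global pool */
+}
+#endif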
+
+const struct u_resource_vtbl r600_global_buffer_vtbl =
+{
+ u_default_resource_get_handle, /* get_handle */
+ r600_compute_global_buffer_destroy, /* resource_destroy */
+ r600_compute_global_get_transfer, /* get_transfer */
+ r600_compute_global_transfer_destroy, /* transfer_destroy */
+ r600_compute_global_transfer_map, /* transfer_map */
+ r600_compute_global_transfer_flush_region,/* transfer_flush_region */
+ r600_compute_global_transfer_unmap, /* transfer_unmap */
+ r600_compute_global_transfer_inline_write /* transfer_inline_write */
+};
+
+
+void *evergreen_create_compute_state(
+ struct pipe_context *ctx_,
+	const struct pipe_compute_state *cso)
+{
+ struct r600_context *ctx = (struct r600_context *)ctx_;
+
+#ifdef HAVE_OPENCL
+ const struct pipe_llvm_program_header * header;
+ const unsigned char * code;
+
+ header = cso->prog;
+ code = cso->prog + sizeof(struct pipe_llvm_program_header);
+#endif
+
+ if (!ctx->screen->screen.get_param(&ctx->screen->screen,
+ PIPE_CAP_COMPUTE)) {
+ fprintf(stderr, "Compute is not supported\n");
+ return NULL;
+ }
+ struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
+
+ shader->ctx = (struct r600_context*)ctx;
+ shader->resources = (struct evergreen_compute_resource*)
+ CALLOC(sizeof(struct evergreen_compute_resource),
+ get_compute_resource_num());
+ shader->local_size = cso->req_local_mem; ///TODO: assert it
+ shader->private_size = cso->req_private_mem;
+ shader->input_size = cso->req_input_mem;
+
+#ifdef HAVE_OPENCL
+ shader->mod = llvm_parse_bitcode(code, header->num_bytes);
+
+ r600_compute_shader_create(ctx_, shader->mod, &shader->bc);
+#endif
+ return shader;
+}
+
+void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
+{
+ struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;
+
+ free(shader->resources);
+ free(shader);
+}
+
+static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
+{
+ struct r600_context *ctx = (struct r600_context *)ctx_;
+
+ ctx->cs_shader = (struct r600_pipe_compute *)state;
+
+ assert(!ctx->cs_shader->shader_code_bo);
+
+ ctx->cs_shader->shader_code_bo =
+ r600_compute_buffer_alloc_vram(ctx->screen,
+ ctx->cs_shader->bc.ndw * 4);
+
+ void *p = ctx->ws->buffer_map(ctx->cs_shader->shader_code_bo->cs_buf,
+ ctx->cs, PIPE_TRANSFER_WRITE);
+
+ memcpy(p, ctx->cs_shader->bc.bytecode, ctx->cs_shader->bc.ndw * 4);
+
+ ctx->ws->buffer_unmap(ctx->cs_shader->shader_code_bo->cs_buf);
+
+ evergreen_compute_init_config(ctx);
+
+ struct evergreen_compute_resource* res = get_empty_res(ctx->cs_shader,
+ COMPUTE_RESOURCE_SHADER, 0);
+
+ evergreen_reg_set(res, R_008C0C_SQ_GPR_RESOURCE_MGMT_3,
+ S_008C0C_NUM_LS_GPRS(ctx->cs_shader->bc.ngpr));
+
+ ///maybe we can use it later
+ evergreen_reg_set(res, R_0286C8_SPI_THREAD_GROUPING, 0);
+ ///maybe we can use it later
+ evergreen_reg_set(res, R_008C14_SQ_GLOBAL_GPR_RESOURCE_MGMT_2, 0);
+
+ evergreen_reg_set(res, R_0288D4_SQ_PGM_RESOURCES_LS,
+ S_0288D4_NUM_GPRS(ctx->cs_shader->bc.ngpr)
+ | S_0288D4_STACK_SIZE(ctx->cs_shader->bc.nstack));
+ evergreen_reg_set(res, R_0288D8_SQ_PGM_RESOURCES_LS_2, 0);
+
+ evergreen_reg_set(res, R_0288D0_SQ_PGM_START_LS, 0);
+ res->bo = ctx->cs_shader->shader_code_bo;
+ res->usage = RADEON_USAGE_READ;
+ res->coher_bo_size = ctx->cs_shader->bc.ndw*4;
+ res->flags = COMPUTE_RES_SH_FLUSH;
+
+ /* We can't always determine the
+ * number of iterations in a loop before it's executed,
+ * so we just need to set up the loop counter to give us the maximum
+ * number of iterations possible. Currently, loops in shader code
+ * ignore the loop counter and use a break instruction to exit the
+ * loop at the correct time.
+ */
+ evergreen_set_loop_const(ctx->cs_shader,
+ 0, /* index */
+ 0xFFF, /* Maximum value of the loop counter (i.e. when the loop
+ * counter reaches this value, the program will break
+		        * out of the loop). */
+ 0x0, /* Starting value of the loop counter. */
+ 0x1); /* Amount to increment the loop counter each iteration. */
+}
+
+/* The kernel parameters are stored in a vtx buffer (ID=0). Besides the
+ * explicit kernel parameters, there are implicit parameters that need to be
+ * stored in the vertex buffer as well. Here is how these parameters are
+ * organized in the buffer:
+ *
+ * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
+ * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
+ * DWORDS 6-8: Number of work items within each work group in each dimension
+ * (x,y,z)
+ * DWORDS 9+ : Kernel parameters
+ */
+void evergreen_compute_upload_input(
+ struct pipe_context *ctx_,
+ const uint *block_layout,
+ const uint *grid_layout,
+ const void *input)
+{
+ struct r600_context *ctx = (struct r600_context *)ctx_;
+ int i;
+ unsigned kernel_parameters_offset_bytes = 36;
+ uint32_t * num_work_groups_start;
+ uint32_t * global_size_start;
+ uint32_t * local_size_start;
+ uint32_t * kernel_parameters_start;
+
+ if (ctx->cs_shader->input_size == 0) {
+ return;
+ }
+
+ if (!ctx->cs_shader->kernel_param) {
+ unsigned buffer_size = ctx->cs_shader->input_size;
+
+ /* Add space for the grid dimensions */
+		buffer_size += kernel_parameters_offset_bytes;
+ ctx->cs_shader->kernel_param =
+ r600_compute_buffer_alloc_vram(ctx->screen,
+ buffer_size);
+ }
+
+ num_work_groups_start = ctx->ws->buffer_map(
+ ctx->cs_shader->kernel_param->cs_buf,
+ ctx->cs, PIPE_TRANSFER_WRITE);
+	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
+	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
+	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
+
+ /* Copy the work group size */
+ memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
+
+ /* Copy the global size */
+ for (i = 0; i < 3; i++) {
+ global_size_start[i] = grid_layout[i] * block_layout[i];
+ }
+
+ /* Copy the local dimensions */
+ memcpy(local_size_start, block_layout, 3 * sizeof(uint));
+
+ /* Copy the kernel inputs */
+ memcpy(kernel_parameters_start, input, ctx->cs_shader->input_size);
+
+ for (i = 0; i < (kernel_parameters_offset_bytes / 4) +
+ (ctx->cs_shader->input_size / 4); i++) {
+ COMPUTE_DBG("input %i : %i\n", i,
+ ((unsigned*)num_work_groups_start)[i]);
+ }
+
+ ctx->ws->buffer_unmap(ctx->cs_shader->kernel_param->cs_buf);
+
+ ///ID=0 is reserved for the parameters
+ evergreen_set_vtx_resource(ctx->cs_shader,
+ ctx->cs_shader->kernel_param, 0, 0, 0);
+ ///ID=0 is reserved for parameters
+ evergreen_set_const_cache(ctx->cs_shader, 0,
+ ctx->cs_shader->kernel_param, ctx->cs_shader->input_size, 0);
+}
+
+void evergreen_direct_dispatch(
+ struct pipe_context *ctx_,
+ const uint *block_layout, const uint *grid_layout)
+{
+ struct r600_context *ctx = (struct r600_context *)ctx_;
+
+ int i;
+
+ struct evergreen_compute_resource* res = get_empty_res(ctx->cs_shader,
+ COMPUTE_RESOURCE_DISPATCH, 0);
+
+ evergreen_reg_set(res, R_008958_VGT_PRIMITIVE_TYPE, V_008958_DI_PT_POINTLIST);
+
+ evergreen_reg_set(res, R_00899C_VGT_COMPUTE_START_X, 0);
+ evergreen_reg_set(res, R_0089A0_VGT_COMPUTE_START_Y, 0);
+ evergreen_reg_set(res, R_0089A4_VGT_COMPUTE_START_Z, 0);
+
+ evergreen_reg_set(res, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, block_layout[0]);
+ evergreen_reg_set(res, R_0286F0_SPI_COMPUTE_NUM_THREAD_Y, block_layout[1]);
+ evergreen_reg_set(res, R_0286F4_SPI_COMPUTE_NUM_THREAD_Z, block_layout[2]);
+
+ int group_size = 1;
+
+ int grid_size = 1;
+
+ for (i = 0; i < 3; i++) {
+ group_size *= block_layout[i];
+ }
+
+ for (i = 0; i < 3; i++) {
+ grid_size *= grid_layout[i];
+ }
+
+ evergreen_reg_set(res, R_008970_VGT_NUM_INDICES, group_size);
+ evergreen_reg_set(res, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE, group_size);
+
+ evergreen_emit_raw_value(res, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
+ evergreen_emit_raw_value(res, grid_layout[0]);
+ evergreen_emit_raw_value(res, grid_layout[1]);
+ evergreen_emit_raw_value(res, grid_layout[2]);
+ ///VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN
+ evergreen_emit_raw_value(res, 1);
+}
+
+static void compute_emit_cs(struct r600_context *ctx)
+{
+ struct radeon_winsys_cs *cs = ctx->cs;
+ int i;
+
+ r600_emit_atom(ctx, &ctx->start_cs_cmd.atom);
+
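+	/* Remember one of the buffers referenced by the shader; it is used
+	 * below as a crude fence to wait for the dispatch to complete. */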
+ struct r600_resource *onebo = NULL;
+
+ for (i = 0; i < get_compute_resource_num(); i++) {
+ if (ctx->cs_shader->resources[i].enabled) {
+ int j;
+ COMPUTE_DBG("resnum: %i, cdw: %i\n", i, cs->cdw);
+
+ for (j = 0; j < ctx->cs_shader->resources[i].cs_end; j++) {
+ if (ctx->cs_shader->resources[i].do_reloc[j]) {
+ assert(ctx->cs_shader->resources[i].bo);
+ evergreen_emit_ctx_reloc(ctx,
+ ctx->cs_shader->resources[i].bo,
+ ctx->cs_shader->resources[i].usage);
+ }
+
+ cs->buf[cs->cdw++] = ctx->cs_shader->resources[i].cs[j];
+ }
+
+ if (ctx->cs_shader->resources[i].bo) {
+ onebo = ctx->cs_shader->resources[i].bo;
+ evergreen_emit_ctx_reloc(ctx,
+ ctx->cs_shader->resources[i].bo,
+ ctx->cs_shader->resources[i].usage);
+
+ ///special case for textures
+ if (ctx->cs_shader->resources[i].do_reloc
+ [ctx->cs_shader->resources[i].cs_end] == 2) {
+ evergreen_emit_ctx_reloc(ctx,
+ ctx->cs_shader->resources[i].bo,
+ ctx->cs_shader->resources[i].usage);
+ }
+
+ evergreen_set_buffer_sync(ctx, ctx->cs_shader->resources[i].bo,
+ ctx->cs_shader->resources[i].coher_bo_size,
+ ctx->cs_shader->resources[i].flags,
+ ctx->cs_shader->resources[i].usage);
+ }
+ }
+ }
+
+#if 0
+ COMPUTE_DBG("cdw: %i\n", cs->cdw);
+ for (i = 0; i < cs->cdw; i++) {
+ COMPUTE_DBG("%4i : 0x%08X\n", i, ctx->cs->buf[i]);
+ }
+#endif
+
+ ctx->ws->cs_flush(ctx->cs, RADEON_FLUSH_ASYNC);
+
+ ctx->pm4_dirty_cdwords = 0;
+ ctx->flags = 0;
+
+ COMPUTE_DBG("shader started\n");
+
+ ctx->ws->buffer_wait(onebo->buf, 0);
+
+ COMPUTE_DBG("...\n");
+
+ r600_emit_atom(ctx, &ctx->start_cs_cmd.atom);
+
+ ctx->streamout_start = TRUE;
+ ctx->streamout_append_bitmask = ~0;
+
+}
+
+static void evergreen_launch_grid(
+ struct pipe_context *ctx_,
+ const uint *block_layout, const uint *grid_layout,
+ uint32_t pc, const void *input)
+{
+ COMPUTE_DBG("PC: %i\n", pc);
+
+ struct r600_context *ctx = (struct r600_context *)ctx_;
+ unsigned num_waves;
+ unsigned num_pipes = ctx->screen->info.r600_max_pipes;
+ unsigned wave_divisor = (16 * num_pipes);
+
+	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
+ num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
+ wave_divisor - 1) / wave_divisor;
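+	/* e.g. a 16x16x1 thread block (256 threads) on a 4-pipe part gives
+	 * ceil(256 / 64) = 4 wavefronts per thread group. */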
+
+ COMPUTE_DBG("Using %u pipes, there are %u wavefronts per thread block\n",
+ num_pipes, num_waves);
+
+ evergreen_set_lds(ctx->cs_shader, 0, 0, num_waves);
+ evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
+ evergreen_direct_dispatch(ctx_, block_layout, grid_layout);
+ compute_emit_cs(ctx);
+}
+
+static void evergreen_set_compute_resources(struct pipe_context * ctx_,
+ unsigned start, unsigned count,
+ struct pipe_surface ** surfaces)
+{
+ struct r600_context *ctx = (struct r600_context *)ctx_;
+ struct r600_surface **resources = (struct r600_surface **)surfaces;
+ for (int i = 0; i < count; i++) {
+ if (resources[i]) {
+ struct r600_resource_global *buffer =
+ (struct r600_resource_global*)resources[i]->base.texture;
+ if (resources[i]->base.writable) {
+ assert(i+1 < 12);
+
+ evergreen_set_rat(ctx->cs_shader, i+1,
+ (struct r600_resource *)resources[i]->base.texture,
+ buffer->chunk->start_in_dw*4,
+ resources[i]->base.texture->width0);
+ }
+
+ evergreen_set_vtx_resource(ctx->cs_shader,
+ (struct r600_resource *)resources[i]->base.texture, i+2,
+ buffer->chunk->start_in_dw*4, resources[i]->base.writable);
+ }
+ }
+
+}
+
+static void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
+ unsigned start_slot, unsigned count,
+ struct pipe_sampler_view **views)
+{
+ struct r600_context *ctx = (struct r600_context *)ctx_;
+ struct r600_pipe_sampler_view **resource =
+ (struct r600_pipe_sampler_view **)views;
+
+ for (int i = 0; i < count; i++) {
+ if (resource[i]) {
+ assert(i+1 < 12);
+			///FETCH0 = VTX0 (param buffer),
+			///FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX
+ evergreen_set_tex_resource(ctx->cs_shader, resource[i], i+2);
+ }
+ }
+}
+
+static void evergreen_bind_compute_sampler_states(
+ struct pipe_context *ctx_,
+ unsigned start_slot,
+ unsigned num_samplers,
+ void **samplers_)
+{
+ struct r600_context *ctx = (struct r600_context *)ctx_;
+ struct compute_sampler_state ** samplers =
+ (struct compute_sampler_state **)samplers_;
+
+ for (int i = 0; i < num_samplers; i++) {
+ if (samplers[i]) {
+ evergreen_set_sampler_resource(ctx->cs_shader, samplers[i], i);
+ }
+ }
+}
+
+static void evergreen_set_global_binding(
+ struct pipe_context *ctx_, unsigned first, unsigned n,
+ struct pipe_resource **resources,
+ uint32_t **handles)
+{
+ struct r600_context *ctx = (struct r600_context *)ctx_;
+ struct compute_memory_pool *pool = ctx->screen->global_pool;
+ struct r600_resource_global **buffers =
+ (struct r600_resource_global **)resources;
+
+ if (!resources) {
+ /* XXX: Unset */
+ return;
+ }
+
+ compute_memory_finalize_pending(pool, ctx_);
+
+ for (int i = 0; i < n; i++)
+ {
+ assert(resources[i]->target == PIPE_BUFFER);
+ assert(resources[i]->bind & PIPE_BIND_GLOBAL);
+
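+		/* The handle reported for each buffer is its byte offset
+		 * within the global memory pool. */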
+ *(handles[i]) = buffers[i]->chunk->start_in_dw * 4;
+ }
+
+ evergreen_set_rat(ctx->cs_shader, 0, pool->bo, 0, pool->size_in_dw * 4);
+ evergreen_set_vtx_resource(ctx->cs_shader, pool->bo, 1, 0, 1);
+}
+
+
+void evergreen_compute_init_config(struct r600_context *ctx)
+{
+ struct evergreen_compute_resource* res =
+ get_empty_res(ctx->cs_shader, COMPUTE_RESOURCE_CONFIG, 0);
+
+ int num_threads;
+ int num_stack_entries;
+ int num_temp_gprs;
+
+ enum radeon_family family;
+ unsigned tmp;
+
+ family = ctx->family;
+
+ switch (family) {
+ case CHIP_CEDAR:
+ default:
+ num_temp_gprs = 4;
+ num_threads = 128;
+ num_stack_entries = 256;
+ break;
+ case CHIP_REDWOOD:
+ num_temp_gprs = 4;
+ num_threads = 128;
+ num_stack_entries = 256;
+ break;
+ case CHIP_JUNIPER:
+ num_temp_gprs = 4;
+ num_threads = 128;
+ num_stack_entries = 512;
+ break;
+ case CHIP_CYPRESS:
+ case CHIP_HEMLOCK:
+ num_temp_gprs = 4;
+ num_threads = 128;
+ num_stack_entries = 512;
+ break;
+ case CHIP_PALM:
+ num_temp_gprs = 4;
+ num_threads = 128;
+ num_stack_entries = 256;
+ break;
+ case CHIP_SUMO:
+ num_temp_gprs = 4;
+ num_threads = 128;
+ num_stack_entries = 256;
+ break;
+ case CHIP_SUMO2:
+ num_temp_gprs = 4;
+ num_threads = 128;
+ num_stack_entries = 512;
+ break;
+ case CHIP_BARTS:
+ num_temp_gprs = 4;
+ num_threads = 128;
+ num_stack_entries = 512;
+ break;
+ case CHIP_TURKS:
+ num_temp_gprs = 4;
+ num_threads = 128;
+ num_stack_entries = 256;
+ break;
+ case CHIP_CAICOS:
+ num_temp_gprs = 4;
+ num_threads = 128;
+ num_stack_entries = 256;
+ break;
+ }
+
+ tmp = 0x00000000;
+ switch (family) {
+ case CHIP_CEDAR:
+ case CHIP_PALM:
+ case CHIP_SUMO:
+ case CHIP_SUMO2:
+ case CHIP_CAICOS:
+ break;
+ default:
+ tmp |= S_008C00_VC_ENABLE(1);
+ break;
+ }
+ tmp |= S_008C00_EXPORT_SRC_C(1);
+ tmp |= S_008C00_CS_PRIO(0);
+ tmp |= S_008C00_LS_PRIO(0);
+ tmp |= S_008C00_HS_PRIO(0);
+ tmp |= S_008C00_PS_PRIO(0);
+ tmp |= S_008C00_VS_PRIO(0);
+ tmp |= S_008C00_GS_PRIO(0);
+ tmp |= S_008C00_ES_PRIO(0);
+
+ evergreen_reg_set(res, R_008C00_SQ_CONFIG, tmp);
+
+ evergreen_reg_set(res, R_008C04_SQ_GPR_RESOURCE_MGMT_1,
+ S_008C04_NUM_CLAUSE_TEMP_GPRS(num_temp_gprs));
+ evergreen_reg_set(res, R_008C08_SQ_GPR_RESOURCE_MGMT_2, 0);
+ evergreen_reg_set(res, R_008C10_SQ_GLOBAL_GPR_RESOURCE_MGMT_1, 0);
+ evergreen_reg_set(res, R_008C14_SQ_GLOBAL_GPR_RESOURCE_MGMT_2, 0);
+ evergreen_reg_set(res, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
+ /* workaround for hw issues with dyn gpr - must set all limits to 240
+ * instead of 0, 0x1e == 240/8 */
+ evergreen_reg_set(res, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
+ S_028838_PS_GPRS(0x1e) |
+ S_028838_VS_GPRS(0x1e) |
+ S_028838_GS_GPRS(0x1e) |
+ S_028838_ES_GPRS(0x1e) |
+ S_028838_HS_GPRS(0x1e) |
+ S_028838_LS_GPRS(0x1e));
+
+
+ evergreen_reg_set(res, R_008E20_SQ_STATIC_THREAD_MGMT1, 0xFFFFFFFF);
+ evergreen_reg_set(res, R_008E24_SQ_STATIC_THREAD_MGMT2, 0xFFFFFFFF);
+ evergreen_reg_set(res, R_008E28_SQ_STATIC_THREAD_MGMT3, 0xFFFFFFFF);
+ evergreen_reg_set(res, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 0);
+ tmp = S_008C1C_NUM_LS_THREADS(num_threads);
+ evergreen_reg_set(res, R_008C1C_SQ_THREAD_RESOURCE_MGMT_2, tmp);
+ evergreen_reg_set(res, R_008C20_SQ_STACK_RESOURCE_MGMT_1, 0);
+ evergreen_reg_set(res, R_008C24_SQ_STACK_RESOURCE_MGMT_2, 0);
+ tmp = S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries);
+ evergreen_reg_set(res, R_008C28_SQ_STACK_RESOURCE_MGMT_3, tmp);
+ evergreen_reg_set(res, R_0286CC_SPI_PS_IN_CONTROL_0, S_0286CC_LINEAR_GRADIENT_ENA(1));
+ evergreen_reg_set(res, R_0286D0_SPI_PS_IN_CONTROL_1, 0);
+ evergreen_reg_set(res, R_0286E4_SPI_PS_IN_CONTROL_2, 0);
+ evergreen_reg_set(res, R_0286D8_SPI_INPUT_Z, 0);
+ evergreen_reg_set(res, R_0286E0_SPI_BARYC_CNTL, 1 << 20);
+ tmp = S_0286E8_TID_IN_GROUP_ENA | S_0286E8_TGID_ENA | S_0286E8_DISABLE_INDEX_PACK;
+ evergreen_reg_set(res, R_0286E8_SPI_COMPUTE_INPUT_CNTL, tmp);
+ tmp = S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1);
+ evergreen_reg_set(res, R_028A40_VGT_GS_MODE, tmp);
+ evergreen_reg_set(res, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
+ evergreen_reg_set(res, R_028800_DB_DEPTH_CONTROL, 0);
+ evergreen_reg_set(res, R_02880C_DB_SHADER_CONTROL, 0);
+ evergreen_reg_set(res, R_028000_DB_RENDER_CONTROL, S_028000_COLOR_DISABLE(1));
+ evergreen_reg_set(res, R_02800C_DB_RENDER_OVERRIDE, 0);
+}
+
+void evergreen_init_compute_state_functions(struct r600_context *ctx)
+{
+ ctx->context.create_compute_state = evergreen_create_compute_state;
+ ctx->context.delete_compute_state = evergreen_delete_compute_state;
+ ctx->context.bind_compute_state = evergreen_bind_compute_state;
+// ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
+ ctx->context.set_compute_resources = evergreen_set_compute_resources;
+ ctx->context.set_compute_sampler_views = evergreen_set_cs_sampler_view;
+ ctx->context.bind_compute_sampler_states = evergreen_bind_compute_sampler_states;
+ ctx->context.set_global_binding = evergreen_set_global_binding;
+ ctx->context.launch_grid = evergreen_launch_grid;
+}
+
+
+struct pipe_resource *r600_compute_global_buffer_create(
+ struct pipe_screen *screen,
+ const struct pipe_resource *templ)
+{
+ assert(templ->target == PIPE_BUFFER);
+ assert(templ->bind & PIPE_BIND_GLOBAL);
+ assert(templ->array_size == 1 || templ->array_size == 0);
+ assert(templ->depth0 == 1 || templ->depth0 == 0);
+ assert(templ->height0 == 1 || templ->height0 == 0);
+
+ struct r600_resource_global* result = (struct r600_resource_global*)
+ CALLOC(sizeof(struct r600_resource_global), 1);
+ struct r600_screen* rscreen = (struct r600_screen*)screen;
+
+ result->base.b.vtbl = &r600_global_buffer_vtbl;
+ result->base.b.b.screen = screen;
+ result->base.b.b = *templ;
+ pipe_reference_init(&result->base.b.b.reference, 1);
+
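+	/* Round the size in bytes up to a whole number of dwords. */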
+ int size_in_dw = (templ->width0+3) / 4;
+
+ result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
+
+ if (result->chunk == NULL)
+ {
+ free(result);
+ return NULL;
+ }
+
+ return &result->base.b.b;
+}
+
+void r600_compute_global_buffer_destroy(
+ struct pipe_screen *screen,
+ struct pipe_resource *res)
+{
+ assert(res->target == PIPE_BUFFER);
+ assert(res->bind & PIPE_BIND_GLOBAL);
+
+ struct r600_resource_global* buffer = (struct r600_resource_global*)res;
+ struct r600_screen* rscreen = (struct r600_screen*)screen;
+
+ compute_memory_free(rscreen->global_pool, buffer->chunk->id);
+
+ buffer->chunk = NULL;
+ free(res);
+}
+
+void* r600_compute_global_transfer_map(
+ struct pipe_context *ctx_,
+ struct pipe_transfer* transfer)
+{
+ assert(transfer->resource->target == PIPE_BUFFER);
+ assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
+ assert(transfer->box.x >= 0);
+ assert(transfer->box.y == 0);
+ assert(transfer->box.z == 0);
+
+ struct r600_context *ctx = (struct r600_context *)ctx_;
+ struct r600_resource_global* buffer =
+ (struct r600_resource_global*)transfer->resource;
+
+ uint32_t* map;
+ ///TODO: do it better, mapping is not possible if the pool is too big
+
+ if (!(map = ctx->ws->buffer_map(buffer->chunk->pool->bo->cs_buf,
+ ctx->cs, transfer->usage))) {
+ return NULL;
+ }
+
+ COMPUTE_DBG("buffer start: %lli\n", buffer->chunk->start_in_dw);
+ return ((char*)(map + buffer->chunk->start_in_dw)) + transfer->box.x;
+}
+
+void r600_compute_global_transfer_unmap(
+ struct pipe_context *ctx_,
+ struct pipe_transfer* transfer)
+{
+ assert(transfer->resource->target == PIPE_BUFFER);
+ assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
+
+ struct r600_context *ctx = (struct r600_context *)ctx_;
+ struct r600_resource_global* buffer =
+ (struct r600_resource_global*)transfer->resource;
+
+ ctx->ws->buffer_unmap(buffer->chunk->pool->bo->cs_buf);
+}
+
+struct pipe_transfer * r600_compute_global_get_transfer(
+ struct pipe_context *ctx_,
+ struct pipe_resource *resource,
+ unsigned level,
+ unsigned usage,
+ const struct pipe_box *box)
+{
+	struct r600_context *ctx = (struct r600_context *)ctx_;
+	struct compute_memory_pool *pool = ctx->screen->global_pool;
+
+	compute_memory_finalize_pending(pool, ctx_);
+
+	assert(resource->target == PIPE_BUFFER);
+	struct pipe_transfer *transfer = util_slab_alloc(&ctx->pool_transfers);
+
+ transfer->resource = resource;
+ transfer->level = level;
+ transfer->usage = usage;
+ transfer->box = *box;
+ transfer->stride = 0;
+ transfer->layer_stride = 0;
+ transfer->data = NULL;
+
+ /* Note strides are zero, this is ok for buffers, but not for
+ * textures 2d & higher at least.
+ */
+ return transfer;
+}
+
+void r600_compute_global_transfer_destroy(
+ struct pipe_context *ctx_,
+ struct pipe_transfer *transfer)
+{
+ struct r600_context *rctx = (struct r600_context*)ctx_;
+ util_slab_free(&rctx->pool_transfers, transfer);
+}
+
+void r600_compute_global_transfer_flush_region(
+ struct pipe_context *ctx_,
+ struct pipe_transfer *transfer,
+ const struct pipe_box *box)
+{
+ assert(0 && "TODO");
+}
+
+void r600_compute_global_transfer_inline_write(
+ struct pipe_context *pipe,
+ struct pipe_resource *resource,
+ unsigned level,
+ unsigned usage,
+ const struct pipe_box *box,
+ const void *data,
+ unsigned stride,
+ unsigned layer_stride)
+{
+ assert(0 && "TODO");
+}