diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/gallium/drivers/panfrost/pan_compute.c | 14 | ||||
-rw-r--r-- | src/gallium/drivers/panfrost/pan_mfbd.c | 12 | ||||
-rw-r--r-- | src/gallium/drivers/panfrost/pan_sfbd.c | 6 | ||||
-rw-r--r-- | src/panfrost/include/panfrost-job.h | 53 | ||||
-rw-r--r-- | src/panfrost/pandecode/decode.c | 107 |
5 files changed, 96 insertions, 96 deletions
diff --git a/src/gallium/drivers/panfrost/pan_compute.c b/src/gallium/drivers/panfrost/pan_compute.c index 56bac7a8523..33618cb6997 100644 --- a/src/gallium/drivers/panfrost/pan_compute.c +++ b/src/gallium/drivers/panfrost/pan_compute.c @@ -127,20 +127,12 @@ panfrost_launch_grid(struct pipe_context *pipe, panfrost_emit_for_draw(ctx, false); - /* Compute jobs have a "compute FBD". It's not a real framebuffer - * descriptor - there is no framebuffer - but it takes the place of - * one. As far as I can tell, it's actually the beginning of a - * single-render-target framebuffer descriptor with almost everything - * zeroed out. - */ - struct mali_compute_fbd compute_fbd = { - .unknown1 = { - 0, 0x1F, 0, 0, 0, 0, 0, 0 - } + struct mali_shared_memory shared = { + .shared_workgroup_count = ~0 }; payload->postfix.framebuffer = - panfrost_upload_transient(batch, &compute_fbd, sizeof(compute_fbd)); + panfrost_upload_transient(batch, &shared, sizeof(shared)); /* Invoke according to the grid info */ diff --git a/src/gallium/drivers/panfrost/pan_mfbd.c b/src/gallium/drivers/panfrost/pan_mfbd.c index fe427c452b0..3e0f5cbd275 100644 --- a/src/gallium/drivers/panfrost/pan_mfbd.c +++ b/src/gallium/drivers/panfrost/pan_mfbd.c @@ -380,12 +380,14 @@ panfrost_emit_mfbd(struct panfrost_batch *batch, unsigned vertex_count) .rt_count_1 = MALI_POSITIVE(batch->key.nr_cbufs), .rt_count_2 = 4, - .unknown2 = 0x1f, .tiler = panfrost_emit_midg_tiler(batch, vertex_count), - - .stack_shift = shift, - .unk0 = 0x1e, - .scratchpad = panfrost_batch_get_scratchpad(batch, shift, screen->thread_tls_alloc, screen->core_count)->gpu + + .shared_memory = { + .unk0 = 0x1e, + .stack_shift = shift, + .scratchpad = panfrost_batch_get_scratchpad(batch, shift, screen->thread_tls_alloc, screen->core_count)->gpu, + .shared_workgroup_count = ~0, + } }; return framebuffer; diff --git a/src/gallium/drivers/panfrost/pan_sfbd.c b/src/gallium/drivers/panfrost/pan_sfbd.c index 97d00651076..a4d29adc460 100644 --- 
a/src/gallium/drivers/panfrost/pan_sfbd.c +++ b/src/gallium/drivers/panfrost/pan_sfbd.c @@ -213,12 +213,14 @@ panfrost_emit_sfbd(struct panfrost_batch *batch, unsigned vertex_count) struct mali_single_framebuffer framebuffer = { .width = MALI_POSITIVE(width), .height = MALI_POSITIVE(height), - .unknown2 = 0x1f, + .shared_memory = { + .shared_workgroup_count = ~0, + .scratchpad = panfrost_batch_get_scratchpad(batch, shift, screen->thread_tls_alloc, screen->core_count)->gpu, + }, .format = { .unk3 = 0x3, }, .clear_flags = 0x1000, - .scratchpad = panfrost_batch_get_scratchpad(batch, shift, screen->thread_tls_alloc, screen->core_count)->gpu, .tiler = panfrost_emit_midg_tiler(batch, vertex_count), }; diff --git a/src/panfrost/include/panfrost-job.h b/src/panfrost/include/panfrost-job.h index 1a59f4c77c8..8027abebf34 100644 --- a/src/panfrost/include/panfrost-job.h +++ b/src/panfrost/include/panfrost-job.h @@ -1470,14 +1470,41 @@ struct mali_sfbd_format { unsigned unk3 : 4; }; -struct mali_single_framebuffer { - u32 unknown1; - u32 unknown2; +/* Shared structure at the start of framebuffer descriptors, or used bare for + * compute jobs, configuring stack and shared memory */ + +struct mali_shared_memory { + u32 stack_shift : 4; + u32 unk0 : 28; + + /* Configuration for shared memory for compute shaders. + * shared_workgroup_count is logarithmic and may be computed for a + * compute shader using shared memory as: + * + * shared_workgroup_count = MAX2(ceil(log2(count_x)) + ... + ceil(log2(count_z)), 10) + * + * For compute shaders that don't use shared memory, or non-compute + * shaders, this is set to ~0 + */ + + u32 shared_workgroup_count : 5; + u32 shared_unk1 : 3; + u32 shared_shift : 4; + u32 shared_zero : 20; + mali_ptr scratchpad; - u64 zero1; - u64 zero0; + /* For compute shaders, the RAM backing of workgroup-shared memory.
For + * fragment shaders on Bifrost, apparently multisampling locations */ + + mali_ptr shared_memory; + mali_ptr unknown1; +} __attribute__((packed)); + + +struct mali_single_framebuffer { + struct mali_shared_memory shared_memory; struct mali_sfbd_format format; u32 clear_flags; @@ -1540,13 +1567,6 @@ struct mali_single_framebuffer { /* More below this, maybe */ } __attribute__((packed)); -/* On Midgard, this "framebuffer descriptor" is used for the framebuffer field - * of compute jobs. Superficially resembles a single framebuffer descriptor */ - -struct mali_compute_fbd { - u32 unknown1[8]; -} __attribute__((packed)); - /* Format bits for the render target flags */ #define MALI_MFBD_FORMAT_MSAA (1 << 1) @@ -1675,15 +1695,8 @@ struct bifrost_fb_extra { #define MALI_MFBD_EXTRA (1 << 13) struct bifrost_framebuffer { - u32 stack_shift : 4; - u32 unk0 : 28; + struct mali_shared_memory shared_memory; - u32 unknown2; // = 0x1f, same as SFBD - mali_ptr scratchpad; - - /* 0x10 */ - mali_ptr sample_locations; - mali_ptr unknown1; /* 0x20 */ u16 width1, height1; u32 zero3; diff --git a/src/panfrost/pandecode/decode.c b/src/panfrost/pandecode/decode.c index dc755fcc364..41b843f7a3b 100644 --- a/src/panfrost/pandecode/decode.c +++ b/src/panfrost/pandecode/decode.c @@ -666,6 +666,41 @@ pandecode_sfbd_format(struct mali_sfbd_format format) pandecode_log("},\n"); } +static void +pandecode_shared_memory(const struct mali_shared_memory *desc, bool is_compute) +{ + pandecode_prop("stack_shift = 0x%x", desc->stack_shift); + + if (desc->unk0) + pandecode_prop("unk0 = 0x%x", desc->unk0); + + if (desc->shared_workgroup_count != 0x1F) { + pandecode_prop("shared_workgroup_count = %d", desc->shared_workgroup_count); + if (!is_compute) + pandecode_msg("XXX: wrong workgroup count for noncompute\n"); + } + + if (desc->shared_unk1 || desc->shared_shift) { + pandecode_prop("shared_unk1 = %X", desc->shared_unk1); + pandecode_prop("shared_shift = %X", desc->shared_shift); + + if (!is_compute) 
+ pandecode_msg("XXX: shared memory configured in noncompute shader"); + } + + if (desc->shared_zero) { + pandecode_msg("XXX: shared memory zero tripped\n"); + pandecode_prop("shared_zero = 0x%" PRIx32, desc->shared_zero); + } + + if (desc->shared_memory && !is_compute) + pandecode_msg("XXX: shared memory used in noncompute shader\n"); + + MEMORY_PROP(desc, scratchpad); + MEMORY_PROP(desc, shared_memory); + MEMORY_PROP(desc, unknown1); +} + static struct pandecode_fbd pandecode_sfbd(uint64_t gpu_va, int job_no, bool is_fragment, unsigned gpu_id) { @@ -680,8 +715,11 @@ pandecode_sfbd(uint64_t gpu_va, int job_no, bool is_fragment, unsigned gpu_id) pandecode_log("struct mali_single_framebuffer framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no); pandecode_indent++; - pandecode_prop("unknown1 = 0x%" PRIx32, s->unknown1); - pandecode_prop("unknown2 = 0x%" PRIx32, s->unknown2); + pandecode_log(".shared_memory = {\n"); + pandecode_indent++; + pandecode_shared_memory(&s->shared_memory, false); + pandecode_indent--; + pandecode_log("},\n"); pandecode_sfbd_format(s->format); @@ -748,7 +786,6 @@ pandecode_sfbd(uint64_t gpu_va, int job_no, bool is_fragment, unsigned gpu_id) pandecode_prop("clear_stencil = 0x%x", s->clear_stencil); } - MEMORY_PROP(s, scratchpad); const struct midgard_tiler_descriptor t = s->tiler; bool has_hierarchy = !(gpu_id == 0x0720 || gpu_id == 0x0820 || gpu_id == 0x0830); @@ -757,8 +794,6 @@ pandecode_sfbd(uint64_t gpu_va, int job_no, bool is_fragment, unsigned gpu_id) pandecode_indent--; pandecode_log("};\n"); - pandecode_prop("zero0 = 0x%" PRIx64, s->zero0); - pandecode_prop("zero1 = 0x%" PRIx64, s->zero1); pandecode_prop("zero2 = 0x%" PRIx32, s->zero2); pandecode_prop("zero4 = 0x%" PRIx32, s->zero4); pandecode_prop("zero5 = 0x%" PRIx32, s->zero5); @@ -784,20 +819,13 @@ static void pandecode_compute_fbd(uint64_t gpu_va, int job_no) { struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va); - const struct mali_compute_fbd 
*PANDECODE_PTR_VAR(s, mem, (mali_ptr) gpu_va); + const struct mali_shared_memory *PANDECODE_PTR_VAR(s, mem, (mali_ptr) gpu_va); - pandecode_log("struct mali_compute_fbd framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no); + pandecode_log("struct mali_shared_memory shared_%"PRIx64"_%d = {\n", gpu_va, job_no); pandecode_indent++; - - pandecode_log(".unknown1 = {"); - - for (int i = 0; i < ARRAY_SIZE(s->unknown1); ++i) - pandecode_log_cont("%X, ", s->unknown1[i]); - - pandecode_log("},\n"); - + pandecode_shared_memory(s, true); pandecode_indent--; - pandecode_log_cont("},\n"); + pandecode_log("},\n"); } /* Extracts the number of components associated with a Mali format */ @@ -1034,45 +1062,14 @@ pandecode_mfbd_bfr(uint64_t gpu_va, int job_no, bool is_fragment, bool is_comput struct pandecode_fbd info; - if (fb->sample_locations) { - /* The blob stores all possible sample locations in a single buffer - * allocated on startup, and just switches the pointer when switching - * MSAA state. For now, we just put the data into the cmdstream, but we - * should do something like what the blob does with a real driver. - * - * There seem to be 32 slots for sample locations, followed by another - * 16. The second 16 is just the center location followed by 15 zeros - * in all the cases I've identified (maybe shader vs. depth/color - * samples?). 
- */ - - struct pandecode_mapped_memory *smem = pandecode_find_mapped_gpu_mem_containing(fb->sample_locations); - - const u16 *PANDECODE_PTR_VAR(samples, smem, fb->sample_locations); - - pandecode_log("uint16_t sample_locations_%d[] = {\n", job_no); - pandecode_indent++; - - for (int i = 0; i < 32 + 16; i++) { - pandecode_log("%d, %d,\n", samples[2 * i], samples[2 * i + 1]); - } - - pandecode_indent--; - pandecode_log("};\n"); - } - pandecode_log("struct bifrost_framebuffer framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no); pandecode_indent++; - pandecode_prop("stack_shift = 0x%x", fb->stack_shift); - pandecode_prop("unk0 = 0x%x", fb->unk0); - - if (fb->sample_locations) - pandecode_prop("sample_locations = sample_locations_%d", job_no); - - /* Assume that unknown1 was emitted in the last job for - * now */ - MEMORY_PROP(fb, unknown1); + pandecode_log(".shared_memory = {\n"); + pandecode_indent++; + pandecode_shared_memory(&fb->shared_memory, is_compute); + pandecode_indent--; + pandecode_log("},\n"); info.width = fb->width1 + 1; info.height = fb->height1 + 1; @@ -1098,12 +1095,6 @@ pandecode_mfbd_bfr(uint64_t gpu_va, int job_no, bool is_fragment, bool is_comput if (fb->clear_depth) pandecode_prop("clear_depth = %f", fb->clear_depth); - /* TODO: What is this? Let's not blow up.. */ - if (fb->unknown2 != 0x1F) - pandecode_prop("unknown2 = 0x%x", fb->unknown2); - - pandecode_prop("unknown2 = 0x%x", fb->unknown2); - MEMORY_PROP(fb, scratchpad); const struct midgard_tiler_descriptor t = fb->tiler; if (!is_compute) pandecode_midgard_tiler_descriptor(&t, fb->width1 + 1, fb->height1 + 1, is_fragment, true); |