-rw-r--r--  src/gallium/drivers/panfrost/meson.build      |   1
-rw-r--r--  src/gallium/drivers/panfrost/pan_context.c    | 132
-rw-r--r--  src/gallium/drivers/panfrost/pan_context.h    |  20
-rw-r--r--  src/gallium/drivers/panfrost/pan_drm.c        |   6
-rw-r--r--  src/gallium/drivers/panfrost/pan_job.c        |  18
-rw-r--r--  src/gallium/drivers/panfrost/pan_job.h        |  71
-rw-r--r--  src/gallium/drivers/panfrost/pan_scoreboard.c | 453
7 files changed, 558 insertions(+), 143 deletions(-)
diff --git a/src/gallium/drivers/panfrost/meson.build b/src/gallium/drivers/panfrost/meson.build
index 43d73ce2086..4298242f6b9 100644
--- a/src/gallium/drivers/panfrost/meson.build
+++ b/src/gallium/drivers/panfrost/meson.build
@@ -57,6 +57,7 @@ files_panfrost = files(
         'pan_blend_shaders.c',
         'pan_pretty_print.c',
         'pan_fragment.c',
+        'pan_scoreboard.c',
         'pan_sfbd.c',
         'pan_mfbd.c',
         'pan_tiler.c',
diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c
index 41656236b5b..d8c5510a31e 100644
--- a/src/gallium/drivers/panfrost/pan_context.c
+++ b/src/gallium/drivers/panfrost/pan_context.c
@@ -517,15 +517,6 @@ panfrost_default_shader_backend(struct panfrost_context *ctx)
         memcpy(&ctx->fragment_shader_core, &shader, sizeof(shader));
 }
 
-static void
-panfrost_link_job_pair(struct mali_job_descriptor_header *first, mali_ptr next)
-{
-        if (first->job_descriptor_size)
-                first->next_job_64 = (u64) (uintptr_t) next;
-        else
-                first->next_job_32 = (u32) (uintptr_t) next;
-}
-
 /* Generates a vertex/tiler job. This is, in some sense, the heart of the
  * graphics command stream. It should be called once per draw, according to
  * presentations. Set is_tiler for "tiler" jobs (fragment shader jobs, but in
@@ -535,12 +526,8 @@ panfrost_link_job_pair(struct mali_job_descriptor_header *first, mali_ptr next)
 struct panfrost_transfer
 panfrost_vertex_tiler_job(struct panfrost_context *ctx, bool is_tiler)
 {
-        /* Each draw call corresponds to two jobs, and the set-value job is first */
-        int draw_job_index = 1 + (2 * ctx->draw_count) + 1;
-
         struct mali_job_descriptor_header job = {
                 .job_type = is_tiler ? JOB_TYPE_TILER : JOB_TYPE_VERTEX,
-                .job_index = draw_job_index + (is_tiler ? 1 : 0),
 #ifdef __LP64__
                 .job_descriptor_size = 1,
 #endif
@@ -557,65 +544,11 @@ panfrost_vertex_tiler_job(struct panfrost_context *ctx, bool is_tiler)
 #endif
 
         struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sizeof(job) + sizeof(*payload));
 
-        if (is_tiler) {
-                /* Tiler jobs depend on vertex jobs */
-
-                job.job_dependency_index_1 = draw_job_index;
-
-                /* Tiler jobs also depend on the previous tiler job */
-
-                if (ctx->draw_count) {
-                        job.job_dependency_index_2 = draw_job_index - 1;
-                        /* Previous tiler job points to this tiler job */
-                        panfrost_link_job_pair(ctx->u_tiler_jobs[ctx->draw_count - 1], transfer.gpu);
-                } else {
-                        /* The only vertex job so far points to first tiler job */
-                        panfrost_link_job_pair(ctx->u_vertex_jobs[0], transfer.gpu);
-                }
-        } else {
-                if (ctx->draw_count) {
-                        /* Previous vertex job points to this vertex job */
-                        panfrost_link_job_pair(ctx->u_vertex_jobs[ctx->draw_count - 1], transfer.gpu);
-
-                        /* Last vertex job points to first tiler job */
-                        panfrost_link_job_pair(&job, ctx->tiler_jobs[0]);
-                } else {
-                        /* Have the first vertex job depend on the set value job */
-                        job.job_dependency_index_1 = ctx->u_set_value_job->job_index;
-                        panfrost_link_job_pair(ctx->u_set_value_job, transfer.gpu);
-                }
-        }
-
         memcpy(transfer.cpu, &job, sizeof(job));
         memcpy(transfer.cpu + sizeof(job) - offset, payload, sizeof(*payload));
         return transfer;
 }
 
-/* Generates a set value job. It's unclear what exactly this does, why it's
- * necessary, and when to call it.
- */
-
-static void
-panfrost_set_value_job(struct panfrost_context *ctx)
-{
-        struct mali_job_descriptor_header job = {
-                .job_type = JOB_TYPE_SET_VALUE,
-                .job_descriptor_size = 1,
-                .job_index = 1,
-        };
-
-        struct mali_payload_set_value payload = {
-                .out = ctx->tiler_polygon_list.gpu,
-                .unknown = 0x3,
-        };
-
-        struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sizeof(job) + sizeof(payload));
-        memcpy(transfer.cpu, &job, sizeof(job));
-        memcpy(transfer.cpu + sizeof(job), &payload, sizeof(payload));
-
-        ctx->u_set_value_job = (struct mali_job_descriptor_header *) transfer.cpu;
-        ctx->set_value_job = transfer.gpu;
-}
-
 static mali_ptr
 panfrost_emit_varyings(
         struct panfrost_context *ctx,
@@ -1214,6 +1147,7 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
 
                 for (unsigned i = 0; i < 1; ++i) {
                         bool is_srgb =
+                                (ctx->pipe_framebuffer.nr_cbufs > i) &&
                                 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
 
                         rts[i].flags = blend_count;
@@ -1377,7 +1311,7 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
          * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
          * just... be faster :) */
 
-        if (!ctx->in_wallpaper)
+        if (!ctx->wallpaper_batch)
                 panfrost_job_union_scissor(job, minx, miny, maxx, maxy);
 
         /* Upload */
@@ -1401,28 +1335,18 @@
 static void
 panfrost_queue_draw(struct panfrost_context *ctx)
 {
-        /* TODO: Expand the array? */
-        if (ctx->draw_count >= MAX_DRAW_CALLS) {
-                DBG("Job buffer overflow, ignoring draw\n");
-                assert(0);
-        }
-
         /* Handle dirty flags now */
         panfrost_emit_for_draw(ctx, true);
 
-        /* We need a set_value job before any other draw jobs */
-        if (ctx->draw_count == 0)
-                panfrost_set_value_job(ctx);
-
         struct panfrost_transfer vertex = panfrost_vertex_tiler_job(ctx, false);
-        ctx->u_vertex_jobs[ctx->vertex_job_count] = (struct mali_job_descriptor_header *) vertex.cpu;
-        ctx->vertex_jobs[ctx->vertex_job_count++] = vertex.gpu;
-
         struct panfrost_transfer tiler = panfrost_vertex_tiler_job(ctx, true);
-        ctx->u_tiler_jobs[ctx->tiler_job_count] = (struct mali_job_descriptor_header *) tiler.cpu;
-        ctx->tiler_jobs[ctx->tiler_job_count++] = tiler.gpu;
-
-        ctx->draw_count++;
+        struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx);
+
+        if (ctx->wallpaper_batch)
+                panfrost_scoreboard_queue_fused_job_prepend(batch, vertex, tiler);
+        else
+                panfrost_scoreboard_queue_fused_job(batch, vertex, tiler);
 }
 
 /* The entire frame is in memory -- send it off to the kernel! */
@@ -1473,37 +1397,12 @@ panfrost_draw_wallpaper(struct pipe_context *pipe)
         if (ctx->pipe_framebuffer.cbufs[0] == NULL)
                 return;
 
-        /* Blit the wallpaper in */
-        ctx->in_wallpaper = true;
-        panfrost_blit_wallpaper(ctx);
-        ctx->in_wallpaper = false;
-
-        /* We are flushing all queued draws and we know that no more jobs will
-         * be added until the next frame.
-         * We also know that the last jobs are the wallpaper jobs, and they
-         * need to be linked so they execute right after the set_value job.
-         */
-
-        /* set_value job to wallpaper vertex job */
-        panfrost_link_job_pair(ctx->u_set_value_job, ctx->vertex_jobs[ctx->vertex_job_count - 1]);
-        ctx->u_vertex_jobs[ctx->vertex_job_count - 1]->job_dependency_index_1 = ctx->u_set_value_job->job_index;
+        /* Save the batch */
+        struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx);
 
-        /* wallpaper vertex job to first vertex job */
-        panfrost_link_job_pair(ctx->u_vertex_jobs[ctx->vertex_job_count - 1], ctx->vertex_jobs[0]);
-        ctx->u_vertex_jobs[0]->job_dependency_index_1 = ctx->u_set_value_job->job_index;
-
-        /* last vertex job to wallpaper tiler job */
-        panfrost_link_job_pair(ctx->u_vertex_jobs[ctx->vertex_job_count - 2], ctx->tiler_jobs[ctx->tiler_job_count - 1]);
-        ctx->u_tiler_jobs[ctx->tiler_job_count - 1]->job_dependency_index_1 = ctx->u_vertex_jobs[ctx->vertex_job_count - 1]->job_index;
-        ctx->u_tiler_jobs[ctx->tiler_job_count - 1]->job_dependency_index_2 = 0;
-
-        /* wallpaper tiler job to first tiler job */
-        panfrost_link_job_pair(ctx->u_tiler_jobs[ctx->tiler_job_count - 1], ctx->tiler_jobs[0]);
-        ctx->u_tiler_jobs[0]->job_dependency_index_1 = ctx->u_vertex_jobs[0]->job_index;
-        ctx->u_tiler_jobs[0]->job_dependency_index_2 = ctx->u_tiler_jobs[ctx->tiler_job_count - 1]->job_index;
-
-        /* last tiler job to NULL */
-        panfrost_link_job_pair(ctx->u_tiler_jobs[ctx->tiler_job_count - 2], 0);
+        ctx->wallpaper_batch = batch;
+        panfrost_blit_wallpaper(ctx);
+        ctx->wallpaper_batch = NULL;
 }
 
 void
@@ -1516,7 +1415,7 @@ panfrost_flush(
         struct panfrost_job *job = panfrost_get_job_for_fbo(ctx);
 
         /* Nothing to do! */
-        if (!ctx->draw_count && !job->clear) return;
+        if (!job->last_job.gpu && !job->clear) return;
 
         if (!job->clear)
                 panfrost_draw_wallpaper(&ctx->base);
@@ -2256,8 +2155,9 @@ panfrost_set_framebuffer_state(struct pipe_context *pctx,
          * state is being restored by u_blitter */
 
+        struct panfrost_job *job = panfrost_get_job_for_fbo(ctx);
         bool is_scanout = panfrost_is_scanout(ctx);
-        bool has_draws = ctx->draw_count > 0;
+        bool has_draws = job->last_job.gpu;
 
         if (!ctx->blitter->running && (!is_scanout || has_draws)) {
                 panfrost_flush(pctx, NULL, PIPE_FLUSH_END_OF_FRAME);
diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h
index deba4668767..1f718bcd9c4 100644
--- a/src/gallium/drivers/panfrost/pan_context.h
+++ b/src/gallium/drivers/panfrost/pan_context.h
@@ -47,7 +47,6 @@
 /* Forward declare to avoid extra header dep */
 struct prim_convert_context;
 
-#define MAX_DRAW_CALLS 4096
 #define MAX_VARYINGS 4096
 
 //#define PAN_DIRTY_CLEAR (1 << 0)
@@ -149,23 +148,6 @@ struct panfrost_context {
 
         struct mali_shader_meta fragment_shader_core;
 
-        /* A frame is composed of a starting set value job, a number of vertex
-         * and tiler jobs, linked to the fragment job at the end. See the
-         * presentations for more information how this works */
-
-        unsigned draw_count;
-
-        mali_ptr set_value_job;
-        mali_ptr vertex_jobs[MAX_DRAW_CALLS];
-        mali_ptr tiler_jobs[MAX_DRAW_CALLS];
-
-        struct mali_job_descriptor_header *u_set_value_job;
-        struct mali_job_descriptor_header *u_vertex_jobs[MAX_DRAW_CALLS];
-        struct mali_job_descriptor_header *u_tiler_jobs[MAX_DRAW_CALLS];
-
-        unsigned vertex_job_count;
-        unsigned tiler_job_count;
-
         /* Per-draw Dirty flags are setup like any other driver */
         int dirty;
 
@@ -201,7 +183,7 @@ struct panfrost_context {
         struct primconvert_context *primconvert;
         struct blitter_context *blitter;
 
-        bool in_wallpaper;
+        struct panfrost_job *wallpaper_batch;
 
         struct panfrost_blend_state *blend;
 
diff --git a/src/gallium/drivers/panfrost/pan_drm.c b/src/gallium/drivers/panfrost/pan_drm.c
index aed50477ff7..77ec419398e 100644
--- a/src/gallium/drivers/panfrost/pan_drm.c
+++ b/src/gallium/drivers/panfrost/pan_drm.c
@@ -258,8 +258,10 @@ panfrost_drm_submit_vs_fs_job(struct panfrost_context *ctx, bool has_draws, bool
         struct pipe_surface *surf = ctx->pipe_framebuffer.cbufs[0];
         int ret;
 
-        if (has_draws) {
-                ret = panfrost_drm_submit_job(ctx, ctx->set_value_job, 0, NULL);
+        struct panfrost_job *job = panfrost_get_job_for_fbo(ctx);
+
+        if (job->first_job.gpu) {
+                ret = panfrost_drm_submit_job(ctx, job->first_job.gpu, 0, NULL);
                 assert(!ret);
         }
 
diff --git a/src/gallium/drivers/panfrost/pan_job.c b/src/gallium/drivers/panfrost/pan_job.c
index 37bd70cb9a9..1878d6855c7 100644
--- a/src/gallium/drivers/panfrost/pan_job.c
+++ b/src/gallium/drivers/panfrost/pan_job.c
@@ -41,6 +41,9 @@ panfrost_create_job(struct panfrost_context *ctx)
 
         job->minx = job->miny = ~0;
         job->maxx = job->maxy = 0;
+
+        util_dynarray_init(&job->headers, job);
+        util_dynarray_init(&job->gpu_headers, job);
 
         return job;
 }
@@ -102,6 +105,12 @@ panfrost_get_job(struct panfrost_context *ctx,
 struct panfrost_job *
 panfrost_get_job_for_fbo(struct panfrost_context *ctx)
 {
+        /* If we're wallpapering, we special case to work around
+         * u_blitter abuse */
+
+        if (ctx->wallpaper_batch)
+                return ctx->wallpaper_batch;
+
         /* If we already began rendering, use that */
 
         if (ctx->job)
@@ -151,7 +160,9 @@ panfrost_job_submit(struct panfrost_context *ctx, struct panfrost_job *job)
         struct panfrost_screen *screen = pan_screen(gallium->screen);
         int ret;
 
-        bool has_draws = ctx->draw_count > 0;
+        panfrost_scoreboard_link_batch(job);
+
+        bool has_draws = job->last_job.gpu;
         bool is_scanout = panfrost_is_scanout(ctx);
 
         if (!job)
@@ -161,11 +172,6 @@ panfrost_job_submit(struct panfrost_context *ctx, struct panfrost_job *job)
 
         if (ret)
                 fprintf(stderr, "panfrost_job_submit failed: %d\n", ret);
-
-        /* Reset job counters */
-        ctx->draw_count = 0;
-        ctx->vertex_job_count = 0;
-        ctx->tiler_job_count = 0;
 }
 
 void
diff --git a/src/gallium/drivers/panfrost/pan_job.h b/src/gallium/drivers/panfrost/pan_job.h
index ed8b084246a..b4c9db9828e 100644
--- a/src/gallium/drivers/panfrost/pan_job.h
+++ b/src/gallium/drivers/panfrost/pan_job.h
@@ -26,6 +26,11 @@
 #ifndef __PAN_JOB_H__
 #define __PAN_JOB_H__
 
+#include "util/u_dynarray.h"
+#include "pipe/p_state.h"
+#include "pan_allocate.h"
+#include "pan_resource.h"
+
 /* Used as a hash table key */
 
 struct panfrost_job_key {
@@ -61,6 +66,40 @@ struct panfrost_job {
         unsigned minx, miny;
         unsigned maxx, maxy;
 
+        /* CPU pointers to the job descriptor headers. next_job is only
+         * set at submit time (since only then are all the dependencies
+         * known). The upshot is that this is append-only.
+         *
+         * These arrays contain the headers for the "primary batch", our jargon
+         * referring to the part of the panfrost_job that actually contains
+         * meaningful work. In an OpenGL ES setting, that means the
+         * SET_VALUE/VERTEX/TILER jobs. Excluded is specifically the FRAGMENT
+         * job, which is sent on as a secondary batch containing only a single
+         * hardware job. Since there's one and only one FRAGMENT job issued per
+         * panfrost_job, there is no need to do any scoreboarding / management;
+         * it's easy enough to open-code it and it's not like we can get any
+         * better anyway. */
+        struct util_dynarray headers;
+
+        /* (And the GPU versions; TODO maybe combine) */
+        struct util_dynarray gpu_headers;
+
+        /* The last job in the primary batch */
+        struct panfrost_transfer last_job;
+
+        /* The first/last tiler job */
+        struct panfrost_transfer first_tiler;
+        struct panfrost_transfer last_tiler;
+
+        /* The first vertex job used as the input to a tiler job */
+        struct panfrost_transfer first_vertex_for_tiler;
+
+        /* The first job. Notice we've created a linked list */
+        struct panfrost_transfer first_job;
+
+        /* The number of jobs in the primary batch, essentially */
+        unsigned job_index;
+
         /* BOs referenced -- will be used for flushing logic */
         struct set *bos;
 };
@@ -113,4 +152,36 @@ panfrost_job_union_scissor(struct panfrost_job *job,
                            unsigned minx, unsigned miny,
                            unsigned maxx, unsigned maxy);
 
+/* Scoreboarding */
+
+void
+panfrost_scoreboard_queue_compute_job(
+        struct panfrost_job *batch,
+        struct panfrost_transfer job);
+
+void
+panfrost_scoreboard_queue_vertex_job(
+        struct panfrost_job *batch,
+        struct panfrost_transfer vertex,
+        bool requires_tiling);
+
+void
+panfrost_scoreboard_queue_tiler_job(
+        struct panfrost_job *batch,
+        struct panfrost_transfer tiler);
+
+void
+panfrost_scoreboard_queue_fused_job(
+        struct panfrost_job *batch,
+        struct panfrost_transfer vertex,
+        struct panfrost_transfer tiler);
+
+void
+panfrost_scoreboard_queue_fused_job_prepend(
+        struct panfrost_job *batch,
+        struct panfrost_transfer vertex,
+        struct panfrost_transfer tiler);
+
+void
+panfrost_scoreboard_link_batch(struct panfrost_job *batch);
+
 #endif
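The headers/gpu_headers pair added above is used as two parallel growable arrays kept in lockstep, so that element i of each array describes the hardware job with index i + 1. Below is a minimal sketch of that access pattern with Mesa's util_dynarray API; the uint64_t element type stands in for mali_ptr, and the sketch_* names are illustrative, not part of the patch:

    #include <stdint.h>
    #include "util/u_dynarray.h"

    /* Append the CPU/GPU addresses of one job header in lockstep */
    static void
    sketch_track_header(struct util_dynarray *headers,
                        struct util_dynarray *gpu_headers,
                        void *cpu, uint64_t gpu)
    {
            util_dynarray_append(headers, void *, cpu);
            util_dynarray_append(gpu_headers, uint64_t, gpu);
    }

    /* Fetch a header back by zero-based node number, as the linking pass does */
    static void *
    sketch_header_for_node(struct util_dynarray *headers, unsigned node)
    {
            return *util_dynarray_element(headers, void *, node);
    }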
diff --git a/src/gallium/drivers/panfrost/pan_scoreboard.c b/src/gallium/drivers/panfrost/pan_scoreboard.c
new file mode 100644
index 00000000000..2e9c7b5074c
--- /dev/null
+++ b/src/gallium/drivers/panfrost/pan_scoreboard.c
@@ -0,0 +1,453 @@
+/*
+ * Copyright (C) 2019 Collabora
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "pan_context.h"
+#include "pan_job.h"
+#include "pan_allocate.h"
+#include "util/bitset.h"
+
+/*
+ * Within a batch (panfrost_job), there are various types of Mali jobs:
+ *
+ *  - SET_VALUE: initializes tiler
+ *  - VERTEX: runs a vertex shader
+ *  - TILER: runs tiling and sets up a fragment shader
+ *  - FRAGMENT: runs fragment shaders and writes out
+ *  - COMPUTE: runs a compute shader
+ *  - FUSED: vertex+tiler fused together, implicit intradependency (Bifrost)
+ *  - GEOMETRY: runs a geometry shader (unimplemented)
+ *  - CACHE_FLUSH: unseen in the wild, theoretically cache flush
+ *
+ * In between a full batch and a single Mali job is the "job chain", a series
+ * of Mali jobs together forming a linked list. Within the job chain, each Mali
+ * job can set (up to) two dependencies on other earlier jobs in the chain.
+ * This dependency graph forms a scoreboard. The general idea of a scoreboard
+ * applies: when there is a data dependency of job B on job A, job B sets one
+ * of its dependency indices to job A, ensuring that job B won't start until
+ * job A finishes.
+ *
+ * More specifically, here is a set of rules:
+ *
+ * - A set value job must appear if and only if there is at least one tiler
+ *   job.
+ *
+ * - Vertex jobs and tiler jobs are independent.
+ *
+ * - A tiler job must have a dependency on its data source. If it's getting
+ *   data from a vertex job, it depends on the vertex job. If it's getting
+ *   data from software, this is null.
+ *
+ * - The first vertex job used as the input to tiling must depend on the set
+ *   value job, if it is present.
+ *
+ * - Tiler jobs must be strictly ordered, so each tiler job must depend on
+ *   the previous tiler job in the chain.
+ *
+ * - Linking jobs via next_job has no bearing on the order of execution; it
+ *   just establishes the linked list of jobs, EXCEPT:
+ *
+ * - A job's dependencies must appear earlier in the linked list (job chain).
+ *
+ * Justification for each rule:
+ *
+ * - Set value jobs set up tiling, essentially. If tiling occurs, they are
+ *   needed; if it does not, we cannot emit them since then tiling partially
+ *   occurs and it's bad.
+ *
+ * - The hardware has no notion of a "vertex/tiler job" (at least not our
+ *   hardware -- other revs have fused jobs, but --- crap, this just got even
+ *   more complicated). They are independent units that take in data, process
+ *   it, and spit out data.
+ *
+ * - Any job must depend on its data source, in fact, or risk a
+ *   read-before-write hazard. Tiler jobs get their data from vertex jobs,
+ *   ergo tiler jobs depend on the corresponding vertex job (if it's there).
+ *
+ * - In fact, tiling depends on the set value job, but tiler jobs depend on
+ *   the corresponding vertex jobs and each other, so this rule ensures each
+ *   tiler job automatically depends on the set value job.
+ *
+ * - The tiler is not thread-safe; this dependency prevents race conditions
+ *   between two different jobs trying to write to the tiler outputs at the
+ *   same time.
+ *
+ * - Internally, jobs are scoreboarded; the next job fields just form a linked
+ *   list to allow the jobs to be read in; the execution order is from
+ *   resolving the dependency fields instead.
+ *
+ * - The hardware cannot set a dependency on a job it doesn't know about yet,
+ *   and dependencies are processed in order of the next job fields.
+ *
+ */
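+
+/* For illustration (a sketch of what the queueing below produces, with the
+ * set value job appended at link time and dependency slots filled in
+ * first-free order by panfrost_add_dependency): two back-to-back draws
+ * yield the scoreboard
+ *
+ *   index 1  VERTEX (draw 1)    deps: -, -
+ *   index 2  TILER  (draw 1)    deps: 1, 5   (own vertex, set value)
+ *   index 3  VERTEX (draw 2)    deps: -, -
+ *   index 4  TILER  (draw 2)    deps: 2, 3   (previous tiler, own vertex)
+ *   index 5  SET_VALUE          deps: -, -
+ */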
+
+/* Accessor to set the next job field */
+
+static void
+panfrost_set_job_next(struct mali_job_descriptor_header *first, mali_ptr next)
+{
+        if (first->job_descriptor_size)
+                first->next_job_64 = (u64) (uintptr_t) next;
+        else
+                first->next_job_32 = (u32) (uintptr_t) next;
+}
+
+/* Coerce a panfrost_transfer to a header */
+
+static inline struct mali_job_descriptor_header *
+job_descriptor_header(struct panfrost_transfer t)
+{
+        return (struct mali_job_descriptor_header *) t.cpu;
+}
+
+static void
+panfrost_assign_index(
+        struct panfrost_job *job,
+        struct panfrost_transfer transfer)
+{
+        /* Assign the index */
+        unsigned index = ++job->job_index;
+        job_descriptor_header(transfer)->job_index = index;
+}
+
+/* Helper to add a dependency to a job */
+
+static void
+panfrost_add_dependency(
+        struct panfrost_transfer depender,
+        struct panfrost_transfer dependent)
+{
+        struct mali_job_descriptor_header *first =
+                job_descriptor_header(dependent);
+
+        struct mali_job_descriptor_header *second =
+                job_descriptor_header(depender);
+
+        /* Ensure we're ready for dependencies */
+        assert(second->job_index);
+        assert(first->job_index);
+
+        /* Look for an open slot */
+
+        if (!second->job_dependency_index_1)
+                second->job_dependency_index_1 = first->job_index;
+        else if (!second->job_dependency_index_2)
+                second->job_dependency_index_2 = first->job_index;
+        else
+                unreachable("No available slot for new dependency");
+}
+
+/* Queues a job WITHOUT updating pointers. Be careful. */
+
+static void
+panfrost_scoreboard_queue_job_internal(
+        struct panfrost_job *batch,
+        struct panfrost_transfer job)
+{
+        panfrost_assign_index(batch, job);
+
+        /* Queue a pointer to the job */
+        util_dynarray_append(&batch->headers, void*, job.cpu);
+        util_dynarray_append(&batch->gpu_headers, mali_ptr, job.gpu);
+}
+
+/* Queues a compute job, with no special dependencies. This is a bit of a
+ * misnomer -- internally, all job types are queued with this function, but
+ * outside of this file, it's for pure compute jobs */
+
+void
+panfrost_scoreboard_queue_compute_job(
+        struct panfrost_job *batch,
+        struct panfrost_transfer job)
+{
+        panfrost_scoreboard_queue_job_internal(batch, job);
+
+        /* Update the linked list metadata as appropriate */
+        batch->last_job = job;
+
+        if (!batch->first_job.gpu)
+                batch->first_job = job;
+}
+
+/* Queues a vertex job. There are no special dependencies yet, but if
+ * tiling is required (anytime 'rasterize discard' is disabled), we have
+ * some extra bookkeeping for later */
+
+void
+panfrost_scoreboard_queue_vertex_job(
+        struct panfrost_job *batch,
+        struct panfrost_transfer vertex,
+        bool requires_tiling)
+{
+        panfrost_scoreboard_queue_compute_job(batch, vertex);
+
+        if (requires_tiling && !batch->first_vertex_for_tiler.gpu)
+                batch->first_vertex_for_tiler = vertex;
+}
+
+/* Queues a tiler job, respecting the dependency of each tiler job on the
+ * previous */
+
+void
+panfrost_scoreboard_queue_tiler_job(
+        struct panfrost_job *batch,
+        struct panfrost_transfer tiler)
+{
+        panfrost_scoreboard_queue_compute_job(batch, tiler);
+
+        if (!batch->first_tiler.gpu)
+                batch->first_tiler = tiler;
+
+        if (batch->last_tiler.gpu)
+                panfrost_add_dependency(tiler, batch->last_tiler);
+
+        batch->last_tiler = tiler;
+}
+
+/* Queues a fused (vertex/tiler) job, or a pair of vertex/tiler jobs if
+ * fused jobs are not supported (the default until Bifrost rolls out) */
+
+void
+panfrost_scoreboard_queue_fused_job(
+        struct panfrost_job *batch,
+        struct panfrost_transfer vertex,
+        struct panfrost_transfer tiler)
+{
+        panfrost_scoreboard_queue_vertex_job(batch, vertex, true);
+        panfrost_scoreboard_queue_tiler_job(batch, tiler);
+        panfrost_add_dependency(tiler, vertex);
+}
+
+/* Queues a fused (vertex/tiler) job prepended *before* the usual set, used
+ * for wallpaper blits */
+
+void
+panfrost_scoreboard_queue_fused_job_prepend(
+        struct panfrost_job *batch,
+        struct panfrost_transfer vertex,
+        struct panfrost_transfer tiler)
+{
+        /* Sanity check */
+        assert(batch->last_tiler.gpu);
+        assert(batch->first_tiler.gpu);
+
+        /* First, we add the vertex job directly to the queue, forcing it to
+         * the front */
+
+        panfrost_scoreboard_queue_job_internal(batch, vertex);
+        batch->first_job = vertex;
+        batch->first_vertex_for_tiler = vertex;
+
+        /* Similarly, we add the tiler job directly to the queue, forcing it
+         * to the front (second place), manually setting the tiler-on-vertex
+         * dependency (since this is pseudofused) and forcing a dependency of
+         * the now-second tiler on us (since all tiler jobs are linked in
+         * order and we're injecting ourselves at the front) */
+
+        panfrost_scoreboard_queue_job_internal(batch, tiler);
+        panfrost_add_dependency(tiler, vertex);
+        panfrost_add_dependency(batch->first_tiler, tiler);
+        batch->first_tiler = tiler;
+}
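+
+/* For illustration, a sketch of the prepend path: if the batch already holds
+ * V1=1, T1=2 (deps 1, -), V2=3, T2=4 (deps 2, 3), prepending a wallpaper
+ * pair assigns Wv=5 and Wt=6 (deps 5, -), and T1, the old first tiler,
+ * picks up a dependency on Wt, becoming (1, 6). Every existing tiler job
+ * thus transitively waits on the wallpaper, even though the wallpaper's
+ * indices are the highest in the batch. (At link time, the set value job
+ * then attaches to Wt, the new first tiler.) */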
+
+/* Generates a set value job, used below as part of TILER job scheduling. */
+
+static struct panfrost_transfer
+panfrost_set_value_job(struct panfrost_context *ctx, mali_ptr polygon_list)
+{
+        struct mali_job_descriptor_header job = {
+                .job_type = JOB_TYPE_SET_VALUE,
+                .job_descriptor_size = 1,
+        };
+
+        struct mali_payload_set_value payload = {
+                .out = polygon_list,
+                .unknown = 0x3,
+        };
+
+        struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sizeof(job) + sizeof(payload));
+        memcpy(transfer.cpu, &job, sizeof(job));
+        memcpy(transfer.cpu + sizeof(job), &payload, sizeof(payload));
+
+        return transfer;
+}
+
+/* If there are any tiler jobs, there needs to be a corresponding set value job
+ * linked to the first vertex job feeding into tiling. */
+
+static void
+panfrost_scoreboard_set_value(struct panfrost_job *batch)
+{
+        /* Check if we even need tiling */
+        if (!batch->last_tiler.gpu)
+                return;
+
+        /* Okay, we do. Let's generate it */
+
+        struct panfrost_context *ctx = batch->ctx;
+        mali_ptr polygon_list = ctx->tiler_polygon_list.gpu;
+
+        struct panfrost_transfer job =
+                panfrost_set_value_job(ctx, polygon_list);
+
+        /* Queue it */
+        panfrost_scoreboard_queue_compute_job(batch, job);
+
+        /* Tiler jobs need us */
+        panfrost_add_dependency(batch->first_tiler, job);
+}
+
+/* Once all jobs have been added to a batch and we're ready to submit, we
+ * need to order them to set each of the next_job fields, obeying the golden
+ * rule: "A job's dependencies must appear earlier in the job chain than
+ * itself". Fortunately, computing this job chain is a well-studied graph
+ * theory problem known as "topological sorting", which has linear time
+ * algorithms. We let each job represent a node, each dependency a directed
+ * edge, and the entire set of jobs form a dependency graph. This graph is
+ * inherently acyclic, as otherwise there are unresolvable dependencies.
+ *
+ * We implement Kahn's algorithm here to compute the next_job chain:
+ * https://en.wikipedia.org/wiki/Topological_sorting#Kahn's_algorithm
+ *
+ * A few implementation notes: we represent S explicitly with a bitset, L
+ * implicitly in the next_job fields. The indices of the bitset are
+ * off-by-one: nodes are numbered [0, node_count - 1], whereas in reality
+ * job_index in the hardware and dependencies are [1, node_count].
+ *
+ * We represent edge removal implicitly with another pair of bitsets, rather
+ * than explicitly removing the edges, since we need to keep the dependencies
+ * there for the hardware.
+ */
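+
+/* Continuing the two-draw sketch above: initially S = {V1, V2, SV}, the
+ * jobs with no dependencies. Popping them in bitset order emits V1, V2 and
+ * SV; removing SV's edge leaves T1 (deps 1, 5) with nothing outstanding, so
+ * T1 is emitted next, and T2 (deps 2, 3) last. The resulting next_job chain
+ * V1 -> V2 -> SV -> T1 -> T2 satisfies the golden rule. */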
+
+#define DESCRIPTOR_FOR_NODE(count) \
+        *(util_dynarray_element(&batch->headers, \
+                struct mali_job_descriptor_header*, count))
+
+#define GPU_ADDRESS_FOR_NODE(count) \
+        *(util_dynarray_element(&batch->gpu_headers, \
+                mali_ptr, count))
+
+void
+panfrost_scoreboard_link_batch(struct panfrost_job *batch)
+{
+        /* Finalize the batch */
+        panfrost_scoreboard_set_value(batch);
+
+        /* Let no_incoming represent the set S described. */
+
+        unsigned node_count = batch->job_index;
+
+        size_t sz = BITSET_WORDS(node_count) * sizeof(BITSET_WORD);
+        BITSET_WORD *no_incoming = calloc(sz, 1);
+
+        /* Sets for edges being removed in dep 1 or 2 respectively */
+
+        BITSET_WORD *edge_removal_1 = calloc(sz, 1);
+        BITSET_WORD *edge_removal_2 = calloc(sz, 1);
+
+        /* We compute no_incoming by traversing the batch. */
+
+        for (unsigned i = 0; i < node_count; ++i) {
+                struct mali_job_descriptor_header *node = DESCRIPTOR_FOR_NODE(i);
+
+                unsigned dep_1 = node->job_dependency_index_1;
+                unsigned dep_2 = node->job_dependency_index_2;
+
+                if (!(dep_1 || dep_2))
+                        BITSET_SET(no_incoming, i);
+        }
+
+        /* No next_job fields are set at the beginning, so L is implicitly the
+         * empty set. As next_job fields are filled, L is implicitly built up.
+         * Tail is the tail of L, however. */
+
+        struct mali_job_descriptor_header *tail = NULL;
+
+        /* We iterate, popping off elements of S. A simple foreach won't do,
+         * since we mutate S as we go (even adding elements) */
+
+        unsigned arr_size = BITSET_WORDS(node_count);
+
+        for (unsigned node_n_1 = __bitset_ffs(no_incoming, arr_size);
+             (node_n_1 != 0);
+             node_n_1 = __bitset_ffs(no_incoming, arr_size)) {
+
+                unsigned node_n = node_n_1 - 1;
+
+                /* We've got a node n, pop it off */
+                BITSET_CLEAR(no_incoming, node_n);
+
+                /* Add it to the list */
+                struct mali_job_descriptor_header *n =
+                        DESCRIPTOR_FOR_NODE(node_n);
+
+                mali_ptr addr = GPU_ADDRESS_FOR_NODE(node_n);
+
+                if (tail) {
+                        /* Link us to the last node */
+                        panfrost_set_job_next(tail, addr);
+                } else {
+                        /* We are the first/last node */
+                        batch->first_job.cpu = (uint8_t *) n;
+                        batch->first_job.gpu = addr;
+                }
+
+                tail = n;
+
+                /* Scan dependencies */
+                for (unsigned node_m = 0; node_m < node_count; ++node_m) {
+                        struct mali_job_descriptor_header *m =
+                                DESCRIPTOR_FOR_NODE(node_m);
+
+                        /* Get the deps, accounting for removal */
+                        unsigned dep_1 = m->job_dependency_index_1;
+                        unsigned dep_2 = m->job_dependency_index_2;
+
+                        if (BITSET_TEST(edge_removal_1, node_m))
+                                dep_1 = 0;
+
+                        if (BITSET_TEST(edge_removal_2, node_m))
+                                dep_2 = 0;
+
+                        /* Pretend to remove edges */
+                        if (dep_1 == node_n_1) {
+                                BITSET_SET(edge_removal_1, node_m);
+                                dep_1 = 0;
+                        } else if (dep_2 == node_n_1) {
+                                BITSET_SET(edge_removal_2, node_m);
+                                dep_2 = 0;
+                        } else {
+                                /* This node has no relevant dependencies */
+                                continue;
+                        }
+
+                        /* Are there edges left? If not, add us to S */
+                        bool has_edges = dep_1 || dep_2;
+
+                        if (!has_edges)
+                                BITSET_SET(no_incoming, node_m);
+                }
+        }
+
+        /* Don't leak the scratch bitsets */
+        free(no_incoming);
+        free(edge_removal_1);
+        free(edge_removal_2);
+}
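
For reference, the linking pass can be exercised in isolation. Below is a minimal standalone sketch -- not part of the patch -- that substitutes plain arrays and a simplified quadratic ready-scan for util_dynarray, the bitset helpers and the frontier set, and reproduces the chain from the two-draw example above (V1, V2, SV, T1, T2):

#include <stdbool.h>
#include <stdio.h>

#define MAX_NODES 8

struct toy_job {
        const char *name;   /* for printing */
        unsigned index;     /* 1-indexed, as the hardware expects */
        unsigned dep[2];    /* 0 means "no dependency in this slot" */
        int next;           /* -1 terminates the chain */
};

static void
toy_link_chain(struct toy_job *jobs, unsigned count)
{
        bool emitted[MAX_NODES] = { false };
        bool removed[MAX_NODES][2] = { { false } };
        int head = -1, tail = -1;

        for (unsigned done = 0; done < count; ++done) {
                for (unsigned n = 0; n < count; ++n) {
                        if (emitted[n])
                                continue;

                        /* Ready iff every dependency slot is empty or removed */
                        bool ready = true;
                        for (unsigned s = 0; s < 2; ++s)
                                if (jobs[n].dep[s] && !removed[n][s])
                                        ready = false;

                        if (!ready)
                                continue;

                        /* Append to the chain (the next_job analogue) */
                        if (tail >= 0)
                                jobs[tail].next = (int) n;
                        else
                                head = (int) n;

                        tail = (int) n;
                        emitted[n] = true;

                        /* "Remove" the edges that pointed at us */
                        for (unsigned m = 0; m < count; ++m)
                                for (unsigned s = 0; s < 2; ++s)
                                        if (jobs[m].dep[s] == jobs[n].index)
                                                removed[m][s] = true;
                        break;
                }
        }

        for (int n = head; n >= 0; n = jobs[n].next)
                printf("%s (index %u)\n", jobs[n].name, jobs[n].index);
}

int
main(void)
{
        /* The two-draw example from the comments above */
        struct toy_job jobs[] = {
                { "V1", 1, { 0, 0 }, -1 },
                { "T1", 2, { 1, 5 }, -1 },
                { "V2", 3, { 0, 0 }, -1 },
                { "T2", 4, { 2, 3 }, -1 },
                { "SV", 5, { 0, 0 }, -1 },
        };

        toy_link_chain(jobs, 5);        /* prints V1, V2, SV, T1, T2 */
        return 0;
}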