summaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers')
-rw-r--r--src/gallium/drivers/panfrost/meson.build1
-rw-r--r--src/gallium/drivers/panfrost/pan_context.c132
-rw-r--r--src/gallium/drivers/panfrost/pan_context.h20
-rw-r--r--src/gallium/drivers/panfrost/pan_drm.c6
-rw-r--r--src/gallium/drivers/panfrost/pan_job.c18
-rw-r--r--src/gallium/drivers/panfrost/pan_job.h71
-rw-r--r--src/gallium/drivers/panfrost/pan_scoreboard.c453
7 files changed, 558 insertions, 143 deletions
diff --git a/src/gallium/drivers/panfrost/meson.build b/src/gallium/drivers/panfrost/meson.build
index 43d73ce2086..4298242f6b9 100644
--- a/src/gallium/drivers/panfrost/meson.build
+++ b/src/gallium/drivers/panfrost/meson.build
@@ -57,6 +57,7 @@ files_panfrost = files(
'pan_blend_shaders.c',
'pan_pretty_print.c',
'pan_fragment.c',
+ 'pan_scoreboard.c',
'pan_sfbd.c',
'pan_mfbd.c',
'pan_tiler.c',
diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c
index 41656236b5b..d8c5510a31e 100644
--- a/src/gallium/drivers/panfrost/pan_context.c
+++ b/src/gallium/drivers/panfrost/pan_context.c
@@ -517,15 +517,6 @@ panfrost_default_shader_backend(struct panfrost_context *ctx)
memcpy(&ctx->fragment_shader_core, &shader, sizeof(shader));
}
-static void
-panfrost_link_job_pair(struct mali_job_descriptor_header *first, mali_ptr next)
-{
- if (first->job_descriptor_size)
- first->next_job_64 = (u64) (uintptr_t) next;
- else
- first->next_job_32 = (u32) (uintptr_t) next;
-}
-
/* Generates a vertex/tiler job. This is, in some sense, the heart of the
* graphics command stream. It should be called once per draw, accordding to
* presentations. Set is_tiler for "tiler" jobs (fragment shader jobs, but in
@@ -535,12 +526,8 @@ panfrost_link_job_pair(struct mali_job_descriptor_header *first, mali_ptr next)
struct panfrost_transfer
panfrost_vertex_tiler_job(struct panfrost_context *ctx, bool is_tiler)
{
- /* Each draw call corresponds to two jobs, and the set-value job is first */
- int draw_job_index = 1 + (2 * ctx->draw_count) + 1;
-
struct mali_job_descriptor_header job = {
.job_type = is_tiler ? JOB_TYPE_TILER : JOB_TYPE_VERTEX,
- .job_index = draw_job_index + (is_tiler ? 1 : 0),
#ifdef __LP64__
.job_descriptor_size = 1,
#endif
@@ -557,65 +544,11 @@ panfrost_vertex_tiler_job(struct panfrost_context *ctx, bool is_tiler)
#endif
struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sizeof(job) + sizeof(*payload));
- if (is_tiler) {
- /* Tiler jobs depend on vertex jobs */
-
- job.job_dependency_index_1 = draw_job_index;
-
- /* Tiler jobs also depend on the previous tiler job */
-
- if (ctx->draw_count) {
- job.job_dependency_index_2 = draw_job_index - 1;
- /* Previous tiler job points to this tiler job */
- panfrost_link_job_pair(ctx->u_tiler_jobs[ctx->draw_count - 1], transfer.gpu);
- } else {
- /* The only vertex job so far points to first tiler job */
- panfrost_link_job_pair(ctx->u_vertex_jobs[0], transfer.gpu);
- }
- } else {
- if (ctx->draw_count) {
- /* Previous vertex job points to this vertex job */
- panfrost_link_job_pair(ctx->u_vertex_jobs[ctx->draw_count - 1], transfer.gpu);
-
- /* Last vertex job points to first tiler job */
- panfrost_link_job_pair(&job, ctx->tiler_jobs[0]);
- } else {
- /* Have the first vertex job depend on the set value job */
- job.job_dependency_index_1 = ctx->u_set_value_job->job_index;
- panfrost_link_job_pair(ctx->u_set_value_job, transfer.gpu);
- }
- }
-
memcpy(transfer.cpu, &job, sizeof(job));
memcpy(transfer.cpu + sizeof(job) - offset, payload, sizeof(*payload));
return transfer;
}
-/* Generates a set value job. It's unclear what exactly this does, why it's
- * necessary, and when to call it. */
-
-static void
-panfrost_set_value_job(struct panfrost_context *ctx)
-{
- struct mali_job_descriptor_header job = {
- .job_type = JOB_TYPE_SET_VALUE,
- .job_descriptor_size = 1,
- .job_index = 1,
- };
-
- struct mali_payload_set_value payload = {
- .out = ctx->tiler_polygon_list.gpu,
- .unknown = 0x3,
- };
-
- struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sizeof(job) + sizeof(payload));
- memcpy(transfer.cpu, &job, sizeof(job));
- memcpy(transfer.cpu + sizeof(job), &payload, sizeof(payload));
-
- ctx->u_set_value_job = (struct mali_job_descriptor_header *) transfer.cpu;
- ctx->set_value_job = transfer.gpu;
-}
-
static mali_ptr
panfrost_emit_varyings(
struct panfrost_context *ctx,
@@ -1214,6 +1147,7 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
for (unsigned i = 0; i < 1; ++i) {
bool is_srgb =
+ (ctx->pipe_framebuffer.nr_cbufs > i) &&
util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
rts[i].flags = blend_count;
@@ -1377,7 +1311,7 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
* scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
* just... be faster :) */
- if (!ctx->in_wallpaper)
+ if (!ctx->wallpaper_batch)
panfrost_job_union_scissor(job, minx, miny, maxx, maxy);
/* Upload */
@@ -1401,28 +1335,18 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
static void
panfrost_queue_draw(struct panfrost_context *ctx)
{
- /* TODO: Expand the array? */
- if (ctx->draw_count >= MAX_DRAW_CALLS) {
- DBG("Job buffer overflow, ignoring draw\n");
- assert(0);
- }
-
/* Handle dirty flags now */
panfrost_emit_for_draw(ctx, true);
- /* We need a set_value job before any other draw jobs */
- if (ctx->draw_count == 0)
- panfrost_set_value_job(ctx);
-
struct panfrost_transfer vertex = panfrost_vertex_tiler_job(ctx, false);
- ctx->u_vertex_jobs[ctx->vertex_job_count] = (struct mali_job_descriptor_header *) vertex.cpu;
- ctx->vertex_jobs[ctx->vertex_job_count++] = vertex.gpu;
-
struct panfrost_transfer tiler = panfrost_vertex_tiler_job(ctx, true);
- ctx->u_tiler_jobs[ctx->tiler_job_count] = (struct mali_job_descriptor_header *) tiler.cpu;
- ctx->tiler_jobs[ctx->tiler_job_count++] = tiler.gpu;
- ctx->draw_count++;
+ struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx);
+
+ if (ctx->wallpaper_batch)
+ panfrost_scoreboard_queue_fused_job_prepend(batch, vertex, tiler);
+ else
+ panfrost_scoreboard_queue_fused_job(batch, vertex, tiler);
}
/* The entire frame is in memory -- send it off to the kernel! */
@@ -1473,37 +1397,12 @@ panfrost_draw_wallpaper(struct pipe_context *pipe)
if (ctx->pipe_framebuffer.cbufs[0] == NULL)
return;
- /* Blit the wallpaper in */
- ctx->in_wallpaper = true;
- panfrost_blit_wallpaper(ctx);
- ctx->in_wallpaper = false;
-
- /* We are flushing all queued draws and we know that no more jobs will
- * be added until the next frame.
- * We also know that the last jobs are the wallpaper jobs, and they
- * need to be linked so they execute right after the set_value job.
- */
-
- /* set_value job to wallpaper vertex job */
- panfrost_link_job_pair(ctx->u_set_value_job, ctx->vertex_jobs[ctx->vertex_job_count - 1]);
- ctx->u_vertex_jobs[ctx->vertex_job_count - 1]->job_dependency_index_1 = ctx->u_set_value_job->job_index;
+ /* Save the batch */
+ struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx);
- /* wallpaper vertex job to first vertex job */
- panfrost_link_job_pair(ctx->u_vertex_jobs[ctx->vertex_job_count - 1], ctx->vertex_jobs[0]);
- ctx->u_vertex_jobs[0]->job_dependency_index_1 = ctx->u_set_value_job->job_index;
-
- /* last vertex job to wallpaper tiler job */
- panfrost_link_job_pair(ctx->u_vertex_jobs[ctx->vertex_job_count - 2], ctx->tiler_jobs[ctx->tiler_job_count - 1]);
- ctx->u_tiler_jobs[ctx->tiler_job_count - 1]->job_dependency_index_1 = ctx->u_vertex_jobs[ctx->vertex_job_count - 1]->job_index;
- ctx->u_tiler_jobs[ctx->tiler_job_count - 1]->job_dependency_index_2 = 0;
-
- /* wallpaper tiler job to first tiler job */
- panfrost_link_job_pair(ctx->u_tiler_jobs[ctx->tiler_job_count - 1], ctx->tiler_jobs[0]);
- ctx->u_tiler_jobs[0]->job_dependency_index_1 = ctx->u_vertex_jobs[0]->job_index;
- ctx->u_tiler_jobs[0]->job_dependency_index_2 = ctx->u_tiler_jobs[ctx->tiler_job_count - 1]->job_index;
-
- /* last tiler job to NULL */
- panfrost_link_job_pair(ctx->u_tiler_jobs[ctx->tiler_job_count - 2], 0);
+ ctx->wallpaper_batch = batch;
+ panfrost_blit_wallpaper(ctx);
+ ctx->wallpaper_batch = NULL;
}
void
@@ -1516,7 +1415,7 @@ panfrost_flush(
struct panfrost_job *job = panfrost_get_job_for_fbo(ctx);
/* Nothing to do! */
- if (!ctx->draw_count && !job->clear) return;
+ if (!job->last_job.gpu && !job->clear) return;
if (!job->clear)
panfrost_draw_wallpaper(&ctx->base);
@@ -2256,8 +2155,9 @@ panfrost_set_framebuffer_state(struct pipe_context *pctx,
* state is being restored by u_blitter
*/
+ struct panfrost_job *job = panfrost_get_job_for_fbo(ctx);
bool is_scanout = panfrost_is_scanout(ctx);
- bool has_draws = ctx->draw_count > 0;
+ bool has_draws = job->last_job.gpu;
if (!ctx->blitter->running && (!is_scanout || has_draws)) {
panfrost_flush(pctx, NULL, PIPE_FLUSH_END_OF_FRAME);
diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h
index deba4668767..1f718bcd9c4 100644
--- a/src/gallium/drivers/panfrost/pan_context.h
+++ b/src/gallium/drivers/panfrost/pan_context.h
@@ -47,7 +47,6 @@
/* Forward declare to avoid extra header dep */
struct prim_convert_context;
-#define MAX_DRAW_CALLS 4096
#define MAX_VARYINGS 4096
//#define PAN_DIRTY_CLEAR (1 << 0)
@@ -149,23 +148,6 @@ struct panfrost_context {
struct mali_shader_meta fragment_shader_core;
- /* A frame is composed of a starting set value job, a number of vertex
- * and tiler jobs, linked to the fragment job at the end. See the
- * presentations for more information how this works */
-
- unsigned draw_count;
-
- mali_ptr set_value_job;
- mali_ptr vertex_jobs[MAX_DRAW_CALLS];
- mali_ptr tiler_jobs[MAX_DRAW_CALLS];
-
- struct mali_job_descriptor_header *u_set_value_job;
- struct mali_job_descriptor_header *u_vertex_jobs[MAX_DRAW_CALLS];
- struct mali_job_descriptor_header *u_tiler_jobs[MAX_DRAW_CALLS];
-
- unsigned vertex_job_count;
- unsigned tiler_job_count;
-
/* Per-draw Dirty flags are setup like any other driver */
int dirty;
@@ -201,7 +183,7 @@ struct panfrost_context {
struct primconvert_context *primconvert;
struct blitter_context *blitter;
- bool in_wallpaper;
+ struct panfrost_job *wallpaper_batch;
struct panfrost_blend_state *blend;
diff --git a/src/gallium/drivers/panfrost/pan_drm.c b/src/gallium/drivers/panfrost/pan_drm.c
index aed50477ff7..77ec419398e 100644
--- a/src/gallium/drivers/panfrost/pan_drm.c
+++ b/src/gallium/drivers/panfrost/pan_drm.c
@@ -258,8 +258,10 @@ panfrost_drm_submit_vs_fs_job(struct panfrost_context *ctx, bool has_draws, bool
struct pipe_surface *surf = ctx->pipe_framebuffer.cbufs[0];
int ret;
- if (has_draws) {
- ret = panfrost_drm_submit_job(ctx, ctx->set_value_job, 0, NULL);
+ struct panfrost_job *job = panfrost_get_job_for_fbo(ctx);
+
+ if (job->first_job.gpu) {
+ ret = panfrost_drm_submit_job(ctx, job->first_job.gpu, 0, NULL);
assert(!ret);
}
diff --git a/src/gallium/drivers/panfrost/pan_job.c b/src/gallium/drivers/panfrost/pan_job.c
index 37bd70cb9a9..1878d6855c7 100644
--- a/src/gallium/drivers/panfrost/pan_job.c
+++ b/src/gallium/drivers/panfrost/pan_job.c
@@ -41,6 +41,9 @@ panfrost_create_job(struct panfrost_context *ctx)
job->minx = job->miny = ~0;
job->maxx = job->maxy = 0;
+
+ util_dynarray_init(&job->headers, job);
+ util_dynarray_init(&job->gpu_headers, job);
return job;
}
@@ -102,6 +105,12 @@ panfrost_get_job(struct panfrost_context *ctx,
struct panfrost_job *
panfrost_get_job_for_fbo(struct panfrost_context *ctx)
{
+ /* If we're wallpapering, we special case to workaround
+ * u_blitter abuse */
+
+ if (ctx->wallpaper_batch)
+ return ctx->wallpaper_batch;
+
/* If we already began rendering, use that */
if (ctx->job)
@@ -151,7 +160,9 @@ panfrost_job_submit(struct panfrost_context *ctx, struct panfrost_job *job)
struct panfrost_screen *screen = pan_screen(gallium->screen);
int ret;
- bool has_draws = ctx->draw_count > 0;
+ panfrost_scoreboard_link_batch(job);
+
+ bool has_draws = job->last_job.gpu;
bool is_scanout = panfrost_is_scanout(ctx);
if (!job)
@@ -161,11 +172,6 @@ panfrost_job_submit(struct panfrost_context *ctx, struct panfrost_job *job)
if (ret)
fprintf(stderr, "panfrost_job_submit failed: %d\n", ret);
-
- /* Reset job counters */
- ctx->draw_count = 0;
- ctx->vertex_job_count = 0;
- ctx->tiler_job_count = 0;
}
void
diff --git a/src/gallium/drivers/panfrost/pan_job.h b/src/gallium/drivers/panfrost/pan_job.h
index ed8b084246a..b4c9db9828e 100644
--- a/src/gallium/drivers/panfrost/pan_job.h
+++ b/src/gallium/drivers/panfrost/pan_job.h
@@ -26,6 +26,11 @@
#ifndef __PAN_JOB_H__
#define __PAN_JOB_H__
+#include "util/u_dynarray.h"
+#include "pipe/p_state.h"
+#include "pan_allocate.h"
+#include "pan_resource.h"
+
/* Used as a hash table key */
struct panfrost_job_key {
@@ -61,6 +66,40 @@ struct panfrost_job {
unsigned minx, miny;
unsigned maxx, maxy;
+ /* CPU pointers to the job descriptor headers. next_job is only
+ * set at submit time (since only then are all the dependencies
+ * known). The upshot is that this is append-only.
+ *
+ * These arrays contain the headers for the "primary batch", our jargon
+ * referring to the part of the panfrost_job that actually contains
+ * meaningful work. In an OpenGL ES setting, that means the
+ * SET_VALUE/VERTEX/TILER jobs. Excluded is specifically the FRAGMENT
+ * job, which is sent on as a secondary batch containing only a single
+ * hardware job. Since there's one and only one FRAGMENT job issued per
+ * panfrost_job, there is no need to do any scoreboarding / management;
+ * it's easy enough to open-code it and it's not like we can get any
+ * better anyway. */
+ struct util_dynarray headers;
+
+ /* (And the GPU versions; TODO maybe combine) */
+ struct util_dynarray gpu_headers;
+
+ /* The last job in the primary batch */
+ struct panfrost_transfer last_job;
+
+ /* The first/last tiler job */
+ struct panfrost_transfer first_tiler;
+ struct panfrost_transfer last_tiler;
+
+ /* The first vertex job used as the input to a tiler job */
+ struct panfrost_transfer first_vertex_for_tiler;
+
+ /* The first job. Notice we've created a linked list */
+ struct panfrost_transfer first_job;
+
+ /* The number of jobs in the primary batch, essentially */
+ unsigned job_index;
+
/* BOs referenced -- will be used for flushing logic */
struct set *bos;
};
@@ -113,4 +152,36 @@ panfrost_job_union_scissor(struct panfrost_job *job,
unsigned minx, unsigned miny,
unsigned maxx, unsigned maxy);
+/* Scoreboarding */
+
+void
+panfrost_scoreboard_queue_compute_job(
+ struct panfrost_job *batch,
+ struct panfrost_transfer job);
+
+void
+panfrost_scoreboard_queue_vertex_job(
+ struct panfrost_job *batch,
+ struct panfrost_transfer vertex,
+ bool requires_tiling);
+
+void
+panfrost_scoreboard_queue_tiler_job(
+ struct panfrost_job *batch,
+ struct panfrost_transfer tiler);
+
+void
+panfrost_scoreboard_queue_fused_job(
+ struct panfrost_job *batch,
+ struct panfrost_transfer vertex,
+ struct panfrost_transfer tiler);
+void
+panfrost_scoreboard_queue_fused_job_prepend(
+ struct panfrost_job *batch,
+ struct panfrost_transfer vertex,
+ struct panfrost_transfer tiler);
+
+void
+panfrost_scoreboard_link_batch(struct panfrost_job *batch);
+
#endif
diff --git a/src/gallium/drivers/panfrost/pan_scoreboard.c b/src/gallium/drivers/panfrost/pan_scoreboard.c
new file mode 100644
index 00000000000..2e9c7b5074c
--- /dev/null
+++ b/src/gallium/drivers/panfrost/pan_scoreboard.c
@@ -0,0 +1,453 @@
+/*
+ * Copyright (C) 2019 Collabora
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "pan_context.h"
+#include "pan_job.h"
+#include "pan_allocate.h"
+#include "util/bitset.h"
+
+/*
+ * Within a batch (panfrost_job), there are various types of Mali jobs:
+ *
+ * - SET_VALUE: initializes tiler
+ * - VERTEX: runs a vertex shader
+ * - TILER: runs tiling and sets up a fragment shader
+ * - FRAGMENT: runs fragment shaders and writes out
+ * - COMPUTE: runs a compute shader
+ * - FUSED: vertex+tiler fused together, implicit intradependency (Bifrost)
+ * - GEOMETRY: runs a geometry shader (unimplemented)
+ * - CACHE_FLUSH: unseen in the wild, theoretically cache flush
+ *
+ * In between a full batch and a single Mali job is the "job chain", a series
+ * of Mali jobs together forming a linked list. Within the job chain, each Mali
+ * job can set (up to) two dependencies on other earlier jobs in the chain.
+ * This dependency graph forms a scoreboard. The general idea of a scoreboard
+ * applies: when there is a data dependency of job B on job A, job B sets one
+ * of its dependency indices to job A, ensuring that job B won't start until
+ * job A finishes.
+ *
+ * More specifically, here are a set of rules:
+ *
+ * - A set value job must appear if and only if there is at least one tiler job.
+ *
+ * - Vertex jobs and tiler jobs are independent.
+ *
+ * - A tiler job must have a dependency on its data source. If it's getting
+ * data from a vertex job, it depends on the vertex job. If it's getting data
+ * from software, this is null.
+ *
+ * - The first vertex job used as the input to tiling must depend on the set
+ * value job, if it is present.
+ *
+ * - Tiler jobs must be strictly ordered. So each tiler job must depend on the
+ * previous job in the chain.
+ *
+ * - Jobs linking via next_job has no bearing on order of execution, rather it
+ * just establishes the linked list of jobs, EXCEPT:
+ *
+ * - A job's dependencies must appear earlier in the linked list (job chain).
+ *
+ * Justification for each rule:
+ *
+ * - Set value jobs set up tiling, essentially. If tiling occurs, they are
+ * needed; if it does not, we cannot emit them since then tiling partially
+ * occurs and it's bad.
+ *
+ * - The hardware has no notion of a "vertex/tiler job" (at least not our
+ * hardware -- other revs have fused jobs, but --- crap, this just got even
+ * more complicated). They are independent units that take in data, process
+ * it, and spit out data.
+ *
+ * - Any job must depend on its data source, in fact, or risk a
+ * read-before-write hazard. Tiler jobs get their data from vertex jobs, ergo
+ * tiler jobs depend on the corresponding vertex job (if it's there).
+ *
+ * - In fact, tiling depends on the set value job, but tiler jobs depend on the
+ * corresponding vertex jobs and each other, so this rule ensures each tiler
+ * job automatically depends on the set value job.
+ *
+ * - The tiler is not thread-safe; this dependency prevents race conditions
+ * between two different jobs trying to write to the tiler outputs at the
+ * same time.
+ *
+ * - Internally, jobs are scoreboarded; the next job fields just form a linked
+ * list to allow the jobs to be read in; the execution order is from
+ * resolving the dependency fields instead.
+ *
+ * - The hardware cannot set a dependency on a job it doesn't know about yet,
+ * and dependencies are processed in-order of the next job fields.
+ *
+ */
+
+/* Accessor to set the next job field */
+
+static void
+panfrost_set_job_next(struct mali_job_descriptor_header *first, mali_ptr next)
+{
+ if (first->job_descriptor_size)
+ first->next_job_64 = (u64) (uintptr_t) next;
+ else
+ first->next_job_32 = (u32) (uintptr_t) next;
+}
+
+/* Coerce a panfrost_transfer to a header */
+
+static inline struct mali_job_descriptor_header *
+job_descriptor_header(struct panfrost_transfer t)
+{
+ return (struct mali_job_descriptor_header *) t.cpu;
+}
+
+static void
+panfrost_assign_index(
+ struct panfrost_job *job,
+ struct panfrost_transfer transfer)
+{
+ /* Assign the index */
+ unsigned index = ++job->job_index;
+ job_descriptor_header(transfer)->job_index = index;
+}
+
+/* Helper to add a dependency to a job */
+
+static void
+panfrost_add_dependency(
+ struct panfrost_transfer depender,
+ struct panfrost_transfer dependent)
+{
+
+ struct mali_job_descriptor_header *first =
+ job_descriptor_header(dependent);
+
+ struct mali_job_descriptor_header *second =
+ job_descriptor_header(depender);
+
+ /* Ensure we're ready for dependencies */
+ assert(second->job_index);
+ assert(first->job_index);
+
+ /* Look for an open slot */
+
+ if (!second->job_dependency_index_1)
+ second->job_dependency_index_1 = first->job_index;
+ else if (!second->job_dependency_index_2)
+ second->job_dependency_index_2 = first->job_index;
+ else
+ unreachable("No available slot for new dependency");
+}
+
+/* Queues a job WITHOUT updating pointers. Be careful. */
+
+static void
+panfrost_scoreboard_queue_job_internal(
+ struct panfrost_job *batch,
+ struct panfrost_transfer job)
+{
+ panfrost_assign_index(batch, job);
+
+ /* Queue a pointer to the job */
+ util_dynarray_append(&batch->headers, void*, job.cpu);
+ util_dynarray_append(&batch->gpu_headers, mali_ptr, job.gpu);
+}
+
+
+/* Queues a compute job, with no special dependencies. This is a bit of a
+ * misnomer -- internally, all job types are queued with this function, but
+ * outside of this file, it's for pure compute jobs */
+
+void
+panfrost_scoreboard_queue_compute_job(
+ struct panfrost_job *batch,
+ struct panfrost_transfer job)
+{
+ panfrost_scoreboard_queue_job_internal(batch, job);
+
+ /* Update the linked list metadata as appropriate */
+ batch->last_job = job;
+
+ if (!batch->first_job.gpu)
+ batch->first_job = job;
+}
+
+/* Queues a vertex job. There are no special dependencies yet, but if
+ * tiling is required (anytime 'rasterize discard' is disabled), we have
+ * some extra bookkeeping for later */
+
+void
+panfrost_scoreboard_queue_vertex_job(
+ struct panfrost_job *batch,
+ struct panfrost_transfer vertex,
+ bool requires_tiling)
+{
+ panfrost_scoreboard_queue_compute_job(batch, vertex);
+
+ if (requires_tiling && !batch->first_vertex_for_tiler.gpu)
+ batch->first_vertex_for_tiler = vertex;
+}
+
+/* Queues a tiler job, respecting the dependency of each tiler job on the
+ * previous */
+
+void
+panfrost_scoreboard_queue_tiler_job(
+ struct panfrost_job *batch,
+ struct panfrost_transfer tiler)
+{
+ panfrost_scoreboard_queue_compute_job(batch, tiler);
+
+ if (!batch->first_tiler.gpu)
+ batch->first_tiler = tiler;
+
+ if (batch->last_tiler.gpu)
+ panfrost_add_dependency(tiler, batch->last_tiler);
+
+ batch->last_tiler = tiler;
+}
+
+/* Queues a fused (vertex/tiler) job, or a pair of vertex/tiler jobs if
+ * fused jobs are not supported (the default until Bifrost rolls out) */
+
+void
+panfrost_scoreboard_queue_fused_job(
+ struct panfrost_job *batch,
+ struct panfrost_transfer vertex,
+ struct panfrost_transfer tiler)
+{
+ panfrost_scoreboard_queue_vertex_job(batch, vertex, true);
+ panfrost_scoreboard_queue_tiler_job(batch, tiler);
+ panfrost_add_dependency(tiler, vertex);
+}
+
+/* Queues a fused (vertex/tiler) job prepended *before* the usual set, used for
+ * wallpaper blits */
+
+void
+panfrost_scoreboard_queue_fused_job_prepend(
+ struct panfrost_job *batch,
+ struct panfrost_transfer vertex,
+ struct panfrost_transfer tiler)
+{
+ /* Sanity check */
+ assert(batch->last_tiler.gpu);
+ assert(batch->first_tiler.gpu);
+
+ /* First, we add the vertex job directly to the queue, forcing it to
+ * the front */
+
+ panfrost_scoreboard_queue_job_internal(batch, vertex);
+ batch->first_job = vertex;
+ batch->first_vertex_for_tiler = vertex;
+
+ /* Similarly, we add the tiler job directly to the queue, forcing it to
+ * the front (second place), manually setting the tiler on vertex
+ * dependency (since this is pseudofused) and forcing a dependency of
+ * the now-second tiler on us (since all tiler jobs are linked in order
+ * and we're injecting ourselves at the front) */
+
+ panfrost_scoreboard_queue_job_internal(batch, tiler);
+ panfrost_add_dependency(tiler, vertex);
+ panfrost_add_dependency(batch->first_tiler, tiler);
+ batch->first_tiler = tiler;
+}
+
+/* Generates a set value job, used below as part of TILER job scheduling. */
+
+static struct panfrost_transfer
+panfrost_set_value_job(struct panfrost_context *ctx, mali_ptr polygon_list)
+{
+ struct mali_job_descriptor_header job = {
+ .job_type = JOB_TYPE_SET_VALUE,
+ .job_descriptor_size = 1,
+ };
+
+ struct mali_payload_set_value payload = {
+ .out = polygon_list,
+ .unknown = 0x3,
+ };
+
+ struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sizeof(job) + sizeof(payload));
+ memcpy(transfer.cpu, &job, sizeof(job));
+ memcpy(transfer.cpu + sizeof(job), &payload, sizeof(payload));
+
+ return transfer;
+}
+
+/* If there are any tiler jobs, there needs to be a corresponding set value job
+ * linked to the first vertex job feeding into tiling. */
+
+static void
+panfrost_scoreboard_set_value(struct panfrost_job *batch)
+{
+ /* Check if we even need tiling */
+ if (!batch->last_tiler.gpu)
+ return;
+
+ /* Okay, we do. Let's generate it */
+
+ struct panfrost_context *ctx = batch->ctx;
+ mali_ptr polygon_list = ctx->tiler_polygon_list.gpu;
+
+ struct panfrost_transfer job =
+ panfrost_set_value_job(ctx, polygon_list);
+
+ /* Queue it */
+ panfrost_scoreboard_queue_compute_job(batch, job);
+
+ /* Tiler jobs need us */
+ panfrost_add_dependency(batch->first_tiler, job);
+}
+
+/* Once all jobs have been added to a batch and we're ready to submit, we need
+ * to order them to set each of the next_job fields, obeying the golden rule:
+ * "A job's dependencies must appear earlier in the job chain than itself".
+ * Fortunately, computing this job chain is a well-studied graph theory problem
+ * known as "topological sorting", which has linear time algorithms. We let
+ * each job represent a node, each dependency a directed edge, and the entire
+ * set of jobs to be a dependency graph. This graph is inherently acyclic, as
+ * otherwise there are unresolveable dependencies.
+ *
+ * We implement Kahn's algorithm here to compute the next_job chain:
+ * https://en.wikipedia.org/wiki/Topological_sorting#Kahn's_algorithm
+ *
+ * A few implementation notes: we represent S explicitly with a bitset, L
+ * implicitly in the next_job fields. The indices of the bitset are off-by-one:
+ * nodes are numbered [0, node_count - 1], whereas in reality job_index in the
+ * hardware and dependencies are [1, node_count].
+ *
+ * We represent edge removal implicitly with another pair of bitsets, rather
+ * than explicitly removing the edges, since we need to keep the dependencies
+ * there for the hardware.
+ */
+
+#define DESCRIPTOR_FOR_NODE(count) \
+ *(util_dynarray_element(&batch->headers, \
+ struct mali_job_descriptor_header*, count))
+
+#define GPU_ADDRESS_FOR_NODE(count) \
+ *(util_dynarray_element(&batch->gpu_headers, \
+ mali_ptr, count))
+
+void
+panfrost_scoreboard_link_batch(struct panfrost_job *batch)
+{
+ /* Finalize the batch */
+ panfrost_scoreboard_set_value(batch);
+
+ /* Let no_incoming represent the set S described. */
+
+ unsigned node_count = batch->job_index;
+
+ size_t sz = BITSET_WORDS(node_count) * sizeof(BITSET_WORD);
+ BITSET_WORD *no_incoming = calloc(sz, 1);
+
+ /* Sets for edges being removed in dep 1 or 2 respectively */
+
+ BITSET_WORD *edge_removal_1 = calloc(sz, 1);
+ BITSET_WORD *edge_removal_2 = calloc(sz, 1);
+
+ /* We compute no_incoming by traversing the batch. */
+
+ for (unsigned i = 0; i < node_count; ++i) {
+ struct mali_job_descriptor_header *node = DESCRIPTOR_FOR_NODE(i);
+
+ unsigned dep_1 = node->job_dependency_index_1;
+ unsigned dep_2 = node->job_dependency_index_2;
+
+ if (!(dep_1 || dep_2))
+ BITSET_SET(no_incoming, i);
+ }
+
+ /* No next_job fields are set at the beginning, so L is implciitly the
+ * empty set. As next_job fields are filled, L is implicitly set. Tail
+ * is the tail of L, however. */
+
+ struct mali_job_descriptor_header *tail = NULL;
+
+ /* We iterate, popping off elements of S. A simple foreach won't do,
+ * since we mutate S as we go (even adding elements) */
+
+ unsigned arr_size = BITSET_WORDS(node_count);
+
+ for (unsigned node_n_1 = __bitset_ffs(no_incoming, arr_size);
+ (node_n_1 != 0);
+ node_n_1 = __bitset_ffs(no_incoming, arr_size)) {
+
+ unsigned node_n = node_n_1 - 1;
+
+ /* We've got a node n, pop it off */
+ BITSET_CLEAR(no_incoming, node_n);
+
+ /* Add it to the list */
+ struct mali_job_descriptor_header *n =
+ DESCRIPTOR_FOR_NODE(node_n);
+
+ mali_ptr addr = GPU_ADDRESS_FOR_NODE(node_n);
+
+ if (tail) {
+ /* Link us to the last node */
+ panfrost_set_job_next(tail, addr);
+ } else {
+ /* We are the first/last node */
+ batch->first_job.cpu = (uint8_t *) n;
+ batch->first_job.gpu = addr;
+ }
+
+ tail = n;
+
+ /* Scan dependencies */
+ for (unsigned node_m = 0; node_m < node_count; ++node_m) {
+ struct mali_job_descriptor_header *m =
+ DESCRIPTOR_FOR_NODE(node_m);
+
+ /* Get the deps, accounting for removal */
+ unsigned dep_1 = m->job_dependency_index_1;
+ unsigned dep_2 = m->job_dependency_index_2;
+
+ if (BITSET_TEST(edge_removal_1, node_m))
+ dep_1 = 0;
+
+ if (BITSET_TEST(edge_removal_2, node_m))
+ dep_2 = 0;
+
+ /* Pretend to remove edges */
+ if (dep_1 == node_n_1) {
+ BITSET_SET(edge_removal_1, node_m);
+ dep_1 = 0;
+ } else if (dep_2 == node_n_1) {
+ BITSET_SET(edge_removal_2, node_m);
+ dep_2 = 0;
+ } else {
+ /* This node has no relevant dependencies */
+ continue;
+ }
+
+ /* Are there edges left? If not, add us to S */
+ bool has_edges = dep_1 || dep_2;
+
+ if (!has_edges)
+ BITSET_SET(no_incoming, node_m);
+ }
+ }
+
+}