/*
 * Copyright (C) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include "pan_context.h"
#include "pan_job.h"
#include "pan_allocate.h"
#include "util/bitset.h"

/*
 * Within a batch (panfrost_batch), there are various types of Mali jobs:
 *
 * - WRITE_VALUE: generic write primitive, used to zero the tiler structure
 * - VERTEX: runs a vertex shader
 * - TILER: runs tiling and sets up a fragment shader
 * - FRAGMENT: runs fragment shaders and writes out
 * - COMPUTE: runs a compute shader
 * - FUSED: vertex+tiler fused together, implicit intradependency (Bifrost)
 * - GEOMETRY: runs a geometry shader (unimplemented)
 * - CACHE_FLUSH: unseen in the wild, theoretically cache flush
 *
 * In between a full batch and a single Mali job is the "job chain", a series
 * of Mali jobs together forming a linked list. Within the job chain, each Mali
 * job can set (up to) two dependencies on other earlier jobs in the chain.
 * This dependency graph forms a scoreboard. The general idea of a scoreboard
 * applies: when there is a data dependency of job B on job A, job B sets one
 * of its dependency indices to job A, ensuring that job B won't start until
 * job A finishes.
 *
 * More specifically, here are a set of rules:
 *
 * - A write value job must appear if and only if there is at least one tiler
 *   job.
 *
 * - Vertex jobs and tiler jobs are independent.
 *
 * - A tiler job must have a dependency on its data source. If it's getting
 *   data from a vertex job, it depends on the vertex job. If it's getting
 *   data from software, this is null.
 *
 * - The first vertex job used as the input to tiling must depend on the
 *   write value job, if it is present.
 *
 * - Tiler jobs must be strictly ordered. So each tiler job must depend on
 *   the previous tiler job in the chain.
 *
 * - Linking jobs via next_job has no bearing on the order of execution; it
 *   just establishes the linked list of jobs, EXCEPT:
 *
 * - A job's dependencies must appear earlier in the linked list (job chain).
 *
 * Justification for each rule:
 *
 * - The write value job sets up tiling, essentially. If tiling occurs, it is
 *   needed; if it does not, we cannot emit it, since doing so would make
 *   tiling partially occur, which is bad.
 *
 * - The hardware has no notion of a "vertex/tiler job" (at least not our
 *   hardware -- other revisions have fused jobs, which complicates matters).
 *   Vertex and tiler jobs are independent units that take in data, process
 *   it, and spit out data.
 *
 * - Any job must depend on its data source, or risk a read-before-write
 *   hazard. Tiler jobs get their data from vertex jobs, ergo tiler jobs
 *   depend on the corresponding vertex job (if it's there).
 *
 * - Tiling as a whole depends on the write value job, but tiler jobs depend
 *   on the corresponding vertex jobs and on each other, so this rule ensures
 *   each tiler job automatically depends on the write value job.
 *
 * - The tiler is not thread-safe; this dependency prevents race conditions
 *   between two different jobs trying to write to the tiler outputs at the
 *   same time.
 *
 * - Internally, jobs are scoreboarded; the next_job fields just form a
 *   linked list to allow the jobs to be read in. The execution order comes
 *   from resolving the dependency fields instead.
 *
 * - The hardware cannot set a dependency on a job it doesn't know about yet,
 *   and dependencies are processed in order of the next_job fields.
 */
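/* As an illustration (a hypothetical batch, not the verbatim output of any
 * particular code path), two pseudofused draws would be scoreboarded as:
 *
 *    job_index 1: VERTEX   no dependencies
 *    job_index 2: TILER    depends on 1 (its data source)
 *    job_index 3: VERTEX   no dependencies
 *    job_index 4: TILER    depends on 3 (data source) and 2 (previous tiler)
 *
 * Since at least one tiler job is present, a WRITE_VALUE job is also queued
 * at link time (by panfrost_scoreboard_initialize_tiler() below) so tiling
 * starts from zeroed state. Any next_job order in which every job's
 * dependencies come earlier is then legal; computing one such order is the
 * job of panfrost_scoreboard_link_batch() at the end of this file. */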
/* Coerce a panfrost_transfer to a header */

static inline struct mali_job_descriptor_header *
job_descriptor_header(struct panfrost_transfer t)
{
        return (struct mali_job_descriptor_header *) t.cpu;
}

static void
panfrost_assign_index(
        struct panfrost_batch *batch,
        struct panfrost_transfer transfer)
{
        /* Assign the index */
        unsigned index = ++batch->job_index;
        job_descriptor_header(transfer)->job_index = index;
}

/* Helper to add a dependency to a job */

static void
panfrost_add_dependency(
        struct panfrost_transfer depender,
        struct panfrost_transfer dependent)
{
        struct mali_job_descriptor_header *first =
                job_descriptor_header(dependent);

        struct mali_job_descriptor_header *second =
                job_descriptor_header(depender);

        /* Look for an open slot */

        if (!second->job_dependency_index_1)
                second->job_dependency_index_1 = first->job_index;
        else if (!second->job_dependency_index_2)
                second->job_dependency_index_2 = first->job_index;
        else
                unreachable("No available slot for new dependency");
}

/* Queues a job WITHOUT updating pointers. Be careful. */

static void
panfrost_scoreboard_queue_job_internal(
        struct panfrost_batch *batch,
        struct panfrost_transfer job)
{
        panfrost_assign_index(batch, job);

        /* Queue a pointer to the job */
        util_dynarray_append(&batch->headers, void*, job.cpu);
        util_dynarray_append(&batch->gpu_headers, mali_ptr, job.gpu);
}

/* Queues a compute job, with no special dependencies. This is a bit of a
 * misnomer -- internally, all job types are queued with this function, but
 * outside of this file, it's for pure compute jobs */

void
panfrost_scoreboard_queue_compute_job(
        struct panfrost_batch *batch,
        struct panfrost_transfer job)
{
        panfrost_scoreboard_queue_job_internal(batch, job);

        /* Update the linked list metadata as appropriate */
        batch->last_job = job;

        if (!batch->first_job.gpu)
                batch->first_job = job;
}
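/* Bookkeeping note (an observation about the helpers above, not a new
 * invariant): job_index is 1-based -- the first job queued into a fresh
 * batch gets index 1 -- while the headers/gpu_headers dynarrays are 0-based
 * and filled in queue order. The header for the job with index i therefore
 * lives at dynarray element i - 1, which is the off-by-one that
 * DESCRIPTOR_FOR_NODE() and the dependents[] array compensate for in
 * panfrost_scoreboard_link_batch() later in this file. */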
/* Queues a vertex job. There are no special dependencies yet, but if tiling
 * is required (any time rasterizer discard is disabled), we have some extra
 * bookkeeping for later */

void
panfrost_scoreboard_queue_vertex_job(
        struct panfrost_batch *batch,
        struct panfrost_transfer vertex,
        bool requires_tiling)
{
        panfrost_scoreboard_queue_compute_job(batch, vertex);

        if (requires_tiling && !batch->first_vertex_for_tiler.gpu)
                batch->first_vertex_for_tiler = vertex;
}

/* Queues a tiler job, respecting the dependency of each tiler job on the
 * previous one */

void
panfrost_scoreboard_queue_tiler_job(
        struct panfrost_batch *batch,
        struct panfrost_transfer tiler)
{
        panfrost_scoreboard_queue_compute_job(batch, tiler);

        if (!batch->first_tiler.gpu)
                batch->first_tiler = tiler;

        if (batch->last_tiler.gpu)
                panfrost_add_dependency(tiler, batch->last_tiler);

        batch->last_tiler = tiler;
}

/* Queues a fused (vertex/tiler) job, or a pair of vertex/tiler jobs if fused
 * jobs are not supported (the default until Bifrost rolls out) */

void
panfrost_scoreboard_queue_fused_job(
        struct panfrost_batch *batch,
        struct panfrost_transfer vertex,
        struct panfrost_transfer tiler)
{
        panfrost_scoreboard_queue_vertex_job(batch, vertex, true);
        panfrost_scoreboard_queue_tiler_job(batch, tiler);
        panfrost_add_dependency(tiler, vertex);
}

/* Queues a fused (vertex/tiler) job prepended *before* the usual set, used
 * for wallpaper blits */

void
panfrost_scoreboard_queue_fused_job_prepend(
        struct panfrost_batch *batch,
        struct panfrost_transfer vertex,
        struct panfrost_transfer tiler)
{
        /* Sanity check */
        assert(batch->last_tiler.gpu);
        assert(batch->first_tiler.gpu);

        /* First, we add the vertex job directly to the queue, forcing it to
         * the front */

        panfrost_scoreboard_queue_job_internal(batch, vertex);
        batch->first_job = vertex;
        batch->first_vertex_for_tiler = vertex;

        /* Similarly, we add the tiler job directly to the queue, forcing it
         * to the front (second place), manually setting the tiler-on-vertex
         * dependency (since this is pseudofused) and forcing a dependency of
         * the now-second tiler on us (since all tiler jobs are linked in
         * order and we're injecting ourselves at the front) */

        panfrost_scoreboard_queue_job_internal(batch, tiler);
        panfrost_add_dependency(tiler, vertex);
        panfrost_add_dependency(batch->first_tiler, tiler);
        batch->first_tiler = tiler;
}
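/* To illustrate the prepend above (hypothetical indices, not tied to any
 * particular caller): suppose the batch already holds one pseudofused pair,
 * V1 (index 1) and T1 (index 2, depending on 1), with first_tiler ==
 * last_tiler == T1. Prepending a wallpaper pair Vw/Tw assigns them the next
 * indices (3 and 4), makes Tw depend on Vw, and fills T1's free dependency
 * slot with Tw, giving:
 *
 *    Tw depends on Vw;  T1 depends on V1 and Tw
 *
 * first_tiler becomes Tw, so the wallpaper geometry is tiled before the
 * pre-existing tiler work even though its descriptors were queued last.
 * last_tiler is left untouched, so later tiler jobs still chain after the
 * original tail. */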
/* Generates a write value job, used to initialize the tiler structures. */

static struct panfrost_transfer
panfrost_write_value_job(struct panfrost_batch *batch, mali_ptr polygon_list)
{
        struct mali_job_descriptor_header job = {
                .job_type = JOB_TYPE_WRITE_VALUE,
                .job_descriptor_size = 1,
        };

        struct mali_payload_write_value payload = {
                .address = polygon_list,
                .value_descriptor = MALI_WRITE_VALUE_ZERO,
        };

        struct panfrost_transfer transfer =
                panfrost_allocate_transient(batch, sizeof(job) + sizeof(payload));

        memcpy(transfer.cpu, &job, sizeof(job));
        memcpy(transfer.cpu + sizeof(job), &payload, sizeof(payload));

        return transfer;
}

/* If there are any tiler jobs, we need to initialize the tiler by writing
 * zeroes to a magic tiler structure. We do so via a WRITE_VALUE job added as
 * a dependency of the first tiler job. */

static void
panfrost_scoreboard_initialize_tiler(struct panfrost_batch *batch)
{
        /* Check if we even need tiling */
        if (!batch->last_tiler.gpu)
                return;

        /* Okay, we do. Let's generate it. We'll need the job's polygon list
         * regardless of size. */

        mali_ptr polygon_list = panfrost_batch_get_polygon_list(batch,
                MALI_TILER_MINIMUM_HEADER_SIZE);

        struct panfrost_transfer job =
                panfrost_write_value_job(batch, polygon_list);

        /* Queue it */
        panfrost_scoreboard_queue_compute_job(batch, job);

        /* Tiler jobs need us */
        panfrost_add_dependency(batch->first_tiler, job);
}

/* Once all jobs have been added to a batch and we're ready to submit, we need
 * to order them so as to set each of the next_job fields, obeying the golden
 * rule: "A job's dependencies must appear earlier in the job chain than the
 * job itself". Fortunately, computing such a job chain is a well-studied
 * graph theory problem known as "topological sorting", which has linear-time
 * algorithms. We let each job represent a node, each dependency a directed
 * edge, and the entire set of jobs a dependency graph. This graph is
 * inherently acyclic, as a cycle would mean unresolvable dependencies.
 *
 * We implement Kahn's algorithm here to compute the next_job chain:
 * https://en.wikipedia.org/wiki/Topological_sorting#Kahn's_algorithm
 *
 * A few implementation notes: we represent S explicitly with a bitset, and L
 * implicitly in the next_job fields. The indices of the bitset are off-by-one:
 * nodes are numbered [0, node_count - 1], whereas job_index in the hardware
 * and the dependency fields are in [1, node_count].
 *
 * We represent edge removal implicitly with another pair of bitsets, rather
 * than explicitly removing the edges, since we need to keep the dependencies
 * intact for the hardware.
 */
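/* A worked example (hypothetical, continuing the two-draw batch sketched at
 * the top of this file): vertex jobs 1 and 3; tiler job 2 depending on 1 and
 * on write value job 5 (the dependency panfrost_scoreboard_initialize_tiler()
 * adds to the first tiler); tiler job 4 depending on 3 and 2; write value
 * job 5 with no dependencies (it gets the highest index since it is queued
 * last, at link time). __bitset_ffs pops the lowest-numbered ready job first:
 *
 *    S starts as {1, 3, 5}, the jobs with no dependencies.
 *    Pop 1: chain = 1.             Job 2 still waits on 5.
 *    Pop 3: chain = 1, 3.          Job 4 still waits on 2.
 *    Pop 5: chain = 1, 3, 5.       Job 2 has no edges left; add it to S.
 *    Pop 2: chain = 1, 3, 5, 2.    Job 4 has no edges left; add it to S.
 *    Pop 4: chain = 1, 3, 5, 2, 4. S is empty; done.
 *
 * The resulting next_job chain 1 -> 3 -> 5 -> 2 -> 4 lists every job after
 * all of its dependencies, which is all the hardware requires. */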
#define DESCRIPTOR_FOR_NODE(count) \
        *(util_dynarray_element(&batch->headers, \
                struct mali_job_descriptor_header*, count))

#define GPU_ADDRESS_FOR_NODE(count) \
        *(util_dynarray_element(&batch->gpu_headers, \
                mali_ptr, count))

void
panfrost_scoreboard_link_batch(struct panfrost_batch *batch)
{
        /* Finalize the batch */
        panfrost_scoreboard_initialize_tiler(batch);

        /* Let no_incoming represent the set S described above */

        unsigned node_count = batch->job_index;

        size_t sz = BITSET_WORDS(node_count) * sizeof(BITSET_WORD);
        BITSET_WORD *no_incoming = calloc(sz, 1);

        /* Sets for edges being removed in dep 1 or 2 respectively */

        BITSET_WORD *edge_removal_1 = calloc(sz, 1);
        BITSET_WORD *edge_removal_2 = calloc(sz, 1);

        /* We compute no_incoming by traversing the batch. Simultaneously, we
         * would like to keep track of a parity-reversed version of the
         * dependency graph. Dependency indices are 16-bit and in practice
         * (for ES3.0, at least), we can guarantee a given node will be
         * depended on by no more than one other node. Proof:
         *
         * Proposition: Given a node N of type T, no more than one other node
         * depends on N.
         *
         * If type is WRITE_VALUE: The only dependency added against us is
         * from the first tiler job, so there is 1 dependent.
         *
         * If type is VERTEX: If there is a tiler node, that tiler node
         * depends on us; if there is not (transform feedback), nothing
         * depends on us. Therefore there is at most 1 dependent.
         *
         * If type is TILER: If there is another TILER job in succession, that
         * node depends on us. No other job type depends on us. Therefore
         * there is at most 1 dependent.
         *
         * If type is FRAGMENT: This type cannot be in a primary chain, so it
         * is irrelevant. Just for kicks, nobody would depend on us, so there
         * are zero dependents, so it holds anyway.
         *
         * TODO: Revise this logic for ES3.1 and above. This result may not
         * hold for COMPUTE/FUSED/GEOMETRY jobs; we might need to special case
         * those. Can FBO dependencies be expressed within a chain?
         * ---
         *
         * The point is, we only need to record a single dependent per node,
         * which is a pretty helpful result.
         */

        unsigned *dependents = calloc(node_count, sizeof(unsigned));

        for (unsigned i = 0; i < node_count; ++i) {
                struct mali_job_descriptor_header *node = DESCRIPTOR_FOR_NODE(i);

                unsigned dep_1 = node->job_dependency_index_1;
                unsigned dep_2 = node->job_dependency_index_2;

                /* Record no_incoming info for this node */

                if (!(dep_1 || dep_2))
                        BITSET_SET(no_incoming, i);

                /* Record this node as the dependent of each of its
                 * dependencies */

                if (dep_1) {
                        assert(!dependents[dep_1 - 1]);
                        dependents[dep_1 - 1] = i + 1;
                }

                if (dep_2) {
                        assert(!dependents[dep_2 - 1]);
                        dependents[dep_2 - 1] = i + 1;
                }
        }

        /* No next_job fields are set at the beginning, so L is implicitly the
         * empty set. As next_job fields are filled in, L is built implicitly.
         * tail tracks the tail of L. */

        struct mali_job_descriptor_header *tail = NULL;

        /* We iterate, popping off elements of S. A simple foreach won't do,
         * since we mutate S as we go (even adding elements) */

        unsigned arr_size = BITSET_WORDS(node_count);

        for (unsigned node_n_1 = __bitset_ffs(no_incoming, arr_size);
                        (node_n_1 != 0);
                        node_n_1 = __bitset_ffs(no_incoming, arr_size)) {

                unsigned node_n = node_n_1 - 1;

                /* We've got a node n, pop it off */
                BITSET_CLEAR(no_incoming, node_n);

                /* Add it to the list */
                struct mali_job_descriptor_header *n =
                        DESCRIPTOR_FOR_NODE(node_n);

                mali_ptr addr = GPU_ADDRESS_FOR_NODE(node_n);

                if (tail) {
                        /* Link us to the last node */
                        tail->next_job = addr;
                } else {
                        /* We are the first (and so far the last) node */
                        batch->first_job.cpu = (uint8_t *) n;
                        batch->first_job.gpu = addr;
                }

                tail = n;

                /* Grab the dependent, if there is one */
                unsigned node_m_1 = dependents[node_n];

                if (node_m_1) {
                        unsigned node_m = node_m_1 - 1;

                        struct mali_job_descriptor_header *m =
                                DESCRIPTOR_FOR_NODE(node_m);

                        /* Get the deps, accounting for removal */
                        unsigned dep_1 = m->job_dependency_index_1;
                        unsigned dep_2 = m->job_dependency_index_2;

                        if (BITSET_TEST(edge_removal_1, node_m))
                                dep_1 = 0;

                        if (BITSET_TEST(edge_removal_2, node_m))
                                dep_2 = 0;

                        /* Pretend to remove edges */
                        if (dep_1 == node_n_1) {
                                BITSET_SET(edge_removal_1, node_m);
                                dep_1 = 0;
                        } else if (dep_2 == node_n_1) {
                                BITSET_SET(edge_removal_2, node_m);
                                dep_2 = 0;
                        } else {
                                /* The recorded dependent must depend on us in
                                 * one of its slots */
                                assert(0);
                        }

                        /* Are there edges left? If not, add us to S */
                        bool has_edges = dep_1 || dep_2;

                        if (!has_edges)
                                BITSET_SET(no_incoming, node_m);
                }
        }

        /* Cleanup */
        free(no_incoming);
        free(dependents);
        free(edge_removal_1);
        free(edge_removal_2);
}