/*
 * Copyright (C) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include "pan_context.h"
#include "pan_job.h"
#include "pan_allocate.h"
#include "util/bitset.h"

/*
 * Within a batch (panfrost_job), there are various types of Mali jobs:
 *
 *  - SET_VALUE: initializes tiler
 *  - VERTEX: runs a vertex shader
 *  - TILER: runs tiling and sets up a fragment shader
 *  - FRAGMENT: runs fragment shaders and writes out
 *  - COMPUTE: runs a compute shader
 *  - FUSED: vertex+tiler fused together, implicit intradependency (Bifrost)
 *  - GEOMETRY: runs a geometry shader (unimplemented)
 *  - CACHE_FLUSH: unseen in the wild, theoretically cache flush
 *
 * In between a full batch and a single Mali job is the "job chain", a series
 * of Mali jobs together forming a linked list. Within the job chain, each Mali
 * job can set (up to) two dependencies on other earlier jobs in the chain.
 * This dependency graph forms a scoreboard. The general idea of a scoreboard
 * applies: when there is a data dependency of job B on job A, job B sets one
 * of its dependency indices to job A, ensuring that job B won't start until
 * job A finishes.
 *
 * More specifically, here are a set of rules:
 *
 * - A set value job must appear if and only if there is at least one tiler job.
 *
 * - Vertex jobs and tiler jobs are independent.
 *
 * - A tiler job must have a dependency on its data source. If it's getting
 *   data from a vertex job, it depends on the vertex job. If it's getting data
 *   from software, this is null.
 *
 * - The first tiler job must depend on the set value job, if it is present.
 *
 * - Tiler jobs must be strictly ordered. So each tiler job must depend on the
 *   previous job in the chain.
 *
 * - Linking jobs via next_job has no bearing on the order of execution;
 *   it merely establishes the linked list of jobs, EXCEPT:
 *
 * - A job's dependencies must appear earlier in the linked list (job chain).
 *
 * Justification for each rule:
 *
 * - Set value jobs set up tiling, essentially. If tiling occurs, they are
 *   needed; if it does not, we cannot emit them since then tiling partially
 *   occurs and it's bad.
 *
 * - The hardware has no notion of a combined "vertex/tiler job" (at least
 *   not on our hardware; Bifrost has fused jobs, which complicates matters
 *   further). Vertex and tiler jobs are independent units that take in data,
 *   process it, and spit out data.
 *
 * - Any job must depend on its data source, in fact, or risk a
 *   read-before-write hazard. Tiler jobs get their data from vertex jobs, ergo
 *   tiler jobs depend on the corresponding vertex job (if it's there).
 *
 * - Tiling as a whole depends on the set value job. Since tiler jobs depend
 *   on their vertex jobs and on each other, making the first tiler job
 *   depend on the set value job ensures every tiler job transitively
 *   depends on it.
 *
 * - The tiler is not thread-safe; this dependency prevents race conditions
 *   between two different jobs trying to write to the tiler outputs at the
 *   same time.
 *
 * - Internally, jobs are scoreboarded; the next_job fields merely form a
 *   linked list so the jobs can be read in, while the execution order comes
 *   from resolving the dependency fields.
 *
 * - The hardware cannot set a dependency on a job it doesn't know about yet,
 *   and dependencies are processed in-order of the next job fields.
 *
 */
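
/* As a concrete (illustrative) example, consider a single fused draw. It
 * queues a vertex job V (job_index 1) and a tiler job T (job_index 2); at
 * link time, a set value job S (job_index 3) is generated since tiling
 * occurs. Per the rules above, T depends on V (its data source, dependency
 * slot 1) and on S (dependency slot 2, added in
 * panfrost_scoreboard_set_value), while V and S depend on nothing. The
 * topological sort below then emits the job chain
 *
 *    V -> S -> T
 *
 * via the next_job fields, and the hardware holds T back until both V and S
 * have finished. */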

/* Accessor to set the next job field, handling 32-bit vs. 64-bit descriptors */

static void
panfrost_set_job_next(struct mali_job_descriptor_header *first, mali_ptr next)
{
        if (first->job_descriptor_size)
                first->next_job_64 = (u64) (uintptr_t) next;
        else
                first->next_job_32 = (u32) (uintptr_t) next;
}

/* Coerce a panfrost_transfer to a header */

static inline struct mali_job_descriptor_header *
job_descriptor_header(struct panfrost_transfer t)
{
        return (struct mali_job_descriptor_header *) t.cpu;
}

static void
panfrost_assign_index(
        struct panfrost_job *job,
        struct panfrost_transfer transfer)
{
        /* Assign the index */
        unsigned index = ++job->job_index;
        job_descriptor_header(transfer)->job_index = index;
}

/* Helper to add a dependency to a job: depender is made to wait on dependent */

static void
panfrost_add_dependency(
        struct panfrost_transfer depender,
        struct panfrost_transfer dependent)
{

        struct mali_job_descriptor_header *first =
                job_descriptor_header(dependent);

        struct mali_job_descriptor_header *second =
                job_descriptor_header(depender);

        /* Look for an open slot */

        if (!second->job_dependency_index_1)
                second->job_dependency_index_1 = first->job_index;
        else if (!second->job_dependency_index_2)
                second->job_dependency_index_2 = first->job_index;
        else
                unreachable("No available slot for new dependency");
}

/* Queues a job WITHOUT updating pointers. Be careful. */

static void
panfrost_scoreboard_queue_job_internal(
        struct panfrost_job *batch,
        struct panfrost_transfer job)
{
        panfrost_assign_index(batch, job);

        /* Queue a pointer to the job */
        util_dynarray_append(&batch->headers, void*, job.cpu);
        util_dynarray_append(&batch->gpu_headers, mali_ptr, job.gpu);
}


/* Queues a compute job, with no special dependencies. This is a bit of a
 * misnomer -- internally, all job types are queued with this function, but
 * outside of this file, it's for pure compute jobs */

void
panfrost_scoreboard_queue_compute_job(
        struct panfrost_job *batch,
        struct panfrost_transfer job)
{
        panfrost_scoreboard_queue_job_internal(batch, job);

        /* Update the linked list metadata as appropriate */
        batch->last_job = job;

        if (!batch->first_job.gpu)
                batch->first_job = job;
}

/* Queues a vertex job. There are no special dependencies yet, but if
 * tiling is required (anytime 'rasterize discard' is disabled), we have
 * some extra bookkeeping for later */

void
panfrost_scoreboard_queue_vertex_job(
        struct panfrost_job *batch,
        struct panfrost_transfer vertex,
        bool requires_tiling)
{
        panfrost_scoreboard_queue_compute_job(batch, vertex);

        if (requires_tiling && !batch->first_vertex_for_tiler.gpu)
                batch->first_vertex_for_tiler = vertex;
}

/* Queues a tiler job, respecting the dependency of each tiler job on the
 * previous */

void
panfrost_scoreboard_queue_tiler_job(
        struct panfrost_job *batch,
        struct panfrost_transfer tiler)
{
        panfrost_scoreboard_queue_compute_job(batch, tiler);

        if (!batch->first_tiler.gpu)
                batch->first_tiler = tiler;

        if (batch->last_tiler.gpu)
                panfrost_add_dependency(tiler, batch->last_tiler);

        batch->last_tiler = tiler;
}

/* Queues a fused (vertex/tiler) job, or a pair of vertex/tiler jobs if
 * fused jobs are not supported (the default until Bifrost rolls out) */

void
panfrost_scoreboard_queue_fused_job(
        struct panfrost_job *batch,
        struct panfrost_transfer vertex,
        struct panfrost_transfer tiler)
{
        panfrost_scoreboard_queue_vertex_job(batch, vertex, true);
        panfrost_scoreboard_queue_tiler_job(batch, tiler);
        panfrost_add_dependency(tiler, vertex);
}
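
/* An illustrative sketch of how a draw path might drive the queueing
 * functions above; the real call sites live elsewhere in the driver, so the
 * names here are for exposition only:
 *
 *    struct panfrost_transfer vertex = ...; // filled-in vertex job
 *    struct panfrost_transfer tiler = ...;  // filled-in tiler job
 *
 *    if (rasterizer_discard)
 *            panfrost_scoreboard_queue_vertex_job(batch, vertex, false);
 *    else
 *            panfrost_scoreboard_queue_fused_job(batch, vertex, tiler);
 *
 * With rasterizer discard enabled there is no tiling, so no tiler job is
 * queued and no set value job will be generated at link time. */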

/* Queues a fused (vertex/tiler) job prepended *before* the usual set, used for
 * wallpaper blits */

void
panfrost_scoreboard_queue_fused_job_prepend(
        struct panfrost_job *batch,
        struct panfrost_transfer vertex,
        struct panfrost_transfer tiler)
{
        /* Sanity check */
        assert(batch->last_tiler.gpu);
        assert(batch->first_tiler.gpu);

        /* First, we add the vertex job directly to the queue, forcing it to
         * the front */

        panfrost_scoreboard_queue_job_internal(batch, vertex);
        batch->first_job = vertex;
        batch->first_vertex_for_tiler = vertex;

        /* Similarly, we add the tiler job directly to the queue, forcing it
         * to the front (second place). We manually set the tiler-on-vertex
         * dependency (since this is pseudofused), and force the now-second
         * tiler to depend on us, since all tiler jobs are linked in order
         * and we're injecting ourselves at the front */

        panfrost_scoreboard_queue_job_internal(batch, tiler);
        panfrost_add_dependency(tiler, vertex);
        panfrost_add_dependency(batch->first_tiler, tiler);
        batch->first_tiler = tiler;
}

/* Generates a set value job, used below as part of TILER job scheduling. */

static struct panfrost_transfer
panfrost_set_value_job(struct panfrost_context *ctx, mali_ptr polygon_list)
{
        struct mali_job_descriptor_header job = {
                .job_type = JOB_TYPE_SET_VALUE,
                .job_descriptor_size = 1,
        };

        struct mali_payload_set_value payload = {
                .out = polygon_list,
                .unknown = 0x3,
        };

        struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sizeof(job) + sizeof(payload));
        memcpy(transfer.cpu, &job, sizeof(job));
        memcpy(transfer.cpu + sizeof(job), &payload, sizeof(payload));

        return transfer;
}

/* If there are any tiler jobs, there needs to be a corresponding set value
 * job, which the first tiler job is made to depend on. */

static void
panfrost_scoreboard_set_value(struct panfrost_job *batch)
{
        /* Check if we even need tiling */
        if (!batch->last_tiler.gpu)
                return;

        /* Okay, we do. Let's generate it. We'll need the job's polygon list
         * regardless of size. */

        struct panfrost_context *ctx = batch->ctx;
        mali_ptr polygon_list = panfrost_job_get_polygon_list(batch, 0);

        struct panfrost_transfer job =
                panfrost_set_value_job(ctx, polygon_list);

        /* Queue it */
        panfrost_scoreboard_queue_compute_job(batch, job);

        /* Tiler jobs need us */
        panfrost_add_dependency(batch->first_tiler, job);
}

/* Once all jobs have been added to a batch and we're ready to submit, we need
 * to order them to set each of the next_job fields, obeying the golden rule:
 * "A job's dependencies must appear earlier in the job chain than itself".
 * Fortunately, computing this job chain is a well-studied graph theory problem
 * known as "topological sorting", which admits linear-time algorithms. We
 * let each job be a node and each dependency a directed edge, so the set of
 * jobs forms a dependency graph. This graph must be acyclic, as a cycle
 * would mean unresolvable dependencies.
 *
 * We implement Kahn's algorithm here to compute the next_job chain:
 * https://en.wikipedia.org/wiki/Topological_sorting#Kahn's_algorithm
 *
 * A few implementation notes: we represent S explicitly with a bitset, L
 * implicitly in the next_job fields. The bitset indices are off-by-one:
 * nodes are numbered [0, node_count - 1], whereas the hardware's job_index
 * values (and hence the dependency fields) run over [1, node_count].
 *
 * We represent edge removal implicitly with another pair of bitsets, rather
 * than explicitly removing the edges, since we need to keep the dependencies
 * there for the hardware.
 */
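
/* A worked example (illustrative): take nodes 0, 1, 2 carrying job indices
 * 1, 2, 3 respectively, where node 1 depends on nodes 0 and 2 (dep_1 = 1,
 * dep_2 = 3) and nodes 0 and 2 have no dependencies. The loop below runs:
 *
 *    S = {0, 2}, L = []
 *    pop 0: L = [0]; node 1 sheds its dep_1 edge but keeps dep_2
 *    pop 2: L = [0, 2]; node 1 sheds its dep_2 edge, so S = {1}
 *    pop 1: L = [0, 2, 1]; S is empty, done
 *
 * yielding the chain 0 -> 2 -> 1, wired up through the next_job fields. */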

#define DESCRIPTOR_FOR_NODE(count) \
        *(util_dynarray_element(&batch->headers, \
                struct mali_job_descriptor_header*, count))

#define GPU_ADDRESS_FOR_NODE(count) \
        *(util_dynarray_element(&batch->gpu_headers, \
                mali_ptr, count))

void
panfrost_scoreboard_link_batch(struct panfrost_job *batch)
{
        /* Finalize the batch */
        panfrost_scoreboard_set_value(batch);

        /* Let no_incoming represent the set S described. */

        unsigned node_count = batch->job_index;

        size_t sz = BITSET_WORDS(node_count) * sizeof(BITSET_WORD);
        BITSET_WORD *no_incoming = calloc(sz, 1);

        /* Sets for edges being removed in dep 1 or 2 respectively */

        BITSET_WORD *edge_removal_1 = calloc(sz, 1);
        BITSET_WORD *edge_removal_2 = calloc(sz, 1);

        /* We compute no_incoming by traversing the batch. Simultaneously, we
         * would like to keep track of a parity-reversed version of the
         * dependency graph. Dependency indices are 16-bit and in practice (for
         * ES3.0, at least), we can guarantee a given node will be depended on
         * by no more than one other node. Proof:
         *
         * Proposition: Given a node N of type T, no more than one other node
         * depends on N.
         *
         * If type is SET_VALUE: The only dependency added against us is from
         * the first tiler job, so there is 1 dependent.
         *
         * If type is VERTEX: If there is a tiler node, that tiler node depends
         * on us; if there is not (transform feedback), nothing depends on us.
         * Therefore there is at most 1 dependent.
         *
         * If type is TILER: If there is another TILER job in succession, that
         * node depends on us. No other job type depends on us. Therefore there
         * is at most 1 dependent.
         *
         * If type is FRAGMENT: This type cannot be in a primary chain, so it
         * is irrelevant here. In any case, nothing would depend on us, so
         * there are zero dependents and the claim holds vacuously.
         *
         * TODO: Revise this logic for ES3.1 and above. This result may not
         * hold for COMPUTE/FUSED/GEOMETRY jobs; we might need to special case
         * those. Can FBO dependencies be expressed within a chain?
         * ---
         *
         * Point is, we only need to hold a single dependent, which is a pretty
         * helpful result.
         */

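        /* dependents[i] holds the node number of the single node that
         * depends on job_index (i + 1), with 0 meaning "no dependent". The
         * sentinel is unambiguous in practice: only tiler jobs ever carry
         * dependency indices here, and a tiler job is never node 0, since
         * its vertex job is always queued ahead of it. */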
        unsigned *dependents = calloc(node_count, sizeof(unsigned));

        for (unsigned i = 0; i < node_count; ++i) {
                struct mali_job_descriptor_header *node = DESCRIPTOR_FOR_NODE(i);

                unsigned dep_1 = node->job_dependency_index_1;
                unsigned dep_2 = node->job_dependency_index_2;

                /* Record no_incoming info for this node */

                if (!(dep_1 || dep_2))
                        BITSET_SET(no_incoming, i);

                /* Record this node as the dependent of each of its
                 * dependencies */

                if (dep_1) {
                        assert(!dependents[dep_1 - 1]);
                        dependents[dep_1 - 1] = i;
                }

                if (dep_2) {
                        assert(!dependents[dep_2 - 1]);
                        dependents[dep_2 - 1] = i;
                }
        }

        /* No next_job fields are set at the beginning, so L is implicitly
         * the empty set. As next_job fields are filled in, jobs are
         * implicitly appended to L; tail tracks the last element of L. */

        struct mali_job_descriptor_header *tail = NULL;

        /* We iterate, popping off elements of S. A simple foreach won't do,
         * since we mutate S as we go (even adding elements) */

        unsigned arr_size = BITSET_WORDS(node_count);

        for (unsigned node_n_1 = __bitset_ffs(no_incoming, arr_size);
             (node_n_1 != 0);
             node_n_1 = __bitset_ffs(no_incoming, arr_size)) {

                unsigned node_n = node_n_1 - 1;

                /* We've got a node n, pop it off */
                BITSET_CLEAR(no_incoming, node_n);

                /* Add it to the list */
                struct mali_job_descriptor_header *n =
                        DESCRIPTOR_FOR_NODE(node_n);

                mali_ptr addr = GPU_ADDRESS_FOR_NODE(node_n);

                if (tail) {
                        /* Link us to the last node */
                        panfrost_set_job_next(tail, addr);
                } else {
                        /* We are the first/last node */
                        batch->first_job.cpu = (uint8_t *) n;
                        batch->first_job.gpu = addr;
                }

                tail = n;

                /* Grab the dependent, if there is one */
                unsigned node_m = dependents[node_n];

                if (node_m) {
                        struct mali_job_descriptor_header *m =
                                DESCRIPTOR_FOR_NODE(node_m);

                        /* Get the deps, accounting for removal */
                        unsigned dep_1 = m->job_dependency_index_1;
                        unsigned dep_2 = m->job_dependency_index_2;

                        if (BITSET_TEST(edge_removal_1, node_m))
                                dep_1 = 0;

                        if (BITSET_TEST(edge_removal_2, node_m))
                                dep_2 = 0;

                        /* Pretend to remove edges */
                        if (dep_1 == node_n_1) {
                                BITSET_SET(edge_removal_1, node_m);
                                dep_1 = 0;
                        } else if (dep_2 == node_n_1) {
                                BITSET_SET(edge_removal_2, node_m);
                                dep_2 = 0;
                        } else {
                                /* Inconsistent state: node_m was recorded as
                                 * our dependent, yet lists no edge from us */
                                assert(0);
                        }

                        /* Are there edges left? If not, add us to S */
                        bool has_edges = dep_1 || dep_2;

                        if (!has_edges)
                                BITSET_SET(no_incoming, node_m);
                }
        }

        /* Cleanup */
        free(no_incoming);
        free(dependents);
        free(edge_removal_1);
        free(edge_removal_2);
}