src/panfrost/bifrost/compiler.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589

/*
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */

#ifndef __BIFROST_COMPILER_H
#define __BIFROST_COMPILER_H

#include "bifrost.h"
#include "compiler/nir/nir.h"
#include "panfrost/util/pan_ir.h"

/* Bifrost opcodes are tricky -- the same op may exist on both FMA and
 * ADD with two completely different opcodes, and opcodes can be varying
 * length in some cases. Then we have different opcodes for int vs float
 * and then sometimes even for different typesizes. Further, virtually
 * every op has a number of flags which depend on the op. In constrast
 * to Midgard where you have a strict ALU/LDST/TEX division and within
 * ALU you have strict int/float and that's it... here it's a *lot* more
 * involved. As such, we use something much higher level for our IR,
 * encoding "classes" of operations, letting the opcode details get
 * sorted out at emit time.
 *
 * Please keep this list alphabetized. Please use a dictionary if you
 * don't know how to do that.
 */

enum bi_class {
        BI_ADD,
        BI_ATEST,
        BI_BRANCH,
        BI_CMP,
        BI_BLEND,
        BI_BITWISE,
        BI_COMBINE,
        BI_CONVERT,
        BI_CSEL,
        BI_DISCARD,
        BI_FMA,
        BI_FMOV,
        BI_FREXP,
        BI_ISUB,
        BI_LOAD,
        BI_LOAD_UNIFORM,
        BI_LOAD_ATTR,
        BI_LOAD_VAR,
        BI_LOAD_VAR_ADDRESS,
        BI_MINMAX,
        BI_MOV,
        BI_REDUCE_FMA,
        BI_SHIFT,
        BI_STORE,
        BI_STORE_VAR,
        BI_SPECIAL, /* _FAST on supported GPUs */
        BI_SWIZZLE,
        BI_TABLE,
        BI_TEX,
        BI_ROUND,
        BI_NUM_CLASSES
};

/* Properties of a class... */
extern unsigned bi_class_props[BI_NUM_CLASSES];

/* abs/neg/outmod valid for a float op */
#define BI_MODS (1 << 0)

/* Generic enough that little class-specific information is required. In other
 * words, it acts as a "normal" ALU op, even if the encoding ends up being
 * irregular enough to warrant a separate class */
#define BI_GENERIC (1 << 1)

/* Accepts a bifrost_roundmode */
#define BI_ROUNDMODE (1 << 2)

/* Can be scheduled to FMA */
#define BI_SCHED_FMA (1 << 3)

/* Can be scheduled to ADD */
#define BI_SCHED_ADD (1 << 4)

/* Most ALU ops can do either, actually */
#define BI_SCHED_ALL (BI_SCHED_FMA | BI_SCHED_ADD)

/* Along with setting BI_SCHED_ADD, eats up the entire cycle, so FMA must be
 * nopped out. Used for _FAST operations. */
#define BI_SCHED_SLOW (1 << 5)

/* Swizzling allowed for the 8/16-bit source */
#define BI_SWIZZLABLE (1 << 6)

/* For scheduling purposes this is a high latency instruction and must be at
 * the end of a clause. Implies ADD */
#define BI_SCHED_HI_LATENCY (1 << 7)

/* Intrinsic is vectorized and should read 4 components in the first source
 * regardless of writemask */
#define BI_VECTOR (1 << 8)

/* Use a data register for src0/dest respectively, bypassing the usual
 * register accessor. Mutually exclusive. */
#define BI_DATA_REG_SRC (1 << 9)
#define BI_DATA_REG_DEST (1 << 10)

/* Quirk: cannot encode multiple abs on FMA in fp16 mode */
#define BI_NO_ABS_ABS_FP16_FMA (1 << 11)

/* It can't get any worse than csel4... can it? */
#define BIR_SRC_COUNT 4

/* BI_LD_VARY */
struct bi_load_vary {
        enum bifrost_interp_mode interp_mode;
        bool reuse;
        bool flat;
};

/* BI_BRANCH encoding the details of the branch itself as well as a pointer to
 * the target. We forward declare bi_block since this is mildly circular (not
 * strictly, but this order of the file makes more sense I think)
 *
 * We define our own enum of conditions since the conditions in the hardware
 * packed in crazy ways that would make manipulation unweildly (meaning changes
 * based on port swapping, etc), so we defer dealing with that until emit time.
 * Likewise, we expose NIR types instead of the crazy branch types, although
 * the restrictions do eventually apply of course. */

struct bi_block;

enum bi_cond {
        BI_COND_ALWAYS,
        BI_COND_LT,
        BI_COND_LE,
        BI_COND_GE,
        BI_COND_GT,
        BI_COND_EQ,
        BI_COND_NE,
};

struct bi_branch {
        /* Types are specified in src_types and must be compatible (either both
         * int, or both float, 16/32, and same size or 32/16 if float. Types
         * ignored if BI_COND_ALWAYS is set for an unconditional branch. */

        enum bi_cond cond;
        struct bi_block *target;
};

/* Opcodes within a class */
enum bi_minmax_op {
        BI_MINMAX_MIN,
        BI_MINMAX_MAX
};

enum bi_bitwise_op {
        BI_BITWISE_AND,
        BI_BITWISE_OR,
        BI_BITWISE_XOR
};

enum bi_round_op {
        BI_ROUND_MODE, /* use round mode */
        BI_ROUND_ROUND /* i.e.: fround() */
};

enum bi_table_op {
        /* fp32 log2() with low precision, suitable for GL or half_log2() in
         * CL. In the first argument, takes x. Letting u be such that x =
         * 2^{-m} u with m integer and 0.75 <= u < 1.5, returns
         * log2(u) / (u - 1). */

        BI_TABLE_LOG2_U_OVER_U_1_LOW,
};

enum bi_reduce_op {
        /* Takes two fp32 arguments and returns x + frexp(y). Used in
         * low-precision log2 argument reduction on newer models. */

        BI_REDUCE_ADD_FREXPM,
};

enum bi_frexp_op {
        BI_FREXPE_LOG,
};

enum bi_special_op {
        BI_SPECIAL_FRCP,
        BI_SPECIAL_FRSQ,

        /* fp32 exp2() with low precision, suitable for half_exp2() in CL or
         * exp2() in GL. In the first argument, it takes f2i_rte(x * 2^24). In
         * the second, it takes x itself. */
        BI_SPECIAL_EXP2_LOW,
};

typedef struct {
        struct list_head link; /* Must be first */
        enum bi_class type;

        /* Indices, see bir_ssa_index etc. Note zero is special cased
         * to "no argument" */
        unsigned dest;
        unsigned src[BIR_SRC_COUNT];

        /* If one of the sources has BIR_INDEX_CONSTANT */
        union {
                uint64_t u64;
                uint32_t u32;
                uint16_t u16[2];
                uint8_t u8[4];
        } constant;

        /* Floating-point modifiers, type/class permitting. If not
         * allowed for the type/class, these are ignored. */
        enum bifrost_outmod outmod;
        bool src_abs[BIR_SRC_COUNT];
        bool src_neg[BIR_SRC_COUNT];

        /* Round mode (requires BI_ROUNDMODE) */
        enum bifrost_roundmode roundmode;

        /* Writemask (bit for each affected byte). This is quite restricted --
         * ALU ops can only write to a single channel (exception: <32 in which
         * you can write to 32/N contiguous aligned channels). Load/store can
         * only write to all channels at once, in a sense. But it's still
         * better to use this generic form than have synthetic ops flying
         * about, since we're not essentially vector for RA purposes. */
        uint16_t writemask;

        /* Destination type. Usually the type of the instruction
         * itself, but if sources and destination have different
         * types, the type of the destination wins (so f2i would be
         * int). Zero if there is no destination. Bitsize included */
        nir_alu_type dest_type;

        /* Source types if required by the class */
        nir_alu_type src_types[BIR_SRC_COUNT];

        /* If the source type is 8-bit or 16-bit such that SIMD is possible,
         * and the class has BI_SWIZZLABLE, this is a swizzle in the usual
         * sense. On non-SIMD instructions, it can be used for component
         * selection, so we don't have to special case extraction. */
        uint8_t swizzle[BIR_SRC_COUNT][NIR_MAX_VEC_COMPONENTS];

        /* A class-specific op from which the actual opcode can be derived
         * (along with the above information) */

        union {
                enum bi_minmax_op minmax;
                enum bi_bitwise_op bitwise;
                enum bi_round_op round;
                enum bi_special_op special;
                enum bi_reduce_op reduce;
                enum bi_table_op table;
                enum bi_frexp_op frexp;
                enum bi_cond compare;
        } op;

        /* Union for class-specific information */
        union {
                enum bifrost_minmax_mode minmax;
                struct bi_load_vary load_vary;
                struct bi_branch branch;

                /* For CSEL, the comparison op. BI_COND_ALWAYS doesn't make
                 * sense here but you can always just use a move for that */
                enum bi_cond csel_cond;

                /* For BLEND -- the location 0-7 */
                unsigned blend_location;

                /* For STORE, STORE_VAR -- channel count */
                unsigned store_channels;
        };
} bi_instruction;

/* Scheduling takes place in two steps. Step 1 groups instructions within a
 * block into distinct clauses (bi_clause). Step 2 schedules instructions
 * within a clause into FMA/ADD pairs (bi_bundle).
 *
 * A bi_bundle contains two paired instruction pointers. If a slot is unfilled,
 * leave it NULL; the emitter will fill in a nop.
 */

typedef struct {
        bi_instruction *fma;
        bi_instruction *add;
} bi_bundle;

typedef struct {
        struct list_head link;

        /* A clause can have 8 instructions in bundled FMA/ADD sense, so there
         * can be 8 bundles. But each bundle can have both an FMA and an ADD,
         * so a clause can have up to 16 bi_instructions. Whether bundles or
         * instructions are used depends on where in scheduling we are. */

        unsigned instruction_count;
        unsigned bundle_count;

        union {
                bi_instruction *instructions[16];
                bi_bundle bundles[8];
        };

        /* For scoreboarding -- the clause ID (this is not globally unique!)
         * and its dependencies in terms of other clauses, computed during
         * scheduling and used when emitting code. Dependencies expressed as a
         * bitfield matching the hardware, except shifted by a clause (the
         * shift back to the ISA's off-by-one encoding is worked out when
         * emitting clauses) */
        unsigned scoreboard_id;
        uint8_t dependencies;

        /* Back-to-back corresponds directly to the back-to-back bit. Branch
         * conditional corresponds to the branch conditional bit except that in
         * the emitted code it's always set if back-to-bit is, whereas we use
         * the actual value (without back-to-back so to speak) internally */
        bool back_to_back;
        bool branch_conditional;

        /* Assigned data register */
        unsigned data_register;

        /* Corresponds to the usual bit but shifted by a clause */
        bool data_register_write_barrier;

        /* Constants read by this clause. ISA limit. */
        uint64_t constants[8];
        unsigned constant_count;

        /* What type of high latency instruction is here, basically */
        unsigned clause_type;
} bi_clause;

typedef struct bi_block {
        pan_block base; /* must be first */

        /* If true, uses clauses; if false, uses instructions */
        bool scheduled;
        struct list_head clauses; /* list of bi_clause */
} bi_block;

typedef struct {
       nir_shader *nir;
       gl_shader_stage stage;
       struct list_head blocks; /* list of bi_block */
       struct panfrost_sysvals sysvals;
       uint32_t quirks;

       /* During NIR->BIR */
       nir_function_impl *impl;
       bi_block *current_block;
       unsigned block_name_count;
       bi_block *after_block;
       bi_block *break_block;
       bi_block *continue_block;
       bool emitted_atest;

       /* For creating temporaries */
       unsigned temp_alloc;

       /* Analysis results */
       bool has_liveness;

       /* Stats for shader-db */
       unsigned instruction_count;
       unsigned loop_count;
} bi_context;

static inline bi_instruction *
bi_emit(bi_context *ctx, bi_instruction ins)
{
        bi_instruction *u = rzalloc(ctx, bi_instruction);
        memcpy(u, &ins, sizeof(ins));
        list_addtail(&u->link, &ctx->current_block->base.instructions);
        return u;
}

static inline bi_instruction *
bi_emit_before(bi_context *ctx, bi_instruction *tag, bi_instruction ins)
{
        bi_instruction *u = rzalloc(ctx, bi_instruction);
        memcpy(u, &ins, sizeof(ins));
        list_addtail(&u->link, &tag->link);
        return u;
}

static inline void
bi_remove_instruction(bi_instruction *ins)
{
        list_del(&ins->link);
}

/* So we can distinguish between SSA/reg/sentinel quickly */
#define BIR_NO_ARG (0)
#define BIR_IS_REG (1)

/* If high bits are set, instead of SSA/registers, we have specials indexed by
 * the low bits if necessary.
 *
 *  Fixed register: do not allocate register, do not collect $200.
 *  Uniform: access a uniform register given by low bits.
 *  Constant: access the specified constant (specifies a bit offset / shift)
 *  Zero: special cased to avoid wasting a constant
 *  Passthrough: a bifrost_packed_src to passthrough T/T0/T1
 */

#define BIR_INDEX_REGISTER (1 << 31)
#define BIR_INDEX_UNIFORM  (1 << 30)
#define BIR_INDEX_CONSTANT (1 << 29)
#define BIR_INDEX_ZERO     (1 << 28)
#define BIR_INDEX_PASS     (1 << 27)

/* Keep me synced please so we can check src & BIR_SPECIAL */

#define BIR_SPECIAL        ((BIR_INDEX_REGISTER | BIR_INDEX_UNIFORM) | \
        (BIR_INDEX_CONSTANT | BIR_INDEX_ZERO | BIR_INDEX_PASS))

static inline unsigned
bi_max_temp(bi_context *ctx)
{
        unsigned alloc = MAX2(ctx->impl->reg_alloc, ctx->impl->ssa_alloc);
        return ((alloc + 2 + ctx->temp_alloc) << 1);
}

static inline unsigned
bi_make_temp(bi_context *ctx)
{
        return (ctx->impl->ssa_alloc + 1 + ctx->temp_alloc++) << 1;
}

static inline unsigned
bi_make_temp_reg(bi_context *ctx)
{
        return ((ctx->impl->reg_alloc + ctx->temp_alloc++) << 1) | BIR_IS_REG;
}

static inline unsigned
bir_ssa_index(nir_ssa_def *ssa)
{
        /* Off-by-one ensures BIR_NO_ARG is skipped */
        return ((ssa->index + 1) << 1) | 0;
}

static inline unsigned
bir_src_index(nir_src *src)
{
        if (src->is_ssa)
                return bir_ssa_index(src->ssa);
        else {
                assert(!src->reg.indirect);
                return (src->reg.reg->index << 1) | BIR_IS_REG;
        }
}

static inline unsigned
bir_dest_index(nir_dest *dst)
{
        if (dst->is_ssa)
                return bir_ssa_index(&dst->ssa);
        else {
                assert(!dst->reg.indirect);
                return (dst->reg.reg->index << 1) | BIR_IS_REG;
        }
}

/* Iterators for Bifrost IR */

#define bi_foreach_block(ctx, v) \
        list_for_each_entry(pan_block, v, &ctx->blocks, link)

#define bi_foreach_block_from(ctx, from, v) \
        list_for_each_entry_from(pan_block, v, from, &ctx->blocks, link)

#define bi_foreach_instr_in_block(block, v) \
        list_for_each_entry(bi_instruction, v, &(block)->base.instructions, link)

#define bi_foreach_instr_in_block_rev(block, v) \
        list_for_each_entry_rev(bi_instruction, v, &(block)->base.instructions, link)

#define bi_foreach_instr_in_block_safe(block, v) \
        list_for_each_entry_safe(bi_instruction, v, &(block)->base.instructions, link)

#define bi_foreach_instr_in_block_safe_rev(block, v) \
        list_for_each_entry_safe_rev(bi_instruction, v, &(block)->base.instructions, link)

#define bi_foreach_instr_in_block_from(block, v, from) \
        list_for_each_entry_from(bi_instruction, v, from, &(block)->base.instructions, link)

#define bi_foreach_instr_in_block_from_rev(block, v, from) \
        list_for_each_entry_from_rev(bi_instruction, v, from, &(block)->base.instructions, link)

#define bi_foreach_clause_in_block(block, v) \
        list_for_each_entry(bi_clause, v, &(block)->clauses, link)

#define bi_foreach_instr_global(ctx, v) \
        bi_foreach_block(ctx, v_block) \
                bi_foreach_instr_in_block((bi_block *) v_block, v)

#define bi_foreach_instr_global_safe(ctx, v) \
        bi_foreach_block(ctx, v_block) \
                bi_foreach_instr_in_block_safe((bi_block *) v_block, v)

/* Based on set_foreach, expanded with automatic type casts */

#define bi_foreach_predecessor(blk, v) \
        struct set_entry *_entry_##v; \
        bi_block *v; \
        for (_entry_##v = _mesa_set_next_entry(blk->base.predecessors, NULL), \
                v = (bi_block *) (_entry_##v ? _entry_##v->key : NULL);  \
                _entry_##v != NULL; \
                _entry_##v = _mesa_set_next_entry(blk->base.predecessors, _entry_##v), \
                v = (bi_block *) (_entry_##v ? _entry_##v->key : NULL))

#define bi_foreach_src(ins, v) \
        for (unsigned v = 0; v < ARRAY_SIZE(ins->src); ++v)

static inline bi_instruction *
bi_prev_op(bi_instruction *ins)
{
        return list_last_entry(&(ins->link), bi_instruction, link);
}

static inline bi_instruction *
bi_next_op(bi_instruction *ins)
{
        return list_first_entry(&(ins->link), bi_instruction, link);
}

static inline pan_block *
pan_next_block(pan_block *block)
{
        return list_first_entry(&(block->link), pan_block, link);
}

/* BIR manipulation */

bool bi_has_outmod(bi_instruction *ins);
bool bi_has_source_mods(bi_instruction *ins);
bool bi_is_src_swizzled(bi_instruction *ins, unsigned s);
bool bi_has_arg(bi_instruction *ins, unsigned arg);
uint16_t bi_from_bytemask(uint16_t bytemask, unsigned bytes);
unsigned bi_get_component_count(bi_instruction *ins, unsigned s);
unsigned bi_load32_components(bi_instruction *ins);
uint16_t bi_bytemask_of_read_components(bi_instruction *ins, unsigned node);
uint64_t bi_get_immediate(bi_instruction *ins, unsigned index);
bool bi_writes_component(bi_instruction *ins, unsigned comp);

/* BIR passes */

void bi_lower_combine(bi_context *ctx, bi_block *block);
bool bi_opt_dead_code_eliminate(bi_context *ctx, bi_block *block);
void bi_schedule(bi_context *ctx);
void bi_register_allocate(bi_context *ctx);

/* Liveness */

void bi_compute_liveness(bi_context *ctx);
void bi_liveness_ins_update(uint16_t *live, bi_instruction *ins, unsigned max);
void bi_invalidate_liveness(bi_context *ctx);
bool bi_is_live_after(bi_context *ctx, bi_block *block, bi_instruction *start, int src);

/* Code emit */

void bi_pack(bi_context *ctx, struct util_dynarray *emission);

#endif