summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- src/gallium/drivers/freedreno/a3xx/fd3_compiler.c | 86
-rw-r--r-- src/gallium/drivers/freedreno/a3xx/instr-a3xx.h | 12
-rw-r--r-- src/gallium/drivers/freedreno/a3xx/ir3.h | 19
-rw-r--r-- src/gallium/drivers/freedreno/a3xx/ir3_cp.c | 5
-rw-r--r-- src/gallium/drivers/freedreno/a3xx/ir3_depth.c | 3
-rw-r--r-- src/gallium/drivers/freedreno/a3xx/ir3_dump.c | 10
-rw-r--r-- src/gallium/drivers/freedreno/a3xx/ir3_ra.c | 55
-rw-r--r-- src/gallium/drivers/freedreno/a3xx/ir3_sched.c | 81
8 files changed, 203 insertions, 68 deletions
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
index cee446a9fa8..1138ec9be34 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
@@ -192,8 +192,7 @@ compile_init(struct fd3_compile_context *ctx, struct fd3_shader_variant *so,
#define FM(x) (1 << TGSI_FILE_##x)
/* optimize can't deal with relative addressing: */
- if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) |
- FM(OUTPUT) | FM(IMMEDIATE) | FM(CONSTANT)))
+ if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT)))
return TGSI_PARSE_ERROR;
/* Immediates go after constants: */
@@ -414,14 +413,7 @@ block_temporary(struct ir3_block *block, unsigned n)
static struct ir3_instruction *
create_immed(struct fd3_compile_context *ctx, float val)
{
- /* this can happen when registers (or components of a TGSI
- * register) are used as src before they have been assigned
- * (undefined contents). To avoid confusing the rest of the
- * compiler, and to generally keep things peachy, substitute
- * an instruction that sets the src to 0.0. Or to keep
- * things undefined, I could plug in a random number? :-P
- *
- * NOTE: *don't* use instr_create() here!
+ /* NOTE: *don't* use instr_create() here!
*/
struct ir3_instruction *instr;
instr = ir3_instr_create(ctx->block, 1, 0);
@@ -464,6 +456,12 @@ ssa_dst(struct fd3_compile_context *ctx, struct ir3_instruction *instr,
ctx->output_updates[idx].instr = instr;
ctx->num_output_updates++;
break;
+ case TGSI_FILE_ADDRESS:
+ compile_assert(ctx, n < 1);
+ ctx->output_updates[idx].instrp = &ctx->block->address;
+ ctx->output_updates[idx].instr = instr;
+ ctx->num_output_updates++;
+ break;
}
}
@@ -526,7 +524,8 @@ add_dst_reg_wrmask(struct fd3_compile_context *ctx,
/* uses SSA */
break;
case TGSI_FILE_ADDRESS:
- num = REG_A0;
+ flags |= IR3_REG_ADDR;
+ /* uses SSA */
break;
default:
compile_error(ctx, "unsupported dst register file: %s\n",
@@ -553,7 +552,8 @@ add_dst_reg_wrmask(struct fd3_compile_context *ctx,
if (!ctx->atomic)
ssa_dst(ctx, instr, dst, chan);
} else if ((dst->File == TGSI_FILE_TEMPORARY) ||
- (dst->File == TGSI_FILE_OUTPUT)) {
+ (dst->File == TGSI_FILE_OUTPUT) ||
+ (dst->File == TGSI_FILE_ADDRESS)) {
unsigned i;
/* if instruction writes multiple, we need to create
@@ -591,6 +591,7 @@ add_src_reg_wrmask(struct fd3_compile_context *ctx,
{
unsigned flags = 0, num = 0;
struct ir3_register *reg;
+ struct ir3_instruction *orig = NULL;
/* TODO we need to use a mov to temp for const >= 64.. or maybe
* we could use relative addressing..
@@ -628,9 +629,21 @@ add_src_reg_wrmask(struct fd3_compile_context *ctx,
flags |= IR3_REG_ABS;
if (src->Negate)
flags |= IR3_REG_NEGATE;
- if (src->Indirect)
+
+ if (src->Indirect) {
flags |= IR3_REG_RELATIV;
+ /* shouldn't happen, and we can't cope with it below: */
+ compile_assert(ctx, wrmask == 0x1);
+
+ /* wrap in a meta-deref to track both the src and address: */
+ orig = instr;
+
+ instr = ir3_instr_create(ctx->block, -1, OPC_META_DEREF);
+ ir3_reg_create(instr, 0, 0);
+ ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->block->address;
+ }
+
reg = ir3_reg_create(instr, regid(num, chan), flags);
reg->wrmask = wrmask;
@@ -643,6 +656,8 @@ add_src_reg_wrmask(struct fd3_compile_context *ctx,
struct ir3_instruction *collect;
unsigned i;
+ compile_assert(ctx, !src->Indirect);
+
/* if instruction reads multiple, we need to create
* some place-holder collect the registers:
*/
@@ -666,6 +681,10 @@ add_src_reg_wrmask(struct fd3_compile_context *ctx,
reg->instr = collect;
}
+ if (src->Indirect) {
+ reg = ir3_reg_create(orig, 0, flags | IR3_REG_SSA);
+ reg->instr = instr;
+ }
return reg;
}
@@ -718,36 +737,6 @@ get_internal_temp(struct fd3_compile_context *ctx,
return tmp_src;
}
-/* Get internal half-precision temp src/dst to use for a sequence of
- * instructions generated by a single TGSI op.
- */
-static struct tgsi_src_register *
-get_internal_temp_hr(struct fd3_compile_context *ctx,
- struct tgsi_dst_register *tmp_dst)
-{
- struct tgsi_src_register *tmp_src;
- int n;
-
- tmp_dst->File = TGSI_FILE_TEMPORARY;
- tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
- tmp_dst->Indirect = 0;
- tmp_dst->Dimension = 0;
-
- /* assign next temporary: */
- n = ctx->num_internal_temps++;
- compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
- tmp_src = &ctx->internal_temps[n];
-
- /* just use hr0 because no one else should be using half-
- * precision regs:
- */
- tmp_dst->Index = 0;
-
- src_from_dst(tmp_src, tmp_dst);
-
- return tmp_src;
-}
-
static inline bool
is_const(struct tgsi_src_register *src)
{
@@ -1049,11 +1038,18 @@ trans_arl(const struct instr_translater *t,
struct tgsi_dst_register *dst = &inst->Dst[0].Register;
struct tgsi_src_register *src = &inst->Src[0].Register;
unsigned chan = src->SwizzleX;
+
compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS);
- tmp_src = get_internal_temp_hr(ctx, &tmp_dst);
+ /* NOTE: we allocate a temporary from a flat register
+ * namespace (ignoring half vs full). It turns out
+ * not to really matter since registers get reassigned
+ * later in ir3_ra which (hopefully!) can deal a bit
+ * better with mixed half and full precision.
+ */
+ tmp_src = get_internal_temp(ctx, &tmp_dst);
- /* cov.{f32,f16}s16 Rtmp, Rsrc */
+ /* cov.f{32,16}s16 Rtmp, Rsrc */
instr = instr_create(ctx, 1, 0);
instr->cat1.src_type = get_ftype(ctx);
instr->cat1.dst_type = TYPE_S16;
diff --git a/src/gallium/drivers/freedreno/a3xx/instr-a3xx.h b/src/gallium/drivers/freedreno/a3xx/instr-a3xx.h
index a79998ef56f..c67f1037ced 100644
--- a/src/gallium/drivers/freedreno/a3xx/instr-a3xx.h
+++ b/src/gallium/drivers/freedreno/a3xx/instr-a3xx.h
@@ -204,6 +204,8 @@ typedef enum {
/* branches/flow control */
OPC_META_FLOW = 4,
OPC_META_PHI = 5,
+ /* relative addressing */
+ OPC_META_DEREF = 6,
} opc_t;
@@ -244,6 +246,16 @@ static inline int type_float(type_t type)
return (type == TYPE_F32) || (type == TYPE_F16);
}
+static inline int type_uint(type_t type)
+{
+ return (type == TYPE_U32) || (type == TYPE_U16) || (type == TYPE_U8);
+}
+
+static inline int type_sint(type_t type)
+{
+ return (type == TYPE_S32) || (type == TYPE_S16) || (type == TYPE_S8);
+}
+
typedef union PACKED {
/* normal gpr or const src register: */
struct PACKED {
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3.h b/src/gallium/drivers/freedreno/a3xx/ir3.h
index 872f47883bb..9ec05da6ae4 100644
--- a/src/gallium/drivers/freedreno/a3xx/ir3.h
+++ b/src/gallium/drivers/freedreno/a3xx/ir3.h
@@ -70,6 +70,7 @@ struct ir3_register {
*/
IR3_REG_SSA = 0x1000, /* 'instr' is ptr to assigning instr */
IR3_REG_IA = 0x2000, /* meta-input dst is "assigned" */
+ IR3_REG_ADDR = 0x4000, /* register is a0.x */
} flags;
union {
/* normal registers:
@@ -232,6 +233,8 @@ struct ir3_block {
struct ir3_instruction **temporaries;
struct ir3_instruction **inputs;
struct ir3_instruction **outputs;
+ /* only a single address register: */
+ struct ir3_instruction *address;
struct ir3_block *parent;
struct ir3_instruction *head;
};
@@ -351,10 +354,24 @@ static inline bool is_meta(struct ir3_instruction *instr)
return (instr->category == -1);
}
+static inline bool is_deref(struct ir3_instruction *instr)
+{
+ return is_meta(instr) && (instr->opc == OPC_META_DEREF);
+}
+
+static inline bool writes_addr(struct ir3_instruction *instr)
+{
+ if (instr->regs_count > 0) {
+ struct ir3_register *dst = instr->regs[0];
+ return !!(dst->flags & IR3_REG_ADDR);
+ }
+ return false;
+}
+
/* TODO combine is_gpr()/reg_gpr().. */
static inline bool reg_gpr(struct ir3_register *r)
{
- if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_SSA))
+ if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_SSA | IR3_REG_ADDR))
return false;
if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
return false;
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_cp.c b/src/gallium/drivers/freedreno/a3xx/ir3_cp.c
index 81f6c902816..0faed89c25e 100644
--- a/src/gallium/drivers/freedreno/a3xx/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/a3xx/ir3_cp.c
@@ -43,10 +43,13 @@ static bool is_eligible_mov(struct ir3_instruction *instr)
{
if ((instr->category == 1) &&
(instr->cat1.src_type == instr->cat1.dst_type)) {
+ struct ir3_register *dst = instr->regs[0];
struct ir3_register *src = instr->regs[1];
+ if (dst->flags & IR3_REG_ADDR)
+ return false;
if ((src->flags & IR3_REG_SSA) &&
/* TODO: propagate abs/neg modifiers if possible */
- !(src->flags & (IR3_REG_ABS | IR3_REG_NEGATE)))
+ !(src->flags & (IR3_REG_ABS | IR3_REG_NEGATE | IR3_REG_RELATIV)))
return true;
}
return false;
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_depth.c b/src/gallium/drivers/freedreno/a3xx/ir3_depth.c
index 1715f1917f0..b84629b2e07 100644
--- a/src/gallium/drivers/freedreno/a3xx/ir3_depth.c
+++ b/src/gallium/drivers/freedreno/a3xx/ir3_depth.c
@@ -63,6 +63,9 @@ int ir3_delayslots(struct ir3_instruction *assigner,
if (is_meta(assigner))
return 0;
+ if (writes_addr(assigner))
+ return 6;
+
/* handled via sync flags: */
if (is_sfu(assigner) || is_tex(assigner))
return 0;
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_dump.c b/src/gallium/drivers/freedreno/a3xx/ir3_dump.c
index 3984cd60e6e..a186d62a819 100644
--- a/src/gallium/drivers/freedreno/a3xx/ir3_dump.c
+++ b/src/gallium/drivers/freedreno/a3xx/ir3_dump.c
@@ -58,6 +58,9 @@ static void dump_instr_name(struct ir3_dump_ctx *ctx,
case OPC_META_PHI:
fprintf(ctx->f, "&#934;");
break;
+ case OPC_META_DEREF:
+ fprintf(ctx->f, "(*)");
+ break;
default:
/* shouldn't hit here.. just for debugging: */
switch (instr->opc) {
@@ -66,7 +69,6 @@ static void dump_instr_name(struct ir3_dump_ctx *ctx,
case OPC_META_FO: fprintf(ctx->f, "_meta:fo"); break;
case OPC_META_FI: fprintf(ctx->f, "_meta:fi"); break;
case OPC_META_FLOW: fprintf(ctx->f, "_meta:flow"); break;
- case OPC_META_PHI: fprintf(ctx->f, "_meta:phi"); break;
default: fprintf(ctx->f, "_meta:%d", instr->opc); break;
}
@@ -162,7 +164,8 @@ static void dump_instr(struct ir3_dump_ctx *ctx,
ir3_block_dump(ctx, instr->flow.else_block, "else");
if (reg->flags & IR3_REG_SSA)
dump_instr(ctx, reg->instr);
- } else if (instr->opc == OPC_META_PHI) {
+ } else if ((instr->opc == OPC_META_PHI) ||
+ (instr->opc == OPC_META_DEREF)) {
/* treat like a normal instruction: */
ir3_instr_dump(ctx, instr);
}
@@ -228,7 +231,8 @@ static void dump_link2(struct ir3_dump_ctx *ctx,
printdef(ctx, defer, "output%lx:<out%u>:w -> %s",
PTRID(instr->inout.block),
instr->regs[0]->num, target);
- } else if (instr->opc == OPC_META_PHI) {
+ } else if ((instr->opc == OPC_META_PHI) ||
+ (instr->opc == OPC_META_DEREF)) {
/* treat like a normal instruction: */
printdef(ctx, defer, "instr%lx:<dst0> -> %s", PTRID(instr), target);
}
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c
index 57c68c729c5..a9a510f3bc2 100644
--- a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c
@@ -82,8 +82,8 @@ static struct ir3_ra_assignment ra_calc(struct ir3_instruction *instr);
* Register Allocation:
*/
-#define REG(n, wm) (struct ir3_register){ \
- /*.flags = ((so)->half_precision) ? IR3_REG_HALF : 0,*/ \
+#define REG(n, wm, f) (struct ir3_register){ \
+ .flags = (f), \
.num = (n), \
.wrmask = TGSI_WRITEMASK_ ## wm, \
}
@@ -145,7 +145,7 @@ static void compute_liveregs(struct ir3_ra_ctx *ctx,
/* be sure to account for output registers too: */
for (i = 0; i < block->noutputs; i++) {
- struct ir3_register reg = REG(output_base(ctx) + i, X);
+ struct ir3_register reg = REG(output_base(ctx) + i, X, 0);
regmask_set_if_not(liveregs, &reg, &written);
}
}
@@ -212,14 +212,15 @@ static bool compute_clobbers(struct ir3_ra_ctx *ctx,
return live || was_live;
}
-static int find_available(regmask_t *liveregs, int size)
+static int find_available(regmask_t *liveregs, int size, bool half)
{
unsigned i;
+ unsigned f = half ? IR3_REG_HALF : 0;
for (i = 0; i < MAX_REG - size; i++) {
- if (!regmask_get(liveregs, &REG(i, X))) {
+ if (!regmask_get(liveregs, &REG(i, X, f))) {
unsigned start = i++;
for (; (i < MAX_REG) && ((i - start) < size); i++)
- if (regmask_get(liveregs, &REG(i, X)))
+ if (regmask_get(liveregs, &REG(i, X, f)))
break;
if ((i - start) >= size)
return start;
@@ -240,7 +241,9 @@ static int alloc_block(struct ir3_ra_ctx *ctx,
*/
return 0;
} else {
+ struct ir3_register *dst = instr->regs[0];
regmask_t liveregs;
+
compute_liveregs(ctx, instr, &liveregs);
// XXX XXX XXX XXX XXX XXX XXX XXX XXX
@@ -257,7 +260,9 @@ static int alloc_block(struct ir3_ra_ctx *ctx,
} else
// XXX XXX XXX XXX XXX XXX XXX XXX XXX
compute_clobbers(ctx, instr->next, instr, &liveregs);
- return find_available(&liveregs, size);
+
+ return find_available(&liveregs, size,
+ !!(dst->flags & IR3_REG_HALF));
}
}
@@ -547,24 +552,32 @@ static void ra_assign(struct ir3_ra_ctx *ctx,
static void ir3_instr_ra(struct ir3_ra_ctx *ctx,
struct ir3_instruction *instr)
{
- struct ir3_ra_assignment a;
+ struct ir3_register *dst;
unsigned num;
/* skip over nop's */
if (instr->regs_count == 0)
return;
- /* skip writes to a0, p0, etc */
- if (!reg_gpr(instr->regs[0]))
- return;
+ dst = instr->regs[0];
/* if we've already visited this instruction, bail now: */
if (instr->flags & IR3_INSTR_MARK)
return;
/* allocate register(s): */
- a = ra_calc(instr);
- num = alloc_block(ctx, instr, a.num) + a.off;
+ if (is_deref(instr)) {
+ num = instr->regs[2]->num;
+ } else if (reg_gpr(dst)) {
+ struct ir3_ra_assignment a;
+ a = ra_calc(instr);
+ num = alloc_block(ctx, instr, a.num) + a.off;
+ } else if (dst->flags & IR3_REG_ADDR) {
+ dst->flags &= ~IR3_REG_ADDR;
+ num = regid(REG_A0, 0) | REG_HALF;
+ } else {
+ assert(0);
+ }
ra_assign(ctx, instr, num);
}
@@ -578,6 +591,7 @@ static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block)
struct ir3_instruction *end =
ir3_instr_create(block, 0, OPC_END);
struct ir3_instruction *last_input = NULL;
+ struct ir3_instruction *last_rel = NULL;
regmask_t needs_ss_war; /* write after read */
regmask_t needs_ss;
regmask_t needs_sy;
@@ -614,6 +628,13 @@ static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block)
regmask_init(&needs_sy);
}
}
+
+ /* TODO: is it valid to have address reg loaded from a
+ * relative src (ie. mova a0, c<a0.x+4>)? If so, the
+ * last_rel check below should be moved ahead of this:
+ */
+ if (reg->flags & IR3_REG_RELATIV)
+ last_rel = n;
}
if (n->regs_count > 0) {
@@ -622,6 +643,11 @@ static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block)
n->flags |= IR3_INSTR_SS;
regmask_init(&needs_ss_war); // ??? I assume?
}
+
+ if (last_rel && (reg->num == regid(REG_A0, 0))) {
+ last_rel->flags |= IR3_INSTR_UL;
+ last_rel = NULL;
+ }
}
/* cat5+ does not have an (ss) bit, if needed we need to
@@ -685,6 +711,9 @@ static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block)
if (last_input)
last_input->regs[0]->flags |= IR3_REG_EI;
+ if (last_rel)
+ last_rel->flags |= IR3_INSTR_UL;
+
shader->instrs[shader->instrs_count++] = end;
shader->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_sched.c b/src/gallium/drivers/freedreno/a3xx/ir3_sched.c
index 5e585271f92..4fd3da58b46 100644
--- a/src/gallium/drivers/freedreno/a3xx/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/a3xx/ir3_sched.c
@@ -31,6 +31,11 @@
#include "ir3.h"
+enum {
+ SCHEDULED = -1,
+ DELAYED = -2,
+};
+
/*
* Instruction Scheduling:
*
@@ -46,7 +51,8 @@
*/
struct ir3_sched_ctx {
- struct ir3_instruction *scheduled;
+ struct ir3_instruction *scheduled; /* last scheduled instr */
+ struct ir3_instruction *deref; /* current deref, if any */
unsigned cnt;
};
@@ -123,6 +129,11 @@ static void schedule(struct ir3_sched_ctx *ctx,
block->head = instr->next;
}
+ if (writes_addr(instr)) {
+ assert(ctx->deref == NULL);
+ ctx->deref = instr;
+ }
+
instr->flags |= IR3_INSTR_MARK;
instr->next = ctx->scheduled;
@@ -210,13 +221,19 @@ static int trysched(struct ir3_sched_ctx *ctx,
* we have enough delay slots to schedule ourself:
*/
delay = delay_calc(ctx, instr);
+ if (delay)
+ return delay;
- if (!delay) {
- schedule(ctx, instr, true);
- return -1;
+ /* if this is a write to address register, and addr register
+ * is currently in use, we need to defer until it is free:
+ */
+ if (writes_addr(instr) && ctx->deref) {
+ assert(ctx->deref != instr);
+ return DELAYED;
}
- return delay;
+ schedule(ctx, instr, true);
+ return SCHEDULED;
}
static struct ir3_instruction * reverse(struct ir3_instruction *instr)
@@ -231,6 +248,56 @@ static struct ir3_instruction * reverse(struct ir3_instruction *instr)
return reversed;
}
+static bool uses_current_deref(struct ir3_sched_ctx *ctx,
+ struct ir3_instruction *instr)
+{
+ unsigned i;
+ for (i = 1; i < instr->regs_count; i++) {
+ struct ir3_register *reg = instr->regs[i];
+ if (reg->flags & IR3_REG_SSA) {
+ if (is_deref(reg->instr)) {
+ struct ir3_instruction *deref;
+ deref = reg->instr->regs[1]->instr; /* the mova */
+ if (ctx->deref == deref)
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+/* when we encounter an instruction that writes to the address register
+ * when it is in use, we delay that instruction and try to schedule all
+ * other instructions using the current address register:
+ */
+static int block_sched_undelayed(struct ir3_sched_ctx *ctx,
+ struct ir3_block *block)
+{
+ struct ir3_instruction *instr = block->head;
+ bool in_use = false;
+ unsigned cnt = ~0;
+
+ while (instr) {
+ struct ir3_instruction *next = instr->next;
+
+ if (uses_current_deref(ctx, instr)) {
+ int ret = trysched(ctx, instr);
+ if (ret == SCHEDULED)
+ cnt = 0;
+ else if (ret > 0)
+ cnt = MIN2(cnt, ret);
+ in_use = true;
+ }
+
+ instr = next;
+ }
+
+ if (!in_use)
+ ctx->deref = NULL;
+
+ return cnt;
+}
+
static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
struct ir3_instruction *instr;
@@ -255,6 +322,10 @@ static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block)
*/
struct ir3_instruction *next = instr->next;
int cnt = trysched(ctx, instr);
+
+ if (cnt == DELAYED)
+ cnt = block_sched_undelayed(ctx, block);
+
/* -1 is signal to return up stack, but to us means same as 0: */
cnt = MAX2(0, cnt);
cnt += ctx->cnt;