summaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/nv50/nv50_program.c
diff options
context:
space:
mode:
authorChristoph Bumiller <[email protected]>2009-09-14 20:23:39 +0200
committerChristoph Bumiller <[email protected]>2009-09-15 12:13:23 +0200
commit6516594c8eec1088ee59e7c3254b2fdced2ff04b (patch)
treeae802e374800429562be21ecc3b098862cf5bbb6 /src/gallium/drivers/nv50/nv50_program.c
parent38849c529e76b99f56f522be183a5935d617bcab (diff)
nv50: proper linkage between VP and FP
This moves construction of the mapping between VP outputs and FP inputs into validation. The map also contains slots for special outputs like clip distance and point size, so we need to at least merge the VP related and FP related parts on validation if we want to support those. Now we match every single FP input component with results from the VP and leave those not read out of the map, or replace those not written by 0 (xyz) or 1 (w). The bitmap indicating linear interpolants is also filled, and flat FP inputs are mapped in only after non-flat ones, as is required. Furthermore, we can save some space by only fetching VP attrs we actually use, and avoid wasting any output regs because of TGSI using less than 4 components.
Diffstat (limited to 'src/gallium/drivers/nv50/nv50_program.c')
-rw-r--r--src/gallium/drivers/nv50/nv50_program.c422
1 files changed, 269 insertions, 153 deletions
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 7618ff33759..7bc8e13d2ad 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -139,6 +139,14 @@ ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
reg->acc = 0;
}
+static INLINE unsigned
+popcnt4(uint32_t val)
+{
+ static const unsigned cnt[16]
+ = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
+ return cnt[val & 0xf];
+}
+
static void
alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
{
@@ -1975,59 +1983,48 @@ nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
return TRUE;
}
-static unsigned
-load_fp_attrib(struct nv50_pc *pc, int i, int *mid, int *aid, int *p_oid)
+static void
+load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
{
- struct nv50_reg *iv;
- int oid, c, n;
- unsigned mask = 0;
+ struct nv50_reg *iv, **ppiv;
+ unsigned mode = pc->interp_mode[reg->index];
- iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p;
+ ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p;
+ iv = *ppiv;
- for (c = 0, n = i * 4; c < 4; c++, n++) {
- oid = (*p_oid)++;
+ if ((mode & INTERP_PERSPECTIVE) && !iv) {
+ iv = *ppiv = alloc_temp(pc, NULL);
+ iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;
- if (!pc->attr[n].acc)
- continue;
- mask |= (1 << c);
-
- alloc_reg(pc, &pc->attr[n]);
-
- pc->attr[n].rhw = (*aid)++;
- emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]);
+ emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
+ emit_flop(pc, 0, iv, iv);
- pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4));
- (*mid)++;
- pc->p->cfg.fp.regs[1] += 0x00010001;
+ /* XXX: when loading interpolants dynamically, move these
+ * to the program head, or make sure it can't be skipped.
+ */
}
- return mask;
+ emit_interp(pc, reg, iv, mode);
}
static boolean
nv50_program_tx_prep(struct nv50_pc *pc)
{
- struct tgsi_parse_context p;
+ struct tgsi_parse_context tp;
+ struct nv50_program *p = pc->p;
boolean ret = FALSE;
- unsigned i, c;
- unsigned fcol, bcol, fcrd;
-
- /* count (centroid) perspective interpolations */
- unsigned centroid_loads = 0;
- unsigned perspect_loads = 0;
-
- fcol = bcol = fcrd = ~0;
+ unsigned i, c, flat_nr = 0;
- tgsi_parse_init(&p, pc->p->pipe.tokens);
- while (!tgsi_parse_end_of_tokens(&p)) {
- const union tgsi_full_token *tok = &p.FullToken;
+ tgsi_parse_init(&tp, pc->p->pipe.tokens);
+ while (!tgsi_parse_end_of_tokens(&tp)) {
+ const union tgsi_full_token *tok = &tp.FullToken;
- tgsi_parse_token(&p);
+ tgsi_parse_token(&tp);
switch (tok->Token.Type) {
case TGSI_TOKEN_TYPE_IMMEDIATE:
{
const struct tgsi_full_immediate *imm =
- &p.FullToken.FullImmediate;
+ &tp.FullToken.FullImmediate;
ctor_immd(pc, imm->u[0].Float,
imm->u[1].Float,
@@ -2038,9 +2035,9 @@ nv50_program_tx_prep(struct nv50_pc *pc)
case TGSI_TOKEN_TYPE_DECLARATION:
{
const struct tgsi_full_declaration *d;
- unsigned last, first, mode;
+ unsigned si, last, first, mode;
- d = &p.FullToken.FullDeclaration;
+ d = &tp.FullToken.FullDeclaration;
first = d->DeclarationRange.First;
last = d->DeclarationRange.Last;
@@ -2048,43 +2045,41 @@ nv50_program_tx_prep(struct nv50_pc *pc)
case TGSI_FILE_TEMPORARY:
break;
case TGSI_FILE_OUTPUT:
+ if (!d->Declaration.Semantic ||
+ p->type == PIPE_SHADER_FRAGMENT)
+ break;
+
+ si = d->Semantic.SemanticIndex;
+ switch (d->Semantic.SemanticName) {
+ /*
+ case TGSI_SEMANTIC_CLIP_DISTANCE:
+ p->cfg.clpd = MIN2(p->cfg.clpd, first);
+ break;
+ */
+ default:
+ break;
+ }
break;
case TGSI_FILE_INPUT:
{
- if (pc->p->type != PIPE_SHADER_FRAGMENT)
+ if (p->type != PIPE_SHADER_FRAGMENT)
break;
switch (d->Declaration.Interpolate) {
case TGSI_INTERPOLATE_CONSTANT:
mode = INTERP_FLAT;
+ flat_nr++;
break;
case TGSI_INTERPOLATE_PERSPECTIVE:
mode = INTERP_PERSPECTIVE;
+ p->cfg.regs[1] |= 0x08 << 24;
break;
default:
mode = INTERP_LINEAR;
break;
}
-
- if (d->Declaration.Semantic) {
- switch (d->Semantic.SemanticName) {
- case TGSI_SEMANTIC_POSITION:
- fcrd = first;
- break;
- case TGSI_SEMANTIC_COLOR:
- fcol = first;
- mode = INTERP_PERSPECTIVE;
- break;
- }
- }
-
- if (d->Declaration.Centroid) {
+ if (d->Declaration.Centroid)
mode |= INTERP_CENTROID;
- if (mode & INTERP_PERSPECTIVE)
- centroid_loads++;
- } else
- if (mode & INTERP_PERSPECTIVE)
- perspect_loads++;
assert(last < 32);
for (i = first; i <= last; i++)
@@ -2111,92 +2106,117 @@ nv50_program_tx_prep(struct nv50_pc *pc)
}
}
- if (pc->attr_nr) {
- int oid = 4, mid = 4, aid = 0;
- /* oid = VP output id
- * aid = FP attribute/interpolant id
- * mid = VP output mapping field ID
- */
- if (pc->p->type == PIPE_SHADER_FRAGMENT) {
- /* position should be loaded first */
- if (fcrd < 0x40) {
- unsigned mask;
- mid = 0;
- mask = load_fp_attrib(pc, fcrd, &mid, &aid,
- &oid);
- pc->p->cfg.fp.regs[1] |= (mask << 24);
- pc->p->cfg.fp.map[0] += 0x04040404 * fcrd;
- }
+ if (p->type == PIPE_SHADER_VERTEX) {
+ int rid = 0;
- /* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */
-
- if (perspect_loads) {
- pc->iv_p = alloc_temp(pc, NULL);
-
- if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) {
- pc->p->cfg.fp.regs[1] |= 0x08000000;
- pc->iv_p->rhw = aid++;
- emit_interp(pc, pc->iv_p, NULL,
- INTERP_LINEAR);
- emit_flop(pc, 0, pc->iv_p, pc->iv_p);
- } else {
- pc->iv_p->rhw = aid - 1;
- emit_flop(pc, 0, pc->iv_p,
- &pc->attr[fcrd * 4 + 3]);
- }
+ for (i = 0; i < pc->attr_nr * 4; ++i) {
+ if (pc->attr[i].acc) {
+ pc->attr[i].hw = rid++;
+ p->cfg.attr[i / 32] |= 1 << (i % 32);
}
+ }
+
+ for (i = 0, rid = 0; i < pc->result_nr; ++i) {
+ p->cfg.io[i].hw = rid;
+ p->cfg.io[i].id_vp = i;
- if (centroid_loads) {
- pc->iv_c = alloc_temp(pc, NULL);
- pc->iv_c->rhw = pc->iv_p ? aid - 1 : aid++;
- emit_interp(pc, pc->iv_c, NULL,
- INTERP_CENTROID);
- emit_flop(pc, 0, pc->iv_c, pc->iv_c);
- pc->p->cfg.fp.regs[1] |= 0x08000000;
+ for (c = 0; c < 4; ++c) {
+ int n = i * 4 + c;
+ if (!pc->result[n].acc)
+ continue;
+ pc->result[n].hw = rid++;
+ p->cfg.io[i].mask |= 1 << c;
}
+ }
+ } else
+ if (p->type == PIPE_SHADER_FRAGMENT) {
+ int rid, aid;
+ unsigned n = 0, m = pc->attr_nr - flat_nr;
+
+ int base = (TGSI_SEMANTIC_POSITION ==
+ p->info.input_semantic_name[0]) ? 0 : 1;
- for (c = 0; c < 4; c++) {
- /* XXX: secondary colour, tbd */
- if (fcol < 0x40 && pc->attr[fcol * 4 + c].acc)
- pc->p->cfg.fp.regs[0] += 0x00010000;
+ /* non-flat interpolants have to be mapped to
+ * the lower hardware IDs, so sort them:
+ */
+ for (i = 0; i < pc->attr_nr; i++) {
+ if (pc->interp_mode[i] == INTERP_FLAT) {
+ p->cfg.io[m].id_vp = i + base;
+ p->cfg.io[m++].id_fp = i;
+ } else {
+ if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
+ p->cfg.io[n].linear = TRUE;
+ p->cfg.io[n].id_vp = i + base;
+ p->cfg.io[n++].id_fp = i;
}
+ }
- for (i = ((fcrd < 0x40) ? 1 : 0); i < pc->attr_nr; i++)
- load_fp_attrib(pc, i, &mid, &aid, &oid);
+ if (!base) /* set w-coordinate mask from perspective interp */
+ p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;
- if (pc->iv_p)
- free_temp(pc, pc->iv_p);
- if (pc->iv_c)
- free_temp(pc, pc->iv_c);
+ aid = popcnt4( /* if fcrd isn't contained in cfg.io */
+ base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);
- pc->p->cfg.fp.high_map = (mid / 4);
- pc->p->cfg.fp.high_map += ((mid % 4) ? 1 : 0);
- } else {
- /* vertex program */
- for (i = 0; i < pc->attr_nr * 4; i++) {
- pc->p->cfg.vp.attr[aid / 32] |=
- (1 << (aid % 32));
- pc->attr[i].hw = aid++;
+ for (n = 0; n < pc->attr_nr; ++n) {
+ p->cfg.io[n].hw = rid = aid;
+ i = p->cfg.io[n].id_fp;
+
+ for (c = 0; c < 4; ++c) {
+ if (!pc->attr[i * 4 + c].acc)
+ continue;
+ pc->attr[i * 4 + c].rhw = rid++;
+ p->cfg.io[n].mask |= 1 << c;
+
+ load_interpolant(pc, &pc->attr[i * 4 + c]);
}
+ aid += popcnt4(p->cfg.io[n].mask);
}
- }
- if (pc->result_nr) {
- if (pc->p->type == PIPE_SHADER_VERTEX) {
- for (i = 0; i < pc->result_nr * 4; i++)
- pc->result[i].hw = i;
- } else {
- /* type == PIPE_SHADER_FRAGMENT
- * FragDepth is always first TGSI and last HW output
- */
- int rid = 0;
- i = pc->p->info.writes_z ? 4 : 0;
+ if (!base)
+ p->cfg.regs[1] |= p->cfg.io[0].mask << 24;
+
+ m = popcnt4(p->cfg.regs[1] >> 24);
+
+ /* set count of non-position inputs and of non-flat
+ * non-position inputs for FP_INTERPOLANT_CTRL
+ */
+ p->cfg.regs[1] |= aid - m;
+
+ if (flat_nr) {
+ i = p->cfg.io[pc->attr_nr - flat_nr].hw;
+ p->cfg.regs[1] |= (i - m) << 16;
+ } else
+ p->cfg.regs[1] |= p->cfg.regs[1] << 16;
+
+ /* mark color semantic for light-twoside */
+ n = 0x40;
+ for (i = 0; i < pc->attr_nr; i++) {
+ ubyte si, sn;
- for (; i < pc->result_nr * 4; i++)
- pc->result[i].rhw = rid++;
- if (pc->p->info.writes_z)
- pc->result[2].rhw = rid;
+ sn = p->info.input_semantic_name[p->cfg.io[i].id_fp];
+ si = p->info.input_semantic_index[p->cfg.io[i].id_fp];
+
+ if (sn == TGSI_SEMANTIC_COLOR) {
+ p->cfg.two_side[si] = p->cfg.io[i];
+
+ /* increase colour count */
+ p->cfg.regs[0] += popcnt4(
+ p->cfg.two_side[si].mask) << 16;
+
+ n = MIN2(n, p->cfg.io[i].hw - m);
+ }
}
+ if (n < 0x40)
+ p->cfg.regs[0] += n;
+
+ /* Initialize FP results:
+ * FragDepth is always first TGSI and last hw output
+ */
+ i = p->info.writes_z ? 4 : 0;
+ for (rid = 0; i < pc->result_nr * 4; i++)
+ pc->result[i].rhw = rid++;
+ if (p->info.writes_z)
+ pc->result[2].rhw = rid;
}
if (pc->immd_nr) {
@@ -2214,7 +2234,12 @@ nv50_program_tx_prep(struct nv50_pc *pc)
ret = TRUE;
out_err:
- tgsi_parse_free(&p);
+ if (pc->iv_p)
+ free_temp(pc, pc->iv_p);
+ if (pc->iv_c)
+ free_temp(pc, pc->iv_c);
+
+ tgsi_parse_free(&tp);
return ret;
}
@@ -2249,24 +2274,26 @@ ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
p->cfg.high_temp = 4;
+ p->cfg.two_side[0].hw = 0x40;
+ p->cfg.two_side[1].hw = 0x40;
+
switch (p->type) {
case PIPE_SHADER_VERTEX:
+ p->cfg.clpd = 0x40;
+ p->cfg.io_nr = pc->result_nr;
break;
case PIPE_SHADER_FRAGMENT:
- p->cfg.fp.regs[0] = 0x01000404;
- p->cfg.fp.regs[1] = 0x00000400;
-
- p->cfg.fp.map[0] = 0x03020100;
- p->cfg.fp.high_map = 1;
-
rtype[0] = rtype[1] = P_TEMP;
+ p->cfg.regs[0] = 0x01000004;
+ p->cfg.io_nr = pc->attr_nr;
+
if (p->info.writes_z) {
- p->cfg.fp.regs[2] |= 0x00000100;
- p->cfg.fp.regs[3] |= 0x00000011;
+ p->cfg.regs[2] |= 0x00000100;
+ p->cfg.regs[3] |= 0x00000011;
}
if (p->info.uses_kill)
- p->cfg.fp.regs[2] |= 0x00100000;
+ p->cfg.regs[2] |= 0x00100000;
break;
}
@@ -2609,8 +2636,8 @@ nv50_vertprog_validate(struct nv50_context *nv50)
so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
NOUVEAU_BO_LOW, 0, 0);
so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
- so_data (so, p->cfg.vp.attr[0]);
- so_data (so, p->cfg.vp.attr[1]);
+ so_data (so, p->cfg.attr[0]);
+ so_data (so, p->cfg.attr[1]);
so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
so_data (so, p->cfg.high_result);
so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
@@ -2628,7 +2655,6 @@ nv50_fragprog_validate(struct nv50_context *nv50)
struct nouveau_grobj *tesla = nv50->screen->tesla;
struct nv50_program *p = nv50->fragprog;
struct nouveau_stateobj *so;
- unsigned i;
if (!p->translated) {
nv50_program_validate(nv50, p);
@@ -2645,29 +2671,119 @@ nv50_fragprog_validate(struct nv50_context *nv50)
NOUVEAU_BO_HIGH, 0, 0);
so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
NOUVEAU_BO_LOW, 0, 0);
- so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
- so_data (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */
- so_data (so, 0x00000004);
- so_data (so, 0x00000000);
- so_data (so, 0x00000000);
- so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), p->cfg.fp.high_map);
- for (i = 0; i < p->cfg.fp.high_map; i++)
- so_data(so, p->cfg.fp.map[i]);
- so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 2);
- so_data (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */
+ so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
so_data (so, p->cfg.high_temp);
so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
so_data (so, p->cfg.high_result);
so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
- so_data (so, p->cfg.fp.regs[2]);
+ so_data (so, p->cfg.regs[2]);
so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
- so_data (so, p->cfg.fp.regs[3]);
+ so_data (so, p->cfg.regs[3]);
so_method(so, tesla, NV50TCL_FP_START_ID, 1);
so_data (so, 0); /* program start offset */
so_ref(so, &nv50->state.fragprog);
so_ref(NULL, &so);
}
+static int
+nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
+ struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
+{
+ int c;
+ uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
+ uint8_t *map = (uint8_t *)p_map;
+
+ for (c = 0; c < 4; ++c) {
+ if (mf & 1) {
+ if (fpi->linear == TRUE)
+ lin[mid / 32] |= 1 << (mid % 32);
+ map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40);
+ }
+
+ oid += mv & 1;
+ mf >>= 1;
+ mv >>= 1;
+ }
+
+ return mid;
+}
+
+void
+nv50_linkage_validate(struct nv50_context *nv50)
+{
+ struct nouveau_grobj *tesla = nv50->screen->tesla;
+ struct nv50_program *vp = nv50->vertprog;
+ struct nv50_program *fp = nv50->fragprog;
+ struct nouveau_stateobj *so;
+ struct nv50_sreg4 dummy, *vpo;
+ int i, n, c, m = 0;
+ uint32_t map[16], lin[4], reg[5];
+
+ memset(map, 0, sizeof(map));
+ memset(lin, 0, sizeof(lin));
+
+ reg[1] = 0x00000004; /* low and high clip distance map ids */
+ reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
+ reg[3] = 0x00000000; /* point size map id & enable */
+ reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
+ reg[4] = fp->cfg.regs[1]; /* interpolant info */
+
+ dummy.linear = FALSE;
+ dummy.mask = 0xf; /* map all components of HPOS */
+ m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]);
+
+ dummy.mask = 0x0;
+
+ if (vp->cfg.clpd < 0x40) {
+ for (c = 0; c < vp->cfg.clpd_nr; ++c)
+ map[m++] = vp->cfg.clpd + c;
+ reg[1] = (m << 8);
+ }
+
+ reg[0] |= m << 8; /* adjust BFC0 id */
+ reg[0] += m - 4; /* adjust FFC0 id */
+ reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */
+
+ i = 0;
+ if (fp->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION)
+ i = 1;
+ for (; i < fp->cfg.io_nr; i++) {
+ ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id_fp];
+ ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id_fp];
+
+ n = fp->cfg.io[i].id_vp;
+ if (n >= vp->cfg.io_nr ||
+ vp->info.output_semantic_name[n] != sn ||
+ vp->info.output_semantic_index[n] != si)
+ vpo = &dummy;
+ else
+ vpo = &vp->cfg.io[n];
+
+ m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo);
+ }
+
+ /* now fill the stateobj */
+ so = so_new(64, 0);
+
+ n = (m + 3) / 4;
+ so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
+ so_data (so, m);
+ so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
+ so_datap (so, map, n);
+
+ so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
+ so_datap (so, reg, 4);
+
+ so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
+ so_data (so, reg[4]);
+
+ so_method(so, tesla, 0x1540, 4);
+ so_datap (so, lin, 4);
+
+ so_ref(so, &nv50->state.programs);
+ so_ref(NULL, &so);
+}
+
void
nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
{