diff options
-rw-r--r-- | src/gallium/drivers/nvfx/nvfx_context.c | 2 | ||||
-rw-r--r-- | src/gallium/drivers/nvfx/nvfx_context.h | 2 | ||||
-rw-r--r-- | src/gallium/drivers/nvfx/nvfx_fragprog.c | 185 | ||||
-rw-r--r-- | src/gallium/drivers/nvfx/nvfx_state.h | 3 | ||||
-rw-r--r-- | src/gallium/drivers/nvfx/nvfx_state_emit.c | 2 | ||||
-rw-r--r-- | src/gallium/drivers/nvfx/nvfx_vertprog.c | 49 |
6 files changed, 149 insertions, 94 deletions
diff --git a/src/gallium/drivers/nvfx/nvfx_context.c b/src/gallium/drivers/nvfx/nvfx_context.c index 8e852010f57..e78fc14da44 100644 --- a/src/gallium/drivers/nvfx/nvfx_context.c +++ b/src/gallium/drivers/nvfx/nvfx_context.c @@ -90,6 +90,8 @@ nvfx_create(struct pipe_screen *pscreen, void *priv) /* set these to that we init them on first validation */ nvfx->state.scissor_enabled = ~0; + nvfx->hw_pointsprite_control = -1; + nvfx->hw_vp_output = -1; nvfx->use_vertex_buffers = -1; LIST_INITHEAD(&nvfx->render_cache); diff --git a/src/gallium/drivers/nvfx/nvfx_context.h b/src/gallium/drivers/nvfx/nvfx_context.h index 63fbce87b50..fb4a9da5792 100644 --- a/src/gallium/drivers/nvfx/nvfx_context.h +++ b/src/gallium/drivers/nvfx/nvfx_context.h @@ -193,6 +193,8 @@ struct nvfx_context { uint32_t hw_txf[8]; struct nvfx_render_target hw_rt[4]; struct nvfx_render_target hw_zeta; + int hw_pointsprite_control; + int hw_vp_output; }; static INLINE struct nvfx_context * diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c index 025989ac5bb..e40a814e18c 100644 --- a/src/gallium/drivers/nvfx/nvfx_fragprog.c +++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c @@ -1101,6 +1101,35 @@ nvfx_fp_memcpy(void* dst, const void* src, size_t len) #endif } +/* The hardware only supports immediate constants inside the fragment program, + * and at least on nv30 doesn't support an indirect linkage table. + * + * Hence, we need to patch the fragment program itself both to update constants + * and update linkage. + * + * Using a single fragment program would entail unacceptable stalls if the GPU is + * already rendering with that fragment program. + * Thus, we instead use a "rotating queue" of buffer objects, each of which is + * packed with multiple versions of the same program. + * + * Whenever we need to patch something, we move to the next program and + * patch it. If all buffer objects are in use by the GPU, we allocate another one, + * expanding the queue. + * + * As an additional optimization, we record when all the programs have the + * current input slot configuration, and at that point we stop patching inputs. + * This happens, for instance, if a given fragment program is always used with + * the same vertex program (i.e. always with GLSL), or if the layouts match + * enough (non-GLSL). + * + * Note that instead of using multiple programs, we could push commands + * on the FIFO to patch a single program: it's not fully clear which option is + * faster, but my guess is that the current way is faster. + * + * We also track the previous slot assignments for each version and don't + * patch if they are the same (this could perhaps be removed). + */ + void nvfx_fragprog_validate(struct nvfx_context *nvfx) { @@ -1109,6 +1138,7 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) int update = 0; struct nvfx_vertex_program* vp; unsigned sprite_coord_enable; + boolean update_pointsprite = !!(nvfx->dirty & NVFX_NEW_FRAGPROG); if (!fp->translated) { @@ -1141,80 +1171,96 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) fp->bo_prog_idx = fp->progs_per_bo - 1; } - /* we must update constants even on "just" fragprog changes, because - we don't check whether the current constant buffer matches the latest - one bound to this fragment program */ - if (nvfx->dirty & (NVFX_NEW_FRAGCONST | NVFX_NEW_FRAGPROG)) - update = TRUE; - vp = nvfx->render_mode == HW ? nvfx->vertprog : nvfx->swtnl.vertprog; - if (fp->last_vp_id != vp->id) { - char* vp_sem_table = vp->generic_to_fp_input; - unsigned char* fp_semantics = fp->slot_to_generic; - unsigned diff = 0; - unsigned char* cur_slots; - fp->last_vp_id = nvfx->vertprog->id; - cur_slots = fp->slot_to_fp_input; - for(unsigned i = 0; i < fp->num_slots; ++i) { - unsigned char slot_mask = vp_sem_table[fp_semantics[i]]; - diff |= (slot_mask >> 4) & (slot_mask ^ cur_slots[i]); - } + sprite_coord_enable = nvfx->rasterizer->pipe.point_quad_rasterization * nvfx->rasterizer->pipe.sprite_coord_enable; + + if (fp->last_vp_id != vp->id || fp->last_sprite_coord_enable != sprite_coord_enable) { + int sprite_input = -1; + unsigned i; + fp->last_vp_id = vp->id; + fp->last_sprite_coord_enable = sprite_coord_enable; - if(diff) + if(sprite_coord_enable) { - for(unsigned i = 0; i < fp->num_slots; ++i) { - /* if 0xff, then this will write to the dummy value at fp->last_layout_mask[0] */ - fp->slot_to_fp_input[i] = vp_sem_table[fp_semantics[i]] & 0xf; - //printf("fp: GENERIC[%i] from fpreg %i\n", fp_semantics[i], fp->slot_to_fp_input[i]); + sprite_input = vp->sprite_fp_input; + if(sprite_input < 0) + { + unsigned used_texcoords = 0; + for(unsigned i = 0; i < fp->num_slots; ++i) { + unsigned generic = fp->slot_to_generic[i]; + if(!((1 << generic) & sprite_coord_enable)) + { + unsigned char slot_mask = vp->generic_to_fp_input[generic]; + if(slot_mask >= 0xf0) + used_texcoords |= 1 << ((slot_mask & 0xf) - NVFX_FP_OP_INPUT_SRC_TC0); + } + } + + sprite_input = NVFX_FP_OP_INPUT_SRC_TC(__builtin_ctz(~used_texcoords)); } - fp->progs_left_with_obsolete_slot_assignments = fp->progs; - update = TRUE; + fp->point_sprite_control |= (1 << (sprite_input - NVFX_FP_OP_INPUT_SRC_TC0 + 8)); } - } + else + fp->point_sprite_control = 0; - // last_sprite_coord_enable - sprite_coord_enable = nvfx->rasterizer->pipe.point_quad_rasterization * nvfx->rasterizer->pipe.sprite_coord_enable; - if(fp->last_sprite_coord_enable != sprite_coord_enable) - { - unsigned texcoord_mask = vp->texcoord_ouput_mask; - fp->last_sprite_coord_enable = sprite_coord_enable; - fp->point_sprite_control = 0; - for(unsigned i = 0; i < fp->num_slots; ++i) { - if((1 << fp->slot_to_generic[i]) & sprite_coord_enable) + for(i = 0; i < fp->num_slots; ++i) { + unsigned generic = fp->slot_to_generic[i]; + if((1 << generic) & sprite_coord_enable) { - unsigned fpin = fp->slot_to_fp_input[i]; - //printf("sprite: slot %i generic %i had texcoord %i\n", i, fp->slot_to_generic[i], fpin - NVFX_FP_OP_INPUT_SRC_TC0); - if(fpin >= 0x0f) - { - unsigned tc = __builtin_ctz(~texcoord_mask); - texcoord_mask |= (1 << tc); - fp->slot_to_fp_input[i] = fpin = NVFX_FP_OP_INPUT_SRC_TC(tc); - - fp->progs_left_with_obsolete_slot_assignments = fp->progs; - update = TRUE; - } - //printf("sprite: slot %i texcoord %i\n", i, fpin - NVFX_FP_OP_INPUT_SRC_TC0); - fp->point_sprite_control |= (1 << (fpin - NVFX_FP_OP_INPUT_SRC_TC0 + 8)); + if(fp->slot_to_fp_input[i] != sprite_input) + goto update_slots; } else { - unsigned fpin = fp->slot_to_fp_input[i]; - if(!(vp->texcoord_ouput_mask & (1 << (fpin - NVFX_FP_OP_INPUT_SRC_TC0)))) - { - fp->slot_to_fp_input[i] = 0x0f; + unsigned char slot_mask = vp->generic_to_fp_input[generic]; + if((slot_mask >> 4) & (slot_mask ^ fp->slot_to_fp_input[i])) + goto update_slots; + } + } + + if(0) + { +update_slots: + /* optimization: we start updating from the slot we found the first difference in */ + for(; i < fp->num_slots; ++i) + { + unsigned generic = fp->slot_to_generic[i]; + if((1 << generic) & sprite_coord_enable) + fp->slot_to_fp_input[i] = sprite_input; + else + fp->slot_to_fp_input[i] = vp->generic_to_fp_input[generic] & 0xf; + } - fp->progs_left_with_obsolete_slot_assignments = fp->progs; - update = TRUE; + if(nvfx->is_nv4x) + { + fp->or = 0; + for(i = 0; i < fp->num_slots; ++i) { + unsigned fp_input = fp->slot_to_fp_input[i]; + if(fp_input == NVFX_FP_OP_INPUT_SRC_TC(8)) + fp->or |= (1 << 12); + else if(fp_input == NVFX_FP_OP_INPUT_SRC_TC(9)) + fp->or |= (1 << 13); + else if(fp_input != 0xf) + fp->or |= (1 << (fp_input - NVFX_FP_OP_INPUT_SRC_TC0 + 14)); } } + + fp->progs_left_with_obsolete_slot_assignments = fp->progs; + goto update; } } - if(update) { + /* We must update constants even on "just" fragprog changes, because + * we don't check whether the current constant buffer matches the latest + * one bound to this fragment program. + * Doing such a check would likely be a pessimization. + */ + if (nvfx->dirty & (NVFX_NEW_FRAGCONST | NVFX_NEW_FRAGPROG)) { int offset; uint32_t* fpmap; +update: ++fp->bo_prog_idx; if(fp->bo_prog_idx >= fp->progs_per_bo) { @@ -1278,6 +1324,9 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) } } + /* we only do this if we aren't sure that all program versions have the + * current slot assignments, otherwise we just update constants for speed + */ if(fp->progs_left_with_obsolete_slot_assignments) { unsigned char* fpbo_slots = &fp->fpbo->slots[fp->bo_prog_idx * 8]; for(unsigned i = 0; i < fp->num_slots; ++i) { @@ -1296,10 +1345,7 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) } --fp->progs_left_with_obsolete_slot_assignments; } - } - if(update || (nvfx->dirty & NVFX_NEW_FRAGPROG)) { - int offset = fp->bo_prog_idx * fp->prog_size; MARK_RING(chan, 8, 1); OUT_RING(chan, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1)); OUT_RELOC(chan, fp->fpbo->bo, offset, NOUVEAU_BO_VRAM | @@ -1316,11 +1362,28 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) } } - if(nvfx->dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_SPRITE)) { - WAIT_RING(chan, 2); - OUT_RING(chan, RING_3D(NV34TCL_POINT_SPRITE, 1)); - OUT_RING(chan, fp->point_sprite_control | nvfx->rasterizer->pipe.point_quad_rasterization); + unsigned pointsprite_control = fp->point_sprite_control | nvfx->rasterizer->pipe.point_quad_rasterization; + if(pointsprite_control != nvfx->hw_pointsprite_control) + { + WAIT_RING(chan, 2); + OUT_RING(chan, RING_3D(NV34TCL_POINT_SPRITE, 1)); + OUT_RING(chan, pointsprite_control); + nvfx->hw_pointsprite_control = pointsprite_control; + } + } + + if(nvfx->is_nv4x) + { + unsigned vp_output = vp->or | fp->or; + + if(vp_output != nvfx->hw_vp_output) + { + WAIT_RING(chan, 2); + OUT_RING(chan, RING_3D(NV40TCL_VP_RESULT_EN, 1)); + OUT_RING(chan, vp_output); + nvfx->hw_vp_output = vp_output; + } } } diff --git a/src/gallium/drivers/nvfx/nvfx_state.h b/src/gallium/drivers/nvfx/nvfx_state.h index 1247abcfa21..05d41cfc8dd 100644 --- a/src/gallium/drivers/nvfx/nvfx_state.h +++ b/src/gallium/drivers/nvfx/nvfx_state.h @@ -32,7 +32,7 @@ struct nvfx_vertex_program { unsigned nr_consts; char generic_to_fp_input[256]; - unsigned texcoord_ouput_mask; + int sprite_fp_input; struct nouveau_resource *exec; unsigned exec_start; @@ -67,6 +67,7 @@ struct nvfx_fragment_program { boolean translated; unsigned samplers; unsigned point_sprite_control; + unsigned or; uint32_t *insn; int insn_len; diff --git a/src/gallium/drivers/nvfx/nvfx_state_emit.c b/src/gallium/drivers/nvfx/nvfx_state_emit.c index 8e3c342179d..bd89a385d7c 100644 --- a/src/gallium/drivers/nvfx/nvfx_state_emit.c +++ b/src/gallium/drivers/nvfx/nvfx_state_emit.c @@ -17,6 +17,8 @@ nvfx_state_validate_common(struct nvfx_context *nvfx) { nvfx->dirty = ~0; nvfx->hw_vtxelt_nr = 16; + nvfx->hw_pointsprite_control = -1; + nvfx->hw_vp_output = -1; nvfx->screen->cur_ctx = nvfx; } diff --git a/src/gallium/drivers/nvfx/nvfx_vertprog.c b/src/gallium/drivers/nvfx/nvfx_vertprog.c index 3c3521e1622..806f263dcff 100644 --- a/src/gallium/drivers/nvfx/nvfx_vertprog.c +++ b/src/gallium/drivers/nvfx/nvfx_vertprog.c @@ -235,24 +235,10 @@ emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot dst.index = NVFX_VP(INST_DEST_PSZ); break; default: - if(!nvfx->is_nv4x) { - switch (dst.index) { - case NV30_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break; - case NV30_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break; - case NV30_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break; - case NV30_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break; - case NV30_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break; - case NV30_VP_INST_DEST_PSZ : vp->or |= (1 << 5); break; - case NV30_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break; - case NV30_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break; - case NV30_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break; - case NV30_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break; - case NV30_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break; - case NV30_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break; - case NV30_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break; - case NV30_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break; - } - } else { + if(nvfx->is_nv4x) { + /* we don't need vp->or on nv3x + * texcoords are handled by fragment program + */ switch (dst.index) { case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break; case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break; @@ -260,14 +246,6 @@ emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break; case NV40_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break; case NV40_VP_INST_DEST_PSZ : vp->or |= (1 << 5); break; - case NV40_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break; - case NV40_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break; - case NV40_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break; - case NV40_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break; - case NV40_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break; - case NV40_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break; - case NV40_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break; - case NV40_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break; } } break; @@ -817,13 +795,21 @@ nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc) /* hope 0xf is (0, 0, 0, 1) initialized; otherwise, we are _probably_ not required to do this */ memset(vpc->vp->generic_to_fp_input, 0x0f, sizeof(vpc->vp->generic_to_fp_input)); - vpc->vp->texcoord_ouput_mask = 0; for(int i = 0; i < 8; ++i) { if(sem_layout[i] == 0xff) continue; - vpc->vp->texcoord_ouput_mask |= (1 << i); //printf("vp: GENERIC[%i] to fpreg %i\n", sem_layout[i], NVFX_FP_OP_INPUT_SRC_TC(0) + i); - vpc->vp->generic_to_fp_input[sem_layout[i]] = 0xf0 | (NVFX_FP_OP_INPUT_SRC_TC(0) + i); + vpc->vp->generic_to_fp_input[sem_layout[i]] = 0xf0 | NVFX_FP_OP_INPUT_SRC_TC(i); + } + + vpc->vp->sprite_fp_input = -1; + for(int i = 0; i < 8; ++i) + { + if(sem_layout[i] == 0xff) + { + vpc->vp->sprite_fp_input = NVFX_FP_OP_INPUT_SRC_TC(i); + break; + } } tgsi_parse_init(&p, vpc->vp->pipe.tokens); @@ -1233,13 +1219,12 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx) if(nvfx->dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_UCP)) { - WAIT_RING(chan, 7); + WAIT_RING(chan, 6); OUT_RING(chan, RING_3D(NV34TCL_VP_START_FROM_ID, 1)); OUT_RING(chan, vp->exec->start); if(nvfx->is_nv4x) { - OUT_RING(chan, RING_3D(NV40TCL_VP_ATTRIB_EN, 2)); + OUT_RING(chan, RING_3D(NV40TCL_VP_ATTRIB_EN, 1)); OUT_RING(chan, vp->ir); - OUT_RING(chan, vp->or); } OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANES_ENABLE, 1)); OUT_RING(chan, vp->clip_ctrl); |