diff options
Diffstat (limited to 'src/gallium/drivers/i965/brw_curbe.c')
-rw-r--r-- | src/gallium/drivers/i965/brw_curbe.c | 376 |
1 files changed, 376 insertions, 0 deletions
diff --git a/src/gallium/drivers/i965/brw_curbe.c b/src/gallium/drivers/i965/brw_curbe.c new file mode 100644 index 00000000000..4be6c77aa1e --- /dev/null +++ b/src/gallium/drivers/i965/brw_curbe.c @@ -0,0 +1,376 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + + +#include "main/glheader.h" +#include "main/context.h" +#include "main/macros.h" +#include "main/enums.h" +#include "shader/prog_parameter.h" +#include "shader/prog_print.h" +#include "shader/prog_statevars.h" +#include "intel_batchbuffer.h" +#include "intel_regions.h" +#include "brw_context.h" +#include "brw_defines.h" +#include "brw_state.h" +#include "brw_util.h" + + +/** + * Partition the CURBE between the various users of constant values: + * Note that vertex and fragment shaders can now fetch constants out + * of constant buffers. We no longer allocatea block of the GRF for + * constants. That greatly reduces the demand for space in the CURBE. + * Some of the comments within are dated... + */ +static void calculate_curbe_offsets( struct brw_context *brw ) +{ + GLcontext *ctx = &brw->intel.ctx; + /* CACHE_NEW_WM_PROG */ + const GLuint nr_fp_regs = (brw->wm.prog_data->nr_params + 15) / 16; + + /* BRW_NEW_VERTEX_PROGRAM */ + const GLuint nr_vp_regs = (brw->vs.prog_data->nr_params + 15) / 16; + GLuint nr_clip_regs = 0; + GLuint total_regs; + + /* _NEW_TRANSFORM */ + if (ctx->Transform.ClipPlanesEnabled) { + GLuint nr_planes = 6 + brw_count_bits(ctx->Transform.ClipPlanesEnabled); + nr_clip_regs = (nr_planes * 4 + 15) / 16; + } + + + total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs; + + /* This can happen - what to do? Probably rather than falling + * back, the best thing to do is emit programs which code the + * constants as immediate values. Could do this either as a static + * cap on WM and VS, or adaptively. + * + * Unfortunately, this is currently dependent on the results of the + * program generation process (in the case of wm), so this would + * introduce the need to re-generate programs in the event of a + * curbe allocation failure. + */ + /* Max size is 32 - just large enough to + * hold the 128 parameters allowed by + * the fragment and vertex program + * api's. It's not clear what happens + * when both VP and FP want to use 128 + * parameters, though. + */ + assert(total_regs <= 32); + + /* Lazy resize: + */ + if (nr_fp_regs > brw->curbe.wm_size || + nr_vp_regs > brw->curbe.vs_size || + nr_clip_regs != brw->curbe.clip_size || + (total_regs < brw->curbe.total_size / 4 && + brw->curbe.total_size > 16)) { + + GLuint reg = 0; + + /* Calculate a new layout: + */ + reg = 0; + brw->curbe.wm_start = reg; + brw->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs; + brw->curbe.clip_start = reg; + brw->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs; + brw->curbe.vs_start = reg; + brw->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs; + brw->curbe.total_size = reg; + + if (0) + _mesa_printf("curbe wm %d+%d clip %d+%d vs %d+%d\n", + brw->curbe.wm_start, + brw->curbe.wm_size, + brw->curbe.clip_start, + brw->curbe.clip_size, + brw->curbe.vs_start, + brw->curbe.vs_size ); + + brw->state.dirty.brw |= BRW_NEW_CURBE_OFFSETS; + } +} + + +const struct brw_tracked_state brw_curbe_offsets = { + .dirty = { + .mesa = _NEW_TRANSFORM, + .brw = BRW_NEW_VERTEX_PROGRAM, + .cache = CACHE_NEW_WM_PROG + }, + .prepare = calculate_curbe_offsets +}; + + + + +/* Define the number of curbes within CS's urb allocation. Multiple + * urb entries -> multiple curbes. These will be used by + * fixed-function hardware in a double-buffering scheme to avoid a + * pipeline stall each time the contents of the curbe is changed. + */ +void brw_upload_cs_urb_state(struct brw_context *brw) +{ + struct brw_cs_urb_state cs_urb; + memset(&cs_urb, 0, sizeof(cs_urb)); + + /* It appears that this is the state packet for the CS unit, ie. the + * urb entries detailed here are housed in the CS range from the + * URB_FENCE command. + */ + cs_urb.header.opcode = CMD_CS_URB_STATE; + cs_urb.header.length = sizeof(cs_urb)/4 - 2; + + /* BRW_NEW_URB_FENCE */ + cs_urb.bits0.nr_urb_entries = brw->urb.nr_cs_entries; + cs_urb.bits0.urb_entry_size = brw->urb.csize - 1; + + assert(brw->urb.nr_cs_entries); + BRW_CACHED_BATCH_STRUCT(brw, &cs_urb); +} + +static GLfloat fixed_plane[6][4] = { + { 0, 0, -1, 1 }, + { 0, 0, 1, 1 }, + { 0, -1, 0, 1 }, + { 0, 1, 0, 1 }, + {-1, 0, 0, 1 }, + { 1, 0, 0, 1 } +}; + +/* Upload a new set of constants. Too much variability to go into the + * cache mechanism, but maybe would benefit from a comparison against + * the current uploaded set of constants. + */ +static void prepare_constant_buffer(struct brw_context *brw) +{ + GLcontext *ctx = &brw->intel.ctx; + const struct brw_vertex_program *vp = + brw_vertex_program_const(brw->vertex_program); + const struct brw_fragment_program *fp = + brw_fragment_program_const(brw->fragment_program); + const GLuint sz = brw->curbe.total_size; + const GLuint bufsz = sz * 16 * sizeof(GLfloat); + GLfloat *buf; + GLuint i; + + if (sz == 0) { + if (brw->curbe.last_buf) { + free(brw->curbe.last_buf); + brw->curbe.last_buf = NULL; + brw->curbe.last_bufsz = 0; + } + return; + } + + buf = (GLfloat *) _mesa_calloc(bufsz); + + /* fragment shader constants */ + if (brw->curbe.wm_size) { + GLuint offset = brw->curbe.wm_start * 16; + + _mesa_load_state_parameters(ctx, fp->program.Base.Parameters); + + /* copy float constants */ + for (i = 0; i < brw->wm.prog_data->nr_params; i++) + buf[offset + i] = *brw->wm.prog_data->param[i]; + } + + + /* The clipplanes are actually delivered to both CLIP and VS units. + * VS uses them to calculate the outcode bitmasks. + */ + if (brw->curbe.clip_size) { + GLuint offset = brw->curbe.clip_start * 16; + GLuint j; + + /* If any planes are going this way, send them all this way: + */ + for (i = 0; i < 6; i++) { + buf[offset + i * 4 + 0] = fixed_plane[i][0]; + buf[offset + i * 4 + 1] = fixed_plane[i][1]; + buf[offset + i * 4 + 2] = fixed_plane[i][2]; + buf[offset + i * 4 + 3] = fixed_plane[i][3]; + } + + /* Clip planes: _NEW_TRANSFORM plus _NEW_PROJECTION to get to + * clip-space: + */ + assert(MAX_CLIP_PLANES == 6); + for (j = 0; j < MAX_CLIP_PLANES; j++) { + if (ctx->Transform.ClipPlanesEnabled & (1<<j)) { + buf[offset + i * 4 + 0] = ctx->Transform._ClipUserPlane[j][0]; + buf[offset + i * 4 + 1] = ctx->Transform._ClipUserPlane[j][1]; + buf[offset + i * 4 + 2] = ctx->Transform._ClipUserPlane[j][2]; + buf[offset + i * 4 + 3] = ctx->Transform._ClipUserPlane[j][3]; + i++; + } + } + } + + /* vertex shader constants */ + if (brw->curbe.vs_size) { + GLuint offset = brw->curbe.vs_start * 16; + GLuint nr = brw->vs.prog_data->nr_params / 4; + + if (brw->vertex_program->IsNVProgram) + _mesa_load_tracked_matrices(ctx); + + /* Updates the ParamaterValues[i] pointers for all parameters of the + * basic type of PROGRAM_STATE_VAR. + */ + _mesa_load_state_parameters(ctx, vp->program.Base.Parameters); + + /* XXX just use a memcpy here */ + for (i = 0; i < nr; i++) { + const GLfloat *value = vp->program.Base.Parameters->ParameterValues[i]; + buf[offset + i * 4 + 0] = value[0]; + buf[offset + i * 4 + 1] = value[1]; + buf[offset + i * 4 + 2] = value[2]; + buf[offset + i * 4 + 3] = value[3]; + } + } + + if (0) { + for (i = 0; i < sz*16; i+=4) + _mesa_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4, + buf[i+0], buf[i+1], buf[i+2], buf[i+3]); + + _mesa_printf("last_buf %p buf %p sz %d/%d cmp %d\n", + brw->curbe.last_buf, buf, + bufsz, brw->curbe.last_bufsz, + brw->curbe.last_buf ? memcmp(buf, brw->curbe.last_buf, bufsz) : -1); + } + + if (brw->curbe.curbe_bo != NULL && + brw->curbe.last_buf && + bufsz == brw->curbe.last_bufsz && + memcmp(buf, brw->curbe.last_buf, bufsz) == 0) { + /* constants have not changed */ + _mesa_free(buf); + } + else { + /* constants have changed */ + if (brw->curbe.last_buf) + _mesa_free(brw->curbe.last_buf); + + brw->curbe.last_buf = buf; + brw->curbe.last_bufsz = bufsz; + + if (brw->curbe.curbe_bo != NULL && + (brw->curbe.need_new_bo || + brw->curbe.curbe_next_offset + bufsz > brw->curbe.curbe_bo->size)) + { + dri_bo_unreference(brw->curbe.curbe_bo); + brw->curbe.curbe_bo = NULL; + } + + if (brw->curbe.curbe_bo == NULL) { + /* Allocate a single page for CURBE entries for this batchbuffer. + * They're generally around 64b. + */ + brw->curbe.curbe_bo = dri_bo_alloc(brw->intel.bufmgr, "CURBE", + 4096, 1 << 6); + brw->curbe.curbe_next_offset = 0; + } + + brw->curbe.curbe_offset = brw->curbe.curbe_next_offset; + brw->curbe.curbe_next_offset += bufsz; + brw->curbe.curbe_next_offset = ALIGN(brw->curbe.curbe_next_offset, 64); + + /* Copy data to the buffer: + */ + dri_bo_subdata(brw->curbe.curbe_bo, brw->curbe.curbe_offset, bufsz, buf); + } + + brw_add_validated_bo(brw, brw->curbe.curbe_bo); + + /* Because this provokes an action (ie copy the constants into the + * URB), it shouldn't be shortcircuited if identical to the + * previous time - because eg. the urb destination may have + * changed, or the urb contents different to last time. + * + * Note that the data referred to is actually copied internally, + * not just used in place according to passed pointer. + * + * It appears that the CS unit takes care of using each available + * URB entry (Const URB Entry == CURBE) in turn, and issuing + * flushes as necessary when doublebuffering of CURBEs isn't + * possible. + */ +} + +static void emit_constant_buffer(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + GLuint sz = brw->curbe.total_size; + + BEGIN_BATCH(2, IGNORE_CLIPRECTS); + if (sz == 0) { + OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2)); + OUT_BATCH(0); + } else { + OUT_BATCH((CMD_CONST_BUFFER << 16) | (1 << 8) | (2 - 2)); + OUT_RELOC(brw->curbe.curbe_bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + (sz - 1) + brw->curbe.curbe_offset); + } + ADVANCE_BATCH(); +} + +/* This tracked state is unique in that the state it monitors varies + * dynamically depending on the parameters tracked by the fragment and + * vertex programs. This is the template used as a starting point, + * each context will maintain a copy of this internally and update as + * required. + */ +const struct brw_tracked_state brw_constant_buffer = { + .dirty = { + .mesa = _NEW_PROGRAM_CONSTANTS, + .brw = (BRW_NEW_FRAGMENT_PROGRAM | + BRW_NEW_VERTEX_PROGRAM | + BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */ + BRW_NEW_PSP | /* Implicit - hardware requires this, not used above */ + BRW_NEW_CURBE_OFFSETS | + BRW_NEW_BATCH), + .cache = (CACHE_NEW_WM_PROG) + }, + .prepare = prepare_constant_buffer, + .emit = emit_constant_buffer, +}; + |