diff options
Diffstat (limited to 'src/gallium/drivers')
68 files changed, 29208 insertions, 0 deletions
diff --git a/src/gallium/drivers/i965/Makefile b/src/gallium/drivers/i965/Makefile new file mode 100644 index 00000000000..7a55333e896 --- /dev/null +++ b/src/gallium/drivers/i965/Makefile @@ -0,0 +1,104 @@ + +TOP = ../../../../.. +include $(TOP)/configs/current + +LIBNAME = i965_dri.so + +DRIVER_SOURCES = \ + intel_batchbuffer.c \ + intel_blit.c \ + intel_buffer_objects.c \ + intel_buffers.c \ + intel_clear.c \ + intel_context.c \ + intel_decode.c \ + intel_extensions.c \ + intel_fbo.c \ + intel_mipmap_tree.c \ + intel_regions.c \ + intel_screen.c \ + intel_span.c \ + intel_pixel.c \ + intel_pixel_bitmap.c \ + intel_pixel_copy.c \ + intel_pixel_draw.c \ + intel_pixel_read.c \ + intel_state.c \ + intel_swapbuffers.c \ + intel_syncobj.c \ + intel_tex.c \ + intel_tex_copy.c \ + intel_tex_format.c \ + intel_tex_image.c \ + intel_tex_layout.c \ + intel_tex_subimage.c \ + intel_tex_validate.c \ + brw_cc.c \ + brw_clip.c \ + brw_clip_line.c \ + brw_clip_point.c \ + brw_clip_state.c \ + brw_clip_tri.c \ + brw_clip_unfilled.c \ + brw_clip_util.c \ + brw_context.c \ + brw_curbe.c \ + brw_disasm.c \ + brw_draw.c \ + brw_draw_upload.c \ + brw_eu.c \ + brw_eu_debug.c \ + brw_eu_emit.c \ + brw_eu_util.c \ + brw_fallback.c \ + brw_gs.c \ + brw_gs_emit.c \ + brw_gs_state.c \ + brw_misc_state.c \ + brw_program.c \ + brw_queryobj.c \ + brw_sf.c \ + brw_sf_emit.c \ + brw_sf_state.c \ + brw_state_batch.c \ + brw_state_cache.c \ + brw_state_dump.c \ + brw_state_upload.c \ + brw_tex.c \ + brw_tex_layout.c \ + brw_urb.c \ + brw_util.c \ + brw_vs.c \ + brw_vs_constval.c \ + brw_vs_emit.c \ + brw_vs_state.c \ + brw_vs_surface_state.c \ + brw_vtbl.c \ + brw_wm.c \ + brw_wm_debug.c \ + brw_wm_emit.c \ + brw_wm_fp.c \ + brw_wm_iz.c \ + brw_wm_glsl.c \ + brw_wm_pass0.c \ + brw_wm_pass1.c \ + brw_wm_pass2.c \ + brw_wm_sampler_state.c \ + brw_wm_state.c \ + brw_wm_surface_state.c + +C_SOURCES = \ + $(COMMON_SOURCES) \ + $(MINIGLX_SOURCES) \ + $(DRIVER_SOURCES) + +ASM_SOURCES = + +DRIVER_DEFINES = -I../intel -I../intel/server + +DRI_LIB_DEPS += -ldrm_intel + +include ../Makefile.template + +intel_decode.o: ../intel/intel_decode.c +intel_tex_layout.o: ../intel/intel_tex_layout.c diff --git a/src/gallium/drivers/i965/brw_cc.c b/src/gallium/drivers/i965/brw_cc.c new file mode 100644 index 00000000000..1088a7a6070 --- /dev/null +++ b/src/gallium/drivers/i965/brw_cc.c @@ -0,0 +1,297 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "brw_util.h" +#include "main/macros.h" +#include "main/enums.h" + +static void prepare_cc_vp( struct brw_context *brw ) +{ + GLcontext *ctx = &brw->intel.ctx; + struct brw_cc_viewport ccv; + + memset(&ccv, 0, sizeof(ccv)); + + /* _NEW_VIEWPORT */ + ccv.min_depth = ctx->Viewport.Near; + ccv.max_depth = ctx->Viewport.Far; + + dri_bo_unreference(brw->cc.vp_bo); + brw->cc.vp_bo = brw_cache_data( &brw->cache, BRW_CC_VP, &ccv, NULL, 0 ); +} + +const struct brw_tracked_state brw_cc_vp = { + .dirty = { + .mesa = _NEW_VIEWPORT, + .brw = BRW_NEW_CONTEXT, + .cache = 0 + }, + .prepare = prepare_cc_vp +}; + +struct brw_cc_unit_key { + GLboolean stencil, stencil_two_side, color_blend, alpha_enabled; + + GLenum stencil_func[2], stencil_fail_op[2]; + GLenum stencil_pass_depth_fail_op[2], stencil_pass_depth_pass_op[2]; + GLubyte stencil_ref[2], stencil_write_mask[2], stencil_test_mask[2]; + GLenum logic_op; + + GLenum blend_eq_rgb, blend_eq_a; + GLenum blend_src_rgb, blend_src_a; + GLenum blend_dst_rgb, blend_dst_a; + + GLenum alpha_func; + GLclampf alpha_ref; + + GLboolean dither; + + GLboolean depth_test, depth_write; + GLenum depth_func; +}; + +static void +cc_unit_populate_key(struct brw_context *brw, struct brw_cc_unit_key *key) +{ + GLcontext *ctx = &brw->intel.ctx; + const unsigned back = ctx->Stencil._BackFace; + + memset(key, 0, sizeof(*key)); + + key->stencil = ctx->Stencil._Enabled; + key->stencil_two_side = ctx->Stencil._TestTwoSide; + + if (key->stencil) { + key->stencil_func[0] = ctx->Stencil.Function[0]; + key->stencil_fail_op[0] = ctx->Stencil.FailFunc[0]; + key->stencil_pass_depth_fail_op[0] = ctx->Stencil.ZFailFunc[0]; + key->stencil_pass_depth_pass_op[0] = ctx->Stencil.ZPassFunc[0]; + key->stencil_ref[0] = ctx->Stencil.Ref[0]; + key->stencil_write_mask[0] = ctx->Stencil.WriteMask[0]; + key->stencil_test_mask[0] = ctx->Stencil.ValueMask[0]; + } + if (key->stencil_two_side) { + key->stencil_func[1] = ctx->Stencil.Function[back]; + key->stencil_fail_op[1] = ctx->Stencil.FailFunc[back]; + key->stencil_pass_depth_fail_op[1] = ctx->Stencil.ZFailFunc[back]; + key->stencil_pass_depth_pass_op[1] = ctx->Stencil.ZPassFunc[back]; + key->stencil_ref[1] = ctx->Stencil.Ref[back]; + key->stencil_write_mask[1] = ctx->Stencil.WriteMask[back]; + key->stencil_test_mask[1] = ctx->Stencil.ValueMask[back]; + } + + if (ctx->Color._LogicOpEnabled) + key->logic_op = ctx->Color.LogicOp; + else + key->logic_op = GL_COPY; + + key->color_blend = ctx->Color.BlendEnabled; + if (key->color_blend) { + key->blend_eq_rgb = ctx->Color.BlendEquationRGB; + key->blend_eq_a = ctx->Color.BlendEquationA; + key->blend_src_rgb = ctx->Color.BlendSrcRGB; + key->blend_dst_rgb = ctx->Color.BlendDstRGB; + key->blend_src_a = ctx->Color.BlendSrcA; + key->blend_dst_a = ctx->Color.BlendDstA; + } + + key->alpha_enabled = ctx->Color.AlphaEnabled; + if (key->alpha_enabled) { + key->alpha_func = ctx->Color.AlphaFunc; + key->alpha_ref = ctx->Color.AlphaRef; + } + + key->dither = ctx->Color.DitherFlag; + + key->depth_test = ctx->Depth.Test; + if (key->depth_test) { + key->depth_func = ctx->Depth.Func; + key->depth_write = ctx->Depth.Mask; + } +} + +/** + * Creates the state cache entry for the given CC unit key. + */ +static dri_bo * +cc_unit_create_from_key(struct brw_context *brw, struct brw_cc_unit_key *key) +{ + struct brw_cc_unit_state cc; + dri_bo *bo; + + memset(&cc, 0, sizeof(cc)); + + /* _NEW_STENCIL */ + if (key->stencil) { + cc.cc0.stencil_enable = 1; + cc.cc0.stencil_func = + intel_translate_compare_func(key->stencil_func[0]); + cc.cc0.stencil_fail_op = + intel_translate_stencil_op(key->stencil_fail_op[0]); + cc.cc0.stencil_pass_depth_fail_op = + intel_translate_stencil_op(key->stencil_pass_depth_fail_op[0]); + cc.cc0.stencil_pass_depth_pass_op = + intel_translate_stencil_op(key->stencil_pass_depth_pass_op[0]); + cc.cc1.stencil_ref = key->stencil_ref[0]; + cc.cc1.stencil_write_mask = key->stencil_write_mask[0]; + cc.cc1.stencil_test_mask = key->stencil_test_mask[0]; + + if (key->stencil_two_side) { + cc.cc0.bf_stencil_enable = 1; + cc.cc0.bf_stencil_func = + intel_translate_compare_func(key->stencil_func[1]); + cc.cc0.bf_stencil_fail_op = + intel_translate_stencil_op(key->stencil_fail_op[1]); + cc.cc0.bf_stencil_pass_depth_fail_op = + intel_translate_stencil_op(key->stencil_pass_depth_fail_op[1]); + cc.cc0.bf_stencil_pass_depth_pass_op = + intel_translate_stencil_op(key->stencil_pass_depth_pass_op[1]); + cc.cc1.bf_stencil_ref = key->stencil_ref[1]; + cc.cc2.bf_stencil_write_mask = key->stencil_write_mask[1]; + cc.cc2.bf_stencil_test_mask = key->stencil_test_mask[1]; + } + + /* Not really sure about this: + */ + if (key->stencil_write_mask[0] || + (key->stencil_two_side && key->stencil_write_mask[1])) + cc.cc0.stencil_write_enable = 1; + } + + /* _NEW_COLOR */ + if (key->logic_op != GL_COPY) { + cc.cc2.logicop_enable = 1; + cc.cc5.logicop_func = intel_translate_logic_op(key->logic_op); + } else if (key->color_blend) { + GLenum eqRGB = key->blend_eq_rgb; + GLenum eqA = key->blend_eq_a; + GLenum srcRGB = key->blend_src_rgb; + GLenum dstRGB = key->blend_dst_rgb; + GLenum srcA = key->blend_src_a; + GLenum dstA = key->blend_dst_a; + + if (eqRGB == GL_MIN || eqRGB == GL_MAX) { + srcRGB = dstRGB = GL_ONE; + } + + if (eqA == GL_MIN || eqA == GL_MAX) { + srcA = dstA = GL_ONE; + } + + cc.cc6.dest_blend_factor = brw_translate_blend_factor(dstRGB); + cc.cc6.src_blend_factor = brw_translate_blend_factor(srcRGB); + cc.cc6.blend_function = brw_translate_blend_equation(eqRGB); + + cc.cc5.ia_dest_blend_factor = brw_translate_blend_factor(dstA); + cc.cc5.ia_src_blend_factor = brw_translate_blend_factor(srcA); + cc.cc5.ia_blend_function = brw_translate_blend_equation(eqA); + + cc.cc3.blend_enable = 1; + cc.cc3.ia_blend_enable = (srcA != srcRGB || + dstA != dstRGB || + eqA != eqRGB); + } + + if (key->alpha_enabled) { + cc.cc3.alpha_test = 1; + cc.cc3.alpha_test_func = intel_translate_compare_func(key->alpha_func); + cc.cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8; + + UNCLAMPED_FLOAT_TO_UBYTE(cc.cc7.alpha_ref.ub[0], key->alpha_ref); + } + + if (key->dither) { + cc.cc5.dither_enable = 1; + cc.cc6.y_dither_offset = 0; + cc.cc6.x_dither_offset = 0; + } + + /* _NEW_DEPTH */ + if (key->depth_test) { + cc.cc2.depth_test = 1; + cc.cc2.depth_test_function = intel_translate_compare_func(key->depth_func); + cc.cc2.depth_write_enable = key->depth_write; + } + + /* CACHE_NEW_CC_VP */ + cc.cc4.cc_viewport_state_offset = brw->cc.vp_bo->offset >> 5; /* reloc */ + + if (INTEL_DEBUG & DEBUG_STATS) + cc.cc5.statistics_enable = 1; + + bo = brw_upload_cache(&brw->cache, BRW_CC_UNIT, + key, sizeof(*key), + &brw->cc.vp_bo, 1, + &cc, sizeof(cc), + NULL, NULL); + + /* Emit CC viewport relocation */ + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_INSTRUCTION, + 0, + 0, + offsetof(struct brw_cc_unit_state, cc4), + brw->cc.vp_bo); + + return bo; +} + +static void prepare_cc_unit( struct brw_context *brw ) +{ + struct brw_cc_unit_key key; + + cc_unit_populate_key(brw, &key); + + dri_bo_unreference(brw->cc.state_bo); + brw->cc.state_bo = brw_search_cache(&brw->cache, BRW_CC_UNIT, + &key, sizeof(key), + &brw->cc.vp_bo, 1, + NULL); + + if (brw->cc.state_bo == NULL) + brw->cc.state_bo = cc_unit_create_from_key(brw, &key); +} + +const struct brw_tracked_state brw_cc_unit = { + .dirty = { + .mesa = _NEW_STENCIL | _NEW_COLOR | _NEW_DEPTH, + .brw = 0, + .cache = CACHE_NEW_CC_VP + }, + .prepare = prepare_cc_unit, +}; + + + diff --git a/src/gallium/drivers/i965/brw_clip.c b/src/gallium/drivers/i965/brw_clip.c new file mode 100644 index 00000000000..20a927cf386 --- /dev/null +++ b/src/gallium/drivers/i965/brw_clip.c @@ -0,0 +1,273 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" + +#include "intel_batchbuffer.h" + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_state.h" +#include "brw_clip.h" + + +#define FRONT_UNFILLED_BIT 0x1 +#define BACK_UNFILLED_BIT 0x2 + + +static void compile_clip_prog( struct brw_context *brw, + struct brw_clip_prog_key *key ) +{ + struct brw_clip_compile c; + const GLuint *program; + GLuint program_size; + GLuint delta; + GLuint i; + + memset(&c, 0, sizeof(c)); + + /* Begin the compilation: + */ + brw_init_compile(brw, &c.func); + + c.func.single_program_flow = 1; + + c.key = *key; + c.need_ff_sync = BRW_IS_IGDNG(brw); + + /* Need to locate the two positions present in vertex + header. + * These are currently hardcoded: + */ + c.header_position_offset = ATTR_SIZE; + + if (BRW_IS_IGDNG(brw)) + delta = 3 * REG_SIZE; + else + delta = REG_SIZE; + + for (i = 0; i < VERT_RESULT_MAX; i++) + if (c.key.attrs & (1<<i)) { + c.offset[i] = delta; + delta += ATTR_SIZE; + } + + c.nr_attrs = brw_count_bits(c.key.attrs); + + if (BRW_IS_IGDNG(brw)) + c.nr_regs = (c.nr_attrs + 1) / 2 + 3; /* are vertices packed, or reg-aligned? */ + else + c.nr_regs = (c.nr_attrs + 1) / 2 + 1; /* are vertices packed, or reg-aligned? */ + + c.nr_bytes = c.nr_regs * REG_SIZE; + + c.prog_data.clip_mode = c.key.clip_mode; /* XXX */ + + /* For some reason the thread is spawned with only 4 channels + * unmasked. + */ + brw_set_mask_control(&c.func, BRW_MASK_DISABLE); + + + /* Would ideally have the option of producing a program which could + * do all three: + */ + switch (key->primitive) { + case GL_TRIANGLES: + if (key->do_unfilled) + brw_emit_unfilled_clip( &c ); + else + brw_emit_tri_clip( &c ); + break; + case GL_LINES: + brw_emit_line_clip( &c ); + break; + case GL_POINTS: + brw_emit_point_clip( &c ); + break; + default: + assert(0); + return; + } + + + + /* get the program + */ + program = brw_get_program(&c.func, &program_size); + + /* Upload + */ + dri_bo_unreference(brw->clip.prog_bo); + brw->clip.prog_bo = brw_upload_cache( &brw->cache, + BRW_CLIP_PROG, + &c.key, sizeof(c.key), + NULL, 0, + program, program_size, + &c.prog_data, + &brw->clip.prog_data ); +} + +/* Calculate interpolants for triangle and line rasterization. + */ +static void upload_clip_prog(struct brw_context *brw) +{ + GLcontext *ctx = &brw->intel.ctx; + struct brw_clip_prog_key key; + + memset(&key, 0, sizeof(key)); + + /* Populate the key: + */ + /* BRW_NEW_REDUCED_PRIMITIVE */ + key.primitive = brw->intel.reduced_primitive; + /* CACHE_NEW_VS_PROG */ + key.attrs = brw->vs.prog_data->outputs_written; + /* _NEW_LIGHT */ + key.do_flat_shading = (ctx->Light.ShadeModel == GL_FLAT); + /* _NEW_TRANSFORM */ + key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled); + + if (BRW_IS_IGDNG(brw)) + key.clip_mode = BRW_CLIPMODE_KERNEL_CLIP; + else + key.clip_mode = BRW_CLIPMODE_NORMAL; + + /* _NEW_POLYGON */ + if (key.primitive == GL_TRIANGLES) { + if (ctx->Polygon.CullFlag && + ctx->Polygon.CullFaceMode == GL_FRONT_AND_BACK) + key.clip_mode = BRW_CLIPMODE_REJECT_ALL; + else { + GLuint fill_front = CLIP_CULL; + GLuint fill_back = CLIP_CULL; + GLuint offset_front = 0; + GLuint offset_back = 0; + + if (!ctx->Polygon.CullFlag || + ctx->Polygon.CullFaceMode != GL_FRONT) { + switch (ctx->Polygon.FrontMode) { + case GL_FILL: + fill_front = CLIP_FILL; + offset_front = 0; + break; + case GL_LINE: + fill_front = CLIP_LINE; + offset_front = ctx->Polygon.OffsetLine; + break; + case GL_POINT: + fill_front = CLIP_POINT; + offset_front = ctx->Polygon.OffsetPoint; + break; + } + } + + if (!ctx->Polygon.CullFlag || + ctx->Polygon.CullFaceMode != GL_BACK) { + switch (ctx->Polygon.BackMode) { + case GL_FILL: + fill_back = CLIP_FILL; + offset_back = 0; + break; + case GL_LINE: + fill_back = CLIP_LINE; + offset_back = ctx->Polygon.OffsetLine; + break; + case GL_POINT: + fill_back = CLIP_POINT; + offset_back = ctx->Polygon.OffsetPoint; + break; + } + } + + if (ctx->Polygon.BackMode != GL_FILL || + ctx->Polygon.FrontMode != GL_FILL) { + key.do_unfilled = 1; + + /* Most cases the fixed function units will handle. Cases where + * one or more polygon faces are unfilled will require help: + */ + key.clip_mode = BRW_CLIPMODE_CLIP_NON_REJECTED; + + if (offset_back || offset_front) { + /* _NEW_POLYGON, _NEW_BUFFERS */ + key.offset_units = ctx->Polygon.OffsetUnits * brw->intel.polygon_offset_scale; + key.offset_factor = ctx->Polygon.OffsetFactor * ctx->DrawBuffer->_MRD; + } + + switch (ctx->Polygon.FrontFace) { + case GL_CCW: + key.fill_ccw = fill_front; + key.fill_cw = fill_back; + key.offset_ccw = offset_front; + key.offset_cw = offset_back; + if (ctx->Light.Model.TwoSide && + key.fill_cw != CLIP_CULL) + key.copy_bfc_cw = 1; + break; + case GL_CW: + key.fill_cw = fill_front; + key.fill_ccw = fill_back; + key.offset_cw = offset_front; + key.offset_ccw = offset_back; + if (ctx->Light.Model.TwoSide && + key.fill_ccw != CLIP_CULL) + key.copy_bfc_ccw = 1; + break; + } + } + } + } + + dri_bo_unreference(brw->clip.prog_bo); + brw->clip.prog_bo = brw_search_cache(&brw->cache, BRW_CLIP_PROG, + &key, sizeof(key), + NULL, 0, + &brw->clip.prog_data); + if (brw->clip.prog_bo == NULL) + compile_clip_prog( brw, &key ); +} + + +const struct brw_tracked_state brw_clip_prog = { + .dirty = { + .mesa = (_NEW_LIGHT | + _NEW_TRANSFORM | + _NEW_POLYGON | + _NEW_BUFFERS), + .brw = (BRW_NEW_REDUCED_PRIMITIVE), + .cache = CACHE_NEW_VS_PROG + }, + .prepare = upload_clip_prog +}; diff --git a/src/gallium/drivers/i965/brw_clip.h b/src/gallium/drivers/i965/brw_clip.h new file mode 100644 index 00000000000..957df441ab0 --- /dev/null +++ b/src/gallium/drivers/i965/brw_clip.h @@ -0,0 +1,179 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + +#ifndef BRW_CLIP_H +#define BRW_CLIP_H + + +#include "brw_context.h" +#include "brw_eu.h" + +#define MAX_VERTS (3+6+6) + +/* Note that if unfilled primitives are being emitted, we have to fix + * up polygon offset and flatshading at this point: + */ +struct brw_clip_prog_key { + GLuint attrs:32; + GLuint primitive:4; + GLuint nr_userclip:3; + GLuint do_flat_shading:1; + GLuint do_unfilled:1; + GLuint fill_cw:2; /* includes cull information */ + GLuint fill_ccw:2; /* includes cull information */ + GLuint offset_cw:1; + GLuint offset_ccw:1; + GLuint pad0:17; + + GLuint copy_bfc_cw:1; + GLuint copy_bfc_ccw:1; + GLuint clip_mode:3; + GLuint pad1:27; + + GLfloat offset_factor; + GLfloat offset_units; +}; + + +#define CLIP_LINE 0 +#define CLIP_POINT 1 +#define CLIP_FILL 2 +#define CLIP_CULL 3 + + +#define PRIM_MASK (0x1f) + +struct brw_clip_compile { + struct brw_compile func; + struct brw_clip_prog_key key; + struct brw_clip_prog_data prog_data; + + struct { + struct brw_reg R0; + struct brw_reg vertex[MAX_VERTS]; + + struct brw_reg t; + struct brw_reg t0, t1; + struct brw_reg dp0, dp1; + + struct brw_reg dpPrev; + struct brw_reg dp; + struct brw_reg loopcount; + struct brw_reg nr_verts; + struct brw_reg planemask; + + struct brw_reg inlist; + struct brw_reg outlist; + struct brw_reg freelist; + + struct brw_reg dir; + struct brw_reg tmp0, tmp1; + struct brw_reg offset; + + struct brw_reg fixed_planes; + struct brw_reg plane_equation; + + struct brw_reg ff_sync; + } reg; + + /* 3 different ways of expressing vertex size: + */ + GLuint nr_attrs; + GLuint nr_regs; + GLuint nr_bytes; + + GLuint first_tmp; + GLuint last_tmp; + + GLboolean need_direction; + + GLuint last_mrf; + + GLuint header_position_offset; + GLuint offset[VERT_ATTRIB_MAX]; + GLboolean need_ff_sync; +}; + +#define ATTR_SIZE (4*4) + +/* Points are only culled, so no need for a clip routine, however it + * works out easier to have a dummy one. + */ +void brw_emit_unfilled_clip( struct brw_clip_compile *c ); +void brw_emit_tri_clip( struct brw_clip_compile *c ); +void brw_emit_line_clip( struct brw_clip_compile *c ); +void brw_emit_point_clip( struct brw_clip_compile *c ); + +/* brw_clip_tri.c, for use by the unfilled clip routine: + */ +void brw_clip_tri_init_vertices( struct brw_clip_compile *c ); +void brw_clip_tri_flat_shade( struct brw_clip_compile *c ); +void brw_clip_tri( struct brw_clip_compile *c ); +void brw_clip_tri_emit_polygon( struct brw_clip_compile *c ); +void brw_clip_tri_alloc_regs( struct brw_clip_compile *c, + GLuint nr_verts ); + + +/* Utils: + */ + +void brw_clip_interp_vertex( struct brw_clip_compile *c, + struct brw_indirect dest_ptr, + struct brw_indirect v0_ptr, /* from */ + struct brw_indirect v1_ptr, /* to */ + struct brw_reg t0, + GLboolean force_edgeflag ); + +void brw_clip_init_planes( struct brw_clip_compile *c ); + +void brw_clip_emit_vue(struct brw_clip_compile *c, + struct brw_indirect vert, + GLboolean allocate, + GLboolean eot, + GLuint header); + +void brw_clip_kill_thread(struct brw_clip_compile *c); + +struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c ); +struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c ); + +void brw_clip_copy_colors( struct brw_clip_compile *c, + GLuint to, GLuint from ); + +void brw_clip_init_clipmask( struct brw_clip_compile *c ); + +struct brw_reg get_tmp( struct brw_clip_compile *c ); + +void brw_clip_project_position(struct brw_clip_compile *c, + struct brw_reg pos ); +void brw_clip_ff_sync(struct brw_clip_compile *c); +void brw_clip_init_ff_sync(struct brw_clip_compile *c); +#endif diff --git a/src/gallium/drivers/i965/brw_clip_line.c b/src/gallium/drivers/i965/brw_clip_line.c new file mode 100644 index 00000000000..048ca620fab --- /dev/null +++ b/src/gallium/drivers/i965/brw_clip_line.c @@ -0,0 +1,276 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" +#include "shader/program.h" + +#include "intel_batchbuffer.h" + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_clip.h" + + + +static void brw_clip_line_alloc_regs( struct brw_clip_compile *c ) +{ + GLuint i = 0,j; + + /* Register usage is static, precompute here: + */ + c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++; + + if (c->key.nr_userclip) { + c->reg.fixed_planes = brw_vec4_grf(i, 0); + i += (6 + c->key.nr_userclip + 1) / 2; + + c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2; + } + else + c->prog_data.curb_read_length = 0; + + + /* Payload vertices plus space for more generated vertices: + */ + for (j = 0; j < 4; j++) { + c->reg.vertex[j] = brw_vec4_grf(i, 0); + i += c->nr_regs; + } + + c->reg.t = brw_vec1_grf(i, 0); + c->reg.t0 = brw_vec1_grf(i, 1); + c->reg.t1 = brw_vec1_grf(i, 2); + c->reg.planemask = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD); + c->reg.plane_equation = brw_vec4_grf(i, 4); + i++; + + c->reg.dp0 = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */ + c->reg.dp1 = brw_vec1_grf(i, 4); + i++; + + if (!c->key.nr_userclip) { + c->reg.fixed_planes = brw_vec8_grf(i, 0); + i++; + } + + if (c->need_ff_sync) { + c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD); + i++; + } + + c->first_tmp = i; + c->last_tmp = i; + + c->prog_data.urb_read_length = c->nr_regs; /* ? */ + c->prog_data.total_grf = i; +} + + + +/* Line clipping, more or less following the following algorithm: + * + * for (p=0;p<MAX_PLANES;p++) { + * if (clipmask & (1 << p)) { + * GLfloat dp0 = DOTPROD( vtx0, plane[p] ); + * GLfloat dp1 = DOTPROD( vtx1, plane[p] ); + * + * if (IS_NEGATIVE(dp1)) { + * GLfloat t = dp1 / (dp1 - dp0); + * if (t > t1) t1 = t; + * } else { + * GLfloat t = dp0 / (dp0 - dp1); + * if (t > t0) t0 = t; + * } + * + * if (t0 + t1 >= 1.0) + * return; + * } + * } + * + * interp( ctx, newvtx0, vtx0, vtx1, t0 ); + * interp( ctx, newvtx1, vtx1, vtx0, t1 ); + * + */ +static void clip_and_emit_line( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_indirect vtx0 = brw_indirect(0, 0); + struct brw_indirect vtx1 = brw_indirect(1, 0); + struct brw_indirect newvtx0 = brw_indirect(2, 0); + struct brw_indirect newvtx1 = brw_indirect(3, 0); + struct brw_indirect plane_ptr = brw_indirect(4, 0); + struct brw_instruction *plane_loop; + struct brw_instruction *plane_active; + struct brw_instruction *is_negative; + struct brw_instruction *is_neg2 = NULL; + struct brw_instruction *not_culled; + struct brw_reg v1_null_ud = retype(vec1(brw_null_reg()), BRW_REGISTER_TYPE_UD); + + brw_MOV(p, get_addr_reg(vtx0), brw_address(c->reg.vertex[0])); + brw_MOV(p, get_addr_reg(vtx1), brw_address(c->reg.vertex[1])); + brw_MOV(p, get_addr_reg(newvtx0), brw_address(c->reg.vertex[2])); + brw_MOV(p, get_addr_reg(newvtx1), brw_address(c->reg.vertex[3])); + brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c)); + + /* Note: init t0, t1 together: + */ + brw_MOV(p, vec2(c->reg.t0), brw_imm_f(0)); + + brw_clip_init_planes(c); + brw_clip_init_clipmask(c); + + /* -ve rhw workaround */ + if (BRW_IS_965(p->brw)) { + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), + brw_imm_ud(1<<20)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(0x3f)); + } + + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + plane_loop = brw_DO(p, BRW_EXECUTE_1); + { + /* if (planemask & 1) + */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_AND(p, v1_null_ud, c->reg.planemask, brw_imm_ud(1)); + + plane_active = brw_IF(p, BRW_EXECUTE_1); + { + if (c->key.nr_userclip) + brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0)); + else + brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0)); + + /* dp = DP4(vtx->position, plane) + */ + brw_DP4(p, vec4(c->reg.dp0), deref_4f(vtx0, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation); + + /* if (IS_NEGATIVE(dp1)) + */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_L); + brw_DP4(p, vec4(c->reg.dp1), deref_4f(vtx1, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation); + is_negative = brw_IF(p, BRW_EXECUTE_1); + { + /* + * Both can be negative on GM965/G965 due to RHW workaround + * if so, this object should be rejected. + */ + if (BRW_IS_965(p->brw)) { + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE, c->reg.dp0, brw_imm_f(0.0)); + is_neg2 = brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_kill_thread(c); + } + brw_ENDIF(p, is_neg2); + } + + brw_ADD(p, c->reg.t, c->reg.dp1, negate(c->reg.dp0)); + brw_math_invert(p, c->reg.t, c->reg.t); + brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp1); + + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t1 ); + brw_MOV(p, c->reg.t1, c->reg.t); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + } + is_negative = brw_ELSE(p, is_negative); + { + /* Coming back in. We know that both cannot be negative + * because the line would have been culled in that case. + */ + + /* If both are positive, do nothing */ + /* Only on GM965/G965 */ + if (BRW_IS_965(p->brw)) { + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0)); + is_neg2 = brw_IF(p, BRW_EXECUTE_1); + } + + { + brw_ADD(p, c->reg.t, c->reg.dp0, negate(c->reg.dp1)); + brw_math_invert(p, c->reg.t, c->reg.t); + brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp0); + + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t0 ); + brw_MOV(p, c->reg.t0, c->reg.t); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + } + + if (BRW_IS_965(p->brw)) { + brw_ENDIF(p, is_neg2); + } + } + brw_ENDIF(p, is_negative); + } + brw_ENDIF(p, plane_active); + + /* plane_ptr++; + */ + brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c)); + + /* while (planemask>>=1) != 0 + */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1)); + } + brw_WHILE(p, plane_loop); + + brw_ADD(p, c->reg.t, c->reg.t0, c->reg.t1); + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.t, brw_imm_f(1.0)); + not_culled = brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_interp_vertex(c, newvtx0, vtx0, vtx1, c->reg.t0, GL_FALSE); + brw_clip_interp_vertex(c, newvtx1, vtx1, vtx0, c->reg.t1, GL_FALSE); + + brw_clip_emit_vue(c, newvtx0, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_START); + brw_clip_emit_vue(c, newvtx1, 0, 1, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_END); + } + brw_ENDIF(p, not_culled); + brw_clip_kill_thread(c); +} + + + +void brw_emit_line_clip( struct brw_clip_compile *c ) +{ + brw_clip_line_alloc_regs(c); + brw_clip_init_ff_sync(c); + + if (c->key.do_flat_shading) + brw_clip_copy_colors(c, 0, 1); + + clip_and_emit_line(c); +} diff --git a/src/gallium/drivers/i965/brw_clip_point.c b/src/gallium/drivers/i965/brw_clip_point.c new file mode 100644 index 00000000000..8458f61c5a0 --- /dev/null +++ b/src/gallium/drivers/i965/brw_clip_point.c @@ -0,0 +1,56 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" +#include "shader/program.h" + +#include "intel_batchbuffer.h" + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_clip.h" + + +/* Point clipping, nothing to do? + */ +void brw_emit_point_clip( struct brw_clip_compile *c ) +{ + /* Send an empty message to kill the thread: + */ + brw_clip_tri_alloc_regs(c, 0); + brw_clip_init_ff_sync(c); + + brw_clip_kill_thread(c); +} diff --git a/src/gallium/drivers/i965/brw_clip_state.c b/src/gallium/drivers/i965/brw_clip_state.c new file mode 100644 index 00000000000..234b3744bfc --- /dev/null +++ b/src/gallium/drivers/i965/brw_clip_state.c @@ -0,0 +1,184 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "main/macros.h" + +struct brw_clip_unit_key { + unsigned int total_grf; + unsigned int urb_entry_read_length; + unsigned int curb_entry_read_length; + unsigned int clip_mode; + + unsigned int curbe_offset; + + unsigned int nr_urb_entries, urb_size; + + GLboolean depth_clamp; +}; + +static void +clip_unit_populate_key(struct brw_context *brw, struct brw_clip_unit_key *key) +{ + GLcontext *ctx = &brw->intel.ctx; + memset(key, 0, sizeof(*key)); + + /* CACHE_NEW_CLIP_PROG */ + key->total_grf = brw->clip.prog_data->total_grf; + key->urb_entry_read_length = brw->clip.prog_data->urb_read_length; + key->curb_entry_read_length = brw->clip.prog_data->curb_read_length; + key->clip_mode = brw->clip.prog_data->clip_mode; + + /* BRW_NEW_CURBE_OFFSETS */ + key->curbe_offset = brw->curbe.clip_start; + + /* BRW_NEW_URB_FENCE */ + key->nr_urb_entries = brw->urb.nr_clip_entries; + key->urb_size = brw->urb.vsize; + + /* _NEW_TRANSOFORM */ + key->depth_clamp = ctx->Transform.DepthClamp; +} + +static dri_bo * +clip_unit_create_from_key(struct brw_context *brw, + struct brw_clip_unit_key *key) +{ + struct brw_clip_unit_state clip; + dri_bo *bo; + + memset(&clip, 0, sizeof(clip)); + + clip.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1; + /* reloc */ + clip.thread0.kernel_start_pointer = brw->clip.prog_bo->offset >> 6; + + clip.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754; + clip.thread1.single_program_flow = 1; + + clip.thread3.urb_entry_read_length = key->urb_entry_read_length; + clip.thread3.const_urb_entry_read_length = key->curb_entry_read_length; + clip.thread3.const_urb_entry_read_offset = key->curbe_offset * 2; + clip.thread3.dispatch_grf_start_reg = 1; + clip.thread3.urb_entry_read_offset = 0; + + clip.thread4.nr_urb_entries = key->nr_urb_entries; + clip.thread4.urb_entry_allocation_size = key->urb_size - 1; + /* If we have enough clip URB entries to run two threads, do so. + */ + if (key->nr_urb_entries >= 10) { + /* Half of the URB entries go to each thread, and it has to be an + * even number. + */ + assert(key->nr_urb_entries % 2 == 0); + + /* Although up to 16 concurrent Clip threads are allowed on IGDNG, + * only 2 threads can output VUEs at a time. + */ + if (BRW_IS_IGDNG(brw)) + clip.thread4.max_threads = 16 - 1; + else + clip.thread4.max_threads = 2 - 1; + } else { + assert(key->nr_urb_entries >= 5); + clip.thread4.max_threads = 1 - 1; + } + + if (INTEL_DEBUG & DEBUG_SINGLE_THREAD) + clip.thread4.max_threads = 0; + + if (INTEL_DEBUG & DEBUG_STATS) + clip.thread4.stats_enable = 1; + + clip.clip5.userclip_enable_flags = 0x7f; + clip.clip5.userclip_must_clip = 1; + clip.clip5.guard_band_enable = 0; + if (!key->depth_clamp) + clip.clip5.viewport_z_clip_enable = 1; + clip.clip5.viewport_xy_clip_enable = 1; + clip.clip5.vertex_position_space = BRW_CLIP_NDCSPACE; + clip.clip5.api_mode = BRW_CLIP_API_OGL; + clip.clip5.clip_mode = key->clip_mode; + + if (BRW_IS_G4X(brw)) + clip.clip5.negative_w_clip_test = 1; + + clip.clip6.clipper_viewport_state_ptr = 0; + clip.viewport_xmin = -1; + clip.viewport_xmax = 1; + clip.viewport_ymin = -1; + clip.viewport_ymax = 1; + + bo = brw_upload_cache(&brw->cache, BRW_CLIP_UNIT, + key, sizeof(*key), + &brw->clip.prog_bo, 1, + &clip, sizeof(clip), + NULL, NULL); + + /* Emit clip program relocation */ + assert(brw->clip.prog_bo); + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_INSTRUCTION, + 0, + clip.thread0.grf_reg_count << 1, + offsetof(struct brw_clip_unit_state, thread0), + brw->clip.prog_bo); + + return bo; +} + +static void upload_clip_unit( struct brw_context *brw ) +{ + struct brw_clip_unit_key key; + + clip_unit_populate_key(brw, &key); + + dri_bo_unreference(brw->clip.state_bo); + brw->clip.state_bo = brw_search_cache(&brw->cache, BRW_CLIP_UNIT, + &key, sizeof(key), + &brw->clip.prog_bo, 1, + NULL); + if (brw->clip.state_bo == NULL) { + brw->clip.state_bo = clip_unit_create_from_key(brw, &key); + } +} + +const struct brw_tracked_state brw_clip_unit = { + .dirty = { + .mesa = _NEW_TRANSFORM, + .brw = (BRW_NEW_CURBE_OFFSETS | + BRW_NEW_URB_FENCE), + .cache = CACHE_NEW_CLIP_PROG + }, + .prepare = upload_clip_unit, +}; diff --git a/src/gallium/drivers/i965/brw_clip_tri.c b/src/gallium/drivers/i965/brw_clip_tri.c new file mode 100644 index 00000000000..0efd77225e2 --- /dev/null +++ b/src/gallium/drivers/i965/brw_clip_tri.c @@ -0,0 +1,603 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" +#include "shader/program.h" + +#include "intel_batchbuffer.h" + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_clip.h" + +static void release_tmps( struct brw_clip_compile *c ) +{ + c->last_tmp = c->first_tmp; +} + + +void brw_clip_tri_alloc_regs( struct brw_clip_compile *c, + GLuint nr_verts ) +{ + GLuint i = 0,j; + + /* Register usage is static, precompute here: + */ + c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++; + + if (c->key.nr_userclip) { + c->reg.fixed_planes = brw_vec4_grf(i, 0); + i += (6 + c->key.nr_userclip + 1) / 2; + + c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2; + } + else + c->prog_data.curb_read_length = 0; + + + /* Payload vertices plus space for more generated vertices: + */ + for (j = 0; j < nr_verts; j++) { + c->reg.vertex[j] = brw_vec4_grf(i, 0); + i += c->nr_regs; + } + + if (c->nr_attrs & 1) { + for (j = 0; j < 3; j++) { + GLuint delta = c->nr_attrs*16 + 32; + + if (BRW_IS_IGDNG(c->func.brw)) + delta = c->nr_attrs * 16 + 32 * 3; + + brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0)); + } + } + + c->reg.t = brw_vec1_grf(i, 0); + c->reg.loopcount = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_D); + c->reg.nr_verts = retype(brw_vec1_grf(i, 2), BRW_REGISTER_TYPE_UD); + c->reg.planemask = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD); + c->reg.plane_equation = brw_vec4_grf(i, 4); + i++; + + c->reg.dpPrev = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */ + c->reg.dp = brw_vec1_grf(i, 4); + i++; + + c->reg.inlist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0); + i++; + + c->reg.outlist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0); + i++; + + c->reg.freelist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0); + i++; + + if (!c->key.nr_userclip) { + c->reg.fixed_planes = brw_vec8_grf(i, 0); + i++; + } + + if (c->key.do_unfilled) { + c->reg.dir = brw_vec4_grf(i, 0); + c->reg.offset = brw_vec4_grf(i, 4); + i++; + c->reg.tmp0 = brw_vec4_grf(i, 0); + c->reg.tmp1 = brw_vec4_grf(i, 4); + i++; + } + + if (c->need_ff_sync) { + c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD); + i++; + } + + c->first_tmp = i; + c->last_tmp = i; + + c->prog_data.urb_read_length = c->nr_regs; /* ? */ + c->prog_data.total_grf = i; +} + + + +void brw_clip_tri_init_vertices( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */ + struct brw_instruction *is_rev; + + /* Initial list of indices for incoming vertexes: + */ + brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK)); + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_EQ, + tmp0, + brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE)); + + /* XXX: Is there an easier way to do this? Need to reverse every + * second tristrip element: Can ignore sometimes? + */ + is_rev = brw_IF(p, BRW_EXECUTE_1); + { + brw_MOV(p, get_element(c->reg.inlist, 0), brw_address(c->reg.vertex[1]) ); + brw_MOV(p, get_element(c->reg.inlist, 1), brw_address(c->reg.vertex[0]) ); + if (c->need_direction) + brw_MOV(p, c->reg.dir, brw_imm_f(-1)); + } + is_rev = brw_ELSE(p, is_rev); + { + brw_MOV(p, get_element(c->reg.inlist, 0), brw_address(c->reg.vertex[0]) ); + brw_MOV(p, get_element(c->reg.inlist, 1), brw_address(c->reg.vertex[1]) ); + if (c->need_direction) + brw_MOV(p, c->reg.dir, brw_imm_f(1)); + } + brw_ENDIF(p, is_rev); + + brw_MOV(p, get_element(c->reg.inlist, 2), brw_address(c->reg.vertex[2]) ); + brw_MOV(p, brw_vec8_grf(c->reg.outlist.nr, 0), brw_imm_f(0)); + brw_MOV(p, c->reg.nr_verts, brw_imm_ud(3)); +} + + + +void brw_clip_tri_flat_shade( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *is_poly; + struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */ + + brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK)); + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_EQ, + tmp0, + brw_imm_ud(_3DPRIM_POLYGON)); + + is_poly = brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_copy_colors(c, 1, 0); + brw_clip_copy_colors(c, 2, 0); + } + is_poly = brw_ELSE(p, is_poly); + { + brw_clip_copy_colors(c, 0, 2); + brw_clip_copy_colors(c, 1, 2); + } + brw_ENDIF(p, is_poly); +} + + + +/* Use mesa's clipping algorithms, translated to GEN4 assembly. + */ +void brw_clip_tri( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_indirect vtx = brw_indirect(0, 0); + struct brw_indirect vtxPrev = brw_indirect(1, 0); + struct brw_indirect vtxOut = brw_indirect(2, 0); + struct brw_indirect plane_ptr = brw_indirect(3, 0); + struct brw_indirect inlist_ptr = brw_indirect(4, 0); + struct brw_indirect outlist_ptr = brw_indirect(5, 0); + struct brw_indirect freelist_ptr = brw_indirect(6, 0); + struct brw_instruction *plane_loop; + struct brw_instruction *plane_active; + struct brw_instruction *vertex_loop; + struct brw_instruction *next_test; + struct brw_instruction *prev_test; + + brw_MOV(p, get_addr_reg(vtxPrev), brw_address(c->reg.vertex[2]) ); + brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c)); + brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist)); + brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist)); + + brw_MOV(p, get_addr_reg(freelist_ptr), brw_address(c->reg.vertex[3]) ); + + plane_loop = brw_DO(p, BRW_EXECUTE_1); + { + /* if (planemask & 1) + */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_AND(p, vec1(brw_null_reg()), c->reg.planemask, brw_imm_ud(1)); + + plane_active = brw_IF(p, BRW_EXECUTE_1); + { + /* vtxOut = freelist_ptr++ + */ + brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(freelist_ptr) ); + brw_ADD(p, get_addr_reg(freelist_ptr), get_addr_reg(freelist_ptr), brw_imm_uw(c->nr_regs * REG_SIZE)); + + if (c->key.nr_userclip) + brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0)); + else + brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0)); + + brw_MOV(p, c->reg.loopcount, c->reg.nr_verts); + brw_MOV(p, c->reg.nr_verts, brw_imm_ud(0)); + + vertex_loop = brw_DO(p, BRW_EXECUTE_1); + { + /* vtx = *input_ptr; + */ + brw_MOV(p, get_addr_reg(vtx), deref_1uw(inlist_ptr, 0)); + + /* IS_NEGATIVE(prev) */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_L); + brw_DP4(p, vec4(c->reg.dpPrev), deref_4f(vtxPrev, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation); + prev_test = brw_IF(p, BRW_EXECUTE_1); + { + /* IS_POSITIVE(next) + */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_GE); + brw_DP4(p, vec4(c->reg.dp), deref_4f(vtx, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation); + next_test = brw_IF(p, BRW_EXECUTE_1); + { + + /* Coming back in. + */ + brw_ADD(p, c->reg.t, c->reg.dpPrev, negate(c->reg.dp)); + brw_math_invert(p, c->reg.t, c->reg.t); + brw_MUL(p, c->reg.t, c->reg.t, c->reg.dpPrev); + + /* If (vtxOut == 0) vtxOut = vtxPrev + */ + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) ); + brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtxPrev) ); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + brw_clip_interp_vertex(c, vtxOut, vtxPrev, vtx, c->reg.t, GL_FALSE); + + /* *outlist_ptr++ = vtxOut; + * nr_verts++; + * vtxOut = 0; + */ + brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut)); + brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short))); + brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1)); + brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) ); + } + brw_ENDIF(p, next_test); + + } + prev_test = brw_ELSE(p, prev_test); + { + /* *outlist_ptr++ = vtxPrev; + * nr_verts++; + */ + brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxPrev)); + brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short))); + brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1)); + + /* IS_NEGATIVE(next) + */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_L); + brw_DP4(p, vec4(c->reg.dp), deref_4f(vtx, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation); + next_test = brw_IF(p, BRW_EXECUTE_1); + { + /* Going out of bounds. Avoid division by zero as we + * know dp != dpPrev from DIFFERENT_SIGNS, above. + */ + brw_ADD(p, c->reg.t, c->reg.dp, negate(c->reg.dpPrev)); + brw_math_invert(p, c->reg.t, c->reg.t); + brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp); + + /* If (vtxOut == 0) vtxOut = vtx + */ + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) ); + brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtx) ); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + brw_clip_interp_vertex(c, vtxOut, vtx, vtxPrev, c->reg.t, GL_TRUE); + + /* *outlist_ptr++ = vtxOut; + * nr_verts++; + * vtxOut = 0; + */ + brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut)); + brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short))); + brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1)); + brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) ); + } + brw_ENDIF(p, next_test); + } + brw_ENDIF(p, prev_test); + + /* vtxPrev = vtx; + * inlist_ptr++; + */ + brw_MOV(p, get_addr_reg(vtxPrev), get_addr_reg(vtx)); + brw_ADD(p, get_addr_reg(inlist_ptr), get_addr_reg(inlist_ptr), brw_imm_uw(sizeof(short))); + + /* while (--loopcount != 0) + */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); + } + brw_WHILE(p, vertex_loop); + + /* vtxPrev = *(outlist_ptr-1) OR: outlist[nr_verts-1] + * inlist = outlist + * inlist_ptr = &inlist[0] + * outlist_ptr = &outlist[0] + */ + brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_w(-2)); + brw_MOV(p, get_addr_reg(vtxPrev), deref_1uw(outlist_ptr, 0)); + brw_MOV(p, brw_vec8_grf(c->reg.inlist.nr, 0), brw_vec8_grf(c->reg.outlist.nr, 0)); + brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist)); + brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist)); + } + brw_ENDIF(p, plane_active); + + /* plane_ptr++; + */ + brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c)); + + /* nr_verts >= 3 + */ + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_GE, + c->reg.nr_verts, + brw_imm_ud(3)); + + /* && (planemask>>=1) != 0 + */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1)); + } + brw_WHILE(p, plane_loop); +} + + + +void brw_clip_tri_emit_polygon(struct brw_clip_compile *c) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *loop, *if_insn; + + /* for (loopcount = nr_verts-2; loopcount > 0; loopcount--) + */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_G); + brw_ADD(p, + c->reg.loopcount, + c->reg.nr_verts, + brw_imm_d(-2)); + + if_insn = brw_IF(p, BRW_EXECUTE_1); + { + struct brw_indirect v0 = brw_indirect(0, 0); + struct brw_indirect vptr = brw_indirect(1, 0); + + brw_MOV(p, get_addr_reg(vptr), brw_address(c->reg.inlist)); + brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0)); + + brw_clip_emit_vue(c, v0, 1, 0, ((_3DPRIM_TRIFAN << 2) | R02_PRIM_START)); + + brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2)); + brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0)); + + loop = brw_DO(p, BRW_EXECUTE_1); + { + brw_clip_emit_vue(c, v0, 1, 0, (_3DPRIM_TRIFAN << 2)); + + brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2)); + brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0)); + + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); + } + brw_WHILE(p, loop); + + brw_clip_emit_vue(c, v0, 0, 1, ((_3DPRIM_TRIFAN << 2) | R02_PRIM_END)); + } + brw_ENDIF(p, if_insn); +} + +static void do_clip_tri( struct brw_clip_compile *c ) +{ + brw_clip_init_planes(c); + + brw_clip_tri(c); +} + + +static void maybe_do_clip_tri( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *do_clip; + + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0)); + do_clip = brw_IF(p, BRW_EXECUTE_1); + { + do_clip_tri(c); + } + brw_ENDIF(p, do_clip); +} + +static void brw_clip_test( struct brw_clip_compile *c ) +{ + struct brw_reg t = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); + struct brw_reg t1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); + struct brw_reg t2 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); + struct brw_reg t3 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); + + struct brw_reg v0 = get_tmp(c); + struct brw_reg v1 = get_tmp(c); + struct brw_reg v2 = get_tmp(c); + + struct brw_indirect vt0 = brw_indirect(0, 0); + struct brw_indirect vt1 = brw_indirect(1, 0); + struct brw_indirect vt2 = brw_indirect(2, 0); + + struct brw_compile *p = &c->func; + struct brw_instruction *is_outside; + struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */ + + brw_MOV(p, get_addr_reg(vt0), brw_address(c->reg.vertex[0])); + brw_MOV(p, get_addr_reg(vt1), brw_address(c->reg.vertex[1])); + brw_MOV(p, get_addr_reg(vt2), brw_address(c->reg.vertex[2])); + brw_MOV(p, v0, deref_4f(vt0, c->offset[VERT_RESULT_HPOS])); + brw_MOV(p, v1, deref_4f(vt1, c->offset[VERT_RESULT_HPOS])); + brw_MOV(p, v2, deref_4f(vt2, c->offset[VERT_RESULT_HPOS])); + brw_AND(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(~0x3f)); + + /* test nearz, xmin, ymin plane */ + /* clip.xyz < -clip.w */ + brw_CMP(p, t1, BRW_CONDITIONAL_L, v0, negate(get_element(v0, 3))); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_CMP(p, t2, BRW_CONDITIONAL_L, v1, negate(get_element(v1, 3))); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_CMP(p, t3, BRW_CONDITIONAL_L, v2, negate(get_element(v2, 3))); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + /* All vertices are outside of a plane, rejected */ + brw_AND(p, t, t1, t2); + brw_AND(p, t, t, t3); + brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1)); + brw_OR(p, tmp0, tmp0, get_element(t, 2)); + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1)); + is_outside = brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_kill_thread(c); + } + brw_ENDIF(p, is_outside); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + /* some vertices are inside a plane, some are outside,need to clip */ + brw_XOR(p, t, t1, t2); + brw_XOR(p, t1, t2, t3); + brw_OR(p, t, t, t1); + brw_AND(p, t, t, brw_imm_ud(0x1)); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 0), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<5))); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 1), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<3))); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 2), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<1))); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + /* test farz, xmax, ymax plane */ + /* clip.xyz > clip.w */ + brw_CMP(p, t1, BRW_CONDITIONAL_G, v0, get_element(v0, 3)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_CMP(p, t2, BRW_CONDITIONAL_G, v1, get_element(v1, 3)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_CMP(p, t3, BRW_CONDITIONAL_G, v2, get_element(v2, 3)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + /* All vertices are outside of a plane, rejected */ + brw_AND(p, t, t1, t2); + brw_AND(p, t, t, t3); + brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1)); + brw_OR(p, tmp0, tmp0, get_element(t, 2)); + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1)); + is_outside = brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_kill_thread(c); + } + brw_ENDIF(p, is_outside); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + /* some vertices are inside a plane, some are outside,need to clip */ + brw_XOR(p, t, t1, t2); + brw_XOR(p, t1, t2, t3); + brw_OR(p, t, t, t1); + brw_AND(p, t, t, brw_imm_ud(0x1)); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 0), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<4))); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 1), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<2))); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 2), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<0))); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + release_tmps(c); +} + + +void brw_emit_tri_clip( struct brw_clip_compile *c ) +{ + struct brw_instruction *neg_rhw; + struct brw_compile *p = &c->func; + brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6); + brw_clip_tri_init_vertices(c); + brw_clip_init_clipmask(c); + brw_clip_init_ff_sync(c); + + /* if -ve rhw workaround bit is set, + do cliptest */ + if (BRW_IS_965(p->brw)) { + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), + brw_imm_ud(1<<20)); + neg_rhw = brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_test(c); + } + brw_ENDIF(p, neg_rhw); + } + /* Can't push into do_clip_tri because with polygon (or quad) + * flatshading, need to apply the flatshade here because we don't + * respect the PV when converting to trifan for emit: + */ + if (c->key.do_flat_shading) + brw_clip_tri_flat_shade(c); + + if ((c->key.clip_mode == BRW_CLIPMODE_NORMAL) || + (c->key.clip_mode == BRW_CLIPMODE_KERNEL_CLIP)) + do_clip_tri(c); + else + maybe_do_clip_tri(c); + + brw_clip_tri_emit_polygon(c); + + /* Send an empty message to kill the thread: + */ + brw_clip_kill_thread(c); +} diff --git a/src/gallium/drivers/i965/brw_clip_unfilled.c b/src/gallium/drivers/i965/brw_clip_unfilled.c new file mode 100644 index 00000000000..ad1bfa435fb --- /dev/null +++ b/src/gallium/drivers/i965/brw_clip_unfilled.c @@ -0,0 +1,505 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" +#include "shader/program.h" + +#include "intel_batchbuffer.h" + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_clip.h" + + + +/* This is performed against the original triangles, so no indirection + * required: +BZZZT! + */ +static void compute_tri_direction( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_reg e = c->reg.tmp0; + struct brw_reg f = c->reg.tmp1; + struct brw_reg v0 = byte_offset(c->reg.vertex[0], c->offset[VERT_RESULT_HPOS]); + struct brw_reg v1 = byte_offset(c->reg.vertex[1], c->offset[VERT_RESULT_HPOS]); + struct brw_reg v2 = byte_offset(c->reg.vertex[2], c->offset[VERT_RESULT_HPOS]); + + + struct brw_reg v0n = get_tmp(c); + struct brw_reg v1n = get_tmp(c); + struct brw_reg v2n = get_tmp(c); + + /* Convert to NDC. + * NOTE: We can't modify the original vertex coordinates, + * as it may impact further operations. + * So, we have to keep normalized coordinates in temp registers. + * + * TBD-KC + * Try to optimize unnecessary MOV's. + */ + brw_MOV(p, v0n, v0); + brw_MOV(p, v1n, v1); + brw_MOV(p, v2n, v2); + + brw_clip_project_position(c, v0n); + brw_clip_project_position(c, v1n); + brw_clip_project_position(c, v2n); + + /* Calculate the vectors of two edges of the triangle: + */ + brw_ADD(p, e, v0n, negate(v2n)); + brw_ADD(p, f, v1n, negate(v2n)); + + /* Take their crossproduct: + */ + brw_set_access_mode(p, BRW_ALIGN_16); + brw_MUL(p, vec4(brw_null_reg()), brw_swizzle(e, 1,2,0,3), brw_swizzle(f,2,0,1,3)); + brw_MAC(p, vec4(e), negate(brw_swizzle(e, 2,0,1,3)), brw_swizzle(f,1,2,0,3)); + brw_set_access_mode(p, BRW_ALIGN_1); + + brw_MUL(p, c->reg.dir, c->reg.dir, vec4(e)); +} + + +static void cull_direction( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *ccw; + GLuint conditional; + + assert (!(c->key.fill_ccw == CLIP_CULL && + c->key.fill_cw == CLIP_CULL)); + + if (c->key.fill_ccw == CLIP_CULL) + conditional = BRW_CONDITIONAL_GE; + else + conditional = BRW_CONDITIONAL_L; + + brw_CMP(p, + vec1(brw_null_reg()), + conditional, + get_element(c->reg.dir, 2), + brw_imm_f(0)); + + ccw = brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_kill_thread(c); + } + brw_ENDIF(p, ccw); +} + + + +static void copy_bfc( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *ccw; + GLuint conditional; + + /* Do we have any colors to copy? + */ + if (!(c->offset[VERT_RESULT_COL0] && c->offset[VERT_RESULT_BFC0]) && + !(c->offset[VERT_RESULT_COL1] && c->offset[VERT_RESULT_BFC1])) + return; + + /* In some wierd degnerate cases we can end up testing the + * direction twice, once for culling and once for bfc copying. Oh + * well, that's what you get for setting wierd GL state. + */ + if (c->key.copy_bfc_ccw) + conditional = BRW_CONDITIONAL_GE; + else + conditional = BRW_CONDITIONAL_L; + + brw_CMP(p, + vec1(brw_null_reg()), + conditional, + get_element(c->reg.dir, 2), + brw_imm_f(0)); + + ccw = brw_IF(p, BRW_EXECUTE_1); + { + GLuint i; + + for (i = 0; i < 3; i++) { + if (c->offset[VERT_RESULT_COL0] && c->offset[VERT_RESULT_BFC0]) + brw_MOV(p, + byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_COL0]), + byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_BFC0])); + + if (c->offset[VERT_RESULT_COL1] && c->offset[VERT_RESULT_BFC1]) + brw_MOV(p, + byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_COL1]), + byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_BFC1])); + } + } + brw_ENDIF(p, ccw); +} + + + + +/* + GLfloat iz = 1.0 / dir.z; + GLfloat ac = dir.x * iz; + GLfloat bc = dir.y * iz; + offset = ctx->Polygon.OffsetUnits * DEPTH_SCALE; + offset += MAX2( abs(ac), abs(bc) ) * ctx->Polygon.OffsetFactor; + offset *= MRD; +*/ +static void compute_offset( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_reg off = c->reg.offset; + struct brw_reg dir = c->reg.dir; + + brw_math_invert(p, get_element(off, 2), get_element(dir, 2)); + brw_MUL(p, vec2(off), dir, get_element(off, 2)); + + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_GE, + brw_abs(get_element(off, 0)), + brw_abs(get_element(off, 1))); + + brw_SEL(p, vec1(off), brw_abs(get_element(off, 0)), brw_abs(get_element(off, 1))); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + brw_MUL(p, vec1(off), off, brw_imm_f(c->key.offset_factor)); + brw_ADD(p, vec1(off), off, brw_imm_f(c->key.offset_units)); +} + + +static void merge_edgeflags( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *is_poly; + struct brw_reg tmp0 = get_element_ud(c->reg.tmp0, 0); + + brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK)); + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_EQ, + tmp0, + brw_imm_ud(_3DPRIM_POLYGON)); + + /* Get away with using reg.vertex because we know that this is not + * a _3DPRIM_TRISTRIP_REVERSE: + */ + is_poly = brw_IF(p, BRW_EXECUTE_1); + { + brw_set_conditionalmod(p, BRW_CONDITIONAL_EQ); + brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<8)); + brw_MOV(p, byte_offset(c->reg.vertex[0], c->offset[VERT_RESULT_EDGE]), brw_imm_f(0)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + brw_set_conditionalmod(p, BRW_CONDITIONAL_EQ); + brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<9)); + brw_MOV(p, byte_offset(c->reg.vertex[2], c->offset[VERT_RESULT_EDGE]), brw_imm_f(0)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + } + brw_ENDIF(p, is_poly); +} + + + +static void apply_one_offset( struct brw_clip_compile *c, + struct brw_indirect vert ) +{ + struct brw_compile *p = &c->func; + struct brw_reg z = deref_1f(vert, c->header_position_offset + + 2 * type_sz(BRW_REGISTER_TYPE_F)); + + brw_ADD(p, z, z, vec1(c->reg.offset)); +} + + + +/*********************************************************************** + * Output clipped polygon as an unfilled primitive: + */ +static void emit_lines(struct brw_clip_compile *c, + GLboolean do_offset) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *loop; + struct brw_instruction *draw_edge; + struct brw_indirect v0 = brw_indirect(0, 0); + struct brw_indirect v1 = brw_indirect(1, 0); + struct brw_indirect v0ptr = brw_indirect(2, 0); + struct brw_indirect v1ptr = brw_indirect(3, 0); + + /* Need a seperate loop for offset: + */ + if (do_offset) { + brw_MOV(p, c->reg.loopcount, c->reg.nr_verts); + brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist)); + + loop = brw_DO(p, BRW_EXECUTE_1); + { + brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0)); + brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2)); + + apply_one_offset(c, v0); + + brw_set_conditionalmod(p, BRW_CONDITIONAL_G); + brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); + } + brw_WHILE(p, loop); + } + + /* v1ptr = &inlist[nr_verts] + * *v1ptr = v0 + */ + brw_MOV(p, c->reg.loopcount, c->reg.nr_verts); + brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist)); + brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v0ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW)); + brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v1ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW)); + brw_MOV(p, deref_1uw(v1ptr, 0), deref_1uw(v0ptr, 0)); + + loop = brw_DO(p, BRW_EXECUTE_1); + { + brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0)); + brw_MOV(p, get_addr_reg(v1), deref_1uw(v0ptr, 2)); + brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2)); + + /* draw edge if edgeflag != 0 */ + brw_CMP(p, + vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, + deref_1f(v0, c->offset[VERT_RESULT_EDGE]), + brw_imm_f(0)); + draw_edge = brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_emit_vue(c, v0, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_START); + brw_clip_emit_vue(c, v1, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_END); + } + brw_ENDIF(p, draw_edge); + + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); + } + brw_WHILE(p, loop); +} + + + +static void emit_points(struct brw_clip_compile *c, + GLboolean do_offset ) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *loop; + struct brw_instruction *draw_point; + + struct brw_indirect v0 = brw_indirect(0, 0); + struct brw_indirect v0ptr = brw_indirect(2, 0); + + brw_MOV(p, c->reg.loopcount, c->reg.nr_verts); + brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist)); + + loop = brw_DO(p, BRW_EXECUTE_1); + { + brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0)); + brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2)); + + /* draw if edgeflag != 0 + */ + brw_CMP(p, + vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, + deref_1f(v0, c->offset[VERT_RESULT_EDGE]), + brw_imm_f(0)); + draw_point = brw_IF(p, BRW_EXECUTE_1); + { + if (do_offset) + apply_one_offset(c, v0); + + brw_clip_emit_vue(c, v0, 1, 0, (_3DPRIM_POINTLIST << 2) | R02_PRIM_START | R02_PRIM_END); + } + brw_ENDIF(p, draw_point); + + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); + } + brw_WHILE(p, loop); +} + + + + + + + +static void emit_primitives( struct brw_clip_compile *c, + GLuint mode, + GLboolean do_offset ) +{ + switch (mode) { + case CLIP_FILL: + brw_clip_tri_emit_polygon(c); + break; + + case CLIP_LINE: + emit_lines(c, do_offset); + break; + + case CLIP_POINT: + emit_points(c, do_offset); + break; + + case CLIP_CULL: + assert(0); + break; + } +} + + + +static void emit_unfilled_primitives( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *ccw; + + /* Direction culling has already been done. + */ + if (c->key.fill_ccw != c->key.fill_cw && + c->key.fill_ccw != CLIP_CULL && + c->key.fill_cw != CLIP_CULL) + { + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_GE, + get_element(c->reg.dir, 2), + brw_imm_f(0)); + + ccw = brw_IF(p, BRW_EXECUTE_1); + { + emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw); + } + ccw = brw_ELSE(p, ccw); + { + emit_primitives(c, c->key.fill_cw, c->key.offset_cw); + } + brw_ENDIF(p, ccw); + } + else if (c->key.fill_cw != CLIP_CULL) { + emit_primitives(c, c->key.fill_cw, c->key.offset_cw); + } + else if (c->key.fill_ccw != CLIP_CULL) { + emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw); + } +} + + + + +static void check_nr_verts( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *if_insn; + + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.nr_verts, brw_imm_d(3)); + if_insn = brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_kill_thread(c); + } + brw_ENDIF(p, if_insn); +} + + +void brw_emit_unfilled_clip( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *do_clip; + + + c->need_direction = ((c->key.offset_ccw || c->key.offset_cw) || + (c->key.fill_ccw != c->key.fill_cw) || + c->key.fill_ccw == CLIP_CULL || + c->key.fill_cw == CLIP_CULL || + c->key.copy_bfc_cw || + c->key.copy_bfc_ccw); + + brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6); + brw_clip_tri_init_vertices(c); + brw_clip_init_ff_sync(c); + + assert(c->offset[VERT_RESULT_EDGE]); + + if (c->key.fill_ccw == CLIP_CULL && + c->key.fill_cw == CLIP_CULL) { + brw_clip_kill_thread(c); + return; + } + + merge_edgeflags(c); + + /* Need to use the inlist indirection here: + */ + if (c->need_direction) + compute_tri_direction(c); + + if (c->key.fill_ccw == CLIP_CULL || + c->key.fill_cw == CLIP_CULL) + cull_direction(c); + + if (c->key.offset_ccw || + c->key.offset_cw) + compute_offset(c); + + if (c->key.copy_bfc_ccw || + c->key.copy_bfc_cw) + copy_bfc(c); + + /* Need to do this whether we clip or not: + */ + if (c->key.do_flat_shading) + brw_clip_tri_flat_shade(c); + + brw_clip_init_clipmask(c); + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0)); + do_clip = brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_init_planes(c); + brw_clip_tri(c); + check_nr_verts(c); + } + brw_ENDIF(p, do_clip); + + emit_unfilled_primitives(c); + brw_clip_kill_thread(c); +} + + + diff --git a/src/gallium/drivers/i965/brw_clip_util.c b/src/gallium/drivers/i965/brw_clip_util.c new file mode 100644 index 00000000000..5a73abdfee9 --- /dev/null +++ b/src/gallium/drivers/i965/brw_clip_util.c @@ -0,0 +1,396 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" +#include "shader/program.h" + +#include "intel_batchbuffer.h" + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_clip.h" + + + + +struct brw_reg get_tmp( struct brw_clip_compile *c ) +{ + struct brw_reg tmp = brw_vec4_grf(c->last_tmp, 0); + + if (++c->last_tmp > c->prog_data.total_grf) + c->prog_data.total_grf = c->last_tmp; + + return tmp; +} + +static void release_tmp( struct brw_clip_compile *c, struct brw_reg tmp ) +{ + if (tmp.nr == c->last_tmp-1) + c->last_tmp--; +} + + +static struct brw_reg make_plane_ud(GLuint x, GLuint y, GLuint z, GLuint w) +{ + return brw_imm_ud((w<<24) | (z<<16) | (y<<8) | x); +} + + +void brw_clip_init_planes( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + + if (!c->key.nr_userclip) { + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 0), make_plane_ud( 0, 0, 0xff, 1)); + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 1), make_plane_ud( 0, 0, 1, 1)); + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 2), make_plane_ud( 0, 0xff, 0, 1)); + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 3), make_plane_ud( 0, 1, 0, 1)); + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 4), make_plane_ud(0xff, 0, 0, 1)); + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 5), make_plane_ud( 1, 0, 0, 1)); + } +} + + + +#define W 3 + +/* Project 'pos' to screen space (or back again), overwrite with results: + */ +void brw_clip_project_position(struct brw_clip_compile *c, struct brw_reg pos ) +{ + struct brw_compile *p = &c->func; + + /* calc rhw + */ + brw_math_invert(p, get_element(pos, W), get_element(pos, W)); + + /* value.xyz *= value.rhw + */ + brw_set_access_mode(p, BRW_ALIGN_16); + brw_MUL(p, brw_writemask(pos, WRITEMASK_XYZ), pos, brw_swizzle1(pos, W)); + brw_set_access_mode(p, BRW_ALIGN_1); +} + + +static void brw_clip_project_vertex( struct brw_clip_compile *c, + struct brw_indirect vert_addr ) +{ + struct brw_compile *p = &c->func; + struct brw_reg tmp = get_tmp(c); + + /* Fixup position. Extract from the original vertex and re-project + * to screen space: + */ + brw_MOV(p, tmp, deref_4f(vert_addr, c->offset[VERT_RESULT_HPOS])); + brw_clip_project_position(c, tmp); + brw_MOV(p, deref_4f(vert_addr, c->header_position_offset), tmp); + + release_tmp(c, tmp); +} + + + + +/* Interpolate between two vertices and put the result into a0.0. + * Increment a0.0 accordingly. + */ +void brw_clip_interp_vertex( struct brw_clip_compile *c, + struct brw_indirect dest_ptr, + struct brw_indirect v0_ptr, /* from */ + struct brw_indirect v1_ptr, /* to */ + struct brw_reg t0, + GLboolean force_edgeflag) +{ + struct brw_compile *p = &c->func; + struct brw_reg tmp = get_tmp(c); + GLuint i; + + /* Just copy the vertex header: + */ + /* + * After CLIP stage, only first 256 bits of the VUE are read + * back on IGDNG, so needn't change it + */ + brw_copy_indirect_to_indirect(p, dest_ptr, v0_ptr, 1); + + /* Iterate over each attribute (could be done in pairs?) + */ + for (i = 0; i < c->nr_attrs; i++) { + GLuint delta = i*16 + 32; + + if (BRW_IS_IGDNG(p->brw)) + delta = i * 16 + 32 * 3; + + if (delta == c->offset[VERT_RESULT_EDGE]) { + if (force_edgeflag) + brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(1)); + else + brw_MOV(p, deref_4f(dest_ptr, delta), deref_4f(v0_ptr, delta)); + } + else { + /* Interpolate: + * + * New = attr0 + t*attr1 - t*attr0 + */ + brw_MUL(p, + vec4(brw_null_reg()), + deref_4f(v1_ptr, delta), + t0); + + brw_MAC(p, + tmp, + negate(deref_4f(v0_ptr, delta)), + t0); + + brw_ADD(p, + deref_4f(dest_ptr, delta), + deref_4f(v0_ptr, delta), + tmp); + } + } + + if (i & 1) { + GLuint delta = i*16 + 32; + + if (BRW_IS_IGDNG(p->brw)) + delta = i * 16 + 32 * 3; + + brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0)); + } + + release_tmp(c, tmp); + + /* Recreate the projected (NDC) coordinate in the new vertex + * header: + */ + brw_clip_project_vertex(c, dest_ptr ); +} + + + + +#define MAX_MRF 16 + +void brw_clip_emit_vue(struct brw_clip_compile *c, + struct brw_indirect vert, + GLboolean allocate, + GLboolean eot, + GLuint header) +{ + struct brw_compile *p = &c->func; + GLuint start = c->last_mrf; + + brw_clip_ff_sync(c); + + assert(!(allocate && eot)); + + /* Cycle through mrf regs - probably futile as we have to wait for + * the allocation response anyway. Also, the order this function + * is invoked doesn't correspond to the order the instructions will + * be executed, so it won't have any effect in many cases. + */ +#if 0 + if (start + c->nr_regs + 1 >= MAX_MRF) + start = 0; + + c->last_mrf = start + c->nr_regs + 1; +#endif + + /* Copy the vertex from vertn into m1..mN+1: + */ + brw_copy_from_indirect(p, brw_message_reg(start+1), vert, c->nr_regs); + + /* Overwrite PrimType and PrimStart in the message header, for + * each vertex in turn: + */ + brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header)); + + + /* Send each vertex as a seperate write to the urb. This + * is different to the concept in brw_sf_emit.c, where + * subsequent writes are used to build up a single urb + * entry. Each of these writes instantiates a seperate + * urb entry - (I think... what about 'allocate'?) + */ + brw_urb_WRITE(p, + allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), + start, + c->reg.R0, + allocate, + 1, /* used */ + c->nr_regs + 1, /* msg length */ + allocate ? 1 : 0, /* response_length */ + eot, /* eot */ + 1, /* writes_complete */ + 0, /* urb offset */ + BRW_URB_SWIZZLE_NONE); +} + + + +void brw_clip_kill_thread(struct brw_clip_compile *c) +{ + struct brw_compile *p = &c->func; + + brw_clip_ff_sync(c); + /* Send an empty message to kill the thread and release any + * allocated urb entry: + */ + brw_urb_WRITE(p, + retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), + 0, + c->reg.R0, + 0, /* allocate */ + 0, /* used */ + 1, /* msg len */ + 0, /* response len */ + 1, /* eot */ + 1, /* writes complete */ + 0, + BRW_URB_SWIZZLE_NONE); +} + + + + +struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c ) +{ + return brw_address(c->reg.fixed_planes); +} + + +struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c ) +{ + if (c->key.nr_userclip) { + return brw_imm_uw(16); + } + else { + return brw_imm_uw(4); + } +} + + +/* If flatshading, distribute color from provoking vertex prior to + * clipping. + */ +void brw_clip_copy_colors( struct brw_clip_compile *c, + GLuint to, GLuint from ) +{ + struct brw_compile *p = &c->func; + + if (c->offset[VERT_RESULT_COL0]) + brw_MOV(p, + byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_COL0]), + byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_COL0])); + + if (c->offset[VERT_RESULT_COL1]) + brw_MOV(p, + byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_COL1]), + byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_COL1])); + + if (c->offset[VERT_RESULT_BFC0]) + brw_MOV(p, + byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_BFC0]), + byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_BFC0])); + + if (c->offset[VERT_RESULT_BFC1]) + brw_MOV(p, + byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_BFC1]), + byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_BFC1])); +} + + + +void brw_clip_init_clipmask( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_reg incoming = get_element_ud(c->reg.R0, 2); + + /* Shift so that lowest outcode bit is rightmost: + */ + brw_SHR(p, c->reg.planemask, incoming, brw_imm_ud(26)); + + if (c->key.nr_userclip) { + struct brw_reg tmp = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UD); + + /* Rearrange userclip outcodes so that they come directly after + * the fixed plane bits. + */ + brw_AND(p, tmp, incoming, brw_imm_ud(0x3f<<14)); + brw_SHR(p, tmp, tmp, brw_imm_ud(8)); + brw_OR(p, c->reg.planemask, c->reg.planemask, tmp); + + release_tmp(c, tmp); + } +} + +void brw_clip_ff_sync(struct brw_clip_compile *c) +{ + if (c->need_ff_sync) { + struct brw_compile *p = &c->func; + struct brw_instruction *need_ff_sync; + + brw_set_conditionalmod(p, BRW_CONDITIONAL_Z); + brw_AND(p, brw_null_reg(), c->reg.ff_sync, brw_imm_ud(0x1)); + need_ff_sync = brw_IF(p, BRW_EXECUTE_1); + { + brw_OR(p, c->reg.ff_sync, c->reg.ff_sync, brw_imm_ud(0x1)); + brw_ff_sync(p, + c->reg.R0, + 0, + c->reg.R0, + 1, + 1, /* used */ + 1, /* msg length */ + 1, /* response length */ + 0, /* eot */ + 1, /* write compelete */ + 0, /* urb offset */ + BRW_URB_SWIZZLE_NONE); + } + brw_ENDIF(p, need_ff_sync); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + } +} + +void brw_clip_init_ff_sync(struct brw_clip_compile *c) +{ + if (c->need_ff_sync) { + struct brw_compile *p = &c->func; + + brw_MOV(p, c->reg.ff_sync, brw_imm_ud(0)); + } +} diff --git a/src/gallium/drivers/i965/brw_context.c b/src/gallium/drivers/i965/brw_context.c new file mode 100644 index 00000000000..c300c33adce --- /dev/null +++ b/src/gallium/drivers/i965/brw_context.c @@ -0,0 +1,173 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "main/imports.h" +#include "main/api_noop.h" +#include "main/macros.h" +#include "main/vtxfmt.h" +#include "main/simple_list.h" +#include "shader/shader_api.h" + +#include "brw_context.h" +#include "brw_defines.h" +#include "brw_draw.h" +#include "brw_state.h" +#include "brw_vs.h" +#include "intel_tex.h" +#include "intel_blit.h" +#include "intel_batchbuffer.h" +#include "intel_pixel.h" +#include "intel_span.h" +#include "tnl/t_pipeline.h" + +#include "utils.h" + + +/*************************************** + * Mesa's Driver Functions + ***************************************/ + +static void brwUseProgram(GLcontext *ctx, GLuint program) +{ + _mesa_use_program(ctx, program); +} + +static void brwInitProgFuncs( struct dd_function_table *functions ) +{ + functions->UseProgram = brwUseProgram; +} +static void brwInitDriverFunctions( struct dd_function_table *functions ) +{ + intelInitDriverFunctions( functions ); + + brwInitFragProgFuncs( functions ); + brwInitProgFuncs( functions ); + brw_init_queryobj_functions(functions); + + functions->Viewport = intel_viewport; +} + +GLboolean brwCreateContext( const __GLcontextModes *mesaVis, + __DRIcontextPrivate *driContextPriv, + void *sharedContextPrivate) +{ + struct dd_function_table functions; + struct brw_context *brw = (struct brw_context *) CALLOC_STRUCT(brw_context); + struct intel_context *intel = &brw->intel; + GLcontext *ctx = &intel->ctx; + + if (!brw) { + _mesa_printf("%s: failed to alloc context\n", __FUNCTION__); + return GL_FALSE; + } + + brwInitVtbl( brw ); + brwInitDriverFunctions( &functions ); + + if (!intelInitContext( intel, mesaVis, driContextPriv, + sharedContextPrivate, &functions )) { + _mesa_printf("%s: failed to init intel context\n", __FUNCTION__); + FREE(brw); + return GL_FALSE; + } + + /* Initialize swrast, tnl driver tables: */ + intelInitSpanFuncs(ctx); + + TNL_CONTEXT(ctx)->Driver.RunPipeline = _tnl_run_pipeline; + + ctx->Const.MaxTextureImageUnits = BRW_MAX_TEX_UNIT; + ctx->Const.MaxTextureCoordUnits = 8; /* Mesa limit */ + ctx->Const.MaxTextureUnits = MIN2(ctx->Const.MaxTextureCoordUnits, + ctx->Const.MaxTextureImageUnits); + ctx->Const.MaxVertexTextureImageUnits = 0; /* no vertex shader textures */ + + /* Mesa limits textures to 4kx4k; it would be nice to fix that someday + */ + ctx->Const.MaxTextureLevels = 13; + ctx->Const.Max3DTextureLevels = 9; + ctx->Const.MaxCubeTextureLevels = 12; + ctx->Const.MaxTextureRectSize = (1<<12); + + ctx->Const.MaxTextureMaxAnisotropy = 16.0; + + /* if conformance mode is set, swrast can handle any size AA point */ + ctx->Const.MaxPointSizeAA = 255.0; + + /* We want the GLSL compiler to emit code that uses condition codes */ + ctx->Shader.EmitCondCodes = GL_TRUE; + ctx->Shader.EmitNVTempInitialization = GL_TRUE; + + ctx->Const.VertexProgram.MaxNativeInstructions = (16 * 1024); + ctx->Const.VertexProgram.MaxAluInstructions = 0; + ctx->Const.VertexProgram.MaxTexInstructions = 0; + ctx->Const.VertexProgram.MaxTexIndirections = 0; + ctx->Const.VertexProgram.MaxNativeAluInstructions = 0; + ctx->Const.VertexProgram.MaxNativeTexInstructions = 0; + ctx->Const.VertexProgram.MaxNativeTexIndirections = 0; + ctx->Const.VertexProgram.MaxNativeAttribs = 16; + ctx->Const.VertexProgram.MaxNativeTemps = 256; + ctx->Const.VertexProgram.MaxNativeAddressRegs = 1; + ctx->Const.VertexProgram.MaxNativeParameters = 1024; + ctx->Const.VertexProgram.MaxEnvParams = + MIN2(ctx->Const.VertexProgram.MaxNativeParameters, + ctx->Const.VertexProgram.MaxEnvParams); + + ctx->Const.FragmentProgram.MaxNativeInstructions = (16 * 1024); + ctx->Const.FragmentProgram.MaxNativeAluInstructions = (16 * 1024); + ctx->Const.FragmentProgram.MaxNativeTexInstructions = (16 * 1024); + ctx->Const.FragmentProgram.MaxNativeTexIndirections = (16 * 1024); + ctx->Const.FragmentProgram.MaxNativeAttribs = 12; + ctx->Const.FragmentProgram.MaxNativeTemps = 256; + ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0; + ctx->Const.FragmentProgram.MaxNativeParameters = 1024; + ctx->Const.FragmentProgram.MaxEnvParams = + MIN2(ctx->Const.FragmentProgram.MaxNativeParameters, + ctx->Const.FragmentProgram.MaxEnvParams); + + brw_init_state( brw ); + + brw->state.dirty.mesa = ~0; + brw->state.dirty.brw = ~0; + + brw->emit_state_always = 0; + + ctx->VertexProgram._MaintainTnlProgram = GL_TRUE; + ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE; + + make_empty_list(&brw->query.active_head); + + brw_draw_init( brw ); + + return GL_TRUE; +} + diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h new file mode 100644 index 00000000000..fa3e32c7ff1 --- /dev/null +++ b/src/gallium/drivers/i965/brw_context.h @@ -0,0 +1,767 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#ifndef BRWCONTEXT_INC +#define BRWCONTEXT_INC + +#include "intel_context.h" +#include "brw_structs.h" +#include "main/imports.h" + + +/* Glossary: + * + * URB - uniform resource buffer. A mid-sized buffer which is + * partitioned between the fixed function units and used for passing + * values (vertices, primitives, constants) between them. + * + * CURBE - constant URB entry. An urb region (entry) used to hold + * constant values which the fixed function units can be instructed to + * preload into the GRF when spawning a thread. + * + * VUE - vertex URB entry. An urb entry holding a vertex and usually + * a vertex header. The header contains control information and + * things like primitive type, Begin/end flags and clip codes. + * + * PUE - primitive URB entry. An urb entry produced by the setup (SF) + * unit holding rasterization and interpolation parameters. + * + * GRF - general register file. One of several register files + * addressable by programmed threads. The inputs (r0, payload, curbe, + * urb) of the thread are preloaded to this area before the thread is + * spawned. The registers are individually 8 dwords wide and suitable + * for general usage. Registers holding thread input values are not + * special and may be overwritten. + * + * MRF - message register file. Threads communicate (and terminate) + * by sending messages. Message parameters are placed in contiguous + * MRF registers. All program output is via these messages. URB + * entries are populated by sending a message to the shared URB + * function containing the new data, together with a control word, + * often an unmodified copy of R0. + * + * R0 - GRF register 0. Typically holds control information used when + * sending messages to other threads. + * + * EU or GEN4 EU: The name of the programmable subsystem of the + * i965 hardware. Threads are executed by the EU, the registers + * described above are part of the EU architecture. + * + * Fixed function units: + * + * CS - Command streamer. Notional first unit, little software + * interaction. Holds the URB entries used for constant data, ie the + * CURBEs. + * + * VF/VS - Vertex Fetch / Vertex Shader. The fixed function part of + * this unit is responsible for pulling vertices out of vertex buffers + * in vram and injecting them into the processing pipe as VUEs. If + * enabled, it first passes them to a VS thread which is a good place + * for the driver to implement any active vertex shader. + * + * GS - Geometry Shader. This corresponds to a new DX10 concept. If + * enabled, incoming strips etc are passed to GS threads in individual + * line/triangle/point units. The GS thread may perform arbitary + * computation and emit whatever primtives with whatever vertices it + * chooses. This makes GS an excellent place to implement GL's + * unfilled polygon modes, though of course it is capable of much + * more. Additionally, GS is used to translate away primitives not + * handled by latter units, including Quads and Lineloops. + * + * CS - Clipper. Mesa's clipping algorithms are imported to run on + * this unit. The fixed function part performs cliptesting against + * the 6 fixed clipplanes and makes descisions on whether or not the + * incoming primitive needs to be passed to a thread for clipping. + * User clip planes are handled via cooperation with the VS thread. + * + * SF - Strips Fans or Setup: Triangles are prepared for + * rasterization. Interpolation coefficients are calculated. + * Flatshading and two-side lighting usually performed here. + * + * WM - Windower. Interpolation of vertex attributes performed here. + * Fragment shader implemented here. SIMD aspects of EU taken full + * advantage of, as pixels are processed in blocks of 16. + * + * CC - Color Calculator. No EU threads associated with this unit. + * Handles blending and (presumably) depth and stencil testing. + */ + +#define BRW_FALLBACK_TEXTURE 0x1 +#define BRW_MAX_CURBE (32*16) + +struct brw_context; + +#define BRW_NEW_URB_FENCE 0x1 +#define BRW_NEW_FRAGMENT_PROGRAM 0x2 +#define BRW_NEW_VERTEX_PROGRAM 0x4 +#define BRW_NEW_INPUT_DIMENSIONS 0x8 +#define BRW_NEW_CURBE_OFFSETS 0x10 +#define BRW_NEW_REDUCED_PRIMITIVE 0x20 +#define BRW_NEW_PRIMITIVE 0x40 +#define BRW_NEW_CONTEXT 0x80 +#define BRW_NEW_WM_INPUT_DIMENSIONS 0x100 +#define BRW_NEW_PSP 0x800 +#define BRW_NEW_WM_SURFACES 0x1000 +#define BRW_NEW_FENCE 0x2000 +#define BRW_NEW_INDICES 0x4000 +#define BRW_NEW_VERTICES 0x8000 +/** + * Used for any batch entry with a relocated pointer that will be used + * by any 3D rendering. + */ +#define BRW_NEW_BATCH 0x10000 +/** brw->depth_region updated */ +#define BRW_NEW_DEPTH_BUFFER 0x20000 +#define BRW_NEW_NR_WM_SURFACES 0x40000 +#define BRW_NEW_NR_VS_SURFACES 0x80000 +#define BRW_NEW_INDEX_BUFFER 0x100000 + +struct brw_state_flags { + /** State update flags signalled by mesa internals */ + GLuint mesa; + /** + * State update flags signalled as the result of brw_tracked_state updates + */ + GLuint brw; + /** State update flags signalled by brw_state_cache.c searches */ + GLuint cache; +}; + + +/** Subclass of Mesa vertex program */ +struct brw_vertex_program { + struct gl_vertex_program program; + GLuint id; + dri_bo *const_buffer; /** Program constant buffer/surface */ + GLboolean use_const_buffer; +}; + + +/** Subclass of Mesa fragment program */ +struct brw_fragment_program { + struct gl_fragment_program program; + GLuint id; /**< serial no. to identify frag progs, never re-used */ + GLboolean isGLSL; /**< really, any IF/LOOP/CONT/BREAK instructions */ + + dri_bo *const_buffer; /** Program constant buffer/surface */ + GLboolean use_const_buffer; + + /** for debugging, which texture units are referenced */ + GLbitfield tex_units_used; +}; + + +/* Data about a particular attempt to compile a program. Note that + * there can be many of these, each in a different GL state + * corresponding to a different brw_wm_prog_key struct, with different + * compiled programs: + */ +struct brw_wm_prog_data { + GLuint curb_read_length; + GLuint urb_read_length; + + GLuint first_curbe_grf; + GLuint total_grf; + GLuint total_scratch; + + GLuint nr_params; /**< number of float params/constants */ + GLboolean error; + + /* Pointer to tracked values (only valid once + * _mesa_load_state_parameters has been called at runtime). + */ + const GLfloat *param[BRW_MAX_CURBE]; +}; + +struct brw_sf_prog_data { + GLuint urb_read_length; + GLuint total_grf; + + /* Each vertex may have upto 12 attributes, 4 components each, + * except WPOS which requires only 2. (11*4 + 2) == 44 ==> 11 + * rows. + * + * Actually we use 4 for each, so call it 12 rows. + */ + GLuint urb_entry_size; +}; + +struct brw_clip_prog_data { + GLuint curb_read_length; /* user planes? */ + GLuint clip_mode; + GLuint urb_read_length; + GLuint total_grf; +}; + +struct brw_gs_prog_data { + GLuint urb_read_length; + GLuint total_grf; +}; + +struct brw_vs_prog_data { + GLuint curb_read_length; + GLuint urb_read_length; + GLuint total_grf; + GLuint outputs_written; + GLuint nr_params; /**< number of float params/constants */ + + GLuint inputs_read; + + /* Used for calculating urb partitions: + */ + GLuint urb_entry_size; +}; + + +/* Size == 0 if output either not written, or always [0,0,0,1] + */ +struct brw_vs_ouput_sizes { + GLubyte output_size[VERT_RESULT_MAX]; +}; + + +/** Number of texture sampler units */ +#define BRW_MAX_TEX_UNIT 16 + +/** + * Size of our surface binding table for the WM. + * This contains pointers to the drawing surfaces and current texture + * objects and shader constant buffers (+2). + */ +#define BRW_WM_MAX_SURF (MAX_DRAW_BUFFERS + BRW_MAX_TEX_UNIT + 1) + +/** + * Helpers to convert drawing buffers, textures and constant buffers + * to surface binding table indexes, for WM. + */ +#define SURF_INDEX_DRAW(d) (d) +#define SURF_INDEX_FRAG_CONST_BUFFER (MAX_DRAW_BUFFERS) +#define SURF_INDEX_TEXTURE(t) (MAX_DRAW_BUFFERS + 1 + (t)) + +/** + * Size of surface binding table for the VS. + * Only one constant buffer for now. + */ +#define BRW_VS_MAX_SURF 1 + +/** + * Only a VS constant buffer + */ +#define SURF_INDEX_VERT_CONST_BUFFER 0 + + +enum brw_cache_id { + BRW_CC_VP, + BRW_CC_UNIT, + BRW_WM_PROG, + BRW_SAMPLER_DEFAULT_COLOR, + BRW_SAMPLER, + BRW_WM_UNIT, + BRW_SF_PROG, + BRW_SF_VP, + BRW_SF_UNIT, + BRW_VS_UNIT, + BRW_VS_PROG, + BRW_GS_UNIT, + BRW_GS_PROG, + BRW_CLIP_VP, + BRW_CLIP_UNIT, + BRW_CLIP_PROG, + BRW_SS_SURFACE, + BRW_SS_SURF_BIND, + + BRW_MAX_CACHE +}; + +struct brw_cache_item { + /** + * Effectively part of the key, cache_id identifies what kind of state + * buffer is involved, and also which brw->state.dirty.cache flag should + * be set when this cache item is chosen. + */ + enum brw_cache_id cache_id; + /** 32-bit hash of the key data */ + GLuint hash; + GLuint key_size; /* for variable-sized keys */ + const void *key; + dri_bo **reloc_bufs; + GLuint nr_reloc_bufs; + + dri_bo *bo; + GLuint data_size; + + struct brw_cache_item *next; +}; + + + +struct brw_cache { + struct brw_context *brw; + + struct brw_cache_item **items; + GLuint size, n_items; + + GLuint key_size[BRW_MAX_CACHE]; /* for fixed-size keys */ + GLuint aux_size[BRW_MAX_CACHE]; + char *name[BRW_MAX_CACHE]; + + /* Record of the last BOs chosen for each cache_id. Used to set + * brw->state.dirty.cache when a new cache item is chosen. + */ + dri_bo *last_bo[BRW_MAX_CACHE]; +}; + + +/* Considered adding a member to this struct to document which flags + * an update might raise so that ordering of the state atoms can be + * checked or derived at runtime. Dropped the idea in favor of having + * a debug mode where the state is monitored for flags which are + * raised that have already been tested against. + */ +struct brw_tracked_state { + struct brw_state_flags dirty; + void (*prepare)( struct brw_context *brw ); + void (*emit)( struct brw_context *brw ); +}; + +/* Flags for brw->state.cache. + */ +#define CACHE_NEW_CC_VP (1<<BRW_CC_VP) +#define CACHE_NEW_CC_UNIT (1<<BRW_CC_UNIT) +#define CACHE_NEW_WM_PROG (1<<BRW_WM_PROG) +#define CACHE_NEW_SAMPLER_DEFAULT_COLOR (1<<BRW_SAMPLER_DEFAULT_COLOR) +#define CACHE_NEW_SAMPLER (1<<BRW_SAMPLER) +#define CACHE_NEW_WM_UNIT (1<<BRW_WM_UNIT) +#define CACHE_NEW_SF_PROG (1<<BRW_SF_PROG) +#define CACHE_NEW_SF_VP (1<<BRW_SF_VP) +#define CACHE_NEW_SF_UNIT (1<<BRW_SF_UNIT) +#define CACHE_NEW_VS_UNIT (1<<BRW_VS_UNIT) +#define CACHE_NEW_VS_PROG (1<<BRW_VS_PROG) +#define CACHE_NEW_GS_UNIT (1<<BRW_GS_UNIT) +#define CACHE_NEW_GS_PROG (1<<BRW_GS_PROG) +#define CACHE_NEW_CLIP_VP (1<<BRW_CLIP_VP) +#define CACHE_NEW_CLIP_UNIT (1<<BRW_CLIP_UNIT) +#define CACHE_NEW_CLIP_PROG (1<<BRW_CLIP_PROG) +#define CACHE_NEW_SURFACE (1<<BRW_SS_SURFACE) +#define CACHE_NEW_SURF_BIND (1<<BRW_SS_SURF_BIND) + +struct brw_cached_batch_item { + struct header *header; + GLuint sz; + struct brw_cached_batch_item *next; +}; + + + +/* Protect against a future where VERT_ATTRIB_MAX > 32. Wouldn't life + * be easier if C allowed arrays of packed elements? + */ +#define ATTRIB_BIT_DWORDS ((VERT_ATTRIB_MAX+31)/32) + +struct brw_vertex_element { + const struct gl_client_array *glarray; + + /** The corresponding Mesa vertex attribute */ + gl_vert_attrib attrib; + /** Size of a complete element */ + GLuint element_size; + /** Number of uploaded elements for this input. */ + GLuint count; + /** Byte stride between elements in the uploaded array */ + GLuint stride; + /** Offset of the first element within the buffer object */ + unsigned int offset; + /** Buffer object containing the uploaded vertex data */ + dri_bo *bo; +}; + + + +struct brw_vertex_info { + GLuint sizes[ATTRIB_BIT_DWORDS * 2]; /* sizes:2[VERT_ATTRIB_MAX] */ +}; + + + + +/* Cache for TNL programs. + */ +struct brw_tnl_cache_item { + GLuint hash; + void *key; + void *data; + struct brw_tnl_cache_item *next; +}; + +struct brw_tnl_cache { + struct brw_tnl_cache_item **items; + GLuint size, n_items; +}; + +struct brw_query_object { + struct gl_query_object Base; + + /** Doubly linked list of active query objects in the context. */ + struct brw_query_object *prev, *next; + + /** Last query BO associated with this query. */ + dri_bo *bo; + /** First index in bo with query data for this object. */ + int first_index; + /** Last index in bo with query data for this object. */ + int last_index; + + /* Total count of pixels from previous BOs */ + unsigned int count; +}; + + +/** + * brw_context is derived from intel_context. + */ +struct brw_context +{ + struct intel_context intel; /**< base class, must be first field */ + GLuint primitive; + + GLboolean emit_state_always; + GLboolean tmp_fallback; + GLboolean no_batch_wrap; + + struct { + struct brw_state_flags dirty; + + GLuint nr_color_regions; + struct intel_region *color_regions[MAX_DRAW_BUFFERS]; + struct intel_region *depth_region; + + /** + * List of buffers accumulated in brw_validate_state to receive + * dri_bo_check_aperture treatment before exec, so we can know if we + * should flush the batch and try again before emitting primitives. + * + * This can be a fixed number as we only have a limited number of + * objects referenced from the batchbuffer in a primitive emit, + * consisting of the vertex buffers, pipelined state pointers, + * the CURBE, the depth buffer, and a query BO. + */ + dri_bo *validated_bos[VERT_ATTRIB_MAX + 16]; + int validated_bo_count; + } state; + + struct brw_cache cache; /** non-surface items */ + struct brw_cache surface_cache; /* surface items */ + struct brw_cached_batch_item *cached_batch_items; + + struct { + struct brw_vertex_element inputs[VERT_ATTRIB_MAX]; + + struct brw_vertex_element *enabled[VERT_ATTRIB_MAX]; + GLuint nr_enabled; + +#define BRW_NR_UPLOAD_BUFS 17 +#define BRW_UPLOAD_INIT_SIZE (128*1024) + + struct { + dri_bo *bo; + GLuint offset; + } upload; + + /* Summary of size and varying of active arrays, so we can check + * for changes to this state: + */ + struct brw_vertex_info info; + unsigned int min_index, max_index; + } vb; + + struct { + /** + * Index buffer for this draw_prims call. + * + * Updates are signaled by BRW_NEW_INDICES. + */ + const struct _mesa_index_buffer *ib; + + /* Updates to these fields are signaled by BRW_NEW_INDEX_BUFFER. */ + dri_bo *bo; + unsigned int offset; + unsigned int size; + /* Offset to index buffer index to use in CMD_3D_PRIM so that we can + * avoid re-uploading the IB packet over and over if we're actually + * referencing the same index buffer. + */ + unsigned int start_vertex_offset; + } ib; + + /* Active vertex program: + */ + const struct gl_vertex_program *vertex_program; + const struct gl_fragment_program *fragment_program; + + + /* For populating the gtt: + */ + GLuint next_free_page; + + + /* BRW_NEW_URB_ALLOCATIONS: + */ + struct { + GLuint vsize; /* vertex size plus header in urb registers */ + GLuint csize; /* constant buffer size in urb registers */ + GLuint sfsize; /* setup data size in urb registers */ + + GLboolean constrained; + + GLuint nr_vs_entries; + GLuint nr_gs_entries; + GLuint nr_clip_entries; + GLuint nr_sf_entries; + GLuint nr_cs_entries; + +/* GLuint vs_size; */ +/* GLuint gs_size; */ +/* GLuint clip_size; */ +/* GLuint sf_size; */ +/* GLuint cs_size; */ + + GLuint vs_start; + GLuint gs_start; + GLuint clip_start; + GLuint sf_start; + GLuint cs_start; + } urb; + + + /* BRW_NEW_CURBE_OFFSETS: + */ + struct { + GLuint wm_start; /**< pos of first wm const in CURBE buffer */ + GLuint wm_size; /**< number of float[4] consts, multiple of 16 */ + GLuint clip_start; + GLuint clip_size; + GLuint vs_start; + GLuint vs_size; + GLuint total_size; + + dri_bo *curbe_bo; + /** Offset within curbe_bo of space for current curbe entry */ + GLuint curbe_offset; + /** Offset within curbe_bo of space for next curbe entry */ + GLuint curbe_next_offset; + + GLfloat *last_buf; + GLuint last_bufsz; + /** + * Whether we should create a new bo instead of reusing the old one + * (if we just dispatch the batch pointing at the old one. + */ + GLboolean need_new_bo; + } curbe; + + struct { + struct brw_vs_prog_data *prog_data; + + dri_bo *prog_bo; + dri_bo *state_bo; + + /** Binding table of pointers to surf_bo entries */ + dri_bo *bind_bo; + dri_bo *surf_bo[BRW_VS_MAX_SURF]; + GLuint nr_surfaces; + } vs; + + struct { + struct brw_gs_prog_data *prog_data; + + GLboolean prog_active; + dri_bo *prog_bo; + dri_bo *state_bo; + } gs; + + struct { + struct brw_clip_prog_data *prog_data; + + dri_bo *prog_bo; + dri_bo *state_bo; + dri_bo *vp_bo; + } clip; + + + struct { + struct brw_sf_prog_data *prog_data; + + dri_bo *prog_bo; + dri_bo *state_bo; + dri_bo *vp_bo; + } sf; + + struct { + struct brw_wm_prog_data *prog_data; + struct brw_wm_compile *compile_data; + + /** Input sizes, calculated from active vertex program. + * One bit per fragment program input attribute. + */ + GLbitfield input_size_masks[4]; + + /** Array of surface default colors (texture border color) */ + dri_bo *sdc_bo[BRW_MAX_TEX_UNIT]; + + GLuint render_surf; + GLuint nr_surfaces; + + GLuint max_threads; + dri_bo *scratch_bo; + + GLuint sampler_count; + dri_bo *sampler_bo; + + /** Binding table of pointers to surf_bo entries */ + dri_bo *bind_bo; + dri_bo *surf_bo[BRW_WM_MAX_SURF]; + + dri_bo *prog_bo; + dri_bo *state_bo; + } wm; + + + struct { + dri_bo *prog_bo; + dri_bo *state_bo; + dri_bo *vp_bo; + } cc; + + struct { + struct brw_query_object active_head; + dri_bo *bo; + int index; + GLboolean active; + } query; + /* Used to give every program string a unique id + */ + GLuint program_id; +}; + + +#define BRW_PACKCOLOR8888(r,g,b,a) ((r<<24) | (g<<16) | (b<<8) | a) + + + +/*====================================================================== + * brw_vtbl.c + */ +void brwInitVtbl( struct brw_context *brw ); + +/*====================================================================== + * brw_context.c + */ +GLboolean brwCreateContext( const __GLcontextModes *mesaVis, + __DRIcontextPrivate *driContextPriv, + void *sharedContextPrivate); + +/*====================================================================== + * brw_queryobj.c + */ +void brw_init_queryobj_functions(struct dd_function_table *functions); +void brw_prepare_query_begin(struct brw_context *brw); +void brw_emit_query_begin(struct brw_context *brw); +void brw_emit_query_end(struct brw_context *brw); + +/*====================================================================== + * brw_state_dump.c + */ +void brw_debug_batch(struct intel_context *intel); + +/*====================================================================== + * brw_tex.c + */ +void brw_validate_textures( struct brw_context *brw ); + + +/*====================================================================== + * brw_program.c + */ +void brwInitFragProgFuncs( struct dd_function_table *functions ); + + +/* brw_urb.c + */ +void brw_upload_urb_fence(struct brw_context *brw); + +/* brw_curbe.c + */ +void brw_upload_cs_urb_state(struct brw_context *brw); + +/* brw_disasm.c */ +int brw_disasm (FILE *file, struct brw_instruction *inst); + +/*====================================================================== + * Inline conversion functions. These are better-typed than the + * macros used previously: + */ +static INLINE struct brw_context * +brw_context( GLcontext *ctx ) +{ + return (struct brw_context *)ctx; +} + +static INLINE struct brw_vertex_program * +brw_vertex_program(struct gl_vertex_program *p) +{ + return (struct brw_vertex_program *) p; +} + +static INLINE const struct brw_vertex_program * +brw_vertex_program_const(const struct gl_vertex_program *p) +{ + return (const struct brw_vertex_program *) p; +} + +static INLINE struct brw_fragment_program * +brw_fragment_program(struct gl_fragment_program *p) +{ + return (struct brw_fragment_program *) p; +} + +static INLINE const struct brw_fragment_program * +brw_fragment_program_const(const struct gl_fragment_program *p) +{ + return (const struct brw_fragment_program *) p; +} + + + +#define DO_SETUP_BITS ((1<<(FRAG_ATTRIB_MAX)) - 1) + +#endif + diff --git a/src/gallium/drivers/i965/brw_curbe.c b/src/gallium/drivers/i965/brw_curbe.c new file mode 100644 index 00000000000..4be6c77aa1e --- /dev/null +++ b/src/gallium/drivers/i965/brw_curbe.c @@ -0,0 +1,376 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + + +#include "main/glheader.h" +#include "main/context.h" +#include "main/macros.h" +#include "main/enums.h" +#include "shader/prog_parameter.h" +#include "shader/prog_print.h" +#include "shader/prog_statevars.h" +#include "intel_batchbuffer.h" +#include "intel_regions.h" +#include "brw_context.h" +#include "brw_defines.h" +#include "brw_state.h" +#include "brw_util.h" + + +/** + * Partition the CURBE between the various users of constant values: + * Note that vertex and fragment shaders can now fetch constants out + * of constant buffers. We no longer allocatea block of the GRF for + * constants. That greatly reduces the demand for space in the CURBE. + * Some of the comments within are dated... + */ +static void calculate_curbe_offsets( struct brw_context *brw ) +{ + GLcontext *ctx = &brw->intel.ctx; + /* CACHE_NEW_WM_PROG */ + const GLuint nr_fp_regs = (brw->wm.prog_data->nr_params + 15) / 16; + + /* BRW_NEW_VERTEX_PROGRAM */ + const GLuint nr_vp_regs = (brw->vs.prog_data->nr_params + 15) / 16; + GLuint nr_clip_regs = 0; + GLuint total_regs; + + /* _NEW_TRANSFORM */ + if (ctx->Transform.ClipPlanesEnabled) { + GLuint nr_planes = 6 + brw_count_bits(ctx->Transform.ClipPlanesEnabled); + nr_clip_regs = (nr_planes * 4 + 15) / 16; + } + + + total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs; + + /* This can happen - what to do? Probably rather than falling + * back, the best thing to do is emit programs which code the + * constants as immediate values. Could do this either as a static + * cap on WM and VS, or adaptively. + * + * Unfortunately, this is currently dependent on the results of the + * program generation process (in the case of wm), so this would + * introduce the need to re-generate programs in the event of a + * curbe allocation failure. + */ + /* Max size is 32 - just large enough to + * hold the 128 parameters allowed by + * the fragment and vertex program + * api's. It's not clear what happens + * when both VP and FP want to use 128 + * parameters, though. + */ + assert(total_regs <= 32); + + /* Lazy resize: + */ + if (nr_fp_regs > brw->curbe.wm_size || + nr_vp_regs > brw->curbe.vs_size || + nr_clip_regs != brw->curbe.clip_size || + (total_regs < brw->curbe.total_size / 4 && + brw->curbe.total_size > 16)) { + + GLuint reg = 0; + + /* Calculate a new layout: + */ + reg = 0; + brw->curbe.wm_start = reg; + brw->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs; + brw->curbe.clip_start = reg; + brw->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs; + brw->curbe.vs_start = reg; + brw->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs; + brw->curbe.total_size = reg; + + if (0) + _mesa_printf("curbe wm %d+%d clip %d+%d vs %d+%d\n", + brw->curbe.wm_start, + brw->curbe.wm_size, + brw->curbe.clip_start, + brw->curbe.clip_size, + brw->curbe.vs_start, + brw->curbe.vs_size ); + + brw->state.dirty.brw |= BRW_NEW_CURBE_OFFSETS; + } +} + + +const struct brw_tracked_state brw_curbe_offsets = { + .dirty = { + .mesa = _NEW_TRANSFORM, + .brw = BRW_NEW_VERTEX_PROGRAM, + .cache = CACHE_NEW_WM_PROG + }, + .prepare = calculate_curbe_offsets +}; + + + + +/* Define the number of curbes within CS's urb allocation. Multiple + * urb entries -> multiple curbes. These will be used by + * fixed-function hardware in a double-buffering scheme to avoid a + * pipeline stall each time the contents of the curbe is changed. + */ +void brw_upload_cs_urb_state(struct brw_context *brw) +{ + struct brw_cs_urb_state cs_urb; + memset(&cs_urb, 0, sizeof(cs_urb)); + + /* It appears that this is the state packet for the CS unit, ie. the + * urb entries detailed here are housed in the CS range from the + * URB_FENCE command. + */ + cs_urb.header.opcode = CMD_CS_URB_STATE; + cs_urb.header.length = sizeof(cs_urb)/4 - 2; + + /* BRW_NEW_URB_FENCE */ + cs_urb.bits0.nr_urb_entries = brw->urb.nr_cs_entries; + cs_urb.bits0.urb_entry_size = brw->urb.csize - 1; + + assert(brw->urb.nr_cs_entries); + BRW_CACHED_BATCH_STRUCT(brw, &cs_urb); +} + +static GLfloat fixed_plane[6][4] = { + { 0, 0, -1, 1 }, + { 0, 0, 1, 1 }, + { 0, -1, 0, 1 }, + { 0, 1, 0, 1 }, + {-1, 0, 0, 1 }, + { 1, 0, 0, 1 } +}; + +/* Upload a new set of constants. Too much variability to go into the + * cache mechanism, but maybe would benefit from a comparison against + * the current uploaded set of constants. + */ +static void prepare_constant_buffer(struct brw_context *brw) +{ + GLcontext *ctx = &brw->intel.ctx; + const struct brw_vertex_program *vp = + brw_vertex_program_const(brw->vertex_program); + const struct brw_fragment_program *fp = + brw_fragment_program_const(brw->fragment_program); + const GLuint sz = brw->curbe.total_size; + const GLuint bufsz = sz * 16 * sizeof(GLfloat); + GLfloat *buf; + GLuint i; + + if (sz == 0) { + if (brw->curbe.last_buf) { + free(brw->curbe.last_buf); + brw->curbe.last_buf = NULL; + brw->curbe.last_bufsz = 0; + } + return; + } + + buf = (GLfloat *) _mesa_calloc(bufsz); + + /* fragment shader constants */ + if (brw->curbe.wm_size) { + GLuint offset = brw->curbe.wm_start * 16; + + _mesa_load_state_parameters(ctx, fp->program.Base.Parameters); + + /* copy float constants */ + for (i = 0; i < brw->wm.prog_data->nr_params; i++) + buf[offset + i] = *brw->wm.prog_data->param[i]; + } + + + /* The clipplanes are actually delivered to both CLIP and VS units. + * VS uses them to calculate the outcode bitmasks. + */ + if (brw->curbe.clip_size) { + GLuint offset = brw->curbe.clip_start * 16; + GLuint j; + + /* If any planes are going this way, send them all this way: + */ + for (i = 0; i < 6; i++) { + buf[offset + i * 4 + 0] = fixed_plane[i][0]; + buf[offset + i * 4 + 1] = fixed_plane[i][1]; + buf[offset + i * 4 + 2] = fixed_plane[i][2]; + buf[offset + i * 4 + 3] = fixed_plane[i][3]; + } + + /* Clip planes: _NEW_TRANSFORM plus _NEW_PROJECTION to get to + * clip-space: + */ + assert(MAX_CLIP_PLANES == 6); + for (j = 0; j < MAX_CLIP_PLANES; j++) { + if (ctx->Transform.ClipPlanesEnabled & (1<<j)) { + buf[offset + i * 4 + 0] = ctx->Transform._ClipUserPlane[j][0]; + buf[offset + i * 4 + 1] = ctx->Transform._ClipUserPlane[j][1]; + buf[offset + i * 4 + 2] = ctx->Transform._ClipUserPlane[j][2]; + buf[offset + i * 4 + 3] = ctx->Transform._ClipUserPlane[j][3]; + i++; + } + } + } + + /* vertex shader constants */ + if (brw->curbe.vs_size) { + GLuint offset = brw->curbe.vs_start * 16; + GLuint nr = brw->vs.prog_data->nr_params / 4; + + if (brw->vertex_program->IsNVProgram) + _mesa_load_tracked_matrices(ctx); + + /* Updates the ParamaterValues[i] pointers for all parameters of the + * basic type of PROGRAM_STATE_VAR. + */ + _mesa_load_state_parameters(ctx, vp->program.Base.Parameters); + + /* XXX just use a memcpy here */ + for (i = 0; i < nr; i++) { + const GLfloat *value = vp->program.Base.Parameters->ParameterValues[i]; + buf[offset + i * 4 + 0] = value[0]; + buf[offset + i * 4 + 1] = value[1]; + buf[offset + i * 4 + 2] = value[2]; + buf[offset + i * 4 + 3] = value[3]; + } + } + + if (0) { + for (i = 0; i < sz*16; i+=4) + _mesa_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4, + buf[i+0], buf[i+1], buf[i+2], buf[i+3]); + + _mesa_printf("last_buf %p buf %p sz %d/%d cmp %d\n", + brw->curbe.last_buf, buf, + bufsz, brw->curbe.last_bufsz, + brw->curbe.last_buf ? memcmp(buf, brw->curbe.last_buf, bufsz) : -1); + } + + if (brw->curbe.curbe_bo != NULL && + brw->curbe.last_buf && + bufsz == brw->curbe.last_bufsz && + memcmp(buf, brw->curbe.last_buf, bufsz) == 0) { + /* constants have not changed */ + _mesa_free(buf); + } + else { + /* constants have changed */ + if (brw->curbe.last_buf) + _mesa_free(brw->curbe.last_buf); + + brw->curbe.last_buf = buf; + brw->curbe.last_bufsz = bufsz; + + if (brw->curbe.curbe_bo != NULL && + (brw->curbe.need_new_bo || + brw->curbe.curbe_next_offset + bufsz > brw->curbe.curbe_bo->size)) + { + dri_bo_unreference(brw->curbe.curbe_bo); + brw->curbe.curbe_bo = NULL; + } + + if (brw->curbe.curbe_bo == NULL) { + /* Allocate a single page for CURBE entries for this batchbuffer. + * They're generally around 64b. + */ + brw->curbe.curbe_bo = dri_bo_alloc(brw->intel.bufmgr, "CURBE", + 4096, 1 << 6); + brw->curbe.curbe_next_offset = 0; + } + + brw->curbe.curbe_offset = brw->curbe.curbe_next_offset; + brw->curbe.curbe_next_offset += bufsz; + brw->curbe.curbe_next_offset = ALIGN(brw->curbe.curbe_next_offset, 64); + + /* Copy data to the buffer: + */ + dri_bo_subdata(brw->curbe.curbe_bo, brw->curbe.curbe_offset, bufsz, buf); + } + + brw_add_validated_bo(brw, brw->curbe.curbe_bo); + + /* Because this provokes an action (ie copy the constants into the + * URB), it shouldn't be shortcircuited if identical to the + * previous time - because eg. the urb destination may have + * changed, or the urb contents different to last time. + * + * Note that the data referred to is actually copied internally, + * not just used in place according to passed pointer. + * + * It appears that the CS unit takes care of using each available + * URB entry (Const URB Entry == CURBE) in turn, and issuing + * flushes as necessary when doublebuffering of CURBEs isn't + * possible. + */ +} + +static void emit_constant_buffer(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + GLuint sz = brw->curbe.total_size; + + BEGIN_BATCH(2, IGNORE_CLIPRECTS); + if (sz == 0) { + OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2)); + OUT_BATCH(0); + } else { + OUT_BATCH((CMD_CONST_BUFFER << 16) | (1 << 8) | (2 - 2)); + OUT_RELOC(brw->curbe.curbe_bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + (sz - 1) + brw->curbe.curbe_offset); + } + ADVANCE_BATCH(); +} + +/* This tracked state is unique in that the state it monitors varies + * dynamically depending on the parameters tracked by the fragment and + * vertex programs. This is the template used as a starting point, + * each context will maintain a copy of this internally and update as + * required. + */ +const struct brw_tracked_state brw_constant_buffer = { + .dirty = { + .mesa = _NEW_PROGRAM_CONSTANTS, + .brw = (BRW_NEW_FRAGMENT_PROGRAM | + BRW_NEW_VERTEX_PROGRAM | + BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */ + BRW_NEW_PSP | /* Implicit - hardware requires this, not used above */ + BRW_NEW_CURBE_OFFSETS | + BRW_NEW_BATCH), + .cache = (CACHE_NEW_WM_PROG) + }, + .prepare = prepare_constant_buffer, + .emit = emit_constant_buffer, +}; + diff --git a/src/gallium/drivers/i965/brw_defines.h b/src/gallium/drivers/i965/brw_defines.h new file mode 100644 index 00000000000..78d457ad2be --- /dev/null +++ b/src/gallium/drivers/i965/brw_defines.h @@ -0,0 +1,851 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#ifndef BRW_DEFINES_H +#define BRW_DEFINES_H + +/* 3D state: + */ +#define _3DOP_3DSTATE_PIPELINED 0x0 +#define _3DOP_3DSTATE_NONPIPELINED 0x1 +#define _3DOP_3DCONTROL 0x2 +#define _3DOP_3DPRIMITIVE 0x3 + +#define _3DSTATE_PIPELINED_POINTERS 0x00 +#define _3DSTATE_BINDING_TABLE_POINTERS 0x01 +#define _3DSTATE_VERTEX_BUFFERS 0x08 +#define _3DSTATE_VERTEX_ELEMENTS 0x09 +#define _3DSTATE_INDEX_BUFFER 0x0A +#define _3DSTATE_VF_STATISTICS 0x0B +#define _3DSTATE_DRAWING_RECTANGLE 0x00 +#define _3DSTATE_CONSTANT_COLOR 0x01 +#define _3DSTATE_SAMPLER_PALETTE_LOAD 0x02 +#define _3DSTATE_CHROMA_KEY 0x04 +#define _3DSTATE_DEPTH_BUFFER 0x05 +#define _3DSTATE_POLY_STIPPLE_OFFSET 0x06 +#define _3DSTATE_POLY_STIPPLE_PATTERN 0x07 +#define _3DSTATE_LINE_STIPPLE 0x08 +#define _3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP 0x09 +#define _3DCONTROL 0x00 + +#define PIPE_CONTROL_NOWRITE 0x00 +#define PIPE_CONTROL_WRITEIMMEDIATE 0x01 +#define PIPE_CONTROL_WRITEDEPTH 0x02 +#define PIPE_CONTROL_WRITETIMESTAMP 0x03 + +#define PIPE_CONTROL_GTTWRITE_PROCESS_LOCAL 0x00 +#define PIPE_CONTROL_GTTWRITE_GLOBAL 0x01 + +#define _3DPRIM_POINTLIST 0x01 +#define _3DPRIM_LINELIST 0x02 +#define _3DPRIM_LINESTRIP 0x03 +#define _3DPRIM_TRILIST 0x04 +#define _3DPRIM_TRISTRIP 0x05 +#define _3DPRIM_TRIFAN 0x06 +#define _3DPRIM_QUADLIST 0x07 +#define _3DPRIM_QUADSTRIP 0x08 +#define _3DPRIM_LINELIST_ADJ 0x09 +#define _3DPRIM_LINESTRIP_ADJ 0x0A +#define _3DPRIM_TRILIST_ADJ 0x0B +#define _3DPRIM_TRISTRIP_ADJ 0x0C +#define _3DPRIM_TRISTRIP_REVERSE 0x0D +#define _3DPRIM_POLYGON 0x0E +#define _3DPRIM_RECTLIST 0x0F +#define _3DPRIM_LINELOOP 0x10 +#define _3DPRIM_POINTLIST_BF 0x11 +#define _3DPRIM_LINESTRIP_CONT 0x12 +#define _3DPRIM_LINESTRIP_BF 0x13 +#define _3DPRIM_LINESTRIP_CONT_BF 0x14 +#define _3DPRIM_TRIFAN_NOSTIPPLE 0x15 + +#define _3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL 0 +#define _3DPRIM_VERTEXBUFFER_ACCESS_RANDOM 1 + +#define BRW_ANISORATIO_2 0 +#define BRW_ANISORATIO_4 1 +#define BRW_ANISORATIO_6 2 +#define BRW_ANISORATIO_8 3 +#define BRW_ANISORATIO_10 4 +#define BRW_ANISORATIO_12 5 +#define BRW_ANISORATIO_14 6 +#define BRW_ANISORATIO_16 7 + +#define BRW_BLENDFACTOR_ONE 0x1 +#define BRW_BLENDFACTOR_SRC_COLOR 0x2 +#define BRW_BLENDFACTOR_SRC_ALPHA 0x3 +#define BRW_BLENDFACTOR_DST_ALPHA 0x4 +#define BRW_BLENDFACTOR_DST_COLOR 0x5 +#define BRW_BLENDFACTOR_SRC_ALPHA_SATURATE 0x6 +#define BRW_BLENDFACTOR_CONST_COLOR 0x7 +#define BRW_BLENDFACTOR_CONST_ALPHA 0x8 +#define BRW_BLENDFACTOR_SRC1_COLOR 0x9 +#define BRW_BLENDFACTOR_SRC1_ALPHA 0x0A +#define BRW_BLENDFACTOR_ZERO 0x11 +#define BRW_BLENDFACTOR_INV_SRC_COLOR 0x12 +#define BRW_BLENDFACTOR_INV_SRC_ALPHA 0x13 +#define BRW_BLENDFACTOR_INV_DST_ALPHA 0x14 +#define BRW_BLENDFACTOR_INV_DST_COLOR 0x15 +#define BRW_BLENDFACTOR_INV_CONST_COLOR 0x17 +#define BRW_BLENDFACTOR_INV_CONST_ALPHA 0x18 +#define BRW_BLENDFACTOR_INV_SRC1_COLOR 0x19 +#define BRW_BLENDFACTOR_INV_SRC1_ALPHA 0x1A + +#define BRW_BLENDFUNCTION_ADD 0 +#define BRW_BLENDFUNCTION_SUBTRACT 1 +#define BRW_BLENDFUNCTION_REVERSE_SUBTRACT 2 +#define BRW_BLENDFUNCTION_MIN 3 +#define BRW_BLENDFUNCTION_MAX 4 + +#define BRW_ALPHATEST_FORMAT_UNORM8 0 +#define BRW_ALPHATEST_FORMAT_FLOAT32 1 + +#define BRW_CHROMAKEY_KILL_ON_ANY_MATCH 0 +#define BRW_CHROMAKEY_REPLACE_BLACK 1 + +#define BRW_CLIP_API_OGL 0 +#define BRW_CLIP_API_DX 1 + +#define BRW_CLIPMODE_NORMAL 0 +#define BRW_CLIPMODE_CLIP_ALL 1 +#define BRW_CLIPMODE_CLIP_NON_REJECTED 2 +#define BRW_CLIPMODE_REJECT_ALL 3 +#define BRW_CLIPMODE_ACCEPT_ALL 4 +#define BRW_CLIPMODE_KERNEL_CLIP 5 + +#define BRW_CLIP_NDCSPACE 0 +#define BRW_CLIP_SCREENSPACE 1 + +#define BRW_COMPAREFUNCTION_ALWAYS 0 +#define BRW_COMPAREFUNCTION_NEVER 1 +#define BRW_COMPAREFUNCTION_LESS 2 +#define BRW_COMPAREFUNCTION_EQUAL 3 +#define BRW_COMPAREFUNCTION_LEQUAL 4 +#define BRW_COMPAREFUNCTION_GREATER 5 +#define BRW_COMPAREFUNCTION_NOTEQUAL 6 +#define BRW_COMPAREFUNCTION_GEQUAL 7 + +#define BRW_COVERAGE_PIXELS_HALF 0 +#define BRW_COVERAGE_PIXELS_1 1 +#define BRW_COVERAGE_PIXELS_2 2 +#define BRW_COVERAGE_PIXELS_4 3 + +#define BRW_CULLMODE_BOTH 0 +#define BRW_CULLMODE_NONE 1 +#define BRW_CULLMODE_FRONT 2 +#define BRW_CULLMODE_BACK 3 + +#define BRW_DEFAULTCOLOR_R8G8B8A8_UNORM 0 +#define BRW_DEFAULTCOLOR_R32G32B32A32_FLOAT 1 + +#define BRW_DEPTHFORMAT_D32_FLOAT_S8X24_UINT 0 +#define BRW_DEPTHFORMAT_D32_FLOAT 1 +#define BRW_DEPTHFORMAT_D24_UNORM_S8_UINT 2 +#define BRW_DEPTHFORMAT_D16_UNORM 5 + +#define BRW_FLOATING_POINT_IEEE_754 0 +#define BRW_FLOATING_POINT_NON_IEEE_754 1 + +#define BRW_FRONTWINDING_CW 0 +#define BRW_FRONTWINDING_CCW 1 + +#define BRW_SPRITE_POINT_ENABLE 16 + +#define BRW_INDEX_BYTE 0 +#define BRW_INDEX_WORD 1 +#define BRW_INDEX_DWORD 2 + +#define BRW_LOGICOPFUNCTION_CLEAR 0 +#define BRW_LOGICOPFUNCTION_NOR 1 +#define BRW_LOGICOPFUNCTION_AND_INVERTED 2 +#define BRW_LOGICOPFUNCTION_COPY_INVERTED 3 +#define BRW_LOGICOPFUNCTION_AND_REVERSE 4 +#define BRW_LOGICOPFUNCTION_INVERT 5 +#define BRW_LOGICOPFUNCTION_XOR 6 +#define BRW_LOGICOPFUNCTION_NAND 7 +#define BRW_LOGICOPFUNCTION_AND 8 +#define BRW_LOGICOPFUNCTION_EQUIV 9 +#define BRW_LOGICOPFUNCTION_NOOP 10 +#define BRW_LOGICOPFUNCTION_OR_INVERTED 11 +#define BRW_LOGICOPFUNCTION_COPY 12 +#define BRW_LOGICOPFUNCTION_OR_REVERSE 13 +#define BRW_LOGICOPFUNCTION_OR 14 +#define BRW_LOGICOPFUNCTION_SET 15 + +#define BRW_MAPFILTER_NEAREST 0x0 +#define BRW_MAPFILTER_LINEAR 0x1 +#define BRW_MAPFILTER_ANISOTROPIC 0x2 + +#define BRW_MIPFILTER_NONE 0 +#define BRW_MIPFILTER_NEAREST 1 +#define BRW_MIPFILTER_LINEAR 3 + +#define BRW_POLYGON_FRONT_FACING 0 +#define BRW_POLYGON_BACK_FACING 1 + +#define BRW_PREFILTER_ALWAYS 0x0 +#define BRW_PREFILTER_NEVER 0x1 +#define BRW_PREFILTER_LESS 0x2 +#define BRW_PREFILTER_EQUAL 0x3 +#define BRW_PREFILTER_LEQUAL 0x4 +#define BRW_PREFILTER_GREATER 0x5 +#define BRW_PREFILTER_NOTEQUAL 0x6 +#define BRW_PREFILTER_GEQUAL 0x7 + +#define BRW_PROVOKING_VERTEX_0 0 +#define BRW_PROVOKING_VERTEX_1 1 +#define BRW_PROVOKING_VERTEX_2 2 + +#define BRW_RASTRULE_UPPER_LEFT 0 +#define BRW_RASTRULE_UPPER_RIGHT 1 +/* These are listed as "Reserved, but not seen as useful" + * in Intel documentation (page 212, "Point Rasterization Rule", + * section 7.4 "SF Pipeline State Summary", of document + * "Intel® 965 Express Chipset Family and Intel® G35 Express + * Chipset Graphics Controller Programmer's Reference Manual, + * Volume 2: 3D/Media", Revision 1.0b as of January 2008, + * available at + * http://intellinuxgraphics.org/documentation.html + * at the time of this writing). + * + * These appear to be supported on at least some + * i965-family devices, and the BRW_RASTRULE_LOWER_RIGHT + * is useful when using OpenGL to render to a FBO + * (which has the pixel coordinate Y orientation inverted + * with respect to the normal OpenGL pixel coordinate system). + */ +#define BRW_RASTRULE_LOWER_LEFT 2 +#define BRW_RASTRULE_LOWER_RIGHT 3 + +#define BRW_RENDERTARGET_CLAMPRANGE_UNORM 0 +#define BRW_RENDERTARGET_CLAMPRANGE_SNORM 1 +#define BRW_RENDERTARGET_CLAMPRANGE_FORMAT 2 + +#define BRW_STENCILOP_KEEP 0 +#define BRW_STENCILOP_ZERO 1 +#define BRW_STENCILOP_REPLACE 2 +#define BRW_STENCILOP_INCRSAT 3 +#define BRW_STENCILOP_DECRSAT 4 +#define BRW_STENCILOP_INCR 5 +#define BRW_STENCILOP_DECR 6 +#define BRW_STENCILOP_INVERT 7 + +#define BRW_SURFACE_MIPMAPLAYOUT_BELOW 0 +#define BRW_SURFACE_MIPMAPLAYOUT_RIGHT 1 + +#define BRW_SURFACEFORMAT_R32G32B32A32_FLOAT 0x000 +#define BRW_SURFACEFORMAT_R32G32B32A32_SINT 0x001 +#define BRW_SURFACEFORMAT_R32G32B32A32_UINT 0x002 +#define BRW_SURFACEFORMAT_R32G32B32A32_UNORM 0x003 +#define BRW_SURFACEFORMAT_R32G32B32A32_SNORM 0x004 +#define BRW_SURFACEFORMAT_R64G64_FLOAT 0x005 +#define BRW_SURFACEFORMAT_R32G32B32X32_FLOAT 0x006 +#define BRW_SURFACEFORMAT_R32G32B32A32_SSCALED 0x007 +#define BRW_SURFACEFORMAT_R32G32B32A32_USCALED 0x008 +#define BRW_SURFACEFORMAT_R32G32B32_FLOAT 0x040 +#define BRW_SURFACEFORMAT_R32G32B32_SINT 0x041 +#define BRW_SURFACEFORMAT_R32G32B32_UINT 0x042 +#define BRW_SURFACEFORMAT_R32G32B32_UNORM 0x043 +#define BRW_SURFACEFORMAT_R32G32B32_SNORM 0x044 +#define BRW_SURFACEFORMAT_R32G32B32_SSCALED 0x045 +#define BRW_SURFACEFORMAT_R32G32B32_USCALED 0x046 +#define BRW_SURFACEFORMAT_R16G16B16A16_UNORM 0x080 +#define BRW_SURFACEFORMAT_R16G16B16A16_SNORM 0x081 +#define BRW_SURFACEFORMAT_R16G16B16A16_SINT 0x082 +#define BRW_SURFACEFORMAT_R16G16B16A16_UINT 0x083 +#define BRW_SURFACEFORMAT_R16G16B16A16_FLOAT 0x084 +#define BRW_SURFACEFORMAT_R32G32_FLOAT 0x085 +#define BRW_SURFACEFORMAT_R32G32_SINT 0x086 +#define BRW_SURFACEFORMAT_R32G32_UINT 0x087 +#define BRW_SURFACEFORMAT_R32_FLOAT_X8X24_TYPELESS 0x088 +#define BRW_SURFACEFORMAT_X32_TYPELESS_G8X24_UINT 0x089 +#define BRW_SURFACEFORMAT_L32A32_FLOAT 0x08A +#define BRW_SURFACEFORMAT_R32G32_UNORM 0x08B +#define BRW_SURFACEFORMAT_R32G32_SNORM 0x08C +#define BRW_SURFACEFORMAT_R64_FLOAT 0x08D +#define BRW_SURFACEFORMAT_R16G16B16X16_UNORM 0x08E +#define BRW_SURFACEFORMAT_R16G16B16X16_FLOAT 0x08F +#define BRW_SURFACEFORMAT_A32X32_FLOAT 0x090 +#define BRW_SURFACEFORMAT_L32X32_FLOAT 0x091 +#define BRW_SURFACEFORMAT_I32X32_FLOAT 0x092 +#define BRW_SURFACEFORMAT_R16G16B16A16_SSCALED 0x093 +#define BRW_SURFACEFORMAT_R16G16B16A16_USCALED 0x094 +#define BRW_SURFACEFORMAT_R32G32_SSCALED 0x095 +#define BRW_SURFACEFORMAT_R32G32_USCALED 0x096 +#define BRW_SURFACEFORMAT_B8G8R8A8_UNORM 0x0C0 +#define BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB 0x0C1 +#define BRW_SURFACEFORMAT_R10G10B10A2_UNORM 0x0C2 +#define BRW_SURFACEFORMAT_R10G10B10A2_UNORM_SRGB 0x0C3 +#define BRW_SURFACEFORMAT_R10G10B10A2_UINT 0x0C4 +#define BRW_SURFACEFORMAT_R10G10B10_SNORM_A2_UNORM 0x0C5 +#define BRW_SURFACEFORMAT_R8G8B8A8_UNORM 0x0C7 +#define BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB 0x0C8 +#define BRW_SURFACEFORMAT_R8G8B8A8_SNORM 0x0C9 +#define BRW_SURFACEFORMAT_R8G8B8A8_SINT 0x0CA +#define BRW_SURFACEFORMAT_R8G8B8A8_UINT 0x0CB +#define BRW_SURFACEFORMAT_R16G16_UNORM 0x0CC +#define BRW_SURFACEFORMAT_R16G16_SNORM 0x0CD +#define BRW_SURFACEFORMAT_R16G16_SINT 0x0CE +#define BRW_SURFACEFORMAT_R16G16_UINT 0x0CF +#define BRW_SURFACEFORMAT_R16G16_FLOAT 0x0D0 +#define BRW_SURFACEFORMAT_B10G10R10A2_UNORM 0x0D1 +#define BRW_SURFACEFORMAT_B10G10R10A2_UNORM_SRGB 0x0D2 +#define BRW_SURFACEFORMAT_R11G11B10_FLOAT 0x0D3 +#define BRW_SURFACEFORMAT_R32_SINT 0x0D6 +#define BRW_SURFACEFORMAT_R32_UINT 0x0D7 +#define BRW_SURFACEFORMAT_R32_FLOAT 0x0D8 +#define BRW_SURFACEFORMAT_R24_UNORM_X8_TYPELESS 0x0D9 +#define BRW_SURFACEFORMAT_X24_TYPELESS_G8_UINT 0x0DA +#define BRW_SURFACEFORMAT_L16A16_UNORM 0x0DF +#define BRW_SURFACEFORMAT_I24X8_UNORM 0x0E0 +#define BRW_SURFACEFORMAT_L24X8_UNORM 0x0E1 +#define BRW_SURFACEFORMAT_A24X8_UNORM 0x0E2 +#define BRW_SURFACEFORMAT_I32_FLOAT 0x0E3 +#define BRW_SURFACEFORMAT_L32_FLOAT 0x0E4 +#define BRW_SURFACEFORMAT_A32_FLOAT 0x0E5 +#define BRW_SURFACEFORMAT_B8G8R8X8_UNORM 0x0E9 +#define BRW_SURFACEFORMAT_B8G8R8X8_UNORM_SRGB 0x0EA +#define BRW_SURFACEFORMAT_R8G8B8X8_UNORM 0x0EB +#define BRW_SURFACEFORMAT_R8G8B8X8_UNORM_SRGB 0x0EC +#define BRW_SURFACEFORMAT_R9G9B9E5_SHAREDEXP 0x0ED +#define BRW_SURFACEFORMAT_B10G10R10X2_UNORM 0x0EE +#define BRW_SURFACEFORMAT_L16A16_FLOAT 0x0F0 +#define BRW_SURFACEFORMAT_R32_UNORM 0x0F1 +#define BRW_SURFACEFORMAT_R32_SNORM 0x0F2 +#define BRW_SURFACEFORMAT_R10G10B10X2_USCALED 0x0F3 +#define BRW_SURFACEFORMAT_R8G8B8A8_SSCALED 0x0F4 +#define BRW_SURFACEFORMAT_R8G8B8A8_USCALED 0x0F5 +#define BRW_SURFACEFORMAT_R16G16_SSCALED 0x0F6 +#define BRW_SURFACEFORMAT_R16G16_USCALED 0x0F7 +#define BRW_SURFACEFORMAT_R32_SSCALED 0x0F8 +#define BRW_SURFACEFORMAT_R32_USCALED 0x0F9 +#define BRW_SURFACEFORMAT_B5G6R5_UNORM 0x100 +#define BRW_SURFACEFORMAT_B5G6R5_UNORM_SRGB 0x101 +#define BRW_SURFACEFORMAT_B5G5R5A1_UNORM 0x102 +#define BRW_SURFACEFORMAT_B5G5R5A1_UNORM_SRGB 0x103 +#define BRW_SURFACEFORMAT_B4G4R4A4_UNORM 0x104 +#define BRW_SURFACEFORMAT_B4G4R4A4_UNORM_SRGB 0x105 +#define BRW_SURFACEFORMAT_R8G8_UNORM 0x106 +#define BRW_SURFACEFORMAT_R8G8_SNORM 0x107 +#define BRW_SURFACEFORMAT_R8G8_SINT 0x108 +#define BRW_SURFACEFORMAT_R8G8_UINT 0x109 +#define BRW_SURFACEFORMAT_R16_UNORM 0x10A +#define BRW_SURFACEFORMAT_R16_SNORM 0x10B +#define BRW_SURFACEFORMAT_R16_SINT 0x10C +#define BRW_SURFACEFORMAT_R16_UINT 0x10D +#define BRW_SURFACEFORMAT_R16_FLOAT 0x10E +#define BRW_SURFACEFORMAT_I16_UNORM 0x111 +#define BRW_SURFACEFORMAT_L16_UNORM 0x112 +#define BRW_SURFACEFORMAT_A16_UNORM 0x113 +#define BRW_SURFACEFORMAT_L8A8_UNORM 0x114 +#define BRW_SURFACEFORMAT_I16_FLOAT 0x115 +#define BRW_SURFACEFORMAT_L16_FLOAT 0x116 +#define BRW_SURFACEFORMAT_A16_FLOAT 0x117 +#define BRW_SURFACEFORMAT_L8A8_UNORM_SRGB 0x118 +#define BRW_SURFACEFORMAT_R5G5_SNORM_B6_UNORM 0x119 +#define BRW_SURFACEFORMAT_B5G5R5X1_UNORM 0x11A +#define BRW_SURFACEFORMAT_B5G5R5X1_UNORM_SRGB 0x11B +#define BRW_SURFACEFORMAT_R8G8_SSCALED 0x11C +#define BRW_SURFACEFORMAT_R8G8_USCALED 0x11D +#define BRW_SURFACEFORMAT_R16_SSCALED 0x11E +#define BRW_SURFACEFORMAT_R16_USCALED 0x11F +#define BRW_SURFACEFORMAT_R8_UNORM 0x140 +#define BRW_SURFACEFORMAT_R8_SNORM 0x141 +#define BRW_SURFACEFORMAT_R8_SINT 0x142 +#define BRW_SURFACEFORMAT_R8_UINT 0x143 +#define BRW_SURFACEFORMAT_A8_UNORM 0x144 +#define BRW_SURFACEFORMAT_I8_UNORM 0x145 +#define BRW_SURFACEFORMAT_L8_UNORM 0x146 +#define BRW_SURFACEFORMAT_P4A4_UNORM 0x147 +#define BRW_SURFACEFORMAT_A4P4_UNORM 0x148 +#define BRW_SURFACEFORMAT_R8_SSCALED 0x149 +#define BRW_SURFACEFORMAT_R8_USCALED 0x14A +#define BRW_SURFACEFORMAT_L8_UNORM_SRGB 0x14C +#define BRW_SURFACEFORMAT_R1_UINT 0x181 +#define BRW_SURFACEFORMAT_YCRCB_NORMAL 0x182 +#define BRW_SURFACEFORMAT_YCRCB_SWAPUVY 0x183 +#define BRW_SURFACEFORMAT_BC1_UNORM 0x186 +#define BRW_SURFACEFORMAT_BC2_UNORM 0x187 +#define BRW_SURFACEFORMAT_BC3_UNORM 0x188 +#define BRW_SURFACEFORMAT_BC4_UNORM 0x189 +#define BRW_SURFACEFORMAT_BC5_UNORM 0x18A +#define BRW_SURFACEFORMAT_BC1_UNORM_SRGB 0x18B +#define BRW_SURFACEFORMAT_BC2_UNORM_SRGB 0x18C +#define BRW_SURFACEFORMAT_BC3_UNORM_SRGB 0x18D +#define BRW_SURFACEFORMAT_MONO8 0x18E +#define BRW_SURFACEFORMAT_YCRCB_SWAPUV 0x18F +#define BRW_SURFACEFORMAT_YCRCB_SWAPY 0x190 +#define BRW_SURFACEFORMAT_DXT1_RGB 0x191 +#define BRW_SURFACEFORMAT_FXT1 0x192 +#define BRW_SURFACEFORMAT_R8G8B8_UNORM 0x193 +#define BRW_SURFACEFORMAT_R8G8B8_SNORM 0x194 +#define BRW_SURFACEFORMAT_R8G8B8_SSCALED 0x195 +#define BRW_SURFACEFORMAT_R8G8B8_USCALED 0x196 +#define BRW_SURFACEFORMAT_R64G64B64A64_FLOAT 0x197 +#define BRW_SURFACEFORMAT_R64G64B64_FLOAT 0x198 +#define BRW_SURFACEFORMAT_BC4_SNORM 0x199 +#define BRW_SURFACEFORMAT_BC5_SNORM 0x19A +#define BRW_SURFACEFORMAT_R16G16B16_UNORM 0x19C +#define BRW_SURFACEFORMAT_R16G16B16_SNORM 0x19D +#define BRW_SURFACEFORMAT_R16G16B16_SSCALED 0x19E +#define BRW_SURFACEFORMAT_R16G16B16_USCALED 0x19F + +#define BRW_SURFACERETURNFORMAT_FLOAT32 0 +#define BRW_SURFACERETURNFORMAT_S1 1 + +#define BRW_SURFACE_1D 0 +#define BRW_SURFACE_2D 1 +#define BRW_SURFACE_3D 2 +#define BRW_SURFACE_CUBE 3 +#define BRW_SURFACE_BUFFER 4 +#define BRW_SURFACE_NULL 7 + +#define BRW_TEXCOORDMODE_WRAP 0 +#define BRW_TEXCOORDMODE_MIRROR 1 +#define BRW_TEXCOORDMODE_CLAMP 2 +#define BRW_TEXCOORDMODE_CUBE 3 +#define BRW_TEXCOORDMODE_CLAMP_BORDER 4 +#define BRW_TEXCOORDMODE_MIRROR_ONCE 5 + +#define BRW_THREAD_PRIORITY_NORMAL 0 +#define BRW_THREAD_PRIORITY_HIGH 1 + +#define BRW_TILEWALK_XMAJOR 0 +#define BRW_TILEWALK_YMAJOR 1 + +#define BRW_VERTEX_SUBPIXEL_PRECISION_8BITS 0 +#define BRW_VERTEX_SUBPIXEL_PRECISION_4BITS 1 + +/* Execution Unit (EU) defines + */ + +#define BRW_ALIGN_1 0 +#define BRW_ALIGN_16 1 + +#define BRW_ADDRESS_DIRECT 0 +#define BRW_ADDRESS_REGISTER_INDIRECT_REGISTER 1 + +#define BRW_CHANNEL_X 0 +#define BRW_CHANNEL_Y 1 +#define BRW_CHANNEL_Z 2 +#define BRW_CHANNEL_W 3 + +#define BRW_COMPRESSION_NONE 0 +#define BRW_COMPRESSION_2NDHALF 1 +#define BRW_COMPRESSION_COMPRESSED 2 + +#define BRW_CONDITIONAL_NONE 0 +#define BRW_CONDITIONAL_Z 1 +#define BRW_CONDITIONAL_NZ 2 +#define BRW_CONDITIONAL_EQ 1 /* Z */ +#define BRW_CONDITIONAL_NEQ 2 /* NZ */ +#define BRW_CONDITIONAL_G 3 +#define BRW_CONDITIONAL_GE 4 +#define BRW_CONDITIONAL_L 5 +#define BRW_CONDITIONAL_LE 6 +#define BRW_CONDITIONAL_R 7 +#define BRW_CONDITIONAL_O 8 +#define BRW_CONDITIONAL_U 9 + +#define BRW_DEBUG_NONE 0 +#define BRW_DEBUG_BREAKPOINT 1 + +#define BRW_DEPENDENCY_NORMAL 0 +#define BRW_DEPENDENCY_NOTCLEARED 1 +#define BRW_DEPENDENCY_NOTCHECKED 2 +#define BRW_DEPENDENCY_DISABLE 3 + +#define BRW_EXECUTE_1 0 +#define BRW_EXECUTE_2 1 +#define BRW_EXECUTE_4 2 +#define BRW_EXECUTE_8 3 +#define BRW_EXECUTE_16 4 +#define BRW_EXECUTE_32 5 + +#define BRW_HORIZONTAL_STRIDE_0 0 +#define BRW_HORIZONTAL_STRIDE_1 1 +#define BRW_HORIZONTAL_STRIDE_2 2 +#define BRW_HORIZONTAL_STRIDE_4 3 + +#define BRW_INSTRUCTION_NORMAL 0 +#define BRW_INSTRUCTION_SATURATE 1 + +#define BRW_MASK_ENABLE 0 +#define BRW_MASK_DISABLE 1 + +#define BRW_OPCODE_MOV 1 +#define BRW_OPCODE_SEL 2 +#define BRW_OPCODE_NOT 4 +#define BRW_OPCODE_AND 5 +#define BRW_OPCODE_OR 6 +#define BRW_OPCODE_XOR 7 +#define BRW_OPCODE_SHR 8 +#define BRW_OPCODE_SHL 9 +#define BRW_OPCODE_RSR 10 +#define BRW_OPCODE_RSL 11 +#define BRW_OPCODE_ASR 12 +#define BRW_OPCODE_CMP 16 +#define BRW_OPCODE_CMPN 17 +#define BRW_OPCODE_JMPI 32 +#define BRW_OPCODE_IF 34 +#define BRW_OPCODE_IFF 35 +#define BRW_OPCODE_ELSE 36 +#define BRW_OPCODE_ENDIF 37 +#define BRW_OPCODE_DO 38 +#define BRW_OPCODE_WHILE 39 +#define BRW_OPCODE_BREAK 40 +#define BRW_OPCODE_CONTINUE 41 +#define BRW_OPCODE_HALT 42 +#define BRW_OPCODE_MSAVE 44 +#define BRW_OPCODE_MRESTORE 45 +#define BRW_OPCODE_PUSH 46 +#define BRW_OPCODE_POP 47 +#define BRW_OPCODE_WAIT 48 +#define BRW_OPCODE_SEND 49 +#define BRW_OPCODE_ADD 64 +#define BRW_OPCODE_MUL 65 +#define BRW_OPCODE_AVG 66 +#define BRW_OPCODE_FRC 67 +#define BRW_OPCODE_RNDU 68 +#define BRW_OPCODE_RNDD 69 +#define BRW_OPCODE_RNDE 70 +#define BRW_OPCODE_RNDZ 71 +#define BRW_OPCODE_MAC 72 +#define BRW_OPCODE_MACH 73 +#define BRW_OPCODE_LZD 74 +#define BRW_OPCODE_SAD2 80 +#define BRW_OPCODE_SADA2 81 +#define BRW_OPCODE_DP4 84 +#define BRW_OPCODE_DPH 85 +#define BRW_OPCODE_DP3 86 +#define BRW_OPCODE_DP2 87 +#define BRW_OPCODE_DPA2 88 +#define BRW_OPCODE_LINE 89 +#define BRW_OPCODE_NOP 126 + +#define BRW_PREDICATE_NONE 0 +#define BRW_PREDICATE_NORMAL 1 +#define BRW_PREDICATE_ALIGN1_ANYV 2 +#define BRW_PREDICATE_ALIGN1_ALLV 3 +#define BRW_PREDICATE_ALIGN1_ANY2H 4 +#define BRW_PREDICATE_ALIGN1_ALL2H 5 +#define BRW_PREDICATE_ALIGN1_ANY4H 6 +#define BRW_PREDICATE_ALIGN1_ALL4H 7 +#define BRW_PREDICATE_ALIGN1_ANY8H 8 +#define BRW_PREDICATE_ALIGN1_ALL8H 9 +#define BRW_PREDICATE_ALIGN1_ANY16H 10 +#define BRW_PREDICATE_ALIGN1_ALL16H 11 +#define BRW_PREDICATE_ALIGN16_REPLICATE_X 2 +#define BRW_PREDICATE_ALIGN16_REPLICATE_Y 3 +#define BRW_PREDICATE_ALIGN16_REPLICATE_Z 4 +#define BRW_PREDICATE_ALIGN16_REPLICATE_W 5 +#define BRW_PREDICATE_ALIGN16_ANY4H 6 +#define BRW_PREDICATE_ALIGN16_ALL4H 7 + +#define BRW_ARCHITECTURE_REGISTER_FILE 0 +#define BRW_GENERAL_REGISTER_FILE 1 +#define BRW_MESSAGE_REGISTER_FILE 2 +#define BRW_IMMEDIATE_VALUE 3 + +#define BRW_REGISTER_TYPE_UD 0 +#define BRW_REGISTER_TYPE_D 1 +#define BRW_REGISTER_TYPE_UW 2 +#define BRW_REGISTER_TYPE_W 3 +#define BRW_REGISTER_TYPE_UB 4 +#define BRW_REGISTER_TYPE_B 5 +#define BRW_REGISTER_TYPE_VF 5 /* packed float vector, immediates only? */ +#define BRW_REGISTER_TYPE_HF 6 +#define BRW_REGISTER_TYPE_V 6 /* packed int vector, immediates only, uword dest only */ +#define BRW_REGISTER_TYPE_F 7 + +#define BRW_ARF_NULL 0x00 +#define BRW_ARF_ADDRESS 0x10 +#define BRW_ARF_ACCUMULATOR 0x20 +#define BRW_ARF_FLAG 0x30 +#define BRW_ARF_MASK 0x40 +#define BRW_ARF_MASK_STACK 0x50 +#define BRW_ARF_MASK_STACK_DEPTH 0x60 +#define BRW_ARF_STATE 0x70 +#define BRW_ARF_CONTROL 0x80 +#define BRW_ARF_NOTIFICATION_COUNT 0x90 +#define BRW_ARF_IP 0xA0 + +#define BRW_AMASK 0 +#define BRW_IMASK 1 +#define BRW_LMASK 2 +#define BRW_CMASK 3 + + + +#define BRW_THREAD_NORMAL 0 +#define BRW_THREAD_ATOMIC 1 +#define BRW_THREAD_SWITCH 2 + +#define BRW_VERTICAL_STRIDE_0 0 +#define BRW_VERTICAL_STRIDE_1 1 +#define BRW_VERTICAL_STRIDE_2 2 +#define BRW_VERTICAL_STRIDE_4 3 +#define BRW_VERTICAL_STRIDE_8 4 +#define BRW_VERTICAL_STRIDE_16 5 +#define BRW_VERTICAL_STRIDE_32 6 +#define BRW_VERTICAL_STRIDE_64 7 +#define BRW_VERTICAL_STRIDE_128 8 +#define BRW_VERTICAL_STRIDE_256 9 +#define BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL 0xF + +#define BRW_WIDTH_1 0 +#define BRW_WIDTH_2 1 +#define BRW_WIDTH_4 2 +#define BRW_WIDTH_8 3 +#define BRW_WIDTH_16 4 + +#define BRW_STATELESS_BUFFER_BOUNDARY_1K 0 +#define BRW_STATELESS_BUFFER_BOUNDARY_2K 1 +#define BRW_STATELESS_BUFFER_BOUNDARY_4K 2 +#define BRW_STATELESS_BUFFER_BOUNDARY_8K 3 +#define BRW_STATELESS_BUFFER_BOUNDARY_16K 4 +#define BRW_STATELESS_BUFFER_BOUNDARY_32K 5 +#define BRW_STATELESS_BUFFER_BOUNDARY_64K 6 +#define BRW_STATELESS_BUFFER_BOUNDARY_128K 7 +#define BRW_STATELESS_BUFFER_BOUNDARY_256K 8 +#define BRW_STATELESS_BUFFER_BOUNDARY_512K 9 +#define BRW_STATELESS_BUFFER_BOUNDARY_1M 10 +#define BRW_STATELESS_BUFFER_BOUNDARY_2M 11 + +#define BRW_POLYGON_FACING_FRONT 0 +#define BRW_POLYGON_FACING_BACK 1 + +#define BRW_MESSAGE_TARGET_NULL 0 +#define BRW_MESSAGE_TARGET_MATH 1 +#define BRW_MESSAGE_TARGET_SAMPLER 2 +#define BRW_MESSAGE_TARGET_GATEWAY 3 +#define BRW_MESSAGE_TARGET_DATAPORT_READ 4 +#define BRW_MESSAGE_TARGET_DATAPORT_WRITE 5 +#define BRW_MESSAGE_TARGET_URB 6 +#define BRW_MESSAGE_TARGET_THREAD_SPAWNER 7 + +#define BRW_SAMPLER_RETURN_FORMAT_FLOAT32 0 +#define BRW_SAMPLER_RETURN_FORMAT_UINT32 2 +#define BRW_SAMPLER_RETURN_FORMAT_SINT32 3 + +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE 0 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE 0 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS 0 +#define BRW_SAMPLER_MESSAGE_SIMD8_KILLPIX 1 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD 1 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD 1 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS 2 +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS 2 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE 0 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE 2 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO 2 +#define BRW_SAMPLER_MESSAGE_SIMD8_RESINFO 2 +#define BRW_SAMPLER_MESSAGE_SIMD16_RESINFO 2 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_LD 3 +#define BRW_SAMPLER_MESSAGE_SIMD8_LD 3 +#define BRW_SAMPLER_MESSAGE_SIMD16_LD 3 + +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG 0 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_IGDNG 0 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_IGDNG 0 +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG 1 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_BIAS_IGDNG 1 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS_IGDNG 1 +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_IGDNG 2 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_IGDNG 2 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD_IGDNG 2 +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG 3 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE_IGDNG 3 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE_IGDNG 3 + +/* for IGDNG only */ +#define BRW_SAMPLER_SIMD_MODE_SIMD4X2 0 +#define BRW_SAMPLER_SIMD_MODE_SIMD8 1 +#define BRW_SAMPLER_SIMD_MODE_SIMD16 2 +#define BRW_SAMPLER_SIMD_MODE_SIMD32_64 3 + +#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW 0 +#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH 1 +#define BRW_DATAPORT_OWORD_BLOCK_2_OWORDS 2 +#define BRW_DATAPORT_OWORD_BLOCK_4_OWORDS 3 +#define BRW_DATAPORT_OWORD_BLOCK_8_OWORDS 4 + +#define BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD 0 +#define BRW_DATAPORT_OWORD_DUAL_BLOCK_4OWORDS 2 + +#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS 2 +#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS 3 + +#define BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ 0 +#define BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 1 +#define BRW_DATAPORT_READ_MESSAGE_DWORD_BLOCK_READ 2 +#define BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 3 + +#define BRW_DATAPORT_READ_TARGET_DATA_CACHE 0 +#define BRW_DATAPORT_READ_TARGET_RENDER_CACHE 1 +#define BRW_DATAPORT_READ_TARGET_SAMPLER_CACHE 2 + +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE 0 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED 1 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01 2 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23 3 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01 4 + +#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE 0 +#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE 1 +#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_BLOCK_WRITE 2 +#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE 3 +#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE 4 +#define BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VERTEX_BUFFER_WRITE 5 +#define BRW_DATAPORT_WRITE_MESSAGE_FLUSH_RENDER_CACHE 7 + +#define BRW_MATH_FUNCTION_INV 1 +#define BRW_MATH_FUNCTION_LOG 2 +#define BRW_MATH_FUNCTION_EXP 3 +#define BRW_MATH_FUNCTION_SQRT 4 +#define BRW_MATH_FUNCTION_RSQ 5 +#define BRW_MATH_FUNCTION_SIN 6 /* was 7 */ +#define BRW_MATH_FUNCTION_COS 7 /* was 8 */ +#define BRW_MATH_FUNCTION_SINCOS 8 /* was 6 */ +#define BRW_MATH_FUNCTION_TAN 9 +#define BRW_MATH_FUNCTION_POW 10 +#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER 11 +#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT 12 +#define BRW_MATH_FUNCTION_INT_DIV_REMAINDER 13 + +#define BRW_MATH_INTEGER_UNSIGNED 0 +#define BRW_MATH_INTEGER_SIGNED 1 + +#define BRW_MATH_PRECISION_FULL 0 +#define BRW_MATH_PRECISION_PARTIAL 1 + +#define BRW_MATH_SATURATE_NONE 0 +#define BRW_MATH_SATURATE_SATURATE 1 + +#define BRW_MATH_DATA_VECTOR 0 +#define BRW_MATH_DATA_SCALAR 1 + +#define BRW_URB_OPCODE_WRITE 0 + +#define BRW_URB_SWIZZLE_NONE 0 +#define BRW_URB_SWIZZLE_INTERLEAVE 1 +#define BRW_URB_SWIZZLE_TRANSPOSE 2 + +#define BRW_SCRATCH_SPACE_SIZE_1K 0 +#define BRW_SCRATCH_SPACE_SIZE_2K 1 +#define BRW_SCRATCH_SPACE_SIZE_4K 2 +#define BRW_SCRATCH_SPACE_SIZE_8K 3 +#define BRW_SCRATCH_SPACE_SIZE_16K 4 +#define BRW_SCRATCH_SPACE_SIZE_32K 5 +#define BRW_SCRATCH_SPACE_SIZE_64K 6 +#define BRW_SCRATCH_SPACE_SIZE_128K 7 +#define BRW_SCRATCH_SPACE_SIZE_256K 8 +#define BRW_SCRATCH_SPACE_SIZE_512K 9 +#define BRW_SCRATCH_SPACE_SIZE_1M 10 +#define BRW_SCRATCH_SPACE_SIZE_2M 11 + + + + +#define CMD_URB_FENCE 0x6000 +#define CMD_CS_URB_STATE 0x6001 +#define CMD_CONST_BUFFER 0x6002 + +#define CMD_STATE_BASE_ADDRESS 0x6101 +#define CMD_STATE_INSN_POINTER 0x6102 +#define CMD_PIPELINE_SELECT_965 0x6104 +#define CMD_PIPELINE_SELECT_GM45 0x6904 + +#define CMD_PIPELINED_STATE_POINTERS 0x7800 +#define CMD_BINDING_TABLE_PTRS 0x7801 + +#define CMD_VERTEX_BUFFER 0x7808 +# define BRW_VB0_INDEX_SHIFT 27 +# define BRW_VB0_ACCESS_VERTEXDATA (0 << 26) +# define BRW_VB0_ACCESS_INSTANCEDATA (1 << 26) +# define BRW_VB0_PITCH_SHIFT 0 + +#define CMD_VERTEX_ELEMENT 0x7809 +# define BRW_VE0_INDEX_SHIFT 27 +# define BRW_VE0_FORMAT_SHIFT 16 +# define BRW_VE0_VALID (1 << 26) +# define BRW_VE0_SRC_OFFSET_SHIFT 0 +# define BRW_VE1_COMPONENT_NOSTORE 0 +# define BRW_VE1_COMPONENT_STORE_SRC 1 +# define BRW_VE1_COMPONENT_STORE_0 2 +# define BRW_VE1_COMPONENT_STORE_1_FLT 3 +# define BRW_VE1_COMPONENT_STORE_1_INT 4 +# define BRW_VE1_COMPONENT_STORE_VID 5 +# define BRW_VE1_COMPONENT_STORE_IID 6 +# define BRW_VE1_COMPONENT_STORE_PID 7 +# define BRW_VE1_COMPONENT_0_SHIFT 28 +# define BRW_VE1_COMPONENT_1_SHIFT 24 +# define BRW_VE1_COMPONENT_2_SHIFT 20 +# define BRW_VE1_COMPONENT_3_SHIFT 16 +# define BRW_VE1_DST_OFFSET_SHIFT 0 + +#define CMD_INDEX_BUFFER 0x780a +#define CMD_VF_STATISTICS_965 0x780b +#define CMD_VF_STATISTICS_GM45 0x680b + +#define CMD_DRAW_RECT 0x7900 +#define CMD_BLEND_CONSTANT_COLOR 0x7901 +#define CMD_CHROMA_KEY 0x7904 +#define CMD_DEPTH_BUFFER 0x7905 +#define CMD_POLY_STIPPLE_OFFSET 0x7906 +#define CMD_POLY_STIPPLE_PATTERN 0x7907 +#define CMD_LINE_STIPPLE_PATTERN 0x7908 +#define CMD_GLOBAL_DEPTH_OFFSET_CLAMP 0x7909 +#define CMD_AA_LINE_PARAMETERS 0x790a + +#define CMD_PIPE_CONTROL 0x7a00 + +#define CMD_3D_PRIM 0x7b00 + +#define CMD_MI_FLUSH 0x0200 + + +/* Various values from the R0 vertex header: + */ +#define R02_PRIM_END 0x1 +#define R02_PRIM_START 0x2 + +#include "intel_chipset.h" + +#define BRW_IS_G4X(brw) (IS_G4X((brw)->intel.intelScreen->deviceID)) +#define BRW_IS_IGDNG(brw) (IS_IGDNG((brw)->intel.intelScreen->deviceID)) +#define BRW_IS_965(brw) (!(BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))) +#define CMD_PIPELINE_SELECT(brw) ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965) +#define CMD_VF_STATISTICS(brw) ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965) +#define URB_SIZES(brw) (BRW_IS_IGDNG(brw) ? 1024 : \ + (BRW_IS_G4X(brw) ? 384 : 256)) /* 512 bit units */ + +#endif diff --git a/src/gallium/drivers/i965/brw_disasm.c b/src/gallium/drivers/i965/brw_disasm.c new file mode 100644 index 00000000000..9fef230507f --- /dev/null +++ b/src/gallium/drivers/i965/brw_disasm.c @@ -0,0 +1,903 @@ +/* + * Copyright © 2008 Keith Packard + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that copyright + * notice and this permission notice appear in supporting documentation, and + * that the name of the copyright holders not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. The copyright holders make no representations + * about the suitability of this software for any purpose. It is provided "as + * is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THIS SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <getopt.h> +#include <unistd.h> +#include <stdarg.h> + +#include "main/mtypes.h" + +#include "brw_context.h" +#include "brw_defines.h" + +struct { + char *name; + int nsrc; + int ndst; +} opcode[128] = { + [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 }, + + [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 }, + + [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 }, + + [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 }, + [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 1, .ndst = 01 }, + [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 }, + [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 }, +}; + +char *conditional_modifier[16] = { + [BRW_CONDITIONAL_NONE] = "", + [BRW_CONDITIONAL_Z] = ".e", + [BRW_CONDITIONAL_NZ] = ".ne", + [BRW_CONDITIONAL_G] = ".g", + [BRW_CONDITIONAL_GE] = ".ge", + [BRW_CONDITIONAL_L] = ".l", + [BRW_CONDITIONAL_LE] = ".le", + [BRW_CONDITIONAL_R] = ".r", + [BRW_CONDITIONAL_O] = ".o", + [BRW_CONDITIONAL_U] = ".u", +}; + +char *negate[2] = { + [0] = "", + [1] = "-", +}; + +char *_abs[2] = { + [0] = "", + [1] = "(abs)", +}; + +char *vert_stride[16] = { + [0] = "0", + [1] = "1", + [2] = "2", + [3] = "4", + [4] = "8", + [5] = "16", + [6] = "32", + [15] = "VxH", +}; + +char *width[8] = { + [0] = "1", + [1] = "2", + [2] = "4", + [3] = "8", + [4] = "16", +}; + +char *horiz_stride[4] = { + [0] = "0", + [1] = "1", + [2] = "2", + [3] = "4" +}; + +char *chan_sel[4] = { + [0] = "x", + [1] = "y", + [2] = "z", + [3] = "w", +}; + +char *dest_condmod[16] = { +}; + +char *debug_ctrl[2] = { + [0] = "", + [1] = ".breakpoint" +}; + +char *saturate[2] = { + [0] = "", + [1] = ".sat" +}; + +char *exec_size[8] = { + [0] = "1", + [1] = "2", + [2] = "4", + [3] = "8", + [4] = "16", + [5] = "32" +}; + +char *pred_inv[2] = { + [0] = "+", + [1] = "-" +}; + +char *pred_ctrl_align16[16] = { + [1] = "", + [2] = ".x", + [3] = ".y", + [4] = ".z", + [5] = ".w", + [6] = ".any4h", + [7] = ".all4h", +}; + +char *pred_ctrl_align1[16] = { + [1] = "", + [2] = ".anyv", + [3] = ".allv", + [4] = ".any2h", + [5] = ".all2h", + [6] = ".any4h", + [7] = ".all4h", + [8] = ".any8h", + [9] = ".all8h", + [10] = ".any16h", + [11] = ".all16h", +}; + +char *thread_ctrl[4] = { + [0] = "", + [2] = "switch" +}; + +char *compr_ctrl[4] = { + [0] = "", + [1] = "sechalf", + [2] = "compr", +}; + +char *dep_ctrl[4] = { + [0] = "", + [1] = "NoDDClr", + [2] = "NoDDChk", + [3] = "NoDDClr,NoDDChk", +}; + +char *mask_ctrl[4] = { + [0] = "", + [1] = "nomask", +}; + +char *access_mode[2] = { + [0] = "align1", + [1] = "align16", +}; + +char *reg_encoding[8] = { + [0] = "UD", + [1] = "D", + [2] = "UW", + [3] = "W", + [4] = "UB", + [5] = "B", + [7] = "F" +}; + +char *imm_encoding[8] = { + [0] = "UD", + [1] = "D", + [2] = "UW", + [3] = "W", + [5] = "VF", + [5] = "V", + [7] = "F" +}; + +char *reg_file[4] = { + [0] = "A", + [1] = "g", + [2] = "m", + [3] = "imm", +}; + +char *writemask[16] = { + [0x0] = ".", + [0x1] = ".x", + [0x2] = ".y", + [0x3] = ".xy", + [0x4] = ".z", + [0x5] = ".xz", + [0x6] = ".yz", + [0x7] = ".xyz", + [0x8] = ".w", + [0x9] = ".xw", + [0xa] = ".yw", + [0xb] = ".xyw", + [0xc] = ".zw", + [0xd] = ".xzw", + [0xe] = ".yzw", + [0xf] = "", +}; + +char *end_of_thread[2] = { + [0] = "", + [1] = "EOT" +}; + +char *target_function[16] = { + [BRW_MESSAGE_TARGET_NULL] = "null", + [BRW_MESSAGE_TARGET_MATH] = "math", + [BRW_MESSAGE_TARGET_SAMPLER] = "sampler", + [BRW_MESSAGE_TARGET_GATEWAY] = "gateway", + [BRW_MESSAGE_TARGET_DATAPORT_READ] = "read", + [BRW_MESSAGE_TARGET_DATAPORT_WRITE] = "write", + [BRW_MESSAGE_TARGET_URB] = "urb", + [BRW_MESSAGE_TARGET_THREAD_SPAWNER] = "thread_spawner" +}; + +char *math_function[16] = { + [BRW_MATH_FUNCTION_INV] = "inv", + [BRW_MATH_FUNCTION_LOG] = "log", + [BRW_MATH_FUNCTION_EXP] = "exp", + [BRW_MATH_FUNCTION_SQRT] = "sqrt", + [BRW_MATH_FUNCTION_RSQ] = "rsq", + [BRW_MATH_FUNCTION_SIN] = "sin", + [BRW_MATH_FUNCTION_COS] = "cos", + [BRW_MATH_FUNCTION_SINCOS] = "sincos", + [BRW_MATH_FUNCTION_TAN] = "tan", + [BRW_MATH_FUNCTION_POW] = "pow", + [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER] = "intdivmod", + [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT] = "intmod", + [BRW_MATH_FUNCTION_INT_DIV_REMAINDER] = "intdiv", +}; + +char *math_saturate[2] = { + [0] = "", + [1] = "sat" +}; + +char *math_signed[2] = { + [0] = "", + [1] = "signed" +}; + +char *math_scalar[2] = { + [0] = "", + [1] = "scalar" +}; + +char *math_precision[2] = { + [0] = "", + [1] = "partial_precision" +}; + +char *urb_swizzle[4] = { + [BRW_URB_SWIZZLE_NONE] = "", + [BRW_URB_SWIZZLE_INTERLEAVE] = "interleave", + [BRW_URB_SWIZZLE_TRANSPOSE] = "transpose", +}; + +char *urb_allocate[2] = { + [0] = "", + [1] = "allocate" +}; + +char *urb_used[2] = { + [0] = "", + [1] = "used" +}; + +char *urb_complete[2] = { + [0] = "", + [1] = "complete" +}; + +char *sampler_target_format[4] = { + [0] = "F", + [2] = "UD", + [3] = "D" +}; + + +static int column; + +static int string (FILE *file, char *string) +{ + fputs (string, file); + column += strlen (string); + return 0; +} + +static int format (FILE *f, char *format, ...) +{ + char buf[1024]; + va_list args; + va_start (args, format); + + vsnprintf (buf, sizeof (buf) - 1, format, args); + string (f, buf); + return 0; +} + +static int newline (FILE *f) +{ + putc ('\n', f); + column = 0; + return 0; +} + +static int pad (FILE *f, int c) +{ + do + string (f, " "); + while (column < c); + return 0; +} + +static int control (FILE *file, char *name, char *ctrl[], GLuint id, int *space) +{ + if (!ctrl[id]) { + fprintf (file, "*** invalid %s value %d ", + name, id); + return 1; + } + if (ctrl[id][0]) + { + if (space && *space) + string (file, " "); + string (file, ctrl[id]); + if (space) + *space = 1; + } + return 0; +} + +static int print_opcode (FILE *file, int id) +{ + if (!opcode[id].name) { + format (file, "*** invalid opcode value %d ", id); + return 1; + } + string (file, opcode[id].name); + return 0; +} + +static int reg (FILE *file, GLuint _reg_file, GLuint _reg_nr) +{ + int err = 0; + if (_reg_file == BRW_ARCHITECTURE_REGISTER_FILE) { + switch (_reg_nr & 0xf0) { + case BRW_ARF_NULL: + string (file, "null"); + return -1; + case BRW_ARF_ADDRESS: + format (file, "a%d", _reg_nr & 0x0f); + break; + case BRW_ARF_ACCUMULATOR: + format (file, "acc%d", _reg_nr & 0x0f); + break; + case BRW_ARF_MASK: + format (file, "mask%d", _reg_nr & 0x0f); + break; + case BRW_ARF_MASK_STACK: + format (file, "msd%d", _reg_nr & 0x0f); + break; + case BRW_ARF_STATE: + format (file, "sr%d", _reg_nr & 0x0f); + break; + case BRW_ARF_CONTROL: + format (file, "cr%d", _reg_nr & 0x0f); + break; + case BRW_ARF_NOTIFICATION_COUNT: + format (file, "n%d", _reg_nr & 0x0f); + break; + case BRW_ARF_IP: + string (file, "ip"); + return -1; + break; + default: + format (file, "ARF%d", _reg_nr); + break; + } + } else { + err |= control (file, "src reg file", reg_file, _reg_file, NULL); + format (file, "%d", _reg_nr); + } + return err; +} + +static int dest (FILE *file, struct brw_instruction *inst) +{ + int err = 0; + + if (inst->header.access_mode == BRW_ALIGN_1) + { + if (inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT) + { + err |= reg (file, inst->bits1.da1.dest_reg_file, inst->bits1.da1.dest_reg_nr); + if (err == -1) + return 0; + if (inst->bits1.da1.dest_subreg_nr) + format (file, ".%d", inst->bits1.da1.dest_subreg_nr); + format (file, "<%d>", inst->bits1.da1.dest_horiz_stride); + err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da1.dest_reg_type, NULL); + } + else + { + string (file, "g[a0"); + if (inst->bits1.ia1.dest_subreg_nr) + format (file, ".%d", inst->bits1.ia1.dest_subreg_nr); + if (inst->bits1.ia1.dest_indirect_offset) + format (file, " %d", inst->bits1.ia1.dest_indirect_offset); + string (file, "]"); + format (file, "<%d>", inst->bits1.ia1.dest_horiz_stride); + err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.ia1.dest_reg_type, NULL); + } + } + else + { + if (inst->bits1.da16.dest_address_mode == BRW_ADDRESS_DIRECT) + { + err |= reg (file, inst->bits1.da16.dest_reg_file, inst->bits1.da16.dest_reg_nr); + if (err == -1) + return 0; + if (inst->bits1.da16.dest_subreg_nr) + format (file, ".%d", inst->bits1.da16.dest_subreg_nr); + string (file, "<1>"); + err |= control (file, "writemask", writemask, inst->bits1.da16.dest_writemask, NULL); + err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da16.dest_reg_type, NULL); + } + else + { + err = 1; + string (file, "Indirect align16 address mode not supported"); + } + } + + return 0; +} + +static int src_align1_region (FILE *file, + GLuint _vert_stride, GLuint _width, GLuint _horiz_stride) +{ + int err = 0; + string (file, "<"); + err |= control (file, "vert stride", vert_stride, _vert_stride, NULL); + string (file, ","); + err |= control (file, "width", width, _width, NULL); + string (file, ","); + err |= control (file, "horiz_stride", horiz_stride, _horiz_stride, NULL); + string (file, ">"); + return err; +} + +static int src_da1 (FILE *file, GLuint type, GLuint _reg_file, + GLuint _vert_stride, GLuint _width, GLuint _horiz_stride, + GLuint reg_num, GLuint sub_reg_num, GLuint __abs, GLuint _negate) +{ + int err = 0; + err |= control (file, "negate", negate, _negate, NULL); + err |= control (file, "abs", _abs, __abs, NULL); + + err |= reg (file, _reg_file, reg_num); + if (err == -1) + return 0; + if (sub_reg_num) + format (file, ".%d", sub_reg_num); + src_align1_region (file, _vert_stride, _width, _horiz_stride); + err |= control (file, "src reg encoding", reg_encoding, type, NULL); + return err; +} + +static int src_ia1 (FILE *file, + GLuint type, + GLuint _reg_file, + GLint _addr_imm, + GLuint _addr_subreg_nr, + GLuint _negate, + GLuint __abs, + GLuint _addr_mode, + GLuint _horiz_stride, + GLuint _width, + GLuint _vert_stride) +{ + int err = 0; + err |= control (file, "negate", negate, _negate, NULL); + err |= control (file, "abs", _abs, __abs, NULL); + + string (file, "g[a0"); + if (_addr_subreg_nr) + format (file, ".%d", _addr_subreg_nr); + if (_addr_imm) + format (file, " %d", _addr_imm); + string (file, "]"); + src_align1_region (file, _vert_stride, _width, _horiz_stride); + err |= control (file, "src reg encoding", reg_encoding, type, NULL); + return err; +} + +static int src_da16 (FILE *file, + GLuint _reg_type, + GLuint _reg_file, + GLuint _vert_stride, + GLuint _reg_nr, + GLuint _subreg_nr, + GLuint __abs, + GLuint _negate, + GLuint swz_x, + GLuint swz_y, + GLuint swz_z, + GLuint swz_w) +{ + int err = 0; + err |= control (file, "negate", negate, _negate, NULL); + err |= control (file, "abs", _abs, __abs, NULL); + + err |= reg (file, _reg_file, _reg_nr); + if (err == -1) + return 0; + if (_subreg_nr) + format (file, ".%d", _subreg_nr); + string (file, "<"); + err |= control (file, "vert stride", vert_stride, _vert_stride, NULL); + string (file, ",1,1>"); + err |= control (file, "src da16 reg type", reg_encoding, _reg_type, NULL); + /* + * Three kinds of swizzle display: + * identity - nothing printed + * 1->all - print the single channel + * 1->1 - print the mapping + */ + if (swz_x == BRW_CHANNEL_X && + swz_y == BRW_CHANNEL_Y && + swz_z == BRW_CHANNEL_Z && + swz_w == BRW_CHANNEL_W) + { + ; + } + else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w) + { + string (file, "."); + err |= control (file, "channel select", chan_sel, swz_x, NULL); + } + else + { + string (file, "."); + err |= control (file, "channel select", chan_sel, swz_x, NULL); + err |= control (file, "channel select", chan_sel, swz_y, NULL); + err |= control (file, "channel select", chan_sel, swz_z, NULL); + err |= control (file, "channel select", chan_sel, swz_w, NULL); + } + return err; +} + + +static int imm (FILE *file, GLuint type, struct brw_instruction *inst) { + switch (type) { + case BRW_REGISTER_TYPE_UD: + format (file, "0x%08xUD", inst->bits3.ud); + break; + case BRW_REGISTER_TYPE_D: + format (file, "%dD", inst->bits3.d); + break; + case BRW_REGISTER_TYPE_UW: + format (file, "0x%04xUW", (uint16_t) inst->bits3.ud); + break; + case BRW_REGISTER_TYPE_W: + format (file, "%dW", (int16_t) inst->bits3.d); + break; + case BRW_REGISTER_TYPE_UB: + format (file, "0x%02xUB", (int8_t) inst->bits3.ud); + break; + case BRW_REGISTER_TYPE_VF: + format (file, "Vector Float"); + break; + case BRW_REGISTER_TYPE_V: + format (file, "0x%08xV", inst->bits3.ud); + break; + case BRW_REGISTER_TYPE_F: + format (file, "%-gF", inst->bits3.f); + } + return 0; +} + +static int src0 (FILE *file, struct brw_instruction *inst) +{ + if (inst->bits1.da1.src0_reg_file == BRW_IMMEDIATE_VALUE) + return imm (file, inst->bits1.da1.src0_reg_type, + inst); + else if (inst->header.access_mode == BRW_ALIGN_1) + { + if (inst->bits2.da1.src0_address_mode == BRW_ADDRESS_DIRECT) + { + return src_da1 (file, + inst->bits1.da1.src0_reg_type, + inst->bits1.da1.src0_reg_file, + inst->bits2.da1.src0_vert_stride, + inst->bits2.da1.src0_width, + inst->bits2.da1.src0_horiz_stride, + inst->bits2.da1.src0_reg_nr, + inst->bits2.da1.src0_subreg_nr, + inst->bits2.da1.src0_abs, + inst->bits2.da1.src0_negate); + } + else + { + return src_ia1 (file, + inst->bits1.ia1.src0_reg_type, + inst->bits1.ia1.src0_reg_file, + inst->bits2.ia1.src0_indirect_offset, + inst->bits2.ia1.src0_subreg_nr, + inst->bits2.ia1.src0_negate, + inst->bits2.ia1.src0_abs, + inst->bits2.ia1.src0_address_mode, + inst->bits2.ia1.src0_horiz_stride, + inst->bits2.ia1.src0_width, + inst->bits2.ia1.src0_vert_stride); + } + } + else + { + if (inst->bits2.da16.src0_address_mode == BRW_ADDRESS_DIRECT) + { + return src_da16 (file, + inst->bits1.da16.src0_reg_type, + inst->bits1.da16.src0_reg_file, + inst->bits2.da16.src0_vert_stride, + inst->bits2.da16.src0_reg_nr, + inst->bits2.da16.src0_subreg_nr, + inst->bits2.da16.src0_abs, + inst->bits2.da16.src0_negate, + inst->bits2.da16.src0_swz_x, + inst->bits2.da16.src0_swz_y, + inst->bits2.da16.src0_swz_z, + inst->bits2.da16.src0_swz_w); + } + else + { + string (file, "Indirect align16 address mode not supported"); + return 1; + } + } +} + +static int src1 (FILE *file, struct brw_instruction *inst) +{ + if (inst->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE) + return imm (file, inst->bits1.da1.src1_reg_type, + inst); + else if (inst->header.access_mode == BRW_ALIGN_1) + { + if (inst->bits3.da1.src1_address_mode == BRW_ADDRESS_DIRECT) + { + return src_da1 (file, + inst->bits1.da1.src1_reg_type, + inst->bits1.da1.src1_reg_file, + inst->bits3.da1.src1_vert_stride, + inst->bits3.da1.src1_width, + inst->bits3.da1.src1_horiz_stride, + inst->bits3.da1.src1_reg_nr, + inst->bits3.da1.src1_subreg_nr, + inst->bits3.da1.src1_abs, + inst->bits3.da1.src1_negate); + } + else + { + return src_ia1 (file, + inst->bits1.ia1.src1_reg_type, + inst->bits1.ia1.src1_reg_file, + inst->bits3.ia1.src1_indirect_offset, + inst->bits3.ia1.src1_subreg_nr, + inst->bits3.ia1.src1_negate, + inst->bits3.ia1.src1_abs, + inst->bits3.ia1.src1_address_mode, + inst->bits3.ia1.src1_horiz_stride, + inst->bits3.ia1.src1_width, + inst->bits3.ia1.src1_vert_stride); + } + } + else + { + if (inst->bits3.da16.src1_address_mode == BRW_ADDRESS_DIRECT) + { + return src_da16 (file, + inst->bits1.da16.src1_reg_type, + inst->bits1.da16.src1_reg_file, + inst->bits3.da16.src1_vert_stride, + inst->bits3.da16.src1_reg_nr, + inst->bits3.da16.src1_subreg_nr, + inst->bits3.da16.src1_abs, + inst->bits3.da16.src1_negate, + inst->bits3.da16.src1_swz_x, + inst->bits3.da16.src1_swz_y, + inst->bits3.da16.src1_swz_z, + inst->bits3.da16.src1_swz_w); + } + else + { + string (file, "Indirect align16 address mode not supported"); + return 1; + } + } +} + +int brw_disasm (FILE *file, struct brw_instruction *inst) +{ + int err = 0; + int space = 0; + + if (inst->header.predicate_control) { + string (file, "("); + err |= control (file, "predicate inverse", pred_inv, inst->header.predicate_inverse, NULL); + string (file, "f0"); + if (inst->bits2.da1.flag_reg_nr) + format (file, ".%d", inst->bits2.da1.flag_reg_nr); + if (inst->header.access_mode == BRW_ALIGN_1) + err |= control (file, "predicate control align1", pred_ctrl_align1, + inst->header.predicate_control, NULL); + else + err |= control (file, "predicate control align16", pred_ctrl_align16, + inst->header.predicate_control, NULL); + string (file, ") "); + } + + err |= print_opcode (file, inst->header.opcode); + err |= control (file, "saturate", saturate, inst->header.saturate, NULL); + err |= control (file, "debug control", debug_ctrl, inst->header.debug_control, NULL); + + if (inst->header.opcode != BRW_OPCODE_SEND) + err |= control (file, "conditional modifier", conditional_modifier, + inst->header.destreg__conditionalmod, NULL); + + if (inst->header.opcode != BRW_OPCODE_NOP) { + string (file, "("); + err |= control (file, "execution size", exec_size, inst->header.execution_size, NULL); + string (file, ")"); + } + + if (inst->header.opcode == BRW_OPCODE_SEND) + format (file, " %d", inst->header.destreg__conditionalmod); + + if (opcode[inst->header.opcode].ndst > 0) { + pad (file, 16); + err |= dest (file, inst); + } + if (opcode[inst->header.opcode].nsrc > 0) { + pad (file, 32); + err |= src0 (file, inst); + } + if (opcode[inst->header.opcode].nsrc > 1) { + pad (file, 48); + err |= src1 (file, inst); + } + + if (inst->header.opcode == BRW_OPCODE_SEND) { + newline (file); + pad (file, 16); + space = 0; + err |= control (file, "target function", target_function, + inst->bits3.generic.msg_target, &space); + switch (inst->bits3.generic.msg_target) { + case BRW_MESSAGE_TARGET_MATH: + err |= control (file, "math function", math_function, + inst->bits3.math.function, &space); + err |= control (file, "math saturate", math_saturate, + inst->bits3.math.saturate, &space); + err |= control (file, "math signed", math_signed, + inst->bits3.math.int_type, &space); + err |= control (file, "math scalar", math_scalar, + inst->bits3.math.data_type, &space); + err |= control (file, "math precision", math_precision, + inst->bits3.math.precision, &space); + break; + case BRW_MESSAGE_TARGET_SAMPLER: + format (file, " (%d, %d, ", + inst->bits3.sampler.binding_table_index, + inst->bits3.sampler.sampler); + err |= control (file, "sampler target format", sampler_target_format, + inst->bits3.sampler.return_format, NULL); + string (file, ")"); + break; + case BRW_MESSAGE_TARGET_DATAPORT_WRITE: + format (file, " (%d, %d, %d, %d)", + inst->bits3.dp_write.binding_table_index, + (inst->bits3.dp_write.pixel_scoreboard_clear << 3) | + inst->bits3.dp_write.msg_control, + inst->bits3.dp_write.msg_type, + inst->bits3.dp_write.send_commit_msg); + break; + case BRW_MESSAGE_TARGET_URB: + format (file, " %d", inst->bits3.urb.offset); + space = 1; + err |= control (file, "urb swizzle", urb_swizzle, + inst->bits3.urb.swizzle_control, &space); + err |= control (file, "urb allocate", urb_allocate, + inst->bits3.urb.allocate, &space); + err |= control (file, "urb used", urb_used, + inst->bits3.urb.used, &space); + err |= control (file, "urb complete", urb_complete, + inst->bits3.urb.complete, &space); + break; + case BRW_MESSAGE_TARGET_THREAD_SPAWNER: + break; + default: + format (file, "unsupported target %d", inst->bits3.generic.msg_target); + break; + } + if (space) + string (file, " "); + format (file, "mlen %d", + inst->bits3.generic.msg_length); + format (file, " rlen %d", + inst->bits3.generic.response_length); + } + pad (file, 64); + if (inst->header.opcode != BRW_OPCODE_NOP) { + string (file, "{"); + space = 1; + err |= control(file, "access mode", access_mode, inst->header.access_mode, &space); + err |= control (file, "mask control", mask_ctrl, inst->header.mask_control, &space); + err |= control (file, "dependency control", dep_ctrl, inst->header.dependency_control, &space); + err |= control (file, "compression control", compr_ctrl, inst->header.compression_control, &space); + err |= control (file, "thread control", thread_ctrl, inst->header.thread_control, &space); + if (inst->header.opcode == BRW_OPCODE_SEND) + err |= control (file, "end of thread", end_of_thread, + inst->bits3.generic.end_of_thread, &space); + if (space) + string (file, " "); + string (file, "}"); + } + string (file, ";"); + newline (file); + return err; +} diff --git a/src/gallium/drivers/i965/brw_draw.c b/src/gallium/drivers/i965/brw_draw.c new file mode 100644 index 00000000000..44bb7bd5882 --- /dev/null +++ b/src/gallium/drivers/i965/brw_draw.c @@ -0,0 +1,493 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "main/glheader.h" +#include "main/context.h" +#include "main/state.h" +#include "main/enums.h" +#include "tnl/tnl.h" +#include "vbo/vbo_context.h" +#include "swrast/swrast.h" +#include "swrast_setup/swrast_setup.h" + +#include "brw_draw.h" +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_state.h" +#include "brw_fallback.h" + +#include "intel_batchbuffer.h" +#include "intel_buffer_objects.h" + +#define FILE_DEBUG_FLAG DEBUG_BATCH + +static GLuint prim_to_hw_prim[GL_POLYGON+1] = { + _3DPRIM_POINTLIST, + _3DPRIM_LINELIST, + _3DPRIM_LINELOOP, + _3DPRIM_LINESTRIP, + _3DPRIM_TRILIST, + _3DPRIM_TRISTRIP, + _3DPRIM_TRIFAN, + _3DPRIM_QUADLIST, + _3DPRIM_QUADSTRIP, + _3DPRIM_POLYGON +}; + + +static const GLenum reduced_prim[GL_POLYGON+1] = { + GL_POINTS, + GL_LINES, + GL_LINES, + GL_LINES, + GL_TRIANGLES, + GL_TRIANGLES, + GL_TRIANGLES, + GL_TRIANGLES, + GL_TRIANGLES, + GL_TRIANGLES +}; + + +/* When the primitive changes, set a state bit and re-validate. Not + * the nicest and would rather deal with this by having all the + * programs be immune to the active primitive (ie. cope with all + * possibilities). That may not be realistic however. + */ +static GLuint brw_set_prim(struct brw_context *brw, GLenum prim) +{ + GLcontext *ctx = &brw->intel.ctx; + + if (INTEL_DEBUG & DEBUG_PRIMS) + _mesa_printf("PRIM: %s\n", _mesa_lookup_enum_by_nr(prim)); + + /* Slight optimization to avoid the GS program when not needed: + */ + if (prim == GL_QUAD_STRIP && + ctx->Light.ShadeModel != GL_FLAT && + ctx->Polygon.FrontMode == GL_FILL && + ctx->Polygon.BackMode == GL_FILL) + prim = GL_TRIANGLE_STRIP; + + if (prim != brw->primitive) { + brw->primitive = prim; + brw->state.dirty.brw |= BRW_NEW_PRIMITIVE; + + if (reduced_prim[prim] != brw->intel.reduced_primitive) { + brw->intel.reduced_primitive = reduced_prim[prim]; + brw->state.dirty.brw |= BRW_NEW_REDUCED_PRIMITIVE; + } + } + + return prim_to_hw_prim[prim]; +} + + +static GLuint trim(GLenum prim, GLuint length) +{ + if (prim == GL_QUAD_STRIP) + return length > 3 ? (length - length % 2) : 0; + else if (prim == GL_QUADS) + return length - length % 4; + else + return length; +} + + +static void brw_emit_prim(struct brw_context *brw, + const struct _mesa_prim *prim, + uint32_t hw_prim) +{ + struct brw_3d_primitive prim_packet; + struct intel_context *intel = &brw->intel; + + if (INTEL_DEBUG & DEBUG_PRIMS) + _mesa_printf("PRIM: %s %d %d\n", _mesa_lookup_enum_by_nr(prim->mode), + prim->start, prim->count); + + prim_packet.header.opcode = CMD_3D_PRIM; + prim_packet.header.length = sizeof(prim_packet)/4 - 2; + prim_packet.header.pad = 0; + prim_packet.header.topology = hw_prim; + prim_packet.header.indexed = prim->indexed; + + prim_packet.verts_per_instance = trim(prim->mode, prim->count); + prim_packet.start_vert_location = prim->start; + if (prim->indexed) + prim_packet.start_vert_location += brw->ib.start_vertex_offset; + prim_packet.instance_count = 1; + prim_packet.start_instance_location = 0; + prim_packet.base_vert_location = prim->basevertex; + + /* Can't wrap here, since we rely on the validated state. */ + brw->no_batch_wrap = GL_TRUE; + + /* If we're set to always flush, do it before and after the primitive emit. + * We want to catch both missed flushes that hurt instruction/state cache + * and missed flushes of the render cache as it heads to other parts of + * the besides the draw code. + */ + if (intel->always_flush_cache) { + BEGIN_BATCH(1, IGNORE_CLIPRECTS); + OUT_BATCH(intel->vtbl.flush_cmd()); + ADVANCE_BATCH(); + } + if (prim_packet.verts_per_instance) { + intel_batchbuffer_data( brw->intel.batch, &prim_packet, + sizeof(prim_packet), LOOP_CLIPRECTS); + } + if (intel->always_flush_cache) { + BEGIN_BATCH(1, IGNORE_CLIPRECTS); + OUT_BATCH(intel->vtbl.flush_cmd()); + ADVANCE_BATCH(); + } + + brw->no_batch_wrap = GL_FALSE; +} + +static void brw_merge_inputs( struct brw_context *brw, + const struct gl_client_array *arrays[]) +{ + struct brw_vertex_info old = brw->vb.info; + GLuint i; + + for (i = 0; i < VERT_ATTRIB_MAX; i++) + dri_bo_unreference(brw->vb.inputs[i].bo); + + memset(&brw->vb.inputs, 0, sizeof(brw->vb.inputs)); + memset(&brw->vb.info, 0, sizeof(brw->vb.info)); + + for (i = 0; i < VERT_ATTRIB_MAX; i++) { + brw->vb.inputs[i].glarray = arrays[i]; + brw->vb.inputs[i].attrib = (gl_vert_attrib) i; + + if (arrays[i]->StrideB != 0) + brw->vb.info.sizes[i/16] |= (brw->vb.inputs[i].glarray->Size - 1) << + ((i%16) * 2); + } + + /* Raise statechanges if input sizes have changed. */ + if (memcmp(brw->vb.info.sizes, old.sizes, sizeof(old.sizes)) != 0) + brw->state.dirty.brw |= BRW_NEW_INPUT_DIMENSIONS; +} + +/* XXX: could split the primitive list to fallback only on the + * non-conformant primitives. + */ +static GLboolean check_fallbacks( struct brw_context *brw, + const struct _mesa_prim *prim, + GLuint nr_prims ) +{ + GLcontext *ctx = &brw->intel.ctx; + GLuint i; + + /* If we don't require strict OpenGL conformance, never + * use fallbacks. If we're forcing fallbacks, always + * use fallfacks. + */ + if (brw->intel.conformance_mode == 0) + return GL_FALSE; + + if (brw->intel.conformance_mode == 2) + return GL_TRUE; + + if (ctx->Polygon.SmoothFlag) { + for (i = 0; i < nr_prims; i++) + if (reduced_prim[prim[i].mode] == GL_TRIANGLES) + return GL_TRUE; + } + + /* BRW hardware will do AA lines, but they are non-conformant it + * seems. TBD whether we keep this fallback: + */ + if (ctx->Line.SmoothFlag) { + for (i = 0; i < nr_prims; i++) + if (reduced_prim[prim[i].mode] == GL_LINES) + return GL_TRUE; + } + + /* Stipple -- these fallbacks could be resolved with a little + * bit of work? + */ + if (ctx->Line.StippleFlag) { + for (i = 0; i < nr_prims; i++) { + /* GS doesn't get enough information to know when to reset + * the stipple counter?!? + */ + if (prim[i].mode == GL_LINE_LOOP || prim[i].mode == GL_LINE_STRIP) + return GL_TRUE; + + if (prim[i].mode == GL_POLYGON && + (ctx->Polygon.FrontMode == GL_LINE || + ctx->Polygon.BackMode == GL_LINE)) + return GL_TRUE; + } + } + + if (ctx->Point.SmoothFlag) { + for (i = 0; i < nr_prims; i++) + if (prim[i].mode == GL_POINTS) + return GL_TRUE; + } + + /* BRW hardware doesn't handle GL_CLAMP texturing correctly; + * brw_wm_sampler_state:translate_wrap_mode() treats GL_CLAMP + * as GL_CLAMP_TO_EDGE instead. If we're using GL_CLAMP, and + * we want strict conformance, force the fallback. + * Right now, we only do this for 2D textures. + */ + { + int u; + for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) { + struct gl_texture_unit *texUnit = &ctx->Texture.Unit[u]; + if (texUnit->Enabled) { + if (texUnit->Enabled & TEXTURE_1D_BIT) { + if (texUnit->CurrentTex[TEXTURE_1D_INDEX]->WrapS == GL_CLAMP) { + return GL_TRUE; + } + } + if (texUnit->Enabled & TEXTURE_2D_BIT) { + if (texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapS == GL_CLAMP || + texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapT == GL_CLAMP) { + return GL_TRUE; + } + } + if (texUnit->Enabled & TEXTURE_3D_BIT) { + if (texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapS == GL_CLAMP || + texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapT == GL_CLAMP || + texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapR == GL_CLAMP) { + return GL_TRUE; + } + } + } + } + } + + /* Nothing stopping us from the fast path now */ + return GL_FALSE; +} + +/* May fail if out of video memory for texture or vbo upload, or on + * fallback conditions. + */ +static GLboolean brw_try_draw_prims( GLcontext *ctx, + const struct gl_client_array *arrays[], + const struct _mesa_prim *prim, + GLuint nr_prims, + const struct _mesa_index_buffer *ib, + GLuint min_index, + GLuint max_index ) +{ + struct intel_context *intel = intel_context(ctx); + struct brw_context *brw = brw_context(ctx); + GLboolean retval = GL_FALSE; + GLboolean warn = GL_FALSE; + GLboolean first_time = GL_TRUE; + GLuint i; + + if (ctx->NewState) + _mesa_update_state( ctx ); + + /* We have to validate the textures *before* checking for fallbacks; + * otherwise, the software fallback won't be able to rely on the + * texture state, the firstLevel and lastLevel fields won't be + * set in the intel texture object (they'll both be 0), and the + * software fallback will segfault if it attempts to access any + * texture level other than level 0. + */ + brw_validate_textures( brw ); + + if (check_fallbacks(brw, prim, nr_prims)) + return GL_FALSE; + + /* Bind all inputs, derive varying and size information: + */ + brw_merge_inputs( brw, arrays ); + + brw->ib.ib = ib; + brw->state.dirty.brw |= BRW_NEW_INDICES; + + brw->vb.min_index = min_index; + brw->vb.max_index = max_index; + brw->state.dirty.brw |= BRW_NEW_VERTICES; + + /* Have to validate state quite late. Will rebuild tnl_program, + * which depends on varying information. + * + * Note this is where brw->vs->prog_data.inputs_read is calculated, + * so can't access it earlier. + */ + + LOCK_HARDWARE(intel); + + if (!intel->constant_cliprect && intel->driDrawable->numClipRects == 0) { + UNLOCK_HARDWARE(intel); + return GL_TRUE; + } + + for (i = 0; i < nr_prims; i++) { + uint32_t hw_prim; + + /* Flush the batch if it's approaching full, so that we don't wrap while + * we've got validated state that needs to be in the same batch as the + * primitives. This fraction is just a guess (minimal full state plus + * a primitive is around 512 bytes), and would be better if we had + * an upper bound of how much we might emit in a single + * brw_try_draw_prims(). + */ + intel_batchbuffer_require_space(intel->batch, intel->batch->size / 4, + LOOP_CLIPRECTS); + + hw_prim = brw_set_prim(brw, prim[i].mode); + + if (first_time || (brw->state.dirty.brw & BRW_NEW_PRIMITIVE)) { + first_time = GL_FALSE; + + brw_validate_state(brw); + + /* Various fallback checks: */ + if (brw->intel.Fallback) + goto out; + + /* Check that we can fit our state in with our existing batchbuffer, or + * flush otherwise. + */ + if (dri_bufmgr_check_aperture_space(brw->state.validated_bos, + brw->state.validated_bo_count)) { + static GLboolean warned; + intel_batchbuffer_flush(intel->batch); + + /* Validate the state after we flushed the batch (which would have + * changed the set of dirty state). If we still fail to + * check_aperture, warn of what's happening, but attempt to continue + * on since it may succeed anyway, and the user would probably rather + * see a failure and a warning than a fallback. + */ + brw_validate_state(brw); + if (!warned && + dri_bufmgr_check_aperture_space(brw->state.validated_bos, + brw->state.validated_bo_count)) { + warn = GL_TRUE; + warned = GL_TRUE; + } + } + + brw_upload_state(brw); + } + + brw_emit_prim(brw, &prim[i], hw_prim); + + retval = GL_TRUE; + } + + if (intel->always_flush_batch) + intel_batchbuffer_flush(intel->batch); + out: + UNLOCK_HARDWARE(intel); + + brw_state_cache_check_size(brw); + + if (warn) + fprintf(stderr, "i965: Single primitive emit potentially exceeded " + "available aperture space\n"); + + if (!retval) + DBG("%s failed\n", __FUNCTION__); + + return retval; +} + +void brw_draw_prims( GLcontext *ctx, + const struct gl_client_array *arrays[], + const struct _mesa_prim *prim, + GLuint nr_prims, + const struct _mesa_index_buffer *ib, + GLboolean index_bounds_valid, + GLuint min_index, + GLuint max_index ) +{ + GLboolean retval; + + if (!vbo_all_varyings_in_vbos(arrays)) { + if (!index_bounds_valid) + vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index); + + /* Decide if we want to rebase. If so we end up recursing once + * only into this function. + */ + if (min_index != 0) { + vbo_rebase_prims(ctx, arrays, + prim, nr_prims, + ib, min_index, max_index, + brw_draw_prims ); + return; + } + } + + /* Make a first attempt at drawing: + */ + retval = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index); + + /* Otherwise, we really are out of memory. Pass the drawing + * command to the software tnl module and which will in turn call + * swrast to do the drawing. + */ + if (!retval) { + _swsetup_Wakeup(ctx); + _tnl_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index); + } + +} + +void brw_draw_init( struct brw_context *brw ) +{ + GLcontext *ctx = &brw->intel.ctx; + struct vbo_context *vbo = vbo_context(ctx); + + /* Register our drawing function: + */ + vbo->draw_prims = brw_draw_prims; +} + +void brw_draw_destroy( struct brw_context *brw ) +{ + int i; + + if (brw->vb.upload.bo != NULL) { + dri_bo_unreference(brw->vb.upload.bo); + brw->vb.upload.bo = NULL; + } + + for (i = 0; i < VERT_ATTRIB_MAX; i++) { + dri_bo_unreference(brw->vb.inputs[i].bo); + brw->vb.inputs[i].bo = NULL; + } + + dri_bo_unreference(brw->ib.bo); + brw->ib.bo = NULL; +} diff --git a/src/gallium/drivers/i965/brw_draw.h b/src/gallium/drivers/i965/brw_draw.h new file mode 100644 index 00000000000..2a14db217fc --- /dev/null +++ b/src/gallium/drivers/i965/brw_draw.h @@ -0,0 +1,54 @@ + /************************************************************************** + * + * Copyright 2005 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef BRW_DRAW_H +#define BRW_DRAW_H + +#include "main/mtypes.h" /* for GLcontext... */ +#include "vbo/vbo.h" + +struct brw_context; + + +void brw_draw_prims( GLcontext *ctx, + const struct gl_client_array *arrays[], + const struct _mesa_prim *prims, + GLuint nr_prims, + const struct _mesa_index_buffer *ib, + GLboolean index_bounds_valid, + GLuint min_index, + GLuint max_index ); + +void brw_draw_init( struct brw_context *brw ); +void brw_draw_destroy( struct brw_context *brw ); + +/* brw_draw_current.c + */ +void brw_init_current_values(GLcontext *ctx, + struct gl_client_array *arrays); + +#endif diff --git a/src/gallium/drivers/i965/brw_draw_upload.c b/src/gallium/drivers/i965/brw_draw_upload.c new file mode 100644 index 00000000000..a3ff6c58d89 --- /dev/null +++ b/src/gallium/drivers/i965/brw_draw_upload.c @@ -0,0 +1,742 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "main/glheader.h" +#include "main/bufferobj.h" +#include "main/context.h" +#include "main/state.h" +#include "main/api_validate.h" +#include "main/enums.h" + +#include "brw_draw.h" +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_state.h" +#include "brw_fallback.h" + +#include "intel_batchbuffer.h" +#include "intel_buffer_objects.h" +#include "intel_tex.h" + +static GLuint double_types[5] = { + 0, + BRW_SURFACEFORMAT_R64_FLOAT, + BRW_SURFACEFORMAT_R64G64_FLOAT, + BRW_SURFACEFORMAT_R64G64B64_FLOAT, + BRW_SURFACEFORMAT_R64G64B64A64_FLOAT +}; + +static GLuint float_types[5] = { + 0, + BRW_SURFACEFORMAT_R32_FLOAT, + BRW_SURFACEFORMAT_R32G32_FLOAT, + BRW_SURFACEFORMAT_R32G32B32_FLOAT, + BRW_SURFACEFORMAT_R32G32B32A32_FLOAT +}; + +static GLuint uint_types_norm[5] = { + 0, + BRW_SURFACEFORMAT_R32_UNORM, + BRW_SURFACEFORMAT_R32G32_UNORM, + BRW_SURFACEFORMAT_R32G32B32_UNORM, + BRW_SURFACEFORMAT_R32G32B32A32_UNORM +}; + +static GLuint uint_types_scale[5] = { + 0, + BRW_SURFACEFORMAT_R32_USCALED, + BRW_SURFACEFORMAT_R32G32_USCALED, + BRW_SURFACEFORMAT_R32G32B32_USCALED, + BRW_SURFACEFORMAT_R32G32B32A32_USCALED +}; + +static GLuint int_types_norm[5] = { + 0, + BRW_SURFACEFORMAT_R32_SNORM, + BRW_SURFACEFORMAT_R32G32_SNORM, + BRW_SURFACEFORMAT_R32G32B32_SNORM, + BRW_SURFACEFORMAT_R32G32B32A32_SNORM +}; + +static GLuint int_types_scale[5] = { + 0, + BRW_SURFACEFORMAT_R32_SSCALED, + BRW_SURFACEFORMAT_R32G32_SSCALED, + BRW_SURFACEFORMAT_R32G32B32_SSCALED, + BRW_SURFACEFORMAT_R32G32B32A32_SSCALED +}; + +static GLuint ushort_types_norm[5] = { + 0, + BRW_SURFACEFORMAT_R16_UNORM, + BRW_SURFACEFORMAT_R16G16_UNORM, + BRW_SURFACEFORMAT_R16G16B16_UNORM, + BRW_SURFACEFORMAT_R16G16B16A16_UNORM +}; + +static GLuint ushort_types_scale[5] = { + 0, + BRW_SURFACEFORMAT_R16_USCALED, + BRW_SURFACEFORMAT_R16G16_USCALED, + BRW_SURFACEFORMAT_R16G16B16_USCALED, + BRW_SURFACEFORMAT_R16G16B16A16_USCALED +}; + +static GLuint short_types_norm[5] = { + 0, + BRW_SURFACEFORMAT_R16_SNORM, + BRW_SURFACEFORMAT_R16G16_SNORM, + BRW_SURFACEFORMAT_R16G16B16_SNORM, + BRW_SURFACEFORMAT_R16G16B16A16_SNORM +}; + +static GLuint short_types_scale[5] = { + 0, + BRW_SURFACEFORMAT_R16_SSCALED, + BRW_SURFACEFORMAT_R16G16_SSCALED, + BRW_SURFACEFORMAT_R16G16B16_SSCALED, + BRW_SURFACEFORMAT_R16G16B16A16_SSCALED +}; + +static GLuint ubyte_types_norm[5] = { + 0, + BRW_SURFACEFORMAT_R8_UNORM, + BRW_SURFACEFORMAT_R8G8_UNORM, + BRW_SURFACEFORMAT_R8G8B8_UNORM, + BRW_SURFACEFORMAT_R8G8B8A8_UNORM +}; + +static GLuint ubyte_types_scale[5] = { + 0, + BRW_SURFACEFORMAT_R8_USCALED, + BRW_SURFACEFORMAT_R8G8_USCALED, + BRW_SURFACEFORMAT_R8G8B8_USCALED, + BRW_SURFACEFORMAT_R8G8B8A8_USCALED +}; + +static GLuint byte_types_norm[5] = { + 0, + BRW_SURFACEFORMAT_R8_SNORM, + BRW_SURFACEFORMAT_R8G8_SNORM, + BRW_SURFACEFORMAT_R8G8B8_SNORM, + BRW_SURFACEFORMAT_R8G8B8A8_SNORM +}; + +static GLuint byte_types_scale[5] = { + 0, + BRW_SURFACEFORMAT_R8_SSCALED, + BRW_SURFACEFORMAT_R8G8_SSCALED, + BRW_SURFACEFORMAT_R8G8B8_SSCALED, + BRW_SURFACEFORMAT_R8G8B8A8_SSCALED +}; + + +/** + * Given vertex array type/size/format/normalized info, return + * the appopriate hardware surface type. + * Format will be GL_RGBA or possibly GL_BGRA for GLubyte[4] color arrays. + */ +static GLuint get_surface_type( GLenum type, GLuint size, + GLenum format, GLboolean normalized ) +{ + if (INTEL_DEBUG & DEBUG_VERTS) + _mesa_printf("type %s size %d normalized %d\n", + _mesa_lookup_enum_by_nr(type), size, normalized); + + if (normalized) { + switch (type) { + case GL_DOUBLE: return double_types[size]; + case GL_FLOAT: return float_types[size]; + case GL_INT: return int_types_norm[size]; + case GL_SHORT: return short_types_norm[size]; + case GL_BYTE: return byte_types_norm[size]; + case GL_UNSIGNED_INT: return uint_types_norm[size]; + case GL_UNSIGNED_SHORT: return ushort_types_norm[size]; + case GL_UNSIGNED_BYTE: + if (format == GL_BGRA) { + /* See GL_EXT_vertex_array_bgra */ + assert(size == 4); + return BRW_SURFACEFORMAT_B8G8R8A8_UNORM; + } + else { + return ubyte_types_norm[size]; + } + default: assert(0); return 0; + } + } + else { + assert(format == GL_RGBA); /* sanity check */ + switch (type) { + case GL_DOUBLE: return double_types[size]; + case GL_FLOAT: return float_types[size]; + case GL_INT: return int_types_scale[size]; + case GL_SHORT: return short_types_scale[size]; + case GL_BYTE: return byte_types_scale[size]; + case GL_UNSIGNED_INT: return uint_types_scale[size]; + case GL_UNSIGNED_SHORT: return ushort_types_scale[size]; + case GL_UNSIGNED_BYTE: return ubyte_types_scale[size]; + default: assert(0); return 0; + } + } +} + + +static GLuint get_size( GLenum type ) +{ + switch (type) { + case GL_DOUBLE: return sizeof(GLdouble); + case GL_FLOAT: return sizeof(GLfloat); + case GL_INT: return sizeof(GLint); + case GL_SHORT: return sizeof(GLshort); + case GL_BYTE: return sizeof(GLbyte); + case GL_UNSIGNED_INT: return sizeof(GLuint); + case GL_UNSIGNED_SHORT: return sizeof(GLushort); + case GL_UNSIGNED_BYTE: return sizeof(GLubyte); + default: return 0; + } +} + +static GLuint get_index_type(GLenum type) +{ + switch (type) { + case GL_UNSIGNED_BYTE: return BRW_INDEX_BYTE; + case GL_UNSIGNED_SHORT: return BRW_INDEX_WORD; + case GL_UNSIGNED_INT: return BRW_INDEX_DWORD; + default: assert(0); return 0; + } +} + +static void wrap_buffers( struct brw_context *brw, + GLuint size ) +{ + if (size < BRW_UPLOAD_INIT_SIZE) + size = BRW_UPLOAD_INIT_SIZE; + + brw->vb.upload.offset = 0; + + if (brw->vb.upload.bo != NULL) + dri_bo_unreference(brw->vb.upload.bo); + brw->vb.upload.bo = dri_bo_alloc(brw->intel.bufmgr, "temporary VBO", + size, 1); + + /* Set the internal VBO\ to no-backing-store. We only use them as a + * temporary within a brw_try_draw_prims while the lock is held. + */ + /* DON'T DO THIS AS IF WE HAVE TO RE-ORG MEMORY WE NEED SOMEWHERE WITH + FAKE TO PUSH THIS STUFF */ +// if (!brw->intel.ttm) +// dri_bo_fake_disable_backing_store(brw->vb.upload.bo, NULL, NULL); +} + +static void get_space( struct brw_context *brw, + GLuint size, + dri_bo **bo_return, + GLuint *offset_return ) +{ + size = ALIGN(size, 64); + + if (brw->vb.upload.bo == NULL || + brw->vb.upload.offset + size > brw->vb.upload.bo->size) { + wrap_buffers(brw, size); + } + + assert(*bo_return == NULL); + dri_bo_reference(brw->vb.upload.bo); + *bo_return = brw->vb.upload.bo; + *offset_return = brw->vb.upload.offset; + brw->vb.upload.offset += size; +} + +static void +copy_array_to_vbo_array( struct brw_context *brw, + struct brw_vertex_element *element, + GLuint dst_stride) +{ + struct intel_context *intel = &brw->intel; + GLuint size = element->count * dst_stride; + + get_space(brw, size, &element->bo, &element->offset); + + if (element->glarray->StrideB == 0) { + assert(element->count == 1); + element->stride = 0; + } else { + element->stride = dst_stride; + } + + if (dst_stride == element->glarray->StrideB) { + if (intel->intelScreen->kernel_exec_fencing) { + drm_intel_gem_bo_map_gtt(element->bo); + memcpy((char *)element->bo->virtual + element->offset, + element->glarray->Ptr, size); + drm_intel_gem_bo_unmap_gtt(element->bo); + } else { + dri_bo_subdata(element->bo, + element->offset, + size, + element->glarray->Ptr); + } + } else { + char *dest; + const unsigned char *src = element->glarray->Ptr; + int i; + + if (intel->intelScreen->kernel_exec_fencing) { + drm_intel_gem_bo_map_gtt(element->bo); + dest = element->bo->virtual; + dest += element->offset; + + for (i = 0; i < element->count; i++) { + memcpy(dest, src, dst_stride); + src += element->glarray->StrideB; + dest += dst_stride; + } + + drm_intel_gem_bo_unmap_gtt(element->bo); + } else { + void *data; + + data = _mesa_malloc(dst_stride * element->count); + dest = data; + for (i = 0; i < element->count; i++) { + memcpy(dest, src, dst_stride); + src += element->glarray->StrideB; + dest += dst_stride; + } + + dri_bo_subdata(element->bo, + element->offset, + size, + data); + + _mesa_free(data); + } + } +} + +static void brw_prepare_vertices(struct brw_context *brw) +{ + GLcontext *ctx = &brw->intel.ctx; + struct intel_context *intel = intel_context(ctx); + GLbitfield vs_inputs = brw->vs.prog_data->inputs_read; + GLuint i; + const unsigned char *ptr = NULL; + GLuint interleave = 0; + unsigned int min_index = brw->vb.min_index; + unsigned int max_index = brw->vb.max_index; + + struct brw_vertex_element *upload[VERT_ATTRIB_MAX]; + GLuint nr_uploads = 0; + + /* First build an array of pointers to ve's in vb.inputs_read + */ + if (0) + _mesa_printf("%s %d..%d\n", __FUNCTION__, min_index, max_index); + + /* Accumulate the list of enabled arrays. */ + brw->vb.nr_enabled = 0; + while (vs_inputs) { + GLuint i = _mesa_ffsll(vs_inputs) - 1; + struct brw_vertex_element *input = &brw->vb.inputs[i]; + + vs_inputs &= ~(1 << i); + brw->vb.enabled[brw->vb.nr_enabled++] = input; + } + + /* XXX: In the rare cases where this happens we fallback all + * the way to software rasterization, although a tnl fallback + * would be sufficient. I don't know of *any* real world + * cases with > 17 vertex attributes enabled, so it probably + * isn't an issue at this point. + */ + if (brw->vb.nr_enabled >= BRW_VEP_MAX) { + intel->Fallback = 1; + return; + } + + for (i = 0; i < brw->vb.nr_enabled; i++) { + struct brw_vertex_element *input = brw->vb.enabled[i]; + + input->element_size = get_size(input->glarray->Type) * input->glarray->Size; + + if (_mesa_is_bufferobj(input->glarray->BufferObj)) { + struct intel_buffer_object *intel_buffer = + intel_buffer_object(input->glarray->BufferObj); + + /* Named buffer object: Just reference its contents directly. */ + dri_bo_unreference(input->bo); + input->bo = intel_bufferobj_buffer(intel, intel_buffer, + INTEL_READ); + dri_bo_reference(input->bo); + input->offset = (unsigned long)input->glarray->Ptr; + input->stride = input->glarray->StrideB; + input->count = input->glarray->_MaxElement; + + /* This is a common place to reach if the user mistakenly supplies + * a pointer in place of a VBO offset. If we just let it go through, + * we may end up dereferencing a pointer beyond the bounds of the + * GTT. We would hope that the VBO's max_index would save us, but + * Mesa appears to hand us min/max values not clipped to the + * array object's _MaxElement, and _MaxElement frequently appears + * to be wrong anyway. + * + * The VBO spec allows application termination in this case, and it's + * probably a service to the poor programmer to do so rather than + * trying to just not render. + */ + assert(input->offset < input->bo->size); + } else { + input->count = input->glarray->StrideB ? max_index + 1 - min_index : 1; + if (input->bo != NULL) { + /* Already-uploaded vertex data is present from a previous + * prepare_vertices, but we had to re-validate state due to + * check_aperture failing and a new batch being produced. + */ + continue; + } + + /* Queue the buffer object up to be uploaded in the next pass, + * when we've decided if we're doing interleaved or not. + */ + if (input->attrib == VERT_ATTRIB_POS) { + /* Position array not properly enabled: + */ + if (input->glarray->StrideB == 0) { + intel->Fallback = 1; + return; + } + + interleave = input->glarray->StrideB; + ptr = input->glarray->Ptr; + } + else if (interleave != input->glarray->StrideB || + (const unsigned char *)input->glarray->Ptr - ptr < 0 || + (const unsigned char *)input->glarray->Ptr - ptr > interleave) + { + interleave = 0; + } + + upload[nr_uploads++] = input; + + /* We rebase drawing to start at element zero only when + * varyings are not in vbos, which means we can end up + * uploading non-varying arrays (stride != 0) when min_index + * is zero. This doesn't matter as the amount to upload is + * the same for these arrays whether the draw call is rebased + * or not - we just have to upload the one element. + */ + assert(min_index == 0 || input->glarray->StrideB == 0); + } + } + + /* Handle any arrays to be uploaded. */ + if (nr_uploads > 1 && interleave && interleave <= 256) { + /* All uploads are interleaved, so upload the arrays together as + * interleaved. First, upload the contents and set up upload[0]. + */ + copy_array_to_vbo_array(brw, upload[0], interleave); + + for (i = 1; i < nr_uploads; i++) { + /* Then, just point upload[i] at upload[0]'s buffer. */ + upload[i]->stride = interleave; + upload[i]->offset = upload[0]->offset + + ((const unsigned char *)upload[i]->glarray->Ptr - ptr); + upload[i]->bo = upload[0]->bo; + dri_bo_reference(upload[i]->bo); + } + } + else { + /* Upload non-interleaved arrays */ + for (i = 0; i < nr_uploads; i++) { + copy_array_to_vbo_array(brw, upload[i], upload[i]->element_size); + } + } + + brw_prepare_query_begin(brw); + + for (i = 0; i < brw->vb.nr_enabled; i++) { + struct brw_vertex_element *input = brw->vb.enabled[i]; + + brw_add_validated_bo(brw, input->bo); + } +} + +static void brw_emit_vertices(struct brw_context *brw) +{ + GLcontext *ctx = &brw->intel.ctx; + struct intel_context *intel = intel_context(ctx); + GLuint i; + + brw_emit_query_begin(brw); + + /* If the VS doesn't read any inputs (calculating vertex position from + * a state variable for some reason, for example), emit a single pad + * VERTEX_ELEMENT struct and bail. + * + * The stale VB state stays in place, but they don't do anything unless + * a VE loads from them. + */ + if (brw->vb.nr_enabled == 0) { + BEGIN_BATCH(3, IGNORE_CLIPRECTS); + OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | 1); + OUT_BATCH((0 << BRW_VE0_INDEX_SHIFT) | + BRW_VE0_VALID | + (BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT) | + (0 << BRW_VE0_SRC_OFFSET_SHIFT)); + OUT_BATCH((BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_0_SHIFT) | + (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) | + (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) | + (BRW_VE1_COMPONENT_STORE_1_FLT << BRW_VE1_COMPONENT_3_SHIFT)); + ADVANCE_BATCH(); + return; + } + + /* Now emit VB and VEP state packets. + * + * This still defines a hardware VB for each input, even if they + * are interleaved or from the same VBO. TBD if this makes a + * performance difference. + */ + BEGIN_BATCH(1 + brw->vb.nr_enabled * 4, IGNORE_CLIPRECTS); + OUT_BATCH((CMD_VERTEX_BUFFER << 16) | + ((1 + brw->vb.nr_enabled * 4) - 2)); + + for (i = 0; i < brw->vb.nr_enabled; i++) { + struct brw_vertex_element *input = brw->vb.enabled[i]; + + OUT_BATCH((i << BRW_VB0_INDEX_SHIFT) | + BRW_VB0_ACCESS_VERTEXDATA | + (input->stride << BRW_VB0_PITCH_SHIFT)); + OUT_RELOC(input->bo, + I915_GEM_DOMAIN_VERTEX, 0, + input->offset); + if (BRW_IS_IGDNG(brw)) { + if (input->stride) { + OUT_RELOC(input->bo, + I915_GEM_DOMAIN_VERTEX, 0, + input->offset + input->stride * input->count - 1); + } else { + assert(input->count == 1); + OUT_RELOC(input->bo, + I915_GEM_DOMAIN_VERTEX, 0, + input->offset + input->element_size - 1); + } + } else + OUT_BATCH(input->stride ? input->count : 0); + OUT_BATCH(0); /* Instance data step rate */ + } + ADVANCE_BATCH(); + + BEGIN_BATCH(1 + brw->vb.nr_enabled * 2, IGNORE_CLIPRECTS); + OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | ((1 + brw->vb.nr_enabled * 2) - 2)); + for (i = 0; i < brw->vb.nr_enabled; i++) { + struct brw_vertex_element *input = brw->vb.enabled[i]; + uint32_t format = get_surface_type(input->glarray->Type, + input->glarray->Size, + input->glarray->Format, + input->glarray->Normalized); + uint32_t comp0 = BRW_VE1_COMPONENT_STORE_SRC; + uint32_t comp1 = BRW_VE1_COMPONENT_STORE_SRC; + uint32_t comp2 = BRW_VE1_COMPONENT_STORE_SRC; + uint32_t comp3 = BRW_VE1_COMPONENT_STORE_SRC; + + switch (input->glarray->Size) { + case 0: comp0 = BRW_VE1_COMPONENT_STORE_0; + case 1: comp1 = BRW_VE1_COMPONENT_STORE_0; + case 2: comp2 = BRW_VE1_COMPONENT_STORE_0; + case 3: comp3 = BRW_VE1_COMPONENT_STORE_1_FLT; + break; + } + + OUT_BATCH((i << BRW_VE0_INDEX_SHIFT) | + BRW_VE0_VALID | + (format << BRW_VE0_FORMAT_SHIFT) | + (0 << BRW_VE0_SRC_OFFSET_SHIFT)); + + if (BRW_IS_IGDNG(brw)) + OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) | + (comp1 << BRW_VE1_COMPONENT_1_SHIFT) | + (comp2 << BRW_VE1_COMPONENT_2_SHIFT) | + (comp3 << BRW_VE1_COMPONENT_3_SHIFT)); + else + OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) | + (comp1 << BRW_VE1_COMPONENT_1_SHIFT) | + (comp2 << BRW_VE1_COMPONENT_2_SHIFT) | + (comp3 << BRW_VE1_COMPONENT_3_SHIFT) | + ((i * 4) << BRW_VE1_DST_OFFSET_SHIFT)); + } + ADVANCE_BATCH(); +} + +const struct brw_tracked_state brw_vertices = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_BATCH | BRW_NEW_VERTICES, + .cache = 0, + }, + .prepare = brw_prepare_vertices, + .emit = brw_emit_vertices, +}; + +static void brw_prepare_indices(struct brw_context *brw) +{ + GLcontext *ctx = &brw->intel.ctx; + struct intel_context *intel = &brw->intel; + const struct _mesa_index_buffer *index_buffer = brw->ib.ib; + GLuint ib_size; + dri_bo *bo = NULL; + struct gl_buffer_object *bufferobj; + GLuint offset; + GLuint ib_type_size; + + if (index_buffer == NULL) + return; + + ib_type_size = get_size(index_buffer->type); + ib_size = ib_type_size * index_buffer->count; + bufferobj = index_buffer->obj;; + + /* Turn into a proper VBO: + */ + if (!_mesa_is_bufferobj(bufferobj)) { + brw->ib.start_vertex_offset = 0; + + /* Get new bufferobj, offset: + */ + get_space(brw, ib_size, &bo, &offset); + + /* Straight upload + */ + if (intel->intelScreen->kernel_exec_fencing) { + drm_intel_gem_bo_map_gtt(bo); + memcpy((char *)bo->virtual + offset, index_buffer->ptr, ib_size); + drm_intel_gem_bo_unmap_gtt(bo); + } else { + dri_bo_subdata(bo, offset, ib_size, index_buffer->ptr); + } + } else { + offset = (GLuint) (unsigned long) index_buffer->ptr; + brw->ib.start_vertex_offset = 0; + + /* If the index buffer isn't aligned to its element size, we have to + * rebase it into a temporary. + */ + if ((get_size(index_buffer->type) - 1) & offset) { + GLubyte *map = ctx->Driver.MapBuffer(ctx, + GL_ELEMENT_ARRAY_BUFFER_ARB, + GL_DYNAMIC_DRAW_ARB, + bufferobj); + map += offset; + + get_space(brw, ib_size, &bo, &offset); + + dri_bo_subdata(bo, offset, ib_size, map); + + ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER_ARB, bufferobj); + } else { + bo = intel_bufferobj_buffer(intel, intel_buffer_object(bufferobj), + INTEL_READ); + dri_bo_reference(bo); + + /* Use CMD_3D_PRIM's start_vertex_offset to avoid re-uploading + * the index buffer state when we're just moving the start index + * of our drawing. + */ + brw->ib.start_vertex_offset = offset / ib_type_size; + offset = 0; + ib_size = bo->size; + } + } + + if (brw->ib.bo != bo || + brw->ib.offset != offset || + brw->ib.size != ib_size) + { + drm_intel_bo_unreference(brw->ib.bo); + brw->ib.bo = bo; + brw->ib.offset = offset; + brw->ib.size = ib_size; + + brw->state.dirty.brw |= BRW_NEW_INDEX_BUFFER; + } else { + drm_intel_bo_unreference(bo); + } + + brw_add_validated_bo(brw, brw->ib.bo); +} + +const struct brw_tracked_state brw_indices = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_INDICES, + .cache = 0, + }, + .prepare = brw_prepare_indices, +}; + +static void brw_emit_index_buffer(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + const struct _mesa_index_buffer *index_buffer = brw->ib.ib; + + if (index_buffer == NULL) + return; + + /* Emit the indexbuffer packet: + */ + { + struct brw_indexbuffer ib; + + memset(&ib, 0, sizeof(ib)); + + ib.header.bits.opcode = CMD_INDEX_BUFFER; + ib.header.bits.length = sizeof(ib)/4 - 2; + ib.header.bits.index_format = get_index_type(index_buffer->type); + ib.header.bits.cut_index_enable = 0; + + BEGIN_BATCH(4, IGNORE_CLIPRECTS); + OUT_BATCH( ib.header.dword ); + OUT_RELOC(brw->ib.bo, + I915_GEM_DOMAIN_VERTEX, 0, + brw->ib.offset); + OUT_RELOC(brw->ib.bo, + I915_GEM_DOMAIN_VERTEX, 0, + brw->ib.offset + brw->ib.size - 1); + OUT_BATCH( 0 ); + ADVANCE_BATCH(); + } +} + +const struct brw_tracked_state brw_index_buffer = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_BATCH | BRW_NEW_INDEX_BUFFER, + .cache = 0, + }, + .emit = brw_emit_index_buffer, +}; diff --git a/src/gallium/drivers/i965/brw_eu.c b/src/gallium/drivers/i965/brw_eu.c new file mode 100644 index 00000000000..1df561386e6 --- /dev/null +++ b/src/gallium/drivers/i965/brw_eu.c @@ -0,0 +1,254 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "brw_context.h" +#include "brw_defines.h" +#include "brw_eu.h" + + + +/* How does predicate control work when execution_size != 8? Do I + * need to test/set for 0xffff when execution_size is 16? + */ +void brw_set_predicate_control_flag_value( struct brw_compile *p, GLuint value ) +{ + p->current->header.predicate_control = BRW_PREDICATE_NONE; + + if (value != 0xff) { + if (value != p->flag_value) { + brw_push_insn_state(p); + brw_MOV(p, brw_flag_reg(), brw_imm_uw(value)); + p->flag_value = value; + brw_pop_insn_state(p); + } + + p->current->header.predicate_control = BRW_PREDICATE_NORMAL; + } +} + +void brw_set_predicate_control( struct brw_compile *p, GLuint pc ) +{ + p->current->header.predicate_control = pc; +} + +void brw_set_conditionalmod( struct brw_compile *p, GLuint conditional ) +{ + p->current->header.destreg__conditionalmod = conditional; +} + +void brw_set_access_mode( struct brw_compile *p, GLuint access_mode ) +{ + p->current->header.access_mode = access_mode; +} + +void brw_set_compression_control( struct brw_compile *p, GLboolean compression_control ) +{ + p->current->header.compression_control = compression_control; +} + +void brw_set_mask_control( struct brw_compile *p, GLuint value ) +{ + p->current->header.mask_control = value; +} + +void brw_set_saturate( struct brw_compile *p, GLuint value ) +{ + p->current->header.saturate = value; +} + +void brw_push_insn_state( struct brw_compile *p ) +{ + assert(p->current != &p->stack[BRW_EU_MAX_INSN_STACK-1]); + memcpy(p->current+1, p->current, sizeof(struct brw_instruction)); + p->current++; +} + +void brw_pop_insn_state( struct brw_compile *p ) +{ + assert(p->current != p->stack); + p->current--; +} + + +/*********************************************************************** + */ +void brw_init_compile( struct brw_context *brw, struct brw_compile *p ) +{ + p->brw = brw; + p->nr_insn = 0; + p->current = p->stack; + memset(p->current, 0, sizeof(p->current[0])); + + /* Some defaults? + */ + brw_set_mask_control(p, BRW_MASK_ENABLE); /* what does this do? */ + brw_set_saturate(p, 0); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_predicate_control_flag_value(p, 0xff); +} + + +const GLuint *brw_get_program( struct brw_compile *p, + GLuint *sz ) +{ + GLuint i; + + for (i = 0; i < 8; i++) + brw_NOP(p); + + *sz = p->nr_insn * sizeof(struct brw_instruction); + return (const GLuint *)p->store; +} + + + +/** + * Subroutine calls require special attention. + * Mesa instructions may be expanded into multiple hardware instructions + * so the prog_instruction::BranchTarget field can't be used as an index + * into the hardware instructions. + * + * The BranchTarget field isn't needed, however. Mesa's GLSL compiler + * emits CAL and BGNSUB instructions with labels that can be used to map + * subroutine calls to actual subroutine code blocks. + * + * The structures and function here implement patching of CAL instructions + * so they jump to the right subroutine code... + */ + + +/** + * For each OPCODE_BGNSUB we create one of these. + */ +struct brw_glsl_label +{ + const char *name; /**< the label string */ + GLuint position; /**< the position of the brw instruction for this label */ + struct brw_glsl_label *next; /**< next in linked list */ +}; + + +/** + * For each OPCODE_CAL we create one of these. + */ +struct brw_glsl_call +{ + GLuint call_inst_pos; /**< location of the CAL instruction */ + const char *sub_name; /**< name of subroutine to call */ + struct brw_glsl_call *next; /**< next in linked list */ +}; + + +/** + * Called for each OPCODE_BGNSUB. + */ +void +brw_save_label(struct brw_compile *c, const char *name, GLuint position) +{ + struct brw_glsl_label *label = CALLOC_STRUCT(brw_glsl_label); + label->name = name; + label->position = position; + label->next = c->first_label; + c->first_label = label; +} + + +/** + * Called for each OPCODE_CAL. + */ +void +brw_save_call(struct brw_compile *c, const char *name, GLuint call_pos) +{ + struct brw_glsl_call *call = CALLOC_STRUCT(brw_glsl_call); + call->call_inst_pos = call_pos; + call->sub_name = name; + call->next = c->first_call; + c->first_call = call; +} + + +/** + * Lookup a label, return label's position/offset. + */ +static GLuint +brw_lookup_label(struct brw_compile *c, const char *name) +{ + const struct brw_glsl_label *label; + for (label = c->first_label; label; label = label->next) { + if (strcmp(name, label->name) == 0) { + return label->position; + } + } + abort(); /* should never happen */ + return ~0; +} + + +/** + * When we're done generating code, this function is called to resolve + * subroutine calls. + */ +void +brw_resolve_cals(struct brw_compile *c) +{ + const struct brw_glsl_call *call; + + for (call = c->first_call; call; call = call->next) { + const GLuint sub_loc = brw_lookup_label(c, call->sub_name); + struct brw_instruction *brw_call_inst = &c->store[call->call_inst_pos]; + struct brw_instruction *brw_sub_inst = &c->store[sub_loc]; + GLint offset = brw_sub_inst - brw_call_inst; + + /* patch brw_inst1 to point to brw_inst2 */ + brw_set_src1(brw_call_inst, brw_imm_d(offset * 16)); + } + + /* free linked list of calls */ + { + struct brw_glsl_call *call, *next; + for (call = c->first_call; call; call = next) { + next = call->next; + _mesa_free(call); + } + c->first_call = NULL; + } + + /* free linked list of labels */ + { + struct brw_glsl_label *label, *next; + for (label = c->first_label; label; label = next) { + next = label->next; + _mesa_free(label); + } + c->first_label = NULL; + } +} diff --git a/src/gallium/drivers/i965/brw_eu.h b/src/gallium/drivers/i965/brw_eu.h new file mode 100644 index 00000000000..30603bdd0e6 --- /dev/null +++ b/src/gallium/drivers/i965/brw_eu.h @@ -0,0 +1,968 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#ifndef BRW_EU_H +#define BRW_EU_H + +#include "brw_structs.h" +#include "brw_defines.h" +#include "shader/prog_instruction.h" + +#define BRW_SWIZZLE4(a,b,c,d) (((a)<<0) | ((b)<<2) | ((c)<<4) | ((d)<<6)) +#define BRW_GET_SWZ(swz, idx) (((swz) >> ((idx)*2)) & 0x3) + +#define BRW_SWIZZLE_NOOP BRW_SWIZZLE4(0,1,2,3) +#define BRW_SWIZZLE_XYZW BRW_SWIZZLE4(0,1,2,3) +#define BRW_SWIZZLE_XXXX BRW_SWIZZLE4(0,0,0,0) +#define BRW_SWIZZLE_XYXY BRW_SWIZZLE4(0,1,0,1) + + +#define REG_SIZE (8*4) + + +/* These aren't hardware structs, just something useful for us to pass around: + * + * Align1 operation has a lot of control over input ranges. Used in + * WM programs to implement shaders decomposed into "channel serial" + * or "structure of array" form: + */ +struct brw_reg +{ + GLuint type:4; + GLuint file:2; + GLuint nr:8; + GLuint subnr:5; /* :1 in align16 */ + GLuint negate:1; /* source only */ + GLuint abs:1; /* source only */ + GLuint vstride:4; /* source only */ + GLuint width:3; /* src only, align1 only */ + GLuint hstride:2; /* align1 only */ + GLuint address_mode:1; /* relative addressing, hopefully! */ + GLuint pad0:1; + + union { + struct { + GLuint swizzle:8; /* src only, align16 only */ + GLuint writemask:4; /* dest only, align16 only */ + GLint indirect_offset:10; /* relative addressing offset */ + GLuint pad1:10; /* two dwords total */ + } bits; + + GLfloat f; + GLint d; + GLuint ud; + } dw1; +}; + + +struct brw_indirect { + GLuint addr_subnr:4; + GLint addr_offset:10; + GLuint pad:18; +}; + + +struct brw_glsl_label; +struct brw_glsl_call; + + + +#define BRW_EU_MAX_INSN_STACK 5 +#define BRW_EU_MAX_INSN 10000 + +struct brw_compile { + struct brw_instruction store[BRW_EU_MAX_INSN]; + GLuint nr_insn; + + /* Allow clients to push/pop instruction state: + */ + struct brw_instruction stack[BRW_EU_MAX_INSN_STACK]; + struct brw_instruction *current; + + GLuint flag_value; + GLboolean single_program_flow; + struct brw_context *brw; + + struct brw_glsl_label *first_label; /**< linked list of labels */ + struct brw_glsl_call *first_call; /**< linked list of CALs */ +}; + + +void +brw_save_label(struct brw_compile *c, const char *name, GLuint position); + +void +brw_save_call(struct brw_compile *c, const char *name, GLuint call_pos); + +void +brw_resolve_cals(struct brw_compile *c); + + + +static INLINE int type_sz( GLuint type ) +{ + switch( type ) { + case BRW_REGISTER_TYPE_UD: + case BRW_REGISTER_TYPE_D: + case BRW_REGISTER_TYPE_F: + return 4; + case BRW_REGISTER_TYPE_HF: + case BRW_REGISTER_TYPE_UW: + case BRW_REGISTER_TYPE_W: + return 2; + case BRW_REGISTER_TYPE_UB: + case BRW_REGISTER_TYPE_B: + return 1; + default: + return 0; + } +} + +/** + * Construct a brw_reg. + * \param file one of the BRW_x_REGISTER_FILE values + * \param nr register number/index + * \param subnr register sub number + * \param type one of BRW_REGISTER_TYPE_x + * \param vstride one of BRW_VERTICAL_STRIDE_x + * \param width one of BRW_WIDTH_x + * \param hstride one of BRW_HORIZONTAL_STRIDE_x + * \param swizzle one of BRW_SWIZZLE_x + * \param writemask WRITEMASK_X/Y/Z/W bitfield + */ +static INLINE struct brw_reg brw_reg( GLuint file, + GLuint nr, + GLuint subnr, + GLuint type, + GLuint vstride, + GLuint width, + GLuint hstride, + GLuint swizzle, + GLuint writemask ) +{ + struct brw_reg reg; + if (type == BRW_GENERAL_REGISTER_FILE) + assert(nr < BRW_MAX_GRF); + else if (type == BRW_MESSAGE_REGISTER_FILE) + assert(nr < BRW_MAX_MRF); + else if (type == BRW_ARCHITECTURE_REGISTER_FILE) + assert(nr <= BRW_ARF_IP); + + reg.type = type; + reg.file = file; + reg.nr = nr; + reg.subnr = subnr * type_sz(type); + reg.negate = 0; + reg.abs = 0; + reg.vstride = vstride; + reg.width = width; + reg.hstride = hstride; + reg.address_mode = BRW_ADDRESS_DIRECT; + reg.pad0 = 0; + + /* Could do better: If the reg is r5.3<0;1,0>, we probably want to + * set swizzle and writemask to W, as the lower bits of subnr will + * be lost when converted to align16. This is probably too much to + * keep track of as you'd want it adjusted by suboffset(), etc. + * Perhaps fix up when converting to align16? + */ + reg.dw1.bits.swizzle = swizzle; + reg.dw1.bits.writemask = writemask; + reg.dw1.bits.indirect_offset = 0; + reg.dw1.bits.pad1 = 0; + return reg; +} + +/** Construct float[16] register */ +static INLINE struct brw_reg brw_vec16_reg( GLuint file, + GLuint nr, + GLuint subnr ) +{ + return brw_reg(file, + nr, + subnr, + BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_16, + BRW_WIDTH_16, + BRW_HORIZONTAL_STRIDE_1, + BRW_SWIZZLE_XYZW, + WRITEMASK_XYZW); +} + +/** Construct float[8] register */ +static INLINE struct brw_reg brw_vec8_reg( GLuint file, + GLuint nr, + GLuint subnr ) +{ + return brw_reg(file, + nr, + subnr, + BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_8, + BRW_WIDTH_8, + BRW_HORIZONTAL_STRIDE_1, + BRW_SWIZZLE_XYZW, + WRITEMASK_XYZW); +} + +/** Construct float[4] register */ +static INLINE struct brw_reg brw_vec4_reg( GLuint file, + GLuint nr, + GLuint subnr ) +{ + return brw_reg(file, + nr, + subnr, + BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_4, + BRW_WIDTH_4, + BRW_HORIZONTAL_STRIDE_1, + BRW_SWIZZLE_XYZW, + WRITEMASK_XYZW); +} + +/** Construct float[2] register */ +static INLINE struct brw_reg brw_vec2_reg( GLuint file, + GLuint nr, + GLuint subnr ) +{ + return brw_reg(file, + nr, + subnr, + BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_2, + BRW_WIDTH_2, + BRW_HORIZONTAL_STRIDE_1, + BRW_SWIZZLE_XYXY, + WRITEMASK_XY); +} + +/** Construct float[1] register */ +static INLINE struct brw_reg brw_vec1_reg( GLuint file, + GLuint nr, + GLuint subnr ) +{ + return brw_reg(file, + nr, + subnr, + BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_0, + BRW_WIDTH_1, + BRW_HORIZONTAL_STRIDE_0, + BRW_SWIZZLE_XXXX, + WRITEMASK_X); +} + + +static INLINE struct brw_reg retype( struct brw_reg reg, + GLuint type ) +{ + reg.type = type; + return reg; +} + +static INLINE struct brw_reg suboffset( struct brw_reg reg, + GLuint delta ) +{ + reg.subnr += delta * type_sz(reg.type); + return reg; +} + + +static INLINE struct brw_reg offset( struct brw_reg reg, + GLuint delta ) +{ + reg.nr += delta; + return reg; +} + + +static INLINE struct brw_reg byte_offset( struct brw_reg reg, + GLuint bytes ) +{ + GLuint newoffset = reg.nr * REG_SIZE + reg.subnr + bytes; + reg.nr = newoffset / REG_SIZE; + reg.subnr = newoffset % REG_SIZE; + return reg; +} + + +/** Construct unsigned word[16] register */ +static INLINE struct brw_reg brw_uw16_reg( GLuint file, + GLuint nr, + GLuint subnr ) +{ + return suboffset(retype(brw_vec16_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr); +} + +/** Construct unsigned word[8] register */ +static INLINE struct brw_reg brw_uw8_reg( GLuint file, + GLuint nr, + GLuint subnr ) +{ + return suboffset(retype(brw_vec8_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr); +} + +/** Construct unsigned word[1] register */ +static INLINE struct brw_reg brw_uw1_reg( GLuint file, + GLuint nr, + GLuint subnr ) +{ + return suboffset(retype(brw_vec1_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr); +} + +static INLINE struct brw_reg brw_imm_reg( GLuint type ) +{ + return brw_reg( BRW_IMMEDIATE_VALUE, + 0, + 0, + type, + BRW_VERTICAL_STRIDE_0, + BRW_WIDTH_1, + BRW_HORIZONTAL_STRIDE_0, + 0, + 0); +} + +/** Construct float immediate register */ +static INLINE struct brw_reg brw_imm_f( GLfloat f ) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_F); + imm.dw1.f = f; + return imm; +} + +/** Construct integer immediate register */ +static INLINE struct brw_reg brw_imm_d( GLint d ) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_D); + imm.dw1.d = d; + return imm; +} + +/** Construct uint immediate register */ +static INLINE struct brw_reg brw_imm_ud( GLuint ud ) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UD); + imm.dw1.ud = ud; + return imm; +} + +/** Construct ushort immediate register */ +static INLINE struct brw_reg brw_imm_uw( GLushort uw ) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UW); + imm.dw1.ud = uw | (uw << 16); + return imm; +} + +/** Construct short immediate register */ +static INLINE struct brw_reg brw_imm_w( GLshort w ) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_W); + imm.dw1.d = w | (w << 16); + return imm; +} + +/* brw_imm_b and brw_imm_ub aren't supported by hardware - the type + * numbers alias with _V and _VF below: + */ + +/** Construct vector of eight signed half-byte values */ +static INLINE struct brw_reg brw_imm_v( GLuint v ) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_V); + imm.vstride = BRW_VERTICAL_STRIDE_0; + imm.width = BRW_WIDTH_8; + imm.hstride = BRW_HORIZONTAL_STRIDE_1; + imm.dw1.ud = v; + return imm; +} + +/** Construct vector of four 8-bit float values */ +static INLINE struct brw_reg brw_imm_vf( GLuint v ) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_VF); + imm.vstride = BRW_VERTICAL_STRIDE_0; + imm.width = BRW_WIDTH_4; + imm.hstride = BRW_HORIZONTAL_STRIDE_1; + imm.dw1.ud = v; + return imm; +} + +#define VF_ZERO 0x0 +#define VF_ONE 0x30 +#define VF_NEG (1<<7) + +static INLINE struct brw_reg brw_imm_vf4( GLuint v0, + GLuint v1, + GLuint v2, + GLuint v3) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_VF); + imm.vstride = BRW_VERTICAL_STRIDE_0; + imm.width = BRW_WIDTH_4; + imm.hstride = BRW_HORIZONTAL_STRIDE_1; + imm.dw1.ud = ((v0 << 0) | + (v1 << 8) | + (v2 << 16) | + (v3 << 24)); + return imm; +} + + +static INLINE struct brw_reg brw_address( struct brw_reg reg ) +{ + return brw_imm_uw(reg.nr * REG_SIZE + reg.subnr); +} + +/** Construct float[1] general-purpose register */ +static INLINE struct brw_reg brw_vec1_grf( GLuint nr, GLuint subnr ) +{ + return brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +/** Construct float[2] general-purpose register */ +static INLINE struct brw_reg brw_vec2_grf( GLuint nr, GLuint subnr ) +{ + return brw_vec2_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +/** Construct float[4] general-purpose register */ +static INLINE struct brw_reg brw_vec4_grf( GLuint nr, GLuint subnr ) +{ + return brw_vec4_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +/** Construct float[8] general-purpose register */ +static INLINE struct brw_reg brw_vec8_grf( GLuint nr, GLuint subnr ) +{ + return brw_vec8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + + +static INLINE struct brw_reg brw_uw8_grf( GLuint nr, GLuint subnr ) +{ + return brw_uw8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +static INLINE struct brw_reg brw_uw16_grf( GLuint nr, GLuint subnr ) +{ + return brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + + +/** Construct null register (usually used for setting condition codes) */ +static INLINE struct brw_reg brw_null_reg( void ) +{ + return brw_vec8_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_NULL, + 0); +} + +static INLINE struct brw_reg brw_address_reg( GLuint subnr ) +{ + return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_ADDRESS, + subnr); +} + +/* If/else instructions break in align16 mode if writemask & swizzle + * aren't xyzw. This goes against the convention for other scalar + * regs: + */ +static INLINE struct brw_reg brw_ip_reg( void ) +{ + return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_IP, + 0, + BRW_REGISTER_TYPE_UD, + BRW_VERTICAL_STRIDE_4, /* ? */ + BRW_WIDTH_1, + BRW_HORIZONTAL_STRIDE_0, + BRW_SWIZZLE_XYZW, /* NOTE! */ + WRITEMASK_XYZW); /* NOTE! */ +} + +static INLINE struct brw_reg brw_acc_reg( void ) +{ + return brw_vec8_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_ACCUMULATOR, + 0); +} + + +static INLINE struct brw_reg brw_flag_reg( void ) +{ + return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_FLAG, + 0); +} + + +static INLINE struct brw_reg brw_mask_reg( GLuint subnr ) +{ + return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_MASK, + subnr); +} + +static INLINE struct brw_reg brw_message_reg( GLuint nr ) +{ + assert(nr < BRW_MAX_MRF); + return brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, + nr, + 0); +} + + + + +/* This is almost always called with a numeric constant argument, so + * make things easy to evaluate at compile time: + */ +static INLINE GLuint cvt( GLuint val ) +{ + switch (val) { + case 0: return 0; + case 1: return 1; + case 2: return 2; + case 4: return 3; + case 8: return 4; + case 16: return 5; + case 32: return 6; + } + return 0; +} + +static INLINE struct brw_reg stride( struct brw_reg reg, + GLuint vstride, + GLuint width, + GLuint hstride ) +{ + reg.vstride = cvt(vstride); + reg.width = cvt(width) - 1; + reg.hstride = cvt(hstride); + return reg; +} + + +static INLINE struct brw_reg vec16( struct brw_reg reg ) +{ + return stride(reg, 16,16,1); +} + +static INLINE struct brw_reg vec8( struct brw_reg reg ) +{ + return stride(reg, 8,8,1); +} + +static INLINE struct brw_reg vec4( struct brw_reg reg ) +{ + return stride(reg, 4,4,1); +} + +static INLINE struct brw_reg vec2( struct brw_reg reg ) +{ + return stride(reg, 2,2,1); +} + +static INLINE struct brw_reg vec1( struct brw_reg reg ) +{ + return stride(reg, 0,1,0); +} + + +static INLINE struct brw_reg get_element( struct brw_reg reg, GLuint elt ) +{ + return vec1(suboffset(reg, elt)); +} + +static INLINE struct brw_reg get_element_ud( struct brw_reg reg, GLuint elt ) +{ + return vec1(suboffset(retype(reg, BRW_REGISTER_TYPE_UD), elt)); +} + + +static INLINE struct brw_reg brw_swizzle( struct brw_reg reg, + GLuint x, + GLuint y, + GLuint z, + GLuint w) +{ + reg.dw1.bits.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(reg.dw1.bits.swizzle, x), + BRW_GET_SWZ(reg.dw1.bits.swizzle, y), + BRW_GET_SWZ(reg.dw1.bits.swizzle, z), + BRW_GET_SWZ(reg.dw1.bits.swizzle, w)); + return reg; +} + + +static INLINE struct brw_reg brw_swizzle1( struct brw_reg reg, + GLuint x ) +{ + return brw_swizzle(reg, x, x, x, x); +} + +static INLINE struct brw_reg brw_writemask( struct brw_reg reg, + GLuint mask ) +{ + reg.dw1.bits.writemask &= mask; + return reg; +} + +static INLINE struct brw_reg brw_set_writemask( struct brw_reg reg, + GLuint mask ) +{ + reg.dw1.bits.writemask = mask; + return reg; +} + +static INLINE struct brw_reg negate( struct brw_reg reg ) +{ + reg.negate ^= 1; + return reg; +} + +static INLINE struct brw_reg brw_abs( struct brw_reg reg ) +{ + reg.abs = 1; + return reg; +} + +/*********************************************************************** + */ +static INLINE struct brw_reg brw_vec4_indirect( GLuint subnr, + GLint offset ) +{ + struct brw_reg reg = brw_vec4_grf(0, 0); + reg.subnr = subnr; + reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER; + reg.dw1.bits.indirect_offset = offset; + return reg; +} + +static INLINE struct brw_reg brw_vec1_indirect( GLuint subnr, + GLint offset ) +{ + struct brw_reg reg = brw_vec1_grf(0, 0); + reg.subnr = subnr; + reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER; + reg.dw1.bits.indirect_offset = offset; + return reg; +} + +static INLINE struct brw_reg deref_4f(struct brw_indirect ptr, GLint offset) +{ + return brw_vec4_indirect(ptr.addr_subnr, ptr.addr_offset + offset); +} + +static INLINE struct brw_reg deref_1f(struct brw_indirect ptr, GLint offset) +{ + return brw_vec1_indirect(ptr.addr_subnr, ptr.addr_offset + offset); +} + +static INLINE struct brw_reg deref_4b(struct brw_indirect ptr, GLint offset) +{ + return retype(deref_4f(ptr, offset), BRW_REGISTER_TYPE_B); +} + +static INLINE struct brw_reg deref_1uw(struct brw_indirect ptr, GLint offset) +{ + return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_UW); +} + +static INLINE struct brw_reg deref_1d(struct brw_indirect ptr, GLint offset) +{ + return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_D); +} + +static INLINE struct brw_reg deref_1ud(struct brw_indirect ptr, GLint offset) +{ + return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_UD); +} + +static INLINE struct brw_reg get_addr_reg(struct brw_indirect ptr) +{ + return brw_address_reg(ptr.addr_subnr); +} + +static INLINE struct brw_indirect brw_indirect_offset( struct brw_indirect ptr, GLint offset ) +{ + ptr.addr_offset += offset; + return ptr; +} + +static INLINE struct brw_indirect brw_indirect( GLuint addr_subnr, GLint offset ) +{ + struct brw_indirect ptr; + ptr.addr_subnr = addr_subnr; + ptr.addr_offset = offset; + ptr.pad = 0; + return ptr; +} + +/** Do two brw_regs refer to the same register? */ +static INLINE GLboolean +brw_same_reg(struct brw_reg r1, struct brw_reg r2) +{ + return r1.file == r2.file && r1.nr == r2.nr; +} + +static INLINE struct brw_instruction *current_insn( struct brw_compile *p) +{ + return &p->store[p->nr_insn]; +} + +void brw_pop_insn_state( struct brw_compile *p ); +void brw_push_insn_state( struct brw_compile *p ); +void brw_set_mask_control( struct brw_compile *p, GLuint value ); +void brw_set_saturate( struct brw_compile *p, GLuint value ); +void brw_set_access_mode( struct brw_compile *p, GLuint access_mode ); +void brw_set_compression_control( struct brw_compile *p, GLboolean control ); +void brw_set_predicate_control_flag_value( struct brw_compile *p, GLuint value ); +void brw_set_predicate_control( struct brw_compile *p, GLuint pc ); +void brw_set_conditionalmod( struct brw_compile *p, GLuint conditional ); + +void brw_init_compile( struct brw_context *, struct brw_compile *p ); +const GLuint *brw_get_program( struct brw_compile *p, GLuint *sz ); + + +/* Helpers for regular instructions: + */ +#define ALU1(OP) \ +struct brw_instruction *brw_##OP(struct brw_compile *p, \ + struct brw_reg dest, \ + struct brw_reg src0); + +#define ALU2(OP) \ +struct brw_instruction *brw_##OP(struct brw_compile *p, \ + struct brw_reg dest, \ + struct brw_reg src0, \ + struct brw_reg src1); + +ALU1(MOV) +ALU2(SEL) +ALU1(NOT) +ALU2(AND) +ALU2(OR) +ALU2(XOR) +ALU2(SHR) +ALU2(SHL) +ALU2(RSR) +ALU2(RSL) +ALU2(ASR) +ALU2(JMPI) +ALU2(ADD) +ALU2(MUL) +ALU1(FRC) +ALU1(RNDD) +ALU1(RNDZ) +ALU2(MAC) +ALU2(MACH) +ALU1(LZD) +ALU2(DP4) +ALU2(DPH) +ALU2(DP3) +ALU2(DP2) +ALU2(LINE) + +#undef ALU1 +#undef ALU2 + + + +/* Helpers for SEND instruction: + */ +void brw_urb_WRITE(struct brw_compile *p, + struct brw_reg dest, + GLuint msg_reg_nr, + struct brw_reg src0, + GLboolean allocate, + GLboolean used, + GLuint msg_length, + GLuint response_length, + GLboolean eot, + GLboolean writes_complete, + GLuint offset, + GLuint swizzle); + +void brw_ff_sync(struct brw_compile *p, + struct brw_reg dest, + GLuint msg_reg_nr, + struct brw_reg src0, + GLboolean allocate, + GLboolean used, + GLuint msg_length, + GLuint response_length, + GLboolean eot, + GLboolean writes_complete, + GLuint offset, + GLuint swizzle); + +void brw_fb_WRITE(struct brw_compile *p, + struct brw_reg dest, + GLuint msg_reg_nr, + struct brw_reg src0, + GLuint binding_table_index, + GLuint msg_length, + GLuint response_length, + GLboolean eot); + +void brw_SAMPLE(struct brw_compile *p, + struct brw_reg dest, + GLuint msg_reg_nr, + struct brw_reg src0, + GLuint binding_table_index, + GLuint sampler, + GLuint writemask, + GLuint msg_type, + GLuint response_length, + GLuint msg_length, + GLboolean eot, + GLuint header_present, + GLuint simd_mode); + +void brw_math_16( struct brw_compile *p, + struct brw_reg dest, + GLuint function, + GLuint saturate, + GLuint msg_reg_nr, + struct brw_reg src, + GLuint precision ); + +void brw_math( struct brw_compile *p, + struct brw_reg dest, + GLuint function, + GLuint saturate, + GLuint msg_reg_nr, + struct brw_reg src, + GLuint data_type, + GLuint precision ); + +void brw_dp_READ_16( struct brw_compile *p, + struct brw_reg dest, + GLuint scratch_offset ); + +void brw_dp_READ_4( struct brw_compile *p, + struct brw_reg dest, + GLboolean relAddr, + GLuint location, + GLuint bind_table_index ); + +void brw_dp_READ_4_vs( struct brw_compile *p, + struct brw_reg dest, + GLuint oword, + GLboolean relAddr, + struct brw_reg addrReg, + GLuint location, + GLuint bind_table_index ); + +void brw_dp_WRITE_16( struct brw_compile *p, + struct brw_reg src, + GLuint scratch_offset ); + +/* If/else/endif. Works by manipulating the execution flags on each + * channel. + */ +struct brw_instruction *brw_IF(struct brw_compile *p, + GLuint execute_size); + +struct brw_instruction *brw_ELSE(struct brw_compile *p, + struct brw_instruction *if_insn); + +void brw_ENDIF(struct brw_compile *p, + struct brw_instruction *if_or_else_insn); + + +/* DO/WHILE loops: + */ +struct brw_instruction *brw_DO(struct brw_compile *p, + GLuint execute_size); + +struct brw_instruction *brw_WHILE(struct brw_compile *p, + struct brw_instruction *patch_insn); + +struct brw_instruction *brw_BREAK(struct brw_compile *p); +struct brw_instruction *brw_CONT(struct brw_compile *p); +/* Forward jumps: + */ +void brw_land_fwd_jump(struct brw_compile *p, + struct brw_instruction *jmp_insn); + + + +void brw_NOP(struct brw_compile *p); + +/* Special case: there is never a destination, execution size will be + * taken from src0: + */ +void brw_CMP(struct brw_compile *p, + struct brw_reg dest, + GLuint conditional, + struct brw_reg src0, + struct brw_reg src1); + +void brw_print_reg( struct brw_reg reg ); + + +/*********************************************************************** + * brw_eu_util.c: + */ + +void brw_copy_indirect_to_indirect(struct brw_compile *p, + struct brw_indirect dst_ptr, + struct brw_indirect src_ptr, + GLuint count); + +void brw_copy_from_indirect(struct brw_compile *p, + struct brw_reg dst, + struct brw_indirect ptr, + GLuint count); + +void brw_copy4(struct brw_compile *p, + struct brw_reg dst, + struct brw_reg src, + GLuint count); + +void brw_copy8(struct brw_compile *p, + struct brw_reg dst, + struct brw_reg src, + GLuint count); + +void brw_math_invert( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg src); + +void brw_set_src1( struct brw_instruction *insn, + struct brw_reg reg ); +#endif diff --git a/src/gallium/drivers/i965/brw_eu_debug.c b/src/gallium/drivers/i965/brw_eu_debug.c new file mode 100644 index 00000000000..29f3f6d02fa --- /dev/null +++ b/src/gallium/drivers/i965/brw_eu_debug.c @@ -0,0 +1,95 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "main/mtypes.h" +#include "main/imports.h" +#include "brw_eu.h" + +void brw_print_reg( struct brw_reg hwreg ) +{ + static const char *file[] = { + "arf", + "grf", + "msg", + "imm" + }; + + static const char *type[] = { + "ud", + "d", + "uw", + "w", + "ub", + "vf", + "hf", + "f" + }; + + _mesa_printf("%s%s", + hwreg.abs ? "abs/" : "", + hwreg.negate ? "-" : ""); + + if (hwreg.file == BRW_GENERAL_REGISTER_FILE && + hwreg.nr % 2 == 0 && + hwreg.subnr == 0 && + hwreg.vstride == BRW_VERTICAL_STRIDE_8 && + hwreg.width == BRW_WIDTH_8 && + hwreg.hstride == BRW_HORIZONTAL_STRIDE_1 && + hwreg.type == BRW_REGISTER_TYPE_F) { + /* vector register */ + _mesa_printf("vec%d", hwreg.nr); + } + else if (hwreg.file == BRW_GENERAL_REGISTER_FILE && + hwreg.vstride == BRW_VERTICAL_STRIDE_0 && + hwreg.width == BRW_WIDTH_1 && + hwreg.hstride == BRW_HORIZONTAL_STRIDE_0 && + hwreg.type == BRW_REGISTER_TYPE_F) { + /* "scalar" register */ + _mesa_printf("scl%d.%d", hwreg.nr, hwreg.subnr / 4); + } + else if (hwreg.file == BRW_IMMEDIATE_VALUE) { + _mesa_printf("imm %f", hwreg.dw1.f); + } + else { + _mesa_printf("%s%d.%d<%d;%d,%d>:%s", + file[hwreg.file], + hwreg.nr, + hwreg.subnr / type_sz(hwreg.type), + hwreg.vstride ? (1<<(hwreg.vstride-1)) : 0, + 1<<hwreg.width, + hwreg.hstride ? (1<<(hwreg.hstride-1)) : 0, + type[hwreg.type]); + } +} + + + diff --git a/src/gallium/drivers/i965/brw_eu_emit.c b/src/gallium/drivers/i965/brw_eu_emit.c new file mode 100644 index 00000000000..241cdc33f86 --- /dev/null +++ b/src/gallium/drivers/i965/brw_eu_emit.c @@ -0,0 +1,1425 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "brw_context.h" +#include "brw_defines.h" +#include "brw_eu.h" + + + + +/*********************************************************************** + * Internal helper for constructing instructions + */ + +static void guess_execution_size( struct brw_instruction *insn, + struct brw_reg reg ) +{ + if (reg.width == BRW_WIDTH_8 && + insn->header.compression_control == BRW_COMPRESSION_COMPRESSED) + insn->header.execution_size = BRW_EXECUTE_16; + else + insn->header.execution_size = reg.width; /* note - definitions are compatible */ +} + + +static void brw_set_dest( struct brw_instruction *insn, + struct brw_reg dest ) +{ + if (dest.type != BRW_ARCHITECTURE_REGISTER_FILE) + assert(dest.nr < 128); + + insn->bits1.da1.dest_reg_file = dest.file; + insn->bits1.da1.dest_reg_type = dest.type; + insn->bits1.da1.dest_address_mode = dest.address_mode; + + if (dest.address_mode == BRW_ADDRESS_DIRECT) { + insn->bits1.da1.dest_reg_nr = dest.nr; + + if (insn->header.access_mode == BRW_ALIGN_1) { + insn->bits1.da1.dest_subreg_nr = dest.subnr; + if (dest.hstride == BRW_HORIZONTAL_STRIDE_0) + dest.hstride = BRW_HORIZONTAL_STRIDE_1; + insn->bits1.da1.dest_horiz_stride = dest.hstride; + } + else { + insn->bits1.da16.dest_subreg_nr = dest.subnr / 16; + insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask; + } + } + else { + insn->bits1.ia1.dest_subreg_nr = dest.subnr; + + /* These are different sizes in align1 vs align16: + */ + if (insn->header.access_mode == BRW_ALIGN_1) { + insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset; + if (dest.hstride == BRW_HORIZONTAL_STRIDE_0) + dest.hstride = BRW_HORIZONTAL_STRIDE_1; + insn->bits1.ia1.dest_horiz_stride = dest.hstride; + } + else { + insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset; + } + } + + /* NEW: Set the execution size based on dest.width and + * insn->compression_control: + */ + guess_execution_size(insn, dest); +} + +static void brw_set_src0( struct brw_instruction *insn, + struct brw_reg reg ) +{ + assert(reg.file != BRW_MESSAGE_REGISTER_FILE); + + if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE) + assert(reg.nr < 128); + + insn->bits1.da1.src0_reg_file = reg.file; + insn->bits1.da1.src0_reg_type = reg.type; + insn->bits2.da1.src0_abs = reg.abs; + insn->bits2.da1.src0_negate = reg.negate; + insn->bits2.da1.src0_address_mode = reg.address_mode; + + if (reg.file == BRW_IMMEDIATE_VALUE) { + insn->bits3.ud = reg.dw1.ud; + + /* Required to set some fields in src1 as well: + */ + insn->bits1.da1.src1_reg_file = 0; /* arf */ + insn->bits1.da1.src1_reg_type = reg.type; + } + else + { + if (reg.address_mode == BRW_ADDRESS_DIRECT) { + if (insn->header.access_mode == BRW_ALIGN_1) { + insn->bits2.da1.src0_subreg_nr = reg.subnr; + insn->bits2.da1.src0_reg_nr = reg.nr; + } + else { + insn->bits2.da16.src0_subreg_nr = reg.subnr / 16; + insn->bits2.da16.src0_reg_nr = reg.nr; + } + } + else { + insn->bits2.ia1.src0_subreg_nr = reg.subnr; + + if (insn->header.access_mode == BRW_ALIGN_1) { + insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset; + } + else { + insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset; + } + } + + if (insn->header.access_mode == BRW_ALIGN_1) { + if (reg.width == BRW_WIDTH_1 && + insn->header.execution_size == BRW_EXECUTE_1) { + insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0; + insn->bits2.da1.src0_width = BRW_WIDTH_1; + insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0; + } + else { + insn->bits2.da1.src0_horiz_stride = reg.hstride; + insn->bits2.da1.src0_width = reg.width; + insn->bits2.da1.src0_vert_stride = reg.vstride; + } + } + else { + insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X); + insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y); + insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z); + insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W); + + /* This is an oddity of the fact we're using the same + * descriptions for registers in align_16 as align_1: + */ + if (reg.vstride == BRW_VERTICAL_STRIDE_8) + insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4; + else + insn->bits2.da16.src0_vert_stride = reg.vstride; + } + } +} + + +void brw_set_src1( struct brw_instruction *insn, + struct brw_reg reg ) +{ + assert(reg.file != BRW_MESSAGE_REGISTER_FILE); + + assert(reg.nr < 128); + + insn->bits1.da1.src1_reg_file = reg.file; + insn->bits1.da1.src1_reg_type = reg.type; + insn->bits3.da1.src1_abs = reg.abs; + insn->bits3.da1.src1_negate = reg.negate; + + /* Only src1 can be immediate in two-argument instructions. + */ + assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE); + + if (reg.file == BRW_IMMEDIATE_VALUE) { + insn->bits3.ud = reg.dw1.ud; + } + else { + /* This is a hardware restriction, which may or may not be lifted + * in the future: + */ + assert (reg.address_mode == BRW_ADDRESS_DIRECT); + //assert (reg.file == BRW_GENERAL_REGISTER_FILE); + + if (insn->header.access_mode == BRW_ALIGN_1) { + insn->bits3.da1.src1_subreg_nr = reg.subnr; + insn->bits3.da1.src1_reg_nr = reg.nr; + } + else { + insn->bits3.da16.src1_subreg_nr = reg.subnr / 16; + insn->bits3.da16.src1_reg_nr = reg.nr; + } + + if (insn->header.access_mode == BRW_ALIGN_1) { + if (reg.width == BRW_WIDTH_1 && + insn->header.execution_size == BRW_EXECUTE_1) { + insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0; + insn->bits3.da1.src1_width = BRW_WIDTH_1; + insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0; + } + else { + insn->bits3.da1.src1_horiz_stride = reg.hstride; + insn->bits3.da1.src1_width = reg.width; + insn->bits3.da1.src1_vert_stride = reg.vstride; + } + } + else { + insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X); + insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y); + insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z); + insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W); + + /* This is an oddity of the fact we're using the same + * descriptions for registers in align_16 as align_1: + */ + if (reg.vstride == BRW_VERTICAL_STRIDE_8) + insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4; + else + insn->bits3.da16.src1_vert_stride = reg.vstride; + } + } +} + + + +static void brw_set_math_message( struct brw_context *brw, + struct brw_instruction *insn, + GLuint msg_length, + GLuint response_length, + GLuint function, + GLuint integer_type, + GLboolean low_precision, + GLboolean saturate, + GLuint dataType ) +{ + brw_set_src1(insn, brw_imm_d(0)); + + if (BRW_IS_IGDNG(brw)) { + insn->bits3.math_igdng.function = function; + insn->bits3.math_igdng.int_type = integer_type; + insn->bits3.math_igdng.precision = low_precision; + insn->bits3.math_igdng.saturate = saturate; + insn->bits3.math_igdng.data_type = dataType; + insn->bits3.math_igdng.snapshot = 0; + insn->bits3.math_igdng.header_present = 0; + insn->bits3.math_igdng.response_length = response_length; + insn->bits3.math_igdng.msg_length = msg_length; + insn->bits3.math_igdng.end_of_thread = 0; + insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_MATH; + insn->bits2.send_igdng.end_of_thread = 0; + } else { + insn->bits3.math.function = function; + insn->bits3.math.int_type = integer_type; + insn->bits3.math.precision = low_precision; + insn->bits3.math.saturate = saturate; + insn->bits3.math.data_type = dataType; + insn->bits3.math.response_length = response_length; + insn->bits3.math.msg_length = msg_length; + insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH; + insn->bits3.math.end_of_thread = 0; + } +} + + +static void brw_set_ff_sync_message( struct brw_context *brw, + struct brw_instruction *insn, + GLboolean allocate, + GLboolean used, + GLuint msg_length, + GLuint response_length, + GLboolean end_of_thread, + GLboolean complete, + GLuint offset, + GLuint swizzle_control ) +{ + brw_set_src1(insn, brw_imm_d(0)); + + insn->bits3.urb_igdng.opcode = 1; + insn->bits3.urb_igdng.offset = offset; + insn->bits3.urb_igdng.swizzle_control = swizzle_control; + insn->bits3.urb_igdng.allocate = allocate; + insn->bits3.urb_igdng.used = used; + insn->bits3.urb_igdng.complete = complete; + insn->bits3.urb_igdng.header_present = 1; + insn->bits3.urb_igdng.response_length = response_length; + insn->bits3.urb_igdng.msg_length = msg_length; + insn->bits3.urb_igdng.end_of_thread = end_of_thread; + insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_URB; + insn->bits2.send_igdng.end_of_thread = end_of_thread; +} + +static void brw_set_urb_message( struct brw_context *brw, + struct brw_instruction *insn, + GLboolean allocate, + GLboolean used, + GLuint msg_length, + GLuint response_length, + GLboolean end_of_thread, + GLboolean complete, + GLuint offset, + GLuint swizzle_control ) +{ + brw_set_src1(insn, brw_imm_d(0)); + + if (BRW_IS_IGDNG(brw)) { + insn->bits3.urb_igdng.opcode = 0; /* ? */ + insn->bits3.urb_igdng.offset = offset; + insn->bits3.urb_igdng.swizzle_control = swizzle_control; + insn->bits3.urb_igdng.allocate = allocate; + insn->bits3.urb_igdng.used = used; /* ? */ + insn->bits3.urb_igdng.complete = complete; + insn->bits3.urb_igdng.header_present = 1; + insn->bits3.urb_igdng.response_length = response_length; + insn->bits3.urb_igdng.msg_length = msg_length; + insn->bits3.urb_igdng.end_of_thread = end_of_thread; + insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_URB; + insn->bits2.send_igdng.end_of_thread = end_of_thread; + } else { + insn->bits3.urb.opcode = 0; /* ? */ + insn->bits3.urb.offset = offset; + insn->bits3.urb.swizzle_control = swizzle_control; + insn->bits3.urb.allocate = allocate; + insn->bits3.urb.used = used; /* ? */ + insn->bits3.urb.complete = complete; + insn->bits3.urb.response_length = response_length; + insn->bits3.urb.msg_length = msg_length; + insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB; + insn->bits3.urb.end_of_thread = end_of_thread; + } +} + +static void brw_set_dp_write_message( struct brw_context *brw, + struct brw_instruction *insn, + GLuint binding_table_index, + GLuint msg_control, + GLuint msg_type, + GLuint msg_length, + GLuint pixel_scoreboard_clear, + GLuint response_length, + GLuint end_of_thread ) +{ + brw_set_src1(insn, brw_imm_d(0)); + + if (BRW_IS_IGDNG(brw)) { + insn->bits3.dp_write_igdng.binding_table_index = binding_table_index; + insn->bits3.dp_write_igdng.msg_control = msg_control; + insn->bits3.dp_write_igdng.pixel_scoreboard_clear = pixel_scoreboard_clear; + insn->bits3.dp_write_igdng.msg_type = msg_type; + insn->bits3.dp_write_igdng.send_commit_msg = 0; + insn->bits3.dp_write_igdng.header_present = 1; + insn->bits3.dp_write_igdng.response_length = response_length; + insn->bits3.dp_write_igdng.msg_length = msg_length; + insn->bits3.dp_write_igdng.end_of_thread = end_of_thread; + insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE; + insn->bits2.send_igdng.end_of_thread = end_of_thread; + } else { + insn->bits3.dp_write.binding_table_index = binding_table_index; + insn->bits3.dp_write.msg_control = msg_control; + insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear; + insn->bits3.dp_write.msg_type = msg_type; + insn->bits3.dp_write.send_commit_msg = 0; + insn->bits3.dp_write.response_length = response_length; + insn->bits3.dp_write.msg_length = msg_length; + insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE; + insn->bits3.dp_write.end_of_thread = end_of_thread; + } +} + +static void brw_set_dp_read_message( struct brw_context *brw, + struct brw_instruction *insn, + GLuint binding_table_index, + GLuint msg_control, + GLuint msg_type, + GLuint target_cache, + GLuint msg_length, + GLuint response_length, + GLuint end_of_thread ) +{ + brw_set_src1(insn, brw_imm_d(0)); + + if (BRW_IS_IGDNG(brw)) { + insn->bits3.dp_read_igdng.binding_table_index = binding_table_index; + insn->bits3.dp_read_igdng.msg_control = msg_control; + insn->bits3.dp_read_igdng.msg_type = msg_type; + insn->bits3.dp_read_igdng.target_cache = target_cache; + insn->bits3.dp_read_igdng.header_present = 1; + insn->bits3.dp_read_igdng.response_length = response_length; + insn->bits3.dp_read_igdng.msg_length = msg_length; + insn->bits3.dp_read_igdng.pad1 = 0; + insn->bits3.dp_read_igdng.end_of_thread = end_of_thread; + insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ; + insn->bits2.send_igdng.end_of_thread = end_of_thread; + } else { + insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/ + insn->bits3.dp_read.msg_control = msg_control; /*8:11*/ + insn->bits3.dp_read.msg_type = msg_type; /*12:13*/ + insn->bits3.dp_read.target_cache = target_cache; /*14:15*/ + insn->bits3.dp_read.response_length = response_length; /*16:19*/ + insn->bits3.dp_read.msg_length = msg_length; /*20:23*/ + insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/ + insn->bits3.dp_read.pad1 = 0; /*28:30*/ + insn->bits3.dp_read.end_of_thread = end_of_thread; /*31*/ + } +} + +static void brw_set_sampler_message(struct brw_context *brw, + struct brw_instruction *insn, + GLuint binding_table_index, + GLuint sampler, + GLuint msg_type, + GLuint response_length, + GLuint msg_length, + GLboolean eot, + GLuint header_present, + GLuint simd_mode) +{ + assert(eot == 0); + brw_set_src1(insn, brw_imm_d(0)); + + if (BRW_IS_IGDNG(brw)) { + insn->bits3.sampler_igdng.binding_table_index = binding_table_index; + insn->bits3.sampler_igdng.sampler = sampler; + insn->bits3.sampler_igdng.msg_type = msg_type; + insn->bits3.sampler_igdng.simd_mode = simd_mode; + insn->bits3.sampler_igdng.header_present = header_present; + insn->bits3.sampler_igdng.response_length = response_length; + insn->bits3.sampler_igdng.msg_length = msg_length; + insn->bits3.sampler_igdng.end_of_thread = eot; + insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_SAMPLER; + insn->bits2.send_igdng.end_of_thread = eot; + } else if (BRW_IS_G4X(brw)) { + insn->bits3.sampler_g4x.binding_table_index = binding_table_index; + insn->bits3.sampler_g4x.sampler = sampler; + insn->bits3.sampler_g4x.msg_type = msg_type; + insn->bits3.sampler_g4x.response_length = response_length; + insn->bits3.sampler_g4x.msg_length = msg_length; + insn->bits3.sampler_g4x.end_of_thread = eot; + insn->bits3.sampler_g4x.msg_target = BRW_MESSAGE_TARGET_SAMPLER; + } else { + insn->bits3.sampler.binding_table_index = binding_table_index; + insn->bits3.sampler.sampler = sampler; + insn->bits3.sampler.msg_type = msg_type; + insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32; + insn->bits3.sampler.response_length = response_length; + insn->bits3.sampler.msg_length = msg_length; + insn->bits3.sampler.end_of_thread = eot; + insn->bits3.sampler.msg_target = BRW_MESSAGE_TARGET_SAMPLER; + } +} + + + +static struct brw_instruction *next_insn( struct brw_compile *p, + GLuint opcode ) +{ + struct brw_instruction *insn; + + assert(p->nr_insn + 1 < BRW_EU_MAX_INSN); + + insn = &p->store[p->nr_insn++]; + memcpy(insn, p->current, sizeof(*insn)); + + /* Reset this one-shot flag: + */ + + if (p->current->header.destreg__conditionalmod) { + p->current->header.destreg__conditionalmod = 0; + p->current->header.predicate_control = BRW_PREDICATE_NORMAL; + } + + insn->header.opcode = opcode; + return insn; +} + + +static struct brw_instruction *brw_alu1( struct brw_compile *p, + GLuint opcode, + struct brw_reg dest, + struct brw_reg src ) +{ + struct brw_instruction *insn = next_insn(p, opcode); + brw_set_dest(insn, dest); + brw_set_src0(insn, src); + return insn; +} + +static struct brw_instruction *brw_alu2(struct brw_compile *p, + GLuint opcode, + struct brw_reg dest, + struct brw_reg src0, + struct brw_reg src1 ) +{ + struct brw_instruction *insn = next_insn(p, opcode); + brw_set_dest(insn, dest); + brw_set_src0(insn, src0); + brw_set_src1(insn, src1); + return insn; +} + + +/*********************************************************************** + * Convenience routines. + */ +#define ALU1(OP) \ +struct brw_instruction *brw_##OP(struct brw_compile *p, \ + struct brw_reg dest, \ + struct brw_reg src0) \ +{ \ + return brw_alu1(p, BRW_OPCODE_##OP, dest, src0); \ +} + +#define ALU2(OP) \ +struct brw_instruction *brw_##OP(struct brw_compile *p, \ + struct brw_reg dest, \ + struct brw_reg src0, \ + struct brw_reg src1) \ +{ \ + return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \ +} + + +ALU1(MOV) +ALU2(SEL) +ALU1(NOT) +ALU2(AND) +ALU2(OR) +ALU2(XOR) +ALU2(SHR) +ALU2(SHL) +ALU2(RSR) +ALU2(RSL) +ALU2(ASR) +ALU2(ADD) +ALU2(MUL) +ALU1(FRC) +ALU1(RNDD) +ALU1(RNDZ) +ALU2(MAC) +ALU2(MACH) +ALU1(LZD) +ALU2(DP4) +ALU2(DPH) +ALU2(DP3) +ALU2(DP2) +ALU2(LINE) + + + + +void brw_NOP(struct brw_compile *p) +{ + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP); + brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD)); + brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD)); + brw_set_src1(insn, brw_imm_ud(0x0)); +} + + + + + +/*********************************************************************** + * Comparisons, if/else/endif + */ + +struct brw_instruction *brw_JMPI(struct brw_compile *p, + struct brw_reg dest, + struct brw_reg src0, + struct brw_reg src1) +{ + struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1); + + insn->header.execution_size = 1; + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.mask_control = BRW_MASK_DISABLE; + + p->current->header.predicate_control = BRW_PREDICATE_NONE; + + return insn; +} + +/* EU takes the value from the flag register and pushes it onto some + * sort of a stack (presumably merging with any flag value already on + * the stack). Within an if block, the flags at the top of the stack + * control execution on each channel of the unit, eg. on each of the + * 16 pixel values in our wm programs. + * + * When the matching 'else' instruction is reached (presumably by + * countdown of the instruction count patched in by our ELSE/ENDIF + * functions), the relevent flags are inverted. + * + * When the matching 'endif' instruction is reached, the flags are + * popped off. If the stack is now empty, normal execution resumes. + * + * No attempt is made to deal with stack overflow (14 elements?). + */ +struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size) +{ + struct brw_instruction *insn; + + if (p->single_program_flow) { + assert(execute_size == BRW_EXECUTE_1); + + insn = next_insn(p, BRW_OPCODE_ADD); + insn->header.predicate_inverse = 1; + } else { + insn = next_insn(p, BRW_OPCODE_IF); + } + + /* Override the defaults for this instruction: + */ + brw_set_dest(insn, brw_ip_reg()); + brw_set_src0(insn, brw_ip_reg()); + brw_set_src1(insn, brw_imm_d(0x0)); + + insn->header.execution_size = execute_size; + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.predicate_control = BRW_PREDICATE_NORMAL; + insn->header.mask_control = BRW_MASK_ENABLE; + if (!p->single_program_flow) + insn->header.thread_control = BRW_THREAD_SWITCH; + + p->current->header.predicate_control = BRW_PREDICATE_NONE; + + return insn; +} + + +struct brw_instruction *brw_ELSE(struct brw_compile *p, + struct brw_instruction *if_insn) +{ + struct brw_instruction *insn; + GLuint br = 1; + + if (BRW_IS_IGDNG(p->brw)) + br = 2; + + if (p->single_program_flow) { + insn = next_insn(p, BRW_OPCODE_ADD); + } else { + insn = next_insn(p, BRW_OPCODE_ELSE); + } + + brw_set_dest(insn, brw_ip_reg()); + brw_set_src0(insn, brw_ip_reg()); + brw_set_src1(insn, brw_imm_d(0x0)); + + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.execution_size = if_insn->header.execution_size; + insn->header.mask_control = BRW_MASK_ENABLE; + if (!p->single_program_flow) + insn->header.thread_control = BRW_THREAD_SWITCH; + + /* Patch the if instruction to point at this instruction. + */ + if (p->single_program_flow) { + assert(if_insn->header.opcode == BRW_OPCODE_ADD); + + if_insn->bits3.ud = (insn - if_insn + 1) * 16; + } else { + assert(if_insn->header.opcode == BRW_OPCODE_IF); + + if_insn->bits3.if_else.jump_count = br * (insn - if_insn); + if_insn->bits3.if_else.pop_count = 0; + if_insn->bits3.if_else.pad0 = 0; + } + + return insn; +} + +void brw_ENDIF(struct brw_compile *p, + struct brw_instruction *patch_insn) +{ + GLuint br = 1; + + if (BRW_IS_IGDNG(p->brw)) + br = 2; + + if (p->single_program_flow) { + /* In single program flow mode, there's no need to execute an ENDIF, + * since we don't need to do any stack operations, and if we're executing + * currently, we want to just continue executing. + */ + struct brw_instruction *next = &p->store[p->nr_insn]; + + assert(patch_insn->header.opcode == BRW_OPCODE_ADD); + + patch_insn->bits3.ud = (next - patch_insn) * 16; + } else { + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_ENDIF); + + brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD)); + brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD)); + brw_set_src1(insn, brw_imm_d(0x0)); + + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.execution_size = patch_insn->header.execution_size; + insn->header.mask_control = BRW_MASK_ENABLE; + insn->header.thread_control = BRW_THREAD_SWITCH; + + assert(patch_insn->bits3.if_else.jump_count == 0); + + /* Patch the if or else instructions to point at this or the next + * instruction respectively. + */ + if (patch_insn->header.opcode == BRW_OPCODE_IF) { + /* Automagically turn it into an IFF: + */ + patch_insn->header.opcode = BRW_OPCODE_IFF; + patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1); + patch_insn->bits3.if_else.pop_count = 0; + patch_insn->bits3.if_else.pad0 = 0; + } else if (patch_insn->header.opcode == BRW_OPCODE_ELSE) { + patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1); + patch_insn->bits3.if_else.pop_count = 1; + patch_insn->bits3.if_else.pad0 = 0; + } else { + assert(0); + } + + /* Also pop item off the stack in the endif instruction: + */ + insn->bits3.if_else.jump_count = 0; + insn->bits3.if_else.pop_count = 1; + insn->bits3.if_else.pad0 = 0; + } +} + +struct brw_instruction *brw_BREAK(struct brw_compile *p) +{ + struct brw_instruction *insn; + insn = next_insn(p, BRW_OPCODE_BREAK); + brw_set_dest(insn, brw_ip_reg()); + brw_set_src0(insn, brw_ip_reg()); + brw_set_src1(insn, brw_imm_d(0x0)); + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.execution_size = BRW_EXECUTE_8; + /* insn->header.mask_control = BRW_MASK_DISABLE; */ + insn->bits3.if_else.pad0 = 0; + return insn; +} + +struct brw_instruction *brw_CONT(struct brw_compile *p) +{ + struct brw_instruction *insn; + insn = next_insn(p, BRW_OPCODE_CONTINUE); + brw_set_dest(insn, brw_ip_reg()); + brw_set_src0(insn, brw_ip_reg()); + brw_set_src1(insn, brw_imm_d(0x0)); + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.execution_size = BRW_EXECUTE_8; + /* insn->header.mask_control = BRW_MASK_DISABLE; */ + insn->bits3.if_else.pad0 = 0; + return insn; +} + +/* DO/WHILE loop: + */ +struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size) +{ + if (p->single_program_flow) { + return &p->store[p->nr_insn]; + } else { + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO); + + /* Override the defaults for this instruction: + */ + brw_set_dest(insn, brw_null_reg()); + brw_set_src0(insn, brw_null_reg()); + brw_set_src1(insn, brw_null_reg()); + + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.execution_size = execute_size; + insn->header.predicate_control = BRW_PREDICATE_NONE; + /* insn->header.mask_control = BRW_MASK_ENABLE; */ + /* insn->header.mask_control = BRW_MASK_DISABLE; */ + + return insn; + } +} + + + +struct brw_instruction *brw_WHILE(struct brw_compile *p, + struct brw_instruction *do_insn) +{ + struct brw_instruction *insn; + GLuint br = 1; + + if (BRW_IS_IGDNG(p->brw)) + br = 2; + + if (p->single_program_flow) + insn = next_insn(p, BRW_OPCODE_ADD); + else + insn = next_insn(p, BRW_OPCODE_WHILE); + + brw_set_dest(insn, brw_ip_reg()); + brw_set_src0(insn, brw_ip_reg()); + brw_set_src1(insn, brw_imm_d(0x0)); + + insn->header.compression_control = BRW_COMPRESSION_NONE; + + if (p->single_program_flow) { + insn->header.execution_size = BRW_EXECUTE_1; + + insn->bits3.d = (do_insn - insn) * 16; + } else { + insn->header.execution_size = do_insn->header.execution_size; + + assert(do_insn->header.opcode == BRW_OPCODE_DO); + insn->bits3.if_else.jump_count = br * (do_insn - insn + 1); + insn->bits3.if_else.pop_count = 0; + insn->bits3.if_else.pad0 = 0; + } + +/* insn->header.mask_control = BRW_MASK_ENABLE; */ + + /* insn->header.mask_control = BRW_MASK_DISABLE; */ + p->current->header.predicate_control = BRW_PREDICATE_NONE; + return insn; +} + + +/* FORWARD JUMPS: + */ +void brw_land_fwd_jump(struct brw_compile *p, + struct brw_instruction *jmp_insn) +{ + struct brw_instruction *landing = &p->store[p->nr_insn]; + GLuint jmpi = 1; + + if (BRW_IS_IGDNG(p->brw)) + jmpi = 2; + + assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI); + assert(jmp_insn->bits1.da1.src1_reg_file = BRW_IMMEDIATE_VALUE); + + jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1); +} + + + +/* To integrate with the above, it makes sense that the comparison + * instruction should populate the flag register. It might be simpler + * just to use the flag reg for most WM tasks? + */ +void brw_CMP(struct brw_compile *p, + struct brw_reg dest, + GLuint conditional, + struct brw_reg src0, + struct brw_reg src1) +{ + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP); + + insn->header.destreg__conditionalmod = conditional; + brw_set_dest(insn, dest); + brw_set_src0(insn, src0); + brw_set_src1(insn, src1); + +/* guess_execution_size(insn, src0); */ + + + /* Make it so that future instructions will use the computed flag + * value until brw_set_predicate_control_flag_value() is called + * again. + */ + if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE && + dest.nr == 0) { + p->current->header.predicate_control = BRW_PREDICATE_NORMAL; + p->flag_value = 0xff; + } +} + + + +/*********************************************************************** + * Helpers for the various SEND message types: + */ + +/** Extended math function, float[8]. + */ +void brw_math( struct brw_compile *p, + struct brw_reg dest, + GLuint function, + GLuint saturate, + GLuint msg_reg_nr, + struct brw_reg src, + GLuint data_type, + GLuint precision ) +{ + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1; + GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1; + + /* Example code doesn't set predicate_control for send + * instructions. + */ + insn->header.predicate_control = 0; + insn->header.destreg__conditionalmod = msg_reg_nr; + + brw_set_dest(insn, dest); + brw_set_src0(insn, src); + brw_set_math_message(p->brw, + insn, + msg_length, response_length, + function, + BRW_MATH_INTEGER_UNSIGNED, + precision, + saturate, + data_type); +} + +/** + * Extended math function, float[16]. + * Use 2 send instructions. + */ +void brw_math_16( struct brw_compile *p, + struct brw_reg dest, + GLuint function, + GLuint saturate, + GLuint msg_reg_nr, + struct brw_reg src, + GLuint precision ) +{ + struct brw_instruction *insn; + GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1; + GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1; + + /* First instruction: + */ + brw_push_insn_state(p); + brw_set_predicate_control_flag_value(p, 0xff); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + + insn = next_insn(p, BRW_OPCODE_SEND); + insn->header.destreg__conditionalmod = msg_reg_nr; + + brw_set_dest(insn, dest); + brw_set_src0(insn, src); + brw_set_math_message(p->brw, + insn, + msg_length, response_length, + function, + BRW_MATH_INTEGER_UNSIGNED, + precision, + saturate, + BRW_MATH_DATA_VECTOR); + + /* Second instruction: + */ + insn = next_insn(p, BRW_OPCODE_SEND); + insn->header.compression_control = BRW_COMPRESSION_2NDHALF; + insn->header.destreg__conditionalmod = msg_reg_nr+1; + + brw_set_dest(insn, offset(dest,1)); + brw_set_src0(insn, src); + brw_set_math_message(p->brw, + insn, + msg_length, response_length, + function, + BRW_MATH_INTEGER_UNSIGNED, + precision, + saturate, + BRW_MATH_DATA_VECTOR); + + brw_pop_insn_state(p); +} + + +/** + * Write block of 16 dwords/floats to the data port Render Cache scratch buffer. + * Scratch offset should be a multiple of 64. + * Used for register spilling. + */ +void brw_dp_WRITE_16( struct brw_compile *p, + struct brw_reg src, + GLuint scratch_offset ) +{ + GLuint msg_reg_nr = 1; + { + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + + /* set message header global offset field (reg 0, element 2) */ + brw_MOV(p, + retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D), + brw_imm_d(scratch_offset)); + + brw_pop_insn_state(p); + } + + { + GLuint msg_length = 3; + struct brw_reg dest = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW); + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + + insn->header.predicate_control = 0; /* XXX */ + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.destreg__conditionalmod = msg_reg_nr; + + brw_set_dest(insn, dest); + brw_set_src0(insn, src); + + brw_set_dp_write_message(p->brw, + insn, + 255, /* binding table index (255=stateless) */ + BRW_DATAPORT_OWORD_BLOCK_4_OWORDS, /* msg_control */ + BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */ + msg_length, + 0, /* pixel scoreboard */ + 0, /* response_length */ + 0); /* eot */ + } +} + + +/** + * Read block of 16 dwords/floats from the data port Render Cache scratch buffer. + * Scratch offset should be a multiple of 64. + * Used for register spilling. + */ +void brw_dp_READ_16( struct brw_compile *p, + struct brw_reg dest, + GLuint scratch_offset ) +{ + GLuint msg_reg_nr = 1; + { + brw_push_insn_state(p); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_mask_control(p, BRW_MASK_DISABLE); + + /* set message header global offset field (reg 0, element 2) */ + brw_MOV(p, + retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D), + brw_imm_d(scratch_offset)); + + brw_pop_insn_state(p); + } + + { + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + + insn->header.predicate_control = 0; /* XXX */ + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.destreg__conditionalmod = msg_reg_nr; + + brw_set_dest(insn, dest); /* UW? */ + brw_set_src0(insn, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW)); + + brw_set_dp_read_message(p->brw, + insn, + 255, /* binding table index (255=stateless) */ + 3, /* msg_control (3 means 4 Owords) */ + BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */ + 1, /* target cache (render/scratch) */ + 1, /* msg_length */ + 2, /* response_length */ + 0); /* eot */ + } +} + + +/** + * Read a float[4] vector from the data port Data Cache (const buffer). + * Location (in buffer) should be a multiple of 16. + * Used for fetching shader constants. + * If relAddr is true, we'll do an indirect fetch using the address register. + */ +void brw_dp_READ_4( struct brw_compile *p, + struct brw_reg dest, + GLboolean relAddr, + GLuint location, + GLuint bind_table_index ) +{ + /* XXX: relAddr not implemented */ + GLuint msg_reg_nr = 1; + { + struct brw_reg b; + brw_push_insn_state(p); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_mask_control(p, BRW_MASK_DISABLE); + + /* Setup MRF[1] with location/offset into const buffer */ + b = brw_message_reg(msg_reg_nr); + b = retype(b, BRW_REGISTER_TYPE_UD); + /* XXX I think we're setting all the dwords of MRF[1] to 'location'. + * when the docs say only dword[2] should be set. Hmmm. But it works. + */ + brw_MOV(p, b, brw_imm_ud(location)); + brw_pop_insn_state(p); + } + + { + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + + insn->header.predicate_control = BRW_PREDICATE_NONE; + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.destreg__conditionalmod = msg_reg_nr; + insn->header.mask_control = BRW_MASK_DISABLE; + + /* cast dest to a uword[8] vector */ + dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW); + + brw_set_dest(insn, dest); + brw_set_src0(insn, brw_null_reg()); + + brw_set_dp_read_message(p->brw, + insn, + bind_table_index, + 0, /* msg_control (0 means 1 Oword) */ + BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */ + 0, /* source cache = data cache */ + 1, /* msg_length */ + 1, /* response_length (1 Oword) */ + 0); /* eot */ + } +} + + +/** + * Read float[4] constant(s) from VS constant buffer. + * For relative addressing, two float[4] constants will be read into 'dest'. + * Otherwise, one float[4] constant will be read into the lower half of 'dest'. + */ +void brw_dp_READ_4_vs(struct brw_compile *p, + struct brw_reg dest, + GLuint oword, + GLboolean relAddr, + struct brw_reg addrReg, + GLuint location, + GLuint bind_table_index) +{ + GLuint msg_reg_nr = 1; + + assert(oword < 2); + /* + printf("vs const read msg, location %u, msg_reg_nr %d\n", + location, msg_reg_nr); + */ + + /* Setup MRF[1] with location/offset into const buffer */ + { + struct brw_reg b; + + brw_push_insn_state(p); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + /*brw_set_access_mode(p, BRW_ALIGN_16);*/ + + /* XXX I think we're setting all the dwords of MRF[1] to 'location'. + * when the docs say only dword[2] should be set. Hmmm. But it works. + */ + b = brw_message_reg(msg_reg_nr); + b = retype(b, BRW_REGISTER_TYPE_UD); + /*b = get_element_ud(b, 2);*/ + if (relAddr) { + brw_ADD(p, b, addrReg, brw_imm_ud(location)); + } + else { + brw_MOV(p, b, brw_imm_ud(location)); + } + + brw_pop_insn_state(p); + } + + { + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + + insn->header.predicate_control = BRW_PREDICATE_NONE; + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.destreg__conditionalmod = msg_reg_nr; + insn->header.mask_control = BRW_MASK_DISABLE; + /*insn->header.access_mode = BRW_ALIGN_16;*/ + + brw_set_dest(insn, dest); + brw_set_src0(insn, brw_null_reg()); + + brw_set_dp_read_message(p->brw, + insn, + bind_table_index, + oword, /* 0 = lower Oword, 1 = upper Oword */ + BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */ + 0, /* source cache = data cache */ + 1, /* msg_length */ + 1, /* response_length (1 Oword) */ + 0); /* eot */ + } +} + + + +void brw_fb_WRITE(struct brw_compile *p, + struct brw_reg dest, + GLuint msg_reg_nr, + struct brw_reg src0, + GLuint binding_table_index, + GLuint msg_length, + GLuint response_length, + GLboolean eot) +{ + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + + insn->header.predicate_control = 0; /* XXX */ + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.destreg__conditionalmod = msg_reg_nr; + + brw_set_dest(insn, dest); + brw_set_src0(insn, src0); + brw_set_dp_write_message(p->brw, + insn, + binding_table_index, + BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE, /* msg_control */ + BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE, /* msg_type */ + msg_length, + 1, /* pixel scoreboard */ + response_length, + eot); +} + + +/** + * Texture sample instruction. + * Note: the msg_type plus msg_length values determine exactly what kind + * of sampling operation is performed. See volume 4, page 161 of docs. + */ +void brw_SAMPLE(struct brw_compile *p, + struct brw_reg dest, + GLuint msg_reg_nr, + struct brw_reg src0, + GLuint binding_table_index, + GLuint sampler, + GLuint writemask, + GLuint msg_type, + GLuint response_length, + GLuint msg_length, + GLboolean eot, + GLuint header_present, + GLuint simd_mode) +{ + GLboolean need_stall = 0; + + if (writemask == 0) { + /*_mesa_printf("%s: zero writemask??\n", __FUNCTION__); */ + return; + } + + /* Hardware doesn't do destination dependency checking on send + * instructions properly. Add a workaround which generates the + * dependency by other means. In practice it seems like this bug + * only crops up for texture samples, and only where registers are + * written by the send and then written again later without being + * read in between. Luckily for us, we already track that + * information and use it to modify the writemask for the + * instruction, so that is a guide for whether a workaround is + * needed. + */ + if (writemask != WRITEMASK_XYZW) { + GLuint dst_offset = 0; + GLuint i, newmask = 0, len = 0; + + for (i = 0; i < 4; i++) { + if (writemask & (1<<i)) + break; + dst_offset += 2; + } + for (; i < 4; i++) { + if (!(writemask & (1<<i))) + break; + newmask |= 1<<i; + len++; + } + + if (newmask != writemask) { + need_stall = 1; + /* _mesa_printf("need stall %x %x\n", newmask , writemask); */ + } + else { + struct brw_reg m1 = brw_message_reg(msg_reg_nr); + + newmask = ~newmask & WRITEMASK_XYZW; + + brw_push_insn_state(p); + + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_mask_control(p, BRW_MASK_DISABLE); + + brw_MOV(p, m1, brw_vec8_grf(0,0)); + brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12)); + + brw_pop_insn_state(p); + + src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW); + dest = offset(dest, dst_offset); + response_length = len * 2; + } + } + + { + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + + insn->header.predicate_control = 0; /* XXX */ + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.destreg__conditionalmod = msg_reg_nr; + + brw_set_dest(insn, dest); + brw_set_src0(insn, src0); + brw_set_sampler_message(p->brw, insn, + binding_table_index, + sampler, + msg_type, + response_length, + msg_length, + eot, + header_present, + simd_mode); + } + + if (need_stall) { + struct brw_reg reg = vec8(offset(dest, response_length-1)); + + /* mov (8) r9.0<1>:f r9.0<8;8,1>:f { Align1 } + */ + brw_push_insn_state(p); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_MOV(p, reg, reg); + brw_pop_insn_state(p); + } + +} + +/* All these variables are pretty confusing - we might be better off + * using bitmasks and macros for this, in the old style. Or perhaps + * just having the caller instantiate the fields in dword3 itself. + */ +void brw_urb_WRITE(struct brw_compile *p, + struct brw_reg dest, + GLuint msg_reg_nr, + struct brw_reg src0, + GLboolean allocate, + GLboolean used, + GLuint msg_length, + GLuint response_length, + GLboolean eot, + GLboolean writes_complete, + GLuint offset, + GLuint swizzle) +{ + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + + assert(msg_length < BRW_MAX_MRF); + + brw_set_dest(insn, dest); + brw_set_src0(insn, src0); + brw_set_src1(insn, brw_imm_d(0)); + + insn->header.destreg__conditionalmod = msg_reg_nr; + + brw_set_urb_message(p->brw, + insn, + allocate, + used, + msg_length, + response_length, + eot, + writes_complete, + offset, + swizzle); +} + +void brw_ff_sync(struct brw_compile *p, + struct brw_reg dest, + GLuint msg_reg_nr, + struct brw_reg src0, + GLboolean allocate, + GLboolean used, + GLuint msg_length, + GLuint response_length, + GLboolean eot, + GLboolean writes_complete, + GLuint offset, + GLuint swizzle) +{ + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + + assert(msg_length < 16); + + brw_set_dest(insn, dest); + brw_set_src0(insn, src0); + brw_set_src1(insn, brw_imm_d(0)); + + insn->header.destreg__conditionalmod = msg_reg_nr; + + brw_set_ff_sync_message(p->brw, + insn, + allocate, + used, + msg_length, + response_length, + eot, + writes_complete, + offset, + swizzle); +} diff --git a/src/gallium/drivers/i965/brw_eu_util.c b/src/gallium/drivers/i965/brw_eu_util.c new file mode 100644 index 00000000000..5405cf17a4e --- /dev/null +++ b/src/gallium/drivers/i965/brw_eu_util.c @@ -0,0 +1,126 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "brw_context.h" +#include "brw_defines.h" +#include "brw_eu.h" + + +void brw_math_invert( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg src) +{ + brw_math( p, + dst, + BRW_MATH_FUNCTION_INV, + BRW_MATH_SATURATE_NONE, + 0, + src, + BRW_MATH_PRECISION_FULL, + BRW_MATH_DATA_VECTOR ); +} + + + +void brw_copy4(struct brw_compile *p, + struct brw_reg dst, + struct brw_reg src, + GLuint count) +{ + GLuint i; + + dst = vec4(dst); + src = vec4(src); + + for (i = 0; i < count; i++) + { + GLuint delta = i*32; + brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta)); + brw_MOV(p, byte_offset(dst, delta+16), byte_offset(src, delta+16)); + } +} + + +void brw_copy8(struct brw_compile *p, + struct brw_reg dst, + struct brw_reg src, + GLuint count) +{ + GLuint i; + + dst = vec8(dst); + src = vec8(src); + + for (i = 0; i < count; i++) + { + GLuint delta = i*32; + brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta)); + } +} + + +void brw_copy_indirect_to_indirect(struct brw_compile *p, + struct brw_indirect dst_ptr, + struct brw_indirect src_ptr, + GLuint count) +{ + GLuint i; + + for (i = 0; i < count; i++) + { + GLuint delta = i*32; + brw_MOV(p, deref_4f(dst_ptr, delta), deref_4f(src_ptr, delta)); + brw_MOV(p, deref_4f(dst_ptr, delta+16), deref_4f(src_ptr, delta+16)); + } +} + + +void brw_copy_from_indirect(struct brw_compile *p, + struct brw_reg dst, + struct brw_indirect ptr, + GLuint count) +{ + GLuint i; + + dst = vec4(dst); + + for (i = 0; i < count; i++) + { + GLuint delta = i*32; + brw_MOV(p, byte_offset(dst, delta), deref_4f(ptr, delta)); + brw_MOV(p, byte_offset(dst, delta+16), deref_4f(ptr, delta+16)); + } +} + + + + diff --git a/src/gallium/drivers/i965/brw_gs.c b/src/gallium/drivers/i965/brw_gs.c new file mode 100644 index 00000000000..48c2b9a41ce --- /dev/null +++ b/src/gallium/drivers/i965/brw_gs.c @@ -0,0 +1,201 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" + +#include "intel_batchbuffer.h" + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_state.h" +#include "brw_gs.h" + + + +static void compile_gs_prog( struct brw_context *brw, + struct brw_gs_prog_key *key ) +{ + struct brw_gs_compile c; + const GLuint *program; + GLuint program_size; + + memset(&c, 0, sizeof(c)); + + c.key = *key; + c.need_ff_sync = BRW_IS_IGDNG(brw); + /* Need to locate the two positions present in vertex + header. + * These are currently hardcoded: + */ + c.nr_attrs = brw_count_bits(c.key.attrs); + + if (BRW_IS_IGDNG(brw)) + c.nr_regs = (c.nr_attrs + 1) / 2 + 3; /* are vertices packed, or reg-aligned? */ + else + c.nr_regs = (c.nr_attrs + 1) / 2 + 1; /* are vertices packed, or reg-aligned? */ + + c.nr_bytes = c.nr_regs * REG_SIZE; + + + /* Begin the compilation: + */ + brw_init_compile(brw, &c.func); + + c.func.single_program_flow = 1; + + /* For some reason the thread is spawned with only 4 channels + * unmasked. + */ + brw_set_mask_control(&c.func, BRW_MASK_DISABLE); + + + /* Note that primitives which don't require a GS program have + * already been weeded out by this stage: + */ + switch (key->primitive) { + case GL_QUADS: + brw_gs_quads( &c ); + break; + case GL_QUAD_STRIP: + brw_gs_quad_strip( &c ); + break; + case GL_LINE_LOOP: + brw_gs_lines( &c ); + break; + case GL_LINES: + if (key->hint_gs_always) + brw_gs_lines( &c ); + else { + return; + } + break; + case GL_TRIANGLES: + if (key->hint_gs_always) + brw_gs_tris( &c ); + else { + return; + } + break; + case GL_POINTS: + if (key->hint_gs_always) + brw_gs_points( &c ); + else { + return; + } + break; + default: + return; + } + + /* get the program + */ + program = brw_get_program(&c.func, &program_size); + + /* Upload + */ + dri_bo_unreference(brw->gs.prog_bo); + brw->gs.prog_bo = brw_upload_cache( &brw->cache, BRW_GS_PROG, + &c.key, sizeof(c.key), + NULL, 0, + program, program_size, + &c.prog_data, + &brw->gs.prog_data ); +} + +static const GLenum gs_prim[GL_POLYGON+1] = { + GL_POINTS, + GL_LINES, + GL_LINE_LOOP, + GL_LINES, + GL_TRIANGLES, + GL_TRIANGLES, + GL_TRIANGLES, + GL_QUADS, + GL_QUAD_STRIP, + GL_TRIANGLES +}; + +static void populate_key( struct brw_context *brw, + struct brw_gs_prog_key *key ) +{ + memset(key, 0, sizeof(*key)); + + /* CACHE_NEW_VS_PROG */ + key->attrs = brw->vs.prog_data->outputs_written; + + /* BRW_NEW_PRIMITIVE */ + key->primitive = gs_prim[brw->primitive]; + + key->hint_gs_always = 0; /* debug code? */ + + key->need_gs_prog = (key->hint_gs_always || + brw->primitive == GL_QUADS || + brw->primitive == GL_QUAD_STRIP || + brw->primitive == GL_LINE_LOOP); +} + +/* Calculate interpolants for triangle and line rasterization. + */ +static void prepare_gs_prog(struct brw_context *brw) +{ + struct brw_gs_prog_key key; + /* Populate the key: + */ + populate_key(brw, &key); + + if (brw->gs.prog_active != key.need_gs_prog) { + brw->state.dirty.cache |= CACHE_NEW_GS_PROG; + brw->gs.prog_active = key.need_gs_prog; + } + + if (brw->gs.prog_active) { + dri_bo_unreference(brw->gs.prog_bo); + brw->gs.prog_bo = brw_search_cache(&brw->cache, BRW_GS_PROG, + &key, sizeof(key), + NULL, 0, + &brw->gs.prog_data); + if (brw->gs.prog_bo == NULL) + compile_gs_prog( brw, &key ); + } +} + + +const struct brw_tracked_state brw_gs_prog = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_PRIMITIVE, + .cache = CACHE_NEW_VS_PROG + }, + .prepare = prepare_gs_prog +}; diff --git a/src/gallium/drivers/i965/brw_gs.h b/src/gallium/drivers/i965/brw_gs.h new file mode 100644 index 00000000000..bbb991ea2e5 --- /dev/null +++ b/src/gallium/drivers/i965/brw_gs.h @@ -0,0 +1,76 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#ifndef BRW_GS_H +#define BRW_GS_H + + +#include "brw_context.h" +#include "brw_eu.h" + +#define MAX_GS_VERTS (4) + +struct brw_gs_prog_key { + GLuint attrs:32; + GLuint primitive:4; + GLuint hint_gs_always:1; + GLuint need_gs_prog:1; + GLuint pad:26; +}; + +struct brw_gs_compile { + struct brw_compile func; + struct brw_gs_prog_key key; + struct brw_gs_prog_data prog_data; + + struct { + struct brw_reg R0; + struct brw_reg vertex[MAX_GS_VERTS]; + } reg; + + /* 3 different ways of expressing vertex size: + */ + GLuint nr_attrs; + GLuint nr_regs; + GLuint nr_bytes; + GLboolean need_ff_sync; +}; + +#define ATTR_SIZE (4*4) + +void brw_gs_quads( struct brw_gs_compile *c ); +void brw_gs_quad_strip( struct brw_gs_compile *c ); +void brw_gs_tris( struct brw_gs_compile *c ); +void brw_gs_lines( struct brw_gs_compile *c ); +void brw_gs_points( struct brw_gs_compile *c ); + +#endif diff --git a/src/gallium/drivers/i965/brw_gs_emit.c b/src/gallium/drivers/i965/brw_gs_emit.c new file mode 100644 index 00000000000..a9b2aa2eace --- /dev/null +++ b/src/gallium/drivers/i965/brw_gs_emit.c @@ -0,0 +1,186 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" + +#include "shader/program.h" +#include "intel_batchbuffer.h" + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_gs.h" + +static void brw_gs_alloc_regs( struct brw_gs_compile *c, + GLuint nr_verts ) +{ + GLuint i = 0,j; + + /* Register usage is static, precompute here: + */ + c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++; + + /* Payload vertices plus space for more generated vertices: + */ + for (j = 0; j < nr_verts; j++) { + c->reg.vertex[j] = brw_vec4_grf(i, 0); + i += c->nr_regs; + } + + c->prog_data.urb_read_length = c->nr_regs; + c->prog_data.total_grf = i; +} + + +static void brw_gs_emit_vue(struct brw_gs_compile *c, + struct brw_reg vert, + GLboolean last, + GLuint header) +{ + struct brw_compile *p = &c->func; + GLboolean allocate = !last; + + /* Overwrite PrimType and PrimStart in the message header, for + * each vertex in turn: + */ + brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header)); + + /* Copy the vertex from vertn into m1..mN+1: + */ + brw_copy8(p, brw_message_reg(1), vert, c->nr_regs); + + /* Send each vertex as a seperate write to the urb. This is + * different to the concept in brw_sf_emit.c, where subsequent + * writes are used to build up a single urb entry. Each of these + * writes instantiates a seperate urb entry, and a new one must be + * allocated each time. + */ + brw_urb_WRITE(p, + allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), + 0, + c->reg.R0, + allocate, + 1, /* used */ + c->nr_regs + 1, /* msg length */ + allocate ? 1 : 0, /* response length */ + allocate ? 0 : 1, /* eot */ + 1, /* writes_complete */ + 0, /* urb offset */ + BRW_URB_SWIZZLE_NONE); +} + +static void brw_gs_ff_sync(struct brw_gs_compile *c, int num_prim) +{ + struct brw_compile *p = &c->func; + brw_MOV(p, get_element_ud(c->reg.R0, 1), brw_imm_ud(num_prim)); + brw_ff_sync(p, + c->reg.R0, + 0, + c->reg.R0, + 1, + 1, /* used */ + 1, /* msg length */ + 1, /* response length */ + 0, /* eot */ + 1, /* write compelete */ + 0, /* urb offset */ + BRW_URB_SWIZZLE_NONE); +} + + +void brw_gs_quads( struct brw_gs_compile *c ) +{ + brw_gs_alloc_regs(c, 4); + + /* Use polygons for correct edgeflag behaviour. Note that vertex 3 + * is the PV for quads, but vertex 0 for polygons: + */ + if (c->need_ff_sync) + brw_gs_ff_sync(c, 1); + brw_gs_emit_vue(c, c->reg.vertex[3], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START)); + brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2)); + brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_POLYGON << 2)); + brw_gs_emit_vue(c, c->reg.vertex[2], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END)); +} + +void brw_gs_quad_strip( struct brw_gs_compile *c ) +{ + brw_gs_alloc_regs(c, 4); + + if (c->need_ff_sync) + brw_gs_ff_sync(c, 1); + brw_gs_emit_vue(c, c->reg.vertex[2], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START)); + brw_gs_emit_vue(c, c->reg.vertex[3], 0, (_3DPRIM_POLYGON << 2)); + brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2)); + brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END)); +} + +void brw_gs_tris( struct brw_gs_compile *c ) +{ + brw_gs_alloc_regs(c, 3); + + if (c->need_ff_sync) + brw_gs_ff_sync(c, 1); + brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_TRILIST << 2) | R02_PRIM_START)); + brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_TRILIST << 2)); + brw_gs_emit_vue(c, c->reg.vertex[2], 1, ((_3DPRIM_TRILIST << 2) | R02_PRIM_END)); +} + +void brw_gs_lines( struct brw_gs_compile *c ) +{ + brw_gs_alloc_regs(c, 2); + + if (c->need_ff_sync) + brw_gs_ff_sync(c, 1); + brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_START)); + brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_END)); +} + +void brw_gs_points( struct brw_gs_compile *c ) +{ + brw_gs_alloc_regs(c, 1); + + if (c->need_ff_sync) + brw_gs_ff_sync(c, 1); + brw_gs_emit_vue(c, c->reg.vertex[0], 1, ((_3DPRIM_POINTLIST << 2) | R02_PRIM_START | R02_PRIM_END)); +} + + + + + + + + diff --git a/src/gallium/drivers/i965/brw_gs_state.c b/src/gallium/drivers/i965/brw_gs_state.c new file mode 100644 index 00000000000..ed9d2ffe605 --- /dev/null +++ b/src/gallium/drivers/i965/brw_gs_state.c @@ -0,0 +1,149 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "main/macros.h" + +struct brw_gs_unit_key { + unsigned int total_grf; + unsigned int urb_entry_read_length; + + unsigned int curbe_offset; + + unsigned int nr_urb_entries, urb_size; + GLboolean prog_active; +}; + +static void +gs_unit_populate_key(struct brw_context *brw, struct brw_gs_unit_key *key) +{ + memset(key, 0, sizeof(*key)); + + /* CACHE_NEW_GS_PROG */ + key->prog_active = brw->gs.prog_active; + if (key->prog_active) { + key->total_grf = brw->gs.prog_data->total_grf; + key->urb_entry_read_length = brw->gs.prog_data->urb_read_length; + } else { + key->total_grf = 1; + key->urb_entry_read_length = 1; + } + + /* BRW_NEW_CURBE_OFFSETS */ + key->curbe_offset = brw->curbe.clip_start; + + /* BRW_NEW_URB_FENCE */ + key->nr_urb_entries = brw->urb.nr_gs_entries; + key->urb_size = brw->urb.vsize; +} + +static dri_bo * +gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key) +{ + struct brw_gs_unit_state gs; + dri_bo *bo; + + memset(&gs, 0, sizeof(gs)); + + gs.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1; + if (key->prog_active) /* reloc */ + gs.thread0.kernel_start_pointer = brw->gs.prog_bo->offset >> 6; + + gs.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754; + gs.thread1.single_program_flow = 1; + + gs.thread3.dispatch_grf_start_reg = 1; + gs.thread3.const_urb_entry_read_offset = 0; + gs.thread3.const_urb_entry_read_length = 0; + gs.thread3.urb_entry_read_offset = 0; + gs.thread3.urb_entry_read_length = key->urb_entry_read_length; + + gs.thread4.nr_urb_entries = key->nr_urb_entries; + gs.thread4.urb_entry_allocation_size = key->urb_size - 1; + + if (key->nr_urb_entries >= 8) + gs.thread4.max_threads = 1; + else + gs.thread4.max_threads = 0; + + if (BRW_IS_IGDNG(brw)) + gs.thread4.rendering_enable = 1; + + if (INTEL_DEBUG & DEBUG_STATS) + gs.thread4.stats_enable = 1; + + bo = brw_upload_cache(&brw->cache, BRW_GS_UNIT, + key, sizeof(*key), + &brw->gs.prog_bo, 1, + &gs, sizeof(gs), + NULL, NULL); + + if (key->prog_active) { + /* Emit GS program relocation */ + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + gs.thread0.grf_reg_count << 1, + offsetof(struct brw_gs_unit_state, thread0), + brw->gs.prog_bo); + } + + return bo; +} + +static void prepare_gs_unit(struct brw_context *brw) +{ + struct brw_gs_unit_key key; + + gs_unit_populate_key(brw, &key); + + dri_bo_unreference(brw->gs.state_bo); + brw->gs.state_bo = brw_search_cache(&brw->cache, BRW_GS_UNIT, + &key, sizeof(key), + &brw->gs.prog_bo, 1, + NULL); + if (brw->gs.state_bo == NULL) { + brw->gs.state_bo = gs_unit_create_from_key(brw, &key); + } +} + +const struct brw_tracked_state brw_gs_unit = { + .dirty = { + .mesa = 0, + .brw = (BRW_NEW_CURBE_OFFSETS | + BRW_NEW_URB_FENCE), + .cache = CACHE_NEW_GS_PROG + }, + .prepare = prepare_gs_unit, +}; diff --git a/src/gallium/drivers/i965/brw_misc_state.c b/src/gallium/drivers/i965/brw_misc_state.c new file mode 100644 index 00000000000..ea718575484 --- /dev/null +++ b/src/gallium/drivers/i965/brw_misc_state.c @@ -0,0 +1,545 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + + +#include "intel_batchbuffer.h" +#include "intel_regions.h" + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" + + + + + +/*********************************************************************** + * Blend color + */ + +static void upload_blend_constant_color(struct brw_context *brw) +{ + GLcontext *ctx = &brw->intel.ctx; + struct brw_blend_constant_color bcc; + + memset(&bcc, 0, sizeof(bcc)); + bcc.header.opcode = CMD_BLEND_CONSTANT_COLOR; + bcc.header.length = sizeof(bcc)/4-2; + bcc.blend_constant_color[0] = ctx->Color.BlendColor[0]; + bcc.blend_constant_color[1] = ctx->Color.BlendColor[1]; + bcc.blend_constant_color[2] = ctx->Color.BlendColor[2]; + bcc.blend_constant_color[3] = ctx->Color.BlendColor[3]; + + BRW_CACHED_BATCH_STRUCT(brw, &bcc); +} + + +const struct brw_tracked_state brw_blend_constant_color = { + .dirty = { + .mesa = _NEW_COLOR, + .brw = 0, + .cache = 0 + }, + .emit = upload_blend_constant_color +}; + +/* Constant single cliprect for framebuffer object or DRI2 drawing */ +static void upload_drawing_rect(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + GLcontext *ctx = &intel->ctx; + + if (!intel->constant_cliprect) + return; + + BEGIN_BATCH(4, NO_LOOP_CLIPRECTS); + OUT_BATCH(_3DSTATE_DRAWRECT_INFO_I965); + OUT_BATCH(0); /* xmin, ymin */ + OUT_BATCH(((ctx->DrawBuffer->Width - 1) & 0xffff) | + ((ctx->DrawBuffer->Height - 1) << 16)); + OUT_BATCH(0); + ADVANCE_BATCH(); +} + +const struct brw_tracked_state brw_drawing_rect = { + .dirty = { + .mesa = _NEW_BUFFERS, + .brw = 0, + .cache = 0 + }, + .emit = upload_drawing_rect +}; + +static void prepare_binding_table_pointers(struct brw_context *brw) +{ + brw_add_validated_bo(brw, brw->vs.bind_bo); + brw_add_validated_bo(brw, brw->wm.bind_bo); +} + +/** + * Upload the binding table pointers, which point each stage's array of surface + * state pointers. + * + * The binding table pointers are relative to the surface state base address, + * which is 0. + */ +static void upload_binding_table_pointers(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + + BEGIN_BATCH(6, IGNORE_CLIPRECTS); + OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2)); + if (brw->vs.bind_bo != NULL) + OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* vs */ + else + OUT_BATCH(0); + OUT_BATCH(0); /* gs */ + OUT_BATCH(0); /* clip */ + OUT_BATCH(0); /* sf */ + OUT_RELOC(brw->wm.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* wm/ps */ + ADVANCE_BATCH(); +} + +const struct brw_tracked_state brw_binding_table_pointers = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_BATCH, + .cache = CACHE_NEW_SURF_BIND, + }, + .prepare = prepare_binding_table_pointers, + .emit = upload_binding_table_pointers, +}; + + +/** + * Upload pointers to the per-stage state. + * + * The state pointers in this packet are all relative to the general state + * base address set by CMD_STATE_BASE_ADDRESS, which is 0. + */ +static void upload_pipelined_state_pointers(struct brw_context *brw ) +{ + struct intel_context *intel = &brw->intel; + + BEGIN_BATCH(7, IGNORE_CLIPRECTS); + OUT_BATCH(CMD_PIPELINED_STATE_POINTERS << 16 | (7 - 2)); + OUT_RELOC(brw->vs.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); + if (brw->gs.prog_active) + OUT_RELOC(brw->gs.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1); + else + OUT_BATCH(0); + OUT_RELOC(brw->clip.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1); + OUT_RELOC(brw->sf.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); + OUT_RELOC(brw->wm.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); + OUT_RELOC(brw->cc.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); + ADVANCE_BATCH(); + + brw->state.dirty.brw |= BRW_NEW_PSP; +} + + +static void prepare_psp_urb_cbs(struct brw_context *brw) +{ + brw_add_validated_bo(brw, brw->vs.state_bo); + brw_add_validated_bo(brw, brw->gs.state_bo); + brw_add_validated_bo(brw, brw->clip.state_bo); + brw_add_validated_bo(brw, brw->sf.state_bo); + brw_add_validated_bo(brw, brw->wm.state_bo); + brw_add_validated_bo(brw, brw->cc.state_bo); +} + +static void upload_psp_urb_cbs(struct brw_context *brw ) +{ + upload_pipelined_state_pointers(brw); + brw_upload_urb_fence(brw); + brw_upload_cs_urb_state(brw); +} + +const struct brw_tracked_state brw_psp_urb_cbs = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_URB_FENCE | BRW_NEW_BATCH, + .cache = (CACHE_NEW_VS_UNIT | + CACHE_NEW_GS_UNIT | + CACHE_NEW_GS_PROG | + CACHE_NEW_CLIP_UNIT | + CACHE_NEW_SF_UNIT | + CACHE_NEW_WM_UNIT | + CACHE_NEW_CC_UNIT) + }, + .prepare = prepare_psp_urb_cbs, + .emit = upload_psp_urb_cbs, +}; + +static void prepare_depthbuffer(struct brw_context *brw) +{ + struct intel_region *region = brw->state.depth_region; + + if (region != NULL) + brw_add_validated_bo(brw, region->buffer); +} + +static void emit_depthbuffer(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + struct intel_region *region = brw->state.depth_region; + unsigned int len = (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? 6 : 5; + + if (region == NULL) { + BEGIN_BATCH(len, IGNORE_CLIPRECTS); + OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2)); + OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT << 18) | + (BRW_SURFACE_NULL << 29)); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + + if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) + OUT_BATCH(0); + + ADVANCE_BATCH(); + } else { + unsigned int format; + + switch (region->cpp) { + case 2: + format = BRW_DEPTHFORMAT_D16_UNORM; + break; + case 4: + if (intel->depth_buffer_is_float) + format = BRW_DEPTHFORMAT_D32_FLOAT; + else + format = BRW_DEPTHFORMAT_D24_UNORM_S8_UINT; + break; + default: + assert(0); + return; + } + + assert(region->tiling != I915_TILING_X); + + BEGIN_BATCH(len, IGNORE_CLIPRECTS); + OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2)); + OUT_BATCH(((region->pitch * region->cpp) - 1) | + (format << 18) | + (BRW_TILEWALK_YMAJOR << 26) | + ((region->tiling != I915_TILING_NONE) << 27) | + (BRW_SURFACE_2D << 29)); + OUT_RELOC(region->buffer, + I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, + 0); + OUT_BATCH((BRW_SURFACE_MIPMAPLAYOUT_BELOW << 1) | + ((region->pitch - 1) << 6) | + ((region->height - 1) << 19)); + OUT_BATCH(0); + + if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) + OUT_BATCH(0); + + ADVANCE_BATCH(); + } +} + +const struct brw_tracked_state brw_depthbuffer = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_DEPTH_BUFFER | BRW_NEW_BATCH, + .cache = 0, + }, + .prepare = prepare_depthbuffer, + .emit = emit_depthbuffer, +}; + + + +/*********************************************************************** + * Polygon stipple packet + */ + +static void upload_polygon_stipple(struct brw_context *brw) +{ + GLcontext *ctx = &brw->intel.ctx; + struct brw_polygon_stipple bps; + GLuint i; + + memset(&bps, 0, sizeof(bps)); + bps.header.opcode = CMD_POLY_STIPPLE_PATTERN; + bps.header.length = sizeof(bps)/4-2; + + /* Polygon stipple is provided in OpenGL order, i.e. bottom + * row first. If we're rendering to a window (i.e. the + * default frame buffer object, 0), then we need to invert + * it to match our pixel layout. But if we're rendering + * to a FBO (i.e. any named frame buffer object), we *don't* + * need to invert - we already match the layout. + */ + if (ctx->DrawBuffer->Name == 0) { + for (i = 0; i < 32; i++) + bps.stipple[i] = ctx->PolygonStipple[31 - i]; /* invert */ + } + else { + for (i = 0; i < 32; i++) + bps.stipple[i] = ctx->PolygonStipple[i]; /* don't invert */ + } + + BRW_CACHED_BATCH_STRUCT(brw, &bps); +} + +const struct brw_tracked_state brw_polygon_stipple = { + .dirty = { + .mesa = _NEW_POLYGONSTIPPLE, + .brw = 0, + .cache = 0 + }, + .emit = upload_polygon_stipple +}; + + +/*********************************************************************** + * Polygon stipple offset packet + */ + +static void upload_polygon_stipple_offset(struct brw_context *brw) +{ + __DRIdrawablePrivate *dPriv = brw->intel.driDrawable; + struct brw_polygon_stipple_offset bpso; + + memset(&bpso, 0, sizeof(bpso)); + bpso.header.opcode = CMD_POLY_STIPPLE_OFFSET; + bpso.header.length = sizeof(bpso)/4-2; + + /* If we're drawing to a system window (ctx->DrawBuffer->Name == 0), + * we have to invert the Y axis in order to match the OpenGL + * pixel coordinate system, and our offset must be matched + * to the window position. If we're drawing to a FBO + * (ctx->DrawBuffer->Name != 0), then our native pixel coordinate + * system works just fine, and there's no window system to + * worry about. + */ + if (brw->intel.ctx.DrawBuffer->Name == 0) { + bpso.bits0.x_offset = (32 - (dPriv->x & 31)) & 31; + bpso.bits0.y_offset = (32 - ((dPriv->y + dPriv->h) & 31)) & 31; + } + else { + bpso.bits0.y_offset = 0; + bpso.bits0.x_offset = 0; + } + + BRW_CACHED_BATCH_STRUCT(brw, &bpso); +} + +#define _NEW_WINDOW_POS 0x40000000 + +const struct brw_tracked_state brw_polygon_stipple_offset = { + .dirty = { + .mesa = _NEW_WINDOW_POS, + .brw = 0, + .cache = 0 + }, + .emit = upload_polygon_stipple_offset +}; + +/********************************************************************** + * AA Line parameters + */ +static void upload_aa_line_parameters(struct brw_context *brw) +{ + struct brw_aa_line_parameters balp; + + if (BRW_IS_965(brw)) + return; + + /* use legacy aa line coverage computation */ + memset(&balp, 0, sizeof(balp)); + balp.header.opcode = CMD_AA_LINE_PARAMETERS; + balp.header.length = sizeof(balp) / 4 - 2; + + BRW_CACHED_BATCH_STRUCT(brw, &balp); +} + +const struct brw_tracked_state brw_aa_line_parameters = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_CONTEXT, + .cache = 0 + }, + .emit = upload_aa_line_parameters +}; + +/*********************************************************************** + * Line stipple packet + */ + +static void upload_line_stipple(struct brw_context *brw) +{ + GLcontext *ctx = &brw->intel.ctx; + struct brw_line_stipple bls; + GLfloat tmp; + GLint tmpi; + + memset(&bls, 0, sizeof(bls)); + bls.header.opcode = CMD_LINE_STIPPLE_PATTERN; + bls.header.length = sizeof(bls)/4 - 2; + + bls.bits0.pattern = ctx->Line.StipplePattern; + bls.bits1.repeat_count = ctx->Line.StippleFactor; + + tmp = 1.0 / (GLfloat) ctx->Line.StippleFactor; + tmpi = tmp * (1<<13); + + + bls.bits1.inverse_repeat_count = tmpi; + + BRW_CACHED_BATCH_STRUCT(brw, &bls); +} + +const struct brw_tracked_state brw_line_stipple = { + .dirty = { + .mesa = _NEW_LINE, + .brw = 0, + .cache = 0 + }, + .emit = upload_line_stipple +}; + + +/*********************************************************************** + * Misc invarient state packets + */ + +static void upload_invarient_state( struct brw_context *brw ) +{ + { + /* 0x61040000 Pipeline Select */ + /* PipelineSelect : 0 */ + struct brw_pipeline_select ps; + + memset(&ps, 0, sizeof(ps)); + ps.header.opcode = CMD_PIPELINE_SELECT(brw); + ps.header.pipeline_select = 0; + BRW_BATCH_STRUCT(brw, &ps); + } + + { + struct brw_global_depth_offset_clamp gdo; + memset(&gdo, 0, sizeof(gdo)); + + /* Disable depth offset clamping. + */ + gdo.header.opcode = CMD_GLOBAL_DEPTH_OFFSET_CLAMP; + gdo.header.length = sizeof(gdo)/4 - 2; + gdo.depth_offset_clamp = 0.0; + + BRW_BATCH_STRUCT(brw, &gdo); + } + + + /* 0x61020000 State Instruction Pointer */ + { + struct brw_system_instruction_pointer sip; + memset(&sip, 0, sizeof(sip)); + + sip.header.opcode = CMD_STATE_INSN_POINTER; + sip.header.length = 0; + sip.bits0.pad = 0; + sip.bits0.system_instruction_pointer = 0; + BRW_BATCH_STRUCT(brw, &sip); + } + + + { + struct brw_vf_statistics vfs; + memset(&vfs, 0, sizeof(vfs)); + + vfs.opcode = CMD_VF_STATISTICS(brw); + if (INTEL_DEBUG & DEBUG_STATS) + vfs.statistics_enable = 1; + + BRW_BATCH_STRUCT(brw, &vfs); + } +} + +const struct brw_tracked_state brw_invarient_state = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_CONTEXT, + .cache = 0 + }, + .emit = upload_invarient_state +}; + +/** + * Define the base addresses which some state is referenced from. + * + * This allows us to avoid having to emit relocations in many places for + * cached state, and instead emit pointers inside of large, mostly-static + * state pools. This comes at the expense of memory, and more expensive cache + * misses. + */ +static void upload_state_base_address( struct brw_context *brw ) +{ + struct intel_context *intel = &brw->intel; + + /* Output the structure (brw_state_base_address) directly to the + * batchbuffer, so we can emit relocations inline. + */ + if (BRW_IS_IGDNG(brw)) { + BEGIN_BATCH(8, IGNORE_CLIPRECTS); + OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (8 - 2)); + OUT_BATCH(1); /* General state base address */ + OUT_BATCH(1); /* Surface state base address */ + OUT_BATCH(1); /* Indirect object base address */ + OUT_BATCH(1); /* Instruction base address */ + OUT_BATCH(1); /* General state upper bound */ + OUT_BATCH(1); /* Indirect object upper bound */ + OUT_BATCH(1); /* Instruction access upper bound */ + ADVANCE_BATCH(); + } else { + BEGIN_BATCH(6, IGNORE_CLIPRECTS); + OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2)); + OUT_BATCH(1); /* General state base address */ + OUT_BATCH(1); /* Surface state base address */ + OUT_BATCH(1); /* Indirect object base address */ + OUT_BATCH(1); /* General state upper bound */ + OUT_BATCH(1); /* Indirect object upper bound */ + ADVANCE_BATCH(); + } +} + +const struct brw_tracked_state brw_state_base_address = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_CONTEXT, + .cache = 0, + }, + .emit = upload_state_base_address +}; diff --git a/src/gallium/drivers/i965/brw_program.c b/src/gallium/drivers/i965/brw_program.c new file mode 100644 index 00000000000..bac69187c19 --- /dev/null +++ b/src/gallium/drivers/i965/brw_program.c @@ -0,0 +1,166 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + +#include "main/imports.h" +#include "main/enums.h" +#include "shader/prog_parameter.h" +#include "shader/program.h" +#include "shader/programopt.h" +#include "tnl/tnl.h" + +#include "brw_context.h" +#include "brw_util.h" +#include "brw_wm.h" + +static void brwBindProgram( GLcontext *ctx, + GLenum target, + struct gl_program *prog ) +{ + struct brw_context *brw = brw_context(ctx); + + switch (target) { + case GL_VERTEX_PROGRAM_ARB: + brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM; + break; + case GL_FRAGMENT_PROGRAM_ARB: + brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM; + break; + } +} + +static struct gl_program *brwNewProgram( GLcontext *ctx, + GLenum target, + GLuint id ) +{ + struct brw_context *brw = brw_context(ctx); + + switch (target) { + case GL_VERTEX_PROGRAM_ARB: { + struct brw_vertex_program *prog = CALLOC_STRUCT(brw_vertex_program); + if (prog) { + prog->id = brw->program_id++; + + return _mesa_init_vertex_program( ctx, &prog->program, + target, id ); + } + else + return NULL; + } + + case GL_FRAGMENT_PROGRAM_ARB: { + struct brw_fragment_program *prog = CALLOC_STRUCT(brw_fragment_program); + if (prog) { + prog->id = brw->program_id++; + + return _mesa_init_fragment_program( ctx, &prog->program, + target, id ); + } + else + return NULL; + } + + default: + return _mesa_new_program(ctx, target, id); + } +} + +static void brwDeleteProgram( GLcontext *ctx, + struct gl_program *prog ) +{ + if (prog->Target == GL_FRAGMENT_PROGRAM_ARB) { + struct gl_fragment_program *fprog = (struct gl_fragment_program *) prog; + struct brw_fragment_program *brw_fprog = brw_fragment_program(fprog); + dri_bo_unreference(brw_fprog->const_buffer); + } + + _mesa_delete_program( ctx, prog ); +} + + +static GLboolean brwIsProgramNative( GLcontext *ctx, + GLenum target, + struct gl_program *prog ) +{ + return GL_TRUE; +} + +static void brwProgramStringNotify( GLcontext *ctx, + GLenum target, + struct gl_program *prog ) +{ + struct brw_context *brw = brw_context(ctx); + + if (target == GL_FRAGMENT_PROGRAM_ARB) { + struct gl_fragment_program *fprog = (struct gl_fragment_program *) prog; + struct brw_fragment_program *newFP = brw_fragment_program(fprog); + const struct brw_fragment_program *curFP = + brw_fragment_program_const(brw->fragment_program); + + if (fprog->FogOption) { + _mesa_append_fog_code(ctx, fprog); + fprog->FogOption = GL_NONE; + } + + if (newFP == curFP) + brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM; + newFP->id = brw->program_id++; + newFP->isGLSL = brw_wm_is_glsl(fprog); + } + else if (target == GL_VERTEX_PROGRAM_ARB) { + struct gl_vertex_program *vprog = (struct gl_vertex_program *) prog; + struct brw_vertex_program *newVP = brw_vertex_program(vprog); + const struct brw_vertex_program *curVP = + brw_vertex_program_const(brw->vertex_program); + + if (newVP == curVP) + brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM; + if (newVP->program.IsPositionInvariant) { + _mesa_insert_mvp_code(ctx, &newVP->program); + } + newVP->id = brw->program_id++; + + /* Also tell tnl about it: + */ + _tnl_program_string(ctx, target, prog); + } +} + +void brwInitFragProgFuncs( struct dd_function_table *functions ) +{ + assert(functions->ProgramStringNotify == _tnl_program_string); + + functions->BindProgram = brwBindProgram; + functions->NewProgram = brwNewProgram; + functions->DeleteProgram = brwDeleteProgram; + functions->IsProgramNative = brwIsProgramNative; + functions->ProgramStringNotify = brwProgramStringNotify; +} + diff --git a/src/gallium/drivers/i965/brw_queryobj.c b/src/gallium/drivers/i965/brw_queryobj.c new file mode 100644 index 00000000000..a195bc32b07 --- /dev/null +++ b/src/gallium/drivers/i965/brw_queryobj.c @@ -0,0 +1,254 @@ +/* + * Copyright © 2008 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <[email protected]> + * + */ + +/** @file support for ARB_query_object + * + * ARB_query_object is implemented by using the PIPE_CONTROL command to stall + * execution on the completion of previous depth tests, and write the + * current PS_DEPTH_COUNT to a buffer object. + * + * We use before and after counts when drawing during a query so that + * we don't pick up other clients' query data in ours. To reduce overhead, + * a single BO is used to record the query data for all active queries at + * once. This also gives us a simple bound on how much batchbuffer space is + * required for handling queries, so that we can be sure that we won't + * have to emit a batchbuffer without getting the ending PS_DEPTH_COUNT. + */ +#include "main/simple_list.h" +#include "main/imports.h" + +#include "brw_context.h" +#include "brw_state.h" +#include "intel_batchbuffer.h" +#include "intel_reg.h" + +/** Waits on the query object's BO and totals the results for this query */ +static void +brw_queryobj_get_results(struct brw_query_object *query) +{ + int i; + uint64_t *results; + + if (query->bo == NULL) + return; + + /* Map and count the pixels from the current query BO */ + dri_bo_map(query->bo, GL_FALSE); + results = query->bo->virtual; + for (i = query->first_index; i <= query->last_index; i++) { + query->Base.Result += results[i * 2 + 1] - results[i * 2]; + } + dri_bo_unmap(query->bo); + + dri_bo_unreference(query->bo); + query->bo = NULL; +} + +static struct gl_query_object * +brw_new_query_object(GLcontext *ctx, GLuint id) +{ + struct brw_query_object *query; + + query = _mesa_calloc(sizeof(struct brw_query_object)); + + query->Base.Id = id; + query->Base.Result = 0; + query->Base.Active = GL_FALSE; + query->Base.Ready = GL_TRUE; + + return &query->Base; +} + +static void +brw_delete_query(GLcontext *ctx, struct gl_query_object *q) +{ + struct brw_query_object *query = (struct brw_query_object *)q; + + dri_bo_unreference(query->bo); + _mesa_free(query); +} + +static void +brw_begin_query(GLcontext *ctx, struct gl_query_object *q) +{ + struct brw_context *brw = brw_context(ctx); + struct intel_context *intel = intel_context(ctx); + struct brw_query_object *query = (struct brw_query_object *)q; + + /* Reset our driver's tracking of query state. */ + dri_bo_unreference(query->bo); + query->bo = NULL; + query->first_index = -1; + query->last_index = -1; + + insert_at_head(&brw->query.active_head, query); + intel->stats_wm++; +} + +/** + * Begin the ARB_occlusion_query query on a query object. + */ +static void +brw_end_query(GLcontext *ctx, struct gl_query_object *q) +{ + struct brw_context *brw = brw_context(ctx); + struct intel_context *intel = intel_context(ctx); + struct brw_query_object *query = (struct brw_query_object *)q; + + /* Flush the batchbuffer in case it has writes to our query BO. + * Have later queries write to a new query BO so that further rendering + * doesn't delay the collection of our results. + */ + if (query->bo) { + brw_emit_query_end(brw); + intel_batchbuffer_flush(intel->batch); + + dri_bo_unreference(brw->query.bo); + brw->query.bo = NULL; + } + + remove_from_list(query); + + intel->stats_wm--; +} + +static void brw_wait_query(GLcontext *ctx, struct gl_query_object *q) +{ + struct brw_query_object *query = (struct brw_query_object *)q; + + brw_queryobj_get_results(query); + query->Base.Ready = GL_TRUE; +} + +static void brw_check_query(GLcontext *ctx, struct gl_query_object *q) +{ + struct brw_query_object *query = (struct brw_query_object *)q; + + if (query->bo == NULL || !drm_intel_bo_busy(query->bo)) { + brw_queryobj_get_results(query); + query->Base.Ready = GL_TRUE; + } +} + +/** Called to set up the query BO and account for its aperture space */ +void +brw_prepare_query_begin(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + + /* Skip if we're not doing any queries. */ + if (is_empty_list(&brw->query.active_head)) + return; + + /* Get a new query BO if we're going to need it. */ + if (brw->query.bo == NULL || + brw->query.index * 2 + 1 >= 4096 / sizeof(uint64_t)) { + dri_bo_unreference(brw->query.bo); + brw->query.bo = NULL; + + brw->query.bo = dri_bo_alloc(intel->bufmgr, "query", 4096, 1); + brw->query.index = 0; + } + + brw_add_validated_bo(brw, brw->query.bo); +} + +/** Called just before primitive drawing to get a beginning PS_DEPTH_COUNT. */ +void +brw_emit_query_begin(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + struct brw_query_object *query; + + /* Skip if we're not doing any queries, or we've emitted the start. */ + if (brw->query.active || is_empty_list(&brw->query.active_head)) + return; + + BEGIN_BATCH(4, IGNORE_CLIPRECTS); + OUT_BATCH(_3DSTATE_PIPE_CONTROL | + PIPE_CONTROL_DEPTH_STALL | + PIPE_CONTROL_WRITE_DEPTH_COUNT); + /* This object could be mapped cacheable, but we don't have an exposed + * mechanism to support that. Since it's going uncached, tell GEM that + * we're writing to it. The usual clflush should be all that's required + * to pick up the results. + */ + OUT_RELOC(brw->query.bo, + I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, + PIPE_CONTROL_GLOBAL_GTT_WRITE | + ((brw->query.index * 2) * sizeof(uint64_t))); + OUT_BATCH(0); + OUT_BATCH(0); + ADVANCE_BATCH(); + + foreach(query, &brw->query.active_head) { + if (query->bo != brw->query.bo) { + if (query->bo != NULL) + brw_queryobj_get_results(query); + dri_bo_reference(brw->query.bo); + query->bo = brw->query.bo; + query->first_index = brw->query.index; + } + query->last_index = brw->query.index; + } + brw->query.active = GL_TRUE; +} + +/** Called at batchbuffer flush to get an ending PS_DEPTH_COUNT */ +void +brw_emit_query_end(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + + if (!brw->query.active) + return; + + BEGIN_BATCH(4, IGNORE_CLIPRECTS); + OUT_BATCH(_3DSTATE_PIPE_CONTROL | + PIPE_CONTROL_DEPTH_STALL | + PIPE_CONTROL_WRITE_DEPTH_COUNT); + OUT_RELOC(brw->query.bo, + I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, + PIPE_CONTROL_GLOBAL_GTT_WRITE | + ((brw->query.index * 2 + 1) * sizeof(uint64_t))); + OUT_BATCH(0); + OUT_BATCH(0); + ADVANCE_BATCH(); + + brw->query.active = GL_FALSE; + brw->query.index++; +} + +void brw_init_queryobj_functions(struct dd_function_table *functions) +{ + functions->NewQueryObject = brw_new_query_object; + functions->DeleteQuery = brw_delete_query; + functions->BeginQuery = brw_begin_query; + functions->EndQuery = brw_end_query; + functions->CheckQuery = brw_check_query; + functions->WaitQuery = brw_wait_query; +} diff --git a/src/gallium/drivers/i965/brw_sf.c b/src/gallium/drivers/i965/brw_sf.c new file mode 100644 index 00000000000..e1c2c7777b5 --- /dev/null +++ b/src/gallium/drivers/i965/brw_sf.c @@ -0,0 +1,200 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" + +#include "intel_batchbuffer.h" + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_sf.h" +#include "brw_state.h" + +static void compile_sf_prog( struct brw_context *brw, + struct brw_sf_prog_key *key ) +{ + GLcontext *ctx = &brw->intel.ctx; + struct brw_sf_compile c; + const GLuint *program; + GLuint program_size; + GLuint i, idx; + + memset(&c, 0, sizeof(c)); + + /* Begin the compilation: + */ + brw_init_compile(brw, &c.func); + + c.key = *key; + c.nr_attrs = brw_count_bits(c.key.attrs); + c.nr_attr_regs = (c.nr_attrs+1)/2; + c.nr_setup_attrs = brw_count_bits(c.key.attrs & DO_SETUP_BITS); + c.nr_setup_regs = (c.nr_setup_attrs+1)/2; + + c.prog_data.urb_read_length = c.nr_attr_regs; + c.prog_data.urb_entry_size = c.nr_setup_regs * 2; + + /* Construct map from attribute number to position in the vertex. + */ + for (i = idx = 0; i < VERT_RESULT_MAX; i++) + if (c.key.attrs & (1<<i)) { + c.attr_to_idx[i] = idx; + c.idx_to_attr[idx] = i; + if (i >= VERT_RESULT_TEX0 && i <= VERT_RESULT_TEX7) { + c.point_attrs[i].CoordReplace = + ctx->Point.CoordReplace[i - VERT_RESULT_TEX0]; + } + else { + c.point_attrs[i].CoordReplace = GL_FALSE; + } + idx++; + } + + /* Which primitive? Or all three? + */ + switch (key->primitive) { + case SF_TRIANGLES: + c.nr_verts = 3; + brw_emit_tri_setup( &c, GL_TRUE ); + break; + case SF_LINES: + c.nr_verts = 2; + brw_emit_line_setup( &c, GL_TRUE ); + break; + case SF_POINTS: + c.nr_verts = 1; + if (key->do_point_sprite) + brw_emit_point_sprite_setup( &c, GL_TRUE ); + else + brw_emit_point_setup( &c, GL_TRUE ); + break; + case SF_UNFILLED_TRIS: + c.nr_verts = 3; + brw_emit_anyprim_setup( &c ); + break; + default: + assert(0); + return; + } + + /* get the program + */ + program = brw_get_program(&c.func, &program_size); + + /* Upload + */ + dri_bo_unreference(brw->sf.prog_bo); + brw->sf.prog_bo = brw_upload_cache( &brw->cache, BRW_SF_PROG, + &c.key, sizeof(c.key), + NULL, 0, + program, program_size, + &c.prog_data, + &brw->sf.prog_data ); +} + +/* Calculate interpolants for triangle and line rasterization. + */ +static void upload_sf_prog(struct brw_context *brw) +{ + GLcontext *ctx = &brw->intel.ctx; + struct brw_sf_prog_key key; + + memset(&key, 0, sizeof(key)); + + /* Populate the key, noting state dependencies: + */ + /* CACHE_NEW_VS_PROG */ + key.attrs = brw->vs.prog_data->outputs_written; + + /* BRW_NEW_REDUCED_PRIMITIVE */ + switch (brw->intel.reduced_primitive) { + case GL_TRIANGLES: + /* NOTE: We just use the edgeflag attribute as an indicator that + * unfilled triangles are active. We don't actually do the + * edgeflag testing here, it is already done in the clip + * program. + */ + if (key.attrs & (1<<VERT_RESULT_EDGE)) + key.primitive = SF_UNFILLED_TRIS; + else + key.primitive = SF_TRIANGLES; + break; + case GL_LINES: + key.primitive = SF_LINES; + break; + case GL_POINTS: + key.primitive = SF_POINTS; + break; + } + + key.do_point_sprite = ctx->Point.PointSprite; + key.SpriteOrigin = ctx->Point.SpriteOrigin; + /* _NEW_LIGHT */ + key.do_flat_shading = (ctx->Light.ShadeModel == GL_FLAT); + key.do_twoside_color = (ctx->Light.Enabled && ctx->Light.Model.TwoSide); + + /* _NEW_HINT */ + key.linear_color = (ctx->Hint.PerspectiveCorrection == GL_FASTEST); + + /* _NEW_POLYGON */ + if (key.do_twoside_color) { + /* If we're rendering to a FBO, we have to invert the polygon + * face orientation, just as we invert the viewport in + * sf_unit_create_from_key(). ctx->DrawBuffer->Name will be + * nonzero if we're rendering to such an FBO. + */ + key.frontface_ccw = (ctx->Polygon.FrontFace == GL_CCW) ^ (ctx->DrawBuffer->Name != 0); + } + + dri_bo_unreference(brw->sf.prog_bo); + brw->sf.prog_bo = brw_search_cache(&brw->cache, BRW_SF_PROG, + &key, sizeof(key), + NULL, 0, + &brw->sf.prog_data); + if (brw->sf.prog_bo == NULL) + compile_sf_prog( brw, &key ); +} + + +const struct brw_tracked_state brw_sf_prog = { + .dirty = { + .mesa = (_NEW_HINT | _NEW_LIGHT | _NEW_POLYGON | _NEW_POINT), + .brw = (BRW_NEW_REDUCED_PRIMITIVE), + .cache = CACHE_NEW_VS_PROG + }, + .prepare = upload_sf_prog +}; + diff --git a/src/gallium/drivers/i965/brw_sf.h b/src/gallium/drivers/i965/brw_sf.h new file mode 100644 index 00000000000..6426b6df9ff --- /dev/null +++ b/src/gallium/drivers/i965/brw_sf.h @@ -0,0 +1,113 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#ifndef BRW_SF_H +#define BRW_SF_H + + +#include "shader/program.h" +#include "brw_context.h" +#include "brw_eu.h" + + +#define SF_POINTS 0 +#define SF_LINES 1 +#define SF_TRIANGLES 2 +#define SF_UNFILLED_TRIS 3 + +struct brw_sf_prog_key { + GLuint attrs:32; + GLuint primitive:2; + GLuint do_twoside_color:1; + GLuint do_flat_shading:1; + GLuint frontface_ccw:1; + GLuint do_point_sprite:1; + GLuint linear_color:1; /**< linear interp vs. perspective interp */ + GLuint pad:25; + GLenum SpriteOrigin; +}; + +struct brw_sf_point_tex { + GLboolean CoordReplace; +}; + +struct brw_sf_compile { + struct brw_compile func; + struct brw_sf_prog_key key; + struct brw_sf_prog_data prog_data; + + struct brw_reg pv; + struct brw_reg det; + struct brw_reg dx0; + struct brw_reg dx2; + struct brw_reg dy0; + struct brw_reg dy2; + + /* z and 1/w passed in seperately: + */ + struct brw_reg z[3]; + struct brw_reg inv_w[3]; + + /* The vertices: + */ + struct brw_reg vert[3]; + + /* Temporaries, allocated after last vertex reg. + */ + struct brw_reg inv_det; + struct brw_reg a1_sub_a0; + struct brw_reg a2_sub_a0; + struct brw_reg tmp; + + struct brw_reg m1Cx; + struct brw_reg m2Cy; + struct brw_reg m3C0; + + GLuint nr_verts; + GLuint nr_attrs; + GLuint nr_attr_regs; + GLuint nr_setup_attrs; + GLuint nr_setup_regs; + + GLubyte attr_to_idx[VERT_RESULT_MAX]; + GLubyte idx_to_attr[VERT_RESULT_MAX]; + struct brw_sf_point_tex point_attrs[VERT_RESULT_MAX]; +}; + + +void brw_emit_tri_setup( struct brw_sf_compile *c, GLboolean allocate ); +void brw_emit_line_setup( struct brw_sf_compile *c, GLboolean allocate ); +void brw_emit_point_setup( struct brw_sf_compile *c, GLboolean allocate ); +void brw_emit_point_sprite_setup( struct brw_sf_compile *c, GLboolean allocate ); +void brw_emit_anyprim_setup( struct brw_sf_compile *c ); + +#endif diff --git a/src/gallium/drivers/i965/brw_sf_emit.c b/src/gallium/drivers/i965/brw_sf_emit.c new file mode 100644 index 00000000000..ca8f97f9f9e --- /dev/null +++ b/src/gallium/drivers/i965/brw_sf_emit.c @@ -0,0 +1,739 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" + +#include "intel_batchbuffer.h" + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_sf.h" + + +static struct brw_reg get_vert_attr(struct brw_sf_compile *c, + struct brw_reg vert, + GLuint attr) +{ + GLuint off = c->attr_to_idx[attr] / 2; + GLuint sub = c->attr_to_idx[attr] % 2; + + return brw_vec4_grf(vert.nr + off, sub * 4); +} + +static GLboolean have_attr(struct brw_sf_compile *c, + GLuint attr) +{ + return (c->key.attrs & (1<<attr)) ? 1 : 0; +} + +/*********************************************************************** + * Twoside lighting + */ +static void copy_bfc( struct brw_sf_compile *c, + struct brw_reg vert ) +{ + struct brw_compile *p = &c->func; + GLuint i; + + for (i = 0; i < 2; i++) { + if (have_attr(c, VERT_RESULT_COL0+i) && + have_attr(c, VERT_RESULT_BFC0+i)) + brw_MOV(p, + get_vert_attr(c, vert, VERT_RESULT_COL0+i), + get_vert_attr(c, vert, VERT_RESULT_BFC0+i)); + } +} + + +static void do_twoside_color( struct brw_sf_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *if_insn; + GLuint backface_conditional = c->key.frontface_ccw ? BRW_CONDITIONAL_G : BRW_CONDITIONAL_L; + + /* Already done in clip program: + */ + if (c->key.primitive == SF_UNFILLED_TRIS) + return; + + /* XXX: What happens if BFC isn't present? This could only happen + * for user-supplied vertex programs, as t_vp_build.c always does + * the right thing. + */ + if (!(have_attr(c, VERT_RESULT_COL0) && have_attr(c, VERT_RESULT_BFC0)) && + !(have_attr(c, VERT_RESULT_COL1) && have_attr(c, VERT_RESULT_BFC1))) + return; + + /* Need to use BRW_EXECUTE_4 and also do an 4-wide compare in order + * to get all channels active inside the IF. In the clipping code + * we run with NoMask, so it's not an option and we can use + * BRW_EXECUTE_1 for all comparisions. + */ + brw_push_insn_state(p); + brw_CMP(p, vec4(brw_null_reg()), backface_conditional, c->det, brw_imm_f(0)); + if_insn = brw_IF(p, BRW_EXECUTE_4); + { + switch (c->nr_verts) { + case 3: copy_bfc(c, c->vert[2]); + case 2: copy_bfc(c, c->vert[1]); + case 1: copy_bfc(c, c->vert[0]); + } + } + brw_ENDIF(p, if_insn); + brw_pop_insn_state(p); +} + + + +/*********************************************************************** + * Flat shading + */ + +#define VERT_RESULT_COLOR_BITS ((1<<VERT_RESULT_COL0) | \ + (1<<VERT_RESULT_COL1)) + +static void copy_colors( struct brw_sf_compile *c, + struct brw_reg dst, + struct brw_reg src) +{ + struct brw_compile *p = &c->func; + GLuint i; + + for (i = VERT_RESULT_COL0; i <= VERT_RESULT_COL1; i++) { + if (have_attr(c,i)) + brw_MOV(p, + get_vert_attr(c, dst, i), + get_vert_attr(c, src, i)); + } +} + + + +/* Need to use a computed jump to copy flatshaded attributes as the + * vertices are ordered according to y-coordinate before reaching this + * point, so the PV could be anywhere. + */ +static void do_flatshade_triangle( struct brw_sf_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_reg ip = brw_ip_reg(); + GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS); + GLuint jmpi = 1; + + if (!nr) + return; + + /* Already done in clip program: + */ + if (c->key.primitive == SF_UNFILLED_TRIS) + return; + + if (BRW_IS_IGDNG(p->brw)) + jmpi = 2; + + brw_push_insn_state(p); + + brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr*2+1))); + brw_JMPI(p, ip, ip, c->pv); + + copy_colors(c, c->vert[1], c->vert[0]); + copy_colors(c, c->vert[2], c->vert[0]); + brw_JMPI(p, ip, ip, brw_imm_d(jmpi*(nr*4+1))); + + copy_colors(c, c->vert[0], c->vert[1]); + copy_colors(c, c->vert[2], c->vert[1]); + brw_JMPI(p, ip, ip, brw_imm_d(jmpi*nr*2)); + + copy_colors(c, c->vert[0], c->vert[2]); + copy_colors(c, c->vert[1], c->vert[2]); + + brw_pop_insn_state(p); +} + + +static void do_flatshade_line( struct brw_sf_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_reg ip = brw_ip_reg(); + GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS); + GLuint jmpi = 1; + + if (!nr) + return; + + /* Already done in clip program: + */ + if (c->key.primitive == SF_UNFILLED_TRIS) + return; + + if (BRW_IS_IGDNG(p->brw)) + jmpi = 2; + + brw_push_insn_state(p); + + brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr+1))); + brw_JMPI(p, ip, ip, c->pv); + copy_colors(c, c->vert[1], c->vert[0]); + + brw_JMPI(p, ip, ip, brw_imm_ud(jmpi*nr)); + copy_colors(c, c->vert[0], c->vert[1]); + + brw_pop_insn_state(p); +} + + + +/*********************************************************************** + * Triangle setup. + */ + + +static void alloc_regs( struct brw_sf_compile *c ) +{ + GLuint reg, i; + + /* Values computed by fixed function unit: + */ + c->pv = retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_D); + c->det = brw_vec1_grf(1, 2); + c->dx0 = brw_vec1_grf(1, 3); + c->dx2 = brw_vec1_grf(1, 4); + c->dy0 = brw_vec1_grf(1, 5); + c->dy2 = brw_vec1_grf(1, 6); + + /* z and 1/w passed in seperately: + */ + c->z[0] = brw_vec1_grf(2, 0); + c->inv_w[0] = brw_vec1_grf(2, 1); + c->z[1] = brw_vec1_grf(2, 2); + c->inv_w[1] = brw_vec1_grf(2, 3); + c->z[2] = brw_vec1_grf(2, 4); + c->inv_w[2] = brw_vec1_grf(2, 5); + + /* The vertices: + */ + reg = 3; + for (i = 0; i < c->nr_verts; i++) { + c->vert[i] = brw_vec8_grf(reg, 0); + reg += c->nr_attr_regs; + } + + /* Temporaries, allocated after last vertex reg. + */ + c->inv_det = brw_vec1_grf(reg, 0); reg++; + c->a1_sub_a0 = brw_vec8_grf(reg, 0); reg++; + c->a2_sub_a0 = brw_vec8_grf(reg, 0); reg++; + c->tmp = brw_vec8_grf(reg, 0); reg++; + + /* Note grf allocation: + */ + c->prog_data.total_grf = reg; + + + /* Outputs of this program - interpolation coefficients for + * rasterization: + */ + c->m1Cx = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 1, 0); + c->m2Cy = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 2, 0); + c->m3C0 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 3, 0); +} + + +static void copy_z_inv_w( struct brw_sf_compile *c ) +{ + struct brw_compile *p = &c->func; + GLuint i; + + brw_push_insn_state(p); + + /* Copy both scalars with a single MOV: + */ + for (i = 0; i < c->nr_verts; i++) + brw_MOV(p, vec2(suboffset(c->vert[i], 2)), vec2(c->z[i])); + + brw_pop_insn_state(p); +} + + +static void invert_det( struct brw_sf_compile *c) +{ + /* Looks like we invert all 8 elements just to get 1/det in + * position 2 !?! + */ + brw_math(&c->func, + c->inv_det, + BRW_MATH_FUNCTION_INV, + BRW_MATH_SATURATE_NONE, + 0, + c->det, + BRW_MATH_DATA_SCALAR, + BRW_MATH_PRECISION_FULL); + +} + + +static GLboolean calculate_masks( struct brw_sf_compile *c, + GLuint reg, + GLushort *pc, + GLushort *pc_persp, + GLushort *pc_linear) +{ + GLboolean is_last_attr = (reg == c->nr_setup_regs - 1); + GLuint persp_mask; + GLuint linear_mask; + + if (c->key.do_flat_shading || c->key.linear_color) + persp_mask = c->key.attrs & ~(FRAG_BIT_WPOS | + FRAG_BIT_COL0 | + FRAG_BIT_COL1); + else + persp_mask = c->key.attrs & ~(FRAG_BIT_WPOS); + + if (c->key.do_flat_shading) + linear_mask = c->key.attrs & ~(FRAG_BIT_COL0|FRAG_BIT_COL1); + else + linear_mask = c->key.attrs; + + *pc_persp = 0; + *pc_linear = 0; + *pc = 0xf; + + if (persp_mask & (1 << c->idx_to_attr[reg*2])) + *pc_persp = 0xf; + + if (linear_mask & (1 << c->idx_to_attr[reg*2])) + *pc_linear = 0xf; + + /* Maybe only processs one attribute on the final round: + */ + if (reg*2+1 < c->nr_setup_attrs) { + *pc |= 0xf0; + + if (persp_mask & (1 << c->idx_to_attr[reg*2+1])) + *pc_persp |= 0xf0; + + if (linear_mask & (1 << c->idx_to_attr[reg*2+1])) + *pc_linear |= 0xf0; + } + + return is_last_attr; +} + + + +void brw_emit_tri_setup( struct brw_sf_compile *c, GLboolean allocate) +{ + struct brw_compile *p = &c->func; + GLuint i; + + c->nr_verts = 3; + + if (allocate) + alloc_regs(c); + + invert_det(c); + copy_z_inv_w(c); + + if (c->key.do_twoside_color) + do_twoside_color(c); + + if (c->key.do_flat_shading) + do_flatshade_triangle(c); + + + for (i = 0; i < c->nr_setup_regs; i++) + { + /* Pair of incoming attributes: + */ + struct brw_reg a0 = offset(c->vert[0], i); + struct brw_reg a1 = offset(c->vert[1], i); + struct brw_reg a2 = offset(c->vert[2], i); + GLushort pc, pc_persp, pc_linear; + GLboolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear); + + if (pc_persp) + { + brw_set_predicate_control_flag_value(p, pc_persp); + brw_MUL(p, a0, a0, c->inv_w[0]); + brw_MUL(p, a1, a1, c->inv_w[1]); + brw_MUL(p, a2, a2, c->inv_w[2]); + } + + + /* Calculate coefficients for interpolated values: + */ + if (pc_linear) + { + brw_set_predicate_control_flag_value(p, pc_linear); + + brw_ADD(p, c->a1_sub_a0, a1, negate(a0)); + brw_ADD(p, c->a2_sub_a0, a2, negate(a0)); + + /* calculate dA/dx + */ + brw_MUL(p, brw_null_reg(), c->a1_sub_a0, c->dy2); + brw_MAC(p, c->tmp, c->a2_sub_a0, negate(c->dy0)); + brw_MUL(p, c->m1Cx, c->tmp, c->inv_det); + + /* calculate dA/dy + */ + brw_MUL(p, brw_null_reg(), c->a2_sub_a0, c->dx0); + brw_MAC(p, c->tmp, c->a1_sub_a0, negate(c->dx2)); + brw_MUL(p, c->m2Cy, c->tmp, c->inv_det); + } + + { + brw_set_predicate_control_flag_value(p, pc); + /* start point for interpolation + */ + brw_MOV(p, c->m3C0, a0); + + /* Copy m0..m3 to URB. m0 is implicitly copied from r0 in + * the send instruction: + */ + brw_urb_WRITE(p, + brw_null_reg(), + 0, + brw_vec8_grf(0, 0), /* r0, will be copied to m0 */ + 0, /* allocate */ + 1, /* used */ + 4, /* msg len */ + 0, /* response len */ + last, /* eot */ + last, /* writes complete */ + i*4, /* offset */ + BRW_URB_SWIZZLE_TRANSPOSE); /* XXX: Swizzle control "SF to windower" */ + } + } +} + + + +void brw_emit_line_setup( struct brw_sf_compile *c, GLboolean allocate) +{ + struct brw_compile *p = &c->func; + GLuint i; + + + c->nr_verts = 2; + + if (allocate) + alloc_regs(c); + + invert_det(c); + copy_z_inv_w(c); + + if (c->key.do_flat_shading) + do_flatshade_line(c); + + for (i = 0; i < c->nr_setup_regs; i++) + { + /* Pair of incoming attributes: + */ + struct brw_reg a0 = offset(c->vert[0], i); + struct brw_reg a1 = offset(c->vert[1], i); + GLushort pc, pc_persp, pc_linear; + GLboolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear); + + if (pc_persp) + { + brw_set_predicate_control_flag_value(p, pc_persp); + brw_MUL(p, a0, a0, c->inv_w[0]); + brw_MUL(p, a1, a1, c->inv_w[1]); + } + + /* Calculate coefficients for position, color: + */ + if (pc_linear) { + brw_set_predicate_control_flag_value(p, pc_linear); + + brw_ADD(p, c->a1_sub_a0, a1, negate(a0)); + + brw_MUL(p, c->tmp, c->a1_sub_a0, c->dx0); + brw_MUL(p, c->m1Cx, c->tmp, c->inv_det); + + brw_MUL(p, c->tmp, c->a1_sub_a0, c->dy0); + brw_MUL(p, c->m2Cy, c->tmp, c->inv_det); + } + + { + brw_set_predicate_control_flag_value(p, pc); + + /* start point for interpolation + */ + brw_MOV(p, c->m3C0, a0); + + /* Copy m0..m3 to URB. + */ + brw_urb_WRITE(p, + brw_null_reg(), + 0, + brw_vec8_grf(0, 0), + 0, /* allocate */ + 1, /* used */ + 4, /* msg len */ + 0, /* response len */ + last, /* eot */ + last, /* writes complete */ + i*4, /* urb destination offset */ + BRW_URB_SWIZZLE_TRANSPOSE); + } + } +} + +void brw_emit_point_sprite_setup( struct brw_sf_compile *c, GLboolean allocate) +{ + struct brw_compile *p = &c->func; + GLuint i; + + c->nr_verts = 1; + + if (allocate) + alloc_regs(c); + + copy_z_inv_w(c); + for (i = 0; i < c->nr_setup_regs; i++) + { + struct brw_sf_point_tex *tex = &c->point_attrs[c->idx_to_attr[2*i]]; + struct brw_reg a0 = offset(c->vert[0], i); + GLushort pc, pc_persp, pc_linear; + GLboolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear); + + if (pc_persp) + { + if (!tex->CoordReplace) { + brw_set_predicate_control_flag_value(p, pc_persp); + brw_MUL(p, a0, a0, c->inv_w[0]); + } + } + + if (tex->CoordReplace) { + /* Caculate 1.0/PointWidth */ + brw_math(&c->func, + c->tmp, + BRW_MATH_FUNCTION_INV, + BRW_MATH_SATURATE_NONE, + 0, + c->dx0, + BRW_MATH_DATA_SCALAR, + BRW_MATH_PRECISION_FULL); + + if (c->key.SpriteOrigin == GL_LOWER_LEFT) { + brw_MUL(p, c->m1Cx, c->tmp, c->inv_w[0]); + brw_MOV(p, vec1(suboffset(c->m1Cx, 1)), brw_imm_f(0.0)); + brw_MUL(p, c->m2Cy, c->tmp, negate(c->inv_w[0])); + brw_MOV(p, vec1(suboffset(c->m2Cy, 0)), brw_imm_f(0.0)); + } else { + brw_MUL(p, c->m1Cx, c->tmp, c->inv_w[0]); + brw_MOV(p, vec1(suboffset(c->m1Cx, 1)), brw_imm_f(0.0)); + brw_MUL(p, c->m2Cy, c->tmp, c->inv_w[0]); + brw_MOV(p, vec1(suboffset(c->m2Cy, 0)), brw_imm_f(0.0)); + } + } else { + brw_MOV(p, c->m1Cx, brw_imm_ud(0)); + brw_MOV(p, c->m2Cy, brw_imm_ud(0)); + } + + { + brw_set_predicate_control_flag_value(p, pc); + if (tex->CoordReplace) { + if (c->key.SpriteOrigin == GL_LOWER_LEFT) { + brw_MUL(p, c->m3C0, c->inv_w[0], brw_imm_f(1.0)); + brw_MOV(p, vec1(suboffset(c->m3C0, 0)), brw_imm_f(0.0)); + } + else + brw_MOV(p, c->m3C0, brw_imm_f(0.0)); + } else { + brw_MOV(p, c->m3C0, a0); /* constant value */ + } + + /* Copy m0..m3 to URB. + */ + brw_urb_WRITE(p, + brw_null_reg(), + 0, + brw_vec8_grf(0, 0), + 0, /* allocate */ + 1, /* used */ + 4, /* msg len */ + 0, /* response len */ + last, /* eot */ + last, /* writes complete */ + i*4, /* urb destination offset */ + BRW_URB_SWIZZLE_TRANSPOSE); + } + } +} + +/* Points setup - several simplifications as all attributes are + * constant across the face of the point (point sprites excluded!) + */ +void brw_emit_point_setup( struct brw_sf_compile *c, GLboolean allocate) +{ + struct brw_compile *p = &c->func; + GLuint i; + + c->nr_verts = 1; + + if (allocate) + alloc_regs(c); + + copy_z_inv_w(c); + + brw_MOV(p, c->m1Cx, brw_imm_ud(0)); /* zero - move out of loop */ + brw_MOV(p, c->m2Cy, brw_imm_ud(0)); /* zero - move out of loop */ + + for (i = 0; i < c->nr_setup_regs; i++) + { + struct brw_reg a0 = offset(c->vert[0], i); + GLushort pc, pc_persp, pc_linear; + GLboolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear); + + if (pc_persp) + { + /* This seems odd as the values are all constant, but the + * fragment shader will be expecting it: + */ + brw_set_predicate_control_flag_value(p, pc_persp); + brw_MUL(p, a0, a0, c->inv_w[0]); + } + + + /* The delta values are always zero, just send the starting + * coordinate. Again, this is to fit in with the interpolation + * code in the fragment shader. + */ + { + brw_set_predicate_control_flag_value(p, pc); + + brw_MOV(p, c->m3C0, a0); /* constant value */ + + /* Copy m0..m3 to URB. + */ + brw_urb_WRITE(p, + brw_null_reg(), + 0, + brw_vec8_grf(0, 0), + 0, /* allocate */ + 1, /* used */ + 4, /* msg len */ + 0, /* response len */ + last, /* eot */ + last, /* writes complete */ + i*4, /* urb destination offset */ + BRW_URB_SWIZZLE_TRANSPOSE); + } + } +} + +void brw_emit_anyprim_setup( struct brw_sf_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_reg ip = brw_ip_reg(); + struct brw_reg payload_prim = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0); + struct brw_reg payload_attr = get_element_ud(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0), 0); + struct brw_reg primmask; + struct brw_instruction *jmp; + struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD)); + + GLuint saveflag; + + c->nr_verts = 3; + alloc_regs(c); + + primmask = retype(get_element(c->tmp, 0), BRW_REGISTER_TYPE_UD); + + brw_MOV(p, primmask, brw_imm_ud(1)); + brw_SHL(p, primmask, primmask, payload_prim); + + brw_set_conditionalmod(p, BRW_CONDITIONAL_Z); + brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_TRILIST) | + (1<<_3DPRIM_TRISTRIP) | + (1<<_3DPRIM_TRIFAN) | + (1<<_3DPRIM_TRISTRIP_REVERSE) | + (1<<_3DPRIM_POLYGON) | + (1<<_3DPRIM_RECTLIST) | + (1<<_3DPRIM_TRIFAN_NOSTIPPLE))); + jmp = brw_JMPI(p, ip, ip, brw_imm_d(0)); + { + saveflag = p->flag_value; + brw_push_insn_state(p); + brw_emit_tri_setup( c, GL_FALSE ); + brw_pop_insn_state(p); + p->flag_value = saveflag; + /* note - thread killed in subroutine, so must + * restore the flag which is changed when building + * the subroutine. fix #13240 + */ + } + brw_land_fwd_jump(p, jmp); + + brw_set_conditionalmod(p, BRW_CONDITIONAL_Z); + brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_LINELIST) | + (1<<_3DPRIM_LINESTRIP) | + (1<<_3DPRIM_LINELOOP) | + (1<<_3DPRIM_LINESTRIP_CONT) | + (1<<_3DPRIM_LINESTRIP_BF) | + (1<<_3DPRIM_LINESTRIP_CONT_BF))); + jmp = brw_JMPI(p, ip, ip, brw_imm_d(0)); + { + saveflag = p->flag_value; + brw_push_insn_state(p); + brw_emit_line_setup( c, GL_FALSE ); + brw_pop_insn_state(p); + p->flag_value = saveflag; + /* note - thread killed in subroutine */ + } + brw_land_fwd_jump(p, jmp); + + brw_set_conditionalmod(p, BRW_CONDITIONAL_Z); + brw_AND(p, v1_null_ud, payload_attr, brw_imm_ud(1<<BRW_SPRITE_POINT_ENABLE)); + jmp = brw_JMPI(p, ip, ip, brw_imm_d(0)); + { + saveflag = p->flag_value; + brw_push_insn_state(p); + brw_emit_point_sprite_setup( c, GL_FALSE ); + brw_pop_insn_state(p); + p->flag_value = saveflag; + } + brw_land_fwd_jump(p, jmp); + + brw_emit_point_setup( c, GL_FALSE ); +} + + + + diff --git a/src/gallium/drivers/i965/brw_sf_state.c b/src/gallium/drivers/i965/brw_sf_state.c new file mode 100644 index 00000000000..bc0f0760738 --- /dev/null +++ b/src/gallium/drivers/i965/brw_sf_state.c @@ -0,0 +1,365 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "main/macros.h" +#include "intel_fbo.h" + +static void upload_sf_vp(struct brw_context *brw) +{ + GLcontext *ctx = &brw->intel.ctx; + const GLfloat depth_scale = 1.0F / ctx->DrawBuffer->_DepthMaxF; + struct brw_sf_viewport sfv; + GLfloat y_scale, y_bias; + const GLboolean render_to_fbo = (ctx->DrawBuffer->Name != 0); + const GLfloat *v = ctx->Viewport._WindowMap.m; + + memset(&sfv, 0, sizeof(sfv)); + + if (render_to_fbo) { + y_scale = 1.0; + y_bias = 0; + } + else { + y_scale = -1.0; + y_bias = ctx->DrawBuffer->Height; + } + + /* _NEW_VIEWPORT */ + + sfv.viewport.m00 = v[MAT_SX]; + sfv.viewport.m11 = v[MAT_SY] * y_scale; + sfv.viewport.m22 = v[MAT_SZ] * depth_scale; + sfv.viewport.m30 = v[MAT_TX]; + sfv.viewport.m31 = v[MAT_TY] * y_scale + y_bias; + sfv.viewport.m32 = v[MAT_TZ] * depth_scale; + + /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT + * for DrawBuffer->_[XY]{min,max} + */ + + /* The scissor only needs to handle the intersection of drawable and + * scissor rect. Clipping to the boundaries of static shared buffers + * for front/back/depth is covered by looping over cliprects in brw_draw.c. + * + * Note that the hardware's coordinates are inclusive, while Mesa's min is + * inclusive but max is exclusive. + */ + if (render_to_fbo) { + /* texmemory: Y=0=bottom */ + sfv.scissor.xmin = ctx->DrawBuffer->_Xmin; + sfv.scissor.xmax = ctx->DrawBuffer->_Xmax - 1; + sfv.scissor.ymin = ctx->DrawBuffer->_Ymin; + sfv.scissor.ymax = ctx->DrawBuffer->_Ymax - 1; + } + else { + /* memory: Y=0=top */ + sfv.scissor.xmin = ctx->DrawBuffer->_Xmin; + sfv.scissor.xmax = ctx->DrawBuffer->_Xmax - 1; + sfv.scissor.ymin = ctx->DrawBuffer->Height - ctx->DrawBuffer->_Ymax; + sfv.scissor.ymax = ctx->DrawBuffer->Height - ctx->DrawBuffer->_Ymin - 1; + } + + dri_bo_unreference(brw->sf.vp_bo); + brw->sf.vp_bo = brw_cache_data( &brw->cache, BRW_SF_VP, &sfv, NULL, 0 ); +} + +const struct brw_tracked_state brw_sf_vp = { + .dirty = { + .mesa = (_NEW_VIEWPORT | + _NEW_SCISSOR | + _NEW_BUFFERS), + .brw = 0, + .cache = 0 + }, + .prepare = upload_sf_vp +}; + +struct brw_sf_unit_key { + unsigned int total_grf; + unsigned int urb_entry_read_length; + + unsigned int nr_urb_entries, urb_size, sfsize; + + GLenum front_face, cull_face, provoking_vertex; + unsigned scissor:1; + unsigned line_smooth:1; + unsigned point_sprite:1; + unsigned point_attenuated:1; + unsigned render_to_fbo:1; + float line_width; + float point_size; +}; + +static void +sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key) +{ + GLcontext *ctx = &brw->intel.ctx; + memset(key, 0, sizeof(*key)); + + /* CACHE_NEW_SF_PROG */ + key->total_grf = brw->sf.prog_data->total_grf; + key->urb_entry_read_length = brw->sf.prog_data->urb_read_length; + + /* BRW_NEW_URB_FENCE */ + key->nr_urb_entries = brw->urb.nr_sf_entries; + key->urb_size = brw->urb.vsize; + key->sfsize = brw->urb.sfsize; + + key->scissor = ctx->Scissor.Enabled; + key->front_face = ctx->Polygon.FrontFace; + + if (ctx->Polygon.CullFlag) + key->cull_face = ctx->Polygon.CullFaceMode; + else + key->cull_face = GL_NONE; + + key->line_width = ctx->Line.Width; + key->line_smooth = ctx->Line.SmoothFlag; + + key->point_sprite = ctx->Point.PointSprite; + key->point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize); + key->point_attenuated = ctx->Point._Attenuated; + + /* _NEW_LIGHT */ + key->provoking_vertex = ctx->Light.ProvokingVertex; + + key->render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0; +} + +static dri_bo * +sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key, + dri_bo **reloc_bufs) +{ + struct brw_sf_unit_state sf; + dri_bo *bo; + int chipset_max_threads; + memset(&sf, 0, sizeof(sf)); + + sf.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1; + sf.thread0.kernel_start_pointer = brw->sf.prog_bo->offset >> 6; /* reloc */ + + sf.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754; + + sf.thread3.dispatch_grf_start_reg = 3; + + if (BRW_IS_IGDNG(brw)) + sf.thread3.urb_entry_read_offset = 3; + else + sf.thread3.urb_entry_read_offset = 1; + + sf.thread3.urb_entry_read_length = key->urb_entry_read_length; + + sf.thread4.nr_urb_entries = key->nr_urb_entries; + sf.thread4.urb_entry_allocation_size = key->sfsize - 1; + + /* Each SF thread produces 1 PUE, and there can be up to 24(Pre-IGDNG) or + * 48(IGDNG) threads + */ + if (BRW_IS_IGDNG(brw)) + chipset_max_threads = 48; + else + chipset_max_threads = 24; + + sf.thread4.max_threads = MIN2(chipset_max_threads, key->nr_urb_entries) - 1; + + if (INTEL_DEBUG & DEBUG_SINGLE_THREAD) + sf.thread4.max_threads = 0; + + if (INTEL_DEBUG & DEBUG_STATS) + sf.thread4.stats_enable = 1; + + /* CACHE_NEW_SF_VP */ + sf.sf5.sf_viewport_state_offset = brw->sf.vp_bo->offset >> 5; /* reloc */ + + sf.sf5.viewport_transform = 1; + + /* _NEW_SCISSOR */ + if (key->scissor) + sf.sf6.scissor = 1; + + /* _NEW_POLYGON */ + if (key->front_face == GL_CCW) + sf.sf5.front_winding = BRW_FRONTWINDING_CCW; + else + sf.sf5.front_winding = BRW_FRONTWINDING_CW; + + /* The viewport is inverted for rendering to a FBO, and that inverts + * polygon front/back orientation. + */ + sf.sf5.front_winding ^= key->render_to_fbo; + + switch (key->cull_face) { + case GL_FRONT: + sf.sf6.cull_mode = BRW_CULLMODE_FRONT; + break; + case GL_BACK: + sf.sf6.cull_mode = BRW_CULLMODE_BACK; + break; + case GL_FRONT_AND_BACK: + sf.sf6.cull_mode = BRW_CULLMODE_BOTH; + break; + case GL_NONE: + sf.sf6.cull_mode = BRW_CULLMODE_NONE; + break; + default: + assert(0); + break; + } + + /* _NEW_LINE */ + /* XXX use ctx->Const.Min/MaxLineWidth here */ + sf.sf6.line_width = CLAMP(key->line_width, 1.0, 5.0) * (1<<1); + + sf.sf6.line_endcap_aa_region_width = 1; + if (key->line_smooth) + sf.sf6.aa_enable = 1; + else if (sf.sf6.line_width <= 0x2) + sf.sf6.line_width = 0; + + /* _NEW_BUFFERS */ + key->render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0; + if (!key->render_to_fbo) { + /* Rendering to an OpenGL window */ + sf.sf6.point_rast_rule = BRW_RASTRULE_UPPER_RIGHT; + } + else { + /* If rendering to an FBO, the pixel coordinate system is + * inverted with respect to the normal OpenGL coordinate + * system, so BRW_RASTRULE_LOWER_RIGHT is correct. + * But this value is listed as "Reserved, but not seen as useful" + * in Intel documentation (page 212, "Point Rasterization Rule", + * section 7.4 "SF Pipeline State Summary", of document + * "Intel® 965 Express Chipset Family and Intel® G35 Express + * Chipset Graphics Controller Programmer's Reference Manual, + * Volume 2: 3D/Media", Revision 1.0b as of January 2008, + * available at + * http://intellinuxgraphics.org/documentation.html + * at the time of this writing). + * + * It does work on at least some devices, if not all; + * if devices that don't support it can be identified, + * the likely failure case is that points are rasterized + * incorrectly, which is no worse than occurs without + * the value, so we're using it here. + */ + sf.sf6.point_rast_rule = BRW_RASTRULE_LOWER_RIGHT; + } + /* XXX clamp max depends on AA vs. non-AA */ + + /* _NEW_POINT */ + sf.sf7.sprite_point = key->point_sprite; + sf.sf7.point_size = CLAMP(rint(key->point_size), 1, 255) * (1<<3); + sf.sf7.use_point_size_state = !key->point_attenuated; + sf.sf7.aa_line_distance_mode = 0; + + /* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons: + */ + if (key->provoking_vertex == GL_LAST_VERTEX_CONVENTION) { + sf.sf7.trifan_pv = 2; + sf.sf7.linestrip_pv = 1; + sf.sf7.tristrip_pv = 2; + } else { + sf.sf7.trifan_pv = 1; + sf.sf7.linestrip_pv = 0; + sf.sf7.tristrip_pv = 0; + } + sf.sf7.line_last_pixel_enable = 0; + + /* Set bias for OpenGL rasterization rules: + */ + sf.sf6.dest_org_vbias = 0x8; + sf.sf6.dest_org_hbias = 0x8; + + bo = brw_upload_cache(&brw->cache, BRW_SF_UNIT, + key, sizeof(*key), + reloc_bufs, 2, + &sf, sizeof(sf), + NULL, NULL); + + /* STATE_PREFETCH command description describes this state as being + * something loaded through the GPE (L2 ISC), so it's INSTRUCTION domain. + */ + /* Emit SF program relocation */ + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + sf.thread0.grf_reg_count << 1, + offsetof(struct brw_sf_unit_state, thread0), + brw->sf.prog_bo); + + /* Emit SF viewport relocation */ + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + sf.sf5.front_winding | (sf.sf5.viewport_transform << 1), + offsetof(struct brw_sf_unit_state, sf5), + brw->sf.vp_bo); + + return bo; +} + +static void upload_sf_unit( struct brw_context *brw ) +{ + struct brw_sf_unit_key key; + dri_bo *reloc_bufs[2]; + + sf_unit_populate_key(brw, &key); + + reloc_bufs[0] = brw->sf.prog_bo; + reloc_bufs[1] = brw->sf.vp_bo; + + dri_bo_unreference(brw->sf.state_bo); + brw->sf.state_bo = brw_search_cache(&brw->cache, BRW_SF_UNIT, + &key, sizeof(key), + reloc_bufs, 2, + NULL); + if (brw->sf.state_bo == NULL) { + brw->sf.state_bo = sf_unit_create_from_key(brw, &key, reloc_bufs); + } +} + +const struct brw_tracked_state brw_sf_unit = { + .dirty = { + .mesa = (_NEW_POLYGON | + _NEW_LIGHT | + _NEW_LINE | + _NEW_POINT | + _NEW_SCISSOR | + _NEW_BUFFERS), + .brw = BRW_NEW_URB_FENCE, + .cache = (CACHE_NEW_SF_VP | + CACHE_NEW_SF_PROG) + }, + .prepare = upload_sf_unit, +}; diff --git a/src/gallium/drivers/i965/brw_state.h b/src/gallium/drivers/i965/brw_state.h new file mode 100644 index 00000000000..d639656b9d4 --- /dev/null +++ b/src/gallium/drivers/i965/brw_state.h @@ -0,0 +1,173 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#ifndef BRW_STATE_H +#define BRW_STATE_H + +#include "brw_context.h" + +static inline void +brw_add_validated_bo(struct brw_context *brw, dri_bo *bo) +{ + assert(brw->state.validated_bo_count < ARRAY_SIZE(brw->state.validated_bos)); + + if (bo != NULL) { + dri_bo_reference(bo); + brw->state.validated_bos[brw->state.validated_bo_count++] = bo; + } +}; + +const struct brw_tracked_state brw_blend_constant_color; +const struct brw_tracked_state brw_cc_unit; +const struct brw_tracked_state brw_cc_vp; +const struct brw_tracked_state brw_check_fallback; +const struct brw_tracked_state brw_clip_prog; +const struct brw_tracked_state brw_clip_unit; +const struct brw_tracked_state brw_constant_buffer; +const struct brw_tracked_state brw_curbe_offsets; +const struct brw_tracked_state brw_invarient_state; +const struct brw_tracked_state brw_gs_prog; +const struct brw_tracked_state brw_gs_unit; +const struct brw_tracked_state brw_line_stipple; +const struct brw_tracked_state brw_aa_line_parameters; +const struct brw_tracked_state brw_pipelined_state_pointers; +const struct brw_tracked_state brw_binding_table_pointers; +const struct brw_tracked_state brw_depthbuffer; +const struct brw_tracked_state brw_polygon_stipple_offset; +const struct brw_tracked_state brw_polygon_stipple; +const struct brw_tracked_state brw_program_parameters; +const struct brw_tracked_state brw_recalculate_urb_fence; +const struct brw_tracked_state brw_sf_prog; +const struct brw_tracked_state brw_sf_unit; +const struct brw_tracked_state brw_sf_vp; +const struct brw_tracked_state brw_state_base_address; +const struct brw_tracked_state brw_urb_fence; +const struct brw_tracked_state brw_vertex_state; +const struct brw_tracked_state brw_vs_surfaces; +const struct brw_tracked_state brw_vs_prog; +const struct brw_tracked_state brw_vs_unit; +const struct brw_tracked_state brw_wm_input_sizes; +const struct brw_tracked_state brw_wm_prog; +const struct brw_tracked_state brw_wm_samplers; +const struct brw_tracked_state brw_wm_constant_surface; +const struct brw_tracked_state brw_wm_surfaces; +const struct brw_tracked_state brw_wm_unit; + +const struct brw_tracked_state brw_psp_urb_cbs; + +const struct brw_tracked_state brw_pipe_control; + +const struct brw_tracked_state brw_drawing_rect; +const struct brw_tracked_state brw_indices; +const struct brw_tracked_state brw_vertices; +const struct brw_tracked_state brw_index_buffer; + +/** + * Use same key for WM and VS surfaces. + */ +struct brw_surface_key { + GLenum target, depthmode; + dri_bo *bo; + GLint format, internal_format; + GLint first_level, last_level; + GLint width, height, depth; + GLint pitch, cpp; + uint32_t tiling; + GLuint offset; +}; + +/*********************************************************************** + * brw_state.c + */ +void brw_validate_state(struct brw_context *brw); +void brw_upload_state(struct brw_context *brw); +void brw_init_state(struct brw_context *brw); +void brw_destroy_state(struct brw_context *brw); + +/*********************************************************************** + * brw_state_cache.c + */ +dri_bo *brw_cache_data(struct brw_cache *cache, + enum brw_cache_id cache_id, + const void *data, + dri_bo **reloc_bufs, + GLuint nr_reloc_bufs); + +dri_bo *brw_cache_data_sz(struct brw_cache *cache, + enum brw_cache_id cache_id, + const void *data, + GLuint data_size, + dri_bo **reloc_bufs, + GLuint nr_reloc_bufs); + +dri_bo *brw_upload_cache( struct brw_cache *cache, + enum brw_cache_id cache_id, + const void *key, + GLuint key_sz, + dri_bo **reloc_bufs, + GLuint nr_reloc_bufs, + const void *data, + GLuint data_sz, + const void *aux, + void *aux_return ); + +dri_bo *brw_search_cache( struct brw_cache *cache, + enum brw_cache_id cache_id, + const void *key, + GLuint key_size, + dri_bo **reloc_bufs, + GLuint nr_reloc_bufs, + void *aux_return); +void brw_state_cache_check_size( struct brw_context *brw ); + +void brw_init_caches( struct brw_context *brw ); +void brw_destroy_caches( struct brw_context *brw ); +void brw_state_cache_bo_delete(struct brw_cache *cache, dri_bo *bo); + +/*********************************************************************** + * brw_state_batch.c + */ +#define BRW_BATCH_STRUCT(brw, s) intel_batchbuffer_data( brw->intel.batch, (s), sizeof(*(s)), IGNORE_CLIPRECTS) +#define BRW_CACHED_BATCH_STRUCT(brw, s) brw_cached_batch_struct( brw, (s), sizeof(*(s)) ) + +GLboolean brw_cached_batch_struct( struct brw_context *brw, + const void *data, + GLuint sz ); +void brw_destroy_batch_cache( struct brw_context *brw ); +void brw_clear_batch_cache( struct brw_context *brw ); + +/* brw_wm_surface_state.c */ +dri_bo * +brw_create_constant_surface( struct brw_context *brw, + struct brw_surface_key *key ); + +#endif diff --git a/src/gallium/drivers/i965/brw_state_batch.c b/src/gallium/drivers/i965/brw_state_batch.c new file mode 100644 index 00000000000..7821898cf9b --- /dev/null +++ b/src/gallium/drivers/i965/brw_state_batch.c @@ -0,0 +1,99 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + + +#include "brw_state.h" +#include "intel_batchbuffer.h" +#include "main/imports.h" + + + +/* A facility similar to the data caching code above, which aims to + * prevent identical commands being issued repeatedly. + */ +GLboolean brw_cached_batch_struct( struct brw_context *brw, + const void *data, + GLuint sz ) +{ + struct brw_cached_batch_item *item = brw->cached_batch_items; + struct header *newheader = (struct header *)data; + + if (brw->emit_state_always) { + intel_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS); + return GL_TRUE; + } + + while (item) { + if (item->header->opcode == newheader->opcode) { + if (item->sz == sz && memcmp(item->header, newheader, sz) == 0) + return GL_FALSE; + if (item->sz != sz) { + _mesa_free(item->header); + item->header = _mesa_malloc(sz); + item->sz = sz; + } + goto emit; + } + item = item->next; + } + + assert(!item); + item = CALLOC_STRUCT(brw_cached_batch_item); + item->header = _mesa_malloc(sz); + item->sz = sz; + item->next = brw->cached_batch_items; + brw->cached_batch_items = item; + + emit: + memcpy(item->header, newheader, sz); + intel_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS); + return GL_TRUE; +} + +void brw_clear_batch_cache( struct brw_context *brw ) +{ + struct brw_cached_batch_item *item = brw->cached_batch_items; + + while (item) { + struct brw_cached_batch_item *next = item->next; + free((void *)item->header); + free(item); + item = next; + } + + brw->cached_batch_items = NULL; +} + +void brw_destroy_batch_cache( struct brw_context *brw ) +{ + brw_clear_batch_cache(brw); +} diff --git a/src/gallium/drivers/i965/brw_state_cache.c b/src/gallium/drivers/i965/brw_state_cache.c new file mode 100644 index 00000000000..c262e1db8b9 --- /dev/null +++ b/src/gallium/drivers/i965/brw_state_cache.c @@ -0,0 +1,597 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + +/** @file brw_state_cache.c + * + * This file implements a simple static state cache for 965. The consumers + * can query the hash table of state using a cache_id, opaque key data, + * and list of buffers that will be used in relocations, and receive the + * corresponding state buffer object of state (plus associated auxiliary + * data) in return. + * + * The inner workings are a simple hash table based on a CRC of the key data. + * The cache_id and relocation target buffers associated with the state + * buffer are included as auxiliary key data, but are not part of the hash + * value (this should be fixed, but will likely be fixed instead by making + * consumers use structured keys). + * + * Replacement is not implemented. Instead, when the cache gets too big, at + * a safe point (unlock) we throw out all of the cache data and let it + * regenerate for the next rendering operation. + * + * The reloc_buf pointers need to be included as key data, otherwise the + * non-unique values stuffed in the offset in key data through + * brw_cache_data() may result in successful probe for state buffers + * even when the buffer being referenced doesn't match. The result would be + * that the same state cache entry is used twice for different buffers, + * only one of the two buffers referenced gets put into the offset, and the + * incorrect program is run for the other instance. + */ + +#include "main/imports.h" +#include "brw_state.h" +#include "intel_batchbuffer.h" + +/* XXX: Fixme - have to include these to get the sizes of the prog_key + * structs: + */ +#include "brw_wm.h" +#include "brw_vs.h" +#include "brw_clip.h" +#include "brw_sf.h" +#include "brw_gs.h" + + +static GLuint +hash_key(const void *key, GLuint key_size, + dri_bo **reloc_bufs, GLuint nr_reloc_bufs) +{ + GLuint *ikey = (GLuint *)key; + GLuint hash = 0, i; + + assert(key_size % 4 == 0); + + /* I'm sure this can be improved on: + */ + for (i = 0; i < key_size/4; i++) { + hash ^= ikey[i]; + hash = (hash << 5) | (hash >> 27); + } + + /* Include the BO pointers as key data as well */ + ikey = (GLuint *)reloc_bufs; + key_size = nr_reloc_bufs * sizeof(dri_bo *); + for (i = 0; i < key_size/4; i++) { + hash ^= ikey[i]; + hash = (hash << 5) | (hash >> 27); + } + + return hash; +} + + +/** + * Marks a new buffer as being chosen for the given cache id. + */ +static void +update_cache_last(struct brw_cache *cache, enum brw_cache_id cache_id, + dri_bo *bo) +{ + if (bo == cache->last_bo[cache_id]) + return; /* no change */ + + dri_bo_unreference(cache->last_bo[cache_id]); + cache->last_bo[cache_id] = bo; + dri_bo_reference(cache->last_bo[cache_id]); + cache->brw->state.dirty.cache |= 1 << cache_id; +} + + +static struct brw_cache_item * +search_cache(struct brw_cache *cache, enum brw_cache_id cache_id, + GLuint hash, const void *key, GLuint key_size, + dri_bo **reloc_bufs, GLuint nr_reloc_bufs) +{ + struct brw_cache_item *c; + +#if 0 + int bucketcount = 0; + + for (c = cache->items[hash % cache->size]; c; c = c->next) + bucketcount++; + + fprintf(stderr, "bucket %d/%d = %d/%d items\n", hash % cache->size, + cache->size, bucketcount, cache->n_items); +#endif + + for (c = cache->items[hash % cache->size]; c; c = c->next) { + if (c->cache_id == cache_id && + c->hash == hash && + c->key_size == key_size && + memcmp(c->key, key, key_size) == 0 && + c->nr_reloc_bufs == nr_reloc_bufs && + memcmp(c->reloc_bufs, reloc_bufs, + nr_reloc_bufs * sizeof(dri_bo *)) == 0) + return c; + } + + return NULL; +} + + +static void +rehash(struct brw_cache *cache) +{ + struct brw_cache_item **items; + struct brw_cache_item *c, *next; + GLuint size, i; + + size = cache->size * 3; + items = (struct brw_cache_item**) _mesa_calloc(size * sizeof(*items)); + + for (i = 0; i < cache->size; i++) + for (c = cache->items[i]; c; c = next) { + next = c->next; + c->next = items[c->hash % size]; + items[c->hash % size] = c; + } + + FREE(cache->items); + cache->items = items; + cache->size = size; +} + + +/** + * Returns the buffer object matching cache_id and key, or NULL. + */ +dri_bo * +brw_search_cache(struct brw_cache *cache, + enum brw_cache_id cache_id, + const void *key, + GLuint key_size, + dri_bo **reloc_bufs, GLuint nr_reloc_bufs, + void *aux_return) +{ + struct brw_cache_item *item; + GLuint hash = hash_key(key, key_size, reloc_bufs, nr_reloc_bufs); + + item = search_cache(cache, cache_id, hash, key, key_size, + reloc_bufs, nr_reloc_bufs); + + if (item == NULL) + return NULL; + + if (aux_return) + *(void **)aux_return = (void *)((char *)item->key + item->key_size); + + update_cache_last(cache, cache_id, item->bo); + + dri_bo_reference(item->bo); + return item->bo; +} + + +dri_bo * +brw_upload_cache( struct brw_cache *cache, + enum brw_cache_id cache_id, + const void *key, + GLuint key_size, + dri_bo **reloc_bufs, + GLuint nr_reloc_bufs, + const void *data, + GLuint data_size, + const void *aux, + void *aux_return ) +{ + struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item); + GLuint hash = hash_key(key, key_size, reloc_bufs, nr_reloc_bufs); + GLuint relocs_size = nr_reloc_bufs * sizeof(dri_bo *); + GLuint aux_size = cache->aux_size[cache_id]; + void *tmp; + dri_bo *bo; + int i; + + /* Create the buffer object to contain the data */ + bo = dri_bo_alloc(cache->brw->intel.bufmgr, + cache->name[cache_id], data_size, 1 << 6); + + + /* Set up the memory containing the key, aux_data, and reloc_bufs */ + tmp = _mesa_malloc(key_size + aux_size + relocs_size); + + memcpy(tmp, key, key_size); + memcpy(tmp + key_size, aux, cache->aux_size[cache_id]); + memcpy(tmp + key_size + aux_size, reloc_bufs, relocs_size); + for (i = 0; i < nr_reloc_bufs; i++) { + if (reloc_bufs[i] != NULL) + dri_bo_reference(reloc_bufs[i]); + } + + item->cache_id = cache_id; + item->key = tmp; + item->hash = hash; + item->key_size = key_size; + item->reloc_bufs = tmp + key_size + aux_size; + item->nr_reloc_bufs = nr_reloc_bufs; + + item->bo = bo; + dri_bo_reference(bo); + item->data_size = data_size; + + if (cache->n_items > cache->size * 1.5) + rehash(cache); + + hash %= cache->size; + item->next = cache->items[hash]; + cache->items[hash] = item; + cache->n_items++; + + if (aux_return) { + assert(cache->aux_size[cache_id]); + *(void **)aux_return = (void *)((char *)item->key + item->key_size); + } + + if (INTEL_DEBUG & DEBUG_STATE) + _mesa_printf("upload %s: %d bytes to cache id %d\n", + cache->name[cache_id], + data_size, cache_id); + + /* Copy data to the buffer */ + dri_bo_subdata(bo, 0, data_size, data); + + update_cache_last(cache, cache_id, bo); + + return bo; +} + + +/** + * This doesn't really work with aux data. Use search/upload instead + */ +dri_bo * +brw_cache_data_sz(struct brw_cache *cache, + enum brw_cache_id cache_id, + const void *data, + GLuint data_size, + dri_bo **reloc_bufs, + GLuint nr_reloc_bufs) +{ + dri_bo *bo; + struct brw_cache_item *item; + GLuint hash = hash_key(data, data_size, reloc_bufs, nr_reloc_bufs); + + item = search_cache(cache, cache_id, hash, data, data_size, + reloc_bufs, nr_reloc_bufs); + if (item) { + update_cache_last(cache, cache_id, item->bo); + dri_bo_reference(item->bo); + return item->bo; + } + + bo = brw_upload_cache(cache, cache_id, + data, data_size, + reloc_bufs, nr_reloc_bufs, + data, data_size, + NULL, NULL); + + return bo; +} + + +/** + * Wrapper around brw_cache_data_sz using the cache_id's canonical key size. + * + * If nr_reloc_bufs is nonzero, brw_search_cache()/brw_upload_cache() would be + * better to use, as the potentially changing offsets in the data-used-as-key + * will result in excessive cache misses. + */ +dri_bo * +brw_cache_data(struct brw_cache *cache, + enum brw_cache_id cache_id, + const void *data, + dri_bo **reloc_bufs, + GLuint nr_reloc_bufs) +{ + return brw_cache_data_sz(cache, cache_id, data, cache->key_size[cache_id], + reloc_bufs, nr_reloc_bufs); +} + +enum pool_type { + DW_SURFACE_STATE, + DW_GENERAL_STATE +}; + + +static void +brw_init_cache_id(struct brw_cache *cache, + const char *name, + enum brw_cache_id id, + GLuint key_size, + GLuint aux_size) +{ + cache->name[id] = strdup(name); + cache->key_size[id] = key_size; + cache->aux_size[id] = aux_size; +} + + +static void +brw_init_non_surface_cache(struct brw_context *brw) +{ + struct brw_cache *cache = &brw->cache; + + cache->brw = brw; + + cache->size = 7; + cache->n_items = 0; + cache->items = (struct brw_cache_item **) + _mesa_calloc(cache->size * sizeof(struct brw_cache_item)); + + brw_init_cache_id(cache, + "CC_VP", + BRW_CC_VP, + sizeof(struct brw_cc_viewport), + 0); + + brw_init_cache_id(cache, + "CC_UNIT", + BRW_CC_UNIT, + sizeof(struct brw_cc_unit_state), + 0); + + brw_init_cache_id(cache, + "WM_PROG", + BRW_WM_PROG, + sizeof(struct brw_wm_prog_key), + sizeof(struct brw_wm_prog_data)); + + brw_init_cache_id(cache, + "SAMPLER_DEFAULT_COLOR", + BRW_SAMPLER_DEFAULT_COLOR, + sizeof(struct brw_sampler_default_color), + 0); + + brw_init_cache_id(cache, + "SAMPLER", + BRW_SAMPLER, + 0, /* variable key/data size */ + 0); + + brw_init_cache_id(cache, + "WM_UNIT", + BRW_WM_UNIT, + sizeof(struct brw_wm_unit_state), + 0); + + brw_init_cache_id(cache, + "SF_PROG", + BRW_SF_PROG, + sizeof(struct brw_sf_prog_key), + sizeof(struct brw_sf_prog_data)); + + brw_init_cache_id(cache, + "SF_VP", + BRW_SF_VP, + sizeof(struct brw_sf_viewport), + 0); + + brw_init_cache_id(cache, + "SF_UNIT", + BRW_SF_UNIT, + sizeof(struct brw_sf_unit_state), + 0); + + brw_init_cache_id(cache, + "VS_UNIT", + BRW_VS_UNIT, + sizeof(struct brw_vs_unit_state), + 0); + + brw_init_cache_id(cache, + "VS_PROG", + BRW_VS_PROG, + sizeof(struct brw_vs_prog_key), + sizeof(struct brw_vs_prog_data)); + + brw_init_cache_id(cache, + "CLIP_UNIT", + BRW_CLIP_UNIT, + sizeof(struct brw_clip_unit_state), + 0); + + brw_init_cache_id(cache, + "CLIP_PROG", + BRW_CLIP_PROG, + sizeof(struct brw_clip_prog_key), + sizeof(struct brw_clip_prog_data)); + + brw_init_cache_id(cache, + "GS_UNIT", + BRW_GS_UNIT, + sizeof(struct brw_gs_unit_state), + 0); + + brw_init_cache_id(cache, + "GS_PROG", + BRW_GS_PROG, + sizeof(struct brw_gs_prog_key), + sizeof(struct brw_gs_prog_data)); +} + + +static void +brw_init_surface_cache(struct brw_context *brw) +{ + struct brw_cache *cache = &brw->surface_cache; + + cache->brw = brw; + + cache->size = 7; + cache->n_items = 0; + cache->items = (struct brw_cache_item **) + _mesa_calloc(cache->size * sizeof(struct brw_cache_item)); + + brw_init_cache_id(cache, + "SS_SURFACE", + BRW_SS_SURFACE, + sizeof(struct brw_surface_state), + 0); + + brw_init_cache_id(cache, + "SS_SURF_BIND", + BRW_SS_SURF_BIND, + 0, + 0); +} + + +void +brw_init_caches(struct brw_context *brw) +{ + brw_init_non_surface_cache(brw); + brw_init_surface_cache(brw); +} + + +static void +brw_clear_cache(struct brw_context *brw, struct brw_cache *cache) +{ + struct brw_cache_item *c, *next; + GLuint i; + + if (INTEL_DEBUG & DEBUG_STATE) + _mesa_printf("%s\n", __FUNCTION__); + + for (i = 0; i < cache->size; i++) { + for (c = cache->items[i]; c; c = next) { + int j; + + next = c->next; + for (j = 0; j < c->nr_reloc_bufs; j++) + dri_bo_unreference(c->reloc_bufs[j]); + dri_bo_unreference(c->bo); + free((void *)c->key); + free(c); + } + cache->items[i] = NULL; + } + + cache->n_items = 0; + + if (brw->curbe.last_buf) { + _mesa_free(brw->curbe.last_buf); + brw->curbe.last_buf = NULL; + } + + brw->state.dirty.mesa |= ~0; + brw->state.dirty.brw |= ~0; + brw->state.dirty.cache |= ~0; +} + +/* Clear all entries from the cache that point to the given bo. + * + * This lets us release memory for reuse earlier for known-dead buffers, + * at the cost of walking the entire hash table. + */ +void +brw_state_cache_bo_delete(struct brw_cache *cache, dri_bo *bo) +{ + struct brw_cache_item **prev; + GLuint i; + + if (INTEL_DEBUG & DEBUG_STATE) + _mesa_printf("%s\n", __FUNCTION__); + + for (i = 0; i < cache->size; i++) { + for (prev = &cache->items[i]; *prev;) { + struct brw_cache_item *c = *prev; + + if (drm_intel_bo_references(c->bo, bo)) { + int j; + + *prev = c->next; + + for (j = 0; j < c->nr_reloc_bufs; j++) + dri_bo_unreference(c->reloc_bufs[j]); + dri_bo_unreference(c->bo); + free((void *)c->key); + free(c); + cache->n_items--; + } else { + prev = &c->next; + } + } + } +} + +void +brw_state_cache_check_size(struct brw_context *brw) +{ + if (INTEL_DEBUG & DEBUG_STATE) + _mesa_printf("%s (n_items=%d)\n", __FUNCTION__, brw->cache.n_items); + + /* un-tuned guess. We've got around 20 state objects for a total of around + * 32k, so 1000 of them is around 1.5MB. + */ + if (brw->cache.n_items > 1000) + brw_clear_cache(brw, &brw->cache); + + if (brw->surface_cache.n_items > 1000) + brw_clear_cache(brw, &brw->surface_cache); +} + + +static void +brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache) +{ + GLuint i; + + if (INTEL_DEBUG & DEBUG_STATE) + _mesa_printf("%s\n", __FUNCTION__); + + brw_clear_cache(brw, cache); + for (i = 0; i < BRW_MAX_CACHE; i++) { + dri_bo_unreference(cache->last_bo[i]); + free(cache->name[i]); + } + free(cache->items); + cache->items = NULL; + cache->size = 0; +} + + +void +brw_destroy_caches(struct brw_context *brw) +{ + brw_destroy_cache(brw, &brw->cache); + brw_destroy_cache(brw, &brw->surface_cache); +} diff --git a/src/gallium/drivers/i965/brw_state_dump.c b/src/gallium/drivers/i965/brw_state_dump.c new file mode 100644 index 00000000000..e94fa7d2b4c --- /dev/null +++ b/src/gallium/drivers/i965/brw_state_dump.c @@ -0,0 +1,224 @@ +/* + * Copyright © 2007 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <[email protected]> + * + */ + +#include "main/mtypes.h" + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" + +/** + * Prints out a header, the contents, and the message associated with + * the hardware state data given. + * + * \param name Name of the state object + * \param data Pointer to the base of the state object + * \param hw_offset Hardware offset of the base of the state data. + * \param index Index of the DWORD being output. + */ +static void +state_out(const char *name, void *data, uint32_t hw_offset, int index, + char *fmt, ...) +{ + va_list va; + + fprintf(stderr, "%8s: 0x%08x: 0x%08x: ", + name, hw_offset + index * 4, ((uint32_t *)data)[index]); + va_start(va, fmt); + vfprintf(stderr, fmt, va); + va_end(va); +} + +/** Generic, undecoded state buffer debug printout */ +static void +state_struct_out(const char *name, dri_bo *buffer, unsigned int state_size) +{ + int i; + + if (buffer == NULL) + return; + + dri_bo_map(buffer, GL_FALSE); + for (i = 0; i < state_size / 4; i++) { + state_out(name, buffer->virtual, buffer->offset, i, + "dword %d\n", i); + } + dri_bo_unmap(buffer); +} + +static const char * +get_965_surfacetype(unsigned int surfacetype) +{ + switch (surfacetype) { + case 0: return "1D"; + case 1: return "2D"; + case 2: return "3D"; + case 3: return "CUBE"; + case 4: return "BUFFER"; + case 7: return "NULL"; + default: return "unknown"; + } +} + +static const char * +get_965_surface_format(unsigned int surface_format) +{ + switch (surface_format) { + case 0x000: return "r32g32b32a32_float"; + case 0x0c1: return "b8g8r8a8_unorm"; + case 0x100: return "b5g6r5_unorm"; + case 0x102: return "b5g5r5a1_unorm"; + case 0x104: return "b4g4r4a4_unorm"; + default: return "unknown"; + } +} + +static void dump_wm_surface_state(struct brw_context *brw) +{ + int i; + + for (i = 0; i < brw->wm.nr_surfaces; i++) { + dri_bo *surf_bo = brw->wm.surf_bo[i]; + unsigned int surfoff; + struct brw_surface_state *surf; + char name[20]; + + if (surf_bo == NULL) { + fprintf(stderr, " WM SS%d: NULL\n", i); + continue; + } + dri_bo_map(surf_bo, GL_FALSE); + surfoff = surf_bo->offset; + surf = (struct brw_surface_state *)(surf_bo->virtual); + + sprintf(name, "WM SS%d", i); + state_out(name, surf, surfoff, 0, "%s %s\n", + get_965_surfacetype(surf->ss0.surface_type), + get_965_surface_format(surf->ss0.surface_format)); + state_out(name, surf, surfoff, 1, "offset\n"); + state_out(name, surf, surfoff, 2, "%dx%d size, %d mips\n", + surf->ss2.width + 1, surf->ss2.height + 1, surf->ss2.mip_count); + state_out(name, surf, surfoff, 3, "pitch %d, %stiled\n", + surf->ss3.pitch + 1, surf->ss3.tiled_surface ? "" : "not "); + state_out(name, surf, surfoff, 4, "mip base %d\n", + surf->ss4.min_lod); + state_out(name, surf, surfoff, 5, "x,y offset: %d,%d\n", + surf->ss5.x_offset, surf->ss5.y_offset); + + dri_bo_unmap(surf_bo); + } +} + +static void dump_sf_viewport_state(struct brw_context *brw) +{ + const char *name = "SF VP"; + struct brw_sf_viewport *vp; + uint32_t vp_off; + + if (brw->sf.vp_bo == NULL) + return; + + dri_bo_map(brw->sf.vp_bo, GL_FALSE); + + vp = brw->sf.vp_bo->virtual; + vp_off = brw->sf.vp_bo->offset; + + state_out(name, vp, vp_off, 0, "m00 = %f\n", vp->viewport.m00); + state_out(name, vp, vp_off, 1, "m11 = %f\n", vp->viewport.m11); + state_out(name, vp, vp_off, 2, "m22 = %f\n", vp->viewport.m22); + state_out(name, vp, vp_off, 3, "m30 = %f\n", vp->viewport.m30); + state_out(name, vp, vp_off, 4, "m31 = %f\n", vp->viewport.m31); + state_out(name, vp, vp_off, 5, "m32 = %f\n", vp->viewport.m32); + + state_out(name, vp, vp_off, 6, "top left = %d,%d\n", + vp->scissor.xmin, vp->scissor.ymin); + state_out(name, vp, vp_off, 7, "bottom right = %d,%d\n", + vp->scissor.xmax, vp->scissor.ymax); + + dri_bo_unmap(brw->sf.vp_bo); +} + +static void brw_debug_prog(const char *name, dri_bo *prog) +{ + unsigned int i; + uint32_t *data; + + if (prog == NULL) + return; + + dri_bo_map(prog, GL_FALSE); + + data = prog->virtual; + + for (i = 0; i < prog->size / 4 / 4; i++) { + fprintf(stderr, "%8s: 0x%08x: 0x%08x 0x%08x 0x%08x 0x%08x\n", + name, (unsigned int)prog->offset + i * 4 * 4, + data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3]); + /* Stop at the end of the program. It'd be nice to keep track of the actual + * intended program size instead of guessing like this. + */ + if (data[i * 4 + 0] == 0 && + data[i * 4 + 1] == 0 && + data[i * 4 + 2] == 0 && + data[i * 4 + 3] == 0) + break; + } + + dri_bo_unmap(prog); +} + + +/** + * Print additional debug information associated with the batchbuffer + * when DEBUG_BATCH is set. + * + * For 965, this means mapping the state buffers that would have been referenced + * by the batchbuffer and dumping them. + * + * The buffer offsets printed rely on the buffer containing the last offset + * it was validated at. + */ +void brw_debug_batch(struct intel_context *intel) +{ + struct brw_context *brw = brw_context(&intel->ctx); + + state_struct_out("WM bind", brw->wm.bind_bo, 4 * brw->wm.nr_surfaces); + dump_wm_surface_state(brw); + + state_struct_out("VS", brw->vs.state_bo, sizeof(struct brw_vs_unit_state)); + brw_debug_prog("VS prog", brw->vs.prog_bo); + + state_struct_out("GS", brw->gs.state_bo, sizeof(struct brw_gs_unit_state)); + brw_debug_prog("GS prog", brw->gs.prog_bo); + + state_struct_out("SF", brw->sf.state_bo, sizeof(struct brw_sf_unit_state)); + dump_sf_viewport_state(brw); + brw_debug_prog("SF prog", brw->sf.prog_bo); + + state_struct_out("WM", brw->wm.state_bo, sizeof(struct brw_wm_unit_state)); + brw_debug_prog("WM prog", brw->wm.prog_bo); +} diff --git a/src/gallium/drivers/i965/brw_state_upload.c b/src/gallium/drivers/i965/brw_state_upload.c new file mode 100644 index 00000000000..b817b741e77 --- /dev/null +++ b/src/gallium/drivers/i965/brw_state_upload.c @@ -0,0 +1,416 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + + +#include "brw_context.h" +#include "brw_state.h" +#include "intel_batchbuffer.h" + +/* This is used to initialize brw->state.atoms[]. We could use this + * list directly except for a single atom, brw_constant_buffer, which + * has a .dirty value which changes according to the parameters of the + * current fragment and vertex programs, and so cannot be a static + * value. + */ +const struct brw_tracked_state *atoms[] = +{ + &brw_check_fallback, + + &brw_wm_input_sizes, + &brw_vs_prog, + &brw_gs_prog, + &brw_clip_prog, + &brw_sf_prog, + &brw_wm_prog, + + /* Once all the programs are done, we know how large urb entry + * sizes need to be and can decide if we need to change the urb + * layout. + */ + &brw_curbe_offsets, + &brw_recalculate_urb_fence, + + &brw_cc_vp, + &brw_cc_unit, + + &brw_vs_surfaces, /* must do before unit */ + &brw_wm_constant_surface, /* must do before wm surfaces/bind bo */ + &brw_wm_surfaces, /* must do before samplers and unit */ + &brw_wm_samplers, + + &brw_wm_unit, + &brw_sf_vp, + &brw_sf_unit, + &brw_vs_unit, /* always required, enabled or not */ + &brw_clip_unit, + &brw_gs_unit, + + /* Command packets: + */ + &brw_invarient_state, + &brw_state_base_address, + + &brw_binding_table_pointers, + &brw_blend_constant_color, + + &brw_depthbuffer, + + &brw_polygon_stipple, + &brw_polygon_stipple_offset, + + &brw_line_stipple, + &brw_aa_line_parameters, + + &brw_psp_urb_cbs, + + &brw_drawing_rect, + &brw_indices, + &brw_index_buffer, + &brw_vertices, + + &brw_constant_buffer +}; + + +void brw_init_state( struct brw_context *brw ) +{ + brw_init_caches(brw); +} + + +void brw_destroy_state( struct brw_context *brw ) +{ + brw_destroy_caches(brw); + brw_destroy_batch_cache(brw); +} + +/*********************************************************************** + */ + +static GLboolean check_state( const struct brw_state_flags *a, + const struct brw_state_flags *b ) +{ + return ((a->mesa & b->mesa) || + (a->brw & b->brw) || + (a->cache & b->cache)); +} + +static void accumulate_state( struct brw_state_flags *a, + const struct brw_state_flags *b ) +{ + a->mesa |= b->mesa; + a->brw |= b->brw; + a->cache |= b->cache; +} + + +static void xor_states( struct brw_state_flags *result, + const struct brw_state_flags *a, + const struct brw_state_flags *b ) +{ + result->mesa = a->mesa ^ b->mesa; + result->brw = a->brw ^ b->brw; + result->cache = a->cache ^ b->cache; +} + +static void +brw_clear_validated_bos(struct brw_context *brw) +{ + int i; + + /* Clear the last round of validated bos */ + for (i = 0; i < brw->state.validated_bo_count; i++) { + dri_bo_unreference(brw->state.validated_bos[i]); + brw->state.validated_bos[i] = NULL; + } + brw->state.validated_bo_count = 0; +} + +struct dirty_bit_map { + uint32_t bit; + char *name; + uint32_t count; +}; + +#define DEFINE_BIT(name) {name, #name, 0} + +static struct dirty_bit_map mesa_bits[] = { + DEFINE_BIT(_NEW_MODELVIEW), + DEFINE_BIT(_NEW_PROJECTION), + DEFINE_BIT(_NEW_TEXTURE_MATRIX), + DEFINE_BIT(_NEW_COLOR_MATRIX), + DEFINE_BIT(_NEW_ACCUM), + DEFINE_BIT(_NEW_COLOR), + DEFINE_BIT(_NEW_DEPTH), + DEFINE_BIT(_NEW_EVAL), + DEFINE_BIT(_NEW_FOG), + DEFINE_BIT(_NEW_HINT), + DEFINE_BIT(_NEW_LIGHT), + DEFINE_BIT(_NEW_LINE), + DEFINE_BIT(_NEW_PIXEL), + DEFINE_BIT(_NEW_POINT), + DEFINE_BIT(_NEW_POLYGON), + DEFINE_BIT(_NEW_POLYGONSTIPPLE), + DEFINE_BIT(_NEW_SCISSOR), + DEFINE_BIT(_NEW_STENCIL), + DEFINE_BIT(_NEW_TEXTURE), + DEFINE_BIT(_NEW_TRANSFORM), + DEFINE_BIT(_NEW_VIEWPORT), + DEFINE_BIT(_NEW_PACKUNPACK), + DEFINE_BIT(_NEW_ARRAY), + DEFINE_BIT(_NEW_RENDERMODE), + DEFINE_BIT(_NEW_BUFFERS), + DEFINE_BIT(_NEW_MULTISAMPLE), + DEFINE_BIT(_NEW_TRACK_MATRIX), + DEFINE_BIT(_NEW_PROGRAM), + DEFINE_BIT(_NEW_PROGRAM_CONSTANTS), + {0, 0, 0} +}; + +static struct dirty_bit_map brw_bits[] = { + DEFINE_BIT(BRW_NEW_URB_FENCE), + DEFINE_BIT(BRW_NEW_FRAGMENT_PROGRAM), + DEFINE_BIT(BRW_NEW_VERTEX_PROGRAM), + DEFINE_BIT(BRW_NEW_INPUT_DIMENSIONS), + DEFINE_BIT(BRW_NEW_CURBE_OFFSETS), + DEFINE_BIT(BRW_NEW_REDUCED_PRIMITIVE), + DEFINE_BIT(BRW_NEW_PRIMITIVE), + DEFINE_BIT(BRW_NEW_CONTEXT), + DEFINE_BIT(BRW_NEW_WM_INPUT_DIMENSIONS), + DEFINE_BIT(BRW_NEW_PSP), + DEFINE_BIT(BRW_NEW_FENCE), + DEFINE_BIT(BRW_NEW_INDICES), + DEFINE_BIT(BRW_NEW_INDEX_BUFFER), + DEFINE_BIT(BRW_NEW_VERTICES), + DEFINE_BIT(BRW_NEW_BATCH), + DEFINE_BIT(BRW_NEW_DEPTH_BUFFER), + {0, 0, 0} +}; + +static struct dirty_bit_map cache_bits[] = { + DEFINE_BIT(CACHE_NEW_CC_VP), + DEFINE_BIT(CACHE_NEW_CC_UNIT), + DEFINE_BIT(CACHE_NEW_WM_PROG), + DEFINE_BIT(CACHE_NEW_SAMPLER_DEFAULT_COLOR), + DEFINE_BIT(CACHE_NEW_SAMPLER), + DEFINE_BIT(CACHE_NEW_WM_UNIT), + DEFINE_BIT(CACHE_NEW_SF_PROG), + DEFINE_BIT(CACHE_NEW_SF_VP), + DEFINE_BIT(CACHE_NEW_SF_UNIT), + DEFINE_BIT(CACHE_NEW_VS_UNIT), + DEFINE_BIT(CACHE_NEW_VS_PROG), + DEFINE_BIT(CACHE_NEW_GS_UNIT), + DEFINE_BIT(CACHE_NEW_GS_PROG), + DEFINE_BIT(CACHE_NEW_CLIP_VP), + DEFINE_BIT(CACHE_NEW_CLIP_UNIT), + DEFINE_BIT(CACHE_NEW_CLIP_PROG), + DEFINE_BIT(CACHE_NEW_SURFACE), + DEFINE_BIT(CACHE_NEW_SURF_BIND), + {0, 0, 0} +}; + + +static void +brw_update_dirty_count(struct dirty_bit_map *bit_map, int32_t bits) +{ + int i; + + for (i = 0; i < 32; i++) { + if (bit_map[i].bit == 0) + return; + + if (bit_map[i].bit & bits) + bit_map[i].count++; + } +} + +static void +brw_print_dirty_count(struct dirty_bit_map *bit_map, int32_t bits) +{ + int i; + + for (i = 0; i < 32; i++) { + if (bit_map[i].bit == 0) + return; + + fprintf(stderr, "0x%08x: %12d (%s)\n", + bit_map[i].bit, bit_map[i].count, bit_map[i].name); + } +} + +/*********************************************************************** + * Emit all state: + */ +void brw_validate_state( struct brw_context *brw ) +{ + GLcontext *ctx = &brw->intel.ctx; + struct intel_context *intel = &brw->intel; + struct brw_state_flags *state = &brw->state.dirty; + GLuint i; + + brw_clear_validated_bos(brw); + + state->mesa |= brw->intel.NewGLState; + brw->intel.NewGLState = 0; + + brw_add_validated_bo(brw, intel->batch->buf); + + if (brw->emit_state_always) { + state->mesa |= ~0; + state->brw |= ~0; + state->cache |= ~0; + } + + if (brw->fragment_program != ctx->FragmentProgram._Current) { + brw->fragment_program = ctx->FragmentProgram._Current; + brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM; + } + + if (brw->vertex_program != ctx->VertexProgram._Current) { + brw->vertex_program = ctx->VertexProgram._Current; + brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM; + } + + if (state->mesa == 0 && + state->cache == 0 && + state->brw == 0) + return; + + if (brw->state.dirty.brw & BRW_NEW_CONTEXT) + brw_clear_batch_cache(brw); + + brw->intel.Fallback = 0; + + /* do prepare stage for all atoms */ + for (i = 0; i < Elements(atoms); i++) { + const struct brw_tracked_state *atom = atoms[i]; + + if (brw->intel.Fallback) + break; + + if (check_state(state, &atom->dirty)) { + if (atom->prepare) { + atom->prepare(brw); + } + } + } + + /* Make sure that the textures which are referenced by the current + * brw fragment program are actually present/valid. + * If this fails, we can experience GPU lock-ups. + */ + { + const struct brw_fragment_program *fp; + fp = brw_fragment_program_const(brw->fragment_program); + if (fp) { + assert((fp->tex_units_used & ctx->Texture._EnabledUnits) + == fp->tex_units_used); + } + } +} + + +void brw_upload_state(struct brw_context *brw) +{ + struct brw_state_flags *state = &brw->state.dirty; + int i; + static int dirty_count = 0; + + brw_clear_validated_bos(brw); + + if (INTEL_DEBUG) { + /* Debug version which enforces various sanity checks on the + * state flags which are generated and checked to help ensure + * state atoms are ordered correctly in the list. + */ + struct brw_state_flags examined, prev; + _mesa_memset(&examined, 0, sizeof(examined)); + prev = *state; + + for (i = 0; i < Elements(atoms); i++) { + const struct brw_tracked_state *atom = atoms[i]; + struct brw_state_flags generated; + + assert(atom->dirty.mesa || + atom->dirty.brw || + atom->dirty.cache); + + if (brw->intel.Fallback) + break; + + if (check_state(state, &atom->dirty)) { + if (atom->emit) { + atom->emit( brw ); + } + } + + accumulate_state(&examined, &atom->dirty); + + /* generated = (prev ^ state) + * if (examined & generated) + * fail; + */ + xor_states(&generated, &prev, state); + assert(!check_state(&examined, &generated)); + prev = *state; + } + } + else { + for (i = 0; i < Elements(atoms); i++) { + const struct brw_tracked_state *atom = atoms[i]; + + if (brw->intel.Fallback) + break; + + if (check_state(state, &atom->dirty)) { + if (atom->emit) { + atom->emit( brw ); + } + } + } + } + + if (INTEL_DEBUG & DEBUG_STATE) { + brw_update_dirty_count(mesa_bits, state->mesa); + brw_update_dirty_count(brw_bits, state->brw); + brw_update_dirty_count(cache_bits, state->cache); + if (dirty_count++ % 1000 == 0) { + brw_print_dirty_count(mesa_bits, state->mesa); + brw_print_dirty_count(brw_bits, state->brw); + brw_print_dirty_count(cache_bits, state->cache); + fprintf(stderr, "\n"); + } + } + + if (!brw->intel.Fallback) + memset(state, 0, sizeof(*state)); +} diff --git a/src/gallium/drivers/i965/brw_structs.h b/src/gallium/drivers/i965/brw_structs.h new file mode 100644 index 00000000000..66d4127271a --- /dev/null +++ b/src/gallium/drivers/i965/brw_structs.h @@ -0,0 +1,1575 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#ifndef BRW_STRUCTS_H +#define BRW_STRUCTS_H + + +/** Number of general purpose registers (VS, WM, etc) */ +#define BRW_MAX_GRF 128 + +/** Number of message register file registers */ +#define BRW_MAX_MRF 16 + + +/* Command packets: + */ +struct header +{ + GLuint length:16; + GLuint opcode:16; +}; + + +union header_union +{ + struct header bits; + GLuint dword; +}; + +struct brw_3d_control +{ + struct + { + GLuint length:8; + GLuint notify_enable:1; + GLuint pad:3; + GLuint wc_flush_enable:1; + GLuint depth_stall_enable:1; + GLuint operation:2; + GLuint opcode:16; + } header; + + struct + { + GLuint pad:2; + GLuint dest_addr_type:1; + GLuint dest_addr:29; + } dest; + + GLuint dword2; + GLuint dword3; +}; + + +struct brw_3d_primitive +{ + struct + { + GLuint length:8; + GLuint pad:2; + GLuint topology:5; + GLuint indexed:1; + GLuint opcode:16; + } header; + + GLuint verts_per_instance; + GLuint start_vert_location; + GLuint instance_count; + GLuint start_instance_location; + GLuint base_vert_location; +}; + +/* These seem to be passed around as function args, so it works out + * better to keep them as #defines: + */ +#define BRW_FLUSH_READ_CACHE 0x1 +#define BRW_FLUSH_STATE_CACHE 0x2 +#define BRW_INHIBIT_FLUSH_RENDER_CACHE 0x4 +#define BRW_FLUSH_SNAPSHOT_COUNTERS 0x8 + +struct brw_mi_flush +{ + GLuint flags:4; + GLuint pad:12; + GLuint opcode:16; +}; + +struct brw_vf_statistics +{ + GLuint statistics_enable:1; + GLuint pad:15; + GLuint opcode:16; +}; + + + +struct brw_binding_table_pointers +{ + struct header header; + GLuint vs; + GLuint gs; + GLuint clp; + GLuint sf; + GLuint wm; +}; + + +struct brw_blend_constant_color +{ + struct header header; + GLfloat blend_constant_color[4]; +}; + + +struct brw_depthbuffer +{ + union header_union header; + + union { + struct { + GLuint pitch:18; + GLuint format:3; + GLuint pad:2; + GLuint software_tiled_rendering_mode:2; + GLuint depth_offset_disable:1; + GLuint tile_walk:1; + GLuint tiled_surface:1; + GLuint pad2:1; + GLuint surface_type:3; + } bits; + GLuint dword; + } dword1; + + GLuint dword2_base_addr; + + union { + struct { + GLuint pad:1; + GLuint mipmap_layout:1; + GLuint lod:4; + GLuint width:13; + GLuint height:13; + } bits; + GLuint dword; + } dword3; + + union { + struct { + GLuint pad:10; + GLuint min_array_element:11; + GLuint depth:11; + } bits; + GLuint dword; + } dword4; +}; + +struct brw_depthbuffer_g4x +{ + union header_union header; + + union { + struct { + GLuint pitch:18; + GLuint format:3; + GLuint pad:2; + GLuint software_tiled_rendering_mode:2; + GLuint depth_offset_disable:1; + GLuint tile_walk:1; + GLuint tiled_surface:1; + GLuint pad2:1; + GLuint surface_type:3; + } bits; + GLuint dword; + } dword1; + + GLuint dword2_base_addr; + + union { + struct { + GLuint pad:1; + GLuint mipmap_layout:1; + GLuint lod:4; + GLuint width:13; + GLuint height:13; + } bits; + GLuint dword; + } dword3; + + union { + struct { + GLuint pad:10; + GLuint min_array_element:11; + GLuint depth:11; + } bits; + GLuint dword; + } dword4; + + union { + struct { + GLuint xoffset:16; + GLuint yoffset:16; + } bits; + GLuint dword; + } dword5; /* NEW in Integrated Graphics Device */ +}; + +struct brw_drawrect +{ + struct header header; + GLuint xmin:16; + GLuint ymin:16; + GLuint xmax:16; + GLuint ymax:16; + GLuint xorg:16; + GLuint yorg:16; +}; + + + + +struct brw_global_depth_offset_clamp +{ + struct header header; + GLfloat depth_offset_clamp; +}; + +struct brw_indexbuffer +{ + union { + struct + { + GLuint length:8; + GLuint index_format:2; + GLuint cut_index_enable:1; + GLuint pad:5; + GLuint opcode:16; + } bits; + GLuint dword; + + } header; + + GLuint buffer_start; + GLuint buffer_end; +}; + +/* NEW in Integrated Graphics Device */ +struct brw_aa_line_parameters +{ + struct header header; + + struct { + GLuint aa_coverage_scope:8; + GLuint pad0:8; + GLuint aa_coverage_bias:8; + GLuint pad1:8; + } bits0; + + struct { + GLuint aa_coverage_endcap_slope:8; + GLuint pad0:8; + GLuint aa_coverage_endcap_bias:8; + GLuint pad1:8; + } bits1; +}; + +struct brw_line_stipple +{ + struct header header; + + struct + { + GLuint pattern:16; + GLuint pad:16; + } bits0; + + struct + { + GLuint repeat_count:9; + GLuint pad:7; + GLuint inverse_repeat_count:16; + } bits1; +}; + + +struct brw_pipelined_state_pointers +{ + struct header header; + + struct { + GLuint pad:5; + GLuint offset:27; /* Offset from GENERAL_STATE_BASE */ + } vs; + + struct + { + GLuint enable:1; + GLuint pad:4; + GLuint offset:27; /* Offset from GENERAL_STATE_BASE */ + } gs; + + struct + { + GLuint enable:1; + GLuint pad:4; + GLuint offset:27; /* Offset from GENERAL_STATE_BASE */ + } clp; + + struct + { + GLuint pad:5; + GLuint offset:27; /* Offset from GENERAL_STATE_BASE */ + } sf; + + struct + { + GLuint pad:5; + GLuint offset:27; /* Offset from GENERAL_STATE_BASE */ + } wm; + + struct + { + GLuint pad:5; + GLuint offset:27; /* Offset from GENERAL_STATE_BASE. KW: check me! */ + } cc; +}; + + +struct brw_polygon_stipple_offset +{ + struct header header; + + struct { + GLuint y_offset:5; + GLuint pad:3; + GLuint x_offset:5; + GLuint pad0:19; + } bits0; +}; + + + +struct brw_polygon_stipple +{ + struct header header; + GLuint stipple[32]; +}; + + + +struct brw_pipeline_select +{ + struct + { + GLuint pipeline_select:1; + GLuint pad:15; + GLuint opcode:16; + } header; +}; + + +struct brw_pipe_control +{ + struct + { + GLuint length:8; + GLuint notify_enable:1; + GLuint texture_cache_flush_enable:1; + GLuint indirect_state_pointers_disable:1; + GLuint instruction_state_cache_flush_enable:1; + GLuint write_cache_flush_enable:1; + GLuint depth_stall_enable:1; + GLuint post_sync_operation:2; + + GLuint opcode:16; + } header; + + struct + { + GLuint pad:2; + GLuint dest_addr_type:1; + GLuint dest_addr:29; + } bits1; + + GLuint data0; + GLuint data1; +}; + + +struct brw_urb_fence +{ + struct + { + GLuint length:8; + GLuint vs_realloc:1; + GLuint gs_realloc:1; + GLuint clp_realloc:1; + GLuint sf_realloc:1; + GLuint vfe_realloc:1; + GLuint cs_realloc:1; + GLuint pad:2; + GLuint opcode:16; + } header; + + struct + { + GLuint vs_fence:10; + GLuint gs_fence:10; + GLuint clp_fence:10; + GLuint pad:2; + } bits0; + + struct + { + GLuint sf_fence:10; + GLuint vf_fence:10; + GLuint cs_fence:11; + GLuint pad:1; + } bits1; +}; + +struct brw_cs_urb_state +{ + struct header header; + + struct + { + GLuint nr_urb_entries:3; + GLuint pad:1; + GLuint urb_entry_size:5; + GLuint pad0:23; + } bits0; +}; + +struct brw_constant_buffer +{ + struct + { + GLuint length:8; + GLuint valid:1; + GLuint pad:7; + GLuint opcode:16; + } header; + + struct + { + GLuint buffer_length:6; + GLuint buffer_address:26; + } bits0; +}; + +struct brw_state_base_address +{ + struct header header; + + struct + { + GLuint modify_enable:1; + GLuint pad:4; + GLuint general_state_address:27; + } bits0; + + struct + { + GLuint modify_enable:1; + GLuint pad:4; + GLuint surface_state_address:27; + } bits1; + + struct + { + GLuint modify_enable:1; + GLuint pad:4; + GLuint indirect_object_state_address:27; + } bits2; + + struct + { + GLuint modify_enable:1; + GLuint pad:11; + GLuint general_state_upper_bound:20; + } bits3; + + struct + { + GLuint modify_enable:1; + GLuint pad:11; + GLuint indirect_object_state_upper_bound:20; + } bits4; +}; + +struct brw_state_prefetch +{ + struct header header; + + struct + { + GLuint prefetch_count:3; + GLuint pad:3; + GLuint prefetch_pointer:26; + } bits0; +}; + +struct brw_system_instruction_pointer +{ + struct header header; + + struct + { + GLuint pad:4; + GLuint system_instruction_pointer:28; + } bits0; +}; + + + + +/* State structs for the various fixed function units: + */ + + +struct thread0 +{ + GLuint pad0:1; + GLuint grf_reg_count:3; + GLuint pad1:2; + GLuint kernel_start_pointer:26; /* Offset from GENERAL_STATE_BASE */ +}; + +struct thread1 +{ + GLuint ext_halt_exception_enable:1; + GLuint sw_exception_enable:1; + GLuint mask_stack_exception_enable:1; + GLuint timeout_exception_enable:1; + GLuint illegal_op_exception_enable:1; + GLuint pad0:3; + GLuint depth_coef_urb_read_offset:6; /* WM only */ + GLuint pad1:2; + GLuint floating_point_mode:1; + GLuint thread_priority:1; + GLuint binding_table_entry_count:8; + GLuint pad3:5; + GLuint single_program_flow:1; +}; + +struct thread2 +{ + GLuint per_thread_scratch_space:4; + GLuint pad0:6; + GLuint scratch_space_base_pointer:22; +}; + + +struct thread3 +{ + GLuint dispatch_grf_start_reg:4; + GLuint urb_entry_read_offset:6; + GLuint pad0:1; + GLuint urb_entry_read_length:6; + GLuint pad1:1; + GLuint const_urb_entry_read_offset:6; + GLuint pad2:1; + GLuint const_urb_entry_read_length:6; + GLuint pad3:1; +}; + + + +struct brw_clip_unit_state +{ + struct thread0 thread0; + struct + { + GLuint pad0:7; + GLuint sw_exception_enable:1; + GLuint pad1:3; + GLuint mask_stack_exception_enable:1; + GLuint pad2:1; + GLuint illegal_op_exception_enable:1; + GLuint pad3:2; + GLuint floating_point_mode:1; + GLuint thread_priority:1; + GLuint binding_table_entry_count:8; + GLuint pad4:5; + GLuint single_program_flow:1; + } thread1; + + struct thread2 thread2; + struct thread3 thread3; + + struct + { + GLuint pad0:9; + GLuint gs_output_stats:1; /* not always */ + GLuint stats_enable:1; + GLuint nr_urb_entries:7; + GLuint pad1:1; + GLuint urb_entry_allocation_size:5; + GLuint pad2:1; + GLuint max_threads:5; /* may be less */ + GLuint pad3:2; + } thread4; + + struct + { + GLuint pad0:13; + GLuint clip_mode:3; + GLuint userclip_enable_flags:8; + GLuint userclip_must_clip:1; + GLuint negative_w_clip_test:1; + GLuint guard_band_enable:1; + GLuint viewport_z_clip_enable:1; + GLuint viewport_xy_clip_enable:1; + GLuint vertex_position_space:1; + GLuint api_mode:1; + GLuint pad2:1; + } clip5; + + struct + { + GLuint pad0:5; + GLuint clipper_viewport_state_ptr:27; + } clip6; + + + GLfloat viewport_xmin; + GLfloat viewport_xmax; + GLfloat viewport_ymin; + GLfloat viewport_ymax; +}; + + + +struct brw_cc_unit_state +{ + struct + { + GLuint pad0:3; + GLuint bf_stencil_pass_depth_pass_op:3; + GLuint bf_stencil_pass_depth_fail_op:3; + GLuint bf_stencil_fail_op:3; + GLuint bf_stencil_func:3; + GLuint bf_stencil_enable:1; + GLuint pad1:2; + GLuint stencil_write_enable:1; + GLuint stencil_pass_depth_pass_op:3; + GLuint stencil_pass_depth_fail_op:3; + GLuint stencil_fail_op:3; + GLuint stencil_func:3; + GLuint stencil_enable:1; + } cc0; + + + struct + { + GLuint bf_stencil_ref:8; + GLuint stencil_write_mask:8; + GLuint stencil_test_mask:8; + GLuint stencil_ref:8; + } cc1; + + + struct + { + GLuint logicop_enable:1; + GLuint pad0:10; + GLuint depth_write_enable:1; + GLuint depth_test_function:3; + GLuint depth_test:1; + GLuint bf_stencil_write_mask:8; + GLuint bf_stencil_test_mask:8; + } cc2; + + + struct + { + GLuint pad0:8; + GLuint alpha_test_func:3; + GLuint alpha_test:1; + GLuint blend_enable:1; + GLuint ia_blend_enable:1; + GLuint pad1:1; + GLuint alpha_test_format:1; + GLuint pad2:16; + } cc3; + + struct + { + GLuint pad0:5; + GLuint cc_viewport_state_offset:27; /* Offset from GENERAL_STATE_BASE */ + } cc4; + + struct + { + GLuint pad0:2; + GLuint ia_dest_blend_factor:5; + GLuint ia_src_blend_factor:5; + GLuint ia_blend_function:3; + GLuint statistics_enable:1; + GLuint logicop_func:4; + GLuint pad1:11; + GLuint dither_enable:1; + } cc5; + + struct + { + GLuint clamp_post_alpha_blend:1; + GLuint clamp_pre_alpha_blend:1; + GLuint clamp_range:2; + GLuint pad0:11; + GLuint y_dither_offset:2; + GLuint x_dither_offset:2; + GLuint dest_blend_factor:5; + GLuint src_blend_factor:5; + GLuint blend_function:3; + } cc6; + + struct { + union { + GLfloat f; + GLubyte ub[4]; + } alpha_ref; + } cc7; +}; + + + +struct brw_sf_unit_state +{ + struct thread0 thread0; + struct thread1 thread1; + struct thread2 thread2; + struct thread3 thread3; + + struct + { + GLuint pad0:10; + GLuint stats_enable:1; + GLuint nr_urb_entries:7; + GLuint pad1:1; + GLuint urb_entry_allocation_size:5; + GLuint pad2:1; + GLuint max_threads:6; + GLuint pad3:1; + } thread4; + + struct + { + GLuint front_winding:1; + GLuint viewport_transform:1; + GLuint pad0:3; + GLuint sf_viewport_state_offset:27; /* Offset from GENERAL_STATE_BASE */ + } sf5; + + struct + { + GLuint pad0:9; + GLuint dest_org_vbias:4; + GLuint dest_org_hbias:4; + GLuint scissor:1; + GLuint disable_2x2_trifilter:1; + GLuint disable_zero_pix_trifilter:1; + GLuint point_rast_rule:2; + GLuint line_endcap_aa_region_width:2; + GLuint line_width:4; + GLuint fast_scissor_disable:1; + GLuint cull_mode:2; + GLuint aa_enable:1; + } sf6; + + struct + { + GLuint point_size:11; + GLuint use_point_size_state:1; + GLuint subpixel_precision:1; + GLuint sprite_point:1; + GLuint pad0:10; + GLuint aa_line_distance_mode:1; + GLuint trifan_pv:2; + GLuint linestrip_pv:2; + GLuint tristrip_pv:2; + GLuint line_last_pixel_enable:1; + } sf7; + +}; + + +struct brw_gs_unit_state +{ + struct thread0 thread0; + struct thread1 thread1; + struct thread2 thread2; + struct thread3 thread3; + + struct + { + GLuint pad0:8; + GLuint rendering_enable:1; /* for IGDNG */ + GLuint pad4:1; + GLuint stats_enable:1; + GLuint nr_urb_entries:7; + GLuint pad1:1; + GLuint urb_entry_allocation_size:5; + GLuint pad2:1; + GLuint max_threads:5; + GLuint pad3:2; + } thread4; + + struct + { + GLuint sampler_count:3; + GLuint pad0:2; + GLuint sampler_state_pointer:27; + } gs5; + + + struct + { + GLuint max_vp_index:4; + GLuint pad0:12; + GLuint svbi_post_inc_value:10; + GLuint pad1:1; + GLuint svbi_post_inc_enable:1; + GLuint svbi_payload:1; + GLuint discard_adjaceny:1; + GLuint reorder_enable:1; + GLuint pad2:1; + } gs6; +}; + + +struct brw_vs_unit_state +{ + struct thread0 thread0; + struct thread1 thread1; + struct thread2 thread2; + struct thread3 thread3; + + struct + { + GLuint pad0:10; + GLuint stats_enable:1; + GLuint nr_urb_entries:7; + GLuint pad1:1; + GLuint urb_entry_allocation_size:5; + GLuint pad2:1; + GLuint max_threads:6; + GLuint pad3:1; + } thread4; + + struct + { + GLuint sampler_count:3; + GLuint pad0:2; + GLuint sampler_state_pointer:27; + } vs5; + + struct + { + GLuint vs_enable:1; + GLuint vert_cache_disable:1; + GLuint pad0:30; + } vs6; +}; + + +struct brw_wm_unit_state +{ + struct thread0 thread0; + struct thread1 thread1; + struct thread2 thread2; + struct thread3 thread3; + + struct { + GLuint stats_enable:1; + GLuint depth_buffer_clear:1; + GLuint sampler_count:3; + GLuint sampler_state_pointer:27; + } wm4; + + struct + { + GLuint enable_8_pix:1; + GLuint enable_16_pix:1; + GLuint enable_32_pix:1; + GLuint enable_con_32_pix:1; + GLuint enable_con_64_pix:1; + GLuint pad0:5; + GLuint legacy_global_depth_bias:1; + GLuint line_stipple:1; + GLuint depth_offset:1; + GLuint polygon_stipple:1; + GLuint line_aa_region_width:2; + GLuint line_endcap_aa_region_width:2; + GLuint early_depth_test:1; + GLuint thread_dispatch_enable:1; + GLuint program_uses_depth:1; + GLuint program_computes_depth:1; + GLuint program_uses_killpixel:1; + GLuint legacy_line_rast: 1; + GLuint transposed_urb_read_enable:1; + GLuint max_threads:7; + } wm5; + + GLfloat global_depth_offset_constant; + GLfloat global_depth_offset_scale; + + /* for IGDNG only */ + struct { + GLuint pad0:1; + GLuint grf_reg_count_1:3; + GLuint pad1:2; + GLuint kernel_start_pointer_1:26; + } wm8; + + struct { + GLuint pad0:1; + GLuint grf_reg_count_2:3; + GLuint pad1:2; + GLuint kernel_start_pointer_2:26; + } wm9; + + struct { + GLuint pad0:1; + GLuint grf_reg_count_3:3; + GLuint pad1:2; + GLuint kernel_start_pointer_3:26; + } wm10; +}; + +struct brw_sampler_default_color { + GLfloat color[4]; +}; + +struct brw_sampler_state +{ + + struct + { + GLuint shadow_function:3; + GLuint lod_bias:11; + GLuint min_filter:3; + GLuint mag_filter:3; + GLuint mip_filter:2; + GLuint base_level:5; + GLuint pad:1; + GLuint lod_preclamp:1; + GLuint default_color_mode:1; + GLuint pad0:1; + GLuint disable:1; + } ss0; + + struct + { + GLuint r_wrap_mode:3; + GLuint t_wrap_mode:3; + GLuint s_wrap_mode:3; + GLuint pad:3; + GLuint max_lod:10; + GLuint min_lod:10; + } ss1; + + + struct + { + GLuint pad:5; + GLuint default_color_pointer:27; + } ss2; + + struct + { + GLuint pad:19; + GLuint max_aniso:3; + GLuint chroma_key_mode:1; + GLuint chroma_key_index:2; + GLuint chroma_key_enable:1; + GLuint monochrome_filter_width:3; + GLuint monochrome_filter_height:3; + } ss3; +}; + + +struct brw_clipper_viewport +{ + GLfloat xmin; + GLfloat xmax; + GLfloat ymin; + GLfloat ymax; +}; + +struct brw_cc_viewport +{ + GLfloat min_depth; + GLfloat max_depth; +}; + +struct brw_sf_viewport +{ + struct { + GLfloat m00; + GLfloat m11; + GLfloat m22; + GLfloat m30; + GLfloat m31; + GLfloat m32; + } viewport; + + /* scissor coordinates are inclusive */ + struct { + GLshort xmin; + GLshort ymin; + GLshort xmax; + GLshort ymax; + } scissor; +}; + +/* Documented in the subsystem/shared-functions/sampler chapter... + */ +struct brw_surface_state +{ + struct { + GLuint cube_pos_z:1; + GLuint cube_neg_z:1; + GLuint cube_pos_y:1; + GLuint cube_neg_y:1; + GLuint cube_pos_x:1; + GLuint cube_neg_x:1; + GLuint pad:4; + GLuint mipmap_layout_mode:1; + GLuint vert_line_stride_ofs:1; + GLuint vert_line_stride:1; + GLuint color_blend:1; + GLuint writedisable_blue:1; + GLuint writedisable_green:1; + GLuint writedisable_red:1; + GLuint writedisable_alpha:1; + GLuint surface_format:9; /**< BRW_SURFACEFORMAT_x */ + GLuint data_return_format:1; + GLuint pad0:1; + GLuint surface_type:3; /**< BRW_SURFACE_1D/2D/3D/CUBE */ + } ss0; + + struct { + GLuint base_addr; + } ss1; + + struct { + GLuint pad:2; + GLuint mip_count:4; + GLuint width:13; + GLuint height:13; + } ss2; + + struct { + GLuint tile_walk:1; + GLuint tiled_surface:1; + GLuint pad:1; + GLuint pitch:18; + GLuint depth:11; + } ss3; + + struct { + GLuint multisample_position_palette_index:3; + GLuint pad1:1; + GLuint num_multisamples:3; + GLuint pad0:1; + GLuint render_target_view_extent:9; + GLuint min_array_elt:11; + GLuint min_lod:4; + } ss4; + + struct { + GLuint pad1:16; + GLuint llc_mapping:1; + GLuint mlc_mapping:1; + GLuint gfdt:1; + GLuint gfdt_src:1; + GLuint y_offset:4; + GLuint pad0:1; + GLuint x_offset:7; + } ss5; /* New in G4X */ + +}; + + + +struct brw_vertex_buffer_state +{ + struct { + GLuint pitch:11; + GLuint pad:15; + GLuint access_type:1; + GLuint vb_index:5; + } vb0; + + GLuint start_addr; + GLuint max_index; +#if 1 + GLuint instance_data_step_rate; /* not included for sequential/random vertices? */ +#endif +}; + +#define BRW_VBP_MAX 17 + +struct brw_vb_array_state { + struct header header; + struct brw_vertex_buffer_state vb[BRW_VBP_MAX]; +}; + + +struct brw_vertex_element_state +{ + struct + { + GLuint src_offset:11; + GLuint pad:5; + GLuint src_format:9; + GLuint pad0:1; + GLuint valid:1; + GLuint vertex_buffer_index:5; + } ve0; + + struct + { + GLuint dst_offset:8; + GLuint pad:8; + GLuint vfcomponent3:4; + GLuint vfcomponent2:4; + GLuint vfcomponent1:4; + GLuint vfcomponent0:4; + } ve1; +}; + +#define BRW_VEP_MAX 18 + +struct brw_vertex_element_packet { + struct header header; + struct brw_vertex_element_state ve[BRW_VEP_MAX]; /* note: less than _TNL_ATTRIB_MAX */ +}; + + +struct brw_urb_immediate { + GLuint opcode:4; + GLuint offset:6; + GLuint swizzle_control:2; + GLuint pad:1; + GLuint allocate:1; + GLuint used:1; + GLuint complete:1; + GLuint response_length:4; + GLuint msg_length:4; + GLuint msg_target:4; + GLuint pad1:3; + GLuint end_of_thread:1; +}; + +/* Instruction format for the execution units: + */ + +struct brw_instruction +{ + struct + { + GLuint opcode:7; + GLuint pad:1; + GLuint access_mode:1; + GLuint mask_control:1; + GLuint dependency_control:2; + GLuint compression_control:2; + GLuint thread_control:2; + GLuint predicate_control:4; + GLuint predicate_inverse:1; + GLuint execution_size:3; + GLuint destreg__conditionalmod:4; /* destreg - send, conditionalmod - others */ + GLuint pad0:2; + GLuint debug_control:1; + GLuint saturate:1; + } header; + + union { + struct + { + GLuint dest_reg_file:2; + GLuint dest_reg_type:3; + GLuint src0_reg_file:2; + GLuint src0_reg_type:3; + GLuint src1_reg_file:2; + GLuint src1_reg_type:3; + GLuint pad:1; + GLuint dest_subreg_nr:5; + GLuint dest_reg_nr:8; + GLuint dest_horiz_stride:2; + GLuint dest_address_mode:1; + } da1; + + struct + { + GLuint dest_reg_file:2; + GLuint dest_reg_type:3; + GLuint src0_reg_file:2; + GLuint src0_reg_type:3; + GLuint src1_reg_file:2; /* 0x00000c00 */ + GLuint src1_reg_type:3; /* 0x00007000 */ + GLuint pad:1; + GLint dest_indirect_offset:10; /* offset against the deref'd address reg */ + GLuint dest_subreg_nr:3; /* subnr for the address reg a0.x */ + GLuint dest_horiz_stride:2; + GLuint dest_address_mode:1; + } ia1; + + struct + { + GLuint dest_reg_file:2; + GLuint dest_reg_type:3; + GLuint src0_reg_file:2; + GLuint src0_reg_type:3; + GLuint src1_reg_file:2; + GLuint src1_reg_type:3; + GLuint pad:1; + GLuint dest_writemask:4; + GLuint dest_subreg_nr:1; + GLuint dest_reg_nr:8; + GLuint pad1:2; + GLuint dest_address_mode:1; + } da16; + + struct + { + GLuint dest_reg_file:2; + GLuint dest_reg_type:3; + GLuint src0_reg_file:2; + GLuint src0_reg_type:3; + GLuint pad0:6; + GLuint dest_writemask:4; + GLint dest_indirect_offset:6; + GLuint dest_subreg_nr:3; + GLuint pad1:2; + GLuint dest_address_mode:1; + } ia16; + } bits1; + + + union { + struct + { + GLuint src0_subreg_nr:5; + GLuint src0_reg_nr:8; + GLuint src0_abs:1; + GLuint src0_negate:1; + GLuint src0_address_mode:1; + GLuint src0_horiz_stride:2; + GLuint src0_width:3; + GLuint src0_vert_stride:4; + GLuint flag_reg_nr:1; + GLuint pad:6; + } da1; + + struct + { + GLint src0_indirect_offset:10; + GLuint src0_subreg_nr:3; + GLuint src0_abs:1; + GLuint src0_negate:1; + GLuint src0_address_mode:1; + GLuint src0_horiz_stride:2; + GLuint src0_width:3; + GLuint src0_vert_stride:4; + GLuint flag_reg_nr:1; + GLuint pad:6; + } ia1; + + struct + { + GLuint src0_swz_x:2; + GLuint src0_swz_y:2; + GLuint src0_subreg_nr:1; + GLuint src0_reg_nr:8; + GLuint src0_abs:1; + GLuint src0_negate:1; + GLuint src0_address_mode:1; + GLuint src0_swz_z:2; + GLuint src0_swz_w:2; + GLuint pad0:1; + GLuint src0_vert_stride:4; + GLuint flag_reg_nr:1; + GLuint pad1:6; + } da16; + + struct + { + GLuint src0_swz_x:2; + GLuint src0_swz_y:2; + GLint src0_indirect_offset:6; + GLuint src0_subreg_nr:3; + GLuint src0_abs:1; + GLuint src0_negate:1; + GLuint src0_address_mode:1; + GLuint src0_swz_z:2; + GLuint src0_swz_w:2; + GLuint pad0:1; + GLuint src0_vert_stride:4; + GLuint flag_reg_nr:1; + GLuint pad1:6; + } ia16; + + struct + { + GLuint pad:26; + GLuint end_of_thread:1; + GLuint pad1:1; + GLuint sfid:4; + } send_igdng; /* for IGDNG only */ + + } bits2; + + union + { + struct + { + GLuint src1_subreg_nr:5; + GLuint src1_reg_nr:8; + GLuint src1_abs:1; + GLuint src1_negate:1; + GLuint src1_address_mode:1; + GLuint src1_horiz_stride:2; + GLuint src1_width:3; + GLuint src1_vert_stride:4; + GLuint pad0:7; + } da1; + + struct + { + GLuint src1_swz_x:2; + GLuint src1_swz_y:2; + GLuint src1_subreg_nr:1; + GLuint src1_reg_nr:8; + GLuint src1_abs:1; + GLuint src1_negate:1; + GLuint src1_address_mode:1; + GLuint src1_swz_z:2; + GLuint src1_swz_w:2; + GLuint pad1:1; + GLuint src1_vert_stride:4; + GLuint pad2:7; + } da16; + + struct + { + GLint src1_indirect_offset:10; + GLuint src1_subreg_nr:3; + GLuint src1_abs:1; + GLuint src1_negate:1; + GLuint src1_address_mode:1; + GLuint src1_horiz_stride:2; + GLuint src1_width:3; + GLuint src1_vert_stride:4; + GLuint flag_reg_nr:1; + GLuint pad1:6; + } ia1; + + struct + { + GLuint src1_swz_x:2; + GLuint src1_swz_y:2; + GLint src1_indirect_offset:6; + GLuint src1_subreg_nr:3; + GLuint src1_abs:1; + GLuint src1_negate:1; + GLuint pad0:1; + GLuint src1_swz_z:2; + GLuint src1_swz_w:2; + GLuint pad1:1; + GLuint src1_vert_stride:4; + GLuint flag_reg_nr:1; + GLuint pad2:6; + } ia16; + + + struct + { + GLint jump_count:16; /* note: signed */ + GLuint pop_count:4; + GLuint pad0:12; + } if_else; + + struct { + GLuint function:4; + GLuint int_type:1; + GLuint precision:1; + GLuint saturate:1; + GLuint data_type:1; + GLuint pad0:8; + GLuint response_length:4; + GLuint msg_length:4; + GLuint msg_target:4; + GLuint pad1:3; + GLuint end_of_thread:1; + } math; + + struct { + GLuint function:4; + GLuint int_type:1; + GLuint precision:1; + GLuint saturate:1; + GLuint data_type:1; + GLuint snapshot:1; + GLuint pad0:10; + GLuint header_present:1; + GLuint response_length:5; + GLuint msg_length:4; + GLuint pad1:2; + GLuint end_of_thread:1; + } math_igdng; + + struct { + GLuint binding_table_index:8; + GLuint sampler:4; + GLuint return_format:2; + GLuint msg_type:2; + GLuint response_length:4; + GLuint msg_length:4; + GLuint msg_target:4; + GLuint pad1:3; + GLuint end_of_thread:1; + } sampler; + + struct { + GLuint binding_table_index:8; + GLuint sampler:4; + GLuint msg_type:4; + GLuint response_length:4; + GLuint msg_length:4; + GLuint msg_target:4; + GLuint pad1:3; + GLuint end_of_thread:1; + } sampler_g4x; + + struct { + GLuint binding_table_index:8; + GLuint sampler:4; + GLuint msg_type:4; + GLuint simd_mode:2; + GLuint pad0:1; + GLuint header_present:1; + GLuint response_length:5; + GLuint msg_length:4; + GLuint pad1:2; + GLuint end_of_thread:1; + } sampler_igdng; + + struct brw_urb_immediate urb; + + struct { + GLuint opcode:4; + GLuint offset:6; + GLuint swizzle_control:2; + GLuint pad:1; + GLuint allocate:1; + GLuint used:1; + GLuint complete:1; + GLuint pad0:3; + GLuint header_present:1; + GLuint response_length:5; + GLuint msg_length:4; + GLuint pad1:2; + GLuint end_of_thread:1; + } urb_igdng; + + struct { + GLuint binding_table_index:8; + GLuint msg_control:4; + GLuint msg_type:2; + GLuint target_cache:2; + GLuint response_length:4; + GLuint msg_length:4; + GLuint msg_target:4; + GLuint pad1:3; + GLuint end_of_thread:1; + } dp_read; + + struct { + GLuint binding_table_index:8; + GLuint msg_control:3; + GLuint msg_type:3; + GLuint target_cache:2; + GLuint pad0:3; + GLuint header_present:1; + GLuint response_length:5; + GLuint msg_length:4; + GLuint pad1:2; + GLuint end_of_thread:1; + } dp_read_igdng; + + struct { + GLuint binding_table_index:8; + GLuint msg_control:3; + GLuint pixel_scoreboard_clear:1; + GLuint msg_type:3; + GLuint send_commit_msg:1; + GLuint response_length:4; + GLuint msg_length:4; + GLuint msg_target:4; + GLuint pad1:3; + GLuint end_of_thread:1; + } dp_write; + + struct { + GLuint binding_table_index:8; + GLuint msg_control:3; + GLuint pixel_scoreboard_clear:1; + GLuint msg_type:3; + GLuint send_commit_msg:1; + GLuint pad0:3; + GLuint header_present:1; + GLuint response_length:5; + GLuint msg_length:4; + GLuint pad1:2; + GLuint end_of_thread:1; + } dp_write_igdng; + + struct { + GLuint pad:16; + GLuint response_length:4; + GLuint msg_length:4; + GLuint msg_target:4; + GLuint pad1:3; + GLuint end_of_thread:1; + } generic; + + struct { + GLuint pad:19; + GLuint header_present:1; + GLuint response_length:5; + GLuint msg_length:4; + GLuint pad1:2; + GLuint end_of_thread:1; + } generic_igdng; + + GLint d; + GLuint ud; + float f; + } bits3; +}; + + +#endif diff --git a/src/gallium/drivers/i965/brw_tex.c b/src/gallium/drivers/i965/brw_tex.c new file mode 100644 index 00000000000..e911b105b23 --- /dev/null +++ b/src/gallium/drivers/i965/brw_tex.c @@ -0,0 +1,59 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "main/glheader.h" +#include "main/mtypes.h" +#include "main/teximage.h" + +#include "intel_context.h" +#include "intel_regions.h" +#include "intel_tex.h" +#include "brw_context.h" + +/** + * Finalizes all textures, completing any rendering that needs to be done + * to prepare them. + */ +void brw_validate_textures( struct brw_context *brw ) +{ + GLcontext *ctx = &brw->intel.ctx; + struct intel_context *intel = &brw->intel; + int i; + + for (i = 0; i < BRW_MAX_TEX_UNIT; i++) { + struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i]; + + if (texUnit->_ReallyEnabled) { + intel_finalize_mipmap_tree(intel, i); + } + } +} diff --git a/src/gallium/drivers/i965/brw_tex_layout.c b/src/gallium/drivers/i965/brw_tex_layout.c new file mode 100644 index 00000000000..5986cbffade --- /dev/null +++ b/src/gallium/drivers/i965/brw_tex_layout.c @@ -0,0 +1,222 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + +/* Code to layout images in a mipmap tree for i965. + */ + +#include "intel_mipmap_tree.h" +#include "intel_tex_layout.h" +#include "intel_context.h" +#include "main/macros.h" +#include "intel_chipset.h" + +#define FILE_DEBUG_FLAG DEBUG_MIPTREE + +GLboolean brw_miptree_layout(struct intel_context *intel, + struct intel_mipmap_tree *mt, + uint32_t tiling) +{ + /* XXX: these vary depending on image format: */ + /* GLint align_w = 4; */ + + switch (mt->target) { + case GL_TEXTURE_CUBE_MAP: + if (IS_IGDNG(intel->intelScreen->deviceID)) { + GLuint align_h = 2, align_w = 4; + GLuint level; + GLuint x = 0; + GLuint y = 0; + GLuint width = mt->width0; + GLuint height = mt->height0; + GLuint qpitch = 0; + GLuint y_pitch = 0; + + mt->pitch = mt->width0; + intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h); + y_pitch = ALIGN(height, align_h); + + if (mt->compressed) { + mt->pitch = ALIGN(mt->width0, align_w); + } + + if (mt->first_level != mt->last_level) { + GLuint mip1_width; + + if (mt->compressed) { + mip1_width = ALIGN(minify(mt->width0), align_w) + + ALIGN(minify(minify(mt->width0)), align_w); + } else { + mip1_width = ALIGN(minify(mt->width0), align_w) + + minify(minify(mt->width0)); + } + + if (mip1_width > mt->pitch) { + mt->pitch = mip1_width; + } + } + + mt->pitch = intel_miptree_pitch_align(intel, mt, tiling, mt->pitch); + + if (mt->compressed) { + qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * mt->pitch * mt->cpp; + mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * 6; + } else { + qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * mt->pitch * mt->cpp; + mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * 6; + } + + for (level = mt->first_level; level <= mt->last_level; level++) { + GLuint img_height; + GLuint nr_images = 6; + GLuint q = 0; + + intel_miptree_set_level_info(mt, level, nr_images, x, y, width, + height, 1); + + for (q = 0; q < nr_images; q++) + intel_miptree_set_image_offset_ex(mt, level, q, x, y, q * qpitch); + + if (mt->compressed) + img_height = MAX2(1, height/4); + else + img_height = ALIGN(height, align_h); + + if (level == mt->first_level + 1) { + x += ALIGN(width, align_w); + } + else { + y += img_height; + } + + width = minify(width); + height = minify(height); + } + + break; + } + + case GL_TEXTURE_3D: { + GLuint width = mt->width0; + GLuint height = mt->height0; + GLuint depth = mt->depth0; + GLuint pack_x_pitch, pack_x_nr; + GLuint pack_y_pitch; + GLuint level; + GLuint align_h = 2; + GLuint align_w = 4; + + mt->total_height = 0; + intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h); + + if (mt->compressed) { + mt->pitch = ALIGN(width, align_w); + pack_y_pitch = (height + 3) / 4; + } else { + mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0); + pack_y_pitch = ALIGN(mt->height0, align_h); + } + + pack_x_pitch = width; + pack_x_nr = 1; + + for (level = mt->first_level ; level <= mt->last_level ; level++) { + GLuint nr_images = mt->target == GL_TEXTURE_3D ? depth : 6; + GLint x = 0; + GLint y = 0; + GLint q, j; + + intel_miptree_set_level_info(mt, level, nr_images, + 0, mt->total_height, + width, height, depth); + + for (q = 0; q < nr_images;) { + for (j = 0; j < pack_x_nr && q < nr_images; j++, q++) { + intel_miptree_set_image_offset(mt, level, q, x, y); + x += pack_x_pitch; + } + + x = 0; + y += pack_y_pitch; + } + + + mt->total_height += y; + width = minify(width); + height = minify(height); + depth = minify(depth); + + if (mt->compressed) { + pack_y_pitch = (height + 3) / 4; + + if (pack_x_pitch > ALIGN(width, align_w)) { + pack_x_pitch = ALIGN(width, align_w); + pack_x_nr <<= 1; + } + } else { + if (pack_x_pitch > 4) { + pack_x_pitch >>= 1; + pack_x_nr <<= 1; + assert(pack_x_pitch * pack_x_nr <= mt->pitch); + } + + if (pack_y_pitch > 2) { + pack_y_pitch >>= 1; + pack_y_pitch = ALIGN(pack_y_pitch, align_h); + } + } + + } + /* The 965's sampler lays cachelines out according to how accesses + * in the texture surfaces run, so they may be "vertical" through + * memory. As a result, the docs say in Surface Padding Requirements: + * Sampling Engine Surfaces that two extra rows of padding are required. + * We don't know of similar requirements for pre-965, but given that + * those docs are silent on padding requirements in general, let's play + * it safe. + */ + if (mt->target == GL_TEXTURE_CUBE_MAP) + mt->total_height += 2; + break; + } + + default: + i945_miptree_layout_2d(intel, mt, tiling); + break; + } + DBG("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__, + mt->pitch, + mt->total_height, + mt->cpp, + mt->pitch * mt->total_height * mt->cpp ); + + return GL_TRUE; +} + diff --git a/src/gallium/drivers/i965/brw_urb.c b/src/gallium/drivers/i965/brw_urb.c new file mode 100644 index 00000000000..8c6f4355a6e --- /dev/null +++ b/src/gallium/drivers/i965/brw_urb.c @@ -0,0 +1,250 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + + +#include "intel_batchbuffer.h" +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" + +#define VS 0 +#define GS 1 +#define CLP 2 +#define SF 3 +#define CS 4 + +/** @file brw_urb.c + * + * Manages the division of the URB space between the various fixed-function + * units. + * + * See the Thread Initiation Management section of the GEN4 B-Spec, and + * the individual *_STATE structures for restrictions on numbers of + * entries and threads. + */ + +/* + * Generally, a unit requires a min_nr_entries based on how many entries + * it produces before the downstream unit gets unblocked and can use and + * dereference some of its handles. + * + * The SF unit preallocates a PUE at the start of thread dispatch, and only + * uses that one. So it requires one entry per thread. + * + * For CLIP, the SF unit will hold the previous primitive while the + * next is getting assembled, meaning that linestrips require 3 CLIP VUEs + * (vertices) to ensure continued processing, trifans require 4, and tristrips + * require 5. There can be 1 or 2 threads, and each has the same requirement. + * + * GS has the same requirement as CLIP, but it never handles tristrips, + * so we can lower the minimum to 4 for the POLYGONs (trifans) it produces. + * We only run it single-threaded. + * + * For VS, the number of entries may be 8, 12, 16, or 32 (or 64 on G4X). + * Each thread processes 2 preallocated VUEs (vertices) at a time, and they + * get streamed down as soon as threads processing earlier vertices get + * theirs accepted. + * + * Each unit will take the number of URB entries we give it (based on the + * entry size calculated in brw_vs_emit.c for VUEs, brw_sf_emit.c for PUEs, + * and brw_curbe.c for the CURBEs) and decide its maximum number of + * threads it can support based on that. in brw_*_state.c. + * + * XXX: Are the min_entry_size numbers useful? + * XXX: Verify min_nr_entries, esp for VS. + * XXX: Verify SF min_entry_size. + */ +static const struct { + GLuint min_nr_entries; + GLuint preferred_nr_entries; + GLuint min_entry_size; + GLuint max_entry_size; +} limits[CS+1] = { + { 16, 32, 1, 5 }, /* vs */ + { 4, 8, 1, 5 }, /* gs */ + { 5, 10, 1, 5 }, /* clp */ + { 1, 8, 1, 12 }, /* sf */ + { 1, 4, 1, 32 } /* cs */ +}; + + +static GLboolean check_urb_layout( struct brw_context *brw ) +{ + brw->urb.vs_start = 0; + brw->urb.gs_start = brw->urb.nr_vs_entries * brw->urb.vsize; + brw->urb.clip_start = brw->urb.gs_start + brw->urb.nr_gs_entries * brw->urb.vsize; + brw->urb.sf_start = brw->urb.clip_start + brw->urb.nr_clip_entries * brw->urb.vsize; + brw->urb.cs_start = brw->urb.sf_start + brw->urb.nr_sf_entries * brw->urb.sfsize; + + return brw->urb.cs_start + brw->urb.nr_cs_entries * brw->urb.csize <= URB_SIZES(brw); +} + +/* Most minimal update, forces re-emit of URB fence packet after GS + * unit turned on/off. + */ +static void recalculate_urb_fence( struct brw_context *brw ) +{ + GLuint csize = brw->curbe.total_size; + GLuint vsize = brw->vs.prog_data->urb_entry_size; + GLuint sfsize = brw->sf.prog_data->urb_entry_size; + + if (csize < limits[CS].min_entry_size) + csize = limits[CS].min_entry_size; + + if (vsize < limits[VS].min_entry_size) + vsize = limits[VS].min_entry_size; + + if (sfsize < limits[SF].min_entry_size) + sfsize = limits[SF].min_entry_size; + + if (brw->urb.vsize < vsize || + brw->urb.sfsize < sfsize || + brw->urb.csize < csize || + (brw->urb.constrained && (brw->urb.vsize > vsize || + brw->urb.sfsize > sfsize || + brw->urb.csize > csize))) { + + + brw->urb.csize = csize; + brw->urb.sfsize = sfsize; + brw->urb.vsize = vsize; + + brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries; + brw->urb.nr_gs_entries = limits[GS].preferred_nr_entries; + brw->urb.nr_clip_entries = limits[CLP].preferred_nr_entries; + brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries; + brw->urb.nr_cs_entries = limits[CS].preferred_nr_entries; + + brw->urb.constrained = 0; + + if (BRW_IS_IGDNG(brw)) { + brw->urb.nr_vs_entries = 128; + brw->urb.nr_sf_entries = 48; + if (check_urb_layout(brw)) { + goto done; + } else { + brw->urb.constrained = 1; + brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries; + brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries; + } + } else if (BRW_IS_G4X(brw)) { + brw->urb.nr_vs_entries = 64; + if (check_urb_layout(brw)) { + goto done; + } else { + brw->urb.constrained = 1; + brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries; + } + } + + if (!check_urb_layout(brw)) { + brw->urb.nr_vs_entries = limits[VS].min_nr_entries; + brw->urb.nr_gs_entries = limits[GS].min_nr_entries; + brw->urb.nr_clip_entries = limits[CLP].min_nr_entries; + brw->urb.nr_sf_entries = limits[SF].min_nr_entries; + brw->urb.nr_cs_entries = limits[CS].min_nr_entries; + + /* Mark us as operating with constrained nr_entries, so that next + * time we recalculate we'll resize the fences in the hope of + * escaping constrained mode and getting back to normal performance. + */ + brw->urb.constrained = 1; + + if (!check_urb_layout(brw)) { + /* This is impossible, given the maximal sizes of urb + * entries and the values for minimum nr of entries + * provided above. + */ + _mesa_printf("couldn't calculate URB layout!\n"); + exit(1); + } + + if (INTEL_DEBUG & (DEBUG_URB|DEBUG_FALLBACKS)) + _mesa_printf("URB CONSTRAINED\n"); + } + +done: + if (INTEL_DEBUG & DEBUG_URB) + _mesa_printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n", + brw->urb.vs_start, + brw->urb.gs_start, + brw->urb.clip_start, + brw->urb.sf_start, + brw->urb.cs_start, + URB_SIZES(brw)); + + brw->state.dirty.brw |= BRW_NEW_URB_FENCE; + } +} + + +const struct brw_tracked_state brw_recalculate_urb_fence = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_CURBE_OFFSETS, + .cache = (CACHE_NEW_VS_PROG | + CACHE_NEW_SF_PROG) + }, + .prepare = recalculate_urb_fence +}; + + + + + +void brw_upload_urb_fence(struct brw_context *brw) +{ + struct brw_urb_fence uf; + memset(&uf, 0, sizeof(uf)); + + uf.header.opcode = CMD_URB_FENCE; + uf.header.length = sizeof(uf)/4-2; + uf.header.vs_realloc = 1; + uf.header.gs_realloc = 1; + uf.header.clp_realloc = 1; + uf.header.sf_realloc = 1; + uf.header.vfe_realloc = 1; + uf.header.cs_realloc = 1; + + /* The ordering below is correct, not the layout in the + * instruction. + * + * There are 256/384 urb reg pairs in total. + */ + uf.bits0.vs_fence = brw->urb.gs_start; + uf.bits0.gs_fence = brw->urb.clip_start; + uf.bits0.clp_fence = brw->urb.sf_start; + uf.bits1.sf_fence = brw->urb.cs_start; + uf.bits1.cs_fence = URB_SIZES(brw); + + BRW_BATCH_STRUCT(brw, &uf); +} diff --git a/src/gallium/drivers/i965/brw_util.c b/src/gallium/drivers/i965/brw_util.c new file mode 100644 index 00000000000..ce21aa48695 --- /dev/null +++ b/src/gallium/drivers/i965/brw_util.c @@ -0,0 +1,104 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "main/mtypes.h" +#include "shader/prog_parameter.h" +#include "brw_util.h" +#include "brw_defines.h" + +GLuint brw_count_bits( GLuint val ) +{ + GLuint i; + for (i = 0; val ; val >>= 1) + if (val & 1) + i++; + return i; +} + + +GLuint brw_translate_blend_equation( GLenum mode ) +{ + switch (mode) { + case GL_FUNC_ADD: + return BRW_BLENDFUNCTION_ADD; + case GL_MIN: + return BRW_BLENDFUNCTION_MIN; + case GL_MAX: + return BRW_BLENDFUNCTION_MAX; + case GL_FUNC_SUBTRACT: + return BRW_BLENDFUNCTION_SUBTRACT; + case GL_FUNC_REVERSE_SUBTRACT: + return BRW_BLENDFUNCTION_REVERSE_SUBTRACT; + default: + assert(0); + return BRW_BLENDFUNCTION_ADD; + } +} + +GLuint brw_translate_blend_factor( GLenum factor ) +{ + switch(factor) { + case GL_ZERO: + return BRW_BLENDFACTOR_ZERO; + case GL_SRC_ALPHA: + return BRW_BLENDFACTOR_SRC_ALPHA; + case GL_ONE: + return BRW_BLENDFACTOR_ONE; + case GL_SRC_COLOR: + return BRW_BLENDFACTOR_SRC_COLOR; + case GL_ONE_MINUS_SRC_COLOR: + return BRW_BLENDFACTOR_INV_SRC_COLOR; + case GL_DST_COLOR: + return BRW_BLENDFACTOR_DST_COLOR; + case GL_ONE_MINUS_DST_COLOR: + return BRW_BLENDFACTOR_INV_DST_COLOR; + case GL_ONE_MINUS_SRC_ALPHA: + return BRW_BLENDFACTOR_INV_SRC_ALPHA; + case GL_DST_ALPHA: + return BRW_BLENDFACTOR_DST_ALPHA; + case GL_ONE_MINUS_DST_ALPHA: + return BRW_BLENDFACTOR_INV_DST_ALPHA; + case GL_SRC_ALPHA_SATURATE: + return BRW_BLENDFACTOR_SRC_ALPHA_SATURATE; + case GL_CONSTANT_COLOR: + return BRW_BLENDFACTOR_CONST_COLOR; + case GL_ONE_MINUS_CONSTANT_COLOR: + return BRW_BLENDFACTOR_INV_CONST_COLOR; + case GL_CONSTANT_ALPHA: + return BRW_BLENDFACTOR_CONST_ALPHA; + case GL_ONE_MINUS_CONSTANT_ALPHA: + return BRW_BLENDFACTOR_INV_CONST_ALPHA; + default: + assert(0); + return BRW_BLENDFACTOR_ZERO; + } +} diff --git a/src/gallium/drivers/i965/brw_util.h b/src/gallium/drivers/i965/brw_util.h new file mode 100644 index 00000000000..33e7cd87e42 --- /dev/null +++ b/src/gallium/drivers/i965/brw_util.h @@ -0,0 +1,45 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#ifndef BRW_UTIL_H +#define BRW_UTIL_H + +#include "main/mtypes.h" + +extern GLuint brw_count_bits( GLuint val ); +extern GLuint brw_parameter_list_state_flags(struct gl_program_parameter_list *paramList); +extern GLuint brw_translate_blend_factor( GLenum factor ); +extern GLuint brw_translate_blend_equation( GLenum mode ); + + + +#endif diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c new file mode 100644 index 00000000000..f0c79efbd96 --- /dev/null +++ b/src/gallium/drivers/i965/brw_vs.c @@ -0,0 +1,124 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "brw_context.h" +#include "brw_vs.h" +#include "brw_util.h" +#include "brw_state.h" +#include "shader/prog_print.h" + + + +static void do_vs_prog( struct brw_context *brw, + struct brw_vertex_program *vp, + struct brw_vs_prog_key *key ) +{ + GLuint program_size; + const GLuint *program; + struct brw_vs_compile c; + + memset(&c, 0, sizeof(c)); + memcpy(&c.key, key, sizeof(*key)); + + brw_init_compile(brw, &c.func); + c.vp = vp; + + c.prog_data.outputs_written = vp->program.Base.OutputsWritten; + c.prog_data.inputs_read = vp->program.Base.InputsRead; + + if (c.key.copy_edgeflag) { + c.prog_data.outputs_written |= 1<<VERT_RESULT_EDGE; + c.prog_data.inputs_read |= 1<<VERT_ATTRIB_EDGEFLAG; + } + + if (0) + _mesa_print_program(&c.vp->program.Base); + + + + /* Emit GEN4 code. + */ + brw_vs_emit(&c); + + /* get the program + */ + program = brw_get_program(&c.func, &program_size); + + dri_bo_unreference(brw->vs.prog_bo); + brw->vs.prog_bo = brw_upload_cache( &brw->cache, BRW_VS_PROG, + &c.key, sizeof(c.key), + NULL, 0, + program, program_size, + &c.prog_data, + &brw->vs.prog_data ); +} + + +static void brw_upload_vs_prog(struct brw_context *brw) +{ + GLcontext *ctx = &brw->intel.ctx; + struct brw_vs_prog_key key; + struct brw_vertex_program *vp = + (struct brw_vertex_program *)brw->vertex_program; + + memset(&key, 0, sizeof(key)); + + /* Just upload the program verbatim for now. Always send it all + * the inputs it asks for, whether they are varying or not. + */ + key.program_string_id = vp->id; + key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled); + key.copy_edgeflag = (ctx->Polygon.FrontMode != GL_FILL || + ctx->Polygon.BackMode != GL_FILL); + + /* Make an early check for the key. + */ + dri_bo_unreference(brw->vs.prog_bo); + brw->vs.prog_bo = brw_search_cache(&brw->cache, BRW_VS_PROG, + &key, sizeof(key), + NULL, 0, + &brw->vs.prog_data); + if (brw->vs.prog_bo == NULL) + do_vs_prog(brw, vp, &key); +} + + +/* See brw_vs.c: + */ +const struct brw_tracked_state brw_vs_prog = { + .dirty = { + .mesa = _NEW_TRANSFORM | _NEW_POLYGON, + .brw = BRW_NEW_VERTEX_PROGRAM, + .cache = 0 + }, + .prepare = brw_upload_vs_prog +}; diff --git a/src/gallium/drivers/i965/brw_vs.h b/src/gallium/drivers/i965/brw_vs.h new file mode 100644 index 00000000000..4a591365c98 --- /dev/null +++ b/src/gallium/drivers/i965/brw_vs.h @@ -0,0 +1,88 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#ifndef BRW_VS_H +#define BRW_VS_H + + +#include "brw_context.h" +#include "brw_eu.h" +#include "shader/program.h" + + +struct brw_vs_prog_key { + GLuint program_string_id; + GLuint nr_userclip:4; + GLuint copy_edgeflag:1; + GLuint pad:26; +}; + + +struct brw_vs_compile { + struct brw_compile func; + struct brw_vs_prog_key key; + struct brw_vs_prog_data prog_data; + + struct brw_vertex_program *vp; + + GLuint nr_inputs; + + GLuint first_output; + GLuint nr_outputs; + GLuint first_overflow_output; /**< VERT_ATTRIB_x */ + + GLuint first_tmp; + GLuint last_tmp; + + struct brw_reg r0; + struct brw_reg r1; + struct brw_reg regs[PROGRAM_ADDRESS+1][128]; + struct brw_reg tmp; + struct brw_reg stack; + + struct { + GLboolean used_in_src; + struct brw_reg reg; + } output_regs[128]; + + struct brw_reg userplane[6]; + + /** we may need up to 3 constants per instruction (if use_const_buffer) */ + struct { + GLint index; + struct brw_reg reg; + } current_const[3]; +}; + +void brw_vs_emit( struct brw_vs_compile *c ); + +#endif diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c new file mode 100644 index 00000000000..1638ef81115 --- /dev/null +++ b/src/gallium/drivers/i965/brw_vs_emit.c @@ -0,0 +1,1667 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "main/macros.h" +#include "shader/program.h" +#include "shader/prog_parameter.h" +#include "shader/prog_print.h" +#include "brw_context.h" +#include "brw_vs.h" + + +static struct brw_reg get_tmp( struct brw_vs_compile *c ) +{ + struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0); + + if (++c->last_tmp > c->prog_data.total_grf) + c->prog_data.total_grf = c->last_tmp; + + return tmp; +} + +static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp ) +{ + if (tmp.nr == c->last_tmp-1) + c->last_tmp--; +} + +static void release_tmps( struct brw_vs_compile *c ) +{ + c->last_tmp = c->first_tmp; +} + + +/** + * Preallocate GRF register before code emit. + * Do things as simply as possible. Allocate and populate all regs + * ahead of time. + */ +static void brw_vs_alloc_regs( struct brw_vs_compile *c ) +{ + GLuint i, reg = 0, mrf; + int attributes_in_vue; + + /* Determine whether to use a real constant buffer or use a block + * of GRF registers for constants. The later is faster but only + * works if everything fits in the GRF. + * XXX this heuristic/check may need some fine tuning... + */ + if (c->vp->program.Base.Parameters->NumParameters + + c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF) + c->vp->use_const_buffer = GL_TRUE; + else + c->vp->use_const_buffer = GL_FALSE; + + /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/ + + /* r0 -- reserved as usual + */ + c->r0 = brw_vec8_grf(reg, 0); + reg++; + + /* User clip planes from curbe: + */ + if (c->key.nr_userclip) { + for (i = 0; i < c->key.nr_userclip; i++) { + c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1); + } + + /* Deal with curbe alignment: + */ + reg += ((6 + c->key.nr_userclip + 3) / 4) * 2; + } + + /* Vertex program parameters from curbe: + */ + if (c->vp->use_const_buffer) { + /* get constants from a real constant buffer */ + c->prog_data.curb_read_length = 0; + c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */ + } + else { + /* use a section of the GRF for constants */ + GLuint nr_params = c->vp->program.Base.Parameters->NumParameters; + for (i = 0; i < nr_params; i++) { + c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1); + } + reg += (nr_params + 1) / 2; + c->prog_data.curb_read_length = reg - 1; + + c->prog_data.nr_params = nr_params * 4; + } + + /* Allocate input regs: + */ + c->nr_inputs = 0; + for (i = 0; i < VERT_ATTRIB_MAX; i++) { + if (c->prog_data.inputs_read & (1 << i)) { + c->nr_inputs++; + c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0); + reg++; + } + } + /* If there are no inputs, we'll still be reading one attribute's worth + * because it's required -- see urb_read_length setting. + */ + if (c->nr_inputs == 0) + reg++; + + /* Allocate outputs. The non-position outputs go straight into message regs. + */ + c->nr_outputs = 0; + c->first_output = reg; + c->first_overflow_output = 0; + + if (BRW_IS_IGDNG(c->func.brw)) + mrf = 8; + else + mrf = 4; + + for (i = 0; i < VERT_RESULT_MAX; i++) { + if (c->prog_data.outputs_written & (1 << i)) { + c->nr_outputs++; + assert(i < Elements(c->regs[PROGRAM_OUTPUT])); + if (i == VERT_RESULT_HPOS) { + c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0); + reg++; + } + else if (i == VERT_RESULT_PSIZ) { + c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0); + reg++; + mrf++; /* just a placeholder? XXX fix later stages & remove this */ + } + else { + if (mrf < 16) { + c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf); + mrf++; + } + else { + /* too many vertex results to fit in MRF, use GRF for overflow */ + if (!c->first_overflow_output) + c->first_overflow_output = i; + c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0); + reg++; + } + } + } + } + + /* Allocate program temporaries: + */ + for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) { + c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0); + reg++; + } + + /* Address reg(s). Don't try to use the internal address reg until + * deref time. + */ + for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) { + c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE, + reg, + 0, + BRW_REGISTER_TYPE_D, + BRW_VERTICAL_STRIDE_8, + BRW_WIDTH_8, + BRW_HORIZONTAL_STRIDE_1, + BRW_SWIZZLE_XXXX, + WRITEMASK_X); + reg++; + } + + if (c->vp->use_const_buffer) { + for (i = 0; i < 3; i++) { + c->current_const[i].index = -1; + c->current_const[i].reg = brw_vec8_grf(reg, 0); + reg++; + } + } + + for (i = 0; i < 128; i++) { + if (c->output_regs[i].used_in_src) { + c->output_regs[i].reg = brw_vec8_grf(reg, 0); + reg++; + } + } + + c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0); + reg += 2; + + /* Some opcodes need an internal temporary: + */ + c->first_tmp = reg; + c->last_tmp = reg; /* for allocation purposes */ + + /* Each input reg holds data from two vertices. The + * urb_read_length is the number of registers read from *each* + * vertex urb, so is half the amount: + */ + c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2; + /* Setting this field to 0 leads to undefined behavior according to the + * the VS_STATE docs. Our VUEs will always have at least one attribute + * sitting in them, even if it's padding. + */ + if (c->prog_data.urb_read_length == 0) + c->prog_data.urb_read_length = 1; + + /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size + * them to fit the biggest thing they need to. + */ + attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs); + + if (BRW_IS_IGDNG(c->func.brw)) + c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4; + else + c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4; + + c->prog_data.total_grf = reg; + + if (INTEL_DEBUG & DEBUG_VS) { + _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs); + _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries); + _mesa_printf("%s reg = %d\n", __FUNCTION__, reg); + } +} + + +/** + * If an instruction uses a temp reg both as a src and the dest, we + * sometimes need to allocate an intermediate temporary. + */ +static void unalias1( struct brw_vs_compile *c, + struct brw_reg dst, + struct brw_reg arg0, + void (*func)( struct brw_vs_compile *, + struct brw_reg, + struct brw_reg )) +{ + if (dst.file == arg0.file && dst.nr == arg0.nr) { + struct brw_compile *p = &c->func; + struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask); + func(c, tmp, arg0); + brw_MOV(p, dst, tmp); + release_tmp(c, tmp); + } + else { + func(c, dst, arg0); + } +} + +/** + * \sa unalias2 + * Checkes if 2-operand instruction needs an intermediate temporary. + */ +static void unalias2( struct brw_vs_compile *c, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1, + void (*func)( struct brw_vs_compile *, + struct brw_reg, + struct brw_reg, + struct brw_reg )) +{ + if ((dst.file == arg0.file && dst.nr == arg0.nr) || + (dst.file == arg1.file && dst.nr == arg1.nr)) { + struct brw_compile *p = &c->func; + struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask); + func(c, tmp, arg0, arg1); + brw_MOV(p, dst, tmp); + release_tmp(c, tmp); + } + else { + func(c, dst, arg0, arg1); + } +} + +/** + * \sa unalias2 + * Checkes if 3-operand instruction needs an intermediate temporary. + */ +static void unalias3( struct brw_vs_compile *c, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1, + struct brw_reg arg2, + void (*func)( struct brw_vs_compile *, + struct brw_reg, + struct brw_reg, + struct brw_reg, + struct brw_reg )) +{ + if ((dst.file == arg0.file && dst.nr == arg0.nr) || + (dst.file == arg1.file && dst.nr == arg1.nr) || + (dst.file == arg2.file && dst.nr == arg2.nr)) { + struct brw_compile *p = &c->func; + struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask); + func(c, tmp, arg0, arg1, arg2); + brw_MOV(p, dst, tmp); + release_tmp(c, tmp); + } + else { + func(c, dst, arg0, arg1, arg2); + } +} + +static void emit_sop( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1, + GLuint cond) +{ + brw_MOV(p, dst, brw_imm_f(0.0f)); + brw_CMP(p, brw_null_reg(), cond, arg0, arg1); + brw_MOV(p, dst, brw_imm_f(1.0f)); + brw_set_predicate_control_flag_value(p, 0xff); +} + +static void emit_seq( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1 ) +{ + emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ); +} + +static void emit_sne( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1 ) +{ + emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ); +} +static void emit_slt( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1 ) +{ + emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L); +} + +static void emit_sle( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1 ) +{ + emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE); +} + +static void emit_sgt( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1 ) +{ + emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G); +} + +static void emit_sge( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1 ) +{ + emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE); +} + +static void emit_max( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1 ) +{ + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1); + brw_SEL(p, dst, arg1, arg0); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); +} + +static void emit_min( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1 ) +{ + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1); + brw_SEL(p, dst, arg0, arg1); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); +} + + +static void emit_math1( struct brw_vs_compile *c, + GLuint function, + struct brw_reg dst, + struct brw_reg arg0, + GLuint precision) +{ + /* There are various odd behaviours with SEND on the simulator. In + * addition there are documented issues with the fact that the GEN4 + * processor doesn't do dependency control properly on SEND + * results. So, on balance, this kludge to get around failures + * with writemasked math results looks like it might be necessary + * whether that turns out to be a simulator bug or not: + */ + struct brw_compile *p = &c->func; + struct brw_reg tmp = dst; + GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf || + dst.file != BRW_GENERAL_REGISTER_FILE); + + if (need_tmp) + tmp = get_tmp(c); + + brw_math(p, + tmp, + function, + BRW_MATH_SATURATE_NONE, + 2, + arg0, + BRW_MATH_DATA_SCALAR, + precision); + + if (need_tmp) { + brw_MOV(p, dst, tmp); + release_tmp(c, tmp); + } +} + + +static void emit_math2( struct brw_vs_compile *c, + GLuint function, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1, + GLuint precision) +{ + struct brw_compile *p = &c->func; + struct brw_reg tmp = dst; + GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf || + dst.file != BRW_GENERAL_REGISTER_FILE); + + if (need_tmp) + tmp = get_tmp(c); + + brw_MOV(p, brw_message_reg(3), arg1); + + brw_math(p, + tmp, + function, + BRW_MATH_SATURATE_NONE, + 2, + arg0, + BRW_MATH_DATA_SCALAR, + precision); + + if (need_tmp) { + brw_MOV(p, dst, tmp); + release_tmp(c, tmp); + } +} + + +static void emit_exp_noalias( struct brw_vs_compile *c, + struct brw_reg dst, + struct brw_reg arg0 ) +{ + struct brw_compile *p = &c->func; + + + if (dst.dw1.bits.writemask & WRITEMASK_X) { + struct brw_reg tmp = get_tmp(c); + struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D); + + /* tmp_d = floor(arg0.x) */ + brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0)); + + /* result[0] = 2.0 ^ tmp */ + + /* Adjust exponent for floating point: + * exp += 127 + */ + brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127)); + + /* Install exponent and sign. + * Excess drops off the edge: + */ + brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X), + tmp_d, brw_imm_d(23)); + + release_tmp(c, tmp); + } + + if (dst.dw1.bits.writemask & WRITEMASK_Y) { + /* result[1] = arg0.x - floor(arg0.x) */ + brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0)); + } + + if (dst.dw1.bits.writemask & WRITEMASK_Z) { + /* As with the LOG instruction, we might be better off just + * doing a taylor expansion here, seeing as we have to do all + * the prep work. + * + * If mathbox partial precision is too low, consider also: + * result[3] = result[0] * EXP(result[1]) + */ + emit_math1(c, + BRW_MATH_FUNCTION_EXP, + brw_writemask(dst, WRITEMASK_Z), + brw_swizzle1(arg0, 0), + BRW_MATH_PRECISION_FULL); + } + + if (dst.dw1.bits.writemask & WRITEMASK_W) { + /* result[3] = 1.0; */ + brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1)); + } +} + + +static void emit_log_noalias( struct brw_vs_compile *c, + struct brw_reg dst, + struct brw_reg arg0 ) +{ + struct brw_compile *p = &c->func; + struct brw_reg tmp = dst; + struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD); + struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD); + GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf || + dst.file != BRW_GENERAL_REGISTER_FILE); + + if (need_tmp) { + tmp = get_tmp(c); + tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD); + } + + /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt + * according to spec: + * + * These almost look likey they could be joined up, but not really + * practical: + * + * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127 + * result[1].i = (x.i & ((1<<23)-1) + (127<<23) + */ + if (dst.dw1.bits.writemask & WRITEMASK_XZ) { + brw_AND(p, + brw_writemask(tmp_ud, WRITEMASK_X), + brw_swizzle1(arg0_ud, 0), + brw_imm_ud((1U<<31)-1)); + + brw_SHR(p, + brw_writemask(tmp_ud, WRITEMASK_X), + tmp_ud, + brw_imm_ud(23)); + + brw_ADD(p, + brw_writemask(tmp, WRITEMASK_X), + retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */ + brw_imm_d(-127)); + } + + if (dst.dw1.bits.writemask & WRITEMASK_YZ) { + brw_AND(p, + brw_writemask(tmp_ud, WRITEMASK_Y), + brw_swizzle1(arg0_ud, 0), + brw_imm_ud((1<<23)-1)); + + brw_OR(p, + brw_writemask(tmp_ud, WRITEMASK_Y), + tmp_ud, + brw_imm_ud(127<<23)); + } + + if (dst.dw1.bits.writemask & WRITEMASK_Z) { + /* result[2] = result[0] + LOG2(result[1]); */ + + /* Why bother? The above is just a hint how to do this with a + * taylor series. Maybe we *should* use a taylor series as by + * the time all the above has been done it's almost certainly + * quicker than calling the mathbox, even with low precision. + * + * Options are: + * - result[0] + mathbox.LOG2(result[1]) + * - mathbox.LOG2(arg0.x) + * - result[0] + inline_taylor_approx(result[1]) + */ + emit_math1(c, + BRW_MATH_FUNCTION_LOG, + brw_writemask(tmp, WRITEMASK_Z), + brw_swizzle1(tmp, 1), + BRW_MATH_PRECISION_FULL); + + brw_ADD(p, + brw_writemask(tmp, WRITEMASK_Z), + brw_swizzle1(tmp, 2), + brw_swizzle1(tmp, 0)); + } + + if (dst.dw1.bits.writemask & WRITEMASK_W) { + /* result[3] = 1.0; */ + brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1)); + } + + if (need_tmp) { + brw_MOV(p, dst, tmp); + release_tmp(c, tmp); + } +} + + +/* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1 + */ +static void emit_dst_noalias( struct brw_vs_compile *c, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1) +{ + struct brw_compile *p = &c->func; + + /* There must be a better way to do this: + */ + if (dst.dw1.bits.writemask & WRITEMASK_X) + brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0)); + if (dst.dw1.bits.writemask & WRITEMASK_Y) + brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1); + if (dst.dw1.bits.writemask & WRITEMASK_Z) + brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0); + if (dst.dw1.bits.writemask & WRITEMASK_W) + brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1); +} + + +static void emit_xpd( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg t, + struct brw_reg u) +{ + brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3)); + brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3)); +} + + +static void emit_lit_noalias( struct brw_vs_compile *c, + struct brw_reg dst, + struct brw_reg arg0 ) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *if_insn; + struct brw_reg tmp = dst; + GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE); + + if (need_tmp) + tmp = get_tmp(c); + + brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0)); + brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1)); + + /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order + * to get all channels active inside the IF. In the clipping code + * we run with NoMask, so it's not an option and we can use + * BRW_EXECUTE_1 for all comparisions. + */ + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0)); + if_insn = brw_IF(p, BRW_EXECUTE_8); + { + brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0)); + + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0)); + brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + emit_math2(c, + BRW_MATH_FUNCTION_POW, + brw_writemask(dst, WRITEMASK_Z), + brw_swizzle1(tmp, 2), + brw_swizzle1(arg0, 3), + BRW_MATH_PRECISION_PARTIAL); + } + + brw_ENDIF(p, if_insn); + + release_tmp(c, tmp); +} + +static void emit_lrp_noalias(struct brw_vs_compile *c, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1, + struct brw_reg arg2) +{ + struct brw_compile *p = &c->func; + + brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0)); + brw_MUL(p, brw_null_reg(), dst, arg2); + brw_MAC(p, dst, arg0, arg1); +} + +/** 3 or 4-component vector normalization */ +static void emit_nrm( struct brw_vs_compile *c, + struct brw_reg dst, + struct brw_reg arg0, + int num_comps) +{ + struct brw_compile *p = &c->func; + struct brw_reg tmp = get_tmp(c); + + /* tmp = dot(arg0, arg0) */ + if (num_comps == 3) + brw_DP3(p, tmp, arg0, arg0); + else + brw_DP4(p, tmp, arg0, arg0); + + /* tmp = 1 / sqrt(tmp) */ + emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL); + + /* dst = arg0 * tmp */ + brw_MUL(p, dst, arg0, tmp); + + release_tmp(c, tmp); +} + + +static struct brw_reg +get_constant(struct brw_vs_compile *c, + const struct prog_instruction *inst, + GLuint argIndex) +{ + const struct prog_src_register *src = &inst->SrcReg[argIndex]; + struct brw_compile *p = &c->func; + struct brw_reg const_reg; + struct brw_reg const2_reg; + const GLboolean relAddr = src->RelAddr; + + assert(argIndex < 3); + + if (c->current_const[argIndex].index != src->Index || relAddr) { + struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0]; + + c->current_const[argIndex].index = src->Index; + +#if 0 + printf(" fetch const[%d] for arg %d into reg %d\n", + src->Index, argIndex, c->current_const[argIndex].reg.nr); +#endif + /* need to fetch the constant now */ + brw_dp_READ_4_vs(p, + c->current_const[argIndex].reg,/* writeback dest */ + 0, /* oword */ + relAddr, /* relative indexing? */ + addrReg, /* address register */ + 16 * src->Index, /* byte offset */ + SURF_INDEX_VERT_CONST_BUFFER /* binding table index */ + ); + + if (relAddr) { + /* second read */ + const2_reg = get_tmp(c); + + /* use upper half of address reg for second read */ + addrReg = stride(addrReg, 0, 4, 0); + addrReg.subnr = 16; + + brw_dp_READ_4_vs(p, + const2_reg, /* writeback dest */ + 1, /* oword */ + relAddr, /* relative indexing? */ + addrReg, /* address register */ + 16 * src->Index, /* byte offset */ + SURF_INDEX_VERT_CONST_BUFFER + ); + } + } + + const_reg = c->current_const[argIndex].reg; + + if (relAddr) { + /* merge the two Owords into the constant register */ + /* const_reg[7..4] = const2_reg[7..4] */ + brw_MOV(p, + suboffset(stride(const_reg, 0, 4, 1), 4), + suboffset(stride(const2_reg, 0, 4, 1), 4)); + release_tmp(c, const2_reg); + } + else { + /* replicate lower four floats into upper half (to get XYZWXYZW) */ + const_reg = stride(const_reg, 0, 4, 0); + const_reg.subnr = 0; + } + + return const_reg; +} + + + +/* TODO: relative addressing! + */ +static struct brw_reg get_reg( struct brw_vs_compile *c, + gl_register_file file, + GLuint index ) +{ + switch (file) { + case PROGRAM_TEMPORARY: + case PROGRAM_INPUT: + case PROGRAM_OUTPUT: + assert(c->regs[file][index].nr != 0); + return c->regs[file][index]; + case PROGRAM_STATE_VAR: + case PROGRAM_CONSTANT: + case PROGRAM_UNIFORM: + assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0); + return c->regs[PROGRAM_STATE_VAR][index]; + case PROGRAM_ADDRESS: + assert(index == 0); + return c->regs[file][index]; + + case PROGRAM_UNDEFINED: /* undef values */ + return brw_null_reg(); + + case PROGRAM_LOCAL_PARAM: + case PROGRAM_ENV_PARAM: + case PROGRAM_WRITE_ONLY: + default: + assert(0); + return brw_null_reg(); + } +} + + +/** + * Indirect addressing: get reg[[arg] + offset]. + */ +static struct brw_reg deref( struct brw_vs_compile *c, + struct brw_reg arg, + GLint offset) +{ + struct brw_compile *p = &c->func; + struct brw_reg tmp = vec4(get_tmp(c)); + struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0]; + struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW); + GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16; + struct brw_reg indirect = brw_vec4_indirect(0,0); + + { + brw_push_insn_state(p); + brw_set_access_mode(p, BRW_ALIGN_1); + + /* This is pretty clunky - load the address register twice and + * fetch each 4-dword value in turn. There must be a way to do + * this in a single pass, but I couldn't get it to work. + */ + brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset)); + brw_MOV(p, tmp, indirect); + + brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset)); + brw_MOV(p, suboffset(tmp, 4), indirect); + + brw_pop_insn_state(p); + } + + /* NOTE: tmp not released */ + return vec8(tmp); +} + + +/** + * Get brw reg corresponding to the instruction's [argIndex] src reg. + * TODO: relative addressing! + */ +static struct brw_reg +get_src_reg( struct brw_vs_compile *c, + const struct prog_instruction *inst, + GLuint argIndex ) +{ + const GLuint file = inst->SrcReg[argIndex].File; + const GLint index = inst->SrcReg[argIndex].Index; + const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr; + + switch (file) { + case PROGRAM_TEMPORARY: + case PROGRAM_INPUT: + case PROGRAM_OUTPUT: + if (relAddr) { + return deref(c, c->regs[file][0], index); + } + else { + assert(c->regs[file][index].nr != 0); + return c->regs[file][index]; + } + + case PROGRAM_STATE_VAR: + case PROGRAM_CONSTANT: + case PROGRAM_UNIFORM: + case PROGRAM_ENV_PARAM: + if (c->vp->use_const_buffer) { + return get_constant(c, inst, argIndex); + } + else if (relAddr) { + return deref(c, c->regs[PROGRAM_STATE_VAR][0], index); + } + else { + assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0); + return c->regs[PROGRAM_STATE_VAR][index]; + } + case PROGRAM_ADDRESS: + assert(index == 0); + return c->regs[file][index]; + + case PROGRAM_UNDEFINED: + /* this is a normal case since we loop over all three src args */ + return brw_null_reg(); + + case PROGRAM_LOCAL_PARAM: + case PROGRAM_WRITE_ONLY: + default: + assert(0); + return brw_null_reg(); + } +} + + +static void emit_arl( struct brw_vs_compile *c, + struct brw_reg dst, + struct brw_reg arg0 ) +{ + struct brw_compile *p = &c->func; + struct brw_reg tmp = dst; + GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE); + + if (need_tmp) + tmp = get_tmp(c); + + brw_RNDD(p, tmp, arg0); /* tmp = round(arg0) */ + brw_MUL(p, dst, tmp, brw_imm_d(16)); /* dst = tmp * 16 */ + + if (need_tmp) + release_tmp(c, tmp); +} + + +/** + * Return the brw reg for the given instruction's src argument. + * Will return mangled results for SWZ op. The emit_swz() function + * ignores this result and recalculates taking extended swizzles into + * account. + */ +static struct brw_reg get_arg( struct brw_vs_compile *c, + const struct prog_instruction *inst, + GLuint argIndex ) +{ + const struct prog_src_register *src = &inst->SrcReg[argIndex]; + struct brw_reg reg; + + if (src->File == PROGRAM_UNDEFINED) + return brw_null_reg(); + + reg = get_src_reg(c, inst, argIndex); + + /* Convert 3-bit swizzle to 2-bit. + */ + reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0), + GET_SWZ(src->Swizzle, 1), + GET_SWZ(src->Swizzle, 2), + GET_SWZ(src->Swizzle, 3)); + + /* Note this is ok for non-swizzle instructions: + */ + reg.negate = src->Negate ? 1 : 0; + + return reg; +} + + +/** + * Get brw register for the given program dest register. + */ +static struct brw_reg get_dst( struct brw_vs_compile *c, + struct prog_dst_register dst ) +{ + struct brw_reg reg; + + switch (dst.File) { + case PROGRAM_TEMPORARY: + case PROGRAM_OUTPUT: + assert(c->regs[dst.File][dst.Index].nr != 0); + reg = c->regs[dst.File][dst.Index]; + break; + case PROGRAM_ADDRESS: + assert(dst.Index == 0); + reg = c->regs[dst.File][dst.Index]; + break; + case PROGRAM_UNDEFINED: + /* we may hit this for OPCODE_END, OPCODE_KIL, etc */ + reg = brw_null_reg(); + break; + default: + assert(0); + reg = brw_null_reg(); + } + + reg.dw1.bits.writemask = dst.WriteMask; + + return reg; +} + + +static void emit_swz( struct brw_vs_compile *c, + struct brw_reg dst, + const struct prog_instruction *inst) +{ + const GLuint argIndex = 0; + const struct prog_src_register src = inst->SrcReg[argIndex]; + struct brw_compile *p = &c->func; + GLuint zeros_mask = 0; + GLuint ones_mask = 0; + GLuint src_mask = 0; + GLubyte src_swz[4]; + GLboolean need_tmp = (src.Negate && + dst.file != BRW_GENERAL_REGISTER_FILE); + struct brw_reg tmp = dst; + GLuint i; + + if (need_tmp) + tmp = get_tmp(c); + + for (i = 0; i < 4; i++) { + if (dst.dw1.bits.writemask & (1<<i)) { + GLubyte s = GET_SWZ(src.Swizzle, i); + switch (s) { + case SWIZZLE_X: + case SWIZZLE_Y: + case SWIZZLE_Z: + case SWIZZLE_W: + src_mask |= 1<<i; + src_swz[i] = s; + break; + case SWIZZLE_ZERO: + zeros_mask |= 1<<i; + break; + case SWIZZLE_ONE: + ones_mask |= 1<<i; + break; + } + } + } + + /* Do src first, in case dst aliases src: + */ + if (src_mask) { + struct brw_reg arg0; + + arg0 = get_src_reg(c, inst, argIndex); + + arg0 = brw_swizzle(arg0, + src_swz[0], src_swz[1], + src_swz[2], src_swz[3]); + + brw_MOV(p, brw_writemask(tmp, src_mask), arg0); + } + + if (zeros_mask) + brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0)); + + if (ones_mask) + brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1)); + + if (src.Negate) + brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp)); + + if (need_tmp) { + brw_MOV(p, dst, tmp); + release_tmp(c, tmp); + } +} + + +/** + * Post-vertex-program processing. Send the results to the URB. + */ +static void emit_vertex_write( struct brw_vs_compile *c) +{ + struct brw_compile *p = &c->func; + struct brw_reg m0 = brw_message_reg(0); + struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS]; + struct brw_reg ndc; + int eot; + GLuint len_vertext_header = 2; + + if (c->key.copy_edgeflag) { + brw_MOV(p, + get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE), + get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG)); + } + + /* Build ndc coords */ + ndc = get_tmp(c); + /* ndc = 1.0 / pos.w */ + emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL); + /* ndc.xyz = pos * ndc */ + brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc); + + /* Update the header for point size, user clipping flags, and -ve rhw + * workaround. + */ + if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) || + c->key.nr_userclip || BRW_IS_965(p->brw)) + { + struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); + GLuint i; + + brw_MOV(p, header1, brw_imm_ud(0)); + + brw_set_access_mode(p, BRW_ALIGN_16); + + if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) { + struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ]; + brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11)); + brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8)); + } + + for (i = 0; i < c->key.nr_userclip; i++) { + brw_set_conditionalmod(p, BRW_CONDITIONAL_L); + brw_DP4(p, brw_null_reg(), pos, c->userplane[i]); + brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + } + + /* i965 clipping workaround: + * 1) Test for -ve rhw + * 2) If set, + * set ndc = (0,0,0,0) + * set ucp[6] = 1 + * + * Later, clipping will detect ucp[6] and ensure the primitive is + * clipped against all fixed planes. + */ + if (BRW_IS_965(p->brw)) { + brw_CMP(p, + vec8(brw_null_reg()), + BRW_CONDITIONAL_L, + brw_swizzle1(ndc, 3), + brw_imm_f(0)); + + brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6)); + brw_MOV(p, ndc, brw_imm_f(0)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + } + + brw_set_access_mode(p, BRW_ALIGN_1); /* why? */ + brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1); + brw_set_access_mode(p, BRW_ALIGN_16); + + release_tmp(c, header1); + } + else { + brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0)); + } + + /* Emit the (interleaved) headers for the two vertices - an 8-reg + * of zeros followed by two sets of NDC coordinates: + */ + brw_set_access_mode(p, BRW_ALIGN_1); + brw_MOV(p, offset(m0, 2), ndc); + + if (BRW_IS_IGDNG(p->brw)) { + /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */ + brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */ + /* m4, m5 contain the distances from vertex to the user clip planeXXX. + * Seems it is useless for us. + * m6 is used for aligning, so that the remainder of vertex element is + * reg-aligned. + */ + brw_MOV(p, offset(m0, 7), pos); /* the remainder of vertex element */ + len_vertext_header = 6; + } else { + brw_MOV(p, offset(m0, 3), pos); + len_vertext_header = 2; + } + + eot = (c->first_overflow_output == 0); + + brw_urb_WRITE(p, + brw_null_reg(), /* dest */ + 0, /* starting mrf reg nr */ + c->r0, /* src */ + 0, /* allocate */ + 1, /* used */ + MIN2(c->nr_outputs + 1 + len_vertext_header, (BRW_MAX_MRF-1)), /* msg len */ + 0, /* response len */ + eot, /* eot */ + eot, /* writes complete */ + 0, /* urb destination offset */ + BRW_URB_SWIZZLE_INTERLEAVE); + + if (c->first_overflow_output > 0) { + /* Not all of the vertex outputs/results fit into the MRF. + * Move the overflowed attributes from the GRF to the MRF and + * issue another brw_urb_WRITE(). + */ + /* XXX I'm not 100% sure about which MRF regs to use here. Starting + * at mrf[4] atm... + */ + GLuint i, mrf = 0; + for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) { + if (c->prog_data.outputs_written & (1 << i)) { + /* move from GRF to MRF */ + brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]); + mrf++; + } + } + + brw_urb_WRITE(p, + brw_null_reg(), /* dest */ + 4, /* starting mrf reg nr */ + c->r0, /* src */ + 0, /* allocate */ + 1, /* used */ + mrf+1, /* msg len */ + 0, /* response len */ + 1, /* eot */ + 1, /* writes complete */ + BRW_MAX_MRF-1, /* urb destination offset */ + BRW_URB_SWIZZLE_INTERLEAVE); + } +} + + +/** + * Called after code generation to resolve subroutine calls and the + * END instruction. + * \param end_inst points to brw code for END instruction + * \param last_inst points to last instruction emitted before vertex write + */ +static void +post_vs_emit( struct brw_vs_compile *c, + struct brw_instruction *end_inst, + struct brw_instruction *last_inst ) +{ + GLint offset; + + brw_resolve_cals(&c->func); + + /* patch up the END code to jump past subroutines, etc */ + offset = last_inst - end_inst; + if (offset > 1) { + brw_set_src1(end_inst, brw_imm_d(offset * 16)); + } else { + end_inst->header.opcode = BRW_OPCODE_NOP; + } +} + +static uint32_t +get_predicate(const struct prog_instruction *inst) +{ + if (inst->DstReg.CondMask == COND_TR) + return BRW_PREDICATE_NONE; + + /* All of GLSL only produces predicates for COND_NE and one channel per + * vector. Fail badly if someone starts doing something else, as it might + * mean infinite looping or something. + * + * We'd like to support all the condition codes, but our hardware doesn't + * quite match the Mesa IR, which is modeled after the NV extensions. For + * those, the instruction may update the condition codes or not, then any + * later instruction may use one of those condition codes. For gen4, the + * instruction may update the flags register based on one of the condition + * codes output by the instruction, and then further instructions may + * predicate on that. We can probably support this, but it won't + * necessarily be easy. + */ + assert(inst->DstReg.CondMask == COND_NE); + + switch (inst->DstReg.CondSwizzle) { + case SWIZZLE_XXXX: + return BRW_PREDICATE_ALIGN16_REPLICATE_X; + case SWIZZLE_YYYY: + return BRW_PREDICATE_ALIGN16_REPLICATE_Y; + case SWIZZLE_ZZZZ: + return BRW_PREDICATE_ALIGN16_REPLICATE_Z; + case SWIZZLE_WWWW: + return BRW_PREDICATE_ALIGN16_REPLICATE_W; + default: + _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n", + inst->DstReg.CondMask); + return BRW_PREDICATE_NORMAL; + } +} + +/* Emit the vertex program instructions here. + */ +void brw_vs_emit(struct brw_vs_compile *c ) +{ +#define MAX_IF_DEPTH 32 +#define MAX_LOOP_DEPTH 32 + struct brw_compile *p = &c->func; + struct brw_context *brw = p->brw; + const GLuint nr_insns = c->vp->program.Base.NumInstructions; + GLuint insn, if_depth = 0, loop_depth = 0; + GLuint end_offset = 0; + struct brw_instruction *end_inst, *last_inst; + struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH]; + const struct brw_indirect stack_index = brw_indirect(0, 0); + GLuint index; + GLuint file; + + if (INTEL_DEBUG & DEBUG_VS) { + _mesa_printf("vs-mesa:\n"); + _mesa_print_program(&c->vp->program.Base); + _mesa_printf("\n"); + } + + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_access_mode(p, BRW_ALIGN_16); + + /* Message registers can't be read, so copy the output into GRF register + if they are used in source registers */ + for (insn = 0; insn < nr_insns; insn++) { + GLuint i; + struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn]; + for (i = 0; i < 3; i++) { + struct prog_src_register *src = &inst->SrcReg[i]; + GLuint index = src->Index; + GLuint file = src->File; + if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS) + c->output_regs[index].used_in_src = GL_TRUE; + } + } + + /* Static register allocation + */ + brw_vs_alloc_regs(c); + brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack)); + + for (insn = 0; insn < nr_insns; insn++) { + + const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn]; + struct brw_reg args[3], dst; + GLuint i; + +#if 0 + printf("%d: ", insn); + _mesa_print_instruction(inst); +#endif + + /* Get argument regs. SWZ is special and does this itself. + */ + if (inst->Opcode != OPCODE_SWZ) + for (i = 0; i < 3; i++) { + const struct prog_src_register *src = &inst->SrcReg[i]; + index = src->Index; + file = src->File; + if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src) + args[i] = c->output_regs[index].reg; + else + args[i] = get_arg(c, inst, i); + } + + /* Get dest regs. Note that it is possible for a reg to be both + * dst and arg, given the static allocation of registers. So + * care needs to be taken emitting multi-operation instructions. + */ + index = inst->DstReg.Index; + file = inst->DstReg.File; + if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src) + dst = c->output_regs[index].reg; + else + dst = get_dst(c, inst->DstReg); + + if (inst->SaturateMode != SATURATE_OFF) { + _mesa_problem(NULL, "Unsupported saturate %d in vertex shader", + inst->SaturateMode); + } + + switch (inst->Opcode) { + case OPCODE_ABS: + brw_MOV(p, dst, brw_abs(args[0])); + break; + case OPCODE_ADD: + brw_ADD(p, dst, args[0], args[1]); + break; + case OPCODE_COS: + emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL); + break; + case OPCODE_DP3: + brw_DP3(p, dst, args[0], args[1]); + break; + case OPCODE_DP4: + brw_DP4(p, dst, args[0], args[1]); + break; + case OPCODE_DPH: + brw_DPH(p, dst, args[0], args[1]); + break; + case OPCODE_NRM3: + emit_nrm(c, dst, args[0], 3); + break; + case OPCODE_NRM4: + emit_nrm(c, dst, args[0], 4); + break; + case OPCODE_DST: + unalias2(c, dst, args[0], args[1], emit_dst_noalias); + break; + case OPCODE_EXP: + unalias1(c, dst, args[0], emit_exp_noalias); + break; + case OPCODE_EX2: + emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL); + break; + case OPCODE_ARL: + emit_arl(c, dst, args[0]); + break; + case OPCODE_FLR: + brw_RNDD(p, dst, args[0]); + break; + case OPCODE_FRC: + brw_FRC(p, dst, args[0]); + break; + case OPCODE_LOG: + unalias1(c, dst, args[0], emit_log_noalias); + break; + case OPCODE_LG2: + emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL); + break; + case OPCODE_LIT: + unalias1(c, dst, args[0], emit_lit_noalias); + break; + case OPCODE_LRP: + unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias); + break; + case OPCODE_MAD: + brw_MOV(p, brw_acc_reg(), args[2]); + brw_MAC(p, dst, args[0], args[1]); + break; + case OPCODE_MAX: + emit_max(p, dst, args[0], args[1]); + break; + case OPCODE_MIN: + emit_min(p, dst, args[0], args[1]); + break; + case OPCODE_MOV: + brw_MOV(p, dst, args[0]); + break; + case OPCODE_MUL: + brw_MUL(p, dst, args[0], args[1]); + break; + case OPCODE_POW: + emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL); + break; + case OPCODE_RCP: + emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL); + break; + case OPCODE_RSQ: + emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL); + break; + + case OPCODE_SEQ: + emit_seq(p, dst, args[0], args[1]); + break; + case OPCODE_SIN: + emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL); + break; + case OPCODE_SNE: + emit_sne(p, dst, args[0], args[1]); + break; + case OPCODE_SGE: + emit_sge(p, dst, args[0], args[1]); + break; + case OPCODE_SGT: + emit_sgt(p, dst, args[0], args[1]); + break; + case OPCODE_SLT: + emit_slt(p, dst, args[0], args[1]); + break; + case OPCODE_SLE: + emit_sle(p, dst, args[0], args[1]); + break; + case OPCODE_SUB: + brw_ADD(p, dst, args[0], negate(args[1])); + break; + case OPCODE_SWZ: + /* The args[0] value can't be used here as it won't have + * correctly encoded the full swizzle: + */ + emit_swz(c, dst, inst); + break; + case OPCODE_TRUNC: + /* round toward zero */ + brw_RNDZ(p, dst, args[0]); + break; + case OPCODE_XPD: + emit_xpd(p, dst, args[0], args[1]); + break; + case OPCODE_IF: + assert(if_depth < MAX_IF_DEPTH); + if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8); + /* Note that brw_IF smashes the predicate_control field. */ + if_inst[if_depth]->header.predicate_control = get_predicate(inst); + if_depth++; + break; + case OPCODE_ELSE: + if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]); + break; + case OPCODE_ENDIF: + assert(if_depth > 0); + brw_ENDIF(p, if_inst[--if_depth]); + break; + case OPCODE_BGNLOOP: + loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8); + break; + case OPCODE_BRK: + brw_set_predicate_control(p, get_predicate(inst)); + brw_BREAK(p); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + break; + case OPCODE_CONT: + brw_set_predicate_control(p, get_predicate(inst)); + brw_CONT(p); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + break; + case OPCODE_ENDLOOP: + { + struct brw_instruction *inst0, *inst1; + GLuint br = 1; + + loop_depth--; + + if (BRW_IS_IGDNG(brw)) + br = 2; + + inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]); + /* patch all the BREAK/CONT instructions from last BEGINLOOP */ + while (inst0 > loop_inst[loop_depth]) { + inst0--; + if (inst0->header.opcode == BRW_OPCODE_BREAK) { + inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1); + inst0->bits3.if_else.pop_count = 0; + } + else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) { + inst0->bits3.if_else.jump_count = br * (inst1 - inst0); + inst0->bits3.if_else.pop_count = 0; + } + } + } + break; + case OPCODE_BRA: + brw_set_predicate_control(p, get_predicate(inst)); + brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + break; + case OPCODE_CAL: + brw_set_access_mode(p, BRW_ALIGN_1); + brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16)); + brw_set_access_mode(p, BRW_ALIGN_16); + brw_ADD(p, get_addr_reg(stack_index), + get_addr_reg(stack_index), brw_imm_d(4)); + brw_save_call(p, inst->Comment, p->nr_insn); + brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16)); + break; + case OPCODE_RET: + brw_ADD(p, get_addr_reg(stack_index), + get_addr_reg(stack_index), brw_imm_d(-4)); + brw_set_access_mode(p, BRW_ALIGN_1); + brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0)); + brw_set_access_mode(p, BRW_ALIGN_16); + break; + case OPCODE_END: + end_offset = p->nr_insn; + /* this instruction will get patched later to jump past subroutine + * code, etc. + */ + brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16)); + break; + case OPCODE_PRINT: + /* no-op */ + break; + case OPCODE_BGNSUB: + brw_save_label(p, inst->Comment, p->nr_insn); + break; + case OPCODE_ENDSUB: + /* no-op */ + break; + default: + _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader", + inst->Opcode, inst->Opcode < MAX_OPCODE ? + _mesa_opcode_string(inst->Opcode) : + "unknown"); + } + + /* Set the predication update on the last instruction of the native + * instruction sequence. + * + * This would be problematic if it was set on a math instruction, + * but that shouldn't be the case with the current GLSL compiler. + */ + if (inst->CondUpdate) { + struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1]; + + assert(hw_insn->header.destreg__conditionalmod == 0); + hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ; + } + + if ((inst->DstReg.File == PROGRAM_OUTPUT) + && (inst->DstReg.Index != VERT_RESULT_HPOS) + && c->output_regs[inst->DstReg.Index].used_in_src) { + brw_MOV(p, get_dst(c, inst->DstReg), dst); + } + + /* Result color clamping. + * + * When destination register is an output register and + * it's primary/secondary front/back color, we have to clamp + * the result to [0,1]. This is done by enabling the + * saturation bit for the last instruction. + * + * We don't use brw_set_saturate() as it modifies + * p->current->header.saturate, which affects all the subsequent + * instructions. Instead, we directly modify the header + * of the last (already stored) instruction. + */ + if (inst->DstReg.File == PROGRAM_OUTPUT) { + if ((inst->DstReg.Index == VERT_RESULT_COL0) + || (inst->DstReg.Index == VERT_RESULT_COL1) + || (inst->DstReg.Index == VERT_RESULT_BFC0) + || (inst->DstReg.Index == VERT_RESULT_BFC1)) { + p->store[p->nr_insn-1].header.saturate = 1; + } + } + + release_tmps(c); + } + + end_inst = &p->store[end_offset]; + last_inst = &p->store[p->nr_insn]; + + /* The END instruction will be patched to jump to this code */ + emit_vertex_write(c); + + post_vs_emit(c, end_inst, last_inst); + + if (INTEL_DEBUG & DEBUG_VS) { + int i; + + _mesa_printf("vs-native:\n"); + for (i = 0; i < p->nr_insn; i++) + brw_disasm(stderr, &p->store[i]); + _mesa_printf("\n"); + } +} diff --git a/src/gallium/drivers/i965/brw_vs_state.c b/src/gallium/drivers/i965/brw_vs_state.c new file mode 100644 index 00000000000..d790ab65553 --- /dev/null +++ b/src/gallium/drivers/i965/brw_vs_state.c @@ -0,0 +1,185 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "main/macros.h" + +struct brw_vs_unit_key { + unsigned int total_grf; + unsigned int urb_entry_read_length; + unsigned int curb_entry_read_length; + + unsigned int curbe_offset; + + unsigned int nr_urb_entries, urb_size; + + unsigned int nr_surfaces; +}; + +static void +vs_unit_populate_key(struct brw_context *brw, struct brw_vs_unit_key *key) +{ + GLcontext *ctx = &brw->intel.ctx; + + memset(key, 0, sizeof(*key)); + + /* CACHE_NEW_VS_PROG */ + key->total_grf = brw->vs.prog_data->total_grf; + key->urb_entry_read_length = brw->vs.prog_data->urb_read_length; + key->curb_entry_read_length = brw->vs.prog_data->curb_read_length; + + /* BRW_NEW_URB_FENCE */ + key->nr_urb_entries = brw->urb.nr_vs_entries; + key->urb_size = brw->urb.vsize; + + /* BRW_NEW_NR_VS_SURFACES */ + key->nr_surfaces = brw->vs.nr_surfaces; + + /* BRW_NEW_CURBE_OFFSETS, _NEW_TRANSFORM */ + if (ctx->Transform.ClipPlanesEnabled) { + /* Note that we read in the userclip planes as well, hence + * clip_start: + */ + key->curbe_offset = brw->curbe.clip_start; + } + else { + key->curbe_offset = brw->curbe.vs_start; + } +} + +static dri_bo * +vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key) +{ + struct brw_vs_unit_state vs; + dri_bo *bo; + int chipset_max_threads; + + memset(&vs, 0, sizeof(vs)); + + vs.thread0.kernel_start_pointer = brw->vs.prog_bo->offset >> 6; /* reloc */ + vs.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1; + vs.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754; + /* Choosing multiple program flow means that we may get 2-vertex threads, + * which will have the channel mask for dwords 4-7 enabled in the thread, + * and those dwords will be written to the second URB handle when we + * brw_urb_WRITE() results. + */ + vs.thread1.single_program_flow = 0; + + if (BRW_IS_IGDNG(brw)) + vs.thread1.binding_table_entry_count = 0; /* hardware requirement */ + else + vs.thread1.binding_table_entry_count = key->nr_surfaces; + + vs.thread3.urb_entry_read_length = key->urb_entry_read_length; + vs.thread3.const_urb_entry_read_length = key->curb_entry_read_length; + vs.thread3.dispatch_grf_start_reg = 1; + vs.thread3.urb_entry_read_offset = 0; + vs.thread3.const_urb_entry_read_offset = key->curbe_offset * 2; + + if (BRW_IS_IGDNG(brw)) + vs.thread4.nr_urb_entries = key->nr_urb_entries >> 2; + else + vs.thread4.nr_urb_entries = key->nr_urb_entries; + + vs.thread4.urb_entry_allocation_size = key->urb_size - 1; + + if (BRW_IS_IGDNG(brw)) + chipset_max_threads = 72; + else if (BRW_IS_G4X(brw)) + chipset_max_threads = 32; + else + chipset_max_threads = 16; + vs.thread4.max_threads = CLAMP(key->nr_urb_entries / 2, + 1, chipset_max_threads) - 1; + + if (INTEL_DEBUG & DEBUG_SINGLE_THREAD) + vs.thread4.max_threads = 0; + + /* No samplers for ARB_vp programs: + */ + /* It has to be set to 0 for IGDNG + */ + vs.vs5.sampler_count = 0; + + if (INTEL_DEBUG & DEBUG_STATS) + vs.thread4.stats_enable = 1; + + /* Vertex program always enabled: + */ + vs.vs6.vs_enable = 1; + + bo = brw_upload_cache(&brw->cache, BRW_VS_UNIT, + key, sizeof(*key), + &brw->vs.prog_bo, 1, + &vs, sizeof(vs), + NULL, NULL); + + /* Emit VS program relocation */ + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + vs.thread0.grf_reg_count << 1, + offsetof(struct brw_vs_unit_state, thread0), + brw->vs.prog_bo); + + return bo; +} + +static void prepare_vs_unit(struct brw_context *brw) +{ + struct brw_vs_unit_key key; + + vs_unit_populate_key(brw, &key); + + dri_bo_unreference(brw->vs.state_bo); + brw->vs.state_bo = brw_search_cache(&brw->cache, BRW_VS_UNIT, + &key, sizeof(key), + &brw->vs.prog_bo, 1, + NULL); + if (brw->vs.state_bo == NULL) { + brw->vs.state_bo = vs_unit_create_from_key(brw, &key); + } +} + +const struct brw_tracked_state brw_vs_unit = { + .dirty = { + .mesa = _NEW_TRANSFORM, + .brw = (BRW_NEW_CURBE_OFFSETS | + BRW_NEW_NR_VS_SURFACES | + BRW_NEW_URB_FENCE), + .cache = CACHE_NEW_VS_PROG + }, + .prepare = prepare_vs_unit, +}; diff --git a/src/gallium/drivers/i965/brw_vs_surface_state.c b/src/gallium/drivers/i965/brw_vs_surface_state.c new file mode 100644 index 00000000000..89f47522a1c --- /dev/null +++ b/src/gallium/drivers/i965/brw_vs_surface_state.c @@ -0,0 +1,226 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + +#include "main/mtypes.h" +#include "main/texformat.h" +#include "main/texstore.h" +#include "shader/prog_parameter.h" + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" + +/* Creates a new VS constant buffer reflecting the current VS program's + * constants, if needed by the VS program. + * + * Otherwise, constants go through the CURBEs using the brw_constant_buffer + * state atom. + */ +static drm_intel_bo * +brw_vs_update_constant_buffer(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + struct brw_vertex_program *vp = + (struct brw_vertex_program *) brw->vertex_program; + const struct gl_program_parameter_list *params = vp->program.Base.Parameters; + const int size = params->NumParameters * 4 * sizeof(GLfloat); + drm_intel_bo *const_buffer; + + /* BRW_NEW_VERTEX_PROGRAM */ + if (!vp->use_const_buffer) + return NULL; + + const_buffer = drm_intel_bo_alloc(intel->bufmgr, "vp_const_buffer", + size, 64); + + /* _NEW_PROGRAM_CONSTANTS */ + dri_bo_subdata(const_buffer, 0, size, params->ParameterValues); + + return const_buffer; +} + +/** + * Update the surface state for a VS constant buffer. + * + * Sets brw->vs.surf_bo[surf] and brw->vp->const_buffer. + */ +static void +brw_update_vs_constant_surface( GLcontext *ctx, + GLuint surf) +{ + struct brw_context *brw = brw_context(ctx); + struct brw_surface_key key; + struct brw_vertex_program *vp = + (struct brw_vertex_program *) brw->vertex_program; + const struct gl_program_parameter_list *params = vp->program.Base.Parameters; + + assert(surf == 0); + + /* If we're in this state update atom, we need to update VS constants, so + * free the old buffer and create a new one for the new contents. + */ + dri_bo_unreference(vp->const_buffer); + vp->const_buffer = brw_vs_update_constant_buffer(brw); + + /* If there's no constant buffer, then no surface BO is needed to point at + * it. + */ + if (vp->const_buffer == 0) { + drm_intel_bo_unreference(brw->vs.surf_bo[surf]); + brw->vs.surf_bo[surf] = NULL; + return; + } + + memset(&key, 0, sizeof(key)); + + key.format = MESA_FORMAT_RGBA_FLOAT32; + key.internal_format = GL_RGBA; + key.bo = vp->const_buffer; + key.depthmode = GL_NONE; + key.pitch = params->NumParameters; + key.width = params->NumParameters; + key.height = 1; + key.depth = 1; + key.cpp = 16; + + /* + printf("%s:\n", __FUNCTION__); + printf(" width %d height %d depth %d cpp %d pitch %d\n", + key.width, key.height, key.depth, key.cpp, key.pitch); + */ + + drm_intel_bo_unreference(brw->vs.surf_bo[surf]); + brw->vs.surf_bo[surf] = brw_search_cache(&brw->surface_cache, + BRW_SS_SURFACE, + &key, sizeof(key), + &key.bo, key.bo ? 1 : 0, + NULL); + if (brw->vs.surf_bo[surf] == NULL) { + brw->vs.surf_bo[surf] = brw_create_constant_surface(brw, &key); + } +} + + +/** + * Constructs the binding table for the VS surface state. + */ +static dri_bo * +brw_vs_get_binding_table(struct brw_context *brw) +{ + dri_bo *bind_bo; + + bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND, + NULL, 0, + brw->vs.surf_bo, BRW_VS_MAX_SURF, + NULL); + + if (bind_bo == NULL) { + GLuint data_size = BRW_VS_MAX_SURF * sizeof(GLuint); + uint32_t *data = malloc(data_size); + int i; + + for (i = 0; i < BRW_VS_MAX_SURF; i++) + if (brw->vs.surf_bo[i]) + data[i] = brw->vs.surf_bo[i]->offset; + else + data[i] = 0; + + bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND, + NULL, 0, + brw->vs.surf_bo, BRW_VS_MAX_SURF, + data, data_size, + NULL, NULL); + + /* Emit binding table relocations to surface state */ + for (i = 0; i < BRW_VS_MAX_SURF; i++) { + if (brw->vs.surf_bo[i] != NULL) { + /* The presumed offsets were set in the data values for + * brw_upload_cache. + */ + drm_intel_bo_emit_reloc(bind_bo, i * 4, + brw->vs.surf_bo[i], 0, + I915_GEM_DOMAIN_INSTRUCTION, 0); + } + } + + free(data); + } + + return bind_bo; +} + +/** + * Vertex shader surfaces (constant buffer). + * + * This consumes the state updates for the constant buffer needing + * to be updated, and produces BRW_NEW_NR_VS_SURFACES for the VS unit and + * CACHE_NEW_SURF_BIND for the binding table upload. + */ +static void prepare_vs_surfaces(struct brw_context *brw ) +{ + GLcontext *ctx = &brw->intel.ctx; + int i; + int nr_surfaces = 0; + + brw_update_vs_constant_surface(ctx, SURF_INDEX_VERT_CONST_BUFFER); + + for (i = 0; i < BRW_VS_MAX_SURF; i++) { + if (brw->vs.surf_bo[i] != NULL) { + nr_surfaces = i + 1; + } + } + + if (brw->vs.nr_surfaces != nr_surfaces) { + brw->state.dirty.brw |= BRW_NEW_NR_VS_SURFACES; + brw->vs.nr_surfaces = nr_surfaces; + } + + /* Note that we don't end up updating the bind_bo if we don't have a + * surface to be pointing at. This should be relatively harmless, as it + * just slightly increases our working set size. + */ + if (brw->vs.nr_surfaces != 0) { + dri_bo_unreference(brw->vs.bind_bo); + brw->vs.bind_bo = brw_vs_get_binding_table(brw); + } +} + +const struct brw_tracked_state brw_vs_surfaces = { + .dirty = { + .mesa = (_NEW_PROGRAM_CONSTANTS), + .brw = (BRW_NEW_VERTEX_PROGRAM), + .cache = 0 + }, + .prepare = prepare_vs_surfaces, +}; + + + diff --git a/src/gallium/drivers/i965/brw_wm.c b/src/gallium/drivers/i965/brw_wm.c new file mode 100644 index 00000000000..2292de94c44 --- /dev/null +++ b/src/gallium/drivers/i965/brw_wm.c @@ -0,0 +1,375 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + +#include "main/texformat.h" +#include "brw_context.h" +#include "brw_util.h" +#include "brw_wm.h" +#include "brw_state.h" + + +/** Return number of src args for given instruction */ +GLuint brw_wm_nr_args( GLuint opcode ) +{ + switch (opcode) { + case WM_FRONTFACING: + case WM_PIXELXY: + return 0; + case WM_CINTERP: + case WM_WPOSXY: + case WM_DELTAXY: + return 1; + case WM_LINTERP: + case WM_PIXELW: + return 2; + case WM_FB_WRITE: + case WM_PINTERP: + return 3; + default: + assert(opcode < MAX_OPCODE); + return _mesa_num_inst_src_regs(opcode); + } +} + + +GLuint brw_wm_is_scalar_result( GLuint opcode ) +{ + switch (opcode) { + case OPCODE_COS: + case OPCODE_EX2: + case OPCODE_LG2: + case OPCODE_POW: + case OPCODE_RCP: + case OPCODE_RSQ: + case OPCODE_SIN: + case OPCODE_DP3: + case OPCODE_DP4: + case OPCODE_DPH: + case OPCODE_DST: + return 1; + + default: + return 0; + } +} + + +/** + * Do GPU code generation for non-GLSL shader. non-GLSL shaders have + * no flow control instructions so we can more readily do SSA-style + * optimizations. + */ +static void +brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c) +{ + /* Augment fragment program. Add instructions for pre- and + * post-fragment-program tasks such as interpolation and fogging. + */ + brw_wm_pass_fp(c); + + /* Translate to intermediate representation. Build register usage + * chains. + */ + brw_wm_pass0(c); + + /* Dead code removal. + */ + brw_wm_pass1(c); + + /* Register allocation. + * Divide by two because we operate on 16 pixels at a time and require + * two GRF entries for each logical shader register. + */ + c->grf_limit = BRW_WM_MAX_GRF / 2; + + brw_wm_pass2(c); + + /* how many general-purpose registers are used */ + c->prog_data.total_grf = c->max_wm_grf; + + /* Scratch space is used for register spilling */ + if (c->last_scratch) { + c->prog_data.total_scratch = c->last_scratch + 0x40; + } + else { + c->prog_data.total_scratch = 0; + } + + /* Emit GEN4 code. + */ + brw_wm_emit(c); +} + + +/** + * All Mesa program -> GPU code generation goes through this function. + * Depending on the instructions used (i.e. flow control instructions) + * we'll use one of two code generators. + */ +static void do_wm_prog( struct brw_context *brw, + struct brw_fragment_program *fp, + struct brw_wm_prog_key *key) +{ + struct brw_wm_compile *c; + const GLuint *program; + GLuint program_size; + + c = brw->wm.compile_data; + if (c == NULL) { + brw->wm.compile_data = calloc(1, sizeof(*brw->wm.compile_data)); + c = brw->wm.compile_data; + if (c == NULL) { + /* Ouch - big out of memory problem. Can't continue + * without triggering a segfault, no way to signal, + * so just return. + */ + return; + } + } else { + memset(c, 0, sizeof(*brw->wm.compile_data)); + } + memcpy(&c->key, key, sizeof(*key)); + + c->fp = fp; + c->env_param = brw->intel.ctx.FragmentProgram.Parameters; + + brw_init_compile(brw, &c->func); + + /* temporary sanity check assertion */ + ASSERT(fp->isGLSL == brw_wm_is_glsl(&c->fp->program)); + + /* + * Shader which use GLSL features such as flow control are handled + * differently from "simple" shaders. + */ + if (fp->isGLSL) { + c->dispatch_width = 8; + brw_wm_glsl_emit(brw, c); + } + else { + c->dispatch_width = 16; + brw_wm_non_glsl_emit(brw, c); + } + + if (INTEL_DEBUG & DEBUG_WM) + fprintf(stderr, "\n"); + + /* get the program + */ + program = brw_get_program(&c->func, &program_size); + + dri_bo_unreference(brw->wm.prog_bo); + brw->wm.prog_bo = brw_upload_cache( &brw->cache, BRW_WM_PROG, + &c->key, sizeof(c->key), + NULL, 0, + program, program_size, + &c->prog_data, + &brw->wm.prog_data ); +} + + + +static void brw_wm_populate_key( struct brw_context *brw, + struct brw_wm_prog_key *key ) +{ + GLcontext *ctx = &brw->intel.ctx; + /* BRW_NEW_FRAGMENT_PROGRAM */ + const struct brw_fragment_program *fp = + (struct brw_fragment_program *)brw->fragment_program; + GLboolean uses_depth = (fp->program.Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0; + GLuint lookup = 0; + GLuint line_aa; + GLuint i; + + memset(key, 0, sizeof(*key)); + + /* Build the index for table lookup + */ + /* _NEW_COLOR */ + if (fp->program.UsesKill || + ctx->Color.AlphaEnabled) + lookup |= IZ_PS_KILL_ALPHATEST_BIT; + + if (fp->program.Base.OutputsWritten & (1<<FRAG_RESULT_DEPTH)) + lookup |= IZ_PS_COMPUTES_DEPTH_BIT; + + /* _NEW_DEPTH */ + if (ctx->Depth.Test) + lookup |= IZ_DEPTH_TEST_ENABLE_BIT; + + if (ctx->Depth.Test && + ctx->Depth.Mask) /* ?? */ + lookup |= IZ_DEPTH_WRITE_ENABLE_BIT; + + /* _NEW_STENCIL */ + if (ctx->Stencil._Enabled) { + lookup |= IZ_STENCIL_TEST_ENABLE_BIT; + + if (ctx->Stencil.WriteMask[0] || + ctx->Stencil.WriteMask[ctx->Stencil._BackFace]) + lookup |= IZ_STENCIL_WRITE_ENABLE_BIT; + } + + line_aa = AA_NEVER; + + /* _NEW_LINE, _NEW_POLYGON, BRW_NEW_REDUCED_PRIMITIVE */ + if (ctx->Line.SmoothFlag) { + if (brw->intel.reduced_primitive == GL_LINES) { + line_aa = AA_ALWAYS; + } + else if (brw->intel.reduced_primitive == GL_TRIANGLES) { + if (ctx->Polygon.FrontMode == GL_LINE) { + line_aa = AA_SOMETIMES; + + if (ctx->Polygon.BackMode == GL_LINE || + (ctx->Polygon.CullFlag && + ctx->Polygon.CullFaceMode == GL_BACK)) + line_aa = AA_ALWAYS; + } + else if (ctx->Polygon.BackMode == GL_LINE) { + line_aa = AA_SOMETIMES; + + if ((ctx->Polygon.CullFlag && + ctx->Polygon.CullFaceMode == GL_FRONT)) + line_aa = AA_ALWAYS; + } + } + } + + brw_wm_lookup_iz(line_aa, + lookup, + uses_depth, + key); + + + /* BRW_NEW_WM_INPUT_DIMENSIONS */ + key->proj_attrib_mask = brw->wm.input_size_masks[4-1]; + + /* _NEW_LIGHT */ + key->flat_shade = (ctx->Light.ShadeModel == GL_FLAT); + + /* _NEW_HINT */ + key->linear_color = (ctx->Hint.PerspectiveCorrection == GL_FASTEST); + + /* _NEW_TEXTURE */ + for (i = 0; i < BRW_MAX_TEX_UNIT; i++) { + const struct gl_texture_unit *unit = &ctx->Texture.Unit[i]; + + if (unit->_ReallyEnabled) { + const struct gl_texture_object *t = unit->_Current; + const struct gl_texture_image *img = t->Image[0][t->BaseLevel]; + if (img->InternalFormat == GL_YCBCR_MESA) { + key->yuvtex_mask |= 1 << i; + if (img->TexFormat->MesaFormat == MESA_FORMAT_YCBCR) + key->yuvtex_swap_mask |= 1 << i; + } + + key->tex_swizzles[i] = t->_Swizzle; + } + else { + key->tex_swizzles[i] = SWIZZLE_NOOP; + } + } + + /* Shadow */ + key->shadowtex_mask = fp->program.Base.ShadowSamplers; + + /* _NEW_BUFFERS */ + /* + * Include the draw buffer origin and height so that we can calculate + * fragment position values relative to the bottom left of the drawable, + * from the incoming screen origin relative position we get as part of our + * payload. + * + * We could avoid recompiling by including this as a constant referenced by + * our program, but if we were to do that it would also be nice to handle + * getting that constant updated at batchbuffer submit time (when we + * hold the lock and know where the buffer really is) rather than at emit + * time when we don't hold the lock and are just guessing. We could also + * just avoid using this as key data if the program doesn't use + * fragment.position. + * + * This pretty much becomes moot with DRI2 and redirected buffers anyway, + * as our origins will always be zero then. + */ + if (brw->intel.driDrawable != NULL) { + key->origin_x = brw->intel.driDrawable->x; + key->origin_y = brw->intel.driDrawable->y; + key->drawable_height = brw->intel.driDrawable->h; + } + + /* CACHE_NEW_VS_PROG */ + key->vp_outputs_written = brw->vs.prog_data->outputs_written & DO_SETUP_BITS; + + /* The unique fragment program ID */ + key->program_string_id = fp->id; +} + + +static void brw_prepare_wm_prog(struct brw_context *brw) +{ + struct brw_wm_prog_key key; + struct brw_fragment_program *fp = (struct brw_fragment_program *) + brw->fragment_program; + + brw_wm_populate_key(brw, &key); + + /* Make an early check for the key. + */ + dri_bo_unreference(brw->wm.prog_bo); + brw->wm.prog_bo = brw_search_cache(&brw->cache, BRW_WM_PROG, + &key, sizeof(key), + NULL, 0, + &brw->wm.prog_data); + if (brw->wm.prog_bo == NULL) + do_wm_prog(brw, fp, &key); +} + + +const struct brw_tracked_state brw_wm_prog = { + .dirty = { + .mesa = (_NEW_COLOR | + _NEW_DEPTH | + _NEW_HINT | + _NEW_STENCIL | + _NEW_POLYGON | + _NEW_LINE | + _NEW_LIGHT | + _NEW_BUFFERS | + _NEW_TEXTURE), + .brw = (BRW_NEW_FRAGMENT_PROGRAM | + BRW_NEW_WM_INPUT_DIMENSIONS | + BRW_NEW_REDUCED_PRIMITIVE), + .cache = CACHE_NEW_VS_PROG, + }, + .prepare = brw_prepare_wm_prog +}; + diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h new file mode 100644 index 00000000000..872b1f3ecf4 --- /dev/null +++ b/src/gallium/drivers/i965/brw_wm.h @@ -0,0 +1,309 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#ifndef BRW_WM_H +#define BRW_WM_H + + +#include "shader/prog_instruction.h" +#include "brw_context.h" +#include "brw_eu.h" + +#define SATURATE (1<<5) + +/* A big lookup table is used to figure out which and how many + * additional regs will inserted before the main payload in the WM + * program execution. These mainly relate to depth and stencil + * processing and the early-depth-test optimization. + */ +#define IZ_PS_KILL_ALPHATEST_BIT 0x1 +#define IZ_PS_COMPUTES_DEPTH_BIT 0x2 +#define IZ_DEPTH_WRITE_ENABLE_BIT 0x4 +#define IZ_DEPTH_TEST_ENABLE_BIT 0x8 +#define IZ_STENCIL_WRITE_ENABLE_BIT 0x10 +#define IZ_STENCIL_TEST_ENABLE_BIT 0x20 +#define IZ_BIT_MAX 0x40 + +#define AA_NEVER 0 +#define AA_SOMETIMES 1 +#define AA_ALWAYS 2 + +struct brw_wm_prog_key { + GLuint source_depth_reg:3; + GLuint aa_dest_stencil_reg:3; + GLuint dest_depth_reg:3; + GLuint nr_depth_regs:3; + GLuint computes_depth:1; /* could be derived from program string */ + GLuint source_depth_to_render_target:1; + GLuint flat_shade:1; + GLuint linear_color:1; /**< linear interpolation vs perspective interp */ + GLuint runtime_check_aads_emit:1; + + GLbitfield proj_attrib_mask; /**< one bit per fragment program attribute */ + GLuint shadowtex_mask:16; + GLuint yuvtex_mask:16; + GLuint yuvtex_swap_mask:16; /* UV swaped */ + + GLuint tex_swizzles[BRW_MAX_TEX_UNIT]; + + GLuint program_string_id:32; + GLuint origin_x, origin_y; + GLuint drawable_height; + GLuint vp_outputs_written; +}; + + +/* A bit of a glossary: + * + * brw_wm_value: A computed value or program input. Values are + * constant, they are created once and are never modified. When a + * fragment program register is written or overwritten, new values are + * created fresh, preserving the rule that values are constant. + * + * brw_wm_ref: A reference to a value. Wherever a value used is by an + * instruction or as a program output, that is tracked with an + * instance of this struct. All references to a value occur after it + * is created. After the last reference, a value is dead and can be + * discarded. + * + * brw_wm_grf: Represents a physical hardware register. May be either + * empty or hold a value. Register allocation is the process of + * assigning values to grf registers. This occurs in pass2 and the + * brw_wm_grf struct is not used before that. + * + * Fragment program registers: These are time-varying constructs that + * are hard to reason about and which we translate away in pass0. A + * single fragment program register element (eg. temp[0].x) will be + * translated to one or more brw_wm_value structs, one for each time + * that temp[0].x is written to during the program. + */ + + + +/* Used in pass2 to track register allocation. + */ +struct brw_wm_grf { + struct brw_wm_value *value; + GLuint nextuse; +}; + +struct brw_wm_value { + struct brw_reg hw_reg; /* emitted to this reg, may not always be there */ + struct brw_wm_ref *lastuse; + struct brw_wm_grf *resident; + GLuint contributes_to_output:1; + GLuint spill_slot:16; /* if non-zero, spill immediately after calculation */ +}; + +struct brw_wm_ref { + struct brw_reg hw_reg; /* nr filled in in pass2, everything else, pass0 */ + struct brw_wm_value *value; + struct brw_wm_ref *prevuse; + GLuint unspill_reg:7; /* unspill to reg */ + GLuint emitted:1; + GLuint insn:24; +}; + +struct brw_wm_constref { + const struct brw_wm_ref *ref; + GLfloat constval; +}; + + +struct brw_wm_instruction { + struct brw_wm_value *dst[4]; + struct brw_wm_ref *src[3][4]; + GLuint opcode:8; + GLuint saturate:1; + GLuint writemask:4; + GLuint tex_unit:4; /* texture unit for TEX, TXD, TXP instructions */ + GLuint tex_idx:3; /* TEXTURE_1D,2D,3D,CUBE,RECT_INDEX source target */ + GLuint tex_shadow:1; /* do shadow comparison? */ + GLuint eot:1; /* End of thread indicator for FB_WRITE*/ + GLuint target:10; /* target binding table index for FB_WRITE*/ +}; + + +#define BRW_WM_MAX_INSN (MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS*3 + FRAG_ATTRIB_MAX + 3) +#define BRW_WM_MAX_GRF 128 /* hardware limit */ +#define BRW_WM_MAX_VREG (BRW_WM_MAX_INSN * 4) +#define BRW_WM_MAX_REF (BRW_WM_MAX_INSN * 12) +#define BRW_WM_MAX_PARAM 256 +#define BRW_WM_MAX_CONST 256 +#define BRW_WM_MAX_KILLS MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS +#define BRW_WM_MAX_SUBROUTINE 16 + + + +/* New opcodes to track internal operations required for WM unit. + * These are added early so that the registers used can be tracked, + * freed and reused like those of other instructions. + */ +#define WM_PIXELXY (MAX_OPCODE) +#define WM_DELTAXY (MAX_OPCODE + 1) +#define WM_PIXELW (MAX_OPCODE + 2) +#define WM_LINTERP (MAX_OPCODE + 3) +#define WM_PINTERP (MAX_OPCODE + 4) +#define WM_CINTERP (MAX_OPCODE + 5) +#define WM_WPOSXY (MAX_OPCODE + 6) +#define WM_FB_WRITE (MAX_OPCODE + 7) +#define WM_FRONTFACING (MAX_OPCODE + 8) +#define MAX_WM_OPCODE (MAX_OPCODE + 9) + +#define PROGRAM_PAYLOAD (PROGRAM_FILE_MAX) +#define PAYLOAD_DEPTH (FRAG_ATTRIB_MAX) + +struct brw_wm_compile { + struct brw_compile func; + struct brw_wm_prog_key key; + struct brw_wm_prog_data prog_data; + + struct brw_fragment_program *fp; + + GLfloat (*env_param)[4]; + + enum { + START, + PASS2_DONE + } state; + + /* Initial pass - translate fp instructions to fp instructions, + * simplifying and adding instructions for interpolation and + * framebuffer writes. + */ + struct prog_instruction prog_instructions[BRW_WM_MAX_INSN]; + GLuint nr_fp_insns; + GLuint fp_temp; + GLuint fp_interp_emitted; + GLuint fp_fragcolor_emitted; + + struct prog_src_register pixel_xy; + struct prog_src_register delta_xy; + struct prog_src_register pixel_w; + + + struct brw_wm_value vreg[BRW_WM_MAX_VREG]; + GLuint nr_vreg; + + struct brw_wm_value creg[BRW_WM_MAX_PARAM]; + GLuint nr_creg; + + struct { + struct brw_wm_value depth[4]; /* includes r0/r1 */ + struct brw_wm_value input_interp[FRAG_ATTRIB_MAX]; + } payload; + + + const struct brw_wm_ref *pass0_fp_reg[PROGRAM_PAYLOAD+1][256][4]; + + struct brw_wm_ref undef_ref; + struct brw_wm_value undef_value; + + struct brw_wm_ref refs[BRW_WM_MAX_REF]; + GLuint nr_refs; + + struct brw_wm_instruction instruction[BRW_WM_MAX_INSN]; + GLuint nr_insns; + + struct brw_wm_constref constref[BRW_WM_MAX_CONST]; + GLuint nr_constrefs; + + struct brw_wm_grf pass2_grf[BRW_WM_MAX_GRF/2]; + + GLuint grf_limit; + GLuint max_wm_grf; + GLuint last_scratch; + + GLuint cur_inst; /**< index of current instruction */ + + GLboolean out_of_regs; /**< ran out of GRF registers? */ + + /** Mapping from Mesa registers to hardware registers */ + struct { + GLboolean inited; + struct brw_reg reg; + } wm_regs[PROGRAM_PAYLOAD+1][256][4]; + + GLboolean used_grf[BRW_WM_MAX_GRF]; + GLuint first_free_grf; + struct brw_reg stack; + struct brw_reg emit_mask_reg; + GLuint tmp_regs[BRW_WM_MAX_GRF]; + GLuint tmp_index; + GLuint tmp_max; + GLuint subroutines[BRW_WM_MAX_SUBROUTINE]; + GLuint dispatch_width; + + /** we may need up to 3 constants per instruction (if use_const_buffer) */ + struct { + GLint index; + struct brw_reg reg; + } current_const[3]; +}; + + +GLuint brw_wm_nr_args( GLuint opcode ); +GLuint brw_wm_is_scalar_result( GLuint opcode ); + +void brw_wm_pass_fp( struct brw_wm_compile *c ); +void brw_wm_pass0( struct brw_wm_compile *c ); +void brw_wm_pass1( struct brw_wm_compile *c ); +void brw_wm_pass2( struct brw_wm_compile *c ); +void brw_wm_emit( struct brw_wm_compile *c ); + +void brw_wm_print_value( struct brw_wm_compile *c, + struct brw_wm_value *value ); + +void brw_wm_print_ref( struct brw_wm_compile *c, + struct brw_wm_ref *ref ); + +void brw_wm_print_insn( struct brw_wm_compile *c, + struct brw_wm_instruction *inst ); + +void brw_wm_print_program( struct brw_wm_compile *c, + const char *stage ); + +void brw_wm_lookup_iz( GLuint line_aa, + GLuint lookup, + GLboolean ps_uses_depth, + struct brw_wm_prog_key *key ); + +GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp); +void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c); + +void emit_ddxy(struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + GLboolean is_ddx, + const struct brw_reg *arg0); + +#endif diff --git a/src/gallium/drivers/i965/brw_wm_debug.c b/src/gallium/drivers/i965/brw_wm_debug.c new file mode 100644 index 00000000000..220821087c1 --- /dev/null +++ b/src/gallium/drivers/i965/brw_wm_debug.c @@ -0,0 +1,174 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "brw_context.h" +#include "brw_wm.h" + + +void brw_wm_print_value( struct brw_wm_compile *c, + struct brw_wm_value *value ) +{ + assert(value); + if (c->state >= PASS2_DONE) + brw_print_reg(value->hw_reg); + else if( value == &c->undef_value ) + _mesa_printf("undef"); + else if( value - c->vreg >= 0 && + value - c->vreg < BRW_WM_MAX_VREG) + _mesa_printf("r%d", value - c->vreg); + else if (value - c->creg >= 0 && + value - c->creg < BRW_WM_MAX_PARAM) + _mesa_printf("c%d", value - c->creg); + else if (value - c->payload.input_interp >= 0 && + value - c->payload.input_interp < FRAG_ATTRIB_MAX) + _mesa_printf("i%d", value - c->payload.input_interp); + else if (value - c->payload.depth >= 0 && + value - c->payload.depth < FRAG_ATTRIB_MAX) + _mesa_printf("d%d", value - c->payload.depth); + else + _mesa_printf("?"); +} + +void brw_wm_print_ref( struct brw_wm_compile *c, + struct brw_wm_ref *ref ) +{ + struct brw_reg hw_reg = ref->hw_reg; + + if (ref->unspill_reg) + _mesa_printf("UNSPILL(%x)/", ref->value->spill_slot); + + if (c->state >= PASS2_DONE) + brw_print_reg(ref->hw_reg); + else { + _mesa_printf("%s", hw_reg.negate ? "-" : ""); + _mesa_printf("%s", hw_reg.abs ? "abs/" : ""); + brw_wm_print_value(c, ref->value); + if ((hw_reg.nr&1) || hw_reg.subnr) { + _mesa_printf("->%d.%d", (hw_reg.nr&1), hw_reg.subnr); + } + } +} + +void brw_wm_print_insn( struct brw_wm_compile *c, + struct brw_wm_instruction *inst ) +{ + GLuint i, arg; + GLuint nr_args = brw_wm_nr_args(inst->opcode); + + _mesa_printf("["); + for (i = 0; i < 4; i++) { + if (inst->dst[i]) { + brw_wm_print_value(c, inst->dst[i]); + if (inst->dst[i]->spill_slot) + _mesa_printf("/SPILL(%x)",inst->dst[i]->spill_slot); + } + else + _mesa_printf("#"); + if (i < 3) + _mesa_printf(","); + } + _mesa_printf("]"); + + if (inst->writemask != WRITEMASK_XYZW) + _mesa_printf(".%s%s%s%s", + GET_BIT(inst->writemask, 0) ? "x" : "", + GET_BIT(inst->writemask, 1) ? "y" : "", + GET_BIT(inst->writemask, 2) ? "z" : "", + GET_BIT(inst->writemask, 3) ? "w" : ""); + + switch (inst->opcode) { + case WM_PIXELXY: + _mesa_printf(" = PIXELXY"); + break; + case WM_DELTAXY: + _mesa_printf(" = DELTAXY"); + break; + case WM_PIXELW: + _mesa_printf(" = PIXELW"); + break; + case WM_WPOSXY: + _mesa_printf(" = WPOSXY"); + break; + case WM_PINTERP: + _mesa_printf(" = PINTERP"); + break; + case WM_LINTERP: + _mesa_printf(" = LINTERP"); + break; + case WM_CINTERP: + _mesa_printf(" = CINTERP"); + break; + case WM_FB_WRITE: + _mesa_printf(" = FB_WRITE"); + break; + case WM_FRONTFACING: + _mesa_printf(" = FRONTFACING"); + break; + default: + _mesa_printf(" = %s", _mesa_opcode_string(inst->opcode)); + break; + } + + if (inst->saturate) + _mesa_printf("_SAT"); + + for (arg = 0; arg < nr_args; arg++) { + + _mesa_printf(" ["); + + for (i = 0; i < 4; i++) { + if (inst->src[arg][i]) { + brw_wm_print_ref(c, inst->src[arg][i]); + } + else + _mesa_printf("%%"); + + if (i < 3) + _mesa_printf(","); + else + _mesa_printf("]"); + } + } + _mesa_printf("\n"); +} + +void brw_wm_print_program( struct brw_wm_compile *c, + const char *stage ) +{ + GLuint insn; + + _mesa_printf("%s:\n", stage); + for (insn = 0; insn < c->nr_insns; insn++) + brw_wm_print_insn(c, &c->instruction[insn]); + _mesa_printf("\n"); +} + diff --git a/src/gallium/drivers/i965/brw_wm_emit.c b/src/gallium/drivers/i965/brw_wm_emit.c new file mode 100644 index 00000000000..bf80a2942a4 --- /dev/null +++ b/src/gallium/drivers/i965/brw_wm_emit.c @@ -0,0 +1,1509 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "main/macros.h" +#include "brw_context.h" +#include "brw_wm.h" + +/* Not quite sure how correct this is - need to understand horiz + * vs. vertical strides a little better. + */ +static INLINE struct brw_reg sechalf( struct brw_reg reg ) +{ + if (reg.vstride) + reg.nr++; + return reg; +} + +/* Payload R0: + * + * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles, + * corresponding to each of the 16 execution channels. + * R0.1..8 -- ? + * R1.0 -- triangle vertex 0.X + * R1.1 -- triangle vertex 0.Y + * R1.2 -- tile 0 x,y coords (2 packed uwords) + * R1.3 -- tile 1 x,y coords (2 packed uwords) + * R1.4 -- tile 2 x,y coords (2 packed uwords) + * R1.5 -- tile 3 x,y coords (2 packed uwords) + * R1.6 -- ? + * R1.7 -- ? + * R1.8 -- ? + */ + + +static void emit_pixel_xy(struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask) +{ + struct brw_reg r1 = brw_vec1_grf(1, 0); + struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW); + + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + + /* Calculate pixel centers by adding 1 or 0 to each of the + * micro-tile coordinates passed in r1. + */ + if (mask & WRITEMASK_X) { + brw_ADD(p, + vec16(retype(dst[0], BRW_REGISTER_TYPE_UW)), + stride(suboffset(r1_uw, 4), 2, 4, 0), + brw_imm_v(0x10101010)); + } + + if (mask & WRITEMASK_Y) { + brw_ADD(p, + vec16(retype(dst[1], BRW_REGISTER_TYPE_UW)), + stride(suboffset(r1_uw,5), 2, 4, 0), + brw_imm_v(0x11001100)); + } + + brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); +} + + + +static void emit_delta_xy(struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0) +{ + struct brw_reg r1 = brw_vec1_grf(1, 0); + + /* Calc delta X,Y by subtracting origin in r1 from the pixel + * centers. + */ + if (mask & WRITEMASK_X) { + brw_ADD(p, + dst[0], + retype(arg0[0], BRW_REGISTER_TYPE_UW), + negate(r1)); + } + + if (mask & WRITEMASK_Y) { + brw_ADD(p, + dst[1], + retype(arg0[1], BRW_REGISTER_TYPE_UW), + negate(suboffset(r1,1))); + + } +} + +static void emit_wpos_xy(struct brw_wm_compile *c, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0) +{ + struct brw_compile *p = &c->func; + + /* Calculate the pixel offset from window bottom left into destination + * X and Y channels. + */ + if (mask & WRITEMASK_X) { + /* X' = X - origin */ + brw_ADD(p, + dst[0], + retype(arg0[0], BRW_REGISTER_TYPE_W), + brw_imm_d(0 - c->key.origin_x)); + } + + if (mask & WRITEMASK_Y) { + /* Y' = height - (Y - origin_y) = height + origin_y - Y */ + brw_ADD(p, + dst[1], + negate(retype(arg0[1], BRW_REGISTER_TYPE_W)), + brw_imm_d(c->key.origin_y + c->key.drawable_height - 1)); + } +} + + +static void emit_pixel_w( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0, + const struct brw_reg *deltas) +{ + /* Don't need this if all you are doing is interpolating color, for + * instance. + */ + if (mask & WRITEMASK_W) { + struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4); + + /* Calc 1/w - just linterp wpos[3] optimized by putting the + * result straight into a message reg. + */ + brw_LINE(p, brw_null_reg(), interp3, deltas[0]); + brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), deltas[1]); + + /* Calc w */ + brw_math_16( p, dst[3], + BRW_MATH_FUNCTION_INV, + BRW_MATH_SATURATE_NONE, + 2, brw_null_reg(), + BRW_MATH_PRECISION_FULL); + } +} + + + +static void emit_linterp( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0, + const struct brw_reg *deltas ) +{ + struct brw_reg interp[4]; + GLuint nr = arg0[0].nr; + GLuint i; + + interp[0] = brw_vec1_grf(nr, 0); + interp[1] = brw_vec1_grf(nr, 4); + interp[2] = brw_vec1_grf(nr+1, 0); + interp[3] = brw_vec1_grf(nr+1, 4); + + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + brw_LINE(p, brw_null_reg(), interp[i], deltas[0]); + brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]); + } + } +} + + +static void emit_pinterp( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0, + const struct brw_reg *deltas, + const struct brw_reg *w) +{ + struct brw_reg interp[4]; + GLuint nr = arg0[0].nr; + GLuint i; + + interp[0] = brw_vec1_grf(nr, 0); + interp[1] = brw_vec1_grf(nr, 4); + interp[2] = brw_vec1_grf(nr+1, 0); + interp[3] = brw_vec1_grf(nr+1, 4); + + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + brw_LINE(p, brw_null_reg(), interp[i], deltas[0]); + brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]); + } + } + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + brw_MUL(p, dst[i], dst[i], w[3]); + } + } +} + + +static void emit_cinterp( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0 ) +{ + struct brw_reg interp[4]; + GLuint nr = arg0[0].nr; + GLuint i; + + interp[0] = brw_vec1_grf(nr, 0); + interp[1] = brw_vec1_grf(nr, 4); + interp[2] = brw_vec1_grf(nr+1, 0); + interp[3] = brw_vec1_grf(nr+1, 4); + + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + brw_MOV(p, dst[i], suboffset(interp[i],3)); /* TODO: optimize away like other moves */ + } + } +} + +/* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */ +static void emit_frontfacing( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask ) +{ + struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); + GLuint i; + + if (!(mask & WRITEMASK_XYZW)) + return; + + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + brw_MOV(p, dst[i], brw_imm_f(0.0)); + } + } + + /* bit 31 is "primitive is back face", so checking < (1 << 31) gives + * us front face + */ + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31)); + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + brw_MOV(p, dst[i], brw_imm_f(1.0)); + } + } + brw_set_predicate_control_flag_value(p, 0xff); +} + +/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input + * looking like: + * + * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br + * + * and we're trying to produce: + * + * DDX DDY + * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl) + * (ss0.tr - ss0.tl) (ss0.tr - ss0.br) + * (ss0.br - ss0.bl) (ss0.tl - ss0.bl) + * (ss0.br - ss0.bl) (ss0.tr - ss0.br) + * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl) + * (ss1.tr - ss1.tl) (ss1.tr - ss1.br) + * (ss1.br - ss1.bl) (ss1.tl - ss1.bl) + * (ss1.br - ss1.bl) (ss1.tr - ss1.br) + * + * and add another set of two more subspans if in 16-pixel dispatch mode. + * + * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result + * for each pair, and vertstride = 2 jumps us 2 elements after processing a + * pair. But for DDY, it's harder, as we want to produce the pairs swizzled + * between each other. We could probably do it like ddx and swizzle the right + * order later, but bail for now and just produce + * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4) + */ +void emit_ddxy(struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + GLboolean is_ddx, + const struct brw_reg *arg0) +{ + int i; + struct brw_reg src0, src1; + + if (mask & SATURATE) + brw_set_saturate(p, 1); + for (i = 0; i < 4; i++ ) { + if (mask & (1<<i)) { + if (is_ddx) { + src0 = brw_reg(arg0[i].file, arg0[i].nr, 1, + BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_2, + BRW_WIDTH_2, + BRW_HORIZONTAL_STRIDE_0, + BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); + src1 = brw_reg(arg0[i].file, arg0[i].nr, 0, + BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_2, + BRW_WIDTH_2, + BRW_HORIZONTAL_STRIDE_0, + BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); + } else { + src0 = brw_reg(arg0[i].file, arg0[i].nr, 0, + BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_4, + BRW_WIDTH_4, + BRW_HORIZONTAL_STRIDE_0, + BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); + src1 = brw_reg(arg0[i].file, arg0[i].nr, 2, + BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_4, + BRW_WIDTH_4, + BRW_HORIZONTAL_STRIDE_0, + BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); + } + brw_ADD(p, dst[i], src0, negate(src1)); + } + } + if (mask & SATURATE) + brw_set_saturate(p, 0); +} + +static void emit_alu1( struct brw_compile *p, + struct brw_instruction *(*func)(struct brw_compile *, + struct brw_reg, + struct brw_reg), + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0 ) +{ + GLuint i; + + if (mask & SATURATE) + brw_set_saturate(p, 1); + + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + func(p, dst[i], arg0[i]); + } + } + + if (mask & SATURATE) + brw_set_saturate(p, 0); +} + + +static void emit_alu2( struct brw_compile *p, + struct brw_instruction *(*func)(struct brw_compile *, + struct brw_reg, + struct brw_reg, + struct brw_reg), + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0, + const struct brw_reg *arg1 ) +{ + GLuint i; + + if (mask & SATURATE) + brw_set_saturate(p, 1); + + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + func(p, dst[i], arg0[i], arg1[i]); + } + } + + if (mask & SATURATE) + brw_set_saturate(p, 0); +} + + +static void emit_mad( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0, + const struct brw_reg *arg1, + const struct brw_reg *arg2 ) +{ + GLuint i; + + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + brw_MUL(p, dst[i], arg0[i], arg1[i]); + + brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); + brw_ADD(p, dst[i], dst[i], arg2[i]); + brw_set_saturate(p, 0); + } + } +} + +static void emit_trunc( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0) +{ + GLuint i; + + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + brw_RNDZ(p, dst[i], arg0[i]); + } + } +} + +static void emit_lrp( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0, + const struct brw_reg *arg1, + const struct brw_reg *arg2 ) +{ + GLuint i; + + /* Uses dst as a temporary: + */ + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + /* Can I use the LINE instruction for this? + */ + brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0)); + brw_MUL(p, brw_null_reg(), dst[i], arg2[i]); + + brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); + brw_MAC(p, dst[i], arg0[i], arg1[i]); + brw_set_saturate(p, 0); + } + } +} + +static void emit_sop( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + GLuint cond, + const struct brw_reg *arg0, + const struct brw_reg *arg1 ) +{ + GLuint i; + + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + brw_MOV(p, dst[i], brw_imm_f(0)); + brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]); + brw_MOV(p, dst[i], brw_imm_f(1.0)); + brw_set_predicate_control_flag_value(p, 0xff); + } + } +} + +static void emit_slt( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0, + const struct brw_reg *arg1 ) +{ + emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1); +} + +static void emit_sle( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0, + const struct brw_reg *arg1 ) +{ + emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1); +} + +static void emit_sgt( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0, + const struct brw_reg *arg1 ) +{ + emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1); +} + +static void emit_sge( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0, + const struct brw_reg *arg1 ) +{ + emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1); +} + +static void emit_seq( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0, + const struct brw_reg *arg1 ) +{ + emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1); +} + +static void emit_sne( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0, + const struct brw_reg *arg1 ) +{ + emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1); +} + +static void emit_cmp( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0, + const struct brw_reg *arg1, + const struct brw_reg *arg2 ) +{ + GLuint i; + + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); + brw_MOV(p, dst[i], arg2[i]); + brw_set_saturate(p, 0); + + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0)); + + brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); + brw_MOV(p, dst[i], arg1[i]); + brw_set_saturate(p, 0); + brw_set_predicate_control_flag_value(p, 0xff); + } + } +} + +static void emit_max( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0, + const struct brw_reg *arg1 ) +{ + GLuint i; + + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); + brw_MOV(p, dst[i], arg0[i]); + brw_set_saturate(p, 0); + + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]); + + brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); + brw_MOV(p, dst[i], arg1[i]); + brw_set_saturate(p, 0); + brw_set_predicate_control_flag_value(p, 0xff); + } + } +} + +static void emit_min( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0, + const struct brw_reg *arg1 ) +{ + GLuint i; + + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); + brw_MOV(p, dst[i], arg1[i]); + brw_set_saturate(p, 0); + + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]); + + brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); + brw_MOV(p, dst[i], arg0[i]); + brw_set_saturate(p, 0); + brw_set_predicate_control_flag_value(p, 0xff); + } + } +} + + +static void emit_dp3( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0, + const struct brw_reg *arg1 ) +{ + int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1; + + if (!(mask & WRITEMASK_XYZW)) + return; /* Do not emit dead code */ + + assert(is_power_of_two(mask & WRITEMASK_XYZW)); + + brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]); + brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]); + + brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); + brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]); + brw_set_saturate(p, 0); +} + + +static void emit_dp4( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0, + const struct brw_reg *arg1 ) +{ + int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1; + + if (!(mask & WRITEMASK_XYZW)) + return; /* Do not emit dead code */ + + assert(is_power_of_two(mask & WRITEMASK_XYZW)); + + brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]); + brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]); + brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]); + + brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); + brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]); + brw_set_saturate(p, 0); +} + + +static void emit_dph( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0, + const struct brw_reg *arg1 ) +{ + const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1; + + if (!(mask & WRITEMASK_XYZW)) + return; /* Do not emit dead code */ + + assert(is_power_of_two(mask & WRITEMASK_XYZW)); + + brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]); + brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]); + brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]); + + brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); + brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]); + brw_set_saturate(p, 0); +} + + +static void emit_xpd( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0, + const struct brw_reg *arg1 ) +{ + GLuint i; + + assert(!(mask & WRITEMASK_W) == WRITEMASK_X); + + for (i = 0 ; i < 3; i++) { + if (mask & (1<<i)) { + GLuint i2 = (i+2)%3; + GLuint i1 = (i+1)%3; + + brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]); + + brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); + brw_MAC(p, dst[i], arg0[i1], arg1[i2]); + brw_set_saturate(p, 0); + } + } +} + + +static void emit_math1( struct brw_compile *p, + GLuint function, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0 ) +{ + int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1; + + if (!(mask & WRITEMASK_XYZW)) + return; /* Do not emit dead code */ + + assert(is_power_of_two(mask & WRITEMASK_XYZW)); + + brw_MOV(p, brw_message_reg(2), arg0[0]); + + /* Send two messages to perform all 16 operations: + */ + brw_math_16(p, + dst[dst_chan], + function, + (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE, + 2, + brw_null_reg(), + BRW_MATH_PRECISION_FULL); +} + + +static void emit_math2( struct brw_compile *p, + GLuint function, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0, + const struct brw_reg *arg1) +{ + int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1; + + if (!(mask & WRITEMASK_XYZW)) + return; /* Do not emit dead code */ + + assert(is_power_of_two(mask & WRITEMASK_XYZW)); + + brw_push_insn_state(p); + + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_MOV(p, brw_message_reg(2), arg0[0]); + brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); + brw_MOV(p, brw_message_reg(4), sechalf(arg0[0])); + + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_MOV(p, brw_message_reg(3), arg1[0]); + brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); + brw_MOV(p, brw_message_reg(5), sechalf(arg1[0])); + + + /* Send two messages to perform all 16 operations: + */ + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_math(p, + dst[dst_chan], + function, + (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE, + 2, + brw_null_reg(), + BRW_MATH_DATA_VECTOR, + BRW_MATH_PRECISION_FULL); + + brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); + brw_math(p, + offset(dst[dst_chan],1), + function, + (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE, + 4, + brw_null_reg(), + BRW_MATH_DATA_VECTOR, + BRW_MATH_PRECISION_FULL); + + brw_pop_insn_state(p); +} + + + +static void emit_tex( struct brw_wm_compile *c, + const struct brw_wm_instruction *inst, + struct brw_reg *dst, + GLuint dst_flags, + struct brw_reg *arg ) +{ + struct brw_compile *p = &c->func; + GLuint msgLength, responseLength; + GLuint i, nr; + GLuint emit; + GLuint msg_type; + + /* How many input regs are there? + */ + switch (inst->tex_idx) { + case TEXTURE_1D_INDEX: + emit = WRITEMASK_X; + nr = 1; + break; + case TEXTURE_2D_INDEX: + case TEXTURE_RECT_INDEX: + emit = WRITEMASK_XY; + nr = 2; + break; + case TEXTURE_3D_INDEX: + case TEXTURE_CUBE_INDEX: + emit = WRITEMASK_XYZ; + nr = 3; + break; + default: + /* unexpected target */ + abort(); + } + + if (inst->tex_shadow) { + nr = 4; + emit |= WRITEMASK_W; + } + + msgLength = 1; + + for (i = 0; i < nr; i++) { + static const GLuint swz[4] = {0,1,2,2}; + if (emit & (1<<i)) + brw_MOV(p, brw_message_reg(msgLength+1), arg[swz[i]]); + else + brw_MOV(p, brw_message_reg(msgLength+1), brw_imm_f(0)); + msgLength += 2; + } + + responseLength = 8; /* always */ + + if (BRW_IS_IGDNG(p->brw)) { + if (inst->tex_shadow) + msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE_IGDNG; + else + msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_IGDNG; + } else { + if (inst->tex_shadow) + msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE; + else + msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE; + } + + brw_SAMPLE(p, + retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW), + 1, + retype(c->payload.depth[0].hw_reg, BRW_REGISTER_TYPE_UW), + SURF_INDEX_TEXTURE(inst->tex_unit), + inst->tex_unit, /* sampler */ + inst->writemask, + msg_type, + responseLength, + msgLength, + 0, + 1, + BRW_SAMPLER_SIMD_MODE_SIMD16); +} + + +static void emit_txb( struct brw_wm_compile *c, + const struct brw_wm_instruction *inst, + struct brw_reg *dst, + GLuint dst_flags, + struct brw_reg *arg ) +{ + struct brw_compile *p = &c->func; + GLuint msgLength; + GLuint msg_type; + /* Shadow ignored for txb. + */ + switch (inst->tex_idx) { + case TEXTURE_1D_INDEX: + brw_MOV(p, brw_message_reg(2), arg[0]); + brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); + brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); + break; + case TEXTURE_2D_INDEX: + case TEXTURE_RECT_INDEX: + brw_MOV(p, brw_message_reg(2), arg[0]); + brw_MOV(p, brw_message_reg(4), arg[1]); + brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); + break; + case TEXTURE_3D_INDEX: + case TEXTURE_CUBE_INDEX: + brw_MOV(p, brw_message_reg(2), arg[0]); + brw_MOV(p, brw_message_reg(4), arg[1]); + brw_MOV(p, brw_message_reg(6), arg[2]); + break; + default: + /* unexpected target */ + abort(); + } + + brw_MOV(p, brw_message_reg(8), arg[3]); + msgLength = 9; + + if (BRW_IS_IGDNG(p->brw)) + msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS_IGDNG; + else + msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS; + + brw_SAMPLE(p, + retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW), + 1, + retype(c->payload.depth[0].hw_reg, BRW_REGISTER_TYPE_UW), + SURF_INDEX_TEXTURE(inst->tex_unit), + inst->tex_unit, /* sampler */ + inst->writemask, + msg_type, + 8, /* responseLength */ + msgLength, + 0, + 1, + BRW_SAMPLER_SIMD_MODE_SIMD16); +} + + +static void emit_lit( struct brw_compile *p, + const struct brw_reg *dst, + GLuint mask, + const struct brw_reg *arg0 ) +{ + assert((mask & WRITEMASK_XW) == 0); + + if (mask & WRITEMASK_Y) { + brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); + brw_MOV(p, dst[1], arg0[0]); + brw_set_saturate(p, 0); + } + + if (mask & WRITEMASK_Z) { + emit_math2(p, BRW_MATH_FUNCTION_POW, + &dst[2], + WRITEMASK_X | (mask & SATURATE), + &arg0[1], + &arg0[3]); + } + + /* Ordinarily you'd use an iff statement to skip or shortcircuit + * some of the POW calculations above, but 16-wide iff statements + * seem to lock c1 hardware, so this is a nasty workaround: + */ + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0)); + { + if (mask & WRITEMASK_Y) + brw_MOV(p, dst[1], brw_imm_f(0)); + + if (mask & WRITEMASK_Z) + brw_MOV(p, dst[2], brw_imm_f(0)); + } + brw_set_predicate_control(p, BRW_PREDICATE_NONE); +} + + +/* Kill pixel - set execution mask to zero for those pixels which + * fail. + */ +static void emit_kil( struct brw_wm_compile *c, + struct brw_reg *arg0) +{ + struct brw_compile *p = &c->func; + struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); + GLuint i; + + /* XXX - usually won't need 4 compares! + */ + for (i = 0; i < 4; i++) { + brw_push_insn_state(p); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0)); + brw_set_predicate_control_flag_value(p, 0xff); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_AND(p, r0uw, brw_flag_reg(), r0uw); + brw_pop_insn_state(p); + } +} + +/* KIL_NV kills the pixels that are currently executing, not based on a test + * of the arguments. + */ +static void emit_kil_nv( struct brw_wm_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); + + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK + brw_AND(p, r0uw, c->emit_mask_reg, r0uw); + brw_pop_insn_state(p); +} + +static void fire_fb_write( struct brw_wm_compile *c, + GLuint base_reg, + GLuint nr, + GLuint target, + GLuint eot ) +{ + struct brw_compile *p = &c->func; + + /* Pass through control information: + */ +/* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */ + { + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */ + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_MOV(p, + brw_message_reg(base_reg + 1), + brw_vec8_grf(1, 0)); + brw_pop_insn_state(p); + } + + /* Send framebuffer write message: */ +/* send (16) null.0<1>:uw m0 r0.0<8;8,1>:uw 0x85a04000:ud { Align1 EOT } */ + brw_fb_WRITE(p, + retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW), + base_reg, + retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW), + target, + nr, + 0, + eot); +} + + +static void emit_aa( struct brw_wm_compile *c, + struct brw_reg *arg1, + GLuint reg ) +{ + struct brw_compile *p = &c->func; + GLuint comp = c->key.aa_dest_stencil_reg / 2; + GLuint off = c->key.aa_dest_stencil_reg % 2; + struct brw_reg aa = offset(arg1[comp], off); + + brw_push_insn_state(p); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */ + brw_MOV(p, brw_message_reg(reg), aa); + brw_pop_insn_state(p); +} + + +/* Post-fragment-program processing. Send the results to the + * framebuffer. + * \param arg0 the fragment color + * \param arg1 the pass-through depth value + * \param arg2 the shader-computed depth value + */ +static void emit_fb_write( struct brw_wm_compile *c, + struct brw_reg *arg0, + struct brw_reg *arg1, + struct brw_reg *arg2, + GLuint target, + GLuint eot) +{ + struct brw_compile *p = &c->func; + GLuint nr = 2; + GLuint channel; + + /* Reserve a space for AA - may not be needed: + */ + if (c->key.aa_dest_stencil_reg) + nr += 1; + + /* I don't really understand how this achieves the color interleave + * (ie RGBARGBA) in the result: [Do the saturation here] + */ + { + brw_push_insn_state(p); + + for (channel = 0; channel < 4; channel++) { + /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */ + /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */ + + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_MOV(p, + brw_message_reg(nr + channel), + arg0[channel]); + + brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); + brw_MOV(p, + brw_message_reg(nr + channel + 4), + sechalf(arg0[channel])); + } + + /* skip over the regs populated above: + */ + nr += 8; + + brw_pop_insn_state(p); + } + + if (c->key.source_depth_to_render_target) + { + if (c->key.computes_depth) + brw_MOV(p, brw_message_reg(nr), arg2[2]); + else + brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */ + + nr += 2; + } + + if (c->key.dest_depth_reg) + { + GLuint comp = c->key.dest_depth_reg / 2; + GLuint off = c->key.dest_depth_reg % 2; + + if (off != 0) { + brw_push_insn_state(p); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + + brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1)); + /* 2nd half? */ + brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]); + brw_pop_insn_state(p); + } + else { + brw_MOV(p, brw_message_reg(nr), arg1[comp]); + } + nr += 2; + } + + if (!c->key.runtime_check_aads_emit) { + if (c->key.aa_dest_stencil_reg) + emit_aa(c, arg1, 2); + + fire_fb_write(c, 0, nr, target, eot); + } + else { + struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD)); + struct brw_reg ip = brw_ip_reg(); + struct brw_instruction *jmp; + + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_conditionalmod(p, BRW_CONDITIONAL_Z); + brw_AND(p, + v1_null_ud, + get_element_ud(brw_vec8_grf(1,0), 6), + brw_imm_ud(1<<26)); + + jmp = brw_JMPI(p, ip, ip, brw_imm_d(0)); + { + emit_aa(c, arg1, 2); + fire_fb_write(c, 0, nr, target, eot); + /* note - thread killed in subroutine */ + } + brw_land_fwd_jump(p, jmp); + + /* ELSE: Shuffle up one register to fill in the hole left for AA: + */ + fire_fb_write(c, 1, nr-1, target, eot); + } +} + + +/** + * Move a GPR to scratch memory. + */ +static void emit_spill( struct brw_wm_compile *c, + struct brw_reg reg, + GLuint slot ) +{ + struct brw_compile *p = &c->func; + + /* + mov (16) m2.0<1>:ud r2.0<8;8,1>:ud { Align1 Compr } + */ + brw_MOV(p, brw_message_reg(2), reg); + + /* + mov (1) r0.2<1>:d 0x00000080:d { Align1 NoMask } + send (16) null.0<1>:uw m1 r0.0<8;8,1>:uw 0x053003ff:ud { Align1 } + */ + brw_dp_WRITE_16(p, + retype(vec16(brw_vec8_grf(0, 0)), BRW_REGISTER_TYPE_UW), + slot); +} + + +/** + * Load a GPR from scratch memory. + */ +static void emit_unspill( struct brw_wm_compile *c, + struct brw_reg reg, + GLuint slot ) +{ + struct brw_compile *p = &c->func; + + /* Slot 0 is the undef value. + */ + if (slot == 0) { + brw_MOV(p, reg, brw_imm_f(0)); + return; + } + + /* + mov (1) r0.2<1>:d 0x000000c0:d { Align1 NoMask } + send (16) r110.0<1>:uw m1 r0.0<8;8,1>:uw 0x041243ff:ud { Align1 } + */ + + brw_dp_READ_16(p, + retype(vec16(reg), BRW_REGISTER_TYPE_UW), + slot); +} + + +/** + * Retrieve up to 4 GEN4 register pairs for the given wm reg: + * Args with unspill_reg != 0 will be loaded from scratch memory. + */ +static void get_argument_regs( struct brw_wm_compile *c, + struct brw_wm_ref *arg[], + struct brw_reg *regs ) +{ + GLuint i; + + for (i = 0; i < 4; i++) { + if (arg[i]) { + if (arg[i]->unspill_reg) + emit_unspill(c, + brw_vec8_grf(arg[i]->unspill_reg, 0), + arg[i]->value->spill_slot); + + regs[i] = arg[i]->hw_reg; + } + else { + regs[i] = brw_null_reg(); + } + } +} + + +/** + * For values that have a spill_slot!=0, write those regs to scratch memory. + */ +static void spill_values( struct brw_wm_compile *c, + struct brw_wm_value *values, + GLuint nr ) +{ + GLuint i; + + for (i = 0; i < nr; i++) + if (values[i].spill_slot) + emit_spill(c, values[i].hw_reg, values[i].spill_slot); +} + + +/* Emit the fragment program instructions here. + */ +void brw_wm_emit( struct brw_wm_compile *c ) +{ + struct brw_compile *p = &c->func; + GLuint insn; + + brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); + + /* Check if any of the payload regs need to be spilled: + */ + spill_values(c, c->payload.depth, 4); + spill_values(c, c->creg, c->nr_creg); + spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX); + + + for (insn = 0; insn < c->nr_insns; insn++) { + + struct brw_wm_instruction *inst = &c->instruction[insn]; + struct brw_reg args[3][4], dst[4]; + GLuint i, dst_flags; + + /* Get argument regs: + */ + for (i = 0; i < 3; i++) + get_argument_regs(c, inst->src[i], args[i]); + + /* Get dest regs: + */ + for (i = 0; i < 4; i++) + if (inst->dst[i]) + dst[i] = inst->dst[i]->hw_reg; + else + dst[i] = brw_null_reg(); + + /* Flags + */ + dst_flags = inst->writemask; + if (inst->saturate) + dst_flags |= SATURATE; + + switch (inst->opcode) { + /* Generated instructions for calculating triangle interpolants: + */ + case WM_PIXELXY: + emit_pixel_xy(p, dst, dst_flags); + break; + + case WM_DELTAXY: + emit_delta_xy(p, dst, dst_flags, args[0]); + break; + + case WM_WPOSXY: + emit_wpos_xy(c, dst, dst_flags, args[0]); + break; + + case WM_PIXELW: + emit_pixel_w(p, dst, dst_flags, args[0], args[1]); + break; + + case WM_LINTERP: + emit_linterp(p, dst, dst_flags, args[0], args[1]); + break; + + case WM_PINTERP: + emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]); + break; + + case WM_CINTERP: + emit_cinterp(p, dst, dst_flags, args[0]); + break; + + case WM_FB_WRITE: + emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot); + break; + + case WM_FRONTFACING: + emit_frontfacing(p, dst, dst_flags); + break; + + /* Straightforward arithmetic: + */ + case OPCODE_ADD: + emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]); + break; + + case OPCODE_FRC: + emit_alu1(p, brw_FRC, dst, dst_flags, args[0]); + break; + + case OPCODE_FLR: + emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]); + break; + + case OPCODE_DDX: + emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]); + break; + + case OPCODE_DDY: + emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]); + break; + + case OPCODE_DP3: + emit_dp3(p, dst, dst_flags, args[0], args[1]); + break; + + case OPCODE_DP4: + emit_dp4(p, dst, dst_flags, args[0], args[1]); + break; + + case OPCODE_DPH: + emit_dph(p, dst, dst_flags, args[0], args[1]); + break; + + case OPCODE_TRUNC: + emit_trunc(p, dst, dst_flags, args[0]); + break; + + case OPCODE_LRP: + emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]); + break; + + case OPCODE_MAD: + emit_mad(p, dst, dst_flags, args[0], args[1], args[2]); + break; + + case OPCODE_MOV: + case OPCODE_SWZ: + emit_alu1(p, brw_MOV, dst, dst_flags, args[0]); + break; + + case OPCODE_MUL: + emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]); + break; + + case OPCODE_XPD: + emit_xpd(p, dst, dst_flags, args[0], args[1]); + break; + + /* Higher math functions: + */ + case OPCODE_RCP: + emit_math1(p, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]); + break; + + case OPCODE_RSQ: + emit_math1(p, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]); + break; + + case OPCODE_SIN: + emit_math1(p, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]); + break; + + case OPCODE_COS: + emit_math1(p, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]); + break; + + case OPCODE_EX2: + emit_math1(p, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]); + break; + + case OPCODE_LG2: + emit_math1(p, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]); + break; + + case OPCODE_SCS: + /* There is an scs math function, but it would need some + * fixup for 16-element execution. + */ + if (dst_flags & WRITEMASK_X) + emit_math1(p, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]); + if (dst_flags & WRITEMASK_Y) + emit_math1(p, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]); + break; + + case OPCODE_POW: + emit_math2(p, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]); + break; + + /* Comparisons: + */ + case OPCODE_CMP: + emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]); + break; + + case OPCODE_MAX: + emit_max(p, dst, dst_flags, args[0], args[1]); + break; + + case OPCODE_MIN: + emit_min(p, dst, dst_flags, args[0], args[1]); + break; + + case OPCODE_SLT: + emit_slt(p, dst, dst_flags, args[0], args[1]); + break; + + case OPCODE_SLE: + emit_sle(p, dst, dst_flags, args[0], args[1]); + break; + case OPCODE_SGT: + emit_sgt(p, dst, dst_flags, args[0], args[1]); + break; + case OPCODE_SGE: + emit_sge(p, dst, dst_flags, args[0], args[1]); + break; + case OPCODE_SEQ: + emit_seq(p, dst, dst_flags, args[0], args[1]); + break; + case OPCODE_SNE: + emit_sne(p, dst, dst_flags, args[0], args[1]); + break; + + case OPCODE_LIT: + emit_lit(p, dst, dst_flags, args[0]); + break; + + /* Texturing operations: + */ + case OPCODE_TEX: + emit_tex(c, inst, dst, dst_flags, args[0]); + break; + + case OPCODE_TXB: + emit_txb(c, inst, dst, dst_flags, args[0]); + break; + + case OPCODE_KIL: + emit_kil(c, args[0]); + break; + + case OPCODE_KIL_NV: + emit_kil_nv(c); + break; + + default: + _mesa_printf("Unsupported opcode %i (%s) in fragment shader\n", + inst->opcode, inst->opcode < MAX_OPCODE ? + _mesa_opcode_string(inst->opcode) : + "unknown"); + } + + for (i = 0; i < 4; i++) + if (inst->dst[i] && inst->dst[i]->spill_slot) + emit_spill(c, + inst->dst[i]->hw_reg, + inst->dst[i]->spill_slot); + } + + if (INTEL_DEBUG & DEBUG_WM) { + int i; + + _mesa_printf("wm-native:\n"); + for (i = 0; i < p->nr_insn; i++) + brw_disasm(stderr, &p->store[i]); + _mesa_printf("\n"); + } +} diff --git a/src/gallium/drivers/i965/brw_wm_fp.c b/src/gallium/drivers/i965/brw_wm_fp.c new file mode 100644 index 00000000000..4e3edfbbffa --- /dev/null +++ b/src/gallium/drivers/i965/brw_wm_fp.c @@ -0,0 +1,1177 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" +#include "brw_context.h" +#include "brw_wm.h" +#include "brw_util.h" + +#include "shader/prog_parameter.h" +#include "shader/prog_print.h" +#include "shader/prog_statevars.h" + + +/** An invalid texture target */ +#define TEX_TARGET_NONE NUM_TEXTURE_TARGETS + +/** An invalid texture unit */ +#define TEX_UNIT_NONE BRW_MAX_TEX_UNIT + +#define FIRST_INTERNAL_TEMP MAX_NV_FRAGMENT_PROGRAM_TEMPS + +#define X 0 +#define Y 1 +#define Z 2 +#define W 3 + + +static const char *wm_opcode_strings[] = { + "PIXELXY", + "DELTAXY", + "PIXELW", + "LINTERP", + "PINTERP", + "CINTERP", + "WPOSXY", + "FB_WRITE", + "FRONTFACING", +}; + +#if 0 +static const char *wm_file_strings[] = { + "PAYLOAD" +}; +#endif + + +/*********************************************************************** + * Source regs + */ + +static struct prog_src_register src_reg(GLuint file, GLuint idx) +{ + struct prog_src_register reg; + reg.File = file; + reg.Index = idx; + reg.Swizzle = SWIZZLE_NOOP; + reg.RelAddr = 0; + reg.Negate = NEGATE_NONE; + reg.Abs = 0; + return reg; +} + +static struct prog_src_register src_reg_from_dst(struct prog_dst_register dst) +{ + return src_reg(dst.File, dst.Index); +} + +static struct prog_src_register src_undef( void ) +{ + return src_reg(PROGRAM_UNDEFINED, 0); +} + +static GLboolean src_is_undef(struct prog_src_register src) +{ + return src.File == PROGRAM_UNDEFINED; +} + +static struct prog_src_register src_swizzle( struct prog_src_register reg, int x, int y, int z, int w ) +{ + reg.Swizzle = MAKE_SWIZZLE4(x,y,z,w); + return reg; +} + +static struct prog_src_register src_swizzle1( struct prog_src_register reg, int x ) +{ + return src_swizzle(reg, x, x, x, x); +} + +static struct prog_src_register src_swizzle4( struct prog_src_register reg, uint swizzle ) +{ + reg.Swizzle = swizzle; + return reg; +} + + +/*********************************************************************** + * Dest regs + */ + +static struct prog_dst_register dst_reg(GLuint file, GLuint idx) +{ + struct prog_dst_register reg; + reg.File = file; + reg.Index = idx; + reg.WriteMask = WRITEMASK_XYZW; + reg.RelAddr = 0; + reg.CondMask = COND_TR; + reg.CondSwizzle = 0; + reg.CondSrc = 0; + reg.pad = 0; + return reg; +} + +static struct prog_dst_register dst_mask( struct prog_dst_register reg, int mask ) +{ + reg.WriteMask &= mask; + return reg; +} + +static struct prog_dst_register dst_undef( void ) +{ + return dst_reg(PROGRAM_UNDEFINED, 0); +} + + + +static struct prog_dst_register get_temp( struct brw_wm_compile *c ) +{ + int bit = _mesa_ffs( ~c->fp_temp ); + + if (!bit) { + _mesa_printf("%s: out of temporaries\n", __FILE__); + exit(1); + } + + c->fp_temp |= 1<<(bit-1); + return dst_reg(PROGRAM_TEMPORARY, FIRST_INTERNAL_TEMP+(bit-1)); +} + + +static void release_temp( struct brw_wm_compile *c, struct prog_dst_register temp ) +{ + c->fp_temp &= ~(1 << (temp.Index - FIRST_INTERNAL_TEMP)); +} + + +/*********************************************************************** + * Instructions + */ + +static struct prog_instruction *get_fp_inst(struct brw_wm_compile *c) +{ + return &c->prog_instructions[c->nr_fp_insns++]; +} + +static struct prog_instruction *emit_insn(struct brw_wm_compile *c, + const struct prog_instruction *inst0) +{ + struct prog_instruction *inst = get_fp_inst(c); + *inst = *inst0; + return inst; +} + +static struct prog_instruction * emit_tex_op(struct brw_wm_compile *c, + GLuint op, + struct prog_dst_register dest, + GLuint saturate, + GLuint tex_src_unit, + GLuint tex_src_target, + GLuint tex_shadow, + struct prog_src_register src0, + struct prog_src_register src1, + struct prog_src_register src2 ) +{ + struct prog_instruction *inst = get_fp_inst(c); + + assert(tex_src_unit < BRW_MAX_TEX_UNIT || + tex_src_unit == TEX_UNIT_NONE); + assert(tex_src_target < NUM_TEXTURE_TARGETS || + tex_src_target == TEX_TARGET_NONE); + + /* update mask of which texture units are referenced by this program */ + if (tex_src_unit != TEX_UNIT_NONE) + c->fp->tex_units_used |= (1 << tex_src_unit); + + memset(inst, 0, sizeof(*inst)); + + inst->Opcode = op; + inst->DstReg = dest; + inst->SaturateMode = saturate; + inst->TexSrcUnit = tex_src_unit; + inst->TexSrcTarget = tex_src_target; + inst->TexShadow = tex_shadow; + inst->SrcReg[0] = src0; + inst->SrcReg[1] = src1; + inst->SrcReg[2] = src2; + return inst; +} + + +static struct prog_instruction * emit_op(struct brw_wm_compile *c, + GLuint op, + struct prog_dst_register dest, + GLuint saturate, + struct prog_src_register src0, + struct prog_src_register src1, + struct prog_src_register src2 ) +{ + return emit_tex_op(c, op, dest, saturate, + TEX_UNIT_NONE, TEX_TARGET_NONE, 0, /* unit, tgt, shadow */ + src0, src1, src2); +} + + +/* Many Mesa opcodes produce the same value across all the result channels. + * We'd rather not have to support that splatting in the opcode implementations, + * and brw_wm_pass*.c wants to optimize them out by shuffling references around + * anyway. We can easily get both by emitting the opcode to one channel, and + * then MOVing it to the others, which brw_wm_pass*.c already understands. + */ +static struct prog_instruction *emit_scalar_insn(struct brw_wm_compile *c, + const struct prog_instruction *inst0) +{ + struct prog_instruction *inst; + unsigned int dst_chan; + unsigned int other_channel_mask; + + if (inst0->DstReg.WriteMask == 0) + return NULL; + + dst_chan = _mesa_ffs(inst0->DstReg.WriteMask) - 1; + inst = get_fp_inst(c); + *inst = *inst0; + inst->DstReg.WriteMask = 1 << dst_chan; + + other_channel_mask = inst0->DstReg.WriteMask & ~(1 << dst_chan); + if (other_channel_mask != 0) { + inst = emit_op(c, + OPCODE_MOV, + dst_mask(inst0->DstReg, other_channel_mask), + 0, + src_swizzle1(src_reg_from_dst(inst0->DstReg), dst_chan), + src_undef(), + src_undef()); + } + return inst; +} + + +/*********************************************************************** + * Special instructions for interpolation and other tasks + */ + +static struct prog_src_register get_pixel_xy( struct brw_wm_compile *c ) +{ + if (src_is_undef(c->pixel_xy)) { + struct prog_dst_register pixel_xy = get_temp(c); + struct prog_src_register payload_r0_depth = src_reg(PROGRAM_PAYLOAD, PAYLOAD_DEPTH); + + + /* Emit the out calculations, and hold onto the results. Use + * two instructions as a temporary is required. + */ + /* pixel_xy.xy = PIXELXY payload[0]; + */ + emit_op(c, + WM_PIXELXY, + dst_mask(pixel_xy, WRITEMASK_XY), + 0, + payload_r0_depth, + src_undef(), + src_undef()); + + c->pixel_xy = src_reg_from_dst(pixel_xy); + } + + return c->pixel_xy; +} + +static struct prog_src_register get_delta_xy( struct brw_wm_compile *c ) +{ + if (src_is_undef(c->delta_xy)) { + struct prog_dst_register delta_xy = get_temp(c); + struct prog_src_register pixel_xy = get_pixel_xy(c); + struct prog_src_register payload_r0_depth = src_reg(PROGRAM_PAYLOAD, PAYLOAD_DEPTH); + + /* deltas.xy = DELTAXY pixel_xy, payload[0] + */ + emit_op(c, + WM_DELTAXY, + dst_mask(delta_xy, WRITEMASK_XY), + 0, + pixel_xy, + payload_r0_depth, + src_undef()); + + c->delta_xy = src_reg_from_dst(delta_xy); + } + + return c->delta_xy; +} + +static struct prog_src_register get_pixel_w( struct brw_wm_compile *c ) +{ + if (src_is_undef(c->pixel_w)) { + struct prog_dst_register pixel_w = get_temp(c); + struct prog_src_register deltas = get_delta_xy(c); + struct prog_src_register interp_wpos = src_reg(PROGRAM_PAYLOAD, FRAG_ATTRIB_WPOS); + + /* deltas.xyw = DELTAS2 deltas.xy, payload.interp_wpos.x + */ + emit_op(c, + WM_PIXELW, + dst_mask(pixel_w, WRITEMASK_W), + 0, + interp_wpos, + deltas, + src_undef()); + + + c->pixel_w = src_reg_from_dst(pixel_w); + } + + return c->pixel_w; +} + +static void emit_interp( struct brw_wm_compile *c, + GLuint idx ) +{ + struct prog_dst_register dst = dst_reg(PROGRAM_INPUT, idx); + struct prog_src_register interp = src_reg(PROGRAM_PAYLOAD, idx); + struct prog_src_register deltas = get_delta_xy(c); + + /* Need to use PINTERP on attributes which have been + * multiplied by 1/W in the SF program, and LINTERP on those + * which have not: + */ + switch (idx) { + case FRAG_ATTRIB_WPOS: + /* Have to treat wpos.xy specially: + */ + emit_op(c, + WM_WPOSXY, + dst_mask(dst, WRITEMASK_XY), + 0, + get_pixel_xy(c), + src_undef(), + src_undef()); + + dst = dst_mask(dst, WRITEMASK_ZW); + + /* PROGRAM_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw + */ + emit_op(c, + WM_LINTERP, + dst, + 0, + interp, + deltas, + src_undef()); + break; + case FRAG_ATTRIB_COL0: + case FRAG_ATTRIB_COL1: + if (c->key.flat_shade) { + emit_op(c, + WM_CINTERP, + dst, + 0, + interp, + src_undef(), + src_undef()); + } + else { + if (c->key.linear_color) { + emit_op(c, + WM_LINTERP, + dst, + 0, + interp, + deltas, + src_undef()); + } + else { + /* perspective-corrected color interpolation */ + emit_op(c, + WM_PINTERP, + dst, + 0, + interp, + deltas, + get_pixel_w(c)); + } + } + break; + case FRAG_ATTRIB_FOGC: + /* Interpolate the fog coordinate */ + emit_op(c, + WM_PINTERP, + dst_mask(dst, WRITEMASK_X), + 0, + interp, + deltas, + get_pixel_w(c)); + + emit_op(c, + OPCODE_MOV, + dst_mask(dst, WRITEMASK_YZW), + 0, + src_swizzle(interp, + SWIZZLE_ZERO, + SWIZZLE_ZERO, + SWIZZLE_ZERO, + SWIZZLE_ONE), + src_undef(), + src_undef()); + break; + + case FRAG_ATTRIB_FACE: + /* XXX review/test this case */ + emit_op(c, + WM_FRONTFACING, + dst_mask(dst, WRITEMASK_X), + 0, + src_undef(), + src_undef(), + src_undef()); + break; + + case FRAG_ATTRIB_PNTC: + /* XXX review/test this case */ + emit_op(c, + WM_PINTERP, + dst_mask(dst, WRITEMASK_XY), + 0, + interp, + deltas, + get_pixel_w(c)); + + emit_op(c, + OPCODE_MOV, + dst_mask(dst, WRITEMASK_ZW), + 0, + src_swizzle(interp, + SWIZZLE_ZERO, + SWIZZLE_ZERO, + SWIZZLE_ZERO, + SWIZZLE_ONE), + src_undef(), + src_undef()); + break; + + default: + emit_op(c, + WM_PINTERP, + dst, + 0, + interp, + deltas, + get_pixel_w(c)); + break; + } + + c->fp_interp_emitted |= 1<<idx; +} + +/*********************************************************************** + * Hacks to extend the program parameter and constant lists. + */ + +/* Add the fog parameters to the parameter list of the original + * program, rather than creating a new list. Doesn't really do any + * harm and it's not as if the parameter handling isn't a big hack + * anyway. + */ +static struct prog_src_register search_or_add_param5(struct brw_wm_compile *c, + GLint s0, + GLint s1, + GLint s2, + GLint s3, + GLint s4) +{ + struct gl_program_parameter_list *paramList = c->fp->program.Base.Parameters; + gl_state_index tokens[STATE_LENGTH]; + GLuint idx; + tokens[0] = s0; + tokens[1] = s1; + tokens[2] = s2; + tokens[3] = s3; + tokens[4] = s4; + + for (idx = 0; idx < paramList->NumParameters; idx++) { + if (paramList->Parameters[idx].Type == PROGRAM_STATE_VAR && + memcmp(paramList->Parameters[idx].StateIndexes, tokens, sizeof(tokens)) == 0) + return src_reg(PROGRAM_STATE_VAR, idx); + } + + idx = _mesa_add_state_reference( paramList, tokens ); + + return src_reg(PROGRAM_STATE_VAR, idx); +} + + +static struct prog_src_register search_or_add_const4f( struct brw_wm_compile *c, + GLfloat s0, + GLfloat s1, + GLfloat s2, + GLfloat s3) +{ + struct gl_program_parameter_list *paramList = c->fp->program.Base.Parameters; + GLfloat values[4]; + GLuint idx; + GLuint swizzle; + + values[0] = s0; + values[1] = s1; + values[2] = s2; + values[3] = s3; + + /* Have to search, otherwise multiple compilations will each grow + * the parameter list. + */ + for (idx = 0; idx < paramList->NumParameters; idx++) { + if (paramList->Parameters[idx].Type == PROGRAM_CONSTANT && + memcmp(paramList->ParameterValues[idx], values, sizeof(values)) == 0) + + /* XXX: this mimics the mesa bug which puts all constants and + * parameters into the "PROGRAM_STATE_VAR" category: + */ + return src_reg(PROGRAM_STATE_VAR, idx); + } + + idx = _mesa_add_unnamed_constant( paramList, values, 4, &swizzle ); + assert(swizzle == SWIZZLE_NOOP); /* Need to handle swizzle in reg setup */ + return src_reg(PROGRAM_STATE_VAR, idx); +} + + + +/*********************************************************************** + * Expand various instructions here to simpler forms. + */ +static void precalc_dst( struct brw_wm_compile *c, + const struct prog_instruction *inst ) +{ + struct prog_src_register src0 = inst->SrcReg[0]; + struct prog_src_register src1 = inst->SrcReg[1]; + struct prog_dst_register dst = inst->DstReg; + + if (dst.WriteMask & WRITEMASK_Y) { + /* dst.y = mul src0.y, src1.y + */ + emit_op(c, + OPCODE_MUL, + dst_mask(dst, WRITEMASK_Y), + inst->SaturateMode, + src0, + src1, + src_undef()); + } + + if (dst.WriteMask & WRITEMASK_XZ) { + struct prog_instruction *swz; + GLuint z = GET_SWZ(src0.Swizzle, Z); + + /* dst.xz = swz src0.1zzz + */ + swz = emit_op(c, + OPCODE_SWZ, + dst_mask(dst, WRITEMASK_XZ), + inst->SaturateMode, + src_swizzle(src0, SWIZZLE_ONE, z, z, z), + src_undef(), + src_undef()); + /* Avoid letting negation flag of src0 affect our 1 constant. */ + swz->SrcReg[0].Negate &= ~NEGATE_X; + } + if (dst.WriteMask & WRITEMASK_W) { + /* dst.w = mov src1.w + */ + emit_op(c, + OPCODE_MOV, + dst_mask(dst, WRITEMASK_W), + inst->SaturateMode, + src1, + src_undef(), + src_undef()); + } +} + + +static void precalc_lit( struct brw_wm_compile *c, + const struct prog_instruction *inst ) +{ + struct prog_src_register src0 = inst->SrcReg[0]; + struct prog_dst_register dst = inst->DstReg; + + if (dst.WriteMask & WRITEMASK_XW) { + struct prog_instruction *swz; + + /* dst.xw = swz src0.1111 + */ + swz = emit_op(c, + OPCODE_SWZ, + dst_mask(dst, WRITEMASK_XW), + 0, + src_swizzle1(src0, SWIZZLE_ONE), + src_undef(), + src_undef()); + /* Avoid letting the negation flag of src0 affect our 1 constant. */ + swz->SrcReg[0].Negate = NEGATE_NONE; + } + + if (dst.WriteMask & WRITEMASK_YZ) { + emit_op(c, + OPCODE_LIT, + dst_mask(dst, WRITEMASK_YZ), + inst->SaturateMode, + src0, + src_undef(), + src_undef()); + } +} + + +/** + * Some TEX instructions require extra code, cube map coordinate + * normalization, or coordinate scaling for RECT textures, etc. + * This function emits those extra instructions and the TEX + * instruction itself. + */ +static void precalc_tex( struct brw_wm_compile *c, + const struct prog_instruction *inst ) +{ + struct prog_src_register coord; + struct prog_dst_register tmpcoord; + const GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit]; + + assert(unit < BRW_MAX_TEX_UNIT); + + if (inst->TexSrcTarget == TEXTURE_CUBE_INDEX) { + struct prog_instruction *out; + struct prog_dst_register tmp0 = get_temp(c); + struct prog_src_register tmp0src = src_reg_from_dst(tmp0); + struct prog_dst_register tmp1 = get_temp(c); + struct prog_src_register tmp1src = src_reg_from_dst(tmp1); + struct prog_src_register src0 = inst->SrcReg[0]; + + /* find longest component of coord vector and normalize it */ + tmpcoord = get_temp(c); + coord = src_reg_from_dst(tmpcoord); + + /* tmpcoord = src0 (i.e.: coord = src0) */ + out = emit_op(c, OPCODE_MOV, + tmpcoord, + 0, + src0, + src_undef(), + src_undef()); + out->SrcReg[0].Negate = NEGATE_NONE; + out->SrcReg[0].Abs = 1; + + /* tmp0 = MAX(coord.X, coord.Y) */ + emit_op(c, OPCODE_MAX, + tmp0, + 0, + src_swizzle1(coord, X), + src_swizzle1(coord, Y), + src_undef()); + + /* tmp1 = MAX(tmp0, coord.Z) */ + emit_op(c, OPCODE_MAX, + tmp1, + 0, + tmp0src, + src_swizzle1(coord, Z), + src_undef()); + + /* tmp0 = 1 / tmp1 */ + emit_op(c, OPCODE_RCP, + dst_mask(tmp0, WRITEMASK_X), + 0, + tmp1src, + src_undef(), + src_undef()); + + /* tmpCoord = src0 * tmp0 */ + emit_op(c, OPCODE_MUL, + tmpcoord, + 0, + src0, + src_swizzle1(tmp0src, SWIZZLE_X), + src_undef()); + + release_temp(c, tmp0); + release_temp(c, tmp1); + } + else if (inst->TexSrcTarget == TEXTURE_RECT_INDEX) { + struct prog_src_register scale = + search_or_add_param5( c, + STATE_INTERNAL, + STATE_TEXRECT_SCALE, + unit, + 0,0 ); + + tmpcoord = get_temp(c); + + /* coord.xy = MUL inst->SrcReg[0], { 1/width, 1/height } + */ + emit_op(c, + OPCODE_MUL, + tmpcoord, + 0, + inst->SrcReg[0], + src_swizzle(scale, + SWIZZLE_X, + SWIZZLE_Y, + SWIZZLE_ONE, + SWIZZLE_ONE), + src_undef()); + + coord = src_reg_from_dst(tmpcoord); + } + else { + coord = inst->SrcReg[0]; + } + + /* Need to emit YUV texture conversions by hand. Probably need to + * do this here - the alternative is in brw_wm_emit.c, but the + * conversion requires allocating a temporary variable which we + * don't have the facility to do that late in the compilation. + */ + if (c->key.yuvtex_mask & (1 << unit)) { + /* convert ycbcr to RGBA */ + GLboolean swap_uv = c->key.yuvtex_swap_mask & (1<<unit); + + /* + CONST C0 = { -.5, -.0625, -.5, 1.164 } + CONST C1 = { 1.596, -0.813, 2.018, -.391 } + UYV = TEX ... + UYV.xyz = ADD UYV, C0 + UYV.y = MUL UYV.y, C0.w + if (UV swaped) + RGB.xyz = MAD UYV.zzx, C1, UYV.y + else + RGB.xyz = MAD UYV.xxz, C1, UYV.y + RGB.y = MAD UYV.z, C1.w, RGB.y + */ + struct prog_dst_register dst = inst->DstReg; + struct prog_dst_register tmp = get_temp(c); + struct prog_src_register tmpsrc = src_reg_from_dst(tmp); + struct prog_src_register C0 = search_or_add_const4f( c, -.5, -.0625, -.5, 1.164 ); + struct prog_src_register C1 = search_or_add_const4f( c, 1.596, -0.813, 2.018, -.391 ); + + /* tmp = TEX ... + */ + emit_tex_op(c, + OPCODE_TEX, + tmp, + inst->SaturateMode, + unit, + inst->TexSrcTarget, + inst->TexShadow, + coord, + src_undef(), + src_undef()); + + /* tmp.xyz = ADD TMP, C0 + */ + emit_op(c, + OPCODE_ADD, + dst_mask(tmp, WRITEMASK_XYZ), + 0, + tmpsrc, + C0, + src_undef()); + + /* YUV.y = MUL YUV.y, C0.w + */ + + emit_op(c, + OPCODE_MUL, + dst_mask(tmp, WRITEMASK_Y), + 0, + tmpsrc, + src_swizzle1(C0, W), + src_undef()); + + /* + * if (UV swaped) + * RGB.xyz = MAD YUV.zzx, C1, YUV.y + * else + * RGB.xyz = MAD YUV.xxz, C1, YUV.y + */ + + emit_op(c, + OPCODE_MAD, + dst_mask(dst, WRITEMASK_XYZ), + 0, + swap_uv?src_swizzle(tmpsrc, Z,Z,X,X):src_swizzle(tmpsrc, X,X,Z,Z), + C1, + src_swizzle1(tmpsrc, Y)); + + /* RGB.y = MAD YUV.z, C1.w, RGB.y + */ + emit_op(c, + OPCODE_MAD, + dst_mask(dst, WRITEMASK_Y), + 0, + src_swizzle1(tmpsrc, Z), + src_swizzle1(C1, W), + src_swizzle1(src_reg_from_dst(dst), Y)); + + release_temp(c, tmp); + } + else { + /* ordinary RGBA tex instruction */ + emit_tex_op(c, + OPCODE_TEX, + inst->DstReg, + inst->SaturateMode, + unit, + inst->TexSrcTarget, + inst->TexShadow, + coord, + src_undef(), + src_undef()); + } + + /* For GL_EXT_texture_swizzle: */ + if (c->key.tex_swizzles[unit] != SWIZZLE_NOOP) { + /* swizzle the result of the TEX instruction */ + struct prog_src_register tmpsrc = src_reg_from_dst(inst->DstReg); + emit_op(c, OPCODE_SWZ, + inst->DstReg, + SATURATE_OFF, /* saturate already done above */ + src_swizzle4(tmpsrc, c->key.tex_swizzles[unit]), + src_undef(), + src_undef()); + } + + if ((inst->TexSrcTarget == TEXTURE_RECT_INDEX) || + (inst->TexSrcTarget == TEXTURE_CUBE_INDEX)) + release_temp(c, tmpcoord); +} + + +/** + * Check if the given TXP instruction really needs the divide-by-W step. + */ +static GLboolean projtex( struct brw_wm_compile *c, + const struct prog_instruction *inst ) +{ + const struct prog_src_register src = inst->SrcReg[0]; + GLboolean retVal; + + assert(inst->Opcode == OPCODE_TXP); + + /* Only try to detect the simplest cases. Could detect (later) + * cases where we are trying to emit code like RCP {1.0}, MUL x, + * {1.0}, and so on. + * + * More complex cases than this typically only arise from + * user-provided fragment programs anyway: + */ + if (inst->TexSrcTarget == TEXTURE_CUBE_INDEX) + retVal = GL_FALSE; /* ut2004 gun rendering !?! */ + else if (src.File == PROGRAM_INPUT && + GET_SWZ(src.Swizzle, W) == W && + (c->key.proj_attrib_mask & (1 << src.Index)) == 0) + retVal = GL_FALSE; + else + retVal = GL_TRUE; + + return retVal; +} + + +/** + * Emit code for TXP. + */ +static void precalc_txp( struct brw_wm_compile *c, + const struct prog_instruction *inst ) +{ + struct prog_src_register src0 = inst->SrcReg[0]; + + if (projtex(c, inst)) { + struct prog_dst_register tmp = get_temp(c); + struct prog_instruction tmp_inst; + + /* tmp0.w = RCP inst.arg[0][3] + */ + emit_op(c, + OPCODE_RCP, + dst_mask(tmp, WRITEMASK_W), + 0, + src_swizzle1(src0, GET_SWZ(src0.Swizzle, W)), + src_undef(), + src_undef()); + + /* tmp0.xyz = MUL inst.arg[0], tmp0.wwww + */ + emit_op(c, + OPCODE_MUL, + dst_mask(tmp, WRITEMASK_XYZ), + 0, + src0, + src_swizzle1(src_reg_from_dst(tmp), W), + src_undef()); + + /* dst = precalc(TEX tmp0) + */ + tmp_inst = *inst; + tmp_inst.SrcReg[0] = src_reg_from_dst(tmp); + precalc_tex(c, &tmp_inst); + + release_temp(c, tmp); + } + else + { + /* dst = precalc(TEX src0) + */ + precalc_tex(c, inst); + } +} + + + +static void emit_fb_write( struct brw_wm_compile *c ) +{ + struct prog_src_register payload_r0_depth = src_reg(PROGRAM_PAYLOAD, PAYLOAD_DEPTH); + struct prog_src_register outdepth = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DEPTH); + struct prog_src_register outcolor; + GLuint i; + + struct prog_instruction *inst, *last_inst; + struct brw_context *brw = c->func.brw; + + /* The inst->Aux field is used for FB write target and the EOT marker */ + + if (brw->state.nr_color_regions > 1) { + for (i = 0 ; i < brw->state.nr_color_regions; i++) { + outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DATA0 + i); + last_inst = inst = emit_op(c, + WM_FB_WRITE, dst_mask(dst_undef(),0), 0, + outcolor, payload_r0_depth, outdepth); + inst->Aux = (i<<1); + if (c->fp_fragcolor_emitted) { + outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLOR); + last_inst = inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(),0), + 0, outcolor, payload_r0_depth, outdepth); + inst->Aux = (i<<1); + } + } + last_inst->Aux |= 1; //eot + } + else { + /* if gl_FragData[0] is written, use it, else use gl_FragColor */ + if (c->fp->program.Base.OutputsWritten & (1 << FRAG_RESULT_DATA0)) + outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DATA0); + else + outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLOR); + + inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(),0), + 0, outcolor, payload_r0_depth, outdepth); + inst->Aux = 1|(0<<1); + } +} + + + + +/*********************************************************************** + * Emit INTERP instructions ahead of first use of each attrib. + */ + +static void validate_src_regs( struct brw_wm_compile *c, + const struct prog_instruction *inst ) +{ + GLuint nr_args = brw_wm_nr_args( inst->Opcode ); + GLuint i; + + for (i = 0; i < nr_args; i++) { + if (inst->SrcReg[i].File == PROGRAM_INPUT) { + GLuint idx = inst->SrcReg[i].Index; + if (!(c->fp_interp_emitted & (1<<idx))) { + emit_interp(c, idx); + } + } + } +} + +static void validate_dst_regs( struct brw_wm_compile *c, + const struct prog_instruction *inst ) +{ + if (inst->DstReg.File == PROGRAM_OUTPUT) { + GLuint idx = inst->DstReg.Index; + if (idx == FRAG_RESULT_COLOR) + c->fp_fragcolor_emitted = 1; + } +} + +static void print_insns( const struct prog_instruction *insn, + GLuint nr ) +{ + GLuint i; + for (i = 0; i < nr; i++, insn++) { + _mesa_printf("%3d: ", i); + if (insn->Opcode < MAX_OPCODE) + _mesa_print_instruction(insn); + else if (insn->Opcode < MAX_WM_OPCODE) { + GLuint idx = insn->Opcode - MAX_OPCODE; + + _mesa_print_alu_instruction(insn, + wm_opcode_strings[idx], + 3); + } + else + _mesa_printf("965 Opcode %d\n", insn->Opcode); + } +} + + +/** + * Initial pass for fragment program code generation. + * This function is used by both the GLSL and non-GLSL paths. + */ +void brw_wm_pass_fp( struct brw_wm_compile *c ) +{ + struct brw_fragment_program *fp = c->fp; + GLuint insn; + + if (INTEL_DEBUG & DEBUG_WM) { + _mesa_printf("pre-fp:\n"); + _mesa_print_program(&fp->program.Base); + _mesa_printf("\n"); + } + + c->pixel_xy = src_undef(); + c->delta_xy = src_undef(); + c->pixel_w = src_undef(); + c->nr_fp_insns = 0; + c->fp->tex_units_used = 0x0; + + /* Emit preamble instructions. This is where special instructions such as + * WM_CINTERP, WM_LINTERP, WM_PINTERP and WM_WPOSXY are emitted to + * compute shader inputs from varying vars. + */ + for (insn = 0; insn < fp->program.Base.NumInstructions; insn++) { + const struct prog_instruction *inst = &fp->program.Base.Instructions[insn]; + validate_src_regs(c, inst); + validate_dst_regs(c, inst); + } + + /* Loop over all instructions doing assorted simplifications and + * transformations. + */ + for (insn = 0; insn < fp->program.Base.NumInstructions; insn++) { + const struct prog_instruction *inst = &fp->program.Base.Instructions[insn]; + struct prog_instruction *out; + + /* Check for INPUT values, emit INTERP instructions where + * necessary: + */ + + switch (inst->Opcode) { + case OPCODE_SWZ: + out = emit_insn(c, inst); + out->Opcode = OPCODE_MOV; + break; + + case OPCODE_ABS: + out = emit_insn(c, inst); + out->Opcode = OPCODE_MOV; + out->SrcReg[0].Negate = NEGATE_NONE; + out->SrcReg[0].Abs = 1; + break; + + case OPCODE_SUB: + out = emit_insn(c, inst); + out->Opcode = OPCODE_ADD; + out->SrcReg[1].Negate ^= NEGATE_XYZW; + break; + + case OPCODE_SCS: + out = emit_insn(c, inst); + /* This should probably be done in the parser. + */ + out->DstReg.WriteMask &= WRITEMASK_XY; + break; + + case OPCODE_DST: + precalc_dst(c, inst); + break; + + case OPCODE_LIT: + precalc_lit(c, inst); + break; + + case OPCODE_TEX: + precalc_tex(c, inst); + break; + + case OPCODE_TXP: + precalc_txp(c, inst); + break; + + case OPCODE_TXB: + out = emit_insn(c, inst); + out->TexSrcUnit = fp->program.Base.SamplerUnits[inst->TexSrcUnit]; + assert(out->TexSrcUnit < BRW_MAX_TEX_UNIT); + break; + + case OPCODE_XPD: + out = emit_insn(c, inst); + /* This should probably be done in the parser. + */ + out->DstReg.WriteMask &= WRITEMASK_XYZ; + break; + + case OPCODE_KIL: + out = emit_insn(c, inst); + /* This should probably be done in the parser. + */ + out->DstReg.WriteMask = 0; + break; + case OPCODE_END: + emit_fb_write(c); + break; + case OPCODE_PRINT: + break; + default: + if (brw_wm_is_scalar_result(inst->Opcode)) + emit_scalar_insn(c, inst); + else + emit_insn(c, inst); + break; + } + } + + if (INTEL_DEBUG & DEBUG_WM) { + _mesa_printf("pass_fp:\n"); + print_insns( c->prog_instructions, c->nr_fp_insns ); + _mesa_printf("\n"); + } +} + diff --git a/src/gallium/drivers/i965/brw_wm_glsl.c b/src/gallium/drivers/i965/brw_wm_glsl.c new file mode 100644 index 00000000000..c9fe1dd8ad2 --- /dev/null +++ b/src/gallium/drivers/i965/brw_wm_glsl.c @@ -0,0 +1,3046 @@ +#include "main/macros.h" +#include "shader/prog_parameter.h" +#include "shader/prog_print.h" +#include "shader/prog_optimize.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_wm.h" + +enum _subroutine { + SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4 +}; + +static struct brw_reg get_dst_reg(struct brw_wm_compile *c, + const struct prog_instruction *inst, + GLuint component); + +/** + * Determine if the given fragment program uses GLSL features such + * as flow conditionals, loops, subroutines. + * Some GLSL shaders may use these features, others might not. + */ +GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp) +{ + int i; + + for (i = 0; i < fp->Base.NumInstructions; i++) { + const struct prog_instruction *inst = &fp->Base.Instructions[i]; + switch (inst->Opcode) { + case OPCODE_ARL: + case OPCODE_IF: + case OPCODE_ENDIF: + case OPCODE_CAL: + case OPCODE_BRK: + case OPCODE_RET: + case OPCODE_NOISE1: + case OPCODE_NOISE2: + case OPCODE_NOISE3: + case OPCODE_NOISE4: + case OPCODE_BGNLOOP: + return GL_TRUE; + default: + break; + } + } + return GL_FALSE; +} + + + +static void +reclaim_temps(struct brw_wm_compile *c); + + +/** Mark GRF register as used. */ +static void +prealloc_grf(struct brw_wm_compile *c, int r) +{ + c->used_grf[r] = GL_TRUE; +} + + +/** Mark given GRF register as not in use. */ +static void +release_grf(struct brw_wm_compile *c, int r) +{ + /*assert(c->used_grf[r]);*/ + c->used_grf[r] = GL_FALSE; + c->first_free_grf = MIN2(c->first_free_grf, r); +} + + +/** Return index of a free GRF, mark it as used. */ +static int +alloc_grf(struct brw_wm_compile *c) +{ + GLuint r; + for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) { + if (!c->used_grf[r]) { + c->used_grf[r] = GL_TRUE; + c->first_free_grf = r + 1; /* a guess */ + return r; + } + } + + /* no free temps, try to reclaim some */ + reclaim_temps(c); + c->first_free_grf = 0; + + /* try alloc again */ + for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) { + if (!c->used_grf[r]) { + c->used_grf[r] = GL_TRUE; + c->first_free_grf = r + 1; /* a guess */ + return r; + } + } + + for (r = 0; r < BRW_WM_MAX_GRF; r++) { + assert(c->used_grf[r]); + } + + /* really, no free GRF regs found */ + if (!c->out_of_regs) { + /* print warning once per compilation */ + _mesa_warning(NULL, "i965: ran out of registers for fragment program"); + c->out_of_regs = GL_TRUE; + } + + return -1; +} + + +/** Return number of GRF registers used */ +static int +num_grf_used(const struct brw_wm_compile *c) +{ + int r; + for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--) + if (c->used_grf[r]) + return r + 1; + return 0; +} + + + +/** + * Record the mapping of a Mesa register to a hardware register. + */ +static void set_reg(struct brw_wm_compile *c, int file, int index, + int component, struct brw_reg reg) +{ + c->wm_regs[file][index][component].reg = reg; + c->wm_regs[file][index][component].inited = GL_TRUE; +} + +static struct brw_reg alloc_tmp(struct brw_wm_compile *c) +{ + struct brw_reg reg; + + /* if we need to allocate another temp, grow the tmp_regs[] array */ + if (c->tmp_index == c->tmp_max) { + int r = alloc_grf(c); + if (r < 0) { + /*printf("Out of temps in %s\n", __FUNCTION__);*/ + r = 50; /* XXX random register! */ + } + c->tmp_regs[ c->tmp_max++ ] = r; + } + + /* form the GRF register */ + reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0); + /*printf("alloc_temp %d\n", reg.nr);*/ + assert(reg.nr < BRW_WM_MAX_GRF); + return reg; + +} + +/** + * Save current temp register info. + * There must be a matching call to release_tmps(). + */ +static int mark_tmps(struct brw_wm_compile *c) +{ + return c->tmp_index; +} + +static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index ) +{ + return brw_vec8_grf( c->tmp_regs[ index ], 0 ); +} + +static void release_tmps(struct brw_wm_compile *c, int mark) +{ + c->tmp_index = mark; +} + +/** + * Convert Mesa src register to brw register. + * + * Since we're running in SOA mode each Mesa register corresponds to four + * hardware registers. We allocate the hardware registers as needed here. + * + * \param file register file, one of PROGRAM_x + * \param index register number + * \param component src component (X=0, Y=1, Z=2, W=3) + * \param nr not used?!? + * \param neg negate value? + * \param abs take absolute value? + */ +static struct brw_reg +get_reg(struct brw_wm_compile *c, int file, int index, int component, + int nr, GLuint neg, GLuint abs) +{ + struct brw_reg reg; + switch (file) { + case PROGRAM_STATE_VAR: + case PROGRAM_CONSTANT: + case PROGRAM_UNIFORM: + file = PROGRAM_STATE_VAR; + break; + case PROGRAM_UNDEFINED: + return brw_null_reg(); + case PROGRAM_TEMPORARY: + case PROGRAM_INPUT: + case PROGRAM_OUTPUT: + case PROGRAM_PAYLOAD: + break; + default: + _mesa_problem(NULL, "Unexpected file in get_reg()"); + return brw_null_reg(); + } + + assert(index < 256); + assert(component < 4); + + /* see if we've already allocated a HW register for this Mesa register */ + if (c->wm_regs[file][index][component].inited) { + /* yes, re-use */ + reg = c->wm_regs[file][index][component].reg; + } + else { + /* no, allocate new register */ + int grf = alloc_grf(c); + /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/ + if (grf < 0) { + /* totally out of temps */ + grf = 51; /* XXX random register! */ + } + + reg = brw_vec8_grf(grf, 0); + /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/ + + set_reg(c, file, index, component, reg); + } + + if (neg & (1 << component)) { + reg = negate(reg); + } + if (abs) + reg = brw_abs(reg); + return reg; +} + + + +/** + * This is called if we run out of GRF registers. Examine the live intervals + * of temp regs in the program and free those which won't be used again. + */ +static void +reclaim_temps(struct brw_wm_compile *c) +{ + GLint intBegin[MAX_PROGRAM_TEMPS]; + GLint intEnd[MAX_PROGRAM_TEMPS]; + int index; + + /*printf("Reclaim temps:\n");*/ + + _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns, + intBegin, intEnd); + + for (index = 0; index < MAX_PROGRAM_TEMPS; index++) { + if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) { + /* program temp[i] can be freed */ + int component; + /*printf(" temp[%d] is dead\n", index);*/ + for (component = 0; component < 4; component++) { + if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) { + int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr; + release_grf(c, r); + /* + printf(" Reclaim temp %d, reg %d at inst %d\n", + index, r, c->cur_inst); + */ + c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE; + } + } + } + } +} + + + + +/** + * Preallocate registers. This sets up the Mesa to hardware register + * mapping for certain registers, such as constants (uniforms/state vars) + * and shader inputs. + */ +static void prealloc_reg(struct brw_wm_compile *c) +{ + int i, j; + struct brw_reg reg; + int urb_read_length = 0; + GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted; + GLuint reg_index = 0; + + memset(c->used_grf, GL_FALSE, sizeof(c->used_grf)); + c->first_free_grf = 0; + + for (i = 0; i < 4; i++) { + if (i < c->key.nr_depth_regs) + reg = brw_vec8_grf(i * 2, 0); + else + reg = brw_vec8_grf(0, 0); + set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg); + } + reg_index += 2 * c->key.nr_depth_regs; + + /* constants */ + { + const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters; + const GLuint nr_temps = c->fp->program.Base.NumTemporaries; + + /* use a real constant buffer, or just use a section of the GRF? */ + /* XXX this heuristic may need adjustment... */ + if ((nr_params + nr_temps) * 4 + reg_index > 80) + c->fp->use_const_buffer = GL_TRUE; + else + c->fp->use_const_buffer = GL_FALSE; + /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/ + + if (c->fp->use_const_buffer) { + /* We'll use a real constant buffer and fetch constants from + * it with a dataport read message. + */ + + /* number of float constants in CURBE */ + c->prog_data.nr_params = 0; + } + else { + const struct gl_program_parameter_list *plist = + c->fp->program.Base.Parameters; + int index = 0; + + /* number of float constants in CURBE */ + c->prog_data.nr_params = 4 * nr_params; + + /* loop over program constants (float[4]) */ + for (i = 0; i < nr_params; i++) { + /* loop over XYZW channels */ + for (j = 0; j < 4; j++, index++) { + reg = brw_vec1_grf(reg_index + index / 8, index % 8); + /* Save pointer to parameter/constant value. + * Constants will be copied in prepare_constant_buffer() + */ + c->prog_data.param[index] = &plist->ParameterValues[i][j]; + set_reg(c, PROGRAM_STATE_VAR, i, j, reg); + } + } + /* number of constant regs used (each reg is float[8]) */ + c->nr_creg = 2 * ((4 * nr_params + 15) / 16); + reg_index += c->nr_creg; + } + } + + /* fragment shader inputs */ + for (i = 0; i < VERT_RESULT_MAX; i++) { + int fp_input; + + if (i >= VERT_RESULT_VAR0) + fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0; + else if (i <= VERT_RESULT_TEX7) + fp_input = i; + else + fp_input = -1; + + if (fp_input >= 0 && inputs & (1 << fp_input)) { + urb_read_length = reg_index; + reg = brw_vec8_grf(reg_index, 0); + for (j = 0; j < 4; j++) + set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg); + } + if (c->key.vp_outputs_written & (1 << i)) { + reg_index += 2; + } + } + + c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2; + c->prog_data.urb_read_length = urb_read_length; + c->prog_data.curb_read_length = c->nr_creg; + c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0); + reg_index++; + c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0); + reg_index += 2; + + /* mark GRF regs [0..reg_index-1] as in-use */ + for (i = 0; i < reg_index; i++) + prealloc_grf(c, i); + + /* Don't use GRF 126, 127. Using them seems to lead to GPU lock-ups */ + prealloc_grf(c, 126); + prealloc_grf(c, 127); + + for (i = 0; i < c->nr_fp_insns; i++) { + const struct prog_instruction *inst = &c->prog_instructions[i]; + struct brw_reg dst[4]; + + switch (inst->Opcode) { + case OPCODE_TEX: + case OPCODE_TXB: + /* Allocate the channels of texture results contiguously, + * since they are written out that way by the sampler unit. + */ + for (j = 0; j < 4; j++) { + dst[j] = get_dst_reg(c, inst, j); + if (j != 0) + assert(dst[j].nr == dst[j - 1].nr + 1); + } + break; + default: + break; + } + } + + /* An instruction may reference up to three constants. + * They'll be found in these registers. + * XXX alloc these on demand! + */ + if (c->fp->use_const_buffer) { + for (i = 0; i < 3; i++) { + c->current_const[i].index = -1; + c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0); + } + } +#if 0 + printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer); + printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index); +#endif +} + + +/** + * Check if any of the instruction's src registers are constants, uniforms, + * or statevars. If so, fetch any constants that we don't already have in + * the three GRF slots. + */ +static void fetch_constants(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_compile *p = &c->func; + GLuint i; + + /* loop over instruction src regs */ + for (i = 0; i < 3; i++) { + const struct prog_src_register *src = &inst->SrcReg[i]; + if (src->File == PROGRAM_STATE_VAR || + src->File == PROGRAM_CONSTANT || + src->File == PROGRAM_UNIFORM) { + c->current_const[i].index = src->Index; + +#if 0 + printf(" fetch const[%d] for arg %d into reg %d\n", + src->Index, i, c->current_const[i].reg.nr); +#endif + + /* need to fetch the constant now */ + brw_dp_READ_4(p, + c->current_const[i].reg, /* writeback dest */ + src->RelAddr, /* relative indexing? */ + 16 * src->Index, /* byte offset */ + SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */ + ); + } + } +} + + +/** + * Convert Mesa dst register to brw register. + */ +static struct brw_reg get_dst_reg(struct brw_wm_compile *c, + const struct prog_instruction *inst, + GLuint component) +{ + const int nr = 1; + return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr, + 0, 0); +} + + +static struct brw_reg +get_src_reg_const(struct brw_wm_compile *c, + const struct prog_instruction *inst, + GLuint srcRegIndex, GLuint component) +{ + /* We should have already fetched the constant from the constant + * buffer in fetch_constants(). Now we just have to return a + * register description that extracts the needed component and + * smears it across all eight vector components. + */ + const struct prog_src_register *src = &inst->SrcReg[srcRegIndex]; + struct brw_reg const_reg; + + assert(component < 4); + assert(srcRegIndex < 3); + assert(c->current_const[srcRegIndex].index != -1); + const_reg = c->current_const[srcRegIndex].reg; + + /* extract desired float from the const_reg, and smear */ + const_reg = stride(const_reg, 0, 1, 0); + const_reg.subnr = component * 4; + + if (src->Negate & (1 << component)) + const_reg = negate(const_reg); + if (src->Abs) + const_reg = brw_abs(const_reg); + +#if 0 + printf(" form const[%d].%d for arg %d, reg %d\n", + c->current_const[srcRegIndex].index, + component, + srcRegIndex, + const_reg.nr); +#endif + + return const_reg; +} + + +/** + * Convert Mesa src register to brw register. + */ +static struct brw_reg get_src_reg(struct brw_wm_compile *c, + const struct prog_instruction *inst, + GLuint srcRegIndex, GLuint channel) +{ + const struct prog_src_register *src = &inst->SrcReg[srcRegIndex]; + const GLuint nr = 1; + const GLuint component = GET_SWZ(src->Swizzle, channel); + + /* Extended swizzle terms */ + if (component == SWIZZLE_ZERO) { + return brw_imm_f(0.0F); + } + else if (component == SWIZZLE_ONE) { + return brw_imm_f(1.0F); + } + + if (c->fp->use_const_buffer && + (src->File == PROGRAM_STATE_VAR || + src->File == PROGRAM_CONSTANT || + src->File == PROGRAM_UNIFORM)) { + return get_src_reg_const(c, inst, srcRegIndex, component); + } + else { + /* other type of source register */ + return get_reg(c, src->File, src->Index, component, nr, + src->Negate, src->Abs); + } +} + + +/** + * Same as \sa get_src_reg() but if the register is a literal, emit + * a brw_reg encoding the literal. + * Note that a brw instruction only allows one src operand to be a literal. + * For instructions with more than one operand, only the second can be a + * literal. This means that we treat some literals as constants/uniforms + * (which why PROGRAM_CONSTANT is checked in fetch_constants()). + * + */ +static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c, + const struct prog_instruction *inst, + GLuint srcRegIndex, GLuint channel) +{ + const struct prog_src_register *src = &inst->SrcReg[srcRegIndex]; + if (src->File == PROGRAM_CONSTANT) { + /* a literal */ + const int component = GET_SWZ(src->Swizzle, channel); + const GLfloat *param = + c->fp->program.Base.Parameters->ParameterValues[src->Index]; + GLfloat value = param[component]; + if (src->Negate & (1 << channel)) + value = -value; + if (src->Abs) + value = FABSF(value); +#if 0 + printf(" form immed value %f for chan %d\n", value, channel); +#endif + return brw_imm_f(value); + } + else { + return get_src_reg(c, inst, srcRegIndex, channel); + } +} + + +/** + * Subroutines are minimal support for resusable instruction sequences. + * They are implemented as simply as possible to minimise overhead: there + * is no explicit support for communication between the caller and callee + * other than saving the return address in a temporary register, nor is + * there any automatic local storage. This implies that great care is + * required before attempting reentrancy or any kind of nested + * subroutine invocations. + */ +static void invoke_subroutine( struct brw_wm_compile *c, + enum _subroutine subroutine, + void (*emit)( struct brw_wm_compile * ) ) +{ + struct brw_compile *p = &c->func; + + assert( subroutine < BRW_WM_MAX_SUBROUTINE ); + + if( c->subroutines[ subroutine ] ) { + /* subroutine previously emitted: reuse existing instructions */ + + int mark = mark_tmps( c ); + struct brw_reg return_address = retype( alloc_tmp( c ), + BRW_REGISTER_TYPE_UD ); + int here = p->nr_insn; + + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) ); + + brw_ADD( p, brw_ip_reg(), brw_ip_reg(), + brw_imm_d( ( c->subroutines[ subroutine ] - + here - 1 ) << 4 ) ); + brw_pop_insn_state(p); + + release_tmps( c, mark ); + } else { + /* previously unused subroutine: emit, and mark for later reuse */ + + int mark = mark_tmps( c ); + struct brw_reg return_address = retype( alloc_tmp( c ), + BRW_REGISTER_TYPE_UD ); + struct brw_instruction *calc; + int base = p->nr_insn; + + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); + calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) ); + brw_pop_insn_state(p); + + c->subroutines[ subroutine ] = p->nr_insn; + + emit( c ); + + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_MOV( p, brw_ip_reg(), return_address ); + brw_pop_insn_state(p); + + brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) ); + + release_tmps( c, mark ); + } +} + +static void emit_trunc( struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + int i; + struct brw_compile *p = &c->func; + GLuint mask = inst->DstReg.WriteMask; + brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF); + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + struct brw_reg src, dst; + dst = get_dst_reg(c, inst, i); + src = get_src_reg(c, inst, 0, i); + brw_RNDZ(p, dst, src); + } + } + brw_set_saturate(p, 0); +} + +static void emit_mov( struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + int i; + struct brw_compile *p = &c->func; + GLuint mask = inst->DstReg.WriteMask; + brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF); + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + struct brw_reg src, dst; + dst = get_dst_reg(c, inst, i); + /* XXX some moves from immediate value don't work reliably!!! */ + /*src = get_src_reg_imm(c, inst, 0, i);*/ + src = get_src_reg(c, inst, 0, i); + brw_MOV(p, dst, src); + } + } + brw_set_saturate(p, 0); +} + +static void emit_pixel_xy(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_reg r1 = brw_vec1_grf(1, 0); + struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW); + + struct brw_reg dst0, dst1; + struct brw_compile *p = &c->func; + GLuint mask = inst->DstReg.WriteMask; + + dst0 = get_dst_reg(c, inst, 0); + dst1 = get_dst_reg(c, inst, 1); + /* Calculate pixel centers by adding 1 or 0 to each of the + * micro-tile coordinates passed in r1. + */ + if (mask & WRITEMASK_X) { + brw_ADD(p, + vec8(retype(dst0, BRW_REGISTER_TYPE_UW)), + stride(suboffset(r1_uw, 4), 2, 4, 0), + brw_imm_v(0x10101010)); + } + + if (mask & WRITEMASK_Y) { + brw_ADD(p, + vec8(retype(dst1, BRW_REGISTER_TYPE_UW)), + stride(suboffset(r1_uw, 5), 2, 4, 0), + brw_imm_v(0x11001100)); + } +} + +static void emit_delta_xy(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_reg r1 = brw_vec1_grf(1, 0); + struct brw_reg dst0, dst1, src0, src1; + struct brw_compile *p = &c->func; + GLuint mask = inst->DstReg.WriteMask; + + dst0 = get_dst_reg(c, inst, 0); + dst1 = get_dst_reg(c, inst, 1); + src0 = get_src_reg(c, inst, 0, 0); + src1 = get_src_reg(c, inst, 0, 1); + /* Calc delta X,Y by subtracting origin in r1 from the pixel + * centers. + */ + if (mask & WRITEMASK_X) { + brw_ADD(p, + dst0, + retype(src0, BRW_REGISTER_TYPE_UW), + negate(r1)); + } + + if (mask & WRITEMASK_Y) { + brw_ADD(p, + dst1, + retype(src1, BRW_REGISTER_TYPE_UW), + negate(suboffset(r1,1))); + + } +} + +static void fire_fb_write( struct brw_wm_compile *c, + GLuint base_reg, + GLuint nr, + GLuint target, + GLuint eot) +{ + struct brw_compile *p = &c->func; + /* Pass through control information: + */ + /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */ + { + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */ + brw_MOV(p, + brw_message_reg(base_reg + 1), + brw_vec8_grf(1, 0)); + brw_pop_insn_state(p); + } + /* Send framebuffer write message: */ + brw_fb_WRITE(p, + retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW), + base_reg, + retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW), + target, + nr, + 0, + eot); +} + +static void emit_fb_write(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_compile *p = &c->func; + int nr = 2; + int channel; + GLuint target, eot; + struct brw_reg src0; + + /* Reserve a space for AA - may not be needed: + */ + if (c->key.aa_dest_stencil_reg) + nr += 1; + + brw_push_insn_state(p); + for (channel = 0; channel < 4; channel++) { + src0 = get_src_reg(c, inst, 0, channel); + /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */ + /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */ + brw_MOV(p, brw_message_reg(nr + channel), src0); + } + /* skip over the regs populated above: */ + nr += 8; + brw_pop_insn_state(p); + + if (c->key.source_depth_to_render_target) { + if (c->key.computes_depth) { + src0 = get_src_reg(c, inst, 2, 2); + brw_MOV(p, brw_message_reg(nr), src0); + } + else { + src0 = get_src_reg(c, inst, 1, 1); + brw_MOV(p, brw_message_reg(nr), src0); + } + + nr += 2; + } + + if (c->key.dest_depth_reg) { + const GLuint comp = c->key.dest_depth_reg / 2; + const GLuint off = c->key.dest_depth_reg % 2; + + if (off != 0) { + /* XXX this code needs review/testing */ + struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp); + struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1); + + brw_push_insn_state(p); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + + brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1)); + /* 2nd half? */ + brw_MOV(p, brw_message_reg(nr+1), arg1_1); + brw_pop_insn_state(p); + } + else + { + struct brw_reg src = get_src_reg(c, inst, 1, 1); + brw_MOV(p, brw_message_reg(nr), src); + } + nr += 2; + } + + target = inst->Aux >> 1; + eot = inst->Aux & 1; + fire_fb_write(c, 0, nr, target, eot); +} + +static void emit_pixel_w( struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_compile *p = &c->func; + GLuint mask = inst->DstReg.WriteMask; + if (mask & WRITEMASK_W) { + struct brw_reg dst, src0, delta0, delta1; + struct brw_reg interp3; + + dst = get_dst_reg(c, inst, 3); + src0 = get_src_reg(c, inst, 0, 0); + delta0 = get_src_reg(c, inst, 1, 0); + delta1 = get_src_reg(c, inst, 1, 1); + + interp3 = brw_vec1_grf(src0.nr+1, 4); + /* Calc 1/w - just linterp wpos[3] optimized by putting the + * result straight into a message reg. + */ + brw_LINE(p, brw_null_reg(), interp3, delta0); + brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1); + + /* Calc w */ + brw_math_16( p, dst, + BRW_MATH_FUNCTION_INV, + BRW_MATH_SATURATE_NONE, + 2, brw_null_reg(), + BRW_MATH_PRECISION_FULL); + } +} + +static void emit_linterp(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_compile *p = &c->func; + GLuint mask = inst->DstReg.WriteMask; + struct brw_reg interp[4]; + struct brw_reg dst, delta0, delta1; + struct brw_reg src0; + GLuint nr, i; + + src0 = get_src_reg(c, inst, 0, 0); + delta0 = get_src_reg(c, inst, 1, 0); + delta1 = get_src_reg(c, inst, 1, 1); + nr = src0.nr; + + interp[0] = brw_vec1_grf(nr, 0); + interp[1] = brw_vec1_grf(nr, 4); + interp[2] = brw_vec1_grf(nr+1, 0); + interp[3] = brw_vec1_grf(nr+1, 4); + + for(i = 0; i < 4; i++ ) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + brw_LINE(p, brw_null_reg(), interp[i], delta0); + brw_MAC(p, dst, suboffset(interp[i],1), delta1); + } + } +} + +static void emit_cinterp(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_compile *p = &c->func; + GLuint mask = inst->DstReg.WriteMask; + + struct brw_reg interp[4]; + struct brw_reg dst, src0; + GLuint nr, i; + + src0 = get_src_reg(c, inst, 0, 0); + nr = src0.nr; + + interp[0] = brw_vec1_grf(nr, 0); + interp[1] = brw_vec1_grf(nr, 4); + interp[2] = brw_vec1_grf(nr+1, 0); + interp[3] = brw_vec1_grf(nr+1, 4); + + for(i = 0; i < 4; i++ ) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + brw_MOV(p, dst, suboffset(interp[i],3)); + } + } +} + +static void emit_pinterp(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_compile *p = &c->func; + GLuint mask = inst->DstReg.WriteMask; + + struct brw_reg interp[4]; + struct brw_reg dst, delta0, delta1; + struct brw_reg src0, w; + GLuint nr, i; + + src0 = get_src_reg(c, inst, 0, 0); + delta0 = get_src_reg(c, inst, 1, 0); + delta1 = get_src_reg(c, inst, 1, 1); + w = get_src_reg(c, inst, 2, 3); + nr = src0.nr; + + interp[0] = brw_vec1_grf(nr, 0); + interp[1] = brw_vec1_grf(nr, 4); + interp[2] = brw_vec1_grf(nr+1, 0); + interp[3] = brw_vec1_grf(nr+1, 4); + + for(i = 0; i < 4; i++ ) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + brw_LINE(p, brw_null_reg(), interp[i], delta0); + brw_MAC(p, dst, suboffset(interp[i],1), + delta1); + brw_MUL(p, dst, dst, w); + } + } +} + +/* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */ +static void emit_frontfacing(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_compile *p = &c->func; + struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); + struct brw_reg dst; + GLuint mask = inst->DstReg.WriteMask; + int i; + + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + brw_MOV(p, dst, brw_imm_f(0.0)); + } + } + + /* bit 31 is "primitive is back face", so checking < (1 << 31) gives + * us front face + */ + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31)); + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + brw_MOV(p, dst, brw_imm_f(1.0)); + } + } + brw_set_predicate_control_flag_value(p, 0xff); +} + +static void emit_xpd(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + int i; + struct brw_compile *p = &c->func; + GLuint mask = inst->DstReg.WriteMask; + for (i = 0; i < 4; i++) { + GLuint i2 = (i+2)%3; + GLuint i1 = (i+1)%3; + if (mask & (1<<i)) { + struct brw_reg src0, src1, dst; + dst = get_dst_reg(c, inst, i); + src0 = negate(get_src_reg(c, inst, 0, i2)); + src1 = get_src_reg_imm(c, inst, 1, i1); + brw_MUL(p, brw_null_reg(), src0, src1); + src0 = get_src_reg(c, inst, 0, i1); + src1 = get_src_reg_imm(c, inst, 1, i2); + brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF); + brw_MAC(p, dst, src0, src1); + brw_set_saturate(p, 0); + } + } + brw_set_saturate(p, 0); +} + +static void emit_dp3(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_reg src0[3], src1[3], dst; + int i; + struct brw_compile *p = &c->func; + GLuint mask = inst->DstReg.WriteMask; + int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1; + + if (!(mask & WRITEMASK_XYZW)) + return; + + assert(is_power_of_two(mask & WRITEMASK_XYZW)); + + for (i = 0; i < 3; i++) { + src0[i] = get_src_reg(c, inst, 0, i); + src1[i] = get_src_reg_imm(c, inst, 1, i); + } + + dst = get_dst_reg(c, inst, dst_chan); + brw_MUL(p, brw_null_reg(), src0[0], src1[0]); + brw_MAC(p, brw_null_reg(), src0[1], src1[1]); + brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); + brw_MAC(p, dst, src0[2], src1[2]); + brw_set_saturate(p, 0); +} + +static void emit_dp4(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_reg src0[4], src1[4], dst; + int i; + struct brw_compile *p = &c->func; + GLuint mask = inst->DstReg.WriteMask; + int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1; + + if (!(mask & WRITEMASK_XYZW)) + return; + + assert(is_power_of_two(mask & WRITEMASK_XYZW)); + + for (i = 0; i < 4; i++) { + src0[i] = get_src_reg(c, inst, 0, i); + src1[i] = get_src_reg_imm(c, inst, 1, i); + } + dst = get_dst_reg(c, inst, dst_chan); + brw_MUL(p, brw_null_reg(), src0[0], src1[0]); + brw_MAC(p, brw_null_reg(), src0[1], src1[1]); + brw_MAC(p, brw_null_reg(), src0[2], src1[2]); + brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); + brw_MAC(p, dst, src0[3], src1[3]); + brw_set_saturate(p, 0); +} + +static void emit_dph(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_reg src0[4], src1[4], dst; + int i; + struct brw_compile *p = &c->func; + GLuint mask = inst->DstReg.WriteMask; + int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1; + + if (!(mask & WRITEMASK_XYZW)) + return; + + assert(is_power_of_two(mask & WRITEMASK_XYZW)); + + for (i = 0; i < 4; i++) { + src0[i] = get_src_reg(c, inst, 0, i); + src1[i] = get_src_reg_imm(c, inst, 1, i); + } + dst = get_dst_reg(c, inst, dst_chan); + brw_MUL(p, brw_null_reg(), src0[0], src1[0]); + brw_MAC(p, brw_null_reg(), src0[1], src1[1]); + brw_MAC(p, dst, src0[2], src1[2]); + brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); + brw_ADD(p, dst, dst, src1[3]); + brw_set_saturate(p, 0); +} + +/** + * Emit a scalar instruction, like RCP, RSQ, LOG, EXP. + * Note that the result of the function is smeared across the dest + * register's X, Y, Z and W channels (subject to writemasking of course). + */ +static void emit_math1(struct brw_wm_compile *c, + const struct prog_instruction *inst, GLuint func) +{ + struct brw_compile *p = &c->func; + struct brw_reg src0, dst; + GLuint mask = inst->DstReg.WriteMask; + int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1; + + if (!(mask & WRITEMASK_XYZW)) + return; + + assert(is_power_of_two(mask & WRITEMASK_XYZW)); + + /* Get first component of source register */ + dst = get_dst_reg(c, inst, dst_chan); + src0 = get_src_reg(c, inst, 0, 0); + + brw_MOV(p, brw_message_reg(2), src0); + brw_math(p, + dst, + func, + (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE, + 2, + brw_null_reg(), + BRW_MATH_DATA_VECTOR, + BRW_MATH_PRECISION_FULL); +} + +static void emit_rcp(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + emit_math1(c, inst, BRW_MATH_FUNCTION_INV); +} + +static void emit_rsq(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ); +} + +static void emit_sin(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + emit_math1(c, inst, BRW_MATH_FUNCTION_SIN); +} + +static void emit_cos(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + emit_math1(c, inst, BRW_MATH_FUNCTION_COS); +} + +static void emit_ex2(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + emit_math1(c, inst, BRW_MATH_FUNCTION_EXP); +} + +static void emit_lg2(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + emit_math1(c, inst, BRW_MATH_FUNCTION_LOG); +} + +static void emit_add(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_compile *p = &c->func; + struct brw_reg src0, src1, dst; + GLuint mask = inst->DstReg.WriteMask; + int i; + brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); + for (i = 0 ; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + src0 = get_src_reg(c, inst, 0, i); + src1 = get_src_reg_imm(c, inst, 1, i); + brw_ADD(p, dst, src0, src1); + } + } + brw_set_saturate(p, 0); +} + +static void emit_arl(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_compile *p = &c->func; + struct brw_reg src0, addr_reg; + brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); + addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_ADDRESS, 0); + src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */ + brw_MOV(p, addr_reg, src0); + brw_set_saturate(p, 0); +} + + +static void emit_mul(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_compile *p = &c->func; + struct brw_reg src0, src1, dst; + GLuint mask = inst->DstReg.WriteMask; + int i; + brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); + for (i = 0 ; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + src0 = get_src_reg(c, inst, 0, i); + src1 = get_src_reg_imm(c, inst, 1, i); + brw_MUL(p, dst, src0, src1); + } + } + brw_set_saturate(p, 0); +} + +static void emit_frc(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_compile *p = &c->func; + struct brw_reg src0, dst; + GLuint mask = inst->DstReg.WriteMask; + int i; + brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); + for (i = 0 ; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + src0 = get_src_reg_imm(c, inst, 0, i); + brw_FRC(p, dst, src0); + } + } + if (inst->SaturateMode != SATURATE_OFF) + brw_set_saturate(p, 0); +} + +static void emit_flr(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_compile *p = &c->func; + struct brw_reg src0, dst; + GLuint mask = inst->DstReg.WriteMask; + int i; + brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); + for (i = 0 ; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + src0 = get_src_reg_imm(c, inst, 0, i); + brw_RNDD(p, dst, src0); + } + } + brw_set_saturate(p, 0); +} + + +static void emit_min_max(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_compile *p = &c->func; + const GLuint mask = inst->DstReg.WriteMask; + const int mark = mark_tmps(c); + int i; + brw_push_insn_state(p); + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + struct brw_reg real_dst = get_dst_reg(c, inst, i); + struct brw_reg src0 = get_src_reg(c, inst, 0, i); + struct brw_reg src1 = get_src_reg(c, inst, 1, i); + struct brw_reg dst; + /* if dst==src0 or dst==src1 we need to use a temp reg */ + GLboolean use_temp = brw_same_reg(dst, src0) || + brw_same_reg(dst, src1); + if (use_temp) + dst = alloc_tmp(c); + else + dst = real_dst; + + /* + printf(" Min/max: dst %d src0 %d src1 %d\n", + dst.nr, src0.nr, src1.nr); + */ + brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); + brw_MOV(p, dst, src0); + brw_set_saturate(p, 0); + + if (inst->Opcode == OPCODE_MIN) + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0); + else + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0); + + brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); + brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); + brw_MOV(p, dst, src1); + brw_set_saturate(p, 0); + brw_set_predicate_control_flag_value(p, 0xff); + if (use_temp) + brw_MOV(p, real_dst, dst); + } + } + brw_pop_insn_state(p); + release_tmps(c, mark); +} + +static void emit_pow(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_compile *p = &c->func; + struct brw_reg dst, src0, src1; + GLuint mask = inst->DstReg.WriteMask; + int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1; + + if (!(mask & WRITEMASK_XYZW)) + return; + + assert(is_power_of_two(mask & WRITEMASK_XYZW)); + + dst = get_dst_reg(c, inst, dst_chan); + src0 = get_src_reg_imm(c, inst, 0, 0); + src1 = get_src_reg_imm(c, inst, 1, 0); + + brw_MOV(p, brw_message_reg(2), src0); + brw_MOV(p, brw_message_reg(3), src1); + + brw_math(p, + dst, + BRW_MATH_FUNCTION_POW, + (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE, + 2, + brw_null_reg(), + BRW_MATH_DATA_VECTOR, + BRW_MATH_PRECISION_FULL); +} + +static void emit_lrp(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_compile *p = &c->func; + GLuint mask = inst->DstReg.WriteMask; + struct brw_reg dst, tmp1, tmp2, src0, src1, src2; + int i; + int mark = mark_tmps(c); + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + src0 = get_src_reg(c, inst, 0, i); + + src1 = get_src_reg_imm(c, inst, 1, i); + + if (src1.nr == dst.nr) { + tmp1 = alloc_tmp(c); + brw_MOV(p, tmp1, src1); + } else + tmp1 = src1; + + src2 = get_src_reg(c, inst, 2, i); + if (src2.nr == dst.nr) { + tmp2 = alloc_tmp(c); + brw_MOV(p, tmp2, src2); + } else + tmp2 = src2; + + brw_ADD(p, dst, negate(src0), brw_imm_f(1.0)); + brw_MUL(p, brw_null_reg(), dst, tmp2); + brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); + brw_MAC(p, dst, src0, tmp1); + brw_set_saturate(p, 0); + } + release_tmps(c, mark); + } +} + +/** + * For GLSL shaders, this KIL will be unconditional. + * It may be contained inside an IF/ENDIF structure of course. + */ +static void emit_kil(struct brw_wm_compile *c) +{ + struct brw_compile *p = &c->func; + struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK + brw_AND(p, depth, c->emit_mask_reg, depth); + brw_pop_insn_state(p); +} + +static void emit_mad(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_compile *p = &c->func; + GLuint mask = inst->DstReg.WriteMask; + struct brw_reg dst, src0, src1, src2; + int i; + + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + src0 = get_src_reg(c, inst, 0, i); + src1 = get_src_reg_imm(c, inst, 1, i); + src2 = get_src_reg_imm(c, inst, 2, i); + brw_MUL(p, dst, src0, src1); + + brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); + brw_ADD(p, dst, dst, src2); + brw_set_saturate(p, 0); + } + } +} + +static void emit_sop(struct brw_wm_compile *c, + const struct prog_instruction *inst, GLuint cond) +{ + struct brw_compile *p = &c->func; + GLuint mask = inst->DstReg.WriteMask; + struct brw_reg dst, src0, src1; + int i; + + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + src0 = get_src_reg(c, inst, 0, i); + src1 = get_src_reg_imm(c, inst, 1, i); + brw_push_insn_state(p); + brw_CMP(p, brw_null_reg(), cond, src0, src1); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_MOV(p, dst, brw_imm_f(0.0)); + brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); + brw_MOV(p, dst, brw_imm_f(1.0)); + brw_pop_insn_state(p); + } + } +} + +static void emit_slt(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + emit_sop(c, inst, BRW_CONDITIONAL_L); +} + +static void emit_sle(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + emit_sop(c, inst, BRW_CONDITIONAL_LE); +} + +static void emit_sgt(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + emit_sop(c, inst, BRW_CONDITIONAL_G); +} + +static void emit_sge(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + emit_sop(c, inst, BRW_CONDITIONAL_GE); +} + +static void emit_seq(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + emit_sop(c, inst, BRW_CONDITIONAL_EQ); +} + +static void emit_sne(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + emit_sop(c, inst, BRW_CONDITIONAL_NEQ); +} + +static INLINE struct brw_reg high_words( struct brw_reg reg ) +{ + return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ), + 0, 8, 2 ); +} + +static INLINE struct brw_reg low_words( struct brw_reg reg ) +{ + return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 ); +} + +static INLINE struct brw_reg even_bytes( struct brw_reg reg ) +{ + return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 ); +} + +static INLINE struct brw_reg odd_bytes( struct brw_reg reg ) +{ + return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ), + 0, 16, 2 ); +} + +/* One-, two- and three-dimensional Perlin noise, similar to the description + in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */ +static void noise1_sub( struct brw_wm_compile *c ) { + + struct brw_compile *p = &c->func; + struct brw_reg param, + x0, x1, /* gradients at each end */ + t, tmp[ 2 ], /* float temporaries */ + itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */ + int i; + int mark = mark_tmps( c ); + + x0 = alloc_tmp( c ); + x1 = alloc_tmp( c ); + t = alloc_tmp( c ); + tmp[ 0 ] = alloc_tmp( c ); + tmp[ 1 ] = alloc_tmp( c ); + itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD ); + itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD ); + itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD ); + itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD ); + itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD ); + + param = lookup_tmp( c, mark - 2 ); + + brw_set_access_mode( p, BRW_ALIGN_1 ); + + brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */ + + /* Arrange the two end coordinates into scalars (itmp0/itmp1) to + be hashed. Also compute the remainder (offset within the unit + length), interleaved to reduce register dependency penalties. */ + brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param ); + brw_FRC( p, param, param ); + brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) ); + brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */ + brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */ + + /* We're now ready to perform the hashing. The two hashes are + interleaved for performance. The hash function used is + designed to rapidly achieve avalanche and require only 32x16 + bit multiplication, and 16-bit swizzles (which we get for + free). We can't use immediate operands in the multiplies, + because immediates are permitted only in src1 and the 16-bit + factor is permitted only in src0. */ + for( i = 0; i < 2; i++ ) + brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] ); + for( i = 0; i < 2; i++ ) + brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ), + high_words( itmp[ i ] ) ); + for( i = 0; i < 2; i++ ) + brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] ); + for( i = 0; i < 2; i++ ) + brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ), + high_words( itmp[ i ] ) ); + for( i = 0; i < 2; i++ ) + brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] ); + for( i = 0; i < 2; i++ ) + brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ), + high_words( itmp[ i ] ) ); + + /* Now we want to initialise the two gradients based on the + hashes. Format conversion from signed integer to float leaves + everything scaled too high by a factor of pow( 2, 31 ), but + we correct for that right at the end. */ + brw_ADD( p, t, param, brw_imm_f( -1.0 ) ); + brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) ); + brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) ); + + brw_MUL( p, x0, x0, param ); + brw_MUL( p, x1, x1, t ); + + /* We interpolate between the gradients using the polynomial + 6t^5 - 15t^4 + 10t^3 (Perlin). */ + brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) ); + brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) ); + brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param ); + brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) ); + brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param ); + brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the + pipeline */ + brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param ); + brw_MUL( p, param, tmp[ 0 ], param ); + brw_MUL( p, x1, x1, param ); + brw_ADD( p, x0, x0, x1 ); + /* scale by pow( 2, -30 ), to compensate for the format conversion + above and an extra factor of 2 so that a single gradient covers + the [-1,1] range */ + brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) ); + + release_tmps( c, mark ); +} + +static void emit_noise1( struct brw_wm_compile *c, + const struct prog_instruction *inst ) +{ + struct brw_compile *p = &c->func; + struct brw_reg src, param, dst; + GLuint mask = inst->DstReg.WriteMask; + int i; + int mark = mark_tmps( c ); + + assert( mark == 0 ); + + src = get_src_reg( c, inst, 0, 0 ); + + param = alloc_tmp( c ); + + brw_MOV( p, param, src ); + + invoke_subroutine( c, SUB_NOISE1, noise1_sub ); + + /* Fill in the result: */ + brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE ); + for (i = 0 ; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + brw_MOV( p, dst, param ); + } + } + if( inst->SaturateMode == SATURATE_ZERO_ONE ) + brw_set_saturate( p, 0 ); + + release_tmps( c, mark ); +} + +static void noise2_sub( struct brw_wm_compile *c ) { + + struct brw_compile *p = &c->func; + struct brw_reg param0, param1, + x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */ + t, tmp[ 4 ], /* float temporaries */ + itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */ + int i; + int mark = mark_tmps( c ); + + x0y0 = alloc_tmp( c ); + x0y1 = alloc_tmp( c ); + x1y0 = alloc_tmp( c ); + x1y1 = alloc_tmp( c ); + t = alloc_tmp( c ); + for( i = 0; i < 4; i++ ) { + tmp[ i ] = alloc_tmp( c ); + itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD ); + } + itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD ); + itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD ); + itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD ); + + param0 = lookup_tmp( c, mark - 3 ); + param1 = lookup_tmp( c, mark - 2 ); + + brw_set_access_mode( p, BRW_ALIGN_1 ); + + /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to + be hashed. Also compute the remainders (offsets within the unit + square), interleaved to reduce register dependency penalties. */ + brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 ); + brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 ); + brw_FRC( p, param0, param0 ); + brw_FRC( p, param1, param1 ); + brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */ + brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ), + low_words( itmp[ 1 ] ) ); + brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */ + brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */ + brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) ); + brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) ); + brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) ); + + /* We're now ready to perform the hashing. The four hashes are + interleaved for performance. The hash function used is + designed to rapidly achieve avalanche and require only 32x16 + bit multiplication, and 16-bit swizzles (which we get for + free). We can't use immediate operands in the multiplies, + because immediates are permitted only in src1 and the 16-bit + factor is permitted only in src0. */ + for( i = 0; i < 4; i++ ) + brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] ); + for( i = 0; i < 4; i++ ) + brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ), + high_words( itmp[ i ] ) ); + for( i = 0; i < 4; i++ ) + brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] ); + for( i = 0; i < 4; i++ ) + brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ), + high_words( itmp[ i ] ) ); + for( i = 0; i < 4; i++ ) + brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] ); + for( i = 0; i < 4; i++ ) + brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ), + high_words( itmp[ i ] ) ); + + /* Now we want to initialise the four gradients based on the + hashes. Format conversion from signed integer to float leaves + everything scaled too high by a factor of pow( 2, 15 ), but + we correct for that right at the end. */ + brw_ADD( p, t, param0, brw_imm_f( -1.0 ) ); + brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) ); + brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) ); + brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) ); + brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) ); + + brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) ); + brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) ); + brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) ); + brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) ); + + brw_MUL( p, x1y0, x1y0, t ); + brw_MUL( p, x1y1, x1y1, t ); + brw_ADD( p, t, param1, brw_imm_f( -1.0 ) ); + brw_MUL( p, x0y0, x0y0, param0 ); + brw_MUL( p, x0y1, x0y1, param0 ); + + brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 ); + brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 ); + brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t ); + brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t ); + + brw_ADD( p, x0y0, x0y0, tmp[ 0 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 2 ] ); + brw_ADD( p, x0y1, x0y1, tmp[ 1 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 3 ] ); + + /* We interpolate between the gradients using the polynomial + 6t^5 - 15t^4 + 10t^3 (Perlin). */ + brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) ); + brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) ); + brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) ); + brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) ); + brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 ); + brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 ); + brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the + pipeline */ + brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) ); + brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) ); + brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 ); + brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 ); + brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the + pipeline */ + brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 ); + brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 ); + brw_MUL( p, param0, tmp[ 0 ], param0 ); + brw_MUL( p, param1, tmp[ 1 ], param1 ); + + /* Here we interpolate in the y dimension... */ + brw_MUL( p, x0y1, x0y1, param1 ); + brw_MUL( p, x1y1, x1y1, param1 ); + brw_ADD( p, x0y0, x0y0, x0y1 ); + brw_ADD( p, x1y0, x1y0, x1y1 ); + + /* And now in x. There are horrible register dependencies here, + but we have nothing else to do. */ + brw_ADD( p, x1y0, x1y0, negate( x0y0 ) ); + brw_MUL( p, x1y0, x1y0, param0 ); + brw_ADD( p, x0y0, x0y0, x1y0 ); + + /* scale by pow( 2, -15 ), as described above */ + brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) ); + + release_tmps( c, mark ); +} + +static void emit_noise2( struct brw_wm_compile *c, + const struct prog_instruction *inst ) +{ + struct brw_compile *p = &c->func; + struct brw_reg src0, src1, param0, param1, dst; + GLuint mask = inst->DstReg.WriteMask; + int i; + int mark = mark_tmps( c ); + + assert( mark == 0 ); + + src0 = get_src_reg( c, inst, 0, 0 ); + src1 = get_src_reg( c, inst, 0, 1 ); + + param0 = alloc_tmp( c ); + param1 = alloc_tmp( c ); + + brw_MOV( p, param0, src0 ); + brw_MOV( p, param1, src1 ); + + invoke_subroutine( c, SUB_NOISE2, noise2_sub ); + + /* Fill in the result: */ + brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE ); + for (i = 0 ; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + brw_MOV( p, dst, param0 ); + } + } + if( inst->SaturateMode == SATURATE_ZERO_ONE ) + brw_set_saturate( p, 0 ); + + release_tmps( c, mark ); +} + +/** + * The three-dimensional case is much like the one- and two- versions above, + * but since the number of corners is rapidly growing we now pack 16 16-bit + * hashes into each register to extract more parallelism from the EUs. + */ +static void noise3_sub( struct brw_wm_compile *c ) { + + struct brw_compile *p = &c->func; + struct brw_reg param0, param1, param2, + x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */ + xi, yi, zi, /* interpolation coefficients */ + t, tmp[ 8 ], /* float temporaries */ + itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */ + wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */ + int i; + int mark = mark_tmps( c ); + + x0y0 = alloc_tmp( c ); + x0y1 = alloc_tmp( c ); + x1y0 = alloc_tmp( c ); + x1y1 = alloc_tmp( c ); + xi = alloc_tmp( c ); + yi = alloc_tmp( c ); + zi = alloc_tmp( c ); + t = alloc_tmp( c ); + for( i = 0; i < 8; i++ ) { + tmp[ i ] = alloc_tmp( c ); + itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD ); + wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 ); + } + + param0 = lookup_tmp( c, mark - 4 ); + param1 = lookup_tmp( c, mark - 3 ); + param2 = lookup_tmp( c, mark - 2 ); + + brw_set_access_mode( p, BRW_ALIGN_1 ); + + /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to + be hashed. Also compute the remainders (offsets within the unit + cube), interleaved to reduce register dependency penalties. */ + brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 ); + brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 ); + brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 ); + brw_FRC( p, param0, param0 ); + brw_FRC( p, param1, param1 ); + brw_FRC( p, param2, param2 ); + /* Since we now have only 16 bits of precision in the hash, we must + be more careful about thorough mixing to maintain entropy as we + squash the input vector into a small scalar. */ + brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) ); + brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) ); + brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ), + brw_imm_uw( 0x9B93 ) ); + brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ), + brw_imm_uw( 0xBC8F ) ); + + /* Temporarily disable the execution mask while we work with ExecSize=16 + channels (the mask is set for ExecSize=8 and is probably incorrect). + Although this might cause execution of unwanted channels, the code + writes only to temporary registers and has no side effects, so + disabling the mask is harmless. */ + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) ); + brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) ); + brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) ); + + /* We're now ready to perform the hashing. The eight hashes are + interleaved for performance. The hash function used is + designed to rapidly achieve avalanche and require only 16x16 + bit multiplication, and 8-bit swizzles (which we get for + free). */ + for( i = 0; i < 4; i++ ) + brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) ); + for( i = 0; i < 4; i++ ) + brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ), + odd_bytes( wtmp[ i ] ) ); + for( i = 0; i < 4; i++ ) + brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) ); + for( i = 0; i < 4; i++ ) + brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ), + odd_bytes( wtmp[ i ] ) ); + brw_pop_insn_state( p ); + + /* Now we want to initialise the four rear gradients based on the + hashes. Format conversion from signed integer to float leaves + everything scaled too high by a factor of pow( 2, 15 ), but + we correct for that right at the end. */ + /* x component */ + brw_ADD( p, t, param0, brw_imm_f( -1.0 ) ); + brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) ); + brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) ); + brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) ); + brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) ); + + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) ); + brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) ); + brw_pop_insn_state( p ); + + brw_MUL( p, x1y0, x1y0, t ); + brw_MUL( p, x1y1, x1y1, t ); + brw_ADD( p, t, param1, brw_imm_f( -1.0 ) ); + brw_MUL( p, x0y0, x0y0, param0 ); + brw_MUL( p, x0y1, x0y1, param0 ); + + /* y component */ + brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) ); + brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) ); + brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) ); + brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) ); + + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) ); + brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) ); + brw_pop_insn_state( p ); + + brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); + brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); + brw_ADD( p, t, param0, brw_imm_f( -1.0 ) ); + brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 ); + brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 ); + + brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); + brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); + + /* z component */ + brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) ); + brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) ); + brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) ); + brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) ); + + brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 ); + brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 ); + brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 ); + brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 ); + + brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); + brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); + + /* We interpolate between the gradients using the polynomial + 6t^5 - 15t^4 + 10t^3 (Perlin). */ + brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) ); + brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) ); + brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) ); + brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) ); + brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) ); + brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) ); + brw_MUL( p, xi, xi, param0 ); + brw_MUL( p, yi, yi, param1 ); + brw_MUL( p, zi, zi, param2 ); + brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) ); + brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) ); + brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) ); + brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */ + brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */ + brw_MUL( p, xi, xi, param0 ); + brw_MUL( p, yi, yi, param1 ); + brw_MUL( p, zi, zi, param2 ); + brw_MUL( p, xi, xi, param0 ); + brw_MUL( p, yi, yi, param1 ); + brw_MUL( p, zi, zi, param2 ); + brw_MUL( p, xi, xi, param0 ); + brw_MUL( p, yi, yi, param1 ); + brw_MUL( p, zi, zi, param2 ); + + /* Here we interpolate in the y dimension... */ + brw_MUL( p, x0y1, x0y1, yi ); + brw_MUL( p, x1y1, x1y1, yi ); + brw_ADD( p, x0y0, x0y0, x0y1 ); + brw_ADD( p, x1y0, x1y0, x1y1 ); + + /* And now in x. Leave the result in tmp[ 0 ] (see below)... */ + brw_ADD( p, x1y0, x1y0, negate( x0y0 ) ); + brw_MUL( p, x1y0, x1y0, xi ); + brw_ADD( p, tmp[ 0 ], x0y0, x1y0 ); + + /* Now do the same thing for the front four gradients... */ + /* x component */ + brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) ); + brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) ); + brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) ); + brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) ); + + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) ); + brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) ); + brw_pop_insn_state( p ); + + brw_MUL( p, x1y0, x1y0, t ); + brw_MUL( p, x1y1, x1y1, t ); + brw_ADD( p, t, param1, brw_imm_f( -1.0 ) ); + brw_MUL( p, x0y0, x0y0, param0 ); + brw_MUL( p, x0y1, x0y1, param0 ); + + /* y component */ + brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) ); + brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) ); + brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) ); + brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) ); + + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) ); + brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) ); + brw_pop_insn_state( p ); + + brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); + brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); + brw_ADD( p, t, param2, brw_imm_f( -1.0 ) ); + brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 ); + brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 ); + + brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); + brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); + + /* z component */ + brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) ); + brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) ); + brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) ); + brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) ); + + brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t ); + brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); + brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t ); + brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); + + brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); + brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); + + /* The interpolation coefficients are still around from last time, so + again interpolate in the y dimension... */ + brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); + brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); + brw_MUL( p, x0y1, x0y1, yi ); + brw_MUL( p, x1y1, x1y1, yi ); + brw_ADD( p, x0y0, x0y0, x0y1 ); + brw_ADD( p, x1y0, x1y0, x1y1 ); + + /* And now in x. The rear face is in tmp[ 0 ] (see above), so this + time put the front face in tmp[ 1 ] and we're nearly there... */ + brw_ADD( p, x1y0, x1y0, negate( x0y0 ) ); + brw_MUL( p, x1y0, x1y0, xi ); + brw_ADD( p, tmp[ 1 ], x0y0, x1y0 ); + + /* The final interpolation, in the z dimension: */ + brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) ); + brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi ); + brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] ); + + /* scale by pow( 2, -15 ), as described above */ + brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) ); + + release_tmps( c, mark ); +} + +static void emit_noise3( struct brw_wm_compile *c, + const struct prog_instruction *inst ) +{ + struct brw_compile *p = &c->func; + struct brw_reg src0, src1, src2, param0, param1, param2, dst; + GLuint mask = inst->DstReg.WriteMask; + int i; + int mark = mark_tmps( c ); + + assert( mark == 0 ); + + src0 = get_src_reg( c, inst, 0, 0 ); + src1 = get_src_reg( c, inst, 0, 1 ); + src2 = get_src_reg( c, inst, 0, 2 ); + + param0 = alloc_tmp( c ); + param1 = alloc_tmp( c ); + param2 = alloc_tmp( c ); + + brw_MOV( p, param0, src0 ); + brw_MOV( p, param1, src1 ); + brw_MOV( p, param2, src2 ); + + invoke_subroutine( c, SUB_NOISE3, noise3_sub ); + + /* Fill in the result: */ + brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE ); + for (i = 0 ; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + brw_MOV( p, dst, param0 ); + } + } + if( inst->SaturateMode == SATURATE_ZERO_ONE ) + brw_set_saturate( p, 0 ); + + release_tmps( c, mark ); +} + +/** + * For the four-dimensional case, the little micro-optimisation benefits + * we obtain by unrolling all the loops aren't worth the massive bloat it + * now causes. Instead, we loop twice around performing a similar operation + * to noise3, once for the w=0 cube and once for the w=1, with a bit more + * code to glue it all together. + */ +static void noise4_sub( struct brw_wm_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_reg param[ 4 ], + x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */ + w0, /* noise for the w=0 cube */ + floors[ 2 ], /* integer coordinates of base corner of hypercube */ + interp[ 4 ], /* interpolation coefficients */ + t, tmp[ 8 ], /* float temporaries */ + itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */ + wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */ + int i, j; + int mark = mark_tmps( c ); + GLuint loop, origin; + + x0y0 = alloc_tmp( c ); + x0y1 = alloc_tmp( c ); + x1y0 = alloc_tmp( c ); + x1y1 = alloc_tmp( c ); + t = alloc_tmp( c ); + w0 = alloc_tmp( c ); + floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD ); + floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD ); + + for( i = 0; i < 4; i++ ) { + param[ i ] = lookup_tmp( c, mark - 5 + i ); + interp[ i ] = alloc_tmp( c ); + } + + for( i = 0; i < 8; i++ ) { + tmp[ i ] = alloc_tmp( c ); + itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD ); + wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 ); + } + + brw_set_access_mode( p, BRW_ALIGN_1 ); + + /* We only want 16 bits of precision from the integral part of each + co-ordinate, but unfortunately the RNDD semantics would saturate + at 16 bits if we performed the operation directly to a 16-bit + destination. Therefore, we round to 32-bit temporaries where + appropriate, and then store only the lower 16 bits. */ + brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] ); + brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] ); + brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] ); + brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] ); + brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) ); + brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) ); + + /* Modify the flag register here, because the side effect is useful + later (see below). We know for certain that all flags will be + cleared, since the FRC instruction cannot possibly generate + negative results. Even for exceptional inputs (infinities, denormals, + NaNs), the architecture guarantees that the L conditional is false. */ + brw_set_conditionalmod( p, BRW_CONDITIONAL_L ); + brw_FRC( p, param[ 0 ], param[ 0 ] ); + brw_set_predicate_control( p, BRW_PREDICATE_NONE ); + for( i = 1; i < 4; i++ ) + brw_FRC( p, param[ i ], param[ i ] ); + + /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first + of all. */ + for( i = 0; i < 4; i++ ) + brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) ); + for( i = 0; i < 4; i++ ) + brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) ); + for( i = 0; i < 4; i++ ) + brw_MUL( p, interp[ i ], interp[ i ], param[ i ] ); + for( i = 0; i < 4; i++ ) + brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) ); + for( j = 0; j < 3; j++ ) + for( i = 0; i < 4; i++ ) + brw_MUL( p, interp[ i ], interp[ i ], param[ i ] ); + + /* Mark the current address, as it will be a jump destination. The + following code will be executed twice: first, with the flag + register clear indicating the w=0 case, and second with flags + set for w=1. */ + loop = p->nr_insn; + + /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to + be hashed. Since we have only 16 bits of precision in the hash, we + must be careful about thorough mixing to maintain entropy as we + squash the input vector into a small scalar. */ + brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ), + brw_imm_uw( 0xBC8F ) ); + brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ), + brw_imm_uw( 0xD0BD ) ); + brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ), + brw_imm_uw( 0x9B93 ) ); + brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ), + brw_imm_uw( 0xA359 ) ); + brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ), + brw_imm_uw( 0xBC8F ) ); + + /* Temporarily disable the execution mask while we work with ExecSize=16 + channels (the mask is set for ExecSize=8 and is probably incorrect). + Although this might cause execution of unwanted channels, the code + writes only to temporary registers and has no side effects, so + disabling the mask is harmless. */ + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) ); + brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) ); + brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) ); + + /* We're now ready to perform the hashing. The eight hashes are + interleaved for performance. The hash function used is + designed to rapidly achieve avalanche and require only 16x16 + bit multiplication, and 8-bit swizzles (which we get for + free). */ + for( i = 0; i < 4; i++ ) + brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) ); + for( i = 0; i < 4; i++ ) + brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ), + odd_bytes( wtmp[ i ] ) ); + for( i = 0; i < 4; i++ ) + brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) ); + for( i = 0; i < 4; i++ ) + brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ), + odd_bytes( wtmp[ i ] ) ); + brw_pop_insn_state( p ); + + /* Now we want to initialise the four rear gradients based on the + hashes. Format conversion from signed integer to float leaves + everything scaled too high by a factor of pow( 2, 15 ), but + we correct for that right at the end. */ + /* x component */ + brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) ); + brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) ); + brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) ); + brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) ); + brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) ); + + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) ); + brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) ); + brw_pop_insn_state( p ); + + brw_MUL( p, x1y0, x1y0, t ); + brw_MUL( p, x1y1, x1y1, t ); + brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) ); + brw_MUL( p, x0y0, x0y0, param[ 0 ] ); + brw_MUL( p, x0y1, x0y1, param[ 0 ] ); + + /* y component */ + brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) ); + brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) ); + brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) ); + brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) ); + + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) ); + brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) ); + brw_pop_insn_state( p ); + + brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); + brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); + /* prepare t for the w component (used below): w the first time through + the loop; w - 1 the second time) */ + brw_set_predicate_control( p, BRW_PREDICATE_NORMAL ); + brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) ); + p->current->header.predicate_inverse = 1; + brw_MOV( p, t, param[ 3 ] ); + p->current->header.predicate_inverse = 0; + brw_set_predicate_control( p, BRW_PREDICATE_NONE ); + brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] ); + brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] ); + + brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); + brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); + + /* z component */ + brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) ); + brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) ); + brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) ); + brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) ); + + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) ); + brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) ); + brw_pop_insn_state( p ); + + brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] ); + brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] ); + brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] ); + brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] ); + + brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); + brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); + + /* w component */ + brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) ); + brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) ); + brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) ); + brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) ); + + brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t ); + brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); + brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t ); + brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); + brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) ); + + brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); + brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); + + /* Here we interpolate in the y dimension... */ + brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); + brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); + brw_MUL( p, x0y1, x0y1, interp[ 1 ] ); + brw_MUL( p, x1y1, x1y1, interp[ 1 ] ); + brw_ADD( p, x0y0, x0y0, x0y1 ); + brw_ADD( p, x1y0, x1y0, x1y1 ); + + /* And now in x. Leave the result in tmp[ 0 ] (see below)... */ + brw_ADD( p, x1y0, x1y0, negate( x0y0 ) ); + brw_MUL( p, x1y0, x1y0, interp[ 0 ] ); + brw_ADD( p, tmp[ 0 ], x0y0, x1y0 ); + + /* Now do the same thing for the front four gradients... */ + /* x component */ + brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) ); + brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) ); + brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) ); + brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) ); + + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) ); + brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) ); + brw_pop_insn_state( p ); + + brw_MUL( p, x1y0, x1y0, t ); + brw_MUL( p, x1y1, x1y1, t ); + brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) ); + brw_MUL( p, x0y0, x0y0, param[ 0 ] ); + brw_MUL( p, x0y1, x0y1, param[ 0 ] ); + + /* y component */ + brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) ); + brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) ); + brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) ); + brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) ); + + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) ); + brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) ); + brw_pop_insn_state( p ); + + brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); + brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); + brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) ); + brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] ); + brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] ); + + brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); + brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); + + /* z component */ + brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) ); + brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) ); + brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) ); + brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) ); + + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) ); + brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) ); + brw_pop_insn_state( p ); + + brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t ); + brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); + brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t ); + brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); + /* prepare t for the w component (used below): w the first time through + the loop; w - 1 the second time) */ + brw_set_predicate_control( p, BRW_PREDICATE_NORMAL ); + brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) ); + p->current->header.predicate_inverse = 1; + brw_MOV( p, t, param[ 3 ] ); + p->current->header.predicate_inverse = 0; + brw_set_predicate_control( p, BRW_PREDICATE_NONE ); + + brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); + brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); + + /* w component */ + brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) ); + brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) ); + brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) ); + brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) ); + + brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t ); + brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); + brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t ); + brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); + + brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); + brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); + + /* Interpolate in the y dimension: */ + brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); + brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); + brw_MUL( p, x0y1, x0y1, interp[ 1 ] ); + brw_MUL( p, x1y1, x1y1, interp[ 1 ] ); + brw_ADD( p, x0y0, x0y0, x0y1 ); + brw_ADD( p, x1y0, x1y0, x1y1 ); + + /* And now in x. The rear face is in tmp[ 0 ] (see above), so this + time put the front face in tmp[ 1 ] and we're nearly there... */ + brw_ADD( p, x1y0, x1y0, negate( x0y0 ) ); + brw_MUL( p, x1y0, x1y0, interp[ 0 ] ); + brw_ADD( p, tmp[ 1 ], x0y0, x1y0 ); + + /* Another interpolation, in the z dimension: */ + brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) ); + brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] ); + brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] ); + + /* Exit the loop if we've computed both cubes... */ + origin = p->nr_insn; + brw_push_insn_state( p ); + brw_set_predicate_control( p, BRW_PREDICATE_NORMAL ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) ); + brw_pop_insn_state( p ); + + /* Save the result for the w=0 case, and increment the w coordinate: */ + brw_MOV( p, w0, tmp[ 0 ] ); + brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ), + brw_imm_uw( 1 ) ); + + /* Loop around for the other cube. Explicitly set the flag register + (unfortunately we must spend an extra instruction to do this: we + can't rely on a side effect of the previous MOV or ADD because + conditional modifiers which are normally true might be false in + exceptional circumstances, e.g. given a NaN input; the add to + brw_ip_reg() is not suitable because the IP is not an 8-vector). */ + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) ); + brw_ADD( p, brw_ip_reg(), brw_ip_reg(), + brw_imm_d( ( loop - p->nr_insn ) << 4 ) ); + brw_pop_insn_state( p ); + + /* Patch the previous conditional branch now that we know the + destination address. */ + brw_set_src1( p->store + origin, + brw_imm_d( ( p->nr_insn - origin ) << 4 ) ); + + /* The very last interpolation. */ + brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) ); + brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] ); + brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 ); + + /* scale by pow( 2, -15 ), as described above */ + brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) ); + + release_tmps( c, mark ); +} + +static void emit_noise4( struct brw_wm_compile *c, + const struct prog_instruction *inst ) +{ + struct brw_compile *p = &c->func; + struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst; + GLuint mask = inst->DstReg.WriteMask; + int i; + int mark = mark_tmps( c ); + + assert( mark == 0 ); + + src0 = get_src_reg( c, inst, 0, 0 ); + src1 = get_src_reg( c, inst, 0, 1 ); + src2 = get_src_reg( c, inst, 0, 2 ); + src3 = get_src_reg( c, inst, 0, 3 ); + + param0 = alloc_tmp( c ); + param1 = alloc_tmp( c ); + param2 = alloc_tmp( c ); + param3 = alloc_tmp( c ); + + brw_MOV( p, param0, src0 ); + brw_MOV( p, param1, src1 ); + brw_MOV( p, param2, src2 ); + brw_MOV( p, param3, src3 ); + + invoke_subroutine( c, SUB_NOISE4, noise4_sub ); + + /* Fill in the result: */ + brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE ); + for (i = 0 ; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + brw_MOV( p, dst, param0 ); + } + } + if( inst->SaturateMode == SATURATE_ZERO_ONE ) + brw_set_saturate( p, 0 ); + + release_tmps( c, mark ); +} + +static void emit_wpos_xy(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_compile *p = &c->func; + GLuint mask = inst->DstReg.WriteMask; + struct brw_reg src0[2], dst[2]; + + dst[0] = get_dst_reg(c, inst, 0); + dst[1] = get_dst_reg(c, inst, 1); + + src0[0] = get_src_reg(c, inst, 0, 0); + src0[1] = get_src_reg(c, inst, 0, 1); + + /* Calculate the pixel offset from window bottom left into destination + * X and Y channels. + */ + if (mask & WRITEMASK_X) { + /* X' = X - origin_x */ + brw_ADD(p, + dst[0], + retype(src0[0], BRW_REGISTER_TYPE_W), + brw_imm_d(0 - c->key.origin_x)); + } + + if (mask & WRITEMASK_Y) { + /* Y' = height - (Y - origin_y) = height + origin_y - Y */ + brw_ADD(p, + dst[1], + negate(retype(src0[1], BRW_REGISTER_TYPE_W)), + brw_imm_d(c->key.origin_y + c->key.drawable_height - 1)); + } +} + +/* TODO + BIAS on SIMD8 not working yet... + */ +static void emit_txb(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_compile *p = &c->func; + struct brw_reg dst[4], src[4], payload_reg; + /* Note: TexSrcUnit was already looked up through SamplerTextures[] */ + const GLuint unit = inst->TexSrcUnit; + GLuint i; + GLuint msg_type; + + assert(unit < BRW_MAX_TEX_UNIT); + + payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0); + + for (i = 0; i < 4; i++) + dst[i] = get_dst_reg(c, inst, i); + for (i = 0; i < 4; i++) + src[i] = get_src_reg(c, inst, 0, i); + + switch (inst->TexSrcTarget) { + case TEXTURE_1D_INDEX: + brw_MOV(p, brw_message_reg(2), src[0]); /* s coord */ + brw_MOV(p, brw_message_reg(3), brw_imm_f(0)); /* t coord */ + brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); /* r coord */ + break; + case TEXTURE_2D_INDEX: + case TEXTURE_RECT_INDEX: + brw_MOV(p, brw_message_reg(2), src[0]); + brw_MOV(p, brw_message_reg(3), src[1]); + brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); + break; + case TEXTURE_3D_INDEX: + case TEXTURE_CUBE_INDEX: + brw_MOV(p, brw_message_reg(2), src[0]); + brw_MOV(p, brw_message_reg(3), src[1]); + brw_MOV(p, brw_message_reg(4), src[2]); + break; + default: + /* invalid target */ + abort(); + } + brw_MOV(p, brw_message_reg(5), src[3]); /* bias */ + brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); /* ref (unused?) */ + + if (BRW_IS_IGDNG(p->brw)) { + msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG; + } else { + /* Does it work well on SIMD8? */ + msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS; + } + + brw_SAMPLE(p, + retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */ + 1, /* msg_reg_nr */ + retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */ + SURF_INDEX_TEXTURE(unit), + unit, /* sampler */ + inst->DstReg.WriteMask, /* writemask */ + msg_type, /* msg_type */ + 4, /* response_length */ + 4, /* msg_length */ + 0, /* eot */ + 1, + BRW_SAMPLER_SIMD_MODE_SIMD8); +} + + +static void emit_tex(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_compile *p = &c->func; + struct brw_reg dst[4], src[4], payload_reg; + /* Note: TexSrcUnit was already looked up through SamplerTextures[] */ + const GLuint unit = inst->TexSrcUnit; + GLuint msg_len; + GLuint i, nr; + GLuint emit; + GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0; + GLuint msg_type; + + assert(unit < BRW_MAX_TEX_UNIT); + + payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0); + + for (i = 0; i < 4; i++) + dst[i] = get_dst_reg(c, inst, i); + for (i = 0; i < 4; i++) + src[i] = get_src_reg(c, inst, 0, i); + + switch (inst->TexSrcTarget) { + case TEXTURE_1D_INDEX: + emit = WRITEMASK_X; + nr = 1; + break; + case TEXTURE_2D_INDEX: + case TEXTURE_RECT_INDEX: + emit = WRITEMASK_XY; + nr = 2; + break; + case TEXTURE_3D_INDEX: + case TEXTURE_CUBE_INDEX: + emit = WRITEMASK_XYZ; + nr = 3; + break; + default: + /* invalid target */ + abort(); + } + msg_len = 1; + + /* move/load S, T, R coords */ + for (i = 0; i < nr; i++) { + static const GLuint swz[4] = {0,1,2,2}; + if (emit & (1<<i)) + brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]); + else + brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0)); + msg_len += 1; + } + + if (shadow) { + brw_MOV(p, brw_message_reg(5), brw_imm_f(0)); /* lod / bias */ + brw_MOV(p, brw_message_reg(6), src[2]); /* ref value / R coord */ + } + + if (BRW_IS_IGDNG(p->brw)) { + if (shadow) + msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG; + else + msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG; + } else { + /* Does it work for shadow on SIMD8 ? */ + msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE; + } + + brw_SAMPLE(p, + retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */ + 1, /* msg_reg_nr */ + retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */ + SURF_INDEX_TEXTURE(unit), + unit, /* sampler */ + inst->DstReg.WriteMask, /* writemask */ + msg_type, /* msg_type */ + 4, /* response_length */ + shadow ? 6 : 4, /* msg_length */ + 0, /* eot */ + 1, + BRW_SAMPLER_SIMD_MODE_SIMD8); + + if (shadow) + brw_MOV(p, dst[3], brw_imm_f(1.0)); +} + + +/** + * Resolve subroutine calls after code emit is done. + */ +static void post_wm_emit( struct brw_wm_compile *c ) +{ + brw_resolve_cals(&c->func); +} + +static void +get_argument_regs(struct brw_wm_compile *c, + const struct prog_instruction *inst, + int index, + struct brw_reg *regs, + int mask) +{ + int i; + + for (i = 0; i < 4; i++) { + if (mask & (1 << i)) + regs[i] = get_src_reg(c, inst, index, i); + } +} + +static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c) +{ +#define MAX_IF_DEPTH 32 +#define MAX_LOOP_DEPTH 32 + struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH]; + GLuint i, if_depth = 0, loop_depth = 0; + struct brw_compile *p = &c->func; + struct brw_indirect stack_index = brw_indirect(0, 0); + + c->out_of_regs = GL_FALSE; + + prealloc_reg(c); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack)); + + for (i = 0; i < c->nr_fp_insns; i++) { + const struct prog_instruction *inst = &c->prog_instructions[i]; + int dst_flags; + struct brw_reg args[3][4], dst[4]; + int j; + + c->cur_inst = i; + +#if 0 + _mesa_printf("Inst %d: ", i); + _mesa_print_instruction(inst); +#endif + + /* fetch any constants that this instruction needs */ + if (c->fp->use_const_buffer) + fetch_constants(c, inst); + + if (inst->CondUpdate) + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + else + brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE); + + dst_flags = inst->DstReg.WriteMask; + if (inst->SaturateMode == SATURATE_ZERO_ONE) + dst_flags |= SATURATE; + + switch (inst->Opcode) { + case WM_PIXELXY: + emit_pixel_xy(c, inst); + break; + case WM_DELTAXY: + emit_delta_xy(c, inst); + break; + case WM_PIXELW: + emit_pixel_w(c, inst); + break; + case WM_LINTERP: + emit_linterp(c, inst); + break; + case WM_PINTERP: + emit_pinterp(c, inst); + break; + case WM_CINTERP: + emit_cinterp(c, inst); + break; + case WM_WPOSXY: + emit_wpos_xy(c, inst); + break; + case WM_FB_WRITE: + emit_fb_write(c, inst); + break; + case WM_FRONTFACING: + emit_frontfacing(c, inst); + break; + case OPCODE_ADD: + emit_add(c, inst); + break; + case OPCODE_ARL: + emit_arl(c, inst); + break; + case OPCODE_FRC: + emit_frc(c, inst); + break; + case OPCODE_FLR: + emit_flr(c, inst); + break; + case OPCODE_LRP: + emit_lrp(c, inst); + break; + case OPCODE_TRUNC: + emit_trunc(c, inst); + break; + case OPCODE_MOV: + case OPCODE_SWZ: + emit_mov(c, inst); + break; + case OPCODE_DP3: + emit_dp3(c, inst); + break; + case OPCODE_DP4: + emit_dp4(c, inst); + break; + case OPCODE_XPD: + emit_xpd(c, inst); + break; + case OPCODE_DPH: + emit_dph(c, inst); + break; + case OPCODE_RCP: + emit_rcp(c, inst); + break; + case OPCODE_RSQ: + emit_rsq(c, inst); + break; + case OPCODE_SIN: + emit_sin(c, inst); + break; + case OPCODE_COS: + emit_cos(c, inst); + break; + case OPCODE_EX2: + emit_ex2(c, inst); + break; + case OPCODE_LG2: + emit_lg2(c, inst); + break; + case OPCODE_MIN: + case OPCODE_MAX: + emit_min_max(c, inst); + break; + case OPCODE_DDX: + case OPCODE_DDY: + for (j = 0; j < 4; j++) { + if (inst->DstReg.WriteMask & (1 << j)) + dst[j] = get_dst_reg(c, inst, j); + else + dst[j] = brw_null_reg(); + } + get_argument_regs(c, inst, 0, args[0], WRITEMASK_XYZW); + emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX), + args[0]); + break; + case OPCODE_SLT: + emit_slt(c, inst); + break; + case OPCODE_SLE: + emit_sle(c, inst); + break; + case OPCODE_SGT: + emit_sgt(c, inst); + break; + case OPCODE_SGE: + emit_sge(c, inst); + break; + case OPCODE_SEQ: + emit_seq(c, inst); + break; + case OPCODE_SNE: + emit_sne(c, inst); + break; + case OPCODE_MUL: + emit_mul(c, inst); + break; + case OPCODE_POW: + emit_pow(c, inst); + break; + case OPCODE_MAD: + emit_mad(c, inst); + break; + case OPCODE_NOISE1: + emit_noise1(c, inst); + break; + case OPCODE_NOISE2: + emit_noise2(c, inst); + break; + case OPCODE_NOISE3: + emit_noise3(c, inst); + break; + case OPCODE_NOISE4: + emit_noise4(c, inst); + break; + case OPCODE_TEX: + emit_tex(c, inst); + break; + case OPCODE_TXB: + emit_txb(c, inst); + break; + case OPCODE_KIL_NV: + emit_kil(c); + break; + case OPCODE_IF: + assert(if_depth < MAX_IF_DEPTH); + if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8); + break; + case OPCODE_ELSE: + if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]); + break; + case OPCODE_ENDIF: + assert(if_depth > 0); + brw_ENDIF(p, if_inst[--if_depth]); + break; + case OPCODE_BGNSUB: + brw_save_label(p, inst->Comment, p->nr_insn); + break; + case OPCODE_ENDSUB: + /* no-op */ + break; + case OPCODE_CAL: + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_set_access_mode(p, BRW_ALIGN_1); + brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16)); + brw_set_access_mode(p, BRW_ALIGN_16); + brw_ADD(p, get_addr_reg(stack_index), + get_addr_reg(stack_index), brw_imm_d(4)); + brw_save_call(&c->func, inst->Comment, p->nr_insn); + brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16)); + brw_pop_insn_state(p); + break; + + case OPCODE_RET: + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_ADD(p, get_addr_reg(stack_index), + get_addr_reg(stack_index), brw_imm_d(-4)); + brw_set_access_mode(p, BRW_ALIGN_1); + brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0)); + brw_set_access_mode(p, BRW_ALIGN_16); + brw_pop_insn_state(p); + + break; + case OPCODE_BGNLOOP: + /* XXX may need to invalidate the current_constant regs */ + loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8); + break; + case OPCODE_BRK: + brw_BREAK(p); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + break; + case OPCODE_CONT: + brw_CONT(p); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + break; + case OPCODE_ENDLOOP: + { + struct brw_instruction *inst0, *inst1; + GLuint br = 1; + + if (BRW_IS_IGDNG(brw)) + br = 2; + + loop_depth--; + inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]); + /* patch all the BREAK/CONT instructions from last BGNLOOP */ + while (inst0 > loop_inst[loop_depth]) { + inst0--; + if (inst0->header.opcode == BRW_OPCODE_BREAK) { + inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1); + inst0->bits3.if_else.pop_count = 0; + } + else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) { + inst0->bits3.if_else.jump_count = br * (inst1 - inst0); + inst0->bits3.if_else.pop_count = 0; + } + } + } + break; + default: + _mesa_printf("unsupported IR in fragment shader %d\n", + inst->Opcode); + } + + if (inst->CondUpdate) + brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); + else + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + } + post_wm_emit(c); + + if (INTEL_DEBUG & DEBUG_WM) { + _mesa_printf("wm-native:\n"); + for (i = 0; i < p->nr_insn; i++) + brw_disasm(stderr, &p->store[i]); + _mesa_printf("\n"); + } +} + +/** + * Do GPU code generation for shaders that use GLSL features such as + * flow control. Other shaders will be compiled with the + */ +void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c) +{ + if (INTEL_DEBUG & DEBUG_WM) { + _mesa_printf("brw_wm_glsl_emit:\n"); + } + + /* initial instruction translation/simplification */ + brw_wm_pass_fp(c); + + /* actual code generation */ + brw_wm_emit_glsl(brw, c); + + if (INTEL_DEBUG & DEBUG_WM) { + brw_wm_print_program(c, "brw_wm_glsl_emit done"); + } + + c->prog_data.total_grf = num_grf_used(c); + c->prog_data.total_scratch = 0; +} diff --git a/src/gallium/drivers/i965/brw_wm_iz.c b/src/gallium/drivers/i965/brw_wm_iz.c new file mode 100644 index 00000000000..5e399ac62a8 --- /dev/null +++ b/src/gallium/drivers/i965/brw_wm_iz.c @@ -0,0 +1,157 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "main/mtypes.h" +#include "brw_wm.h" + + +#undef P /* prompted depth */ +#undef C /* computed */ +#undef N /* non-promoted? */ + +#define P 0 +#define C 1 +#define N 2 + +const struct { + GLuint mode:2; + GLuint sd_present:1; + GLuint sd_to_rt:1; + GLuint dd_present:1; + GLuint ds_present:1; +} wm_iz_table[IZ_BIT_MAX] = +{ + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { N, 1, 1, 0, 0 }, + { N, 0, 1, 0, 0 }, + { N, 0, 1, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { C, 0, 1, 1, 0 }, + { C, 0, 1, 1, 0 }, + { P, 0, 0, 0, 0 }, + { N, 1, 1, 0, 0 }, + { C, 0, 1, 1, 0 }, + { C, 0, 1, 1, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { N, 1, 1, 0, 0 }, + { N, 0, 1, 0, 0 }, + { N, 0, 1, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { C, 0, 1, 1, 0 }, + { C, 0, 1, 1, 0 }, + { P, 0, 0, 0, 0 }, + { N, 1, 1, 0, 0 }, + { C, 0, 1, 1, 0 }, + { C, 0, 1, 1, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { N, 1, 1, 0, 1 }, + { N, 0, 1, 0, 1 }, + { N, 0, 1, 0, 1 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { C, 0, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { P, 0, 0, 0, 0 }, + { N, 1, 1, 0, 1 }, + { C, 0, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { P, 0, 0, 0, 0 }, + { C, 0, 0, 0, 1 }, + { P, 0, 0, 0, 0 }, + { C, 0, 1, 0, 1 }, + { P, 0, 0, 0, 0 }, + { C, 1, 1, 0, 1 }, + { C, 0, 1, 0, 1 }, + { C, 0, 1, 0, 1 }, + { P, 0, 0, 0, 0 }, + { C, 1, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { P, 0, 0, 0, 0 }, + { C, 1, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { C, 0, 1, 1, 1 } +}; + +/** + * \param line_aa AA_NEVER, AA_ALWAYS or AA_SOMETIMES + * \param lookup bitmask of IZ_* flags + */ +void brw_wm_lookup_iz( GLuint line_aa, + GLuint lookup, + GLboolean ps_uses_depth, + struct brw_wm_prog_key *key ) +{ + GLuint reg = 2; + + assert (lookup < IZ_BIT_MAX); + + if (lookup & IZ_PS_COMPUTES_DEPTH_BIT) + key->computes_depth = 1; + + if (wm_iz_table[lookup].sd_present || ps_uses_depth) { + key->source_depth_reg = reg; + reg += 2; + } + + if (wm_iz_table[lookup].sd_to_rt) + key->source_depth_to_render_target = 1; + + if (wm_iz_table[lookup].ds_present || line_aa != AA_NEVER) { + key->aa_dest_stencil_reg = reg; + key->runtime_check_aads_emit = (!wm_iz_table[lookup].ds_present && + line_aa == AA_SOMETIMES); + reg++; + } + + if (wm_iz_table[lookup].dd_present) { + key->dest_depth_reg = reg; + reg+=2; + } + + key->nr_depth_regs = (reg+1)/2; +} + diff --git a/src/gallium/drivers/i965/brw_wm_pass0.c b/src/gallium/drivers/i965/brw_wm_pass0.c new file mode 100644 index 00000000000..62792583396 --- /dev/null +++ b/src/gallium/drivers/i965/brw_wm_pass0.c @@ -0,0 +1,442 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "brw_context.h" +#include "brw_wm.h" +#include "shader/prog_parameter.h" + + + +/*********************************************************************** + */ + +static struct brw_wm_ref *get_ref( struct brw_wm_compile *c ) +{ + assert(c->nr_refs < BRW_WM_MAX_REF); + return &c->refs[c->nr_refs++]; +} + +static struct brw_wm_value *get_value( struct brw_wm_compile *c) +{ + assert(c->nr_refs < BRW_WM_MAX_VREG); + return &c->vreg[c->nr_vreg++]; +} + +/** return pointer to a newly allocated instruction */ +static struct brw_wm_instruction *get_instruction( struct brw_wm_compile *c ) +{ + assert(c->nr_insns < BRW_WM_MAX_INSN); + return &c->instruction[c->nr_insns++]; +} + +/*********************************************************************** + */ + +/** Init the "undef" register */ +static void pass0_init_undef( struct brw_wm_compile *c) +{ + struct brw_wm_ref *ref = &c->undef_ref; + ref->value = &c->undef_value; + ref->hw_reg = brw_vec8_grf(0, 0); + ref->insn = 0; + ref->prevuse = NULL; +} + +/** Set a FP register to a value */ +static void pass0_set_fpreg_value( struct brw_wm_compile *c, + GLuint file, + GLuint idx, + GLuint component, + struct brw_wm_value *value ) +{ + struct brw_wm_ref *ref = get_ref(c); + ref->value = value; + ref->hw_reg = brw_vec8_grf(0, 0); + ref->insn = 0; + ref->prevuse = NULL; + c->pass0_fp_reg[file][idx][component] = ref; +} + +/** Set a FP register to a ref */ +static void pass0_set_fpreg_ref( struct brw_wm_compile *c, + GLuint file, + GLuint idx, + GLuint component, + const struct brw_wm_ref *src_ref ) +{ + c->pass0_fp_reg[file][idx][component] = src_ref; +} + +static const struct brw_wm_ref *get_param_ref( struct brw_wm_compile *c, + const GLfloat *param_ptr ) +{ + GLuint i = c->prog_data.nr_params++; + + if (i >= BRW_WM_MAX_PARAM) { + _mesa_printf("%s: out of params\n", __FUNCTION__); + c->prog_data.error = 1; + return NULL; + } + else { + struct brw_wm_ref *ref = get_ref(c); + + c->prog_data.param[i] = param_ptr; + c->nr_creg = (i+16)/16; + + /* Push the offsets into hw_reg. These will be added to the + * real register numbers once one is allocated in pass2. + */ + ref->hw_reg = brw_vec1_grf((i&8)?1:0, i%8); + ref->value = &c->creg[i/16]; + ref->insn = 0; + ref->prevuse = NULL; + + return ref; + } +} + + +/** Return a ref to a constant/literal value */ +static const struct brw_wm_ref *get_const_ref( struct brw_wm_compile *c, + const GLfloat *constval ) +{ + GLuint i; + + /* Search for an existing const value matching the request: + */ + for (i = 0; i < c->nr_constrefs; i++) { + if (c->constref[i].constval == *constval) + return c->constref[i].ref; + } + + /* Else try to add a new one: + */ + if (c->nr_constrefs < BRW_WM_MAX_CONST) { + GLuint i = c->nr_constrefs++; + + /* A constant is a special type of parameter: + */ + c->constref[i].constval = *constval; + c->constref[i].ref = get_param_ref(c, constval); + + return c->constref[i].ref; + } + else { + _mesa_printf("%s: out of constrefs\n", __FUNCTION__); + c->prog_data.error = 1; + return NULL; + } +} + + +/* Lookup our internal registers + */ +static const struct brw_wm_ref *pass0_get_reg( struct brw_wm_compile *c, + GLuint file, + GLuint idx, + GLuint component ) +{ + const struct brw_wm_ref *ref = c->pass0_fp_reg[file][idx][component]; + + if (!ref) { + switch (file) { + case PROGRAM_INPUT: + case PROGRAM_PAYLOAD: + case PROGRAM_TEMPORARY: + case PROGRAM_OUTPUT: + case PROGRAM_VARYING: + break; + + case PROGRAM_LOCAL_PARAM: + ref = get_param_ref(c, &c->fp->program.Base.LocalParams[idx][component]); + break; + + case PROGRAM_ENV_PARAM: + ref = get_param_ref(c, &c->env_param[idx][component]); + break; + + case PROGRAM_STATE_VAR: + case PROGRAM_UNIFORM: + case PROGRAM_CONSTANT: + case PROGRAM_NAMED_PARAM: { + struct gl_program_parameter_list *plist = c->fp->program.Base.Parameters; + + /* There's something really hokey about parameters parsed in + * arb programs - they all end up in here, whether they be + * state values, parameters or constants. This duplicates the + * structure above & also seems to subvert the limits set for + * each type of constant/param. + */ + switch (plist->Parameters[idx].Type) { + case PROGRAM_NAMED_PARAM: + case PROGRAM_CONSTANT: + /* These are invarient: + */ + ref = get_const_ref(c, &plist->ParameterValues[idx][component]); + break; + + case PROGRAM_STATE_VAR: + case PROGRAM_UNIFORM: + /* These may change from run to run: + */ + ref = get_param_ref(c, &plist->ParameterValues[idx][component] ); + break; + + default: + assert(0); + break; + } + break; + } + + default: + assert(0); + break; + } + + c->pass0_fp_reg[file][idx][component] = ref; + } + + if (!ref) + ref = &c->undef_ref; + + return ref; +} + + + +/*********************************************************************** + * Straight translation to internal instruction format + */ + +static void pass0_set_dst( struct brw_wm_compile *c, + struct brw_wm_instruction *out, + const struct prog_instruction *inst, + GLuint writemask ) +{ + const struct prog_dst_register *dst = &inst->DstReg; + GLuint i; + + for (i = 0; i < 4; i++) { + if (writemask & (1<<i)) { + out->dst[i] = get_value(c); + pass0_set_fpreg_value(c, dst->File, dst->Index, i, out->dst[i]); + } + } + + out->writemask = writemask; +} + + +static const struct brw_wm_ref *get_fp_src_reg_ref( struct brw_wm_compile *c, + struct prog_src_register src, + GLuint i ) +{ + GLuint component = GET_SWZ(src.Swizzle,i); + const struct brw_wm_ref *src_ref; + static const GLfloat const_zero = 0.0; + static const GLfloat const_one = 1.0; + + if (component == SWIZZLE_ZERO) + src_ref = get_const_ref(c, &const_zero); + else if (component == SWIZZLE_ONE) + src_ref = get_const_ref(c, &const_one); + else + src_ref = pass0_get_reg(c, src.File, src.Index, component); + + return src_ref; +} + + +static struct brw_wm_ref *get_new_ref( struct brw_wm_compile *c, + struct prog_src_register src, + GLuint i, + struct brw_wm_instruction *insn) +{ + const struct brw_wm_ref *ref = get_fp_src_reg_ref(c, src, i); + struct brw_wm_ref *newref = get_ref(c); + + newref->value = ref->value; + newref->hw_reg = ref->hw_reg; + + if (insn) { + newref->insn = insn - c->instruction; + newref->prevuse = newref->value->lastuse; + newref->value->lastuse = newref; + } + + if (src.Negate & (1 << i)) + newref->hw_reg.negate ^= 1; + + if (src.Abs) { + newref->hw_reg.negate = 0; + newref->hw_reg.abs = 1; + } + + return newref; +} + + +static void +translate_insn(struct brw_wm_compile *c, + const struct prog_instruction *inst) +{ + struct brw_wm_instruction *out = get_instruction(c); + GLuint writemask = inst->DstReg.WriteMask; + GLuint nr_args = brw_wm_nr_args(inst->Opcode); + GLuint i, j; + + /* Copy some data out of the instruction + */ + out->opcode = inst->Opcode; + out->saturate = (inst->SaturateMode != SATURATE_OFF); + out->tex_unit = inst->TexSrcUnit; + out->tex_idx = inst->TexSrcTarget; + out->tex_shadow = inst->TexShadow; + out->eot = inst->Aux & 1; + out->target = inst->Aux >> 1; + + /* Args: + */ + for (i = 0; i < nr_args; i++) { + for (j = 0; j < 4; j++) { + out->src[i][j] = get_new_ref(c, inst->SrcReg[i], j, out); + } + } + + /* Dst: + */ + pass0_set_dst(c, out, inst, writemask); +} + + + +/*********************************************************************** + * Optimize moves and swizzles away: + */ +static void pass0_precalc_mov( struct brw_wm_compile *c, + const struct prog_instruction *inst ) +{ + const struct prog_dst_register *dst = &inst->DstReg; + GLuint writemask = inst->DstReg.WriteMask; + struct brw_wm_ref *refs[4]; + GLuint i; + + /* Get the effect of a MOV by manipulating our register table: + * First get all refs, then assign refs. This ensures that "in-place" + * swizzles such as: + * MOV t, t.xxyx + * are handled correctly. Previously, these two steps were done in + * one loop and the above case was incorrectly handled. + */ + for (i = 0; i < 4; i++) { + refs[i] = get_new_ref(c, inst->SrcReg[0], i, NULL); + } + for (i = 0; i < 4; i++) { + if (writemask & (1 << i)) { + pass0_set_fpreg_ref( c, dst->File, dst->Index, i, refs[i]); + } + } +} + + +/* Initialize payload "registers". + */ +static void pass0_init_payload( struct brw_wm_compile *c ) +{ + GLuint i; + + for (i = 0; i < 4; i++) { + GLuint j = i >= c->key.nr_depth_regs ? 0 : i; + pass0_set_fpreg_value( c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, + &c->payload.depth[j] ); + } + +#if 0 + /* This seems to be an alternative to the INTERP_WPOS stuff I do + * elsewhere: + */ + if (c->key.source_depth_reg) + pass0_set_fpreg_value(c, PROGRAM_INPUT, FRAG_ATTRIB_WPOS, 2, + &c->payload.depth[c->key.source_depth_reg/2]); +#endif + + for (i = 0; i < FRAG_ATTRIB_MAX; i++) + pass0_set_fpreg_value( c, PROGRAM_PAYLOAD, i, 0, + &c->payload.input_interp[i] ); +} + + +/*********************************************************************** + * PASS 0 + * + * Work forwards to give each calculated value a unique number. Where + * an instruction produces duplicate values (eg DP3), all are given + * the same number. + * + * Translate away swizzling and eliminate non-saturating moves. + */ +void brw_wm_pass0( struct brw_wm_compile *c ) +{ + GLuint insn; + + c->nr_vreg = 0; + c->nr_insns = 0; + + pass0_init_undef(c); + pass0_init_payload(c); + + for (insn = 0; insn < c->nr_fp_insns; insn++) { + const struct prog_instruction *inst = &c->prog_instructions[insn]; + + /* Optimize away moves, otherwise emit translated instruction: + */ + switch (inst->Opcode) { + case OPCODE_MOV: + case OPCODE_SWZ: + if (!inst->SaturateMode) { + pass0_precalc_mov(c, inst); + } + else { + translate_insn(c, inst); + } + break; + default: + translate_insn(c, inst); + break; + } + } + + if (INTEL_DEBUG & DEBUG_WM) { + brw_wm_print_program(c, "pass0"); + } +} diff --git a/src/gallium/drivers/i965/brw_wm_pass1.c b/src/gallium/drivers/i965/brw_wm_pass1.c new file mode 100644 index 00000000000..b4493940292 --- /dev/null +++ b/src/gallium/drivers/i965/brw_wm_pass1.c @@ -0,0 +1,291 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "brw_context.h" +#include "brw_wm.h" + + +static GLuint get_tracked_mask(struct brw_wm_compile *c, + struct brw_wm_instruction *inst) +{ + GLuint i; + for (i = 0; i < 4; i++) { + if (inst->writemask & (1<<i)) { + if (!inst->dst[i]->contributes_to_output) { + inst->writemask &= ~(1<<i); + inst->dst[i] = 0; + } + } + } + + return inst->writemask; +} + +/* Remove a reference from a value's usage chain. + */ +static void unlink_ref(struct brw_wm_ref *ref) +{ + struct brw_wm_value *value = ref->value; + + if (ref == value->lastuse) { + value->lastuse = ref->prevuse; + } + else { + struct brw_wm_ref *i = value->lastuse; + while (i->prevuse != ref) i = i->prevuse; + i->prevuse = ref->prevuse; + } +} + +static void track_arg(struct brw_wm_compile *c, + struct brw_wm_instruction *inst, + GLuint arg, + GLuint readmask) +{ + GLuint i; + + for (i = 0; i < 4; i++) { + struct brw_wm_ref *ref = inst->src[arg][i]; + if (ref) { + if (readmask & (1<<i)) { + ref->value->contributes_to_output = 1; + } + else { + unlink_ref(ref); + inst->src[arg][i] = NULL; + } + } + } +} + +static GLuint get_texcoord_mask( GLuint tex_idx ) +{ + switch (tex_idx) { + case TEXTURE_1D_INDEX: + return WRITEMASK_X; + case TEXTURE_2D_INDEX: + return WRITEMASK_XY; + case TEXTURE_3D_INDEX: + return WRITEMASK_XYZ; + case TEXTURE_CUBE_INDEX: + return WRITEMASK_XYZ; + case TEXTURE_RECT_INDEX: + return WRITEMASK_XY; + default: return 0; + } +} + + +/* Step two: Basically this is dead code elimination. + * + * Iterate backwards over instructions, noting which values + * contribute to the final result. Adjust writemasks to only + * calculate these values. + */ +void brw_wm_pass1( struct brw_wm_compile *c ) +{ + GLint insn; + + for (insn = c->nr_insns-1; insn >= 0; insn--) { + struct brw_wm_instruction *inst = &c->instruction[insn]; + GLuint writemask; + GLuint read0, read1, read2; + + if (inst->opcode == OPCODE_KIL) { + track_arg(c, inst, 0, WRITEMASK_XYZW); /* All args contribute to final */ + continue; + } + + if (inst->opcode == WM_FB_WRITE) { + track_arg(c, inst, 0, WRITEMASK_XYZW); + track_arg(c, inst, 1, WRITEMASK_XYZW); + if (c->key.source_depth_to_render_target && + c->key.computes_depth) + track_arg(c, inst, 2, WRITEMASK_Z); + else + track_arg(c, inst, 2, 0); + continue; + } + + /* Lookup all the registers which were written by this + * instruction and get a mask of those that contribute to the output: + */ + writemask = get_tracked_mask(c, inst); + if (!writemask) { + GLuint arg; + for (arg = 0; arg < 3; arg++) + track_arg(c, inst, arg, 0); + continue; + } + + read0 = 0; + read1 = 0; + read2 = 0; + + /* Mark all inputs which contribute to the marked outputs: + */ + switch (inst->opcode) { + case OPCODE_ABS: + case OPCODE_FLR: + case OPCODE_FRC: + case OPCODE_MOV: + case OPCODE_SWZ: + case OPCODE_TRUNC: + read0 = writemask; + break; + + case OPCODE_SUB: + case OPCODE_SLT: + case OPCODE_SLE: + case OPCODE_SGE: + case OPCODE_SGT: + case OPCODE_SEQ: + case OPCODE_SNE: + case OPCODE_ADD: + case OPCODE_MAX: + case OPCODE_MIN: + case OPCODE_MUL: + read0 = writemask; + read1 = writemask; + break; + + case OPCODE_DDX: + case OPCODE_DDY: + read0 = writemask; + break; + + case OPCODE_MAD: + case OPCODE_CMP: + case OPCODE_LRP: + read0 = writemask; + read1 = writemask; + read2 = writemask; + break; + + case OPCODE_XPD: + if (writemask & WRITEMASK_X) read0 |= WRITEMASK_YZ; + if (writemask & WRITEMASK_Y) read0 |= WRITEMASK_XZ; + if (writemask & WRITEMASK_Z) read0 |= WRITEMASK_XY; + read1 = read0; + break; + + case OPCODE_COS: + case OPCODE_EX2: + case OPCODE_LG2: + case OPCODE_RCP: + case OPCODE_RSQ: + case OPCODE_SIN: + case OPCODE_SCS: + case WM_CINTERP: + case WM_PIXELXY: + read0 = WRITEMASK_X; + break; + + case OPCODE_POW: + read0 = WRITEMASK_X; + read1 = WRITEMASK_X; + break; + + case OPCODE_TEX: + case OPCODE_TXP: + read0 = get_texcoord_mask(inst->tex_idx); + + if (inst->tex_shadow) + read0 |= WRITEMASK_Z; + break; + + case OPCODE_TXB: + /* Shadow ignored for txb. + */ + read0 = get_texcoord_mask(inst->tex_idx) | WRITEMASK_W; + break; + + case WM_WPOSXY: + read0 = writemask & WRITEMASK_XY; + break; + + case WM_DELTAXY: + read0 = writemask & WRITEMASK_XY; + read1 = WRITEMASK_X; + break; + + case WM_PIXELW: + read0 = WRITEMASK_X; + read1 = WRITEMASK_XY; + break; + + case WM_LINTERP: + read0 = WRITEMASK_X; + read1 = WRITEMASK_XY; + break; + + case WM_PINTERP: + read0 = WRITEMASK_X; /* interpolant */ + read1 = WRITEMASK_XY; /* deltas */ + read2 = WRITEMASK_W; /* pixel w */ + break; + + case OPCODE_DP3: + read0 = WRITEMASK_XYZ; + read1 = WRITEMASK_XYZ; + break; + + case OPCODE_DPH: + read0 = WRITEMASK_XYZ; + read1 = WRITEMASK_XYZW; + break; + + case OPCODE_DP4: + read0 = WRITEMASK_XYZW; + read1 = WRITEMASK_XYZW; + break; + + case OPCODE_LIT: + read0 = WRITEMASK_XYW; + break; + + case OPCODE_DST: + case WM_FRONTFACING: + case OPCODE_KIL_NV: + default: + break; + } + + track_arg(c, inst, 0, read0); + track_arg(c, inst, 1, read1); + track_arg(c, inst, 2, read2); + } + + if (INTEL_DEBUG & DEBUG_WM) { + brw_wm_print_program(c, "pass1"); + } +} diff --git a/src/gallium/drivers/i965/brw_wm_pass2.c b/src/gallium/drivers/i965/brw_wm_pass2.c new file mode 100644 index 00000000000..6faea018fbc --- /dev/null +++ b/src/gallium/drivers/i965/brw_wm_pass2.c @@ -0,0 +1,343 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "brw_context.h" +#include "brw_wm.h" + + +/* Use these to force spilling so that that functionality can be + * tested with known-good examples rather than having to construct new + * tests. + */ +#define TEST_PAYLOAD_SPILLS 0 +#define TEST_DST_SPILLS 0 + +static void spill_value(struct brw_wm_compile *c, + struct brw_wm_value *value); + +static void prealloc_reg(struct brw_wm_compile *c, + struct brw_wm_value *value, + GLuint reg) +{ + if (value->lastuse) { + /* Set nextuse to zero, it will be corrected by + * update_register_usage(). + */ + c->pass2_grf[reg].value = value; + c->pass2_grf[reg].nextuse = 0; + + value->resident = &c->pass2_grf[reg]; + value->hw_reg = brw_vec8_grf(reg*2, 0); + + if (TEST_PAYLOAD_SPILLS) + spill_value(c, value); + } +} + + +/* Initialize all the register values. Do the initial setup + * calculations for interpolants. + */ +static void init_registers( struct brw_wm_compile *c ) +{ + GLuint nr_interp_regs = 0; + GLuint i = 0; + GLuint j; + + for (j = 0; j < c->grf_limit; j++) + c->pass2_grf[j].nextuse = BRW_WM_MAX_INSN; + + for (j = 0; j < c->key.nr_depth_regs; j++) + prealloc_reg(c, &c->payload.depth[j], i++); + + for (j = 0; j < c->nr_creg; j++) + prealloc_reg(c, &c->creg[j], i++); + + for (j = 0; j < FRAG_ATTRIB_MAX; j++) { + if (c->key.vp_outputs_written & (1<<j)) { + int fp_index; + + if (j >= VERT_RESULT_VAR0) + fp_index = j - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0); + else if (j <= VERT_RESULT_TEX7) + fp_index = j; + else + fp_index = -1; + + nr_interp_regs++; + if (fp_index >= 0) + prealloc_reg(c, &c->payload.input_interp[fp_index], i++); + } + } + + assert(nr_interp_regs >= 1); + + c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2; + c->prog_data.urb_read_length = nr_interp_regs * 2; + c->prog_data.curb_read_length = c->nr_creg * 2; + + c->max_wm_grf = i * 2; +} + + +/* Update the nextuse value for each register in our file. + */ +static void update_register_usage(struct brw_wm_compile *c, + GLuint thisinsn) +{ + GLuint i; + + for (i = 1; i < c->grf_limit; i++) { + struct brw_wm_grf *grf = &c->pass2_grf[i]; + + /* Only search those which can change: + */ + if (grf->nextuse < thisinsn) { + const struct brw_wm_ref *ref = grf->value->lastuse; + + /* Has last use of value been passed? + */ + if (ref->insn < thisinsn) { + grf->value->resident = 0; + grf->value = 0; + grf->nextuse = BRW_WM_MAX_INSN; + } + else { + /* Else loop through chain to update: + */ + while (ref->prevuse && ref->prevuse->insn >= thisinsn) + ref = ref->prevuse; + + grf->nextuse = ref->insn; + } + } + } +} + + +static void spill_value(struct brw_wm_compile *c, + struct brw_wm_value *value) +{ + /* Allocate a spill slot. Note that allocations start from 0x40 - + * the first slot is reserved to mean "undef" in brw_wm_emit.c + */ + if (!value->spill_slot) { + c->last_scratch += 0x40; + value->spill_slot = c->last_scratch; + } + + /* The spill will be done in brw_wm_emit.c immediately after the + * value is calculated, so we can just take this reg without any + * further work. + */ + value->resident->value = NULL; + value->resident->nextuse = BRW_WM_MAX_INSN; + value->resident = NULL; +} + + + +/* Search for contiguous region with the most distant nearest + * member. Free regs count as very distant. + * + * TODO: implement spill-to-reg so that we can rearrange discontigous + * free regs and then spill the oldest non-free regs in sequence. + * This would mean inserting instructions in this pass. + */ +static GLuint search_contiguous_regs(struct brw_wm_compile *c, + GLuint nr, + GLuint thisinsn) +{ + struct brw_wm_grf *grf = c->pass2_grf; + GLuint furthest = 0; + GLuint reg = 0; + GLuint i, j; + + /* Start search at 1: r0 is special and can't be used or spilled. + */ + for (i = 1; i < c->grf_limit && furthest < BRW_WM_MAX_INSN; i++) { + GLuint group_nextuse = BRW_WM_MAX_INSN; + + for (j = 0; j < nr; j++) { + if (grf[i+j].nextuse < group_nextuse) + group_nextuse = grf[i+j].nextuse; + } + + if (group_nextuse > furthest) { + furthest = group_nextuse; + reg = i; + } + } + + assert(furthest != thisinsn); + + /* Any non-empty regs will need to be spilled: + */ + for (j = 0; j < nr; j++) + if (grf[reg+j].value) + spill_value(c, grf[reg+j].value); + + return reg; +} + + +static void alloc_contiguous_dest(struct brw_wm_compile *c, + struct brw_wm_value *dst[], + GLuint nr, + GLuint thisinsn) +{ + GLuint reg = search_contiguous_regs(c, nr, thisinsn); + GLuint i; + + for (i = 0; i < nr; i++) { + if (!dst[i]) { + /* Need to grab a dummy value in TEX case. Don't introduce + * it into the tracking scheme. + */ + dst[i] = &c->vreg[c->nr_vreg++]; + } + else { + assert(!dst[i]->resident); + assert(c->pass2_grf[reg+i].nextuse != thisinsn); + + c->pass2_grf[reg+i].value = dst[i]; + c->pass2_grf[reg+i].nextuse = thisinsn; + + dst[i]->resident = &c->pass2_grf[reg+i]; + } + + dst[i]->hw_reg = brw_vec8_grf((reg+i)*2, 0); + } + + if ((reg+nr)*2 > c->max_wm_grf) + c->max_wm_grf = (reg+nr) * 2; +} + + +static void load_args(struct brw_wm_compile *c, + struct brw_wm_instruction *inst) +{ + GLuint thisinsn = inst - c->instruction; + GLuint i,j; + + for (i = 0; i < 3; i++) { + for (j = 0; j < 4; j++) { + struct brw_wm_ref *ref = inst->src[i][j]; + + if (ref) { + if (!ref->value->resident) { + /* Need to bring the value in from scratch space. The code for + * this will be done in brw_wm_emit.c, here we just do the + * register allocation and mark the ref as requiring a fill. + */ + GLuint reg = search_contiguous_regs(c, 1, thisinsn); + + c->pass2_grf[reg].value = ref->value; + c->pass2_grf[reg].nextuse = thisinsn; + + ref->value->resident = &c->pass2_grf[reg]; + + /* Note that a fill is required: + */ + ref->unspill_reg = reg*2; + } + + /* Adjust the hw_reg to point at the value's current location: + */ + assert(ref->value == ref->value->resident->value); + ref->hw_reg.nr += (ref->value->resident - c->pass2_grf) * 2; + } + } + } +} + + + +/* Step 3: Work forwards once again. Perform register allocations, + * taking into account instructions like TEX which require contiguous + * result registers. Where necessary spill registers to scratch space + * and reload later. + */ +void brw_wm_pass2( struct brw_wm_compile *c ) +{ + GLuint insn; + GLuint i; + + init_registers(c); + + for (insn = 0; insn < c->nr_insns; insn++) { + struct brw_wm_instruction *inst = &c->instruction[insn]; + + /* Update registers' nextuse values: + */ + update_register_usage(c, insn); + + /* May need to unspill some args. + */ + load_args(c, inst); + + /* Allocate registers to hold results: + */ + switch (inst->opcode) { + case OPCODE_TEX: + case OPCODE_TXB: + case OPCODE_TXP: + alloc_contiguous_dest(c, inst->dst, 4, insn); + break; + + default: + for (i = 0; i < 4; i++) { + if (inst->writemask & (1<<i)) { + assert(inst->dst[i]); + alloc_contiguous_dest(c, &inst->dst[i], 1, insn); + } + } + break; + } + + if (TEST_DST_SPILLS && inst->opcode != WM_PIXELXY) { + for (i = 0; i < 4; i++) + if (inst->dst[i]) + spill_value(c, inst->dst[i]); + } + } + + if (INTEL_DEBUG & DEBUG_WM) { + brw_wm_print_program(c, "pass2"); + } + + c->state = PASS2_DONE; + + if (INTEL_DEBUG & DEBUG_WM) { + brw_wm_print_program(c, "pass2/done"); + } +} diff --git a/src/gallium/drivers/i965/brw_wm_sampler_state.c b/src/gallium/drivers/i965/brw_wm_sampler_state.c new file mode 100644 index 00000000000..dff466587a8 --- /dev/null +++ b/src/gallium/drivers/i965/brw_wm_sampler_state.c @@ -0,0 +1,369 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" + +#include "main/macros.h" + + + +/* Samplers aren't strictly wm state from the hardware's perspective, + * but that is the only situation in which we use them in this driver. + */ + + + +/* The brw (and related graphics cores) do not support GL_CLAMP. The + * Intel drivers for "other operating systems" implement GL_CLAMP as + * GL_CLAMP_TO_EDGE, so the same is done here. + */ +static GLuint translate_wrap_mode( GLenum wrap ) +{ + switch( wrap ) { + case GL_REPEAT: + return BRW_TEXCOORDMODE_WRAP; + case GL_CLAMP: + return BRW_TEXCOORDMODE_CLAMP; + case GL_CLAMP_TO_EDGE: + return BRW_TEXCOORDMODE_CLAMP; /* conform likes it this way */ + case GL_CLAMP_TO_BORDER: + return BRW_TEXCOORDMODE_CLAMP_BORDER; + case GL_MIRRORED_REPEAT: + return BRW_TEXCOORDMODE_MIRROR; + default: + return BRW_TEXCOORDMODE_WRAP; + } +} + + +static GLuint U_FIXED(GLfloat value, GLuint frac_bits) +{ + value *= (1<<frac_bits); + return value < 0 ? 0 : value; +} + +static GLint S_FIXED(GLfloat value, GLuint frac_bits) +{ + return value * (1<<frac_bits); +} + + +static dri_bo *upload_default_color( struct brw_context *brw, + const GLfloat *color ) +{ + struct brw_sampler_default_color sdc; + + COPY_4V(sdc.color, color); + + return brw_cache_data( &brw->cache, BRW_SAMPLER_DEFAULT_COLOR, &sdc, + NULL, 0 ); +} + + +struct wm_sampler_key { + int sampler_count; + + struct wm_sampler_entry { + GLenum tex_target; + GLenum wrap_r, wrap_s, wrap_t; + float maxlod, minlod; + float lod_bias; + float max_aniso; + GLenum minfilter, magfilter; + GLenum comparemode, comparefunc; + dri_bo *sdc_bo; + + /** If target is cubemap, take context setting. + */ + GLboolean seamless_cube_map; + } sampler[BRW_MAX_TEX_UNIT]; +}; + +/** + * Sets the sampler state for a single unit based off of the sampler key + * entry. + */ +static void brw_update_sampler_state(struct wm_sampler_entry *key, + dri_bo *sdc_bo, + struct brw_sampler_state *sampler) +{ + _mesa_memset(sampler, 0, sizeof(*sampler)); + + switch (key->minfilter) { + case GL_NEAREST: + sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST; + sampler->ss0.mip_filter = BRW_MIPFILTER_NONE; + break; + case GL_LINEAR: + sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR; + sampler->ss0.mip_filter = BRW_MIPFILTER_NONE; + break; + case GL_NEAREST_MIPMAP_NEAREST: + sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST; + sampler->ss0.mip_filter = BRW_MIPFILTER_NEAREST; + break; + case GL_LINEAR_MIPMAP_NEAREST: + sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR; + sampler->ss0.mip_filter = BRW_MIPFILTER_NEAREST; + break; + case GL_NEAREST_MIPMAP_LINEAR: + sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST; + sampler->ss0.mip_filter = BRW_MIPFILTER_LINEAR; + break; + case GL_LINEAR_MIPMAP_LINEAR: + sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR; + sampler->ss0.mip_filter = BRW_MIPFILTER_LINEAR; + break; + default: + break; + } + + /* Set Anisotropy: + */ + if (key->max_aniso > 1.0) { + sampler->ss0.min_filter = BRW_MAPFILTER_ANISOTROPIC; + sampler->ss0.mag_filter = BRW_MAPFILTER_ANISOTROPIC; + + if (key->max_aniso > 2.0) { + sampler->ss3.max_aniso = MIN2((key->max_aniso - 2) / 2, + BRW_ANISORATIO_16); + } + } + else { + switch (key->magfilter) { + case GL_NEAREST: + sampler->ss0.mag_filter = BRW_MAPFILTER_NEAREST; + break; + case GL_LINEAR: + sampler->ss0.mag_filter = BRW_MAPFILTER_LINEAR; + break; + default: + break; + } + } + + sampler->ss1.r_wrap_mode = translate_wrap_mode(key->wrap_r); + sampler->ss1.s_wrap_mode = translate_wrap_mode(key->wrap_s); + sampler->ss1.t_wrap_mode = translate_wrap_mode(key->wrap_t); + + /* Cube-maps on 965 and later must use the same wrap mode for all 3 + * coordinate dimensions. Futher, only CUBE and CLAMP are valid. + */ + if (key->tex_target == GL_TEXTURE_CUBE_MAP) { + if (key->seamless_cube_map && + (key->minfilter != GL_NEAREST || key->magfilter != GL_NEAREST)) { + sampler->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CUBE; + sampler->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CUBE; + sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CUBE; + } else { + sampler->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP; + sampler->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP; + sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP; + } + } else if (key->tex_target == GL_TEXTURE_1D) { + /* There's a bug in 1D texture sampling - it actually pays + * attention to the wrap_t value, though it should not. + * Override the wrap_t value here to GL_REPEAT to keep + * any nonexistent border pixels from floating in. + */ + sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_WRAP; + } + + + /* Set shadow function: + */ + if (key->comparemode == GL_COMPARE_R_TO_TEXTURE_ARB) { + /* Shadowing is "enabled" by emitting a particular sampler + * message (sample_c). So need to recompile WM program when + * shadow comparison is enabled on each/any texture unit. + */ + sampler->ss0.shadow_function = + intel_translate_shadow_compare_func(key->comparefunc); + } + + /* Set LOD bias: + */ + sampler->ss0.lod_bias = S_FIXED(CLAMP(key->lod_bias, -16, 15), 6); + + sampler->ss0.lod_preclamp = 1; /* OpenGL mode */ + sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */ + + /* Set BaseMipLevel, MaxLOD, MinLOD: + * + * XXX: I don't think that using firstLevel, lastLevel works, + * because we always setup the surface state as if firstLevel == + * level zero. Probably have to subtract firstLevel from each of + * these: + */ + sampler->ss0.base_level = U_FIXED(0, 1); + + sampler->ss1.max_lod = U_FIXED(MIN2(MAX2(key->maxlod, 0), 13), 6); + sampler->ss1.min_lod = U_FIXED(MIN2(MAX2(key->minlod, 0), 13), 6); + + sampler->ss2.default_color_pointer = sdc_bo->offset >> 5; /* reloc */ +} + + +/** Sets up the cache key for sampler state for all texture units */ +static void +brw_wm_sampler_populate_key(struct brw_context *brw, + struct wm_sampler_key *key) +{ + GLcontext *ctx = &brw->intel.ctx; + int unit; + + memset(key, 0, sizeof(*key)); + + for (unit = 0; unit < BRW_MAX_TEX_UNIT; unit++) { + if (ctx->Texture.Unit[unit]._ReallyEnabled) { + struct wm_sampler_entry *entry = &key->sampler[unit]; + struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit]; + struct gl_texture_object *texObj = texUnit->_Current; + struct intel_texture_object *intelObj = intel_texture_object(texObj); + struct gl_texture_image *firstImage = + texObj->Image[0][intelObj->firstLevel]; + + entry->tex_target = texObj->Target; + + entry->seamless_cube_map = (texObj->Target == GL_TEXTURE_CUBE_MAP) + ? ctx->Texture.CubeMapSeamless : GL_FALSE; + + entry->wrap_r = texObj->WrapR; + entry->wrap_s = texObj->WrapS; + entry->wrap_t = texObj->WrapT; + + entry->maxlod = texObj->MaxLod; + entry->minlod = texObj->MinLod; + entry->lod_bias = texUnit->LodBias + texObj->LodBias; + entry->max_aniso = texObj->MaxAnisotropy; + entry->minfilter = texObj->MinFilter; + entry->magfilter = texObj->MagFilter; + entry->comparemode = texObj->CompareMode; + entry->comparefunc = texObj->CompareFunc; + + dri_bo_unreference(brw->wm.sdc_bo[unit]); + if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) { + float bordercolor[4] = { + texObj->BorderColor[0], + texObj->BorderColor[0], + texObj->BorderColor[0], + texObj->BorderColor[0] + }; + /* GL specs that border color for depth textures is taken from the + * R channel, while the hardware uses A. Spam R into all the + * channels for safety. + */ + brw->wm.sdc_bo[unit] = upload_default_color(brw, bordercolor); + } else { + brw->wm.sdc_bo[unit] = upload_default_color(brw, + texObj->BorderColor); + } + key->sampler_count = unit + 1; + } + } +} + +/* All samplers must be uploaded in a single contiguous array, which + * complicates various things. However, this is still too confusing - + * FIXME: simplify all the different new texture state flags. + */ +static void upload_wm_samplers( struct brw_context *brw ) +{ + GLcontext *ctx = &brw->intel.ctx; + struct wm_sampler_key key; + int i; + + brw_wm_sampler_populate_key(brw, &key); + + if (brw->wm.sampler_count != key.sampler_count) { + brw->wm.sampler_count = key.sampler_count; + brw->state.dirty.cache |= CACHE_NEW_SAMPLER; + } + + dri_bo_unreference(brw->wm.sampler_bo); + brw->wm.sampler_bo = NULL; + if (brw->wm.sampler_count == 0) + return; + + brw->wm.sampler_bo = brw_search_cache(&brw->cache, BRW_SAMPLER, + &key, sizeof(key), + brw->wm.sdc_bo, key.sampler_count, + NULL); + + /* If we didnt find it in the cache, compute the state and put it in the + * cache. + */ + if (brw->wm.sampler_bo == NULL) { + struct brw_sampler_state sampler[BRW_MAX_TEX_UNIT]; + + memset(sampler, 0, sizeof(sampler)); + for (i = 0; i < key.sampler_count; i++) { + if (brw->wm.sdc_bo[i] == NULL) + continue; + + brw_update_sampler_state(&key.sampler[i], brw->wm.sdc_bo[i], + &sampler[i]); + } + + brw->wm.sampler_bo = brw_upload_cache(&brw->cache, BRW_SAMPLER, + &key, sizeof(key), + brw->wm.sdc_bo, key.sampler_count, + &sampler, sizeof(sampler), + NULL, NULL); + + /* Emit SDC relocations */ + for (i = 0; i < BRW_MAX_TEX_UNIT; i++) { + if (!ctx->Texture.Unit[i]._ReallyEnabled) + continue; + + dri_bo_emit_reloc(brw->wm.sampler_bo, + I915_GEM_DOMAIN_SAMPLER, 0, + 0, + i * sizeof(struct brw_sampler_state) + + offsetof(struct brw_sampler_state, ss2), + brw->wm.sdc_bo[i]); + } + } +} + +const struct brw_tracked_state brw_wm_samplers = { + .dirty = { + .mesa = _NEW_TEXTURE, + .brw = 0, + .cache = 0 + }, + .prepare = upload_wm_samplers, +}; + + diff --git a/src/gallium/drivers/i965/brw_wm_state.c b/src/gallium/drivers/i965/brw_wm_state.c new file mode 100644 index 00000000000..361f91292be --- /dev/null +++ b/src/gallium/drivers/i965/brw_wm_state.c @@ -0,0 +1,317 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "brw_wm.h" + +/*********************************************************************** + * WM unit - fragment programs and rasterization + */ + +struct brw_wm_unit_key { + unsigned int total_grf, total_scratch; + unsigned int urb_entry_read_length; + unsigned int curb_entry_read_length; + unsigned int dispatch_grf_start_reg; + + unsigned int curbe_offset; + unsigned int urb_size; + + unsigned int max_threads; + + unsigned int nr_surfaces, sampler_count; + GLboolean uses_depth, computes_depth, uses_kill, is_glsl; + GLboolean polygon_stipple, stats_wm, line_stipple, offset_enable; + GLfloat offset_units, offset_factor; +}; + +static void +wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key) +{ + GLcontext *ctx = &brw->intel.ctx; + const struct gl_fragment_program *fp = brw->fragment_program; + const struct brw_fragment_program *bfp = (struct brw_fragment_program *) fp; + struct intel_context *intel = &brw->intel; + + memset(key, 0, sizeof(*key)); + + if (INTEL_DEBUG & DEBUG_SINGLE_THREAD) + key->max_threads = 1; + else { + /* WM maximum threads is number of EUs times number of threads per EU. */ + if (BRW_IS_IGDNG(brw)) + key->max_threads = 12 * 6; + else if (BRW_IS_G4X(brw)) + key->max_threads = 10 * 5; + else + key->max_threads = 8 * 4; + } + + /* CACHE_NEW_WM_PROG */ + key->total_grf = brw->wm.prog_data->total_grf; + key->urb_entry_read_length = brw->wm.prog_data->urb_read_length; + key->curb_entry_read_length = brw->wm.prog_data->curb_read_length; + key->dispatch_grf_start_reg = brw->wm.prog_data->first_curbe_grf; + key->total_scratch = ALIGN(brw->wm.prog_data->total_scratch, 1024); + + /* BRW_NEW_URB_FENCE */ + key->urb_size = brw->urb.vsize; + + /* BRW_NEW_CURBE_OFFSETS */ + key->curbe_offset = brw->curbe.wm_start; + + /* BRW_NEW_NR_SURFACEs */ + key->nr_surfaces = brw->wm.nr_surfaces; + + /* CACHE_NEW_SAMPLER */ + key->sampler_count = brw->wm.sampler_count; + + /* _NEW_POLYGONSTIPPLE */ + key->polygon_stipple = ctx->Polygon.StippleFlag; + + /* BRW_NEW_FRAGMENT_PROGRAM */ + key->uses_depth = (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0; + + /* as far as we can tell */ + key->computes_depth = + (fp->Base.OutputsWritten & (1 << FRAG_RESULT_DEPTH)) != 0; + /* BRW_NEW_DEPTH_BUFFER + * Override for NULL depthbuffer case, required by the Pixel Shader Computed + * Depth field. + */ + if (brw->state.depth_region == NULL) + key->computes_depth = 0; + + /* _NEW_COLOR */ + key->uses_kill = fp->UsesKill || ctx->Color.AlphaEnabled; + key->is_glsl = bfp->isGLSL; + + /* temporary sanity check assertion */ + ASSERT(bfp->isGLSL == brw_wm_is_glsl(fp)); + + /* _NEW_DEPTH */ + key->stats_wm = intel->stats_wm; + + /* _NEW_LINE */ + key->line_stipple = ctx->Line.StippleFlag; + + /* _NEW_POLYGON */ + key->offset_enable = ctx->Polygon.OffsetFill; + key->offset_units = ctx->Polygon.OffsetUnits; + key->offset_factor = ctx->Polygon.OffsetFactor; +} + +/** + * Setup wm hardware state. See page 225 of Volume 2 + */ +static dri_bo * +wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key, + dri_bo **reloc_bufs) +{ + struct brw_wm_unit_state wm; + dri_bo *bo; + + memset(&wm, 0, sizeof(wm)); + + wm.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1; + wm.thread0.kernel_start_pointer = brw->wm.prog_bo->offset >> 6; /* reloc */ + wm.thread1.depth_coef_urb_read_offset = 1; + wm.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754; + + if (BRW_IS_IGDNG(brw)) + wm.thread1.binding_table_entry_count = 0; /* hardware requirement */ + else + wm.thread1.binding_table_entry_count = key->nr_surfaces; + + if (key->total_scratch != 0) { + wm.thread2.scratch_space_base_pointer = + brw->wm.scratch_bo->offset >> 10; /* reloc */ + wm.thread2.per_thread_scratch_space = key->total_scratch / 1024 - 1; + } else { + wm.thread2.scratch_space_base_pointer = 0; + wm.thread2.per_thread_scratch_space = 0; + } + + wm.thread3.dispatch_grf_start_reg = key->dispatch_grf_start_reg; + wm.thread3.urb_entry_read_length = key->urb_entry_read_length; + wm.thread3.urb_entry_read_offset = 0; + wm.thread3.const_urb_entry_read_length = key->curb_entry_read_length; + wm.thread3.const_urb_entry_read_offset = key->curbe_offset * 2; + + if (BRW_IS_IGDNG(brw)) + wm.wm4.sampler_count = 0; /* hardware requirement */ + else + wm.wm4.sampler_count = (key->sampler_count + 1) / 4; + + if (brw->wm.sampler_bo != NULL) { + /* reloc */ + wm.wm4.sampler_state_pointer = brw->wm.sampler_bo->offset >> 5; + } else { + wm.wm4.sampler_state_pointer = 0; + } + + wm.wm5.program_uses_depth = key->uses_depth; + wm.wm5.program_computes_depth = key->computes_depth; + wm.wm5.program_uses_killpixel = key->uses_kill; + + if (key->is_glsl) + wm.wm5.enable_8_pix = 1; + else + wm.wm5.enable_16_pix = 1; + + wm.wm5.max_threads = key->max_threads - 1; + wm.wm5.thread_dispatch_enable = 1; /* AKA: color_write */ + wm.wm5.legacy_line_rast = 0; + wm.wm5.legacy_global_depth_bias = 0; + wm.wm5.early_depth_test = 1; /* never need to disable */ + wm.wm5.line_aa_region_width = 0; + wm.wm5.line_endcap_aa_region_width = 1; + + wm.wm5.polygon_stipple = key->polygon_stipple; + + if (key->offset_enable) { + wm.wm5.depth_offset = 1; + /* Something wierd going on with legacy_global_depth_bias, + * offset_constant, scaling and MRD. This value passes glean + * but gives some odd results elsewere (eg. the + * quad-offset-units test). + */ + wm.global_depth_offset_constant = key->offset_units * 2; + + /* This is the only value that passes glean: + */ + wm.global_depth_offset_scale = key->offset_factor; + } + + wm.wm5.line_stipple = key->line_stipple; + + if (INTEL_DEBUG & DEBUG_STATS || key->stats_wm) + wm.wm4.stats_enable = 1; + + bo = brw_upload_cache(&brw->cache, BRW_WM_UNIT, + key, sizeof(*key), + reloc_bufs, 3, + &wm, sizeof(wm), + NULL, NULL); + + /* Emit WM program relocation */ + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + wm.thread0.grf_reg_count << 1, + offsetof(struct brw_wm_unit_state, thread0), + brw->wm.prog_bo); + + /* Emit scratch space relocation */ + if (key->total_scratch != 0) { + dri_bo_emit_reloc(bo, + 0, 0, + wm.thread2.per_thread_scratch_space, + offsetof(struct brw_wm_unit_state, thread2), + brw->wm.scratch_bo); + } + + /* Emit sampler state relocation */ + if (key->sampler_count != 0) { + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + wm.wm4.stats_enable | (wm.wm4.sampler_count << 2), + offsetof(struct brw_wm_unit_state, wm4), + brw->wm.sampler_bo); + } + + return bo; +} + + +static void upload_wm_unit( struct brw_context *brw ) +{ + struct intel_context *intel = &brw->intel; + struct brw_wm_unit_key key; + dri_bo *reloc_bufs[3]; + wm_unit_populate_key(brw, &key); + + /* Allocate the necessary scratch space if we haven't already. Don't + * bother reducing the allocation later, since we use scratch so + * rarely. + */ + assert(key.total_scratch <= 12 * 1024); + if (key.total_scratch) { + GLuint total = key.total_scratch * key.max_threads; + + if (brw->wm.scratch_bo && total > brw->wm.scratch_bo->size) { + dri_bo_unreference(brw->wm.scratch_bo); + brw->wm.scratch_bo = NULL; + } + if (brw->wm.scratch_bo == NULL) { + brw->wm.scratch_bo = dri_bo_alloc(intel->bufmgr, + "wm scratch", + total, + 4096); + } + } + + reloc_bufs[0] = brw->wm.prog_bo; + reloc_bufs[1] = brw->wm.scratch_bo; + reloc_bufs[2] = brw->wm.sampler_bo; + + dri_bo_unreference(brw->wm.state_bo); + brw->wm.state_bo = brw_search_cache(&brw->cache, BRW_WM_UNIT, + &key, sizeof(key), + reloc_bufs, 3, + NULL); + if (brw->wm.state_bo == NULL) { + brw->wm.state_bo = wm_unit_create_from_key(brw, &key, reloc_bufs); + } +} + +const struct brw_tracked_state brw_wm_unit = { + .dirty = { + .mesa = (_NEW_POLYGON | + _NEW_POLYGONSTIPPLE | + _NEW_LINE | + _NEW_COLOR | + _NEW_DEPTH), + + .brw = (BRW_NEW_FRAGMENT_PROGRAM | + BRW_NEW_CURBE_OFFSETS | + BRW_NEW_DEPTH_BUFFER | + BRW_NEW_NR_WM_SURFACES), + + .cache = (CACHE_NEW_WM_PROG | + CACHE_NEW_SAMPLER) + }, + .prepare = upload_wm_unit, +}; + diff --git a/src/gallium/drivers/i965/brw_wm_surface_state.c b/src/gallium/drivers/i965/brw_wm_surface_state.c new file mode 100644 index 00000000000..f7cc5153a8e --- /dev/null +++ b/src/gallium/drivers/i965/brw_wm_surface_state.c @@ -0,0 +1,752 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + */ + + +#include "main/mtypes.h" +#include "main/texformat.h" +#include "main/texstore.h" +#include "shader/prog_parameter.h" + +#include "intel_mipmap_tree.h" +#include "intel_batchbuffer.h" +#include "intel_tex.h" +#include "intel_fbo.h" + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" + + +static GLuint translate_tex_target( GLenum target ) +{ + switch (target) { + case GL_TEXTURE_1D: + return BRW_SURFACE_1D; + + case GL_TEXTURE_RECTANGLE_NV: + return BRW_SURFACE_2D; + + case GL_TEXTURE_2D: + return BRW_SURFACE_2D; + + case GL_TEXTURE_3D: + return BRW_SURFACE_3D; + + case GL_TEXTURE_CUBE_MAP: + return BRW_SURFACE_CUBE; + + default: + assert(0); + return 0; + } +} + + +static GLuint translate_tex_format( GLuint mesa_format, GLenum internal_format, + GLenum depth_mode ) +{ + switch( mesa_format ) { + case MESA_FORMAT_L8: + return BRW_SURFACEFORMAT_L8_UNORM; + + case MESA_FORMAT_I8: + return BRW_SURFACEFORMAT_I8_UNORM; + + case MESA_FORMAT_A8: + return BRW_SURFACEFORMAT_A8_UNORM; + + case MESA_FORMAT_AL88: + return BRW_SURFACEFORMAT_L8A8_UNORM; + + case MESA_FORMAT_RGB888: + assert(0); /* not supported for sampling */ + return BRW_SURFACEFORMAT_R8G8B8_UNORM; + + case MESA_FORMAT_ARGB8888: + if (internal_format == GL_RGB) + return BRW_SURFACEFORMAT_B8G8R8X8_UNORM; + else + return BRW_SURFACEFORMAT_B8G8R8A8_UNORM; + + case MESA_FORMAT_RGBA8888_REV: + if (internal_format == GL_RGB) + return BRW_SURFACEFORMAT_R8G8B8X8_UNORM; + else + return BRW_SURFACEFORMAT_R8G8B8A8_UNORM; + + case MESA_FORMAT_RGB565: + return BRW_SURFACEFORMAT_B5G6R5_UNORM; + + case MESA_FORMAT_ARGB1555: + return BRW_SURFACEFORMAT_B5G5R5A1_UNORM; + + case MESA_FORMAT_ARGB4444: + return BRW_SURFACEFORMAT_B4G4R4A4_UNORM; + + case MESA_FORMAT_YCBCR_REV: + return BRW_SURFACEFORMAT_YCRCB_NORMAL; + + case MESA_FORMAT_YCBCR: + return BRW_SURFACEFORMAT_YCRCB_SWAPUVY; + + case MESA_FORMAT_RGB_FXT1: + case MESA_FORMAT_RGBA_FXT1: + return BRW_SURFACEFORMAT_FXT1; + + case MESA_FORMAT_Z16: + if (depth_mode == GL_INTENSITY) + return BRW_SURFACEFORMAT_I16_UNORM; + else if (depth_mode == GL_ALPHA) + return BRW_SURFACEFORMAT_A16_UNORM; + else + return BRW_SURFACEFORMAT_L16_UNORM; + + case MESA_FORMAT_RGB_DXT1: + return BRW_SURFACEFORMAT_DXT1_RGB; + + case MESA_FORMAT_RGBA_DXT1: + return BRW_SURFACEFORMAT_BC1_UNORM; + + case MESA_FORMAT_RGBA_DXT3: + return BRW_SURFACEFORMAT_BC2_UNORM; + + case MESA_FORMAT_RGBA_DXT5: + return BRW_SURFACEFORMAT_BC3_UNORM; + + case MESA_FORMAT_SARGB8: + return BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB; + + case MESA_FORMAT_SLA8: + return BRW_SURFACEFORMAT_L8A8_UNORM_SRGB; + + case MESA_FORMAT_SL8: + return BRW_SURFACEFORMAT_L8_UNORM_SRGB; + + case MESA_FORMAT_SRGB_DXT1: + return BRW_SURFACEFORMAT_BC1_UNORM_SRGB; + + case MESA_FORMAT_S8_Z24: + /* XXX: these different surface formats don't seem to + * make any difference for shadow sampler/compares. + */ + if (depth_mode == GL_INTENSITY) + return BRW_SURFACEFORMAT_I24X8_UNORM; + else if (depth_mode == GL_ALPHA) + return BRW_SURFACEFORMAT_A24X8_UNORM; + else + return BRW_SURFACEFORMAT_L24X8_UNORM; + + case MESA_FORMAT_DUDV8: + return BRW_SURFACEFORMAT_R8G8_SNORM; + + case MESA_FORMAT_SIGNED_RGBA8888_REV: + return BRW_SURFACEFORMAT_R8G8B8A8_SNORM; + + default: + assert(0); + return 0; + } +} + +static void +brw_set_surface_tiling(struct brw_surface_state *surf, uint32_t tiling) +{ + switch (tiling) { + case I915_TILING_NONE: + surf->ss3.tiled_surface = 0; + surf->ss3.tile_walk = 0; + break; + case I915_TILING_X: + surf->ss3.tiled_surface = 1; + surf->ss3.tile_walk = BRW_TILEWALK_XMAJOR; + break; + case I915_TILING_Y: + surf->ss3.tiled_surface = 1; + surf->ss3.tile_walk = BRW_TILEWALK_YMAJOR; + break; + } +} + +static dri_bo * +brw_create_texture_surface( struct brw_context *brw, + struct brw_surface_key *key ) +{ + struct brw_surface_state surf; + dri_bo *bo; + + memset(&surf, 0, sizeof(surf)); + + surf.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW; + surf.ss0.surface_type = translate_tex_target(key->target); + if (key->bo) { + surf.ss0.surface_format = translate_tex_format(key->format, + key->internal_format, + key->depthmode); + } + else { + switch (key->depth) { + case 32: + surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM; + break; + default: + case 24: + surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8X8_UNORM; + break; + case 16: + surf.ss0.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM; + break; + } + } + + /* This is ok for all textures with channel width 8bit or less: + */ +/* surf.ss0.data_return_format = BRW_SURFACERETURNFORMAT_S1; */ + if (key->bo) + surf.ss1.base_addr = key->bo->offset; /* reloc */ + else + surf.ss1.base_addr = key->offset; + + surf.ss2.mip_count = key->last_level - key->first_level; + surf.ss2.width = key->width - 1; + surf.ss2.height = key->height - 1; + brw_set_surface_tiling(&surf, key->tiling); + surf.ss3.pitch = (key->pitch * key->cpp) - 1; + surf.ss3.depth = key->depth - 1; + + surf.ss4.min_lod = 0; + + if (key->target == GL_TEXTURE_CUBE_MAP) { + surf.ss0.cube_pos_x = 1; + surf.ss0.cube_pos_y = 1; + surf.ss0.cube_pos_z = 1; + surf.ss0.cube_neg_x = 1; + surf.ss0.cube_neg_y = 1; + surf.ss0.cube_neg_z = 1; + } + + bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE, + key, sizeof(*key), + &key->bo, key->bo ? 1 : 0, + &surf, sizeof(surf), + NULL, NULL); + + if (key->bo) { + /* Emit relocation to surface contents */ + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_SAMPLER, 0, + 0, + offsetof(struct brw_surface_state, ss1), + key->bo); + } + return bo; +} + +static void +brw_update_texture_surface( GLcontext *ctx, GLuint unit ) +{ + struct brw_context *brw = brw_context(ctx); + struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current; + struct intel_texture_object *intelObj = intel_texture_object(tObj); + struct gl_texture_image *firstImage = tObj->Image[0][intelObj->firstLevel]; + struct brw_surface_key key; + const GLuint surf = SURF_INDEX_TEXTURE(unit); + + memset(&key, 0, sizeof(key)); + + if (intelObj->imageOverride) { + key.pitch = intelObj->pitchOverride / intelObj->mt->cpp; + key.depth = intelObj->depthOverride; + key.bo = NULL; + key.offset = intelObj->textureOffset; + } else { + key.format = firstImage->TexFormat->MesaFormat; + key.internal_format = firstImage->InternalFormat; + key.pitch = intelObj->mt->pitch; + key.depth = firstImage->Depth; + key.bo = intelObj->mt->region->buffer; + key.offset = 0; + } + + key.target = tObj->Target; + key.depthmode = tObj->DepthMode; + key.first_level = intelObj->firstLevel; + key.last_level = intelObj->lastLevel; + key.width = firstImage->Width; + key.height = firstImage->Height; + key.cpp = intelObj->mt->cpp; + key.tiling = intelObj->mt->region->tiling; + + dri_bo_unreference(brw->wm.surf_bo[surf]); + brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache, + BRW_SS_SURFACE, + &key, sizeof(key), + &key.bo, key.bo ? 1 : 0, + NULL); + if (brw->wm.surf_bo[surf] == NULL) { + brw->wm.surf_bo[surf] = brw_create_texture_surface(brw, &key); + } +} + + + +/** + * Create the constant buffer surface. Vertex/fragment shader constants will be + * read from this buffer with Data Port Read instructions/messages. + */ +dri_bo * +brw_create_constant_surface( struct brw_context *brw, + struct brw_surface_key *key ) +{ + const GLint w = key->width - 1; + struct brw_surface_state surf; + dri_bo *bo; + + memset(&surf, 0, sizeof(surf)); + + surf.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW; + surf.ss0.surface_type = BRW_SURFACE_BUFFER; + surf.ss0.surface_format = BRW_SURFACEFORMAT_R32G32B32A32_FLOAT; + + assert(key->bo); + if (key->bo) + surf.ss1.base_addr = key->bo->offset; /* reloc */ + else + surf.ss1.base_addr = key->offset; + + surf.ss2.width = w & 0x7f; /* bits 6:0 of size or width */ + surf.ss2.height = (w >> 7) & 0x1fff; /* bits 19:7 of size or width */ + surf.ss3.depth = (w >> 20) & 0x7f; /* bits 26:20 of size or width */ + surf.ss3.pitch = (key->pitch * key->cpp) - 1; /* ignored?? */ + brw_set_surface_tiling(&surf, key->tiling); /* tiling now allowed */ + + bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE, + key, sizeof(*key), + &key->bo, key->bo ? 1 : 0, + &surf, sizeof(surf), + NULL, NULL); + + if (key->bo) { + /* Emit relocation to surface contents */ + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_SAMPLER, 0, + 0, + offsetof(struct brw_surface_state, ss1), + key->bo); + } + + return bo; +} + +/* Creates a new WM constant buffer reflecting the current fragment program's + * constants, if needed by the fragment program. + * + * Otherwise, constants go through the CURBEs using the brw_constant_buffer + * state atom. + */ +static drm_intel_bo * +brw_wm_update_constant_buffer(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + struct brw_fragment_program *fp = + (struct brw_fragment_program *) brw->fragment_program; + const struct gl_program_parameter_list *params = fp->program.Base.Parameters; + const int size = params->NumParameters * 4 * sizeof(GLfloat); + drm_intel_bo *const_buffer; + + /* BRW_NEW_FRAGMENT_PROGRAM */ + if (!fp->use_const_buffer) + return NULL; + + const_buffer = drm_intel_bo_alloc(intel->bufmgr, "fp_const_buffer", + size, 64); + + /* _NEW_PROGRAM_CONSTANTS */ + dri_bo_subdata(const_buffer, 0, size, params->ParameterValues); + + return const_buffer; +} + +/** + * Update the surface state for a WM constant buffer. + * The constant buffer will be (re)allocated here if needed. + */ +static void +brw_update_wm_constant_surface( GLcontext *ctx, + GLuint surf) +{ + struct brw_context *brw = brw_context(ctx); + struct brw_surface_key key; + struct brw_fragment_program *fp = + (struct brw_fragment_program *) brw->fragment_program; + const struct gl_program_parameter_list *params = + fp->program.Base.Parameters; + + /* If we're in this state update atom, we need to update WM constants, so + * free the old buffer and create a new one for the new contents. + */ + dri_bo_unreference(fp->const_buffer); + fp->const_buffer = brw_wm_update_constant_buffer(brw); + + /* If there's no constant buffer, then no surface BO is needed to point at + * it. + */ + if (fp->const_buffer == 0) { + drm_intel_bo_unreference(brw->wm.surf_bo[surf]); + brw->wm.surf_bo[surf] = NULL; + return; + } + + memset(&key, 0, sizeof(key)); + + key.format = MESA_FORMAT_RGBA_FLOAT32; + key.internal_format = GL_RGBA; + key.bo = fp->const_buffer; + key.depthmode = GL_NONE; + key.pitch = params->NumParameters; + key.width = params->NumParameters; + key.height = 1; + key.depth = 1; + key.cpp = 16; + + /* + printf("%s:\n", __FUNCTION__); + printf(" width %d height %d depth %d cpp %d pitch %d\n", + key.width, key.height, key.depth, key.cpp, key.pitch); + */ + + dri_bo_unreference(brw->wm.surf_bo[surf]); + brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache, + BRW_SS_SURFACE, + &key, sizeof(key), + &key.bo, key.bo ? 1 : 0, + NULL); + if (brw->wm.surf_bo[surf] == NULL) { + brw->wm.surf_bo[surf] = brw_create_constant_surface(brw, &key); + } + brw->state.dirty.brw |= BRW_NEW_WM_SURFACES; +} + +/** + * Updates surface / buffer for fragment shader constant buffer, if + * one is required. + * + * This consumes the state updates for the constant buffer, and produces + * BRW_NEW_WM_SURFACES to get picked up by brw_prepare_wm_surfaces for + * inclusion in the binding table. + */ +static void prepare_wm_constant_surface(struct brw_context *brw ) +{ + GLcontext *ctx = &brw->intel.ctx; + struct brw_fragment_program *fp = + (struct brw_fragment_program *) brw->fragment_program; + GLuint surf = SURF_INDEX_FRAG_CONST_BUFFER; + + drm_intel_bo_unreference(fp->const_buffer); + fp->const_buffer = brw_wm_update_constant_buffer(brw); + + /* If there's no constant buffer, then no surface BO is needed to point at + * it. + */ + if (fp->const_buffer == 0) { + if (brw->wm.surf_bo[surf] != NULL) { + drm_intel_bo_unreference(brw->wm.surf_bo[surf]); + brw->wm.surf_bo[surf] = NULL; + brw->state.dirty.brw |= BRW_NEW_WM_SURFACES; + } + return; + } + + brw_update_wm_constant_surface(ctx, surf); +} + +const struct brw_tracked_state brw_wm_constant_surface = { + .dirty = { + .mesa = (_NEW_PROGRAM_CONSTANTS), + .brw = (BRW_NEW_FRAGMENT_PROGRAM), + .cache = 0 + }, + .prepare = prepare_wm_constant_surface, +}; + + +/** + * Sets up a surface state structure to point at the given region. + * While it is only used for the front/back buffer currently, it should be + * usable for further buffers when doing ARB_draw_buffer support. + */ +static void +brw_update_renderbuffer_surface(struct brw_context *brw, + struct gl_renderbuffer *rb, + unsigned int unit) +{ + GLcontext *ctx = &brw->intel.ctx; + dri_bo *region_bo = NULL; + struct intel_renderbuffer *irb = intel_renderbuffer(rb); + struct intel_region *region = irb ? irb->region : NULL; + struct { + unsigned int surface_type; + unsigned int surface_format; + unsigned int width, height, pitch, cpp; + GLubyte color_mask[4]; + GLboolean color_blend; + uint32_t tiling; + uint32_t draw_offset; + } key; + + memset(&key, 0, sizeof(key)); + + if (region != NULL) { + region_bo = region->buffer; + + key.surface_type = BRW_SURFACE_2D; + switch (irb->texformat->MesaFormat) { + case MESA_FORMAT_ARGB8888: + key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM; + break; + case MESA_FORMAT_RGB565: + key.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM; + break; + case MESA_FORMAT_ARGB1555: + key.surface_format = BRW_SURFACEFORMAT_B5G5R5A1_UNORM; + break; + case MESA_FORMAT_ARGB4444: + key.surface_format = BRW_SURFACEFORMAT_B4G4R4A4_UNORM; + break; + default: + _mesa_problem(ctx, "Bad renderbuffer format: %d\n", + irb->texformat->MesaFormat); + } + key.tiling = region->tiling; + if (brw->intel.intelScreen->driScrnPriv->dri2.enabled) { + key.width = rb->Width; + key.height = rb->Height; + } else { + key.width = region->width; + key.height = region->height; + } + key.pitch = region->pitch; + key.cpp = region->cpp; + key.draw_offset = region->draw_offset; /* cur 3d or cube face offset */ + } else { + key.surface_type = BRW_SURFACE_NULL; + key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM; + key.tiling = I915_TILING_X; + key.width = 1; + key.height = 1; + key.cpp = 4; + key.draw_offset = 0; + } + memcpy(key.color_mask, ctx->Color.ColorMask, + sizeof(key.color_mask)); + key.color_blend = (!ctx->Color._LogicOpEnabled && + ctx->Color.BlendEnabled); + + dri_bo_unreference(brw->wm.surf_bo[unit]); + brw->wm.surf_bo[unit] = brw_search_cache(&brw->surface_cache, + BRW_SS_SURFACE, + &key, sizeof(key), + ®ion_bo, 1, + NULL); + + if (brw->wm.surf_bo[unit] == NULL) { + struct brw_surface_state surf; + + memset(&surf, 0, sizeof(surf)); + + surf.ss0.surface_format = key.surface_format; + surf.ss0.surface_type = key.surface_type; + if (key.tiling == I915_TILING_NONE) { + surf.ss1.base_addr = key.draw_offset; + } else { + uint32_t tile_offset = key.draw_offset % 4096; + + surf.ss1.base_addr = key.draw_offset - tile_offset; + + assert(BRW_IS_G4X(brw) || tile_offset == 0); + if (BRW_IS_G4X(brw)) { + if (key.tiling == I915_TILING_X) { + /* Note that the low bits of these fields are missing, so + * there's the possibility of getting in trouble. + */ + surf.ss5.x_offset = (tile_offset % 512) / key.cpp / 4; + surf.ss5.y_offset = tile_offset / 512 / 2; + } else { + surf.ss5.x_offset = (tile_offset % 128) / key.cpp / 4; + surf.ss5.y_offset = tile_offset / 128 / 2; + } + } + } + if (region_bo != NULL) + surf.ss1.base_addr += region_bo->offset; /* reloc */ + + surf.ss2.width = key.width - 1; + surf.ss2.height = key.height - 1; + brw_set_surface_tiling(&surf, key.tiling); + surf.ss3.pitch = (key.pitch * key.cpp) - 1; + + /* _NEW_COLOR */ + surf.ss0.color_blend = key.color_blend; + surf.ss0.writedisable_red = !key.color_mask[0]; + surf.ss0.writedisable_green = !key.color_mask[1]; + surf.ss0.writedisable_blue = !key.color_mask[2]; + surf.ss0.writedisable_alpha = !key.color_mask[3]; + + /* Key size will never match key size for textures, so we're safe. */ + brw->wm.surf_bo[unit] = brw_upload_cache(&brw->surface_cache, + BRW_SS_SURFACE, + &key, sizeof(key), + ®ion_bo, 1, + &surf, sizeof(surf), + NULL, NULL); + if (region_bo != NULL) { + /* We might sample from it, and we might render to it, so flag + * them both. We might be able to figure out from other state + * a more restrictive relocation to emit. + */ + drm_intel_bo_emit_reloc(brw->wm.surf_bo[unit], + offsetof(struct brw_surface_state, ss1), + region_bo, + surf.ss1.base_addr - region_bo->offset, + I915_GEM_DOMAIN_RENDER, + I915_GEM_DOMAIN_RENDER); + } + } +} + + +/** + * Constructs the binding table for the WM surface state, which maps unit + * numbers to surface state objects. + */ +static dri_bo * +brw_wm_get_binding_table(struct brw_context *brw) +{ + dri_bo *bind_bo; + + assert(brw->wm.nr_surfaces <= BRW_WM_MAX_SURF); + + bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND, + NULL, 0, + brw->wm.surf_bo, brw->wm.nr_surfaces, + NULL); + + if (bind_bo == NULL) { + GLuint data_size = brw->wm.nr_surfaces * sizeof(GLuint); + uint32_t data[BRW_WM_MAX_SURF]; + int i; + + for (i = 0; i < brw->wm.nr_surfaces; i++) + if (brw->wm.surf_bo[i]) + data[i] = brw->wm.surf_bo[i]->offset; + else + data[i] = 0; + + bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND, + NULL, 0, + brw->wm.surf_bo, brw->wm.nr_surfaces, + data, data_size, + NULL, NULL); + + /* Emit binding table relocations to surface state */ + for (i = 0; i < BRW_WM_MAX_SURF; i++) { + if (brw->wm.surf_bo[i] != NULL) { + dri_bo_emit_reloc(bind_bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + 0, + i * sizeof(GLuint), + brw->wm.surf_bo[i]); + } + } + } + + return bind_bo; +} + +static void prepare_wm_surfaces(struct brw_context *brw ) +{ + GLcontext *ctx = &brw->intel.ctx; + GLuint i; + int old_nr_surfaces; + + /* _NEW_BUFFERS */ + /* Update surfaces for drawing buffers */ + if (ctx->DrawBuffer->_NumColorDrawBuffers >= 1) { + for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) { + brw_update_renderbuffer_surface(brw, + ctx->DrawBuffer->_ColorDrawBuffers[i], + i); + } + } else { + brw_update_renderbuffer_surface(brw, NULL, 0); + } + + old_nr_surfaces = brw->wm.nr_surfaces; + brw->wm.nr_surfaces = MAX_DRAW_BUFFERS; + + if (brw->wm.surf_bo[SURF_INDEX_FRAG_CONST_BUFFER] != NULL) + brw->wm.nr_surfaces = SURF_INDEX_FRAG_CONST_BUFFER + 1; + + /* Update surfaces for textures */ + for (i = 0; i < BRW_MAX_TEX_UNIT; i++) { + const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i]; + const GLuint surf = SURF_INDEX_TEXTURE(i); + + /* _NEW_TEXTURE, BRW_NEW_TEXDATA */ + if (texUnit->_ReallyEnabled) { + brw_update_texture_surface(ctx, i); + brw->wm.nr_surfaces = surf + 1; + } else { + dri_bo_unreference(brw->wm.surf_bo[surf]); + brw->wm.surf_bo[surf] = NULL; + } + } + + dri_bo_unreference(brw->wm.bind_bo); + brw->wm.bind_bo = brw_wm_get_binding_table(brw); + + if (brw->wm.nr_surfaces != old_nr_surfaces) + brw->state.dirty.brw |= BRW_NEW_NR_WM_SURFACES; +} + +const struct brw_tracked_state brw_wm_surfaces = { + .dirty = { + .mesa = (_NEW_COLOR | + _NEW_TEXTURE | + _NEW_BUFFERS), + .brw = (BRW_NEW_CONTEXT | + BRW_NEW_WM_SURFACES), + .cache = 0 + }, + .prepare = prepare_wm_surfaces, +}; + + + diff --git a/src/gallium/drivers/i965/intel_batchbuffer.h b/src/gallium/drivers/i965/intel_batchbuffer.h new file mode 100644 index 00000000000..d4899aab7fa --- /dev/null +++ b/src/gallium/drivers/i965/intel_batchbuffer.h @@ -0,0 +1,184 @@ +#ifndef INTEL_BATCHBUFFER_H +#define INTEL_BATCHBUFFER_H + +#include "main/mtypes.h" + +#include "intel_context.h" +#include "intel_bufmgr.h" +#include "intel_reg.h" + +#define BATCH_SZ 16384 +#define BATCH_RESERVED 16 + +enum cliprect_mode { + /** + * Batchbuffer contents may be looped over per cliprect, but do not + * require it. + */ + IGNORE_CLIPRECTS, + /** + * Batchbuffer contents require looping over per cliprect at batch submit + * time. + * + * This will be upgraded to NO_LOOP_CLIPRECTS when there's a single + * constant cliprect, as in DRI2 or FBO rendering. + */ + LOOP_CLIPRECTS, + /** + * Batchbuffer contents contain drawing that should not be executed multiple + * times. + */ + NO_LOOP_CLIPRECTS, + /** + * Batchbuffer contents contain drawing that already handles cliprects, such + * as 2D drawing to front/back/depth that doesn't respect DRAWING_RECTANGLE. + * + * Equivalent behavior to NO_LOOP_CLIPRECTS, but may not persist in batch + * outside of LOCK/UNLOCK. This is upgraded to just NO_LOOP_CLIPRECTS when + * there's a constant cliprect, as in DRI2 or FBO rendering. + */ + REFERENCES_CLIPRECTS +}; + +struct intel_batchbuffer +{ + struct intel_context *intel; + + dri_bo *buf; + + GLubyte *buffer; + + GLubyte *map; + GLubyte *ptr; + + enum cliprect_mode cliprect_mode; + + GLuint size; + + /** Tracking of BEGIN_BATCH()/OUT_BATCH()/ADVANCE_BATCH() debugging */ + struct { + GLuint total; + GLubyte *start_ptr; + } emit; + + GLuint dirty_state; +}; + +struct intel_batchbuffer *intel_batchbuffer_alloc(struct intel_context + *intel); + +void intel_batchbuffer_free(struct intel_batchbuffer *batch); + + +void _intel_batchbuffer_flush(struct intel_batchbuffer *batch, + const char *file, int line); + +#define intel_batchbuffer_flush(batch) \ + _intel_batchbuffer_flush(batch, __FILE__, __LINE__) + +void intel_batchbuffer_reset(struct intel_batchbuffer *batch); + + +/* Unlike bmBufferData, this currently requires the buffer be mapped. + * Consider it a convenience function wrapping multple + * intel_buffer_dword() calls. + */ +void intel_batchbuffer_data(struct intel_batchbuffer *batch, + const void *data, GLuint bytes, + enum cliprect_mode cliprect_mode); + +void intel_batchbuffer_release_space(struct intel_batchbuffer *batch, + GLuint bytes); + +GLboolean intel_batchbuffer_emit_reloc(struct intel_batchbuffer *batch, + dri_bo *buffer, + uint32_t read_domains, + uint32_t write_domain, + uint32_t offset); + +/* Inline functions - might actually be better off with these + * non-inlined. Certainly better off switching all command packets to + * be passed as structs rather than dwords, but that's a little bit of + * work... + */ +static INLINE GLint +intel_batchbuffer_space(struct intel_batchbuffer *batch) +{ + return (batch->size - BATCH_RESERVED) - (batch->ptr - batch->map); +} + + +static INLINE void +intel_batchbuffer_emit_dword(struct intel_batchbuffer *batch, GLuint dword) +{ + assert(batch->map); + assert(intel_batchbuffer_space(batch) >= 4); + *(GLuint *) (batch->ptr) = dword; + batch->ptr += 4; +} + +static INLINE void +intel_batchbuffer_require_space(struct intel_batchbuffer *batch, + GLuint sz, + enum cliprect_mode cliprect_mode) +{ + assert(sz < batch->size - 8); + if (intel_batchbuffer_space(batch) < sz) + intel_batchbuffer_flush(batch); + + if ((cliprect_mode == LOOP_CLIPRECTS || + cliprect_mode == REFERENCES_CLIPRECTS) && + batch->intel->constant_cliprect) + cliprect_mode = NO_LOOP_CLIPRECTS; + + if (cliprect_mode != IGNORE_CLIPRECTS) { + if (batch->cliprect_mode == IGNORE_CLIPRECTS) { + batch->cliprect_mode = cliprect_mode; + } else { + if (batch->cliprect_mode != cliprect_mode) { + intel_batchbuffer_flush(batch); + batch->cliprect_mode = cliprect_mode; + } + } + } +} + +/* Here are the crusty old macros, to be removed: + */ +#define BATCH_LOCALS + +#define BEGIN_BATCH(n, cliprect_mode) do { \ + intel_batchbuffer_require_space(intel->batch, (n)*4, cliprect_mode); \ + assert(intel->batch->emit.start_ptr == NULL); \ + intel->batch->emit.total = (n) * 4; \ + intel->batch->emit.start_ptr = intel->batch->ptr; \ +} while (0) + +#define OUT_BATCH(d) intel_batchbuffer_emit_dword(intel->batch, d) + +#define OUT_RELOC(buf, read_domains, write_domain, delta) do { \ + assert((unsigned) (delta) < buf->size); \ + intel_batchbuffer_emit_reloc(intel->batch, buf, \ + read_domains, write_domain, delta); \ +} while (0) + +#define ADVANCE_BATCH() do { \ + unsigned int _n = intel->batch->ptr - intel->batch->emit.start_ptr; \ + assert(intel->batch->emit.start_ptr != NULL); \ + if (_n != intel->batch->emit.total) { \ + fprintf(stderr, "ADVANCE_BATCH: %d of %d dwords emitted\n", \ + _n, intel->batch->emit.total); \ + abort(); \ + } \ + intel->batch->emit.start_ptr = NULL; \ +} while(0) + + +static INLINE void +intel_batchbuffer_emit_mi_flush(struct intel_batchbuffer *batch) +{ + intel_batchbuffer_require_space(batch, 4, IGNORE_CLIPRECTS); + intel_batchbuffer_emit_dword(batch, MI_FLUSH); +} + +#endif diff --git a/src/gallium/drivers/i965/intel_chipset.h b/src/gallium/drivers/i965/intel_chipset.h new file mode 100644 index 00000000000..3dc8653a735 --- /dev/null +++ b/src/gallium/drivers/i965/intel_chipset.h @@ -0,0 +1,118 @@ +/* + * Copyright © 2007 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <[email protected]> + * + */ + +#define PCI_CHIP_I810 0x7121 +#define PCI_CHIP_I810_DC100 0x7123 +#define PCI_CHIP_I810_E 0x7125 +#define PCI_CHIP_I815 0x1132 + +#define PCI_CHIP_I830_M 0x3577 +#define PCI_CHIP_845_G 0x2562 +#define PCI_CHIP_I855_GM 0x3582 +#define PCI_CHIP_I865_G 0x2572 + +#define PCI_CHIP_I915_G 0x2582 +#define PCI_CHIP_E7221_G 0x258A +#define PCI_CHIP_I915_GM 0x2592 +#define PCI_CHIP_I945_G 0x2772 +#define PCI_CHIP_I945_GM 0x27A2 +#define PCI_CHIP_I945_GME 0x27AE + +#define PCI_CHIP_Q35_G 0x29B2 +#define PCI_CHIP_G33_G 0x29C2 +#define PCI_CHIP_Q33_G 0x29D2 + +#define PCI_CHIP_IGD_GM 0xA011 +#define PCI_CHIP_IGD_G 0xA001 + +#define IS_IGDGM(devid) (devid == PCI_CHIP_IGD_GM) +#define IS_IGDG(devid) (devid == PCI_CHIP_IGD_G) +#define IS_IGD(devid) (IS_IGDG(devid) || IS_IGDGM(devid)) + +#define PCI_CHIP_I965_G 0x29A2 +#define PCI_CHIP_I965_Q 0x2992 +#define PCI_CHIP_I965_G_1 0x2982 +#define PCI_CHIP_I946_GZ 0x2972 +#define PCI_CHIP_I965_GM 0x2A02 +#define PCI_CHIP_I965_GME 0x2A12 + +#define PCI_CHIP_GM45_GM 0x2A42 + +#define PCI_CHIP_IGD_E_G 0x2E02 +#define PCI_CHIP_Q45_G 0x2E12 +#define PCI_CHIP_G45_G 0x2E22 +#define PCI_CHIP_G41_G 0x2E32 +#define PCI_CHIP_B43_G 0x2E42 + +#define PCI_CHIP_ILD_G 0x0042 +#define PCI_CHIP_ILM_G 0x0046 + +#define IS_MOBILE(devid) (devid == PCI_CHIP_I855_GM || \ + devid == PCI_CHIP_I915_GM || \ + devid == PCI_CHIP_I945_GM || \ + devid == PCI_CHIP_I945_GME || \ + devid == PCI_CHIP_I965_GM || \ + devid == PCI_CHIP_I965_GME || \ + devid == PCI_CHIP_GM45_GM || \ + IS_IGD(devid) || \ + devid == PCI_CHIP_ILM_G) + +#define IS_G45(devid) (devid == PCI_CHIP_IGD_E_G || \ + devid == PCI_CHIP_Q45_G || \ + devid == PCI_CHIP_G45_G || \ + devid == PCI_CHIP_G41_G || \ + devid == PCI_CHIP_B43_G) +#define IS_GM45(devid) (devid == PCI_CHIP_GM45_GM) +#define IS_G4X(devid) (IS_G45(devid) || IS_GM45(devid)) + +#define IS_ILD(devid) (devid == PCI_CHIP_ILD_G) +#define IS_ILM(devid) (devid == PCI_CHIP_ILM_G) +#define IS_IGDNG(devid) (IS_ILD(devid) || IS_ILM(devid)) + +#define IS_915(devid) (devid == PCI_CHIP_I915_G || \ + devid == PCI_CHIP_E7221_G || \ + devid == PCI_CHIP_I915_GM) + +#define IS_945(devid) (devid == PCI_CHIP_I945_G || \ + devid == PCI_CHIP_I945_GM || \ + devid == PCI_CHIP_I945_GME || \ + devid == PCI_CHIP_G33_G || \ + devid == PCI_CHIP_Q33_G || \ + devid == PCI_CHIP_Q35_G || IS_IGD(devid)) + +#define IS_965(devid) (devid == PCI_CHIP_I965_G || \ + devid == PCI_CHIP_I965_Q || \ + devid == PCI_CHIP_I965_G_1 || \ + devid == PCI_CHIP_I965_GM || \ + devid == PCI_CHIP_I965_GME || \ + devid == PCI_CHIP_I946_GZ || \ + IS_G4X(devid) || \ + IS_IGDNG(devid)) + +#define IS_9XX(devid) (IS_915(devid) || \ + IS_945(devid) || \ + IS_965(devid)) diff --git a/src/gallium/drivers/i965/intel_structs.h b/src/gallium/drivers/i965/intel_structs.h new file mode 100644 index 00000000000..522e3bd92c2 --- /dev/null +++ b/src/gallium/drivers/i965/intel_structs.h @@ -0,0 +1,132 @@ +#ifndef INTEL_STRUCTS_H +#define INTEL_STRUCTS_H + +struct br0 { + GLuint length:8; + GLuint pad0:3; + GLuint dst_tiled:1; + GLuint pad1:8; + GLuint write_rgb:1; + GLuint write_alpha:1; + GLuint opcode:7; + GLuint client:3; +}; + + +struct br13 { + GLint dest_pitch:16; + GLuint rop:8; + GLuint color_depth:2; + GLuint pad1:3; + GLuint mono_source_transparency:1; + GLuint clipping_enable:1; + GLuint pad0:1; +}; + + + +/* This is an attempt to move some of the 2D interaction in this + * driver to using structs for packets rather than a bunch of #defines + * and dwords. + */ +struct xy_color_blit { + struct br0 br0; + struct br13 br13; + + struct { + GLuint dest_x1:16; + GLuint dest_y1:16; + } dw2; + + struct { + GLuint dest_x2:16; + GLuint dest_y2:16; + } dw3; + + GLuint dest_base_addr; + GLuint color; +}; + +struct xy_src_copy_blit { + struct br0 br0; + struct br13 br13; + + struct { + GLuint dest_x1:16; + GLuint dest_y1:16; + } dw2; + + struct { + GLuint dest_x2:16; + GLuint dest_y2:16; + } dw3; + + GLuint dest_base_addr; + + struct { + GLuint src_x1:16; + GLuint src_y1:16; + } dw5; + + struct { + GLint src_pitch:16; + GLuint pad:16; + } dw6; + + GLuint src_base_addr; +}; + +struct xy_setup_blit { + struct br0 br0; + struct br13 br13; + + struct { + GLuint clip_x1:16; + GLuint clip_y1:16; + } dw2; + + struct { + GLuint clip_x2:16; + GLuint clip_y2:16; + } dw3; + + GLuint dest_base_addr; + GLuint background_color; + GLuint foreground_color; + GLuint pattern_base_addr; +}; + + +struct xy_text_immediate_blit { + struct { + GLuint length:8; + GLuint pad2:3; + GLuint dst_tiled:1; + GLuint pad1:4; + GLuint byte_packed:1; + GLuint pad0:5; + GLuint opcode:7; + GLuint client:3; + } dw0; + + struct { + GLuint dest_x1:16; + GLuint dest_y1:16; + } dw1; + + struct { + GLuint dest_x2:16; + GLuint dest_y2:16; + } dw2; + + /* Src bitmap data follows as inline dwords. + */ +}; + + +#define CLIENT_2D 0x2 +#define OPCODE_XY_SETUP_BLT 0x1 +#define OPCODE_XY_COLOR_BLT 0x50 +#define OPCODE_XY_TEXT_IMMEDIATE_BLT 0x31 + +#endif diff --git a/src/gallium/drivers/i965/intel_tex_format.c b/src/gallium/drivers/i965/intel_tex_format.c new file mode 100644 index 00000000000..3322a711307 --- /dev/null +++ b/src/gallium/drivers/i965/intel_tex_format.c @@ -0,0 +1,225 @@ +#include "intel_context.h" +#include "intel_tex.h" +#include "intel_chipset.h" +#include "main/texformat.h" +#include "main/enums.h" + + +/** + * Choose hardware texture format given the user's glTexImage parameters. + * + * It works out that this function is fine for all the supported + * hardware. However, there is still a need to map the formats onto + * hardware descriptors. + * + * Note that the i915 can actually support many more formats than + * these if we take the step of simply swizzling the colors + * immediately after sampling... + */ +const struct gl_texture_format * +intelChooseTextureFormat(GLcontext * ctx, GLint internalFormat, + GLenum format, GLenum type) +{ + struct intel_context *intel = intel_context(ctx); + const GLboolean do32bpt = (intel->ctx.Visual.rgbBits >= 24); + +#if 0 + printf("%s intFmt=0x%x format=0x%x type=0x%x\n", + __FUNCTION__, internalFormat, format, type); +#endif + + switch (internalFormat) { + case 4: + case GL_RGBA: + case GL_COMPRESSED_RGBA: + if (format == GL_BGRA) { + if (type == GL_UNSIGNED_BYTE || type == GL_UNSIGNED_INT_8_8_8_8_REV) { + return &_mesa_texformat_argb8888; + } + else if (type == GL_UNSIGNED_SHORT_4_4_4_4_REV) { + return &_mesa_texformat_argb4444; + } + else if (type == GL_UNSIGNED_SHORT_1_5_5_5_REV) { + return &_mesa_texformat_argb1555; + } + } + return do32bpt ? &_mesa_texformat_argb8888 : &_mesa_texformat_argb4444; + + case 3: + case GL_RGB: + case GL_COMPRESSED_RGB: + if (format == GL_RGB && type == GL_UNSIGNED_SHORT_5_6_5) { + return &_mesa_texformat_rgb565; + } + return do32bpt ? &_mesa_texformat_argb8888 : &_mesa_texformat_rgb565; + + case GL_RGBA8: + case GL_RGB10_A2: + case GL_RGBA12: + case GL_RGBA16: + return do32bpt ? &_mesa_texformat_argb8888 : &_mesa_texformat_argb4444; + + case GL_RGBA4: + case GL_RGBA2: + return &_mesa_texformat_argb4444; + + case GL_RGB5_A1: + return &_mesa_texformat_argb1555; + + case GL_RGB8: + case GL_RGB10: + case GL_RGB12: + case GL_RGB16: + return &_mesa_texformat_argb8888; + + case GL_RGB5: + case GL_RGB4: + case GL_R3_G3_B2: + return &_mesa_texformat_rgb565; + + case GL_ALPHA: + case GL_ALPHA4: + case GL_ALPHA8: + case GL_ALPHA12: + case GL_ALPHA16: + case GL_COMPRESSED_ALPHA: + return &_mesa_texformat_a8; + + case 1: + case GL_LUMINANCE: + case GL_LUMINANCE4: + case GL_LUMINANCE8: + case GL_LUMINANCE12: + case GL_LUMINANCE16: + case GL_COMPRESSED_LUMINANCE: + return &_mesa_texformat_l8; + + case 2: + case GL_LUMINANCE_ALPHA: + case GL_LUMINANCE4_ALPHA4: + case GL_LUMINANCE6_ALPHA2: + case GL_LUMINANCE8_ALPHA8: + case GL_LUMINANCE12_ALPHA4: + case GL_LUMINANCE12_ALPHA12: + case GL_LUMINANCE16_ALPHA16: + case GL_COMPRESSED_LUMINANCE_ALPHA: + return &_mesa_texformat_al88; + + case GL_INTENSITY: + case GL_INTENSITY4: + case GL_INTENSITY8: + case GL_INTENSITY12: + case GL_INTENSITY16: + case GL_COMPRESSED_INTENSITY: + return &_mesa_texformat_i8; + + case GL_YCBCR_MESA: + if (type == GL_UNSIGNED_SHORT_8_8_MESA || type == GL_UNSIGNED_BYTE) + return &_mesa_texformat_ycbcr; + else + return &_mesa_texformat_ycbcr_rev; + + case GL_COMPRESSED_RGB_FXT1_3DFX: + return &_mesa_texformat_rgb_fxt1; + case GL_COMPRESSED_RGBA_FXT1_3DFX: + return &_mesa_texformat_rgba_fxt1; + + case GL_RGB_S3TC: + case GL_RGB4_S3TC: + case GL_COMPRESSED_RGB_S3TC_DXT1_EXT: + return &_mesa_texformat_rgb_dxt1; + + case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT: + return &_mesa_texformat_rgba_dxt1; + + case GL_RGBA_S3TC: + case GL_RGBA4_S3TC: + case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT: + return &_mesa_texformat_rgba_dxt3; + + case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT: + return &_mesa_texformat_rgba_dxt5; + + case GL_DEPTH_COMPONENT: + case GL_DEPTH_COMPONENT16: + case GL_DEPTH_COMPONENT24: + case GL_DEPTH_COMPONENT32: +#if 0 + return &_mesa_texformat_z16; +#else + /* fall-through. + * 16bpp depth texture can't be paired with a stencil buffer so + * always used combined depth/stencil format. + */ +#endif + case GL_DEPTH_STENCIL_EXT: + case GL_DEPTH24_STENCIL8_EXT: + return &_mesa_texformat_s8_z24; + +#ifndef I915 + case GL_SRGB_EXT: + case GL_SRGB8_EXT: + case GL_SRGB_ALPHA_EXT: + case GL_SRGB8_ALPHA8_EXT: + case GL_COMPRESSED_SRGB_EXT: + case GL_COMPRESSED_SRGB_ALPHA_EXT: + case GL_COMPRESSED_SLUMINANCE_EXT: + case GL_COMPRESSED_SLUMINANCE_ALPHA_EXT: + return &_mesa_texformat_sargb8; + case GL_SLUMINANCE_EXT: + case GL_SLUMINANCE8_EXT: + if (IS_G4X(intel->intelScreen->deviceID)) + return &_mesa_texformat_sl8; + else + return &_mesa_texformat_sargb8; + case GL_SLUMINANCE_ALPHA_EXT: + case GL_SLUMINANCE8_ALPHA8_EXT: + if (IS_G4X(intel->intelScreen->deviceID)) + return &_mesa_texformat_sla8; + else + return &_mesa_texformat_sargb8; + case GL_COMPRESSED_SRGB_S3TC_DXT1_EXT: + case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT: + case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT: + case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT: + return &_mesa_texformat_srgb_dxt1; + + /* i915 could also do this */ + case GL_DUDV_ATI: + case GL_DU8DV8_ATI: + return &_mesa_texformat_dudv8; + case GL_RGBA_SNORM: + case GL_RGBA8_SNORM: + return &_mesa_texformat_signed_rgba8888_rev; +#endif + + default: + fprintf(stderr, "unexpected texture format %s in %s\n", + _mesa_lookup_enum_by_nr(internalFormat), __FUNCTION__); + return NULL; + } + + return NULL; /* never get here */ +} + +int intel_compressed_num_bytes(GLuint mesaFormat) +{ + int bytes = 0; + switch(mesaFormat) { + + case MESA_FORMAT_RGB_FXT1: + case MESA_FORMAT_RGBA_FXT1: + case MESA_FORMAT_RGB_DXT1: + case MESA_FORMAT_RGBA_DXT1: + bytes = 2; + break; + + case MESA_FORMAT_RGBA_DXT3: + case MESA_FORMAT_RGBA_DXT5: + bytes = 4; + default: + break; + } + + return bytes; +} diff --git a/src/gallium/drivers/i965/intel_tex_layout.c b/src/gallium/drivers/i965/intel_tex_layout.c new file mode 100644 index 00000000000..7d69ea4484a --- /dev/null +++ b/src/gallium/drivers/i965/intel_tex_layout.c @@ -0,0 +1,140 @@ +/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + /* + * Authors: + * Keith Whitwell <[email protected]> + * Michel Dänzer <[email protected]> + */ + +#include "intel_mipmap_tree.h" +#include "intel_tex_layout.h" +#include "intel_context.h" +#include "main/macros.h" + +void intel_get_texture_alignment_unit(GLenum internalFormat, GLuint *w, GLuint *h) +{ + switch (internalFormat) { + case GL_COMPRESSED_RGB_FXT1_3DFX: + case GL_COMPRESSED_RGBA_FXT1_3DFX: + *w = 8; + *h = 4; + break; + + case GL_RGB_S3TC: + case GL_RGB4_S3TC: + case GL_COMPRESSED_RGB_S3TC_DXT1_EXT: + case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT: + case GL_RGBA_S3TC: + case GL_RGBA4_S3TC: + case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT: + case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT: + *w = 4; + *h = 4; + break; + + default: + *w = 4; + *h = 2; + break; + } +} + +void i945_miptree_layout_2d( struct intel_context *intel, + struct intel_mipmap_tree *mt, + uint32_t tiling ) +{ + GLuint align_h = 2, align_w = 4; + GLuint level; + GLuint x = 0; + GLuint y = 0; + GLuint width = mt->width0; + GLuint height = mt->height0; + + mt->pitch = mt->width0; + intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h); + + if (mt->compressed) { + mt->pitch = ALIGN(mt->width0, align_w); + } + + /* May need to adjust pitch to accomodate the placement of + * the 2nd mipmap. This occurs when the alignment + * constraints of mipmap placement push the right edge of the + * 2nd mipmap out past the width of its parent. + */ + if (mt->first_level != mt->last_level) { + GLuint mip1_width; + + if (mt->compressed) { + mip1_width = ALIGN(minify(mt->width0), align_w) + + ALIGN(minify(minify(mt->width0)), align_w); + } else { + mip1_width = ALIGN(minify(mt->width0), align_w) + + minify(minify(mt->width0)); + } + + if (mip1_width > mt->pitch) { + mt->pitch = mip1_width; + } + } + + /* Pitch must be a whole number of dwords, even though we + * express it in texels. + */ + mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->pitch); + mt->total_height = 0; + + for ( level = mt->first_level ; level <= mt->last_level ; level++ ) { + GLuint img_height; + + intel_miptree_set_level_info(mt, level, 1, x, y, width, + height, 1); + + if (mt->compressed) + img_height = MAX2(1, height/4); + else + img_height = ALIGN(height, align_h); + + + /* Because the images are packed better, the final offset + * might not be the maximal one: + */ + mt->total_height = MAX2(mt->total_height, y + img_height); + + /* Layout_below: step right after second mipmap. + */ + if (level == mt->first_level + 1) { + x += ALIGN(width, align_w); + } + else { + y += img_height; + } + + width = minify(width); + height = minify(height); + } +} |