diff options
author | Eric Anholt <[email protected]> | 2010-07-26 17:47:59 -0700 |
---|---|---|
committer | Eric Anholt <[email protected]> | 2010-07-26 17:53:27 -0700 |
commit | afe125e0a18ac3886c45c7e6b02b122fb2d327b5 (patch) | |
tree | 78621707e71154c0b388b0baacffc26432b7e992 /src/mesa/drivers/dri | |
parent | d64343f1ae84979bd154475badf11af8a9bfc2eb (diff) | |
parent | 5403ca79b225605c79f49866a6497c97da53be3b (diff) |
Merge remote branch 'origin/master' into glsl2
This pulls in multiple i965 driver fixes which will help ensure better
testing coverage during development, and also gets past the conflicts
of the src/mesa/shader -> src/mesa/program move.
Conflicts:
src/mesa/Makefile
src/mesa/main/shaderapi.c
src/mesa/main/shaderobj.h
Diffstat (limited to 'src/mesa/drivers/dri')
97 files changed, 2308 insertions, 835 deletions
diff --git a/src/mesa/drivers/dri/common/dri_metaops.c b/src/mesa/drivers/dri/common/dri_metaops.c index dfb7d640409..86e59a8e51c 100644 --- a/src/mesa/drivers/dri/common/dri_metaops.c +++ b/src/mesa/drivers/dri/common/dri_metaops.c @@ -26,6 +26,7 @@ * **************************************************************************/ +#include "main/arbprogram.h" #include "main/arrayobj.h" #include "main/bufferobj.h" #include "main/enable.h" @@ -33,8 +34,7 @@ #include "main/texstate.h" #include "main/varray.h" #include "main/viewport.h" -#include "shader/arbprogram.h" -#include "shader/program.h" +#include "program/program.h" #include "dri_metaops.h" void diff --git a/src/mesa/drivers/dri/common/dri_util.c b/src/mesa/drivers/dri/common/dri_util.c index 18b9035248f..dce84ef0deb 100644 --- a/src/mesa/drivers/dri/common/dri_util.c +++ b/src/mesa/drivers/dri/common/dri_util.c @@ -927,41 +927,6 @@ const __DRI2configQueryExtension dri2ConfigQueryExtension = { dri2ConfigQueryf, }; -static int -driFrameTracking(__DRIdrawable *drawable, GLboolean enable) -{ - return GLX_BAD_CONTEXT; -} - -static int -driQueryFrameTracking(__DRIdrawable *dpriv, - int64_t * sbc, int64_t * missedFrames, - float * lastMissedUsage, float * usage) -{ - __DRIswapInfo sInfo; - int status; - int64_t ust; - __DRIscreen *psp = dpriv->driScreenPriv; - - status = dpriv->driScreenPriv->DriverAPI.GetSwapInfo( dpriv, & sInfo ); - if ( status == 0 ) { - *sbc = sInfo.swap_count; - *missedFrames = sInfo.swap_missed_count; - *lastMissedUsage = sInfo.swap_missed_usage; - - (*psp->systemTime->getUST)( & ust ); - *usage = driCalculateSwapUsage( dpriv, sInfo.swap_ust, ust ); - } - - return status; -} - -const __DRIframeTrackingExtension driFrameTrackingExtension = { - { __DRI_FRAME_TRACKING, __DRI_FRAME_TRACKING_VERSION }, - driFrameTracking, - driQueryFrameTracking -}; - /** * Calculate amount of swap interval used between GLX buffer swaps. * diff --git a/src/mesa/drivers/dri/common/dri_util.h b/src/mesa/drivers/dri/common/dri_util.h index e4c590b1322..bc647ff8130 100644 --- a/src/mesa/drivers/dri/common/dri_util.h +++ b/src/mesa/drivers/dri/common/dri_util.h @@ -70,7 +70,6 @@ extern const __DRIdri2Extension driDRI2Extension; extern const __DRIextension driReadDrawableExtension; extern const __DRIcopySubBufferExtension driCopySubBufferExtension; extern const __DRIswapControlExtension driSwapControlExtension; -extern const __DRIframeTrackingExtension driFrameTrackingExtension; extern const __DRImediaStreamCounterExtension driMediaStreamCounterExtension; extern const __DRI2configQueryExtension dri2ConfigQueryExtension; diff --git a/src/mesa/drivers/dri/i915/i915_fragprog.c b/src/mesa/drivers/dri/i915/i915_fragprog.c index e60157f3777..f1505dc5e73 100644 --- a/src/mesa/drivers/dri/i915/i915_fragprog.c +++ b/src/mesa/drivers/dri/i915/i915_fragprog.c @@ -29,11 +29,11 @@ #include "main/macros.h" #include "main/enums.h" -#include "shader/prog_instruction.h" -#include "shader/prog_parameter.h" -#include "shader/program.h" -#include "shader/programopt.h" -#include "shader/prog_print.h" +#include "program/prog_instruction.h" +#include "program/prog_parameter.h" +#include "program/program.h" +#include "program/programopt.h" +#include "program/prog_print.h" #include "tnl/tnl.h" #include "tnl/t_context.h" diff --git a/src/mesa/drivers/dri/i965/brw_clip.c b/src/mesa/drivers/dri/i965/brw_clip.c index 228ee3f3be1..a1e9dae9154 100644 --- a/src/mesa/drivers/dri/i965/brw_clip.c +++ b/src/mesa/drivers/dri/i965/brw_clip.c @@ -55,6 +55,7 @@ static void compile_clip_prog( struct brw_context *brw, GLuint program_size; GLuint delta; GLuint i; + GLuint header_regs; memset(&c, 0, sizeof(c)); @@ -72,22 +73,28 @@ static void compile_clip_prog( struct brw_context *brw, c.header_position_offset = ATTR_SIZE; if (intel->gen == 5) - delta = 3 * REG_SIZE; + header_regs = 3; else - delta = REG_SIZE; + header_regs = 1; - for (i = 0; i < VERT_RESULT_MAX; i++) + delta = header_regs * REG_SIZE; + + for (i = 0; i < VERT_RESULT_MAX; i++) { if (c.key.attrs & BITFIELD64_BIT(i)) { c.offset[i] = delta; delta += ATTR_SIZE; + + c.idx_to_attr[c.nr_attrs] = i; + c.nr_attrs++; } + } - c.nr_attrs = brw_count_bits(c.key.attrs); - - if (intel->gen == 5) - c.nr_regs = (c.nr_attrs + 1) / 2 + 3; /* are vertices packed, or reg-aligned? */ - else - c.nr_regs = (c.nr_attrs + 1) / 2 + 1; /* are vertices packed, or reg-aligned? */ + /* The vertex attributes start at a URB row-aligned offset after + * the 8-20 dword vertex header, and continue for a URB row-aligned + * length. nr_regs determines the urb_read_length from the start + * of the header to the end of the vertex data. + */ + c.nr_regs = header_regs + (c.nr_attrs + 1) / 2; c.nr_bytes = c.nr_regs * REG_SIZE; diff --git a/src/mesa/drivers/dri/i965/brw_clip.h b/src/mesa/drivers/dri/i965/brw_clip.h index 68222c6c278..3a8cd7bf390 100644 --- a/src/mesa/drivers/dri/i965/brw_clip.h +++ b/src/mesa/drivers/dri/i965/brw_clip.h @@ -115,7 +115,10 @@ struct brw_clip_compile { GLboolean need_direction; GLuint header_position_offset; - GLuint offset[VERT_ATTRIB_MAX]; + /** Mapping from VERT_RESULT_* to offset within the VUE. */ + GLuint offset[VERT_RESULT_MAX]; + /** Mapping from attribute index to VERT_RESULT_* */ + GLuint idx_to_attr[VERT_RESULT_MAX]; }; #define ATTR_SIZE (4*4) diff --git a/src/mesa/drivers/dri/i965/brw_clip_line.c b/src/mesa/drivers/dri/i965/brw_clip_line.c index ceb62a31162..4b9117bb0b1 100644 --- a/src/mesa/drivers/dri/i965/brw_clip_line.c +++ b/src/mesa/drivers/dri/i965/brw_clip_line.c @@ -32,7 +32,7 @@ #include "main/glheader.h" #include "main/macros.h" #include "main/enums.h" -#include "shader/program.h" +#include "program/program.h" #include "intel_batchbuffer.h" diff --git a/src/mesa/drivers/dri/i965/brw_clip_point.c b/src/mesa/drivers/dri/i965/brw_clip_point.c index 7f47634dca8..b994a32bc3b 100644 --- a/src/mesa/drivers/dri/i965/brw_clip_point.c +++ b/src/mesa/drivers/dri/i965/brw_clip_point.c @@ -32,7 +32,7 @@ #include "main/glheader.h" #include "main/macros.h" #include "main/enums.h" -#include "shader/program.h" +#include "program/program.h" #include "intel_batchbuffer.h" diff --git a/src/mesa/drivers/dri/i965/brw_clip_tri.c b/src/mesa/drivers/dri/i965/brw_clip_tri.c index 916a99ea004..cb58d1da9fe 100644 --- a/src/mesa/drivers/dri/i965/brw_clip_tri.c +++ b/src/mesa/drivers/dri/i965/brw_clip_tri.c @@ -32,7 +32,7 @@ #include "main/glheader.h" #include "main/macros.h" #include "main/enums.h" -#include "shader/program.h" +#include "program/program.h" #include "intel_batchbuffer.h" @@ -76,10 +76,7 @@ void brw_clip_tri_alloc_regs( struct brw_clip_compile *c, if (c->nr_attrs & 1) { for (j = 0; j < 3; j++) { - GLuint delta = c->nr_attrs*16 + 32; - - if (intel->gen == 5) - delta = c->nr_attrs * 16 + 32 * 3; + GLuint delta = c->offset[c->idx_to_attr[c->nr_attrs - 1]] + ATTR_SIZE; brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0)); } diff --git a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c b/src/mesa/drivers/dri/i965/brw_clip_unfilled.c index f36d22fdbf8..afd93f8be0b 100644 --- a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c +++ b/src/mesa/drivers/dri/i965/brw_clip_unfilled.c @@ -32,7 +32,7 @@ #include "main/glheader.h" #include "main/macros.h" #include "main/enums.h" -#include "shader/program.h" +#include "program/program.h" #include "intel_batchbuffer.h" diff --git a/src/mesa/drivers/dri/i965/brw_clip_util.c b/src/mesa/drivers/dri/i965/brw_clip_util.c index 2148bc8244a..d2ac1235e46 100644 --- a/src/mesa/drivers/dri/i965/brw_clip_util.c +++ b/src/mesa/drivers/dri/i965/brw_clip_util.c @@ -33,7 +33,7 @@ #include "main/glheader.h" #include "main/macros.h" #include "main/enums.h" -#include "shader/program.h" +#include "program/program.h" #include "intel_batchbuffer.h" @@ -134,7 +134,6 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c, GLboolean force_edgeflag) { struct brw_compile *p = &c->func; - struct intel_context *intel = &p->brw->intel; struct brw_reg tmp = get_tmp(c); GLuint i; @@ -149,12 +148,9 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c, /* Iterate over each attribute (could be done in pairs?) */ for (i = 0; i < c->nr_attrs; i++) { - GLuint delta = i*16 + 32; + GLuint delta = c->offset[c->idx_to_attr[i]]; - if (intel->gen == 5) - delta = i * 16 + 32 * 3; - - if (delta == c->offset[VERT_RESULT_EDGE]) { + if (c->idx_to_attr[i] == VERT_RESULT_EDGE) { if (force_edgeflag) brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(1)); else @@ -183,10 +179,7 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c, } if (i & 1) { - GLuint delta = i*16 + 32; - - if (intel->gen == 5) - delta = i * 16 + 32 * 3; + GLuint delta = c->offset[c->idx_to_attr[c->nr_attrs - 1]] + ATTR_SIZE; brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0)); } @@ -199,11 +192,6 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c, brw_clip_project_vertex(c, dest_ptr ); } - - - -#define MAX_MRF 16 - void brw_clip_emit_vue(struct brw_clip_compile *c, struct brw_indirect vert, GLboolean allocate, diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index d13b9ae298b..6d064b822e5 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -34,7 +34,6 @@ #include "main/api_noop.h" #include "main/macros.h" #include "main/simple_list.h" - #include "brw_context.h" #include "brw_defines.h" #include "brw_draw.h" diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c index 6c0b79f7241..8196d8ca625 100644 --- a/src/mesa/drivers/dri/i965/brw_curbe.c +++ b/src/mesa/drivers/dri/i965/brw_curbe.c @@ -35,9 +35,9 @@ #include "main/context.h" #include "main/macros.h" #include "main/enums.h" -#include "shader/prog_parameter.h" -#include "shader/prog_print.h" -#include "shader/prog_statevars.h" +#include "program/prog_parameter.h" +#include "program/prog_print.h" +#include "program/prog_statevars.h" #include "intel_batchbuffer.h" #include "intel_regions.h" #include "brw_context.h" diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 39bf5b63fc2..f7a68cead7c 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -501,6 +501,10 @@ #define BRW_MASK_ENABLE 0 #define BRW_MASK_DISABLE 1 +/* Sandybridge is WECtrl (Write enable control) */ +#define BRW_WE_NORMAL 0 +#define BRW_WE_KILL_PRED 1 + #define BRW_OPCODE_MOV 1 #define BRW_OPCODE_SEL 2 #define BRW_OPCODE_NOT 4 @@ -600,6 +604,8 @@ #define BRW_ARF_NOTIFICATION_COUNT 0x90 #define BRW_ARF_IP 0xA0 +#define BRW_MRF_COMPR4 (1 << 7) + #define BRW_AMASK 0 #define BRW_IMASK 1 #define BRW_LMASK 2 @@ -646,13 +652,14 @@ #define BRW_POLYGON_FACING_BACK 1 #define BRW_MESSAGE_TARGET_NULL 0 -#define BRW_MESSAGE_TARGET_MATH 1 +#define BRW_MESSAGE_TARGET_MATH 1 /* reserved on GEN6 */ #define BRW_MESSAGE_TARGET_SAMPLER 2 #define BRW_MESSAGE_TARGET_GATEWAY 3 -#define BRW_MESSAGE_TARGET_DATAPORT_READ 4 -#define BRW_MESSAGE_TARGET_DATAPORT_WRITE 5 +#define BRW_MESSAGE_TARGET_DATAPORT_READ 4 /* sampler cache on GEN6 */ +#define BRW_MESSAGE_TARGET_DATAPORT_WRITE 5 /* render cache on Gen6 */ #define BRW_MESSAGE_TARGET_URB 6 #define BRW_MESSAGE_TARGET_THREAD_SPAWNER 7 +#define BRW_MESSAGE_TARGET_CONST_CACHE 9 /* GEN6 */ #define BRW_SAMPLER_RETURN_FORMAT_FLOAT32 0 #define BRW_SAMPLER_RETURN_FORMAT_UINT32 2 @@ -698,10 +705,24 @@ #define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS 2 #define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS 3 +/* This one stays the same across generations. */ #define BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ 0 +/* GEN4 */ #define BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 1 -#define BRW_DATAPORT_READ_MESSAGE_DWORD_BLOCK_READ 2 +#define BRW_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ 2 #define BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 3 +/* G45, GEN5 */ +#define G45_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ 1 +#define G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 2 +#define G45_DATAPORT_READ_MESSAGE_AVC_LOOP_FILTER_READ 3 +#define G45_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ 4 +#define G45_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 6 +/* GEN6 */ +#define GEN6_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ 1 +#define GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 2 +#define GEN6_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ 4 +#define GEN6_DATAPORT_READ_MESSAGE_OWORD_UNALIGN_BLOCK_READ 5 +#define GEN6_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 6 #define BRW_DATAPORT_READ_TARGET_DATA_CACHE 0 #define BRW_DATAPORT_READ_TARGET_RENDER_CACHE 1 @@ -721,6 +742,16 @@ #define BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VERTEX_BUFFER_WRITE 5 #define BRW_DATAPORT_WRITE_MESSAGE_FLUSH_RENDER_CACHE 7 +/* GEN6 */ +#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_ATOMIC_WRITE_GEN6 7 +#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE_GEN6 8 +#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE_GEN6 9 +#define BRW_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE_GEN6 10 +#define BRW_DATAPORT_WRITE_MESSAGE_DWORLD_SCATTERED_WRITE_GEN6 11 +#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE_GEN6 12 +#define BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE_GEN6 13 +#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_UNORM_WRITE_GEN6 14 + #define BRW_MATH_FUNCTION_INV 1 #define BRW_MATH_FUNCTION_LOG 2 #define BRW_MATH_FUNCTION_EXP 3 diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c index ff12daf497d..d2307145361 100644 --- a/src/mesa/drivers/dri/i965/brw_disasm.c +++ b/src/mesa/drivers/dri/i965/brw_disasm.c @@ -598,7 +598,7 @@ static int src_da16 (FILE *file, format (file, ".%d", _subreg_nr); string (file, "<"); err |= control (file, "vert stride", vert_stride, _vert_stride, NULL); - string (file, ",1,1>"); + string (file, ",4,1>"); err |= control (file, "src da16 reg type", reg_encoding, _reg_type, NULL); /* * Three kinds of swizzle display: @@ -836,10 +836,12 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen) if (inst->header.opcode == BRW_OPCODE_SEND) { int target; - if (gen >= 5) - target = inst->bits2.send_gen5.sfid; + if (gen >= 6) + target = inst->header.destreg__conditionalmod; + else if (gen == 5) + target = inst->bits2.send_gen5.sfid; else - target = inst->bits3.generic.msg_target; + target = inst->bits3.generic.msg_target; newline (file); pad (file, 16); @@ -868,13 +870,44 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen) inst->bits3.sampler.return_format, NULL); string (file, ")"); break; + case BRW_MESSAGE_TARGET_DATAPORT_READ: + if (gen >= 6) { + format (file, " (%d, %d, %d, %d, %d, %d)", + inst->bits3.dp_render_cache.binding_table_index, + inst->bits3.dp_render_cache.msg_control, + inst->bits3.dp_render_cache.msg_type, + inst->bits3.dp_render_cache.send_commit_msg, + inst->bits3.dp_render_cache.msg_length, + inst->bits3.dp_render_cache.response_length); + } else if (gen >= 5) { + format (file, " (%d, %d, %d)", + inst->bits3.dp_read_gen5.binding_table_index, + inst->bits3.dp_read_gen5.msg_control, + inst->bits3.dp_read_gen5.msg_type); + } else { + format (file, " (%d, %d, %d)", + inst->bits3.dp_read.binding_table_index, + inst->bits3.dp_read.msg_control, + inst->bits3.dp_read.msg_type); + } + break; case BRW_MESSAGE_TARGET_DATAPORT_WRITE: - format (file, " (%d, %d, %d, %d)", - inst->bits3.dp_write.binding_table_index, - (inst->bits3.dp_write.pixel_scoreboard_clear << 3) | - inst->bits3.dp_write.msg_control, - inst->bits3.dp_write.msg_type, - inst->bits3.dp_write.send_commit_msg); + if (gen >= 6) { + format (file, " (%d, %d, %d, %d, %d, %d)", + inst->bits3.dp_render_cache.binding_table_index, + inst->bits3.dp_render_cache.msg_control, + inst->bits3.dp_render_cache.msg_type, + inst->bits3.dp_render_cache.send_commit_msg, + inst->bits3.dp_render_cache.msg_length, + inst->bits3.dp_render_cache.response_length); + } else { + format (file, " (%d, %d, %d, %d)", + inst->bits3.dp_write.binding_table_index, + (inst->bits3.dp_write.pixel_scoreboard_clear << 3) | + inst->bits3.dp_write.msg_control, + inst->bits3.dp_write.msg_type, + inst->bits3.dp_write.send_commit_msg); + } break; case BRW_MESSAGE_TARGET_URB: if (gen >= 5) { @@ -900,15 +933,22 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen) case BRW_MESSAGE_TARGET_THREAD_SPAWNER: break; default: - format (file, "unsupported target %d", inst->bits3.generic.msg_target); + format (file, "unsupported target %d", target); break; } if (space) string (file, " "); - format (file, "mlen %d", - inst->bits3.generic.msg_length); - format (file, " rlen %d", - inst->bits3.generic.response_length); + if (gen >= 5) { + format (file, "mlen %d", + inst->bits3.generic_gen5.msg_length); + format (file, " rlen %d", + inst->bits3.generic_gen5.response_length); + } else { + format (file, "mlen %d", + inst->bits3.generic.msg_length); + format (file, " rlen %d", + inst->bits3.generic.response_length); + } } pad (file, 64); if (inst->header.opcode != BRW_OPCODE_NOP) { diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h index 3a32ad26c12..ffdddd0a388 100644 --- a/src/mesa/drivers/dri/i965/brw_eu.h +++ b/src/mesa/drivers/dri/i965/brw_eu.h @@ -35,7 +35,7 @@ #include "brw_structs.h" #include "brw_defines.h" -#include "shader/prog_instruction.h" +#include "program/prog_instruction.h" #define BRW_SWIZZLE4(a,b,c,d) (((a)<<0) | ((b)<<2) | ((c)<<4) | ((d)<<6)) #define BRW_GET_SWZ(swz, idx) (((swz) >> ((idx)*2)) & 0x3) @@ -520,6 +520,20 @@ static INLINE struct brw_reg brw_acc_reg( void ) 0); } +static INLINE struct brw_reg brw_notification_1_reg(void) +{ + + return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_NOTIFICATION_COUNT, + 1, + BRW_REGISTER_TYPE_UD, + BRW_VERTICAL_STRIDE_0, + BRW_WIDTH_1, + BRW_HORIZONTAL_STRIDE_0, + BRW_SWIZZLE_XXXX, + WRITEMASK_X); +} + static INLINE struct brw_reg brw_flag_reg( void ) { @@ -877,12 +891,15 @@ void brw_dp_READ_4( struct brw_compile *p, void brw_dp_READ_4_vs( struct brw_compile *p, struct brw_reg dest, - GLuint oword, - GLboolean relAddr, - struct brw_reg addrReg, GLuint location, GLuint bind_table_index ); +void brw_dp_READ_4_vs_relative(struct brw_compile *p, + struct brw_reg dest, + struct brw_reg addrReg, + GLuint offset, + GLuint bind_table_index); + void brw_dp_WRITE_16( struct brw_compile *p, struct brw_reg src, GLuint scratch_offset ); @@ -919,6 +936,8 @@ void brw_land_fwd_jump(struct brw_compile *p, void brw_NOP(struct brw_compile *p); +void brw_WAIT(struct brw_compile *p); + /* Special case: there is never a destination, execution size will be * taken from src0: */ @@ -965,5 +984,7 @@ void brw_set_src1( struct brw_instruction *insn, /* brw_optimize.c */ void brw_optimize(struct brw_compile *p); +void brw_remove_duplicate_mrf_moves(struct brw_compile *p); +void brw_remove_grf_to_mrf_moves(struct brw_compile *p); #endif diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c index 34dfe10cb93..0d5d17f501d 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c @@ -364,7 +364,8 @@ static void brw_set_dp_write_message( struct brw_context *brw, GLuint msg_length, GLuint pixel_scoreboard_clear, GLuint response_length, - GLuint end_of_thread ) + GLuint end_of_thread, + GLuint send_commit_msg) { struct intel_context *intel = &brw->intel; brw_set_src1(insn, brw_imm_d(0)); @@ -374,7 +375,7 @@ static void brw_set_dp_write_message( struct brw_context *brw, insn->bits3.dp_write_gen5.msg_control = msg_control; insn->bits3.dp_write_gen5.pixel_scoreboard_clear = pixel_scoreboard_clear; insn->bits3.dp_write_gen5.msg_type = msg_type; - insn->bits3.dp_write_gen5.send_commit_msg = 0; + insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg; insn->bits3.dp_write_gen5.header_present = 1; insn->bits3.dp_write_gen5.response_length = response_length; insn->bits3.dp_write_gen5.msg_length = msg_length; @@ -386,7 +387,7 @@ static void brw_set_dp_write_message( struct brw_context *brw, insn->bits3.dp_write.msg_control = msg_control; insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear; insn->bits3.dp_write.msg_type = msg_type; - insn->bits3.dp_write.send_commit_msg = 0; + insn->bits3.dp_write.send_commit_msg = send_commit_msg; insn->bits3.dp_write.response_length = response_length; insn->bits3.dp_write.msg_length = msg_length; insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE; @@ -906,6 +907,20 @@ void brw_CMP(struct brw_compile *p, } } +/* Issue 'wait' instruction for n1, host could program MMIO + to wake up thread. */ +void brw_WAIT (struct brw_compile *p) +{ + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT); + struct brw_reg src = brw_notification_1_reg(); + + brw_set_dest(insn, src); + brw_set_src0(insn, src); + brw_set_src1(insn, brw_null_reg()); + insn->header.execution_size = 0; /* must */ + insn->header.predicate_control = 0; + insn->header.compression_control = 0; +} /*********************************************************************** @@ -1040,6 +1055,7 @@ void brw_dp_WRITE_16( struct brw_compile *p, struct brw_reg src, GLuint scratch_offset ) { + struct intel_context *intel = &p->brw->intel; GLuint msg_reg_nr = 1; { brw_push_insn_state(p); @@ -1056,13 +1072,32 @@ void brw_dp_WRITE_16( struct brw_compile *p, { GLuint msg_length = 3; - struct brw_reg dest = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW); + struct brw_reg dest; struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); - + int send_commit_msg; + insn->header.predicate_control = 0; /* XXX */ insn->header.compression_control = BRW_COMPRESSION_NONE; insn->header.destreg__conditionalmod = msg_reg_nr; - + + /* Until gen6, writes followed by reads from the same location + * are not guaranteed to be ordered unless write_commit is set. + * If set, then a no-op write is issued to the destination + * register to set a dependency, and a read from the destination + * can be used to ensure the ordering. + * + * For gen6, only writes between different threads need ordering + * protection. Our use of DP writes is all about register + * spilling within a thread. + */ + if (intel->gen >= 6) { + dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW); + send_commit_msg = 0; + } else { + dest = brw_uw16_grf(0, 0); + send_commit_msg = 1; + } + brw_set_dest(insn, dest); brw_set_src0(insn, src); @@ -1073,8 +1108,9 @@ void brw_dp_WRITE_16( struct brw_compile *p, BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */ msg_length, 0, /* pixel scoreboard */ - 0, /* response_length */ - 0); /* eot */ + send_commit_msg, /* response_length */ + 0, /* eot */ + send_commit_msg); } } @@ -1115,7 +1151,7 @@ void brw_dp_READ_16( struct brw_compile *p, brw_set_dp_read_message(p->brw, insn, 255, /* binding table index (255=stateless) */ - 3, /* msg_control (3 means 4 Owords) */ + BRW_DATAPORT_OWORD_BLOCK_4_OWORDS, BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */ 1, /* target cache (render/scratch) */ 1, /* msg_length */ @@ -1190,68 +1226,107 @@ void brw_dp_READ_4( struct brw_compile *p, */ void brw_dp_READ_4_vs(struct brw_compile *p, struct brw_reg dest, - GLuint oword, - GLboolean relAddr, - struct brw_reg addrReg, GLuint location, GLuint bind_table_index) { + struct brw_instruction *insn; GLuint msg_reg_nr = 1; + struct brw_reg b; - assert(oword < 2); /* printf("vs const read msg, location %u, msg_reg_nr %d\n", location, msg_reg_nr); */ /* Setup MRF[1] with location/offset into const buffer */ - { - struct brw_reg b; + brw_push_insn_state(p); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); - brw_push_insn_state(p); - brw_set_compression_control(p, BRW_COMPRESSION_NONE); - brw_set_mask_control(p, BRW_MASK_DISABLE); - brw_set_predicate_control(p, BRW_PREDICATE_NONE); - /*brw_set_access_mode(p, BRW_ALIGN_16);*/ + /* XXX I think we're setting all the dwords of MRF[1] to 'location'. + * when the docs say only dword[2] should be set. Hmmm. But it works. + */ + b = brw_message_reg(msg_reg_nr); + b = retype(b, BRW_REGISTER_TYPE_UD); + /*b = get_element_ud(b, 2);*/ + brw_MOV(p, b, brw_imm_ud(location)); - /* XXX I think we're setting all the dwords of MRF[1] to 'location'. - * when the docs say only dword[2] should be set. Hmmm. But it works. - */ - b = brw_message_reg(msg_reg_nr); - b = retype(b, BRW_REGISTER_TYPE_UD); - /*b = get_element_ud(b, 2);*/ - if (relAddr) { - brw_ADD(p, b, addrReg, brw_imm_ud(location)); - } - else { - brw_MOV(p, b, brw_imm_ud(location)); - } + brw_pop_insn_state(p); - brw_pop_insn_state(p); - } + insn = next_insn(p, BRW_OPCODE_SEND); - { - struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); - - insn->header.predicate_control = BRW_PREDICATE_NONE; - insn->header.compression_control = BRW_COMPRESSION_NONE; - insn->header.destreg__conditionalmod = msg_reg_nr; - insn->header.mask_control = BRW_MASK_DISABLE; - /*insn->header.access_mode = BRW_ALIGN_16;*/ - - brw_set_dest(insn, dest); - brw_set_src0(insn, brw_null_reg()); + insn->header.predicate_control = BRW_PREDICATE_NONE; + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.destreg__conditionalmod = msg_reg_nr; + insn->header.mask_control = BRW_MASK_DISABLE; - brw_set_dp_read_message(p->brw, - insn, - bind_table_index, - oword, /* 0 = lower Oword, 1 = upper Oword */ - BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */ - 0, /* source cache = data cache */ - 1, /* msg_length */ - 1, /* response_length (1 Oword) */ - 0); /* eot */ - } + brw_set_dest(insn, dest); + brw_set_src0(insn, brw_null_reg()); + + brw_set_dp_read_message(p->brw, + insn, + bind_table_index, + 0, + BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */ + 0, /* source cache = data cache */ + 1, /* msg_length */ + 1, /* response_length (1 Oword) */ + 0); /* eot */ +} + +/** + * Read a float[4] constant per vertex from VS constant buffer, with + * relative addressing. + */ +void brw_dp_READ_4_vs_relative(struct brw_compile *p, + struct brw_reg dest, + struct brw_reg addr_reg, + GLuint offset, + GLuint bind_table_index) +{ + struct intel_context *intel = &p->brw->intel; + int msg_type; + + /* Setup MRF[1] with offset into const buffer */ + brw_push_insn_state(p); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + /* M1.0 is block offset 0, M1.4 is block offset 1, all other + * fields ignored. + */ + brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), + addr_reg, brw_imm_d(offset)); + brw_pop_insn_state(p); + + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + + insn->header.predicate_control = BRW_PREDICATE_NONE; + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.destreg__conditionalmod = 0; + insn->header.mask_control = BRW_MASK_DISABLE; + + brw_set_dest(insn, dest); + brw_set_src0(insn, brw_vec8_grf(0, 0)); + + if (intel->gen == 6) + msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; + else if (intel->gen == 5 || intel->is_g4x) + msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; + else + msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; + + brw_set_dp_read_message(p->brw, + insn, + bind_table_index, + BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD, + msg_type, + 0, /* source cache = data cache */ + 2, /* msg_length */ + 1, /* response_length */ + 0); /* eot */ } @@ -1281,7 +1356,8 @@ void brw_fb_WRITE(struct brw_compile *p, msg_length, 1, /* pixel scoreboard */ response_length, - eot); + eot, + 0 /* send_commit_msg */); } diff --git a/src/mesa/drivers/dri/i965/brw_gs_emit.c b/src/mesa/drivers/dri/i965/brw_gs_emit.c index 99a6f6be113..a01d5576f8c 100644 --- a/src/mesa/drivers/dri/i965/brw_gs_emit.c +++ b/src/mesa/drivers/dri/i965/brw_gs_emit.c @@ -34,7 +34,7 @@ #include "main/macros.h" #include "main/enums.h" -#include "shader/program.h" +#include "program/program.h" #include "intel_batchbuffer.h" #include "brw_defines.h" diff --git a/src/mesa/drivers/dri/i965/brw_optimize.c b/src/mesa/drivers/dri/i965/brw_optimize.c index e79b3ddea35..8aa6fb6cc6f 100644 --- a/src/mesa/drivers/dri/i965/brw_optimize.c +++ b/src/mesa/drivers/dri/i965/brw_optimize.c @@ -26,12 +26,600 @@ */ #include "main/macros.h" -#include "shader/program.h" -#include "shader/prog_print.h" +#include "program/program.h" +#include "program/prog_print.h" #include "brw_context.h" #include "brw_defines.h" #include "brw_eu.h" +static const struct { + char *name; + int nsrc; + int ndst; + GLboolean is_arith; +} inst_opcode[128] = { + [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 }, + + [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_PLN] = { .name = "pln", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_MATH] = { .name = "math", .nsrc = 2, .ndst = 1 }, + + [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 }, + + [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 }, + [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 }, + [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 }, +}; + +static INLINE +GLboolean brw_is_arithmetic_inst(const struct brw_instruction *inst) +{ + return inst_opcode[inst->header.opcode].is_arith; +} + +static const GLuint inst_stride[7] = { + [0] = 0, + [1] = 1, + [2] = 2, + [3] = 4, + [4] = 8, + [5] = 16, + [6] = 32 +}; + +static const GLuint inst_type_size[8] = { + [BRW_REGISTER_TYPE_UD] = 4, + [BRW_REGISTER_TYPE_D] = 4, + [BRW_REGISTER_TYPE_UW] = 2, + [BRW_REGISTER_TYPE_W] = 2, + [BRW_REGISTER_TYPE_UB] = 1, + [BRW_REGISTER_TYPE_B] = 1, + [BRW_REGISTER_TYPE_F] = 4 +}; + +static INLINE GLboolean +brw_is_grf_written(const struct brw_instruction *inst, + int reg_index, int size, + int gen) +{ + if (inst_opcode[inst->header.opcode].ndst == 0) + return GL_FALSE; + + if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT) + if (inst->bits1.ia1.dest_reg_file == BRW_GENERAL_REGISTER_FILE) + return GL_TRUE; + + if (inst->bits1.da1.dest_reg_file != BRW_GENERAL_REGISTER_FILE) + return GL_FALSE; + + const int reg_start = reg_index * REG_SIZE; + const int reg_end = reg_start + size; + + const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type]; + const int write_start = inst->bits1.da1.dest_reg_nr*REG_SIZE + + inst->bits1.da1.dest_subreg_nr; + int length, write_end; + + /* SEND is specific */ + if (inst->header.opcode == BRW_OPCODE_SEND) { + if (gen >= 5) + length = inst->bits3.generic_gen5.response_length*REG_SIZE; + else + length = inst->bits3.generic.response_length*REG_SIZE; + } + else { + length = 1 << inst->header.execution_size; + length *= type_size; + length *= inst->bits1.da1.dest_horiz_stride; + } + + /* If the two intervals intersect, we overwrite the register */ + write_end = write_start + length; + const int left = MAX2(write_start, reg_start); + const int right = MIN2(write_end, reg_end); + + return left < right; +} + +/* Specific path for message register since we need to handle the compr4 case */ +static INLINE GLboolean +brw_is_mrf_written(const struct brw_instruction *inst, int reg_index, int size) +{ + if (inst_opcode[inst->header.opcode].ndst == 0) + return GL_FALSE; + + if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT) + if (inst->bits1.ia1.dest_reg_file == BRW_MESSAGE_REGISTER_FILE) + return GL_TRUE; + + if (inst->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE) + return GL_FALSE; + + const int reg_start = reg_index * REG_SIZE; + const int reg_end = reg_start + size; + + const int mrf_index = inst->bits1.da1.dest_reg_nr & 0x0f; + const int is_compr4 = inst->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4; + const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type]; + + /* We use compr4 with a size != 16 elements. Strange, we conservatively + * consider that we are writing the register. + */ + if (is_compr4 && inst->header.execution_size != BRW_EXECUTE_16) + return GL_TRUE; + + GLboolean is_written = GL_FALSE; + + /* Here we write mrf_{i} and mrf_{i+4}. So we read two times 8 elements */ + if (is_compr4) { + const int length = 8 * type_size * inst->bits1.da1.dest_horiz_stride; + + /* First 8-way register */ + const int write_start0 = mrf_index*REG_SIZE + + inst->bits1.da1.dest_subreg_nr; + const int write_end0 = write_start0 + length; + + /* Second 8-way register */ + const int write_start1 = (mrf_index+4)*REG_SIZE + + inst->bits1.da1.dest_subreg_nr; + const int write_end1 = write_start1 + length; + + /* If the two intervals intersect, we overwrite the register */ + const int left0 = MAX2(write_start0, reg_start); + const int right0 = MIN2(write_end0, reg_end); + const int left1 = MAX2(write_start1, reg_start); + const int right1 = MIN2(write_end1, reg_end); + + is_written = left0 < right0 || left1 < right1; + } + else { + int length; + length = 1 << inst->header.execution_size; + length *= type_size; + length *= inst->bits1.da1.dest_horiz_stride; + + /* If the two intervals intersect, we write into the register */ + const int write_start = inst->bits1.da1.dest_reg_nr*REG_SIZE + + inst->bits1.da1.dest_subreg_nr; + const int write_end = write_start + length; + const int left = MAX2(write_start, reg_start); + const int right = MIN2(write_end, reg_end);; + + is_written = left < right; + } + + /* SEND may perform an implicit mov to a mrf register */ + if (is_written == GL_FALSE && + inst->header.opcode == BRW_OPCODE_SEND && + inst->bits1.da1.src0_reg_file != 0) { + + const int mrf_start = inst->header.destreg__conditionalmod; + const int write_start = mrf_start * REG_SIZE; + const int write_end = write_start + REG_SIZE; + const int left = MAX2(write_start, reg_start); + const int right = MIN2(write_end, reg_end);; + is_written = left < right; + } + + return is_written; +} + +static INLINE GLboolean +brw_is_mrf_read(const struct brw_instruction *inst, + int reg_index, int size, int gen) +{ + if (inst->header.opcode != BRW_OPCODE_SEND) + return GL_FALSE; + if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT) + return GL_TRUE; + + const int reg_start = reg_index*REG_SIZE; + const int reg_end = reg_start + size; + + int length, read_start, read_end; + if (gen >= 5) + length = inst->bits3.generic_gen5.msg_length*REG_SIZE; + else + length = inst->bits3.generic.msg_length*REG_SIZE; + + /* Look if SEND uses an implicit mov. In that case, we read one less register + * (but we write it) + */ + if (inst->bits1.da1.src0_reg_file != 0) + read_start = inst->header.destreg__conditionalmod; + else { + length--; + read_start = inst->header.destreg__conditionalmod + 1; + } + read_start *= REG_SIZE; + read_end = read_start + length; + + const int left = MAX2(read_start, reg_start); + const int right = MIN2(read_end, reg_end); + + return left < right; +} + +static INLINE GLboolean +brw_is_grf_read(const struct brw_instruction *inst, int reg_index, int size) +{ + int i, j; + if (inst_opcode[inst->header.opcode].nsrc == 0) + return GL_FALSE; + + /* Look at first source. We must take into account register regions to + * monitor carefully the read. Note that we are a bit too conservative here + * since we do not take into account the fact that some complete registers + * may be skipped + */ + if (inst_opcode[inst->header.opcode].nsrc >= 1) { + + if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT) + if (inst->bits1.ia1.src0_reg_file == BRW_GENERAL_REGISTER_FILE) + return GL_TRUE; + if (inst->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE) + return GL_FALSE; + + const int reg_start = reg_index*REG_SIZE; + const int reg_end = reg_start + size; + + /* See if at least one of this element intersects the interval */ + const int type_size = inst_type_size[inst->bits1.da1.src0_reg_type]; + const int elem_num = 1 << inst->header.execution_size; + const int width = 1 << inst->bits2.da1.src0_width; + const int row_num = elem_num >> inst->bits2.da1.src0_width; + const int hs = type_size*inst_stride[inst->bits2.da1.src0_horiz_stride]; + const int vs = type_size*inst_stride[inst->bits2.da1.src0_vert_stride]; + int row_start = inst->bits2.da1.src0_reg_nr*REG_SIZE + + inst->bits2.da1.src0_subreg_nr; + for (j = 0; j < row_num; ++j) { + int write_start = row_start; + for (i = 0; i < width; ++i) { + const int write_end = write_start + type_size; + const int left = write_start > reg_start ? write_start : reg_start; + const int right = write_end < reg_end ? write_end : reg_end; + if (left < right) + return GL_TRUE; + write_start += hs; + } + row_start += vs; + } + } + + /* Second src register */ + if (inst_opcode[inst->header.opcode].nsrc >= 2) { + + if (inst->bits3.da1.src1_address_mode != BRW_ADDRESS_DIRECT) + if (inst->bits1.ia1.src1_reg_file == BRW_GENERAL_REGISTER_FILE) + return GL_TRUE; + if (inst->bits1.da1.src1_reg_file != BRW_GENERAL_REGISTER_FILE) + return GL_FALSE; + + const int reg_start = reg_index*REG_SIZE; + const int reg_end = reg_start + size; + + /* See if at least one of this element intersects the interval */ + const int type_size = inst_type_size[inst->bits1.da1.src1_reg_type]; + const int elem_num = 1 << inst->header.execution_size; + const int width = 1 << inst->bits3.da1.src1_width; + const int row_num = elem_num >> inst->bits3.da1.src1_width; + const int hs = type_size*inst_stride[inst->bits3.da1.src1_horiz_stride]; + const int vs = type_size*inst_stride[inst->bits3.da1.src1_vert_stride]; + int row_start = inst->bits3.da1.src1_reg_nr*REG_SIZE + + inst->bits3.da1.src1_subreg_nr; + for (j = 0; j < row_num; ++j) { + int write_start = row_start; + for (i = 0; i < width; ++i) { + const int write_end = write_start + type_size; + const int left = write_start > reg_start ? write_start : reg_start; + const int right = write_end < reg_end ? write_end : reg_end; + if (left < right) + return GL_TRUE; + write_start += hs; + } + row_start += vs; + } + } + + return GL_FALSE; +} + +static INLINE GLboolean +brw_is_control_done(const struct brw_instruction *mov) { + return + mov->header.dependency_control != 0 || + mov->header.thread_control != 0 || + mov->header.mask_control != 0 || + mov->header.saturate != 0 || + mov->header.debug_control != 0; +} + +static INLINE GLboolean +brw_is_predicated(const struct brw_instruction *mov) { + return mov->header.predicate_control != 0; +} + +static INLINE GLboolean +brw_is_grf_to_mrf_mov(const struct brw_instruction *mov, + int *mrf_index, + int *grf_index, + GLboolean *is_compr4) +{ + if (brw_is_predicated(mov) || + brw_is_control_done(mov) || + mov->header.debug_control != 0) + return GL_FALSE; + + if (mov->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT || + mov->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE || + mov->bits1.da1.dest_reg_type != BRW_REGISTER_TYPE_F || + mov->bits1.da1.dest_horiz_stride != BRW_HORIZONTAL_STRIDE_1 || + mov->bits1.da1.dest_subreg_nr != 0) + return GL_FALSE; + + if (mov->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT || + mov->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE || + mov->bits1.da1.src0_reg_type != BRW_REGISTER_TYPE_F || + mov->bits2.da1.src0_width != BRW_WIDTH_8 || + mov->bits2.da1.src0_horiz_stride != BRW_HORIZONTAL_STRIDE_1 || + mov->bits2.da1.src0_vert_stride != BRW_VERTICAL_STRIDE_8 || + mov->bits2.da1.src0_subreg_nr != 0 || + mov->bits2.da1.src0_abs != 0 || + mov->bits2.da1.src0_negate != 0) + return GL_FALSE; + + *grf_index = mov->bits2.da1.src0_reg_nr; + *mrf_index = mov->bits1.da1.dest_reg_nr & 0x0f; + *is_compr4 = (mov->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4) != 0; + return GL_TRUE; +} + +static INLINE GLboolean +brw_is_grf_straight_write(const struct brw_instruction *inst, int grf_index) +{ + /* remark: no problem to predicate a SEL instruction */ + if ((!brw_is_predicated(inst) || inst->header.opcode == BRW_OPCODE_SEL) && + brw_is_control_done(inst) == GL_FALSE && + inst->header.execution_size == 4 && + inst->header.access_mode == BRW_ALIGN_1 && + inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT && + inst->bits1.da1.dest_reg_file == BRW_GENERAL_REGISTER_FILE && + inst->bits1.da1.dest_reg_type == BRW_REGISTER_TYPE_F && + inst->bits1.da1.dest_horiz_stride == BRW_HORIZONTAL_STRIDE_1 && + inst->bits1.da1.dest_reg_nr == grf_index && + inst->bits1.da1.dest_subreg_nr == 0 && + brw_is_arithmetic_inst(inst)) + return GL_TRUE; + + return GL_FALSE; +} + +static INLINE GLboolean +brw_inst_are_equal(const struct brw_instruction *src0, + const struct brw_instruction *src1) +{ + const GLuint *field0 = (GLuint *) src0; + const GLuint *field1 = (GLuint *) src1; + return field0[0] == field1[0] && + field0[1] == field1[1] && + field0[2] == field1[2] && + field0[3] == field1[3]; +} + +static INLINE void +brw_inst_copy(struct brw_instruction *dst, + const struct brw_instruction *src) +{ + GLuint *field_dst = (GLuint *) dst; + const GLuint *field_src = (GLuint *) src; + field_dst[0] = field_src[0]; + field_dst[1] = field_src[1]; + field_dst[2] = field_src[2]; + field_dst[3] = field_src[3]; +} + +static void brw_remove_inst(struct brw_compile *p, const GLboolean *removeInst) +{ + int i, nr_insn = 0, to = 0, from = 0; + + for (from = 0; from < p->nr_insn; ++from) { + if (removeInst[from]) + continue; + if(to != from) + brw_inst_copy(p->store + to, p->store + from); + to++; + } + + for (i = 0; i < p->nr_insn; ++i) + if (removeInst[i] == GL_FALSE) + nr_insn++; + p->nr_insn = nr_insn; +} + +/* The gen code emitter generates a lot of duplications in the + * grf-to-mrf moves, for example when texture sampling with the same + * coordinates from multiple textures.. Here, we monitor same mov + * grf-to-mrf instrutions and remove repeated ones where the operands + * and dst ahven't changed in between. + */ +void brw_remove_duplicate_mrf_moves(struct brw_compile *p) +{ + const int gen = p->brw->intel.gen; + int i, j; + + GLboolean *removeInst = calloc(sizeof(GLboolean), p->nr_insn); + for (i = 0; i < p->nr_insn; i++) { + if (removeInst[i]) + continue; + + const struct brw_instruction *mov = p->store + i; + int mrf_index, grf_index; + GLboolean is_compr4; + + /* Only consider _straight_ grf-to-mrf moves */ + if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4)) + continue; + + const int mrf_index0 = mrf_index; + const int mrf_index1 = is_compr4 ? mrf_index0+4 : mrf_index0+1; + const int simd16_size = 2 * REG_SIZE; + + for (j = i + 1; j < p->nr_insn; j++) { + const struct brw_instruction *inst = p->store + j; + + if (brw_inst_are_equal(mov, inst)) { + removeInst[j] = GL_TRUE; + continue; + } + + if (brw_is_grf_written(inst, grf_index, simd16_size, gen) || + brw_is_mrf_written(inst, mrf_index0, REG_SIZE) || + brw_is_mrf_written(inst, mrf_index1, REG_SIZE)) + break; + } + } + + brw_remove_inst(p, removeInst); + free(removeInst); +} + +/* Replace moves to MRFs where the value moved is the result of a + * normal arithmetic operation with computation right into the MRF. + */ +void brw_remove_grf_to_mrf_moves(struct brw_compile *p) +{ + int i, j, prev; + struct brw_context *brw = p->brw; + const int gen = brw->intel.gen; + const int simd16_size = 2*REG_SIZE; + + GLboolean *removeInst = calloc(sizeof(GLboolean), p->nr_insn); + assert(removeInst); + + for (i = 0; i < p->nr_insn; i++) { + if (removeInst[i]) + continue; + + struct brw_instruction *grf_inst = NULL; + const struct brw_instruction *mov = p->store + i; + int mrf_index, grf_index; + GLboolean is_compr4; + + /* Only consider _straight_ grf-to-mrf moves */ + if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4)) + continue; + + /* Using comp4 enables a stride of 4 for this instruction */ + const int mrf_index0 = mrf_index; + const int mrf_index1 = is_compr4 ? mrf_index+4 : mrf_index+1; + + /* Look where the register has been set */ + prev = i; + GLboolean potential_remove = GL_FALSE; + while (prev--) { + + /* If _one_ instruction writes the grf, we try to remove the mov */ + struct brw_instruction *inst = p->store + prev; + if (brw_is_grf_straight_write(inst, grf_index)) { + potential_remove = GL_TRUE; + grf_inst = inst; + break; + } + + } + + if (potential_remove == GL_FALSE) + continue; + removeInst[i] = GL_TRUE; + + /* Monitor first the section of code between the grf computation and the + * mov. Here we cannot read or write both mrf and grf register + */ + for (j = prev + 1; j < i; ++j) { + struct brw_instruction *inst = p->store + j; + if (removeInst[j]) + continue; + if (brw_is_grf_written(inst, grf_index, simd16_size, gen) || + brw_is_grf_read(inst, grf_index, simd16_size) || + brw_is_mrf_written(inst, mrf_index0, REG_SIZE) || + brw_is_mrf_written(inst, mrf_index1, REG_SIZE) || + brw_is_mrf_read(inst, mrf_index0, REG_SIZE, gen) || + brw_is_mrf_read(inst, mrf_index1, REG_SIZE, gen)) { + removeInst[i] = GL_FALSE; + break; + } + } + + /* After the mov, we can read or write the mrf. If the grf is overwritten, + * we are done + */ + for (j = i + 1; j < p->nr_insn; ++j) { + struct brw_instruction *inst = p->store + j; + if (removeInst[j]) + continue; + + if (brw_is_grf_read(inst, grf_index, simd16_size)) { + removeInst[i] = GL_FALSE; + break; + } + + if (brw_is_grf_straight_write(inst, grf_index)) + break; + } + + /* Note that with the top down traversal, we can safely pacth the mov + * instruction + */ + if (removeInst[i]) { + grf_inst->bits1.da1.dest_reg_file = mov->bits1.da1.dest_reg_file; + grf_inst->bits1.da1.dest_reg_nr = mov->bits1.da1.dest_reg_nr; + } + } + + brw_remove_inst(p, removeInst); + free(removeInst); +} + static GLboolean is_single_channel_dp4(struct brw_instruction *insn) { diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c index bd560acdadf..4b08d2599bc 100644 --- a/src/mesa/drivers/dri/i965/brw_program.c +++ b/src/mesa/drivers/dri/i965/brw_program.c @@ -31,10 +31,10 @@ #include "main/imports.h" #include "main/enums.h" -#include "shader/prog_parameter.h" -#include "shader/program.h" -#include "shader/programopt.h" -#include "shader/shader_api.h" +#include "main/shaderobj.h" +#include "program/prog_parameter.h" +#include "program/program.h" +#include "program/programopt.h" #include "tnl/tnl.h" #include "brw_context.h" @@ -174,9 +174,36 @@ static GLboolean brwProgramStringNotify( GLcontext *ctx, shader_error(ctx, prog, "i965 driver doesn't yet support uninlined function " "calls. Move to using a single return statement at " - "the end of the function to work around it."); + "the end of the function to work around it.\n"); return GL_FALSE; } + if (prog->Instructions[i].DstReg.RelAddr && + prog->Instructions[i].DstReg.File == PROGRAM_INPUT) { + shader_error(ctx, prog, + "Variable indexing of shader inputs unsupported\n"); + return GL_FALSE; + } + if (prog->Instructions[i].DstReg.RelAddr && + prog->Instructions[i].DstReg.File == PROGRAM_OUTPUT) { + shader_error(ctx, prog, + "Variable indexing of shader outputs unsupported\n"); + return GL_FALSE; + } + if (target == GL_FRAGMENT_PROGRAM_ARB) { + if ((prog->Instructions[i].DstReg.RelAddr && + prog->Instructions[i].DstReg.File == PROGRAM_TEMPORARY) || + (prog->Instructions[i].SrcReg[0].RelAddr && + prog->Instructions[i].SrcReg[0].File == PROGRAM_TEMPORARY) || + (prog->Instructions[i].SrcReg[1].RelAddr && + prog->Instructions[i].SrcReg[1].File == PROGRAM_TEMPORARY) || + (prog->Instructions[i].SrcReg[2].RelAddr && + prog->Instructions[i].SrcReg[2].File == PROGRAM_TEMPORARY)) { + shader_error(ctx, prog, + "Variable indexing of variable arrays in the FS " + "unsupported\n"); + return GL_FALSE; + } + } } return GL_TRUE; diff --git a/src/mesa/drivers/dri/i965/brw_sf.h b/src/mesa/drivers/dri/i965/brw_sf.h index a0680a56f2c..e525c730d3f 100644 --- a/src/mesa/drivers/dri/i965/brw_sf.h +++ b/src/mesa/drivers/dri/i965/brw_sf.h @@ -34,7 +34,7 @@ #define BRW_SF_H -#include "shader/program.h" +#include "program/program.h" #include "brw_context.h" #include "brw_eu.h" diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c index e290ca92f60..914f275cc67 100644 --- a/src/mesa/drivers/dri/i965/brw_sf_state.c +++ b/src/mesa/drivers/dri/i965/brw_sf_state.c @@ -130,7 +130,7 @@ struct brw_sf_unit_key { unsigned scissor:1; unsigned line_smooth:1; unsigned point_sprite:1; - unsigned point_attenuated:1; + unsigned use_vs_point_size:1; unsigned render_to_fbo:1; float line_width; float point_size; @@ -164,7 +164,8 @@ sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key) key->point_sprite = ctx->Point.PointSprite; key->point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize); - key->point_attenuated = ctx->Point._Attenuated; + key->use_vs_point_size = (ctx->VertexProgram.PointSizeEnabled || + ctx->Point._Attenuated); /* _NEW_LIGHT */ key->pv_first = (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION); @@ -296,7 +297,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key, /* _NEW_POINT */ sf.sf7.sprite_point = key->point_sprite; sf.sf7.point_size = CLAMP(rint(key->point_size), 1, 255) * (1<<3); - sf.sf7.use_point_size_state = !key->point_attenuated; + sf.sf7.use_point_size_state = !key->use_vs_point_size; sf.sf7.aa_line_distance_mode = 0; /* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons: diff --git a/src/mesa/drivers/dri/i965/brw_structs.h b/src/mesa/drivers/dri/i965/brw_structs.h index 2a7fa5b6997..2fde42a7060 100644 --- a/src/mesa/drivers/dri/i965/brw_structs.h +++ b/src/mesa/drivers/dri/i965/brw_structs.h @@ -1657,8 +1657,36 @@ struct brw_instruction GLuint end_of_thread:1; } dp_write_gen5; + /* Sandybridge DP for sample cache, constant cache, render cache */ struct { - GLuint pad:16; + GLuint binding_table_index:8; + GLuint msg_control:5; + GLuint msg_type:3; + GLuint pad0:3; + GLuint header_present:1; + GLuint response_length:5; + GLuint msg_length:4; + GLuint pad1:2; + GLuint end_of_thread:1; + } dp_sampler_const_cache; + + struct { + GLuint binding_table_index:8; + GLuint msg_control:3; + GLuint slot_group_select:1; + GLuint pixel_scoreboard_clear:1; + GLuint msg_type:4; + GLuint send_commit_msg:1; + GLuint pad0:1; + GLuint header_present:1; + GLuint response_length:5; + GLuint msg_length:4; + GLuint pad1:2; + GLuint end_of_thread:1; + } dp_render_cache; + + struct { + GLuint function_control:16; GLuint response_length:4; GLuint msg_length:4; GLuint msg_target:4; @@ -1666,8 +1694,9 @@ struct brw_instruction GLuint end_of_thread:1; } generic; + /* Of this struct, only end_of_thread is not present for gen6. */ struct { - GLuint pad:19; + GLuint function_control:19; GLuint header_present:1; GLuint response_length:5; GLuint msg_length:4; diff --git a/src/mesa/drivers/dri/i965/brw_util.c b/src/mesa/drivers/dri/i965/brw_util.c index bba9249d1b4..1db2a210d45 100644 --- a/src/mesa/drivers/dri/i965/brw_util.c +++ b/src/mesa/drivers/dri/i965/brw_util.c @@ -31,7 +31,7 @@ #include "main/mtypes.h" -#include "shader/prog_parameter.h" +#include "program/prog_parameter.h" #include "brw_util.h" #include "brw_defines.h" diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c index 3c12f11ea78..9a832af9a97 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.c +++ b/src/mesa/drivers/dri/i965/brw_vs.c @@ -34,8 +34,8 @@ #include "brw_vs.h" #include "brw_util.h" #include "brw_state.h" -#include "shader/prog_print.h" -#include "shader/prog_parameter.h" +#include "program/prog_print.h" +#include "program/prog_parameter.h" diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h index 6493744f3eb..9338a6b7dbf 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.h +++ b/src/mesa/drivers/dri/i965/brw_vs.h @@ -36,7 +36,7 @@ #include "brw_context.h" #include "brw_eu.h" -#include "shader/program.h" +#include "program/program.h" struct brw_vs_prog_key { diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c index 128987d78a6..c1d6525e9b7 100644 --- a/src/mesa/drivers/dri/i965/brw_vs_emit.c +++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c @@ -31,9 +31,9 @@ #include "main/macros.h" -#include "shader/program.h" -#include "shader/prog_parameter.h" -#include "shader/prog_print.h" +#include "program/program.h" +#include "program/prog_parameter.h" +#include "program/prog_print.h" #include "brw_context.h" #include "brw_vs.h" @@ -44,6 +44,7 @@ static GLboolean brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg) { int opcode_array[] = { + [OPCODE_MOV] = 1, [OPCODE_ADD] = 2, [OPCODE_CMP] = 3, [OPCODE_DP3] = 2, @@ -218,7 +219,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) c->first_overflow_output = 0; if (intel->gen >= 6) - mrf = 6; + mrf = 4; else if (intel->gen == 5) mrf = 8; else @@ -238,12 +239,25 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) mrf++; /* just a placeholder? XXX fix later stages & remove this */ } else { - if (mrf < 16) { + /* Two restrictions on our compute-to-MRF here. The + * message length for all SEND messages is restricted to + * [1,15], so we can't use mrf 15, as that means a length + * of 16. + * + * Additionally, URB writes are aligned to URB rows, so we + * need to put an even number of registers of URB data in + * each URB write so that the later write is aligned. A + * message length of 15 means 1 message header reg plus 14 + * regs of URB data. + * + * For attributes beyond the compute-to-MRF, we compute to + * GRFs and they will be written in the second URB_WRITE. + */ + if (mrf < 15) { c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf); mrf++; } else { - /* too many vertex results to fit in MRF, use GRF for overflow */ if (!c->first_overflow_output) c->first_overflow_output = i; c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0); @@ -318,8 +332,11 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) */ attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs); + /* See emit_vertex_write() for where the VUE's overhead on top of the + * attributes comes from. + */ if (intel->gen >= 6) - c->prog_data.urb_entry_size = (attributes_in_vue + 4 + 7) / 8; + c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 7) / 8; else if (intel->gen == 5) c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4; else @@ -869,8 +886,6 @@ get_constant(struct brw_vs_compile *c, assert(argIndex < 3); if (c->current_const[argIndex].index != src->Index) { - struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0]; - /* Keep track of the last constant loaded in this slot, for reuse. */ c->current_const[argIndex].index = src->Index; @@ -881,9 +896,6 @@ get_constant(struct brw_vs_compile *c, /* need to fetch the constant now */ brw_dp_READ_4_vs(p, const_reg, /* writeback dest */ - 0, /* oword */ - 0, /* relative indexing? */ - addrReg, /* address register */ 16 * src->Index, /* byte offset */ SURF_INDEX_VERT_CONST_BUFFER /* binding table index */ ); @@ -904,8 +916,8 @@ get_reladdr_constant(struct brw_vs_compile *c, const struct prog_src_register *src = &inst->SrcReg[argIndex]; struct brw_compile *p = &c->func; struct brw_reg const_reg = c->current_const[argIndex].reg; - struct brw_reg const2_reg; struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0]; + struct brw_reg byte_addr_reg = get_tmp(c); assert(argIndex < 3); @@ -917,37 +929,15 @@ get_reladdr_constant(struct brw_vs_compile *c, src->Index, argIndex, c->current_const[argIndex].reg.nr); #endif + brw_MUL(p, byte_addr_reg, addrReg, brw_imm_ud(16)); + /* fetch the first vec4 */ - brw_dp_READ_4_vs(p, - const_reg, /* writeback dest */ - 0, /* oword */ - 1, /* relative indexing? */ - addrReg, /* address register */ - 16 * src->Index, /* byte offset */ - SURF_INDEX_VERT_CONST_BUFFER /* binding table index */ - ); - /* second vec4 */ - const2_reg = get_tmp(c); - - /* use upper half of address reg for second read */ - addrReg = stride(addrReg, 0, 4, 0); - addrReg.subnr = 16; - - brw_dp_READ_4_vs(p, - const2_reg, /* writeback dest */ - 1, /* oword */ - 1, /* relative indexing? */ - addrReg, /* address register */ - 16 * src->Index, /* byte offset */ - SURF_INDEX_VERT_CONST_BUFFER - ); - - /* merge the two Owords into the constant register */ - /* const_reg[7..4] = const2_reg[7..4] */ - brw_MOV(p, - suboffset(stride(const_reg, 0, 4, 1), 4), - suboffset(stride(const2_reg, 0, 4, 1), 4)); - release_tmp(c, const2_reg); + brw_dp_READ_4_vs_relative(p, + const_reg, /* writeback dest */ + byte_addr_reg, /* address register */ + 16 * src->Index, /* byte offset */ + SURF_INDEX_VERT_CONST_BUFFER /* binding table index */ + ); return const_reg; } @@ -993,36 +983,71 @@ static struct brw_reg get_reg( struct brw_vs_compile *c, */ static struct brw_reg deref( struct brw_vs_compile *c, struct brw_reg arg, - GLint offset) + GLint offset, + GLuint reg_size ) { struct brw_compile *p = &c->func; - struct brw_reg tmp = vec4(get_tmp(c)); + struct brw_reg tmp = get_tmp(c); struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0]; - struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW); - GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16; + struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D); + GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size; struct brw_reg indirect = brw_vec4_indirect(0,0); + struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW); + + /* Set the vertical stride on the register access so that the first + * 4 components come from a0.0 and the second 4 from a0.1. + */ + indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL; { brw_push_insn_state(p); brw_set_access_mode(p, BRW_ALIGN_1); - /* This is pretty clunky - load the address register twice and - * fetch each 4-dword value in turn. There must be a way to do - * this in a single pass, but I couldn't get it to work. - */ - brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset)); - brw_MOV(p, tmp, indirect); + brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size)); + brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset)); + + brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size)); + brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset)); - brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset)); - brw_MOV(p, suboffset(tmp, 4), indirect); + brw_MOV(p, tmp, indirect); brw_pop_insn_state(p); } - + /* NOTE: tmp not released */ - return vec8(tmp); + return tmp; } +static void +move_to_reladdr_dst(struct brw_vs_compile *c, + const struct prog_instruction *inst, + struct brw_reg val) +{ + struct brw_compile *p = &c->func; + int reg_size = 32; + struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0]; + struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D); + struct brw_reg temp_base = c->regs[inst->DstReg.File][0]; + GLuint byte_offset = temp_base.nr * 32 + temp_base.subnr; + struct brw_reg indirect = brw_vec4_indirect(0,0); + struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW); + + byte_offset += inst->DstReg.Index * reg_size; + + brw_push_insn_state(p); + brw_set_access_mode(p, BRW_ALIGN_1); + + brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size)); + brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset)); + brw_MOV(p, indirect, val); + + brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size)); + brw_ADD(p, brw_address_reg(0), acc, + brw_imm_uw(byte_offset + reg_size / 2)); + brw_MOV(p, indirect, suboffset(val, 4)); + + brw_pop_insn_state(p); +} /** * Get brw reg corresponding to the instruction's [argIndex] src reg. @@ -1091,7 +1116,7 @@ get_src_reg( struct brw_vs_compile *c, case PROGRAM_INPUT: case PROGRAM_OUTPUT: if (relAddr) { - return deref(c, c->regs[file][0], index); + return deref(c, c->regs[file][0], index, 32); } else { assert(c->regs[file][index].nr != 0); @@ -1113,7 +1138,7 @@ get_src_reg( struct brw_vs_compile *c, return get_constant(c, inst, argIndex); } else if (relAddr) { - return deref(c, c->regs[PROGRAM_STATE_VAR][0], index); + return deref(c, c->regs[PROGRAM_STATE_VAR][0], index, 16); } else { assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0); @@ -1134,26 +1159,6 @@ get_src_reg( struct brw_vs_compile *c, } } - -static void emit_arl( struct brw_vs_compile *c, - struct brw_reg dst, - struct brw_reg arg0 ) -{ - struct brw_compile *p = &c->func; - struct brw_reg tmp = dst; - GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE); - - if (need_tmp) - tmp = get_tmp(c); - - brw_RNDD(p, tmp, arg0); /* tmp = round(arg0) */ - brw_MUL(p, dst, tmp, brw_imm_d(16)); /* dst = tmp * 16 */ - - if (need_tmp) - release_tmp(c, tmp); -} - - /** * Return the brw reg for the given instruction's src argument. * Will return mangled results for SWZ op. The emit_swz() function @@ -1198,8 +1203,17 @@ static struct brw_reg get_dst( struct brw_vs_compile *c, switch (dst.File) { case PROGRAM_TEMPORARY: case PROGRAM_OUTPUT: - assert(c->regs[dst.File][dst.Index].nr != 0); - reg = c->regs[dst.File][dst.Index]; + /* register-indirect addressing is only 1x1, not VxH, for + * destination regs. So, for RelAddr we'll return a temporary + * for the dest and do a move of the result to the RelAddr + * register after the instruction emit. + */ + if (dst.RelAddr) { + reg = get_tmp(c); + } else { + assert(c->regs[dst.File][dst.Index].nr != 0); + reg = c->regs[dst.File][dst.Index]; + } break; case PROGRAM_ADDRESS: assert(dst.Index == 0); @@ -1298,7 +1312,6 @@ static void emit_vertex_write( struct brw_vs_compile *c) struct brw_compile *p = &c->func; struct brw_context *brw = p->brw; struct intel_context *intel = &brw->intel; - struct brw_reg m0 = brw_message_reg(0); struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS]; struct brw_reg ndc; int eot; @@ -1381,16 +1394,19 @@ static void emit_vertex_write( struct brw_vs_compile *c) */ brw_set_access_mode(p, BRW_ALIGN_1); + /* The VUE layout is documented in Volume 2a. */ if (intel->gen >= 6) { - /* There are 16 DWs (D0-D15) in VUE header on Sandybridge: + /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge: * dword 0-3 (m1) of the header is indices, point width, clip flags. * dword 4-7 (m2) is the 4D space position - * dword 8-15 (m3,m4) of the vertex header is the user clip distance. - * m5 is the first vertex data we fill, which is the vertex position. + * dword 8-15 (m3,m4) of the vertex header is the user clip distance if + * enabled. We don't use it, so skip it. + * m3 is the first vertex element data we fill, which is the vertex + * position. */ - brw_MOV(p, offset(m0, 2), pos); - brw_MOV(p, offset(m0, 5), pos); - len_vertex_header = 4; + brw_MOV(p, brw_message_reg(2), pos); + brw_MOV(p, brw_message_reg(3), pos); + len_vertex_header = 2; } else if (intel->gen == 5) { /* There are 20 DWs (D0-D19) in VUE header on Ironlake: * dword 0-3 (m1) of the header is indices, point width, clip flags. @@ -1400,9 +1416,9 @@ static void emit_vertex_write( struct brw_vs_compile *c) * m6 is a pad so that the vertex element data is aligned * m7 is the first vertex data we fill, which is the vertex position. */ - brw_MOV(p, offset(m0, 2), ndc); - brw_MOV(p, offset(m0, 3), pos); - brw_MOV(p, offset(m0, 7), pos); + brw_MOV(p, brw_message_reg(2), ndc); + brw_MOV(p, brw_message_reg(3), pos); + brw_MOV(p, brw_message_reg(7), pos); len_vertex_header = 6; } else { /* There are 8 dwords in VUE header pre-Ironlake: @@ -1412,8 +1428,8 @@ static void emit_vertex_write( struct brw_vs_compile *c) * dword 8-11 (m3) is the first vertex data, which we always have be the * vertex position. */ - brw_MOV(p, offset(m0, 2), ndc); - brw_MOV(p, offset(m0, 3), pos); + brw_MOV(p, brw_message_reg(2), ndc); + brw_MOV(p, brw_message_reg(3), pos); len_vertex_header = 2; } @@ -1437,29 +1453,26 @@ static void emit_vertex_write( struct brw_vs_compile *c) * Move the overflowed attributes from the GRF to the MRF and * issue another brw_urb_WRITE(). */ - /* XXX I'm not 100% sure about which MRF regs to use here. Starting - * at mrf[4] atm... - */ - GLuint i, mrf = 0; + GLuint i, mrf = 1; for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) { if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) { /* move from GRF to MRF */ - brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]); + brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]); mrf++; } } brw_urb_WRITE(p, brw_null_reg(), /* dest */ - 4, /* starting mrf reg nr */ + 0, /* starting mrf reg nr */ c->r0, /* src */ 0, /* allocate */ 1, /* used */ - mrf+1, /* msg len */ + mrf, /* msg len */ 0, /* response len */ 1, /* eot */ 1, /* writes complete */ - BRW_MAX_MRF-1, /* urb destination offset */ + 14 / 2, /* urb destination offset */ BRW_URB_SWIZZLE_INTERLEAVE); } } @@ -1665,7 +1678,7 @@ void brw_vs_emit(struct brw_vs_compile *c ) emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL); break; case OPCODE_ARL: - emit_arl(c, dst, args[0]); + brw_RNDD(p, dst, args[0]); break; case OPCODE_FLR: brw_RNDD(p, dst, args[0]); @@ -1890,6 +1903,14 @@ void brw_vs_emit(struct brw_vs_compile *c ) } } + if (inst->DstReg.RelAddr && inst->DstReg.File == PROGRAM_TEMPORARY) { + /* We don't do RelAddr of PROGRAM_OUTPUT yet, because of the + * compute-to-mrf and the fact that we are allocating + * registers for only the used PROGRAM_OUTPUTs. + */ + move_to_reladdr_dst(c, inst, dst); + } + release_tmps(c); } diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c index be9e415cb07..0250a68d292 100644 --- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c @@ -31,7 +31,7 @@ #include "main/mtypes.h" #include "main/texstore.h" -#include "shader/prog_parameter.h" +#include "program/prog_parameter.h" #include "brw_context.h" #include "brw_state.h" diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h index 197b8754345..40f51c21c95 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.h +++ b/src/mesa/drivers/dri/i965/brw_wm.h @@ -34,7 +34,7 @@ #define BRW_WM_H -#include "shader/prog_instruction.h" +#include "program/prog_instruction.h" #include "brw_context.h" #include "brw_eu.h" diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c index a90a2d3cf25..0c625a4cd02 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_emit.c +++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c @@ -1326,7 +1326,7 @@ void emit_fb_write(struct brw_wm_compile *c, * + 1 for the second half we get destination + 4. */ brw_MOV(p, - brw_message_reg(nr + channel + (1 << 7)), + brw_message_reg(nr + channel + BRW_MRF_COMPR4), arg0[channel]); } else { /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */ @@ -1763,12 +1763,20 @@ void brw_wm_emit( struct brw_wm_compile *c ) inst->dst[i]->spill_slot); } + /* Only properly tested on ILK */ + if (p->brw->intel.gen == 5) { + brw_remove_duplicate_mrf_moves(p); + if (c->dispatch_width == 16) + brw_remove_grf_to_mrf_moves(p); + } + if (INTEL_DEBUG & DEBUG_WM) { int i; - printf("wm-native:\n"); - for (i = 0; i < p->nr_insn; i++) + printf("wm-native:\n"); + for (i = 0; i < p->nr_insn; i++) brw_disasm(stderr, &p->store[i], p->brw->intel.gen); printf("\n"); } } + diff --git a/src/mesa/drivers/dri/i965/brw_wm_fp.c b/src/mesa/drivers/dri/i965/brw_wm_fp.c index d73c3915824..0bef874b887 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_fp.c +++ b/src/mesa/drivers/dri/i965/brw_wm_fp.c @@ -37,9 +37,9 @@ #include "brw_wm.h" #include "brw_util.h" -#include "shader/prog_parameter.h" -#include "shader/prog_print.h" -#include "shader/prog_statevars.h" +#include "program/prog_parameter.h" +#include "program/prog_print.h" +#include "program/prog_statevars.h" /** An invalid texture target */ diff --git a/src/mesa/drivers/dri/i965/brw_wm_glsl.c b/src/mesa/drivers/dri/i965/brw_wm_glsl.c index 57be08a8d1d..2dd346d6dd1 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_glsl.c +++ b/src/mesa/drivers/dri/i965/brw_wm_glsl.c @@ -1,7 +1,7 @@ #include "main/macros.h" -#include "shader/prog_parameter.h" -#include "shader/prog_print.h" -#include "shader/prog_optimize.h" +#include "program/prog_parameter.h" +#include "program/prog_print.h" +#include "program/prog_optimize.h" #include "brw_context.h" #include "brw_eu.h" #include "brw_wm.h" diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass0.c b/src/mesa/drivers/dri/i965/brw_wm_pass0.c index 60bd92ed223..05de85a957e 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_pass0.c +++ b/src/mesa/drivers/dri/i965/brw_wm_pass0.c @@ -32,7 +32,7 @@ #include "brw_context.h" #include "brw_wm.h" -#include "shader/prog_parameter.h" +#include "program/prog_parameter.h" diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c index 1789b21451d..c1cf4db1cae 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_state.c @@ -222,7 +222,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key, drm_intel_bo_emit_reloc(bo, offsetof(struct brw_wm_unit_state, thread2), brw->wm.scratch_bo, wm.thread2.per_thread_scratch_space, - 0, 0); + I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER); } /* Emit sampler state relocation */ diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c index 77898dbbe72..17b016b569b 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c @@ -32,7 +32,7 @@ #include "main/mtypes.h" #include "main/texstore.h" -#include "shader/prog_parameter.h" +#include "program/prog_parameter.h" #include "intel_mipmap_tree.h" #include "intel_batchbuffer.h" diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c index 51940efb443..6820ca3abf4 100644 --- a/src/mesa/drivers/dri/i965/gen6_sf_state.c +++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c @@ -69,7 +69,7 @@ upload_sf_state(struct brw_context *brw) dw1 = num_outputs << GEN6_SF_NUM_OUTPUTS_SHIFT | (num_inputs + 1) / 2 << GEN6_SF_URB_ENTRY_READ_LENGTH_SHIFT | - 3 << GEN6_SF_URB_ENTRY_READ_OFFSET_SHIFT; + 1 << GEN6_SF_URB_ENTRY_READ_OFFSET_SHIFT; dw2 = GEN6_SF_VIEWPORT_TRANSFORM_ENABLE | GEN6_SF_STATISTICS_ENABLE; dw3 = 0; diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c index 5916a139946..4080a9dedfd 100644 --- a/src/mesa/drivers/dri/i965/gen6_vs_state.c +++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c @@ -29,8 +29,8 @@ #include "brw_state.h" #include "brw_defines.h" #include "brw_util.h" -#include "shader/prog_parameter.h" -#include "shader/prog_statevars.h" +#include "program/prog_parameter.h" +#include "program/prog_statevars.h" #include "intel_batchbuffer.h" static void diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c index ed1a72f03ba..863c85449d9 100644 --- a/src/mesa/drivers/dri/i965/gen6_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c @@ -29,8 +29,8 @@ #include "brw_state.h" #include "brw_defines.h" #include "brw_util.h" -#include "shader/prog_parameter.h" -#include "shader/prog_statevars.h" +#include "program/prog_parameter.h" +#include "program/prog_statevars.h" #include "intel_batchbuffer.h" static void diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.c b/src/mesa/drivers/dri/intel/intel_batchbuffer.c index 698445c5268..ff741fc39ab 100644 --- a/src/mesa/drivers/dri/intel/intel_batchbuffer.c +++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.c @@ -102,7 +102,7 @@ do_flush_locked(struct intel_batchbuffer *batch, GLuint used) if (INTEL_DEBUG & DEBUG_BATCH) { drm_intel_bo_map(batch->buf, GL_FALSE); intel_decode(batch->buf->virtual, used / 4, batch->buf->offset, - intel->intelScreen->deviceID); + intel->intelScreen->deviceID, GL_TRUE); drm_intel_bo_unmap(batch->buf); if (intel->vtbl.debug_batch != NULL) diff --git a/src/mesa/drivers/dri/intel/intel_chipset.h b/src/mesa/drivers/dri/intel/intel_chipset.h index cd614c59e55..72a74322ee5 100644 --- a/src/mesa/drivers/dri/intel/intel_chipset.h +++ b/src/mesa/drivers/dri/intel/intel_chipset.h @@ -115,6 +115,9 @@ devid == PCI_CHIP_I946_GZ || \ IS_G4X(devid)) +/* Compat macro for intel_decode.c */ +#define IS_IRONLAKE(devid) IS_GEN5(devid) + #define IS_GEN6(devid) (devid == PCI_CHIP_SANDYBRIDGE || \ devid == PCI_CHIP_SANDYBRIDGE_M) diff --git a/src/mesa/drivers/dri/intel/intel_decode.c b/src/mesa/drivers/dri/intel/intel_decode.c index 650010ac9ca..25b4131594f 100644 --- a/src/mesa/drivers/dri/intel/intel_decode.c +++ b/src/mesa/drivers/dri/intel/intel_decode.c @@ -1,48 +1,21 @@ -/* -*- c-basic-offset: 4 -*- */ -/* - * Copyright © 2007 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * Authors: - * Eric Anholt <[email protected]> - * - */ - -/** @file intel_decode.c - * This file contains code to print out batchbuffer contents in a - * human-readable format. - * - * The current version only supports i915 packets, and only pretty-prints a - * subset of them. The intention is for it to make just a best attempt to - * decode, but never crash in the process. - */ - +#include <stdint.h> #include <stdio.h> #include <stdarg.h> #include <string.h> -#include <inttypes.h> #include "intel_decode.h" #include "intel_chipset.h" +static FILE *out; +static uint32_t saved_s2 = 0, saved_s4 = 0; +static char saved_s2_set = 0, saved_s4_set = 0; +static uint32_t head_offset = 0xffffffff; /* undefined */ +static uint32_t tail_offset = 0xffffffff; /* undefined */ + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(A) (sizeof(A)/sizeof(A[0])) +#endif + #define BUFFER_FAIL(_count, _len, _name) do { \ fprintf(out, "Buffer size too small in %s (%d < %d)\n", \ (_name), (_count), (_len)); \ @@ -50,9 +23,6 @@ return count; \ } while (0) -static FILE *out; -static uint32_t saved_s2 = 0, saved_s4 = 0; -static char saved_s2_set = 0, saved_s4_set = 0; static float int_as_float(uint32_t intval) @@ -71,15 +41,24 @@ instr_out(uint32_t *data, uint32_t hw_offset, unsigned int index, char *fmt, ...) { va_list va; - - fprintf(out, "0x%08x: 0x%08x:%s ", hw_offset + index * 4, data[index], - index == 0 ? "" : " "); + char *parseinfo; + uint32_t offset = hw_offset + index * 4; + + if (offset == head_offset) + parseinfo = "HEAD"; + else if (offset == tail_offset) + parseinfo = "TAIL"; + else + parseinfo = " "; + + fprintf(out, "0x%08x: %s 0x%08x: %s", offset, parseinfo, + data[index], + index == 0 ? "" : " "); va_start(va, fmt); vfprintf(out, fmt, va); va_end(va); } - static int decode_mi(uint32_t *data, int count, uint32_t hw_offset, int *failures) { @@ -94,10 +73,11 @@ decode_mi(uint32_t *data, int count, uint32_t hw_offset, int *failures) } opcodes_mi[] = { { 0x08, 0, 1, 1, "MI_ARB_ON_OFF" }, { 0x0a, 0, 1, 1, "MI_BATCH_BUFFER_END" }, + { 0x30, 0x3f, 3, 3, "MI_BATCH_BUFFER" }, { 0x31, 0x3f, 2, 2, "MI_BATCH_BUFFER_START" }, { 0x14, 0x3f, 3, 3, "MI_DISPLAY_BUFFER_INFO" }, { 0x04, 0, 1, 1, "MI_FLUSH" }, - { 0x22, 0, 3, 3, "MI_LOAD_REGISTER_IMM" }, + { 0x22, 0x1f, 3, 3, "MI_LOAD_REGISTER_IMM" }, { 0x13, 0x3f, 2, 2, "MI_LOAD_SCAN_LINES_EXCL" }, { 0x12, 0x3f, 2, 2, "MI_LOAD_SCAN_LINES_INCL" }, { 0x00, 0, 1, 1, "MI_NOOP" }, @@ -111,6 +91,11 @@ decode_mi(uint32_t *data, int count, uint32_t hw_offset, int *failures) { 0x03, 0, 1, 1, "MI_WAIT_FOR_EVENT" }, }; + switch ((data[0] & 0x1f800000) >> 23) { + case 0x0a: + instr_out(data, hw_offset, 0, "MI_BATCH_BUFFER_END\n"); + return -1; + } for (opcode = 0; opcode < sizeof(opcodes_mi) / sizeof(opcodes_mi[0]); opcode++) { @@ -305,9 +290,13 @@ decode_2d(uint32_t *data, int count, uint32_t hw_offset, int *failures) static int decode_3d_1c(uint32_t *data, int count, uint32_t hw_offset, int *failures) { - switch ((data[0] & 0x00f80000) >> 19) { + uint32_t opcode; + + opcode = (data[0] & 0x00f80000) >> 19; + + switch (opcode) { case 0x11: - instr_out(data, hw_offset, 0, "3DSTATE_DEPTH_SUBRECTANGLE_DISALBE\n"); + instr_out(data, hw_offset, 0, "3DSTATE_DEPTH_SUBRECTANGLE_DISABLE\n"); return 1; case 0x10: instr_out(data, hw_offset, 0, "3DSTATE_SCISSOR_ENABLE\n"); @@ -323,7 +312,8 @@ decode_3d_1c(uint32_t *data, int count, uint32_t hw_offset, int *failures) return 1; } - instr_out(data, hw_offset, 0, "3D UNKNOWN\n"); + instr_out(data, hw_offset, 0, "3D UNKNOWN: 3d_1c opcode = 0x%x\n", + opcode); (*failures)++; return 1; } @@ -381,7 +371,7 @@ i915_get_instruction_dst(uint32_t *data, int i, char *dstname, int do_mask) sprintf(dstname, "oD%s%s", dstmask, sat); break; case 6: - if (dst_nr > 2) + if (dst_nr > 3) fprintf(out, "bad destination reg U%d\n", dst_nr); sprintf(dstname, "U%d%s%s", dst_nr, dstmask, sat); break; @@ -452,7 +442,7 @@ i915_get_instruction_src_name(uint32_t src_type, uint32_t src_nr, char *name) break; case 6: sprintf(name, "U%d", src_nr); - if (src_nr > 2) + if (src_nr > 3) fprintf(out, "bad src reg %s\n", name); break; default: @@ -797,10 +787,14 @@ i915_decode_instruction(uint32_t *data, uint32_t hw_offset, } static int -decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i830) +decode_3d_1d(uint32_t *data, int count, + uint32_t hw_offset, + uint32_t devid, + int *failures) { - unsigned int len, i, c, opcode, word, map, sampler, instr; + unsigned int len, i, c, idx, word, map, sampler, instr; char *format; + uint32_t opcode; struct { uint32_t opcode; @@ -811,7 +805,7 @@ decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i } opcodes_3d_1d[] = { { 0x8e, 0, 3, 3, "3DSTATE_BUFFER_INFO" }, { 0x86, 0, 4, 4, "3DSTATE_CHROMA_KEY" }, - { 0x9c, 0, 1, 1, "3DSTATE_CLEAR_PARAMETERS" }, + { 0x9c, 0, 7, 7, "3DSTATE_CLEAR_PARAMETERS" }, { 0x88, 0, 2, 2, "3DSTATE_CONSTANT_BLEND_COLOR" }, { 0x99, 0, 2, 2, "3DSTATE_DEFAULT_DIFFUSE" }, { 0x9a, 0, 2, 2, "3DSTATE_DEFAULT_SPECULAR" }, @@ -819,7 +813,6 @@ decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i { 0x97, 0, 2, 2, "3DSTATE_DEPTH_OFFSET_SCALE" }, { 0x85, 0, 2, 2, "3DSTATE_DEST_BUFFER_VARIABLES" }, { 0x80, 0, 5, 5, "3DSTATE_DRAWING_RECTANGLE" }, - { 0x8e, 0, 3, 3, "3DSTATE_BUFFER_INFO" }, { 0x9d, 0, 65, 65, "3DSTATE_FILTER_COEFFICIENTS_4X4" }, { 0x9e, 0, 4, 4, "3DSTATE_MONO_FILTER" }, { 0x89, 0, 4, 4, "3DSTATE_FOG_MODE" }, @@ -831,9 +824,11 @@ decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i { 0x8d, 1, 3, 3, "3DSTATE_W_STATE_I830" }, { 0x01, 1, 2, 2, "3DSTATE_COLOR_FACTOR_I830" }, { 0x02, 1, 2, 2, "3DSTATE_MAP_COORD_SETBIND_I830" }, - }; + }, *opcode_3d_1d; + + opcode = (data[0] & 0x00ff0000) >> 16; - switch ((data[0] & 0x00ff0000) >> 16) { + switch (opcode) { case 0x07: /* This instruction is unusual. A 0 length means just 1 DWORD instead of * 2. The 0 length is specified in one place to be unsupported, but @@ -888,26 +883,56 @@ decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i instr_out(data, hw_offset, 0, "3DSTATE_LOAD_STATE_IMMEDIATE_1\n"); len = (data[0] & 0x0000000f) + 2; i = 1; - for (word = 0; word <= 7; word++) { + for (word = 0; word <= 8; word++) { if (data[0] & (1 << (4 + word))) { if (i >= count) BUFFER_FAIL(count, len, "3DSTATE_LOAD_STATE_IMMEDIATE_1"); /* save vertex state for decode */ - if (word == 2) { - saved_s2_set = 1; - saved_s2 = data[i]; - } - if (word == 4) { - saved_s4_set = 1; - saved_s4 = data[i]; + if (IS_9XX(devid)) { + if (word == 2) { + saved_s2_set = 1; + saved_s2 = data[i]; + } + if (word == 4) { + saved_s4_set = 1; + saved_s4 = data[i]; + } } instr_out(data, hw_offset, i++, "S%d\n", word); } } if (len != i) { - fprintf(out, "Bad count in 3DSTATE_LOAD_INDIRECT\n"); + fprintf(out, "Bad count in 3DSTATE_LOAD_STATE_IMMEDIATE_1\n"); + (*failures)++; + } + return len; + case 0x03: + instr_out(data, hw_offset, 0, "3DSTATE_LOAD_STATE_IMMEDIATE_2\n"); + len = (data[0] & 0x0000000f) + 2; + i = 1; + for (word = 6; word <= 14; word++) { + if (data[0] & (1 << word)) { + if (i >= count) + BUFFER_FAIL(count, len, "3DSTATE_LOAD_STATE_IMMEDIATE_2"); + + if (word == 6) + instr_out(data, hw_offset, i++, "TBCF\n"); + else if (word >= 7 && word <= 10) { + instr_out(data, hw_offset, i++, "TB%dC\n", word - 7); + instr_out(data, hw_offset, i++, "TB%dA\n", word - 7); + } else if (word >= 11 && word <= 14) { + instr_out(data, hw_offset, i++, "TM%dS0\n", word - 11); + instr_out(data, hw_offset, i++, "TM%dS1\n", word - 11); + instr_out(data, hw_offset, i++, "TM%dS2\n", word - 11); + instr_out(data, hw_offset, i++, "TM%dS3\n", word - 11); + instr_out(data, hw_offset, i++, "TM%dS4\n", word - 11); + } + } + } + if (len != i) { + fprintf(out, "Bad count in 3DSTATE_LOAD_STATE_IMMEDIATE_2\n"); (*failures)++; } return len; @@ -919,11 +944,28 @@ decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i i = 2; for (map = 0; map <= 15; map++) { if (data[1] & (1 << map)) { + int width, height, pitch, dword; + const char *tiling; + if (i + 3 >= count) BUFFER_FAIL(count, len, "3DSTATE_MAP_STATE"); + instr_out(data, hw_offset, i++, "map %d MS2\n", map); - instr_out(data, hw_offset, i++, "map %d MS3\n", map); - instr_out(data, hw_offset, i++, "map %d MS4\n", map); + + dword = data[i]; + width = ((dword >> 10) & ((1 << 11) - 1))+1; + height = ((dword >> 21) & ((1 << 11) - 1))+1; + + tiling = "none"; + if (dword & (1 << 2)) + tiling = "fenced"; + else if (dword & (1 << 1)) + tiling = dword & (1 << 0) ? "Y" : "X"; + instr_out(data, hw_offset, i++, "map %d MS3 [width=%d, height=%d, tiling=%s]\n", map, width, height, tiling); + + dword = data[i]; + pitch = 4*(((dword >> 21) & ((1 << 11) - 1))+1); + instr_out(data, hw_offset, i++, "map %d MS4 [pitch=%d]\n", map, pitch); } } if (len != i) { @@ -979,8 +1021,8 @@ decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i } return len; case 0x01: - if (i830) - break; + if (!IS_9XX(devid)) + break; instr_out(data, hw_offset, 0, "3DSTATE_SAMPLER_STATE\n"); instr_out(data, hw_offset, 1, "mask\n"); len = (data[0] & 0x0000003f) + 2; @@ -1031,32 +1073,61 @@ decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i format, (data[1] & (1 << 31)) ? "en" : "dis"); return len; + + case 0x8e: + { + const char *name, *tiling; + + len = (data[0] & 0x0000000f) + 2; + if (len != 3) + fprintf(out, "Bad count in 3DSTATE_BUFFER_INFO\n"); + if (count < 3) + BUFFER_FAIL(count, len, "3DSTATE_BUFFER_INFO"); + + switch((data[1] >> 24) & 0x7) { + case 0x3: name = "color"; break; + case 0x7: name = "depth"; break; + default: name = "unknown"; break; + } + + tiling = "none"; + if (data[1] & (1 << 23)) + tiling = "fenced"; + else if (data[1] & (1 << 22)) + tiling = data[1] & (1 << 21) ? "Y" : "X"; + + instr_out(data, hw_offset, 0, "3DSTATE_BUFFER_INFO\n"); + instr_out(data, hw_offset, 1, "%s, tiling = %s, pitch=%d\n", name, tiling, data[1]&0xffff); + + instr_out(data, hw_offset, 2, "address\n"); + return len; + } } - for (opcode = 0; opcode < sizeof(opcodes_3d_1d) / sizeof(opcodes_3d_1d[0]); - opcode++) + for (idx = 0; idx < ARRAY_SIZE(opcodes_3d_1d); idx++) { - if (opcodes_3d_1d[opcode].i830_only && !i830) + opcode_3d_1d = &opcodes_3d_1d[idx]; + if (opcode_3d_1d->i830_only && IS_9XX(devid)) continue; - if (((data[0] & 0x00ff0000) >> 16) == opcodes_3d_1d[opcode].opcode) { + if (((data[0] & 0x00ff0000) >> 16) == opcode_3d_1d->opcode) { len = 1; - instr_out(data, hw_offset, 0, "%s\n", opcodes_3d_1d[opcode].name); - if (opcodes_3d_1d[opcode].max_len > 1) { + instr_out(data, hw_offset, 0, "%s\n", opcode_3d_1d->name); + if (opcode_3d_1d->max_len > 1) { len = (data[0] & 0x0000ffff) + 2; - if (len < opcodes_3d_1d[opcode].min_len || - len > opcodes_3d_1d[opcode].max_len) + if (len < opcode_3d_1d->min_len || + len > opcode_3d_1d->max_len) { fprintf(out, "Bad count in %s\n", - opcodes_3d_1d[opcode].name); + opcode_3d_1d->name); (*failures)++; } } for (i = 1; i < len; i++) { if (i >= count) - BUFFER_FAIL(count, len, opcodes_3d_1d[opcode].name); + BUFFER_FAIL(count, len, opcode_3d_1d->name); instr_out(data, hw_offset, i, "dword %d\n", i); } @@ -1064,7 +1135,7 @@ decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i } } - instr_out(data, hw_offset, 0, "3D UNKNOWN\n"); + instr_out(data, hw_offset, 0, "3D UNKNOWN: 3d_1d opcode = 0x%x\n", opcode); (*failures)++; return 1; } @@ -1074,8 +1145,10 @@ decode_3d_primitive(uint32_t *data, int count, uint32_t hw_offset, int *failures) { char immediate = (data[0] & (1 << 23)) == 0; - unsigned int len, i; + unsigned int len, i, ret; char *primtype; + int original_s2 = saved_s2; + int original_s4 = saved_s4; switch ((data[0] >> 18) & 0xf) { case 0x0: primtype = "TRILIST"; break; @@ -1088,7 +1161,7 @@ decode_3d_primitive(uint32_t *data, int count, uint32_t hw_offset, case 0x7: primtype = "RECTLIST"; break; case 0x8: primtype = "POINTLIST"; break; case 0x9: primtype = "DIB"; break; - case 0xa: primtype = "CLEAR_RECT"; break; + case 0xa: primtype = "CLEAR_RECT"; saved_s4 = 3 << 6; saved_s2 = ~0; break; default: primtype = "unknown"; break; } @@ -1192,6 +1265,8 @@ decode_3d_primitive(uint32_t *data, int count, uint32_t hw_offset, vertex++; } } + + ret = len; } else { /* indirect vertices */ len = data[0] & 0x0000ffff; /* index count */ @@ -1209,13 +1284,15 @@ decode_3d_primitive(uint32_t *data, int count, uint32_t hw_offset, if ((data[i] & 0xffff) == 0xffff) { instr_out(data, hw_offset, i, " indices: (terminator)\n"); - return i; + ret = i; + goto out; } else if ((data[i] >> 16) == 0xffff) { instr_out(data, hw_offset, i, " indices: 0x%04x, " "(terminator)\n", data[i] & 0xffff); - return i; + ret = i; + goto out; } else { instr_out(data, hw_offset, i, " indices: 0x%04x, 0x%04x\n", @@ -1225,7 +1302,8 @@ decode_3d_primitive(uint32_t *data, int count, uint32_t hw_offset, fprintf(out, "3DPRIMITIVE: no terminator found in index buffer\n"); (*failures)++; - return count; + ret = count; + goto out; } else { /* fixed size vertex index buffer */ for (i = 0; i < len; i += 2) { @@ -1240,7 +1318,8 @@ decode_3d_primitive(uint32_t *data, int count, uint32_t hw_offset, } } } - return (len + 1) / 2 + 1; + ret = (len + 1) / 2 + 1; + goto out; } else { /* sequential vertex access */ if (count < 2) @@ -1249,17 +1328,22 @@ decode_3d_primitive(uint32_t *data, int count, uint32_t hw_offset, "3DPRIMITIVE sequential indirect %s, %d starting from " "%d\n", primtype, len, data[1] & 0xffff); instr_out(data, hw_offset, 1, " start\n"); - return 2; + ret = 2; + goto out; } } - return len; +out: + saved_s2 = original_s2; + saved_s4 = original_s4; + return ret; } static int -decode_3d(uint32_t *data, int count, uint32_t hw_offset, int *failures) +decode_3d(uint32_t *data, int count, uint32_t hw_offset, uint32_t devid, int *failures) { - unsigned int opcode; + uint32_t opcode; + unsigned int idx; struct { uint32_t opcode; @@ -1276,42 +1360,44 @@ decode_3d(uint32_t *data, int count, uint32_t hw_offset, int *failures) { 0x0d, 1, 1, "3DSTATE_MODES_4" }, { 0x0c, 1, 1, "3DSTATE_MODES_5" }, { 0x07, 1, 1, "3DSTATE_RASTERIZATION_RULES" }, - }; + }, *opcode_3d; + + opcode = (data[0] & 0x1f000000) >> 24; - switch ((data[0] & 0x1f000000) >> 24) { + switch (opcode) { case 0x1f: return decode_3d_primitive(data, count, hw_offset, failures); case 0x1d: - return decode_3d_1d(data, count, hw_offset, failures, 0); + return decode_3d_1d(data, count, hw_offset, devid, failures); case 0x1c: return decode_3d_1c(data, count, hw_offset, failures); } - for (opcode = 0; opcode < sizeof(opcodes_3d) / sizeof(opcodes_3d[0]); - opcode++) { - if ((data[0] & 0x1f000000) >> 24 == opcodes_3d[opcode].opcode) { + for (idx = 0; idx < ARRAY_SIZE(opcodes_3d); idx++) { + opcode_3d = &opcodes_3d[idx]; + if (opcode == opcode_3d->opcode) { unsigned int len = 1, i; - instr_out(data, hw_offset, 0, "%s\n", opcodes_3d[opcode].name); - if (opcodes_3d[opcode].max_len > 1) { + instr_out(data, hw_offset, 0, "%s\n", opcode_3d->name); + if (opcode_3d->max_len > 1) { len = (data[0] & 0xff) + 2; - if (len < opcodes_3d[opcode].min_len || - len > opcodes_3d[opcode].max_len) + if (len < opcode_3d->min_len || + len > opcode_3d->max_len) { - fprintf(out, "Bad count in %s\n", opcodes_3d[opcode].name); + fprintf(out, "Bad count in %s\n", opcode_3d->name); } } for (i = 1; i < len; i++) { if (i >= count) - BUFFER_FAIL(count, len, opcodes_3d[opcode].name); + BUFFER_FAIL(count, len, opcode_3d->name); instr_out(data, hw_offset, i, "dword %d\n", i); } return len; } } - instr_out(data, hw_offset, 0, "3D UNKNOWN\n"); + instr_out(data, hw_offset, 0, "3D UNKNOWN: 3d opcode = 0x%x\n", opcode); (*failures)++; return 1; } @@ -1403,11 +1489,86 @@ get_965_prim_type(uint32_t data) } static int -decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures) +i965_decode_urb_fence(uint32_t *data, uint32_t hw_offset, int len, int count, + int *failures) { - unsigned int opcode, len; - int i; - char *desc1; + uint32_t vs_fence, clip_fence, gs_fence, sf_fence, vfe_fence, cs_fence; + + if (len != 3) + fprintf(out, "Bad count in URB_FENCE\n"); + if (count < 3) + BUFFER_FAIL(count, len, "URB_FENCE"); + + vs_fence = data[1] & 0x3ff; + gs_fence = (data[1] >> 10) & 0x3ff; + clip_fence = (data[1] >> 20) & 0x3ff; + sf_fence = data[2] & 0x3ff; + vfe_fence = (data[2] >> 10) & 0x3ff; + cs_fence = (data[2] >> 20) & 0x7ff; + + instr_out(data, hw_offset, 0, "URB_FENCE: %s%s%s%s%s%s\n", + (data[0] >> 13) & 1 ? "cs " : "", + (data[0] >> 12) & 1 ? "vfe " : "", + (data[0] >> 11) & 1 ? "sf " : "", + (data[0] >> 10) & 1 ? "clip " : "", + (data[0] >> 9) & 1 ? "gs " : "", + (data[0] >> 8) & 1 ? "vs " : ""); + instr_out(data, hw_offset, 1, + "vs fence: %d, clip_fence: %d, gs_fence: %d\n", + vs_fence, clip_fence, gs_fence); + instr_out(data, hw_offset, 2, + "sf fence: %d, vfe_fence: %d, cs_fence: %d\n", + sf_fence, vfe_fence, cs_fence); + if (gs_fence < vs_fence) + fprintf(out, "gs fence < vs fence!\n"); + if (clip_fence < gs_fence) + fprintf(out, "clip fence < gs fence!\n"); + if (sf_fence < clip_fence) + fprintf(out, "sf fence < clip fence!\n"); + if (cs_fence < sf_fence) + fprintf(out, "cs fence < sf fence!\n"); + + return len; +} + +static void +state_base_out(uint32_t *data, uint32_t hw_offset, unsigned int index, + char *name) +{ + if (data[index] & 1) { + instr_out(data, hw_offset, index, "%s state base address 0x%08x\n", + name, data[index] & ~1); + } else { + instr_out(data, hw_offset, index, "%s state base not updated\n", + name); + } +} + +static void +state_max_out(uint32_t *data, uint32_t hw_offset, unsigned int index, + char *name) +{ + if (data[index] & 1) { + if (data[index] == 1) { + instr_out(data, hw_offset, index, + "%s state upper bound disabled\n", name); + } else { + instr_out(data, hw_offset, index, "%s state upper bound 0x%08x\n", + name, data[index] & ~1); + } + } else { + instr_out(data, hw_offset, index, "%s state upper bound not updated\n", + name); + } +} + +static int +decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, uint32_t devid, int *failures) +{ + uint32_t opcode; + unsigned int idx, len; + int i, sba_len; + char *desc1 = NULL; struct { uint32_t opcode; @@ -1436,57 +1597,78 @@ decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures) { 0x7907, 33, 33, "3DSTATE_POLY_STIPPLE_PATTERN" }, { 0x7908, 3, 3, "3DSTATE_LINE_STIPPLE" }, { 0x7909, 2, 2, "3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP" }, + { 0x7909, 2, 2, "3DSTATE_CLEAR_PARAMS" }, { 0x790a, 3, 3, "3DSTATE_AA_LINE_PARAMETERS" }, + { 0x790b, 4, 4, "3DSTATE_GS_SVB_INDEX" }, + { 0x790d, 3, 3, "3DSTATE_MULTISAMPLE" }, { 0x7b00, 6, 6, "3DPRIMITIVE" }, + { 0x7802, 4, 4, "3DSTATE_SAMPLER_STATE_POINTERS" }, + { 0x7805, 3, 3, "3DSTATE_URB" }, { 0x780e, 4, 4, "3DSTATE_CC_STATE_POINTERS" }, { 0x7810, 6, 6, "3DSTATE_VS_STATE" }, - { 0x7811, 6, 6, "3DSTATE_GS_STATE" }, + { 0x7811, 7, 7, "3DSTATE_GS_STATE" }, + { 0x7812, 4, 4, "3DSTATE_CLIP_STATE" }, + { 0x7813, 20, 20, "3DSTATE_SF_STATE" }, + { 0x7814, 9, 9, "3DSTATE_WM_STATE" }, { 0x7812, 4, 4, "3DSTATE_CLIP_STATE" }, { 0x7815, 5, 5, "3DSTATE_CONSTANT_VS_STATE" }, { 0x7816, 5, 5, "3DSTATE_CONSTANT_GS_STATE" }, - }; + { 0x7817, 5, 5, "3DSTATE_CONSTANT_PS_STATE" }, + { 0x7818, 2, 2, "3DSTATE_SAMPLE_MASK" }, + }, *opcode_3d; len = (data[0] & 0x0000ffff) + 2; - switch ((data[0] & 0xffff0000) >> 16) { + opcode = (data[0] & 0xffff0000) >> 16; + switch (opcode) { + case 0x6000: + len = (data[0] & 0x000000ff) + 2; + return i965_decode_urb_fence(data, hw_offset, len, count, failures); + case 0x6001: + instr_out(data, hw_offset, 0, "CS_URB_STATE\n"); + instr_out(data, hw_offset, 1, "entry_size: %d [%d bytes], n_entries: %d\n", + (data[1] >> 4) & 0x1f, + (((data[1] >> 4) & 0x1f) + 1) * 64, + data[1] & 0x7); + return len; + case 0x6002: + len = (data[0] & 0x000000ff) + 2; + instr_out(data, hw_offset, 0, "CONSTANT_BUFFER: %s\n", + (data[0] >> 8) & 1 ? "valid" : "invalid"); + instr_out(data, hw_offset, 1, "offset: 0x%08x, length: %d bytes\n", + data[1] & ~0x3f, ((data[1] & 0x3f) + 1) * 64); + return len; case 0x6101: - if (len != 6) + if (IS_GEN6(devid)) + sba_len = 10; + else if (IS_IRONLAKE(devid)) + sba_len = 8; + else + sba_len = 6; + if (len != sba_len) fprintf(out, "Bad count in STATE_BASE_ADDRESS\n"); - if (count < 6) + if (len != sba_len) BUFFER_FAIL(count, len, "STATE_BASE_ADDRESS"); + i = 0; instr_out(data, hw_offset, 0, "STATE_BASE_ADDRESS\n"); - - if (data[1] & 1) { - instr_out(data, hw_offset, 1, "General state at 0x%08x\n", - data[1] & ~1); - } else - instr_out(data, hw_offset, 1, "General state not updated\n"); - - if (data[2] & 1) { - instr_out(data, hw_offset, 2, "Surface state at 0x%08x\n", - data[2] & ~1); - } else - instr_out(data, hw_offset, 2, "Surface state not updated\n"); - - if (data[3] & 1) { - instr_out(data, hw_offset, 3, "Indirect state at 0x%08x\n", - data[3] & ~1); - } else - instr_out(data, hw_offset, 3, "Indirect state not updated\n"); - - if (data[4] & 1) { - instr_out(data, hw_offset, 4, "General state upper bound 0x%08x\n", - data[4] & ~1); - } else - instr_out(data, hw_offset, 4, "General state not updated\n"); - - if (data[5] & 1) { - instr_out(data, hw_offset, 5, "Indirect state upper bound 0x%08x\n", - data[5] & ~1); - } else - instr_out(data, hw_offset, 5, "Indirect state not updated\n"); + i++; + + state_base_out(data, hw_offset, i++, "general"); + state_base_out(data, hw_offset, i++, "surface"); + if (IS_GEN6(devid)) + state_base_out(data, hw_offset, i++, "dynamic"); + state_base_out(data, hw_offset, i++, "indirect"); + if (IS_IRONLAKE(devid) || IS_GEN6(devid)) + state_base_out(data, hw_offset, i++, "instruction"); + + state_max_out(data, hw_offset, i++, "general"); + if (IS_GEN6(devid)) + state_max_out(data, hw_offset, i++, "dynamic"); + state_max_out(data, hw_offset, i++, "indirect"); + if (IS_IRONLAKE(devid) || IS_GEN6(devid)) + state_max_out(data, hw_offset, i++, "instruction"); return len; case 0x7800: @@ -1505,18 +1687,33 @@ decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures) instr_out(data, hw_offset, 6, "CC state\n"); return len; case 0x7801: - if (len != 6) + len = (data[0] & 0x000000ff) + 2; + if (len != 6 && len != 4) fprintf(out, "Bad count in 3DSTATE_BINDING_TABLE_POINTERS\n"); - if (count < 6) - BUFFER_FAIL(count, len, "3DSTATE_BINDING_TABLE_POINTERS"); + if (len == 6) { + if (count < 6) + BUFFER_FAIL(count, len, "3DSTATE_BINDING_TABLE_POINTERS"); + instr_out(data, hw_offset, 0, + "3DSTATE_BINDING_TABLE_POINTERS\n"); + instr_out(data, hw_offset, 1, "VS binding table\n"); + instr_out(data, hw_offset, 2, "GS binding table\n"); + instr_out(data, hw_offset, 3, "Clip binding table\n"); + instr_out(data, hw_offset, 4, "SF binding table\n"); + instr_out(data, hw_offset, 5, "WM binding table\n"); + } else { + if (count < 4) + BUFFER_FAIL(count, len, "3DSTATE_BINDING_TABLE_POINTERS"); - instr_out(data, hw_offset, 0, - "3DSTATE_BINDING_TABLE_POINTERS\n"); - instr_out(data, hw_offset, 1, "VS binding table\n"); - instr_out(data, hw_offset, 2, "GS binding table\n"); - instr_out(data, hw_offset, 3, "Clip binding table\n"); - instr_out(data, hw_offset, 4, "SF binding table\n"); - instr_out(data, hw_offset, 5, "WM binding table\n"); + instr_out(data, hw_offset, 0, + "3DSTATE_BINDING_TABLE_POINTERS: VS mod %d, " + "GS mod %d, PS mod %d\n", + (data[0] & (1 << 8)) != 0, + (data[0] & (1 << 9)) != 0, + (data[0] & (1 << 10)) != 0); + instr_out(data, hw_offset, 1, "VS binding table\n"); + instr_out(data, hw_offset, 2, "GS binding table\n"); + instr_out(data, hw_offset, 3, "WM binding table\n"); + } return len; @@ -1567,6 +1764,18 @@ decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures) } return len; + case 0x780d: + len = (data[0] & 0xff) + 2; + if (len != 4) + fprintf(out, "Bad count in 3DSTATE_VIEWPORT_STATE_POINTERS\n"); + if (count < len) + BUFFER_FAIL(count, len, "3DSTATE_VIEWPORT_STATE_POINTERS"); + instr_out(data, hw_offset, 0, "3DSTATE_VIEWPORT_STATE_POINTERS\n"); + instr_out(data, hw_offset, 1, "clip\n"); + instr_out(data, hw_offset, 2, "sf\n"); + instr_out(data, hw_offset, 3, "cc\n"); + return len; + case 0x780a: len = (data[0] & 0xff) + 2; if (len != 3) @@ -1616,10 +1825,10 @@ decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures) ((data[3] & 0x0007ffc0) >> 6) + 1, ((data[3] & 0xfff80000) >> 19) + 1); instr_out(data, hw_offset, 4, "volume depth\n"); - if (len == 6) + if (len >= 6) instr_out(data, hw_offset, 5, "\n"); - if (len == 7) - instr_out(data, hw_offset, 6, "render target view extent\n"); + if (len >= 7) + instr_out(data, hw_offset, 6, "render target view extent\n"); return len; @@ -1638,12 +1847,11 @@ decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures) } instr_out(data, hw_offset, 0, "PIPE_CONTROL: %s, %sdepth stall, %sRC write flush, " - "%sinst flush, %stexture flush\n", + "%sinst flush\n", desc1, data[0] & (1 << 13) ? "" : "no ", data[0] & (1 << 12) ? "" : "no ", - data[0] & (1 << 11) ? "" : "no ", - data[0] & (1 << 9) ? "" : "no "); + data[0] & (1 << 11) ? "" : "no "); instr_out(data, hw_offset, 1, "destination address\n"); instr_out(data, hw_offset, 2, "immediate dword low\n"); instr_out(data, hw_offset, 3, "immediate dword high\n"); @@ -1668,40 +1876,41 @@ decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures) return len; } - for (opcode = 0; opcode < sizeof(opcodes_3d) / sizeof(opcodes_3d[0]); - opcode++) { - if ((data[0] & 0xffff0000) >> 16 == opcodes_3d[opcode].opcode) { + for (idx = 0; idx < ARRAY_SIZE(opcodes_3d); idx++) { + opcode_3d = &opcodes_3d[idx]; + if ((data[0] & 0xffff0000) >> 16 == opcode_3d->opcode) { unsigned int i; len = 1; - instr_out(data, hw_offset, 0, "%s\n", opcodes_3d[opcode].name); - if (opcodes_3d[opcode].max_len > 1) { + instr_out(data, hw_offset, 0, "%s\n", opcode_3d->name); + if (opcode_3d->max_len > 1) { len = (data[0] & 0xff) + 2; - if (len < opcodes_3d[opcode].min_len || - len > opcodes_3d[opcode].max_len) + if (len < opcode_3d->min_len || + len > opcode_3d->max_len) { - fprintf(out, "Bad count in %s\n", opcodes_3d[opcode].name); + fprintf(out, "Bad count in %s\n", opcode_3d->name); } } for (i = 1; i < len; i++) { if (i >= count) - BUFFER_FAIL(count, len, opcodes_3d[opcode].name); + BUFFER_FAIL(count, len, opcode_3d->name); instr_out(data, hw_offset, i, "dword %d\n", i); } return len; } } - instr_out(data, hw_offset, 0, "3D UNKNOWN\n"); + instr_out(data, hw_offset, 0, "3D UNKNOWN: 3d_965 opcode = 0x%x\n", opcode); (*failures)++; return 1; } static int -decode_3d_i830(uint32_t *data, int count, uint32_t hw_offset, int *failures) +decode_3d_i830(uint32_t *data, int count, uint32_t hw_offset, uint32_t devid, int *failures) { - unsigned int opcode; + unsigned int idx; + uint32_t opcode; struct { uint32_t opcode; @@ -1725,42 +1934,44 @@ decode_3d_i830(uint32_t *data, int count, uint32_t hw_offset, int *failures) { 0x0f, 1, 1, "3DSTATE_MODES_2" }, { 0x15, 1, 1, "3DSTATE_FOG_COLOR" }, { 0x16, 1, 1, "3DSTATE_MODES_4" }, - }; + }, *opcode_3d; - switch ((data[0] & 0x1f000000) >> 24) { + opcode = (data[0] & 0x1f000000) >> 24; + + switch (opcode) { case 0x1f: return decode_3d_primitive(data, count, hw_offset, failures); case 0x1d: - return decode_3d_1d(data, count, hw_offset, failures, 1); + return decode_3d_1d(data, count, hw_offset, devid, failures); case 0x1c: return decode_3d_1c(data, count, hw_offset, failures); } - for (opcode = 0; opcode < sizeof(opcodes_3d) / sizeof(opcodes_3d[0]); - opcode++) { - if ((data[0] & 0x1f000000) >> 24 == opcodes_3d[opcode].opcode) { + for (idx = 0; idx < ARRAY_SIZE(opcodes_3d); idx++) { + opcode_3d = &opcodes_3d[idx]; + if ((data[0] & 0x1f000000) >> 24 == opcode_3d->opcode) { unsigned int len = 1, i; - instr_out(data, hw_offset, 0, "%s\n", opcodes_3d[opcode].name); - if (opcodes_3d[opcode].max_len > 1) { + instr_out(data, hw_offset, 0, "%s\n", opcode_3d->name); + if (opcode_3d->max_len > 1) { len = (data[0] & 0xff) + 2; - if (len < opcodes_3d[opcode].min_len || - len > opcodes_3d[opcode].max_len) + if (len < opcode_3d->min_len || + len > opcode_3d->max_len) { - fprintf(out, "Bad count in %s\n", opcodes_3d[opcode].name); + fprintf(out, "Bad count in %s\n", opcode_3d->name); } } for (i = 1; i < len; i++) { if (i >= count) - BUFFER_FAIL(count, len, opcodes_3d[opcode].name); + BUFFER_FAIL(count, len, opcode_3d->name); instr_out(data, hw_offset, i, "dword %d\n", i); } return len; } } - instr_out(data, hw_offset, 0, "3D UNKNOWN\n"); + instr_out(data, hw_offset, 0, "3D UNKNOWN: 3d_i830 opcode = 0x%x\n", opcode); (*failures)++; return 1; } @@ -1773,18 +1984,37 @@ decode_3d_i830(uint32_t *data, int count, uint32_t hw_offset, int *failures) * \param hw_offset hardware address for the buffer */ int -intel_decode(uint32_t *data, int count, uint32_t hw_offset, uint32_t devid) +intel_decode(uint32_t *data, int count, + uint32_t hw_offset, + uint32_t devid, + uint32_t ignore_end_of_batchbuffer) { + int ret; int index = 0; int failures = 0; - out = stderr; + out = stdout; while (index < count) { switch ((data[index] & 0xe0000000) >> 29) { case 0x0: - index += decode_mi(data + index, count - index, + ret = decode_mi(data + index, count - index, hw_offset + index * 4, &failures); + + /* If MI_BATCHBUFFER_END happened, then dump the rest of the + * output in case we some day want it in debugging, but don't + * decode it since it'll just confuse in the common case. + */ + if (ret == -1) { + if (ignore_end_of_batchbuffer) { + index++; + } else { + for (index = index + 1; index < count; index++) { + instr_out(data, hw_offset, index, "\n"); + } + } + } else + index += ret; break; case 0x2: index += decode_2d(data + index, count - index, @@ -1793,13 +2023,16 @@ intel_decode(uint32_t *data, int count, uint32_t hw_offset, uint32_t devid) case 0x3: if (IS_965(devid)) { index += decode_3d_965(data + index, count - index, - hw_offset + index * 4, &failures); + hw_offset + index * 4, + devid, &failures); } else if (IS_9XX(devid)) { index += decode_3d(data + index, count - index, - hw_offset + index * 4, &failures); + hw_offset + index * 4, + devid, &failures); } else { index += decode_3d_i830(data + index, count - index, - hw_offset + index * 4, &failures); + hw_offset + index * 4, + devid, &failures); } break; default: @@ -1820,3 +2053,8 @@ void intel_decode_context_reset(void) saved_s4_set = 1; } +void intel_decode_context_set_head_tail(uint32_t head, uint32_t tail) +{ + head_offset = head; + tail_offset = tail; +} diff --git a/src/mesa/drivers/dri/intel/intel_decode.h b/src/mesa/drivers/dri/intel/intel_decode.h index c50644a46b5..a13b075cef8 100644 --- a/src/mesa/drivers/dri/intel/intel_decode.h +++ b/src/mesa/drivers/dri/intel/intel_decode.h @@ -25,5 +25,7 @@ * */ -int intel_decode(uint32_t *data, int count, uint32_t hw_offset, uint32_t devid); +int intel_decode(uint32_t *data, int count, uint32_t hw_offset, uint32_t devid, + uint32_t ignore_end_of_batchbuffer); +void intel_decode_context_set_head_tail(uint32_t head, uint32_t tail); void intel_decode_context_reset(void); diff --git a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c index 076fee89bdd..0e2fe893fed 100644 --- a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c +++ b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c @@ -26,6 +26,7 @@ **************************************************************************/ #include "main/glheader.h" +#include "main/arbprogram.h" #include "main/enums.h" #include "main/image.h" #include "main/colormac.h" @@ -44,7 +45,6 @@ #include "main/attrib.h" #include "main/enable.h" #include "main/viewport.h" -#include "shader/arbprogram.h" #include "swrast/swrast.h" #include "intel_screen.h" diff --git a/src/mesa/drivers/dri/mach64/mach64_screen.c b/src/mesa/drivers/dri/mach64/mach64_screen.c index 4bd6dee6c0e..239e8bc8fd0 100644 --- a/src/mesa/drivers/dri/mach64/mach64_screen.c +++ b/src/mesa/drivers/dri/mach64/mach64_screen.c @@ -256,7 +256,6 @@ mach64CreateScreen( __DRIscreen *sPriv ) mach64Screen->driScreen = sPriv; i = 0; - mach64Screen->extensions[i++] = &driFrameTrackingExtension.base; if ( mach64Screen->irq != 0 ) { mach64Screen->extensions[i++] = &driSwapControlExtension.base; mach64Screen->extensions[i++] = &driMediaStreamCounterExtension.base; diff --git a/src/mesa/drivers/dri/mga/mga_xmesa.c b/src/mesa/drivers/dri/mga/mga_xmesa.c index 31007ccb1da..3a31dfb44a3 100644 --- a/src/mesa/drivers/dri/mga/mga_xmesa.c +++ b/src/mesa/drivers/dri/mga/mga_xmesa.c @@ -182,7 +182,6 @@ mgaFillInModes( __DRIscreen *psp, const __DRIextension *mgaScreenExtensions[] = { &driReadDrawableExtension, &driSwapControlExtension.base, - &driFrameTrackingExtension.base, &driMediaStreamCounterExtension.base, NULL }; diff --git a/src/mesa/drivers/dri/r128/r128_screen.c b/src/mesa/drivers/dri/r128/r128_screen.c index 2d918028236..7626a159d6a 100644 --- a/src/mesa/drivers/dri/r128/r128_screen.c +++ b/src/mesa/drivers/dri/r128/r128_screen.c @@ -221,7 +221,6 @@ r128CreateScreen( __DRIscreen *sPriv ) r128Screen->driScreen = sPriv; i = 0; - r128Screen->extensions[i++] = &driFrameTrackingExtension.base; if ( r128Screen->irq != 0 ) { r128Screen->extensions[i++] = &driSwapControlExtension.base; r128Screen->extensions[i++] = &driMediaStreamCounterExtension.base; diff --git a/src/mesa/drivers/dri/r128/r128_state.c b/src/mesa/drivers/dri/r128/r128_state.c index 4d773feaaa8..9ad25f7f463 100644 --- a/src/mesa/drivers/dri/r128/r128_state.c +++ b/src/mesa/drivers/dri/r128/r128_state.c @@ -42,6 +42,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. #include "main/context.h" #include "main/enums.h" #include "main/colormac.h" +#include "main/macros.h" #include "swrast/swrast.h" #include "vbo/vbo.h" #include "tnl/tnl.h" diff --git a/src/mesa/drivers/dri/r128/r128_tex.c b/src/mesa/drivers/dri/r128/r128_tex.c index 4ec4be9a47b..b5a19b510af 100644 --- a/src/mesa/drivers/dri/r128/r128_tex.c +++ b/src/mesa/drivers/dri/r128/r128_tex.c @@ -44,6 +44,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. #include "main/texobj.h" #include "main/imports.h" #include "main/texobj.h" +#include "main/macros.h" #include "xmlpool.h" diff --git a/src/mesa/drivers/dri/r200/r200_fragshader.c b/src/mesa/drivers/dri/r200/r200_fragshader.c index 85c1b7bdd19..2a9268dd343 100644 --- a/src/mesa/drivers/dri/r200/r200_fragshader.c +++ b/src/mesa/drivers/dri/r200/r200_fragshader.c @@ -26,11 +26,11 @@ **************************************************************************/ #include "main/glheader.h" +#include "main/atifragshader.h" #include "main/macros.h" #include "main/enums.h" #include "tnl/t_context.h" -#include "shader/atifragshader.h" -#include "shader/program.h" +#include "program/program.h" #include "r200_context.h" #include "r200_ioctl.h" #include "r200_tex.h" diff --git a/src/mesa/drivers/dri/r200/r200_ioctl.c b/src/mesa/drivers/dri/r200/r200_ioctl.c index b72f69b7f45..df73de5394a 100644 --- a/src/mesa/drivers/dri/r200/r200_ioctl.c +++ b/src/mesa/drivers/dri/r200/r200_ioctl.c @@ -253,113 +253,6 @@ static void r200Clear( GLcontext *ctx, GLbitfield mask ) } } -/* This version of AllocateMemoryMESA allocates only GART memory, and - * only does so after the point at which the driver has been - * initialized. - * - * Theoretically a valid context isn't required. However, in this - * implementation, it is, as I'm using the hardware lock to protect - * the kernel data structures, and the current context to get the - * device fd. - */ -void *r200AllocateMemoryMESA(__DRIscreen *screen, GLsizei size, - GLfloat readfreq, GLfloat writefreq, - GLfloat priority) -{ - GET_CURRENT_CONTEXT(ctx); - r200ContextPtr rmesa; - int region_offset; - drm_radeon_mem_alloc_t alloc; - int ret; - - if (R200_DEBUG & RADEON_IOCTL) - fprintf(stderr, "%s sz %d %f/%f/%f\n", __FUNCTION__, size, readfreq, - writefreq, priority); - - if (!ctx || !(rmesa = R200_CONTEXT(ctx)) || !rmesa->radeon.radeonScreen->gartTextures.map) - return NULL; - - if (getenv("R200_NO_ALLOC")) - return NULL; - - alloc.region = RADEON_MEM_REGION_GART; - alloc.alignment = 0; - alloc.size = size; - alloc.region_offset = ®ion_offset; - - ret = drmCommandWriteRead( rmesa->radeon.radeonScreen->driScreen->fd, - DRM_RADEON_ALLOC, - &alloc, sizeof(alloc)); - - if (ret) { - fprintf(stderr, "%s: DRM_RADEON_ALLOC ret %d\n", __FUNCTION__, ret); - return NULL; - } - - { - char *region_start = (char *)rmesa->radeon.radeonScreen->gartTextures.map; - return (void *)(region_start + region_offset); - } -} - - -/* Called via glXFreeMemoryMESA() */ -void r200FreeMemoryMESA(__DRIscreen *screen, GLvoid *pointer) -{ - GET_CURRENT_CONTEXT(ctx); - r200ContextPtr rmesa; - ptrdiff_t region_offset; - drm_radeon_mem_free_t memfree; - int ret; - - if (R200_DEBUG & RADEON_IOCTL) - fprintf(stderr, "%s %p\n", __FUNCTION__, pointer); - - if (!ctx || !(rmesa = R200_CONTEXT(ctx)) || !rmesa->radeon.radeonScreen->gartTextures.map) { - fprintf(stderr, "%s: no context\n", __FUNCTION__); - return; - } - - region_offset = (char *)pointer - (char *)rmesa->radeon.radeonScreen->gartTextures.map; - - if (region_offset < 0 || - region_offset > rmesa->radeon.radeonScreen->gartTextures.size) { - fprintf(stderr, "offset %d outside range 0..%d\n", region_offset, - rmesa->radeon.radeonScreen->gartTextures.size); - return; - } - - memfree.region = RADEON_MEM_REGION_GART; - memfree.region_offset = region_offset; - - ret = drmCommandWrite( rmesa->radeon.radeonScreen->driScreen->fd, - DRM_RADEON_FREE, - &memfree, sizeof(memfree)); - - if (ret) - fprintf(stderr, "%s: DRM_RADEON_FREE ret %d\n", __FUNCTION__, ret); -} - -/* Called via glXGetMemoryOffsetMESA() */ -GLuint r200GetMemoryOffsetMESA(__DRIscreen *screen, const GLvoid *pointer) -{ - GET_CURRENT_CONTEXT(ctx); - r200ContextPtr rmesa; - GLuint card_offset; - - if (!ctx || !(rmesa = R200_CONTEXT(ctx)) ) { - fprintf(stderr, "%s: no context\n", __FUNCTION__); - return ~0; - } - - if (!r200IsGartMemory( rmesa, pointer, 0 )) - return ~0; - - card_offset = r200GartOffsetFromVirtual( rmesa, pointer ); - - return card_offset - rmesa->radeon.radeonScreen->gart_base; -} - GLboolean r200IsGartMemory( r200ContextPtr rmesa, const GLvoid *pointer, GLint size ) { diff --git a/src/mesa/drivers/dri/r200/r200_ioctl.h b/src/mesa/drivers/dri/r200/r200_ioctl.h index 8d51aefa042..c5dca89bc76 100644 --- a/src/mesa/drivers/dri/r200/r200_ioctl.h +++ b/src/mesa/drivers/dri/r200/r200_ioctl.h @@ -64,11 +64,6 @@ extern void r200EmitAOS(r200ContextPtr rmesa, GLuint nr, GLuint offset); extern void r200InitIoctlFuncs( struct dd_function_table *functions ); -extern void *r200AllocateMemoryMESA( __DRIscreen *screen, GLsizei size, GLfloat readfreq, - GLfloat writefreq, GLfloat priority ); -extern void r200FreeMemoryMESA( __DRIscreen *screen, GLvoid *pointer ); -extern GLuint r200GetMemoryOffsetMESA( __DRIscreen *screen, const GLvoid *pointer ); - extern GLboolean r200IsGartMemory( r200ContextPtr rmesa, const GLvoid *pointer, GLint size ); diff --git a/src/mesa/drivers/dri/r200/r200_vertprog.c b/src/mesa/drivers/dri/r200/r200_vertprog.c index 12f869d96f8..5d268319f3f 100644 --- a/src/mesa/drivers/dri/r200/r200_vertprog.c +++ b/src/mesa/drivers/dri/r200/r200_vertprog.c @@ -33,11 +33,11 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. #include "main/glheader.h" #include "main/macros.h" #include "main/enums.h" -#include "shader/program.h" -#include "shader/prog_instruction.h" -#include "shader/prog_parameter.h" -#include "shader/prog_statevars.h" -#include "shader/programopt.h" +#include "program/program.h" +#include "program/prog_instruction.h" +#include "program/prog_parameter.h" +#include "program/prog_statevars.h" +#include "program/programopt.h" #include "tnl/tnl.h" #include "r200_context.h" diff --git a/src/mesa/drivers/dri/r300/compiler/Makefile b/src/mesa/drivers/dri/r300/compiler/Makefile index ff3801dc676..3167d49bcae 100644 --- a/src/mesa/drivers/dri/r300/compiler/Makefile +++ b/src/mesa/drivers/dri/r300/compiler/Makefile @@ -23,6 +23,7 @@ C_SOURCES = \ radeon_dataflow_deadcode.c \ radeon_dataflow_swizzles.c \ radeon_optimize.c \ + radeon_rename_regs.c \ r3xx_fragprog.c \ r300_fragprog.c \ r300_fragprog_swizzle.c \ diff --git a/src/mesa/drivers/dri/r300/compiler/SConscript b/src/mesa/drivers/dri/r300/compiler/SConscript index 50d9cdb7f2d..c6f47a6f8a4 100755 --- a/src/mesa/drivers/dri/r300/compiler/SConscript +++ b/src/mesa/drivers/dri/r300/compiler/SConscript @@ -22,6 +22,7 @@ r300compiler = env.ConvenienceLibrary( 'radeon_pair_schedule.c', 'radeon_pair_regalloc.c', 'radeon_optimize.c', + 'radeon_rename_regs.c', 'radeon_emulate_branches.c', 'radeon_emulate_loops.c', 'radeon_dataflow.c', diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c index 38312658d65..a326ee4c4fa 100644 --- a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c +++ b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c @@ -29,6 +29,7 @@ #include "radeon_emulate_loops.h" #include "radeon_program_alu.h" #include "radeon_program_tex.h" +#include "radeon_rename_regs.h" #include "r300_fragprog.h" #include "r300_fragprog_swizzle.h" #include "r500_fragprog.h" @@ -97,25 +98,27 @@ static void debug_program_log(struct r300_fragment_program_compiler* c, const ch void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c) { + struct emulate_loop_state loop_state; + rewrite_depth_out(c); + /* This transformation needs to be done before any of the IF + * instructions are modified. */ + radeonTransformKILP(&c->Base); + debug_program_log(c, "before compilation"); - /* XXX Ideally this should be done only for r3xx, but since - * we don't have branching support for r5xx, we use the emulation - * on all chipsets. */ - - if(c->Base.is_r500){ - rc_emulate_loops(&c->Base, R500_PFS_MAX_INST); + if (c->Base.is_r500){ + r500_transform_unroll_loops(&c->Base, &loop_state); + debug_program_log(c, "after r500 transform loops"); } else{ - rc_emulate_loops(&c->Base, R300_PFS_MAX_ALU_INST); + rc_transform_unroll_loops(&c->Base, &loop_state); + debug_program_log(c, "after transform loops"); + + rc_emulate_branches(&c->Base); + debug_program_log(c, "after emulate branches"); } - debug_program_log(c, "after emulate loops"); - - rc_emulate_branches(&c->Base); - - debug_program_log(c, "after emulate branches"); if (c->Base.is_r500) { struct radeon_program_transformation transformations[] = { @@ -162,6 +165,11 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c) debug_program_log(c, "after deadcode"); + if(!c->Base.is_r500){ + rc_emulate_loops(&loop_state, R300_PFS_MAX_ALU_INST); + debug_program_log(c, "after emulate loops"); + } + rc_optimize(&c->Base); debug_program_log(c, "after dataflow optimize"); @@ -172,6 +180,16 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c) debug_program_log(c, "after dataflow passes"); + if(!c->Base.is_r500) { + /* This pass makes it easier for the scheduler to group TEX + * instructions and reduces the chances of creating too + * many texture indirections.*/ + rc_rename_regs(&c->Base); + if (c->Base.Error) + return; + debug_program_log(c, "after register rename"); + } + rc_pair_translate(c); if (c->Base.Error) return; diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c index 507b2e532fe..d347b4df9cd 100644 --- a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c +++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c @@ -30,6 +30,7 @@ #include "radeon_program_alu.h" #include "radeon_swizzle.h" #include "radeon_emulate_branches.h" +#include "radeon_emulate_loops.h" /* * Take an already-setup and valid source then swizzle it appropriately to @@ -145,7 +146,8 @@ static unsigned long t_src(struct r300_vertex_program_code *vp, t_swizzle(GET_SWZ(src->Swizzle, 2)), t_swizzle(GET_SWZ(src->Swizzle, 3)), t_src_class(src->File), - src->Negate) | (src->RelAddr << 4); + src->Negate) | + (src->RelAddr << 4) | (src->Abs << 3); } static unsigned long t_src_scalar(struct r300_vertex_program_code *vp, @@ -161,7 +163,7 @@ static unsigned long t_src_scalar(struct r300_vertex_program_code *vp, t_swizzle(GET_SWZ(src->Swizzle, 0)), t_src_class(src->File), src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) | - (src->RelAddr << 4); + (src->RelAddr << 4) | (src->Abs << 3); } static int valid_dst(struct r300_vertex_program_code *vp, @@ -348,7 +350,8 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi if (!valid_dst(compiler->code, &vpi->DstReg)) continue; - if (compiler->code->length >= VSF_MAX_FRAGMENT_LENGTH) { + if (compiler->code->length >= R500_VS_MAX_ALU_DWORDS || + (compiler->code->length >= R300_VS_MAX_ALU_DWORDS && !compiler->Base.is_r500)) { rc_error(&compiler->Base, "Vertex program has too many instructions\n"); return; } @@ -404,7 +407,7 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c { struct rc_instruction *inst; unsigned int num_orig_temps = 0; - char hwtemps[VSF_MAX_FRAGMENT_TEMPS]; + char hwtemps[R300_VS_MAX_TEMPS]; struct temporary_allocation * ta; unsigned int i, j; @@ -463,11 +466,11 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c unsigned int orig = inst->U.I.DstReg.Index; if (!ta[orig].Allocated) { - for(j = 0; j < VSF_MAX_FRAGMENT_TEMPS; ++j) { + for(j = 0; j < R300_VS_MAX_TEMPS; ++j) { if (!hwtemps[j]) break; } - if (j >= VSF_MAX_FRAGMENT_TEMPS) { + if (j >= R300_VS_MAX_TEMPS) { fprintf(stderr, "Out of hw temporaries\n"); } else { ta[orig].Allocated = 1; @@ -485,6 +488,44 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c } } +/** + * R3xx-R4xx vertex engine does not support the Absolute source operand modifier + * and the Saturate opcode modifier. Only Absolute is currently transformed. + */ +static int transform_nonnative_modifiers( + struct radeon_compiler *c, + struct rc_instruction *inst, + void* unused) +{ + const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode); + unsigned i; + + /* Transform ABS(a) to MAX(a, -a). */ + for (i = 0; i < opcode->NumSrcRegs; i++) { + if (inst->U.I.SrcReg[i].Abs) { + struct rc_instruction *new_inst; + unsigned temp; + + inst->U.I.SrcReg[i].Abs = 0; + + temp = rc_find_free_temporary(c); + + new_inst = rc_insert_new_instruction(c, inst->Prev); + new_inst->U.I.Opcode = RC_OPCODE_MAX; + new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY; + new_inst->U.I.DstReg.Index = temp; + new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i]; + new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i]; + new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW; + + memset(&inst->U.I.SrcReg[i], 0, sizeof(inst->U.I.SrcReg[i])); + inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY; + inst->U.I.SrcReg[i].Index = temp; + inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW; + } + } + return 1; +} /** * Vertex engine cannot read two inputs or two constants at the same time. @@ -591,6 +632,8 @@ static struct rc_swizzle_caps r300_vertprog_swizzle_caps = { void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler) { + struct emulate_loop_state loop_state; + compiler->Base.SwizzleCaps = &r300_vertprog_swizzle_caps; addArtificialOutputs(compiler); @@ -600,19 +643,48 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler) /* XXX Ideally this should be done only for r3xx, but since * we don't have branching support for r5xx, we use the emulation * on all chipsets. */ + rc_transform_unroll_loops(&compiler->Base, &loop_state); + + debug_program_log(compiler, "after transform loops"); + + if (compiler->Base.is_r500){ + rc_emulate_loops(&loop_state, R500_VS_MAX_ALU); + } else { + rc_emulate_loops(&loop_state, R300_VS_MAX_ALU); + } + debug_program_log(compiler, "after emulate loops"); + rc_emulate_branches(&compiler->Base); debug_program_log(compiler, "after emulate branches"); - { + if (compiler->Base.is_r500) { struct radeon_program_transformation transformations[] = { { &r300_transform_vertex_alu, 0 }, { &r300_transform_trig_scale_vertex, 0 } }; radeonLocalTransform(&compiler->Base, 2, transformations); - } - debug_program_log(compiler, "after native rewrite"); + debug_program_log(compiler, "after native rewrite"); + } else { + struct radeon_program_transformation transformations[] = { + { &r300_transform_vertex_alu, 0 }, + { &radeonTransformTrigSimple, 0 } + }; + radeonLocalTransform(&compiler->Base, 2, transformations); + + debug_program_log(compiler, "after native rewrite"); + + /* Note: This pass has to be done seperately from ALU rewrite, + * because it needs to check every instruction. + */ + struct radeon_program_transformation transformations2[] = { + { &transform_nonnative_modifiers, 0 }, + }; + radeonLocalTransform(&compiler->Base, 1, transformations2); + + debug_program_log(compiler, "after emulate modifiers"); + } { /* Note: This pass has to be done seperately from ALU rewrite, diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c index 632f0bcf4f8..e6b5522c5b9 100644 --- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c +++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c @@ -30,6 +30,7 @@ #include <stdio.h> #include "../r300_reg.h" +#include "radeon_emulate_loops.h" /** * Rewrite IF instructions to use the ALU result special register. @@ -59,6 +60,31 @@ int r500_transform_IF( return 1; } +/** + * Rewrite loops to make them easier to emit. This is not a local + * transformation, because it modifies and reorders an entire block of code. + */ +void r500_transform_unroll_loops(struct radeon_compiler * c, + struct emulate_loop_state *s) +{ + int i; + + rc_transform_unroll_loops(c, s); + + for( i = s->LoopCount - 1; i >= 0; i-- ){ + struct rc_instruction * inst_continue; + if(!s->Loops[i].EndLoop){ + continue; + } + /* Insert a continue instruction at the end of the loop. This + * is required in order to emit loops correctly. */ + inst_continue = rc_insert_new_instruction(c, + s->Loops[i].EndIf->Prev); + inst_continue->U.I.Opcode = RC_OPCODE_CONTINUE; + } + +} + static int r500_swizzle_is_native(rc_opcode opcode, struct rc_src_register reg) { unsigned int relevant; @@ -252,7 +278,7 @@ void r500FragmentProgramDump(struct rX00_fragment_program_code *c) struct r500_fragment_program_code *code = &c->code.r500; fprintf(stderr, "R500 Fragment Program:\n--------\n"); - int n; + int n, i; uint32_t inst; uint32_t inst0; char *str = NULL; @@ -275,8 +301,8 @@ void r500FragmentProgramDump(struct rX00_fragment_program_code *c) to_mask((inst >> 15) & 0xf)); switch(inst0 & 0x3) { - case 0: - case 1: + case R500_INST_TYPE_ALU: + case R500_INST_TYPE_OUT: fprintf(stderr,"\t1:RGB_ADDR 0x%08x:", code->inst[n].inst1); inst = code->inst[n].inst1; @@ -319,9 +345,87 @@ void r500FragmentProgramDump(struct rX00_fragment_program_code *c) (inst >> 23) & 0x3, (inst >> 25) & 0x3, toswiz((inst >> 27) & 0x7), (inst >> 30) & 0x3); break; - case 2: + case R500_INST_TYPE_FC: + fprintf(stderr, "\t2:FC_INST 0x%08x:", code->inst[n].inst2); + inst = code->inst[n].inst2; + /* JUMP_FUNC JUMP_ANY*/ + fprintf(stderr, "0x%02x %1x ", inst >> 8 & 0xff, + (inst & R500_FC_JUMP_ANY) >> 5); + + /* OP */ + switch(inst & 0x7){ + case R500_FC_OP_JUMP: + fprintf(stderr, "JUMP"); + break; + case R500_FC_OP_LOOP: + fprintf(stderr, "LOOP"); + break; + case R500_FC_OP_ENDLOOP: + fprintf(stderr, "ENDLOOP"); + break; + case R500_FC_OP_REP: + fprintf(stderr, "REP"); + break; + case R500_FC_OP_ENDREP: + fprintf(stderr, "ENDREP"); + break; + case R500_FC_OP_BREAKLOOP: + fprintf(stderr, "BREAKLOOP"); + break; + case R500_FC_OP_BREAKREP: + fprintf(stderr, "BREAKREP"); + break; + case R500_FC_OP_CONTINUE: + fprintf(stderr, "CONTINUE"); + break; + } + fprintf(stderr," "); + /* A_OP */ + switch(inst & (0x3 << 6)){ + case R500_FC_A_OP_NONE: + fprintf(stderr, "NONE"); + break; + case R500_FC_A_OP_POP: + fprintf(stderr, "POP"); + break; + case R500_FC_A_OP_PUSH: + fprintf(stderr, "PUSH"); + break; + } + /* B_OP0 B_OP1 */ + for(i=0; i<2; i++){ + fprintf(stderr, " "); + switch(inst & (0x3 << (24 + (i * 2)))){ + /* R500_FC_B_OP0_NONE + * R500_FC_B_OP1_NONE */ + case 0: + fprintf(stderr, "NONE"); + break; + case R500_FC_B_OP0_DECR: + case R500_FC_B_OP1_DECR: + fprintf(stderr, "DECR"); + break; + case R500_FC_B_OP0_INCR: + case R500_FC_B_OP1_INCR: + fprintf(stderr, "INCR"); + break; + } + } + /*POP_CNT B_ELSE */ + fprintf(stderr, " %d %1x", (inst >> 16) & 0x1f, (inst & R500_FC_B_ELSE) >> 4); + inst = code->inst[n].inst3; + /* JUMP_ADDR */ + fprintf(stderr, " %d", inst >> 16); + + if(code->inst[n].inst2 & R500_FC_IGNORE_UNCOVERED){ + fprintf(stderr, " IGN_UNC"); + } + inst = code->inst[n].inst3; + fprintf(stderr, "\n\t3:FC_ADDR 0x%08x:", inst); + fprintf(stderr, "BOOL: 0x%02x, INT: 0x%02x, JUMP_ADDR: %d, JMP_GLBL: %1x\n", + inst & 0x1f, (inst >> 8) & 0x1f, (inst >> 16) & 0x1ff, inst >> 31); break; - case 3: + case R500_INST_TYPE_TEX: inst = code->inst[n].inst1; fprintf(stderr,"\t1:TEX_INST: 0x%08x: id: %d op:%s, %s, %s %s\n", inst, (inst >> 16) & 0xf, to_texop((inst >> 22) & 0x7), (inst & (1<<25)) ? "ACQ" : "", diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h index 4efbae7ba67..0d005a794ff 100644 --- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h +++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h @@ -36,6 +36,8 @@ #include "radeon_compiler.h" #include "radeon_swizzle.h" +struct emulate_loop_state; + extern void r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler); extern void r500FragmentProgramDump(struct rX00_fragment_program_code *c); @@ -47,4 +49,6 @@ extern int r500_transform_IF( struct rc_instruction * inst, void* data); +void r500_transform_unroll_loops(struct radeon_compiler * c, + struct emulate_loop_state * s); #endif diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c index fb2d8b5a9c0..0bd8f0a239f 100644 --- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c +++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c @@ -45,6 +45,8 @@ #include "radeon_program_pair.h" +#define MAX_BRANCH_DEPTH_FULL 32 +#define MAX_BRANCH_DEPTH_PARTIAL 4 #define PROG_CODE \ struct r500_fragment_program_code *code = &c->code->code.r500 @@ -61,6 +63,10 @@ struct branch_info { int Endif; }; +struct loop_info { + int LoopStart; +}; + struct emit_state { struct radeon_compiler * C; struct r500_fragment_program_code * Code; @@ -69,7 +75,12 @@ struct emit_state { unsigned int CurrentBranchDepth; unsigned int BranchesReserved; + struct loop_info * Loops; + unsigned int CurrentLoopDepth; + unsigned int LoopsReserved; + unsigned int MaxBranchDepth; + }; static unsigned int translate_rgb_op(struct r300_fragment_program_compiler *c, rc_opcode opcode) @@ -359,16 +370,49 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst s->Code->inst[newip].inst0 = R500_INST_TYPE_FC | R500_INST_ALU_WAIT; - if (inst->U.I.Opcode == RC_OPCODE_IF) { - if (s->CurrentBranchDepth >= 32) { + switch(inst->U.I.Opcode){ + struct branch_info * branch; + struct loop_info * loop; + case RC_OPCODE_BGNLOOP: + memory_pool_array_reserve(&s->C->Pool, struct loop_info, + s->Loops, s->CurrentLoopDepth, s->LoopsReserved, 1); + + loop = &s->Loops[s->CurrentLoopDepth++]; + + /* We don't emit an instruction for BGNLOOP, so we need to + * decrement the instruction counter, but first we need to + * set LoopStart to the current value of inst_end, which + * will end up being the first real instruction in the loop.*/ + loop->LoopStart = s->Code->inst_end--; + break; + + case RC_OPCODE_BRK: + /* Don't emit an instruction for BRK */ + s->Code->inst_end--; + break; + + case RC_OPCODE_CONTINUE: + loop = &s->Loops[s->CurrentLoopDepth - 1]; + s->Code->inst[newip].inst2 = R500_FC_OP_JUMP | + R500_FC_JUMP_FUNC(0xff); + s->Code->inst[newip].inst3 = R500_FC_JUMP_ADDR(loop->LoopStart); + break; + + case RC_OPCODE_ENDLOOP: + /* Don't emit an instruction for ENDLOOP */ + s->Code->inst_end--; + s->CurrentLoopDepth--; + break; + + case RC_OPCODE_IF: + if ( s->CurrentBranchDepth >= MAX_BRANCH_DEPTH_FULL) { rc_error(s->C, "Branch depth exceeds hardware limit"); return; } - memory_pool_array_reserve(&s->C->Pool, struct branch_info, s->Branches, s->CurrentBranchDepth, s->BranchesReserved, 1); - struct branch_info * branch = &s->Branches[s->CurrentBranchDepth++]; + branch = &s->Branches[s->CurrentBranchDepth++]; branch->If = newip; branch->Else = -1; branch->Endif = -1; @@ -377,29 +421,50 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst s->MaxBranchDepth = s->CurrentBranchDepth; /* actual instruction is filled in at ENDIF time */ - } else if (inst->U.I.Opcode == RC_OPCODE_ELSE) { + break; + + case RC_OPCODE_ELSE: if (!s->CurrentBranchDepth) { rc_error(s->C, "%s: got ELSE outside a branch", __FUNCTION__); return; } - struct branch_info * branch = &s->Branches[s->CurrentBranchDepth - 1]; + branch = &s->Branches[s->CurrentBranchDepth - 1]; branch->Else = newip; /* actual instruction is filled in at ENDIF time */ - } else if (inst->U.I.Opcode == RC_OPCODE_ENDIF) { + break; + + case RC_OPCODE_ENDIF: if (!s->CurrentBranchDepth) { rc_error(s->C, "%s: got ELSE outside a branch", __FUNCTION__); return; } - struct branch_info * branch = &s->Branches[s->CurrentBranchDepth - 1]; - branch->Endif = newip; - + branch = &s->Branches[s->CurrentBranchDepth - 1]; + + if(inst->Prev->U.I.Opcode == RC_OPCODE_BRK){ + branch->Endif = --s->Code->inst_end; + s->Code->inst[branch->Endif].inst2 |= + R500_FC_B_OP0_DECR; + } + else{ + branch->Endif = newip; + + s->Code->inst[branch->Endif].inst2 = R500_FC_OP_JUMP + | R500_FC_A_OP_NONE /* no address stack */ + | R500_FC_JUMP_ANY /* docs says set this, but I don't understand why */ + | R500_FC_B_OP0_DECR /* decrement branch counter if stay */ + | R500_FC_B_OP1_NONE /* no branch counter if stay */ + | R500_FC_B_POP_CNT(1) + ; + s->Code->inst[branch->Endif].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1); + } s->Code->inst[branch->If].inst2 = R500_FC_OP_JUMP | R500_FC_A_OP_NONE /* no address stack */ | R500_FC_JUMP_FUNC(0x0f) /* jump if ALU result is false */ | R500_FC_B_OP0_INCR /* increment branch counter if stay */ + | R500_FC_IGNORE_UNCOVERED ; if (branch->Else >= 0) { @@ -421,17 +486,10 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst s->Code->inst[branch->If].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1); } - s->Code->inst[branch->Endif].inst2 = R500_FC_OP_JUMP - | R500_FC_A_OP_NONE /* no address stack */ - | R500_FC_JUMP_ANY /* docs says set this, but I don't understand why */ - | R500_FC_B_OP0_DECR /* decrement branch counter if stay */ - | R500_FC_B_OP1_NONE /* no branch counter if stay */ - | R500_FC_B_POP_CNT(1) - ; - s->Code->inst[branch->Endif].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1); s->CurrentBranchDepth--; - } else { + break; + default: rc_error(s->C, "%s: unknown opcode %s\n", __FUNCTION__, rc_get_opcode_info(inst->U.I.Opcode)->Name); } } @@ -486,6 +544,10 @@ void r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compi code->inst[ip].inst0 = R500_INST_TYPE_OUT | R500_INST_TEX_SEM_WAIT; } + /* Use FULL flow control mode if branches are nested deep enough. + * We don not need to enable FULL flow control mode for loops, becasue + * we aren't using the hardware loop instructions. + */ if (s.MaxBranchDepth >= 4) { if (code->max_temp_idx < 1) code->max_temp_idx = 1; diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_code.h b/src/mesa/drivers/dri/r300/compiler/radeon_code.h index 1979e7e4e49..d03689763bc 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_code.h +++ b/src/mesa/drivers/dri/r300/compiler/radeon_code.h @@ -235,8 +235,11 @@ struct rX00_fragment_program_code { }; -#define VSF_MAX_FRAGMENT_LENGTH (255*4) -#define VSF_MAX_FRAGMENT_TEMPS (14) +#define R300_VS_MAX_ALU 256 +#define R300_VS_MAX_ALU_DWORDS (R300_VS_MAX_ALU * 4) +#define R500_VS_MAX_ALU 1024 +#define R500_VS_MAX_ALU_DWORDS (R500_VS_MAX_ALU * 4) +#define R300_VS_MAX_TEMPS 32 #define VSF_MAX_INPUTS 32 #define VSF_MAX_OUTPUTS 32 @@ -244,8 +247,8 @@ struct rX00_fragment_program_code { struct r300_vertex_program_code { int length; union { - uint32_t d[VSF_MAX_FRAGMENT_LENGTH]; - float f[VSF_MAX_FRAGMENT_LENGTH]; + uint32_t d[R500_VS_MAX_ALU_DWORDS]; + float f[R500_VS_MAX_ALU_DWORDS]; } body; int pos_end; diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c index e3c2c83c0cf..fbb4235c223 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c +++ b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c @@ -202,32 +202,65 @@ void rc_dataflow_deadcode(struct radeon_compiler * c, rc_dataflow_mark_outputs_f inst = inst->Prev) { const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); - if (opcode->IsFlowControl) { - if (opcode->Opcode == RC_OPCODE_ENDIF) { - push_branch(&s); - } else { - if (s.BranchStackSize) { - struct branchinfo * branch = &s.BranchStack[s.BranchStackSize-1]; - - if (opcode->Opcode == RC_OPCODE_IF) { - or_updatemasks(&s.R, - &s.R, - branch->HaveElse ? &branch->StoreElse : &branch->StoreEndif); - - s.BranchStackSize--; - } else if (opcode->Opcode == RC_OPCODE_ELSE) { - if (branch->HaveElse) { - rc_error(c, "%s: Multiple ELSE for one IF/ENDIF\n", __FUNCTION__); - } else { - memcpy(&branch->StoreElse, &s.R, sizeof(s.R)); - memcpy(&s.R, &branch->StoreEndif, sizeof(s.R)); - branch->HaveElse = 1; - } + switch(opcode->Opcode){ + /* Mark all sources in the loop body as used before doing + * normal deadcode analysis. This is probably not optimal. + */ + case RC_OPCODE_ENDLOOP: + { + int endloops = 1; + struct rc_instruction *ptr; + for(ptr = inst->Prev; endloops > 0; ptr = ptr->Prev){ + opcode = rc_get_opcode_info(ptr->U.I.Opcode); + if(ptr->U.I.Opcode == RC_OPCODE_BGNLOOP){ + endloops--; + continue; + } + if(ptr->U.I.Opcode == RC_OPCODE_ENDLOOP){ + endloops++; + continue; + } + if(opcode->HasDstReg){ + int src = 0; + unsigned int srcmasks[3]; + rc_compute_sources_for_writemask(ptr, + ptr->U.I.DstReg.WriteMask, srcmasks); + for(src=0; src < opcode->NumSrcRegs; src++){ + mark_used(&s, + ptr->U.I.SrcReg[src].File, + ptr->U.I.SrcReg[src].Index, + srcmasks[src]); + } + } + } + break; + } + case RC_OPCODE_CONTINUE: + case RC_OPCODE_BRK: + case RC_OPCODE_BGNLOOP: + break; + case RC_OPCODE_ENDIF: + push_branch(&s); + break; + default: + if (opcode->IsFlowControl && s.BranchStackSize) { + struct branchinfo * branch = &s.BranchStack[s.BranchStackSize-1]; + if (opcode->Opcode == RC_OPCODE_IF) { + or_updatemasks(&s.R, + &s.R, + branch->HaveElse ? &branch->StoreElse : &branch->StoreEndif); + + s.BranchStackSize--; + } else if (opcode->Opcode == RC_OPCODE_ELSE) { + if (branch->HaveElse) { + rc_error(c, "%s: Multiple ELSE for one IF/ENDIF\n", __FUNCTION__); } else { - rc_error(c, "%s: Unhandled control flow instruction %s\n", __FUNCTION__, opcode->Name); + memcpy(&branch->StoreElse, &s.R, sizeof(s.R)); + memcpy(&s.R, &branch->StoreEndif, sizeof(s.R)); + branch->HaveElse = 1; } } else { - rc_error(c, "%s: Unexpected control flow instruction\n", __FUNCTION__); + rc_error(c, "%s: Unhandled control flow instruction %s\n", __FUNCTION__, opcode->Name); } } } diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c index 4c5d29f4217..131e9e7436d 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c +++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c @@ -38,22 +38,6 @@ #define DBG(...) do { if (VERBOSE) fprintf(stderr, __VA_ARGS__); } while(0) -struct emulate_loop_state { - struct radeon_compiler * C; - struct loop_info * Loops; - unsigned int LoopCount; - unsigned int LoopReserved; -}; - -struct loop_info { - struct rc_instruction * BeginLoop; - struct rc_instruction * Cond; - struct rc_instruction * If; - struct rc_instruction * Brk; - struct rc_instruction * EndIf; - struct rc_instruction * EndLoop; -}; - struct const_value { struct radeon_compiler * C; @@ -94,22 +78,13 @@ static int src_reg_is_immediate(struct rc_src_register * src, c->Program.Constants.Constants[src->Index].Type==RC_CONSTANT_IMMEDIATE; } -static unsigned int loop_count_instructions(struct loop_info * loop) +static unsigned int loop_calc_iterations(struct emulate_loop_state *s, + struct loop_info * loop, unsigned int max_instructions) { - unsigned int count = 0; - struct rc_instruction * inst = loop->BeginLoop->Next; - while(inst != loop->EndLoop){ - count++; - inst = inst->Next; - } - return count; -} - -static unsigned int loop_calc_iterations(struct loop_info * loop, - unsigned int loop_count, unsigned int max_instructions) -{ - unsigned int icount = loop_count_instructions(loop); - return max_instructions / (loop_count * icount); + unsigned int total_i = rc_recompute_ips(s->C); + unsigned int loop_i = (loop->EndLoop->IP - loop->BeginLoop->IP) - 1; + /* +1 because the program already has one iteration of the loop. */ + return 1 + ((max_instructions - total_i) / (s->LoopCount * loop_i)); } static void loop_unroll(struct emulate_loop_state * s, @@ -214,8 +189,7 @@ static void get_incr_amount(void * data, struct rc_instruction * inst, } static int transform_const_loop(struct emulate_loop_state * s, - struct loop_info * loop, - struct rc_instruction * cond) + struct loop_info * loop) { int end_loops = 1; int iterations; @@ -228,13 +202,13 @@ static int transform_const_loop(struct emulate_loop_state * s, /* Find the counter and the upper limit */ - if(src_reg_is_immediate(&cond->U.I.SrcReg[0], s->C)){ - limit = &cond->U.I.SrcReg[0]; - counter = &cond->U.I.SrcReg[1]; + if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[0], s->C)){ + limit = &loop->Cond->U.I.SrcReg[0]; + counter = &loop->Cond->U.I.SrcReg[1]; } - else if(src_reg_is_immediate(&cond->U.I.SrcReg[1], s->C)){ - limit = &cond->U.I.SrcReg[1]; - counter = &cond->U.I.SrcReg[0]; + else if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[1], s->C)){ + limit = &loop->Cond->U.I.SrcReg[1]; + counter = &loop->Cond->U.I.SrcReg[0]; } else{ DBG("No constant limit.\n"); @@ -293,8 +267,22 @@ static int transform_const_loop(struct emulate_loop_state * s, * simple, since we only support increment and decrement loops. */ limit_value = get_constant_value(s->C, limit, 0); - iterations = (int) ((limit_value - counter_value.Value) / + DBG("Limit is %f.\n", limit_value); + switch(loop->Cond->U.I.Opcode){ + case RC_OPCODE_SGT: + case RC_OPCODE_SLT: + iterations = (int) ceilf((limit_value - counter_value.Value) / count_inst.Amount); + break; + + case RC_OPCODE_SLE: + case RC_OPCODE_SGE: + iterations = (int) floorf((limit_value - counter_value.Value) / + count_inst.Amount) + 1; + break; + default: + return 0; + } DBG("Loop will have %d iterations.\n", iterations); @@ -414,7 +402,7 @@ static struct rc_instruction * transform_loop(struct emulate_loop_state * s, } /* Check if the number of loops is known at compile time. */ - if(transform_const_loop(s, loop, ptr)){ + if(transform_const_loop(s, loop)){ return loop->BeginLoop->Next; } @@ -425,9 +413,14 @@ static struct rc_instruction * transform_loop(struct emulate_loop_state * s, return loop->EndLoop; } -static void rc_transform_loops(struct emulate_loop_state * s) +void rc_transform_unroll_loops(struct radeon_compiler *c, + struct emulate_loop_state * s) { - struct rc_instruction * ptr = s->C->Program.Instructions.Next; + struct rc_instruction * ptr; + + memset(s, 0, sizeof(struct emulate_loop_state)); + s->C = c; + ptr = s->C->Program.Instructions.Next; while(ptr != &s->C->Program.Instructions) { if(ptr->Type == RC_INSTRUCTION_NORMAL && ptr->U.I.Opcode == RC_OPCODE_BGNLOOP){ @@ -440,7 +433,7 @@ static void rc_transform_loops(struct emulate_loop_state * s) } } -static void rc_unroll_loops(struct emulate_loop_state *s, +void rc_emulate_loops(struct emulate_loop_state *s, unsigned int max_instructions) { int i; @@ -451,24 +444,8 @@ static void rc_unroll_loops(struct emulate_loop_state *s, if(!s->Loops[i].EndLoop){ continue; } - unsigned int iterations = loop_calc_iterations(&s->Loops[i], - s->LoopCount, max_instructions); + unsigned int iterations = loop_calc_iterations(s, &s->Loops[i], + max_instructions); loop_unroll(s, &s->Loops[i], iterations); } } - -void rc_emulate_loops(struct radeon_compiler *c, unsigned int max_instructions) -{ - struct emulate_loop_state s; - - memset(&s, 0, sizeof(struct emulate_loop_state)); - s.C = c; - - /* We may need to move these two operations to r3xx_(vert|frag)prog.c - * and run the optimization passes between them in order to increase - * the number of unrolls we can do for each loop. - */ - rc_transform_loops(&s); - - rc_unroll_loops(&s, max_instructions); -} diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h index ddcf1c0fabe..7748813c4eb 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h +++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h @@ -7,6 +7,26 @@ struct radeon_compiler; -void rc_emulate_loops(struct radeon_compiler *c, unsigned int max_instructions); +struct loop_info { + struct rc_instruction * BeginLoop; + struct rc_instruction * Cond; + struct rc_instruction * If; + struct rc_instruction * Brk; + struct rc_instruction * EndIf; + struct rc_instruction * EndLoop; +}; + +struct emulate_loop_state { + struct radeon_compiler * C; + struct loop_info * Loops; + unsigned int LoopCount; + unsigned int LoopReserved; +}; + +void rc_transform_unroll_loops(struct radeon_compiler *c, + struct emulate_loop_state * s); + +void rc_emulate_loops(struct emulate_loop_state *s, + unsigned int max_instructions); #endif /* RADEON_EMULATE_LOOPS_H */ diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c index 1dc16855dc1..04f234f11d8 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c +++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c @@ -386,6 +386,12 @@ struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = { .NumSrcRegs = 0, }, { + .Opcode = RC_OPCODE_CONTINUE, + .Name = "CONTINUE", + .IsFlowControl = 1, + .NumSrcRegs = 0 + }, + { .Opcode = RC_OPCODE_REPL_ALPHA, .Name = "REPL_ALPHA", .HasDstReg = 1 @@ -393,6 +399,10 @@ struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = { { .Opcode = RC_OPCODE_BEGIN_TEX, .Name = "BEGIN_TEX" + }, + { + .Opcode = RC_OPCODE_KILP, + .Name = "KILP", } }; diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h index 91c82ac0890..8b9fa07dde2 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h +++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h @@ -187,6 +187,8 @@ typedef enum { RC_OPCODE_ENDLOOP, + RC_OPCODE_CONTINUE, + /** special instruction, used in R300-R500 fragment program pair instructions * indicates that the result of the alpha operation shall be replicated * across all other channels */ @@ -197,6 +199,9 @@ typedef enum { * can run simultaneously. */ RC_OPCODE_BEGIN_TEX, + /** Stop execution of the shader (GLSL discard) */ + RC_OPCODE_KILP, + MAX_RC_OPCODE } rc_opcode; diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c index 21d72108886..eca06515367 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c +++ b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c @@ -75,6 +75,15 @@ struct peephole_state { int BranchDepth; }; +/** + * This is a callback function that is meant to be passed to + * rc_for_all_reads_mask. This function will be called once for each source + * register in inst. + * @param inst The instruction that the source register belongs to. + * @param file The register file of the source register. + * @param index The index of the source register. + * @param mask The components of the source register that are being read from. + */ static void peephole_scan_read(void * data, struct rc_instruction * inst, rc_register_file file, unsigned int index, unsigned int mask) { @@ -153,6 +162,11 @@ static void peephole(struct radeon_compiler * c, struct rc_instruction * inst_mo for(struct rc_instruction * inst = inst_mov->Next; inst != &c->Program.Instructions; inst = inst->Next) { + /* XXX In the future we might be able to make the optimizer + * smart enough to handle loops. */ + if(inst->U.I.Opcode == RC_OPCODE_BGNLOOP){ + return; + } rc_for_all_reads_mask(inst, peephole_scan_read, &s); rc_for_all_writes_mask(inst, peephole_scan_write, &s); if (s.Conflict) @@ -161,7 +175,8 @@ static void peephole(struct radeon_compiler * c, struct rc_instruction * inst_mo if (s.BranchDepth >= 0) { if (inst->U.I.Opcode == RC_OPCODE_IF) { s.BranchDepth++; - } else if (inst->U.I.Opcode == RC_OPCODE_ENDIF) { + } else if (inst->U.I.Opcode == RC_OPCODE_ENDIF + || inst->U.I.Opcode == RC_OPCODE_ELSE) { s.BranchDepth--; if (s.BranchDepth < 0) { s.DefinedMask &= ~s.MovMask; @@ -208,7 +223,8 @@ static void peephole(struct radeon_compiler * c, struct rc_instruction * inst_mo if (s.BranchDepth >= 0) { if (inst->U.I.Opcode == RC_OPCODE_IF) { s.BranchDepth++; - } else if (inst->U.I.Opcode == RC_OPCODE_ENDIF) { + } else if (inst->U.I.Opcode == RC_OPCODE_ENDIF + || inst->U.I.Opcode == RC_OPCODE_ELSE) { s.BranchDepth--; if (s.BranchDepth < 0) break; /* no more readers after this point */ diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c b/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c index a279549ff89..fc540496c41 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c +++ b/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c @@ -141,12 +141,28 @@ static void add_inst_to_list(struct schedule_instruction ** list, struct schedul *list = inst; } +static void add_inst_to_list_end(struct schedule_instruction ** list, + struct schedule_instruction * inst) +{ + if(!*list){ + *list = inst; + }else{ + struct schedule_instruction * temp = *list; + while(temp->NextReady){ + temp = temp->NextReady; + } + temp->NextReady = inst; + } +} + static void instruction_ready(struct schedule_state * s, struct schedule_instruction * sinst) { DBG("%i is now ready\n", sinst->Instruction->IP); + /* Adding Ready TEX instructions to the end of the "Ready List" helps + * us emit TEX instructions in blocks without losing our place. */ if (sinst->Instruction->Type == RC_INSTRUCTION_NORMAL) - add_inst_to_list(&s->ReadyTEX, sinst); + add_inst_to_list_end(&s->ReadyTEX, sinst); else if (sinst->Instruction->U.P.Alpha.Opcode == RC_OPCODE_NOP) add_inst_to_list(&s->ReadyRGB, sinst); else if (sinst->Instruction->U.P.RGB.Opcode == RC_OPCODE_NOP) @@ -163,11 +179,14 @@ static void decrease_dependencies(struct schedule_state * s, struct schedule_ins instruction_ready(s, sinst); } -static void commit_instruction(struct schedule_state * s, struct schedule_instruction * sinst) -{ - DBG("%i: commit\n", sinst->Instruction->IP); - - for(unsigned int i = 0; i < sinst->NumReadValues; ++i) { +/** + * This function decreases the dependencies of the next instruction that + * wants to write to each of sinst's read values. + */ +static void commit_update_reads(struct schedule_state * s, + struct schedule_instruction * sinst){ + unsigned int i; + for(i = 0; i < sinst->NumReadValues; ++i) { struct reg_value * v = sinst->ReadValues[i]; assert(v->NumReaders > 0); v->NumReaders--; @@ -176,8 +195,12 @@ static void commit_instruction(struct schedule_state * s, struct schedule_instru decrease_dependencies(s, v->Next->Writer); } } +} - for(unsigned int i = 0; i < sinst->NumWriteValues; ++i) { +static void commit_update_writes(struct schedule_state * s, + struct schedule_instruction * sinst){ + unsigned int i; + for(i = 0; i < sinst->NumWriteValues; ++i) { struct reg_value * v = sinst->WriteValues[i]; if (v->NumReaders) { for(struct reg_value_reader * r = v->Readers; r; r = r->Next) { @@ -196,6 +219,15 @@ static void commit_instruction(struct schedule_state * s, struct schedule_instru } } +static void commit_alu_instruction(struct schedule_state * s, struct schedule_instruction * sinst) +{ + DBG("%i: commit\n", sinst->Instruction->IP); + + commit_update_reads(s, sinst); + + commit_update_writes(s, sinst); +} + /** * Emit all ready texture instructions in a single block. * @@ -208,21 +240,37 @@ static void emit_all_tex(struct schedule_state * s, struct rc_instruction * befo assert(s->ReadyTEX); - /* Don't let the ready list change under us! */ - readytex = s->ReadyTEX; - s->ReadyTEX = 0; - /* Node marker for R300 */ struct rc_instruction * inst_begin = rc_insert_new_instruction(s->C, before->Prev); inst_begin->U.I.Opcode = RC_OPCODE_BEGIN_TEX; /* Link texture instructions back in */ + readytex = s->ReadyTEX; while(readytex) { - struct schedule_instruction * tex = readytex; + rc_insert_instruction(before->Prev, readytex->Instruction); + DBG("%i: commit TEX reads\n", readytex->Instruction->IP); + + /* All of the TEX instructions in the same TEX block have + * their source registers read from before any of the + * instructions in that block write to their destination + * registers. This means that when we commit a TEX + * instruction, any other TEX instruction that wants to write + * to one of the committed instruction's source register can be + * marked as ready and should be emitted in the same TEX + * block. This prevents the following sequence from being + * emitted in two different TEX blocks: + * 0: TEX temp[0].xyz, temp[1].xy__, 2D[0]; + * 1: TEX temp[1].xyz, temp[2].xy__, 2D[0]; + */ + commit_update_reads(s, readytex); + readytex = readytex->NextReady; + } + readytex = s->ReadyTEX; + s->ReadyTEX = 0; + while(readytex){ + DBG("%i: commit TEX writes\n", readytex->Instruction->IP); + commit_update_writes(s, readytex); readytex = readytex->NextReady; - - rc_insert_instruction(before->Prev, tex->Instruction); - commit_instruction(s, tex); } } @@ -328,7 +376,7 @@ static void emit_one_alu(struct schedule_state *s, struct rc_instruction * befor } rc_insert_instruction(before->Prev, sinst->Instruction); - commit_instruction(s, sinst); + commit_alu_instruction(s, sinst); } else { struct schedule_instruction **prgb; struct schedule_instruction **palpha; @@ -346,8 +394,8 @@ static void emit_one_alu(struct schedule_state *s, struct rc_instruction * befor *prgb = (*prgb)->NextReady; *palpha = (*palpha)->NextReady; rc_insert_instruction(before->Prev, psirgb->Instruction); - commit_instruction(s, psirgb); - commit_instruction(s, psialpha); + commit_alu_instruction(s, psirgb); + commit_alu_instruction(s, psialpha); goto success; } } @@ -357,7 +405,7 @@ static void emit_one_alu(struct schedule_state *s, struct rc_instruction * befor s->ReadyRGB = s->ReadyRGB->NextReady; rc_insert_instruction(before->Prev, sinst->Instruction); - commit_instruction(s, sinst); + commit_alu_instruction(s, sinst); success: ; } } diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c index c922d3d9a44..3cc28972934 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c +++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c @@ -973,3 +973,32 @@ int radeonTransformDeriv(struct radeon_compiler* c, return 1; } + +/** + * IF Temp[0].x -\ + * KILP - > KIL -abs(Temp[0].x) + * ENDIF -/ + * + * This needs to be done in its own pass, because it modifies the instructions + * before and after KILP. + */ +void radeonTransformKILP(struct radeon_compiler * c) +{ + struct rc_instruction * inst; + for (inst = c->Program.Instructions.Next; + inst != &c->Program.Instructions; inst = inst->Next) { + + if (inst->U.I.Opcode != RC_OPCODE_KILP + || inst->Prev->U.I.Opcode != RC_OPCODE_IF + || inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) { + continue; + } + inst->U.I.Opcode = RC_OPCODE_KIL; + inst->U.I.SrcReg[0] = negate(absolute(inst->Prev->U.I.SrcReg[0])); + + /* Remove IF */ + rc_remove_instruction(inst->Prev); + /* Remove ENDIF */ + rc_remove_instruction(inst->Next); + } +} diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.h b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.h index 77d444476f2..e6e2cc20c5a 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.h +++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.h @@ -60,4 +60,6 @@ int radeonTransformDeriv( struct rc_instruction * inst, void*); +void radeonTransformKILP(struct radeon_compiler * c); + #endif /* __RADEON_PROGRAM_ALU_H_ */ diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.c b/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.c new file mode 100644 index 00000000000..31c98668838 --- /dev/null +++ b/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.c @@ -0,0 +1,131 @@ +/* + * Copyright 2010 Tom Stellard <[email protected]> + * + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + +/** + * \file + */ + +#include "radeon_rename_regs.h" + +#include "radeon_compiler.h" +#include "radeon_dataflow.h" + +struct reg_rename { + int old_index; + int new_index; + int temp_index; +}; + +static void rename_reg(void * data, struct rc_instruction * inst, + rc_register_file * file, unsigned int * index) +{ + struct reg_rename *r = data; + + if(r->old_index == *index && *file == RC_FILE_TEMPORARY) { + *index = r->new_index; + } + else if(r->new_index == *index && *file == RC_FILE_TEMPORARY) { + *index = r->temp_index; + } +} + +static void rename_all( + struct radeon_compiler *c, + struct rc_instruction * start, + unsigned int old, + unsigned int new, + unsigned int temp) +{ + struct rc_instruction * inst; + struct reg_rename r; + r.old_index = old; + r.new_index = new; + r.temp_index = temp; + for(inst = start; inst != &c->Program.Instructions; + inst = inst->Next) { + rc_remap_registers(inst, rename_reg, &r); + } +} + +/** + * This function renames registers in an attempt to get the code close to + * SSA form. After this function has completed, most of the register are only + * written to one time, with a few exceptions. For example, this block of code + * will not be modified by this function: + * Mov Temp[0].x Const[0].x + * Mov Temp[0].y Const[0].y + * Basically, destination registers will be renamed if: + * 1. There have been no previous writes to that register + * or + * 2. If the instruction is writting to the exact components (no more, no less) + * of a register that has been written to by previous instructions. + * + * This function assumes all the instructions are still of type + * RC_INSTRUCTION_NORMAL. + */ +void rc_rename_regs(struct radeon_compiler * c) +{ + unsigned int cur_index = 0; + unsigned int icount; + struct rc_instruction * inst; + unsigned int * masks; + + /* The number of instructions in the program is also the maximum + * number of temp registers that could potentially be used. */ + icount = rc_recompute_ips(c); + masks = memory_pool_malloc(&c->Pool, icount * sizeof(unsigned int)); + memset(masks, 0, icount * sizeof(unsigned int)); + + for(inst = c->Program.Instructions.Next; + inst != &c->Program.Instructions; + inst = inst->Next) { + const struct rc_opcode_info * info; + if(inst->Type != RC_INSTRUCTION_NORMAL) { + rc_error(c, "%s only works with normal instructions.", + __FUNCTION__); + return; + } + unsigned int old_index, temp_index; + struct rc_dst_register * dst = &inst->U.I.DstReg; + info = rc_get_opcode_info(inst->U.I.Opcode); + if(!info->HasDstReg || dst->File != RC_FILE_TEMPORARY) { + continue; + } + if(dst->Index >= icount || !masks[dst->Index] || + masks[dst->Index] == dst->WriteMask) { + old_index = dst->Index; + /* We need to set dst->Index here so get free temporary + * will work. */ + dst->Index = cur_index++; + temp_index = rc_find_free_temporary(c); + rename_all(c, inst->Next, old_index, + dst->Index, temp_index); + } + assert(dst->Index < icount); + masks[dst->Index] |= dst->WriteMask; + } +} diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.h b/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.h new file mode 100644 index 00000000000..4323b995d84 --- /dev/null +++ b/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.h @@ -0,0 +1,9 @@ + +#ifndef RADEON_RENAME_REGS_H +#define RADEON_RENAME_REGS_H + +struct radeon_compiler; + +void rc_rename_regs(struct radeon_compiler * c); + +#endif /* RADEON_RENAME_REGS_H */ diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c index 6992ca59dbf..e4b302bbad9 100644 --- a/src/mesa/drivers/dri/r300/r300_context.c +++ b/src/mesa/drivers/dri/r300/r300_context.c @@ -376,13 +376,12 @@ static void r300InitConstValues(GLcontext *ctx, radeonScreenPtr screen) ctx->Const.MaxDrawBuffers = 1; ctx->Const.MaxColorAttachments = 1; - /* currently bogus data */ if (r300->options.hw_tcl_enabled) { - ctx->Const.VertexProgram.MaxNativeInstructions = VSF_MAX_FRAGMENT_LENGTH / 4; - ctx->Const.VertexProgram.MaxNativeAluInstructions = VSF_MAX_FRAGMENT_LENGTH / 4; - ctx->Const.VertexProgram.MaxNativeAttribs = 16; /* r420 */ + ctx->Const.VertexProgram.MaxNativeInstructions = 255; + ctx->Const.VertexProgram.MaxNativeAluInstructions = 255; + ctx->Const.VertexProgram.MaxNativeAttribs = 16; ctx->Const.VertexProgram.MaxNativeTemps = 32; - ctx->Const.VertexProgram.MaxNativeParameters = 256; /* r420 */ + ctx->Const.VertexProgram.MaxNativeParameters = 256; ctx->Const.VertexProgram.MaxNativeAddressRegs = 1; } diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h index fbb609b9f61..99540e3354f 100644 --- a/src/mesa/drivers/dri/r300/r300_context.h +++ b/src/mesa/drivers/dri/r300/r300_context.h @@ -43,7 +43,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "radeon_common.h" #include "main/mtypes.h" -#include "shader/prog_instruction.h" +#include "program/prog_instruction.h" #include "compiler/radeon_code.h" struct r300_context; diff --git a/src/mesa/drivers/dri/r300/r300_draw.c b/src/mesa/drivers/dri/r300/r300_draw.c index 282c0e18bca..5ae9f49840b 100644 --- a/src/mesa/drivers/dri/r300/r300_draw.c +++ b/src/mesa/drivers/dri/r300/r300_draw.c @@ -523,8 +523,7 @@ static void r300AllocDmaRegions(GLcontext *ctx, const struct gl_client_array *in r300ConvertAttrib(ctx, count, input[i], &vbuf->attribs[index]); } else { if (input[i]->BufferObj->Name) { - if (stride % 4 != 0) { - assert(((intptr_t) input[i]->Ptr) % input[i]->StrideB == 0); + if (stride % 4 != 0 || (intptr_t)input[i]->Ptr % 4 != 0) { r300AlignDataToDword(ctx, input[i], count, &vbuf->attribs[index]); vbuf->attribs[index].is_named_bo = GL_FALSE; } else { diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_common.c b/src/mesa/drivers/dri/r300/r300_fragprog_common.c index 7be2f74b5b2..95f4306f604 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog_common.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog_common.c @@ -38,8 +38,8 @@ #include "r300_fragprog_common.h" -#include "shader/prog_parameter.h" -#include "shader/prog_print.h" +#include "program/prog_parameter.h" +#include "program/prog_print.h" #include "compiler/radeon_compiler.h" diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h index ac93563ed9e..f25264b6f2d 100644 --- a/src/mesa/drivers/dri/r300/r300_reg.h +++ b/src/mesa/drivers/dri/r300/r300_reg.h @@ -3066,8 +3066,8 @@ enum { # define R500_FC_B_OP0_NONE (0 << 24) # define R500_FC_B_OP0_DECR (1 << 24) # define R500_FC_B_OP0_INCR (2 << 24) -# define R500_FC_B_OP1_DECR (0 << 26) -# define R500_FC_B_OP1_NONE (1 << 26) +# define R500_FC_B_OP1_NONE (0 << 26) +# define R500_FC_B_OP1_DECR (1 << 26) # define R500_FC_B_OP1_INCR (2 << 26) # define R500_FC_IGNORE_UNCOVERED (1 << 28) #define R500_US_FC_INT_CONST_0 0x4c00 diff --git a/src/mesa/drivers/dri/r300/r300_shader.c b/src/mesa/drivers/dri/r300/r300_shader.c index 9c24166ec5b..a9bddf05779 100644 --- a/src/mesa/drivers/dri/r300/r300_shader.c +++ b/src/mesa/drivers/dri/r300/r300_shader.c @@ -27,7 +27,7 @@ #include "main/glheader.h" -#include "shader/program.h" +#include "program/program.h" #include "tnl/tnl.h" #include "r300_context.h" #include "r300_fragprog_common.h" diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c index fa33be49989..0113eecaa3a 100644 --- a/src/mesa/drivers/dri/r300/r300_state.c +++ b/src/mesa/drivers/dri/r300/r300_state.c @@ -49,8 +49,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "drivers/common/meta.h" #include "swrast/swrast.h" #include "swrast_setup/swrast_setup.h" -#include "shader/prog_parameter.h" -#include "shader/prog_statevars.h" +#include "program/prog_parameter.h" +#include "program/prog_statevars.h" #include "vbo/vbo.h" #include "tnl/tnl.h" diff --git a/src/mesa/drivers/dri/r300/r300_vertprog.c b/src/mesa/drivers/dri/r300/r300_vertprog.c index a1fe3780294..67d8b2b3286 100644 --- a/src/mesa/drivers/dri/r300/r300_vertprog.c +++ b/src/mesa/drivers/dri/r300/r300_vertprog.c @@ -31,12 +31,12 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. #include "main/glheader.h" #include "main/macros.h" #include "main/enums.h" -#include "shader/program.h" -#include "shader/programopt.h" -#include "shader/prog_instruction.h" -#include "shader/prog_parameter.h" -#include "shader/prog_print.h" -#include "shader/prog_statevars.h" +#include "program/program.h" +#include "program/programopt.h" +#include "program/prog_instruction.h" +#include "program/prog_parameter.h" +#include "program/prog_print.h" +#include "program/prog_statevars.h" #include "tnl/tnl.h" #include "compiler/radeon_compiler.h" diff --git a/src/mesa/drivers/dri/r300/radeon_mesa_to_rc.c b/src/mesa/drivers/dri/r300/radeon_mesa_to_rc.c index 9f9dec840b4..471a3723cb9 100644 --- a/src/mesa/drivers/dri/r300/radeon_mesa_to_rc.c +++ b/src/mesa/drivers/dri/r300/radeon_mesa_to_rc.c @@ -28,8 +28,8 @@ #include "radeon_mesa_to_rc.h" #include "main/mtypes.h" -#include "shader/prog_instruction.h" -#include "shader/prog_parameter.h" +#include "program/prog_instruction.h" +#include "program/prog_parameter.h" #include "compiler/radeon_compiler.h" #include "compiler/radeon_program.h" diff --git a/src/mesa/drivers/dri/r600/r600_cmdbuf.c b/src/mesa/drivers/dri/r600/r600_cmdbuf.c index afe2d55dc7c..8013553f679 100644 --- a/src/mesa/drivers/dri/r600/r600_cmdbuf.c +++ b/src/mesa/drivers/dri/r600/r600_cmdbuf.c @@ -46,7 +46,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "r600_context.h" #include "radeon_reg.h" #include "r600_cmdbuf.h" -#include "r600_emit.h" #include "radeon_bocs_wrapper.h" #include "radeon_reg.h" diff --git a/src/mesa/drivers/dri/r600/r600_cmdbuf.h b/src/mesa/drivers/dri/r600/r600_cmdbuf.h index dff00096999..78fccd0b601 100644 --- a/src/mesa/drivers/dri/r600/r600_cmdbuf.h +++ b/src/mesa/drivers/dri/r600/r600_cmdbuf.h @@ -37,7 +37,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define __R600_CMDBUF_H__ #include "r600_context.h" -#include "r600_emit.h" #define RADEON_CP_PACKET3_NOP 0xC0001000 #define RADEON_CP_PACKET3_NEXT_CHAR 0xC0001900 diff --git a/src/mesa/drivers/dri/r600/r600_context.c b/src/mesa/drivers/dri/r600/r600_context.c index f4aed4e87fd..84d9d423124 100644 --- a/src/mesa/drivers/dri/r600/r600_context.c +++ b/src/mesa/drivers/dri/r600/r600_context.c @@ -59,7 +59,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "radeon_buffer_objects.h" #include "radeon_span.h" #include "r600_cmdbuf.h" -#include "r600_emit.h" #include "radeon_bocs_wrapper.h" #include "radeon_queryobj.h" #include "r600_blit.h" diff --git a/src/mesa/drivers/dri/r600/r700_assembler.c b/src/mesa/drivers/dri/r600/r700_assembler.c index de5c5d89fea..99a33df4fcb 100644 --- a/src/mesa/drivers/dri/r600/r700_assembler.c +++ b/src/mesa/drivers/dri/r600/r700_assembler.c @@ -32,7 +32,7 @@ #include "main/mtypes.h" #include "main/imports.h" -#include "shader/prog_parameter.h" +#include "program/prog_parameter.h" #include "radeon_debug.h" #include "r600_context.h" @@ -293,7 +293,9 @@ GLuint GetSurfaceFormat(GLenum eType, GLuint nChannels, GLuint * pClient_size) case 2: format = FMT_16_16; break; case 3: - format = FMT_16_16_16; break; + /* 3 comp GL_SHORT vertex format doesnt work on r700 + 4 somehow works, test - sauerbraten */ + format = FMT_16_16_16_16; break; case 4: format = FMT_16_16_16_16; break; default: @@ -1262,7 +1264,7 @@ GLboolean checkop3(r700_AssemblerBase* pAsm) { if( GL_FALSE == mov_temp(pAsm, 1) ) { - return 1; + return GL_FALSE; } } diff --git a/src/mesa/drivers/dri/r600/r700_assembler.h b/src/mesa/drivers/dri/r600/r700_assembler.h index 2d3c32487e6..dbc6cdb1903 100644 --- a/src/mesa/drivers/dri/r600/r700_assembler.h +++ b/src/mesa/drivers/dri/r600/r700_assembler.h @@ -28,7 +28,7 @@ #define _R700_ASSEMBLER_H_ #include "main/mtypes.h" -#include "shader/prog_instruction.h" +#include "program/prog_instruction.h" #include "r700_chip.h" #include "r700_shaderinst.h" diff --git a/src/mesa/drivers/dri/r600/r700_chip.h b/src/mesa/drivers/dri/r600/r700_chip.h index ae249e15fd4..0b6b72f8501 100644 --- a/src/mesa/drivers/dri/r600/r700_chip.h +++ b/src/mesa/drivers/dri/r600/r700_chip.h @@ -27,7 +27,9 @@ #ifndef _R700_CHIP_H_ #define _R700_CHIP_H_ -#include "r600_context.h" +#include <GL/gl.h> + +#include "radeon_common_context.h" #include "r600_reg.h" #include "r600_reg_auto_r6xx.h" diff --git a/src/mesa/drivers/dri/r600/r700_fragprog.c b/src/mesa/drivers/dri/r600/r700_fragprog.c index fbb808e0662..f9d84b6ed68 100644 --- a/src/mesa/drivers/dri/r600/r700_fragprog.c +++ b/src/mesa/drivers/dri/r600/r700_fragprog.c @@ -32,12 +32,13 @@ #include <math.h> #include "main/imports.h" -#include "shader/prog_parameter.h" -#include "shader/prog_statevars.h" -#include "shader/program.h" +#include "program/prog_parameter.h" +#include "program/prog_statevars.h" +#include "program/program.h" #include "r600_context.h" #include "r600_cmdbuf.h" +#include "r600_emit.h" #include "r700_fragprog.h" @@ -586,7 +587,9 @@ GLboolean r700SetupFragmentProgram(GLcontext * ctx) SETfield(r700->SPI_INTERP_CONTROL_0.u32All, SPI_PNT_SPRITE_SEL_T, PNT_SPRITE_OVRD_Y_shift, PNT_SPRITE_OVRD_Y_mask); SETfield(r700->SPI_INTERP_CONTROL_0.u32All, SPI_PNT_SPRITE_SEL_0, PNT_SPRITE_OVRD_Z_shift, PNT_SPRITE_OVRD_Z_mask); SETfield(r700->SPI_INTERP_CONTROL_0.u32All, SPI_PNT_SPRITE_SEL_1, PNT_SPRITE_OVRD_W_shift, PNT_SPRITE_OVRD_W_mask); - if(ctx->Point.SpriteOrigin == GL_LOWER_LEFT) + /* Like e.g. viewport and winding, point sprite coordinates are + * inverted when rendering to FBO. */ + if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == !ctx->DrawBuffer->Name) SETbit(r700->SPI_INTERP_CONTROL_0.u32All, PNT_SPRITE_TOP_1_bit); else CLEARbit(r700->SPI_INTERP_CONTROL_0.u32All, PNT_SPRITE_TOP_1_bit); diff --git a/src/mesa/drivers/dri/r600/r700_oglprog.c b/src/mesa/drivers/dri/r600/r700_oglprog.c index b7124e644a3..83517925115 100644 --- a/src/mesa/drivers/dri/r600/r700_oglprog.c +++ b/src/mesa/drivers/dri/r600/r700_oglprog.c @@ -29,7 +29,7 @@ #include "main/glheader.h" #include "main/imports.h" -#include "shader/program.h" +#include "program/program.h" #include "tnl/tnl.h" #include "r600_context.h" diff --git a/src/mesa/drivers/dri/r600/r700_oglprog.h b/src/mesa/drivers/dri/r600/r700_oglprog.h index fe2e9d19749..4d421338678 100644 --- a/src/mesa/drivers/dri/r600/r700_oglprog.h +++ b/src/mesa/drivers/dri/r600/r700_oglprog.h @@ -27,7 +27,7 @@ #ifndef _R700_OGLPROG_H_ #define _R700_OGLPROG_H_ -#include "r600_context.h" +#include "main/dd.h" extern void r700InitShaderFuncs(struct dd_function_table *functions); diff --git a/src/mesa/drivers/dri/r600/r700_state.c b/src/mesa/drivers/dri/r600/r700_state.c index ac64bbf874f..5ea8918611c 100644 --- a/src/mesa/drivers/dri/r600/r700_state.c +++ b/src/mesa/drivers/dri/r600/r700_state.c @@ -41,8 +41,8 @@ #include "main/framebuffer.h" #include "drivers/common/meta.h" -#include "shader/prog_parameter.h" -#include "shader/prog_statevars.h" +#include "program/prog_parameter.h" +#include "program/prog_statevars.h" #include "vbo/vbo.h" #include "r600_context.h" diff --git a/src/mesa/drivers/dri/r600/r700_vertprog.c b/src/mesa/drivers/dri/r600/r700_vertprog.c index 14dd2a5482c..137f3007ced 100644 --- a/src/mesa/drivers/dri/r600/r700_vertprog.c +++ b/src/mesa/drivers/dri/r600/r700_vertprog.c @@ -35,14 +35,15 @@ #include "main/mtypes.h" #include "tnl/t_context.h" -#include "shader/program.h" -#include "shader/prog_parameter.h" -#include "shader/prog_statevars.h" +#include "program/program.h" +#include "program/prog_parameter.h" +#include "program/prog_statevars.h" #include "radeon_debug.h" #include "r600_context.h" #include "r600_cmdbuf.h" -#include "shader/programopt.h" +#include "r600_emit.h" +#include "program/programopt.h" #include "r700_debug.h" #include "r700_vertprog.h" diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.c b/src/mesa/drivers/dri/radeon/radeon_common_context.c index 94f476617b6..5a7d52c4d2f 100644 --- a/src/mesa/drivers/dri/radeon/radeon_common_context.c +++ b/src/mesa/drivers/dri/radeon/radeon_common_context.c @@ -300,10 +300,10 @@ void radeonDestroyContext(__DRIcontext *driContextPriv ) _mesa_meta_free(radeon->glCtx); if (radeon == current) { - radeon_firevertices(radeon); _mesa_make_current(NULL, NULL, NULL); } + radeon_firevertices(radeon); if (!is_empty_list(&radeon->dma.reserved)) { rcommonFlushCmdBuf( radeon, __FUNCTION__ ); } diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c index 6cd1d87de24..c877e6c1765 100644 --- a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c +++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c @@ -602,17 +602,17 @@ int radeon_validate_texture_miptree(GLcontext * ctx, struct gl_texture_object *t __FUNCTION__, texObj ,t->minLod, t->maxLod); radeon_mipmap_tree *dst_miptree; - dst_miptree = get_biggest_matching_miptree(t, t->minLod, t->maxLod); + dst_miptree = get_biggest_matching_miptree(t, t->base.BaseLevel, t->base.MaxLevel); + radeon_miptree_unreference(&t->mt); if (!dst_miptree) { - radeon_miptree_unreference(&t->mt); radeon_try_alloc_miptree(rmesa, t); - dst_miptree = t->mt; radeon_print(RADEON_TEXTURE, RADEON_NORMAL, "%s: No matching miptree found, allocated new one %p\n", __FUNCTION__, t->mt); } else { + radeon_miptree_reference(dst_miptree, &t->mt); radeon_print(RADEON_TEXTURE, RADEON_NORMAL, "%s: Using miptree %p\n", __FUNCTION__, t->mt); } @@ -629,7 +629,7 @@ int radeon_validate_texture_miptree(GLcontext * ctx, struct gl_texture_object *t "Checking image level %d, face %d, mt %p ... ", level, face, img->mt); - if (img->mt != dst_miptree) { + if (img->mt != t->mt) { radeon_print(RADEON_TEXTURE, RADEON_TRACE, "MIGRATING\n"); @@ -637,7 +637,7 @@ int radeon_validate_texture_miptree(GLcontext * ctx, struct gl_texture_object *t if (src_bo && radeon_bo_is_referenced_by_cs(src_bo, rmesa->cmdbuf.cs)) { radeon_firevertices(rmesa); } - migrate_image_to_miptree(dst_miptree, img, face, level); + migrate_image_to_miptree(t->mt, img, face, level); } else radeon_print(RADEON_TEXTURE, RADEON_TRACE, "OK\n"); } diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c index 4f59511a528..82107cc6aeb 100644 --- a/src/mesa/drivers/dri/radeon/radeon_screen.c +++ b/src/mesa/drivers/dri/radeon/radeon_screen.c @@ -52,7 +52,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "radeon_tex.h" #elif defined(RADEON_R200) #include "r200_context.h" -#include "r200_ioctl.h" #include "r200_tex.h" #elif defined(RADEON_R300) #include "r300_context.h" @@ -338,12 +337,6 @@ static const __DRItexBufferExtension radeonTexBufferExtension = { #endif #if defined(RADEON_R200) -static const __DRIallocateExtension r200AllocateExtension = { - { __DRI_ALLOCATE, __DRI_ALLOCATE_VERSION }, - r200AllocateMemoryMESA, - r200FreeMemoryMESA, - r200GetMemoryOffsetMESA -}; static const __DRItexOffsetExtension r200texOffsetExtension = { { __DRI_TEX_OFFSET, __DRI_TEX_OFFSET_VERSION }, @@ -1209,7 +1202,6 @@ radeonCreateScreen( __DRIscreen *sPriv ) i = 0; screen->extensions[i++] = &driCopySubBufferExtension.base; - screen->extensions[i++] = &driFrameTrackingExtension.base; screen->extensions[i++] = &driReadDrawableExtension; if ( screen->irq != 0 ) { @@ -1222,9 +1214,6 @@ radeonCreateScreen( __DRIscreen *sPriv ) #endif #if defined(RADEON_R200) - if (IS_R200_CLASS(screen)) - screen->extensions[i++] = &r200AllocateExtension.base; - screen->extensions[i++] = &r200texOffsetExtension.base; #endif @@ -1366,8 +1355,8 @@ radeonCreateScreen2(__DRIscreen *sPriv) i = 0; screen->extensions[i++] = &driCopySubBufferExtension.base; - screen->extensions[i++] = &driFrameTrackingExtension.base; screen->extensions[i++] = &driReadDrawableExtension; + screen->extensions[i++] = &dri2ConfigQueryExtension.base; if ( screen->irq != 0 ) { screen->extensions[i++] = &driSwapControlExtension.base; @@ -1379,9 +1368,6 @@ radeonCreateScreen2(__DRIscreen *sPriv) #endif #if defined(RADEON_R200) - if (IS_R200_CLASS(screen)) - screen->extensions[i++] = &r200AllocateExtension.base; - screen->extensions[i++] = &r200TexBufferExtension.base; #endif diff --git a/src/mesa/drivers/dri/radeon/radeon_tex_getimage.c b/src/mesa/drivers/dri/radeon/radeon_tex_getimage.c index 3ababb1ef53..f878b48e5f9 100644 --- a/src/mesa/drivers/dri/radeon/radeon_tex_getimage.c +++ b/src/mesa/drivers/dri/radeon/radeon_tex_getimage.c @@ -31,6 +31,7 @@ #include "radeon_common_context.h" #include "radeon_texture.h" +#include "radeon_mipmap_tree.h" #include "main/texgetimage.h" @@ -51,7 +52,15 @@ radeon_get_tex_image(GLcontext * ctx, GLenum target, GLint level, __func__, ctx, texObj, image, compressed); if (image->mt) { + radeonContextPtr rmesa = RADEON_CONTEXT(ctx); /* Map the texture image read-only */ + if (radeon_bo_is_referenced_by_cs(image->mt->bo, rmesa->cmdbuf.cs)) { + radeon_print(RADEON_TEXTURE, RADEON_VERBOSE, + "%s: called for texture that is queued for GPU processing\n", + __func__); + radeon_firevertices(rmesa); + } + radeon_teximage_map(image, GL_FALSE); } else { /* Image hasn't been uploaded to a miptree yet */ diff --git a/src/mesa/drivers/dri/sis/sis_state.c b/src/mesa/drivers/dri/sis/sis_state.c index a22195ccceb..6173231a82e 100644 --- a/src/mesa/drivers/dri/sis/sis_state.c +++ b/src/mesa/drivers/dri/sis/sis_state.c @@ -37,6 +37,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. #include "sis_lock.h" #include "main/context.h" +#include "main/macros.h" #include "swrast/swrast.h" #include "vbo/vbo.h" #include "tnl/tnl.h" diff --git a/src/mesa/drivers/dri/unichrome/via_screen.c b/src/mesa/drivers/dri/unichrome/via_screen.c index ee10b569bf1..4b3e9d5a38f 100644 --- a/src/mesa/drivers/dri/unichrome/via_screen.c +++ b/src/mesa/drivers/dri/unichrome/via_screen.c @@ -166,7 +166,6 @@ viaInitDriver(__DRIscreen *sPriv) viaScreen->sareaPrivOffset = gDRIPriv->sarea_priv_offset; i = 0; - viaScreen->extensions[i++] = &driFrameTrackingExtension.base; viaScreen->extensions[i++] = &driReadDrawableExtension; if ( viaScreen->irqEnabled ) { viaScreen->extensions[i++] = &driSwapControlExtension.base; |