102 files changed, 2348 insertions, 889 deletions
diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c
index ca5eb5c7552..227710fb025 100644
--- a/src/mesa/drivers/common/driverfuncs.c
+++ b/src/mesa/drivers/common/driverfuncs.c
@@ -31,6 +31,7 @@
 #include "main/mipmap.h"
 #include "main/queryobj.h"
 #include "main/renderbuffer.h"
+#include "main/shaderobj.h"
 #include "main/texcompress.h"
 #include "main/texformat.h"
 #include "main/texgetimage.h"
@@ -51,8 +52,7 @@
 #include "main/transformfeedback.h"
 #endif
 
-#include "shader/program.h"
-#include "shader/shader_api.h"
+#include "program/program.h"
 #include "tnl/tnl.h"
 #include "swrast/swrast.h"
 
@@ -208,6 +208,8 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
    driver->DeleteArrayObject = _mesa_delete_array_object;
    driver->BindArrayObject = NULL;
 
+   _mesa_init_shader_object_functions(driver);
+
 #if FEATURE_EXT_transform_feedback
    _mesa_init_transform_feedback_functions(driver);
 #endif
@@ -231,10 +233,6 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
    driver->EndList = NULL;
    driver->BeginCallList = NULL;
    driver->EndCallList = NULL;
-
-
-   /* XXX temporary here */
-   _mesa_init_glsl_driver_functions(driver);
 }
 
 
diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index 35255833821..dc6e7120c63 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -34,6 +34,7 @@
 #include "main/glheader.h"
 #include "main/mtypes.h"
 #include "main/imports.h"
+#include "main/arbprogram.h"
 #include "main/arrayobj.h"
 #include "main/blend.h"
 #include "main/bufferobj.h"
@@ -51,7 +52,7 @@
 #include "main/polygon.h"
 #include "main/readpix.h"
 #include "main/scissor.h"
-#include "main/shaders.h"
+#include "main/shaderapi.h"
 #include "main/state.h"
 #include "main/stencil.h"
 #include "main/texobj.h"
@@ -61,8 +62,7 @@
 #include "main/texstate.h"
 #include "main/varray.h"
 #include "main/viewport.h"
-#include "shader/program.h"
-#include "shader/arbprogram.h"
+#include "program/program.h"
 #include "swrast/swrast.h"
 #include "drivers/common/meta.h"
 
@@ -2570,12 +2570,6 @@ copy_tex_image(GLcontext *ctx, GLuint dims, GLenum target, GLint level,
       return;
    }
 
-   if (texImage->TexFormat == MESA_FORMAT_NONE)
-      texImage->TexFormat = ctx->Driver.ChooseTextureFormat(ctx,
-                                                            internalFormat,
-                                                            format,
-                                                            type);
-
    _mesa_unlock_texture(ctx, texObj); /* need to unlock first */
 
    /*
@@ -2604,6 +2598,9 @@ copy_tex_image(GLcontext *ctx, GLuint dims, GLenum target, GLint level,
                               postConvWidth, postConvHeight, 1,
                               border, internalFormat);
 
+   _mesa_choose_texture_format(ctx, texObj, texImage, target, level,
+                               internalFormat, GL_NONE, GL_NONE);
+
    /*
     * Store texture data (with pixel transfer ops)
     */
diff --git a/src/mesa/drivers/dri/common/dri_metaops.c b/src/mesa/drivers/dri/common/dri_metaops.c
index dfb7d640409..86e59a8e51c 100644
--- a/src/mesa/drivers/dri/common/dri_metaops.c
+++ b/src/mesa/drivers/dri/common/dri_metaops.c
@@ -26,6 +26,7 @@
  *
  **************************************************************************/
 
+#include "main/arbprogram.h"
 #include "main/arrayobj.h"
 #include "main/bufferobj.h"
 #include "main/enable.h"
@@ -33,8 +34,7 @@
 #include "main/texstate.h"
 #include "main/varray.h"
 #include "main/viewport.h"
-#include "shader/arbprogram.h"
-#include "shader/program.h"
+#include "program/program.h"
 #include "dri_metaops.h"
 
 void
diff --git a/src/mesa/drivers/dri/common/dri_util.c b/src/mesa/drivers/dri/common/dri_util.c
index 18b9035248f..dce84ef0deb 100644
--- a/src/mesa/drivers/dri/common/dri_util.c
+++ b/src/mesa/drivers/dri/common/dri_util.c
@@ -927,41 +927,6 @@ const __DRI2configQueryExtension dri2ConfigQueryExtension = {
    dri2ConfigQueryf,
 };
 
-static int
-driFrameTracking(__DRIdrawable *drawable, GLboolean enable)
-{
-    return GLX_BAD_CONTEXT;
-}
-
-static int
-driQueryFrameTracking(__DRIdrawable *dpriv,
-		      int64_t * sbc, int64_t * missedFrames,
-		      float * lastMissedUsage, float * usage)
-{
-   __DRIswapInfo   sInfo;
-   int             status;
-   int64_t         ust;
-   __DRIscreen *psp = dpriv->driScreenPriv;
-
-   status = dpriv->driScreenPriv->DriverAPI.GetSwapInfo( dpriv, & sInfo );
-   if ( status == 0 ) {
-      *sbc = sInfo.swap_count;
-      *missedFrames = sInfo.swap_missed_count;
-      *lastMissedUsage = sInfo.swap_missed_usage;
-
-      (*psp->systemTime->getUST)( & ust );
-      *usage = driCalculateSwapUsage( dpriv, sInfo.swap_ust, ust );
-   }
-
-   return status;
-}
-
-const __DRIframeTrackingExtension driFrameTrackingExtension = {
-    { __DRI_FRAME_TRACKING, __DRI_FRAME_TRACKING_VERSION },
-    driFrameTracking,
-    driQueryFrameTracking    
-};
-
 /**
  * Calculate amount of swap interval used between GLX buffer swaps.
  * 
diff --git a/src/mesa/drivers/dri/common/dri_util.h b/src/mesa/drivers/dri/common/dri_util.h
index e4c590b1322..bc647ff8130 100644
--- a/src/mesa/drivers/dri/common/dri_util.h
+++ b/src/mesa/drivers/dri/common/dri_util.h
@@ -70,7 +70,6 @@ extern const __DRIdri2Extension driDRI2Extension;
 extern const __DRIextension driReadDrawableExtension;
 extern const __DRIcopySubBufferExtension driCopySubBufferExtension;
 extern const __DRIswapControlExtension driSwapControlExtension;
-extern const __DRIframeTrackingExtension driFrameTrackingExtension;
 extern const __DRImediaStreamCounterExtension driMediaStreamCounterExtension;
 extern const __DRI2configQueryExtension dri2ConfigQueryExtension;
 
diff --git a/src/mesa/drivers/dri/i915/i915_fragprog.c b/src/mesa/drivers/dri/i915/i915_fragprog.c
index e60157f3777..f1505dc5e73 100644
--- a/src/mesa/drivers/dri/i915/i915_fragprog.c
+++ b/src/mesa/drivers/dri/i915/i915_fragprog.c
@@ -29,11 +29,11 @@
 #include "main/macros.h"
 #include "main/enums.h"
 
-#include "shader/prog_instruction.h"
-#include "shader/prog_parameter.h"
-#include "shader/program.h"
-#include "shader/programopt.h"
-#include "shader/prog_print.h"
+#include "program/prog_instruction.h"
+#include "program/prog_parameter.h"
+#include "program/program.h"
+#include "program/programopt.h"
+#include "program/prog_print.h"
 
 #include "tnl/tnl.h"
 #include "tnl/t_context.h"
diff --git a/src/mesa/drivers/dri/i965/brw_clip.c b/src/mesa/drivers/dri/i965/brw_clip.c
index 228ee3f3be1..a1e9dae9154 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.c
+++ b/src/mesa/drivers/dri/i965/brw_clip.c
@@ -55,6 +55,7 @@ static void compile_clip_prog( struct brw_context *brw,
    GLuint program_size;
    GLuint delta;
    GLuint i;
+   GLuint header_regs;
 
    memset(&c, 0, sizeof(c));
    
@@ -72,22 +73,28 @@ static void compile_clip_prog( struct brw_context *brw,
    c.header_position_offset = ATTR_SIZE;
 
    if (intel->gen == 5)
-       delta = 3 * REG_SIZE;
+      header_regs = 3;
    else
-       delta = REG_SIZE;
+      header_regs = 1;
 
-   for (i = 0; i < VERT_RESULT_MAX; i++)
+   delta = header_regs * REG_SIZE;
+
+   for (i = 0; i < VERT_RESULT_MAX; i++) {
       if (c.key.attrs & BITFIELD64_BIT(i)) {
 	 c.offset[i] = delta;
 	 delta += ATTR_SIZE;
+
+	 c.idx_to_attr[c.nr_attrs] = i;
+	 c.nr_attrs++;
       }
+   }
 
-   c.nr_attrs = brw_count_bits(c.key.attrs);
-   
-   if (intel->gen == 5)
-       c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
-   else
-       c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
+   /* The vertex attributes start at a URB row-aligned offset after
+    * the 8-20 dword vertex header, and continue for a URB row-aligned
+    * length.  nr_regs determines the urb_read_length from the start
+    * of the header to the end of the vertex data.
+    */
+   c.nr_regs = header_regs + (c.nr_attrs + 1) / 2;
 
    c.nr_bytes = c.nr_regs * REG_SIZE;
 
diff --git a/src/mesa/drivers/dri/i965/brw_clip.h b/src/mesa/drivers/dri/i965/brw_clip.h
index 68222c6c278..3a8cd7bf390 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.h
+++ b/src/mesa/drivers/dri/i965/brw_clip.h
@@ -115,7 +115,10 @@ struct brw_clip_compile {
    GLboolean need_direction;
 
    GLuint header_position_offset;
-   GLuint offset[VERT_ATTRIB_MAX];
+   /** Mapping from VERT_RESULT_* to offset within the VUE. */
+   GLuint offset[VERT_RESULT_MAX];
+   /** Mapping from attribute index to VERT_RESULT_* */
+   GLuint idx_to_attr[VERT_RESULT_MAX];
 };
 
 #define ATTR_SIZE  (4*4)
diff --git a/src/mesa/drivers/dri/i965/brw_clip_line.c b/src/mesa/drivers/dri/i965/brw_clip_line.c
index ceb62a31162..4b9117bb0b1 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_line.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_line.c
@@ -32,7 +32,7 @@
 #include "main/glheader.h"
 #include "main/macros.h"
 #include "main/enums.h"
-#include "shader/program.h"
+#include "program/program.h"
 
 #include "intel_batchbuffer.h"
 
diff --git a/src/mesa/drivers/dri/i965/brw_clip_point.c b/src/mesa/drivers/dri/i965/brw_clip_point.c
index 7f47634dca8..b994a32bc3b 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_point.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_point.c
@@ -32,7 +32,7 @@
 #include "main/glheader.h"
 #include "main/macros.h"
 #include "main/enums.h"
-#include "shader/program.h"
+#include "program/program.h"
 
 #include "intel_batchbuffer.h"
 
diff --git a/src/mesa/drivers/dri/i965/brw_clip_tri.c b/src/mesa/drivers/dri/i965/brw_clip_tri.c
index 916a99ea004..cb58d1da9fe 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_tri.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_tri.c
@@ -32,7 +32,7 @@
 #include "main/glheader.h"
 #include "main/macros.h"
 #include "main/enums.h"
-#include "shader/program.h"
+#include "program/program.h"
 
 #include "intel_batchbuffer.h"
 
@@ -76,10 +76,7 @@ void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
 
    if (c->nr_attrs & 1) {
       for (j = 0; j < 3; j++) {
-	 GLuint delta = c->nr_attrs*16 + 32;
-
-         if (intel->gen == 5)
-             delta = c->nr_attrs * 16 + 32 * 3;
+	 GLuint delta = c->offset[c->idx_to_attr[c->nr_attrs - 1]] + ATTR_SIZE;
 
 	 brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0));
       }
diff --git a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c b/src/mesa/drivers/dri/i965/brw_clip_unfilled.c
index f36d22fdbf8..afd93f8be0b 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_unfilled.c
@@ -32,7 +32,7 @@
 #include "main/glheader.h"
 #include "main/macros.h"
 #include "main/enums.h"
-#include "shader/program.h"
+#include "program/program.h"
 
 #include "intel_batchbuffer.h"
 
diff --git a/src/mesa/drivers/dri/i965/brw_clip_util.c b/src/mesa/drivers/dri/i965/brw_clip_util.c
index 2148bc8244a..d2ac1235e46 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_util.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_util.c
@@ -33,7 +33,7 @@
 #include "main/glheader.h"
 #include "main/macros.h"
 #include "main/enums.h"
-#include "shader/program.h"
+#include "program/program.h"
 
 #include "intel_batchbuffer.h"
 
@@ -134,7 +134,6 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
 			     GLboolean force_edgeflag)
 {
    struct brw_compile *p = &c->func;
-   struct intel_context *intel = &p->brw->intel;
    struct brw_reg tmp = get_tmp(c);
    GLuint i;
 
@@ -149,12 +148,9 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
    /* Iterate over each attribute (could be done in pairs?)
     */
    for (i = 0; i < c->nr_attrs; i++) {
-      GLuint delta = i*16 + 32;
+      GLuint delta = c->offset[c->idx_to_attr[i]];
 
-      if (intel->gen == 5)
-          delta = i * 16 + 32 * 3;
-
-      if (delta == c->offset[VERT_RESULT_EDGE]) {
+      if (c->idx_to_attr[i] == VERT_RESULT_EDGE) {
 	 if (force_edgeflag) 
 	    brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(1));
 	 else
@@ -183,10 +179,7 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
    }
 
    if (i & 1) {
-      GLuint delta = i*16 + 32;
-
-      if (intel->gen == 5)
-          delta = i * 16 + 32 * 3;
+      GLuint delta = c->offset[c->idx_to_attr[c->nr_attrs - 1]] + ATTR_SIZE;
 
       brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0));
    }
@@ -199,11 +192,6 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
    brw_clip_project_vertex(c, dest_ptr );
 }
 
-
-
-
-#define MAX_MRF 16
-
 void brw_clip_emit_vue(struct brw_clip_compile *c, 
 		       struct brw_indirect vert,
 		       GLboolean allocate,
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index d13b9ae298b..6d064b822e5 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -34,7 +34,6 @@
 #include "main/api_noop.h"
 #include "main/macros.h"
 #include "main/simple_list.h"
-
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "brw_draw.h"
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index 6c0b79f7241..8196d8ca625 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -35,9 +35,9 @@
 #include "main/context.h"
 #include "main/macros.h"
 #include "main/enums.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-#include "shader/prog_statevars.h"
+#include "program/prog_parameter.h"
+#include "program/prog_print.h"
+#include "program/prog_statevars.h"
 #include "intel_batchbuffer.h"
 #include "intel_regions.h"
 #include "brw_context.h"
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 39bf5b63fc2..f7a68cead7c 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -501,6 +501,10 @@
 #define BRW_MASK_ENABLE   0
 #define BRW_MASK_DISABLE  1
 
+/* On Sandybridge this field is WECtrl (write enable control) */
+#define BRW_WE_NORMAL		0
+#define BRW_WE_KILL_PRED	1
+
 #define BRW_OPCODE_MOV        1
 #define BRW_OPCODE_SEL        2
 #define BRW_OPCODE_NOT        4
@@ -600,6 +604,8 @@
 #define BRW_ARF_NOTIFICATION_COUNT    0x90
 #define BRW_ARF_IP                    0xA0
 
+#define BRW_MRF_COMPR4			(1 << 7)
+
 #define BRW_AMASK   0
 #define BRW_IMASK   1
 #define BRW_LMASK   2
@@ -646,13 +652,14 @@
 #define BRW_POLYGON_FACING_BACK       1
 
 #define BRW_MESSAGE_TARGET_NULL               0
-#define BRW_MESSAGE_TARGET_MATH               1
+#define BRW_MESSAGE_TARGET_MATH               1 /* reserved on GEN6 */
 #define BRW_MESSAGE_TARGET_SAMPLER            2
 #define BRW_MESSAGE_TARGET_GATEWAY            3
-#define BRW_MESSAGE_TARGET_DATAPORT_READ      4
-#define BRW_MESSAGE_TARGET_DATAPORT_WRITE     5
+#define BRW_MESSAGE_TARGET_DATAPORT_READ      4 /* sampler cache on GEN6 */
+#define BRW_MESSAGE_TARGET_DATAPORT_WRITE     5 /* render cache on Gen6 */
 #define BRW_MESSAGE_TARGET_URB                6
 #define BRW_MESSAGE_TARGET_THREAD_SPAWNER     7
+#define BRW_MESSAGE_TARGET_CONST_CACHE	      9 /* GEN6 */
 
 #define BRW_SAMPLER_RETURN_FORMAT_FLOAT32     0
 #define BRW_SAMPLER_RETURN_FORMAT_UINT32      2
@@ -698,10 +705,24 @@
 #define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS   2
 #define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS  3
 
+/* This one stays the same across generations. */
 #define BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ          0
+/* GEN4 */
 #define BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ     1
-#define BRW_DATAPORT_READ_MESSAGE_DWORD_BLOCK_READ          2
+#define BRW_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ          2
 #define BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ      3
+/* G45, GEN5 */
+#define G45_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ	    1
+#define G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ     2
+#define G45_DATAPORT_READ_MESSAGE_AVC_LOOP_FILTER_READ	    3
+#define G45_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ          4
+#define G45_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ      6
+/* GEN6 */
+#define GEN6_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ	    1
+#define GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ     2
+#define GEN6_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ          4
+#define GEN6_DATAPORT_READ_MESSAGE_OWORD_UNALIGN_BLOCK_READ  5
+#define GEN6_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ      6
 
 #define BRW_DATAPORT_READ_TARGET_DATA_CACHE      0
 #define BRW_DATAPORT_READ_TARGET_RENDER_CACHE    1
@@ -721,6 +742,16 @@
 #define BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VERTEX_BUFFER_WRITE     5
 #define BRW_DATAPORT_WRITE_MESSAGE_FLUSH_RENDER_CACHE               7
 
+/* GEN6 */
+#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_ATOMIC_WRITE_GEN6		7
+#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE_GEN6		8
+#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE_GEN6		9
+#define BRW_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE_GEN6		10
+#define BRW_DATAPORT_WRITE_MESSAGE_DWORLD_SCATTERED_WRITE_GEN6		11
+#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE_GEN6		12
+#define BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE_GEN6		13
+#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_UNORM_WRITE_GEN6	14
+
 #define BRW_MATH_FUNCTION_INV                              1
 #define BRW_MATH_FUNCTION_LOG                              2
 #define BRW_MATH_FUNCTION_EXP                              3
diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c
index ff12daf497d..d2307145361 100644
--- a/src/mesa/drivers/dri/i965/brw_disasm.c
+++ b/src/mesa/drivers/dri/i965/brw_disasm.c
@@ -598,7 +598,7 @@ static int src_da16 (FILE *file,
 	format (file, ".%d", _subreg_nr);
     string (file, "<");
     err |= control (file, "vert stride", vert_stride, _vert_stride, NULL);
-    string (file, ",1,1>");
+    string (file, ",4,1>");
     err |= control (file, "src da16 reg type", reg_encoding, _reg_type, NULL);
     /*
      * Three kinds of swizzle display:
@@ -836,10 +836,12 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen)
     if (inst->header.opcode == BRW_OPCODE_SEND) {
 	int target;
 
-	if (gen >= 5)
-	   target = inst->bits2.send_gen5.sfid;
+	if (gen >= 6)
+	    target = inst->header.destreg__conditionalmod;
+	else if (gen == 5)
+	    target = inst->bits2.send_gen5.sfid;
 	else
-	   target = inst->bits3.generic.msg_target;
+	    target = inst->bits3.generic.msg_target;
 
 	newline (file);
 	pad (file, 16);
@@ -868,13 +870,44 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen)
 			    inst->bits3.sampler.return_format, NULL);
 	    string (file, ")");
 	    break;
+	case BRW_MESSAGE_TARGET_DATAPORT_READ:
+	    if (gen >= 6) {
+		format (file, " (%d, %d, %d, %d, %d, %d)",
+			inst->bits3.dp_render_cache.binding_table_index,
+			inst->bits3.dp_render_cache.msg_control,
+			inst->bits3.dp_render_cache.msg_type,
+			inst->bits3.dp_render_cache.send_commit_msg,
+			inst->bits3.dp_render_cache.msg_length,
+			inst->bits3.dp_render_cache.response_length);
+	    } else if (gen >= 5) {
+		format (file, " (%d, %d, %d)",
+			inst->bits3.dp_read_gen5.binding_table_index,
+			inst->bits3.dp_read_gen5.msg_control,
+			inst->bits3.dp_read_gen5.msg_type);
+	    } else {
+		format (file, " (%d, %d, %d)",
+			inst->bits3.dp_read.binding_table_index,
+			inst->bits3.dp_read.msg_control,
+			inst->bits3.dp_read.msg_type);
+	    }
+	    break;
 	case BRW_MESSAGE_TARGET_DATAPORT_WRITE:
-	    format (file, " (%d, %d, %d, %d)",
-		    inst->bits3.dp_write.binding_table_index,
-		    (inst->bits3.dp_write.pixel_scoreboard_clear << 3) |
-		    inst->bits3.dp_write.msg_control,
-		    inst->bits3.dp_write.msg_type,
-		    inst->bits3.dp_write.send_commit_msg);
+	    if (gen >= 6) {
+		format (file, " (%d, %d, %d, %d, %d, %d)",
+			inst->bits3.dp_render_cache.binding_table_index,
+			inst->bits3.dp_render_cache.msg_control,
+			inst->bits3.dp_render_cache.msg_type,
+			inst->bits3.dp_render_cache.send_commit_msg,
+			inst->bits3.dp_render_cache.msg_length,
+			inst->bits3.dp_render_cache.response_length);
+	    } else {
+		format (file, " (%d, %d, %d, %d)",
+			inst->bits3.dp_write.binding_table_index,
+			(inst->bits3.dp_write.pixel_scoreboard_clear << 3) |
+			inst->bits3.dp_write.msg_control,
+			inst->bits3.dp_write.msg_type,
+			inst->bits3.dp_write.send_commit_msg);
+	    }
 	    break;
 	case BRW_MESSAGE_TARGET_URB:
 	    if (gen >= 5) {
@@ -900,15 +933,22 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen)
 	case BRW_MESSAGE_TARGET_THREAD_SPAWNER:
 	    break;
 	default:
-	    format (file, "unsupported target %d", inst->bits3.generic.msg_target);
+	    format (file, "unsupported target %d", target);
 	    break;
 	}
 	if (space)
 	    string (file, " ");
-	format (file, "mlen %d",
-		inst->bits3.generic.msg_length);
-	format (file, " rlen %d",
-		inst->bits3.generic.response_length);
+	if (gen >= 5) {
+	   format (file, "mlen %d",
+		   inst->bits3.generic_gen5.msg_length);
+	   format (file, " rlen %d",
+		   inst->bits3.generic_gen5.response_length);
+	} else {
+	   format (file, "mlen %d",
+		   inst->bits3.generic.msg_length);
+	   format (file, " rlen %d",
+		   inst->bits3.generic.response_length);
+	}
     }
     pad (file, 64);
     if (inst->header.opcode != BRW_OPCODE_NOP) {
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index 3a32ad26c12..ffdddd0a388 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -35,7 +35,7 @@
 
 #include "brw_structs.h"
 #include "brw_defines.h"
-#include "shader/prog_instruction.h"
+#include "program/prog_instruction.h"
 
 #define BRW_SWIZZLE4(a,b,c,d) (((a)<<0) | ((b)<<2) | ((c)<<4) | ((d)<<6))
 #define BRW_GET_SWZ(swz, idx) (((swz) >> ((idx)*2)) & 0x3)
@@ -520,6 +520,20 @@ static INLINE struct brw_reg brw_acc_reg( void )
 		       0);
 }
 
+static INLINE struct brw_reg brw_notification_1_reg(void)
+{
+   /* ARF notification count register, typed UD, as used by brw_WAIT(). */
+   return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+		  BRW_ARF_NOTIFICATION_COUNT,
+		  1,
+		  BRW_REGISTER_TYPE_UD,
+		  BRW_VERTICAL_STRIDE_0,
+		  BRW_WIDTH_1,
+		  BRW_HORIZONTAL_STRIDE_0,
+		  BRW_SWIZZLE_XXXX,
+		  WRITEMASK_X);
+}
+
 
 static INLINE struct brw_reg brw_flag_reg( void )
 {
@@ -877,12 +891,15 @@ void brw_dp_READ_4( struct brw_compile *p,
 
 void brw_dp_READ_4_vs( struct brw_compile *p,
                        struct brw_reg dest,
-                       GLuint oword,
-                       GLboolean relAddr,
-                       struct brw_reg addrReg,
                        GLuint location,
                        GLuint bind_table_index );
 
+void brw_dp_READ_4_vs_relative(struct brw_compile *p,
+			       struct brw_reg dest,
+			       struct brw_reg addrReg,
+			       GLuint offset,
+			       GLuint bind_table_index);
+
 void brw_dp_WRITE_16( struct brw_compile *p,
 		      struct brw_reg src,
 		      GLuint scratch_offset );
@@ -919,6 +936,8 @@ void brw_land_fwd_jump(struct brw_compile *p,
 
 void brw_NOP(struct brw_compile *p);
 
+void brw_WAIT(struct brw_compile *p);
+
 /* Special case: there is never a destination, execution size will be
  * taken from src0:
  */
@@ -965,5 +984,7 @@ void brw_set_src1( struct brw_instruction *insn,
 
 /* brw_optimize.c */
 void brw_optimize(struct brw_compile *p);
+void brw_remove_duplicate_mrf_moves(struct brw_compile *p);
+void brw_remove_grf_to_mrf_moves(struct brw_compile *p);
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 34dfe10cb93..0d5d17f501d 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -364,7 +364,8 @@ static void brw_set_dp_write_message( struct brw_context *brw,
 				      GLuint msg_length,
 				      GLuint pixel_scoreboard_clear,
 				      GLuint response_length,
-				      GLuint end_of_thread )
+				      GLuint end_of_thread,
+				      GLuint send_commit_msg)
 {
    struct intel_context *intel = &brw->intel;
    brw_set_src1(insn, brw_imm_d(0));
@@ -374,7 +375,7 @@ static void brw_set_dp_write_message( struct brw_context *brw,
        insn->bits3.dp_write_gen5.msg_control = msg_control;
        insn->bits3.dp_write_gen5.pixel_scoreboard_clear = pixel_scoreboard_clear;
        insn->bits3.dp_write_gen5.msg_type = msg_type;
-       insn->bits3.dp_write_gen5.send_commit_msg = 0;
+       insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
        insn->bits3.dp_write_gen5.header_present = 1;
        insn->bits3.dp_write_gen5.response_length = response_length;
        insn->bits3.dp_write_gen5.msg_length = msg_length;
@@ -386,7 +387,7 @@ static void brw_set_dp_write_message( struct brw_context *brw,
        insn->bits3.dp_write.msg_control = msg_control;
        insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
        insn->bits3.dp_write.msg_type = msg_type;
-       insn->bits3.dp_write.send_commit_msg = 0;
+       insn->bits3.dp_write.send_commit_msg = send_commit_msg;
        insn->bits3.dp_write.response_length = response_length;
        insn->bits3.dp_write.msg_length = msg_length;
        insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
@@ -906,6 +907,20 @@ void brw_CMP(struct brw_compile *p,
    }
 }
 
+/* Emit a 'wait' instruction on notification register n1; the host can
+   program MMIO to wake the thread up again. */
+void brw_WAIT (struct brw_compile *p)
+{
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
+   struct brw_reg src = brw_notification_1_reg();
+   /* The notification register is both the destination and src0. */
+   brw_set_dest(insn, src);
+   brw_set_src0(insn, src);
+   brw_set_src1(insn, brw_null_reg());
+   insn->header.execution_size = 0; /* must be 0 for wait */
+   insn->header.predicate_control = 0;
+   insn->header.compression_control = 0;
+}
 
 
 /***********************************************************************
@@ -1040,6 +1055,7 @@ void brw_dp_WRITE_16( struct brw_compile *p,
 		      struct brw_reg src,
 		      GLuint scratch_offset )
 {
+   struct intel_context *intel = &p->brw->intel;
    GLuint msg_reg_nr = 1;
    {
       brw_push_insn_state(p);
@@ -1056,13 +1072,32 @@ void brw_dp_WRITE_16( struct brw_compile *p,
 
    {
       GLuint msg_length = 3;
-      struct brw_reg dest = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
+      struct brw_reg dest;
       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
-   
+      int send_commit_msg;
+
       insn->header.predicate_control = 0; /* XXX */
       insn->header.compression_control = BRW_COMPRESSION_NONE; 
       insn->header.destreg__conditionalmod = msg_reg_nr;
-  
+
+      /* Until gen6, writes followed by reads from the same location
+       * are not guaranteed to be ordered unless write_commit is set.
+       * If set, then a no-op write is issued to the destination
+       * register to set a dependency, and a read from the destination
+       * can be used to ensure the ordering.
+       *
+       * For gen6, only writes between different threads need ordering
+       * protection.  Our use of DP writes is all about register
+       * spilling within a thread.
+       */
+      if (intel->gen >= 6) {
+	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
+	 send_commit_msg = 0;
+      } else {
+	 dest = brw_uw16_grf(0, 0);
+	 send_commit_msg = 1;
+      }
+
       brw_set_dest(insn, dest);
       brw_set_src0(insn, src);
 
@@ -1073,8 +1108,9 @@ void brw_dp_WRITE_16( struct brw_compile *p,
 			       BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */
 			       msg_length,
 			       0, /* pixel scoreboard */
-			       0, /* response_length */
-			       0); /* eot */
+			       send_commit_msg, /* response_length */
+			       0, /* eot */
+			       send_commit_msg);
    }
 }
 
@@ -1115,7 +1151,7 @@ void brw_dp_READ_16( struct brw_compile *p,
       brw_set_dp_read_message(p->brw,
 			      insn,
 			      255, /* binding table index (255=stateless) */
-			      3,  /* msg_control (3 means 4 Owords) */
+			      BRW_DATAPORT_OWORD_BLOCK_4_OWORDS,
 			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
 			      1, /* target cache (render/scratch) */
 			      1, /* msg_length */
@@ -1190,68 +1226,107 @@ void brw_dp_READ_4( struct brw_compile *p,
  */
 void brw_dp_READ_4_vs(struct brw_compile *p,
                       struct brw_reg dest,
-                      GLuint oword,
-                      GLboolean relAddr,
-                      struct brw_reg addrReg,
                       GLuint location,
                       GLuint bind_table_index)
 {
+   struct brw_instruction *insn;
    GLuint msg_reg_nr = 1;
+   struct brw_reg b;
 
-   assert(oword < 2);
    /*
    printf("vs const read msg, location %u, msg_reg_nr %d\n",
           location, msg_reg_nr);
    */
 
    /* Setup MRF[1] with location/offset into const buffer */
-   {
-      struct brw_reg b;
+   brw_push_insn_state(p);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 
-      brw_push_insn_state(p);
-      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-      brw_set_mask_control(p, BRW_MASK_DISABLE);
-      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-      /*brw_set_access_mode(p, BRW_ALIGN_16);*/
+   /* XXX I think we're setting all the dwords of MRF[1] to 'location',
+    * when the docs say only dword[2] should be set.  Hmmm.  But it works.
+    */
+   b = brw_message_reg(msg_reg_nr);
+   b = retype(b, BRW_REGISTER_TYPE_UD);
+   /*b = get_element_ud(b, 2);*/
+   brw_MOV(p, b, brw_imm_ud(location));
 
-      /* XXX I think we're setting all the dwords of MRF[1] to 'location'.
-       * when the docs say only dword[2] should be set.  Hmmm.  But it works.
-       */
-      b = brw_message_reg(msg_reg_nr);
-      b = retype(b, BRW_REGISTER_TYPE_UD);
-      /*b = get_element_ud(b, 2);*/
-      if (relAddr) {
-         brw_ADD(p, b, addrReg, brw_imm_ud(location));
-      }
-      else {
-         brw_MOV(p, b, brw_imm_ud(location));
-      }
+   brw_pop_insn_state(p);
 
-      brw_pop_insn_state(p);
-   }
+   insn = next_insn(p, BRW_OPCODE_SEND);
 
-   {
-      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
-   
-      insn->header.predicate_control = BRW_PREDICATE_NONE;
-      insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditionalmod = msg_reg_nr;
-      insn->header.mask_control = BRW_MASK_DISABLE;
-      /*insn->header.access_mode = BRW_ALIGN_16;*/
-  
-      brw_set_dest(insn, dest);
-      brw_set_src0(insn, brw_null_reg());
+   insn->header.predicate_control = BRW_PREDICATE_NONE;
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.destreg__conditionalmod = msg_reg_nr;
+   insn->header.mask_control = BRW_MASK_DISABLE;
 
-      brw_set_dp_read_message(p->brw,
-			      insn,
-			      bind_table_index,
-			      oword,  /* 0 = lower Oword, 1 = upper Oword */
-			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
-			      0, /* source cache = data cache */
-			      1, /* msg_length */
-			      1, /* response_length (1 Oword) */
-			      0); /* eot */
-   }
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, brw_null_reg());
+
+   brw_set_dp_read_message(p->brw,
+			   insn,
+			   bind_table_index,
+			   0,
+			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
+			   0, /* source cache = data cache */
+			   1, /* msg_length */
+			   1, /* response_length (1 Oword) */
+			   0); /* eot */
+}
+
+/**
+ * Read a float[4] constant per vertex from the VS constant buffer into
+ * dest, with relative addressing: block offsets are addr_reg + offset.
+ */
+void brw_dp_READ_4_vs_relative(struct brw_compile *p,
+			       struct brw_reg dest,
+			       struct brw_reg addr_reg,
+			       GLuint offset,
+			       GLuint bind_table_index)
+{
+   struct intel_context *intel = &p->brw->intel;
+   struct brw_instruction *insn;
+   int msg_type;
+
+   /* Set up MRF[1] with the offsets into the const buffer */
+   brw_push_insn_state(p);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+   /* M1.0 is block offset 0, M1.4 is block offset 1, all other
+    * fields ignored.
+    */
+   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD),
+	   addr_reg, brw_imm_d(offset));
+   brw_pop_insn_state(p);
+
+   insn = next_insn(p, BRW_OPCODE_SEND);
+   insn->header.predicate_control = BRW_PREDICATE_NONE;
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.destreg__conditionalmod = 0;
+   insn->header.mask_control = BRW_MASK_DISABLE;
+
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, brw_vec8_grf(0, 0));
+
+   if (intel->gen == 6)
+      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+   else if (intel->gen == 5 || intel->is_g4x)
+      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+   else
+      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+
+   brw_set_dp_read_message(p->brw,
+			   insn,
+			   bind_table_index,
+			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
+			   msg_type,
+			   0, /* source cache = data cache */
+			   2, /* msg_length */
+			   1, /* response_length */
+			   0); /* eot */
+}
 
 
@@ -1281,7 +1356,8 @@ void brw_fb_WRITE(struct brw_compile *p,
 			    msg_length,
 			    1,	/* pixel scoreboard */
 			    response_length, 
-			    eot);
+			    eot,
+			    0 /* send_commit_msg */);
 }
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_gs_emit.c b/src/mesa/drivers/dri/i965/brw_gs_emit.c
index 99a6f6be113..a01d5576f8c 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_emit.c
@@ -34,7 +34,7 @@
 #include "main/macros.h"
 #include "main/enums.h"
 
-#include "shader/program.h"
+#include "program/program.h"
 #include "intel_batchbuffer.h"
 
 #include "brw_defines.h"
diff --git a/src/mesa/drivers/dri/i965/brw_optimize.c b/src/mesa/drivers/dri/i965/brw_optimize.c
index e79b3ddea35..8aa6fb6cc6f 100644
--- a/src/mesa/drivers/dri/i965/brw_optimize.c
+++ b/src/mesa/drivers/dri/i965/brw_optimize.c
@@ -26,12 +26,600 @@
  */
 
 #include "main/macros.h"
-#include "shader/program.h"
-#include "shader/prog_print.h"
+#include "program/program.h"
+#include "program/prog_print.h"
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "brw_eu.h"
 
+static const struct {
+    const char *name;   /* mnemonic of the opcode (points to a literal) */
+    int nsrc;           /* number of source operands that get examined */
+    int ndst;           /* number of destinations; 0 means nothing written */
+    GLboolean is_arith; /* flag returned by brw_is_arithmetic_inst() */
+} inst_opcode[128] = {
+    [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
+
+    [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_PLN] = { .name = "pln", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_MATH] = { .name = "math", .nsrc = 2, .ndst = 1 },
+
+    [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
+
+    [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
+    [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
+    [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 },
+};
+
+/* Look up the is_arith flag of inst's opcode in the inst_opcode[] table. */
+static INLINE GLboolean brw_is_arithmetic_inst(const struct brw_instruction *inst)
+{
+   return inst_opcode[inst->header.opcode].is_arith;
+}
+
+/* Region stride decoding, indexed by the raw horiz/vert stride field of an
+ * instruction: entry 0 is stride 0 and entry i (i >= 1) is 1 << (i - 1).
+ */
+static const GLuint inst_stride[7] = {
+    0,           /* [0] */
+    1, 2,        /* [1], [2] */
+    4, 8,        /* [3], [4] */
+    16, 32       /* [5], [6] */
+};
+
+static const GLuint inst_type_size[8] = { /* element size, by register type */
+    [BRW_REGISTER_TYPE_UD] = 4,  /* presumably bytes per element -- confirm */
+    [BRW_REGISTER_TYPE_D] = 4,
+    [BRW_REGISTER_TYPE_UW] = 2,
+    [BRW_REGISTER_TYPE_W] = 2,
+    [BRW_REGISTER_TYPE_UB] = 1,
+    [BRW_REGISTER_TYPE_B] = 1,
+    [BRW_REGISTER_TYPE_F] = 4    /* any type not listed here keeps size 0 */
+};
+
+/* Return GL_TRUE if inst may write any byte of the GRF range that starts at
+ * register reg_index and spans size bytes.  gen selects the SEND descriptor
+ * layout.  An indirectly addressed GRF destination always counts as a write.
+ */
+static INLINE GLboolean
+brw_is_grf_written(const struct brw_instruction *inst,
+                   int reg_index, int size, int gen)
+{
+   if (inst_opcode[inst->header.opcode].ndst == 0)
+      return GL_FALSE;
+
+   if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
+      if (inst->bits1.ia1.dest_reg_file == BRW_GENERAL_REGISTER_FILE)
+         return GL_TRUE;
+
+   if (inst->bits1.da1.dest_reg_file != BRW_GENERAL_REGISTER_FILE)
+      return GL_FALSE;
+
+   const int reg_start = reg_index * REG_SIZE;
+   const int reg_end = reg_start + size;
+   const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
+   const int write_start = inst->bits1.da1.dest_reg_nr*REG_SIZE
+                         + inst->bits1.da1.dest_subreg_nr;
+   int length;
+
+   if (inst->header.opcode == BRW_OPCODE_SEND) {
+      /* SEND is specific: the response length is counted in registers */
+      if (gen >= 5)
+         length = inst->bits3.generic_gen5.response_length*REG_SIZE;
+      else
+         length = inst->bits3.generic.response_length*REG_SIZE;
+   }
+   else {
+      /* The stride field is encoded (3 means 4 elements): decode it
+       * through inst_stride[] like brw_is_grf_read() does. */
+      length = (1 << inst->header.execution_size) * type_size
+             * inst_stride[inst->bits1.da1.dest_horiz_stride];
+   }
+
+   /* If the two intervals intersect, we overwrite the register */
+   const int write_end = write_start + length;
+   return MAX2(write_start, reg_start) < MIN2(write_end, reg_end);
+}
+
+/* Return GL_TRUE if inst may write the MRF range starting at message register
+ * reg_index and spanning size bytes.  Specific path for message register
+ * since we need to handle the compr4 case. */
+static INLINE GLboolean
+brw_is_mrf_written(const struct brw_instruction *inst, int reg_index, int size)
+{
+   if (inst_opcode[inst->header.opcode].ndst == 0)
+      return GL_FALSE;
+
+   if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
+      if (inst->bits1.ia1.dest_reg_file == BRW_MESSAGE_REGISTER_FILE)
+         return GL_TRUE;
+
+   if (inst->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE)
+      return GL_FALSE;
+
+   const int reg_start = reg_index * REG_SIZE;
+   const int reg_end = reg_start + size;
+
+   const int mrf_index = inst->bits1.da1.dest_reg_nr & 0x0f;
+   const int is_compr4 = inst->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4;
+   const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
+   /* The stride field is encoded (3 means 4 elements): decode it through
+    * inst_stride[] like brw_is_grf_read() does. */
+   const int elem_pitch = type_size
+                        * inst_stride[inst->bits1.da1.dest_horiz_stride];
+
+   /* We use compr4 with a size != 16 elements. Strange, we conservatively
+    * consider that we are writing the register.
+    */
+   if (is_compr4 && inst->header.execution_size != BRW_EXECUTE_16)
+      return GL_TRUE;
+
+   GLboolean is_written = GL_FALSE;
+
+   /* Here we write mrf_{i} and mrf_{i+4}. So we write two times 8 elements */
+   if (is_compr4) {
+      const int length = 8 * elem_pitch;
+
+      /* First 8-way register */
+      const int write_start0 = mrf_index*REG_SIZE
+                             + inst->bits1.da1.dest_subreg_nr;
+      const int write_end0 = write_start0 + length;
+
+      /* Second 8-way register */
+      const int write_start1 = (mrf_index+4)*REG_SIZE
+                             + inst->bits1.da1.dest_subreg_nr;
+      const int write_end1 = write_start1 + length;
+
+      /* If the two intervals intersect, we overwrite the register */
+      const int left0 = MAX2(write_start0, reg_start);
+      const int right0 = MIN2(write_end0, reg_end);
+      const int left1 = MAX2(write_start1, reg_start);
+      const int right1 = MIN2(write_end1, reg_end);
+      is_written = left0 < right0 || left1 < right1;
+   }
+   else {
+      const int length = (1 << inst->header.execution_size) * elem_pitch;
+
+      /* If the two intervals intersect, we write into the register */
+      const int write_start = inst->bits1.da1.dest_reg_nr*REG_SIZE
+                            + inst->bits1.da1.dest_subreg_nr;
+      const int write_end = write_start + length;
+      const int left = MAX2(write_start, reg_start);
+      const int right = MIN2(write_end, reg_end);
+      is_written = left < right;
+   }
+
+   /* SEND may perform an implicit mov to a mrf register */
+   if (is_written == GL_FALSE &&
+       inst->header.opcode == BRW_OPCODE_SEND &&
+       inst->bits1.da1.src0_reg_file != 0) {
+      const int mrf_start = inst->header.destreg__conditionalmod;
+      const int write_start = mrf_start * REG_SIZE;
+      const int write_end = write_start + REG_SIZE;
+      const int left = MAX2(write_start, reg_start);
+      const int right = MIN2(write_end, reg_end);
+      is_written = left < right;
+   }
+
+   return is_written;
+}
+
+/* Return GL_TRUE if inst may read the MRF range starting at message register
+ * reg_index and spanning size bytes.  Only SEND is considered: its payload is
+ * taken to be the msg_length registers beginning at the register number held
+ * in header.destreg__conditionalmod.  gen selects the descriptor layout.
+ */
+static INLINE GLboolean
+brw_is_mrf_read(const struct brw_instruction *inst,
+                int reg_index, int size, int gen)
+{
+   if (inst->header.opcode != BRW_OPCODE_SEND)
+      return GL_FALSE;
+   if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT)
+      return GL_TRUE;
+
+   const int reg_start = reg_index*REG_SIZE;
+   const int reg_end = reg_start + size;
+
+   int length;
+   if (gen >= 5)
+      length = inst->bits3.generic_gen5.msg_length*REG_SIZE;
+   else
+      length = inst->bits3.generic.msg_length*REG_SIZE;
+
+   /* Treat the whole payload as read.  The previous code tried to skip the
+    * register filled by SEND's implicit mov, but it tested the opposite
+    * condition from brw_is_mrf_written() and shortened the range by one
+    * byte instead of one register, so a real read of the first payload
+    * register could go unnoticed.  Over-reporting is harmless for the
+    * caller in this file: it only keeps a mov it might have removed.
+    */
+   const int read_start = inst->header.destreg__conditionalmod * REG_SIZE;
+   const int read_end = read_start + length;
+
+   return MAX2(read_start, reg_start) < MIN2(read_end, reg_end);
+}
+
+/* Helper for brw_is_grf_read(): GL_TRUE if an element of one directly
+ * addressed source region overlaps the bytes [reg_start, reg_end).
+ * first_byte is where element 0 starts; the log2 and stride arguments are
+ * the raw instruction fields.  We are a bit too conservative here since we
+ * do not take into account that some complete registers may be skipped.
+ */
+static INLINE GLboolean
+brw_grf_region_is_read(int first_byte, int type_size, int exec_size_log2,
+                       int width_log2, int hstride, int vstride,
+                       int reg_start, int reg_end)
+{
+   const int width = 1 << width_log2;
+   const int row_num = (1 << exec_size_log2) >> width_log2;
+   const int hs = type_size*inst_stride[hstride];
+   const int vs = type_size*inst_stride[vstride];
+   int i, j;
+
+   for (j = 0; j < row_num; ++j) {
+      int read_start = first_byte + j*vs;
+      for (i = 0; i < width; ++i) {
+         /* See if this element intersects the interval */
+         const int read_end = read_start + type_size;
+         if (MAX2(read_start, reg_start) < MIN2(read_end, reg_end))
+            return GL_TRUE;
+         read_start += hs;
+      }
+   }
+
+   return GL_FALSE;
+}
+
+/* Return GL_TRUE if inst may read, through src0 or src1, the GRF range
+ * starting at register reg_index and spanning size bytes.  An indirectly
+ * addressed GRF source always counts as a read.
+ *
+ * A source that is not a GRF is skipped.  Returning GL_FALSE as soon as src0
+ * was not a GRF, as was done before, hid the GRF reads made through src1.
+ */
+static INLINE GLboolean
+brw_is_grf_read(const struct brw_instruction *inst, int reg_index, int size)
+{
+   const int nsrc = inst_opcode[inst->header.opcode].nsrc;
+   const int reg_start = reg_index*REG_SIZE;
+   const int reg_end = reg_start + size;
+
+   /* First source */
+   if (nsrc >= 1) {
+      if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT) {
+         if (inst->bits1.ia1.src0_reg_file == BRW_GENERAL_REGISTER_FILE)
+            return GL_TRUE;
+      }
+      else if (inst->bits1.da1.src0_reg_file == BRW_GENERAL_REGISTER_FILE &&
+               brw_grf_region_is_read(inst->bits2.da1.src0_reg_nr*REG_SIZE
+                  + inst->bits2.da1.src0_subreg_nr,
+                  inst_type_size[inst->bits1.da1.src0_reg_type],
+                  inst->header.execution_size,
+                  inst->bits2.da1.src0_width,
+                  inst->bits2.da1.src0_horiz_stride,
+                  inst->bits2.da1.src0_vert_stride,
+                  reg_start, reg_end))
+         return GL_TRUE;
+   }
+
+   /* Second source */
+   if (nsrc >= 2) {
+      if (inst->bits3.da1.src1_address_mode != BRW_ADDRESS_DIRECT) {
+         if (inst->bits1.ia1.src1_reg_file == BRW_GENERAL_REGISTER_FILE)
+            return GL_TRUE;
+      }
+      else if (inst->bits1.da1.src1_reg_file == BRW_GENERAL_REGISTER_FILE &&
+               brw_grf_region_is_read(inst->bits3.da1.src1_reg_nr*REG_SIZE
+                  + inst->bits3.da1.src1_subreg_nr,
+                  inst_type_size[inst->bits1.da1.src1_reg_type],
+                  inst->header.execution_size,
+                  inst->bits3.da1.src1_width,
+                  inst->bits3.da1.src1_horiz_stride,
+                  inst->bits3.da1.src1_vert_stride,
+                  reg_start, reg_end))
+         return GL_TRUE;
+   }
+
+   return GL_FALSE;
+}
+
+/* GL_TRUE when any of the dependency, thread, mask, saturate or debug
+ * control fields of the instruction header is non-zero. */
+static INLINE GLboolean
+brw_is_control_done(const struct brw_instruction *mov)
+{
+   return (mov->header.dependency_control | mov->header.thread_control |
+           mov->header.mask_control | mov->header.saturate |
+           mov->header.debug_control) != 0;
+}
+
+/* Tell whether the instruction carries any predicate control, i.e. whether
+ * the predicate_control field of its header is non-zero. */
+static INLINE GLboolean brw_is_predicated(const struct brw_instruction *mov)
+{ return mov->header.predicate_control ? GL_TRUE : GL_FALSE; }
+
+/* GL_TRUE if mov is an unpredicated MOV of a whole float GRF (direct, width 8,
+ * strides 8/1, no modifier) to a float MRF; outputs are set only on success. */
+static INLINE GLboolean
+brw_is_grf_to_mrf_mov(const struct brw_instruction *mov,
+                      int *mrf_index, int *grf_index, GLboolean *is_compr4)
+{
+   /* The operand tests below fit other opcodes too, so check the opcode. */
+   if (mov->header.opcode != BRW_OPCODE_MOV ||
+       brw_is_predicated(mov) || brw_is_control_done(mov))
+      return GL_FALSE;
+
+   if (mov->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT ||
+       mov->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE ||
+       mov->bits1.da1.dest_reg_type != BRW_REGISTER_TYPE_F ||
+       mov->bits1.da1.dest_horiz_stride != BRW_HORIZONTAL_STRIDE_1 ||
+       mov->bits1.da1.dest_subreg_nr != 0)
+      return GL_FALSE;
+
+   if (mov->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT ||
+       mov->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE ||
+       mov->bits1.da1.src0_reg_type != BRW_REGISTER_TYPE_F ||
+       mov->bits2.da1.src0_width != BRW_WIDTH_8 ||
+       mov->bits2.da1.src0_horiz_stride != BRW_HORIZONTAL_STRIDE_1 ||
+       mov->bits2.da1.src0_vert_stride != BRW_VERTICAL_STRIDE_8 ||
+       mov->bits2.da1.src0_subreg_nr != 0 ||
+       mov->bits2.da1.src0_abs != 0 ||
+       mov->bits2.da1.src0_negate != 0)
+      return GL_FALSE;
+
+   *grf_index = mov->bits2.da1.src0_reg_nr;
+   *mrf_index = mov->bits1.da1.dest_reg_nr & 0x0f;
+   *is_compr4 = (mov->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4) != 0;
+   return GL_TRUE;
+}
+
+/* GL_TRUE if inst is an arithmetic instruction that directly writes float
+ * GRF grf_index: Align1, direct, stride 1, subreg 0, execution_size field 4. */
+static INLINE GLboolean
+brw_is_grf_straight_write(const struct brw_instruction *inst, int grf_index)
+{
+   /* remark: no problem to predicate a SEL instruction */
+   if (brw_is_predicated(inst) && inst->header.opcode != BRW_OPCODE_SEL)
+      return GL_FALSE;
+   if (brw_is_control_done(inst) || !brw_is_arithmetic_inst(inst))
+      return GL_FALSE;
+   return inst->header.execution_size == 4 &&
+          inst->header.access_mode == BRW_ALIGN_1 &&
+          inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT &&
+          inst->bits1.da1.dest_reg_file == BRW_GENERAL_REGISTER_FILE &&
+          inst->bits1.da1.dest_reg_type == BRW_REGISTER_TYPE_F &&
+          inst->bits1.da1.dest_horiz_stride == BRW_HORIZONTAL_STRIDE_1 &&
+          inst->bits1.da1.dest_reg_nr == grf_index &&
+          inst->bits1.da1.dest_subreg_nr == 0;
+}
+
+/* GL_TRUE when both instructions match in all four 32-bit words. */
+static INLINE GLboolean
+brw_inst_are_equal(const struct brw_instruction *src0,
+                   const struct brw_instruction *src1)
+{
+   const GLuint *a = (const GLuint *) src0, *b = (const GLuint *) src1;
+   int i;
+   for (i = 0; i < 4 && a[i] == b[i]; ++i)
+      ;
+   return i == 4;
+}
+
+/* Copy the four 32-bit words of instruction *src over *dst. */
+static INLINE void
+brw_inst_copy(struct brw_instruction *dst,
+              const struct brw_instruction *src)
+{
+   const GLuint *from = (const GLuint *) src;
+   GLuint *to = (GLuint *) dst;
+   int i;
+   for (i = 0; i < 4; ++i)
+      to[i] = from[i];
+}
+
+/* Compact p->store in place: instructions whose removeInst[] entry is set
+ * are dropped, the others keep their relative order, and p->nr_insn is
+ * updated.  removeInst must provide one entry per instruction.
+ */
+static void brw_remove_inst(struct brw_compile *p, const GLboolean *removeInst)
+{
+   int to = 0, from;
+   for (from = 0; from < p->nr_insn; ++from) {
+      if (removeInst[from])
+         continue;
+      if (to != from)
+         brw_inst_copy(p->store + to, p->store + from);
+      to++;
+   }
+   /* 'to' is the number of instructions kept: no need to count again. */
+   p->nr_insn = to;
+}
+
+/* The gen code emitter generates a lot of duplications in the
+ * grf-to-mrf moves, for example when texture sampling with the same
+ * coordinates from multiple textures.  Here, we monitor identical
+ * grf-to-mrf mov instructions and remove repeated ones where the operands
+ * and dst haven't changed in between.
+ */
+void brw_remove_duplicate_mrf_moves(struct brw_compile *p)
+{
+   const int gen = p->brw->intel.gen;
+   int i, j;
+
+   GLboolean *removeInst = calloc(p->nr_insn, sizeof(GLboolean));
+   if (removeInst == NULL)
+      return;   /* nothing to mark with: leave the program untouched */
+
+   for (i = 0; i < p->nr_insn; i++) {
+      if (removeInst[i])
+         continue;
+      const struct brw_instruction *mov = p->store + i;
+      int mrf_index, grf_index;
+      GLboolean is_compr4;
+
+      /* Only consider _straight_ grf-to-mrf moves */
+      if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
+         continue;
+
+      const int mrf_index0 = mrf_index;
+      const int mrf_index1 = is_compr4 ? mrf_index0+4 : mrf_index0+1;
+      const int simd16_size = 2 * REG_SIZE;
+
+      for (j = i + 1; j < p->nr_insn; j++) {
+         const struct brw_instruction *inst = p->store + j;
+         if (brw_inst_are_equal(mov, inst)) {
+            removeInst[j] = GL_TRUE;
+            continue;
+         }
+         if (brw_is_grf_written(inst, grf_index, simd16_size, gen) ||
+             brw_is_mrf_written(inst, mrf_index0, REG_SIZE) ||
+             brw_is_mrf_written(inst, mrf_index1, REG_SIZE))
+            break;
+      }
+   }
+
+   brw_remove_inst(p, removeInst);
+   free(removeInst);
+}
+
+/* Replace moves to MRFs where the value moved is the result of a
+ * normal arithmetic operation with computation right into the MRF.
+ */
+void brw_remove_grf_to_mrf_moves(struct brw_compile *p)
+{
+   int i, j, prev;
+   struct brw_context *brw = p->brw;
+   const int gen = brw->intel.gen;
+   const int simd16_size = 2*REG_SIZE;
+
+   GLboolean *removeInst = calloc(p->nr_insn, sizeof(GLboolean));
+   /* assert() would vanish under NDEBUG; just skip the pass on failure */
+   if (removeInst == NULL)
+      return;
+
+   for (i = 0; i < p->nr_insn; i++) {
+      if (removeInst[i])
+         continue;
+
+      struct brw_instruction *grf_inst = NULL;
+      const struct brw_instruction *mov = p->store + i;
+      int mrf_index, grf_index;
+      GLboolean is_compr4;
+
+      /* Only consider _straight_ grf-to-mrf moves */
+      if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
+         continue;
+
+      /* Using compr4 enables a stride of 4 for this instruction */
+      const int mrf_index0 = mrf_index;
+      const int mrf_index1 = is_compr4 ? mrf_index+4 : mrf_index+1;
+
+      /* Look where the register has been set */
+      prev = i;
+      GLboolean potential_remove = GL_FALSE;
+      while (prev--) {
+         /* If _one_ instruction writes the grf, we try to remove the mov */
+         struct brw_instruction *inst = p->store + prev;
+         if (brw_is_grf_straight_write(inst, grf_index)) {
+            potential_remove = GL_TRUE;
+            grf_inst = inst;
+            break;
+         }
+      }
+
+      if (potential_remove == GL_FALSE)
+         continue;
+      removeInst[i] = GL_TRUE;
+
+      /* Monitor first the section of code between the grf computation and the
+       * mov. Here we cannot read or write both mrf and grf register
+       */
+      for (j = prev + 1; j < i; ++j) {
+         struct brw_instruction *inst = p->store + j;
+         if (removeInst[j])
+            continue;
+         if (brw_is_grf_written(inst, grf_index, simd16_size, gen)   ||
+             brw_is_grf_read(inst, grf_index, simd16_size)           ||
+             brw_is_mrf_written(inst, mrf_index0, REG_SIZE)   ||
+             brw_is_mrf_written(inst, mrf_index1, REG_SIZE)   ||
+             brw_is_mrf_read(inst, mrf_index0, REG_SIZE, gen) ||
+             brw_is_mrf_read(inst, mrf_index1, REG_SIZE, gen)) {
+            removeInst[i] = GL_FALSE;
+            break;
+         }
+      }
+
+      /* After the mov, we can read or write the mrf. If the grf is overwritten,
+       * we are done
+       */
+      for (j = i + 1; j < p->nr_insn; ++j) {
+         struct brw_instruction *inst = p->store + j;
+         if (removeInst[j])
+            continue;
+
+         if (brw_is_grf_read(inst, grf_index, simd16_size)) {
+            removeInst[i] = GL_FALSE;
+            break;
+         }
+
+         if (brw_is_grf_straight_write(inst, grf_index))
+            break;
+      }
+
+      /* Note that with the top down traversal, we can safely patch the
+       * instruction that computed the grf so that it writes the mrf instead
+       */
+      if (removeInst[i]) {
+         grf_inst->bits1.da1.dest_reg_file = mov->bits1.da1.dest_reg_file;
+         grf_inst->bits1.da1.dest_reg_nr = mov->bits1.da1.dest_reg_nr;
+      }
+   }
+
+   brw_remove_inst(p, removeInst);
+   free(removeInst);
+}
+
 static GLboolean
 is_single_channel_dp4(struct brw_instruction *insn)
 {
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index bd560acdadf..4b08d2599bc 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -31,10 +31,10 @@
   
 #include "main/imports.h"
 #include "main/enums.h"
-#include "shader/prog_parameter.h"
-#include "shader/program.h"
-#include "shader/programopt.h"
-#include "shader/shader_api.h"
+#include "main/shaderobj.h"
+#include "program/prog_parameter.h"
+#include "program/program.h"
+#include "program/programopt.h"
 #include "tnl/tnl.h"
 
 #include "brw_context.h"
@@ -174,9 +174,36 @@ static GLboolean brwProgramStringNotify( GLcontext *ctx,
 	 shader_error(ctx, prog,
 		      "i965 driver doesn't yet support uninlined function "
 		      "calls.  Move to using a single return statement at "
-		      "the end of the function to work around it.");
+		      "the end of the function to work around it.\n");
 	 return GL_FALSE;
       }
+      if (prog->Instructions[i].DstReg.RelAddr &&
+	  prog->Instructions[i].DstReg.File == PROGRAM_INPUT) {
+	 shader_error(ctx, prog,
+		      "Variable indexing of shader inputs unsupported\n");
+	 return GL_FALSE;
+      }
+      if (prog->Instructions[i].DstReg.RelAddr &&
+	  prog->Instructions[i].DstReg.File == PROGRAM_OUTPUT) {
+	 shader_error(ctx, prog,
+		      "Variable indexing of shader outputs unsupported\n");
+	 return GL_FALSE;
+      }
+      if (target == GL_FRAGMENT_PROGRAM_ARB) {
+	 if ((prog->Instructions[i].DstReg.RelAddr &&
+	      prog->Instructions[i].DstReg.File == PROGRAM_TEMPORARY) ||
+	     (prog->Instructions[i].SrcReg[0].RelAddr &&
+	      prog->Instructions[i].SrcReg[0].File == PROGRAM_TEMPORARY) ||
+	     (prog->Instructions[i].SrcReg[1].RelAddr &&
+	      prog->Instructions[i].SrcReg[1].File == PROGRAM_TEMPORARY) ||
+	     (prog->Instructions[i].SrcReg[2].RelAddr &&
+	      prog->Instructions[i].SrcReg[2].File == PROGRAM_TEMPORARY)) {
+	    shader_error(ctx, prog,
+			 "Variable indexing of variable arrays in the FS "
+			 "unsupported\n");
+	    return GL_FALSE;
+	 }
+      }
    }
 
    return GL_TRUE;
diff --git a/src/mesa/drivers/dri/i965/brw_sf.h b/src/mesa/drivers/dri/i965/brw_sf.h
index a0680a56f2c..e525c730d3f 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.h
+++ b/src/mesa/drivers/dri/i965/brw_sf.h
@@ -34,7 +34,7 @@
 #define BRW_SF_H
 
 
-#include "shader/program.h"
+#include "program/program.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index e290ca92f60..914f275cc67 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -130,7 +130,7 @@ struct brw_sf_unit_key {
    unsigned scissor:1;
    unsigned line_smooth:1;
    unsigned point_sprite:1;
-   unsigned point_attenuated:1;
+   unsigned use_vs_point_size:1;
    unsigned render_to_fbo:1;
    float line_width;
    float point_size;
@@ -164,7 +164,8 @@ sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key)
 
    key->point_sprite = ctx->Point.PointSprite;
    key->point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
-   key->point_attenuated = ctx->Point._Attenuated;
+   key->use_vs_point_size = (ctx->VertexProgram.PointSizeEnabled ||
+			     ctx->Point._Attenuated);
 
    /* _NEW_LIGHT */
    key->pv_first = (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION);
@@ -296,7 +297,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
    /* _NEW_POINT */
    sf.sf7.sprite_point = key->point_sprite;
    sf.sf7.point_size = CLAMP(rint(key->point_size), 1, 255) * (1<<3);
-   sf.sf7.use_point_size_state = !key->point_attenuated;
+   sf.sf7.use_point_size_state = !key->use_vs_point_size;
    sf.sf7.aa_line_distance_mode = 0;
 
    /* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons:
diff --git a/src/mesa/drivers/dri/i965/brw_structs.h b/src/mesa/drivers/dri/i965/brw_structs.h
index 2a7fa5b6997..2fde42a7060 100644
--- a/src/mesa/drivers/dri/i965/brw_structs.h
+++ b/src/mesa/drivers/dri/i965/brw_structs.h
@@ -1657,8 +1657,36 @@ struct brw_instruction
 	 GLuint end_of_thread:1;
       } dp_write_gen5;
 
+      /* Sandybridge DP for sample cache, constant cache, render cache */
       struct {
-	 GLuint pad:16;
+	 GLuint binding_table_index:8;
+	 GLuint msg_control:5;
+	 GLuint msg_type:3;
+	 GLuint pad0:3;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } dp_sampler_const_cache;
+
+      struct {
+	 GLuint binding_table_index:8;
+	 GLuint msg_control:3;
+	 GLuint slot_group_select:1;
+	 GLuint pixel_scoreboard_clear:1;
+	 GLuint msg_type:4;
+	 GLuint send_commit_msg:1;
+	 GLuint pad0:1;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } dp_render_cache;
+
+      struct {
+	 GLuint function_control:16;
 	 GLuint response_length:4;
 	 GLuint msg_length:4;
 	 GLuint msg_target:4;
@@ -1666,8 +1694,9 @@ struct brw_instruction
 	 GLuint end_of_thread:1;
       } generic;
 
+      /* Of this struct, only end_of_thread is not present for gen6. */
       struct {
-	 GLuint pad:19;
+	 GLuint function_control:19;
 	 GLuint header_present:1;
 	 GLuint response_length:5;
 	 GLuint msg_length:4;
diff --git a/src/mesa/drivers/dri/i965/brw_util.c b/src/mesa/drivers/dri/i965/brw_util.c
index bba9249d1b4..1db2a210d45 100644
--- a/src/mesa/drivers/dri/i965/brw_util.c
+++ b/src/mesa/drivers/dri/i965/brw_util.c
@@ -31,7 +31,7 @@
          
 
 #include "main/mtypes.h"
-#include "shader/prog_parameter.h"
+#include "program/prog_parameter.h"
 #include "brw_util.h"
 #include "brw_defines.h"
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 3c12f11ea78..9a832af9a97 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -34,8 +34,8 @@
 #include "brw_vs.h"
 #include "brw_util.h"
 #include "brw_state.h"
-#include "shader/prog_print.h"
-#include "shader/prog_parameter.h"
+#include "program/prog_print.h"
+#include "program/prog_parameter.h"
 
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index 6493744f3eb..9338a6b7dbf 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -36,7 +36,7 @@
 
 #include "brw_context.h"
 #include "brw_eu.h"
-#include "shader/program.h"
+#include "program/program.h"
 
 
 struct brw_vs_prog_key {
diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
index 128987d78a6..c1d6525e9b7 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@@ -31,9 +31,9 @@
             
 
 #include "main/macros.h"
-#include "shader/program.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
+#include "program/program.h"
+#include "program/prog_parameter.h"
+#include "program/prog_print.h"
 #include "brw_context.h"
 #include "brw_vs.h"
 
@@ -44,6 +44,7 @@ static GLboolean
 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
 {
    int opcode_array[] = {
+      [OPCODE_MOV] = 1,
       [OPCODE_ADD] = 2,
       [OPCODE_CMP] = 3,
       [OPCODE_DP3] = 2,
@@ -218,7 +219,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    c->first_overflow_output = 0;
 
    if (intel->gen >= 6)
-      mrf = 6;
+      mrf = 4;
    else if (intel->gen == 5)
       mrf = 8;
    else
@@ -238,12 +239,25 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 	    mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
 	 }
 	 else {
-            if (mrf < 16) {
+	    /* Two restrictions on our compute-to-MRF here.  The
+	     * message length for all SEND messages is restricted to
+	     * [1,15], so we can't use mrf 15, as that means a length
+	     * of 16.
+	     *
+	     * Additionally, URB writes are aligned to URB rows, so we
+	     * need to put an even number of registers of URB data in
+	     * each URB write so that the later write is aligned.  A
+	     * message length of 15 means 1 message header reg plus 14
+	     * regs of URB data.
+	     *
+	     * For attributes beyond the compute-to-MRF, we compute to
+	     * GRFs and they will be written in the second URB_WRITE.
+	     */
+            if (mrf < 15) {
                c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
                mrf++;
             }
             else {
-               /* too many vertex results to fit in MRF, use GRF for overflow */
                if (!c->first_overflow_output)
                   c->first_overflow_output = i;
                c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
@@ -318,8 +332,11 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
     */
    attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
 
+   /* See emit_vertex_write() for where the VUE's overhead on top of the
+    * attributes comes from.
+    */
    if (intel->gen >= 6)
-      c->prog_data.urb_entry_size = (attributes_in_vue + 4 + 7) / 8;
+      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 7) / 8;
    else if (intel->gen == 5)
       c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
    else
@@ -869,8 +886,6 @@ get_constant(struct brw_vs_compile *c,
    assert(argIndex < 3);
 
    if (c->current_const[argIndex].index != src->Index) {
-      struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
-
       /* Keep track of the last constant loaded in this slot, for reuse. */
       c->current_const[argIndex].index = src->Index;
 
@@ -881,9 +896,6 @@ get_constant(struct brw_vs_compile *c,
       /* need to fetch the constant now */
       brw_dp_READ_4_vs(p,
                        const_reg,                     /* writeback dest */
-                       0,                             /* oword */
-                       0,                             /* relative indexing? */
-                       addrReg,                       /* address register */
                        16 * src->Index,               /* byte offset */
                        SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                        );
@@ -904,8 +916,8 @@ get_reladdr_constant(struct brw_vs_compile *c,
    const struct prog_src_register *src = &inst->SrcReg[argIndex];
    struct brw_compile *p = &c->func;
    struct brw_reg const_reg = c->current_const[argIndex].reg;
-   struct brw_reg const2_reg;
    struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
+   struct brw_reg byte_addr_reg = get_tmp(c);
 
    assert(argIndex < 3);
 
@@ -917,37 +929,15 @@ get_reladdr_constant(struct brw_vs_compile *c,
 	  src->Index, argIndex, c->current_const[argIndex].reg.nr);
 #endif
 
+   brw_MUL(p, byte_addr_reg, addrReg, brw_imm_ud(16));
+
    /* fetch the first vec4 */
-   brw_dp_READ_4_vs(p,
-		    const_reg,                     /* writeback dest */
-		    0,                             /* oword */
-		    1,                             /* relative indexing? */
-		    addrReg,                       /* address register */
-		    16 * src->Index,               /* byte offset */
-		    SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
-		    );
-   /* second vec4 */
-   const2_reg = get_tmp(c);
-
-   /* use upper half of address reg for second read */
-   addrReg = stride(addrReg, 0, 4, 0);
-   addrReg.subnr = 16;
-
-   brw_dp_READ_4_vs(p,
-		    const2_reg,              /* writeback dest */
-		    1,                       /* oword */
-		    1,                       /* relative indexing? */
-		    addrReg,                 /* address register */
-		    16 * src->Index,         /* byte offset */
-		    SURF_INDEX_VERT_CONST_BUFFER
-		    );
-
-   /* merge the two Owords into the constant register */
-   /* const_reg[7..4] = const2_reg[7..4] */
-   brw_MOV(p,
-	   suboffset(stride(const_reg, 0, 4, 1), 4),
-	   suboffset(stride(const2_reg, 0, 4, 1), 4));
-   release_tmp(c, const2_reg);
+   brw_dp_READ_4_vs_relative(p,
+			     const_reg,                     /* writeback dest */
+			     byte_addr_reg,                 /* address register */
+			     16 * src->Index,               /* byte offset */
+			     SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
+			     );
 
    return const_reg;
 }
@@ -993,36 +983,71 @@ static struct brw_reg get_reg( struct brw_vs_compile *c,
  */
 static struct brw_reg deref( struct brw_vs_compile *c,
 			     struct brw_reg arg,
-			     GLint offset)
+			     GLint offset,
+			     GLuint reg_size )	/* bytes per element of the indexed array; callers pass 32 or 16 */
 {
    struct brw_compile *p = &c->func;
-   struct brw_reg tmp = vec4(get_tmp(c));
+   struct brw_reg tmp = get_tmp(c);
    struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
-   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
-   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
+   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
+   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size; /* constant part: base register plus 'offset' elements */
    struct brw_reg indirect = brw_vec4_indirect(0,0);
+   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW); /* scratch for the scaled address; like tmp, not released here */
+
+   /* Set the vertical stride on the register access so that the first
+    * 4 components come from a0.0 and the second 4 from a0.1.
+    */
+   indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;
 
    {
       brw_push_insn_state(p);
       brw_set_access_mode(p, BRW_ALIGN_1);
 
-      /* This is pretty clunky - load the address register twice and
-       * fetch each 4-dword value in turn.  There must be a way to do
-       * this in a single pass, but I couldn't get it to work.
-       */
-      brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
-      brw_MOV(p, tmp, indirect);
+      brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size)); /* OPCODE_ARL stores the unscaled index, so scale to bytes here */
+      brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset)); /* address reg 0: source of the first 4 components */
+
+      brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size)); /* same scaling for the address value at suboffset 4 */
+      brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset)); /* address reg 1: source of the second 4 components */
 
-      brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
-      brw_MOV(p, suboffset(tmp, 4), indirect);
+      brw_MOV(p, tmp, indirect); /* single indirect fetch that uses both address regs set up above */
 
       brw_pop_insn_state(p);
    }
-   
+
    /* NOTE: tmp not released */
-   return vec8(tmp);
+   return tmp;
 }
 
+static void
+move_to_reladdr_dst(struct brw_vs_compile *c,
+		    const struct prog_instruction *inst,
+		    struct brw_reg val)	/* value to store at the relatively-addressed destination */
+{  /* Emits the moves that copy val into inst's relatively-addressed destination register. */
+   struct brw_compile *p = &c->func;
+   int reg_size = 32; /* bytes per destination register slot; same scale as the nr * 32 below */
+   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
+   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
+   struct brw_reg temp_base = c->regs[inst->DstReg.File][0]; /* register 0 of the destination's register file */
+   GLuint byte_offset = temp_base.nr * 32 + temp_base.subnr;
+   struct brw_reg indirect = brw_vec4_indirect(0,0);
+   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW); /* scratch; not released here, brw_vs_emit() calls release_tmps() after this */
+
+   byte_offset += inst->DstReg.Index * reg_size; /* add the instruction's constant destination index */
+
+   brw_push_insn_state(p);
+   brw_set_access_mode(p, BRW_ALIGN_1);
+
+   brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size)); /* acc = address value * reg_size */
+   brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset)); /* address reg 0 = acc + byte_offset */
+   brw_MOV(p, indirect, val); /* indirect store of val (presumably its first 4 channels, via address reg 0) */
+
+   brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size)); /* repeat with the address value at suboffset 4 */
+   brw_ADD(p, brw_address_reg(0), acc,
+	   brw_imm_uw(byte_offset + reg_size / 2)); /* second store lands reg_size / 2 bytes further on */
+   brw_MOV(p, indirect, suboffset(val, 4)); /* indirect store of val starting at suboffset 4 */
+
+   brw_pop_insn_state(p);
+}
 
 /**
  * Get brw reg corresponding to the instruction's [argIndex] src reg.
@@ -1091,7 +1116,7 @@ get_src_reg( struct brw_vs_compile *c,
    case PROGRAM_INPUT:
    case PROGRAM_OUTPUT:
       if (relAddr) {
-         return deref(c, c->regs[file][0], index);
+         return deref(c, c->regs[file][0], index, 32);
       }
       else {
          assert(c->regs[file][index].nr != 0);
@@ -1113,7 +1138,7 @@ get_src_reg( struct brw_vs_compile *c,
 	    return get_constant(c, inst, argIndex);
       }
       else if (relAddr) {
-         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
+         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index, 16);
       }
       else {
          assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
@@ -1134,26 +1159,6 @@ get_src_reg( struct brw_vs_compile *c,
    }
 }
 
-
-static void emit_arl( struct brw_vs_compile *c,
-		      struct brw_reg dst,
-		      struct brw_reg arg0 )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg tmp = dst;
-   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
-   
-   if (need_tmp) 
-      tmp = get_tmp(c);
-
-   brw_RNDD(p, tmp, arg0);               /* tmp = round(arg0) */
-   brw_MUL(p, dst, tmp, brw_imm_d(16));  /* dst = tmp * 16 */
-
-   if (need_tmp)
-      release_tmp(c, tmp);
-}
-
-
 /**
  * Return the brw reg for the given instruction's src argument.
  * Will return mangled results for SWZ op.  The emit_swz() function
@@ -1198,8 +1203,17 @@ static struct brw_reg get_dst( struct brw_vs_compile *c,
    switch (dst.File) {
    case PROGRAM_TEMPORARY:
    case PROGRAM_OUTPUT:
-      assert(c->regs[dst.File][dst.Index].nr != 0);
-      reg = c->regs[dst.File][dst.Index];
+      /* register-indirect addressing is only 1x1, not VxH, for
+       * destination regs.  So, for RelAddr we'll return a temporary
+       * for the dest and do a move of the result to the RelAddr
+       * register after the instruction emit.
+       */
+      if (dst.RelAddr) {
+	 reg = get_tmp(c);
+      } else {
+	 assert(c->regs[dst.File][dst.Index].nr != 0);
+	 reg = c->regs[dst.File][dst.Index];
+      }
       break;
    case PROGRAM_ADDRESS:
       assert(dst.Index == 0);
@@ -1298,7 +1312,6 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    struct brw_compile *p = &c->func;
    struct brw_context *brw = p->brw;
    struct intel_context *intel = &brw->intel;
-   struct brw_reg m0 = brw_message_reg(0);
    struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
    struct brw_reg ndc;
    int eot;
@@ -1381,16 +1394,19 @@ static void emit_vertex_write( struct brw_vs_compile *c)
     */
    brw_set_access_mode(p, BRW_ALIGN_1);
 
+   /* The VUE layout is documented in Volume 2a. */
    if (intel->gen >= 6) {
-      /* There are 16 DWs (D0-D15) in VUE header on Sandybridge:
+      /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
        * dword 0-3 (m1) of the header is indices, point width, clip flags.
        * dword 4-7 (m2) is the 4D space position
-       * dword 8-15 (m3,m4) of the vertex header is the user clip distance.
-       * m5 is the first vertex data we fill, which is the vertex position.
+       * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
+       * enabled.  We don't use it, so skip it.
+       * m3 is the first vertex element data we fill, which is the vertex
+       * position.
        */
-      brw_MOV(p, offset(m0, 2), pos);
-      brw_MOV(p, offset(m0, 5), pos);
-      len_vertex_header = 4;
+      brw_MOV(p, brw_message_reg(2), pos);
+      brw_MOV(p, brw_message_reg(3), pos);
+      len_vertex_header = 2;
    } else if (intel->gen == 5) {
       /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
        * dword 0-3 (m1) of the header is indices, point width, clip flags.
@@ -1400,9 +1416,9 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        * m6 is a pad so that the vertex element data is aligned
        * m7 is the first vertex data we fill, which is the vertex position.
        */
-      brw_MOV(p, offset(m0, 2), ndc);
-      brw_MOV(p, offset(m0, 3), pos);
-      brw_MOV(p, offset(m0, 7), pos);
+      brw_MOV(p, brw_message_reg(2), ndc);
+      brw_MOV(p, brw_message_reg(3), pos);
+      brw_MOV(p, brw_message_reg(7), pos);
       len_vertex_header = 6;
    } else {
       /* There are 8 dwords in VUE header pre-Ironlake:
@@ -1412,8 +1428,8 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        * dword 8-11 (m3) is the first vertex data, which we always have be the
        * vertex position.
        */
-      brw_MOV(p, offset(m0, 2), ndc);
-      brw_MOV(p, offset(m0, 3), pos);
+      brw_MOV(p, brw_message_reg(2), ndc);
+      brw_MOV(p, brw_message_reg(3), pos);
       len_vertex_header = 2;
    }
 
@@ -1437,29 +1453,26 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        * Move the overflowed attributes from the GRF to the MRF and
        * issue another brw_urb_WRITE().
        */
-      /* XXX I'm not 100% sure about which MRF regs to use here.  Starting
-       * at mrf[4] atm...
-       */
-      GLuint i, mrf = 0;
+      GLuint i, mrf = 1;
       for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
          if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
             /* move from GRF to MRF */
-            brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
+            brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]);
             mrf++;
          }
       }
 
       brw_urb_WRITE(p,
                     brw_null_reg(), /* dest */
-                    4,              /* starting mrf reg nr */
+                    0,              /* starting mrf reg nr */
                     c->r0,          /* src */
                     0,              /* allocate */
                     1,              /* used */
-                    mrf+1,          /* msg len */
+                    mrf,            /* msg len */
                     0,              /* response len */
                     1,              /* eot */
                     1,              /* writes complete */
-                    BRW_MAX_MRF-1,  /* urb destination offset */
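+                    14 / 2,  /* urb destination offset: past the 14 regs of URB data sent by the first write (see brw_vs_alloc_regs); presumably counted in 2-reg URB rows */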
                     BRW_URB_SWIZZLE_INTERLEAVE);
    }
 }
@@ -1665,7 +1678,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
       case OPCODE_ARL:
-	 emit_arl(c, dst, args[0]);
+	 brw_RNDD(p, dst, args[0]);
 	 break;
       case OPCODE_FLR:
 	 brw_RNDD(p, dst, args[0]);
@@ -1890,6 +1903,14 @@ void brw_vs_emit(struct brw_vs_compile *c )
          }
       }
 
+      if (inst->DstReg.RelAddr && inst->DstReg.File == PROGRAM_TEMPORARY) {
+	 /* We don't do RelAddr of PROGRAM_OUTPUT yet, because of the
+	  * compute-to-mrf and the fact that we are allocating registers
+	  * for only the used PROGRAM_OUTPUTs.  NOTE: get_dst() still returns
+	  * a tmp for that case, and no move out of it is emitted here. */
+	 move_to_reladdr_dst(c, inst, dst);
+      }
+
       release_tmps(c);
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
index be9e415cb07..0250a68d292 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
@@ -31,7 +31,7 @@
 
 #include "main/mtypes.h"
 #include "main/texstore.h"
-#include "shader/prog_parameter.h"
+#include "program/prog_parameter.h"
 
 #include "brw_context.h"
 #include "brw_state.h"
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index 197b8754345..40f51c21c95 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -34,7 +34,7 @@
 #define BRW_WM_H
 
 
-#include "shader/prog_instruction.h"
+#include "program/prog_instruction.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c
index a90a2d3cf25..0c625a4cd02 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -1326,7 +1326,7 @@ void emit_fb_write(struct brw_wm_compile *c,
 	  * + 1 for the second half we get destination + 4.
 	  */
 	 brw_MOV(p,
-		 brw_message_reg(nr + channel + (1 << 7)),
+		 brw_message_reg(nr + channel + BRW_MRF_COMPR4),
 		 arg0[channel]);
       } else {
 	 /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
@@ -1763,12 +1763,20 @@ void brw_wm_emit( struct brw_wm_compile *c )
 		      inst->dst[i]->spill_slot);
    }
 
+   /* Only properly tested on ILK */
+   if (p->brw->intel.gen == 5) {
+     brw_remove_duplicate_mrf_moves(p);
+     if (c->dispatch_width == 16)
+	brw_remove_grf_to_mrf_moves(p);
+   }
+
    if (INTEL_DEBUG & DEBUG_WM) {
       int i;
 
-      printf("wm-native:\n");
-      for (i = 0; i < p->nr_insn; i++)
+     printf("wm-native:\n");
+     for (i = 0; i < p->nr_insn; i++)
 	 brw_disasm(stderr, &p->store[i], p->brw->intel.gen);
       printf("\n");
    }
 }
+
diff --git a/src/mesa/drivers/dri/i965/brw_wm_fp.c b/src/mesa/drivers/dri/i965/brw_wm_fp.c
index d73c3915824..0bef874b887 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_fp.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_fp.c
@@ -37,9 +37,9 @@
 #include "brw_wm.h"
 #include "brw_util.h"
 
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-#include "shader/prog_statevars.h"
+#include "program/prog_parameter.h"
+#include "program/prog_print.h"
+#include "program/prog_statevars.h"
 
 
 /** An invalid texture target */
diff --git a/src/mesa/drivers/dri/i965/brw_wm_glsl.c b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
index 57be08a8d1d..2dd346d6dd1 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_glsl.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
@@ -1,7 +1,7 @@
 #include "main/macros.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-#include "shader/prog_optimize.h"
+#include "program/prog_parameter.h"
+#include "program/prog_print.h"
+#include "program/prog_optimize.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 #include "brw_wm.h"
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass0.c b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
index 60bd92ed223..05de85a957e 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass0.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
@@ -32,7 +32,7 @@
 
 #include "brw_context.h"
 #include "brw_wm.h"
-#include "shader/prog_parameter.h"
+#include "program/prog_parameter.h"
 
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index 1789b21451d..c1cf4db1cae 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -222,7 +222,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
       drm_intel_bo_emit_reloc(bo, offsetof(struct brw_wm_unit_state, thread2),
 			      brw->wm.scratch_bo,
 			      wm.thread2.per_thread_scratch_space,
-			      0, 0);
+			      I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
    }
 
    /* Emit sampler state relocation */
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 77898dbbe72..17b016b569b 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -32,7 +32,7 @@
 
 #include "main/mtypes.h"
 #include "main/texstore.h"
-#include "shader/prog_parameter.h"
+#include "program/prog_parameter.h"
 
 #include "intel_mipmap_tree.h"
 #include "intel_batchbuffer.h"
diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
index 51940efb443..6820ca3abf4 100644
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -69,7 +69,7 @@ upload_sf_state(struct brw_context *brw)
    dw1 =
       num_outputs << GEN6_SF_NUM_OUTPUTS_SHIFT |
       (num_inputs + 1) / 2 << GEN6_SF_URB_ENTRY_READ_LENGTH_SHIFT |
-      3 << GEN6_SF_URB_ENTRY_READ_OFFSET_SHIFT;
+      1 << GEN6_SF_URB_ENTRY_READ_OFFSET_SHIFT;
    dw2 = GEN6_SF_VIEWPORT_TRANSFORM_ENABLE |
       GEN6_SF_STATISTICS_ENABLE;
    dw3 = 0;
diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c
index 5916a139946..4080a9dedfd 100644
--- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
@@ -29,8 +29,8 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "brw_util.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_statevars.h"
+#include "program/prog_parameter.h"
+#include "program/prog_statevars.h"
 #include "intel_batchbuffer.h"
 
 static void
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index ed1a72f03ba..863c85449d9 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -29,8 +29,8 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "brw_util.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_statevars.h"
+#include "program/prog_parameter.h"
+#include "program/prog_statevars.h"
 #include "intel_batchbuffer.h"
 
 static void
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.c b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
index 698445c5268..ff741fc39ab 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
@@ -102,7 +102,7 @@ do_flush_locked(struct intel_batchbuffer *batch, GLuint used)
    if (INTEL_DEBUG & DEBUG_BATCH) {
       drm_intel_bo_map(batch->buf, GL_FALSE);
       intel_decode(batch->buf->virtual, used / 4, batch->buf->offset,
-		   intel->intelScreen->deviceID);
+		   intel->intelScreen->deviceID, GL_TRUE);
       drm_intel_bo_unmap(batch->buf);
 
       if (intel->vtbl.debug_batch != NULL)
diff --git a/src/mesa/drivers/dri/intel/intel_chipset.h b/src/mesa/drivers/dri/intel/intel_chipset.h
index cd614c59e55..72a74322ee5 100644
--- a/src/mesa/drivers/dri/intel/intel_chipset.h
+++ b/src/mesa/drivers/dri/intel/intel_chipset.h
@@ -115,6 +115,9 @@
 				 devid == PCI_CHIP_I946_GZ || \
 				 IS_G4X(devid))
 
+/* Compat macro for intel_decode.c */
+#define IS_IRONLAKE(devid)	IS_GEN5(devid)
+
 #define IS_GEN6(devid)		(devid == PCI_CHIP_SANDYBRIDGE || \
 				 devid == PCI_CHIP_SANDYBRIDGE_M)
 
diff --git a/src/mesa/drivers/dri/intel/intel_decode.c b/src/mesa/drivers/dri/intel/intel_decode.c
index 650010ac9ca..25b4131594f 100644
--- a/src/mesa/drivers/dri/intel/intel_decode.c
+++ b/src/mesa/drivers/dri/intel/intel_decode.c
@@ -1,48 +1,21 @@
-/* -*- c-basic-offset: 4 -*- */
-/*
- * Copyright © 2007 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Eric Anholt <[email protected]>
- *
- */
-
-/** @file intel_decode.c
- * This file contains code to print out batchbuffer contents in a
- * human-readable format.
- *
- * The current version only supports i915 packets, and only pretty-prints a
- * subset of them.  The intention is for it to make just a best attempt to
- * decode, but never crash in the process.
- */
-
+#include <stdint.h>
 #include <stdio.h>
 #include <stdarg.h>
 #include <string.h>
-#include <inttypes.h>
 
 #include "intel_decode.h"
 #include "intel_chipset.h"
 
+static FILE *out;
+static uint32_t saved_s2 = 0, saved_s4 = 0;
+static char saved_s2_set = 0, saved_s4_set = 0;
+static uint32_t head_offset = 0xffffffff; /* undefined */
+static uint32_t tail_offset = 0xffffffff; /* undefined */
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(A) (sizeof(A)/sizeof(A[0]))
+#endif
+
 #define BUFFER_FAIL(_count, _len, _name) do {			\
     fprintf(out, "Buffer size too small in %s (%d < %d)\n",	\
 	    (_name), (_count), (_len));				\
@@ -50,9 +23,6 @@
     return count;						\
 } while (0)
 
-static FILE *out;
-static uint32_t saved_s2 = 0, saved_s4 = 0;
-static char saved_s2_set = 0, saved_s4_set = 0;
 
 static float
 int_as_float(uint32_t intval)
@@ -71,15 +41,24 @@ instr_out(uint32_t *data, uint32_t hw_offset, unsigned int index,
 	  char *fmt, ...)
 {
     va_list va;
-
-    fprintf(out, "0x%08x: 0x%08x:%s ", hw_offset + index * 4, data[index],
-	    index == 0 ? "" : "  ");
+    char *parseinfo;
+    uint32_t offset = hw_offset + index * 4;
+
+    if (offset == head_offset)
+	parseinfo = "HEAD";
+    else if (offset == tail_offset)
+	parseinfo = "TAIL";
+    else
+	parseinfo = "    ";
+
+    fprintf(out, "0x%08x: %s 0x%08x: %s", offset, parseinfo,
+	    data[index],
+	    index == 0 ? "" : "   ");
     va_start(va, fmt);
     vfprintf(out, fmt, va);
     va_end(va);
 }
 
-
 static int
 decode_mi(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 {
@@ -94,10 +73,11 @@ decode_mi(uint32_t *data, int count, uint32_t hw_offset, int *failures)
     } opcodes_mi[] = {
 	{ 0x08, 0, 1, 1, "MI_ARB_ON_OFF" },
 	{ 0x0a, 0, 1, 1, "MI_BATCH_BUFFER_END" },
+	{ 0x30, 0x3f, 3, 3, "MI_BATCH_BUFFER" },
 	{ 0x31, 0x3f, 2, 2, "MI_BATCH_BUFFER_START" },
 	{ 0x14, 0x3f, 3, 3, "MI_DISPLAY_BUFFER_INFO" },
 	{ 0x04, 0, 1, 1, "MI_FLUSH" },
-	{ 0x22, 0, 3, 3, "MI_LOAD_REGISTER_IMM" },
+	{ 0x22, 0x1f, 3, 3, "MI_LOAD_REGISTER_IMM" },
 	{ 0x13, 0x3f, 2, 2, "MI_LOAD_SCAN_LINES_EXCL" },
 	{ 0x12, 0x3f, 2, 2, "MI_LOAD_SCAN_LINES_INCL" },
 	{ 0x00, 0, 1, 1, "MI_NOOP" },
@@ -111,6 +91,11 @@ decode_mi(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 	{ 0x03, 0, 1, 1, "MI_WAIT_FOR_EVENT" },
     };
 
+    switch ((data[0] & 0x1f800000) >> 23) {
+    case 0x0a:
+	instr_out(data, hw_offset, 0, "MI_BATCH_BUFFER_END\n");
+	return -1;
+    }
 
     for (opcode = 0; opcode < sizeof(opcodes_mi) / sizeof(opcodes_mi[0]);
 	 opcode++) {
@@ -305,9 +290,13 @@ decode_2d(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 static int
 decode_3d_1c(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 {
-    switch ((data[0] & 0x00f80000) >> 19) {
+    uint32_t opcode;
+
+    opcode = (data[0] & 0x00f80000) >> 19;
+
+    switch (opcode) {
     case 0x11:
-	instr_out(data, hw_offset, 0, "3DSTATE_DEPTH_SUBRECTANGLE_DISALBE\n");
+	instr_out(data, hw_offset, 0, "3DSTATE_DEPTH_SUBRECTANGLE_DISABLE\n");
 	return 1;
     case 0x10:
 	instr_out(data, hw_offset, 0, "3DSTATE_SCISSOR_ENABLE\n");
@@ -323,7 +312,8 @@ decode_3d_1c(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 	return 1;
     }
 
-    instr_out(data, hw_offset, 0, "3D UNKNOWN\n");
+    instr_out(data, hw_offset, 0, "3D UNKNOWN: 3d_1c opcode = 0x%x\n",
+	      opcode);
     (*failures)++;
     return 1;
 }
@@ -381,7 +371,7 @@ i915_get_instruction_dst(uint32_t *data, int i, char *dstname, int do_mask)
 	sprintf(dstname, "oD%s%s",  dstmask, sat);
 	break;
     case 6:
-	if (dst_nr > 2)
+	if (dst_nr > 3)
 	    fprintf(out, "bad destination reg U%d\n", dst_nr);
 	sprintf(dstname, "U%d%s%s", dst_nr, dstmask, sat);
 	break;
@@ -452,7 +442,7 @@ i915_get_instruction_src_name(uint32_t src_type, uint32_t src_nr, char *name)
 	break;
     case 6:
 	sprintf(name, "U%d", src_nr);
-	if (src_nr > 2)
+	if (src_nr > 3)
 	    fprintf(out, "bad src reg %s\n", name);
 	break;
     default:
@@ -797,10 +787,14 @@ i915_decode_instruction(uint32_t *data, uint32_t hw_offset,
 }
 
 static int
-decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i830)
+decode_3d_1d(uint32_t *data, int count,
+	     uint32_t hw_offset,
+	     uint32_t devid,
+	     int *failures)
 {
-    unsigned int len, i, c, opcode, word, map, sampler, instr;
+    unsigned int len, i, c, idx, word, map, sampler, instr;
     char *format;
+    uint32_t opcode;
 
     struct {
 	uint32_t opcode;
@@ -811,7 +805,7 @@ decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i
     } opcodes_3d_1d[] = {
 	{ 0x8e, 0, 3, 3, "3DSTATE_BUFFER_INFO" },
 	{ 0x86, 0, 4, 4, "3DSTATE_CHROMA_KEY" },
-	{ 0x9c, 0, 1, 1, "3DSTATE_CLEAR_PARAMETERS" },
+	{ 0x9c, 0, 7, 7, "3DSTATE_CLEAR_PARAMETERS" },
 	{ 0x88, 0, 2, 2, "3DSTATE_CONSTANT_BLEND_COLOR" },
 	{ 0x99, 0, 2, 2, "3DSTATE_DEFAULT_DIFFUSE" },
 	{ 0x9a, 0, 2, 2, "3DSTATE_DEFAULT_SPECULAR" },
@@ -819,7 +813,6 @@ decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i
 	{ 0x97, 0, 2, 2, "3DSTATE_DEPTH_OFFSET_SCALE" },
 	{ 0x85, 0, 2, 2, "3DSTATE_DEST_BUFFER_VARIABLES" },
 	{ 0x80, 0, 5, 5, "3DSTATE_DRAWING_RECTANGLE" },
-	{ 0x8e, 0, 3, 3, "3DSTATE_BUFFER_INFO" },
 	{ 0x9d, 0, 65, 65, "3DSTATE_FILTER_COEFFICIENTS_4X4" },
 	{ 0x9e, 0, 4, 4, "3DSTATE_MONO_FILTER" },
 	{ 0x89, 0, 4, 4, "3DSTATE_FOG_MODE" },
@@ -831,9 +824,11 @@ decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i
 	{ 0x8d, 1, 3, 3, "3DSTATE_W_STATE_I830" },
 	{ 0x01, 1, 2, 2, "3DSTATE_COLOR_FACTOR_I830" },
 	{ 0x02, 1, 2, 2, "3DSTATE_MAP_COORD_SETBIND_I830" },
-    };
+    }, *opcode_3d_1d;
+
+    opcode = (data[0] & 0x00ff0000) >> 16;
 
-    switch ((data[0] & 0x00ff0000) >> 16) {
+    switch (opcode) {
     case 0x07:
 	/* This instruction is unusual.  A 0 length means just 1 DWORD instead of
 	 * 2.  The 0 length is specified in one place to be unsupported, but
@@ -888,26 +883,56 @@ decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i
 	instr_out(data, hw_offset, 0, "3DSTATE_LOAD_STATE_IMMEDIATE_1\n");
 	len = (data[0] & 0x0000000f) + 2;
 	i = 1;
-	for (word = 0; word <= 7; word++) {
+	for (word = 0; word <= 8; word++) {
 	    if (data[0] & (1 << (4 + word))) {
 		if (i >= count)
 		    BUFFER_FAIL(count, len, "3DSTATE_LOAD_STATE_IMMEDIATE_1");
 
 		/* save vertex state for decode */
-		if (word == 2) {
-		    saved_s2_set = 1;
-		    saved_s2 = data[i];
-		}
-		if (word == 4) {
-		    saved_s4_set = 1;
-		    saved_s4 = data[i];
+		if (IS_9XX(devid)) {
+		    if (word == 2) {
+			saved_s2_set = 1;
+			saved_s2 = data[i];
+		    }
+		    if (word == 4) {
+			saved_s4_set = 1;
+			saved_s4 = data[i];
+		    }
 		}
 
 		instr_out(data, hw_offset, i++, "S%d\n", word);
 	    }
 	}
 	if (len != i) {
-	    fprintf(out, "Bad count in 3DSTATE_LOAD_INDIRECT\n");
+	    fprintf(out, "Bad count in 3DSTATE_LOAD_STATE_IMMEDIATE_1\n");
+	    (*failures)++;
+	}
+	return len;
+    case 0x03:
+	instr_out(data, hw_offset, 0, "3DSTATE_LOAD_STATE_IMMEDIATE_2\n");
+	len = (data[0] & 0x0000000f) + 2;
+	i = 1;
+	for (word = 6; word <= 14; word++) {
+	    if (data[0] & (1 << word)) {
+		if (i >= count)
+		    BUFFER_FAIL(count, len, "3DSTATE_LOAD_STATE_IMMEDIATE_2");
+
+		if (word == 6)
+		    instr_out(data, hw_offset, i++, "TBCF\n");
+		else if (word >= 7 && word <= 10) {
+		    instr_out(data, hw_offset, i++, "TB%dC\n", word - 7);
+		    instr_out(data, hw_offset, i++, "TB%dA\n", word - 7);
+		} else if (word >= 11 && word <= 14) {
+		    instr_out(data, hw_offset, i++, "TM%dS0\n", word - 11);
+		    instr_out(data, hw_offset, i++, "TM%dS1\n", word - 11);
+		    instr_out(data, hw_offset, i++, "TM%dS2\n", word - 11);
+		    instr_out(data, hw_offset, i++, "TM%dS3\n", word - 11);
+		    instr_out(data, hw_offset, i++, "TM%dS4\n", word - 11);
+		}
+	    }
+	}
+	if (len != i) {
+	    fprintf(out, "Bad count in 3DSTATE_LOAD_STATE_IMMEDIATE_2\n");
 	    (*failures)++;
 	}
 	return len;
@@ -919,11 +944,28 @@ decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i
 	i = 2;
 	for (map = 0; map <= 15; map++) {
 	    if (data[1] & (1 << map)) {
+		int width, height, pitch, dword;
+		const char *tiling;
+
 		if (i + 3 >= count)
 		    BUFFER_FAIL(count, len, "3DSTATE_MAP_STATE");
+
 		instr_out(data, hw_offset, i++, "map %d MS2\n", map);
-		instr_out(data, hw_offset, i++, "map %d MS3\n", map);
-		instr_out(data, hw_offset, i++, "map %d MS4\n", map);
+
+		dword = data[i];
+		width = ((dword >> 10) & ((1 << 11) - 1))+1;
+		height = ((dword >> 21) & ((1 << 11) - 1))+1;
+
+		tiling = "none";
+		if (dword & (1 << 2))
+			tiling = "fenced";
+		else if (dword & (1 << 1))
+			tiling = dword & (1 << 0) ? "Y" : "X";
+		instr_out(data, hw_offset, i++, "map %d MS3 [width=%d, height=%d, tiling=%s]\n", map, width, height, tiling);
+
+		dword = data[i];
+		pitch = 4*(((dword >> 21) & ((1 << 11) - 1))+1);
+		instr_out(data, hw_offset, i++, "map %d MS4 [pitch=%d]\n", map, pitch);
 	    }
 	}
 	if (len != i) {
@@ -979,8 +1021,8 @@ decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i
 	}
 	return len;
     case 0x01:
-	if (i830)
-	    break;
+	if (!IS_9XX(devid))
+		break;
 	instr_out(data, hw_offset, 0, "3DSTATE_SAMPLER_STATE\n");
 	instr_out(data, hw_offset, 1, "mask\n");
 	len = (data[0] & 0x0000003f) + 2;
@@ -1031,32 +1073,61 @@ decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i
 		  format,
 		  (data[1] & (1 << 31)) ? "en" : "dis");
 	return len;
+
+    case 0x8e:
+	{
+	    const char *name, *tiling;
+
+	    len = (data[0] & 0x0000000f) + 2;
+	    if (len != 3)
+		fprintf(out, "Bad count in 3DSTATE_BUFFER_INFO\n");
+	    if (count < 3)
+		BUFFER_FAIL(count, len, "3DSTATE_BUFFER_INFO");
+
+	    switch((data[1] >> 24) & 0x7) {
+	    case 0x3: name = "color"; break;
+	    case 0x7: name = "depth"; break;
+	    default: name = "unknown"; break;
+	    }
+
+	    tiling = "none";
+	    if (data[1] & (1 << 23))
+		tiling = "fenced";
+	    else if (data[1] & (1 << 22))
+		tiling = data[1] & (1 << 21) ? "Y" : "X";
+
+	    instr_out(data, hw_offset, 0, "3DSTATE_BUFFER_INFO\n");
+	    instr_out(data, hw_offset, 1, "%s, tiling = %s, pitch=%d\n", name, tiling, data[1]&0xffff);
+
+	    instr_out(data, hw_offset, 2, "address\n");
+	    return len;
+	}
     }
 
-    for (opcode = 0; opcode < sizeof(opcodes_3d_1d) / sizeof(opcodes_3d_1d[0]);
-	 opcode++)
+    for (idx = 0; idx < ARRAY_SIZE(opcodes_3d_1d); idx++)
     {
-	if (opcodes_3d_1d[opcode].i830_only && !i830)
+	opcode_3d_1d = &opcodes_3d_1d[idx];
+	if (opcode_3d_1d->i830_only && IS_9XX(devid))
 	    continue;
 
-	if (((data[0] & 0x00ff0000) >> 16) == opcodes_3d_1d[opcode].opcode) {
+	if (((data[0] & 0x00ff0000) >> 16) == opcode_3d_1d->opcode) {
 	    len = 1;
 
-	    instr_out(data, hw_offset, 0, "%s\n", opcodes_3d_1d[opcode].name);
-	    if (opcodes_3d_1d[opcode].max_len > 1) {
+	    instr_out(data, hw_offset, 0, "%s\n", opcode_3d_1d->name);
+	    if (opcode_3d_1d->max_len > 1) {
 		len = (data[0] & 0x0000ffff) + 2;
-		if (len < opcodes_3d_1d[opcode].min_len ||
-		    len > opcodes_3d_1d[opcode].max_len)
+		if (len < opcode_3d_1d->min_len ||
+		    len > opcode_3d_1d->max_len)
 		{
 		    fprintf(out, "Bad count in %s\n",
-			    opcodes_3d_1d[opcode].name);
+			    opcode_3d_1d->name);
 		    (*failures)++;
 		}
 	    }
 
 	    for (i = 1; i < len; i++) {
 		if (i >= count)
-		    BUFFER_FAIL(count, len,  opcodes_3d_1d[opcode].name);
+		    BUFFER_FAIL(count, len,  opcode_3d_1d->name);
 		instr_out(data, hw_offset, i, "dword %d\n", i);
 	    }
 
@@ -1064,7 +1135,7 @@ decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i
 	}
     }
 
-    instr_out(data, hw_offset, 0, "3D UNKNOWN\n");
+    instr_out(data, hw_offset, 0, "3D UNKNOWN: 3d_1d opcode = 0x%x\n", opcode);
     (*failures)++;
     return 1;
 }
@@ -1074,8 +1145,10 @@ decode_3d_primitive(uint32_t *data, int count, uint32_t hw_offset,
 		    int *failures)
 {
     char immediate = (data[0] & (1 << 23)) == 0;
-    unsigned int len, i;
+    unsigned int len, i, ret;
     char *primtype;
+    int original_s2 = saved_s2;
+    int original_s4 = saved_s4;
 
     switch ((data[0] >> 18) & 0xf) {
     case 0x0: primtype = "TRILIST"; break;
@@ -1088,7 +1161,7 @@ decode_3d_primitive(uint32_t *data, int count, uint32_t hw_offset,
     case 0x7: primtype = "RECTLIST"; break;
     case 0x8: primtype = "POINTLIST"; break;
     case 0x9: primtype = "DIB"; break;
-    case 0xa: primtype = "CLEAR_RECT"; break;
+    case 0xa: primtype = "CLEAR_RECT"; saved_s4 = 3 << 6; saved_s2 = ~0; break;
     default: primtype = "unknown"; break;
     }
 
@@ -1192,6 +1265,8 @@ decode_3d_primitive(uint32_t *data, int count, uint32_t hw_offset,
 		vertex++;
 	    }
 	}
+
+	ret = len;
     } else {
 	/* indirect vertices */
 	len = data[0] & 0x0000ffff; /* index count */
@@ -1209,13 +1284,15 @@ decode_3d_primitive(uint32_t *data, int count, uint32_t hw_offset,
 		    if ((data[i] & 0xffff) == 0xffff) {
 			instr_out(data, hw_offset, i,
 				  "            indices: (terminator)\n");
-			return i;
+			ret = i;
+			goto out;
 		    } else if ((data[i] >> 16) == 0xffff) {
 			instr_out(data, hw_offset, i,
 				  "            indices: 0x%04x, "
 				  "(terminator)\n",
 				  data[i] & 0xffff);
-			return i;
+			ret = i;
+			goto out;
 		    } else {
 			instr_out(data, hw_offset, i,
 				  "            indices: 0x%04x, 0x%04x\n",
@@ -1225,7 +1302,8 @@ decode_3d_primitive(uint32_t *data, int count, uint32_t hw_offset,
 		fprintf(out,
 			"3DPRIMITIVE: no terminator found in index buffer\n");
 		(*failures)++;
-		return count;
+		ret = count;
+		goto out;
 	    } else {
 		/* fixed size vertex index buffer */
 		for (i = 0; i < len; i += 2) {
@@ -1240,7 +1318,8 @@ decode_3d_primitive(uint32_t *data, int count, uint32_t hw_offset,
 		    }
 		}
 	    }
-	    return (len + 1) / 2 + 1;
+	    ret = (len + 1) / 2 + 1;
+	    goto out;
 	} else {
 	    /* sequential vertex access */
 	    if (count < 2)
@@ -1249,17 +1328,22 @@ decode_3d_primitive(uint32_t *data, int count, uint32_t hw_offset,
 		      "3DPRIMITIVE sequential indirect %s, %d starting from "
 		      "%d\n", primtype, len, data[1] & 0xffff);
 	    instr_out(data, hw_offset, 1, "           start\n");
-	    return 2;
+	    ret = 2;
+	    goto out;
 	}
     }
 
-    return len;
+out:
+    saved_s2 = original_s2;
+    saved_s4 = original_s4;
+    return ret;
 }
 
 static int
-decode_3d(uint32_t *data, int count, uint32_t hw_offset, int *failures)
+decode_3d(uint32_t *data, int count, uint32_t hw_offset, uint32_t devid, int *failures)
 {
-    unsigned int opcode;
+    uint32_t opcode;
+    unsigned int idx;
 
     struct {
 	uint32_t opcode;
@@ -1276,42 +1360,44 @@ decode_3d(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 	{ 0x0d, 1, 1, "3DSTATE_MODES_4" },
 	{ 0x0c, 1, 1, "3DSTATE_MODES_5" },
 	{ 0x07, 1, 1, "3DSTATE_RASTERIZATION_RULES" },
-    };
+    }, *opcode_3d;
+
+    opcode = (data[0] & 0x1f000000) >> 24;
 
-    switch ((data[0] & 0x1f000000) >> 24) {
+    switch (opcode) {
     case 0x1f:
 	return decode_3d_primitive(data, count, hw_offset, failures);
     case 0x1d:
-	return decode_3d_1d(data, count, hw_offset, failures, 0);
+	return decode_3d_1d(data, count, hw_offset, devid, failures);
     case 0x1c:
 	return decode_3d_1c(data, count, hw_offset, failures);
     }
 
-    for (opcode = 0; opcode < sizeof(opcodes_3d) / sizeof(opcodes_3d[0]);
-	 opcode++) {
-	if ((data[0] & 0x1f000000) >> 24 == opcodes_3d[opcode].opcode) {
+    for (idx = 0; idx < ARRAY_SIZE(opcodes_3d); idx++) {
+	opcode_3d = &opcodes_3d[idx];
+	if (opcode == opcode_3d->opcode) {
 	    unsigned int len = 1, i;
 
-	    instr_out(data, hw_offset, 0, "%s\n", opcodes_3d[opcode].name);
-	    if (opcodes_3d[opcode].max_len > 1) {
+	    instr_out(data, hw_offset, 0, "%s\n", opcode_3d->name);
+	    if (opcode_3d->max_len > 1) {
 		len = (data[0] & 0xff) + 2;
-		if (len < opcodes_3d[opcode].min_len ||
-		    len > opcodes_3d[opcode].max_len)
+		if (len < opcode_3d->min_len ||
+		    len > opcode_3d->max_len)
 		{
-		    fprintf(out, "Bad count in %s\n", opcodes_3d[opcode].name);
+		    fprintf(out, "Bad count in %s\n", opcode_3d->name);
 		}
 	    }
 
 	    for (i = 1; i < len; i++) {
 		if (i >= count)
-		    BUFFER_FAIL(count, len, opcodes_3d[opcode].name);
+		    BUFFER_FAIL(count, len, opcode_3d->name);
 		instr_out(data, hw_offset, i, "dword %d\n", i);
 	    }
 	    return len;
 	}
     }
 
-    instr_out(data, hw_offset, 0, "3D UNKNOWN\n");
+    instr_out(data, hw_offset, 0, "3D UNKNOWN: 3d opcode = 0x%x\n", opcode);
     (*failures)++;
     return 1;
 }
@@ -1403,11 +1489,86 @@ get_965_prim_type(uint32_t data)
 }
 
 static int
-decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures)
+i965_decode_urb_fence(uint32_t *data, uint32_t hw_offset, int len, int count,
+		      int *failures)
 {
-    unsigned int opcode, len;
-    int i;
-    char *desc1;
+	uint32_t vs_fence, clip_fence, gs_fence, sf_fence, vfe_fence, cs_fence;
+
+	if (len != 3)	/* warning only: decoding below still proceeds */
+	    fprintf(out, "Bad count in URB_FENCE\n");
+	if (count < 3)	/* data[0], data[1] and data[2] are all read below */
+	    BUFFER_FAIL(count, len, "URB_FENCE");
+
+	vs_fence = data[1] & 0x3ff;	/* data[1]: three 10-bit fields at bits 0, 10, 20 */
+	gs_fence = (data[1] >> 10) & 0x3ff;
+	clip_fence = (data[1] >> 20) & 0x3ff;
+	sf_fence = data[2] & 0x3ff;	/* data[2]: fields at bits 0, 10, 20 */
+	vfe_fence = (data[2] >> 10) & 0x3ff;
+	cs_fence = (data[2] >> 20) & 0x7ff;	/* 11-bit mask, unlike the 10-bit fields above */
+
+	instr_out(data, hw_offset, 0, "URB_FENCE: %s%s%s%s%s%s\n",	/* one name per set bit in data[0] bits 13..8 */
+			(data[0] >> 13) & 1 ? "cs " : "",
+			(data[0] >> 12) & 1 ? "vfe " : "",
+			(data[0] >> 11) & 1 ? "sf " : "",
+			(data[0] >> 10) & 1 ? "clip " : "",
+			(data[0] >> 9)  & 1 ? "gs " : "",
+			(data[0] >> 8)  & 1 ? "vs " : "");
+	instr_out(data, hw_offset, 1,
+		  "vs fence: %d, clip_fence: %d, gs_fence: %d\n",
+		  vs_fence, clip_fence, gs_fence);
+	instr_out(data, hw_offset, 2,
+		  "sf fence: %d, vfe_fence: %d, cs_fence: %d\n",
+		  sf_fence, vfe_fence, cs_fence);
+	if (gs_fence < vs_fence)	/* ordering checks: each prints a note, none changes the return value */
+	    fprintf(out, "gs fence < vs fence!\n");
+	if (clip_fence < gs_fence)
+	    fprintf(out, "clip fence < gs fence!\n");
+	if (sf_fence < clip_fence)
+	    fprintf(out, "sf fence < clip fence!\n");
+	if (cs_fence < sf_fence)	/* note: vfe_fence is not part of any comparison here */
+	    fprintf(out, "cs fence < sf fence!\n");
+
+	return len;	/* the caller-supplied length, even when it was reported as bad */
+}
+
+/* Print dword `index` as "<name> state base ..."; bit 0 clear means "not updated". */
+static void
+state_base_out(uint32_t *data, uint32_t hw_offset, unsigned int index,
+	       char *name)
+{
+    if (!(data[index] & 1)) {
+	instr_out(data, hw_offset, index, "%s state base not updated\n", name);
+	return;
+    }
+    instr_out(data, hw_offset, index, "%s state base address 0x%08x\n",
+	      name, data[index] & ~1);
+}
+
+/* Print dword `index` as "<name> state upper bound ..."; a value of exactly 1 = disabled. */
+static void
+state_max_out(uint32_t *data, uint32_t hw_offset, unsigned int index,
+	      char *name)
+{
+    uint32_t dw = data[index];
+
+    if (!(dw & 1))
+	instr_out(data, hw_offset, index, "%s state upper bound not updated\n",
+		  name);
+    else if (dw == 1)
+	instr_out(data, hw_offset, index,
+		  "%s state upper bound disabled\n", name);
+    else
+	instr_out(data, hw_offset, index, "%s state upper bound 0x%08x\n",
+		  name, dw & ~1);
+}
+
+static int
+decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, uint32_t devid, int *failures)
+{
+    uint32_t opcode;
+    unsigned int idx, len;
+    int i, sba_len;
+    char *desc1 = NULL;
 
     struct {
 	uint32_t opcode;
@@ -1436,57 +1597,78 @@ decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 	{ 0x7907, 33, 33, "3DSTATE_POLY_STIPPLE_PATTERN" },
 	{ 0x7908, 3, 3, "3DSTATE_LINE_STIPPLE" },
 	{ 0x7909, 2, 2, "3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP" },
+	{ 0x7910, 2, 2, "3DSTATE_CLEAR_PARAMS" }, /* own opcode: the table is searched first-match, so sharing 0x7909 would hide it */
 	{ 0x790a, 3, 3, "3DSTATE_AA_LINE_PARAMETERS" },
+	{ 0x790b, 4, 4, "3DSTATE_GS_SVB_INDEX" },
+	{ 0x790d, 3, 3, "3DSTATE_MULTISAMPLE" },
 	{ 0x7b00, 6, 6, "3DPRIMITIVE" },
+	{ 0x7802, 4, 4, "3DSTATE_SAMPLER_STATE_POINTERS" },
+	{ 0x7805, 3, 3, "3DSTATE_URB" },
 	{ 0x780e, 4, 4, "3DSTATE_CC_STATE_POINTERS" },
 	{ 0x7810, 6, 6, "3DSTATE_VS_STATE" },
-	{ 0x7811, 6, 6, "3DSTATE_GS_STATE" },
+	{ 0x7811, 7, 7, "3DSTATE_GS_STATE" },
+	{ 0x7812, 4, 4, "3DSTATE_CLIP_STATE" },
+	{ 0x7813, 20, 20, "3DSTATE_SF_STATE" },
+	{ 0x7814, 9, 9, "3DSTATE_WM_STATE" },
 	{ 0x7812, 4, 4, "3DSTATE_CLIP_STATE" },
 	{ 0x7815, 5, 5, "3DSTATE_CONSTANT_VS_STATE" },
 	{ 0x7816, 5, 5, "3DSTATE_CONSTANT_GS_STATE" },
-    };
+	{ 0x7817, 5, 5, "3DSTATE_CONSTANT_PS_STATE" },
+	{ 0x7818, 2, 2, "3DSTATE_SAMPLE_MASK" },
+   }, *opcode_3d;
 
     len = (data[0] & 0x0000ffff) + 2;
 
-    switch ((data[0] & 0xffff0000) >> 16) {
+    opcode = (data[0] & 0xffff0000) >> 16;
+    switch (opcode) {
+    case 0x6000:
+	len = (data[0] & 0x000000ff) + 2;
+	return i965_decode_urb_fence(data, hw_offset, len, count, failures);
+    case 0x6001:
+	instr_out(data, hw_offset, 0, "CS_URB_STATE\n");
+	instr_out(data, hw_offset, 1, "entry_size: %d [%d bytes], n_entries: %d\n",
+			(data[1] >> 4) & 0x1f,
+			(((data[1] >> 4) & 0x1f) + 1) * 64,
+			data[1] & 0x7);
+	return len;
+    case 0x6002:
+	len = (data[0] & 0x000000ff) + 2;
+	instr_out(data, hw_offset, 0, "CONSTANT_BUFFER: %s\n",
+			(data[0] >> 8) & 1 ? "valid" : "invalid");
+	instr_out(data, hw_offset, 1, "offset: 0x%08x, length: %d bytes\n",
+			data[1] & ~0x3f, ((data[1] & 0x3f) + 1) * 64);
+	return len;
     case 0x6101:
-	if (len != 6)
+	if (IS_GEN6(devid))
+	    sba_len = 10;
+	else if (IS_IRONLAKE(devid))
+	    sba_len = 8;
+	else
+	    sba_len = 6;
+	if (len != sba_len)
 	    fprintf(out, "Bad count in STATE_BASE_ADDRESS\n");
-	if (count < 6)
+	if (count < sba_len)	/* buffer must hold every dword decoded below; a wrong len is only warned about above */
 	    BUFFER_FAIL(count, len, "STATE_BASE_ADDRESS");
 
+	i = 0;
 	instr_out(data, hw_offset, 0,
 		  "STATE_BASE_ADDRESS\n");
-
-	if (data[1] & 1) {
-	    instr_out(data, hw_offset, 1, "General state at 0x%08x\n",
-		      data[1] & ~1);
-	} else
-	    instr_out(data, hw_offset, 1, "General state not updated\n");
-
-	if (data[2] & 1) {
-	    instr_out(data, hw_offset, 2, "Surface state at 0x%08x\n",
-		      data[2] & ~1);
-	} else
-	    instr_out(data, hw_offset, 2, "Surface state not updated\n");
-
-	if (data[3] & 1) {
-	    instr_out(data, hw_offset, 3, "Indirect state at 0x%08x\n",
-		      data[3] & ~1);
-	} else
-	    instr_out(data, hw_offset, 3, "Indirect state not updated\n");
-
-	if (data[4] & 1) {
-	    instr_out(data, hw_offset, 4, "General state upper bound 0x%08x\n",
-		      data[4] & ~1);
-	} else
-	    instr_out(data, hw_offset, 4, "General state not updated\n");
-
-	if (data[5] & 1) {
-	    instr_out(data, hw_offset, 5, "Indirect state upper bound 0x%08x\n",
-		      data[5] & ~1);
-	} else
-	    instr_out(data, hw_offset, 5, "Indirect state not updated\n");
+	i++;
+
+	state_base_out(data, hw_offset, i++, "general");
+	state_base_out(data, hw_offset, i++, "surface");
+	if (IS_GEN6(devid))
+	    state_base_out(data, hw_offset, i++, "dynamic");
+	state_base_out(data, hw_offset, i++, "indirect");
+	if (IS_IRONLAKE(devid) || IS_GEN6(devid))
+	    state_base_out(data, hw_offset, i++, "instruction");
+
+	state_max_out(data, hw_offset, i++, "general");
+	if (IS_GEN6(devid))
+	    state_max_out(data, hw_offset, i++, "dynamic");
+	state_max_out(data, hw_offset, i++, "indirect");
+	if (IS_IRONLAKE(devid) || IS_GEN6(devid))
+	    state_max_out(data, hw_offset, i++, "instruction");
 
 	return len;
     case 0x7800:
@@ -1505,18 +1687,33 @@ decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 	instr_out(data, hw_offset, 6, "CC state\n");
 	return len;
     case 0x7801:
-	if (len != 6)
+	len = (data[0] & 0x000000ff) + 2;
+	if (len != 6 && len != 4)
 	    fprintf(out, "Bad count in 3DSTATE_BINDING_TABLE_POINTERS\n");
-	if (count < 6)
-	    BUFFER_FAIL(count, len, "3DSTATE_BINDING_TABLE_POINTERS");
+	if (len == 6) {
+	    if (count < 6)
+		BUFFER_FAIL(count, len, "3DSTATE_BINDING_TABLE_POINTERS");
+	    instr_out(data, hw_offset, 0,
+		      "3DSTATE_BINDING_TABLE_POINTERS\n");
+	    instr_out(data, hw_offset, 1, "VS binding table\n");
+	    instr_out(data, hw_offset, 2, "GS binding table\n");
+	    instr_out(data, hw_offset, 3, "Clip binding table\n");
+	    instr_out(data, hw_offset, 4, "SF binding table\n");
+	    instr_out(data, hw_offset, 5, "WM binding table\n");
+	} else {
+	    if (count < 4)
+		BUFFER_FAIL(count, len, "3DSTATE_BINDING_TABLE_POINTERS");
 
-	instr_out(data, hw_offset, 0,
-		  "3DSTATE_BINDING_TABLE_POINTERS\n");
-	instr_out(data, hw_offset, 1, "VS binding table\n");
-	instr_out(data, hw_offset, 2, "GS binding table\n");
-	instr_out(data, hw_offset, 3, "Clip binding table\n");
-	instr_out(data, hw_offset, 4, "SF binding table\n");
-	instr_out(data, hw_offset, 5, "WM binding table\n");
+	    instr_out(data, hw_offset, 0,
+		      "3DSTATE_BINDING_TABLE_POINTERS: VS mod %d, "
+		      "GS mod %d, PS mod %d\n",
+		      (data[0] & (1 << 8)) != 0,
+		      (data[0] & (1 << 9)) != 0,
+		      (data[0] & (1 << 10)) != 0);
+	    instr_out(data, hw_offset, 1, "VS binding table\n");
+	    instr_out(data, hw_offset, 2, "GS binding table\n");
+	    instr_out(data, hw_offset, 3, "WM binding table\n");
+	}
 
 	return len;
 
@@ -1567,6 +1764,18 @@ decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 	}
 	return len;
 
+    case 0x780d:
+	len = (data[0] & 0xff) + 2;
+	if (len != 4)
+	    fprintf(out, "Bad count in 3DSTATE_VIEWPORT_STATE_POINTERS\n");
+	if (count < len)
+	    BUFFER_FAIL(count, len, "3DSTATE_VIEWPORT_STATE_POINTERS");
+	instr_out(data, hw_offset, 0, "3DSTATE_VIEWPORT_STATE_POINTERS\n");
+	instr_out(data, hw_offset, 1, "clip\n");
+	instr_out(data, hw_offset, 2, "sf\n");
+	instr_out(data, hw_offset, 3, "cc\n");
+	return len;
+
     case 0x780a:
 	len = (data[0] & 0xff) + 2;
 	if (len != 3)
@@ -1616,10 +1825,10 @@ decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 		  ((data[3] & 0x0007ffc0) >> 6) + 1,
 		  ((data[3] & 0xfff80000) >> 19) + 1);
 	instr_out(data, hw_offset, 4, "volume depth\n");
-	if (len == 6)
+	if (len >= 6)
 	    instr_out(data, hw_offset, 5, "\n");
-	if (len == 7)
-	    instr_out(data, hw_offset, 6, "render target view extent\n");
+       if (len >= 7)
+           instr_out(data, hw_offset, 6, "render target view extent\n");
 
 	return len;
 
@@ -1638,12 +1847,11 @@ decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 	}
 	instr_out(data, hw_offset, 0,
 		  "PIPE_CONTROL: %s, %sdepth stall, %sRC write flush, "
-		  "%sinst flush, %stexture flush\n",
+		  "%sinst flush\n",
 		  desc1,
 		  data[0] & (1 << 13) ? "" : "no ",
 		  data[0] & (1 << 12) ? "" : "no ",
-		  data[0] & (1 << 11) ? "" : "no ",
-		  data[0] & (1 << 9) ? "" : "no ");
+		  data[0] & (1 << 11) ? "" : "no ");
 	instr_out(data, hw_offset, 1, "destination address\n");
 	instr_out(data, hw_offset, 2, "immediate dword low\n");
 	instr_out(data, hw_offset, 3, "immediate dword high\n");
@@ -1668,40 +1876,41 @@ decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 	return len;
     }
 
-    for (opcode = 0; opcode < sizeof(opcodes_3d) / sizeof(opcodes_3d[0]);
-	 opcode++) {
-	if ((data[0] & 0xffff0000) >> 16 == opcodes_3d[opcode].opcode) {
+    for (idx = 0; idx < ARRAY_SIZE(opcodes_3d); idx++) {
+	opcode_3d = &opcodes_3d[idx];
+	if ((data[0] & 0xffff0000) >> 16 == opcode_3d->opcode) {
 	    unsigned int i;
 	    len = 1;
 
-	    instr_out(data, hw_offset, 0, "%s\n", opcodes_3d[opcode].name);
-	    if (opcodes_3d[opcode].max_len > 1) {
+	    instr_out(data, hw_offset, 0, "%s\n", opcode_3d->name);
+	    if (opcode_3d->max_len > 1) {
 		len = (data[0] & 0xff) + 2;
-		if (len < opcodes_3d[opcode].min_len ||
-		    len > opcodes_3d[opcode].max_len)
+		if (len < opcode_3d->min_len ||
+		    len > opcode_3d->max_len)
 		{
-		    fprintf(out, "Bad count in %s\n", opcodes_3d[opcode].name);
+		    fprintf(out, "Bad count in %s\n", opcode_3d->name);
 		}
 	    }
 
 	    for (i = 1; i < len; i++) {
 		if (i >= count)
-		    BUFFER_FAIL(count, len, opcodes_3d[opcode].name);
+		    BUFFER_FAIL(count, len, opcode_3d->name);
 		instr_out(data, hw_offset, i, "dword %d\n", i);
 	    }
 	    return len;
 	}
     }
 
-    instr_out(data, hw_offset, 0, "3D UNKNOWN\n");
+    instr_out(data, hw_offset, 0, "3D UNKNOWN: 3d_965 opcode = 0x%x\n", opcode);
     (*failures)++;
     return 1;
 }
 
 static int
-decode_3d_i830(uint32_t *data, int count, uint32_t hw_offset, int *failures)
+decode_3d_i830(uint32_t *data, int count, uint32_t hw_offset, uint32_t devid, int *failures)
 {
-    unsigned int opcode;
+    unsigned int idx;
+    uint32_t opcode;
 
     struct {
 	uint32_t opcode;
@@ -1725,42 +1934,44 @@ decode_3d_i830(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 	{ 0x0f, 1, 1, "3DSTATE_MODES_2" },
 	{ 0x15, 1, 1, "3DSTATE_FOG_COLOR" },
 	{ 0x16, 1, 1, "3DSTATE_MODES_4" },
-    };
+    }, *opcode_3d;
 
-    switch ((data[0] & 0x1f000000) >> 24) {
+    opcode = (data[0] & 0x1f000000) >> 24;
+
+    switch (opcode) {
     case 0x1f:
 	return decode_3d_primitive(data, count, hw_offset, failures);
     case 0x1d:
-	return decode_3d_1d(data, count, hw_offset, failures, 1);
+	return decode_3d_1d(data, count, hw_offset, devid, failures);
     case 0x1c:
 	return decode_3d_1c(data, count, hw_offset, failures);
     }
 
-    for (opcode = 0; opcode < sizeof(opcodes_3d) / sizeof(opcodes_3d[0]);
-	 opcode++) {
-	if ((data[0] & 0x1f000000) >> 24 == opcodes_3d[opcode].opcode) {
+    for (idx = 0; idx < ARRAY_SIZE(opcodes_3d); idx++) {
+	opcode_3d = &opcodes_3d[idx];
+	if ((data[0] & 0x1f000000) >> 24 == opcode_3d->opcode) {
 	    unsigned int len = 1, i;
 
-	    instr_out(data, hw_offset, 0, "%s\n", opcodes_3d[opcode].name);
-	    if (opcodes_3d[opcode].max_len > 1) {
+	    instr_out(data, hw_offset, 0, "%s\n", opcode_3d->name);
+	    if (opcode_3d->max_len > 1) {
 		len = (data[0] & 0xff) + 2;
-		if (len < opcodes_3d[opcode].min_len ||
-		    len > opcodes_3d[opcode].max_len)
+		if (len < opcode_3d->min_len ||
+		    len > opcode_3d->max_len)
 		{
-		    fprintf(out, "Bad count in %s\n", opcodes_3d[opcode].name);
+		    fprintf(out, "Bad count in %s\n", opcode_3d->name);
 		}
 	    }
 
 	    for (i = 1; i < len; i++) {
 		if (i >= count)
-		    BUFFER_FAIL(count, len, opcodes_3d[opcode].name);
+		    BUFFER_FAIL(count, len, opcode_3d->name);
 		instr_out(data, hw_offset, i, "dword %d\n", i);
 	    }
 	    return len;
 	}
     }
 
-    instr_out(data, hw_offset, 0, "3D UNKNOWN\n");
+    instr_out(data, hw_offset, 0, "3D UNKNOWN: 3d_i830 opcode = 0x%x\n", opcode);
     (*failures)++;
     return 1;
 }
@@ -1773,18 +1984,37 @@ decode_3d_i830(uint32_t *data, int count, uint32_t hw_offset, int *failures)
  * \param hw_offset hardware address for the buffer
  */
 int
-intel_decode(uint32_t *data, int count, uint32_t hw_offset, uint32_t devid)
+intel_decode(uint32_t *data, int count,
+	     uint32_t hw_offset,
+	     uint32_t devid,
+	     uint32_t ignore_end_of_batchbuffer)
 {
+    int ret;
     int index = 0;
     int failures = 0;
 
-    out = stderr;
+    out = stdout;
 
     while (index < count) {
 	switch ((data[index] & 0xe0000000) >> 29) {
 	case 0x0:
-	    index += decode_mi(data + index, count - index,
+	    ret = decode_mi(data + index, count - index,
 			       hw_offset + index * 4, &failures);
+
+	    /* If MI_BATCHBUFFER_END happened, then dump the rest of the
+	     * output in case we some day want it in debugging, but don't
+	     * decode it since it'll just confuse in the common case.
+	     */
+	    if (ret == -1) {
+		if (ignore_end_of_batchbuffer) {
+		    index++;
+		} else {
+		    for (index = index + 1; index < count; index++) {
+			instr_out(data, hw_offset, index, "\n");
+		    }
+		}
+	    } else
+		index += ret;
 	    break;
 	case 0x2:
 	    index += decode_2d(data + index, count - index,
@@ -1793,13 +2023,16 @@ intel_decode(uint32_t *data, int count, uint32_t hw_offset, uint32_t devid)
 	case 0x3:
 	    if (IS_965(devid)) {
 		index += decode_3d_965(data + index, count - index,
-				       hw_offset + index * 4, &failures);
+				       hw_offset + index * 4,
+				       devid, &failures);
 	    } else if (IS_9XX(devid)) {
 		index += decode_3d(data + index, count - index,
-				   hw_offset + index * 4, &failures);
+				   hw_offset + index * 4,
+				   devid, &failures);
 	    } else {
 		index += decode_3d_i830(data + index, count - index,
-					hw_offset + index * 4, &failures);
+					hw_offset + index * 4,
+					devid, &failures);
 	    }
 	    break;
 	default:
@@ -1820,3 +2053,8 @@ void intel_decode_context_reset(void)
     saved_s4_set = 1;
 }
 
+void intel_decode_context_set_head_tail(uint32_t head, uint32_t tail)
+{	/* Stores both arguments in variables declared outside this function; nothing else is done here. */
+	head_offset = head;	/* NOTE(review): presumably the ring HEAD offset consulted while printing -- confirm */
+	tail_offset = tail;	/* NOTE(review): presumably the ring TAIL offset -- confirm */
+}
diff --git a/src/mesa/drivers/dri/intel/intel_decode.h b/src/mesa/drivers/dri/intel/intel_decode.h
index c50644a46b5..a13b075cef8 100644
--- a/src/mesa/drivers/dri/intel/intel_decode.h
+++ b/src/mesa/drivers/dri/intel/intel_decode.h
@@ -25,5 +25,7 @@
  *
  */
 
-int intel_decode(uint32_t *data, int count, uint32_t hw_offset, uint32_t devid);
+int intel_decode(uint32_t *data, int count, uint32_t hw_offset, uint32_t devid,
+		 uint32_t ignore_end_of_batchbuffer);
+void intel_decode_context_set_head_tail(uint32_t head, uint32_t tail);
 void intel_decode_context_reset(void);
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
index 076fee89bdd..0e2fe893fed 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
@@ -26,6 +26,7 @@
  **************************************************************************/
 
 #include "main/glheader.h"
+#include "main/arbprogram.h"
 #include "main/enums.h"
 #include "main/image.h"
 #include "main/colormac.h"
@@ -44,7 +45,6 @@
 #include "main/attrib.h"
 #include "main/enable.h"
 #include "main/viewport.h"
-#include "shader/arbprogram.h"
 #include "swrast/swrast.h"
 
 #include "intel_screen.h"
diff --git a/src/mesa/drivers/dri/mach64/mach64_screen.c b/src/mesa/drivers/dri/mach64/mach64_screen.c
index 4bd6dee6c0e..239e8bc8fd0 100644
--- a/src/mesa/drivers/dri/mach64/mach64_screen.c
+++ b/src/mesa/drivers/dri/mach64/mach64_screen.c
@@ -256,7 +256,6 @@ mach64CreateScreen( __DRIscreen *sPriv )
    mach64Screen->driScreen = sPriv;
 
    i = 0;
-   mach64Screen->extensions[i++] = &driFrameTrackingExtension.base;
    if ( mach64Screen->irq != 0 ) {
       mach64Screen->extensions[i++] = &driSwapControlExtension.base;
       mach64Screen->extensions[i++] = &driMediaStreamCounterExtension.base;
diff --git a/src/mesa/drivers/dri/mga/mga_xmesa.c b/src/mesa/drivers/dri/mga/mga_xmesa.c
index 31007ccb1da..3a31dfb44a3 100644
--- a/src/mesa/drivers/dri/mga/mga_xmesa.c
+++ b/src/mesa/drivers/dri/mga/mga_xmesa.c
@@ -182,7 +182,6 @@ mgaFillInModes( __DRIscreen *psp,
 const __DRIextension *mgaScreenExtensions[] = {
     &driReadDrawableExtension,
     &driSwapControlExtension.base,
-    &driFrameTrackingExtension.base,
     &driMediaStreamCounterExtension.base,
     NULL
 };
diff --git a/src/mesa/drivers/dri/r128/r128_screen.c b/src/mesa/drivers/dri/r128/r128_screen.c
index 2d918028236..7626a159d6a 100644
--- a/src/mesa/drivers/dri/r128/r128_screen.c
+++ b/src/mesa/drivers/dri/r128/r128_screen.c
@@ -221,7 +221,6 @@ r128CreateScreen( __DRIscreen *sPriv )
    r128Screen->driScreen = sPriv;
 
    i = 0;
-   r128Screen->extensions[i++] = &driFrameTrackingExtension.base;
    if ( r128Screen->irq != 0 ) {
        r128Screen->extensions[i++] = &driSwapControlExtension.base;
        r128Screen->extensions[i++] = &driMediaStreamCounterExtension.base;
diff --git a/src/mesa/drivers/dri/r128/r128_state.c b/src/mesa/drivers/dri/r128/r128_state.c
index 4d773feaaa8..9ad25f7f463 100644
--- a/src/mesa/drivers/dri/r128/r128_state.c
+++ b/src/mesa/drivers/dri/r128/r128_state.c
@@ -42,6 +42,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/context.h"
 #include "main/enums.h"
 #include "main/colormac.h"
+#include "main/macros.h"
 #include "swrast/swrast.h"
 #include "vbo/vbo.h"
 #include "tnl/tnl.h"
diff --git a/src/mesa/drivers/dri/r128/r128_tex.c b/src/mesa/drivers/dri/r128/r128_tex.c
index 4ec4be9a47b..b5a19b510af 100644
--- a/src/mesa/drivers/dri/r128/r128_tex.c
+++ b/src/mesa/drivers/dri/r128/r128_tex.c
@@ -44,6 +44,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/texobj.h"
 #include "main/imports.h"
 #include "main/texobj.h"
+#include "main/macros.h"
 
 #include "xmlpool.h"
 
diff --git a/src/mesa/drivers/dri/r200/r200_fragshader.c b/src/mesa/drivers/dri/r200/r200_fragshader.c
index 85c1b7bdd19..2a9268dd343 100644
--- a/src/mesa/drivers/dri/r200/r200_fragshader.c
+++ b/src/mesa/drivers/dri/r200/r200_fragshader.c
@@ -26,11 +26,11 @@
  **************************************************************************/
 
 #include "main/glheader.h"
+#include "main/atifragshader.h"
 #include "main/macros.h"
 #include "main/enums.h"
 #include "tnl/t_context.h"
-#include "shader/atifragshader.h"
-#include "shader/program.h"
+#include "program/program.h"
 #include "r200_context.h"
 #include "r200_ioctl.h"
 #include "r200_tex.h"
diff --git a/src/mesa/drivers/dri/r200/r200_ioctl.c b/src/mesa/drivers/dri/r200/r200_ioctl.c
index b72f69b7f45..df73de5394a 100644
--- a/src/mesa/drivers/dri/r200/r200_ioctl.c
+++ b/src/mesa/drivers/dri/r200/r200_ioctl.c
@@ -253,113 +253,6 @@ static void r200Clear( GLcontext *ctx, GLbitfield mask )
    }
 }
 
-/* This version of AllocateMemoryMESA allocates only GART memory, and
- * only does so after the point at which the driver has been
- * initialized.
- *
- * Theoretically a valid context isn't required.  However, in this
- * implementation, it is, as I'm using the hardware lock to protect
- * the kernel data structures, and the current context to get the
- * device fd.
- */
-void *r200AllocateMemoryMESA(__DRIscreen *screen, GLsizei size,
-			     GLfloat readfreq, GLfloat writefreq,
-			     GLfloat priority)
-{
-   GET_CURRENT_CONTEXT(ctx);
-   r200ContextPtr rmesa;
-   int region_offset;
-   drm_radeon_mem_alloc_t alloc;
-   int ret;
-
-   if (R200_DEBUG & RADEON_IOCTL)
-      fprintf(stderr, "%s sz %d %f/%f/%f\n", __FUNCTION__, size, readfreq,
-	      writefreq, priority);
-
-   if (!ctx || !(rmesa = R200_CONTEXT(ctx)) || !rmesa->radeon.radeonScreen->gartTextures.map)
-      return NULL;
-
-   if (getenv("R200_NO_ALLOC"))
-      return NULL;
-
-   alloc.region = RADEON_MEM_REGION_GART;
-   alloc.alignment = 0;
-   alloc.size = size;
-   alloc.region_offset = &region_offset;
-
-   ret = drmCommandWriteRead( rmesa->radeon.radeonScreen->driScreen->fd,
-			      DRM_RADEON_ALLOC,
-			      &alloc, sizeof(alloc));
-
-   if (ret) {
-      fprintf(stderr, "%s: DRM_RADEON_ALLOC ret %d\n", __FUNCTION__, ret);
-      return NULL;
-   }
-
-   {
-      char *region_start = (char *)rmesa->radeon.radeonScreen->gartTextures.map;
-      return (void *)(region_start + region_offset);
-   }
-}
-
-
-/* Called via glXFreeMemoryMESA() */
-void r200FreeMemoryMESA(__DRIscreen *screen, GLvoid *pointer)
-{
-   GET_CURRENT_CONTEXT(ctx);
-   r200ContextPtr rmesa;
-   ptrdiff_t region_offset;
-   drm_radeon_mem_free_t memfree;
-   int ret;
-
-   if (R200_DEBUG & RADEON_IOCTL)
-      fprintf(stderr, "%s %p\n", __FUNCTION__, pointer);
-
-   if (!ctx || !(rmesa = R200_CONTEXT(ctx)) || !rmesa->radeon.radeonScreen->gartTextures.map) {
-      fprintf(stderr, "%s: no context\n", __FUNCTION__);
-      return;
-   }
-
-   region_offset = (char *)pointer - (char *)rmesa->radeon.radeonScreen->gartTextures.map;
-
-   if (region_offset < 0 ||
-       region_offset > rmesa->radeon.radeonScreen->gartTextures.size) {
-      fprintf(stderr, "offset %d outside range 0..%d\n", region_offset,
-	      rmesa->radeon.radeonScreen->gartTextures.size);
-      return;
-   }
-
-   memfree.region = RADEON_MEM_REGION_GART;
-   memfree.region_offset = region_offset;
-
-   ret = drmCommandWrite( rmesa->radeon.radeonScreen->driScreen->fd,
-			  DRM_RADEON_FREE,
-			  &memfree, sizeof(memfree));
-
-   if (ret)
-      fprintf(stderr, "%s: DRM_RADEON_FREE ret %d\n", __FUNCTION__, ret);
-}
-
-/* Called via glXGetMemoryOffsetMESA() */
-GLuint r200GetMemoryOffsetMESA(__DRIscreen *screen, const GLvoid *pointer)
-{
-   GET_CURRENT_CONTEXT(ctx);
-   r200ContextPtr rmesa;
-   GLuint card_offset;
-
-   if (!ctx || !(rmesa = R200_CONTEXT(ctx)) ) {
-      fprintf(stderr, "%s: no context\n", __FUNCTION__);
-      return ~0;
-   }
-
-   if (!r200IsGartMemory( rmesa, pointer, 0 ))
-      return ~0;
-
-   card_offset = r200GartOffsetFromVirtual( rmesa, pointer );
-
-   return card_offset - rmesa->radeon.radeonScreen->gart_base;
-}
-
 GLboolean r200IsGartMemory( r200ContextPtr rmesa, const GLvoid *pointer,
 			   GLint size )
 {
diff --git a/src/mesa/drivers/dri/r200/r200_ioctl.h b/src/mesa/drivers/dri/r200/r200_ioctl.h
index 8d51aefa042..c5dca89bc76 100644
--- a/src/mesa/drivers/dri/r200/r200_ioctl.h
+++ b/src/mesa/drivers/dri/r200/r200_ioctl.h
@@ -64,11 +64,6 @@ extern void r200EmitAOS(r200ContextPtr rmesa, GLuint nr, GLuint offset);
 
 extern void r200InitIoctlFuncs( struct dd_function_table *functions );
 
-extern void *r200AllocateMemoryMESA( __DRIscreen *screen, GLsizei size, GLfloat readfreq,
-				   GLfloat writefreq, GLfloat priority );
-extern void r200FreeMemoryMESA( __DRIscreen *screen, GLvoid *pointer );
-extern GLuint r200GetMemoryOffsetMESA( __DRIscreen *screen, const GLvoid *pointer );
-
 extern GLboolean r200IsGartMemory( r200ContextPtr rmesa, const GLvoid *pointer,
 				   GLint size );
 
diff --git a/src/mesa/drivers/dri/r200/r200_vertprog.c b/src/mesa/drivers/dri/r200/r200_vertprog.c
index 12f869d96f8..5d268319f3f 100644
--- a/src/mesa/drivers/dri/r200/r200_vertprog.c
+++ b/src/mesa/drivers/dri/r200/r200_vertprog.c
@@ -33,11 +33,11 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/glheader.h"
 #include "main/macros.h"
 #include "main/enums.h"
-#include "shader/program.h"
-#include "shader/prog_instruction.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_statevars.h"
-#include "shader/programopt.h"
+#include "program/program.h"
+#include "program/prog_instruction.h"
+#include "program/prog_parameter.h"
+#include "program/prog_statevars.h"
+#include "program/programopt.h"
 #include "tnl/tnl.h"
 
 #include "r200_context.h"
diff --git a/src/mesa/drivers/dri/r300/compiler/Makefile b/src/mesa/drivers/dri/r300/compiler/Makefile
index ff3801dc676..3167d49bcae 100644
--- a/src/mesa/drivers/dri/r300/compiler/Makefile
+++ b/src/mesa/drivers/dri/r300/compiler/Makefile
@@ -23,6 +23,7 @@ C_SOURCES = \
 		radeon_dataflow_deadcode.c \
 		radeon_dataflow_swizzles.c \
 		radeon_optimize.c \
+		radeon_rename_regs.c \
 		r3xx_fragprog.c \
 		r300_fragprog.c \
 		r300_fragprog_swizzle.c \
diff --git a/src/mesa/drivers/dri/r300/compiler/SConscript b/src/mesa/drivers/dri/r300/compiler/SConscript
index 50d9cdb7f2d..c6f47a6f8a4 100755
--- a/src/mesa/drivers/dri/r300/compiler/SConscript
+++ b/src/mesa/drivers/dri/r300/compiler/SConscript
@@ -22,6 +22,7 @@ r300compiler = env.ConvenienceLibrary(
         'radeon_pair_schedule.c',
         'radeon_pair_regalloc.c',
         'radeon_optimize.c',
+        'radeon_rename_regs.c',
         'radeon_emulate_branches.c',
         'radeon_emulate_loops.c',
         'radeon_dataflow.c',
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
index 38312658d65..a326ee4c4fa 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
@@ -29,6 +29,7 @@
 #include "radeon_emulate_loops.h"
 #include "radeon_program_alu.h"
 #include "radeon_program_tex.h"
+#include "radeon_rename_regs.h"
 #include "r300_fragprog.h"
 #include "r300_fragprog_swizzle.h"
 #include "r500_fragprog.h"
@@ -97,25 +98,27 @@ static void debug_program_log(struct r300_fragment_program_compiler* c, const ch
 
 void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 {
+	struct emulate_loop_state loop_state;
+
 	rewrite_depth_out(c);
 
+	/* This transformation needs to be done before any of the IF
+	 * instructions are modified. */
+	radeonTransformKILP(&c->Base);
+
 	debug_program_log(c, "before compilation");
 
-	/* XXX Ideally this should be done only for r3xx, but since
-	 * we don't have branching support for r5xx, we use the emulation
-	 * on all chipsets. */
-	
-	if(c->Base.is_r500){
-		rc_emulate_loops(&c->Base, R500_PFS_MAX_INST);
+	if (c->Base.is_r500){
+		r500_transform_unroll_loops(&c->Base, &loop_state);	
+		debug_program_log(c, "after r500 transform loops");
 	}
 	else{
-		rc_emulate_loops(&c->Base, R300_PFS_MAX_ALU_INST);
+		rc_transform_unroll_loops(&c->Base, &loop_state);
+		debug_program_log(c, "after transform loops");
+		
+		rc_emulate_branches(&c->Base);
+		debug_program_log(c, "after emulate branches");
 	}
-	debug_program_log(c, "after emulate loops");
-	
-	rc_emulate_branches(&c->Base);
-
-	debug_program_log(c, "after emulate branches");
 
 	if (c->Base.is_r500) {
 		struct radeon_program_transformation transformations[] = {
@@ -162,6 +165,11 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 
 	debug_program_log(c, "after deadcode");
 
+	if(!c->Base.is_r500){
+		rc_emulate_loops(&loop_state, R300_PFS_MAX_ALU_INST);
+		debug_program_log(c, "after emulate loops");
+	}
+
 	rc_optimize(&c->Base);
 
 	debug_program_log(c, "after dataflow optimize");
@@ -172,6 +180,16 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 
 	debug_program_log(c, "after dataflow passes");
 
+	if(!c->Base.is_r500) {
+		/* This pass makes it easier for the scheduler to group TEX
+		 * instructions and reduces the chances of creating too
+		 * many texture indirections.*/
+		rc_rename_regs(&c->Base);
+		if (c->Base.Error)
+			return;
+		debug_program_log(c, "after register rename");
+	}
+
 	rc_pair_translate(c);
 	if (c->Base.Error)
 		return;
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
index 507b2e532fe..d347b4df9cd 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
@@ -30,6 +30,7 @@
 #include "radeon_program_alu.h"
 #include "radeon_swizzle.h"
 #include "radeon_emulate_branches.h"
+#include "radeon_emulate_loops.h"
 
 /*
  * Take an already-setup and valid source then swizzle it appropriately to
@@ -145,7 +146,8 @@ static unsigned long t_src(struct r300_vertex_program_code *vp,
 			       t_swizzle(GET_SWZ(src->Swizzle, 2)),
 			       t_swizzle(GET_SWZ(src->Swizzle, 3)),
 			       t_src_class(src->File),
-			       src->Negate) | (src->RelAddr << 4);
+			       src->Negate) |
+	       (src->RelAddr << 4) | (src->Abs << 3);
 }
 
 static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
@@ -161,7 +163,7 @@ static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
 			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
 			       t_src_class(src->File),
 			       src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
-	    (src->RelAddr << 4);
+	       (src->RelAddr << 4) | (src->Abs << 3);
 }
 
 static int valid_dst(struct r300_vertex_program_code *vp,
@@ -348,7 +350,8 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi
 		if (!valid_dst(compiler->code, &vpi->DstReg))
 			continue;
 
-		if (compiler->code->length >= VSF_MAX_FRAGMENT_LENGTH) {
+		if (compiler->code->length >= R500_VS_MAX_ALU_DWORDS ||
+		    (compiler->code->length >= R300_VS_MAX_ALU_DWORDS && !compiler->Base.is_r500)) {
 			rc_error(&compiler->Base, "Vertex program has too many instructions\n");
 			return;
 		}
@@ -404,7 +407,7 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
 {
 	struct rc_instruction *inst;
 	unsigned int num_orig_temps = 0;
-	char hwtemps[VSF_MAX_FRAGMENT_TEMPS];
+	char hwtemps[R300_VS_MAX_TEMPS];
 	struct temporary_allocation * ta;
 	unsigned int i, j;
 
@@ -463,11 +466,11 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
 				unsigned int orig = inst->U.I.DstReg.Index;
 
 				if (!ta[orig].Allocated) {
-					for(j = 0; j < VSF_MAX_FRAGMENT_TEMPS; ++j) {
+					for(j = 0; j < R300_VS_MAX_TEMPS; ++j) {
 						if (!hwtemps[j])
 							break;
 					}
-					if (j >= VSF_MAX_FRAGMENT_TEMPS) {
+					if (j >= R300_VS_MAX_TEMPS) {
 						fprintf(stderr, "Out of hw temporaries\n");
 					} else {
 						ta[orig].Allocated = 1;
@@ -485,6 +488,44 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
 	}
 }
 
+/**
+ * R3xx-R4xx vertex engine does not support the Absolute source operand modifier
+ * and the Saturate opcode modifier. Only Absolute is currently transformed.
+ */
+static int transform_nonnative_modifiers(
+	struct radeon_compiler *c,
+	struct rc_instruction *inst,
+	void* unused)
+{
+	const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
+	unsigned i;
+
+	/* Transform ABS(a) to MAX(a, -a). */
+	for (i = 0; i < opcode->NumSrcRegs; i++) {
+		if (inst->U.I.SrcReg[i].Abs) {
+			struct rc_instruction *new_inst;
+			unsigned temp;
+
+			inst->U.I.SrcReg[i].Abs = 0;
+
+			temp = rc_find_free_temporary(c);
+
+			new_inst = rc_insert_new_instruction(c, inst->Prev);
+			new_inst->U.I.Opcode = RC_OPCODE_MAX;
+			new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
+			new_inst->U.I.DstReg.Index = temp;
+			new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i];
+			new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i];
+			new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
+
+			memset(&inst->U.I.SrcReg[i], 0, sizeof(inst->U.I.SrcReg[i]));
+			inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
+			inst->U.I.SrcReg[i].Index = temp;
+			inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW;
+		}
+	}
+	return 1;
+}
 
 /**
  * Vertex engine cannot read two inputs or two constants at the same time.
@@ -591,6 +632,8 @@ static struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
 
 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
 {
+	struct emulate_loop_state loop_state;
+	
 	compiler->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
 
 	addArtificialOutputs(compiler);
@@ -600,19 +643,48 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
 	/* XXX Ideally this should be done only for r3xx, but since
 	 * we don't have branching support for r5xx, we use the emulation
 	 * on all chipsets. */
+	rc_transform_unroll_loops(&compiler->Base, &loop_state);
+	
+	debug_program_log(compiler, "after transform loops");
+	
+	if (compiler->Base.is_r500){
+		rc_emulate_loops(&loop_state, R500_VS_MAX_ALU);
+	} else {
+		rc_emulate_loops(&loop_state, R300_VS_MAX_ALU);
+	}
+	debug_program_log(compiler, "after emulate loops");
+
 	rc_emulate_branches(&compiler->Base);
 
 	debug_program_log(compiler, "after emulate branches");
 
-	{
+	if (compiler->Base.is_r500) {
 		struct radeon_program_transformation transformations[] = {
 			{ &r300_transform_vertex_alu, 0 },
 			{ &r300_transform_trig_scale_vertex, 0 }
 		};
 		radeonLocalTransform(&compiler->Base, 2, transformations);
-	}
 
-	debug_program_log(compiler, "after native rewrite");
+		debug_program_log(compiler, "after native rewrite");
+	} else {
+		struct radeon_program_transformation transformations[] = {
+			{ &r300_transform_vertex_alu, 0 },
+			{ &radeonTransformTrigSimple, 0 }
+		};
+		radeonLocalTransform(&compiler->Base, 2, transformations);
+
+		debug_program_log(compiler, "after native rewrite");
+
+		/* Note: This pass has to be done separately from ALU rewrite,
+		 * because it needs to check every instruction.
+		 */
+		struct radeon_program_transformation transformations2[] = {
+			{ &transform_nonnative_modifiers, 0 },
+		};
+		radeonLocalTransform(&compiler->Base, 1, transformations2);
+
+		debug_program_log(compiler, "after emulate modifiers");
+	}
 
 	{
 		/* Note: This pass has to be done seperately from ALU rewrite,
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
index 632f0bcf4f8..e6b5522c5b9 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
@@ -30,6 +30,7 @@
 #include <stdio.h>
 
 #include "../r300_reg.h"
+#include "radeon_emulate_loops.h"
 
 /**
  * Rewrite IF instructions to use the ALU result special register.
@@ -59,6 +60,31 @@ int r500_transform_IF(
 	return 1;
 }
 
+/**
+ * Rewrite loops to make them easier to emit.  This is not a local
+ * transformation, because it modifies and reorders an entire block of code.
+ */
+void r500_transform_unroll_loops(struct radeon_compiler * c,
+						struct emulate_loop_state *s)
+{
+	int i;
+	
+	rc_transform_unroll_loops(c, s);
+	
+	for( i = s->LoopCount - 1; i >= 0; i-- ){
+		struct rc_instruction * inst_continue;
+		if(!s->Loops[i].EndLoop){
+			continue;
+		}
+		/* Insert a continue instruction at the end of the loop.  This
+		 * is required in order to emit loops correctly. */
+		inst_continue = rc_insert_new_instruction(c,
+						s->Loops[i].EndIf->Prev);
+		inst_continue->U.I.Opcode = RC_OPCODE_CONTINUE;
+	}
+
+}
+
 static int r500_swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
 {
 	unsigned int relevant;
@@ -252,7 +278,7 @@ void r500FragmentProgramDump(struct rX00_fragment_program_code *c)
   struct r500_fragment_program_code *code = &c->code.r500;
   fprintf(stderr, "R500 Fragment Program:\n--------\n");
 
-  int n;
+  int n, i;
   uint32_t inst;
   uint32_t inst0;
   char *str = NULL;
@@ -275,8 +301,8 @@ void r500FragmentProgramDump(struct rX00_fragment_program_code *c)
 	    to_mask((inst >> 15) & 0xf));
 
     switch(inst0 & 0x3) {
-    case 0:
-    case 1:
+    case R500_INST_TYPE_ALU:
+    case R500_INST_TYPE_OUT:
       fprintf(stderr,"\t1:RGB_ADDR   0x%08x:", code->inst[n].inst1);
       inst = code->inst[n].inst1;
 
@@ -319,9 +345,87 @@ void r500FragmentProgramDump(struct rX00_fragment_program_code *c)
 	      (inst >> 23) & 0x3,
 	      (inst >> 25) & 0x3, toswiz((inst >> 27) & 0x7), (inst >> 30) & 0x3);
       break;
-    case 2:
+    case R500_INST_TYPE_FC:
+      fprintf(stderr, "\t2:FC_INST    0x%08x:", code->inst[n].inst2);
+      inst = code->inst[n].inst2;
+      /* JUMP_FUNC JUMP_ANY*/
+      fprintf(stderr, "0x%02x %1x ", inst >> 8 & 0xff,
+          (inst & R500_FC_JUMP_ANY) >> 5);
+      
+      /* OP */
+      switch(inst & 0x7){
+      case R500_FC_OP_JUMP:
+      	fprintf(stderr, "JUMP");
+        break;
+      case R500_FC_OP_LOOP:
+        fprintf(stderr, "LOOP");
+        break;
+      case R500_FC_OP_ENDLOOP:
+        fprintf(stderr, "ENDLOOP");
+        break;
+      case R500_FC_OP_REP:
+        fprintf(stderr, "REP");
+        break;
+      case R500_FC_OP_ENDREP:
+        fprintf(stderr, "ENDREP");
+        break;
+      case R500_FC_OP_BREAKLOOP:
+        fprintf(stderr, "BREAKLOOP");
+        break;
+      case R500_FC_OP_BREAKREP:
+        fprintf(stderr, "BREAKREP");
+	break;
+      case R500_FC_OP_CONTINUE:
+        fprintf(stderr, "CONTINUE");
+        break;
+      }
+      fprintf(stderr," "); 
+      /* A_OP */
+      switch(inst & (0x3 << 6)){
+      case R500_FC_A_OP_NONE:
+        fprintf(stderr, "NONE");
+        break;
+      case R500_FC_A_OP_POP:
+	fprintf(stderr, "POP");
+        break;
+      case R500_FC_A_OP_PUSH:
+        fprintf(stderr, "PUSH");
+        break;
+      }
+      /* B_OP0 B_OP1 */
+      for(i=0; i<2; i++){
+        fprintf(stderr, " ");
+        switch(inst & (0x3 << (24 + (i * 2)))){
+        /* R500_FC_B_OP0_NONE 
+	 * R500_FC_B_OP1_NONE */
+	case 0:
+          fprintf(stderr, "NONE");
+          break;
+        case R500_FC_B_OP0_DECR:
+        case R500_FC_B_OP1_DECR:
+          fprintf(stderr, "DECR");
+          break;
+        case R500_FC_B_OP0_INCR:
+        case R500_FC_B_OP1_INCR:
+          fprintf(stderr, "INCR");
+          break;
+        }
+      }
+      /*POP_CNT B_ELSE */
+      fprintf(stderr, " %d %1x", (inst >> 16) & 0x1f, (inst & R500_FC_B_ELSE) >> 4);
+      inst = code->inst[n].inst3;
+      /* JUMP_ADDR */
+      fprintf(stderr, " %d", inst >> 16);
+      
+      if(code->inst[n].inst2 & R500_FC_IGNORE_UNCOVERED){
+        fprintf(stderr, " IGN_UNC");
+      }
+      inst = code->inst[n].inst3;
+      fprintf(stderr, "\n\t3:FC_ADDR    0x%08x:", inst);
+      fprintf(stderr, "BOOL: 0x%02x, INT: 0x%02x, JUMP_ADDR: %d, JMP_GLBL: %1x\n",
+      inst & 0x1f, (inst >> 8) & 0x1f, (inst >> 16) & 0x1ff, inst >> 31); 
       break;
-    case 3:
+    case R500_INST_TYPE_TEX:
       inst = code->inst[n].inst1;
       fprintf(stderr,"\t1:TEX_INST:  0x%08x: id: %d op:%s, %s, %s %s\n", inst, (inst >> 16) & 0xf,
 	      to_texop((inst >> 22) & 0x7), (inst & (1<<25)) ? "ACQ" : "",
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h
index 4efbae7ba67..0d005a794ff 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h
@@ -36,6 +36,8 @@
 #include "radeon_compiler.h"
 #include "radeon_swizzle.h"
 
+struct emulate_loop_state;
+
 extern void r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler);
 
 extern void r500FragmentProgramDump(struct rX00_fragment_program_code *c);
@@ -47,4 +49,6 @@ extern int r500_transform_IF(
 	struct rc_instruction * inst,
 	void* data);
 
+void r500_transform_unroll_loops(struct radeon_compiler * c,
+						struct emulate_loop_state * s);
 #endif
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
index fb2d8b5a9c0..0bd8f0a239f 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
@@ -45,6 +45,8 @@
 
 #include "radeon_program_pair.h"
 
+#define MAX_BRANCH_DEPTH_FULL 32
+#define MAX_BRANCH_DEPTH_PARTIAL 4
 
 #define PROG_CODE \
 	struct r500_fragment_program_code *code = &c->code->code.r500
@@ -61,6 +63,10 @@ struct branch_info {
 	int Endif;
 };
 
+struct loop_info {
+	int LoopStart;
+};
+
 struct emit_state {
 	struct radeon_compiler * C;
 	struct r500_fragment_program_code * Code;
@@ -69,7 +75,12 @@ struct emit_state {
 	unsigned int CurrentBranchDepth;
 	unsigned int BranchesReserved;
 
+	struct loop_info * Loops;
+	unsigned int CurrentLoopDepth;
+	unsigned int LoopsReserved;
+
 	unsigned int MaxBranchDepth;
+
 };
 
 static unsigned int translate_rgb_op(struct r300_fragment_program_compiler *c, rc_opcode opcode)
@@ -359,16 +370,49 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst
 
 	s->Code->inst[newip].inst0 = R500_INST_TYPE_FC | R500_INST_ALU_WAIT;
 
-	if (inst->U.I.Opcode == RC_OPCODE_IF) {
-		if (s->CurrentBranchDepth >= 32) {
+	switch(inst->U.I.Opcode){
+	struct branch_info * branch;
+	struct loop_info * loop;
+	case RC_OPCODE_BGNLOOP:
+		memory_pool_array_reserve(&s->C->Pool, struct loop_info,
+			s->Loops, s->CurrentLoopDepth, s->LoopsReserved, 1);
+
+		loop = &s->Loops[s->CurrentLoopDepth++];
+		
+		/* We don't emit an instruction for BGNLOOP, so we need to
+		 * decrement the instruction counter, but first we need to
+		 * set LoopStart to the current value of inst_end, which
+		 * will end up being the first real instruction in the loop.*/
+		loop->LoopStart = s->Code->inst_end--;
+		break;
+	
+	case RC_OPCODE_BRK:
+		/* Don't emit an instruction for BRK */
+		s->Code->inst_end--;
+		break;
+
+	case RC_OPCODE_CONTINUE:
+		loop = &s->Loops[s->CurrentLoopDepth - 1];
+		s->Code->inst[newip].inst2 = R500_FC_OP_JUMP |
+			R500_FC_JUMP_FUNC(0xff);
+		s->Code->inst[newip].inst3 = R500_FC_JUMP_ADDR(loop->LoopStart);
+		break;
+
+	case RC_OPCODE_ENDLOOP:
+		/* Don't emit an instruction for ENDLOOP */
+		s->Code->inst_end--;
+		s->CurrentLoopDepth--;
+		break;
+
+	case RC_OPCODE_IF:
+		if ( s->CurrentBranchDepth >= MAX_BRANCH_DEPTH_FULL) {
 			rc_error(s->C, "Branch depth exceeds hardware limit");
 			return;
 		}
-
 		memory_pool_array_reserve(&s->C->Pool, struct branch_info,
 				s->Branches, s->CurrentBranchDepth, s->BranchesReserved, 1);
 
-		struct branch_info * branch = &s->Branches[s->CurrentBranchDepth++];
+		branch = &s->Branches[s->CurrentBranchDepth++];
 		branch->If = newip;
 		branch->Else = -1;
 		branch->Endif = -1;
@@ -377,29 +421,50 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst
 			s->MaxBranchDepth = s->CurrentBranchDepth;
 
 		/* actual instruction is filled in at ENDIF time */
-	} else if (inst->U.I.Opcode == RC_OPCODE_ELSE) {
+		break;
+	
+	case RC_OPCODE_ELSE:
 		if (!s->CurrentBranchDepth) {
 			rc_error(s->C, "%s: got ELSE outside a branch", __FUNCTION__);
 			return;
 		}
 
-		struct branch_info * branch = &s->Branches[s->CurrentBranchDepth - 1];
+		branch = &s->Branches[s->CurrentBranchDepth - 1];
 		branch->Else = newip;
 
 		/* actual instruction is filled in at ENDIF time */
-	} else if (inst->U.I.Opcode == RC_OPCODE_ENDIF) {
+		break;
+
+	case RC_OPCODE_ENDIF:
 		if (!s->CurrentBranchDepth) {
 			rc_error(s->C, "%s: got ELSE outside a branch", __FUNCTION__);
 			return;
 		}
 
-		struct branch_info * branch = &s->Branches[s->CurrentBranchDepth - 1];
-		branch->Endif = newip;
-
+		branch = &s->Branches[s->CurrentBranchDepth - 1];
+		
+		if(inst->Prev->U.I.Opcode == RC_OPCODE_BRK){
+			branch->Endif = --s->Code->inst_end;
+			s->Code->inst[branch->Endif].inst2 |=
+				R500_FC_B_OP0_DECR;
+		}
+		else{
+			branch->Endif = newip;
+		
+			s->Code->inst[branch->Endif].inst2 = R500_FC_OP_JUMP
+				| R500_FC_A_OP_NONE /* no address stack */
+				| R500_FC_JUMP_ANY /* docs says set this, but I don't understand why */
+				| R500_FC_B_OP0_DECR /* decrement branch counter if stay */
+				| R500_FC_B_OP1_NONE /* no branch counter if stay */
+				| R500_FC_B_POP_CNT(1)
+			;
+			s->Code->inst[branch->Endif].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1);
+		}
 		s->Code->inst[branch->If].inst2 = R500_FC_OP_JUMP
 			| R500_FC_A_OP_NONE /* no address stack */
 			| R500_FC_JUMP_FUNC(0x0f) /* jump if ALU result is false */
 			| R500_FC_B_OP0_INCR /* increment branch counter if stay */
+			| R500_FC_IGNORE_UNCOVERED
 		;
 
 		if (branch->Else >= 0) {
@@ -421,17 +486,10 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst
 			s->Code->inst[branch->If].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1);
 		}
 
-		s->Code->inst[branch->Endif].inst2 = R500_FC_OP_JUMP
-			| R500_FC_A_OP_NONE /* no address stack */
-			| R500_FC_JUMP_ANY /* docs says set this, but I don't understand why */
-			| R500_FC_B_OP0_DECR /* decrement branch counter if stay */
-			| R500_FC_B_OP1_NONE /* no branch counter if stay */
-			| R500_FC_B_POP_CNT(1)
-		;
-		s->Code->inst[branch->Endif].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1);
 
 		s->CurrentBranchDepth--;
-	} else {
+		break;
+	default:
 		rc_error(s->C, "%s: unknown opcode %s\n", __FUNCTION__, rc_get_opcode_info(inst->U.I.Opcode)->Name);
 	}
 }
@@ -486,6 +544,10 @@ void r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compi
 		code->inst[ip].inst0 = R500_INST_TYPE_OUT | R500_INST_TEX_SEM_WAIT;
 	}
 
+	/* Use FULL flow control mode if branches are nested deep enough.
+	 * We do not need to enable FULL flow control mode for loops, because
+	 * we aren't using the hardware loop instructions.
+	 */
 	if (s.MaxBranchDepth >= 4) {
 		if (code->max_temp_idx < 1)
 			code->max_temp_idx = 1;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_code.h b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
index 1979e7e4e49..d03689763bc 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_code.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
@@ -235,8 +235,11 @@ struct rX00_fragment_program_code {
 };
 
 
-#define VSF_MAX_FRAGMENT_LENGTH (255*4)
-#define VSF_MAX_FRAGMENT_TEMPS (14)
+#define R300_VS_MAX_ALU		256
+#define R300_VS_MAX_ALU_DWORDS  (R300_VS_MAX_ALU * 4)
+#define R500_VS_MAX_ALU	        1024
+#define R500_VS_MAX_ALU_DWORDS  (R500_VS_MAX_ALU * 4)
+#define R300_VS_MAX_TEMPS	32
 
 #define VSF_MAX_INPUTS 32
 #define VSF_MAX_OUTPUTS 32
@@ -244,8 +247,8 @@ struct rX00_fragment_program_code {
 struct r300_vertex_program_code {
 	int length;
 	union {
-		uint32_t d[VSF_MAX_FRAGMENT_LENGTH];
-		float f[VSF_MAX_FRAGMENT_LENGTH];
+		uint32_t d[R500_VS_MAX_ALU_DWORDS];
+		float f[R500_VS_MAX_ALU_DWORDS];
 	} body;
 
 	int pos_end;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c
index e3c2c83c0cf..fbb4235c223 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c
@@ -202,32 +202,65 @@ void rc_dataflow_deadcode(struct radeon_compiler * c, rc_dataflow_mark_outputs_f
 	    inst = inst->Prev) {
 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
 
-		if (opcode->IsFlowControl) {
-			if (opcode->Opcode == RC_OPCODE_ENDIF) {
-				push_branch(&s);
-			} else {
-				if (s.BranchStackSize) {
-					struct branchinfo * branch = &s.BranchStack[s.BranchStackSize-1];
-
-					if (opcode->Opcode == RC_OPCODE_IF) {
-						or_updatemasks(&s.R,
-								&s.R,
-								branch->HaveElse ? &branch->StoreElse : &branch->StoreEndif);
-
-						s.BranchStackSize--;
-					} else if (opcode->Opcode == RC_OPCODE_ELSE) {
-						if (branch->HaveElse) {
-							rc_error(c, "%s: Multiple ELSE for one IF/ENDIF\n", __FUNCTION__);
-						} else {
-							memcpy(&branch->StoreElse, &s.R, sizeof(s.R));
-							memcpy(&s.R, &branch->StoreEndif, sizeof(s.R));
-							branch->HaveElse = 1;
-						}
+		switch(opcode->Opcode){
+		/* Mark all sources in the loop body as used before doing
+		 * normal deadcode analysis.  This is probably not optimal.
+		 */
+		case RC_OPCODE_ENDLOOP:
+		{
+			int endloops = 1;
+			struct rc_instruction *ptr;
+			for(ptr = inst->Prev; endloops > 0; ptr = ptr->Prev){
+				opcode = rc_get_opcode_info(ptr->U.I.Opcode);
+				if(ptr->U.I.Opcode == RC_OPCODE_BGNLOOP){
+					endloops--;
+					continue;
+				}
+				if(ptr->U.I.Opcode == RC_OPCODE_ENDLOOP){
+					endloops++;
+					continue;
+				}
+				if(opcode->HasDstReg){
+					int src = 0;
+					unsigned int srcmasks[3];
+					rc_compute_sources_for_writemask(ptr,
+						ptr->U.I.DstReg.WriteMask, srcmasks);
+					for(src=0; src < opcode->NumSrcRegs; src++){
+						mark_used(&s,
+							ptr->U.I.SrcReg[src].File,
+							ptr->U.I.SrcReg[src].Index,
+							srcmasks[src]);
+					}
+				}
+			}
+			break;
+		}
+		case RC_OPCODE_CONTINUE:
+		case RC_OPCODE_BRK:
+		case RC_OPCODE_BGNLOOP:
+			break;
+		case RC_OPCODE_ENDIF:
+			push_branch(&s);
+			break;
+		default:
+			if (opcode->IsFlowControl && s.BranchStackSize) {
+				struct branchinfo * branch = &s.BranchStack[s.BranchStackSize-1];
+				if (opcode->Opcode == RC_OPCODE_IF) {
+					or_updatemasks(&s.R,
+							&s.R,
+							branch->HaveElse ? &branch->StoreElse : &branch->StoreEndif);
+
+					s.BranchStackSize--;
+				} else if (opcode->Opcode == RC_OPCODE_ELSE) {
+					if (branch->HaveElse) {
+						rc_error(c, "%s: Multiple ELSE for one IF/ENDIF\n", __FUNCTION__);
 					} else {
-						rc_error(c, "%s: Unhandled control flow instruction %s\n", __FUNCTION__, opcode->Name);
+						memcpy(&branch->StoreElse, &s.R, sizeof(s.R));
+						memcpy(&s.R, &branch->StoreEndif, sizeof(s.R));
+						branch->HaveElse = 1;
 					}
 				} else {
-					rc_error(c, "%s: Unexpected control flow instruction\n", __FUNCTION__);
+					rc_error(c, "%s: Unhandled control flow instruction %s\n", __FUNCTION__, opcode->Name);
 				}
 			}
 		}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
index 4c5d29f4217..131e9e7436d 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
@@ -38,22 +38,6 @@
 
 #define DBG(...) do { if (VERBOSE) fprintf(stderr, __VA_ARGS__); } while(0)
 
-struct emulate_loop_state {
-	struct radeon_compiler * C;
-	struct loop_info * Loops;
-	unsigned int LoopCount;
-	unsigned int LoopReserved;
-};
-
-struct loop_info {
-	struct rc_instruction * BeginLoop;
-	struct rc_instruction * Cond;
-	struct rc_instruction * If;
-	struct rc_instruction * Brk;
-	struct rc_instruction * EndIf;
-	struct rc_instruction * EndLoop;
-};
-
 struct const_value {
 	
 	struct radeon_compiler * C;
@@ -94,22 +78,13 @@ static int src_reg_is_immediate(struct rc_src_register * src,
 	c->Program.Constants.Constants[src->Index].Type==RC_CONSTANT_IMMEDIATE;
 }
 
-static unsigned int loop_count_instructions(struct loop_info * loop)
+static unsigned int loop_calc_iterations(struct emulate_loop_state *s, 
+			struct loop_info * loop, unsigned int max_instructions)
 {
-	unsigned int count = 0;
-	struct rc_instruction * inst = loop->BeginLoop->Next;
-	while(inst != loop->EndLoop){
-		count++;
-		inst = inst->Next;
-	}
-	return count;
-}
-
-static unsigned int loop_calc_iterations(struct loop_info * loop,
-		unsigned int loop_count, unsigned int max_instructions)
-{
-	unsigned int icount = loop_count_instructions(loop);
-	return max_instructions / (loop_count * icount);
+	unsigned int total_i = rc_recompute_ips(s->C);
+	unsigned int loop_i = (loop->EndLoop->IP - loop->BeginLoop->IP) - 1;
+	/* +1 because the program already has one iteration of the loop. */
+	return 1 + ((max_instructions - total_i) / (s->LoopCount * loop_i));
 }
 
 static void loop_unroll(struct emulate_loop_state * s,
@@ -214,8 +189,7 @@ static void get_incr_amount(void * data, struct rc_instruction * inst,
 }
 
 static int transform_const_loop(struct emulate_loop_state * s,
-						struct loop_info * loop,
-						struct rc_instruction * cond)
+						struct loop_info * loop)
 {
 	int end_loops = 1;
 	int iterations;
@@ -228,13 +202,13 @@ static int transform_const_loop(struct emulate_loop_state * s,
 
 	/* Find the counter and the upper limit */
 	
-	if(src_reg_is_immediate(&cond->U.I.SrcReg[0], s->C)){
-		limit = &cond->U.I.SrcReg[0];
-		counter = &cond->U.I.SrcReg[1];
+	if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[0], s->C)){
+		limit = &loop->Cond->U.I.SrcReg[0];
+		counter = &loop->Cond->U.I.SrcReg[1];
 	}
-	else if(src_reg_is_immediate(&cond->U.I.SrcReg[1], s->C)){
-		limit = &cond->U.I.SrcReg[1];
-		counter = &cond->U.I.SrcReg[0];
+	else if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[1], s->C)){
+		limit = &loop->Cond->U.I.SrcReg[1];
+		counter = &loop->Cond->U.I.SrcReg[0];
 	}
 	else{
 		DBG("No constant limit.\n");
@@ -293,8 +267,22 @@ static int transform_const_loop(struct emulate_loop_state * s,
 	 * simple, since we only support increment and decrement loops.
 	 */
 	limit_value = get_constant_value(s->C, limit, 0);
-	iterations = (int) ((limit_value - counter_value.Value) /
+	DBG("Limit is %f.\n", limit_value);
+	switch(loop->Cond->U.I.Opcode){
+	case RC_OPCODE_SGT:
+	case RC_OPCODE_SLT:
+		iterations = (int) ceilf((limit_value - counter_value.Value) /
 							count_inst.Amount);
+		break;
+
+	case RC_OPCODE_SLE:
+	case RC_OPCODE_SGE:
+		iterations = (int) floorf((limit_value - counter_value.Value) /
+							count_inst.Amount) + 1;
+		break;
+	default:
+		return 0;
+	}
 
 	DBG("Loop will have %d iterations.\n", iterations);
 	
@@ -414,7 +402,7 @@ static struct rc_instruction * transform_loop(struct emulate_loop_state * s,
 	}
 	
 	/* Check if the number of loops is known at compile time. */
-	if(transform_const_loop(s, loop, ptr)){
+	if(transform_const_loop(s, loop)){
 		return loop->BeginLoop->Next;
 	}
 
@@ -425,9 +413,14 @@ static struct rc_instruction * transform_loop(struct emulate_loop_state * s,
 	return loop->EndLoop;
 }
 
-static void rc_transform_loops(struct emulate_loop_state * s)
+void rc_transform_unroll_loops(struct radeon_compiler *c,
+					struct emulate_loop_state * s)
 {
-	struct rc_instruction * ptr = s->C->Program.Instructions.Next;
+	struct rc_instruction * ptr;
+	
+	memset(s, 0, sizeof(struct emulate_loop_state));
+	s->C = c;
+	ptr = s->C->Program.Instructions.Next;
 	while(ptr != &s->C->Program.Instructions) {
 		if(ptr->Type == RC_INSTRUCTION_NORMAL &&
 					ptr->U.I.Opcode == RC_OPCODE_BGNLOOP){
@@ -440,7 +433,7 @@ static void rc_transform_loops(struct emulate_loop_state * s)
 	}
 }
 
-static void rc_unroll_loops(struct emulate_loop_state *s,
+void rc_emulate_loops(struct emulate_loop_state *s,
 						unsigned int max_instructions)
 {
 	int i;
@@ -451,24 +444,8 @@ static void rc_unroll_loops(struct emulate_loop_state *s,
 		if(!s->Loops[i].EndLoop){
 			continue;
 		}
-		unsigned int iterations = loop_calc_iterations(&s->Loops[i],
-						s->LoopCount, max_instructions);
+		unsigned int iterations = loop_calc_iterations(s, &s->Loops[i],
+							max_instructions);
 		loop_unroll(s, &s->Loops[i], iterations);
 	}
 }
-
-void rc_emulate_loops(struct radeon_compiler *c, unsigned int max_instructions)
-{
-	struct emulate_loop_state s;
-
-	memset(&s, 0, sizeof(struct emulate_loop_state));
-	s.C = c;
-
-	/* We may need to move these two operations to r3xx_(vert|frag)prog.c
-	 * and run the optimization passes between them in order to increase
-	 * the number of unrolls we can do for each loop.
-	 */
-	rc_transform_loops(&s);
-	
-	rc_unroll_loops(&s, max_instructions);
-}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
index ddcf1c0fabe..7748813c4eb 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
@@ -7,6 +7,26 @@
 
 struct radeon_compiler;
 
-void rc_emulate_loops(struct radeon_compiler *c, unsigned int max_instructions);
+struct loop_info {
+	struct rc_instruction * BeginLoop;
+	struct rc_instruction * Cond;
+	struct rc_instruction * If;
+	struct rc_instruction * Brk;
+	struct rc_instruction * EndIf;
+	struct rc_instruction * EndLoop;
+};
+
+struct emulate_loop_state {
+	struct radeon_compiler * C;
+	struct loop_info * Loops;
+	unsigned int LoopCount;
+	unsigned int LoopReserved;
+};
+
+void rc_transform_unroll_loops(struct radeon_compiler *c,
+					struct emulate_loop_state * s);
+
+void rc_emulate_loops(struct emulate_loop_state *s,
+					unsigned int max_instructions);
 
 #endif /* RADEON_EMULATE_LOOPS_H */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
index 1dc16855dc1..04f234f11d8 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
@@ -386,6 +386,12 @@ struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = {
 		.NumSrcRegs = 0,
 	},
 	{
+		.Opcode = RC_OPCODE_CONTINUE,
+		.Name = "CONTINUE",
+		.IsFlowControl = 1,
+		.NumSrcRegs = 0
+	},
+	{
 		.Opcode = RC_OPCODE_REPL_ALPHA,
 		.Name = "REPL_ALPHA",
 		.HasDstReg = 1
@@ -393,6 +399,10 @@ struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = {
 	{
 		.Opcode = RC_OPCODE_BEGIN_TEX,
 		.Name = "BEGIN_TEX"
+	},
+	{
+		.Opcode = RC_OPCODE_KILP,
+		.Name = "KILP",
 	}
 };
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
index 91c82ac0890..8b9fa07dde2 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
@@ -187,6 +187,8 @@ typedef enum {
 
 	RC_OPCODE_ENDLOOP,
 
+	RC_OPCODE_CONTINUE,
+
 	/** special instruction, used in R300-R500 fragment program pair instructions
 	 * indicates that the result of the alpha operation shall be replicated
 	 * across all other channels */
@@ -197,6 +199,9 @@ typedef enum {
 	 * can run simultaneously. */
 	RC_OPCODE_BEGIN_TEX,
 
+	/** Stop execution of the shader (GLSL discard) */
+	RC_OPCODE_KILP,
+
 	MAX_RC_OPCODE
 } rc_opcode;
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
index 21d72108886..eca06515367 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
@@ -75,6 +75,15 @@ struct peephole_state {
 	int BranchDepth;
 };
 
+/**
+ * This is a callback function that is meant to be passed to
+ * rc_for_all_reads_mask.  This function will be called once for each source
+ * register in inst.
+ * @param inst The instruction that the source register belongs to.
+ * @param file The register file of the source register.
+ * @param index The index of the source register.
+ * @param mask The components of the source register that are being read from.
+ */
 static void peephole_scan_read(void * data, struct rc_instruction * inst,
 		rc_register_file file, unsigned int index, unsigned int mask)
 {
@@ -153,6 +162,11 @@ static void peephole(struct radeon_compiler * c, struct rc_instruction * inst_mo
 	for(struct rc_instruction * inst = inst_mov->Next;
 	    inst != &c->Program.Instructions;
 	    inst = inst->Next) {
+		/* XXX In the future we might be able to make the optimizer
+		 * smart enough to handle loops. */
+		if(inst->U.I.Opcode == RC_OPCODE_BGNLOOP){
+			return;
+		}
 		rc_for_all_reads_mask(inst, peephole_scan_read, &s);
 		rc_for_all_writes_mask(inst, peephole_scan_write, &s);
 		if (s.Conflict)
@@ -161,7 +175,8 @@ static void peephole(struct radeon_compiler * c, struct rc_instruction * inst_mo
 		if (s.BranchDepth >= 0) {
 			if (inst->U.I.Opcode == RC_OPCODE_IF) {
 				s.BranchDepth++;
-			} else if (inst->U.I.Opcode == RC_OPCODE_ENDIF) {
+			} else if (inst->U.I.Opcode == RC_OPCODE_ENDIF
+				|| inst->U.I.Opcode == RC_OPCODE_ELSE) {
 				s.BranchDepth--;
 				if (s.BranchDepth < 0) {
 					s.DefinedMask &= ~s.MovMask;
@@ -208,7 +223,8 @@ static void peephole(struct radeon_compiler * c, struct rc_instruction * inst_mo
 		if (s.BranchDepth >= 0) {
 			if (inst->U.I.Opcode == RC_OPCODE_IF) {
 				s.BranchDepth++;
-			} else if (inst->U.I.Opcode == RC_OPCODE_ENDIF) {
+			} else if (inst->U.I.Opcode == RC_OPCODE_ENDIF
+				|| inst->U.I.Opcode == RC_OPCODE_ELSE) {
 				s.BranchDepth--;
 				if (s.BranchDepth < 0)
 					break; /* no more readers after this point */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c b/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c
index a279549ff89..fc540496c41 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c
@@ -141,12 +141,28 @@ static void add_inst_to_list(struct schedule_instruction ** list, struct schedul
 	*list = inst;
 }
 
+static void add_inst_to_list_end(struct schedule_instruction ** list,
+					struct schedule_instruction * inst)
+{
+	if(!*list){
+		*list = inst;
+	}else{
+		struct schedule_instruction * temp = *list;
+		while(temp->NextReady){
+			temp = temp->NextReady;
+		}
+		temp->NextReady = inst;
+	}
+}
+
 static void instruction_ready(struct schedule_state * s, struct schedule_instruction * sinst)
 {
 	DBG("%i is now ready\n", sinst->Instruction->IP);
 
+	/* Adding Ready TEX instructions to the end of the "Ready List" helps
+	 * us emit TEX instructions in blocks without losing our place. */
 	if (sinst->Instruction->Type == RC_INSTRUCTION_NORMAL)
-		add_inst_to_list(&s->ReadyTEX, sinst);
+		add_inst_to_list_end(&s->ReadyTEX, sinst);
 	else if (sinst->Instruction->U.P.Alpha.Opcode == RC_OPCODE_NOP)
 		add_inst_to_list(&s->ReadyRGB, sinst);
 	else if (sinst->Instruction->U.P.RGB.Opcode == RC_OPCODE_NOP)
@@ -163,11 +179,14 @@ static void decrease_dependencies(struct schedule_state * s, struct schedule_ins
 		instruction_ready(s, sinst);
 }
 
-static void commit_instruction(struct schedule_state * s, struct schedule_instruction * sinst)
-{
-	DBG("%i: commit\n", sinst->Instruction->IP);
-
-	for(unsigned int i = 0; i < sinst->NumReadValues; ++i) {
+/**
+ * This function decreases the dependencies of the next instruction that
+ * wants to write to each of sinst's read values.
+ */
+static void commit_update_reads(struct schedule_state * s,
+					struct schedule_instruction * sinst){
+	unsigned int i;
+	for(i = 0; i < sinst->NumReadValues; ++i) {
 		struct reg_value * v = sinst->ReadValues[i];
 		assert(v->NumReaders > 0);
 		v->NumReaders--;
@@ -176,8 +195,12 @@ static void commit_instruction(struct schedule_state * s, struct schedule_instru
 				decrease_dependencies(s, v->Next->Writer);
 		}
 	}
+}
 
-	for(unsigned int i = 0; i < sinst->NumWriteValues; ++i) {
+static void commit_update_writes(struct schedule_state * s,
+					struct schedule_instruction * sinst){
+	unsigned int i;
+	for(i = 0; i < sinst->NumWriteValues; ++i) {
 		struct reg_value * v = sinst->WriteValues[i];
 		if (v->NumReaders) {
 			for(struct reg_value_reader * r = v->Readers; r; r = r->Next) {
@@ -196,6 +219,15 @@ static void commit_instruction(struct schedule_state * s, struct schedule_instru
 	}
 }
 
+static void commit_alu_instruction(struct schedule_state * s, struct schedule_instruction * sinst)
+{
+	DBG("%i: commit\n", sinst->Instruction->IP);
+
+	commit_update_reads(s, sinst);
+
+	commit_update_writes(s, sinst);
+}
+
 /**
  * Emit all ready texture instructions in a single block.
  *
@@ -208,21 +240,37 @@ static void emit_all_tex(struct schedule_state * s, struct rc_instruction * befo
 
 	assert(s->ReadyTEX);
 
-	/* Don't let the ready list change under us! */
-	readytex = s->ReadyTEX;
-	s->ReadyTEX = 0;
-
 	/* Node marker for R300 */
 	struct rc_instruction * inst_begin = rc_insert_new_instruction(s->C, before->Prev);
 	inst_begin->U.I.Opcode = RC_OPCODE_BEGIN_TEX;
 
 	/* Link texture instructions back in */
+	readytex = s->ReadyTEX;
 	while(readytex) {
-		struct schedule_instruction * tex = readytex;
+		rc_insert_instruction(before->Prev, readytex->Instruction);
+		DBG("%i: commit TEX reads\n", readytex->Instruction->IP);
+
+		/* All of the TEX instructions in the same TEX block have
+		 * their source registers read from before any of the
+		 * instructions in that block write to their destination
+		 * registers.  This means that when we commit a TEX
+		 * instruction, any other TEX instruction that wants to write
+		 * to one of the committed instruction's source registers can be
+		 * marked as ready and should be emitted in the same TEX
+		 * block. This prevents the following sequence from being
+		 * emitted in two different TEX blocks:
+		 * 0: TEX temp[0].xyz, temp[1].xy__, 2D[0];
+		 * 1: TEX temp[1].xyz, temp[2].xy__, 2D[0];
+		 */
+		commit_update_reads(s, readytex);
+		readytex = readytex->NextReady;
+	}
+	readytex = s->ReadyTEX;
+	s->ReadyTEX = 0;
+	while(readytex){
+		DBG("%i: commit TEX writes\n", readytex->Instruction->IP);
+		commit_update_writes(s, readytex);
 		readytex = readytex->NextReady;
-
-		rc_insert_instruction(before->Prev, tex->Instruction);
-		commit_instruction(s, tex);
 	}
 }
 
@@ -328,7 +376,7 @@ static void emit_one_alu(struct schedule_state *s, struct rc_instruction * befor
 		}
 
 		rc_insert_instruction(before->Prev, sinst->Instruction);
-		commit_instruction(s, sinst);
+		commit_alu_instruction(s, sinst);
 	} else {
 		struct schedule_instruction **prgb;
 		struct schedule_instruction **palpha;
@@ -346,8 +394,8 @@ static void emit_one_alu(struct schedule_state *s, struct rc_instruction * befor
 				*prgb = (*prgb)->NextReady;
 				*palpha = (*palpha)->NextReady;
 				rc_insert_instruction(before->Prev, psirgb->Instruction);
-				commit_instruction(s, psirgb);
-				commit_instruction(s, psialpha);
+				commit_alu_instruction(s, psirgb);
+				commit_alu_instruction(s, psialpha);
 				goto success;
 			}
 		}
@@ -357,7 +405,7 @@ static void emit_one_alu(struct schedule_state *s, struct rc_instruction * befor
 		s->ReadyRGB = s->ReadyRGB->NextReady;
 
 		rc_insert_instruction(before->Prev, sinst->Instruction);
-		commit_instruction(s, sinst);
+		commit_alu_instruction(s, sinst);
 	success: ;
 	}
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
index c922d3d9a44..3cc28972934 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
@@ -973,3 +973,32 @@ int radeonTransformDeriv(struct radeon_compiler* c,
 
 	return 1;
 }
+
+/**
+ * IF Temp[0].x -\
+ * KILP         - > KIL -abs(Temp[0].x)
+ * ENDIF        -/
+ *
+ * This needs to be done in its own pass, because it modifies the instructions
+ * before and after KILP.
+ */
+void radeonTransformKILP(struct radeon_compiler * c)
+{
+	struct rc_instruction * inst;
+	for (inst = c->Program.Instructions.Next;
+			inst != &c->Program.Instructions; inst = inst->Next) {
+
+		if (inst->U.I.Opcode != RC_OPCODE_KILP
+			|| inst->Prev->U.I.Opcode != RC_OPCODE_IF
+			|| inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) {
+			continue;
+		}
+		inst->U.I.Opcode = RC_OPCODE_KIL;
+		inst->U.I.SrcReg[0] = negate(absolute(inst->Prev->U.I.SrcReg[0]));
+
+		/* Remove IF */
+		rc_remove_instruction(inst->Prev);
+		/* Remove ENDIF */
+		rc_remove_instruction(inst->Next);
+	}
+}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.h b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.h
index 77d444476f2..e6e2cc20c5a 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.h
@@ -60,4 +60,6 @@ int radeonTransformDeriv(
 	struct rc_instruction * inst,
 	void*);
 
+void radeonTransformKILP(struct radeon_compiler * c);
+
 #endif /* __RADEON_PROGRAM_ALU_H_ */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.c b/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.c
new file mode 100644
index 00000000000..31c98668838
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2010 Tom Stellard <[email protected]>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * \file
+ */
+
+#include "radeon_rename_regs.h"
+
+#include "radeon_compiler.h"
+#include "radeon_dataflow.h"
+
+struct reg_rename {
+	int old_index;
+	int new_index;
+	int temp_index;
+};
+
+static void rename_reg(void * data, struct rc_instruction * inst,
+			rc_register_file * file, unsigned int * index)
+{
+	struct reg_rename *r = data;
+
+	if(r->old_index == *index && *file == RC_FILE_TEMPORARY) {
+		*index = r->new_index;
+	}
+	else if(r->new_index == *index && *file == RC_FILE_TEMPORARY) {
+		*index = r->temp_index;
+	}
+}
+
+static void rename_all(
+	struct radeon_compiler *c,
+	struct rc_instruction * start,
+	unsigned int old,
+	unsigned int new,
+	unsigned int temp)
+{
+	struct rc_instruction * inst;
+	struct reg_rename r;
+	r.old_index = old;
+	r.new_index = new;
+	r.temp_index = temp;
+	for(inst = start; inst != &c->Program.Instructions;
+						inst = inst->Next) {
+		rc_remap_registers(inst, rename_reg, &r);
+	}
+}
+
+/**
+ * This function renames registers in an attempt to get the code close to
+ * SSA form.  After this function has completed, most of the registers are only
+ * written to one time, with a few exceptions.  For example, this block of code
+ * will not be modified by this function:
+ * Mov Temp[0].x Const[0].x
+ * Mov Temp[0].y Const[0].y
+ * Basically, destination registers will be renamed if:
+ * 1. There have been no previous writes to that register
+ * or
+ * 2. If the instruction is writing to the exact components (no more, no less)
+ * of a register that has been written to by previous instructions.
+ *
+ * This function assumes all the instructions are still of type
+ * RC_INSTRUCTION_NORMAL.
+ */
+void rc_rename_regs(struct radeon_compiler * c)
+{
+	unsigned int cur_index = 0;
+	unsigned int icount;
+	struct rc_instruction * inst;
+	unsigned int * masks;
+
+	/* The number of instructions in the program is also the maximum
+	 * number of temp registers that could potentially be used. */
+	icount = rc_recompute_ips(c);
+	masks = memory_pool_malloc(&c->Pool, icount * sizeof(unsigned int));
+	memset(masks, 0, icount * sizeof(unsigned int));
+
+	for(inst = c->Program.Instructions.Next;
+					inst != &c->Program.Instructions;
+					inst = inst->Next) {
+		const struct rc_opcode_info * info;
+		if(inst->Type != RC_INSTRUCTION_NORMAL) {
+			rc_error(c, "%s only works with normal instructions.",
+								__FUNCTION__);
+			return;
+		}
+		unsigned int old_index, temp_index;
+		struct rc_dst_register * dst = &inst->U.I.DstReg;
+		info = rc_get_opcode_info(inst->U.I.Opcode);
+		if(!info->HasDstReg || dst->File != RC_FILE_TEMPORARY) {
+			continue;
+		}
+		if(dst->Index >= icount || !masks[dst->Index] ||
+					masks[dst->Index] == dst->WriteMask) {
+			old_index = dst->Index;
+			/* We need to set dst->Index here so that
+			 * rc_find_free_temporary() will work. */
+			dst->Index = cur_index++;
+			temp_index = rc_find_free_temporary(c);
+			rename_all(c, inst->Next, old_index,
+						dst->Index, temp_index);
+		}
+		assert(dst->Index < icount);
+		masks[dst->Index] |= dst->WriteMask;
+	}
+}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.h b/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.h
new file mode 100644
index 00000000000..4323b995d84
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.h
@@ -0,0 +1,9 @@
+
+#ifndef RADEON_RENAME_REGS_H
+#define RADEON_RENAME_REGS_H
+
+struct radeon_compiler;
+
+void rc_rename_regs(struct radeon_compiler * c);
+
+#endif /* RADEON_RENAME_REGS_H */
diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c
index 6992ca59dbf..e4b302bbad9 100644
--- a/src/mesa/drivers/dri/r300/r300_context.c
+++ b/src/mesa/drivers/dri/r300/r300_context.c
@@ -376,13 +376,12 @@ static void r300InitConstValues(GLcontext *ctx, radeonScreenPtr screen)
 	ctx->Const.MaxDrawBuffers = 1;
 	ctx->Const.MaxColorAttachments = 1;
 
-	/* currently bogus data */
 	if (r300->options.hw_tcl_enabled) {
-		ctx->Const.VertexProgram.MaxNativeInstructions = VSF_MAX_FRAGMENT_LENGTH / 4;
-		ctx->Const.VertexProgram.MaxNativeAluInstructions = VSF_MAX_FRAGMENT_LENGTH / 4;
-		ctx->Const.VertexProgram.MaxNativeAttribs = 16;	/* r420 */
+		ctx->Const.VertexProgram.MaxNativeInstructions = 255;
+		ctx->Const.VertexProgram.MaxNativeAluInstructions = 255;
+		ctx->Const.VertexProgram.MaxNativeAttribs = 16;
 		ctx->Const.VertexProgram.MaxNativeTemps = 32;
-		ctx->Const.VertexProgram.MaxNativeParameters = 256;	/* r420 */
+		ctx->Const.VertexProgram.MaxNativeParameters = 256;
 		ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
 	}
 
diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h
index fbb609b9f61..99540e3354f 100644
--- a/src/mesa/drivers/dri/r300/r300_context.h
+++ b/src/mesa/drivers/dri/r300/r300_context.h
@@ -43,7 +43,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_common.h"
 
 #include "main/mtypes.h"
-#include "shader/prog_instruction.h"
+#include "program/prog_instruction.h"
 #include "compiler/radeon_code.h"
 
 struct r300_context;
diff --git a/src/mesa/drivers/dri/r300/r300_draw.c b/src/mesa/drivers/dri/r300/r300_draw.c
index 282c0e18bca..5ae9f49840b 100644
--- a/src/mesa/drivers/dri/r300/r300_draw.c
+++ b/src/mesa/drivers/dri/r300/r300_draw.c
@@ -523,8 +523,7 @@ static void r300AllocDmaRegions(GLcontext *ctx, const struct gl_client_array *in
 			r300ConvertAttrib(ctx, count, input[i], &vbuf->attribs[index]);
 		} else {
 			if (input[i]->BufferObj->Name) {
-				if (stride % 4 != 0) {
-					assert(((intptr_t) input[i]->Ptr) % input[i]->StrideB == 0);
+				if (stride % 4 != 0 || (intptr_t)input[i]->Ptr % 4 != 0) {
 					r300AlignDataToDword(ctx, input[i], count, &vbuf->attribs[index]);
 					vbuf->attribs[index].is_named_bo = GL_FALSE;
 				} else {
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_common.c b/src/mesa/drivers/dri/r300/r300_fragprog_common.c
index 7be2f74b5b2..95f4306f604 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog_common.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog_common.c
@@ -38,8 +38,8 @@
 
 #include "r300_fragprog_common.h"
 
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
+#include "program/prog_parameter.h"
+#include "program/prog_print.h"
 
 #include "compiler/radeon_compiler.h"
 
diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h
index ac93563ed9e..f25264b6f2d 100644
--- a/src/mesa/drivers/dri/r300/r300_reg.h
+++ b/src/mesa/drivers/dri/r300/r300_reg.h
@@ -3066,8 +3066,8 @@ enum {
 #   define R500_FC_B_OP0_NONE				(0 << 24)
 #   define R500_FC_B_OP0_DECR				(1 << 24)
 #   define R500_FC_B_OP0_INCR				(2 << 24)
-#   define R500_FC_B_OP1_DECR				(0 << 26)
-#   define R500_FC_B_OP1_NONE				(1 << 26)
+#   define R500_FC_B_OP1_NONE				(0 << 26)
+#   define R500_FC_B_OP1_DECR				(1 << 26)
 #   define R500_FC_B_OP1_INCR				(2 << 26)
 #   define R500_FC_IGNORE_UNCOVERED			(1 << 28)
 #define R500_US_FC_INT_CONST_0				0x4c00
diff --git a/src/mesa/drivers/dri/r300/r300_shader.c b/src/mesa/drivers/dri/r300/r300_shader.c
index 9c24166ec5b..a9bddf05779 100644
--- a/src/mesa/drivers/dri/r300/r300_shader.c
+++ b/src/mesa/drivers/dri/r300/r300_shader.c
@@ -27,7 +27,7 @@
 
 #include "main/glheader.h"
 
-#include "shader/program.h"
+#include "program/program.h"
 #include "tnl/tnl.h"
 #include "r300_context.h"
 #include "r300_fragprog_common.h"
diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c
index fa33be49989..0113eecaa3a 100644
--- a/src/mesa/drivers/dri/r300/r300_state.c
+++ b/src/mesa/drivers/dri/r300/r300_state.c
@@ -49,8 +49,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "drivers/common/meta.h"
 #include "swrast/swrast.h"
 #include "swrast_setup/swrast_setup.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_statevars.h"
+#include "program/prog_parameter.h"
+#include "program/prog_statevars.h"
 #include "vbo/vbo.h"
 #include "tnl/tnl.h"
 
diff --git a/src/mesa/drivers/dri/r300/r300_vertprog.c b/src/mesa/drivers/dri/r300/r300_vertprog.c
index a1fe3780294..67d8b2b3286 100644
--- a/src/mesa/drivers/dri/r300/r300_vertprog.c
+++ b/src/mesa/drivers/dri/r300/r300_vertprog.c
@@ -31,12 +31,12 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/glheader.h"
 #include "main/macros.h"
 #include "main/enums.h"
-#include "shader/program.h"
-#include "shader/programopt.h"
-#include "shader/prog_instruction.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-#include "shader/prog_statevars.h"
+#include "program/program.h"
+#include "program/programopt.h"
+#include "program/prog_instruction.h"
+#include "program/prog_parameter.h"
+#include "program/prog_print.h"
+#include "program/prog_statevars.h"
 #include "tnl/tnl.h"
 
 #include "compiler/radeon_compiler.h"
diff --git a/src/mesa/drivers/dri/r300/radeon_mesa_to_rc.c b/src/mesa/drivers/dri/r300/radeon_mesa_to_rc.c
index 9f9dec840b4..471a3723cb9 100644
--- a/src/mesa/drivers/dri/r300/radeon_mesa_to_rc.c
+++ b/src/mesa/drivers/dri/r300/radeon_mesa_to_rc.c
@@ -28,8 +28,8 @@
 #include "radeon_mesa_to_rc.h"
 
 #include "main/mtypes.h"
-#include "shader/prog_instruction.h"
-#include "shader/prog_parameter.h"
+#include "program/prog_instruction.h"
+#include "program/prog_parameter.h"
 
 #include "compiler/radeon_compiler.h"
 #include "compiler/radeon_program.h"
diff --git a/src/mesa/drivers/dri/r600/r600_cmdbuf.c b/src/mesa/drivers/dri/r600/r600_cmdbuf.c
index afe2d55dc7c..8013553f679 100644
--- a/src/mesa/drivers/dri/r600/r600_cmdbuf.c
+++ b/src/mesa/drivers/dri/r600/r600_cmdbuf.c
@@ -46,7 +46,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r600_context.h"
 #include "radeon_reg.h"
 #include "r600_cmdbuf.h"
-#include "r600_emit.h"
 #include "radeon_bocs_wrapper.h"
 #include "radeon_reg.h"
 
diff --git a/src/mesa/drivers/dri/r600/r600_cmdbuf.h b/src/mesa/drivers/dri/r600/r600_cmdbuf.h
index dff00096999..78fccd0b601 100644
--- a/src/mesa/drivers/dri/r600/r600_cmdbuf.h
+++ b/src/mesa/drivers/dri/r600/r600_cmdbuf.h
@@ -37,7 +37,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define __R600_CMDBUF_H__
 
 #include "r600_context.h"
-#include "r600_emit.h"
 
 #define RADEON_CP_PACKET3_NOP                       0xC0001000
 #define RADEON_CP_PACKET3_NEXT_CHAR                 0xC0001900
diff --git a/src/mesa/drivers/dri/r600/r600_context.c b/src/mesa/drivers/dri/r600/r600_context.c
index f4aed4e87fd..84d9d423124 100644
--- a/src/mesa/drivers/dri/r600/r600_context.c
+++ b/src/mesa/drivers/dri/r600/r600_context.c
@@ -59,7 +59,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_buffer_objects.h"
 #include "radeon_span.h"
 #include "r600_cmdbuf.h"
-#include "r600_emit.h"
 #include "radeon_bocs_wrapper.h"
 #include "radeon_queryobj.h"
 #include "r600_blit.h"
diff --git a/src/mesa/drivers/dri/r600/r700_assembler.c b/src/mesa/drivers/dri/r600/r700_assembler.c
index de5c5d89fea..99a33df4fcb 100644
--- a/src/mesa/drivers/dri/r600/r700_assembler.c
+++ b/src/mesa/drivers/dri/r600/r700_assembler.c
@@ -32,7 +32,7 @@
 
 #include "main/mtypes.h"
 #include "main/imports.h"
-#include "shader/prog_parameter.h"
+#include "program/prog_parameter.h"
 
 #include "radeon_debug.h"
 #include "r600_context.h"
@@ -293,7 +293,9 @@ GLuint GetSurfaceFormat(GLenum eType, GLuint nChannels, GLuint * pClient_size)
                 case 2:
                     format = FMT_16_16; break;
                 case 3:
-                    format = FMT_16_16_16; break;
+                    /* 3 comp GL_SHORT vertex format doesn't work on r700
+                       4 somehow works, test - sauerbraten  */
+                    format = FMT_16_16_16_16; break;
                 case 4:
                     format = FMT_16_16_16_16; break;
                 default:
@@ -1262,7 +1264,7 @@ GLboolean checkop3(r700_AssemblerBase* pAsm)
 	    {
             if( GL_FALSE == mov_temp(pAsm, 1) )
             {
-                return 1;
+                return GL_FALSE;
             }
         }
 
diff --git a/src/mesa/drivers/dri/r600/r700_assembler.h b/src/mesa/drivers/dri/r600/r700_assembler.h
index 2d3c32487e6..dbc6cdb1903 100644
--- a/src/mesa/drivers/dri/r600/r700_assembler.h
+++ b/src/mesa/drivers/dri/r600/r700_assembler.h
@@ -28,7 +28,7 @@
 #define _R700_ASSEMBLER_H_
 
 #include "main/mtypes.h"
-#include "shader/prog_instruction.h"
+#include "program/prog_instruction.h"
 
 #include "r700_chip.h"
 #include "r700_shaderinst.h"
diff --git a/src/mesa/drivers/dri/r600/r700_chip.h b/src/mesa/drivers/dri/r600/r700_chip.h
index ae249e15fd4..0b6b72f8501 100644
--- a/src/mesa/drivers/dri/r600/r700_chip.h
+++ b/src/mesa/drivers/dri/r600/r700_chip.h
@@ -27,7 +27,9 @@
 #ifndef _R700_CHIP_H_
 #define _R700_CHIP_H_
 
-#include "r600_context.h"
+#include <GL/gl.h>
+
+#include "radeon_common_context.h"
 
 #include "r600_reg.h"
 #include "r600_reg_auto_r6xx.h"
diff --git a/src/mesa/drivers/dri/r600/r700_fragprog.c b/src/mesa/drivers/dri/r600/r700_fragprog.c
index fbb808e0662..f9d84b6ed68 100644
--- a/src/mesa/drivers/dri/r600/r700_fragprog.c
+++ b/src/mesa/drivers/dri/r600/r700_fragprog.c
@@ -32,12 +32,13 @@
 #include <math.h>
 
 #include "main/imports.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_statevars.h"
-#include "shader/program.h"
+#include "program/prog_parameter.h"
+#include "program/prog_statevars.h"
+#include "program/program.h"
 
 #include "r600_context.h"
 #include "r600_cmdbuf.h"
+#include "r600_emit.h"
 
 #include "r700_fragprog.h"
 
@@ -586,7 +587,9 @@ GLboolean r700SetupFragmentProgram(GLcontext * ctx)
         SETfield(r700->SPI_INTERP_CONTROL_0.u32All, SPI_PNT_SPRITE_SEL_T, PNT_SPRITE_OVRD_Y_shift, PNT_SPRITE_OVRD_Y_mask);
         SETfield(r700->SPI_INTERP_CONTROL_0.u32All, SPI_PNT_SPRITE_SEL_0, PNT_SPRITE_OVRD_Z_shift, PNT_SPRITE_OVRD_Z_mask);
         SETfield(r700->SPI_INTERP_CONTROL_0.u32All, SPI_PNT_SPRITE_SEL_1, PNT_SPRITE_OVRD_W_shift, PNT_SPRITE_OVRD_W_mask);
-        if(ctx->Point.SpriteOrigin == GL_LOWER_LEFT)
+        /* Like e.g. viewport and winding, point sprite coordinates are
+         * inverted when rendering to FBO. */
+        if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == !ctx->DrawBuffer->Name)
             SETbit(r700->SPI_INTERP_CONTROL_0.u32All, PNT_SPRITE_TOP_1_bit);
         else
             CLEARbit(r700->SPI_INTERP_CONTROL_0.u32All, PNT_SPRITE_TOP_1_bit);
diff --git a/src/mesa/drivers/dri/r600/r700_oglprog.c b/src/mesa/drivers/dri/r600/r700_oglprog.c
index b7124e644a3..83517925115 100644
--- a/src/mesa/drivers/dri/r600/r700_oglprog.c
+++ b/src/mesa/drivers/dri/r600/r700_oglprog.c
@@ -29,7 +29,7 @@
 #include "main/glheader.h"
 #include "main/imports.h"
 
-#include "shader/program.h"
+#include "program/program.h"
 #include "tnl/tnl.h"
 
 #include "r600_context.h"
diff --git a/src/mesa/drivers/dri/r600/r700_oglprog.h b/src/mesa/drivers/dri/r600/r700_oglprog.h
index fe2e9d19749..4d421338678 100644
--- a/src/mesa/drivers/dri/r600/r700_oglprog.h
+++ b/src/mesa/drivers/dri/r600/r700_oglprog.h
@@ -27,7 +27,7 @@
 
 #ifndef _R700_OGLPROG_H_
 #define _R700_OGLPROG_H_
-#include "r600_context.h"
+#include "main/dd.h"
 
 extern void r700InitShaderFuncs(struct dd_function_table *functions);
 
diff --git a/src/mesa/drivers/dri/r600/r700_state.c b/src/mesa/drivers/dri/r600/r700_state.c
index ac64bbf874f..5ea8918611c 100644
--- a/src/mesa/drivers/dri/r600/r700_state.c
+++ b/src/mesa/drivers/dri/r600/r700_state.c
@@ -41,8 +41,8 @@
 #include "main/framebuffer.h"
 #include "drivers/common/meta.h"
 
-#include "shader/prog_parameter.h"
-#include "shader/prog_statevars.h"
+#include "program/prog_parameter.h"
+#include "program/prog_statevars.h"
 #include "vbo/vbo.h"
 
 #include "r600_context.h"
diff --git a/src/mesa/drivers/dri/r600/r700_vertprog.c b/src/mesa/drivers/dri/r600/r700_vertprog.c
index 14dd2a5482c..137f3007ced 100644
--- a/src/mesa/drivers/dri/r600/r700_vertprog.c
+++ b/src/mesa/drivers/dri/r600/r700_vertprog.c
@@ -35,14 +35,15 @@
 #include "main/mtypes.h"
 
 #include "tnl/t_context.h"
-#include "shader/program.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_statevars.h"
+#include "program/program.h"
+#include "program/prog_parameter.h"
+#include "program/prog_statevars.h"
 
 #include "radeon_debug.h"
 #include "r600_context.h"
 #include "r600_cmdbuf.h"
-#include "shader/programopt.h"
+#include "r600_emit.h"
+#include "program/programopt.h"
 
 #include "r700_debug.h"
 #include "r700_vertprog.h"
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.c b/src/mesa/drivers/dri/radeon/radeon_common_context.c
index 94f476617b6..5a7d52c4d2f 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.c
@@ -300,10 +300,10 @@ void radeonDestroyContext(__DRIcontext *driContextPriv )
 	_mesa_meta_free(radeon->glCtx);
 
 	if (radeon == current) {
-		radeon_firevertices(radeon);
 		_mesa_make_current(NULL, NULL, NULL);
 	}
 
+	radeon_firevertices(radeon);
 	if (!is_empty_list(&radeon->dma.reserved)) {
 		rcommonFlushCmdBuf( radeon, __FUNCTION__ );
 	}
diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
index 6cd1d87de24..c877e6c1765 100644
--- a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
+++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
@@ -602,17 +602,17 @@ int radeon_validate_texture_miptree(GLcontext * ctx, struct gl_texture_object *t
 			__FUNCTION__, texObj ,t->minLod, t->maxLod);
 
 	radeon_mipmap_tree *dst_miptree;
-	dst_miptree = get_biggest_matching_miptree(t, t->minLod, t->maxLod);
+	dst_miptree = get_biggest_matching_miptree(t, t->base.BaseLevel, t->base.MaxLevel);
 
+	radeon_miptree_unreference(&t->mt);
 	if (!dst_miptree) {
-		radeon_miptree_unreference(&t->mt);
 		radeon_try_alloc_miptree(rmesa, t);
-		dst_miptree = t->mt;
 		radeon_print(RADEON_TEXTURE, RADEON_NORMAL,
 			"%s: No matching miptree found, allocated new one %p\n",
 			__FUNCTION__, t->mt);
 
 	} else {
+		radeon_miptree_reference(dst_miptree, &t->mt);
 		radeon_print(RADEON_TEXTURE, RADEON_NORMAL,
 			"%s: Using miptree %p\n", __FUNCTION__, t->mt);
 	}
@@ -629,7 +629,7 @@ int radeon_validate_texture_miptree(GLcontext * ctx, struct gl_texture_object *t
 				"Checking image level %d, face %d, mt %p ... ",
 				level, face, img->mt);
 			
-			if (img->mt != dst_miptree) {
+			if (img->mt != t->mt) {
 				radeon_print(RADEON_TEXTURE, RADEON_TRACE,
 					"MIGRATING\n");
 
@@ -637,7 +637,7 @@ int radeon_validate_texture_miptree(GLcontext * ctx, struct gl_texture_object *t
 				if (src_bo && radeon_bo_is_referenced_by_cs(src_bo, rmesa->cmdbuf.cs)) {
 					radeon_firevertices(rmesa);
 				}
-				migrate_image_to_miptree(dst_miptree, img, face, level);
+				migrate_image_to_miptree(t->mt, img, face, level);
 			} else
 				radeon_print(RADEON_TEXTURE, RADEON_TRACE, "OK\n");
 		}
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c
index 4f59511a528..82107cc6aeb 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.c
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.c
@@ -52,7 +52,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_tex.h"
 #elif defined(RADEON_R200)
 #include "r200_context.h"
-#include "r200_ioctl.h"
 #include "r200_tex.h"
 #elif defined(RADEON_R300)
 #include "r300_context.h"
@@ -338,12 +337,6 @@ static const __DRItexBufferExtension radeonTexBufferExtension = {
 #endif
 
 #if defined(RADEON_R200)
-static const __DRIallocateExtension r200AllocateExtension = {
-    { __DRI_ALLOCATE, __DRI_ALLOCATE_VERSION },
-    r200AllocateMemoryMESA,
-    r200FreeMemoryMESA,
-    r200GetMemoryOffsetMESA
-};
 
 static const __DRItexOffsetExtension r200texOffsetExtension = {
     { __DRI_TEX_OFFSET, __DRI_TEX_OFFSET_VERSION },
@@ -1209,7 +1202,6 @@ radeonCreateScreen( __DRIscreen *sPriv )
 
    i = 0;
    screen->extensions[i++] = &driCopySubBufferExtension.base;
-   screen->extensions[i++] = &driFrameTrackingExtension.base;
    screen->extensions[i++] = &driReadDrawableExtension;
 
    if ( screen->irq != 0 ) {
@@ -1222,9 +1214,6 @@ radeonCreateScreen( __DRIscreen *sPriv )
 #endif
 
 #if defined(RADEON_R200)
-   if (IS_R200_CLASS(screen))
-      screen->extensions[i++] = &r200AllocateExtension.base;
-
    screen->extensions[i++] = &r200texOffsetExtension.base;
 #endif
 
@@ -1366,8 +1355,8 @@ radeonCreateScreen2(__DRIscreen *sPriv)
 
    i = 0;
    screen->extensions[i++] = &driCopySubBufferExtension.base;
-   screen->extensions[i++] = &driFrameTrackingExtension.base;
    screen->extensions[i++] = &driReadDrawableExtension;
+   screen->extensions[i++] = &dri2ConfigQueryExtension.base;
 
    if ( screen->irq != 0 ) {
        screen->extensions[i++] = &driSwapControlExtension.base;
@@ -1379,9 +1368,6 @@ radeonCreateScreen2(__DRIscreen *sPriv)
 #endif
 
 #if defined(RADEON_R200)
-   if (IS_R200_CLASS(screen))
-       screen->extensions[i++] = &r200AllocateExtension.base;
-
    screen->extensions[i++] = &r200TexBufferExtension.base;
 #endif
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_tex_getimage.c b/src/mesa/drivers/dri/radeon/radeon_tex_getimage.c
index 3ababb1ef53..f878b48e5f9 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tex_getimage.c
+++ b/src/mesa/drivers/dri/radeon/radeon_tex_getimage.c
@@ -31,6 +31,7 @@
 
 #include "radeon_common_context.h"
 #include "radeon_texture.h"
+#include "radeon_mipmap_tree.h"
 
 #include "main/texgetimage.h"
 
@@ -51,7 +52,15 @@ radeon_get_tex_image(GLcontext * ctx, GLenum target, GLint level,
                  __func__, ctx, texObj, image, compressed);
 
     if (image->mt) {
+        radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
         /* Map the texture image read-only */
+        if (radeon_bo_is_referenced_by_cs(image->mt->bo, rmesa->cmdbuf.cs)) {
+            radeon_print(RADEON_TEXTURE, RADEON_VERBOSE,
+                "%s: called for texture that is queued for GPU processing\n",
+                __func__);
+            radeon_firevertices(rmesa);
+        }
+
         radeon_teximage_map(image, GL_FALSE);
     } else {
         /* Image hasn't been uploaded to a miptree yet */
diff --git a/src/mesa/drivers/dri/sis/sis_state.c b/src/mesa/drivers/dri/sis/sis_state.c
index a22195ccceb..6173231a82e 100644
--- a/src/mesa/drivers/dri/sis/sis_state.c
+++ b/src/mesa/drivers/dri/sis/sis_state.c
@@ -37,6 +37,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "sis_lock.h"
 
 #include "main/context.h"
+#include "main/macros.h"
 #include "swrast/swrast.h"
 #include "vbo/vbo.h"
 #include "tnl/tnl.h"
diff --git a/src/mesa/drivers/dri/unichrome/via_screen.c b/src/mesa/drivers/dri/unichrome/via_screen.c
index ee10b569bf1..4b3e9d5a38f 100644
--- a/src/mesa/drivers/dri/unichrome/via_screen.c
+++ b/src/mesa/drivers/dri/unichrome/via_screen.c
@@ -166,7 +166,6 @@ viaInitDriver(__DRIscreen *sPriv)
     viaScreen->sareaPrivOffset = gDRIPriv->sarea_priv_offset;
 
     i = 0;
-    viaScreen->extensions[i++] = &driFrameTrackingExtension.base;
     viaScreen->extensions[i++] = &driReadDrawableExtension;
     if ( viaScreen->irqEnabled ) {
 	viaScreen->extensions[i++] = &driSwapControlExtension.base;
diff --git a/src/mesa/drivers/glslcompiler/glslcompiler.c b/src/mesa/drivers/glslcompiler/glslcompiler.c
index d58f32b2930..7259bf4c560 100644
--- a/src/mesa/drivers/glslcompiler/glslcompiler.c
+++ b/src/mesa/drivers/glslcompiler/glslcompiler.c
@@ -49,16 +49,14 @@
 #include "main/context.h"
 #include "main/extensions.h"
 #include "main/framebuffer.h"
-#include "main/shaders.h"
-#include "shader/shader_api.h"
-#include "shader/prog_print.h"
+#include "main/shaderapi.h"
+#include "main/shaderobj.h"
+#include "program/prog_print.h"
 #include "drivers/common/driverfuncs.h"
 #include "tnl/tnl.h"
 #include "tnl/t_context.h"
 #include "tnl/t_pipeline.h"
 #include "swrast/swrast.h"
-#include "swrast/s_context.h"
-#include "swrast/s_triangle.h"
 #include "swrast_setup/swrast_setup.h"
 #include "vbo/vbo.h"
 
@@ -72,6 +70,7 @@ struct options {
    gl_prog_print_mode Mode;
    const char *VertFile;
    const char *FragFile;
+   const char *GeoFile;
    const char *OutputFile;
    GLboolean Params;
    struct gl_sl_pragmas Pragmas;
@@ -126,6 +125,7 @@ CreateContext(void)
          _mesa_destroy_visual(vis);
       if (buf)
          _mesa_destroy_framebuffer(buf);
+      free(cc);
       return GL_FALSE;
    }
 
@@ -143,6 +143,7 @@ CreateContext(void)
        !_tnl_CreateContext( ctx ) ||
        !_swsetup_CreateContext( ctx )) {
       _mesa_destroy_visual(vis);
+      _mesa_destroy_framebuffer(buf);
       _mesa_free_context_data(ctx);
       free(cc);
       return GL_FALSE;
@@ -251,7 +252,8 @@ CompileShader(const char *filename, GLenum type)
    GLuint shader;
 
    assert(type == GL_FRAGMENT_SHADER ||
-          type == GL_VERTEX_SHADER);
+          type == GL_VERTEX_SHADER ||
+          type == GL_GEOMETRY_SHADER_ARB);
 
    shader = _mesa_CreateShader(type);
    ReadShader(shader, filename);
@@ -267,6 +269,7 @@ Usage(void)
    printf("Usage:\n");
    printf("  --vs FILE          vertex shader input filename\n");
    printf("  --fs FILE          fragment shader input filename\n");
+   printf("  --gs FILE          geometry shader input filename\n");
    printf("  --arb              emit ARB-style instructions\n");
    printf("  --nv               emit NV-style instructions\n");
    printf("  --link             run linker\n");
@@ -290,6 +293,7 @@ ParseOptions(int argc, char *argv[])
    Options.Mode = PROG_PRINT_DEBUG;
    Options.VertFile = NULL;
    Options.FragFile = NULL;
+   Options.GeoFile = NULL;
    Options.OutputFile = NULL;
    Options.Params = GL_FALSE;
    Options.Pragmas.IgnoreOptimize = GL_FALSE;
@@ -311,6 +315,10 @@ ParseOptions(int argc, char *argv[])
          Options.FragFile = argv[i + 1];
          i++;
       }
+      else if (strcmp(argv[i], "--gs") == 0) {
+         Options.GeoFile = argv[i + 1];
+         i++;
+      }
       else if (strcmp(argv[i], "--arb") == 0) {
          Options.Mode = PROG_PRINT_ARB;
       }
@@ -369,7 +377,7 @@ ParseOptions(int argc, char *argv[])
 int
 main(int argc, char *argv[])
 {
-   GLuint v_shader = 0, f_shader = 0;
+   GLuint v_shader = 0, f_shader = 0, g_shader = 0;
 
    ParseOptions(argc, argv);
 
@@ -386,10 +394,19 @@ main(int argc, char *argv[])
       f_shader = CompileShader(Options.FragFile, GL_FRAGMENT_SHADER);
    }
 
-   if (v_shader || f_shader) {
+   if (Options.GeoFile) {
+      g_shader = CompileShader(Options.GeoFile, GL_GEOMETRY_SHADER_ARB);
+   }
+
+
+   if (v_shader || f_shader || g_shader) {
       if (Options.OutputFile) {
+         FILE *f;
          fclose(stdout);
-         /*stdout =*/ freopen(Options.OutputFile, "w", stdout);
+         /*stdout =*/ f = freopen(Options.OutputFile, "w", stdout);
+         if (!f) {
+            fprintf(stderr, "freopen error\n");
+         }
       }
       if (stdout && v_shader) {
          PrintShaderInstructions(v_shader, stdout);
@@ -397,6 +414,9 @@ main(int argc, char *argv[])
       if (stdout && f_shader) {
          PrintShaderInstructions(f_shader, stdout);
       }
+      if (stdout && g_shader) {
+         PrintShaderInstructions(g_shader, stdout);
+      }
       if (Options.OutputFile) {
          fclose(stdout);
       }
diff --git a/src/mesa/drivers/osmesa/osmesa.c b/src/mesa/drivers/osmesa/osmesa.c
index ead40503977..93d0e8568a1 100644
--- a/src/mesa/drivers/osmesa/osmesa.c
+++ b/src/mesa/drivers/osmesa/osmesa.c
@@ -1328,6 +1328,7 @@ OSMesaMakeCurrent( OSMesaContext osmesa, void *buffer, GLenum type,
     * size.
     */
    osmesa->rb = new_osmesa_renderbuffer(&osmesa->mesa, osmesa->format, type);
+   _mesa_remove_renderbuffer(osmesa->gl_buffer, BUFFER_FRONT_LEFT);
    _mesa_add_renderbuffer(osmesa->gl_buffer, BUFFER_FRONT_LEFT, osmesa->rb);
    assert(osmesa->rb->RefCount == 2);
 
diff --git a/src/mesa/drivers/x11/glxapi.c b/src/mesa/drivers/x11/glxapi.c
index 955eba4e944..8c3f2730f3d 100644
--- a/src/mesa/drivers/x11/glxapi.c
+++ b/src/mesa/drivers/x11/glxapi.c
@@ -1111,31 +1111,6 @@ glXGetAGPOffsetMESA( const GLvoid *pointer )
 }
 
 
-/*** GLX_MESA_allocate_memory */
-
-void PUBLIC *
-glXAllocateMemoryMESA(Display *dpy, int scrn, size_t size,
-                      float readfreq, float writefreq, float priority)
-{
-   /* dummy */
-   return NULL;
-}
-
-void PUBLIC
-glXFreeMemoryMESA(Display *dpy, int scrn, void *pointer)
-{
-   /* dummy */
-}
-
-
-GLuint PUBLIC
-glXGetMemoryOffsetMESA(Display *dpy, int scrn, const void *pointer)
-{
-   /* dummy */
-   return 0;
-}
-
-
 /*** GLX_EXT_texture_from_pixmap */
 
 void PUBLIC
@@ -1387,11 +1362,6 @@ static struct name_address_pair GLX_functions[] = {
    /*** GLX_MESA_agp_offset ***/
    { "glXGetAGPOffsetMESA", (__GLXextFuncPtr) glXGetAGPOffsetMESA },
 
-   /*** GLX_MESA_allocate_memory ***/
-   { "glXAllocateMemoryMESA", (__GLXextFuncPtr) glXAllocateMemoryMESA },
-   { "glXFreeMemoryMESA", (__GLXextFuncPtr) glXFreeMemoryMESA },
-   { "glXGetMemoryOffsetMESA", (__GLXextFuncPtr) glXGetMemoryOffsetMESA },
-
    /*** GLX_EXT_texture_from_pixmap ***/
    { "glXBindTexImageEXT", (__GLXextFuncPtr) glXBindTexImageEXT },
    { "glXReleaseTexImageEXT", (__GLXextFuncPtr) glXReleaseTexImageEXT },