3 files changed, 93 insertions, 12 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 8785957b6e6..4e3adbc0a69 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2040,6 +2040,59 @@ fs_visitor::emit_interpolation_setup_gen6()
 }
 
 void
+fs_visitor::emit_color_write(int index, int first_color_mrf, fs_reg color)
+{
+   /* Copy one color channel (index 0..3 = r, g, b, a) into the FB write
+    * payload starting at MRF first_color_mrf.  "color" is passed by value,
+    * so flagging its second half below never leaks back to the caller.
+    */
+   int reg_width = c->dispatch_width / 8;
+
+   if (c->dispatch_width == 8 || intel->gen >= 6) {
+      /* SIMD8 write: one register per channel, at m + index:
+       * m + 0: r, m + 1: g, m + 2: b, m + 3: a
+       *
+       * gen6+ SIMD16 DP write looks like:
+       * m + 0: r0
+       * m + 1: r1
+       * m + 2: g0
+       * m + 3: g1
+       * m + 4: b0
+       * m + 5: b1
+       * m + 6: a0
+       * m + 7: a1
+       */
+      emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index * reg_width),
+	   color);
+   } else {
+      /* pre-gen6 SIMD16 single source DP write looks like:
+       * m + 0: r0
+       * m + 1: g0
+       * m + 2: b0
+       * m + 3: a0
+       * m + 4: r1
+       * m + 5: g1
+       * m + 6: b1
+       * m + 7: a1
+       *
+       * By setting the high bit of the MRF register number,
+       * we could indicate that we want COMPR4 mode - instead
+       * of doing the usual destination + 1 for the second
+       * half we would get destination + 4.  We would need to
+       * clue the optimizer into that, though.
+       */
+      push_force_uncompressed();
+      emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index), color);
+      pop_force_uncompressed();
+
+      push_force_sechalf();
+      color.sechalf = true;
+      emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index + 4), color);
+      pop_force_sechalf();
+   }
+}
+
+void
 fs_visitor::emit_fb_writes()
 {
    this->current_annotation = "FB write header";
@@ -2113,7 +2166,7 @@ fs_visitor::emit_fb_writes()
 						 target);
       if (this->frag_color || this->frag_data) {
 	 for (int i = 0; i < 4; i++) {
-	    emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i * reg_width), color);
+	    emit_color_write(i, color_mrf, color);
 	    color.reg_offset++;
 	 }
       }
@@ -2137,7 +2190,7 @@ fs_visitor::emit_fb_writes()
 	  * renderbuffer.
 	  */
 	 color.reg_offset += 3;
-	 emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + 3), color);
+	 emit_color_write(3, color_mrf, color);
       }
 
       fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
@@ -2330,7 +2383,7 @@ fs_visitor::generate_math(fs_inst *inst,
 	    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
 	 }
       }
-   } else {
+   } else /* gen <= 5 */{
       assert(inst->mlen >= 1);
 
       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
@@ -2351,6 +2404,7 @@ fs_visitor::generate_math(fs_inst *inst,
 		  inst->base_mrf + 1, sechalf(src[0]),
 		  BRW_MATH_DATA_VECTOR,
 		  BRW_MATH_PRECISION_FULL);
+
 	 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
       }
    }
@@ -3528,6 +3582,8 @@ static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
 				reg->hw_reg, reg->smear);
       }
       brw_reg = retype(brw_reg, reg->type);
+      if (reg->sechalf)
+	 brw_reg = sechalf(brw_reg);
       break;
    case IMM:
       switch (reg->type) {
@@ -3881,7 +3937,7 @@ fs_visitor::run()
 	 /* Haven't hooked in support for uniforms through the 16-wide
 	  * version yet.
 	  */
-	 return GL_FALSE;
+	 return false;
       }
 
       /* align to 64 byte boundary. */
@@ -3957,11 +4013,10 @@ fs_visitor::run()
    assert(force_uncompressed_stack == 0);
    assert(force_sechalf_stack == 0);
 
-   if (!failed)
-      generate_code();
-
    if (failed)
-      return GL_FALSE;
+      return false;
+
+   generate_code();
 
    if (c->dispatch_width == 8) {
       c->prog_data.total_grf = grf_used;
@@ -4005,7 +4060,7 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
       return false;
    }
 
-   if (intel->gen >= 6) {
+   if (intel->gen >= 5) {
       c->dispatch_width = 16;
       fs_visitor v2(c, shader);
       v2.run();
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index b158992071e..60398ac870e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -178,6 +178,7 @@ public:
    int type;
    bool negate;
    bool abs;
+   bool sechalf;
    struct brw_reg fixed_hw_reg;
    int smear; /* -1, or a channel of the reg to smear to all channels. */
 
@@ -521,6 +522,7 @@ public:
    void emit_if_gen6(ir_if *ir);
    void emit_unspill(fs_inst *inst, fs_reg reg, uint32_t spill_offset);
 
+   void emit_color_write(int index, int first_color_mrf, fs_reg color);
    void emit_fb_writes();
    void emit_assignment_writes(fs_reg &l, fs_reg &r,
 			       const glsl_type *type, bool predicated);
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index be4b260a5ff..9d0a7a8d27d 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -41,10 +41,11 @@
  */
 
 struct brw_wm_unit_key {
-   unsigned int total_grf, total_scratch;
+   unsigned int total_grf, total_grf_16, total_scratch;
    unsigned int urb_entry_read_length;
    unsigned int curb_entry_read_length;
    unsigned int dispatch_grf_start_reg;
+   uint32_t prog_offset_16;
 
    unsigned int curbe_offset;
 
@@ -92,10 +93,21 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 
    /* CACHE_NEW_WM_PROG */
    key->total_grf = brw->wm.prog_data->total_grf;
+   key->total_grf_16 = brw->wm.prog_data->total_grf_16;
    key->urb_entry_read_length = brw->wm.prog_data->urb_read_length;
    key->curb_entry_read_length = brw->wm.prog_data->curb_read_length;
    key->dispatch_grf_start_reg = brw->wm.prog_data->first_curbe_grf;
    key->total_scratch = brw->wm.prog_data->total_scratch;
+   key->prog_offset_16 = brw->wm.prog_data->prog_offset_16;
+
+   if (key->prog_offset_16) {
+      /* These two fields should be the same pre-gen6, which is why we
+       * only have one hardware field to program for both dispatch
+       * widths.
+       */
+      assert(brw->wm.prog_data->first_curbe_grf ==
+	     brw->wm.prog_data->first_curbe_grf_16);
+   }
 
    /* BRW_NEW_CURBE_OFFSETS */
    key->curbe_offset = brw->curbe.wm_start;
@@ -166,7 +178,10 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
    memset(&wm, 0, sizeof(wm));
 
    wm.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1;
+   wm.wm9.grf_reg_count_2 = ALIGN(key->total_grf_16, 16) / 16 - 1;
    wm.thread0.kernel_start_pointer = brw->wm.prog_bo->offset >> 6; /* reloc */
+   wm.wm9.kernel_start_pointer_2 = (brw->wm.prog_bo->offset +
+				    key->prog_offset_16) >> 6; /* reloc */
    wm.thread1.depth_coef_urb_read_offset = 1;
    wm.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
 
@@ -206,9 +221,11 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
    wm.wm5.program_computes_depth = key->computes_depth;
    wm.wm5.program_uses_killpixel = key->uses_kill;
 
-   if (key->is_glsl)
+   if (key->is_glsl) {
       wm.wm5.enable_8_pix = 1;
-   else
+      if (key->prog_offset_16)
+	 wm.wm5.enable_16_pix = 1;
+   } else
       wm.wm5.enable_16_pix = 1;
 
    wm.wm5.max_threads = brw->wm_max_threads - 1;
@@ -256,6 +273,13 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
 			   brw->wm.prog_bo, wm.thread0.grf_reg_count << 1,
 			   I915_GEM_DOMAIN_INSTRUCTION, 0);
 
+   if (key->prog_offset_16) {
+      drm_intel_bo_emit_reloc(bo, offsetof(struct brw_wm_unit_state, wm9),
+			      brw->wm.prog_bo, ((wm.wm9.grf_reg_count_2 << 1) +
+						key->prog_offset_16),
+			      I915_GEM_DOMAIN_INSTRUCTION, 0);
+   }
+
    /* Emit scratch space relocation */
    if (key->total_scratch != 0) {
       drm_intel_bo_emit_reloc(bo, offsetof(struct brw_wm_unit_state, thread2),