16 files changed, 308 insertions, 237 deletions
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index 5d69039d1af..45f7a91295d 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -43,6 +43,7 @@ i965_compiler_FILES = \
 	brw_nir.h \
 	brw_nir.c \
 	brw_nir_analyze_boolean_resolves.c \
+	brw_nir_attribute_workarounds.c \
 	brw_nir_opt_peephole_ffma.c \
 	brw_nir_uniforms.cpp \
 	brw_packed_float.c \
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index 0a916c99947..44446694b4b 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -205,7 +205,9 @@ remap_patch_urb_offsets(nir_block *block, void *closure)
 static void
 brw_nir_lower_inputs(nir_shader *nir,
                      const struct brw_device_info *devinfo,
-                     bool is_scalar)
+                     bool is_scalar,
+                     bool use_legacy_snorm_formula,
+                     const uint8_t *vs_attrib_wa_flags)
 {
    switch (nir->stage) {
    case MESA_SHADER_VERTEX:
@@ -225,6 +227,9 @@ brw_nir_lower_inputs(nir_shader *nir,
 
       add_const_offset_to_base(nir, nir_var_shader_in);
 
+      brw_nir_apply_attribute_workarounds(nir, use_legacy_snorm_formula,
+                                          vs_attrib_wa_flags);
+
       if (is_scalar) {
          /* Finally, translate VERT_ATTRIB_* values into the actual registers.
           *
@@ -509,12 +514,15 @@ brw_preprocess_nir(nir_shader *nir, bool is_scalar)
 nir_shader *
 brw_nir_lower_io(nir_shader *nir,
                  const struct brw_device_info *devinfo,
-                 bool is_scalar)
+                 bool is_scalar,
+                 bool use_legacy_snorm_formula,
+                 const uint8_t *vs_attrib_wa_flags)
 {
    bool progress; /* Written by OPT and OPT_V */
    (void)progress;
 
-   OPT_V(brw_nir_lower_inputs, devinfo, is_scalar);
+   OPT_V(brw_nir_lower_inputs, devinfo, is_scalar,
+         use_legacy_snorm_formula, vs_attrib_wa_flags);
    OPT_V(brw_nir_lower_outputs, devinfo, is_scalar);
    if (nir->stage == MESA_SHADER_COMPUTE)
       OPT_V(brw_nir_lower_shared);
@@ -627,9 +635,10 @@ brw_create_nir(struct brw_context *brw,
       OPT_V(nir_lower_atomics, shader_prog);
    }
 
-   if (nir->stage != MESA_SHADER_TESS_CTRL &&
+   if (nir->stage != MESA_SHADER_VERTEX &&
+       nir->stage != MESA_SHADER_TESS_CTRL &&
        nir->stage != MESA_SHADER_TESS_EVAL) {
-      nir = brw_nir_lower_io(nir, devinfo, is_scalar);
+      nir = brw_nir_lower_io(nir, devinfo, is_scalar, false, NULL);
    }
 
    return nir;
diff --git a/src/mesa/drivers/dri/i965/brw_nir.h b/src/mesa/drivers/dri/i965/brw_nir.h
index 079d8b25174..9a90e36964b 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.h
+++ b/src/mesa/drivers/dri/i965/brw_nir.h
@@ -84,11 +84,16 @@ nir_shader *brw_create_nir(struct brw_context *brw,
 nir_shader *brw_preprocess_nir(nir_shader *nir, bool is_scalar);
 nir_shader *brw_nir_lower_io(nir_shader *nir,
                             const struct brw_device_info *devinfo,
-                            bool is_scalar);
+                            bool is_scalar,
+                            bool use_legacy_snorm_formula,
+                            const uint8_t *vs_attrib_wa_flags);
 nir_shader *brw_postprocess_nir(nir_shader *nir,
                                 const struct brw_device_info *devinfo,
                                 bool is_scalar);
 
+bool brw_nir_apply_attribute_workarounds(nir_shader *nir,
+                                         bool use_legacy_snorm_formula,
+                                         const uint8_t *attrib_wa_flags);
 
 nir_shader *brw_nir_apply_sampler_key(nir_shader *nir,
                                       const struct brw_device_info *devinfo,
diff --git a/src/mesa/drivers/dri/i965/brw_nir_attribute_workarounds.c b/src/mesa/drivers/dri/i965/brw_nir_attribute_workarounds.c
new file mode 100644
index 00000000000..9c65e540d79
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_nir_attribute_workarounds.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "compiler/nir/nir_builder.h"
+#include "brw_nir.h"
+#include "brw_vs.h"
+
+/**
+ * Prior to Haswell, the hardware can't natively support GL_FIXED or
+ * 2_10_10_10_REV vertex formats.  This pass inserts extra shader code
+ * to produce the correct values.
+ */
+
+struct attr_wa_state {
+   nir_builder builder;
+   bool impl_progress;
+   bool use_legacy_snorm_formula;
+   const uint8_t *wa_flags;
+};
+
+static bool
+apply_attr_wa_block(nir_block *block, void *void_state)
+{
+   struct attr_wa_state *state = void_state;
+   nir_builder *b = &state->builder;
+
+   nir_foreach_instr_safe(block, instr) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+      if (intrin->intrinsic != nir_intrinsic_load_input)
+         continue;
+
+      uint8_t wa_flags = state->wa_flags[intrin->const_index[0]];
+      if (wa_flags == 0)
+         continue;
+
+      b->cursor = nir_after_instr(instr);
+
+      nir_ssa_def *val = &intrin->dest.ssa;
+
+      /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
+       * come in as floating point conversions of the integer values.
+       */
+      if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
+         nir_ssa_def *scaled =
+            nir_fmul(b, val, nir_imm_float(b, 1.0f / 65536.0f));
+         nir_ssa_def *comps[4];
+         for (int i = 0; i < val->num_components; i++) {
+            bool rescale = i < (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK);
+            comps[i] = nir_channel(b, rescale ? scaled : val, i);
+         }
+         val = nir_vec(b, comps, val->num_components);
+      }
+
+      /* Do sign recovery for 2101010 formats if required. */
+      if (wa_flags & BRW_ATTRIB_WA_SIGN) {
+         /* sign recovery shift: <22, 22, 22, 30> */
+         nir_ssa_def *shift = nir_imm_ivec4(b, 22, 22, 22, 30);
+         val = nir_ishr(b, nir_ishl(b, val, shift), shift);
+      }
+
+      /* Apply BGRA swizzle if required. */
+      if (wa_flags & BRW_ATTRIB_WA_BGRA) {
+         val = nir_swizzle(b, val, (unsigned[4]){2,1,0,3}, 4, true);
+      }
+
+      if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
+         /* ES 3.0 has different rules for converting signed normalized
+          * fixed-point numbers than desktop GL.
+          */
+         if ((wa_flags & BRW_ATTRIB_WA_SIGN) &&
+             !state->use_legacy_snorm_formula) {
+            /* According to equation 2.2 of the ES 3.0 specification,
+             * signed normalization conversion is done by:
+             *
+             * f = c / (2^(b-1)-1)
+             */
+            nir_ssa_def *es3_normalize_factor =
+               nir_imm_vec4(b, 1.0f / ((1 << 9) - 1), 1.0f / ((1 << 9) - 1),
+                               1.0f / ((1 << 9) - 1), 1.0f / ((1 << 1) - 1));
+            val = nir_fmax(b,
+                           nir_fmul(b, nir_i2f(b, val), es3_normalize_factor),
+                           nir_imm_float(b, -1.0f));
+         } else {
+            /* The following equations are from the OpenGL 3.2 specification:
+             *
+             * 2.1 unsigned normalization
+             * f = c/(2^n-1)
+             *
+             * 2.2 signed normalization
+             * f = (2c+1)/(2^n-1)
+             *
+             * Both of these share a common divisor, which we handle by
+             * multiplying by 1 / (2^b - 1) for b = <10, 10, 10, 2>.
+             */
+            nir_ssa_def *normalize_factor =
+               nir_imm_vec4(b, 1.0f / ((1 << 10) - 1), 1.0f / ((1 << 10) - 1),
+                               1.0f / ((1 << 10) - 1), 1.0f / ((1 << 2)  - 1));
+
+            if (wa_flags & BRW_ATTRIB_WA_SIGN) {
+               /* For signed normalization, the numerator is 2c+1. */
+               nir_ssa_def *two = nir_imm_float(b, 2.0f);
+               nir_ssa_def *one = nir_imm_float(b, 1.0f);
+               val = nir_fadd(b, nir_fmul(b, nir_i2f(b, val), two), one);
+            } else {
+               /* For unsigned normalization, the numerator is just c. */
+               val = nir_u2f(b, val);
+            }
+            val = nir_fmul(b, val, normalize_factor);
+         }
+      }
+
+      if (wa_flags & BRW_ATTRIB_WA_SCALE) {
+         val = (wa_flags & BRW_ATTRIB_WA_SIGN) ? nir_i2f(b, val)
+                                               : nir_u2f(b, val);
+      }
+
+      nir_ssa_def_rewrite_uses_after(&intrin->dest.ssa, nir_src_for_ssa(val),
+                                     val->parent_instr);
+      state->impl_progress = true;
+   }
+
+   return true;
+}
+
+bool
+brw_nir_apply_attribute_workarounds(nir_shader *shader,
+                                    bool use_legacy_snorm_formula,
+                                    const uint8_t *attrib_wa_flags)
+{
+   bool progress = false;
+   struct attr_wa_state state = {
+      .use_legacy_snorm_formula = use_legacy_snorm_formula,
+      .wa_flags = attrib_wa_flags,
+   };
+
+   nir_foreach_function(shader, func) {
+      if (!func->impl)
+         continue;
+
+      nir_builder_init(&state.builder, func->impl);
+      state.impl_progress = false;
+
+      nir_foreach_block(func->impl, apply_attr_wa_block, &state);
+
+      if (state.impl_progress) {
+         nir_metadata_preserve(func->impl, nir_metadata_block_index |
+                                           nir_metadata_dominance);
+         progress = true;
+      }
+   }
+
+   return progress;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index e4ce8cbf748..91e47800e1f 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -1230,7 +1230,7 @@ brw_compile_tes(const struct brw_compiler *compiler,
    nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar);
    nir->info.inputs_read = key->inputs_read;
    nir->info.patch_inputs_read = key->patch_inputs_read;
-   nir = brw_nir_lower_io(nir, compiler->devinfo, is_scalar);
+   nir = brw_nir_lower_io(nir, compiler->devinfo, is_scalar, false, NULL);
    nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);
 
    brw_compute_vue_map(devinfo, &prog_data->base.vue_map,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 1dd599e93ea..2e79de6a883 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1516,22 +1516,6 @@ vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
                                           bool interleaved)
 {
    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
-      /* We have to support ATTR as a destination for GL_FIXED fixup. */
-      if (inst->dst.file == ATTR) {
-         int grf = attribute_map[inst->dst.nr + inst->dst.reg_offset];
-
-         /* All attributes used in the shader need to have been assigned a
-          * hardware register by the caller
-          */
-         assert(grf != 0);
-
-	 struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
-	 reg.type = inst->dst.type;
-	 reg.writemask = inst->dst.writemask;
-
-         inst->dst = reg;
-      }
-
       for (int i = 0; i < 3; i++) {
 	 if (inst->src[i].file != ATTR)
 	    continue;
@@ -1984,6 +1968,9 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
    shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
                                       is_scalar);
+   shader = brw_nir_lower_io(shader, compiler->devinfo, is_scalar,
+                             use_legacy_snorm_formula,
+                             key->gl_attrib_wa_flags);
    shader = brw_postprocess_nir(shader, compiler->devinfo, is_scalar);
 
    const unsigned *assembly = NULL;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
index 6f66978f8e1..d9c048e1764 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
@@ -73,6 +73,10 @@ vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       src = src_reg(ATTR, BRW_VARYING_SLOT_COUNT * vertex->u[0] +
                           instr->const_index[0] + offset->u[0],
                     type);
+      /* gl_PointSize is passed in the .w component of the VUE header */
+      if (instr->const_index[0] == VARYING_SLOT_PSIZ)
+         src.swizzle = BRW_SWIZZLE_WWWW;
+
       dest = get_nir_dest(instr->dest, src.type);
       dest.writemask = brw_writemask_for_size(instr->num_components);
       emit(MOV(dest, src));
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index b2a971a40ff..1b63d568d85 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -182,29 +182,6 @@ vec4_gs_visitor::emit_prolog()
       }
    }
 
-   /* If the geometry shader uses the gl_PointSize input, we need to fix it up
-    * to account for the fact that the vertex shader stored it in the w
-    * component of VARYING_SLOT_PSIZ.
-    */
-   if (nir->info.inputs_read & VARYING_BIT_PSIZ) {
-      this->current_annotation = "swizzle gl_PointSize input";
-      for (int vertex = 0; vertex < (int)nir->info.gs.vertices_in; vertex++) {
-         dst_reg dst(ATTR,
-                     BRW_VARYING_SLOT_COUNT * vertex + VARYING_SLOT_PSIZ);
-         dst.type = BRW_REGISTER_TYPE_F;
-         src_reg src(dst);
-         dst.writemask = WRITEMASK_X;
-         src.swizzle = BRW_SWIZZLE_WWWW;
-         inst = emit(MOV(dst, src));
-
-         /* In dual instanced dispatch mode, dst has a width of 4, so we need
-          * to make sure the MOV happens regardless of which channels are
-          * enabled.
-          */
-         inst->force_writemask_all = true;
-      }
-   }
-
    this->current_annotation = NULL;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
index 6531b8ff772..df6b44dde14 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
@@ -514,7 +514,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
    nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar);
    nir->info.outputs_written = key->outputs_written;
    nir->info.patch_outputs_written = key->patch_outputs_written;
-   nir = brw_nir_lower_io(nir, compiler->devinfo, is_scalar);
+   nir = brw_nir_lower_io(nir, compiler->devinfo, is_scalar, false, NULL);
    nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);
 
    prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
index 86701f3fbd8..39f0c0b932d 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
@@ -30,115 +30,6 @@ namespace brw {
 void
 vec4_vs_visitor::emit_prolog()
 {
-   dst_reg sign_recovery_shift;
-   dst_reg normalize_factor;
-   dst_reg es3_normalize_factor;
-
-   for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
-      if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
-         uint8_t wa_flags = key->gl_attrib_wa_flags[i];
-         dst_reg reg(ATTR, i);
-         dst_reg reg_d = reg;
-         reg_d.type = BRW_REGISTER_TYPE_D;
-         dst_reg reg_ud = reg;
-         reg_ud.type = BRW_REGISTER_TYPE_UD;
-
-         /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
-          * come in as floating point conversions of the integer values.
-          */
-         if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
-            dst_reg dst = reg;
-            dst.type = brw_type_for_base_type(glsl_type::vec4_type);
-            dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
-            emit(MUL(dst, src_reg(dst), brw_imm_f(1.0f / 65536.0f)));
-         }
-
-         /* Do sign recovery for 2101010 formats if required. */
-         if (wa_flags & BRW_ATTRIB_WA_SIGN) {
-            if (sign_recovery_shift.file == BAD_FILE) {
-               /* shift constant: <22,22,22,30> */
-               sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
-               emit(MOV(writemask(sign_recovery_shift, WRITEMASK_XYZ), brw_imm_ud(22u)));
-               emit(MOV(writemask(sign_recovery_shift, WRITEMASK_W), brw_imm_ud(30u)));
-            }
-
-            emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
-            emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
-         }
-
-         /* Apply BGRA swizzle if required. */
-         if (wa_flags & BRW_ATTRIB_WA_BGRA) {
-            src_reg temp = src_reg(reg);
-            temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
-            emit(MOV(reg, temp));
-         }
-
-         if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
-            /* ES 3.0 has different rules for converting signed normalized
-             * fixed-point numbers than desktop GL.
-             */
-            if ((wa_flags & BRW_ATTRIB_WA_SIGN) && !use_legacy_snorm_formula) {
-               /* According to equation 2.2 of the ES 3.0 specification,
-                * signed normalization conversion is done by:
-                *
-                * f = c / (2^(b-1)-1)
-                */
-               if (es3_normalize_factor.file == BAD_FILE) {
-                  /* mul constant: 1 / (2^(b-1) - 1) */
-                  es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
-                  emit(MOV(writemask(es3_normalize_factor, WRITEMASK_XYZ),
-                           brw_imm_f(1.0f / ((1<<9) - 1))));
-                  emit(MOV(writemask(es3_normalize_factor, WRITEMASK_W),
-                           brw_imm_f(1.0f / ((1<<1) - 1))));
-               }
-
-               dst_reg dst = reg;
-               dst.type = brw_type_for_base_type(glsl_type::vec4_type);
-               emit(MOV(dst, src_reg(reg_d)));
-               emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
-               emit_minmax(BRW_CONDITIONAL_GE, dst, src_reg(dst), brw_imm_f(-1.0f));
-            } else {
-               /* The following equations are from the OpenGL 3.2 specification:
-                *
-                * 2.1 unsigned normalization
-                * f = c/(2^n-1)
-                *
-                * 2.2 signed normalization
-                * f = (2c+1)/(2^n-1)
-                *
-                * Both of these share a common divisor, which is represented by
-                * "normalize_factor" in the code below.
-                */
-               if (normalize_factor.file == BAD_FILE) {
-                  /* 1 / (2^b - 1) for b=<10,10,10,2> */
-                  normalize_factor = dst_reg(this, glsl_type::vec4_type);
-                  emit(MOV(writemask(normalize_factor, WRITEMASK_XYZ),
-                           brw_imm_f(1.0f / ((1<<10) - 1))));
-                  emit(MOV(writemask(normalize_factor, WRITEMASK_W),
-                           brw_imm_f(1.0f / ((1<<2) - 1))));
-               }
-
-               dst_reg dst = reg;
-               dst.type = brw_type_for_base_type(glsl_type::vec4_type);
-               emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
-
-               /* For signed normalization, we want the numerator to be 2c+1. */
-               if (wa_flags & BRW_ATTRIB_WA_SIGN) {
-                  emit(MUL(dst, src_reg(dst), brw_imm_f(2.0f)));
-                  emit(ADD(dst, src_reg(dst), brw_imm_f(1.0f)));
-               }
-
-               emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
-            }
-         }
-
-         if (wa_flags & BRW_ATTRIB_WA_SCALE) {
-            dst_reg dst = reg;
-            dst.type = brw_type_for_base_type(glsl_type::vec4_type);
-            emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
-         }
-      }
-   }
 }
 
 
diff --git a/src/mesa/main/image.c b/src/mesa/main/image.c
index e79e3e68eac..99f253cd373 100644
--- a/src/mesa/main/image.c
+++ b/src/mesa/main/image.c
@@ -670,7 +670,7 @@ _mesa_clip_drawpixels(const struct gl_context *ctx,
  * so that the image region is entirely within the window bounds.
  * Note: this is different from _mesa_clip_drawpixels() in that the
  * scissor box is ignored, and we use the bounds of the current readbuffer
- * surface.
+ * surface or the attached image.
  *
  * \return  GL_TRUE if region to read is in bounds
  *          GL_FALSE if region is completely out of bounds (nothing to read)
@@ -682,6 +682,18 @@ _mesa_clip_readpixels(const struct gl_context *ctx,
                       struct gl_pixelstore_attrib *pack)
 {
    const struct gl_framebuffer *buffer = ctx->ReadBuffer;
+   struct gl_renderbuffer *rb = buffer->_ColorReadBuffer;
+   GLsizei clip_width;
+   GLsizei clip_height;
+
+   if (rb) {
+      clip_width = rb->Width;
+      clip_height = rb->Height;
+   } else {
+      clip_width = buffer->Width;
+      clip_height = buffer->Height;
+   }
+
 
    if (pack->RowLength == 0) {
       pack->RowLength = *width;
@@ -694,8 +706,8 @@ _mesa_clip_readpixels(const struct gl_context *ctx,
       *srcX = 0;
    }
    /* right clipping */
-   if (*srcX + *width > (GLsizei) buffer->Width)
-      *width -= (*srcX + *width - buffer->Width);
+   if (*srcX + *width > clip_width)
+      *width -= (*srcX + *width - clip_width);
 
    if (*width <= 0)
       return GL_FALSE;
@@ -707,8 +719,8 @@ _mesa_clip_readpixels(const struct gl_context *ctx,
       *srcY = 0;
    }
    /* top clipping */
-   if (*srcY + *height > (GLsizei) buffer->Height)
-      *height -= (*srcY + *height - buffer->Height);
+   if (*srcY + *height > clip_height)
+      *height -= (*srcY + *height - clip_height);
 
    if (*height <= 0)
       return GL_FALSE;
diff --git a/src/mesa/main/readpix.c b/src/mesa/main/readpix.c
index 8cdc9fe1cb2..470182ab23d 100644
--- a/src/mesa/main/readpix.c
+++ b/src/mesa/main/readpix.c
@@ -858,46 +858,40 @@ _mesa_readpixels(struct gl_context *ctx,
                  const struct gl_pixelstore_attrib *packing,
                  GLvoid *pixels)
 {
-   struct gl_pixelstore_attrib clippedPacking = *packing;
-
    if (ctx->NewState)
       _mesa_update_state(ctx);
 
-   /* Do all needed clipping here, so that we can forget about it later */
-   if (_mesa_clip_readpixels(ctx, &x, &y, &width, &height, &clippedPacking)) {
-
-      pixels = _mesa_map_pbo_dest(ctx, &clippedPacking, pixels);
+   pixels = _mesa_map_pbo_dest(ctx, packing, pixels);
 
-      if (pixels) {
-         /* Try memcpy first. */
-         if (readpixels_memcpy(ctx, x, y, width, height, format, type,
-                               pixels, packing)) {
-            _mesa_unmap_pbo_dest(ctx, &clippedPacking);
-            return;
-         }
-
-         /* Otherwise take the slow path. */
-         switch (format) {
-         case GL_STENCIL_INDEX:
-            read_stencil_pixels(ctx, x, y, width, height, type, pixels,
-                                &clippedPacking);
-            break;
-         case GL_DEPTH_COMPONENT:
-            read_depth_pixels(ctx, x, y, width, height, type, pixels,
-                              &clippedPacking);
-            break;
-         case GL_DEPTH_STENCIL_EXT:
-            read_depth_stencil_pixels(ctx, x, y, width, height, type, pixels,
-                                      &clippedPacking);
-            break;
-         default:
-            /* all other formats should be color formats */
-            read_rgba_pixels(ctx, x, y, width, height, format, type, pixels,
-                             &clippedPacking);
-         }
+   if (pixels) {
+      /* Try memcpy first. */
+      if (readpixels_memcpy(ctx, x, y, width, height, format, type,
+                            pixels, packing)) {
+         _mesa_unmap_pbo_dest(ctx, packing);
+         return;
+      }
 
-         _mesa_unmap_pbo_dest(ctx, &clippedPacking);
+      /* Otherwise take the slow path. */
+      switch (format) {
+      case GL_STENCIL_INDEX:
+         read_stencil_pixels(ctx, x, y, width, height, type, pixels,
+                             packing);
+         break;
+      case GL_DEPTH_COMPONENT:
+         read_depth_pixels(ctx, x, y, width, height, type, pixels,
+                           packing);
+         break;
+      case GL_DEPTH_STENCIL_EXT:
+         read_depth_stencil_pixels(ctx, x, y, width, height, type, pixels,
+                                   packing);
+         break;
+      default:
+         /* all other formats should be color formats */
+         read_rgba_pixels(ctx, x, y, width, height, format, type, pixels,
+                          packing);
       }
+
+      _mesa_unmap_pbo_dest(ctx, packing);
    }
 }
 
@@ -993,6 +987,7 @@ _mesa_ReadnPixelsARB( GLint x, GLint y, GLsizei width, GLsizei height,
 {
    GLenum err = GL_NO_ERROR;
    struct gl_renderbuffer *rb;
+   struct gl_pixelstore_attrib clippedPacking;
 
    GET_CURRENT_CONTEXT(ctx);
 
@@ -1094,7 +1089,9 @@ _mesa_ReadnPixelsARB( GLint x, GLint y, GLsizei width, GLsizei height,
       }
    }
 
-   if (width == 0 || height == 0)
+   /* Do all needed clipping here, so that we can forget about it later */
+   clippedPacking = ctx->Pack;
+   if (!_mesa_clip_readpixels(ctx, &x, &y, &width, &height, &clippedPacking))
       return; /* nothing to do */
 
    if (!_mesa_validate_pbo_access(2, &ctx->Pack, width, height, 1,
@@ -1118,7 +1115,7 @@ _mesa_ReadnPixelsARB( GLint x, GLint y, GLsizei width, GLsizei height,
    }
 
    ctx->Driver.ReadPixels(ctx, x, y, width, height,
-			  format, type, &ctx->Pack, pixels);
+                          format, type, &clippedPacking, pixels);
 }
 
 void GLAPIENTRY
diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index 34809ad7163..627b8cbd598 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -251,8 +251,7 @@ setup_render_state(struct gl_context *ctx,
       for (i = 0; i < st->state.num_samplers[PIPE_SHADER_FRAGMENT]; i++) {
          samplers[i] = &st->state.samplers[PIPE_SHADER_FRAGMENT][i];
       }
-      samplers[fpv->bitmap_sampler] =
-         &st->bitmap.samplers[sv->texture->target != PIPE_TEXTURE_RECT];
+      samplers[fpv->bitmap_sampler] = &st->bitmap.sampler;
       cso_set_samplers(cso, PIPE_SHADER_FRAGMENT, num,
                        (const struct pipe_sampler_state **) samplers);
    }
@@ -438,7 +437,7 @@ reset_cache(struct st_context *st)
    assert(!cache->texture);
 
    /* allocate a new texture */
-   cache->texture = st_texture_create(st, PIPE_TEXTURE_2D,
+   cache->texture = st_texture_create(st, st->internal_target,
                                       st->bitmap.tex_format, 0,
                                       BITMAP_CACHE_WIDTH, BITMAP_CACHE_HEIGHT,
                                       1, 1, 0,
@@ -624,26 +623,27 @@ accum_bitmap(struct gl_context *ctx,
 static void
 init_bitmap_state(struct st_context *st)
 {
-   struct pipe_sampler_state *sampler = &st->bitmap.samplers[0];
    struct pipe_context *pipe = st->pipe;
    struct pipe_screen *screen = pipe->screen;
 
    /* This function should only be called once */
    assert(st->bitmap.cache == NULL);
 
+   assert(st->internal_target == PIPE_TEXTURE_2D ||
+          st->internal_target == PIPE_TEXTURE_RECT);
+
    /* alloc bitmap cache object */
    st->bitmap.cache = ST_CALLOC_STRUCT(bitmap_cache);
 
    /* init sampler state once */
-   memset(sampler, 0, sizeof(*sampler));
-   sampler->wrap_s = PIPE_TEX_WRAP_CLAMP;
-   sampler->wrap_t = PIPE_TEX_WRAP_CLAMP;
-   sampler->wrap_r = PIPE_TEX_WRAP_CLAMP;
-   sampler->min_img_filter = PIPE_TEX_FILTER_NEAREST;
-   sampler->min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
-   sampler->mag_img_filter = PIPE_TEX_FILTER_NEAREST;
-   st->bitmap.samplers[1] = *sampler;
-   st->bitmap.samplers[1].normalized_coords = 1;
+   memset(&st->bitmap.sampler, 0, sizeof(st->bitmap.sampler));
+   st->bitmap.sampler.wrap_s = PIPE_TEX_WRAP_CLAMP;
+   st->bitmap.sampler.wrap_t = PIPE_TEX_WRAP_CLAMP;
+   st->bitmap.sampler.wrap_r = PIPE_TEX_WRAP_CLAMP;
+   st->bitmap.sampler.min_img_filter = PIPE_TEX_FILTER_NEAREST;
+   st->bitmap.sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+   st->bitmap.sampler.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
+   st->bitmap.sampler.normalized_coords = st->internal_target == PIPE_TEXTURE_2D;
 
    /* init baseline rasterizer state once */
    memset(&st->bitmap.rasterizer, 0, sizeof(st->bitmap.rasterizer));
@@ -653,17 +653,17 @@ init_bitmap_state(struct st_context *st)
 
    /* find a usable texture format */
    if (screen->is_format_supported(screen, PIPE_FORMAT_I8_UNORM,
-                                   PIPE_TEXTURE_2D, 0,
+                                   st->internal_target, 0,
                                    PIPE_BIND_SAMPLER_VIEW)) {
       st->bitmap.tex_format = PIPE_FORMAT_I8_UNORM;
    }
    else if (screen->is_format_supported(screen, PIPE_FORMAT_A8_UNORM,
-                                        PIPE_TEXTURE_2D, 0,
+                                        st->internal_target, 0,
                                         PIPE_BIND_SAMPLER_VIEW)) {
       st->bitmap.tex_format = PIPE_FORMAT_A8_UNORM;
    }
    else if (screen->is_format_supported(screen, PIPE_FORMAT_L8_UNORM,
-                                        PIPE_TEXTURE_2D, 0,
+                                        st->internal_target, 0,
                                         PIPE_BIND_SAMPLER_VIEW)) {
       st->bitmap.tex_format = PIPE_FORMAT_L8_UNORM;
    }
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 04a9de06520..fd58886a782 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -358,8 +358,8 @@ make_texture(struct st_context *st,
       GLenum intFormat = internal_format(ctx, format, type);
 
       pipeFormat = st_choose_format(st, intFormat, format, type,
-                                    PIPE_TEXTURE_2D, 0, PIPE_BIND_SAMPLER_VIEW,
-                                    FALSE);
+                                    st->internal_target, 0,
+                                    PIPE_BIND_SAMPLER_VIEW, FALSE);
       assert(pipeFormat != PIPE_FORMAT_NONE);
    }
 
@@ -556,7 +556,9 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    struct cso_context *cso = st->cso_context;
    GLfloat x0, y0, x1, y1;
    GLsizei maxSize;
-   boolean normalized = sv[0]->texture->target != PIPE_TEXTURE_RECT;
+   boolean normalized = sv[0]->texture->target == PIPE_TEXTURE_2D;
+
+   assert(sv[0]->texture->target == st->internal_target);
 
    /* limit checks */
    /* XXX if DrawPixels image is larger than max texture size, break
@@ -648,8 +650,10 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
       sampler.normalized_coords = normalized;
 
       if (fpv) {
+         /* drawing a color image */
          const struct pipe_sampler_state *samplers[PIPE_MAX_SAMPLERS];
-         uint num = MAX2(MAX2(fpv->drawpix_sampler, fpv->pixelmap_sampler) + 1,
+         uint num = MAX3(fpv->drawpix_sampler + 1,
+                         fpv->pixelmap_sampler + 1,
                          st->state.num_samplers[PIPE_SHADER_FRAGMENT]);
          uint i;
 
@@ -662,12 +666,33 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
 
          cso_set_samplers(cso, PIPE_SHADER_FRAGMENT, num, samplers);
       } else {
+         /* drawing a depth/stencil image */
          const struct pipe_sampler_state *samplers[2] = {&sampler, &sampler};
 
          cso_set_samplers(cso, PIPE_SHADER_FRAGMENT, num_sampler_view, samplers);
       }
    }
 
+   /* user textures, plus the drawpix textures */
+   if (fpv) {
+      /* drawing a color image */
+      struct pipe_sampler_view *sampler_views[PIPE_MAX_SAMPLERS];
+      uint num = MAX3(fpv->drawpix_sampler + 1,
+                      fpv->pixelmap_sampler + 1,
+                      st->state.num_sampler_views[PIPE_SHADER_FRAGMENT]);
+
+      memcpy(sampler_views, st->state.sampler_views[PIPE_SHADER_FRAGMENT],
+             sizeof(sampler_views));
+
+      sampler_views[fpv->drawpix_sampler] = sv[0];
+      if (sv[1])
+         sampler_views[fpv->pixelmap_sampler] = sv[1];
+      cso_set_sampler_views(cso, PIPE_SHADER_FRAGMENT, num, sampler_views);
+   } else {
+      /* drawing a depth/stencil image */
+      cso_set_sampler_views(cso, PIPE_SHADER_FRAGMENT, num_sampler_view, sv);
+   }
+
    /* viewport state: viewport matching window dims */
    {
       const float w = (float) ctx->DrawBuffer->Width;
@@ -685,23 +710,6 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    cso_set_vertex_elements(cso, 3, st->velems_util_draw);
    cso_set_stream_outputs(st->cso_context, 0, NULL, NULL);
 
-   /* user textures, plus the drawpix textures */
-   if (fpv) {
-      struct pipe_sampler_view *sampler_views[PIPE_MAX_SAMPLERS];
-      uint num = MAX3(fpv->drawpix_sampler + 1,
-                      fpv->pixelmap_sampler + 1,
-                      st->state.num_sampler_views[PIPE_SHADER_FRAGMENT]);
-
-      memcpy(sampler_views, st->state.sampler_views[PIPE_SHADER_FRAGMENT],
-             sizeof(sampler_views));
-
-      sampler_views[fpv->drawpix_sampler] = sv[0];
-      if (sv[1])
-         sampler_views[fpv->pixelmap_sampler] = sv[1];
-      cso_set_sampler_views(cso, PIPE_SHADER_FRAGMENT, num, sampler_views);
-   } else
-      cso_set_sampler_views(cso, PIPE_SHADER_FRAGMENT, num_sampler_view, sv);
-
    /* Compute Gallium window coords (y=0=top) with pixel zoom.
     * Recall that these coords are transformed by the current
     * vertex shader and viewport transformation.
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index 352e795d06a..9a80f4bae70 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -182,7 +182,7 @@ struct st_context
    /** for glBitmap */
    struct {
       struct pipe_rasterizer_state rasterizer;
-      struct pipe_sampler_state samplers[2];
+      struct pipe_sampler_state sampler;
       enum pipe_format tex_format;
       void *vs;
       struct bitmap_cache *cache;
diff --git a/src/vulkan/anv_pipeline.c b/src/vulkan/anv_pipeline.c
index 3c5072ba924..4e75557f48d 100644
--- a/src/vulkan/anv_pipeline.c
+++ b/src/vulkan/anv_pipeline.c
@@ -481,8 +481,12 @@ anv_pipeline_compile(struct anv_pipeline *pipeline,
    prog_data->binding_table.image_start = bias;
 
    /* Finish the optimization and compilation process */
-   nir = brw_nir_lower_io(nir, &pipeline->device->info,
-                          compiler->scalar_stage[stage]);
+   if (nir->stage != MESA_SHADER_VERTEX &&
+       nir->stage != MESA_SHADER_TESS_CTRL &&
+       nir->stage != MESA_SHADER_TESS_EVAL) {
+      nir = brw_nir_lower_io(nir, &pipeline->device->info,
+                             compiler->scalar_stage[stage], false, NULL);
+   }
 
    /* nir_lower_io will only handle the push constants; we need to set this
     * to the full number of possible uniforms.