summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMatt Turner <[email protected]>2015-04-06 17:44:40 -0700
committerMatt Turner <[email protected]>2015-04-21 09:24:48 -0700
commit5af0604d528733af9113a6f8711c39796ce0ae40 (patch)
treeed1f61dc3c1a6d4a51b53e04c37426e60df9c077
parentfde3100fe65a175f034c77e7989601839c9983bb (diff)
i965/fs: Calculate delta_x and delta_y together.
This lets SIMD16 programs on G45 and Gen5 use the PLN instruction. On Ironlake: total instructions in shared programs: 5634757 -> 5518055 (-2.07%) instructions in affected programs: 1745837 -> 1629135 (-6.68%) helped: 11439 HURT: 4 Reviewed-by: Jason Ekstrand <[email protected]>
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs.cpp46
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs.h3
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_generator.cpp25
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_nir.cpp13
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp8
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_visitor.cpp51
-rw-r--r--src/mesa/drivers/dri/i965/brw_reg.h7
7 files changed, 79 insertions, 74 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 5cdc19caab0..cf1c385f098 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1265,8 +1265,7 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
} else {
emit(FS_OPCODE_LINTERP, wpos,
- this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
- this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
+ this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
interp_reg(VARYING_SLOT_POS, 2));
}
wpos = offset(wpos, 1);
@@ -1308,8 +1307,7 @@ fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
}
return emit(FS_OPCODE_LINTERP, attr,
- this->delta_x[barycoord_mode],
- this->delta_y[barycoord_mode], interp);
+ this->delta_xy[barycoord_mode], interp);
}
void
@@ -1859,8 +1857,8 @@ fs_visitor::assign_urb_setup()
*/
foreach_block_and_inst(block, fs_inst, inst, cfg) {
if (inst->opcode == FS_OPCODE_LINTERP) {
- assert(inst->src[2].file == HW_REG);
- inst->src[2].fixed_hw_reg.nr += urb_start;
+ assert(inst->src[1].file == HW_REG);
+ inst->src[1].fixed_hw_reg.nr += urb_start;
}
if (inst->opcode == FS_OPCODE_CINTERP) {
@@ -2114,25 +2112,16 @@ fs_visitor::compact_virtual_grfs()
}
}
- /* Patch all the references to delta_x/delta_y, since they're used in
- * register allocation. If they're unused, switch them to BAD_FILE so
- * we don't think some random VGRF is delta_x/delta_y.
+ /* Patch all the references to delta_xy, since they're used in register
+ * allocation. If they're unused, switch them to BAD_FILE so we don't
+ * think some random VGRF is delta_xy.
*/
- for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
- if (delta_x[i].file == GRF) {
- if (remap_table[delta_x[i].reg] != -1) {
- delta_x[i].reg = remap_table[delta_x[i].reg];
+ for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
+ if (delta_xy[i].file == GRF) {
+ if (remap_table[delta_xy[i].reg] != -1) {
+ delta_xy[i].reg = remap_table[delta_xy[i].reg];
} else {
- delta_x[i].file = BAD_FILE;
- }
- }
- }
- for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
- if (delta_y[i].file == GRF) {
- if (remap_table[delta_y[i].reg] != -1) {
- delta_y[i].reg = remap_table[delta_y[i].reg];
- } else {
- delta_y[i].file = BAD_FILE;
+ delta_xy[i].file = BAD_FILE;
}
}
}
@@ -2685,14 +2674,9 @@ fs_visitor::opt_register_renaming()
if (progress) {
invalidate_live_intervals();
- for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
- if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
- delta_x[i].reg = remap[delta_x[i].reg];
- }
- }
- for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
- if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
- delta_y[i].reg = remap[delta_y[i].reg];
+ for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
+ if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
+ delta_xy[i].reg = remap[delta_xy[i].reg];
}
}
}
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index d625d91e60e..24ca43ccdbe 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -514,8 +514,7 @@ public:
fs_reg pixel_y;
fs_reg wpos_w;
fs_reg pixel_w;
- fs_reg delta_x[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
- fs_reg delta_y[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
+ fs_reg delta_xy[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
fs_reg shader_start_time;
fs_reg userplane[MAX_CLIP_PLANES];
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 353f35adda0..495564058e0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -391,12 +391,31 @@ void
fs_generator::generate_linterp(fs_inst *inst,
struct brw_reg dst, struct brw_reg *src)
{
+ /* PLN reads:
+ * / in SIMD16 \
+ * -----------------------------------
+ * | src1+0 | src1+1 | src1+2 | src1+3 |
+ * |-----------------------------------|
+ * |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
+ * -----------------------------------
+ *
+ * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys:
+ *
+ * -----------------------------------
+ * | src1+0 | src1+1 | src1+2 | src1+3 |
+ * |-----------------------------------|
+ * |(x0, x1)|(y0, y1)| | | in SIMD8
+ * |-----------------------------------|
+ * |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
+ * -----------------------------------
+ *
+ * See also: emit_interpolation_setup_gen4().
+ */
struct brw_reg delta_x = src[0];
- struct brw_reg delta_y = src[1];
- struct brw_reg interp = src[2];
+ struct brw_reg delta_y = offset(src[0], dispatch_width / 8);
+ struct brw_reg interp = src[1];
if (brw->has_pln &&
- delta_y.nr == delta_x.nr + 1 &&
(brw->gen >= 7 || (delta_x.nr & 1) == 0)) {
brw_PLN(p, dst, interp, delta_x);
} else {
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 3972581ef12..e1687edb2b1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1482,8 +1482,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
*/
no16("interpolate_at_* not yet supported in SIMD16 mode.");
- fs_reg dst_x = vgrf(2);
- fs_reg dst_y = offset(dst_x, 1);
+ fs_reg dst_xy = vgrf(2);
/* For most messages, we need one reg of ignored data; the hardware
* requires mlen==1 even when there is no payload. in the per-slot
@@ -1495,7 +1494,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
switch (instr->intrinsic) {
case nir_intrinsic_interp_var_at_centroid:
- inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u));
+ inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_xy, src, fs_reg(0u));
break;
case nir_intrinsic_interp_var_at_sample: {
@@ -1503,7 +1502,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
assert(const_sample);
unsigned msg_data = const_sample ? const_sample->i[0] << 4 : 0;
- inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src,
+ inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src,
fs_reg(msg_data));
break;
}
@@ -1515,7 +1514,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf;
unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf;
- inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src,
+ inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src,
fs_reg(off_x | (off_y << 4)));
} else {
src = vgrf(glsl_type::ivec2_type);
@@ -1548,7 +1547,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
}
mlen = 2;
- inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src,
+ inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src,
fs_reg(0u));
}
break;
@@ -1567,7 +1566,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
fs_reg src = interp_reg(instr->variables[0]->var->data.location, j);
src.type = dest.type;
- emit(FS_OPCODE_LINTERP, dest, dst_x, dst_y, src);
+ emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
dest = offset(dest, 1);
}
break;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index 2a4054a29b7..47f5a42b187 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -244,7 +244,7 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
}
assert(reg == ra_reg_count);
- /* Add a special class for aligned pairs, which we'll put delta_x/y
+ /* Add a special class for aligned pairs, which we'll put delta_xy
* in on Gen <= 6 so that we can do PLN.
*/
if (devinfo->has_pln && reg_width == 1 && devinfo->gen <= 6) {
@@ -558,14 +558,14 @@ fs_visitor::assign_regs(bool allow_spilling)
* second operand of a PLN instruction needs to be an
* even-numbered register, so we have a special register class
* wm_aligned_pairs_class to handle this case. pre-GEN6 always
- * uses this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] as the
+ * uses this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] as the
* second operand of a PLN instruction (since it doesn't support
* any other interpolation modes). So all we need to do is find
* that register and set it to the appropriate class.
*/
if (screen->wm_reg_sets[rsi].aligned_pairs_class >= 0 &&
- this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF &&
- this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg == i) {
+ this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF &&
+ this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg == i) {
c = screen->wm_reg_sets[rsi].aligned_pairs_class;
}
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 98c6988f6ad..7fdd4e566fa 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -593,8 +593,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir)
/* 1. collect interpolation factors */
- fs_reg dst_x = vgrf(glsl_type::get_instance(ir->type->base_type, 2, 1));
- fs_reg dst_y = offset(dst_x, 1);
+ fs_reg dst_xy = vgrf(glsl_type::get_instance(ir->type->base_type, 2, 1));
/* for most messages, we need one reg of ignored data; the hardware requires mlen==1
* even when there is no payload. in the per-slot offset case, we'll replace this with
@@ -606,7 +605,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir)
switch (ir->operation) {
case ir_unop_interpolate_at_centroid:
- inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u));
+ inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_xy, src, fs_reg(0u));
break;
case ir_binop_interpolate_at_sample: {
@@ -614,7 +613,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir)
assert(sample_num || !"nonconstant sample number should have been lowered.");
unsigned msg_data = sample_num->value.i[0] << 4;
- inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src, fs_reg(msg_data));
+ inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src, fs_reg(msg_data));
break;
}
@@ -623,7 +622,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir)
if (const_offset) {
unsigned msg_data = pack_pixel_offset(const_offset->value.f[0]) |
(pack_pixel_offset(const_offset->value.f[1]) << 4);
- inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src,
+ inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src,
fs_reg(msg_data));
} else {
/* pack the operands: hw wants offsets as 4 bit signed ints */
@@ -656,7 +655,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir)
}
mlen = 2 * reg_width;
- inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src,
+ inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src,
fs_reg(0u));
}
break;
@@ -678,8 +677,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir)
for (int i = 0; i < ir->type->vector_elements; i++) {
int ch = swiz ? ((*(int *)&swiz->mask) >> 2*i) & 3 : i;
- emit(FS_OPCODE_LINTERP, res,
- dst_x, dst_y,
+ emit(FS_OPCODE_LINTERP, res, dst_xy,
fs_reg(interp_reg(var->data.location, ch)));
res = offset(res, 1);
}
@@ -3443,31 +3441,31 @@ fs_visitor::emit_interpolation_setup_gen4()
fs_reg(brw_imm_v(0x11001100))));
this->current_annotation = "compute pixel deltas from v0";
- if (brw->has_pln) {
- this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
- vgrf(glsl_type::vec2_type);
- this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
- offset(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], 1);
+
+ this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
+ vgrf(glsl_type::vec2_type);
+ const fs_reg &delta_xy = this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC];
+ const fs_reg xstart(negate(brw_vec1_grf(1, 0)));
+ const fs_reg ystart(negate(brw_vec1_grf(1, 1)));
+
+ if (brw->has_pln && dispatch_width == 16) {
+ emit(ADD(half(offset(delta_xy, 0), 0), half(this->pixel_x, 0), xstart));
+ emit(ADD(half(offset(delta_xy, 0), 1), half(this->pixel_y, 0), ystart));
+ emit(ADD(half(offset(delta_xy, 1), 0), half(this->pixel_x, 1), xstart))
+ ->force_sechalf = true;
+ emit(ADD(half(offset(delta_xy, 1), 1), half(this->pixel_y, 1), ystart))
+ ->force_sechalf = true;
} else {
- this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
- vgrf(glsl_type::float_type);
- this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
- vgrf(glsl_type::float_type);
+ emit(ADD(offset(delta_xy, 0), this->pixel_x, xstart));
+ emit(ADD(offset(delta_xy, 1), this->pixel_y, ystart));
}
- emit(ADD(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
- this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0)))));
- emit(ADD(this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
- this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1)))));
this->current_annotation = "compute pos.w and 1/pos.w";
/* Compute wpos.w. It's always in our setup, since it's needed to
* interpolate the other attributes.
*/
this->wpos_w = vgrf(glsl_type::float_type);
- emit(FS_OPCODE_LINTERP, wpos_w,
- this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
- this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
- interp_reg(VARYING_SLOT_POS, 3));
+ emit(FS_OPCODE_LINTERP, wpos_w, delta_xy, interp_reg(VARYING_SLOT_POS, 3));
/* Compute the pixel 1/W value from wpos.w. */
this->pixel_w = vgrf(glsl_type::float_type);
emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
@@ -3509,8 +3507,7 @@ fs_visitor::emit_interpolation_setup_gen6()
for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
uint8_t reg = payload.barycentric_coord_reg[i];
- this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0));
- this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0));
+ this->delta_xy[i] = fs_reg(brw_vec16_grf(reg, 0));
}
this->current_annotation = NULL;
diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h
index 3a50e864aad..1b2bb107f07 100644
--- a/src/mesa/drivers/dri/i965/brw_reg.h
+++ b/src/mesa/drivers/dri/i965/brw_reg.h
@@ -704,6 +704,13 @@ brw_vec8_grf(unsigned nr, unsigned subnr)
return brw_vec8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
}
+/** Construct float[16] general-purpose register */
+static inline struct brw_reg
+brw_vec16_grf(unsigned nr, unsigned subnr)
+{
+ return brw_vec16_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+}
+
static inline struct brw_reg
brw_uw8_grf(unsigned nr, unsigned subnr)