summary | refs | log | tree | commit | diff | stats
path: root/src/mesa
diff options
context:
space:
mode:
author: Eric Anholt <[email protected]> 2012-11-08 16:06:24 -0800
committer: Eric Anholt <[email protected]> 2012-12-04 17:11:11 -0800
commit: a64c1eb9b110f29b8abf803a8256306702629bdc (patch)
tree: 25c2e5ec6a799f667e388412add80007ebde3c1e /src/mesa
parent: 67d9e7b5816a7643220c854ea1ed330f8363a3e0 (diff)
i965/fs: Add support for uniform array access with a variable index.
Serious Sam 3 had a shader hitting this path, but it's used rarely, so it didn't show a significant performance difference (n=7). It does reduce compile time massively, though -- one shader goes from 14s of compile time and 11723 instructions generated to 0.44s and 499 instructions.

Note that some shaders lose 16-wide mode, because we don't support 16-wide together with pull constants at the moment (generally, these are things looping over a few-element array where the loop isn't getting unrolled). Given that those shaders are being generated with 15-20% fewer instructions, that probably outweighs the loss of 16-wide.
Diffstat (limited to 'src/mesa')
-rw-r--r-- src/mesa/drivers/dri/i965/brw_fs.cpp 162
-rw-r--r-- src/mesa/drivers/dri/i965/brw_fs.h 8
-rw-r--r-- src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 68
-rw-r--r-- src/mesa/drivers/dri/i965/brw_shader.cpp 2
4 files changed, 216 insertions, 24 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index d0260af3e21..9ed91632cbe 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -219,6 +219,45 @@ fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
return inst;
}
+/**
+ * Builds the instruction sequence for a pull constant load whose offset is
+ * supplied in a register rather than as an immediate.
+ *
+ * \param dst         register that receives the loaded value
+ * \param surf_index  surface index of the constant buffer to read from
+ * \param offset      register holding the dword offset of the constant
+ *
+ * \return the list of generated instructions.  Nothing is added to the
+ *         program here; the caller decides where the list is inserted.
+ */
+exec_list
+fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
+                                       fs_reg offset)
+{
+   exec_list instructions;
+   fs_inst *inst;
+
+   if (intel->gen >= 7) {
+      /* The gen7 opcode takes the offset register directly as a source. */
+      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
+                                  dst, surf_index, offset);
+      instructions.push_tail(inst);
+   } else {
+      int base_mrf = 13;
+      bool header_present = true;
+
+      /* The offset is written to the MRF that follows the header. */
+      fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
+      mrf.type = BRW_REGISTER_TYPE_D;
+
+      /* On gen6 we want the dword offset passed in, but on gen4/5 we need a
+       * dword-aligned byte offset.
+       */
+      if (intel->gen == 6) {
+         instructions.push_tail(MOV(mrf, offset));
+      } else {
+         instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
+      }
+
+      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
+                                  dst, surf_index);
+      inst->header_present = header_present;
+      inst->base_mrf = base_mrf;
+      /* Message length: the header plus one register per 8 channels. */
+      inst->mlen = header_present + dispatch_width / 8;
+
+      instructions.push_tail(inst);
+   }
+
+   return instructions;
+}
+
bool
fs_inst::equals(fs_inst *inst)
{
@@ -365,6 +404,7 @@ fs_reg::equals(const fs_reg &r) const
type == r.type &&
negate == r.negate &&
abs == r.abs &&
+ !reladdr && !r.reladdr &&
memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
sizeof(fixed_hw_reg)) == 0 &&
smear == r.smear &&
@@ -1387,6 +1427,81 @@ fs_visitor::remove_dead_constants()
return true;
}
+/**
+ * Rewrites every variable-indexed (reladdr) read of a uniform array into
+ * a VARYING_PULL_CONSTANT_LOAD from the pull constant buffer.
+ *
+ * Temporary GRF arrays get no such treatment, because relative
+ * addressing of instruction destinations is difficult.  Uniform arrays
+ * that were loaded into GRF space as push constants could potentially be
+ * indexed in place, but in the real-world usage seen so far the arrays
+ * are always larger than what could be loaded as push constants, so all
+ * variable-indexed uniform array access is simply moved out to a pull
+ * constant buffer.
+ */
+void
+fs_visitor::move_uniform_array_access_to_pull_constants()
+{
+   /* For each uniform, the position in pull_param[] where its array copy
+    * starts, or -1 while no copy has been made.
+    */
+   int pull_constant_loc[c->prog_data.nr_params];
+
+   for (unsigned int p = 0; p < c->prog_data.nr_params; p++)
+      pull_constant_loc[p] = -1;
+
+   /* Scan all instructions for uniform sources carrying a reladdr, and
+    * place a copy of the addressed uniform array in the pull constant
+    * buffer.
+    *
+    * Constant-indexed array accesses are deliberately left alone.  No
+    * testing has been done of the performance impact of this choice.
+    */
+   foreach_list_safe(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
+
+      for (int s = 0; s < 3; s++) {
+         fs_reg &src = inst->src[s];
+
+         if (src.file != UNIFORM || src.reladdr == NULL)
+            continue;
+
+         const int uniform = src.reg;
+
+         /* First variable-indexed access to this array: append the whole
+          * array to the pull constant buffer.
+          */
+         if (pull_constant_loc[uniform] == -1) {
+            assert(param_size[uniform]);
+
+            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
+
+            const float **values = &c->prog_data.param[uniform];
+            for (int j = 0; j < param_size[uniform]; j++) {
+               c->prog_data.pull_param[c->prog_data.nr_pull_params] =
+                  values[j];
+               c->prog_data.nr_pull_params++;
+            }
+         }
+
+         /* Make the instructions generated below inherit the IR and
+          * annotation of the instruction being rewritten.
+          */
+         base_ir = inst->ir;
+         current_annotation = inst->annotation;
+
+         /* offset = variable index + start of the array in the pull
+          * buffer + constant offset of the access.
+          */
+         fs_reg offset = fs_reg(this, glsl_type::int_type);
+         fs_reg array_base = fs_reg(pull_constant_loc[uniform] +
+                                    src.reg_offset);
+         inst->insert_before(ADD(offset, *src.reladdr, array_base));
+
+         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
+         fs_reg temp = fs_reg(this, glsl_type::float_type);
+         exec_list load = VARYING_PULL_CONSTANT_LOAD(temp, surf_index,
+                                                     offset);
+         inst->insert_before(&load);
+
+         /* Point the source at the freshly loaded temporary instead. */
+         src.file = temp.file;
+         src.reg = temp.reg;
+         src.reg_offset = temp.reg_offset;
+         src.reladdr = NULL;
+      }
+   }
+}
+
/**
* Choose accesses from the UNIFORM file to demote to using the pull
* constant buffer.
@@ -1413,8 +1528,31 @@ fs_visitor::setup_pull_constants()
/* Just demote the end of the list. We could probably do better
* here, demoting things that are rarely used in the program first.
*/
- int pull_uniform_base = max_uniform_components;
- int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
+ unsigned int pull_uniform_base = max_uniform_components;
+
+ int pull_constant_loc[c->prog_data.nr_params];
+ for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
+ if (i < pull_uniform_base) {
+ pull_constant_loc[i] = -1;
+ } else {
+ pull_constant_loc[i] = -1;
+ /* If our constant is already being uploaded for reladdr purposes,
+ * reuse it.
+ */
+ for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
+ if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
+ pull_constant_loc[i] = j;
+ break;
+ }
+ }
+ if (pull_constant_loc[i] == -1) {
+ int pull_index = c->prog_data.nr_pull_params++;
+ c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
+ pull_constant_loc[i] = pull_index;;
+ }
+ }
+ }
+ c->prog_data.nr_params = pull_uniform_base;
foreach_list(node, &this->instructions) {
fs_inst *inst = (fs_inst *)node;
@@ -1423,14 +1561,16 @@ fs_visitor::setup_pull_constants()
if (inst->src[i].file != UNIFORM)
continue;
- int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
- if (uniform_nr < pull_uniform_base)
+ int pull_index = pull_constant_loc[inst->src[i].reg +
+ inst->src[i].reg_offset];
+ if (pull_index == -1)
continue;
+ assert(!inst->src[i].reladdr);
+
fs_reg dst = fs_reg(this, glsl_type::float_type);
fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
- fs_reg offset = fs_reg((unsigned)(((uniform_nr -
- pull_uniform_base) * 4) & ~15));
+ fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
fs_inst *pull =
new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
dst, index, offset);
@@ -1444,15 +1584,9 @@ fs_visitor::setup_pull_constants()
inst->src[i].file = GRF;
inst->src[i].reg = dst.reg;
inst->src[i].reg_offset = 0;
- inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
+ inst->src[i].smear = pull_index & 3;
}
}
-
- for (int i = 0; i < pull_uniform_count; i++) {
- c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
- }
- c->prog_data.nr_params -= pull_uniform_count;
- c->prog_data.nr_pull_params = pull_uniform_count;
}
bool
@@ -2082,6 +2216,7 @@ fs_visitor::get_instruction_generating_reg(fs_inst *start,
end->predicate ||
end->force_uncompressed ||
end->force_sechalf ||
+ reg.reladdr ||
!reg.equals(end->dst)) {
return NULL;
} else {
@@ -2188,6 +2323,7 @@ fs_visitor::run()
split_virtual_grfs();
setup_paramvalues_refs();
+ move_uniform_array_access_to_pull_constants();
setup_pull_constants();
bool progress;
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index ec4d486b682..00ba3349bf7 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -121,6 +121,8 @@ public:
uint32_t u;
float f;
} imm;
+
+ fs_reg *reladdr;
};
static const fs_reg reg_undef;
@@ -227,6 +229,7 @@ public:
fs_inst *emit(fs_inst inst);
fs_inst *emit(fs_inst *inst);
+ void emit(exec_list list);
fs_inst *emit(enum opcode opcode);
fs_inst *emit(enum opcode opcode, fs_reg dst);
@@ -261,6 +264,9 @@ public:
fs_inst *end,
fs_reg reg);
+ exec_list VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
+ fs_reg offset);
+
bool run();
void setup_payload_gen4();
void setup_payload_gen6();
@@ -278,6 +284,7 @@ public:
void spill_reg(int spill_reg);
void split_virtual_grfs();
void compact_virtual_grfs();
+ void move_uniform_array_access_to_pull_constants();
void setup_pull_constants();
void calculate_live_intervals();
bool opt_algebraic();
@@ -381,6 +388,7 @@ public:
*/
int param_index[MAX_UNIFORMS * 4];
int param_offset[MAX_UNIFORMS * 4];
+ int param_size[MAX_UNIFORMS * 4];
int *virtual_grf_sizes;
int virtual_grf_count;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 9fe1b8629ee..f1c6860fe47 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -117,6 +117,7 @@ fs_visitor::visit(ir_variable *ir)
return;
}
+ param_size[param_index] = type_size(ir->type);
if (!strncmp(ir->name, "gl_", 3)) {
setup_builtin_uniform_values(ir);
} else {
@@ -160,21 +161,41 @@ fs_visitor::visit(ir_dereference_record *ir)
void
fs_visitor::visit(ir_dereference_array *ir)
{
- ir_constant *index;
- int element_size;
+ ir_constant *constant_index;
+ fs_reg src;
+ int element_size = type_size(ir->type);
- ir->array->accept(this);
- index = ir->array_index->as_constant();
+ constant_index = ir->array_index->as_constant();
- element_size = type_size(ir->type);
- this->result.type = brw_type_for_base_type(ir->type);
+ ir->array->accept(this);
+ src = this->result;
+ src.type = brw_type_for_base_type(ir->type);
- if (index) {
- assert(this->result.file == UNIFORM || this->result.file == GRF);
- this->result.reg_offset += index->value.i[0] * element_size;
+ if (constant_index) {
+ assert(src.file == UNIFORM || src.file == GRF);
+ src.reg_offset += constant_index->value.i[0] * element_size;
} else {
- assert(!"FINISHME: non-constant array element");
+ /* Variable index array dereference. We attach the variable index
+ * component to the reg as a pointer to a register containing the
+ * offset. Currently only uniform arrays are supported in this patch,
+ * and that reladdr pointer is resolved by
+ * move_uniform_array_access_to_pull_constants(). All other array types
+ * are lowered by lower_variable_index_to_cond_assign().
+ */
+ ir->array_index->accept(this);
+
+ fs_reg index_reg;
+ index_reg = fs_reg(this, glsl_type::int_type);
+ emit(BRW_OPCODE_MUL, index_reg, this->result, fs_reg(element_size));
+
+ if (src.reladdr) {
+ emit(BRW_OPCODE_ADD, index_reg, *src.reladdr, index_reg);
+ }
+
+ src.reladdr = ralloc(mem_ctx, fs_reg);
+ memcpy(src.reladdr, &index_reg, sizeof(index_reg));
}
+ this->result = src;
}
void
@@ -586,6 +607,21 @@ fs_visitor::visit(ir_expression *ir)
*/
assert(packed_consts.smear < 8);
}
+ } else {
+ /* Turn the byte offset into a dword offset. */
+ fs_reg base_offset = fs_reg(this, glsl_type::int_type);
+ emit(SHR(base_offset, op[1], fs_reg(2)));
+
+ for (int i = 0; i < ir->type->vector_elements; i++) {
+ fs_reg offset = fs_reg(this, glsl_type::int_type);
+ emit(ADD(offset, base_offset, fs_reg(i)));
+ emit(VARYING_PULL_CONSTANT_LOAD(result, surf_index, offset));
+
+ if (ir->type->base_type == GLSL_TYPE_BOOL)
+ emit(CMP(result, result, fs_reg(0), BRW_CONDITIONAL_NZ));
+
+ result.reg_offset++;
+ }
}
result.reg_offset = 0;
@@ -1792,6 +1828,16 @@ fs_visitor::emit(fs_inst *inst)
return inst;
}
+/**
+ * Emits every instruction held in \p list, in list order, by passing each
+ * one to emit(fs_inst *).
+ */
+void
+fs_visitor::emit(exec_list list)
+{
+   /* The _safe iterator is required: each node is unlinked from the
+    * list while the list is being walked.
+    */
+   foreach_list_safe(entry, &list) {
+      fs_inst *pending = (fs_inst *)entry;
+
+      pending->remove();
+      emit(pending);
+   }
+}
+
/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
void
fs_visitor::emit_dummy_fs()
@@ -2220,6 +2266,8 @@ fs_visitor::fs_visitor(struct brw_context *brw,
this->force_uncompressed_stack = 0;
this->force_sechalf_stack = 0;
+
+ memset(&this->param_size, 0, sizeof(this->param_size));
}
fs_visitor::~fs_visitor()
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 005779da9d9..add3d960267 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -140,7 +140,7 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg)
bool input = true;
bool output = stage == MESA_SHADER_FRAGMENT;
bool temp = stage == MESA_SHADER_FRAGMENT;
- bool uniform = stage == MESA_SHADER_FRAGMENT;
+ bool uniform = false;
bool lowered_variable_indexing =
lower_variable_index_to_cond_assign(shader->ir,