summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Eric Anholt <[email protected]> 2012-12-05 16:19:43 -0800
committer: Eric Anholt <[email protected]> 2012-12-14 15:18:22 -0800
commit: 2cae9f2d4a57dc9cca934b489df43f0ec4eb98bc (patch)
tree: 5408b3b63efbde631a21025816275833f8d2aaab
parent: 4df1e18864dc6b7830bb3c7998889883fe8dae2b (diff)
i965/fs: Add empirically-determined instruction latencies for gen7.
v2: Actually switch on the other math instructions mentioned in the comment.
v3: Add timing data for textureSize(), and clean up some long comment lines.

Testing shader_time of fs16 shaders on a few frames of various apps:
nexuiz improved by 2.9% +/- 1.5% (n=10)
no difference on GLB2.5 (n=36, outliers removed)
no difference on GLB2.7 (n=25)
etqw improved by 2.6% +/- 2.2% (n=25)
no difference on lightsmark (n=25)

Acked-by: Kenneth Graunke <[email protected]>
-rw-r--r-- src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp 182
1 files changed, 179 insertions, 3 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
index 458854cdeb7..3fbca6c4a13 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
@@ -57,7 +57,7 @@ static bool debug = false;
class schedule_node : public exec_node
{
public:
- schedule_node(fs_inst *inst)
+ schedule_node(fs_inst *inst, int gen)
{
this->inst = inst;
this->child_array_size = 0;
@@ -67,10 +67,14 @@ public:
this->parent_count = 0;
this->unblocked_time = 0;
- set_latency_gen4();
+ if (gen >= 7)
+ set_latency_gen7();
+ else
+ set_latency_gen4();
}
void set_latency_gen4();
+ void set_latency_gen7();
fs_inst *inst;
schedule_node **children;
@@ -120,6 +124,178 @@ schedule_node::set_latency_gen4()
}
}
+void
+schedule_node::set_latency_gen7()
+{ /* Sets this node's latency from inst->opcode, using the gen7 timings quoted in the comments below. */
+ switch (inst->opcode) {
+ case BRW_OPCODE_MAD:
+ /* 3 cycles (this is said to be 4 cycles sometimes depending on the
+ * register numbers in the sources):
+ * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
+ *
+ * 20 cycles:
+ * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
+ * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
+ *
+ * The latency used below matches the difference of the two: 20 - 3 = 17.
+ */
+ latency = 17;
+ break;
+
+ case SHADER_OPCODE_RCP:
+ case SHADER_OPCODE_RSQ:
+ case SHADER_OPCODE_SQRT:
+ case SHADER_OPCODE_LOG2:
+ case SHADER_OPCODE_EXP2:
+ case SHADER_OPCODE_SIN:
+ case SHADER_OPCODE_COS:
+ /* 2 cycles:
+ * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
+ *
+ * 18 cycles:
+ * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
+ * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
+ *
+ * Same for exp2, log2, rsq, sqrt, sin, cos.
+ *
+ * The latency used below matches the difference of the two: 18 - 2 = 16.
+ */
+ latency = 16;
+ break;
+
+ case SHADER_OPCODE_POW:
+ /* 2 cycles:
+ * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
+ *
+ * 26 cycles:
+ * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
+ * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
+ *
+ * The latency used below matches the difference of the two: 26 - 2 = 24.
+ */
+ latency = 24;
+ break;
+
+ case SHADER_OPCODE_TEX:
+ case SHADER_OPCODE_TXD:
+ case SHADER_OPCODE_TXF:
+ case SHADER_OPCODE_TXL:
+ /* 18 cycles:
+ * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
+ * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
+ * send(8) g4<1>UW g114<8,8,1>F
+ * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
+ *
+ * 697 +/- 49 cycles (min 610, n=26):
+ * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
+ * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
+ * send(8) g4<1>UW g114<8,8,1>F
+ * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
+ * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
+ *
+ * So our first texture load of the batchbuffer takes ~700 cycles,
+ * since the caches are cold at that point.
+ *
+ * 840 +/- 92 cycles (min 720, n=25):
+ * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
+ * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
+ * send(8) g4<1>UW g114<8,8,1>F
+ * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
+ * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
+ * send(8) g4<1>UW g114<8,8,1>F
+ * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
+ * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
+ *
+ * On the second load, it takes just an extra ~140 cycles, and after
+ * accounting for the 14 cycles of the MOV's latency, that makes ~130.
+ *
+ * 683 +/- 49 cycles (min 602, n=47):
+ * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
+ * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
+ * send(8) g4<1>UW g114<8,8,1>F
+ * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
+ * send(8) g50<1>UW g114<8,8,1>F
+ * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
+ * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
+ *
+ * The unit appears to be pipelined, since this matches up with the
+ * cache-cold case, despite there being two loads here. If you replace
+ * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
+ *
+ * So, take some number between the cache-hot 140 cycles and the
+ * cache-cold 700 cycles. No particular tuning was done on this.
+ *
+ * I haven't done significant testing of the non-TEX opcodes. TXL at
+ * least looked about the same as TEX.
+ *
+ * NOTE(review): any sampler opcode that is not listed in this case
+ * falls through to the default latency at the bottom of the switch --
+ * confirm that the list above is complete.
+ */
+ latency = 200;
+ break;
+
+ case SHADER_OPCODE_TXS:
+ /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
+ * cycles (n=15):
+ * mov(8) g114<1>UD 0D { align1 WE_normal 1Q };
+ * send(8) g6<1>UW g114<8,8,1>F
+ * sampler (10, 0, 10, 1) mlen 1 rlen 4 { align1 WE_normal 1Q };
+ * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1Q };
+ *
+ *
+ * Two loads was 535 +/- 30 cycles (n=19):
+ * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
+ * send(16) g6<1>UW g114<8,8,1>F
+ * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
+ * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
+ * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1H };
+ * send(16) g8<1>UW g114<8,8,1>F
+ * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
+ * mov(16) g8<1>F g8<8,8,1>D { align1 WE_normal 1H };
+ * add(16) g6<1>F g6<8,8,1>F g8<8,8,1>F { align1 WE_normal 1H };
+ *
+ * Since the only caches that should matter are just the
+ * instruction/state cache containing the surface state, assume that we
+ * always have hot caches.
+ */
+ latency = 100;
+ break;
+
+ case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
+ case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+ /* Testing using varying-index pull constants:
+ *
+ * 16 cycles:
+ * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
+ * send(8) g4<1>F g4<8,8,1>D
+ * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
+ *
+ * ~480 cycles:
+ * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
+ * send(8) g4<1>F g4<8,8,1>D
+ * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
+ * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
+ *
+ * ~620 cycles:
+ * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
+ * send(8) g4<1>F g4<8,8,1>D
+ * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
+ * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
+ * send(8) g4<1>F g4<8,8,1>D
+ * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
+ * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
+ *
+ * So, if it's cache-hot, it's about 140 (620 - 480). If it's cache
+ * cold, it's about 460 (480 - 16). We expect to mostly be cache hot, so
+ * pick something more in that direction.
+ *
+ * NOTE(review): the uniform pull constant load shares this value, but
+ * only the varying-index case is shown as measured here -- confirm.
+ */
+ latency = 200;
+ break;
+
+ default:
+ /* 2 cycles:
+ * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
+ *
+ * 16 cycles:
+ * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
+ * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
+ *
+ * The latency used below matches the difference of the two: 16 - 2 = 14.
+ */
+ latency = 14;
+ break;
+ }
+}
+
class instruction_scheduler {
public:
instruction_scheduler(fs_visitor *v, void *mem_ctx, int grf_count,
@@ -159,7 +335,7 @@ public:
void
instruction_scheduler::add_inst(fs_inst *inst)
{
- schedule_node *n = new(mem_ctx) schedule_node(inst);
+ schedule_node *n = new(mem_ctx) schedule_node(inst, v->intel->gen);
assert(!inst->is_head_sentinel());
assert(!inst->is_tail_sentinel());