From 80c5062abfdef28e23615f44b214760449f6a582 Mon Sep 17 00:00:00 2001 From: Glenn Kennard Date: Mon, 21 Sep 2015 16:21:37 +0200 Subject: r600g/sb: Support gs5 sampler indexing (v2) [airlied: v2 cayman fixups] Signed-off-by: Glenn Kennard Signed-off-by: Dave Airlie --- src/gallium/drivers/r600/r600_shader.c | 12 ++- src/gallium/drivers/r600/r600_shader.h | 4 +- src/gallium/drivers/r600/sb/sb_bc.h | 10 ++- src/gallium/drivers/r600/sb/sb_bc_dump.cpp | 17 +++- src/gallium/drivers/r600/sb/sb_bc_finalize.cpp | 3 +- src/gallium/drivers/r600/sb/sb_bc_parser.cpp | 40 ++++++++- src/gallium/drivers/r600/sb/sb_gcm.cpp | 11 ++- src/gallium/drivers/r600/sb/sb_sched.cpp | 118 +++++++++++++++++++++++-- src/gallium/drivers/r600/sb/sb_sched.h | 5 +- 9 files changed, 195 insertions(+), 25 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 1d905822cde..24c3d43b0fa 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -166,8 +166,8 @@ int r600_pipe_shader_create(struct pipe_context *ctx, if (rctx->b.chip_class <= R700) { use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_GEOMETRY); } - /* disable SB for shaders using CF_INDEX_0/1 (sampler/ubo array indexing) as it doesn't handle those currently */ - use_sb &= !shader->shader.uses_index_registers; + /* disable SB for shaders using ubo array indexing as it doesn't handle those currently */ + use_sb &= !shader->shader.uses_ubo_indexing; /* disable SB for shaders using doubles */ use_sb &= !shader->shader.uses_doubles; @@ -1251,7 +1251,7 @@ static int tgsi_split_constant(struct r600_shader_ctx *ctx) } if (ctx->src[i].kc_rel) - ctx->shader->uses_index_registers = true; + ctx->shader->uses_ubo_indexing = true; if (ctx->src[i].rel) { int chan = inst->Src[i].Indirect.Swizzle; @@ -1912,7 +1912,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, shader->uses_doubles = ctx.info.uses_doubles; - 
indirect_gprs = ctx.info.indirect_files & ~(1 << TGSI_FILE_CONSTANT); + indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER)); tgsi_parse_init(&ctx.parse, tokens); ctx.type = ctx.info.processor; shader->processor_type = ctx.type; @@ -1936,7 +1936,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.gs_next_vertex = 0; ctx.gs_stream_output_info = &so; - shader->uses_index_registers = false; + shader->uses_ubo_indexing = false; ctx.face_gpr = -1; ctx.fixed_pt_position_gpr = -1; ctx.fragcoord_input = -1; @@ -5703,8 +5703,6 @@ static int tgsi_tex(struct r600_shader_ctx *ctx) sampler_src_reg = 3; sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE - if (sampler_index_mode) - ctx->shader->uses_index_registers = true; src_gpr = tgsi_tex_get_src_gpr(ctx, 0); diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h index 48de9cdb156..8ba32ae4999 100644 --- a/src/gallium/drivers/r600/r600_shader.h +++ b/src/gallium/drivers/r600/r600_shader.h @@ -75,8 +75,8 @@ struct r600_shader { boolean has_txq_cube_array_z_comp; boolean uses_tex_buffers; boolean gs_prim_id_input; - /* Temporarily workaround SB not handling CF_INDEX_[01] index registers */ - boolean uses_index_registers; + /* Temporarily workaround SB not handling ubo indexing */ + boolean uses_ubo_indexing; /* Size in bytes of a data item in the ring(s) (single vertex data). Stages with only one ring items 123 will be set to 0. 
*/ diff --git a/src/gallium/drivers/r600/sb/sb_bc.h b/src/gallium/drivers/r600/sb/sb_bc.h index ab988f8716d..126750d5c7e 100644 --- a/src/gallium/drivers/r600/sb/sb_bc.h +++ b/src/gallium/drivers/r600/sb/sb_bc.h @@ -48,6 +48,7 @@ class fetch_node; class alu_group_node; class region_node; class shader; +class value; class sb_ostream { public: @@ -818,13 +819,16 @@ class bc_parser { bool gpr_reladdr; + // Note: currently relies on input emitting SET_CF in same basic block as uses + value *cf_index_value[2]; + alu_node *mova; public: bc_parser(sb_context &sctx, r600_bytecode *bc, r600_shader* pshader) : ctx(sctx), dec(), bc(bc), pshader(pshader), dw(), bc_ndw(), max_cf(), sh(), error(), slots(), cgroup(), - cf_map(), loop_stack(), gpr_reladdr() { } + cf_map(), loop_stack(), gpr_reladdr(), cf_index_value(), mova() { } int decode(); int prepare(); @@ -852,6 +856,10 @@ private: int prepare_loop(cf_node *c); int prepare_if(cf_node *c); + void save_set_cf_index(value *val, unsigned idx); + value *get_cf_index_value(unsigned idx); + void save_mova(alu_node *mova); + alu_node *get_mova(); }; diff --git a/src/gallium/drivers/r600/sb/sb_bc_dump.cpp b/src/gallium/drivers/r600/sb/sb_bc_dump.cpp index 0fc73c419a6..3c70ea7cd3d 100644 --- a/src/gallium/drivers/r600/sb/sb_bc_dump.cpp +++ b/src/gallium/drivers/r600/sb/sb_bc_dump.cpp @@ -27,6 +27,7 @@ #include "sb_bc.h" #include "sb_shader.h" #include "sb_pass.h" +#include "eg_sq.h" // V_SQ_CF_INDEX_0/1 namespace r600_sb { @@ -354,6 +355,14 @@ void bc_dump::dump(alu_node& n) { s << " " << vec_bs[n.bc.bank_swizzle]; } + if (ctx.is_cayman()) { + if (n.bc.op == ALU_OP1_MOVA_INT) { + static const char *mova_str[] = { " AR_X", " PC", " CF_IDX0", " CF_IDX1", + " Unknown MOVA_INT dest" }; + s << mova_str[std::min(n.bc.dst_gpr, 4u)]; // CM_V_SQ_MOVA_DST_AR_* + } + } + sblog << s.str() << "\n"; } @@ -450,9 +459,9 @@ void bc_dump::dump(fetch_node& n) { if (n.bc.fetch_whole_quad) s << " FWQ"; if (ctx.is_egcm() && n.bc.resource_index_mode) - s << 
" RIM:SQ_CF_INDEX_" << n.bc.resource_index_mode; + s << " RIM:SQ_CF_INDEX_" << (n.bc.resource_index_mode - V_SQ_CF_INDEX_0); if (ctx.is_egcm() && n.bc.sampler_index_mode) - s << " SID:SQ_CF_INDEX_" << n.bc.sampler_index_mode; + s << " SID:SQ_CF_INDEX_" << (n.bc.sampler_index_mode - V_SQ_CF_INDEX_0); s << " UCF:" << n.bc.use_const_fields << " FMT(DTA:" << n.bc.data_format @@ -470,9 +479,9 @@ void bc_dump::dump(fetch_node& n) { if (n.bc.offset[k]) s << " O" << chans[k] << ":" << n.bc.offset[k]; if (ctx.is_egcm() && n.bc.resource_index_mode) - s << " RIM:SQ_CF_INDEX_" << n.bc.resource_index_mode; + s << " RIM:SQ_CF_INDEX_" << (n.bc.resource_index_mode - V_SQ_CF_INDEX_0); if (ctx.is_egcm() && n.bc.sampler_index_mode) - s << " SID:SQ_CF_INDEX_" << n.bc.sampler_index_mode; + s << " SID:SQ_CF_INDEX_" << (n.bc.sampler_index_mode - V_SQ_CF_INDEX_0); } sblog << s.str() << "\n"; diff --git a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp index 522ff9d956e..193ade8a661 100644 --- a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp +++ b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp @@ -303,7 +303,8 @@ void bc_finalizer::finalize_alu_group(alu_group_node* g, node *prev_node) { assert(fdst.chan() == slot || slot == SLOT_TRANS); } - n->bc.dst_gpr = fdst.sel(); + if (!(n->bc.op_ptr->flags & AF_MOVA && ctx.is_cayman())) + n->bc.dst_gpr = fdst.sel(); n->bc.dst_chan = d ? fdst.chan() : slot < SLOT_TRANS ? 
slot : 0; diff --git a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp index 19bd0784a61..7f712b451c9 100644 --- a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp +++ b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp @@ -34,6 +34,7 @@ #include "r600_pipe.h" #include "r600_shader.h" +#include "eg_sq.h" // CM_V_SQ_MOVA_DST_CF_IDX0/1 #include <stack> @@ -121,7 +122,7 @@ int bc_parser::parse_decls() { return 0; } - if (pshader->indirect_files & ~(1 << TGSI_FILE_CONSTANT)) { + if (pshader->indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER))) { assert(pshader->num_arrays); @@ -328,6 +329,28 @@ int bc_parser::prepare_alu_clause(cf_node* cf) { return 0; } +void bc_parser::save_set_cf_index(value *val, unsigned idx) +{ + assert(idx <= 1); + assert(val); + cf_index_value[idx] = val; +} +value *bc_parser::get_cf_index_value(unsigned idx) +{ + assert(idx <= 1); + return cf_index_value[idx]; +} +void bc_parser::save_mova(alu_node *mova) +{ + assert(mova); + this->mova = mova; +} +alu_node *bc_parser::get_mova() +{ + assert(mova); + return mova; +} + int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) { alu_node *n; @@ -375,9 +398,14 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) { n->dst.resize(1); } - if (flags & AF_MOVA) { + if (n->bc.op == ALU_OP0_SET_CF_IDX0 || n->bc.op == ALU_OP0_SET_CF_IDX1) { + // Move CF_IDX value into tex instruction operands, scheduler will later re-emit setting of CF_IDX + // DCE will kill this op + save_set_cf_index(get_mova()->src[0], n->bc.op == ALU_OP0_SET_CF_IDX1); + } else if (flags & AF_MOVA) { n->dst[0] = sh->get_special_value(SV_AR_INDEX); + save_mova(n); n->flags |= NF_DONT_HOIST; @@ -469,6 +497,10 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) { } } } + if ((n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX0 || n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX1) && + ctx.is_cayman()) + // Move CF_IDX value into tex instruction operands, scheduler will 
later re-emit setting of CF_IDX + save_set_cf_index(n->src[0], n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX1); } // pack multislot instructions into alu_packed_node @@ -608,6 +640,10 @@ int bc_parser::prepare_fetch_clause(cf_node *cf) { n->bc.src_sel[s], false); } + // Scheduler will emit the appropriate instructions to set CF_IDX0/1 + if (n->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE) { + n->src.push_back(get_cf_index_value(n->bc.sampler_index_mode == V_SQ_CF_INDEX_1)); + } } } diff --git a/src/gallium/drivers/r600/sb/sb_gcm.cpp b/src/gallium/drivers/r600/sb/sb_gcm.cpp index bccb6713967..236b2ea0031 100644 --- a/src/gallium/drivers/r600/sb/sb_gcm.cpp +++ b/src/gallium/drivers/r600/sb/sb_gcm.cpp @@ -37,6 +37,7 @@ #include "sb_bc.h" #include "sb_shader.h" #include "sb_pass.h" +#include "eg_sq.h" // V_SQ_CF_INDEX_NONE namespace r600_sb { @@ -406,6 +407,14 @@ void gcm::bu_sched_bb(bb_node* bb) { ncnt = 3; } + bool sampler_indexing = false; + if (n->is_fetch_inst() && + static_cast<fetch_node*>(n)->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE) + { + sampler_indexing = true; // Give sampler indexed ops get their own clause + ncnt = sh.get_ctx().is_cayman() ? 2 : 3; // MOVA + SET_CF_IDX0/1 + } + if ((sq == SQ_TEX || sq == SQ_VTX) && ((last_count >= ctx.max_fetch/2 && check_alu_ready_count(24)) || @@ -418,7 +427,7 @@ void gcm::bu_sched_bb(bb_node* bb) { bu_ready[sq].pop_front(); if (sq != SQ_CF) { - if (!clause) { + if (!clause || sampler_indexing) { clause = sh.create_clause(sq == SQ_ALU ? NST_ALU_CLAUSE : sq == SQ_TEX ? 
NST_TEX_CLAUSE : diff --git a/src/gallium/drivers/r600/sb/sb_sched.cpp b/src/gallium/drivers/r600/sb/sb_sched.cpp index c98b8fff764..601445f7dc3 100644 --- a/src/gallium/drivers/r600/sb/sb_sched.cpp +++ b/src/gallium/drivers/r600/sb/sb_sched.cpp @@ -36,6 +36,7 @@ #include "sb_shader.h" #include "sb_pass.h" #include "sb_sched.h" +#include "eg_sq.h" // V_SQ_CF_INDEX_NONE/0/1 namespace r600_sb { @@ -781,7 +782,14 @@ void post_scheduler::schedule_bb(bb_node* bb) { sblog << "\n"; ); - if (n->subtype == NST_ALU_CLAUSE) { + // May require emitting ALU ops to load index registers + if (n->is_fetch_clause()) { + n->remove(); + process_fetch(static_cast<container_node*>(n)); + continue; + } + + if (n->is_alu_clause()) { n->remove(); process_alu(static_cast<container_node*>(n)); continue; @@ -823,6 +831,102 @@ void post_scheduler::init_regmap() { } } +static alu_node *create_set_idx(shader &sh, unsigned ar_idx) { + alu_node *a = sh.create_alu(); + + assert(ar_idx == V_SQ_CF_INDEX_0 || ar_idx == V_SQ_CF_INDEX_1); + if (ar_idx == V_SQ_CF_INDEX_0) + a->bc.set_op(ALU_OP0_SET_CF_IDX0); + else + a->bc.set_op(ALU_OP0_SET_CF_IDX1); + a->bc.slot = SLOT_X; + a->dst.resize(1); // Dummy needed for recolor + + PSC_DUMP( + sblog << "created IDX load: " + dump::dump_op(a); + sblog << "\n"; + ); + + return a; +} + +void post_scheduler::load_index_register(value *v, unsigned ar_idx) +{ + alu.reset(); + + if (!sh.get_ctx().is_cayman()) { + // Evergreen has to first load address register, then use CF_SET_IDX0/1 + alu_group_tracker &rt = alu.grp(); + alu_node *set_idx = create_set_idx(sh, ar_idx); + if (!rt.try_reserve(set_idx)) { + sblog << "can't emit SET_CF_IDX"; + dump::dump_op(set_idx); + sblog << "\n"; + } + process_group(); + + if (!alu.check_clause_limits()) { + // Can't happen since clause only contains MOVA/CF_SET_IDX0/1 + } + alu.emit_group(); + } + + alu_group_tracker &rt = alu.grp(); + alu_node *a = alu.create_ar_load(v, ar_idx == V_SQ_CF_INDEX_1 ? 
SEL_Z : SEL_Y); + + if (!rt.try_reserve(a)) { + sblog << "can't emit AR load : "; + dump::dump_op(a); + sblog << "\n"; + } + + process_group(); + + if (!alu.check_clause_limits()) { + // Can't happen since clause only contains MOVA/CF_SET_IDX0/1 + } + + alu.emit_group(); + alu.emit_clause(cur_bb); +} + +void post_scheduler::process_fetch(container_node *c) { + if (c->empty()) + return; + + for (node_iterator N, I = c->begin(), E = c->end(); I != E; I = N) { + N = I; + ++N; + + node *n = *I; + + fetch_node *f = static_cast<fetch_node*>(n); + + PSC_DUMP( + sblog << "process_tex "; + dump::dump_op(n); + sblog << " "; + ); + + if (f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE) { + // Currently require prior opt passes to use one TEX per indexed op + assert(f->parent->count() == 1); + + value *v = f->src.back(); // Last src is index offset + + cur_bb->push_front(c); + + load_index_register(v, f->bc.sampler_index_mode); + f->src.pop_back(); // Don't need index value any more + + return; + } + } + + cur_bb->push_front(c); +} + void post_scheduler::process_alu(container_node *c) { if (c->empty()) @@ -1180,7 +1284,7 @@ void post_scheduler::emit_load_ar() { alu.discard_current_group(); alu_group_tracker &rt = alu.grp(); - alu_node *a = alu.create_ar_load(); + alu_node *a = alu.create_ar_load(alu.current_ar, SEL_X); if (!rt.try_reserve(a)) { sblog << "can't emit AR load : "; @@ -1936,11 +2040,9 @@ bool alu_kcache_tracker::update_kc() { return true; } -alu_node* alu_clause_tracker::create_ar_load() { +alu_node* alu_clause_tracker::create_ar_load(value *v, chan_select ar_channel) { alu_node *a = sh.create_alu(); - // FIXME use MOVA_GPR on R6xx - if (sh.get_ctx().uses_mova_gpr) { a->bc.set_op(ALU_OP1_MOVA_GPR_INT); a->bc.slot = SLOT_TRANS; @@ -1948,9 +2050,13 @@ alu_node* alu_clause_tracker::create_ar_load() { a->bc.set_op(ALU_OP1_MOVA_INT); a->bc.slot = SLOT_X; } + a->bc.dst_chan = ar_channel; + if (ar_channel != SEL_X && sh.get_ctx().is_cayman()) { + a->bc.dst_gpr = ar_channel == SEL_Y ? 
CM_V_SQ_MOVA_DST_CF_IDX0 : CM_V_SQ_MOVA_DST_CF_IDX1; + } a->dst.resize(1); - a->src.push_back(current_ar); + a->src.push_back(v); PSC_DUMP( sblog << "created AR load: "; diff --git a/src/gallium/drivers/r600/sb/sb_sched.h b/src/gallium/drivers/r600/sb/sb_sched.h index 87c45867e16..2ca714665a7 100644 --- a/src/gallium/drivers/r600/sb/sb_sched.h +++ b/src/gallium/drivers/r600/sb/sb_sched.h @@ -235,7 +235,7 @@ public: void new_group(); bool is_empty(); - alu_node* create_ar_load(); + alu_node* create_ar_load(value *v, chan_select ar_channel); void discard_current_group(); @@ -266,6 +266,9 @@ public: void run_on(container_node *n); void schedule_bb(bb_node *bb); + void load_index_register(value *v, unsigned idx); + void process_fetch(container_node *c); + void process_alu(container_node *c); void schedule_alu(container_node *c); bool prepare_alu_group(); -- cgit v1.2.3