diff options
Diffstat (limited to 'src/gallium/drivers/r600')
-rw-r--r-- | src/gallium/drivers/r600/sb/sb_pass.h | 26 | ||||
-rw-r--r-- | src/gallium/drivers/r600/sb/sb_ra_init.cpp | 78 |
2 files changed, 89 insertions, 15 deletions
diff --git a/src/gallium/drivers/r600/sb/sb_pass.h b/src/gallium/drivers/r600/sb/sb_pass.h index c3ea8734de3..95d2a203a60 100644 --- a/src/gallium/drivers/r600/sb/sb_pass.h +++ b/src/gallium/drivers/r600/sb/sb_pass.h @@ -507,12 +507,36 @@ class ra_init : public pass { public: - ra_init(shader &sh) : pass(sh) {} + ra_init(shader &sh) : pass(sh), prev_chans() { + + // The parameter below affects register channels distribution. + // For cayman (VLIW-4) we're trying to distribute the channels + // uniformly, this means significantly better alu slots utilization + // at the expense of higher gpr usage. Hopefully this will improve + // performance, though it has to be proven with real benchmarks yet. + // For VLIW-5 this method could also slightly improve slots + // utilization, but increased register pressure seems more significant + // and overall performance effect is negative according to some + // benchmarks, so it's not used currently. Basically, VLIW-5 doesn't + // really need it because trans slot (unrestricted by register write + // channel) allows to consume most deviations from uniform channel + // distribution. + // Value 3 means that for new allocation we'll use channel that differs + // from 3 last used channels. 0 for VLIW-5 effectively turns this off. + + ra_tune = sh.get_ctx().is_cayman() ? 3 : 0; + } virtual int run(); private: + unsigned prev_chans; + unsigned ra_tune; + + void add_prev_chan(unsigned chan); + unsigned get_preferable_chan_mask(); + void ra_node(container_node *c); void process_op(node *n); diff --git a/src/gallium/drivers/r600/sb/sb_ra_init.cpp b/src/gallium/drivers/r600/sb/sb_ra_init.cpp index 24b24a0bde3..0b332a9847a 100644 --- a/src/gallium/drivers/r600/sb/sb_ra_init.cpp +++ b/src/gallium/drivers/r600/sb/sb_ra_init.cpp @@ -72,6 +72,7 @@ public: sel_chan find_free_bit(); sel_chan find_free_chans(unsigned mask); + sel_chan find_free_chan_by_mask(unsigned mask); sel_chan find_free_array(unsigned size, unsigned mask); void dump(); @@ -86,7 +87,7 @@ void regbits::dump() { sblog << "\n"; if (!(i & 3)) { - sblog.print_wl(i / 4, 7); + sblog.print_w(i / 4, 7); sblog << " "; } @@ -186,34 +187,64 @@ sel_chan regbits::find_free_chans(unsigned mask) { unsigned elt = 0; unsigned bit = 0; - basetype cd = dta[elt] >> bit; + assert (!(mask & ~0xF)); + basetype cd = dta[elt]; do { - if (!cd) { - if (++elt < size) + if (++elt < size) { cd = dta[elt]; - else + bit = 0; + continue; + } else return 0; - - bit = 0; } unsigned p = __builtin_ctz(cd) & ~(basetype)3u; - if (p > bt_bits - bit) { - if (++elt < size) + assert (p <= bt_bits - bit); + bit += p; + cd >>= p; + + if ((cd & mask) == mask) { + return ((elt << bt_index_shift) | bit) + 1; + } + + bit += 4; + cd >>= 4; + + } while (1); + + return 0; +} + +sel_chan regbits::find_free_chan_by_mask(unsigned mask) { + unsigned elt = 0; + unsigned bit = 0; + + assert (!(mask & ~0xF)); + basetype cd = dta[elt]; + + do { + if (!cd) { + if (++elt < size) { cd = dta[elt]; - else + bit = 0; + continue; + } else return 0; - bit = 0; } + unsigned p = __builtin_ctz(cd) & ~(basetype)3u; + + assert (p <= bt_bits - bit); bit += p; cd >>= p; - if ((cd & mask) == mask) { - return ((elt << bt_index_shift) | bit) + 1; + if (cd & mask) { + unsigned nb = __builtin_ctz(cd & mask); + unsigned ofs = ((elt << bt_index_shift) | bit); + return nb + ofs + 1; } bit += 4; @@ -476,7 +507,9 @@ void ra_init::color(value* v) { unsigned mask = 1 << v->pin_gpr.chan(); c = rb.find_free_chans(mask) + v->pin_gpr.chan(); } else { - c = rb.find_free_bit(); + unsigned cm = get_preferable_chan_mask(); + RA_DUMP( sblog << "pref chan mask: " << cm << "\n"; ); + c = rb.find_free_chan_by_mask(cm); } assert(c && c.sel() < 128 - ctx.alu_temp_gprs && "color failed"); @@ -484,6 +517,7 @@ void ra_init::color(value* v) { } void ra_init::assign_color(value* v, sel_chan c) { + add_prev_chan(c.chan()); v->gpr = c; RA_DUMP( sblog << "colored "; @@ -790,4 +824,20 @@ void ra_split::split_vector_inst(node* n) { } } +void ra_init::add_prev_chan(unsigned chan) { + prev_chans = (prev_chans << 4) | (1 << chan); +} + +unsigned ra_init::get_preferable_chan_mask() { + unsigned i, used_chans = 0; + unsigned chans = prev_chans; + + for (i = 0; i < ra_tune; ++i) { + used_chans |= chans; + chans >>= 4; + } + + return (~used_chans) & 0xF; +} + } // namespace r600_sb |