| author | Eric Anholt <[email protected]> | 2015-07-31 20:58:57 -0700 |
| --- | --- | --- |
| committer | Eric Anholt <[email protected]> | 2015-10-20 12:47:27 +0100 |
| commit | 921feb8782bdc3c459922858bee6d55919467436 (patch) | |
| tree | 78de8ee7d805489f662193278744a5607d724d3a /src/gallium | |
| parent | 85b946478c326df853926ed18bfbd898c0a514ef (diff) | |
vc4: Switch our vertex attr lowering to being NIR-based.
This exposes more information to NIR's optimization, and should be
particularly useful when we do range-based optimization.
total uniforms in shared programs: 32066 -> 32065 (-0.00%)
uniforms in affected programs: 21 -> 20 (-4.76%)
total instructions in shared programs: 93104 -> 92630 (-0.51%)
instructions in affected programs: 31901 -> 31427 (-1.49%)
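For context on what "NIR-based" buys here: the per-format unpacking that used to be emitted directly as vc4 QIR (`qir_UNPACK_8_F()` and friends) is now expressed with generic NIR ALU ops, which NIR's optimization passes can fold through. A minimal sketch of the idea, built from the same `nir_builder` helpers the patch uses (the wrapper function and its name are hypothetical, for illustration only):

```c
/* Hypothetical sketch: expand one channel of a 16-bit unsigned-normalized
 * vertex attribute out of a 32-bit VPM word using plain NIR ALU ops.
 * Because these are ordinary NIR instructions, the optimizer can now see
 * and simplify them, which the old QIR-level lowering hid.
 */
static nir_ssa_def *
sketch_unpack_unorm16(nir_builder *b, nir_ssa_def *vpm_word, unsigned chan)
{
        /* Low half: mask.  High half: shift.  (The same trick as the
         * patch's vc4_nir_unpack_16u().)
         */
        nir_ssa_def *bits = chan == 0 ?
                nir_iand(b, vpm_word, nir_imm_int(b, 0xffff)) :
                nir_ushr(b, vpm_word, nir_imm_int(b, 16));

        /* Convert to float and rescale from [0, 65535] to [0.0, 1.0]. */
        return nir_fmul(b, nir_i2f(b, bits),
                        nir_imm_float(b, 1.0 / 65535.0));
}
```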
Diffstat (limited to 'src/gallium')
| -rw-r--r-- | src/gallium/drivers/vc4/vc4_nir_lower_io.c | 233 |
| -rw-r--r-- | src/gallium/drivers/vc4/vc4_program.c | 110 |
2 files changed, 200 insertions, 143 deletions
```diff
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index 761e2c819c5..caf706aa2a6 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -23,6 +23,7 @@
 
 #include "vc4_qir.h"
 #include "glsl/nir/nir_builder.h"
+#include "util/u_format.h"
 
 /**
  * Walks the NIR generated by TGSI-to-NIR to lower its io intrinsics into
@@ -50,14 +51,182 @@ replace_intrinsic_with_vec4(nir_builder *b, nir_intrinsic_instr *intr,
         nir_instr_remove(&intr->instr);
 }
 
+static nir_ssa_def *
+vc4_nir_unpack_8i(nir_builder *b, nir_ssa_def *src, unsigned chan)
+{
+        return nir_ubitfield_extract(b,
+                                     src,
+                                     nir_imm_int(b, 8 * chan),
+                                     nir_imm_int(b, 8));
+}
+
+/** Returns the 16 bit field as a sign-extended 32-bit value. */
+static nir_ssa_def *
+vc4_nir_unpack_16i(nir_builder *b, nir_ssa_def *src, unsigned chan)
+{
+        return nir_ibitfield_extract(b,
+                                     src,
+                                     nir_imm_int(b, 16 * chan),
+                                     nir_imm_int(b, 16));
+}
+
+/** Returns the 16 bit field as an unsigned 32 bit value. */
+static nir_ssa_def *
+vc4_nir_unpack_16u(nir_builder *b, nir_ssa_def *src, unsigned chan)
+{
+        if (chan == 0) {
+                return nir_iand(b, src, nir_imm_int(b, 0xffff));
+        } else {
+                return nir_ushr(b, src, nir_imm_int(b, 16));
+        }
+}
+
+static nir_ssa_def *
+vc4_nir_unpack_8f(nir_builder *b, nir_ssa_def *src, unsigned chan)
+{
+        return nir_swizzle(b, nir_unpack_unorm_4x8(b, src), &chan, 1, false);
+}
+
+static nir_ssa_def *
+vc4_nir_get_vattr_channel_vpm(struct vc4_compile *c,
+                              nir_builder *b,
+                              nir_ssa_def **vpm_reads,
+                              uint8_t swiz,
+                              const struct util_format_description *desc)
+{
+        const struct util_format_channel_description *chan =
+                &desc->channel[swiz];
+        nir_ssa_def *temp;
+
+        if (swiz > UTIL_FORMAT_SWIZZLE_W) {
+                return vc4_nir_get_swizzled_channel(b, vpm_reads, swiz);
+        } else if (chan->size == 32 && chan->type == UTIL_FORMAT_TYPE_FLOAT) {
+                return vc4_nir_get_swizzled_channel(b, vpm_reads, swiz);
+        } else if (chan->size == 32 && chan->type == UTIL_FORMAT_TYPE_SIGNED) {
+                if (chan->normalized) {
+                        return nir_fmul(b,
+                                        nir_i2f(b, vpm_reads[swiz]),
+                                        nir_imm_float(b,
+                                                      1.0 / 0x7fffffff));
+                } else {
+                        return nir_i2f(b, vpm_reads[swiz]);
+                }
+        } else if (chan->size == 8 &&
+                   (chan->type == UTIL_FORMAT_TYPE_UNSIGNED ||
+                    chan->type == UTIL_FORMAT_TYPE_SIGNED)) {
+                nir_ssa_def *vpm = vpm_reads[0];
+                if (chan->type == UTIL_FORMAT_TYPE_SIGNED) {
+                        temp = nir_ixor(b, vpm, nir_imm_int(b, 0x80808080));
+                        if (chan->normalized) {
+                                return nir_fsub(b, nir_fmul(b,
+                                                            vc4_nir_unpack_8f(b, temp, swiz),
+                                                            nir_imm_float(b, 2.0)),
+                                                nir_imm_float(b, 1.0));
+                        } else {
+                                return nir_fadd(b,
+                                                nir_i2f(b,
+                                                        vc4_nir_unpack_8i(b, temp,
+                                                                          swiz)),
+                                                nir_imm_float(b, -128.0));
+                        }
+                } else {
+                        if (chan->normalized) {
+                                return vc4_nir_unpack_8f(b, vpm, swiz);
+                        } else {
+                                return nir_i2f(b, vc4_nir_unpack_8i(b, vpm, swiz));
+                        }
+                }
+        } else if (chan->size == 16 &&
+                   (chan->type == UTIL_FORMAT_TYPE_UNSIGNED ||
+                    chan->type == UTIL_FORMAT_TYPE_SIGNED)) {
+                nir_ssa_def *vpm = vpm_reads[swiz / 2];
+
+                /* Note that UNPACK_16F eats a half float, not ints, so we use
+                 * UNPACK_16_I for all of these.
+                 */
+                if (chan->type == UTIL_FORMAT_TYPE_SIGNED) {
+                        temp = nir_i2f(b, vc4_nir_unpack_16i(b, vpm, swiz & 1));
+                        if (chan->normalized) {
+                                return nir_fmul(b, temp,
+                                                nir_imm_float(b, 1/32768.0f));
+                        } else {
+                                return temp;
+                        }
+                } else {
+                        temp = nir_i2f(b, vc4_nir_unpack_16u(b, vpm, swiz & 1));
+                        if (chan->normalized) {
+                                return nir_fmul(b, temp,
+                                                nir_imm_float(b, 1 / 65535.0));
+                        } else {
+                                return temp;
+                        }
+                }
+        } else {
+                return NULL;
+        }
+}
+
+static void
+vc4_nir_lower_vertex_attr(struct vc4_compile *c, nir_builder *b,
+                          nir_intrinsic_instr *intr)
+{
+        b->cursor = nir_before_instr(&intr->instr);
+
+        int attr = intr->const_index[0];
+        enum pipe_format format = c->vs_key->attr_formats[attr];
+        uint32_t attr_size = util_format_get_blocksize(format);
+
+        /* All TGSI-to-NIR inputs are vec4. */
+        assert(intr->num_components == 4);
+
+        /* Generate dword loads for the VPM values (Since these intrinsics may
+         * be reordered, the actual reads will be generated at the top of the
+         * shader by ntq_setup_inputs().
+         */
+        nir_ssa_def *vpm_reads[4];
+        for (int i = 0; i < align(attr_size, 4) / 4; i++) {
+                nir_intrinsic_instr *intr_comp =
+                        nir_intrinsic_instr_create(c->s,
+                                                   nir_intrinsic_load_input);
+                intr_comp->num_components = 1;
+                intr_comp->const_index[0] = intr->const_index[0] * 4 + i;
+                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL);
+                nir_builder_instr_insert(b, &intr_comp->instr);
+
+                vpm_reads[i] = &intr_comp->dest.ssa;
+        }
+
+        bool format_warned = false;
+        const struct util_format_description *desc =
+                util_format_description(format);
+
+        nir_ssa_def *dests[4];
+        for (int i = 0; i < 4; i++) {
+                uint8_t swiz = desc->swizzle[i];
+                dests[i] = vc4_nir_get_vattr_channel_vpm(c, b, vpm_reads, swiz,
+                                                         desc);
+
+                if (!dests[i]) {
+                        if (!format_warned) {
+                                fprintf(stderr,
                                        "vtx element %d unsupported type: %s\n",
+                                        attr, util_format_name(format));
+                                format_warned = true;
+                        }
+                        dests[i] = nir_imm_float(b, 0.0);
+                }
+        }
+
+        replace_intrinsic_with_vec4(b, intr, dests);
+}
+
 static void
-vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
-                    nir_intrinsic_instr *intr)
+vc4_nir_lower_fs_input(struct vc4_compile *c, nir_builder *b,
+                       nir_intrinsic_instr *intr)
 {
         b->cursor = nir_before_instr(&intr->instr);
 
-        if (c->stage == QSTAGE_FRAG && intr->const_index[0] ==
-            VC4_NIR_TLB_COLOR_READ_INPUT) {
+        if (intr->const_index[0] == VC4_NIR_TLB_COLOR_READ_INPUT) {
                 /* This doesn't need any lowering. */
                 return;
         }
@@ -87,38 +256,31 @@ vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
                 dests[i] = &intr_comp->dest.ssa;
         }
 
-        switch (c->stage) {
-        case QSTAGE_FRAG:
-                if (input_var->data.location == VARYING_SLOT_FACE) {
-                        dests[0] = nir_fsub(b,
-                                            nir_imm_float(b, 1.0),
-                                            nir_fmul(b,
-                                                     nir_i2f(b, dests[0]),
-                                                     nir_imm_float(b, 2.0)));
-                        dests[1] = nir_imm_float(b, 0.0);
+        if (input_var->data.location == VARYING_SLOT_FACE) {
+                dests[0] = nir_fsub(b,
+                                    nir_imm_float(b, 1.0),
+                                    nir_fmul(b,
+                                             nir_i2f(b, dests[0]),
+                                             nir_imm_float(b, 2.0)));
+                dests[1] = nir_imm_float(b, 0.0);
+                dests[2] = nir_imm_float(b, 0.0);
+                dests[3] = nir_imm_float(b, 1.0);
+        } else if (input_var->data.location >= VARYING_SLOT_VAR0) {
+                if (c->fs_key->point_sprite_mask &
+                    (1 << (input_var->data.location -
+                           VARYING_SLOT_VAR0))) {
+                        if (!c->fs_key->is_points) {
+                                dests[0] = nir_imm_float(b, 0.0);
+                                dests[1] = nir_imm_float(b, 0.0);
+                        }
+                        if (c->fs_key->point_coord_upper_left) {
+                                dests[1] = nir_fsub(b,
+                                                    nir_imm_float(b, 1.0),
+                                                    dests[1]);
+                        }
                         dests[2] = nir_imm_float(b, 0.0);
                         dests[3] = nir_imm_float(b, 1.0);
-                } else if (input_var->data.location >= VARYING_SLOT_VAR0) {
-                        if (c->fs_key->point_sprite_mask &
-                            (1 << (input_var->data.location -
-                                   VARYING_SLOT_VAR0))) {
-                                if (!c->fs_key->is_points) {
-                                        dests[0] = nir_imm_float(b, 0.0);
-                                        dests[1] = nir_imm_float(b, 0.0);
-                                }
-                                if (c->fs_key->point_coord_upper_left) {
-                                        dests[1] = nir_fsub(b,
-                                                            nir_imm_float(b, 1.0),
-                                                            dests[1]);
-                                }
-                                dests[2] = nir_imm_float(b, 0.0);
-                                dests[3] = nir_imm_float(b, 1.0);
-                        }
                 }
-                break;
-        case QSTAGE_COORD:
-        case QSTAGE_VERT:
-                break;
         }
 
         replace_intrinsic_with_vec4(b, intr, dests);
@@ -232,7 +394,10 @@ vc4_nir_lower_io_instr(struct vc4_compile *c, nir_builder *b,
 
         switch (intr->intrinsic) {
         case nir_intrinsic_load_input:
-                vc4_nir_lower_input(c, b, intr);
+                if (c->stage == QSTAGE_FRAG)
+                        vc4_nir_lower_fs_input(c, b, intr);
+                else
+                        vc4_nir_lower_vertex_attr(c, b, intr);
                 break;
 
         case nir_intrinsic_store_output:
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index d3e856a8530..6e9ec6530c6 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -602,126 +602,18 @@ ntq_fsign(struct vc4_compile *c, struct qreg src)
                         qir_uniform_f(c, -1.0));
 }
 
-static struct qreg
-get_channel_from_vpm(struct vc4_compile *c,
-                     struct qreg *vpm_reads,
-                     uint8_t swiz,
-                     const struct util_format_description *desc)
-{
-        const struct util_format_channel_description *chan =
-                &desc->channel[swiz];
-        struct qreg temp;
-
-        if (swiz > UTIL_FORMAT_SWIZZLE_W)
-                return get_swizzled_channel(c, vpm_reads, swiz);
-        else if (chan->size == 32 &&
-                 chan->type == UTIL_FORMAT_TYPE_FLOAT) {
-                return get_swizzled_channel(c, vpm_reads, swiz);
-        } else if (chan->size == 32 &&
-                   chan->type == UTIL_FORMAT_TYPE_SIGNED) {
-                if (chan->normalized) {
-                        return qir_FMUL(c,
-                                        qir_ITOF(c, vpm_reads[swiz]),
-                                        qir_uniform_f(c,
-                                                      1.0 / 0x7fffffff));
-                } else {
-                        return qir_ITOF(c, vpm_reads[swiz]);
-                }
-        } else if (chan->size == 8 &&
-                   (chan->type == UTIL_FORMAT_TYPE_UNSIGNED ||
-                    chan->type == UTIL_FORMAT_TYPE_SIGNED)) {
-                struct qreg vpm = vpm_reads[0];
-                if (chan->type == UTIL_FORMAT_TYPE_SIGNED) {
-                        temp = qir_XOR(c, vpm, qir_uniform_ui(c, 0x80808080));
-                        if (chan->normalized) {
-                                return qir_FSUB(c, qir_FMUL(c,
-                                                            qir_UNPACK_8_F(c, temp, swiz),
-                                                            qir_uniform_f(c, 2.0)),
-                                                qir_uniform_f(c, 1.0));
-                        } else {
-                                return qir_FADD(c,
-                                                qir_ITOF(c,
-                                                         qir_UNPACK_8_I(c, temp,
-                                                                        swiz)),
-                                                qir_uniform_f(c, -128.0));
-                        }
-                } else {
-                        if (chan->normalized) {
-                                return qir_UNPACK_8_F(c, vpm, swiz);
-                        } else {
-                                return qir_ITOF(c, qir_UNPACK_8_I(c, vpm, swiz));
-                        }
-                }
-        } else if (chan->size == 16 &&
-                   (chan->type == UTIL_FORMAT_TYPE_UNSIGNED ||
-                    chan->type == UTIL_FORMAT_TYPE_SIGNED)) {
-                struct qreg vpm = vpm_reads[swiz / 2];
-
-                /* Note that UNPACK_16F eats a half float, not ints, so we use
-                 * UNPACK_16_I for all of these.
-                 */
-                if (chan->type == UTIL_FORMAT_TYPE_SIGNED) {
-                        temp = qir_ITOF(c, qir_UNPACK_16_I(c, vpm, swiz % 2));
-                        if (chan->normalized) {
-                                return qir_FMUL(c, temp,
-                                                qir_uniform_f(c, 1/32768.0f));
-                        } else {
-                                return temp;
-                        }
-                } else {
-                        /* UNPACK_16I sign-extends, so we have to emit ANDs. */
-                        temp = vpm;
-                        if (swiz == 1 || swiz == 3)
-                                temp = qir_UNPACK_16_I(c, temp, 1);
-                        temp = qir_AND(c, temp, qir_uniform_ui(c, 0xffff));
-                        temp = qir_ITOF(c, temp);
-
-                        if (chan->normalized) {
-                                return qir_FMUL(c, temp,
-                                                qir_uniform_f(c, 1 / 65535.0));
-                        } else {
-                                return temp;
-                        }
-                }
-        } else {
-                return c->undef;
-        }
-}
-
 static void
 emit_vertex_input(struct vc4_compile *c, int attr)
 {
         enum pipe_format format = c->vs_key->attr_formats[attr];
         uint32_t attr_size = util_format_get_blocksize(format);
-        struct qreg vpm_reads[4];
 
         c->vattr_sizes[attr] = align(attr_size, 4);
         for (int i = 0; i < align(attr_size, 4) / 4; i++) {
                 struct qreg vpm = { QFILE_VPM, attr * 4 + i };
-                vpm_reads[i] = qir_MOV(c, vpm);
+                c->inputs[attr * 4 + i] = qir_MOV(c, vpm);
                 c->num_inputs++;
         }
-
-        bool format_warned = false;
-        const struct util_format_description *desc =
-                util_format_description(format);
-
-        for (int i = 0; i < 4; i++) {
-                uint8_t swiz = desc->swizzle[i];
-                struct qreg result = get_channel_from_vpm(c, vpm_reads,
-                                                          swiz, desc);
-
-                if (result.file == QFILE_NULL) {
-                        if (!format_warned) {
-                                fprintf(stderr,
-                                        "vtx element %d unsupported type: %s\n",
-                                        attr, util_format_name(format));
-                                format_warned = true;
-                        }
-                        result = qir_uniform_f(c, 0.0);
-                }
-                c->inputs[attr * 4 + i] = result;
-        }
 }
 
 static void
```