From 2350569a78c60d32e3b751b4386ea7e6d7e2ebe9 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 3 Aug 2016 11:55:55 -0700 Subject: vc4: Avoid VS shader recompiles by keeping a set of FS inputs seen so far. We don't want to bake the whole array into the FS key, because of the hashing overhead. But we can keep a set of the arrays seen, and use a pointer to the copy in as the array's proxy. Between this and the previous patch, gl-1.0-blend-func now passes on hardware, where previously it was filling the 256MB CMA area with shaders and OOMing. Drops 712 shaders from shader-db. --- src/gallium/drivers/vc4/vc4_context.h | 21 ++++++---- src/gallium/drivers/vc4/vc4_program.c | 78 +++++++++++++++++++++++++++++------ src/gallium/drivers/vc4/vc4_qir.h | 7 +--- 3 files changed, 81 insertions(+), 25 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h index b656539611c..c3474a04963 100644 --- a/src/gallium/drivers/vc4/vc4_context.h +++ b/src/gallium/drivers/vc4/vc4_context.h @@ -69,6 +69,7 @@ #define VC4_DIRTY_COMPILED_CS (1 << 23) #define VC4_DIRTY_COMPILED_VS (1 << 24) #define VC4_DIRTY_COMPILED_FS (1 << 25) +#define VC4_DIRTY_FS_INPUTS (1 << 26) struct vc4_sampler_view { struct pipe_sampler_view base; @@ -123,6 +124,17 @@ struct vc4_ubo_range { uint32_t size; }; +struct vc4_fs_inputs { + /** + * Array of the meanings of the VPM inputs this shader needs. + * + * It doesn't include those that aren't part of the VPM, like + * point/line coordinates. + */ + struct vc4_varying_slot *input_slots; + uint32_t num_inputs; +}; + struct vc4_compiled_shader { uint64_t program_id; struct vc4_bo *bo; @@ -152,13 +164,7 @@ struct vc4_compiled_shader { uint8_t vattr_offsets[9]; uint8_t vattrs_live; - /** - * Array of the meanings of the VPM inputs this shader needs. - * - * It doesn't include those that aren't part of the VPM, like - * point/line coordinates. - */ - struct vc4_varying_slot *input_slots; + const struct vc4_fs_inputs *fs_inputs; }; struct vc4_program_stateobj { @@ -270,6 +276,7 @@ struct vc4_context { struct primconvert_context *primconvert; struct hash_table *fs_cache, *vs_cache; + struct set *fs_inputs_set; uint32_t next_uncompiled_program_id; uint64_t next_compiled_program_id; diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 487491ad667..d87caa7c517 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -2100,8 +2100,8 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, break; case QSTAGE_VERT: emit_vert_end(c, - vc4->prog.fs->input_slots, - vc4->prog.fs->num_inputs); + c->vs_key->fs_inputs->input_slots, + c->vs_key->fs_inputs->num_inputs); break; case QSTAGE_COORD: emit_coord_end(c); @@ -2207,6 +2207,13 @@ static void vc4_setup_compiled_fs_inputs(struct vc4_context *vc4, struct vc4_compile *c, struct vc4_compiled_shader *shader) { + struct vc4_fs_inputs inputs; + + memset(&inputs, 0, sizeof(inputs)); + inputs.input_slots = ralloc_array(shader, + struct vc4_varying_slot, + c->num_input_slots); + bool input_live[c->num_input_slots]; memset(input_live, 0, sizeof(input_live)); @@ -2217,10 +2224,6 @@ vc4_setup_compiled_fs_inputs(struct vc4_context *vc4, struct vc4_compile *c, } } - shader->input_slots = ralloc_array(shader, - struct vc4_varying_slot, - c->num_input_slots); - for (int i = 0; i < c->num_input_slots; i++) { struct vc4_varying_slot *slot = &c->input_slots[i]; @@ -2235,11 +2238,33 @@ vc4_setup_compiled_fs_inputs(struct vc4_context *vc4, struct vc4_compile *c, slot->slot == VARYING_SLOT_COL1 || slot->slot == VARYING_SLOT_BFC0 || slot->slot == VARYING_SLOT_BFC1) { - shader->color_inputs |= (1 << shader->num_inputs); + shader->color_inputs |= (1 << inputs.num_inputs); } - shader->input_slots[shader->num_inputs] = *slot; - shader->num_inputs++; + inputs.input_slots[inputs.num_inputs] = *slot; + inputs.num_inputs++; + } + shader->num_inputs = inputs.num_inputs; + + /* Add our set of inputs to the set of all inputs seen. This way, we + * can have a single pointer that identifies an FS inputs set, + * allowing VS to avoid recompiling when the FS is recompiled (or a + * new one is bound using separate shader objects) but the inputs + * don't change. + */ + struct set_entry *entry = _mesa_set_search(vc4->fs_inputs_set, &inputs); + if (entry) { + shader->fs_inputs = entry->key; + ralloc_free(inputs.input_slots); + } else { + struct vc4_fs_inputs *alloc_inputs; + + alloc_inputs = rzalloc(vc4->fs_inputs_set, struct vc4_fs_inputs); + memcpy(alloc_inputs, &inputs, sizeof(inputs)); + ralloc_steal(alloc_inputs, inputs.input_slots); + _mesa_set_add(vc4->fs_inputs_set, alloc_inputs); + + shader->fs_inputs = alloc_inputs; } } @@ -2434,10 +2459,14 @@ vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode) return; vc4->dirty |= VC4_DIRTY_COMPILED_FS; + if (vc4->rasterizer->base.flatshade && old_fs && vc4->prog.fs->color_inputs != old_fs->color_inputs) { vc4->dirty |= VC4_DIRTY_FLAT_SHADE_FLAGS; } + + if (old_fs && vc4->prog.fs->fs_inputs != old_fs->fs_inputs) + vc4->dirty |= VC4_DIRTY_FS_INPUTS; } static void @@ -2451,14 +2480,14 @@ vc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode) VC4_DIRTY_VERTTEX | VC4_DIRTY_VTXSTATE | VC4_DIRTY_UNCOMPILED_VS | - VC4_DIRTY_COMPILED_FS))) { + VC4_DIRTY_FS_INPUTS))) { return; } memset(key, 0, sizeof(*key)); vc4_setup_shared_key(vc4, &key->base, &vc4->verttex); key->base.shader_state = vc4->prog.bind_vs; - key->compiled_fs_id = vc4->prog.fs->program_id; + key->fs_inputs = vc4->prog.fs->fs_inputs; key->clamp_color = vc4->rasterizer->base.clamp_vertex_color; for (int i = 0; i < ARRAY_SIZE(key->attr_formats); i++) @@ -2477,7 +2506,7 @@ vc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode) key->is_coord = true; /* Coord shaders don't care what the FS inputs are. */ - key->compiled_fs_id = 0; + key->fs_inputs = NULL; struct vc4_compiled_shader *cs = vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base); if (cs != vc4->prog.cs) { @@ -2517,6 +2546,29 @@ vs_cache_compare(const void *key1, const void *key2) return memcmp(key1, key2, sizeof(struct vc4_vs_key)) == 0; } +static uint32_t +fs_inputs_hash(const void *key) +{ + const struct vc4_fs_inputs *inputs = key; + + return _mesa_hash_data(inputs->input_slots, + sizeof(*inputs->input_slots) * + inputs->num_inputs); +} + +static bool +fs_inputs_compare(const void *key1, const void *key2) +{ + const struct vc4_fs_inputs *inputs1 = key1; + const struct vc4_fs_inputs *inputs2 = key2; + + return (inputs1->num_inputs == inputs2->num_inputs && + memcmp(inputs1->input_slots, + inputs2->input_slots, + sizeof(*inputs1->input_slots) * + inputs1->num_inputs) == 0); +} + static void delete_from_cache_if_matches(struct hash_table *ht, struct hash_entry *entry, @@ -2582,6 +2634,8 @@ vc4_program_init(struct pipe_context *pctx) fs_cache_compare); vc4->vs_cache = _mesa_hash_table_create(pctx, vs_cache_hash, vs_cache_compare); + vc4->fs_inputs_set = _mesa_set_create(pctx, fs_inputs_hash, + fs_inputs_compare); } void diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index b8ded30711c..e6297c5c82c 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -352,12 +352,7 @@ struct vc4_fs_key { struct vc4_vs_key { struct vc4_key base; - /** - * This is a proxy for the array of FS input semantics, which is - * larger than we would want to put in the key. - */ - uint64_t compiled_fs_id; - + const struct vc4_fs_inputs *fs_inputs; enum pipe_format attr_formats[8]; bool is_coord; bool per_vertex_point_size; -- cgit v1.2.3