Diffstat (limited to 'src/gallium/drivers/vc4')

 src/gallium/drivers/vc4/Makefile.sources          |   1
 src/gallium/drivers/vc4/vc4_blit.c                |  10
 src/gallium/drivers/vc4/vc4_context.c             |  17
 src/gallium/drivers/vc4/vc4_drm.h                 |  45
 src/gallium/drivers/vc4/vc4_job.c                 |  12
 src/gallium/drivers/vc4/vc4_program.c             |   5
 src/gallium/drivers/vc4/vc4_qir.h                 |   1
 src/gallium/drivers/vc4/vc4_qir_schedule.c        | 619
 src/gallium/drivers/vc4/vc4_qpu_schedule.c        |  85
 src/gallium/drivers/vc4/vc4_resource.c            |  12
 src/gallium/drivers/vc4/vc4_screen.c              |   4
 src/gallium/drivers/vc4/vc4_screen.h              |   1
 src/gallium/drivers/vc4/vc4_simulator.c           |  89
 src/gallium/drivers/vc4/vc4_simulator_validate.h  |   2
 src/gallium/drivers/vc4/vc4_state.c               |   4

 15 files changed, 844 insertions(+), 63 deletions(-)
diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources
index 24b577ae9f3..a9a2742ec66 100644
--- a/src/gallium/drivers/vc4/Makefile.sources
+++ b/src/gallium/drivers/vc4/Makefile.sources
@@ -32,6 +32,7 @@ C_SOURCES := \
 	vc4_program.c \
 	vc4_qir.c \
 	vc4_qir_lower_uniforms.c \
+	vc4_qir_schedule.c \
 	vc4_qir.h \
 	vc4_qpu.c \
 	vc4_qpu_defines.h \
diff --git a/src/gallium/drivers/vc4/vc4_blit.c b/src/gallium/drivers/vc4/vc4_blit.c
index 16dcece1ea1..876c296dc85 100644
--- a/src/gallium/drivers/vc4/vc4_blit.c
+++ b/src/gallium/drivers/vc4/vc4_blit.c
@@ -54,8 +54,8 @@ vc4_tile_blit(struct pipe_context *pctx, const struct pipe_blit_info *info)
         bool old_msaa = vc4->msaa;
         int old_tile_width = vc4->tile_width;
         int old_tile_height = vc4->tile_height;
-        bool msaa = (info->src.resource->nr_samples ||
-                     info->dst.resource->nr_samples);
+        bool msaa = (info->src.resource->nr_samples > 1 ||
+                     info->dst.resource->nr_samples > 1);
         int tile_width = msaa ? 32 : 64;
         int tile_height = msaa ? 32 : 64;
 
@@ -110,9 +110,11 @@ vc4_tile_blit(struct pipe_context *pctx, const struct pipe_blit_info *info)
 
         pipe_surface_reference(&vc4->color_read, src_surf);
         pipe_surface_reference(&vc4->color_write,
-                               dst_surf->texture->nr_samples ? NULL : dst_surf);
+                               dst_surf->texture->nr_samples > 1 ?
+                               NULL : dst_surf);
         pipe_surface_reference(&vc4->msaa_color_write,
-                               dst_surf->texture->nr_samples ? dst_surf : NULL);
+                               dst_surf->texture->nr_samples > 1 ?
+                               dst_surf : NULL);
         pipe_surface_reference(&vc4->zs_read, NULL);
         pipe_surface_reference(&vc4->zs_write, NULL);
         pipe_surface_reference(&vc4->msaa_zs_write, NULL);
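A recurring change in this commit: gallium resources can carry an nr_samples of either 0 or 1 for single-sampled textures, so only nr_samples > 1 actually means MSAA. The predicate the hunks converge on, stated as a hypothetical standalone helper (the driver open-codes the comparison):

/* Sketch only: gallium state trackers may leave pipe_resource::nr_samples
 * at 0 or set it to 1 for single-sampled resources, so a plain truth test
 * misclassifies nr_samples == 1 surfaces as multisampled.
 */
#include <stdbool.h>
#include <stdint.h>

static inline bool
resource_is_msaa(uint16_t nr_samples)
{
        return nr_samples > 1;
}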
diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c
index 1cd167600b9..312b006f96e 100644
--- a/src/gallium/drivers/vc4/vc4_context.c
+++ b/src/gallium/drivers/vc4/vc4_context.c
@@ -67,15 +67,13 @@ vc4_flush(struct pipe_context *pctx)
         cl_u8(&bcl, VC4_PACKET_FLUSH);
         cl_end(&vc4->bcl, bcl);
 
-        vc4->msaa = false;
         if (cbuf && (vc4->resolve & PIPE_CLEAR_COLOR0)) {
                 pipe_surface_reference(&vc4->color_write,
-                                       cbuf->texture->nr_samples ? NULL : cbuf);
+                                       cbuf->texture->nr_samples > 1 ?
+                                       NULL : cbuf);
                 pipe_surface_reference(&vc4->msaa_color_write,
-                                       cbuf->texture->nr_samples ? cbuf : NULL);
-
-                if (cbuf->texture->nr_samples)
-                        vc4->msaa = true;
+                                       cbuf->texture->nr_samples > 1 ?
+                                       cbuf : NULL);
 
                 if (!(vc4->cleared & PIPE_CLEAR_COLOR0)) {
                         pipe_surface_reference(&vc4->color_read, cbuf);
@@ -92,15 +90,12 @@ vc4_flush(struct pipe_context *pctx)
         if (vc4->framebuffer.zsbuf &&
             (vc4->resolve & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) {
                 pipe_surface_reference(&vc4->zs_write,
-                                       zsbuf->texture->nr_samples ?
+                                       zsbuf->texture->nr_samples > 1 ?
                                        NULL : zsbuf);
                 pipe_surface_reference(&vc4->msaa_zs_write,
-                                       zsbuf->texture->nr_samples ?
+                                       zsbuf->texture->nr_samples > 1 ?
                                        zsbuf : NULL);
 
-                if (zsbuf->texture->nr_samples)
-                        vc4->msaa = true;
-
                 if (!(vc4->cleared & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) {
                         pipe_surface_reference(&vc4->zs_read, zsbuf);
                 } else {
diff --git a/src/gallium/drivers/vc4/vc4_drm.h b/src/gallium/drivers/vc4/vc4_drm.h
index bf58c6cd078..110c783e5fd 100644
--- a/src/gallium/drivers/vc4/vc4_drm.h
+++ b/src/gallium/drivers/vc4/vc4_drm.h
@@ -32,6 +32,7 @@
 #define DRM_VC4_CREATE_BO                         0x03
 #define DRM_VC4_MMAP_BO                           0x04
 #define DRM_VC4_CREATE_SHADER_BO                  0x05
+#define DRM_VC4_GET_HANG_STATE                    0x06
 
 #define DRM_IOCTL_VC4_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
 #define DRM_IOCTL_VC4_WAIT_SEQNO          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno)
@@ -39,6 +40,7 @@
 #define DRM_IOCTL_VC4_CREATE_BO           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo)
 #define DRM_IOCTL_VC4_MMAP_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo)
 #define DRM_IOCTL_VC4_CREATE_SHADER_BO    DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_CREATE_SHADER_BO, struct drm_vc4_create_shader_bo)
+#define DRM_IOCTL_VC4_GET_HANG_STATE      DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GET_HANG_STATE, struct drm_vc4_get_hang_state)
 
 struct drm_vc4_submit_rcl_surface {
         uint32_t hindex; /* Handle index, or ~0 if not present. */
@@ -231,4 +233,47 @@ struct drm_vc4_mmap_bo {
         uint64_t offset;
 };
 
+struct drm_vc4_get_hang_state_bo {
+        uint32_t handle;
+        uint32_t paddr;
+        uint32_t size;
+        uint32_t pad;
+};
+
+/**
+ * struct drm_vc4_get_hang_state - ioctl argument for collecting state
+ * from a GPU hang for analysis.
+ */
+struct drm_vc4_get_hang_state {
+        /** Pointer to array of struct drm_vc4_get_hang_state_bo. */
+        uint64_t bo;
+        /**
+         * On input, the size of the bo array.  Output is the number
+         * of bos to be returned.
+         */
+        uint32_t bo_count;
+
+        uint32_t start_bin, start_render;
+
+        uint32_t ct0ca, ct0ea;
+        uint32_t ct1ca, ct1ea;
+        uint32_t ct0cs, ct1cs;
+        uint32_t ct0ra0, ct1ra0;
+
+        uint32_t bpca, bpcs;
+        uint32_t bpoa, bpos;
+
+        uint32_t vpmbase;
+
+        uint32_t dbge;
+        uint32_t fdbgo;
+        uint32_t fdbgb;
+        uint32_t fdbgr;
+        uint32_t fdbgs;
+        uint32_t errstat;
+
+        /* Pad that we may save more registers into in the future. */
+        uint32_t pad[16];
+};
+
 #endif /* _UAPI_VC4_DRM_H_ */
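A sketch of how userspace might drive the new ioctl, assuming the two-call pattern implied by the in/out bo_count field (hypothetical helper, error paths trimmed; a real client would likely use drmIoctl() to retry on EINTR):

/* Call once with bo_count = 0 to learn how many BOs the kernel captured,
 * then again with a real array.  The two-call contract is an assumption
 * inferred from the bo_count comment above.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include "vc4_drm.h"

static int
get_hang_state(int fd, struct drm_vc4_get_hang_state *state,
               struct drm_vc4_get_hang_state_bo **bos_out)
{
        memset(state, 0, sizeof(*state));
        if (ioctl(fd, DRM_IOCTL_VC4_GET_HANG_STATE, state) == -1)
                return -1;

        struct drm_vc4_get_hang_state_bo *bos =
                calloc(state->bo_count, sizeof(*bos));
        state->bo = (uintptr_t)bos;
        if (ioctl(fd, DRM_IOCTL_VC4_GET_HANG_STATE, state) == -1) {
                free(bos);
                return -1;
        }

        *bos_out = bos;
        return 0;
}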
" + "Expect corruption.\n", strerror(errno)); + warned = true; } } diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index caad05cb9f7..d11c4e3b7ca 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -1848,12 +1848,15 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, qir_optimize(c); qir_lower_uniforms(c); + qir_schedule_instructions(c); + if (vc4_debug & VC4_DEBUG_QIR) { fprintf(stderr, "%s prog %d/%d QIR:\n", qir_get_stage_name(c->stage), c->program_id, c->variant_id); qir_dump(c); } + qir_reorder_uniforms(c); vc4_generate_code(vc4, c); @@ -2043,7 +2046,7 @@ vc4_setup_shared_key(struct vc4_context *vc4, struct vc4_key *key, key->tex[i].swizzle[2] = sampler->swizzle_b; key->tex[i].swizzle[3] = sampler->swizzle_a; - if (sampler->texture->nr_samples) { + if (sampler->texture->nr_samples > 1) { key->tex[i].msaa_width = sampler->texture->width0; key->tex[i].msaa_height = sampler->texture->height0; } else if (sampler){ diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index c34dce3aec8..b0fbb4c1db2 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -459,6 +459,7 @@ void qir_remove_instruction(struct vc4_compile *c, struct qinst *qinst); struct qreg qir_uniform(struct vc4_compile *c, enum quniform_contents contents, uint32_t data); +void qir_schedule_instructions(struct vc4_compile *c); void qir_reorder_uniforms(struct vc4_compile *c); void qir_emit(struct vc4_compile *c, struct qinst *inst); diff --git a/src/gallium/drivers/vc4/vc4_qir_schedule.c b/src/gallium/drivers/vc4/vc4_qir_schedule.c new file mode 100644 index 00000000000..d20815f055e --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_qir_schedule.c @@ -0,0 +1,619 @@ +/* + * Copyright © 2010 Intel Corporation + * Copyright © 2014-2015 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file vc4_qir_schedule.c + * + * The basic model of the list scheduler is to take a basic block, compute a + * DAG of the dependencies from the bottom up, and make a list of the DAG + * heads. Heuristically pick a DAG head and schedule (remove) it, then put + * all the parents that are now DAG heads into the list of things to + * schedule. 
diff --git a/src/gallium/drivers/vc4/vc4_qir_schedule.c b/src/gallium/drivers/vc4/vc4_qir_schedule.c
new file mode 100644
index 00000000000..d20815f055e
--- /dev/null
+++ b/src/gallium/drivers/vc4/vc4_qir_schedule.c
@@ -0,0 +1,619 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ * Copyright © 2014-2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file vc4_qir_schedule.c
+ *
+ * The basic model of the list scheduler is to take a basic block, compute a
+ * DAG of the dependencies from the bottom up, and make a list of the DAG
+ * heads.  Heuristically pick a DAG head and schedule (remove) it, then put
+ * all the parents that are now DAG heads into the list of things to
+ * schedule.
+ *
+ * The goal of scheduling here, before register allocation and conversion to
+ * QPU instructions, is to reduce register pressure by reordering
+ * instructions to consume values when possible.
+ */
+
+#include "vc4_qir.h"
+
+static bool debug;
+
+struct schedule_node {
+        struct list_head link;
+        struct qinst *inst;
+
+        struct schedule_node **children;
+        uint32_t child_count;
+        uint32_t child_array_size;
+        uint32_t parent_count;
+
+        /* Length of the longest (latency) chain from a DAG head to this
+         * instruction.
+         */
+        uint32_t delay;
+
+        /* Longest time + latency_between(parent, this) of any parent of
+         * this node.
+         */
+        uint32_t unblocked_time;
+};
+
+struct schedule_state {
+        /* List of struct schedule_node *.  This starts out with all
+         * instructions, and after dependency updates it's trimmed to be
+         * just the DAG heads.
+         */
+        struct list_head worklist;
+
+        uint32_t time;
+
+        uint32_t *temp_writes;
+
+        BITSET_WORD *temp_live;
+};
+
+/* When walking the instructions in reverse, we need to swap before/after in
+ * add_dep().
+ */
+enum direction { F, R };
+
+/**
+ * Marks a dependency between two instructions, that @after must appear
+ * after @before.
+ *
+ * Our dependencies are tracked as a DAG.  Since we're scheduling bottom-up,
+ * the latest instructions with nothing left to schedule are the DAG heads,
+ * and their inputs are their children.
+ */
+static void
+add_dep(enum direction dir,
+        struct schedule_node *before,
+        struct schedule_node *after)
+{
+        if (!before || !after)
+                return;
+
+        assert(before != after);
+
+        if (dir == R) {
+                struct schedule_node *t = before;
+                before = after;
+                after = t;
+        }
+
+        for (int i = 0; i < after->child_count; i++) {
+                if (after->children[i] == before)
+                        return;
+        }
+
+        if (after->child_array_size <= after->child_count) {
+                after->child_array_size = MAX2(after->child_array_size * 2, 16);
+                after->children = reralloc(after, after->children,
+                                           struct schedule_node *,
+                                           after->child_array_size);
+        }
+
+        after->children[after->child_count] = before;
+        after->child_count++;
+        before->parent_count++;
+}
+
+static void
+add_write_dep(enum direction dir,
+              struct schedule_node **before,
+              struct schedule_node *after)
+{
+        add_dep(dir, *before, after);
+        *before = after;
+}
+
+struct schedule_setup_state {
+        struct schedule_node **last_temp_write;
+        struct schedule_node *last_sf;
+        struct schedule_node *last_vary_read;
+        struct schedule_node *last_vpm_read;
+        struct schedule_node *last_vpm_write;
+        struct schedule_node *last_tex_coord;
+        struct schedule_node *last_tex_result;
+        struct schedule_node *last_tlb;
+        enum direction dir;
+
+        /**
+         * Texture FIFO tracking.  This is done top-to-bottom, and is used
+         * to track the QOP_TEX_RESULTs and add dependencies on previous
+         * ones when trying to submit texture coords with TFREQ full or new
+         * texture fetches with TFRCV full.
+         */
+        struct {
+                struct schedule_node *node;
+                int coords;
+        } tex_fifo[8];
+        int tfreq_count; /**< Number of texture coords outstanding. */
+        int tfrcv_count; /**< Number of texture results outstanding. */
+        int tex_fifo_pos;
+};
+
+static void
+block_until_tex_result(struct schedule_setup_state *state,
+                       struct schedule_node *n)
+{
+        add_dep(state->dir, state->tex_fifo[0].node, n);
+
+        state->tfreq_count -= state->tex_fifo[0].coords;
+        state->tfrcv_count--;
+
+        memmove(&state->tex_fifo[0],
+                &state->tex_fifo[1],
+                state->tex_fifo_pos * sizeof(state->tex_fifo[0]));
+        state->tex_fifo_pos--;
+}
+
+/**
+ * Common code for dependencies that need to be tracked both forward and
+ * backward.
+ *
+ * This is for things like "all VPM reads have to happen in order."
+ */
+static void
+calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
+{
+        struct qinst *inst = n->inst;
+        enum direction dir = state->dir;
+
+        /* Add deps for temp register and varying accesses.  Note that we
+         * ignore uniform accesses, because qir_reorder_uniforms() happens
+         * after this.
+         */
+        for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+                switch (inst->src[i].file) {
+                case QFILE_TEMP:
+                        add_dep(dir,
+                                state->last_temp_write[inst->src[i].index], n);
+                        break;
+
+                case QFILE_VARY:
+                        add_write_dep(dir, &state->last_vary_read, n);
+                        break;
+
+                case QFILE_VPM:
+                        add_write_dep(dir, &state->last_vpm_read, n);
+                        break;
+
+                default:
+                        break;
+                }
+        }
+
+        switch (inst->op) {
+        case QOP_VARY_ADD_C:
+                add_dep(dir, state->last_vary_read, n);
+                break;
+
+        case QOP_TEX_S:
+        case QOP_TEX_T:
+        case QOP_TEX_R:
+        case QOP_TEX_B:
+        case QOP_TEX_DIRECT:
+                /* Texturing setup gets scheduled in order, because
+                 * the uniforms referenced by them have to land in a
+                 * specific order.
+                 */
+                add_write_dep(dir, &state->last_tex_coord, n);
+                break;
+
+        case QOP_TEX_RESULT:
+                /* Results have to be fetched in order. */
+                add_write_dep(dir, &state->last_tex_result, n);
+                break;
+
+        case QOP_TLB_COLOR_WRITE:
+        case QOP_TLB_COLOR_READ:
+        case QOP_TLB_Z_WRITE:
+        case QOP_TLB_STENCIL_SETUP:
+        case QOP_MS_MASK:
+                add_write_dep(dir, &state->last_tlb, n);
+                break;
+
+        case QOP_TLB_DISCARD_SETUP:
+                add_write_dep(dir, &state->last_sf, n);
+                add_write_dep(dir, &state->last_tlb, n);
+                break;
+
+        default:
+                break;
+        }
+
+        if (inst->dst.file == QFILE_VPM)
+                add_write_dep(dir, &state->last_vpm_write, n);
+        else if (inst->dst.file == QFILE_TEMP)
+                add_write_dep(dir, &state->last_temp_write[inst->dst.index], n);
+
+        if (inst->sf)
+                add_write_dep(dir, &state->last_sf, n);
+
+        if (qir_depends_on_flags(inst)) {
+                add_dep(dir, state->last_sf, n);
+        }
+}
+
+static void
+calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
+                       struct list_head *schedule_list)
+{
+        struct schedule_setup_state state;
+
+        memset(&state, 0, sizeof(state));
+        state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *,
+                                              c->num_temps);
+        state.dir = F;
+
+        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
+                struct qinst *inst = n->inst;
+
+                calculate_deps(&state, n);
+
+                switch (inst->op) {
+                case QOP_TEX_S:
+                case QOP_TEX_T:
+                case QOP_TEX_R:
+                case QOP_TEX_B:
+                case QOP_TEX_DIRECT:
+                        /* If the texture coordinate fifo is full,
+                         * block this on the last QOP_TEX_RESULT.
+                         */
+                        if (state.tfreq_count == 8) {
+                                block_until_tex_result(&state, n);
+                        }
+
+                        /* If the texture result fifo is full, block
+                         * adding any more to it until the last
+                         * QOP_TEX_RESULT.
+                         */
+                        if (inst->op == QOP_TEX_S ||
+                            inst->op == QOP_TEX_DIRECT) {
+                                if (state.tfrcv_count == 4)
+                                        block_until_tex_result(&state, n);
+                                state.tfrcv_count++;
+                        }
+
+                        state.tex_fifo[state.tex_fifo_pos].coords++;
+                        state.tfreq_count++;
+                        break;
+
+                case QOP_TEX_RESULT:
+                        /* Results have to be fetched after the
+                         * coordinate setup.  Note that we're assuming
+                         * here that our input shader has the texture
+                         * coord setup and result fetch in order,
+                         * which is true initially but not of our
+                         * instruction stream after this pass.
+                         */
+                        add_dep(state.dir, state.last_tex_coord, n);
+
+                        state.tex_fifo[state.tex_fifo_pos].node = n;
+
+                        state.tex_fifo_pos++;
+                        memset(&state.tex_fifo[state.tex_fifo_pos], 0,
+                               sizeof(state.tex_fifo[0]));
+                        break;
+
+                default:
+                        assert(!qir_is_tex(inst));
+                        break;
+                }
+        }
+}
+
+static void
+calculate_reverse_deps(struct vc4_compile *c, void *mem_ctx,
+                       struct list_head *schedule_list)
+{
+        struct schedule_setup_state state;
+
+        memset(&state, 0, sizeof(state));
+        state.dir = R;
+        state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *,
+                                              c->num_temps);
+
+        list_for_each_entry_rev(struct schedule_node, n, schedule_list,
+                                link) {
+                calculate_deps(&state, n);
+        }
+}
+
+static int
+get_register_pressure_cost(struct schedule_state *state, struct qinst *inst)
+{
+        int cost = 0;
+
+        if (inst->dst.file == QFILE_TEMP &&
+            state->temp_writes[inst->dst.index] == 1)
+                cost--;
+
+        for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+                if (inst->src[i].file == QFILE_TEMP &&
+                    !BITSET_TEST(state->temp_live, inst->src[i].index)) {
+                        cost++;
+                }
+        }
+
+        return cost;
+}
+
+static bool
+locks_scoreboard(struct qinst *inst)
+{
+        switch (inst->op) {
+        case QOP_TLB_Z_WRITE:
+        case QOP_TLB_COLOR_WRITE:
+        case QOP_TLB_COLOR_WRITE_MS:
+        case QOP_TLB_COLOR_READ:
+                return true;
+        default:
+                return false;
+        }
+}
+
+static struct schedule_node *
+choose_instruction(struct schedule_state *state)
+{
+        struct schedule_node *chosen = NULL;
+
+        list_for_each_entry(struct schedule_node, n, &state->worklist, link) {
+                if (!chosen) {
+                        chosen = n;
+                        continue;
+                }
+
+                /* Prefer scheduling things that lock the scoreboard, so
+                 * that they appear late in the program and we get more
+                 * parallelism between shaders on multiple QPUs hitting the
+                 * same fragment.
+                 */
+                if (locks_scoreboard(n->inst) &&
+                    !locks_scoreboard(chosen->inst)) {
+                        chosen = n;
+                        continue;
+                } else if (!locks_scoreboard(n->inst) &&
+                           locks_scoreboard(chosen->inst)) {
+                        continue;
+                }
+
+                /* If we would block on the previously chosen node, but
+                 * would block less on this one, then prefer it.
+                 */
+                if (chosen->unblocked_time > state->time &&
+                    n->unblocked_time < chosen->unblocked_time) {
+                        chosen = n;
+                        continue;
+                } else if (n->unblocked_time > state->time &&
+                           n->unblocked_time > chosen->unblocked_time) {
+                        continue;
+                }
+
+                /* If we can definitely reduce register pressure, do so
+                 * immediately.
+                 */
+                int register_pressure_cost =
+                        get_register_pressure_cost(state, n->inst);
+                int chosen_register_pressure_cost =
+                        get_register_pressure_cost(state, chosen->inst);
+
+                if (register_pressure_cost < chosen_register_pressure_cost) {
+                        chosen = n;
+                        continue;
+                } else if (register_pressure_cost >
+                           chosen_register_pressure_cost) {
+                        continue;
+                }
+
+                /* Otherwise, prefer instructions with the deepest chain to
+                 * the end of the program.  This avoids the problem of
+                 * "everything generates a temp, nothing finishes freeing one,
+                 * guess I'll just keep emitting varying mul/adds".
+                 */
+                if (n->delay > chosen->delay) {
+                        chosen = n;
+                        continue;
+                } else if (n->delay < chosen->delay) {
+                        continue;
+                }
+        }
+
+        return chosen;
+}
+
+static void
+dump_state(struct vc4_compile *c, struct schedule_state *state)
+{
+        uint32_t i = 0;
+        list_for_each_entry(struct schedule_node, n, &state->worklist, link) {
+                fprintf(stderr, "%3d: ", i++);
+                qir_dump_inst(c, n->inst);
+                fprintf(stderr, " (%d cost)\n",
+                        get_register_pressure_cost(state, n->inst));
+
+                for (int j = 0; j < n->child_count; j++) {
+                        struct schedule_node *child = n->children[j];
+                        fprintf(stderr, "   - ");
+                        qir_dump_inst(c, child->inst);
+                        fprintf(stderr, " (%d parents)\n",
+                                child->parent_count);
+                }
+        }
+}
+
+/* Estimate of how many instructions we should schedule between operations.
+ *
+ * These aren't in real cycle counts, because we're just estimating cycle
+ * times anyway.  QIR instructions will get paired up when turned into QPU
+ * instructions, or extra NOP delays will have to be added due to register
+ * allocation choices.
+ */
+static uint32_t
+latency_between(struct schedule_node *before, struct schedule_node *after)
+{
+        if ((before->inst->op == QOP_TEX_S ||
+             before->inst->op == QOP_TEX_DIRECT) &&
+            after->inst->op == QOP_TEX_RESULT)
+                return 100;
+
+        return 1;
+}
+
+/** Recursive computation of the delay member of a node. */
+static void
+compute_delay(struct schedule_node *n)
+{
+        if (!n->child_count) {
+                /* The color read needs to be scheduled late, to avoid
+                 * locking the scoreboard early.  This is our best tool for
+                 * encouraging that.  The other scoreboard locking ops will
+                 * have this happen by default, since they are generally the
+                 * DAG heads or close to them.
+                 */
+                if (n->inst->op == QOP_TLB_COLOR_READ)
+                        n->delay = 1000;
+                else
+                        n->delay = 1;
+        } else {
+                for (int i = 0; i < n->child_count; i++) {
+                        if (!n->children[i]->delay)
+                                compute_delay(n->children[i]);
+                        n->delay = MAX2(n->delay,
+                                        n->children[i]->delay +
+                                        latency_between(n, n->children[i]));
+                }
+        }
+}
+
+static void
+schedule_instructions(struct vc4_compile *c, struct schedule_state *state)
+{
+        if (debug) {
+                fprintf(stderr, "initial deps:\n");
+                dump_state(c, state);
+        }
+
+        /* Remove non-DAG heads from the list. */
+        list_for_each_entry_safe(struct schedule_node, n,
+                                 &state->worklist, link) {
+                if (n->parent_count != 0)
+                        list_del(&n->link);
+        }
+
+        state->time = 0;
+        while (!list_empty(&state->worklist)) {
+                struct schedule_node *chosen = choose_instruction(state);
+                struct qinst *inst = chosen->inst;
+
+                if (debug) {
+                        fprintf(stderr, "current list:\n");
+                        dump_state(c, state);
+                        fprintf(stderr, "chose: ");
+                        qir_dump_inst(c, inst);
+                        fprintf(stderr, " (%d cost)\n",
+                                get_register_pressure_cost(state, inst));
+                }
+
+                state->time = MAX2(state->time, chosen->unblocked_time);
+
+                /* Schedule this instruction back onto the QIR list. */
+                list_del(&chosen->link);
+                list_add(&inst->link, &c->instructions);
+
+                /* Now that we've scheduled a new instruction, some of its
+                 * children can be promoted to the list of instructions
+                 * ready to be scheduled.  Update the children's unblocked
+                 * time for this DAG edge as we do so.
+                 */
+                for (int i = chosen->child_count - 1; i >= 0; i--) {
+                        struct schedule_node *child = chosen->children[i];
+
+                        child->unblocked_time = MAX2(child->unblocked_time,
+                                                     state->time +
+                                                     latency_between(chosen,
+                                                                     child));
+                        child->parent_count--;
+                        if (child->parent_count == 0)
+                                list_add(&child->link, &state->worklist);
+                }
+
+                /* Update our tracking of register pressure. */
+                for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+                        if (inst->src[i].file == QFILE_TEMP)
+                                BITSET_SET(state->temp_live,
+                                           inst->src[i].index);
+                }
+                if (inst->dst.file == QFILE_TEMP) {
+                        state->temp_writes[inst->dst.index]--;
+                        if (state->temp_writes[inst->dst.index] == 0)
+                                BITSET_CLEAR(state->temp_live,
+                                             inst->dst.index);
+                }
+
+                state->time++;
+        }
+}
+
+void
+qir_schedule_instructions(struct vc4_compile *c)
+{
+        void *mem_ctx = ralloc_context(NULL);
+        struct schedule_state state = { 0 };
+
+        if (debug) {
+                fprintf(stderr, "Pre-schedule instructions\n");
+                qir_dump(c);
+        }
+
+        state.temp_writes = rzalloc_array(mem_ctx, uint32_t, c->num_temps);
+        state.temp_live = rzalloc_array(mem_ctx, BITSET_WORD,
+                                        BITSET_WORDS(c->num_temps));
+        list_inithead(&state.worklist);
+
+        /* Wrap each instruction in a scheduler structure. */
+        list_for_each_entry_safe(struct qinst, inst, &c->instructions, link) {
+                struct schedule_node *n =
+                        rzalloc(mem_ctx, struct schedule_node);
+
+                n->inst = inst;
+                list_del(&inst->link);
+                list_addtail(&n->link, &state.worklist);
+
+                if (inst->dst.file == QFILE_TEMP)
+                        state.temp_writes[inst->dst.index]++;
+        }
+
+        /* Dependencies tracked top-to-bottom. */
+        calculate_forward_deps(c, mem_ctx, &state.worklist);
+        /* Dependencies tracked bottom-to-top. */
+        calculate_reverse_deps(c, mem_ctx, &state.worklist);
+
+        list_for_each_entry(struct schedule_node, n, &state.worklist, link)
+                compute_delay(n);
+
+        schedule_instructions(c, &state);
+
+        if (debug) {
+                fprintf(stderr, "Post-schedule instructions\n");
+                qir_dump(c);
+        }
+
+        ralloc_free(mem_ctx);
+}
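The file comment above describes the whole pass: build the dependency DAG bottom-up, keep only DAG heads on a worklist, and repeatedly pick one by heuristic. A stripped-down sketch of that shape (toy types and a delay-only heuristic; the real pass layers scoreboard, unblocked-time, and register-pressure tests on top):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical toy node: children are this instruction's inputs, which
 * must be placed earlier in the final program, mirroring the DAG
 * orientation used by the pass above. */
struct toy_node {
        const char *name;
        uint32_t delay;              /* longest-path heuristic */
        struct toy_node **children;  /* inputs of this instruction */
        int child_count;
        int parent_count;            /* unscheduled consumers */
};

/* Pick the worklist entry with the deepest chain to the end of the
 * program, the final tie-breaker in choose_instruction() above. */
static struct toy_node *
choose(struct toy_node **worklist, int count)
{
        struct toy_node *chosen = NULL;
        for (int i = 0; i < count; i++) {
                if (!chosen || worklist[i]->delay > chosen->delay)
                        chosen = worklist[i];
        }
        return chosen;
}

/* Bottom-up list scheduling: nodes with no unscheduled consumers are DAG
 * heads; scheduling one may promote its inputs (children) to the
 * worklist.  Each chosen node is placed above everything chosen so far. */
static void
schedule(struct toy_node **worklist, int count)
{
        while (count > 0) {
                struct toy_node *c = choose(worklist, count);

                for (int i = 0; i < count; i++) {
                        if (worklist[i] == c) {
                                worklist[i] = worklist[--count];
                                break;
                        }
                }
                printf("place at top: %s\n", c->name);

                for (int i = 0; i < c->child_count; i++) {
                        if (--c->children[i]->parent_count == 0)
                                worklist[count++] = c->children[i];
                }
        }
}

int main(void)
{
        struct toy_node a = { "a = load", 2, NULL, 0, 1 };
        struct toy_node b = { "b = load", 2, NULL, 0, 1 };
        struct toy_node *c_kids[] = { &a, &b };
        struct toy_node c = { "store(a + b)", 1, c_kids, 2, 0 };

        struct toy_node *worklist[3] = { &c };
        schedule(worklist, 1);  /* emits c, then a and b above it */
        return 0;
}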
diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
index 98b7b6070d7..09164b74932 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
@@ -50,7 +50,7 @@ struct schedule_node {
         uint32_t child_array_size;
         uint32_t parent_count;
 
-        /* Longest cycles + n->latency of any parent of this node. */
+        /* Longest cycles + instruction_latency() of any parent of this node. */
         uint32_t unblocked_time;
 
         /**
@@ -259,7 +259,8 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                 }
         } else if (is_tmu_write(waddr)) {
                 add_write_dep(state, &state->last_tmu_write, n);
-        } else if (qpu_waddr_is_tlb(waddr)) {
+        } else if (qpu_waddr_is_tlb(waddr) ||
+                   waddr == QPU_W_MS_FLAGS) {
                 add_write_dep(state, &state->last_tlb, n);
         } else {
                 switch (waddr) {
@@ -623,6 +624,46 @@ dump_state(struct list_head *schedule_list)
         }
 }
 
+static uint32_t waddr_latency(uint32_t waddr, uint64_t after)
+{
+        if (waddr < 32)
+                return 2;
+
+        /* Apply some huge latency between texture fetch requests and
+         * getting their results back.
+         */
+        if (waddr == QPU_W_TMU0_S) {
+                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0)
+                        return 100;
+        }
+        if (waddr == QPU_W_TMU1_S) {
+                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU1)
+                        return 100;
+        }
+
+        switch(waddr) {
+        case QPU_W_SFU_RECIP:
+        case QPU_W_SFU_RECIPSQRT:
+        case QPU_W_SFU_EXP:
+        case QPU_W_SFU_LOG:
+                return 3;
+        default:
+                return 1;
+        }
+}
+
+static uint32_t
+instruction_latency(struct schedule_node *before, struct schedule_node *after)
+{
+        uint64_t before_inst = before->inst->inst;
+        uint64_t after_inst = after->inst->inst;
+
+        return MAX2(waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_ADD),
+                                  after_inst),
+                    waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_MUL),
+                                  after_inst));
+}
+
 /** Recursive computation of the delay member of a node. */
 static void
 compute_delay(struct schedule_node *n)
@@ -634,7 +675,8 @@ compute_delay(struct schedule_node *n)
                         if (!n->children[i].node->delay)
                                 compute_delay(n->children[i].node);
                         n->delay = MAX2(n->delay,
-                                        n->children[i].node->delay + n->latency);
+                                        n->children[i].node->delay +
+                                        instruction_latency(n, n->children[i].node));
                 }
         }
 }
@@ -663,9 +705,14 @@ mark_instruction_scheduled(struct list_head *schedule_list,
                  * immediately after (or paired with!) the thing reading the
                  * destination.
                  */
-                int latency_from_previous = war_only ? 0 : node->latency;
+                uint32_t latency = 0;
+                if (!war_only) {
+                        latency = instruction_latency(node,
+                                                      node->children[i].node);
+                }
+
                 child->unblocked_time = MAX2(child->unblocked_time,
-                                             time + latency_from_previous);
+                                             time + latency);
                 child->parent_count--;
                 if (child->parent_count == 0)
                         list_add(&child->link, schedule_list);
@@ -798,33 +845,6 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list)
 
         return time;
 }
 
-static uint32_t waddr_latency(uint32_t waddr)
-{
-        if (waddr < 32)
-                return 2;
-
-        /* Some huge number, really. */
-        if (waddr >= QPU_W_TMU0_S && waddr <= QPU_W_TMU1_B)
-                return 100;
-
-        switch(waddr) {
-        case QPU_W_SFU_RECIP:
-        case QPU_W_SFU_RECIPSQRT:
-        case QPU_W_SFU_EXP:
-        case QPU_W_SFU_LOG:
-                return 3;
-        default:
-                return 1;
-        }
-}
-
-static uint32_t
-instruction_latency(uint64_t inst)
-{
-        return MAX2(waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_ADD)),
-                    waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_MUL)));
-}
-
 uint32_t
 qpu_schedule_instructions(struct vc4_compile *c)
 {
@@ -851,7 +871,6 @@ qpu_schedule_instructions(struct vc4_compile *c)
                 struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);
 
                 n->inst = inst;
-                n->latency = instruction_latency(inst->inst);
 
                 if (reads_uniform(inst->inst)) {
                         n->uniform = next_uniform++;
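The latency rework above makes edge cost pairwise: waddr_latency() now also looks at the consuming instruction, so only an actual TMU load signal pays the huge texture-fetch estimate. The cases it computes, restated as a comment (all values taken from the hunk above):

/* Worked cases for the new waddr_latency()/instruction_latency():
 *
 *   producer writes QPU_W_TMU0_S, consumer signals QPU_SIG_LOAD_TMU0
 *       -> 100  (texture fetch round trip)
 *   producer writes QPU_W_TMU0_S, consumer is unrelated ALU work
 *       -> 1    (no reason to stall the schedule any more)
 *   producer writes an SFU register (RECIP/RECIPSQRT/EXP/LOG)
 *       -> 3    (SFU result delay, regardless of consumer)
 *   producer writes an accumulator or regfile entry (waddr < 32)
 *       -> 2
 *
 * instruction_latency() then takes the MAX2() over the producer's ADD and
 * MUL write addresses, since either half of the paired QPU instruction
 * may be the one feeding the consumer.
 */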
diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c
index e7069a4e9ff..9e6678a0625 100644
--- a/src/gallium/drivers/vc4/vc4_resource.c
+++ b/src/gallium/drivers/vc4/vc4_resource.c
@@ -207,7 +207,7 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
         /* If the resource is multisampled, we need to resolve to single
          * sample.  This seems like it should be handled at a higher layer.
          */
-        if (prsc->nr_samples) {
+        if (prsc->nr_samples > 1) {
                 trans->ss_resource = vc4_get_temp_resource(pctx, prsc, box);
                 if (!trans->ss_resource)
                         goto fail;
@@ -377,7 +377,7 @@ vc4_setup_slices(struct vc4_resource *rsc)
 
                 if (!rsc->tiled) {
                         slice->tiling = VC4_TILING_FORMAT_LINEAR;
-                        if (prsc->nr_samples) {
+                        if (prsc->nr_samples > 1) {
                                 /* MSAA (4x) surfaces are stored as raw tile buffer contents. */
                                 level_width = align(level_width, 32);
                                 level_height = align(level_height, 32);
@@ -458,7 +458,7 @@ vc4_resource_setup(struct pipe_screen *pscreen,
         prsc->screen = pscreen;
         rsc->base.vtbl = &vc4_resource_vtbl;
 
-        if (prsc->nr_samples == 0)
+        if (prsc->nr_samples <= 1)
                 rsc->cpp = util_format_get_blocksize(tmpl->format);
         else
                 rsc->cpp = sizeof(uint32_t);
@@ -475,7 +475,7 @@ get_resource_texture_format(struct pipe_resource *prsc)
         uint8_t format = vc4_get_tex_format(prsc->format);
 
         if (!rsc->tiled) {
-                if (prsc->nr_samples) {
+                if (prsc->nr_samples > 1) {
                         return ~0;
                 } else {
                         assert(format == VC4_TEXTURE_TYPE_RGBA8888);
@@ -497,7 +497,7 @@ vc4_resource_create(struct pipe_screen *pscreen,
          * communicate metadata about tiling currently.
          */
         if (tmpl->target == PIPE_BUFFER ||
-            tmpl->nr_samples ||
+            tmpl->nr_samples > 1 ||
             (tmpl->bind & (PIPE_BIND_SCANOUT |
                            PIPE_BIND_LINEAR |
                            PIPE_BIND_SHARED |
@@ -832,7 +832,7 @@ vc4_dump_surface(struct pipe_surface *psurf)
         if (!psurf)
                 return;
 
-        if (psurf->texture->nr_samples)
+        if (psurf->texture->nr_samples > 1)
                 vc4_dump_surface_msaa(psurf);
         else
                 vc4_dump_surface_non_msaa(psurf);
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 090579c2c76..8ddf0865d21 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -57,6 +57,10 @@ static const struct debug_named_value debug_options[] = {
           "Flush after each draw call" },
         { "always_sync", VC4_DEBUG_ALWAYS_SYNC,
           "Wait for finish after each flush" },
+#if USE_VC4_SIMULATOR
+        { "dump", VC4_DEBUG_DUMP,
+          "Write a GPU command stream trace file" },
+#endif
         { NULL }
 };
 
diff --git a/src/gallium/drivers/vc4/vc4_screen.h b/src/gallium/drivers/vc4/vc4_screen.h
index 5992e371093..03f76b257e3 100644
--- a/src/gallium/drivers/vc4/vc4_screen.h
+++ b/src/gallium/drivers/vc4/vc4_screen.h
@@ -41,6 +41,7 @@ struct vc4_bo;
 #define VC4_DEBUG_ALWAYS_FLUSH 0x0080
 #define VC4_DEBUG_ALWAYS_SYNC  0x0100
 #define VC4_DEBUG_NIR          0x0200
+#define VC4_DEBUG_DUMP         0x0400
 
 #define VC4_MAX_MIP_LEVELS 12
 #define VC4_MAX_TEXTURE_SAMPLERS 16
diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c
index 4b1df9234b6..521ef50f814 100644
--- a/src/gallium/drivers/vc4/vc4_simulator.c
+++ b/src/gallium/drivers/vc4/vc4_simulator.c
@@ -131,6 +131,93 @@ vc4_simulator_unpin_bos(struct vc4_exec_info *exec)
         return 0;
 }
 
+static void
+vc4_dump_to_file(struct vc4_exec_info *exec)
+{
+        static int dumpno = 0;
+        struct drm_vc4_get_hang_state *state;
+        struct drm_vc4_get_hang_state_bo *bo_state;
+        unsigned int dump_version = 0;
+
+        if (!(vc4_debug & VC4_DEBUG_DUMP))
+                return;
+
+        state = calloc(1, sizeof(*state));
+
+        int unref_count = 0;
+        list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec->unref_list,
+                                 unref_head) {
+                unref_count++;
+        }
+
+        /* Add one more for the overflow area that isn't wrapped in a BO. */
+        state->bo_count = exec->bo_count + unref_count + 1;
+        bo_state = calloc(state->bo_count, sizeof(*bo_state));
+
+        char *filename = NULL;
+        asprintf(&filename, "vc4-dri-%d.dump", dumpno++);
+        FILE *f = fopen(filename, "w+");
+        if (!f) {
+                fprintf(stderr, "Couldn't open %s: %s\n", filename,
+                        strerror(errno));
+                return;
+        }
+
+        fwrite(&dump_version, sizeof(dump_version), 1, f);
+
+        state->ct0ca = exec->ct0ca;
+        state->ct0ea = exec->ct0ea;
+        state->ct1ca = exec->ct1ca;
+        state->ct1ea = exec->ct1ea;
+        state->start_bin = exec->ct0ca;
+        state->start_render = exec->ct1ca;
+        fwrite(state, sizeof(*state), 1, f);
+
+        int i;
+        for (i = 0; i < exec->bo_count; i++) {
+                struct drm_gem_cma_object *cma_bo = exec->bo[i];
+                bo_state[i].handle = i; /* Not used by the parser. */
+                bo_state[i].paddr = cma_bo->paddr;
+                bo_state[i].size = cma_bo->base.size;
+        }
+
+        list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec->unref_list,
+                                 unref_head) {
+                struct drm_gem_cma_object *cma_bo = &bo->base;
+                bo_state[i].handle = 0;
+                bo_state[i].paddr = cma_bo->paddr;
+                bo_state[i].size = cma_bo->base.size;
+                i++;
+        }
+
+        /* Add the static overflow memory area. */
+        bo_state[i].handle = exec->bo_count;
+        bo_state[i].paddr = 0;
+        bo_state[i].size = OVERFLOW_SIZE;
+        i++;
+
+        fwrite(bo_state, sizeof(*bo_state), state->bo_count, f);
+
+        for (int i = 0; i < exec->bo_count; i++) {
+                struct drm_gem_cma_object *cma_bo = exec->bo[i];
+                fwrite(cma_bo->vaddr, cma_bo->base.size, 1, f);
+        }
+
+        list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec->unref_list,
+                                 unref_head) {
+                struct drm_gem_cma_object *cma_bo = &bo->base;
+                fwrite(cma_bo->vaddr, cma_bo->base.size, 1, f);
+        }
+
+        void *overflow = calloc(1, OVERFLOW_SIZE);
+        fwrite(overflow, 1, OVERFLOW_SIZE, f);
+        free(overflow);
+
+        free(state);
+        free(bo_state);
+        fclose(f);
+}
+
 int
 vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args)
 {
@@ -183,6 +270,8 @@ vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args)
                              exec.ct1ea - exec.ct1ca, true);
         }
 
+        vc4_dump_to_file(&exec);
+
         if (exec.ct0ca != exec.ct0ea) {
                 int bfc = simpenrose_do_binning(exec.ct0ca, exec.ct0ea);
                 if (bfc != 1) {
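vc4_dump_to_file() fixes the trace layout: a u32 version word, the drm_vc4_get_hang_state header, bo_count BO records, then each BO's contents back to back, ending with the raw overflow area. A hypothetical reader for that layout (not part of the patch; dumps are produced by running the simulator with the new option selected through the VC4_DEBUG environment variable, e.g. VC4_DEBUG=dump, and the reader assumes it runs with the same struct ABI as the writer):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include "vc4_drm.h"

static int
read_dump(const char *path)
{
        FILE *f = fopen(path, "r");
        if (!f)
                return -1;

        uint32_t version;
        struct drm_vc4_get_hang_state state;
        struct drm_vc4_get_hang_state_bo *bos = NULL;

        if (fread(&version, sizeof(version), 1, f) != 1 ||
            fread(&state, sizeof(state), 1, f) != 1)
                goto err;

        bos = calloc(state.bo_count, sizeof(*bos));
        if (fread(bos, sizeof(*bos), state.bo_count, f) != state.bo_count)
                goto err;

        printf("dump v%u: bin 0x%08x..0x%08x, render 0x%08x..0x%08x\n",
               version, state.ct0ca, state.ct0ea, state.ct1ca, state.ct1ea);

        /* BO contents follow the records in the same order; the final
         * record is the overflow area the writer appended. */
        for (uint32_t i = 0; i < state.bo_count; i++) {
                void *contents = malloc(bos[i].size);
                if (fread(contents, bos[i].size, 1, f) != 1) {
                        free(contents);
                        goto err;
                }
                printf("  bo %u: paddr 0x%08x, %u bytes\n",
                       i, bos[i].paddr, bos[i].size);
                free(contents);
        }

        free(bos);
        fclose(f);
        return 0;
err:
        free(bos);
        fclose(f);
        return -1;
}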
diff --git a/src/gallium/drivers/vc4/vc4_simulator_validate.h b/src/gallium/drivers/vc4/vc4_simulator_validate.h
index 40d3ada6ca2..e1f8b5a305e 100644
--- a/src/gallium/drivers/vc4/vc4_simulator_validate.h
+++ b/src/gallium/drivers/vc4/vc4_simulator_validate.h
@@ -45,7 +45,7 @@ struct vc4_exec_info;
 #define roundup(x, y) align(x, y)
 #define round_up(x, y) align(x, y)
 #define max(x, y) MAX2(x, y)
-#define min(x, y) MiN2(x, y)
+#define min(x, y) MIN2(x, y)
 #define BUG_ON(condition) assert(!(condition))
 
 static inline int
diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c
index d9c0f55b23d..7e0ada7d0fa 100644
--- a/src/gallium/drivers/vc4/vc4_state.c
+++ b/src/gallium/drivers/vc4/vc4_state.c
@@ -462,9 +462,9 @@ vc4_set_framebuffer_state(struct pipe_context *pctx,
 
         vc4->msaa = false;
         if (cso->cbufs[0])
-                vc4->msaa = cso->cbufs[0]->texture->nr_samples != 0;
+                vc4->msaa = cso->cbufs[0]->texture->nr_samples > 1;
         else if (cso->zsbuf)
-                vc4->msaa = cso->zsbuf->texture->nr_samples != 0;
+                vc4->msaa = cso->zsbuf->texture->nr_samples > 1;
 
         if (vc4->msaa) {
                 vc4->tile_width = 32;