Diffstat (limited to 'src/gallium/drivers/vc4')
 src/gallium/drivers/vc4/Makefile.sources         |   1
 src/gallium/drivers/vc4/vc4_blit.c               |  10
 src/gallium/drivers/vc4/vc4_context.c            |  17
 src/gallium/drivers/vc4/vc4_drm.h                |  45
 src/gallium/drivers/vc4/vc4_job.c                |  12
 src/gallium/drivers/vc4/vc4_program.c            |   5
 src/gallium/drivers/vc4/vc4_qir.h                |   1
 src/gallium/drivers/vc4/vc4_qir_schedule.c       | 619
 src/gallium/drivers/vc4/vc4_qpu_schedule.c       |  85
 src/gallium/drivers/vc4/vc4_resource.c           |  12
 src/gallium/drivers/vc4/vc4_screen.c             |   4
 src/gallium/drivers/vc4/vc4_screen.h             |   1
 src/gallium/drivers/vc4/vc4_simulator.c          |  89
 src/gallium/drivers/vc4/vc4_simulator_validate.h |   2
 src/gallium/drivers/vc4/vc4_state.c              |   4
 15 files changed, 844 insertions(+), 63 deletions(-)
diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources
index 24b577ae9f3..a9a2742ec66 100644
--- a/src/gallium/drivers/vc4/Makefile.sources
+++ b/src/gallium/drivers/vc4/Makefile.sources
@@ -32,6 +32,7 @@ C_SOURCES := \
vc4_program.c \
vc4_qir.c \
vc4_qir_lower_uniforms.c \
+ vc4_qir_schedule.c \
vc4_qir.h \
vc4_qpu.c \
vc4_qpu_defines.h \
diff --git a/src/gallium/drivers/vc4/vc4_blit.c b/src/gallium/drivers/vc4/vc4_blit.c
index 16dcece1ea1..876c296dc85 100644
--- a/src/gallium/drivers/vc4/vc4_blit.c
+++ b/src/gallium/drivers/vc4/vc4_blit.c
@@ -54,8 +54,8 @@ vc4_tile_blit(struct pipe_context *pctx, const struct pipe_blit_info *info)
bool old_msaa = vc4->msaa;
int old_tile_width = vc4->tile_width;
int old_tile_height = vc4->tile_height;
- bool msaa = (info->src.resource->nr_samples ||
- info->dst.resource->nr_samples);
+ bool msaa = (info->src.resource->nr_samples > 1 ||
+ info->dst.resource->nr_samples > 1);
int tile_width = msaa ? 32 : 64;
int tile_height = msaa ? 32 : 64;
@@ -110,9 +110,11 @@ vc4_tile_blit(struct pipe_context *pctx, const struct pipe_blit_info *info)
pipe_surface_reference(&vc4->color_read, src_surf);
pipe_surface_reference(&vc4->color_write,
- dst_surf->texture->nr_samples ? NULL : dst_surf);
+ dst_surf->texture->nr_samples > 1 ?
+ NULL : dst_surf);
pipe_surface_reference(&vc4->msaa_color_write,
- dst_surf->texture->nr_samples ? dst_surf : NULL);
+ dst_surf->texture->nr_samples > 1 ?
+ dst_surf : NULL);
pipe_surface_reference(&vc4->zs_read, NULL);
pipe_surface_reference(&vc4->zs_write, NULL);
pipe_surface_reference(&vc4->msaa_zs_write, NULL);
diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c
index 1cd167600b9..312b006f96e 100644
--- a/src/gallium/drivers/vc4/vc4_context.c
+++ b/src/gallium/drivers/vc4/vc4_context.c
@@ -67,15 +67,13 @@ vc4_flush(struct pipe_context *pctx)
cl_u8(&bcl, VC4_PACKET_FLUSH);
cl_end(&vc4->bcl, bcl);
- vc4->msaa = false;
if (cbuf && (vc4->resolve & PIPE_CLEAR_COLOR0)) {
pipe_surface_reference(&vc4->color_write,
- cbuf->texture->nr_samples ? NULL : cbuf);
+ cbuf->texture->nr_samples > 1 ?
+ NULL : cbuf);
pipe_surface_reference(&vc4->msaa_color_write,
- cbuf->texture->nr_samples ? cbuf : NULL);
-
- if (cbuf->texture->nr_samples)
- vc4->msaa = true;
+ cbuf->texture->nr_samples > 1 ?
+ cbuf : NULL);
if (!(vc4->cleared & PIPE_CLEAR_COLOR0)) {
pipe_surface_reference(&vc4->color_read, cbuf);
@@ -92,15 +90,12 @@ vc4_flush(struct pipe_context *pctx)
if (vc4->framebuffer.zsbuf &&
(vc4->resolve & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) {
pipe_surface_reference(&vc4->zs_write,
- zsbuf->texture->nr_samples ?
+ zsbuf->texture->nr_samples > 1 ?
NULL : zsbuf);
pipe_surface_reference(&vc4->msaa_zs_write,
- zsbuf->texture->nr_samples ?
+ zsbuf->texture->nr_samples > 1 ?
zsbuf : NULL);
- if (zsbuf->texture->nr_samples)
- vc4->msaa = true;
-
if (!(vc4->cleared & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) {
pipe_surface_reference(&vc4->zs_read, zsbuf);
} else {
diff --git a/src/gallium/drivers/vc4/vc4_drm.h b/src/gallium/drivers/vc4/vc4_drm.h
index bf58c6cd078..110c783e5fd 100644
--- a/src/gallium/drivers/vc4/vc4_drm.h
+++ b/src/gallium/drivers/vc4/vc4_drm.h
@@ -32,6 +32,7 @@
#define DRM_VC4_CREATE_BO 0x03
#define DRM_VC4_MMAP_BO 0x04
#define DRM_VC4_CREATE_SHADER_BO 0x05
+#define DRM_VC4_GET_HANG_STATE 0x06
#define DRM_IOCTL_VC4_SUBMIT_CL DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
#define DRM_IOCTL_VC4_WAIT_SEQNO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno)
@@ -39,6 +40,7 @@
#define DRM_IOCTL_VC4_CREATE_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo)
#define DRM_IOCTL_VC4_MMAP_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo)
#define DRM_IOCTL_VC4_CREATE_SHADER_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_CREATE_SHADER_BO, struct drm_vc4_create_shader_bo)
+#define DRM_IOCTL_VC4_GET_HANG_STATE DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_GET_HANG_STATE, struct drm_vc4_get_hang_state)
struct drm_vc4_submit_rcl_surface {
uint32_t hindex; /* Handle index, or ~0 if not present. */
@@ -231,4 +233,47 @@ struct drm_vc4_mmap_bo {
uint64_t offset;
};
+struct drm_vc4_get_hang_state_bo {
+ uint32_t handle;
+ uint32_t paddr;
+ uint32_t size;
+ uint32_t pad;
+};
+
+/**
+ * struct drm_vc4_get_hang_state - ioctl argument for collecting state
+ * from a GPU hang for analysis.
+ */
+struct drm_vc4_get_hang_state {
+ /** Pointer to array of struct drm_vc4_get_hang_state_bo. */
+ uint64_t bo;
+ /**
+ * On input, the size of the bo array. Output is the number
+ * of bos to be returned.
+ */
+ uint32_t bo_count;
+
+ uint32_t start_bin, start_render;
+
+ uint32_t ct0ca, ct0ea;
+ uint32_t ct1ca, ct1ea;
+ uint32_t ct0cs, ct1cs;
+ uint32_t ct0ra0, ct1ra0;
+
+ uint32_t bpca, bpcs;
+ uint32_t bpoa, bpos;
+
+ uint32_t vpmbase;
+
+ uint32_t dbge;
+ uint32_t fdbgo;
+ uint32_t fdbgb;
+ uint32_t fdbgr;
+ uint32_t fdbgs;
+ uint32_t errstat;
+
+ /* Padding, so that more registers can be saved here in the future. */
+ uint32_t pad[16];
+};
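+
+/* Sketch of the expected two-call usage pattern (an assumption about how
+ * the kernel fills this in, not something defined by this patch):
+ *
+ *     struct drm_vc4_get_hang_state get = { .bo_count = 0 };
+ *     drmIoctl(fd, DRM_IOCTL_VC4_GET_HANG_STATE, &get);
+ *     bos = calloc(get.bo_count, sizeof(struct drm_vc4_get_hang_state_bo));
+ *     get.bo = (uintptr_t)bos;
+ *     drmIoctl(fd, DRM_IOCTL_VC4_GET_HANG_STATE, &get);
+ */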
+
#endif /* _UAPI_VC4_DRM_H_ */
diff --git a/src/gallium/drivers/vc4/vc4_job.c b/src/gallium/drivers/vc4/vc4_job.c
index 79bf2c49eec..5d071ec862f 100644
--- a/src/gallium/drivers/vc4/vc4_job.c
+++ b/src/gallium/drivers/vc4/vc4_job.c
@@ -89,7 +89,7 @@ vc4_submit_setup_rcl_surface(struct vc4_context *vc4,
submit_surf->hindex = vc4_gem_hindex(vc4, rsc->bo);
submit_surf->offset = surf->offset;
- if (psurf->texture->nr_samples == 0) {
+ if (psurf->texture->nr_samples <= 1) {
if (is_depth) {
submit_surf->bits =
VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_ZS,
@@ -132,7 +132,7 @@ vc4_submit_setup_rcl_render_config_surface(struct vc4_context *vc4,
submit_surf->hindex = vc4_gem_hindex(vc4, rsc->bo);
submit_surf->offset = surf->offset;
- if (psurf->texture->nr_samples == 0) {
+ if (psurf->texture->nr_samples <= 1) {
submit_surf->bits =
VC4_SET_FIELD(vc4_rt_format_is_565(surf->base.format) ?
VC4_RENDER_CONFIG_FORMAT_BGR565 :
@@ -240,9 +240,11 @@ vc4_job_submit(struct vc4_context *vc4)
#else
ret = vc4_simulator_flush(vc4, &submit);
#endif
- if (ret) {
- fprintf(stderr, "VC4 submit failed\n");
- abort();
+ static bool warned = false;
+ if (ret && !warned) {
+ fprintf(stderr, "Draw call returned %s. "
+ "Expect corruption.\n", strerror(errno));
+ warned = true;
}
}
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index caad05cb9f7..d11c4e3b7ca 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -1848,12 +1848,15 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
qir_optimize(c);
qir_lower_uniforms(c);
+ qir_schedule_instructions(c);
+
if (vc4_debug & VC4_DEBUG_QIR) {
fprintf(stderr, "%s prog %d/%d QIR:\n",
qir_get_stage_name(c->stage),
c->program_id, c->variant_id);
qir_dump(c);
}
+
qir_reorder_uniforms(c);
vc4_generate_code(vc4, c);
@@ -2043,7 +2046,7 @@ vc4_setup_shared_key(struct vc4_context *vc4, struct vc4_key *key,
key->tex[i].swizzle[2] = sampler->swizzle_b;
key->tex[i].swizzle[3] = sampler->swizzle_a;
- if (sampler->texture->nr_samples) {
+ if (sampler->texture->nr_samples > 1) {
key->tex[i].msaa_width = sampler->texture->width0;
key->tex[i].msaa_height = sampler->texture->height0;
} else if (sampler) {
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index c34dce3aec8..b0fbb4c1db2 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -459,6 +459,7 @@ void qir_remove_instruction(struct vc4_compile *c, struct qinst *qinst);
struct qreg qir_uniform(struct vc4_compile *c,
enum quniform_contents contents,
uint32_t data);
+void qir_schedule_instructions(struct vc4_compile *c);
void qir_reorder_uniforms(struct vc4_compile *c);
void qir_emit(struct vc4_compile *c, struct qinst *inst);
diff --git a/src/gallium/drivers/vc4/vc4_qir_schedule.c b/src/gallium/drivers/vc4/vc4_qir_schedule.c
new file mode 100644
index 00000000000..d20815f055e
--- /dev/null
+++ b/src/gallium/drivers/vc4/vc4_qir_schedule.c
@@ -0,0 +1,619 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ * Copyright © 2014-2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file vc4_qir_schedule.c
+ *
+ * The basic model of the list scheduler is to take a basic block, compute a
+ * DAG of the dependencies from the bottom up, and make a list of the DAG
+ * heads. Heuristically pick a DAG head and schedule (remove) it, then put
+ * all the parents that are now DAG heads into the list of things to
+ * schedule.
+ *
+ * The goal of scheduling here, before register allocation and conversion to
+ * QPU instructions, is to reduce register pressure by reordering instructions
+ * to consume values when possible.
+ */
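+
+/* A small worked example (illustrative QIR spelling, not this compiler's
+ * exact dump syntax):
+ *
+ *     MOV  t0, vary
+ *     FMUL t1, t0, uni
+ *     FADD t2, t1, t0
+ *
+ * Built bottom-up, the FADD is the DAG head and the MOV is its transitive
+ * child.  Picking consumers soon after their producers lets t0 and t1 die
+ * quickly instead of staying live across unrelated instructions.
+ */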
+
+#include "vc4_qir.h"
+
+static bool debug;
+
+struct schedule_node {
+ struct list_head link;
+ struct qinst *inst;
+
+ struct schedule_node **children;
+ uint32_t child_count;
+ uint32_t child_array_size;
+ uint32_t parent_count;
+
+ /* Length of the longest (latency) chain from a DAG head to this
+ * instruction.
+ */
+ uint32_t delay;
+
+ /* Longest time + latency_between(parent, this) of any parent of this
+ * node.
+ */
+ uint32_t unblocked_time;
+};
+
+struct schedule_state {
+ /* List of struct schedule_node *. This starts out with all
+ * instructions, and after dependency updates it's trimmed to be just
+ * the DAG heads.
+ */
+ struct list_head worklist;
+
+ uint32_t time;
+
+ uint32_t *temp_writes;
+
+ BITSET_WORD *temp_live;
+};
+
+/* When walking the instructions in reverse, we need to swap before/after in
+ * add_dep().
+ */
+enum direction { F, R };
+
+/**
+ * Marks a dependency between two instructions, that @after must appear after
+ * @before.
+ *
+ * Our dependencies are tracked as a DAG. Since we're scheduling bottom-up,
+ * the latest instructions with nothing left to schedule are the DAG heads,
+ * and their inputs are their children.
+ */
+static void
+add_dep(enum direction dir,
+ struct schedule_node *before,
+ struct schedule_node *after)
+{
+ if (!before || !after)
+ return;
+
+ assert(before != after);
+
+ if (dir == R) {
+ struct schedule_node *t = before;
+ before = after;
+ after = t;
+ }
+
+ for (int i = 0; i < after->child_count; i++) {
+ if (after->children[i] == before)
+ return;
+ }
+
+ if (after->child_array_size <= after->child_count) {
+ after->child_array_size = MAX2(after->child_array_size * 2, 16);
+ after->children = reralloc(after, after->children,
+ struct schedule_node *,
+ after->child_array_size);
+ }
+
+ after->children[after->child_count] = before;
+ after->child_count++;
+ before->parent_count++;
+}
+
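+/* Serializes accesses to a single resource: the new node is made to depend
+ * on the previous accessor, then recorded as the access the next node must
+ * wait for.
+ */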
+static void
+add_write_dep(enum direction dir,
+ struct schedule_node **before,
+ struct schedule_node *after)
+{
+ add_dep(dir, *before, after);
+ *before = after;
+}
+
+struct schedule_setup_state {
+ struct schedule_node **last_temp_write;
+ struct schedule_node *last_sf;
+ struct schedule_node *last_vary_read;
+ struct schedule_node *last_vpm_read;
+ struct schedule_node *last_vpm_write;
+ struct schedule_node *last_tex_coord;
+ struct schedule_node *last_tex_result;
+ struct schedule_node *last_tlb;
+ enum direction dir;
+
+ /**
+ * Texture FIFO tracking. This is done top-to-bottom, and is used to
+ * track the QOP_TEX_RESULTs and add dependencies on previous ones
+ * when trying to submit texture coords with TFREQ full or new texture
+ * fetches with TFRCV full.
+ */
+ struct {
+ struct schedule_node *node;
+ int coords;
+ } tex_fifo[8];
+ int tfreq_count; /**< Number of texture coords outstanding. */
+ int tfrcv_count; /**< Number of texture results outstanding. */
+ int tex_fifo_pos;
+};
+
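+/* Retires the oldest outstanding texture fetch from the FIFO, making @n
+ * depend on its QOP_TEX_RESULT so that TFREQ/TFRCV space is known to be
+ * available again.
+ */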
+static void
+block_until_tex_result(struct schedule_setup_state *state, struct schedule_node *n)
+{
+ add_dep(state->dir, state->tex_fifo[0].node, n);
+
+ state->tfreq_count -= state->tex_fifo[0].coords;
+ state->tfrcv_count--;
+
+ memmove(&state->tex_fifo[0],
+ &state->tex_fifo[1],
+ state->tex_fifo_pos * sizeof(state->tex_fifo[0]));
+ state->tex_fifo_pos--;
+}
+
+/**
+ * Common code for dependencies that need to be tracked both forward and
+ * backward.
+ *
+ * This is for things like "all VPM reads have to happen in order."
+ */
+static void
+calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
+{
+ struct qinst *inst = n->inst;
+ enum direction dir = state->dir;
+
+ /* Add deps for temp register and varying accesses. Note that we
+ * ignore uniform accesses, because qir_reorder_uniforms() happens
+ * after this.
+ */
+ for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ switch (inst->src[i].file) {
+ case QFILE_TEMP:
+ add_dep(dir,
+ state->last_temp_write[inst->src[i].index], n);
+ break;
+
+ case QFILE_VARY:
+ add_write_dep(dir, &state->last_vary_read, n);
+ break;
+
+ case QFILE_VPM:
+ add_write_dep(dir, &state->last_vpm_read, n);
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ switch (inst->op) {
+ case QOP_VARY_ADD_C:
+ add_dep(dir, state->last_vary_read, n);
+ break;
+
+ case QOP_TEX_S:
+ case QOP_TEX_T:
+ case QOP_TEX_R:
+ case QOP_TEX_B:
+ case QOP_TEX_DIRECT:
+ /* Texturing setup gets scheduled in order, because
+ * the uniforms referenced by them have to land in a
+ * specific order.
+ */
+ add_write_dep(dir, &state->last_tex_coord, n);
+ break;
+
+ case QOP_TEX_RESULT:
+ /* Results have to be fetched in order. */
+ add_write_dep(dir, &state->last_tex_result, n);
+ break;
+
+ case QOP_TLB_COLOR_WRITE:
+ case QOP_TLB_COLOR_READ:
+ case QOP_TLB_Z_WRITE:
+ case QOP_TLB_STENCIL_SETUP:
+ case QOP_MS_MASK:
+ add_write_dep(dir, &state->last_tlb, n);
+ break;
+
+ case QOP_TLB_DISCARD_SETUP:
+ add_write_dep(dir, &state->last_sf, n);
+ add_write_dep(dir, &state->last_tlb, n);
+ break;
+
+ default:
+ break;
+ }
+
+ if (inst->dst.file == QFILE_VPM)
+ add_write_dep(dir, &state->last_vpm_write, n);
+ else if (inst->dst.file == QFILE_TEMP)
+ add_write_dep(dir, &state->last_temp_write[inst->dst.index], n);
+
+ if (inst->sf)
+ add_write_dep(dir, &state->last_sf, n);
+
+ if (qir_depends_on_flags(inst)) {
+ add_dep(dir, state->last_sf, n);
+ }
+}
+
+static void
+calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
+ struct list_head *schedule_list)
+{
+ struct schedule_setup_state state;
+
+ memset(&state, 0, sizeof(state));
+ state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *,
+ c->num_temps);
+ state.dir = F;
+
+ list_for_each_entry(struct schedule_node, n, schedule_list, link) {
+ struct qinst *inst = n->inst;
+
+ calculate_deps(&state, n);
+
+ switch (inst->op) {
+ case QOP_TEX_S:
+ case QOP_TEX_T:
+ case QOP_TEX_R:
+ case QOP_TEX_B:
+ case QOP_TEX_DIRECT:
+ /* If the texture coordinate fifo is full,
+ * block this on the last QOP_TEX_RESULT.
+ */
+ if (state.tfreq_count == 8) {
+ block_until_tex_result(&state, n);
+ }
+
+ /* If the texture result fifo is full, block
+ * adding any more to it until the last
+ * QOP_TEX_RESULT.
+ */
+ if (inst->op == QOP_TEX_S ||
+ inst->op == QOP_TEX_DIRECT) {
+ if (state.tfrcv_count == 4)
+ block_until_tex_result(&state, n);
+ state.tfrcv_count++;
+ }
+
+ state.tex_fifo[state.tex_fifo_pos].coords++;
+ state.tfreq_count++;
+ break;
+
+ case QOP_TEX_RESULT:
+ /* Results have to be fetched after the
+ * coordinate setup. Note that we're assuming
+ * here that our input shader has the texture
+ * coord setup and result fetch in order,
+ * which is true initially but not necessarily
+ * true of our instruction stream after this pass.
+ */
+ add_dep(state.dir, state.last_tex_coord, n);
+
+ state.tex_fifo[state.tex_fifo_pos].node = n;
+
+ state.tex_fifo_pos++;
+ memset(&state.tex_fifo[state.tex_fifo_pos], 0,
+ sizeof(state.tex_fifo[0]));
+ break;
+ default:
+ assert(!qir_is_tex(inst));
+ break;
+ }
+ }
+}
+
+static void
+calculate_reverse_deps(struct vc4_compile *c, void *mem_ctx,
+ struct list_head *schedule_list)
+{
+ struct schedule_setup_state state;
+
+ memset(&state, 0, sizeof(state));
+ state.dir = R;
+ state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *,
+ c->num_temps);
+
+ list_for_each_entry_rev(struct schedule_node, n, schedule_list, link) {
+ calculate_deps(&state, n);
+ }
+}
+
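+/* Returns the net change in live temps if @inst were scheduled now: each
+ * not-yet-live source becomes live (+1), and the destination's value dies
+ * here when this is its only remaining write (-1).
+ */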
+static int
+get_register_pressure_cost(struct schedule_state *state, struct qinst *inst)
+{
+ int cost = 0;
+
+ if (inst->dst.file == QFILE_TEMP &&
+ state->temp_writes[inst->dst.index] == 1)
+ cost--;
+
+ for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ if (inst->src[i].file == QFILE_TEMP &&
+ !BITSET_TEST(state->temp_live, inst->src[i].index)) {
+ cost++;
+ }
+ }
+
+ return cost;
+}
+
+static bool
+locks_scoreboard(struct qinst *inst)
+{
+ switch (inst->op) {
+ case QOP_TLB_Z_WRITE:
+ case QOP_TLB_COLOR_WRITE:
+ case QOP_TLB_COLOR_WRITE_MS:
+ case QOP_TLB_COLOR_READ:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static struct schedule_node *
+choose_instruction(struct schedule_state *state)
+{
+ struct schedule_node *chosen = NULL;
+
+ list_for_each_entry(struct schedule_node, n, &state->worklist, link) {
+ if (!chosen) {
+ chosen = n;
+ continue;
+ }
+
+ /* Prefer scheduling things that lock the scoreboard, so that
+ * they appear late in the program and we get more parallelism
+ * between shaders on multiple QPUs hitting the same fragment.
+ */
+ if (locks_scoreboard(n->inst) &&
+ !locks_scoreboard(chosen->inst)) {
+ chosen = n;
+ continue;
+ } else if (!locks_scoreboard(n->inst) &&
+ locks_scoreboard(chosen->inst)) {
+ continue;
+ }
+
+ /* If we would block on the previously chosen node, but would
+ * block less on this one, then prefer it.
+ */
+ if (chosen->unblocked_time > state->time &&
+ n->unblocked_time < chosen->unblocked_time) {
+ chosen = n;
+ continue;
+ } else if (n->unblocked_time > state->time &&
+ n->unblocked_time > chosen->unblocked_time) {
+ continue;
+ }
+
+ /* If we can definitely reduce register pressure, do so
+ * immediately.
+ */
+ int register_pressure_cost =
+ get_register_pressure_cost(state, n->inst);
+ int chosen_register_pressure_cost =
+ get_register_pressure_cost(state, chosen->inst);
+
+ if (register_pressure_cost < chosen_register_pressure_cost) {
+ chosen = n;
+ continue;
+ } else if (register_pressure_cost >
+ chosen_register_pressure_cost) {
+ continue;
+ }
+
+ /* Otherwise, prefer instructions with the deepest chain to
+ * the end of the program. This avoids the problem of
+ * "everything generates a temp, nothing finishes freeing one,
+ * guess I'll just keep emitting varying mul/adds".
+ */
+ if (n->delay > chosen->delay) {
+ chosen = n;
+ continue;
+ } else if (n->delay < chosen->delay) {
+ continue;
+ }
+ }
+
+ return chosen;
+}
+
+static void
+dump_state(struct vc4_compile *c, struct schedule_state *state)
+{
+ uint32_t i = 0;
+ list_for_each_entry(struct schedule_node, n, &state->worklist, link) {
+ fprintf(stderr, "%3d: ", i++);
+ qir_dump_inst(c, n->inst);
+ fprintf(stderr, " (%d cost)\n",
+ get_register_pressure_cost(state, n->inst));
+
+ for (int i = 0; i < n->child_count; i++) {
+ struct schedule_node *child = n->children[i];
+ fprintf(stderr, " - ");
+ qir_dump_inst(c, child->inst);
+ fprintf(stderr, " (%d parents)\n", child->parent_count);
+ }
+ }
+}
+
+/* Estimate of how many instructions we should schedule between operations.
+ *
+ * These aren't in real cycle counts, because we're just estimating cycle
+ * times anyway. QIR instructions will get paired up when turned into QPU
+ * instructions, or extra NOP delays will have to be added due to register
+ * allocation choices.
+ */
+static uint32_t
+latency_between(struct schedule_node *before, struct schedule_node *after)
+{
+ if ((before->inst->op == QOP_TEX_S ||
+ before->inst->op == QOP_TEX_DIRECT) &&
+ after->inst->op == QOP_TEX_RESULT)
+ return 100;
+
+ return 1;
+}
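+
+/* The effect of the 100 above is that the scheduler tries to fill the gap
+ * between a texture request and its QOP_TEX_RESULT with unrelated work,
+ * rather than stalling on the TMU.
+ */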
+
+/** Recursive computation of the delay member of a node. */
+static void
+compute_delay(struct schedule_node *n)
+{
+ if (!n->child_count) {
+ /* The color read needs to be scheduled late, to avoid locking
+ * the scoreboard early. This is our best tool for
+ * encouraging that. The other scoreboard locking ops will
+ * have this happen by default, since they are generally the
+ * DAG heads or close to them.
+ */
+ if (n->inst->op == QOP_TLB_COLOR_READ)
+ n->delay = 1000;
+ else
+ n->delay = 1;
+ } else {
+ for (int i = 0; i < n->child_count; i++) {
+ if (!n->children[i]->delay)
+ compute_delay(n->children[i]);
+ n->delay = MAX2(n->delay,
+ n->children[i]->delay +
+ latency_between(n, n->children[i]));
+ }
+ }
+}
+
+static void
+schedule_instructions(struct vc4_compile *c, struct schedule_state *state)
+{
+ if (debug) {
+ fprintf(stderr, "initial deps:\n");
+ dump_state(c, state);
+ }
+
+ /* Remove non-DAG heads from the list. */
+ list_for_each_entry_safe(struct schedule_node, n,
+ &state->worklist, link) {
+ if (n->parent_count != 0)
+ list_del(&n->link);
+ }
+
+ state->time = 0;
+ while (!list_empty(&state->worklist)) {
+ struct schedule_node *chosen = choose_instruction(state);
+ struct qinst *inst = chosen->inst;
+
+ if (debug) {
+ fprintf(stderr, "current list:\n");
+ dump_state(c, state);
+ fprintf(stderr, "chose: ");
+ qir_dump_inst(c, inst);
+ fprintf(stderr, " (%d cost)\n",
+ get_register_pressure_cost(state, inst));
+ }
+
+ state->time = MAX2(state->time, chosen->unblocked_time);
+
+ /* Schedule this instruction back onto the QIR list. */
+ list_del(&chosen->link);
+ list_add(&inst->link, &c->instructions);
+
+ /* Now that we've scheduled a new instruction, some of its
+ * children can be promoted to the list of instructions ready to
+ * be scheduled. Update the children's unblocked time for this
+ * DAG edge as we do so.
+ */
+ for (int i = chosen->child_count - 1; i >= 0; i--) {
+ struct schedule_node *child = chosen->children[i];
+
+ child->unblocked_time = MAX2(child->unblocked_time,
+ state->time +
+ latency_between(chosen,
+ child));
+ child->parent_count--;
+ if (child->parent_count == 0)
+ list_add(&child->link, &state->worklist);
+ }
+
+ /* Update our tracking of register pressure. */
+ for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ if (inst->src[i].file == QFILE_TEMP)
+ BITSET_SET(state->temp_live, inst->src[i].index);
+ }
+ if (inst->dst.file == QFILE_TEMP) {
+ state->temp_writes[inst->dst.index]--;
+ if (state->temp_writes[inst->dst.index] == 0)
+ BITSET_CLEAR(state->temp_live, inst->dst.index);
+ }
+
+ state->time++;
+ }
+}
+
+void
+qir_schedule_instructions(struct vc4_compile *c)
+{
+ void *mem_ctx = ralloc_context(NULL);
+ struct schedule_state state = { 0 };
+
+ if (debug) {
+ fprintf(stderr, "Pre-schedule instructions\n");
+ qir_dump(c);
+ }
+
+ state.temp_writes = rzalloc_array(mem_ctx, uint32_t, c->num_temps);
+ state.temp_live = rzalloc_array(mem_ctx, BITSET_WORD,
+ BITSET_WORDS(c->num_temps));
+ list_inithead(&state.worklist);
+
+ /* Wrap each instruction in a scheduler structure. */
+ list_for_each_entry_safe(struct qinst, inst, &c->instructions, link) {
+ struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);
+
+ n->inst = inst;
+ list_del(&inst->link);
+ list_addtail(&n->link, &state.worklist);
+
+ if (inst->dst.file == QFILE_TEMP)
+ state.temp_writes[inst->dst.index]++;
+ }
+
+ /* Dependencies tracked top-to-bottom. */
+ calculate_forward_deps(c, mem_ctx, &state.worklist);
+ /* Dependencies tracked bottom-to-top. */
+ calculate_reverse_deps(c, mem_ctx, &state.worklist);
+
+ list_for_each_entry(struct schedule_node, n, &state.worklist, link)
+ compute_delay(n);
+
+ schedule_instructions(c, &state);
+
+ if (debug) {
+ fprintf(stderr, "Post-schedule instructions\n");
+ qir_dump(c);
+ }
+
+ ralloc_free(mem_ctx);
+}
diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
index 98b7b6070d7..09164b74932 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
@@ -50,7 +50,7 @@ struct schedule_node {
uint32_t child_array_size;
uint32_t parent_count;
- /* Longest cycles + n->latency of any parent of this node. */
+ /* Longest cycles + instruction_latency() of any parent of this node. */
uint32_t unblocked_time;
/**
@@ -259,7 +259,8 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
}
} else if (is_tmu_write(waddr)) {
add_write_dep(state, &state->last_tmu_write, n);
- } else if (qpu_waddr_is_tlb(waddr)) {
+ } else if (qpu_waddr_is_tlb(waddr) ||
+ waddr == QPU_W_MS_FLAGS) {
add_write_dep(state, &state->last_tlb, n);
} else {
switch (waddr) {
@@ -623,6 +624,46 @@ dump_state(struct list_head *schedule_list)
}
}
+static uint32_t waddr_latency(uint32_t waddr, uint64_t after)
+{
+ if (waddr < 32)
+ return 2;
+
+ /* Apply some huge latency between texture fetch requests and getting
+ * their results back.
+ */
+ if (waddr == QPU_W_TMU0_S) {
+ if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0)
+ return 100;
+ }
+ if (waddr == QPU_W_TMU1_S) {
+ if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU1)
+ return 100;
+ }
+
+ switch (waddr) {
+ case QPU_W_SFU_RECIP:
+ case QPU_W_SFU_RECIPSQRT:
+ case QPU_W_SFU_EXP:
+ case QPU_W_SFU_LOG:
+ return 3;
+ default:
+ return 1;
+ }
+}
+
+static uint32_t
+instruction_latency(struct schedule_node *before, struct schedule_node *after)
+{
+ uint64_t before_inst = before->inst->inst;
+ uint64_t after_inst = after->inst->inst;
+
+ return MAX2(waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_ADD),
+ after_inst),
+ waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_MUL),
+ after_inst));
+}
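+
+/* Note the signal check above: only the edge from a TMU coordinate write to
+ * the instruction carrying the consuming QPU_SIG_LOAD_TMUn signal pays the
+ * ~100 cycle latency; edges to unrelated instructions keep the default cost.
+ */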
+
/** Recursive computation of the delay member of a node. */
static void
compute_delay(struct schedule_node *n)
@@ -634,7 +675,8 @@ compute_delay(struct schedule_node *n)
if (!n->children[i].node->delay)
compute_delay(n->children[i].node);
n->delay = MAX2(n->delay,
- n->children[i].node->delay + n->latency);
+ n->children[i].node->delay +
+ instruction_latency(n, n->children[i].node));
}
}
}
@@ -663,9 +705,14 @@ mark_instruction_scheduled(struct list_head *schedule_list,
* immediately after (or paired with!) the thing reading the
* destination.
*/
- int latency_from_previous = war_only ? 0 : node->latency;
+ uint32_t latency = 0;
+ if (!war_only) {
+ latency = instruction_latency(node,
+ node->children[i].node);
+ }
+
child->unblocked_time = MAX2(child->unblocked_time,
- time + latency_from_previous);
+ time + latency);
child->parent_count--;
if (child->parent_count == 0)
list_add(&child->link, schedule_list);
@@ -798,33 +845,6 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list)
return time;
}
-static uint32_t waddr_latency(uint32_t waddr)
-{
- if (waddr < 32)
- return 2;
-
- /* Some huge number, really. */
- if (waddr >= QPU_W_TMU0_S && waddr <= QPU_W_TMU1_B)
- return 100;
-
- switch(waddr) {
- case QPU_W_SFU_RECIP:
- case QPU_W_SFU_RECIPSQRT:
- case QPU_W_SFU_EXP:
- case QPU_W_SFU_LOG:
- return 3;
- default:
- return 1;
- }
-}
-
-static uint32_t
-instruction_latency(uint64_t inst)
-{
- return MAX2(waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_ADD)),
- waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_MUL)));
-}
-
uint32_t
qpu_schedule_instructions(struct vc4_compile *c)
{
@@ -851,7 +871,6 @@ qpu_schedule_instructions(struct vc4_compile *c)
struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);
n->inst = inst;
- n->latency = instruction_latency(inst->inst);
if (reads_uniform(inst->inst)) {
n->uniform = next_uniform++;
diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c
index e7069a4e9ff..9e6678a0625 100644
--- a/src/gallium/drivers/vc4/vc4_resource.c
+++ b/src/gallium/drivers/vc4/vc4_resource.c
@@ -207,7 +207,7 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
/* If the resource is multisampled, we need to resolve to single
* sample. This seems like it should be handled at a higher layer.
*/
- if (prsc->nr_samples) {
+ if (prsc->nr_samples > 1) {
trans->ss_resource = vc4_get_temp_resource(pctx, prsc, box);
if (!trans->ss_resource)
goto fail;
@@ -377,7 +377,7 @@ vc4_setup_slices(struct vc4_resource *rsc)
if (!rsc->tiled) {
slice->tiling = VC4_TILING_FORMAT_LINEAR;
- if (prsc->nr_samples) {
+ if (prsc->nr_samples > 1) {
/* MSAA (4x) surfaces are stored as raw tile buffer contents. */
level_width = align(level_width, 32);
level_height = align(level_height, 32);
@@ -458,7 +458,7 @@ vc4_resource_setup(struct pipe_screen *pscreen,
prsc->screen = pscreen;
rsc->base.vtbl = &vc4_resource_vtbl;
- if (prsc->nr_samples == 0)
+ if (prsc->nr_samples <= 1)
rsc->cpp = util_format_get_blocksize(tmpl->format);
else
rsc->cpp = sizeof(uint32_t);
@@ -475,7 +475,7 @@ get_resource_texture_format(struct pipe_resource *prsc)
uint8_t format = vc4_get_tex_format(prsc->format);
if (!rsc->tiled) {
- if (prsc->nr_samples) {
+ if (prsc->nr_samples > 1) {
return ~0;
} else {
assert(format == VC4_TEXTURE_TYPE_RGBA8888);
@@ -497,7 +497,7 @@ vc4_resource_create(struct pipe_screen *pscreen,
* communicate metadata about tiling currently.
*/
if (tmpl->target == PIPE_BUFFER ||
- tmpl->nr_samples ||
+ tmpl->nr_samples > 1 ||
(tmpl->bind & (PIPE_BIND_SCANOUT |
PIPE_BIND_LINEAR |
PIPE_BIND_SHARED |
@@ -832,7 +832,7 @@ vc4_dump_surface(struct pipe_surface *psurf)
if (!psurf)
return;
- if (psurf->texture->nr_samples)
+ if (psurf->texture->nr_samples > 1)
vc4_dump_surface_msaa(psurf);
else
vc4_dump_surface_non_msaa(psurf);
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 090579c2c76..8ddf0865d21 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -57,6 +57,10 @@ static const struct debug_named_value debug_options[] = {
"Flush after each draw call" },
{ "always_sync", VC4_DEBUG_ALWAYS_SYNC,
"Wait for finish after each flush" },
+#if USE_VC4_SIMULATOR
+ { "dump", VC4_DEBUG_DUMP,
+ "Write a GPU command stream trace file" },
+#endif
{ NULL }
};
diff --git a/src/gallium/drivers/vc4/vc4_screen.h b/src/gallium/drivers/vc4/vc4_screen.h
index 5992e371093..03f76b257e3 100644
--- a/src/gallium/drivers/vc4/vc4_screen.h
+++ b/src/gallium/drivers/vc4/vc4_screen.h
@@ -41,6 +41,7 @@ struct vc4_bo;
#define VC4_DEBUG_ALWAYS_FLUSH 0x0080
#define VC4_DEBUG_ALWAYS_SYNC 0x0100
#define VC4_DEBUG_NIR 0x0200
+#define VC4_DEBUG_DUMP 0x0400
#define VC4_MAX_MIP_LEVELS 12
#define VC4_MAX_TEXTURE_SAMPLERS 16
diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c
index 4b1df9234b6..521ef50f814 100644
--- a/src/gallium/drivers/vc4/vc4_simulator.c
+++ b/src/gallium/drivers/vc4/vc4_simulator.c
@@ -131,6 +131,93 @@ vc4_simulator_unpin_bos(struct vc4_exec_info *exec)
return 0;
}
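+
+/* Writes the command stream and all referenced BOs to a trace file.  The
+ * layout mirrors what DRM_IOCTL_VC4_GET_HANG_STATE reports: a version word,
+ * the drm_vc4_get_hang_state struct, the bo_state array, then the raw
+ * contents of each BO, ending with a zero-filled binner overflow area.
+ */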
+static void
+vc4_dump_to_file(struct vc4_exec_info *exec)
+{
+ static int dumpno = 0;
+ struct drm_vc4_get_hang_state *state;
+ struct drm_vc4_get_hang_state_bo *bo_state;
+ unsigned int dump_version = 0;
+
+ if (!(vc4_debug & VC4_DEBUG_DUMP))
+ return;
+
+ state = calloc(1, sizeof(*state));
+
+ int unref_count = 0;
+ list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec->unref_list,
+ unref_head) {
+ unref_count++;
+ }
+
+ /* Add one more for the overflow area that isn't wrapped in a BO. */
+ state->bo_count = exec->bo_count + unref_count + 1;
+ bo_state = calloc(state->bo_count, sizeof(*bo_state));
+
+ char *filename = NULL;
+ asprintf(&filename, "vc4-dri-%d.dump", dumpno++);
+ FILE *f = fopen(filename, "w+");
+ if (!f) {
+ fprintf(stderr, "Couldn't open %s: %s", filename,
+ strerror(errno));
+ return;
+ }
+
+ fwrite(&dump_version, sizeof(dump_version), 1, f);
+
+ state->ct0ca = exec->ct0ca;
+ state->ct0ea = exec->ct0ea;
+ state->ct1ca = exec->ct1ca;
+ state->ct1ea = exec->ct1ea;
+ state->start_bin = exec->ct0ca;
+ state->start_render = exec->ct1ca;
+ fwrite(state, sizeof(*state), 1, f);
+
+ int i;
+ for (i = 0; i < exec->bo_count; i++) {
+ struct drm_gem_cma_object *cma_bo = exec->bo[i];
+ bo_state[i].handle = i; /* Not used by the parser. */
+ bo_state[i].paddr = cma_bo->paddr;
+ bo_state[i].size = cma_bo->base.size;
+ }
+
+ list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec->unref_list,
+ unref_head) {
+ struct drm_gem_cma_object *cma_bo = &bo->base;
+ bo_state[i].handle = 0;
+ bo_state[i].paddr = cma_bo->paddr;
+ bo_state[i].size = cma_bo->base.size;
+ i++;
+ }
+
+ /* Add the static overflow memory area. */
+ bo_state[i].handle = exec->bo_count;
+ bo_state[i].paddr = 0;
+ bo_state[i].size = OVERFLOW_SIZE;
+ i++;
+
+ fwrite(bo_state, sizeof(*bo_state), state->bo_count, f);
+
+ for (int i = 0; i < exec->bo_count; i++) {
+ struct drm_gem_cma_object *cma_bo = exec->bo[i];
+ fwrite(cma_bo->vaddr, cma_bo->base.size, 1, f);
+ }
+
+ list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec->unref_list,
+ unref_head) {
+ struct drm_gem_cma_object *cma_bo = &bo->base;
+ fwrite(cma_bo->vaddr, cma_bo->base.size, 1, f);
+ }
+
+ void *overflow = calloc(1, OVERFLOW_SIZE);
+ fwrite(overflow, 1, OVERFLOW_SIZE, f);
+ free(overflow);
+
+ free(filename);
+ free(state);
+ free(bo_state);
+ fclose(f);
+}
+
int
vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args)
{
@@ -183,6 +270,8 @@ vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args)
exec.ct1ea - exec.ct1ca, true);
}
+ vc4_dump_to_file(&exec);
+
if (exec.ct0ca != exec.ct0ea) {
int bfc = simpenrose_do_binning(exec.ct0ca, exec.ct0ea);
if (bfc != 1) {
diff --git a/src/gallium/drivers/vc4/vc4_simulator_validate.h b/src/gallium/drivers/vc4/vc4_simulator_validate.h
index 40d3ada6ca2..e1f8b5a305e 100644
--- a/src/gallium/drivers/vc4/vc4_simulator_validate.h
+++ b/src/gallium/drivers/vc4/vc4_simulator_validate.h
@@ -45,7 +45,7 @@ struct vc4_exec_info;
#define roundup(x, y) align(x, y)
#define round_up(x, y) align(x, y)
#define max(x, y) MAX2(x, y)
-#define min(x, y) MiN2(x, y)
+#define min(x, y) MIN2(x, y)
#define BUG_ON(condition) assert(!(condition))
static inline int
diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c
index d9c0f55b23d..7e0ada7d0fa 100644
--- a/src/gallium/drivers/vc4/vc4_state.c
+++ b/src/gallium/drivers/vc4/vc4_state.c
@@ -462,9 +462,9 @@ vc4_set_framebuffer_state(struct pipe_context *pctx,
vc4->msaa = false;
if (cso->cbufs[0])
- vc4->msaa = cso->cbufs[0]->texture->nr_samples != 0;
+ vc4->msaa = cso->cbufs[0]->texture->nr_samples > 1;
else if (cso->zsbuf)
- vc4->msaa = cso->zsbuf->texture->nr_samples != 0;
+ vc4->msaa = cso->zsbuf->texture->nr_samples > 1;
if (vc4->msaa) {
vc4->tile_width = 32;