Diffstat (limited to 'src/gallium/drivers')
256 files changed, 22026 insertions, 16637 deletions
diff --git a/src/gallium/drivers/freedreno/Makefile.am b/src/gallium/drivers/freedreno/Makefile.am
index 4b2629f77bd..cbf62c6daae 100644
--- a/src/gallium/drivers/freedreno/Makefile.am
+++ b/src/gallium/drivers/freedreno/Makefile.am
@@ -21,15 +21,16 @@ libfreedreno_la_SOURCES = \
 
 noinst_PROGRAMS = ir3_compiler
 
+# XXX: Required due to the C++ sources in libnir/libglsl_util
+nodist_EXTRA_ir3_compiler_SOURCES = dummy.cpp
 ir3_compiler_SOURCES = \
     ir3/ir3_cmdline.c
 
 ir3_compiler_LDADD = \
     libfreedreno.la \
-    ../../auxiliary/libgallium.la \
+    $(top_builddir)/src/gallium/auxiliary/libgallium.la \
     $(top_builddir)/src/glsl/libnir.la \
     $(top_builddir)/src/libglsl_util.la \
-    -lstdc++ \
     $(top_builddir)/src/util/libmesautil.la \
     $(GALLIUM_COMMON_LIB_DEPS) \
     $(FREEDRENO_LIBS)
diff --git a/src/gallium/drivers/freedreno/Makefile.sources b/src/gallium/drivers/freedreno/Makefile.sources
index a565a9c4e4d..baae9144005 100644
--- a/src/gallium/drivers/freedreno/Makefile.sources
+++ b/src/gallium/drivers/freedreno/Makefile.sources
@@ -120,18 +120,17 @@ ir3_SOURCES := \
     ir3/disasm-a3xx.c \
     ir3/instr-a3xx.h \
     ir3/ir3.c \
-    ir3/ir3_compiler.c \
     ir3/ir3_compiler_nir.c \
+    ir3/ir3_compiler.c \
     ir3/ir3_compiler.h \
     ir3/ir3_cp.c \
     ir3/ir3_depth.c \
-    ir3/ir3_dump.c \
-    ir3/ir3_flatten.c \
     ir3/ir3_group.c \
     ir3/ir3.h \
     ir3/ir3_legalize.c \
     ir3/ir3_nir.h \
     ir3/ir3_nir_lower_if_else.c \
+    ir3/ir3_print.c \
     ir3/ir3_ra.c \
     ir3/ir3_sched.c \
     ir3/ir3_shader.c \
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
index e4acc7e95b4..b48fb4659cd 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
@@ -414,32 +414,16 @@ add_src_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
 static void
 add_vector_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
 {
-    switch (inst->Instruction.Saturate) {
-    case TGSI_SAT_NONE:
-        break;
-    case TGSI_SAT_ZERO_ONE:
+    if (inst->Instruction.Saturate) {
         alu->alu.vector_clamp = true;
-        break;
-    case TGSI_SAT_MINUS_PLUS_ONE:
-        DBG("unsupported saturate");
-        assert(0);
-        break;
     }
 }
 
 static void
 add_scalar_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
 {
-    switch (inst->Instruction.Saturate) {
-    case TGSI_SAT_NONE:
-        break;
-    case TGSI_SAT_ZERO_ONE:
+    if (inst->Instruction.Saturate) {
         alu->alu.scalar_clamp = true;
-        break;
-    case TGSI_SAT_MINUS_PLUS_ONE:
-        DBG("unsupported saturate");
-        assert(0);
-        break;
     }
 }
 
@@ -758,7 +742,7 @@ translate_tex(struct fd2_compile_context *ctx,
     struct tgsi_src_register tmp_src;
     const struct tgsi_src_register *coord;
     bool using_temp = (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) ||
-            (inst->Instruction.Saturate != TGSI_SAT_NONE);
+            inst->Instruction.Saturate;
     int idx;
 
     if (using_temp || (opc == TGSI_OPCODE_TXP))
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.h b/src/gallium/drivers/freedreno/a3xx/fd3_context.h
index 4e3f521716e..77e4605e550 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.h
@@ -105,9 +105,6 @@ struct fd3_context {
      */
     unsigned fsaturate_s, fsaturate_t, fsaturate_r;
 
-    /* bitmask of integer texture samplers */
-    uint16_t vinteger_s, finteger_s;
-
     /* some state changes require a different shader variant.  Keep
      * track of this so we know when we need to re-emit shader state
      * due to variant change.  See fixup_shader_state()
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
index b522cf86695..b5838b58eb2 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@@ -104,14 +104,12 @@ fixup_shader_state(struct fd_context *ctx, struct ir3_shader_key *key)
     if (last_key->has_per_samp || key->has_per_samp) {
         if ((last_key->vsaturate_s != key->vsaturate_s) ||
                 (last_key->vsaturate_t != key->vsaturate_t) ||
-                (last_key->vsaturate_r != key->vsaturate_r) ||
-                (last_key->vinteger_s != key->vinteger_s))
+                (last_key->vsaturate_r != key->vsaturate_r))
             ctx->prog.dirty |= FD_SHADER_DIRTY_VP;
 
         if ((last_key->fsaturate_s != key->fsaturate_s) ||
                 (last_key->fsaturate_t != key->fsaturate_t) ||
-                (last_key->fsaturate_r != key->fsaturate_r) ||
-                (last_key->finteger_s != key->finteger_s))
+                (last_key->fsaturate_r != key->fsaturate_r))
             ctx->prog.dirty |= FD_SHADER_DIRTY_FP;
     }
 
@@ -140,16 +138,13 @@ fd3_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
             // TODO set .half_precision based on render target format,
             // ie. float16 and smaller use half, float32 use full..
             .half_precision = !!(fd_mesa_debug & FD_DBG_FRAGHALF),
-            .has_per_samp = (fd3_ctx->fsaturate || fd3_ctx->vsaturate ||
-                    fd3_ctx->vinteger_s || fd3_ctx->finteger_s),
+            .has_per_samp = (fd3_ctx->fsaturate || fd3_ctx->vsaturate),
             .vsaturate_s = fd3_ctx->vsaturate_s,
             .vsaturate_t = fd3_ctx->vsaturate_t,
             .vsaturate_r = fd3_ctx->vsaturate_r,
             .fsaturate_s = fd3_ctx->fsaturate_s,
             .fsaturate_t = fd3_ctx->fsaturate_t,
             .fsaturate_r = fd3_ctx->fsaturate_r,
-            .vinteger_s = fd3_ctx->vinteger_s,
-            .finteger_s = fd3_ctx->finteger_s,
         },
         .rasterflat = ctx->rasterizer && ctx->rasterizer->flatshade,
         .sprite_coord_enable = ctx->rasterizer ? ctx->rasterizer->sprite_coord_enable : 0,
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
index a6824ef92e7..57fcaa9020e 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@@ -413,12 +413,15 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
             }
         }
 
-        /* TODO: Figure out if there's a way to make it spit out 0's and
-         * 1's for the .z and .w components.
+        /* Replace the .xy coordinates with S/T from the point sprite.  Set
+         * interpolation bits for .zw such that they become .01
          */
-        if (emit->sprite_coord_enable & (1 << sem2idx(fp->inputs[j].semantic)))
+        if (emit->sprite_coord_enable & (1 << sem2idx(fp->inputs[j].semantic))) {
             vpsrepl[inloc / 16] |= (emit->sprite_coord_mode ?
                     0x0d : 0x09) << ((inloc % 16) * 2);
+            vinterp[(inloc + 2) / 16] |= 2 << (((inloc + 2) % 16) * 2);
+            vinterp[(inloc + 3) / 16] |= 3 << (((inloc + 3) % 16) * 2);
+        }
     }
 
     OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2);
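The vpsrepl/vinterp words written above pack sixteen 2-bit fields per 32-bit register; per the new comment in the patch, the modes written to .z and .w make those components read back as 0 and 1. A minimal standalone sketch of that packing arithmetic (names are illustrative, not from the patch):

    #include <stdint.h>
    #include <stdio.h>

    /* sixteen 2-bit mode fields per 32-bit word; field 'loc' lives at
     * bits [2*loc+1 : 2*loc] of word loc/16 */
    static void set_mode(uint32_t *words, unsigned loc, uint32_t mode)
    {
        words[loc / 16] |= (mode & 0x3) << ((loc % 16) * 2);
    }

    int main(void)
    {
        uint32_t vinterp[4] = {0};
        unsigned inloc = 4;              /* example varying location */
        set_mode(vinterp, inloc + 2, 2); /* .z -> constant 0 (per patch comment) */
        set_mode(vinterp, inloc + 3, 3); /* .w -> constant 1 (per patch comment) */
        printf("vinterp[0] = 0x%08x\n", vinterp[0]);
        return 0;
    }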
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
index 3497921257c..094dcf376e5 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
@@ -32,6 +32,7 @@
 #include "fd3_screen.h"
 #include "fd3_context.h"
 #include "fd3_format.h"
+#include "ir3_compiler.h"
 
 static boolean
 fd3_screen_is_format_supported(struct pipe_screen *pscreen,
@@ -103,7 +104,9 @@ fd3_screen_is_format_supported(struct pipe_screen *pscreen,
 void
 fd3_screen_init(struct pipe_screen *pscreen)
 {
-    fd_screen(pscreen)->max_rts = 4;
+    struct fd_screen *screen = fd_screen(pscreen);
+    screen->max_rts = 4;
+    screen->compiler = ir3_compiler_create(screen->gpu_id);
     pscreen->context_create = fd3_context_create;
     pscreen->is_format_supported = fd3_screen_is_format_supported;
 }
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
index 6f44ee3c08e..a278bf5c603 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
@@ -263,44 +263,11 @@ fd3_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
     return &so->base;
 }
 
-static void
-fd3_set_sampler_views(struct pipe_context *pctx, unsigned shader,
-        unsigned start, unsigned nr,
-        struct pipe_sampler_view **views)
-{
-    struct fd_context *ctx = fd_context(pctx);
-    struct fd3_context *fd3_ctx = fd3_context(ctx);
-    struct fd_texture_stateobj *tex;
-    uint16_t integer_s = 0, *ptr;
-    int i;
-
-    fd_set_sampler_views(pctx, shader, start, nr, views);
-
-    switch (shader) {
-    case PIPE_SHADER_FRAGMENT:
-        tex = &ctx->fragtex;
-        ptr = &fd3_ctx->finteger_s;
-        break;
-    case PIPE_SHADER_VERTEX:
-        tex = &ctx->verttex;
-        ptr = &fd3_ctx->vinteger_s;
-        break;
-    default:
-        return;
-    }
-
-    for (i = 0; i < tex->num_textures; i++)
-        if (util_format_is_pure_integer(tex->textures[i]->format))
-            integer_s |= 1 << i;
-    *ptr = integer_s;
-}
-
-
 void
 fd3_texture_init(struct pipe_context *pctx)
 {
     pctx->create_sampler_state = fd3_sampler_state_create;
     pctx->bind_sampler_states = fd3_sampler_states_bind;
     pctx->create_sampler_view = fd3_sampler_view_create;
-    pctx->set_sampler_views = fd3_set_sampler_views;
+    pctx->set_sampler_views = fd_set_sampler_views;
 }
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.h b/src/gallium/drivers/freedreno/a4xx/fd4_context.h
index 384602a2e4f..53e1bf6a2e6 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_context.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.h
@@ -83,9 +83,6 @@ struct fd4_context {
      */
     uint16_t fsaturate_s, fsaturate_t, fsaturate_r;
 
-    /* bitmask of integer texture samplers */
-    uint16_t vinteger_s, finteger_s;
-
     /* some state changes require a different shader variant.  Keep
      * track of this so we know when we need to re-emit shader state
      * due to variant change.  See fixup_shader_state()
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
index ae407f753fe..de5a306af60 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
@@ -82,8 +82,7 @@ fixup_shader_state(struct fd_context *ctx, struct ir3_shader_key *key)
     if (last_key->has_per_samp || key->has_per_samp) {
         if ((last_key->vsaturate_s != key->vsaturate_s) ||
                 (last_key->vsaturate_t != key->vsaturate_t) ||
-                (last_key->vsaturate_r != key->vsaturate_r) ||
-                (last_key->vinteger_s != key->vinteger_s))
+                (last_key->vsaturate_r != key->vsaturate_r))
             ctx->prog.dirty |= FD_SHADER_DIRTY_VP;
 
         if ((last_key->fsaturate_s != key->fsaturate_s) ||
@@ -122,16 +121,13 @@ fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
             // TODO set .half_precision based on render target format,
             // ie. float16 and smaller use half, float32 use full..
             .half_precision = !!(fd_mesa_debug & FD_DBG_FRAGHALF),
-            .has_per_samp = (fd4_ctx->fsaturate || fd4_ctx->vsaturate ||
-                    fd4_ctx->vinteger_s || fd4_ctx->finteger_s),
+            .has_per_samp = (fd4_ctx->fsaturate || fd4_ctx->vsaturate),
             .vsaturate_s = fd4_ctx->vsaturate_s,
             .vsaturate_t = fd4_ctx->vsaturate_t,
             .vsaturate_r = fd4_ctx->vsaturate_r,
             .fsaturate_s = fd4_ctx->fsaturate_s,
             .fsaturate_t = fd4_ctx->fsaturate_t,
             .fsaturate_r = fd4_ctx->fsaturate_r,
-            .vinteger_s = fd4_ctx->vinteger_s,
-            .finteger_s = fd4_ctx->finteger_s,
         },
         .format = fd4_emit_format(pfb->cbufs[0]),
         .pformat = pipe_surface_format(pfb->cbufs[0]),
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
index f5b46685bdf..e8cbb2d201a 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
@@ -32,6 +32,7 @@
 #include "fd4_screen.h"
 #include "fd4_context.h"
 #include "fd4_format.h"
+#include "ir3_compiler.h"
 
 static boolean
 fd4_screen_is_format_supported(struct pipe_screen *pscreen,
@@ -100,7 +101,9 @@ fd4_screen_is_format_supported(struct pipe_screen *pscreen,
 void
 fd4_screen_init(struct pipe_screen *pscreen)
 {
-    fd_screen(pscreen)->max_rts = 1;
+    struct fd_screen *screen = fd_screen(pscreen);
+    screen->max_rts = 1;
+    screen->compiler = ir3_compiler_create(screen->gpu_id);
     pscreen->context_create = fd4_context_create;
     pscreen->is_format_supported = fd4_screen_is_format_supported;
 }
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
index ff1ff8f0d34..6ba25d0816d 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
@@ -205,43 +205,11 @@ fd4_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
     return &so->base;
 }
 
-static void
-fd4_set_sampler_views(struct pipe_context *pctx, unsigned shader,
-        unsigned start, unsigned nr, struct pipe_sampler_view **views)
-{
-    struct fd_context *ctx = fd_context(pctx);
-    struct fd4_context *fd4_ctx = fd4_context(ctx);
-    struct fd_texture_stateobj *tex;
-    uint16_t integer_s = 0, *ptr;
-    int i;
-
-    fd_set_sampler_views(pctx, shader, start, nr, views);
-
-    switch (shader) {
-    case PIPE_SHADER_FRAGMENT:
-        tex = &ctx->fragtex;
-        ptr = &fd4_ctx->finteger_s;
-        break;
-    case PIPE_SHADER_VERTEX:
-        tex = &ctx->verttex;
-        ptr = &fd4_ctx->vinteger_s;
-        break;
-    default:
-        return;
-    }
-
-    for (i = 0; i < tex->num_textures; i++)
-        if (util_format_is_pure_integer(tex->textures[i]->format))
-            integer_s |= 1 << i;
-
-    *ptr = integer_s;
-}
-
 void
 fd4_texture_init(struct pipe_context *pctx)
 {
     pctx->create_sampler_state = fd4_sampler_state_create;
     pctx->bind_sampler_states = fd_sampler_states_bind;
     pctx->create_sampler_view = fd4_sampler_view_create;
-    pctx->set_sampler_views = fd4_set_sampler_views;
+    pctx->set_sampler_views = fd_set_sampler_views;
 }
diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index 2c816b4b1f6..e420f1e5bd9 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -297,7 +297,7 @@ struct fd_context {
      */
     struct fd_gmem_stateobj gmem;
     struct fd_vsc_pipe pipe[8];
-    struct fd_tile tile[64];
+    struct fd_tile tile[256];
 
     /* which state objects need to be re-emit'd: */
     enum {
diff --git a/src/gallium/drivers/freedreno/freedreno_fence.c b/src/gallium/drivers/freedreno/freedreno_fence.c
index 46b057d9062..375e58f7022 100644
--- a/src/gallium/drivers/freedreno/freedreno_fence.c
+++ b/src/gallium/drivers/freedreno/freedreno_fence.c
@@ -35,6 +35,7 @@
 struct pipe_fence_handle {
     struct pipe_reference reference;
     struct fd_context *ctx;
+    struct fd_screen *screen;
     uint32_t timestamp;
 };
 
@@ -68,7 +69,7 @@ boolean fd_screen_fence_finish(struct pipe_screen *screen,
         struct pipe_fence_handle *fence,
         uint64_t timeout)
 {
-    if (fd_pipe_wait(fence->ctx->screen->pipe, fence->timestamp))
+    if (fd_pipe_wait(fence->screen->pipe, fence->timestamp))
         return false;
 
     return true;
@@ -86,6 +87,7 @@ struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx)
 
     pipe_reference_init(&fence->reference, 1);
     fence->ctx = ctx;
+    fence->screen = ctx->screen;
     fence->timestamp = fd_ringbuffer_timestamp(ctx->ring);
 
     return fence;
diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c
index 11a1b62b26b..c105378ec4e 100644
--- a/src/gallium/drivers/freedreno/freedreno_gmem.c
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.c
@@ -117,6 +117,7 @@ calculate_tiles(struct fd_context *ctx)
     uint32_t i, j, t, xoff, yoff;
     uint32_t tpp_x, tpp_y;
     bool has_zs = !!(ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL));
+    int tile_n[ARRAY_SIZE(ctx->pipe)];
 
     if (has_zs) {
         struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
@@ -247,6 +248,7 @@ calculate_tiles(struct fd_context *ctx)
     /* configure tiles: */
     t = 0;
     yoff = miny;
+    memset(tile_n, 0, sizeof(tile_n));
     for (i = 0; i < nbins_y; i++) {
         uint32_t bw, bh;
 
@@ -257,20 +259,17 @@ calculate_tiles(struct fd_context *ctx)
 
         for (j = 0; j < nbins_x; j++) {
             struct fd_tile *tile = &ctx->tile[t];
-            uint32_t n, p;
+            uint32_t p;
 
             assert(t < ARRAY_SIZE(ctx->tile));
 
             /* pipe number: */
             p = ((i / tpp_y) * div_round_up(nbins_x, tpp_x)) + (j / tpp_x);
 
-            /* slot number: */
-            n = ((i % tpp_y) * tpp_x) + (j % tpp_x);
-
             /* clip bin width: */
             bw = MIN2(bin_w, minx + width - xoff);
 
-            tile->n = n;
+            tile->n = tile_n[p]++;
             tile->p = p;
             tile->bin_w = bw;
             tile->bin_h = bh;
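The tile_n[] change in calculate_tiles() above replaces the position-derived slot number with a per-pipe counter, so slots within each VSC pipe are assigned densely in visit order. A small self-contained model of the idea (grid size and tiles-per-pipe values invented for illustration, not driver code):

    #include <stdio.h>
    #include <string.h>

    #define NPIPES 8

    int main(void)
    {
        int tile_n[NPIPES];
        memset(tile_n, 0, sizeof(tile_n));

        /* visit a 4x4 grid of bins, 2x2 bins per pipe (tpp_x = tpp_y = 2) */
        for (int i = 0; i < 4; i++) {
            for (int j = 0; j < 4; j++) {
                int p = (i / 2) * 2 + (j / 2);   /* pipe number */
                int n = tile_n[p]++;             /* next free slot within that pipe */
                printf("bin(%d,%d) -> pipe %d slot %d\n", i, j, p, n);
            }
        }
        return 0;
    }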
files"}, {"glsl120", FD_DBG_GLSL120,"Temporary flag to force GLSL 120 (rather than 130) on a3xx+"}, - {"nocp", FD_DBG_NOCP, "Disable copy-propagation"}, - {"nir", FD_DBG_NIR, "Enable experimental NIR compiler"}, DEBUG_NAMED_VALUE_END }; @@ -220,6 +217,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_POLYGON_OFFSET_CLAMP: case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: return 0; case PIPE_CAP_MAX_VIEWPORTS: @@ -374,6 +372,7 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: return 0; case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: return 1; @@ -519,6 +518,7 @@ fd_screen_create(struct fd_device *dev) case 220: fd2_screen_init(pscreen); break; + case 307: case 320: case 330: fd3_screen_init(pscreen); diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h index 3b470d1d8a6..dbc2808262a 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.h +++ b/src/gallium/drivers/freedreno/freedreno_screen.h @@ -46,7 +46,9 @@ struct fd_screen { uint32_t device_id; uint32_t gpu_id; /* 220, 305, etc */ uint32_t chip_id; /* coreid:8 majorrev:8 minorrev:8 patch:8 */ - uint32_t max_rts; + uint32_t max_rts; /* max # of render targets */ + + void *compiler; /* currently unused for a2xx */ struct fd_device *dev; struct fd_pipe *pipe; diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h index 2735ae41315..deb0e602ce2 100644 --- a/src/gallium/drivers/freedreno/freedreno_util.h +++ b/src/gallium/drivers/freedreno/freedreno_util.h @@ -62,11 +62,8 @@ enum adreno_stencil_op fd_stencil_op(unsigned op); #define FD_DBG_NOBYPASS 0x0040 #define FD_DBG_FRAGHALF 0x0080 #define FD_DBG_NOBIN 0x0100 -#define FD_DBG_OPTMSGS 0x0400 -#define FD_DBG_OPTDUMP 0x0800 -#define FD_DBG_GLSL120 0x1000 -#define FD_DBG_NOCP 0x2000 -#define FD_DBG_NIR 0x4000 +#define FD_DBG_OPTMSGS 0x0200 +#define FD_DBG_GLSL120 0x0400 extern int fd_mesa_debug; extern bool fd_binning_enabled; diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c index a5136c6bd3d..48ae7c71b9f 100644 --- a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c +++ b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c @@ -133,16 +133,16 @@ static void print_instr_cat0(instr_t *instr) break; case OPC_BR: printf(" %sp0.%c, #%d", cat0->inv ? "!" 
: "", - component[cat0->comp], cat0->immed); + component[cat0->comp], cat0->a3xx.immed); break; case OPC_JUMP: case OPC_CALL: - printf(" #%d", cat0->immed); + printf(" #%d", cat0->a3xx.immed); break; } - if ((debug & PRINT_VERBOSE) && (cat0->dummy1|cat0->dummy2|cat0->dummy3|cat0->dummy4)) - printf("\t{0: %x,%x,%x,%x}", cat0->dummy1, cat0->dummy2, cat0->dummy3, cat0->dummy4); + if ((debug & PRINT_VERBOSE) && (cat0->a3xx.dummy1|cat0->dummy2|cat0->dummy3|cat0->dummy4)) + printf("\t{0: %x,%x,%x,%x}", cat0->a3xx.dummy1, cat0->dummy2, cat0->dummy3, cat0->dummy4); } static void print_instr_cat1(instr_t *instr) diff --git a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h index cffa62b6f34..efb07ea479e 100644 --- a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h +++ b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h @@ -191,9 +191,9 @@ typedef enum { OPC_LDLV = 31, /* meta instructions (category -1): */ - /* placeholder instr to mark inputs/outputs: */ + /* placeholder instr to mark shader inputs: */ OPC_META_INPUT = 0, - OPC_META_OUTPUT = 1, + OPC_META_PHI = 1, /* The "fan-in" and "fan-out" instructions are used for keeping * track of instructions that write to multiple dst registers * (fan-out) like texture sample instructions, or read multiple @@ -201,9 +201,6 @@ typedef enum { */ OPC_META_FO = 2, OPC_META_FI = 3, - /* branches/flow control */ - OPC_META_FLOW = 4, - OPC_META_PHI = 5, } opc_t; @@ -281,8 +278,16 @@ static inline int reg_special(reg_t reg) typedef struct PACKED { /* dword0: */ - int16_t immed : 16; - uint32_t dummy1 : 16; + union PACKED { + struct PACKED { + int16_t immed : 16; + uint32_t dummy1 : 16; + } a3xx; + struct PACKED { + int32_t immed : 20; + uint32_t dummy1 : 12; + } a4xx; + }; /* dword1: */ uint32_t dummy2 : 8; diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c index e015de91c33..a166b67d7cf 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.c +++ b/src/gallium/drivers/freedreno/ir3/ir3.c @@ -66,11 +66,22 @@ void * ir3_alloc(struct ir3 *shader, int sz) return ptr; } -struct ir3 * ir3_create(void) +struct ir3 * ir3_create(struct ir3_compiler *compiler, + unsigned nin, unsigned nout) { - struct ir3 *shader = - calloc(1, sizeof(struct ir3)); + struct ir3 *shader = calloc(1, sizeof(struct ir3)); + grow_heap(shader); + + shader->compiler = compiler; + shader->ninputs = nin; + shader->inputs = ir3_alloc(shader, sizeof(shader->inputs[0]) * nin); + + shader->noutputs = nout; + shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout); + + list_inithead(&shader->block_list); + return shader; } @@ -81,7 +92,8 @@ void ir3_destroy(struct ir3 *shader) shader->chunk = chunk->next; free(chunk); } - free(shader->instrs); + free(shader->indirects); + free(shader->predicates); free(shader->baryfs); free(shader); } @@ -142,7 +154,11 @@ static int emit_cat0(struct ir3_instruction *instr, void *ptr, { instr_cat0_t *cat0 = ptr; - cat0->immed = instr->cat0.immed; + if (info->gpu_id >= 400) { + cat0->a4xx.immed = instr->cat0.immed; + } else { + cat0->a3xx.immed = instr->cat0.immed; + } cat0->repeat = instr->repeat; cat0->ss = !!(instr->flags & IR3_INSTR_SS); cat0->inv = instr->cat0.inv; @@ -535,32 +551,40 @@ void * ir3_assemble(struct ir3 *shader, struct ir3_info *info, uint32_t gpu_id) { uint32_t *ptr, *dwords; - uint32_t i; + info->gpu_id = gpu_id; info->max_reg = -1; info->max_half_reg = -1; info->max_const = -1; info->instrs_count = 0; + info->sizedwords = 0; + + list_for_each_entry 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index e015de91c33..a166b67d7cf 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -66,11 +66,22 @@ void * ir3_alloc(struct ir3 *shader, int sz)
     return ptr;
 }
 
-struct ir3 * ir3_create(void)
+struct ir3 * ir3_create(struct ir3_compiler *compiler,
+        unsigned nin, unsigned nout)
 {
-    struct ir3 *shader =
-            calloc(1, sizeof(struct ir3));
+    struct ir3 *shader = calloc(1, sizeof(struct ir3));
+
     grow_heap(shader);
+
+    shader->compiler = compiler;
+    shader->ninputs = nin;
+    shader->inputs = ir3_alloc(shader, sizeof(shader->inputs[0]) * nin);
+
+    shader->noutputs = nout;
+    shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout);
+
+    list_inithead(&shader->block_list);
+
     return shader;
 }
 
@@ -81,7 +92,8 @@ void ir3_destroy(struct ir3 *shader)
         shader->chunk = chunk->next;
         free(chunk);
     }
-    free(shader->instrs);
+    free(shader->indirects);
+    free(shader->predicates);
     free(shader->baryfs);
     free(shader);
 }
@@ -142,7 +154,11 @@ static int emit_cat0(struct ir3_instruction *instr, void *ptr,
 {
     instr_cat0_t *cat0 = ptr;
 
-    cat0->immed = instr->cat0.immed;
+    if (info->gpu_id >= 400) {
+        cat0->a4xx.immed = instr->cat0.immed;
+    } else {
+        cat0->a3xx.immed = instr->cat0.immed;
+    }
     cat0->repeat = instr->repeat;
     cat0->ss = !!(instr->flags & IR3_INSTR_SS);
     cat0->inv = instr->cat0.inv;
@@ -535,32 +551,40 @@ void * ir3_assemble(struct ir3 *shader, struct ir3_info *info,
         uint32_t gpu_id)
 {
     uint32_t *ptr, *dwords;
-    uint32_t i;
 
+    info->gpu_id        = gpu_id;
     info->max_reg       = -1;
    info->max_half_reg  = -1;
     info->max_const     = -1;
     info->instrs_count  = 0;
+    info->sizedwords    = 0;
+
+    list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
+        list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+            info->sizedwords += 2;
+        }
+    }
 
     /* need a integer number of instruction "groups" (sets of 16
      * instructions on a4xx or sets of 4 instructions on a3xx),
      * so pad out w/ NOPs if needed: (NOTE each instruction is 64bits)
      */
     if (gpu_id >= 400) {
-        info->sizedwords = 2 * align(shader->instrs_count, 16);
+        info->sizedwords = align(info->sizedwords, 16 * 2);
     } else {
-        info->sizedwords = 2 * align(shader->instrs_count, 4);
+        info->sizedwords = align(info->sizedwords, 4 * 2);
     }
 
     ptr = dwords = calloc(4, info->sizedwords);
 
-    for (i = 0; i < shader->instrs_count; i++) {
-        struct ir3_instruction *instr = shader->instrs[i];
-        int ret = emit[instr->category](instr, dwords, info);
-        if (ret)
-            goto fail;
-        info->instrs_count += 1 + instr->repeat;
-        dwords += 2;
+    list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
+        list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+            int ret = emit[instr->category](instr, dwords, info);
+            if (ret)
+                goto fail;
+            info->instrs_count += 1 + instr->repeat;
+            dwords += 2;
+        }
     }
 
     return ptr;
@@ -581,50 +605,30 @@ static struct ir3_register * reg_create(struct ir3 *shader,
     return reg;
 }
 
-static void insert_instr(struct ir3 *shader,
+static void insert_instr(struct ir3_block *block,
         struct ir3_instruction *instr)
 {
+    struct ir3 *shader = block->shader;
 #ifdef DEBUG
     static uint32_t serialno = 0;
     instr->serialno = ++serialno;
 #endif
-    array_insert(shader->instrs, instr);
+    list_addtail(&instr->node, &block->instr_list);
 
     if (is_input(instr))
         array_insert(shader->baryfs, instr);
 }
 
-struct ir3_block * ir3_block_create(struct ir3 *shader,
-        unsigned ntmp, unsigned nin, unsigned nout)
+struct ir3_block * ir3_block_create(struct ir3 *shader)
 {
-    struct ir3_block *block;
-    unsigned size;
-    char *ptr;
-
-    size = sizeof(*block);
-    size += sizeof(block->temporaries[0]) * ntmp;
-    size += sizeof(block->inputs[0]) * nin;
-    size += sizeof(block->outputs[0]) * nout;
-
-    ptr = ir3_alloc(shader, size);
-
-    block = (void *)ptr;
-    ptr += sizeof(*block);
-
-    block->temporaries = (void *)ptr;
-    block->ntemporaries = ntmp;
-    ptr += sizeof(block->temporaries[0]) * ntmp;
-
-    block->inputs = (void *)ptr;
-    block->ninputs = nin;
-    ptr += sizeof(block->inputs[0]) * nin;
-
-    block->outputs = (void *)ptr;
-    block->noutputs = nout;
-    ptr += sizeof(block->outputs[0]) * nout;
-
+    struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
+#ifdef DEBUG
+    static uint32_t serialno = 0;
+    block->serialno = ++serialno;
+#endif
     block->shader = shader;
-
+    list_inithead(&block->node);
+    list_inithead(&block->instr_list);
     return block;
 }
 
@@ -652,7 +656,7 @@ struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
     instr->block = block;
     instr->category = category;
     instr->opc = opc;
-    insert_instr(block->shader, instr);
+    insert_instr(block, instr);
     return instr;
 }
 
@@ -677,7 +681,7 @@ struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
     *new_instr = *instr;
     new_instr->regs = regs;
 
-    insert_instr(instr->block->shader, new_instr);
+    insert_instr(instr->block, new_instr);
 
     /* clone registers: */
     new_instr->regs_count = 0;
@@ -694,10 +698,40 @@ struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
 struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
         int num, int flags)
 {
-    struct ir3_register *reg = reg_create(instr->block->shader, num, flags);
+    struct ir3 *shader = instr->block->shader;
+    struct ir3_register *reg = reg_create(shader, num, flags);
 #ifdef DEBUG
     debug_assert(instr->regs_count < instr->regs_max);
 #endif
     instr->regs[instr->regs_count++] = reg;
     return reg;
 }
+
+void
+ir3_block_clear_mark(struct ir3_block *block)
+{
+    list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node)
+        instr->flags &= ~IR3_INSTR_MARK;
+}
+
+void
+ir3_clear_mark(struct ir3 *ir)
+{
+    list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+        ir3_block_clear_mark(block);
+    }
+}
+
+/* note: this will destroy instr->depth, don't do it until after sched! */
+void
+ir3_count_instructions(struct ir3 *ir)
+{
+    unsigned ip = 0;
+    list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+        list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+            instr->ip = ip++;
+        }
+        block->start_ip = list_first_entry(&block->instr_list, struct ir3_instruction, node)->ip;
+        block->end_ip = list_last_entry(&block->instr_list, struct ir3_instruction, node)->ip;
+    }
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index c0a14a07d48..9c35a763d58 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -28,17 +28,20 @@
 #include <stdbool.h>
 
 #include "util/u_debug.h"
+#include "util/list.h"
 
 #include "instr-a3xx.h"
 #include "disasm.h"  /* TODO move 'enum shader_t' somewhere else.. */
 
 /* low level intermediate representation of an adreno shader program */
 
+struct ir3_compiler;
 struct ir3;
 struct ir3_instruction;
 struct ir3_block;
 
 struct ir3_info {
+    uint32_t gpu_id;
     uint16_t sizedwords;
     uint16_t instrs_count;   /* expanded to account for rpt's */
     /* NOTE: max_reg, etc, does not include registers not touched
@@ -80,8 +83,8 @@ struct ir3_register {
          * before register assignment is done:
          */
         IR3_REG_SSA    = 0x2000,   /* 'instr' is ptr to assigning instr */
-        IR3_REG_IA     = 0x4000,   /* meta-input dst is "assigned" */
-        IR3_REG_ADDR   = 0x8000,   /* register is a0.x */
+        IR3_REG_PHI_SRC= 0x4000,   /* phi src, regs[0]->instr points to phi */
+
     } flags;
     union {
         /* normal registers:
@@ -185,6 +188,7 @@ struct ir3_instruction {
             char inv;
             char comp;
             int  immed;
+            struct ir3_block *target;
         } cat0;
         struct {
             type_t src_type, dst_type;
@@ -218,14 +222,14 @@ struct ir3_instruction {
             int aid;
         } fi;
         struct {
-            struct ir3_block *if_block, *else_block;
-        } flow;
+            /* used to temporarily hold reference to nir_phi_instr
+             * until we resolve the phi srcs
+             */
+            void *nphi;
+        } phi;
         struct {
             struct ir3_block *block;
         } inout;
-
-        /* XXX keep this as big as all other union members! */
-        uint32_t info[3];
     };
 
     /* transient values used during various algorithms: */
@@ -243,6 +247,13 @@ struct ir3_instruction {
          */
 #define DEPTH_UNUSED  ~0
         unsigned depth;
+        /* When we get to the RA stage, we no longer need depth, but
+         * we do need instruction's position/name:
+         */
+        struct {
+            uint16_t ip;
+            uint16_t name;
+        };
     };
 
     /* Used during CP and RA stages.  For fanin and shader inputs/
@@ -290,7 +301,9 @@ struct ir3_instruction {
      */
     struct ir3_instruction *fanin;
 
-    struct ir3_instruction *next;
+    /* Entry in ir3_block's instruction list: */
+    struct list_head node;
+
 #ifdef DEBUG
     uint32_t serialno;
 #endif
@@ -321,8 +334,11 @@ static inline int ir3_neighbor_count(struct ir3_instruction *instr)
 struct ir3_heap_chunk;
 
 struct ir3 {
-    unsigned instrs_count, instrs_sz;
-    struct ir3_instruction **instrs;
+    struct ir3_compiler *compiler;
+
+    unsigned ninputs, noutputs;
+    struct ir3_instruction **inputs;
+    struct ir3_instruction **outputs;
 
     /* Track bary.f (and ldlv) instructions.. this is needed in
      * scheduling to ensure that all varying fetches happen before
@@ -345,33 +361,54 @@ struct ir3 {
      */
     unsigned indirects_count, indirects_sz;
     struct ir3_instruction **indirects;
+    /* and same for instructions that consume predicate register: */
+    unsigned predicates_count, predicates_sz;
+    struct ir3_instruction **predicates;
+
+    /* List of blocks: */
+    struct list_head block_list;
 
-    struct ir3_block *block;
     unsigned heap_idx;
     struct ir3_heap_chunk *chunk;
 };
 
+typedef struct nir_block nir_block;
+
 struct ir3_block {
+    struct list_head node;
     struct ir3 *shader;
-    unsigned ntemporaries, ninputs, noutputs;
-    /* maps TGSI_FILE_TEMPORARY index back to the assigning instruction: */
-    struct ir3_instruction **temporaries;
-    struct ir3_instruction **inputs;
-    struct ir3_instruction **outputs;
-    /* only a single address register: */
-    struct ir3_instruction *address;
-    struct ir3_block *parent;
-    struct ir3_instruction *head;
+
+    nir_block *nblock;
+
+    struct list_head instr_list;  /* list of ir3_instruction */
+
+    /* each block has either one or two successors.. in case of
+     * two successors, 'condition' decides which one to follow.
+     * A block preceding an if/else has two successors.
+     */
+    struct ir3_instruction *condition;
+    struct ir3_block *successors[2];
+
+    uint16_t start_ip, end_ip;
+
+    /* used for per-pass extra block data.  Mainly used right
+     * now in RA step to track livein/liveout.
+     */
+    void *bd;
+
+#ifdef DEBUG
+    uint32_t serialno;
+#endif
 };
 
-struct ir3 * ir3_create(void);
+struct ir3 * ir3_create(struct ir3_compiler *compiler,
+        unsigned nin, unsigned nout);
 void ir3_destroy(struct ir3 *shader);
 void * ir3_assemble(struct ir3 *shader,
         struct ir3_info *info, uint32_t gpu_id);
 void * ir3_alloc(struct ir3 *shader, int sz);
-struct ir3_block * ir3_block_create(struct ir3 *shader,
-        unsigned ntmp, unsigned nin, unsigned nout);
+struct ir3_block * ir3_block_create(struct ir3 *shader);
 
 struct ir3_instruction * ir3_instr_create(struct ir3_block *block,
         int category, opc_t opc);
@@ -383,7 +420,6 @@ const char *ir3_instr_name(struct ir3_instruction *instr);
 struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
         int num, int flags);
 
-
 static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
 {
     if (instr->flags & IR3_INSTR_MARK)
@@ -392,22 +428,10 @@ static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
     return false;
 }
 
-static inline void ir3_clear_mark(struct ir3 *shader)
-{
-    /* TODO would be nice to drop the instruction array.. for
-     * new compiler, _clear_mark() is all we use it for, and
-     * we could probably manage a linked list instead..
-     *
-     * Also, we'll probably want to mark instructions within
-     * a block, so tracking the list of instrs globally is
-     * unlikely to be what we want.
-     */
-    unsigned i;
-    for (i = 0; i < shader->instrs_count; i++) {
-        struct ir3_instruction *instr = shader->instrs[i];
-        instr->flags &= ~IR3_INSTR_MARK;
-    }
-}
+void ir3_block_clear_mark(struct ir3_block *block);
+void ir3_clear_mark(struct ir3 *shader);
+
+void ir3_count_instructions(struct ir3 *ir);
 
 static inline int ir3_instr_regno(struct ir3_instruction *instr,
         struct ir3_register *reg)
@@ -501,6 +525,28 @@ static inline bool is_mem(struct ir3_instruction *instr)
     return (instr->category == 6);
 }
 
+static inline bool
+is_store(struct ir3_instruction *instr)
+{
+    if (is_mem(instr)) {
+        /* these instructions, the "destination" register is
+         * actually a source, the address to store to.
+         */
+        switch (instr->opc) {
+        case OPC_STG:
+        case OPC_STP:
+        case OPC_STL:
+        case OPC_STLW:
+        case OPC_L2G:
+        case OPC_G2L:
+            return true;
+        default:
+            break;
+        }
+    }
+    return false;
+}
+
 static inline bool is_input(struct ir3_instruction *instr)
 {
     /* in some cases, ldlv is used to fetch varying without
@@ -525,7 +571,7 @@ static inline bool writes_addr(struct ir3_instruction *instr)
 {
     if (instr->regs_count > 0) {
         struct ir3_register *dst = instr->regs[0];
-        return !!(dst->flags & IR3_REG_ADDR);
+        return reg_num(dst) == REG_A0;
     }
     return false;
 }
@@ -556,13 +602,29 @@ static inline bool conflicts(struct ir3_instruction *a,
 
 static inline bool reg_gpr(struct ir3_register *r)
 {
-    if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_ADDR))
+    if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
         return false;
     if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
         return false;
     return true;
 }
 
+static inline type_t half_type(type_t type)
+{
+    switch (type) {
+    case TYPE_F32: return TYPE_F16;
+    case TYPE_U32: return TYPE_U16;
+    case TYPE_S32: return TYPE_S16;
+    case TYPE_F16:
+    case TYPE_U16:
+    case TYPE_S16:
+        return type;
+    default:
+        assert(0);
+        return ~0;
+    }
+}
+
 /* some cat2 instructions (ie. those which are not float) can embed an
  * immediate:
  */
@@ -747,37 +809,31 @@ static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr
 
 /* dump: */
-#include <stdio.h>
-void ir3_dump(struct ir3 *shader, const char *name,
-        struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3? */,
-        FILE *f);
-void ir3_dump_instr_single(struct ir3_instruction *instr);
-void ir3_dump_instr_list(struct ir3_instruction *instr);
-
-/* flatten if/else: */
-int ir3_block_flatten(struct ir3_block *block);
+void ir3_print(struct ir3 *ir);
+void ir3_print_instr(struct ir3_instruction *instr);
 
 /* depth calculation: */
 int ir3_delayslots(struct ir3_instruction *assigner,
         struct ir3_instruction *consumer, unsigned n);
-void ir3_block_depth(struct ir3_block *block);
+void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list);
+void ir3_depth(struct ir3 *ir);
 
 /* copy-propagate: */
-void ir3_block_cp(struct ir3_block *block);
+void ir3_cp(struct ir3 *ir);
 
-/* group neightbors and insert mov's to resolve conflicts: */
-void ir3_block_group(struct ir3_block *block);
+/* group neighbors and insert mov's to resolve conflicts: */
+void ir3_group(struct ir3 *ir);
 
 /* scheduling: */
-int ir3_block_sched(struct ir3_block *block);
+int ir3_sched(struct ir3 *ir);
 
 /* register assignment: */
-int ir3_block_ra(struct ir3_block *block, enum shader_t type,
+struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(void *memctx);
+int ir3_ra(struct ir3 *ir3, enum shader_t type,
         bool frag_coord, bool frag_face);
 
 /* legalize: */
-void ir3_block_legalize(struct ir3_block *block,
-        bool *has_samp, int *max_bary);
+void ir3_legalize(struct ir3 *ir, bool *has_samp, int *max_bary);
 
 /* ************************************************************************* */
 /* instruction helpers */
@@ -807,6 +863,21 @@ ir3_COV(struct ir3_block *block, struct ir3_instruction *src,
     return instr;
 }
 
+static inline struct ir3_instruction *
+ir3_NOP(struct ir3_block *block)
+{
+    return ir3_instr_create(block, 0, OPC_NOP);
+}
+
+#define INSTR0(CAT, name) \
+static inline struct ir3_instruction * \
+ir3_##name(struct ir3_block *block) \
+{ \
+    struct ir3_instruction *instr = \
+        ir3_instr_create(block, CAT, OPC_##name); \
+    return instr; \
+}
+
 #define INSTR1(CAT, name) \
 static inline struct ir3_instruction * \
 ir3_##name(struct ir3_block *block, \
@@ -850,7 +921,10 @@ ir3_##name(struct ir3_block *block, \
 }
 
 /* cat0 instructions: */
+INSTR0(0, BR);
+INSTR0(0, JUMP);
 INSTR1(0, KILL);
+INSTR0(0, END);
 
 /* cat2 instructions, most 2 src but some 1 src: */
 INSTR2(2, ADD_F)
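The new INSTR0() macro above generates a typed zero-source constructor per opcode, so flow-control instructions can be created as ir3_JUMP(block) and the like. A self-contained miniature of the same macro pattern (all names invented for illustration):

    #include <stdio.h>

    struct instr { int category; int opc; };

    static struct instr instr_create(int category, int opc)
    {
        struct instr i = { category, opc };
        return i;
    }

    #define OPC_JUMP 1
    #define OPC_END  2

    /* one typed constructor per opcode, stamped out by the macro: */
    #define INSTR0(CAT, name) \
    static struct instr mk_##name(void) \
    { \
        return instr_create(CAT, OPC_##name); \
    }

    INSTR0(0, JUMP)
    INSTR0(0, END)

    int main(void)
    {
        struct instr j = mk_JUMP();
        printf("cat%d opc%d\n", j.category, j.opc);
        return 0;
    }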
"h" : "", (regid >> 2), "xyzw"[regid & 0x3], i); } - for (i = 0; i < block->noutputs; i++) { - if (!block->outputs[i]) { + for (i = 0; i < ir->noutputs; i++) { + if (!ir->outputs[i]) { debug_printf("; out%d unused\n", i); continue; } /* kill shows up as a virtual output.. skip it! */ - if (is_kill(block->outputs[i])) + if (is_kill(ir->outputs[i])) continue; - reg = block->outputs[i]->regs[0]; + reg = ir->outputs[i]->regs[0]; regid = reg->num; debug_printf("@out(%sr%d.%c)\tout%d\n", (reg->flags & IR3_REG_HALF) ? "h" : "", @@ -194,16 +195,6 @@ read_file(const char *filename, void **ptr, size_t *size) return 0; } -static void reset_variant(struct ir3_shader_variant *v, const char *msg) -{ - printf("; %s\n", msg); - v->inputs_count = 0; - v->outputs_count = 0; - v->total_in = 0; - v->has_samp = false; - v->immediates_count = 0; -} - static void print_usage(void) { printf("Usage: ir3_compiler [OPTIONS]... FILE\n"); @@ -225,12 +216,12 @@ int main(int argc, char **argv) const char *filename; struct tgsi_token toks[65536]; struct tgsi_parse_context parse; + struct ir3_compiler *compiler; struct ir3_shader_variant v; struct ir3_shader_key key = {}; const char *info; void *ptr; size_t size; - int use_nir = 0; fd_mesa_debug |= FD_DBG_DISASM; @@ -243,7 +234,7 @@ int main(int argc, char **argv) while (n < argc) { if (!strcmp(argv[n], "--verbose")) { - fd_mesa_debug |= FD_DBG_OPTDUMP | FD_DBG_MSGS | FD_DBG_OPTMSGS; + fd_mesa_debug |= FD_DBG_MSGS | FD_DBG_OPTMSGS; n++; continue; } @@ -290,17 +281,6 @@ int main(int argc, char **argv) continue; } - if (!strcmp(argv[n], "--nocp")) { - fd_mesa_debug |= FD_DBG_NOCP; - n++; - continue; - } - if (!strcmp(argv[n], "--nir")) { - use_nir = true; - n++; - continue; - } - if (!strcmp(argv[n], "--help")) { print_usage(); return 0; @@ -340,31 +320,14 @@ int main(int argc, char **argv) break; } - if (use_nir) { - info = "NIR compiler"; - ret = ir3_compile_shader_nir(&v, toks, key); - } else { - info = "TGSI compiler"; - ret = ir3_compile_shader(&v, toks, key, true); - } - - if (ret) { - reset_variant(&v, "compiler failed, trying without copy propagation!"); - info = "compiler (no copy propagation)"; - ret = ir3_compile_shader(&v, toks, key, false); - } + /* TODO cmdline option to target different gpus: */ + compiler = ir3_compiler_create(320); + info = "NIR compiler"; + ret = ir3_compile_shader_nir(compiler, &v, toks, key); if (ret) { fprintf(stderr, "compiler failed!\n"); return ret; } dump_info(&v, info); } - -void _mesa_error_no_memory(const char *caller); - -void -_mesa_error_no_memory(const char *caller) -{ - fprintf(stderr, "Mesa error: out of memory in %s", caller); -} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c index 43f4c955ac0..7c8eccb54e1 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c @@ -1,7 +1,7 @@ /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ /* - * Copyright (C) 2013 Rob Clark <[email protected]> + * Copyright (C) 2015 Rob Clark <[email protected]> * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -26,3710 +26,19 @@ * Rob Clark <[email protected]> */ -#include <stdarg.h> - -#include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_inlines.h" -#include "tgsi/tgsi_lowering.h" -#include "tgsi/tgsi_parse.h" -#include "tgsi/tgsi_ureg.h" -#include 
"tgsi/tgsi_info.h" -#include "tgsi/tgsi_strings.h" -#include "tgsi/tgsi_dump.h" -#include "tgsi/tgsi_scan.h" - -#include "freedreno_util.h" +#include "util/ralloc.h" #include "ir3_compiler.h" -#include "ir3_shader.h" - -#include "instr-a3xx.h" -#include "ir3.h" - -struct ir3_compile_context { - const struct tgsi_token *tokens; - bool free_tokens; - struct ir3 *ir; - struct ir3_shader_variant *so; - uint16_t integer_s; - - struct ir3_block *block; - struct ir3_instruction *current_instr; - - /* we need to defer updates to block->outputs[] until the end - * of an instruction (so we don't see new value until *after* - * the src registers are processed) - */ - struct { - struct ir3_instruction *instr, **instrp; - } output_updates[64]; - unsigned num_output_updates; - - /* are we in a sequence of "atomic" instructions? - */ - bool atomic; - - /* For fragment shaders, from the hw perspective the only - * actual input is r0.xy position register passed to bary.f. - * But TGSI doesn't know that, it still declares things as - * IN[] registers. So we do all the input tracking normally - * and fix things up after compile_instructions() - * - * NOTE that frag_pos is the hardware position (possibly it - * is actually an index or tag or some such.. it is *not* - * values that can be directly used for gl_FragCoord..) - */ - struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4]; - - /* For vertex shaders, keep track of the system values sources */ - struct ir3_instruction *vertex_id, *basevertex, *instance_id; - - struct tgsi_parse_context parser; - unsigned type; - - struct tgsi_shader_info info; - - /* hmm, would be nice if tgsi_scan_shader figured this out - * for us: - */ - struct { - unsigned first, last; - struct ir3_instruction *fanin; - } array[MAX_ARRAYS]; - uint32_t array_dirty; - /* offset into array[], per file, of first array info */ - uint8_t array_offsets[TGSI_FILE_COUNT]; - - /* for calculating input/output positions/linkages: */ - unsigned next_inloc; - - /* a4xx (at least patchlevel 0) cannot seem to flat-interpolate - * so we need to use ldlv.u32 to load the varying directly: - */ - bool flat_bypass; - - unsigned num_internal_temps; - struct tgsi_src_register internal_temps[8]; - - /* for looking up which system value is which */ - unsigned sysval_semantics[8]; - - /* idx/slot for last compiler generated immediate */ - unsigned immediate_idx; - - /* stack of branch instructions that mark (potentially nested) - * branch if/else/loop/etc - */ - struct { - struct ir3_instruction *instr, *cond; - bool inv; /* true iff in else leg of branch */ - } branch[16]; - unsigned int branch_count; - - /* list of kill instructions: */ - struct ir3_instruction *kill[16]; - unsigned int kill_count; - - /* used when dst is same as one of the src, to avoid overwriting a - * src element before the remaining scalar instructions that make - * up the vector operation - */ - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; - - /* just for catching incorrect use of get_dst()/put_dst(): - */ - bool using_tmp_dst; -}; - - -static void vectorize(struct ir3_compile_context *ctx, - struct ir3_instruction *instr, struct tgsi_dst_register *dst, - int nsrcs, ...); -static void create_mov(struct ir3_compile_context *ctx, - struct tgsi_dst_register *dst, struct tgsi_src_register *src); -static type_t get_ftype(struct ir3_compile_context *ctx); -static type_t get_utype(struct ir3_compile_context *ctx); - -static unsigned setup_arrays(struct ir3_compile_context *ctx, unsigned file, unsigned i) -{ 
- /* ArrayID 0 for a given file is the legacy array spanning the entire file: */ - ctx->array[i].first = 0; - ctx->array[i].last = ctx->info.file_max[file]; - ctx->array_offsets[file] = i; - i += ctx->info.array_max[file] + 1; - return i; -} - -static unsigned -compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so, - const struct tgsi_token *tokens) -{ - unsigned ret, i; - struct tgsi_shader_info *info = &ctx->info; - struct tgsi_lowering_config lconfig = { - .color_two_side = so->key.color_two_side, - .lower_DST = true, - .lower_XPD = true, - .lower_SCS = true, - .lower_LRP = true, - .lower_FRC = true, - .lower_POW = true, - .lower_LIT = true, - .lower_EXP = true, - .lower_LOG = true, - .lower_DP4 = true, - .lower_DP3 = true, - .lower_DPH = true, - .lower_DP2 = true, - .lower_DP2A = true, - }; - - switch (so->type) { - case SHADER_FRAGMENT: - case SHADER_COMPUTE: - lconfig.saturate_s = so->key.fsaturate_s; - lconfig.saturate_t = so->key.fsaturate_t; - lconfig.saturate_r = so->key.fsaturate_r; - ctx->integer_s = so->key.finteger_s; - break; - case SHADER_VERTEX: - lconfig.saturate_s = so->key.vsaturate_s; - lconfig.saturate_t = so->key.vsaturate_t; - lconfig.saturate_r = so->key.vsaturate_r; - ctx->integer_s = so->key.vinteger_s; - break; - } - - if (!so->shader) { - /* hack for standalone compiler which does not have - * screen/context: - */ - } else if (ir3_shader_gpuid(so->shader) >= 400) { - /* a4xx seems to have *no* sam.p */ - lconfig.lower_TXP = ~0; /* lower all txp */ - /* need special handling for "flat" */ - ctx->flat_bypass = true; - } else { - /* a3xx just needs to avoid sam.p for 3d tex */ - lconfig.lower_TXP = (1 << TGSI_TEXTURE_3D); - /* no special handling for "flat" */ - ctx->flat_bypass = false; - } - - ctx->tokens = tgsi_transform_lowering(&lconfig, tokens, &ctx->info); - ctx->free_tokens = !!ctx->tokens; - if (!ctx->tokens) { - /* no lowering */ - ctx->tokens = tokens; - } - ctx->ir = so->ir; - ctx->so = so; - ctx->array_dirty = 0; - ctx->next_inloc = 8; - ctx->num_internal_temps = 0; - ctx->branch_count = 0; - ctx->kill_count = 0; - ctx->block = NULL; - ctx->current_instr = NULL; - ctx->num_output_updates = 0; - ctx->atomic = false; - ctx->frag_pos = NULL; - ctx->frag_face = NULL; - ctx->vertex_id = NULL; - ctx->instance_id = NULL; - ctx->tmp_src = NULL; - ctx->using_tmp_dst = false; - - memset(ctx->frag_coord, 0, sizeof(ctx->frag_coord)); - memset(ctx->array, 0, sizeof(ctx->array)); - memset(ctx->array_offsets, 0, sizeof(ctx->array_offsets)); - -#define FM(x) (1 << TGSI_FILE_##x) - /* NOTE: if relative addressing is used, we set constlen in - * the compiler (to worst-case value) since we don't know in - * the assembler what the max addr reg value can be: - */ - if (info->indirect_files & FM(CONSTANT)) - so->constlen = MIN2(255, ctx->info.const_file_max[0] + 1); - - i = 0; - i += setup_arrays(ctx, TGSI_FILE_INPUT, i); - i += setup_arrays(ctx, TGSI_FILE_TEMPORARY, i); - i += setup_arrays(ctx, TGSI_FILE_OUTPUT, i); - /* any others? 
we don't track arrays for const..*/ - - /* Immediates go after constants: */ - so->first_immediate = so->first_driver_param = - info->const_file_max[0] + 1; - /* 1 unit for the vertex id base */ - if (so->type == SHADER_VERTEX) - so->first_immediate++; - /* 4 (vec4) units for ubo base addresses */ - so->first_immediate += 4; - ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1); - - ret = tgsi_parse_init(&ctx->parser, ctx->tokens); - if (ret != TGSI_PARSE_OK) - return ret; - - ctx->type = ctx->parser.FullHeader.Processor.Processor; - - return ret; -} - -static void -compile_error(struct ir3_compile_context *ctx, const char *format, ...) -{ - va_list ap; - va_start(ap, format); - _debug_vprintf(format, ap); - va_end(ap); - tgsi_dump(ctx->tokens, 0); - debug_assert(0); -} - -#define compile_assert(ctx, cond) do { \ - if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \ - } while (0) - -static void -compile_free(struct ir3_compile_context *ctx) -{ - if (ctx->free_tokens) - free((void *)ctx->tokens); - tgsi_parse_free(&ctx->parser); -} - -struct instr_translater { - void (*fxn)(const struct instr_translater *t, - struct ir3_compile_context *ctx, - struct tgsi_full_instruction *inst); - unsigned tgsi_opc; - opc_t opc; - opc_t hopc; /* opc to use for half_precision mode, if different */ - unsigned arg; -}; - -static void -instr_finish(struct ir3_compile_context *ctx) -{ - unsigned i; - - if (ctx->atomic) - return; - - for (i = 0; i < ctx->num_output_updates; i++) - *(ctx->output_updates[i].instrp) = ctx->output_updates[i].instr; - - ctx->num_output_updates = 0; - - while (ctx->array_dirty) { - unsigned aid = ffs(ctx->array_dirty) - 1; - ctx->array[aid].fanin = NULL; - ctx->array_dirty &= ~(1 << aid); - } -} - -/* For "atomic" groups of instructions, for example the four scalar - * instructions to perform a vec4 operation. Basically this just - * blocks out handling of output_updates so the next scalar instruction - * still sees the result from before the start of the atomic group. - * - * NOTE: when used properly, this could probably replace get/put_dst() - * stuff. - */ -static void -instr_atomic_start(struct ir3_compile_context *ctx) -{ - ctx->atomic = true; -} - -static void -instr_atomic_end(struct ir3_compile_context *ctx) -{ - ctx->atomic = false; - instr_finish(ctx); -} - -static struct ir3_instruction * -instr_create(struct ir3_compile_context *ctx, int category, opc_t opc) -{ - instr_finish(ctx); - return (ctx->current_instr = ir3_instr_create(ctx->block, category, opc)); -} - -static struct ir3_block * -push_block(struct ir3_compile_context *ctx) -{ - struct ir3_block *block; - unsigned ntmp, nin, nout; - -#define SCALAR_REGS(file) (4 * (ctx->info.file_max[TGSI_FILE_ ## file] + 1)) - - /* hmm, give ourselves room to create 8 extra temporaries (vec4): - */ - ntmp = SCALAR_REGS(TEMPORARY); - ntmp += 8 * 4; - - nout = SCALAR_REGS(OUTPUT); - nin = SCALAR_REGS(INPUT) + SCALAR_REGS(SYSTEM_VALUE); - - /* for outermost block, 'inputs' are the actual shader INPUT - * register file. Reads from INPUT registers always go back to - * top block. For nested blocks, 'inputs' is used to track any - * TEMPORARY file register from one of the enclosing blocks that - * is ready in this block. 
- */ - if (!ctx->block) { - /* NOTE: fragment shaders actually have two inputs (r0.xy, the - * position) - */ - if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { - int n = 2; - if (ctx->info.reads_position) - n += 4; - if (ctx->info.uses_frontface) - n += 4; - nin = MAX2(n, nin); - nout += ARRAY_SIZE(ctx->kill); - } - } else { - nin = ntmp; - } - - block = ir3_block_create(ctx->ir, ntmp, nin, nout); - - if ((ctx->type == TGSI_PROCESSOR_FRAGMENT) && !ctx->block) - block->noutputs -= ARRAY_SIZE(ctx->kill); - - block->parent = ctx->block; - ctx->block = block; - - return block; -} - -static void -pop_block(struct ir3_compile_context *ctx) -{ - ctx->block = ctx->block->parent; - compile_assert(ctx, ctx->block); -} - -static struct ir3_instruction * -create_output(struct ir3_block *block, struct ir3_instruction *instr, - unsigned n) -{ - struct ir3_instruction *out; - - out = ir3_instr_create(block, -1, OPC_META_OUTPUT); - out->inout.block = block; - ir3_reg_create(out, n, 0); - if (instr) - ir3_reg_create(out, 0, IR3_REG_SSA)->instr = instr; - - return out; -} - -static struct ir3_instruction * -create_input(struct ir3_block *block, struct ir3_instruction *instr, - unsigned n) -{ - struct ir3_instruction *in; - - in = ir3_instr_create(block, -1, OPC_META_INPUT); - in->inout.block = block; - ir3_reg_create(in, n, 0); - if (instr) - ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr; - - return in; -} - -static struct ir3_instruction * -block_input(struct ir3_block *block, unsigned n) -{ - /* references to INPUT register file always go back up to - * top level: - */ - if (block->parent) - return block_input(block->parent, n); - return block->inputs[n]; -} - -/* return temporary in scope, creating if needed meta-input node - * to track block inputs - */ -static struct ir3_instruction * -block_temporary(struct ir3_block *block, unsigned n) -{ - /* references to TEMPORARY register file, find the nearest - * enclosing block which has already assigned this temporary, - * creating meta-input instructions along the way to keep - * track of block inputs - */ - if (block->parent && !block->temporaries[n]) { - /* if already have input for this block, reuse: */ - if (!block->inputs[n]) - block->inputs[n] = block_temporary(block->parent, n); - - /* and create new input to return: */ - return create_input(block, block->inputs[n], n); - } - return block->temporaries[n]; -} - -static struct ir3_instruction * -create_immed(struct ir3_compile_context *ctx, float val) -{ - /* NOTE: *don't* use instr_create() here! - */ - struct ir3_instruction *instr; - instr = ir3_instr_create(ctx->block, 1, 0); - instr->cat1.src_type = get_ftype(ctx); - instr->cat1.dst_type = get_ftype(ctx); - ir3_reg_create(instr, 0, 0); - ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = val; - return instr; -} - -static void -ssa_instr_set(struct ir3_compile_context *ctx, unsigned file, unsigned n, - struct ir3_instruction *instr) -{ - struct ir3_block *block = ctx->block; - unsigned idx = ctx->num_output_updates; - - compile_assert(ctx, idx < ARRAY_SIZE(ctx->output_updates)); - - /* NOTE: defer update of temporaries[idx] or output[idx] - * until instr_finish(), so that if the current instruction - * reads the same TEMP/OUT[] it gets the old value: - * - * bleh.. this might be a bit easier to just figure out - * in instr_finish(). But at that point we've already - * lost information about OUTPUT vs TEMPORARY register - * file.. 
- */ - - switch (file) { - case TGSI_FILE_OUTPUT: - compile_assert(ctx, n < block->noutputs); - ctx->output_updates[idx].instrp = &block->outputs[n]; - ctx->output_updates[idx].instr = instr; - ctx->num_output_updates++; - break; - case TGSI_FILE_TEMPORARY: - compile_assert(ctx, n < block->ntemporaries); - ctx->output_updates[idx].instrp = &block->temporaries[n]; - ctx->output_updates[idx].instr = instr; - ctx->num_output_updates++; - break; - case TGSI_FILE_ADDRESS: - compile_assert(ctx, n < 1); - ctx->output_updates[idx].instrp = &block->address; - ctx->output_updates[idx].instr = instr; - ctx->num_output_updates++; - break; - } -} - -static struct ir3_instruction * -ssa_instr_get(struct ir3_compile_context *ctx, unsigned file, unsigned n) -{ - struct ir3_block *block = ctx->block; - struct ir3_instruction *instr = NULL; - - switch (file) { - case TGSI_FILE_INPUT: - instr = block_input(ctx->block, n); - break; - case TGSI_FILE_OUTPUT: - /* really this should just happen in case of 'MOV_SAT OUT[n], ..', - * for the following clamp instructions: - */ - instr = block->outputs[n]; - /* we don't have to worry about read from an OUTPUT that was - * assigned outside of the current block, because the _SAT - * clamp instructions will always be in the same block as - * the original instruction which wrote the OUTPUT - */ - compile_assert(ctx, instr); - break; - case TGSI_FILE_TEMPORARY: - instr = block_temporary(ctx->block, n); - if (!instr) { - /* this can happen when registers (or components of a TGSI - * register) are used as src before they have been assigned - * (undefined contents). To avoid confusing the rest of the - * compiler, and to generally keep things peachy, substitute - * an instruction that sets the src to 0.0. Or to keep - * things undefined, I could plug in a random number? :-P - * - * NOTE: *don't* use instr_create() here! - */ - instr = create_immed(ctx, 0.0); - /* no need to recreate the immed for every access: */ - block->temporaries[n] = instr; - } - break; - case TGSI_FILE_SYSTEM_VALUE: - switch (ctx->sysval_semantics[n >> 2]) { - case TGSI_SEMANTIC_VERTEXID_NOBASE: - instr = ctx->vertex_id; - break; - case TGSI_SEMANTIC_BASEVERTEX: - instr = ctx->basevertex; - break; - case TGSI_SEMANTIC_INSTANCEID: - instr = ctx->instance_id; - break; - } - break; - } - - return instr; -} - -static int dst_array_id(struct ir3_compile_context *ctx, - const struct tgsi_dst_register *dst) -{ - // XXX complete hack to recover tgsi_full_dst_register... - // nothing that isn't wrapped in a tgsi_full_dst_register - // should be indirect - const struct tgsi_full_dst_register *fdst = (const void *)dst; - return fdst->Indirect.ArrayID + ctx->array_offsets[dst->File]; -} - -static int src_array_id(struct ir3_compile_context *ctx, - const struct tgsi_src_register *src) -{ - // XXX complete hack to recover tgsi_full_src_register... 
- // nothing that isn't wrapped in a tgsi_full_src_register - // should be indirect - const struct tgsi_full_src_register *fsrc = (const void *)src; - debug_assert(src->File != TGSI_FILE_CONSTANT); - return fsrc->Indirect.ArrayID + ctx->array_offsets[src->File]; -} - -static struct ir3_instruction * -array_fanin(struct ir3_compile_context *ctx, unsigned aid, unsigned file) -{ - struct ir3_instruction *instr; - - if (ctx->array[aid].fanin) { - instr = ctx->array[aid].fanin; - } else { - unsigned first = ctx->array[aid].first; - unsigned last = ctx->array[aid].last; - unsigned i, j; - - instr = ir3_instr_create2(ctx->block, -1, OPC_META_FI, - 1 + (4 * (last + 1 - first))); - ir3_reg_create(instr, 0, 0); - for (i = first; i <= last; i++) { - for (j = 0; j < 4; j++) { - unsigned n = regid(i, j); - ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = - ssa_instr_get(ctx, file, n); - } - } - ctx->array[aid].fanin = instr; - ctx->array_dirty |= (1 << aid); - } - - return instr; -} - -static void -ssa_dst(struct ir3_compile_context *ctx, struct ir3_instruction *instr, - const struct tgsi_dst_register *dst, unsigned chan) -{ - if (dst->Indirect) { - struct ir3_register *reg = instr->regs[0]; - unsigned i, aid = dst_array_id(ctx, dst); - unsigned first = ctx->array[aid].first; - unsigned last = ctx->array[aid].last; - unsigned off = dst->Index - first; /* vec4 offset */ - - reg->size = 4 * (1 + last - first); - reg->offset = regid(off, chan); - - instr->fanin = array_fanin(ctx, aid, dst->File); - - /* annotate with the array-id, to help out the register- - * assignment stage. At least for the case of indirect - * writes, we should capture enough dependencies to - * preserve the order of reads/writes of the array, so - * the multiple "names" for the array should end up all - * assigned to the same registers. - */ - instr->fanin->fi.aid = aid; - - /* Since we are scalarizing vec4 tgsi instructions/regs, we - * run into a slight complication here. To do the naive thing - * and setup a fanout for each scalar array element would end - * up with the result that the instructions generated for each - * component of the vec4 would end up clobbering each other. - * So we take advantage here of knowing that the array index - * (after the shl.b) will be a multiple of four, and only set - * every fourth scalar component in the array. 
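- * For example, with an array spanning two vec4 elements, the
- * splits generated for chan x land at scalar offsets 0 and 4
- * only, and so cannot clobber the offsets produced for chans
- * y/z/w by the other scalarized instructions.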
See also - * fixup_ssa_dst_array() - */ - for (i = first; i <= last; i++) { - struct ir3_instruction *split; - unsigned n = regid(i, chan); - int off = (4 * (i - first)) + chan; - - if (is_meta(instr) && (instr->opc == OPC_META_FO)) - off -= instr->fo.off; - - split = ir3_instr_create(ctx->block, -1, OPC_META_FO); - split->fo.off = off; - ir3_reg_create(split, 0, 0); - ir3_reg_create(split, 0, IR3_REG_SSA)->instr = instr; - - ssa_instr_set(ctx, dst->File, n, split); - } - } else { - /* normal case (not relative addressed GPR) */ - ssa_instr_set(ctx, dst->File, regid(dst->Index, chan), instr); - } -} - -static void -ssa_src(struct ir3_compile_context *ctx, struct ir3_register *reg, - const struct tgsi_src_register *src, unsigned chan) -{ - struct ir3_instruction *instr; - - if (src->Indirect && (src->File != TGSI_FILE_CONSTANT)) { - /* for relative addressing of gpr's (due to register assignment) - * we must generate a fanin instruction to collect all possible - * array elements that the instruction could address together: - */ - unsigned aid = src_array_id(ctx, src); - unsigned first = ctx->array[aid].first; - unsigned last = ctx->array[aid].last; - unsigned off = src->Index - first; /* vec4 offset */ - - reg->size = 4 * (1 + last - first); - reg->offset = regid(off, chan); - - instr = array_fanin(ctx, aid, src->File); - } else if (src->File == TGSI_FILE_CONSTANT && src->Dimension) { - const struct tgsi_full_src_register *fsrc = (const void *)src; - struct ir3_instruction *temp = NULL; - int ubo_regid = regid(ctx->so->first_driver_param, 0) + - fsrc->Dimension.Index - 1; - int offset = 0; - - /* We don't handle indirect UBO array accesses... yet. */ - compile_assert(ctx, !fsrc->Dimension.Indirect); - /* UBOs start at index 1. */ - compile_assert(ctx, fsrc->Dimension.Index > 0); - - if (src->Indirect) { - /* In case of an indirect index, it will have been loaded into an - * address register. There will be a sequence of - * - * shl.b x, val, 2 - * mova a0, x - * - * We rely on this sequence to get the original val out and shift - * it by 4, since we're dealing in vec4 units. - */ - compile_assert(ctx, ctx->block->address); - compile_assert(ctx, ctx->block->address->regs[1]->instr->opc == - OPC_SHL_B); - - temp = instr = instr_create(ctx, 2, OPC_SHL_B); - ir3_reg_create(instr, 0, 0); - ir3_reg_create(instr, 0, IR3_REG_HALF | IR3_REG_SSA)->instr = - ctx->block->address->regs[1]->instr->regs[1]->instr; - ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4; - } else if (src->Index >= 64) { - /* Otherwise it's a plain index (in vec4 units). Move it into a - * register. - */ - temp = instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = get_utype(ctx); - instr->cat1.dst_type = get_utype(ctx); - ir3_reg_create(instr, 0, 0); - ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = src->Index * 16; - } else { - /* The offset is small enough to fit into the ldg instruction - * directly. - */ - offset = src->Index * 16; - } - - if (temp) { - /* If there was an offset (most common), add it to the buffer - * address. 
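- * For example (assuming no indirection), CONST[1][70].x: the
- * index is too big for the ldg immediate (70 >= 64), so 70*16
- * is moved into a register and added here to the UBO base
- * address in c(so->first_driver_param), leaving cat6.offset
- * at just chan*4.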
- */
-		instr = instr_create(ctx, 2, OPC_ADD_S);
-		ir3_reg_create(instr, 0, 0);
-		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = temp;
-		ir3_reg_create(instr, ubo_regid, IR3_REG_CONST);
-	} else {
-		/* Otherwise just load the buffer address directly */
-		instr = instr_create(ctx, 1, 0);
-		instr->cat1.src_type = get_utype(ctx);
-		instr->cat1.dst_type = get_utype(ctx);
-		ir3_reg_create(instr, 0, 0);
-		ir3_reg_create(instr, ubo_regid, IR3_REG_CONST);
-	}
-
-	temp = instr;
-
-	instr = instr_create(ctx, 6, OPC_LDG);
-	instr->cat6.type = TYPE_U32;
-	instr->cat6.offset = offset + chan * 4;
-	ir3_reg_create(instr, 0, 0);
-	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = temp;
-	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
-
-	reg->flags &= ~(IR3_REG_RELATIV | IR3_REG_CONST);
-	} else {
-		/* normal case (not relative addressed GPR) */
-		instr = ssa_instr_get(ctx, src->File, regid(src->Index, chan));
-	}
-
-	if (instr) {
-		reg->flags |= IR3_REG_SSA;
-		reg->instr = instr;
-	} else if (reg->flags & IR3_REG_SSA) {
-		/* special hack for trans_samp() which calls ssa_src() directly
-		 * to build up the collect (fanin) for a const src.. (so SSA flag
-		 * set but no src instr... it basically gets lucky because we
-		 * default to 0.0 for "undefined" src instructions, which is
-		 * what it wants.  We probably need to give it a better way to
-		 * do this, but for now this hack:
-		 */
-		reg->instr = create_immed(ctx, 0.0);
-	}
-}
-
-static struct ir3_register *
-add_dst_reg_wrmask(struct ir3_compile_context *ctx,
-		struct ir3_instruction *instr, const struct tgsi_dst_register *dst,
-		unsigned chan, unsigned wrmask)
-{
-	unsigned flags = 0, num = 0;
-	struct ir3_register *reg;
-
-	switch (dst->File) {
-	case TGSI_FILE_OUTPUT:
-	case TGSI_FILE_TEMPORARY:
-		/* uses SSA */
-		break;
-	case TGSI_FILE_ADDRESS:
-		flags |= IR3_REG_ADDR;
-		/* uses SSA */
-		break;
-	default:
-		compile_error(ctx, "unsupported dst register file: %s\n",
-			tgsi_file_name(dst->File));
-		break;
-	}
-
-	if (dst->Indirect) {
-		flags |= IR3_REG_RELATIV;
-
-		/* shouldn't happen, and we can't cope with it below: */
-		compile_assert(ctx, wrmask == 0x1);
-
-		compile_assert(ctx, ctx->block->address);
-		if (instr->address)
-			compile_assert(ctx, ctx->block->address == instr->address);
-
-		instr->address = ctx->block->address;
-		array_insert(ctx->ir->indirects, instr);
-	}
-
-	reg = ir3_reg_create(instr, regid(num, chan), flags);
-	reg->wrmask = wrmask;
-
-	if (wrmask == 0x1) {
-		/* normal case */
-		ssa_dst(ctx, instr, dst, chan);
-	} else if ((dst->File == TGSI_FILE_TEMPORARY) ||
-			(dst->File == TGSI_FILE_OUTPUT) ||
-			(dst->File == TGSI_FILE_ADDRESS)) {
-		struct ir3_instruction *prev = NULL;
-		unsigned i;
-
-		compile_assert(ctx, !dst->Indirect);
-
-		/* if the instruction writes multiple components, we need to
-		 * create placeholder instructions to split out the individual
-		 * registers:
-		 */
-		for (i = 0; i < 4; i++) {
-			/* NOTE: slightly ugly that we set up neighbor ptrs
-			 * for FO here, but handle FI in the CP pass.. we should
-			 * probably just always set up neighbor ptrs in the
-			 * frontend?
-			 */
-			struct ir3_instruction *split =
-					ir3_instr_create(ctx->block, -1, OPC_META_FO);
-			split->fo.off = i;
-			/* unused dst reg: */
-			/* NOTE: set the SSA flag on the dst here, because unused
-			 * FO's which don't get scheduled will not end up in the
-			 * instruction list when RA sets the SSA flag on each dst.
-			 * Slight hack.  We really should set the SSA flag on
-			 * every dst register in the frontend.
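- *
- * The cp.left/right links chained up below mark the four splits
- * as neighboring components of one logical vec4, presumably so
- * that register assignment can keep them in adjacent registers.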
- */
-			ir3_reg_create(split, 0, IR3_REG_SSA);
-			/* and src reg used to hold the original instr */
-			ir3_reg_create(split, 0, IR3_REG_SSA)->instr = instr;
-			if (prev) {
-				split->cp.left = prev;
-				split->cp.left_cnt++;
-				prev->cp.right = split;
-				prev->cp.right_cnt++;
-			}
-			if ((wrmask & (1 << i)) && !ctx->atomic)
-				ssa_dst(ctx, split, dst, chan+i);
-			prev = split;
-		}
-	}
-
-	return reg;
-}
-
-static struct ir3_register *
-add_dst_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
-		const struct tgsi_dst_register *dst, unsigned chan)
-{
-	return add_dst_reg_wrmask(ctx, instr, dst, chan, 0x1);
-}
-
-static struct ir3_register *
-add_src_reg_wrmask(struct ir3_compile_context *ctx,
-		struct ir3_instruction *instr, const struct tgsi_src_register *src,
-		unsigned chan, unsigned wrmask)
-{
-	unsigned flags = 0, num = 0;
-	struct ir3_register *reg;
-
-	switch (src->File) {
-	case TGSI_FILE_IMMEDIATE:
-		/* TODO if possible, use the actual immediate instead of a const..
-		 * but TGSI has vec4 immediates and we can only embed a scalar
-		 * (of limited size, depending on the instruction..)
-		 */
-		flags |= IR3_REG_CONST;
-		num = src->Index + ctx->so->first_immediate;
-		break;
-	case TGSI_FILE_CONSTANT:
-		flags |= IR3_REG_CONST;
-		num = src->Index;
-		break;
-	case TGSI_FILE_OUTPUT:
-		/* NOTE: we should only end up w/ the OUTPUT file for things like
-		 * clamp()'ing saturated dst instructions
-		 */
-	case TGSI_FILE_INPUT:
-	case TGSI_FILE_TEMPORARY:
-	case TGSI_FILE_SYSTEM_VALUE:
-		/* uses SSA */
-		break;
-	default:
-		compile_error(ctx, "unsupported src register file: %s\n",
-			tgsi_file_name(src->File));
-		break;
-	}
-
-	/* We seem to have 8 bits (6.2) for the dst register always, so I
-	 * think it is safe to assume a GPR cannot be >= 64
-	 *
-	 * cat3 instructions only have 8 bits for src2, but cannot take a
-	 * const for src2
-	 *
-	 * cat5 and cat6 in some cases only have 8 bits, but cannot take a
-	 * const for any src.
- *
- * Other than that we seem to have 12 bits to encode a const src,
- * except for cat1 which may only have 11 bits (but that seems like
- * a bug)
- */
-	if (flags & IR3_REG_CONST)
-		compile_assert(ctx, src->Index < (1 << 9));
-	else
-		compile_assert(ctx, src->Index < (1 << 6));
-
-	/* NOTE: abs/neg modifiers in tgsi only apply to float */
-	if (src->Absolute)
-		flags |= IR3_REG_FABS;
-	if (src->Negate)
-		flags |= IR3_REG_FNEG;
-
-	if (src->Indirect) {
-		flags |= IR3_REG_RELATIV;
-
-		/* shouldn't happen, and we can't cope with it below: */
-		compile_assert(ctx, wrmask == 0x1);
-
-		compile_assert(ctx, ctx->block->address);
-		if (instr->address)
-			compile_assert(ctx, ctx->block->address == instr->address);
-
-		instr->address = ctx->block->address;
-		array_insert(ctx->ir->indirects, instr);
-	}
-
-	reg = ir3_reg_create(instr, regid(num, chan), flags);
-	reg->wrmask = wrmask;
-
-	if (wrmask == 0x1) {
-		/* normal case */
-		ssa_src(ctx, reg, src, chan);
-	} else if ((src->File == TGSI_FILE_TEMPORARY) ||
-			(src->File == TGSI_FILE_OUTPUT) ||
-			(src->File == TGSI_FILE_INPUT)) {
-		struct ir3_instruction *collect;
-		unsigned i;
-
-		compile_assert(ctx, !src->Indirect);
-
-		/* if the instruction reads multiple components, we need a
-		 * placeholder collect (fanin) to gather up the registers:
-		 */
-		collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
-		ir3_reg_create(collect, 0, 0);   /* unused dst reg */
-
-		for (i = 0; i < 4; i++) {
-			if (wrmask & (1 << i)) {
-				/* and a src reg used to point at the original instr */
-				ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
-						src, chan + i);
-			} else if (wrmask & ~((1 << i) - 1)) {
-				/* if any components remain, insert a dummy
-				 * placeholder src reg to fill in the blanks:
-				 */
-				ir3_reg_create(collect, 0, 0);
-			}
-		}
-
-		reg->flags |= IR3_REG_SSA;
-		reg->instr = collect;
-	}
-
-	return reg;
-}
-
-static struct ir3_register *
-add_src_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
-		const struct tgsi_src_register *src, unsigned chan)
-{
-	return add_src_reg_wrmask(ctx, instr, src, chan, 0x1);
-}
-
-static void
-src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
-{
-	src->File = dst->File;
-	src->Indirect = dst->Indirect;
-	src->Dimension = dst->Dimension;
-	src->Index = dst->Index;
-	src->Absolute = 0;
-	src->Negate = 0;
-	src->SwizzleX = TGSI_SWIZZLE_X;
-	src->SwizzleY = TGSI_SWIZZLE_Y;
-	src->SwizzleZ = TGSI_SWIZZLE_Z;
-	src->SwizzleW = TGSI_SWIZZLE_W;
-}
-
-/* Get an internal-temp src/dst to use for a sequence of instructions
- * generated by a single TGSI op.
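- *
- * e.g. trans_cmp() below grabs one to hold the intermediate
- * cmps.f result before converting it into the real dst with
- * a cov or absneg.s.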
- */ -static struct tgsi_src_register * -get_internal_temp(struct ir3_compile_context *ctx, - struct tgsi_dst_register *tmp_dst) -{ - struct tgsi_src_register *tmp_src; - int n; - - tmp_dst->File = TGSI_FILE_TEMPORARY; - tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW; - tmp_dst->Indirect = 0; - tmp_dst->Dimension = 0; - - /* assign next temporary: */ - n = ctx->num_internal_temps++; - compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps)); - tmp_src = &ctx->internal_temps[n]; - - tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1; - - src_from_dst(tmp_src, tmp_dst); - - return tmp_src; -} - -static inline bool -is_const(struct tgsi_src_register *src) -{ - return (src->File == TGSI_FILE_CONSTANT) || - (src->File == TGSI_FILE_IMMEDIATE); -} - -static inline bool -is_relative(struct tgsi_src_register *src) -{ - return src->Indirect; -} - -static inline bool -is_rel_or_const(struct tgsi_src_register *src) -{ - return is_relative(src) || is_const(src); -} - -static type_t -get_ftype(struct ir3_compile_context *ctx) -{ - return TYPE_F32; -} - -static type_t -get_utype(struct ir3_compile_context *ctx) -{ - return TYPE_U32; -} - -static type_t -get_stype(struct ir3_compile_context *ctx) -{ - return TYPE_S32; -} - -static unsigned -src_swiz(struct tgsi_src_register *src, int chan) -{ - switch (chan) { - case 0: return src->SwizzleX; - case 1: return src->SwizzleY; - case 2: return src->SwizzleZ; - case 3: return src->SwizzleW; - } - assert(0); - return 0; -} - -/* for instructions that cannot take a const register as src, if needed - * generate a move to temporary gpr: - */ -static struct tgsi_src_register * -get_unconst(struct ir3_compile_context *ctx, struct tgsi_src_register *src) -{ - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; - - compile_assert(ctx, is_rel_or_const(src)); - - tmp_src = get_internal_temp(ctx, &tmp_dst); - - create_mov(ctx, &tmp_dst, src); - - return tmp_src; -} - -static void -get_immediate(struct ir3_compile_context *ctx, - struct tgsi_src_register *reg, uint32_t val) -{ - unsigned neg, swiz, idx, i; - /* actually maps 1:1 currently.. 
not sure if that is safe to rely on: */ - static const unsigned swiz2tgsi[] = { - TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W, - }; - - for (i = 0; i < ctx->immediate_idx; i++) { - swiz = i % 4; - idx = i / 4; - - if (ctx->so->immediates[idx].val[swiz] == val) { - neg = 0; - break; - } - - if (ctx->so->immediates[idx].val[swiz] == -val) { - neg = 1; - break; - } - } - - if (i == ctx->immediate_idx) { - /* need to generate a new immediate: */ - swiz = i % 4; - idx = i / 4; - neg = 0; - ctx->so->immediates[idx].val[swiz] = val; - ctx->so->immediates_count = idx + 1; - ctx->immediate_idx++; - } - - reg->File = TGSI_FILE_IMMEDIATE; - reg->Indirect = 0; - reg->Dimension = 0; - reg->Index = idx; - reg->Absolute = 0; - reg->Negate = neg; - reg->SwizzleX = swiz2tgsi[swiz]; - reg->SwizzleY = swiz2tgsi[swiz]; - reg->SwizzleZ = swiz2tgsi[swiz]; - reg->SwizzleW = swiz2tgsi[swiz]; -} - -static void -create_mov(struct ir3_compile_context *ctx, struct tgsi_dst_register *dst, - struct tgsi_src_register *src) -{ - type_t type_mov = get_ftype(ctx); - unsigned i; - - for (i = 0; i < 4; i++) { - /* move to destination: */ - if (dst->WriteMask & (1 << i)) { - struct ir3_instruction *instr; - - if (src->Absolute || src->Negate) { - /* can't have abs or neg on a mov instr, so use - * absneg.f instead to handle these cases: - */ - instr = instr_create(ctx, 2, OPC_ABSNEG_F); - } else { - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = type_mov; - instr->cat1.dst_type = type_mov; - } - - add_dst_reg(ctx, instr, dst, i); - add_src_reg(ctx, instr, src, src_swiz(src, i)); - } - } -} - -static void -create_clamp(struct ir3_compile_context *ctx, - struct tgsi_dst_register *dst, struct tgsi_src_register *val, - struct tgsi_src_register *minval, struct tgsi_src_register *maxval) -{ - struct ir3_instruction *instr; - - instr = instr_create(ctx, 2, OPC_MAX_F); - vectorize(ctx, instr, dst, 2, val, 0, minval, 0); - - instr = instr_create(ctx, 2, OPC_MIN_F); - vectorize(ctx, instr, dst, 2, val, 0, maxval, 0); -} - -static void -create_clamp_imm(struct ir3_compile_context *ctx, - struct tgsi_dst_register *dst, - uint32_t minval, uint32_t maxval) -{ - struct tgsi_src_register minconst, maxconst; - struct tgsi_src_register src; - - src_from_dst(&src, dst); - - get_immediate(ctx, &minconst, minval); - get_immediate(ctx, &maxconst, maxval); - - create_clamp(ctx, dst, &src, &minconst, &maxconst); -} - -static struct tgsi_dst_register * -get_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst) -{ - struct tgsi_dst_register *dst = &inst->Dst[0].Register; - unsigned i; - - compile_assert(ctx, !ctx->using_tmp_dst); - ctx->using_tmp_dst = true; - - for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { - struct tgsi_src_register *src = &inst->Src[i].Register; - if ((src->File == dst->File) && (src->Index == dst->Index)) { - if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) && - (src->SwizzleX == TGSI_SWIZZLE_X) && - (src->SwizzleY == TGSI_SWIZZLE_Y) && - (src->SwizzleZ == TGSI_SWIZZLE_Z) && - (src->SwizzleW == TGSI_SWIZZLE_W)) - continue; - ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst); - ctx->tmp_dst.WriteMask = dst->WriteMask; - dst = &ctx->tmp_dst; - break; - } - } - return dst; -} - -static void -put_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst, - struct tgsi_dst_register *dst) -{ - compile_assert(ctx, ctx->using_tmp_dst); - ctx->using_tmp_dst = false; - - /* if necessary, add mov back into original dst: */ - if (dst != &inst->Dst[0].Register) { - 
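-		/* get_dst() substituted a temp because one of the srcs
-		 * aliased the dst; copy the result back out:
-		 */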
create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src); - } -} - -/* helper to generate the necessary repeat and/or additional instructions - * to turn a scalar instruction into a vector operation: - */ -static void -vectorize(struct ir3_compile_context *ctx, struct ir3_instruction *instr, - struct tgsi_dst_register *dst, int nsrcs, ...) -{ - va_list ap; - int i, j, n = 0; - - instr_atomic_start(ctx); - - for (i = 0; i < 4; i++) { - if (dst->WriteMask & (1 << i)) { - struct ir3_instruction *cur; - - if (n++ == 0) { - cur = instr; - } else { - cur = instr_create(ctx, instr->category, instr->opc); - memcpy(cur->info, instr->info, sizeof(cur->info)); - } - - add_dst_reg(ctx, cur, dst, i); - - va_start(ap, nsrcs); - for (j = 0; j < nsrcs; j++) { - struct tgsi_src_register *src = - va_arg(ap, struct tgsi_src_register *); - unsigned flags = va_arg(ap, unsigned); - struct ir3_register *reg; - if (flags & IR3_REG_IMMED) { - reg = ir3_reg_create(cur, 0, IR3_REG_IMMED); - /* this is an ugly cast.. should have put flags first! */ - reg->iim_val = *(int *)&src; - } else { - reg = add_src_reg(ctx, cur, src, src_swiz(src, i)); - } - reg->flags |= flags & ~(IR3_REG_FNEG | IR3_REG_SNEG); - if (flags & IR3_REG_FNEG) - reg->flags ^= IR3_REG_FNEG; - if (flags & IR3_REG_SNEG) - reg->flags ^= IR3_REG_SNEG; - } - va_end(ap); - } - } - - instr_atomic_end(ctx); -} - -/* - * Handlers for TGSI instructions which do not have a 1:1 mapping to - * native instructions: - */ - -static void -trans_clamp(const struct instr_translater *t, - struct ir3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct tgsi_dst_register *dst = get_dst(ctx, inst); - struct tgsi_src_register *src0 = &inst->Src[0].Register; - struct tgsi_src_register *src1 = &inst->Src[1].Register; - struct tgsi_src_register *src2 = &inst->Src[2].Register; - - create_clamp(ctx, dst, src0, src1, src2); - - put_dst(ctx, inst, dst); -} - -/* ARL(x) = x, but mova from hrN.x to a0.. */ -static void -trans_arl(const struct instr_translater *t, - struct ir3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr; - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; - struct tgsi_dst_register *dst = &inst->Dst[0].Register; - struct tgsi_src_register *src = &inst->Src[0].Register; - unsigned chan = src->SwizzleX; - - compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS); - - /* NOTE: we allocate a temporary from a flat register - * namespace (ignoring half vs full). It turns out - * not to really matter since registers get reassigned - * later in ir3_ra which (hopefully!) can deal a bit - * better with mixed half and full precision. - */ - tmp_src = get_internal_temp(ctx, &tmp_dst); - - /* cov.{u,f}{32,16}s16 Rtmp, Rsrc */ - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = (t->tgsi_opc == TGSI_OPCODE_ARL) ? 
- get_ftype(ctx) : get_utype(ctx); - instr->cat1.dst_type = TYPE_S16; - add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF; - add_src_reg(ctx, instr, src, chan); - - /* shl.b Rtmp, Rtmp, 2 */ - instr = instr_create(ctx, 2, OPC_SHL_B); - add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF; - add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF; - ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2; - - /* mova a0, Rtmp */ - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = TYPE_S16; - instr->cat1.dst_type = TYPE_S16; - add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF; - add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF; -} - -/* - * texture fetch/sample instructions: - */ - -struct tex_info { - int8_t order[4]; - int8_t args; - unsigned src_wrmask, flags; -}; - -struct target_info { - uint8_t dims; - uint8_t cube; - uint8_t array; - uint8_t shadow; -}; - -static const struct target_info tex_targets[] = { - [TGSI_TEXTURE_1D] = { 1, 0, 0, 0 }, - [TGSI_TEXTURE_2D] = { 2, 0, 0, 0 }, - [TGSI_TEXTURE_3D] = { 3, 0, 0, 0 }, - [TGSI_TEXTURE_CUBE] = { 3, 1, 0, 0 }, - [TGSI_TEXTURE_RECT] = { 2, 0, 0, 0 }, - [TGSI_TEXTURE_SHADOW1D] = { 1, 0, 0, 1 }, - [TGSI_TEXTURE_SHADOW2D] = { 2, 0, 0, 1 }, - [TGSI_TEXTURE_SHADOWRECT] = { 2, 0, 0, 1 }, - [TGSI_TEXTURE_1D_ARRAY] = { 1, 0, 1, 0 }, - [TGSI_TEXTURE_2D_ARRAY] = { 2, 0, 1, 0 }, - [TGSI_TEXTURE_SHADOW1D_ARRAY] = { 1, 0, 1, 1 }, - [TGSI_TEXTURE_SHADOW2D_ARRAY] = { 2, 0, 1, 1 }, - [TGSI_TEXTURE_SHADOWCUBE] = { 3, 1, 0, 1 }, - [TGSI_TEXTURE_2D_MSAA] = { 2, 0, 0, 0 }, - [TGSI_TEXTURE_2D_ARRAY_MSAA] = { 2, 0, 1, 0 }, - [TGSI_TEXTURE_CUBE_ARRAY] = { 3, 1, 1, 0 }, - [TGSI_TEXTURE_SHADOWCUBE_ARRAY] = { 3, 1, 1, 1 }, -}; - -static void -fill_tex_info(struct ir3_compile_context *ctx, - struct tgsi_full_instruction *inst, - struct tex_info *info) -{ - const struct target_info *tgt = &tex_targets[inst->Texture.Texture]; - - if (tgt->dims == 3) - info->flags |= IR3_INSTR_3D; - if (tgt->array) - info->flags |= IR3_INSTR_A; - if (tgt->shadow) - info->flags |= IR3_INSTR_S; - - switch (inst->Instruction.Opcode) { - case TGSI_OPCODE_TXB: - case TGSI_OPCODE_TXB2: - case TGSI_OPCODE_TXL: - case TGSI_OPCODE_TXF: - info->args = 2; - break; - case TGSI_OPCODE_TXP: - info->flags |= IR3_INSTR_P; - /* fallthrough */ - case TGSI_OPCODE_TEX: - case TGSI_OPCODE_TXD: - info->args = 1; - break; - } - - /* - * lay out the first argument in the proper order: - * - actual coordinates first - * - shadow reference - * - array index - * - projection w - * - * bias/lod go into the second arg - */ - int arg, pos = 0; - for (arg = 0; arg < tgt->dims; arg++) - info->order[arg] = pos++; - if (tgt->dims == 1) - info->order[pos++] = -1; - if (tgt->shadow) - info->order[pos++] = MAX2(arg + tgt->array, 2); - if (tgt->array) - info->order[pos++] = arg++; - if (info->flags & IR3_INSTR_P) - info->order[pos++] = 3; - - info->src_wrmask = (1 << pos) - 1; - - for (; pos < 4; pos++) - info->order[pos] = -1; - - assert(pos <= 4); -} - -static bool check_swiz(struct tgsi_src_register *src, const int8_t order[4]) -{ - unsigned i; - for (i = 1; (i < 4) && order[i] >= 0; i++) - if (src_swiz(src, i) != (src_swiz(src, 0) + order[i])) - return false; - return true; -} - -static bool is_1d(unsigned tex) -{ - return tex_targets[tex].dims == 1; -} - -static struct tgsi_src_register * -get_tex_coord(struct ir3_compile_context *ctx, - struct tgsi_full_instruction *inst, - const struct tex_info *tinf) -{ - struct tgsi_src_register *coord = &inst->Src[0].Register; - 
struct ir3_instruction *instr; - unsigned tex = inst->Texture.Texture; - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; - type_t type_mov = get_ftype(ctx); - unsigned j; - - /* need to move things around: */ - tmp_src = get_internal_temp(ctx, &tmp_dst); - - for (j = 0; j < 4; j++) { - if (tinf->order[j] < 0) - continue; - instr = instr_create(ctx, 1, 0); /* mov */ - instr->cat1.src_type = type_mov; - instr->cat1.dst_type = type_mov; - add_dst_reg(ctx, instr, &tmp_dst, j); - add_src_reg(ctx, instr, coord, - src_swiz(coord, tinf->order[j])); - } - - /* fix up .y coord: */ - if (is_1d(tex)) { - struct ir3_register *imm; - instr = instr_create(ctx, 1, 0); /* mov */ - instr->cat1.src_type = type_mov; - instr->cat1.dst_type = type_mov; - add_dst_reg(ctx, instr, &tmp_dst, 1); /* .y */ - imm = ir3_reg_create(instr, 0, IR3_REG_IMMED); - if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) - imm->iim_val = 0; - else - imm->fim_val = 0.5; - } - - return tmp_src; -} - -static void -trans_samp(const struct instr_translater *t, - struct ir3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr, *collect; - struct ir3_register *reg; - struct tgsi_dst_register *dst = &inst->Dst[0].Register; - struct tgsi_src_register *orig, *coord, *samp, *offset, *dpdx, *dpdy; - struct tgsi_src_register zero; - const struct target_info *tgt = &tex_targets[inst->Texture.Texture]; - struct tex_info tinf; - int i; - - memset(&tinf, 0, sizeof(tinf)); - fill_tex_info(ctx, inst, &tinf); - coord = get_tex_coord(ctx, inst, &tinf); - get_immediate(ctx, &zero, 0); - - switch (inst->Instruction.Opcode) { - case TGSI_OPCODE_TXB2: - orig = &inst->Src[1].Register; - samp = &inst->Src[2].Register; - break; - case TGSI_OPCODE_TXD: - orig = &inst->Src[0].Register; - dpdx = &inst->Src[1].Register; - dpdy = &inst->Src[2].Register; - samp = &inst->Src[3].Register; - if (is_rel_or_const(dpdx)) - dpdx = get_unconst(ctx, dpdx); - if (is_rel_or_const(dpdy)) - dpdy = get_unconst(ctx, dpdy); - break; - default: - orig = &inst->Src[0].Register; - samp = &inst->Src[1].Register; - break; - } - if (tinf.args > 1 && is_rel_or_const(orig)) - orig = get_unconst(ctx, orig); - - /* scale up integer coords for TXF based on the LOD */ - if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) { - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; - type_t type_mov = get_utype(ctx); - - tmp_src = get_internal_temp(ctx, &tmp_dst); - for (i = 0; i < tgt->dims; i++) { - instr = instr_create(ctx, 2, OPC_SHL_B); - add_dst_reg(ctx, instr, &tmp_dst, i); - add_src_reg(ctx, instr, coord, src_swiz(coord, i)); - add_src_reg(ctx, instr, orig, orig->SwizzleW); - } - if (tgt->dims < 2) { - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = type_mov; - instr->cat1.dst_type = type_mov; - add_dst_reg(ctx, instr, &tmp_dst, i); - add_src_reg(ctx, instr, &zero, 0); - i++; - } - if (tgt->array) { - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = type_mov; - instr->cat1.dst_type = type_mov; - add_dst_reg(ctx, instr, &tmp_dst, i); - add_src_reg(ctx, instr, coord, src_swiz(coord, i)); - } - coord = tmp_src; - } - - if (inst->Texture.NumOffsets) { - struct tgsi_texture_offset *tex_offset = &inst->TexOffsets[0]; - struct tgsi_src_register offset_src = {0}; - - offset_src.File = tex_offset->File; - offset_src.Index = tex_offset->Index; - offset_src.SwizzleX = tex_offset->SwizzleX; - offset_src.SwizzleY = tex_offset->SwizzleY; - offset_src.SwizzleZ = tex_offset->SwizzleZ; - offset = 
get_unconst(ctx, &offset_src); - tinf.flags |= IR3_INSTR_O; - } - - instr = instr_create(ctx, 5, t->opc); - if (ctx->integer_s & (1 << samp->Index)) - instr->cat5.type = get_utype(ctx); - else - instr->cat5.type = get_ftype(ctx); - instr->cat5.samp = samp->Index; - instr->cat5.tex = samp->Index; - instr->flags |= tinf.flags; - - add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask); - - reg = ir3_reg_create(instr, 0, IR3_REG_SSA); - - collect = ir3_instr_create2(ctx->block, -1, OPC_META_FI, 12); - ir3_reg_create(collect, 0, 0); - for (i = 0; i < 4; i++) { - if (tinf.src_wrmask & (1 << i)) - ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), - coord, src_swiz(coord, i)); - else if (tinf.src_wrmask & ~((1 << i) - 1)) - ir3_reg_create(collect, 0, 0); - } - - /* Attach derivatives onto the end of the fan-in. Derivatives start after - * the 4th argument, so make sure that fi is padded up to 4 first. - */ - if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) { - while (collect->regs_count < 5) - ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0); - for (i = 0; i < tgt->dims; i++) - ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), dpdx, i); - if (tgt->dims < 2) - ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0); - for (i = 0; i < tgt->dims; i++) - ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), dpdy, i); - if (tgt->dims < 2) - ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0); - tinf.src_wrmask |= ((1 << (2 * MAX2(tgt->dims, 2))) - 1) << 4; - } - - reg->instr = collect; - reg->wrmask = tinf.src_wrmask; - - /* The second argument contains the offsets, followed by the lod/bias - * argument. This is constructed more manually due to the dynamic nature. - */ - if (inst->Texture.NumOffsets == 0 && tinf.args == 1) - return; - - reg = ir3_reg_create(instr, 0, IR3_REG_SSA); - - collect = ir3_instr_create2(ctx->block, -1, OPC_META_FI, 5); - ir3_reg_create(collect, 0, 0); - - if (inst->Texture.NumOffsets) { - for (i = 0; i < tgt->dims; i++) - ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), - offset, i); - if (tgt->dims < 2) - ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0); - } - if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2) - ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), - orig, orig->SwizzleX); - else if (tinf.args > 1) - ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), - orig, orig->SwizzleW); - - reg->instr = collect; - reg->wrmask = (1 << (collect->regs_count - 1)) - 1; -} - -static void -trans_txq(const struct instr_translater *t, - struct ir3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr; - struct tgsi_dst_register *dst = &inst->Dst[0].Register; - struct tgsi_src_register *level = &inst->Src[0].Register; - struct tgsi_src_register *samp = &inst->Src[1].Register; - const struct target_info *tgt = &tex_targets[inst->Texture.Texture]; - struct tex_info tinf; - - memset(&tinf, 0, sizeof(tinf)); - fill_tex_info(ctx, inst, &tinf); - if (is_rel_or_const(level)) - level = get_unconst(ctx, level); - - instr = instr_create(ctx, 5, OPC_GETSIZE); - instr->cat5.type = get_utype(ctx); - instr->cat5.samp = samp->Index; - instr->cat5.tex = samp->Index; - instr->flags |= tinf.flags; - - if (tgt->array && (dst->WriteMask & (1 << tgt->dims))) { - /* Array size actually ends up in .w rather than .z. This doesn't - * matter for miplevel 0, but for higher mips the value in z is - * minified whereas w stays. 
Also, the value in TEX_CONST_3_DEPTH is - * returned, which means that we have to add 1 to it for arrays. - */ - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; - type_t type_mov = get_utype(ctx); - - tmp_src = get_internal_temp(ctx, &tmp_dst); - add_dst_reg_wrmask(ctx, instr, &tmp_dst, 0, - dst->WriteMask | TGSI_WRITEMASK_W); - add_src_reg_wrmask(ctx, instr, level, level->SwizzleX, 0x1); - - if (dst->WriteMask & TGSI_WRITEMASK_X) { - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = type_mov; - instr->cat1.dst_type = type_mov; - add_dst_reg(ctx, instr, dst, 0); - add_src_reg(ctx, instr, tmp_src, src_swiz(tmp_src, 0)); - } - - if (tgt->dims == 2) { - if (dst->WriteMask & TGSI_WRITEMASK_Y) { - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = type_mov; - instr->cat1.dst_type = type_mov; - add_dst_reg(ctx, instr, dst, 1); - add_src_reg(ctx, instr, tmp_src, src_swiz(tmp_src, 1)); - } - } - - instr = instr_create(ctx, 2, OPC_ADD_U); - add_dst_reg(ctx, instr, dst, tgt->dims); - add_src_reg(ctx, instr, tmp_src, src_swiz(tmp_src, 3)); - ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1; - } else { - add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask); - add_src_reg_wrmask(ctx, instr, level, level->SwizzleX, 0x1); - } - - if (dst->WriteMask & TGSI_WRITEMASK_W) { - /* The # of levels comes from getinfo.z. We need to add 1 to it, since - * the value in TEX_CONST_0 is zero-based. - */ - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; - - tmp_src = get_internal_temp(ctx, &tmp_dst); - instr = instr_create(ctx, 5, OPC_GETINFO); - instr->cat5.type = get_utype(ctx); - instr->cat5.samp = samp->Index; - instr->cat5.tex = samp->Index; - add_dst_reg_wrmask(ctx, instr, &tmp_dst, 0, TGSI_WRITEMASK_Z); - - instr = instr_create(ctx, 2, OPC_ADD_U); - add_dst_reg(ctx, instr, dst, 3); - add_src_reg(ctx, instr, tmp_src, src_swiz(tmp_src, 2)); - ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1; - } -} - -/* DDX/DDY */ -static void -trans_deriv(const struct instr_translater *t, - struct ir3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr; - struct tgsi_dst_register *dst = &inst->Dst[0].Register; - struct tgsi_src_register *src = &inst->Src[0].Register; - static const int8_t order[4] = {0, 1, 2, 3}; - - if (!check_swiz(src, order)) { - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; - - tmp_src = get_internal_temp(ctx, &tmp_dst); - create_mov(ctx, &tmp_dst, src); - - src = tmp_src; - } - - /* This might be a workaround for hw bug? Blob compiler always - * seems to work two components at a time for dsy/dsx. It does - * actually seem to work in some cases (or at least some piglit - * tests) for four components at a time. But seems more reliable - * to split this into two instructions like the blob compiler - * does: - */ - - instr = instr_create(ctx, 5, t->opc); - instr->cat5.type = get_ftype(ctx); - add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask & 0x3); - add_src_reg_wrmask(ctx, instr, src, 0, dst->WriteMask & 0x3); - - instr = instr_create(ctx, 5, t->opc); - instr->cat5.type = get_ftype(ctx); - add_dst_reg_wrmask(ctx, instr, dst, 2, (dst->WriteMask >> 2) & 0x3); - add_src_reg_wrmask(ctx, instr, src, 2, (dst->WriteMask >> 2) & 0x3); -} - -/* - * SEQ(a,b) = (a == b) ? 1.0 : 0.0 - * cmps.f.eq tmp0, a, b - * cov.u16f16 dst, tmp0 - * - * SNE(a,b) = (a != b) ? 1.0 : 0.0 - * cmps.f.ne tmp0, a, b - * cov.u16f16 dst, tmp0 - * - * SGE(a,b) = (a >= b) ? 
1.0 : 0.0
- *   cmps.f.ge tmp0, a, b
- *   cov.u16f16 dst, tmp0
- *
- * SLE(a,b) = (a <= b) ? 1.0 : 0.0
- *   cmps.f.le tmp0, a, b
- *   cov.u16f16 dst, tmp0
- *
- * SGT(a,b) = (a > b) ? 1.0 : 0.0
- *   cmps.f.gt tmp0, a, b
- *   cov.u16f16 dst, tmp0
- *
- * SLT(a,b) = (a < b) ? 1.0 : 0.0
- *   cmps.f.lt tmp0, a, b
- *   cov.u16f16 dst, tmp0
- *
- * CMP(a,b,c) = (a < 0.0) ? b : c
- *   cmps.f.lt tmp0, a, {0.0}
- *   sel.b16 dst, b, tmp0, c
- */
-static void
-trans_cmp(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct ir3_instruction *instr;
-	struct tgsi_dst_register tmp_dst;
-	struct tgsi_src_register *tmp_src;
-	struct tgsi_src_register constval0;
-	/* final instruction for CMP() uses orig src1 and src2: */
-	struct tgsi_dst_register *dst = get_dst(ctx, inst);
-	struct tgsi_src_register *a0, *a1, *a2;
-	unsigned condition;
-
-	tmp_src = get_internal_temp(ctx, &tmp_dst);
-
-	a0 = &inst->Src[0].Register;  /* a */
-	a1 = &inst->Src[1].Register;  /* b */
-
-	switch (t->tgsi_opc) {
-	case TGSI_OPCODE_SEQ:
-	case TGSI_OPCODE_FSEQ:
-		condition = IR3_COND_EQ;
-		break;
-	case TGSI_OPCODE_SNE:
-	case TGSI_OPCODE_FSNE:
-		condition = IR3_COND_NE;
-		break;
-	case TGSI_OPCODE_SGE:
-	case TGSI_OPCODE_FSGE:
-		condition = IR3_COND_GE;
-		break;
-	case TGSI_OPCODE_SLT:
-	case TGSI_OPCODE_FSLT:
-		condition = IR3_COND_LT;
-		break;
-	case TGSI_OPCODE_SLE:
-		condition = IR3_COND_LE;
-		break;
-	case TGSI_OPCODE_SGT:
-		condition = IR3_COND_GT;
-		break;
-	case TGSI_OPCODE_CMP:
-		get_immediate(ctx, &constval0, fui(0.0));
-		a0 = &inst->Src[0].Register;  /* a */
-		a1 = &constval0;              /* {0.0} */
-		condition = IR3_COND_LT;
-		break;
-	default:
-		compile_assert(ctx, 0);
-		return;
-	}
-
-	if (is_const(a0) && is_const(a1))
-		a0 = get_unconst(ctx, a0);
-
-	/* cmps.f.<cond> tmp, a0, a1 */
-	instr = instr_create(ctx, 2, OPC_CMPS_F);
-	instr->cat2.condition = condition;
-	vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
-
-	switch (t->tgsi_opc) {
-	case TGSI_OPCODE_SEQ:
-	case TGSI_OPCODE_SGE:
-	case TGSI_OPCODE_SLE:
-	case TGSI_OPCODE_SNE:
-	case TGSI_OPCODE_SGT:
-	case TGSI_OPCODE_SLT:
-		/* cov.u16f16 dst, tmp0 */
-		instr = instr_create(ctx, 1, 0);
-		instr->cat1.src_type = get_utype(ctx);
-		instr->cat1.dst_type = get_ftype(ctx);
-		vectorize(ctx, instr, dst, 1, tmp_src, 0);
-		break;
-	case TGSI_OPCODE_FSEQ:
-	case TGSI_OPCODE_FSGE:
-	case TGSI_OPCODE_FSNE:
-	case TGSI_OPCODE_FSLT:
-		/* absneg.s dst, (neg)tmp0 */
-		instr = instr_create(ctx, 2, OPC_ABSNEG_S);
-		vectorize(ctx, instr, dst, 1, tmp_src, IR3_REG_SNEG);
-		break;
-	case TGSI_OPCODE_CMP:
-		a1 = &inst->Src[1].Register;
-		a2 = &inst->Src[2].Register;
-		/* sel.{b32,b16} dst, src2, tmp, src1 */
-		instr = instr_create(ctx, 3, OPC_SEL_B32);
-		vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);
-		break;
-	}
-
-	put_dst(ctx, inst, dst);
-}
-
-/*
- * USNE(a,b) = (a != b) ? ~0 : 0
- *   cmps.u32.ne dst, a, b
- *
- * USEQ(a,b) = (a == b) ? ~0 : 0
- *   cmps.u32.eq dst, a, b
- *
- * ISGE(a,b) = (a >= b) ? ~0 : 0
- *   cmps.s32.ge dst, a, b
- *
- * USGE(a,b) = (a >= b) ? ~0 : 0
- *   cmps.u32.ge dst, a, b
- *
- * ISLT(a,b) = (a < b) ? ~0 : 0
- *   cmps.s32.lt dst, a, b
- *
- * USLT(a,b) = (a < b) ?
~0 : 0 - * cmps.u32.lt dst, a, b - * - */ -static void -trans_icmp(const struct instr_translater *t, - struct ir3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr; - struct tgsi_dst_register *dst = get_dst(ctx, inst); - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; - struct tgsi_src_register *a0, *a1; - unsigned condition; - - a0 = &inst->Src[0].Register; /* a */ - a1 = &inst->Src[1].Register; /* b */ - - switch (t->tgsi_opc) { - case TGSI_OPCODE_USNE: - condition = IR3_COND_NE; - break; - case TGSI_OPCODE_USEQ: - condition = IR3_COND_EQ; - break; - case TGSI_OPCODE_ISGE: - case TGSI_OPCODE_USGE: - condition = IR3_COND_GE; - break; - case TGSI_OPCODE_ISLT: - case TGSI_OPCODE_USLT: - condition = IR3_COND_LT; - break; - - default: - compile_assert(ctx, 0); - return; - } - - if (is_const(a0) && is_const(a1)) - a0 = get_unconst(ctx, a0); - - tmp_src = get_internal_temp(ctx, &tmp_dst); - /* cmps.{u32,s32}.<cond> tmp, a0, a1 */ - instr = instr_create(ctx, 2, t->opc); - instr->cat2.condition = condition; - vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0); - - /* absneg.s dst, (neg)tmp */ - instr = instr_create(ctx, 2, OPC_ABSNEG_S); - vectorize(ctx, instr, dst, 1, tmp_src, IR3_REG_SNEG); - - put_dst(ctx, inst, dst); -} - -/* - * UCMP(a,b,c) = a ? b : c - * sel.b16 dst, b, a, c - */ -static void -trans_ucmp(const struct instr_translater *t, - struct ir3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr; - struct tgsi_dst_register *dst = get_dst(ctx, inst); - struct tgsi_src_register *a0, *a1, *a2; - - a0 = &inst->Src[0].Register; /* a */ - a1 = &inst->Src[1].Register; /* b */ - a2 = &inst->Src[2].Register; /* c */ - - if (is_rel_or_const(a0)) - a0 = get_unconst(ctx, a0); - - /* sel.{b32,b16} dst, b, a, c */ - instr = instr_create(ctx, 3, OPC_SEL_B32); - vectorize(ctx, instr, dst, 3, a1, 0, a0, 0, a2, 0); - put_dst(ctx, inst, dst); -} - -/* - * ISSG(a) = a < 0 ? -1 : a > 0 ? 
1 : 0 - * cmps.s.lt tmp_neg, a, 0 # 1 if a is negative - * cmps.s.gt tmp_pos, a, 0 # 1 if a is positive - * sub.u dst, tmp_pos, tmp_neg - */ -static void -trans_issg(const struct instr_translater *t, - struct ir3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr; - struct tgsi_dst_register *dst = get_dst(ctx, inst); - struct tgsi_src_register *a = &inst->Src[0].Register; - struct tgsi_dst_register neg_dst, pos_dst; - struct tgsi_src_register *neg_src, *pos_src; - - neg_src = get_internal_temp(ctx, &neg_dst); - pos_src = get_internal_temp(ctx, &pos_dst); - - /* cmps.s.lt neg, a, 0 */ - instr = instr_create(ctx, 2, OPC_CMPS_S); - instr->cat2.condition = IR3_COND_LT; - vectorize(ctx, instr, &neg_dst, 2, a, 0, 0, IR3_REG_IMMED); - - /* cmps.s.gt pos, a, 0 */ - instr = instr_create(ctx, 2, OPC_CMPS_S); - instr->cat2.condition = IR3_COND_GT; - vectorize(ctx, instr, &pos_dst, 2, a, 0, 0, IR3_REG_IMMED); - - /* sub.u dst, pos, neg */ - instr = instr_create(ctx, 2, OPC_SUB_U); - vectorize(ctx, instr, dst, 2, pos_src, 0, neg_src, 0); - - put_dst(ctx, inst, dst); -} - - - -/* - * Conditional / Flow control - */ - -static void -push_branch(struct ir3_compile_context *ctx, bool inv, - struct ir3_instruction *instr, struct ir3_instruction *cond) -{ - unsigned int idx = ctx->branch_count++; - compile_assert(ctx, idx < ARRAY_SIZE(ctx->branch)); - ctx->branch[idx].instr = instr; - ctx->branch[idx].inv = inv; - /* else side of branch has same condition: */ - if (!inv) - ctx->branch[idx].cond = cond; -} - -static struct ir3_instruction * -pop_branch(struct ir3_compile_context *ctx) -{ - unsigned int idx = --ctx->branch_count; - return ctx->branch[idx].instr; -} - -static void -trans_if(const struct instr_translater *t, - struct ir3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr, *cond; - struct tgsi_src_register *src = &inst->Src[0].Register; - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; - struct tgsi_src_register constval; - - get_immediate(ctx, &constval, fui(0.0)); - tmp_src = get_internal_temp(ctx, &tmp_dst); - - if (is_const(src)) - src = get_unconst(ctx, src); - - /* cmps.{f,u}.ne tmp0, b, {0.0} */ - instr = instr_create(ctx, 2, t->opc); - add_dst_reg(ctx, instr, &tmp_dst, 0); - add_src_reg(ctx, instr, src, src->SwizzleX); - add_src_reg(ctx, instr, &constval, constval.SwizzleX); - instr->cat2.condition = IR3_COND_NE; - - compile_assert(ctx, instr->regs[1]->flags & IR3_REG_SSA); /* because get_unconst() */ - cond = instr->regs[1]->instr; - - /* meta:flow tmp0 */ - instr = instr_create(ctx, -1, OPC_META_FLOW); - ir3_reg_create(instr, 0, 0); /* dummy dst */ - add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X); - - push_branch(ctx, false, instr, cond); - instr->flow.if_block = push_block(ctx); -} - -static void -trans_else(const struct instr_translater *t, - struct ir3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr; - - pop_block(ctx); - - instr = pop_branch(ctx); - - compile_assert(ctx, (instr->category == -1) && - (instr->opc == OPC_META_FLOW)); - - push_branch(ctx, true, instr, NULL); - instr->flow.else_block = push_block(ctx); -} - -static struct ir3_instruction * -find_temporary(struct ir3_block *block, unsigned n) -{ - if (block->parent && !block->temporaries[n]) - return find_temporary(block->parent, n); - return block->temporaries[n]; -} - -static struct ir3_instruction * -find_output(struct ir3_block *block, unsigned n) -{ - if 
(block->parent && !block->outputs[n]) - return find_output(block->parent, n); - return block->outputs[n]; -} - -static struct ir3_instruction * -create_phi(struct ir3_compile_context *ctx, struct ir3_instruction *cond, - struct ir3_instruction *a, struct ir3_instruction *b) -{ - struct ir3_instruction *phi; - - compile_assert(ctx, cond); - - /* Either side of the condition could be null.. which - * indicates a variable written on only one side of the - * branch. Normally this should only be variables not - * used outside of that side of the branch. So we could - * just 'return a ? a : b;' in that case. But for better - * defined undefined behavior we just stick in imm{0.0}. - * In the common case of a value only used within the - * one side of the branch, the PHI instruction will not - * get scheduled - */ - if (!a) - a = create_immed(ctx, 0.0); - if (!b) - b = create_immed(ctx, 0.0); - - phi = instr_create(ctx, -1, OPC_META_PHI); - ir3_reg_create(phi, 0, 0); /* dummy dst */ - ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = cond; - ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = a; - ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = b; - - return phi; -} - -static void -trans_endif(const struct instr_translater *t, - struct ir3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr; - struct ir3_block *ifb, *elseb; - struct ir3_instruction **ifout, **elseout; - unsigned i, ifnout = 0, elsenout = 0; - - pop_block(ctx); - - instr = pop_branch(ctx); - - compile_assert(ctx, (instr->category == -1) && - (instr->opc == OPC_META_FLOW)); - - ifb = instr->flow.if_block; - elseb = instr->flow.else_block; - /* if there is no else block, the parent block is used for the - * branch-not-taken src of the PHI instructions: - */ - if (!elseb) - elseb = ifb->parent; - - /* worst case sizes: */ - ifnout = ifb->ntemporaries + ifb->noutputs; - elsenout = elseb->ntemporaries + elseb->noutputs; - - ifout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * ifnout); - if (elseb != ifb->parent) - elseout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * elsenout); - - ifnout = 0; - elsenout = 0; - - /* generate PHI instructions for any temporaries written: */ - for (i = 0; i < ifb->ntemporaries; i++) { - struct ir3_instruction *a = ifb->temporaries[i]; - struct ir3_instruction *b = elseb->temporaries[i]; - - /* if temporary written in if-block, or if else block - * is present and temporary written in else-block: - */ - if (a || ((elseb != ifb->parent) && b)) { - struct ir3_instruction *phi; - - /* if only written on one side, find the closest - * enclosing update on other side: - */ - if (!a) - a = find_temporary(ifb, i); - if (!b) - b = find_temporary(elseb, i); - - ifout[ifnout] = a; - a = create_output(ifb, a, ifnout++); - - if (elseb != ifb->parent) { - elseout[elsenout] = b; - b = create_output(elseb, b, elsenout++); - } - - phi = create_phi(ctx, instr, a, b); - ctx->block->temporaries[i] = phi; - } - } - - compile_assert(ctx, ifb->noutputs == elseb->noutputs); - - /* .. 
and any outputs written: */
-	for (i = 0; i < ifb->noutputs; i++) {
-		struct ir3_instruction *a = ifb->outputs[i];
-		struct ir3_instruction *b = elseb->outputs[i];
-
-		/* if the output was written in the if-block, or if an else block
-		 * is present and the output was written in the else-block:
-		 */
-		if (a || ((elseb != ifb->parent) && b)) {
-			struct ir3_instruction *phi;
-
-			/* if only written on one side, find the closest
-			 * enclosing update on the other side:
-			 */
-			if (!a)
-				a = find_output(ifb, i);
-			if (!b)
-				b = find_output(elseb, i);
-
-			ifout[ifnout] = a;
-			a = create_output(ifb, a, ifnout++);
-
-			if (elseb != ifb->parent) {
-				elseout[elsenout] = b;
-				b = create_output(elseb, b, elsenout++);
-			}
-
-			phi = create_phi(ctx, instr, a, b);
-			ctx->block->outputs[i] = phi;
-		}
-	}
-
-	ifb->noutputs = ifnout;
-	ifb->outputs = ifout;
-
-	if (elseb != ifb->parent) {
-		elseb->noutputs = elsenout;
-		elseb->outputs = elseout;
-	}
-
-	// TODO maybe we want to compact block->inputs?
-}
-
-/*
- * Kill
- */
-
-static void
-trans_kill(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct ir3_instruction *instr, *immed, *cond = NULL;
-	bool inv = false;
-
-	/* unconditional kill, use the enclosing if condition: */
-	if (ctx->branch_count > 0) {
-		unsigned int idx = ctx->branch_count - 1;
-		cond = ctx->branch[idx].cond;
-		inv = ctx->branch[idx].inv;
-	} else {
-		cond = create_immed(ctx, 1.0);
-	}
-
-	compile_assert(ctx, cond);
-
-	immed = create_immed(ctx, 0.0);
-
-	/* cmps.f.ne p0.x, cond, {0.0} */
-	instr = instr_create(ctx, 2, OPC_CMPS_F);
-	instr->cat2.condition = IR3_COND_NE;
-	ir3_reg_create(instr, regid(REG_P0, 0), 0);
-	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
-	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
-	cond = instr;
-
-	/* kill p0.x */
-	instr = instr_create(ctx, 0, OPC_KILL);
-	instr->cat0.inv = inv;
-	ir3_reg_create(instr, 0, 0);  /* dummy dst */
-	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
-
-	ctx->kill[ctx->kill_count++] = instr;
-
-	ctx->so->has_kill = true;
-}
-
-/*
- * Kill-If
- */
-
-static void
-trans_killif(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct tgsi_src_register *src = &inst->Src[0].Register;
-	struct ir3_instruction *instr, *immed, *cond = NULL;
-	bool inv = false;
-
-	immed = create_immed(ctx, 0.0);
-
-	/* cmps.f.ne p0.x, {0.0}, src */
-	instr = instr_create(ctx, 2, OPC_CMPS_F);
-	instr->cat2.condition = IR3_COND_NE;
-	ir3_reg_create(instr, regid(REG_P0, 0), 0);
-	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
-	add_src_reg(ctx, instr, src, src->SwizzleX);
-
-	cond = instr;
-
-	/* kill p0.x */
-	instr = instr_create(ctx, 0, OPC_KILL);
-	instr->cat0.inv = inv;
-	ir3_reg_create(instr, 0, 0);  /* dummy dst */
-	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
-
-	ctx->kill[ctx->kill_count++] = instr;
-
-	ctx->so->has_kill = true;
-}
-
-/*
- * I2F / U2F / F2I / F2U
- */
-
-static void
-trans_cov(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct ir3_instruction *instr;
-	struct tgsi_dst_register *dst = get_dst(ctx, inst);
-	struct tgsi_src_register *src = &inst->Src[0].Register;
-
-	// cov.f32s32 dst, tmp0
-	instr = instr_create(ctx, 1, 0);
-	switch (t->tgsi_opc) {
-	case TGSI_OPCODE_U2F:
-		instr->cat1.src_type = TYPE_U32;
-		instr->cat1.dst_type = TYPE_F32;
-		break;
-	case TGSI_OPCODE_I2F:
-		instr->cat1.src_type = TYPE_S32;
-		instr->cat1.dst_type = TYPE_F32;
break; - case TGSI_OPCODE_F2U: - instr->cat1.src_type = TYPE_F32; - instr->cat1.dst_type = TYPE_U32; - break; - case TGSI_OPCODE_F2I: - instr->cat1.src_type = TYPE_F32; - instr->cat1.dst_type = TYPE_S32; - break; - - } - vectorize(ctx, instr, dst, 1, src, 0); - put_dst(ctx, inst, dst); -} - -/* - * UMUL / UMAD - * - * There is no 32-bit multiply instruction, so splitting a and b into high and - * low components, we get that - * - * dst = al * bl + ah * bl << 16 + al * bh << 16 - * - * mull.u tmp0, a, b (mul low, i.e. al * bl) - * madsh.m16 tmp1, a, b, tmp0 (mul-add shift high mix, i.e. ah * bl << 16) - * madsh.m16 dst, b, a, tmp1 (i.e. al * bh << 16) - * - * For UMAD, add in the extra argument after mull.u. - */ -static void -trans_umul(const struct instr_translater *t, - struct ir3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr; - struct tgsi_dst_register *dst = get_dst(ctx, inst); - struct tgsi_src_register *a = &inst->Src[0].Register; - struct tgsi_src_register *b = &inst->Src[1].Register; - - struct tgsi_dst_register tmp0_dst, tmp1_dst; - struct tgsi_src_register *tmp0_src, *tmp1_src; - - tmp0_src = get_internal_temp(ctx, &tmp0_dst); - tmp1_src = get_internal_temp(ctx, &tmp1_dst); - - if (is_rel_or_const(a)) - a = get_unconst(ctx, a); - if (is_rel_or_const(b)) - b = get_unconst(ctx, b); - - /* mull.u tmp0, a, b */ - instr = instr_create(ctx, 2, OPC_MULL_U); - vectorize(ctx, instr, &tmp0_dst, 2, a, 0, b, 0); - - if (t->tgsi_opc == TGSI_OPCODE_UMAD) { - struct tgsi_src_register *c = &inst->Src[2].Register; - - /* add.u tmp0, tmp0, c */ - instr = instr_create(ctx, 2, OPC_ADD_U); - vectorize(ctx, instr, &tmp0_dst, 2, tmp0_src, 0, c, 0); - } - - /* madsh.m16 tmp1, a, b, tmp0 */ - instr = instr_create(ctx, 3, OPC_MADSH_M16); - vectorize(ctx, instr, &tmp1_dst, 3, a, 0, b, 0, tmp0_src, 0); - - /* madsh.m16 dst, b, a, tmp1 */ - instr = instr_create(ctx, 3, OPC_MADSH_M16); - vectorize(ctx, instr, dst, 3, b, 0, a, 0, tmp1_src, 0); - put_dst(ctx, inst, dst); -} - -/* - * IDIV / UDIV / MOD / UMOD - * - * See NV50LegalizeSSA::handleDIV for the origin of this implementation. For - * MOD/UMOD, it becomes a - [IU]DIV(a, modulus) * modulus. 
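- * e.g. UMOD(7, 3): UDIV(7, 3) = 2, so the result is
- * 7 - 2*3 = 1.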
- */ -static void -trans_idiv(const struct instr_translater *t, - struct ir3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr; - struct tgsi_dst_register *dst = get_dst(ctx, inst), *premod_dst = dst; - struct tgsi_src_register *a = &inst->Src[0].Register; - struct tgsi_src_register *b = &inst->Src[1].Register; - - struct tgsi_dst_register af_dst, bf_dst, q_dst, r_dst, a_dst, b_dst; - struct tgsi_src_register *af_src, *bf_src, *q_src, *r_src, *a_src, *b_src; - - struct tgsi_src_register negative_2, thirty_one; - type_t src_type; - - if (t->tgsi_opc == TGSI_OPCODE_IDIV || t->tgsi_opc == TGSI_OPCODE_MOD) - src_type = get_stype(ctx); - else - src_type = get_utype(ctx); - - af_src = get_internal_temp(ctx, &af_dst); - bf_src = get_internal_temp(ctx, &bf_dst); - q_src = get_internal_temp(ctx, &q_dst); - r_src = get_internal_temp(ctx, &r_dst); - a_src = get_internal_temp(ctx, &a_dst); - b_src = get_internal_temp(ctx, &b_dst); - - get_immediate(ctx, &negative_2, -2); - get_immediate(ctx, &thirty_one, 31); - - if (t->tgsi_opc == TGSI_OPCODE_MOD || t->tgsi_opc == TGSI_OPCODE_UMOD) - premod_dst = &q_dst; - - /* cov.[us]32f32 af, numerator */ - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = src_type; - instr->cat1.dst_type = get_ftype(ctx); - vectorize(ctx, instr, &af_dst, 1, a, 0); - - /* cov.[us]32f32 bf, denominator */ - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = src_type; - instr->cat1.dst_type = get_ftype(ctx); - vectorize(ctx, instr, &bf_dst, 1, b, 0); - - /* Get the absolute values for IDIV */ - if (type_sint(src_type)) { - /* absneg.f af, (abs)af */ - instr = instr_create(ctx, 2, OPC_ABSNEG_F); - vectorize(ctx, instr, &af_dst, 1, af_src, IR3_REG_FABS); - - /* absneg.f bf, (abs)bf */ - instr = instr_create(ctx, 2, OPC_ABSNEG_F); - vectorize(ctx, instr, &bf_dst, 1, bf_src, IR3_REG_FABS); - - /* absneg.s a, (abs)numerator */ - instr = instr_create(ctx, 2, OPC_ABSNEG_S); - vectorize(ctx, instr, &a_dst, 1, a, IR3_REG_SABS); - - /* absneg.s b, (abs)denominator */ - instr = instr_create(ctx, 2, OPC_ABSNEG_S); - vectorize(ctx, instr, &b_dst, 1, b, IR3_REG_SABS); - } else { - /* mov.u32u32 a, numerator */ - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = src_type; - instr->cat1.dst_type = src_type; - vectorize(ctx, instr, &a_dst, 1, a, 0); - - /* mov.u32u32 b, denominator */ - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = src_type; - instr->cat1.dst_type = src_type; - vectorize(ctx, instr, &b_dst, 1, b, 0); - } - - /* rcp.f bf, bf */ - instr = instr_create(ctx, 4, OPC_RCP); - vectorize(ctx, instr, &bf_dst, 1, bf_src, 0); - - /* That's right, subtract 2 as an integer from the float */ - /* add.u bf, bf, -2 */ - instr = instr_create(ctx, 2, OPC_ADD_U); - vectorize(ctx, instr, &bf_dst, 2, bf_src, 0, &negative_2, 0); - - /* mul.f q, af, bf */ - instr = instr_create(ctx, 2, OPC_MUL_F); - vectorize(ctx, instr, &q_dst, 2, af_src, 0, bf_src, 0); - - /* cov.f32[us]32 q, q */ - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = get_ftype(ctx); - instr->cat1.dst_type = src_type; - vectorize(ctx, instr, &q_dst, 1, q_src, 0); - - /* integer multiply q by b */ - /* mull.u r, q, b */ - instr = instr_create(ctx, 2, OPC_MULL_U); - vectorize(ctx, instr, &r_dst, 2, q_src, 0, b_src, 0); - - /* madsh.m16 r, q, b, r */ - instr = instr_create(ctx, 3, OPC_MADSH_M16); - vectorize(ctx, instr, &r_dst, 3, q_src, 0, b_src, 0, r_src, 0); - - /* madsh.m16, r, b, q, r */ - instr = instr_create(ctx, 3, OPC_MADSH_M16); - vectorize(ctx, 
-
-	/* sub.u r, a, r */
-	instr = instr_create(ctx, 2, OPC_SUB_U);
-	vectorize(ctx, instr, &r_dst, 2, a_src, 0, r_src, 0);
-
-	/* cov.u32f32, r, r */
-	instr = instr_create(ctx, 1, 0);
-	instr->cat1.src_type = get_utype(ctx);
-	instr->cat1.dst_type = get_ftype(ctx);
-	vectorize(ctx, instr, &r_dst, 1, r_src, 0);
-
-	/* mul.f r, r, bf */
-	instr = instr_create(ctx, 2, OPC_MUL_F);
-	vectorize(ctx, instr, &r_dst, 2, r_src, 0, bf_src, 0);
-
-	/* cov.f32u32 r, r */
-	instr = instr_create(ctx, 1, 0);
-	instr->cat1.src_type = get_ftype(ctx);
-	instr->cat1.dst_type = get_utype(ctx);
-	vectorize(ctx, instr, &r_dst, 1, r_src, 0);
-
-	/* add.u q, q, r */
-	instr = instr_create(ctx, 2, OPC_ADD_U);
-	vectorize(ctx, instr, &q_dst, 2, q_src, 0, r_src, 0);
-
-	/* mull.u r, q, b */
-	instr = instr_create(ctx, 2, OPC_MULL_U);
-	vectorize(ctx, instr, &r_dst, 2, q_src, 0, b_src, 0);
-
-	/* madsh.m16 r, q, b, r */
-	instr = instr_create(ctx, 3, OPC_MADSH_M16);
-	vectorize(ctx, instr, &r_dst, 3, q_src, 0, b_src, 0, r_src, 0);
-
-	/* madsh.m16 r, b, q, r */
-	instr = instr_create(ctx, 3, OPC_MADSH_M16);
-	vectorize(ctx, instr, &r_dst, 3, b_src, 0, q_src, 0, r_src, 0);
-
-	/* sub.u r, a, r */
-	instr = instr_create(ctx, 2, OPC_SUB_U);
-	vectorize(ctx, instr, &r_dst, 2, a_src, 0, r_src, 0);
-
-	/* cmps.u.ge r, r, b */
-	instr = instr_create(ctx, 2, OPC_CMPS_U);
-	instr->cat2.condition = IR3_COND_GE;
-	vectorize(ctx, instr, &r_dst, 2, r_src, 0, b_src, 0);
-
-	if (type_uint(src_type)) {
-		/* add.u dst, q, r */
-		instr = instr_create(ctx, 2, OPC_ADD_U);
-		vectorize(ctx, instr, premod_dst, 2, q_src, 0, r_src, 0);
-	} else {
-		/* add.u q, q, r */
-		instr = instr_create(ctx, 2, OPC_ADD_U);
-		vectorize(ctx, instr, &q_dst, 2, q_src, 0, r_src, 0);
-
-		/* negate result based on the original arguments */
-		if (is_const(a) && is_const(b))
-			a = get_unconst(ctx, a);
-
-		/* xor.b r, numerator, denominator */
-		instr = instr_create(ctx, 2, OPC_XOR_B);
-		vectorize(ctx, instr, &r_dst, 2, a, 0, b, 0);
-
-		/* shr.b r, r, 31 */
-		instr = instr_create(ctx, 2, OPC_SHR_B);
-		vectorize(ctx, instr, &r_dst, 2, r_src, 0, &thirty_one, 0);
-
-		/* absneg.s b, (neg)q */
-		instr = instr_create(ctx, 2, OPC_ABSNEG_S);
-		vectorize(ctx, instr, &b_dst, 1, q_src, IR3_REG_SNEG);
-
-		/* sel.b dst, b, r, q */
-		instr = instr_create(ctx, 3, OPC_SEL_B32);
-		vectorize(ctx, instr, premod_dst, 3, b_src, 0, r_src, 0, q_src, 0);
-	}
-
-	if (t->tgsi_opc == TGSI_OPCODE_MOD || t->tgsi_opc == TGSI_OPCODE_UMOD) {
-		/* The division result will have ended up in q. */
-
-		if (is_rel_or_const(b))
-			b = get_unconst(ctx, b);
-
-		/* mull.u r, q, b */
-		instr = instr_create(ctx, 2, OPC_MULL_U);
-		vectorize(ctx, instr, &r_dst, 2, q_src, 0, b, 0);
-
-		/* madsh.m16 r, q, b, r */
-		instr = instr_create(ctx, 3, OPC_MADSH_M16);
-		vectorize(ctx, instr, &r_dst, 3, q_src, 0, b, 0, r_src, 0);
-
-		/* madsh.m16 r, b, q, r */
-		instr = instr_create(ctx, 3, OPC_MADSH_M16);
-		vectorize(ctx, instr, &r_dst, 3, b, 0, q_src, 0, r_src, 0);
-
-		/* sub.u dst, a, r */
-		instr = instr_create(ctx, 2, OPC_SUB_U);
-		vectorize(ctx, instr, dst, 2, a, 0, r_src, 0);
-	}
-
-	put_dst(ctx, inst, dst);
-}
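
For the signed case and MOD/UMOD, the tail of trans_idiv() reduces to a sign fixup plus the `a - [IU]DIV(a, modulus) * modulus` identity from the comment above. A host-side restatement (illustrative only; INT_MIN edge cases ignored):

	#include <stdint.h>
	#include <stdlib.h>

	static int32_t idiv_emul(int32_t a, int32_t b)
	{
		/* unsigned core division on the absolute values: */
		uint32_t q = (uint32_t)abs(a) / (uint32_t)abs(b);
		/* xor of the operand signs picks the result sign (the
		 * xor.b/shr.b/sel.b sequence above):
		 */
		return ((a ^ b) < 0) ? -(int32_t)q : (int32_t)q;
	}

	static int32_t imod_emul(int32_t a, int32_t b)
	{
		return a - idiv_emul(a, b) * b;   /* a - IDIV(a, b) * b */
	}
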
-
-/*
- * Handlers for TGSI instructions which do have 1:1 mapping to native
- * instructions:
- */
-
-static void
-instr_cat0(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
+struct ir3_compiler * ir3_compiler_create(uint32_t gpu_id)
 {
-	instr_create(ctx, 0, t->opc);
+	struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);
+	compiler->gpu_id = gpu_id;
+	compiler->set = ir3_ra_alloc_reg_set(compiler);
+	return compiler;
 }
 
-static void
-instr_cat1(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
+void ir3_compiler_destroy(struct ir3_compiler *compiler)
 {
-	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
-	struct tgsi_src_register *src = &inst->Src[0].Register;
-
-	/* NOTE: atomic start/end, rather than in create_mov() since
-	 * create_mov() is used already w/in atomic sequences (and
-	 * we aren't clever enough to deal with the nesting)
-	 */
-	instr_atomic_start(ctx);
-	create_mov(ctx, dst, src);
-	instr_atomic_end(ctx);
-}
-
-static void
-instr_cat2(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct tgsi_dst_register *dst = get_dst(ctx, inst);
-	struct tgsi_src_register *src0 = &inst->Src[0].Register;
-	struct tgsi_src_register *src1 = &inst->Src[1].Register;
-	struct ir3_instruction *instr;
-	unsigned src0_flags = 0, src1_flags = 0;
-
-	switch (t->tgsi_opc) {
-	case TGSI_OPCODE_ABS:
-		src0_flags = IR3_REG_FABS;
-		break;
-	case TGSI_OPCODE_IABS:
-		src0_flags = IR3_REG_SABS;
-		break;
-	case TGSI_OPCODE_INEG:
-		src0_flags = IR3_REG_SNEG;
-		break;
-	case TGSI_OPCODE_SUB:
-		src1_flags = IR3_REG_FNEG;
-		break;
-	}
-
-	switch (t->opc) {
-	case OPC_ABSNEG_F:
-	case OPC_ABSNEG_S:
-	case OPC_CLZ_B:
-	case OPC_CLZ_S:
-	case OPC_SIGN_F:
-	case OPC_FLOOR_F:
-	case OPC_CEIL_F:
-	case OPC_RNDNE_F:
-	case OPC_RNDAZ_F:
-	case OPC_TRUNC_F:
-	case OPC_NOT_B:
-	case OPC_BFREV_B:
-	case OPC_SETRM:
-	case OPC_CBITS_B:
-		/* these only have one src reg */
-		instr = instr_create(ctx, 2, t->opc);
-		vectorize(ctx, instr, dst, 1, src0, src0_flags);
-		break;
-	default:
-		if (is_const(src0) && is_const(src1))
-			src0 = get_unconst(ctx, src0);
-
-		instr = instr_create(ctx, 2, t->opc);
-		vectorize(ctx, instr, dst, 2, src0, src0_flags,
-				src1, src1_flags);
-		break;
-	}
-
-	put_dst(ctx, inst, dst);
-}
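
The new ir3_compiler object above leans on ralloc parenting: the register set is rzalloc'd off the compiler, so the single ralloc_free() in ir3_compiler_destroy() (further below) releases everything at once. A minimal standalone illustration of that ownership pattern (the struct names here are made up):

	#include "util/ralloc.h"

	struct thing { int x; };

	static void ralloc_demo(void)
	{
		struct thing *parent = rzalloc(NULL, struct thing);
		struct thing *child  = rzalloc(parent, struct thing);  /* parented */
		(void)child;
		ralloc_free(parent);   /* frees parent and child together */
	}
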
-static void
-instr_cat3(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct tgsi_dst_register *dst = get_dst(ctx, inst);
-	struct tgsi_src_register *src0 = &inst->Src[0].Register;
-	struct tgsi_src_register *src1 = &inst->Src[1].Register;
-	struct ir3_instruction *instr;
-
-	/* in particular, can't handle const for src1 for cat3..
-	 * for mad, we can swap first two src's if needed:
-	 */
-	if (is_rel_or_const(src1)) {
-		if (is_mad(t->opc) && !is_rel_or_const(src0)) {
-			struct tgsi_src_register *tmp;
-			tmp = src0;
-			src0 = src1;
-			src1 = tmp;
-		} else {
-			src1 = get_unconst(ctx, src1);
-		}
-	}
-
-	instr = instr_create(ctx, 3, t->opc);
-	vectorize(ctx, instr, dst, 3, src0, 0, src1, 0,
-			&inst->Src[2].Register, 0);
-	put_dst(ctx, inst, dst);
-}
-
-static void
-instr_cat4(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct tgsi_dst_register *dst = get_dst(ctx, inst);
-	struct tgsi_src_register *src = &inst->Src[0].Register;
-	struct ir3_instruction *instr;
-	unsigned i;
-
-	/* seems like blob compiler avoids const as src.. */
-	if (is_const(src))
-		src = get_unconst(ctx, src);
-
-	/* we need to replicate into each component: */
-	for (i = 0; i < 4; i++) {
-		if (dst->WriteMask & (1 << i)) {
-			instr = instr_create(ctx, 4, t->opc);
-			add_dst_reg(ctx, instr, dst, i);
-			add_src_reg(ctx, instr, src, src->SwizzleX);
-		}
-	}
-
-	put_dst(ctx, inst, dst);
-}
-
-static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
-#define INSTR(n, f, ...) \
-	[TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ }
-
-	INSTR(MOV, instr_cat1),
-	INSTR(RCP, instr_cat4, .opc = OPC_RCP),
-	INSTR(RSQ, instr_cat4, .opc = OPC_RSQ),
-	INSTR(SQRT, instr_cat4, .opc = OPC_SQRT),
-	INSTR(MUL, instr_cat2, .opc = OPC_MUL_F),
-	INSTR(ADD, instr_cat2, .opc = OPC_ADD_F),
-	INSTR(SUB, instr_cat2, .opc = OPC_ADD_F),
-	INSTR(MIN, instr_cat2, .opc = OPC_MIN_F),
-	INSTR(MAX, instr_cat2, .opc = OPC_MAX_F),
-	INSTR(UADD, instr_cat2, .opc = OPC_ADD_U),
-	INSTR(IMIN, instr_cat2, .opc = OPC_MIN_S),
-	INSTR(UMIN, instr_cat2, .opc = OPC_MIN_U),
-	INSTR(IMAX, instr_cat2, .opc = OPC_MAX_S),
-	INSTR(UMAX, instr_cat2, .opc = OPC_MAX_U),
-	INSTR(AND, instr_cat2, .opc = OPC_AND_B),
-	INSTR(OR, instr_cat2, .opc = OPC_OR_B),
-	INSTR(NOT, instr_cat2, .opc = OPC_NOT_B),
-	INSTR(XOR, instr_cat2, .opc = OPC_XOR_B),
-	INSTR(UMUL, trans_umul),
-	INSTR(UMAD, trans_umul),
-	INSTR(UDIV, trans_idiv),
-	INSTR(IDIV, trans_idiv),
-	INSTR(MOD, trans_idiv),
-	INSTR(UMOD, trans_idiv),
-	INSTR(SHL, instr_cat2, .opc = OPC_SHL_B),
-	INSTR(USHR, instr_cat2, .opc = OPC_SHR_B),
-	INSTR(ISHR, instr_cat2, .opc = OPC_ASHR_B),
-	INSTR(IABS, instr_cat2, .opc = OPC_ABSNEG_S),
-	INSTR(INEG, instr_cat2, .opc = OPC_ABSNEG_S),
-	INSTR(AND, instr_cat2, .opc = OPC_AND_B),
-	INSTR(MAD, instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16),
-	INSTR(TRUNC, instr_cat2, .opc = OPC_TRUNC_F),
-	INSTR(CLAMP, trans_clamp),
-	INSTR(FLR, instr_cat2, .opc = OPC_FLOOR_F),
-	INSTR(ROUND, instr_cat2, .opc = OPC_RNDNE_F),
-	INSTR(SSG, instr_cat2, .opc = OPC_SIGN_F),
-	INSTR(CEIL, instr_cat2, .opc = OPC_CEIL_F),
-	INSTR(ARL, trans_arl),
-	INSTR(UARL, trans_arl),
-	INSTR(EX2, instr_cat4, .opc = OPC_EXP2),
-	INSTR(LG2, instr_cat4, .opc = OPC_LOG2),
-	INSTR(ABS, instr_cat2, .opc = OPC_ABSNEG_F),
-	INSTR(COS, instr_cat4, .opc = OPC_COS),
-	INSTR(SIN, instr_cat4, .opc = OPC_SIN),
-	INSTR(TEX, trans_samp, .opc = OPC_SAM),
-	INSTR(TXP, trans_samp, .opc = OPC_SAM),
-	INSTR(TXB, trans_samp, .opc = OPC_SAMB),
-	INSTR(TXB2, trans_samp, .opc = OPC_SAMB),
-	INSTR(TXL, trans_samp, .opc = OPC_SAML),
-	INSTR(TXD, trans_samp, .opc = OPC_SAMGQ),
-	INSTR(TXF, trans_samp, .opc = OPC_ISAML),
-	INSTR(TXQ, trans_txq),
-	INSTR(DDX, trans_deriv, .opc = OPC_DSX),
-	INSTR(DDY, trans_deriv, .opc = OPC_DSY),
-	INSTR(SGT, trans_cmp),
-	INSTR(SLT, trans_cmp),
-	INSTR(FSLT, trans_cmp),
-	INSTR(SGE, trans_cmp),
-	INSTR(FSGE, trans_cmp),
-	INSTR(SLE, trans_cmp),
-	INSTR(SNE, trans_cmp),
-	INSTR(FSNE, trans_cmp),
-	INSTR(SEQ, trans_cmp),
-	INSTR(FSEQ, trans_cmp),
-	INSTR(CMP, trans_cmp),
-	INSTR(USNE, trans_icmp, .opc = OPC_CMPS_U),
-	INSTR(USEQ, trans_icmp, .opc = OPC_CMPS_U),
-	INSTR(ISGE, trans_icmp, .opc = OPC_CMPS_S),
-	INSTR(USGE, trans_icmp, .opc = OPC_CMPS_U),
-	INSTR(ISLT, trans_icmp, .opc = OPC_CMPS_S),
-	INSTR(USLT, trans_icmp, .opc = OPC_CMPS_U),
-	INSTR(UCMP, trans_ucmp),
-	INSTR(ISSG, trans_issg),
-	INSTR(IF, trans_if, .opc = OPC_CMPS_F),
-	INSTR(UIF, trans_if, .opc = OPC_CMPS_U),
-	INSTR(ELSE, trans_else),
-	INSTR(ENDIF, trans_endif),
-	INSTR(END, instr_cat0, .opc = OPC_END),
-	INSTR(KILL, trans_kill, .opc = OPC_KILL),
-	INSTR(KILL_IF, trans_killif, .opc = OPC_KILL),
-	INSTR(I2F, trans_cov),
-	INSTR(U2F, trans_cov),
-	INSTR(F2I, trans_cov),
-	INSTR(F2U, trans_cov),
-};
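
The INSTR() macro above fills a sparse dispatch table with C99 designated initializers, which is what lets the token loop route each TGSI opcode through `translaters[opc].fxn`. The same pattern in miniature (standalone, illustrative):

	#include <stdio.h>

	enum op { OP_ADD, OP_MUL, OP_LAST };

	typedef int (*handler_t)(int, int);

	static int do_add(int a, int b) { return a + b; }
	static int do_mul(int a, int b) { return a * b; }

	/* sparse table keyed by opcode, like translaters[] above;
	 * unlisted opcodes are NULL and can be rejected at dispatch:
	 */
	static const handler_t handlers[OP_LAST] = {
		[OP_ADD] = do_add,
		[OP_MUL] = do_mul,
	};

	int main(void)
	{
		printf("%d\n", handlers[OP_MUL](6, 7));  /* 42 */
		return 0;
	}
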
-
-static ir3_semantic
-decl_semantic(const struct tgsi_declaration_semantic *sem)
-{
-	return ir3_semantic_name(sem->Name, sem->Index);
-}
-
-static struct ir3_instruction *
-decl_in_frag_bary(struct ir3_compile_context *ctx, unsigned regid,
-		unsigned j, unsigned inloc, bool use_ldlv)
-{
-	struct ir3_instruction *instr;
-	struct ir3_register *src;
-
-	if (use_ldlv) {
-		/* ldlv.u32 dst, l[#inloc], 1 */
-		instr = instr_create(ctx, 6, OPC_LDLV);
-		instr->cat6.type = TYPE_U32;
-		instr->cat6.iim_val = 1;
-		ir3_reg_create(instr, regid, 0);   /* dummy dst */
-		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = inloc;
-		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
-
-		return instr;
-	}
-
-	/* bary.f dst, #inloc, r0.x */
-	instr = instr_create(ctx, 2, OPC_BARY_F);
-	ir3_reg_create(instr, regid, 0);   /* dummy dst */
-	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = inloc;
-	src = ir3_reg_create(instr, 0, IR3_REG_SSA);
-	src->wrmask = 0x3;
-	src->instr = ctx->frag_pos;
-
-	return instr;
-}
-
-/* TGSI_SEMANTIC_POSITION
- * """"""""""""""""""""""
- *
- * For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that
- * fragment shader input contains the fragment's window position.  The X
- * component starts at zero and always increases from left to right.
- * The Y component starts at zero and always increases but Y=0 may either
- * indicate the top of the window or the bottom depending on the fragment
- * coordinate origin convention (see TGSI_PROPERTY_FS_COORD_ORIGIN).
- * The Z coordinate ranges from 0 to 1 to represent depth from the front
- * to the back of the Z buffer.  The W component contains the reciprocal
- * of the interpolated vertex position W component.
- */
-static struct ir3_instruction *
-decl_in_frag_coord(struct ir3_compile_context *ctx, unsigned regid,
-		unsigned j)
-{
-	struct ir3_instruction *instr, *src;
-
-	compile_assert(ctx, !ctx->frag_coord[j]);
-
-	ctx->frag_coord[j] = create_input(ctx->block, NULL, 0);
-
-
-	switch (j) {
-	case 0: /* .x */
-	case 1: /* .y */
-		/* for frag_coord, we get unsigned values..  we need
-		 * to subtract (integer) 8 and divide by 16 (right-
-		 * shift by 4) then convert to float:
-		 */
-
-		/* add.s tmp, src, -8 */
-		instr = instr_create(ctx, 2, OPC_ADD_S);
-		ir3_reg_create(instr, regid, 0);   /* dummy dst */
-		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_coord[j];
-		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = -8;
-		src = instr;
-
-		/* shr.b tmp, tmp, 4 */
-		instr = instr_create(ctx, 2, OPC_SHR_B);
-		ir3_reg_create(instr, regid, 0);   /* dummy dst */
-		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
-		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4;
-		src = instr;
-
-		/* mov.u32f32 dst, tmp */
-		instr = instr_create(ctx, 1, 0);
-		instr->cat1.src_type = TYPE_U32;
-		instr->cat1.dst_type = TYPE_F32;
-		ir3_reg_create(instr, regid, 0);   /* dummy dst */
-		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
-
-		break;
-	case 2: /* .z */
-	case 3: /* .w */
-		/* seems that we can use these as-is: */
-		instr = ctx->frag_coord[j];
-		break;
-	default:
-		compile_error(ctx, "invalid channel\n");
-		instr = create_immed(ctx, 0.0);
-		break;
-	}
-
-	return instr;
-}
-
-/* TGSI_SEMANTIC_FACE
- * """"""""""""""""""
- *
- * This label applies to fragment shader inputs only and indicates that
- * the register contains front/back-face information of the form (F, 0,
- * 0, 1).  The first component will be positive when the fragment belongs
- * to a front-facing polygon, and negative when the fragment belongs to a
- * back-facing polygon.
- */
-static struct ir3_instruction *
-decl_in_frag_face(struct ir3_compile_context *ctx, unsigned regid,
-		unsigned j)
-{
-	struct ir3_instruction *instr, *src;
-
-	switch (j) {
-	case 0: /* .x */
-		compile_assert(ctx, !ctx->frag_face);
-
-		ctx->frag_face = create_input(ctx->block, NULL, 0);
-
-		/* for faceness, we always get -1 or 0 (int).. but TGSI expects
-		 * positive vs negative float.. and piglit further seems to
-		 * expect -1.0 or 1.0:
-		 *
-		 *    mul.s tmp, hr0.x, 2
-		 *    add.s tmp, tmp, 1
-		 *    mov.s16f32, dst, tmp
-		 *
-		 */
-
-		instr = instr_create(ctx, 2, OPC_MUL_S);
-		ir3_reg_create(instr, regid, 0);   /* dummy dst */
-		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_face;
-		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;
-		src = instr;
-
-		instr = instr_create(ctx, 2, OPC_ADD_S);
-		ir3_reg_create(instr, regid, 0);   /* dummy dst */
-		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
-		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
-		src = instr;
-
-		instr = instr_create(ctx, 1, 0);   /* mov */
-		instr->cat1.src_type = TYPE_S32;
-		instr->cat1.dst_type = TYPE_F32;
-		ir3_reg_create(instr, regid, 0);   /* dummy dst */
-		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
-
-		break;
-	case 1: /* .y */
-	case 2: /* .z */
-		instr = create_immed(ctx, 0.0);
-		break;
-	case 3: /* .w */
-		instr = create_immed(ctx, 1.0);
-		break;
-	default:
-		compile_error(ctx, "invalid channel\n");
-		instr = create_immed(ctx, 0.0);
-		break;
-	}
-
-	return instr;
-}
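
The two fragment-input fixups above are easier to read as plain arithmetic. A host-side sketch (illustrative; assumes x/y arrive as pixel-center positions in 1/16-pixel units, i.e. 16*x + 8, and face arrives as -1 for back / 0 for front):

	#include <stdint.h>

	/* add.s -8; shr.b 4; mov.u32f32 -- recovers the integer pixel: */
	static float frag_coord_chan(uint32_t v)
	{
		return (float)((v - 8) >> 4);
	}

	/* mul.s 2; add.s 1; mov to float -- maps {-1, 0} to {-1.0, 1.0}: */
	static float face_value(int32_t f)
	{
		return (float)(f * 2 + 1);
	}
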
-
-static void
-decl_in(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
-{
-	struct ir3_shader_variant *so = ctx->so;
-	unsigned name = decl->Semantic.Name;
-	unsigned i;
-
-	/* I don't think we should get frag shader input without
-	 * semantic info?  Otherwise how do inputs get linked to
-	 * vert outputs?
-	 */
-	compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) ||
-			decl->Declaration.Semantic);
-
-	for (i = decl->Range.First; i <= decl->Range.Last; i++) {
-		unsigned n = so->inputs_count++;
-		unsigned r = regid(i, 0);
-		unsigned ncomp, j;
-
-		/* we'll figure out the actual components used after scheduling */
-		ncomp = 4;
-
-		DBG("decl in -> r%d", i);
-
-		compile_assert(ctx, n < ARRAY_SIZE(so->inputs));
-
-		so->inputs[n].semantic = decl_semantic(&decl->Semantic);
-		so->inputs[n].compmask = (1 << ncomp) - 1;
-		so->inputs[n].regid = r;
-		so->inputs[n].inloc = ctx->next_inloc;
-		so->inputs[n].interpolate = decl->Interp.Interpolate;
-
-		for (j = 0; j < ncomp; j++) {
-			struct ir3_instruction *instr = NULL;
-
-			if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
-				/* for fragment shaders, POSITION and FACE are handled
-				 * specially, not using normal varying / bary.f
-				 */
-				if (name == TGSI_SEMANTIC_POSITION) {
-					so->inputs[n].bary = false;
-					so->frag_coord = true;
-					instr = decl_in_frag_coord(ctx, r + j, j);
-				} else if (name == TGSI_SEMANTIC_FACE) {
-					so->inputs[n].bary = false;
-					so->frag_face = true;
-					instr = decl_in_frag_face(ctx, r + j, j);
-				} else {
-					bool use_ldlv = false;
-
-					/* if no interpolation given, pick based on
-					 * semantic:
-					 */
-					if (!decl->Declaration.Interpolate) {
-						switch (decl->Semantic.Name) {
-						case TGSI_SEMANTIC_COLOR:
-							so->inputs[n].interpolate =
-									TGSI_INTERPOLATE_COLOR;
-							break;
-						default:
-							so->inputs[n].interpolate =
-									TGSI_INTERPOLATE_LINEAR;
-						}
-					}
-
-					if (ctx->flat_bypass) {
-						switch (so->inputs[n].interpolate) {
-						case TGSI_INTERPOLATE_COLOR:
-							if (!ctx->so->key.rasterflat)
-								break;
-							/* fallthrough */
-						case TGSI_INTERPOLATE_CONSTANT:
-							use_ldlv = true;
-							break;
-						}
-					}
-
-					so->inputs[n].bary = true;
-
-					instr = decl_in_frag_bary(ctx, r + j, j,
-							so->inputs[n].inloc + j - 8, use_ldlv);
-				}
-			} else {
-				instr = create_input(ctx->block, NULL, (i * 4) + j);
-			}
-
-			ctx->block->inputs[(i * 4) + j] = instr;
-		}
-
-		if (so->inputs[n].bary || (ctx->type == TGSI_PROCESSOR_VERTEX)) {
-			ctx->next_inloc += ncomp;
-			so->total_in += ncomp;
-		}
-	}
-}
-
-static void
-decl_sv(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
-{
-	struct ir3_shader_variant *so = ctx->so;
-	unsigned r = regid(so->inputs_count, 0);
-	unsigned n = so->inputs_count++;
-
-	DBG("decl sv -> r%d", n);
-
-	compile_assert(ctx, n < ARRAY_SIZE(so->inputs));
-	compile_assert(ctx, decl->Range.First < ARRAY_SIZE(ctx->sysval_semantics));
-
-	ctx->sysval_semantics[decl->Range.First] = decl->Semantic.Name;
-	so->inputs[n].semantic = decl_semantic(&decl->Semantic);
-	so->inputs[n].compmask = 1;
-	so->inputs[n].regid = r;
-	so->inputs[n].inloc = ctx->next_inloc;
-	so->inputs[n].interpolate = TGSI_INTERPOLATE_CONSTANT;
-
-	struct ir3_instruction *instr = NULL;
-
-	switch (decl->Semantic.Name) {
-	case TGSI_SEMANTIC_VERTEXID_NOBASE:
-		ctx->vertex_id = instr = create_input(ctx->block, NULL, r);
-		break;
-	case TGSI_SEMANTIC_BASEVERTEX:
-		ctx->basevertex = instr = instr_create(ctx, 1, 0);
-		instr->cat1.src_type = get_stype(ctx);
-		instr->cat1.dst_type = get_stype(ctx);
-		ir3_reg_create(instr, 0, 0);
-		ir3_reg_create(instr, regid(so->first_driver_param + 4, 0),
-				IR3_REG_CONST);
-		break;
-	case TGSI_SEMANTIC_INSTANCEID:
-		ctx->instance_id = instr = create_input(ctx->block, NULL, r);
-		break;
-	default:
-		compile_error(ctx, "Unknown semantic: %s\n",
-				tgsi_semantic_names[decl->Semantic.Name]);
-	}
-
-	ctx->block->inputs[r] = instr;
-	ctx->next_inloc++;
-	so->total_in++;
-}
-
-static void
-decl_out(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
-{
-	struct ir3_shader_variant *so = ctx->so;
-	unsigned comp = 0;
-	unsigned name = decl->Semantic.Name;
-	unsigned i;
-
-	compile_assert(ctx, decl->Declaration.Semantic);
-
-	DBG("decl out[%d] -> r%d", name, decl->Range.First);
-
-	if (ctx->type == TGSI_PROCESSOR_VERTEX) {
-		switch (name) {
-		case TGSI_SEMANTIC_POSITION:
-			so->writes_pos = true;
-			break;
-		case TGSI_SEMANTIC_PSIZE:
-			so->writes_psize = true;
-			break;
-		case TGSI_SEMANTIC_COLOR:
-		case TGSI_SEMANTIC_BCOLOR:
-		case TGSI_SEMANTIC_GENERIC:
-		case TGSI_SEMANTIC_FOG:
-		case TGSI_SEMANTIC_TEXCOORD:
-			break;
-		default:
-			compile_error(ctx, "unknown VS semantic name: %s\n",
-					tgsi_semantic_names[name]);
-		}
-	} else {
-		switch (name) {
-		case TGSI_SEMANTIC_POSITION:
-			comp = 2;  /* tgsi will write to .z component */
-			so->writes_pos = true;
-			break;
-		case TGSI_SEMANTIC_COLOR:
-			break;
-		default:
-			compile_error(ctx, "unknown FS semantic name: %s\n",
-					tgsi_semantic_names[name]);
-		}
-	}
-
-	for (i = decl->Range.First; i <= decl->Range.Last; i++) {
-		unsigned n = so->outputs_count++;
-		unsigned ncomp, j;
-
-		ncomp = 4;
-
-		compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
-
-		so->outputs[n].semantic = decl_semantic(&decl->Semantic);
-		so->outputs[n].regid = regid(i, comp);
-
-		/* avoid undefined outputs, stick a dummy mov from imm{0.0},
-		 * which if the output is actually assigned will be over-
-		 * written
-		 */
-		for (j = 0; j < ncomp; j++)
-			ctx->block->outputs[(i * 4) + j] = create_immed(ctx, 0.0);
-	}
-}
-
-/* from TGSI perspective, we actually have inputs.  But most of the "inputs"
- * for a fragment shader are just bary.f instructions.  The *actual* inputs
- * from the hw perspective are the frag_pos and optionally frag_coord and
- * frag_face.
- */
-static void
-fixup_frag_inputs(struct ir3_compile_context *ctx)
-{
-	struct ir3_shader_variant *so = ctx->so;
-	struct ir3_block *block = ctx->block;
-	struct ir3_instruction **inputs;
-	struct ir3_instruction *instr;
-	int n, regid = 0;
-
-	block->ninputs = 0;
-
-	n = 4;  /* always have frag_pos */
-	n += COND(so->frag_face, 4);
-	n += COND(so->frag_coord, 4);
-
-	inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *)));
-
-	if (so->frag_face) {
-		/* this ultimately gets assigned to hr0.x so doesn't conflict
-		 * with frag_coord/frag_pos..
-		 */
-		inputs[block->ninputs++] = ctx->frag_face;
-		ctx->frag_face->regs[0]->num = 0;
-
-		/* remaining channels not used, but let's avoid confusing
-		 * other parts that expect inputs to come in groups of vec4
-		 */
-		inputs[block->ninputs++] = NULL;
-		inputs[block->ninputs++] = NULL;
-		inputs[block->ninputs++] = NULL;
-	}
-
-	/* since we don't know where to set the regid for frag_coord,
-	 * we have to use r0.x for it.  But we don't want to *always*
-	 * use r1.x for frag_pos as that could increase the register
-	 * footprint on simple shaders:
-	 */
-	if (so->frag_coord) {
-		ctx->frag_coord[0]->regs[0]->num = regid++;
-		ctx->frag_coord[1]->regs[0]->num = regid++;
-		ctx->frag_coord[2]->regs[0]->num = regid++;
-		ctx->frag_coord[3]->regs[0]->num = regid++;
-
-		inputs[block->ninputs++] = ctx->frag_coord[0];
-		inputs[block->ninputs++] = ctx->frag_coord[1];
-		inputs[block->ninputs++] = ctx->frag_coord[2];
-		inputs[block->ninputs++] = ctx->frag_coord[3];
-	}
-
-	/* we always have frag_pos: */
-	so->pos_regid = regid;
-
-	/* r0.x */
-	instr = create_input(block, NULL, block->ninputs);
-	instr->regs[0]->num = regid++;
-	inputs[block->ninputs++] = instr;
-	ctx->frag_pos->regs[1]->instr = instr;
-
-	/* r0.y */
-	instr = create_input(block, NULL, block->ninputs);
-	instr->regs[0]->num = regid++;
-	inputs[block->ninputs++] = instr;
-	ctx->frag_pos->regs[2]->instr = instr;
-
-	block->inputs = inputs;
-}
-
-static void
-compile_instructions(struct ir3_compile_context *ctx)
-{
-	push_block(ctx);
-
-	/* for fragment shader, we have a single input register (usually
-	 * r0.xy) which is used as the base for bary.f varying fetch instrs:
-	 */
-	if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
-		struct ir3_instruction *instr;
-		instr = ir3_instr_create(ctx->block, -1, OPC_META_FI);
-		ir3_reg_create(instr, 0, 0);
-		ir3_reg_create(instr, 0, IR3_REG_SSA);   /* r0.x */
-		ir3_reg_create(instr, 0, IR3_REG_SSA);   /* r0.y */
-		ctx->frag_pos = instr;
-	}
-
-	while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
-		tgsi_parse_token(&ctx->parser);
-
-		switch (ctx->parser.FullToken.Token.Type) {
-		case TGSI_TOKEN_TYPE_DECLARATION: {
-			struct tgsi_full_declaration *decl =
-					&ctx->parser.FullToken.FullDeclaration;
-			unsigned file = decl->Declaration.File;
-			if (file == TGSI_FILE_OUTPUT) {
-				decl_out(ctx, decl);
-			} else if (file == TGSI_FILE_INPUT) {
-				decl_in(ctx, decl);
-			} else if (decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
-				decl_sv(ctx, decl);
-			}
-
-			if ((file != TGSI_FILE_CONSTANT) && decl->Declaration.Array) {
-				int aid = decl->Array.ArrayID + ctx->array_offsets[file];
-
-				compile_assert(ctx, aid < ARRAY_SIZE(ctx->array));
-
-				/* legacy ArrayID==0 stuff probably isn't going to work
-				 * well (and is at least untested..  let's just scream:
-				 */
-				compile_assert(ctx, aid != 0);
-
-				ctx->array[aid].first = decl->Range.First;
-				ctx->array[aid].last = decl->Range.Last;
-			}
-			break;
-		}
-		case TGSI_TOKEN_TYPE_IMMEDIATE: {
-			/* TODO: if we know the immediate is small enough, and only
-			 * used with instructions that can embed an immediate, we
-			 * can skip this:
-			 */
-			struct tgsi_full_immediate *imm =
-					&ctx->parser.FullToken.FullImmediate;
-			unsigned n = ctx->so->immediates_count++;
-			compile_assert(ctx, n < ARRAY_SIZE(ctx->so->immediates));
-			memcpy(ctx->so->immediates[n].val, imm->u, 16);
-			break;
-		}
-		case TGSI_TOKEN_TYPE_INSTRUCTION: {
-			struct tgsi_full_instruction *inst =
-					&ctx->parser.FullToken.FullInstruction;
-			unsigned opc = inst->Instruction.Opcode;
-			const struct instr_translater *t = &translaters[opc];
-
-			if (t->fxn) {
-				t->fxn(t, ctx, inst);
-				ctx->num_internal_temps = 0;
-
-				compile_assert(ctx, !ctx->using_tmp_dst);
-			} else {
-				compile_error(ctx, "unknown TGSI opc: %s\n",
-						tgsi_get_opcode_name(opc));
-			}
-
-			switch (inst->Instruction.Saturate) {
-			case TGSI_SAT_ZERO_ONE:
-				create_clamp_imm(ctx, &inst->Dst[0].Register,
-						fui(0.0), fui(1.0));
-				break;
-			case TGSI_SAT_MINUS_PLUS_ONE:
-				create_clamp_imm(ctx, &inst->Dst[0].Register,
-						fui(-1.0), fui(1.0));
-				break;
-			}
-
-			instr_finish(ctx);
-
-			break;
-		}
-		case TGSI_TOKEN_TYPE_PROPERTY: {
-			struct tgsi_full_property *prop =
-					&ctx->parser.FullToken.FullProperty;
-			switch (prop->Property.PropertyName) {
-			case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
-				ctx->so->color0_mrt = !!prop->u[0].Data;
-				break;
-			}
-		}
-		default:
-			break;
-		}
-	}
-}
-
-static void
-compile_dump(struct ir3_compile_context *ctx)
-{
-	const char *name = (ctx->so->type == SHADER_VERTEX) ? "vert" : "frag";
-	static unsigned n = 0;
-	char fname[16];
-	FILE *f;
-	snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++);
-	f = fopen(fname, "w");
-	if (!f)
-		return;
-	ir3_block_depth(ctx->block);
-	ir3_dump(ctx->ir, name, ctx->block, f);
-	fclose(f);
-}
-
-int
-ir3_compile_shader(struct ir3_shader_variant *so,
-		const struct tgsi_token *tokens, struct ir3_shader_key key,
-		bool cp)
-{
-	struct ir3_compile_context ctx;
-	struct ir3_block *block;
-	struct ir3_instruction **inputs;
-	unsigned i, j, actual_in;
-	int ret = 0, max_bary;
-
-	assert(!so->ir);
-
-	so->ir = ir3_create();
-
-	assert(so->ir);
-
-	if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) {
-		DBG("INIT failed!");
-		ret = -1;
-		goto out;
-	}
-
-	/* for now, until the edge cases are worked out: */
-	if (ctx.info.indirect_files_written & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT)))
-		cp = false;
-
-	compile_instructions(&ctx);
-
-	block = ctx.block;
-	so->ir->block = block;
-
-	/* keep track of the inputs from TGSI perspective.. */
-	inputs = block->inputs;
-
-	/* but fixup actual inputs for frag shader: */
-	if (ctx.type == TGSI_PROCESSOR_FRAGMENT)
-		fixup_frag_inputs(&ctx);
-
-	/* at this point, for binning pass, throw away unneeded outputs: */
-	if (key.binning_pass) {
-		for (i = 0, j = 0; i < so->outputs_count; i++) {
-			unsigned name = sem2name(so->outputs[i].semantic);
-			unsigned idx = sem2idx(so->outputs[i].semantic);
-
-			/* throw away everything but first position/psize */
-			if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) ||
-					(name == TGSI_SEMANTIC_PSIZE))) {
-				if (i != j) {
-					so->outputs[j] = so->outputs[i];
-					block->outputs[(j*4)+0] = block->outputs[(i*4)+0];
-					block->outputs[(j*4)+1] = block->outputs[(i*4)+1];
-					block->outputs[(j*4)+2] = block->outputs[(i*4)+2];
-					block->outputs[(j*4)+3] = block->outputs[(i*4)+3];
-				}
-				j++;
-			}
-		}
-		so->outputs_count = j;
-		block->noutputs = j * 4;
-	}
-
-	/* if we want half-precision outputs, mark the output registers
-	 * as half:
-	 */
-	if (key.half_precision) {
-		for (i = 0; i < block->noutputs; i++) {
-			if (!block->outputs[i])
-				continue;
-			block->outputs[i]->regs[0]->flags |= IR3_REG_HALF;
-		}
-	}
-
-	/* at this point, we want the kill's in the outputs array too,
-	 * so that they get scheduled (since they have no dst).. we've
-	 * already ensured that the array is big enough in push_block():
-	 */
-	if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
-		for (i = 0; i < ctx.kill_count; i++)
-			block->outputs[block->noutputs++] = ctx.kill[i];
-	}
-
-	if (fd_mesa_debug & FD_DBG_OPTDUMP)
-		compile_dump(&ctx);
-
-	ret = ir3_block_flatten(block);
-	if (ret < 0) {
-		DBG("FLATTEN failed!");
-		goto out;
-	}
-	if ((ret > 0) && (fd_mesa_debug & FD_DBG_OPTDUMP))
-		compile_dump(&ctx);
-
-	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
-		printf("BEFORE CP:\n");
-		ir3_dump_instr_list(block->head);
-	}
-
-	ir3_block_depth(block);
-
-	/* First remove all the extra mov's (which we could skip if the
-	 * front-end was clever enough not to insert them in the first
-	 * place).  Then figure out left/right neighbors, re-inserting
-	 * extra mov's when needed to avoid conflicts.
-	 */
-	if (cp && !(fd_mesa_debug & FD_DBG_NOCP))
-		ir3_block_cp(block);
-
-	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
-		printf("BEFORE GROUPING:\n");
-		ir3_dump_instr_list(block->head);
-	}
-
-	/* Group left/right neighbors, inserting mov's where needed to
-	 * solve conflicts:
-	 */
-	ir3_block_group(block);
-
-	if (fd_mesa_debug & FD_DBG_OPTDUMP)
-		compile_dump(&ctx);
-
-	ir3_block_depth(block);
-
-	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
-		printf("AFTER DEPTH:\n");
-		ir3_dump_instr_list(block->head);
-	}
-
-	ret = ir3_block_sched(block);
-	if (ret) {
-		DBG("SCHED failed!");
-		goto out;
-	}
-
-	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
-		printf("AFTER SCHED:\n");
-		ir3_dump_instr_list(block->head);
-	}
-
-	ret = ir3_block_ra(block, so->type, so->frag_coord, so->frag_face);
-	if (ret) {
-		DBG("RA failed!");
-		goto out;
-	}
-
-	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
-		printf("AFTER RA:\n");
-		ir3_dump_instr_list(block->head);
-	}
-
-	ir3_block_legalize(block, &so->has_samp, &max_bary);
-
-	/* fixup input/outputs: */
-	for (i = 0; i < so->outputs_count; i++) {
-		so->outputs[i].regid = block->outputs[i*4]->regs[0]->num;
-		/* preserve hack for depth output..  tgsi writes depth to .z,
-		 * but what we give the hw is the scalar register:
-		 */
-		if ((ctx.type == TGSI_PROCESSOR_FRAGMENT) &&
-			(sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION))
-			so->outputs[i].regid += 2;
-	}
-	/* Note that some or all channels of an input may be unused: */
-	actual_in = 0;
-	for (i = 0; i < so->inputs_count; i++) {
-		unsigned j, regid = ~0, compmask = 0;
-		so->inputs[i].ncomp = 0;
-		for (j = 0; j < 4; j++) {
-			struct ir3_instruction *in = inputs[(i*4) + j];
-			if (in) {
-				compmask |= (1 << j);
-				regid = in->regs[0]->num - j;
-				actual_in++;
-				so->inputs[i].ncomp++;
-			}
-		}
-		so->inputs[i].regid = regid;
-		so->inputs[i].compmask = compmask;
-	}
-
-	/* fragment shader always gets full vec4's even if it doesn't
-	 * fetch all components, but vertex shader we need to update
-	 * with the actual number of components fetched, otherwise things
-	 * will hang due to mismatch between VFD_DECODE's and
-	 * TOTALATTRTOVS
-	 */
-	if (so->type == SHADER_VERTEX)
-		so->total_in = actual_in;
-	else
-		so->total_in = align(max_bary + 1, 4);
-
-out:
-	if (ret) {
-		ir3_destroy(so->ir);
-		so->ir = NULL;
-	}
-	compile_free(&ctx);
-
-	return ret;
+	ralloc_free(compiler);
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
index 9213386e00c..86b1161d9cb 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
@@ -31,12 +31,19 @@
 
 #include "ir3_shader.h"
 
+struct ir3_ra_reg_set;
 
-int ir3_compile_shader_nir(struct ir3_shader_variant *so,
-		const struct tgsi_token *tokens, struct ir3_shader_key key);
+struct ir3_compiler {
+	uint32_t gpu_id;
+	struct ir3_ra_reg_set *set;
+};
 
-int ir3_compile_shader(struct ir3_shader_variant *so,
+struct ir3_compiler * ir3_compiler_create(uint32_t gpu_id);
+void ir3_compiler_destroy(struct ir3_compiler *compiler);
+
+int ir3_compile_shader_nir(struct ir3_compiler *compiler,
+		struct ir3_shader_variant *so,
 		const struct tgsi_token *tokens,
-		struct ir3_shader_key key, bool cp);
+		struct ir3_shader_key key);
 
 #endif /* IR3_COMPILER_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 05e7049ad55..48b1d8f3606 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -48,19 +48,19 @@
 
 #include "ir3.h"
 
-static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
-
 struct ir3_compile {
+	struct ir3_compiler *compiler;
+
 	const struct tgsi_token *tokens;
 	struct nir_shader *s;
 
 	struct ir3 *ir;
 	struct ir3_shader_variant *so;
 
-	/* bitmask of which samplers are integer: */
-	uint16_t integer_s;
+	struct ir3_block *block;      /* the current block */
+	struct ir3_block *in_block;   /* block created for shader inputs */
 
-	struct ir3_block *block;
+	nir_function_impl *impl;
 
 	/* For fragment shaders, from the hw perspective the only
 	 * actual input is r0.xy position register passed to bary.f.
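
With the ir3_compiler object in place, callers pass it explicitly and the old `cp` flag is gone; the hunks below also drop the `!so->shader` standalone-compiler fallbacks, since gpu_id now comes from the compiler object. A hedged caller sketch against the new prototypes (420 is just an example gpu-id; shader-variant setup is elided):

	#include "ir3_compiler.h"

	/* illustrative only -- e.g. what the standalone cmdline compiler
	 * can now do instead of relying on screen/context fallbacks:
	 */
	static int compile_standalone(struct ir3_shader_variant *v,
			const struct tgsi_token *tokens, struct ir3_shader_key key)
	{
		struct ir3_compiler *compiler = ir3_compiler_create(420);
		int ret = ir3_compile_shader_nir(compiler, v, tokens, key);
		ir3_compiler_destroy(compiler);
		return ret;
	}
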
@@ -92,6 +92,11 @@ struct ir3_compile {
 	 */
 	struct hash_table *addr_ht;
 
+	/* maps nir_block to ir3_block, mostly for the purposes of
+	 * figuring out the blocks successors
+	 */
+	struct hash_table *block_ht;
+
 	/* for calculating input/output positions/linkages: */
 	unsigned next_inloc;
 
@@ -104,6 +109,11 @@ struct ir3_compile {
 	 */
 	bool levels_add_one;
 
+	/* on a3xx, we need to scale up integer coords for isaml based
+	 * on LoD:
+	 */
+	bool unminify_coords;
+
 	/* for looking up which system value is which */
 	unsigned sysval_semantics[8];
 
@@ -118,6 +128,9 @@
 };
 
+static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
+static struct ir3_block * get_block(struct ir3_compile *ctx, nir_block *nblock);
+
 static struct nir_shader *to_nir(const struct tgsi_token *tokens)
 {
 	struct nir_shader_compiler_options options = {
@@ -146,6 +159,7 @@ static struct nir_shader *to_nir(const struct tgsi_token *tokens)
 
 	nir_lower_vars_to_ssa(s);
 	nir_lower_alu_to_scalar(s);
+	nir_lower_phis_to_scalar(s);
 
 	progress |= nir_copy_prop(s);
 	progress |= nir_opt_dce(s);
@@ -170,7 +184,8 @@ static struct nir_shader *to_nir(const struct tgsi_token *tokens)
 
 /* TODO nir doesn't lower everything for us yet, but ideally it would: */
 static const struct tgsi_token *
-lower_tgsi(const struct tgsi_token *tokens, struct ir3_shader_variant *so)
+lower_tgsi(struct ir3_compile *ctx, const struct tgsi_token *tokens,
+		struct ir3_shader_variant *so)
 {
 	struct tgsi_shader_info info;
 	struct tgsi_lowering_config lconfig = {
@@ -192,11 +207,7 @@ lower_tgsi(const struct tgsi_token *tokens, struct ir3_shader_variant *so)
 		break;
 	}
 
-	if (!so->shader) {
-		/* hack for standalone compiler which does not have
-		 * screen/context:
-		 */
-	} else if (ir3_shader_gpuid(so->shader) >= 400) {
+	if (ctx->compiler->gpu_id >= 400) {
 		/* a4xx seems to have *no* sam.p */
 		lconfig.lower_TXP = ~0;  /* lower all txp */
 	} else {
@@ -208,36 +219,26 @@ lower_tgsi(const struct tgsi_token *tokens, struct ir3_shader_variant *so)
 }
 
 static struct ir3_compile *
-compile_init(struct ir3_shader_variant *so,
+compile_init(struct ir3_compiler *compiler,
+		struct ir3_shader_variant *so,
 		const struct tgsi_token *tokens)
 {
 	struct ir3_compile *ctx = rzalloc(NULL, struct ir3_compile);
 	const struct tgsi_token *lowered_tokens;
 
-	if (!so->shader) {
-		/* hack for standalone compiler which does not have
-		 * screen/context:
-		 */
-	} else if (ir3_shader_gpuid(so->shader) >= 400) {
+	if (compiler->gpu_id >= 400) {
 		/* need special handling for "flat" */
 		ctx->flat_bypass = true;
 		ctx->levels_add_one = false;
+		ctx->unminify_coords = false;
 	} else {
 		/* no special handling for "flat" */
 		ctx->flat_bypass = false;
 		ctx->levels_add_one = true;
+		ctx->unminify_coords = true;
 	}
 
-	switch (so->type) {
-	case SHADER_FRAGMENT:
-	case SHADER_COMPUTE:
-		ctx->integer_s = so->key.finteger_s;
-		break;
-	case SHADER_VERTEX:
-		ctx->integer_s = so->key.vinteger_s;
-		break;
-	}
-
+	ctx->compiler = compiler;
 	ctx->ir = so->ir;
 	ctx->so = so;
 	ctx->next_inloc = 8;
@@ -247,8 +248,10 @@ compile_init(struct ir3_shader_variant *so,
 			_mesa_hash_pointer, _mesa_key_pointer_equal);
 	ctx->addr_ht = _mesa_hash_table_create(ctx,
 			_mesa_hash_pointer, _mesa_key_pointer_equal);
+	ctx->block_ht = _mesa_hash_table_create(ctx,
+			_mesa_hash_pointer, _mesa_key_pointer_equal);
 
-	lowered_tokens = lower_tgsi(tokens, so);
+	lowered_tokens = lower_tgsi(ctx, tokens, so);
 	if (!lowered_tokens)
 		lowered_tokens = tokens;
 	ctx->s = to_nir(lowered_tokens);
@@ -290,33 +293,206 @@ compile_free(struct ir3_compile *ctx)
 	ralloc_free(ctx);
 }
 
-
+/* global per-array information: */
 struct ir3_array {
 	unsigned length, aid;
+};
+
+/* per-block array state: */
+struct ir3_array_value {
+	/* TODO drop length/aid, and just have ptr back to ir3_array */
+	unsigned length, aid;
+	/* initial array element values are phi's, other than for the
+	 * entry block.  The phi src's get added later in a resolve step
+	 * after we have visited all the blocks, to account for back
+	 * edges in the cfg.
+	 */
+	struct ir3_instruction **phis;
+	/* current array element values (as block is processed).  When
+	 * the array phi's are resolved, it will contain the array state
+	 * at exit of block, so successor blocks can use it to add their
+	 * phi srcs.
+	 */
 	struct ir3_instruction *arr[];
 };
 
+/* track array assignments per basic block.  When an array is read
+ * outside of the same basic block, we can use NIR's dominance-frontier
+ * information to figure out where phi nodes are needed.
+ */
+struct ir3_nir_block_data {
+	unsigned foo;
+	/* indexed by array-id (aid): */
+	struct ir3_array_value *arrs[];
+};
+
+static struct ir3_nir_block_data *
+get_block_data(struct ir3_compile *ctx, struct ir3_block *block)
+{
+	if (!block->bd) {
+		struct ir3_nir_block_data *bd = ralloc_size(ctx, sizeof(*bd) +
+				((ctx->num_arrays + 1) * sizeof(bd->arrs[0])));
+		block->bd = bd;
+	}
+	return block->bd;
+}
+
 static void
 declare_var(struct ir3_compile *ctx, nir_variable *var)
 {
 	unsigned length = glsl_get_length(var->type) * 4;  /* always vec4, at least with ttn */
-	struct ir3_array *arr = ralloc_size(ctx, sizeof(*arr) +
-			(length * sizeof(arr->arr[0])));
+	struct ir3_array *arr = ralloc(ctx, struct ir3_array);
 	arr->length = length;
 	arr->aid = ++ctx->num_arrays;
-	/* Some shaders end up reading array elements without first writing..
-	 * so initialize things to prevent null instr ptrs later:
-	 */
-	for (unsigned i = 0; i < length; i++)
-		arr->arr[i] = create_immed(ctx->block, 0);
 	_mesa_hash_table_insert(ctx->var_ht, var, arr);
 }
 
-static struct ir3_array *
+static nir_block *
+nir_block_pred(nir_block *block)
+{
+	assert(block->predecessors->entries < 2);
+	if (block->predecessors->entries == 0)
+		return NULL;
+	return (nir_block *)_mesa_set_next_entry(block->predecessors, NULL)->key;
+}
+
+static struct ir3_array_value *
 get_var(struct ir3_compile *ctx, nir_variable *var)
 {
 	struct hash_entry *entry = _mesa_hash_table_search(ctx->var_ht, var);
-	return entry->data;
+	struct ir3_block *block = ctx->block;
+	struct ir3_nir_block_data *bd = get_block_data(ctx, block);
+	struct ir3_array *arr = entry->data;
+
+	if (!bd->arrs[arr->aid]) {
+		struct ir3_array_value *av = ralloc_size(bd, sizeof(*av) +
+				(arr->length * sizeof(av->arr[0])));
+		struct ir3_array_value *defn = NULL;
+		nir_block *pred_block;
+
+		av->length = arr->length;
+		av->aid = arr->aid;
+
+		/* For loops, we have to consider that we have not visited some
+		 * of the blocks who should feed into the phi (ie. back-edges in
+		 * the cfg).. for example:
+		 *
+		 *   loop {
+		 *      block { load_var; ... }
+		 *      if then block {} else block {}
+		 *      block { store_var; ... }
+		 *      if then block {} else block {}
+		 *      block {...}
+		 *   }
+		 *
+		 * We can skip the phi if we can chase the block predecessors
+		 * until finding the block previously defining the array without
+		 * crossing a block that has more than one predecessor.
+		 *
+		 * Otherwise create phi's and resolve them as a post-pass after
+		 * all the blocks have been visited (to handle back-edges).
+		 */
+
+		for (pred_block = block->nblock;
+				pred_block && (pred_block->predecessors->entries < 2) && !defn;
+				pred_block = nir_block_pred(pred_block)) {
+			struct ir3_block *pblock = get_block(ctx, pred_block);
+			struct ir3_nir_block_data *pbd = pblock->bd;
+			if (!pbd)
+				continue;
+			defn = pbd->arrs[arr->aid];
+		}
+
+		if (defn) {
+			/* only one possible definer: */
+			for (unsigned i = 0; i < arr->length; i++)
+				av->arr[i] = defn->arr[i];
+		} else if (pred_block) {
+			/* not the first block, and multiple potential definers: */
+			av->phis = ralloc_size(av, arr->length * sizeof(av->phis[0]));
+
+			for (unsigned i = 0; i < arr->length; i++) {
+				struct ir3_instruction *phi;
+
+				phi = ir3_instr_create2(block, -1, OPC_META_PHI,
+						1 + ctx->impl->num_blocks);
+				ir3_reg_create(phi, 0, 0);  /* dst */
+
+				/* phi's should go at head of block: */
+				list_delinit(&phi->node);
+				list_add(&phi->node, &block->instr_list);
+
+				av->phis[i] = av->arr[i] = phi;
+			}
+		} else {
+			/* Some shaders end up reading array elements without
+			 * first writing.. so initialize things to prevent null
+			 * instr ptrs later:
+			 */
+			for (unsigned i = 0; i < arr->length; i++)
+				av->arr[i] = create_immed(block, 0);
+		}
+
+		bd->arrs[arr->aid] = av;
+	}
+
+	return bd->arrs[arr->aid];
+}
+
+static void
+add_array_phi_srcs(struct ir3_compile *ctx, nir_block *nblock,
+		struct ir3_array_value *av, BITSET_WORD *visited)
+{
+	struct ir3_block *block;
+	struct ir3_nir_block_data *bd;
+
+	if (BITSET_TEST(visited, nblock->index))
+		return;
+
+	BITSET_SET(visited, nblock->index);
+
+	block = get_block(ctx, nblock);
+	bd = block->bd;
+
+	if (bd && bd->arrs[av->aid]) {
+		struct ir3_array_value *dav = bd->arrs[av->aid];
+		for (unsigned i = 0; i < av->length; i++) {
+			ir3_reg_create(av->phis[i], 0, IR3_REG_SSA)->instr =
+					dav->arr[i];
+		}
+	} else {
+		/* didn't find defn, recurse predecessors: */
+		struct set_entry *entry;
+		set_foreach(nblock->predecessors, entry) {
+			add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited);
+		}
+	}
+}
+
+static void
+resolve_array_phis(struct ir3_compile *ctx, struct ir3_block *block)
+{
+	struct ir3_nir_block_data *bd = block->bd;
+	unsigned bitset_words = BITSET_WORDS(ctx->impl->num_blocks);
+
+	if (!bd)
+		return;
+
+	/* TODO use nir dom_frontier to help us with this? */
+
+	for (unsigned i = 1; i <= ctx->num_arrays; i++) {
+		struct ir3_array_value *av = bd->arrs[i];
+		BITSET_WORD visited[bitset_words];
+		struct set_entry *entry;
+
+		if (!(av && av->phis))
+			continue;
+
+		memset(visited, 0, sizeof(visited));
+		set_foreach(block->nblock->predecessors, entry) {
+			add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited);
+		}
+	}
+}
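
The predecessor-chasing shortcut in get_var() above is what avoids emitting unnecessary array phi's. Restated as a standalone sketch (types and names here are illustrative, not driver code):

	/* walk a chain of single-predecessor blocks looking for an earlier
	 * definition; give up at any merge point, where a phi is needed:
	 */
	struct blk {
		struct blk *pred;    /* only meaningful when npreds < 2 */
		unsigned npreds;
		void *def;           /* array state defined in this block, if any */
	};

	static void *find_unique_def(struct blk *b)
	{
		for (; b && (b->npreds < 2); b = b->pred) {
			if (b->def)
				return b->def;   /* single possible definer */
		}
		return NULL;             /* merge point/entry: caller emits phi's */
	}
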
 
 /* allocate a n element value array (to be populated by caller) and
@@ -393,7 +569,8 @@ create_addr(struct ir3_block *block, struct ir3_instruction *src)
 	instr->regs[1]->flags |= IR3_REG_HALF;
 
 	instr = ir3_MOV(block, instr, TYPE_S16);
-	instr->regs[0]->flags |= IR3_REG_ADDR | IR3_REG_HALF;
+	instr->regs[0]->num = regid(REG_A0, 0);
+	instr->regs[0]->flags |= IR3_REG_HALF;
 	instr->regs[1]->flags |= IR3_REG_HALF;
 
 	return instr;
@@ -419,6 +596,22 @@ get_addr(struct ir3_compile *ctx, struct ir3_instruction *src)
 }
 
 static struct ir3_instruction *
+get_predicate(struct ir3_compile *ctx, struct ir3_instruction *src)
+{
+	struct ir3_block *b = ctx->block;
+	struct ir3_instruction *cond;
+
+	/* NOTE: only cmps.*.* can write p0.x: */
+	cond = ir3_CMPS_S(b, src, 0, create_immed(b, 0), 0);
+	cond->cat2.condition = IR3_COND_NE;
+
+	/* condition always goes in predicate register: */
+	cond->regs[0]->num = regid(REG_P0, 0);
+
+	return cond;
+}
+
+static struct ir3_instruction *
 create_uniform(struct ir3_compile *ctx, unsigned n)
 {
 	struct ir3_instruction *mov;
@@ -461,7 +654,7 @@ create_collect(struct ir3_block *block, struct ir3_instruction **arr,
 		return NULL;
 
 	collect = ir3_instr_create2(block, -1, OPC_META_FI, 1 + arrsz);
-	ir3_reg_create(collect, 0, 0);
+	ir3_reg_create(collect, 0, 0);   /* dst */
 	for (unsigned i = 0; i < arrsz; i++)
 		ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = arr[i];
 
@@ -597,6 +790,7 @@ create_frag_face(struct ir3_compile *ctx, unsigned comp)
 		compile_assert(ctx, !ctx->frag_face);
 
 		ctx->frag_face = create_input(block, NULL, 0);
+		ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
 
 		/* for faceness, we always get -1 or 0 (int).. but TGSI expects
 		 * positive vs negative float.. and piglit further seems to
@@ -628,10 +822,10 @@
  */
 static void
 split_dest(struct ir3_block *block, struct ir3_instruction **dst,
-		struct ir3_instruction *src)
+		struct ir3_instruction *src, unsigned n)
 {
 	struct ir3_instruction *prev = NULL;
-	for (int i = 0, j = 0; i < 4; i++) {
+	for (int i = 0, j = 0; i < n; i++) {
 		struct ir3_instruction *split =
 				ir3_instr_create(block, -1, OPC_META_FO);
 		ir3_reg_create(split, 0, IR3_REG_SSA);
@@ -882,9 +1076,15 @@ emit_alu(struct ir3_compile *ctx, nir_alu_instr *alu)
 	case nir_op_imax:
 		dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
 		break;
+	case nir_op_umax:
+		dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
+		break;
 	case nir_op_imin:
 		dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
 		break;
+	case nir_op_umin:
+		dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
+		break;
 	case nir_op_imul:
 		/*
 		 * dst = (al * bl) + (ah * bl << 16) + (al * bh << 16)
@@ -1030,7 +1230,7 @@ emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 {
 	nir_deref_var *dvar = intr->variables[0];
 	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
-	struct ir3_array *arr = get_var(ctx, dvar->var);
+	struct ir3_array_value *arr = get_var(ctx, dvar->var);
 
 	compile_assert(ctx, dvar->deref.child &&
 		(dvar->deref.child->deref_type == nir_deref_type_array));
@@ -1070,7 +1270,7 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 {
 	nir_deref_var *dvar = intr->variables[0];
 	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
-	struct ir3_array *arr = get_var(ctx, dvar->var);
+	struct ir3_array_value *arr = get_var(ctx, dvar->var);
 	struct ir3_instruction **src;
 
 	compile_assert(ctx, dvar->deref.child &&
@@ -1140,8 +1340,8 @@ static void add_sysval_input(struct ir3_compile *ctx, unsigned name,
 	so->inputs[n].interpolate = TGSI_INTERPOLATE_CONSTANT;
 	so->total_in++;
 
-	ctx->block->ninputs = MAX2(ctx->block->ninputs, r + 1);
-	ctx->block->inputs[r] = instr;
+	ctx->ir->ninputs = MAX2(ctx->ir->ninputs, r + 1);
+	ctx->ir->inputs[r] = instr;
 }
 
 static void
@@ -1154,18 +1354,18 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 
 	if (info->has_dest) {
 		dst = get_dst(ctx, &intr->dest, intr->num_components);
+	} else {
+		dst = NULL;
 	}
 
 	switch (intr->intrinsic) {
 	case nir_intrinsic_load_uniform:
-		compile_assert(ctx, intr->const_index[1] == 1);
 		for (int i = 0; i < intr->num_components; i++) {
 			unsigned n = idx * 4 + i;
 			dst[i] = create_uniform(ctx, n);
 		}
 		break;
 	case nir_intrinsic_load_uniform_indirect:
-		compile_assert(ctx, intr->const_index[1] == 1);
 		src = get_src(ctx, &intr->src[0]);
 		for (int i = 0; i < intr->num_components; i++) {
 			unsigned n = idx * 4 + i;
@@ -1178,21 +1378,20 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		emit_intrinsic_load_ubo(ctx, intr, dst);
 		break;
 	case nir_intrinsic_load_input:
-		compile_assert(ctx, intr->const_index[1] == 1);
 		for (int i = 0; i < intr->num_components; i++) {
 			unsigned n = idx * 4 + i;
-			dst[i] = b->inputs[n];
+			dst[i] = ctx->ir->inputs[n];
 		}
 		break;
 	case nir_intrinsic_load_input_indirect:
-		compile_assert(ctx, intr->const_index[1] == 1);
 		src = get_src(ctx, &intr->src[0]);
 		struct ir3_instruction *collect =
-				create_collect(b, b->inputs, b->ninputs);
+				create_collect(b, ctx->ir->inputs, ctx->ir->ninputs);
 		struct ir3_instruction *addr = get_addr(ctx, src[0]);
 		for (int i = 0; i < intr->num_components; i++) {
 			unsigned n = idx * 4 + i;
-			dst[i] = create_indirect_load(ctx, b->ninputs, n, addr, collect);
+			dst[i] = create_indirect_load(ctx, ctx->ir->ninputs,
+					n, addr, collect);
 		}
 		break;
 	case nir_intrinsic_load_var:
@@ -1202,11 +1401,10 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		emit_intrinisic_load_var(ctx, intr, dst);
 		break;
 	case nir_intrinsic_store_var:
 		emit_intrinisic_store_var(ctx, intr);
 		break;
 	case nir_intrinsic_store_output:
-		compile_assert(ctx, intr->const_index[1] == 1);
 		src = get_src(ctx, &intr->src[0]);
 		for (int i = 0; i < intr->num_components; i++) {
 			unsigned n = idx * 4 + i;
-			b->outputs[n] = src[i];
+			ctx->ir->outputs[n] = src[i];
 		}
 		break;
 	case nir_intrinsic_load_base_vertex:
@@ -1248,6 +1446,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 			cond = create_immed(b, 1);
 		}
 
+		/* NOTE: only cmps.*.* can write p0.x: */
 		cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
 		cond->cat2.condition = IR3_COND_NE;
 
@@ -1255,6 +1454,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		cond->regs[0]->num = regid(REG_P0, 0);
 
 		kill = ir3_KILL(b, cond, 0);
+		array_insert(ctx->ir->predicates, kill);
 		ctx->kill[ctx->kill_count++] = kill;
 		ctx->so->has_kill = true;
 
@@ -1318,6 +1518,8 @@ tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
 		coords = 3;
 		flags |= IR3_INSTR_3D;
 		break;
+	default:
+		unreachable("bad sampler_dim");
 	}
 
 	if (tex->is_shadow)
@@ -1340,7 +1542,10 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 	unsigned i, coords, flags;
 	unsigned nsrc0 = 0, nsrc1 = 0;
 	type_t type;
-	opc_t opc;
+	opc_t opc = 0;
+
+	coord = off = ddx = ddy = NULL;
+	lod = proj = compare = NULL;
 
 	/* TODO: might just be one component for gathers? */
 	dst = get_dst(ctx, &tex->dest, 4);
@@ -1400,11 +1605,12 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 	tex_info(tex, &flags, &coords);
 
 	/* scale up integer coords for TXF based on the LOD */
-	if (opc == OPC_ISAML) {
+	if (ctx->unminify_coords && (opc == OPC_ISAML)) {
 		assert(has_lod);
 		for (i = 0; i < coords; i++)
 			coord[i] = ir3_SHL_B(b, coord[i], 0, lod, 0);
 	}
+
 	/*
 	 * lay out the first argument in the proper order:
 	 *  - actual coordinates first
@@ -1484,6 +1690,8 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 	case nir_type_bool:
 		type = TYPE_U32;
 		break;
+	default:
+		unreachable("bad dest_type");
 	}
 
 	sam = ir3_SAM(b, opc, type, TGSI_WRITEMASK_XYZW,
@@ -1491,7 +1699,7 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 			create_collect(b, src0, nsrc0),
 			create_collect(b, src1, nsrc1));
 
-	split_dest(b, dst, sam);
+	split_dest(b, dst, sam, 4);
 }
 
 static void
@@ -1508,7 +1716,7 @@ emit_tex_query_levels(struct ir3_compile *ctx, nir_tex_instr *tex)
 	/* even though there is only one component, since it ends
 	 * up in .z rather than .x, we need a split_dest()
	 */
-	split_dest(b, dst, sam);
+	split_dest(b, dst, sam, 3);
 
 	/* The # of levels comes from getinfo.z.  We need to add 1 to it, since
 	 * the value in TEX_CONST_0 is zero-based.
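
Two of the per-generation quirks handled above fit in one-liners. A hedged host-side view of what unminify_coords and levels_add_one amount to (illustrative only):

	#include <stdbool.h>
	#include <stdint.h>

	/* a3xx wants level-0 texel coords for isaml, so texelFetch coords
	 * at miplevel `lod` get scaled back up (the shl.b loop above):
	 */
	static uint32_t unminify(uint32_t coord, uint32_t lod)
	{
		return coord << lod;
	}

	/* getinfo.z holds a zero-based max level on a3xx, hence the +1: */
	static int query_levels(int getinfo_z, bool levels_add_one)
	{
		return getinfo_z + (levels_add_one ? 1 : 0);
	}
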
@@ -1536,7 +1744,7 @@ emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex)
 	sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, TGSI_WRITEMASK_XYZW, flags,
 			tex->sampler_index, tex->sampler_index, lod, NULL);
 
-	split_dest(b, dst, sam);
+	split_dest(b, dst, sam, 4);
 
 	/* Array size actually ends up in .w rather than .z.  This doesn't
 	 * matter for miplevel 0, but for higher mips the value in z is
@@ -1553,6 +1761,71 @@ emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex)
 }
 
 static void
+emit_phi(struct ir3_compile *ctx, nir_phi_instr *nphi)
+{
+	struct ir3_instruction *phi, **dst;
+
+	/* NOTE: phi's should be lowered to scalar at this point */
+	compile_assert(ctx, nphi->dest.ssa.num_components == 1);
+
+	dst = get_dst(ctx, &nphi->dest, 1);
+
+	phi = ir3_instr_create2(ctx->block, -1, OPC_META_PHI,
+			1 + exec_list_length(&nphi->srcs));
+	ir3_reg_create(phi, 0, 0);  /* dst */
+	phi->phi.nphi = nphi;
+
+	dst[0] = phi;
+}
+
+/* phi instructions are left partially constructed.  We don't resolve
+ * their srcs until the end of the block, since (eg. loops) one of
+ * the phi's srcs might be defined after the phi due to back edges in
+ * the CFG.
+ */
+static void
+resolve_phis(struct ir3_compile *ctx, struct ir3_block *block)
+{
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		nir_phi_instr *nphi;
+
+		/* phi's only come at start of block: */
+		if (!(is_meta(instr) && (instr->opc == OPC_META_PHI)))
+			break;
+
+		if (!instr->phi.nphi)
+			break;
+
+		nphi = instr->phi.nphi;
+		instr->phi.nphi = NULL;
+
+		foreach_list_typed(nir_phi_src, nsrc, node, &nphi->srcs) {
+			struct ir3_instruction *src = get_src(ctx, &nsrc->src)[0];
+			ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
+		}
+	}
+
+	resolve_array_phis(ctx, block);
+}
+
+static void
+emit_jump(struct ir3_compile *ctx, nir_jump_instr *jump)
+{
+	switch (jump->type) {
+	case nir_jump_break:
+	case nir_jump_continue:
+		/* I *think* we can simply just ignore this, and use the
+		 * successor block link to figure out where we need to
+		 * jump to for break/continue
+		 */
+		break;
+	default:
+		compile_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
+		break;
+	}
+}
+
+static void
 emit_instr(struct ir3_compile *ctx, nir_instr *instr)
 {
 	switch (instr->type) {
@@ -1585,45 +1858,112 @@ emit_instr(struct ir3_compile *ctx, nir_instr *instr)
 		}
 		break;
 	}
-	case nir_instr_type_call:
-	case nir_instr_type_jump:
 	case nir_instr_type_phi:
+		emit_phi(ctx, nir_instr_as_phi(instr));
+		break;
+	case nir_instr_type_jump:
+		emit_jump(ctx, nir_instr_as_jump(instr));
+		break;
+	case nir_instr_type_call:
 	case nir_instr_type_parallel_copy:
 		compile_error(ctx, "Unhandled NIR instruction type: %d\n",
 				instr->type);
 		break;
 	}
 }
 
+static struct ir3_block *
+get_block(struct ir3_compile *ctx, nir_block *nblock)
+{
+	struct ir3_block *block;
+	struct hash_entry *entry;
+	entry = _mesa_hash_table_search(ctx->block_ht, nblock);
+	if (entry)
+		return entry->data;
+
+	block = ir3_block_create(ctx->ir);
+	block->nblock = nblock;
+	_mesa_hash_table_insert(ctx->block_ht, nblock, block);
+
+	return block;
+}
+
 static void
-emit_block(struct ir3_compile *ctx, nir_block *block)
+emit_block(struct ir3_compile *ctx, nir_block *nblock)
 {
-	nir_foreach_instr(block, instr) {
+	struct ir3_block *block = get_block(ctx, nblock);
+
+	for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
+		if (nblock->successors[i]) {
+			block->successors[i] =
+				get_block(ctx, nblock->successors[i]);
+		}
+	}
+
+	ctx->block = block;
+	list_addtail(&block->node, &ctx->ir->block_list);
+
+	nir_foreach_instr(nblock, instr) {
 		emit_instr(ctx, instr);
 		if (ctx->error)
 			return;
 	}
 }
 
+static void emit_cf_list(struct ir3_compile *ctx, struct exec_list *list);
+
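
emit_phi()/resolve_phis() above split phi construction into two passes so that back-edge sources exist before srcs are attached. Condensed into one helper (illustrative restatement, not driver code; `nsrcs` stands in for `exec_list_length(&nphi->srcs)`):

	/* pass 1: reserve the srcs and remember the nir phi; pass 2
	 * (resolve_phis, run after every block has been visited) walks
	 * nphi->srcs and appends an IR3_REG_SSA reg per source:
	 */
	static struct ir3_instruction *
	make_phi_deferred(struct ir3_block *block, nir_phi_instr *nphi,
			unsigned nsrcs)
	{
		struct ir3_instruction *phi =
			ir3_instr_create2(block, -1, OPC_META_PHI, 1 + nsrcs);
		ir3_reg_create(phi, 0, 0);   /* dst only, for now */
		phi->phi.nphi = nphi;        /* srcs attached later */
		return phi;
	}
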
 static void
-emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
+emit_if(struct ir3_compile *ctx, nir_if *nif)
+{
+	struct ir3_instruction *condition = get_src(ctx, &nif->condition)[0];
+
+	ctx->block->condition =
+		get_predicate(ctx, ir3_b2n(condition->block, condition));
+
+	emit_cf_list(ctx, &nif->then_list);
+	emit_cf_list(ctx, &nif->else_list);
+}
+
+static void
+emit_loop(struct ir3_compile *ctx, nir_loop *nloop)
+{
+	emit_cf_list(ctx, &nloop->body);
+}
+
+static void
+emit_cf_list(struct ir3_compile *ctx, struct exec_list *list)
 {
-	foreach_list_typed(nir_cf_node, node, node, &impl->body) {
+	foreach_list_typed(nir_cf_node, node, node, list) {
 		switch (node->type) {
 		case nir_cf_node_block:
 			emit_block(ctx, nir_cf_node_as_block(node));
 			break;
 		case nir_cf_node_if:
+			emit_if(ctx, nir_cf_node_as_if(node));
+			break;
 		case nir_cf_node_loop:
+			emit_loop(ctx, nir_cf_node_as_loop(node));
+			break;
 		case nir_cf_node_function:
 			compile_error(ctx, "TODO\n");
 			break;
 		}
-		if (ctx->error)
-			return;
 	}
 }
 
 static void
+emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
+{
+	emit_cf_list(ctx, &impl->body);
+	emit_block(ctx, impl->end_block);
+
+	/* at this point, we should have a single empty block,
+	 * into which we emit the 'end' instruction.
+	 */
+	compile_assert(ctx, list_empty(&ctx->block->instr_list));
+	ir3_END(ctx->block);
+}
+
+static void
 setup_input(struct ir3_compile *ctx, nir_variable *in)
 {
 	struct ir3_shader_variant *so = ctx->so;
@@ -1708,7 +2048,7 @@ setup_input(struct ir3_compile *ctx, nir_variable *in)
 			instr = create_input(ctx->block, NULL, idx);
 		}
 
-		ctx->block->inputs[idx] = instr;
+		ctx->ir->inputs[idx] = instr;
 	}
 
 	if (so->inputs[n].bary || (ctx->so->type == SHADER_VERTEX)) {
@@ -1775,15 +2115,26 @@ setup_output(struct ir3_compile *ctx, nir_variable *out)
 
 	for (int i = 0; i < ncomp; i++) {
 		unsigned idx = (n * 4) + i;
-		ctx->block->outputs[idx] = create_immed(ctx->block, fui(0.0));
+		ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0));
 	}
 }
 
 static void
 emit_instructions(struct ir3_compile *ctx)
 {
-	unsigned ninputs = exec_list_length(&ctx->s->inputs) * 4;
-	unsigned noutputs = exec_list_length(&ctx->s->outputs) * 4;
+	unsigned ninputs, noutputs;
+	nir_function_impl *fxn = NULL;
+
+	/* Find the main function: */
+	nir_foreach_overload(ctx->s, overload) {
+		compile_assert(ctx, strcmp(overload->function->name, "main") == 0);
+		compile_assert(ctx, overload->impl);
+		fxn = overload->impl;
+		break;
+	}
+
+	ninputs = exec_list_length(&ctx->s->inputs) * 4;
+	noutputs = exec_list_length(&ctx->s->outputs) * 4;
 
 	/* we need to allocate big enough outputs array so that
 	 * we can stuff the kill's at the end.  Likewise for vtx
Likewise for vtx @@ -1795,12 +2146,17 @@ emit_instructions(struct ir3_compile *ctx) ninputs += 8; } - ctx->block = ir3_block_create(ctx->ir, 0, ninputs, noutputs); + ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs); + + /* Create inputs in first block: */ + ctx->block = get_block(ctx, fxn->start_block); + ctx->in_block = ctx->block; + list_addtail(&ctx->block->node, &ctx->ir->block_list); if (ctx->so->type == SHADER_FRAGMENT) { - ctx->block->noutputs -= ARRAY_SIZE(ctx->kill); + ctx->ir->noutputs -= ARRAY_SIZE(ctx->kill); } else if (ctx->so->type == SHADER_VERTEX) { - ctx->block->ninputs -= 8; + ctx->ir->ninputs -= 8; } /* for fragment shader, we have a single input register (usually @@ -1831,13 +2187,12 @@ emit_instructions(struct ir3_compile *ctx) declare_var(ctx, var); } - /* Find the main function and emit the body: */ - nir_foreach_overload(ctx->s, overload) { - compile_assert(ctx, strcmp(overload->function->name, "main") == 0); - compile_assert(ctx, overload->impl); - emit_function(ctx, overload->impl); - if (ctx->error) - return; + /* And emit the body: */ + ctx->impl = fxn; + emit_function(ctx, fxn); + + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + resolve_phis(ctx, block); } } @@ -1850,12 +2205,12 @@ static void fixup_frag_inputs(struct ir3_compile *ctx) { struct ir3_shader_variant *so = ctx->so; - struct ir3_block *block = ctx->block; + struct ir3 *ir = ctx->ir; struct ir3_instruction **inputs; struct ir3_instruction *instr; int n, regid = 0; - block->ninputs = 0; + ir->ninputs = 0; n = 4; /* always have frag_pos */ n += COND(so->frag_face, 4); @@ -1867,15 +2222,15 @@ fixup_frag_inputs(struct ir3_compile *ctx) /* this ultimately gets assigned to hr0.x so doesn't conflict * with frag_coord/frag_pos.. */ - inputs[block->ninputs++] = ctx->frag_face; + inputs[ir->ninputs++] = ctx->frag_face; ctx->frag_face->regs[0]->num = 0; /* remaining channels not used, but let's avoid confusing * other parts that expect inputs to come in groups of vec4 */ - inputs[block->ninputs++] = NULL; - inputs[block->ninputs++] = NULL; - inputs[block->ninputs++] = NULL; + inputs[ir->ninputs++] = NULL; + inputs[ir->ninputs++] = NULL; + inputs[ir->ninputs++] = NULL; } /* since we don't know where to set the regid for frag_coord, @@ -1889,63 +2244,45 @@ fixup_frag_inputs(struct ir3_compile *ctx) ctx->frag_coord[2]->regs[0]->num = regid++; ctx->frag_coord[3]->regs[0]->num = regid++; - inputs[block->ninputs++] = ctx->frag_coord[0]; - inputs[block->ninputs++] = ctx->frag_coord[1]; - inputs[block->ninputs++] = ctx->frag_coord[2]; - inputs[block->ninputs++] = ctx->frag_coord[3]; + inputs[ir->ninputs++] = ctx->frag_coord[0]; + inputs[ir->ninputs++] = ctx->frag_coord[1]; + inputs[ir->ninputs++] = ctx->frag_coord[2]; + inputs[ir->ninputs++] = ctx->frag_coord[3]; } /* we always have frag_pos: */ so->pos_regid = regid; /* r0.x */ - instr = create_input(block, NULL, block->ninputs); + instr = create_input(ctx->in_block, NULL, ir->ninputs); instr->regs[0]->num = regid++; - inputs[block->ninputs++] = instr; + inputs[ir->ninputs++] = instr; ctx->frag_pos->regs[1]->instr = instr; /* r0.y */ - instr = create_input(block, NULL, block->ninputs); + instr = create_input(ctx->in_block, NULL, ir->ninputs); instr->regs[0]->num = regid++; - inputs[block->ninputs++] = instr; + inputs[ir->ninputs++] = instr; ctx->frag_pos->regs[2]->instr = instr; - block->inputs = inputs; -} - -static void -compile_dump(struct ir3_compile *ctx) -{ - const char *name = (ctx->so->type == SHADER_VERTEX) ? 
"vert" : "frag"; - static unsigned n = 0; - char fname[16]; - FILE *f; - snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++); - f = fopen(fname, "w"); - if (!f) - return; - ir3_block_depth(ctx->block); - ir3_dump(ctx->ir, name, ctx->block, f); - fclose(f); + ir->inputs = inputs; } int -ir3_compile_shader_nir(struct ir3_shader_variant *so, - const struct tgsi_token *tokens, struct ir3_shader_key key) +ir3_compile_shader_nir(struct ir3_compiler *compiler, + struct ir3_shader_variant *so, + const struct tgsi_token *tokens, + struct ir3_shader_key key) { struct ir3_compile *ctx; - struct ir3_block *block; + struct ir3 *ir; struct ir3_instruction **inputs; unsigned i, j, actual_in; int ret = 0, max_bary; assert(!so->ir); - so->ir = ir3_create(); - - assert(so->ir); - - ctx = compile_init(so, tokens); + ctx = compile_init(compiler, so, tokens); if (!ctx) { DBG("INIT failed!"); ret = -1; @@ -1960,11 +2297,10 @@ ir3_compile_shader_nir(struct ir3_shader_variant *so, goto out; } - block = ctx->block; - so->ir->block = block; + ir = so->ir = ctx->ir; /* keep track of the inputs from TGSI perspective.. */ - inputs = block->inputs; + inputs = ir->inputs; /* but fixup actual inputs for frag shader: */ if (so->type == SHADER_FRAGMENT) @@ -1981,26 +2317,39 @@ ir3_compile_shader_nir(struct ir3_shader_variant *so, (name == TGSI_SEMANTIC_PSIZE))) { if (i != j) { so->outputs[j] = so->outputs[i]; - block->outputs[(j*4)+0] = block->outputs[(i*4)+0]; - block->outputs[(j*4)+1] = block->outputs[(i*4)+1]; - block->outputs[(j*4)+2] = block->outputs[(i*4)+2]; - block->outputs[(j*4)+3] = block->outputs[(i*4)+3]; + ir->outputs[(j*4)+0] = ir->outputs[(i*4)+0]; + ir->outputs[(j*4)+1] = ir->outputs[(i*4)+1]; + ir->outputs[(j*4)+2] = ir->outputs[(i*4)+2]; + ir->outputs[(j*4)+3] = ir->outputs[(i*4)+3]; } j++; } } so->outputs_count = j; - block->noutputs = j * 4; + ir->noutputs = j * 4; } /* if we want half-precision outputs, mark the output registers * as half: */ if (key.half_precision) { - for (i = 0; i < block->noutputs; i++) { - if (!block->outputs[i]) + for (i = 0; i < ir->noutputs; i++) { + struct ir3_instruction *out = ir->outputs[i]; + if (!out) continue; - block->outputs[i]->regs[0]->flags |= IR3_REG_HALF; + out->regs[0]->flags |= IR3_REG_HALF; + /* output could be a fanout (ie. 
texture fetch output) + * in which case we need to propagate the half-reg flag + * up to the definer so that RA sees it: + */ + if (is_meta(out) && (out->opc == OPC_META_FO)) { + out = out->regs[1]->instr; + out->regs[0]->flags |= IR3_REG_HALF; + } + + if (out->category == 1) { + out->cat1.dst_type = half_type(out->cat1.dst_type); + } } } @@ -2010,42 +2359,34 @@ ir3_compile_shader_nir(struct ir3_shader_variant *so, */ if (so->type == SHADER_FRAGMENT) { for (i = 0; i < ctx->kill_count; i++) - block->outputs[block->noutputs++] = ctx->kill[i]; + ir->outputs[ir->noutputs++] = ctx->kill[i]; } - if (fd_mesa_debug & FD_DBG_OPTDUMP) - compile_dump(ctx); - if (fd_mesa_debug & FD_DBG_OPTMSGS) { printf("BEFORE CP:\n"); - ir3_dump_instr_list(block->head); + ir3_print(ir); } - ir3_block_depth(block); - - ir3_block_cp(block); + ir3_cp(ir); if (fd_mesa_debug & FD_DBG_OPTMSGS) { printf("BEFORE GROUPING:\n"); - ir3_dump_instr_list(block->head); + ir3_print(ir); } /* Group left/right neighbors, inserting mov's where needed to * solve conflicts: */ - ir3_block_group(block); - - if (fd_mesa_debug & FD_DBG_OPTDUMP) - compile_dump(ctx); + ir3_group(ir); - ir3_block_depth(block); + ir3_depth(ir); if (fd_mesa_debug & FD_DBG_OPTMSGS) { printf("AFTER DEPTH:\n"); - ir3_dump_instr_list(block->head); + ir3_print(ir); } - ret = ir3_block_sched(block); + ret = ir3_sched(ir); if (ret) { DBG("SCHED failed!"); goto out; @@ -2053,10 +2394,10 @@ ir3_compile_shader_nir(struct ir3_shader_variant *so, if (fd_mesa_debug & FD_DBG_OPTMSGS) { printf("AFTER SCHED:\n"); - ir3_dump_instr_list(block->head); + ir3_print(ir); } - ret = ir3_block_ra(block, so->type, so->frag_coord, so->frag_face); + ret = ir3_ra(ir, so->type, so->frag_coord, so->frag_face); if (ret) { DBG("RA failed!"); goto out; @@ -2064,14 +2405,19 @@ ir3_compile_shader_nir(struct ir3_shader_variant *so, if (fd_mesa_debug & FD_DBG_OPTMSGS) { printf("AFTER RA:\n"); - ir3_dump_instr_list(block->head); + ir3_print(ir); } - ir3_block_legalize(block, &so->has_samp, &max_bary); + ir3_legalize(ir, &so->has_samp, &max_bary); + + if (fd_mesa_debug & FD_DBG_OPTMSGS) { + printf("AFTER LEGALIZE:\n"); + ir3_print(ir); + } /* fixup input/outputs: */ for (i = 0; i < so->outputs_count; i++) { - so->outputs[i].regid = block->outputs[i*4]->regs[0]->num; + so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num; /* preserve hack for depth output.. 
tgsi writes depth to .z, * but what we give the hw is the scalar register: */ @@ -2111,7 +2457,8 @@ ir3_compile_shader_nir(struct ir3_shader_variant *so, out: if (ret) { - ir3_destroy(so->ir); + if (so->ir) + ir3_destroy(so->ir); so->ir = NULL; } compile_free(ctx); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c index fa7d363be7b..8c7c80f7aae 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c @@ -41,7 +41,7 @@ static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags) struct ir3_register *dst = instr->regs[0]; struct ir3_register *src = instr->regs[1]; struct ir3_instruction *src_instr = ssa(src); - if (dst->flags & (IR3_REG_ADDR | IR3_REG_RELATIV)) + if (dst->flags & IR3_REG_RELATIV) return false; if (src->flags & IR3_REG_RELATIV) return false; @@ -54,6 +54,13 @@ static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags) /* TODO: remove this hack: */ if (is_meta(src_instr) && (src_instr->opc == OPC_META_FO)) return false; + /* TODO: we currently don't handle left/right neighbors + * very well when inserting parallel-copies into phi.. + * to avoid problems don't eliminate a mov coming out + * of phi.. + */ + if (is_meta(src_instr) && (src_instr->opc == OPC_META_PHI)) + return false; return true; } return false; @@ -354,13 +361,6 @@ instr_cp(struct ir3_instruction *instr, unsigned *flags) { struct ir3_register *reg; - /* stay within the block.. don't try to operate across - * basic block boundaries or we'll have problems when - * dealing with multiple basic blocks: - */ - if (is_meta(instr) && (instr->opc == OPC_META_INPUT)) - return instr; - if (is_eligible_mov(instr, !!flags)) { struct ir3_register *reg = instr->regs[1]; struct ir3_instruction *src_instr = ssa(reg); @@ -394,22 +394,22 @@ instr_cp(struct ir3_instruction *instr, unsigned *flags) return instr; } -static void block_cp(struct ir3_block *block) +void +ir3_cp(struct ir3 *ir) { - unsigned i; + ir3_clear_mark(ir); - for (i = 0; i < block->noutputs; i++) { - if (block->outputs[i]) { + for (unsigned i = 0; i < ir->noutputs; i++) { + if (ir->outputs[i]) { struct ir3_instruction *out = - instr_cp(block->outputs[i], NULL); + instr_cp(ir->outputs[i], NULL); - block->outputs[i] = out; + ir->outputs[i] = out; } } -} -void ir3_block_cp(struct ir3_block *block) -{ - ir3_clear_mark(block->shader); - block_cp(block); + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + if (block->condition) + block->condition = instr_cp(block->condition, NULL); + } } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c index b899c66b37e..3a108243479 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c @@ -84,25 +84,25 @@ int ir3_delayslots(struct ir3_instruction *assigner, } } -static void insert_by_depth(struct ir3_instruction *instr) +void +ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list) { - struct ir3_block *block = instr->block; - struct ir3_instruction *n = block->head; - struct ir3_instruction *p = NULL; - - while (n && (n != instr) && (n->depth > instr->depth)) { - p = n; - n = n->next; + /* remove from existing spot in list: */ + list_delinit(&instr->node); + + /* find where to re-insert instruction: */ + list_for_each_entry (struct ir3_instruction, pos, list, node) { + if (pos->depth > instr->depth) { + list_add(&instr->node, &pos->node); + return; + } 
} - - instr->next = n; - if (p) - p->next = instr; - else - block->head = instr; + /* if we get here, we didn't find an insertion spot: */ + list_addtail(&instr->node, list); } -static void ir3_instr_depth(struct ir3_instruction *instr) +static void +ir3_instr_depth(struct ir3_instruction *instr) { struct ir3_instruction *src; @@ -123,47 +123,54 @@ static void ir3_instr_depth(struct ir3_instruction *instr) instr->depth = MAX2(instr->depth, sd); } - /* meta-instructions don't add cycles, other than PHI.. which - * might translate to a real instruction.. - * - * well, not entirely true, fan-in/out, etc might need to need - * to generate some extra mov's in edge cases, etc.. probably - * we might want to do depth calculation considering the worst - * case for these?? - */ if (!is_meta(instr)) instr->depth++; - insert_by_depth(instr); + ir3_insert_by_depth(instr, &instr->block->instr_list); +} + +static void +remove_unused_by_block(struct ir3_block *block) +{ + list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) { + if (!ir3_instr_check_mark(instr)) { + if (is_flow(instr) && (instr->opc == OPC_END)) + continue; + /* mark it, in case it is input, so we can + * remove unused inputs: + */ + instr->depth = DEPTH_UNUSED; + /* and remove from instruction list: */ + list_delinit(&instr->node); + } + } } -void ir3_block_depth(struct ir3_block *block) +void +ir3_depth(struct ir3 *ir) { unsigned i; - block->head = NULL; + ir3_clear_mark(ir); + for (i = 0; i < ir->noutputs; i++) + if (ir->outputs[i]) + ir3_instr_depth(ir->outputs[i]); - ir3_clear_mark(block->shader); - for (i = 0; i < block->noutputs; i++) - if (block->outputs[i]) - ir3_instr_depth(block->outputs[i]); + /* We also need to account for if-condition: */ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + if (block->condition) + ir3_instr_depth(block->condition); + } /* mark un-used instructions: */ - for (i = 0; i < block->shader->instrs_count; i++) { - struct ir3_instruction *instr = block->shader->instrs[i]; - - /* just consider instructions within this block: */ - if (instr->block != block) - continue; - - if (!ir3_instr_check_mark(instr)) - instr->depth = DEPTH_UNUSED; + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + remove_unused_by_block(block); } /* cleanup unused inputs: */ - for (i = 0; i < block->ninputs; i++) { - struct ir3_instruction *in = block->inputs[i]; + for (i = 0; i < ir->ninputs; i++) { + struct ir3_instruction *in = ir->inputs[i]; if (in && (in->depth == DEPTH_UNUSED)) - block->inputs[i] = NULL; + ir->inputs[i] = NULL; } } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_dump.c b/src/gallium/drivers/freedreno/ir3/ir3_dump.c deleted file mode 100644 index 1614d637b13..00000000000 --- a/src/gallium/drivers/freedreno/ir3/ir3_dump.c +++ /dev/null @@ -1,456 +0,0 @@ -/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ - -/* - * Copyright (C) 2014 Rob Clark <[email protected]> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * 
paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark <[email protected]> - */ - -#include <stdarg.h> - -#include "ir3.h" - -#define PTRID(x) ((unsigned long)(x)) - -struct ir3_dump_ctx { - FILE *f; - bool verbose; -}; - -static void dump_instr_name(struct ir3_dump_ctx *ctx, - struct ir3_instruction *instr) -{ - /* for debugging: */ - if (ctx->verbose) { -#ifdef DEBUG - fprintf(ctx->f, "%04u:", instr->serialno); -#endif - fprintf(ctx->f, "%03u: ", instr->depth); - } - - if (instr->flags & IR3_INSTR_SY) - fprintf(ctx->f, "(sy)"); - if (instr->flags & IR3_INSTR_SS) - fprintf(ctx->f, "(ss)"); - - if (is_meta(instr)) { - switch(instr->opc) { - case OPC_META_PHI: - fprintf(ctx->f, "Φ"); - break; - default: - /* shouldn't hit here.. just for debugging: */ - switch (instr->opc) { - case OPC_META_INPUT: fprintf(ctx->f, "_meta:in"); break; - case OPC_META_OUTPUT: fprintf(ctx->f, "_meta:out"); break; - case OPC_META_FO: fprintf(ctx->f, "_meta:fo"); break; - case OPC_META_FI: fprintf(ctx->f, "_meta:fi"); break; - case OPC_META_FLOW: fprintf(ctx->f, "_meta:flow"); break; - - default: fprintf(ctx->f, "_meta:%d", instr->opc); break; - } - break; - } - } else if (instr->category == 1) { - static const char *type[] = { - [TYPE_F16] = "f16", - [TYPE_F32] = "f32", - [TYPE_U16] = "u16", - [TYPE_U32] = "u32", - [TYPE_S16] = "s16", - [TYPE_S32] = "s32", - [TYPE_U8] = "u8", - [TYPE_S8] = "s8", - }; - if (instr->cat1.src_type == instr->cat1.dst_type) - fprintf(ctx->f, "mov"); - else - fprintf(ctx->f, "cov"); - fprintf(ctx->f, ".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]); - } else { - fprintf(ctx->f, "%s", ir3_instr_name(instr)); - if (instr->flags & IR3_INSTR_3D) - fprintf(ctx->f, ".3d"); - if (instr->flags & IR3_INSTR_A) - fprintf(ctx->f, ".a"); - if (instr->flags & IR3_INSTR_O) - fprintf(ctx->f, ".o"); - if (instr->flags & IR3_INSTR_P) - fprintf(ctx->f, ".p"); - if (instr->flags & IR3_INSTR_S) - fprintf(ctx->f, ".s"); - if (instr->flags & IR3_INSTR_S2EN) - fprintf(ctx->f, ".s2en"); - } -} - -static void dump_reg_name(struct ir3_dump_ctx *ctx, - struct ir3_register *reg, bool followssa) -{ - if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) && - (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))) - fprintf(ctx->f, "(absneg)"); - else if (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)) - fprintf(ctx->f, "(neg)"); - else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) - fprintf(ctx->f, "(abs)"); - - if (reg->flags & IR3_REG_IMMED) { - fprintf(ctx->f, "imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val); - } else if (reg->flags & IR3_REG_SSA) { - if (ctx->verbose) { - fprintf(ctx->f, "_"); - if (followssa) { - fprintf(ctx->f, "["); - dump_instr_name(ctx, reg->instr); - fprintf(ctx->f, "]"); - } - } - } else if (reg->flags & IR3_REG_RELATIV) { - if (reg->flags & IR3_REG_HALF) - fprintf(ctx->f, "h"); - if (reg->flags & IR3_REG_CONST) - fprintf(ctx->f, "c<a0.x + %u>", reg->num); - else - fprintf(ctx->f, "\x1b[0;31mr<a0.x + 
%u>\x1b[0m (%u)", reg->num, reg->size); - } else { - if (reg->flags & IR3_REG_HALF) - fprintf(ctx->f, "h"); - if (reg->flags & IR3_REG_CONST) - fprintf(ctx->f, "c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]); - else - fprintf(ctx->f, "\x1b[0;31mr%u.%c\x1b[0m", reg_num(reg), "xyzw"[reg_comp(reg)]); - } -} - -static void ir3_instr_dump(struct ir3_dump_ctx *ctx, - struct ir3_instruction *instr); -static void ir3_block_dump(struct ir3_dump_ctx *ctx, - struct ir3_block *block, const char *name); - -static void dump_instr(struct ir3_dump_ctx *ctx, - struct ir3_instruction *instr) -{ - /* if we've already visited this instruction, bail now: */ - if (ir3_instr_check_mark(instr)) - return; - - /* some meta-instructions need to be handled specially: */ - if (is_meta(instr)) { - if ((instr->opc == OPC_META_FO) || - (instr->opc == OPC_META_FI)) { - struct ir3_instruction *src; - foreach_ssa_src(src, instr) - dump_instr(ctx, src); - } else if (instr->opc == OPC_META_FLOW) { - struct ir3_register *reg = instr->regs[1]; - ir3_block_dump(ctx, instr->flow.if_block, "if"); - if (instr->flow.else_block) - ir3_block_dump(ctx, instr->flow.else_block, "else"); - if (reg->flags & IR3_REG_SSA) - dump_instr(ctx, reg->instr); - } else if (instr->opc == OPC_META_PHI) { - /* treat like a normal instruction: */ - ir3_instr_dump(ctx, instr); - } - } else { - ir3_instr_dump(ctx, instr); - } -} - -/* arrarraggh! if link is to something outside of the current block, we - * need to defer emitting the link until the end of the block, since the - * edge triggers pre-creation of the node it links to inside the cluster, - * even though it is meant to be outside.. - */ -static struct { - char buf[40960]; - unsigned n; -} edge_buf; - -/* helper to print or defer: */ -static void printdef(struct ir3_dump_ctx *ctx, - bool defer, const char *fmt, ...) 
-{ - va_list ap; - va_start(ap, fmt); - if (defer) { - unsigned n = edge_buf.n; - n += vsnprintf(&edge_buf.buf[n], sizeof(edge_buf.buf) - n, - fmt, ap); - edge_buf.n = n; - } else { - vfprintf(ctx->f, fmt, ap); - } - va_end(ap); -} - -static void dump_link2(struct ir3_dump_ctx *ctx, - struct ir3_instruction *instr, const char *target, bool defer) -{ - /* some meta-instructions need to be handled specially: */ - if (is_meta(instr)) { - if (instr->opc == OPC_META_INPUT) { - printdef(ctx, defer, "input%lx:<in%u>:w -> %s", - PTRID(instr->inout.block), - instr->regs[0]->num, target); - } else if (instr->opc == OPC_META_FO) { - struct ir3_register *reg = instr->regs[1]; - dump_link2(ctx, reg->instr, target, defer); - printdef(ctx, defer, "[label=\".%c\"]", - "xyzw"[instr->fo.off & 0x3]); - } else if (instr->opc == OPC_META_FI) { - struct ir3_instruction *src; - - foreach_ssa_src_n(src, i, instr) { - dump_link2(ctx, src, target, defer); - printdef(ctx, defer, "[label=\".%c\"]", - "xyzw"[i & 0x3]); - } - } else if (instr->opc == OPC_META_OUTPUT) { - printdef(ctx, defer, "output%lx:<out%u>:w -> %s", - PTRID(instr->inout.block), - instr->regs[0]->num, target); - } else if (instr->opc == OPC_META_PHI) { - /* treat like a normal instruction: */ - printdef(ctx, defer, "instr%lx:<dst0> -> %s", PTRID(instr), target); - } - } else { - printdef(ctx, defer, "instr%lx:<dst0> -> %s", PTRID(instr), target); - } -} - -static void dump_link(struct ir3_dump_ctx *ctx, - struct ir3_instruction *instr, - struct ir3_block *block, const char *target) -{ - bool defer = instr->block != block; - dump_link2(ctx, instr, target, defer); - printdef(ctx, defer, "\n"); -} - -static struct ir3_register *follow_flow(struct ir3_register *reg) -{ - if (reg->flags & IR3_REG_SSA) { - struct ir3_instruction *instr = reg->instr; - /* go with the flow.. 
*/ - if (is_meta(instr) && (instr->opc == OPC_META_FLOW)) - return instr->regs[1]; - } - return reg; -} - -static void ir3_instr_dump(struct ir3_dump_ctx *ctx, - struct ir3_instruction *instr) -{ - struct ir3_register *src; - - fprintf(ctx->f, "instr%lx [shape=record,style=filled,fillcolor=lightgrey,label=\"{", - PTRID(instr)); - dump_instr_name(ctx, instr); - - /* destination register: */ - fprintf(ctx->f, "|<dst0>"); - - /* source register(s): */ - foreach_src_n(src, i, instr) { - struct ir3_register *reg = follow_flow(src); - - fprintf(ctx->f, "|"); - - if (reg->flags & IR3_REG_SSA) - fprintf(ctx->f, "<src%u> ", i); - - dump_reg_name(ctx, reg, true); - } - - fprintf(ctx->f, "}\"];\n"); - - /* and recursively dump dependent instructions: */ - foreach_src_n(src, i, instr) { - struct ir3_register *reg = follow_flow(src); - char target[32]; /* link target */ - - if (!(reg->flags & IR3_REG_SSA)) - continue; - - snprintf(target, sizeof(target), "instr%lx:<src%u>", - PTRID(instr), i); - - dump_instr(ctx, reg->instr); - dump_link(ctx, reg->instr, instr->block, target); - } -} - -static void ir3_block_dump(struct ir3_dump_ctx *ctx, - struct ir3_block *block, const char *name) -{ - unsigned i, n; - - n = edge_buf.n; - - fprintf(ctx->f, "subgraph cluster%lx {\n", PTRID(block)); - fprintf(ctx->f, "label=\"%s\";\n", name); - - /* draw inputs: */ - fprintf(ctx->f, "input%lx [shape=record,label=\"inputs", PTRID(block)); - for (i = 0; i < block->ninputs; i++) - if (block->inputs[i]) - fprintf(ctx->f, "|<in%u> i%u.%c", i, (i >> 2), "xyzw"[i & 0x3]); - fprintf(ctx->f, "\"];\n"); - - /* draw instruction graph: */ - for (i = 0; i < block->noutputs; i++) - if (block->outputs[i]) - dump_instr(ctx, block->outputs[i]); - - /* draw outputs: */ - fprintf(ctx->f, "output%lx [shape=record,label=\"outputs", PTRID(block)); - for (i = 0; i < block->noutputs; i++) - fprintf(ctx->f, "|<out%u> o%u.%c", i, (i >> 2), "xyzw"[i & 0x3]); - fprintf(ctx->f, "\"];\n"); - - /* and links to outputs: */ - for (i = 0; i < block->noutputs; i++) { - char target[32]; /* link target */ - - /* NOTE: there could be outputs that are never assigned, - * so skip them - */ - if (!block->outputs[i]) - continue; - - snprintf(target, sizeof(target), "output%lx:<out%u>:e", - PTRID(block), i); - - dump_link(ctx, block->outputs[i], block, target); - } - - fprintf(ctx->f, "}\n"); - - /* and links to inputs: */ - if (block->parent) { - for (i = 0; i < block->ninputs; i++) { - char target[32]; /* link target */ - - if (!block->inputs[i]) - continue; - - dump_instr(ctx, block->inputs[i]); - - snprintf(target, sizeof(target), "input%lx:<in%u>:e", - PTRID(block), i); - - dump_link(ctx, block->inputs[i], block, target); - } - } - - /* dump deferred edges: */ - if (edge_buf.n > n) { - fprintf(ctx->f, "%*s", edge_buf.n - n, &edge_buf.buf[n]); - edge_buf.n = n; - } -} - -void ir3_dump(struct ir3 *shader, const char *name, - struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3? 
*/, - FILE *f) -{ - struct ir3_dump_ctx ctx = { - .f = f, - }; - ir3_clear_mark(shader); - fprintf(ctx.f, "digraph G {\n"); - fprintf(ctx.f, "rankdir=RL;\n"); - fprintf(ctx.f, "nodesep=0.25;\n"); - fprintf(ctx.f, "ranksep=1.5;\n"); - ir3_block_dump(&ctx, block, name); - fprintf(ctx.f, "}\n"); -} - -/* - * For Debugging: - */ - -void -ir3_dump_instr_single(struct ir3_instruction *instr) -{ - struct ir3_dump_ctx ctx = { - .f = stdout, - .verbose = true, - }; - unsigned i; - - dump_instr_name(&ctx, instr); - for (i = 0; i < instr->regs_count; i++) { - struct ir3_register *reg = instr->regs[i]; - printf(i ? ", " : " "); - dump_reg_name(&ctx, reg, !!i); - } - - if (instr->address) { - fprintf(ctx.f, ", address=_"); - fprintf(ctx.f, "["); - dump_instr_name(&ctx, instr->address); - fprintf(ctx.f, "]"); - } - - if (instr->fanin) { - fprintf(ctx.f, ", fanin=_"); - fprintf(ctx.f, "["); - dump_instr_name(&ctx, instr->fanin); - fprintf(ctx.f, "]"); - } - - if (is_meta(instr)) { - if (instr->opc == OPC_META_FO) { - printf(", off=%d", instr->fo.off); - } else if ((instr->opc == OPC_META_FI) && instr->fi.aid) { - printf(", aid=%d", instr->fi.aid); - } - } - - printf("\n"); -} - -void -ir3_dump_instr_list(struct ir3_instruction *instr) -{ - struct ir3_block *block = instr->block; - unsigned n = 0; - - while (instr) { - ir3_dump_instr_single(instr); - if (!is_meta(instr)) - n++; - instr = instr->next; - } - printf("%u instructions\n", n); - - for (n = 0; n < block->noutputs; n++) { - if (!block->outputs[n]) - continue; - printf("out%d: ", n); - ir3_dump_instr_single(block->outputs[n]); - } -} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_flatten.c b/src/gallium/drivers/freedreno/ir3/ir3_flatten.c deleted file mode 100644 index 419cd9dfcd4..00000000000 --- a/src/gallium/drivers/freedreno/ir3/ir3_flatten.c +++ /dev/null @@ -1,152 +0,0 @@ -/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ - -/* - * Copyright (C) 2014 Rob Clark <[email protected]> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark <[email protected]> - */ - -#include <stdarg.h> - -#include "ir3.h" - -/* - * Flatten: flatten out legs of if/else, etc - * - * TODO probably should use some heuristic to decide to not flatten - * if one side of the other is too large / deeply nested / whatever? 
- */ - -struct ir3_flatten_ctx { - struct ir3_block *block; - unsigned cnt; -}; - -static struct ir3_register *unwrap(struct ir3_register *reg) -{ - - if (reg->flags & IR3_REG_SSA) { - struct ir3_instruction *instr = reg->instr; - if (is_meta(instr)) { - switch (instr->opc) { - case OPC_META_OUTPUT: - case OPC_META_FLOW: - if (instr->regs_count > 1) - return instr->regs[1]; - return NULL; - default: - break; - } - } - } - return reg; -} - -static void ir3_instr_flatten(struct ir3_flatten_ctx *ctx, - struct ir3_instruction *instr) -{ - struct ir3_instruction *src; - - /* if we've already visited this instruction, bail now: */ - if (ir3_instr_check_mark(instr)) - return; - - instr->block = ctx->block; - - /* TODO: maybe some threshold to decide whether to - * flatten or not?? - */ - if (is_meta(instr)) { - if (instr->opc == OPC_META_PHI) { - struct ir3_register *cond, *t, *f; - - cond = unwrap(instr->regs[1]); - t = unwrap(instr->regs[2]); /* true val */ - f = unwrap(instr->regs[3]); /* false val */ - - /* must have cond, but t or f may be null if only written - * one one side of the if/else (in which case we can just - * convert the PHI to a simple move). - */ - assert(cond); - assert(t || f); - - if (t && f) { - /* convert the PHI instruction to sel.{b16,b32} */ - instr->category = 3; - - /* instruction type based on dst size: */ - if (instr->regs[0]->flags & IR3_REG_HALF) - instr->opc = OPC_SEL_B16; - else - instr->opc = OPC_SEL_B32; - - instr->regs[1] = t; - instr->regs[2] = cond; - instr->regs[3] = f; - } else { - /* convert to simple mov: */ - instr->category = 1; - instr->cat1.dst_type = TYPE_F32; - instr->cat1.src_type = TYPE_F32; - instr->regs_count = 2; - instr->regs[1] = t ? t : f; - } - - ctx->cnt++; - } else if ((instr->opc == OPC_META_INPUT) && - (instr->regs_count == 2)) { - type_t ftype; - - if (instr->regs[0]->flags & IR3_REG_HALF) - ftype = TYPE_F16; - else - ftype = TYPE_F32; - - /* convert meta:input to mov: */ - instr->category = 1; - instr->cat1.src_type = ftype; - instr->cat1.dst_type = ftype; - } - } - - /* recursively visit children: */ - foreach_ssa_src(src, instr) - ir3_instr_flatten(ctx, src); -} - -/* return >= 0 is # of phi's flattened, < 0 is error */ -int ir3_block_flatten(struct ir3_block *block) -{ - struct ir3_flatten_ctx ctx = { - .block = block, - }; - unsigned i; - - ir3_clear_mark(block->shader); - for(i = 0; i < block->noutputs; i++) - if (block->outputs[i]) - ir3_instr_flatten(&ctx, block->outputs[i]); - - return ctx.cnt; -} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_group.c b/src/gallium/drivers/freedreno/ir3/ir3_group.c index 782f6e87e56..70d9b08e019 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_group.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_group.c @@ -34,35 +34,6 @@ * Find/group instruction neighbors: */ -/* stop condition for iteration: */ -static bool check_stop(struct ir3_instruction *instr) -{ - if (ir3_instr_check_mark(instr)) - return true; - - /* stay within the block.. 
don't try to operate across - * basic block boundaries or we'll have problems when - * dealing with multiple basic blocks: - */ - if (is_meta(instr) && (instr->opc == OPC_META_INPUT)) - return true; - - return false; -} - -static struct ir3_instruction * create_mov(struct ir3_instruction *instr) -{ - struct ir3_instruction *mov; - - mov = ir3_instr_create(instr->block, 1, 0); - mov->cat1.src_type = TYPE_F32; - mov->cat1.dst_type = TYPE_F32; - ir3_reg_create(mov, 0, 0); /* dst */ - ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = instr; - - return mov; -} - /* bleh.. we need to do the same group_n() thing for both inputs/outputs * (where we have a simple instr[] array), and fanin nodes (where we have * an extra indirection via reg->instr). @@ -78,7 +49,8 @@ static struct ir3_instruction *arr_get(void *arr, int idx) } static void arr_insert_mov_out(void *arr, int idx, struct ir3_instruction *instr) { - ((struct ir3_instruction **)arr)[idx] = create_mov(instr); + ((struct ir3_instruction **)arr)[idx] = + ir3_MOV(instr->block, instr, TYPE_F32); } static void arr_insert_mov_in(void *arr, int idx, struct ir3_instruction *instr) { @@ -111,14 +83,17 @@ static struct ir3_instruction *instr_get(void *arr, int idx) { return ssa(((struct ir3_instruction *)arr)->regs[idx+1]); } -static void instr_insert_mov(void *arr, int idx, struct ir3_instruction *instr) +static void +instr_insert_mov(void *arr, int idx, struct ir3_instruction *instr) { - ((struct ir3_instruction *)arr)->regs[idx+1]->instr = create_mov(instr); + ((struct ir3_instruction *)arr)->regs[idx+1]->instr = + ir3_MOV(instr->block, instr, TYPE_F32); } static struct group_ops instr_ops = { instr_get, instr_insert_mov }; -static void group_n(struct group_ops *ops, void *arr, unsigned n) +static void +group_n(struct group_ops *ops, void *arr, unsigned n) { unsigned i, j; @@ -141,6 +116,10 @@ restart: conflict = conflicts(instr->cp.left, left) || conflicts(instr->cp.right, right); + /* RA can't yet deal very well w/ group'd phi's: */ + if (is_meta(instr) && (instr->opc == OPC_META_PHI)) + conflict = true; + /* we also can't have an instr twice in the group: */ for (j = i + 1; (j < n) && !conflict; j++) if (ops->get(arr, j) == instr) @@ -181,11 +160,12 @@ restart: } } -static void instr_find_neighbors(struct ir3_instruction *instr) +static void +instr_find_neighbors(struct ir3_instruction *instr) { struct ir3_instruction *src; - if (check_stop(instr)) + if (ir3_instr_check_mark(instr)) return; if (is_meta(instr) && (instr->opc == OPC_META_FI)) @@ -200,7 +180,8 @@ static void instr_find_neighbors(struct ir3_instruction *instr) * we need to insert dummy/padding instruction for grouping, and * then take it back out again before anyone notices. 
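That pad-and-unpad dance is easy to miss in the collapsed diff, so here is the shape of it in isolation. This is a minimal sketch with stand-in types (plain strings instead of ir3_instruction, and no real ir3_NOP()): the mask records which vec4 slots were filled with dummies so they can be cleared again after grouping runs.

#include <stdio.h>

int main(void)
{
	const char *slot[4] = { "x", NULL, "z", NULL };	/* a partially-written vec4 */
	unsigned mask = 0;
	int i;

	/* pad the holes and remember where the dummies went: */
	for (i = 0; i < 4; i++) {
		if (!slot[i]) {
			slot[i] = "dummy";	/* stand-in for the inserted nop */
			mask |= (1u << i);
		}
	}

	/* ... group_n() would run here, seeing a full vec4 ... */

	/* take the padding back out before anyone notices: */
	for (i = 0; i < 4; i++)
		if (mask & (1u << i))
			slot[i] = NULL;

	for (i = 0; i < 4; i++)
		printf("slot[%d] = %s\n", i, slot[i] ? slot[i] : "(empty)");
	return 0;
}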
*/ -static void pad_and_group_input(struct ir3_instruction **input, unsigned n) +static void +pad_and_group_input(struct ir3_instruction **input, unsigned n) { int i, mask = 0; struct ir3_block *block = NULL; @@ -210,8 +191,8 @@ static void pad_and_group_input(struct ir3_instruction **input, unsigned n) if (instr) { block = instr->block; } else if (block) { - instr = ir3_instr_create(block, 0, OPC_NOP); - ir3_reg_create(instr, 0, IR3_REG_SSA); /* dst */ + instr = ir3_NOP(block); + ir3_reg_create(instr, 0, IR3_REG_SSA); /* dummy dst */ input[i] = instr; mask |= (1 << i); } @@ -225,42 +206,41 @@ static void pad_and_group_input(struct ir3_instruction **input, unsigned n) } } -static void block_find_neighbors(struct ir3_block *block) +static void +find_neighbors(struct ir3 *ir) { unsigned i; - for (i = 0; i < block->noutputs; i++) { - if (block->outputs[i]) { - struct ir3_instruction *instr = block->outputs[i]; - instr_find_neighbors(instr); - } - } - /* shader inputs/outputs themselves must be contiguous as well: + * + * NOTE: group inputs first, since we only insert mov's + * *before* the conflicted instr (and that would go badly + * for inputs). By doing inputs first, we should never + * have a conflict on inputs.. pushing any conflict to + * resolve to the outputs, for stuff like: + * + * MOV OUT[n], IN[m].wzyx + * + * NOTE: we assume here inputs/outputs are grouped in vec4. + * This logic won't quite cut it if we don't align smaller + * on vec4 boundaries */ - if (!block->parent) { - /* NOTE: group inputs first, since we only insert mov's - * *before* the conflicted instr (and that would go badly - * for inputs). By doing inputs first, we should never - * have a conflict on inputs.. pushing any conflict to - * resolve to the outputs, for stuff like: - * - * MOV OUT[n], IN[m].wzyx - * - * NOTE: we assume here inputs/outputs are grouped in vec4. - * This logic won't quite cut it if we don't align smaller - * on vec4 boundaries - */ - for (i = 0; i < block->ninputs; i += 4) - pad_and_group_input(&block->inputs[i], 4); - for (i = 0; i < block->noutputs; i += 4) - group_n(&arr_ops_out, &block->outputs[i], 4); - + for (i = 0; i < ir->ninputs; i += 4) + pad_and_group_input(&ir->inputs[i], 4); + for (i = 0; i < ir->noutputs; i += 4) + group_n(&arr_ops_out, &ir->outputs[i], 4); + + for (i = 0; i < ir->noutputs; i++) { + if (ir->outputs[i]) { + struct ir3_instruction *instr = ir->outputs[i]; + instr_find_neighbors(instr); + } } } -void ir3_block_group(struct ir3_block *block) +void +ir3_group(struct ir3 *ir) { - ir3_clear_mark(block->shader); - block_find_neighbors(block); + ir3_clear_mark(ir); + find_neighbors(ir); } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c index 2455f7e4efc..f4a4223ae17 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c @@ -26,7 +26,6 @@ * Rob Clark <[email protected]> */ -#include "pipe/p_shader_tokens.h" #include "util/u_math.h" #include "freedreno_util.h" @@ -43,20 +42,31 @@ */ struct ir3_legalize_ctx { - struct ir3_block *block; bool has_samp; int max_bary; }; -static void legalize(struct ir3_legalize_ctx *ctx) +/* We want to evaluate each block from the position of any other + * predecessor block, in order that the flags set are the union + * of all possible program paths. For stopping condition, we + * want to stop when the pair of <pred-block, current-block> has + * been visited already. + * + * XXX is that completely true? 
We could have different needs_xyz + * flags set depending on path leading to pred-block.. we could + * do *most* of this based on chasing src instructions ptrs (and + * following all phi srcs).. except the write-after-read hazard. + * + * For now we just set ss/sy flag on first instruction on block, + * and handle everything within the block as before. + */ + +static void +legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) { - struct ir3_block *block = ctx->block; - struct ir3_instruction *n; - struct ir3 *shader = block->shader; - struct ir3_instruction *end = - ir3_instr_create(block, 0, OPC_END); struct ir3_instruction *last_input = NULL; struct ir3_instruction *last_rel = NULL; + struct list_head instr_list; regmask_t needs_ss_war; /* write after read */ regmask_t needs_ss; regmask_t needs_sy; @@ -65,9 +75,13 @@ static void legalize(struct ir3_legalize_ctx *ctx) regmask_init(&needs_ss); regmask_init(&needs_sy); - shader->instrs_count = 0; + /* remove all the instructions from the list, we'll be adding + * them back in as we go + */ + list_replace(&block->instr_list, &instr_list); + list_inithead(&block->instr_list); - for (n = block->head; n; n = n->next) { + list_for_each_entry_safe (struct ir3_instruction, n, &instr_list, node) { struct ir3_register *reg; unsigned i; @@ -134,18 +148,18 @@ static void legalize(struct ir3_legalize_ctx *ctx) */ if ((n->flags & IR3_INSTR_SS) && (n->category >= 5)) { struct ir3_instruction *nop; - nop = ir3_instr_create(block, 0, OPC_NOP); + nop = ir3_NOP(block); nop->flags |= IR3_INSTR_SS; n->flags &= ~IR3_INSTR_SS; } /* need to be able to set (ss) on first instruction: */ - if ((shader->instrs_count == 0) && (n->category >= 5)) - ir3_instr_create(block, 0, OPC_NOP); + if (list_empty(&block->instr_list) && (n->category >= 5)) + ir3_NOP(block); - if (is_nop(n) && shader->instrs_count) { - struct ir3_instruction *last = - shader->instrs[shader->instrs_count-1]; + if (is_nop(n) && !list_empty(&block->instr_list)) { + struct ir3_instruction *last = list_last_entry(&block->instr_list, + struct ir3_instruction, node); if (is_nop(last) && (last->repeat < 5)) { last->repeat++; last->flags |= n->flags; @@ -153,7 +167,7 @@ static void legalize(struct ir3_legalize_ctx *ctx) } } - shader->instrs[shader->instrs_count++] = n; + list_addtail(&n->node, &block->instr_list); if (is_sfu(n)) regmask_set(&needs_ss, n->regs[0]); @@ -192,35 +206,20 @@ static void legalize(struct ir3_legalize_ctx *ctx) * the (ei) flag: */ if (is_mem(last_input) && (last_input->opc == OPC_LDLV)) { - int i, cnt; - - /* note that ir3_instr_create() inserts into - * shader->instrs[] and increments the count.. - * so we need to bump up the cnt initially (to - * avoid it clobbering the last real instr) and - * restore it after. - */ - cnt = ++shader->instrs_count; + struct ir3_instruction *baryf; - /* inserting instructions would be a bit nicer if list..
*/ - for (i = cnt - 2; i >= 0; i--) { - if (shader->instrs[i] == last_input) { + /* (ss)bary.f (ei)r63.x, 0, r0.x */ + baryf = ir3_instr_create(block, 2, OPC_BARY_F); + baryf->flags |= IR3_INSTR_SS; + ir3_reg_create(baryf, regid(63, 0), 0); + ir3_reg_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0; + ir3_reg_create(baryf, regid(0, 0), 0); - /* (ss)bary.f (ei)r63.x, 0, r0.x */ - last_input = ir3_instr_create(block, 2, OPC_BARY_F); - last_input->flags |= IR3_INSTR_SS; - ir3_reg_create(last_input, regid(63, 0), 0); - ir3_reg_create(last_input, 0, IR3_REG_IMMED)->iim_val = 0; - ir3_reg_create(last_input, regid(0, 0), 0); + /* insert the dummy bary.f after last_input: */ + list_delinit(&baryf->node); + list_add(&baryf->node, &last_input->node); - shader->instrs[i + 1] = last_input; - - break; - } - shader->instrs[i + 1] = shader->instrs[i]; - } - - shader->instrs_count = cnt; + last_input = baryf; } last_input->regs[0]->flags |= IR3_REG_EI; } @@ -228,21 +227,177 @@ static void legalize(struct ir3_legalize_ctx *ctx) if (last_rel) last_rel->flags |= IR3_INSTR_UL; - shader->instrs[shader->instrs_count++] = end; + list_first_entry(&block->instr_list, struct ir3_instruction, node) + ->flags |= IR3_INSTR_SS | IR3_INSTR_SY; +} + +/* NOTE: branch instructions are always the last instruction(s) + * in the block. We take advantage of this as we resolve the + * branches, since "if (foo) break;" constructs turn into + * something like: + * + * block3 { + * ... + * 0029:021: mov.s32s32 r62.x, r1.y + * 0082:022: br !p0.x, target=block5 + * 0083:023: br p0.x, target=block4 + * // succs: if _[0029:021: mov.s32s32] block4; else block5; + * } + * block4 { + * 0084:024: jump, target=block6 + * // succs: block6; + * } + * block5 { + * 0085:025: jump, target=block7 + * // succs: block7; + * } + * + * ie. only instruction in block4/block5 is a jump, so when + * resolving branches we can easily detect this by checking + * that the first instruction in the target block is itself + * a jump, and setup the br directly to the jump's target + * (and strip back out the now unreached jump) + * + * TODO sometimes we end up with things like: + * + * br !p0.x, #2 + * br p0.x, #12 + * add.u r0.y, r0.y, 1 + * + * If we swapped the order of the branches, we could drop one. 
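The threading described in that comment can be traced with a toy model. The sketch below uses simplified stand-in types, not the driver's structs; the real check is resolve_dest_block() just after this, which additionally handles completely empty blocks:

#include <stdio.h>

struct block {
	struct block *succ[2];	/* successors[0..1] */
	int only_jump;		/* block contains nothing but "jump succ[0]" */
	int id;
};

/* follow a branch through a trivial jump-only block (one step,
 * like resolve_dest_block()):
 */
static struct block *resolve_dest(struct block *b)
{
	if (b->succ[0] && !b->succ[1] && b->only_jump)
		return b->succ[0];
	return b;
}

int main(void)
{
	struct block b6 = { { NULL, NULL }, 0, 6 };
	struct block b4 = { { &b6, NULL }, 1, 4 };	/* only instr: "jump, target=block6" */
	struct block b3 = { { &b4, NULL }, 0, 3 };	/* ends in "br p0.x, target=block4" */

	/* the br in block3 can aim straight past block4's lone jump: */
	printf("br target: block%d\n", resolve_dest(b3.succ[0])->id);
	return 0;
}

Running it prints "br target: block6", matching the comment's block3/block4 example once the now-unreached jump in block4 is stripped back out.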
+ */ +static struct ir3_block * +resolve_dest_block(struct ir3_block *block) +{ + /* special case for last block: */ + if (!block->successors[0]) + return block; + + /* NOTE that we may or may not have inserted the jump + * in the target block yet, so conditions to resolve + * the dest to the dest block's successor are: + * + * (1) successor[1] == NULL && + * (2) (block-is-empty || only-instr-is-jump) + */ + if (block->successors[1] == NULL) { + if (list_empty(&block->instr_list)) { + return block->successors[0]; + } else if (list_length(&block->instr_list) == 1) { + struct ir3_instruction *instr = list_first_entry( + &block->instr_list, struct ir3_instruction, node); + if (is_flow(instr) && (instr->opc == OPC_JUMP)) + return block->successors[0]; + } + } + return block; +} + +static bool +resolve_jump(struct ir3_instruction *instr) +{ + struct ir3_block *tblock = + resolve_dest_block(instr->cat0.target); + struct ir3_instruction *target; + + if (tblock != instr->cat0.target) { + list_delinit(&instr->cat0.target->node); + instr->cat0.target = tblock; + return true; + } + + target = list_first_entry(&tblock->instr_list, + struct ir3_instruction, node); + + if ((!target) || (target->ip == (instr->ip + 1))) { + list_delinit(&instr->node); + return true; + } else { + instr->cat0.immed = + (int)target->ip - (int)instr->ip; + } + return false; +} + +/* resolve jumps, removing jumps/branches to immediately following + * instruction which we end up with from earlier stages. Since + * removing an instruction can invalidate earlier instruction's + * branch offsets, we need to do this iteratively until no more + * branches are removed. + */ +static bool +resolve_jumps(struct ir3 *ir) +{ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) + if (is_flow(instr) && instr->cat0.target) + if (resolve_jump(instr)) + return true; + + return false; +} - shader->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY; +/* we want to mark points where divergent flow control re-converges + * with (jp) flags. For now, since we don't do any optimization for + * things that start out as a 'do {} while()', re-convergence points + * will always be a branch or jump target. Note that this is overly + * conservative, since unconditional jump targets are not convergence + * points, we are just assuming that the other path to reach the jump + * target was divergent. If we were clever enough to optimize the + * jump at end of a loop back to a conditional branch into a single + * conditional branch, ie. like: + * + * add.f r1.w, r0.x, (neg)(r)c2.x <= loop start + * mul.f r1.z, r1.z, r0.x + * mul.f r1.y, r1.y, r0.x + * mul.f r0.z, r1.x, r0.x + * mul.f r0.w, r0.y, r0.x + * cmps.f.ge r0.x, (r)c2.y, (r)r1.w + * add.s r0.x, (r)r0.x, (r)-1 + * sel.f32 r0.x, (r)c3.y, (r)r0.x, c3.x + * cmps.f.eq p0.x, r0.x, c3.y + * mov.f32f32 r0.x, r1.w + * mov.f32f32 r0.y, r0.w + * mov.f32f32 r1.x, r0.z + * (rpt2)nop + * br !p0.x, #-13 + * (jp)mul.f r0.x, c263.y, r1.y + * + * Then we'd have to be more clever, as the convergence point is no + * longer a branch or jump target. 
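Since stripping one jump renumbers every later instruction, the pass re-runs ir3_count_instructions() and retries until a whole sweep removes nothing (the do/while at the end of ir3_legalize() below). The same fixpoint in miniature, over a toy instruction array with illustrative names rather than driver API:

#include <stdbool.h>
#include <stdio.h>

#define N 8

/* toy program: instr i branches to target[i] (-1 = not a branch) */
static int target[N] = { -1, 2, -1, 4, -1, -1, -1, -1 };
static bool live[N]  = { 1, 1, 1, 1, 1, 1, 1, 1 };

/* one sweep: drop any branch whose target is its own fall-through */
static bool resolve_once(void)
{
	int ips[N], ip = 0;

	/* recompute ips, as ir3_count_instructions() does: */
	for (int i = 0; i < N; i++)
		ips[i] = live[i] ? ip++ : -1;

	for (int i = 0; i < N; i++) {
		if (live[i] && (target[i] >= 0) &&
				(ips[target[i]] == ips[i] + 1)) {
			live[i] = false;	/* branch to next instr: remove it */
			return true;		/* ips are stale now, start over */
		}
	}
	return false;
}

int main(void)
{
	while (resolve_once())
		;
	for (int i = 0; i < N; i++)
		if (live[i])
			printf("%02d: %s\n", i, (target[i] >= 0) ? "br" : "alu");
	return 0;
}

Here instr 1's branch to instr 2 and instr 3's branch to instr 4 are both fall-throughs, and each removal forces a fresh sweep because every later ip shifts down by one.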
+ */ +static void +mark_convergence_points(struct ir3 *ir) +{ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + if (is_flow(instr) && instr->cat0.target) { + struct ir3_instruction *target = + list_first_entry(&instr->cat0.target->instr_list, + struct ir3_instruction, node); + target->flags |= IR3_INSTR_JP; + } + } + } } -void ir3_block_legalize(struct ir3_block *block, - bool *has_samp, int *max_bary) +void +ir3_legalize(struct ir3 *ir, bool *has_samp, int *max_bary) { struct ir3_legalize_ctx ctx = { - .block = block, .max_bary = -1, }; - legalize(&ctx); + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + legalize_block(&ctx, block); + } *has_samp = ctx.has_samp; *max_bary = ctx.max_bary; + + do { + ir3_count_instructions(ir); + } while(resolve_jumps(ir)); + + mark_convergence_points(ir); } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c index ae36019ed5f..dc9e4626f27 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c @@ -74,14 +74,13 @@ valid_dest(nir_block *block, nir_dest *dest) * (so this is run iteratively in a loop). Therefore if * we get this far, it should not have any if_uses: */ - assert(dest->ssa.if_uses->entries == 0); + assert(list_empty(&dest->ssa.if_uses)); /* The only uses of this definition must be phi's in the * successor or in the current block */ - struct set_entry *entry; - set_foreach(dest->ssa.uses, entry) { - const nir_instr *dest_instr = entry->key; + nir_foreach_use(&dest->ssa, use) { + nir_instr *dest_instr = use->parent_instr; if (dest_instr->block == block) continue; if ((dest_instr->type == nir_instr_type_phi) && diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c new file mode 100644 index 00000000000..f377982dd5e --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_print.c @@ -0,0 +1,237 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: + * Rob Clark <[email protected]> + */ + +#include <stdarg.h> +#include <stdio.h> + +#include "ir3.h" + +#define PTRID(x) ((unsigned long)(x)) + +static void print_instr_name(struct ir3_instruction *instr) +{ +#ifdef DEBUG + printf("%04u:", instr->serialno); +#endif + printf("%03u: ", instr->depth); + + if (instr->flags & IR3_INSTR_SY) + printf("(sy)"); + if (instr->flags & IR3_INSTR_SS) + printf("(ss)"); + + if (is_meta(instr)) { + switch(instr->opc) { + case OPC_META_PHI: + printf("Φ"); + break; + default: + /* shouldn't hit here.. just for debugging: */ + switch (instr->opc) { + case OPC_META_INPUT: printf("_meta:in"); break; + case OPC_META_FO: printf("_meta:fo"); break; + case OPC_META_FI: printf("_meta:fi"); break; + + default: printf("_meta:%d", instr->opc); break; + } + break; + } + } else if (instr->category == 1) { + static const char *type[] = { + [TYPE_F16] = "f16", + [TYPE_F32] = "f32", + [TYPE_U16] = "u16", + [TYPE_U32] = "u32", + [TYPE_S16] = "s16", + [TYPE_S32] = "s32", + [TYPE_U8] = "u8", + [TYPE_S8] = "s8", + }; + if (instr->cat1.src_type == instr->cat1.dst_type) + printf("mov"); + else + printf("cov"); + printf(".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]); + } else { + printf("%s", ir3_instr_name(instr)); + if (instr->flags & IR3_INSTR_3D) + printf(".3d"); + if (instr->flags & IR3_INSTR_A) + printf(".a"); + if (instr->flags & IR3_INSTR_O) + printf(".o"); + if (instr->flags & IR3_INSTR_P) + printf(".p"); + if (instr->flags & IR3_INSTR_S) + printf(".s"); + if (instr->flags & IR3_INSTR_S2EN) + printf(".s2en"); + } +} + +static void print_reg_name(struct ir3_register *reg, bool followssa) +{ + if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) && + (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))) + printf("(absneg)"); + else if (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)) + printf("(neg)"); + else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) + printf("(abs)"); + + if (reg->flags & IR3_REG_IMMED) { + printf("imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val); + } else if (reg->flags & IR3_REG_SSA) { + printf("_"); + if (followssa) { + printf("["); + print_instr_name(reg->instr); + printf("]"); + } + } else if (reg->flags & IR3_REG_RELATIV) { + if (reg->flags & IR3_REG_HALF) + printf("h"); + if (reg->flags & IR3_REG_CONST) + printf("c<a0.x + %u>", reg->num); + else + printf("\x1b[0;31mr<a0.x + %u>\x1b[0m (%u)", reg->num, reg->size); + } else { + if (reg->flags & IR3_REG_HALF) + printf("h"); + if (reg->flags & IR3_REG_CONST) + printf("c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]); + else + printf("\x1b[0;31mr%u.%c\x1b[0m", reg_num(reg), "xyzw"[reg_comp(reg)]); + } +} + +static void +tab(int lvl) +{ + for (int i = 0; i < lvl; i++) + printf("\t"); +} + +static uint32_t +block_id(struct ir3_block *block) +{ +#ifdef DEBUG + return block->serialno; +#else + return (uint32_t)(uint64_t)block; +#endif +} + +static void +print_instr(struct ir3_instruction *instr, int lvl) +{ + unsigned i; + + tab(lvl); + + print_instr_name(instr); + for (i = 0; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + printf(i ? 
", " : " "); + print_reg_name(reg, !!i); + } + + if (instr->address) { + printf(", address=_"); + printf("["); + print_instr_name(instr->address); + printf("]"); + } + + if (instr->fanin) { + printf(", fanin=_"); + printf("["); + print_instr_name(instr->fanin); + printf("]"); + } + + if (is_meta(instr)) { + if (instr->opc == OPC_META_FO) { + printf(", off=%d", instr->fo.off); + } else if ((instr->opc == OPC_META_FI) && instr->fi.aid) { + printf(", aid=%d", instr->fi.aid); + } + } + + if (is_flow(instr) && instr->cat0.target) { + /* the predicate register src is implied: */ + if (instr->opc == OPC_BR) { + printf(" %sp0.x", instr->cat0.inv ? "!" : ""); + } + printf(", target=block%u", block_id(instr->cat0.target)); + } + + printf("\n"); +} + +void ir3_print_instr(struct ir3_instruction *instr) +{ + print_instr(instr, 0); +} + +static void +print_block(struct ir3_block *block, int lvl) +{ + tab(lvl); printf("block%u {\n", block_id(block)); + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + print_instr(instr, lvl+1); + } + if (block->successors[1]) { + /* leading into if/else: */ + tab(lvl+1); + printf("/* succs: if _["); + print_instr_name(block->condition); + printf("] block%u; else block%u; */\n", + block_id(block->successors[0]), + block_id(block->successors[1])); + } else if (block->successors[0]) { + tab(lvl+1); + printf("/* succs: block%u; */\n", + block_id(block->successors[0])); + } + tab(lvl); printf("}\n"); +} + +void +ir3_print(struct ir3 *ir) +{ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) + print_block(block, 0); + + for (unsigned i = 0; i < ir->noutputs; i++) { + if (!ir->outputs[i]) + continue; + printf("out%d: ", i); + print_instr(ir->outputs[i], 0); + } +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c index a4235a77a15..e5aba859fab 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c @@ -26,284 +26,702 @@ * Rob Clark <[email protected]> */ -#include "pipe/p_shader_tokens.h" #include "util/u_math.h" +#include "util/register_allocate.h" +#include "util/ralloc.h" +#include "util/bitset.h" #include "ir3.h" +#include "ir3_compiler.h" /* * Register Assignment: * - * NOTE: currently only works on a single basic block.. need to think - * about how multiple basic blocks are going to get scheduled. But - * I think I want to re-arrange how blocks work, ie. get rid of the - * block nesting thing.. + * Uses the register_allocate util, which implements graph coloring + * algo with interference classes. To handle the cases where we need + * consecutive registers (for example, texture sample instructions), + * we model these as larger (double/quad/etc) registers which conflict + * with the corresponding registers in other classes. * - * NOTE: we could do register coalescing (eliminate moves) as part of - * the RA step.. OTOH I think we need to do scheduling before register - * assignment. And if we remove a mov that effects scheduling (unless - * we leave a placeholder nop, which seems lame), so I'm not really - * sure how practical this is to do both in a single stage. But OTOH - * I'm not really sure a sane way for the CP stage to realize when it - * cannot remove a mov due to multi-register constraints.. + * Additionally we create additional classes for half-regs, which + * do not conflict with the full-reg classes. We do need at least + * sizes 1-4 (to deal w/ texture sample instructions output to half- + * reg). 
At the moment we don't create the higher order half-reg + * classes as half-reg frequently does not have enough precision + * for texture coords at higher resolutions. * - * NOTE: http://scopesconf.org/scopes-01/paper/session1_2.ps.gz has - * some ideas to handle array allocation with a more conventional - * graph coloring algorithm for register assignment, which might be - * a good alternative to the current algo. However afaict it cannot - * handle overlapping arrays, which is a scenario that we have to - * deal with + * There are some additional cases that we need to handle specially, + * as the graph coloring algo doesn't understand "partial writes". + * For example, a sequence like: + * + * add r0.z, ... + * sam (f32)(xy)r0.x, ... + * ... + * sam (f32)(xyzw)r0.w, r0.x, ... ; 3d texture, so r0.xyz are coord + * + * In this scenario, we treat r0.xyz as class size 3, which is written + * (from a use/def perspective) at the 'add' instruction and ignore the + * subsequent partial writes to r0.xy. So the 'add r0.z, ...' is the + * defining instruction, as it is the first to partially write r0.xyz. + * + * Note i965 has a similar scenario, which they solve with a virtual + * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after + * register assignment. But for us that is horrible from a scheduling + * standpoint. Instead what we do is use idea of 'definer' instruction. + * Ie. the first instruction (lowest ip) to write to the array is the + * one we consider from use/def perspective when building interference + * graph. (Other instructions which write other array elements just + * define the variable some more.) + */ + +static const unsigned class_sizes[] = { + 1, 2, 3, 4, + 4 + 4, /* txd + 1d/2d */ + 4 + 6, /* txd + 3d */ + /* temporary: until we can assign arrays, create classes so we + * can round up array to fit. NOTE with tgsi arrays should + * really all be multiples of four: + */ + 4 * 4, + 4 * 8, + 4 * 16, + 4 * 32, + +}; +#define class_count ARRAY_SIZE(class_sizes) + +static const unsigned half_class_sizes[] = { + 1, 2, 3, 4, +}; +#define half_class_count ARRAY_SIZE(half_class_sizes) +#define total_class_count (class_count + half_class_count) + +/* Below a0.x are normal regs. RA doesn't need to assign a0.x/p0.x. */ +#define NUM_REGS (4 * (REG_A0 - 1)) +/* Number of virtual regs in a given class: */ +#define CLASS_REGS(i) (NUM_REGS - (class_sizes[i] - 1)) +#define HALF_CLASS_REGS(i) (NUM_REGS - (half_class_sizes[i] - 1)) + +/* register-set, created one time, used for all shaders: */ +struct ir3_ra_reg_set { + struct ra_regs *regs; + unsigned int classes[class_count]; + unsigned int half_classes[half_class_count]; + /* maps flat virtual register space to base gpr: */ + uint16_t *ra_reg_to_gpr; + /* maps cls,gpr to flat virtual register space: */ + uint16_t **gpr_to_ra_reg; +}; + +/* One-time setup of RA register-set, which describes all the possible + * "virtual" registers and their interferences. Ie. double register + * occupies (and conflicts with) two single registers, and so forth. + * Since registers do not need to be aligned to their class size, they + * can conflict with other registers in the same class too. Ie: + * + * Single (base) | Double + * --------------+--------------- + * R0 | D0 + * R1 | D0 D1 + * R2 | D1 D2 + * R3 | D2 + * .. and so on.. + * + * (NOTE the disassembler uses notation like r0.x/y/z/w but those are + * really just four scalar registers. Don't let that confuse you.) 
*/ +struct ir3_ra_reg_set * +ir3_ra_alloc_reg_set(void *memctx) +{ + struct ir3_ra_reg_set *set = rzalloc(memctx, struct ir3_ra_reg_set); + unsigned ra_reg_count, reg, first_half_reg; + unsigned int **q_values; + + /* calculate # of regs across all classes: */ + ra_reg_count = 0; + for (unsigned i = 0; i < class_count; i++) + ra_reg_count += CLASS_REGS(i); + for (unsigned i = 0; i < half_class_count; i++) + ra_reg_count += HALF_CLASS_REGS(i); + + /* allocate and populate q_values: */ + q_values = ralloc_array(set, unsigned *, total_class_count); + for (unsigned i = 0; i < class_count; i++) { + q_values[i] = rzalloc_array(q_values, unsigned, total_class_count); + + /* From register_allocate.c: + * + * q(B,C) (indexed by C, B is this register class) in + * Runeson/Nyström paper. This is "how many registers of B could + * the worst choice register from C conflict with". + * + * If we just let the register allocation algorithm compute these + * values, is extremely expensive. However, since all of our + * registers are laid out, we can very easily compute them + * ourselves. View the register from C as fixed starting at GRF n + * somewhere in the middle, and the register from B as sliding back + * and forth. Then the first register to conflict from B is the + * one starting at n - class_size[B] + 1 and the last register to + * conflict will start at n + class_size[B] - 1. Therefore, the + * number of conflicts from B is class_size[B] + class_size[C] - 1. + * + * +-+-+-+-+-+-+ +-+-+-+-+-+-+ + * B | | | | | |n| --> | | | | | | | + * +-+-+-+-+-+-+ +-+-+-+-+-+-+ + * +-+-+-+-+-+ + * C |n| | | | | + * +-+-+-+-+-+ + * + * (Idea copied from brw_fs_reg_allocate.cpp) + */ + for (unsigned j = 0; j < class_count; j++) + q_values[i][j] = class_sizes[i] + class_sizes[j] - 1; + } + + for (unsigned i = class_count; i < total_class_count; i++) { + q_values[i] = ralloc_array(q_values, unsigned, total_class_count); + + /* see comment above: */ + for (unsigned j = class_count; j < total_class_count; j++) { + q_values[i][j] = half_class_sizes[i - class_count] + + half_class_sizes[j - class_count] - 1; + } + } + /* allocate the reg-set.. */ + set->regs = ra_alloc_reg_set(set, ra_reg_count); + set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count); + set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count); + + /* .. 
and classes */ + reg = 0; + for (unsigned i = 0; i < class_count; i++) { + set->classes[i] = ra_alloc_reg_class(set->regs); + + set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i)); + + for (unsigned j = 0; j < CLASS_REGS(i); j++) { + ra_class_add_reg(set->regs, set->classes[i], reg); + + set->ra_reg_to_gpr[reg] = j; + set->gpr_to_ra_reg[i][j] = reg; + + for (unsigned br = j; br < j + class_sizes[i]; br++) + ra_add_transitive_reg_conflict(set->regs, br, reg); + + reg++; + } + } + + first_half_reg = reg; + + for (unsigned i = 0; i < half_class_count; i++) { + set->half_classes[i] = ra_alloc_reg_class(set->regs); + + set->gpr_to_ra_reg[class_count + i] = + ralloc_array(set, uint16_t, CLASS_REGS(i)); + + for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) { + ra_class_add_reg(set->regs, set->half_classes[i], reg); + + set->ra_reg_to_gpr[reg] = j; + set->gpr_to_ra_reg[class_count + i][j] = reg; + + for (unsigned br = j; br < j + half_class_sizes[i]; br++) + ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg); + + reg++; + } + } + + ra_set_finalize(set->regs, q_values); + + ralloc_free(q_values); + + return set; +} + +/* register-assign context, per-shader */ struct ir3_ra_ctx { - struct ir3_block *block; + struct ir3 *ir; enum shader_t type; - bool frag_coord; bool frag_face; - int cnt; - bool error; - struct { - unsigned base; - unsigned size; - } arrays[MAX_ARRAYS]; + + struct ir3_ra_reg_set *set; + struct ra_graph *g; + unsigned alloc_count; + unsigned class_alloc_count[total_class_count]; + unsigned class_base[total_class_count]; + unsigned instr_cnt; + unsigned *def, *use; /* def/use table */ }; -#ifdef DEBUG -# include "freedreno_util.h" -# define ra_debug (fd_mesa_debug & FD_DBG_OPTMSGS) -#else -# define ra_debug 0 -#endif - -#define ra_dump_list(msg, n) do { \ - if (ra_debug) { \ - debug_printf("-- " msg); \ - ir3_dump_instr_list(n); \ - } \ - } while (0) - -#define ra_dump_instr(msg, n) do { \ - if (ra_debug) { \ - debug_printf(">> " msg); \ - ir3_dump_instr_single(n); \ - } \ - } while (0) - -#define ra_assert(ctx, x) do { \ - debug_assert(x); \ - if (!(x)) { \ - debug_printf("RA: failed assert: %s\n", #x); \ - (ctx)->error = true; \ - }; \ - } while (0) - - -/* sorta ugly way to retrofit half-precision support.. rather than - * passing extra param around, just OR in a high bit. All the low - * value arithmetic (ie. +/- offset within a contiguous vec4, etc) - * will continue to work as long as you don't underflow (and that - * would go badly anyways). 
- */ -#define REG_HALF 0x8000 +/* additional block-data (per-block) */ +struct ir3_ra_block_data { + BITSET_WORD *def; /* variables defined before used in block */ + BITSET_WORD *use; /* variables used before defined in block */ + BITSET_WORD *livein; /* which defs reach entry point of block */ + BITSET_WORD *liveout; /* which defs reach exit point of block */ +}; + +static bool +is_half(struct ir3_instruction *instr) +{ + return !!(instr->regs[0]->flags & IR3_REG_HALF); +} -#define REG(n, wm, f) (struct ir3_register){ \ - .flags = (f), \ - .num = (n), \ - .wrmask = TGSI_WRITEMASK_ ## wm, \ +static int +size_to_class(unsigned sz, bool half) +{ + if (half) { + for (unsigned i = 0; i < half_class_count; i++) + if (half_class_sizes[i] >= sz) + return i + class_count; + } else { + for (unsigned i = 0; i < class_count; i++) + if (class_sizes[i] >= sz) + return i; } + debug_assert(0); + return -1; +} -/* check that the register exists, is a GPR and is not special (a0/p0) */ -static struct ir3_register * reg_check(struct ir3_instruction *instr, unsigned n) +static bool +is_temp(struct ir3_register *reg) { - if ((n < instr->regs_count) && reg_gpr(instr->regs[n]) && - !(instr->regs[n]->flags & IR3_REG_SSA)) - return instr->regs[n]; - return NULL; + if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)) + return false; + if (reg->flags & IR3_REG_RELATIV) // TODO + return false; + if ((reg->num == regid(REG_A0, 0)) || + (reg->num == regid(REG_P0, 0))) + return false; + return true; } -/* figure out if an unassigned src register points back to the instr we - * are assigning: - */ -static bool instr_used_by(struct ir3_instruction *instr, - struct ir3_register *src) +static bool +writes_gpr(struct ir3_instruction *instr) { - struct ir3_instruction *src_instr = ssa(src); - unsigned i; - if (instr == src_instr) - return true; - if (src_instr && is_meta(src_instr)) - for (i = 1; i < src_instr->regs_count; i++) - if (instr_used_by(instr, src_instr->regs[i])) - return true; - - return false; + if (is_store(instr)) + return false; + /* is dest a normal temp register: */ + return is_temp(instr->regs[0]); } -static bool instr_is_output(struct ir3_instruction *instr) +static struct ir3_instruction * +get_definer(struct ir3_instruction *instr, int *sz, int *off) { - struct ir3_block *block = instr->block; - unsigned i; + struct ir3_instruction *d = NULL; + if (is_meta(instr) && (instr->opc == OPC_META_FI)) { + /* What about the case where collect is subset of array, we + * need to find the distance between where actual array starts + * and fanin.. that probably doesn't happen currently. 
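+		 *
+		 * The case this code does handle, eg. a fanin collecting
+		 * two mov's (illustrative sketch):
+		 *
+		 *    mov _, ...               <- lowest ip, the definer
+		 *    mov _, ...
+		 *    _meta:fi _, _[mov], _[mov]
+		 *
+		 * takes the src w/ the lowest ip as the definer, w/ sz
+		 * equal to the fanin's # of srcs (regs_count - 1) and
+		 * off = 0.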
+		 */
+		struct ir3_register *src;
 
-	for (i = 0; i < block->noutputs; i++)
-		if (instr == block->outputs[i])
-			return true;
+		/* note: don't use foreach_ssa_src as this gets called once
+		 * while assigning regs (which clears SSA flag)
+		 */
+		foreach_src(src, instr) {
+			if (!src->instr)
+				continue;
+			if ((!d) || (src->instr->ip < d->ip))
+				d = src->instr;
+		}
 
-	return false;
-}
+		*sz = instr->regs_count - 1;
+		*off = 0;
 
-static void mark_sources(struct ir3_instruction *instr,
-		struct ir3_instruction *n, regmask_t *liveregs, regmask_t *written)
-{
-	unsigned i;
+	} else if (instr->cp.right || instr->cp.left) {
+		/* covers also the meta:fo case, which ends up w/ single
+		 * scalar instructions for each component:
+		 */
+		struct ir3_instruction *f = ir3_neighbor_first(instr);
+
+		/* by definition, the entire sequence forms one linked list
+		 * of single scalar register nodes (even if some of them may
+		 * be fanouts from a texture sample (for example) instr.  We
+		 * just need to walk the list finding the first element of
+		 * the group defined (lowest ip)
+		 */
+		int cnt = 0;
+
+		d = f;
+		while (f) {
+			if (f->ip < d->ip)
+				d = f;
+			if (f == instr)
+				*off = cnt;
+			f = f->cp.right;
+			cnt++;
+		}
+
+		*sz = cnt;
+
+	} else {
+		/* second case is looking directly at the instruction which
+		 * produces multiple values (eg, texture sample), rather
+		 * than the fanout nodes that point back to that instruction.
+		 * This isn't quite right, because it may be part of a larger
+		 * group, such as:
+		 *
+		 *     sam (f32)(xyzw)r0.x, ...
+		 *     add r1.x, ...
+		 *     add r1.y, ...
+		 *     sam (f32)(xyzw)r2.x, r0.w  <-- (r0.w, r1.x, r1.y)
+		 *
+		 * need to come up with a better way to handle that case.
+		 */
+		if (instr->address) {
+			*sz = instr->regs[0]->size;
+		} else {
+			*sz = util_last_bit(instr->regs[0]->wrmask);
+		}
+		*off = 0;
+		d = instr;
+	}
+
+	if (d->regs[0]->flags & IR3_REG_PHI_SRC) {
+		struct ir3_instruction *phi = d->regs[0]->instr;
+		struct ir3_instruction *dd;
+		int dsz, doff;
+
+		dd = get_definer(phi, &dsz, &doff);
+
+		*sz = MAX2(*sz, dsz);
+		*off = doff;
+
+		if (dd->ip < d->ip) {
+			d = dd;
+		}
+	}
 
-	for (i = 1; i < n->regs_count; i++) {
-		struct ir3_register *r = reg_check(n, i);
-		if (r)
-			regmask_set_if_not(liveregs, r, written);
+	if (is_meta(d) && (d->opc == OPC_META_PHI)) {
+		/* we have already inserted parallel-copies into
+		 * the phi, so we don't need to chase definers
+		 */
+		struct ir3_register *src;
 
-		/* if any src points back to the instruction(s) in
-		 * the block of neighbors that we are assigning then
-		 * mark any written (clobbered) registers as live:
+		/* note: don't use foreach_ssa_src as this gets called once
+		 * while assigning regs (which clears SSA flag)
 		 */
-		if (instr_used_by(instr, n->regs[i]))
-			regmask_or(liveregs, liveregs, written);
+		foreach_src(src, d) {
+			if (!src->instr)
+				continue;
+			if (src->instr->ip < d->ip)
+				d = src->instr;
+		}
 	}
 
+	if (is_meta(d) && (d->opc == OPC_META_FO)) {
+		struct ir3_instruction *dd;
+		int dsz, doff;
+
+		dd = get_definer(d->regs[1]->instr, &dsz, &doff);
+
+		/* by definition, should come before: */
+		debug_assert(dd->ip < d->ip);
+
+		*sz = MAX2(*sz, dsz);
+
+		/* Fanouts are grouped, so *off should already be valid */
+
+		d = dd;
+	}
+
+	return d;
 }
 
-/* live means read before written */
-static void compute_liveregs(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *instr, regmask_t *liveregs)
+/* give each instruction a name (and ip), and count up the # of names
+ * of each class
+ */
+static void
+ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
{ - struct ir3_block *block = instr->block; - struct ir3_instruction *n; - regmask_t written; - unsigned i; + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + struct ir3_instruction *defn; + int cls, sz, off; - regmask_init(&written); + ctx->instr_cnt++; - for (n = instr->next; n; n = n->next) { - struct ir3_register *r; - - if (is_meta(n)) + if (instr->regs_count == 0) continue; - /* check first src's read: */ - mark_sources(instr, n, liveregs, &written); + if (!writes_gpr(instr)) + continue; - /* for instructions that write to an array, we need to - * capture the dependency on the array elements: - */ - if (n->fanin) - mark_sources(instr, n->fanin, liveregs, &written); + defn = get_definer(instr, &sz, &off); - /* meta-instructions don't actually get scheduled, - * so don't let it's write confuse us.. what we - * really care about is when the src to the meta - * instr was written: - */ - if (is_meta(n)) + if (defn != instr) continue; - /* then dst written (if assigned already): */ - r = reg_check(n, 0); - if (r) { - /* if an instruction *is* an output, then it is live */ - if (!instr_is_output(n)) - regmask_set(&written, r); + /* arrays which don't fit in one of the pre-defined class + * sizes are pre-colored: + * + * TODO but we still need to allocate names for them, don't we?? + */ + cls = size_to_class(sz, is_half(defn)); + if (cls >= 0) { + instr->name = ctx->class_alloc_count[cls]++; + ctx->alloc_count++; } - } +} - /* be sure to account for output registers too: */ - for (i = 0; i < block->noutputs; i++) { - struct ir3_register *r; - if (!block->outputs[i]) - continue; - r = reg_check(block->outputs[i], 0); - if (r) - regmask_set_if_not(liveregs, r, &written); +static void +ra_init(struct ir3_ra_ctx *ctx) +{ + ir3_clear_mark(ctx->ir); + ir3_count_instructions(ctx->ir); + + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + ra_block_name_instructions(ctx, block); } - /* if instruction is output, we need a reg that isn't written - * before the end.. equiv to the instr_used_by() check above - * in the loop body - * TODO maybe should follow fanin/fanout? + /* figure out the base register name for each class. The + * actual ra name is class_base[cls] + instr->name; */ - if (instr_is_output(instr)) - regmask_or(liveregs, liveregs, &written); + ctx->class_base[0] = 0; + for (unsigned i = 1; i < total_class_count; i++) { + ctx->class_base[i] = ctx->class_base[i-1] + + ctx->class_alloc_count[i-1]; + } + + ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count); + ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); + ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); +} + +static unsigned +ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn) +{ + unsigned name; + debug_assert(cls >= 0); + name = ctx->class_base[cls] + defn->name; + debug_assert(name < ctx->alloc_count); + return name; } -static int find_available(regmask_t *liveregs, int size, bool half) +static void +ra_destroy(struct ir3_ra_ctx *ctx) { - unsigned i; - unsigned f = half ? 
IR3_REG_HALF : 0; - for (i = 0; i < MAX_REG - size; i++) { - if (!regmask_get(liveregs, ®(i, X, f))) { - unsigned start = i++; - for (; (i < MAX_REG) && ((i - start) < size); i++) - if (regmask_get(liveregs, ®(i, X, f))) - break; - if ((i - start) >= size) - return start; + ralloc_free(ctx->g); +} + +static void +ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) +{ + struct ir3_ra_block_data *bd; + unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); + + bd = rzalloc(ctx->g, struct ir3_ra_block_data); + + bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words); + bd->use = rzalloc_array(bd, BITSET_WORD, bitset_words); + bd->livein = rzalloc_array(bd, BITSET_WORD, bitset_words); + bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words); + + block->bd = bd; + + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + struct ir3_instruction *src; + + if (instr->regs_count == 0) + continue; + + /* There are a couple special cases to deal with here: + * + * fanout: used to split values from a higher class to a lower + * class, for example split the results of a texture fetch + * into individual scalar values; We skip over these from + * a 'def' perspective, and for a 'use' we walk the chain + * up to the defining instruction. + * + * fanin: used to collect values from lower class and assemble + * them together into a higher class, for example arguments + * to texture sample instructions; We consider these to be + * defined at the earliest fanin source. + * + * phi: used to merge values from different flow control paths + * to the same reg. Consider defined at earliest phi src, + * and update all the other phi src's (which may come later + * in the program) as users to extend the var's live range. + * + * Most of this, other than phi, is completely handled in the + * get_definer() helper. + * + * In either case, we trace the instruction back to the original + * definer and consider that as the def/use ip. 
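+		 *
+		 * Eg. a use of a fanout (illustrative):
+		 *
+		 *    sam (f32)(xyzw)r0.x, ...    <- definer, def ip for the group
+		 *    _meta:fo _, _[sam], off=1   <- skipped from the def side
+		 *    add _, _[fo], ...           <- use, traced back to the sam
+		 *
+		 * so the entire xyzw group stays live from the sam until
+		 * its last such use.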
+ */ + + if (writes_gpr(instr)) { + struct ir3_instruction *defn; + int cls, sz, off; + + defn = get_definer(instr, &sz, &off); + if (defn == instr) { + /* arrays which don't fit in one of the pre-defined class + * sizes are pre-colored: + */ + cls = size_to_class(sz, is_half(defn)); + if (cls >= 0) { + unsigned name = ra_name(ctx, cls, defn); + + ctx->def[name] = defn->ip; + ctx->use[name] = defn->ip; + + /* since we are in SSA at this point: */ + debug_assert(!BITSET_TEST(bd->use, name)); + + BITSET_SET(bd->def, name); + + if (is_half(defn)) { + ra_set_node_class(ctx->g, name, + ctx->set->half_classes[cls - class_count]); + } else { + ra_set_node_class(ctx->g, name, + ctx->set->classes[cls]); + } + + /* extend the live range for phi srcs, which may come + * from the bottom of the loop + */ + if (defn->regs[0]->flags & IR3_REG_PHI_SRC) { + struct ir3_instruction *phi = defn->regs[0]->instr; + foreach_ssa_src(src, phi) { + /* if src is after phi, then we need to extend + * the liverange to the end of src's block: + */ + if (src->ip > phi->ip) { + struct ir3_instruction *last = + list_last_entry(&src->block->instr_list, + struct ir3_instruction, node); + ctx->use[name] = MAX2(ctx->use[name], last->ip); + } + } + } + } + } + } + + foreach_ssa_src(src, instr) { + if (writes_gpr(src)) { + struct ir3_instruction *srcdefn; + int cls, sz, off; + + srcdefn = get_definer(src, &sz, &off); + cls = size_to_class(sz, is_half(srcdefn)); + if (cls >= 0) { + unsigned name = ra_name(ctx, cls, srcdefn); + ctx->use[name] = MAX2(ctx->use[name], instr->ip); + if (!BITSET_TEST(bd->def, name)) + BITSET_SET(bd->use, name); + } + } } } - assert(0); - return -1; } -static int alloc_block(struct ir3_ra_ctx *ctx, - struct ir3_instruction *instr, int size) +static bool +ra_compute_livein_liveout(struct ir3_ra_ctx *ctx) { - struct ir3_register *dst = instr->regs[0]; - struct ir3_instruction *n; - regmask_t liveregs; - unsigned name; + unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); + bool progress = false; - /* should only ever be called w/ head of neighbor list: */ - debug_assert(!instr->cp.left); + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + struct ir3_ra_block_data *bd = block->bd; - regmask_init(&liveregs); + /* update livein: */ + for (unsigned i = 0; i < bitset_words; i++) { + BITSET_WORD new_livein = + (bd->use[i] | (bd->liveout[i] & ~bd->def[i])); - for (n = instr; n; n = n->cp.right) - compute_liveregs(ctx, n, &liveregs); + if (new_livein & ~bd->livein[i]) { + bd->livein[i] |= new_livein; + progress = true; + } + } - /* because we do assignment on fanout nodes for wrmask!=0x1, we - * need to handle this special case, where the fanout nodes all - * appear after one or more of the consumers of the src node: - * - * 0098:009: sam _, r2.x - * 0028:010: mul.f r3.z, r4.x, c13.x - * ; we start assigning here for '0098:009: sam'.. 
but - * ; would miss the usage at '0028:010: mul.f' - * 0101:009: _meta:fo _, _[0098:009: sam], off=2 - */ - if (is_meta(instr) && (instr->opc == OPC_META_FO)) - compute_liveregs(ctx, instr->regs[1]->instr, &liveregs); + /* update liveout: */ + for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) { + struct ir3_block *succ = block->successors[j]; + struct ir3_ra_block_data *succ_bd; + + if (!succ) + continue; - name = find_available(&liveregs, size, - !!(dst->flags & IR3_REG_HALF)); + succ_bd = succ->bd; - if (dst->flags & IR3_REG_HALF) - name |= REG_HALF; + for (unsigned i = 0; i < bitset_words; i++) { + BITSET_WORD new_liveout = + (succ_bd->livein[i] & ~bd->liveout[i]); - return name; + if (new_liveout) { + bd->liveout[i] |= new_liveout; + progress = true; + } + } + } + } + + return progress; } -static type_t half_type(type_t type) +static void +ra_add_interference(struct ir3_ra_ctx *ctx) { - switch (type) { - case TYPE_F32: return TYPE_F16; - case TYPE_U32: return TYPE_U16; - case TYPE_S32: return TYPE_S16; - /* instructions may already be fixed up: */ - case TYPE_F16: - case TYPE_U16: - case TYPE_S16: - return type; - default: - assert(0); - return ~0; + struct ir3 *ir = ctx->ir; + + /* compute live ranges (use/def) on a block level, also updating + * block's def/use bitmasks (used below to calculate per-block + * livein/liveout): + */ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + ra_block_compute_live_ranges(ctx, block); + } + + /* update per-block livein/liveout: */ + while (ra_compute_livein_liveout(ctx)) {} + + /* extend start/end ranges based on livein/liveout info from cfg: */ + unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + struct ir3_ra_block_data *bd = block->bd; + + for (unsigned i = 0; i < bitset_words; i++) { + if (BITSET_TEST(bd->livein, i)) { + ctx->def[i] = MIN2(ctx->def[i], block->start_ip); + ctx->use[i] = MAX2(ctx->use[i], block->start_ip); + } + + if (BITSET_TEST(bd->liveout, i)) { + ctx->def[i] = MIN2(ctx->def[i], block->end_ip); + ctx->use[i] = MAX2(ctx->use[i], block->end_ip); + } + } + } + + /* need to fix things up to keep outputs live: */ + for (unsigned i = 0; i < ir->noutputs; i++) { + struct ir3_instruction *instr = ir->outputs[i]; + struct ir3_instruction *defn; + int cls, sz, off; + + defn = get_definer(instr, &sz, &off); + cls = size_to_class(sz, is_half(defn)); + if (cls >= 0) { + unsigned name = ra_name(ctx, cls, defn); + ctx->use[name] = ctx->instr_cnt; + } + } + + for (unsigned i = 0; i < ctx->alloc_count; i++) { + for (unsigned j = 0; j < ctx->alloc_count; j++) { + if (!((ctx->def[i] >= ctx->use[j]) || + (ctx->def[j] >= ctx->use[i]))) { + ra_add_node_interference(ctx->g, i, j); + } + } } } @@ -358,302 +776,124 @@ static void fixup_half_instr_src(struct ir3_instruction *instr) } } -static void reg_assign(struct ir3_instruction *instr, - unsigned r, unsigned name) +static void +reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg, + struct ir3_instruction *instr) { - struct ir3_register *reg = instr->regs[r]; - - reg->flags &= ~IR3_REG_SSA; - reg->num = name & ~REG_HALF; - - if (name & REG_HALF) { - reg->flags |= IR3_REG_HALF; - /* if dst reg being assigned, patch up the instr: */ - if (reg == instr->regs[0]) - fixup_half_instr_dst(instr); - else - fixup_half_instr_src(instr); - } -} - -static void instr_assign(struct ir3_ra_ctx *ctx, - struct ir3_instruction *instr, unsigned name); + struct ir3_instruction *defn; + int cls, 
sz, off; -static void instr_assign_src(struct ir3_ra_ctx *ctx, - struct ir3_instruction *instr, unsigned r, unsigned name) -{ - struct ir3_register *reg = instr->regs[r]; + defn = get_definer(instr, &sz, &off); + cls = size_to_class(sz, is_half(defn)); + if (cls >= 0) { + unsigned name = ra_name(ctx, cls, defn); + unsigned r = ra_get_node_reg(ctx->g, name); + unsigned num = ctx->set->ra_reg_to_gpr[r] + off; - if (reg->flags & IR3_REG_RELATIV) - name += reg->offset; + if (reg->flags & IR3_REG_RELATIV) + num += reg->offset; - reg_assign(instr, r, name); + reg->num = num; + reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC); - if (is_meta(instr)) { - switch (instr->opc) { - case OPC_META_INPUT: - /* shader-input does not have a src, only block input: */ - debug_assert(instr->regs_count == 2); - instr_assign(ctx, instr, name); - return; - case OPC_META_FO: - instr_assign(ctx, instr, name + instr->fo.off); - return; - case OPC_META_FI: - instr_assign(ctx, instr, name - (r - 1)); - return; - default: - break; - } + if (is_half(defn)) + reg->flags |= IR3_REG_HALF; } } -static void instr_assign_srcs(struct ir3_ra_ctx *ctx, - struct ir3_instruction *instr, unsigned name) +static void +ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block) { - struct ir3_instruction *n, *src; + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + struct ir3_register *reg; - for (n = instr->next; n && !ctx->error; n = n->next) { - foreach_ssa_src_n(src, i, n) { - unsigned r = i + 1; - - /* skip address / etc (non real sources): */ - if (r >= n->regs_count) - continue; + if (instr->regs_count == 0) + continue; - if (src == instr) - instr_assign_src(ctx, n, r, name); + if (writes_gpr(instr)) { + reg_assign(ctx, instr->regs[0], instr); + if (instr->regs[0]->flags & IR3_REG_HALF) + fixup_half_instr_dst(instr); } - } -} - -static void instr_assign(struct ir3_ra_ctx *ctx, - struct ir3_instruction *instr, unsigned name) -{ - struct ir3_register *reg = instr->regs[0]; - - if (reg->flags & IR3_REG_RELATIV) - return; - - /* check if already assigned: */ - if (!(reg->flags & IR3_REG_SSA)) { - /* ... 
and if so, sanity check: */ - ra_assert(ctx, reg->num == (name & ~REG_HALF)); - return; - } - - /* rename this instructions dst register: */ - reg_assign(instr, 0, name); - - /* and rename any subsequent use of result of this instr: */ - instr_assign_srcs(ctx, instr, name); - - /* To simplify the neighbor logic, and to "avoid" dealing with - * instructions which write more than one output, we actually - * do register assignment for instructions that produce multiple - * outputs on the fanout nodes and propagate up the assignment - * to the actual instruction: - */ - if (is_meta(instr) && (instr->opc == OPC_META_FO)) { - struct ir3_instruction *src; - debug_assert(name >= instr->fo.off); - - foreach_ssa_src(src, instr) - instr_assign(ctx, src, name - instr->fo.off); - } -} + foreach_src_n(reg, n, instr) { + struct ir3_instruction *src = reg->instr; + if (!src) + continue; -/* check neighbor list to see if it is already partially (or completely) - * assigned, in which case register block is already allocated and we - * just need to complete the assignment: - */ -static int check_partial_assignment(struct ir3_ra_ctx *ctx, - struct ir3_instruction *instr) -{ - struct ir3_instruction *n; - int off = 0; - - debug_assert(!instr->cp.left); - - for (n = instr; n; n = n->cp.right) { - struct ir3_register *dst = n->regs[0]; - if ((n->depth != DEPTH_UNUSED) && - !(dst->flags & IR3_REG_SSA)) { - int name = dst->num - off; - debug_assert(name >= 0); - return name; + reg_assign(ctx, instr->regs[n+1], src); + if (instr->regs[n+1]->flags & IR3_REG_HALF) + fixup_half_instr_src(instr); } - off++; } - - return -1; } -/* allocate register name(s) for a list of neighboring instructions; - * instr should point to leftmost neighbor (head of list) - */ -static void instr_alloc_and_assign(struct ir3_ra_ctx *ctx, - struct ir3_instruction *instr) +static int +ra_alloc(struct ir3_ra_ctx *ctx) { - struct ir3_instruction *n; - struct ir3_register *dst; - int name; - - debug_assert(!instr->cp.left); - - if (instr->regs_count == 0) - return; - - dst = instr->regs[0]; - - /* For indirect dst, take the register assignment from the - * fanin and propagate it forward. - */ - if (dst->flags & IR3_REG_RELATIV) { - /* NOTE can be grouped, if for example outputs: - * for now disable cp if indirect writes - */ - instr_alloc_and_assign(ctx, instr->fanin); - - dst->num += instr->fanin->regs[0]->num; - dst->flags &= ~IR3_REG_SSA; - - instr_assign_srcs(ctx, instr, instr->fanin->regs[0]->num); - - return; - } - - /* for instructions w/ fanouts, do the actual register assignment - * on the group of fanout neighbor nodes and propagate the reg - * name back up to the texture instruction. - */ - if (dst->wrmask != 0x1) - return; - - name = check_partial_assignment(ctx, instr); - - /* allocate register(s): */ - if (name >= 0) { - /* already partially assigned, just finish the job */ - } else if (reg_gpr(dst)) { - int size; - /* number of consecutive registers to assign: */ - size = ir3_neighbor_count(instr); - if (dst->wrmask != 0x1) - size = MAX2(size, ffs(~dst->wrmask) - 1); - name = alloc_block(ctx, instr, size); - } else if (dst->flags & IR3_REG_ADDR) { - debug_assert(!instr->cp.right); - dst->flags &= ~IR3_REG_ADDR; - name = regid(REG_A0, 0) | REG_HALF; - } else { - debug_assert(!instr->cp.right); - /* predicate register (p0).. 
etc */ - name = regid(REG_P0, 0); - debug_assert(dst->num == name); - } - - ra_assert(ctx, name >= 0); - - for (n = instr; n && !ctx->error; n = n->cp.right) { - instr_assign(ctx, n, name); - name++; - } -} - -static void instr_assign_array(struct ir3_ra_ctx *ctx, - struct ir3_instruction *instr) -{ - struct ir3_instruction *src; - int name, aid = instr->fi.aid; - - if (ctx->arrays[aid].base == ~0) { - int size = instr->regs_count - 1; - ctx->arrays[aid].base = alloc_block(ctx, instr, size); - ctx->arrays[aid].size = size; - } - - name = ctx->arrays[aid].base; - - foreach_ssa_src_n(src, i, instr) { - unsigned r = i + 1; - - /* skip address / etc (non real sources): */ - if (r >= instr->regs_count) - break; - - instr_assign(ctx, src, name); - name++; - } - -} - -static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block) -{ - struct ir3_instruction *n; - /* frag shader inputs get pre-assigned, since we have some * constraints/unknowns about setup for some of these regs: */ - if ((ctx->type == SHADER_FRAGMENT) && !block->parent) { + if (ctx->type == SHADER_FRAGMENT) { + struct ir3 *ir = ctx->ir; unsigned i = 0, j; - if (ctx->frag_face && (i < block->ninputs) && block->inputs[i]) { + if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) { + struct ir3_instruction *instr = ir->inputs[i]; + int cls = size_to_class(1, true); + unsigned name = ra_name(ctx, cls, instr); + unsigned reg = ctx->set->gpr_to_ra_reg[cls][0]; + /* if we have frag_face, it gets hr0.x */ - instr_assign(ctx, block->inputs[i], REG_HALF | 0); + ra_set_node_reg(ctx->g, name, reg); i += 4; } - for (j = 0; i < block->ninputs; i++, j++) - if (block->inputs[i]) - instr_assign(ctx, block->inputs[i], j); - } - ra_dump_list("-------\n", block->head); + for (j = 0; i < ir->ninputs; i++) { + struct ir3_instruction *instr = ir->inputs[i]; + if (instr) { + struct ir3_instruction *defn; + int cls, sz, off; - /* first pass, assign arrays: */ - for (n = block->head; n && !ctx->error; n = n->next) { - if (is_meta(n) && (n->opc == OPC_META_FI) && n->fi.aid) { - debug_assert(!n->cp.left); /* don't think this should happen */ - ra_dump_instr("ASSIGN ARRAY: ", n); - instr_assign_array(ctx, n); - ra_dump_list("-------\n", block->head); + defn = get_definer(instr, &sz, &off); + if (defn == instr) { + unsigned name, reg; + + cls = size_to_class(sz, is_half(defn)); + name = ra_name(ctx, cls, defn); + reg = ctx->set->gpr_to_ra_reg[cls][j]; + + ra_set_node_reg(ctx->g, name, reg); + j += sz; + } + } } } - for (n = block->head; n && !ctx->error; n = n->next) { - ra_dump_instr("ASSIGN: ", n); - instr_alloc_and_assign(ctx, ir3_neighbor_first(n)); - ra_dump_list("-------\n", block->head); + if (!ra_allocate(ctx->g)) + return -1; + + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + ra_block_alloc(ctx, block); } - return ctx->error ? -1 : 0; + return 0; } -int ir3_block_ra(struct ir3_block *block, enum shader_t type, +int ir3_ra(struct ir3 *ir, enum shader_t type, bool frag_coord, bool frag_face) { - struct ir3_instruction *n; struct ir3_ra_ctx ctx = { - .block = block, + .ir = ir, .type = type, - .frag_coord = frag_coord, .frag_face = frag_face, + .set = ir->compiler->set, }; int ret; - memset(&ctx.arrays, ~0, sizeof(ctx.arrays)); - - /* mark dst registers w/ SSA flag so we can see which - * have been assigned so far: - * NOTE: we really should set SSA flag consistently on - * every dst register in the frontend. 
- */ - for (n = block->head; n; n = n->next) - if (n->regs_count > 0) - n->regs[0]->flags |= IR3_REG_SSA; - - ir3_clear_mark(block->shader); - ret = block_ra(&ctx, block); + ra_init(&ctx); + ra_add_interference(&ctx); + ret = ra_alloc(&ctx); + ra_destroy(&ctx); return ret; } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c index a790cba129b..49a4426d163 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c @@ -31,23 +31,14 @@ #include "ir3.h" -enum { - SCHEDULED = -1, - DELAYED = -2, -}; - /* * Instruction Scheduling: * - * Using the depth sorted list from depth pass, attempt to recursively - * schedule deepest unscheduled path. The first instruction that cannot - * be scheduled, returns the required delay slots it needs, at which - * point we return back up to the top and attempt to schedule by next - * highest depth. After a sufficient number of instructions have been - * scheduled, return back to beginning of list and start again. If you - * reach the end of depth sorted list without being able to insert any - * instruction, insert nop's. Repeat until no more unscheduled - * instructions. + * A priority-queue based scheduling algo. Add eligible instructions, + * ie. ones with all their dependencies scheduled, to the priority + * (depth) sorted queue (list). Pop highest priority instruction off + * the queue and schedule it, add newly eligible instructions to the + * priority queue, rinse, repeat. * * There are a few special cases that need to be handled, since sched * is currently independent of register allocation. Usages of address @@ -60,90 +51,33 @@ enum { */ struct ir3_sched_ctx { - struct ir3_instruction *scheduled; /* last scheduled instr */ + struct ir3_block *block; /* the current block */ + struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/ struct ir3_instruction *addr; /* current a0.x user, if any */ struct ir3_instruction *pred; /* current p0.x user, if any */ - unsigned cnt; bool error; }; -static struct ir3_instruction * -deepest(struct ir3_instruction **srcs, unsigned nsrcs) -{ - struct ir3_instruction *d = NULL; - unsigned i = 0, id = 0; - - while ((i < nsrcs) && !(d = srcs[id = i])) - i++; - - if (!d) - return NULL; - - for (; i < nsrcs; i++) - if (srcs[i] && (srcs[i]->depth > d->depth)) - d = srcs[id = i]; - - srcs[id] = NULL; - - return d; -} - -static unsigned distance(struct ir3_sched_ctx *ctx, - struct ir3_instruction *instr, unsigned maxd) -{ - struct ir3_instruction *n = ctx->scheduled; - unsigned d = 0; - while (n && (n != instr) && (d < maxd)) { - if (is_alu(n) || is_flow(n)) - d++; - n = n->next; - } - return d; -} - -/* TODO maybe we want double linked list? */ -static struct ir3_instruction * prev(struct ir3_instruction *instr) -{ - struct ir3_instruction *p = instr->block->head; - while (p && (p->next != instr)) - p = p->next; - return p; -} - static bool is_sfu_or_mem(struct ir3_instruction *instr) { return is_sfu(instr) || is_mem(instr); } -static void schedule(struct ir3_sched_ctx *ctx, - struct ir3_instruction *instr, bool remove) +static void +schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) { - struct ir3_block *block = instr->block; + debug_assert(ctx->block == instr->block); /* maybe there is a better way to handle this than just stuffing * a nop.. ideally we'd know about this constraint in the * scheduling and depth calculation.. 
*/ if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr)) - schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false); + ir3_NOP(ctx->block); /* remove from depth list: */ - if (remove) { - struct ir3_instruction *p = prev(instr); - - /* NOTE: this can happen for inputs which are not - * read.. in that case there is no need to schedule - * the input, so just bail: - */ - if (instr != (p ? p->next : block->head)) - return; - - if (p) - p->next = instr->next; - else - block->head = instr->next; - } + list_delinit(&instr->node); if (writes_addr(instr)) { assert(ctx->addr == NULL); @@ -157,18 +91,30 @@ static void schedule(struct ir3_sched_ctx *ctx, instr->flags |= IR3_INSTR_MARK; - instr->next = ctx->scheduled; + list_addtail(&instr->node, &instr->block->instr_list); ctx->scheduled = instr; - - ctx->cnt++; } -/* - * Delay-slot calculation. Follows fanin/fanout. - */ +static unsigned +distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr, + unsigned maxd) +{ + struct list_head *instr_list = &ctx->block->instr_list; + unsigned d = 0; + + list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) { + if ((n == instr) || (d >= maxd)) + break; + if (is_alu(n) || is_flow(n)) + d++; + } + + return d; +} /* calculate delay for specified src: */ -static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx, +static unsigned +delay_calc_srcn(struct ir3_sched_ctx *ctx, struct ir3_instruction *assigner, struct ir3_instruction *consumer, unsigned srcn) { @@ -177,7 +123,10 @@ static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx, if (is_meta(assigner)) { struct ir3_instruction *src; foreach_ssa_src(src, assigner) { - unsigned d = delay_calc_srcn(ctx, src, consumer, srcn); + unsigned d; + if (src->block != assigner->block) + break; + d = delay_calc_srcn(ctx, src, consumer, srcn); delay = MAX2(delay, d); } } else { @@ -189,48 +138,87 @@ static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx, } /* calculate delay for instruction (maximum of delay for all srcs): */ -static unsigned delay_calc(struct ir3_sched_ctx *ctx, - struct ir3_instruction *instr) +static unsigned +delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) { unsigned delay = 0; struct ir3_instruction *src; foreach_ssa_src_n(src, i, instr) { - unsigned d = delay_calc_srcn(ctx, src, instr, i); + unsigned d; + if (src->block != instr->block) + continue; + d = delay_calc_srcn(ctx, src, instr, i); delay = MAX2(delay, d); } return delay; } -/* A negative return value signals that an instruction has been newly - * SCHEDULED (or DELAYED due to address or predicate register already - * in use), return back up to the top of the stack (to block_sched()) +struct ir3_sched_notes { + /* there is at least one kill which could be scheduled, except + * for unscheduled bary.f's: + */ + bool blocked_kill; + /* there is at least one instruction that could be scheduled, + * except for conflicting address/predicate register usage: + */ + bool addr_conflict, pred_conflict; +}; + +static bool is_scheduled(struct ir3_instruction *instr) +{ + return !!(instr->flags & IR3_INSTR_MARK); +} + +static bool +check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, + struct ir3_instruction *instr) +{ + /* if this is a write to address/predicate register, and that + * register is currently in use, we need to defer until it is + * free: + */ + if (writes_addr(instr) && ctx->addr) { + assert(ctx->addr != instr); + notes->addr_conflict = true; + return true; + } + + if (writes_pred(instr) && ctx->pred) 
{
+		assert(ctx->pred != instr);
+		notes->pred_conflict = true;
+		return true;
+	}
+
+	return false;
+}
+
+/* is this instruction ready to be scheduled?  Return negative for not
+ * ready (updating notes if needed), or >= 0 to indicate number of
+ * delay slots needed.
 */
-static int trysched(struct ir3_sched_ctx *ctx,
+static int
+instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 		struct ir3_instruction *instr)
 {
-	struct ir3_instruction *srcs[64];
 	struct ir3_instruction *src;
-	unsigned delay, nsrcs = 0;
+	unsigned delay = 0;
 
-	/* if already scheduled: */
-	if (instr->flags & IR3_INSTR_MARK)
+	/* Phi instructions can have a dependency on something not
+	 * scheduled yet (for ex, loops).  But OTOH we don't really
+	 * care.  By definition phi's should appear at the top of
+	 * the block, and its sources should be values from the
+	 * previously executing block, so they are always ready to
+	 * be scheduled:
+	 */
+	if (is_meta(instr) && (instr->opc == OPC_META_PHI))
 		return 0;
 
-	/* figure out our src's, copy 'em out into an array for sorting: */
 	foreach_ssa_src(src, instr) {
-		debug_assert(nsrcs < ARRAY_SIZE(srcs));
-		srcs[nsrcs++] = src;
-	}
-
-	/* for each src register in sorted order:
-	 */
-	delay = 0;
-	while ((src = deepest(srcs, nsrcs))) {
-		delay = trysched(ctx, src);
-		if (delay)
-			return delay;
+		/* if dependency not scheduled, we aren't ready yet: */
+		if (!is_scheduled(src))
+			return -1;
 	}
 
 	/* all our dependents are scheduled, figure out if
@@ -255,216 +243,276 @@
 	 */
 	if (is_kill(instr)) {
 		struct ir3 *ir = instr->block->shader;
-		unsigned i;
 
-		for (i = 0; i < ir->baryfs_count; i++) {
+		for (unsigned i = 0; i < ir->baryfs_count; i++) {
 			struct ir3_instruction *baryf = ir->baryfs[i];
 			if (baryf->depth == DEPTH_UNUSED)
 				continue;
-			delay = trysched(ctx, baryf);
-			if (delay)
-				return delay;
+			if (!is_scheduled(baryf)) {
+				notes->blocked_kill = true;
+				return -1;
+			}
 		}
 	}
 
-	/* if this is a write to address/predicate register, and that
-	 * register is currently in use, we need to defer until it is
-	 * free:
-	 */
-	if (writes_addr(instr) && ctx->addr) {
-		assert(ctx->addr != instr);
-		return DELAYED;
-	}
-	if (writes_pred(instr) && ctx->pred) {
-		assert(ctx->pred != instr);
-		return DELAYED;
-	}
+	if (check_conflict(ctx, notes, instr))
+		return -1;
 
-	schedule(ctx, instr, true);
-	return SCHEDULED;
+	return 0;
 }
 
-static struct ir3_instruction * reverse(struct ir3_instruction *instr)
+/* move eligible instructions to the priority list: */
+static unsigned
+add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+		struct list_head *prio_queue, struct list_head *unscheduled_list)
 {
-	struct ir3_instruction *reversed = NULL;
-	while (instr) {
-		struct ir3_instruction *next = instr->next;
-		instr->next = reversed;
-		reversed = instr;
-		instr = next;
+	unsigned min_delay = ~0;
+
+	list_for_each_entry_safe (struct ir3_instruction, instr, unscheduled_list, node) {
+		int e = instr_eligibility(ctx, notes, instr);
+		if (e < 0)
+			continue;
+		min_delay = MIN2(min_delay, e);
+		if (e == 0) {
+			/* remove from unscheduled list and into priority queue: */
+			list_delinit(&instr->node);
+			ir3_insert_by_depth(instr, prio_queue);
+		}
 	}
-	return reversed;
-}
 
-static bool uses_current_addr(struct ir3_sched_ctx *ctx,
-		struct ir3_instruction *instr)
-{
-	return instr->address && (ctx->addr == instr->address);
+	return min_delay;
 }
 
-static bool uses_current_pred(struct ir3_sched_ctx *ctx,
-		struct ir3_instruction *instr)
+/* "spill" the address register by remapping any unscheduled
+ * instructions which depend on the current address register
+ * to a clone of the instruction which wrote the address reg.
+ */
+static void
+split_addr(struct ir3_sched_ctx *ctx)
 {
-	struct ir3_instruction *src;
-	foreach_ssa_src(src, instr)
-		if (ctx->pred == src)
-			return true;
-	return false;
+	struct ir3 *ir = ctx->addr->block->shader;
+	struct ir3_instruction *new_addr = NULL;
+	unsigned i;
+
+	debug_assert(ctx->addr);
+
+	for (i = 0; i < ir->indirects_count; i++) {
+		struct ir3_instruction *indirect = ir->indirects[i];
+
+		/* skip instructions already scheduled: */
+		if (indirect->flags & IR3_INSTR_MARK)
+			continue;
+
+		/* remap remaining instructions using current addr
+		 * to new addr:
+		 */
+		if (indirect->address == ctx->addr) {
+			if (!new_addr) {
+				new_addr = ir3_instr_clone(ctx->addr);
+				/* original addr is scheduled, but new one isn't: */
+				new_addr->flags &= ~IR3_INSTR_MARK;
+			}
+			indirect->address = new_addr;
+		}
+	}
+
+	/* all remaining indirects remapped to new addr: */
+	ctx->addr = NULL;
 }
 
-/* when we encounter an instruction that writes to the address register
- * when it is in use, we delay that instruction and try to schedule all
- * other instructions using the current address register:
+/* "spill" the predicate register by remapping any unscheduled
+ * instructions which depend on the current predicate register
+ * to a clone of the instruction which wrote the predicate reg.
  */
-static int block_sched_undelayed(struct ir3_sched_ctx *ctx,
-		struct ir3_block *block)
+static void
+split_pred(struct ir3_sched_ctx *ctx)
 {
-	struct ir3_instruction *instr = block->head;
-	bool addr_in_use = false;
-	bool pred_in_use = false;
-	bool all_delayed = true;
-	unsigned cnt = ~0, attempted = 0;
-
-	while (instr) {
-		struct ir3_instruction *next = instr->next;
-		bool addr = uses_current_addr(ctx, instr);
-		bool pred = uses_current_pred(ctx, instr);
-
-		if (addr || pred) {
-			int ret = trysched(ctx, instr);
-
-			if (ret != DELAYED)
-				all_delayed = false;
-
-			if (ret == SCHEDULED)
-				cnt = 0;
-			else if (ret > 0)
-				cnt = MIN2(cnt, ret);
-			if (addr)
-				addr_in_use = true;
-			if (pred)
-				pred_in_use = true;
-
-			attempted++;
-		}
+	struct ir3 *ir = ctx->pred->block->shader;
+	struct ir3_instruction *new_pred = NULL;
+	unsigned i;
 
-		instr = next;
-	}
+	debug_assert(ctx->pred);
 
-	if (!addr_in_use)
-		ctx->addr = NULL;
+	for (i = 0; i < ir->predicates_count; i++) {
+		struct ir3_instruction *predicated = ir->predicates[i];
 
-	if (!pred_in_use)
-		ctx->pred = NULL;
+		/* skip instructions already scheduled: */
+		if (predicated->flags & IR3_INSTR_MARK)
+			continue;
 
-	/* detect if we've gotten ourselves into an impossible situation
-	 * and bail if needed
-	 */
-	if (all_delayed && (attempted > 0)) {
-		if (pred_in_use) {
-			/* TODO we probably need to keep a list of instructions
-			 * that reference predicate, similar to indirects
-			 */
-			ctx->error = true;
-			return DELAYED;
-		}
-		if (addr_in_use) {
-			struct ir3 *ir = ctx->addr->block->shader;
-			struct ir3_instruction *new_addr =
-					ir3_instr_clone(ctx->addr);
-			unsigned i;
-
-			/* original addr is scheduled, but new one isn't: */
-			new_addr->flags &= ~IR3_INSTR_MARK;
-
-			for (i = 0; i < ir->indirects_count; i++) {
-				struct ir3_instruction *indirect = ir->indirects[i];
-
-				/* skip instructions already scheduled: */
-				if (indirect->flags & IR3_INSTR_MARK)
-					continue;
-
-				/* remap remaining instructions using current addr
-				 * to new addr:
-				 */
-				if (indirect->address == ctx->addr)
-					indirect->address = new_addr;
+		/* remap remaining instructions using current pred
+		 * to new pred:
+		 *
+		 * TODO is there ever a case when pred isn't first
+		 * (and only) src?
+		 */
+		if (ssa(predicated->regs[1]) == ctx->pred) {
+			if (!new_pred) {
+				new_pred = ir3_instr_clone(ctx->pred);
+				/* original pred is scheduled, but new one isn't: */
+				new_pred->flags &= ~IR3_INSTR_MARK;
 			}
+			predicated->regs[1]->instr = new_pred;
 		}
 	}
 
-	return cnt;
+	/* all remaining predicated remapped to new pred: */
+	ctx->pred = NULL;
 }
 
-static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+static void
+sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 {
-	struct ir3_instruction *instr;
+	struct list_head unscheduled_list, prio_queue;
 
-	/* schedule all the shader input's (meta-instr) first so that
-	 * the RA step sees that the input registers contain a value
-	 * from the start of the shader:
+	ctx->block = block;
+
+	/* move all instructions to the unscheduled list, and
+	 * empty the block's instruction list (to which we will
+	 * be inserting).
 	 */
-	if (!block->parent) {
-		unsigned i;
-		for (i = 0; i < block->ninputs; i++) {
-			struct ir3_instruction *in = block->inputs[i];
-			if (in)
-				schedule(ctx, in, true);
+	list_replace(&block->instr_list, &unscheduled_list);
+	list_inithead(&block->instr_list);
+	list_inithead(&prio_queue);
+
+	/* first a pre-pass to schedule all meta:input/phi instructions
+	 * (which need to appear first so that RA knows the register is
+	 * occupied):
+	 */
+	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
+		if (is_meta(instr) && ((instr->opc == OPC_META_INPUT) ||
+				(instr->opc == OPC_META_PHI)))
+			schedule(ctx, instr);
+	}
+
+	while (!(list_empty(&unscheduled_list) &&
+			list_empty(&prio_queue))) {
+		struct ir3_sched_notes notes = {0};
+		unsigned delay;
+
+		delay = add_eligible_instrs(ctx, &notes, &prio_queue, &unscheduled_list);
+
+		if (!list_empty(&prio_queue)) {
+			struct ir3_instruction *instr = list_last_entry(&prio_queue,
+					struct ir3_instruction, node);
+			/* ugh, this is a bit ugly, but between the time when
+			 * the instruction became eligible and now, a new
+			 * conflict may have arisen..
+			 */
+			if (check_conflict(ctx, &notes, instr)) {
+				list_del(&instr->node);
+				list_addtail(&instr->node, &unscheduled_list);
+				continue;
+			}
+
+			schedule(ctx, instr);
+		} else if (delay == ~0) {
+			/* nothing available to schedule..  if we are blocked on
+			 * address/predicate register conflict, then break the
+			 * deadlock by cloning the instruction that wrote that
+			 * reg:
+			 */
+			if (notes.addr_conflict) {
+				split_addr(ctx);
+			} else if (notes.pred_conflict) {
+				split_pred(ctx);
+			} else {
+				debug_assert(0);
+				ctx->error = true;
+				return;
+			}
+		} else {
+			/* and if we run out of instructions that can be scheduled,
+			 * then it is time for nop's:
+			 */
+			debug_assert(delay <= 6);
+			while (delay > 0) {
+				ir3_NOP(block);
+				delay--;
+			}
 		}
 	}
 
-	while ((instr = block->head) && !ctx->error) {
-		/* NOTE: always grab next *before* trysched(), in case the
-		 * instruction is actually scheduled (and therefore moved
-		 * from depth list into scheduled list)
-		 */
-		struct ir3_instruction *next = instr->next;
-		int cnt = trysched(ctx, instr);
+	/* And lastly, insert branch/jump instructions to take us to
+	 * the next block.  Later we'll strip back out the branches
+	 * that simply jump to next instruction.
+	 */
+	if (block->successors[1]) {
+		/* if/else, conditional branches to "then" or "else": */
+		struct ir3_instruction *br;
+		unsigned delay = 6;
 
-		if (cnt == DELAYED)
-			cnt = block_sched_undelayed(ctx, block);
+		debug_assert(ctx->pred);
+		debug_assert(block->condition);
 
-		/* -1 is signal to return up stack, but to us means same as 0: */
-		cnt = MAX2(0, cnt);
-		cnt += ctx->cnt;
-		instr = next;
+		delay -= distance(ctx, ctx->pred, delay);
 
-		/* if deepest remaining instruction cannot be scheduled, try
-		 * the increasingly more shallow instructions until needed
-		 * number of delay slots is filled:
-		 */
-		while (instr && (cnt > ctx->cnt)) {
-			next = instr->next;
-			trysched(ctx, instr);
-			instr = next;
+		while (delay > 0) {
+			ir3_NOP(block);
+			delay--;
 		}
 
-		/* and if we run out of instructions that can be scheduled,
-		 * then it is time for nop's:
+		/* create "else" branch first (since "then" block should
+		 * frequently/always end up being a fall-thru):
 		 */
-		while (cnt > ctx->cnt)
-			schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
+		br = ir3_BR(block);
+		br->cat0.inv = true;
+		br->cat0.target = block->successors[1];
+
+		/* NOTE: we have to hard code delay of 6 above, since
+		 * we want to insert the nop's before constructing the
+		 * branch.  Throw in an assert so we notice if this
+		 * ever breaks on a future generation:
+		 */
+		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);
+
+		br = ir3_BR(block);
+		br->cat0.target = block->successors[0];
+
+	} else if (block->successors[0]) {
+		/* otherwise unconditional jump to next block: */
+		struct ir3_instruction *jmp;
+
+		jmp = ir3_JUMP(block);
+		jmp->cat0.target = block->successors[0];
 	}
 
-	/* at this point, scheduled list is in reverse order, so fix that: */
-	block->head = reverse(ctx->scheduled);
+	/* NOTE: if we kept track of the predecessors, we could do a better
+	 * job w/ (jp) flags.. every node w/ >1 predecessor is a join point.
+	 * Note that as we eliminate blocks which contain only an unconditional
+	 * jump we probably need to propagate (jp) flag..
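+	 *
+	 * Eg. for a simple if/else diamond (illustrative):
+	 *
+	 *    block0 -> block1 (then), block2 (else)
+	 *    block1 -> block3
+	 *    block2 -> block3
+	 *
+	 * block3 has two predecessors, so the first instruction scheduled
+	 * in block3 would want the (jp) flag.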
+ */ } -int ir3_block_sched(struct ir3_block *block) +/* this is needed to ensure later RA stage succeeds: */ +static void +sched_insert_parallel_copies(struct ir3_block *block) +{ + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + if (is_meta(instr) && (instr->opc == OPC_META_PHI)) { + struct ir3_register *reg; + foreach_src(reg, instr) { + struct ir3_instruction *src = reg->instr; + struct ir3_instruction *mov = + ir3_MOV(src->block, src, TYPE_U32); + mov->regs[0]->flags |= IR3_REG_PHI_SRC; + mov->regs[0]->instr = instr; + reg->instr = mov; + } + } + } +} + +int ir3_sched(struct ir3 *ir) { struct ir3_sched_ctx ctx = {0}; - ir3_clear_mark(block->shader); - block_sched(&ctx, block); + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + sched_insert_parallel_copies(block); + } + ir3_clear_mark(ir); + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + sched_block(&ctx, block); + } if (ctx.error) return -1; return 0; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c index 9bf4e64c7f1..b5b038100cc 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c @@ -127,7 +127,7 @@ static void assemble_variant(struct ir3_shader_variant *v) { struct fd_context *ctx = fd_context(v->shader->pctx); - uint32_t gpu_id = ir3_shader_gpuid(v->shader); + uint32_t gpu_id = v->shader->compiler->gpu_id; uint32_t sz, *bin; bin = ir3_shader_assemble(v, gpu_id); @@ -146,17 +146,6 @@ assemble_variant(struct ir3_shader_variant *v) v->ir = NULL; } -/* reset before attempting to compile again.. */ -static void reset_variant(struct ir3_shader_variant *v, const char *msg) -{ - debug_error(msg); - v->inputs_count = 0; - v->outputs_count = 0; - v->total_in = 0; - v->has_samp = false; - v->immediates_count = 0; -} - static struct ir3_shader_variant * create_variant(struct ir3_shader *shader, struct ir3_shader_key key) { @@ -177,22 +166,7 @@ create_variant(struct ir3_shader *shader, struct ir3_shader_key key) tgsi_dump(tokens, 0); } - if (fd_mesa_debug & FD_DBG_NIR) { - ret = ir3_compile_shader_nir(v, tokens, key); - if (ret) - reset_variant(v, "NIR compiler failed, fallback to TGSI!"); - } else { - ret = -1; - } - - if (ret) { - ret = ir3_compile_shader(v, tokens, key, true); - if (ret) { - reset_variant(v, "new compiler failed, trying without copy propagation!"); - ret = ir3_compile_shader(v, tokens, key, false); - } - } - + ret = ir3_compile_shader_nir(shader->compiler, v, tokens, key); if (ret) { debug_error("compile failed!"); goto fail; @@ -217,13 +191,6 @@ fail: return NULL; } -uint32_t -ir3_shader_gpuid(struct ir3_shader *shader) -{ - struct fd_context *ctx = fd_context(shader->pctx); - return ctx->screen->gpu_id; -} - struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key) { @@ -286,6 +253,7 @@ ir3_shader_create(struct pipe_context *pctx, const struct tgsi_token *tokens, enum shader_t type) { struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader); + shader->compiler = fd_context(pctx)->screen->compiler; shader->pctx = pctx; shader->type = type; shader->tokens = tgsi_dup_tokens(tokens); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h index e5410bf88b2..9f1b0769180 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h @@ -86,10 +86,6 @@ struct ir3_shader_key { * shader: */ 
uint16_t fsaturate_s, fsaturate_t, fsaturate_r; - - /* bitmask of sampler which produces integer outputs: - */ - uint16_t vinteger_s, finteger_s; }; static inline bool @@ -196,6 +192,8 @@ struct ir3_shader_variant { struct ir3_shader { enum shader_t type; + struct ir3_compiler *compiler; + struct pipe_context *pctx; const struct tgsi_token *tokens; @@ -212,7 +210,6 @@ void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id); struct ir3_shader * ir3_shader_create(struct pipe_context *pctx, const struct tgsi_token *tokens, enum shader_t type); void ir3_shader_destroy(struct ir3_shader *shader); -uint32_t ir3_shader_gpuid(struct ir3_shader *shader); struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key); @@ -220,6 +217,8 @@ struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader, * Helper/util: */ +#include "pipe/p_shader_tokens.h" + static inline int ir3_find_output(const struct ir3_shader_variant *so, ir3_semantic semantic) { diff --git a/src/gallium/drivers/i915/i915_fpc_optimize.c b/src/gallium/drivers/i915/i915_fpc_optimize.c index e0134a7c4ee..83bb64918d4 100644 --- a/src/gallium/drivers/i915/i915_fpc_optimize.c +++ b/src/gallium/drivers/i915/i915_fpc_optimize.c @@ -552,7 +552,7 @@ static boolean i915_fpc_useless_mov(union tgsi_full_token *tgsi_current) if ( current.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION && current.FullInstruction.Instruction.Opcode == TGSI_OPCODE_MOV && op_has_dst(current.FullInstruction.Instruction.Opcode) && - current.FullInstruction.Instruction.Saturate == TGSI_SAT_NONE && + !current.FullInstruction.Instruction.Saturate && current.FullInstruction.Src[0].Register.Absolute == 0 && current.FullInstruction.Src[0].Register.Negate == 0 && is_unswizzled(¤t.FullInstruction.Src[0], current.FullInstruction.Dst[0].Register.WriteMask) && @@ -582,7 +582,7 @@ static void i915_fpc_optimize_useless_mov_after_inst(struct i915_optimize_contex next->Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION && next->FullInstruction.Instruction.Opcode == TGSI_OPCODE_MOV && op_has_dst(current->FullInstruction.Instruction.Opcode) && - next->FullInstruction.Instruction.Saturate == TGSI_SAT_NONE && + !next->FullInstruction.Instruction.Saturate && next->FullInstruction.Src[0].Register.Absolute == 0 && next->FullInstruction.Src[0].Register.Negate == 0 && unused_from(ctx, ¤t->FullInstruction.Dst[0], index) && diff --git a/src/gallium/drivers/i915/i915_fpc_translate.c b/src/gallium/drivers/i915/i915_fpc_translate.c index b74f8239bb4..38a33888166 100644 --- a/src/gallium/drivers/i915/i915_fpc_translate.c +++ b/src/gallium/drivers/i915/i915_fpc_translate.c @@ -329,7 +329,7 @@ get_result_flags(const struct i915_full_instruction *inst) = inst->Dst[0].Register.WriteMask; uint flags = 0x0; - if (inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE) + if (inst->Instruction.Saturate) flags |= A0_DEST_SATURATE; if (writeMask & TGSI_WRITEMASK_X) diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c index 7216160bb22..0590da07b9a 100644 --- a/src/gallium/drivers/i915/i915_screen.c +++ b/src/gallium/drivers/i915/i915_screen.c @@ -165,6 +165,7 @@ i915_get_shader_param(struct pipe_screen *screen, unsigned shader, enum pipe_sha case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: return 0; default: debug_printf("%s: Unknown cap %u.\n", __FUNCTION__, cap); @@ -241,6 +242,7 @@ 
i915_get_param(struct pipe_screen *screen, enum pipe_cap cap) case PIPE_CAP_POLYGON_OFFSET_CLAMP: case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: return 0; case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: diff --git a/src/gallium/drivers/ilo/Makefile.sources b/src/gallium/drivers/ilo/Makefile.sources index 91a6f65f2e9..e1bbb9a0781 100644 --- a/src/gallium/drivers/ilo/Makefile.sources +++ b/src/gallium/drivers/ilo/Makefile.sources @@ -15,14 +15,34 @@ C_SOURCES := \ core/ilo_debug.h \ core/ilo_dev.c \ core/ilo_dev.h \ - core/ilo_format.c \ - core/ilo_format.h \ - core/ilo_fence.h \ core/ilo_image.c \ core/ilo_image.h \ - core/ilo_state_3d.h \ - core/ilo_state_3d_bottom.c \ - core/ilo_state_3d_top.c \ + core/ilo_state_cc.c \ + core/ilo_state_cc.h \ + core/ilo_state_compute.c \ + core/ilo_state_compute.h \ + core/ilo_state_raster.c \ + core/ilo_state_raster.h \ + core/ilo_state_sampler.c \ + core/ilo_state_sampler.h \ + core/ilo_state_sbe.c \ + core/ilo_state_sbe.h \ + core/ilo_state_shader.c \ + core/ilo_state_shader_ps.c \ + core/ilo_state_shader.h \ + core/ilo_state_sol.c \ + core/ilo_state_sol.h \ + core/ilo_state_surface.c \ + core/ilo_state_surface_format.c \ + core/ilo_state_surface.h \ + core/ilo_state_urb.c \ + core/ilo_state_urb.h \ + core/ilo_state_vf.c \ + core/ilo_state_vf.h \ + core/ilo_state_viewport.c \ + core/ilo_state_viewport.h \ + core/ilo_state_zs.c \ + core/ilo_state_zs.h \ core/intel_winsys.h \ ilo_blit.c \ ilo_blit.h \ @@ -38,6 +58,8 @@ C_SOURCES := \ ilo_cp.h \ ilo_draw.c \ ilo_draw.h \ + ilo_format.c \ + ilo_format.h \ ilo_gpgpu.c \ ilo_gpgpu.h \ ilo_public.h \ diff --git a/src/gallium/drivers/ilo/core/ilo_buffer.h b/src/gallium/drivers/ilo/core/ilo_buffer.h index 50f97d10bd7..ca3c61ff890 100644 --- a/src/gallium/drivers/ilo/core/ilo_buffer.h +++ b/src/gallium/drivers/ilo/core/ilo_buffer.h @@ -31,11 +31,13 @@ #include "intel_winsys.h" #include "ilo_core.h" +#include "ilo_debug.h" #include "ilo_dev.h" struct ilo_buffer { unsigned bo_size; + /* managed by users */ struct intel_bo *bo; }; @@ -43,6 +45,8 @@ static inline void ilo_buffer_init(struct ilo_buffer *buf, const struct ilo_dev *dev, unsigned size, uint32_t bind, uint32_t flags) { + assert(ilo_is_zeroed(buf, sizeof(*buf))); + buf->bo_size = size; /* @@ -55,36 +59,6 @@ ilo_buffer_init(struct ilo_buffer *buf, const struct ilo_dev *dev, */ if (bind & PIPE_BIND_SAMPLER_VIEW) buf->bo_size = align(buf->bo_size, 256) + 16; - - if ((bind & PIPE_BIND_VERTEX_BUFFER) && ilo_dev_gen(dev) < ILO_GEN(7.5)) { - /* - * As noted in ilo_format_translate(), we treat some 3-component formats - * as 4-component formats to work around hardware limitations. Imagine - * the case where the vertex buffer holds a single - * PIPE_FORMAT_R16G16B16_FLOAT vertex, and buf->bo_size is 6. The - * hardware would fail to fetch it at boundary check because the vertex - * buffer is expected to hold a PIPE_FORMAT_R16G16B16A16_FLOAT vertex - * and that takes at least 8 bytes. - * - * For the workaround to work, we should add 2 to the bo size. But that - * would waste a page when the bo size is already page aligned. Let's - * round it to page size for now and revisit this when needed. 
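To make the sizing argument in the removed comment above concrete (a standalone illustration, not driver code): a buffer holding one PIPE_FORMAT_R16G16B16_FLOAT vertex has bo_size 6, but fetching the R16G16B16A16_FLOAT stand-in format reads 8 bytes, so the hardware boundary check fails unless the bo is padded; the removed workaround simply page-aligned the size.

static unsigned
gen6_vb_bo_size_workaround(unsigned bo_size)
{
   /* align(bo_size, 4096); e.g. align(6, 4096) == 4096 */
   return (bo_size + 4095) & ~4095u;
}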
- */ - buf->bo_size = align(buf->bo_size, 4096); - } -} - -static inline void -ilo_buffer_cleanup(struct ilo_buffer *buf) -{ - intel_bo_unref(buf->bo); -} - -static inline void -ilo_buffer_set_bo(struct ilo_buffer *buf, struct intel_bo *bo) -{ - intel_bo_unref(buf->bo); - buf->bo = intel_bo_ref(bo); } #endif /* ILO_BUFFER_H */ diff --git a/src/gallium/drivers/ilo/core/ilo_builder.c b/src/gallium/drivers/ilo/core/ilo_builder.c index 3c5eef9bcbc..4e05a3aca1e 100644 --- a/src/gallium/drivers/ilo/core/ilo_builder.c +++ b/src/gallium/drivers/ilo/core/ilo_builder.c @@ -333,7 +333,7 @@ ilo_builder_init(struct ilo_builder *builder, { int i; - memset(builder, 0, sizeof(*builder)); + assert(ilo_is_zeroed(builder, sizeof(*builder))); builder->dev = dev; builder->winsys = winsys; diff --git a/src/gallium/drivers/ilo/core/ilo_builder_3d.h b/src/gallium/drivers/ilo/core/ilo_builder_3d.h index 6cf1732ee1c..fb8b53cbe23 100644 --- a/src/gallium/drivers/ilo/core/ilo_builder_3d.h +++ b/src/gallium/drivers/ilo/core/ilo_builder_3d.h @@ -35,45 +35,45 @@ #include "ilo_builder_3d_top.h" #include "ilo_builder_3d_bottom.h" +struct gen6_3dprimitive_info { + enum gen_3dprim_type topology; + bool indexed; + + uint32_t vertex_count; + uint32_t vertex_start; + uint32_t instance_count; + uint32_t instance_start; + int32_t vertex_base; +}; + static inline void gen6_3DPRIMITIVE(struct ilo_builder *builder, - const struct pipe_draw_info *info, - const struct ilo_ib_state *ib) + const struct gen6_3dprimitive_info *info) { const uint8_t cmd_len = 6; - const int prim = gen6_3d_translate_pipe_prim(info->mode); - const int vb_access = (info->indexed) ? - GEN6_3DPRIM_DW0_ACCESS_RANDOM : GEN6_3DPRIM_DW0_ACCESS_SEQUENTIAL; - const uint32_t vb_start = info->start + - ((info->indexed) ? ib->draw_start_offset : 0); uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 6, 6); ilo_builder_batch_pointer(builder, cmd_len, &dw); - dw[0] = GEN6_RENDER_CMD(3D, 3DPRIMITIVE) | - vb_access | - prim << GEN6_3DPRIM_DW0_TYPE__SHIFT | - (cmd_len - 2); - dw[1] = info->count; - dw[2] = vb_start; + dw[0] = GEN6_RENDER_CMD(3D, 3DPRIMITIVE) | (cmd_len - 2) | + info->topology << GEN6_3DPRIM_DW0_TYPE__SHIFT; + if (info->indexed) + dw[0] |= GEN6_3DPRIM_DW0_ACCESS_RANDOM; + + dw[1] = info->vertex_count; + dw[2] = info->vertex_start; dw[3] = info->instance_count; - dw[4] = info->start_instance; - dw[5] = info->index_bias; + dw[4] = info->instance_start; + dw[5] = info->vertex_base; } static inline void gen7_3DPRIMITIVE(struct ilo_builder *builder, - const struct pipe_draw_info *info, - const struct ilo_ib_state *ib) + const struct gen6_3dprimitive_info *info) { const uint8_t cmd_len = 7; - const int prim = gen6_3d_translate_pipe_prim(info->mode); - const int vb_access = (info->indexed) ? - GEN7_3DPRIM_DW1_ACCESS_RANDOM : GEN7_3DPRIM_DW1_ACCESS_SEQUENTIAL; - const uint32_t vb_start = info->start + - ((info->indexed) ? 
ib->draw_start_offset : 0); uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 7, 8); @@ -81,12 +81,16 @@ gen7_3DPRIMITIVE(struct ilo_builder *builder, ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(3D, 3DPRIMITIVE) | (cmd_len - 2); - dw[1] = vb_access | prim; - dw[2] = info->count; - dw[3] = vb_start; + + dw[1] = info->topology << GEN7_3DPRIM_DW1_TYPE__SHIFT; + if (info->indexed) + dw[1] |= GEN7_3DPRIM_DW1_ACCESS_RANDOM; + + dw[2] = info->vertex_count; + dw[3] = info->vertex_start; dw[4] = info->instance_count; - dw[5] = info->start_instance; - dw[6] = info->index_bias; + dw[5] = info->instance_start; + dw[6] = info->vertex_base; } #endif /* ILO_BUILDER_3D_H */ diff --git a/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h b/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h index 16ec4afd15b..6d9e3699125 100644 --- a/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h +++ b/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h @@ -29,335 +29,121 @@ #define ILO_BUILDER_3D_BOTTOM_H #include "genhw/genhw.h" -#include "../ilo_shader.h" #include "intel_winsys.h" #include "ilo_core.h" #include "ilo_dev.h" -#include "ilo_format.h" +#include "ilo_state_cc.h" +#include "ilo_state_raster.h" +#include "ilo_state_sbe.h" +#include "ilo_state_shader.h" +#include "ilo_state_viewport.h" +#include "ilo_state_zs.h" #include "ilo_builder.h" #include "ilo_builder_3d_top.h" static inline void gen6_3DSTATE_CLIP(struct ilo_builder *builder, - const struct ilo_rasterizer_state *rasterizer, - const struct ilo_shader_state *fs, - bool enable_guardband, - int num_viewports) -{ - const uint8_t cmd_len = 4; - uint32_t dw1, dw2, dw3, *dw; - int interps; - - ILO_DEV_ASSERT(builder->dev, 6, 8); - - dw1 = rasterizer->clip.payload[0]; - dw2 = rasterizer->clip.payload[1]; - dw3 = rasterizer->clip.payload[2]; - - if (enable_guardband && rasterizer->clip.can_enable_guardband) - dw2 |= GEN6_CLIP_DW2_GB_TEST_ENABLE; - - interps = (fs) ? ilo_shader_get_kernel_param(fs, - ILO_KERNEL_FS_BARYCENTRIC_INTERPOLATIONS) : 0; - - if (interps & (GEN6_INTERP_NONPERSPECTIVE_PIXEL | - GEN6_INTERP_NONPERSPECTIVE_CENTROID | - GEN6_INTERP_NONPERSPECTIVE_SAMPLE)) - dw2 |= GEN6_CLIP_DW2_NONPERSPECTIVE_BARYCENTRIC_ENABLE; - - dw3 |= GEN6_CLIP_DW3_RTAINDEX_FORCED_ZERO | - (num_viewports - 1); - - ilo_builder_batch_pointer(builder, cmd_len, &dw); - - dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_CLIP) | (cmd_len - 2); - dw[1] = dw1; - dw[2] = dw2; - dw[3] = dw3; -} - -static inline void -gen6_disable_3DSTATE_CLIP(struct ilo_builder *builder) + const struct ilo_state_raster *rs) { const uint8_t cmd_len = 4; uint32_t *dw; - ILO_DEV_ASSERT(builder->dev, 6, 7.5); + ILO_DEV_ASSERT(builder->dev, 6, 8); ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_CLIP) | (cmd_len - 2); - dw[1] = 0; - dw[2] = 0; - dw[3] = 0; -} - -static inline void -gen7_internal_3dstate_sf(struct ilo_builder *builder, - uint8_t cmd_len, uint32_t *dw, - const struct ilo_rasterizer_sf *sf, - int num_samples) -{ - ILO_DEV_ASSERT(builder->dev, 6, 7.5); - - assert(cmd_len == 7); - - dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_SF) | (cmd_len - 2); - - if (!sf) { - dw[1] = 0; - dw[2] = (num_samples > 1) ? 
GEN7_SF_DW2_MSRASTMODE_ON_PATTERN : 0; - dw[3] = 0; - dw[4] = 0; - dw[5] = 0; - dw[6] = 0; - - return; - } - - /* see rasterizer_init_sf_gen6() */ - STATIC_ASSERT(Elements(sf->payload) >= 3); - dw[1] = sf->payload[0]; - dw[2] = sf->payload[1]; - dw[3] = sf->payload[2]; - - if (num_samples > 1) - dw[2] |= sf->dw_msaa; - - dw[4] = sf->dw_depth_offset_const; - dw[5] = sf->dw_depth_offset_scale; - dw[6] = sf->dw_depth_offset_clamp; -} - -static inline void -gen8_internal_3dstate_sbe(struct ilo_builder *builder, - uint8_t cmd_len, uint32_t *dw, - const struct ilo_shader_state *fs, - int sprite_coord_mode) -{ - const struct ilo_kernel_routing *routing; - int vue_offset, vue_len, out_count; - - ILO_DEV_ASSERT(builder->dev, 6, 8); - - assert(cmd_len == 4); - - dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_SBE) | (cmd_len - 2); - - if (!fs) { - dw[1] = 1 << GEN7_SBE_DW1_URB_READ_LEN__SHIFT; - dw[2] = 0; - dw[3] = 0; - return; - } - - routing = ilo_shader_get_kernel_routing(fs); - - vue_offset = routing->source_skip; - assert(vue_offset % 2 == 0); - vue_offset /= 2; - - vue_len = (routing->source_len + 1) / 2; - if (!vue_len) - vue_len = 1; - - out_count = ilo_shader_get_kernel_param(fs, ILO_KERNEL_INPUT_COUNT); - assert(out_count <= 32); - - dw[1] = out_count << GEN7_SBE_DW1_ATTR_COUNT__SHIFT | - vue_len << GEN7_SBE_DW1_URB_READ_LEN__SHIFT; - - if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) { - dw[1] |= GEN8_SBE_DW1_USE_URB_READ_LEN | - GEN8_SBE_DW1_USE_URB_READ_OFFSET | - vue_offset << GEN8_SBE_DW1_URB_READ_OFFSET__SHIFT; - } else { - dw[1] |= vue_offset << GEN7_SBE_DW1_URB_READ_OFFSET__SHIFT; - } - - if (routing->swizzle_enable) - dw[1] |= GEN7_SBE_DW1_ATTR_SWIZZLE_ENABLE; - - switch (sprite_coord_mode) { - case PIPE_SPRITE_COORD_UPPER_LEFT: - dw[1] |= GEN7_SBE_DW1_POINT_SPRITE_TEXCOORD_UPPERLEFT; - break; - case PIPE_SPRITE_COORD_LOWER_LEFT: - dw[1] |= GEN7_SBE_DW1_POINT_SPRITE_TEXCOORD_LOWERLEFT; - break; - } - - /* - * From the Ivy Bridge PRM, volume 2 part 1, page 268: - * - * "This field (Point Sprite Texture Coordinate Enable) must be - * programmed to 0 when non-point primitives are rendered." - * - * TODO We do not check that yet. - */ - dw[2] = routing->point_sprite_enable; - - dw[3] = routing->const_interp_enable; -} - -static inline void -gen8_internal_3dstate_sbe_swiz(struct ilo_builder *builder, - uint8_t cmd_len, uint32_t *dw, - const struct ilo_shader_state *fs) -{ - const struct ilo_kernel_routing *routing; - - ILO_DEV_ASSERT(builder->dev, 6, 8); - - assert(cmd_len == 11); - - dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_SBE_SWIZ) | (cmd_len - 2); - - if (!fs) { - memset(&dw[1], 0, sizeof(*dw) * (cmd_len - 1)); - return; - } - - routing = ilo_shader_get_kernel_routing(fs); - - STATIC_ASSERT(sizeof(routing->swizzles) >= sizeof(*dw) * 8); - memcpy(&dw[1], routing->swizzles, sizeof(*dw) * 8); - - /* WrapShortest enables */ - dw[9] = 0; - dw[10] = 0; + /* see raster_set_gen6_3DSTATE_CLIP() */ + dw[1] = rs->clip[0]; + dw[2] = rs->clip[1]; + dw[3] = rs->clip[2]; } static inline void gen6_3DSTATE_SF(struct ilo_builder *builder, - const struct ilo_rasterizer_state *rasterizer, - const struct ilo_shader_state *fs, - int sample_count) + const struct ilo_state_raster *rs, + const struct ilo_state_sbe *sbe) { const uint8_t cmd_len = 20; - uint32_t gen8_3dstate_sbe[4], gen8_3dstate_sbe_swiz[11]; - uint32_t gen7_3dstate_sf[7]; - const struct ilo_rasterizer_sf *sf; - int sprite_coord_mode; uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 6, 6); - sf = (rasterizer) ? 
&rasterizer->sf : NULL; - sprite_coord_mode = (rasterizer) ? rasterizer->state.sprite_coord_mode : 0; - - gen8_internal_3dstate_sbe(builder, Elements(gen8_3dstate_sbe), - gen8_3dstate_sbe, fs, sprite_coord_mode); - gen8_internal_3dstate_sbe_swiz(builder, Elements(gen8_3dstate_sbe_swiz), - gen8_3dstate_sbe_swiz, fs); - gen7_internal_3dstate_sf(builder, Elements(gen7_3dstate_sf), - gen7_3dstate_sf, sf, sample_count); - ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_SF) | (cmd_len - 2); - dw[1] = gen8_3dstate_sbe[1]; - memcpy(&dw[2], &gen7_3dstate_sf[1], sizeof(*dw) * 6); - memcpy(&dw[8], &gen8_3dstate_sbe_swiz[1], sizeof(*dw) * 8); - dw[16] = gen8_3dstate_sbe[2]; - dw[17] = gen8_3dstate_sbe[3]; - dw[18] = gen8_3dstate_sbe_swiz[9]; - dw[19] = gen8_3dstate_sbe_swiz[10]; + /* see sbe_set_gen8_3DSTATE_SBE() */ + dw[1] = sbe->sbe[0]; + + /* see raster_set_gen7_3DSTATE_SF() */ + dw[2] = rs->sf[0]; + dw[3] = rs->sf[1]; + dw[4] = rs->sf[2]; + dw[5] = rs->raster[1]; + dw[6] = rs->raster[2]; + dw[7] = rs->raster[3]; + + /* see sbe_set_gen8_3DSTATE_SBE_SWIZ() */ + memcpy(&dw[8], sbe->swiz, sizeof(*dw) * 8); + + dw[16] = sbe->sbe[1]; + dw[17] = sbe->sbe[2]; + /* WrapShortest enables */ + dw[18] = 0; + dw[19] = 0; } static inline void gen7_3DSTATE_SF(struct ilo_builder *builder, - const struct ilo_rasterizer_sf *sf, - enum pipe_format zs_format, - int sample_count) + const struct ilo_state_raster *rs) { - const uint8_t cmd_len = 7; + const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 4 : 7; uint32_t *dw; - ILO_DEV_ASSERT(builder->dev, 7, 7.5); - - ilo_builder_batch_pointer(builder, cmd_len, &dw); - - gen7_internal_3dstate_sf(builder, cmd_len, dw, sf, sample_count); - - if (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) { - int hw_format; - - /* separate stencil */ - switch (zs_format) { - case PIPE_FORMAT_Z16_UNORM: - hw_format = GEN6_ZFORMAT_D16_UNORM; - break; - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - hw_format = GEN6_ZFORMAT_D32_FLOAT; - break; - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - hw_format = GEN6_ZFORMAT_D24_UNORM_X8_UINT; - break; - default: - /* FLOAT surface is assumed when there is no depth buffer */ - hw_format = GEN6_ZFORMAT_D32_FLOAT; - break; - } - - dw[1] |= hw_format << GEN7_SF_DW1_DEPTH_FORMAT__SHIFT; - } -} - -static inline void -gen8_3DSTATE_SF(struct ilo_builder *builder, - const struct ilo_rasterizer_sf *sf) -{ - const uint8_t cmd_len = 4; - uint32_t *dw; - - ILO_DEV_ASSERT(builder->dev, 8, 8); + ILO_DEV_ASSERT(builder->dev, 7, 8); ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_SF) | (cmd_len - 2); - /* see rasterizer_init_sf_gen8() */ - STATIC_ASSERT(Elements(sf->payload) >= 3); - dw[1] = sf->payload[0]; - dw[2] = sf->payload[1]; - dw[3] = sf->payload[2]; + /* see raster_set_gen7_3DSTATE_SF() or raster_set_gen8_3DSTATE_SF() */ + dw[1] = rs->sf[0]; + dw[2] = rs->sf[1]; + dw[3] = rs->sf[2]; + if (ilo_dev_gen(builder->dev) < ILO_GEN(8)) { + dw[4] = rs->raster[1]; + dw[5] = rs->raster[2]; + dw[6] = rs->raster[3]; + } } static inline void gen7_3DSTATE_SBE(struct ilo_builder *builder, - const struct ilo_shader_state *fs, - int sprite_coord_mode) + const struct ilo_state_sbe *sbe) { const uint8_t cmd_len = 14; - uint32_t gen8_3dstate_sbe[4], gen8_3dstate_sbe_swiz[11]; uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 7, 7.5); - gen8_internal_3dstate_sbe(builder, Elements(gen8_3dstate_sbe), - gen8_3dstate_sbe, fs, sprite_coord_mode); - 
gen8_internal_3dstate_sbe_swiz(builder, Elements(gen8_3dstate_sbe_swiz), - gen8_3dstate_sbe_swiz, fs); - ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_SBE) | (cmd_len - 2); - dw[1] = gen8_3dstate_sbe[1]; - memcpy(&dw[2], &gen8_3dstate_sbe_swiz[1], sizeof(*dw) * 8); - dw[10] = gen8_3dstate_sbe[2]; - dw[11] = gen8_3dstate_sbe[3]; - dw[12] = gen8_3dstate_sbe_swiz[9]; - dw[13] = gen8_3dstate_sbe_swiz[10]; + /* see sbe_set_gen8_3DSTATE_SBE() and sbe_set_gen8_3DSTATE_SBE_SWIZ() */ + dw[1] = sbe->sbe[0]; + memcpy(&dw[2], sbe->swiz, sizeof(*dw) * 8); + dw[10] = sbe->sbe[1]; + dw[11] = sbe->sbe[2]; + + /* WrapShortest enables */ + dw[12] = 0; + dw[13] = 0; } static inline void gen8_3DSTATE_SBE(struct ilo_builder *builder, - const struct ilo_shader_state *fs, - int sprite_coord_mode) + const struct ilo_state_sbe *sbe) { const uint8_t cmd_len = 4; uint32_t *dw; @@ -366,12 +152,16 @@ gen8_3DSTATE_SBE(struct ilo_builder *builder, ilo_builder_batch_pointer(builder, cmd_len, &dw); - gen8_internal_3dstate_sbe(builder, cmd_len, dw, fs, sprite_coord_mode); + /* see sbe_set_gen8_3DSTATE_SBE() */ + dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_SBE) | (cmd_len - 2); + dw[1] = sbe->sbe[0]; + dw[2] = sbe->sbe[1]; + dw[3] = sbe->sbe[2]; } static inline void gen8_3DSTATE_SBE_SWIZ(struct ilo_builder *builder, - const struct ilo_shader_state *fs) + const struct ilo_state_sbe *sbe) { const uint8_t cmd_len = 11; uint32_t *dw; @@ -380,12 +170,17 @@ gen8_3DSTATE_SBE_SWIZ(struct ilo_builder *builder, ilo_builder_batch_pointer(builder, cmd_len, &dw); - gen8_internal_3dstate_sbe_swiz(builder, cmd_len, dw, fs); + dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_SBE_SWIZ) | (cmd_len - 2); + /* see sbe_set_gen8_3DSTATE_SBE_SWIZ() */ + memcpy(&dw[1], sbe->swiz, sizeof(*dw) * 8); + /* WrapShortest enables */ + dw[9] = 0; + dw[10] = 0; } static inline void gen8_3DSTATE_RASTER(struct ilo_builder *builder, - const struct ilo_rasterizer_sf *sf) + const struct ilo_state_raster *rs) { const uint8_t cmd_len = 5; uint32_t *dw; @@ -395,232 +190,108 @@ gen8_3DSTATE_RASTER(struct ilo_builder *builder, ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_RASTER) | (cmd_len - 2); - dw[1] = sf->dw_raster; - dw[2] = sf->dw_depth_offset_const; - dw[3] = sf->dw_depth_offset_scale; - dw[4] = sf->dw_depth_offset_clamp; + /* see raster_set_gen8_3DSTATE_RASTER() */ + dw[1] = rs->raster[0]; + dw[2] = rs->raster[1]; + dw[3] = rs->raster[2]; + dw[4] = rs->raster[3]; } static inline void gen6_3DSTATE_WM(struct ilo_builder *builder, - const struct ilo_shader_state *fs, - const struct ilo_rasterizer_state *rasterizer, - bool dual_blend, bool cc_may_kill) + const struct ilo_state_raster *rs, + const struct ilo_state_ps *ps, + uint32_t kernel_offset) { const uint8_t cmd_len = 9; - const int num_samples = 1; - const struct ilo_shader_cso *cso; - uint32_t dw2, dw4, dw5, dw6, *dw; + uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 6, 6); - cso = ilo_shader_get_kernel_cso(fs); - dw2 = cso->payload[0]; - dw4 = cso->payload[1]; - dw5 = cso->payload[2]; - dw6 = cso->payload[3]; - - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 248: - * - * "This bit (Statistics Enable) must be disabled if either of these - * bits is set: Depth Buffer Clear , Hierarchical Depth Buffer Resolve - * Enable or Depth Buffer Resolve Enable." 
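The hunks in this file all follow one refactoring pattern: the builder functions stop computing hardware dwords from pipe state at emit time and instead copy dwords that the new ilo_state_* objects precompute when the state object is created. A minimal sketch of that split, with hypothetical names standing in for the ilo types:

#include <stdint.h>
#include <string.h>

struct mini_state {
   uint32_t payload[4];   /* filled once by a *_set_*() helper */
};

static void
mini_state_set(struct mini_state *st, uint32_t dw1, uint32_t dw2)
{
   st->payload[0] = dw1;
   st->payload[1] = dw2;
   st->payload[2] = 0;
   st->payload[3] = 0;
}

static void
mini_emit(uint32_t *batch, const struct mini_state *st)
{
   /* emission becomes a plain copy, with no per-draw recomputation */
   memcpy(batch, st->payload, sizeof(st->payload));
}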
- */ - dw4 |= GEN6_WM_DW4_STATISTICS; - - if (cc_may_kill) - dw5 |= GEN6_WM_DW5_PS_KILL_PIXEL | GEN6_WM_DW5_PS_DISPATCH_ENABLE; - - if (dual_blend) - dw5 |= GEN6_WM_DW5_PS_DUAL_SOURCE_BLEND; - - dw5 |= rasterizer->wm.payload[0]; - - dw6 |= rasterizer->wm.payload[1]; - - if (num_samples > 1) { - dw6 |= rasterizer->wm.dw_msaa_rast | - rasterizer->wm.dw_msaa_disp; - } - ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (cmd_len - 2); - dw[1] = ilo_shader_get_kernel_offset(fs); - dw[2] = dw2; - dw[3] = 0; /* scratch */ - dw[4] = dw4; - dw[5] = dw5; - dw[6] = dw6; + dw[1] = kernel_offset; + /* see raster_set_gen6_3dstate_wm() and ps_set_gen6_3dstate_wm() */ + dw[2] = ps->ps[0]; + dw[3] = ps->ps[1]; + dw[4] = rs->wm[0] | ps->ps[2]; + dw[5] = rs->wm[1] | ps->ps[3]; + dw[6] = rs->wm[2] | ps->ps[4]; dw[7] = 0; /* kernel 1 */ dw[8] = 0; /* kernel 2 */ } static inline void -gen6_hiz_3DSTATE_WM(struct ilo_builder *builder, uint32_t hiz_op) -{ - const uint8_t cmd_len = 9; - const int max_threads = (builder->dev->gt == 2) ? 80 : 40; - uint32_t *dw; - - ILO_DEV_ASSERT(builder->dev, 6, 6); - - ilo_builder_batch_pointer(builder, cmd_len, &dw); - - dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (cmd_len - 2); - dw[1] = 0; - dw[2] = 0; - dw[3] = 0; - dw[4] = hiz_op; - /* honor the valid range even if dispatching is disabled */ - dw[5] = (max_threads - 1) << GEN6_WM_DW5_MAX_THREADS__SHIFT; - dw[6] = 0; - dw[7] = 0; - dw[8] = 0; -} - -static inline void gen7_3DSTATE_WM(struct ilo_builder *builder, - const struct ilo_shader_state *fs, - const struct ilo_rasterizer_state *rasterizer, - bool cc_may_kill) + const struct ilo_state_raster *rs, + const struct ilo_state_ps *ps) { const uint8_t cmd_len = 3; - const int num_samples = 1; - const struct ilo_shader_cso *cso; - uint32_t dw1, dw2, *dw; + uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 7, 7.5); - /* see rasterizer_init_wm_gen7() */ - dw1 = rasterizer->wm.payload[0]; - dw2 = rasterizer->wm.payload[1]; - - /* see fs_init_cso_gen7() */ - cso = ilo_shader_get_kernel_cso(fs); - dw1 |= cso->payload[3]; - - dw1 |= GEN7_WM_DW1_STATISTICS; - - if (cc_may_kill) - dw1 |= GEN7_WM_DW1_PS_DISPATCH_ENABLE | GEN7_WM_DW1_PS_KILL_PIXEL; - - if (num_samples > 1) { - dw1 |= rasterizer->wm.dw_msaa_rast; - dw2 |= rasterizer->wm.dw_msaa_disp; - } - ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (cmd_len - 2); - dw[1] = dw1; - dw[2] = dw2; + /* see raster_set_gen8_3DSTATE_WM() and ps_set_gen7_3dstate_wm() */ + dw[1] = rs->wm[0] | ps->ps[0]; + dw[2] = ps->ps[1]; } static inline void gen8_3DSTATE_WM(struct ilo_builder *builder, - const struct ilo_shader_state *fs, - const struct ilo_rasterizer_state *rasterizer) + const struct ilo_state_raster *rs) { const uint8_t cmd_len = 2; - const struct ilo_shader_cso *cso; - uint32_t dw1, interps, *dw; + uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 8, 8); - /* see rasterizer_get_wm_gen8() */ - dw1 = rasterizer->wm.payload[0]; - dw1 |= GEN7_WM_DW1_STATISTICS; - - /* see fs_init_cso_gen8() */ - cso = ilo_shader_get_kernel_cso(fs); - interps = cso->payload[4]; - - assert(!(dw1 & interps)); - ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (cmd_len - 2); - dw[1] = dw1 | interps; -} - -static inline void -gen7_hiz_3DSTATE_WM(struct ilo_builder *builder, uint32_t hiz_op) -{ - const uint8_t cmd_len = 3; - uint32_t *dw; - - ILO_DEV_ASSERT(builder->dev, 7, 7.5); - - ilo_builder_batch_pointer(builder, cmd_len, &dw); - dw[0] = 
GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (cmd_len - 2); - dw[1] = hiz_op; - dw[2] = 0; + /* see raster_set_gen8_3DSTATE_WM() */ + dw[1] = rs->wm[0]; } static inline void gen8_3DSTATE_WM_DEPTH_STENCIL(struct ilo_builder *builder, - const struct ilo_dsa_state *dsa) + const struct ilo_state_cc *cc) { const uint8_t cmd_len = 3; - uint32_t dw1, dw2, *dw; + uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 8, 8); - dw1 = dsa->payload[0]; - dw2 = dsa->payload[1]; - ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_WM_DEPTH_STENCIL) | (cmd_len - 2); - dw[1] = dw1; - dw[2] = dw2; + /* see cc_set_gen8_3DSTATE_WM_DEPTH_STENCIL() */ + dw[1] = cc->ds[0]; + dw[2] = cc->ds[1]; } static inline void -gen8_3DSTATE_WM_HZ_OP(struct ilo_builder *builder, uint32_t op, - uint16_t width, uint16_t height, int sample_count) +gen8_3DSTATE_WM_HZ_OP(struct ilo_builder *builder, + const struct ilo_state_raster *rs, + uint16_t width, uint16_t height) { const uint8_t cmd_len = 5; - const uint32_t sample_mask = ((1 << sample_count) - 1) | 0x1; - uint32_t dw1, *dw; + uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 8, 8); - dw1 = op; - - switch (sample_count) { - case 0: - case 1: - dw1 |= GEN8_WM_HZ_DW1_NUMSAMPLES_1; - break; - case 2: - dw1 |= GEN8_WM_HZ_DW1_NUMSAMPLES_2; - break; - case 4: - dw1 |= GEN8_WM_HZ_DW1_NUMSAMPLES_4; - break; - case 8: - dw1 |= GEN8_WM_HZ_DW1_NUMSAMPLES_8; - break; - case 16: - dw1 |= GEN8_WM_HZ_DW1_NUMSAMPLES_16; - break; - default: - assert(!"unsupported sample count"); - dw1 |= GEN8_WM_HZ_DW1_NUMSAMPLES_1; - break; - } - ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_WM_HZ_OP) | (cmd_len - 2); - dw[1] = dw1; + /* see raster_set_gen8_3dstate_wm_hz_op() */ + dw[1] = rs->wm[1]; dw[2] = 0; - /* exclusive? */ + /* exclusive */ dw[3] = height << 16 | width; - dw[4] = sample_mask; + dw[4] = rs->wm[2]; } static inline void @@ -656,100 +327,48 @@ gen8_3DSTATE_WM_CHROMAKEY(struct ilo_builder *builder) static inline void gen7_3DSTATE_PS(struct ilo_builder *builder, - const struct ilo_shader_state *fs, - bool dual_blend) + const struct ilo_state_ps *ps, + uint32_t kernel_offset) { const uint8_t cmd_len = 8; - const struct ilo_shader_cso *cso; - uint32_t dw2, dw4, dw5, *dw; + uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 7, 7.5); - /* see fs_init_cso_gen7() */ - cso = ilo_shader_get_kernel_cso(fs); - dw2 = cso->payload[0]; - dw4 = cso->payload[1]; - dw5 = cso->payload[2]; - - if (dual_blend) - dw4 |= GEN7_PS_DW4_DUAL_SOURCE_BLEND; - ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PS) | (cmd_len - 2); - dw[1] = ilo_shader_get_kernel_offset(fs); - dw[2] = dw2; - dw[3] = 0; /* scratch */ - dw[4] = dw4; - dw[5] = dw5; + dw[1] = kernel_offset; + /* see ps_set_gen7_3DSTATE_PS() */ + dw[2] = ps->ps[2]; + dw[3] = ps->ps[3]; + dw[4] = ps->ps[4]; + dw[5] = ps->ps[5]; dw[6] = 0; /* kernel 1 */ dw[7] = 0; /* kernel 2 */ } static inline void -gen7_disable_3DSTATE_PS(struct ilo_builder *builder) -{ - const uint8_t cmd_len = 8; - int max_threads; - uint32_t dw4, *dw; - - ILO_DEV_ASSERT(builder->dev, 7, 7.5); - - /* GPU hangs if none of the dispatch enable bits is set */ - dw4 = GEN6_PS_DISPATCH_8 << GEN7_PS_DW4_DISPATCH_MODE__SHIFT; - - /* see brwCreateContext() */ - switch (ilo_dev_gen(builder->dev)) { - case ILO_GEN(7.5): - max_threads = (builder->dev->gt == 3) ? 408 : - (builder->dev->gt == 2) ? 
204 : 102; - dw4 |= (max_threads - 1) << GEN75_PS_DW4_MAX_THREADS__SHIFT; - break; - case ILO_GEN(7): - default: - max_threads = (builder->dev->gt == 2) ? 172 : 48; - dw4 |= (max_threads - 1) << GEN7_PS_DW4_MAX_THREADS__SHIFT; - break; - } - - ilo_builder_batch_pointer(builder, cmd_len, &dw); - - dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PS) | (cmd_len - 2); - dw[1] = 0; - dw[2] = 0; - dw[3] = 0; - dw[4] = dw4; - dw[5] = 0; - dw[6] = 0; - dw[7] = 0; -} - -static inline void gen8_3DSTATE_PS(struct ilo_builder *builder, - const struct ilo_shader_state *fs) + const struct ilo_state_ps *ps, + uint32_t kernel_offset) { const uint8_t cmd_len = 12; - const struct ilo_shader_cso *cso; - uint32_t dw3, dw6, dw7, *dw; + uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 8, 8); - /* see fs_init_cso_gen8() */ - cso = ilo_shader_get_kernel_cso(fs); - dw3 = cso->payload[0]; - dw6 = cso->payload[1]; - dw7 = cso->payload[2]; - ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PS) | (cmd_len - 2); - dw[1] = ilo_shader_get_kernel_offset(fs); + dw[1] = kernel_offset; dw[2] = 0; - dw[3] = dw3; - dw[4] = 0; /* scratch */ + /* see ps_set_gen8_3DSTATE_PS() */ + dw[3] = ps->ps[0]; + dw[4] = ps->ps[1]; dw[5] = 0; - dw[6] = dw6; - dw[7] = dw7; + dw[6] = ps->ps[2]; + dw[7] = ps->ps[3]; dw[8] = 0; /* kernel 1 */ dw[9] = 0; dw[10] = 0; /* kernel 2 */ @@ -758,66 +377,34 @@ gen8_3DSTATE_PS(struct ilo_builder *builder, static inline void gen8_3DSTATE_PS_EXTRA(struct ilo_builder *builder, - const struct ilo_shader_state *fs, - bool cc_may_kill, bool per_sample) + const struct ilo_state_ps *ps) { const uint8_t cmd_len = 2; - const struct ilo_shader_cso *cso; - uint32_t dw1, *dw; + uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 8, 8); - /* see fs_init_cso_gen8() */ - cso = ilo_shader_get_kernel_cso(fs); - dw1 = cso->payload[3]; - - if (cc_may_kill) - dw1 |= GEN8_PSX_DW1_DISPATCH_ENABLE | GEN8_PSX_DW1_KILL_PIXEL; - if (per_sample) - dw1 |= GEN8_PSX_DW1_PER_SAMPLE; - ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_PS_EXTRA) | (cmd_len - 2); - dw[1] = dw1; + /* see ps_set_gen8_3DSTATE_PS_EXTRA() */ + dw[1] = ps->ps[4]; } static inline void gen8_3DSTATE_PS_BLEND(struct ilo_builder *builder, - const struct ilo_blend_state *blend, - const struct ilo_fb_state *fb, - const struct ilo_dsa_state *dsa) + const struct ilo_state_cc *cc) { const uint8_t cmd_len = 2; - uint32_t dw1, *dw; + uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 8, 8); - dw1 = 0; - if (blend->alpha_to_coverage && fb->num_samples > 1) - dw1 |= GEN8_PS_BLEND_DW1_ALPHA_TO_COVERAGE; - - if (fb->state.nr_cbufs && fb->state.cbufs[0]) { - const struct ilo_fb_blend_caps *caps = &fb->blend_caps[0]; - - dw1 |= GEN8_PS_BLEND_DW1_WRITABLE_RT; - if (caps->can_blend) { - if (caps->dst_alpha_forced_one) - dw1 |= blend->dw_ps_blend_dst_alpha_forced_one; - else - dw1 |= blend->dw_ps_blend; - } - - if (caps->can_alpha_test) - dw1 |= dsa->dw_ps_blend_alpha; - } else { - dw1 |= dsa->dw_ps_blend_alpha; - } - ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_PS_BLEND) | (cmd_len - 2); - dw[1] = dw1; + /* see cc_set_gen8_3DSTATE_PS_BLEND() */ + dw[1] = cc->blend[0]; } static inline void @@ -862,101 +449,49 @@ gen7_3DSTATE_SAMPLER_STATE_POINTERS_PS(struct ilo_builder *builder, static inline void gen6_3DSTATE_MULTISAMPLE(struct ilo_builder *builder, - int num_samples, const uint32_t *pattern, - bool pixel_location_center) + const struct ilo_state_raster *rs, + const struct ilo_state_sample_pattern 
*pattern, + uint8_t sample_count) { const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) ? 4 : 3; - uint32_t dw1, dw2, dw3, *dw; + const uint32_t *packed = (const uint32_t *) + ilo_state_sample_pattern_get_packed_offsets(pattern, + builder->dev, sample_count); + uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 6, 7.5); - dw1 = (pixel_location_center) ? GEN6_MULTISAMPLE_DW1_PIXLOC_CENTER : - GEN6_MULTISAMPLE_DW1_PIXLOC_UL_CORNER; - - switch (num_samples) { - case 0: - case 1: - dw1 |= GEN6_MULTISAMPLE_DW1_NUMSAMPLES_1; - dw2 = 0; - dw3 = 0; - break; - case 4: - dw1 |= GEN6_MULTISAMPLE_DW1_NUMSAMPLES_4; - dw2 = pattern[0]; - dw3 = 0; - break; - case 8: - assert(ilo_dev_gen(builder->dev) >= ILO_GEN(7)); - dw1 |= GEN7_MULTISAMPLE_DW1_NUMSAMPLES_8; - dw2 = pattern[0]; - dw3 = pattern[1]; - break; - default: - assert(!"unsupported sample count"); - dw1 |= GEN6_MULTISAMPLE_DW1_NUMSAMPLES_1; - dw2 = 0; - dw3 = 0; - break; - } - ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_MULTISAMPLE) | (cmd_len - 2); - dw[1] = dw1; - dw[2] = dw2; + /* see raster_set_gen8_3DSTATE_MULTISAMPLE() */ + dw[1] = rs->sample[0]; + + /* see sample_pattern_set_gen8_3DSTATE_SAMPLE_PATTERN() */ + dw[2] = (sample_count >= 4) ? packed[0] : 0; if (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) - dw[3] = dw3; + dw[3] = (sample_count >= 8) ? packed[1] : 0; } static inline void gen8_3DSTATE_MULTISAMPLE(struct ilo_builder *builder, - int num_samples, - bool pixel_location_center) + const struct ilo_state_raster *rs) { const uint8_t cmd_len = 2; - uint32_t dw1, *dw; + uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 8, 8); - dw1 = (pixel_location_center) ? GEN6_MULTISAMPLE_DW1_PIXLOC_CENTER : - GEN6_MULTISAMPLE_DW1_PIXLOC_UL_CORNER; - - switch (num_samples) { - case 0: - case 1: - dw1 |= GEN6_MULTISAMPLE_DW1_NUMSAMPLES_1; - break; - case 2: - dw1 |= GEN8_MULTISAMPLE_DW1_NUMSAMPLES_2; - break; - case 4: - dw1 |= GEN6_MULTISAMPLE_DW1_NUMSAMPLES_4; - break; - case 8: - dw1 |= GEN7_MULTISAMPLE_DW1_NUMSAMPLES_8; - break; - case 16: - dw1 |= GEN8_MULTISAMPLE_DW1_NUMSAMPLES_16; - break; - default: - assert(!"unsupported sample count"); - dw1 |= GEN6_MULTISAMPLE_DW1_NUMSAMPLES_1; - break; - } - ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_MULTISAMPLE) | (cmd_len - 2); - dw[1] = dw1; + /* see raster_set_gen8_3DSTATE_MULTISAMPLE() */ + dw[1] = rs->sample[0]; } static inline void gen8_3DSTATE_SAMPLE_PATTERN(struct ilo_builder *builder, - const uint32_t *pattern_1x, - const uint32_t *pattern_2x, - const uint32_t *pattern_4x, - const uint32_t *pattern_8x, - const uint32_t *pattern_16x) + const struct ilo_state_sample_pattern *pattern) { const uint8_t cmd_len = 9; uint32_t *dw; @@ -966,61 +501,32 @@ gen8_3DSTATE_SAMPLE_PATTERN(struct ilo_builder *builder, ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_SAMPLE_PATTERN) | (cmd_len - 2); - dw[1] = pattern_16x[3]; - dw[2] = pattern_16x[2]; - dw[3] = pattern_16x[1]; - dw[4] = pattern_16x[0]; - dw[5] = pattern_8x[1]; - dw[6] = pattern_8x[0]; - dw[7] = pattern_4x[0]; - dw[8] = pattern_1x[0] << 16 | - pattern_2x[0]; + dw[1] = 0; + dw[2] = 0; + dw[3] = 0; + dw[4] = 0; + /* see sample_pattern_set_gen8_3DSTATE_SAMPLE_PATTERN() */ + dw[5] = ((const uint32_t *) pattern->pattern_8x)[1]; + dw[6] = ((const uint32_t *) pattern->pattern_8x)[0]; + dw[7] = ((const uint32_t *) pattern->pattern_4x)[0]; + dw[8] = pattern->pattern_1x[0] << 16 | + ((const uint16_t *) pattern->pattern_2x)[0]; } static inline 
void gen6_3DSTATE_SAMPLE_MASK(struct ilo_builder *builder, - unsigned sample_mask) + const struct ilo_state_raster *rs) { const uint8_t cmd_len = 2; - const unsigned valid_mask = 0xf; uint32_t *dw; - ILO_DEV_ASSERT(builder->dev, 6, 6); - - sample_mask &= valid_mask; - - ilo_builder_batch_pointer(builder, cmd_len, &dw); - - dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_SAMPLE_MASK) | (cmd_len - 2); - dw[1] = sample_mask; -} - -static inline void -gen7_3DSTATE_SAMPLE_MASK(struct ilo_builder *builder, - unsigned sample_mask, - int num_samples) -{ - const uint8_t cmd_len = 2; - const unsigned valid_mask = ((1 << num_samples) - 1) | 0x1; - uint32_t *dw; - - ILO_DEV_ASSERT(builder->dev, 7, 8); - - /* - * From the Ivy Bridge PRM, volume 2 part 1, page 294: - * - * "If Number of Multisamples is NUMSAMPLES_1, bits 7:1 of this field - * (Sample Mask) must be zero. - * - * If Number of Multisamples is NUMSAMPLES_4, bits 7:4 of this field - * must be zero." - */ - sample_mask &= valid_mask; + ILO_DEV_ASSERT(builder->dev, 6, 8); ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_SAMPLE_MASK) | (cmd_len - 2); - dw[1] = sample_mask; + /* see raster_set_gen6_3DSTATE_SAMPLE_MASK() */ + dw[1] = rs->sample[1]; } static inline void @@ -1070,95 +576,75 @@ gen6_3DSTATE_DRAWING_RECTANGLE(struct ilo_builder *builder, static inline void gen6_3DSTATE_POLY_STIPPLE_OFFSET(struct ilo_builder *builder, - int x_offset, int y_offset) + const struct ilo_state_poly_stipple *stipple) { const uint8_t cmd_len = 2; uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 6, 8); - assert(x_offset >= 0 && x_offset <= 31); - assert(y_offset >= 0 && y_offset <= 31); - ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_POLY_STIPPLE_OFFSET) | (cmd_len - 2); - dw[1] = x_offset << 8 | y_offset; + /* constant */ + dw[1] = 0; } static inline void gen6_3DSTATE_POLY_STIPPLE_PATTERN(struct ilo_builder *builder, - const struct pipe_poly_stipple *pattern) + const struct ilo_state_poly_stipple *stipple) { const uint8_t cmd_len = 33; uint32_t *dw; - int i; ILO_DEV_ASSERT(builder->dev, 6, 8); ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_POLY_STIPPLE_PATTERN) | (cmd_len - 2); - dw++; - - STATIC_ASSERT(Elements(pattern->stipple) == 32); - for (i = 0; i < 32; i++) - dw[i] = pattern->stipple[i]; + /* see poly_stipple_set_gen6_3DSTATE_POLY_STIPPLE_PATTERN() */ + memcpy(&dw[1], stipple->stipple, sizeof(stipple->stipple)); } static inline void gen6_3DSTATE_LINE_STIPPLE(struct ilo_builder *builder, - unsigned pattern, unsigned factor) + const struct ilo_state_line_stipple *stipple) { const uint8_t cmd_len = 3; - unsigned inverse; uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 6, 8); - assert((pattern & 0xffff) == pattern); - assert(factor >= 1 && factor <= 256); - ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_LINE_STIPPLE) | (cmd_len - 2); - dw[1] = pattern; - - if (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) { - /* in U1.16 */ - inverse = 65536 / factor; - - dw[2] = inverse << GEN7_LINE_STIPPLE_DW2_INVERSE_REPEAT_COUNT__SHIFT | - factor; - } - else { - /* in U1.13 */ - inverse = 8192 / factor; - - dw[2] = inverse << GEN6_LINE_STIPPLE_DW2_INVERSE_REPEAT_COUNT__SHIFT | - factor; - } + /* see line_stipple_set_gen6_3DSTATE_LINE_STIPPLE() */ + dw[1] = stipple->stipple[0]; + dw[2] = stipple->stipple[1]; } static inline void -gen6_3DSTATE_AA_LINE_PARAMETERS(struct ilo_builder *builder) +gen6_3DSTATE_AA_LINE_PARAMETERS(struct ilo_builder 
*builder, + const struct ilo_state_raster *rs) { const uint8_t cmd_len = 3; - const uint32_t dw[3] = { - GEN6_RENDER_CMD(3D, 3DSTATE_AA_LINE_PARAMETERS) | (cmd_len - 2), - 0 << GEN6_AA_LINE_DW1_BIAS__SHIFT | 0, - 0 << GEN6_AA_LINE_DW2_CAP_BIAS__SHIFT | 0, - }; + uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 6, 8); - ilo_builder_batch_write(builder, cmd_len, dw); + ilo_builder_batch_pointer(builder, cmd_len, &dw); + + dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_AA_LINE_PARAMETERS) | (cmd_len - 2); + /* constant */ + dw[1] = 0 << GEN6_AA_LINE_DW1_BIAS__SHIFT | + 0 << GEN6_AA_LINE_DW1_SLOPE__SHIFT; + dw[2] = 0 << GEN6_AA_LINE_DW2_CAP_BIAS__SHIFT | + 0 << GEN6_AA_LINE_DW2_CAP_SLOPE__SHIFT; } static inline void gen6_3DSTATE_DEPTH_BUFFER(struct ilo_builder *builder, - const struct ilo_zs_surface *zs, - bool aligned_8x4) + const struct ilo_state_zs *zs) { const uint32_t cmd = (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) ? GEN7_RENDER_CMD(3D, 3DSTATE_DEPTH_BUFFER) : @@ -1172,44 +658,49 @@ gen6_3DSTATE_DEPTH_BUFFER(struct ilo_builder *builder, pos = ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = cmd | (cmd_len - 2); - dw[1] = zs->payload[0]; - dw[2] = 0; - /* see ilo_gpe_init_zs_surface() */ + /* + * see zs_set_gen6_3DSTATE_DEPTH_BUFFER() and + * zs_set_gen7_3DSTATE_DEPTH_BUFFER() + */ if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) { + dw[1] = zs->depth[0]; + dw[2] = 0; dw[3] = 0; - dw[4] = (aligned_8x4) ? zs->dw_aligned_8x4 : zs->payload[2]; - dw[5] = zs->payload[3]; - dw[6] = zs->payload[4]; - dw[7] = zs->payload[5]; + dw[4] = zs->depth[2]; + dw[5] = zs->depth[3]; + dw[6] = 0; + dw[7] = zs->depth[4]; dw[5] |= builder->mocs << GEN8_DEPTH_DW5_MOCS__SHIFT; - if (zs->bo) { - ilo_builder_batch_reloc64(builder, pos + 2, zs->bo, - zs->payload[1], INTEL_RELOC_WRITE); + if (zs->depth_bo) { + ilo_builder_batch_reloc64(builder, pos + 2, zs->depth_bo, + zs->depth[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE); } } else { - dw[3] = (aligned_8x4) ? zs->dw_aligned_8x4 : zs->payload[2]; - dw[4] = zs->payload[3]; - dw[5] = zs->payload[4]; - dw[6] = zs->payload[5]; + dw[1] = zs->depth[0]; + dw[2] = 0; + dw[3] = zs->depth[2]; + dw[4] = zs->depth[3]; + dw[5] = 0; + dw[6] = zs->depth[4]; if (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) dw[4] |= builder->mocs << GEN7_DEPTH_DW4_MOCS__SHIFT; else dw[6] |= builder->mocs << GEN6_DEPTH_DW6_MOCS__SHIFT; - if (zs->bo) { - ilo_builder_batch_reloc(builder, pos + 2, zs->bo, - zs->payload[1], INTEL_RELOC_WRITE); + if (zs->depth_bo) { + ilo_builder_batch_reloc(builder, pos + 2, zs->depth_bo, + zs->depth[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE); } } } static inline void gen6_3DSTATE_STENCIL_BUFFER(struct ilo_builder *builder, - const struct ilo_zs_surface *zs) + const struct ilo_state_zs *zs) { const uint32_t cmd = (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) ? 
GEN7_RENDER_CMD(3D, 3DSTATE_STENCIL_BUFFER) : @@ -1223,33 +714,36 @@ gen6_3DSTATE_STENCIL_BUFFER(struct ilo_builder *builder, pos = ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = cmd | (cmd_len - 2); - /* see ilo_gpe_init_zs_surface() */ - dw[1] = zs->payload[6]; - dw[2] = 0; + /* see zs_set_gen6_3DSTATE_STENCIL_BUFFER() */ if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) { - dw[1] |= builder->mocs << GEN8_STENCIL_DW1_MOCS__SHIFT; - + dw[1] = zs->stencil[0]; + dw[2] = 0; dw[3] = 0; - dw[4] = zs->payload[8]; + dw[4] = zs->stencil[2]; - if (zs->separate_s8_bo) { - ilo_builder_batch_reloc64(builder, pos + 2, - zs->separate_s8_bo, zs->payload[7], INTEL_RELOC_WRITE); + dw[1] |= builder->mocs << GEN8_STENCIL_DW1_MOCS__SHIFT; + + if (zs->stencil_bo) { + ilo_builder_batch_reloc64(builder, pos + 2, zs->stencil_bo, + zs->stencil[1], (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE); } } else { + dw[1] = zs->stencil[0]; + dw[2] = 0; + dw[1] |= builder->mocs << GEN6_STENCIL_DW1_MOCS__SHIFT; - if (zs->separate_s8_bo) { - ilo_builder_batch_reloc(builder, pos + 2, - zs->separate_s8_bo, zs->payload[7], INTEL_RELOC_WRITE); + if (zs->stencil_bo) { + ilo_builder_batch_reloc(builder, pos + 2, zs->stencil_bo, + zs->stencil[1], (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE); } } } static inline void gen6_3DSTATE_HIER_DEPTH_BUFFER(struct ilo_builder *builder, - const struct ilo_zs_surface *zs) + const struct ilo_state_zs *zs) { const uint32_t cmd = (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) ? GEN7_RENDER_CMD(3D, 3DSTATE_HIER_DEPTH_BUFFER) : @@ -1263,26 +757,29 @@ gen6_3DSTATE_HIER_DEPTH_BUFFER(struct ilo_builder *builder, pos = ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = cmd | (cmd_len - 2); - /* see ilo_gpe_init_zs_surface() */ - dw[1] = zs->payload[9]; - dw[2] = 0; + /* see zs_set_gen6_3DSTATE_HIER_DEPTH_BUFFER() */ if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) { - dw[1] |= builder->mocs << GEN8_HIZ_DW1_MOCS__SHIFT; - + dw[1] = zs->hiz[0]; + dw[2] = 0; dw[3] = 0; - dw[4] = zs->payload[11]; + dw[4] = zs->hiz[2]; + + dw[1] |= builder->mocs << GEN8_HIZ_DW1_MOCS__SHIFT; if (zs->hiz_bo) { - ilo_builder_batch_reloc64(builder, pos + 2, - zs->hiz_bo, zs->payload[10], INTEL_RELOC_WRITE); + ilo_builder_batch_reloc64(builder, pos + 2, zs->hiz_bo, + zs->hiz[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE); } } else { + dw[1] = zs->hiz[0]; + dw[2] = 0; + dw[1] |= builder->mocs << GEN6_HIZ_DW1_MOCS__SHIFT; if (zs->hiz_bo) { - ilo_builder_batch_reloc(builder, pos + 2, - zs->hiz_bo, zs->payload[10], INTEL_RELOC_WRITE); + ilo_builder_batch_reloc(builder, pos + 2, zs->hiz_bo, + zs->hiz[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE); } } } @@ -1440,34 +937,24 @@ gen7_3DSTATE_BLEND_STATE_POINTERS(struct ilo_builder *builder, static inline uint32_t gen6_CLIP_VIEWPORT(struct ilo_builder *builder, - const struct ilo_viewport_cso *viewports, - unsigned num_viewports) + const struct ilo_state_viewport *vp) { const int state_align = 32; - const int state_len = 4 * num_viewports; + const int state_len = 4 * vp->count; uint32_t state_offset, *dw; - unsigned i; + int i; ILO_DEV_ASSERT(builder->dev, 6, 6); - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 193: - * - * "The viewport-related state is stored as an array of up to 16 - * elements..." 
- */ - assert(num_viewports && num_viewports <= 16); - state_offset = ilo_builder_dynamic_pointer(builder, ILO_BUILDER_ITEM_CLIP_VIEWPORT, state_align, state_len, &dw); - for (i = 0; i < num_viewports; i++) { - const struct ilo_viewport_cso *vp = &viewports[i]; - - dw[0] = fui(vp->min_gbx); - dw[1] = fui(vp->max_gbx); - dw[2] = fui(vp->min_gby); - dw[3] = fui(vp->max_gby); + for (i = 0; i < vp->count; i++) { + /* see viewport_matrix_set_gen7_SF_CLIP_VIEWPORT() */ + dw[0] = vp->sf_clip[i][8]; + dw[1] = vp->sf_clip[i][9]; + dw[2] = vp->sf_clip[i][10]; + dw[3] = vp->sf_clip[i][11]; dw += 4; } @@ -1477,38 +964,21 @@ gen6_CLIP_VIEWPORT(struct ilo_builder *builder, static inline uint32_t gen6_SF_VIEWPORT(struct ilo_builder *builder, - const struct ilo_viewport_cso *viewports, - unsigned num_viewports) + const struct ilo_state_viewport *vp) { const int state_align = 32; - const int state_len = 8 * num_viewports; + const int state_len = 8 * vp->count; uint32_t state_offset, *dw; - unsigned i; + int i; ILO_DEV_ASSERT(builder->dev, 6, 6); - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 262: - * - * "The viewport-specific state used by the SF unit (SF_VIEWPORT) is - * stored as an array of up to 16 elements..." - */ - assert(num_viewports && num_viewports <= 16); - state_offset = ilo_builder_dynamic_pointer(builder, ILO_BUILDER_ITEM_SF_VIEWPORT, state_align, state_len, &dw); - for (i = 0; i < num_viewports; i++) { - const struct ilo_viewport_cso *vp = &viewports[i]; - - dw[0] = fui(vp->m00); - dw[1] = fui(vp->m11); - dw[2] = fui(vp->m22); - dw[3] = fui(vp->m30); - dw[4] = fui(vp->m31); - dw[5] = fui(vp->m32); - dw[6] = 0; - dw[7] = 0; + for (i = 0; i < vp->count; i++) { + /* see viewport_matrix_set_gen7_SF_CLIP_VIEWPORT() */ + memcpy(dw, vp->sf_clip[i], sizeof(*dw) * 8); dw += 8; } @@ -1518,298 +988,103 @@ gen6_SF_VIEWPORT(struct ilo_builder *builder, static inline uint32_t gen7_SF_CLIP_VIEWPORT(struct ilo_builder *builder, - const struct ilo_viewport_cso *viewports, - unsigned num_viewports) + const struct ilo_state_viewport *vp) { const int state_align = 64; - const int state_len = 16 * num_viewports; - uint32_t state_offset, *dw; - unsigned i; + const int state_len = 16 * vp->count; ILO_DEV_ASSERT(builder->dev, 7, 8); - /* - * From the Ivy Bridge PRM, volume 2 part 1, page 270: - * - * "The viewport-specific state used by both the SF and CL units - * (SF_CLIP_VIEWPORT) is stored as an array of up to 16 elements, each - * of which contains the DWords described below. The start of each - * element is spaced 16 DWords apart. The location of first element of - * the array, as specified by both Pointer to SF_VIEWPORT and Pointer - * to CLIP_VIEWPORT, is aligned to a 64-byte boundary." 
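Taken together, the viewport emitters above and below imply this layout for one 16-dword SF_CLIP_VIEWPORT element: gen6_SF_VIEWPORT copies dwords 0..7, gen6_CLIP_VIEWPORT copies dwords 8..11, and only Gen8 uses 12..15. A summary of the visible code, with illustrative enumerator names:

enum sf_clip_viewport_dword {
   SF_CLIP_M00 = 0,      /* viewport transform matrix */
   SF_CLIP_M11 = 1,
   SF_CLIP_M22 = 2,
   SF_CLIP_M30 = 3,
   SF_CLIP_M31 = 4,
   SF_CLIP_M32 = 5,
   /* dwords 6..7 are zero */
   SF_CLIP_MIN_GBX = 8,  /* guard band extents */
   SF_CLIP_MAX_GBX = 9,
   SF_CLIP_MIN_GBY = 10,
   SF_CLIP_MAX_GBY = 11
   /* dwords 12..15: viewport extents on Gen8, zero on Gen7 */
};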
- */ - assert(num_viewports && num_viewports <= 16); - - state_offset = ilo_builder_dynamic_pointer(builder, - ILO_BUILDER_ITEM_SF_VIEWPORT, state_align, state_len, &dw); - - for (i = 0; i < num_viewports; i++) { - const struct ilo_viewport_cso *vp = &viewports[i]; - - dw[0] = fui(vp->m00); - dw[1] = fui(vp->m11); - dw[2] = fui(vp->m22); - dw[3] = fui(vp->m30); - dw[4] = fui(vp->m31); - dw[5] = fui(vp->m32); - dw[6] = 0; - dw[7] = 0; - - dw[8] = fui(vp->min_gbx); - dw[9] = fui(vp->max_gbx); - dw[10] = fui(vp->min_gby); - dw[11] = fui(vp->max_gby); - - if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) { - dw[12] = fui(vp->min_x); - dw[13] = fui(vp->max_x - 1.0f); - dw[14] = fui(vp->min_y); - dw[15] = fui(vp->max_y - 1.0f); - } else { - dw[12] = 0; - dw[13] = 0; - dw[14] = 0; - dw[15] = 0; - } - - dw += 16; - } - - return state_offset; + /* see viewport_matrix_set_gen7_SF_CLIP_VIEWPORT() */ + return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_SF_VIEWPORT, + state_align, state_len, (const uint32_t *) vp->sf_clip); } static inline uint32_t gen6_CC_VIEWPORT(struct ilo_builder *builder, - const struct ilo_viewport_cso *viewports, - unsigned num_viewports) + const struct ilo_state_viewport *vp) { const int state_align = 32; - const int state_len = 2 * num_viewports; - uint32_t state_offset, *dw; - unsigned i; + const int state_len = 2 * vp->count; ILO_DEV_ASSERT(builder->dev, 6, 8); - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 385: - * - * "The viewport state is stored as an array of up to 16 elements..." - */ - assert(num_viewports && num_viewports <= 16); - - state_offset = ilo_builder_dynamic_pointer(builder, - ILO_BUILDER_ITEM_CC_VIEWPORT, state_align, state_len, &dw); - - for (i = 0; i < num_viewports; i++) { - const struct ilo_viewport_cso *vp = &viewports[i]; - - dw[0] = fui(vp->min_z); - dw[1] = fui(vp->max_z); - - dw += 2; - } - - return state_offset; + /* see viewport_matrix_set_gen6_CC_VIEWPORT() */ + return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_CC_VIEWPORT, + state_align, state_len, (const uint32_t *) vp->cc); } static inline uint32_t gen6_SCISSOR_RECT(struct ilo_builder *builder, - const struct ilo_scissor_state *scissor, - unsigned num_viewports) + const struct ilo_state_viewport *vp) { const int state_align = 32; - const int state_len = 2 * num_viewports; + const int state_len = 2 * vp->count; ILO_DEV_ASSERT(builder->dev, 6, 8); - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 263: - * - * "The viewport-specific state used by the SF unit (SCISSOR_RECT) is - * stored as an array of up to 16 elements..." 
- */ - assert(num_viewports && num_viewports <= 16); - assert(Elements(scissor->payload) >= state_len); - + /* see viewport_scissor_set_gen6_SCISSOR_RECT() */ return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_SCISSOR_RECT, - state_align, state_len, scissor->payload); + state_align, state_len, (const uint32_t *) vp->scissor); } static inline uint32_t gen6_COLOR_CALC_STATE(struct ilo_builder *builder, - const struct pipe_stencil_ref *stencil_ref, - ubyte alpha_ref, - const struct pipe_blend_color *blend_color) + const struct ilo_state_cc *cc) { const int state_align = 64; const int state_len = 6; - uint32_t state_offset, *dw; ILO_DEV_ASSERT(builder->dev, 6, 8); - state_offset = ilo_builder_dynamic_pointer(builder, - ILO_BUILDER_ITEM_COLOR_CALC, state_align, state_len, &dw); - - dw[0] = stencil_ref->ref_value[0] << 24 | - stencil_ref->ref_value[1] << 16 | - GEN6_CC_DW0_ALPHATEST_UNORM8; - dw[1] = alpha_ref; - dw[2] = fui(blend_color->color[0]); - dw[3] = fui(blend_color->color[1]); - dw[4] = fui(blend_color->color[2]); - dw[5] = fui(blend_color->color[3]); - - return state_offset; + /* see cc_params_set_gen6_COLOR_CALC_STATE() */ + return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_COLOR_CALC, + state_align, state_len, cc->cc); } static inline uint32_t gen6_DEPTH_STENCIL_STATE(struct ilo_builder *builder, - const struct ilo_dsa_state *dsa) + const struct ilo_state_cc *cc) { const int state_align = 64; const int state_len = 3; ILO_DEV_ASSERT(builder->dev, 6, 7.5); - STATIC_ASSERT(Elements(dsa->payload) >= state_len); - + /* see cc_set_gen6_DEPTH_STENCIL_STATE() */ return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_DEPTH_STENCIL, - state_align, state_len, dsa->payload); + state_align, state_len, cc->ds); } static inline uint32_t gen6_BLEND_STATE(struct ilo_builder *builder, - const struct ilo_blend_state *blend, - const struct ilo_fb_state *fb, - const struct ilo_dsa_state *dsa) + const struct ilo_state_cc *cc) { const int state_align = 64; - int state_len; - uint32_t state_offset, *dw; - unsigned num_targets, i; + const int state_len = 2 * cc->blend_state_count; ILO_DEV_ASSERT(builder->dev, 6, 7.5); - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 376: - * - * "The blend state is stored as an array of up to 8 elements..." - */ - num_targets = fb->state.nr_cbufs; - assert(num_targets <= 8); - - if (!num_targets) { - if (!dsa->dw_blend_alpha) - return 0; - /* to be able to reference alpha func */ - num_targets = 1; - } - - state_len = 2 * num_targets; - - state_offset = ilo_builder_dynamic_pointer(builder, - ILO_BUILDER_ITEM_BLEND, state_align, state_len, &dw); - - for (i = 0; i < num_targets; i++) { - const struct ilo_blend_cso *cso = &blend->cso[i]; - - dw[0] = cso->payload[0]; - dw[1] = cso->payload[1] | blend->dw_shared; - - if (i < fb->state.nr_cbufs && fb->state.cbufs[i]) { - const struct ilo_fb_blend_caps *caps = &fb->blend_caps[i]; - - if (caps->can_blend) { - if (caps->dst_alpha_forced_one) - dw[0] |= cso->dw_blend_dst_alpha_forced_one; - else - dw[0] |= cso->dw_blend; - } - - if (caps->can_logicop) - dw[1] |= blend->dw_logicop; - - if (caps->can_alpha_test) - dw[1] |= dsa->dw_blend_alpha; - } else { - dw[1] |= GEN6_RT_DW1_WRITE_DISABLE_A | - GEN6_RT_DW1_WRITE_DISABLE_R | - GEN6_RT_DW1_WRITE_DISABLE_G | - GEN6_RT_DW1_WRITE_DISABLE_B | - dsa->dw_blend_alpha; - } + if (!state_len) + return 0; - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 356: - * - * "When NumSamples = 1, AlphaToCoverage and AlphaToCoverage - * Dither both must be disabled." 
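With the per-render-target caps logic gone, the BLEND_STATE lengths are the only arithmetic left in these emitters; written out plainly (illustration only), gen6 emits two dwords per render target starting at cc->blend, while gen8 adds one leading shared dword and is written starting at &cc->blend[1]:

static inline int
gen6_blend_state_len(int blend_state_count)
{
   return 2 * blend_state_count;
}

static inline int
gen8_blend_state_len(int blend_state_count)
{
   return 1 + 2 * blend_state_count;
}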
- * - * There is no such limitation on GEN7, or for AlphaToOne. But GL - * requires that anyway. - */ - if (fb->num_samples > 1) - dw[1] |= blend->dw_alpha_mod; - - dw += 2; - } - - return state_offset; + /* see cc_set_gen6_BLEND_STATE() */ + return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_BLEND, + state_align, state_len, cc->blend); } static inline uint32_t gen8_BLEND_STATE(struct ilo_builder *builder, - const struct ilo_blend_state *blend, - const struct ilo_fb_state *fb, - const struct ilo_dsa_state *dsa) + const struct ilo_state_cc *cc) { const int state_align = 64; - const int state_len = 1 + 2 * fb->state.nr_cbufs; - uint32_t state_offset, *dw; - unsigned i; + const int state_len = 1 + 2 * cc->blend_state_count; ILO_DEV_ASSERT(builder->dev, 8, 8); - assert(fb->state.nr_cbufs <= 8); - - state_offset = ilo_builder_dynamic_pointer(builder, - ILO_BUILDER_ITEM_BLEND, state_align, state_len, &dw); - - dw[0] = blend->dw_shared; - if (fb->num_samples > 1) - dw[0] |= blend->dw_alpha_mod; - if (!fb->state.nr_cbufs || fb->blend_caps[0].can_alpha_test) - dw[0] |= dsa->dw_blend_alpha; - dw++; - - for (i = 0; i < fb->state.nr_cbufs; i++) { - const struct ilo_fb_blend_caps *caps = &fb->blend_caps[i]; - const struct ilo_blend_cso *cso = &blend->cso[i]; - - dw[0] = cso->payload[0]; - dw[1] = cso->payload[1]; - - if (fb->state.cbufs[i]) { - if (caps->can_blend) { - if (caps->dst_alpha_forced_one) - dw[0] |= cso->dw_blend_dst_alpha_forced_one; - else - dw[0] |= cso->dw_blend; - } - - if (caps->can_logicop) - dw[1] |= blend->dw_logicop; - } else { - dw[0] |= GEN8_RT_DW0_WRITE_DISABLE_A | - GEN8_RT_DW0_WRITE_DISABLE_R | - GEN8_RT_DW0_WRITE_DISABLE_G | - GEN8_RT_DW0_WRITE_DISABLE_B; - } - - dw += 2; - } - - return state_offset; + /* see cc_set_gen8_BLEND_STATE() */ + return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_BLEND, + state_align, state_len, &cc->blend[1]); } #endif /* ILO_BUILDER_3D_BOTTOM_H */ diff --git a/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h b/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h index 05dbce7c905..8d30095e6f6 100644 --- a/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h +++ b/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h @@ -29,303 +29,167 @@ #define ILO_BUILDER_3D_TOP_H #include "genhw/genhw.h" -#include "../ilo_resource.h" -#include "../ilo_shader.h" #include "intel_winsys.h" #include "ilo_core.h" #include "ilo_dev.h" -#include "ilo_state_3d.h" +#include "ilo_state_sampler.h" +#include "ilo_state_shader.h" +#include "ilo_state_sol.h" +#include "ilo_state_surface.h" +#include "ilo_state_urb.h" +#include "ilo_state_vf.h" #include "ilo_builder.h" static inline void gen6_3DSTATE_URB(struct ilo_builder *builder, - int vs_total_size, int gs_total_size, - int vs_entry_size, int gs_entry_size) + const struct ilo_state_urb *urb) { const uint8_t cmd_len = 3; - const int row_size = 128; /* 1024 bits */ - int vs_alloc_size, gs_alloc_size; - int vs_num_entries, gs_num_entries; uint32_t *dw; - ILO_DEV_ASSERT(builder->dev, 6, 6); - - /* in 1024-bit URB rows */ - vs_alloc_size = (vs_entry_size + row_size - 1) / row_size; - gs_alloc_size = (gs_entry_size + row_size - 1) / row_size; - - /* the valid range is [1, 5] */ - if (!vs_alloc_size) - vs_alloc_size = 1; - if (!gs_alloc_size) - gs_alloc_size = 1; - assert(vs_alloc_size <= 5 && gs_alloc_size <= 5); - - /* the valid range is [24, 256] in multiples of 4 */ - vs_num_entries = (vs_total_size / row_size / vs_alloc_size) & ~3; - if (vs_num_entries > 256) - vs_num_entries = 256; - assert(vs_num_entries >= 24); - 
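A worked example of the gen6 URB sizing being removed here, kept as a standalone sketch: with a 1024-bit (128-byte) row, a 480-byte VS entry needs 4 rows, and a 64 KB VS section then holds (65536 / 128 / 4) & ~3 = 128 entries, comfortably inside the documented [24, 256] range.

static inline int
gen6_vs_num_entries(int vs_total_size, int vs_entry_size)
{
   const int row_size = 128;   /* 1024 bits */
   int alloc = (vs_entry_size + row_size - 1) / row_size;
   int num;

   if (!alloc)
      alloc = 1;

   /* entry count is in multiples of 4, capped at 256 */
   num = (vs_total_size / row_size / alloc) & ~3;
   return (num > 256) ? 256 : num;
}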
- /* the valid range is [0, 256] in multiples of 4 */ - gs_num_entries = (gs_total_size / row_size / gs_alloc_size) & ~3; - if (gs_num_entries > 256) - gs_num_entries = 256; - ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_URB) | (cmd_len - 2); - dw[1] = (vs_alloc_size - 1) << GEN6_URB_DW1_VS_ENTRY_SIZE__SHIFT | - vs_num_entries << GEN6_URB_DW1_VS_ENTRY_COUNT__SHIFT; - dw[2] = gs_num_entries << GEN6_URB_DW2_GS_ENTRY_COUNT__SHIFT | - (gs_alloc_size - 1) << GEN6_URB_DW2_GS_ENTRY_SIZE__SHIFT; + /* see urb_set_gen6_3DSTATE_URB() */ + dw[1] = urb->urb[0]; + dw[2] = urb->urb[1]; } static inline void -gen7_3dstate_push_constant_alloc(struct ilo_builder *builder, - int subop, int offset, int size) +gen7_3DSTATE_PUSH_CONSTANT_ALLOC_VS(struct ilo_builder *builder, + const struct ilo_state_urb *urb) { - const uint32_t cmd = GEN6_RENDER_TYPE_RENDER | - GEN6_RENDER_SUBTYPE_3D | - subop; const uint8_t cmd_len = 2; - const int slice_count = ((ilo_dev_gen(builder->dev) == ILO_GEN(7.5) && - builder->dev->gt == 3) || - ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 2 : 1; uint32_t *dw; - int end; - - ILO_DEV_ASSERT(builder->dev, 7, 8); - - /* VS, HS, DS, GS, and PS variants */ - assert(subop >= GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_VS && - subop <= GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_PS); - - /* - * From the Ivy Bridge PRM, volume 2 part 1, page 68: - * - * "(A table that says the maximum size of each constant buffer is - * 16KB") - * - * From the Ivy Bridge PRM, volume 2 part 1, page 115: - * - * "The sum of the Constant Buffer Offset and the Constant Buffer Size - * may not exceed the maximum value of the Constant Buffer Size." - * - * Thus, the valid range of buffer end is [0KB, 16KB]. - */ - end = (offset + size) / 1024; - if (end > 16 * slice_count) { - assert(!"invalid constant buffer end"); - end = 16 * slice_count; - } - - /* the valid range of buffer offset is [0KB, 15KB] */ - offset = (offset + 1023) / 1024; - if (offset > 15 * slice_count) { - assert(!"invalid constant buffer offset"); - offset = 15 * slice_count; - } - - if (offset > end) { - assert(!size); - offset = end; - } - - /* the valid range of buffer size is [0KB, 15KB] */ - size = end - offset; - if (size > 15 * slice_count) { - assert(!"invalid constant buffer size"); - size = 15 * slice_count; - } - - assert(offset % slice_count == 0 && size % slice_count == 0); ilo_builder_batch_pointer(builder, cmd_len, &dw); - dw[0] = cmd | (cmd_len - 2); - dw[1] = offset << GEN7_PCB_ALLOC_DW1_OFFSET__SHIFT | - size; -} - -static inline void -gen7_3DSTATE_PUSH_CONSTANT_ALLOC_VS(struct ilo_builder *builder, - int offset, int size) -{ - gen7_3dstate_push_constant_alloc(builder, - GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_VS, offset, size); + dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PUSH_CONSTANT_ALLOC_VS) | + (cmd_len - 2); + /* see urb_set_gen7_3dstate_push_constant_alloc() */ + dw[1] = urb->pcb[0]; } static inline void gen7_3DSTATE_PUSH_CONSTANT_ALLOC_HS(struct ilo_builder *builder, - int offset, int size) + const struct ilo_state_urb *urb) { - gen7_3dstate_push_constant_alloc(builder, - GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_HS, offset, size); + const uint8_t cmd_len = 2; + uint32_t *dw; + + ilo_builder_batch_pointer(builder, cmd_len, &dw); + + dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PUSH_CONSTANT_ALLOC_HS) | + (cmd_len - 2); + /* see urb_set_gen7_3dstate_push_constant_alloc() */ + dw[1] = urb->pcb[1]; } static inline void gen7_3DSTATE_PUSH_CONSTANT_ALLOC_DS(struct ilo_builder *builder, - 
int offset, int size) + const struct ilo_state_urb *urb) { - gen7_3dstate_push_constant_alloc(builder, - GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_DS, offset, size); + const uint8_t cmd_len = 2; + uint32_t *dw; + + ilo_builder_batch_pointer(builder, cmd_len, &dw); + + dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PUSH_CONSTANT_ALLOC_DS) | + (cmd_len - 2); + /* see urb_set_gen7_3dstate_push_constant_alloc() */ + dw[1] = urb->pcb[2]; } static inline void gen7_3DSTATE_PUSH_CONSTANT_ALLOC_GS(struct ilo_builder *builder, - int offset, int size) + const struct ilo_state_urb *urb) { - gen7_3dstate_push_constant_alloc(builder, - GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_GS, offset, size); -} + const uint8_t cmd_len = 2; + uint32_t *dw; -static inline void -gen7_3DSTATE_PUSH_CONSTANT_ALLOC_PS(struct ilo_builder *builder, - int offset, int size) -{ - gen7_3dstate_push_constant_alloc(builder, - GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_PS, offset, size); + ilo_builder_batch_pointer(builder, cmd_len, &dw); + + dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PUSH_CONSTANT_ALLOC_GS) | + (cmd_len - 2); + /* see urb_set_gen7_3dstate_push_constant_alloc() */ + dw[1] = urb->pcb[3]; } static inline void -gen7_3dstate_urb(struct ilo_builder *builder, - int subop, int offset, int size, - int entry_size) +gen7_3DSTATE_PUSH_CONSTANT_ALLOC_PS(struct ilo_builder *builder, + const struct ilo_state_urb *urb) { - const uint32_t cmd = GEN6_RENDER_TYPE_RENDER | - GEN6_RENDER_SUBTYPE_3D | - subop; const uint8_t cmd_len = 2; - const int row_size = 64; /* 512 bits */ - int alloc_size, num_entries, min_entries, max_entries; uint32_t *dw; - ILO_DEV_ASSERT(builder->dev, 7, 8); - - /* VS, HS, DS, and GS variants */ - assert(subop >= GEN7_RENDER_OPCODE_3DSTATE_URB_VS && - subop <= GEN7_RENDER_OPCODE_3DSTATE_URB_GS); - - /* in multiples of 8KB */ - assert(offset % 8192 == 0); - offset /= 8192; - - /* in multiple of 512-bit rows */ - alloc_size = (entry_size + row_size - 1) / row_size; - if (!alloc_size) - alloc_size = 1; - - /* - * From the Ivy Bridge PRM, volume 2 part 1, page 34: - * - * "VS URB Entry Allocation Size equal to 4(5 512-bit URB rows) may - * cause performance to decrease due to banking in the URB. Element - * sizes of 16 to 20 should be programmed with six 512-bit URB rows." - */ - if (subop == GEN7_RENDER_OPCODE_3DSTATE_URB_VS && alloc_size == 5) - alloc_size = 6; - - /* in multiples of 8 */ - num_entries = (size / row_size / alloc_size) & ~7; - - switch (subop) { - case GEN7_RENDER_OPCODE_3DSTATE_URB_VS: - switch (ilo_dev_gen(builder->dev)) { - case ILO_GEN(8): - max_entries = 2560; - min_entries = 64; - break; - case ILO_GEN(7.5): - max_entries = (builder->dev->gt >= 2) ? 1664 : 640; - min_entries = (builder->dev->gt >= 2) ? 64 : 32; - break; - case ILO_GEN(7): - default: - max_entries = (builder->dev->gt == 2) ? 704 : 512; - min_entries = 32; - break; - } - - assert(num_entries >= min_entries); - if (num_entries > max_entries) - num_entries = max_entries; - break; - case GEN7_RENDER_OPCODE_3DSTATE_URB_HS: - max_entries = (builder->dev->gt == 2) ? 64 : 32; - if (num_entries > max_entries) - num_entries = max_entries; - break; - case GEN7_RENDER_OPCODE_3DSTATE_URB_DS: - if (num_entries) - assert(num_entries >= 138); - break; - case GEN7_RENDER_OPCODE_3DSTATE_URB_GS: - switch (ilo_dev_gen(builder->dev)) { - case ILO_GEN(8): - max_entries = 960; - break; - case ILO_GEN(7.5): - max_entries = (builder->dev->gt >= 2) ? 640 : 256; - break; - case ILO_GEN(7): - default: - max_entries = (builder->dev->gt == 2) ? 
320 : 192;
-         break;
-      }
-
-      if (num_entries > max_entries)
-         num_entries = max_entries;
-      break;
-   default:
-      break;
-   }
-
   ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
-   dw[0] = cmd | (cmd_len - 2);
-   dw[1] = offset << GEN7_URB_DW1_OFFSET__SHIFT |
-           (alloc_size - 1) << GEN7_URB_DW1_ENTRY_SIZE__SHIFT |
-           num_entries;
+   dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PUSH_CONSTANT_ALLOC_PS) |
+           (cmd_len - 2);
+   /* see urb_set_gen7_3dstate_push_constant_alloc() */
+   dw[1] = urb->pcb[4];
 }
 
 static inline void
 gen7_3DSTATE_URB_VS(struct ilo_builder *builder,
-                    int offset, int size, int entry_size)
+                    const struct ilo_state_urb *urb)
 {
-   gen7_3dstate_urb(builder, GEN7_RENDER_OPCODE_3DSTATE_URB_VS,
-         offset, size, entry_size);
+   const uint8_t cmd_len = 2;
+   uint32_t *dw;
+
+   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+
+   dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_URB_VS) | (cmd_len - 2);
+   /* see urb_set_gen7_3dstate_urb() */
+   dw[1] = urb->urb[0];
 }
 
 static inline void
 gen7_3DSTATE_URB_HS(struct ilo_builder *builder,
-                    int offset, int size, int entry_size)
+                    const struct ilo_state_urb *urb)
 {
-   gen7_3dstate_urb(builder, GEN7_RENDER_OPCODE_3DSTATE_URB_HS,
-         offset, size, entry_size);
+   const uint8_t cmd_len = 2;
+   uint32_t *dw;
+
+   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+
+   dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_URB_HS) | (cmd_len - 2);
+   /* see urb_set_gen7_3dstate_urb() */
+   dw[1] = urb->urb[1];
 }
 
 static inline void
 gen7_3DSTATE_URB_DS(struct ilo_builder *builder,
-                    int offset, int size, int entry_size)
+                    const struct ilo_state_urb *urb)
 {
-   gen7_3dstate_urb(builder, GEN7_RENDER_OPCODE_3DSTATE_URB_DS,
-         offset, size, entry_size);
+   const uint8_t cmd_len = 2;
+   uint32_t *dw;
+
+   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+
+   dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_URB_DS) | (cmd_len - 2);
+   /* see urb_set_gen7_3dstate_urb() */
+   dw[1] = urb->urb[2];
 }
 
 static inline void
 gen7_3DSTATE_URB_GS(struct ilo_builder *builder,
-                    int offset, int size, int entry_size)
+                    const struct ilo_state_urb *urb)
 {
-   gen7_3dstate_urb(builder, GEN7_RENDER_OPCODE_3DSTATE_URB_GS,
-         offset, size, entry_size);
+   const uint8_t cmd_len = 2;
+   uint32_t *dw;
+
+   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+
+   dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_URB_GS) | (cmd_len - 2);
+   /* see urb_set_gen7_3dstate_urb() */
+   dw[1] = urb->urb[3];
 }
 
 static inline void
 gen75_3DSTATE_VF(struct ilo_builder *builder,
-                 bool enable_cut_index,
-                 uint32_t cut_index)
+                 const struct ilo_state_vf *vf)
 {
    const uint8_t cmd_len = 2;
    uint32_t *dw;
@@ -334,11 +198,10 @@ gen75_3DSTATE_VF(struct ilo_builder *builder,
 
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
-   dw[0] = GEN75_RENDER_CMD(3D, 3DSTATE_VF) | (cmd_len - 2);
-   if (enable_cut_index)
-      dw[0] |= GEN75_VF_DW0_CUT_INDEX_ENABLE;
-
-   dw[1] = cut_index;
+   /* see vf_params_set_gen75_3DSTATE_VF() */
+   dw[0] = GEN75_RENDER_CMD(3D, 3DSTATE_VF) | (cmd_len - 2) |
+           vf->cut[0];
+   dw[1] = vf->cut[1];
 }
 
 static inline void
@@ -354,40 +217,11 @@ gen6_3DSTATE_VF_STATISTICS(struct ilo_builder *builder,
    ilo_builder_batch_write(builder, cmd_len, &dw0);
 }
 
-/**
- * Translate a pipe primitive type to the matching hardware primitive type.
- */ -static inline int -gen6_3d_translate_pipe_prim(unsigned prim) -{ - static const int prim_mapping[ILO_PRIM_MAX] = { - [PIPE_PRIM_POINTS] = GEN6_3DPRIM_POINTLIST, - [PIPE_PRIM_LINES] = GEN6_3DPRIM_LINELIST, - [PIPE_PRIM_LINE_LOOP] = GEN6_3DPRIM_LINELOOP, - [PIPE_PRIM_LINE_STRIP] = GEN6_3DPRIM_LINESTRIP, - [PIPE_PRIM_TRIANGLES] = GEN6_3DPRIM_TRILIST, - [PIPE_PRIM_TRIANGLE_STRIP] = GEN6_3DPRIM_TRISTRIP, - [PIPE_PRIM_TRIANGLE_FAN] = GEN6_3DPRIM_TRIFAN, - [PIPE_PRIM_QUADS] = GEN6_3DPRIM_QUADLIST, - [PIPE_PRIM_QUAD_STRIP] = GEN6_3DPRIM_QUADSTRIP, - [PIPE_PRIM_POLYGON] = GEN6_3DPRIM_POLYGON, - [PIPE_PRIM_LINES_ADJACENCY] = GEN6_3DPRIM_LINELIST_ADJ, - [PIPE_PRIM_LINE_STRIP_ADJACENCY] = GEN6_3DPRIM_LINESTRIP_ADJ, - [PIPE_PRIM_TRIANGLES_ADJACENCY] = GEN6_3DPRIM_TRILIST_ADJ, - [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = GEN6_3DPRIM_TRISTRIP_ADJ, - [ILO_PRIM_RECTANGLES] = GEN6_3DPRIM_RECTLIST, - }; - - assert(prim_mapping[prim]); - - return prim_mapping[prim]; -} - static inline void -gen8_3DSTATE_VF_TOPOLOGY(struct ilo_builder *builder, unsigned pipe_prim) +gen8_3DSTATE_VF_TOPOLOGY(struct ilo_builder *builder, + enum gen_3dprim_type topology) { const uint8_t cmd_len = 2; - const int prim = gen6_3d_translate_pipe_prim(pipe_prim); uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 8, 8); @@ -395,12 +229,13 @@ gen8_3DSTATE_VF_TOPOLOGY(struct ilo_builder *builder, unsigned pipe_prim) ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_VF_TOPOLOGY) | (cmd_len - 2); - dw[1] = prim; + dw[1] = topology << GEN8_TOPOLOGY_DW1_TYPE__SHIFT; } static inline void gen8_3DSTATE_VF_INSTANCING(struct ilo_builder *builder, - int vb_index, uint32_t step_rate) + const struct ilo_state_vf *vf, + uint32_t attr) { const uint8_t cmd_len = 3; uint32_t *dw; @@ -410,16 +245,20 @@ gen8_3DSTATE_VF_INSTANCING(struct ilo_builder *builder, ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_VF_INSTANCING) | (cmd_len - 2); - dw[1] = vb_index; - if (step_rate) - dw[1] |= GEN8_INSTANCING_DW1_ENABLE; - dw[2] = step_rate; + dw[1] = attr << GEN8_INSTANCING_DW1_VE_INDEX__SHIFT; + dw[2] = 0; + /* see vf_set_gen8_3DSTATE_VF_INSTANCING() */ + if (attr >= vf->internal_ve_count) { + attr -= vf->internal_ve_count; + + dw[1] |= vf->user_instancing[attr][0]; + dw[2] |= vf->user_instancing[attr][1]; + } } static inline void gen8_3DSTATE_VF_SGVS(struct ilo_builder *builder, - bool vid_enable, int vid_ve, int vid_comp, - bool iid_enable, int iid_ve, int iid_comp) + const struct ilo_state_vf *vf) { const uint8_t cmd_len = 2; uint32_t *dw; @@ -429,29 +268,19 @@ gen8_3DSTATE_VF_SGVS(struct ilo_builder *builder, ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_VF_SGVS) | (cmd_len - 2); - dw[1] = 0; - - if (iid_enable) { - dw[1] |= GEN8_SGVS_DW1_IID_ENABLE | - vid_comp << GEN8_SGVS_DW1_IID_VE_COMP__SHIFT | - vid_ve << GEN8_SGVS_DW1_IID_VE_INDEX__SHIFT; - } - - if (vid_enable) { - dw[1] |= GEN8_SGVS_DW1_VID_ENABLE | - vid_comp << GEN8_SGVS_DW1_VID_VE_COMP__SHIFT | - vid_ve << GEN8_SGVS_DW1_VID_VE_INDEX__SHIFT; - } + /* see vf_params_set_gen8_3DSTATE_VF_SGVS() */ + dw[1] = vf->sgvs[0]; } static inline void gen6_3DSTATE_VERTEX_BUFFERS(struct ilo_builder *builder, - const struct ilo_ve_state *ve, - const struct ilo_vb_state *vb) + const struct ilo_state_vf *vf, + const struct ilo_state_vertex_buffer *vb, + unsigned vb_count) { uint8_t cmd_len; uint32_t *dw; - unsigned pos, hw_idx; + unsigned pos, i; ILO_DEV_ASSERT(builder->dev, 6, 8); @@ -460,67 +289,52 @@ 
gen6_3DSTATE_VERTEX_BUFFERS(struct ilo_builder *builder, * * "From 1 to 33 VBs can be specified..." */ - assert(ve->vb_count <= 33); + assert(vb_count <= 33); - if (!ve->vb_count) + if (!vb_count) return; - cmd_len = 1 + 4 * ve->vb_count; + cmd_len = 1 + 4 * vb_count; pos = ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VERTEX_BUFFERS) | (cmd_len - 2); dw++; pos++; - for (hw_idx = 0; hw_idx < ve->vb_count; hw_idx++) { - const unsigned instance_divisor = ve->instance_divisors[hw_idx]; - const unsigned pipe_idx = ve->vb_mapping[hw_idx]; - const struct pipe_vertex_buffer *cso = &vb->states[pipe_idx]; + for (i = 0; i < vb_count; i++) { + const struct ilo_state_vertex_buffer *b = &vb[i]; - dw[0] = hw_idx << GEN6_VB_DW0_INDEX__SHIFT; + /* see vertex_buffer_set_gen8_vertex_buffer_state() */ + dw[0] = b->vb[0] | + i << GEN6_VB_DW0_INDEX__SHIFT; if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) dw[0] |= builder->mocs << GEN8_VB_DW0_MOCS__SHIFT; else dw[0] |= builder->mocs << GEN6_VB_DW0_MOCS__SHIFT; - if (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) - dw[0] |= GEN7_VB_DW0_ADDR_MODIFIED; - - if (instance_divisor) - dw[0] |= GEN6_VB_DW0_ACCESS_INSTANCEDATA; - else - dw[0] |= GEN6_VB_DW0_ACCESS_VERTEXDATA; - - /* use null vb if there is no buffer or the stride is out of range */ - if (!cso->buffer || cso->stride > 2048) { - dw[0] |= GEN6_VB_DW0_IS_NULL; - dw[1] = 0; - dw[2] = 0; - dw[3] = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? - 0 : instance_divisor; - - continue; - } - - dw[0] |= cso->stride << GEN6_VB_DW0_PITCH__SHIFT; + dw[1] = 0; + dw[2] = 0; + dw[3] = 0; if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) { - const struct ilo_buffer *buf = ilo_buffer(cso->buffer); - const uint32_t start_offset = cso->buffer_offset; + if (b->need_bo) + ilo_builder_batch_reloc64(builder, pos + 1, b->bo, b->vb[1], 0); - ilo_builder_batch_reloc64(builder, pos + 1, - buf->bo, start_offset, 0); - dw[3] = buf->bo_size; + dw[3] |= b->vb[2]; } else { - const struct ilo_buffer *buf = ilo_buffer(cso->buffer); - const uint32_t start_offset = cso->buffer_offset; - const uint32_t end_offset = buf->bo_size - 1; + const int8_t elem = vf->vb_to_first_elem[i]; - dw[3] = instance_divisor; + /* see vf_set_gen6_vertex_buffer_state() */ + if (elem >= 0) { + dw[0] |= vf->user_instancing[elem][0]; + dw[3] |= vf->user_instancing[elem][1]; + } - ilo_builder_batch_reloc(builder, pos + 1, buf->bo, start_offset, 0); - ilo_builder_batch_reloc(builder, pos + 2, buf->bo, end_offset, 0); + if (b->need_bo) { + ilo_builder_batch_reloc(builder, pos + 1, b->bo, b->vb[1], 0); + ilo_builder_batch_reloc(builder, pos + 2, b->bo, b->vb[2], 0); + } } dw += 4; @@ -563,248 +377,189 @@ gen6_user_3DSTATE_VERTEX_BUFFERS(struct ilo_builder *builder, static inline void gen6_3DSTATE_VERTEX_ELEMENTS(struct ilo_builder *builder, - const struct ilo_ve_state *ve) + const struct ilo_state_vf *vf) { uint8_t cmd_len; uint32_t *dw; - unsigned i; ILO_DEV_ASSERT(builder->dev, 6, 8); - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 92: - * - * "At least one VERTEX_ELEMENT_STATE structure must be included." - * - * From the Sandy Bridge PRM, volume 2 part 1, page 93: - * - * "Up to 34 (DevSNB+) vertex elements are supported." 
- */ - assert(ve->count + ve->prepend_nosrc_cso >= 1); - assert(ve->count + ve->prepend_nosrc_cso <= 34); - - STATIC_ASSERT(Elements(ve->cso[0].payload) == 2); + cmd_len = 1 + 2 * (vf->internal_ve_count + vf->user_ve_count); - cmd_len = 1 + 2 * (ve->count + ve->prepend_nosrc_cso); ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VERTEX_ELEMENTS) | (cmd_len - 2); dw++; - if (ve->prepend_nosrc_cso) { - memcpy(dw, ve->nosrc_cso.payload, sizeof(ve->nosrc_cso.payload)); - dw += 2; - } - - for (i = 0; i < ve->count - ve->last_cso_edgeflag; i++) { - memcpy(dw, ve->cso[i].payload, sizeof(ve->cso[i].payload)); - dw += 2; + /* + * see vf_params_set_gen6_internal_ve() and + * vf_set_gen6_3DSTATE_VERTEX_ELEMENTS() + */ + if (vf->internal_ve_count) { + memcpy(dw, vf->internal_ve, + sizeof(vf->internal_ve[0]) * vf->internal_ve_count); + dw += 2 * vf->internal_ve_count; } - if (ve->last_cso_edgeflag) - memcpy(dw, ve->edgeflag_cso.payload, sizeof(ve->edgeflag_cso.payload)); + memcpy(dw, vf->user_ve, sizeof(vf->user_ve[0]) * vf->user_ve_count); } static inline void gen6_3DSTATE_INDEX_BUFFER(struct ilo_builder *builder, - const struct ilo_ib_state *ib, - bool enable_cut_index) + const struct ilo_state_vf *vf, + const struct ilo_state_index_buffer *ib) { const uint8_t cmd_len = 3; - struct ilo_buffer *buf = ilo_buffer(ib->hw_resource); - uint32_t start_offset, end_offset; - int format; - uint32_t *dw; + uint32_t dw0, *dw; unsigned pos; ILO_DEV_ASSERT(builder->dev, 6, 7.5); - if (!buf) - return; - - /* this is moved to the new 3DSTATE_VF */ - if (ilo_dev_gen(builder->dev) >= ILO_GEN(7.5)) - assert(!enable_cut_index); - - switch (ib->hw_index_size) { - case 4: - format = GEN6_IB_DW0_FORMAT_DWORD; - break; - case 2: - format = GEN6_IB_DW0_FORMAT_WORD; - break; - case 1: - format = GEN6_IB_DW0_FORMAT_BYTE; - break; - default: - assert(!"unknown index size"); - format = GEN6_IB_DW0_FORMAT_BYTE; - break; - } + dw0 = GEN6_RENDER_CMD(3D, 3DSTATE_INDEX_BUFFER) | (cmd_len - 2) | + builder->mocs << GEN6_IB_DW0_MOCS__SHIFT; /* - * set start_offset to 0 here and adjust pipe_draw_info::start with - * ib->draw_start_offset in 3DPRIMITIVE + * see index_buffer_set_gen8_3DSTATE_INDEX_BUFFER() and + * vf_params_set_gen6_3dstate_index_buffer() */ - start_offset = 0; - end_offset = buf->bo_size; - - /* end_offset must also be aligned and is inclusive */ - end_offset -= (end_offset % ib->hw_index_size); - end_offset--; + dw0 |= ib->ib[0]; + if (ilo_dev_gen(builder->dev) <= ILO_GEN(7)) + dw0 |= vf->cut[0]; pos = ilo_builder_batch_pointer(builder, cmd_len, &dw); - dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_INDEX_BUFFER) | (cmd_len - 2) | - builder->mocs << GEN6_IB_DW0_MOCS__SHIFT | - format; - if (enable_cut_index) - dw[0] |= GEN6_IB_DW0_CUT_INDEX_ENABLE; - - ilo_builder_batch_reloc(builder, pos + 1, buf->bo, start_offset, 0); - ilo_builder_batch_reloc(builder, pos + 2, buf->bo, end_offset, 0); + dw[0] = dw0; + if (ib->need_bo) { + ilo_builder_batch_reloc(builder, pos + 1, ib->bo, ib->ib[1], 0); + ilo_builder_batch_reloc(builder, pos + 2, ib->bo, ib->ib[2], 0); + } else { + dw[1] = 0; + dw[2] = 0; + } } static inline void gen8_3DSTATE_INDEX_BUFFER(struct ilo_builder *builder, - const struct ilo_ib_state *ib) + const struct ilo_state_vf *vf, + const struct ilo_state_index_buffer *ib) { const uint8_t cmd_len = 5; - struct ilo_buffer *buf = ilo_buffer(ib->hw_resource); - int format; uint32_t *dw; unsigned pos; ILO_DEV_ASSERT(builder->dev, 8, 8); - if (!buf) - return; - - switch (ib->hw_index_size) { - 
case 4: - format = GEN8_IB_DW1_FORMAT_DWORD; - break; - case 2: - format = GEN8_IB_DW1_FORMAT_WORD; - break; - case 1: - format = GEN8_IB_DW1_FORMAT_BYTE; - break; - default: - assert(!"unknown index size"); - format = GEN8_IB_DW1_FORMAT_BYTE; - break; - } - pos = ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_INDEX_BUFFER) | (cmd_len - 2); - dw[1] = format | + /* see index_buffer_set_gen8_3DSTATE_INDEX_BUFFER() */ + dw[1] = ib->ib[0] | builder->mocs << GEN8_IB_DW1_MOCS__SHIFT; - dw[4] = buf->bo_size; - /* ignore ib->offset here in favor of adjusting 3DPRIMITIVE */ - ilo_builder_batch_reloc64(builder, pos + 2, buf->bo, 0, 0); + if (ib->need_bo) { + ilo_builder_batch_reloc64(builder, pos + 2, ib->bo, ib->ib[1], 0); + } else { + dw[2] = 0; + dw[3] = 0; + } + + dw[4] = ib->ib[2]; } static inline void gen6_3DSTATE_VS(struct ilo_builder *builder, - const struct ilo_shader_state *vs) + const struct ilo_state_vs *vs, + uint32_t kernel_offset) { const uint8_t cmd_len = 6; - const struct ilo_shader_cso *cso; - uint32_t dw2, dw4, dw5, *dw; + uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 6, 7.5); - cso = ilo_shader_get_kernel_cso(vs); - dw2 = cso->payload[0]; - dw4 = cso->payload[1]; - dw5 = cso->payload[2]; - ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VS) | (cmd_len - 2); - dw[1] = ilo_shader_get_kernel_offset(vs); - dw[2] = dw2; - dw[3] = 0; /* scratch */ - dw[4] = dw4; - dw[5] = dw5; + dw[1] = kernel_offset; + /* see vs_set_gen6_3DSTATE_VS() */ + dw[2] = vs->vs[0]; + dw[3] = vs->vs[1]; + dw[4] = vs->vs[2]; + dw[5] = vs->vs[3]; } static inline void gen8_3DSTATE_VS(struct ilo_builder *builder, - const struct ilo_shader_state *vs, - uint32_t clip_plane_enable) + const struct ilo_state_vs *vs, + uint32_t kernel_offset) { const uint8_t cmd_len = 9; - const struct ilo_shader_cso *cso; - uint32_t dw3, dw6, dw7, dw8, *dw; + uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 8, 8); - cso = ilo_shader_get_kernel_cso(vs); - dw3 = cso->payload[0]; - dw6 = cso->payload[1]; - dw7 = cso->payload[2]; - dw8 = clip_plane_enable << GEN8_VS_DW8_UCP_CLIP_ENABLES__SHIFT; - ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VS) | (cmd_len - 2); - dw[1] = ilo_shader_get_kernel_offset(vs); + dw[1] = kernel_offset; dw[2] = 0; - dw[3] = dw3; - dw[4] = 0; /* scratch */ + /* see vs_set_gen6_3DSTATE_VS() */ + dw[3] = vs->vs[0]; + dw[4] = vs->vs[1]; dw[5] = 0; - dw[6] = dw6; - dw[7] = dw7; - dw[8] = dw8; + dw[6] = vs->vs[2]; + dw[7] = vs->vs[3]; + dw[8] = vs->vs[4]; } static inline void -gen6_disable_3DSTATE_VS(struct ilo_builder *builder) +gen7_3DSTATE_HS(struct ilo_builder *builder, + const struct ilo_state_hs *hs, + uint32_t kernel_offset) { - const uint8_t cmd_len = 6; + const uint8_t cmd_len = 7; uint32_t *dw; - ILO_DEV_ASSERT(builder->dev, 6, 7.5); + ILO_DEV_ASSERT(builder->dev, 7, 7.5); ilo_builder_batch_pointer(builder, cmd_len, &dw); - dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VS) | (cmd_len - 2); - dw[1] = 0; - dw[2] = 0; - dw[3] = 0; - dw[4] = 0; - dw[5] = 0; + dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_HS) | (cmd_len - 2); + /* see hs_set_gen7_3DSTATE_HS() */ + dw[1] = hs->hs[0]; + dw[2] = hs->hs[1]; + dw[3] = kernel_offset; + dw[4] = hs->hs[2]; + dw[5] = hs->hs[3]; + dw[6] = 0; } static inline void -gen7_disable_3DSTATE_HS(struct ilo_builder *builder) +gen8_3DSTATE_HS(struct ilo_builder *builder, + const struct ilo_state_hs *hs, + uint32_t kernel_offset) { - const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) 
? 9 : 7; + const uint8_t cmd_len = 9; uint32_t *dw; - ILO_DEV_ASSERT(builder->dev, 7, 8); + ILO_DEV_ASSERT(builder->dev, 8, 8); ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_HS) | (cmd_len - 2); - dw[1] = 0; - dw[2] = 0; - dw[3] = 0; + /* see hs_set_gen7_3DSTATE_HS() */ + dw[1] = hs->hs[0]; + dw[2] = hs->hs[1]; + dw[3] = kernel_offset; dw[4] = 0; - dw[5] = 0; + dw[5] = hs->hs[2]; dw[6] = 0; - if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) { - dw[7] = 0; - dw[8] = 0; - } + dw[7] = hs->hs[3]; + dw[8] = 0; } static inline void -gen7_3DSTATE_TE(struct ilo_builder *builder) +gen7_3DSTATE_TE(struct ilo_builder *builder, + const struct ilo_state_ds *ds) { const uint8_t cmd_len = 4; uint32_t *dw; @@ -814,108 +569,61 @@ gen7_3DSTATE_TE(struct ilo_builder *builder) ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_TE) | (cmd_len - 2); - dw[1] = 0; - dw[2] = 0; - dw[3] = 0; + /* see ds_set_gen7_3DSTATE_TE() */ + dw[1] = ds->te[0]; + dw[2] = ds->te[1]; + dw[3] = ds->te[2]; } static inline void -gen7_disable_3DSTATE_DS(struct ilo_builder *builder) +gen7_3DSTATE_DS(struct ilo_builder *builder, + const struct ilo_state_ds *ds, + uint32_t kernel_offset) { - const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 9 : 6; + const uint8_t cmd_len = 6; uint32_t *dw; - ILO_DEV_ASSERT(builder->dev, 7, 8); + ILO_DEV_ASSERT(builder->dev, 7, 7.5); ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_DS) | (cmd_len - 2); - dw[1] = 0; - dw[2] = 0; - dw[3] = 0; - dw[4] = 0; - dw[5] = 0; - if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) { - dw[6] = 0; - dw[7] = 0; - dw[8] = 0; - } -} - -static inline void -gen6_3DSTATE_GS(struct ilo_builder *builder, - const struct ilo_shader_state *gs) -{ - const uint8_t cmd_len = 7; - const struct ilo_shader_cso *cso; - uint32_t dw2, dw4, dw5, dw6, *dw; - - ILO_DEV_ASSERT(builder->dev, 6, 6); - - cso = ilo_shader_get_kernel_cso(gs); - dw2 = cso->payload[0]; - dw4 = cso->payload[1]; - dw5 = cso->payload[2]; - dw6 = cso->payload[3]; - - ilo_builder_batch_pointer(builder, cmd_len, &dw); - - dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2); - dw[1] = ilo_shader_get_kernel_offset(gs); - dw[2] = dw2; - dw[3] = 0; /* scratch */ - dw[4] = dw4; - dw[5] = dw5; - dw[6] = dw6; + /* see ds_set_gen7_3DSTATE_DS() */ + dw[1] = kernel_offset; + dw[2] = ds->ds[0]; + dw[3] = ds->ds[1]; + dw[4] = ds->ds[2]; + dw[5] = ds->ds[3]; } static inline void -gen6_so_3DSTATE_GS(struct ilo_builder *builder, - const struct ilo_shader_state *vs, - int verts_per_prim) +gen8_3DSTATE_DS(struct ilo_builder *builder, + const struct ilo_state_ds *ds, + uint32_t kernel_offset) { - const uint8_t cmd_len = 7; - struct ilo_shader_cso cso; - enum ilo_kernel_param param; - uint32_t dw2, dw4, dw5, dw6, *dw; - - ILO_DEV_ASSERT(builder->dev, 6, 6); - - assert(ilo_shader_get_kernel_param(vs, ILO_KERNEL_VS_GEN6_SO)); - - switch (verts_per_prim) { - case 1: - param = ILO_KERNEL_VS_GEN6_SO_POINT_OFFSET; - break; - case 2: - param = ILO_KERNEL_VS_GEN6_SO_LINE_OFFSET; - break; - default: - param = ILO_KERNEL_VS_GEN6_SO_TRI_OFFSET; - break; - } + const uint8_t cmd_len = 9; + uint32_t *dw; - /* cannot use VS's CSO */ - ilo_gpe_init_gs_cso(builder->dev, vs, &cso); - dw2 = cso.payload[0]; - dw4 = cso.payload[1]; - dw5 = cso.payload[2]; - dw6 = cso.payload[3]; + ILO_DEV_ASSERT(builder->dev, 8, 8); ilo_builder_batch_pointer(builder, cmd_len, &dw); - dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2); - dw[1] = 
ilo_shader_get_kernel_offset(vs) + - ilo_shader_get_kernel_param(vs, param); - dw[2] = dw2; - dw[3] = 0; - dw[4] = dw4; - dw[5] = dw5; - dw[6] = dw6; + dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_DS) | (cmd_len - 2); + /* see ds_set_gen7_3DSTATE_DS() */ + dw[1] = kernel_offset; + dw[2] = 0; + dw[3] = ds->ds[0]; + dw[4] = ds->ds[1]; + dw[5] = 0; + dw[6] = ds->ds[2]; + dw[7] = ds->ds[3]; + dw[8] = ds->ds[4]; } static inline void -gen6_disable_3DSTATE_GS(struct ilo_builder *builder) +gen6_3DSTATE_GS(struct ilo_builder *builder, + const struct ilo_state_gs *gs, + uint32_t kernel_offset) { const uint8_t cmd_len = 7; uint32_t *dw; @@ -925,13 +633,13 @@ gen6_disable_3DSTATE_GS(struct ilo_builder *builder) ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2); - dw[1] = 0; - dw[2] = 0; - dw[3] = 0; - /* honor the valid range of URB read length */ - dw[4] = 1 << GEN6_GS_DW4_URB_READ_LEN__SHIFT; - dw[5] = GEN6_GS_DW5_STATISTICS; - dw[6] = 0; + dw[1] = kernel_offset; + /* see gs_set_gen6_3DSTATE_GS() */ + dw[2] = gs->gs[0]; + dw[3] = gs->gs[1]; + dw[4] = gs->gs[2]; + dw[5] = gs->gs[3]; + dw[6] = gs->gs[4]; } static inline void @@ -960,183 +668,90 @@ gen6_3DSTATE_GS_SVB_INDEX(struct ilo_builder *builder, static inline void gen7_3DSTATE_GS(struct ilo_builder *builder, - const struct ilo_shader_state *gs) + const struct ilo_state_gs *gs, + uint32_t kernel_offset) { const uint8_t cmd_len = 7; - const struct ilo_shader_cso *cso; - uint32_t dw2, dw4, dw5, *dw; + uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 7, 7.5); - cso = ilo_shader_get_kernel_cso(gs); - dw2 = cso->payload[0]; - dw4 = cso->payload[1]; - dw5 = cso->payload[2]; - ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2); - dw[1] = ilo_shader_get_kernel_offset(gs); - dw[2] = dw2; - dw[3] = 0; /* scratch */ - dw[4] = dw4; - dw[5] = dw5; + dw[1] = kernel_offset; + /* see gs_set_gen7_3DSTATE_GS() */ + dw[2] = gs->gs[0]; + dw[3] = gs->gs[1]; + dw[4] = gs->gs[2]; + dw[5] = gs->gs[3]; dw[6] = 0; } static inline void -gen7_disable_3DSTATE_GS(struct ilo_builder *builder) +gen8_3DSTATE_GS(struct ilo_builder *builder, + const struct ilo_state_gs *gs, + uint32_t kernel_offset) { - const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 10 : 7; + const uint8_t cmd_len = 10; uint32_t *dw; - ILO_DEV_ASSERT(builder->dev, 7, 8); + ILO_DEV_ASSERT(builder->dev, 8, 8); ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2); - dw[1] = 0; + dw[1] = kernel_offset; dw[2] = 0; - dw[3] = 0; - dw[4] = 0; - - if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) { - dw[7] = GEN8_GS_DW7_STATISTICS; - dw[8] = 0; - dw[9] = 0; - } else { - dw[5] = GEN7_GS_DW5_STATISTICS; - dw[6] = 0; - } + /* see gs_set_gen7_3DSTATE_GS() */ + dw[3] = gs->gs[0]; + dw[4] = gs->gs[1]; + dw[5] = 0; + dw[6] = gs->gs[2]; + dw[7] = gs->gs[3]; + dw[8] = 0; + dw[9] = gs->gs[4]; } static inline void gen7_3DSTATE_STREAMOUT(struct ilo_builder *builder, - int render_stream, - bool render_disable, - int vertex_attrib_count, - const int *buf_strides) + const struct ilo_state_sol *sol) { const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 
5 : 3;
    uint32_t *dw;
-   int buf_mask;
 
    ILO_DEV_ASSERT(builder->dev, 7, 8);
 
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_STREAMOUT) | (cmd_len - 2);
-
-   dw[1] = render_stream << GEN7_SO_DW1_RENDER_STREAM_SELECT__SHIFT;
-   if (render_disable)
-      dw[1] |= GEN7_SO_DW1_RENDER_DISABLE;
-
-   if (buf_strides) {
-      buf_mask = ((bool) buf_strides[3]) << 3 |
-                 ((bool) buf_strides[2]) << 2 |
-                 ((bool) buf_strides[1]) << 1 |
-                 ((bool) buf_strides[0]);
-      if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
-         dw[3] = buf_strides[1] << 16 | buf_strides[0];
-         dw[4] = buf_strides[3] << 16 | buf_strides[2];
-      }
-   } else {
-      buf_mask = 0;
-   }
-
-   if (buf_mask) {
-      int read_len;
-
-      dw[1] |= GEN7_SO_DW1_SO_ENABLE |
-               GEN7_SO_DW1_STATISTICS;
-      /* API_OPENGL */
-      if (true)
-         dw[1] |= GEN7_SO_DW1_REORDER_TRAILING;
-      if (ilo_dev_gen(builder->dev) < ILO_GEN(8))
-         dw[1] |= buf_mask << GEN7_SO_DW1_BUFFER_ENABLES__SHIFT;
-
-      read_len = (vertex_attrib_count + 1) / 2;
-      if (!read_len)
-         read_len = 1;
-
-      dw[2] = 0 << GEN7_SO_DW2_STREAM3_READ_OFFSET__SHIFT |
-              (read_len - 1) << GEN7_SO_DW2_STREAM3_READ_LEN__SHIFT |
-              0 << GEN7_SO_DW2_STREAM2_READ_OFFSET__SHIFT |
-              (read_len - 1) << GEN7_SO_DW2_STREAM2_READ_LEN__SHIFT |
-              0 << GEN7_SO_DW2_STREAM1_READ_OFFSET__SHIFT |
-              (read_len - 1) << GEN7_SO_DW2_STREAM1_READ_LEN__SHIFT |
-              0 << GEN7_SO_DW2_STREAM0_READ_OFFSET__SHIFT |
-              (read_len - 1) << GEN7_SO_DW2_STREAM0_READ_LEN__SHIFT;
-   } else {
-      dw[2] = 0;
+   /* see sol_set_gen7_3DSTATE_STREAMOUT() */
+   dw[1] = sol->streamout[0];
+   dw[2] = sol->streamout[1];
+   if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
+      dw[3] = sol->strides[1] << GEN8_SO_DW3_BUFFER1_PITCH__SHIFT |
+              sol->strides[0] << GEN8_SO_DW3_BUFFER0_PITCH__SHIFT;
+      dw[4] = sol->strides[3] << GEN8_SO_DW4_BUFFER3_PITCH__SHIFT |
+              sol->strides[2] << GEN8_SO_DW4_BUFFER2_PITCH__SHIFT;
    }
 }
 
 static inline void
 gen7_3DSTATE_SO_DECL_LIST(struct ilo_builder *builder,
-                          const struct pipe_stream_output_info *so_info)
+                          const struct ilo_state_sol *sol)
 {
    /*
     * Note that "DWord Length" has 9 bits for this command and the type of
     * cmd_len cannot be uint8_t.
*/ uint16_t cmd_len; - struct { - int buf_selects; - int decl_count; - uint16_t decls[128]; - } streams[4]; - unsigned buf_offsets[PIPE_MAX_SO_BUFFERS]; - int hw_decl_count, i; + int cmd_decl_count; uint32_t *dw; ILO_DEV_ASSERT(builder->dev, 7, 8); - memset(streams, 0, sizeof(streams)); - memset(buf_offsets, 0, sizeof(buf_offsets)); - - for (i = 0; i < so_info->num_outputs; i++) { - unsigned decl, st, buf, reg, mask; - - st = so_info->output[i].stream; - buf = so_info->output[i].output_buffer; - - /* pad with holes */ - while (buf_offsets[buf] < so_info->output[i].dst_offset) { - int num_dwords; - - num_dwords = so_info->output[i].dst_offset - buf_offsets[buf]; - if (num_dwords > 4) - num_dwords = 4; - - decl = buf << GEN7_SO_DECL_OUTPUT_SLOT__SHIFT | - GEN7_SO_DECL_HOLE_FLAG | - ((1 << num_dwords) - 1) << GEN7_SO_DECL_COMPONENT_MASK__SHIFT; - - assert(streams[st].decl_count < Elements(streams[st].decls)); - streams[st].decls[streams[st].decl_count++] = decl; - buf_offsets[buf] += num_dwords; - } - assert(buf_offsets[buf] == so_info->output[i].dst_offset); - - reg = so_info->output[i].register_index; - mask = ((1 << so_info->output[i].num_components) - 1) << - so_info->output[i].start_component; - - decl = buf << GEN7_SO_DECL_OUTPUT_SLOT__SHIFT | - reg << GEN7_SO_DECL_REG_INDEX__SHIFT | - mask << GEN7_SO_DECL_COMPONENT_MASK__SHIFT; - - assert(streams[st].decl_count < Elements(streams[st].decls)); - - streams[st].buf_selects |= 1 << buf; - streams[st].decls[streams[st].decl_count++] = decl; - buf_offsets[buf] += so_info->output[i].num_components; - } - if (ilo_dev_gen(builder->dev) >= ILO_GEN(7.5)) { - hw_decl_count = MAX4(streams[0].decl_count, streams[1].decl_count, - streams[2].decl_count, streams[3].decl_count); + cmd_decl_count = sol->decl_count; } else { /* * From the Ivy Bridge PRM, volume 2 part 1, page 201: @@ -1145,100 +760,97 @@ gen7_3DSTATE_SO_DECL_LIST(struct ilo_builder *builder, * whenever this command is issued. The "Num Entries [n]" fields * still contain the actual numbers of valid decls." 
*/
-      hw_decl_count = 128;
+      cmd_decl_count = 128;
    }
 
-   cmd_len = 3 + 2 * hw_decl_count;
+   cmd_len = 3 + 2 * cmd_decl_count;
 
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_SO_DECL_LIST) | (cmd_len - 2);
-   dw[1] = streams[3].buf_selects << GEN7_SO_DECL_DW1_STREAM3_BUFFER_SELECTS__SHIFT |
-           streams[2].buf_selects << GEN7_SO_DECL_DW1_STREAM2_BUFFER_SELECTS__SHIFT |
-           streams[1].buf_selects << GEN7_SO_DECL_DW1_STREAM1_BUFFER_SELECTS__SHIFT |
-           streams[0].buf_selects << GEN7_SO_DECL_DW1_STREAM0_BUFFER_SELECTS__SHIFT;
-   dw[2] = streams[3].decl_count << GEN7_SO_DECL_DW2_STREAM3_ENTRY_COUNT__SHIFT |
-           streams[2].decl_count << GEN7_SO_DECL_DW2_STREAM2_ENTRY_COUNT__SHIFT |
-           streams[1].decl_count << GEN7_SO_DECL_DW2_STREAM1_ENTRY_COUNT__SHIFT |
-           streams[0].decl_count << GEN7_SO_DECL_DW2_STREAM0_ENTRY_COUNT__SHIFT;
-   dw += 3;
-
-   for (i = 0; i < hw_decl_count; i++) {
-      dw[0] = streams[1].decls[i] << 16 | streams[0].decls[i];
-      dw[1] = streams[3].decls[i] << 16 | streams[2].decls[i];
-      dw += 2;
+   /* see sol_set_gen7_3DSTATE_SO_DECL_LIST() */
+   dw[1] = sol->so_decl[0];
+   dw[2] = sol->so_decl[1];
+   memcpy(&dw[3], sol->decl, sizeof(sol->decl[0]) * sol->decl_count);
+
+   if (sol->decl_count < cmd_decl_count) {
+      memset(&dw[3 + 2 * sol->decl_count], 0,
+            sizeof(sol->decl[0]) * (cmd_decl_count - sol->decl_count));
    }
 }
 
 static inline void
-gen7_3DSTATE_SO_BUFFER(struct ilo_builder *builder, int index, int stride,
-                       const struct pipe_stream_output_target *so_target)
+gen7_3DSTATE_SO_BUFFER(struct ilo_builder *builder,
+                       const struct ilo_state_sol *sol,
+                       const struct ilo_state_sol_buffer *sb,
+                       uint8_t buffer)
 {
-   const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 8 : 4;
-   struct ilo_buffer *buf;
-   int start, end;
+   const uint8_t cmd_len = 4;
    uint32_t *dw;
    unsigned pos;
 
-   ILO_DEV_ASSERT(builder->dev, 7, 8);
-
-   buf = ilo_buffer(so_target->buffer);
-
-   /* DWord-aligned */
-   assert(stride % 4 == 0);
-   assert(so_target->buffer_offset % 4 == 0);
+   ILO_DEV_ASSERT(builder->dev, 7, 7.5);
 
-   stride &= ~3;
-   start = so_target->buffer_offset & ~3;
-   end = (start + so_target->buffer_size) & ~3;
+   assert(buffer < ILO_STATE_SOL_MAX_BUFFER_COUNT);
 
    pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_SO_BUFFER) | (cmd_len - 2);
-   dw[1] = index << GEN7_SO_BUF_DW1_INDEX__SHIFT |
-           stride;
-
-   if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
-      dw[1] |= builder->mocs << GEN8_SO_BUF_DW1_MOCS__SHIFT;
-
-      dw[4] = end - start;
-      dw[5] = 0;
-      dw[6] = 0;
-      dw[7] = 0;
-
-      ilo_builder_batch_reloc64(builder, pos + 2,
-            buf->bo, start, INTEL_RELOC_WRITE);
+   /* see sol_buffer_set_gen7_3dstate_so_buffer() */
+   dw[1] = buffer << GEN7_SO_BUF_DW1_INDEX__SHIFT |
+           builder->mocs << GEN7_SO_BUF_DW1_MOCS__SHIFT |
+           sol->strides[buffer] << GEN7_SO_BUF_DW1_PITCH__SHIFT;
+
+   if (sb->need_bo) {
+      ilo_builder_batch_reloc(builder, pos + 2, sb->bo,
+            sb->so_buf[0], INTEL_RELOC_WRITE);
+      ilo_builder_batch_reloc(builder, pos + 3, sb->bo,
+            sb->so_buf[1], INTEL_RELOC_WRITE);
    } else {
-      dw[1] |= builder->mocs << GEN7_SO_BUF_DW1_MOCS__SHIFT;
-
-      ilo_builder_batch_reloc(builder, pos + 2,
-            buf->bo, start, INTEL_RELOC_WRITE);
-      ilo_builder_batch_reloc(builder, pos + 3,
-            buf->bo, end, INTEL_RELOC_WRITE);
+      dw[2] = 0;
+      dw[3] = 0;
    }
 }
 
 static inline void
-gen7_disable_3DSTATE_SO_BUFFER(struct ilo_builder *builder, int index)
+gen8_3DSTATE_SO_BUFFER(struct ilo_builder *builder,
+                       const struct ilo_state_sol *sol,
+                       const struct ilo_state_sol_buffer *sb,
+                       uint8_t buffer)
 {
-
const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 8 : 4; + const uint8_t cmd_len = 8; uint32_t *dw; + unsigned pos; - ILO_DEV_ASSERT(builder->dev, 7, 8); + ILO_DEV_ASSERT(builder->dev, 8, 8); - ilo_builder_batch_pointer(builder, cmd_len, &dw); + pos = ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_SO_BUFFER) | (cmd_len - 2); - dw[1] = index << GEN7_SO_BUF_DW1_INDEX__SHIFT; - dw[2] = 0; - dw[3] = 0; + /* see sol_buffer_set_gen8_3dstate_so_buffer() */ + dw[1] = sb->so_buf[0] | + buffer << GEN7_SO_BUF_DW1_INDEX__SHIFT | + builder->mocs << GEN8_SO_BUF_DW1_MOCS__SHIFT; + + if (sb->need_bo) { + ilo_builder_batch_reloc64(builder, pos + 2, sb->bo, + sb->so_buf[1], INTEL_RELOC_WRITE); + } else { + dw[2] = 0; + dw[3] = 0; + } - if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) { - dw[4] = 0; + dw[4] = sb->so_buf[2]; + + if (sb->need_write_offset_bo) { + ilo_builder_batch_reloc64(builder, pos + 5, sb->write_offset_bo, + sizeof(uint32_t) * buffer, INTEL_RELOC_WRITE); + } else { dw[5] = 0; dw[6] = 0; - dw[7] = 0; } + + dw[7] = sb->so_buf[3]; } static inline void @@ -1627,8 +1239,7 @@ gen6_BINDING_TABLE_STATE(struct ilo_builder *builder, static inline uint32_t gen6_SURFACE_STATE(struct ilo_builder *builder, - const struct ilo_view_surface *surf, - bool for_render) + const struct ilo_state_surface *surf) { int state_align, state_len; uint32_t state_offset, *dw; @@ -1641,7 +1252,7 @@ gen6_SURFACE_STATE(struct ilo_builder *builder, state_offset = ilo_builder_surface_pointer(builder, ILO_BUILDER_ITEM_SURFACE, state_align, state_len, &dw); - memcpy(dw, surf->payload, state_len << 2); + memcpy(dw, surf->surface, state_len << 2); if (surf->bo) { const uint32_t mocs = (surf->scanout) ? @@ -1650,7 +1261,7 @@ gen6_SURFACE_STATE(struct ilo_builder *builder, dw[1] |= mocs << GEN8_SURFACE_DW1_MOCS__SHIFT; ilo_builder_surface_reloc64(builder, state_offset, 8, surf->bo, - surf->payload[8], (for_render) ? INTEL_RELOC_WRITE : 0); + surf->surface[8], (surf->readonly) ? 0 : INTEL_RELOC_WRITE); } } else { state_align = 32; @@ -1658,7 +1269,7 @@ gen6_SURFACE_STATE(struct ilo_builder *builder, state_offset = ilo_builder_surface_pointer(builder, ILO_BUILDER_ITEM_SURFACE, state_align, state_len, &dw); - memcpy(dw, surf->payload, state_len << 2); + memcpy(dw, surf->surface, state_len << 2); if (surf->bo) { /* @@ -1668,7 +1279,7 @@ gen6_SURFACE_STATE(struct ilo_builder *builder, dw[5] |= builder->mocs << GEN6_SURFACE_DW5_MOCS__SHIFT; ilo_builder_surface_reloc(builder, state_offset, 1, surf->bo, - surf->payload[1], (for_render) ? INTEL_RELOC_WRITE : 0); + surf->surface[1], (surf->readonly) ? 
0 : INTEL_RELOC_WRITE); } } @@ -1676,55 +1287,13 @@ gen6_SURFACE_STATE(struct ilo_builder *builder, } static inline uint32_t -gen6_so_SURFACE_STATE(struct ilo_builder *builder, - const struct pipe_stream_output_target *so, - const struct pipe_stream_output_info *so_info, - int so_index) -{ - struct ilo_buffer *buf = ilo_buffer(so->buffer); - unsigned bo_offset, struct_size; - enum pipe_format elem_format; - struct ilo_view_surface surf; - - ILO_DEV_ASSERT(builder->dev, 6, 6); - - bo_offset = so->buffer_offset + so_info->output[so_index].dst_offset * 4; - struct_size = so_info->stride[so_info->output[so_index].output_buffer] * 4; - - switch (so_info->output[so_index].num_components) { - case 1: - elem_format = PIPE_FORMAT_R32_FLOAT; - break; - case 2: - elem_format = PIPE_FORMAT_R32G32_FLOAT; - break; - case 3: - elem_format = PIPE_FORMAT_R32G32B32_FLOAT; - break; - case 4: - elem_format = PIPE_FORMAT_R32G32B32A32_FLOAT; - break; - default: - assert(!"unexpected SO components length"); - elem_format = PIPE_FORMAT_R32_FLOAT; - break; - } - - ilo_gpe_init_view_surface_for_buffer(builder->dev, buf, bo_offset, - so->buffer_size, struct_size, elem_format, false, true, &surf); - - return gen6_SURFACE_STATE(builder, &surf, false); -} - -static inline uint32_t gen6_SAMPLER_STATE(struct ilo_builder *builder, - const struct ilo_sampler_cso * const *samplers, - const struct pipe_sampler_view * const *views, + const struct ilo_state_sampler *samplers, const uint32_t *sampler_border_colors, - int num_samplers) + int sampler_count) { const int state_align = 32; - const int state_len = 4 * num_samplers; + const int state_len = 4 * sampler_count; uint32_t state_offset, *dw; int i; @@ -1735,9 +1304,9 @@ gen6_SAMPLER_STATE(struct ilo_builder *builder, * * "The sampler state is stored as an array of up to 16 elements..." */ - assert(num_samplers <= 16); + assert(sampler_count <= 16); - if (!num_samplers) + if (!sampler_count) return 0; /* @@ -1749,86 +1318,19 @@ gen6_SAMPLER_STATE(struct ilo_builder *builder, * * It also applies to other shader stages. */ - ilo_builder_dynamic_pad_top(builder, 4 * (4 - (num_samplers % 4))); + ilo_builder_dynamic_pad_top(builder, 4 * (4 - (sampler_count % 4))); state_offset = ilo_builder_dynamic_pointer(builder, ILO_BUILDER_ITEM_SAMPLER, state_align, state_len, &dw); - for (i = 0; i < num_samplers; i++) { - const struct ilo_sampler_cso *sampler = samplers[i]; - const struct pipe_sampler_view *view = views[i]; - const uint32_t border_color = sampler_border_colors[i]; - uint32_t dw_filter, dw_wrap; - - /* there may be holes */ - if (!sampler || !view) { - /* disabled sampler */ - dw[0] = 1 << 31; - dw[1] = 0; - dw[2] = 0; - dw[3] = 0; - dw += 4; - - continue; - } - - /* determine filter and wrap modes */ - switch (view->texture->target) { - case PIPE_TEXTURE_1D: - dw_filter = (sampler->anisotropic) ? - sampler->dw_filter_aniso : sampler->dw_filter; - dw_wrap = sampler->dw_wrap_1d; - break; - case PIPE_TEXTURE_3D: - /* - * From the Sandy Bridge PRM, volume 4 part 1, page 103: - * - * "Only MAPFILTER_NEAREST and MAPFILTER_LINEAR are supported for - * surfaces of type SURFTYPE_3D." - */ - dw_filter = sampler->dw_filter; - dw_wrap = sampler->dw_wrap; - break; - case PIPE_TEXTURE_CUBE: - dw_filter = (sampler->anisotropic) ? - sampler->dw_filter_aniso : sampler->dw_filter; - dw_wrap = sampler->dw_wrap_cube; - break; - default: - dw_filter = (sampler->anisotropic) ? 
- sampler->dw_filter_aniso : sampler->dw_filter; - dw_wrap = sampler->dw_wrap; - break; - } + for (i = 0; i < sampler_count; i++) { + /* see sampler_set_gen6_SAMPLER_STATE() */ + dw[0] = samplers[i].sampler[0]; + dw[1] = samplers[i].sampler[1]; + dw[3] = samplers[i].sampler[2]; - dw[0] = sampler->payload[0]; - dw[1] = sampler->payload[1]; - assert(!(border_color & 0x1f)); - dw[2] = border_color; - dw[3] = sampler->payload[2]; - - dw[0] |= dw_filter; - - if (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) { - dw[3] |= dw_wrap; - } - else { - /* - * From the Sandy Bridge PRM, volume 4 part 1, page 21: - * - * "[DevSNB] Errata: Incorrect behavior is observed in cases - * where the min and mag mode filters are different and - * SurfMinLOD is nonzero. The determination of MagMode uses the - * following equation instead of the one in the above - * pseudocode: MagMode = (LOD + SurfMinLOD - Base <= 0)" - * - * As a way to work around that, we set Base to - * view->u.tex.first_level. - */ - dw[0] |= view->u.tex.first_level << 22; - - dw[1] |= dw_wrap; - } + assert(!(sampler_border_colors[i] & 0x1f)); + dw[2] = sampler_border_colors[i]; dw += 4; } @@ -1838,7 +1340,7 @@ gen6_SAMPLER_STATE(struct ilo_builder *builder, static inline uint32_t gen6_SAMPLER_BORDER_COLOR_STATE(struct ilo_builder *builder, - const struct ilo_sampler_cso *sampler) + const struct ilo_state_sampler_border *border) { const int state_align = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 64 : 32; @@ -1846,11 +1348,12 @@ gen6_SAMPLER_BORDER_COLOR_STATE(struct ilo_builder *builder, ILO_DEV_ASSERT(builder->dev, 6, 8); - assert(Elements(sampler->payload) >= 3 + state_len); - - /* see ilo_gpe_init_sampler_cso() */ + /* + * see border_set_gen6_SAMPLER_BORDER_COLOR_STATE() and + * border_set_gen7_SAMPLER_BORDER_COLOR_STATE() + */ return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_BLOB, - state_align, state_len, &sampler->payload[3]); + state_align, state_len, border->color); } static inline uint32_t diff --git a/src/gallium/drivers/ilo/core/ilo_builder_decode.c b/src/gallium/drivers/ilo/core/ilo_builder_decode.c index cedaab1559d..c5a98c91204 100644 --- a/src/gallium/drivers/ilo/core/ilo_builder_decode.c +++ b/src/gallium/drivers/ilo/core/ilo_builder_decode.c @@ -319,7 +319,7 @@ writer_decode_color_calc(const struct ilo_builder *builder, "stencil ref %d, bf stencil ref %d\n", GEN_EXTRACT(dw, GEN6_CC_DW0_ALPHATEST) ? "FLOAT32" : "UNORM8", (bool) (dw & GEN6_CC_DW0_ROUND_DISABLE_DISABLE), - GEN_EXTRACT(dw, GEN6_CC_DW0_STENCIL0_REF), + GEN_EXTRACT(dw, GEN6_CC_DW0_STENCIL_REF), GEN_EXTRACT(dw, GEN6_CC_DW0_STENCIL1_REF)); writer_dw(builder, which, item->offset, 1, "CC\n"); @@ -347,13 +347,13 @@ writer_decode_depth_stencil(const struct ilo_builder *builder, dw = writer_dw(builder, which, item->offset, 0, "D_S"); ilo_printf("stencil %sable, func %d, write %sable\n", (dw & GEN6_ZS_DW0_STENCIL_TEST_ENABLE) ? "en" : "dis", - GEN_EXTRACT(dw, GEN6_ZS_DW0_STENCIL0_FUNC), + GEN_EXTRACT(dw, GEN6_ZS_DW0_STENCIL_FUNC), (dw & GEN6_ZS_DW0_STENCIL_WRITE_ENABLE) ? 
"en" : "dis"); dw = writer_dw(builder, which, item->offset, 1, "D_S"); ilo_printf("stencil test mask 0x%x, write mask 0x%x\n", - GEN_EXTRACT(dw, GEN6_ZS_DW1_STENCIL0_VALUEMASK), - GEN_EXTRACT(dw, GEN6_ZS_DW1_STENCIL0_WRITEMASK)); + GEN_EXTRACT(dw, GEN6_ZS_DW1_STENCIL_TEST_MASK), + GEN_EXTRACT(dw, GEN6_ZS_DW1_STENCIL_WRITE_MASK)); dw = writer_dw(builder, which, item->offset, 2, "D_S"); ilo_printf("depth test %sable, func %d, write %sable\n", diff --git a/src/gallium/drivers/ilo/core/ilo_builder_media.h b/src/gallium/drivers/ilo/core/ilo_builder_media.h index 7fbe6d41635..7197104a23e 100644 --- a/src/gallium/drivers/ilo/core/ilo_builder_media.h +++ b/src/gallium/drivers/ilo/core/ilo_builder_media.h @@ -29,57 +29,30 @@ #define ILO_BUILDER_MEDIA_H #include "genhw/genhw.h" -#include "../ilo_shader.h" #include "intel_winsys.h" #include "ilo_core.h" #include "ilo_dev.h" +#include "ilo_state_compute.h" #include "ilo_builder.h" -struct gen6_idrt_data { - const struct ilo_shader_state *cs; - - uint32_t sampler_offset; - uint32_t binding_table_offset; - - unsigned curbe_size; - unsigned thread_group_size; -}; - static inline void gen6_MEDIA_VFE_STATE(struct ilo_builder *builder, - unsigned curbe_alloc, bool use_slm) + const struct ilo_state_compute *compute) { const uint8_t cmd_len = 8; - const unsigned idrt_alloc = - ((ilo_dev_gen(builder->dev) >= ILO_GEN(7.5)) ? 64 : 32) * 32; - int max_threads; uint32_t *dw; - ILO_DEV_ASSERT(builder->dev, 7, 7.5); - - max_threads = builder->dev->thread_count; - - curbe_alloc = align(curbe_alloc, 32); - assert(idrt_alloc + curbe_alloc <= builder->dev->urb_size / (use_slm + 1)); + ILO_DEV_ASSERT(builder->dev, 6, 7.5); ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = GEN6_RENDER_CMD(MEDIA, MEDIA_VFE_STATE) | (cmd_len - 2); - dw[1] = 0; /* scratch */ - - dw[2] = (max_threads - 1) << GEN6_VFE_DW2_MAX_THREADS__SHIFT | - 0 << GEN6_VFE_DW2_URB_ENTRY_COUNT__SHIFT | - GEN6_VFE_DW2_RESET_GATEWAY_TIMER | - GEN6_VFE_DW2_BYPASS_GATEWAY_CONTROL; - if (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) - dw[2] |= GEN7_VFE_DW2_GPGPU_MODE; - + /* see compute_set_gen6_MEDIA_VFE_STATE() */ + dw[1] = compute->vfe[0]; + dw[2] = compute->vfe[1]; dw[3] = 0; - - dw[4] = 0 << GEN6_VFE_DW4_URB_ENTRY_SIZE__SHIFT | - (curbe_alloc / 32); - + dw[4] = compute->vfe[2]; dw[5] = 0; dw[6] = 0; dw[7] = 0; @@ -194,8 +167,10 @@ gen7_GPGPU_WALKER(struct ilo_builder *builder, static inline uint32_t gen6_INTERFACE_DESCRIPTOR_DATA(struct ilo_builder *builder, - const struct gen6_idrt_data *data, - int idrt_count) + const struct ilo_state_compute *compute, + const uint32_t *kernel_offsets, + const uint32_t *sampler_offsets, + const uint32_t *binding_table_offsets) { /* * From the Sandy Bridge PRM, volume 2 part 2, page 34: @@ -211,61 +186,26 @@ gen6_INTERFACE_DESCRIPTOR_DATA(struct ilo_builder *builder, * aligned address of the Interface Descriptor data." 
*/
    const int state_align = 32;
-   const int state_len = (32 / 4) * idrt_count;
+   const int state_len = (32 / 4) * compute->idrt_count;
    uint32_t state_offset, *dw;
    int i;
 
-   ILO_DEV_ASSERT(builder->dev, 7, 7.5);
+   ILO_DEV_ASSERT(builder->dev, 6, 7.5);
 
    state_offset = ilo_builder_dynamic_pointer(builder,
          ILO_BUILDER_ITEM_INTERFACE_DESCRIPTOR, state_align, state_len, &dw);
 
-   for (i = 0; i < idrt_count; i++) {
-      const struct gen6_idrt_data *idrt = &data[i];
-      const struct ilo_shader_state *cs = idrt->cs;
-      unsigned sampler_count, bt_size, slm_size;
-
-      sampler_count =
-         ilo_shader_get_kernel_param(cs, ILO_KERNEL_SAMPLER_COUNT);
-      assert(sampler_count <= 16);
-      sampler_count = (sampler_count + 3) / 4;
-
-      bt_size =
-         ilo_shader_get_kernel_param(cs, ILO_KERNEL_SURFACE_TOTAL_COUNT);
-      if (bt_size > 31)
-         bt_size = 31;
-
-      slm_size = ilo_shader_get_kernel_param(cs, ILO_KERNEL_CS_LOCAL_SIZE);
-
-      assert(idrt->curbe_size / 32 <= 63);
-
-      dw[0] = ilo_shader_get_kernel_offset(idrt->cs);
+   for (i = 0; i < compute->idrt_count; i++) {
+      /* see compute_set_gen6_INTERFACE_DESCRIPTOR_DATA() */
+      dw[0] = compute->idrt[i][0] + kernel_offsets[i];
       dw[1] = 0;
-      dw[2] = idrt->sampler_offset |
-              sampler_count << GEN6_IDRT_DW2_SAMPLER_COUNT__SHIFT;
-      dw[3] = idrt->binding_table_offset |
-              bt_size << GEN6_IDRT_DW3_BINDING_TABLE_SIZE__SHIFT;
-
-      dw[4] = (idrt->curbe_size / 32) << GEN6_IDRT_DW4_CURBE_READ_LEN__SHIFT |
-              0 << GEN6_IDRT_DW4_CURBE_READ_OFFSET__SHIFT;
-
-      if (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) {
-         dw[5] = GEN7_IDRT_DW5_ROUNDING_MODE_RTNE;
-
-         if (slm_size) {
-            assert(slm_size <= 64 * 1024);
-            slm_size = util_next_power_of_two((slm_size + 4095) / 4096);
-
-            dw[5] |= GEN7_IDRT_DW5_BARRIER_ENABLE |
-                     slm_size << GEN7_IDRT_DW5_SLM_SIZE__SHIFT |
-                     idrt->thread_group_size <<
-                        GEN7_IDRT_DW5_THREAD_GROUP_SIZE__SHIFT;
-         }
-      } else {
-         dw[5] = 0;
-      }
-
-      dw[6] = 0;
+      dw[2] = compute->idrt[i][1] |
+              sampler_offsets[i];
+      dw[3] = compute->idrt[i][2] |
+              binding_table_offsets[i];
+      dw[4] = compute->idrt[i][3];
+      dw[5] = compute->idrt[i][4];
+      dw[6] = compute->idrt[i][5];
       dw[7] = 0;
 
       dw += 8;
diff --git a/src/gallium/drivers/ilo/core/ilo_core.h b/src/gallium/drivers/ilo/core/ilo_core.h
index 3587d3930f3..0a7f7d9d3fe 100644
--- a/src/gallium/drivers/ilo/core/ilo_core.h
+++ b/src/gallium/drivers/ilo/core/ilo_core.h
@@ -40,7 +40,4 @@
 #include "util/u_memory.h"
 #include "util/u_pointer.h"
 
-#define ILO_PRIM_RECTANGLES PIPE_PRIM_MAX
-#define ILO_PRIM_MAX (PIPE_PRIM_MAX + 1)
-
 #endif /* ILO_CORE_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_debug.h b/src/gallium/drivers/ilo/core/ilo_debug.h
index d9c460498ff..9833233d796 100644
--- a/src/gallium/drivers/ilo/core/ilo_debug.h
+++ b/src/gallium/drivers/ilo/core/ilo_debug.h
@@ -100,4 +100,21 @@ ilo_warn(const char *format, ...)
 #endif
 }
 
+static inline bool
+ilo_is_zeroed(const void *ptr, size_t size)
+{
+#ifdef DEBUG
+   size_t i;
+
+   for (i = 0; i < size; i++) {
+      if (((const char *) ptr)[i] != 0)
+         return false;
+   }
+
+   return true;
+#else
+   return true;
+#endif
+}
+
 #endif /* ILO_DEBUG_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_dev.c b/src/gallium/drivers/ilo/core/ilo_dev.c
index 7a774fa1591..925322abba4 100644
--- a/src/gallium/drivers/ilo/core/ilo_dev.c
+++ b/src/gallium/drivers/ilo/core/ilo_dev.c
@@ -32,14 +32,15 @@
 #include "ilo_dev.h"
 
 /**
- * Initialize the \p dev from \p winsys.  \p winsys is considered owned by \p
- * dev and will be destroyed in \p ilo_dev_cleanup().
+ * Initialize the \p dev from \p winsys.
*/ bool ilo_dev_init(struct ilo_dev *dev, struct intel_winsys *winsys) { const struct intel_winsys_info *info; + assert(ilo_is_zeroed(dev, sizeof(*dev))); + info = intel_winsys_get_info(winsys); dev->winsys = winsys; @@ -178,9 +179,3 @@ ilo_dev_init(struct ilo_dev *dev, struct intel_winsys *winsys) return true; } - -void -ilo_dev_cleanup(struct ilo_dev *dev) -{ - intel_winsys_destroy(dev->winsys); -} diff --git a/src/gallium/drivers/ilo/core/ilo_dev.h b/src/gallium/drivers/ilo/core/ilo_dev.h index 4eb5d59dc86..a9f9b176e16 100644 --- a/src/gallium/drivers/ilo/core/ilo_dev.h +++ b/src/gallium/drivers/ilo/core/ilo_dev.h @@ -63,9 +63,6 @@ struct ilo_dev { bool ilo_dev_init(struct ilo_dev *dev, struct intel_winsys *winsys); -void -ilo_dev_cleanup(struct ilo_dev *dev); - static inline int ilo_dev_gen(const struct ilo_dev *dev) { diff --git a/src/gallium/drivers/ilo/core/ilo_fence.h b/src/gallium/drivers/ilo/core/ilo_fence.h deleted file mode 100644 index 00d555aa95b..00000000000 --- a/src/gallium/drivers/ilo/core/ilo_fence.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Mesa 3-D graphics library - * - * Copyright (C) 2012-2013 LunarG, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - * - * Authors: - * Chia-I Wu <[email protected]> - */ - -#ifndef ILO_FENCE_H -#define ILO_FENCE_H - -#include "intel_winsys.h" - -#include "ilo_core.h" -#include "ilo_dev.h" - -struct ilo_fence { - struct intel_bo *seq_bo; -}; - -static inline void -ilo_fence_init(struct ilo_fence *fence, const struct ilo_dev *dev) -{ - /* no-op */ -} - -static inline void -ilo_fence_cleanup(struct ilo_fence *fence) -{ - intel_bo_unref(fence->seq_bo); -} - -/** - * Set the sequence bo for waiting. The fence is considered signaled when - * there is no sequence bo. - */ -static inline void -ilo_fence_set_seq_bo(struct ilo_fence *fence, struct intel_bo *seq_bo) -{ - intel_bo_unref(fence->seq_bo); - fence->seq_bo = intel_bo_ref(seq_bo); -} - -/** - * Wait for the fence to be signaled or until \p timeout nanoseconds has - * passed. It will wait indefinitely when \p timeout is negative. 
- */ -static inline bool -ilo_fence_wait(struct ilo_fence *fence, int64_t timeout) -{ - return (!fence->seq_bo || intel_bo_wait(fence->seq_bo, timeout) == 0); -} - -#endif /* ILO_FENCE_H */ diff --git a/src/gallium/drivers/ilo/core/ilo_format.c b/src/gallium/drivers/ilo/core/ilo_format.c deleted file mode 100644 index 280e499d54a..00000000000 --- a/src/gallium/drivers/ilo/core/ilo_format.c +++ /dev/null @@ -1,755 +0,0 @@ -/* - * Mesa 3-D graphics library - * - * Copyright (C) 2012-2013 LunarG, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - * - * Authors: - * Chia-I Wu <[email protected]> - */ - -#include "genhw/genhw.h" -#include "ilo_format.h" - -struct ilo_vf_cap { - int vertex_element; -}; - -struct ilo_sol_cap { - int buffer; -}; - -struct ilo_sampler_cap { - int sampling; - int filtering; - int shadow_map; - int chroma_key; -}; - -struct ilo_dp_cap { - int rt_write; - int rt_write_blending; - int typed_write; - int media_color_processing; -}; - -/* - * This table is based on: - * - * - the Sandy Bridge PRM, volume 4 part 1, page 88-97 - * - the Ivy Bridge PRM, volume 2 part 1, page 97-99 - * - the Haswell PRM, volume 7, page 467-470 - */ -static const struct ilo_vf_cap ilo_vf_caps[] = { -#define CAP(vertex_element) { ILO_GEN(vertex_element) } - [GEN6_FORMAT_R32G32B32A32_FLOAT] = CAP( 1), - [GEN6_FORMAT_R32G32B32A32_SINT] = CAP( 1), - [GEN6_FORMAT_R32G32B32A32_UINT] = CAP( 1), - [GEN6_FORMAT_R32G32B32A32_UNORM] = CAP( 1), - [GEN6_FORMAT_R32G32B32A32_SNORM] = CAP( 1), - [GEN6_FORMAT_R64G64_FLOAT] = CAP( 1), - [GEN6_FORMAT_R32G32B32A32_SSCALED] = CAP( 1), - [GEN6_FORMAT_R32G32B32A32_USCALED] = CAP( 1), - [GEN6_FORMAT_R32G32B32A32_SFIXED] = CAP(7.5), - [GEN6_FORMAT_R32G32B32_FLOAT] = CAP( 1), - [GEN6_FORMAT_R32G32B32_SINT] = CAP( 1), - [GEN6_FORMAT_R32G32B32_UINT] = CAP( 1), - [GEN6_FORMAT_R32G32B32_UNORM] = CAP( 1), - [GEN6_FORMAT_R32G32B32_SNORM] = CAP( 1), - [GEN6_FORMAT_R32G32B32_SSCALED] = CAP( 1), - [GEN6_FORMAT_R32G32B32_USCALED] = CAP( 1), - [GEN6_FORMAT_R32G32B32_SFIXED] = CAP(7.5), - [GEN6_FORMAT_R16G16B16A16_UNORM] = CAP( 1), - [GEN6_FORMAT_R16G16B16A16_SNORM] = CAP( 1), - [GEN6_FORMAT_R16G16B16A16_SINT] = CAP( 1), - [GEN6_FORMAT_R16G16B16A16_UINT] = CAP( 1), - [GEN6_FORMAT_R16G16B16A16_FLOAT] = CAP( 1), - [GEN6_FORMAT_R32G32_FLOAT] = CAP( 1), - [GEN6_FORMAT_R32G32_SINT] = CAP( 1), - [GEN6_FORMAT_R32G32_UINT] = CAP( 1), - [GEN6_FORMAT_R32G32_UNORM] = CAP( 1), - [GEN6_FORMAT_R32G32_SNORM] = CAP( 1), - [GEN6_FORMAT_R64_FLOAT] = CAP( 1), - 
[GEN6_FORMAT_R16G16B16A16_SSCALED] = CAP( 1), - [GEN6_FORMAT_R16G16B16A16_USCALED] = CAP( 1), - [GEN6_FORMAT_R32G32_SSCALED] = CAP( 1), - [GEN6_FORMAT_R32G32_USCALED] = CAP( 1), - [GEN6_FORMAT_R32G32_SFIXED] = CAP(7.5), - [GEN6_FORMAT_B8G8R8A8_UNORM] = CAP( 1), - [GEN6_FORMAT_R10G10B10A2_UNORM] = CAP( 1), - [GEN6_FORMAT_R10G10B10A2_UINT] = CAP( 1), - [GEN6_FORMAT_R10G10B10_SNORM_A2_UNORM] = CAP( 1), - [GEN6_FORMAT_R8G8B8A8_UNORM] = CAP( 1), - [GEN6_FORMAT_R8G8B8A8_SNORM] = CAP( 1), - [GEN6_FORMAT_R8G8B8A8_SINT] = CAP( 1), - [GEN6_FORMAT_R8G8B8A8_UINT] = CAP( 1), - [GEN6_FORMAT_R16G16_UNORM] = CAP( 1), - [GEN6_FORMAT_R16G16_SNORM] = CAP( 1), - [GEN6_FORMAT_R16G16_SINT] = CAP( 1), - [GEN6_FORMAT_R16G16_UINT] = CAP( 1), - [GEN6_FORMAT_R16G16_FLOAT] = CAP( 1), - [GEN6_FORMAT_B10G10R10A2_UNORM] = CAP(7.5), - [GEN6_FORMAT_R11G11B10_FLOAT] = CAP( 1), - [GEN6_FORMAT_R32_SINT] = CAP( 1), - [GEN6_FORMAT_R32_UINT] = CAP( 1), - [GEN6_FORMAT_R32_FLOAT] = CAP( 1), - [GEN6_FORMAT_R32_UNORM] = CAP( 1), - [GEN6_FORMAT_R32_SNORM] = CAP( 1), - [GEN6_FORMAT_R10G10B10X2_USCALED] = CAP( 1), - [GEN6_FORMAT_R8G8B8A8_SSCALED] = CAP( 1), - [GEN6_FORMAT_R8G8B8A8_USCALED] = CAP( 1), - [GEN6_FORMAT_R16G16_SSCALED] = CAP( 1), - [GEN6_FORMAT_R16G16_USCALED] = CAP( 1), - [GEN6_FORMAT_R32_SSCALED] = CAP( 1), - [GEN6_FORMAT_R32_USCALED] = CAP( 1), - [GEN6_FORMAT_R8G8_UNORM] = CAP( 1), - [GEN6_FORMAT_R8G8_SNORM] = CAP( 1), - [GEN6_FORMAT_R8G8_SINT] = CAP( 1), - [GEN6_FORMAT_R8G8_UINT] = CAP( 1), - [GEN6_FORMAT_R16_UNORM] = CAP( 1), - [GEN6_FORMAT_R16_SNORM] = CAP( 1), - [GEN6_FORMAT_R16_SINT] = CAP( 1), - [GEN6_FORMAT_R16_UINT] = CAP( 1), - [GEN6_FORMAT_R16_FLOAT] = CAP( 1), - [GEN6_FORMAT_R8G8_SSCALED] = CAP( 1), - [GEN6_FORMAT_R8G8_USCALED] = CAP( 1), - [GEN6_FORMAT_R16_SSCALED] = CAP( 1), - [GEN6_FORMAT_R16_USCALED] = CAP( 1), - [GEN6_FORMAT_R8_UNORM] = CAP( 1), - [GEN6_FORMAT_R8_SNORM] = CAP( 1), - [GEN6_FORMAT_R8_SINT] = CAP( 1), - [GEN6_FORMAT_R8_UINT] = CAP( 1), - [GEN6_FORMAT_R8_SSCALED] = CAP( 1), - [GEN6_FORMAT_R8_USCALED] = CAP( 1), - [GEN6_FORMAT_R8G8B8_UNORM] = CAP( 1), - [GEN6_FORMAT_R8G8B8_SNORM] = CAP( 1), - [GEN6_FORMAT_R8G8B8_SSCALED] = CAP( 1), - [GEN6_FORMAT_R8G8B8_USCALED] = CAP( 1), - [GEN6_FORMAT_R64G64B64A64_FLOAT] = CAP( 1), - [GEN6_FORMAT_R64G64B64_FLOAT] = CAP( 1), - [GEN6_FORMAT_R16G16B16_FLOAT] = CAP( 6), - [GEN6_FORMAT_R16G16B16_UNORM] = CAP( 1), - [GEN6_FORMAT_R16G16B16_SNORM] = CAP( 1), - [GEN6_FORMAT_R16G16B16_SSCALED] = CAP( 1), - [GEN6_FORMAT_R16G16B16_USCALED] = CAP( 1), - [GEN6_FORMAT_R16G16B16_UINT] = CAP(7.5), - [GEN6_FORMAT_R16G16B16_SINT] = CAP(7.5), - [GEN6_FORMAT_R32_SFIXED] = CAP(7.5), - [GEN6_FORMAT_R10G10B10A2_SNORM] = CAP(7.5), - [GEN6_FORMAT_R10G10B10A2_USCALED] = CAP(7.5), - [GEN6_FORMAT_R10G10B10A2_SSCALED] = CAP(7.5), - [GEN6_FORMAT_R10G10B10A2_SINT] = CAP(7.5), - [GEN6_FORMAT_B10G10R10A2_SNORM] = CAP(7.5), - [GEN6_FORMAT_B10G10R10A2_USCALED] = CAP(7.5), - [GEN6_FORMAT_B10G10R10A2_SSCALED] = CAP(7.5), - [GEN6_FORMAT_B10G10R10A2_UINT] = CAP(7.5), - [GEN6_FORMAT_B10G10R10A2_SINT] = CAP(7.5), - [GEN6_FORMAT_R8G8B8_UINT] = CAP(7.5), - [GEN6_FORMAT_R8G8B8_SINT] = CAP(7.5), -#undef CAP -}; - -/* - * This table is based on: - * - * - the Sandy Bridge PRM, volume 4 part 1, page 88-97 - * - the Ivy Bridge PRM, volume 2 part 1, page 195 - * - the Haswell PRM, volume 7, page 535 - */ -static const struct ilo_sol_cap ilo_sol_caps[] = { -#define CAP(buffer) { ILO_GEN(buffer) } - [GEN6_FORMAT_R32G32B32A32_FLOAT] = CAP( 1), - [GEN6_FORMAT_R32G32B32A32_SINT] = CAP( 1), - 
[GEN6_FORMAT_R32G32B32A32_UINT] = CAP( 1), - [GEN6_FORMAT_R32G32B32_FLOAT] = CAP( 1), - [GEN6_FORMAT_R32G32B32_SINT] = CAP( 1), - [GEN6_FORMAT_R32G32B32_UINT] = CAP( 1), - [GEN6_FORMAT_R32G32_FLOAT] = CAP( 1), - [GEN6_FORMAT_R32G32_SINT] = CAP( 1), - [GEN6_FORMAT_R32G32_UINT] = CAP( 1), - [GEN6_FORMAT_R32_SINT] = CAP( 1), - [GEN6_FORMAT_R32_UINT] = CAP( 1), - [GEN6_FORMAT_R32_FLOAT] = CAP( 1), -#undef CAP -}; - -/* - * This table is based on: - * - * - the Sandy Bridge PRM, volume 4 part 1, page 88-97 - * - the Ivy Bridge PRM, volume 4 part 1, page 84-87 - */ -static const struct ilo_sampler_cap ilo_sampler_caps[] = { -#define CAP(sampling, filtering, shadow_map, chroma_key) \ - { ILO_GEN(sampling), ILO_GEN(filtering), ILO_GEN(shadow_map), ILO_GEN(chroma_key) } - [GEN6_FORMAT_R32G32B32A32_FLOAT] = CAP( 1, 5, 0, 0), - [GEN6_FORMAT_R32G32B32A32_SINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_R32G32B32A32_UINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_R32G32B32X32_FLOAT] = CAP( 1, 5, 0, 0), - [GEN6_FORMAT_R32G32B32_FLOAT] = CAP( 1, 5, 0, 0), - [GEN6_FORMAT_R32G32B32_SINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_R32G32B32_UINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_R16G16B16A16_UNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R16G16B16A16_SNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R16G16B16A16_SINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_R16G16B16A16_UINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_R16G16B16A16_FLOAT] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R32G32_FLOAT] = CAP( 1, 5, 0, 0), - [GEN6_FORMAT_R32G32_SINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_R32G32_UINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_R32_FLOAT_X8X24_TYPELESS] = CAP( 1, 5, 1, 0), - [GEN6_FORMAT_X32_TYPELESS_G8X24_UINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_L32A32_FLOAT] = CAP( 1, 5, 0, 0), - [GEN6_FORMAT_R16G16B16X16_UNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R16G16B16X16_FLOAT] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_A32X32_FLOAT] = CAP( 1, 5, 0, 0), - [GEN6_FORMAT_L32X32_FLOAT] = CAP( 1, 5, 0, 0), - [GEN6_FORMAT_I32X32_FLOAT] = CAP( 1, 5, 0, 0), - [GEN6_FORMAT_B8G8R8A8_UNORM] = CAP( 1, 1, 0, 1), - [GEN6_FORMAT_B8G8R8A8_UNORM_SRGB] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R10G10B10A2_UNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R10G10B10A2_UNORM_SRGB] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R10G10B10A2_UINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_R10G10B10_SNORM_A2_UNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R8G8B8A8_UNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R8G8B8A8_UNORM_SRGB] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R8G8B8A8_SNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R8G8B8A8_SINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_R8G8B8A8_UINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_R16G16_UNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R16G16_SNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R16G16_SINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_R16G16_UINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_R16G16_FLOAT] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_B10G10R10A2_UNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_B10G10R10A2_UNORM_SRGB] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R11G11B10_FLOAT] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R32_SINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_R32_UINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_R32_FLOAT] = CAP( 1, 5, 1, 0), - [GEN6_FORMAT_R24_UNORM_X8_TYPELESS] = CAP( 1, 5, 1, 0), - [GEN6_FORMAT_X24_TYPELESS_G8_UINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_L16A16_UNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_I24X8_UNORM] = CAP( 1, 5, 1, 0), - [GEN6_FORMAT_L24X8_UNORM] = CAP( 1, 5, 1, 0), - [GEN6_FORMAT_A24X8_UNORM] = CAP( 1, 5, 1, 0), - [GEN6_FORMAT_I32_FLOAT] = CAP( 1, 5, 1, 0), - [GEN6_FORMAT_L32_FLOAT] = CAP( 1, 5, 1, 0), - [GEN6_FORMAT_A32_FLOAT] = 
CAP( 1, 5, 1, 0), - [GEN6_FORMAT_B8G8R8X8_UNORM] = CAP( 1, 1, 0, 1), - [GEN6_FORMAT_B8G8R8X8_UNORM_SRGB] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R8G8B8X8_UNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R8G8B8X8_UNORM_SRGB] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R9G9B9E5_SHAREDEXP] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_B10G10R10X2_UNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_L16A16_FLOAT] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_B5G6R5_UNORM] = CAP( 1, 1, 0, 1), - [GEN6_FORMAT_B5G6R5_UNORM_SRGB] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_B5G5R5A1_UNORM] = CAP( 1, 1, 0, 1), - [GEN6_FORMAT_B5G5R5A1_UNORM_SRGB] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_B4G4R4A4_UNORM] = CAP( 1, 1, 0, 1), - [GEN6_FORMAT_B4G4R4A4_UNORM_SRGB] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R8G8_UNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R8G8_SNORM] = CAP( 1, 1, 0, 1), - [GEN6_FORMAT_R8G8_SINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_R8G8_UINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_R16_UNORM] = CAP( 1, 1, 1, 0), - [GEN6_FORMAT_R16_SNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R16_SINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_R16_UINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_R16_FLOAT] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_A8P8_UNORM_PALETTE0] = CAP( 5, 5, 0, 0), - [GEN6_FORMAT_A8P8_UNORM_PALETTE1] = CAP( 5, 5, 0, 0), - [GEN6_FORMAT_I16_UNORM] = CAP( 1, 1, 1, 0), - [GEN6_FORMAT_L16_UNORM] = CAP( 1, 1, 1, 0), - [GEN6_FORMAT_A16_UNORM] = CAP( 1, 1, 1, 0), - [GEN6_FORMAT_L8A8_UNORM] = CAP( 1, 1, 0, 1), - [GEN6_FORMAT_I16_FLOAT] = CAP( 1, 1, 1, 0), - [GEN6_FORMAT_L16_FLOAT] = CAP( 1, 1, 1, 0), - [GEN6_FORMAT_A16_FLOAT] = CAP( 1, 1, 1, 0), - [GEN6_FORMAT_L8A8_UNORM_SRGB] = CAP(4.5, 4.5, 0, 0), - [GEN6_FORMAT_R5G5_SNORM_B6_UNORM] = CAP( 1, 1, 0, 1), - [GEN6_FORMAT_P8A8_UNORM_PALETTE0] = CAP( 5, 5, 0, 0), - [GEN6_FORMAT_P8A8_UNORM_PALETTE1] = CAP( 5, 5, 0, 0), - [GEN6_FORMAT_R8_UNORM] = CAP( 1, 1, 0, 4.5), - [GEN6_FORMAT_R8_SNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R8_SINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_R8_UINT] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_A8_UNORM] = CAP( 1, 1, 0, 1), - [GEN6_FORMAT_I8_UNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_L8_UNORM] = CAP( 1, 1, 0, 1), - [GEN6_FORMAT_P4A4_UNORM_PALETTE0] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_A4P4_UNORM_PALETTE0] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_P8_UNORM_PALETTE0] = CAP(4.5, 4.5, 0, 0), - [GEN6_FORMAT_L8_UNORM_SRGB] = CAP(4.5, 4.5, 0, 0), - [GEN6_FORMAT_P8_UNORM_PALETTE1] = CAP(4.5, 4.5, 0, 0), - [GEN6_FORMAT_P4A4_UNORM_PALETTE1] = CAP(4.5, 4.5, 0, 0), - [GEN6_FORMAT_A4P4_UNORM_PALETTE1] = CAP(4.5, 4.5, 0, 0), - [GEN6_FORMAT_DXT1_RGB_SRGB] = CAP(4.5, 4.5, 0, 0), - [GEN6_FORMAT_R1_UNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_YCRCB_NORMAL] = CAP( 1, 1, 0, 1), - [GEN6_FORMAT_YCRCB_SWAPUVY] = CAP( 1, 1, 0, 1), - [GEN6_FORMAT_P2_UNORM_PALETTE0] = CAP(4.5, 4.5, 0, 0), - [GEN6_FORMAT_P2_UNORM_PALETTE1] = CAP(4.5, 4.5, 0, 0), - [GEN6_FORMAT_BC1_UNORM] = CAP( 1, 1, 0, 1), - [GEN6_FORMAT_BC2_UNORM] = CAP( 1, 1, 0, 1), - [GEN6_FORMAT_BC3_UNORM] = CAP( 1, 1, 0, 1), - [GEN6_FORMAT_BC4_UNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_BC5_UNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_BC1_UNORM_SRGB] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_BC2_UNORM_SRGB] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_BC3_UNORM_SRGB] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_MONO8] = CAP( 1, 0, 0, 0), - [GEN6_FORMAT_YCRCB_SWAPUV] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_YCRCB_SWAPY] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_DXT1_RGB] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_FXT1] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_BC4_SNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_BC5_SNORM] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R16G16B16_FLOAT] = CAP( 5, 5, 0, 0), 
- [GEN6_FORMAT_BC6H_SF16] = CAP( 7, 7, 0, 0), - [GEN6_FORMAT_BC7_UNORM] = CAP( 7, 7, 0, 0), - [GEN6_FORMAT_BC7_UNORM_SRGB] = CAP( 7, 7, 0, 0), - [GEN6_FORMAT_BC6H_UF16] = CAP( 7, 7, 0, 0), -#undef CAP -}; - -/* - * This table is based on: - * - * - the Sandy Bridge PRM, volume 4 part 1, page 88-97 - * - the Ivy Bridge PRM, volume 4 part 1, page 172, 252-253, and 277-278 - * - the Haswell PRM, volume 7, page 262-264 - */ -static const struct ilo_dp_cap ilo_dp_caps[] = { -#define CAP(rt_write, rt_write_blending, typed_write, media_color_processing) \ - { ILO_GEN(rt_write), ILO_GEN(rt_write_blending), ILO_GEN(typed_write), ILO_GEN(media_color_processing) } - [GEN6_FORMAT_R32G32B32A32_FLOAT] = CAP( 1, 1, 7, 0), - [GEN6_FORMAT_R32G32B32A32_SINT] = CAP( 1, 0, 7, 0), - [GEN6_FORMAT_R32G32B32A32_UINT] = CAP( 1, 0, 7, 0), - [GEN6_FORMAT_R16G16B16A16_UNORM] = CAP( 1, 4.5, 7, 6), - [GEN6_FORMAT_R16G16B16A16_SNORM] = CAP( 1, 6, 7, 0), - [GEN6_FORMAT_R16G16B16A16_SINT] = CAP( 1, 0, 7, 0), - [GEN6_FORMAT_R16G16B16A16_UINT] = CAP( 1, 0, 7, 0), - [GEN6_FORMAT_R16G16B16A16_FLOAT] = CAP( 1, 1, 7, 0), - [GEN6_FORMAT_R32G32_FLOAT] = CAP( 1, 1, 7, 0), - [GEN6_FORMAT_R32G32_SINT] = CAP( 1, 0, 7, 0), - [GEN6_FORMAT_R32G32_UINT] = CAP( 1, 0, 7, 0), - [GEN6_FORMAT_B8G8R8A8_UNORM] = CAP( 1, 1, 7, 6), - [GEN6_FORMAT_B8G8R8A8_UNORM_SRGB] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R10G10B10A2_UNORM] = CAP( 1, 1, 7, 6), - [GEN6_FORMAT_R10G10B10A2_UNORM_SRGB] = CAP( 0, 0, 0, 6), - [GEN6_FORMAT_R10G10B10A2_UINT] = CAP( 1, 0, 7, 0), - [GEN6_FORMAT_R8G8B8A8_UNORM] = CAP( 1, 1, 7, 6), - [GEN6_FORMAT_R8G8B8A8_UNORM_SRGB] = CAP( 1, 1, 0, 6), - [GEN6_FORMAT_R8G8B8A8_SNORM] = CAP( 1, 6, 7, 0), - [GEN6_FORMAT_R8G8B8A8_SINT] = CAP( 1, 0, 7, 0), - [GEN6_FORMAT_R8G8B8A8_UINT] = CAP( 1, 0, 7, 0), - [GEN6_FORMAT_R16G16_UNORM] = CAP( 1, 4.5, 7, 0), - [GEN6_FORMAT_R16G16_SNORM] = CAP( 1, 6, 7, 0), - [GEN6_FORMAT_R16G16_SINT] = CAP( 1, 0, 7, 0), - [GEN6_FORMAT_R16G16_UINT] = CAP( 1, 0, 7, 0), - [GEN6_FORMAT_R16G16_FLOAT] = CAP( 1, 1, 7, 0), - [GEN6_FORMAT_B10G10R10A2_UNORM] = CAP( 1, 1, 7, 6), - [GEN6_FORMAT_B10G10R10A2_UNORM_SRGB] = CAP( 1, 1, 0, 6), - [GEN6_FORMAT_R11G11B10_FLOAT] = CAP( 1, 1, 7, 0), - [GEN6_FORMAT_R32_SINT] = CAP( 1, 0, 7, 0), - [GEN6_FORMAT_R32_UINT] = CAP( 1, 0, 7, 0), - [GEN6_FORMAT_R32_FLOAT] = CAP( 1, 1, 7, 0), - [GEN6_FORMAT_B8G8R8X8_UNORM] = CAP( 0, 0, 0, 6), - [GEN6_FORMAT_B5G6R5_UNORM] = CAP( 1, 1, 7, 0), - [GEN6_FORMAT_B5G6R5_UNORM_SRGB] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_B5G5R5A1_UNORM] = CAP( 1, 1, 7, 0), - [GEN6_FORMAT_B5G5R5A1_UNORM_SRGB] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_B4G4R4A4_UNORM] = CAP( 1, 1, 7, 0), - [GEN6_FORMAT_B4G4R4A4_UNORM_SRGB] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R8G8_UNORM] = CAP( 1, 1, 7, 0), - [GEN6_FORMAT_R8G8_SNORM] = CAP( 1, 6, 7, 0), - [GEN6_FORMAT_R8G8_SINT] = CAP( 1, 0, 7, 0), - [GEN6_FORMAT_R8G8_UINT] = CAP( 1, 0, 7, 0), - [GEN6_FORMAT_R16_UNORM] = CAP( 1, 4.5, 7, 7), - [GEN6_FORMAT_R16_SNORM] = CAP( 1, 6, 7, 0), - [GEN6_FORMAT_R16_SINT] = CAP( 1, 0, 7, 0), - [GEN6_FORMAT_R16_UINT] = CAP( 1, 0, 7, 0), - [GEN6_FORMAT_R16_FLOAT] = CAP( 1, 1, 7, 0), - [GEN6_FORMAT_B5G5R5X1_UNORM] = CAP( 1, 1, 7, 0), - [GEN6_FORMAT_B5G5R5X1_UNORM_SRGB] = CAP( 1, 1, 0, 0), - [GEN6_FORMAT_R8_UNORM] = CAP( 1, 1, 7, 0), - [GEN6_FORMAT_R8_SNORM] = CAP( 1, 6, 7, 0), - [GEN6_FORMAT_R8_SINT] = CAP( 1, 0, 7, 0), - [GEN6_FORMAT_R8_UINT] = CAP( 1, 0, 7, 0), - [GEN6_FORMAT_A8_UNORM] = CAP( 1, 1, 7, 0), - [GEN6_FORMAT_YCRCB_NORMAL] = CAP( 1, 0, 0, 6), - [GEN6_FORMAT_YCRCB_SWAPUVY] = CAP( 1, 0, 0, 6), - 
[GEN6_FORMAT_YCRCB_SWAPUV] = CAP( 1, 0, 0, 6), - [GEN6_FORMAT_YCRCB_SWAPY] = CAP( 1, 0, 0, 6), -#undef CAP -}; - -bool -ilo_format_support_vb(const struct ilo_dev *dev, - enum pipe_format format) -{ - const int idx = ilo_format_translate(dev, format, PIPE_BIND_VERTEX_BUFFER); - const struct ilo_vf_cap *cap = (idx >= 0 && idx < Elements(ilo_vf_caps)) ? - &ilo_vf_caps[idx] : NULL; - - return (cap && cap->vertex_element && - ilo_dev_gen(dev) >= cap->vertex_element); -} - -bool -ilo_format_support_sol(const struct ilo_dev *dev, - enum pipe_format format) -{ - const int idx = ilo_format_translate(dev, format, PIPE_BIND_STREAM_OUTPUT); - const struct ilo_sol_cap *cap = (idx >= 0 && idx < Elements(ilo_sol_caps)) ? - &ilo_sol_caps[idx] : NULL; - - return (cap && cap->buffer && ilo_dev_gen(dev) >= cap->buffer); -} - -bool -ilo_format_support_sampler(const struct ilo_dev *dev, - enum pipe_format format) -{ - const int idx = ilo_format_translate(dev, format, PIPE_BIND_SAMPLER_VIEW); - const struct ilo_sampler_cap *cap = (idx >= 0 && - idx < Elements(ilo_sampler_caps)) ? &ilo_sampler_caps[idx] : NULL; - - if (!cap || !cap->sampling) - return false; - - assert(!cap->filtering || cap->filtering >= cap->sampling); - - if (util_format_is_pure_integer(format)) - return (ilo_dev_gen(dev) >= cap->sampling); - else if (cap->filtering) - return (ilo_dev_gen(dev) >= cap->filtering); - else - return false; -} - -bool -ilo_format_support_rt(const struct ilo_dev *dev, - enum pipe_format format) -{ - const int idx = ilo_format_translate(dev, format, PIPE_BIND_RENDER_TARGET); - const struct ilo_dp_cap *cap = (idx >= 0 && idx < Elements(ilo_dp_caps)) ? - &ilo_dp_caps[idx] : NULL; - - if (!cap || !cap->rt_write) - return false; - - assert(!cap->rt_write_blending || cap->rt_write_blending >= cap->rt_write); - - if (util_format_is_pure_integer(format)) - return (ilo_dev_gen(dev) >= cap->rt_write); - else if (cap->rt_write_blending) - return (ilo_dev_gen(dev) >= cap->rt_write_blending); - else - return false; -} - -bool -ilo_format_support_zs(const struct ilo_dev *dev, - enum pipe_format format) -{ - switch (format) { - case PIPE_FORMAT_Z16_UNORM: - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - return true; - case PIPE_FORMAT_S8_UINT: - /* TODO separate stencil */ - default: - return false; - } -} - -/** - * Translate a color (non-depth/stencil) pipe format to the matching hardware - * format. Return -1 on errors. 
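One pattern in the removed ilo_format.c deserves a note: each cap table stores, per hardware format, the first generation that supports a feature, encoded with ILO_GEN() so that a support query is a single comparison against ilo_dev_gen(dev), with zero meaning never supported. A sketch of that check, restating the sampler case above where pure-integer formats are fetched rather than filtered (nothing new, just the pattern isolated):

   static bool
   sampler_cap_ok(const struct ilo_dev *dev, enum pipe_format format,
                  const struct ilo_sampler_cap *cap)
   {
      /* pure integer formats are never filtered; sampling alone decides */
      if (util_format_is_pure_integer(format))
         return cap->sampling && ilo_dev_gen(dev) >= cap->sampling;

      /* otherwise the (later or equal) filtering generation governs */
      return cap->filtering && ilo_dev_gen(dev) >= cap->filtering;
   }

The translate function documented just below leans on the accident that GEN6_FORMAT_R32G32B32A32_FLOAT is numerically 0, which is why that one pipe format is special-cased before 0 is reinterpreted as "unmapped".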
- */ -int -ilo_format_translate_color(const struct ilo_dev *dev, - enum pipe_format format) -{ - static const int format_mapping[PIPE_FORMAT_COUNT] = { - [PIPE_FORMAT_NONE] = 0, - [PIPE_FORMAT_B8G8R8A8_UNORM] = GEN6_FORMAT_B8G8R8A8_UNORM, - [PIPE_FORMAT_B8G8R8X8_UNORM] = GEN6_FORMAT_B8G8R8X8_UNORM, - [PIPE_FORMAT_A8R8G8B8_UNORM] = 0, - [PIPE_FORMAT_X8R8G8B8_UNORM] = 0, - [PIPE_FORMAT_B5G5R5A1_UNORM] = GEN6_FORMAT_B5G5R5A1_UNORM, - [PIPE_FORMAT_B4G4R4A4_UNORM] = GEN6_FORMAT_B4G4R4A4_UNORM, - [PIPE_FORMAT_B5G6R5_UNORM] = GEN6_FORMAT_B5G6R5_UNORM, - [PIPE_FORMAT_R10G10B10A2_UNORM] = GEN6_FORMAT_R10G10B10A2_UNORM, - [PIPE_FORMAT_L8_UNORM] = GEN6_FORMAT_L8_UNORM, - [PIPE_FORMAT_A8_UNORM] = GEN6_FORMAT_A8_UNORM, - [PIPE_FORMAT_I8_UNORM] = GEN6_FORMAT_I8_UNORM, - [PIPE_FORMAT_L8A8_UNORM] = GEN6_FORMAT_L8A8_UNORM, - [PIPE_FORMAT_L16_UNORM] = GEN6_FORMAT_L16_UNORM, - [PIPE_FORMAT_UYVY] = GEN6_FORMAT_YCRCB_SWAPUVY, - [PIPE_FORMAT_YUYV] = GEN6_FORMAT_YCRCB_NORMAL, - [PIPE_FORMAT_Z16_UNORM] = 0, - [PIPE_FORMAT_Z32_UNORM] = 0, - [PIPE_FORMAT_Z32_FLOAT] = 0, - [PIPE_FORMAT_Z24_UNORM_S8_UINT] = 0, - [PIPE_FORMAT_S8_UINT_Z24_UNORM] = 0, - [PIPE_FORMAT_Z24X8_UNORM] = 0, - [PIPE_FORMAT_X8Z24_UNORM] = 0, - [PIPE_FORMAT_S8_UINT] = 0, - [PIPE_FORMAT_R64_FLOAT] = GEN6_FORMAT_R64_FLOAT, - [PIPE_FORMAT_R64G64_FLOAT] = GEN6_FORMAT_R64G64_FLOAT, - [PIPE_FORMAT_R64G64B64_FLOAT] = GEN6_FORMAT_R64G64B64_FLOAT, - [PIPE_FORMAT_R64G64B64A64_FLOAT] = GEN6_FORMAT_R64G64B64A64_FLOAT, - [PIPE_FORMAT_R32_FLOAT] = GEN6_FORMAT_R32_FLOAT, - [PIPE_FORMAT_R32G32_FLOAT] = GEN6_FORMAT_R32G32_FLOAT, - [PIPE_FORMAT_R32G32B32_FLOAT] = GEN6_FORMAT_R32G32B32_FLOAT, - [PIPE_FORMAT_R32G32B32A32_FLOAT] = GEN6_FORMAT_R32G32B32A32_FLOAT, - [PIPE_FORMAT_R32_UNORM] = GEN6_FORMAT_R32_UNORM, - [PIPE_FORMAT_R32G32_UNORM] = GEN6_FORMAT_R32G32_UNORM, - [PIPE_FORMAT_R32G32B32_UNORM] = GEN6_FORMAT_R32G32B32_UNORM, - [PIPE_FORMAT_R32G32B32A32_UNORM] = GEN6_FORMAT_R32G32B32A32_UNORM, - [PIPE_FORMAT_R32_USCALED] = GEN6_FORMAT_R32_USCALED, - [PIPE_FORMAT_R32G32_USCALED] = GEN6_FORMAT_R32G32_USCALED, - [PIPE_FORMAT_R32G32B32_USCALED] = GEN6_FORMAT_R32G32B32_USCALED, - [PIPE_FORMAT_R32G32B32A32_USCALED] = GEN6_FORMAT_R32G32B32A32_USCALED, - [PIPE_FORMAT_R32_SNORM] = GEN6_FORMAT_R32_SNORM, - [PIPE_FORMAT_R32G32_SNORM] = GEN6_FORMAT_R32G32_SNORM, - [PIPE_FORMAT_R32G32B32_SNORM] = GEN6_FORMAT_R32G32B32_SNORM, - [PIPE_FORMAT_R32G32B32A32_SNORM] = GEN6_FORMAT_R32G32B32A32_SNORM, - [PIPE_FORMAT_R32_SSCALED] = GEN6_FORMAT_R32_SSCALED, - [PIPE_FORMAT_R32G32_SSCALED] = GEN6_FORMAT_R32G32_SSCALED, - [PIPE_FORMAT_R32G32B32_SSCALED] = GEN6_FORMAT_R32G32B32_SSCALED, - [PIPE_FORMAT_R32G32B32A32_SSCALED] = GEN6_FORMAT_R32G32B32A32_SSCALED, - [PIPE_FORMAT_R16_UNORM] = GEN6_FORMAT_R16_UNORM, - [PIPE_FORMAT_R16G16_UNORM] = GEN6_FORMAT_R16G16_UNORM, - [PIPE_FORMAT_R16G16B16_UNORM] = GEN6_FORMAT_R16G16B16_UNORM, - [PIPE_FORMAT_R16G16B16A16_UNORM] = GEN6_FORMAT_R16G16B16A16_UNORM, - [PIPE_FORMAT_R16_USCALED] = GEN6_FORMAT_R16_USCALED, - [PIPE_FORMAT_R16G16_USCALED] = GEN6_FORMAT_R16G16_USCALED, - [PIPE_FORMAT_R16G16B16_USCALED] = GEN6_FORMAT_R16G16B16_USCALED, - [PIPE_FORMAT_R16G16B16A16_USCALED] = GEN6_FORMAT_R16G16B16A16_USCALED, - [PIPE_FORMAT_R16_SNORM] = GEN6_FORMAT_R16_SNORM, - [PIPE_FORMAT_R16G16_SNORM] = GEN6_FORMAT_R16G16_SNORM, - [PIPE_FORMAT_R16G16B16_SNORM] = GEN6_FORMAT_R16G16B16_SNORM, - [PIPE_FORMAT_R16G16B16A16_SNORM] = GEN6_FORMAT_R16G16B16A16_SNORM, - [PIPE_FORMAT_R16_SSCALED] = GEN6_FORMAT_R16_SSCALED, - [PIPE_FORMAT_R16G16_SSCALED] = 
GEN6_FORMAT_R16G16_SSCALED, - [PIPE_FORMAT_R16G16B16_SSCALED] = GEN6_FORMAT_R16G16B16_SSCALED, - [PIPE_FORMAT_R16G16B16A16_SSCALED] = GEN6_FORMAT_R16G16B16A16_SSCALED, - [PIPE_FORMAT_R8_UNORM] = GEN6_FORMAT_R8_UNORM, - [PIPE_FORMAT_R8G8_UNORM] = GEN6_FORMAT_R8G8_UNORM, - [PIPE_FORMAT_R8G8B8_UNORM] = GEN6_FORMAT_R8G8B8_UNORM, - [PIPE_FORMAT_R8G8B8A8_UNORM] = GEN6_FORMAT_R8G8B8A8_UNORM, - [PIPE_FORMAT_X8B8G8R8_UNORM] = 0, - [PIPE_FORMAT_R8_USCALED] = GEN6_FORMAT_R8_USCALED, - [PIPE_FORMAT_R8G8_USCALED] = GEN6_FORMAT_R8G8_USCALED, - [PIPE_FORMAT_R8G8B8_USCALED] = GEN6_FORMAT_R8G8B8_USCALED, - [PIPE_FORMAT_R8G8B8A8_USCALED] = GEN6_FORMAT_R8G8B8A8_USCALED, - [PIPE_FORMAT_R8_SNORM] = GEN6_FORMAT_R8_SNORM, - [PIPE_FORMAT_R8G8_SNORM] = GEN6_FORMAT_R8G8_SNORM, - [PIPE_FORMAT_R8G8B8_SNORM] = GEN6_FORMAT_R8G8B8_SNORM, - [PIPE_FORMAT_R8G8B8A8_SNORM] = GEN6_FORMAT_R8G8B8A8_SNORM, - [PIPE_FORMAT_R8_SSCALED] = GEN6_FORMAT_R8_SSCALED, - [PIPE_FORMAT_R8G8_SSCALED] = GEN6_FORMAT_R8G8_SSCALED, - [PIPE_FORMAT_R8G8B8_SSCALED] = GEN6_FORMAT_R8G8B8_SSCALED, - [PIPE_FORMAT_R8G8B8A8_SSCALED] = GEN6_FORMAT_R8G8B8A8_SSCALED, - [PIPE_FORMAT_R32_FIXED] = GEN6_FORMAT_R32_SFIXED, - [PIPE_FORMAT_R32G32_FIXED] = GEN6_FORMAT_R32G32_SFIXED, - [PIPE_FORMAT_R32G32B32_FIXED] = GEN6_FORMAT_R32G32B32_SFIXED, - [PIPE_FORMAT_R32G32B32A32_FIXED] = GEN6_FORMAT_R32G32B32A32_SFIXED, - [PIPE_FORMAT_R16_FLOAT] = GEN6_FORMAT_R16_FLOAT, - [PIPE_FORMAT_R16G16_FLOAT] = GEN6_FORMAT_R16G16_FLOAT, - [PIPE_FORMAT_R16G16B16_FLOAT] = GEN6_FORMAT_R16G16B16_FLOAT, - [PIPE_FORMAT_R16G16B16A16_FLOAT] = GEN6_FORMAT_R16G16B16A16_FLOAT, - [PIPE_FORMAT_L8_SRGB] = GEN6_FORMAT_L8_UNORM_SRGB, - [PIPE_FORMAT_L8A8_SRGB] = GEN6_FORMAT_L8A8_UNORM_SRGB, - [PIPE_FORMAT_R8G8B8_SRGB] = GEN6_FORMAT_R8G8B8_UNORM_SRGB, - [PIPE_FORMAT_A8B8G8R8_SRGB] = 0, - [PIPE_FORMAT_X8B8G8R8_SRGB] = 0, - [PIPE_FORMAT_B8G8R8A8_SRGB] = GEN6_FORMAT_B8G8R8A8_UNORM_SRGB, - [PIPE_FORMAT_B8G8R8X8_SRGB] = GEN6_FORMAT_B8G8R8X8_UNORM_SRGB, - [PIPE_FORMAT_A8R8G8B8_SRGB] = 0, - [PIPE_FORMAT_X8R8G8B8_SRGB] = 0, - [PIPE_FORMAT_R8G8B8A8_SRGB] = GEN6_FORMAT_R8G8B8A8_UNORM_SRGB, - [PIPE_FORMAT_DXT1_RGB] = GEN6_FORMAT_DXT1_RGB, - [PIPE_FORMAT_DXT1_RGBA] = GEN6_FORMAT_BC1_UNORM, - [PIPE_FORMAT_DXT3_RGBA] = GEN6_FORMAT_BC2_UNORM, - [PIPE_FORMAT_DXT5_RGBA] = GEN6_FORMAT_BC3_UNORM, - [PIPE_FORMAT_DXT1_SRGB] = GEN6_FORMAT_DXT1_RGB_SRGB, - [PIPE_FORMAT_DXT1_SRGBA] = GEN6_FORMAT_BC1_UNORM_SRGB, - [PIPE_FORMAT_DXT3_SRGBA] = GEN6_FORMAT_BC2_UNORM_SRGB, - [PIPE_FORMAT_DXT5_SRGBA] = GEN6_FORMAT_BC3_UNORM_SRGB, - [PIPE_FORMAT_RGTC1_UNORM] = GEN6_FORMAT_BC4_UNORM, - [PIPE_FORMAT_RGTC1_SNORM] = GEN6_FORMAT_BC4_SNORM, - [PIPE_FORMAT_RGTC2_UNORM] = GEN6_FORMAT_BC5_UNORM, - [PIPE_FORMAT_RGTC2_SNORM] = GEN6_FORMAT_BC5_SNORM, - [PIPE_FORMAT_R8G8_B8G8_UNORM] = 0, - [PIPE_FORMAT_G8R8_G8B8_UNORM] = 0, - [PIPE_FORMAT_R8SG8SB8UX8U_NORM] = 0, - [PIPE_FORMAT_R5SG5SB6U_NORM] = 0, - [PIPE_FORMAT_A8B8G8R8_UNORM] = 0, - [PIPE_FORMAT_B5G5R5X1_UNORM] = GEN6_FORMAT_B5G5R5X1_UNORM, - [PIPE_FORMAT_R10G10B10A2_USCALED] = GEN6_FORMAT_R10G10B10A2_USCALED, - [PIPE_FORMAT_R11G11B10_FLOAT] = GEN6_FORMAT_R11G11B10_FLOAT, - [PIPE_FORMAT_R9G9B9E5_FLOAT] = GEN6_FORMAT_R9G9B9E5_SHAREDEXP, - [PIPE_FORMAT_Z32_FLOAT_S8X24_UINT] = 0, - [PIPE_FORMAT_R1_UNORM] = GEN6_FORMAT_R1_UNORM, - [PIPE_FORMAT_R10G10B10X2_USCALED] = GEN6_FORMAT_R10G10B10X2_USCALED, - [PIPE_FORMAT_R10G10B10X2_SNORM] = 0, - [PIPE_FORMAT_L4A4_UNORM] = 0, - [PIPE_FORMAT_B10G10R10A2_UNORM] = GEN6_FORMAT_B10G10R10A2_UNORM, - [PIPE_FORMAT_R10SG10SB10SA2U_NORM] = 0, - 
[PIPE_FORMAT_R8G8Bx_SNORM] = 0, - [PIPE_FORMAT_R8G8B8X8_UNORM] = GEN6_FORMAT_R8G8B8X8_UNORM, - [PIPE_FORMAT_B4G4R4X4_UNORM] = 0, - [PIPE_FORMAT_X24S8_UINT] = 0, - [PIPE_FORMAT_S8X24_UINT] = 0, - [PIPE_FORMAT_X32_S8X24_UINT] = 0, - [PIPE_FORMAT_B2G3R3_UNORM] = 0, - [PIPE_FORMAT_L16A16_UNORM] = GEN6_FORMAT_L16A16_UNORM, - [PIPE_FORMAT_A16_UNORM] = GEN6_FORMAT_A16_UNORM, - [PIPE_FORMAT_I16_UNORM] = GEN6_FORMAT_I16_UNORM, - [PIPE_FORMAT_LATC1_UNORM] = 0, - [PIPE_FORMAT_LATC1_SNORM] = 0, - [PIPE_FORMAT_LATC2_UNORM] = 0, - [PIPE_FORMAT_LATC2_SNORM] = 0, - [PIPE_FORMAT_A8_SNORM] = 0, - [PIPE_FORMAT_L8_SNORM] = 0, - [PIPE_FORMAT_L8A8_SNORM] = 0, - [PIPE_FORMAT_I8_SNORM] = 0, - [PIPE_FORMAT_A16_SNORM] = 0, - [PIPE_FORMAT_L16_SNORM] = 0, - [PIPE_FORMAT_L16A16_SNORM] = 0, - [PIPE_FORMAT_I16_SNORM] = 0, - [PIPE_FORMAT_A16_FLOAT] = GEN6_FORMAT_A16_FLOAT, - [PIPE_FORMAT_L16_FLOAT] = GEN6_FORMAT_L16_FLOAT, - [PIPE_FORMAT_L16A16_FLOAT] = GEN6_FORMAT_L16A16_FLOAT, - [PIPE_FORMAT_I16_FLOAT] = GEN6_FORMAT_I16_FLOAT, - [PIPE_FORMAT_A32_FLOAT] = GEN6_FORMAT_A32_FLOAT, - [PIPE_FORMAT_L32_FLOAT] = GEN6_FORMAT_L32_FLOAT, - [PIPE_FORMAT_L32A32_FLOAT] = GEN6_FORMAT_L32A32_FLOAT, - [PIPE_FORMAT_I32_FLOAT] = GEN6_FORMAT_I32_FLOAT, - [PIPE_FORMAT_YV12] = 0, - [PIPE_FORMAT_YV16] = 0, - [PIPE_FORMAT_IYUV] = 0, - [PIPE_FORMAT_NV12] = 0, - [PIPE_FORMAT_NV21] = 0, - [PIPE_FORMAT_A4R4_UNORM] = 0, - [PIPE_FORMAT_R4A4_UNORM] = 0, - [PIPE_FORMAT_R8A8_UNORM] = 0, - [PIPE_FORMAT_A8R8_UNORM] = 0, - [PIPE_FORMAT_R10G10B10A2_SSCALED] = GEN6_FORMAT_R10G10B10A2_SSCALED, - [PIPE_FORMAT_R10G10B10A2_SNORM] = GEN6_FORMAT_R10G10B10A2_SNORM, - [PIPE_FORMAT_B10G10R10A2_USCALED] = GEN6_FORMAT_B10G10R10A2_USCALED, - [PIPE_FORMAT_B10G10R10A2_SSCALED] = GEN6_FORMAT_B10G10R10A2_SSCALED, - [PIPE_FORMAT_B10G10R10A2_SNORM] = GEN6_FORMAT_B10G10R10A2_SNORM, - [PIPE_FORMAT_R8_UINT] = GEN6_FORMAT_R8_UINT, - [PIPE_FORMAT_R8G8_UINT] = GEN6_FORMAT_R8G8_UINT, - [PIPE_FORMAT_R8G8B8_UINT] = GEN6_FORMAT_R8G8B8_UINT, - [PIPE_FORMAT_R8G8B8A8_UINT] = GEN6_FORMAT_R8G8B8A8_UINT, - [PIPE_FORMAT_R8_SINT] = GEN6_FORMAT_R8_SINT, - [PIPE_FORMAT_R8G8_SINT] = GEN6_FORMAT_R8G8_SINT, - [PIPE_FORMAT_R8G8B8_SINT] = GEN6_FORMAT_R8G8B8_SINT, - [PIPE_FORMAT_R8G8B8A8_SINT] = GEN6_FORMAT_R8G8B8A8_SINT, - [PIPE_FORMAT_R16_UINT] = GEN6_FORMAT_R16_UINT, - [PIPE_FORMAT_R16G16_UINT] = GEN6_FORMAT_R16G16_UINT, - [PIPE_FORMAT_R16G16B16_UINT] = GEN6_FORMAT_R16G16B16_UINT, - [PIPE_FORMAT_R16G16B16A16_UINT] = GEN6_FORMAT_R16G16B16A16_UINT, - [PIPE_FORMAT_R16_SINT] = GEN6_FORMAT_R16_SINT, - [PIPE_FORMAT_R16G16_SINT] = GEN6_FORMAT_R16G16_SINT, - [PIPE_FORMAT_R16G16B16_SINT] = GEN6_FORMAT_R16G16B16_SINT, - [PIPE_FORMAT_R16G16B16A16_SINT] = GEN6_FORMAT_R16G16B16A16_SINT, - [PIPE_FORMAT_R32_UINT] = GEN6_FORMAT_R32_UINT, - [PIPE_FORMAT_R32G32_UINT] = GEN6_FORMAT_R32G32_UINT, - [PIPE_FORMAT_R32G32B32_UINT] = GEN6_FORMAT_R32G32B32_UINT, - [PIPE_FORMAT_R32G32B32A32_UINT] = GEN6_FORMAT_R32G32B32A32_UINT, - [PIPE_FORMAT_R32_SINT] = GEN6_FORMAT_R32_SINT, - [PIPE_FORMAT_R32G32_SINT] = GEN6_FORMAT_R32G32_SINT, - [PIPE_FORMAT_R32G32B32_SINT] = GEN6_FORMAT_R32G32B32_SINT, - [PIPE_FORMAT_R32G32B32A32_SINT] = GEN6_FORMAT_R32G32B32A32_SINT, - [PIPE_FORMAT_A8_UINT] = 0, - [PIPE_FORMAT_I8_UINT] = GEN6_FORMAT_I8_UINT, - [PIPE_FORMAT_L8_UINT] = GEN6_FORMAT_L8_UINT, - [PIPE_FORMAT_L8A8_UINT] = GEN6_FORMAT_L8A8_UINT, - [PIPE_FORMAT_A8_SINT] = 0, - [PIPE_FORMAT_I8_SINT] = GEN6_FORMAT_I8_SINT, - [PIPE_FORMAT_L8_SINT] = GEN6_FORMAT_L8_SINT, - [PIPE_FORMAT_L8A8_SINT] = GEN6_FORMAT_L8A8_SINT, - [PIPE_FORMAT_A16_UINT] = 
0, - [PIPE_FORMAT_I16_UINT] = 0, - [PIPE_FORMAT_L16_UINT] = 0, - [PIPE_FORMAT_L16A16_UINT] = 0, - [PIPE_FORMAT_A16_SINT] = 0, - [PIPE_FORMAT_I16_SINT] = 0, - [PIPE_FORMAT_L16_SINT] = 0, - [PIPE_FORMAT_L16A16_SINT] = 0, - [PIPE_FORMAT_A32_UINT] = 0, - [PIPE_FORMAT_I32_UINT] = 0, - [PIPE_FORMAT_L32_UINT] = 0, - [PIPE_FORMAT_L32A32_UINT] = 0, - [PIPE_FORMAT_A32_SINT] = 0, - [PIPE_FORMAT_I32_SINT] = 0, - [PIPE_FORMAT_L32_SINT] = 0, - [PIPE_FORMAT_L32A32_SINT] = 0, - [PIPE_FORMAT_B10G10R10A2_UINT] = GEN6_FORMAT_B10G10R10A2_UINT, - [PIPE_FORMAT_ETC1_RGB8] = GEN6_FORMAT_ETC1_RGB8, - [PIPE_FORMAT_R8G8_R8B8_UNORM] = 0, - [PIPE_FORMAT_G8R8_B8R8_UNORM] = 0, - [PIPE_FORMAT_R8G8B8X8_SNORM] = 0, - [PIPE_FORMAT_R8G8B8X8_SRGB] = 0, - [PIPE_FORMAT_R8G8B8X8_UINT] = 0, - [PIPE_FORMAT_R8G8B8X8_SINT] = 0, - [PIPE_FORMAT_B10G10R10X2_UNORM] = GEN6_FORMAT_B10G10R10X2_UNORM, - [PIPE_FORMAT_R16G16B16X16_UNORM] = GEN6_FORMAT_R16G16B16X16_UNORM, - [PIPE_FORMAT_R16G16B16X16_SNORM] = 0, - [PIPE_FORMAT_R16G16B16X16_FLOAT] = GEN6_FORMAT_R16G16B16X16_FLOAT, - [PIPE_FORMAT_R16G16B16X16_UINT] = 0, - [PIPE_FORMAT_R16G16B16X16_SINT] = 0, - [PIPE_FORMAT_R32G32B32X32_FLOAT] = GEN6_FORMAT_R32G32B32X32_FLOAT, - [PIPE_FORMAT_R32G32B32X32_UINT] = 0, - [PIPE_FORMAT_R32G32B32X32_SINT] = 0, - [PIPE_FORMAT_R8A8_SNORM] = 0, - [PIPE_FORMAT_R16A16_UNORM] = 0, - [PIPE_FORMAT_R16A16_SNORM] = 0, - [PIPE_FORMAT_R16A16_FLOAT] = 0, - [PIPE_FORMAT_R32A32_FLOAT] = 0, - [PIPE_FORMAT_R8A8_UINT] = 0, - [PIPE_FORMAT_R8A8_SINT] = 0, - [PIPE_FORMAT_R16A16_UINT] = 0, - [PIPE_FORMAT_R16A16_SINT] = 0, - [PIPE_FORMAT_R32A32_UINT] = 0, - [PIPE_FORMAT_R32A32_SINT] = 0, - [PIPE_FORMAT_R10G10B10A2_UINT] = GEN6_FORMAT_R10G10B10A2_UINT, - [PIPE_FORMAT_B5G6R5_SRGB] = GEN6_FORMAT_B5G6R5_UNORM_SRGB, - }; - int sfmt = format_mapping[format]; - - /* GEN6_FORMAT_R32G32B32A32_FLOAT happens to be 0 */ - if (!sfmt && format != PIPE_FORMAT_R32G32B32A32_FLOAT) - sfmt = -1; - - return sfmt; -} diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c index 22c8ef2620a..0d837d8a9d5 100644 --- a/src/gallium/drivers/ilo/core/ilo_image.c +++ b/src/gallium/drivers/ilo/core/ilo_image.c @@ -675,9 +675,12 @@ img_init_size_and_format(struct ilo_image *img, enum pipe_format format = templ->format; bool require_separate_stencil = false; + img->target = templ->target; img->width0 = templ->width0; img->height0 = templ->height0; img->depth0 = templ->depth0; + img->array_size = templ->array_size; + img->level_count = templ->last_level + 1; img->sample_count = (templ->nr_samples) ? 
templ->nr_samples : 1; /* @@ -794,6 +797,10 @@ img_want_hiz(const struct ilo_image *img, if (ilo_debug & ILO_DEBUG_NOHIZ) return false; + /* we want 8x4 aligned levels */ + if (templ->target == PIPE_TEXTURE_1D) + return false; + if (!(templ->bind & PIPE_BIND_DEPTH_STENCIL)) return false; @@ -1343,9 +1350,12 @@ img_init_for_transfer(struct ilo_image *img, img->aux.type = ILO_IMAGE_AUX_NONE; + img->target = templ->target; img->width0 = templ->width0; img->height0 = templ->height0; img->depth0 = templ->depth0; + img->array_size = templ->array_size; + img->level_count = 1; img->sample_count = 1; img->format = templ->format; @@ -1386,6 +1396,8 @@ void ilo_image_init(struct ilo_image *img, struct ilo_image_params params; bool transfer_only; + assert(ilo_is_zeroed(img, sizeof(*img))); + /* use transfer layout when the texture is never bound to GPU */ transfer_only = !(templ->bind & ~(PIPE_BIND_TRANSFER_WRITE | PIPE_BIND_TRANSFER_READ)); @@ -1411,6 +1423,8 @@ ilo_image_init_for_imported(struct ilo_image *img, { struct ilo_image_params params; + assert(ilo_is_zeroed(img, sizeof(*img))); + if ((tiling == GEN6_TILING_X && bo_stride % 512) || (tiling == GEN6_TILING_Y && bo_stride % 128) || (tiling == GEN8_TILING_W && bo_stride % 64)) @@ -1435,3 +1449,22 @@ ilo_image_init_for_imported(struct ilo_image *img, return true; } + +bool +ilo_image_disable_aux(struct ilo_image *img, const struct ilo_dev *dev) +{ + /* HiZ is required for separate stencil on Gen6 */ + if (ilo_dev_gen(dev) == ILO_GEN(6) && + img->aux.type == ILO_IMAGE_AUX_HIZ && + img->separate_stencil) + return false; + + /* MCS is required for multisample images */ + if (img->aux.type == ILO_IMAGE_AUX_MCS && + img->sample_count > 1) + return false; + + img->aux.enables = 0x0; + + return true; +} diff --git a/src/gallium/drivers/ilo/core/ilo_image.h b/src/gallium/drivers/ilo/core/ilo_image.h index 4956bdae2ee..af15e856028 100644 --- a/src/gallium/drivers/ilo/core/ilo_image.h +++ b/src/gallium/drivers/ilo/core/ilo_image.h @@ -88,10 +88,14 @@ struct ilo_image_lod { * Texture layout. 
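The new ilo_image_disable_aux() above gives callers a way to back out of an aux surface when they cannot (or choose not to) allocate the aux bo, refusing only where the surface is architecturally required: HiZ for separate stencil on Gen6, MCS for multisampled images. A hypothetical call site, with aux_bo standing in for whatever the owner tried to allocate:

   /* hypothetical caller: fall back to no aux surface on allocation failure */
   if (img->aux.type != ILO_IMAGE_AUX_NONE && !aux_bo) {
      if (!ilo_image_disable_aux(img, dev))
         return false;   /* HiZ/MCS is mandatory for this image */
   }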
*/ struct ilo_image { + enum pipe_texture_target target; + /* size, format, etc for programming hardware states */ unsigned width0; unsigned height0; unsigned depth0; + unsigned array_size; + unsigned level_count; unsigned sample_count; enum pipe_format format; bool separate_stencil; @@ -125,8 +129,6 @@ struct ilo_image { bool scanout; - struct intel_bo *bo; - struct { enum ilo_image_aux_type type; @@ -140,8 +142,12 @@ struct ilo_image { unsigned bo_stride; unsigned bo_height; + /* managed by users */ struct intel_bo *bo; } aux; + + /* managed by users */ + struct intel_bo *bo; }; struct pipe_resource; @@ -158,31 +164,13 @@ ilo_image_init_for_imported(struct ilo_image *img, enum gen_surface_tiling tiling, unsigned bo_stride); -static inline void -ilo_image_cleanup(struct ilo_image *img) -{ - intel_bo_unref(img->bo); - intel_bo_unref(img->aux.bo); -} - -static inline void -ilo_image_set_bo(struct ilo_image *img, struct intel_bo *bo) -{ - intel_bo_unref(img->bo); - img->bo = intel_bo_ref(bo); -} - -static inline void -ilo_image_set_aux_bo(struct ilo_image *img, struct intel_bo *bo) -{ - intel_bo_unref(img->aux.bo); - img->aux.bo = intel_bo_ref(bo); -} +bool +ilo_image_disable_aux(struct ilo_image *img, const struct ilo_dev *dev); static inline bool ilo_image_can_enable_aux(const struct ilo_image *img, unsigned level) { - return (img->aux.bo && (img->aux.enables & (1 << level))); + return (img->aux.enables & (1 << level)); } /** diff --git a/src/gallium/drivers/ilo/core/ilo_state_3d.h b/src/gallium/drivers/ilo/core/ilo_state_3d.h deleted file mode 100644 index fdce445f733..00000000000 --- a/src/gallium/drivers/ilo/core/ilo_state_3d.h +++ /dev/null @@ -1,427 +0,0 @@ -/* - * Mesa 3-D graphics library - * - * Copyright (C) 2012-2014 LunarG, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. 
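With ilo_image_cleanup(), ilo_image_set_bo(), and ilo_image_set_aux_bo() gone and both bo fields marked "managed by users", reference ownership moves entirely to the image's owner. A sketch of what that owner now writes in their place, assuming the intel_bo_ref()/intel_bo_unref() semantics from intel_winsys.h and a hypothetical tex wrapper holding the image:

   /* what the removed ilo_image_set_bo() did, now in the owner's code */
   intel_bo_unref(tex->image.bo);
   tex->image.bo = intel_bo_ref(new_bo);

   /* and the removed ilo_image_cleanup(), at destroy time */
   intel_bo_unref(tex->image.bo);
   intel_bo_unref(tex->image.aux.bo);

Consistently, ilo_image_can_enable_aux() above stops checking aux.bo; whether a bo is actually bound is now the caller's invariant.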
- * - * Authors: - * Chia-I Wu <[email protected]> - */ - -#ifndef ILO_STATE_3D_H -#define ILO_STATE_3D_H - -#include "genhw/genhw.h" -#include "pipe/p_state.h" - -#include "ilo_core.h" -#include "ilo_dev.h" - -/** - * \see brw_context.h - */ -#define ILO_MAX_DRAW_BUFFERS 8 -#define ILO_MAX_CONST_BUFFERS (1 + 12) -#define ILO_MAX_SAMPLER_VIEWS 16 -#define ILO_MAX_SAMPLERS 16 -#define ILO_MAX_SO_BINDINGS 64 -#define ILO_MAX_SO_BUFFERS 4 -#define ILO_MAX_VIEWPORTS 1 - -#define ILO_MAX_SURFACES 256 - -struct intel_bo; -struct ilo_buffer; -struct ilo_image; -struct ilo_shader_state; - -struct ilo_vb_state { - struct pipe_vertex_buffer states[PIPE_MAX_ATTRIBS]; - uint32_t enabled_mask; -}; - -struct ilo_ib_state { - struct pipe_resource *buffer; - const void *user_buffer; - unsigned offset; - unsigned index_size; - - /* these are not valid until the state is finalized */ - struct pipe_resource *hw_resource; - unsigned hw_index_size; - /* an offset to be added to pipe_draw_info::start */ - int64_t draw_start_offset; -}; - -struct ilo_ve_cso { - /* VERTEX_ELEMENT_STATE */ - uint32_t payload[2]; -}; - -struct ilo_ve_state { - struct ilo_ve_cso cso[PIPE_MAX_ATTRIBS]; - unsigned count; - - unsigned instance_divisors[PIPE_MAX_ATTRIBS]; - unsigned vb_mapping[PIPE_MAX_ATTRIBS]; - unsigned vb_count; - - /* these are not valid until the state is finalized */ - struct ilo_ve_cso edgeflag_cso; - bool last_cso_edgeflag; - - struct ilo_ve_cso nosrc_cso; - bool prepend_nosrc_cso; -}; - -struct ilo_so_state { - struct pipe_stream_output_target *states[ILO_MAX_SO_BUFFERS]; - unsigned count; - unsigned append_bitmask; - - bool enabled; -}; - -struct ilo_viewport_cso { - /* matrix form */ - float m00, m11, m22, m30, m31, m32; - - /* guardband in NDC space */ - float min_gbx, min_gby, max_gbx, max_gby; - - /* viewport in screen space */ - float min_x, min_y, min_z; - float max_x, max_y, max_z; -}; - -struct ilo_viewport_state { - struct ilo_viewport_cso cso[ILO_MAX_VIEWPORTS]; - unsigned count; - - struct pipe_viewport_state viewport0; -}; - -struct ilo_scissor_state { - /* SCISSOR_RECT */ - uint32_t payload[ILO_MAX_VIEWPORTS * 2]; - - struct pipe_scissor_state scissor0; -}; - -struct ilo_rasterizer_clip { - /* 3DSTATE_CLIP */ - uint32_t payload[3]; - - uint32_t can_enable_guardband; -}; - -struct ilo_rasterizer_sf { - /* 3DSTATE_SF */ - uint32_t payload[3]; - uint32_t dw_msaa; - - /* Global Depth Offset Constant/Scale/Clamp */ - uint32_t dw_depth_offset_const; - uint32_t dw_depth_offset_scale; - uint32_t dw_depth_offset_clamp; - - /* Gen8+ 3DSTATE_RASTER */ - uint32_t dw_raster; -}; - -struct ilo_rasterizer_wm { - /* 3DSTATE_WM */ - uint32_t payload[2]; - uint32_t dw_msaa_rast; - uint32_t dw_msaa_disp; -}; - -struct ilo_rasterizer_state { - struct pipe_rasterizer_state state; - - struct ilo_rasterizer_clip clip; - struct ilo_rasterizer_sf sf; - struct ilo_rasterizer_wm wm; -}; - -struct ilo_dsa_state { - /* DEPTH_STENCIL_STATE or Gen8+ 3DSTATE_WM_DEPTH_STENCIL */ - uint32_t payload[3]; - - uint32_t dw_blend_alpha; - uint32_t dw_ps_blend_alpha; - ubyte alpha_ref; -}; - -struct ilo_blend_cso { - /* BLEND_STATE */ - uint32_t payload[2]; - - uint32_t dw_blend; - uint32_t dw_blend_dst_alpha_forced_one; -}; - -struct ilo_blend_state { - struct ilo_blend_cso cso[ILO_MAX_DRAW_BUFFERS]; - - bool dual_blend; - bool alpha_to_coverage; - - uint32_t dw_shared; - uint32_t dw_alpha_mod; - uint32_t dw_logicop; - - /* a part of 3DSTATE_PS_BLEND */ - uint32_t dw_ps_blend; - uint32_t dw_ps_blend_dst_alpha_forced_one; -}; - -struct 
ilo_sampler_cso { - /* SAMPLER_STATE and SAMPLER_BORDER_COLOR_STATE */ - uint32_t payload[15]; - - uint32_t dw_filter; - uint32_t dw_filter_aniso; - uint32_t dw_wrap; - uint32_t dw_wrap_1d; - uint32_t dw_wrap_cube; - - bool anisotropic; - bool saturate_r; - bool saturate_s; - bool saturate_t; -}; - -struct ilo_sampler_state { - const struct ilo_sampler_cso *cso[ILO_MAX_SAMPLERS]; -}; - -struct ilo_view_surface { - /* SURFACE_STATE */ - uint32_t payload[13]; - struct intel_bo *bo; - - uint32_t scanout; -}; - -struct ilo_view_cso { - struct pipe_sampler_view base; - - struct ilo_view_surface surface; -}; - -struct ilo_view_state { - struct pipe_sampler_view *states[ILO_MAX_SAMPLER_VIEWS]; - unsigned count; -}; - -struct ilo_cbuf_cso { - struct pipe_resource *resource; - struct ilo_view_surface surface; - - /* - * this CSO is not so constant because user buffer needs to be uploaded in - * finalize_constant_buffers() - */ - const void *user_buffer; - unsigned user_buffer_size; -}; - -struct ilo_cbuf_state { - struct ilo_cbuf_cso cso[ILO_MAX_CONST_BUFFERS]; - uint32_t enabled_mask; -}; - -struct ilo_resource_state { - struct pipe_surface *states[PIPE_MAX_SHADER_RESOURCES]; - unsigned count; -}; - -struct ilo_surface_cso { - struct pipe_surface base; - - bool is_rt; - union { - struct ilo_view_surface rt; - struct ilo_zs_surface { - uint32_t payload[12]; - uint32_t dw_aligned_8x4; - - struct intel_bo *bo; - struct intel_bo *hiz_bo; - struct intel_bo *separate_s8_bo; - } zs; - } u; -}; - -struct ilo_fb_state { - struct pipe_framebuffer_state state; - - struct ilo_view_surface null_rt; - struct ilo_zs_surface null_zs; - - struct ilo_fb_blend_caps { - bool can_logicop; - bool can_blend; - bool can_alpha_test; - bool dst_alpha_forced_one; - } blend_caps[PIPE_MAX_COLOR_BUFS]; - - unsigned num_samples; -}; - -struct ilo_shader_cso { - uint32_t payload[5]; -}; - -/** - * Translate a pipe texture target to the matching hardware surface type. 
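A convention running through the removed header: nearly every CSO above carries a payload[] of hardware dwords packed once by the ilo_gpe_init_*() helpers declared below, so draw-time emission degenerates to a copy. A minimal sketch of the consuming side (the cs destination and function name are hypothetical, not part of the removed API):

   static void
   emit_scissor_sketch(uint32_t *cs, const struct ilo_scissor_state *scissor,
                       unsigned num_viewports)
   {
      /* SCISSOR_RECT is two pre-packed dwords per viewport */
      memcpy(cs, scissor->payload, sizeof(uint32_t) * 2 * num_viewports);
   }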
- */ -static inline int -ilo_gpe_gen6_translate_texture(enum pipe_texture_target target) -{ - switch (target) { - case PIPE_BUFFER: - return GEN6_SURFTYPE_BUFFER; - case PIPE_TEXTURE_1D: - case PIPE_TEXTURE_1D_ARRAY: - return GEN6_SURFTYPE_1D; - case PIPE_TEXTURE_2D: - case PIPE_TEXTURE_RECT: - case PIPE_TEXTURE_2D_ARRAY: - return GEN6_SURFTYPE_2D; - case PIPE_TEXTURE_3D: - return GEN6_SURFTYPE_3D; - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_CUBE_ARRAY: - return GEN6_SURFTYPE_CUBE; - default: - assert(!"unknown texture target"); - return GEN6_SURFTYPE_BUFFER; - } -} - -void -ilo_gpe_init_ve(const struct ilo_dev *dev, - unsigned num_states, - const struct pipe_vertex_element *states, - struct ilo_ve_state *ve); - -void -ilo_gpe_set_ve_edgeflag(const struct ilo_dev *dev, - struct ilo_ve_cso *cso); - -void -ilo_gpe_init_ve_nosrc(const struct ilo_dev *dev, - int comp0, int comp1, int comp2, int comp3, - struct ilo_ve_cso *cso); - -void -ilo_gpe_set_viewport_cso(const struct ilo_dev *dev, - const struct pipe_viewport_state *state, - struct ilo_viewport_cso *vp); - -void -ilo_gpe_set_scissor(const struct ilo_dev *dev, - unsigned start_slot, - unsigned num_states, - const struct pipe_scissor_state *states, - struct ilo_scissor_state *scissor); - -void -ilo_gpe_set_scissor_null(const struct ilo_dev *dev, - struct ilo_scissor_state *scissor); - -void -ilo_gpe_init_rasterizer(const struct ilo_dev *dev, - const struct pipe_rasterizer_state *state, - struct ilo_rasterizer_state *rasterizer); -void -ilo_gpe_init_dsa(const struct ilo_dev *dev, - const struct pipe_depth_stencil_alpha_state *state, - struct ilo_dsa_state *dsa); - -void -ilo_gpe_init_blend(const struct ilo_dev *dev, - const struct pipe_blend_state *state, - struct ilo_blend_state *blend); - -void -ilo_gpe_init_sampler_cso(const struct ilo_dev *dev, - const struct pipe_sampler_state *state, - struct ilo_sampler_cso *sampler); - -void -ilo_gpe_init_view_surface_null(const struct ilo_dev *dev, - unsigned width, unsigned height, - unsigned depth, unsigned level, - struct ilo_view_surface *surf); - -void -ilo_gpe_init_view_surface_for_buffer(const struct ilo_dev *dev, - const struct ilo_buffer *buf, - unsigned offset, unsigned size, - unsigned struct_size, - enum pipe_format elem_format, - bool is_rt, bool render_cache_rw, - struct ilo_view_surface *surf); - -void -ilo_gpe_init_view_surface_for_image(const struct ilo_dev *dev, - const struct ilo_image *img, - enum pipe_texture_target target, - enum pipe_format format, - unsigned first_level, - unsigned num_levels, - unsigned first_layer, - unsigned num_layers, - bool is_rt, - struct ilo_view_surface *surf); - -void -ilo_gpe_init_zs_surface(const struct ilo_dev *dev, - const struct ilo_image *img, - const struct ilo_image *s8_img, - enum pipe_texture_target target, - enum pipe_format format, unsigned level, - unsigned first_layer, unsigned num_layers, - struct ilo_zs_surface *zs); - -void -ilo_gpe_init_vs_cso(const struct ilo_dev *dev, - const struct ilo_shader_state *vs, - struct ilo_shader_cso *cso); - -void -ilo_gpe_init_gs_cso(const struct ilo_dev *dev, - const struct ilo_shader_state *gs, - struct ilo_shader_cso *cso); - -void -ilo_gpe_init_fs_cso(const struct ilo_dev *dev, - const struct ilo_shader_state *fs, - struct ilo_shader_cso *cso); - -void -ilo_gpe_set_fb(const struct ilo_dev *dev, - const struct pipe_framebuffer_state *state, - struct ilo_fb_state *fb); - -#endif /* ILO_STATE_3D_H */ diff --git a/src/gallium/drivers/ilo/core/ilo_state_3d_bottom.c 
b/src/gallium/drivers/ilo/core/ilo_state_3d_bottom.c deleted file mode 100644 index 5a4c5dde7e7..00000000000 --- a/src/gallium/drivers/ilo/core/ilo_state_3d_bottom.c +++ /dev/null @@ -1,2222 +0,0 @@ -/* - * Mesa 3-D graphics library - * - * Copyright (C) 2012-2014 LunarG, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - * - * Authors: - * Chia-I Wu <[email protected]> - */ - -#include "genhw/genhw.h" -#include "util/u_dual_blend.h" -#include "util/u_framebuffer.h" -#include "util/u_half.h" - -#include "ilo_format.h" -#include "ilo_image.h" -#include "ilo_state_3d.h" -#include "../ilo_shader.h" - -static void -rasterizer_init_clip(const struct ilo_dev *dev, - const struct pipe_rasterizer_state *state, - struct ilo_rasterizer_clip *clip) -{ - uint32_t dw1, dw2, dw3; - - ILO_DEV_ASSERT(dev, 6, 8); - - dw1 = GEN6_CLIP_DW1_STATISTICS; - - if (ilo_dev_gen(dev) >= ILO_GEN(7)) { - /* - * From the Ivy Bridge PRM, volume 2 part 1, page 219: - * - * "Workaround : Due to Hardware issue "EarlyCull" needs to be - * enabled only for the cases where the incoming primitive topology - * into the clipper guaranteed to be Trilist." - * - * What does this mean? 
- */
-      dw1 |= 0 << 19 |
-             GEN7_CLIP_DW1_EARLY_CULL_ENABLE;
-
-      if (ilo_dev_gen(dev) < ILO_GEN(8)) {
-         if (state->front_ccw)
-            dw1 |= GEN7_CLIP_DW1_FRONTWINDING_CCW;
-
-         switch (state->cull_face) {
-         case PIPE_FACE_NONE:
-            dw1 |= GEN7_CLIP_DW1_CULLMODE_NONE;
-            break;
-         case PIPE_FACE_FRONT:
-            dw1 |= GEN7_CLIP_DW1_CULLMODE_FRONT;
-            break;
-         case PIPE_FACE_BACK:
-            dw1 |= GEN7_CLIP_DW1_CULLMODE_BACK;
-            break;
-         case PIPE_FACE_FRONT_AND_BACK:
-            dw1 |= GEN7_CLIP_DW1_CULLMODE_BOTH;
-            break;
-         }
-      }
-   }
-
-   dw2 = GEN6_CLIP_DW2_CLIP_ENABLE |
-         GEN6_CLIP_DW2_XY_TEST_ENABLE |
-         state->clip_plane_enable << GEN6_CLIP_DW2_UCP_CLIP_ENABLES__SHIFT |
-         GEN6_CLIP_DW2_CLIPMODE_NORMAL;
-
-   if (state->clip_halfz)
-      dw2 |= GEN6_CLIP_DW2_APIMODE_D3D;
-   else
-      dw2 |= GEN6_CLIP_DW2_APIMODE_OGL;
-
-   if (ilo_dev_gen(dev) < ILO_GEN(8) && state->depth_clip)
-      dw2 |= GEN6_CLIP_DW2_Z_TEST_ENABLE;
-
-   if (state->flatshade_first) {
-      dw2 |= 0 << GEN6_CLIP_DW2_TRI_PROVOKE__SHIFT |
-             0 << GEN6_CLIP_DW2_LINE_PROVOKE__SHIFT |
-             1 << GEN6_CLIP_DW2_TRIFAN_PROVOKE__SHIFT;
-   }
-   else {
-      dw2 |= 2 << GEN6_CLIP_DW2_TRI_PROVOKE__SHIFT |
-             1 << GEN6_CLIP_DW2_LINE_PROVOKE__SHIFT |
-             2 << GEN6_CLIP_DW2_TRIFAN_PROVOKE__SHIFT;
-   }
-
-   dw3 = 0x1 << GEN6_CLIP_DW3_MIN_POINT_WIDTH__SHIFT |
-         0x7ff << GEN6_CLIP_DW3_MAX_POINT_WIDTH__SHIFT;
-
-   clip->payload[0] = dw1;
-   clip->payload[1] = dw2;
-   clip->payload[2] = dw3;
-
-   clip->can_enable_guardband = true;
-
-   /*
-    * There are several reasons that the guard band test should be disabled
-    *
-    *  - GL wide points (to avoid partially visible objects)
-    *  - GL wide or AA lines (to avoid partially visible objects)
-    */
-   if (state->point_size_per_vertex || state->point_size > 1.0f)
-      clip->can_enable_guardband = false;
-   if (state->line_smooth || state->line_width > 1.0f)
-      clip->can_enable_guardband = false;
-}
-
-static void
-rasterizer_init_sf_depth_offset_gen6(const struct ilo_dev *dev,
-                                     const struct pipe_rasterizer_state *state,
-                                     struct ilo_rasterizer_sf *sf)
-{
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   /*
-    * Scale the constant term.  The minimum representable value used by the
-    * HW is not large enough to be the minimum resolvable difference.
-    */
-   sf->dw_depth_offset_const = fui(state->offset_units * 2.0f);
-   sf->dw_depth_offset_scale = fui(state->offset_scale);
-   sf->dw_depth_offset_clamp = fui(state->offset_clamp);
-}
-
-static void
-rasterizer_init_sf_gen6(const struct ilo_dev *dev,
-                        const struct pipe_rasterizer_state *state,
-                        struct ilo_rasterizer_sf *sf)
-{
-   int line_width, point_width;
-   uint32_t dw1, dw2, dw3;
-
-   ILO_DEV_ASSERT(dev, 6, 7.5);
-
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 248:
-    *
-    *     "This bit (Statistics Enable) should be set whenever clipping is
-    *      enabled and the Statistics Enable bit is set in CLIP_STATE. It
-    *      should be cleared if clipping is disabled or Statistics Enable in
-    *      CLIP_STATE is clear."
-    */
-   dw1 = GEN7_SF_DW1_STATISTICS |
-         GEN7_SF_DW1_VIEWPORT_ENABLE;
-
-   /* XXX GEN6 path seems to work fine for GEN7 */
-   if (false && ilo_dev_gen(dev) >= ILO_GEN(7)) {
-      /*
-       * From the Ivy Bridge PRM, volume 2 part 1, page 258:
-       *
-       *     "This bit (Legacy Global Depth Bias Enable, Global Depth Offset
-       *      Enable Solid , Global Depth Offset Enable Wireframe, and Global
-       *      Depth Offset Enable Point) should be set whenever non zero depth
-       *      bias (Slope, Bias) values are used. Setting this bit may have
-       *      some degradation of performance for some workloads."
- */
-      if (state->offset_tri || state->offset_line || state->offset_point) {
-         /* XXX need to scale offset_const according to the depth format */
-         dw1 |= GEN7_SF_DW1_LEGACY_DEPTH_OFFSET;
-
-         dw1 |= GEN7_SF_DW1_DEPTH_OFFSET_SOLID |
-                GEN7_SF_DW1_DEPTH_OFFSET_WIREFRAME |
-                GEN7_SF_DW1_DEPTH_OFFSET_POINT;
-      }
-   } else {
-      if (state->offset_tri)
-         dw1 |= GEN7_SF_DW1_DEPTH_OFFSET_SOLID;
-      if (state->offset_line)
-         dw1 |= GEN7_SF_DW1_DEPTH_OFFSET_WIREFRAME;
-      if (state->offset_point)
-         dw1 |= GEN7_SF_DW1_DEPTH_OFFSET_POINT;
-   }
-
-   switch (state->fill_front) {
-   case PIPE_POLYGON_MODE_FILL:
-      dw1 |= GEN7_SF_DW1_FRONTFACE_SOLID;
-      break;
-   case PIPE_POLYGON_MODE_LINE:
-      dw1 |= GEN7_SF_DW1_FRONTFACE_WIREFRAME;
-      break;
-   case PIPE_POLYGON_MODE_POINT:
-      dw1 |= GEN7_SF_DW1_FRONTFACE_POINT;
-      break;
-   }
-
-   switch (state->fill_back) {
-   case PIPE_POLYGON_MODE_FILL:
-      dw1 |= GEN7_SF_DW1_BACKFACE_SOLID;
-      break;
-   case PIPE_POLYGON_MODE_LINE:
-      dw1 |= GEN7_SF_DW1_BACKFACE_WIREFRAME;
-      break;
-   case PIPE_POLYGON_MODE_POINT:
-      dw1 |= GEN7_SF_DW1_BACKFACE_POINT;
-      break;
-   }
-
-   if (state->front_ccw)
-      dw1 |= GEN7_SF_DW1_FRONTWINDING_CCW;
-
-   dw2 = 0;
-
-   if (state->line_smooth) {
-      /*
-       * From the Sandy Bridge PRM, volume 2 part 1, page 251:
-       *
-       *     "This field (Anti-aliasing Enable) must be disabled if any of the
-       *      render targets have integer (UINT or SINT) surface format."
-       *
-       * From the Sandy Bridge PRM, volume 2 part 1, page 317:
-       *
-       *     "This field (Hierarchical Depth Buffer Enable) must be disabled
-       *      if Anti-aliasing Enable in 3DSTATE_SF is enabled."
-       *
-       * TODO We do not check those yet.
-       */
-      dw2 |= GEN7_SF_DW2_AA_LINE_ENABLE |
-             GEN7_SF_DW2_AA_LINE_CAP_1_0;
-   }
-
-   switch (state->cull_face) {
-   case PIPE_FACE_NONE:
-      dw2 |= GEN7_SF_DW2_CULLMODE_NONE;
-      break;
-   case PIPE_FACE_FRONT:
-      dw2 |= GEN7_SF_DW2_CULLMODE_FRONT;
-      break;
-   case PIPE_FACE_BACK:
-      dw2 |= GEN7_SF_DW2_CULLMODE_BACK;
-      break;
-   case PIPE_FACE_FRONT_AND_BACK:
-      dw2 |= GEN7_SF_DW2_CULLMODE_BOTH;
-      break;
-   }
-
-   /*
-    * Smooth lines should intersect ceil(line_width) or (ceil(line_width) + 1)
-    * pixels in the minor direction.  We have to make the lines slightly
-    * thicker, 0.5 pixel on both sides, so that that many pixels are
-    * considered to be inside the lines.
-    *
-    * Line width is in U3.7.
- */ - line_width = (int) - ((state->line_width + (float) state->line_smooth) * 128.0f + 0.5f); - line_width = CLAMP(line_width, 0, 1023); - - /* use GIQ rules */ - if (line_width == 128 && !state->line_smooth) - line_width = 0; - - dw2 |= line_width << GEN7_SF_DW2_LINE_WIDTH__SHIFT; - - if (ilo_dev_gen(dev) == ILO_GEN(7.5) && state->line_stipple_enable) - dw2 |= GEN75_SF_DW2_LINE_STIPPLE_ENABLE; - - if (state->scissor) - dw2 |= GEN7_SF_DW2_SCISSOR_ENABLE; - - dw3 = GEN7_SF_DW3_TRUE_AA_LINE_DISTANCE | - GEN7_SF_DW3_SUBPIXEL_8BITS; - - if (state->line_last_pixel) - dw3 |= GEN7_SF_DW3_LINE_LAST_PIXEL_ENABLE; - - if (state->flatshade_first) { - dw3 |= 0 << GEN7_SF_DW3_TRI_PROVOKE__SHIFT | - 0 << GEN7_SF_DW3_LINE_PROVOKE__SHIFT | - 1 << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT; - } else { - dw3 |= 2 << GEN7_SF_DW3_TRI_PROVOKE__SHIFT | - 1 << GEN7_SF_DW3_LINE_PROVOKE__SHIFT | - 2 << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT; - } - - if (!state->point_size_per_vertex) - dw3 |= GEN7_SF_DW3_USE_POINT_WIDTH; - - /* in U8.3 */ - point_width = (int) (state->point_size * 8.0f + 0.5f); - point_width = CLAMP(point_width, 1, 2047); - - dw3 |= point_width; - - STATIC_ASSERT(Elements(sf->payload) >= 3); - sf->payload[0] = dw1; - sf->payload[1] = dw2; - sf->payload[2] = dw3; - - if (state->multisample) { - sf->dw_msaa = GEN7_SF_DW2_MSRASTMODE_ON_PATTERN; - - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 251: - * - * "Software must not program a value of 0.0 when running in - * MSRASTMODE_ON_xxx modes - zero-width lines are not available - * when multisampling rasterization is enabled." - */ - if (!line_width) { - line_width = 128; /* 1.0f */ - - sf->dw_msaa |= line_width << GEN7_SF_DW2_LINE_WIDTH__SHIFT; - } - } else { - sf->dw_msaa = 0; - } - - rasterizer_init_sf_depth_offset_gen6(dev, state, sf); - /* 3DSTATE_RASTER is Gen8+ only */ - sf->dw_raster = 0; -} - -static uint32_t -rasterizer_get_sf_raster_gen8(const struct ilo_dev *dev, - const struct pipe_rasterizer_state *state) -{ - uint32_t dw = 0; - - ILO_DEV_ASSERT(dev, 8, 8); - - if (state->front_ccw) - dw |= GEN8_RASTER_DW1_FRONTWINDING_CCW; - - switch (state->cull_face) { - case PIPE_FACE_NONE: - dw |= GEN8_RASTER_DW1_CULLMODE_NONE; - break; - case PIPE_FACE_FRONT: - dw |= GEN8_RASTER_DW1_CULLMODE_FRONT; - break; - case PIPE_FACE_BACK: - dw |= GEN8_RASTER_DW1_CULLMODE_BACK; - break; - case PIPE_FACE_FRONT_AND_BACK: - dw |= GEN8_RASTER_DW1_CULLMODE_BOTH; - break; - } - - if (state->point_smooth) - dw |= GEN8_RASTER_DW1_SMOOTH_POINT_ENABLE; - - if (state->multisample) - dw |= GEN8_RASTER_DW1_API_MULTISAMPLE_ENABLE; - - if (state->offset_tri) - dw|= GEN8_RASTER_DW1_DEPTH_OFFSET_SOLID; - if (state->offset_line) - dw|= GEN8_RASTER_DW1_DEPTH_OFFSET_WIREFRAME; - if (state->offset_point) - dw|= GEN8_RASTER_DW1_DEPTH_OFFSET_POINT; - - switch (state->fill_front) { - case PIPE_POLYGON_MODE_FILL: - dw |= GEN8_RASTER_DW1_FRONTFACE_SOLID; - break; - case PIPE_POLYGON_MODE_LINE: - dw |= GEN8_RASTER_DW1_FRONTFACE_WIREFRAME; - break; - case PIPE_POLYGON_MODE_POINT: - dw |= GEN8_RASTER_DW1_FRONTFACE_POINT; - break; - } - - switch (state->fill_back) { - case PIPE_POLYGON_MODE_FILL: - dw |= GEN8_RASTER_DW1_BACKFACE_SOLID; - break; - case PIPE_POLYGON_MODE_LINE: - dw |= GEN8_RASTER_DW1_BACKFACE_WIREFRAME; - break; - case PIPE_POLYGON_MODE_POINT: - dw |= GEN8_RASTER_DW1_BACKFACE_POINT; - break; - } - - if (state->line_smooth) - dw |= GEN8_RASTER_DW1_AA_LINE_ENABLE; - - if (state->scissor) - dw |= GEN8_RASTER_DW1_SCISSOR_ENABLE; - - if (state->depth_clip) - dw |= 
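The width conversions above are plain unsigned fixed-point: line width is U3.7 (1.0f encodes as 128, and the 10-bit field caps at 1023) while point width is U8.3 (1.0f encodes as 8, clamped to [1, 2047]). The same arithmetic isolated as helpers, for clarity only:

   static int
   line_width_u3_7(float width)   /* U3.7: scale by 2^7 and round */
   {
      return CLAMP((int) (width * 128.0f + 0.5f), 0, 1023);
   }

   static int
   point_width_u8_3(float size)   /* U8.3: scale by 2^3 and round */
   {
      return CLAMP((int) (size * 8.0f + 0.5f), 1, 2047);
   }

The 0.5-pixel-per-side widening for smooth lines and the GIQ special case (an exact 1.0f width encodes as 0) sit on top of this in the callers above.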
GEN8_RASTER_DW1_Z_TEST_ENABLE; - - return dw; -} - -static void -rasterizer_init_sf_gen8(const struct ilo_dev *dev, - const struct pipe_rasterizer_state *state, - struct ilo_rasterizer_sf *sf) -{ - int line_width, point_width; - uint32_t dw1, dw2, dw3; - - ILO_DEV_ASSERT(dev, 8, 8); - - /* in U3.7 */ - line_width = (int) - ((state->line_width + (float) state->line_smooth) * 128.0f + 0.5f); - line_width = CLAMP(line_width, 0, 1023); - - /* use GIQ rules */ - if (line_width == 128 && !state->line_smooth) - line_width = 0; - - /* in U8.3 */ - point_width = (int) (state->point_size * 8.0f + 0.5f); - point_width = CLAMP(point_width, 1, 2047); - - dw1 = GEN7_SF_DW1_STATISTICS | - GEN7_SF_DW1_VIEWPORT_ENABLE; - - dw2 = line_width << GEN7_SF_DW2_LINE_WIDTH__SHIFT; - if (state->line_smooth) - dw2 |= GEN7_SF_DW2_AA_LINE_CAP_1_0; - - dw3 = GEN7_SF_DW3_TRUE_AA_LINE_DISTANCE | - GEN7_SF_DW3_SUBPIXEL_8BITS | - point_width; - - if (state->line_last_pixel) - dw3 |= GEN7_SF_DW3_LINE_LAST_PIXEL_ENABLE; - - if (state->flatshade_first) { - dw3 |= 0 << GEN7_SF_DW3_TRI_PROVOKE__SHIFT | - 0 << GEN7_SF_DW3_LINE_PROVOKE__SHIFT | - 1 << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT; - } else { - dw3 |= 2 << GEN7_SF_DW3_TRI_PROVOKE__SHIFT | - 1 << GEN7_SF_DW3_LINE_PROVOKE__SHIFT | - 2 << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT; - } - - if (!state->point_size_per_vertex) - dw3 |= GEN7_SF_DW3_USE_POINT_WIDTH; - - dw3 |= point_width; - - STATIC_ASSERT(Elements(sf->payload) >= 3); - sf->payload[0] = dw1; - sf->payload[1] = dw2; - sf->payload[2] = dw3; - - rasterizer_init_sf_depth_offset_gen6(dev, state, sf); - - sf->dw_msaa = 0; - sf->dw_raster = rasterizer_get_sf_raster_gen8(dev, state); -} - -static void -rasterizer_init_wm_gen6(const struct ilo_dev *dev, - const struct pipe_rasterizer_state *state, - struct ilo_rasterizer_wm *wm) -{ - uint32_t dw5, dw6; - - ILO_DEV_ASSERT(dev, 6, 6); - - /* only the FF unit states are set, as in GEN7 */ - - dw5 = GEN6_WM_DW5_AA_LINE_WIDTH_2_0; - - /* same value as in 3DSTATE_SF */ - if (state->line_smooth) - dw5 |= GEN6_WM_DW5_AA_LINE_CAP_1_0; - - if (state->poly_stipple_enable) - dw5 |= GEN6_WM_DW5_POLY_STIPPLE_ENABLE; - if (state->line_stipple_enable) - dw5 |= GEN6_WM_DW5_LINE_STIPPLE_ENABLE; - - /* - * assertion that makes sure - * - * dw6 |= wm->dw_msaa_rast | wm->dw_msaa_disp; - * - * is valid - */ - STATIC_ASSERT(GEN6_WM_DW6_MSRASTMODE_OFF_PIXEL == 0 && - GEN6_WM_DW6_MSDISPMODE_PERSAMPLE == 0); - dw6 = GEN6_WM_DW6_ZW_INTERP_PIXEL; - - if (state->bottom_edge_rule) - dw6 |= GEN6_WM_DW6_POINT_RASTRULE_UPPER_RIGHT; - - wm->dw_msaa_rast = - (state->multisample) ? 
GEN6_WM_DW6_MSRASTMODE_ON_PATTERN : 0; - wm->dw_msaa_disp = GEN6_WM_DW6_MSDISPMODE_PERPIXEL; - - STATIC_ASSERT(Elements(wm->payload) >= 2); - wm->payload[0] = dw5; - wm->payload[1] = dw6; -} - -static void -rasterizer_init_wm_gen7(const struct ilo_dev *dev, - const struct pipe_rasterizer_state *state, - struct ilo_rasterizer_wm *wm) -{ - uint32_t dw1, dw2; - - ILO_DEV_ASSERT(dev, 7, 7.5); - - /* - * assertion that makes sure - * - * dw1 |= wm->dw_msaa_rast; - * dw2 |= wm->dw_msaa_disp; - * - * is valid - */ - STATIC_ASSERT(GEN7_WM_DW1_MSRASTMODE_OFF_PIXEL == 0 && - GEN7_WM_DW2_MSDISPMODE_PERSAMPLE == 0); - dw1 = GEN7_WM_DW1_ZW_INTERP_PIXEL | - GEN7_WM_DW1_AA_LINE_WIDTH_2_0; - dw2 = 0; - - /* same value as in 3DSTATE_SF */ - if (state->line_smooth) - dw1 |= GEN7_WM_DW1_AA_LINE_CAP_1_0; - - if (state->poly_stipple_enable) - dw1 |= GEN7_WM_DW1_POLY_STIPPLE_ENABLE; - if (state->line_stipple_enable) - dw1 |= GEN7_WM_DW1_LINE_STIPPLE_ENABLE; - - if (state->bottom_edge_rule) - dw1 |= GEN7_WM_DW1_POINT_RASTRULE_UPPER_RIGHT; - - wm->dw_msaa_rast = - (state->multisample) ? GEN7_WM_DW1_MSRASTMODE_ON_PATTERN : 0; - wm->dw_msaa_disp = GEN7_WM_DW2_MSDISPMODE_PERPIXEL; - - STATIC_ASSERT(Elements(wm->payload) >= 2); - wm->payload[0] = dw1; - wm->payload[1] = dw2; -} - -static uint32_t -rasterizer_get_wm_gen8(const struct ilo_dev *dev, - const struct pipe_rasterizer_state *state) -{ - uint32_t dw; - - ILO_DEV_ASSERT(dev, 8, 8); - - dw = GEN7_WM_DW1_ZW_INTERP_PIXEL | - GEN7_WM_DW1_AA_LINE_WIDTH_2_0; - - /* same value as in 3DSTATE_SF */ - if (state->line_smooth) - dw |= GEN7_WM_DW1_AA_LINE_CAP_1_0; - - if (state->poly_stipple_enable) - dw |= GEN7_WM_DW1_POLY_STIPPLE_ENABLE; - if (state->line_stipple_enable) - dw |= GEN7_WM_DW1_LINE_STIPPLE_ENABLE; - - if (state->bottom_edge_rule) - dw |= GEN7_WM_DW1_POINT_RASTRULE_UPPER_RIGHT; - - return dw; -} - -void -ilo_gpe_init_rasterizer(const struct ilo_dev *dev, - const struct pipe_rasterizer_state *state, - struct ilo_rasterizer_state *rasterizer) -{ - rasterizer_init_clip(dev, state, &rasterizer->clip); - - if (ilo_dev_gen(dev) >= ILO_GEN(8)) { - memset(&rasterizer->wm, 0, sizeof(rasterizer->wm)); - rasterizer->wm.payload[0] = rasterizer_get_wm_gen8(dev, state); - - rasterizer_init_sf_gen8(dev, state, &rasterizer->sf); - } else if (ilo_dev_gen(dev) >= ILO_GEN(7)) { - rasterizer_init_wm_gen7(dev, state, &rasterizer->wm); - rasterizer_init_sf_gen6(dev, state, &rasterizer->sf); - } else { - rasterizer_init_wm_gen6(dev, state, &rasterizer->wm); - rasterizer_init_sf_gen6(dev, state, &rasterizer->sf); - } -} - -static void -fs_init_cso_gen6(const struct ilo_dev *dev, - const struct ilo_shader_state *fs, - struct ilo_shader_cso *cso) -{ - int start_grf, input_count, sampler_count, interps, max_threads; - uint32_t dw2, dw4, dw5, dw6; - - ILO_DEV_ASSERT(dev, 6, 6); - - start_grf = ilo_shader_get_kernel_param(fs, ILO_KERNEL_URB_DATA_START_REG); - input_count = ilo_shader_get_kernel_param(fs, ILO_KERNEL_INPUT_COUNT); - sampler_count = ilo_shader_get_kernel_param(fs, ILO_KERNEL_SAMPLER_COUNT); - interps = ilo_shader_get_kernel_param(fs, - ILO_KERNEL_FS_BARYCENTRIC_INTERPOLATIONS); - - /* see brwCreateContext() */ - max_threads = (dev->gt == 2) ? 80 : 40; - - dw2 = (true) ? 
0 : GEN6_THREADDISP_FP_MODE_ALT; - dw2 |= ((sampler_count + 3) / 4) << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT; - - dw4 = start_grf << GEN6_WM_DW4_URB_GRF_START0__SHIFT | - 0 << GEN6_WM_DW4_URB_GRF_START1__SHIFT | - 0 << GEN6_WM_DW4_URB_GRF_START2__SHIFT; - - dw5 = (max_threads - 1) << GEN6_WM_DW5_MAX_THREADS__SHIFT; - - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 275: - * - * "This bit (Pixel Shader Kill Pixel), if ENABLED, indicates that the - * PS kernel or color calculator has the ability to kill (discard) - * pixels or samples, other than due to depth or stencil testing. - * This bit is required to be ENABLED in the following situations: - * - * The API pixel shader program contains "killpix" or "discard" - * instructions, or other code in the pixel shader kernel that can - * cause the final pixel mask to differ from the pixel mask received - * on dispatch. - * - * A sampler with chroma key enabled with kill pixel mode is used by - * the pixel shader. - * - * Any render target has Alpha Test Enable or AlphaToCoverage Enable - * enabled. - * - * The pixel shader kernel generates and outputs oMask. - * - * Note: As ClipDistance clipping is fully supported in hardware and - * therefore not via PS instructions, there should be no need to - * ENABLE this bit due to ClipDistance clipping." - */ - if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_USE_KILL)) - dw5 |= GEN6_WM_DW5_PS_KILL_PIXEL; - - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 275: - * - * "If a NULL Depth Buffer is selected, the Pixel Shader Computed Depth - * field must be set to disabled." - * - * TODO This is not checked yet. - */ - if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_OUTPUT_Z)) - dw5 |= GEN6_WM_DW5_PS_COMPUTE_DEPTH; - - if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_INPUT_Z)) - dw5 |= GEN6_WM_DW5_PS_USE_DEPTH; - - if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_INPUT_W)) - dw5 |= GEN6_WM_DW5_PS_USE_W; - - /* - * TODO set this bit only when - * - * a) fs writes colors and color is not masked, or - * b) fs writes depth, or - * c) fs or cc kills - */ - if (true) - dw5 |= GEN6_WM_DW5_PS_DISPATCH_ENABLE; - - assert(!ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_DISPATCH_16_OFFSET)); - dw5 |= GEN6_PS_DISPATCH_8 << GEN6_WM_DW5_PS_DISPATCH_MODE__SHIFT; - - dw6 = input_count << GEN6_WM_DW6_SF_ATTR_COUNT__SHIFT | - GEN6_WM_DW6_PS_POSOFFSET_NONE | - interps << GEN6_WM_DW6_BARYCENTRIC_INTERP__SHIFT; - - STATIC_ASSERT(Elements(cso->payload) >= 4); - cso->payload[0] = dw2; - cso->payload[1] = dw4; - cso->payload[2] = dw5; - cso->payload[3] = dw6; -} - -static uint32_t -fs_get_wm_gen7(const struct ilo_dev *dev, - const struct ilo_shader_state *fs) -{ - uint32_t dw; - - ILO_DEV_ASSERT(dev, 7, 7.5); - - dw = ilo_shader_get_kernel_param(fs, - ILO_KERNEL_FS_BARYCENTRIC_INTERPOLATIONS) << - GEN7_WM_DW1_BARYCENTRIC_INTERP__SHIFT; - - /* - * TODO set this bit only when - * - * a) fs writes colors and color is not masked, or - * b) fs writes depth, or - * c) fs or cc kills - */ - dw |= GEN7_WM_DW1_PS_DISPATCH_ENABLE; - - /* - * From the Ivy Bridge PRM, volume 2 part 1, page 278: - * - * "This bit (Pixel Shader Kill Pixel), if ENABLED, indicates that - * the PS kernel or color calculator has the ability to kill - * (discard) pixels or samples, other than due to depth or stencil - * testing. 
This bit is required to be ENABLED in the following - * situations: - * - * - The API pixel shader program contains "killpix" or "discard" - * instructions, or other code in the pixel shader kernel that - * can cause the final pixel mask to differ from the pixel mask - * received on dispatch. - * - * - A sampler with chroma key enabled with kill pixel mode is used - * by the pixel shader. - * - * - Any render target has Alpha Test Enable or AlphaToCoverage - * Enable enabled. - * - * - The pixel shader kernel generates and outputs oMask. - * - * Note: As ClipDistance clipping is fully supported in hardware - * and therefore not via PS instructions, there should be no need - * to ENABLE this bit due to ClipDistance clipping." - */ - if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_USE_KILL)) - dw |= GEN7_WM_DW1_PS_KILL_PIXEL; - - if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_OUTPUT_Z)) - dw |= GEN7_WM_DW1_PSCDEPTH_ON; - - if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_INPUT_Z)) - dw |= GEN7_WM_DW1_PS_USE_DEPTH; - - if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_INPUT_W)) - dw |= GEN7_WM_DW1_PS_USE_W; - - return dw; -} - -static void -fs_init_cso_gen7(const struct ilo_dev *dev, - const struct ilo_shader_state *fs, - struct ilo_shader_cso *cso) -{ - int start_grf, sampler_count, max_threads; - uint32_t dw2, dw4, dw5; - - ILO_DEV_ASSERT(dev, 7, 7.5); - - start_grf = ilo_shader_get_kernel_param(fs, ILO_KERNEL_URB_DATA_START_REG); - sampler_count = ilo_shader_get_kernel_param(fs, ILO_KERNEL_SAMPLER_COUNT); - - dw2 = (true) ? 0 : GEN6_THREADDISP_FP_MODE_ALT; - dw2 |= ((sampler_count + 3) / 4) << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT; - - dw4 = GEN7_PS_DW4_POSOFFSET_NONE; - - /* see brwCreateContext() */ - switch (ilo_dev_gen(dev)) { - case ILO_GEN(7.5): - max_threads = (dev->gt == 3) ? 408 : (dev->gt == 2) ? 204 : 102; - dw4 |= (max_threads - 1) << GEN75_PS_DW4_MAX_THREADS__SHIFT; - dw4 |= 1 << GEN75_PS_DW4_SAMPLE_MASK__SHIFT; - break; - case ILO_GEN(7): - default: - max_threads = (dev->gt == 2) ? 
172 : 48; - dw4 |= (max_threads - 1) << GEN7_PS_DW4_MAX_THREADS__SHIFT; - break; - } - - if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_PCB_CBUF0_SIZE)) - dw4 |= GEN7_PS_DW4_PUSH_CONSTANT_ENABLE; - - if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_INPUT_COUNT)) - dw4 |= GEN7_PS_DW4_ATTR_ENABLE; - - assert(!ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_DISPATCH_16_OFFSET)); - dw4 |= GEN6_PS_DISPATCH_8 << GEN7_PS_DW4_DISPATCH_MODE__SHIFT; - - dw5 = start_grf << GEN7_PS_DW5_URB_GRF_START0__SHIFT | - 0 << GEN7_PS_DW5_URB_GRF_START1__SHIFT | - 0 << GEN7_PS_DW5_URB_GRF_START2__SHIFT; - - STATIC_ASSERT(Elements(cso->payload) >= 4); - cso->payload[0] = dw2; - cso->payload[1] = dw4; - cso->payload[2] = dw5; - cso->payload[3] = fs_get_wm_gen7(dev, fs); -} - -static uint32_t -fs_get_psx_gen8(const struct ilo_dev *dev, - const struct ilo_shader_state *fs) -{ - uint32_t dw; - - ILO_DEV_ASSERT(dev, 8, 8); - - dw = GEN8_PSX_DW1_DISPATCH_ENABLE; - - if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_USE_KILL)) - dw |= GEN8_PSX_DW1_KILL_PIXEL; - if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_OUTPUT_Z)) - dw |= GEN8_PSX_DW1_PSCDEPTH_ON; - if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_INPUT_Z)) - dw |= GEN8_PSX_DW1_USE_DEPTH; - if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_INPUT_W)) - dw |= GEN8_PSX_DW1_USE_W; - if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_INPUT_COUNT)) - dw |= GEN8_PSX_DW1_ATTR_ENABLE; - - return dw; -} - -static uint32_t -fs_get_wm_gen8(const struct ilo_dev *dev, - const struct ilo_shader_state *fs) -{ - ILO_DEV_ASSERT(dev, 8, 8); - - return ilo_shader_get_kernel_param(fs, - ILO_KERNEL_FS_BARYCENTRIC_INTERPOLATIONS) << - GEN7_WM_DW1_BARYCENTRIC_INTERP__SHIFT; -} - -static void -fs_init_cso_gen8(const struct ilo_dev *dev, - const struct ilo_shader_state *fs, - struct ilo_shader_cso *cso) -{ - int start_grf, sampler_count; - uint32_t dw3, dw6, dw7; - - ILO_DEV_ASSERT(dev, 8, 8); - - start_grf = ilo_shader_get_kernel_param(fs, ILO_KERNEL_URB_DATA_START_REG); - sampler_count = ilo_shader_get_kernel_param(fs, ILO_KERNEL_SAMPLER_COUNT); - - dw3 = (true) ? 0 : GEN6_THREADDISP_FP_MODE_ALT; - dw3 |= ((sampler_count + 3) / 4) << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT; - - /* always 64? 
*/ - dw6 = (64 - 2) << GEN8_PS_DW6_MAX_THREADS__SHIFT | - GEN8_PS_DW6_POSOFFSET_NONE; - if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_PCB_CBUF0_SIZE)) - dw6 |= GEN8_PS_DW6_PUSH_CONSTANT_ENABLE; - - assert(!ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_DISPATCH_16_OFFSET)); - dw6 |= GEN6_PS_DISPATCH_8 << GEN8_PS_DW6_DISPATCH_MODE__SHIFT; - - dw7 = start_grf << GEN8_PS_DW7_URB_GRF_START0__SHIFT | - 0 << GEN8_PS_DW7_URB_GRF_START1__SHIFT | - 0 << GEN8_PS_DW7_URB_GRF_START2__SHIFT; - - STATIC_ASSERT(Elements(cso->payload) >= 5); - cso->payload[0] = dw3; - cso->payload[1] = dw6; - cso->payload[2] = dw7; - cso->payload[3] = fs_get_psx_gen8(dev, fs); - cso->payload[4] = fs_get_wm_gen8(dev, fs); -} - -void -ilo_gpe_init_fs_cso(const struct ilo_dev *dev, - const struct ilo_shader_state *fs, - struct ilo_shader_cso *cso) -{ - if (ilo_dev_gen(dev) >= ILO_GEN(8)) - fs_init_cso_gen8(dev, fs, cso); - else if (ilo_dev_gen(dev) >= ILO_GEN(7)) - fs_init_cso_gen7(dev, fs, cso); - else - fs_init_cso_gen6(dev, fs, cso); -} - -struct ilo_zs_surface_info { - int surface_type; - int format; - - struct { - struct intel_bo *bo; - unsigned stride; - unsigned qpitch; - enum gen_surface_tiling tiling; - uint32_t offset; - } zs, stencil, hiz; - - unsigned width, height, depth; - unsigned lod, first_layer, num_layers; -}; - -static void -zs_init_info_null(const struct ilo_dev *dev, - struct ilo_zs_surface_info *info) -{ - ILO_DEV_ASSERT(dev, 6, 8); - - memset(info, 0, sizeof(*info)); - - info->surface_type = GEN6_SURFTYPE_NULL; - info->format = GEN6_ZFORMAT_D32_FLOAT; - info->width = 1; - info->height = 1; - info->depth = 1; - info->num_layers = 1; -} - -static void -zs_init_info(const struct ilo_dev *dev, - const struct ilo_image *img, - const struct ilo_image *s8_img, - enum pipe_texture_target target, - enum pipe_format format, unsigned level, - unsigned first_layer, unsigned num_layers, - struct ilo_zs_surface_info *info) -{ - bool separate_stencil; - - ILO_DEV_ASSERT(dev, 6, 8); - - memset(info, 0, sizeof(*info)); - - info->surface_type = ilo_gpe_gen6_translate_texture(target); - - if (info->surface_type == GEN6_SURFTYPE_CUBE) { - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 325-326: - * - * "For Other Surfaces (Cube Surfaces): - * This field (Minimum Array Element) is ignored." - * - * "For Other Surfaces (Cube Surfaces): - * This field (Render Target View Extent) is ignored." - * - * As such, we cannot set first_layer and num_layers on cube surfaces. - * To work around that, treat it as a 2D surface. - */ - info->surface_type = GEN6_SURFTYPE_2D; - } - - if (ilo_dev_gen(dev) >= ILO_GEN(7)) { - separate_stencil = true; - } else { - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 317: - * - * "This field (Separate Stencil Buffer Enable) must be set to the - * same value (enabled or disabled) as Hierarchical Depth Buffer - * Enable." - */ - separate_stencil = ilo_image_can_enable_aux(img, level); - } - - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 317: - * - * "If this field (Hierarchical Depth Buffer Enable) is enabled, the - * Surface Format of the depth buffer cannot be - * D32_FLOAT_S8X24_UINT or D24_UNORM_S8_UINT. Use of stencil - * requires the separate stencil buffer." - * - * From the Ironlake PRM, volume 2 part 1, page 330: - * - * "If this field (Separate Stencil Buffer Enable) is disabled, the - * Surface Format of the depth buffer cannot be D24_UNORM_X8_UINT." - * - * There is no similar restriction for GEN6. 
But when D24_UNORM_X8_UINT
-    * is indeed used, the depth values output by the fragment shaders will
-    * be different when read back.
-    *
-    * As for GEN7+, separate_stencil is always true.
-    */
-   switch (format) {
-   case PIPE_FORMAT_Z16_UNORM:
-      info->format = GEN6_ZFORMAT_D16_UNORM;
-      break;
-   case PIPE_FORMAT_Z32_FLOAT:
-      info->format = GEN6_ZFORMAT_D32_FLOAT;
-      break;
-   case PIPE_FORMAT_Z24X8_UNORM:
-   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
-      info->format = (separate_stencil) ?
-         GEN6_ZFORMAT_D24_UNORM_X8_UINT :
-         GEN6_ZFORMAT_D24_UNORM_S8_UINT;
-      break;
-   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
-      info->format = (separate_stencil) ?
-         GEN6_ZFORMAT_D32_FLOAT :
-         GEN6_ZFORMAT_D32_FLOAT_S8X24_UINT;
-      break;
-   case PIPE_FORMAT_S8_UINT:
-      if (separate_stencil) {
-         info->format = GEN6_ZFORMAT_D32_FLOAT;
-         break;
-      }
-      /* fall through */
-   default:
-      assert(!"unsupported depth/stencil format");
-      zs_init_info_null(dev, info);
-      return;
-   }
-
-   if (format != PIPE_FORMAT_S8_UINT) {
-      info->zs.bo = img->bo;
-      info->zs.stride = img->bo_stride;
-
-      assert(img->walk_layer_height % 4 == 0);
-      info->zs.qpitch = img->walk_layer_height / 4;
-
-      info->zs.tiling = img->tiling;
-      info->zs.offset = 0;
-   }
-
-   if (s8_img || format == PIPE_FORMAT_S8_UINT) {
-      info->stencil.bo = s8_img->bo;
-
-      /*
-       * From the Sandy Bridge PRM, volume 2 part 1, page 329:
-       *
-       *     "The pitch must be set to 2x the value computed based on width,
-       *      as the stencil buffer is stored with two rows interleaved."
-       *
-       * For GEN7, we still double the stride because we did not double the
-       * slice widths when initializing the layout.
-       */
-      info->stencil.stride = s8_img->bo_stride * 2;
-
-      assert(s8_img->walk_layer_height % 4 == 0);
-      info->stencil.qpitch = s8_img->walk_layer_height / 4;
-
-      info->stencil.tiling = s8_img->tiling;
-
-      if (ilo_dev_gen(dev) == ILO_GEN(6)) {
-         unsigned x, y;
-
-         assert(s8_img->walk == ILO_IMAGE_WALK_LOD);
-
-         /* offset to the level */
-         ilo_image_get_slice_pos(s8_img, level, 0, &x, &y);
-         ilo_image_pos_to_mem(s8_img, x, y, &x, &y);
-         info->stencil.offset = ilo_image_mem_to_raw(s8_img, x, y);
-      }
-   }
-
-   if (ilo_image_can_enable_aux(img, level)) {
-      info->hiz.bo = img->aux.bo;
-      info->hiz.stride = img->aux.bo_stride;
-
-      assert(img->aux.walk_layer_height % 4 == 0);
-      info->hiz.qpitch = img->aux.walk_layer_height / 4;
-
-      info->hiz.tiling = GEN6_TILING_Y;
-
-      /* offset to the level */
-      if (ilo_dev_gen(dev) == ILO_GEN(6))
-         info->hiz.offset = img->aux.walk_lod_offsets[level];
-   }
-
-   info->width = img->width0;
-   info->height = img->height0;
-   info->depth = (target == PIPE_TEXTURE_3D) ? img->depth0 : num_layers;
-
-   info->lod = level;
-   info->first_layer = first_layer;
-   info->num_layers = num_layers;
-}
-
-void
-ilo_gpe_init_zs_surface(const struct ilo_dev *dev,
-                        const struct ilo_image *img,
-                        const struct ilo_image *s8_img,
-                        enum pipe_texture_target target,
-                        enum pipe_format format, unsigned level,
-                        unsigned first_layer, unsigned num_layers,
-                        struct ilo_zs_surface *zs)
-{
-   const int max_2d_size = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 16384 : 8192;
-   const int max_array_size = (ilo_dev_gen(dev) >= ILO_GEN(7)) ?
2048 : 512; - struct ilo_zs_surface_info info; - uint32_t dw1, dw2, dw3, dw4, dw5, dw6; - int align_w = 8, align_h = 4; - - ILO_DEV_ASSERT(dev, 6, 8); - - if (img) { - zs_init_info(dev, img, s8_img, target, format, - level, first_layer, num_layers, &info); - - switch (img->sample_count) { - case 2: - align_w /= 2; - break; - case 4: - align_w /= 2; - align_h /= 2; - break; - case 8: - align_w /= 4; - align_h /= 2; - break; - case 16: - align_w /= 4; - align_h /= 4; - break; - default: - break; - } - } else { - zs_init_info_null(dev, &info); - } - - switch (info.surface_type) { - case GEN6_SURFTYPE_NULL: - break; - case GEN6_SURFTYPE_1D: - assert(info.width <= max_2d_size && info.height == 1 && - info.depth <= max_array_size); - assert(info.first_layer < max_array_size - 1 && - info.num_layers <= max_array_size); - break; - case GEN6_SURFTYPE_2D: - assert(info.width <= max_2d_size && info.height <= max_2d_size && - info.depth <= max_array_size); - assert(info.first_layer < max_array_size - 1 && - info.num_layers <= max_array_size); - break; - case GEN6_SURFTYPE_3D: - assert(info.width <= 2048 && info.height <= 2048 && info.depth <= 2048); - assert(info.first_layer < 2048 && info.num_layers <= max_array_size); - break; - case GEN6_SURFTYPE_CUBE: - assert(info.width <= max_2d_size && info.height <= max_2d_size && - info.depth == 1); - assert(info.first_layer == 0 && info.num_layers == 1); - assert(info.width == info.height); - break; - default: - assert(!"unexpected depth surface type"); - break; - } - - dw1 = info.surface_type << GEN6_DEPTH_DW1_TYPE__SHIFT | - info.format << GEN6_DEPTH_DW1_FORMAT__SHIFT; - - if (info.zs.bo) { - /* required for GEN6+ */ - assert(info.zs.tiling == GEN6_TILING_Y); - assert(info.zs.stride > 0 && info.zs.stride < 128 * 1024 && - info.zs.stride % 128 == 0); - assert(info.width <= info.zs.stride); - - dw1 |= (info.zs.stride - 1); - dw2 = info.zs.offset; - } else { - dw2 = 0; - } - - if (ilo_dev_gen(dev) >= ILO_GEN(7)) { - if (info.zs.bo) - dw1 |= GEN7_DEPTH_DW1_DEPTH_WRITE_ENABLE; - - if (info.stencil.bo) - dw1 |= GEN7_DEPTH_DW1_STENCIL_WRITE_ENABLE; - - if (info.hiz.bo) - dw1 |= GEN7_DEPTH_DW1_HIZ_ENABLE; - - dw3 = (info.height - 1) << GEN7_DEPTH_DW3_HEIGHT__SHIFT | - (info.width - 1) << GEN7_DEPTH_DW3_WIDTH__SHIFT | - info.lod << GEN7_DEPTH_DW3_LOD__SHIFT; - - zs->dw_aligned_8x4 = - (align(info.height, align_h) - 1) << GEN7_DEPTH_DW3_HEIGHT__SHIFT | - (align(info.width, align_w) - 1) << GEN7_DEPTH_DW3_WIDTH__SHIFT | - info.lod << GEN7_DEPTH_DW3_LOD__SHIFT; - - dw4 = (info.depth - 1) << GEN7_DEPTH_DW4_DEPTH__SHIFT | - info.first_layer << GEN7_DEPTH_DW4_MIN_ARRAY_ELEMENT__SHIFT; - - dw5 = 0; - - dw6 = (info.num_layers - 1) << GEN7_DEPTH_DW6_RT_VIEW_EXTENT__SHIFT; - - if (ilo_dev_gen(dev) >= ILO_GEN(8)) - dw6 |= info.zs.qpitch; - } else { - /* always Y-tiled */ - dw1 |= GEN6_TILING_Y << GEN6_DEPTH_DW1_TILING__SHIFT; - - if (info.hiz.bo) { - dw1 |= GEN6_DEPTH_DW1_HIZ_ENABLE | - GEN6_DEPTH_DW1_SEPARATE_STENCIL; - } - - dw3 = (info.height - 1) << GEN6_DEPTH_DW3_HEIGHT__SHIFT | - (info.width - 1) << GEN6_DEPTH_DW3_WIDTH__SHIFT | - info.lod << GEN6_DEPTH_DW3_LOD__SHIFT | - GEN6_DEPTH_DW3_MIPLAYOUT_BELOW; - - zs->dw_aligned_8x4 = - (align(info.height, align_h) - 1) << GEN6_DEPTH_DW3_HEIGHT__SHIFT | - (align(info.width, align_w) - 1) << GEN6_DEPTH_DW3_WIDTH__SHIFT | - info.lod << GEN6_DEPTH_DW3_LOD__SHIFT | - GEN6_DEPTH_DW3_MIPLAYOUT_BELOW; - - dw4 = (info.depth - 1) << GEN6_DEPTH_DW4_DEPTH__SHIFT | - info.first_layer << GEN6_DEPTH_DW4_MIN_ARRAY_ELEMENT__SHIFT | - 
(info.num_layers - 1) << GEN6_DEPTH_DW4_RT_VIEW_EXTENT__SHIFT;
-
-      dw5 = 0;
-
-      dw6 = 0;
-   }
-
-   STATIC_ASSERT(Elements(zs->payload) >= 12);
-
-   zs->payload[0] = dw1;
-   zs->payload[1] = dw2;
-   zs->payload[2] = dw3;
-   zs->payload[3] = dw4;
-   zs->payload[4] = dw5;
-   zs->payload[5] = dw6;
-
-   /* do not increment reference count */
-   zs->bo = info.zs.bo;
-
-   /* separate stencil */
-   if (info.stencil.bo) {
-      assert(info.stencil.stride > 0 && info.stencil.stride < 128 * 1024 &&
-             info.stencil.stride % 128 == 0);
-
-      dw1 = (info.stencil.stride - 1) << GEN6_STENCIL_DW1_PITCH__SHIFT;
-      if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
-         dw1 |= GEN75_STENCIL_DW1_STENCIL_BUFFER_ENABLE;
-
-      dw2 = info.stencil.offset;
-      dw4 = info.stencil.qpitch;
-   } else {
-      dw1 = 0;
-      dw2 = 0;
-      dw4 = 0;
-   }
-
-   zs->payload[6] = dw1;
-   zs->payload[7] = dw2;
-   zs->payload[8] = dw4;
-   /* do not increment reference count */
-   zs->separate_s8_bo = info.stencil.bo;
-
-   /* hiz */
-   if (info.hiz.bo) {
-      dw1 = (info.hiz.stride - 1) << GEN6_HIZ_DW1_PITCH__SHIFT;
-      dw2 = info.hiz.offset;
-      dw4 = info.hiz.qpitch;
-   } else {
-      dw1 = 0;
-      dw2 = 0;
-      dw4 = 0;
-   }
-
-   zs->payload[9] = dw1;
-   zs->payload[10] = dw2;
-   zs->payload[11] = dw4;
-   /* do not increment reference count */
-   zs->hiz_bo = info.hiz.bo;
-}
-
-static void
-viewport_get_guardband(const struct ilo_dev *dev,
-                       int center_x, int center_y,
-                       int *min_gbx, int *max_gbx,
-                       int *min_gby, int *max_gby)
-{
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 234:
-    *
-    *     "Per-Device Guardband Extents
-    *
-    *       - Supported X,Y ScreenSpace "Guardband" Extent: [-16K,16K-1]
-    *       - Maximum Post-Clamp Delta (X or Y): 16K"
-    *
-    *     "In addition, in order to be correctly rendered, objects must have a
-    *      screenspace bounding box not exceeding 8K in the X or Y direction.
-    *      This additional restriction must also be comprehended by software,
-    *      i.e., enforced by use of clipping."
-    *
-    * From the Ivy Bridge PRM, volume 2 part 1, page 248:
-    *
-    *     "Per-Device Guardband Extents
-    *
-    *       - Supported X,Y ScreenSpace "Guardband" Extent: [-32K,32K-1]
-    *       - Maximum Post-Clamp Delta (X or Y): N/A"
-    *
-    *     "In addition, in order to be correctly rendered, objects must have a
-    *      screenspace bounding box not exceeding 8K in the X or Y direction.
-    *      This additional restriction must also be comprehended by software,
-    *      i.e., enforced by use of clipping."
-    *
-    * Combined, the bounding box of any object cannot exceed 8K in both
-    * width and height.
-    *
-    * Below we set the guardband as a square of side 8K, centered at where
-    * the viewport is.  This makes sure all objects passing the GB test are
-    * valid to the renderer, and those failing the XY clipping have a
-    * better chance of passing the GB test.
-    */
-   const int max_extent = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 32768 : 16384;
-   const int half_len = 8192 / 2;
-
-   /* make sure the guardband is within the valid range */
-   if (center_x - half_len < -max_extent)
-      center_x = -max_extent + half_len;
-   else if (center_x + half_len > max_extent - 1)
-      center_x = max_extent - half_len;
-
-   if (center_y - half_len < -max_extent)
-      center_y = -max_extent + half_len;
-   else if (center_y + half_len > max_extent - 1)
-      center_y = max_extent - half_len;
-
-   *min_gbx = center_x - half_len;
-   *max_gbx = center_x + half_len;
-   *min_gby = center_y - half_len;
-   *max_gby = center_y + half_len;
-}
-
-void
-ilo_gpe_set_viewport_cso(const struct ilo_dev *dev,
-                         const struct pipe_viewport_state *state,
-                         struct ilo_viewport_cso *vp)
-{
-   const float scale_x = fabs(state->scale[0]);
-   const float scale_y = fabs(state->scale[1]);
-   const float scale_z = fabs(state->scale[2]);
-   int min_gbx, max_gbx, min_gby, max_gby;
-
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   viewport_get_guardband(dev,
-         (int) state->translate[0],
-         (int) state->translate[1],
-         &min_gbx, &max_gbx, &min_gby, &max_gby);
-
-   /* matrix form */
-   vp->m00 = state->scale[0];
-   vp->m11 = state->scale[1];
-   vp->m22 = state->scale[2];
-   vp->m30 = state->translate[0];
-   vp->m31 = state->translate[1];
-   vp->m32 = state->translate[2];
-
-   /* guardband in NDC space */
-   vp->min_gbx = ((float) min_gbx - state->translate[0]) / scale_x;
-   vp->max_gbx = ((float) max_gbx - state->translate[0]) / scale_x;
-   vp->min_gby = ((float) min_gby - state->translate[1]) / scale_y;
-   vp->max_gby = ((float) max_gby - state->translate[1]) / scale_y;
-
-   /* viewport in screen space */
-   vp->min_x = scale_x * -1.0f + state->translate[0];
-   vp->max_x = scale_x * 1.0f + state->translate[0];
-   vp->min_y = scale_y * -1.0f + state->translate[1];
-   vp->max_y = scale_y * 1.0f + state->translate[1];
-   vp->min_z = scale_z * -1.0f + state->translate[2];
-   vp->max_z = scale_z * 1.0f + state->translate[2];
-}
-
-/**
- * Translate a pipe logicop to the matching hardware logicop.
- */
-static int
-gen6_translate_pipe_logicop(unsigned logicop)
-{
-   switch (logicop) {
-   case PIPE_LOGICOP_CLEAR:         return GEN6_LOGICOP_CLEAR;
-   case PIPE_LOGICOP_NOR:           return GEN6_LOGICOP_NOR;
-   case PIPE_LOGICOP_AND_INVERTED:  return GEN6_LOGICOP_AND_INVERTED;
-   case PIPE_LOGICOP_COPY_INVERTED: return GEN6_LOGICOP_COPY_INVERTED;
-   case PIPE_LOGICOP_AND_REVERSE:   return GEN6_LOGICOP_AND_REVERSE;
-   case PIPE_LOGICOP_INVERT:        return GEN6_LOGICOP_INVERT;
-   case PIPE_LOGICOP_XOR:           return GEN6_LOGICOP_XOR;
-   case PIPE_LOGICOP_NAND:          return GEN6_LOGICOP_NAND;
-   case PIPE_LOGICOP_AND:           return GEN6_LOGICOP_AND;
-   case PIPE_LOGICOP_EQUIV:         return GEN6_LOGICOP_EQUIV;
-   case PIPE_LOGICOP_NOOP:          return GEN6_LOGICOP_NOOP;
-   case PIPE_LOGICOP_OR_INVERTED:   return GEN6_LOGICOP_OR_INVERTED;
-   case PIPE_LOGICOP_COPY:          return GEN6_LOGICOP_COPY;
-   case PIPE_LOGICOP_OR_REVERSE:    return GEN6_LOGICOP_OR_REVERSE;
-   case PIPE_LOGICOP_OR:            return GEN6_LOGICOP_OR;
-   case PIPE_LOGICOP_SET:           return GEN6_LOGICOP_SET;
-   default:
-      assert(!"unknown logicop function");
-      return GEN6_LOGICOP_CLEAR;
-   }
-}
-
-/**
- * Translate a pipe blend function to the matching hardware blend function.
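- *
- * For example (illustrative, mirroring the switch below):
- *
- *   gen6_translate_pipe_blend(PIPE_BLEND_ADD) == GEN6_BLENDFUNCTION_ADD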
- */
-static int
-gen6_translate_pipe_blend(unsigned blend)
-{
-   switch (blend) {
-   case PIPE_BLEND_ADD:              return GEN6_BLENDFUNCTION_ADD;
-   case PIPE_BLEND_SUBTRACT:         return GEN6_BLENDFUNCTION_SUBTRACT;
-   case PIPE_BLEND_REVERSE_SUBTRACT: return GEN6_BLENDFUNCTION_REVERSE_SUBTRACT;
-   case PIPE_BLEND_MIN:              return GEN6_BLENDFUNCTION_MIN;
-   case PIPE_BLEND_MAX:              return GEN6_BLENDFUNCTION_MAX;
-   default:
-      assert(!"unknown blend function");
-      return GEN6_BLENDFUNCTION_ADD;
-   }
-}
-
-/**
- * Translate a pipe blend factor to the matching hardware blend factor.
- */
-static int
-gen6_translate_pipe_blendfactor(unsigned blendfactor)
-{
-   switch (blendfactor) {
-   case PIPE_BLENDFACTOR_ONE:                return GEN6_BLENDFACTOR_ONE;
-   case PIPE_BLENDFACTOR_SRC_COLOR:          return GEN6_BLENDFACTOR_SRC_COLOR;
-   case PIPE_BLENDFACTOR_SRC_ALPHA:          return GEN6_BLENDFACTOR_SRC_ALPHA;
-   case PIPE_BLENDFACTOR_DST_ALPHA:          return GEN6_BLENDFACTOR_DST_ALPHA;
-   case PIPE_BLENDFACTOR_DST_COLOR:          return GEN6_BLENDFACTOR_DST_COLOR;
-   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: return GEN6_BLENDFACTOR_SRC_ALPHA_SATURATE;
-   case PIPE_BLENDFACTOR_CONST_COLOR:        return GEN6_BLENDFACTOR_CONST_COLOR;
-   case PIPE_BLENDFACTOR_CONST_ALPHA:        return GEN6_BLENDFACTOR_CONST_ALPHA;
-   case PIPE_BLENDFACTOR_SRC1_COLOR:         return GEN6_BLENDFACTOR_SRC1_COLOR;
-   case PIPE_BLENDFACTOR_SRC1_ALPHA:         return GEN6_BLENDFACTOR_SRC1_ALPHA;
-   case PIPE_BLENDFACTOR_ZERO:               return GEN6_BLENDFACTOR_ZERO;
-   case PIPE_BLENDFACTOR_INV_SRC_COLOR:      return GEN6_BLENDFACTOR_INV_SRC_COLOR;
-   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:      return GEN6_BLENDFACTOR_INV_SRC_ALPHA;
-   case PIPE_BLENDFACTOR_INV_DST_ALPHA:      return GEN6_BLENDFACTOR_INV_DST_ALPHA;
-   case PIPE_BLENDFACTOR_INV_DST_COLOR:      return GEN6_BLENDFACTOR_INV_DST_COLOR;
-   case PIPE_BLENDFACTOR_INV_CONST_COLOR:    return GEN6_BLENDFACTOR_INV_CONST_COLOR;
-   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:    return GEN6_BLENDFACTOR_INV_CONST_ALPHA;
-   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:     return GEN6_BLENDFACTOR_INV_SRC1_COLOR;
-   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:     return GEN6_BLENDFACTOR_INV_SRC1_ALPHA;
-   default:
-      assert(!"unknown blend factor");
-      return GEN6_BLENDFACTOR_ONE;
-   }
-}
-
-/**
- * Translate a pipe stencil op to the matching hardware stencil op.
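- *
- * Note that the names pair up crosswise: the clamping pipe ops map to the
- * hardware's saturating ops, and the wrapping pipe ops map to the
- * hardware's plain (modulo) ops, e.g.:
- *
- *   gen6_translate_pipe_stencil_op(PIPE_STENCIL_OP_INCR)      == GEN6_STENCILOP_INCRSAT
- *   gen6_translate_pipe_stencil_op(PIPE_STENCIL_OP_INCR_WRAP) == GEN6_STENCILOP_INCR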
- */ -static int -gen6_translate_pipe_stencil_op(unsigned stencil_op) -{ - switch (stencil_op) { - case PIPE_STENCIL_OP_KEEP: return GEN6_STENCILOP_KEEP; - case PIPE_STENCIL_OP_ZERO: return GEN6_STENCILOP_ZERO; - case PIPE_STENCIL_OP_REPLACE: return GEN6_STENCILOP_REPLACE; - case PIPE_STENCIL_OP_INCR: return GEN6_STENCILOP_INCRSAT; - case PIPE_STENCIL_OP_DECR: return GEN6_STENCILOP_DECRSAT; - case PIPE_STENCIL_OP_INCR_WRAP: return GEN6_STENCILOP_INCR; - case PIPE_STENCIL_OP_DECR_WRAP: return GEN6_STENCILOP_DECR; - case PIPE_STENCIL_OP_INVERT: return GEN6_STENCILOP_INVERT; - default: - assert(!"unknown stencil op"); - return GEN6_STENCILOP_KEEP; - } -} - -static int -gen6_blend_factor_dst_alpha_forced_one(int factor) -{ - switch (factor) { - case GEN6_BLENDFACTOR_DST_ALPHA: - return GEN6_BLENDFACTOR_ONE; - case GEN6_BLENDFACTOR_INV_DST_ALPHA: - case GEN6_BLENDFACTOR_SRC_ALPHA_SATURATE: - return GEN6_BLENDFACTOR_ZERO; - default: - return factor; - } -} - -static uint32_t -blend_get_rt_blend_enable_gen6(const struct ilo_dev *dev, - const struct pipe_rt_blend_state *rt, - bool dst_alpha_forced_one) -{ - int rgb_src, rgb_dst, a_src, a_dst; - uint32_t dw; - - ILO_DEV_ASSERT(dev, 6, 7.5); - - if (!rt->blend_enable) - return 0; - - rgb_src = gen6_translate_pipe_blendfactor(rt->rgb_src_factor); - rgb_dst = gen6_translate_pipe_blendfactor(rt->rgb_dst_factor); - a_src = gen6_translate_pipe_blendfactor(rt->alpha_src_factor); - a_dst = gen6_translate_pipe_blendfactor(rt->alpha_dst_factor); - - if (dst_alpha_forced_one) { - rgb_src = gen6_blend_factor_dst_alpha_forced_one(rgb_src); - rgb_dst = gen6_blend_factor_dst_alpha_forced_one(rgb_dst); - a_src = gen6_blend_factor_dst_alpha_forced_one(a_src); - a_dst = gen6_blend_factor_dst_alpha_forced_one(a_dst); - } - - dw = GEN6_RT_DW0_BLEND_ENABLE | - gen6_translate_pipe_blend(rt->alpha_func) << 26 | - a_src << 20 | - a_dst << 15 | - gen6_translate_pipe_blend(rt->rgb_func) << 11 | - rgb_src << 5 | - rgb_dst; - - if (rt->rgb_func != rt->alpha_func || - rgb_src != a_src || rgb_dst != a_dst) - dw |= GEN6_RT_DW0_INDEPENDENT_ALPHA_ENABLE; - - return dw; -} - -static uint32_t -blend_get_rt_blend_enable_gen8(const struct ilo_dev *dev, - const struct pipe_rt_blend_state *rt, - bool dst_alpha_forced_one, - bool *independent_alpha) -{ - int rgb_src, rgb_dst, a_src, a_dst; - uint32_t dw; - - ILO_DEV_ASSERT(dev, 8, 8); - - if (!rt->blend_enable) { - *independent_alpha = false; - return 0; - } - - rgb_src = gen6_translate_pipe_blendfactor(rt->rgb_src_factor); - rgb_dst = gen6_translate_pipe_blendfactor(rt->rgb_dst_factor); - a_src = gen6_translate_pipe_blendfactor(rt->alpha_src_factor); - a_dst = gen6_translate_pipe_blendfactor(rt->alpha_dst_factor); - - if (dst_alpha_forced_one) { - rgb_src = gen6_blend_factor_dst_alpha_forced_one(rgb_src); - rgb_dst = gen6_blend_factor_dst_alpha_forced_one(rgb_dst); - a_src = gen6_blend_factor_dst_alpha_forced_one(a_src); - a_dst = gen6_blend_factor_dst_alpha_forced_one(a_dst); - } - - dw = GEN8_RT_DW0_BLEND_ENABLE | - rgb_src << 26 | - rgb_dst << 21 | - gen6_translate_pipe_blend(rt->rgb_func) << 18 | - a_src << 13 | - a_dst << 8 | - gen6_translate_pipe_blend(rt->alpha_func) << 5; - - *independent_alpha = (rt->rgb_func != rt->alpha_func || - rgb_src != a_src || - rgb_dst != a_dst); - - return dw; -} - -static void -blend_init_cso_gen6(const struct ilo_dev *dev, - const struct pipe_blend_state *state, - struct ilo_blend_state *blend, - unsigned index) -{ - const struct pipe_rt_blend_state *rt = &state->rt[index]; - struct ilo_blend_cso 
*cso = &blend->cso[index]; - - ILO_DEV_ASSERT(dev, 6, 7.5); - - cso->payload[0] = 0; - cso->payload[1] = GEN6_RT_DW1_COLORCLAMP_RTFORMAT | - GEN6_RT_DW1_PRE_BLEND_CLAMP | - GEN6_RT_DW1_POST_BLEND_CLAMP; - - if (!(rt->colormask & PIPE_MASK_A)) - cso->payload[1] |= GEN6_RT_DW1_WRITE_DISABLE_A; - if (!(rt->colormask & PIPE_MASK_R)) - cso->payload[1] |= GEN6_RT_DW1_WRITE_DISABLE_R; - if (!(rt->colormask & PIPE_MASK_G)) - cso->payload[1] |= GEN6_RT_DW1_WRITE_DISABLE_G; - if (!(rt->colormask & PIPE_MASK_B)) - cso->payload[1] |= GEN6_RT_DW1_WRITE_DISABLE_B; - - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 365: - * - * "Color Buffer Blending and Logic Ops must not be enabled - * simultaneously, or behavior is UNDEFINED." - * - * Since state->logicop_enable takes precedence over rt->blend_enable, - * no special care is needed. - */ - if (state->logicop_enable) { - cso->dw_blend = 0; - cso->dw_blend_dst_alpha_forced_one = 0; - } else { - cso->dw_blend = blend_get_rt_blend_enable_gen6(dev, rt, false); - cso->dw_blend_dst_alpha_forced_one = - blend_get_rt_blend_enable_gen6(dev, rt, true); - } -} - -static bool -blend_init_cso_gen8(const struct ilo_dev *dev, - const struct pipe_blend_state *state, - struct ilo_blend_state *blend, - unsigned index) -{ - const struct pipe_rt_blend_state *rt = &state->rt[index]; - struct ilo_blend_cso *cso = &blend->cso[index]; - bool independent_alpha = false; - - ILO_DEV_ASSERT(dev, 8, 8); - - cso->payload[0] = 0; - cso->payload[1] = GEN8_RT_DW1_COLORCLAMP_RTFORMAT | - GEN8_RT_DW1_PRE_BLEND_CLAMP | - GEN8_RT_DW1_POST_BLEND_CLAMP; - - if (!(rt->colormask & PIPE_MASK_A)) - cso->payload[0] |= GEN8_RT_DW0_WRITE_DISABLE_A; - if (!(rt->colormask & PIPE_MASK_R)) - cso->payload[0] |= GEN8_RT_DW0_WRITE_DISABLE_R; - if (!(rt->colormask & PIPE_MASK_G)) - cso->payload[0] |= GEN8_RT_DW0_WRITE_DISABLE_G; - if (!(rt->colormask & PIPE_MASK_B)) - cso->payload[0] |= GEN8_RT_DW0_WRITE_DISABLE_B; - - if (state->logicop_enable) { - cso->dw_blend = 0; - cso->dw_blend_dst_alpha_forced_one = 0; - } else { - bool tmp[2]; - - cso->dw_blend = blend_get_rt_blend_enable_gen8(dev, rt, false, &tmp[0]); - cso->dw_blend_dst_alpha_forced_one = - blend_get_rt_blend_enable_gen8(dev, rt, true, &tmp[1]); - - if (tmp[0] || tmp[1]) - independent_alpha = true; - } - - return independent_alpha; -} - -static uint32_t -blend_get_logicop_enable_gen6(const struct ilo_dev *dev, - const struct pipe_blend_state *state) -{ - ILO_DEV_ASSERT(dev, 6, 7.5); - - if (!state->logicop_enable) - return 0; - - return GEN6_RT_DW1_LOGICOP_ENABLE | - gen6_translate_pipe_logicop(state->logicop_func) << 18; -} - -static uint32_t -blend_get_logicop_enable_gen8(const struct ilo_dev *dev, - const struct pipe_blend_state *state) -{ - ILO_DEV_ASSERT(dev, 8, 8); - - if (!state->logicop_enable) - return 0; - - return GEN8_RT_DW1_LOGICOP_ENABLE | - gen6_translate_pipe_logicop(state->logicop_func) << 27; -} - -static uint32_t -blend_get_alpha_mod_gen6(const struct ilo_dev *dev, - const struct pipe_blend_state *state, - bool dual_blend) -{ - uint32_t dw = 0; - - ILO_DEV_ASSERT(dev, 6, 7.5); - - if (state->alpha_to_coverage) { - dw |= GEN6_RT_DW1_ALPHA_TO_COVERAGE; - if (ilo_dev_gen(dev) >= ILO_GEN(7)) - dw |= GEN6_RT_DW1_ALPHA_TO_COVERAGE_DITHER; - } - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 378: - * - * "If Dual Source Blending is enabled, this bit (AlphaToOne Enable) - * must be disabled." 
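-    *
-    * (For instance, a blend state whose rgb_src_factor is
-    * PIPE_BLENDFACTOR_SRC1_COLOR is dual-source, so the dual_blend
-    * argument below is expected to be true for it and alpha-to-one is
-    * left disabled.)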
- */ - if (state->alpha_to_one && !dual_blend) - dw |= GEN6_RT_DW1_ALPHA_TO_ONE; - - return dw; -} - -static uint32_t -blend_get_alpha_mod_gen8(const struct ilo_dev *dev, - const struct pipe_blend_state *state, - bool dual_blend) -{ - uint32_t dw = 0; - - ILO_DEV_ASSERT(dev, 8, 8); - - if (state->alpha_to_coverage) { - dw |= GEN8_BLEND_DW0_ALPHA_TO_COVERAGE | - GEN8_BLEND_DW0_ALPHA_TO_COVERAGE_DITHER; - } - - if (state->alpha_to_one && !dual_blend) - dw |= GEN8_BLEND_DW0_ALPHA_TO_ONE; - - return dw; -} - -static uint32_t -blend_get_ps_blend_gen8(const struct ilo_dev *dev, uint32_t rt_dw0) -{ - int rgb_src, rgb_dst, a_src, a_dst; - uint32_t dw; - - ILO_DEV_ASSERT(dev, 8, 8); - - if (!(rt_dw0 & GEN8_RT_DW0_BLEND_ENABLE)) - return 0; - - a_src = GEN_EXTRACT(rt_dw0, GEN8_RT_DW0_SRC_ALPHA_FACTOR); - a_dst = GEN_EXTRACT(rt_dw0, GEN8_RT_DW0_DST_ALPHA_FACTOR); - rgb_src = GEN_EXTRACT(rt_dw0, GEN8_RT_DW0_SRC_COLOR_FACTOR); - rgb_dst = GEN_EXTRACT(rt_dw0, GEN8_RT_DW0_DST_COLOR_FACTOR); - - dw = GEN8_PS_BLEND_DW1_BLEND_ENABLE; - dw |= GEN_SHIFT32(a_src, GEN8_PS_BLEND_DW1_SRC_ALPHA_FACTOR); - dw |= GEN_SHIFT32(a_dst, GEN8_PS_BLEND_DW1_DST_ALPHA_FACTOR); - dw |= GEN_SHIFT32(rgb_src, GEN8_PS_BLEND_DW1_SRC_COLOR_FACTOR); - dw |= GEN_SHIFT32(rgb_dst, GEN8_PS_BLEND_DW1_DST_COLOR_FACTOR); - - if (a_src != rgb_src || a_dst != rgb_dst) - dw |= GEN8_PS_BLEND_DW1_INDEPENDENT_ALPHA_ENABLE; - - return dw; -} - -void -ilo_gpe_init_blend(const struct ilo_dev *dev, - const struct pipe_blend_state *state, - struct ilo_blend_state *blend) -{ - unsigned i; - - ILO_DEV_ASSERT(dev, 6, 8); - - blend->dual_blend = (util_blend_state_is_dual(state, 0) && - state->rt[0].blend_enable && - !state->logicop_enable); - blend->alpha_to_coverage = state->alpha_to_coverage; - - if (ilo_dev_gen(dev) >= ILO_GEN(8)) { - bool independent_alpha; - - blend->dw_alpha_mod = - blend_get_alpha_mod_gen8(dev, state, blend->dual_blend); - blend->dw_logicop = blend_get_logicop_enable_gen8(dev, state); - blend->dw_shared = (state->dither) ? GEN8_BLEND_DW0_DITHER_ENABLE : 0; - - independent_alpha = blend_init_cso_gen8(dev, state, blend, 0); - if (independent_alpha) - blend->dw_shared |= GEN8_BLEND_DW0_INDEPENDENT_ALPHA_ENABLE; - - blend->dw_ps_blend = blend_get_ps_blend_gen8(dev, - blend->cso[0].dw_blend); - blend->dw_ps_blend_dst_alpha_forced_one = blend_get_ps_blend_gen8(dev, - blend->cso[0].dw_blend_dst_alpha_forced_one); - - if (state->independent_blend_enable) { - for (i = 1; i < Elements(blend->cso); i++) { - independent_alpha = blend_init_cso_gen8(dev, state, blend, i); - if (independent_alpha) - blend->dw_shared |= GEN8_BLEND_DW0_INDEPENDENT_ALPHA_ENABLE; - } - } else { - for (i = 1; i < Elements(blend->cso); i++) - blend->cso[i] = blend->cso[0]; - } - } else { - blend->dw_alpha_mod = - blend_get_alpha_mod_gen6(dev, state, blend->dual_blend); - blend->dw_logicop = blend_get_logicop_enable_gen6(dev, state); - blend->dw_shared = (state->dither) ? GEN6_RT_DW1_DITHER_ENABLE : 0; - - blend->dw_ps_blend = 0; - blend->dw_ps_blend_dst_alpha_forced_one = 0; - - blend_init_cso_gen6(dev, state, blend, 0); - if (state->independent_blend_enable) { - for (i = 1; i < Elements(blend->cso); i++) - blend_init_cso_gen6(dev, state, blend, i); - } else { - for (i = 1; i < Elements(blend->cso); i++) - blend->cso[i] = blend->cso[0]; - } - } -} - -/** - * Translate a pipe DSA test function to the matching hardware compare - * function. 
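- *
- * For example (illustrative, mirroring the switch below):
- *
- *   gen6_translate_dsa_func(PIPE_FUNC_LEQUAL) == GEN6_COMPAREFUNCTION_LEQUAL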
- */ -static int -gen6_translate_dsa_func(unsigned func) -{ - switch (func) { - case PIPE_FUNC_NEVER: return GEN6_COMPAREFUNCTION_NEVER; - case PIPE_FUNC_LESS: return GEN6_COMPAREFUNCTION_LESS; - case PIPE_FUNC_EQUAL: return GEN6_COMPAREFUNCTION_EQUAL; - case PIPE_FUNC_LEQUAL: return GEN6_COMPAREFUNCTION_LEQUAL; - case PIPE_FUNC_GREATER: return GEN6_COMPAREFUNCTION_GREATER; - case PIPE_FUNC_NOTEQUAL: return GEN6_COMPAREFUNCTION_NOTEQUAL; - case PIPE_FUNC_GEQUAL: return GEN6_COMPAREFUNCTION_GEQUAL; - case PIPE_FUNC_ALWAYS: return GEN6_COMPAREFUNCTION_ALWAYS; - default: - assert(!"unknown depth/stencil/alpha test function"); - return GEN6_COMPAREFUNCTION_NEVER; - } -} - -static uint32_t -dsa_get_stencil_enable_gen6(const struct ilo_dev *dev, - const struct pipe_stencil_state *stencil0, - const struct pipe_stencil_state *stencil1) -{ - uint32_t dw; - - ILO_DEV_ASSERT(dev, 6, 7.5); - - if (!stencil0->enabled) - return 0; - - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 359: - * - * "If the Depth Buffer is either undefined or does not have a surface - * format of D32_FLOAT_S8X24_UINT or D24_UNORM_S8_UINT and separate - * stencil buffer is disabled, Stencil Test Enable must be DISABLED" - * - * From the Sandy Bridge PRM, volume 2 part 1, page 370: - * - * "This field (Stencil Test Enable) cannot be enabled if - * Surface Format in 3DSTATE_DEPTH_BUFFER is set to D16_UNORM." - * - * TODO We do not check these yet. - */ - dw = GEN6_ZS_DW0_STENCIL_TEST_ENABLE | - gen6_translate_dsa_func(stencil0->func) << 28 | - gen6_translate_pipe_stencil_op(stencil0->fail_op) << 25 | - gen6_translate_pipe_stencil_op(stencil0->zfail_op) << 22 | - gen6_translate_pipe_stencil_op(stencil0->zpass_op) << 19; - if (stencil0->writemask) - dw |= GEN6_ZS_DW0_STENCIL_WRITE_ENABLE; - - if (stencil1->enabled) { - dw |= GEN6_ZS_DW0_STENCIL1_ENABLE | - gen6_translate_dsa_func(stencil1->func) << 12 | - gen6_translate_pipe_stencil_op(stencil1->fail_op) << 9 | - gen6_translate_pipe_stencil_op(stencil1->zfail_op) << 6 | - gen6_translate_pipe_stencil_op(stencil1->zpass_op) << 3; - if (stencil1->writemask) - dw |= GEN6_ZS_DW0_STENCIL_WRITE_ENABLE; - } - - return dw; -} - -static uint32_t -dsa_get_stencil_enable_gen8(const struct ilo_dev *dev, - const struct pipe_stencil_state *stencil0, - const struct pipe_stencil_state *stencil1) -{ - uint32_t dw; - - ILO_DEV_ASSERT(dev, 8, 8); - - if (!stencil0->enabled) - return 0; - - dw = gen6_translate_pipe_stencil_op(stencil0->fail_op) << 29 | - gen6_translate_pipe_stencil_op(stencil0->zfail_op) << 26 | - gen6_translate_pipe_stencil_op(stencil0->zpass_op) << 23 | - gen6_translate_dsa_func(stencil0->func) << 8 | - GEN8_ZS_DW1_STENCIL_TEST_ENABLE; - if (stencil0->writemask) - dw |= GEN8_ZS_DW1_STENCIL_WRITE_ENABLE; - - if (stencil1->enabled) { - dw |= gen6_translate_dsa_func(stencil1->func) << 20 | - gen6_translate_pipe_stencil_op(stencil1->fail_op) << 17 | - gen6_translate_pipe_stencil_op(stencil1->zfail_op) << 14 | - gen6_translate_pipe_stencil_op(stencil1->zpass_op) << 11 | - GEN8_ZS_DW1_STENCIL1_ENABLE; - if (stencil1->writemask) - dw |= GEN8_ZS_DW1_STENCIL_WRITE_ENABLE; - } - - return dw; -} - -static uint32_t -dsa_get_depth_enable_gen6(const struct ilo_dev *dev, - const struct pipe_depth_state *state) -{ - uint32_t dw; - - ILO_DEV_ASSERT(dev, 6, 7.5); - - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 360: - * - * "Enabling the Depth Test function without defining a Depth Buffer is - * UNDEFINED." 
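-    *
-    * (Illustrative example of the hazard: a pipe_depth_stencil_alpha_state
-    * with depth.enabled set to true while the framebuffer has no zsbuf
-    * bound would hit exactly this undefined case.)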
- * - * From the Sandy Bridge PRM, volume 2 part 1, page 375: - * - * "A Depth Buffer must be defined before enabling writes to it, or - * operation is UNDEFINED." - * - * TODO We do not check these yet. - */ - if (state->enabled) { - dw = GEN6_ZS_DW2_DEPTH_TEST_ENABLE | - gen6_translate_dsa_func(state->func) << 27; - } else { - dw = GEN6_COMPAREFUNCTION_ALWAYS << 27; - } - - if (state->writemask) - dw |= GEN6_ZS_DW2_DEPTH_WRITE_ENABLE; - - return dw; -} - -static uint32_t -dsa_get_depth_enable_gen8(const struct ilo_dev *dev, - const struct pipe_depth_state *state) -{ - uint32_t dw; - - ILO_DEV_ASSERT(dev, 8, 8); - - if (state->enabled) { - dw = GEN8_ZS_DW1_DEPTH_TEST_ENABLE | - gen6_translate_dsa_func(state->func) << 5; - } else { - dw = GEN6_COMPAREFUNCTION_ALWAYS << 5; - } - - if (state->writemask) - dw |= GEN8_ZS_DW1_DEPTH_WRITE_ENABLE; - - return dw; -} - -static uint32_t -dsa_get_alpha_enable_gen6(const struct ilo_dev *dev, - const struct pipe_alpha_state *state) -{ - uint32_t dw; - - ILO_DEV_ASSERT(dev, 6, 7.5); - - if (!state->enabled) - return 0; - - /* this will be ORed to BLEND_STATE */ - dw = GEN6_RT_DW1_ALPHA_TEST_ENABLE | - gen6_translate_dsa_func(state->func) << 13; - - return dw; -} - -static uint32_t -dsa_get_alpha_enable_gen8(const struct ilo_dev *dev, - const struct pipe_alpha_state *state) -{ - uint32_t dw; - - ILO_DEV_ASSERT(dev, 8, 8); - - if (!state->enabled) - return 0; - - /* this will be ORed to BLEND_STATE */ - dw = GEN8_BLEND_DW0_ALPHA_TEST_ENABLE | - gen6_translate_dsa_func(state->func) << 24; - - return dw; -} - -void -ilo_gpe_init_dsa(const struct ilo_dev *dev, - const struct pipe_depth_stencil_alpha_state *state, - struct ilo_dsa_state *dsa) -{ - ILO_DEV_ASSERT(dev, 6, 8); - - STATIC_ASSERT(Elements(dsa->payload) >= 3); - - if (ilo_dev_gen(dev) >= ILO_GEN(8)) { - const uint32_t dw_stencil = dsa_get_stencil_enable_gen8(dev, - &state->stencil[0], &state->stencil[1]); - const uint32_t dw_depth = dsa_get_depth_enable_gen8(dev, &state->depth); - - assert(!(dw_stencil & dw_depth)); - dsa->payload[0] = dw_stencil | dw_depth; - - dsa->dw_blend_alpha = dsa_get_alpha_enable_gen8(dev, &state->alpha); - dsa->dw_ps_blend_alpha = (state->alpha.enabled) ? 
- GEN8_PS_BLEND_DW1_ALPHA_TEST_ENABLE : 0; - } else { - dsa->payload[0] = dsa_get_stencil_enable_gen6(dev, - &state->stencil[0], &state->stencil[1]); - dsa->payload[2] = dsa_get_depth_enable_gen6(dev, &state->depth); - - dsa->dw_blend_alpha = dsa_get_alpha_enable_gen6(dev, &state->alpha); - dsa->dw_ps_blend_alpha = 0; - } - - dsa->payload[1] = state->stencil[0].valuemask << 24 | - state->stencil[0].writemask << 16 | - state->stencil[1].valuemask << 8 | - state->stencil[1].writemask; - - dsa->alpha_ref = float_to_ubyte(state->alpha.ref_value); -} - -void -ilo_gpe_set_scissor(const struct ilo_dev *dev, - unsigned start_slot, - unsigned num_states, - const struct pipe_scissor_state *states, - struct ilo_scissor_state *scissor) -{ - unsigned i; - - ILO_DEV_ASSERT(dev, 6, 8); - - for (i = 0; i < num_states; i++) { - uint16_t min_x, min_y, max_x, max_y; - - /* both max and min are inclusive in SCISSOR_RECT */ - if (states[i].minx < states[i].maxx && - states[i].miny < states[i].maxy) { - min_x = states[i].minx; - min_y = states[i].miny; - max_x = states[i].maxx - 1; - max_y = states[i].maxy - 1; - } - else { - /* we have to make min greater than max */ - min_x = 1; - min_y = 1; - max_x = 0; - max_y = 0; - } - - scissor->payload[(start_slot + i) * 2 + 0] = min_y << 16 | min_x; - scissor->payload[(start_slot + i) * 2 + 1] = max_y << 16 | max_x; - } - - if (!start_slot && num_states) - scissor->scissor0 = states[0]; -} - -void -ilo_gpe_set_scissor_null(const struct ilo_dev *dev, - struct ilo_scissor_state *scissor) -{ - unsigned i; - - for (i = 0; i < Elements(scissor->payload); i += 2) { - scissor->payload[i + 0] = 1 << 16 | 1; - scissor->payload[i + 1] = 0; - } -} - -static void -fb_set_blend_caps(const struct ilo_dev *dev, - enum pipe_format format, - struct ilo_fb_blend_caps *caps) -{ - const struct util_format_description *desc = - util_format_description(format); - const int ch = util_format_get_first_non_void_channel(format); - - memset(caps, 0, sizeof(*caps)); - - if (format == PIPE_FORMAT_NONE || desc->is_mixed) - return; - - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 365: - * - * "Logic Ops are only supported on *_UNORM surfaces (excluding _SRGB - * variants), otherwise Logic Ops must be DISABLED." - * - * According to the classic driver, this is lifted on Gen8+. - */ - if (ilo_dev_gen(dev) >= ILO_GEN(8)) { - caps->can_logicop = true; - } else { - caps->can_logicop = (ch >= 0 && desc->channel[ch].normalized && - desc->channel[ch].type == UTIL_FORMAT_TYPE_UNSIGNED && - desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB); - } - - /* no blending for pure integer formats */ - caps->can_blend = !util_format_is_pure_integer(format); - - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 382: - * - * "Alpha Test can only be enabled if Pixel Shader outputs a float - * alpha value." 
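-    *
-    * (Illustrative consequence: a pure-integer format such as
-    * PIPE_FORMAT_R8G8B8A8_UINT ends up with both can_blend and
-    * can_alpha_test set to false below.)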
- */ - caps->can_alpha_test = !util_format_is_pure_integer(format); - - caps->dst_alpha_forced_one = - (ilo_format_translate_render(dev, format) != - ilo_format_translate_color(dev, format)); - - /* sanity check */ - if (caps->dst_alpha_forced_one) { - enum pipe_format render_format; - - switch (format) { - case PIPE_FORMAT_B8G8R8X8_UNORM: - render_format = PIPE_FORMAT_B8G8R8A8_UNORM; - break; - default: - render_format = PIPE_FORMAT_NONE; - break; - } - - assert(ilo_format_translate_render(dev, format) == - ilo_format_translate_color(dev, render_format)); - } -} - -void -ilo_gpe_set_fb(const struct ilo_dev *dev, - const struct pipe_framebuffer_state *state, - struct ilo_fb_state *fb) -{ - const struct pipe_surface *first_surf = NULL; - int i; - - ILO_DEV_ASSERT(dev, 6, 8); - - util_copy_framebuffer_state(&fb->state, state); - - ilo_gpe_init_view_surface_null(dev, - (state->width) ? state->width : 1, - (state->height) ? state->height : 1, - 1, 0, &fb->null_rt); - - for (i = 0; i < state->nr_cbufs; i++) { - if (state->cbufs[i]) { - fb_set_blend_caps(dev, state->cbufs[i]->format, &fb->blend_caps[i]); - - if (!first_surf) - first_surf = state->cbufs[i]; - } else { - fb_set_blend_caps(dev, PIPE_FORMAT_NONE, &fb->blend_caps[i]); - } - } - - if (!first_surf && state->zsbuf) - first_surf = state->zsbuf; - - fb->num_samples = (first_surf) ? first_surf->texture->nr_samples : 1; - if (!fb->num_samples) - fb->num_samples = 1; - - /* - * The PRMs list several restrictions when the framebuffer has more than - * one surface. It seems they are actually lifted on GEN6+. - */ -} diff --git a/src/gallium/drivers/ilo/core/ilo_state_3d_top.c b/src/gallium/drivers/ilo/core/ilo_state_3d_top.c deleted file mode 100644 index c17957fb704..00000000000 --- a/src/gallium/drivers/ilo/core/ilo_state_3d_top.c +++ /dev/null @@ -1,1716 +0,0 @@ -/* - * Mesa 3-D graphics library - * - * Copyright (C) 2012-2014 LunarG, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. 
- *
- * Authors:
- *    Chia-I Wu <[email protected]>
- */
-
-#include "genhw/genhw.h"
-#include "util/u_dual_blend.h"
-#include "util/u_framebuffer.h"
-#include "util/u_half.h"
-#include "util/u_resource.h"
-
-#include "ilo_buffer.h"
-#include "ilo_format.h"
-#include "ilo_image.h"
-#include "ilo_state_3d.h"
-#include "../ilo_shader.h"
-
-static void
-ve_init_cso(const struct ilo_dev *dev,
-            const struct pipe_vertex_element *state,
-            unsigned vb_index,
-            struct ilo_ve_cso *cso)
-{
-   int comp[4] = {
-      GEN6_VFCOMP_STORE_SRC,
-      GEN6_VFCOMP_STORE_SRC,
-      GEN6_VFCOMP_STORE_SRC,
-      GEN6_VFCOMP_STORE_SRC,
-   };
-   int format;
-
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   switch (util_format_get_nr_components(state->src_format)) {
-   case 1: comp[1] = GEN6_VFCOMP_STORE_0; /* fall through */
-   case 2: comp[2] = GEN6_VFCOMP_STORE_0; /* fall through */
-   case 3: comp[3] = (util_format_is_pure_integer(state->src_format)) ?
-                     GEN6_VFCOMP_STORE_1_INT :
-                     GEN6_VFCOMP_STORE_1_FP;
-   }
-
-   format = ilo_format_translate_vertex(dev, state->src_format);
-
-   STATIC_ASSERT(Elements(cso->payload) >= 2);
-   cso->payload[0] =
-      vb_index << GEN6_VE_DW0_VB_INDEX__SHIFT |
-      GEN6_VE_DW0_VALID |
-      format << GEN6_VE_DW0_FORMAT__SHIFT |
-      state->src_offset << GEN6_VE_DW0_VB_OFFSET__SHIFT;
-
-   cso->payload[1] =
-      comp[0] << GEN6_VE_DW1_COMP0__SHIFT |
-      comp[1] << GEN6_VE_DW1_COMP1__SHIFT |
-      comp[2] << GEN6_VE_DW1_COMP2__SHIFT |
-      comp[3] << GEN6_VE_DW1_COMP3__SHIFT;
-}
-
-void
-ilo_gpe_init_ve(const struct ilo_dev *dev,
-                unsigned num_states,
-                const struct pipe_vertex_element *states,
-                struct ilo_ve_state *ve)
-{
-   unsigned i;
-
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   ve->count = num_states;
-   ve->vb_count = 0;
-
-   for (i = 0; i < num_states; i++) {
-      const unsigned pipe_idx = states[i].vertex_buffer_index;
-      const unsigned instance_divisor = states[i].instance_divisor;
-      unsigned hw_idx;
-
-      /*
-       * map the pipe vb to the hardware vb, which has a fixed instance
-       * divisor
-       */
-      for (hw_idx = 0; hw_idx < ve->vb_count; hw_idx++) {
-         if (ve->vb_mapping[hw_idx] == pipe_idx &&
-             ve->instance_divisors[hw_idx] == instance_divisor)
-            break;
-      }
-
-      /* create one if there is no matching hardware vb */
-      if (hw_idx >= ve->vb_count) {
-         hw_idx = ve->vb_count++;
-
-         ve->vb_mapping[hw_idx] = pipe_idx;
-         ve->instance_divisors[hw_idx] = instance_divisor;
-      }
-
-      ve_init_cso(dev, &states[i], hw_idx, &ve->cso[i]);
-   }
-}
-
-void
-ilo_gpe_set_ve_edgeflag(const struct ilo_dev *dev,
-                        struct ilo_ve_cso *cso)
-{
-   int format;
-
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 94:
-    *
-    *    "- This bit (Edge Flag Enable) must only be ENABLED on the last
-    *       valid VERTEX_ELEMENT structure.
-    *
-    *     - When set, Component 0 Control must be set to VFCOMP_STORE_SRC,
-    *       and Component 1-3 Control must be set to VFCOMP_NOSTORE.
-    *
-    *     - The Source Element Format must be set to the UINT format.
-    *
-    *     - [DevSNB]: Edge Flags are not supported for QUADLIST
-    *       primitives.  Software may elect to convert QUADLIST primitives
-    *       to some set of corresponding edge-flag-supported primitive
-    *       types (e.g., POLYGONs) prior to submission to the 3D pipeline."
-    */
-   cso->payload[0] |= GEN6_VE_DW0_EDGE_FLAG_ENABLE;
-
-   /*
-    * Edge flags have format GEN6_FORMAT_R8_USCALED when defined via
-    * glEdgeFlagPointer(), and format GEN6_FORMAT_R32_FLOAT when defined
-    * via glEdgeFlag(), as can be seen in vbo_attrib_tmp.h.
-    *
-    * Since all the hardware cares about is whether the flags are zero or
-    * not, we can treat them as the corresponding _UINT formats.
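-    *
-    * For example, an edge flag array set up via glEdgeFlagPointer() arrives
-    * here as GEN6_FORMAT_R8_USCALED and is rewritten to GEN6_FORMAT_R8_UINT
-    * by the switch below; its nonzero/zero meaning is unchanged.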
- */ - format = GEN_EXTRACT(cso->payload[0], GEN6_VE_DW0_FORMAT); - cso->payload[0] &= ~GEN6_VE_DW0_FORMAT__MASK; - - switch (format) { - case GEN6_FORMAT_R32_FLOAT: - format = GEN6_FORMAT_R32_UINT; - break; - case GEN6_FORMAT_R8_USCALED: - format = GEN6_FORMAT_R8_UINT; - break; - default: - break; - } - - cso->payload[0] |= GEN_SHIFT32(format, GEN6_VE_DW0_FORMAT); - - cso->payload[1] = - GEN6_VFCOMP_STORE_SRC << GEN6_VE_DW1_COMP0__SHIFT | - GEN6_VFCOMP_NOSTORE << GEN6_VE_DW1_COMP1__SHIFT | - GEN6_VFCOMP_NOSTORE << GEN6_VE_DW1_COMP2__SHIFT | - GEN6_VFCOMP_NOSTORE << GEN6_VE_DW1_COMP3__SHIFT; -} - -void -ilo_gpe_init_ve_nosrc(const struct ilo_dev *dev, - int comp0, int comp1, int comp2, int comp3, - struct ilo_ve_cso *cso) -{ - ILO_DEV_ASSERT(dev, 6, 8); - - STATIC_ASSERT(Elements(cso->payload) >= 2); - - assert(comp0 != GEN6_VFCOMP_STORE_SRC && - comp1 != GEN6_VFCOMP_STORE_SRC && - comp2 != GEN6_VFCOMP_STORE_SRC && - comp3 != GEN6_VFCOMP_STORE_SRC); - - cso->payload[0] = GEN6_VE_DW0_VALID; - cso->payload[1] = - comp0 << GEN6_VE_DW1_COMP0__SHIFT | - comp1 << GEN6_VE_DW1_COMP1__SHIFT | - comp2 << GEN6_VE_DW1_COMP2__SHIFT | - comp3 << GEN6_VE_DW1_COMP3__SHIFT; -} - -void -ilo_gpe_init_vs_cso(const struct ilo_dev *dev, - const struct ilo_shader_state *vs, - struct ilo_shader_cso *cso) -{ - int start_grf, vue_read_len, sampler_count, max_threads; - uint32_t dw2, dw4, dw5; - - ILO_DEV_ASSERT(dev, 6, 8); - - start_grf = ilo_shader_get_kernel_param(vs, ILO_KERNEL_URB_DATA_START_REG); - vue_read_len = ilo_shader_get_kernel_param(vs, ILO_KERNEL_INPUT_COUNT); - sampler_count = ilo_shader_get_kernel_param(vs, ILO_KERNEL_SAMPLER_COUNT); - - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 135: - * - * "(Vertex URB Entry Read Length) Specifies the number of pairs of - * 128-bit vertex elements to be passed into the payload for each - * vertex." - * - * "It is UNDEFINED to set this field to 0 indicating no Vertex URB - * data to be read and passed to the thread." - */ - vue_read_len = (vue_read_len + 1) / 2; - if (!vue_read_len) - vue_read_len = 1; - - max_threads = dev->thread_count; - if (ilo_dev_gen(dev) == ILO_GEN(7.5) && dev->gt == 2) - max_threads *= 2; - - dw2 = (true) ? 
0 : GEN6_THREADDISP_FP_MODE_ALT; - dw2 |= ((sampler_count + 3) / 4) << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT; - - dw4 = start_grf << GEN6_VS_DW4_URB_GRF_START__SHIFT | - vue_read_len << GEN6_VS_DW4_URB_READ_LEN__SHIFT | - 0 << GEN6_VS_DW4_URB_READ_OFFSET__SHIFT; - - dw5 = GEN6_VS_DW5_STATISTICS | - GEN6_VS_DW5_VS_ENABLE; - - if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) - dw5 |= (max_threads - 1) << GEN75_VS_DW5_MAX_THREADS__SHIFT; - else - dw5 |= (max_threads - 1) << GEN6_VS_DW5_MAX_THREADS__SHIFT; - - STATIC_ASSERT(Elements(cso->payload) >= 3); - cso->payload[0] = dw2; - cso->payload[1] = dw4; - cso->payload[2] = dw5; -} - -static void -gs_init_cso_gen6(const struct ilo_dev *dev, - const struct ilo_shader_state *gs, - struct ilo_shader_cso *cso) -{ - int start_grf, vue_read_len, max_threads; - uint32_t dw2, dw4, dw5, dw6; - - ILO_DEV_ASSERT(dev, 6, 6); - - if (ilo_shader_get_type(gs) == PIPE_SHADER_GEOMETRY) { - start_grf = ilo_shader_get_kernel_param(gs, - ILO_KERNEL_URB_DATA_START_REG); - - vue_read_len = ilo_shader_get_kernel_param(gs, ILO_KERNEL_INPUT_COUNT); - } - else { - start_grf = ilo_shader_get_kernel_param(gs, - ILO_KERNEL_VS_GEN6_SO_START_REG); - - vue_read_len = ilo_shader_get_kernel_param(gs, ILO_KERNEL_OUTPUT_COUNT); - } - - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 153: - * - * "Specifies the amount of URB data read and passed in the thread - * payload for each Vertex URB entry, in 256-bit register increments. - * - * It is UNDEFINED to set this field (Vertex URB Entry Read Length) to - * 0 indicating no Vertex URB data to be read and passed to the - * thread." - */ - vue_read_len = (vue_read_len + 1) / 2; - if (!vue_read_len) - vue_read_len = 1; - - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 154: - * - * "Maximum Number of Threads valid range is [0,27] when Rendering - * Enabled bit is set." - * - * From the Sandy Bridge PRM, volume 2 part 1, page 173: - * - * "Programming Note: If the GS stage is enabled, software must always - * allocate at least one GS URB Entry. This is true even if the GS - * thread never needs to output vertices to the pipeline, e.g., when - * only performing stream output. This is an artifact of the need to - * pass the GS thread an initial destination URB handle." - * - * As such, we always enable rendering, and limit the number of threads. - */ - if (dev->gt == 2) { - /* maximum is 60, but limited to 28 */ - max_threads = 28; - } - else { - /* maximum is 24, but limited to 21 (see brwCreateContext()) */ - max_threads = 21; - } - - dw2 = GEN6_THREADDISP_SPF; - - dw4 = vue_read_len << GEN6_GS_DW4_URB_READ_LEN__SHIFT | - 0 << GEN6_GS_DW4_URB_READ_OFFSET__SHIFT | - start_grf << GEN6_GS_DW4_URB_GRF_START__SHIFT; - - dw5 = (max_threads - 1) << GEN6_GS_DW5_MAX_THREADS__SHIFT | - GEN6_GS_DW5_STATISTICS | - GEN6_GS_DW5_SO_STATISTICS | - GEN6_GS_DW5_RENDER_ENABLE; - - /* - * we cannot make use of GEN6_GS_REORDER because it will reorder - * triangle strips according to D3D rules (triangle 2N+1 uses vertices - * (2N+1, 2N+3, 2N+2)), instead of GL rules (triangle 2N+1 uses vertices - * (2N+2, 2N+1, 2N+3)). 
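- *
- * A minimal sketch of the two conventions (illustrative only, not
- * driver code):
- *
- *   // vertex indices of triangle n in a strip; d3d != 0 selects the
- *   // D3D rule, otherwise the GL rule is used
- *   static void strip_triangle(unsigned n, int d3d, unsigned idx[3])
- *   {
- *      if (n % 2 == 0) {
- *         idx[0] = n;     idx[1] = n + 1; idx[2] = n + 2;
- *      } else if (d3d) {
- *         idx[0] = n;     idx[1] = n + 2; idx[2] = n + 1;
- *      } else {
- *         idx[0] = n + 1; idx[1] = n;     idx[2] = n + 2;
- *      }
- *   }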
- */ - dw6 = GEN6_GS_DW6_GS_ENABLE; - - if (ilo_shader_get_kernel_param(gs, ILO_KERNEL_GS_DISCARD_ADJACENCY)) - dw6 |= GEN6_GS_DW6_DISCARD_ADJACENCY; - - if (ilo_shader_get_kernel_param(gs, ILO_KERNEL_VS_GEN6_SO)) { - const uint32_t svbi_post_inc = - ilo_shader_get_kernel_param(gs, ILO_KERNEL_GS_GEN6_SVBI_POST_INC); - - dw6 |= GEN6_GS_DW6_SVBI_PAYLOAD_ENABLE; - if (svbi_post_inc) { - dw6 |= GEN6_GS_DW6_SVBI_POST_INC_ENABLE | - svbi_post_inc << GEN6_GS_DW6_SVBI_POST_INC_VAL__SHIFT; - } - } - - STATIC_ASSERT(Elements(cso->payload) >= 4); - cso->payload[0] = dw2; - cso->payload[1] = dw4; - cso->payload[2] = dw5; - cso->payload[3] = dw6; -} - -static void -gs_init_cso_gen7(const struct ilo_dev *dev, - const struct ilo_shader_state *gs, - struct ilo_shader_cso *cso) -{ - int start_grf, vue_read_len, sampler_count, max_threads; - uint32_t dw2, dw4, dw5; - - ILO_DEV_ASSERT(dev, 7, 7.5); - - start_grf = ilo_shader_get_kernel_param(gs, ILO_KERNEL_URB_DATA_START_REG); - vue_read_len = ilo_shader_get_kernel_param(gs, ILO_KERNEL_INPUT_COUNT); - sampler_count = ilo_shader_get_kernel_param(gs, ILO_KERNEL_SAMPLER_COUNT); - - /* in pairs */ - vue_read_len = (vue_read_len + 1) / 2; - - switch (ilo_dev_gen(dev)) { - case ILO_GEN(7.5): - max_threads = (dev->gt >= 2) ? 256 : 70; - break; - case ILO_GEN(7): - max_threads = (dev->gt == 2) ? 128 : 36; - break; - default: - max_threads = 1; - break; - } - - dw2 = (true) ? 0 : GEN6_THREADDISP_FP_MODE_ALT; - dw2 |= ((sampler_count + 3) / 4) << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT; - - dw4 = vue_read_len << GEN7_GS_DW4_URB_READ_LEN__SHIFT | - GEN7_GS_DW4_INCLUDE_VERTEX_HANDLES | - 0 << GEN7_GS_DW4_URB_READ_OFFSET__SHIFT | - start_grf << GEN7_GS_DW4_URB_GRF_START__SHIFT; - - dw5 = (max_threads - 1) << GEN7_GS_DW5_MAX_THREADS__SHIFT | - GEN7_GS_DW5_STATISTICS | - GEN7_GS_DW5_GS_ENABLE; - - STATIC_ASSERT(Elements(cso->payload) >= 3); - cso->payload[0] = dw2; - cso->payload[1] = dw4; - cso->payload[2] = dw5; -} - -void -ilo_gpe_init_gs_cso(const struct ilo_dev *dev, - const struct ilo_shader_state *gs, - struct ilo_shader_cso *cso) -{ - if (ilo_dev_gen(dev) >= ILO_GEN(7)) - gs_init_cso_gen7(dev, gs, cso); - else - gs_init_cso_gen6(dev, gs, cso); -} - -static void -view_init_null_gen6(const struct ilo_dev *dev, - unsigned width, unsigned height, - unsigned depth, unsigned level, - struct ilo_view_surface *surf) -{ - uint32_t *dw; - - ILO_DEV_ASSERT(dev, 6, 6); - - assert(width >= 1 && height >= 1 && depth >= 1); - - /* - * From the Sandy Bridge PRM, volume 4 part 1, page 71: - * - * "A null surface will be used in instances where an actual surface is - * not bound. When a write message is generated to a null surface, no - * actual surface is written to. When a read message (including any - * sampling engine message) is generated to a null surface, the result - * is all zeros. Note that a null surface type is allowed to be used - * with all messages, even if it is not specificially indicated as - * supported. All of the remaining fields in surface state are ignored - * for null surfaces, with the following exceptions: - * - * * [DevSNB+]: Width, Height, Depth, and LOD fields must match the - * depth buffer's corresponding state for all render target - * surfaces, including null. - * * Surface Format must be R8G8B8A8_UNORM." 
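- *
- * In other words, when constructing a null render target the caller
- * passes the depth buffer's dimensions, e.g. (a sketch; fb_width,
- * fb_height, and fb_depth stand for whatever the framebuffer state
- * provides):
- *
- *   struct ilo_view_surface null_surf;
- *   ilo_gpe_init_view_surface_null(dev, fb_width, fb_height,
- *                                  fb_depth, 0, &null_surf);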
- * - * From the Sandy Bridge PRM, volume 4 part 1, page 82: - * - * "If Surface Type is SURFTYPE_NULL, this field (Tiled Surface) must be - * true" - */ - - STATIC_ASSERT(Elements(surf->payload) >= 6); - dw = surf->payload; - - dw[0] = GEN6_SURFTYPE_NULL << GEN6_SURFACE_DW0_TYPE__SHIFT | - GEN6_FORMAT_B8G8R8A8_UNORM << GEN6_SURFACE_DW0_FORMAT__SHIFT; - - dw[1] = 0; - - dw[2] = (height - 1) << GEN6_SURFACE_DW2_HEIGHT__SHIFT | - (width - 1) << GEN6_SURFACE_DW2_WIDTH__SHIFT | - level << GEN6_SURFACE_DW2_MIP_COUNT_LOD__SHIFT; - - dw[3] = (depth - 1) << GEN6_SURFACE_DW3_DEPTH__SHIFT | - GEN6_TILING_X; - - dw[4] = 0; - dw[5] = 0; -} - -static void -view_init_for_buffer_gen6(const struct ilo_dev *dev, - const struct ilo_buffer *buf, - unsigned offset, unsigned size, - unsigned struct_size, - enum pipe_format elem_format, - bool is_rt, bool render_cache_rw, - struct ilo_view_surface *surf) -{ - const int elem_size = util_format_get_blocksize(elem_format); - int width, height, depth, pitch; - int surface_format, num_entries; - uint32_t *dw; - - ILO_DEV_ASSERT(dev, 6, 6); - - /* - * For SURFTYPE_BUFFER, a SURFACE_STATE specifies an element of a - * structure in a buffer. - */ - - surface_format = ilo_format_translate_color(dev, elem_format); - - num_entries = size / struct_size; - /* see if there is enough space to fit another element */ - if (size % struct_size >= elem_size) - num_entries++; - - /* - * From the Sandy Bridge PRM, volume 4 part 1, page 76: - * - * "For SURFTYPE_BUFFER render targets, this field (Surface Base - * Address) specifies the base address of first element of the - * surface. The surface is interpreted as a simple array of that - * single element type. The address must be naturally-aligned to the - * element size (e.g., a buffer containing R32G32B32A32_FLOAT elements - * must be 16-byte aligned). - * - * For SURFTYPE_BUFFER non-rendertarget surfaces, this field specifies - * the base address of the first element of the surface, computed in - * software by adding the surface base address to the byte offset of - * the element in the buffer." - */ - if (is_rt) - assert(offset % elem_size == 0); - - /* - * From the Sandy Bridge PRM, volume 4 part 1, page 77: - * - * "For buffer surfaces, the number of entries in the buffer ranges - * from 1 to 2^27." - */ - assert(num_entries >= 1 && num_entries <= 1 << 27); - - /* - * From the Sandy Bridge PRM, volume 4 part 1, page 81: - * - * "For surfaces of type SURFTYPE_BUFFER, this field (Surface Pitch) - * indicates the size of the structure." 
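- *
- * The (num_entries - 1) value is scattered across the Width, Height,
- * and Depth fields below; a round-trip sketch makes the encoding
- * explicit:
- *
- *   unsigned n = num_entries - 1;
- *   unsigned w = (n >>  0) & 0x007f; // bits [6:0]
- *   unsigned h = (n >>  7) & 0x1fff; // bits [19:7]
- *   unsigned d = (n >> 20) & 0x007f; // bits [26:20]
- *   assert(n == (d << 20 | h << 7 | w)); // holds for n < 1 << 27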
- */ - pitch = struct_size; - - pitch--; - num_entries--; - /* bits [6:0] */ - width = (num_entries & 0x0000007f); - /* bits [19:7] */ - height = (num_entries & 0x000fff80) >> 7; - /* bits [26:20] */ - depth = (num_entries & 0x07f00000) >> 20; - - STATIC_ASSERT(Elements(surf->payload) >= 6); - dw = surf->payload; - - dw[0] = GEN6_SURFTYPE_BUFFER << GEN6_SURFACE_DW0_TYPE__SHIFT | - surface_format << GEN6_SURFACE_DW0_FORMAT__SHIFT; - if (render_cache_rw) - dw[0] |= GEN6_SURFACE_DW0_RENDER_CACHE_RW; - - dw[1] = offset; - - dw[2] = height << GEN6_SURFACE_DW2_HEIGHT__SHIFT | - width << GEN6_SURFACE_DW2_WIDTH__SHIFT; - - dw[3] = depth << GEN6_SURFACE_DW3_DEPTH__SHIFT | - pitch << GEN6_SURFACE_DW3_PITCH__SHIFT; - - dw[4] = 0; - dw[5] = 0; -} - -static void -view_init_for_image_gen6(const struct ilo_dev *dev, - const struct ilo_image *img, - enum pipe_texture_target target, - enum pipe_format format, - unsigned first_level, - unsigned num_levels, - unsigned first_layer, - unsigned num_layers, - bool is_rt, - struct ilo_view_surface *surf) -{ - int surface_type, surface_format; - int width, height, depth, pitch, lod; - uint32_t *dw; - - ILO_DEV_ASSERT(dev, 6, 6); - - surface_type = ilo_gpe_gen6_translate_texture(target); - assert(surface_type != GEN6_SURFTYPE_BUFFER); - - if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT && img->separate_stencil) - format = PIPE_FORMAT_Z32_FLOAT; - - if (is_rt) - surface_format = ilo_format_translate_render(dev, format); - else - surface_format = ilo_format_translate_texture(dev, format); - assert(surface_format >= 0); - - width = img->width0; - height = img->height0; - depth = (target == PIPE_TEXTURE_3D) ? img->depth0 : num_layers; - pitch = img->bo_stride; - - if (surface_type == GEN6_SURFTYPE_CUBE) { - /* - * From the Sandy Bridge PRM, volume 4 part 1, page 81: - * - * "For SURFTYPE_CUBE: [DevSNB+]: for Sampling Engine Surfaces, the - * range of this field (Depth) is [0,84], indicating the number of - * cube array elements (equal to the number of underlying 2D array - * elements divided by 6). For other surfaces, this field must be - * zero." - * - * When is_rt is true, we treat the texture as a 2D one to avoid the - * restriction. 
- */ - if (is_rt) { - surface_type = GEN6_SURFTYPE_2D; - } - else { - assert(num_layers % 6 == 0); - depth = num_layers / 6; - } - } - - /* sanity check the size */ - assert(width >= 1 && height >= 1 && depth >= 1 && pitch >= 1); - switch (surface_type) { - case GEN6_SURFTYPE_1D: - assert(width <= 8192 && height == 1 && depth <= 512); - assert(first_layer < 512 && num_layers <= 512); - break; - case GEN6_SURFTYPE_2D: - assert(width <= 8192 && height <= 8192 && depth <= 512); - assert(first_layer < 512 && num_layers <= 512); - break; - case GEN6_SURFTYPE_3D: - assert(width <= 2048 && height <= 2048 && depth <= 2048); - assert(first_layer < 2048 && num_layers <= 512); - if (!is_rt) - assert(first_layer == 0); - break; - case GEN6_SURFTYPE_CUBE: - assert(width <= 8192 && height <= 8192 && depth <= 85); - assert(width == height); - assert(first_layer < 512 && num_layers <= 512); - if (is_rt) - assert(first_layer == 0); - break; - default: - assert(!"unexpected surface type"); - break; - } - - /* non-full array spacing is supported only on GEN7+ */ - assert(img->walk != ILO_IMAGE_WALK_LOD); - /* non-interleaved samples are supported only on GEN7+ */ - if (img->sample_count > 1) - assert(img->interleaved_samples); - - if (is_rt) { - assert(num_levels == 1); - lod = first_level; - } - else { - lod = num_levels - 1; - } - - /* - * From the Sandy Bridge PRM, volume 4 part 1, page 76: - * - * "Linear render target surface base addresses must be element-size - * aligned, for non-YUV surface formats, or a multiple of 2 - * element-sizes for YUV surface formats. Other linear surfaces have - * no alignment requirements (byte alignment is sufficient.)" - * - * From the Sandy Bridge PRM, volume 4 part 1, page 81: - * - * "For linear render target surfaces, the pitch must be a multiple - * of the element size for non-YUV surface formats. Pitch must be a - * multiple of 2 * element size for YUV surface formats." - * - * From the Sandy Bridge PRM, volume 4 part 1, page 86: - * - * "For linear surfaces, this field (X Offset) must be zero" - */ - if (img->tiling == GEN6_TILING_NONE) { - if (is_rt) { - const int elem_size = util_format_get_blocksize(format); - assert(pitch % elem_size == 0); - } - } - - STATIC_ASSERT(Elements(surf->payload) >= 6); - dw = surf->payload; - - dw[0] = surface_type << GEN6_SURFACE_DW0_TYPE__SHIFT | - surface_format << GEN6_SURFACE_DW0_FORMAT__SHIFT | - GEN6_SURFACE_DW0_MIPLAYOUT_BELOW; - - if (surface_type == GEN6_SURFTYPE_CUBE && !is_rt) { - dw[0] |= 1 << 9 | - GEN6_SURFACE_DW0_CUBE_FACE_ENABLES__MASK; - } - - if (is_rt) - dw[0] |= GEN6_SURFACE_DW0_RENDER_CACHE_RW; - - dw[1] = 0; - - dw[2] = (height - 1) << GEN6_SURFACE_DW2_HEIGHT__SHIFT | - (width - 1) << GEN6_SURFACE_DW2_WIDTH__SHIFT | - lod << GEN6_SURFACE_DW2_MIP_COUNT_LOD__SHIFT; - - assert(img->tiling != GEN8_TILING_W); - dw[3] = (depth - 1) << GEN6_SURFACE_DW3_DEPTH__SHIFT | - (pitch - 1) << GEN6_SURFACE_DW3_PITCH__SHIFT | - img->tiling; - - dw[4] = first_level << GEN6_SURFACE_DW4_MIN_LOD__SHIFT | - first_layer << 17 | - (num_layers - 1) << 8 | - ((img->sample_count > 1) ? 
GEN6_SURFACE_DW4_MULTISAMPLECOUNT_4 : - GEN6_SURFACE_DW4_MULTISAMPLECOUNT_1); - - dw[5] = 0; - - assert(img->align_j == 2 || img->align_j == 4); - if (img->align_j == 4) - dw[5] |= GEN6_SURFACE_DW5_VALIGN_4; -} - -static void -view_init_null_gen7(const struct ilo_dev *dev, - unsigned width, unsigned height, - unsigned depth, unsigned level, - struct ilo_view_surface *surf) -{ - uint32_t *dw; - - ILO_DEV_ASSERT(dev, 7, 8); - - assert(width >= 1 && height >= 1 && depth >= 1); - - /* - * From the Ivy Bridge PRM, volume 4 part 1, page 62: - * - * "A null surface is used in instances where an actual surface is not - * bound. When a write message is generated to a null surface, no - * actual surface is written to. When a read message (including any - * sampling engine message) is generated to a null surface, the result - * is all zeros. Note that a null surface type is allowed to be used - * with all messages, even if it is not specificially indicated as - * supported. All of the remaining fields in surface state are ignored - * for null surfaces, with the following exceptions: - * - * * Width, Height, Depth, LOD, and Render Target View Extent fields - * must match the depth buffer's corresponding state for all render - * target surfaces, including null. - * * All sampling engine and data port messages support null surfaces - * with the above behavior, even if not mentioned as specifically - * supported, except for the following: - * * Data Port Media Block Read/Write messages. - * * The Surface Type of a surface used as a render target (accessed - * via the Data Port's Render Target Write message) must be the same - * as the Surface Type of all other render targets and of the depth - * buffer (defined in 3DSTATE_DEPTH_BUFFER), unless either the depth - * buffer or render targets are SURFTYPE_NULL." - * - * From the Ivy Bridge PRM, volume 4 part 1, page 65: - * - * "If Surface Type is SURFTYPE_NULL, this field (Tiled Surface) must be - * true" - */ - - STATIC_ASSERT(Elements(surf->payload) >= 13); - dw = surf->payload; - - dw[0] = GEN6_SURFTYPE_NULL << GEN7_SURFACE_DW0_TYPE__SHIFT | - GEN6_FORMAT_B8G8R8A8_UNORM << GEN7_SURFACE_DW0_FORMAT__SHIFT; - - if (ilo_dev_gen(dev) >= ILO_GEN(8)) - dw[0] |= GEN6_TILING_X << GEN8_SURFACE_DW0_TILING__SHIFT; - else - dw[0] |= GEN6_TILING_X << GEN7_SURFACE_DW0_TILING__SHIFT; - - dw[1] = 0; - - dw[2] = GEN_SHIFT32(height - 1, GEN7_SURFACE_DW2_HEIGHT) | - GEN_SHIFT32(width - 1, GEN7_SURFACE_DW2_WIDTH); - - dw[3] = GEN_SHIFT32(depth - 1, GEN7_SURFACE_DW3_DEPTH); - - dw[4] = 0; - dw[5] = level; - - dw[6] = 0; - dw[7] = 0; - - if (ilo_dev_gen(dev) >= ILO_GEN(8)) - memset(&dw[8], 0, sizeof(*dw) * (13 - 8)); -} - -static void -view_init_for_buffer_gen7(const struct ilo_dev *dev, - const struct ilo_buffer *buf, - unsigned offset, unsigned size, - unsigned struct_size, - enum pipe_format elem_format, - bool is_rt, bool render_cache_rw, - struct ilo_view_surface *surf) -{ - const bool typed = (elem_format != PIPE_FORMAT_NONE); - const bool structured = (!typed && struct_size > 1); - const int elem_size = (typed) ? - util_format_get_blocksize(elem_format) : 1; - int width, height, depth, pitch; - int surface_type, surface_format, num_entries; - uint32_t *dw; - - ILO_DEV_ASSERT(dev, 7, 8); - - surface_type = (structured) ? GEN7_SURFTYPE_STRBUF : GEN6_SURFTYPE_BUFFER; - - surface_format = (typed) ? 
- ilo_format_translate_color(dev, elem_format) : GEN6_FORMAT_RAW; - - num_entries = size / struct_size; - /* see if there is enough space to fit another element */ - if (size % struct_size >= elem_size && !structured) - num_entries++; - - /* - * From the Ivy Bridge PRM, volume 4 part 1, page 67: - * - * "For SURFTYPE_BUFFER render targets, this field (Surface Base - * Address) specifies the base address of first element of the - * surface. The surface is interpreted as a simple array of that - * single element type. The address must be naturally-aligned to the - * element size (e.g., a buffer containing R32G32B32A32_FLOAT elements - * must be 16-byte aligned) - * - * For SURFTYPE_BUFFER non-rendertarget surfaces, this field specifies - * the base address of the first element of the surface, computed in - * software by adding the surface base address to the byte offset of - * the element in the buffer." - */ - if (is_rt) - assert(offset % elem_size == 0); - - /* - * From the Ivy Bridge PRM, volume 4 part 1, page 68: - * - * "For typed buffer and structured buffer surfaces, the number of - * entries in the buffer ranges from 1 to 2^27. For raw buffer - * surfaces, the number of entries in the buffer is the number of - * bytes which can range from 1 to 2^30." - */ - assert(num_entries >= 1 && - num_entries <= 1 << ((typed || structured) ? 27 : 30)); - - /* - * From the Ivy Bridge PRM, volume 4 part 1, page 69: - * - * "For SURFTYPE_BUFFER: The low two bits of this field (Width) must be - * 11 if the Surface Format is RAW (the size of the buffer must be a - * multiple of 4 bytes)." - * - * From the Ivy Bridge PRM, volume 4 part 1, page 70: - * - * "For surfaces of type SURFTYPE_BUFFER and SURFTYPE_STRBUF, this - * field (Surface Pitch) indicates the size of the structure." - * - * "For linear surfaces with Surface Type of SURFTYPE_STRBUF, the pitch - * must be a multiple of 4 bytes." 
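- *
- * For example (a sketch; buf, surf, and the sizes are made up), a
- * typed view of 256 R8G8B8A8_UNORM texels versus a raw view of 1024
- * bytes would be set up through the wrapper defined later in this
- * file as:
- *
- *   // typed: elem_format given, struct_size is the texel size
- *   ilo_gpe_init_view_surface_for_buffer(dev, buf, 0, 256 * 4, 4,
- *                                        PIPE_FORMAT_R8G8B8A8_UNORM,
- *                                        false, false, &surf);
- *   // raw: no element format, struct_size 1, size 4-byte aligned
- *   ilo_gpe_init_view_surface_for_buffer(dev, buf, 0, 1024, 1,
- *                                        PIPE_FORMAT_NONE,
- *                                        false, false, &surf);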
- */ - if (structured) - assert(struct_size % 4 == 0); - else if (!typed) - assert(num_entries % 4 == 0); - - pitch = struct_size; - - pitch--; - num_entries--; - /* bits [6:0] */ - width = (num_entries & 0x0000007f); - /* bits [20:7] */ - height = (num_entries & 0x001fff80) >> 7; - /* bits [30:21] */ - depth = (num_entries & 0x7fe00000) >> 21; - /* limit to [26:21] */ - if (typed || structured) - depth &= 0x3f; - - STATIC_ASSERT(Elements(surf->payload) >= 13); - dw = surf->payload; - - dw[0] = surface_type << GEN7_SURFACE_DW0_TYPE__SHIFT | - surface_format << GEN7_SURFACE_DW0_FORMAT__SHIFT; - if (render_cache_rw) - dw[0] |= GEN7_SURFACE_DW0_RENDER_CACHE_RW; - - if (ilo_dev_gen(dev) >= ILO_GEN(8)) { - dw[8] = offset; - memset(&dw[9], 0, sizeof(*dw) * (13 - 9)); - } else { - dw[1] = offset; - } - - dw[2] = GEN_SHIFT32(height, GEN7_SURFACE_DW2_HEIGHT) | - GEN_SHIFT32(width, GEN7_SURFACE_DW2_WIDTH); - - dw[3] = GEN_SHIFT32(depth, GEN7_SURFACE_DW3_DEPTH) | - pitch; - - dw[4] = 0; - dw[5] = 0; - - dw[6] = 0; - dw[7] = 0; - - if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) { - dw[7] |= GEN_SHIFT32(GEN75_SCS_RED, GEN75_SURFACE_DW7_SCS_R) | - GEN_SHIFT32(GEN75_SCS_GREEN, GEN75_SURFACE_DW7_SCS_G) | - GEN_SHIFT32(GEN75_SCS_BLUE, GEN75_SURFACE_DW7_SCS_B) | - GEN_SHIFT32(GEN75_SCS_ALPHA, GEN75_SURFACE_DW7_SCS_A); - } -} - -static void -view_init_for_image_gen7(const struct ilo_dev *dev, - const struct ilo_image *img, - enum pipe_texture_target target, - enum pipe_format format, - unsigned first_level, - unsigned num_levels, - unsigned first_layer, - unsigned num_layers, - bool is_rt, - struct ilo_view_surface *surf) -{ - int surface_type, surface_format; - int width, height, depth, pitch, lod; - uint32_t *dw; - - ILO_DEV_ASSERT(dev, 7, 8); - - surface_type = ilo_gpe_gen6_translate_texture(target); - assert(surface_type != GEN6_SURFTYPE_BUFFER); - - if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT && img->separate_stencil) - format = PIPE_FORMAT_Z32_FLOAT; - - if (is_rt) - surface_format = ilo_format_translate_render(dev, format); - else - surface_format = ilo_format_translate_texture(dev, format); - assert(surface_format >= 0); - - width = img->width0; - height = img->height0; - depth = (target == PIPE_TEXTURE_3D) ? img->depth0 : num_layers; - pitch = img->bo_stride; - - if (surface_type == GEN6_SURFTYPE_CUBE) { - /* - * From the Ivy Bridge PRM, volume 4 part 1, page 70: - * - * "For SURFTYPE_CUBE:For Sampling Engine Surfaces, the range of - * this field is [0,340], indicating the number of cube array - * elements (equal to the number of underlying 2D array elements - * divided by 6). For other surfaces, this field must be zero." - * - * When is_rt is true, we treat the texture as a 2D one to avoid the - * restriction. 
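- *
- * A concrete example: a sampled view of a 12-layer cube array is
- * programmed with a depth of 12 / 6 = 2, while the same view used as
- * a render target keeps all 12 layers, since the switch to
- * GEN6_SURFTYPE_2D below skips the division.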
- */ - if (is_rt) { - surface_type = GEN6_SURFTYPE_2D; - } - else { - assert(num_layers % 6 == 0); - depth = num_layers / 6; - } - } - - /* sanity check the size */ - assert(width >= 1 && height >= 1 && depth >= 1 && pitch >= 1); - assert(first_layer < 2048 && num_layers <= 2048); - switch (surface_type) { - case GEN6_SURFTYPE_1D: - assert(width <= 16384 && height == 1 && depth <= 2048); - break; - case GEN6_SURFTYPE_2D: - assert(width <= 16384 && height <= 16384 && depth <= 2048); - break; - case GEN6_SURFTYPE_3D: - assert(width <= 2048 && height <= 2048 && depth <= 2048); - if (!is_rt) - assert(first_layer == 0); - break; - case GEN6_SURFTYPE_CUBE: - assert(width <= 16384 && height <= 16384 && depth <= 86); - assert(width == height); - if (is_rt) - assert(first_layer == 0); - break; - default: - assert(!"unexpected surface type"); - break; - } - - if (is_rt) { - assert(num_levels == 1); - lod = first_level; - } - else { - lod = num_levels - 1; - } - - /* - * From the Ivy Bridge PRM, volume 4 part 1, page 68: - * - * "The Base Address for linear render target surfaces and surfaces - * accessed with the typed surface read/write data port messages must - * be element-size aligned, for non-YUV surface formats, or a multiple - * of 2 element-sizes for YUV surface formats. Other linear surfaces - * have no alignment requirements (byte alignment is sufficient)." - * - * From the Ivy Bridge PRM, volume 4 part 1, page 70: - * - * "For linear render target surfaces and surfaces accessed with the - * typed data port messages, the pitch must be a multiple of the - * element size for non-YUV surface formats. Pitch must be a multiple - * of 2 * element size for YUV surface formats. For linear surfaces - * with Surface Type of SURFTYPE_STRBUF, the pitch must be a multiple - * of 4 bytes.For other linear surfaces, the pitch can be any multiple - * of bytes." - * - * From the Ivy Bridge PRM, volume 4 part 1, page 74: - * - * "For linear surfaces, this field (X Offset) must be zero." - */ - if (img->tiling == GEN6_TILING_NONE) { - if (is_rt) { - const int elem_size = util_format_get_blocksize(format); - assert(pitch % elem_size == 0); - } - } - - STATIC_ASSERT(Elements(surf->payload) >= 13); - dw = surf->payload; - - dw[0] = surface_type << GEN7_SURFACE_DW0_TYPE__SHIFT | - surface_format << GEN7_SURFACE_DW0_FORMAT__SHIFT; - - /* - * From the Ivy Bridge PRM, volume 4 part 1, page 63: - * - * "If this field (Surface Array) is enabled, the Surface Type must be - * SURFTYPE_1D, SURFTYPE_2D, or SURFTYPE_CUBE. If this field is - * disabled and Surface Type is SURFTYPE_1D, SURFTYPE_2D, or - * SURFTYPE_CUBE, the Depth field must be set to zero." - * - * For non-3D sampler surfaces, resinfo (the sampler message) always - * returns zero for the number of layers when this field is not set. 
- */ - if (surface_type != GEN6_SURFTYPE_3D) { - switch (target) { - case PIPE_TEXTURE_1D_ARRAY: - case PIPE_TEXTURE_2D_ARRAY: - case PIPE_TEXTURE_CUBE_ARRAY: - dw[0] |= GEN7_SURFACE_DW0_IS_ARRAY; - break; - default: - assert(depth == 1); - break; - } - } - - if (ilo_dev_gen(dev) >= ILO_GEN(8)) { - switch (img->align_j) { - case 4: - dw[0] |= GEN7_SURFACE_DW0_VALIGN_4; - break; - case 8: - dw[0] |= GEN8_SURFACE_DW0_VALIGN_8; - break; - case 16: - dw[0] |= GEN8_SURFACE_DW0_VALIGN_16; - break; - default: - assert(!"unsupported valign"); - break; - } - - switch (img->align_i) { - case 4: - dw[0] |= GEN8_SURFACE_DW0_HALIGN_4; - break; - case 8: - dw[0] |= GEN8_SURFACE_DW0_HALIGN_8; - break; - case 16: - dw[0] |= GEN8_SURFACE_DW0_HALIGN_16; - break; - default: - assert(!"unsupported halign"); - break; - } - - dw[0] |= img->tiling << GEN8_SURFACE_DW0_TILING__SHIFT; - } else { - assert(img->align_i == 4 || img->align_i == 8); - assert(img->align_j == 2 || img->align_j == 4); - - if (img->align_j == 4) - dw[0] |= GEN7_SURFACE_DW0_VALIGN_4; - - if (img->align_i == 8) - dw[0] |= GEN7_SURFACE_DW0_HALIGN_8; - - assert(img->tiling != GEN8_TILING_W); - dw[0] |= img->tiling << GEN7_SURFACE_DW0_TILING__SHIFT; - - if (img->walk == ILO_IMAGE_WALK_LOD) - dw[0] |= GEN7_SURFACE_DW0_ARYSPC_LOD0; - else - dw[0] |= GEN7_SURFACE_DW0_ARYSPC_FULL; - } - - if (is_rt) - dw[0] |= GEN7_SURFACE_DW0_RENDER_CACHE_RW; - - if (surface_type == GEN6_SURFTYPE_CUBE && !is_rt) - dw[0] |= GEN7_SURFACE_DW0_CUBE_FACE_ENABLES__MASK; - - if (ilo_dev_gen(dev) >= ILO_GEN(8)) { - assert(img->walk_layer_height % 4 == 0); - dw[1] = img->walk_layer_height / 4; - } else { - dw[1] = 0; - } - - dw[2] = GEN_SHIFT32(height - 1, GEN7_SURFACE_DW2_HEIGHT) | - GEN_SHIFT32(width - 1, GEN7_SURFACE_DW2_WIDTH); - - dw[3] = GEN_SHIFT32(depth - 1, GEN7_SURFACE_DW3_DEPTH) | - (pitch - 1); - - dw[4] = first_layer << 18 | - (num_layers - 1) << 7; - - /* - * MSFMT_MSS means the samples are not interleaved and MSFMT_DEPTH_STENCIL - * means the samples are interleaved. The layouts are the same when the - * number of samples is 1. 
- */ - if (img->interleaved_samples && img->sample_count > 1) { - assert(!is_rt); - dw[4] |= GEN7_SURFACE_DW4_MSFMT_DEPTH_STENCIL; - } - else { - dw[4] |= GEN7_SURFACE_DW4_MSFMT_MSS; - } - - switch (img->sample_count) { - case 0: - case 1: - default: - dw[4] |= GEN7_SURFACE_DW4_MULTISAMPLECOUNT_1; - break; - case 2: - dw[4] |= GEN8_SURFACE_DW4_MULTISAMPLECOUNT_2; - break; - case 4: - dw[4] |= GEN7_SURFACE_DW4_MULTISAMPLECOUNT_4; - break; - case 8: - dw[4] |= GEN7_SURFACE_DW4_MULTISAMPLECOUNT_8; - break; - case 16: - dw[4] |= GEN8_SURFACE_DW4_MULTISAMPLECOUNT_16; - break; - } - - dw[5] = GEN_SHIFT32(first_level, GEN7_SURFACE_DW5_MIN_LOD) | - lod; - - dw[6] = 0; - dw[7] = 0; - - if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) { - dw[7] |= GEN_SHIFT32(GEN75_SCS_RED, GEN75_SURFACE_DW7_SCS_R) | - GEN_SHIFT32(GEN75_SCS_GREEN, GEN75_SURFACE_DW7_SCS_G) | - GEN_SHIFT32(GEN75_SCS_BLUE, GEN75_SURFACE_DW7_SCS_B) | - GEN_SHIFT32(GEN75_SCS_ALPHA, GEN75_SURFACE_DW7_SCS_A); - } - - if (ilo_dev_gen(dev) >= ILO_GEN(8)) - memset(&dw[8], 0, sizeof(*dw) * (13 - 8)); -} - -void -ilo_gpe_init_view_surface_null(const struct ilo_dev *dev, - unsigned width, unsigned height, - unsigned depth, unsigned level, - struct ilo_view_surface *surf) -{ - if (ilo_dev_gen(dev) >= ILO_GEN(7)) { - view_init_null_gen7(dev, - width, height, depth, level, surf); - } else { - view_init_null_gen6(dev, - width, height, depth, level, surf); - } - - surf->bo = NULL; - surf->scanout = false; -} - -void -ilo_gpe_init_view_surface_for_buffer(const struct ilo_dev *dev, - const struct ilo_buffer *buf, - unsigned offset, unsigned size, - unsigned struct_size, - enum pipe_format elem_format, - bool is_rt, bool render_cache_rw, - struct ilo_view_surface *surf) -{ - if (ilo_dev_gen(dev) >= ILO_GEN(7)) { - view_init_for_buffer_gen7(dev, buf, offset, size, - struct_size, elem_format, is_rt, render_cache_rw, surf); - } else { - view_init_for_buffer_gen6(dev, buf, offset, size, - struct_size, elem_format, is_rt, render_cache_rw, surf); - } - - /* do not increment reference count */ - surf->bo = buf->bo; - surf->scanout = false; -} - -void -ilo_gpe_init_view_surface_for_image(const struct ilo_dev *dev, - const struct ilo_image *img, - enum pipe_texture_target target, - enum pipe_format format, - unsigned first_level, - unsigned num_levels, - unsigned first_layer, - unsigned num_layers, - bool is_rt, - struct ilo_view_surface *surf) -{ - if (ilo_dev_gen(dev) >= ILO_GEN(7)) { - view_init_for_image_gen7(dev, img, target, format, - first_level, num_levels, first_layer, num_layers, - is_rt, surf); - } else { - view_init_for_image_gen6(dev, img, target, format, - first_level, num_levels, first_layer, num_layers, - is_rt, surf); - } - - surf->scanout = img->scanout; - /* do not increment reference count */ - surf->bo = img->bo; -} - -static void -sampler_init_border_color_gen6(const struct ilo_dev *dev, - const union pipe_color_union *color, - uint32_t *dw, int num_dwords) -{ - float rgba[4] = { - color->f[0], color->f[1], color->f[2], color->f[3], - }; - - ILO_DEV_ASSERT(dev, 6, 6); - - assert(num_dwords >= 12); - - /* - * This state is not documented in the Sandy Bridge PRM, but in the - * Ironlake PRM. SNORM8 seems to be in DW11 instead of DW1. 
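- *
- * For reference, the dword layout written below is:
- *
- *   dw[0]         UNORM8   (R, G, B, A packed into one dword)
- *   dw[1]-dw[4]   IEEE_FP  (one channel per dword)
- *   dw[5]-dw[6]   FLOAT_16 (two channels per dword)
- *   dw[7]-dw[8]   UNORM16  (two channels per dword)
- *   dw[9]-dw[10]  SNORM16  (two channels per dword)
- *   dw[11]        SNORM8   (R, G, B, A packed into one dword)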
- */ - - /* IEEE_FP */ - dw[1] = fui(rgba[0]); - dw[2] = fui(rgba[1]); - dw[3] = fui(rgba[2]); - dw[4] = fui(rgba[3]); - - /* FLOAT_16 */ - dw[5] = util_float_to_half(rgba[0]) | - util_float_to_half(rgba[1]) << 16; - dw[6] = util_float_to_half(rgba[2]) | - util_float_to_half(rgba[3]) << 16; - - /* clamp to [-1.0f, 1.0f] */ - rgba[0] = CLAMP(rgba[0], -1.0f, 1.0f); - rgba[1] = CLAMP(rgba[1], -1.0f, 1.0f); - rgba[2] = CLAMP(rgba[2], -1.0f, 1.0f); - rgba[3] = CLAMP(rgba[3], -1.0f, 1.0f); - - /* SNORM16 */ - dw[9] = (int16_t) util_iround(rgba[0] * 32767.0f) | - (int16_t) util_iround(rgba[1] * 32767.0f) << 16; - dw[10] = (int16_t) util_iround(rgba[2] * 32767.0f) | - (int16_t) util_iround(rgba[3] * 32767.0f) << 16; - - /* SNORM8 */ - dw[11] = (int8_t) util_iround(rgba[0] * 127.0f) | - (int8_t) util_iround(rgba[1] * 127.0f) << 8 | - (int8_t) util_iround(rgba[2] * 127.0f) << 16 | - (int8_t) util_iround(rgba[3] * 127.0f) << 24; - - /* clamp to [0.0f, 1.0f] */ - rgba[0] = CLAMP(rgba[0], 0.0f, 1.0f); - rgba[1] = CLAMP(rgba[1], 0.0f, 1.0f); - rgba[2] = CLAMP(rgba[2], 0.0f, 1.0f); - rgba[3] = CLAMP(rgba[3], 0.0f, 1.0f); - - /* UNORM8 */ - dw[0] = (uint8_t) util_iround(rgba[0] * 255.0f) | - (uint8_t) util_iround(rgba[1] * 255.0f) << 8 | - (uint8_t) util_iround(rgba[2] * 255.0f) << 16 | - (uint8_t) util_iround(rgba[3] * 255.0f) << 24; - - /* UNORM16 */ - dw[7] = (uint16_t) util_iround(rgba[0] * 65535.0f) | - (uint16_t) util_iround(rgba[1] * 65535.0f) << 16; - dw[8] = (uint16_t) util_iround(rgba[2] * 65535.0f) | - (uint16_t) util_iround(rgba[3] * 65535.0f) << 16; -} - -/** - * Translate a pipe texture mipfilter to the matching hardware mipfilter. - */ -static int -gen6_translate_tex_mipfilter(unsigned filter) -{ - switch (filter) { - case PIPE_TEX_MIPFILTER_NEAREST: return GEN6_MIPFILTER_NEAREST; - case PIPE_TEX_MIPFILTER_LINEAR: return GEN6_MIPFILTER_LINEAR; - case PIPE_TEX_MIPFILTER_NONE: return GEN6_MIPFILTER_NONE; - default: - assert(!"unknown mipfilter"); - return GEN6_MIPFILTER_NONE; - } -} - -/** - * Translate a pipe texture filter to the matching hardware mapfilter. - */ -static int -gen6_translate_tex_filter(unsigned filter) -{ - switch (filter) { - case PIPE_TEX_FILTER_NEAREST: return GEN6_MAPFILTER_NEAREST; - case PIPE_TEX_FILTER_LINEAR: return GEN6_MAPFILTER_LINEAR; - default: - assert(!"unknown sampler filter"); - return GEN6_MAPFILTER_NEAREST; - } -} - -/** - * Translate a pipe texture coordinate wrapping mode to the matching hardware - * wrapping mode. - */ -static int -gen6_translate_tex_wrap(unsigned wrap) -{ - switch (wrap) { - case PIPE_TEX_WRAP_CLAMP: return GEN8_TEXCOORDMODE_HALF_BORDER; - case PIPE_TEX_WRAP_REPEAT: return GEN6_TEXCOORDMODE_WRAP; - case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return GEN6_TEXCOORDMODE_CLAMP; - case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return GEN6_TEXCOORDMODE_CLAMP_BORDER; - case PIPE_TEX_WRAP_MIRROR_REPEAT: return GEN6_TEXCOORDMODE_MIRROR; - case PIPE_TEX_WRAP_MIRROR_CLAMP: - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: - default: - assert(!"unknown sampler wrap mode"); - return GEN6_TEXCOORDMODE_WRAP; - } -} - -/** - * Translate a pipe shadow compare function to the matching hardware shadow - * function. - */ -static int -gen6_translate_shadow_func(unsigned func) -{ - /* - * For PIPE_FUNC_x, the reference value is on the left-hand side of the - * comparison, and 1.0 is returned when the comparison is true. 
- *
- * For GEN6_COMPAREFUNCTION_x, the reference value is on the right-hand side of
- * the comparison, and 0.0 is returned when the comparison is true.
- */
- switch (func) {
- case PIPE_FUNC_NEVER: return GEN6_COMPAREFUNCTION_ALWAYS;
- case PIPE_FUNC_LESS: return GEN6_COMPAREFUNCTION_LEQUAL;
- case PIPE_FUNC_EQUAL: return GEN6_COMPAREFUNCTION_NOTEQUAL;
- case PIPE_FUNC_LEQUAL: return GEN6_COMPAREFUNCTION_LESS;
- case PIPE_FUNC_GREATER: return GEN6_COMPAREFUNCTION_GEQUAL;
- case PIPE_FUNC_NOTEQUAL: return GEN6_COMPAREFUNCTION_EQUAL;
- case PIPE_FUNC_GEQUAL: return GEN6_COMPAREFUNCTION_GREATER;
- case PIPE_FUNC_ALWAYS: return GEN6_COMPAREFUNCTION_NEVER;
- default:
- assert(!"unknown shadow compare function");
- return GEN6_COMPAREFUNCTION_NEVER;
- }
-}
-
-void
-ilo_gpe_init_sampler_cso(const struct ilo_dev *dev,
- const struct pipe_sampler_state *state,
- struct ilo_sampler_cso *sampler)
-{
- int mip_filter, min_filter, mag_filter, max_aniso;
- int lod_bias, max_lod, min_lod;
- int wrap_s, wrap_t, wrap_r, wrap_cube;
- uint32_t dw0, dw1, dw3;
-
- ILO_DEV_ASSERT(dev, 6, 8);
-
- memset(sampler, 0, sizeof(*sampler));
-
- mip_filter = gen6_translate_tex_mipfilter(state->min_mip_filter);
- min_filter = gen6_translate_tex_filter(state->min_img_filter);
- mag_filter = gen6_translate_tex_filter(state->mag_img_filter);
-
- sampler->anisotropic = state->max_anisotropy;
-
- if (state->max_anisotropy >= 2 && state->max_anisotropy <= 16)
- max_aniso = state->max_anisotropy / 2 - 1;
- else if (state->max_anisotropy > 16)
- max_aniso = GEN6_ANISORATIO_16;
- else
- max_aniso = GEN6_ANISORATIO_2;
-
- /*
- *
- * Here is how the hardware calculates per-pixel LOD, from my reading of the
- * PRMs:
- *
- * 1) LOD is set to log2(ratio of texels to pixels) if not specified in
- * other ways. The number of texels is measured using level
- * SurfMinLod.
- * 2) Bias is added to LOD.
- * 3) LOD is clamped to [MinLod, MaxLod], and the clamped value is
- * compared with Base to determine whether magnification or
- * minification is needed. (if preclamp is disabled, LOD is compared
- * with Base before clamping)
- * 4) If magnification is needed, or no mipmapping is requested, LOD is
- * set to floor(MinLod).
- * 5) LOD is clamped to [0, MIPCnt], and SurfMinLod is added to LOD.
- *
- * With the Gallium interface, Base is always zero and
- * pipe_sampler_view::u.tex.first_level specifies SurfMinLod.
- */
- if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
- const float scale = 256.0f;
-
- /* [-16.0, 16.0) in S4.8 */
- lod_bias = (int)
- (CLAMP(state->lod_bias, -16.0f, 15.9f) * scale);
- lod_bias &= 0x1fff;
-
- /* [0.0, 14.0] in U4.8 */
- max_lod = (int) (CLAMP(state->max_lod, 0.0f, 14.0f) * scale);
- min_lod = (int) (CLAMP(state->min_lod, 0.0f, 14.0f) * scale);
- }
- else {
- const float scale = 64.0f;
-
- /* [-16.0, 16.0) in S4.6 */
- lod_bias = (int)
- (CLAMP(state->lod_bias, -16.0f, 15.9f) * scale);
- lod_bias &= 0x7ff;
-
- /* [0.0, 13.0] in U4.6 */
- max_lod = (int) (CLAMP(state->max_lod, 0.0f, 13.0f) * scale);
- min_lod = (int) (CLAMP(state->min_lod, 0.0f, 13.0f) * scale);
- }
-
- /*
- * We want LOD to be clamped to determine magnification/minification, and
- * get set to zero when it is magnification or when mipmapping is disabled.
- * The hardware would set LOD to floor(MinLod) and that is a problem when
- * MinLod is greater than or equal to 1.0f.
- *
- * With Base being zero, it is always minification when MinLod is non-zero.
- * To achieve our goal, we just need to set MinLod to zero and set
- * MagFilter to MinFilter when mipmapping is disabled.
- */
- if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE && min_lod) {
- min_lod = 0;
- mag_filter = min_filter;
- }
-
- /* determine wrap s/t/r */
- wrap_s = gen6_translate_tex_wrap(state->wrap_s);
- wrap_t = gen6_translate_tex_wrap(state->wrap_t);
- wrap_r = gen6_translate_tex_wrap(state->wrap_r);
- if (ilo_dev_gen(dev) < ILO_GEN(8)) {
- /*
- * For nearest filtering, PIPE_TEX_WRAP_CLAMP means
- * PIPE_TEX_WRAP_CLAMP_TO_EDGE; for linear filtering,
- * PIPE_TEX_WRAP_CLAMP means PIPE_TEX_WRAP_CLAMP_TO_BORDER while
- * additionally clamping the texture coordinates to [0.0, 1.0].
- *
- * PIPE_TEX_WRAP_CLAMP is not supported natively until Gen8. The
- * clamping has to be taken care of in the shaders. There are two
- * filters here, but we let the minification one have a say.
- */
- const bool clamp_is_to_edge =
- (state->min_img_filter == PIPE_TEX_FILTER_NEAREST);
-
- if (clamp_is_to_edge) {
- if (wrap_s == GEN8_TEXCOORDMODE_HALF_BORDER)
- wrap_s = GEN6_TEXCOORDMODE_CLAMP;
- if (wrap_t == GEN8_TEXCOORDMODE_HALF_BORDER)
- wrap_t = GEN6_TEXCOORDMODE_CLAMP;
- if (wrap_r == GEN8_TEXCOORDMODE_HALF_BORDER)
- wrap_r = GEN6_TEXCOORDMODE_CLAMP;
- } else {
- if (wrap_s == GEN8_TEXCOORDMODE_HALF_BORDER) {
- wrap_s = GEN6_TEXCOORDMODE_CLAMP_BORDER;
- sampler->saturate_s = true;
- }
- if (wrap_t == GEN8_TEXCOORDMODE_HALF_BORDER) {
- wrap_t = GEN6_TEXCOORDMODE_CLAMP_BORDER;
- sampler->saturate_t = true;
- }
- if (wrap_r == GEN8_TEXCOORDMODE_HALF_BORDER) {
- wrap_r = GEN6_TEXCOORDMODE_CLAMP_BORDER;
- sampler->saturate_r = true;
- }
- }
- }
-
- /*
- * From the Sandy Bridge PRM, volume 4 part 1, page 107:
- *
- * "When using cube map texture coordinates, only TEXCOORDMODE_CLAMP
- * and TEXCOORDMODE_CUBE settings are valid, and each TC component
- * must have the same Address Control mode."
- *
- * From the Ivy Bridge PRM, volume 4 part 1, page 96:
- *
- * "This field (Cube Surface Control Mode) must be set to
- * CUBECTRLMODE_PROGRAMMED"
- *
- * Therefore, we cannot use "Cube Surface Control Mode" for seamless cube
- * map filtering.
- */
- if (state->seamless_cube_map &&
- (state->min_img_filter != PIPE_TEX_FILTER_NEAREST ||
- state->mag_img_filter != PIPE_TEX_FILTER_NEAREST)) {
- wrap_cube = GEN6_TEXCOORDMODE_CUBE;
- }
- else {
- wrap_cube = GEN6_TEXCOORDMODE_CLAMP;
- }
-
- if (!state->normalized_coords) {
- /*
- * From the Ivy Bridge PRM, volume 4 part 1, page 98:
- *
- * "The following state must be set as indicated if this field
- * (Non-normalized Coordinate Enable) is enabled:
- *
- * - TCX/Y/Z Address Control Mode must be TEXCOORDMODE_CLAMP,
- * TEXCOORDMODE_HALF_BORDER, or TEXCOORDMODE_CLAMP_BORDER.
- * - Surface Type must be SURFTYPE_2D or SURFTYPE_3D.
- * - Mag Mode Filter must be MAPFILTER_NEAREST or
- * MAPFILTER_LINEAR.
- * - Min Mode Filter must be MAPFILTER_NEAREST or
- * MAPFILTER_LINEAR.
- * - Mip Mode Filter must be MIPFILTER_NONE.
- * - Min LOD must be 0.
- * - Max LOD must be 0.
- * - MIP Count must be 0.
- * - Surface Min LOD must be 0.
- * - Texture LOD Bias must be 0."
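- *
- * For example, a blit-style sampler that satisfies all of the above
- * could look like this (a sketch; fields not listed are zero):
- *
- *   struct pipe_sampler_state samp = {
- *      .wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE,
- *      .wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE,
- *      .wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE,
- *      .min_img_filter = PIPE_TEX_FILTER_NEAREST,
- *      .min_mip_filter = PIPE_TEX_MIPFILTER_NONE,
- *      .mag_img_filter = PIPE_TEX_FILTER_NEAREST,
- *      .normalized_coords = 0,
- *   };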
- */ - assert(wrap_s == GEN6_TEXCOORDMODE_CLAMP || - wrap_s == GEN6_TEXCOORDMODE_CLAMP_BORDER); - assert(wrap_t == GEN6_TEXCOORDMODE_CLAMP || - wrap_t == GEN6_TEXCOORDMODE_CLAMP_BORDER); - assert(wrap_r == GEN6_TEXCOORDMODE_CLAMP || - wrap_r == GEN6_TEXCOORDMODE_CLAMP_BORDER); - - assert(mag_filter == GEN6_MAPFILTER_NEAREST || - mag_filter == GEN6_MAPFILTER_LINEAR); - assert(min_filter == GEN6_MAPFILTER_NEAREST || - min_filter == GEN6_MAPFILTER_LINEAR); - - /* work around a bug in util_blitter */ - mip_filter = GEN6_MIPFILTER_NONE; - - assert(mip_filter == GEN6_MIPFILTER_NONE); - } - - if (ilo_dev_gen(dev) >= ILO_GEN(7)) { - dw0 = 1 << 28 | - mip_filter << 20 | - lod_bias << 1; - - sampler->dw_filter = mag_filter << 17 | - min_filter << 14; - - sampler->dw_filter_aniso = GEN6_MAPFILTER_ANISOTROPIC << 17 | - GEN6_MAPFILTER_ANISOTROPIC << 14 | - 1; - - dw1 = min_lod << 20 | - max_lod << 8; - - if (state->compare_mode != PIPE_TEX_COMPARE_NONE) - dw1 |= gen6_translate_shadow_func(state->compare_func) << 1; - - dw3 = max_aniso << 19; - - /* round the coordinates for linear filtering */ - if (min_filter != GEN6_MAPFILTER_NEAREST) { - dw3 |= (GEN6_SAMPLER_DW3_U_MIN_ROUND | - GEN6_SAMPLER_DW3_V_MIN_ROUND | - GEN6_SAMPLER_DW3_R_MIN_ROUND); - } - if (mag_filter != GEN6_MAPFILTER_NEAREST) { - dw3 |= (GEN6_SAMPLER_DW3_U_MAG_ROUND | - GEN6_SAMPLER_DW3_V_MAG_ROUND | - GEN6_SAMPLER_DW3_R_MAG_ROUND); - } - - if (!state->normalized_coords) - dw3 |= 1 << 10; - - sampler->dw_wrap = wrap_s << 6 | - wrap_t << 3 | - wrap_r; - - /* - * As noted in the classic i965 driver, the HW may still reference - * wrap_t and wrap_r for 1D textures. We need to set them to a safe - * mode - */ - sampler->dw_wrap_1d = wrap_s << 6 | - GEN6_TEXCOORDMODE_WRAP << 3 | - GEN6_TEXCOORDMODE_WRAP; - - sampler->dw_wrap_cube = wrap_cube << 6 | - wrap_cube << 3 | - wrap_cube; - - STATIC_ASSERT(Elements(sampler->payload) >= 7); - - sampler->payload[0] = dw0; - sampler->payload[1] = dw1; - sampler->payload[2] = dw3; - - memcpy(&sampler->payload[3], - state->border_color.ui, sizeof(state->border_color.ui)); - } - else { - dw0 = 1 << 28 | - mip_filter << 20 | - lod_bias << 3; - - if (state->compare_mode != PIPE_TEX_COMPARE_NONE) - dw0 |= gen6_translate_shadow_func(state->compare_func); - - sampler->dw_filter = (min_filter != mag_filter) << 27 | - mag_filter << 17 | - min_filter << 14; - - sampler->dw_filter_aniso = GEN6_MAPFILTER_ANISOTROPIC << 17 | - GEN6_MAPFILTER_ANISOTROPIC << 14; - - dw1 = min_lod << 22 | - max_lod << 12; - - sampler->dw_wrap = wrap_s << 6 | - wrap_t << 3 | - wrap_r; - - sampler->dw_wrap_1d = wrap_s << 6 | - GEN6_TEXCOORDMODE_WRAP << 3 | - GEN6_TEXCOORDMODE_WRAP; - - sampler->dw_wrap_cube = wrap_cube << 6 | - wrap_cube << 3 | - wrap_cube; - - dw3 = max_aniso << 19; - - /* round the coordinates for linear filtering */ - if (min_filter != GEN6_MAPFILTER_NEAREST) { - dw3 |= (GEN6_SAMPLER_DW3_U_MIN_ROUND | - GEN6_SAMPLER_DW3_V_MIN_ROUND | - GEN6_SAMPLER_DW3_R_MIN_ROUND); - } - if (mag_filter != GEN6_MAPFILTER_NEAREST) { - dw3 |= (GEN6_SAMPLER_DW3_U_MAG_ROUND | - GEN6_SAMPLER_DW3_V_MAG_ROUND | - GEN6_SAMPLER_DW3_R_MAG_ROUND); - } - - if (!state->normalized_coords) - dw3 |= 1; - - STATIC_ASSERT(Elements(sampler->payload) >= 15); - - sampler->payload[0] = dw0; - sampler->payload[1] = dw1; - sampler->payload[2] = dw3; - - sampler_init_border_color_gen6(dev, - &state->border_color, &sampler->payload[3], 12); - } -} diff --git a/src/gallium/drivers/ilo/core/ilo_state_cc.c b/src/gallium/drivers/ilo/core/ilo_state_cc.c new file mode 
100644 index 00000000000..83ee8de979c --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_cc.c @@ -0,0 +1,890 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2012-2015 LunarG, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#include "ilo_debug.h" +#include "ilo_state_cc.h" + +static bool +cc_validate_gen6_stencil(const struct ilo_dev *dev, + const struct ilo_state_cc_info *info) +{ + const struct ilo_state_cc_stencil_info *stencil = &info->stencil; + + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 359: + * + * "If the Depth Buffer is either undefined or does not have a surface + * format of D32_FLOAT_S8X24_UINT or D24_UNORM_S8_UINT and separate + * stencil buffer is disabled, Stencil Test Enable must be DISABLED" + * + * From the Sandy Bridge PRM, volume 2 part 1, page 370: + * + * "This field (Stencil Test Enable) cannot be enabled if Surface + * Format in 3DSTATE_DEPTH_BUFFER is set to D16_UNORM." + */ + if (stencil->test_enable) + assert(stencil->cv_has_buffer); + + return true; +} + +static bool +cc_validate_gen6_depth(const struct ilo_dev *dev, + const struct ilo_state_cc_info *info) +{ + const struct ilo_state_cc_depth_info *depth = &info->depth; + + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 360: + * + * "Enabling the Depth Test function without defining a Depth Buffer is + * UNDEFINED." + * + * From the Sandy Bridge PRM, volume 2 part 1, page 375: + * + * "A Depth Buffer must be defined before enabling writes to it, or + * operation is UNDEFINED." 
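+ *
+ * In other words, callers derive cv_has_buffer from the bound
+ * framebuffer before enabling either bit, e.g. (a sketch, with fb and
+ * dsa standing for the current pipe_framebuffer_state and
+ * pipe_depth_stencil_alpha_state):
+ *
+ *   info->depth.cv_has_buffer = (fb->zsbuf != NULL);
+ *   info->depth.test_enable = dsa->depth.enabled &&
+ *                             info->depth.cv_has_buffer;
+ *   info->depth.write_enable = dsa->depth.writemask &&
+ *                              info->depth.cv_has_buffer;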
+ */ + if (depth->test_enable || depth->write_enable) + assert(depth->cv_has_buffer); + + return true; +} + +static bool +cc_set_gen6_DEPTH_STENCIL_STATE(struct ilo_state_cc *cc, + const struct ilo_dev *dev, + const struct ilo_state_cc_info *info) +{ + const struct ilo_state_cc_stencil_info *stencil = &info->stencil; + const struct ilo_state_cc_depth_info *depth = &info->depth; + const struct ilo_state_cc_params_info *params = &info->params; + uint32_t dw0, dw1, dw2; + + ILO_DEV_ASSERT(dev, 6, 7.5); + + if (!cc_validate_gen6_stencil(dev, info) || + !cc_validate_gen6_depth(dev, info)) + return false; + + dw0 = 0; + dw1 = 0; + if (stencil->test_enable) { + const struct ilo_state_cc_stencil_op_info *front = &stencil->front; + const struct ilo_state_cc_stencil_params_info *front_p = + ¶ms->stencil_front; + const struct ilo_state_cc_stencil_op_info *back; + const struct ilo_state_cc_stencil_params_info *back_p; + + dw0 |= GEN6_ZS_DW0_STENCIL_TEST_ENABLE; + + if (stencil->twosided_enable) { + dw0 |= GEN6_ZS_DW0_STENCIL1_ENABLE; + + back = &stencil->back; + back_p = ¶ms->stencil_back; + } else { + back = &stencil->front; + back_p = ¶ms->stencil_front; + } + + dw0 |= front->test_func << GEN6_ZS_DW0_STENCIL_FUNC__SHIFT | + front->fail_op << GEN6_ZS_DW0_STENCIL_FAIL_OP__SHIFT | + front->zfail_op << GEN6_ZS_DW0_STENCIL_ZFAIL_OP__SHIFT | + front->zpass_op << GEN6_ZS_DW0_STENCIL_ZPASS_OP__SHIFT | + back->test_func << GEN6_ZS_DW0_STENCIL1_FUNC__SHIFT | + back->fail_op << GEN6_ZS_DW0_STENCIL1_FAIL_OP__SHIFT | + back->zfail_op << GEN6_ZS_DW0_STENCIL1_ZFAIL_OP__SHIFT | + back->zpass_op << GEN6_ZS_DW0_STENCIL1_ZPASS_OP__SHIFT; + + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 363: + * + * "If this field (Stencil Buffer Write Enable) is enabled, Stencil + * Test Enable must also be enabled." + * + * This is different from depth write enable, which is independent from + * depth test enable. 
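+ *
+ * For example (a sketch), with
+ *
+ *   info->params.stencil_front.write_mask = 0xff;
+ *   info->params.stencil_back.write_mask = 0x00;
+ *
+ * the write enable below is still set; only when both masks are zero
+ * is it left clear, making the stencil buffer effectively read-only.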
+ */ + if (front_p->write_mask || back_p->write_mask) + dw0 |= GEN6_ZS_DW0_STENCIL_WRITE_ENABLE; + + dw1 |= front_p->test_mask << GEN6_ZS_DW1_STENCIL_TEST_MASK__SHIFT | + front_p->write_mask << GEN6_ZS_DW1_STENCIL_WRITE_MASK__SHIFT | + back_p->test_mask << GEN6_ZS_DW1_STENCIL1_TEST_MASK__SHIFT | + back_p->write_mask << GEN6_ZS_DW1_STENCIL1_WRITE_MASK__SHIFT; + } + + dw2 = 0; + if (depth->test_enable) { + dw2 |= GEN6_ZS_DW2_DEPTH_TEST_ENABLE | + depth->test_func << GEN6_ZS_DW2_DEPTH_FUNC__SHIFT; + } else { + dw2 |= GEN6_COMPAREFUNCTION_ALWAYS << GEN6_ZS_DW2_DEPTH_FUNC__SHIFT; + } + + /* independent from depth->test_enable */ + if (depth->write_enable) + dw2 |= GEN6_ZS_DW2_DEPTH_WRITE_ENABLE; + + STATIC_ASSERT(ARRAY_SIZE(cc->ds) >= 3); + cc->ds[0] = dw0; + cc->ds[1] = dw1; + cc->ds[2] = dw2; + + return true; +} + +static bool +cc_set_gen8_3DSTATE_WM_DEPTH_STENCIL(struct ilo_state_cc *cc, + const struct ilo_dev *dev, + const struct ilo_state_cc_info *info) +{ + const struct ilo_state_cc_stencil_info *stencil = &info->stencil; + const struct ilo_state_cc_depth_info *depth = &info->depth; + const struct ilo_state_cc_params_info *params = &info->params; + uint32_t dw1, dw2; + + ILO_DEV_ASSERT(dev, 8, 8); + + if (!cc_validate_gen6_stencil(dev, info) || + !cc_validate_gen6_depth(dev, info)) + return false; + + dw1 = 0; + dw2 = 0; + if (stencil->test_enable) { + const struct ilo_state_cc_stencil_op_info *front = &stencil->front; + const struct ilo_state_cc_stencil_params_info *front_p = + ¶ms->stencil_front; + const struct ilo_state_cc_stencil_op_info *back; + const struct ilo_state_cc_stencil_params_info *back_p; + + dw1 |= GEN8_ZS_DW1_STENCIL_TEST_ENABLE; + + if (stencil->twosided_enable) { + dw1 |= GEN8_ZS_DW1_STENCIL1_ENABLE; + + back = &stencil->back; + back_p = ¶ms->stencil_back; + } else { + back = &stencil->front; + back_p = ¶ms->stencil_front; + } + + dw1 |= front->fail_op << GEN8_ZS_DW1_STENCIL_FAIL_OP__SHIFT | + front->zfail_op << GEN8_ZS_DW1_STENCIL_ZFAIL_OP__SHIFT | + front->zpass_op << GEN8_ZS_DW1_STENCIL_ZPASS_OP__SHIFT | + back->test_func << GEN8_ZS_DW1_STENCIL1_FUNC__SHIFT | + back->fail_op << GEN8_ZS_DW1_STENCIL1_FAIL_OP__SHIFT | + back->zfail_op << GEN8_ZS_DW1_STENCIL1_ZFAIL_OP__SHIFT | + back->zpass_op << GEN8_ZS_DW1_STENCIL1_ZPASS_OP__SHIFT | + front->test_func << GEN8_ZS_DW1_STENCIL_FUNC__SHIFT; + + if (front_p->write_mask || back_p->write_mask) + dw1 |= GEN8_ZS_DW1_STENCIL_WRITE_ENABLE; + + dw2 |= front_p->test_mask << GEN8_ZS_DW2_STENCIL_TEST_MASK__SHIFT | + front_p->write_mask << GEN8_ZS_DW2_STENCIL_WRITE_MASK__SHIFT | + back_p->test_mask << GEN8_ZS_DW2_STENCIL1_TEST_MASK__SHIFT | + back_p->write_mask << GEN8_ZS_DW2_STENCIL1_WRITE_MASK__SHIFT; + } + + if (depth->test_enable) { + dw1 |= GEN8_ZS_DW1_DEPTH_TEST_ENABLE | + depth->test_func << GEN8_ZS_DW1_DEPTH_FUNC__SHIFT; + } else { + dw1 |= GEN6_COMPAREFUNCTION_ALWAYS << GEN8_ZS_DW1_DEPTH_FUNC__SHIFT; + } + + if (depth->write_enable) + dw1 |= GEN8_ZS_DW1_DEPTH_WRITE_ENABLE; + + STATIC_ASSERT(ARRAY_SIZE(cc->ds) >= 2); + cc->ds[0] = dw1; + cc->ds[1] = dw2; + + return true; +} + +static bool +is_dual_source_blend_factor(enum gen_blend_factor factor) +{ + switch (factor) { + case GEN6_BLENDFACTOR_SRC1_COLOR: + case GEN6_BLENDFACTOR_SRC1_ALPHA: + case GEN6_BLENDFACTOR_INV_SRC1_COLOR: + case GEN6_BLENDFACTOR_INV_SRC1_ALPHA: + return true; + default: + return false; + } +} + +static bool +cc_get_gen6_dual_source_blending(const struct ilo_dev *dev, + const struct ilo_state_cc_info *info) +{ + const struct ilo_state_cc_blend_info 
*blend = &info->blend; + bool dual_source_blending; + uint8_t i; + + ILO_DEV_ASSERT(dev, 6, 8); + + dual_source_blending = (blend->rt_count && + (is_dual_source_blend_factor(blend->rt[0].rgb_src) || + is_dual_source_blend_factor(blend->rt[0].rgb_dst) || + is_dual_source_blend_factor(blend->rt[0].a_src) || + is_dual_source_blend_factor(blend->rt[0].a_dst))); + + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 356: + * + * "Dual Source Blending: When using "Dual Source" Render Target + * Write messages, the Source1 pixel color+alpha passed in the + * message can be selected as a src/dst blend factor. See Color + * Buffer Blending. In single-source mode, those blend factor + * selections are invalid. If SRC1 is included in a src/dst blend + * factor and a DualSource RT Write message is not utilized, + * results are UNDEFINED. (This reflects the same restriction in DX + * APIs, where undefined results are produced if "o1" is not + * written by a PS - there are no default values defined). If SRC1 + * is not included in a src/dst blend factor, dual source blending + * must be disabled." + * + * From the Ivy Bridge PRM, volume 4 part 1, page 356: + * + * "The single source message will not cause a write to the render + * target if Dual Source Blend Enable in 3DSTATE_WM is enabled." + * + * "The dual source message will revert to a single source message + * using source 0 if Dual Source Blend Enable in 3DSTATE_WM is + * disabled." + * + * Dual source blending must be enabled or disabled universally. + */ + for (i = 1; i < blend->rt_count; i++) { + assert(dual_source_blending == + (is_dual_source_blend_factor(blend->rt[i].rgb_src) || + is_dual_source_blend_factor(blend->rt[i].rgb_dst) || + is_dual_source_blend_factor(blend->rt[i].a_src) || + is_dual_source_blend_factor(blend->rt[i].a_dst))); + } + + return dual_source_blending; +} + +static bool +cc_validate_gen6_alpha(const struct ilo_dev *dev, + const struct ilo_state_cc_info *info) +{ + const struct ilo_state_cc_alpha_info *alpha = &info->alpha; + + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 356: + * + * "Alpha values from the pixel shader are treated as FLOAT32 format + * for computing the AlphaToCoverage Mask." + * + * From the Sandy Bridge PRM, volume 2 part 1, page 378: + * + * "If set (AlphaToCoverage Enable), Source0 Alpha is converted to a + * temporary 1/2/4-bit coverage mask and the mask bit corresponding to + * the sample# ANDed with the sample mask bit. If set, sample coverage + * is computed based on src0 alpha value. Value of 0 disables all + * samples and value of 1 enables all samples for that pixel. The same + * coverage needs to apply to all the RTs in MRT case. Further, any + * value of src0 alpha between 0 and 1 monotonically increases the + * number of enabled pixels. + * + * The same coverage needs to be applied to all the RTs in MRT case." + * + * "If set (AlphaToOne Enable), Source0 Alpha is set to 1.0f after + * (possibly) being used to generate the AlphaToCoverage coverage + * mask. + * + * The same coverage needs to be applied to all the RTs in MRT case. + * + * If Dual Source Blending is enabled, this bit must be disabled." + * + * From the Sandy Bridge PRM, volume 2 part 1, page 382: + * + * "Alpha Test can only be enabled if Pixel Shader outputs a float + * alpha value. + * + * Alpha Test is applied independently on each render target by + * comparing that render target's alpha value against the alpha + * reference value. 
If the alpha test fails, the corresponding pixel + * write will be supressed only for that render target. The + * depth/stencil update will occur if alpha test passes for any render + * target." + * + * From the Sandy Bridge PRM, volume 4 part 1, page 194: + * + * "Multiple render targets are supported with the single source and + * replicate data messages. Each render target is accessed with a + * separate Render Target Write message, each with a different surface + * indicated (different binding table index). The depth buffer is + * written only by the message(s) to the last render target, indicated + * by the Last Render Target Select bit set to clear the pixel + * scoreboard bits." + * + * When AlphaToCoverage/AlphaToOne/AlphaTest is enabled, it is + * required/desirable for the RT write messages to set "Source0 Alpha + * Present to RenderTarget" in the MRT case. It is also required/desirable + * for the alpha values to be FLOAT32. + */ + if (alpha->alpha_to_coverage || alpha->alpha_to_one || alpha->test_enable) + assert(alpha->cv_float_source0_alpha); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 356: + * + * "[DevSNB]: When NumSamples = 1, AlphaToCoverage and AlphaTo + * Coverage Dither both must be disabled." + */ + if (ilo_dev_gen(dev) == ILO_GEN(6) && alpha->alpha_to_coverage) + assert(alpha->cv_sample_count_one); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 378: + * + * "If Dual Source Blending is enabled, this bit (AlphaToOne Enable) + * must be disabled." + */ + if (alpha->alpha_to_one) + assert(!cc_get_gen6_dual_source_blending(dev, info)); + + return true; +} + +static bool +cc_validate_gen6_blend(const struct ilo_dev *dev, + const struct ilo_state_cc_info *info) +{ + const struct ilo_state_cc_blend_info *blend = &info->blend; + + ILO_DEV_ASSERT(dev, 6, 8); + + assert(blend->rt_count <= ILO_STATE_CC_BLEND_MAX_RT_COUNT); + + return true; +} + +static enum gen_blend_factor +get_dst_alpha_one_blend_factor(enum gen_blend_factor factor, bool is_rgb) +{ + switch (factor) { + case GEN6_BLENDFACTOR_DST_ALPHA: + return GEN6_BLENDFACTOR_ONE; + case GEN6_BLENDFACTOR_INV_DST_ALPHA: + return GEN6_BLENDFACTOR_ZERO; + case GEN6_BLENDFACTOR_SRC_ALPHA_SATURATE: + return (is_rgb) ? GEN6_BLENDFACTOR_ZERO : GEN6_BLENDFACTOR_ONE; + default: + return factor; + } +} + +static void +cc_get_gen6_effective_rt(const struct ilo_dev *dev, + const struct ilo_state_cc_info *info, + uint8_t rt_index, + struct ilo_state_cc_blend_rt_info *dst) +{ + const struct ilo_state_cc_blend_rt_info *rt = &info->blend.rt[rt_index]; + + if (rt->logicop_enable || rt->blend_enable || + rt->argb_write_disables != 0xf) + assert(rt->cv_has_buffer); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 365: + * + * "Logic Ops are only supported on *_UNORM surfaces (excluding _SRGB + * variants), otherwise Logic Ops must be DISABLED." + * + * From the Broadwell PRM, volume 7, page 671: + * + * "Logic Ops are supported on all blendable render targets and render + * targets with *INT formats." + */ + if (ilo_dev_gen(dev) < ILO_GEN(8) && rt->logicop_enable) + assert(rt->cv_is_unorm); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 361: + * + * "Only certain surface formats support Color Buffer Blending. Refer + * to the Surface Format tables in Sampling Engine. Blending must be + * disabled on a RenderTarget if blending is not supported." 
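+ *
+ * Both flags are cheap for callers to derive from the render target
+ * format with the u_format helpers, e.g. (a sketch, with format
+ * standing for the render target's pipe format):
+ *
+ *   rt->cv_is_unorm = util_format_is_unorm(format) &&
+ *                     !util_format_is_srgb(format);
+ *   rt->cv_is_integer = util_format_is_pure_integer(format);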
+ * + * From the Sandy Bridge PRM, volume 2 part 1, page 365: + * + * "Color Buffer Blending and Logic Ops must not be enabled + * simultaneously, or behavior is UNDEFINED." + */ + if (rt->blend_enable) + assert(!rt->cv_is_integer && !rt->logicop_enable); + + *dst = *rt; + if (rt->blend_enable) { + /* 0x0 is reserved in enum gen_blend_factor */ + assert(rt->rgb_src && rt->rgb_dst && rt->a_src && rt->a_dst); + + if (rt->force_dst_alpha_one) { + dst->rgb_src = get_dst_alpha_one_blend_factor(rt->rgb_src, true); + dst->rgb_dst = get_dst_alpha_one_blend_factor(rt->rgb_dst, true); + dst->a_src = get_dst_alpha_one_blend_factor(rt->a_src, false); + dst->a_dst = get_dst_alpha_one_blend_factor(rt->a_dst, false); + dst->force_dst_alpha_one = false; + } + } else { + dst->rgb_src = GEN6_BLENDFACTOR_ONE; + dst->rgb_dst = GEN6_BLENDFACTOR_ZERO; + dst->rgb_func = GEN6_BLENDFUNCTION_ADD; + dst->a_src = dst->rgb_src; + dst->a_dst = dst->rgb_dst; + dst->a_func = dst->rgb_func; + } +} + +static bool +cc_set_gen6_BLEND_STATE(struct ilo_state_cc *cc, + const struct ilo_dev *dev, + const struct ilo_state_cc_info *info) +{ + const struct ilo_state_cc_alpha_info *alpha = &info->alpha; + const struct ilo_state_cc_blend_info *blend = &info->blend; + uint32_t dw_rt[2 * ILO_STATE_CC_BLEND_MAX_RT_COUNT], dw1_invariant; + uint32_t dw0, dw1; + uint8_t i; + + ILO_DEV_ASSERT(dev, 6, 7.5); + + if (!cc_validate_gen6_alpha(dev, info) || + !cc_validate_gen6_blend(dev, info)) + return false; + + /* + * According to the Sandy Bridge PRM, volume 2 part 1, page 360, pre-blend + * and post-blend color clamps must be enabled in most cases. For the + * other cases, they are either desirable or ignored. We can enable them + * unconditionally. + */ + dw1 = GEN6_RT_DW1_COLORCLAMP_RTFORMAT | + GEN6_RT_DW1_PRE_BLEND_CLAMP | + GEN6_RT_DW1_POST_BLEND_CLAMP; + + if (alpha->alpha_to_coverage) { + dw1 |= GEN6_RT_DW1_ALPHA_TO_COVERAGE; + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 379: + * + * "[DevSNB]: This bit (AlphaToCoverage Dither Enable) must be + * disabled." + */ + if (ilo_dev_gen(dev) >= ILO_GEN(7)) + dw1 |= GEN6_RT_DW1_ALPHA_TO_COVERAGE_DITHER; + } + + if (alpha->alpha_to_one) + dw1 |= GEN6_RT_DW1_ALPHA_TO_ONE; + + if (alpha->test_enable) { + dw1 |= GEN6_RT_DW1_ALPHA_TEST_ENABLE | + alpha->test_func << GEN6_RT_DW1_ALPHA_TEST_FUNC__SHIFT; + } else { + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 371: + * + * "When Alpha Test is disabled, Alpha Test Function must be + * COMPAREFUNCTION_ALWAYS." 
+ */ + dw1 |= GEN6_COMPAREFUNCTION_ALWAYS << + GEN6_RT_DW1_ALPHA_TEST_FUNC__SHIFT; + } + + if (blend->dither_enable) + dw1 |= GEN6_RT_DW1_DITHER_ENABLE; + + dw1_invariant = dw1; + + for (i = 0; i < blend->rt_count; i++) { + struct ilo_state_cc_blend_rt_info rt; + + cc_get_gen6_effective_rt(dev, info, i, &rt); + + /* 0x0 is reserved for blend factors and we have to set them all */ + dw0 = rt.a_func << GEN6_RT_DW0_ALPHA_FUNC__SHIFT | + rt.a_src << GEN6_RT_DW0_SRC_ALPHA_FACTOR__SHIFT | + rt.a_dst << GEN6_RT_DW0_DST_ALPHA_FACTOR__SHIFT | + rt.rgb_func << GEN6_RT_DW0_COLOR_FUNC__SHIFT | + rt.rgb_src << GEN6_RT_DW0_SRC_COLOR_FACTOR__SHIFT | + rt.rgb_dst << GEN6_RT_DW0_DST_COLOR_FACTOR__SHIFT; + + if (rt.blend_enable) { + dw0 |= GEN6_RT_DW0_BLEND_ENABLE; + + if (rt.a_src != rt.rgb_src || + rt.a_dst != rt.rgb_dst || + rt.a_func != rt.rgb_func) + dw0 |= GEN6_RT_DW0_INDEPENDENT_ALPHA_ENABLE; + } + + dw1 = dw1_invariant | + rt.argb_write_disables << GEN6_RT_DW1_WRITE_DISABLES__SHIFT; + + if (rt.logicop_enable) { + dw1 |= GEN6_RT_DW1_LOGICOP_ENABLE | + rt.logicop_func << GEN6_RT_DW1_LOGICOP_FUNC__SHIFT; + } + + dw_rt[2 * i + 0] = dw0; + dw_rt[2 * i + 1] = dw1; + } + + + STATIC_ASSERT(ARRAY_SIZE(cc->blend) >= ARRAY_SIZE(dw_rt)); + memcpy(&cc->blend[0], dw_rt, sizeof(uint32_t) * 2 * blend->rt_count); + cc->blend_state_count = info->blend.rt_count; + + return true; +} + +static bool +cc_set_gen8_BLEND_STATE(struct ilo_state_cc *cc, + const struct ilo_dev *dev, + const struct ilo_state_cc_info *info) +{ + const struct ilo_state_cc_alpha_info *alpha = &info->alpha; + const struct ilo_state_cc_blend_info *blend = &info->blend; + uint32_t dw_rt[2 * ILO_STATE_CC_BLEND_MAX_RT_COUNT], dw0, dw1; + bool indep_alpha_enable; + uint8_t i; + + ILO_DEV_ASSERT(dev, 8, 8); + + if (!cc_validate_gen6_alpha(dev, info) || + !cc_validate_gen6_blend(dev, info)) + return false; + + indep_alpha_enable = false; + for (i = 0; i < blend->rt_count; i++) { + struct ilo_state_cc_blend_rt_info rt; + + cc_get_gen6_effective_rt(dev, info, i, &rt); + + dw0 = rt.rgb_src << GEN8_RT_DW0_SRC_COLOR_FACTOR__SHIFT | + rt.rgb_dst << GEN8_RT_DW0_DST_COLOR_FACTOR__SHIFT | + rt.rgb_func << GEN8_RT_DW0_COLOR_FUNC__SHIFT | + rt.a_src << GEN8_RT_DW0_SRC_ALPHA_FACTOR__SHIFT | + rt.a_dst << GEN8_RT_DW0_DST_ALPHA_FACTOR__SHIFT | + rt.a_func << GEN8_RT_DW0_ALPHA_FUNC__SHIFT | + rt.argb_write_disables << GEN8_RT_DW0_WRITE_DISABLES__SHIFT; + + if (rt.blend_enable) { + dw0 |= GEN8_RT_DW0_BLEND_ENABLE; + + if (rt.a_src != rt.rgb_src || + rt.a_dst != rt.rgb_dst || + rt.a_func != rt.rgb_func) + indep_alpha_enable = true; + } + + dw1 = GEN8_RT_DW1_COLORCLAMP_RTFORMAT | + GEN8_RT_DW1_PRE_BLEND_CLAMP | + GEN8_RT_DW1_POST_BLEND_CLAMP; + + if (rt.logicop_enable) { + dw1 |= GEN8_RT_DW1_LOGICOP_ENABLE | + rt.logicop_func << GEN8_RT_DW1_LOGICOP_FUNC__SHIFT; + } + + dw_rt[2 * i + 0] = dw0; + dw_rt[2 * i + 1] = dw1; + } + + dw0 = 0; + + if (alpha->alpha_to_coverage) { + dw0 |= GEN8_BLEND_DW0_ALPHA_TO_COVERAGE | + GEN8_BLEND_DW0_ALPHA_TO_COVERAGE_DITHER; + } + + if (indep_alpha_enable) + dw0 |= GEN8_BLEND_DW0_INDEPENDENT_ALPHA_ENABLE; + + if (alpha->alpha_to_one) + dw0 |= GEN8_BLEND_DW0_ALPHA_TO_ONE; + + if (alpha->test_enable) { + dw0 |= GEN8_BLEND_DW0_ALPHA_TEST_ENABLE | + alpha->test_func << GEN8_BLEND_DW0_ALPHA_TEST_FUNC__SHIFT; + } else { + dw0 |= GEN6_COMPAREFUNCTION_ALWAYS << + GEN8_BLEND_DW0_ALPHA_TEST_FUNC__SHIFT; + } + + if (blend->dither_enable) + dw0 |= GEN8_BLEND_DW0_DITHER_ENABLE; + + STATIC_ASSERT(ARRAY_SIZE(cc->blend) >= 2 + ARRAY_SIZE(dw_rt)); + 
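/* + * A sketch of the cc->blend[] layout assumed here: blend[0] holds + * 3DSTATE_PS_BLEND DW1 (set by cc_set_gen8_3DSTATE_PS_BLEND()), blend[1] + * holds BLEND_STATE DW0, and blend[2] onward hold the per-RT DW pairs. + * On Gen6-7.5, the per-RT DW pairs start at blend[0] instead. + */ +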
cc->blend[1] = dw0; + memcpy(&cc->blend[2], dw_rt, sizeof(uint32_t) * 2 * blend->rt_count); + cc->blend_state_count = info->blend.rt_count; + + return true; +} + +static bool +cc_set_gen8_3DSTATE_PS_BLEND(struct ilo_state_cc *cc, + const struct ilo_dev *dev, + const struct ilo_state_cc_info *info) +{ + const struct ilo_state_cc_alpha_info *alpha = &info->alpha; + const struct ilo_state_cc_blend_info *blend = &info->blend; + uint32_t dw1; + + ILO_DEV_ASSERT(dev, 8, 8); + + dw1 = 0; + + if (alpha->alpha_to_coverage) + dw1 |= GEN8_PS_BLEND_DW1_ALPHA_TO_COVERAGE; + + if (alpha->test_enable) + dw1 |= GEN8_PS_BLEND_DW1_ALPHA_TEST_ENABLE; + + if (blend->rt_count) { + struct ilo_state_cc_blend_rt_info rt0; + uint8_t i; + + cc_get_gen6_effective_rt(dev, info, 0, &rt0); + + /* 0x0 is reserved for blend factors and we have to set them all */ + dw1 |= rt0.a_src << GEN8_PS_BLEND_DW1_SRC_ALPHA_FACTOR__SHIFT | + rt0.a_dst << GEN8_PS_BLEND_DW1_DST_ALPHA_FACTOR__SHIFT | + rt0.rgb_src << GEN8_PS_BLEND_DW1_SRC_COLOR_FACTOR__SHIFT | + rt0.rgb_dst << GEN8_PS_BLEND_DW1_DST_COLOR_FACTOR__SHIFT; + + for (i = 0; i < blend->rt_count; i++) { + if (blend->rt[i].argb_write_disables != 0xf) { + dw1 |= GEN8_PS_BLEND_DW1_WRITABLE_RT; + break; + } + } + + if (rt0.blend_enable) { + dw1 |= GEN8_PS_BLEND_DW1_BLEND_ENABLE; + + if (rt0.a_src != rt0.rgb_src || rt0.a_dst != rt0.rgb_dst) + dw1 |= GEN8_PS_BLEND_DW1_INDEPENDENT_ALPHA_ENABLE; + } + } + + STATIC_ASSERT(ARRAY_SIZE(cc->blend) >= 1); + cc->blend[0] = dw1; + + return true; +} + +static bool +cc_params_set_gen6_COLOR_CALC_STATE(struct ilo_state_cc *cc, + const struct ilo_dev *dev, + const struct ilo_state_cc_params_info *params) +{ + uint32_t dw0; + + ILO_DEV_ASSERT(dev, 6, 8); + + dw0 = params->stencil_front.test_ref << GEN6_CC_DW0_STENCIL_REF__SHIFT | + params->stencil_back.test_ref << GEN6_CC_DW0_STENCIL1_REF__SHIFT | + GEN6_CC_DW0_ALPHATEST_FLOAT32; + + STATIC_ASSERT(ARRAY_SIZE(cc->cc) >= 6); + cc->cc[0] = dw0; + cc->cc[1] = fui(params->alpha_ref); + cc->cc[2] = fui(params->blend_rgba[0]); + cc->cc[3] = fui(params->blend_rgba[1]); + cc->cc[4] = fui(params->blend_rgba[2]); + cc->cc[5] = fui(params->blend_rgba[3]); + + return true; +} + +bool +ilo_state_cc_init(struct ilo_state_cc *cc, + const struct ilo_dev *dev, + const struct ilo_state_cc_info *info) +{ + assert(ilo_is_zeroed(cc, sizeof(*cc))); + return ilo_state_cc_set_info(cc, dev, info); +} + +bool +ilo_state_cc_set_info(struct ilo_state_cc *cc, + const struct ilo_dev *dev, + const struct ilo_state_cc_info *info) +{ + bool ret = true; + + if (ilo_dev_gen(dev) >= ILO_GEN(8)) { + ret &= cc_set_gen8_3DSTATE_WM_DEPTH_STENCIL(cc, dev, info); + ret &= cc_set_gen8_BLEND_STATE(cc, dev, info); + ret &= cc_set_gen8_3DSTATE_PS_BLEND(cc, dev, info); + } else { + ret &= cc_set_gen6_DEPTH_STENCIL_STATE(cc, dev, info); + ret &= cc_set_gen6_BLEND_STATE(cc, dev, info); + } + + ret &= cc_params_set_gen6_COLOR_CALC_STATE(cc, dev, &info->params); + + assert(ret); + + return ret; +} + +bool +ilo_state_cc_set_params(struct ilo_state_cc *cc, + const struct ilo_dev *dev, + const struct ilo_state_cc_params_info *params) +{ + /* modify stencil masks */ + if (ilo_dev_gen(dev) >= ILO_GEN(8)) { + uint32_t dw1 = cc->ds[0]; + uint32_t dw2 = cc->ds[1]; + + if (dw1 & GEN8_ZS_DW1_STENCIL_TEST_ENABLE) { + const bool twosided_enable = (dw1 & GEN8_ZS_DW1_STENCIL1_ENABLE); + const struct ilo_state_cc_stencil_params_info *front_p = + &params->stencil_front; + const struct ilo_state_cc_stencil_params_info *back_p = + (twosided_enable) ?
&params->stencil_back : + &params->stencil_front; + + if (front_p->write_mask || back_p->write_mask) + dw1 |= GEN8_ZS_DW1_STENCIL_WRITE_ENABLE; + else + dw1 &= ~GEN8_ZS_DW1_STENCIL_WRITE_ENABLE; + + dw2 = + front_p->test_mask << GEN8_ZS_DW2_STENCIL_TEST_MASK__SHIFT | + front_p->write_mask << GEN8_ZS_DW2_STENCIL_WRITE_MASK__SHIFT | + back_p->test_mask << GEN8_ZS_DW2_STENCIL1_TEST_MASK__SHIFT | + back_p->write_mask << GEN8_ZS_DW2_STENCIL1_WRITE_MASK__SHIFT; + } + + cc->ds[0] = dw1; + cc->ds[1] = dw2; + } else { + uint32_t dw0 = cc->ds[0]; + uint32_t dw1 = cc->ds[1]; + + if (dw0 & GEN6_ZS_DW0_STENCIL_TEST_ENABLE) { + const bool twosided_enable = (dw0 & GEN6_ZS_DW0_STENCIL1_ENABLE); + const struct ilo_state_cc_stencil_params_info *front_p = + &params->stencil_front; + const struct ilo_state_cc_stencil_params_info *back_p = + (twosided_enable) ? &params->stencil_back : + &params->stencil_front; + + if (front_p->write_mask || back_p->write_mask) + dw0 |= GEN6_ZS_DW0_STENCIL_WRITE_ENABLE; + else + dw0 &= ~GEN6_ZS_DW0_STENCIL_WRITE_ENABLE; + + dw1 = + front_p->test_mask << GEN6_ZS_DW1_STENCIL_TEST_MASK__SHIFT | + front_p->write_mask << GEN6_ZS_DW1_STENCIL_WRITE_MASK__SHIFT | + back_p->test_mask << GEN6_ZS_DW1_STENCIL1_TEST_MASK__SHIFT | + back_p->write_mask << GEN6_ZS_DW1_STENCIL1_WRITE_MASK__SHIFT; + } + + cc->ds[0] = dw0; + cc->ds[1] = dw1; + } + + /* modify COLOR_CALC_STATE */ + cc_params_set_gen6_COLOR_CALC_STATE(cc, dev, params); + + return true; +} + +void +ilo_state_cc_full_delta(const struct ilo_state_cc *cc, + const struct ilo_dev *dev, + struct ilo_state_cc_delta *delta) +{ + delta->dirty = ILO_STATE_CC_BLEND_STATE | + ILO_STATE_CC_COLOR_CALC_STATE; + + if (ilo_dev_gen(dev) >= ILO_GEN(8)) { + delta->dirty |= ILO_STATE_CC_3DSTATE_WM_DEPTH_STENCIL | + ILO_STATE_CC_3DSTATE_PS_BLEND; + } else { + delta->dirty |= ILO_STATE_CC_DEPTH_STENCIL_STATE; + } +} + +void +ilo_state_cc_get_delta(const struct ilo_state_cc *cc, + const struct ilo_dev *dev, + const struct ilo_state_cc *old, + struct ilo_state_cc_delta *delta) +{ + delta->dirty = 0; + + if (memcmp(cc->ds, old->ds, sizeof(cc->ds))) { + if (ilo_dev_gen(dev) >= ILO_GEN(8)) + delta->dirty |= ILO_STATE_CC_3DSTATE_WM_DEPTH_STENCIL; + else + delta->dirty |= ILO_STATE_CC_DEPTH_STENCIL_STATE; + } + + if (ilo_dev_gen(dev) >= ILO_GEN(8)) { + if (cc->blend[0] != old->blend[0]) + delta->dirty |= ILO_STATE_CC_3DSTATE_PS_BLEND; + + if (memcmp(&cc->blend[1], &old->blend[1], + sizeof(uint32_t) * (1 + 2 * cc->blend_state_count))) + delta->dirty |= ILO_STATE_CC_BLEND_STATE; + } else if (memcmp(cc->blend, old->blend, + sizeof(uint32_t) * 2 * cc->blend_state_count)) { + delta->dirty |= ILO_STATE_CC_BLEND_STATE; + } + + if (memcmp(cc->cc, old->cc, sizeof(cc->cc))) + delta->dirty |= ILO_STATE_CC_COLOR_CALC_STATE; +} diff --git a/src/gallium/drivers/ilo/core/ilo_state_cc.h b/src/gallium/drivers/ilo/core/ilo_state_cc.h new file mode 100644 index 00000000000..5b96a60f988 --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_cc.h @@ -0,0 +1,199 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2015 LunarG, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#ifndef ILO_STATE_CC_H +#define ILO_STATE_CC_H + +#include "genhw/genhw.h" + +#include "ilo_core.h" +#include "ilo_dev.h" + +/* + * From the Sandy Bridge PRM, volume 2 part 1, page 38: + * + * "Render Target Index. Specifies the render target index that will be + * used to select blend state from BLEND_STATE. + * Format = U3" + */ +#define ILO_STATE_CC_BLEND_MAX_RT_COUNT 8 + +enum ilo_state_cc_dirty_bits { + ILO_STATE_CC_3DSTATE_WM_DEPTH_STENCIL = (1 << 0), + ILO_STATE_CC_3DSTATE_PS_BLEND = (1 << 1), + ILO_STATE_CC_DEPTH_STENCIL_STATE = (1 << 2), + ILO_STATE_CC_BLEND_STATE = (1 << 3), + ILO_STATE_CC_COLOR_CALC_STATE = (1 << 4), +}; + +/** + * AlphaCoverage and AlphaTest. + */ +struct ilo_state_cc_alpha_info { + bool cv_sample_count_one; + bool cv_float_source0_alpha; + + bool alpha_to_coverage; + bool alpha_to_one; + + bool test_enable; + enum gen_compare_function test_func; +}; + +struct ilo_state_cc_stencil_op_info { + enum gen_compare_function test_func; + enum gen_stencil_op fail_op; + enum gen_stencil_op zfail_op; + enum gen_stencil_op zpass_op; +}; + +/** + * StencilTest. + */ +struct ilo_state_cc_stencil_info { + bool cv_has_buffer; + + bool test_enable; + bool twosided_enable; + + struct ilo_state_cc_stencil_op_info front; + struct ilo_state_cc_stencil_op_info back; +}; + +/** + * DepthTest. + */ +struct ilo_state_cc_depth_info { + bool cv_has_buffer; + + bool test_enable; + /* independent from test_enable */ + bool write_enable; + + enum gen_compare_function test_func; +}; + +struct ilo_state_cc_blend_rt_info { + bool cv_has_buffer; + bool cv_is_unorm; + bool cv_is_integer; + + uint8_t argb_write_disables; + + bool logicop_enable; + enum gen_logic_op logicop_func; + + bool blend_enable; + bool force_dst_alpha_one; + enum gen_blend_factor rgb_src; + enum gen_blend_factor rgb_dst; + enum gen_blend_function rgb_func; + enum gen_blend_factor a_src; + enum gen_blend_factor a_dst; + enum gen_blend_function a_func; +}; + +/** + * ColorBufferBlending, Dithering, and LogicOps. + */ +struct ilo_state_cc_blend_info { + const struct ilo_state_cc_blend_rt_info *rt; + uint8_t rt_count; + + bool dither_enable; +}; + +struct ilo_state_cc_stencil_params_info { + uint8_t test_ref; + uint8_t test_mask; + uint8_t write_mask; +}; + +/** + * CC parameters. 
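+ * + * These are the dynamic values: the alpha and stencil reference values, + * the stencil test and write masks, and the blend constant color.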
+ */ +struct ilo_state_cc_params_info { + float alpha_ref; + + struct ilo_state_cc_stencil_params_info stencil_front; + struct ilo_state_cc_stencil_params_info stencil_back; + + float blend_rgba[4]; +}; + +/** + * Pixel processing. + */ +struct ilo_state_cc_info { + struct ilo_state_cc_alpha_info alpha; + struct ilo_state_cc_stencil_info stencil; + struct ilo_state_cc_depth_info depth; + struct ilo_state_cc_blend_info blend; + + struct ilo_state_cc_params_info params; +}; + +struct ilo_state_cc { + uint32_t ds[3]; + + uint8_t blend_state_count; + uint32_t blend[1 + 1 + 2 * ILO_STATE_CC_BLEND_MAX_RT_COUNT]; + + uint32_t cc[6]; +}; + +struct ilo_state_cc_delta { + uint32_t dirty; +}; + +bool +ilo_state_cc_init(struct ilo_state_cc *cc, + const struct ilo_dev *dev, + const struct ilo_state_cc_info *info); + +bool +ilo_state_cc_set_info(struct ilo_state_cc *cc, + const struct ilo_dev *dev, + const struct ilo_state_cc_info *info); + +bool +ilo_state_cc_set_params(struct ilo_state_cc *cc, + const struct ilo_dev *dev, + const struct ilo_state_cc_params_info *params); + +void +ilo_state_cc_full_delta(const struct ilo_state_cc *cc, + const struct ilo_dev *dev, + struct ilo_state_cc_delta *delta); + +void +ilo_state_cc_get_delta(const struct ilo_state_cc *cc, + const struct ilo_dev *dev, + const struct ilo_state_cc *old, + struct ilo_state_cc_delta *delta); + +#endif /* ILO_STATE_CC_H */ diff --git a/src/gallium/drivers/ilo/core/ilo_state_compute.c b/src/gallium/drivers/ilo/core/ilo_state_compute.c new file mode 100644 index 00000000000..a5fe5e1a6b0 --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_compute.c @@ -0,0 +1,435 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2012-2015 LunarG, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#include "ilo_debug.h" +#include "ilo_state_compute.h" + +struct compute_urb_configuration { + int idrt_entry_count; + int curbe_entry_count; + + int urb_entry_count; + /* in 256-bit register increments */ + int urb_entry_size; +}; + +static int +get_gen6_rob_entry_count(const struct ilo_dev *dev) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * From the Ivy Bridge PRM, volume 2 part 2, page 60: + * + * "ROB has 64KB of storage; 2048 entries." + * + * From the valid ranges of "CURBE Allocation Size", we can also conclude + * that interface entries and CURBE data must be in ROB. And that ROB + * should be 16KB, or 512 entries, on Gen7 GT1. 
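+ * + * In summary, the entry counts assumed below are 2048 on Gen7.5+, 2048 + * (GT2) or 512 (GT1) on Gen7, and 2048 (GT2) or 1024 (GT1) on Gen6.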
+ */ + if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) + return 2048; + else if (ilo_dev_gen(dev) >= ILO_GEN(7)) + return (dev->gt == 2) ? 2048 : 512; + else + return (dev->gt == 2) ? 2048 : 1024; +} + +static int +get_gen6_idrt_entry_count(const struct ilo_dev *dev) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * From the Ivy Bridge PRM, volume 2 part 2, page 21: + * + * "The first 32 URB entries are reserved for the interface + * descriptor..." + * + * From the Haswell PRM, volume 7, page 836: + * + * "The first 64 URB entries are reserved for the interface + * description..." + */ + return (ilo_dev_gen(dev) >= ILO_GEN(7.5)) ? 64 : 32; +} + +static int +get_gen6_curbe_entry_count(const struct ilo_dev *dev, uint32_t curbe_size) +{ + /* + * From the Ivy Bridge PRM, volume 2 part 2, page 21: + * + * "(CURBE Allocation Size) Specifies the total length allocated for + * CURBE, in 256-bit register increments. + */ + const int entry_count = (curbe_size + 31) / 32; + + ILO_DEV_ASSERT(dev, 6, 8); + + assert(get_gen6_idrt_entry_count(dev) + entry_count <= + get_gen6_rob_entry_count(dev)); + + return entry_count; +} + +static bool +compute_get_gen6_urb_configuration(const struct ilo_dev *dev, + const struct ilo_state_compute_info *info, + struct compute_urb_configuration *urb) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + urb->idrt_entry_count = get_gen6_idrt_entry_count(dev); + urb->curbe_entry_count = + get_gen6_curbe_entry_count(dev, info->curbe_alloc_size); + + /* + * From the Broadwell PRM, volume 2b, page 451: + * + * "Please note that 0 is not allowed for this field (Number of URB + * Entries)." + */ + urb->urb_entry_count = (ilo_dev_gen(dev) >= ILO_GEN(8)) ? 1 : 0; + + /* + * From the Ivy Bridge PRM, volume 2 part 2, page 52: + * + * "(URB Entry Allocation Size) Specifies the length of each URB entry + * used by the unit, in 256-bit register increments - 1." + */ + urb->urb_entry_size = 1; + + /* + * From the Ivy Bridge PRM, volume 2 part 2, page 22: + * + * MEDIA_VFE_STATE specifies the amount of CURBE space, the URB handle + * size and the number of URB handles. The driver must ensure that + * ((URB_handle_size * URB_num_handle) - CURBE - 32) <= + * URB_allocation_in_L3." 
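+ * + * As an illustrative example, a 64KB URB allocation provides 64KB / 32B = + * 2048 256-bit entries to split among the IDRT, the CURBE, and the URB + * handles, which is what the assert below checks.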
+ */ + assert(urb->idrt_entry_count + urb->curbe_entry_count + + urb->urb_entry_count * urb->urb_entry_size <= + info->cv_urb_alloc_size / 32); + + return true; +} + +static int +compute_interface_get_gen6_read_end(const struct ilo_dev *dev, + const struct ilo_state_compute_interface_info *interface) +{ + const int per_thread_read = (interface->curbe_read_length + 31) / 32; + const int cross_thread_read = + (interface->cross_thread_curbe_read_length + 31) / 32; + + ILO_DEV_ASSERT(dev, 6, 8); + + assert(interface->curbe_read_offset % 32 == 0); + + /* + * From the Ivy Bridge PRM, volume 2 part 2, page 60: + * + * "(Constant URB Entry Read Length) [0,63]" + */ + assert(per_thread_read <= 63); + + /* From the Haswell PRM, volume 2d, page 199: + * + * "(Cross-Thread Constant Data Read Length) [0,127]" + */ + if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) + assert(cross_thread_read <= 127); + else + assert(!cross_thread_read); + + if (per_thread_read || cross_thread_read) { + return interface->curbe_read_offset / 32 + cross_thread_read + + per_thread_read * interface->thread_group_size; + } else { + return 0; + } +} + +static bool +compute_validate_gen6(const struct ilo_dev *dev, + const struct ilo_state_compute_info *info, + const struct compute_urb_configuration *urb) +{ + int min_curbe_entry_count; + uint8_t i; + + ILO_DEV_ASSERT(dev, 6, 8); + + assert(info->interface_count <= urb->idrt_entry_count); + + min_curbe_entry_count = 0; + for (i = 0; i < info->interface_count; i++) { + const int read_end = + compute_interface_get_gen6_read_end(dev, &info->interfaces[i]); + + if (min_curbe_entry_count < read_end) + min_curbe_entry_count = read_end; + } + + assert(min_curbe_entry_count <= urb->curbe_entry_count); + + /* + * From the Broadwell PRM, volume 2b, page 452: + * + * "CURBE Allocation Size should be 0 for GPGPU workloads that uses + * indirect instead of CURBE." + */ + if (!min_curbe_entry_count) + assert(!urb->curbe_entry_count); + + return true; +} + +static uint8_t +compute_get_gen6_scratch_space(const struct ilo_dev *dev, + const struct ilo_state_compute_info *info) +{ + uint32_t scratch_size = 0; + uint8_t i; + + ILO_DEV_ASSERT(dev, 6, 8); + + for (i = 0; i < info->interface_count; i++) { + if (scratch_size < info->interfaces[i].scratch_size) + scratch_size = info->interfaces[i].scratch_size; + } + + if (ilo_dev_gen(dev) >= ILO_GEN(8)) { + assert(scratch_size <= 2 * 1024 * 1024); + + /* next power of two, starting from 1KB */ + return (scratch_size > 1024) ? + (util_last_bit(scratch_size - 1) - 10): 0; + } else if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) { + assert(scratch_size <= 2 * 1024 * 1024); + + /* next power of two, starting from 2KB */ + return (scratch_size > 2048) ? + (util_last_bit(scratch_size - 1) - 11): 0; + } else { + assert(scratch_size <= 12 * 1024); + + return (scratch_size > 1024) ? 
+ (scratch_size - 1) / 1024 : 0; + } +} + +static bool +compute_set_gen6_MEDIA_VFE_STATE(struct ilo_state_compute *compute, + const struct ilo_dev *dev, + const struct ilo_state_compute_info *info) +{ + struct compute_urb_configuration urb; + uint8_t scratch_space; + + uint32_t dw1, dw2, dw4; + + ILO_DEV_ASSERT(dev, 6, 8); + + if (!compute_get_gen6_urb_configuration(dev, info, &urb) || + !compute_validate_gen6(dev, info, &urb)) + return false; + + scratch_space = compute_get_gen6_scratch_space(dev, info); + + dw1 = scratch_space << GEN6_VFE_DW1_SCRATCH_SPACE_PER_THREAD__SHIFT; + dw2 = (dev->thread_count - 1) << GEN6_VFE_DW2_MAX_THREADS__SHIFT | + urb.urb_entry_count << GEN6_VFE_DW2_URB_ENTRY_COUNT__SHIFT | + GEN6_VFE_DW2_RESET_GATEWAY_TIMER | + GEN6_VFE_DW2_BYPASS_GATEWAY_CONTROL; + + if (ilo_dev_gen(dev) >= ILO_GEN(7) && ilo_dev_gen(dev) <= ILO_GEN(7.5)) + dw2 |= GEN7_VFE_DW2_GPGPU_MODE; + + assert(urb.urb_entry_size); + + dw4 = (urb.urb_entry_size - 1) << GEN6_VFE_DW4_URB_ENTRY_SIZE__SHIFT | + urb.curbe_entry_count << GEN6_VFE_DW4_CURBE_SIZE__SHIFT; + + STATIC_ASSERT(ARRAY_SIZE(compute->vfe) >= 3); + compute->vfe[0] = dw1; + compute->vfe[1] = dw2; + compute->vfe[2] = dw4; + + return true; +} + +static uint8_t +compute_interface_get_gen6_sampler_count(const struct ilo_dev *dev, + const struct ilo_state_compute_interface_info *interface) +{ + ILO_DEV_ASSERT(dev, 6, 8); + return (interface->sampler_count <= 12) ? + (interface->sampler_count + 3) / 4 : 4; +} + +static uint8_t +compute_interface_get_gen6_surface_count(const struct ilo_dev *dev, + const struct ilo_state_compute_interface_info *interface) +{ + ILO_DEV_ASSERT(dev, 6, 8); + return (interface->surface_count <= 31) ? interface->surface_count : 31; +} + +static uint8_t +compute_interface_get_gen7_slm_size(const struct ilo_dev *dev, + const struct ilo_state_compute_interface_info *interface) +{ + ILO_DEV_ASSERT(dev, 7, 8); + + /* + * From the Ivy Bridge PRM, volume 2 part 2, page 61: + * + * "The amount is specified in 4k blocks, but only powers of 2 are + * allowed: 0, 4k, 8k, 16k, 32k and 64k per half-slice." 
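+ * + * For example, an (illustrative) slm_size of 20KB occupies five 4KB + * blocks and is rounded up to the next power of two, eight blocks or + * 32KB.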
+ */ + assert(interface->slm_size <= 64 * 1024); + + return util_next_power_of_two((interface->slm_size + 4095) / 4096); +} + +static bool +compute_set_gen6_INTERFACE_DESCRIPTOR_DATA(struct ilo_state_compute *compute, + const struct ilo_dev *dev, + const struct ilo_state_compute_info *info) +{ + uint8_t i; + + ILO_DEV_ASSERT(dev, 6, 8); + + for (i = 0; i < info->interface_count; i++) { + const struct ilo_state_compute_interface_info *interface = + &info->interfaces[i]; + uint16_t read_offset, per_thread_read_len, cross_thread_read_len; + uint8_t sampler_count, surface_count; + uint32_t dw0, dw2, dw3, dw4, dw5, dw6; + + assert(interface->kernel_offset % 64 == 0); + assert(interface->thread_group_size); + + read_offset = interface->curbe_read_offset / 32; + per_thread_read_len = (interface->curbe_read_length + 31) / 32; + cross_thread_read_len = + (interface->cross_thread_curbe_read_length + 31) / 32; + + sampler_count = + compute_interface_get_gen6_sampler_count(dev, interface); + surface_count = + compute_interface_get_gen6_surface_count(dev, interface); + + dw0 = interface->kernel_offset; + dw2 = sampler_count << GEN6_IDRT_DW2_SAMPLER_COUNT__SHIFT; + dw3 = surface_count << GEN6_IDRT_DW3_BINDING_TABLE_SIZE__SHIFT; + dw4 = per_thread_read_len << GEN6_IDRT_DW4_CURBE_READ_LEN__SHIFT | + read_offset << GEN6_IDRT_DW4_CURBE_READ_OFFSET__SHIFT; + + dw5 = 0; + dw6 = 0; + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + const uint8_t slm_size = + compute_interface_get_gen7_slm_size(dev, interface); + + dw5 |= GEN7_IDRT_DW5_ROUNDING_MODE_RTNE; + + if (slm_size) { + dw5 |= GEN7_IDRT_DW5_BARRIER_ENABLE | + slm_size << GEN7_IDRT_DW5_SLM_SIZE__SHIFT; + } + + /* + * From the Haswell PRM, volume 2d, page 199: + * + * "(Number of Threads in GPGPU Thread Group) Specifies the + * number of threads that are in this thread group. Used to + * program the barrier for the number of messages to expect. The + * minimum value is 0 (which will disable the barrier), while + * the maximum value is the number of threads in a subslice for + * local barriers." + * + * From the Broadwell PRM, volume 2d, page 183: + * + * "(Number of Threads in GPGPU Thread Group) Specifies the + * number of threads that are in this thread group. The minimum + * value is 1, while the maximum value is the number of threads + * in a subslice for local barriers. See vol1b Configurations + * for the number of threads per subslice for different + * products. The maximum value for global barriers is limited + * by the number of threads in the system, or by 511, whichever + * is lower. This field should not be set to 0 even if the + * barrier is disabled, since an accurate value is needed for + * proper pre-emption." 
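+ * + * Hence, as assumed below, the field is programmed whenever a barrier is + * possible (SLM in use) and unconditionally on Gen8+.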
+ */ + if (slm_size || ilo_dev_gen(dev) >= ILO_GEN(8)) { + dw5 |= interface->thread_group_size << + GEN7_IDRT_DW5_THREAD_GROUP_SIZE__SHIFT; + } + + if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) { + dw6 |= cross_thread_read_len << + GEN75_IDRT_DW6_CROSS_THREAD_CURBE_READ_LEN__SHIFT; + } + } + + STATIC_ASSERT(ARRAY_SIZE(compute->idrt[i]) >= 6); + compute->idrt[i][0] = dw0; + compute->idrt[i][1] = dw2; + compute->idrt[i][2] = dw3; + compute->idrt[i][3] = dw4; + compute->idrt[i][4] = dw5; + compute->idrt[i][5] = dw6; + } + + return true; +} + +bool +ilo_state_compute_init(struct ilo_state_compute *compute, + const struct ilo_dev *dev, + const struct ilo_state_compute_info *info) +{ + bool ret = true; + + assert(ilo_is_zeroed(compute, sizeof(*compute))); + assert(ilo_is_zeroed(info->data, info->data_size)); + + assert(ilo_state_compute_data_size(dev, info->interface_count) <= + info->data_size); + compute->idrt = (uint32_t (*)[6]) info->data; + compute->idrt_count = info->interface_count; + + ret &= compute_set_gen6_MEDIA_VFE_STATE(compute, dev, info); + ret &= compute_set_gen6_INTERFACE_DESCRIPTOR_DATA(compute, dev, info); + + assert(ret); + + return ret; +} diff --git a/src/gallium/drivers/ilo/core/ilo_state_compute.h b/src/gallium/drivers/ilo/core/ilo_state_compute.h new file mode 100644 index 00000000000..346f7b617f4 --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_compute.h @@ -0,0 +1,92 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2015 LunarG, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#ifndef ILO_STATE_COMPUTE_H +#define ILO_STATE_COMPUTE_H + +#include "genhw/genhw.h" + +#include "ilo_core.h" +#include "ilo_dev.h" + +/* + * From the Haswell PRM, volume 7, page 836: + * + * "The first 64 URB entries are reserved for the interface + * description..."
+ */ +#define ILO_STATE_COMPUTE_MAX_INTERFACE_COUNT 64 + +struct ilo_state_compute_interface_info { + /* usually 0 unless there are multiple interfaces */ + uint32_t kernel_offset; + + uint32_t scratch_size; + + uint8_t sampler_count; + uint8_t surface_count; + + uint16_t thread_group_size; + uint32_t slm_size; + + uint16_t curbe_read_offset; + uint16_t curbe_read_length; + uint16_t cross_thread_curbe_read_length; +}; + +struct ilo_state_compute_info { + void *data; + size_t data_size; + + const struct ilo_state_compute_interface_info *interfaces; + uint8_t interface_count; + + uint32_t cv_urb_alloc_size; + uint32_t curbe_alloc_size; +}; + +struct ilo_state_compute { + uint32_t vfe[3]; + + uint32_t (*idrt)[6]; + uint8_t idrt_count; +}; + +static inline size_t +ilo_state_compute_data_size(const struct ilo_dev *dev, + uint8_t interface_count) +{ + const struct ilo_state_compute *compute = NULL; + return sizeof(compute->idrt[0]) * interface_count; +} + +bool +ilo_state_compute_init(struct ilo_state_compute *compute, + const struct ilo_dev *dev, + const struct ilo_state_compute_info *info); + +#endif /* ILO_STATE_COMPUTE_H */ diff --git a/src/gallium/drivers/ilo/core/ilo_state_raster.c b/src/gallium/drivers/ilo/core/ilo_state_raster.c new file mode 100644 index 00000000000..ed64a1f0d3c --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_raster.c @@ -0,0 +1,1252 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2012-2015 LunarG, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#include "ilo_debug.h" +#include "ilo_state_raster.h" + +static bool +raster_validate_gen6_clip(const struct ilo_dev *dev, + const struct ilo_state_raster_info *info) +{ + const struct ilo_state_raster_clip_info *clip = &info->clip; + + ILO_DEV_ASSERT(dev, 6, 8); + + assert(clip->viewport_count); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 188: + * + * ""Clip Distance Cull Test Enable Bitmask" and "Clip Distance Clip + * Test Enable Bitmask" should not have overlapping bits in the mask, + * else the results are undefined." 
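+ * + * As an illustrative example, user_cull_enables of 0x3 with + * user_clip_enables of 0xc is fine, while 0x3 with 0x6 overlaps in bit 1 + * and would trip the assert below.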
+ */ + assert(!(clip->user_cull_enables & clip->user_clip_enables)); + + if (ilo_dev_gen(dev) < ILO_GEN(9)) + assert(clip->z_near_enable == clip->z_far_enable); + + return true; +} + +static bool +raster_set_gen6_3DSTATE_CLIP(struct ilo_state_raster *rs, + const struct ilo_dev *dev, + const struct ilo_state_raster_info *info) +{ + const struct ilo_state_raster_clip_info *clip = &info->clip; + const struct ilo_state_raster_setup_info *setup = &info->setup; + const struct ilo_state_raster_tri_info *tri = &info->tri; + const struct ilo_state_raster_scan_info *scan = &info->scan; + uint32_t dw1, dw2, dw3; + + ILO_DEV_ASSERT(dev, 6, 8); + + if (!raster_validate_gen6_clip(dev, info)) + return false; + + dw1 = clip->user_cull_enables << GEN6_CLIP_DW1_UCP_CULL_ENABLES__SHIFT; + + if (clip->stats_enable) + dw1 |= GEN6_CLIP_DW1_STATISTICS; + + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 219: + * + * "Workaround : Due to Hardware issue "EarlyCull" needs to be + * enabled only for the cases where the incoming primitive topology + * into the clipper guaranteed to be Trilist." + * + * What does this mean? + */ + dw1 |= GEN7_CLIP_DW1_SUBPIXEL_8BITS | + GEN7_CLIP_DW1_EARLY_CULL_ENABLE; + + if (ilo_dev_gen(dev) <= ILO_GEN(7.5)) { + dw1 |= tri->front_winding << GEN7_CLIP_DW1_FRONT_WINDING__SHIFT | + tri->cull_mode << GEN7_CLIP_DW1_CULL_MODE__SHIFT; + } + } + + dw2 = clip->user_clip_enables << GEN6_CLIP_DW2_UCP_CLIP_ENABLES__SHIFT | + GEN6_CLIPMODE_NORMAL << GEN6_CLIP_DW2_CLIP_MODE__SHIFT; + + if (clip->clip_enable) + dw2 |= GEN6_CLIP_DW2_CLIP_ENABLE; + + if (clip->z_near_zero) + dw2 |= GEN6_CLIP_DW2_APIMODE_D3D; + else + dw2 |= GEN6_CLIP_DW2_APIMODE_OGL; + + if (clip->xy_test_enable) + dw2 |= GEN6_CLIP_DW2_XY_TEST_ENABLE; + + if (ilo_dev_gen(dev) < ILO_GEN(8) && clip->z_near_enable) + dw2 |= GEN6_CLIP_DW2_Z_TEST_ENABLE; + + if (clip->gb_test_enable) + dw2 |= GEN6_CLIP_DW2_GB_TEST_ENABLE; + + if (scan->barycentric_interps & (GEN6_INTERP_NONPERSPECTIVE_PIXEL | + GEN6_INTERP_NONPERSPECTIVE_CENTROID | + GEN6_INTERP_NONPERSPECTIVE_SAMPLE)) + dw2 |= GEN6_CLIP_DW2_NONPERSPECTIVE_BARYCENTRIC_ENABLE; + + if (setup->first_vertex_provoking) { + dw2 |= 0 << GEN6_CLIP_DW2_TRI_PROVOKE__SHIFT | + 0 << GEN6_CLIP_DW2_LINE_PROVOKE__SHIFT | + 1 << GEN6_CLIP_DW2_TRIFAN_PROVOKE__SHIFT; + } else { + dw2 |= 2 << GEN6_CLIP_DW2_TRI_PROVOKE__SHIFT | + 1 << GEN6_CLIP_DW2_LINE_PROVOKE__SHIFT | + 2 << GEN6_CLIP_DW2_TRIFAN_PROVOKE__SHIFT; + } + + dw3 = 0x1 << GEN6_CLIP_DW3_MIN_POINT_WIDTH__SHIFT | + 0x7ff << GEN6_CLIP_DW3_MAX_POINT_WIDTH__SHIFT | + (clip->viewport_count - 1) << GEN6_CLIP_DW3_MAX_VPINDEX__SHIFT; + + if (clip->force_rtaindex_zero) + dw3 |= GEN6_CLIP_DW3_FORCE_RTAINDEX_ZERO; + + STATIC_ASSERT(ARRAY_SIZE(rs->clip) >= 3); + rs->clip[0] = dw1; + rs->clip[1] = dw2; + rs->clip[2] = dw3; + + return true; +} + +static bool +raster_params_is_gen6_line_aa_allowed(const struct ilo_dev *dev, + const struct ilo_state_raster_params_info *params) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 251: + * + * "This field (Anti-aliasing Enable) must be disabled if any of the + * render targets have integer (UINT or SINT) surface format." + */ + if (params->any_integer_rt) + return false; + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 321: + * + * "[DevSNB+]: This field (Hierarchical Depth Buffer Enable) must be + * disabled if Anti-aliasing Enable in 3DSTATE_SF is enabled. 
+ */ + if (ilo_dev_gen(dev) == ILO_GEN(6) && params->hiz_enable) + return false; + + return true; +} + +static void +raster_get_gen6_effective_line(const struct ilo_dev *dev, + const struct ilo_state_raster_info *info, + struct ilo_state_raster_line_info *line) +{ + const struct ilo_state_raster_setup_info *setup = &info->setup; + const struct ilo_state_raster_params_info *params = &info->params; + + *line = info->line; + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 251: + * + * "This field (Anti-aliasing Enable) is ignored when Multisample + * Rasterization Mode is MSRASTMODE_ON_xx." + * + * From the Sandy Bridge PRM, volume 2 part 1, page 251: + * + * "Setting a Line Width of 0.0 specifies the rasterization of the + * "thinnest" (one-pixel-wide), non-antialiased lines. Note that + * this effectively overrides the effect of AAEnable (though the + * AAEnable state variable is not modified). Lines rendered with + * zero Line Width are rasterized using GIQ (Grid Intersection + * Quantization) rules as specified by the GDI and Direct3D APIs." + * + * "Software must not program a value of 0.0 when running in + * MSRASTMODE_ON_xxx modes - zero-width lines are not available + * when multisampling rasterization is enabled." + * + * From the Sandy Bridge PRM, volume 2 part 1, page 294: + * + * "Line stipple, controlled via the Line Stipple Enable state variable + * in WM_STATE, discards certain pixels that are produced by non-AA + * line rasterization." + */ + if (setup->line_msaa_enable || + !raster_params_is_gen6_line_aa_allowed(dev, params)) + line->aa_enable = false; + if (setup->line_msaa_enable || line->aa_enable) { + line->stipple_enable = false; + line->giq_enable = false; + line->giq_last_pixel = false; + } +} + +static bool +raster_validate_gen8_raster(const struct ilo_dev *dev, + const struct ilo_state_raster_info *info) +{ + const struct ilo_state_raster_setup_info *setup = &info->setup; + const struct ilo_state_raster_tri_info *tri = &info->tri; + + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 249: + * + * "This setting (SOLID) is required when rendering rectangle + * (RECTLIST) objects. + */ + if (tri->fill_mode_front != GEN6_FILLMODE_SOLID || + tri->fill_mode_back != GEN6_FILLMODE_SOLID) + assert(!setup->cv_is_rectangle); + + return true; +} + +static enum gen_msrast_mode +raster_setup_get_gen6_msrast_mode(const struct ilo_dev *dev, + const struct ilo_state_raster_setup_info *setup) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + if (setup->line_msaa_enable) { + return (setup->msaa_enable) ? GEN6_MSRASTMODE_ON_PATTERN : + GEN6_MSRASTMODE_ON_PIXEL; + } else { + return (setup->msaa_enable) ? GEN6_MSRASTMODE_OFF_PATTERN : + GEN6_MSRASTMODE_OFF_PIXEL; + } +} + +static int +get_gen6_line_width(const struct ilo_dev *dev, float fwidth, + bool line_aa_enable, bool line_giq_enable) +{ + int line_width; + + ILO_DEV_ASSERT(dev, 6, 8); + + /* in U3.7 */ + line_width = (int) (fwidth * 128.0f + 0.5f); + + /* + * Smooth lines should intersect ceil(line_width) or (ceil(line_width) + 1) + * pixels in the minor direction. We have to make the lines slightly + * thicker, 0.5 pixel on both sides, so that they intersect that many + * pixels. 
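+ * + * As an illustrative example, a 1.5-pixel fwidth becomes 192 in U3.7, + * and anti-aliasing widens it by half a pixel on each side to 320.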
+ */ + if (line_aa_enable) + line_width += 128; + + line_width = CLAMP(line_width, 1, 1023); + + if (line_giq_enable && line_width == 128) + line_width = 0; + + return line_width; +} + +static int +get_gen6_point_width(const struct ilo_dev *dev, float fwidth) +{ + int point_width; + + ILO_DEV_ASSERT(dev, 6, 8); + + /* in U8.3 */ + point_width = (int) (fwidth * 8.0f + 0.5f); + point_width = CLAMP(point_width, 1, 2047); + + return point_width; +} + +static bool +raster_set_gen7_3DSTATE_SF(struct ilo_state_raster *rs, + const struct ilo_dev *dev, + const struct ilo_state_raster_info *info, + const struct ilo_state_raster_line_info *line) +{ + const struct ilo_state_raster_clip_info *clip = &info->clip; + const struct ilo_state_raster_setup_info *setup = &info->setup; + const struct ilo_state_raster_point_info *point = &info->point; + const struct ilo_state_raster_tri_info *tri = &info->tri; + const struct ilo_state_raster_params_info *params = &info->params; + const enum gen_msrast_mode msrast = + raster_setup_get_gen6_msrast_mode(dev, setup); + const int line_width = get_gen6_line_width(dev, params->line_width, + line->aa_enable, line->giq_enable); + const int point_width = get_gen6_point_width(dev, params->point_width); + uint32_t dw1, dw2, dw3; + + ILO_DEV_ASSERT(dev, 6, 7.5); + + if (!raster_validate_gen8_raster(dev, info)) + return false; + + dw1 = tri->fill_mode_front << GEN7_SF_DW1_FILL_MODE_FRONT__SHIFT | + tri->fill_mode_back << GEN7_SF_DW1_FILL_MODE_BACK__SHIFT | + tri->front_winding << GEN7_SF_DW1_FRONT_WINDING__SHIFT; + + if (ilo_dev_gen(dev) >= ILO_GEN(7) && ilo_dev_gen(dev) <= ILO_GEN(7.5)) { + enum gen_depth_format format; + + /* do it here as we want 0x0 to be valid */ + switch (tri->depth_offset_format) { + case GEN6_ZFORMAT_D32_FLOAT_S8X24_UINT: + format = GEN6_ZFORMAT_D32_FLOAT; + break; + case GEN6_ZFORMAT_D24_UNORM_S8_UINT: + format = GEN6_ZFORMAT_D24_UNORM_X8_UINT; + break; + default: + format = tri->depth_offset_format; + break; + } + + dw1 |= format << GEN7_SF_DW1_DEPTH_FORMAT__SHIFT; + } + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 248: + * + * "This bit (Statistics Enable) should be set whenever clipping is + * enabled and the Statistics Enable bit is set in CLIP_STATE. It + * should be cleared if clipping is disabled or Statistics Enable in + * CLIP_STATE is clear." + */ + if (clip->stats_enable && clip->clip_enable) + dw1 |= GEN7_SF_DW1_STATISTICS; + + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 258: + * + * "This bit (Legacy Global Depth Bias Enable, Global Depth Offset + * Enable Solid , Global Depth Offset Enable Wireframe, and Global + * Depth Offset Enable Point) should be set whenever non zero depth + * bias (Slope, Bias) values are used. Setting this bit may have some + * degradation of performance for some workloads." + * + * But it seems fine to ignore that. 
+ */ + if (tri->depth_offset_solid) + dw1 |= GEN7_SF_DW1_DEPTH_OFFSET_SOLID; + if (tri->depth_offset_wireframe) + dw1 |= GEN7_SF_DW1_DEPTH_OFFSET_WIREFRAME; + if (tri->depth_offset_point) + dw1 |= GEN7_SF_DW1_DEPTH_OFFSET_POINT; + + if (setup->viewport_transform) + dw1 |= GEN7_SF_DW1_VIEWPORT_TRANSFORM; + + dw2 = tri->cull_mode << GEN7_SF_DW2_CULL_MODE__SHIFT | + line_width << GEN7_SF_DW2_LINE_WIDTH__SHIFT | + GEN7_SF_DW2_AA_LINE_CAP_1_0 | + msrast << GEN7_SF_DW2_MSRASTMODE__SHIFT; + + if (line->aa_enable) + dw2 |= GEN7_SF_DW2_AA_LINE_ENABLE; + + if (ilo_dev_gen(dev) == ILO_GEN(7.5) && line->stipple_enable) + dw2 |= GEN75_SF_DW2_LINE_STIPPLE_ENABLE; + + if (setup->scissor_enable) + dw2 |= GEN7_SF_DW2_SCISSOR_ENABLE; + + dw3 = GEN7_SF_DW3_TRUE_AA_LINE_DISTANCE | + GEN7_SF_DW3_SUBPIXEL_8BITS; + + /* this has no effect when line_width != 0 */ + if (line->giq_last_pixel) + dw3 |= GEN7_SF_DW3_LINE_LAST_PIXEL_ENABLE; + + if (setup->first_vertex_provoking) { + dw3 |= 0 << GEN7_SF_DW3_TRI_PROVOKE__SHIFT | + 0 << GEN7_SF_DW3_LINE_PROVOKE__SHIFT | + 1 << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT; + } else { + dw3 |= 2 << GEN7_SF_DW3_TRI_PROVOKE__SHIFT | + 1 << GEN7_SF_DW3_LINE_PROVOKE__SHIFT | + 2 << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT; + } + + /* setup->point_aa_enable is ignored */ + if (!point->programmable_width) { + dw3 |= GEN7_SF_DW3_USE_POINT_WIDTH | + point_width << GEN7_SF_DW3_POINT_WIDTH__SHIFT; + } + + STATIC_ASSERT(ARRAY_SIZE(rs->sf) >= 3); + rs->sf[0] = dw1; + rs->sf[1] = dw2; + rs->sf[2] = dw3; + + STATIC_ASSERT(ARRAY_SIZE(rs->raster) >= 4); + rs->raster[0] = 0; + rs->raster[1] = fui(params->depth_offset_const); + rs->raster[2] = fui(params->depth_offset_scale); + rs->raster[3] = fui(params->depth_offset_clamp); + + rs->line_aa_enable = line->aa_enable; + rs->line_giq_enable = line->giq_enable; + + return true; +} + +static bool +raster_set_gen8_3DSTATE_SF(struct ilo_state_raster *rs, + const struct ilo_dev *dev, + const struct ilo_state_raster_info *info, + const struct ilo_state_raster_line_info *line) +{ + const struct ilo_state_raster_clip_info *clip = &info->clip; + const struct ilo_state_raster_setup_info *setup = &info->setup; + const struct ilo_state_raster_point_info *point = &info->point; + const struct ilo_state_raster_params_info *params = &info->params; + const int line_width = get_gen6_line_width(dev, params->line_width, + line->aa_enable, line->giq_enable); + const int point_width = get_gen6_point_width(dev, params->point_width); + uint32_t dw1, dw2, dw3; + + ILO_DEV_ASSERT(dev, 8, 8); + + dw1 = 0; + + if (clip->stats_enable && clip->clip_enable) + dw1 |= GEN7_SF_DW1_STATISTICS; + + if (setup->viewport_transform) + dw1 |= GEN7_SF_DW1_VIEWPORT_TRANSFORM; + + dw2 = line_width << GEN7_SF_DW2_LINE_WIDTH__SHIFT | + GEN7_SF_DW2_AA_LINE_CAP_1_0; + + dw3 = GEN7_SF_DW3_TRUE_AA_LINE_DISTANCE | + GEN7_SF_DW3_SUBPIXEL_8BITS; + + /* this has no effect when line_width != 0 */ + if (line->giq_last_pixel) + dw3 |= GEN7_SF_DW3_LINE_LAST_PIXEL_ENABLE; + + if (setup->first_vertex_provoking) { + dw3 |= 0 << GEN7_SF_DW3_TRI_PROVOKE__SHIFT | + 0 << GEN7_SF_DW3_LINE_PROVOKE__SHIFT | + 1 << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT; + } else { + dw3 |= 2 << GEN7_SF_DW3_TRI_PROVOKE__SHIFT | + 1 << GEN7_SF_DW3_LINE_PROVOKE__SHIFT | + 2 << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT; + } + + if (!point->programmable_width) { + dw3 |= GEN7_SF_DW3_USE_POINT_WIDTH | + point_width << GEN7_SF_DW3_POINT_WIDTH__SHIFT; + } + + STATIC_ASSERT(ARRAY_SIZE(rs->sf) >= 3); + rs->sf[0] = dw1; + rs->sf[1] = dw2; + rs->sf[2] = dw3; + + return 
true; +} + +static bool +raster_set_gen8_3DSTATE_RASTER(struct ilo_state_raster *rs, + const struct ilo_dev *dev, + const struct ilo_state_raster_info *info, + const struct ilo_state_raster_line_info *line) +{ + const struct ilo_state_raster_clip_info *clip = &info->clip; + const struct ilo_state_raster_setup_info *setup = &info->setup; + const struct ilo_state_raster_point_info *point = &info->point; + const struct ilo_state_raster_tri_info *tri = &info->tri; + const struct ilo_state_raster_params_info *params = &info->params; + uint32_t dw1; + + ILO_DEV_ASSERT(dev, 8, 8); + + if (!raster_validate_gen8_raster(dev, info)) + return false; + + dw1 = tri->front_winding << GEN8_RASTER_DW1_FRONT_WINDING__SHIFT | + tri->cull_mode << GEN8_RASTER_DW1_CULL_MODE__SHIFT | + tri->fill_mode_front << GEN8_RASTER_DW1_FILL_MODE_FRONT__SHIFT | + tri->fill_mode_back << GEN8_RASTER_DW1_FILL_MODE_BACK__SHIFT; + + if (point->aa_enable) + dw1 |= GEN8_RASTER_DW1_SMOOTH_POINT_ENABLE; + + /* where should line_msaa_enable be set? */ + if (setup->msaa_enable) + dw1 |= GEN8_RASTER_DW1_API_MULTISAMPLE_ENABLE; + + if (tri->depth_offset_solid) + dw1 |= GEN8_RASTER_DW1_DEPTH_OFFSET_SOLID; + if (tri->depth_offset_wireframe) + dw1 |= GEN8_RASTER_DW1_DEPTH_OFFSET_WIREFRAME; + if (tri->depth_offset_point) + dw1 |= GEN8_RASTER_DW1_DEPTH_OFFSET_POINT; + + if (line->aa_enable) + dw1 |= GEN8_RASTER_DW1_AA_LINE_ENABLE; + + if (setup->scissor_enable) + dw1 |= GEN8_RASTER_DW1_SCISSOR_ENABLE; + + if (ilo_dev_gen(dev) >= ILO_GEN(9)) { + if (clip->z_far_enable) + dw1 |= GEN9_RASTER_DW1_Z_TEST_FAR_ENABLE; + if (clip->z_near_enable) + dw1 |= GEN9_RASTER_DW1_Z_TEST_NEAR_ENABLE; + } else { + if (clip->z_near_enable) + dw1 |= GEN8_RASTER_DW1_Z_TEST_ENABLE; + } + + STATIC_ASSERT(ARRAY_SIZE(rs->raster) >= 4); + rs->raster[0] = dw1; + rs->raster[1] = fui(params->depth_offset_const); + rs->raster[2] = fui(params->depth_offset_scale); + rs->raster[3] = fui(params->depth_offset_clamp); + + rs->line_aa_enable = line->aa_enable; + rs->line_giq_enable = line->giq_enable; + + return true; +} + +static enum gen_sample_count +get_gen6_sample_count(const struct ilo_dev *dev, uint8_t sample_count) +{ + enum gen_sample_count c; + int min_gen; + + ILO_DEV_ASSERT(dev, 6, 8); + + switch (sample_count) { + case 1: + c = GEN6_NUMSAMPLES_1; + min_gen = ILO_GEN(6); + break; + case 2: + c = GEN8_NUMSAMPLES_2; + min_gen = ILO_GEN(8); + break; + case 4: + c = GEN6_NUMSAMPLES_4; + min_gen = ILO_GEN(6); + break; + case 8: + c = GEN7_NUMSAMPLES_8; + min_gen = ILO_GEN(7); + break; + case 16: + c = GEN8_NUMSAMPLES_16; + min_gen = ILO_GEN(8); + break; + default: + assert(!"unexpected sample count"); + c = GEN6_NUMSAMPLES_1; + break; + } + + assert(ilo_dev_gen(dev) >= min_gen); + + return c; +} + +static bool +raster_set_gen8_3DSTATE_MULTISAMPLE(struct ilo_state_raster *rs, + const struct ilo_dev *dev, + const struct ilo_state_raster_info *info) +{ + const struct ilo_state_raster_setup_info *setup = &info->setup; + const struct ilo_state_raster_scan_info *scan = &info->scan; + const enum gen_sample_count count = + get_gen6_sample_count(dev, scan->sample_count); + uint32_t dw1; + + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 307: + * + * "Setting Multisample Rasterization Mode to MSRASTMODE_xxx_PATTERN + * when Number of Multisamples == NUMSAMPLES_1 is UNDEFINED." 
+ */ + if (setup->msaa_enable) + assert(scan->sample_count > 1); + + dw1 = scan->pixloc << GEN6_MULTISAMPLE_DW1_PIXEL_LOCATION__SHIFT | + count << GEN6_MULTISAMPLE_DW1_NUM_SAMPLES__SHIFT; + + STATIC_ASSERT(ARRAY_SIZE(rs->sample) >= 1); + rs->sample[0] = dw1; + + return true; +} + +static bool +raster_set_gen6_3DSTATE_SAMPLE_MASK(struct ilo_state_raster *rs, + const struct ilo_dev *dev, + const struct ilo_state_raster_info *info) +{ + const struct ilo_state_raster_scan_info *scan = &info->scan; + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 294: + * + * "If Number of Multisamples is NUMSAMPLES_1, bits 7:1 of this field + * (Sample Mask) must be zero. + * + * If Number of Multisamples is NUMSAMPLES_4, bits 7:4 of this field + * must be zero." + */ + const uint32_t mask = (1 << scan->sample_count) - 1; + uint32_t dw1; + + ILO_DEV_ASSERT(dev, 6, 8); + + dw1 = (scan->sample_mask & mask) << GEN6_SAMPLE_MASK_DW1_VAL__SHIFT; + + STATIC_ASSERT(ARRAY_SIZE(rs->sample) >= 2); + rs->sample[1] = dw1; + + return true; +} + +static bool +raster_validate_gen6_wm(const struct ilo_dev *dev, + const struct ilo_state_raster_info *info) +{ + const struct ilo_state_raster_scan_info *scan = &info->scan; + + ILO_DEV_ASSERT(dev, 6, 8); + + if (ilo_dev_gen(dev) == ILO_GEN(6)) + assert(scan->earlyz_control == GEN7_EDSC_NORMAL); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 272: + * + * "This bit (Statistics Enable) must be disabled if either of these + * bits is set: Depth Buffer Clear , Hierarchical Depth Buffer Resolve + * Enable or Depth Buffer Resolve Enable." + */ + if (scan->earlyz_op != ILO_STATE_RASTER_EARLYZ_NORMAL) + assert(!scan->stats_enable); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 273: + * + * "If this field (Depth Buffer Resolve Enable) is enabled, the Depth + * Buffer Clear and Hierarchical Depth Buffer Resolve Enable fields + * must both be disabled." + * + * "If this field (Hierarchical Depth Buffer Resolve Enable) is + * enabled, the Depth Buffer Clear and Depth Buffer Resolve Enable + * fields must both be disabled." + * + * This is guaranteed. + */ + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 314-315: + * + * "Stencil buffer clear can be performed at the same time by enabling + * Stencil Buffer Write Enable." + * + * "Note also that stencil buffer clear can be performed without depth + * buffer clear." 
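+ * + * Hence the assert below allows earlyz_stencil_clear only together with + * no early-z op or with a depth clear.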
+ */ + if (scan->earlyz_stencil_clear) { + assert(scan->earlyz_op == ILO_STATE_RASTER_EARLYZ_NORMAL || + scan->earlyz_op == ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR); + } + + return true; +} + +static bool +raster_set_gen6_3dstate_wm(struct ilo_state_raster *rs, + const struct ilo_dev *dev, + const struct ilo_state_raster_info *info, + const struct ilo_state_raster_line_info *line) +{ + const struct ilo_state_raster_tri_info *tri = &info->tri; + const struct ilo_state_raster_setup_info *setup = &info->setup; + const struct ilo_state_raster_scan_info *scan = &info->scan; + const enum gen_msrast_mode msrast = + raster_setup_get_gen6_msrast_mode(dev, setup); + /* only scan conversion states are set, as in Gen8+ */ + uint32_t dw4, dw5, dw6; + + ILO_DEV_ASSERT(dev, 6, 6); + + if (!raster_validate_gen6_wm(dev, info)) + return false; + + dw4 = 0; + + if (scan->stats_enable) + dw4 |= GEN6_WM_DW4_STATISTICS; + + switch (scan->earlyz_op) { + case ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR: + dw4 |= GEN6_WM_DW4_DEPTH_CLEAR; + break; + case ILO_STATE_RASTER_EARLYZ_DEPTH_RESOLVE: + dw4 |= GEN6_WM_DW4_DEPTH_RESOLVE; + break; + case ILO_STATE_RASTER_EARLYZ_HIZ_RESOLVE: + dw4 |= GEN6_WM_DW4_HIZ_RESOLVE; + break; + default: + if (scan->earlyz_stencil_clear) + dw4 |= GEN6_WM_DW4_DEPTH_CLEAR; + break; + } + + dw5 = GEN6_WM_DW5_AA_LINE_CAP_1_0 | /* same as in 3DSTATE_SF */ + GEN6_WM_DW5_AA_LINE_WIDTH_2_0; + + if (tri->poly_stipple_enable) + dw5 |= GEN6_WM_DW5_POLY_STIPPLE_ENABLE; + if (line->stipple_enable) + dw5 |= GEN6_WM_DW5_LINE_STIPPLE_ENABLE; + + dw6 = scan->zw_interp << GEN6_WM_DW6_ZW_INTERP__SHIFT | + scan->barycentric_interps << GEN6_WM_DW6_BARYCENTRIC_INTERP__SHIFT | + GEN6_WM_DW6_POINT_RASTRULE_UPPER_RIGHT | + msrast << GEN6_WM_DW6_MSRASTMODE__SHIFT; + + STATIC_ASSERT(ARRAY_SIZE(rs->wm) >= 3); + rs->wm[0] = dw4; + rs->wm[1] = dw5; + rs->wm[2] = dw6; + + return true; +} + +static bool +raster_set_gen8_3DSTATE_WM(struct ilo_state_raster *rs, + const struct ilo_dev *dev, + const struct ilo_state_raster_info *info, + const struct ilo_state_raster_line_info *line) +{ + const struct ilo_state_raster_tri_info *tri = &info->tri; + const struct ilo_state_raster_setup_info *setup = &info->setup; + const struct ilo_state_raster_scan_info *scan = &info->scan; + const enum gen_msrast_mode msrast = + raster_setup_get_gen6_msrast_mode(dev, setup); + uint32_t dw1; + + ILO_DEV_ASSERT(dev, 7, 8); + + if (!raster_validate_gen6_wm(dev, info)) + return false; + + dw1 = scan->earlyz_control << GEN7_WM_DW1_EDSC__SHIFT | + scan->zw_interp << GEN7_WM_DW1_ZW_INTERP__SHIFT | + scan->barycentric_interps << GEN7_WM_DW1_BARYCENTRIC_INTERP__SHIFT | + GEN7_WM_DW1_AA_LINE_CAP_1_0 | /* same as in 3DSTATE_SF */ + GEN7_WM_DW1_AA_LINE_WIDTH_2_0 | + GEN7_WM_DW1_POINT_RASTRULE_UPPER_RIGHT; + + if (scan->stats_enable) + dw1 |= GEN7_WM_DW1_STATISTICS; + + if (ilo_dev_gen(dev) < ILO_GEN(8)) { + switch (scan->earlyz_op) { + case ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR: + dw1 |= GEN7_WM_DW1_DEPTH_CLEAR; + break; + case ILO_STATE_RASTER_EARLYZ_DEPTH_RESOLVE: + dw1 |= GEN7_WM_DW1_DEPTH_RESOLVE; + break; + case ILO_STATE_RASTER_EARLYZ_HIZ_RESOLVE: + dw1 |= GEN7_WM_DW1_HIZ_RESOLVE; + break; + default: + if (scan->earlyz_stencil_clear) + dw1 |= GEN7_WM_DW1_DEPTH_CLEAR; + break; + } + } + + if (tri->poly_stipple_enable) + dw1 |= GEN7_WM_DW1_POLY_STIPPLE_ENABLE; + if (line->stipple_enable) + dw1 |= GEN7_WM_DW1_LINE_STIPPLE_ENABLE; + + if (ilo_dev_gen(dev) < ILO_GEN(8)) + dw1 |= msrast << GEN7_WM_DW1_MSRASTMODE__SHIFT; + + STATIC_ASSERT(ARRAY_SIZE(rs->wm) >= 1); + 
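/* + * A sketch of the rs->wm[] layout assumed here: Gen6 stores 3DSTATE_WM + * DW4-DW6 in wm[0..2]; Gen7+ stores 3DSTATE_WM DW1 in wm[0], and Gen8 + * additionally stores 3DSTATE_WM_HZ_OP DW1 and DW4 in wm[1..2]. + */ +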
rs->wm[0] = dw1; + + return true; +} + +static bool +raster_set_gen8_3dstate_wm_hz_op(struct ilo_state_raster *rs, + const struct ilo_dev *dev, + const struct ilo_state_raster_info *info) +{ + const struct ilo_state_raster_scan_info *scan = &info->scan; + const enum gen_sample_count count = + get_gen6_sample_count(dev, scan->sample_count); + const uint32_t mask = (1 << scan->sample_count) - 1; + uint32_t dw1, dw4; + + ILO_DEV_ASSERT(dev, 8, 8); + + dw1 = count << GEN8_WM_HZ_DW1_NUM_SAMPLES__SHIFT; + + if (scan->earlyz_stencil_clear) + dw1 |= GEN8_WM_HZ_DW1_STENCIL_CLEAR; + + switch (scan->earlyz_op) { + case ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR: + dw1 |= GEN8_WM_HZ_DW1_DEPTH_CLEAR; + break; + case ILO_STATE_RASTER_EARLYZ_DEPTH_RESOLVE: + dw1 |= GEN8_WM_HZ_DW1_DEPTH_RESOLVE; + break; + case ILO_STATE_RASTER_EARLYZ_HIZ_RESOLVE: + dw1 |= GEN8_WM_HZ_DW1_HIZ_RESOLVE; + break; + default: + break; + } + + dw4 = (scan->sample_mask & mask) << GEN8_WM_HZ_DW4_SAMPLE_MASK__SHIFT; + + STATIC_ASSERT(ARRAY_SIZE(rs->wm) >= 3); + rs->wm[1] = dw1; + rs->wm[2] = dw4; + + return true; +} + +static bool +sample_pattern_get_gen6_packed_offsets(const struct ilo_dev *dev, + uint8_t sample_count, + const struct ilo_state_sample_pattern_offset_info *in, + uint8_t *out) +{ + uint8_t max_dist, i; + + ILO_DEV_ASSERT(dev, 6, 8); + + max_dist = 0; + for (i = 0; i < sample_count; i++) { + const int8_t dist_x = (int8_t) in[i].x - 8; + const int8_t dist_y = (int8_t) in[i].y - 8; + const uint8_t dist = dist_x * dist_x + dist_y * dist_y; + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 305: + * + * "Programming Note: When programming the sample offsets (for + * NUMSAMPLES_4 or _8 and MSRASTMODE_xxx_PATTERN), the order of the + * samples 0 to 3 (or 7 for 8X) must have monotonically increasing + * distance from the pixel center. This is required to get the + * correct centroid computation in the device." 
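+       *
+       * For illustration, the default 4x pattern ({ 6, 2 }, { 14, 6 },
+       * { 2, 10 }, { 10, 14 }) keeps all four samples at the same squared
+       * distance (x - 8)^2 + (y - 8)^2 = 40 from the pixel center, trivially
+       * satisfying the requirement, and packs below to 0x62, 0xe6, 0x2a,
+       * and 0xae.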
+       */
+      assert(dist >= max_dist);
+      max_dist = dist;
+
+      assert(in[i].x < 16);
+      assert(in[i].y < 16);
+
+      out[i] = in[i].x << 4 | in[i].y;
+   }
+
+   return true;
+}
+
+static bool
+line_stipple_set_gen6_3DSTATE_LINE_STIPPLE(struct ilo_state_line_stipple *stipple,
+                                           const struct ilo_dev *dev,
+                                           const struct ilo_state_line_stipple_info *info)
+{
+   uint32_t dw1, dw2;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   assert(info->repeat_count >= 1 && info->repeat_count <= 256);
+
+   dw1 = info->pattern;
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      /* in U1.16 */
+      const uint32_t inverse = 65536 / info->repeat_count;
+      dw2 = inverse << GEN7_LINE_STIPPLE_DW2_INVERSE_REPEAT_COUNT__SHIFT |
+            info->repeat_count << GEN6_LINE_STIPPLE_DW2_REPEAT_COUNT__SHIFT;
+   } else {
+      /* in U1.13 */
+      const uint16_t inverse = 8192 / info->repeat_count;
+      dw2 = inverse << GEN6_LINE_STIPPLE_DW2_INVERSE_REPEAT_COUNT__SHIFT |
+            info->repeat_count << GEN6_LINE_STIPPLE_DW2_REPEAT_COUNT__SHIFT;
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(stipple->stipple) >= 2);
+   stipple->stipple[0] = dw1;
+   stipple->stipple[1] = dw2;
+
+   return true;
+}
+
+static bool
+sample_pattern_set_gen8_3DSTATE_SAMPLE_PATTERN(struct ilo_state_sample_pattern *pattern,
+                                               const struct ilo_dev *dev,
+                                               const struct ilo_state_sample_pattern_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   STATIC_ASSERT(ARRAY_SIZE(pattern->pattern_1x) >= 1);
+   STATIC_ASSERT(ARRAY_SIZE(pattern->pattern_2x) >= 2);
+   STATIC_ASSERT(ARRAY_SIZE(pattern->pattern_4x) >= 4);
+   STATIC_ASSERT(ARRAY_SIZE(pattern->pattern_8x) >= 8);
+   STATIC_ASSERT(ARRAY_SIZE(pattern->pattern_16x) >= 16);
+
+   return (sample_pattern_get_gen6_packed_offsets(dev, 1,
+              info->pattern_1x, pattern->pattern_1x) &&
+           sample_pattern_get_gen6_packed_offsets(dev, 2,
+              info->pattern_2x, pattern->pattern_2x) &&
+           sample_pattern_get_gen6_packed_offsets(dev, 4,
+              info->pattern_4x, pattern->pattern_4x) &&
+           sample_pattern_get_gen6_packed_offsets(dev, 8,
+              info->pattern_8x, pattern->pattern_8x) &&
+           sample_pattern_get_gen6_packed_offsets(dev, 16,
+              info->pattern_16x, pattern->pattern_16x));
+}
+
+static bool
+poly_stipple_set_gen6_3DSTATE_POLY_STIPPLE_PATTERN(struct ilo_state_poly_stipple *stipple,
+                                                   const struct ilo_dev *dev,
+                                                   const struct ilo_state_poly_stipple_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   STATIC_ASSERT(ARRAY_SIZE(stipple->stipple) >= 32);
+   memcpy(stipple->stipple, info->pattern, sizeof(info->pattern));
+
+   return true;
+}
+
+bool
+ilo_state_raster_init(struct ilo_state_raster *rs,
+                      const struct ilo_dev *dev,
+                      const struct ilo_state_raster_info *info)
+{
+   assert(ilo_is_zeroed(rs, sizeof(*rs)));
+   return ilo_state_raster_set_info(rs, dev, info);
+}
+
+bool
+ilo_state_raster_init_for_rectlist(struct ilo_state_raster *rs,
+                                   const struct ilo_dev *dev,
+                                   uint8_t sample_count,
+                                   enum ilo_state_raster_earlyz_op earlyz_op,
+                                   bool earlyz_stencil_clear)
+{
+   struct ilo_state_raster_info info;
+
+   memset(&info, 0, sizeof(info));
+
+   info.clip.viewport_count = 1;
+   info.setup.cv_is_rectangle = true;
+   info.setup.msaa_enable = (sample_count > 1);
+   info.scan.sample_count = sample_count;
+   info.scan.sample_mask = ~0u;
+   info.scan.earlyz_op = earlyz_op;
+   info.scan.earlyz_stencil_clear = earlyz_stencil_clear;
+
+   return ilo_state_raster_init(rs, dev, &info);
+}
+
+bool
+ilo_state_raster_set_info(struct ilo_state_raster *rs,
+                          const struct ilo_dev *dev,
+                          const struct ilo_state_raster_info *info)
+{
+   struct ilo_state_raster_line_info line;
+   bool ret = true;
+
+   ret &= raster_set_gen6_3DSTATE_CLIP(rs, dev, info);
+
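+   /* fold the "ignored when MSAA/AA is enabled" rules of the input line
+    * state into an effective copy for the per-Gen helpers below
+    */
+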
raster_get_gen6_effective_line(dev, info, &line); + + if (ilo_dev_gen(dev) >= ILO_GEN(8)) { + ret &= raster_set_gen8_3DSTATE_SF(rs, dev, info, &line); + ret &= raster_set_gen8_3DSTATE_RASTER(rs, dev, info, &line); + } else { + ret &= raster_set_gen7_3DSTATE_SF(rs, dev, info, &line); + } + + ret &= raster_set_gen8_3DSTATE_MULTISAMPLE(rs, dev, info); + ret &= raster_set_gen6_3DSTATE_SAMPLE_MASK(rs, dev, info); + + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + ret &= raster_set_gen8_3DSTATE_WM(rs, dev, info, &line); + + if (ilo_dev_gen(dev) >= ILO_GEN(8)) + ret &= raster_set_gen8_3dstate_wm_hz_op(rs, dev, info); + } else { + ret &= raster_set_gen6_3dstate_wm(rs, dev, info, &line); + } + + assert(ret); + + return ret; +} + +bool +ilo_state_raster_set_params(struct ilo_state_raster *rs, + const struct ilo_dev *dev, + const struct ilo_state_raster_params_info *params) +{ + const bool line_aa_enable = (rs->line_aa_enable && + raster_params_is_gen6_line_aa_allowed(dev, params)); + const int line_width = get_gen6_line_width(dev, params->line_width, + line_aa_enable, rs->line_giq_enable); + + ILO_DEV_ASSERT(dev, 6, 8); + + /* modify line AA enable */ + if (rs->line_aa_enable) { + if (ilo_dev_gen(dev) >= ILO_GEN(8)) { + if (line_aa_enable) + rs->raster[0] |= GEN8_RASTER_DW1_AA_LINE_ENABLE; + else + rs->raster[0] &= ~GEN8_RASTER_DW1_AA_LINE_ENABLE; + } else { + if (line_aa_enable) + rs->sf[1] |= GEN7_SF_DW2_AA_LINE_ENABLE; + else + rs->sf[1] &= ~GEN7_SF_DW2_AA_LINE_ENABLE; + } + } + + /* modify line width */ + rs->sf[1] = (rs->sf[1] & ~GEN7_SF_DW2_LINE_WIDTH__MASK) | + line_width << GEN7_SF_DW2_LINE_WIDTH__SHIFT; + + /* modify point width */ + if (rs->sf[2] & GEN7_SF_DW3_USE_POINT_WIDTH) { + const int point_width = get_gen6_point_width(dev, params->point_width); + + rs->sf[2] = (rs->sf[2] & ~GEN7_SF_DW3_POINT_WIDTH__MASK) | + point_width << GEN7_SF_DW3_POINT_WIDTH__SHIFT; + } + + /* modify depth offset */ + rs->raster[1] = fui(params->depth_offset_const); + rs->raster[2] = fui(params->depth_offset_scale); + rs->raster[3] = fui(params->depth_offset_clamp); + + return true; +} + +void +ilo_state_raster_full_delta(const struct ilo_state_raster *rs, + const struct ilo_dev *dev, + struct ilo_state_raster_delta *delta) +{ + delta->dirty = ILO_STATE_RASTER_3DSTATE_CLIP | + ILO_STATE_RASTER_3DSTATE_SF | + ILO_STATE_RASTER_3DSTATE_MULTISAMPLE | + ILO_STATE_RASTER_3DSTATE_SAMPLE_MASK | + ILO_STATE_RASTER_3DSTATE_WM | + ILO_STATE_RASTER_3DSTATE_AA_LINE_PARAMETERS; + + if (ilo_dev_gen(dev) >= ILO_GEN(8)) { + delta->dirty |= ILO_STATE_RASTER_3DSTATE_RASTER | + ILO_STATE_RASTER_3DSTATE_WM_HZ_OP; + } +} + +void +ilo_state_raster_get_delta(const struct ilo_state_raster *rs, + const struct ilo_dev *dev, + const struct ilo_state_raster *old, + struct ilo_state_raster_delta *delta) +{ + delta->dirty = 0; + + if (memcmp(rs->clip, old->clip, sizeof(rs->clip))) + delta->dirty |= ILO_STATE_RASTER_3DSTATE_CLIP; + + if (memcmp(rs->sf, old->sf, sizeof(rs->sf))) + delta->dirty |= ILO_STATE_RASTER_3DSTATE_SF; + + if (memcmp(rs->raster, old->raster, sizeof(rs->raster))) { + if (ilo_dev_gen(dev) >= ILO_GEN(8)) + delta->dirty |= ILO_STATE_RASTER_3DSTATE_RASTER; + else + delta->dirty |= ILO_STATE_RASTER_3DSTATE_SF; + } + + if (memcmp(rs->sample, old->sample, sizeof(rs->sample))) { + delta->dirty |= ILO_STATE_RASTER_3DSTATE_MULTISAMPLE | + ILO_STATE_RASTER_3DSTATE_SAMPLE_MASK; + } + + if (memcmp(rs->wm, old->wm, sizeof(rs->wm))) { + delta->dirty |= ILO_STATE_RASTER_3DSTATE_WM; + + if (ilo_dev_gen(dev) >= ILO_GEN(8)) + delta->dirty |= 
ILO_STATE_RASTER_3DSTATE_WM_HZ_OP;
+   }
+}
+
+bool
+ilo_state_sample_pattern_init(struct ilo_state_sample_pattern *pattern,
+                              const struct ilo_dev *dev,
+                              const struct ilo_state_sample_pattern_info *info)
+{
+   bool ret = true;
+
+   ret &= sample_pattern_set_gen8_3DSTATE_SAMPLE_PATTERN(pattern, dev, info);
+
+   assert(ret);
+
+   return ret;
+}
+
+bool
+ilo_state_sample_pattern_init_default(struct ilo_state_sample_pattern *pattern,
+                                      const struct ilo_dev *dev)
+{
+   static const struct ilo_state_sample_pattern_info default_info = {
+      .pattern_1x = {
+         { 8, 8 },
+      },
+
+      .pattern_2x = {
+         { 4, 4 }, { 12, 12 },
+      },
+
+      .pattern_4x = {
+         { 6, 2 }, { 14, 6 }, { 2, 10 }, { 10, 14 },
+      },
+
+      /* \see brw_multisample_positions_8x */
+      .pattern_8x = {
+         { 7, 9 }, { 9, 13 }, { 11, 3 }, { 13, 11 },
+         { 1, 7 }, { 5, 1 }, { 15, 5 }, { 3, 15 },
+      },
+
+      .pattern_16x = {
+         { 8, 10 }, { 11, 8 }, { 5, 6 }, { 6, 4 },
+         { 12, 11 }, { 13, 9 }, { 14, 7 }, { 10, 2 },
+         { 4, 13 }, { 3, 3 }, { 7, 1 }, { 15, 5 },
+         { 1, 12 }, { 9, 0 }, { 2, 14 }, { 0, 15 },
+      },
+   };
+
+   return ilo_state_sample_pattern_init(pattern, dev, &default_info);
+}
+
+const uint8_t *
+ilo_state_sample_pattern_get_packed_offsets(const struct ilo_state_sample_pattern *pattern,
+                                            const struct ilo_dev *dev,
+                                            uint8_t sample_count)
+{
+   switch (sample_count) {
+   case 1: return pattern->pattern_1x;
+   case 2: return pattern->pattern_2x;
+   case 4: return pattern->pattern_4x;
+   case 8: return pattern->pattern_8x;
+   case 16: return pattern->pattern_16x;
+   default:
+      assert(!"unknown sample count");
+      return NULL;
+   }
+}
+
+void
+ilo_state_sample_pattern_get_offset(const struct ilo_state_sample_pattern *pattern,
+                                    const struct ilo_dev *dev,
+                                    uint8_t sample_count, uint8_t sample_index,
+                                    uint8_t *x, uint8_t *y)
+{
+   const uint8_t *packed =
+      ilo_state_sample_pattern_get_packed_offsets(pattern, dev, sample_count);
+
+   assert(sample_index < sample_count);
+
+   *x = (packed[sample_index] >> 4) & 0xf;
+   *y = packed[sample_index] & 0xf;
+}
+
+/**
+ * No need to initialize first.
+ */
+bool
+ilo_state_line_stipple_set_info(struct ilo_state_line_stipple *stipple,
+                                const struct ilo_dev *dev,
+                                const struct ilo_state_line_stipple_info *info)
+{
+   bool ret = true;
+
+   ret &= line_stipple_set_gen6_3DSTATE_LINE_STIPPLE(stipple,
+         dev, info);
+
+   assert(ret);
+
+   return ret;
+}
+
+/**
+ * No need to initialize first.
+ */
+bool
+ilo_state_poly_stipple_set_info(struct ilo_state_poly_stipple *stipple,
+                                const struct ilo_dev *dev,
+                                const struct ilo_state_poly_stipple_info *info)
+{
+   bool ret = true;
+
+   ret &= poly_stipple_set_gen6_3DSTATE_POLY_STIPPLE_PATTERN(stipple,
+         dev, info);
+
+   assert(ret);
+
+   return ret;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_raster.h b/src/gallium/drivers/ilo/core/ilo_state_raster.h
new file mode 100644
index 00000000000..fc90b49cfc3
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_raster.h
@@ -0,0 +1,301 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#ifndef ILO_STATE_RASTER_H +#define ILO_STATE_RASTER_H + +#include "genhw/genhw.h" + +#include "ilo_core.h" +#include "ilo_dev.h" + +enum ilo_state_raster_dirty_bits { + ILO_STATE_RASTER_3DSTATE_CLIP = (1 << 0), + ILO_STATE_RASTER_3DSTATE_SF = (1 << 1), + ILO_STATE_RASTER_3DSTATE_RASTER = (1 << 2), + ILO_STATE_RASTER_3DSTATE_MULTISAMPLE = (1 << 3), + ILO_STATE_RASTER_3DSTATE_SAMPLE_MASK = (1 << 4), + ILO_STATE_RASTER_3DSTATE_WM = (1 << 5), + ILO_STATE_RASTER_3DSTATE_WM_HZ_OP = (1 << 6), + ILO_STATE_RASTER_3DSTATE_AA_LINE_PARAMETERS = (1 << 7), +}; + +enum ilo_state_raster_earlyz_op { + ILO_STATE_RASTER_EARLYZ_NORMAL, + ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR, + ILO_STATE_RASTER_EARLYZ_DEPTH_RESOLVE, + ILO_STATE_RASTER_EARLYZ_HIZ_RESOLVE, +}; + +/** + * VUE readback, VertexClipTest, ClipDetermination, and primitive output. + */ +struct ilo_state_raster_clip_info { + bool clip_enable; + /* CL_INVOCATION_COUNT and CL_PRIMITIVES_COUNT */ + bool stats_enable; + + uint8_t viewport_count; + bool force_rtaindex_zero; + + /* these should be mutually exclusive */ + uint8_t user_cull_enables; + uint8_t user_clip_enables; + + bool gb_test_enable; + bool xy_test_enable; + + /* far/near must be enabled together prior to Gen9 */ + bool z_far_enable; + bool z_near_enable; + bool z_near_zero; +}; + +/** + * Primitive assembly, viewport transformation, scissoring, MSAA, etc. + */ +struct ilo_state_raster_setup_info { + bool cv_is_rectangle; + + bool first_vertex_provoking; + bool viewport_transform; + + bool scissor_enable; + + /* MSAA enables for lines and non-lines */ + bool msaa_enable; + bool line_msaa_enable; +}; + +/** + * 3DOBJ_POINT rasterization rules. + */ +struct ilo_state_raster_point_info { + /* ignored when msaa_enable is set */ + bool aa_enable; + + bool programmable_width; +}; + +/** + * 3DOBJ_LINE rasterization rules. + */ +struct ilo_state_raster_line_info { + /* ignored when line_msaa_enable is set */ + bool aa_enable; + + /* ignored when line_msaa_enable or aa_enable is set */ + bool stipple_enable; + bool giq_enable; + bool giq_last_pixel; +}; + +/** + * 3DOBJ_TRIANGLE rasterization rules. 
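+ *
+ * The depth_offset_solid/wireframe/point flags below only select which fill
+ * modes receive the depth offset; the offset terms themselves (const, scale,
+ * and clamp) are supplied separately via ilo_state_raster_params_info.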
+ */ +struct ilo_state_raster_tri_info { + enum gen_front_winding front_winding; + enum gen_cull_mode cull_mode; + enum gen_fill_mode fill_mode_front; + enum gen_fill_mode fill_mode_back; + + enum gen_depth_format depth_offset_format; + bool depth_offset_solid; + bool depth_offset_wireframe; + bool depth_offset_point; + + bool poly_stipple_enable; +}; + +/** + * Scan conversion. + */ +struct ilo_state_raster_scan_info { + /* PS_DEPTH_COUNT and PS_INVOCATION_COUNT */ + bool stats_enable; + + uint8_t sample_count; + + /* pixel location for non-MSAA or 1x-MSAA */ + enum gen_pixel_location pixloc; + + uint32_t sample_mask; + + /* interpolations */ + enum gen_zw_interp zw_interp; + uint8_t barycentric_interps; + + /* Gen7+ only */ + enum gen_edsc_mode earlyz_control; + enum ilo_state_raster_earlyz_op earlyz_op; + bool earlyz_stencil_clear; +}; + +/** + * Raster parameters. + */ +struct ilo_state_raster_params_info { + bool any_integer_rt; + bool hiz_enable; + + float point_width; + float line_width; + + /* const term will be scaled by 'r' */ + float depth_offset_const; + float depth_offset_scale; + float depth_offset_clamp; +}; + +struct ilo_state_raster_info { + struct ilo_state_raster_clip_info clip; + struct ilo_state_raster_setup_info setup; + struct ilo_state_raster_point_info point; + struct ilo_state_raster_line_info line; + struct ilo_state_raster_tri_info tri; + struct ilo_state_raster_scan_info scan; + + struct ilo_state_raster_params_info params; +}; + +struct ilo_state_raster { + uint32_t clip[3]; + uint32_t sf[3]; + uint32_t raster[4]; + uint32_t sample[2]; + uint32_t wm[3]; + + bool line_aa_enable; + bool line_giq_enable; +}; + +struct ilo_state_raster_delta { + uint32_t dirty; +}; + +struct ilo_state_sample_pattern_offset_info { + /* in U0.4 */ + uint8_t x; + uint8_t y; +}; + +struct ilo_state_sample_pattern_info { + struct ilo_state_sample_pattern_offset_info pattern_1x[1]; + struct ilo_state_sample_pattern_offset_info pattern_2x[2]; + struct ilo_state_sample_pattern_offset_info pattern_4x[4]; + struct ilo_state_sample_pattern_offset_info pattern_8x[8]; + struct ilo_state_sample_pattern_offset_info pattern_16x[16]; +}; + +struct ilo_state_sample_pattern { + uint8_t pattern_1x[1]; + uint8_t pattern_2x[2]; + uint8_t pattern_4x[4]; + uint8_t pattern_8x[8]; + uint8_t pattern_16x[16]; +}; + +struct ilo_state_line_stipple_info { + uint16_t pattern; + uint16_t repeat_count; +}; + +struct ilo_state_line_stipple { + uint32_t stipple[2]; +}; + +struct ilo_state_poly_stipple_info { + uint32_t pattern[32]; +}; + +struct ilo_state_poly_stipple { + uint32_t stipple[32]; +}; + +bool +ilo_state_raster_init(struct ilo_state_raster *rs, + const struct ilo_dev *dev, + const struct ilo_state_raster_info *info); + +bool +ilo_state_raster_init_for_rectlist(struct ilo_state_raster *rs, + const struct ilo_dev *dev, + uint8_t sample_count, + enum ilo_state_raster_earlyz_op earlyz_op, + bool earlyz_stencil_clear); + +bool +ilo_state_raster_set_info(struct ilo_state_raster *rs, + const struct ilo_dev *dev, + const struct ilo_state_raster_info *info); + +bool +ilo_state_raster_set_params(struct ilo_state_raster *rs, + const struct ilo_dev *dev, + const struct ilo_state_raster_params_info *params); + +void +ilo_state_raster_full_delta(const struct ilo_state_raster *rs, + const struct ilo_dev *dev, + struct ilo_state_raster_delta *delta); + +void +ilo_state_raster_get_delta(const struct ilo_state_raster *rs, + const struct ilo_dev *dev, + const struct ilo_state_raster *old, + struct ilo_state_raster_delta 
*delta);
+
+bool
+ilo_state_sample_pattern_init(struct ilo_state_sample_pattern *pattern,
+                              const struct ilo_dev *dev,
+                              const struct ilo_state_sample_pattern_info *info);
+
+bool
+ilo_state_sample_pattern_init_default(struct ilo_state_sample_pattern *pattern,
+                                      const struct ilo_dev *dev);
+
+const uint8_t *
+ilo_state_sample_pattern_get_packed_offsets(const struct ilo_state_sample_pattern *pattern,
+                                            const struct ilo_dev *dev,
+                                            uint8_t sample_count);
+
+void
+ilo_state_sample_pattern_get_offset(const struct ilo_state_sample_pattern *pattern,
+                                    const struct ilo_dev *dev,
+                                    uint8_t sample_count, uint8_t sample_index,
+                                    uint8_t *x, uint8_t *y);
+
+bool
+ilo_state_line_stipple_set_info(struct ilo_state_line_stipple *stipple,
+                                const struct ilo_dev *dev,
+                                const struct ilo_state_line_stipple_info *info);
+
+bool
+ilo_state_poly_stipple_set_info(struct ilo_state_poly_stipple *stipple,
+                                const struct ilo_dev *dev,
+                                const struct ilo_state_poly_stipple_info *info);
+
+#endif /* ILO_STATE_RASTER_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_sampler.c b/src/gallium/drivers/ilo/core/ilo_state_sampler.c
new file mode 100644
index 00000000000..3787f684fe8
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_sampler.c
@@ -0,0 +1,742 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <[email protected]>
+ */
+
+#include "util/u_half.h"
+
+#include "ilo_debug.h"
+#include "ilo_state_surface.h"
+#include "ilo_state_sampler.h"
+
+static bool
+sampler_validate_gen6_non_normalized(const struct ilo_dev *dev,
+                                     const struct ilo_state_sampler_info *info)
+{
+   const enum gen_texcoord_mode addr_ctrls[3] = {
+      info->tcx_ctrl, info->tcy_ctrl, info->tcz_ctrl,
+   };
+   int i;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Ivy Bridge PRM, volume 4 part 1, page 98:
+    *
+    *   "The following state must be set as indicated if this field
+    *    (Non-normalized Coordinate Enable) is enabled:
+    *
+    *    - TCX/Y/Z Address Control Mode must be TEXCOORDMODE_CLAMP,
+    *      TEXCOORDMODE_HALF_BORDER, or TEXCOORDMODE_CLAMP_BORDER.
+    *    - Surface Type must be SURFTYPE_2D or SURFTYPE_3D.
+    *    - Mag Mode Filter must be MAPFILTER_NEAREST or
+    *      MAPFILTER_LINEAR.
+    *    - Min Mode Filter must be MAPFILTER_NEAREST or
+    *      MAPFILTER_LINEAR.
+    *    - Mip Mode Filter must be MIPFILTER_NONE.
+    *    - Min LOD must be 0.
+    *    - Max LOD must be 0.
+    *    - MIP Count must be 0.
+    *    - Surface Min LOD must be 0.
+    *    - Texture LOD Bias must be 0."
+    */
+   for (i = 0; i < 3; i++) {
+      switch (addr_ctrls[i]) {
+      case GEN6_TEXCOORDMODE_CLAMP:
+      case GEN6_TEXCOORDMODE_CLAMP_BORDER:
+      case GEN8_TEXCOORDMODE_HALF_BORDER:
+         break;
+      default:
+         assert(!"bad non-normalized coordinate wrap mode");
+         break;
+      }
+   }
+
+   assert(info->mip_filter == GEN6_MIPFILTER_NONE);
+
+   assert((info->min_filter == GEN6_MAPFILTER_NEAREST ||
+           info->min_filter == GEN6_MAPFILTER_LINEAR) &&
+          (info->mag_filter == GEN6_MAPFILTER_NEAREST ||
+           info->mag_filter == GEN6_MAPFILTER_LINEAR));
+
+   assert(info->min_lod == 0.0f &&
+          info->max_lod == 0.0f &&
+          info->lod_bias == 0.0f);
+
+   return true;
+}
+
+static bool
+sampler_validate_gen6_sampler(const struct ilo_dev *dev,
+                              const struct ilo_state_sampler_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (info->non_normalized &&
+       !sampler_validate_gen6_non_normalized(dev, info))
+      return false;
+
+   if (ilo_dev_gen(dev) < ILO_GEN(8)) {
+      assert(info->tcx_ctrl != GEN8_TEXCOORDMODE_HALF_BORDER &&
+             info->tcy_ctrl != GEN8_TEXCOORDMODE_HALF_BORDER &&
+             info->tcz_ctrl != GEN8_TEXCOORDMODE_HALF_BORDER);
+   }
+
+   return true;
+}
+
+static uint32_t
+sampler_get_gen6_integer_filters(const struct ilo_dev *dev,
+                                 const struct ilo_state_sampler_info *info)
+{
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 103:
+    *
+    *   "MIPFILTER_LINEAR is not supported for surface formats that do not
+    *    support "Sampling Engine Filtering" as indicated in the Surface
+    *    Formats table unless using the sample_c message type."
+    *
+    *   "Only MAPFILTER_NEAREST is supported for surface formats that do not
+    *    support "Sampling Engine Filtering" as indicated in the Surface
+    *    Formats table unless using the sample_c message type."
+    */
+   const enum gen_mip_filter mip_filter =
+      (info->mip_filter == GEN6_MIPFILTER_LINEAR) ?
+      GEN6_MIPFILTER_NEAREST : info->mip_filter;
+   const enum gen_map_filter min_filter = GEN6_MAPFILTER_NEAREST;
+   const enum gen_map_filter mag_filter = GEN6_MAPFILTER_NEAREST;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   return mip_filter << GEN6_SAMPLER_DW0_MIP_FILTER__SHIFT |
+          mag_filter << GEN6_SAMPLER_DW0_MAG_FILTER__SHIFT |
+          min_filter << GEN6_SAMPLER_DW0_MIN_FILTER__SHIFT;
+}
+
+static uint32_t
+sampler_get_gen6_3d_filters(const struct ilo_dev *dev,
+                            const struct ilo_state_sampler_info *info)
+{
+   const enum gen_mip_filter mip_filter = info->mip_filter;
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 103:
+    *
+    *   "Only MAPFILTER_NEAREST and MAPFILTER_LINEAR are supported for
+    *    surfaces of type SURFTYPE_3D."
+    */
+   const enum gen_map_filter min_filter =
+      (info->min_filter == GEN6_MAPFILTER_NEAREST ||
+       info->min_filter == GEN6_MAPFILTER_LINEAR) ?
+      info->min_filter : GEN6_MAPFILTER_LINEAR;
+   const enum gen_map_filter mag_filter =
+      (info->mag_filter == GEN6_MAPFILTER_NEAREST ||
+       info->mag_filter == GEN6_MAPFILTER_LINEAR) ?
+ info->mag_filter : GEN6_MAPFILTER_LINEAR; + + ILO_DEV_ASSERT(dev, 6, 8); + + return mip_filter << GEN6_SAMPLER_DW0_MIP_FILTER__SHIFT | + mag_filter << GEN6_SAMPLER_DW0_MAG_FILTER__SHIFT | + min_filter << GEN6_SAMPLER_DW0_MIN_FILTER__SHIFT; +} + +static uint32_t +get_gen6_addr_controls(const struct ilo_dev *dev, + enum gen_texcoord_mode tcx_ctrl, + enum gen_texcoord_mode tcy_ctrl, + enum gen_texcoord_mode tcz_ctrl) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + return tcx_ctrl << GEN7_SAMPLER_DW3_U_WRAP__SHIFT | + tcy_ctrl << GEN7_SAMPLER_DW3_V_WRAP__SHIFT | + tcz_ctrl << GEN7_SAMPLER_DW3_R_WRAP__SHIFT; + } else { + return tcx_ctrl << GEN6_SAMPLER_DW1_U_WRAP__SHIFT | + tcy_ctrl << GEN6_SAMPLER_DW1_V_WRAP__SHIFT | + tcz_ctrl << GEN6_SAMPLER_DW1_R_WRAP__SHIFT; + } +} + +static uint32_t +sampler_get_gen6_1d_addr_controls(const struct ilo_dev *dev, + const struct ilo_state_sampler_info *info) +{ + const enum gen_texcoord_mode tcx_ctrl = + (info->tcx_ctrl == GEN6_TEXCOORDMODE_CUBE) ? + GEN6_TEXCOORDMODE_CLAMP : info->tcx_ctrl; + /* + * From the Ivy Bridge PRM, volume 4 part 1, page 100: + * + * "If this field (TCY Address Control Mode) is set to + * TEXCOORDMODE_CLAMP_BORDER or TEXCOORDMODE_HALF_BORDER and a 1D + * surface is sampled, incorrect blending with the border color in the + * vertical direction may occur." + */ + const enum gen_texcoord_mode tcy_ctrl = GEN6_TEXCOORDMODE_CLAMP; + const enum gen_texcoord_mode tcz_ctrl = GEN6_TEXCOORDMODE_CLAMP; + + ILO_DEV_ASSERT(dev, 6, 8); + + return get_gen6_addr_controls(dev, tcx_ctrl, tcy_ctrl, tcz_ctrl); +} + +static uint32_t +sampler_get_gen6_2d_3d_addr_controls(const struct ilo_dev *dev, + const struct ilo_state_sampler_info *info) +{ + const enum gen_texcoord_mode tcx_ctrl = + (info->tcx_ctrl == GEN6_TEXCOORDMODE_CUBE) ? + GEN6_TEXCOORDMODE_CLAMP : info->tcx_ctrl; + const enum gen_texcoord_mode tcy_ctrl = + (info->tcy_ctrl == GEN6_TEXCOORDMODE_CUBE) ? + GEN6_TEXCOORDMODE_CLAMP : info->tcy_ctrl; + /* + * From the Sandy Bridge PRM, volume 4 part 1, page 108: + * + * "[DevSNB]: if this field (TCZ Address Control Mode) is set to + * TEXCOORDMODE_CLAMP_BORDER samples outside the map will clamp to 0 + * instead of boarder color" + * + * From the Ivy Bridge PRM, volume 4 part 1, page 100: + * + * "If this field is set to TEXCOORDMODE_CLAMP_BORDER for 3D maps on + * formats without an alpha channel, samples straddling the map in the + * Z direction may have their alpha channels off by 1." + * + * Do we want to do something here? + */ + const enum gen_texcoord_mode tcz_ctrl = + (info->tcz_ctrl == GEN6_TEXCOORDMODE_CUBE) ? + GEN6_TEXCOORDMODE_CLAMP : info->tcz_ctrl; + + ILO_DEV_ASSERT(dev, 6, 8); + + return get_gen6_addr_controls(dev, tcx_ctrl, tcy_ctrl, tcz_ctrl); +} + +static uint32_t +sampler_get_gen6_cube_addr_controls(const struct ilo_dev *dev, + const struct ilo_state_sampler_info *info) +{ + /* + * From the Ivy Bridge PRM, volume 4 part 1, page 99: + * + * "When using cube map texture coordinates, only TEXCOORDMODE_CLAMP + * and TEXCOORDMODE_CUBE settings are valid, and each TC component + * must have the same Address Control mode. + * + * When TEXCOORDMODE_CUBE is not used accessing a cube map, the map's + * Cube Face Enable field must be programmed to 111111b (all faces + * enabled)." + * + * From the Haswell PRM, volume 2d, page 278: + * + * "When using cube map texture coordinates, each TC component must + * have the same Address Control Mode. 
+ * + * When TEXCOORDMODE_CUBE is not used accessing a cube map, the map's + * Cube Face Enable field must be programmed to 111111b (all faces + * enabled)." + * + * We always enable all cube faces and only need to make sure all address + * control modes are the same. + */ + const enum gen_texcoord_mode tcx_ctrl = + (ilo_dev_gen(dev) >= ILO_GEN(7.5) || + info->tcx_ctrl == GEN6_TEXCOORDMODE_CUBE || + info->tcx_ctrl == GEN6_TEXCOORDMODE_CLAMP) ? + info->tcx_ctrl : GEN6_TEXCOORDMODE_CLAMP; + const enum gen_texcoord_mode tcy_ctrl = tcx_ctrl; + const enum gen_texcoord_mode tcz_ctrl = tcx_ctrl; + + ILO_DEV_ASSERT(dev, 6, 8); + + return get_gen6_addr_controls(dev, tcx_ctrl, tcy_ctrl, tcz_ctrl); +} + +static uint16_t +get_gen6_lod_bias(const struct ilo_dev *dev, float bias) +{ + /* [-16.0, 16.0) in S4.6 or S4.8 */ + const int fbits = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 8 : 6; + const float max = 16.0f; + const float scale = (float) (1 << fbits); + const int mask = (1 << (1 + 4 + fbits)) - 1; + const int scaled_max = (16 << fbits) - 1; + int scaled; + + ILO_DEV_ASSERT(dev, 6, 8); + + if (bias > max) + bias = max; + else if (bias < -max) + bias = -max; + + scaled = (int) (bias * scale); + if (scaled > scaled_max) + scaled = scaled_max; + + return (scaled & mask); +} + +static uint16_t +get_gen6_lod_clamp(const struct ilo_dev *dev, float clamp) +{ + /* [0.0, 13.0] in U4.6 or [0.0, 14.0] in U4.8 */ + const int fbits = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 8 : 6; + const float max = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 14.0f : 13.0f; + const float scale = (float) (1 << fbits); + + ILO_DEV_ASSERT(dev, 6, 8); + + if (clamp > max) + clamp = max; + else if (clamp < 0.0f) + clamp = 0.0f; + + return (int) (clamp * scale); +} + +static bool +sampler_set_gen6_SAMPLER_STATE(struct ilo_state_sampler *sampler, + const struct ilo_dev *dev, + const struct ilo_state_sampler_info *info) +{ + uint16_t lod_bias, max_lod, min_lod; + uint32_t dw0, dw1, dw3; + + ILO_DEV_ASSERT(dev, 6, 8); + + if (!sampler_validate_gen6_sampler(dev, info)) + return false; + + /* + * From the Ivy Bridge PRM, volume 4 part 1, page 15: + * + * "The per-pixel LOD is computed in an implementation-dependent manner + * and approximates the log2 of the texel/pixel ratio at the given + * pixel. The computation is typically based on the differential + * texel-space distances associated with a one-pixel differential + * distance along the screen x- and y-axes. These texel-space + * distances are computed by evaluating neighboring pixel texture + * coordinates, these coordinates being in units of texels on the base + * MIP level (multiplied by the corresponding surface size in + * texels)." + * + * Judging from the LOD computation pseudocode on page 16-18, the "base MIP + * level" should be given by SurfMinLod. To summarize, for the "sample" + * message, + * + * 1) LOD is set to log2(texel/pixel ratio). The number of texels is + * measured against level SurfMinLod. + * 2) Bias is added to LOD. + * 3) if pre-clamp is enabled, LOD is clamped to [MinLod, MaxLod] first + * 4) LOD is compared with Base to determine whether magnification or + * minification is needed. + * 5) If magnification is needed, or no mipmapping is requested, LOD is + * set to floor(MinLod). + * 6) LOD is clamped to [0, MIPCnt], and SurfMinLod is added to LOD. + * + * As an example, we could set SurfMinLod to GL_TEXTURE_BASE_LEVEL and Base + * to 0 to match GL. But GL expects LOD to be set to 0, instead of + * floor(MinLod), in 5). 
Since this is only an issue when MinLod is + * greater than or equal to one, and, with Base being 0, a non-zero MinLod + * implies minification, we only need to deal with the case when mipmapping + * is disabled. We can thus do: + * + * if (MipFilter == MIPFILTER_NONE && MinLod) { + * MinLod = 0; + * MagFilter = MinFilter; + * } + */ + + lod_bias = get_gen6_lod_bias(dev, info->lod_bias); + min_lod = get_gen6_lod_clamp(dev, info->min_lod); + max_lod = get_gen6_lod_clamp(dev, info->max_lod); + + dw0 = GEN6_SAMPLER_DW0_LOD_PRECLAMP_ENABLE | + 0 << GEN6_SAMPLER_DW0_BASE_LOD__SHIFT | + info->mip_filter << GEN6_SAMPLER_DW0_MIP_FILTER__SHIFT | + info->mag_filter << GEN6_SAMPLER_DW0_MAG_FILTER__SHIFT | + info->min_filter << GEN6_SAMPLER_DW0_MIN_FILTER__SHIFT; + + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + dw0 |= GEN7_SAMPLER_DW0_BORDER_COLOR_MODE_DX10_OGL | + lod_bias << GEN7_SAMPLER_DW0_LOD_BIAS__SHIFT; + + if (info->min_filter == GEN6_MAPFILTER_ANISOTROPIC || + info->mag_filter == GEN6_MAPFILTER_ANISOTROPIC) + dw0 |= GEN7_SAMPLER_DW0_ANISO_ALGO_EWA; + } else { + dw0 |= lod_bias << GEN6_SAMPLER_DW0_LOD_BIAS__SHIFT | + info->shadow_func << GEN6_SAMPLER_DW0_SHADOW_FUNC__SHIFT; + + /* + * From the Sandy Bridge PRM, volume 4 part 1, page 102: + * + * "(Min and Mag State Not Equal) Must be set to 1 if any of the + * following are true: + * + * - Mag Mode Filter and Min Mode Filter are not the same + * - Address Rounding Enable: U address mag filter and U address + * min filter are not the same + * - Address Rounding Enable: V address mag filter and V address + * min filter are not the same + * - Address Rounding Enable: R address mag filter and R address + * min filter are not the same" + * + * We set address rounding for U, V, and R uniformly. Only need to + * check the filters. 
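+    *
+    * E.g., min_filter = MAPFILTER_LINEAR with mag_filter = MAPFILTER_NEAREST
+    * sets the bit through the filter check alone, since U/V/R address
+    * rounding is always programmed uniformly here.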
+ */ + if (info->min_filter != info->mag_filter) + dw0 |= GEN6_SAMPLER_DW0_MIN_MAG_NOT_EQUAL; + } + + dw1 = 0; + + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + /* + * From the Ivy Bridge PRM, volume 4 part 1, page 96: + * + * "This field (Cube Surface Control Mode) must be set to + * CUBECTRLMODE_PROGRAMMED" + */ + dw1 |= min_lod << GEN7_SAMPLER_DW1_MIN_LOD__SHIFT | + max_lod << GEN7_SAMPLER_DW1_MAX_LOD__SHIFT | + info->shadow_func << GEN7_SAMPLER_DW1_SHADOW_FUNC__SHIFT | + GEN7_SAMPLER_DW1_CUBECTRLMODE_PROGRAMMED; + } else { + dw1 |= min_lod << GEN6_SAMPLER_DW1_MIN_LOD__SHIFT | + max_lod << GEN6_SAMPLER_DW1_MAX_LOD__SHIFT | + GEN6_SAMPLER_DW1_CUBECTRLMODE_PROGRAMMED | + info->tcx_ctrl << GEN6_SAMPLER_DW1_U_WRAP__SHIFT | + info->tcy_ctrl << GEN6_SAMPLER_DW1_V_WRAP__SHIFT | + info->tcz_ctrl << GEN6_SAMPLER_DW1_R_WRAP__SHIFT; + } + + dw3 = info->max_anisotropy << GEN6_SAMPLER_DW3_MAX_ANISO__SHIFT; + + /* round the coordinates for linear filtering */ + if (info->min_filter != GEN6_MAPFILTER_NEAREST) { + dw3 |= GEN6_SAMPLER_DW3_U_MIN_ROUND | + GEN6_SAMPLER_DW3_V_MIN_ROUND | + GEN6_SAMPLER_DW3_R_MIN_ROUND; + } + if (info->mag_filter != GEN6_MAPFILTER_NEAREST) { + dw3 |= GEN6_SAMPLER_DW3_U_MAG_ROUND | + GEN6_SAMPLER_DW3_V_MAG_ROUND | + GEN6_SAMPLER_DW3_R_MAG_ROUND; + } + + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + dw3 |= GEN7_SAMPLER_DW3_TRIQUAL_FULL | + info->tcx_ctrl << GEN7_SAMPLER_DW3_U_WRAP__SHIFT | + info->tcy_ctrl << GEN7_SAMPLER_DW3_V_WRAP__SHIFT | + info->tcz_ctrl << GEN7_SAMPLER_DW3_R_WRAP__SHIFT; + + if (info->non_normalized) + dw3 |= GEN7_SAMPLER_DW3_NON_NORMALIZED_COORD; + } else { + if (info->non_normalized) + dw3 |= GEN6_SAMPLER_DW3_NON_NORMALIZED_COORD; + } + + STATIC_ASSERT(ARRAY_SIZE(sampler->sampler) >= 3); + sampler->sampler[0] = dw0; + sampler->sampler[1] = dw1; + sampler->sampler[2] = dw3; + + sampler->filter_integer = sampler_get_gen6_integer_filters(dev, info); + sampler->filter_3d = sampler_get_gen6_3d_filters(dev, info); + sampler->addr_ctrl_1d = sampler_get_gen6_1d_addr_controls(dev, info); + sampler->addr_ctrl_2d_3d = sampler_get_gen6_2d_3d_addr_controls(dev, info); + sampler->addr_ctrl_cube = sampler_get_gen6_cube_addr_controls(dev, info); + + sampler->non_normalized = info->non_normalized; + + /* + * From the Sandy Bridge PRM, volume 4 part 1, page 21: + * + * "[DevSNB] Errata: Incorrect behavior is observed in cases where the + * min and mag mode filters are different and SurfMinLOD is nonzero. + * The determination of MagMode uses the following equation instead of + * the one in the above pseudocode: + * + * MagMode = (LOD + SurfMinLOD - Base <= 0)" + * + * As a way to work around that, request Base to be set to SurfMinLod. + */ + if (ilo_dev_gen(dev) == ILO_GEN(6) && + info->min_filter != info->mag_filter) + sampler->base_to_surf_min_lod = true; + + return true; +} + +static bool +sampler_border_set_gen6_SAMPLER_BORDER_COLOR_STATE(struct ilo_state_sampler_border *border, + const struct ilo_dev *dev, + const struct ilo_state_sampler_border_info *info) +{ + uint32_t dw[12]; + float rgba[4]; + + /* + * From the Ivy Bridge PRM, volume 4 part 1, page 117: + * + * "For ([DevSNB]), if border color is used, all formats must be + * provided. Hardware will choose the appropriate format based on + * Surface Format and Texture Border Color Mode. The values + * represented by each format should be the same (other than being + * subject to range-based clamping and precision) to avoid unexpected + * behavior." + * + * XXX We do not honor info->is_integer yet. 
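+    *
+    * For example, rgba = { 1.5, -0.5, 0.25, 1.0 } is stored unmodified in
+    * the IEEE_FP dwords, clamps to [-1, 1] for SNORM16 as { 32767, -16384,
+    * 8192, 32767 }, and clamps further to [0, 1] for UNORM8 as
+    * { 255, 0, 64, 255 }.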
+ */ + + ILO_DEV_ASSERT(dev, 6, 6); + + /* make a copy so that we can clamp for SNORM and UNORM */ + memcpy(rgba, info->rgba.f, sizeof(rgba)); + + /* IEEE_FP */ + dw[1] = fui(rgba[0]); + dw[2] = fui(rgba[1]); + dw[3] = fui(rgba[2]); + dw[4] = fui(rgba[3]); + + /* FLOAT_16 */ + dw[5] = util_float_to_half(rgba[0]) | + util_float_to_half(rgba[1]) << 16; + dw[6] = util_float_to_half(rgba[2]) | + util_float_to_half(rgba[3]) << 16; + + /* clamp to [-1.0f, 1.0f] */ + rgba[0] = CLAMP(rgba[0], -1.0f, 1.0f); + rgba[1] = CLAMP(rgba[1], -1.0f, 1.0f); + rgba[2] = CLAMP(rgba[2], -1.0f, 1.0f); + rgba[3] = CLAMP(rgba[3], -1.0f, 1.0f); + + /* SNORM16 */ + dw[9] = (int16_t) util_iround(rgba[0] * 32767.0f) | + (int16_t) util_iround(rgba[1] * 32767.0f) << 16; + dw[10] = (int16_t) util_iround(rgba[2] * 32767.0f) | + (int16_t) util_iround(rgba[3] * 32767.0f) << 16; + + /* SNORM8 */ + dw[11] = (int8_t) util_iround(rgba[0] * 127.0f) | + (int8_t) util_iround(rgba[1] * 127.0f) << 8 | + (int8_t) util_iround(rgba[2] * 127.0f) << 16 | + (int8_t) util_iround(rgba[3] * 127.0f) << 24; + + /* clamp to [0.0f, 1.0f] */ + rgba[0] = CLAMP(rgba[0], 0.0f, 1.0f); + rgba[1] = CLAMP(rgba[1], 0.0f, 1.0f); + rgba[2] = CLAMP(rgba[2], 0.0f, 1.0f); + rgba[3] = CLAMP(rgba[3], 0.0f, 1.0f); + + /* UNORM8 */ + dw[0] = (uint8_t) util_iround(rgba[0] * 255.0f) | + (uint8_t) util_iround(rgba[1] * 255.0f) << 8 | + (uint8_t) util_iround(rgba[2] * 255.0f) << 16 | + (uint8_t) util_iround(rgba[3] * 255.0f) << 24; + + /* UNORM16 */ + dw[7] = (uint16_t) util_iround(rgba[0] * 65535.0f) | + (uint16_t) util_iround(rgba[1] * 65535.0f) << 16; + dw[8] = (uint16_t) util_iround(rgba[2] * 65535.0f) | + (uint16_t) util_iround(rgba[3] * 65535.0f) << 16; + + STATIC_ASSERT(ARRAY_SIZE(border->color) >= 12); + memcpy(border->color, dw, sizeof(dw)); + + return true; +} + +static bool +sampler_border_set_gen7_SAMPLER_BORDER_COLOR_STATE(struct ilo_state_sampler_border *border, + const struct ilo_dev *dev, + const struct ilo_state_sampler_border_info *info) +{ + ILO_DEV_ASSERT(dev, 7, 8); + + /* + * From the Ivy Bridge PRM, volume 4 part 1, page 116: + * + * "In DX10/OGL mode, the format of the border color is + * R32G32B32A32_FLOAT, regardless of the surface format chosen." + * + * From the Haswell PRM, volume 2d, page 240: + * + * "So, SW will have to program the table in SAMPLER_BORDER_COLOR_STATE + * at offsets DWORD16 to 19, as per the integer surface format type." + * + * From the Broadwell PRM, volume 2d, page 297: + * + * "DX10/OGL mode: the format of the border color depends on the format + * of the surface being sampled. If the map format is UINT, then the + * border color format is R32G32B32A32_UINT. If the map format is + * SINT, then the border color format is R32G32B32A32_SINT. Otherwise, + * the border color format is R32G32B32A32_FLOAT." 
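+    *
+    * Only the R32G32B32A32 form is stored below; because info->rgba is a
+    * raw union, the same four dwords serve the FLOAT, UINT, and SINT
+    * interpretations quoted above.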
+    *
+    * XXX every Gen is different
+    */
+
+   STATIC_ASSERT(ARRAY_SIZE(border->color) >= 4);
+   memcpy(border->color, info->rgba.f, sizeof(info->rgba.f));
+
+   return true;
+}
+
+bool
+ilo_state_sampler_init(struct ilo_state_sampler *sampler,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_sampler_info *info)
+{
+   bool ret = true;
+
+   assert(ilo_is_zeroed(sampler, sizeof(*sampler)));
+
+   ret &= sampler_set_gen6_SAMPLER_STATE(sampler, dev, info);
+
+   assert(ret);
+
+   return ret;
+}
+
+bool
+ilo_state_sampler_init_disabled(struct ilo_state_sampler *sampler,
+                                const struct ilo_dev *dev)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   assert(ilo_is_zeroed(sampler, sizeof(*sampler)));
+
+   sampler->sampler[0] = GEN6_SAMPLER_DW0_DISABLE;
+   sampler->sampler[1] = 0;
+   sampler->sampler[2] = 0;
+
+   return true;
+}
+
+/**
+ * Modify \p sampler to work with \p surf.  There will be loss of
+ * information.  Callers should make a copy of the original sampler first.
+ */
+bool
+ilo_state_sampler_set_surface(struct ilo_state_sampler *sampler,
+                              const struct ilo_dev *dev,
+                              const struct ilo_state_surface *surf)
+{
+   uint32_t addr_ctrl;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (sampler->non_normalized) {
+      /* see sampler_validate_gen6_non_normalized() */
+      assert(surf->type == GEN6_SURFTYPE_2D ||
+             surf->type == GEN6_SURFTYPE_3D);
+      assert(!surf->min_lod && !surf->mip_count);
+   }
+
+   if (sampler->base_to_surf_min_lod) {
+      const uint8_t base = surf->min_lod << GEN6_SAMPLER_DW0_BASE_LOD__RADIX;
+
+      sampler->sampler[0] =
+         (sampler->sampler[0] & ~GEN6_SAMPLER_DW0_BASE_LOD__MASK) |
+         base << GEN6_SAMPLER_DW0_BASE_LOD__SHIFT;
+   }
+
+   if (surf->is_integer || surf->type == GEN6_SURFTYPE_3D) {
+      const uint32_t mask = (GEN6_SAMPLER_DW0_MIP_FILTER__MASK |
+                             GEN6_SAMPLER_DW0_MIN_FILTER__MASK |
+                             GEN6_SAMPLER_DW0_MAG_FILTER__MASK);
+      const uint32_t filter = (surf->is_integer) ?
+ sampler->filter_integer : sampler->filter_3d; + + assert((filter & mask) == filter); + sampler->sampler[0] = (sampler->sampler[0] & ~mask) | + filter; + } + + switch (surf->type) { + case GEN6_SURFTYPE_1D: + addr_ctrl = sampler->addr_ctrl_1d; + break; + case GEN6_SURFTYPE_2D: + case GEN6_SURFTYPE_3D: + addr_ctrl = sampler->addr_ctrl_2d_3d; + break; + case GEN6_SURFTYPE_CUBE: + addr_ctrl = sampler->addr_ctrl_cube; + break; + default: + assert(!"unexpected surface type"); + addr_ctrl = 0; + break; + } + + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + const uint32_t mask = (GEN7_SAMPLER_DW3_U_WRAP__MASK | + GEN7_SAMPLER_DW3_V_WRAP__MASK | + GEN7_SAMPLER_DW3_R_WRAP__MASK); + + assert((addr_ctrl & mask) == addr_ctrl); + sampler->sampler[2] = (sampler->sampler[2] & ~mask) | + addr_ctrl; + } else { + const uint32_t mask = (GEN6_SAMPLER_DW1_U_WRAP__MASK | + GEN6_SAMPLER_DW1_V_WRAP__MASK | + GEN6_SAMPLER_DW1_R_WRAP__MASK); + + assert((addr_ctrl & mask) == addr_ctrl); + sampler->sampler[1] = (sampler->sampler[1] & ~mask) | + addr_ctrl; + } + + return true; +} + +bool +ilo_state_sampler_border_init(struct ilo_state_sampler_border *border, + const struct ilo_dev *dev, + const struct ilo_state_sampler_border_info *info) +{ + bool ret = true; + + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + ret &= sampler_border_set_gen7_SAMPLER_BORDER_COLOR_STATE(border, + dev, info); + } else { + ret &= sampler_border_set_gen6_SAMPLER_BORDER_COLOR_STATE(border, + dev, info); + } + + assert(ret); + + return ret; +} diff --git a/src/gallium/drivers/ilo/core/ilo_state_sampler.h b/src/gallium/drivers/ilo/core/ilo_state_sampler.h new file mode 100644 index 00000000000..75c7620a678 --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_sampler.h @@ -0,0 +1,103 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2015 LunarG, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#ifndef ILO_STATE_SAMPLER_H +#define ILO_STATE_SAMPLER_H + +#include "genhw/genhw.h" + +#include "ilo_core.h" +#include "ilo_dev.h" + +struct ilo_state_surface; + +struct ilo_state_sampler_info { + bool non_normalized; + + float lod_bias; + float min_lod; + float max_lod; + + enum gen_mip_filter mip_filter; + enum gen_map_filter min_filter; + enum gen_map_filter mag_filter; + enum gen_aniso_ratio max_anisotropy; + + enum gen_texcoord_mode tcx_ctrl; + enum gen_texcoord_mode tcy_ctrl; + enum gen_texcoord_mode tcz_ctrl; + + enum gen_prefilter_op shadow_func; +}; + +struct ilo_state_sampler_border_info { + union { + float f[4]; + uint32_t ui[4]; + } rgba; + + bool is_integer; +}; + +struct ilo_state_sampler { + uint32_t sampler[3]; + + uint32_t filter_integer; + uint32_t filter_3d; + + uint32_t addr_ctrl_1d; + uint32_t addr_ctrl_2d_3d; + uint32_t addr_ctrl_cube; + + bool non_normalized; + bool base_to_surf_min_lod; +}; + +struct ilo_state_sampler_border { + uint32_t color[12]; +}; + +bool +ilo_state_sampler_init(struct ilo_state_sampler *sampler, + const struct ilo_dev *dev, + const struct ilo_state_sampler_info *info); + +bool +ilo_state_sampler_init_disabled(struct ilo_state_sampler *sampler, + const struct ilo_dev *dev); + +bool +ilo_state_sampler_set_surface(struct ilo_state_sampler *sampler, + const struct ilo_dev *dev, + const struct ilo_state_surface *surf); + +bool +ilo_state_sampler_border_init(struct ilo_state_sampler_border *border, + const struct ilo_dev *dev, + const struct ilo_state_sampler_border_info *info); + +#endif /* ILO_STATE_SAMPLER_H */ diff --git a/src/gallium/drivers/ilo/core/ilo_state_sbe.c b/src/gallium/drivers/ilo/core/ilo_state_sbe.c new file mode 100644 index 00000000000..5d1d400acdd --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_sbe.c @@ -0,0 +1,350 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2012-2015 LunarG, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#include "ilo_debug.h" +#include "ilo_state_sbe.h" + +static bool +sbe_validate_gen8(const struct ilo_dev *dev, + const struct ilo_state_sbe_info *info) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + assert(info->attr_count <= ILO_STATE_SBE_MAX_ATTR_COUNT); + + assert(info->vue_read_base + info->vue_read_count <= + info->cv_vue_attr_count); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 248: + * + * "(Vertex URB Entry Read Length) + * Format: U5 + * Range [1,16] + * + * Specifies the amount of URB data read for each Vertex URB entry, in + * 256-bit register increments. + * + * Programming Notes + * It is UNDEFINED to set this field to 0 indicating no Vertex URB + * data to be read." + * + * "(Vertex URB Entry Read Offset) + * Format: U6 + * Range [0,63] + * + * Specifies the offset (in 256-bit units) at which Vertex URB data is + * to be read from the URB." + */ + assert(info->vue_read_base % 2 == 0 && info->vue_read_base <= 126); + assert(info->vue_read_count <= 32); + + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 268: + * + * "This field (Point Sprite Texture Coordinate Enable) must be + * programmed to 0 when non-point primitives are rendered." + */ + if (ilo_dev_gen(dev) < ILO_GEN(7.5) && info->point_sprite_enables) + assert(info->cv_is_point); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 246: + * + * "(Number of SF Output Attributes) 33-48: Specifies 17-32 attributes + * (# attributes = field value - 16). Swizzling performed on + * Attributes 16-31 (as required) only. Attributes 0-15 passed through + * unmodified. + * + * Note : + * + * Attribute n Component Override and Constant Source states apply to + * Attributes 16-31 (as required) instead of Attributes 0-15. E.g., + * this allows an Attribute 16-31 component to be overridden with the + * PrimitiveID value. + * + * Attribute n WrapShortest Enables still apply to Attributes 0-15. + * + * Attribute n Swizzle Select and Attribute n Source Attribute states + * are ignored and none of the swizzling functions available through + * these controls are performed." + * + * From the Sandy Bridge PRM, volume 2 part 1, page 247: + * + * "This bit (Attribute Swizzle Enable) controls the use of the + * Attribute n Swizzle Select and Attribute n Source Attribute fields + * only. If ENABLED, those fields are used as described below. If + * DISABLED, attributes are copied from their corresponding source + * attributes, for the purposes of Swizzle Select only. + * + * Note that the following fields are unaffected by this bit, and are + * therefore always used to control their respective fields: + * Attribute n Component Override X/Y/Z/W + * Attribute n Constant Source + * Attribute n WrapShortest Enables" + * + * From the Ivy Bridge PRM, volume 2 part 1, page 264: + * + * "When Attribute Swizzle Enable is ENABLED, this bit (Attribute + * Swizzle Control Mode) controls whether attributes 0-15 or 16-31 are + * subject to the following swizzle controls: + * + * - Attribute n Component Override X/Y/Z/W + * - Attribute n Constant Source + * - Attribute n Swizzle Select + * - Attribute n Source Attribute + * - Attribute n Wrap Shortest Enables" + * + * "SWIZ_16_31... Only valid when 16 or more attributes are output." 
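+    *
+    * E.g., attr_count = 20 with both swizzle_enable and swizzle_16_31 set
+    * means swizzles[0..3] control attributes 16..19, while attributes 0..15
+    * pass through unmodified.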
+ */ + assert(info->swizzle_count <= ILO_STATE_SBE_MAX_SWIZZLE_COUNT); + if (info->swizzle_16_31) { + assert(ilo_dev_gen(dev) >= ILO_GEN(7) && + info->swizzle_enable && + info->attr_count > 16); + } + + return true; +} + +static uint8_t +sbe_get_gen8_min_read_count(const struct ilo_dev *dev, + const struct ilo_state_sbe_info *info) +{ + uint8_t min_count = 0; + + ILO_DEV_ASSERT(dev, 6, 8); + + /* minimum read count for non-swizzled attributes */ + if (!info->swizzle_enable || info->swizzle_count < info->attr_count) { + if (info->swizzle_16_31 && info->swizzle_count + 16 == info->attr_count) + min_count = 16; + else + min_count = info->attr_count; + } + + if (info->swizzle_enable) { + uint8_t i; + + for (i = 0; i < info->swizzle_count; i++) { + const struct ilo_state_sbe_swizzle_info *swizzle = + &info->swizzles[i]; + bool inputattr_facing; + + switch (swizzle->attr_select) { + case GEN6_INPUTATTR_FACING: + case GEN6_INPUTATTR_FACING_W: + inputattr_facing = true; + break; + default: + inputattr_facing = false; + break; + } + + if (min_count < swizzle->attr + inputattr_facing + 1) + min_count = swizzle->attr + inputattr_facing + 1; + } + } + + return min_count; +} + +static uint8_t +sbe_get_gen8_read_length(const struct ilo_dev *dev, + const struct ilo_state_sbe_info *info) +{ + uint8_t read_len; + + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 248: + * + * "(Vertex URB Entry Read Length) + * This field should be set to the minimum length required to read the + * maximum source attribute. The maximum source attribute is indicated + * by the maximum value of the enabled Attribute # Source Attribute if + * Attribute Swizzle Enable is set, Number of Output Attributes -1 if + * enable is not set. + * read_length = ceiling((max_source_attr+1)/2) + * + * [errata] Corruption/Hang possible if length programmed larger than + * recommended" + */ + if (info->has_min_read_count) { + read_len = info->vue_read_count; + assert(read_len == sbe_get_gen8_min_read_count(dev, info)); + } else { + read_len = sbe_get_gen8_min_read_count(dev, info); + assert(read_len <= info->vue_read_count); + } + + /* + * In pairs. URB entries are aligned to 1024-bits or 512-bits. There is + * no need to worry about reading past entries. + */ + read_len = (read_len + 1) / 2; + if (!read_len) + read_len = 1; + + return read_len; +} + +static bool +sbe_set_gen8_3DSTATE_SBE(struct ilo_state_sbe *sbe, + const struct ilo_dev *dev, + const struct ilo_state_sbe_info *info) +{ + uint8_t vue_read_offset, vue_read_len; + uint8_t attr_count; + uint32_t dw1, dw2, dw3; + + ILO_DEV_ASSERT(dev, 6, 8); + + if (!sbe_validate_gen8(dev, info)) + return false; + + vue_read_offset = info->vue_read_base / 2; + vue_read_len = sbe_get_gen8_read_length(dev, info); + + attr_count = info->attr_count; + if (ilo_dev_gen(dev) == ILO_GEN(6) && info->swizzle_16_31) + attr_count += 16; + + dw1 = attr_count << GEN7_SBE_DW1_ATTR_COUNT__SHIFT | + vue_read_len << GEN7_SBE_DW1_URB_READ_LEN__SHIFT; + + if (ilo_dev_gen(dev) >= ILO_GEN(8)) { + dw1 |= GEN8_SBE_DW1_USE_URB_READ_LEN | + GEN8_SBE_DW1_USE_URB_READ_OFFSET | + vue_read_offset << GEN8_SBE_DW1_URB_READ_OFFSET__SHIFT; + } else { + dw1 |= vue_read_offset << GEN7_SBE_DW1_URB_READ_OFFSET__SHIFT; + } + + if (ilo_dev_gen(dev) >= ILO_GEN(7) && info->swizzle_16_31) + dw1 |= GEN7_SBE_DW1_ATTR_SWIZZLE_16_31; + + if (info->swizzle_enable) + dw1 |= GEN7_SBE_DW1_ATTR_SWIZZLE_ENABLE; + + dw1 |= (info->point_sprite_origin_lower_left) ? 
+         GEN7_SBE_DW1_POINT_SPRITE_TEXCOORD_LOWERLEFT :
+         GEN7_SBE_DW1_POINT_SPRITE_TEXCOORD_UPPERLEFT;
+
+   dw2 = info->point_sprite_enables;
+   dw3 = info->const_interp_enables;
+
+   STATIC_ASSERT(ARRAY_SIZE(sbe->sbe) >= 3);
+   sbe->sbe[0] = dw1;
+   sbe->sbe[1] = dw2;
+   sbe->sbe[2] = dw3;
+
+   return true;
+}
+
+static bool
+sbe_set_gen8_3DSTATE_SBE_SWIZ(struct ilo_state_sbe *sbe,
+                              const struct ilo_dev *dev,
+                              const struct ilo_state_sbe_info *info)
+{
+   uint16_t swiz[ILO_STATE_SBE_MAX_SWIZZLE_COUNT];
+   uint8_t i;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   for (i = 0; i < info->swizzle_count; i++) {
+      const struct ilo_state_sbe_swizzle_info *swizzle = &info->swizzles[i];
+
+      /* U5 */
+      assert(swizzle->attr < 32);
+      swiz[i] = swizzle->attr_select << GEN8_SBE_SWIZ_SWIZZLE_SELECT__SHIFT |
+                swizzle->attr << GEN8_SBE_SWIZ_SRC_ATTR__SHIFT;
+
+      if (swizzle->force_zeros) {
+         swiz[i] |= GEN8_SBE_SWIZ_OVERRIDE_W |
+                    GEN8_SBE_SWIZ_OVERRIDE_Z |
+                    GEN8_SBE_SWIZ_OVERRIDE_Y |
+                    GEN8_SBE_SWIZ_OVERRIDE_X |
+                    GEN8_SBE_SWIZ_CONST_0000;
+      }
+   }
+
+   for (; i < ARRAY_SIZE(swiz); i++) {
+      swiz[i] = GEN6_INPUTATTR_NORMAL << GEN8_SBE_SWIZ_SWIZZLE_SELECT__SHIFT |
+                i << GEN8_SBE_SWIZ_SRC_ATTR__SHIFT;
+   }
+
+   STATIC_ASSERT(sizeof(sbe->swiz) == sizeof(swiz));
+   memcpy(sbe->swiz, swiz, sizeof(swiz));
+
+   return true;
+}
+
+bool
+ilo_state_sbe_init(struct ilo_state_sbe *sbe,
+                   const struct ilo_dev *dev,
+                   const struct ilo_state_sbe_info *info)
+{
+   assert(ilo_is_zeroed(sbe, sizeof(*sbe)));
+   return ilo_state_sbe_set_info(sbe, dev, info);
+}
+
+bool
+ilo_state_sbe_init_for_rectlist(struct ilo_state_sbe *sbe,
+                                const struct ilo_dev *dev,
+                                uint8_t read_base,
+                                uint8_t read_count)
+{
+   struct ilo_state_sbe_info info;
+
+   memset(&info, 0, sizeof(info));
+   info.attr_count = read_count;
+   info.cv_vue_attr_count = read_base + read_count;
+   info.vue_read_base = read_base;
+   info.vue_read_count = read_count;
+   info.has_min_read_count = true;
+
+   return ilo_state_sbe_set_info(sbe, dev, &info);
+}
+
+bool
+ilo_state_sbe_set_info(struct ilo_state_sbe *sbe,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_sbe_info *info)
+{
+   bool ret = true;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   ret &= sbe_set_gen8_3DSTATE_SBE(sbe, dev, info);
+   ret &= sbe_set_gen8_3DSTATE_SBE_SWIZ(sbe, dev, info);
+
+   assert(ret);
+
+   return ret;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_sbe.h b/src/gallium/drivers/ilo/core/ilo_state_sbe.h
new file mode 100644
index 00000000000..122999a9e94
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_sbe.h
@@ -0,0 +1,103 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#ifndef ILO_STATE_SBE_H +#define ILO_STATE_SBE_H + +#include "genhw/genhw.h" + +#include "ilo_core.h" +#include "ilo_dev.h" + +/* + * From the Sandy Bridge PRM, volume 2 part 1, page 264: + * + * "Number of SF Output Attributes sets the number of attributes that will + * be output from the SF stage, not including position. This can be used + * to specify up to 32, and may differ from the number of input + * attributes." + * + * "The first or last set of 16 attributes can be swizzled according to + * certain state fields." + */ +#define ILO_STATE_SBE_MAX_ATTR_COUNT 32 +#define ILO_STATE_SBE_MAX_SWIZZLE_COUNT 16 + +struct ilo_state_sbe_swizzle_info { + /* select an attribute from read ones */ + enum gen_inputattr_select attr_select; + uint8_t attr; + + bool force_zeros; +}; + +struct ilo_state_sbe_info { + uint8_t attr_count; + + /* which VUE attributes to read */ + uint8_t cv_vue_attr_count; + uint8_t vue_read_base; + uint8_t vue_read_count; + bool has_min_read_count; + + bool cv_is_point; + bool point_sprite_origin_lower_left; + /* force sprite coordinates to the four corner vertices of the point */ + uint32_t point_sprite_enables; + + /* force attr at the provoking vertex to a0 and zero to a1/a2 */ + uint32_t const_interp_enables; + + bool swizzle_enable; + /* swizzle attribute 16 to 31 instead; Gen7+ only */ + bool swizzle_16_31; + uint8_t swizzle_count; + const struct ilo_state_sbe_swizzle_info *swizzles; +}; + +struct ilo_state_sbe { + uint32_t sbe[3]; + uint32_t swiz[8]; +}; + +bool +ilo_state_sbe_init(struct ilo_state_sbe *sbe, + const struct ilo_dev *dev, + const struct ilo_state_sbe_info *info); + +bool +ilo_state_sbe_init_for_rectlist(struct ilo_state_sbe *sbe, + const struct ilo_dev *dev, + uint8_t read_base, + uint8_t read_count); + +bool +ilo_state_sbe_set_info(struct ilo_state_sbe *sbe, + const struct ilo_dev *dev, + const struct ilo_state_sbe_info *info); + +#endif /* ILO_STATE_SBE_H */ diff --git a/src/gallium/drivers/ilo/core/ilo_state_shader.c b/src/gallium/drivers/ilo/core/ilo_state_shader.c new file mode 100644 index 00000000000..f67326c7f10 --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_shader.c @@ -0,0 +1,737 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2012-2015 LunarG, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#include "ilo_debug.h" +#include "ilo_state_shader.h" + +enum vertex_stage { + STAGE_VS, + STAGE_HS, + STAGE_DS, + STAGE_GS, +}; + +struct vertex_ff { + uint8_t grf_start; + uint8_t scratch_space; + + uint8_t sampler_count; + uint8_t surface_count; + bool has_uav; + + uint8_t vue_read_offset; + uint8_t vue_read_len; + + uint8_t user_clip_enables; +}; + +static bool +vertex_validate_gen6_kernel(const struct ilo_dev *dev, + enum vertex_stage stage, + const struct ilo_state_shader_kernel_info *kernel) +{ + /* + * "Dispatch GRF Start Register for URB Data" is U4 for GS and U5 for + * others. + */ + const uint8_t max_grf_start = (stage == STAGE_GS) ? 16 : 32; + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 134: + * + * "(Per-Thread Scratch Space) + * Range [0,11] indicating [1K Bytes, 2M Bytes]" + */ + const uint32_t max_scratch_size = 2 * 1024 * 1024; + + ILO_DEV_ASSERT(dev, 6, 8); + + /* we do not want to save it */ + assert(!kernel->offset); + + assert(kernel->grf_start < max_grf_start); + assert(kernel->scratch_size <= max_scratch_size); + + return true; +} + +static bool +vertex_validate_gen6_urb(const struct ilo_dev *dev, + enum vertex_stage stage, + const struct ilo_state_shader_urb_info *urb) +{ + /* "Vertex/Patch URB Entry Read Offset" is U6, in pairs */ + const uint8_t max_read_base = 63 * 2; + /* + * "Vertex/Patch URB Entry Read Length" is limited to 64 for DS and U6 for + * others, in pairs + */ + const uint8_t max_read_count = ((stage == STAGE_DS) ? 64 : 63) * 2; + + ILO_DEV_ASSERT(dev, 6, 8); + + assert(urb->read_base + urb->read_count <= urb->cv_input_attr_count); + + assert(urb->read_base % 2 == 0 && urb->read_base <= max_read_base); + + /* + * There is no need to worry about reading past entries, as URB entries are + * aligned to 1024-bits (Gen6) or 512-bits (Gen7+). + */ + assert(urb->read_count <= max_read_count); + + return true; +} + +static bool +vertex_get_gen6_ff(const struct ilo_dev *dev, + enum vertex_stage stage, + const struct ilo_state_shader_kernel_info *kernel, + const struct ilo_state_shader_resource_info *resource, + const struct ilo_state_shader_urb_info *urb, + struct vertex_ff *ff) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + if (!vertex_validate_gen6_kernel(dev, stage, kernel) || + !vertex_validate_gen6_urb(dev, stage, urb)) + return false; + + ff->grf_start = kernel->grf_start; + /* next power of two, starting from 1KB */ + ff->scratch_space = (kernel->scratch_size > 1024) ? + (util_last_bit(kernel->scratch_size - 1) - 10): 0; + + ff->sampler_count = (resource->sampler_count <= 12) ? 
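+ /*
+ * Two field encodings meet here; a minimal sketch with hypothetical
+ * values. The scratch-space field above maps [0,11] to [1KB,2MB] in
+ * powers of two, so a 4KB request yields
+ * util_last_bit(4096 - 1) - 10 == 2, i.e. 2^(2+10) bytes per thread.
+ * The sampler-count field computed next is in groups of four (for
+ * prefetching): 5 samplers yield (5 + 3) / 4 == 2, and anything above
+ * 12 clamps to 4.
+ */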
+ (resource->sampler_count + 3) / 4 : 4; + ff->surface_count = resource->surface_count; + ff->has_uav = resource->has_uav; + + ff->vue_read_offset = urb->read_base / 2; + ff->vue_read_len = (urb->read_count + 1) / 2; + + /* need to read something unless VUE handles are included */ + switch (stage) { + case STAGE_VS: + if (!ff->vue_read_len) + ff->vue_read_len = 1; + + /* one GRF per attribute */ + assert(kernel->grf_start + urb->read_count * 2 <= 128); + break; + case STAGE_GS: + if (ilo_dev_gen(dev) == ILO_GEN(6) && !ff->vue_read_len) + ff->vue_read_len = 1; + break; + default: + break; + } + + ff->user_clip_enables = urb->user_clip_enables; + + return true; +} + +static uint16_t +vs_get_gen6_thread_count(const struct ilo_dev *dev, + const struct ilo_state_vs_info *info) +{ + uint16_t thread_count; + + ILO_DEV_ASSERT(dev, 6, 8); + + /* Maximum Number of Threads of 3DSTATE_VS */ + switch (ilo_dev_gen(dev)) { + case ILO_GEN(8): + thread_count = 504; + break; + case ILO_GEN(7.5): + thread_count = (dev->gt >= 2) ? 280 : 70; + break; + case ILO_GEN(7): + case ILO_GEN(6): + default: + thread_count = dev->thread_count; + break; + } + + return thread_count - 1; +} + +static bool +vs_set_gen6_3DSTATE_VS(struct ilo_state_vs *vs, + const struct ilo_dev *dev, + const struct ilo_state_vs_info *info) +{ + struct vertex_ff ff; + uint16_t thread_count; + uint32_t dw2, dw3, dw4, dw5; + + ILO_DEV_ASSERT(dev, 6, 8); + + if (!vertex_get_gen6_ff(dev, STAGE_VS, &info->kernel, + &info->resource, &info->urb, &ff)) + return false; + + thread_count = vs_get_gen6_thread_count(dev, info); + + dw2 = ff.sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT | + ff.surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT; + + if (false) + dw2 |= GEN6_THREADDISP_FP_MODE_ALT; + + if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff.has_uav) + dw2 |= GEN75_THREADDISP_ACCESS_UAV; + + dw3 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT; + + dw4 = ff.grf_start << GEN6_VS_DW4_URB_GRF_START__SHIFT | + ff.vue_read_len << GEN6_VS_DW4_URB_READ_LEN__SHIFT | + ff.vue_read_offset << GEN6_VS_DW4_URB_READ_OFFSET__SHIFT; + + dw5 = 0; + + if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) + dw5 |= thread_count << GEN75_VS_DW5_MAX_THREADS__SHIFT; + else + dw5 |= thread_count << GEN6_VS_DW5_MAX_THREADS__SHIFT; + + if (info->stats_enable) + dw5 |= GEN6_VS_DW5_STATISTICS; + if (info->dispatch_enable) + dw5 |= GEN6_VS_DW5_VS_ENABLE; + + STATIC_ASSERT(ARRAY_SIZE(vs->vs) >= 5); + vs->vs[0] = dw2; + vs->vs[1] = dw3; + vs->vs[2] = dw4; + vs->vs[3] = dw5; + + if (ilo_dev_gen(dev) >= ILO_GEN(8)) + vs->vs[4] = ff.user_clip_enables << GEN8_VS_DW8_UCP_CLIP_ENABLES__SHIFT; + + return true; +} + +static uint16_t +hs_get_gen7_thread_count(const struct ilo_dev *dev, + const struct ilo_state_hs_info *info) +{ + uint16_t thread_count; + + ILO_DEV_ASSERT(dev, 7, 8); + + /* Maximum Number of Threads of 3DSTATE_HS */ + switch (ilo_dev_gen(dev)) { + case ILO_GEN(8): + thread_count = 504; + break; + case ILO_GEN(7.5): + thread_count = (dev->gt >= 2) ? 
256 : 70; + break; + case ILO_GEN(7): + default: + thread_count = dev->thread_count; + break; + } + + return thread_count - 1; +} + +static bool +hs_set_gen7_3DSTATE_HS(struct ilo_state_hs *hs, + const struct ilo_dev *dev, + const struct ilo_state_hs_info *info) +{ + struct vertex_ff ff; + uint16_t thread_count; + uint32_t dw1, dw2, dw4, dw5; + + ILO_DEV_ASSERT(dev, 7, 8); + + if (!vertex_get_gen6_ff(dev, STAGE_HS, &info->kernel, + &info->resource, &info->urb, &ff)) + return false; + + thread_count = hs_get_gen7_thread_count(dev, info); + + dw1 = ff.sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT | + ff.surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT; + + if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) + dw1 |= thread_count << GEN75_HS_DW1_DISPATCH_MAX_THREADS__SHIFT; + else + dw1 |= thread_count << GEN7_HS_DW1_DISPATCH_MAX_THREADS__SHIFT; + + dw2 = 0 << GEN7_HS_DW2_INSTANCE_COUNT__SHIFT; + + if (info->dispatch_enable) + dw2 |= GEN7_HS_DW2_HS_ENABLE; + if (info->stats_enable) + dw2 |= GEN7_HS_DW2_STATISTICS; + + dw4 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT; + + dw5 = GEN7_HS_DW5_INCLUDE_VERTEX_HANDLES | + ff.grf_start << GEN7_HS_DW5_URB_GRF_START__SHIFT | + ff.vue_read_len << GEN7_HS_DW5_URB_READ_LEN__SHIFT | + ff.vue_read_offset << GEN7_HS_DW5_URB_READ_OFFSET__SHIFT; + + if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff.has_uav) + dw5 |= GEN75_HS_DW5_ACCESS_UAV; + + STATIC_ASSERT(ARRAY_SIZE(hs->hs) >= 4); + hs->hs[0] = dw1; + hs->hs[1] = dw2; + hs->hs[2] = dw4; + hs->hs[3] = dw5; + + return true; +} + +static bool +ds_set_gen7_3DSTATE_TE(struct ilo_state_ds *ds, + const struct ilo_dev *dev, + const struct ilo_state_ds_info *info) +{ + uint32_t dw1; + + ILO_DEV_ASSERT(dev, 7, 8); + + dw1 = 0; + + if (info->dispatch_enable) { + dw1 |= GEN7_TE_DW1_MODE_HW | + GEN7_TE_DW1_TE_ENABLE; + } + + STATIC_ASSERT(ARRAY_SIZE(ds->te) >= 3); + ds->te[0] = dw1; + ds->te[1] = fui(63.0f); + ds->te[2] = fui(64.0f); + + return true; +} + +static uint16_t +ds_get_gen7_thread_count(const struct ilo_dev *dev, + const struct ilo_state_ds_info *info) +{ + uint16_t thread_count; + + ILO_DEV_ASSERT(dev, 7, 8); + + /* Maximum Number of Threads of 3DSTATE_DS */ + switch (ilo_dev_gen(dev)) { + case ILO_GEN(8): + thread_count = 504; + break; + case ILO_GEN(7.5): + thread_count = (dev->gt >= 2) ? 
280 : 70; + break; + case ILO_GEN(7): + default: + thread_count = dev->thread_count; + break; + } + + return thread_count - 1; +} + +static bool +ds_set_gen7_3DSTATE_DS(struct ilo_state_ds *ds, + const struct ilo_dev *dev, + const struct ilo_state_ds_info *info) +{ + struct vertex_ff ff; + uint16_t thread_count; + uint32_t dw2, dw3, dw4, dw5; + + ILO_DEV_ASSERT(dev, 7, 8); + + if (!vertex_get_gen6_ff(dev, STAGE_DS, &info->kernel, + &info->resource, &info->urb, &ff)) + return false; + + thread_count = ds_get_gen7_thread_count(dev, info); + + dw2 = ff.sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT | + ff.surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT; + + if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff.has_uav) + dw2 |= GEN75_THREADDISP_ACCESS_UAV; + + dw3 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT; + + dw4 = ff.grf_start << GEN7_DS_DW4_URB_GRF_START__SHIFT | + ff.vue_read_len << GEN7_DS_DW4_URB_READ_LEN__SHIFT | + ff.vue_read_offset << GEN7_DS_DW4_URB_READ_OFFSET__SHIFT; + + dw5 = 0; + + if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) + dw5 |= thread_count << GEN75_DS_DW5_MAX_THREADS__SHIFT; + else + dw5 |= thread_count << GEN7_DS_DW5_MAX_THREADS__SHIFT; + + if (info->stats_enable) + dw5 |= GEN7_DS_DW5_STATISTICS; + if (info->dispatch_enable) + dw5 |= GEN7_DS_DW5_DS_ENABLE; + + STATIC_ASSERT(ARRAY_SIZE(ds->ds) >= 5); + ds->ds[0] = dw2; + ds->ds[1] = dw3; + ds->ds[2] = dw4; + ds->ds[3] = dw5; + + if (ilo_dev_gen(dev) >= ILO_GEN(8)) + ds->ds[4] = ff.user_clip_enables << GEN8_DS_DW8_UCP_CLIP_ENABLES__SHIFT; + + return true; +} + +static bool +gs_get_gen6_ff(const struct ilo_dev *dev, + const struct ilo_state_gs_info *info, + struct vertex_ff *ff) +{ + const struct ilo_state_shader_urb_info *urb = &info->urb; + const struct ilo_state_gs_sol_info *sol = &info->sol; + + ILO_DEV_ASSERT(dev, 6, 8); + + if (!vertex_get_gen6_ff(dev, STAGE_GS, &info->kernel, + &info->resource, &info->urb, ff)) + return false; + + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 168-169: + * + * "[0,62] indicating [1,63] 16B units" + * + * "Programming Restrictions: The vertex size must be programmed as a + * multiple of 32B units with the following exception: Rendering is + * disabled (as per SOL stage state) and the vertex size output by the + * GS thread is 16B. + * + * If rendering is enabled (as per SOL state) the vertex size must be + * programmed as a multiple of 32B units. In other words, the only + * time software can program a vertex size with an odd number of 16B + * units is when rendering is disabled." + */ + assert(urb->output_attr_count <= 63); + if (!sol->render_disable) + assert(urb->output_attr_count % 2 == 0); + + return true; +} + +static uint16_t +gs_get_gen6_thread_count(const struct ilo_dev *dev, + const struct ilo_state_gs_info *info) +{ + const struct ilo_state_gs_sol_info *sol = &info->sol; + uint16_t thread_count; + + ILO_DEV_ASSERT(dev, 6, 8); + + /* Maximum Number of Threads of 3DSTATE_GS */ + switch (ilo_dev_gen(dev)) { + case ILO_GEN(8): + thread_count = 504; + break; + case ILO_GEN(7.5): + thread_count = (dev->gt >= 2) ? 256 : 70; + break; + case ILO_GEN(7): + case ILO_GEN(6): + default: + thread_count = dev->thread_count; + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 154: + * + * "Maximum Number of Threads valid range is [0,27] when Rendering + * Enabled bit is set." + * + * According to the classic driver, [0, 20] for GT1. + */ + if (!sol->render_disable) + thread_count = (dev->gt == 2) ? 
27 : 20; + break; + } + + return thread_count - 1; +} + +static bool +gs_set_gen6_3DSTATE_GS(struct ilo_state_gs *gs, + const struct ilo_dev *dev, + const struct ilo_state_gs_info *info) +{ + const struct ilo_state_gs_sol_info *sol = &info->sol; + struct vertex_ff ff; + uint16_t thread_count; + uint32_t dw2, dw3, dw4, dw5, dw6; + + ILO_DEV_ASSERT(dev, 6, 6); + + if (!gs_get_gen6_ff(dev, info, &ff)) + return false; + + thread_count = gs_get_gen6_thread_count(dev, info); + + dw2 = GEN6_THREADDISP_SPF | + ff.sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT | + ff.surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT; + + dw3 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT; + + dw4 = ff.vue_read_len << GEN6_GS_DW4_URB_READ_LEN__SHIFT | + ff.vue_read_offset << GEN6_GS_DW4_URB_READ_OFFSET__SHIFT | + ff.grf_start << GEN6_GS_DW4_URB_GRF_START__SHIFT; + + dw5 = thread_count << GEN6_GS_DW5_MAX_THREADS__SHIFT; + + if (info->stats_enable) + dw5 |= GEN6_GS_DW5_STATISTICS; + if (sol->stats_enable) + dw5 |= GEN6_GS_DW5_SO_STATISTICS; + if (!sol->render_disable) + dw5 |= GEN6_GS_DW5_RENDER_ENABLE; + + dw6 = 0; + + /* GEN7_REORDER_TRAILING is handled by the kernel */ + if (sol->tristrip_reorder == GEN7_REORDER_LEADING) + dw6 |= GEN6_GS_DW6_REORDER_LEADING_ENABLE; + + if (sol->sol_enable) { + dw6 |= GEN6_GS_DW6_SVBI_PAYLOAD_ENABLE; + + if (sol->svbi_post_inc) { + dw6 |= GEN6_GS_DW6_SVBI_POST_INC_ENABLE | + sol->svbi_post_inc << GEN6_GS_DW6_SVBI_POST_INC_VAL__SHIFT; + } + } + + if (info->dispatch_enable) + dw6 |= GEN6_GS_DW6_GS_ENABLE; + + STATIC_ASSERT(ARRAY_SIZE(gs->gs) >= 5); + gs->gs[0] = dw2; + gs->gs[1] = dw3; + gs->gs[2] = dw4; + gs->gs[3] = dw5; + gs->gs[4] = dw6; + + return true; +} + +static uint8_t +gs_get_gen7_vertex_size(const struct ilo_dev *dev, + const struct ilo_state_gs_info *info) +{ + const struct ilo_state_shader_urb_info *urb = &info->urb; + + ILO_DEV_ASSERT(dev, 7, 8); + + return (urb->output_attr_count) ? 
urb->output_attr_count - 1 : 0; +} + +static bool +gs_set_gen7_3DSTATE_GS(struct ilo_state_gs *gs, + const struct ilo_dev *dev, + const struct ilo_state_gs_info *info) +{ + struct vertex_ff ff; + uint16_t thread_count; + uint8_t vertex_size; + uint32_t dw2, dw3, dw4, dw5; + + ILO_DEV_ASSERT(dev, 7, 8); + + if (!gs_get_gen6_ff(dev, info, &ff)) + return false; + + thread_count = gs_get_gen6_thread_count(dev, info); + vertex_size = gs_get_gen7_vertex_size(dev, info); + + dw2 = ff.sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT | + ff.surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT; + + if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff.has_uav) + dw2 |= GEN75_THREADDISP_ACCESS_UAV; + + dw3 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT; + + dw4 = vertex_size << GEN7_GS_DW4_OUTPUT_SIZE__SHIFT | + 0 << GEN7_GS_DW4_OUTPUT_TOPO__SHIFT | + ff.vue_read_len << GEN7_GS_DW4_URB_READ_LEN__SHIFT | + GEN7_GS_DW4_INCLUDE_VERTEX_HANDLES | + ff.vue_read_offset << GEN7_GS_DW4_URB_READ_OFFSET__SHIFT | + ff.grf_start << GEN7_GS_DW4_URB_GRF_START__SHIFT; + + dw5 = 0; + + if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) + dw5 = thread_count << GEN75_GS_DW5_MAX_THREADS__SHIFT; + else + dw5 = thread_count << GEN7_GS_DW5_MAX_THREADS__SHIFT; + + if (info->stats_enable) + dw5 |= GEN7_GS_DW5_STATISTICS; + if (info->dispatch_enable) + dw5 |= GEN7_GS_DW5_GS_ENABLE; + + STATIC_ASSERT(ARRAY_SIZE(gs->gs) >= 5); + gs->gs[0] = dw2; + gs->gs[1] = dw3; + gs->gs[2] = dw4; + gs->gs[3] = dw5; + + if (ilo_dev_gen(dev) >= ILO_GEN(8)) + gs->gs[4] = ff.user_clip_enables << GEN8_GS_DW9_UCP_CLIP_ENABLES__SHIFT; + + return true; +} + +bool +ilo_state_vs_init(struct ilo_state_vs *vs, + const struct ilo_dev *dev, + const struct ilo_state_vs_info *info) +{ + bool ret = true; + + assert(ilo_is_zeroed(vs, sizeof(*vs))); + + ret &= vs_set_gen6_3DSTATE_VS(vs, dev, info); + + assert(ret); + + return ret; +} + +bool +ilo_state_vs_init_disabled(struct ilo_state_vs *vs, + const struct ilo_dev *dev) +{ + struct ilo_state_vs_info info; + + memset(&info, 0, sizeof(info)); + + return ilo_state_vs_init(vs, dev, &info); +} + +bool +ilo_state_hs_init(struct ilo_state_hs *hs, + const struct ilo_dev *dev, + const struct ilo_state_hs_info *info) +{ + bool ret = true; + + assert(ilo_is_zeroed(hs, sizeof(*hs))); + + if (ilo_dev_gen(dev) >= ILO_GEN(7)) + ret &= hs_set_gen7_3DSTATE_HS(hs, dev, info); + + assert(ret); + + return ret; +} + +bool +ilo_state_hs_init_disabled(struct ilo_state_hs *hs, + const struct ilo_dev *dev) +{ + struct ilo_state_hs_info info; + + memset(&info, 0, sizeof(info)); + + return ilo_state_hs_init(hs, dev, &info); +} + +bool +ilo_state_ds_init(struct ilo_state_ds *ds, + const struct ilo_dev *dev, + const struct ilo_state_ds_info *info) +{ + bool ret = true; + + assert(ilo_is_zeroed(ds, sizeof(*ds))); + + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + ret &= ds_set_gen7_3DSTATE_TE(ds, dev, info); + ret &= ds_set_gen7_3DSTATE_DS(ds, dev, info); + } + + assert(ret); + + return ret; +} + +bool +ilo_state_ds_init_disabled(struct ilo_state_ds *ds, + const struct ilo_dev *dev) +{ + struct ilo_state_ds_info info; + + memset(&info, 0, sizeof(info)); + + return ilo_state_ds_init(ds, dev, &info); +} + +bool +ilo_state_gs_init(struct ilo_state_gs *gs, + const struct ilo_dev *dev, + const struct ilo_state_gs_info *info) +{ + bool ret = true; + + assert(ilo_is_zeroed(gs, sizeof(*gs))); + + if (ilo_dev_gen(dev) >= ILO_GEN(7)) + ret &= gs_set_gen7_3DSTATE_GS(gs, dev, info); + else + ret &= gs_set_gen6_3DSTATE_GS(gs, dev, info); + + 
assert(ret); + + return ret; +} + +bool +ilo_state_gs_init_disabled(struct ilo_state_gs *gs, + const struct ilo_dev *dev) +{ + struct ilo_state_gs_info info; + + memset(&info, 0, sizeof(info)); + + return ilo_state_gs_init(gs, dev, &info); +} diff --git a/src/gallium/drivers/ilo/core/ilo_state_shader.h b/src/gallium/drivers/ilo/core/ilo_state_shader.h new file mode 100644 index 00000000000..44690c5b0bb --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_shader.h @@ -0,0 +1,256 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2015 LunarG, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#ifndef ILO_STATE_SHADER_H +#define ILO_STATE_SHADER_H + +#include "genhw/genhw.h" + +#include "ilo_core.h" +#include "ilo_dev.h" + +/** + * Kernel information. + */ +struct ilo_state_shader_kernel_info { + /* usually 0 unless the shader has multiple kernels */ + uint32_t offset; + + uint8_t grf_start; + uint8_t pcb_attr_count; + + uint32_t scratch_size; +}; + +/** + * Shader resources. + */ +struct ilo_state_shader_resource_info { + /* for prefetches */ + uint8_t sampler_count; + uint8_t surface_count; + + bool has_uav; +}; + +/** + * URB inputs/outputs. + */ +struct ilo_state_shader_urb_info { + uint8_t cv_input_attr_count; + + uint8_t read_base; + uint8_t read_count; + + uint8_t output_attr_count; + + uint8_t user_cull_enables; + uint8_t user_clip_enables; +}; + +struct ilo_state_vs_info { + struct ilo_state_shader_kernel_info kernel; + struct ilo_state_shader_resource_info resource; + struct ilo_state_shader_urb_info urb; + + bool dispatch_enable; + bool stats_enable; +}; + +struct ilo_state_hs_info { + struct ilo_state_shader_kernel_info kernel; + struct ilo_state_shader_resource_info resource; + struct ilo_state_shader_urb_info urb; + + bool dispatch_enable; + bool stats_enable; +}; + +struct ilo_state_ds_info { + struct ilo_state_shader_kernel_info kernel; + struct ilo_state_shader_resource_info resource; + struct ilo_state_shader_urb_info urb; + + bool dispatch_enable; + bool stats_enable; +}; + +/** + * Stream output. Must be consistent with ilo_state_sol_info. 
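+ * (For example, the render_disable and tristrip_reorder fields below
+ * are expected to carry the same values that the corresponding
+ * ilo_state_sol_info passes to 3DSTATE_STREAMOUT.)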
+ */ +struct ilo_state_gs_sol_info { + bool sol_enable; + bool stats_enable; + bool render_disable; + + uint16_t svbi_post_inc; + + enum gen_reorder_mode tristrip_reorder; +}; + +struct ilo_state_gs_info { + struct ilo_state_shader_kernel_info kernel; + struct ilo_state_shader_resource_info resource; + struct ilo_state_shader_urb_info urb; + + struct ilo_state_gs_sol_info sol; + + bool dispatch_enable; + bool stats_enable; +}; + +struct ilo_state_ps_io_info { + /* inputs */ + enum gen_position_offset posoffset; + uint8_t attr_count; + bool use_z; + bool use_w; + bool use_coverage_mask; + + /* outputs */ + enum gen_pscdepth_mode pscdepth; + bool has_rt_write; + bool write_pixel_mask; + bool write_omask; +}; + +struct ilo_state_ps_params_info { + /* compatibility with raster states */ + uint32_t sample_mask; + bool earlyz_control_psexec; + + /* compatibility with cc states */ + bool alpha_may_kill; + bool dual_source_blending; + bool has_writeable_rt; +}; + +struct ilo_state_ps_info { + struct ilo_state_shader_kernel_info kernel_8; + struct ilo_state_shader_kernel_info kernel_16; + struct ilo_state_shader_kernel_info kernel_32; + struct ilo_state_shader_resource_info resource; + + struct ilo_state_ps_io_info io; + struct ilo_state_ps_params_info params; + + /* bitmask of GEN6_PS_DISPATCH_x */ + uint8_t valid_kernels; + bool per_sample_dispatch; + bool sample_count_one; + bool cv_per_sample_interp; + bool cv_has_earlyz_op; + + bool rt_clear_enable; + bool rt_resolve_enable; + + bool cv_has_depth_buffer; +}; + +struct ilo_state_vs { + uint32_t vs[5]; +}; + +struct ilo_state_hs { + uint32_t hs[4]; +}; + +struct ilo_state_ds { + uint32_t te[3]; + uint32_t ds[5]; +}; + +struct ilo_state_gs { + uint32_t gs[5]; +}; + +struct ilo_state_ps { + uint32_t ps[8]; + + struct ilo_state_ps_dispatch_conds { + bool ps_valid; + + bool has_rt_write; + bool write_odepth; + bool write_ostencil; + bool has_uav_write; + bool ps_may_kill; + } conds; +}; + +bool +ilo_state_vs_init(struct ilo_state_vs *vs, + const struct ilo_dev *dev, + const struct ilo_state_vs_info *info); + +bool +ilo_state_vs_init_disabled(struct ilo_state_vs *vs, + const struct ilo_dev *dev); + +bool +ilo_state_hs_init(struct ilo_state_hs *hs, + const struct ilo_dev *dev, + const struct ilo_state_hs_info *info); + +bool +ilo_state_hs_init_disabled(struct ilo_state_hs *hs, + const struct ilo_dev *dev); + + +bool +ilo_state_ds_init(struct ilo_state_ds *ds, + const struct ilo_dev *dev, + const struct ilo_state_ds_info *info); + +bool +ilo_state_ds_init_disabled(struct ilo_state_ds *ds, + const struct ilo_dev *dev); + +bool +ilo_state_gs_init(struct ilo_state_gs *gs, + const struct ilo_dev *dev, + const struct ilo_state_gs_info *info); + +bool +ilo_state_gs_init_disabled(struct ilo_state_gs *gs, + const struct ilo_dev *dev); + +bool +ilo_state_ps_init(struct ilo_state_ps *ps, + const struct ilo_dev *dev, + const struct ilo_state_ps_info *info); + +bool +ilo_state_ps_init_disabled(struct ilo_state_ps *ps, + const struct ilo_dev *dev); + +bool +ilo_state_ps_set_params(struct ilo_state_ps *ps, + const struct ilo_dev *dev, + const struct ilo_state_ps_params_info *params); + +#endif /* ILO_STATE_SHADER_H */ diff --git a/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c b/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c new file mode 100644 index 00000000000..f4d801e9b56 --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c @@ -0,0 +1,771 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2012-2015 LunarG, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#include "ilo_debug.h" +#include "ilo_state_shader.h" + +struct pixel_ff { + uint8_t dispatch_modes; + + uint32_t kernel_offsets[3]; + uint8_t grf_starts[3]; + bool pcb_enable; + uint8_t scratch_space; + + uint8_t sampler_count; + uint8_t surface_count; + bool has_uav; + + uint16_t thread_count; + + struct ilo_state_ps_dispatch_conds conds; + + bool kill_pixel; + bool dispatch_enable; + bool dual_source_blending; + uint32_t sample_mask; +}; + +static bool +ps_kernel_validate_gen6(const struct ilo_dev *dev, + const struct ilo_state_shader_kernel_info *kernel) +{ + /* "Dispatch GRF Start Register for Constant/Setup Data" is U7 */ + const uint8_t max_grf_start = 128; + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 271: + * + * "(Per-Thread Scratch Space) + * Range [0,11] indicating [1k bytes, 2M bytes] in powers of two" + */ + const uint32_t max_scratch_size = 2 * 1024 * 1024; + + ILO_DEV_ASSERT(dev, 6, 8); + + /* "Kernel Start Pointer" is 64-byte aligned */ + assert(kernel->offset % 64 == 0); + + assert(kernel->grf_start < max_grf_start); + assert(kernel->scratch_size <= max_scratch_size); + + return true; +} + +static bool +ps_validate_gen6(const struct ilo_dev *dev, + const struct ilo_state_ps_info *info) +{ + const struct ilo_state_shader_kernel_info *kernel_8 = &info->kernel_8; + const struct ilo_state_shader_kernel_info *kernel_16 = &info->kernel_16; + const struct ilo_state_shader_kernel_info *kernel_32 = &info->kernel_32; + const struct ilo_state_ps_io_info *io = &info->io; + + ILO_DEV_ASSERT(dev, 6, 8); + + if (!ps_kernel_validate_gen6(dev, kernel_8) || + !ps_kernel_validate_gen6(dev, kernel_16) || + !ps_kernel_validate_gen6(dev, kernel_32)) + return false; + + /* unsupported on Gen6 */ + if (ilo_dev_gen(dev) == ILO_GEN(6)) + assert(!io->use_coverage_mask); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 275: + * + * "If a NULL Depth Buffer is selected, the Pixel Shader Computed Depth + * field must be set to disabled." + */ + if (ilo_dev_gen(dev) == ILO_GEN(6) && io->pscdepth != GEN7_PSCDEPTH_OFF) + assert(info->cv_has_depth_buffer); + + if (!info->per_sample_dispatch) { + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 281: + * + * "MSDISPMODE_PERSAMPLE is required in order to select + * POSOFFSET_SAMPLE." 
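+ *
+ * (In other words, a hypothetical pipeline selecting POSOFFSET_SAMPLE
+ * must also set per_sample_dispatch, which is what the assert below
+ * checks.)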
+ */ + assert(io->posoffset != GEN6_POSOFFSET_SAMPLE); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 282: + * + * "MSDISPMODE_PERSAMPLE is required in order to select + * INTERP_SAMPLE." + * + * From the Sandy Bridge PRM, volume 2 part 1, page 283: + * + * "MSDISPMODE_PERSAMPLE is required in order to select Perspective + * Sample or Non-perspective Sample barycentric coordinates." + */ + assert(!info->cv_per_sample_interp); + } + + /* + * + * From the Sandy Bridge PRM, volume 2 part 1, page 314: + * + * "Pixel Shader Dispatch, Alpha... must all be disabled." + * + * Simply disallow any valid kernel when there is early-z op. Also, when + * there is no valid kernel, io should be zeroed. + */ + if (info->valid_kernels) + assert(!info->cv_has_earlyz_op); + else + assert(ilo_is_zeroed(io, sizeof(*io))); + + return true; +} + +static uint8_t +ps_get_gen6_dispatch_modes(const struct ilo_dev *dev, + const struct ilo_state_ps_info *info) +{ + const struct ilo_state_ps_io_info *io = &info->io; + uint8_t dispatch_modes = info->valid_kernels; + + ILO_DEV_ASSERT(dev, 6, 8); + + if (!dispatch_modes) + return 0; + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 334: + * + * "Not valid on [DevSNB] if 4x PERPIXEL mode with pixel shader + * computed depth." + * + * "Valid on all products, except when in non-1x PERSAMPLE mode + * (applies to [DevSNB+] only)" + * + * From the Sandy Bridge PRM, volume 4 part 1, page 239: + * + * "[DevSNB]: When Pixel Shader outputs oDepth and PS invocation mode + * is PERPIXEL, Message Type for Render Target Write must be SIMD8. + * + * Errata: [DevSNB+]: When Pixel Shader outputs oMask, this message + * type is not supported: SIMD8 (including SIMD8_DUALSRC_xx)." + * + * It is really hard to follow what combinations are valid on what + * platforms. Judging from the restrictions on RT write messages on Gen6, + * oDepth and oMask related issues should be Gen6-specific. PERSAMPLE + * issue should be universal, and disallows multiple dispatch modes. + */ + if (ilo_dev_gen(dev) == ILO_GEN(6)) { + if (io->pscdepth != GEN7_PSCDEPTH_OFF && !info->per_sample_dispatch) + dispatch_modes &= GEN6_PS_DISPATCH_8; + if (io->write_omask) + dispatch_modes &= ~GEN6_PS_DISPATCH_8; + } + if (info->per_sample_dispatch && !info->sample_count_one) { + /* prefer 32 over 16 over 8 */ + if (dispatch_modes & GEN6_PS_DISPATCH_32) + dispatch_modes &= GEN6_PS_DISPATCH_32; + else if (dispatch_modes & GEN6_PS_DISPATCH_16) + dispatch_modes &= GEN6_PS_DISPATCH_16; + else + dispatch_modes &= GEN6_PS_DISPATCH_8; + } + + /* + * From the Broadwell PRM, volume 2b, page 149: + * + * "When Render Target Fast Clear Enable is ENABLED or Render Target + * Resolve Type = RESOLVE_PARTIAL or RESOLVE_FULL, this bit (8 Pixel + * Dispatch or Dual-8 Pixel Dispatch Enable) must be DISABLED." + */ + if (info->rt_clear_enable || info->rt_resolve_enable) + dispatch_modes &= ~GEN6_PS_DISPATCH_8; + + assert(dispatch_modes); + + return dispatch_modes; +} + +static uint16_t +ps_get_gen6_thread_count(const struct ilo_dev *dev, + const struct ilo_state_ps_info *info) +{ + uint16_t thread_count; + + ILO_DEV_ASSERT(dev, 6, 8); + + /* Maximum Number of Threads of 3DSTATE_PS */ + switch (ilo_dev_gen(dev)) { + case ILO_GEN(8): + /* scaled automatically */ + thread_count = 64 - 1; + break; + case ILO_GEN(7.5): + thread_count = (dev->gt == 3) ? 408 : + (dev->gt == 2) ? 204 : 102; + break; + case ILO_GEN(7): + thread_count = (dev->gt == 2) ? 
172 : 48; + break; + case ILO_GEN(6): + default: + /* from the classic driver instead of the PRM */ + thread_count = (dev->gt == 2) ? 80 : 40; + break; + } + + return thread_count - 1; +} + +static bool +ps_params_get_gen6_kill_pixel(const struct ilo_dev *dev, + const struct ilo_state_ps_params_info *params, + const struct ilo_state_ps_dispatch_conds *conds) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 275: + * + * "This bit (Pixel Shader Kill Pixel), if ENABLED, indicates that the + * PS kernel or color calculator has the ability to kill (discard) + * pixels or samples, other than due to depth or stencil testing. + * This bit is required to be ENABLED in the following situations: + * + * The API pixel shader program contains "killpix" or "discard" + * instructions, or other code in the pixel shader kernel that can + * cause the final pixel mask to differ from the pixel mask received + * on dispatch. + * + * A sampler with chroma key enabled with kill pixel mode is used by + * the pixel shader. + * + * Any render target has Alpha Test Enable or AlphaToCoverage Enable + * enabled. + * + * The pixel shader kernel generates and outputs oMask. + * + * Note: As ClipDistance clipping is fully supported in hardware and + * therefore not via PS instructions, there should be no need to + * ENABLE this bit due to ClipDistance clipping." + */ + return (conds->ps_may_kill || params->alpha_may_kill); +} + +static bool +ps_params_get_gen6_dispatch_enable(const struct ilo_dev *dev, + const struct ilo_state_ps_params_info *params, + const struct ilo_state_ps_dispatch_conds *conds) +{ + /* + * We want to skip dispatching when EarlyZ suffices. The conditions that + * require dispatching are + * + * - PS writes RTs and RTs are writeable + * - PS changes depth value and depth test/write is enabled + * - PS changes stencil value and stencil test is enabled + * - PS writes UAVs + * - PS or CC kills pixels + * - EDSC is PSEXEC, and depth test/write or stencil test is enabled + */ + bool dispatch_required = + ((conds->has_rt_write && params->has_writeable_rt) || + conds->write_odepth || + conds->write_ostencil || + conds->has_uav_write || + ps_params_get_gen6_kill_pixel(dev, params, conds) || + params->earlyz_control_psexec); + + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 280: + * + * "If EDSC_PSEXEC mode is selected, Thread Dispatch Enable must be + * set." 
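+ *
+ * A hypothetical example: a shader with no RT/UAV/oDepth/oStencil
+ * writes and no pixel kill, running with EDSC_NORMAL, leaves
+ * dispatch_required false above and lets EarlyZ resolve the pixels on
+ * its own.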
+ */ + if (ilo_dev_gen(dev) < ILO_GEN(8) && params->earlyz_control_psexec) + dispatch_required = true; + + /* assert it is valid to dispatch */ + if (dispatch_required) + assert(conds->ps_valid); + + return dispatch_required; +} + +static bool +ps_get_gen6_ff_kernels(const struct ilo_dev *dev, + const struct ilo_state_ps_info *info, + struct pixel_ff *ff) +{ + const struct ilo_state_shader_kernel_info *kernel_8 = &info->kernel_8; + const struct ilo_state_shader_kernel_info *kernel_16 = &info->kernel_16; + const struct ilo_state_shader_kernel_info *kernel_32 = &info->kernel_32; + uint32_t scratch_size; + + ILO_DEV_ASSERT(dev, 6, 8); + + ff->dispatch_modes = ps_get_gen6_dispatch_modes(dev, info); + + /* initialize kernel offsets and GRF starts */ + if (util_is_power_of_two(ff->dispatch_modes)) { + if (ff->dispatch_modes & GEN6_PS_DISPATCH_8) { + ff->kernel_offsets[0] = kernel_8->offset; + ff->grf_starts[0] = kernel_8->grf_start; + } else if (ff->dispatch_modes & GEN6_PS_DISPATCH_16) { + ff->kernel_offsets[0] = kernel_16->offset; + ff->grf_starts[0] = kernel_16->grf_start; + } else if (ff->dispatch_modes & GEN6_PS_DISPATCH_32) { + ff->kernel_offsets[0] = kernel_32->offset; + ff->grf_starts[0] = kernel_32->grf_start; + } + } else { + ff->kernel_offsets[0] = kernel_8->offset; + ff->kernel_offsets[1] = kernel_32->offset; + ff->kernel_offsets[2] = kernel_16->offset; + + ff->grf_starts[0] = kernel_8->grf_start; + ff->grf_starts[1] = kernel_32->grf_start; + ff->grf_starts[2] = kernel_16->grf_start; + } + + /* we do not want to save it */ + assert(ff->kernel_offsets[0] == 0); + + ff->pcb_enable = (((ff->dispatch_modes & GEN6_PS_DISPATCH_8) && + kernel_8->pcb_attr_count) || + ((ff->dispatch_modes & GEN6_PS_DISPATCH_16) && + kernel_16->pcb_attr_count) || + ((ff->dispatch_modes & GEN6_PS_DISPATCH_32) && + kernel_32->pcb_attr_count)); + + scratch_size = 0; + if ((ff->dispatch_modes & GEN6_PS_DISPATCH_8) && + scratch_size < kernel_8->scratch_size) + scratch_size = kernel_8->scratch_size; + if ((ff->dispatch_modes & GEN6_PS_DISPATCH_16) && + scratch_size < kernel_16->scratch_size) + scratch_size = kernel_16->scratch_size; + if ((ff->dispatch_modes & GEN6_PS_DISPATCH_32) && + scratch_size < kernel_32->scratch_size) + scratch_size = kernel_32->scratch_size; + + /* next power of two, starting from 1KB */ + ff->scratch_space = (scratch_size > 1024) ? + (util_last_bit(scratch_size - 1) - 10): 0; + + /* GPU hangs on Haswell if none of the dispatch mode bits is set */ + if (ilo_dev_gen(dev) == ILO_GEN(7.5) && !ff->dispatch_modes) + ff->dispatch_modes |= GEN6_PS_DISPATCH_8; + + return true; +} + +static bool +ps_get_gen6_ff(const struct ilo_dev *dev, + const struct ilo_state_ps_info *info, + struct pixel_ff *ff) +{ + const struct ilo_state_shader_resource_info *resource = &info->resource; + const struct ilo_state_ps_io_info *io = &info->io; + const struct ilo_state_ps_params_info *params = &info->params; + + ILO_DEV_ASSERT(dev, 6, 8); + + memset(ff, 0, sizeof(*ff)); + + if (!ps_validate_gen6(dev, info) || !ps_get_gen6_ff_kernels(dev, info, ff)) + return false; + + ff->sampler_count = (resource->sampler_count <= 12) ? 
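+ /*
+ * A note on the slot ordering above, assuming the usual 3DSTATE_PS
+ * kernel-pointer layout: when more than one dispatch mode is enabled,
+ * kernel slots 0/1/2 hold the SIMD8/SIMD32/SIMD16 kernels respectively,
+ * which is why kernel_offsets[1] takes kernel_32 and kernel_offsets[2]
+ * takes kernel_16.
+ */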
+ (resource->sampler_count + 3) / 4 : 4; + ff->surface_count = resource->surface_count; + ff->has_uav = resource->has_uav; + + ff->thread_count = ps_get_gen6_thread_count(dev, info); + + ff->conds.ps_valid = (info->valid_kernels != 0x0); + ff->conds.has_rt_write = io->has_rt_write; + ff->conds.write_odepth = (io->pscdepth != GEN7_PSCDEPTH_OFF); + ff->conds.write_ostencil = false; + ff->conds.has_uav_write = resource->has_uav; + ff->conds.ps_may_kill = (io->write_pixel_mask || io->write_omask); + + ff->kill_pixel = ps_params_get_gen6_kill_pixel(dev, params, &ff->conds); + ff->dispatch_enable = + ps_params_get_gen6_dispatch_enable(dev, params, &ff->conds); + ff->dual_source_blending = params->dual_source_blending; + ff->sample_mask = params->sample_mask; + + return true; +} + +static bool +ps_set_gen6_3dstate_wm(struct ilo_state_ps *ps, + const struct ilo_dev *dev, + const struct ilo_state_ps_info *info, + const struct pixel_ff *ff) +{ + const struct ilo_state_ps_io_info *io = &info->io; + uint32_t dw2, dw3, dw4, dw5, dw6; + + ILO_DEV_ASSERT(dev, 6, 6); + + dw2 = ff->sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT | + ff->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT; + + if (false) + dw2 |= GEN6_THREADDISP_FP_MODE_ALT; + + dw3 = ff->scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT; + + dw4 = ff->grf_starts[0] << GEN6_WM_DW4_URB_GRF_START0__SHIFT | + ff->grf_starts[1] << GEN6_WM_DW4_URB_GRF_START1__SHIFT | + ff->grf_starts[2] << GEN6_WM_DW4_URB_GRF_START2__SHIFT; + + dw5 = ff->thread_count << GEN6_WM_DW5_MAX_THREADS__SHIFT | + ff->dispatch_modes << GEN6_WM_DW5_PS_DISPATCH_MODE__SHIFT; + + if (ff->kill_pixel) + dw5 |= GEN6_WM_DW5_PS_KILL_PIXEL; + + if (io->pscdepth != GEN7_PSCDEPTH_OFF) + dw5 |= GEN6_WM_DW5_PS_COMPUTE_DEPTH; + if (io->use_z) + dw5 |= GEN6_WM_DW5_PS_USE_DEPTH; + + if (ff->dispatch_enable) + dw5 |= GEN6_WM_DW5_PS_DISPATCH_ENABLE; + + if (io->write_omask) + dw5 |= GEN6_WM_DW5_PS_COMPUTE_OMASK; + if (io->use_w) + dw5 |= GEN6_WM_DW5_PS_USE_W; + + if (ff->dual_source_blending) + dw5 |= GEN6_WM_DW5_PS_DUAL_SOURCE_BLEND; + + dw6 = io->attr_count << GEN6_WM_DW6_SF_ATTR_COUNT__SHIFT | + io->posoffset << GEN6_WM_DW6_PS_POSOFFSET__SHIFT; + + dw6 |= (info->per_sample_dispatch) ? + GEN6_WM_DW6_MSDISPMODE_PERSAMPLE : GEN6_WM_DW6_MSDISPMODE_PERPIXEL; + + STATIC_ASSERT(ARRAY_SIZE(ps->ps) >= 7); + ps->ps[0] = dw2; + ps->ps[1] = dw3; + ps->ps[2] = dw4; + ps->ps[3] = dw5; + ps->ps[4] = dw6; + ps->ps[5] = ff->kernel_offsets[1]; + ps->ps[6] = ff->kernel_offsets[2]; + + return true; +} + +static bool +ps_set_gen7_3dstate_wm(struct ilo_state_ps *ps, + const struct ilo_dev *dev, + const struct ilo_state_ps_info *info, + const struct pixel_ff *ff) +{ + const struct ilo_state_ps_io_info *io = &info->io; + uint32_t dw1, dw2; + + ILO_DEV_ASSERT(dev, 7, 7.5); + + dw1 = io->pscdepth << GEN7_WM_DW1_PSCDEPTH__SHIFT; + + if (ff->dispatch_enable) + dw1 |= GEN7_WM_DW1_PS_DISPATCH_ENABLE; + if (ff->kill_pixel) + dw1 |= GEN7_WM_DW1_PS_KILL_PIXEL; + + if (io->use_z) + dw1 |= GEN7_WM_DW1_PS_USE_DEPTH; + if (io->use_w) + dw1 |= GEN7_WM_DW1_PS_USE_W; + if (io->use_coverage_mask) + dw1 |= GEN7_WM_DW1_PS_USE_COVERAGE_MASK; + + dw2 = (info->per_sample_dispatch) ? 
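+ /*
+ * A rough cost intuition for the choice below: with hypothetical 4x
+ * MSAA and full coverage, MSDISPMODE_PERSAMPLE runs the shader four
+ * times per pixel, while MSDISPMODE_PERPIXEL runs it once and shares
+ * the result among the covered samples.
+ */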
+ GEN7_WM_DW2_MSDISPMODE_PERSAMPLE : GEN7_WM_DW2_MSDISPMODE_PERPIXEL; + + STATIC_ASSERT(ARRAY_SIZE(ps->ps) >= 2); + ps->ps[0] = dw1; + ps->ps[1] = dw2; + + return true; +} + +static bool +ps_set_gen7_3DSTATE_PS(struct ilo_state_ps *ps, + const struct ilo_dev *dev, + const struct ilo_state_ps_info *info, + const struct pixel_ff *ff) +{ + const struct ilo_state_ps_io_info *io = &info->io; + uint32_t dw2, dw3, dw4, dw5; + + ILO_DEV_ASSERT(dev, 7, 7.5); + + dw2 = ff->sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT | + ff->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT; + + if (false) + dw2 |= GEN6_THREADDISP_FP_MODE_ALT; + + dw3 = ff->scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT; + + dw4 = io->posoffset << GEN7_PS_DW4_POSOFFSET__SHIFT | + ff->dispatch_modes << GEN7_PS_DW4_DISPATCH_MODE__SHIFT; + + if (ilo_dev_gen(dev) == ILO_GEN(7.5)) { + dw4 |= ff->thread_count << GEN75_PS_DW4_MAX_THREADS__SHIFT | + (ff->sample_mask & 0xff) << GEN75_PS_DW4_SAMPLE_MASK__SHIFT; + } else { + dw4 |= ff->thread_count << GEN7_PS_DW4_MAX_THREADS__SHIFT; + } + + if (ff->pcb_enable) + dw4 |= GEN7_PS_DW4_PUSH_CONSTANT_ENABLE; + if (io->attr_count) + dw4 |= GEN7_PS_DW4_ATTR_ENABLE; + if (io->write_omask) + dw4 |= GEN7_PS_DW4_COMPUTE_OMASK; + if (info->rt_clear_enable) + dw4 |= GEN7_PS_DW4_RT_FAST_CLEAR; + if (ff->dual_source_blending) + dw4 |= GEN7_PS_DW4_DUAL_SOURCE_BLEND; + if (info->rt_resolve_enable) + dw4 |= GEN7_PS_DW4_RT_RESOLVE; + if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff->has_uav) + dw4 |= GEN75_PS_DW4_ACCESS_UAV; + + dw5 = ff->grf_starts[0] << GEN7_PS_DW5_URB_GRF_START0__SHIFT | + ff->grf_starts[1] << GEN7_PS_DW5_URB_GRF_START1__SHIFT | + ff->grf_starts[2] << GEN7_PS_DW5_URB_GRF_START2__SHIFT; + + STATIC_ASSERT(ARRAY_SIZE(ps->ps) >= 8); + ps->ps[2] = dw2; + ps->ps[3] = dw3; + ps->ps[4] = dw4; + ps->ps[5] = dw5; + ps->ps[6] = ff->kernel_offsets[1]; + ps->ps[7] = ff->kernel_offsets[2]; + + return true; +} + +static bool +ps_set_gen8_3DSTATE_PS(struct ilo_state_ps *ps, + const struct ilo_dev *dev, + const struct ilo_state_ps_info *info, + const struct pixel_ff *ff) +{ + const struct ilo_state_ps_io_info *io = &info->io; + uint32_t dw3, dw4, dw6, dw7; + + ILO_DEV_ASSERT(dev, 8, 8); + + dw3 = ff->sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT | + ff->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT; + + if (false) + dw3 |= GEN6_THREADDISP_FP_MODE_ALT; + + dw4 = ff->scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT; + + dw6 = ff->thread_count << GEN8_PS_DW6_MAX_THREADS__SHIFT | + io->posoffset << GEN8_PS_DW6_POSOFFSET__SHIFT | + ff->dispatch_modes << GEN8_PS_DW6_DISPATCH_MODE__SHIFT; + + if (ff->pcb_enable) + dw6 |= GEN8_PS_DW6_PUSH_CONSTANT_ENABLE; + + if (info->rt_clear_enable) + dw6 |= GEN8_PS_DW6_RT_FAST_CLEAR; + if (info->rt_resolve_enable) + dw6 |= GEN8_PS_DW6_RT_RESOLVE; + + dw7 = ff->grf_starts[0] << GEN8_PS_DW7_URB_GRF_START0__SHIFT | + ff->grf_starts[1] << GEN8_PS_DW7_URB_GRF_START1__SHIFT | + ff->grf_starts[2] << GEN8_PS_DW7_URB_GRF_START2__SHIFT; + + STATIC_ASSERT(ARRAY_SIZE(ps->ps) >= 6); + ps->ps[0] = dw3; + ps->ps[1] = dw4; + ps->ps[2] = dw6; + ps->ps[3] = dw7; + ps->ps[4] = ff->kernel_offsets[1]; + ps->ps[5] = ff->kernel_offsets[2]; + + return true; +} + +static bool +ps_set_gen8_3DSTATE_PS_EXTRA(struct ilo_state_ps *ps, + const struct ilo_dev *dev, + const struct ilo_state_ps_info *info, + const struct pixel_ff *ff) +{ + const struct ilo_state_ps_io_info *io = &info->io; + uint32_t dw1; + + ILO_DEV_ASSERT(dev, 8, 8); + + dw1 = io->pscdepth 
<< GEN8_PSX_DW1_PSCDEPTH__SHIFT;
+
+ if (info->valid_kernels)
+ dw1 |= GEN8_PSX_DW1_VALID;
+ if (!io->has_rt_write)
+ dw1 |= GEN8_PSX_DW1_UAV_ONLY;
+ if (io->write_omask)
+ dw1 |= GEN8_PSX_DW1_COMPUTE_OMASK;
+ if (io->write_pixel_mask)
+ dw1 |= GEN8_PSX_DW1_KILL_PIXEL;
+
+ if (io->use_z)
+ dw1 |= GEN8_PSX_DW1_USE_DEPTH;
+ if (io->use_w)
+ dw1 |= GEN8_PSX_DW1_USE_W;
+ if (io->attr_count)
+ dw1 |= GEN8_PSX_DW1_ATTR_ENABLE;
+
+ if (info->per_sample_dispatch)
+ dw1 |= GEN8_PSX_DW1_PER_SAMPLE;
+ if (ff->has_uav)
+ dw1 |= GEN8_PSX_DW1_ACCESS_UAV;
+ if (io->use_coverage_mask)
+ dw1 |= GEN8_PSX_DW1_USE_COVERAGE_MASK;
+
+ /*
+ * From the Broadwell PRM, volume 2b, page 151:
+ *
+ * "When this bit (Pixel Shader Valid) is clear, the rest of this
+ * command should also be clear."
+ */
+ if (!info->valid_kernels)
+ dw1 = 0;
+
+ STATIC_ASSERT(ARRAY_SIZE(ps->ps) >= 5);
+ ps->ps[4] = dw1;
+
+ return true;
+}
+
+bool
+ilo_state_ps_init(struct ilo_state_ps *ps,
+ const struct ilo_dev *dev,
+ const struct ilo_state_ps_info *info)
+{
+ struct pixel_ff ff;
+ bool ret = true;
+
+ assert(ilo_is_zeroed(ps, sizeof(*ps)));
+
+ ret &= ps_get_gen6_ff(dev, info, &ff);
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+ ret &= ps_set_gen8_3DSTATE_PS(ps, dev, info, &ff);
+ ret &= ps_set_gen8_3DSTATE_PS_EXTRA(ps, dev, info, &ff);
+ } else if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+ ret &= ps_set_gen7_3dstate_wm(ps, dev, info, &ff);
+ ret &= ps_set_gen7_3DSTATE_PS(ps, dev, info, &ff);
+ } else {
+ ret &= ps_set_gen6_3dstate_wm(ps, dev, info, &ff);
+ }
+
+ /* save conditions */
+ ps->conds = ff.conds;
+
+ assert(ret);
+
+ return ret;
+}
+
+bool
+ilo_state_ps_init_disabled(struct ilo_state_ps *ps,
+ const struct ilo_dev *dev)
+{
+ struct ilo_state_ps_info info;
+
+ memset(&info, 0, sizeof(info));
+
+ return ilo_state_ps_init(ps, dev, &info);
+}
+
+bool
+ilo_state_ps_set_params(struct ilo_state_ps *ps,
+ const struct ilo_dev *dev,
+ const struct ilo_state_ps_params_info *params)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /* modify sample mask */
+ if (ilo_dev_gen(dev) == ILO_GEN(7.5)) {
+ ps->ps[4] = (ps->ps[4] & ~GEN75_PS_DW4_SAMPLE_MASK__MASK) |
+ (params->sample_mask & 0xff) << GEN75_PS_DW4_SAMPLE_MASK__SHIFT;
+ }
+
+ /* modify dispatch enable, pixel kill, and dual source blending */
+ if (ilo_dev_gen(dev) < ILO_GEN(8)) {
+ if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+ if (ps_params_get_gen6_dispatch_enable(dev, params, &ps->conds))
+ ps->ps[0] |= GEN7_WM_DW1_PS_DISPATCH_ENABLE;
+ else
+ ps->ps[0] &= ~GEN7_WM_DW1_PS_DISPATCH_ENABLE;
+
+ if (ps_params_get_gen6_kill_pixel(dev, params, &ps->conds))
+ ps->ps[0] |= GEN7_WM_DW1_PS_KILL_PIXEL;
+ else
+ ps->ps[0] &= ~GEN7_WM_DW1_PS_KILL_PIXEL;
+
+ if (params->dual_source_blending)
+ ps->ps[4] |= GEN7_PS_DW4_DUAL_SOURCE_BLEND;
+ else
+ ps->ps[4] &= ~GEN7_PS_DW4_DUAL_SOURCE_BLEND;
+ } else {
+ if (ps_params_get_gen6_dispatch_enable(dev, params, &ps->conds))
+ ps->ps[3] |= GEN6_WM_DW5_PS_DISPATCH_ENABLE;
+ else
+ ps->ps[3] &= ~GEN6_WM_DW5_PS_DISPATCH_ENABLE;
+
+ if (ps_params_get_gen6_kill_pixel(dev, params, &ps->conds))
+ ps->ps[3] |= GEN6_WM_DW5_PS_KILL_PIXEL;
+ else
+ ps->ps[3] &= ~GEN6_WM_DW5_PS_KILL_PIXEL;
+
+ if (params->dual_source_blending)
+ ps->ps[3] |= GEN6_WM_DW5_PS_DUAL_SOURCE_BLEND;
+ else
+ ps->ps[3] &= ~GEN6_WM_DW5_PS_DUAL_SOURCE_BLEND;
+ }
+ }
+
+ return true;
+} diff --git a/src/gallium/drivers/ilo/core/ilo_state_sol.c b/src/gallium/drivers/ilo/core/ilo_state_sol.c new file mode 100644 index 00000000000..38c0b719ab3 --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_sol.c @@ -0,0 +1,464 
@@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2012-2015 LunarG, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#include "ilo_debug.h" +#include "ilo_buffer.h" +#include "ilo_state_sol.h" + +static bool +sol_stream_validate_gen7(const struct ilo_dev *dev, + const struct ilo_state_sol_stream_info *stream) +{ + uint8_t i; + + ILO_DEV_ASSERT(dev, 7, 8); + + assert(stream->vue_read_base + stream->vue_read_count <= + stream->cv_vue_attr_count); + + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 200: + * + * "(Stream 0 Vertex Read Offset) + * Format: U1 count of 256-bit units + * + * Specifies amount of data to skip over before reading back Stream 0 + * vertex data. Must be zero if the GS is enabled and the Output + * Vertex Size field in 3DSTATE_GS is programmed to 0 (i.e., one 16B + * unit)." + * + * "(Stream 0 Vertex Read Length) + * Format: U5-1 count of 256-bit units + * + * Specifies amount of vertex data to read back for Stream 0 vertices, + * starting at the Stream 0 Vertex Read Offset location. Maximum + * readback is 17 256-bit units (34 128-bit vertex attributes). Read + * data past the end of the valid vertex data has undefined contents, + * and therefore shouldn't be used to source stream out data. Must be + * zero (i.e., read length = 256b) if the GS is enabled and the Output + * Vertex Size field in 3DSTATE_GS is programmed to 0 (i.e., one 16B + * unit)." + */ + assert(stream->vue_read_base == 0 || stream->vue_read_base == 2); + assert(stream->vue_read_count <= 34); + + assert(stream->decl_count <= ILO_STATE_SOL_MAX_DECL_COUNT); + + for (i = 0; i < stream->decl_count; i++) { + const struct ilo_state_sol_decl_info *decl = &stream->decls[i]; + + assert(decl->is_hole || decl->attr < stream->vue_read_count); + + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 205: + * + * "There is only enough internal storage for the 128-bit vertex + * header and 32 128-bit vertex attributes." 
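+ *
+ * (one header slot plus 32 attribute slots allows attr indices 0
+ * through 32, hence the assert below)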
+ */
+ assert(decl->attr < 33);
+
+ assert(decl->component_base < 4 &&
+ decl->component_base + decl->component_count <= 4);
+ assert(decl->buffer < ILO_STATE_SOL_MAX_BUFFER_COUNT);
+ }
+
+ return true;
+}
+
+static bool
+sol_validate_gen7(const struct ilo_dev *dev,
+ const struct ilo_state_sol_info *info)
+{
+ uint8_t i;
+
+ ILO_DEV_ASSERT(dev, 7, 8);
+
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 198:
+ *
+ * "This bit (Render Stream Select) is used even if SO Function Enable
+ * is DISABLED."
+ *
+ * From the Haswell PRM, volume 2b, page 796:
+ *
+ * "SO Function Enable must also be ENABLED in order for this field
+ * (Render Stream Select) to select a stream for rendering. When SO
+ * Function Enable is DISABLED and Rendering Disable is cleared (i.e.,
+ * rendering is enabled), StreamID is ignored downstream of the SO
+ * stage, allowing any stream to be rendered."
+ *
+ * We want Gen7 behavior, but we have to require users to follow Gen7.5
+ * behavior: info->sol_enable must be set for info->render_stream to work.
+ */
+
+ for (i = 0; i < ARRAY_SIZE(info->streams); i++) {
+ if (!sol_stream_validate_gen7(dev, &info->streams[i]))
+ return false;
+ }
+
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 208:
+ *
+ * "(Surface Pitch)
+ * [0,2048] Must be 0 or a multiple of 4 Bytes."
+ */
+ for (i = 0; i < ARRAY_SIZE(info->buffer_strides); i++) {
+ assert(info->buffer_strides[i] <= 2048 &&
+ info->buffer_strides[i] % 4 == 0);
+ }
+
+ return true;
+}
+
+static bool
+sol_set_gen7_3DSTATE_STREAMOUT(struct ilo_state_sol *sol,
+ const struct ilo_dev *dev,
+ const struct ilo_state_sol_info *info)
+{
+ struct {
+ uint8_t offset;
+ uint8_t len;
+ } vue_read[ILO_STATE_SOL_MAX_STREAM_COUNT];
+ uint8_t i;
+ uint32_t dw1, dw2;
+
+ ILO_DEV_ASSERT(dev, 7, 8);
+
+ if (!sol_validate_gen7(dev, info))
+ return false;
+
+ for (i = 0; i < ARRAY_SIZE(info->streams); i++) {
+ const struct ilo_state_sol_stream_info *stream = &info->streams[i];
+
+ vue_read[i].offset = stream->vue_read_base / 2;
+ /*
+ * In pairs minus 1. URB entries are aligned to 512-bits. There is no
+ * need to worry about reading past entries. 
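+ *
+ * A hypothetical example: vue_read_count == 5 covers three pairs, and
+ * the field is stored minus one, so len == (5 + 1) / 2 - 1 == 2.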
+ */ + vue_read[i].len = (stream->vue_read_count + 1) / 2; + if (vue_read[i].len) + vue_read[i].len--; + } + + dw1 = info->render_stream << GEN7_SO_DW1_RENDER_STREAM_SELECT__SHIFT | + info->tristrip_reorder << GEN7_SO_DW1_REORDER_MODE__SHIFT; + + if (info->sol_enable) + dw1 |= GEN7_SO_DW1_SO_ENABLE; + + if (info->render_disable) + dw1 |= GEN7_SO_DW1_RENDER_DISABLE; + + if (info->stats_enable) + dw1 |= GEN7_SO_DW1_STATISTICS; + + if (ilo_dev_gen(dev) < ILO_GEN(8)) { + const uint8_t buffer_enables = ((bool) info->buffer_strides[3]) << 3 | + ((bool) info->buffer_strides[2]) << 2 | + ((bool) info->buffer_strides[1]) << 1 | + ((bool) info->buffer_strides[0]); + dw1 |= buffer_enables << GEN7_SO_DW1_BUFFER_ENABLES__SHIFT; + } + + dw2 = vue_read[3].offset << GEN7_SO_DW2_STREAM3_READ_OFFSET__SHIFT | + vue_read[3].len << GEN7_SO_DW2_STREAM3_READ_LEN__SHIFT | + vue_read[2].offset << GEN7_SO_DW2_STREAM2_READ_OFFSET__SHIFT | + vue_read[2].len << GEN7_SO_DW2_STREAM2_READ_LEN__SHIFT | + vue_read[1].offset << GEN7_SO_DW2_STREAM1_READ_OFFSET__SHIFT | + vue_read[1].len << GEN7_SO_DW2_STREAM1_READ_LEN__SHIFT | + vue_read[0].offset << GEN7_SO_DW2_STREAM0_READ_OFFSET__SHIFT | + vue_read[0].len << GEN7_SO_DW2_STREAM0_READ_LEN__SHIFT; + + STATIC_ASSERT(ARRAY_SIZE(sol->streamout) >= 2); + sol->streamout[0] = dw1; + sol->streamout[1] = dw2; + + memcpy(sol->strides, info->buffer_strides, sizeof(sol->strides)); + + return true; +} + +static bool +sol_set_gen7_3DSTATE_SO_DECL_LIST(struct ilo_state_sol *sol, + const struct ilo_dev *dev, + const struct ilo_state_sol_info *info, + uint8_t max_decl_count) +{ + uint64_t decl_list[ILO_STATE_SOL_MAX_DECL_COUNT]; + uint8_t decl_counts[ILO_STATE_SOL_MAX_STREAM_COUNT]; + uint8_t buffer_selects[ILO_STATE_SOL_MAX_STREAM_COUNT]; + uint32_t dw1, dw2; + uint8_t i, j; + + ILO_DEV_ASSERT(dev, 7, 8); + + memset(decl_list, 0, sizeof(decl_list[0]) * max_decl_count); + + for (i = 0; i < ARRAY_SIZE(info->streams); i++) { + const struct ilo_state_sol_stream_info *stream = &info->streams[i]; + + assert(stream->decl_count <= max_decl_count); + decl_counts[i] = stream->decl_count; + buffer_selects[i] = 0; + + for (j = 0; j < stream->decl_count; j++) { + const struct ilo_state_sol_decl_info *decl = &stream->decls[j]; + const uint8_t mask = ((1 << decl->component_count) - 1) << + decl->component_base; + uint16_t val; + + val = decl->buffer << GEN7_SO_DECL_OUTPUT_SLOT__SHIFT | + mask << GEN7_SO_DECL_COMPONENT_MASK__SHIFT; + + if (decl->is_hole) + val |= GEN7_SO_DECL_HOLE_FLAG; + else + val |= decl->attr << GEN7_SO_DECL_REG_INDEX__SHIFT; + + decl_list[j] |= (uint64_t) val << (16 * i); + buffer_selects[i] |= 1 << decl->buffer; + } + } + + dw1 = buffer_selects[3] << GEN7_SO_DECL_DW1_STREAM3_BUFFER_SELECTS__SHIFT | + buffer_selects[2] << GEN7_SO_DECL_DW1_STREAM2_BUFFER_SELECTS__SHIFT | + buffer_selects[1] << GEN7_SO_DECL_DW1_STREAM1_BUFFER_SELECTS__SHIFT | + buffer_selects[0] << GEN7_SO_DECL_DW1_STREAM0_BUFFER_SELECTS__SHIFT; + dw2 = decl_counts[3] << GEN7_SO_DECL_DW2_STREAM3_ENTRY_COUNT__SHIFT | + decl_counts[2] << GEN7_SO_DECL_DW2_STREAM2_ENTRY_COUNT__SHIFT | + decl_counts[1] << GEN7_SO_DECL_DW2_STREAM1_ENTRY_COUNT__SHIFT | + decl_counts[0] << GEN7_SO_DECL_DW2_STREAM0_ENTRY_COUNT__SHIFT; + + STATIC_ASSERT(ARRAY_SIZE(sol->so_decl) >= 2); + sol->so_decl[0] = dw1; + sol->so_decl[1] = dw2; + + STATIC_ASSERT(ARRAY_SIZE(sol->decl[0]) == 2); + memcpy(sol->decl, decl_list, sizeof(sol->decl[0]) * max_decl_count); + sol->decl_count = max_decl_count; + + return true; +} + +static bool 
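+ /*
+ * A sketch of the SO_DECL packing done above, for a hypothetical decl
+ * writing components y/z of attribute 3 to buffer 1:
+ *
+ * mask == ((1 << 2) - 1) << 1 == 0x6
+ * val == 1 << GEN7_SO_DECL_OUTPUT_SLOT__SHIFT |
+ * 0x6 << GEN7_SO_DECL_COMPONENT_MASK__SHIFT |
+ * 3 << GEN7_SO_DECL_REG_INDEX__SHIFT
+ *
+ * and stream i's 16-bit decls occupy bits [16*i+15:16*i] of each
+ * 64-bit decl_list entry.
+ */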
+sol_buffer_validate_gen7(const struct ilo_dev *dev,
+ const struct ilo_state_sol_buffer_info *info)
+{
+ ILO_DEV_ASSERT(dev, 7, 8);
+
+ if (info->buf)
+ assert(info->offset < info->buf->bo_size && info->size);
+
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 208:
+ *
+ * "(Surface Base Address) This field specifies the starting DWord
+ * address..."
+ */
+ assert(info->offset % 4 == 0);
+
+ /* Gen8+ only */
+ if (info->write_offset_load || info->write_offset_save)
+ assert(ilo_dev_gen(dev) >= ILO_GEN(8));
+
+ /*
+ * From the Broadwell PRM, volume 2b, page 206:
+ *
+ * "This field (Stream Offset) specifies the Offset in stream output
+ * buffer to start at, or whether to append to the end of an existing
+ * buffer. The Offset must be DWORD aligned."
+ */
+ if (info->write_offset_imm_enable) {
+ assert(info->write_offset_load);
+ assert(info->write_offset_imm % 4 == 0);
+ }
+
+ return true;
+}
+
+static uint32_t
+sol_buffer_get_gen6_size(const struct ilo_dev *dev,
+ const struct ilo_state_sol_buffer_info *info)
+{
+ uint32_t size;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ if (!info->buf)
+ return 0;
+
+ size = (info->offset + info->size <= info->buf->bo_size) ? info->size :
+ info->buf->bo_size - info->offset;
+
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 208:
+ *
+ * "(Surface End Address) This field specifies the ending DWord
+ * address..."
+ */
+ size &= ~3;
+
+ return size;
+}
+
+static bool
+sol_buffer_set_gen7_3dstate_so_buffer(struct ilo_state_sol_buffer *sb,
+ const struct ilo_dev *dev,
+ const struct ilo_state_sol_buffer_info *info)
+{
+ const uint32_t size = sol_buffer_get_gen6_size(dev, info);
+
+ ILO_DEV_ASSERT(dev, 7, 7.5);
+
+ if (!sol_buffer_validate_gen7(dev, info))
+ return false;
+
+ STATIC_ASSERT(ARRAY_SIZE(sb->so_buf) >= 2);
+ sb->so_buf[0] = info->offset;
+ sb->so_buf[1] = (size) ? info->offset + size : 0;
+
+ return true;
+}
+
+static bool
+sol_buffer_set_gen8_3dstate_so_buffer(struct ilo_state_sol_buffer *sb,
+ const struct ilo_dev *dev,
+ const struct ilo_state_sol_buffer_info *info)
+{
+ const uint32_t size = sol_buffer_get_gen6_size(dev, info);
+ uint32_t dw1;
+
+ ILO_DEV_ASSERT(dev, 8, 8);
+
+ if (!sol_buffer_validate_gen7(dev, info))
+ return false;
+
+ dw1 = 0;
+
+ if (info->buf)
+ dw1 |= GEN8_SO_BUF_DW1_ENABLE;
+ if (info->write_offset_load)
+ dw1 |= GEN8_SO_BUF_DW1_OFFSET_WRITE_ENABLE;
+ if (info->write_offset_save)
+ dw1 |= GEN8_SO_BUF_DW1_OFFSET_ENABLE;
+
+ STATIC_ASSERT(ARRAY_SIZE(sb->so_buf) >= 4);
+ sb->so_buf[0] = dw1;
+ sb->so_buf[1] = info->offset;
+
+ /*
+ * From the Broadwell PRM, volume 2b, page 205:
+ *
+ * "This field (Surface Size) specifies the size of buffer in number
+ * of DWords minus 1 of the buffer in Graphics Memory."
+ */
+ sb->so_buf[2] = (size) ? size / 4 - 1 : 0;
+
+ /* load from imm or sb->write_offset_bo */
+ sb->so_buf[3] = (info->write_offset_imm_enable) ?
+ info->write_offset_imm : ~0u; + + return true; +} + +bool +ilo_state_sol_init(struct ilo_state_sol *sol, + const struct ilo_dev *dev, + const struct ilo_state_sol_info *info) +{ + bool ret = true; + + assert(ilo_is_zeroed(sol, sizeof(*sol))); + assert(ilo_is_zeroed(info->data, info->data_size)); + + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + uint8_t max_decl_count, i; + + max_decl_count = info->streams[0].decl_count; + for (i = 1; i < ARRAY_SIZE(info->streams); i++) { + if (max_decl_count < info->streams[i].decl_count) + max_decl_count = info->streams[i].decl_count; + } + + assert(ilo_state_sol_data_size(dev, max_decl_count) <= info->data_size); + sol->decl = (uint32_t (*)[2]) info->data; + + ret &= sol_set_gen7_3DSTATE_STREAMOUT(sol, dev, info); + ret &= sol_set_gen7_3DSTATE_SO_DECL_LIST(sol, dev, info, max_decl_count); + } + + assert(ret); + + return ret; +} + +bool +ilo_state_sol_init_disabled(struct ilo_state_sol *sol, + const struct ilo_dev *dev, + bool render_disable) +{ + struct ilo_state_sol_info info; + + memset(&info, 0, sizeof(info)); + info.render_disable = render_disable; + + return ilo_state_sol_init(sol, dev, &info); +} + +bool +ilo_state_sol_buffer_init(struct ilo_state_sol_buffer *sb, + const struct ilo_dev *dev, + const struct ilo_state_sol_buffer_info *info) +{ + bool ret = true; + + assert(ilo_is_zeroed(sb, sizeof(*sb))); + + if (ilo_dev_gen(dev) >= ILO_GEN(8)) + ret &= sol_buffer_set_gen8_3dstate_so_buffer(sb, dev, info); + else + ret &= sol_buffer_set_gen7_3dstate_so_buffer(sb, dev, info); + + sb->need_bo = (info->size > 0); + sb->need_write_offset_bo = (info->write_offset_save || + (info->write_offset_load && !info->write_offset_imm_enable)); + + assert(ret); + + return ret; +} + +bool +ilo_state_sol_buffer_init_disabled(struct ilo_state_sol_buffer *sb, + const struct ilo_dev *dev) +{ + struct ilo_state_sol_buffer_info info; + + memset(&info, 0, sizeof(info)); + + return ilo_state_sol_buffer_init(sb, dev, &info); +} diff --git a/src/gallium/drivers/ilo/core/ilo_state_sol.h b/src/gallium/drivers/ilo/core/ilo_state_sol.h new file mode 100644 index 00000000000..2513fcb4979 --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_sol.h @@ -0,0 +1,166 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2015 LunarG, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#ifndef ILO_STATE_SOL_H +#define ILO_STATE_SOL_H + +#include "genhw/genhw.h" + +#include "ilo_core.h" +#include "ilo_dev.h" + +/* + * From the Ivy Bridge PRM, volume 2 part 1, page 193: + * + * "Incoming topologies are tagged with a 2-bit StreamID." + */ +#define ILO_STATE_SOL_MAX_STREAM_COUNT 4 + +/* + * From the Ivy Bridge PRM, volume 2 part 1, page 195: + * + * "Up to four SO buffers are supported." + */ +#define ILO_STATE_SOL_MAX_BUFFER_COUNT 4 + +/* + * From the Ivy Bridge PRM, volume 2 part 1, page 201: + * + * "All 128 decls..." + */ +#define ILO_STATE_SOL_MAX_DECL_COUNT 128 + +/** + * Output a vertex attribute. + */ +struct ilo_state_sol_decl_info { + /* select an attribute from read ones */ + uint8_t attr; + bool is_hole; + + /* which components to write */ + uint8_t component_base; + uint8_t component_count; + + /* destination buffer */ + uint8_t buffer; +}; + +struct ilo_state_sol_stream_info { + /* which VUE attributes to read */ + uint8_t cv_vue_attr_count; + uint8_t vue_read_base; + uint8_t vue_read_count; + + uint8_t decl_count; + const struct ilo_state_sol_decl_info *decls; +}; + +struct ilo_state_sol_info { + void *data; + size_t data_size; + + bool sol_enable; + bool stats_enable; + enum gen_reorder_mode tristrip_reorder; + + bool render_disable; + /* ignored when SOL is disabled */ + uint8_t render_stream; + + /* a buffer is disabled when its stride is zero */ + uint16_t buffer_strides[ILO_STATE_SOL_MAX_BUFFER_COUNT]; + + struct ilo_state_sol_stream_info streams[ILO_STATE_SOL_MAX_STREAM_COUNT]; +}; + +struct ilo_state_sol { + uint32_t streamout[2]; + uint16_t strides[4]; + + uint32_t so_decl[2]; + uint32_t (*decl)[2]; + uint8_t decl_count; +}; + +struct ilo_buffer; + +struct ilo_state_sol_buffer_info { + const struct ilo_buffer *buf; + uint32_t offset; + uint32_t size; + + /* + * Gen8+ only. When enabled, require a write offset bo of at least + * (sizeof(uint32_t) * ILO_STATE_SOL_MAX_BUFFER_COUNT) bytes + */ + bool write_offset_load; + bool write_offset_save; + + bool write_offset_imm_enable; + uint32_t write_offset_imm; +}; + +struct ilo_state_sol_buffer { + uint32_t so_buf[4]; + + bool need_bo; + bool need_write_offset_bo; + + /* managed by users */ + struct intel_bo *bo; + struct intel_bo *write_offset_bo; +}; + +static inline size_t +ilo_state_sol_data_size(const struct ilo_dev *dev, uint8_t max_decl_count) +{ + const struct ilo_state_sol *so = NULL; + return (ilo_dev_gen(dev) >= ILO_GEN(7)) ? + sizeof(so->decl[0]) * max_decl_count : 0; +} + +bool +ilo_state_sol_init(struct ilo_state_sol *sol, + const struct ilo_dev *dev, + const struct ilo_state_sol_info *info); + +bool +ilo_state_sol_init_disabled(struct ilo_state_sol *sol, + const struct ilo_dev *dev, + bool render_disable); + +bool +ilo_state_sol_buffer_init(struct ilo_state_sol_buffer *sb, + const struct ilo_dev *dev, + const struct ilo_state_sol_buffer_info *info); + +bool +ilo_state_sol_buffer_init_disabled(struct ilo_state_sol_buffer *sb, + const struct ilo_dev *dev); + +#endif /* ILO_STATE_SOL_H */ diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.c b/src/gallium/drivers/ilo/core/ilo_state_surface.c new file mode 100644 index 00000000000..5be9f8f6270 --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_surface.c @@ -0,0 +1,1179 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2012-2015 LunarG, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#include "ilo_debug.h" +#include "ilo_buffer.h" +#include "ilo_image.h" +#include "ilo_state_surface.h" + +static bool +surface_set_gen6_null_SURFACE_STATE(struct ilo_state_surface *surf, + const struct ilo_dev *dev) +{ + uint32_t dw0, dw3; + + ILO_DEV_ASSERT(dev, 6, 6); + + /* + * From the Sandy Bridge PRM, volume 4 part 1, page 71: + * + * "All of the remaining fields in surface state are ignored for null + * surfaces, with the following exceptions: + * + * - [DevSNB+]: Width, Height, Depth, and LOD fields must match the + * depth buffer's corresponding state for all render target + * surfaces, including null. + * - Surface Format must be R8G8B8A8_UNORM." + * + * From the Sandy Bridge PRM, volume 4 part 1, page 82: + * + * "If Surface Type is SURFTYPE_NULL, this field (Tiled Surface) must + * be true" + * + * Note that we ignore the first exception for all surface types. + */ + dw0 = GEN6_SURFTYPE_NULL << GEN6_SURFACE_DW0_TYPE__SHIFT | + GEN6_FORMAT_R8G8B8A8_UNORM << GEN6_SURFACE_DW0_FORMAT__SHIFT; + dw3 = GEN6_TILING_X << GEN6_SURFACE_DW3_TILING__SHIFT; + + STATIC_ASSERT(ARRAY_SIZE(surf->surface) >= 6); + surf->surface[0] = dw0; + surf->surface[1] = 0; + surf->surface[2] = 0; + surf->surface[3] = dw3; + surf->surface[4] = 0; + surf->surface[5] = 0; + + return true; +} + +static bool +surface_set_gen7_null_SURFACE_STATE(struct ilo_state_surface *surf, + const struct ilo_dev *dev) +{ + uint32_t dw0; + + ILO_DEV_ASSERT(dev, 7, 8); + + dw0 = GEN6_SURFTYPE_NULL << GEN7_SURFACE_DW0_TYPE__SHIFT | + GEN6_FORMAT_R8G8B8A8_UNORM << GEN7_SURFACE_DW0_FORMAT__SHIFT; + if (ilo_dev_gen(dev) >= ILO_GEN(8)) + dw0 |= GEN6_TILING_X << GEN8_SURFACE_DW0_TILING__SHIFT; + else + dw0 |= GEN6_TILING_X << GEN7_SURFACE_DW0_TILING__SHIFT; + + STATIC_ASSERT(ARRAY_SIZE(surf->surface) >= 13); + surf->surface[0] = dw0; + memset(&surf->surface[1], 0, sizeof(uint32_t) * + (((ilo_dev_gen(dev) >= ILO_GEN(8)) ? 
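+ /* SURFACE_STATE is 13 DWords on Gen8 and 8 on Gen7 in this driver */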
13 : 8) - 1)); + + return true; +} + +static bool +surface_validate_gen6_buffer(const struct ilo_dev *dev, + const struct ilo_state_surface_buffer_info *info) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + /* SVB writes are Gen6-only */ + if (ilo_dev_gen(dev) >= ILO_GEN(7)) + assert(info->access != ILO_STATE_SURFACE_ACCESS_DP_SVB); + + if (info->offset + info->size > info->buf->bo_size) { + ilo_warn("invalid buffer range\n"); + return false; + } + + /* + * From the Sandy Bridge PRM, volume 4 part 1, page 81: + * + * "For surfaces of type SURFTYPE_BUFFER: [0,2047] -> [1B, 2048B] + * For surfaces of type SURFTYPE_STRBUF: [0,2047] -> [1B, 2048B]" + */ + if (!info->struct_size || info->struct_size > 2048) { + ilo_warn("invalid buffer struct size\n"); + return false; + } + + /* + * From the Ivy Bridge PRM, volume 4 part 1, page 68: + * + * "The Base Address for linear render target surfaces and surfaces + * accessed with the typed surface read/write data port messages must + * be element-size aligned, for non-YUV surface formats, or a multiple + * of 2 element-sizes for YUV surface formats. Other linear surfaces + * have no alignment requirements (byte alignment is sufficient)." + * + * "Certain message types used to access surfaces have more stringent + * alignment requirements. Please refer to the specific message + * documentation for additional restrictions." + * + * From the Ivy Bridge PRM, volume 4 part 1, page 233, 235, and 237: + * + * "the surface base address must be OWord aligned" + * + * for OWord Block Read/Write, Unaligned OWord Block Read, and OWord Dual + * Block Read/Write. + * + * From the Ivy Bridge PRM, volume 4 part 1, page 246 and 249: + * + * "The surface base address must be DWord aligned" + * + * for DWord Scattered Read/Write and Byte Scattered Read/Write. + * + * We have to rely on users to correctly set info->struct_size here. DWord + * Scattered Read/Write has conflicting pitch and alignment, but we do not + * use them yet so we are fine. + * + * It is unclear if sampling engine surfaces require aligned offsets. + */ + if (info->access != ILO_STATE_SURFACE_ACCESS_DP_SVB) { + assert(info->struct_size % info->format_size == 0); + + if (info->offset % info->struct_size) { + ilo_warn("bad buffer offset\n"); + return false; + } + } + + if (info->format == GEN6_FORMAT_RAW) { + /* + * From the Sandy Bridge PRM, volume 4 part 1, page 97: + * + * ""RAW" is supported only with buffers and structured buffers + * accessed via the untyped surface read/write and untyped atomic + * operation messages, which do not have a column in the table." + * + * We do not have a specific access mode for untyped messages. + */ + assert(info->access == ILO_STATE_SURFACE_ACCESS_DP_UNTYPED); + + /* + * Nothing is said about Untyped* messages, but I guess they require the + * base address to be DWord aligned. + */ + if (info->offset % 4) { + ilo_warn("bad RAW buffer offset\n"); + return false; + } + + if (info->struct_size > 1) { + /* no STRBUF on Gen6 */ + if (ilo_dev_gen(dev) == ILO_GEN(6)) { + ilo_warn("no STRBUF support\n"); + return false; + } + + /* + * From the Ivy Bridge PRM, volume 4 part 1, page 70: + * + * "For linear surfaces with Surface Type of SURFTYPE_STRBUF, the + * pitch must be a multiple of 4 bytes." 
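+ *
+ * A struct_size of 6, for example, is a valid pitch for a typed buffer
+ * of a two-byte format but is rejected here for STRBUF.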
+ */ + if (info->struct_size % 4) { + ilo_warn("bad STRBUF pitch\n"); + return false; + } + } + } + + return true; +} + +static bool +surface_get_gen6_buffer_struct_count(const struct ilo_dev *dev, + const struct ilo_state_surface_buffer_info *info, + uint32_t *count) +{ + uint32_t max_struct, c; + + ILO_DEV_ASSERT(dev, 6, 8); + + c = info->size / info->struct_size; + if (info->access == ILO_STATE_SURFACE_ACCESS_DP_SVB && + info->format_size < info->size - info->struct_size * c) + c++; + + /* + * From the Sandy Bridge PRM, volume 4 part 1, page 77: + * + * "For buffer surfaces, the number of entries in the buffer ranges + * from 1 to 2^27." + * + * From the Ivy Bridge PRM, volume 4 part 1, page 68: + * + * "For typed buffer and structured buffer surfaces, the number of + * entries in the buffer ranges from 1 to 2^27. For raw buffer + * surfaces, the number of entries in the buffer is the number of + * bytes which can range from 1 to 2^30." + * + * From the Ivy Bridge PRM, volume 4 part 1, page 69: + * + * For SURFTYPE_BUFFER: The low two bits of this field (Width) must be + * 11 if the Surface Format is RAW (the size of the buffer must be a + * multiple of 4 bytes)." + */ + max_struct = 1 << 27; + if (info->format == GEN6_FORMAT_RAW && info->struct_size == 1) { + if (ilo_dev_gen(dev) >= ILO_GEN(7)) + max_struct = 1 << 30; + + c &= ~3; + } + + if (!c || c > max_struct) { + ilo_warn("too many or zero buffer structs\n"); + return false; + } + + *count = c - 1; + + return true; +} + +static bool +surface_set_gen6_buffer_SURFACE_STATE(struct ilo_state_surface *surf, + const struct ilo_dev *dev, + const struct ilo_state_surface_buffer_info *info) +{ + uint32_t dw0, dw1, dw2, dw3; + uint32_t struct_count; + int width, height, depth; + + ILO_DEV_ASSERT(dev, 6, 6); + + if (!surface_validate_gen6_buffer(dev, info) || + !surface_get_gen6_buffer_struct_count(dev, info, &struct_count)) + return false; + + /* bits [6:0] */ + width = (struct_count & 0x0000007f); + /* bits [19:7] */ + height = (struct_count & 0x000fff80) >> 7; + /* bits [26:20] */ + depth = (struct_count & 0x07f00000) >> 20; + + dw0 = GEN6_SURFTYPE_BUFFER << GEN6_SURFACE_DW0_TYPE__SHIFT | + info->format << GEN6_SURFACE_DW0_FORMAT__SHIFT; + dw1 = info->offset; + dw2 = height << GEN6_SURFACE_DW2_HEIGHT__SHIFT | + width << GEN6_SURFACE_DW2_WIDTH__SHIFT; + dw3 = depth << GEN6_SURFACE_DW3_DEPTH__SHIFT | + (info->struct_size - 1) << GEN6_SURFACE_DW3_PITCH__SHIFT; + + STATIC_ASSERT(ARRAY_SIZE(surf->surface) >= 6); + surf->surface[0] = dw0; + surf->surface[1] = dw1; + surf->surface[2] = dw2; + surf->surface[3] = dw3; + surf->surface[4] = 0; + surf->surface[5] = 0; + + surf->type = GEN6_SURFTYPE_BUFFER; + surf->min_lod = 0; + surf->mip_count = 0; + + return true; +} + +static bool +surface_set_gen7_buffer_SURFACE_STATE(struct ilo_state_surface *surf, + const struct ilo_dev *dev, + const struct ilo_state_surface_buffer_info *info) +{ + uint32_t dw0, dw1, dw2, dw3, dw7; + enum gen_surface_type type; + uint32_t struct_count; + int width, height, depth; + + ILO_DEV_ASSERT(dev, 7, 8); + + if (!surface_validate_gen6_buffer(dev, info) || + !surface_get_gen6_buffer_struct_count(dev, info, &struct_count)) + return false; + + type = (info->format == GEN6_FORMAT_RAW && info->struct_size > 1) ? 
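+ /* RAW with struct_size > 1 means a structured buffer */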
+ GEN7_SURFTYPE_STRBUF : GEN6_SURFTYPE_BUFFER; + + /* bits [6:0] */ + width = (struct_count & 0x0000007f); + /* bits [20:7] */ + height = (struct_count & 0x001fff80) >> 7; + /* bits [30:21] */ + depth = (struct_count & 0x7fe00000) >> 21; + + dw0 = type << GEN7_SURFACE_DW0_TYPE__SHIFT | + info->format << GEN7_SURFACE_DW0_FORMAT__SHIFT; + dw1 = (ilo_dev_gen(dev) >= ILO_GEN(8)) ? 0 : info->offset; + dw2 = GEN_SHIFT32(height, GEN7_SURFACE_DW2_HEIGHT) | + GEN_SHIFT32(width, GEN7_SURFACE_DW2_WIDTH); + dw3 = GEN_SHIFT32(depth, GEN7_SURFACE_DW3_DEPTH) | + GEN_SHIFT32(info->struct_size - 1, GEN7_SURFACE_DW3_PITCH); + + dw7 = 0; + if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) { + dw7 |= GEN_SHIFT32(GEN75_SCS_RED, GEN75_SURFACE_DW7_SCS_R) | + GEN_SHIFT32(GEN75_SCS_GREEN, GEN75_SURFACE_DW7_SCS_G) | + GEN_SHIFT32(GEN75_SCS_BLUE, GEN75_SURFACE_DW7_SCS_B) | + GEN_SHIFT32(GEN75_SCS_ALPHA, GEN75_SURFACE_DW7_SCS_A); + } + + STATIC_ASSERT(ARRAY_SIZE(surf->surface) >= 13); + surf->surface[0] = dw0; + surf->surface[1] = dw1; + surf->surface[2] = dw2; + surf->surface[3] = dw3; + surf->surface[4] = 0; + surf->surface[5] = 0; + surf->surface[6] = 0; + surf->surface[7] = dw7; + if (ilo_dev_gen(dev) >= ILO_GEN(8)) { + surf->surface[8] = info->offset; + surf->surface[9] = 0; + surf->surface[10] = 0; + surf->surface[11] = 0; + surf->surface[12] = 0; + } + + surf->type = type; + surf->min_lod = 0; + surf->mip_count = 0; + + return true; +} + +static enum gen_surface_type +get_gen6_surface_type(const struct ilo_dev *dev, const struct ilo_image *img) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + switch (img->target) { + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_1D_ARRAY: + return GEN6_SURFTYPE_1D; + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_2D_ARRAY: + case PIPE_TEXTURE_CUBE_ARRAY: + return GEN6_SURFTYPE_2D; + case PIPE_TEXTURE_3D: + return GEN6_SURFTYPE_3D; + default: + assert(!"unknown texture target"); + return GEN6_SURFTYPE_NULL; + } +} + +static bool +surface_validate_gen6_image(const struct ilo_dev *dev, + const struct ilo_state_surface_image_info *info) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + switch (info->access) { + case ILO_STATE_SURFACE_ACCESS_SAMPLER: + case ILO_STATE_SURFACE_ACCESS_DP_RENDER: + break; + case ILO_STATE_SURFACE_ACCESS_DP_TYPED: + assert(ilo_dev_gen(dev) >= ILO_GEN(7)); + break; + default: + assert(!"unsupported surface access"); + break; + } + + /* + * From the Sandy Bridge PRM, volume 4 part 1, page 78: + * + * "For surface types other than SURFTYPE_BUFFER, the Width specified + * by this field must be less than or equal to the surface pitch + * (specified in bytes via the Surface Pitch field)." + */ + assert(info->img->bo_stride && info->img->bo_stride <= 512 * 1024 && + info->img->width0 <= info->img->bo_stride); + + if (info->is_cube_map) { + assert(get_gen6_surface_type(dev, info->img) == GEN6_SURFTYPE_2D); + + /* + * From the Sandy Bridge PRM, volume 4 part 1, page 78: + * + * "For cube maps, Width must be set equal to the Height." + */ + assert(info->img->width0 == info->img->height0); + } + + /* + * From the Sandy Bridge PRM, volume 4 part 1, page 72: + * + * "Tile Walk TILEWALK_YMAJOR is UNDEFINED for render target formats + * that have 128 bits-per-element (BPE)." 
+ *
+ * "If Number of Multisamples is set to a value other than
+ * MULTISAMPLECOUNT_1, this field cannot be set to the following
+ * formats:
+ *
+ * - any format with greater than 64 bits per element
+ * - any compressed texture format (BC*)
+ * - any YCRCB* format"
+ *
+ * From the Ivy Bridge PRM, volume 4 part 1, page 63:
+ *
+ * If Number of Multisamples is set to a value other than
+ * MULTISAMPLECOUNT_1, this field cannot be set to the following
+ * formats: any format with greater than 64 bits per element, if
+ * Number of Multisamples is MULTISAMPLECOUNT_8, any compressed
+ * texture format (BC*), and any YCRCB* format.
+ *
+ * TODO
+ */
+
+ if (ilo_dev_gen(dev) < ILO_GEN(8) && info->img->tiling == GEN8_TILING_W) {
+ ilo_warn("tiling W is not supported\n");
+ return false;
+ }
+
+ return true;
+}
+
+static void
+get_gen6_max_extent(const struct ilo_dev *dev,
+ const struct ilo_image *img,
+ uint16_t *max_w, uint16_t *max_h)
+{
+ const uint16_t max_size = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 16384 : 8192;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ switch (get_gen6_surface_type(dev, img)) {
+ case GEN6_SURFTYPE_1D:
+ *max_w = max_size;
+ *max_h = 1;
+ break;
+ case GEN6_SURFTYPE_2D:
+ *max_w = max_size;
+ *max_h = max_size;
+ break;
+ case GEN6_SURFTYPE_3D:
+ *max_w = 2048;
+ *max_h = 2048;
+ break;
+ default:
+ assert(!"invalid surface type");
+ *max_w = 1;
+ *max_h = 1;
+ break;
+ }
+}
+
+static bool
+surface_get_gen6_image_extent(const struct ilo_dev *dev,
+ const struct ilo_state_surface_image_info *info,
+ uint16_t *width, uint16_t *height)
+{
+ uint16_t w, h, max_w, max_h;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ w = info->img->width0;
+ h = info->img->height0;
+
+ get_gen6_max_extent(dev, info->img, &max_w, &max_h);
+ assert(w && h && w <= max_w && h <= max_h);
+
+ *width = w - 1;
+ *height = h - 1;
+
+ return true;
+}
+
+static bool
+surface_get_gen6_image_slices(const struct ilo_dev *dev,
+ const struct ilo_state_surface_image_info *info,
+ uint16_t *depth, uint16_t *min_array_elem,
+ uint16_t *rt_view_extent)
+{
+ uint16_t max_slice, d;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /*
+ * From the Ivy Bridge PRM, volume 4 part 1, page 63:
+ *
+ * "If this field (Surface Array) is enabled, the Surface Type must be
+ * SURFTYPE_1D, SURFTYPE_2D, or SURFTYPE_CUBE. If this field is
+ * disabled and Surface Type is SURFTYPE_1D, SURFTYPE_2D, or
+ * SURFTYPE_CUBE, the Depth field must be set to zero."
+ *
+ * From the Ivy Bridge PRM, volume 4 part 1, page 69:
+ *
+ * "This field (Depth) specifies the total number of levels for a
+ * volume texture or the number of array elements allowed to be
+ * accessed starting at the Minimum Array Element for arrayed
+ * surfaces. If the volume texture is MIP-mapped, this field
+ * specifies the depth of the base MIP level."
+ *
+ * "For SURFTYPE_CUBE: For Sampling Engine Surfaces, the range of this
+ * field is [0,340], indicating the number of cube array elements
+ * (equal to the number of underlying 2D array elements divided by 6).
+ * For other surfaces, this field must be zero."
+ *
+ * "Errata: For SURFTYPE_CUBE sampling engine surfaces, the range of
+ * this field is limited to [0,85].
+ *
+ * Errata: If Surface Array is enabled, and Depth is between 1024 and
+ * 2047, an incorrect array slice may be accessed if the requested
+ * array index in the message is greater than or equal to 4096."
+ *
+ * The errata are Gen7-specific, and they limit the number of usable
+ * layers to (86 * 6), about 512.
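+ * (86 * 6 = 516, to be exact.)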
+ */
+
+ switch (get_gen6_surface_type(dev, info->img)) {
+ case GEN6_SURFTYPE_1D:
+ case GEN6_SURFTYPE_2D:
+ max_slice = (ilo_dev_gen(dev) >= ILO_GEN(7.5)) ? 2048 : 512;
+
+ assert(info->img->array_size <= max_slice);
+ max_slice = info->img->array_size;
+
+ d = info->slice_count;
+ if (info->is_cube_map) {
+ if (info->access == ILO_STATE_SURFACE_ACCESS_SAMPLER) {
+ if (!d || d % 6) {
+ ilo_warn("invalid cube slice count\n");
+ return false;
+ }
+
+ if (ilo_dev_gen(dev) == ILO_GEN(7) && d > 86 * 6) {
+ ilo_warn("cube slice count exceeds Gen7 limit\n");
+ return false;
+ }
+ } else {
+ /*
+ * Minimum Array Element and Depth must be 0; Render Target View
+ * Extent is ignored.
+ */
+ if (info->slice_base || d != 6) {
+ ilo_warn("no cube RT array support in data port\n");
+ return false;
+ }
+ }
+
+ d /= 6;
+ }
+
+ if (!info->is_array && d > 1) {
+ ilo_warn("non-array surface with non-zero depth\n");
+ return false;
+ }
+ break;
+ case GEN6_SURFTYPE_3D:
+ max_slice = 2048;
+
+ assert(info->img->depth0 <= max_slice);
+ max_slice = u_minify(info->img->depth0, info->level_base);
+
+ d = info->img->depth0;
+
+ if (info->is_array) {
+ ilo_warn("3D surfaces cannot be arrays\n");
+ return false;
+ }
+ break;
+ default:
+ assert(!"invalid surface type");
+ return false;
+ break;
+ }
+
+ if (!info->slice_count ||
+ info->slice_base + info->slice_count > max_slice) {
+ ilo_warn("invalid slice range\n");
+ return false;
+ }
+
+ assert(d);
+ *depth = d - 1;
+
+ /*
+ * From the Sandy Bridge PRM, volume 4 part 1, page 84:
+ *
+ * "For Sampling Engine and Render Target 1D and 2D Surfaces:
+ * This field (Minimum Array Element) indicates the minimum array
+ * element that can be accessed as part of this surface. This field
+ * is added to the delivered array index before it is used to address
+ * the surface.
+ *
+ * For Render Target 3D Surfaces:
+ * This field indicates the minimum `R' coordinate on the LOD
+ * currently being rendered to. This field is added to the delivered
+ * array index before it is used to address the surface.
+ *
+ * For Sampling Engine Cube Surfaces on [DevSNB+] only:
+ * This field indicates the minimum array element in the underlying 2D
+ * surface array that can be accessed as part of this surface (the
+ * cube array index is multiplied by 6 to compute this value, although
+ * this field is not restricted to only multiples of 6). This field is
+ * added to the delivered array index before it is used to address the
+ * surface.
+ *
+ * For Other Surfaces:
+ * This field must be set to zero."
+ *
+ * On Gen7+, typed surfaces are treated like sampling engine 1D and 2D
+ * surfaces.
+ */
+ *min_array_elem = info->slice_base;
+
+ /*
+ * From the Sandy Bridge PRM, volume 4 part 1, page 84:
+ *
+ * "For Render Target 3D Surfaces:
+ * This field (Render Target View Extent) indicates the extent of the
+ * accessible `R' coordinates minus 1 on the LOD currently being
+ * rendered to.
+ *
+ * For Render Target 1D and 2D Surfaces:
+ * This field must be set to the same value as the Depth field.
+ *
+ * For Other Surfaces:
+ * This field is ignored."
+ */
+ *rt_view_extent = info->slice_count - 1;
+
+ return true;
+}
+
+static bool
+surface_get_gen6_image_levels(const struct ilo_dev *dev,
+ const struct ilo_state_surface_image_info *info,
+ uint8_t *min_lod, uint8_t *mip_count)
+{
+ uint8_t max_level = (ilo_dev_gen(dev) >= ILO_GEN(7)) ?
15 : 14;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ assert(info->img->level_count <= max_level);
+ max_level = info->img->level_count;
+
+ if (!info->level_count ||
+ info->level_base + info->level_count > max_level) {
+ ilo_warn("invalid level range\n");
+ return false;
+ }
+
+ /*
+ * From the Sandy Bridge PRM, volume 4 part 1, page 79:
+ *
+ * "For Sampling Engine Surfaces:
+ * This field (MIP Count / LOD) indicates the number of MIP levels
+ * allowed to be accessed starting at Surface Min LOD, which must be
+ * less than or equal to the number of MIP levels actually stored in
+ * memory for this surface.
+ *
+ * Force the mip map access to be between the mipmap specified by the
+ * integer bits of the Min LOD and the ceiling of the value specified
+ * here.
+ *
+ * For Render Target Surfaces:
+ * This field defines the MIP level that is currently being rendered
+ * into. This is the absolute MIP level on the surface and is not
+ * relative to the Surface Min LOD field, which is ignored for render
+ * target surfaces.
+ *
+ * For Other Surfaces:
+ * This field is reserved : MBZ"
+ *
+ * From the Sandy Bridge PRM, volume 4 part 1, page 83:
+ *
+ * "For Sampling Engine Surfaces:
+ *
+ * This field (Surface Min LOD) indicates the most detailed LOD that
+ * can be accessed as part of this surface. This field is added to
+ * the delivered LOD (sample_l, ld, or resinfo message types) before
+ * it is used to address the surface.
+ *
+ * For Other Surfaces:
+ * This field is ignored."
+ *
+ * On Gen7+, typed surfaces are treated like sampling engine surfaces.
+ */
+ if (info->access == ILO_STATE_SURFACE_ACCESS_DP_RENDER) {
+ assert(info->level_count == 1);
+
+ *min_lod = 0;
+ *mip_count = info->level_base;
+ } else {
+ *min_lod = info->level_base;
+ *mip_count = info->level_count - 1;
+ }
+
+ return true;
+}
+
+static bool
+surface_get_gen6_image_sample_count(const struct ilo_dev *dev,
+ const struct ilo_state_surface_image_info *info,
+ enum gen_sample_count *sample_count)
+{
+ int min_gen;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ switch (info->img->sample_count) {
+ case 1:
+ *sample_count = GEN6_NUMSAMPLES_1;
+ min_gen = ILO_GEN(6);
+ break;
+ case 2:
+ *sample_count = GEN8_NUMSAMPLES_2;
+ min_gen = ILO_GEN(8);
+ break;
+ case 4:
+ *sample_count = GEN6_NUMSAMPLES_4;
+ min_gen = ILO_GEN(6);
+ break;
+ case 8:
+ *sample_count = GEN7_NUMSAMPLES_8;
+ min_gen = ILO_GEN(7);
+ break;
+ case 16:
+ *sample_count = GEN8_NUMSAMPLES_16;
+ min_gen = ILO_GEN(8);
+ break;
+ default:
+ assert(!"invalid sample count");
+ *sample_count = GEN6_NUMSAMPLES_1;
+ break;
+ }
+
+ assert(ilo_dev_gen(dev) >= min_gen);
+
+ return true;
+}
+
+static bool
+surface_get_gen6_image_alignments(const struct ilo_dev *dev,
+ const struct ilo_state_surface_image_info *info,
+ uint32_t *alignments)
+{
+ uint32_t a = 0;
+ bool err = false;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+ switch (info->img->align_i) {
+ case 4:
+ a |= GEN8_SURFACE_DW0_HALIGN_4;
+ break;
+ case 8:
+ a |= GEN8_SURFACE_DW0_HALIGN_8;
+ break;
+ case 16:
+ a |= GEN8_SURFACE_DW0_HALIGN_16;
+ break;
+ default:
+ err = true;
+ break;
+ }
+
+ switch (info->img->align_j) {
+ case 4:
+ a |= GEN7_SURFACE_DW0_VALIGN_4;
+ break;
+ case 8:
+ a |= GEN8_SURFACE_DW0_VALIGN_8;
+ break;
+ case 16:
+ a |= GEN8_SURFACE_DW0_VALIGN_16;
+ break;
+ default:
+ err = true;
+ break;
+ }
+ } else if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+ switch (info->img->align_i) {
+ case 4:
+ a |= GEN7_SURFACE_DW0_HALIGN_4;
+ break;
+ case 8:
+ a |= GEN7_SURFACE_DW0_HALIGN_8;
+
break; + default: + err = true; + break; + } + + switch (info->img->align_j) { + case 2: + a |= GEN7_SURFACE_DW0_VALIGN_2; + break; + case 4: + a |= GEN7_SURFACE_DW0_VALIGN_4; + break; + default: + err = true; + break; + } + } else { + if (info->img->align_i != 4) + err = true; + + switch (info->img->align_j) { + case 2: + a |= GEN6_SURFACE_DW5_VALIGN_2; + break; + case 4: + a |= GEN6_SURFACE_DW5_VALIGN_4; + break; + default: + err = true; + break; + } + } + + if (err) + assert(!"invalid HALIGN or VALIGN"); + + *alignments = a; + + return true; +} + +static bool +surface_set_gen6_image_SURFACE_STATE(struct ilo_state_surface *surf, + const struct ilo_dev *dev, + const struct ilo_state_surface_image_info *info) +{ + uint16_t width, height, depth, array_base, view_extent; + uint8_t min_lod, mip_count; + enum gen_sample_count sample_count; + uint32_t alignments; + enum gen_surface_type type; + uint32_t dw0, dw2, dw3, dw4, dw5; + + ILO_DEV_ASSERT(dev, 6, 6); + + if (!surface_validate_gen6_image(dev, info) || + !surface_get_gen6_image_extent(dev, info, &width, &height) || + !surface_get_gen6_image_slices(dev, info, &depth, &array_base, + &view_extent) || + !surface_get_gen6_image_levels(dev, info, &min_lod, &mip_count) || + !surface_get_gen6_image_sample_count(dev, info, &sample_count) || + !surface_get_gen6_image_alignments(dev, info, &alignments)) + return false; + + /* no ARYSPC_LOD0 */ + assert(info->img->walk != ILO_IMAGE_WALK_LOD); + /* no UMS/CMS */ + if (info->img->sample_count > 1) + assert(info->img->interleaved_samples); + + type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE : + get_gen6_surface_type(dev, info->img); + + dw0 = type << GEN6_SURFACE_DW0_TYPE__SHIFT | + info->format << GEN6_SURFACE_DW0_FORMAT__SHIFT | + GEN6_SURFACE_DW0_MIPLAYOUT_BELOW; + + /* + * From the Sandy Bridge PRM, volume 4 part 1, page 74: + * + * "CUBE_AVERAGE may only be selected if all of the Cube Face Enable + * fields are equal to one." + * + * From the Sandy Bridge PRM, volume 4 part 1, page 75-76: + * + * "For SURFTYPE_CUBE Surfaces accessed via the Sampling Engine: + * Bits 5:0 of this field (Cube Face Enables) enable the individual + * faces of a cube map. Enabling a face indicates that the face is + * present in the cube map, while disabling it indicates that that + * face is represented by the texture map's border color. Refer to + * Memory Data Formats for the correlation between faces and the cube + * map memory layout. Note that storage for disabled faces must be + * provided. + * + * For other surfaces: + * This field is reserved : MBZ" + * + * "When TEXCOORDMODE_CLAMP is used when accessing a cube map, this + * field must be programmed to 111111b (all faces enabled)." 
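+ *
+ * We have no use for disabled faces, so all six faces are always
+ * enabled for sampler access below.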
+ */ + if (info->is_cube_map && + info->access == ILO_STATE_SURFACE_ACCESS_SAMPLER) { + dw0 |= GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE_AVERAGE | + GEN6_SURFACE_DW0_CUBE_FACE_ENABLES__MASK; + } + + dw2 = height << GEN6_SURFACE_DW2_HEIGHT__SHIFT | + width << GEN6_SURFACE_DW2_WIDTH__SHIFT | + mip_count << GEN6_SURFACE_DW2_MIP_COUNT_LOD__SHIFT; + + dw3 = depth << GEN6_SURFACE_DW3_DEPTH__SHIFT | + (info->img->bo_stride - 1) << GEN6_SURFACE_DW3_PITCH__SHIFT | + info->img->tiling << GEN6_SURFACE_DW3_TILING__SHIFT; + + dw4 = min_lod << GEN6_SURFACE_DW4_MIN_LOD__SHIFT | + array_base << GEN6_SURFACE_DW4_MIN_ARRAY_ELEMENT__SHIFT | + view_extent << GEN6_SURFACE_DW4_RT_VIEW_EXTENT__SHIFT | + sample_count << GEN6_SURFACE_DW4_MULTISAMPLECOUNT__SHIFT; + + dw5 = alignments; + + STATIC_ASSERT(ARRAY_SIZE(surf->surface) >= 6); + surf->surface[0] = dw0; + surf->surface[1] = 0; + surf->surface[2] = dw2; + surf->surface[3] = dw3; + surf->surface[4] = dw4; + surf->surface[5] = dw5; + + surf->type = type; + surf->min_lod = min_lod; + surf->mip_count = mip_count; + + return true; +} + +static bool +surface_set_gen7_image_SURFACE_STATE(struct ilo_state_surface *surf, + const struct ilo_dev *dev, + const struct ilo_state_surface_image_info *info) +{ + uint16_t width, height, depth, array_base, view_extent; + uint8_t min_lod, mip_count; + uint32_t alignments; + enum gen_sample_count sample_count; + enum gen_surface_type type; + uint32_t dw0, dw1, dw2, dw3, dw4, dw5, dw7; + + ILO_DEV_ASSERT(dev, 7, 8); + + if (!surface_validate_gen6_image(dev, info) || + !surface_get_gen6_image_extent(dev, info, &width, &height) || + !surface_get_gen6_image_slices(dev, info, &depth, &array_base, + &view_extent) || + !surface_get_gen6_image_levels(dev, info, &min_lod, &mip_count) || + !surface_get_gen6_image_sample_count(dev, info, &sample_count) || + !surface_get_gen6_image_alignments(dev, info, &alignments)) + return false; + + type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE : + get_gen6_surface_type(dev, info->img); + + dw0 = type << GEN7_SURFACE_DW0_TYPE__SHIFT | + info->format << GEN7_SURFACE_DW0_FORMAT__SHIFT | + alignments; + + if (info->is_array) + dw0 |= GEN7_SURFACE_DW0_IS_ARRAY; + + if (ilo_dev_gen(dev) >= ILO_GEN(8)) { + dw0 |= info->img->tiling << GEN8_SURFACE_DW0_TILING__SHIFT; + } else { + dw0 |= info->img->tiling << GEN7_SURFACE_DW0_TILING__SHIFT; + + if (info->img->walk == ILO_IMAGE_WALK_LOD) + dw0 |= GEN7_SURFACE_DW0_ARYSPC_LOD0; + else + dw0 |= GEN7_SURFACE_DW0_ARYSPC_FULL; + } + + /* + * From the Ivy Bridge PRM, volume 4 part 1, page 67: + * + * "For SURFTYPE_CUBE Surfaces accessed via the Sampling Engine: Bits + * 5:0 of this field (Cube Face Enables) enable the individual faces + * of a cube map. Enabling a face indicates that the face is present + * in the cube map, while disabling it indicates that that face is + * represented by the texture map's border color. Refer to Memory Data + * Formats for the correlation between faces and the cube map memory + * layout. Note that storage for disabled faces must be provided. For + * other surfaces this field is reserved and MBZ." + * + * "When TEXCOORDMODE_CLAMP is used when accessing a cube map, this + * field must be programmed to 111111b (all faces enabled). This field + * is ignored unless the Surface Type is SURFTYPE_CUBE." 
+ */ + if (info->is_cube_map && + info->access == ILO_STATE_SURFACE_ACCESS_SAMPLER) + dw0 |= GEN7_SURFACE_DW0_CUBE_FACE_ENABLES__MASK; + + dw1 = 0; + if (ilo_dev_gen(dev) >= ILO_GEN(8)) { + assert(info->img->walk_layer_height % 4 == 0); + dw1 |= info->img->walk_layer_height / 4 << + GEN8_SURFACE_DW1_QPITCH__SHIFT; + } + + dw2 = height << GEN7_SURFACE_DW2_HEIGHT__SHIFT | + width << GEN7_SURFACE_DW2_WIDTH__SHIFT; + + dw3 = depth << GEN7_SURFACE_DW3_DEPTH__SHIFT | + (info->img->bo_stride - 1) << GEN7_SURFACE_DW3_PITCH__SHIFT; + + if (ilo_dev_gen(dev) == ILO_GEN(7.5)) + dw3 |= 0 << GEN75_SURFACE_DW3_INTEGER_SURFACE_FORMAT__SHIFT; + + dw4 = array_base << GEN7_SURFACE_DW4_MIN_ARRAY_ELEMENT__SHIFT | + view_extent << GEN7_SURFACE_DW4_RT_VIEW_EXTENT__SHIFT | + sample_count << GEN7_SURFACE_DW4_MULTISAMPLECOUNT__SHIFT; + + /* + * MSFMT_MSS means the samples are not interleaved and MSFMT_DEPTH_STENCIL + * means the samples are interleaved. The layouts are the same when the + * number of samples is 1. + */ + if (info->img->interleaved_samples && info->img->sample_count > 1) { + assert(info->access != ILO_STATE_SURFACE_ACCESS_DP_RENDER); + dw4 |= GEN7_SURFACE_DW4_MSFMT_DEPTH_STENCIL; + } else { + dw4 |= GEN7_SURFACE_DW4_MSFMT_MSS; + } + + dw5 = min_lod << GEN7_SURFACE_DW5_MIN_LOD__SHIFT | + mip_count << GEN7_SURFACE_DW5_MIP_COUNT_LOD__SHIFT; + + dw7 = 0; + if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) { + dw7 |= GEN_SHIFT32(GEN75_SCS_RED, GEN75_SURFACE_DW7_SCS_R) | + GEN_SHIFT32(GEN75_SCS_GREEN, GEN75_SURFACE_DW7_SCS_G) | + GEN_SHIFT32(GEN75_SCS_BLUE, GEN75_SURFACE_DW7_SCS_B) | + GEN_SHIFT32(GEN75_SCS_ALPHA, GEN75_SURFACE_DW7_SCS_A); + } + + STATIC_ASSERT(ARRAY_SIZE(surf->surface) >= 13); + surf->surface[0] = dw0; + surf->surface[1] = dw1; + surf->surface[2] = dw2; + surf->surface[3] = dw3; + surf->surface[4] = dw4; + surf->surface[5] = dw5; + surf->surface[6] = 0; + surf->surface[7] = dw7; + if (ilo_dev_gen(dev) >= ILO_GEN(8)) { + surf->surface[8] = 0; + surf->surface[9] = 0; + surf->surface[10] = 0; + surf->surface[11] = 0; + surf->surface[12] = 0; + } + + surf->type = type; + surf->min_lod = min_lod; + surf->mip_count = mip_count; + + return true; +} + +bool +ilo_state_surface_init_for_null(struct ilo_state_surface *surf, + const struct ilo_dev *dev) +{ + bool ret = true; + + assert(ilo_is_zeroed(surf, sizeof(*surf))); + + if (ilo_dev_gen(dev) >= ILO_GEN(7)) + ret &= surface_set_gen7_null_SURFACE_STATE(surf, dev); + else + ret &= surface_set_gen6_null_SURFACE_STATE(surf, dev); + + surf->type = GEN6_SURFTYPE_NULL; + surf->readonly = true; + + assert(ret); + + return ret; +} + +bool +ilo_state_surface_init_for_buffer(struct ilo_state_surface *surf, + const struct ilo_dev *dev, + const struct ilo_state_surface_buffer_info *info) +{ + bool ret = true; + + assert(ilo_is_zeroed(surf, sizeof(*surf))); + + if (ilo_dev_gen(dev) >= ILO_GEN(7)) + ret &= surface_set_gen7_buffer_SURFACE_STATE(surf, dev, info); + else + ret &= surface_set_gen6_buffer_SURFACE_STATE(surf, dev, info); + + surf->readonly = info->readonly; + + assert(ret); + + return ret; +} + +bool +ilo_state_surface_init_for_image(struct ilo_state_surface *surf, + const struct ilo_dev *dev, + const struct ilo_state_surface_image_info *info) +{ + bool ret = true; + + assert(ilo_is_zeroed(surf, sizeof(*surf))); + + if (ilo_dev_gen(dev) >= ILO_GEN(7)) + ret &= surface_set_gen7_image_SURFACE_STATE(surf, dev, info); + else + ret &= surface_set_gen6_image_SURFACE_STATE(surf, dev, info); + + surf->is_integer = info->is_integer; + surf->readonly = info->readonly; + 
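/* these flags are not part of the SURFACE_STATE dwords */
+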
surf->scanout = info->img->scanout; + + assert(ret); + + return ret; +} + +bool +ilo_state_surface_set_scs(struct ilo_state_surface *surf, + const struct ilo_dev *dev, + enum gen_surface_scs rgba[4]) +{ + const uint32_t scs = GEN_SHIFT32(rgba[0], GEN75_SURFACE_DW7_SCS_R) | + GEN_SHIFT32(rgba[1], GEN75_SURFACE_DW7_SCS_G) | + GEN_SHIFT32(rgba[2], GEN75_SURFACE_DW7_SCS_B) | + GEN_SHIFT32(rgba[3], GEN75_SURFACE_DW7_SCS_A); + + ILO_DEV_ASSERT(dev, 6, 8); + + assert(ilo_dev_gen(dev) >= ILO_GEN(7.5)); + + surf->surface[7] = (surf->surface[7] & ~GEN75_SURFACE_DW7_SCS__MASK) | scs; + + return true; +} diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.h b/src/gallium/drivers/ilo/core/ilo_state_surface.h new file mode 100644 index 00000000000..9c025428d50 --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_surface.h @@ -0,0 +1,121 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2015 LunarG, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#ifndef ILO_STATE_SURFACE_H +#define ILO_STATE_SURFACE_H + +#include "genhw/genhw.h" +#include "intel_winsys.h" + +#include "ilo_core.h" +#include "ilo_dev.h" + +struct ilo_buffer; +struct ilo_image; + +enum ilo_state_surface_access { + ILO_STATE_SURFACE_ACCESS_SAMPLER, /* sampling engine surfaces */ + ILO_STATE_SURFACE_ACCESS_DP_RENDER, /* render target surfaces */ + ILO_STATE_SURFACE_ACCESS_DP_TYPED, /* typed surfaces */ + ILO_STATE_SURFACE_ACCESS_DP_UNTYPED, /* untyped surfaces */ + ILO_STATE_SURFACE_ACCESS_DP_DATA, + ILO_STATE_SURFACE_ACCESS_DP_SVB, +}; + +struct ilo_state_surface_buffer_info { + const struct ilo_buffer *buf; + + enum ilo_state_surface_access access; + + enum gen_surface_format format; + uint8_t format_size; + + bool readonly; + uint16_t struct_size; + + uint32_t offset; + uint32_t size; +}; + +struct ilo_state_surface_image_info { + const struct ilo_image *img; + + enum ilo_state_surface_access access; + + enum gen_surface_format format; + bool is_integer; + + bool readonly; + bool is_cube_map; + bool is_array; + + uint8_t level_base; + uint8_t level_count; + uint16_t slice_base; + uint16_t slice_count; +}; + +struct ilo_state_surface { + uint32_t surface[13]; + + enum gen_surface_type type; + uint8_t min_lod; + uint8_t mip_count; + bool is_integer; + + bool readonly; + bool scanout; + + /* managed by users */ + struct intel_bo *bo; +}; + +bool +ilo_state_surface_valid_format(const struct ilo_dev *dev, + enum ilo_state_surface_access access, + enum gen_surface_format format); + +bool +ilo_state_surface_init_for_null(struct ilo_state_surface *surf, + const struct ilo_dev *dev); + +bool +ilo_state_surface_init_for_buffer(struct ilo_state_surface *surf, + const struct ilo_dev *dev, + const struct ilo_state_surface_buffer_info *info); + +bool +ilo_state_surface_init_for_image(struct ilo_state_surface *surf, + const struct ilo_dev *dev, + const struct ilo_state_surface_image_info *info); + +bool +ilo_state_surface_set_scs(struct ilo_state_surface *surf, + const struct ilo_dev *dev, + enum gen_surface_scs rgba[4]); + +#endif /* ILO_STATE_SURFACE_H */ diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface_format.c b/src/gallium/drivers/ilo/core/ilo_state_surface_format.c new file mode 100644 index 00000000000..a40c1b84d17 --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_surface_format.c @@ -0,0 +1,351 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2012-2013 LunarG, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#include "genhw/genhw.h" +#include "ilo_state_surface.h" + +static bool +surface_valid_sampler_format(const struct ilo_dev *dev, + enum ilo_state_surface_access access, + enum gen_surface_format format) +{ + /* + * This table is based on: + * + * - the Sandy Bridge PRM, volume 4 part 1, page 88-97 + * - the Ivy Bridge PRM, volume 4 part 1, page 84-87 + */ + static const struct sampler_cap { + int sampling; + int filtering; + int shadow_map; + int chroma_key; + } caps[] = { +#define CAP(sampling, filtering, shadow_map, chroma_key) \ + { ILO_GEN(sampling), ILO_GEN(filtering), ILO_GEN(shadow_map), ILO_GEN(chroma_key) } + [GEN6_FORMAT_R32G32B32A32_FLOAT] = CAP( 1, 5, 0, 0), + [GEN6_FORMAT_R32G32B32A32_SINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_R32G32B32A32_UINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_R32G32B32X32_FLOAT] = CAP( 1, 5, 0, 0), + [GEN6_FORMAT_R32G32B32_FLOAT] = CAP( 1, 5, 0, 0), + [GEN6_FORMAT_R32G32B32_SINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_R32G32B32_UINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_R16G16B16A16_UNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R16G16B16A16_SNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R16G16B16A16_SINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_R16G16B16A16_UINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_R16G16B16A16_FLOAT] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R32G32_FLOAT] = CAP( 1, 5, 0, 0), + [GEN6_FORMAT_R32G32_SINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_R32G32_UINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_R32_FLOAT_X8X24_TYPELESS] = CAP( 1, 5, 1, 0), + [GEN6_FORMAT_X32_TYPELESS_G8X24_UINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_L32A32_FLOAT] = CAP( 1, 5, 0, 0), + [GEN6_FORMAT_R16G16B16X16_UNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R16G16B16X16_FLOAT] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_A32X32_FLOAT] = CAP( 1, 5, 0, 0), + [GEN6_FORMAT_L32X32_FLOAT] = CAP( 1, 5, 0, 0), + [GEN6_FORMAT_I32X32_FLOAT] = CAP( 1, 5, 0, 0), + [GEN6_FORMAT_B8G8R8A8_UNORM] = CAP( 1, 1, 0, 1), + [GEN6_FORMAT_B8G8R8A8_UNORM_SRGB] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R10G10B10A2_UNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R10G10B10A2_UNORM_SRGB] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R10G10B10A2_UINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_R10G10B10_SNORM_A2_UNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R8G8B8A8_UNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R8G8B8A8_UNORM_SRGB] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R8G8B8A8_SNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R8G8B8A8_SINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_R8G8B8A8_UINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_R16G16_UNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R16G16_SNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R16G16_SINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_R16G16_UINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_R16G16_FLOAT] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_B10G10R10A2_UNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_B10G10R10A2_UNORM_SRGB] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R11G11B10_FLOAT] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R32_SINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_R32_UINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_R32_FLOAT] = CAP( 1, 5, 1, 0), + [GEN6_FORMAT_R24_UNORM_X8_TYPELESS] = CAP( 1, 5, 1, 0), + [GEN6_FORMAT_X24_TYPELESS_G8_UINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_L16A16_UNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_I24X8_UNORM] = CAP( 1, 5, 1, 0), + [GEN6_FORMAT_L24X8_UNORM] = 
CAP( 1, 5, 1, 0), + [GEN6_FORMAT_A24X8_UNORM] = CAP( 1, 5, 1, 0), + [GEN6_FORMAT_I32_FLOAT] = CAP( 1, 5, 1, 0), + [GEN6_FORMAT_L32_FLOAT] = CAP( 1, 5, 1, 0), + [GEN6_FORMAT_A32_FLOAT] = CAP( 1, 5, 1, 0), + [GEN6_FORMAT_B8G8R8X8_UNORM] = CAP( 1, 1, 0, 1), + [GEN6_FORMAT_B8G8R8X8_UNORM_SRGB] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R8G8B8X8_UNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R8G8B8X8_UNORM_SRGB] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R9G9B9E5_SHAREDEXP] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_B10G10R10X2_UNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_L16A16_FLOAT] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_B5G6R5_UNORM] = CAP( 1, 1, 0, 1), + [GEN6_FORMAT_B5G6R5_UNORM_SRGB] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_B5G5R5A1_UNORM] = CAP( 1, 1, 0, 1), + [GEN6_FORMAT_B5G5R5A1_UNORM_SRGB] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_B4G4R4A4_UNORM] = CAP( 1, 1, 0, 1), + [GEN6_FORMAT_B4G4R4A4_UNORM_SRGB] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R8G8_UNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R8G8_SNORM] = CAP( 1, 1, 0, 1), + [GEN6_FORMAT_R8G8_SINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_R8G8_UINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_R16_UNORM] = CAP( 1, 1, 1, 0), + [GEN6_FORMAT_R16_SNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R16_SINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_R16_UINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_R16_FLOAT] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_A8P8_UNORM_PALETTE0] = CAP( 5, 5, 0, 0), + [GEN6_FORMAT_A8P8_UNORM_PALETTE1] = CAP( 5, 5, 0, 0), + [GEN6_FORMAT_I16_UNORM] = CAP( 1, 1, 1, 0), + [GEN6_FORMAT_L16_UNORM] = CAP( 1, 1, 1, 0), + [GEN6_FORMAT_A16_UNORM] = CAP( 1, 1, 1, 0), + [GEN6_FORMAT_L8A8_UNORM] = CAP( 1, 1, 0, 1), + [GEN6_FORMAT_I16_FLOAT] = CAP( 1, 1, 1, 0), + [GEN6_FORMAT_L16_FLOAT] = CAP( 1, 1, 1, 0), + [GEN6_FORMAT_A16_FLOAT] = CAP( 1, 1, 1, 0), + [GEN6_FORMAT_L8A8_UNORM_SRGB] = CAP(4.5, 4.5, 0, 0), + [GEN6_FORMAT_R5G5_SNORM_B6_UNORM] = CAP( 1, 1, 0, 1), + [GEN6_FORMAT_P8A8_UNORM_PALETTE0] = CAP( 5, 5, 0, 0), + [GEN6_FORMAT_P8A8_UNORM_PALETTE1] = CAP( 5, 5, 0, 0), + [GEN6_FORMAT_R8_UNORM] = CAP( 1, 1, 0, 4.5), + [GEN6_FORMAT_R8_SNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R8_SINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_R8_UINT] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_A8_UNORM] = CAP( 1, 1, 0, 1), + [GEN6_FORMAT_I8_UNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_L8_UNORM] = CAP( 1, 1, 0, 1), + [GEN6_FORMAT_P4A4_UNORM_PALETTE0] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_A4P4_UNORM_PALETTE0] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_P8_UNORM_PALETTE0] = CAP(4.5, 4.5, 0, 0), + [GEN6_FORMAT_L8_UNORM_SRGB] = CAP(4.5, 4.5, 0, 0), + [GEN6_FORMAT_P8_UNORM_PALETTE1] = CAP(4.5, 4.5, 0, 0), + [GEN6_FORMAT_P4A4_UNORM_PALETTE1] = CAP(4.5, 4.5, 0, 0), + [GEN6_FORMAT_A4P4_UNORM_PALETTE1] = CAP(4.5, 4.5, 0, 0), + [GEN6_FORMAT_DXT1_RGB_SRGB] = CAP(4.5, 4.5, 0, 0), + [GEN6_FORMAT_R1_UNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_YCRCB_NORMAL] = CAP( 1, 1, 0, 1), + [GEN6_FORMAT_YCRCB_SWAPUVY] = CAP( 1, 1, 0, 1), + [GEN6_FORMAT_P2_UNORM_PALETTE0] = CAP(4.5, 4.5, 0, 0), + [GEN6_FORMAT_P2_UNORM_PALETTE1] = CAP(4.5, 4.5, 0, 0), + [GEN6_FORMAT_BC1_UNORM] = CAP( 1, 1, 0, 1), + [GEN6_FORMAT_BC2_UNORM] = CAP( 1, 1, 0, 1), + [GEN6_FORMAT_BC3_UNORM] = CAP( 1, 1, 0, 1), + [GEN6_FORMAT_BC4_UNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_BC5_UNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_BC1_UNORM_SRGB] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_BC2_UNORM_SRGB] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_BC3_UNORM_SRGB] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_MONO8] = CAP( 1, 0, 0, 0), + [GEN6_FORMAT_YCRCB_SWAPUV] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_YCRCB_SWAPY] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_DXT1_RGB] = CAP( 1, 1, 0, 0), 
+ [GEN6_FORMAT_FXT1] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_BC4_SNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_BC5_SNORM] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R16G16B16_FLOAT] = CAP( 5, 5, 0, 0), + [GEN6_FORMAT_BC6H_SF16] = CAP( 7, 7, 0, 0), + [GEN6_FORMAT_BC7_UNORM] = CAP( 7, 7, 0, 0), + [GEN6_FORMAT_BC7_UNORM_SRGB] = CAP( 7, 7, 0, 0), + [GEN6_FORMAT_BC6H_UF16] = CAP( 7, 7, 0, 0), +#undef CAP + }; + + ILO_DEV_ASSERT(dev, 6, 8); + + return (format < ARRAY_SIZE(caps) && caps[format].sampling && + ilo_dev_gen(dev) >= caps[format].sampling); +} + +static bool +surface_valid_dp_format(const struct ilo_dev *dev, + enum ilo_state_surface_access access, + enum gen_surface_format format) +{ + /* + * This table is based on: + * + * - the Sandy Bridge PRM, volume 4 part 1, page 88-97 + * - the Ivy Bridge PRM, volume 4 part 1, page 172, 252-253, and 277-278 + * - the Haswell PRM, volume 7, page 262-264 + */ + static const struct dp_cap { + int rt_write; + int rt_write_blending; + int typed_write; + int media_color_processing; + } caps[] = { +#define CAP(rt_write, rt_write_blending, typed_write, media_color_processing) \ + { ILO_GEN(rt_write), ILO_GEN(rt_write_blending), ILO_GEN(typed_write), ILO_GEN(media_color_processing) } + [GEN6_FORMAT_R32G32B32A32_FLOAT] = CAP( 1, 1, 7, 0), + [GEN6_FORMAT_R32G32B32A32_SINT] = CAP( 1, 0, 7, 0), + [GEN6_FORMAT_R32G32B32A32_UINT] = CAP( 1, 0, 7, 0), + [GEN6_FORMAT_R16G16B16A16_UNORM] = CAP( 1, 4.5, 7, 6), + [GEN6_FORMAT_R16G16B16A16_SNORM] = CAP( 1, 6, 7, 0), + [GEN6_FORMAT_R16G16B16A16_SINT] = CAP( 1, 0, 7, 0), + [GEN6_FORMAT_R16G16B16A16_UINT] = CAP( 1, 0, 7, 0), + [GEN6_FORMAT_R16G16B16A16_FLOAT] = CAP( 1, 1, 7, 0), + [GEN6_FORMAT_R32G32_FLOAT] = CAP( 1, 1, 7, 0), + [GEN6_FORMAT_R32G32_SINT] = CAP( 1, 0, 7, 0), + [GEN6_FORMAT_R32G32_UINT] = CAP( 1, 0, 7, 0), + [GEN6_FORMAT_B8G8R8A8_UNORM] = CAP( 1, 1, 7, 6), + [GEN6_FORMAT_B8G8R8A8_UNORM_SRGB] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R10G10B10A2_UNORM] = CAP( 1, 1, 7, 6), + [GEN6_FORMAT_R10G10B10A2_UNORM_SRGB] = CAP( 0, 0, 0, 6), + [GEN6_FORMAT_R10G10B10A2_UINT] = CAP( 1, 0, 7, 0), + [GEN6_FORMAT_R8G8B8A8_UNORM] = CAP( 1, 1, 7, 6), + [GEN6_FORMAT_R8G8B8A8_UNORM_SRGB] = CAP( 1, 1, 0, 6), + [GEN6_FORMAT_R8G8B8A8_SNORM] = CAP( 1, 6, 7, 0), + [GEN6_FORMAT_R8G8B8A8_SINT] = CAP( 1, 0, 7, 0), + [GEN6_FORMAT_R8G8B8A8_UINT] = CAP( 1, 0, 7, 0), + [GEN6_FORMAT_R16G16_UNORM] = CAP( 1, 4.5, 7, 0), + [GEN6_FORMAT_R16G16_SNORM] = CAP( 1, 6, 7, 0), + [GEN6_FORMAT_R16G16_SINT] = CAP( 1, 0, 7, 0), + [GEN6_FORMAT_R16G16_UINT] = CAP( 1, 0, 7, 0), + [GEN6_FORMAT_R16G16_FLOAT] = CAP( 1, 1, 7, 0), + [GEN6_FORMAT_B10G10R10A2_UNORM] = CAP( 1, 1, 7, 6), + [GEN6_FORMAT_B10G10R10A2_UNORM_SRGB] = CAP( 1, 1, 0, 6), + [GEN6_FORMAT_R11G11B10_FLOAT] = CAP( 1, 1, 7, 0), + [GEN6_FORMAT_R32_SINT] = CAP( 1, 0, 7, 0), + [GEN6_FORMAT_R32_UINT] = CAP( 1, 0, 7, 0), + [GEN6_FORMAT_R32_FLOAT] = CAP( 1, 1, 7, 0), + [GEN6_FORMAT_B8G8R8X8_UNORM] = CAP( 0, 0, 0, 6), + [GEN6_FORMAT_B5G6R5_UNORM] = CAP( 1, 1, 7, 0), + [GEN6_FORMAT_B5G6R5_UNORM_SRGB] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_B5G5R5A1_UNORM] = CAP( 1, 1, 7, 0), + [GEN6_FORMAT_B5G5R5A1_UNORM_SRGB] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_B4G4R4A4_UNORM] = CAP( 1, 1, 7, 0), + [GEN6_FORMAT_B4G4R4A4_UNORM_SRGB] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R8G8_UNORM] = CAP( 1, 1, 7, 0), + [GEN6_FORMAT_R8G8_SNORM] = CAP( 1, 6, 7, 0), + [GEN6_FORMAT_R8G8_SINT] = CAP( 1, 0, 7, 0), + [GEN6_FORMAT_R8G8_UINT] = CAP( 1, 0, 7, 0), + [GEN6_FORMAT_R16_UNORM] = CAP( 1, 4.5, 7, 7), + [GEN6_FORMAT_R16_SNORM] = CAP( 1, 6, 7, 0), + 
[GEN6_FORMAT_R16_SINT] = CAP( 1, 0, 7, 0), + [GEN6_FORMAT_R16_UINT] = CAP( 1, 0, 7, 0), + [GEN6_FORMAT_R16_FLOAT] = CAP( 1, 1, 7, 0), + [GEN6_FORMAT_B5G5R5X1_UNORM] = CAP( 1, 1, 7, 0), + [GEN6_FORMAT_B5G5R5X1_UNORM_SRGB] = CAP( 1, 1, 0, 0), + [GEN6_FORMAT_R8_UNORM] = CAP( 1, 1, 7, 0), + [GEN6_FORMAT_R8_SNORM] = CAP( 1, 6, 7, 0), + [GEN6_FORMAT_R8_SINT] = CAP( 1, 0, 7, 0), + [GEN6_FORMAT_R8_UINT] = CAP( 1, 0, 7, 0), + [GEN6_FORMAT_A8_UNORM] = CAP( 1, 1, 7, 0), + [GEN6_FORMAT_YCRCB_NORMAL] = CAP( 1, 0, 0, 6), + [GEN6_FORMAT_YCRCB_SWAPUVY] = CAP( 1, 0, 0, 6), + [GEN6_FORMAT_YCRCB_SWAPUV] = CAP( 1, 0, 0, 6), + [GEN6_FORMAT_YCRCB_SWAPY] = CAP( 1, 0, 0, 6), +#undef CAP + }; + + ILO_DEV_ASSERT(dev, 6, 8); + + if (format >= ARRAY_SIZE(caps)) + return false; + + switch (access) { + case ILO_STATE_SURFACE_ACCESS_DP_RENDER: + return (caps[format].rt_write && + ilo_dev_gen(dev) >= caps[format].rt_write); + case ILO_STATE_SURFACE_ACCESS_DP_TYPED: + return (caps[format].typed_write && + ilo_dev_gen(dev) >= caps[format].typed_write); + case ILO_STATE_SURFACE_ACCESS_DP_UNTYPED: + return (format == GEN6_FORMAT_RAW); + case ILO_STATE_SURFACE_ACCESS_DP_DATA: + /* ignored, but can it be raw? */ + assert(format != GEN6_FORMAT_RAW); + return true; + default: + return false; + } +} + +static bool +surface_valid_svb_format(const struct ilo_dev *dev, + enum gen_surface_format format) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * This table is based on: + * + * - the Sandy Bridge PRM, volume 4 part 1, page 88-97 + * - the Ivy Bridge PRM, volume 2 part 1, page 195 + * - the Haswell PRM, volume 7, page 535 + */ + switch (format) { + case GEN6_FORMAT_R32G32B32A32_FLOAT: + case GEN6_FORMAT_R32G32B32A32_SINT: + case GEN6_FORMAT_R32G32B32A32_UINT: + case GEN6_FORMAT_R32G32B32_FLOAT: + case GEN6_FORMAT_R32G32B32_SINT: + case GEN6_FORMAT_R32G32B32_UINT: + case GEN6_FORMAT_R32G32_FLOAT: + case GEN6_FORMAT_R32G32_SINT: + case GEN6_FORMAT_R32G32_UINT: + case GEN6_FORMAT_R32_SINT: + case GEN6_FORMAT_R32_UINT: + case GEN6_FORMAT_R32_FLOAT: + return true; + default: + return false; + } +} + +bool +ilo_state_surface_valid_format(const struct ilo_dev *dev, + enum ilo_state_surface_access access, + enum gen_surface_format format) +{ + bool valid; + + switch (access) { + case ILO_STATE_SURFACE_ACCESS_SAMPLER: + valid = surface_valid_sampler_format(dev, access, format); + break; + case ILO_STATE_SURFACE_ACCESS_DP_RENDER: + case ILO_STATE_SURFACE_ACCESS_DP_TYPED: + case ILO_STATE_SURFACE_ACCESS_DP_UNTYPED: + case ILO_STATE_SURFACE_ACCESS_DP_DATA: + valid = surface_valid_dp_format(dev, access, format); + break; + case ILO_STATE_SURFACE_ACCESS_DP_SVB: + valid = surface_valid_svb_format(dev, format); + break; + default: + valid = false; + break; + } + + return valid; +} diff --git a/src/gallium/drivers/ilo/core/ilo_state_urb.c b/src/gallium/drivers/ilo/core/ilo_state_urb.c new file mode 100644 index 00000000000..cbd150c71c9 --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_urb.c @@ -0,0 +1,769 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2012-2015 LunarG, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#include "ilo_debug.h" +#include "ilo_state_urb.h" + +struct urb_configuration { + uint8_t vs_pcb_alloc_kb; + uint8_t hs_pcb_alloc_kb; + uint8_t ds_pcb_alloc_kb; + uint8_t gs_pcb_alloc_kb; + uint8_t ps_pcb_alloc_kb; + + uint8_t urb_offset_8kb; + + uint8_t vs_urb_alloc_8kb; + uint8_t hs_urb_alloc_8kb; + uint8_t ds_urb_alloc_8kb; + uint8_t gs_urb_alloc_8kb; + + uint8_t vs_entry_rows; + uint8_t hs_entry_rows; + uint8_t ds_entry_rows; + uint8_t gs_entry_rows; + + int vs_entry_count; + int hs_entry_count; + int ds_entry_count; + int gs_entry_count; +}; + +static void +urb_alloc_gen7_pcb(const struct ilo_dev *dev, + const struct ilo_state_urb_info *info, + struct urb_configuration *conf) +{ + /* + * From the Haswell PRM, volume 2b, page 940: + * + * "[0,16] (0KB - 16KB) Increments of 1KB DevHSW:GT1, DevHSW:GT2 + * [0,32] (0KB - 32KB) Increments of 2KB DevHSW:GT3" + */ + const uint8_t increment_kb = + (ilo_dev_gen(dev) >= ILO_GEN(8) || + (ilo_dev_gen(dev) == ILO_GEN(7.5) && dev->gt == 3)) ? 2 : 1; + + ILO_DEV_ASSERT(dev, 7, 8); + + /* + * Keep the strategy simple as we do not know the workloads and how + * expensive it is to change the configuration frequently. + */ + if (info->hs_const_data || info->ds_const_data) { + conf->vs_pcb_alloc_kb = increment_kb * 4; + conf->hs_pcb_alloc_kb = increment_kb * 3; + conf->ds_pcb_alloc_kb = increment_kb * 3; + conf->gs_pcb_alloc_kb = increment_kb * 3; + conf->ps_pcb_alloc_kb = increment_kb * 3; + } else if (info->gs_const_data) { + conf->vs_pcb_alloc_kb = increment_kb * 6; + conf->gs_pcb_alloc_kb = increment_kb * 5; + conf->ps_pcb_alloc_kb = increment_kb * 5; + } else { + conf->vs_pcb_alloc_kb = increment_kb * 8; + conf->ps_pcb_alloc_kb = increment_kb * 8; + } + + conf->urb_offset_8kb = increment_kb * 16 / 8; +} + +static void +urb_alloc_gen6_urb(const struct ilo_dev *dev, + const struct ilo_state_urb_info *info, + struct urb_configuration *conf) +{ + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 34: + * + * "(VS URB Starting Address) Offset from the start of the URB memory + * where VS starts its allocation, specified in multiples of 8 KB." + * + * Same for other stages. 
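+    *
+    * A rough example, assuming a hypothetical Gen8 part with a 256KB URB:
+    * urb_offset_8kb would be 2 * 16 / 8 = 4 (32KB of PCB space), leaving
+    * space_avail_8kb = 256 / 8 - 4 = 28 units to be divided among the
+    * enabled stages below.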
+ */ + const int space_avail_8kb = dev->urb_size / 8192 - conf->urb_offset_8kb; + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 173: + * + * "Programming Note: If the GS stage is enabled, software must always + * allocate at least one GS URB Entry. This is true even if the GS + * thread never needs to output vertices to the urb, e.g., when only + * performing stream output. This is an artifact of the need to pass + * the GS thread an initial destination URB handle." + */ + const bool force_gs_alloc = + (ilo_dev_gen(dev) == ILO_GEN(6) && info->gs_enable); + + ILO_DEV_ASSERT(dev, 6, 8); + + if (info->hs_entry_size || info->ds_entry_size) { + conf->vs_urb_alloc_8kb = space_avail_8kb / 4; + conf->hs_urb_alloc_8kb = space_avail_8kb / 4; + conf->ds_urb_alloc_8kb = space_avail_8kb / 4; + conf->gs_urb_alloc_8kb = space_avail_8kb / 4; + + if (space_avail_8kb % 4) { + assert(space_avail_8kb % 2 == 0); + conf->vs_urb_alloc_8kb++; + conf->gs_urb_alloc_8kb++; + } + } else if (info->gs_entry_size || force_gs_alloc) { + assert(space_avail_8kb % 2 == 0); + conf->vs_urb_alloc_8kb = space_avail_8kb / 2; + conf->gs_urb_alloc_8kb = space_avail_8kb / 2; + } else { + conf->vs_urb_alloc_8kb = space_avail_8kb; + } +} + +static bool +urb_init_gen6_vs_entry(const struct ilo_dev *dev, + const struct ilo_state_urb_info *info, + struct urb_configuration *conf) +{ + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 28: + * + * "(VS URB Entry Allocation Size) + * Range [0,4] = [1,5] 1024-bit URB rows" + * + * "(VS Number of URB Entries) + * Range [24,256] in multiples of 4 + * [24, 128] in multiples of 4[DevSNBGT1]" + */ + const int max_entry_count = (dev->gt == 2) ? 256 : 252; + const int row_size = 1024 / 8; + int row_count, entry_count; + int entry_size; + + ILO_DEV_ASSERT(dev, 6, 6); + + /* VE and VS share the same VUE for each vertex */ + entry_size = info->vs_entry_size; + if (entry_size < info->ve_entry_size) + entry_size = info->ve_entry_size; + + row_count = (entry_size + row_size - 1) / row_size; + if (row_count > 5) + return false; + else if (!row_count) + row_count++; + + entry_count = conf->vs_urb_alloc_8kb * 8192 / (row_size * row_count); + if (entry_count > max_entry_count) + entry_count = max_entry_count; + entry_count &= ~3; + assert(entry_count >= 24); + + conf->vs_entry_rows = row_count; + conf->vs_entry_count = entry_count; + + return true; +} + +static bool +urb_init_gen6_gs_entry(const struct ilo_dev *dev, + const struct ilo_state_urb_info *info, + struct urb_configuration *conf) +{ + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 29: + * + * "(GS Number of URB Entries) + * Range [0,256] in multiples of 4 + * [0, 254] in multiples of 4[DevSNBGT1]" + * + * "(GS URB Entry Allocation Size) + * Range [0,4] = [1,5] 1024-bit URB rows" + */ + const int max_entry_count = (dev->gt == 2) ? 
256 : 252; + const int row_size = 1024 / 8; + int row_count, entry_count; + + ILO_DEV_ASSERT(dev, 6, 6); + + row_count = (info->gs_entry_size + row_size - 1) / row_size; + if (row_count > 5) + return false; + else if (!row_count) + row_count++; + + entry_count = conf->gs_urb_alloc_8kb * 8192 / (row_size * row_count); + if (entry_count > max_entry_count) + entry_count = max_entry_count; + entry_count &= ~3; + + conf->gs_entry_rows = row_count; + conf->gs_entry_count = entry_count; + + return true; +} + +static bool +urb_init_gen7_vs_entry(const struct ilo_dev *dev, + const struct ilo_state_urb_info *info, + struct urb_configuration *conf) +{ + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 34-35: + * + * "VS URB Entry Allocation Size equal to 4(5 512-bit URB rows) may + * cause performance to decrease due to banking in the URB. Element + * sizes of 16 to 20 should be programmed with six 512-bit URB rows." + * + * "(VS URB Entry Allocation Size) + * Format: U9-1 count of 512-bit units" + * + * "(VS Number of URB Entries) + * [32,704] + * [32,512] + * + * Programming Restriction: VS Number of URB Entries must be divisible + * by 8 if the VS URB Entry Allocation Size is less than 9 512-bit URB + * entries."2:0" = reserved "000b"" + * + * From the Haswell PRM, volume 2b, page 847: + * + * "(VS Number of URB Entries) + * [64,1664] DevHSW:GT3 + * [64,1664] DevHSW:GT2 + * [32,640] DevHSW:GT1" + */ + const int row_size = 512 / 8; + int row_count, entry_count; + int entry_size; + int max_entry_count, min_entry_count; + + ILO_DEV_ASSERT(dev, 7, 8); + + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 35: + * + * "Programming Restriction: As the VS URB entry serves as both the + * per-vertex input and output of the VS shader, the VS URB Allocation + * Size must be sized to the maximum of the vertex input and output + * structures." + * + * From the Ivy Bridge PRM, volume 2 part 1, page 42: + * + * "If the VS function is enabled, the VF-written VUEs are not required + * to have Vertex Headers, as the VS-incoming vertices are guaranteed + * to be consumed by the VS (i.e., the VS thread is responsible for + * overwriting the input vertex data)." + * + * VE and VS share the same VUE for each vertex. + */ + entry_size = info->vs_entry_size; + if (entry_size < info->ve_entry_size) + entry_size = info->ve_entry_size; + + row_count = (entry_size + row_size - 1) / row_size; + if (row_count == 5 || !row_count) + row_count++; + + entry_count = conf->vs_urb_alloc_8kb * 8192 / (row_size * row_count); + if (row_count < 9) + entry_count &= ~7; + + switch (ilo_dev_gen(dev)) { + case ILO_GEN(8): + case ILO_GEN(7.5): + max_entry_count = (dev->gt >= 2) ? 1664 : 640; + min_entry_count = (dev->gt >= 2) ? 64 : 32; + break; + case ILO_GEN(7): + max_entry_count = (dev->gt == 2) ? 
704 : 512; + min_entry_count = 32; + break; + default: + assert(!"unexpected gen"); + return false; + break; + } + + if (entry_count > max_entry_count) + entry_count = max_entry_count; + else if (entry_count < min_entry_count) + return false; + + conf->vs_entry_rows = row_count; + conf->vs_entry_count = entry_count; + + return true; +} + +static bool +urb_init_gen7_hs_entry(const struct ilo_dev *dev, + const struct ilo_state_urb_info *info, + struct urb_configuration *conf) +{ + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 37: + * + * "HS Number of URB Entries must be divisible by 8 if the HS URB Entry + * Allocation Size is less than 9 512-bit URB + * entries."2:0" = reserved "000" + * + * [0,64] + * [0,32]" + * + * From the Haswell PRM, volume 2b, page 849: + * + * "(HS Number of URB Entries) + * [0,128] DevHSW:GT2 + * [0,64] DevHSW:GT1" + */ + const int row_size = 512 / 8; + int row_count, entry_count; + int max_entry_count; + + ILO_DEV_ASSERT(dev, 7, 8); + + row_count = (info->hs_entry_size + row_size - 1) / row_size; + if (!row_count) + row_count++; + + entry_count = conf->hs_urb_alloc_8kb * 8192 / (row_size * row_count); + if (row_count < 9) + entry_count &= ~7; + + switch (ilo_dev_gen(dev)) { + case ILO_GEN(8): + case ILO_GEN(7.5): + max_entry_count = (dev->gt >= 2) ? 128 : 64; + break; + case ILO_GEN(7): + max_entry_count = (dev->gt == 2) ? 64 : 32; + break; + default: + assert(!"unexpected gen"); + return false; + break; + } + + if (entry_count > max_entry_count) + entry_count = max_entry_count; + else if (info->hs_entry_size && !entry_count) + return false; + + conf->hs_entry_rows = row_count; + conf->hs_entry_count = entry_count; + + return true; +} + +static bool +urb_init_gen7_ds_entry(const struct ilo_dev *dev, + const struct ilo_state_urb_info *info, + struct urb_configuration *conf) +{ + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 38: + * + * "(DS URB Entry Allocation Size) + * [0,9]" + * + * "(DS Number of URB Entries) If Domain Shader Thread Dispatch is + * Enabled then the minimum number handles that must be allocated is + * 138 URB entries. + * "2:0" = reserved "000" + * + * [0,448] + * [0,288] + * + * DS Number of URB Entries must be divisible by 8 if the DS URB Entry + * Allocation Size is less than 9 512-bit URB entries.If Domain Shader + * Thread Dispatch is Enabled then the minimum number of handles that + * must be allocated is 10 URB entries." + * + * From the Haswell PRM, volume 2b, page 851: + * + * "(DS Number of URB Entries) + * [0,960] DevHSW:GT2 + * [0,384] DevHSW:GT1" + */ + const int row_size = 512 / 8; + int row_count, entry_count; + int max_entry_count; + + ILO_DEV_ASSERT(dev, 7, 8); + + row_count = (info->ds_entry_size + row_size - 1) / row_size; + if (row_count > 10) + return false; + else if (!row_count) + row_count++; + + entry_count = conf->ds_urb_alloc_8kb * 8192 / (row_size * row_count); + if (row_count < 9) + entry_count &= ~7; + + switch (ilo_dev_gen(dev)) { + case ILO_GEN(8): + case ILO_GEN(7.5): + max_entry_count = (dev->gt >= 2) ? 960 : 384; + break; + case ILO_GEN(7): + max_entry_count = (dev->gt == 2) ? 
448 : 288; + break; + default: + assert(!"unexpected gen"); + return false; + break; + } + + if (entry_count > max_entry_count) + entry_count = max_entry_count; + else if (info->ds_entry_size && entry_count < 10) + return false; + + conf->ds_entry_rows = row_count; + conf->ds_entry_count = entry_count; + + return true; +} + +static bool +urb_init_gen7_gs_entry(const struct ilo_dev *dev, + const struct ilo_state_urb_info *info, + struct urb_configuration *conf) +{ + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 40: + * + * "(GS Number of URB Entries) GS Number of URB Entries must be + * divisible by 8 if the GS URB Entry Allocation Size is less than 9 + * 512-bit URB entries. + * "2:0" = reserved "000" + * + * [0,320] + * [0,192]" + * + * From the Ivy Bridge PRM, volume 2 part 1, page 171: + * + * "(DUAL_INSTANCE and DUAL_OBJECT) The GS must be allocated at least + * two URB handles or behavior is UNDEFINED." + * + * From the Haswell PRM, volume 2b, page 853: + * + * "(GS Number of URB Entries) + * [0,640] DevHSW:GT2 + * [0,256] DevHSW:GT1 + * + * Only if GS is disabled can this field be programmed to 0. If GS is + * enabled this field shall be programmed to a value greater than 0. + * For GS Dispatch Mode "Single", this field shall be programmed to a + * value greater than or equal to 1. For other GS Dispatch Modes, + * refer to the definition of Dispatch Mode (3DSTATE_GS) for minimum + * values of this field." + */ + const int row_size = 512 / 8; + int row_count, entry_count; + int max_entry_count; + + ILO_DEV_ASSERT(dev, 7, 8); + + row_count = (info->gs_entry_size + row_size - 1) / row_size; + if (!row_count) + row_count++; + + entry_count = conf->gs_urb_alloc_8kb * 8192 / (row_size * row_count); + if (row_count < 9) + entry_count &= ~7; + + switch (ilo_dev_gen(dev)) { + case ILO_GEN(8): + case ILO_GEN(7.5): + max_entry_count = (dev->gt >= 2) ? 640 : 256; + break; + case ILO_GEN(7): + max_entry_count = (dev->gt == 2) ? 
320 : 192; + break; + default: + assert(!"unexpected gen"); + return false; + break; + } + + if (entry_count > max_entry_count) + entry_count = max_entry_count; + else if (info->gs_entry_size && entry_count < 2) + return false; + + conf->gs_entry_rows = row_count; + conf->gs_entry_count = entry_count; + + return true; +} + +static bool +urb_get_gen6_configuration(const struct ilo_dev *dev, + const struct ilo_state_urb_info *info, + struct urb_configuration *conf) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + memset(conf, 0, sizeof(*conf)); + + if (ilo_dev_gen(dev) >= ILO_GEN(7)) + urb_alloc_gen7_pcb(dev, info, conf); + + urb_alloc_gen6_urb(dev, info, conf); + + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + if (!urb_init_gen7_vs_entry(dev, info, conf) || + !urb_init_gen7_hs_entry(dev, info, conf) || + !urb_init_gen7_ds_entry(dev, info, conf) || + !urb_init_gen7_gs_entry(dev, info, conf)) + return false; + } else { + if (!urb_init_gen6_vs_entry(dev, info, conf) || + !urb_init_gen6_gs_entry(dev, info, conf)) + return false; + } + + return true; +} + +static bool +urb_set_gen7_3dstate_push_constant_alloc(struct ilo_state_urb *urb, + const struct ilo_dev *dev, + const struct ilo_state_urb_info *info, + const struct urb_configuration *conf) +{ + uint32_t dw1[5]; + uint8_t sizes_kb[5], offset_kb; + int i; + + ILO_DEV_ASSERT(dev, 7, 8); + + sizes_kb[0] = conf->vs_pcb_alloc_kb; + sizes_kb[1] = conf->hs_pcb_alloc_kb; + sizes_kb[2] = conf->ds_pcb_alloc_kb; + sizes_kb[3] = conf->gs_pcb_alloc_kb; + sizes_kb[4] = conf->ps_pcb_alloc_kb; + offset_kb = 0; + + for (i = 0; i < 5; i++) { + /* careful for the valid range of offsets */ + if (sizes_kb[i]) { + dw1[i] = offset_kb << GEN7_PCB_ALLOC_DW1_OFFSET__SHIFT | + sizes_kb[i] << GEN7_PCB_ALLOC_DW1_SIZE__SHIFT; + offset_kb += sizes_kb[i]; + } else { + dw1[i] = 0; + } + } + + STATIC_ASSERT(ARRAY_SIZE(urb->pcb) >= 5); + memcpy(urb->pcb, dw1, sizeof(dw1)); + + return true; +} + +static bool +urb_set_gen6_3DSTATE_URB(struct ilo_state_urb *urb, + const struct ilo_dev *dev, + const struct ilo_state_urb_info *info, + const struct urb_configuration *conf) +{ + uint32_t dw1, dw2; + + ILO_DEV_ASSERT(dev, 6, 6); + + assert(conf->vs_entry_rows && conf->gs_entry_rows); + + dw1 = (conf->vs_entry_rows - 1) << GEN6_URB_DW1_VS_ENTRY_SIZE__SHIFT | + conf->vs_entry_count << GEN6_URB_DW1_VS_ENTRY_COUNT__SHIFT; + dw2 = conf->gs_entry_count << GEN6_URB_DW2_GS_ENTRY_COUNT__SHIFT | + (conf->gs_entry_rows - 1) << GEN6_URB_DW2_GS_ENTRY_SIZE__SHIFT; + + STATIC_ASSERT(ARRAY_SIZE(urb->urb) >= 2); + urb->urb[0] = dw1; + urb->urb[1] = dw2; + + return true; +} + +static bool +urb_set_gen7_3dstate_urb(struct ilo_state_urb *urb, + const struct ilo_dev *dev, + const struct ilo_state_urb_info *info, + const struct urb_configuration *conf) +{ + uint32_t dw1[4]; + struct { + uint8_t alloc_8kb; + uint8_t entry_rows; + int entry_count; + } stages[4]; + uint8_t offset_8kb; + int i; + + ILO_DEV_ASSERT(dev, 7, 8); + + stages[0].alloc_8kb = conf->vs_urb_alloc_8kb; + stages[1].alloc_8kb = conf->hs_urb_alloc_8kb; + stages[2].alloc_8kb = conf->ds_urb_alloc_8kb; + stages[3].alloc_8kb = conf->gs_urb_alloc_8kb; + + stages[0].entry_rows = conf->vs_entry_rows; + stages[1].entry_rows = conf->hs_entry_rows; + stages[2].entry_rows = conf->ds_entry_rows; + stages[3].entry_rows = conf->gs_entry_rows; + + stages[0].entry_count = conf->vs_entry_count; + stages[1].entry_count = conf->hs_entry_count; + stages[2].entry_count = conf->ds_entry_count; + stages[3].entry_count = conf->gs_entry_count; + + offset_8kb = conf->urb_offset_8kb; + 
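+   /*
+    * Pack one 3DSTATE_URB_xS DW1 per stage.  Allocations sit back-to-back
+    * after the PCB space; the offset only advances past stages that have
+    * an allocation, so a disabled stage programs an all-zero DW1 and the
+    * offset field stays within its valid range.
+    */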
+ for (i = 0; i < 4; i++) { + /* careful for the valid range of offsets */ + if (stages[i].alloc_8kb) { + assert(stages[i].entry_rows); + dw1[i] = + offset_8kb << GEN7_URB_DW1_OFFSET__SHIFT | + (stages[i].entry_rows - 1) << GEN7_URB_DW1_ENTRY_SIZE__SHIFT | + stages[i].entry_count << GEN7_URB_DW1_ENTRY_COUNT__SHIFT; + offset_8kb += stages[i].alloc_8kb; + } else { + dw1[i] = 0; + } + } + + STATIC_ASSERT(ARRAY_SIZE(urb->urb) >= 4); + memcpy(urb->urb, dw1, sizeof(dw1)); + + return true; +} + +bool +ilo_state_urb_init(struct ilo_state_urb *urb, + const struct ilo_dev *dev, + const struct ilo_state_urb_info *info) +{ + assert(ilo_is_zeroed(urb, sizeof(*urb))); + return ilo_state_urb_set_info(urb, dev, info); +} + +bool +ilo_state_urb_init_for_rectlist(struct ilo_state_urb *urb, + const struct ilo_dev *dev, + uint8_t vf_attr_count) +{ + struct ilo_state_urb_info info; + + memset(&info, 0, sizeof(info)); + info.ve_entry_size = sizeof(uint32_t) * 4 * vf_attr_count; + + return ilo_state_urb_init(urb, dev, &info); +} + +bool +ilo_state_urb_set_info(struct ilo_state_urb *urb, + const struct ilo_dev *dev, + const struct ilo_state_urb_info *info) +{ + struct urb_configuration conf; + bool ret = true; + + ret &= urb_get_gen6_configuration(dev, info, &conf); + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + ret &= urb_set_gen7_3dstate_push_constant_alloc(urb, dev, info, &conf); + ret &= urb_set_gen7_3dstate_urb(urb, dev, info, &conf); + } else { + ret &= urb_set_gen6_3DSTATE_URB(urb, dev, info, &conf); + } + + assert(ret); + + return ret; +} + +void +ilo_state_urb_full_delta(const struct ilo_state_urb *urb, + const struct ilo_dev *dev, + struct ilo_state_urb_delta *delta) +{ + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + delta->dirty = ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_VS | + ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_HS | + ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_DS | + ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_GS | + ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_PS | + ILO_STATE_URB_3DSTATE_URB_VS | + ILO_STATE_URB_3DSTATE_URB_HS | + ILO_STATE_URB_3DSTATE_URB_DS | + ILO_STATE_URB_3DSTATE_URB_GS; + } else { + delta->dirty = ILO_STATE_URB_3DSTATE_URB_VS | + ILO_STATE_URB_3DSTATE_URB_GS; + } +} + +void +ilo_state_urb_get_delta(const struct ilo_state_urb *urb, + const struct ilo_dev *dev, + const struct ilo_state_urb *old, + struct ilo_state_urb_delta *delta) +{ + delta->dirty = 0; + + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + if (memcmp(urb->pcb, old->pcb, sizeof(urb->pcb))) { + delta->dirty |= ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_VS | + ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_HS | + ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_DS | + ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_GS | + ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_PS; + } + + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 34: + * + * "3DSTATE_URB_HS, 3DSTATE_URB_DS, and 3DSTATE_URB_GS must also be + * programmed in order for the programming of this state + * (3DSTATE_URB_VS) to be valid." + * + * The same is true for the other three states. 
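+    *
+    * This is why a difference in any DWord of urb[] dirties all four
+    * 3DSTATE_URB_xS bits at once below, rather than tracking each stage
+    * separately.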
+ */ + if (memcmp(urb->urb, old->urb, sizeof(urb->urb))) { + delta->dirty |= ILO_STATE_URB_3DSTATE_URB_VS | + ILO_STATE_URB_3DSTATE_URB_HS | + ILO_STATE_URB_3DSTATE_URB_DS | + ILO_STATE_URB_3DSTATE_URB_GS; + } + } else { + if (memcmp(urb->urb, old->urb, sizeof(uint32_t) * 2)) { + delta->dirty |= ILO_STATE_URB_3DSTATE_URB_VS | + ILO_STATE_URB_3DSTATE_URB_GS; + } + } +} diff --git a/src/gallium/drivers/ilo/core/ilo_state_urb.h b/src/gallium/drivers/ilo/core/ilo_state_urb.h new file mode 100644 index 00000000000..9522b3bd681 --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_urb.h @@ -0,0 +1,103 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2015 LunarG, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#ifndef ILO_STATE_URB_H +#define ILO_STATE_URB_H + +#include "genhw/genhw.h" + +#include "ilo_core.h" +#include "ilo_dev.h" + +enum ilo_state_urb_dirty_bits { + ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_VS = (1 << 0), + ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_HS = (1 << 1), + ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_DS = (1 << 2), + ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_GS = (1 << 3), + ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_PS = (1 << 4), + ILO_STATE_URB_3DSTATE_URB_VS = (1 << 5), + ILO_STATE_URB_3DSTATE_URB_HS = (1 << 6), + ILO_STATE_URB_3DSTATE_URB_DS = (1 << 7), + ILO_STATE_URB_3DSTATE_URB_GS = (1 << 8), +}; + +/** + * URB entry allocation sizes and sizes of constant data extracted from PCBs + * to threads. 
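+ *
+ * Entry sizes are in bytes.  The *_const_data flags only say whether a
+ * stage reads the push constant buffer at all; the simple partitioning in
+ * urb_alloc_gen7_pcb() does not need exact constant sizes.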
+ */ +struct ilo_state_urb_info { + bool gs_enable; + + bool vs_const_data; + bool hs_const_data; + bool ds_const_data; + bool gs_const_data; + bool ps_const_data; + + uint16_t ve_entry_size; + uint16_t vs_entry_size; + uint16_t hs_entry_size; + uint16_t ds_entry_size; + uint16_t gs_entry_size; +}; + +struct ilo_state_urb { + uint32_t pcb[5]; + uint32_t urb[4]; +}; + +struct ilo_state_urb_delta { + uint32_t dirty; +}; + +bool +ilo_state_urb_init(struct ilo_state_urb *urb, + const struct ilo_dev *dev, + const struct ilo_state_urb_info *info); + +bool +ilo_state_urb_init_for_rectlist(struct ilo_state_urb *urb, + const struct ilo_dev *dev, + uint8_t vf_attr_count); + +bool +ilo_state_urb_set_info(struct ilo_state_urb *urb, + const struct ilo_dev *dev, + const struct ilo_state_urb_info *info); + +void +ilo_state_urb_full_delta(const struct ilo_state_urb *urb, + const struct ilo_dev *dev, + struct ilo_state_urb_delta *delta); + +void +ilo_state_urb_get_delta(const struct ilo_state_urb *urb, + const struct ilo_dev *dev, + const struct ilo_state_urb *old, + struct ilo_state_urb_delta *delta); + +#endif /* ILO_STATE_URB_H */ diff --git a/src/gallium/drivers/ilo/core/ilo_state_vf.c b/src/gallium/drivers/ilo/core/ilo_state_vf.c new file mode 100644 index 00000000000..ddc75428ed7 --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_vf.c @@ -0,0 +1,984 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2012-2015 LunarG, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#include "ilo_debug.h" +#include "ilo_buffer.h" +#include "ilo_state_vf.h" + +static bool +vf_validate_gen6_elements(const struct ilo_dev *dev, + const struct ilo_state_vf_info *info) +{ + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 95: + * + * "(Source Element Offset (in bytes)) + * Format: U11 + * Range [0,2047" + * + * From the Haswell PRM, volume 2d, page 415: + * + * "(Source Element Offset) + * Format: U12 byte offset + * ... + * [0,4095]" + * + * From the Broadwell PRM, volume 2d, page 469: + * + * "(Source Element Offset) + * Format: U12 byte offset + * ... + * [0,2047]" + */ + const uint16_t max_vertex_offset = + (ilo_dev_gen(dev) == ILO_GEN(7.5)) ? 
4096 : 2048; + uint8_t i; + + ILO_DEV_ASSERT(dev, 6, 8); + + assert(info->element_count <= ILO_STATE_VF_MAX_ELEMENT_COUNT); + + for (i = 0; i < info->element_count; i++) { + const struct ilo_state_vf_element_info *elem = &info->elements[i]; + + assert(elem->buffer < ILO_STATE_VF_MAX_BUFFER_COUNT); + assert(elem->vertex_offset < max_vertex_offset); + assert(ilo_state_vf_valid_element_format(dev, elem->format)); + } + + return true; +} + +static uint32_t +get_gen6_component_controls(const struct ilo_dev *dev, + enum gen_vf_component comp_x, + enum gen_vf_component comp_y, + enum gen_vf_component comp_z, + enum gen_vf_component comp_w) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + return comp_x << GEN6_VE_DW1_COMP0__SHIFT | + comp_y << GEN6_VE_DW1_COMP1__SHIFT | + comp_z << GEN6_VE_DW1_COMP2__SHIFT | + comp_w << GEN6_VE_DW1_COMP3__SHIFT; +} + +static bool +get_gen6_edge_flag_format(const struct ilo_dev *dev, + const struct ilo_state_vf_element_info *elem, + enum gen_surface_format *format) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 94: + * + * "The Source Element Format must be set to the UINT format." + * + * From the Haswell PRM, volume 2d, page 413: + * + * "The SourceElementFormat needs to be a single-component format with + * an element which has edge flag enabled." + */ + if (elem->component_count != 1) + return false; + + /* pick the format we like */ + switch (elem->format_size) { + case 1: + *format = GEN6_FORMAT_R8_UINT; + break; + case 2: + *format = GEN6_FORMAT_R16_UINT; + break; + case 4: + *format = GEN6_FORMAT_R32_UINT; + break; + default: + return false; + break; + } + + return true; +} + +static bool +vf_set_gen6_3DSTATE_VERTEX_ELEMENTS(struct ilo_state_vf *vf, + const struct ilo_dev *dev, + const struct ilo_state_vf_info *info) +{ + enum gen_surface_format edge_flag_format; + uint32_t dw0, dw1; + uint8_t i; + + ILO_DEV_ASSERT(dev, 6, 8); + + if (!vf_validate_gen6_elements(dev, info)) + return false; + + for (i = 0; i < info->element_count; i++) { + const struct ilo_state_vf_element_info *elem = &info->elements[i]; + enum gen_vf_component components[4] = { + GEN6_VFCOMP_STORE_0, + GEN6_VFCOMP_STORE_0, + GEN6_VFCOMP_STORE_0, + (elem->is_integer) ? 
GEN6_VFCOMP_STORE_1_INT : + GEN6_VFCOMP_STORE_1_FP, + }; + + switch (elem->component_count) { + case 4: components[3] = GEN6_VFCOMP_STORE_SRC; /* fall through */ + case 3: components[2] = GEN6_VFCOMP_STORE_SRC; /* fall through */ + case 2: components[1] = GEN6_VFCOMP_STORE_SRC; /* fall through */ + case 1: components[0] = GEN6_VFCOMP_STORE_SRC; break; + default: + assert(!"unexpected component count"); + break; + } + + dw0 = elem->buffer << GEN6_VE_DW0_VB_INDEX__SHIFT | + GEN6_VE_DW0_VALID | + elem->format << GEN6_VE_DW0_FORMAT__SHIFT | + elem->vertex_offset << GEN6_VE_DW0_VB_OFFSET__SHIFT; + dw1 = get_gen6_component_controls(dev, + components[0], components[1], + components[2], components[3]); + + STATIC_ASSERT(ARRAY_SIZE(vf->user_ve[i]) >= 2); + vf->user_ve[i][0] = dw0; + vf->user_ve[i][1] = dw1; + } + + vf->user_ve_count = i; + + vf->edge_flag_supported = (i && get_gen6_edge_flag_format(dev, + &info->elements[i - 1], &edge_flag_format)); + if (vf->edge_flag_supported) { + const struct ilo_state_vf_element_info *elem = &info->elements[i - 1]; + + /* without edge flag enable */ + vf->last_user_ve[0][0] = dw0; + vf->last_user_ve[0][1] = dw1; + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 94: + * + * "This bit (Edge Flag Enable) must only be ENABLED on the last + * valid VERTEX_ELEMENT structure. + * + * When set, Component 0 Control must be set to + * VFCOMP_STORE_SRC, and Component 1-3 Control must be set to + * VFCOMP_NOSTORE." + */ + dw0 = elem->buffer << GEN6_VE_DW0_VB_INDEX__SHIFT | + GEN6_VE_DW0_VALID | + edge_flag_format << GEN6_VE_DW0_FORMAT__SHIFT | + GEN6_VE_DW0_EDGE_FLAG_ENABLE | + elem->vertex_offset << GEN6_VE_DW0_VB_OFFSET__SHIFT; + dw1 = get_gen6_component_controls(dev, GEN6_VFCOMP_STORE_SRC, + GEN6_VFCOMP_NOSTORE, GEN6_VFCOMP_NOSTORE, GEN6_VFCOMP_NOSTORE); + + /* with edge flag enable */ + vf->last_user_ve[1][0] = dw0; + vf->last_user_ve[1][1] = dw1; + } + + return true; +} + +static bool +vf_set_gen6_vertex_buffer_state(struct ilo_state_vf *vf, + const struct ilo_dev *dev, + const struct ilo_state_vf_info *info) +{ + uint8_t i; + + ILO_DEV_ASSERT(dev, 6, 7.5); + + memset(vf->vb_to_first_elem, -1, sizeof(vf->vb_to_first_elem)); + + for (i = 0; i < info->element_count; i++) { + const struct ilo_state_vf_element_info *elem = &info->elements[i]; + + STATIC_ASSERT(ARRAY_SIZE(vf->user_instancing[i]) >= 2); + /* instancing enable only */ + vf->user_instancing[i][0] = (elem->instancing_enable) ? + GEN6_VB_DW0_ACCESS_INSTANCEDATA : + GEN6_VB_DW0_ACCESS_VERTEXDATA; + vf->user_instancing[i][1] = elem->instancing_step_rate; + + /* + * Instancing is per VB, not per VE, before Gen8. Set up a VB-to-VE + * mapping as well. + */ + if (vf->vb_to_first_elem[elem->buffer] < 0) { + vf->vb_to_first_elem[elem->buffer] = i; + } else { + const struct ilo_state_vf_element_info *first = + &info->elements[vf->vb_to_first_elem[elem->buffer]]; + + assert(elem->instancing_enable == first->instancing_enable && + elem->instancing_step_rate == first->instancing_step_rate); + } + } + + return true; +} + +static bool +vf_set_gen8_3DSTATE_VF_INSTANCING(struct ilo_state_vf *vf, + const struct ilo_dev *dev, + const struct ilo_state_vf_info *info) +{ + uint8_t i; + + ILO_DEV_ASSERT(dev, 8, 8); + + for (i = 0; i < info->element_count; i++) { + const struct ilo_state_vf_element_info *elem = &info->elements[i]; + + STATIC_ASSERT(ARRAY_SIZE(vf->user_instancing[i]) >= 2); + vf->user_instancing[i][0] = (elem->instancing_enable) ? 
+ GEN8_INSTANCING_DW1_ENABLE : 0; + vf->user_instancing[i][1] = elem->instancing_step_rate; + } + + return true; +} + +static uint32_t +get_gen6_component_zeros(const struct ilo_dev *dev) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + return get_gen6_component_controls(dev, + GEN6_VFCOMP_STORE_0, + GEN6_VFCOMP_STORE_0, + GEN6_VFCOMP_STORE_0, + GEN6_VFCOMP_STORE_0); +} + +static uint32_t +get_gen6_component_ids(const struct ilo_dev *dev, + bool vertexid, bool instanceid) +{ + ILO_DEV_ASSERT(dev, 6, 7.5); + + return get_gen6_component_controls(dev, + (vertexid) ? GEN6_VFCOMP_STORE_VID : GEN6_VFCOMP_STORE_0, + (instanceid) ? GEN6_VFCOMP_STORE_IID : GEN6_VFCOMP_STORE_0, + GEN6_VFCOMP_STORE_0, + GEN6_VFCOMP_STORE_0); +} + +static bool +vf_params_set_gen6_internal_ve(struct ilo_state_vf *vf, + const struct ilo_dev *dev, + const struct ilo_state_vf_params_info *params, + uint8_t user_ve_count) +{ + const bool prepend_ids = + (params->prepend_vertexid || params->prepend_instanceid); + uint8_t internal_ve_count = 0, i; + uint32_t dw1[2]; + + + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 92: + * + * "- At least one VERTEX_ELEMENT_STATE structure must be included. + * + * - Inclusion of partial VERTEX_ELEMENT_STATE structures is + * UNDEFINED. + * + * - SW must ensure that at least one vertex element is defined prior + * to issuing a 3DPRIMTIVE command, or operation is UNDEFINED. + * + * - There are no "holes" allowed in the destination vertex: NOSTORE + * components must be overwritten by subsequent components unless + * they are the trailing DWords of the vertex. Software must + * explicitly chose some value (probably 0) to be written into + * DWords that would otherwise be "holes"." + * + * - ... + * + * - [DevILK+] Element[0] must be valid." + */ + if (params->prepend_zeros || (!user_ve_count && !prepend_ids)) + dw1[internal_ve_count++] = get_gen6_component_zeros(dev); + + if (prepend_ids) { + if (ilo_dev_gen(dev) >= ILO_GEN(8)) { + /* placeholder for 3DSTATE_VF_SGVS */ + dw1[internal_ve_count++] = get_gen6_component_zeros(dev); + } else { + dw1[internal_ve_count++] = get_gen6_component_ids(dev, + params->prepend_vertexid, params->prepend_instanceid); + } + } + + for (i = 0; i < internal_ve_count; i++) { + STATIC_ASSERT(ARRAY_SIZE(vf->internal_ve[i]) >= 2); + vf->internal_ve[i][0] = GEN6_VE_DW0_VALID; + vf->internal_ve[i][1] = dw1[i]; + } + + vf->internal_ve_count = internal_ve_count; + + return true; +} + +static bool +vf_params_set_gen8_3DSTATE_VF_SGVS(struct ilo_state_vf *vf, + const struct ilo_dev *dev, + const struct ilo_state_vf_params_info *params) +{ + const uint8_t attr = (params->prepend_zeros) ? 
1 : 0; + uint32_t dw1; + + ILO_DEV_ASSERT(dev, 8, 8); + + dw1 = 0; + + if (params->prepend_instanceid) { + dw1 |= GEN8_SGVS_DW1_IID_ENABLE | + 1 << GEN8_SGVS_DW1_IID_VE_COMP__SHIFT | + attr << GEN8_SGVS_DW1_IID_VE_INDEX__SHIFT; + } + + if (params->prepend_vertexid) { + dw1 |= GEN8_SGVS_DW1_VID_ENABLE | + 0 << GEN8_SGVS_DW1_VID_VE_COMP__SHIFT | + attr << GEN8_SGVS_DW1_VID_VE_INDEX__SHIFT; + } + + STATIC_ASSERT(ARRAY_SIZE(vf->sgvs) >= 1); + vf->sgvs[0] = dw1; + + return true; +} + +static uint32_t +get_gen6_fixed_cut_index(const struct ilo_dev *dev, + enum gen_index_format format) +{ + const uint32_t fixed = ~0u; + + ILO_DEV_ASSERT(dev, 6, 7); + + switch (format) { + case GEN6_INDEX_BYTE: return (uint8_t) fixed; + case GEN6_INDEX_WORD: return (uint16_t) fixed; + case GEN6_INDEX_DWORD: return (uint32_t) fixed; + default: + assert(!"unknown index format"); + return fixed; + } +} + +static bool +get_gen6_cut_index_supported(const struct ilo_dev *dev, + enum gen_3dprim_type topology) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * See the Sandy Bridge PRM, volume 2 part 1, page 80 and the Haswell PRM, + * volume 7, page 456. + */ + switch (topology) { + case GEN6_3DPRIM_TRIFAN: + case GEN6_3DPRIM_QUADLIST: + case GEN6_3DPRIM_QUADSTRIP: + case GEN6_3DPRIM_POLYGON: + case GEN6_3DPRIM_LINELOOP: + return (ilo_dev_gen(dev) >= ILO_GEN(7.5)); + case GEN6_3DPRIM_RECTLIST: + case GEN6_3DPRIM_TRIFAN_NOSTIPPLE: + return false; + default: + return true; + } +} + +static bool +vf_params_set_gen6_3dstate_index_buffer(struct ilo_state_vf *vf, + const struct ilo_dev *dev, + const struct ilo_state_vf_params_info *params) +{ + uint32_t dw0 = 0; + + ILO_DEV_ASSERT(dev, 6, 7); + + /* cut index only, as in 3DSTATE_VF */ + if (params->cut_index_enable) { + assert(get_gen6_cut_index_supported(dev, params->cv_topology)); + assert(get_gen6_fixed_cut_index(dev, params->cv_index_format) == + params->cut_index); + + dw0 |= GEN6_IB_DW0_CUT_INDEX_ENABLE; + } + + STATIC_ASSERT(ARRAY_SIZE(vf->cut) >= 1); + vf->cut[0] = dw0; + + return true; +} + +static bool +vf_params_set_gen75_3DSTATE_VF(struct ilo_state_vf *vf, + const struct ilo_dev *dev, + const struct ilo_state_vf_params_info *params) +{ + uint32_t dw0 = 0; + + ILO_DEV_ASSERT(dev, 7.5, 8); + + if (params->cut_index_enable) { + assert(get_gen6_cut_index_supported(dev, params->cv_topology)); + dw0 |= GEN75_VF_DW0_CUT_INDEX_ENABLE; + } + + STATIC_ASSERT(ARRAY_SIZE(vf->cut) >= 2); + vf->cut[0] = dw0; + vf->cut[1] = params->cut_index; + + return true; +} + +static bool +vertex_buffer_validate_gen6(const struct ilo_dev *dev, + const struct ilo_state_vertex_buffer_info *info) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + if (info->buf) + assert(info->offset < info->buf->bo_size && info->size); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 86: + * + * "(Buffer Pitch) + * Range [DevCTG+]: [0,2048] Bytes" + */ + assert(info->stride <= 2048); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 86: + * + * "64-bit floating point values must be 64-bit aligned in memory, or + * UNPREDICTABLE data will be fetched. When accessing an element + * containing 64-bit floating point values, the Buffer Starting + * Address and Source Element Offset values must add to a 64-bit + * aligned address, and BufferPitch must be a multiple of 64-bits." 
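+    *
+    * A sketch of what the assertions below enforce, for a hypothetical
+    * element of doubles at vertex offset 12: cv_double_vertex_offset_mod_8
+    * is 12 % 8 = 4, so the buffer offset must satisfy
+    * (offset + 4) % 8 == 0, and the stride must be a multiple of 8.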
+ */ + if (info->cv_has_double) { + assert(info->stride % 8 == 0); + assert((info->offset + info->cv_double_vertex_offset_mod_8) % 8 == 0); + } + + return true; +} + +static uint32_t +vertex_buffer_get_gen6_size(const struct ilo_dev *dev, + const struct ilo_state_vertex_buffer_info *info) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + if (!info->buf) + return 0; + + return (info->offset + info->size <= info->buf->bo_size) ? info->size : + info->buf->bo_size - info->offset; +} + +static bool +vertex_buffer_set_gen8_vertex_buffer_state(struct ilo_state_vertex_buffer *vb, + const struct ilo_dev *dev, + const struct ilo_state_vertex_buffer_info *info) +{ + const uint32_t size = vertex_buffer_get_gen6_size(dev, info); + uint32_t dw0; + + ILO_DEV_ASSERT(dev, 6, 8); + + if (!vertex_buffer_validate_gen6(dev, info)) + return false; + + dw0 = info->stride << GEN6_VB_DW0_PITCH__SHIFT; + + if (ilo_dev_gen(dev) >= ILO_GEN(7)) + dw0 |= GEN7_VB_DW0_ADDR_MODIFIED; + if (!info->buf) + dw0 |= GEN6_VB_DW0_IS_NULL; + + STATIC_ASSERT(ARRAY_SIZE(vb->vb) >= 3); + vb->vb[0] = dw0; + vb->vb[1] = info->offset; + + if (ilo_dev_gen(dev) >= ILO_GEN(8)) { + vb->vb[2] = size; + } else { + /* address of the last valid byte */ + vb->vb[2] = (size) ? info->offset + size - 1 : 0; + } + + vb->need_bo = (info->buf != NULL); + + return true; +} + +static uint32_t +get_index_format_size(enum gen_index_format format) +{ + switch (format) { + case GEN6_INDEX_BYTE: return 1; + case GEN6_INDEX_WORD: return 2; + case GEN6_INDEX_DWORD: return 4; + default: + assert(!"unknown index format"); + return 1; + } +} + +static bool +index_buffer_validate_gen6(const struct ilo_dev *dev, + const struct ilo_state_index_buffer_info *info) +{ + const uint32_t format_size = get_index_format_size(info->format); + + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 79: + * + * "This field (Buffer Starting Address) contains the size-aligned (as + * specified by Index Format) Graphics Address of the first element of + * interest within the index buffer." + */ + assert(info->offset % format_size == 0); + + if (info->buf) + assert(info->offset < info->buf->bo_size && info->size); + + return true; +} + +static uint32_t +index_buffer_get_gen6_size(const struct ilo_dev *dev, + const struct ilo_state_index_buffer_info *info) +{ + uint32_t size; + + ILO_DEV_ASSERT(dev, 6, 8); + + if (!info->buf) + return 0; + + size = (info->offset + info->size <= info->buf->bo_size) ? info->size : + info->buf->bo_size - info->offset; + + if (ilo_dev_gen(dev) < ILO_GEN(8)) { + const uint32_t format_size = get_index_format_size(info->format); + size -= (size % format_size); + } + + return size; +} + +static bool +index_buffer_set_gen8_3DSTATE_INDEX_BUFFER(struct ilo_state_index_buffer *ib, + const struct ilo_dev *dev, + const struct ilo_state_index_buffer_info *info) +{ + const uint32_t size = index_buffer_get_gen6_size(dev, info); + + ILO_DEV_ASSERT(dev, 6, 8); + + if (!index_buffer_validate_gen6(dev, info)) + return false; + + STATIC_ASSERT(ARRAY_SIZE(ib->ib) >= 3); + if (ilo_dev_gen(dev) >= ILO_GEN(8)) { + ib->ib[0] = info->format << GEN8_IB_DW1_FORMAT__SHIFT; + ib->ib[1] = info->offset; + ib->ib[2] = size; + } else { + ib->ib[0] = info->format << GEN6_IB_DW0_FORMAT__SHIFT; + ib->ib[1] = info->offset; + /* address of the last valid byte, or 0 */ + ib->ib[2] = (size) ? 
info->offset + size - 1 : 0; + } + + ib->need_bo = (info->buf != NULL); + + return true; +} + +bool +ilo_state_vf_valid_element_format(const struct ilo_dev *dev, + enum gen_surface_format format) +{ + /* + * This table is based on: + * + * - the Sandy Bridge PRM, volume 4 part 1, page 88-97 + * - the Ivy Bridge PRM, volume 2 part 1, page 97-99 + * - the Haswell PRM, volume 7, page 467-470 + */ + static const int vf_element_formats[] = { + [GEN6_FORMAT_R32G32B32A32_FLOAT] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32B32A32_SINT] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32B32A32_UINT] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32B32A32_UNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32B32A32_SNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R64G64_FLOAT] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32B32A32_SSCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32B32A32_USCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32B32A32_SFIXED] = ILO_GEN(7.5), + [GEN6_FORMAT_R32G32B32_FLOAT] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32B32_SINT] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32B32_UINT] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32B32_UNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32B32_SNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32B32_SSCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32B32_USCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32B32_SFIXED] = ILO_GEN(7.5), + [GEN6_FORMAT_R16G16B16A16_UNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R16G16B16A16_SNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R16G16B16A16_SINT] = ILO_GEN( 1), + [GEN6_FORMAT_R16G16B16A16_UINT] = ILO_GEN( 1), + [GEN6_FORMAT_R16G16B16A16_FLOAT] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32_FLOAT] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32_SINT] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32_UINT] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32_UNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32_SNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R64_FLOAT] = ILO_GEN( 1), + [GEN6_FORMAT_R16G16B16A16_SSCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R16G16B16A16_USCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32_SSCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32_USCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R32G32_SFIXED] = ILO_GEN(7.5), + [GEN6_FORMAT_B8G8R8A8_UNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R10G10B10A2_UNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R10G10B10A2_UINT] = ILO_GEN( 1), + [GEN6_FORMAT_R10G10B10_SNORM_A2_UNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R8G8B8A8_UNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R8G8B8A8_SNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R8G8B8A8_SINT] = ILO_GEN( 1), + [GEN6_FORMAT_R8G8B8A8_UINT] = ILO_GEN( 1), + [GEN6_FORMAT_R16G16_UNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R16G16_SNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R16G16_SINT] = ILO_GEN( 1), + [GEN6_FORMAT_R16G16_UINT] = ILO_GEN( 1), + [GEN6_FORMAT_R16G16_FLOAT] = ILO_GEN( 1), + [GEN6_FORMAT_B10G10R10A2_UNORM] = ILO_GEN(7.5), + [GEN6_FORMAT_R11G11B10_FLOAT] = ILO_GEN( 1), + [GEN6_FORMAT_R32_SINT] = ILO_GEN( 1), + [GEN6_FORMAT_R32_UINT] = ILO_GEN( 1), + [GEN6_FORMAT_R32_FLOAT] = ILO_GEN( 1), + [GEN6_FORMAT_R32_UNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R32_SNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R10G10B10X2_USCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R8G8B8A8_SSCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R8G8B8A8_USCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R16G16_SSCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R16G16_USCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R32_SSCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R32_USCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R8G8_UNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R8G8_SNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R8G8_SINT] = ILO_GEN( 1), + [GEN6_FORMAT_R8G8_UINT] = ILO_GEN( 1), + [GEN6_FORMAT_R16_UNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R16_SNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R16_SINT] = ILO_GEN( 1), + [GEN6_FORMAT_R16_UINT] 
= ILO_GEN( 1), + [GEN6_FORMAT_R16_FLOAT] = ILO_GEN( 1), + [GEN6_FORMAT_R8G8_SSCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R8G8_USCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R16_SSCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R16_USCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R8_UNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R8_SNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R8_SINT] = ILO_GEN( 1), + [GEN6_FORMAT_R8_UINT] = ILO_GEN( 1), + [GEN6_FORMAT_R8_SSCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R8_USCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R8G8B8_UNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R8G8B8_SNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R8G8B8_SSCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R8G8B8_USCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R64G64B64A64_FLOAT] = ILO_GEN( 1), + [GEN6_FORMAT_R64G64B64_FLOAT] = ILO_GEN( 1), + [GEN6_FORMAT_R16G16B16_FLOAT] = ILO_GEN( 6), + [GEN6_FORMAT_R16G16B16_UNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R16G16B16_SNORM] = ILO_GEN( 1), + [GEN6_FORMAT_R16G16B16_SSCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R16G16B16_USCALED] = ILO_GEN( 1), + [GEN6_FORMAT_R16G16B16_UINT] = ILO_GEN(7.5), + [GEN6_FORMAT_R16G16B16_SINT] = ILO_GEN(7.5), + [GEN6_FORMAT_R32_SFIXED] = ILO_GEN(7.5), + [GEN6_FORMAT_R10G10B10A2_SNORM] = ILO_GEN(7.5), + [GEN6_FORMAT_R10G10B10A2_USCALED] = ILO_GEN(7.5), + [GEN6_FORMAT_R10G10B10A2_SSCALED] = ILO_GEN(7.5), + [GEN6_FORMAT_R10G10B10A2_SINT] = ILO_GEN(7.5), + [GEN6_FORMAT_B10G10R10A2_SNORM] = ILO_GEN(7.5), + [GEN6_FORMAT_B10G10R10A2_USCALED] = ILO_GEN(7.5), + [GEN6_FORMAT_B10G10R10A2_SSCALED] = ILO_GEN(7.5), + [GEN6_FORMAT_B10G10R10A2_UINT] = ILO_GEN(7.5), + [GEN6_FORMAT_B10G10R10A2_SINT] = ILO_GEN(7.5), + [GEN6_FORMAT_R8G8B8_UINT] = ILO_GEN(7.5), + [GEN6_FORMAT_R8G8B8_SINT] = ILO_GEN(7.5), + }; + + ILO_DEV_ASSERT(dev, 6, 8); + + return (format < ARRAY_SIZE(vf_element_formats) && + vf_element_formats[format] && + ilo_dev_gen(dev) >= vf_element_formats[format]); +} + +bool +ilo_state_vf_init(struct ilo_state_vf *vf, + const struct ilo_dev *dev, + const struct ilo_state_vf_info *info) +{ + bool ret = true; + + assert(ilo_is_zeroed(vf, sizeof(*vf))); + assert(ilo_is_zeroed(info->data, info->data_size)); + + assert(ilo_state_vf_data_size(dev, info->element_count) <= + info->data_size); + vf->user_ve = (uint32_t (*)[2]) info->data; + vf->user_instancing = + (uint32_t (*)[2]) (vf->user_ve + info->element_count); + + ret &= vf_set_gen6_3DSTATE_VERTEX_ELEMENTS(vf, dev, info); + + if (ilo_dev_gen(dev) >= ILO_GEN(8)) + ret &= vf_set_gen8_3DSTATE_VF_INSTANCING(vf, dev, info); + else + ret &= vf_set_gen6_vertex_buffer_state(vf, dev, info); + + ret &= ilo_state_vf_set_params(vf, dev, &info->params); + + assert(ret); + + return ret; +} + +bool +ilo_state_vf_init_for_rectlist(struct ilo_state_vf *vf, + const struct ilo_dev *dev, + void *data, size_t data_size, + const struct ilo_state_vf_element_info *elements, + uint8_t element_count) +{ + struct ilo_state_vf_info info; + + memset(&info, 0, sizeof(info)); + + info.data = data; + info.data_size = data_size; + + info.elements = elements; + info.element_count = element_count; + + /* + * For VUE header, + * + * DW0: Reserved: MBZ + * DW1: Render Target Array Index + * DW2: Viewport Index + * DW3: Point Width + */ + info.params.prepend_zeros = true; + + return ilo_state_vf_init(vf, dev, &info); +} + +bool +ilo_state_vf_set_params(struct ilo_state_vf *vf, + const struct ilo_dev *dev, + const struct ilo_state_vf_params_info *params) +{ + bool ret = true; + + ILO_DEV_ASSERT(dev, 6, 8); + + ret &= vf_params_set_gen6_internal_ve(vf, dev, params, vf->user_ve_count); + if (ilo_dev_gen(dev) >= ILO_GEN(8)) + ret &= 
vf_params_set_gen8_3DSTATE_VF_SGVS(vf, dev, params); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 94: + * + * "Edge flags are supported for the following primitive topology types + * only, otherwise EdgeFlagEnable must not be ENABLED. + * + * - 3DPRIM_TRILIST* + * - 3DPRIM_TRISTRIP* + * - 3DPRIM_TRIFAN* + * - 3DPRIM_POLYGON" + * + * "[DevSNB]: Edge Flags are not supported for QUADLIST primitives. + * Software may elect to convert QUADLIST primitives to some set of + * corresponding edge-flag-supported primitive types (e.g., POLYGONs) + * prior to submission to the 3D vf." + * + * From the Ivy Bridge PRM, volume 2 part 1, page 86: + * + * "Edge flags are supported for all primitive topology types." + * + * Both PRMs are confusing... + */ + if (params->last_element_edge_flag) { + assert(vf->edge_flag_supported); + if (ilo_dev_gen(dev) == ILO_GEN(6)) + assert(params->cv_topology != GEN6_3DPRIM_QUADLIST); + } + + if (vf->edge_flag_supported) { + assert(vf->user_ve_count); + memcpy(vf->user_ve[vf->user_ve_count - 1], + vf->last_user_ve[params->last_element_edge_flag], + sizeof(vf->user_ve[vf->user_ve_count - 1])); + } + + if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) + ret &= vf_params_set_gen75_3DSTATE_VF(vf, dev, params); + else + ret &= vf_params_set_gen6_3dstate_index_buffer(vf, dev, params); + + assert(ret); + + return ret; +} + +void +ilo_state_vf_full_delta(const struct ilo_state_vf *vf, + const struct ilo_dev *dev, + struct ilo_state_vf_delta *delta) +{ + delta->dirty = ILO_STATE_VF_3DSTATE_VERTEX_ELEMENTS; + + if (ilo_dev_gen(dev) >= ILO_GEN(8)) { + delta->dirty |= ILO_STATE_VF_3DSTATE_VF_SGVS | + ILO_STATE_VF_3DSTATE_VF_INSTANCING; + } else { + delta->dirty |= ILO_STATE_VF_3DSTATE_VERTEX_BUFFERS; + } + + if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) + delta->dirty |= ILO_STATE_VF_3DSTATE_VF; + else + delta->dirty |= ILO_STATE_VF_3DSTATE_INDEX_BUFFER; +} + +void +ilo_state_vf_get_delta(const struct ilo_state_vf *vf, + const struct ilo_dev *dev, + const struct ilo_state_vf *old, + struct ilo_state_vf_delta *delta) +{ + /* no shallow copying */ + assert(vf->user_ve != old->user_ve && + vf->user_instancing != old->user_instancing); + + delta->dirty = 0; + + if (vf->internal_ve_count != old->internal_ve_count || + vf->user_ve_count != old->user_ve_count || + memcmp(vf->internal_ve, old->internal_ve, + sizeof(vf->internal_ve[0]) * vf->internal_ve_count) || + memcmp(vf->user_ve, old->user_ve, + sizeof(vf->user_ve[0]) * vf->user_ve_count)) + delta->dirty |= ILO_STATE_VF_3DSTATE_VERTEX_ELEMENTS; + + if (vf->user_ve_count != old->user_ve_count || + memcmp(vf->user_instancing, old->user_instancing, + sizeof(vf->user_instancing[0]) * vf->user_ve_count)) { + if (ilo_dev_gen(dev) >= ILO_GEN(8)) + delta->dirty |= ILO_STATE_VF_3DSTATE_VF_INSTANCING; + else + delta->dirty |= ILO_STATE_VF_3DSTATE_VERTEX_BUFFERS; + } + + if (ilo_dev_gen(dev) >= ILO_GEN(8)) { + if (vf->sgvs[0] != old->sgvs[0]) + delta->dirty |= ILO_STATE_VF_3DSTATE_VF_SGVS; + } + + if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) { + if (memcmp(vf->cut, old->cut, sizeof(vf->cut))) + delta->dirty |= ILO_STATE_VF_3DSTATE_VF; + } else { + if (vf->cut[0] != old->cut[0]) + delta->dirty |= ILO_STATE_VF_3DSTATE_INDEX_BUFFER; + } +} + +/** + * No need to initialize first. 
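+ * Unlike ilo_state_vf_init(), which asserts that the storage is zeroed,
+ * this unconditionally rewrites all derived fields (vb[] and need_bo).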
+ */ +bool +ilo_state_vertex_buffer_set_info(struct ilo_state_vertex_buffer *vb, + const struct ilo_dev *dev, + const struct ilo_state_vertex_buffer_info *info) +{ + bool ret = true; + + ret &= vertex_buffer_set_gen8_vertex_buffer_state(vb, dev, info); + + assert(ret); + + return ret; +} + +/** + * No need to initialize first. + */ +bool +ilo_state_index_buffer_set_info(struct ilo_state_index_buffer *ib, + const struct ilo_dev *dev, + const struct ilo_state_index_buffer_info *info) +{ + bool ret = true; + + ret &= index_buffer_set_gen8_3DSTATE_INDEX_BUFFER(ib, dev, info); + + assert(ret); + + return ret; +} diff --git a/src/gallium/drivers/ilo/core/ilo_state_vf.h b/src/gallium/drivers/ilo/core/ilo_state_vf.h new file mode 100644 index 00000000000..f15c63a248a --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_vf.h @@ -0,0 +1,228 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2015 LunarG, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#ifndef ILO_STATE_VF_H +#define ILO_STATE_VF_H + +#include "genhw/genhw.h" + +#include "ilo_core.h" +#include "ilo_dev.h" + +/* + * From the Sandy Bridge PRM, volume 2 part 1, page 93: + * + * "Up to 34 (DevSNB+) vertex elements are supported." + * + * "Up to 33 VBs are supported" + * + * Reserve two VEs and one VB for internal use. + */ +#define ILO_STATE_VF_MAX_ELEMENT_COUNT (34 - 2) +#define ILO_STATE_VF_MAX_BUFFER_COUNT (33 - 1) + +enum ilo_state_vf_dirty_bits { + ILO_STATE_VF_3DSTATE_VERTEX_ELEMENTS = (1 << 0), + ILO_STATE_VF_3DSTATE_VF_SGVS = (1 << 1), + ILO_STATE_VF_3DSTATE_VF_INSTANCING = (1 << 2), + ILO_STATE_VF_3DSTATE_VERTEX_BUFFERS = (1 << 3), + ILO_STATE_VF_3DSTATE_VF = (1 << 4), + ILO_STATE_VF_3DSTATE_INDEX_BUFFER = (1 << 5), +}; + +/** + * Fetch a 128-bit vertex attribute. + */ +struct ilo_state_vf_element_info { + uint8_t buffer; + uint16_t vertex_offset; + enum gen_surface_format format; + + uint8_t format_size; + uint8_t component_count; + bool is_integer; + + /* must be the same for those share the same buffer before Gen8 */ + bool instancing_enable; + uint32_t instancing_step_rate; +}; + +/** + * VF parameters. 
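+ *
+ * The cv_-prefixed fields describe state set elsewhere and, at least in
+ * this file, are consulted only by assertions for cross-validation; they
+ * do not change the DWords being packed.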
+ */ +struct ilo_state_vf_params_info { + enum gen_3dprim_type cv_topology; + + /* prepend an attribute of zeros */ + bool prepend_zeros; + + /* prepend an attribute of VertexID and/or InstanceID */ + bool prepend_vertexid; + bool prepend_instanceid; + + bool last_element_edge_flag; + + enum gen_index_format cv_index_format; + bool cut_index_enable; + uint32_t cut_index; +}; + +/** + * Vertex fetch. + */ +struct ilo_state_vf_info { + void *data; + size_t data_size; + + const struct ilo_state_vf_element_info *elements; + uint8_t element_count; + + struct ilo_state_vf_params_info params; +}; + +struct ilo_state_vf { + uint32_t (*user_ve)[2]; + uint32_t (*user_instancing)[2]; + int8_t vb_to_first_elem[ILO_STATE_VF_MAX_BUFFER_COUNT]; + uint8_t user_ve_count; + + bool edge_flag_supported; + uint32_t last_user_ve[2][2]; + + /* two VEs are reserved for internal use */ + uint32_t internal_ve[2][2]; + uint8_t internal_ve_count; + + uint32_t sgvs[1]; + + uint32_t cut[2]; +}; + +struct ilo_state_vf_delta { + uint32_t dirty; +}; + +struct ilo_buffer; + +struct ilo_state_vertex_buffer_info { + const struct ilo_buffer *buf; + uint32_t offset; + uint32_t size; + + uint16_t stride; + + /* doubles must be at 64-bit aligned addresses */ + bool cv_has_double; + uint8_t cv_double_vertex_offset_mod_8; +}; + +struct ilo_state_vertex_buffer { + uint32_t vb[3]; + + bool need_bo; + + /* managed by users */ + struct intel_bo *bo; +}; + +struct ilo_state_index_buffer_info { + const struct ilo_buffer *buf; + uint32_t offset; + uint32_t size; + + enum gen_index_format format; +}; + +struct ilo_state_index_buffer { + uint32_t ib[3]; + + bool need_bo; + + /* managed by users */ + struct intel_bo *bo; +}; + +static inline size_t +ilo_state_vf_data_size(const struct ilo_dev *dev, uint8_t element_count) +{ + const struct ilo_state_vf *vf = NULL; + return (sizeof(vf->user_ve[0]) + + sizeof(vf->user_instancing[0])) * element_count; +} + +bool +ilo_state_vf_valid_element_format(const struct ilo_dev *dev, + enum gen_surface_format format); + +bool +ilo_state_vf_init(struct ilo_state_vf *vf, + const struct ilo_dev *dev, + const struct ilo_state_vf_info *info); + +bool +ilo_state_vf_init_for_rectlist(struct ilo_state_vf *vf, + const struct ilo_dev *dev, + void *data, size_t data_size, + const struct ilo_state_vf_element_info *elements, + uint8_t element_count); + +bool +ilo_state_vf_set_params(struct ilo_state_vf *vf, + const struct ilo_dev *dev, + const struct ilo_state_vf_params_info *params); + +/** + * Return the number of attributes in the VUE. 
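+ * (Editor's note: the count covers both the user VEs and the internal
+ * VEs used for prepended zeros or VertexID/InstanceID.)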
+ */ +static inline uint8_t +ilo_state_vf_get_attr_count(const struct ilo_state_vf *vf) +{ + return vf->internal_ve_count + vf->user_ve_count; +} + +void +ilo_state_vf_full_delta(const struct ilo_state_vf *vf, + const struct ilo_dev *dev, + struct ilo_state_vf_delta *delta); + +void +ilo_state_vf_get_delta(const struct ilo_state_vf *vf, + const struct ilo_dev *dev, + const struct ilo_state_vf *old, + struct ilo_state_vf_delta *delta); + +bool +ilo_state_vertex_buffer_set_info(struct ilo_state_vertex_buffer *vb, + const struct ilo_dev *dev, + const struct ilo_state_vertex_buffer_info *info); + +bool +ilo_state_index_buffer_set_info(struct ilo_state_index_buffer *ib, + const struct ilo_dev *dev, + const struct ilo_state_index_buffer_info *info); + +#endif /* ILO_STATE_VF_H */ diff --git a/src/gallium/drivers/ilo/core/ilo_state_viewport.c b/src/gallium/drivers/ilo/core/ilo_state_viewport.c new file mode 100644 index 00000000000..aae57334541 --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_viewport.c @@ -0,0 +1,378 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2012-2015 LunarG, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#include "ilo_debug.h" +#include "ilo_state_viewport.h" + +static void +viewport_matrix_get_gen6_guardband(const struct ilo_dev *dev, + const struct ilo_state_viewport_matrix_info *mat, + float *min_gbx, float *max_gbx, + float *min_gby, float *max_gby) +{ + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 234: + * + * "Per-Device Guardband Extents + * + * - Supported X,Y ScreenSpace "Guardband" Extent: [-16K,16K-1] + * - Maximum Post-Clamp Delta (X or Y): 16K" + * + * "In addition, in order to be correctly rendered, objects must have a + * screenspace bounding box not exceeding 8K in the X or Y direction. + * This additional restriction must also be comprehended by software, + * i.e., enforced by use of clipping." + * + * From the Ivy Bridge PRM, volume 2 part 1, page 248: + * + * "Per-Device Guardband Extents + * + * - Supported X,Y ScreenSpace "Guardband" Extent: [-32K,32K-1] + * - Maximum Post-Clamp Delta (X or Y): N/A" + * + * "In addition, in order to be correctly rendered, objects must have a + * screenspace bounding box not exceeding 8K in the X or Y direction. + * This additional restriction must also be comprehended by software, + * i.e., enforced by use of clipping." 
+ *
+ * Combined, the bounding box of any object cannot exceed 8K in both
+ * width and height.
+ *
+ * Below we set the guardband as a square of length 8K, centered on the
+ * viewport. This makes sure all objects passing the GB test are
+ * valid to the renderer, and those failing the XY clipping have a
+ * better chance of passing the GB test.
+ */
+ const int max_extent = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 32768 : 16384;
+ const int half_len = 8192 / 2;
+ int center_x = (int) mat->translate[0];
+ int center_y = (int) mat->translate[1];
+ float scale_x, scale_y;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /* make sure the guardband is within the valid range */
+ if (center_x - half_len < -max_extent)
+ center_x = -max_extent + half_len;
+ else if (center_x + half_len > max_extent - 1)
+ center_x = max_extent - half_len;
+
+ if (center_y - half_len < -max_extent)
+ center_y = -max_extent + half_len;
+ else if (center_y + half_len > max_extent - 1)
+ center_y = max_extent - half_len;
+
+ scale_x = fabsf(mat->scale[0]);
+ scale_y = fabsf(mat->scale[1]);
+ /*
+ * From the Haswell PRM, volume 2d, pages 292-293:
+ *
+ * "Note: Minimum allowed value for this field (X/Y Min Clip Guardband)
+ * is -16384."
+ *
+ * "Note: Maximum allowed value for this field (X/Y Max Clip Guardband)
+ * is 16383."
+ *
+ * Avoid small scales.
+ */
+ if (scale_x < 1.0f)
+ scale_x = 1.0f;
+ if (scale_y < 1.0f)
+ scale_y = 1.0f;
+
+ /* in NDC space */
+ *min_gbx = ((float) (center_x - half_len) - mat->translate[0]) / scale_x;
+ *max_gbx = ((float) (center_x + half_len) - mat->translate[0]) / scale_x;
+ *min_gby = ((float) (center_y - half_len) - mat->translate[1]) / scale_y;
+ *max_gby = ((float) (center_y + half_len) - mat->translate[1]) / scale_y;
+}
+
+static void
+viewport_matrix_get_extent(const struct ilo_state_viewport_matrix_info *mat,
+ int axis, float *min, float *max)
+{
+ const float scale_abs = fabsf(mat->scale[axis]);
+
+ *min = -1.0f * scale_abs + mat->translate[axis];
+ *max = 1.0f * scale_abs + mat->translate[axis];
+}
+
+static bool
+viewport_matrix_set_gen7_SF_CLIP_VIEWPORT(struct ilo_state_viewport *vp,
+ const struct ilo_dev *dev,
+ const struct ilo_state_viewport_matrix_info *matrices,
+ uint8_t count)
+{
+ uint8_t i;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ for (i = 0; i < count; i++) {
+ const struct ilo_state_viewport_matrix_info *mat = &matrices[i];
+ float min_gbx, max_gbx, min_gby, max_gby;
+ uint32_t dw[16];
+
+ viewport_matrix_get_gen6_guardband(dev, mat,
+ &min_gbx, &max_gbx, &min_gby, &max_gby);
+
+ dw[0] = fui(mat->scale[0]);
+ dw[1] = fui(mat->scale[1]);
+ dw[2] = fui(mat->scale[2]);
+ dw[3] = fui(mat->translate[0]);
+ dw[4] = fui(mat->translate[1]);
+ dw[5] = fui(mat->translate[2]);
+ dw[6] = 0;
+ dw[7] = 0;
+
+ dw[8] = fui(min_gbx);
+ dw[9] = fui(max_gbx);
+ dw[10] = fui(min_gby);
+ dw[11] = fui(max_gby);
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+ float min_x, max_x, min_y, max_y;
+
+ viewport_matrix_get_extent(mat, 0, &min_x, &max_x);
+ viewport_matrix_get_extent(mat, 1, &min_y, &max_y);
+
+ dw[12] = fui(min_x);
+ dw[13] = fui(max_x - 1.0f);
+ dw[14] = fui(min_y);
+ dw[15] = fui(max_y - 1.0f);
+ } else {
+ dw[12] = 0;
+ dw[13] = 0;
+ dw[14] = 0;
+ dw[15] = 0;
+ }
+
+ STATIC_ASSERT(ARRAY_SIZE(vp->sf_clip[i]) >= 16);
+ memcpy(vp->sf_clip[i], dw, sizeof(dw));
+ }
+
+ return true;
+}
+
+static bool
+viewport_matrix_set_gen6_CC_VIEWPORT(struct ilo_state_viewport *vp,
+ const struct ilo_dev *dev,
+ const struct ilo_state_viewport_matrix_info *matrices,
+ uint8_t count)
+{
+ uint8_t i;
+
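+ /*
+ * CC_VIEWPORT holds only the per-viewport depth range; the X/Y scale,
+ * translate, and guardband are packed in SF_CLIP_VIEWPORT above.
+ */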
ILO_DEV_ASSERT(dev, 6, 8); + + for (i = 0; i < count; i++) { + const struct ilo_state_viewport_matrix_info *mat = &matrices[i]; + float min_z, max_z; + + viewport_matrix_get_extent(mat, 2, &min_z, &max_z); + + STATIC_ASSERT(ARRAY_SIZE(vp->cc[i]) >= 2); + vp->cc[i][0] = fui(min_z); + vp->cc[i][1] = fui(max_z); + } + + return true; +} + +static bool +viewport_scissor_set_gen6_SCISSOR_RECT(struct ilo_state_viewport *vp, + const struct ilo_dev *dev, + const struct ilo_state_viewport_scissor_info *scissors, + uint8_t count) +{ + const uint16_t max_size = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 16384 : 8192; + uint8_t i; + + ILO_DEV_ASSERT(dev, 6, 8); + + for (i = 0; i < count; i++) { + const struct ilo_state_viewport_scissor_info *scissor = &scissors[i]; + uint16_t min_x, min_y, max_x, max_y; + uint32_t dw0, dw1; + + min_x = (scissor->min_x < max_size) ? scissor->min_x : max_size - 1; + min_y = (scissor->min_y < max_size) ? scissor->min_y : max_size - 1; + max_x = (scissor->max_x < max_size) ? scissor->max_x : max_size - 1; + max_y = (scissor->max_y < max_size) ? scissor->max_y : max_size - 1; + + dw0 = min_y << GEN6_SCISSOR_DW0_MIN_Y__SHIFT | + min_x << GEN6_SCISSOR_DW0_MIN_X__SHIFT; + dw1 = max_y << GEN6_SCISSOR_DW1_MAX_Y__SHIFT | + max_x << GEN6_SCISSOR_DW1_MAX_X__SHIFT; + + STATIC_ASSERT(ARRAY_SIZE(vp->scissor[i]) >= 2); + vp->scissor[i][0] = dw0; + vp->scissor[i][1] = dw1; + } + + return true; +} + +bool +ilo_state_viewport_init(struct ilo_state_viewport *vp, + const struct ilo_dev *dev, + const struct ilo_state_viewport_info *info) +{ + const size_t elem_size = ilo_state_viewport_data_size(dev, 1); + + assert(ilo_is_zeroed(vp, sizeof(*vp))); + assert(ilo_is_zeroed(info->data, info->data_size)); + + vp->data = info->data; + + if (info->data_size / elem_size < ILO_STATE_VIEWPORT_MAX_COUNT) + vp->array_size = info->data_size / elem_size; + else + vp->array_size = ILO_STATE_VIEWPORT_MAX_COUNT; + + return ilo_state_viewport_set_params(vp, dev, &info->params, false); +} + +bool +ilo_state_viewport_init_data_only(struct ilo_state_viewport *vp, + const struct ilo_dev *dev, + void *data, size_t data_size) +{ + struct ilo_state_viewport_info info; + + memset(&info, 0, sizeof(info)); + info.data = data; + info.data_size = data_size; + + return ilo_state_viewport_init(vp, dev, &info); +} + +bool +ilo_state_viewport_init_for_rectlist(struct ilo_state_viewport *vp, + const struct ilo_dev *dev, + void *data, size_t data_size) +{ + struct ilo_state_viewport_info info; + struct ilo_state_viewport_matrix_info mat; + struct ilo_state_viewport_scissor_info sci; + + memset(&info, 0, sizeof(info)); + memset(&mat, 0, sizeof(mat)); + memset(&sci, 0, sizeof(sci)); + + info.data = data; + info.data_size = data_size; + info.params.matrices = &mat; + info.params.scissors = &sci; + info.params.count = 1; + + mat.scale[0] = 1.0f; + mat.scale[1] = 1.0f; + mat.scale[2] = 1.0f; + + return ilo_state_viewport_init(vp, dev, &info); +} + +static void +viewport_set_count(struct ilo_state_viewport *vp, + const struct ilo_dev *dev, + uint8_t count) +{ + assert(count <= vp->array_size); + + vp->count = count; + vp->sf_clip = (uint32_t (*)[16]) vp->data; + vp->cc = (uint32_t (*)[ 2]) (vp->sf_clip + count); + vp->scissor = (uint32_t (*)[ 2]) (vp->cc + count); +} + +bool +ilo_state_viewport_set_params(struct ilo_state_viewport *vp, + const struct ilo_dev *dev, + const struct ilo_state_viewport_params_info *params, + bool scissors_only) +{ + bool ret = true; + + if (scissors_only) { + assert(vp->count == params->count); + + ret &= 
viewport_scissor_set_gen6_SCISSOR_RECT(vp, dev, + params->scissors, params->count); + } else { + viewport_set_count(vp, dev, params->count); + + ret &= viewport_matrix_set_gen7_SF_CLIP_VIEWPORT(vp, dev, + params->matrices, params->count); + ret &= viewport_matrix_set_gen6_CC_VIEWPORT(vp, dev, + params->matrices, params->count); + ret &= viewport_scissor_set_gen6_SCISSOR_RECT(vp, dev, + params->scissors, params->count); + } + + assert(ret); + + return ret; +} + +void +ilo_state_viewport_full_delta(const struct ilo_state_viewport *vp, + const struct ilo_dev *dev, + struct ilo_state_viewport_delta *delta) +{ + delta->dirty = ILO_STATE_VIEWPORT_SF_CLIP_VIEWPORT | + ILO_STATE_VIEWPORT_CC_VIEWPORT | + ILO_STATE_VIEWPORT_SCISSOR_RECT; +} + +void +ilo_state_viewport_get_delta(const struct ilo_state_viewport *vp, + const struct ilo_dev *dev, + const struct ilo_state_viewport *old, + struct ilo_state_viewport_delta *delta) +{ + const size_t sf_clip_size = sizeof(vp->sf_clip[0]) * vp->count; + const size_t cc_size = sizeof(vp->cc[0]) * vp->count; + const size_t scissor_size = sizeof(vp->scissor[0]) * vp->count; + + /* no shallow copying */ + assert(vp->data != old->data); + + if (vp->count != old->count) { + ilo_state_viewport_full_delta(vp, dev, delta); + return; + } + + delta->dirty = 0; + + if (memcmp(vp->sf_clip, old->sf_clip, sf_clip_size)) + delta->dirty |= ILO_STATE_VIEWPORT_SF_CLIP_VIEWPORT; + + if (memcmp(vp->cc, old->cc, cc_size)) + delta->dirty |= ILO_STATE_VIEWPORT_CC_VIEWPORT; + + if (memcmp(vp->scissor, old->scissor, scissor_size)) + delta->dirty |= ILO_STATE_VIEWPORT_SCISSOR_RECT; +} diff --git a/src/gallium/drivers/ilo/core/ilo_state_viewport.h b/src/gallium/drivers/ilo/core/ilo_state_viewport.h new file mode 100644 index 00000000000..b42ad6571da --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_viewport.h @@ -0,0 +1,132 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2015 LunarG, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#ifndef ILO_STATE_VIEWPORT_H +#define ILO_STATE_VIEWPORT_H + +#include "genhw/genhw.h" + +#include "ilo_core.h" +#include "ilo_dev.h" + +/* + * From the Sandy Bridge PRM, volume 2 part 1, page 38: + * + * "... 16 sets of viewport (VP) state parameters in the Clip unit's + * VertexClipTest function and in the SF unit's ViewportMapping and + * Scissor functions." 
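+ *
+ * Hence the fixed limit of 16 below, shared by the SF_CLIP_VIEWPORT,
+ * CC_VIEWPORT, and SCISSOR_RECT state arrays.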
+ */ +#define ILO_STATE_VIEWPORT_MAX_COUNT 16 + +enum ilo_state_viewport_dirty_bits { + ILO_STATE_VIEWPORT_SF_CLIP_VIEWPORT = (1 << 0), + ILO_STATE_VIEWPORT_CC_VIEWPORT = (1 << 1), + ILO_STATE_VIEWPORT_SCISSOR_RECT = (1 << 2), +}; + +struct ilo_state_viewport_matrix_info { + float scale[3]; + float translate[3]; +}; + +struct ilo_state_viewport_scissor_info { + /* all inclusive */ + uint16_t min_x; + uint16_t min_y; + uint16_t max_x; + uint16_t max_y; +}; + +struct ilo_state_viewport_params_info { + const struct ilo_state_viewport_matrix_info *matrices; + const struct ilo_state_viewport_scissor_info *scissors; + uint8_t count; +}; + +struct ilo_state_viewport_info { + void *data; + size_t data_size; + + struct ilo_state_viewport_params_info params; +}; + +struct ilo_state_viewport { + void *data; + uint8_t array_size; + + uint8_t count; + uint32_t (*sf_clip)[16]; + uint32_t (*cc)[2]; + uint32_t (*scissor)[2]; +}; + +struct ilo_state_viewport_delta { + uint32_t dirty; +}; + +static inline size_t +ilo_state_viewport_data_size(const struct ilo_dev *dev, uint8_t array_size) +{ + const struct ilo_state_viewport *vp = NULL; + return (sizeof(vp->sf_clip[0]) + + sizeof(vp->cc[0]) + + sizeof(vp->scissor[0])) * array_size; +} + +bool +ilo_state_viewport_init(struct ilo_state_viewport *vp, + const struct ilo_dev *dev, + const struct ilo_state_viewport_info *info); + +bool +ilo_state_viewport_init_data_only(struct ilo_state_viewport *vp, + const struct ilo_dev *dev, + void *data, size_t data_size); + +bool +ilo_state_viewport_init_for_rectlist(struct ilo_state_viewport *vp, + const struct ilo_dev *dev, + void *data, size_t data_size); + +bool +ilo_state_viewport_set_params(struct ilo_state_viewport *vp, + const struct ilo_dev *dev, + const struct ilo_state_viewport_params_info *params, + bool scissors_only); + +void +ilo_state_viewport_full_delta(const struct ilo_state_viewport *vp, + const struct ilo_dev *dev, + struct ilo_state_viewport_delta *delta); + +void +ilo_state_viewport_get_delta(const struct ilo_state_viewport *vp, + const struct ilo_dev *dev, + const struct ilo_state_viewport *old, + struct ilo_state_viewport_delta *delta); + +#endif /* ILO_STATE_VIEWPORT_H */ diff --git a/src/gallium/drivers/ilo/core/ilo_state_zs.c b/src/gallium/drivers/ilo/core/ilo_state_zs.c new file mode 100644 index 00000000000..901fedb5599 --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_zs.c @@ -0,0 +1,727 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2012-2015 LunarG, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#include "intel_winsys.h" + +#include "ilo_debug.h" +#include "ilo_image.h" +#include "ilo_state_zs.h" + +static bool +zs_set_gen6_null_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs, + const struct ilo_dev *dev) +{ + const enum gen_depth_format format = GEN6_ZFORMAT_D32_FLOAT; + uint32_t dw1; + + ILO_DEV_ASSERT(dev, 6, 8); + + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + dw1 = GEN6_SURFTYPE_NULL << GEN7_DEPTH_DW1_TYPE__SHIFT | + format << GEN7_DEPTH_DW1_FORMAT__SHIFT; + } else { + dw1 = GEN6_SURFTYPE_NULL << GEN6_DEPTH_DW1_TYPE__SHIFT | + GEN6_TILING_Y << GEN6_DEPTH_DW1_TILING__SHIFT | + format << GEN6_DEPTH_DW1_FORMAT__SHIFT; + } + + STATIC_ASSERT(ARRAY_SIZE(zs->depth) >= 5); + zs->depth[0] = dw1; + zs->depth[1] = 0; + zs->depth[2] = 0; + zs->depth[3] = 0; + zs->depth[4] = 0; + + zs->depth_format = format; + + return true; +} + +static enum gen_surface_type +get_gen6_surface_type(const struct ilo_dev *dev, const struct ilo_image *img) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + switch (img->target) { + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_1D_ARRAY: + return GEN6_SURFTYPE_1D; + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_2D_ARRAY: + case PIPE_TEXTURE_CUBE_ARRAY: + return GEN6_SURFTYPE_2D; + case PIPE_TEXTURE_3D: + return GEN6_SURFTYPE_3D; + default: + assert(!"unknown texture target"); + return GEN6_SURFTYPE_NULL; + } +} + +static enum gen_depth_format +get_gen6_depth_format(const struct ilo_dev *dev, const struct ilo_image *img) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + switch (img->format) { + case PIPE_FORMAT_Z32_FLOAT: + return GEN6_ZFORMAT_D32_FLOAT; + case PIPE_FORMAT_Z24X8_UNORM: + return GEN6_ZFORMAT_D24_UNORM_X8_UINT; + case PIPE_FORMAT_Z16_UNORM: + return GEN6_ZFORMAT_D16_UNORM; + default: + assert(!"unknown depth format"); + return GEN6_ZFORMAT_D32_FLOAT; + } + } else { + switch (img->format) { + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return GEN6_ZFORMAT_D32_FLOAT_S8X24_UINT; + case PIPE_FORMAT_Z32_FLOAT: + return GEN6_ZFORMAT_D32_FLOAT; + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + return GEN6_ZFORMAT_D24_UNORM_S8_UINT; + case PIPE_FORMAT_Z24X8_UNORM: + return GEN6_ZFORMAT_D24_UNORM_X8_UINT; + case PIPE_FORMAT_Z16_UNORM: + return GEN6_ZFORMAT_D16_UNORM; + default: + assert(!"unknown depth format"); + return GEN6_ZFORMAT_D32_FLOAT; + } + } +} + +static bool +zs_validate_gen6(const struct ilo_dev *dev, + const struct ilo_state_zs_info *info) +{ + const struct ilo_image *img = (info->z_img) ? info->z_img : info->s_img; + + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 315: + * + * The stencil buffer has a format of S8_UINT, and shares Surface + * Type, Height, Width, and Depth, Minimum Array Element, Render + * Target View Extent, Depth Coordinate Offset X/Y, LOD, and Depth + * Buffer Object Control State fields of the depth buffer. 
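+ *
+ * (Editor's note: this is why a depth image and a stencil image given
+ * together must agree on target and extent, as asserted below.)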
+ */
+ if (info->z_img && info->s_img) {
+ assert(info->z_img->target == info->s_img->target &&
+ info->z_img->width0 == info->s_img->width0 &&
+ info->z_img->height0 == info->s_img->height0 &&
+ info->z_img->depth0 == info->s_img->depth0);
+ }
+
+ assert(info->level < img->level_count);
+ assert(img->bo_stride);
+
+ if (info->hiz_enable) {
+ assert(info->z_img &&
+ ilo_image_can_enable_aux(info->z_img, info->level));
+ }
+
+ if (info->is_cube_map) {
+ assert(get_gen6_surface_type(dev, img) == GEN6_SURFTYPE_2D);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 323:
+ *
+ * "For cube maps, Width must be set equal to Height."
+ */
+ assert(img->width0 == img->height0);
+ }
+
+ if (info->z_img)
+ assert(info->z_img->tiling == GEN6_TILING_Y);
+ if (info->s_img)
+ assert(info->s_img->tiling == GEN8_TILING_W);
+
+ return true;
+}
+
+static void
+get_gen6_max_extent(const struct ilo_dev *dev,
+ const struct ilo_image *img,
+ uint16_t *max_w, uint16_t *max_h)
+{
+ const uint16_t max_size = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 16384 : 8192;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ switch (get_gen6_surface_type(dev, img)) {
+ case GEN6_SURFTYPE_1D:
+ *max_w = max_size;
+ *max_h = 1;
+ break;
+ case GEN6_SURFTYPE_2D:
+ *max_w = max_size;
+ *max_h = max_size;
+ break;
+ case GEN6_SURFTYPE_3D:
+ *max_w = 2048;
+ *max_h = 2048;
+ break;
+ default:
+ assert(!"invalid surface type");
+ *max_w = 1;
+ *max_h = 1;
+ break;
+ }
+}
+
+static void
+get_gen6_hiz_alignments(const struct ilo_dev *dev,
+ const struct ilo_image *img,
+ uint16_t *align_w, uint16_t *align_h)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 313:
+ *
+ * "A rectangle primitive representing the clear area is delivered. The
+ * primitive must adhere to the following restrictions on size:
+ *
+ * - If Number of Multisamples is NUMSAMPLES_1, the rectangle must be
+ * aligned to an 8x4 pixel block relative to the upper left corner
+ * of the depth buffer, and contain an integer number of these pixel
+ * blocks, and all 8x4 pixels must be lit.
+ * - If Number of Multisamples is NUMSAMPLES_4, the rectangle must be
+ * aligned to a 4x2 pixel block (8x4 sample block) relative to the
+ * upper left corner of the depth buffer, and contain an integer
+ * number of these pixel blocks, and all samples of the 4x2 pixels
+ * must be lit
+ * - If Number of Multisamples is NUMSAMPLES_8, the rectangle must be
+ * aligned to a 2x2 pixel block (8x4 sample block) relative to the
+ * upper left corner of the depth buffer, and contain an integer
+ * number of these pixel blocks, and all samples of the 2x2 pixels
+ * must be lit."
+ *
+ * Experiments on Gen7.5 show that HiZ resolve also requires the rectangle
+ * to be aligned to 8x4 sample blocks. But to be on the safe side, we
+ * always require a level to be aligned when HiZ is enabled.
+ */
+ switch (img->sample_count) {
+ case 1:
+ *align_w = 8;
+ *align_h = 4;
+ break;
+ case 2:
+ *align_w = 4;
+ *align_h = 4;
+ break;
+ case 4:
+ *align_w = 4;
+ *align_h = 2;
+ break;
+ case 8:
+ *align_w = 2;
+ *align_h = 2;
+ break;
+ case 16:
+ *align_w = 2;
+ *align_h = 1;
+ break;
+ default:
+ assert(!"unknown sample count");
+ *align_w = 1;
+ *align_h = 1;
+ break;
+ }
+}
+
+static bool
+zs_get_gen6_depth_extent(const struct ilo_dev *dev,
+ const struct ilo_state_zs_info *info,
+ uint16_t *width, uint16_t *height)
+{
+ const struct ilo_image *img = (info->z_img) ?
info->z_img : info->s_img;
+ uint16_t w, h, max_w, max_h;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ w = img->width0;
+ h = img->height0;
+
+ if (info->hiz_enable) {
+ uint16_t align_w, align_h;
+
+ get_gen6_hiz_alignments(dev, info->z_img, &align_w, &align_h);
+
+ /*
+ * We want to force 8x4 alignment, but we can do so only for level 0 and
+ * only when it is padded. ilo_image should know all these.
+ */
+ if (info->level)
+ assert(w % align_w == 0 && h % align_h == 0);
+
+ w = align(w, align_w);
+ h = align(h, align_h);
+ }
+
+ get_gen6_max_extent(dev, img, &max_w, &max_h);
+ assert(w && h && w <= max_w && h <= max_h);
+
+ *width = w - 1;
+ *height = h - 1;
+
+ return true;
+}
+
+static bool
+zs_get_gen6_depth_slices(const struct ilo_dev *dev,
+ const struct ilo_state_zs_info *info,
+ uint16_t *depth, uint16_t *min_array_elem,
+ uint16_t *rt_view_extent)
+{
+ const struct ilo_image *img = (info->z_img) ? info->z_img : info->s_img;
+ uint16_t max_slice, d;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 325:
+ *
+ * "This field (Depth) specifies the total number of levels for a
+ * volume texture or the number of array elements allowed to be
+ * accessed starting at the Minimum Array Element for arrayed
+ * surfaces. If the volume texture is MIP-mapped, this field specifies
+ * the depth of the base MIP level."
+ */
+ switch (get_gen6_surface_type(dev, img)) {
+ case GEN6_SURFTYPE_1D:
+ case GEN6_SURFTYPE_2D:
+ max_slice = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 2048 : 512;
+
+ assert(img->array_size <= max_slice);
+ max_slice = img->array_size;
+
+ d = info->slice_count;
+ if (info->is_cube_map) {
+ /*
+ * Minimum Array Element and Depth must be 0; Render Target View
+ * Extent is ignored.
+ */
+ if (info->slice_base || d != 6) {
+ ilo_warn("no cube array depth buffer\n");
+ return false;
+ }
+
+ d /= 6;
+ }
+ break;
+ case GEN6_SURFTYPE_3D:
+ max_slice = 2048;
+
+ assert(img->depth0 <= max_slice);
+ max_slice = u_minify(img->depth0, info->level);
+
+ d = img->depth0;
+ break;
+ default:
+ assert(!"invalid surface type");
+ return false;
+ }
+
+ if (!info->slice_count ||
+ info->slice_base + info->slice_count > max_slice) {
+ ilo_warn("invalid slice range\n");
+ return false;
+ }
+
+ assert(d);
+ *depth = d - 1;
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 325:
+ *
+ * "For 1D and 2D Surfaces:
+ * This field (Minimum Array Element) indicates the minimum array
+ * element that can be accessed as part of this surface. The delivered
+ * array index is added to this field before being used to address the
+ * surface.
+ *
+ * For 3D Surfaces:
+ * This field indicates the minimum `R' coordinate on the LOD
+ * currently being rendered to. This field is added to the delivered
+ * array index before it is used to address the surface.
+ *
+ * For Other Surfaces:
+ * This field is ignored."
+ */
+ *min_array_elem = info->slice_base;
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 326:
+ *
+ * "For 3D Surfaces:
+ * This field (Render Target View Extent) indicates the extent of the
+ * accessible `R' coordinates minus 1 on the LOD currently being
+ * rendered to.
+ *
+ * For 1D and 2D Surfaces:
+ * This field must be set to the same value as the Depth field.
+ *
+ * For Other Surfaces:
+ * This field is ignored."
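+ *
+ * (Editor's note: for 1D/2D surfaces the value below works out equal to
+ * the Depth field programmed above, as required; for cube maps it is
+ * ignored.)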
+ */ + *rt_view_extent = info->slice_count - 1; + + return true; +} + +static bool +zs_set_gen6_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs, + const struct ilo_dev *dev, + const struct ilo_state_zs_info *info) +{ + uint16_t width, height, depth, array_base, view_extent; + enum gen_surface_type type; + enum gen_depth_format format; + uint32_t dw1, dw2, dw3, dw4; + + ILO_DEV_ASSERT(dev, 6, 6); + + if (!zs_validate_gen6(dev, info) || + !zs_get_gen6_depth_extent(dev, info, &width, &height) || + !zs_get_gen6_depth_slices(dev, info, &depth, &array_base, + &view_extent)) + return false; + + type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE : + (info->z_img) ? get_gen6_surface_type(dev, info->z_img) : + get_gen6_surface_type(dev, info->s_img); + + format = (info->z_img) ? get_gen6_depth_format(dev, info->z_img) : + GEN6_ZFORMAT_D32_FLOAT; + + /* + * From the Ironlake PRM, volume 2 part 1, page 330: + * + * "If this field (Separate Stencil Buffer Enable) is disabled, the + * Surface Format of the depth buffer cannot be D24_UNORM_X8_UINT." + * + * From the Sandy Bridge PRM, volume 2 part 1, page 321: + * + * "[DevSNB]: This field (Separate Stencil Buffer Enable) must be set + * to the same value (enabled or disabled) as Hierarchical Depth + * Buffer Enable." + */ + if (!info->hiz_enable && format == GEN6_ZFORMAT_D24_UNORM_X8_UINT) + format = GEN6_ZFORMAT_D24_UNORM_S8_UINT; + + /* info->z_readonly and info->s_readonly are ignored on Gen6 */ + dw1 = type << GEN6_DEPTH_DW1_TYPE__SHIFT | + GEN6_TILING_Y << GEN6_DEPTH_DW1_TILING__SHIFT | + format << GEN6_DEPTH_DW1_FORMAT__SHIFT; + + if (info->z_img) + dw1 |= (info->z_img->bo_stride - 1) << GEN6_DEPTH_DW1_PITCH__SHIFT; + + if (info->hiz_enable || !info->z_img) { + dw1 |= GEN6_DEPTH_DW1_HIZ_ENABLE | + GEN6_DEPTH_DW1_SEPARATE_STENCIL; + } + + dw2 = 0; + dw3 = height << GEN6_DEPTH_DW3_HEIGHT__SHIFT | + width << GEN6_DEPTH_DW3_WIDTH__SHIFT | + info->level << GEN6_DEPTH_DW3_LOD__SHIFT | + GEN6_DEPTH_DW3_MIPLAYOUT_BELOW; + dw4 = depth << GEN6_DEPTH_DW4_DEPTH__SHIFT | + array_base << GEN6_DEPTH_DW4_MIN_ARRAY_ELEMENT__SHIFT | + view_extent << GEN6_DEPTH_DW4_RT_VIEW_EXTENT__SHIFT; + + STATIC_ASSERT(ARRAY_SIZE(zs->depth) >= 5); + zs->depth[0] = dw1; + zs->depth[1] = dw2; + zs->depth[2] = dw3; + zs->depth[3] = dw4; + zs->depth[4] = 0; + + zs->depth_format = format; + + return true; +} + +static bool +zs_set_gen7_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs, + const struct ilo_dev *dev, + const struct ilo_state_zs_info *info) +{ + enum gen_surface_type type; + enum gen_depth_format format; + uint16_t width, height, depth; + uint16_t array_base, view_extent; + uint32_t dw1, dw2, dw3, dw4, dw6; + + ILO_DEV_ASSERT(dev, 7, 8); + + if (!zs_validate_gen6(dev, info) || + !zs_get_gen6_depth_extent(dev, info, &width, &height) || + !zs_get_gen6_depth_slices(dev, info, &depth, &array_base, + &view_extent)) + return false; + + type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE : + (info->z_img) ? get_gen6_surface_type(dev, info->z_img) : + get_gen6_surface_type(dev, info->s_img); + + format = (info->z_img) ? 
get_gen6_depth_format(dev, info->z_img) :
+ GEN6_ZFORMAT_D32_FLOAT;
+
+ dw1 = type << GEN7_DEPTH_DW1_TYPE__SHIFT |
+ format << GEN7_DEPTH_DW1_FORMAT__SHIFT;
+
+ if (info->z_img) {
+ if (!info->z_readonly)
+ dw1 |= GEN7_DEPTH_DW1_DEPTH_WRITE_ENABLE;
+ if (info->hiz_enable)
+ dw1 |= GEN7_DEPTH_DW1_HIZ_ENABLE;
+
+ dw1 |= (info->z_img->bo_stride - 1) << GEN7_DEPTH_DW1_PITCH__SHIFT;
+ }
+
+ if (info->s_img && !info->s_readonly)
+ dw1 |= GEN7_DEPTH_DW1_STENCIL_WRITE_ENABLE;
+
+ dw2 = 0;
+ dw3 = height << GEN7_DEPTH_DW3_HEIGHT__SHIFT |
+ width << GEN7_DEPTH_DW3_WIDTH__SHIFT |
+ info->level << GEN7_DEPTH_DW3_LOD__SHIFT;
+ dw4 = depth << GEN7_DEPTH_DW4_DEPTH__SHIFT |
+ array_base << GEN7_DEPTH_DW4_MIN_ARRAY_ELEMENT__SHIFT;
+ dw6 = view_extent << GEN7_DEPTH_DW6_RT_VIEW_EXTENT__SHIFT;
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(8) && info->z_img) {
+ assert(info->z_img->walk_layer_height % 4 == 0);
+ /* note that DW is off-by-one for Gen8+ */
+ dw6 |= (info->z_img->walk_layer_height / 4) <<
+ GEN8_DEPTH_DW7_QPITCH__SHIFT;
+ }
+
+ STATIC_ASSERT(ARRAY_SIZE(zs->depth) >= 5);
+ zs->depth[0] = dw1;
+ zs->depth[1] = dw2;
+ zs->depth[2] = dw3;
+ zs->depth[3] = dw4;
+ zs->depth[4] = dw6;
+
+ zs->depth_format = format;
+
+ return true;
+}
+
+static bool
+zs_set_gen6_null_3DSTATE_STENCIL_BUFFER(struct ilo_state_zs *zs,
+ const struct ilo_dev *dev)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ STATIC_ASSERT(ARRAY_SIZE(zs->stencil) >= 3);
+ zs->stencil[0] = 0;
+ zs->stencil[1] = 0;
+ if (ilo_dev_gen(dev) >= ILO_GEN(8))
+ zs->stencil[2] = 0;
+
+ return true;
+}
+
+static bool
+zs_set_gen6_3DSTATE_STENCIL_BUFFER(struct ilo_state_zs *zs,
+ const struct ilo_dev *dev,
+ const struct ilo_state_zs_info *info)
+{
+ const struct ilo_image *img = info->s_img;
+ uint32_t dw1, dw2;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ assert(img->bo_stride);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 329:
+ *
+ * "The pitch must be set to 2x the value computed based on width, as
+ * the stencil buffer is stored with two rows interleaved."
+ *
+ * For Gen7+, we still double the stride because we did not double the
+ * slice widths when initializing ilo_image.
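+ *
+ * (Editor's illustration: with a bo_stride of 128 bytes, the field below
+ * is programmed as (128 * 2 - 1), i.e. a hardware pitch of 256 bytes.)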
+ */ + dw1 = (img->bo_stride * 2 - 1) << GEN6_STENCIL_DW1_PITCH__SHIFT; + + if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) + dw1 |= GEN75_STENCIL_DW1_STENCIL_BUFFER_ENABLE; + + dw2 = 0; + /* offset to the level as Gen6 does not support mipmapped stencil */ + if (ilo_dev_gen(dev) == ILO_GEN(6)) { + unsigned x, y; + + ilo_image_get_slice_pos(img, info->level, 0, &x, &y); + ilo_image_pos_to_mem(img, x, y, &x, &y); + dw2 |= ilo_image_mem_to_raw(img, x, y); + } + + STATIC_ASSERT(ARRAY_SIZE(zs->stencil) >= 3); + zs->stencil[0] = dw1; + zs->stencil[1] = dw2; + + if (ilo_dev_gen(dev) >= ILO_GEN(8)) { + uint32_t dw4; + + assert(img->walk_layer_height % 4 == 0); + dw4 = (img->walk_layer_height / 4) << GEN8_STENCIL_DW4_QPITCH__SHIFT; + + zs->stencil[2] = dw4; + } + + return true; +} + +static bool +zs_set_gen6_null_3DSTATE_HIER_DEPTH_BUFFER(struct ilo_state_zs *zs, + const struct ilo_dev *dev) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + STATIC_ASSERT(ARRAY_SIZE(zs->hiz) >= 3); + zs->hiz[0] = 0; + zs->hiz[1] = 0; + if (ilo_dev_gen(dev) >= ILO_GEN(8)) + zs->hiz[2] = 0; + + return true; +} + +static bool +zs_set_gen6_3DSTATE_HIER_DEPTH_BUFFER(struct ilo_state_zs *zs, + const struct ilo_dev *dev, + const struct ilo_state_zs_info *info) +{ + const struct ilo_image *img = info->z_img; + uint32_t dw1, dw2; + + ILO_DEV_ASSERT(dev, 6, 8); + + assert(img->aux.bo_stride); + + dw1 = (img->aux.bo_stride - 1) << GEN6_HIZ_DW1_PITCH__SHIFT; + + dw2 = 0; + /* offset to the level as Gen6 does not support mipmapped HiZ */ + if (ilo_dev_gen(dev) == ILO_GEN(6)) + dw2 |= img->aux.walk_lod_offsets[info->level]; + + STATIC_ASSERT(ARRAY_SIZE(zs->hiz) >= 3); + zs->hiz[0] = dw1; + zs->hiz[1] = dw2; + + if (ilo_dev_gen(dev) >= ILO_GEN(8)) { + uint32_t dw4; + + assert(img->aux.walk_layer_height % 4 == 0); + dw4 = (img->aux.walk_layer_height / 4) << GEN8_HIZ_DW4_QPITCH__SHIFT; + + zs->hiz[2] = dw4; + } + + return true; +} + +bool +ilo_state_zs_init(struct ilo_state_zs *zs, const struct ilo_dev *dev, + const struct ilo_state_zs_info *info) +{ + bool ret = true; + + assert(ilo_is_zeroed(zs, sizeof(*zs))); + + if (info->z_img || info->s_img) { + if (ilo_dev_gen(dev) >= ILO_GEN(7)) + ret &= zs_set_gen7_3DSTATE_DEPTH_BUFFER(zs, dev, info); + else + ret &= zs_set_gen6_3DSTATE_DEPTH_BUFFER(zs, dev, info); + } else { + ret &= zs_set_gen6_null_3DSTATE_DEPTH_BUFFER(zs, dev); + } + + if (info->s_img) + ret &= zs_set_gen6_3DSTATE_STENCIL_BUFFER(zs, dev, info); + else + ret &= zs_set_gen6_null_3DSTATE_STENCIL_BUFFER(zs, dev); + + if (info->z_img && info->hiz_enable) + ret &= zs_set_gen6_3DSTATE_HIER_DEPTH_BUFFER(zs, dev, info); + else + ret &= zs_set_gen6_null_3DSTATE_HIER_DEPTH_BUFFER(zs, dev); + + zs->z_readonly = info->z_readonly; + zs->s_readonly = info->s_readonly; + + assert(ret); + + return ret; +} + +bool +ilo_state_zs_init_for_null(struct ilo_state_zs *zs, + const struct ilo_dev *dev) +{ + struct ilo_state_zs_info info; + + memset(&info, 0, sizeof(info)); + + return ilo_state_zs_init(zs, dev, &info); +} + +bool +ilo_state_zs_disable_hiz(struct ilo_state_zs *zs, + const struct ilo_dev *dev) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * Separate stencil must be disabled simultaneously on Gen6. We can make + * it work when there is no stencil buffer, but it is probably not worth + * it. 
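+ *
+ * (Editor's note: hence the Gen7+ assert below; on Gen6 the caller would
+ * have to rebuild the state without HiZ via ilo_state_zs_init().)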
+ */ + assert(ilo_dev_gen(dev) >= ILO_GEN(7)); + + zs->depth[0] &= ~GEN7_DEPTH_DW1_HIZ_ENABLE; + zs_set_gen6_null_3DSTATE_HIER_DEPTH_BUFFER(zs, dev); + + return true; +} diff --git a/src/gallium/drivers/ilo/core/ilo_state_zs.h b/src/gallium/drivers/ilo/core/ilo_state_zs.h new file mode 100644 index 00000000000..98212daf74f --- /dev/null +++ b/src/gallium/drivers/ilo/core/ilo_state_zs.h @@ -0,0 +1,93 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2015 LunarG, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#ifndef ILO_STATE_ZS_H +#define ILO_STATE_ZS_H + +#include "genhw/genhw.h" +#include "intel_winsys.h" + +#include "ilo_core.h" +#include "ilo_dev.h" + +struct ilo_image; + +struct ilo_state_zs_info { + /* both are optional */ + const struct ilo_image *z_img; + const struct ilo_image *s_img; + + /* ignored prior to Gen7 */ + bool z_readonly; + bool s_readonly; + + bool hiz_enable; + bool is_cube_map; + + uint8_t level; + uint16_t slice_base; + uint16_t slice_count; +}; + +struct ilo_state_zs { + uint32_t depth[5]; + uint32_t stencil[3]; + uint32_t hiz[3]; + + /* TODO move this to ilo_image */ + enum gen_depth_format depth_format; + + bool z_readonly; + bool s_readonly; + + /* managed by users */ + struct intel_bo *depth_bo; + struct intel_bo *stencil_bo; + struct intel_bo *hiz_bo; +}; + +bool +ilo_state_zs_init(struct ilo_state_zs *zs, + const struct ilo_dev *dev, + const struct ilo_state_zs_info *info); + +bool +ilo_state_zs_init_for_null(struct ilo_state_zs *zs, + const struct ilo_dev *dev); + +bool +ilo_state_zs_disable_hiz(struct ilo_state_zs *zs, + const struct ilo_dev *dev); + +static inline enum gen_depth_format +ilo_state_zs_get_depth_format(const struct ilo_state_zs *zs, + const struct ilo_dev *dev) +{ + return zs->depth_format; +} + +#endif /* ILO_STATE_ZS_H */ diff --git a/src/gallium/drivers/ilo/genhw/gen_mi.xml.h b/src/gallium/drivers/ilo/genhw/gen_mi.xml.h index 24d726adcb3..5a0bb4f8d77 100644 --- a/src/gallium/drivers/ilo/genhw/gen_mi.xml.h +++ b/src/gallium/drivers/ilo/genhw/gen_mi.xml.h @@ -97,6 +97,9 @@ enum gen_mi_alu_operand { #define GEN6_MI_LENGTH__MASK 0x0000003f #define GEN6_MI_LENGTH__SHIFT 0 #define GEN6_MI_NOOP__SIZE 1 +#define GEN6_MI_NOOP_DW0_WRITE_NOPID (0x1 << 22) +#define GEN6_MI_NOOP_DW0_VALUE__MASK 0x003fffff +#define GEN6_MI_NOOP_DW0_VALUE__SHIFT 0 #define GEN75_MI_SET_PREDICATE__SIZE 1 #define GEN75_MI_SET_PREDICATE_DW0_PREDICATE__MASK 0x00000003 diff 
--git a/src/gallium/drivers/ilo/genhw/gen_regs.xml.h b/src/gallium/drivers/ilo/genhw/gen_regs.xml.h index 2bdd72b29bc..c51e4f78bc0 100644 --- a/src/gallium/drivers/ilo/genhw/gen_regs.xml.h +++ b/src/gallium/drivers/ilo/genhw/gen_regs.xml.h @@ -35,6 +35,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define GEN6_REG_MASK__MASK 0xffff0000 #define GEN6_REG_MASK__SHIFT 16 #define GEN6_REG__SIZE 0x400000 +#define GEN6_REG_NOPID 0x2094 + #define GEN7_REG_HS_INVOCATION_COUNT 0x2300 #define GEN7_REG_DS_INVOCATION_COUNT 0x2308 diff --git a/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h b/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h index d25542e8cc2..52173fe5d07 100644 --- a/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h +++ b/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h @@ -32,7 +32,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -enum gen_prim_type { +enum gen_3dprim_type { GEN6_3DPRIM_POINTLIST = 0x1, GEN6_3DPRIM_LINELIST = 0x2, GEN6_3DPRIM_LINESTRIP = 0x3, @@ -105,6 +105,12 @@ enum gen_state_alignment { GEN8_ALIGNMENT_SURFACE_STATE = 0x40, }; +enum gen_index_format { + GEN6_INDEX_BYTE = 0x0, + GEN6_INDEX_WORD = 0x1, + GEN6_INDEX_DWORD = 0x2, +}; + enum gen_vf_component { GEN6_VFCOMP_NOSTORE = 0x0, GEN6_VFCOMP_STORE_SRC = 0x1, @@ -123,6 +129,87 @@ enum gen_depth_format { GEN6_ZFORMAT_D16_UNORM = 0x5, }; +enum gen_reorder_mode { + GEN7_REORDER_LEADING = 0x0, + GEN7_REORDER_TRAILING = 0x1, +}; + +enum gen_clip_mode { + GEN6_CLIPMODE_NORMAL = 0x0, + GEN6_CLIPMODE_REJECT_ALL = 0x3, + GEN6_CLIPMODE_ACCEPT_ALL = 0x4, +}; + +enum gen_front_winding { + GEN6_FRONTWINDING_CW = 0x0, + GEN6_FRONTWINDING_CCW = 0x1, +}; + +enum gen_fill_mode { + GEN6_FILLMODE_SOLID = 0x0, + GEN6_FILLMODE_WIREFRAME = 0x1, + GEN6_FILLMODE_POINT = 0x2, +}; + +enum gen_cull_mode { + GEN6_CULLMODE_BOTH = 0x0, + GEN6_CULLMODE_NONE = 0x1, + GEN6_CULLMODE_FRONT = 0x2, + GEN6_CULLMODE_BACK = 0x3, +}; + +enum gen_pixel_location { + GEN6_PIXLOC_CENTER = 0x0, + GEN6_PIXLOC_UL_CORNER = 0x1, +}; + +enum gen_sample_count { + GEN6_NUMSAMPLES_1 = 0x0, + GEN8_NUMSAMPLES_2 = 0x1, + GEN6_NUMSAMPLES_4 = 0x2, + GEN7_NUMSAMPLES_8 = 0x3, + GEN8_NUMSAMPLES_16 = 0x4, +}; + +enum gen_inputattr_select { + GEN6_INPUTATTR_NORMAL = 0x0, + GEN6_INPUTATTR_FACING = 0x1, + GEN6_INPUTATTR_W = 0x2, + GEN6_INPUTATTR_FACING_W = 0x3, +}; + +enum gen_zw_interp { + GEN6_ZW_INTERP_PIXEL = 0x0, + GEN6_ZW_INTERP_CENTROID = 0x2, + GEN6_ZW_INTERP_SAMPLE = 0x3, +}; + +enum gen_position_offset { + GEN6_POSOFFSET_NONE = 0x0, + GEN6_POSOFFSET_CENTROID = 0x2, + GEN6_POSOFFSET_SAMPLE = 0x3, +}; + +enum gen_edsc_mode { + GEN7_EDSC_NORMAL = 0x0, + GEN7_EDSC_PSEXEC = 0x1, + GEN7_EDSC_PREPS = 0x2, +}; + +enum gen_pscdepth_mode { + GEN7_PSCDEPTH_OFF = 0x0, + GEN7_PSCDEPTH_ON = 0x1, + GEN7_PSCDEPTH_ON_GE = 0x2, + GEN7_PSCDEPTH_ON_LE = 0x3, +}; + +enum gen_msrast_mode { + GEN6_MSRASTMODE_OFF_PIXEL = 0x0, + GEN6_MSRASTMODE_OFF_PATTERN = 0x1, + GEN6_MSRASTMODE_ON_PIXEL = 0x2, + GEN6_MSRASTMODE_ON_PATTERN = 0x3, +}; + #define GEN6_INTERP_NONPERSPECTIVE_SAMPLE (0x1 << 5) #define GEN6_INTERP_NONPERSPECTIVE_CENTROID (0x1 << 4) #define GEN6_INTERP_NONPERSPECTIVE_PIXEL (0x1 << 3) @@ -285,9 +372,6 @@ enum gen_depth_format { #define GEN6_IB_DW0_CUT_INDEX_ENABLE (0x1 << 10) #define GEN6_IB_DW0_FORMAT__MASK 0x00000300 #define GEN6_IB_DW0_FORMAT__SHIFT 8 -#define GEN6_IB_DW0_FORMAT_BYTE (0x0 << 8) -#define GEN6_IB_DW0_FORMAT_WORD (0x1 << 8) -#define GEN6_IB_DW0_FORMAT_DWORD (0x2 << 8) @@ -295,9 +379,6 @@ enum gen_depth_format { #define 
GEN8_IB_DW1_FORMAT__MASK 0x00000300 #define GEN8_IB_DW1_FORMAT__SHIFT 8 -#define GEN8_IB_DW1_FORMAT_BYTE (0x0 << 8) -#define GEN8_IB_DW1_FORMAT_WORD (0x1 << 8) -#define GEN8_IB_DW1_FORMAT_DWORD (0x2 << 8) #define GEN8_IB_DW1_MOCS__MASK 0x0000007f #define GEN8_IB_DW1_MOCS__SHIFT 0 @@ -313,8 +394,8 @@ enum gen_depth_format { #define GEN8_INSTANCING_DW1_ENABLE (0x1 << 8) -#define GEN8_INSTANCING_DW1_VB_INDEX__MASK 0x0000003f -#define GEN8_INSTANCING_DW1_VB_INDEX__SHIFT 0 +#define GEN8_INSTANCING_DW1_VE_INDEX__MASK 0x0000003f +#define GEN8_INSTANCING_DW1_VE_INDEX__SHIFT 0 #define GEN8_3DSTATE_VF_SGVS__SIZE 2 @@ -614,7 +695,7 @@ enum gen_depth_format { #define GEN6_GS_DW5_SO_STATISTICS (0x1 << 9) #define GEN6_GS_DW5_RENDER_ENABLE (0x1 << 8) -#define GEN6_GS_DW6_REORDER_ENABLE (0x1 << 30) +#define GEN6_GS_DW6_REORDER_LEADING_ENABLE (0x1 << 30) #define GEN6_GS_DW6_DISCARD_ADJACENCY (0x1 << 29) #define GEN6_GS_DW6_SVBI_PAYLOAD_ENABLE (0x1 << 28) #define GEN6_GS_DW6_SVBI_POST_INC_ENABLE (0x1 << 27) @@ -666,11 +747,9 @@ enum gen_depth_format { #define GEN7_GS_DW5_INVOCATION_INCR__SHIFT 5 #define GEN7_GS_DW5_INCLUDE_PRIMITIVE_ID (0x1 << 4) #define GEN7_GS_DW5_HINT (0x1 << 3) -#define GEN7_GS_DW5_REORDER_ENABLE (0x1 << 2) -#define GEN75_GS_DW5_REORDER__MASK 0x00000004 -#define GEN75_GS_DW5_REORDER__SHIFT 2 -#define GEN75_GS_DW5_REORDER_LEADING (0x0 << 2) -#define GEN75_GS_DW5_REORDER_TRAILING (0x1 << 2) +#define GEN7_GS_DW5_REORDER_LEADING_ENABLE (0x1 << 2) +#define GEN75_GS_DW5_REORDER_MODE__MASK 0x00000004 +#define GEN75_GS_DW5_REORDER_MODE__SHIFT 2 #define GEN7_GS_DW5_DISCARD_ADJACENCY (0x1 << 1) #define GEN7_GS_DW5_GS_ENABLE (0x1 << 0) @@ -727,10 +806,8 @@ enum gen_depth_format { #define GEN8_GS_DW7_INVOCATION_INCR__SHIFT 5 #define GEN8_GS_DW7_INCLUDE_PRIMITIVE_ID (0x1 << 4) #define GEN8_GS_DW7_HINT (0x1 << 3) -#define GEN8_GS_DW7_REORDER__MASK 0x00000004 -#define GEN8_GS_DW7_REORDER__SHIFT 2 -#define GEN8_GS_DW7_REORDER_LEADING (0x0 << 2) -#define GEN8_GS_DW7_REORDER_TRAILING (0x1 << 2) +#define GEN8_GS_DW7_REORDER_MODE__MASK 0x00000004 +#define GEN8_GS_DW7_REORDER_MODE__SHIFT 2 #define GEN8_GS_DW7_DISCARD_ADJACENCY (0x1 << 1) #define GEN8_GS_DW7_GS_ENABLE (0x1 << 0) @@ -758,10 +835,8 @@ enum gen_depth_format { #define GEN7_SO_DW1_RENDER_DISABLE (0x1 << 30) #define GEN7_SO_DW1_RENDER_STREAM_SELECT__MASK 0x18000000 #define GEN7_SO_DW1_RENDER_STREAM_SELECT__SHIFT 27 -#define GEN7_SO_DW1_REORDER__MASK 0x04000000 -#define GEN7_SO_DW1_REORDER__SHIFT 26 -#define GEN7_SO_DW1_REORDER_LEADING (0x0 << 26) -#define GEN7_SO_DW1_REORDER_TRAILING (0x1 << 26) +#define GEN7_SO_DW1_REORDER_MODE__MASK 0x04000000 +#define GEN7_SO_DW1_REORDER_MODE__SHIFT 26 #define GEN7_SO_DW1_STATISTICS (0x1 << 25) #define GEN7_SO_DW1_BUFFER_ENABLES__MASK 0x00000f00 #define GEN7_SO_DW1_BUFFER_ENABLES__SHIFT 8 @@ -862,21 +937,15 @@ enum gen_depth_format { #define GEN6_3DSTATE_CLIP__SIZE 4 -#define GEN7_CLIP_DW1_FRONTWINDING__MASK 0x00100000 -#define GEN7_CLIP_DW1_FRONTWINDING__SHIFT 20 -#define GEN7_CLIP_DW1_FRONTWINDING_CW (0x0 << 20) -#define GEN7_CLIP_DW1_FRONTWINDING_CCW (0x1 << 20) +#define GEN7_CLIP_DW1_FRONT_WINDING__MASK 0x00100000 +#define GEN7_CLIP_DW1_FRONT_WINDING__SHIFT 20 #define GEN7_CLIP_DW1_SUBPIXEL__MASK 0x00080000 #define GEN7_CLIP_DW1_SUBPIXEL__SHIFT 19 #define GEN7_CLIP_DW1_SUBPIXEL_8BITS (0x0 << 19) #define GEN7_CLIP_DW1_SUBPIXEL_4BITS (0x1 << 19) #define GEN7_CLIP_DW1_EARLY_CULL_ENABLE (0x1 << 18) -#define GEN7_CLIP_DW1_CULLMODE__MASK 0x00030000 -#define GEN7_CLIP_DW1_CULLMODE__SHIFT 16 -#define 
GEN7_CLIP_DW1_CULLMODE_BOTH (0x0 << 16) -#define GEN7_CLIP_DW1_CULLMODE_NONE (0x1 << 16) -#define GEN7_CLIP_DW1_CULLMODE_FRONT (0x2 << 16) -#define GEN7_CLIP_DW1_CULLMODE_BACK (0x3 << 16) +#define GEN7_CLIP_DW1_CULL_MODE__MASK 0x00030000 +#define GEN7_CLIP_DW1_CULL_MODE__SHIFT 16 #define GEN6_CLIP_DW1_STATISTICS (0x1 << 10) #define GEN6_CLIP_DW1_UCP_CULL_ENABLES__MASK 0x000000ff #define GEN6_CLIP_DW1_UCP_CULL_ENABLES__SHIFT 0 @@ -891,11 +960,8 @@ enum gen_depth_format { #define GEN6_CLIP_DW2_GB_TEST_ENABLE (0x1 << 26) #define GEN6_CLIP_DW2_UCP_CLIP_ENABLES__MASK 0x00ff0000 #define GEN6_CLIP_DW2_UCP_CLIP_ENABLES__SHIFT 16 -#define GEN6_CLIP_DW2_CLIPMODE__MASK 0x0000e000 -#define GEN6_CLIP_DW2_CLIPMODE__SHIFT 13 -#define GEN6_CLIP_DW2_CLIPMODE_NORMAL (0x0 << 13) -#define GEN6_CLIP_DW2_CLIPMODE_REJECT_ALL (0x3 << 13) -#define GEN6_CLIP_DW2_CLIPMODE_ACCEPT_ALL (0x4 << 13) +#define GEN6_CLIP_DW2_CLIP_MODE__MASK 0x0000e000 +#define GEN6_CLIP_DW2_CLIP_MODE__SHIFT 13 #define GEN6_CLIP_DW2_PERSPECTIVE_DIVIDE_DISABLE (0x1 << 9) #define GEN6_CLIP_DW2_NONPERSPECTIVE_BARYCENTRIC_ENABLE (0x1 << 8) #define GEN6_CLIP_DW2_TRI_PROVOKE__MASK 0x00000030 @@ -911,7 +977,7 @@ enum gen_depth_format { #define GEN6_CLIP_DW3_MAX_POINT_WIDTH__MASK 0x0001ffc0 #define GEN6_CLIP_DW3_MAX_POINT_WIDTH__SHIFT 6 #define GEN6_CLIP_DW3_MAX_POINT_WIDTH__RADIX 3 -#define GEN6_CLIP_DW3_RTAINDEX_FORCED_ZERO (0x1 << 5) +#define GEN6_CLIP_DW3_FORCE_RTAINDEX_ZERO (0x1 << 5) #define GEN6_CLIP_DW3_MAX_VPINDEX__MASK 0x0000000f #define GEN6_CLIP_DW3_MAX_VPINDEX__SHIFT 0 @@ -927,29 +993,17 @@ enum gen_depth_format { #define GEN7_SF_DW1_DEPTH_OFFSET_SOLID (0x1 << 9) #define GEN7_SF_DW1_DEPTH_OFFSET_WIREFRAME (0x1 << 8) #define GEN7_SF_DW1_DEPTH_OFFSET_POINT (0x1 << 7) -#define GEN7_SF_DW1_FRONTFACE__MASK 0x00000060 -#define GEN7_SF_DW1_FRONTFACE__SHIFT 5 -#define GEN7_SF_DW1_FRONTFACE_SOLID (0x0 << 5) -#define GEN7_SF_DW1_FRONTFACE_WIREFRAME (0x1 << 5) -#define GEN7_SF_DW1_FRONTFACE_POINT (0x2 << 5) -#define GEN7_SF_DW1_BACKFACE__MASK 0x00000018 -#define GEN7_SF_DW1_BACKFACE__SHIFT 3 -#define GEN7_SF_DW1_BACKFACE_SOLID (0x0 << 3) -#define GEN7_SF_DW1_BACKFACE_WIREFRAME (0x1 << 3) -#define GEN7_SF_DW1_BACKFACE_POINT (0x2 << 3) -#define GEN7_SF_DW1_VIEWPORT_ENABLE (0x1 << 1) -#define GEN7_SF_DW1_FRONTWINDING__MASK 0x00000001 -#define GEN7_SF_DW1_FRONTWINDING__SHIFT 0 -#define GEN7_SF_DW1_FRONTWINDING_CW 0x0 -#define GEN7_SF_DW1_FRONTWINDING_CCW 0x1 +#define GEN7_SF_DW1_FILL_MODE_FRONT__MASK 0x00000060 +#define GEN7_SF_DW1_FILL_MODE_FRONT__SHIFT 5 +#define GEN7_SF_DW1_FILL_MODE_BACK__MASK 0x00000018 +#define GEN7_SF_DW1_FILL_MODE_BACK__SHIFT 3 +#define GEN7_SF_DW1_VIEWPORT_TRANSFORM (0x1 << 1) +#define GEN7_SF_DW1_FRONT_WINDING__MASK 0x00000001 +#define GEN7_SF_DW1_FRONT_WINDING__SHIFT 0 #define GEN7_SF_DW2_AA_LINE_ENABLE (0x1 << 31) -#define GEN7_SF_DW2_CULLMODE__MASK 0x60000000 -#define GEN7_SF_DW2_CULLMODE__SHIFT 29 -#define GEN7_SF_DW2_CULLMODE_BOTH (0x0 << 29) -#define GEN7_SF_DW2_CULLMODE_NONE (0x1 << 29) -#define GEN7_SF_DW2_CULLMODE_FRONT (0x2 << 29) -#define GEN7_SF_DW2_CULLMODE_BACK (0x3 << 29) +#define GEN7_SF_DW2_CULL_MODE__MASK 0x60000000 +#define GEN7_SF_DW2_CULL_MODE__SHIFT 29 #define GEN7_SF_DW2_LINE_WIDTH__MASK 0x0ffc0000 #define GEN7_SF_DW2_LINE_WIDTH__SHIFT 18 #define GEN7_SF_DW2_LINE_WIDTH__RADIX 7 @@ -963,10 +1017,6 @@ enum gen_depth_format { #define GEN7_SF_DW2_SCISSOR_ENABLE (0x1 << 11) #define GEN7_SF_DW2_MSRASTMODE__MASK 0x00000300 #define GEN7_SF_DW2_MSRASTMODE__SHIFT 8 -#define GEN7_SF_DW2_MSRASTMODE_OFF_PIXEL (0x0 
<< 8) -#define GEN7_SF_DW2_MSRASTMODE_OFF_PATTERN (0x1 << 8) -#define GEN7_SF_DW2_MSRASTMODE_ON_PIXEL (0x2 << 8) -#define GEN7_SF_DW2_MSRASTMODE_ON_PATTERN (0x3 << 8) #define GEN7_SF_DW3_LINE_LAST_PIXEL_ENABLE (0x1 << 31) #define GEN7_SF_DW3_TRI_PROVOKE__MASK 0x60000000 @@ -1021,14 +1071,10 @@ enum gen_depth_format { #define GEN8_SBE_SWIZ_CONST_0001_FLOAT (0x1 << 9) #define GEN8_SBE_SWIZ_CONST_1111_FLOAT (0x2 << 9) #define GEN8_SBE_SWIZ_CONST_PRIM_ID (0x3 << 9) -#define GEN8_SBE_SWIZ_INPUTATTR__MASK 0x000000c0 -#define GEN8_SBE_SWIZ_INPUTATTR__SHIFT 6 -#define GEN8_SBE_SWIZ_INPUTATTR_NORMAL (0x0 << 6) -#define GEN8_SBE_SWIZ_INPUTATTR_FACING (0x1 << 6) -#define GEN8_SBE_SWIZ_INPUTATTR_W (0x2 << 6) -#define GEN8_SBE_SWIZ_INPUTATTR_FACING_W (0x3 << 6) -#define GEN8_SBE_SWIZ_URB_ENTRY_OFFSET__MASK 0x0000001f -#define GEN8_SBE_SWIZ_URB_ENTRY_OFFSET__SHIFT 0 +#define GEN8_SBE_SWIZ_SWIZZLE_SELECT__MASK 0x000000c0 +#define GEN8_SBE_SWIZ_SWIZZLE_SELECT__SHIFT 6 +#define GEN8_SBE_SWIZ_SRC_ATTR__MASK 0x0000001f +#define GEN8_SBE_SWIZ_SRC_ATTR__SHIFT 0 #define GEN6_3DSTATE_SF__SIZE 20 @@ -1080,31 +1126,19 @@ enum gen_depth_format { #define GEN9_RASTER_DW1_Z_TEST_FAR_ENABLE (0x1 << 26) -#define GEN8_RASTER_DW1_FRONTWINDING__MASK 0x00200000 -#define GEN8_RASTER_DW1_FRONTWINDING__SHIFT 21 -#define GEN8_RASTER_DW1_FRONTWINDING_CW (0x0 << 21) -#define GEN8_RASTER_DW1_FRONTWINDING_CCW (0x1 << 21) -#define GEN8_RASTER_DW1_CULLMODE__MASK 0x00030000 -#define GEN8_RASTER_DW1_CULLMODE__SHIFT 16 -#define GEN8_RASTER_DW1_CULLMODE_BOTH (0x0 << 16) -#define GEN8_RASTER_DW1_CULLMODE_NONE (0x1 << 16) -#define GEN8_RASTER_DW1_CULLMODE_FRONT (0x2 << 16) -#define GEN8_RASTER_DW1_CULLMODE_BACK (0x3 << 16) +#define GEN8_RASTER_DW1_FRONT_WINDING__MASK 0x00200000 +#define GEN8_RASTER_DW1_FRONT_WINDING__SHIFT 21 +#define GEN8_RASTER_DW1_CULL_MODE__MASK 0x00030000 +#define GEN8_RASTER_DW1_CULL_MODE__SHIFT 16 #define GEN8_RASTER_DW1_SMOOTH_POINT_ENABLE (0x1 << 13) #define GEN8_RASTER_DW1_API_MULTISAMPLE_ENABLE (0x1 << 12) #define GEN8_RASTER_DW1_DEPTH_OFFSET_SOLID (0x1 << 9) #define GEN8_RASTER_DW1_DEPTH_OFFSET_WIREFRAME (0x1 << 8) #define GEN8_RASTER_DW1_DEPTH_OFFSET_POINT (0x1 << 7) -#define GEN8_RASTER_DW1_FRONTFACE__MASK 0x00000060 -#define GEN8_RASTER_DW1_FRONTFACE__SHIFT 5 -#define GEN8_RASTER_DW1_FRONTFACE_SOLID (0x0 << 5) -#define GEN8_RASTER_DW1_FRONTFACE_WIREFRAME (0x1 << 5) -#define GEN8_RASTER_DW1_FRONTFACE_POINT (0x2 << 5) -#define GEN8_RASTER_DW1_BACKFACE__MASK 0x00000018 -#define GEN8_RASTER_DW1_BACKFACE__SHIFT 3 -#define GEN8_RASTER_DW1_BACKFACE_SOLID (0x0 << 3) -#define GEN8_RASTER_DW1_BACKFACE_WIREFRAME (0x1 << 3) -#define GEN8_RASTER_DW1_BACKFACE_POINT (0x2 << 3) +#define GEN8_RASTER_DW1_FILL_MODE_FRONT__MASK 0x00000060 +#define GEN8_RASTER_DW1_FILL_MODE_FRONT__SHIFT 5 +#define GEN8_RASTER_DW1_FILL_MODE_BACK__MASK 0x00000018 +#define GEN8_RASTER_DW1_FILL_MODE_BACK__SHIFT 3 #define GEN8_RASTER_DW1_AA_LINE_ENABLE (0x1 << 2) #define GEN8_RASTER_DW1_SCISSOR_ENABLE (0x1 << 1) #define GEN8_RASTER_DW1_Z_TEST_ENABLE (0x1 << 0) @@ -1164,14 +1198,8 @@ enum gen_depth_format { #define GEN6_WM_DW6_SF_ATTR_COUNT__SHIFT 20 #define GEN6_WM_DW6_PS_POSOFFSET__MASK 0x000c0000 #define GEN6_WM_DW6_PS_POSOFFSET__SHIFT 18 -#define GEN6_WM_DW6_PS_POSOFFSET_NONE (0x0 << 18) -#define GEN6_WM_DW6_PS_POSOFFSET_CENTROID (0x2 << 18) -#define GEN6_WM_DW6_PS_POSOFFSET_SAMPLE (0x3 << 18) #define GEN6_WM_DW6_ZW_INTERP__MASK 0x00030000 #define GEN6_WM_DW6_ZW_INTERP__SHIFT 16 -#define GEN6_WM_DW6_ZW_INTERP_PIXEL (0x0 << 16) -#define 
GEN6_WM_DW6_ZW_INTERP_CENTROID (0x2 << 16) -#define GEN6_WM_DW6_ZW_INTERP_SAMPLE (0x3 << 16) #define GEN6_WM_DW6_BARYCENTRIC_INTERP__MASK 0x0000fc00 #define GEN6_WM_DW6_BARYCENTRIC_INTERP__SHIFT 10 #define GEN6_WM_DW6_POINT_RASTRULE__MASK 0x00000200 @@ -1180,10 +1208,6 @@ enum gen_depth_format { #define GEN6_WM_DW6_POINT_RASTRULE_UPPER_RIGHT (0x1 << 9) #define GEN6_WM_DW6_MSRASTMODE__MASK 0x00000006 #define GEN6_WM_DW6_MSRASTMODE__SHIFT 1 -#define GEN6_WM_DW6_MSRASTMODE_OFF_PIXEL (0x0 << 1) -#define GEN6_WM_DW6_MSRASTMODE_OFF_PATTERN (0x1 << 1) -#define GEN6_WM_DW6_MSRASTMODE_ON_PIXEL (0x2 << 1) -#define GEN6_WM_DW6_MSRASTMODE_ON_PATTERN (0x3 << 1) #define GEN6_WM_DW6_MSDISPMODE__MASK 0x00000001 #define GEN6_WM_DW6_MSDISPMODE__SHIFT 0 #define GEN6_WM_DW6_MSDISPMODE_PERSAMPLE 0x0 @@ -1207,22 +1231,12 @@ enum gen_depth_format { #define GEN7_WM_DW1_PS_KILL_PIXEL (0x1 << 25) #define GEN7_WM_DW1_PSCDEPTH__MASK 0x01800000 #define GEN7_WM_DW1_PSCDEPTH__SHIFT 23 -#define GEN7_WM_DW1_PSCDEPTH_OFF (0x0 << 23) -#define GEN7_WM_DW1_PSCDEPTH_ON (0x1 << 23) -#define GEN7_WM_DW1_PSCDEPTH_ON_GE (0x2 << 23) -#define GEN7_WM_DW1_PSCDEPTH_ON_LE (0x3 << 23) #define GEN7_WM_DW1_EDSC__MASK 0x00600000 #define GEN7_WM_DW1_EDSC__SHIFT 21 -#define GEN7_WM_DW1_EDSC_NORMAL (0x0 << 21) -#define GEN7_WM_DW1_EDSC_PSEXEC (0x1 << 21) -#define GEN7_WM_DW1_EDSC_PREPS (0x2 << 21) #define GEN7_WM_DW1_PS_USE_DEPTH (0x1 << 20) #define GEN7_WM_DW1_PS_USE_W (0x1 << 19) #define GEN7_WM_DW1_ZW_INTERP__MASK 0x00060000 #define GEN7_WM_DW1_ZW_INTERP__SHIFT 17 -#define GEN7_WM_DW1_ZW_INTERP_PIXEL (0x0 << 17) -#define GEN7_WM_DW1_ZW_INTERP_CENTROID (0x2 << 17) -#define GEN7_WM_DW1_ZW_INTERP_SAMPLE (0x3 << 17) #define GEN7_WM_DW1_BARYCENTRIC_INTERP__MASK 0x0001f800 #define GEN7_WM_DW1_BARYCENTRIC_INTERP__SHIFT 11 #define GEN7_WM_DW1_PS_USE_COVERAGE_MASK (0x1 << 10) @@ -1247,10 +1261,6 @@ enum gen_depth_format { #define GEN7_WM_DW1_POINT_RASTRULE_UPPER_RIGHT (0x1 << 2) #define GEN7_WM_DW1_MSRASTMODE__MASK 0x00000003 #define GEN7_WM_DW1_MSRASTMODE__SHIFT 0 -#define GEN7_WM_DW1_MSRASTMODE_OFF_PIXEL 0x0 -#define GEN7_WM_DW1_MSRASTMODE_OFF_PATTERN 0x1 -#define GEN7_WM_DW1_MSRASTMODE_ON_PIXEL 0x2 -#define GEN7_WM_DW1_MSRASTMODE_ON_PATTERN 0x3 #define GEN7_WM_DW2_MSDISPMODE__MASK 0x80000000 #define GEN7_WM_DW2_MSDISPMODE__SHIFT 31 @@ -1265,12 +1275,12 @@ enum gen_depth_format { #define GEN8_3DSTATE_WM_DEPTH_STENCIL__SIZE 4 -#define GEN8_ZS_DW1_STENCIL0_FAIL_OP__MASK 0xe0000000 -#define GEN8_ZS_DW1_STENCIL0_FAIL_OP__SHIFT 29 -#define GEN8_ZS_DW1_STENCIL0_ZFAIL_OP__MASK 0x1c000000 -#define GEN8_ZS_DW1_STENCIL0_ZFAIL_OP__SHIFT 26 -#define GEN8_ZS_DW1_STENCIL0_ZPASS_OP__MASK 0x03800000 -#define GEN8_ZS_DW1_STENCIL0_ZPASS_OP__SHIFT 23 +#define GEN8_ZS_DW1_STENCIL_FAIL_OP__MASK 0xe0000000 +#define GEN8_ZS_DW1_STENCIL_FAIL_OP__SHIFT 29 +#define GEN8_ZS_DW1_STENCIL_ZFAIL_OP__MASK 0x1c000000 +#define GEN8_ZS_DW1_STENCIL_ZFAIL_OP__SHIFT 26 +#define GEN8_ZS_DW1_STENCIL_ZPASS_OP__MASK 0x03800000 +#define GEN8_ZS_DW1_STENCIL_ZPASS_OP__SHIFT 23 #define GEN8_ZS_DW1_STENCIL1_FUNC__MASK 0x00700000 #define GEN8_ZS_DW1_STENCIL1_FUNC__SHIFT 20 #define GEN8_ZS_DW1_STENCIL1_FAIL_OP__MASK 0x000e0000 @@ -1279,8 +1289,8 @@ enum gen_depth_format { #define GEN8_ZS_DW1_STENCIL1_ZFAIL_OP__SHIFT 14 #define GEN8_ZS_DW1_STENCIL1_ZPASS_OP__MASK 0x00003800 #define GEN8_ZS_DW1_STENCIL1_ZPASS_OP__SHIFT 11 -#define GEN8_ZS_DW1_STENCIL0_FUNC__MASK 0x00000700 -#define GEN8_ZS_DW1_STENCIL0_FUNC__SHIFT 8 +#define GEN8_ZS_DW1_STENCIL_FUNC__MASK 0x00000700 +#define 
GEN8_ZS_DW1_STENCIL_FUNC__SHIFT 8 #define GEN8_ZS_DW1_DEPTH_FUNC__MASK 0x000000e0 #define GEN8_ZS_DW1_DEPTH_FUNC__SHIFT 5 #define GEN8_ZS_DW1_STENCIL1_ENABLE (0x1 << 4) @@ -1289,17 +1299,17 @@ enum gen_depth_format { #define GEN8_ZS_DW1_DEPTH_TEST_ENABLE (0x1 << 1) #define GEN8_ZS_DW1_DEPTH_WRITE_ENABLE (0x1 << 0) -#define GEN8_ZS_DW2_STENCIL0_VALUEMASK__MASK 0xff000000 -#define GEN8_ZS_DW2_STENCIL0_VALUEMASK__SHIFT 24 -#define GEN8_ZS_DW2_STENCIL0_WRITEMASK__MASK 0x00ff0000 -#define GEN8_ZS_DW2_STENCIL0_WRITEMASK__SHIFT 16 -#define GEN8_ZS_DW2_STENCIL1_VALUEMASK__MASK 0x0000ff00 -#define GEN8_ZS_DW2_STENCIL1_VALUEMASK__SHIFT 8 -#define GEN8_ZS_DW2_STENCIL1_WRITEMASK__MASK 0x000000ff -#define GEN8_ZS_DW2_STENCIL1_WRITEMASK__SHIFT 0 - -#define GEN9_ZS_DW3_STENCIL0_REF__MASK 0x0000ff00 -#define GEN9_ZS_DW3_STENCIL0_REF__SHIFT 8 +#define GEN8_ZS_DW2_STENCIL_TEST_MASK__MASK 0xff000000 +#define GEN8_ZS_DW2_STENCIL_TEST_MASK__SHIFT 24 +#define GEN8_ZS_DW2_STENCIL_WRITE_MASK__MASK 0x00ff0000 +#define GEN8_ZS_DW2_STENCIL_WRITE_MASK__SHIFT 16 +#define GEN8_ZS_DW2_STENCIL1_TEST_MASK__MASK 0x0000ff00 +#define GEN8_ZS_DW2_STENCIL1_TEST_MASK__SHIFT 8 +#define GEN8_ZS_DW2_STENCIL1_WRITE_MASK__MASK 0x000000ff +#define GEN8_ZS_DW2_STENCIL1_WRITE_MASK__SHIFT 0 + +#define GEN9_ZS_DW3_STENCIL_REF__MASK 0x0000ff00 +#define GEN9_ZS_DW3_STENCIL_REF__SHIFT 8 #define GEN9_ZS_DW3_STENCIL1_REF__MASK 0x000000ff #define GEN9_ZS_DW3_STENCIL1_REF__SHIFT 0 @@ -1314,13 +1324,8 @@ enum gen_depth_format { #define GEN8_WM_HZ_DW1_FULL_SURFACE_DEPTH_CLEAR (0x1 << 25) #define GEN8_WM_HZ_DW1_STENCIL_CLEAR_VALUE__MASK 0x00ff0000 #define GEN8_WM_HZ_DW1_STENCIL_CLEAR_VALUE__SHIFT 16 -#define GEN8_WM_HZ_DW1_NUMSAMPLES__MASK 0x0000e000 -#define GEN8_WM_HZ_DW1_NUMSAMPLES__SHIFT 13 -#define GEN8_WM_HZ_DW1_NUMSAMPLES_1 (0x0 << 13) -#define GEN8_WM_HZ_DW1_NUMSAMPLES_2 (0x1 << 13) -#define GEN8_WM_HZ_DW1_NUMSAMPLES_4 (0x2 << 13) -#define GEN8_WM_HZ_DW1_NUMSAMPLES_8 (0x3 << 13) -#define GEN8_WM_HZ_DW1_NUMSAMPLES_16 (0x4 << 13) +#define GEN8_WM_HZ_DW1_NUM_SAMPLES__MASK 0x0000e000 +#define GEN8_WM_HZ_DW1_NUM_SAMPLES__SHIFT 13 #define GEN8_WM_HZ_DW2_RECT_MIN_Y__MASK 0xffff0000 #define GEN8_WM_HZ_DW2_RECT_MIN_Y__SHIFT 16 @@ -1359,9 +1364,6 @@ enum gen_depth_format { #define GEN75_PS_DW4_ACCESS_UAV (0x1 << 5) #define GEN7_PS_DW4_POSOFFSET__MASK 0x00000018 #define GEN7_PS_DW4_POSOFFSET__SHIFT 3 -#define GEN7_PS_DW4_POSOFFSET_NONE (0x0 << 3) -#define GEN7_PS_DW4_POSOFFSET_CENTROID (0x2 << 3) -#define GEN7_PS_DW4_POSOFFSET_SAMPLE (0x3 << 3) #define GEN7_PS_DW4_DISPATCH_MODE__MASK 0x00000007 #define GEN7_PS_DW4_DISPATCH_MODE__SHIFT 0 @@ -1397,9 +1399,6 @@ enum gen_depth_format { #define GEN8_PS_DW6_RT_RESOLVE (0x1 << 6) #define GEN8_PS_DW6_POSOFFSET__MASK 0x00000018 #define GEN8_PS_DW6_POSOFFSET__SHIFT 3 -#define GEN8_PS_DW6_POSOFFSET_NONE (0x0 << 3) -#define GEN8_PS_DW6_POSOFFSET_CENTROID (0x2 << 3) -#define GEN8_PS_DW6_POSOFFSET_SAMPLE (0x3 << 3) #define GEN8_PS_DW6_DISPATCH_MODE__MASK 0x00000007 #define GEN8_PS_DW6_DISPATCH_MODE__SHIFT 0 @@ -1423,16 +1422,12 @@ enum gen_depth_format { #define GEN8_3DSTATE_PS_EXTRA__SIZE 2 -#define GEN8_PSX_DW1_DISPATCH_ENABLE (0x1 << 31) +#define GEN8_PSX_DW1_VALID (0x1 << 31) #define GEN8_PSX_DW1_UAV_ONLY (0x1 << 30) #define GEN8_PSX_DW1_COMPUTE_OMASK (0x1 << 29) #define GEN8_PSX_DW1_KILL_PIXEL (0x1 << 28) #define GEN8_PSX_DW1_PSCDEPTH__MASK 0x0c000000 #define GEN8_PSX_DW1_PSCDEPTH__SHIFT 26 -#define GEN8_PSX_DW1_PSCDEPTH_OFF (0x0 << 26) -#define GEN8_PSX_DW1_PSCDEPTH_ON (0x1 << 26) -#define 
GEN8_PSX_DW1_PSCDEPTH_ON_GE (0x2 << 26) -#define GEN8_PSX_DW1_PSCDEPTH_ON_LE (0x3 << 26) #define GEN8_PSX_DW1_FORCE_COMPUTE_DEPTH (0x1 << 25) #define GEN8_PSX_DW1_USE_DEPTH (0x1 << 24) #define GEN8_PSX_DW1_USE_W (0x1 << 23) @@ -1696,17 +1691,10 @@ enum gen_depth_format { #define GEN75_MULTISAMPLE_DW1_DX9_MULTISAMPLE_ENABLE (0x1 << 5) -#define GEN6_MULTISAMPLE_DW1_PIXLOC__MASK 0x00000010 -#define GEN6_MULTISAMPLE_DW1_PIXLOC__SHIFT 4 -#define GEN6_MULTISAMPLE_DW1_PIXLOC_CENTER (0x0 << 4) -#define GEN6_MULTISAMPLE_DW1_PIXLOC_UL_CORNER (0x1 << 4) -#define GEN6_MULTISAMPLE_DW1_NUMSAMPLES__MASK 0x0000000e -#define GEN6_MULTISAMPLE_DW1_NUMSAMPLES__SHIFT 1 -#define GEN6_MULTISAMPLE_DW1_NUMSAMPLES_1 (0x0 << 1) -#define GEN8_MULTISAMPLE_DW1_NUMSAMPLES_2 (0x1 << 1) -#define GEN6_MULTISAMPLE_DW1_NUMSAMPLES_4 (0x2 << 1) -#define GEN7_MULTISAMPLE_DW1_NUMSAMPLES_8 (0x3 << 1) -#define GEN8_MULTISAMPLE_DW1_NUMSAMPLES_16 (0x4 << 1) +#define GEN6_MULTISAMPLE_DW1_PIXEL_LOCATION__MASK 0x00000010 +#define GEN6_MULTISAMPLE_DW1_PIXEL_LOCATION__SHIFT 4 +#define GEN6_MULTISAMPLE_DW1_NUM_SAMPLES__MASK 0x0000000e +#define GEN6_MULTISAMPLE_DW1_NUM_SAMPLES__SHIFT 1 diff --git a/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h b/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h index 6d815beecb3..b65b704adc6 100644 --- a/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h +++ b/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h @@ -84,7 +84,7 @@ enum gen_blend_function { GEN6_BLENDFUNCTION_MAX = 0x4, }; -enum gen_logicop_function { +enum gen_logic_op { GEN6_LOGICOP_CLEAR = 0x0, GEN6_LOGICOP_NOR = 0x1, GEN6_LOGICOP_AND_INVERTED = 0x2, @@ -103,20 +103,31 @@ enum gen_logicop_function { GEN6_LOGICOP_SET = 0xf, }; -enum gen_sampler_mip_filter { +enum gen_mip_filter { GEN6_MIPFILTER_NONE = 0x0, GEN6_MIPFILTER_NEAREST = 0x1, GEN6_MIPFILTER_LINEAR = 0x3, }; -enum gen_sampler_map_filter { +enum gen_map_filter { GEN6_MAPFILTER_NEAREST = 0x0, GEN6_MAPFILTER_LINEAR = 0x1, GEN6_MAPFILTER_ANISOTROPIC = 0x2, GEN6_MAPFILTER_MONO = 0x6, }; -enum gen_sampler_aniso_ratio { +enum gen_prefilter_op { + GEN6_PREFILTEROP_ALWAYS = 0x0, + GEN6_PREFILTEROP_NEVER = 0x1, + GEN6_PREFILTEROP_LESS = 0x2, + GEN6_PREFILTEROP_EQUAL = 0x3, + GEN6_PREFILTEROP_LEQUAL = 0x4, + GEN6_PREFILTEROP_GREATER = 0x5, + GEN6_PREFILTEROP_NOTEQUAL = 0x6, + GEN6_PREFILTEROP_GEQUAL = 0x7, +}; + +enum gen_aniso_ratio { GEN6_ANISORATIO_2 = 0x0, GEN6_ANISORATIO_4 = 0x1, GEN6_ANISORATIO_6 = 0x2, @@ -127,7 +138,7 @@ enum gen_sampler_aniso_ratio { GEN6_ANISORATIO_16 = 0x7, }; -enum gen_sampler_texcoord_mode { +enum gen_texcoord_mode { GEN6_TEXCOORDMODE_WRAP = 0x0, GEN6_TEXCOORDMODE_MIRROR = 0x1, GEN6_TEXCOORDMODE_CLAMP = 0x2, @@ -137,15 +148,15 @@ enum gen_sampler_texcoord_mode { GEN8_TEXCOORDMODE_HALF_BORDER = 0x6, }; -enum gen_sampler_key_filter { +enum gen_key_filter { GEN6_KEYFILTER_KILL_ON_ANY_MATCH = 0x0, GEN6_KEYFILTER_REPLACE_BLACK = 0x1, }; #define GEN6_COLOR_CALC_STATE__SIZE 6 -#define GEN6_CC_DW0_STENCIL0_REF__MASK 0xff000000 -#define GEN6_CC_DW0_STENCIL0_REF__SHIFT 24 +#define GEN6_CC_DW0_STENCIL_REF__MASK 0xff000000 +#define GEN6_CC_DW0_STENCIL_REF__SHIFT 24 #define GEN6_CC_DW0_STENCIL1_REF__MASK 0x00ff0000 #define GEN6_CC_DW0_STENCIL1_REF__SHIFT 16 #define GEN6_CC_DW0_ROUND_DISABLE_DISABLE (0x1 << 15) @@ -162,14 +173,14 @@ enum gen_sampler_key_filter { #define GEN6_DEPTH_STENCIL_STATE__SIZE 3 #define GEN6_ZS_DW0_STENCIL_TEST_ENABLE (0x1 << 31) -#define GEN6_ZS_DW0_STENCIL0_FUNC__MASK 0x70000000 -#define GEN6_ZS_DW0_STENCIL0_FUNC__SHIFT 28 
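/*
 * A recurring change in these headers: the enumerated per-field values (the
 * CULLMODE_*, FRONTFACE_*, NUMSAMPLES_* style defines being removed above)
 * are dropped in favor of a bare __MASK/__SHIFT pair per bitfield, with the
 * field values coming from shared gen_* enums instead.  A minimal sketch of
 * how such a pair is consumed; set_field() and zs_dw1_set_stencil_func() are
 * hypothetical helpers for illustration only, while the
 * GEN8_ZS_DW1_STENCIL_FUNC__* macros and GEN6_COMPAREFUNCTION_ALWAYS are
 * real names from this diff.
 */
#include <assert.h>
#include <stdint.h>

static inline uint32_t
set_field(uint32_t dw, uint32_t mask, unsigned shift, uint32_t val)
{
   assert(((val << shift) & ~mask) == 0); /* the value must fit the field */
   return (dw & ~mask) | (val << shift);
}

static inline uint32_t
zs_dw1_set_stencil_func(uint32_t dw1, uint32_t func)
{
   /* e.g. func == GEN6_COMPAREFUNCTION_ALWAYS, as in the blitter below */
   return set_field(dw1, GEN8_ZS_DW1_STENCIL_FUNC__MASK,
                    GEN8_ZS_DW1_STENCIL_FUNC__SHIFT, func);
}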
-#define GEN6_ZS_DW0_STENCIL0_FAIL_OP__MASK 0x0e000000 -#define GEN6_ZS_DW0_STENCIL0_FAIL_OP__SHIFT 25 -#define GEN6_ZS_DW0_STENCIL0_ZFAIL_OP__MASK 0x01c00000 -#define GEN6_ZS_DW0_STENCIL0_ZFAIL_OP__SHIFT 22 -#define GEN6_ZS_DW0_STENCIL0_ZPASS_OP__MASK 0x00380000 -#define GEN6_ZS_DW0_STENCIL0_ZPASS_OP__SHIFT 19 +#define GEN6_ZS_DW0_STENCIL_FUNC__MASK 0x70000000 +#define GEN6_ZS_DW0_STENCIL_FUNC__SHIFT 28 +#define GEN6_ZS_DW0_STENCIL_FAIL_OP__MASK 0x0e000000 +#define GEN6_ZS_DW0_STENCIL_FAIL_OP__SHIFT 25 +#define GEN6_ZS_DW0_STENCIL_ZFAIL_OP__MASK 0x01c00000 +#define GEN6_ZS_DW0_STENCIL_ZFAIL_OP__SHIFT 22 +#define GEN6_ZS_DW0_STENCIL_ZPASS_OP__MASK 0x00380000 +#define GEN6_ZS_DW0_STENCIL_ZPASS_OP__SHIFT 19 #define GEN6_ZS_DW0_STENCIL_WRITE_ENABLE (0x1 << 18) #define GEN6_ZS_DW0_STENCIL1_ENABLE (0x1 << 15) #define GEN6_ZS_DW0_STENCIL1_FUNC__MASK 0x00007000 @@ -181,14 +192,14 @@ enum gen_sampler_key_filter { #define GEN6_ZS_DW0_STENCIL1_ZPASS_OP__MASK 0x00000038 #define GEN6_ZS_DW0_STENCIL1_ZPASS_OP__SHIFT 3 -#define GEN6_ZS_DW1_STENCIL0_VALUEMASK__MASK 0xff000000 -#define GEN6_ZS_DW1_STENCIL0_VALUEMASK__SHIFT 24 -#define GEN6_ZS_DW1_STENCIL0_WRITEMASK__MASK 0x00ff0000 -#define GEN6_ZS_DW1_STENCIL0_WRITEMASK__SHIFT 16 -#define GEN6_ZS_DW1_STENCIL1_VALUEMASK__MASK 0x0000ff00 -#define GEN6_ZS_DW1_STENCIL1_VALUEMASK__SHIFT 8 -#define GEN6_ZS_DW1_STENCIL1_WRITEMASK__MASK 0x000000ff -#define GEN6_ZS_DW1_STENCIL1_WRITEMASK__SHIFT 0 +#define GEN6_ZS_DW1_STENCIL_TEST_MASK__MASK 0xff000000 +#define GEN6_ZS_DW1_STENCIL_TEST_MASK__SHIFT 24 +#define GEN6_ZS_DW1_STENCIL_WRITE_MASK__MASK 0x00ff0000 +#define GEN6_ZS_DW1_STENCIL_WRITE_MASK__SHIFT 16 +#define GEN6_ZS_DW1_STENCIL1_TEST_MASK__MASK 0x0000ff00 +#define GEN6_ZS_DW1_STENCIL1_TEST_MASK__SHIFT 8 +#define GEN6_ZS_DW1_STENCIL1_WRITE_MASK__MASK 0x000000ff +#define GEN6_ZS_DW1_STENCIL1_WRITE_MASK__SHIFT 0 #define GEN6_ZS_DW2_DEPTH_TEST_ENABLE (0x1 << 31) #define GEN6_ZS_DW2_DEPTH_FUNC__MASK 0x38000000 @@ -216,10 +227,12 @@ enum gen_sampler_key_filter { #define GEN6_RT_DW1_ALPHA_TO_COVERAGE (0x1 << 31) #define GEN6_RT_DW1_ALPHA_TO_ONE (0x1 << 30) #define GEN6_RT_DW1_ALPHA_TO_COVERAGE_DITHER (0x1 << 29) -#define GEN6_RT_DW1_WRITE_DISABLE_A (0x1 << 27) -#define GEN6_RT_DW1_WRITE_DISABLE_R (0x1 << 26) -#define GEN6_RT_DW1_WRITE_DISABLE_G (0x1 << 25) -#define GEN6_RT_DW1_WRITE_DISABLE_B (0x1 << 24) +#define GEN6_RT_DW1_WRITE_DISABLES__MASK 0x0f000000 +#define GEN6_RT_DW1_WRITE_DISABLES__SHIFT 24 +#define GEN6_RT_DW1_WRITE_DISABLES_A (0x1 << 27) +#define GEN6_RT_DW1_WRITE_DISABLES_R (0x1 << 26) +#define GEN6_RT_DW1_WRITE_DISABLES_G (0x1 << 25) +#define GEN6_RT_DW1_WRITE_DISABLES_B (0x1 << 24) #define GEN6_RT_DW1_LOGICOP_ENABLE (0x1 << 22) #define GEN6_RT_DW1_LOGICOP_FUNC__MASK 0x003c0000 #define GEN6_RT_DW1_LOGICOP_FUNC__SHIFT 18 @@ -267,10 +280,12 @@ enum gen_sampler_key_filter { #define GEN8_RT_DW0_DST_ALPHA_FACTOR__SHIFT 8 #define GEN8_RT_DW0_ALPHA_FUNC__MASK 0x000000e0 #define GEN8_RT_DW0_ALPHA_FUNC__SHIFT 5 -#define GEN8_RT_DW0_WRITE_DISABLE_A (0x1 << 3) -#define GEN8_RT_DW0_WRITE_DISABLE_R (0x1 << 2) -#define GEN8_RT_DW0_WRITE_DISABLE_G (0x1 << 1) -#define GEN8_RT_DW0_WRITE_DISABLE_B (0x1 << 0) +#define GEN8_RT_DW0_WRITE_DISABLES__MASK 0x0000000f +#define GEN8_RT_DW0_WRITE_DISABLES__SHIFT 0 +#define GEN8_RT_DW0_WRITE_DISABLES_A (0x1 << 3) +#define GEN8_RT_DW0_WRITE_DISABLES_R (0x1 << 2) +#define GEN8_RT_DW0_WRITE_DISABLES_G (0x1 << 1) +#define GEN8_RT_DW0_WRITE_DISABLES_B (0x1 << 0) #define GEN8_RT_DW1_LOGICOP_ENABLE (0x1 << 31) #define 
GEN8_RT_DW1_LOGICOP_FUNC__MASK 0x78000000 @@ -419,6 +434,7 @@ enum gen_sampler_key_filter { #define GEN8_SAMPLER_DW0_LOD_PRECLAMP_ENABLE__SHIFT 27 #define GEN6_SAMPLER_DW0_BASE_LOD__MASK 0x07c00000 #define GEN6_SAMPLER_DW0_BASE_LOD__SHIFT 22 +#define GEN6_SAMPLER_DW0_BASE_LOD__RADIX 1 #define GEN6_SAMPLER_DW0_MIP_FILTER__MASK 0x00300000 #define GEN6_SAMPLER_DW0_MIP_FILTER__SHIFT 20 #define GEN6_SAMPLER_DW0_MAG_FILTER__MASK 0x000e0000 diff --git a/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h b/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h index 7c2349f2447..b5d09f64429 100644 --- a/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h +++ b/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h @@ -299,7 +299,10 @@ enum gen_surface_scs { #define GEN6_SURFACE_DW0_MIPLAYOUT__SHIFT 10 #define GEN6_SURFACE_DW0_MIPLAYOUT_BELOW (0x0 << 10) #define GEN6_SURFACE_DW0_MIPLAYOUT_RIGHT (0x1 << 10) -#define GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE (0x1 << 9) +#define GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE__MASK 0x00000200 +#define GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE__SHIFT 9 +#define GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE_REPLICATE (0x0 << 9) +#define GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE_AVERAGE (0x1 << 9) #define GEN6_SURFACE_DW0_RENDER_CACHE_RW (0x1 << 8) #define GEN6_SURFACE_DW0_MEDIA_BOUNDARY_PIXEL_MODE__MASK 0x000000c0 #define GEN6_SURFACE_DW0_MEDIA_BOUNDARY_PIXEL_MODE__SHIFT 6 @@ -485,6 +488,8 @@ enum gen_surface_scs { #define GEN7_SURFACE_DW7_CC_B__SHIFT 29 #define GEN7_SURFACE_DW7_CC_A__MASK 0x10000000 #define GEN7_SURFACE_DW7_CC_A__SHIFT 28 +#define GEN75_SURFACE_DW7_SCS__MASK 0x0fff0000 +#define GEN75_SURFACE_DW7_SCS__SHIFT 16 #define GEN75_SURFACE_DW7_SCS_R__MASK 0x0e000000 #define GEN75_SURFACE_DW7_SCS_R__SHIFT 25 #define GEN75_SURFACE_DW7_SCS_G__MASK 0x01c00000 diff --git a/src/gallium/drivers/ilo/genhw/genhw.h b/src/gallium/drivers/ilo/genhw/genhw.h index 9e05bf5beca..3a777a18c2a 100644 --- a/src/gallium/drivers/ilo/genhw/genhw.h +++ b/src/gallium/drivers/ilo/genhw/genhw.h @@ -1,6 +1,4 @@ /* - * Mesa 3-D graphics library - * * Copyright (C) 2014 LunarG, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a @@ -25,8 +23,9 @@ #ifndef GENHW_H #define GENHW_H -#include "pipe/p_compiler.h" -#include "util/u_debug.h" +#include <stdbool.h> +#include <stdint.h> +#include <assert.h> #include "gen_regs.xml.h" #include "gen_mi.xml.h" diff --git a/src/gallium/drivers/ilo/ilo_blitter.h b/src/gallium/drivers/ilo/ilo_blitter.h index 4284f415c1c..4eba8481c28 100644 --- a/src/gallium/drivers/ilo/ilo_blitter.h +++ b/src/gallium/drivers/ilo/ilo_blitter.h @@ -39,12 +39,6 @@ enum ilo_blitter_uses { ILO_BLITTER_USE_FB_STENCIL = 1 << 4, }; -enum ilo_blitter_rectlist_op { - ILO_BLITTER_RECTLIST_CLEAR_ZS, - ILO_BLITTER_RECTLIST_RESOLVE_Z, - ILO_BLITTER_RECTLIST_RESOLVE_HIZ, -}; - struct blitter_context; struct pipe_resource; struct pipe_surface; @@ -57,30 +51,42 @@ struct ilo_blitter { /* * A minimal context with the goal to send RECTLISTs down the pipeline. 
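 *
 * A RECTLIST primitive specifies an axis-aligned rectangle by three of its
 * corners and lets the hardware derive the fourth, which is why the
 * vertices[3][2] array below is enough.  A hedged sketch of loading a blit
 * rectangle (the corner order here is illustrative only; the authoritative
 * order is wherever the rectlist blitter code fills in vertices[]):
 *
 *    blitter->vertices[0][0] = x;      blitter->vertices[0][1] = y + h;
 *    blitter->vertices[1][0] = x + w;  blitter->vertices[1][1] = y + h;
 *    blitter->vertices[2][0] = x + w;  blitter->vertices[2][1] = y;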
*/ - enum ilo_blitter_rectlist_op op; + enum ilo_state_raster_earlyz_op earlyz_op; + bool earlyz_stencil_clear; uint32_t uses; bool initialized; float vertices[3][2]; - struct ilo_ve_state ve; - struct pipe_draw_info draw; + struct gen6_3dprimitive_info draw_info; - struct ilo_viewport_cso viewport; - struct ilo_dsa_state dsa; + uint32_t vf_data[4]; + struct ilo_state_vf vf; - struct { - struct pipe_stencil_ref stencil_ref; - ubyte alpha_ref; - struct pipe_blend_color blend_color; - } cc; + struct ilo_state_vs vs; + struct ilo_state_hs hs; + struct ilo_state_ds ds; + struct ilo_state_gs gs; + + struct ilo_state_sol sol; + + struct ilo_state_viewport vp; + uint32_t vp_data[20]; + + struct ilo_state_sbe sbe; + struct ilo_state_ps ps; + struct ilo_state_cc cc; uint32_t depth_clear_value; + struct ilo_state_urb urb; + struct { struct ilo_surface_cso dst; unsigned width, height; unsigned num_samples; + + struct ilo_state_raster rs; } fb; }; diff --git a/src/gallium/drivers/ilo/ilo_blitter_pipe.c b/src/gallium/drivers/ilo/ilo_blitter_pipe.c index c4c02bd3e53..0bfe7827f11 100644 --- a/src/gallium/drivers/ilo/ilo_blitter_pipe.c +++ b/src/gallium/drivers/ilo/ilo_blitter_pipe.c @@ -63,7 +63,7 @@ ilo_blitter_pipe_begin(struct ilo_blitter *blitter, util_blitter_save_viewport(b, &vec->viewport.viewport0); if (scissor_enable) - util_blitter_save_scissor(b, &vec->scissor.scissor0); + util_blitter_save_scissor(b, &vec->viewport.scissor0); switch (op) { case ILO_BLITTER_PIPE_BLIT: diff --git a/src/gallium/drivers/ilo/ilo_blitter_rectlist.c b/src/gallium/drivers/ilo/ilo_blitter_rectlist.c index 6d8afed9dca..13c8f500680 100644 --- a/src/gallium/drivers/ilo/ilo_blitter_rectlist.c +++ b/src/gallium/drivers/ilo/ilo_blitter_rectlist.c @@ -25,7 +25,6 @@ * Chia-I Wu <[email protected]> */ -#include "core/ilo_state_3d.h" #include "util/u_draw.h" #include "util/u_pack_color.h" @@ -40,45 +39,48 @@ static bool ilo_blitter_set_invariants(struct ilo_blitter *blitter) { - struct pipe_vertex_element velem; - struct pipe_viewport_state vp; + struct ilo_state_vf_element_info elem; if (blitter->initialized) return true; + /* a rectangle has 3 vertices in a RECTLIST */ + blitter->draw_info.topology = GEN6_3DPRIM_RECTLIST; + blitter->draw_info.vertex_count = 3; + blitter->draw_info.instance_count = 1; + + memset(&elem, 0, sizeof(elem)); /* only vertex X and Y */ - memset(&velem, 0, sizeof(velem)); - velem.src_format = PIPE_FORMAT_R32G32_FLOAT; - ilo_gpe_init_ve(blitter->ilo->dev, 1, &velem, &blitter->ve); - - /* generate VUE header */ - ilo_gpe_init_ve_nosrc(blitter->ilo->dev, - GEN6_VFCOMP_STORE_0, /* Reserved */ - GEN6_VFCOMP_STORE_0, /* Render Target Array Index */ - GEN6_VFCOMP_STORE_0, /* Viewport Index */ - GEN6_VFCOMP_STORE_0, /* Point Width */ - &blitter->ve.nosrc_cso); - blitter->ve.prepend_nosrc_cso = true; + elem.format = GEN6_FORMAT_R32G32_FLOAT; + elem.format_size = 8; + elem.component_count = 2; - /* a rectangle has 3 vertices in a RECTLIST */ - util_draw_init_info(&blitter->draw); - blitter->draw.mode = ILO_PRIM_RECTANGLES; - blitter->draw.count = 3; + ilo_state_vf_init_for_rectlist(&blitter->vf, blitter->ilo->dev, + blitter->vf_data, sizeof(blitter->vf_data), &elem, 1); + + ilo_state_vs_init_disabled(&blitter->vs, blitter->ilo->dev); + ilo_state_hs_init_disabled(&blitter->hs, blitter->ilo->dev); + ilo_state_ds_init_disabled(&blitter->ds, blitter->ilo->dev); + ilo_state_gs_init_disabled(&blitter->gs, blitter->ilo->dev); + ilo_state_sol_init_disabled(&blitter->sol, blitter->ilo->dev, false); /** * From the 
Haswell PRM, volume 7, page 615: * * "The clear value must be between the min and max depth values - * (inclusive) defined in the CC_VIEWPORT." + * (inclusive) defined in the CC_VIEWPORT." * * Even though clipping and viewport transformation will be disabled, we * still need to set up the viewport states. */ - memset(&vp, 0, sizeof(vp)); - vp.scale[0] = 1.0f; - vp.scale[1] = 1.0f; - vp.scale[2] = 1.0f; - ilo_gpe_set_viewport_cso(blitter->ilo->dev, &vp, &blitter->viewport); + ilo_state_viewport_init_for_rectlist(&blitter->vp, blitter->ilo->dev, + blitter->vp_data, sizeof(blitter->vp_data)); + + ilo_state_sbe_init_for_rectlist(&blitter->sbe, blitter->ilo->dev, 0, 0); + ilo_state_ps_init_disabled(&blitter->ps, blitter->ilo->dev); + + ilo_state_urb_init_for_rectlist(&blitter->urb, blitter->ilo->dev, + ilo_state_vf_get_attr_count(&blitter->vf)); blitter->initialized = true; @@ -86,10 +88,12 @@ ilo_blitter_set_invariants(struct ilo_blitter *blitter) } static void -ilo_blitter_set_op(struct ilo_blitter *blitter, - enum ilo_blitter_rectlist_op op) +ilo_blitter_set_earlyz_op(struct ilo_blitter *blitter, + enum ilo_state_raster_earlyz_op op, + bool earlyz_stencil_clear) { - blitter->op = op; + blitter->earlyz_op = op; + blitter->earlyz_stencil_clear = earlyz_stencil_clear; } /** @@ -117,18 +121,27 @@ ilo_blitter_set_rectlist(struct ilo_blitter *blitter, } static void -ilo_blitter_set_clear_values(struct ilo_blitter *blitter, - uint32_t depth, ubyte stencil) +ilo_blitter_set_depth_clear_value(struct ilo_blitter *blitter, + uint32_t depth) { blitter->depth_clear_value = depth; - blitter->cc.stencil_ref.ref_value[0] = stencil; } static void -ilo_blitter_set_dsa(struct ilo_blitter *blitter, - const struct pipe_depth_stencil_alpha_state *state) +ilo_blitter_set_cc(struct ilo_blitter *blitter, + const struct ilo_state_cc_info *info) +{ + memset(&blitter->cc, 0, sizeof(blitter->cc)); + ilo_state_cc_init(&blitter->cc, blitter->ilo->dev, info); +} + +static void +ilo_blitter_set_fb_rs(struct ilo_blitter *blitter) { - ilo_gpe_init_dsa(blitter->ilo->dev, state, &blitter->dsa); + memset(&blitter->fb.rs, 0, sizeof(blitter->fb.rs)); + ilo_state_raster_init_for_rectlist(&blitter->fb.rs, blitter->ilo->dev, + blitter->fb.num_samples, blitter->earlyz_op, + blitter->earlyz_stencil_clear); } static void @@ -146,6 +159,8 @@ ilo_blitter_set_fb(struct ilo_blitter *blitter, blitter->fb.num_samples = 1; memcpy(&blitter->fb.dst, cso, sizeof(*cso)); + + ilo_blitter_set_fb_rs(blitter); } static void @@ -191,9 +206,9 @@ hiz_align_fb(struct ilo_blitter *blitter) { unsigned align_w, align_h; - switch (blitter->op) { - case ILO_BLITTER_RECTLIST_CLEAR_ZS: - case ILO_BLITTER_RECTLIST_RESOLVE_Z: + switch (blitter->earlyz_op) { + case ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR: + case ILO_STATE_RASTER_EARLYZ_DEPTH_RESOLVE: break; default: return; @@ -328,7 +343,7 @@ ilo_blitter_rectlist_clear_zs(struct ilo_blitter *blitter, double depth, unsigned stencil) { struct ilo_texture *tex = ilo_texture(zs->texture); - struct pipe_depth_stencil_alpha_state dsa_state; + struct ilo_state_cc_info info; uint32_t uses, clear_value; if (!ilo_image_can_enable_aux(&tex->image, zs->u.tex.level)) @@ -368,17 +383,20 @@ ilo_blitter_rectlist_clear_zs(struct ilo_blitter *blitter, * - [DevSNB] errata: For stencil buffer only clear, the previous * depth clear value must be delivered during the clear." 
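 *
 * Put differently: even a stencil-only clear must supply a valid depth
 * clear value on SNB, which is why the depth clear value is programmed
 * unconditionally below:
 *
 *    ilo_blitter_set_cc(blitter, &info);
 *    ilo_blitter_set_depth_clear_value(blitter, clear_value);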
*/ - memset(&dsa_state, 0, sizeof(dsa_state)); + memset(&info, 0, sizeof(info)); - if (clear_flags & PIPE_CLEAR_DEPTH) - dsa_state.depth.writemask = true; + if (clear_flags & PIPE_CLEAR_DEPTH) { + info.depth.cv_has_buffer = true; + info.depth.write_enable = true; + } if (clear_flags & PIPE_CLEAR_STENCIL) { - dsa_state.stencil[0].enabled = true; - dsa_state.stencil[0].func = PIPE_FUNC_ALWAYS; - dsa_state.stencil[0].fail_op = PIPE_STENCIL_OP_KEEP; - dsa_state.stencil[0].zpass_op = PIPE_STENCIL_OP_REPLACE; - dsa_state.stencil[0].zfail_op = PIPE_STENCIL_OP_KEEP; + info.stencil.cv_has_buffer = true; + info.stencil.test_enable = true; + info.stencil.front.test_func = GEN6_COMPAREFUNCTION_ALWAYS; + info.stencil.front.fail_op = GEN6_STENCILOP_KEEP; + info.stencil.front.zfail_op = GEN6_STENCILOP_KEEP; + info.stencil.front.zpass_op = GEN6_STENCILOP_REPLACE; /* * From the Ivy Bridge PRM, volume 2 part 1, page 277: @@ -389,18 +407,21 @@ ilo_blitter_rectlist_clear_zs(struct ilo_blitter *blitter, * - DEPTH_STENCIL_STATE::Stencil Test Mask must be 0xFF * - DEPTH_STENCIL_STATE::Back Face Stencil Write Mask must be 0xFF * - DEPTH_STENCIL_STATE::Back Face Stencil Test Mask must be 0xFF" + * + * Back face masks will be copied from front face masks. */ - dsa_state.stencil[0].valuemask = 0xff; - dsa_state.stencil[0].writemask = 0xff; - dsa_state.stencil[1].valuemask = 0xff; - dsa_state.stencil[1].writemask = 0xff; + info.params.stencil_front.test_ref = (uint8_t) stencil; + info.params.stencil_front.test_mask = 0xff; + info.params.stencil_front.write_mask = 0xff; } ilo_blitter_set_invariants(blitter); - ilo_blitter_set_op(blitter, ILO_BLITTER_RECTLIST_CLEAR_ZS); + ilo_blitter_set_earlyz_op(blitter, + ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR, + clear_flags & PIPE_CLEAR_STENCIL); - ilo_blitter_set_dsa(blitter, &dsa_state); - ilo_blitter_set_clear_values(blitter, clear_value, (ubyte) stencil); + ilo_blitter_set_cc(blitter, &info); + ilo_blitter_set_depth_clear_value(blitter, clear_value); ilo_blitter_set_fb_from_surface(blitter, zs); uses = ILO_BLITTER_USE_DSA; @@ -421,7 +442,7 @@ ilo_blitter_rectlist_resolve_z(struct ilo_blitter *blitter, unsigned level, unsigned slice) { struct ilo_texture *tex = ilo_texture(res); - struct pipe_depth_stencil_alpha_state dsa_state; + struct ilo_state_cc_info info; const struct ilo_texture_slice *s = ilo_texture_get_slice(tex, level, slice); @@ -435,16 +456,18 @@ ilo_blitter_rectlist_resolve_z(struct ilo_blitter, * to NEVER. Depth Buffer Write Enable must be enabled. Stencil Test * Enable and Stencil Buffer Write Enable must be disabled." 
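 *
 * A depth resolve brings the depth buffer back in sync with its HiZ
 * representation, so the NEVER test function here is a mandated setting
 * rather than a functional depth test; the setup below follows the quote
 * field by field:
 *
 *    info.depth.test_enable = true;
 *    info.depth.write_enable = true;
 *    info.depth.test_func = GEN6_COMPAREFUNCTION_NEVER;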
*/ - memset(&dsa_state, 0, sizeof(dsa_state)); - dsa_state.depth.writemask = true; - dsa_state.depth.enabled = true; - dsa_state.depth.func = PIPE_FUNC_NEVER; + memset(&info, 0, sizeof(info)); + info.depth.cv_has_buffer = true; + info.depth.test_enable = true; + info.depth.write_enable = true; + info.depth.test_func = GEN6_COMPAREFUNCTION_NEVER; ilo_blitter_set_invariants(blitter); - ilo_blitter_set_op(blitter, ILO_BLITTER_RECTLIST_RESOLVE_Z); + ilo_blitter_set_earlyz_op(blitter, + ILO_STATE_RASTER_EARLYZ_DEPTH_RESOLVE, false); - ilo_blitter_set_dsa(blitter, &dsa_state); - ilo_blitter_set_clear_values(blitter, s->clear_value, 0); + ilo_blitter_set_cc(blitter, &info); + ilo_blitter_set_depth_clear_value(blitter, s->clear_value); ilo_blitter_set_fb_from_resource(blitter, res, res->format, level, slice); ilo_blitter_set_uses(blitter, ILO_BLITTER_USE_DSA | ILO_BLITTER_USE_FB_DEPTH); @@ -458,7 +481,7 @@ ilo_blitter_rectlist_resolve_hiz(struct ilo_blitter *blitter, unsigned level, unsigned slice) { struct ilo_texture *tex = ilo_texture(res); - struct pipe_depth_stencil_alpha_state dsa_state; + struct ilo_state_cc_info info; if (!ilo_image_can_enable_aux(&tex->image, level)) return; @@ -470,13 +493,15 @@ ilo_blitter_rectlist_resolve_hiz(struct ilo_blitter *blitter, * disabled. Depth Buffer Write Enable must be enabled. Stencil Test * Enable and Stencil Buffer Write Enable must be disabled." */ - memset(&dsa_state, 0, sizeof(dsa_state)); - dsa_state.depth.writemask = true; + memset(&info, 0, sizeof(info)); + info.depth.cv_has_buffer = true; + info.depth.write_enable = true; ilo_blitter_set_invariants(blitter); - ilo_blitter_set_op(blitter, ILO_BLITTER_RECTLIST_RESOLVE_HIZ); + ilo_blitter_set_earlyz_op(blitter, + ILO_STATE_RASTER_EARLYZ_HIZ_RESOLVE, false); - ilo_blitter_set_dsa(blitter, &dsa_state); + ilo_blitter_set_cc(blitter, &info); ilo_blitter_set_fb_from_resource(blitter, res, res->format, level, slice); ilo_blitter_set_uses(blitter, ILO_BLITTER_USE_DSA | ILO_BLITTER_USE_FB_DEPTH); diff --git a/src/gallium/drivers/ilo/ilo_draw.c b/src/gallium/drivers/ilo/ilo_draw.c index fc91fd312d2..e8e1a4cd14c 100644 --- a/src/gallium/drivers/ilo/ilo_draw.c +++ b/src/gallium/drivers/ilo/ilo_draw.c @@ -452,12 +452,12 @@ draw_vbo_with_sw_restart(struct ilo_context *ilo, } u; /* we will draw with IB mapped */ - if (ib->buffer) { - u.ptr = intel_bo_map(ilo_buffer(ib->buffer)->bo, false); + if (ib->state.buffer) { + u.ptr = intel_bo_map(ilo_buffer(ib->state.buffer)->bo, false); if (u.ptr) - u.u8 += ib->offset; + u.u8 += ib->state.offset; } else { - u.ptr = ib->user_buffer; + u.ptr = ib->state.user_buffer; } if (!u.ptr) @@ -483,7 +483,7 @@ draw_vbo_with_sw_restart(struct ilo_context *ilo, (pipe)->draw_vbo(pipe, &subinfo); \ } while (0) - switch (ib->index_size) { + switch (ib->state.index_size) { case 1: DRAW_VBO_WITH_SW_RESTART(&ilo->base, info, u.u8); break; @@ -500,8 +500,8 @@ draw_vbo_with_sw_restart(struct ilo_context *ilo, #undef DRAW_VBO_WITH_SW_RESTART - if (ib->buffer) - intel_bo_unmap(ilo_buffer(ib->buffer)->bo); + if (ib->state.buffer) + intel_bo_unmap(ilo_buffer(ib->state.buffer)->bo); } static bool @@ -511,9 +511,9 @@ draw_vbo_need_sw_restart(const struct ilo_context *ilo, /* the restart index is fixed prior to GEN7.5 */ if (ilo_dev_gen(ilo->dev) < ILO_GEN(7.5)) { const unsigned cut_index = - (ilo->state_vector.ib.index_size == 1) ? 0xff : - (ilo->state_vector.ib.index_size == 2) ? 0xffff : - (ilo->state_vector.ib.index_size == 4) ? 0xffffffff : 0; + (ilo->state_vector.ib.state.index_size == 1) ? 
0xff : + (ilo->state_vector.ib.state.index_size == 2) ? 0xffff : + (ilo->state_vector.ib.state.index_size == 4) ? 0xffffffff : 0; if (info->restart_index < cut_index) return true; diff --git a/src/gallium/drivers/ilo/ilo_format.c b/src/gallium/drivers/ilo/ilo_format.c new file mode 100644 index 00000000000..ca7e6b55ca1 --- /dev/null +++ b/src/gallium/drivers/ilo/ilo_format.c @@ -0,0 +1,356 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 2012-2013 LunarG, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#include "genhw/genhw.h" +#include "core/ilo_state_surface.h" +#include "core/ilo_state_vf.h" +#include "ilo_format.h" + +bool +ilo_format_support_vb(const struct ilo_dev *dev, + enum pipe_format format) +{ + const int idx = ilo_format_translate(dev, format, PIPE_BIND_VERTEX_BUFFER); + + return (idx >= 0 && ilo_state_vf_valid_element_format(dev, idx)); +} + +bool +ilo_format_support_sol(const struct ilo_dev *dev, + enum pipe_format format) +{ + const int idx = ilo_format_translate(dev, format, PIPE_BIND_STREAM_OUTPUT); + + return (idx >= 0 && ilo_state_surface_valid_format(dev, + ILO_STATE_SURFACE_ACCESS_DP_SVB, idx)); +} + +bool +ilo_format_support_sampler(const struct ilo_dev *dev, + enum pipe_format format) +{ + const int idx = ilo_format_translate(dev, format, PIPE_BIND_SAMPLER_VIEW); + + return (idx >= 0 && ilo_state_surface_valid_format(dev, + ILO_STATE_SURFACE_ACCESS_SAMPLER, idx)); +} + +bool +ilo_format_support_rt(const struct ilo_dev *dev, + enum pipe_format format) +{ + const int idx = ilo_format_translate(dev, format, PIPE_BIND_RENDER_TARGET); + + return (idx >= 0 && ilo_state_surface_valid_format(dev, + ILO_STATE_SURFACE_ACCESS_DP_RENDER, idx)); +} + +bool +ilo_format_support_zs(const struct ilo_dev *dev, + enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_Z16_UNORM: + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return true; + case PIPE_FORMAT_S8_UINT: + /* TODO separate stencil */ + default: + return false; + } +} + +/** + * Translate a color (non-depth/stencil) pipe format to the matching hardware + * format. Return -1 on errors. 
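 *
 * In the table below 0 doubles as both "no mapping" and
 * GEN6_FORMAT_R32G32B32A32_FLOAT, hence the special case at the end of the
 * function before a 0 is turned into -1.  A minimal caller sketch, following
 * the pattern of the ilo_format_support_*() helpers above:
 *
 *    int fmt = ilo_format_translate_color(dev, PIPE_FORMAT_B5G6R5_UNORM);
 *    if (fmt < 0)
 *       return false;
 *
 * where a negative return means the pipe format has no hardware equivalent.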
+ */ +int +ilo_format_translate_color(const struct ilo_dev *dev, + enum pipe_format format) +{ + static const int format_mapping[PIPE_FORMAT_COUNT] = { + [PIPE_FORMAT_NONE] = 0, + [PIPE_FORMAT_B8G8R8A8_UNORM] = GEN6_FORMAT_B8G8R8A8_UNORM, + [PIPE_FORMAT_B8G8R8X8_UNORM] = GEN6_FORMAT_B8G8R8X8_UNORM, + [PIPE_FORMAT_A8R8G8B8_UNORM] = 0, + [PIPE_FORMAT_X8R8G8B8_UNORM] = 0, + [PIPE_FORMAT_B5G5R5A1_UNORM] = GEN6_FORMAT_B5G5R5A1_UNORM, + [PIPE_FORMAT_B4G4R4A4_UNORM] = GEN6_FORMAT_B4G4R4A4_UNORM, + [PIPE_FORMAT_B5G6R5_UNORM] = GEN6_FORMAT_B5G6R5_UNORM, + [PIPE_FORMAT_R10G10B10A2_UNORM] = GEN6_FORMAT_R10G10B10A2_UNORM, + [PIPE_FORMAT_L8_UNORM] = GEN6_FORMAT_L8_UNORM, + [PIPE_FORMAT_A8_UNORM] = GEN6_FORMAT_A8_UNORM, + [PIPE_FORMAT_I8_UNORM] = GEN6_FORMAT_I8_UNORM, + [PIPE_FORMAT_L8A8_UNORM] = GEN6_FORMAT_L8A8_UNORM, + [PIPE_FORMAT_L16_UNORM] = GEN6_FORMAT_L16_UNORM, + [PIPE_FORMAT_UYVY] = GEN6_FORMAT_YCRCB_SWAPUVY, + [PIPE_FORMAT_YUYV] = GEN6_FORMAT_YCRCB_NORMAL, + [PIPE_FORMAT_Z16_UNORM] = 0, + [PIPE_FORMAT_Z32_UNORM] = 0, + [PIPE_FORMAT_Z32_FLOAT] = 0, + [PIPE_FORMAT_Z24_UNORM_S8_UINT] = 0, + [PIPE_FORMAT_S8_UINT_Z24_UNORM] = 0, + [PIPE_FORMAT_Z24X8_UNORM] = 0, + [PIPE_FORMAT_X8Z24_UNORM] = 0, + [PIPE_FORMAT_S8_UINT] = 0, + [PIPE_FORMAT_R64_FLOAT] = GEN6_FORMAT_R64_FLOAT, + [PIPE_FORMAT_R64G64_FLOAT] = GEN6_FORMAT_R64G64_FLOAT, + [PIPE_FORMAT_R64G64B64_FLOAT] = GEN6_FORMAT_R64G64B64_FLOAT, + [PIPE_FORMAT_R64G64B64A64_FLOAT] = GEN6_FORMAT_R64G64B64A64_FLOAT, + [PIPE_FORMAT_R32_FLOAT] = GEN6_FORMAT_R32_FLOAT, + [PIPE_FORMAT_R32G32_FLOAT] = GEN6_FORMAT_R32G32_FLOAT, + [PIPE_FORMAT_R32G32B32_FLOAT] = GEN6_FORMAT_R32G32B32_FLOAT, + [PIPE_FORMAT_R32G32B32A32_FLOAT] = GEN6_FORMAT_R32G32B32A32_FLOAT, + [PIPE_FORMAT_R32_UNORM] = GEN6_FORMAT_R32_UNORM, + [PIPE_FORMAT_R32G32_UNORM] = GEN6_FORMAT_R32G32_UNORM, + [PIPE_FORMAT_R32G32B32_UNORM] = GEN6_FORMAT_R32G32B32_UNORM, + [PIPE_FORMAT_R32G32B32A32_UNORM] = GEN6_FORMAT_R32G32B32A32_UNORM, + [PIPE_FORMAT_R32_USCALED] = GEN6_FORMAT_R32_USCALED, + [PIPE_FORMAT_R32G32_USCALED] = GEN6_FORMAT_R32G32_USCALED, + [PIPE_FORMAT_R32G32B32_USCALED] = GEN6_FORMAT_R32G32B32_USCALED, + [PIPE_FORMAT_R32G32B32A32_USCALED] = GEN6_FORMAT_R32G32B32A32_USCALED, + [PIPE_FORMAT_R32_SNORM] = GEN6_FORMAT_R32_SNORM, + [PIPE_FORMAT_R32G32_SNORM] = GEN6_FORMAT_R32G32_SNORM, + [PIPE_FORMAT_R32G32B32_SNORM] = GEN6_FORMAT_R32G32B32_SNORM, + [PIPE_FORMAT_R32G32B32A32_SNORM] = GEN6_FORMAT_R32G32B32A32_SNORM, + [PIPE_FORMAT_R32_SSCALED] = GEN6_FORMAT_R32_SSCALED, + [PIPE_FORMAT_R32G32_SSCALED] = GEN6_FORMAT_R32G32_SSCALED, + [PIPE_FORMAT_R32G32B32_SSCALED] = GEN6_FORMAT_R32G32B32_SSCALED, + [PIPE_FORMAT_R32G32B32A32_SSCALED] = GEN6_FORMAT_R32G32B32A32_SSCALED, + [PIPE_FORMAT_R16_UNORM] = GEN6_FORMAT_R16_UNORM, + [PIPE_FORMAT_R16G16_UNORM] = GEN6_FORMAT_R16G16_UNORM, + [PIPE_FORMAT_R16G16B16_UNORM] = GEN6_FORMAT_R16G16B16_UNORM, + [PIPE_FORMAT_R16G16B16A16_UNORM] = GEN6_FORMAT_R16G16B16A16_UNORM, + [PIPE_FORMAT_R16_USCALED] = GEN6_FORMAT_R16_USCALED, + [PIPE_FORMAT_R16G16_USCALED] = GEN6_FORMAT_R16G16_USCALED, + [PIPE_FORMAT_R16G16B16_USCALED] = GEN6_FORMAT_R16G16B16_USCALED, + [PIPE_FORMAT_R16G16B16A16_USCALED] = GEN6_FORMAT_R16G16B16A16_USCALED, + [PIPE_FORMAT_R16_SNORM] = GEN6_FORMAT_R16_SNORM, + [PIPE_FORMAT_R16G16_SNORM] = GEN6_FORMAT_R16G16_SNORM, + [PIPE_FORMAT_R16G16B16_SNORM] = GEN6_FORMAT_R16G16B16_SNORM, + [PIPE_FORMAT_R16G16B16A16_SNORM] = GEN6_FORMAT_R16G16B16A16_SNORM, + [PIPE_FORMAT_R16_SSCALED] = GEN6_FORMAT_R16_SSCALED, + [PIPE_FORMAT_R16G16_SSCALED] = 
GEN6_FORMAT_R16G16_SSCALED, + [PIPE_FORMAT_R16G16B16_SSCALED] = GEN6_FORMAT_R16G16B16_SSCALED, + [PIPE_FORMAT_R16G16B16A16_SSCALED] = GEN6_FORMAT_R16G16B16A16_SSCALED, + [PIPE_FORMAT_R8_UNORM] = GEN6_FORMAT_R8_UNORM, + [PIPE_FORMAT_R8G8_UNORM] = GEN6_FORMAT_R8G8_UNORM, + [PIPE_FORMAT_R8G8B8_UNORM] = GEN6_FORMAT_R8G8B8_UNORM, + [PIPE_FORMAT_R8G8B8A8_UNORM] = GEN6_FORMAT_R8G8B8A8_UNORM, + [PIPE_FORMAT_X8B8G8R8_UNORM] = 0, + [PIPE_FORMAT_R8_USCALED] = GEN6_FORMAT_R8_USCALED, + [PIPE_FORMAT_R8G8_USCALED] = GEN6_FORMAT_R8G8_USCALED, + [PIPE_FORMAT_R8G8B8_USCALED] = GEN6_FORMAT_R8G8B8_USCALED, + [PIPE_FORMAT_R8G8B8A8_USCALED] = GEN6_FORMAT_R8G8B8A8_USCALED, + [PIPE_FORMAT_R8_SNORM] = GEN6_FORMAT_R8_SNORM, + [PIPE_FORMAT_R8G8_SNORM] = GEN6_FORMAT_R8G8_SNORM, + [PIPE_FORMAT_R8G8B8_SNORM] = GEN6_FORMAT_R8G8B8_SNORM, + [PIPE_FORMAT_R8G8B8A8_SNORM] = GEN6_FORMAT_R8G8B8A8_SNORM, + [PIPE_FORMAT_R8_SSCALED] = GEN6_FORMAT_R8_SSCALED, + [PIPE_FORMAT_R8G8_SSCALED] = GEN6_FORMAT_R8G8_SSCALED, + [PIPE_FORMAT_R8G8B8_SSCALED] = GEN6_FORMAT_R8G8B8_SSCALED, + [PIPE_FORMAT_R8G8B8A8_SSCALED] = GEN6_FORMAT_R8G8B8A8_SSCALED, + [PIPE_FORMAT_R32_FIXED] = GEN6_FORMAT_R32_SFIXED, + [PIPE_FORMAT_R32G32_FIXED] = GEN6_FORMAT_R32G32_SFIXED, + [PIPE_FORMAT_R32G32B32_FIXED] = GEN6_FORMAT_R32G32B32_SFIXED, + [PIPE_FORMAT_R32G32B32A32_FIXED] = GEN6_FORMAT_R32G32B32A32_SFIXED, + [PIPE_FORMAT_R16_FLOAT] = GEN6_FORMAT_R16_FLOAT, + [PIPE_FORMAT_R16G16_FLOAT] = GEN6_FORMAT_R16G16_FLOAT, + [PIPE_FORMAT_R16G16B16_FLOAT] = GEN6_FORMAT_R16G16B16_FLOAT, + [PIPE_FORMAT_R16G16B16A16_FLOAT] = GEN6_FORMAT_R16G16B16A16_FLOAT, + [PIPE_FORMAT_L8_SRGB] = GEN6_FORMAT_L8_UNORM_SRGB, + [PIPE_FORMAT_L8A8_SRGB] = GEN6_FORMAT_L8A8_UNORM_SRGB, + [PIPE_FORMAT_R8G8B8_SRGB] = GEN6_FORMAT_R8G8B8_UNORM_SRGB, + [PIPE_FORMAT_A8B8G8R8_SRGB] = 0, + [PIPE_FORMAT_X8B8G8R8_SRGB] = 0, + [PIPE_FORMAT_B8G8R8A8_SRGB] = GEN6_FORMAT_B8G8R8A8_UNORM_SRGB, + [PIPE_FORMAT_B8G8R8X8_SRGB] = GEN6_FORMAT_B8G8R8X8_UNORM_SRGB, + [PIPE_FORMAT_A8R8G8B8_SRGB] = 0, + [PIPE_FORMAT_X8R8G8B8_SRGB] = 0, + [PIPE_FORMAT_R8G8B8A8_SRGB] = GEN6_FORMAT_R8G8B8A8_UNORM_SRGB, + [PIPE_FORMAT_DXT1_RGB] = GEN6_FORMAT_DXT1_RGB, + [PIPE_FORMAT_DXT1_RGBA] = GEN6_FORMAT_BC1_UNORM, + [PIPE_FORMAT_DXT3_RGBA] = GEN6_FORMAT_BC2_UNORM, + [PIPE_FORMAT_DXT5_RGBA] = GEN6_FORMAT_BC3_UNORM, + [PIPE_FORMAT_DXT1_SRGB] = GEN6_FORMAT_DXT1_RGB_SRGB, + [PIPE_FORMAT_DXT1_SRGBA] = GEN6_FORMAT_BC1_UNORM_SRGB, + [PIPE_FORMAT_DXT3_SRGBA] = GEN6_FORMAT_BC2_UNORM_SRGB, + [PIPE_FORMAT_DXT5_SRGBA] = GEN6_FORMAT_BC3_UNORM_SRGB, + [PIPE_FORMAT_RGTC1_UNORM] = GEN6_FORMAT_BC4_UNORM, + [PIPE_FORMAT_RGTC1_SNORM] = GEN6_FORMAT_BC4_SNORM, + [PIPE_FORMAT_RGTC2_UNORM] = GEN6_FORMAT_BC5_UNORM, + [PIPE_FORMAT_RGTC2_SNORM] = GEN6_FORMAT_BC5_SNORM, + [PIPE_FORMAT_R8G8_B8G8_UNORM] = 0, + [PIPE_FORMAT_G8R8_G8B8_UNORM] = 0, + [PIPE_FORMAT_R8SG8SB8UX8U_NORM] = 0, + [PIPE_FORMAT_R5SG5SB6U_NORM] = 0, + [PIPE_FORMAT_A8B8G8R8_UNORM] = 0, + [PIPE_FORMAT_B5G5R5X1_UNORM] = GEN6_FORMAT_B5G5R5X1_UNORM, + [PIPE_FORMAT_R10G10B10A2_USCALED] = GEN6_FORMAT_R10G10B10A2_USCALED, + [PIPE_FORMAT_R11G11B10_FLOAT] = GEN6_FORMAT_R11G11B10_FLOAT, + [PIPE_FORMAT_R9G9B9E5_FLOAT] = GEN6_FORMAT_R9G9B9E5_SHAREDEXP, + [PIPE_FORMAT_Z32_FLOAT_S8X24_UINT] = 0, + [PIPE_FORMAT_R1_UNORM] = GEN6_FORMAT_R1_UNORM, + [PIPE_FORMAT_R10G10B10X2_USCALED] = GEN6_FORMAT_R10G10B10X2_USCALED, + [PIPE_FORMAT_R10G10B10X2_SNORM] = 0, + [PIPE_FORMAT_L4A4_UNORM] = 0, + [PIPE_FORMAT_B10G10R10A2_UNORM] = GEN6_FORMAT_B10G10R10A2_UNORM, + [PIPE_FORMAT_R10SG10SB10SA2U_NORM] = 0, + 
[PIPE_FORMAT_R8G8Bx_SNORM] = 0, + [PIPE_FORMAT_R8G8B8X8_UNORM] = GEN6_FORMAT_R8G8B8X8_UNORM, + [PIPE_FORMAT_B4G4R4X4_UNORM] = 0, + [PIPE_FORMAT_X24S8_UINT] = 0, + [PIPE_FORMAT_S8X24_UINT] = 0, + [PIPE_FORMAT_X32_S8X24_UINT] = 0, + [PIPE_FORMAT_B2G3R3_UNORM] = 0, + [PIPE_FORMAT_L16A16_UNORM] = GEN6_FORMAT_L16A16_UNORM, + [PIPE_FORMAT_A16_UNORM] = GEN6_FORMAT_A16_UNORM, + [PIPE_FORMAT_I16_UNORM] = GEN6_FORMAT_I16_UNORM, + [PIPE_FORMAT_LATC1_UNORM] = 0, + [PIPE_FORMAT_LATC1_SNORM] = 0, + [PIPE_FORMAT_LATC2_UNORM] = 0, + [PIPE_FORMAT_LATC2_SNORM] = 0, + [PIPE_FORMAT_A8_SNORM] = 0, + [PIPE_FORMAT_L8_SNORM] = 0, + [PIPE_FORMAT_L8A8_SNORM] = 0, + [PIPE_FORMAT_I8_SNORM] = 0, + [PIPE_FORMAT_A16_SNORM] = 0, + [PIPE_FORMAT_L16_SNORM] = 0, + [PIPE_FORMAT_L16A16_SNORM] = 0, + [PIPE_FORMAT_I16_SNORM] = 0, + [PIPE_FORMAT_A16_FLOAT] = GEN6_FORMAT_A16_FLOAT, + [PIPE_FORMAT_L16_FLOAT] = GEN6_FORMAT_L16_FLOAT, + [PIPE_FORMAT_L16A16_FLOAT] = GEN6_FORMAT_L16A16_FLOAT, + [PIPE_FORMAT_I16_FLOAT] = GEN6_FORMAT_I16_FLOAT, + [PIPE_FORMAT_A32_FLOAT] = GEN6_FORMAT_A32_FLOAT, + [PIPE_FORMAT_L32_FLOAT] = GEN6_FORMAT_L32_FLOAT, + [PIPE_FORMAT_L32A32_FLOAT] = GEN6_FORMAT_L32A32_FLOAT, + [PIPE_FORMAT_I32_FLOAT] = GEN6_FORMAT_I32_FLOAT, + [PIPE_FORMAT_YV12] = 0, + [PIPE_FORMAT_YV16] = 0, + [PIPE_FORMAT_IYUV] = 0, + [PIPE_FORMAT_NV12] = 0, + [PIPE_FORMAT_NV21] = 0, + [PIPE_FORMAT_A4R4_UNORM] = 0, + [PIPE_FORMAT_R4A4_UNORM] = 0, + [PIPE_FORMAT_R8A8_UNORM] = 0, + [PIPE_FORMAT_A8R8_UNORM] = 0, + [PIPE_FORMAT_R10G10B10A2_SSCALED] = GEN6_FORMAT_R10G10B10A2_SSCALED, + [PIPE_FORMAT_R10G10B10A2_SNORM] = GEN6_FORMAT_R10G10B10A2_SNORM, + [PIPE_FORMAT_B10G10R10A2_USCALED] = GEN6_FORMAT_B10G10R10A2_USCALED, + [PIPE_FORMAT_B10G10R10A2_SSCALED] = GEN6_FORMAT_B10G10R10A2_SSCALED, + [PIPE_FORMAT_B10G10R10A2_SNORM] = GEN6_FORMAT_B10G10R10A2_SNORM, + [PIPE_FORMAT_R8_UINT] = GEN6_FORMAT_R8_UINT, + [PIPE_FORMAT_R8G8_UINT] = GEN6_FORMAT_R8G8_UINT, + [PIPE_FORMAT_R8G8B8_UINT] = GEN6_FORMAT_R8G8B8_UINT, + [PIPE_FORMAT_R8G8B8A8_UINT] = GEN6_FORMAT_R8G8B8A8_UINT, + [PIPE_FORMAT_R8_SINT] = GEN6_FORMAT_R8_SINT, + [PIPE_FORMAT_R8G8_SINT] = GEN6_FORMAT_R8G8_SINT, + [PIPE_FORMAT_R8G8B8_SINT] = GEN6_FORMAT_R8G8B8_SINT, + [PIPE_FORMAT_R8G8B8A8_SINT] = GEN6_FORMAT_R8G8B8A8_SINT, + [PIPE_FORMAT_R16_UINT] = GEN6_FORMAT_R16_UINT, + [PIPE_FORMAT_R16G16_UINT] = GEN6_FORMAT_R16G16_UINT, + [PIPE_FORMAT_R16G16B16_UINT] = GEN6_FORMAT_R16G16B16_UINT, + [PIPE_FORMAT_R16G16B16A16_UINT] = GEN6_FORMAT_R16G16B16A16_UINT, + [PIPE_FORMAT_R16_SINT] = GEN6_FORMAT_R16_SINT, + [PIPE_FORMAT_R16G16_SINT] = GEN6_FORMAT_R16G16_SINT, + [PIPE_FORMAT_R16G16B16_SINT] = GEN6_FORMAT_R16G16B16_SINT, + [PIPE_FORMAT_R16G16B16A16_SINT] = GEN6_FORMAT_R16G16B16A16_SINT, + [PIPE_FORMAT_R32_UINT] = GEN6_FORMAT_R32_UINT, + [PIPE_FORMAT_R32G32_UINT] = GEN6_FORMAT_R32G32_UINT, + [PIPE_FORMAT_R32G32B32_UINT] = GEN6_FORMAT_R32G32B32_UINT, + [PIPE_FORMAT_R32G32B32A32_UINT] = GEN6_FORMAT_R32G32B32A32_UINT, + [PIPE_FORMAT_R32_SINT] = GEN6_FORMAT_R32_SINT, + [PIPE_FORMAT_R32G32_SINT] = GEN6_FORMAT_R32G32_SINT, + [PIPE_FORMAT_R32G32B32_SINT] = GEN6_FORMAT_R32G32B32_SINT, + [PIPE_FORMAT_R32G32B32A32_SINT] = GEN6_FORMAT_R32G32B32A32_SINT, + [PIPE_FORMAT_A8_UINT] = 0, + [PIPE_FORMAT_I8_UINT] = GEN6_FORMAT_I8_UINT, + [PIPE_FORMAT_L8_UINT] = GEN6_FORMAT_L8_UINT, + [PIPE_FORMAT_L8A8_UINT] = GEN6_FORMAT_L8A8_UINT, + [PIPE_FORMAT_A8_SINT] = 0, + [PIPE_FORMAT_I8_SINT] = GEN6_FORMAT_I8_SINT, + [PIPE_FORMAT_L8_SINT] = GEN6_FORMAT_L8_SINT, + [PIPE_FORMAT_L8A8_SINT] = GEN6_FORMAT_L8A8_SINT, + [PIPE_FORMAT_A16_UINT] = 
0, + [PIPE_FORMAT_I16_UINT] = 0, + [PIPE_FORMAT_L16_UINT] = 0, + [PIPE_FORMAT_L16A16_UINT] = 0, + [PIPE_FORMAT_A16_SINT] = 0, + [PIPE_FORMAT_I16_SINT] = 0, + [PIPE_FORMAT_L16_SINT] = 0, + [PIPE_FORMAT_L16A16_SINT] = 0, + [PIPE_FORMAT_A32_UINT] = 0, + [PIPE_FORMAT_I32_UINT] = 0, + [PIPE_FORMAT_L32_UINT] = 0, + [PIPE_FORMAT_L32A32_UINT] = 0, + [PIPE_FORMAT_A32_SINT] = 0, + [PIPE_FORMAT_I32_SINT] = 0, + [PIPE_FORMAT_L32_SINT] = 0, + [PIPE_FORMAT_L32A32_SINT] = 0, + [PIPE_FORMAT_B10G10R10A2_UINT] = GEN6_FORMAT_B10G10R10A2_UINT, + [PIPE_FORMAT_ETC1_RGB8] = GEN6_FORMAT_ETC1_RGB8, + [PIPE_FORMAT_R8G8_R8B8_UNORM] = 0, + [PIPE_FORMAT_G8R8_B8R8_UNORM] = 0, + [PIPE_FORMAT_R8G8B8X8_SNORM] = 0, + [PIPE_FORMAT_R8G8B8X8_SRGB] = 0, + [PIPE_FORMAT_R8G8B8X8_UINT] = 0, + [PIPE_FORMAT_R8G8B8X8_SINT] = 0, + [PIPE_FORMAT_B10G10R10X2_UNORM] = GEN6_FORMAT_B10G10R10X2_UNORM, + [PIPE_FORMAT_R16G16B16X16_UNORM] = GEN6_FORMAT_R16G16B16X16_UNORM, + [PIPE_FORMAT_R16G16B16X16_SNORM] = 0, + [PIPE_FORMAT_R16G16B16X16_FLOAT] = GEN6_FORMAT_R16G16B16X16_FLOAT, + [PIPE_FORMAT_R16G16B16X16_UINT] = 0, + [PIPE_FORMAT_R16G16B16X16_SINT] = 0, + [PIPE_FORMAT_R32G32B32X32_FLOAT] = GEN6_FORMAT_R32G32B32X32_FLOAT, + [PIPE_FORMAT_R32G32B32X32_UINT] = 0, + [PIPE_FORMAT_R32G32B32X32_SINT] = 0, + [PIPE_FORMAT_R8A8_SNORM] = 0, + [PIPE_FORMAT_R16A16_UNORM] = 0, + [PIPE_FORMAT_R16A16_SNORM] = 0, + [PIPE_FORMAT_R16A16_FLOAT] = 0, + [PIPE_FORMAT_R32A32_FLOAT] = 0, + [PIPE_FORMAT_R8A8_UINT] = 0, + [PIPE_FORMAT_R8A8_SINT] = 0, + [PIPE_FORMAT_R16A16_UINT] = 0, + [PIPE_FORMAT_R16A16_SINT] = 0, + [PIPE_FORMAT_R32A32_UINT] = 0, + [PIPE_FORMAT_R32A32_SINT] = 0, + [PIPE_FORMAT_R10G10B10A2_UINT] = GEN6_FORMAT_R10G10B10A2_UINT, + [PIPE_FORMAT_B5G6R5_SRGB] = GEN6_FORMAT_B5G6R5_UNORM_SRGB, + }; + int sfmt = format_mapping[format]; + + /* GEN6_FORMAT_R32G32B32A32_FLOAT happens to be 0 */ + if (!sfmt && format != PIPE_FORMAT_R32G32B32A32_FLOAT) + sfmt = -1; + + return sfmt; +} diff --git a/src/gallium/drivers/ilo/core/ilo_format.h b/src/gallium/drivers/ilo/ilo_format.h index 6b73ea1dad7..4e955c09c14 100644 --- a/src/gallium/drivers/ilo/core/ilo_format.h +++ b/src/gallium/drivers/ilo/ilo_format.h @@ -29,8 +29,8 @@ #define ILO_FORMAT_H #include "genhw/genhw.h" -#include "ilo_core.h" -#include "ilo_dev.h" + +#include "ilo_common.h" bool ilo_format_support_vb(const struct ilo_dev *dev, diff --git a/src/gallium/drivers/ilo/ilo_render.c b/src/gallium/drivers/ilo/ilo_render.c index f5be3360f05..21f75de11a0 100644 --- a/src/gallium/drivers/ilo/ilo_render.c +++ b/src/gallium/drivers/ilo/ilo_render.c @@ -35,76 +35,10 @@ #include "ilo_query.h" #include "ilo_render_gen.h" -/* in S1.3 */ -struct sample_position { - int8_t x, y; -}; - -static const struct sample_position ilo_sample_pattern_1x[1] = { - { 0, 0 }, -}; - -static const struct sample_position ilo_sample_pattern_2x[2] = { - { -4, -4 }, - { 4, 4 }, -}; - -static const struct sample_position ilo_sample_pattern_4x[4] = { - { -2, -6 }, - { 6, -2 }, - { -6, 2 }, - { 2, 6 }, -}; - -/* \see brw_multisample_positions_8x */ -static const struct sample_position ilo_sample_pattern_8x[8] = { - { -1, 1 }, - { 1, 5 }, - { 3, -5 }, - { 5, 3 }, - { -7, -1 }, - { -3, -7 }, - { 7, -3 }, - { -5, 7 }, -}; - -static const struct sample_position ilo_sample_pattern_16x[16] = { - { 0, 2 }, - { 3, 0 }, - { -3, -2 }, - { -2, -4 }, - { 4, 3 }, - { 5, 1 }, - { 6, -1 }, - { 2, -6 }, - { -4, 5 }, - { -5, -5 }, - { -1, -7 }, - { 7, -3 }, - { -7, 4 }, - { 1, -8 }, - { -6, 6 }, - { -8, 7 }, -}; - -static uint8_t -pack_sample_position(const 
struct sample_position *pos) -{ - return (pos->x + 8) << 4 | (pos->y + 8); -} - -static void -get_sample_position(const struct sample_position *pos, float *x, float *y) -{ - *x = (float) (pos->x + 8) / 16.0f; - *y = (float) (pos->y + 8) / 16.0f; -} - struct ilo_render * ilo_render_create(struct ilo_builder *builder) { struct ilo_render *render; - int i; render = CALLOC_STRUCT(ilo_render); if (!render) @@ -121,29 +55,8 @@ ilo_render_create(struct ilo_builder *builder) return NULL; } - /* pack into dwords */ - render->sample_pattern_1x = pack_sample_position(ilo_sample_pattern_1x); - render->sample_pattern_2x = - pack_sample_position(&ilo_sample_pattern_2x[1]) << 8 | - pack_sample_position(&ilo_sample_pattern_2x[0]); - for (i = 0; i < 4; i++) { - render->sample_pattern_4x |= - pack_sample_position(&ilo_sample_pattern_4x[i]) << (8 * i); - - render->sample_pattern_8x[0] |= - pack_sample_position(&ilo_sample_pattern_8x[i]) << (8 * i); - render->sample_pattern_8x[1] |= - pack_sample_position(&ilo_sample_pattern_8x[i + 4]) << (8 * i); - - render->sample_pattern_16x[0] |= - pack_sample_position(&ilo_sample_pattern_16x[i]) << (8 * i); - render->sample_pattern_16x[1] |= - pack_sample_position(&ilo_sample_pattern_16x[i + 4]) << (8 * i); - render->sample_pattern_16x[2] |= - pack_sample_position(&ilo_sample_pattern_16x[i + 8]) << (8 * i); - render->sample_pattern_16x[3] |= - pack_sample_position(&ilo_sample_pattern_16x[i + 12]) << (8 * i); - } + ilo_state_sample_pattern_init_default(&render->sample_pattern, + render->dev); ilo_render_invalidate_hw(render); ilo_render_invalidate_builder(render); @@ -164,38 +77,13 @@ ilo_render_get_sample_position(const struct ilo_render *render, unsigned sample_index, float *x, float *y) { - const struct sample_position *pattern; + uint8_t off_x, off_y; - switch (sample_count) { - case 1: - assert(sample_index < Elements(ilo_sample_pattern_1x)); - pattern = ilo_sample_pattern_1x; - break; - case 2: - assert(sample_index < Elements(ilo_sample_pattern_2x)); - pattern = ilo_sample_pattern_2x; - break; - case 4: - assert(sample_index < Elements(ilo_sample_pattern_4x)); - pattern = ilo_sample_pattern_4x; - break; - case 8: - assert(sample_index < Elements(ilo_sample_pattern_8x)); - pattern = ilo_sample_pattern_8x; - break; - case 16: - assert(sample_index < Elements(ilo_sample_pattern_16x)); - pattern = ilo_sample_pattern_16x; - break; - default: - assert(!"unknown sample count"); - *x = 0.5f; - *y = 0.5f; - return; - break; - } + ilo_state_sample_pattern_get_offset(&render->sample_pattern, render->dev, + sample_count, sample_index, &off_x, &off_y); - get_sample_position(&pattern[sample_index], x, y); + *x = (float) off_x / 16.0f; + *y = (float) off_y / 16.0f; } void @@ -446,12 +334,44 @@ draw_session_prepare(struct ilo_render *render, render->instruction_bo_changed = true; session->prim_changed = true; - session->primitive_restart_changed = true; + + ilo_state_urb_full_delta(&vec->urb, render->dev, &session->urb_delta); + ilo_state_vf_full_delta(&vec->ve->vf, render->dev, &session->vf_delta); + + ilo_state_raster_full_delta(&vec->rasterizer->rs, render->dev, + &session->rs_delta); + + ilo_state_viewport_full_delta(&vec->viewport.vp, render->dev, + &session->vp_delta); + + ilo_state_cc_full_delta(&vec->blend->cc, render->dev, + &session->cc_delta); } else { session->prim_changed = (render->state.reduced_prim != session->reduced_prim); - session->primitive_restart_changed = - (render->state.primitive_restart != vec->draw->primitive_restart); + + 
ilo_state_urb_get_delta(&vec->urb, render->dev, + &render->state.urb, &session->urb_delta); + + if (vec->dirty & ILO_DIRTY_VE) { + ilo_state_vf_full_delta(&vec->ve->vf, render->dev, + &session->vf_delta); + } + + if (vec->dirty & ILO_DIRTY_RASTERIZER) { + ilo_state_raster_get_delta(&vec->rasterizer->rs, render->dev, + &render->state.rs, &session->rs_delta); + } + + if (vec->dirty & ILO_DIRTY_VIEWPORT) { + ilo_state_viewport_full_delta(&vec->viewport.vp, render->dev, + &session->vp_delta); + } + + if (vec->dirty & ILO_DIRTY_BLEND) { + ilo_state_cc_get_delta(&vec->blend->cc, render->dev, + &render->state.cc, &session->cc_delta); + } } } @@ -467,7 +387,10 @@ draw_session_end(struct ilo_render *render, render->instruction_bo_changed = false; render->state.reduced_prim = session->reduced_prim; - render->state.primitive_restart = vec->draw->primitive_restart; + + render->state.urb = vec->urb; + render->state.rs = vec->rasterizer->rs; + render->state.cc = vec->blend->cc; } void diff --git a/src/gallium/drivers/ilo/ilo_render.h b/src/gallium/drivers/ilo/ilo_render.h index a85b2800fb1..098af73ec9b 100644 --- a/src/gallium/drivers/ilo/ilo_render.h +++ b/src/gallium/drivers/ilo/ilo_render.h @@ -43,9 +43,6 @@ ilo_render_create(struct ilo_builder *builder); void ilo_render_destroy(struct ilo_render *render); -/** - * Estimate the size of an action. - */ void ilo_render_get_sample_position(const struct ilo_render *render, unsigned sample_count, diff --git a/src/gallium/drivers/ilo/ilo_render_dynamic.c b/src/gallium/drivers/ilo/ilo_render_dynamic.c index ef92b12da83..3b4c80227a6 100644 --- a/src/gallium/drivers/ilo/ilo_render_dynamic.c +++ b/src/gallium/drivers/ilo/ilo_render_dynamic.c @@ -30,6 +30,7 @@ #include "ilo_common.h" #include "ilo_blitter.h" +#include "ilo_shader.h" #include "ilo_state.h" #include "ilo_render_gen.h" @@ -42,16 +43,14 @@ gen6_emit_draw_dynamic_viewports(struct ilo_render *r, { ILO_DEV_ASSERT(r->dev, 6, 6); - /* SF_VIEWPORT, CLIP_VIEWPORT, and CC_VIEWPORT */ - if (DIRTY(VIEWPORT)) { + /* CLIP_VIEWPORT, SF_VIEWPORT, and CC_VIEWPORT */ + if ((session->vp_delta.dirty & (ILO_STATE_VIEWPORT_SF_CLIP_VIEWPORT | + ILO_STATE_VIEWPORT_CC_VIEWPORT)) || + r->state_bo_changed) { r->state.CLIP_VIEWPORT = gen6_CLIP_VIEWPORT(r->builder, - vec->viewport.cso, vec->viewport.count); - - r->state.SF_VIEWPORT = gen6_SF_VIEWPORT(r->builder, - vec->viewport.cso, vec->viewport.count); - - r->state.CC_VIEWPORT = gen6_CC_VIEWPORT(r->builder, - vec->viewport.cso, vec->viewport.count); + &vec->viewport.vp); + r->state.SF_VIEWPORT = gen6_SF_VIEWPORT(r->builder, &vec->viewport.vp); + r->state.CC_VIEWPORT = gen6_CC_VIEWPORT(r->builder, &vec->viewport.vp); session->viewport_changed = true; } @@ -65,12 +64,12 @@ gen7_emit_draw_dynamic_viewports(struct ilo_render *r, ILO_DEV_ASSERT(r->dev, 7, 8); /* SF_CLIP_VIEWPORT and CC_VIEWPORT */ - if (DIRTY(VIEWPORT)) { + if ((session->vp_delta.dirty & (ILO_STATE_VIEWPORT_SF_CLIP_VIEWPORT | + ILO_STATE_VIEWPORT_CC_VIEWPORT)) || + r->state_bo_changed) { r->state.SF_CLIP_VIEWPORT = gen7_SF_CLIP_VIEWPORT(r->builder, - vec->viewport.cso, vec->viewport.count); - - r->state.CC_VIEWPORT = gen6_CC_VIEWPORT(r->builder, - vec->viewport.cso, vec->viewport.count); + &vec->viewport.vp); + r->state.CC_VIEWPORT = gen6_CC_VIEWPORT(r->builder, &vec->viewport.vp); session->viewport_changed = true; } @@ -84,10 +83,10 @@ gen6_emit_draw_dynamic_scissors(struct ilo_render *r, ILO_DEV_ASSERT(r->dev, 6, 8); /* SCISSOR_RECT */ - if (DIRTY(SCISSOR) || DIRTY(VIEWPORT)) { - /* there should be as many 
scissors as there are viewports */ + if ((session->vp_delta.dirty & ILO_STATE_VIEWPORT_SCISSOR_RECT) || + r->state_bo_changed) { r->state.SCISSOR_RECT = gen6_SCISSOR_RECT(r->builder, - &vec->scissor, vec->viewport.count); + &vec->viewport.vp); session->scissor_changed = true; } @@ -101,32 +100,30 @@ gen6_emit_draw_dynamic_cc(struct ilo_render *r, ILO_DEV_ASSERT(r->dev, 6, 8); /* BLEND_STATE */ - if (DIRTY(BLEND) || DIRTY(FB) || DIRTY(DSA)) { - if (ilo_dev_gen(r->dev) >= ILO_GEN(8)) { - r->state.BLEND_STATE = gen8_BLEND_STATE(r->builder, - vec->blend, &vec->fb, vec->dsa); - } else { - r->state.BLEND_STATE = gen6_BLEND_STATE(r->builder, - vec->blend, &vec->fb, vec->dsa); - } + if ((session->cc_delta.dirty & ILO_STATE_CC_BLEND_STATE) || + r->state_bo_changed) { + if (ilo_dev_gen(r->dev) >= ILO_GEN(8)) + r->state.BLEND_STATE = gen8_BLEND_STATE(r->builder, &vec->blend->cc); + else + r->state.BLEND_STATE = gen6_BLEND_STATE(r->builder, &vec->blend->cc); session->blend_changed = true; } /* COLOR_CALC_STATE */ - if (DIRTY(DSA) || DIRTY(STENCIL_REF) || DIRTY(BLEND_COLOR)) { + if ((session->cc_delta.dirty & ILO_STATE_CC_COLOR_CALC_STATE) || + r->state_bo_changed) { r->state.COLOR_CALC_STATE = - gen6_COLOR_CALC_STATE(r->builder, &vec->stencil_ref, - vec->dsa->alpha_ref, &vec->blend_color); - + gen6_COLOR_CALC_STATE(r->builder, &vec->blend->cc); session->cc_changed = true; } /* DEPTH_STENCIL_STATE */ - if (ilo_dev_gen(r->dev) < ILO_GEN(8) && DIRTY(DSA)) { + if (ilo_dev_gen(r->dev) < ILO_GEN(8) && + ((session->cc_delta.dirty & ILO_STATE_CC_DEPTH_STENCIL_STATE) || + r->state_bo_changed)) { r->state.DEPTH_STENCIL_STATE = - gen6_DEPTH_STENCIL_STATE(r->builder, vec->dsa); - + gen6_DEPTH_STENCIL_STATE(r->builder, &vec->blend->cc); session->dsa_changed = true; } } @@ -137,12 +134,11 @@ gen6_emit_draw_dynamic_samplers(struct ilo_render *r, int shader_type, struct ilo_render_draw_session *session) { - const struct ilo_sampler_cso * const *samplers = - vec->sampler[shader_type].cso; - const struct pipe_sampler_view * const *views = - (const struct pipe_sampler_view **) vec->view[shader_type].states; + const struct ilo_view_cso * const *views = + (const struct ilo_view_cso **) vec->view[shader_type].states; + struct ilo_state_sampler samplers[ILO_MAX_SAMPLERS]; uint32_t *sampler_state, *border_color_state; - int sampler_count; + int sampler_count, i; bool emit_border_color = false; bool skip = false; @@ -194,16 +190,28 @@ gen6_emit_draw_dynamic_samplers(struct ilo_render *r, sampler_count <= Elements(vec->sampler[shader_type].cso)); if (emit_border_color) { - int i; - for (i = 0; i < sampler_count; i++) { - border_color_state[i] = (samplers[i]) ? - gen6_SAMPLER_BORDER_COLOR_STATE(r->builder, samplers[i]) : 0; + const struct ilo_sampler_cso *cso = vec->sampler[shader_type].cso[i]; + + border_color_state[i] = (cso) ? 
+ gen6_SAMPLER_BORDER_COLOR_STATE(r->builder, &cso->border) : 0; + } + } + + for (i = 0; i < sampler_count; i++) { + const struct ilo_sampler_cso *cso = vec->sampler[shader_type].cso[i]; + + if (cso && views[i]) { + samplers[i] = cso->sampler; + ilo_state_sampler_set_surface(&samplers[i], + r->dev, &views[i]->surface); + } else { + samplers[i] = vec->disabled_sampler; } } - *sampler_state = gen6_SAMPLER_STATE(r->builder, - samplers, views, border_color_state, sampler_count); + *sampler_state = gen6_SAMPLER_STATE(r->builder, samplers, + border_color_state, sampler_count); } static void @@ -234,13 +242,13 @@ gen6_emit_draw_dynamic_pcb(struct ilo_render *r, const struct ilo_cbuf_state *cbuf = &vec->cbuf[PIPE_SHADER_VERTEX]; - if (cbuf0_size <= cbuf->cso[0].user_buffer_size) { + if (cbuf0_size <= cbuf->cso[0].info.size) { memcpy(pcb, cbuf->cso[0].user_buffer, cbuf0_size); } else { memcpy(pcb, cbuf->cso[0].user_buffer, - cbuf->cso[0].user_buffer_size); - memset(pcb + cbuf->cso[0].user_buffer_size, 0, - cbuf0_size - cbuf->cso[0].user_buffer_size); + cbuf->cso[0].info.size); + memset(pcb + cbuf->cso[0].info.size, 0, + cbuf0_size - cbuf->cso[0].info.size); } pcb += cbuf0_size; @@ -271,13 +279,13 @@ gen6_emit_draw_dynamic_pcb(struct ilo_render *r, gen6_push_constant_buffer(r->builder, cbuf0_size, &pcb); r->state.wm.PUSH_CONSTANT_BUFFER_size = cbuf0_size; - if (cbuf0_size <= cbuf->cso[0].user_buffer_size) { + if (cbuf0_size <= cbuf->cso[0].info.size) { memcpy(pcb, cbuf->cso[0].user_buffer, cbuf0_size); } else { memcpy(pcb, cbuf->cso[0].user_buffer, - cbuf->cso[0].user_buffer_size); - memset(pcb + cbuf->cso[0].user_buffer_size, 0, - cbuf0_size - cbuf->cso[0].user_buffer_size); + cbuf->cso[0].info.size); + memset(pcb + cbuf->cso[0].info.size, 0, + cbuf0_size - cbuf->cso[0].info.size); } session->pcb_fs_changed = true; @@ -441,18 +449,17 @@ ilo_render_emit_rectlist_dynamic_states(struct ilo_render *render, if (blitter->uses & ILO_BLITTER_USE_DSA) { render->state.DEPTH_STENCIL_STATE = - gen6_DEPTH_STENCIL_STATE(render->builder, &blitter->dsa); + gen6_DEPTH_STENCIL_STATE(render->builder, &blitter->cc); } if (blitter->uses & ILO_BLITTER_USE_CC) { render->state.COLOR_CALC_STATE = - gen6_COLOR_CALC_STATE(render->builder, &blitter->cc.stencil_ref, - blitter->cc.alpha_ref, &blitter->cc.blend_color); + gen6_COLOR_CALC_STATE(render->builder, &blitter->cc); } if (blitter->uses & ILO_BLITTER_USE_VIEWPORT) { render->state.CC_VIEWPORT = - gen6_CC_VIEWPORT(render->builder, &blitter->viewport, 1); + gen6_CC_VIEWPORT(render->builder, &blitter->vp); } assert(ilo_builder_dynamic_used(render->builder) <= dynamic_used + @@ -466,10 +473,9 @@ gen6_emit_launch_grid_dynamic_samplers(struct ilo_render *r, { const unsigned shader_type = PIPE_SHADER_COMPUTE; const struct ilo_shader_state *cs = vec->cs; - const struct ilo_sampler_cso * const *samplers = - vec->sampler[shader_type].cso; - const struct pipe_sampler_view * const *views = - (const struct pipe_sampler_view **) vec->view[shader_type].states; + const struct ilo_view_cso * const *views = + (const struct ilo_view_cso **) vec->view[shader_type].states; + struct ilo_state_sampler samplers[ILO_MAX_SAMPLERS]; int sampler_count, i; ILO_DEV_ASSERT(r->dev, 7, 7.5); @@ -480,11 +486,25 @@ gen6_emit_launch_grid_dynamic_samplers(struct ilo_render *r, sampler_count <= Elements(vec->sampler[shader_type].cso)); for (i = 0; i < sampler_count; i++) { - r->state.cs.SAMPLER_BORDER_COLOR_STATE[i] = (samplers[i]) ? 
- gen6_SAMPLER_BORDER_COLOR_STATE(r->builder, samplers[i]) : 0; + const struct ilo_sampler_cso *cso = vec->sampler[shader_type].cso[i]; + + r->state.cs.SAMPLER_BORDER_COLOR_STATE[i] = (cso) ? + gen6_SAMPLER_BORDER_COLOR_STATE(r->builder, &cso->border) : 0; } - r->state.cs.SAMPLER_STATE = gen6_SAMPLER_STATE(r->builder, samplers, views, + for (i = 0; i < sampler_count; i++) { + const struct ilo_sampler_cso *cso = vec->sampler[shader_type].cso[i]; + + if (cso && views[i]) { + samplers[i] = cso->sampler; + ilo_state_sampler_set_surface(&samplers[i], + r->dev, &views[i]->surface); + } else { + samplers[i] = vec->disabled_sampler; + } + } + + r->state.cs.SAMPLER_STATE = gen6_SAMPLER_STATE(r->builder, samplers, r->state.cs.SAMPLER_BORDER_COLOR_STATE, sampler_count); } @@ -503,20 +523,39 @@ gen6_emit_launch_grid_dynamic_idrt(struct ilo_render *r, struct ilo_render_launch_grid_session *session) { const struct ilo_shader_state *cs = vec->cs; - struct gen6_idrt_data data; + struct ilo_state_compute_interface_info interface; + struct ilo_state_compute_info info; + uint32_t kernel_offset; ILO_DEV_ASSERT(r->dev, 7, 7.5); - memset(&data, 0, sizeof(data)); + memset(&interface, 0, sizeof(interface)); + + interface.sampler_count = + ilo_shader_get_kernel_param(cs, ILO_KERNEL_SAMPLER_COUNT); + interface.surface_count = + ilo_shader_get_kernel_param(cs, ILO_KERNEL_SURFACE_TOTAL_COUNT); + interface.thread_group_size = session->thread_group_size; + interface.slm_size = + ilo_shader_get_kernel_param(cs, ILO_KERNEL_CS_LOCAL_SIZE); + interface.curbe_read_length = r->state.cs.PUSH_CONSTANT_BUFFER_size; + + memset(&info, 0, sizeof(info)); + info.data = session->compute_data; + info.data_size = sizeof(session->compute_data); + info.interfaces = &interface; + info.interface_count = 1; + info.cv_urb_alloc_size = r->dev->urb_size; + info.curbe_alloc_size = r->state.cs.PUSH_CONSTANT_BUFFER_size; + + ilo_state_compute_init(&session->compute, r->dev, &info); - data.cs = cs; - data.sampler_offset = r->state.cs.SAMPLER_STATE; - data.binding_table_offset = r->state.cs.BINDING_TABLE_STATE; + kernel_offset = ilo_shader_get_kernel_offset(cs); - data.curbe_size = r->state.cs.PUSH_CONSTANT_BUFFER_size; - data.thread_group_size = session->thread_group_size; + session->idrt = gen6_INTERFACE_DESCRIPTOR_DATA(r->builder, + &session->compute, &kernel_offset, + &r->state.cs.SAMPLER_STATE, &r->state.cs.BINDING_TABLE_STATE); - session->idrt = gen6_INTERFACE_DESCRIPTOR_DATA(r->builder, &data, 1); session->idrt_size = 32; } diff --git a/src/gallium/drivers/ilo/ilo_render_gen.h b/src/gallium/drivers/ilo/ilo_render_gen.h index acfe8be3088..6b133750043 100644 --- a/src/gallium/drivers/ilo/ilo_render_gen.h +++ b/src/gallium/drivers/ilo/ilo_render_gen.h @@ -31,6 +31,7 @@ #include "core/ilo_builder.h" #include "core/ilo_builder_3d.h" #include "core/ilo_builder_render.h" +#include "core/ilo_state_raster.h" #include "ilo_common.h" #include "ilo_state.h" @@ -50,11 +51,7 @@ struct ilo_render { struct intel_bo *workaround_bo; - uint32_t sample_pattern_1x; - uint32_t sample_pattern_2x; - uint32_t sample_pattern_4x; - uint32_t sample_pattern_8x[2]; - uint32_t sample_pattern_16x[4]; + struct ilo_state_sample_pattern sample_pattern; bool hw_ctx_changed; @@ -85,10 +82,13 @@ struct ilo_render { */ uint32_t deferred_pipe_control_dw1; - bool primitive_restart; int reduced_prim; int so_max_vertices; + struct ilo_state_urb urb; + struct ilo_state_raster rs; + struct ilo_state_cc cc; + uint32_t SF_VIEWPORT; uint32_t CLIP_VIEWPORT; uint32_t SF_CLIP_VIEWPORT; /* 
GEN7+ */ @@ -142,7 +142,12 @@ struct ilo_render_draw_session { int reduced_prim; bool prim_changed; - bool primitive_restart_changed; + + struct ilo_state_urb_delta urb_delta; + struct ilo_state_vf_delta vf_delta; + struct ilo_state_raster_delta rs_delta; + struct ilo_state_viewport_delta vp_delta; + struct ilo_state_cc_delta cc_delta; /* dynamic states */ bool viewport_changed; @@ -180,6 +185,9 @@ struct ilo_render_launch_grid_session { uint32_t idrt; int idrt_size; + + uint32_t compute_data[6]; + struct ilo_state_compute compute; }; int @@ -381,8 +389,7 @@ ilo_render_pipe_control(struct ilo_render *r, uint32_t dw1) */ static inline void ilo_render_3dprimitive(struct ilo_render *r, - const struct pipe_draw_info *info, - const struct ilo_ib_state *ib) + const struct gen6_3dprimitive_info *info) { ILO_DEV_ASSERT(r->dev, 6, 8); @@ -391,9 +398,9 @@ ilo_render_3dprimitive(struct ilo_render *r, /* 3DPRIMITIVE */ if (ilo_dev_gen(r->dev) >= ILO_GEN(7)) - gen7_3DPRIMITIVE(r->builder, info, ib); + gen7_3DPRIMITIVE(r->builder, info); else - gen6_3DPRIMITIVE(r->builder, info, ib); + gen6_3DPRIMITIVE(r->builder, info); r->state.current_pipe_control_dw1 = 0; assert(!r->state.deferred_pipe_control_dw1); diff --git a/src/gallium/drivers/ilo/ilo_render_gen6.c b/src/gallium/drivers/ilo/ilo_render_gen6.c index 47f711e7956..c1f759f3043 100644 --- a/src/gallium/drivers/ilo/ilo_render_gen6.c +++ b/src/gallium/drivers/ilo/ilo_render_gen6.c @@ -29,11 +29,11 @@ #include "core/ilo_builder_3d.h" #include "core/ilo_builder_mi.h" #include "core/ilo_builder_render.h" -#include "util/u_dual_blend.h" #include "util/u_prim.h" #include "ilo_blitter.h" #include "ilo_query.h" +#include "ilo_resource.h" #include "ilo_shader.h" #include "ilo_state.h" #include "ilo_render_gen.h" @@ -330,64 +330,19 @@ gen6_draw_common_urb(struct ilo_render *r, const struct ilo_state_vector *vec, struct ilo_render_draw_session *session) { - /* 3DSTATE_URB */ - if (DIRTY(VE) || DIRTY(VS) || DIRTY(GS)) { - const bool gs_active = (vec->gs || (vec->vs && - ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_GEN6_SO))); - int vs_entry_size, gs_entry_size; - int vs_total_size, gs_total_size; - - vs_entry_size = (vec->vs) ? - ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_OUTPUT_COUNT) : 0; + const bool gs_active = (vec->gs || (vec->vs && + ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_GEN6_SO))); - /* - * As indicated by 2e712e41db0c0676e9f30fc73172c0e8de8d84d4, VF and VS - * share VUE handles. The VUE allocation size must be large enough to - * store either VF outputs (number of VERTEX_ELEMENTs) and VS outputs. - * - * I am not sure if the PRM explicitly states that VF and VS share VUE - * handles. But here is a citation that implies so: - * - * From the Sandy Bridge PRM, volume 2 part 1, page 44: - * - * "Once a FF stage that spawn threads has sufficient input to - * initiate a thread, it must guarantee that it is safe to request - * the thread initiation. For all these FF stages, this check is - * based on : - * - * - The availability of output URB entries: - * - VS: As the input URB entries are overwritten with the - * VS-generated output data, output URB availability isn't a - * factor." - */ - if (vs_entry_size < vec->ve->count + vec->ve->prepend_nosrc_cso) - vs_entry_size = vec->ve->count + vec->ve->prepend_nosrc_cso; - - gs_entry_size = (vec->gs) ? - ilo_shader_get_kernel_param(vec->gs, ILO_KERNEL_OUTPUT_COUNT) : - (gs_active) ? 
vs_entry_size : 0; - - /* in bytes */ - vs_entry_size *= sizeof(float) * 4; - gs_entry_size *= sizeof(float) * 4; - vs_total_size = r->dev->urb_size; - - if (gs_active) { - vs_total_size /= 2; - gs_total_size = vs_total_size; - } - else { - gs_total_size = 0; - } - - gen6_3DSTATE_URB(r->builder, vs_total_size, gs_total_size, - vs_entry_size, gs_entry_size); + /* 3DSTATE_URB */ + if (session->urb_delta.dirty & (ILO_STATE_URB_3DSTATE_URB_VS | + ILO_STATE_URB_3DSTATE_URB_GS)) { + gen6_3DSTATE_URB(r->builder, &vec->urb); if (r->state.gs.active && !gs_active) gen6_wa_post_3dstate_urb_no_gs(r); - - r->state.gs.active = gs_active; } + + r->state.gs.active = gs_active; } static void @@ -459,33 +414,30 @@ gen6_draw_vf(struct ilo_render *r, { if (ilo_dev_gen(r->dev) >= ILO_GEN(7.5)) { /* 3DSTATE_INDEX_BUFFER */ - if (DIRTY(IB) || r->batch_bo_changed) { - gen6_3DSTATE_INDEX_BUFFER(r->builder, - &vec->ib, false); - } + if ((session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_INDEX_BUFFER) || + DIRTY(IB) || r->batch_bo_changed) + gen6_3DSTATE_INDEX_BUFFER(r->builder, &vec->ve->vf, &vec->ib.ib); /* 3DSTATE_VF */ - if (session->primitive_restart_changed) { - gen75_3DSTATE_VF(r->builder, vec->draw->primitive_restart, - vec->draw->restart_index); - } - } - else { + if (session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VF) + gen75_3DSTATE_VF(r->builder, &vec->ve->vf); + } else { /* 3DSTATE_INDEX_BUFFER */ - if (DIRTY(IB) || session->primitive_restart_changed || - r->batch_bo_changed) { - gen6_3DSTATE_INDEX_BUFFER(r->builder, - &vec->ib, vec->draw->primitive_restart); - } + if ((session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_INDEX_BUFFER) || + DIRTY(IB) || r->batch_bo_changed) + gen6_3DSTATE_INDEX_BUFFER(r->builder, &vec->ve->vf, &vec->ib.ib); } /* 3DSTATE_VERTEX_BUFFERS */ - if (DIRTY(VB) || DIRTY(VE) || r->batch_bo_changed) - gen6_3DSTATE_VERTEX_BUFFERS(r->builder, vec->ve, &vec->vb); + if ((session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VERTEX_BUFFERS) || + DIRTY(VB) || DIRTY(VE) || r->batch_bo_changed) { + gen6_3DSTATE_VERTEX_BUFFERS(r->builder, &vec->ve->vf, + vec->vb.vb, vec->ve->vb_count); + } /* 3DSTATE_VERTEX_ELEMENTS */ - if (DIRTY(VE)) - gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, vec->ve); + if (session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VERTEX_ELEMENTS) + gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, &vec->ve->vf); } void @@ -516,10 +468,17 @@ gen6_draw_vs(struct ilo_render *r, /* 3DSTATE_VS */ if (DIRTY(VS) || r->instruction_bo_changed) { + const union ilo_shader_cso *cso = ilo_shader_get_kernel_cso(vec->vs); + const uint32_t kernel_offset = ilo_shader_get_kernel_offset(vec->vs); + if (ilo_dev_gen(r->dev) == ILO_GEN(6)) gen6_wa_pre_3dstate_vs_toggle(r); - gen6_3DSTATE_VS(r->builder, vec->vs); + if (ilo_dev_gen(r->dev) == ILO_GEN(6) && + ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_GEN6_SO)) + gen6_3DSTATE_VS(r->builder, &cso->vs_sol.vs, kernel_offset); + else + gen6_3DSTATE_VS(r->builder, &cso->vs, kernel_offset); } } @@ -535,14 +494,39 @@ gen6_draw_gs(struct ilo_render *r, /* 3DSTATE_GS */ if (DIRTY(GS) || DIRTY(VS) || session->prim_changed || r->instruction_bo_changed) { + const union ilo_shader_cso *cso; + uint32_t kernel_offset; + if (vec->gs) { - gen6_3DSTATE_GS(r->builder, vec->gs); - } else if (vec->vs && + cso = ilo_shader_get_kernel_cso(vec->gs); + kernel_offset = ilo_shader_get_kernel_offset(vec->gs); + + gen6_3DSTATE_GS(r->builder, &cso->gs, kernel_offset); + } else if (ilo_dev_gen(r->dev) == ILO_GEN(6) && ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_GEN6_SO)) { - const int 
verts_per_prim = u_vertices_per_prim(session->reduced_prim); - gen6_so_3DSTATE_GS(r->builder, vec->vs, verts_per_prim); + const int verts_per_prim = + u_vertices_per_prim(session->reduced_prim); + enum ilo_kernel_param param; + + switch (verts_per_prim) { + case 1: + param = ILO_KERNEL_VS_GEN6_SO_POINT_OFFSET; + break; + case 2: + param = ILO_KERNEL_VS_GEN6_SO_LINE_OFFSET; + break; + default: + param = ILO_KERNEL_VS_GEN6_SO_TRI_OFFSET; + break; + } + + cso = ilo_shader_get_kernel_cso(vec->vs); + kernel_offset = ilo_shader_get_kernel_offset(vec->vs) + + ilo_shader_get_kernel_param(vec->vs, param); + + gen6_3DSTATE_GS(r->builder, &cso->vs_sol.sol, kernel_offset); } else { - gen6_disable_3DSTATE_GS(r->builder); + gen6_3DSTATE_GS(r->builder, &vec->disabled_gs, 0); } } } @@ -633,30 +617,8 @@ gen6_draw_clip(struct ilo_render *r, struct ilo_render_draw_session *session) { /* 3DSTATE_CLIP */ - if (DIRTY(RASTERIZER) || DIRTY(FS) || DIRTY(VIEWPORT) || DIRTY(FB)) { - bool enable_guardband = true; - unsigned i; - - /* - * Gen8+ has viewport extent test. Guard band test can be enabled on - * prior Gens only when the viewport is larger than the framebuffer, - * unless we emulate viewport extent test on them. - */ - if (ilo_dev_gen(r->dev) < ILO_GEN(8)) { - for (i = 0; i < vec->viewport.count; i++) { - const struct ilo_viewport_cso *vp = &vec->viewport.cso[i]; - - if (vp->min_x > 0.0f || vp->max_x < vec->fb.state.width || - vp->min_y > 0.0f || vp->max_y < vec->fb.state.height) { - enable_guardband = false; - break; - } - } - } - - gen6_3DSTATE_CLIP(r->builder, vec->rasterizer, - vec->fs, enable_guardband, 1); - } + if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_CLIP) + gen6_3DSTATE_CLIP(r->builder, &vec->rasterizer->rs); } static void @@ -665,9 +627,9 @@ gen6_draw_sf(struct ilo_render *r, struct ilo_render_draw_session *session) { /* 3DSTATE_SF */ - if (DIRTY(RASTERIZER) || DIRTY(FS) || DIRTY(FB)) { - gen6_3DSTATE_SF(r->builder, vec->rasterizer, vec->fs, - vec->fb.num_samples); + if ((session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_SF) || DIRTY(FS)) { + const struct ilo_state_sbe *sbe = ilo_shader_get_kernel_sbe(vec->fs); + gen6_3DSTATE_SF(r->builder, &vec->rasterizer->rs, sbe); } } @@ -700,17 +662,17 @@ gen6_draw_wm(struct ilo_render *r, } /* 3DSTATE_WM */ - if (DIRTY(FS) || DIRTY(BLEND) || DIRTY(DSA) || - DIRTY(RASTERIZER) || r->instruction_bo_changed) { - const bool dual_blend = vec->blend->dual_blend; - const bool cc_may_kill = (vec->dsa->dw_blend_alpha || - vec->blend->alpha_to_coverage); + if (DIRTY(FS) || + (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_WM) || + r->instruction_bo_changed) { + const union ilo_shader_cso *cso = ilo_shader_get_kernel_cso(vec->fs); + const uint32_t kernel_offset = ilo_shader_get_kernel_offset(vec->fs); if (ilo_dev_gen(r->dev) == ILO_GEN(6) && r->hw_ctx_changed) gen6_wa_pre_3dstate_wm_max_threads(r); - gen6_3DSTATE_WM(r->builder, vec->fs, - vec->rasterizer, dual_blend, cc_may_kill); + gen6_3DSTATE_WM(r->builder, &vec->rasterizer->rs, + &cso->ps, kernel_offset); } } @@ -719,25 +681,23 @@ gen6_draw_wm_multisample(struct ilo_render *r, const struct ilo_state_vector *vec, struct ilo_render_draw_session *session) { - /* 3DSTATE_MULTISAMPLE and 3DSTATE_SAMPLE_MASK */ - if (DIRTY(SAMPLE_MASK) || DIRTY(FB)) { - const uint32_t *pattern; - - pattern = (vec->fb.num_samples > 1) ? 
- &r->sample_pattern_4x : &r->sample_pattern_1x; + /* 3DSTATE_MULTISAMPLE */ + if (DIRTY(FB) || (session->rs_delta.dirty & + ILO_STATE_RASTER_3DSTATE_MULTISAMPLE)) { + const uint8_t sample_count = (vec->fb.num_samples > 1) ? 4 : 1; if (ilo_dev_gen(r->dev) == ILO_GEN(6)) { gen6_wa_pre_non_pipelined(r); gen6_wa_pre_3dstate_multisample(r); } - gen6_3DSTATE_MULTISAMPLE(r->builder, - vec->fb.num_samples, pattern, - vec->rasterizer->state.half_pixel_center); - - gen6_3DSTATE_SAMPLE_MASK(r->builder, - (vec->fb.num_samples > 1) ? vec->sample_mask : 0x1); + gen6_3DSTATE_MULTISAMPLE(r->builder, &vec->rasterizer->rs, + &r->sample_pattern, sample_count); } + + /* 3DSTATE_SAMPLE_MASK */ + if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_SAMPLE_MASK) + gen6_3DSTATE_SAMPLE_MASK(r->builder, &vec->rasterizer->rs); } static void @@ -747,7 +707,7 @@ gen6_draw_wm_depth(struct ilo_render *r, { /* 3DSTATE_DEPTH_BUFFER and 3DSTATE_CLEAR_PARAMS */ if (DIRTY(FB) || r->batch_bo_changed) { - const struct ilo_zs_surface *zs; + const struct ilo_state_zs *zs; uint32_t clear_params; if (vec->fb.state.zsbuf) { @@ -772,7 +732,7 @@ gen6_draw_wm_depth(struct ilo_render *r, gen6_wa_pre_depth(r); } - gen6_3DSTATE_DEPTH_BUFFER(r->builder, zs, false); + gen6_3DSTATE_DEPTH_BUFFER(r->builder, zs); gen6_3DSTATE_HIER_DEPTH_BUFFER(r->builder, zs); gen6_3DSTATE_STENCIL_BUFFER(r->builder, zs); gen6_3DSTATE_CLEAR_PARAMS(r->builder, clear_params); @@ -790,10 +750,8 @@ gen6_draw_wm_raster(struct ilo_render *r, if (ilo_dev_gen(r->dev) == ILO_GEN(6)) gen6_wa_pre_non_pipelined(r); - gen6_3DSTATE_POLY_STIPPLE_PATTERN(r->builder, - &vec->poly_stipple); - - gen6_3DSTATE_POLY_STIPPLE_OFFSET(r->builder, 0, 0); + gen6_3DSTATE_POLY_STIPPLE_PATTERN(r->builder, &vec->poly_stipple); + gen6_3DSTATE_POLY_STIPPLE_OFFSET(r->builder, &vec->poly_stipple); } /* 3DSTATE_LINE_STIPPLE */ @@ -801,17 +759,16 @@ gen6_draw_wm_raster(struct ilo_render *r, if (ilo_dev_gen(r->dev) == ILO_GEN(6)) gen6_wa_pre_non_pipelined(r); - gen6_3DSTATE_LINE_STIPPLE(r->builder, - vec->rasterizer->state.line_stipple_pattern, - vec->rasterizer->state.line_stipple_factor + 1); + gen6_3DSTATE_LINE_STIPPLE(r->builder, &vec->line_stipple); } /* 3DSTATE_AA_LINE_PARAMETERS */ - if (DIRTY(RASTERIZER) && vec->rasterizer->state.line_smooth) { + if (session->rs_delta.dirty & + ILO_STATE_RASTER_3DSTATE_AA_LINE_PARAMETERS) { if (ilo_dev_gen(r->dev) == ILO_GEN(6)) gen6_wa_pre_non_pipelined(r); - gen6_3DSTATE_AA_LINE_PARAMETERS(r->builder); + gen6_3DSTATE_AA_LINE_PARAMETERS(r->builder, &vec->rasterizer->rs); } } @@ -849,7 +806,7 @@ ilo_render_emit_draw_commands_gen6(struct ilo_render *render, gen6_draw_sf_rect(render, vec, session); gen6_draw_vf(render, vec, session); - ilo_render_3dprimitive(render, vec->draw, &vec->ib); + ilo_render_3dprimitive(render, &vec->draw_info); } static void @@ -860,40 +817,23 @@ gen6_rectlist_vs_to_sf(struct ilo_render *r, gen6_wa_post_3dstate_constant_vs(r); gen6_wa_pre_3dstate_vs_toggle(r); - gen6_disable_3DSTATE_VS(r->builder); + gen6_3DSTATE_VS(r->builder, &blitter->vs, 0); gen6_3DSTATE_CONSTANT_GS(r->builder, NULL, NULL, 0); - gen6_disable_3DSTATE_GS(r->builder); + gen6_3DSTATE_GS(r->builder, &blitter->gs, 0); - gen6_disable_3DSTATE_CLIP(r->builder); - gen6_3DSTATE_SF(r->builder, NULL, NULL, blitter->fb.num_samples); + gen6_3DSTATE_CLIP(r->builder, &blitter->fb.rs); + gen6_3DSTATE_SF(r->builder, &blitter->fb.rs, &blitter->sbe); } static void gen6_rectlist_wm(struct ilo_render *r, const struct ilo_blitter *blitter) { - uint32_t hiz_op; - - switch 
(blitter->op) { - case ILO_BLITTER_RECTLIST_CLEAR_ZS: - hiz_op = GEN6_WM_DW4_DEPTH_CLEAR; - break; - case ILO_BLITTER_RECTLIST_RESOLVE_Z: - hiz_op = GEN6_WM_DW4_DEPTH_RESOLVE; - break; - case ILO_BLITTER_RECTLIST_RESOLVE_HIZ: - hiz_op = GEN6_WM_DW4_HIZ_RESOLVE; - break; - default: - hiz_op = 0; - break; - } - gen6_3DSTATE_CONSTANT_PS(r->builder, NULL, NULL, 0); gen6_wa_pre_3dstate_wm_max_threads(r); - gen6_hiz_3DSTATE_WM(r->builder, hiz_op); + gen6_3DSTATE_WM(r->builder, &blitter->fb.rs, &blitter->ps, 0); } static void @@ -903,10 +843,8 @@ gen6_rectlist_wm_depth(struct ilo_render *r, gen6_wa_pre_depth(r); if (blitter->uses & (ILO_BLITTER_USE_FB_DEPTH | - ILO_BLITTER_USE_FB_STENCIL)) { - gen6_3DSTATE_DEPTH_BUFFER(r->builder, - &blitter->fb.dst.u.zs, true); - } + ILO_BLITTER_USE_FB_STENCIL)) + gen6_3DSTATE_DEPTH_BUFFER(r->builder, &blitter->fb.dst.u.zs); if (blitter->uses & ILO_BLITTER_USE_FB_DEPTH) { gen6_3DSTATE_HIER_DEPTH_BUFFER(r->builder, @@ -926,16 +864,12 @@ static void gen6_rectlist_wm_multisample(struct ilo_render *r, const struct ilo_blitter *blitter) { - const uint32_t *pattern = (blitter->fb.num_samples > 1) ? - &r->sample_pattern_4x : &r->sample_pattern_1x; + const uint8_t sample_count = (blitter->fb.num_samples > 1) ? 4 : 1; gen6_wa_pre_3dstate_multisample(r); - gen6_3DSTATE_MULTISAMPLE(r->builder, blitter->fb.num_samples, - pattern, true); - - gen6_3DSTATE_SAMPLE_MASK(r->builder, - (1 << blitter->fb.num_samples) - 1); + gen6_3DSTATE_MULTISAMPLE(r->builder, &blitter->fb.rs, &r->sample_pattern, sample_count); + gen6_3DSTATE_SAMPLE_MASK(r->builder, &blitter->fb.rs); } int @@ -964,11 +898,9 @@ ilo_render_emit_rectlist_commands_gen6(struct ilo_render *r, session->vb_start, session->vb_end, sizeof(blitter->vertices[0])); - gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, &blitter->ve); + gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, &blitter->vf); - gen6_3DSTATE_URB(r->builder, r->dev->urb_size, 0, - (blitter->ve.count + blitter->ve.prepend_nosrc_cso) * 4 * sizeof(float), - 0); + gen6_3DSTATE_URB(r->builder, &blitter->urb); if (r->state.gs.active) { gen6_wa_post_3dstate_urb_no_gs(r); @@ -994,7 +926,7 @@ ilo_render_emit_rectlist_commands_gen6(struct ilo_render *r, gen6_3DSTATE_DRAWING_RECTANGLE(r->builder, 0, 0, blitter->fb.width, blitter->fb.height); - ilo_render_3dprimitive(r, &blitter->draw, NULL); + ilo_render_3dprimitive(r, &blitter->draw_info); } int diff --git a/src/gallium/drivers/ilo/ilo_render_gen7.c b/src/gallium/drivers/ilo/ilo_render_gen7.c index 07fe7c83536..6623a8bcb43 100644 --- a/src/gallium/drivers/ilo/ilo_render_gen7.c +++ b/src/gallium/drivers/ilo/ilo_render_gen7.c @@ -28,9 +28,9 @@ #include "genhw/genhw.h" #include "core/ilo_builder_3d.h" #include "core/ilo_builder_render.h" -#include "util/u_dual_blend.h" #include "ilo_blitter.h" +#include "ilo_resource.h" #include "ilo_shader.h" #include "ilo_state.h" #include "ilo_render_gen.h" @@ -201,40 +201,17 @@ gen7_draw_common_urb(struct ilo_render *r, struct ilo_render_draw_session *session) { /* 3DSTATE_URB_{VS,GS,HS,DS} */ - if (DIRTY(VE) || DIRTY(VS)) { - /* the first 16KB are reserved for VS and PS PCBs */ - const int offset = - (ilo_dev_gen(r->dev) >= ILO_GEN(8)) || - (ilo_dev_gen(r->dev) == ILO_GEN(7.5) && r->dev->gt == 3) ? - 32768 : 16384; - int vs_entry_size, vs_total_size; - - vs_entry_size = (vec->vs) ? 
- ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_OUTPUT_COUNT) : 0; - - /* - * From the Ivy Bridge PRM, volume 2 part 1, page 35: - * - * "Programming Restriction: As the VS URB entry serves as both the - * per-vertex input and output of the VS shader, the VS URB - * Allocation Size must be sized to the maximum of the vertex input - * and output structures." - */ - if (vs_entry_size < vec->ve->count + vec->ve->prepend_nosrc_cso) - vs_entry_size = vec->ve->count + vec->ve->prepend_nosrc_cso; - - vs_entry_size *= sizeof(float) * 4; - vs_total_size = r->dev->urb_size - offset; - + if (session->urb_delta.dirty & (ILO_STATE_URB_3DSTATE_URB_VS | + ILO_STATE_URB_3DSTATE_URB_HS | + ILO_STATE_URB_3DSTATE_URB_DS | + ILO_STATE_URB_3DSTATE_URB_GS)) { if (ilo_dev_gen(r->dev) == ILO_GEN(7)) gen7_wa_pre_vs(r); - gen7_3DSTATE_URB_VS(r->builder, - offset, vs_total_size, vs_entry_size); - - gen7_3DSTATE_URB_GS(r->builder, offset, 0, 0); - gen7_3DSTATE_URB_HS(r->builder, offset, 0, 0); - gen7_3DSTATE_URB_DS(r->builder, offset, 0, 0); + gen7_3DSTATE_URB_VS(r->builder, &vec->urb); + gen7_3DSTATE_URB_GS(r->builder, &vec->urb); + gen7_3DSTATE_URB_HS(r->builder, &vec->urb); + gen7_3DSTATE_URB_DS(r->builder, &vec->urb); } } @@ -244,22 +221,15 @@ gen7_draw_common_pcb_alloc(struct ilo_render *r, struct ilo_render_draw_session *session) { /* 3DSTATE_PUSH_CONSTANT_ALLOC_{VS,PS} */ - if (r->hw_ctx_changed) { - /* - * Push constant buffers are only allowed to take up at most the first - * 16KB of the URB. Split the space evenly for VS and FS. - */ - const int max_size = - (ilo_dev_gen(r->dev) >= ILO_GEN(8)) || - (ilo_dev_gen(r->dev) == ILO_GEN(7.5) && r->dev->gt == 3) ? - 32768 : 16384; - const int size = max_size / 2; - int offset = 0; - - gen7_3DSTATE_PUSH_CONSTANT_ALLOC_VS(r->builder, offset, size); - offset += size; - - gen7_3DSTATE_PUSH_CONSTANT_ALLOC_PS(r->builder, offset, size); + if (session->urb_delta.dirty & + (ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_VS | + ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_HS | + ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_DS | + ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_GS | + ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_PS)) { + gen7_3DSTATE_PUSH_CONSTANT_ALLOC_VS(r->builder, &vec->urb); + gen7_3DSTATE_PUSH_CONSTANT_ALLOC_GS(r->builder, &vec->urb); + gen7_3DSTATE_PUSH_CONSTANT_ALLOC_PS(r->builder, &vec->urb); if (ilo_dev_gen(r->dev) == ILO_GEN(7)) gen7_wa_post_3dstate_push_constant_alloc_ps(r); @@ -344,14 +314,14 @@ gen7_draw_vs(struct ilo_render *r, } /* 3DSTATE_VS */ - if (ilo_dev_gen(r->dev) >= ILO_GEN(8)) { - if (emit_3dstate_vs || DIRTY(RASTERIZER)) { - gen8_3DSTATE_VS(r->builder, vec->vs, - vec->rasterizer->state.clip_plane_enable); - } - } else { - if (emit_3dstate_vs) - gen6_3DSTATE_VS(r->builder, vec->vs); + if (emit_3dstate_vs) { + const union ilo_shader_cso *cso = ilo_shader_get_kernel_cso(vec->vs); + const uint32_t kernel_offset = ilo_shader_get_kernel_offset(vec->vs); + + if (ilo_dev_gen(r->dev) >= ILO_GEN(8)) + gen8_3DSTATE_VS(r->builder, &cso->vs, kernel_offset); + else + gen6_3DSTATE_VS(r->builder, &cso->vs, kernel_offset); } } @@ -362,8 +332,15 @@ gen7_draw_hs(struct ilo_render *r, { /* 3DSTATE_CONSTANT_HS and 3DSTATE_HS */ if (r->hw_ctx_changed) { + const struct ilo_state_hs *hs = &vec->disabled_hs; + const uint32_t kernel_offset = 0; + gen7_3DSTATE_CONSTANT_HS(r->builder, 0, 0, 0); - gen7_disable_3DSTATE_HS(r->builder); + + if (ilo_dev_gen(r->dev) >= ILO_GEN(8)) + gen8_3DSTATE_HS(r->builder, hs, kernel_offset); + else + gen7_3DSTATE_HS(r->builder, hs, 
kernel_offset); } /* 3DSTATE_BINDING_TABLE_POINTERS_HS */ @@ -377,8 +354,10 @@ gen7_draw_te(struct ilo_render *r, struct ilo_render_draw_session *session) { /* 3DSTATE_TE */ - if (r->hw_ctx_changed) - gen7_3DSTATE_TE(r->builder); + if (r->hw_ctx_changed) { + const struct ilo_state_ds *ds = &vec->disabled_ds; + gen7_3DSTATE_TE(r->builder, ds); + } } void @@ -388,8 +367,15 @@ gen7_draw_ds(struct ilo_render *r, { /* 3DSTATE_CONSTANT_DS and 3DSTATE_DS */ if (r->hw_ctx_changed) { + const struct ilo_state_ds *ds = &vec->disabled_ds; + const uint32_t kernel_offset = 0; + gen7_3DSTATE_CONSTANT_DS(r->builder, 0, 0, 0); - gen7_disable_3DSTATE_DS(r->builder); + + if (ilo_dev_gen(r->dev) >= ILO_GEN(8)) + gen8_3DSTATE_DS(r->builder, ds, kernel_offset); + else + gen7_3DSTATE_DS(r->builder, ds, kernel_offset); } /* 3DSTATE_BINDING_TABLE_POINTERS_DS */ @@ -405,8 +391,15 @@ gen7_draw_gs(struct ilo_render *r, { /* 3DSTATE_CONSTANT_GS and 3DSTATE_GS */ if (r->hw_ctx_changed) { + const struct ilo_state_gs *gs = &vec->disabled_gs; + const uint32_t kernel_offset = 0; + gen7_3DSTATE_CONSTANT_GS(r->builder, 0, 0, 0); - gen7_disable_3DSTATE_GS(r->builder); + + if (ilo_dev_gen(r->dev) >= ILO_GEN(8)) + gen8_3DSTATE_GS(r->builder, gs, kernel_offset); + else + gen7_3DSTATE_GS(r->builder, gs, kernel_offset); } /* 3DSTATE_BINDING_TABLE_POINTERS_GS */ @@ -421,7 +414,7 @@ gen7_draw_sol(struct ilo_render *r, const struct ilo_state_vector *vec, struct ilo_render_draw_session *session) { - const struct pipe_stream_output_info *so_info; + const struct ilo_state_sol *sol; const struct ilo_shader_state *shader; bool dirty_sh = false; @@ -434,41 +427,54 @@ gen7_draw_sol(struct ilo_render *r, dirty_sh = DIRTY(VS); } - so_info = ilo_shader_get_kernel_so_info(shader); + sol = ilo_shader_get_kernel_sol(shader); /* 3DSTATE_SO_BUFFER */ if ((DIRTY(SO) || dirty_sh || r->batch_bo_changed) && vec->so.enabled) { int i; - for (i = 0; i < vec->so.count; i++) { - const int stride = so_info->stride[i] * 4; /* in bytes */ - - gen7_3DSTATE_SO_BUFFER(r->builder, i, stride, vec->so.states[i]); + for (i = 0; i < ILO_STATE_SOL_MAX_BUFFER_COUNT; i++) { + const struct pipe_stream_output_target *target = + (i < vec->so.count && vec->so.states[i]) ? + vec->so.states[i] : NULL; + const struct ilo_state_sol_buffer *sb = (target) ? + &((const struct ilo_stream_output_target *) target)->sb : + &vec->so.dummy_sb; + + if (ilo_dev_gen(r->dev) >= ILO_GEN(8)) + gen8_3DSTATE_SO_BUFFER(r->builder, sol, sb, i); + else + gen7_3DSTATE_SO_BUFFER(r->builder, sol, sb, i); } - - for (; i < 4; i++) - gen7_disable_3DSTATE_SO_BUFFER(r->builder, i); } /* 3DSTATE_SO_DECL_LIST */ if (dirty_sh && vec->so.enabled) - gen7_3DSTATE_SO_DECL_LIST(r->builder, so_info); - - /* 3DSTATE_STREAMOUT */ - if (DIRTY(SO) || DIRTY(RASTERIZER) || dirty_sh) { - const int output_count = ilo_shader_get_kernel_param(shader, - ILO_KERNEL_OUTPUT_COUNT); - int buf_strides[4] = { 0, 0, 0, 0 }; - int i; + gen7_3DSTATE_SO_DECL_LIST(r->builder, sol); - for (i = 0; i < vec->so.count; i++) - buf_strides[i] = so_info->stride[i] * 4; + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 196-197: + * + * "Anytime the SOL unit MMIO registers or non-pipeline state are + * written, the SOL unit needs to receive a pipeline state update with + * SOL unit dirty state for information programmed in MMIO/NP to get + * loaded into the SOL unit. 
+ * + * The SOL unit incorrectly double buffers MMIO/NP registers and only + * moves them into the design for usage when control topology is + * received with the SOL unit dirty state. + * + * If the state does not change, need to resend the same state. + * + * Because of corruption, software must flush the whole fixed function + * pipeline when 3DSTATE_STREAMOUT changes state." + * + * The first and fourth paragraphs are gone on Gen7.5+. + */ - gen7_3DSTATE_STREAMOUT(r->builder, 0, - vec->rasterizer->state.rasterizer_discard, - output_count, buf_strides); - } + /* 3DSTATE_STREAMOUT */ + gen7_3DSTATE_STREAMOUT(r->builder, sol); } static void @@ -477,22 +483,17 @@ gen7_draw_sf(struct ilo_render *r, struct ilo_render_draw_session *session) { /* 3DSTATE_SBE */ - if (DIRTY(RASTERIZER) || DIRTY(FS)) { - gen7_3DSTATE_SBE(r->builder, vec->fs, (vec->rasterizer) ? - vec->rasterizer->state.sprite_coord_mode : 0); + if (DIRTY(FS)) { + const struct ilo_state_sbe *sbe = ilo_shader_get_kernel_sbe(vec->fs); + gen7_3DSTATE_SBE(r->builder, sbe); } /* 3DSTATE_SF */ - if (DIRTY(RASTERIZER) || DIRTY(FB)) { - struct pipe_surface *zs = vec->fb.state.zsbuf; - + if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_SF) { if (ilo_dev_gen(r->dev) == ILO_GEN(7)) gen7_wa_pre_3dstate_sf_depth_bias(r); - gen7_3DSTATE_SF(r->builder, - (vec->rasterizer) ? &vec->rasterizer->sf : NULL, - (zs) ? zs->format : PIPE_FORMAT_NONE, - vec->fb.num_samples); + gen7_3DSTATE_SF(r->builder, &vec->rasterizer->rs); } } @@ -501,13 +502,12 @@ gen7_draw_wm(struct ilo_render *r, const struct ilo_state_vector *vec, struct ilo_render_draw_session *session) { - /* 3DSTATE_WM */ - if (DIRTY(FS) || DIRTY(BLEND) || DIRTY(DSA) || DIRTY(RASTERIZER)) { - const bool cc_may_kill = (vec->dsa->dw_blend_alpha || - vec->blend->alpha_to_coverage); + const union ilo_shader_cso *cso = ilo_shader_get_kernel_cso(vec->fs); + const uint32_t kernel_offset = ilo_shader_get_kernel_offset(vec->fs); - gen7_3DSTATE_WM(r->builder, vec->fs, vec->rasterizer, cc_may_kill); - } + /* 3DSTATE_WM */ + if (DIRTY(FS) || (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_WM)) + gen7_3DSTATE_WM(r->builder, &vec->rasterizer->rs, &cso->ps); /* 3DSTATE_BINDING_TABLE_POINTERS_PS */ if (session->binding_table_fs_changed) { @@ -530,13 +530,11 @@ gen7_draw_wm(struct ilo_render *r, } /* 3DSTATE_PS */ - if (DIRTY(FS) || DIRTY(BLEND) || r->instruction_bo_changed) { - const bool dual_blend = vec->blend->dual_blend; - + if (DIRTY(FS) || r->instruction_bo_changed) { if (r->hw_ctx_changed) gen7_wa_pre_3dstate_ps_max_threads(r); - gen7_3DSTATE_PS(r->builder, vec->fs, dual_blend); + gen7_3DSTATE_PS(r->builder, &cso->ps, kernel_offset); } /* 3DSTATE_SCISSOR_STATE_POINTERS */ @@ -569,7 +567,7 @@ gen7_draw_wm(struct ilo_render *r, /* 3DSTATE_DEPTH_BUFFER and 3DSTATE_CLEAR_PARAMS */ if (DIRTY(FB) || r->batch_bo_changed) { - const struct ilo_zs_surface *zs; + const struct ilo_state_zs *zs; uint32_t clear_params; if (vec->fb.state.zsbuf) { @@ -588,7 +586,7 @@ gen7_draw_wm(struct ilo_render *r, clear_params = 0; } - gen6_3DSTATE_DEPTH_BUFFER(r->builder, zs, false); + gen6_3DSTATE_DEPTH_BUFFER(r->builder, zs); gen6_3DSTATE_HIER_DEPTH_BUFFER(r->builder, zs); gen6_3DSTATE_STENCIL_BUFFER(r->builder, zs); gen7_3DSTATE_CLEAR_PARAMS(r->builder, clear_params); @@ -600,24 +598,21 @@ gen7_draw_wm_multisample(struct ilo_render *r, const struct ilo_state_vector *vec, struct ilo_render_draw_session *session) { - /* 3DSTATE_MULTISAMPLE and 3DSTATE_SAMPLE_MASK */ - if (DIRTY(SAMPLE_MASK) || DIRTY(FB)) { - const 
uint32_t *pattern; + /* 3DSTATE_MULTISAMPLE */ + if (DIRTY(FB) || (session->rs_delta.dirty & + ILO_STATE_RASTER_3DSTATE_MULTISAMPLE)) { + const uint8_t sample_count = (vec->fb.num_samples > 4) ? 8 : + (vec->fb.num_samples > 1) ? 4 : 1; gen7_wa_pre_3dstate_multisample(r); - pattern = (vec->fb.num_samples > 4) ? r->sample_pattern_8x : - (vec->fb.num_samples > 1) ? &r->sample_pattern_4x : - &r->sample_pattern_1x; - - gen6_3DSTATE_MULTISAMPLE(r->builder, - vec->fb.num_samples, pattern, - vec->rasterizer->state.half_pixel_center); - - gen7_3DSTATE_SAMPLE_MASK(r->builder, - (vec->fb.num_samples > 1) ? vec->sample_mask : 0x1, - vec->fb.num_samples); + gen6_3DSTATE_MULTISAMPLE(r->builder, &vec->rasterizer->rs, + &r->sample_pattern, sample_count); } + + /* 3DSTATE_SAMPLE_MASK */ + if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_SAMPLE_MASK) + gen6_3DSTATE_SAMPLE_MASK(r->builder, &vec->rasterizer->rs); } void @@ -654,28 +649,15 @@ ilo_render_emit_draw_commands_gen7(struct ilo_render *render, gen6_draw_sf_rect(render, vec, session); gen6_draw_vf(render, vec, session); - ilo_render_3dprimitive(render, vec->draw, &vec->ib); + ilo_render_3dprimitive(render, &vec->draw_info); } static void gen7_rectlist_pcb_alloc(struct ilo_render *r, const struct ilo_blitter *blitter) { - /* - * Push constant buffers are only allowed to take up at most the first - * 16KB of the URB. Split the space evenly for VS and FS. - */ - const int max_size = - (ilo_dev_gen(r->dev) >= ILO_GEN(8)) || - (ilo_dev_gen(r->dev) == ILO_GEN(7.5) && r->dev->gt == 3) ? - 32768 : 16384; - const int size = max_size / 2; - int offset = 0; - - gen7_3DSTATE_PUSH_CONSTANT_ALLOC_VS(r->builder, offset, size); - offset += size; - - gen7_3DSTATE_PUSH_CONSTANT_ALLOC_PS(r->builder, offset, size); + gen7_3DSTATE_PUSH_CONSTANT_ALLOC_VS(r->builder, &blitter->urb); + gen7_3DSTATE_PUSH_CONSTANT_ALLOC_PS(r->builder, &blitter->urb); if (ilo_dev_gen(r->dev) == ILO_GEN(7)) gen7_wa_post_3dstate_push_constant_alloc_ps(r); @@ -685,19 +667,10 @@ static void gen7_rectlist_urb(struct ilo_render *r, const struct ilo_blitter *blitter) { - /* the first 16KB are reserved for VS and PS PCBs */ - const int offset = - (ilo_dev_gen(r->dev) >= ILO_GEN(8)) || - (ilo_dev_gen(r->dev) == ILO_GEN(7.5) && r->dev->gt == 3) ? 
- 32768 : 16384; - - gen7_3DSTATE_URB_VS(r->builder, offset, r->dev->urb_size - offset, - (blitter->ve.count + blitter->ve.prepend_nosrc_cso) * - 4 * sizeof(float)); - - gen7_3DSTATE_URB_GS(r->builder, offset, 0, 0); - gen7_3DSTATE_URB_HS(r->builder, offset, 0, 0); - gen7_3DSTATE_URB_DS(r->builder, offset, 0, 0); + gen7_3DSTATE_URB_VS(r->builder, &blitter->urb); + gen7_3DSTATE_URB_GS(r->builder, &blitter->urb); + gen7_3DSTATE_URB_HS(r->builder, &blitter->urb); + gen7_3DSTATE_URB_DS(r->builder, &blitter->urb); } static void @@ -705,58 +678,40 @@ gen7_rectlist_vs_to_sf(struct ilo_render *r, const struct ilo_blitter *blitter) { gen7_3DSTATE_CONSTANT_VS(r->builder, NULL, NULL, 0); - gen6_disable_3DSTATE_VS(r->builder); + gen6_3DSTATE_VS(r->builder, &blitter->vs, 0); gen7_3DSTATE_CONSTANT_HS(r->builder, NULL, NULL, 0); - gen7_disable_3DSTATE_HS(r->builder); + gen7_3DSTATE_HS(r->builder, &blitter->hs, 0); - gen7_3DSTATE_TE(r->builder); + gen7_3DSTATE_TE(r->builder, &blitter->ds); gen7_3DSTATE_CONSTANT_DS(r->builder, NULL, NULL, 0); - gen7_disable_3DSTATE_DS(r->builder); + gen7_3DSTATE_DS(r->builder, &blitter->ds, 0); gen7_3DSTATE_CONSTANT_GS(r->builder, NULL, NULL, 0); - gen7_disable_3DSTATE_GS(r->builder); + gen7_3DSTATE_GS(r->builder, &blitter->gs, 0); - gen7_3DSTATE_STREAMOUT(r->builder, 0, false, 0x0, 0); + gen7_3DSTATE_STREAMOUT(r->builder, &blitter->sol); - gen6_disable_3DSTATE_CLIP(r->builder); + gen6_3DSTATE_CLIP(r->builder, &blitter->fb.rs); if (ilo_dev_gen(r->dev) == ILO_GEN(7)) gen7_wa_pre_3dstate_sf_depth_bias(r); - gen7_3DSTATE_SF(r->builder, NULL, blitter->fb.dst.base.format, - blitter->fb.num_samples); - gen7_3DSTATE_SBE(r->builder, NULL, 0); + gen7_3DSTATE_SF(r->builder, &blitter->fb.rs); + gen7_3DSTATE_SBE(r->builder, &blitter->sbe); } static void gen7_rectlist_wm(struct ilo_render *r, const struct ilo_blitter *blitter) { - uint32_t hiz_op; - - switch (blitter->op) { - case ILO_BLITTER_RECTLIST_CLEAR_ZS: - hiz_op = GEN7_WM_DW1_DEPTH_CLEAR; - break; - case ILO_BLITTER_RECTLIST_RESOLVE_Z: - hiz_op = GEN7_WM_DW1_DEPTH_RESOLVE; - break; - case ILO_BLITTER_RECTLIST_RESOLVE_HIZ: - hiz_op = GEN7_WM_DW1_HIZ_RESOLVE; - break; - default: - hiz_op = 0; - break; - } - - gen7_hiz_3DSTATE_WM(r->builder, hiz_op); + gen7_3DSTATE_WM(r->builder, &blitter->fb.rs, &blitter->ps); gen7_3DSTATE_CONSTANT_PS(r->builder, NULL, NULL, 0); gen7_wa_pre_3dstate_ps_max_threads(r); - gen7_disable_3DSTATE_PS(r->builder); + gen7_3DSTATE_PS(r->builder, &blitter->ps, 0); } static void @@ -766,10 +721,8 @@ gen7_rectlist_wm_depth(struct ilo_render *r, gen7_wa_pre_depth(r); if (blitter->uses & (ILO_BLITTER_USE_FB_DEPTH | - ILO_BLITTER_USE_FB_STENCIL)) { - gen6_3DSTATE_DEPTH_BUFFER(r->builder, - &blitter->fb.dst.u.zs, true); - } + ILO_BLITTER_USE_FB_STENCIL)) + gen6_3DSTATE_DEPTH_BUFFER(r->builder, &blitter->fb.dst.u.zs); if (blitter->uses & ILO_BLITTER_USE_FB_DEPTH) { gen6_3DSTATE_HIER_DEPTH_BUFFER(r->builder, @@ -789,18 +742,15 @@ static void gen7_rectlist_wm_multisample(struct ilo_render *r, const struct ilo_blitter *blitter) { - const uint32_t *pattern = - (blitter->fb.num_samples > 4) ? r->sample_pattern_8x : - (blitter->fb.num_samples > 1) ? &r->sample_pattern_4x : - &r->sample_pattern_1x; + const uint8_t sample_count = (blitter->fb.num_samples > 4) ? 8 : + (blitter->fb.num_samples > 1) ? 
4 : 1; gen7_wa_pre_3dstate_multisample(r); - gen6_3DSTATE_MULTISAMPLE(r->builder, blitter->fb.num_samples, - pattern, true); + gen6_3DSTATE_MULTISAMPLE(r->builder, &blitter->fb.rs, + &r->sample_pattern, sample_count); - gen7_3DSTATE_SAMPLE_MASK(r->builder, - (1 << blitter->fb.num_samples) - 1, blitter->fb.num_samples); + gen6_3DSTATE_SAMPLE_MASK(r->builder, &blitter->fb.rs); } void @@ -818,7 +768,7 @@ ilo_render_emit_rectlist_commands_gen7(struct ilo_render *r, session->vb_start, session->vb_end, sizeof(blitter->vertices[0])); - gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, &blitter->ve); + gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, &blitter->vf); gen7_rectlist_pcb_alloc(r, blitter); @@ -854,7 +804,7 @@ ilo_render_emit_rectlist_commands_gen7(struct ilo_render *r, if (ilo_dev_gen(r->dev) == ILO_GEN(7)) gen7_wa_post_ps_and_later(r); - ilo_render_3dprimitive(r, &blitter->draw, NULL); + ilo_render_3dprimitive(r, &blitter->draw_info); } int diff --git a/src/gallium/drivers/ilo/ilo_render_gen8.c b/src/gallium/drivers/ilo/ilo_render_gen8.c index 715b93611f1..65494b4058a 100644 --- a/src/gallium/drivers/ilo/ilo_render_gen8.c +++ b/src/gallium/drivers/ilo/ilo_render_gen8.c @@ -28,9 +28,9 @@ #include "genhw/genhw.h" #include "core/ilo_builder_3d.h" #include "core/ilo_builder_render.h" -#include "util/u_dual_blend.h" #include "ilo_blitter.h" +#include "ilo_resource.h" #include "ilo_shader.h" #include "ilo_state.h" #include "ilo_render_gen.h" @@ -66,26 +66,20 @@ gen8_draw_sf(struct ilo_render *r, struct ilo_render_draw_session *session) { /* 3DSTATE_RASTER */ - if (DIRTY(RASTERIZER)) { - gen8_3DSTATE_RASTER(r->builder, (vec->rasterizer) ? - &vec->rasterizer->sf : NULL); - } + if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_RASTER) + gen8_3DSTATE_RASTER(r->builder, &vec->rasterizer->rs); - /* 3DSTATE_SBE */ - if (DIRTY(RASTERIZER) || DIRTY(FS)) { - gen8_3DSTATE_SBE(r->builder, vec->fs, (vec->rasterizer) ? - vec->rasterizer->state.sprite_coord_mode : 0); - } + /* 3DSTATE_SBE and 3DSTATE_SBE_SWIZ */ + if (DIRTY(FS)) { + const struct ilo_state_sbe *sbe = ilo_shader_get_kernel_sbe(vec->fs); - /* 3DSTATE_SBE_SWIZ */ - if (DIRTY(FS)) - gen8_3DSTATE_SBE_SWIZ(r->builder, vec->fs); + gen8_3DSTATE_SBE(r->builder, sbe); + gen8_3DSTATE_SBE_SWIZ(r->builder, sbe); + } /* 3DSTATE_SF */ - if (DIRTY(RASTERIZER)) { - gen8_3DSTATE_SF(r->builder, (vec->rasterizer) ? 
- &vec->rasterizer->sf : NULL); - } + if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_SF) + gen7_3DSTATE_SF(r->builder, &vec->rasterizer->rs); } static void @@ -93,12 +87,15 @@ gen8_draw_wm(struct ilo_render *r, const struct ilo_state_vector *vec, struct ilo_render_draw_session *session) { + const union ilo_shader_cso *cso = ilo_shader_get_kernel_cso(vec->fs); + const uint32_t kernel_offset = ilo_shader_get_kernel_offset(vec->fs); + /* 3DSTATE_WM */ - if (DIRTY(FS) || DIRTY(RASTERIZER)) - gen8_3DSTATE_WM(r->builder, vec->fs, vec->rasterizer); + if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_WM) + gen8_3DSTATE_WM(r->builder, &vec->rasterizer->rs); - if (DIRTY(DSA)) - gen8_3DSTATE_WM_DEPTH_STENCIL(r->builder, vec->dsa); + if (session->cc_delta.dirty & ILO_STATE_CC_3DSTATE_WM_DEPTH_STENCIL) + gen8_3DSTATE_WM_DEPTH_STENCIL(r->builder, &vec->blend->cc); /* 3DSTATE_WM_HZ_OP and 3DSTATE_WM_CHROMAKEY */ if (r->hw_ctx_changed) { @@ -128,18 +125,15 @@ gen8_draw_wm(struct ilo_render *r, /* 3DSTATE_PS */ if (DIRTY(FS) || r->instruction_bo_changed) - gen8_3DSTATE_PS(r->builder, vec->fs); + gen8_3DSTATE_PS(r->builder, &cso->ps, kernel_offset); /* 3DSTATE_PS_EXTRA */ - if (DIRTY(FS) || DIRTY(DSA) || DIRTY(BLEND)) { - const bool cc_may_kill = (vec->dsa->dw_blend_alpha || - vec->blend->alpha_to_coverage); - gen8_3DSTATE_PS_EXTRA(r->builder, vec->fs, cc_may_kill, false); - } + if (DIRTY(FS)) + gen8_3DSTATE_PS_EXTRA(r->builder, &cso->ps); /* 3DSTATE_PS_BLEND */ - if (DIRTY(BLEND) || DIRTY(FB) || DIRTY(DSA)) - gen8_3DSTATE_PS_BLEND(r->builder, vec->blend, &vec->fb, vec->dsa); + if (session->cc_delta.dirty & ILO_STATE_CC_3DSTATE_PS_BLEND) + gen8_3DSTATE_PS_BLEND(r->builder, &vec->blend->cc); /* 3DSTATE_SCISSOR_STATE_POINTERS */ if (session->scissor_changed) { @@ -149,7 +143,7 @@ gen8_draw_wm(struct ilo_render *r, /* 3DSTATE_DEPTH_BUFFER and 3DSTATE_CLEAR_PARAMS */ if (DIRTY(FB) || r->batch_bo_changed) { - const struct ilo_zs_surface *zs; + const struct ilo_state_zs *zs; uint32_t clear_params; if (vec->fb.state.zsbuf) { @@ -170,7 +164,7 @@ gen8_draw_wm(struct ilo_render *r, gen8_wa_pre_depth(r); - gen6_3DSTATE_DEPTH_BUFFER(r->builder, zs, false); + gen6_3DSTATE_DEPTH_BUFFER(r->builder, zs); gen6_3DSTATE_HIER_DEPTH_BUFFER(r->builder, zs); gen6_3DSTATE_STENCIL_BUFFER(r->builder, zs); gen7_3DSTATE_CLEAR_PARAMS(r->builder, clear_params); @@ -183,14 +177,8 @@ gen8_draw_wm_sample_pattern(struct ilo_render *r, struct ilo_render_draw_session *session) { /* 3DSTATE_SAMPLE_PATTERN */ - if (r->hw_ctx_changed) { - gen8_3DSTATE_SAMPLE_PATTERN(r->builder, - &r->sample_pattern_1x, - &r->sample_pattern_2x, - &r->sample_pattern_4x, - r->sample_pattern_8x, - r->sample_pattern_16x); - } + if (r->hw_ctx_changed) + gen8_3DSTATE_SAMPLE_PATTERN(r->builder, &r->sample_pattern); } static void @@ -198,15 +186,13 @@ gen8_draw_wm_multisample(struct ilo_render *r, const struct ilo_state_vector *vec, struct ilo_render_draw_session *session) { - /* 3DSTATE_MULTISAMPLE and 3DSTATE_SAMPLE_MASK */ - if (DIRTY(SAMPLE_MASK) || DIRTY(FB) || DIRTY(RASTERIZER)) { - gen8_3DSTATE_MULTISAMPLE(r->builder, vec->fb.num_samples, - vec->rasterizer->state.half_pixel_center); - - gen7_3DSTATE_SAMPLE_MASK(r->builder, - (vec->fb.num_samples > 1) ? 
vec->sample_mask : 0x1, - vec->fb.num_samples); - } + /* 3DSTATE_MULTISAMPLE */ + if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_MULTISAMPLE) + gen8_3DSTATE_MULTISAMPLE(r->builder, &vec->rasterizer->rs); + + /* 3DSTATE_SAMPLE_MASK */ + if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_SAMPLE_MASK) + gen6_3DSTATE_SAMPLE_MASK(r->builder, &vec->rasterizer->rs); } static void @@ -214,36 +200,38 @@ gen8_draw_vf(struct ilo_render *r, const struct ilo_state_vector *vec, struct ilo_render_draw_session *session) { - int i; - /* 3DSTATE_INDEX_BUFFER */ - if (DIRTY(IB) || r->batch_bo_changed) - gen8_3DSTATE_INDEX_BUFFER(r->builder, &vec->ib); + if ((session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_INDEX_BUFFER) || + DIRTY(IB) || r->batch_bo_changed) + gen8_3DSTATE_INDEX_BUFFER(r->builder, &vec->ve->vf, &vec->ib.ib); /* 3DSTATE_VF */ - if (session->primitive_restart_changed) { - gen75_3DSTATE_VF(r->builder, vec->draw->primitive_restart, - vec->draw->restart_index); - } + if (session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VF) + gen75_3DSTATE_VF(r->builder, &vec->ve->vf); /* 3DSTATE_VERTEX_BUFFERS */ - if (DIRTY(VB) || DIRTY(VE) || r->batch_bo_changed) - gen6_3DSTATE_VERTEX_BUFFERS(r->builder, vec->ve, &vec->vb); + if ((session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VERTEX_BUFFERS) || + DIRTY(VB) || DIRTY(VE) || r->batch_bo_changed) { + gen6_3DSTATE_VERTEX_BUFFERS(r->builder, &vec->ve->vf, + vec->vb.vb, vec->ve->vb_count); + } /* 3DSTATE_VERTEX_ELEMENTS */ - if (DIRTY(VE)) - gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, vec->ve); + if (session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VERTEX_ELEMENTS) + gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, &vec->ve->vf); + + gen8_3DSTATE_VF_TOPOLOGY(r->builder, vec->draw_info.topology); - gen8_3DSTATE_VF_TOPOLOGY(r->builder, vec->draw->mode); + if (session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VF_INSTANCING) { + const uint8_t attr_count = ilo_state_vf_get_attr_count(&vec->ve->vf); + uint8_t i; - for (i = 0; i < vec->ve->vb_count; i++) { - gen8_3DSTATE_VF_INSTANCING(r->builder, i, - vec->ve->instance_divisors[i]); + for (i = 0; i < attr_count; i++) + gen8_3DSTATE_VF_INSTANCING(r->builder, &vec->ve->vf, i); } - gen8_3DSTATE_VF_SGVS(r->builder, - false, 0, 0, - false, 0, 0); + if (session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VF_SGVS) + gen8_3DSTATE_VF_SGVS(r->builder, &vec->ve->vf); } void @@ -281,7 +269,7 @@ ilo_render_emit_draw_commands_gen8(struct ilo_render *render, gen6_draw_sf_rect(render, vec, session); gen8_draw_vf(render, vec, session); - ilo_render_3dprimitive(render, vec->draw, &vec->ib); + ilo_render_3dprimitive(render, &vec->draw_info); } int @@ -365,17 +353,13 @@ ilo_render_emit_rectlist_commands_gen8(struct ilo_render *r, const struct ilo_blitter *blitter, const struct ilo_render_rectlist_session *session) { - uint32_t op; - ILO_DEV_ASSERT(r->dev, 8, 8); gen8_wa_pre_depth(r); if (blitter->uses & (ILO_BLITTER_USE_FB_DEPTH | - ILO_BLITTER_USE_FB_STENCIL)) { - gen6_3DSTATE_DEPTH_BUFFER(r->builder, - &blitter->fb.dst.u.zs, true); - } + ILO_BLITTER_USE_FB_STENCIL)) + gen6_3DSTATE_DEPTH_BUFFER(r->builder, &blitter->fb.dst.u.zs); if (blitter->uses & ILO_BLITTER_USE_FB_DEPTH) { gen6_3DSTATE_HIER_DEPTH_BUFFER(r->builder, @@ -393,27 +377,8 @@ ilo_render_emit_rectlist_commands_gen8(struct ilo_render *r, gen6_3DSTATE_DRAWING_RECTANGLE(r->builder, 0, 0, blitter->fb.width, blitter->fb.height); - switch (blitter->op) { - case ILO_BLITTER_RECTLIST_CLEAR_ZS: - op = 0; - if (blitter->uses & ILO_BLITTER_USE_FB_DEPTH) - op |= GEN8_WM_HZ_DW1_DEPTH_CLEAR; - if 
(blitter->uses & ILO_BLITTER_USE_FB_STENCIL) - op |= GEN8_WM_HZ_DW1_STENCIL_CLEAR; - break; - case ILO_BLITTER_RECTLIST_RESOLVE_Z: - op = GEN8_WM_HZ_DW1_DEPTH_RESOLVE; - break; - case ILO_BLITTER_RECTLIST_RESOLVE_HIZ: - op = GEN8_WM_HZ_DW1_HIZ_RESOLVE; - break; - default: - op = 0; - break; - } - - gen8_3DSTATE_WM_HZ_OP(r->builder, op, blitter->fb.width, - blitter->fb.height, blitter->fb.num_samples); + gen8_3DSTATE_WM_HZ_OP(r->builder, &blitter->fb.rs, + blitter->fb.width, blitter->fb.height); ilo_render_pipe_control(r, GEN6_PIPE_CONTROL_WRITE_IMM); diff --git a/src/gallium/drivers/ilo/ilo_render_media.c b/src/gallium/drivers/ilo/ilo_render_media.c index 387920a912c..a0de0024d61 100644 --- a/src/gallium/drivers/ilo/ilo_render_media.c +++ b/src/gallium/drivers/ilo/ilo_render_media.c @@ -30,6 +30,7 @@ #include "core/ilo_builder_mi.h" #include "core/ilo_builder_render.h" +#include "ilo_shader.h" #include "ilo_state.h" #include "ilo_render_gen.h" @@ -206,7 +207,7 @@ ilo_render_emit_launch_grid_commands(struct ilo_render *render, gen6_state_base_address(render->builder, true); - gen6_MEDIA_VFE_STATE(render->builder, pcb_size, use_slm); + gen6_MEDIA_VFE_STATE(render->builder, &session->compute); if (pcb_size) gen6_MEDIA_CURBE_LOAD(render->builder, pcb, pcb_size); diff --git a/src/gallium/drivers/ilo/ilo_render_surface.c b/src/gallium/drivers/ilo/ilo_render_surface.c index b345dfb4fc4..ad053564294 100644 --- a/src/gallium/drivers/ilo/ilo_render_surface.c +++ b/src/gallium/drivers/ilo/ilo_render_surface.c @@ -29,11 +29,65 @@ #include "ilo_common.h" #include "ilo_blitter.h" +#include "ilo_resource.h" +#include "ilo_shader.h" #include "ilo_state.h" #include "ilo_render_gen.h" #define DIRTY(state) (session->pipe_dirty & ILO_DIRTY_ ## state) +static inline uint32_t +gen6_so_SURFACE_STATE(struct ilo_builder *builder, + const struct pipe_stream_output_target *so, + const struct pipe_stream_output_info *so_info, + int so_index) +{ + struct ilo_buffer *buf = ilo_buffer(so->buffer); + struct ilo_state_surface_buffer_info info; + struct ilo_state_surface surf; + + ILO_DEV_ASSERT(builder->dev, 6, 6); + + memset(&info, 0, sizeof(info)); + info.buf = buf; + info.access = ILO_STATE_SURFACE_ACCESS_DP_SVB; + + switch (so_info->output[so_index].num_components) { + case 1: + info.format = GEN6_FORMAT_R32_FLOAT; + info.format_size = 4; + break; + case 2: + info.format = GEN6_FORMAT_R32G32_FLOAT; + info.format_size = 8; + break; + case 3: + info.format = GEN6_FORMAT_R32G32B32_FLOAT; + info.format_size = 12; + break; + case 4: + info.format = GEN6_FORMAT_R32G32B32A32_FLOAT; + info.format_size = 16; + break; + default: + assert(!"unexpected SO components length"); + info.format = GEN6_FORMAT_R32_FLOAT; + info.format_size = 4; + break; + } + + info.struct_size = + so_info->stride[so_info->output[so_index].output_buffer] * 4; + info.offset = so->buffer_offset + so_info->output[so_index].dst_offset * 4; + info.size = so->buffer_size - so_info->output[so_index].dst_offset * 4; + + memset(&surf, 0, sizeof(surf)); + ilo_state_surface_init_for_buffer(&surf, builder->dev, &info); + surf.bo = info.buf->bo; + + return gen6_SURFACE_STATE(builder, &surf); +} + static void gen6_emit_draw_surface_rt(struct ilo_render *r, const struct ilo_state_vector *vec, @@ -64,11 +118,9 @@ gen6_emit_draw_surface_rt(struct ilo_render *r, (const struct ilo_surface_cso *) fb->state.cbufs[i]; assert(surface->is_rt); - surface_state[i] = - gen6_SURFACE_STATE(r->builder, &surface->u.rt, true); + surface_state[i] = gen6_SURFACE_STATE(r->builder, 
&surface->u.rt); } else { - surface_state[i] = - gen6_SURFACE_STATE(r->builder, &fb->null_rt, true); + surface_state[i] = gen6_SURFACE_STATE(r->builder, &fb->null_rt); } } } @@ -173,8 +225,7 @@ gen6_emit_draw_surface_view(struct ilo_render *r, const struct ilo_view_cso *cso = (const struct ilo_view_cso *) view->states[i]; - surface_state[i] = - gen6_SURFACE_STATE(r->builder, &cso->surface, false); + surface_state[i] = gen6_SURFACE_STATE(r->builder, &cso->surface); } else { surface_state[i] = 0; } @@ -228,12 +279,10 @@ gen6_emit_draw_surface_const(struct ilo_render *r, for (i = 0; i < count; i++) { const struct ilo_cbuf_cso *cso = &cbuf->cso[i]; - if (cso->resource) { - surface_state[i] = gen6_SURFACE_STATE(r->builder, - &cso->surface, false); - } else { + if (cso->resource) + surface_state[i] = gen6_SURFACE_STATE(r->builder, &cso->surface); + else surface_state[i] = 0; - } } } @@ -406,8 +455,7 @@ gen6_emit_launch_grid_surface_view(struct ilo_render *r, const struct ilo_view_cso *cso = (const struct ilo_view_cso *) view->states[i]; - surface_state[i] = - gen6_SURFACE_STATE(r->builder, &cso->surface, false); + surface_state[i] = gen6_SURFACE_STATE(r->builder, &cso->surface); } else { surface_state[i] = 0; } @@ -421,7 +469,8 @@ gen6_emit_launch_grid_surface_const(struct ilo_render *r, { const struct ilo_shader_state *cs = vec->cs; uint32_t *surface_state = r->state.cs.SURFACE_STATE; - struct ilo_view_surface view; + struct ilo_state_surface_buffer_info info; + struct ilo_state_surface surf; int base, count; ILO_DEV_ASSERT(r->dev, 7, 7.5); @@ -432,15 +481,22 @@ gen6_emit_launch_grid_surface_const(struct ilo_render *r, if (!count) return; - ilo_gpe_init_view_surface_for_buffer(r->dev, - ilo_buffer(session->input->buffer), - session->input->buffer_offset, - session->input->buffer_size, - 1, PIPE_FORMAT_NONE, - false, false, &view); + memset(&info, 0, sizeof(info)); + info.buf = ilo_buffer(session->input->buffer); + info.access = ILO_STATE_SURFACE_ACCESS_DP_UNTYPED; + info.format = GEN6_FORMAT_RAW; + info.format_size = 1; + info.struct_size = 1; + info.readonly = true; + info.offset = session->input->buffer_offset; + info.size = session->input->buffer_size; + + memset(&surf, 0, sizeof(surf)); + ilo_state_surface_init_for_buffer(&surf, r->dev, &info); + surf.bo = info.buf->bo; assert(count == 1 && session->input->buffer); - surface_state[base] = gen6_SURFACE_STATE(r->builder, &view, false); + surface_state[base] = gen6_SURFACE_STATE(r->builder, &surf); } static void @@ -483,14 +539,24 @@ gen6_emit_launch_grid_surface_global(struct ilo_render *r, for (i = 0; i < count; i++) { if (i < vec->global_binding.count && bindings[i].resource) { const struct ilo_buffer *buf = ilo_buffer(bindings[i].resource); - struct ilo_view_surface view; + struct ilo_state_surface_buffer_info info; + struct ilo_state_surface surf; assert(bindings[i].resource->target == PIPE_BUFFER); - ilo_gpe_init_view_surface_for_buffer(r->dev, buf, 0, buf->bo_size, - 1, PIPE_FORMAT_NONE, true, true, &view); - surface_state[i] = - gen6_SURFACE_STATE(r->builder, &view, true); + memset(&info, 0, sizeof(info)); + info.buf = buf; + info.access = ILO_STATE_SURFACE_ACCESS_DP_UNTYPED; + info.format = GEN6_FORMAT_RAW; + info.format_size = 1; + info.struct_size = 1; + info.size = buf->bo_size; + + memset(&surf, 0, sizeof(surf)); + ilo_state_surface_init_for_buffer(&surf, r->dev, &info); + surf.bo = info.buf->bo; + + surface_state[i] = gen6_SURFACE_STATE(r->builder, &surf); } else { surface_state[i] = 0; } diff --git 
a/src/gallium/drivers/ilo/ilo_resource.c b/src/gallium/drivers/ilo/ilo_resource.c index ad4852278d0..be9fd10a84c 100644 --- a/src/gallium/drivers/ilo/ilo_resource.c +++ b/src/gallium/drivers/ilo/ilo_resource.c @@ -178,8 +178,8 @@ tex_create_bo(struct ilo_texture *tex) if (!bo) return false; - ilo_image_set_bo(&tex->image, bo); - intel_bo_unref(bo); + intel_bo_unref(tex->image.bo); + tex->image.bo = bo; return true; } @@ -223,7 +223,7 @@ tex_create_hiz(struct ilo_texture *tex) if (!bo) return false; - ilo_image_set_aux_bo(&tex->image, bo); + tex->image.aux.bo = bo; if (tex->imported) { unsigned lv; @@ -256,7 +256,7 @@ tex_create_mcs(struct ilo_texture *tex) if (!bo) return false; - ilo_image_set_aux_bo(&tex->image, bo); + tex->image.aux.bo = bo; return true; } @@ -267,7 +267,8 @@ tex_destroy(struct ilo_texture *tex) if (tex->separate_s8) tex_destroy(tex->separate_s8); - ilo_image_cleanup(&tex->image); + intel_bo_unref(tex->image.bo); + intel_bo_unref(tex->image.aux.bo); tex_free_slices(tex); FREE(tex); @@ -287,15 +288,13 @@ tex_alloc_bos(struct ilo_texture *tex) switch (tex->image.aux.type) { case ILO_IMAGE_AUX_HIZ: - if (!tex_create_hiz(tex)) { - /* Separate Stencil Buffer requires HiZ to be enabled */ - if (ilo_dev_gen(&is->dev) == ILO_GEN(6) && - tex->image.separate_stencil) - return false; - } + if (!tex_create_hiz(tex) && + !ilo_image_disable_aux(&tex->image, &is->dev)) + return false; break; case ILO_IMAGE_AUX_MCS: - if (!tex_create_mcs(tex)) + if (!tex_create_mcs(tex) && + !ilo_image_disable_aux(&tex->image, &is->dev)) return false; break; default: @@ -328,8 +327,7 @@ tex_import_handle(struct ilo_texture *tex, return false; } - ilo_image_set_bo(&tex->image, bo); - intel_bo_unref(bo); + tex->image.bo = bo; tex->imported = true; @@ -427,8 +425,8 @@ buf_create_bo(struct ilo_buffer_resource *buf) if (!bo) return false; - ilo_buffer_set_bo(&buf->buffer, bo); - intel_bo_unref(bo); + intel_bo_unref(buf->buffer.bo); + buf->buffer.bo = bo; return true; } @@ -436,7 +434,7 @@ buf_create_bo(struct ilo_buffer_resource *buf) static void buf_destroy(struct ilo_buffer_resource *buf) { - ilo_buffer_cleanup(&buf->buffer); + intel_bo_unref(buf->buffer.bo); FREE(buf); } @@ -445,6 +443,7 @@ buf_create(struct pipe_screen *screen, const struct pipe_resource *templ) { const struct ilo_screen *is = ilo_screen(screen); struct ilo_buffer_resource *buf; + unsigned size; buf = CALLOC_STRUCT(ilo_buffer_resource); if (!buf) @@ -454,8 +453,25 @@ buf_create(struct pipe_screen *screen, const struct pipe_resource *templ) buf->base.screen = screen; pipe_reference_init(&buf->base.reference, 1); - ilo_buffer_init(&buf->buffer, &is->dev, - templ->width0, templ->bind, templ->flags); + size = templ->width0; + + /* + * As noted in ilo_format_translate(), we treat some 3-component formats as + * 4-component formats to work around hardware limitations. Imagine the + * case where the vertex buffer holds a single PIPE_FORMAT_R16G16B16_FLOAT + * vertex, and buf->bo_size is 6. The hardware would fail to fetch it at + * boundary check because the vertex buffer is expected to hold a + * PIPE_FORMAT_R16G16B16A16_FLOAT vertex and that takes at least 8 bytes. + * + * For the workaround to work, we should add 2 to the bo size. But that + * would waste a page when the bo size is already page aligned. Let's + * round it to page size for now and revisit this when needed. 
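+ *
+ * Continuing the example above (illustrative numbers only): width0 = 6
+ * for the single PIPE_FORMAT_R16G16B16_FLOAT vertex, while the hardware
+ * fetch expects at least 8 bytes, so 2 bytes of padding would suffice;
+ * rounding with align(6, 4096) = 4096 covers that comfortably.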
+ */ + if ((templ->bind & PIPE_BIND_VERTEX_BUFFER) && + ilo_dev_gen(&is->dev) < ILO_GEN(7.5)) + size = align(size, 4096); + + ilo_buffer_init(&buf->buffer, &is->dev, size, templ->bind, templ->flags); if (buf->buffer.bo_size < templ->width0 || buf->buffer.bo_size > ilo_max_resource_size || diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c index 918af0820de..94105559b80 100644 --- a/src/gallium/drivers/ilo/ilo_screen.c +++ b/src/gallium/drivers/ilo/ilo_screen.c @@ -31,11 +31,10 @@ #include "vl/vl_decoder.h" #include "vl/vl_video_buffer.h" #include "genhw/genhw.h" /* for GEN6_REG_TIMESTAMP */ -#include "core/ilo_fence.h" -#include "core/ilo_format.h" #include "core/intel_winsys.h" #include "ilo_context.h" +#include "ilo_format.h" #include "ilo_resource.h" #include "ilo_transfer.h" /* for ILO_TRANSFER_MAP_BUFFER_ALIGNMENT */ #include "ilo_public.h" @@ -43,8 +42,7 @@ struct pipe_fence_handle { struct pipe_reference reference; - - struct ilo_fence fence; + struct intel_bo *seqno_bo; }; static float @@ -347,7 +345,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_INDEP_BLEND_FUNC: return true; case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: - return (ilo_dev_gen(&is->dev) >= ILO_GEN(7)) ? 2048 : 512; + return (ilo_dev_gen(&is->dev) >= ILO_GEN(7.5)) ? 2048 : 512; case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT: case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: @@ -458,6 +456,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_SAMPLER_VIEW_TARGET: case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: return 0; case PIPE_CAP_VENDOR_ID: @@ -641,7 +640,7 @@ ilo_screen_fence_reference(struct pipe_screen *screen, STATIC_ASSERT(&((struct pipe_fence_handle *) NULL)->reference == NULL); if (pipe_reference(&old->reference, &fence->reference)) { - ilo_fence_cleanup(&old->fence); + intel_bo_unref(old->seqno_bo); FREE(old); } } @@ -654,10 +653,14 @@ ilo_screen_fence_finish(struct pipe_screen *screen, const int64_t wait_timeout = (timeout > INT64_MAX) ? 
-1 : timeout; bool signaled; - signaled = ilo_fence_wait(&fence->fence, wait_timeout); + signaled = (!fence->seqno_bo || + intel_bo_wait(fence->seqno_bo, wait_timeout) == 0); + /* XXX not thread safe */ - if (signaled) - ilo_fence_set_seq_bo(&fence->fence, NULL); + if (signaled && fence->seqno_bo) { + intel_bo_unref(fence->seqno_bo); + fence->seqno_bo = NULL; + } return signaled; } @@ -676,7 +679,6 @@ ilo_screen_fence_signalled(struct pipe_screen *screen, struct pipe_fence_handle * ilo_screen_fence_create(struct pipe_screen *screen, struct intel_bo *bo) { - struct ilo_screen *is = ilo_screen(screen); struct pipe_fence_handle *fence; fence = CALLOC_STRUCT(pipe_fence_handle); @@ -685,8 +687,7 @@ ilo_screen_fence_create(struct pipe_screen *screen, struct intel_bo *bo) pipe_reference_init(&fence->reference, 1); - ilo_fence_init(&fence->fence, &is->dev); - ilo_fence_set_seq_bo(&fence->fence, bo); + fence->seqno_bo = intel_bo_ref(bo); return fence; } @@ -696,7 +697,7 @@ ilo_screen_destroy(struct pipe_screen *screen) { struct ilo_screen *is = ilo_screen(screen); - ilo_dev_cleanup(&is->dev); + intel_winsys_destroy(is->dev.winsys); FREE(is); } diff --git a/src/gallium/drivers/ilo/ilo_shader.c b/src/gallium/drivers/ilo/ilo_shader.c index 799db2cbfcb..5f2b01017e2 100644 --- a/src/gallium/drivers/ilo/ilo_shader.c +++ b/src/gallium/drivers/ilo/ilo_shader.c @@ -27,7 +27,6 @@ #include "genhw/genhw.h" /* for SBE setup */ #include "core/ilo_builder.h" -#include "core/ilo_state_3d.h" #include "core/intel_winsys.h" #include "shader/ilo_shader_internal.h" #include "tgsi/tgsi_parse.h" @@ -557,39 +556,255 @@ ilo_shader_state_search_variant(struct ilo_shader_state *state, } static void -copy_so_info(struct ilo_shader *sh, - const struct pipe_stream_output_info *so_info) +init_shader_urb(const struct ilo_shader *kernel, + const struct ilo_shader_state *state, + struct ilo_state_shader_urb_info *urb) { - unsigned i, attr; + urb->cv_input_attr_count = kernel->in.count; + urb->read_base = 0; + urb->read_count = kernel->in.count; - if (!so_info->num_outputs) + urb->output_attr_count = kernel->out.count; + urb->user_cull_enables = 0x0; + urb->user_clip_enables = 0x0; +} + +static void +init_shader_kernel(const struct ilo_shader *kernel, + const struct ilo_shader_state *state, + struct ilo_state_shader_kernel_info *kern) +{ + kern->offset = 0; + kern->grf_start = kernel->in.start_grf; + kern->pcb_attr_count = + (kernel->pcb.cbuf0_size + kernel->pcb.clip_state_size + 15) / 16; + kern->scratch_size = 0; +} + +static void +init_shader_resource(const struct ilo_shader *kernel, + const struct ilo_shader_state *state, + struct ilo_state_shader_resource_info *resource) +{ + resource->sampler_count = state->info.num_samplers; + resource->surface_count = 0; + resource->has_uav = false; +} + +static void +init_vs(struct ilo_shader *kernel, + const struct ilo_shader_state *state) +{ + struct ilo_state_vs_info info; + + memset(&info, 0, sizeof(info)); + + init_shader_urb(kernel, state, &info.urb); + init_shader_kernel(kernel, state, &info.kernel); + init_shader_resource(kernel, state, &info.resource); + info.dispatch_enable = true; + info.stats_enable = true; + + if (ilo_dev_gen(state->info.dev) == ILO_GEN(6) && kernel->stream_output) { + struct ilo_state_gs_info gs_info; + + memset(&gs_info, 0, sizeof(gs_info)); + + gs_info.urb.cv_input_attr_count = kernel->out.count; + gs_info.urb.read_count = kernel->out.count; + gs_info.kernel.grf_start = kernel->gs_start_grf; + gs_info.sol.sol_enable = true; + gs_info.sol.stats_enable = true; 
+ gs_info.sol.render_disable = kernel->variant.u.vs.rasterizer_discard; + gs_info.sol.svbi_post_inc = kernel->svbi_post_inc; + gs_info.sol.tristrip_reorder = GEN7_REORDER_LEADING; + gs_info.dispatch_enable = true; + gs_info.stats_enable = true; + + ilo_state_vs_init(&kernel->cso.vs_sol.vs, state->info.dev, &info); + ilo_state_gs_init(&kernel->cso.vs_sol.sol, state->info.dev, &gs_info); + } else { + ilo_state_vs_init(&kernel->cso.vs, state->info.dev, &info); + } +} + +static void +init_gs(struct ilo_shader *kernel, + const struct ilo_shader_state *state) +{ + const struct pipe_stream_output_info *so_info = &state->info.stream_output; + struct ilo_state_gs_info info; + + memset(&info, 0, sizeof(info)); + + init_shader_urb(kernel, state, &info.urb); + init_shader_kernel(kernel, state, &info.kernel); + init_shader_resource(kernel, state, &info.resource); + info.dispatch_enable = true; + info.stats_enable = true; + + if (so_info->num_outputs > 0) { + info.sol.sol_enable = true; + info.sol.stats_enable = true; + info.sol.render_disable = kernel->variant.u.gs.rasterizer_discard; + info.sol.tristrip_reorder = GEN7_REORDER_LEADING; + } + + ilo_state_gs_init(&kernel->cso.gs, state->info.dev, &info); +} + +static void +init_ps(struct ilo_shader *kernel, + const struct ilo_shader_state *state) +{ + struct ilo_state_ps_info info; + + memset(&info, 0, sizeof(info)); + + init_shader_kernel(kernel, state, &info.kernel_8); + init_shader_resource(kernel, state, &info.resource); + + info.io.has_rt_write = true; + info.io.posoffset = GEN6_POSOFFSET_NONE; + info.io.attr_count = kernel->in.count; + info.io.use_z = kernel->in.has_pos; + info.io.use_w = kernel->in.has_pos; + info.io.use_coverage_mask = false; + info.io.pscdepth = (kernel->out.has_pos) ? + GEN7_PSCDEPTH_ON : GEN7_PSCDEPTH_OFF; + info.io.write_pixel_mask = kernel->has_kill; + info.io.write_omask = false; + + info.params.sample_mask = 0x1; + info.params.earlyz_control_psexec = false; + info.params.alpha_may_kill = false; + info.params.dual_source_blending = false; + info.params.has_writeable_rt = true; + + info.valid_kernels = GEN6_PS_DISPATCH_8; + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 284: + * + * "(MSDISPMODE_PERSAMPLE) This is the high-quality multisample mode + * where (over and above PERPIXEL mode) the PS is run for each covered + * sample. This mode is also used for "normal" non-multisample + * rendering (aka 1X), given Number of Multisamples is programmed to + * NUMSAMPLES_1." 
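With Number of Multisamples programmed to NUMSAMPLES_1, per-sample and per-pixel dispatch therefore request the same work, which is why per_sample_dispatch can simply stay enabled below. A toy model of that equivalence (names are illustrative, not driver API):

    #include <assert.h>

    /* toy model: PS invocations requested for one covered pixel */
    static unsigned ps_invocations(unsigned covered_samples, int per_sample)
    {
       return per_sample ? covered_samples : 1;
    }

    int main(void)
    {
       /* with NUMSAMPLES_1 each covered pixel has exactly one covered
        * sample, so per-sample and per-pixel dispatch coincide */
       assert(ps_invocations(1, 1) == ps_invocations(1, 0));
       return 0;
    }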
+ */ + info.per_sample_dispatch = true; + + info.rt_clear_enable = false; + info.rt_resolve_enable = false; + info.cv_per_sample_interp = false; + info.cv_has_earlyz_op = false; + info.sample_count_one = true; + info.cv_has_depth_buffer = true; + + ilo_state_ps_init(&kernel->cso.ps, state->info.dev, &info); + + /* remember current parameters */ + kernel->ps_params = info.params; +} + +static void +init_sol(struct ilo_shader *kernel, + const struct ilo_dev *dev, + const struct pipe_stream_output_info *so_info, + bool rasterizer_discard) +{ + struct ilo_state_sol_decl_info decls[4][PIPE_MAX_SO_OUTPUTS]; + unsigned buf_offsets[PIPE_MAX_SO_BUFFERS]; + struct ilo_state_sol_info info; + unsigned i; + + if (!so_info->num_outputs) { + ilo_state_sol_init_disabled(&kernel->sol, dev, rasterizer_discard); return; + } + + memset(&info, 0, sizeof(info)); + info.data = kernel->sol_data; + info.data_size = sizeof(kernel->sol_data); + info.sol_enable = true; + info.stats_enable = true; + info.tristrip_reorder = GEN7_REORDER_TRAILING; + info.render_disable = rasterizer_discard; + info.render_stream = 0; + + for (i = 0; i < 4; i++) { + info.buffer_strides[i] = so_info->stride[i] * 4; - sh->so_info = *so_info; + info.streams[i].cv_vue_attr_count = kernel->out.count; + info.streams[i].decls = decls[i]; + } + memset(decls, 0, sizeof(decls)); + memset(buf_offsets, 0, sizeof(buf_offsets)); for (i = 0; i < so_info->num_outputs; i++) { + const unsigned stream = so_info->output[i].stream; + const unsigned buffer = so_info->output[i].output_buffer; + struct ilo_state_sol_decl_info *decl; + unsigned attr; + /* figure out which attribute is sourced */ - for (attr = 0; attr < sh->out.count; attr++) { - const int reg_idx = sh->out.register_indices[attr]; + for (attr = 0; attr < kernel->out.count; attr++) { + const int reg_idx = kernel->out.register_indices[attr]; if (reg_idx == so_info->output[i].register_index) break; } - - if (attr < sh->out.count) { - sh->so_info.output[i].register_index = attr; - } - else { + if (attr >= kernel->out.count) { assert(!"stream output an undefined register"); - sh->so_info.output[i].register_index = 0; + attr = 0; } + if (info.streams[stream].vue_read_count < attr + 1) + info.streams[stream].vue_read_count = attr + 1; + + /* pad with holes first */ + while (buf_offsets[buffer] < so_info->output[i].dst_offset) { + int num_dwords; + + num_dwords = so_info->output[i].dst_offset - buf_offsets[buffer]; + if (num_dwords > 4) + num_dwords = 4; + + assert(info.streams[stream].decl_count < ARRAY_SIZE(decls[stream])); + decl = &decls[stream][info.streams[stream].decl_count]; + + decl->attr = 0; + decl->is_hole = true; + decl->component_base = 0; + decl->component_count = num_dwords; + decl->buffer = buffer; + + info.streams[stream].decl_count++; + buf_offsets[buffer] += num_dwords; + } + assert(buf_offsets[buffer] == so_info->output[i].dst_offset); + + assert(info.streams[stream].decl_count < ARRAY_SIZE(decls[stream])); + decl = &decls[stream][info.streams[stream].decl_count]; + + decl->attr = attr; + decl->is_hole = false; /* PSIZE is at W channel */ - if (sh->out.semantic_names[attr] == TGSI_SEMANTIC_PSIZE) { + if (kernel->out.semantic_names[attr] == TGSI_SEMANTIC_PSIZE) { assert(so_info->output[i].start_component == 0); assert(so_info->output[i].num_components == 1); - sh->so_info.output[i].start_component = 3; + decl->component_base = 3; + decl->component_count = 1; + } else { + decl->component_base = so_info->output[i].start_component; + decl->component_count = 
so_info->output[i].num_components; } + decl->buffer = buffer; + + info.streams[stream].decl_count++; + buf_offsets[buffer] += so_info->output[i].num_components; } + + ilo_state_sol_init(&kernel->sol, dev, &info); } /** @@ -599,17 +814,20 @@ static struct ilo_shader * ilo_shader_state_add_variant(struct ilo_shader_state *state, const struct ilo_shader_variant *variant) { + bool rasterizer_discard = false; struct ilo_shader *sh; switch (state->info.type) { case PIPE_SHADER_VERTEX: sh = ilo_shader_compile_vs(state, variant); + rasterizer_discard = variant->u.vs.rasterizer_discard; break; case PIPE_SHADER_FRAGMENT: sh = ilo_shader_compile_fs(state, variant); break; case PIPE_SHADER_GEOMETRY: sh = ilo_shader_compile_gs(state, variant); + rasterizer_discard = variant->u.gs.rasterizer_discard; break; case PIPE_SHADER_COMPUTE: sh = ilo_shader_compile_cs(state, variant); @@ -625,7 +843,8 @@ ilo_shader_state_add_variant(struct ilo_shader_state *state, sh->variant = *variant; - copy_so_info(sh, &state->info.stream_output); + init_sol(sh, state->info.dev, &state->info.stream_output, + rasterizer_discard); ilo_shader_state_add_shader(state, sh); @@ -665,13 +884,13 @@ ilo_shader_state_use_variant(struct ilo_shader_state *state, if (construct_cso) { switch (state->info.type) { case PIPE_SHADER_VERTEX: - ilo_gpe_init_vs_cso(state->info.dev, state, &sh->cso); + init_vs(sh, state); break; case PIPE_SHADER_GEOMETRY: - ilo_gpe_init_gs_cso(state->info.dev, state, &sh->cso); + init_gs(sh, state); break; case PIPE_SHADER_FRAGMENT: - ilo_gpe_init_fs_cso(state->info.dev, state, &sh->cso); + init_ps(sh, state); break; default: break; @@ -789,16 +1008,33 @@ ilo_shader_select_kernel(struct ilo_shader_state *shader, const struct ilo_state_vector *vec, uint32_t dirty) { - const struct ilo_shader * const cur = shader->shader; struct ilo_shader_variant variant; + bool changed = false; - if (!(shader->info.non_orthogonal_states & dirty)) - return false; + if (shader->info.non_orthogonal_states & dirty) { + const struct ilo_shader * const old = shader->shader; + + ilo_shader_variant_init(&variant, &shader->info, vec); + ilo_shader_state_use_variant(shader, &variant); + changed = (shader->shader != old); + } - ilo_shader_variant_init(&variant, &shader->info, vec); - ilo_shader_state_use_variant(shader, &variant); + if (shader->info.type == PIPE_SHADER_FRAGMENT) { + struct ilo_shader *kernel = shader->shader; - return (shader->shader != cur); + if (kernel->ps_params.sample_mask != vec->sample_mask || + kernel->ps_params.alpha_may_kill != vec->blend->alpha_may_kill) { + kernel->ps_params.sample_mask = vec->sample_mask; + kernel->ps_params.alpha_may_kill = vec->blend->alpha_may_kill; + + ilo_state_ps_set_params(&kernel->cso.ps, shader->info.dev, + &kernel->ps_params); + + changed = true; + } + } + + return changed; } static int @@ -829,82 +1065,104 @@ route_attr(const int *semantics, const int *indices, int len, * \return true if a different routing is selected */ bool -ilo_shader_select_kernel_routing(struct ilo_shader_state *shader, - const struct ilo_shader_state *source, - const struct ilo_rasterizer_state *rasterizer) +ilo_shader_select_kernel_sbe(struct ilo_shader_state *shader, + const struct ilo_shader_state *source, + const struct ilo_rasterizer_state *rasterizer) { - const uint32_t sprite_coord_enable = rasterizer->state.sprite_coord_enable; + const bool is_point = true; const bool light_twoside = rasterizer->state.light_twoside; + const uint32_t sprite_coord_enable = rasterizer->state.sprite_coord_enable; + const 
int sprite_coord_mode = rasterizer->state.sprite_coord_mode; struct ilo_shader *kernel = shader->shader; struct ilo_kernel_routing *routing = &kernel->routing; + struct ilo_state_sbe_swizzle_info swizzles[ILO_STATE_SBE_MAX_SWIZZLE_COUNT]; + struct ilo_state_sbe_info info; const int *src_semantics, *src_indices; - int src_len, max_src_slot; + int src_skip, src_len, src_slot; int dst_len, dst_slot; - /* we are constructing 3DSTATE_SBE here */ - ILO_DEV_ASSERT(shader->info.dev, 6, 8); - assert(kernel); if (source) { assert(source->shader); + src_semantics = source->shader->out.semantic_names; src_indices = source->shader->out.semantic_indices; src_len = source->shader->out.count; - } - else { + src_skip = 0; + + assert(src_len >= 2 && + src_semantics[0] == TGSI_SEMANTIC_PSIZE && + src_semantics[1] == TGSI_SEMANTIC_POSITION); + + /* + * skip PSIZE and POSITION (how about the optional CLIPDISTs?), unless + * they are all the source shader has and FS needs to read some + * attributes. + */ + if (src_len > 2 || !kernel->in.count) { + src_semantics += 2; + src_indices += 2; + src_len -= 2; + src_skip = 2; + } + } else { src_semantics = kernel->in.semantic_names; src_indices = kernel->in.semantic_indices; src_len = kernel->in.count; + src_skip = 0; } /* no change */ - if (kernel->routing_initialized && - routing->source_skip + routing->source_len <= src_len && - kernel->routing_sprite_coord_enable == sprite_coord_enable && - !memcmp(kernel->routing_src_semantics, - &src_semantics[routing->source_skip], - sizeof(kernel->routing_src_semantics[0]) * routing->source_len) && - !memcmp(kernel->routing_src_indices, - &src_indices[routing->source_skip], - sizeof(kernel->routing_src_indices[0]) * routing->source_len)) + if (routing->initialized && + routing->is_point == is_point && + routing->light_twoside == light_twoside && + routing->sprite_coord_enable == sprite_coord_enable && + routing->sprite_coord_mode == sprite_coord_mode && + routing->src_len <= src_len && + !memcmp(routing->src_semantics, src_semantics, + sizeof(src_semantics[0]) * routing->src_len) && + !memcmp(routing->src_indices, src_indices, + sizeof(src_indices[0]) * routing->src_len)) return false; - if (source) { - /* skip PSIZE and POSITION (how about the optional CLIPDISTs?) 
*/ - assert(src_semantics[0] == TGSI_SEMANTIC_PSIZE); - assert(src_semantics[1] == TGSI_SEMANTIC_POSITION); - routing->source_skip = 2; - - routing->source_len = src_len - routing->source_skip; - src_semantics += routing->source_skip; - src_indices += routing->source_skip; - } - else { - routing->source_skip = 0; - routing->source_len = src_len; - } - - routing->const_interp_enable = kernel->in.const_interp_enable; - routing->point_sprite_enable = 0; - routing->swizzle_enable = false; - - assert(kernel->in.count <= Elements(routing->swizzles)); - dst_len = MIN2(kernel->in.count, Elements(routing->swizzles)); - max_src_slot = -1; + routing->is_point = is_point; + routing->light_twoside = light_twoside; + routing->sprite_coord_enable = sprite_coord_enable; + routing->sprite_coord_mode = sprite_coord_mode; + + assert(kernel->in.count <= Elements(swizzles)); + dst_len = MIN2(kernel->in.count, Elements(swizzles)); + + memset(&swizzles, 0, sizeof(swizzles)); + memset(&info, 0, sizeof(info)); + + info.attr_count = dst_len; + info.cv_vue_attr_count = src_skip + src_len; + info.vue_read_base = src_skip; + info.vue_read_count = 0; + info.has_min_read_count = true; + info.swizzle_enable = false; + info.swizzle_16_31 = false; + info.swizzle_count = 0; + info.swizzles = swizzles; + info.const_interp_enables = kernel->in.const_interp_enable; + info.point_sprite_enables = 0x0; + info.point_sprite_origin_lower_left = + (sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT); + info.cv_is_point = is_point; for (dst_slot = 0; dst_slot < dst_len; dst_slot++) { const int semantic = kernel->in.semantic_names[dst_slot]; const int index = kernel->in.semantic_indices[dst_slot]; - int src_slot; if (semantic == TGSI_SEMANTIC_GENERIC && (sprite_coord_enable & (1 << index))) - routing->point_sprite_enable |= 1 << dst_slot; + info.point_sprite_enables |= 1 << dst_slot; if (source) { - src_slot = route_attr(src_semantics, src_indices, - routing->source_len, semantic, index); + src_slot = route_attr(src_semantics, src_indices, src_len, + semantic, index); /* * The source shader stage does not output this attribute. 
The value @@ -918,58 +1176,47 @@ ilo_shader_select_kernel_routing(struct ilo_shader_state *shader, */ if (src_slot < 0) src_slot = 0; - } - else { + } else { src_slot = dst_slot; } - routing->swizzles[dst_slot] = src_slot; - /* use the following slot for two-sided lighting */ if (semantic == TGSI_SEMANTIC_COLOR && light_twoside && - src_slot + 1 < routing->source_len && + src_slot + 1 < src_len && src_semantics[src_slot + 1] == TGSI_SEMANTIC_BCOLOR && src_indices[src_slot + 1] == index) { - routing->swizzles[dst_slot] |= GEN8_SBE_SWIZ_INPUTATTR_FACING; + swizzles[dst_slot].attr_select = GEN6_INPUTATTR_FACING; + swizzles[dst_slot].attr = src_slot; + info.swizzle_enable = true; src_slot++; + } else { + swizzles[dst_slot].attr_select = GEN6_INPUTATTR_NORMAL; + swizzles[dst_slot].attr = src_slot; + if (src_slot != dst_slot) + info.swizzle_enable = true; } - if (routing->swizzles[dst_slot] != dst_slot) - routing->swizzle_enable = true; + swizzles[dst_slot].force_zeros = false; - if (max_src_slot < src_slot) - max_src_slot = src_slot; + if (info.vue_read_count < src_slot + 1) + info.vue_read_count = src_slot + 1; } - memset(&routing->swizzles[dst_slot], 0, sizeof(routing->swizzles) - - sizeof(routing->swizzles[0]) * dst_slot); + if (info.swizzle_enable) + info.swizzle_count = dst_len; - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 248: - * - * "It is UNDEFINED to set this field (Vertex URB Entry Read Length) to - * 0 indicating no Vertex URB data to be read. - * - * This field should be set to the minimum length required to read the - * maximum source attribute. The maximum source attribute is indicated - * by the maximum value of the enabled Attribute # Source Attribute if - * Attribute Swizzle Enable is set, Number of Output Attributes-1 if - * enable is not set. - * - * read_length = ceiling((max_source_attr+1)/2) - * - * [errata] Corruption/Hang possible if length programmed larger than - * recommended" - */ - routing->source_len = max_src_slot + 1; + if (routing->initialized) + ilo_state_sbe_set_info(&routing->sbe, shader->info.dev, &info); + else + ilo_state_sbe_init(&routing->sbe, shader->info.dev, &info); + + routing->src_len = info.vue_read_count; + memcpy(routing->src_semantics, src_semantics, + sizeof(src_semantics[0]) * routing->src_len); + memcpy(routing->src_indices, src_indices, + sizeof(src_indices[0]) * routing->src_len); - /* remember the states of the source */ - kernel->routing_initialized = true; - kernel->routing_sprite_coord_enable = sprite_coord_enable; - memcpy(kernel->routing_src_semantics, src_semantics, - sizeof(kernel->routing_src_semantics[0]) * routing->source_len); - memcpy(kernel->routing_src_indices, src_indices, - sizeof(kernel->routing_src_indices[0]) * routing->source_len); + routing->initialized = true; return true; } @@ -1147,7 +1394,7 @@ ilo_shader_get_kernel_param(const struct ilo_shader_state *shader, /** * Return the CSO of the selected kernel. 
*/ -const struct ilo_shader_cso * +const union ilo_shader_cso * ilo_shader_get_kernel_cso(const struct ilo_shader_state *shader) { const struct ilo_shader *kernel = shader->shader; @@ -1163,22 +1410,28 @@ ilo_shader_get_kernel_cso(const struct ilo_shader_state *shader) const struct pipe_stream_output_info * ilo_shader_get_kernel_so_info(const struct ilo_shader_state *shader) { + return &shader->info.stream_output; +} + +const struct ilo_state_sol * +ilo_shader_get_kernel_sol(const struct ilo_shader_state *shader) +{ const struct ilo_shader *kernel = shader->shader; assert(kernel); - return &kernel->so_info; + return &kernel->sol; } /** * Return the routing info of the selected kernel. */ -const struct ilo_kernel_routing * -ilo_shader_get_kernel_routing(const struct ilo_shader_state *shader) +const struct ilo_state_sbe * +ilo_shader_get_kernel_sbe(const struct ilo_shader_state *shader) { const struct ilo_shader *kernel = shader->shader; assert(kernel); - return &kernel->routing; + return &kernel->routing.sbe; } diff --git a/src/gallium/drivers/ilo/ilo_shader.h b/src/gallium/drivers/ilo/ilo_shader.h index 8a359001bb8..d9f02a4746a 100644 --- a/src/gallium/drivers/ilo/ilo_shader.h +++ b/src/gallium/drivers/ilo/ilo_shader.h @@ -28,6 +28,8 @@ #ifndef ILO_SHADER_H #define ILO_SHADER_H +#include "core/ilo_state_shader.h" + #include "ilo_common.h" enum ilo_kernel_param { @@ -81,23 +83,28 @@ enum ilo_kernel_param { ILO_KERNEL_PARAM_COUNT, }; -struct ilo_kernel_routing { - uint32_t const_interp_enable; - uint32_t point_sprite_enable; - unsigned source_skip, source_len; - - bool swizzle_enable; - uint16_t swizzles[16]; -}; - struct intel_bo; struct ilo_builder; struct ilo_rasterizer_state; struct ilo_shader_cache; struct ilo_shader_state; -struct ilo_shader_cso; +struct ilo_state_sbe; +struct ilo_state_sol; struct ilo_state_vector; +union ilo_shader_cso { + struct ilo_state_vs vs; + struct ilo_state_hs hs; + struct ilo_state_ds ds; + struct ilo_state_gs gs; + struct ilo_state_ps ps; + + struct { + struct ilo_state_vs vs; + struct ilo_state_gs sol; + } vs_sol; +}; + struct ilo_shader_cache * ilo_shader_cache_create(void); @@ -151,9 +158,9 @@ ilo_shader_select_kernel(struct ilo_shader_state *shader, uint32_t dirty); bool -ilo_shader_select_kernel_routing(struct ilo_shader_state *shader, - const struct ilo_shader_state *source, - const struct ilo_rasterizer_state *rasterizer); +ilo_shader_select_kernel_sbe(struct ilo_shader_state *shader, + const struct ilo_shader_state *source, + const struct ilo_rasterizer_state *rasterizer); uint32_t ilo_shader_get_kernel_offset(const struct ilo_shader_state *shader); @@ -162,13 +169,16 @@ int ilo_shader_get_kernel_param(const struct ilo_shader_state *shader, enum ilo_kernel_param param); -const struct ilo_shader_cso * +const union ilo_shader_cso * ilo_shader_get_kernel_cso(const struct ilo_shader_state *shader); const struct pipe_stream_output_info * ilo_shader_get_kernel_so_info(const struct ilo_shader_state *shader); -const struct ilo_kernel_routing * -ilo_shader_get_kernel_routing(const struct ilo_shader_state *shader); +const struct ilo_state_sol * +ilo_shader_get_kernel_sol(const struct ilo_shader_state *shader); + +const struct ilo_state_sbe * +ilo_shader_get_kernel_sbe(const struct ilo_shader_state *shader); #endif /* ILO_SHADER_H */ diff --git a/src/gallium/drivers/ilo/ilo_state.c b/src/gallium/drivers/ilo/ilo_state.c index b1bd49a0b6c..63534f33fa7 100644 --- a/src/gallium/drivers/ilo/ilo_state.c +++ b/src/gallium/drivers/ilo/ilo_state.c @@ -25,16 +25,288 @@ 
* Chia-I Wu <[email protected]> */ -#include "core/ilo_state_3d.h" +#include "util/u_dual_blend.h" #include "util/u_dynarray.h" +#include "util/u_framebuffer.h" #include "util/u_helpers.h" +#include "util/u_resource.h" #include "util/u_upload_mgr.h" #include "ilo_context.h" +#include "ilo_format.h" #include "ilo_resource.h" #include "ilo_shader.h" #include "ilo_state.h" +/** + * Translate a pipe primitive type to the matching hardware primitive type. + */ +static enum gen_3dprim_type +ilo_translate_draw_mode(unsigned mode) +{ + static const enum gen_3dprim_type prim_mapping[PIPE_PRIM_MAX] = { + [PIPE_PRIM_POINTS] = GEN6_3DPRIM_POINTLIST, + [PIPE_PRIM_LINES] = GEN6_3DPRIM_LINELIST, + [PIPE_PRIM_LINE_LOOP] = GEN6_3DPRIM_LINELOOP, + [PIPE_PRIM_LINE_STRIP] = GEN6_3DPRIM_LINESTRIP, + [PIPE_PRIM_TRIANGLES] = GEN6_3DPRIM_TRILIST, + [PIPE_PRIM_TRIANGLE_STRIP] = GEN6_3DPRIM_TRISTRIP, + [PIPE_PRIM_TRIANGLE_FAN] = GEN6_3DPRIM_TRIFAN, + [PIPE_PRIM_QUADS] = GEN6_3DPRIM_QUADLIST, + [PIPE_PRIM_QUAD_STRIP] = GEN6_3DPRIM_QUADSTRIP, + [PIPE_PRIM_POLYGON] = GEN6_3DPRIM_POLYGON, + [PIPE_PRIM_LINES_ADJACENCY] = GEN6_3DPRIM_LINELIST_ADJ, + [PIPE_PRIM_LINE_STRIP_ADJACENCY] = GEN6_3DPRIM_LINESTRIP_ADJ, + [PIPE_PRIM_TRIANGLES_ADJACENCY] = GEN6_3DPRIM_TRILIST_ADJ, + [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = GEN6_3DPRIM_TRISTRIP_ADJ, + }; + + assert(prim_mapping[mode]); + + return prim_mapping[mode]; +} + +static enum gen_index_format +ilo_translate_index_size(unsigned index_size) +{ + switch (index_size) { + case 1: return GEN6_INDEX_BYTE; + case 2: return GEN6_INDEX_WORD; + case 4: return GEN6_INDEX_DWORD; + default: + assert(!"unknown index size"); + return GEN6_INDEX_BYTE; + } +} + +static enum gen_mip_filter +ilo_translate_mip_filter(unsigned filter) +{ + switch (filter) { + case PIPE_TEX_MIPFILTER_NEAREST: return GEN6_MIPFILTER_NEAREST; + case PIPE_TEX_MIPFILTER_LINEAR: return GEN6_MIPFILTER_LINEAR; + case PIPE_TEX_MIPFILTER_NONE: return GEN6_MIPFILTER_NONE; + default: + assert(!"unknown mipfilter"); + return GEN6_MIPFILTER_NONE; + } +} + +static int +ilo_translate_img_filter(unsigned filter) +{ + switch (filter) { + case PIPE_TEX_FILTER_NEAREST: return GEN6_MAPFILTER_NEAREST; + case PIPE_TEX_FILTER_LINEAR: return GEN6_MAPFILTER_LINEAR; + default: + assert(!"unknown sampler filter"); + return GEN6_MAPFILTER_NEAREST; + } +} + +static enum gen_texcoord_mode +ilo_translate_address_wrap(unsigned wrap) +{ + switch (wrap) { + case PIPE_TEX_WRAP_CLAMP: return GEN8_TEXCOORDMODE_HALF_BORDER; + case PIPE_TEX_WRAP_REPEAT: return GEN6_TEXCOORDMODE_WRAP; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return GEN6_TEXCOORDMODE_CLAMP; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return GEN6_TEXCOORDMODE_CLAMP_BORDER; + case PIPE_TEX_WRAP_MIRROR_REPEAT: return GEN6_TEXCOORDMODE_MIRROR; + case PIPE_TEX_WRAP_MIRROR_CLAMP: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + default: + assert(!"unknown sampler wrap mode"); + return GEN6_TEXCOORDMODE_WRAP; + } +} + +static enum gen_aniso_ratio +ilo_translate_max_anisotropy(unsigned max_anisotropy) +{ + switch (max_anisotropy) { + case 0: case 1: case 2: return GEN6_ANISORATIO_2; + case 3: case 4: return GEN6_ANISORATIO_4; + case 5: case 6: return GEN6_ANISORATIO_6; + case 7: case 8: return GEN6_ANISORATIO_8; + case 9: case 10: return GEN6_ANISORATIO_10; + case 11: case 12: return GEN6_ANISORATIO_12; + case 13: case 14: return GEN6_ANISORATIO_14; + default: return GEN6_ANISORATIO_16; + } +} + +static enum gen_prefilter_op +ilo_translate_shadow_func(unsigned func) +{ 
+ /* + * For PIPE_FUNC_x, the reference value is on the left-hand side of the + * comparison, and 1.0 is returned when the comparison is true. + * + * For GEN6_PREFILTEROP_x, the reference value is on the right-hand side of + * the comparison, and 0.0 is returned when the comparison is true. + */ + switch (func) { + case PIPE_FUNC_NEVER: return GEN6_PREFILTEROP_ALWAYS; + case PIPE_FUNC_LESS: return GEN6_PREFILTEROP_LEQUAL; + case PIPE_FUNC_EQUAL: return GEN6_PREFILTEROP_NOTEQUAL; + case PIPE_FUNC_LEQUAL: return GEN6_PREFILTEROP_LESS; + case PIPE_FUNC_GREATER: return GEN6_PREFILTEROP_GEQUAL; + case PIPE_FUNC_NOTEQUAL: return GEN6_PREFILTEROP_EQUAL; + case PIPE_FUNC_GEQUAL: return GEN6_PREFILTEROP_GREATER; + case PIPE_FUNC_ALWAYS: return GEN6_PREFILTEROP_NEVER; + default: + assert(!"unknown shadow compare function"); + return GEN6_PREFILTEROP_NEVER; + } +} + +static enum gen_front_winding +ilo_translate_front_ccw(unsigned front_ccw) +{ + return (front_ccw) ? GEN6_FRONTWINDING_CCW : GEN6_FRONTWINDING_CW; +} + +static enum gen_cull_mode +ilo_translate_cull_face(unsigned cull_face) +{ + switch (cull_face) { + case PIPE_FACE_NONE: return GEN6_CULLMODE_NONE; + case PIPE_FACE_FRONT: return GEN6_CULLMODE_FRONT; + case PIPE_FACE_BACK: return GEN6_CULLMODE_BACK; + case PIPE_FACE_FRONT_AND_BACK: return GEN6_CULLMODE_BOTH; + default: + assert(!"unknown face culling"); + return GEN6_CULLMODE_NONE; + } +} + +static enum gen_fill_mode +ilo_translate_poly_mode(unsigned poly_mode) +{ + switch (poly_mode) { + case PIPE_POLYGON_MODE_FILL: return GEN6_FILLMODE_SOLID; + case PIPE_POLYGON_MODE_LINE: return GEN6_FILLMODE_WIREFRAME; + case PIPE_POLYGON_MODE_POINT: return GEN6_FILLMODE_POINT; + default: + assert(!"unknown polygon mode"); + return GEN6_FILLMODE_SOLID; + } +} + +static enum gen_pixel_location +ilo_translate_half_pixel_center(bool half_pixel_center) +{ + return (half_pixel_center) ? 
GEN6_PIXLOC_CENTER : GEN6_PIXLOC_UL_CORNER; +} + +static enum gen_compare_function +ilo_translate_compare_func(unsigned func) +{ + switch (func) { + case PIPE_FUNC_NEVER: return GEN6_COMPAREFUNCTION_NEVER; + case PIPE_FUNC_LESS: return GEN6_COMPAREFUNCTION_LESS; + case PIPE_FUNC_EQUAL: return GEN6_COMPAREFUNCTION_EQUAL; + case PIPE_FUNC_LEQUAL: return GEN6_COMPAREFUNCTION_LEQUAL; + case PIPE_FUNC_GREATER: return GEN6_COMPAREFUNCTION_GREATER; + case PIPE_FUNC_NOTEQUAL: return GEN6_COMPAREFUNCTION_NOTEQUAL; + case PIPE_FUNC_GEQUAL: return GEN6_COMPAREFUNCTION_GEQUAL; + case PIPE_FUNC_ALWAYS: return GEN6_COMPAREFUNCTION_ALWAYS; + default: + assert(!"unknown compare function"); + return GEN6_COMPAREFUNCTION_NEVER; + } +} + +static enum gen_stencil_op +ilo_translate_stencil_op(unsigned stencil_op) +{ + switch (stencil_op) { + case PIPE_STENCIL_OP_KEEP: return GEN6_STENCILOP_KEEP; + case PIPE_STENCIL_OP_ZERO: return GEN6_STENCILOP_ZERO; + case PIPE_STENCIL_OP_REPLACE: return GEN6_STENCILOP_REPLACE; + case PIPE_STENCIL_OP_INCR: return GEN6_STENCILOP_INCRSAT; + case PIPE_STENCIL_OP_DECR: return GEN6_STENCILOP_DECRSAT; + case PIPE_STENCIL_OP_INCR_WRAP: return GEN6_STENCILOP_INCR; + case PIPE_STENCIL_OP_DECR_WRAP: return GEN6_STENCILOP_DECR; + case PIPE_STENCIL_OP_INVERT: return GEN6_STENCILOP_INVERT; + default: + assert(!"unknown stencil op"); + return GEN6_STENCILOP_KEEP; + } +} + +static enum gen_logic_op +ilo_translate_logicop(unsigned logicop) +{ + switch (logicop) { + case PIPE_LOGICOP_CLEAR: return GEN6_LOGICOP_CLEAR; + case PIPE_LOGICOP_NOR: return GEN6_LOGICOP_NOR; + case PIPE_LOGICOP_AND_INVERTED: return GEN6_LOGICOP_AND_INVERTED; + case PIPE_LOGICOP_COPY_INVERTED: return GEN6_LOGICOP_COPY_INVERTED; + case PIPE_LOGICOP_AND_REVERSE: return GEN6_LOGICOP_AND_REVERSE; + case PIPE_LOGICOP_INVERT: return GEN6_LOGICOP_INVERT; + case PIPE_LOGICOP_XOR: return GEN6_LOGICOP_XOR; + case PIPE_LOGICOP_NAND: return GEN6_LOGICOP_NAND; + case PIPE_LOGICOP_AND: return GEN6_LOGICOP_AND; + case PIPE_LOGICOP_EQUIV: return GEN6_LOGICOP_EQUIV; + case PIPE_LOGICOP_NOOP: return GEN6_LOGICOP_NOOP; + case PIPE_LOGICOP_OR_INVERTED: return GEN6_LOGICOP_OR_INVERTED; + case PIPE_LOGICOP_COPY: return GEN6_LOGICOP_COPY; + case PIPE_LOGICOP_OR_REVERSE: return GEN6_LOGICOP_OR_REVERSE; + case PIPE_LOGICOP_OR: return GEN6_LOGICOP_OR; + case PIPE_LOGICOP_SET: return GEN6_LOGICOP_SET; + default: + assert(!"unknown logicop function"); + return GEN6_LOGICOP_CLEAR; + } +} + +static int +ilo_translate_blend_func(unsigned blend) +{ + switch (blend) { + case PIPE_BLEND_ADD: return GEN6_BLENDFUNCTION_ADD; + case PIPE_BLEND_SUBTRACT: return GEN6_BLENDFUNCTION_SUBTRACT; + case PIPE_BLEND_REVERSE_SUBTRACT: return GEN6_BLENDFUNCTION_REVERSE_SUBTRACT; + case PIPE_BLEND_MIN: return GEN6_BLENDFUNCTION_MIN; + case PIPE_BLEND_MAX: return GEN6_BLENDFUNCTION_MAX; + default: + assert(!"unknown blend function"); + return GEN6_BLENDFUNCTION_ADD; + } +} + +static int +ilo_translate_blend_factor(unsigned factor) +{ + switch (factor) { + case PIPE_BLENDFACTOR_ONE: return GEN6_BLENDFACTOR_ONE; + case PIPE_BLENDFACTOR_SRC_COLOR: return GEN6_BLENDFACTOR_SRC_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA: return GEN6_BLENDFACTOR_SRC_ALPHA; + case PIPE_BLENDFACTOR_DST_ALPHA: return GEN6_BLENDFACTOR_DST_ALPHA; + case PIPE_BLENDFACTOR_DST_COLOR: return GEN6_BLENDFACTOR_DST_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: return GEN6_BLENDFACTOR_SRC_ALPHA_SATURATE; + case PIPE_BLENDFACTOR_CONST_COLOR: return GEN6_BLENDFACTOR_CONST_COLOR; + case 
PIPE_BLENDFACTOR_CONST_ALPHA: return GEN6_BLENDFACTOR_CONST_ALPHA; + case PIPE_BLENDFACTOR_SRC1_COLOR: return GEN6_BLENDFACTOR_SRC1_COLOR; + case PIPE_BLENDFACTOR_SRC1_ALPHA: return GEN6_BLENDFACTOR_SRC1_ALPHA; + case PIPE_BLENDFACTOR_ZERO: return GEN6_BLENDFACTOR_ZERO; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: return GEN6_BLENDFACTOR_INV_SRC_COLOR; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: return GEN6_BLENDFACTOR_INV_SRC_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: return GEN6_BLENDFACTOR_INV_DST_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_COLOR: return GEN6_BLENDFACTOR_INV_DST_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: return GEN6_BLENDFACTOR_INV_CONST_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: return GEN6_BLENDFACTOR_INV_CONST_ALPHA; + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: return GEN6_BLENDFACTOR_INV_SRC1_COLOR; + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: return GEN6_BLENDFACTOR_INV_SRC1_ALPHA; + default: + assert(!"unknown blend factor"); + return GEN6_BLENDFACTOR_ONE; + } +} + static void finalize_shader_states(struct ilo_state_vector *vec) { @@ -78,7 +350,7 @@ finalize_shader_states(struct ilo_state_vector *vec) /* need to setup SBE for FS */ if (type == PIPE_SHADER_FRAGMENT && vec->dirty & (state | ILO_DIRTY_GS | ILO_DIRTY_VS | ILO_DIRTY_RASTERIZER)) { - if (ilo_shader_select_kernel_routing(shader, + if (ilo_shader_select_kernel_sbe(shader, (vec->gs) ? vec->gs : vec->vs, vec->rasterizer)) vec->dirty |= state; } @@ -97,7 +369,6 @@ finalize_cbuf_state(struct ilo_context *ilo, ~ilo_shader_get_kernel_param(sh, ILO_KERNEL_SKIP_CBUF0_UPLOAD); while (upload_mask) { - const enum pipe_format elem_format = PIPE_FORMAT_R32G32B32A32_FLOAT; unsigned offset, i; i = u_bit_scan(&upload_mask); @@ -105,14 +376,16 @@ finalize_cbuf_state(struct ilo_context *ilo, if (cbuf->cso[i].resource) continue; - u_upload_data(ilo->uploader, 0, cbuf->cso[i].user_buffer_size, + u_upload_data(ilo->uploader, 0, cbuf->cso[i].info.size, cbuf->cso[i].user_buffer, &offset, &cbuf->cso[i].resource); - ilo_gpe_init_view_surface_for_buffer(ilo->dev, - ilo_buffer(cbuf->cso[i].resource), - offset, cbuf->cso[i].user_buffer_size, - util_format_get_blocksize(elem_format), elem_format, - false, false, &cbuf->cso[i].surface); + cbuf->cso[i].info.buf = ilo_buffer(cbuf->cso[i].resource); + cbuf->cso[i].info.offset = offset; + + memset(&cbuf->cso[i].surface, 0, sizeof(cbuf->cso[i].surface)); + ilo_state_surface_init_for_buffer(&cbuf->cso[i].surface, + ilo->dev, &cbuf->cso[i].info); + cbuf->cso[i].surface.bo = cbuf->cso[i].info.buf->bo; ilo->state_vector.dirty |= ILO_DIRTY_CBUF; } @@ -133,114 +406,380 @@ finalize_constant_buffers(struct ilo_context *ilo) static void finalize_index_buffer(struct ilo_context *ilo) { + const struct ilo_dev *dev = ilo->dev; struct ilo_state_vector *vec = &ilo->state_vector; const bool need_upload = (vec->draw->indexed && - (vec->ib.user_buffer || vec->ib.offset % vec->ib.index_size)); + (vec->ib.state.user_buffer || + vec->ib.state.offset % vec->ib.state.index_size)); struct pipe_resource *current_hw_res = NULL; + struct ilo_state_index_buffer_info info; + int64_t vertex_start_bias = 0; if (!(vec->dirty & ILO_DIRTY_IB) && !need_upload) return; + /* make sure vec->ib.hw_resource changes when reallocated */ pipe_resource_reference(¤t_hw_res, vec->ib.hw_resource); if (need_upload) { - const unsigned offset = vec->ib.index_size * vec->draw->start; - const unsigned size = vec->ib.index_size * vec->draw->count; + const unsigned offset = vec->ib.state.index_size * vec->draw->start; + const unsigned size = 
vec->ib.state.index_size * vec->draw->count; unsigned hw_offset; - if (vec->ib.user_buffer) { + if (vec->ib.state.user_buffer) { u_upload_data(ilo->uploader, 0, size, - vec->ib.user_buffer + offset, &hw_offset, &vec->ib.hw_resource); - } - else { - u_upload_buffer(ilo->uploader, 0, vec->ib.offset + offset, size, - vec->ib.buffer, &hw_offset, &vec->ib.hw_resource); + vec->ib.state.user_buffer + offset, + &hw_offset, &vec->ib.hw_resource); + } else { + u_upload_buffer(ilo->uploader, 0, + vec->ib.state.offset + offset, size, vec->ib.state.buffer, + &hw_offset, &vec->ib.hw_resource); } /* the HW offset should be aligned */ - assert(hw_offset % vec->ib.index_size == 0); - vec->ib.draw_start_offset = hw_offset / vec->ib.index_size; + assert(hw_offset % vec->ib.state.index_size == 0); + vertex_start_bias = hw_offset / vec->ib.state.index_size; /* * INDEX[vec->draw->start] in the original buffer is INDEX[0] in the HW * resource */ - vec->ib.draw_start_offset -= vec->draw->start; - } - else { - pipe_resource_reference(&vec->ib.hw_resource, vec->ib.buffer); + vertex_start_bias -= vec->draw->start; + } else { + pipe_resource_reference(&vec->ib.hw_resource, vec->ib.state.buffer); /* note that index size may be zero when the draw is not indexed */ if (vec->draw->indexed) - vec->ib.draw_start_offset = vec->ib.offset / vec->ib.index_size; - else - vec->ib.draw_start_offset = 0; + vertex_start_bias = vec->ib.state.offset / vec->ib.state.index_size; } + vec->draw_info.vertex_start += vertex_start_bias; + /* treat the IB as clean if the HW states do not change */ if (vec->ib.hw_resource == current_hw_res && - vec->ib.hw_index_size == vec->ib.index_size) + vec->ib.hw_index_size == vec->ib.state.index_size) vec->dirty &= ~ILO_DIRTY_IB; else - vec->ib.hw_index_size = vec->ib.index_size; + vec->ib.hw_index_size = vec->ib.state.index_size; pipe_resource_reference(¤t_hw_res, NULL); + + memset(&info, 0, sizeof(info)); + if (vec->ib.hw_resource) { + info.buf = ilo_buffer(vec->ib.hw_resource); + info.size = info.buf->bo_size; + info.format = ilo_translate_index_size(vec->ib.hw_index_size); + + vec->ib.ib.bo = info.buf->bo; + } + + ilo_state_index_buffer_set_info(&vec->ib.ib, dev, &info); } static void finalize_vertex_elements(struct ilo_context *ilo) { + const struct ilo_dev *dev = ilo->dev; + struct ilo_state_vector *vec = &ilo->state_vector; + struct ilo_ve_state *ve = vec->ve; + const bool last_element_edge_flag = (vec->vs && + ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_INPUT_EDGEFLAG)); + const bool prepend_vertexid = (vec->vs && + ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_INPUT_VERTEXID)); + const bool prepend_instanceid = (vec->vs && + ilo_shader_get_kernel_param(vec->vs, + ILO_KERNEL_VS_INPUT_INSTANCEID)); + const enum gen_index_format index_format = (vec->draw->indexed) ? 
+ ilo_translate_index_size(vec->ib.state.index_size) : GEN6_INDEX_DWORD; + + /* check for non-orthogonal states */ + if (ve->vf_params.cv_topology != vec->draw_info.topology || + ve->vf_params.prepend_vertexid != prepend_vertexid || + ve->vf_params.prepend_instanceid != prepend_instanceid || + ve->vf_params.last_element_edge_flag != last_element_edge_flag || + ve->vf_params.cv_index_format != index_format || + ve->vf_params.cut_index_enable != vec->draw->primitive_restart || + ve->vf_params.cut_index != vec->draw->restart_index) { + ve->vf_params.cv_topology = vec->draw_info.topology; + ve->vf_params.prepend_vertexid = prepend_vertexid; + ve->vf_params.prepend_instanceid = prepend_instanceid; + ve->vf_params.last_element_edge_flag = last_element_edge_flag; + ve->vf_params.cv_index_format = index_format; + ve->vf_params.cut_index_enable = vec->draw->primitive_restart; + ve->vf_params.cut_index = vec->draw->restart_index; + + ilo_state_vf_set_params(&ve->vf, dev, &ve->vf_params); + + vec->dirty |= ILO_DIRTY_VE; + } +} + +static void +finalize_vertex_buffers(struct ilo_context *ilo) +{ + const struct ilo_dev *dev = ilo->dev; struct ilo_state_vector *vec = &ilo->state_vector; + struct ilo_state_vertex_buffer_info info; + unsigned i; - if (!(vec->dirty & (ILO_DIRTY_VE | ILO_DIRTY_VS))) + if (!(vec->dirty & (ILO_DIRTY_VE | ILO_DIRTY_VB))) return; - vec->dirty |= ILO_DIRTY_VE; + memset(&info, 0, sizeof(info)); + + for (i = 0; i < vec->ve->vb_count; i++) { + const unsigned pipe_idx = vec->ve->vb_mapping[i]; + const struct pipe_vertex_buffer *cso = &vec->vb.states[pipe_idx]; + + if (cso->buffer) { + info.buf = ilo_buffer(cso->buffer); + info.offset = cso->buffer_offset; + info.size = info.buf->bo_size; + + info.stride = cso->stride; + + vec->vb.vb[i].bo = info.buf->bo; + } else { + memset(&info, 0, sizeof(info)); + } + + ilo_state_vertex_buffer_set_info(&vec->vb.vb[i], dev, &info); + } +} + +static void +finalize_urb(struct ilo_context *ilo) +{ + const uint16_t attr_size = sizeof(uint32_t) * 4; + const struct ilo_dev *dev = ilo->dev; + struct ilo_state_vector *vec = &ilo->state_vector; + struct ilo_state_urb_info info; + + if (!(vec->dirty & (ILO_DIRTY_VE | ILO_DIRTY_VS | + ILO_DIRTY_GS | ILO_DIRTY_FS))) + return; + + memset(&info, 0, sizeof(info)); + + info.ve_entry_size = attr_size * ilo_state_vf_get_attr_count(&vec->ve->vf); + + if (vec->vs) { + info.vs_const_data = (bool) + (ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_PCB_CBUF0_SIZE) + + ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_PCB_UCP_SIZE)); + info.vs_entry_size = attr_size * + ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_OUTPUT_COUNT); + } + + if (vec->gs) { + info.gs_const_data = (bool) + ilo_shader_get_kernel_param(vec->gs, ILO_KERNEL_PCB_CBUF0_SIZE); - vec->ve->last_cso_edgeflag = false; - if (vec->ve->count && vec->vs && - ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_INPUT_EDGEFLAG)) { - vec->ve->edgeflag_cso = vec->ve->cso[vec->ve->count - 1]; - ilo_gpe_set_ve_edgeflag(ilo->dev, &vec->ve->edgeflag_cso); - vec->ve->last_cso_edgeflag = true; - } - - vec->ve->prepend_nosrc_cso = false; - if (vec->vs && - (ilo_shader_get_kernel_param(vec->vs, - ILO_KERNEL_VS_INPUT_INSTANCEID) || - ilo_shader_get_kernel_param(vec->vs, - ILO_KERNEL_VS_INPUT_VERTEXID))) { - ilo_gpe_init_ve_nosrc(ilo->dev, - GEN6_VFCOMP_STORE_VID, - GEN6_VFCOMP_STORE_IID, - GEN6_VFCOMP_NOSTORE, - GEN6_VFCOMP_NOSTORE, - &vec->ve->nosrc_cso); - vec->ve->prepend_nosrc_cso = true; - } else if (!vec->vs) { - /* generate VUE header */ - 
ilo_gpe_init_ve_nosrc(ilo->dev, - GEN6_VFCOMP_STORE_0, /* Reserved */ - GEN6_VFCOMP_STORE_0, /* Render Target Array Index */ - GEN6_VFCOMP_STORE_0, /* Viewport Index */ - GEN6_VFCOMP_STORE_0, /* Point Width */ - &vec->ve->nosrc_cso); - vec->ve->prepend_nosrc_cso = true; - } else if (!vec->ve->count) { /* - * From the Sandy Bridge PRM, volume 2 part 1, page 92: + * From the Ivy Bridge PRM, volume 2 part 1, page 189: + * + * "All outputs of a GS thread will be stored in the single GS + * thread output URB entry." * - "SW must ensure that at least one vertex element is defined prior - * to issuing a 3DPRIMTIVE command, or operation is UNDEFINED." + * TODO */ - ilo_gpe_init_ve_nosrc(ilo->dev, - GEN6_VFCOMP_STORE_0, - GEN6_VFCOMP_STORE_0, - GEN6_VFCOMP_STORE_0, - GEN6_VFCOMP_STORE_1_FP, - &vec->ve->nosrc_cso); - vec->ve->prepend_nosrc_cso = true; + info.gs_entry_size = attr_size * + ilo_shader_get_kernel_param(vec->gs, ILO_KERNEL_OUTPUT_COUNT); + } + + if (vec->fs) { + info.ps_const_data = (bool) + ilo_shader_get_kernel_param(vec->fs, ILO_KERNEL_PCB_CBUF0_SIZE); + } + + ilo_state_urb_set_info(&vec->urb, dev, &info); +} + +static void +finalize_viewport(struct ilo_context *ilo) +{ + const struct ilo_dev *dev = ilo->dev; + struct ilo_state_vector *vec = &ilo->state_vector; + + if (vec->dirty & ILO_DIRTY_VIEWPORT) { + ilo_state_viewport_set_params(&vec->viewport.vp, + dev, &vec->viewport.params, false); + } else if (vec->dirty & ILO_DIRTY_SCISSOR) { + ilo_state_viewport_set_params(&vec->viewport.vp, + dev, &vec->viewport.params, true); + vec->dirty |= ILO_DIRTY_VIEWPORT; + } +} + +static bool +can_enable_gb_test(const struct ilo_rasterizer_state *rasterizer, + const struct ilo_viewport_state *viewport, + const struct ilo_fb_state *fb) +{ + unsigned i; + + /* + * There are several reasons why the guard band test should be disabled + * + * - GL wide points (to avoid partially visible objects) + * - GL wide or AA lines (to avoid partially visible objects) + * - missing 2D clipping + */ + if (rasterizer->state.point_size_per_vertex || + rasterizer->state.point_size > 1.0f || + rasterizer->state.line_width > 1.0f || + rasterizer->state.line_smooth) + return false; + + for (i = 0; i < viewport->params.count; i++) { + const struct ilo_state_viewport_matrix_info *mat = + &viewport->matrices[i]; + float min_x, max_x, min_y, max_y; + + min_x = -1.0f * fabsf(mat->scale[0]) + mat->translate[0]; + max_x = 1.0f * fabsf(mat->scale[0]) + mat->translate[0]; + min_y = -1.0f * fabsf(mat->scale[1]) + mat->translate[1]; + max_y = 1.0f * fabsf(mat->scale[1]) + mat->translate[1]; + + if (min_x > 0.0f || max_x < fb->state.width || + min_y > 0.0f || max_y < fb->state.height) + return false; + } + + return true; +} + +static void +finalize_rasterizer(struct ilo_context *ilo) +{ + const struct ilo_dev *dev = ilo->dev; + struct ilo_state_vector *vec = &ilo->state_vector; + struct ilo_rasterizer_state *rasterizer = vec->rasterizer; + struct ilo_state_raster_info *info = &vec->rasterizer->info; + const bool gb_test_enable = + can_enable_gb_test(rasterizer, &vec->viewport, &vec->fb); + const bool multisample = + (rasterizer->state.multisample && vec->fb.num_samples > 1); + const uint8_t barycentric_interps = ilo_shader_get_kernel_param(vec->fs, + ILO_KERNEL_FS_BARYCENTRIC_INTERPOLATIONS); + + /* check for non-orthogonal states */ + if (info->clip.viewport_count != vec->viewport.params.count || + info->clip.gb_test_enable != gb_test_enable || + info->setup.msaa_enable != multisample || + info->setup.line_msaa_enable != multisample
|| + info->tri.depth_offset_format != vec->fb.depth_offset_format || + info->scan.sample_count != vec->fb.num_samples || + info->scan.sample_mask != vec->sample_mask || + info->scan.barycentric_interps != barycentric_interps || + info->params.any_integer_rt != vec->fb.has_integer_rt || + info->params.hiz_enable != vec->fb.has_hiz) { + info->clip.viewport_count = vec->viewport.params.count; + info->clip.gb_test_enable = gb_test_enable; + info->setup.msaa_enable = multisample; + info->setup.line_msaa_enable = multisample; + info->tri.depth_offset_format = vec->fb.depth_offset_format; + info->scan.sample_count = vec->fb.num_samples; + info->scan.sample_mask = vec->sample_mask; + info->scan.barycentric_interps = barycentric_interps; + info->params.any_integer_rt = vec->fb.has_integer_rt; + info->params.hiz_enable = vec->fb.has_hiz; + + ilo_state_raster_set_info(&rasterizer->rs, dev, &rasterizer->info); + + vec->dirty |= ILO_DIRTY_RASTERIZER; + } +} + +static bool +finalize_blend_rt(struct ilo_context *ilo) +{ + struct ilo_state_vector *vec = &ilo->state_vector; + const struct ilo_fb_state *fb = &vec->fb; + struct ilo_blend_state *blend = vec->blend; + struct ilo_state_cc_blend_info *info = &vec->blend->info.blend; + bool changed = false; + unsigned i; + + if (!(vec->dirty & (ILO_DIRTY_FB | ILO_DIRTY_BLEND))) + return false; + + /* set up one for dummy RT writes */ + if (!fb->state.nr_cbufs) { + if (info->rt != &blend->dummy_rt) { + info->rt = &blend->dummy_rt; + info->rt_count = 1; + changed = true; + } + + return changed; + } + + if (info->rt != blend->effective_rt || + info->rt_count != fb->state.nr_cbufs) { + info->rt = blend->effective_rt; + info->rt_count = fb->state.nr_cbufs; + changed = true; + } + + for (i = 0; i < fb->state.nr_cbufs; i++) { + const struct ilo_fb_blend_caps *caps = &fb->blend_caps[i]; + struct ilo_state_cc_blend_rt_info *rt = &blend->effective_rt[i]; + /* ignore logicop when not UNORM */ + const bool logicop_enable = + (blend->rt[i].logicop_enable && caps->is_unorm); + + if (rt->cv_is_unorm != caps->is_unorm || + rt->cv_is_integer != caps->is_integer || + rt->logicop_enable != logicop_enable || + rt->force_dst_alpha_one != caps->force_dst_alpha_one) { + rt->cv_is_unorm = caps->is_unorm; + rt->cv_is_integer = caps->is_integer; + rt->logicop_enable = logicop_enable; + rt->force_dst_alpha_one = caps->force_dst_alpha_one; + + changed = true; + } + } + + return changed; +} + +static void +finalize_blend(struct ilo_context *ilo) +{ + const struct ilo_dev *dev = ilo->dev; + struct ilo_state_vector *vec = &ilo->state_vector; + struct ilo_blend_state *blend = vec->blend; + struct ilo_state_cc_info *info = &blend->info; + const bool sample_count_one = (vec->fb.num_samples <= 1); + const bool float_source0_alpha = + (!vec->fb.state.nr_cbufs || !vec->fb.state.cbufs[0] || + !util_format_is_pure_integer(vec->fb.state.cbufs[0]->format)); + + /* check for non-orthogonal states */ + if (finalize_blend_rt(ilo) || + info->alpha.cv_sample_count_one != sample_count_one || + info->alpha.cv_float_source0_alpha != float_source0_alpha || + info->alpha.test_enable != vec->dsa->alpha_test || + info->alpha.test_func != vec->dsa->alpha_func || + memcmp(&info->stencil, &vec->dsa->stencil, sizeof(info->stencil)) || + memcmp(&info->depth, &vec->dsa->depth, sizeof(info->depth)) || + memcmp(&info->params, &vec->cc_params, sizeof(info->params))) { + info->alpha.cv_sample_count_one = sample_count_one; + info->alpha.cv_float_source0_alpha = float_source0_alpha; + info->alpha.test_enable = 
vec->dsa->alpha_test; + info->alpha.test_func = vec->dsa->alpha_func; + info->stencil = vec->dsa->stencil; + info->depth = vec->dsa->depth; + info->params = vec->cc_params; + + ilo_state_cc_set_info(&blend->cc, dev, info); + + blend->alpha_may_kill = (info->alpha.alpha_to_coverage || + info->alpha.test_enable); + + vec->dirty |= ILO_DIRTY_BLEND; } } @@ -254,10 +793,24 @@ ilo_finalize_3d_states(struct ilo_context *ilo, { ilo->state_vector.draw = draw; + ilo->state_vector.draw_info.topology = ilo_translate_draw_mode(draw->mode); + ilo->state_vector.draw_info.indexed = draw->indexed; + ilo->state_vector.draw_info.vertex_count = draw->count; + ilo->state_vector.draw_info.vertex_start = draw->start; + ilo->state_vector.draw_info.instance_count = draw->instance_count; + ilo->state_vector.draw_info.instance_start = draw->start_instance; + ilo->state_vector.draw_info.vertex_base = draw->index_bias; + + finalize_blend(ilo); finalize_shader_states(&ilo->state_vector); finalize_constant_buffers(ilo); finalize_index_buffer(ilo); finalize_vertex_elements(ilo); + finalize_vertex_buffers(ilo); + + finalize_urb(ilo); + finalize_rasterizer(ilo); + finalize_viewport(ilo); u_upload_unmap(ilo->uploader); } @@ -301,12 +854,79 @@ ilo_create_blend_state(struct pipe_context *pipe, const struct pipe_blend_state *state) { const struct ilo_dev *dev = ilo_context(pipe)->dev; + struct ilo_state_cc_info *info; struct ilo_blend_state *blend; + int i; - blend = MALLOC_STRUCT(ilo_blend_state); + blend = CALLOC_STRUCT(ilo_blend_state); assert(blend); - ilo_gpe_init_blend(dev, state, blend); + info = &blend->info; + + info->alpha.cv_float_source0_alpha = true; + info->alpha.cv_sample_count_one = true; + info->alpha.alpha_to_one = state->alpha_to_one; + info->alpha.alpha_to_coverage = state->alpha_to_coverage; + info->alpha.test_enable = false; + info->alpha.test_func = GEN6_COMPAREFUNCTION_ALWAYS; + + info->stencil.cv_has_buffer = true; + info->depth.cv_has_buffer = true; + + info->blend.rt = blend->effective_rt; + info->blend.rt_count = 1; + info->blend.dither_enable = state->dither; + + for (i = 0; i < ARRAY_SIZE(blend->rt); i++) { + const struct pipe_rt_blend_state *rt = &state->rt[i]; + struct ilo_state_cc_blend_rt_info *rt_info = &blend->rt[i]; + + rt_info->cv_has_buffer = true; + rt_info->cv_is_unorm = true; + rt_info->cv_is_integer = false; + + /* logic op takes precedence over blending */ + if (state->logicop_enable) { + rt_info->logicop_enable = true; + rt_info->logicop_func = ilo_translate_logicop(state->logicop_func); + } else if (rt->blend_enable) { + rt_info->blend_enable = true; + + rt_info->rgb_src = ilo_translate_blend_factor(rt->rgb_src_factor); + rt_info->rgb_dst = ilo_translate_blend_factor(rt->rgb_dst_factor); + rt_info->rgb_func = ilo_translate_blend_func(rt->rgb_func); + + rt_info->a_src = ilo_translate_blend_factor(rt->alpha_src_factor); + rt_info->a_dst = ilo_translate_blend_factor(rt->alpha_dst_factor); + rt_info->a_func = ilo_translate_blend_func(rt->alpha_func); + } + + if (!(rt->colormask & PIPE_MASK_A)) + rt_info->argb_write_disables |= (1 << 3); + if (!(rt->colormask & PIPE_MASK_R)) + rt_info->argb_write_disables |= (1 << 2); + if (!(rt->colormask & PIPE_MASK_G)) + rt_info->argb_write_disables |= (1 << 1); + if (!(rt->colormask & PIPE_MASK_B)) + rt_info->argb_write_disables |= (1 << 0); + + if (!state->independent_blend_enable) { + for (i = 1; i < ARRAY_SIZE(blend->rt); i++) + blend->rt[i] = *rt_info; + break; + } + } + + memcpy(blend->effective_rt, blend->rt, sizeof(blend->rt)); + +
blend->dummy_rt.argb_write_disables = 0xf; + + if (!ilo_state_cc_init(&blend->cc, dev, &blend->info)) { + FREE(blend); + return NULL; + } + + blend->dual_blend = util_blend_state_is_dual(state, 0); return blend; } @@ -333,11 +953,105 @@ ilo_create_sampler_state(struct pipe_context *pipe, { const struct ilo_dev *dev = ilo_context(pipe)->dev; struct ilo_sampler_cso *sampler; + struct ilo_state_sampler_info info; + struct ilo_state_sampler_border_info border; - sampler = MALLOC_STRUCT(ilo_sampler_cso); + sampler = CALLOC_STRUCT(ilo_sampler_cso); assert(sampler); - ilo_gpe_init_sampler_cso(dev, state, sampler); + memset(&info, 0, sizeof(info)); + + info.non_normalized = !state->normalized_coords; + if (state->normalized_coords) { + info.lod_bias = state->lod_bias; + info.min_lod = state->min_lod; + info.max_lod = state->max_lod; + + info.mip_filter = ilo_translate_mip_filter(state->min_mip_filter); + } else { + /* work around a bug in util_blitter */ + info.mip_filter = GEN6_MIPFILTER_NONE; + } + + if (state->max_anisotropy) { + info.min_filter = GEN6_MAPFILTER_ANISOTROPIC; + info.mag_filter = GEN6_MAPFILTER_ANISOTROPIC; + } else { + info.min_filter = ilo_translate_img_filter(state->min_img_filter); + info.mag_filter = ilo_translate_img_filter(state->mag_img_filter); + } + + info.max_anisotropy = ilo_translate_max_anisotropy(state->max_anisotropy); + + /* use LOD 0 when no mipmapping (see sampler_set_gen6_SAMPLER_STATE()) */ + if (info.mip_filter == GEN6_MIPFILTER_NONE && info.min_lod > 0.0f) { + info.min_lod = 0.0f; + info.mag_filter = info.min_filter; + } + + if (state->seamless_cube_map) { + if (state->min_img_filter == PIPE_TEX_FILTER_NEAREST || + state->mag_img_filter == PIPE_TEX_FILTER_NEAREST) { + info.tcx_ctrl = GEN6_TEXCOORDMODE_CLAMP; + info.tcy_ctrl = GEN6_TEXCOORDMODE_CLAMP; + info.tcz_ctrl = GEN6_TEXCOORDMODE_CLAMP; + } else { + info.tcx_ctrl = GEN6_TEXCOORDMODE_CUBE; + info.tcy_ctrl = GEN6_TEXCOORDMODE_CUBE; + info.tcz_ctrl = GEN6_TEXCOORDMODE_CUBE; + } + } else { + info.tcx_ctrl = ilo_translate_address_wrap(state->wrap_s); + info.tcy_ctrl = ilo_translate_address_wrap(state->wrap_t); + info.tcz_ctrl = ilo_translate_address_wrap(state->wrap_r); + + if (ilo_dev_gen(dev) < ILO_GEN(8)) { + /* + * For nearest filtering, PIPE_TEX_WRAP_CLAMP means + * PIPE_TEX_WRAP_CLAMP_TO_EDGE; for linear filtering, + * PIPE_TEX_WRAP_CLAMP means PIPE_TEX_WRAP_CLAMP_TO_BORDER while + * additionally clamping the texture coordinates to [0.0, 1.0]. + * + * PIPE_TEX_WRAP_CLAMP is not supported natively until Gen8. The + * clamping has to be taken care of in the shaders. There are two + * filters here, but let the minification one have the say.
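The shader-side clamping deferred to above amounts to saturating each texture coordinate before the sample instruction; a minimal sketch under that assumption (not the driver's actual shader lowering):

    /* once CLAMP is lowered to CLAMP_TO_BORDER, the shader first clamps
     * each coordinate to [0.0, 1.0] to recover CLAMP semantics */
    static float saturate(float v)
    {
       return (v < 0.0f) ? 0.0f : (v > 1.0f) ? 1.0f : v;
    }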
+ */ + const bool clamp_is_to_edge = + (state->min_img_filter == PIPE_TEX_FILTER_NEAREST); + + if (clamp_is_to_edge) { + if (info.tcx_ctrl == GEN8_TEXCOORDMODE_HALF_BORDER) + info.tcx_ctrl = GEN6_TEXCOORDMODE_CLAMP; + if (info.tcy_ctrl == GEN8_TEXCOORDMODE_HALF_BORDER) + info.tcy_ctrl = GEN6_TEXCOORDMODE_CLAMP; + if (info.tcz_ctrl == GEN8_TEXCOORDMODE_HALF_BORDER) + info.tcz_ctrl = GEN6_TEXCOORDMODE_CLAMP; + } else { + if (info.tcx_ctrl == GEN8_TEXCOORDMODE_HALF_BORDER) { + info.tcx_ctrl = GEN6_TEXCOORDMODE_CLAMP_BORDER; + sampler->saturate_s = true; + } + if (info.tcy_ctrl == GEN8_TEXCOORDMODE_HALF_BORDER) { + info.tcy_ctrl = GEN6_TEXCOORDMODE_CLAMP_BORDER; + sampler->saturate_t = true; + } + if (info.tcz_ctrl == GEN8_TEXCOORDMODE_HALF_BORDER) { + info.tcz_ctrl = GEN6_TEXCOORDMODE_CLAMP_BORDER; + sampler->saturate_r = true; + } + } + } + } + + if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) + info.shadow_func = ilo_translate_shadow_func(state->compare_func); + + ilo_state_sampler_init(&sampler->sampler, dev, &info); + + memset(&border, 0, sizeof(border)); + memcpy(border.rgba.f, state->border_color.f, sizeof(border.rgba.f)); + + ilo_state_sampler_border_init(&sampler->border, dev, &border); return sampler; } @@ -403,12 +1117,74 @@ ilo_create_rasterizer_state(struct pipe_context *pipe, { const struct ilo_dev *dev = ilo_context(pipe)->dev; struct ilo_rasterizer_state *rast; + struct ilo_state_raster_info *info; - rast = MALLOC_STRUCT(ilo_rasterizer_state); + rast = CALLOC_STRUCT(ilo_rasterizer_state); assert(rast); rast->state = *state; - ilo_gpe_init_rasterizer(dev, state, rast); + + info = &rast->info; + + info->clip.clip_enable = true; + info->clip.stats_enable = true; + info->clip.viewport_count = 1; + info->clip.force_rtaindex_zero = true; + info->clip.user_clip_enables = state->clip_plane_enable; + info->clip.gb_test_enable = true; + info->clip.xy_test_enable = true; + info->clip.z_far_enable = state->depth_clip; + info->clip.z_near_enable = state->depth_clip; + info->clip.z_near_zero = state->clip_halfz; + + info->setup.first_vertex_provoking = state->flatshade_first; + info->setup.viewport_transform = true; + info->setup.scissor_enable = state->scissor; + info->setup.msaa_enable = false; + info->setup.line_msaa_enable = false; + info->point.aa_enable = state->point_smooth; + info->point.programmable_width = state->point_size_per_vertex; + info->line.aa_enable = state->line_smooth; + info->line.stipple_enable = state->line_stipple_enable; + info->line.giq_enable = true; + info->line.giq_last_pixel = state->line_last_pixel; + info->tri.front_winding = ilo_translate_front_ccw(state->front_ccw); + info->tri.cull_mode = ilo_translate_cull_face(state->cull_face); + info->tri.fill_mode_front = ilo_translate_poly_mode(state->fill_front); + info->tri.fill_mode_back = ilo_translate_poly_mode(state->fill_back); + info->tri.depth_offset_format = GEN6_ZFORMAT_D24_UNORM_X8_UINT; + info->tri.depth_offset_solid = state->offset_tri; + info->tri.depth_offset_wireframe = state->offset_line; + info->tri.depth_offset_point = state->offset_point; + info->tri.poly_stipple_enable = state->poly_stipple_enable; + + info->scan.stats_enable = true; + info->scan.sample_count = 1; + info->scan.pixloc = + ilo_translate_half_pixel_center(state->half_pixel_center); + info->scan.sample_mask = ~0u; + info->scan.zw_interp = GEN6_ZW_INTERP_PIXEL; + info->scan.barycentric_interps = GEN6_INTERP_PERSPECTIVE_PIXEL; + info->scan.earlyz_control = GEN7_EDSC_NORMAL; + info->scan.earlyz_op = 
ILO_STATE_RASTER_EARLYZ_NORMAL; + info->scan.earlyz_stencil_clear = false; + + info->params.any_integer_rt = false; + info->params.hiz_enable = true; + info->params.point_width = + (state->point_size == 0.0f) ? 1.0f : state->point_size; + info->params.line_width = + (state->line_width == 0.0f) ? 1.0f : state->line_width; + + info->params.depth_offset_scale = state->offset_scale; + /* + * Scale the constant term. The minimum representable value used by the HW + * is not large enough to be the minimum resolvable difference. + */ + info->params.depth_offset_const = state->offset_units * 2.0f; + info->params.depth_offset_clamp = state->offset_clamp; + + ilo_state_raster_init(&rast->rs, dev, info); return rast; } @@ -416,10 +1192,20 @@ ilo_create_rasterizer_state(struct pipe_context *pipe, static void ilo_bind_rasterizer_state(struct pipe_context *pipe, void *state) { + const struct ilo_dev *dev = ilo_context(pipe)->dev; struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector; vec->rasterizer = state; + if (vec->rasterizer) { + struct ilo_state_line_stipple_info info; + + info.pattern = vec->rasterizer->state.line_stipple_pattern; + info.repeat_count = vec->rasterizer->state.line_stipple_factor + 1; + + ilo_state_line_stipple_set_info(&vec->line_stipple, dev, &info); + } + vec->dirty |= ILO_DIRTY_RASTERIZER; } @@ -433,13 +1219,48 @@ static void * ilo_create_depth_stencil_alpha_state(struct pipe_context *pipe, const struct pipe_depth_stencil_alpha_state *state) { - const struct ilo_dev *dev = ilo_context(pipe)->dev; struct ilo_dsa_state *dsa; + int i; - dsa = MALLOC_STRUCT(ilo_dsa_state); + dsa = CALLOC_STRUCT(ilo_dsa_state); assert(dsa); - ilo_gpe_init_dsa(dev, state, dsa); + dsa->depth.cv_has_buffer = true; + dsa->depth.test_enable = state->depth.enabled; + dsa->depth.write_enable = state->depth.writemask; + dsa->depth.test_func = ilo_translate_compare_func(state->depth.func); + + dsa->stencil.cv_has_buffer = true; + for (i = 0; i < ARRAY_SIZE(state->stencil); i++) { + const struct pipe_stencil_state *stencil = &state->stencil[i]; + struct ilo_state_cc_stencil_op_info *op; + + if (!stencil->enabled) + break; + + if (i == 0) { + dsa->stencil.test_enable = true; + dsa->stencil_front.test_mask = stencil->valuemask; + dsa->stencil_front.write_mask = stencil->writemask; + + op = &dsa->stencil.front; + } else { + dsa->stencil.twosided_enable = true; + dsa->stencil_back.test_mask = stencil->valuemask; + dsa->stencil_back.write_mask = stencil->writemask; + + op = &dsa->stencil.back; + } + + op->test_func = ilo_translate_compare_func(stencil->func); + op->fail_op = ilo_translate_stencil_op(stencil->fail_op); + op->zfail_op = ilo_translate_stencil_op(stencil->zfail_op); + op->zpass_op = ilo_translate_stencil_op(stencil->zpass_op); + } + + dsa->alpha_test = state->alpha.enabled; + dsa->alpha_ref = state->alpha.ref_value; + dsa->alpha_func = ilo_translate_compare_func(state->alpha.func); return dsa; } @@ -450,6 +1271,17 @@ ilo_bind_depth_stencil_alpha_state(struct pipe_context *pipe, void *state) { struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector; vec->dsa = state; + if (vec->dsa) { + vec->cc_params.alpha_ref = vec->dsa->alpha_ref; + vec->cc_params.stencil_front.test_mask = + vec->dsa->stencil_front.test_mask; + vec->cc_params.stencil_front.write_mask = + vec->dsa->stencil_front.write_mask; + vec->cc_params.stencil_back.test_mask = + vec->dsa->stencil_back.test_mask; + vec->cc_params.stencil_back.write_mask = + vec->dsa->stencil_back.write_mask; + } vec->dirty |= ILO_DIRTY_DSA; } @@ 
-575,12 +1407,60 @@ ilo_create_vertex_elements_state(struct pipe_context *pipe, const struct pipe_vertex_element *elements) { const struct ilo_dev *dev = ilo_context(pipe)->dev; + struct ilo_state_vf_element_info vf_elements[PIPE_MAX_ATTRIBS]; + unsigned instance_divisors[PIPE_MAX_ATTRIBS]; + struct ilo_state_vf_info vf_info; struct ilo_ve_state *ve; + unsigned i; - ve = MALLOC_STRUCT(ilo_ve_state); + ve = CALLOC_STRUCT(ilo_ve_state); assert(ve); - ilo_gpe_init_ve(dev, num_elements, elements, ve); + for (i = 0; i < num_elements; i++) { + const struct pipe_vertex_element *elem = &elements[i]; + struct ilo_state_vf_element_info *attr = &vf_elements[i]; + unsigned hw_idx; + + /* + * map the pipe vb to the hardware vb, which has a fixed instance + * divisor + */ + for (hw_idx = 0; hw_idx < ve->vb_count; hw_idx++) { + if (ve->vb_mapping[hw_idx] == elem->vertex_buffer_index && + instance_divisors[hw_idx] == elem->instance_divisor) + break; + } + + /* create one if there is no matching hardware vb */ + if (hw_idx >= ve->vb_count) { + hw_idx = ve->vb_count++; + + ve->vb_mapping[hw_idx] = elem->vertex_buffer_index; + instance_divisors[hw_idx] = elem->instance_divisor; + } + + attr->buffer = hw_idx; + attr->vertex_offset = elem->src_offset; + attr->format = ilo_format_translate_vertex(dev, elem->src_format); + attr->format_size = util_format_get_blocksize(elem->src_format); + attr->component_count = util_format_get_nr_components(elem->src_format); + attr->is_integer = util_format_is_pure_integer(elem->src_format); + + attr->instancing_enable = (elem->instance_divisor != 0); + attr->instancing_step_rate = elem->instance_divisor; + } + + memset(&vf_info, 0, sizeof(vf_info)); + vf_info.data = ve->vf_data; + vf_info.data_size = sizeof(ve->vf_data); + vf_info.elements = vf_elements; + vf_info.element_count = num_elements; + /* vf_info.params and ve->vf_params are both zeroed */ + + if (!ilo_state_vf_init(&ve->vf, dev, &vf_info)) { + FREE(ve); + return NULL; + } return ve; } @@ -609,7 +1489,7 @@ ilo_set_blend_color(struct pipe_context *pipe, { struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector; - vec->blend_color = *state; + memcpy(vec->cc_params.blend_rgba, state->color, sizeof(state->color)); vec->dirty |= ILO_DIRTY_BLEND_COLOR; } @@ -626,6 +1506,9 @@ ilo_set_stencil_ref(struct pipe_context *pipe, vec->stencil_ref = *state; + vec->cc_params.stencil_front.test_ref = state->ref_value[0]; + vec->cc_params.stencil_back.test_ref = state->ref_value[1]; + vec->dirty |= ILO_DIRTY_STENCIL_REF; } @@ -675,47 +1558,47 @@ ilo_set_constant_buffer(struct pipe_context *pipe, pipe_resource_reference(&cso->resource, buf[i].buffer); + cso->info.access = ILO_STATE_SURFACE_ACCESS_DP_DATA; + cso->info.format = GEN6_FORMAT_R32G32B32A32_FLOAT; + cso->info.format_size = 16; + cso->info.struct_size = 16; + cso->info.readonly = true; + cso->info.size = buf[i].buffer_size; + if (buf[i].buffer) { - const enum pipe_format elem_format = - PIPE_FORMAT_R32G32B32A32_FLOAT; + cso->info.buf = ilo_buffer(buf[i].buffer); + cso->info.offset = buf[i].buffer_offset; - ilo_gpe_init_view_surface_for_buffer(dev, - ilo_buffer(buf[i].buffer), - buf[i].buffer_offset, buf[i].buffer_size, - util_format_get_blocksize(elem_format), elem_format, - false, false, &cso->surface); + memset(&cso->surface, 0, sizeof(cso->surface)); + ilo_state_surface_init_for_buffer(&cso->surface, dev, &cso->info); + cso->surface.bo = cso->info.buf->bo; cso->user_buffer = NULL; - cso->user_buffer_size = 0; cbuf->enabled_mask |= 1 << (index + i); - } - else if 
(buf[i].user_buffer) { - cso->surface.bo = NULL; - + } else if (buf[i].user_buffer) { + cso->info.buf = NULL; /* buffer_offset does not apply for user buffer */ cso->user_buffer = buf[i].user_buffer; - cso->user_buffer_size = buf[i].buffer_size; cbuf->enabled_mask |= 1 << (index + i); - } - else { - cso->surface.bo = NULL; + } else { + cso->info.buf = NULL; + cso->info.size = 0; cso->user_buffer = NULL; - cso->user_buffer_size = 0; cbuf->enabled_mask &= ~(1 << (index + i)); } } - } - else { + } else { for (i = 0; i < count; i++) { struct ilo_cbuf_cso *cso = &cbuf->cso[index + i]; pipe_resource_reference(&cso->resource, NULL); - cso->surface.bo = NULL; + + cso->info.buf = NULL; + cso->info.size = 0; cso->user_buffer = NULL; - cso->user_buffer_size = 0; cbuf->enabled_mask &= ~(1 << (index + i)); } @@ -725,13 +1608,116 @@ ilo_set_constant_buffer(struct pipe_context *pipe, } static void +fb_set_blend_caps(const struct ilo_dev *dev, + enum pipe_format format, + struct ilo_fb_blend_caps *caps) +{ + const struct util_format_description *desc = + util_format_description(format); + const int ch = util_format_get_first_non_void_channel(format); + + memset(caps, 0, sizeof(*caps)); + + if (format == PIPE_FORMAT_NONE || desc->is_mixed) + return; + + caps->is_unorm = (ch >= 0 && desc->channel[ch].normalized && + desc->channel[ch].type == UTIL_FORMAT_TYPE_UNSIGNED && + desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB); + caps->is_integer = util_format_is_pure_integer(format); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 365: + * + * "Logic Ops are only supported on *_UNORM surfaces (excluding _SRGB + * variants), otherwise Logic Ops must be DISABLED." + * + * According to the classic driver, this is lifted on Gen8+. + */ + caps->can_logicop = (ilo_dev_gen(dev) >= ILO_GEN(8) || caps->is_unorm); + + /* no blending for pure integer formats */ + caps->can_blend = !caps->is_integer; + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 382: + * + * "Alpha Test can only be enabled if Pixel Shader outputs a float + * alpha value." + */ + caps->can_alpha_test = !caps->is_integer; + + caps->force_dst_alpha_one = + (ilo_format_translate_render(dev, format) != + ilo_format_translate_color(dev, format)); + + /* sanity check */ + if (caps->force_dst_alpha_one) { + enum pipe_format render_format; + + switch (format) { + case PIPE_FORMAT_B8G8R8X8_UNORM: + render_format = PIPE_FORMAT_B8G8R8A8_UNORM; + break; + default: + render_format = PIPE_FORMAT_NONE; + break; + } + + assert(ilo_format_translate_render(dev, format) == + ilo_format_translate_color(dev, render_format)); + } +} + +static void ilo_set_framebuffer_state(struct pipe_context *pipe, const struct pipe_framebuffer_state *state) { const struct ilo_dev *dev = ilo_context(pipe)->dev; struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector; + struct ilo_fb_state *fb = &vec->fb; + const struct pipe_surface *first_surf = NULL; + int i; + + util_copy_framebuffer_state(&fb->state, state); + + fb->has_integer_rt = false; + for (i = 0; i < state->nr_cbufs; i++) { + if (state->cbufs[i]) { + fb_set_blend_caps(dev, state->cbufs[i]->format, &fb->blend_caps[i]); - ilo_gpe_set_fb(dev, state, &vec->fb); + fb->has_integer_rt |= fb->blend_caps[i].is_integer; + + if (!first_surf) + first_surf = state->cbufs[i]; + } else { + fb_set_blend_caps(dev, PIPE_FORMAT_NONE, &fb->blend_caps[i]); + } + } + + if (!first_surf && state->zsbuf) + first_surf = state->zsbuf; + + fb->num_samples = (first_surf) ? 
first_surf->texture->nr_samples : 1; + if (!fb->num_samples) + fb->num_samples = 1; + + if (state->zsbuf) { + const struct ilo_surface_cso *cso = + (const struct ilo_surface_cso *) state->zsbuf; + + fb->has_hiz = cso->u.zs.hiz_bo; + fb->depth_offset_format = + ilo_state_zs_get_depth_format(&cso->u.zs, dev); + } else { + fb->has_hiz = false; + fb->depth_offset_format = GEN6_ZFORMAT_D32_FLOAT; + } + + /* + * The PRMs list several restrictions when the framebuffer has more than + * one surface. It seems they are actually lifted on GEN6+. + */ vec->dirty |= ILO_DIRTY_FB; } @@ -740,9 +1726,15 @@ static void ilo_set_polygon_stipple(struct pipe_context *pipe, const struct pipe_poly_stipple *state) { + const struct ilo_dev *dev = ilo_context(pipe)->dev; struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector; + struct ilo_state_poly_stipple_info info; + int i; + + for (i = 0; i < 32; i++) + info.pattern[i] = state->stipple[i]; - vec->poly_stipple = *state; + ilo_state_poly_stipple_set_info(&vec->poly_stipple, dev, &info); vec->dirty |= ILO_DIRTY_POLY_STIPPLE; } @@ -753,11 +1745,26 @@ ilo_set_scissor_states(struct pipe_context *pipe, unsigned num_scissors, const struct pipe_scissor_state *scissors) { - const struct ilo_dev *dev = ilo_context(pipe)->dev; struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector; + unsigned i; + + for (i = 0; i < num_scissors; i++) { + struct ilo_state_viewport_scissor_info *info = + &vec->viewport.scissors[start_slot + i]; - ilo_gpe_set_scissor(dev, start_slot, num_scissors, - scissors, &vec->scissor); + if (scissors[i].minx < scissors[i].maxx && + scissors[i].miny < scissors[i].maxy) { + info->min_x = scissors[i].minx; + info->min_y = scissors[i].miny; + info->max_x = scissors[i].maxx - 1; + info->max_y = scissors[i].maxy - 1; + } else { + info->min_x = 1; + info->min_y = 1; + info->max_x = 0; + info->max_y = 0; + } + } vec->dirty |= ILO_DIRTY_SCISSOR; } @@ -768,28 +1775,31 @@ ilo_set_viewport_states(struct pipe_context *pipe, unsigned num_viewports, const struct pipe_viewport_state *viewports) { - const struct ilo_dev *dev = ilo_context(pipe)->dev; struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector; if (viewports) { unsigned i; for (i = 0; i < num_viewports; i++) { - ilo_gpe_set_viewport_cso(dev, &viewports[i], - &vec->viewport.cso[start_slot + i]); + struct ilo_state_viewport_matrix_info *info = + &vec->viewport.matrices[start_slot + i]; + + memcpy(info->scale, viewports[i].scale, sizeof(info->scale)); + memcpy(info->translate, viewports[i].translate, + sizeof(info->translate)); } - if (vec->viewport.count < start_slot + num_viewports) - vec->viewport.count = start_slot + num_viewports; + if (vec->viewport.params.count < start_slot + num_viewports) + vec->viewport.params.count = start_slot + num_viewports; /* need to save viewport 0 for util_blitter */ if (!start_slot && num_viewports) vec->viewport.viewport0 = viewports[0]; } else { - if (vec->viewport.count <= start_slot + num_viewports && - vec->viewport.count > start_slot) - vec->viewport.count = start_slot; + if (vec->viewport.params.count <= start_slot + num_viewports && + vec->viewport.params.count > start_slot) + vec->viewport.params.count = start_slot; } vec->dirty |= ILO_DIRTY_VIEWPORT; @@ -905,16 +1915,11 @@ ilo_set_index_buffer(struct pipe_context *pipe, struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector; if (state) { - pipe_resource_reference(&vec->ib.buffer, state->buffer); - vec->ib.user_buffer = state->user_buffer; - vec->ib.offset = state->offset; - 
vec->ib.index_size = state->index_size; - } - else { - pipe_resource_reference(&vec->ib.buffer, NULL); - vec->ib.user_buffer = NULL; - vec->ib.offset = 0; - vec->ib.index_size = 0; + pipe_resource_reference(&vec->ib.state.buffer, state->buffer); + vec->ib.state = *state; + } else { + pipe_resource_reference(&vec->ib.state.buffer, NULL); + memset(&vec->ib.state, 0, sizeof(vec->ib.state)); } vec->dirty |= ILO_DIRTY_IB; @@ -926,19 +1931,28 @@ ilo_create_stream_output_target(struct pipe_context *pipe, unsigned buffer_offset, unsigned buffer_size) { - struct pipe_stream_output_target *target; + const struct ilo_dev *dev = ilo_context(pipe)->dev; + struct ilo_stream_output_target *target; + struct ilo_state_sol_buffer_info info; - target = MALLOC_STRUCT(pipe_stream_output_target); + target = CALLOC_STRUCT(ilo_stream_output_target); assert(target); - pipe_reference_init(&target->reference, 1); - target->buffer = NULL; - pipe_resource_reference(&target->buffer, res); - target->context = pipe; - target->buffer_offset = buffer_offset; - target->buffer_size = buffer_size; + pipe_reference_init(&target->base.reference, 1); + pipe_resource_reference(&target->base.buffer, res); + target->base.context = pipe; + target->base.buffer_offset = buffer_offset; + target->base.buffer_size = buffer_size; + + memset(&info, 0, sizeof(info)); + info.buf = ilo_buffer(res); + info.offset = buffer_offset; + info.size = buffer_size; - return target; + ilo_state_sol_buffer_init(&target->sb, dev, &info); + target->sb.bo = info.buf->bo; + + return &target->base; } static void @@ -991,7 +2005,7 @@ ilo_create_sampler_view(struct pipe_context *pipe, const struct ilo_dev *dev = ilo_context(pipe)->dev; struct ilo_view_cso *view; - view = MALLOC_STRUCT(ilo_view_cso); + view = CALLOC_STRUCT(ilo_view_cso); assert(view); view->base = *templ; @@ -1001,16 +2015,24 @@ ilo_create_sampler_view(struct pipe_context *pipe, view->base.context = pipe; if (res->target == PIPE_BUFFER) { - const unsigned elem_size = util_format_get_blocksize(templ->format); - const unsigned first_elem = templ->u.buf.first_element; - const unsigned num_elems = templ->u.buf.last_element - first_elem + 1; - - ilo_gpe_init_view_surface_for_buffer(dev, ilo_buffer(res), - first_elem * elem_size, num_elems * elem_size, - elem_size, templ->format, false, false, &view->surface); - } - else { + struct ilo_state_surface_buffer_info info; + + memset(&info, 0, sizeof(info)); + info.buf = ilo_buffer(res); + info.access = ILO_STATE_SURFACE_ACCESS_SAMPLER; + info.format = ilo_format_translate_color(dev, templ->format); + info.format_size = util_format_get_blocksize(templ->format); + info.struct_size = info.format_size; + info.readonly = true; + info.offset = templ->u.buf.first_element * info.struct_size; + info.size = (templ->u.buf.last_element - + templ->u.buf.first_element + 1) * info.struct_size; + + ilo_state_surface_init_for_buffer(&view->surface, dev, &info); + view->surface.bo = info.buf->bo; + } else { struct ilo_texture *tex = ilo_texture(res); + struct ilo_state_surface_image_info info; /* warn about degraded performance because of a missing binding flag */ if (tex->image.tiling == GEN6_TILING_NONE && @@ -1019,13 +2041,33 @@ ilo_create_sampler_view(struct pipe_context *pipe, "not created for sampling\n"); } - ilo_gpe_init_view_surface_for_image(dev, &tex->image, - tex->base.target, templ->format, - templ->u.tex.first_level, - templ->u.tex.last_level - templ->u.tex.first_level + 1, - templ->u.tex.first_layer, - templ->u.tex.last_layer - templ->u.tex.first_layer + 1, - 
false, &view->surface); + memset(&info, 0, sizeof(info)); + info.img = &tex->image; + + info.access = ILO_STATE_SURFACE_ACCESS_SAMPLER; + + if (templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT && + tex->image.separate_stencil) { + info.format = ilo_format_translate_texture(dev, + PIPE_FORMAT_Z32_FLOAT); + } else { + info.format = ilo_format_translate_texture(dev, templ->format); + } + + info.is_cube_map = (tex->image.target == PIPE_TEXTURE_CUBE || + tex->image.target == PIPE_TEXTURE_CUBE_ARRAY); + info.is_array = util_resource_is_array_texture(&tex->base); + info.readonly = true; + + info.level_base = templ->u.tex.first_level; + info.level_count = templ->u.tex.last_level - + templ->u.tex.first_level + 1; + info.slice_base = templ->u.tex.first_layer; + info.slice_count = templ->u.tex.last_layer - + templ->u.tex.first_layer + 1; + + ilo_state_surface_init_for_image(&view->surface, dev, &info); + view->surface.bo = info.img->bo; } return &view->base; @@ -1048,7 +2090,7 @@ ilo_create_surface(struct pipe_context *pipe, struct ilo_texture *tex = ilo_texture(res); struct ilo_surface_cso *surf; - surf = MALLOC_STRUCT(ilo_surface_cso); + surf = CALLOC_STRUCT(ilo_surface_cso); assert(surf); surf->base = *templ; @@ -1063,28 +2105,56 @@ ilo_create_surface(struct pipe_context *pipe, surf->is_rt = !util_format_is_depth_or_stencil(templ->format); if (surf->is_rt) { + struct ilo_state_surface_image_info info; + /* relax this? */ assert(tex->base.target != PIPE_BUFFER); - /* - * classic i965 sets render_cache_rw for constant buffers and sol - * surfaces but not render buffers. Why? - */ - ilo_gpe_init_view_surface_for_image(dev, - &tex->image, tex->base.target, - templ->format, templ->u.tex.level, 1, - templ->u.tex.first_layer, - templ->u.tex.last_layer - templ->u.tex.first_layer + 1, - true, &surf->u.rt); + memset(&info, 0, sizeof(info)); + info.img = &tex->image; + info.access = ILO_STATE_SURFACE_ACCESS_DP_RENDER; + info.format = ilo_format_translate_render(dev, templ->format); + info.is_array = util_resource_is_array_texture(&tex->base); + info.level_base = templ->u.tex.level; + info.level_count = 1; + info.slice_base = templ->u.tex.first_layer; + info.slice_count = templ->u.tex.last_layer - + templ->u.tex.first_layer + 1; + + ilo_state_surface_init_for_image(&surf->u.rt, dev, &info); + surf->u.rt.bo = info.img->bo; } else { + struct ilo_state_zs_info info; + assert(res->target != PIPE_BUFFER); - ilo_gpe_init_zs_surface(dev, &tex->image, - (tex->separate_s8) ? &tex->separate_s8->image : NULL, - tex->base.target, templ->format, - templ->u.tex.level, templ->u.tex.first_layer, - templ->u.tex.last_layer - templ->u.tex.first_layer + 1, - &surf->u.zs); + memset(&info, 0, sizeof(info)); + + if (templ->format == PIPE_FORMAT_S8_UINT) { + info.s_img = &tex->image; + } else { + info.z_img = &tex->image; + info.s_img = (tex->separate_s8) ? 
&tex->separate_s8->image : NULL; + + info.hiz_enable = + ilo_image_can_enable_aux(&tex->image, templ->u.tex.level); + } + + info.level = templ->u.tex.level; + info.slice_base = templ->u.tex.first_layer; + info.slice_count = templ->u.tex.last_layer - + templ->u.tex.first_layer + 1; + + ilo_state_zs_init(&surf->u.zs, dev, &info); + + if (info.z_img) { + surf->u.zs.depth_bo = info.z_img->bo; + if (info.hiz_enable) + surf->u.zs.hiz_bo = info.z_img->aux.bo; + } + + if (info.s_img) + surf->u.zs.stencil_bo = info.s_img->bo; } return &surf->base; @@ -1294,10 +2364,30 @@ void ilo_state_vector_init(const struct ilo_dev *dev, struct ilo_state_vector *vec) { - ilo_gpe_set_scissor_null(dev, &vec->scissor); + struct ilo_state_urb_info urb_info; - ilo_gpe_init_zs_surface(dev, NULL, NULL, PIPE_TEXTURE_2D, - PIPE_FORMAT_NONE, 0, 0, 1, &vec->fb.null_zs); + vec->sample_mask = ~0u; + + ilo_state_viewport_init_data_only(&vec->viewport.vp, dev, + vec->viewport.vp_data, sizeof(vec->viewport.vp_data)); + assert(vec->viewport.vp.array_size >= ILO_MAX_VIEWPORTS); + + vec->viewport.params.matrices = vec->viewport.matrices; + vec->viewport.params.scissors = vec->viewport.scissors; + + ilo_state_hs_init_disabled(&vec->disabled_hs, dev); + ilo_state_ds_init_disabled(&vec->disabled_ds, dev); + ilo_state_gs_init_disabled(&vec->disabled_gs, dev); + + ilo_state_sol_buffer_init_disabled(&vec->so.dummy_sb, dev); + + ilo_state_surface_init_for_null(&vec->fb.null_rt, dev); + ilo_state_zs_init_for_null(&vec->fb.null_zs, dev); + + ilo_state_sampler_init_disabled(&vec->disabled_sampler, dev); + + memset(&urb_info, 0, sizeof(urb_info)); + ilo_state_urb_init(&vec->urb, dev, &urb_info); util_dynarray_init(&vec->global_binding.bindings); @@ -1314,7 +2404,7 @@ ilo_state_vector_cleanup(struct ilo_state_vector *vec) pipe_resource_reference(&vec->vb.states[i].buffer, NULL); } - pipe_resource_reference(&vec->ib.buffer, NULL); + pipe_resource_reference(&vec->ib.state.buffer, NULL); pipe_resource_reference(&vec->ib.hw_resource, NULL); for (i = 0; i < vec->so.count; i++) @@ -1377,7 +2467,7 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec, } } - if (vec->ib.buffer == res) { + if (vec->ib.state.buffer == res) { states |= ILO_DIRTY_IB; /* @@ -1392,6 +2482,10 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec, for (i = 0; i < vec->so.count; i++) { if (vec->so.states[i]->buffer == res) { + struct ilo_stream_output_target *target = + (struct ilo_stream_output_target *) vec->so.states[i]; + + target->sb.bo = ilo_buffer(res)->bo; states |= ILO_DIRTY_SO; break; } @@ -1456,7 +2550,8 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec, struct ilo_surface_cso *cso = (struct ilo_surface_cso *) vec->fb.state.zsbuf; - cso->u.rt.bo = bo; + cso->u.zs.depth_bo = bo; + states |= ILO_DIRTY_FB; } } diff --git a/src/gallium/drivers/ilo/ilo_state.h b/src/gallium/drivers/ilo/ilo_state.h index fd0a3156ebc..3e6fd8a2554 100644 --- a/src/gallium/drivers/ilo/ilo_state.h +++ b/src/gallium/drivers/ilo/ilo_state.h @@ -28,13 +28,38 @@ #ifndef ILO_STATE_H #define ILO_STATE_H -#include "core/ilo_state_3d.h" +#include "core/ilo_builder_3d.h" /* for gen6_3dprimitive_info */ +#include "core/ilo_state_cc.h" +#include "core/ilo_state_compute.h" +#include "core/ilo_state_raster.h" +#include "core/ilo_state_sampler.h" +#include "core/ilo_state_sbe.h" +#include "core/ilo_state_shader.h" +#include "core/ilo_state_sol.h" +#include "core/ilo_state_surface.h" +#include "core/ilo_state_urb.h" +#include "core/ilo_state_vf.h" +#include 
"core/ilo_state_viewport.h" +#include "core/ilo_state_zs.h" #include "pipe/p_state.h" #include "util/u_dynarray.h" #include "ilo_common.h" /** + * \see brw_context.h + */ +#define ILO_MAX_DRAW_BUFFERS 8 +#define ILO_MAX_CONST_BUFFERS (1 + 12) +#define ILO_MAX_SAMPLER_VIEWS 16 +#define ILO_MAX_SAMPLERS 16 +#define ILO_MAX_SO_BINDINGS 64 +#define ILO_MAX_SO_BUFFERS 4 +#define ILO_MAX_VIEWPORTS 1 + +#define ILO_MAX_SURFACES 256 + +/** * States that we track. * * XXX Do we want to count each sampler or vertex buffer as a state? If that @@ -120,6 +145,172 @@ enum ilo_dirty_flags { }; struct ilo_context; +struct ilo_shader_state; + +struct ilo_ve_state { + unsigned vb_mapping[PIPE_MAX_ATTRIBS]; + unsigned vb_count; + + /* these are not valid until the state is finalized */ + uint32_t vf_data[PIPE_MAX_ATTRIBS][4]; + struct ilo_state_vf_params_info vf_params; + struct ilo_state_vf vf; +}; + +struct ilo_vb_state { + struct pipe_vertex_buffer states[PIPE_MAX_ATTRIBS]; + struct ilo_state_vertex_buffer vb[PIPE_MAX_ATTRIBS]; + uint32_t enabled_mask; +}; + +struct ilo_ib_state { + struct pipe_index_buffer state; + + /* these are not valid until the state is finalized */ + struct pipe_resource *hw_resource; + unsigned hw_index_size; + struct ilo_state_index_buffer ib; +}; + +struct ilo_cbuf_cso { + struct pipe_resource *resource; + struct ilo_state_surface_buffer_info info; + struct ilo_state_surface surface; + + /* + * this CSO is not so constant because user buffer needs to be uploaded in + * finalize_constant_buffers() + */ + const void *user_buffer; +}; + +struct ilo_sampler_cso { + struct ilo_state_sampler sampler; + struct ilo_state_sampler_border border; + bool saturate_s; + bool saturate_t; + bool saturate_r; +}; + +struct ilo_sampler_state { + const struct ilo_sampler_cso *cso[ILO_MAX_SAMPLERS]; +}; + +struct ilo_cbuf_state { + struct ilo_cbuf_cso cso[ILO_MAX_CONST_BUFFERS]; + uint32_t enabled_mask; +}; + +struct ilo_resource_state { + struct pipe_surface *states[PIPE_MAX_SHADER_RESOURCES]; + unsigned count; +}; + +struct ilo_view_cso { + struct pipe_sampler_view base; + + struct ilo_state_surface surface; +}; + +struct ilo_view_state { + struct pipe_sampler_view *states[ILO_MAX_SAMPLER_VIEWS]; + unsigned count; +}; + +struct ilo_stream_output_target { + struct pipe_stream_output_target base; + + struct ilo_state_sol_buffer sb; +}; + +struct ilo_so_state { + struct pipe_stream_output_target *states[ILO_MAX_SO_BUFFERS]; + unsigned count; + unsigned append_bitmask; + + struct ilo_state_sol_buffer dummy_sb; + + bool enabled; +}; + +struct ilo_rasterizer_state { + struct pipe_rasterizer_state state; + + /* these are invalid until finalize_rasterizer() */ + struct ilo_state_raster_info info; + struct ilo_state_raster rs; +}; + +struct ilo_viewport_state { + struct ilo_state_viewport_matrix_info matrices[ILO_MAX_VIEWPORTS]; + struct ilo_state_viewport_scissor_info scissors[ILO_MAX_VIEWPORTS]; + struct ilo_state_viewport_params_info params; + + struct pipe_viewport_state viewport0; + struct pipe_scissor_state scissor0; + + struct ilo_state_viewport vp; + uint32_t vp_data[20 * ILO_MAX_VIEWPORTS]; +}; + +struct ilo_surface_cso { + struct pipe_surface base; + + bool is_rt; + union { + struct ilo_state_surface rt; + struct ilo_state_zs zs; + } u; +}; + +struct ilo_fb_state { + struct pipe_framebuffer_state state; + + struct ilo_state_surface null_rt; + struct ilo_state_zs null_zs; + + struct ilo_fb_blend_caps { + bool is_unorm; + bool is_integer; + bool force_dst_alpha_one; + + bool can_logicop; + bool 
can_blend; + bool can_alpha_test; + } blend_caps[PIPE_MAX_COLOR_BUFS]; + + unsigned num_samples; + + bool has_integer_rt; + bool has_hiz; + enum gen_depth_format depth_offset_format; +}; + +struct ilo_dsa_state { + struct ilo_state_cc_depth_info depth; + + struct ilo_state_cc_stencil_info stencil; + struct { + uint8_t test_mask; + uint8_t write_mask; + } stencil_front, stencil_back; + + bool alpha_test; + float alpha_ref; + enum gen_compare_function alpha_func; +}; + +struct ilo_blend_state { + struct ilo_state_cc_blend_rt_info rt[PIPE_MAX_COLOR_BUFS]; + struct ilo_state_cc_blend_rt_info dummy_rt; + bool dual_blend; + + /* these are invalid until finalize_blend() */ + struct ilo_state_cc_blend_rt_info effective_rt[PIPE_MAX_COLOR_BUFS]; + struct ilo_state_cc_info info; + struct ilo_state_cc cc; + bool alpha_may_kill; +}; struct ilo_global_binding_cso { struct pipe_resource *resource; @@ -147,6 +338,7 @@ struct ilo_global_binding { struct ilo_state_vector { const struct pipe_draw_info *draw; + struct gen6_3dprimitive_info draw_info; uint32_t dirty; @@ -157,30 +349,41 @@ struct ilo_state_vector { struct ilo_shader_state *vs; struct ilo_shader_state *gs; + struct ilo_state_hs disabled_hs; + struct ilo_state_ds disabled_ds; + struct ilo_state_gs disabled_gs; + struct ilo_so_state so; struct pipe_clip_state clip; + struct ilo_viewport_state viewport; - struct ilo_scissor_state scissor; - const struct ilo_rasterizer_state *rasterizer; - struct pipe_poly_stipple poly_stipple; + struct ilo_rasterizer_state *rasterizer; + + struct ilo_state_line_stipple line_stipple; + struct ilo_state_poly_stipple poly_stipple; unsigned sample_mask; struct ilo_shader_state *fs; - const struct ilo_dsa_state *dsa; + struct ilo_state_cc_params_info cc_params; struct pipe_stencil_ref stencil_ref; - const struct ilo_blend_state *blend; - struct pipe_blend_color blend_color; + const struct ilo_dsa_state *dsa; + struct ilo_blend_state *blend; + struct ilo_fb_state fb; + struct ilo_state_urb urb; + /* shader resources */ struct ilo_sampler_state sampler[PIPE_SHADER_TYPES]; struct ilo_view_state view[PIPE_SHADER_TYPES]; struct ilo_cbuf_state cbuf[PIPE_SHADER_TYPES]; struct ilo_resource_state resource; + struct ilo_state_sampler disabled_sampler; + /* GPGPU */ struct ilo_shader_state *cs; struct ilo_resource_state cs_resource; diff --git a/src/gallium/drivers/ilo/shader/ilo_shader_internal.h b/src/gallium/drivers/ilo/shader/ilo_shader_internal.h index d2dc2f5b5b4..01c86675202 100644 --- a/src/gallium/drivers/ilo/shader/ilo_shader_internal.h +++ b/src/gallium/drivers/ilo/shader/ilo_shader_internal.h @@ -28,6 +28,9 @@ #ifndef ILO_SHADER_INTERNAL_H #define ILO_SHADER_INTERNAL_H +#include "core/ilo_state_sbe.h" +#include "core/ilo_state_sol.h" + #include "ilo_common.h" #include "ilo_state.h" #include "ilo_shader.h" @@ -72,13 +75,27 @@ struct ilo_shader_variant { uint32_t saturate_tex_coords[3]; }; +struct ilo_kernel_routing { + bool initialized; + + bool is_point; + bool light_twoside; + uint32_t sprite_coord_enable; + int sprite_coord_mode; + int src_len; + int src_semantics[PIPE_MAX_SHADER_OUTPUTS]; + int src_indices[PIPE_MAX_SHADER_OUTPUTS]; + + struct ilo_state_sbe sbe; +}; + /** * A compiled shader. 
*/ struct ilo_shader { struct ilo_shader_variant variant; - struct ilo_shader_cso cso; + union ilo_shader_cso cso; struct { int semantic_names[PIPE_MAX_SHADER_INPUTS]; @@ -111,7 +128,9 @@ struct ilo_shader { bool stream_output; int svbi_post_inc; - struct pipe_stream_output_info so_info; + + uint32_t sol_data[PIPE_MAX_SO_OUTPUTS][2]; + struct ilo_state_sol sol; /* for VS stream output / rasterizer discard */ int gs_offsets[3]; @@ -121,11 +140,8 @@ struct ilo_shader { void *kernel; int kernel_size; - bool routing_initialized; - int routing_src_semantics[PIPE_MAX_SHADER_OUTPUTS]; - int routing_src_indices[PIPE_MAX_SHADER_OUTPUTS]; - uint32_t routing_sprite_coord_enable; struct ilo_kernel_routing routing; + struct ilo_state_ps_params_info ps_params; /* what does the push constant buffer consist of? */ struct { diff --git a/src/gallium/drivers/ilo/shader/toy_tgsi.c b/src/gallium/drivers/ilo/shader/toy_tgsi.c index 65e47bf3a4a..d38585f1475 100644 --- a/src/gallium/drivers/ilo/shader/toy_tgsi.c +++ b/src/gallium/drivers/ilo/shader/toy_tgsi.c @@ -2036,9 +2036,6 @@ parse_instruction(struct toy_tgsi *tgsi, if (!dst_is_scratch[i]) continue; - if (tgsi_inst->Instruction.Saturate == TGSI_SAT_MINUS_PLUS_ONE) - tc_fail(tgsi->tc, "TGSI_SAT_MINUS_PLUS_ONE unhandled"); - tgsi->tc->templ.saturate = tgsi_inst->Instruction.Saturate; /* emit indirect store */ diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c index b6c32ffb979..b25e0413750 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c +++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c @@ -975,10 +975,6 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm, s_bld.int_vec_type, ""); } - /* convert scalar stencil refs into vectors */ - stencil_refs[0] = lp_build_broadcast_scalar(&s_bld, stencil_refs[0]); - stencil_refs[1] = lp_build_broadcast_scalar(&s_bld, stencil_refs[1]); - s_pass_mask = lp_build_stencil_test(&s_bld, stencil, stencil_refs, stencil_vals, front_facing); diff --git a/src/gallium/drivers/llvmpipe/lp_public.h b/src/gallium/drivers/llvmpipe/lp_public.h index ec6b660b48e..27ab1baefbb 100644 --- a/src/gallium/drivers/llvmpipe/lp_public.h +++ b/src/gallium/drivers/llvmpipe/lp_public.h @@ -1,10 +1,18 @@ #ifndef LP_PUBLIC_H #define LP_PUBLIC_H +#ifdef __cplusplus +extern "C" { +#endif + struct pipe_screen; struct sw_winsys; struct pipe_screen * llvmpipe_create_screen(struct sw_winsys *winsys); +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/gallium/drivers/llvmpipe/lp_query.c b/src/gallium/drivers/llvmpipe/lp_query.c index 4f8bab62e7b..fc593670671 100644 --- a/src/gallium/drivers/llvmpipe/lp_query.c +++ b/src/gallium/drivers/llvmpipe/lp_query.c @@ -315,7 +315,7 @@ llvmpipe_check_render_cond(struct llvmpipe_context *lp) b = pipe->get_query_result(pipe, lp->render_cond_query, wait, (void*)&result); if (b) - return (!result == lp->render_cond_cond); + return ((!result) == lp->render_cond_cond); else return TRUE; } diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c index f4ba596f358..47f1897c732 100644 --- a/src/gallium/drivers/llvmpipe/lp_screen.c +++ b/src/gallium/drivers/llvmpipe/lp_screen.c @@ -165,7 +165,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_DEPTH_CLIP_DISABLE: return 1; case PIPE_CAP_SHADER_STENCIL_EXPORT: - return 0; + return 1; case PIPE_CAP_TGSI_INSTANCEID: case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: case PIPE_CAP_START_INSTANCE: @@ -258,8 +258,9 @@ llvmpipe_get_param(struct 
pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: return 1; case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: - case PIPE_CAP_SAMPLER_VIEW_TARGET: return 0; + case PIPE_CAP_SAMPLER_VIEW_TARGET: + return 1; case PIPE_CAP_FAKE_SW_MSAA: return 1; case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: @@ -290,6 +291,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) return 1; case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: return 0; } /* should only get here on unhandled cases */ diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c index 96cc77c250c..4c8167a9e7d 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.c +++ b/src/gallium/drivers/llvmpipe/lp_setup.c @@ -854,9 +854,10 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup, jit_tex->img_stride[j] = lp_tex->img_stride[j]; } - if (res->target == PIPE_TEXTURE_1D_ARRAY || - res->target == PIPE_TEXTURE_2D_ARRAY || - res->target == PIPE_TEXTURE_CUBE_ARRAY) { + if (view->target == PIPE_TEXTURE_1D_ARRAY || + view->target == PIPE_TEXTURE_2D_ARRAY || + view->target == PIPE_TEXTURE_CUBE || + view->target == PIPE_TEXTURE_CUBE_ARRAY) { /* * For array textures, we don't have first_layer, instead * adjust last_layer (stored as depth) plus the mip level offsets @@ -868,7 +869,8 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup, jit_tex->mip_offsets[j] += view->u.tex.first_layer * lp_tex->img_stride[j]; } - if (res->target == PIPE_TEXTURE_CUBE_ARRAY) { + if (view->target == PIPE_TEXTURE_CUBE || + view->target == PIPE_TEXTURE_CUBE_ARRAY) { assert(jit_tex->depth % 6 == 0); } assert(view->u.tex.first_layer <= view->u.tex.last_layer); @@ -1067,10 +1069,13 @@ try_update_scene_state( struct lp_setup_context *setup ) if (setup->dirty & LP_SETUP_NEW_CONSTANTS) { for (i = 0; i < Elements(setup->constants); ++i) { struct pipe_resource *buffer = setup->constants[i].current.buffer; - const unsigned current_size = setup->constants[i].current.buffer_size; + const unsigned current_size = MIN2(setup->constants[i].current.buffer_size, + LP_MAX_TGSI_CONST_BUFFER_SIZE); const ubyte *current_data = NULL; int num_constants; + STATIC_ASSERT(DATA_BLOCK_SIZE >= LP_MAX_TGSI_CONST_BUFFER_SIZE); + if (buffer) { /* resource buffer */ current_data = (ubyte *) llvmpipe_resource_data(buffer); diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index 35fe7b20181..b5ce8683f1a 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -260,7 +260,8 @@ generate_fs_loop(struct gallivm_state *gallivm, { const struct util_format_description *zs_format_desc = NULL; const struct tgsi_token *tokens = shader->base.tokens; - LLVMTypeRef vec_type; + struct lp_type int_type = lp_int_type(type); + LLVMTypeRef vec_type, int_vec_type; LLVMValueRef mask_ptr, mask_val; LLVMValueRef consts_ptr, num_consts_ptr; LLVMValueRef z; @@ -295,7 +296,7 @@ generate_fs_loop(struct gallivm_state *gallivm, zs_format_desc = util_format_description(key->zsbuf_format); assert(zs_format_desc); - if (!shader->info.base.writes_z) { + if (!shader->info.base.writes_z && !shader->info.base.writes_stencil) { if (key->alpha.enabled || key->blend.alpha_to_coverage || shader->info.base.uses_kill) { @@ -329,11 +330,14 @@ generate_fs_loop(struct gallivm_state *gallivm, depth_mode = 0; } + vec_type = lp_build_vec_type(gallivm, type); + int_vec_type = 
lp_build_vec_type(gallivm, int_type); stencil_refs[0] = lp_jit_context_stencil_ref_front_value(gallivm, context_ptr); stencil_refs[1] = lp_jit_context_stencil_ref_back_value(gallivm, context_ptr); - - vec_type = lp_build_vec_type(gallivm, type); + /* convert scalar stencil refs into vectors */ + stencil_refs[0] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[0]); + stencil_refs[1] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[1]); consts_ptr = lp_jit_context_constants(gallivm, context_ptr); num_consts_ptr = lp_jit_context_num_constants(gallivm, context_ptr); @@ -462,7 +466,9 @@ generate_fs_loop(struct gallivm_state *gallivm, int pos0 = find_output_by_semantic(&shader->info.base, TGSI_SEMANTIC_POSITION, 0); - + int s_out = find_output_by_semantic(&shader->info.base, + TGSI_SEMANTIC_STENCIL, + 0); if (pos0 != -1 && outputs[pos0][2]) { z = LLVMBuildLoad(builder, outputs[pos0][2], "output.z"); @@ -512,6 +518,15 @@ generate_fs_loop(struct gallivm_state *gallivm, } } + if (s_out != -1 && outputs[s_out][1]) { + /* there's only one value, and the spec says to discard additional bits */ + LLVMValueRef s_max_mask = lp_build_const_int_vec(gallivm, int_type, 255); + stencil_refs[0] = LLVMBuildLoad(builder, outputs[s_out][1], "output.s"); + stencil_refs[0] = LLVMBuildBitCast(builder, stencil_refs[0], int_vec_type, ""); + stencil_refs[0] = LLVMBuildAnd(builder, stencil_refs[0], s_max_mask, ""); + stencil_refs[1] = stencil_refs[0]; + } + lp_build_depth_stencil_load_swizzled(gallivm, type, zs_format_desc, key->resource_1d, depth_ptr, depth_stride, diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c index 21da6290574..b205f02fdba 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c +++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c @@ -170,6 +170,36 @@ llvmpipe_create_sampler_view(struct pipe_context *pipe, view->texture = NULL; pipe_resource_reference(&view->texture, texture); view->context = pipe; + +#ifdef DEBUG + /* + * This is possibly too lenient, but the primary reason is just + * to catch state trackers which forget to initialize this, so + * it only catches clearly impossible view targets. 
+ */ + if (view->target != texture->target) { + if (view->target == PIPE_TEXTURE_1D) + assert(texture->target == PIPE_TEXTURE_1D_ARRAY); + else if (view->target == PIPE_TEXTURE_1D_ARRAY) + assert(texture->target == PIPE_TEXTURE_1D); + else if (view->target == PIPE_TEXTURE_2D) + assert(texture->target == PIPE_TEXTURE_2D_ARRAY || + texture->target == PIPE_TEXTURE_CUBE || + texture->target == PIPE_TEXTURE_CUBE_ARRAY); + else if (view->target == PIPE_TEXTURE_2D_ARRAY) + assert(texture->target == PIPE_TEXTURE_2D || + texture->target == PIPE_TEXTURE_CUBE || + texture->target == PIPE_TEXTURE_CUBE_ARRAY); + else if (view->target == PIPE_TEXTURE_CUBE) + assert(texture->target == PIPE_TEXTURE_CUBE_ARRAY || + texture->target == PIPE_TEXTURE_2D_ARRAY); + else if (view->target == PIPE_TEXTURE_CUBE_ARRAY) + assert(texture->target == PIPE_TEXTURE_CUBE || + texture->target == PIPE_TEXTURE_2D_ARRAY); + else + assert(0); + } +#endif } return view; @@ -245,15 +275,17 @@ prepare_shader_sampling( row_stride[j] = lp_tex->row_stride[j]; img_stride[j] = lp_tex->img_stride[j]; } - if (res->target == PIPE_TEXTURE_1D_ARRAY || - res->target == PIPE_TEXTURE_2D_ARRAY || - res->target == PIPE_TEXTURE_CUBE_ARRAY) { + if (view->target == PIPE_TEXTURE_1D_ARRAY || + view->target == PIPE_TEXTURE_2D_ARRAY || + view->target == PIPE_TEXTURE_CUBE || + view->target == PIPE_TEXTURE_CUBE_ARRAY) { num_layers = view->u.tex.last_layer - view->u.tex.first_layer + 1; for (j = first_level; j <= last_level; j++) { mip_offsets[j] += view->u.tex.first_layer * lp_tex->img_stride[j]; } - if (res->target == PIPE_TEXTURE_CUBE_ARRAY) { + if (view->target == PIPE_TEXTURE_CUBE || + view->target == PIPE_TEXTURE_CUBE_ARRAY) { assert(num_layers % 6 == 0); } assert(view->u.tex.first_layer <= view->u.tex.last_layer); diff --git a/src/gallium/drivers/llvmpipe/lp_surface.c b/src/gallium/drivers/llvmpipe/lp_surface.c index 08f968f7f0a..96f8ed82cd8 100644 --- a/src/gallium/drivers/llvmpipe/lp_surface.c +++ b/src/gallium/drivers/llvmpipe/lp_surface.c @@ -42,13 +42,6 @@ lp_resource_copy(struct pipe_context *pipe, struct pipe_resource *src, unsigned src_level, const struct pipe_box *src_box) { - struct llvmpipe_resource *src_tex = llvmpipe_resource(src); - struct llvmpipe_resource *dst_tex = llvmpipe_resource(dst); - const enum pipe_format format = src_tex->base.format; - unsigned width = src_box->width; - unsigned height = src_box->height; - unsigned depth = src_box->depth; - llvmpipe_flush_resource(pipe, dst, dst_level, FALSE, /* read_only */ @@ -63,58 +56,8 @@ lp_resource_copy(struct pipe_context *pipe, FALSE, /* do_not_block */ "blit src"); - /* Fallback for buffers. */ - if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) { - util_resource_copy_region(pipe, dst, dst_level, dstx, dsty, dstz, - src, src_level, src_box); - return; - } - - /* - printf("surface copy from %u lvl %u to %u lvl %u: %u,%u,%u to %u,%u,%u %u x %u x %u\n", - src_tex->id, src_level, dst_tex->id, dst_level, - src_box->x, src_box->y, src_box->z, dstx, dsty, dstz, - src_box->width, src_box->height, src_box->depth); - */ - - /* make sure display target resources (which cannot have levels/layers) are mapped */ - if (src_tex->dt) - (void) llvmpipe_resource_map(src, src_level, 0, LP_TEX_USAGE_READ); - if (dst_tex->dt) - /* - * Could set this to WRITE_ALL if complete dst is covered but it gets - * ignored anyway. 
- */ - (void) llvmpipe_resource_map(dst, dst_level, 0, LP_TEX_USAGE_READ_WRITE); - - - /* copy */ - { - const ubyte *src_linear_ptr - = llvmpipe_get_texture_image_address(src_tex, src_box->z, - src_level); - ubyte *dst_linear_ptr - = llvmpipe_get_texture_image_address(dst_tex, dstz, - dst_level); - - if (dst_linear_ptr && src_linear_ptr) { - util_copy_box(dst_linear_ptr, format, - llvmpipe_resource_stride(&dst_tex->base, dst_level), - dst_tex->img_stride[dst_level], - dstx, dsty, 0, - width, height, depth, - src_linear_ptr, - llvmpipe_resource_stride(&src_tex->base, src_level), - src_tex->img_stride[src_level], - src_box->x, src_box->y, 0); - } - } - - if (src_tex->dt) - llvmpipe_resource_unmap(src, 0, 0); - if (dst_tex->dt) - llvmpipe_resource_unmap(dst, 0, 0); - + util_resource_copy_region(pipe, dst, dst_level, dstx, dsty, dstz, + src, src_level, src_box); } @@ -139,11 +82,6 @@ static void lp_blit(struct pipe_context *pipe, return; /* done */ } - if (info.mask & PIPE_MASK_S) { - debug_printf("llvmpipe: cannot blit stencil, skipping\n"); - info.mask &= ~PIPE_MASK_S; - } - if (!util_blitter_is_blit_supported(lp->blitter, &info)) { debug_printf("llvmpipe: blit unsupported %s -> %s\n", util_format_short_name(info.src.resource->format), diff --git a/src/gallium/drivers/nouveau/Android.mk b/src/gallium/drivers/nouveau/Android.mk index 420c8e5734c..daf3abd1bb3 100644 --- a/src/gallium/drivers/nouveau/Android.mk +++ b/src/gallium/drivers/nouveau/Android.mk @@ -39,6 +39,10 @@ LOCAL_SRC_FILES := \ LOCAL_SHARED_LIBRARIES := libdrm libdrm_nouveau LOCAL_MODULE := libmesa_pipe_nouveau +ifeq ($(MESA_LOLLIPOP_BUILD),true) +LOCAL_C_INCLUDES := external/libcxx/include +else include external/stlport/libstlport.mk +endif include $(GALLIUM_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/src/gallium/drivers/nouveau/Makefile.am b/src/gallium/drivers/nouveau/Makefile.am index 0aefc031210..d05f0a17ab4 100644 --- a/src/gallium/drivers/nouveau/Makefile.am +++ b/src/gallium/drivers/nouveau/Makefile.am @@ -48,7 +48,7 @@ nouveau_compiler_SOURCES = \ nouveau_compiler_LDADD = \ libnouveau.la \ - ../../auxiliary/libgallium.la \ + $(top_builddir)/src/gallium/auxiliary/libgallium.la \ $(top_builddir)/src/util/libmesautil.la \ $(GALLIUM_COMMON_LIB_DEPS) diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm index be17871edd4..b9c05a04b9a 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm +++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm @@ -11,7 +11,7 @@ // SIZE: 22 / 14 * 8 bytes // gk110_div_u32: - sched 0x28282804280428 + sched 0x28 0x04 0x28 0x04 0x28 0x28 0x28 bfind u32 $r2 $r1 xor b32 $r2 $r2 0x1f mov b32 $r3 0x1 @@ -19,7 +19,7 @@ gk110_div_u32: cvt u32 $r1 neg u32 $r1 mul $r3 u32 $r1 u32 $r2 add $r2 (mul high u32 $r2 u32 $r3) $r2 - sched 0x28282828282828 + sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28 mul $r3 u32 $r1 u32 $r2 add $r2 (mul high u32 $r2 u32 $r3) $r2 mul $r3 u32 $r1 u32 $r2 @@ -27,7 +27,7 @@ gk110_div_u32: mul $r3 u32 $r1 u32 $r2 add $r2 (mul high u32 $r2 u32 $r3) $r2 mul $r3 u32 $r1 u32 $r2 - sched 0x042c2828042804 + sched 0x04 0x28 0x04 0x28 0x28 0x2c 0x04 add $r2 (mul high u32 $r2 u32 $r3) $r2 mov b32 $r3 $r0 mul high $r0 u32 $r0 u32 $r2 @@ -35,7 +35,7 @@ gk110_div_u32: add $r1 (mul u32 $r1 u32 $r0) $r3 set $p0 0x1 ge u32 $r1 $r2 $p0 sub b32 $r1 $r1 $r2 - sched 0x20282e20042c28 + sched 0x28 0x2c 0x04 0x20 0x2e 0x28 0x20 $p0 add b32 $r0 $r0 0x1 $p0 set $p0 0x1 ge u32 $r1 $r2 $p0 sub b32 $r1 $r1 $r2 @@ -51,7 +51,7 @@ 
gk110_div_u32: gk110_div_s32: set $p2 0x1 lt s32 $r0 0x0 set $p3 0x1 lt s32 $r1 0x0 xor $p2 - sched 0x28042804282820 + sched 0x20 0x28 0x28 0x04 0x28 0x04 0x28 cvt s32 $r0 abs s32 $r0 cvt s32 $r1 abs s32 $r1 bfind u32 $r2 $r1 @@ -59,7 +59,7 @@ gk110_div_s32: mov b32 $r3 0x1 shl b32 $r2 $r3 clamp $r2 cvt u32 $r1 neg u32 $r1 - sched 0x28282828282828 + sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28 mul $r3 u32 $r1 u32 $r2 add $r2 (mul high u32 $r2 u32 $r3) $r2 mul $r3 u32 $r1 u32 $r2 @@ -67,7 +67,7 @@ gk110_div_s32: mul $r3 u32 $r1 u32 $r2 add $r2 (mul high u32 $r2 u32 $r3) $r2 mul $r3 u32 $r1 u32 $r2 - sched 0x28280428042828 + sched 0x28 0x28 0x04 0x28 0x04 0x28 0x28 add $r2 (mul high u32 $r2 u32 $r3) $r2 mul $r3 u32 $r1 u32 $r2 add $r2 (mul high u32 $r2 u32 $r3) $r2 @@ -75,7 +75,7 @@ gk110_div_s32: mul high $r0 u32 $r0 u32 $r2 cvt u32 $r2 neg u32 $r1 add $r1 (mul u32 $r1 u32 $r0) $r3 - sched 0x2028042c28042c + sched 0x2c 0x04 0x28 0x2c 0x04 0x28 0x20 set $p0 0x1 ge u32 $r1 $r2 $p0 sub b32 $r1 $r1 $r2 $p0 add b32 $r0 $r0 0x1 @@ -83,7 +83,7 @@ gk110_div_s32: $p0 sub b32 $r1 $r1 $r2 $p0 add b32 $r0 $r0 0x1 $p3 cvt s32 $r0 neg s32 $r0 - sched 0x2c200428042e04 + sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c $p2 cvt s32 $r1 neg s32 $r1 ret diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp index 6bb9620d5f7..ab8bf2e5504 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp @@ -967,8 +967,8 @@ CodeEmitterGK110::emitSET(const CmpInstruction *i) code[0] = (code[0] & ~0xfc) | ((code[0] << 3) & 0xe0); if (i->defExists(1)) defId(i->def(1), 2); - else - code[0] |= 0x1c; + else + code[0] |= 0x1c; } else { switch (i->sType) { case TYPE_F32: op2 = 0x000; op1 = 0x800; break; @@ -990,8 +990,12 @@ CodeEmitterGK110::emitSET(const CmpInstruction *i) } FTZ_(3a); - if (i->dType == TYPE_F32) - code[1] |= 1 << 23; + if (i->dType == TYPE_F32) { + if (isFloatType(i->sType)) + code[1] |= 1 << 23; + else + code[1] |= 1 << 15; + } } if (i->sType == TYPE_S32) code[1] |= 1 << 19; @@ -1316,6 +1320,8 @@ CodeEmitterGK110::emitFlow(const Instruction *i) } else if (mask & 2) { int32_t pcRel = f->target.bb->binPos - (codeSize + 8); + if (writeIssueDelays && !(f->target.bb->binPos & 0x3f)) + pcRel += 8; // currently we don't want absolute branches assert(!f->absolute); code[0] |= (pcRel & 0x1ff) << 23; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp index 22db368b371..399a6f1db13 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp @@ -509,10 +509,13 @@ CodeEmitterGM107::emitBRA() emitCond5(0x00, CC_TR); if (!insn->srcExists(0) || insn->src(0).getFile() != FILE_MEMORY_CONST) { + int32_t pos = insn->target.bb->binPos; + if (writeIssueDelays && !(pos & 0x1f)) + pos += 8; if (!insn->absolute) - emitField(0x14, 24, insn->target.bb->binPos - (codeSize + 8)); + emitField(0x14, 24, pos - (codeSize + 8)); else - emitField(0x14, 32, insn->target.bb->binPos); + emitField(0x14, 32, pos); } else { emitCBUF (0x24, gpr, 20, 16, 0, insn->src(0)); emitField(0x05, 1, 1); @@ -1827,6 +1830,7 @@ CodeEmitterGM107::emitISET() emitCond3(0x31, insn->setCond); emitField(0x30, 1, isSignedType(insn->sType)); emitCC (0x2f); + emitField(0x2c, 1, insn->dType == TYPE_F32); emitX (0x2b); emitGPR (0x08, insn->src(0)); emitGPR (0x00, 
insn->def(0)); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp index d9aed34a0ce..472e3a84119 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp @@ -1078,8 +1078,14 @@ CodeEmitterNVC0::emitSET(const CmpInstruction *i) if (!isFloatType(i->sType)) lo = 0x3; - if (isFloatType(i->dType) || isSignedIntType(i->sType)) + if (isSignedIntType(i->sType)) lo |= 0x20; + if (isFloatType(i->dType)) { + if (isFloatType(i->sType)) + lo |= 0x20; + else + lo |= 0x80; + } switch (i->op) { case OP_SET_AND: hi = 0x10000000; break; @@ -1406,6 +1412,8 @@ CodeEmitterNVC0::emitFlow(const Instruction *i) } else if (mask & 2) { int32_t pcRel = f->target.bb->binPos - (codeSize + 8); + if (writeIssueDelays && !(f->target.bb->binPos & 0x3f)) + pcRel += 8; // currently we don't want absolute branches assert(!f->absolute); code[0] |= (pcRel & 0x3f) << 26; @@ -2712,7 +2720,6 @@ private: RegScores *score; // for current BB std::vector<RegScores> scoreBoards; - int cycle; int prevData; operation prevOp; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index 254629f907a..ecd115f9807 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -1316,7 +1316,7 @@ private: }; private: - const struct tgsi::Source *code; + const tgsi::Source *code; const struct nv50_ir_prog_info *info; struct { @@ -1356,18 +1356,20 @@ Converter::srcToSym(tgsi::Instruction::SrcRegister src, int c) { const int swz = src.getSwizzle(c); + /* TODO: Use Array ID when it's available for the index */ return makeSym(src.getFile(), src.is2D() ? src.getIndex(1) : 0, - src.isIndirect(0) ? -1 : src.getIndex(0), swz, + src.getIndex(0), swz, src.getIndex(0) * 16 + swz * 4); } Symbol * Converter::dstToSym(tgsi::Instruction::DstRegister dst, int c) { + /* TODO: Use Array ID when it's available for the index */ return makeSym(dst.getFile(), dst.is2D() ? dst.getIndex(1) : 0, - dst.isIndirect(0) ? 
-1 : dst.getIndex(0), c, + dst.getIndex(0), c, dst.getIndex(0) * 16 + c * 4); } @@ -1604,19 +1606,8 @@ Converter::storeDst(int d, int c, Value *val) { const tgsi::Instruction::DstRegister dst = tgsi.getDst(d); - switch (tgsi.getSaturate()) { - case TGSI_SAT_NONE: - break; - case TGSI_SAT_ZERO_ONE: + if (tgsi.getSaturate()) { mkOp1(OP_SAT, dstTy, val, val); - break; - case TGSI_SAT_MINUS_PLUS_ONE: - mkOp2(OP_MAX, dstTy, val, val, mkImm(-1.0f)); - mkOp2(OP_MIN, dstTy, val, val, mkImm(+1.0f)); - break; - default: - assert(!"invalid saturation mode"); - break; } Value *ptr = NULL; @@ -1955,13 +1946,13 @@ isResourceSpecial(const int r) } static inline bool -isResourceRaw(const struct tgsi::Source *code, const int r) +isResourceRaw(const tgsi::Source *code, const int r) { return isResourceSpecial(r) || code->resources[r].raw; } static inline nv50_ir::TexTarget -getResourceTarget(const struct tgsi::Source *code, int r) +getResourceTarget(const tgsi::Source *code, int r) { if (isResourceSpecial(r)) return nv50_ir::TEX_TARGET_BUFFER; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp index 64989ac8846..596ac95d489 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp @@ -240,6 +240,7 @@ GM107LoweringPass::visit(Instruction *i) Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), i->getIndirect(0, 0), bld.mkImm(4)); i->setIndirect(0, 0, ptr); + i->op = OP_VFETCH; } else { i->op = OP_VFETCH; assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp index 1ad086094dc..2c7f7e326b2 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp @@ -887,7 +887,7 @@ NV50LoweringPreSSA::handleTXL(TexInstruction *i) } } bld.setPosition(joinBB, false); - bld.mkOp(OP_JOIN, TYPE_NONE, NULL); + bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1; return true; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index b61f3c49bb9..7a5d1ce0299 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -100,8 +100,7 @@ void NVC0LegalizeSSA::handleFTZ(Instruction *i) { // Only want to flush float inputs - if (i->sType != TYPE_F32) - return; + assert(i->sType == TYPE_F32); // If we're already flushing denorms (and NaN's) to zero, no need for this. 
if (i->dnz) @@ -129,7 +128,7 @@ NVC0LegalizeSSA::visit(BasicBlock *bb) Instruction *next; for (Instruction *i = bb->getEntry(); i; i = next) { next = i->next; - if (i->dType == TYPE_F32) { + if (i->sType == TYPE_F32) { if (prog->getType() != Program::TYPE_COMPUTE) handleFTZ(i); continue; } @@ -169,7 +168,7 @@ NVC0LegalizePostRA::insnDominatedBy(const Instruction *later, void NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses, - Instruction *usei, const Instruction *insn) + Instruction *usei, const Instruction *texi) { bool add = true; for (std::list<TexUse>::iterator it = uses.begin(); @@ -184,7 +183,7 @@ NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses, ++it; } if (add) - uses.push_back(TexUse(usei, insn)); + uses.push_back(TexUse(usei, texi)); } void @@ -196,7 +195,8 @@ NVC0LegalizePostRA::findOverwritingDefs(const Instruction *texi, while (insn->op == OP_MOV && insn->getDef(0)->equals(insn->getSrc(0))) insn = insn->getSrc(0)->getUniqueInsn(); - if (!insn->bb->reachableBy(texi->bb, term)) + // NOTE: the tex itself is, of course, not an overwriting definition + if (insn == texi || !insn->bb->reachableBy(texi->bb, term)) return; switch (insn->op) { @@ -244,7 +244,12 @@ NVC0LegalizePostRA::findFirstUses( visited.insert(usei); if (usei->op == OP_PHI || usei->op == OP_UNION) { - // need a barrier before WAW cases + // need a barrier before WAW cases, like: + // %r0 = tex + // if ... + // texbar <- is required or tex might replace x again + // %r1 = x <- overwriting def + // %r2 = phi %r0, %r1 for (int s = 0; usei->srcExists(s); ++s) { Instruction *defi = usei->getSrc(s)->getUniqueInsn(); if (defi && &usei->src(s) != *u) @@ -263,7 +268,7 @@ NVC0LegalizePostRA::findFirstUses( usei->subOp != NV50_IR_SUBOP_MOV_FINAL) { findFirstUses(texi, usei, uses, visited); } else { - addTexUse(uses, usei, insn); + addTexUse(uses, usei, texi); } } } @@ -1751,6 +1756,7 @@ NVC0LoweringPass::visit(Instruction *i) Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), i->getIndirect(0, 0), bld.mkImm(4)); i->setIndirect(0, 0, ptr); + i->op = OP_VFETCH; } else { i->op = OP_VFETCH; assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 14446b6b53f..ae739eeda83 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -236,6 +236,9 @@ LoadPropagation::visit(BasicBlock *bb) if (i->op == OP_CALL) // calls have args as sources, they must be in regs continue; + if (i->op == OP_PFETCH) // pfetch expects arg1 to be a reg + continue; + if (i->srcExists(1)) checkSwapSrc01(i); @@ -278,7 +281,6 @@ private: void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&); - // TGSI 'true' is converted to -1 by F2I(NEG(SET)), track back to SET CmpInstruction *findOriginForTestWithZero(Value *); unsigned int foldCount; @@ -337,25 +339,33 @@ ConstantFolding::findOriginForTestWithZero(Value *value) return NULL; Instruction *insn = value->getInsn(); - while (insn && insn->op != OP_SET) { - Instruction *next = NULL; - switch (insn->op) { - case OP_NEG: - case OP_ABS: - case OP_CVT: - next = insn->getSrc(0)->getInsn(); - if (insn->sType != next->dType) + if (insn->asCmp() && insn->op != OP_SLCT) + return insn->asCmp(); + + /* Sometimes movs will sneak in as a result of other folding. This gets + * cleaned up later. 
+ */ + if (insn->op == OP_MOV) + return findOriginForTestWithZero(insn->getSrc(0)); + + /* Deal with AND 1.0 here since nv50 can't fold into boolean float */ + if (insn->op == OP_AND) { + int s = 0; + ImmediateValue imm; + if (!insn->src(s).getImmediate(imm)) { + s = 1; + if (!insn->src(s).getImmediate(imm)) return NULL; - break; - case OP_MOV: - next = insn->getSrc(0)->getInsn(); - break; - default: - return NULL; } - insn = next; + if (imm.reg.data.f32 != 1.0f) + return NULL; + /* TODO: Come up with a way to handle the condition being inverted */ + if (insn->src(!s).mod != Modifier(0)) + return NULL; + return findOriginForTestWithZero(insn->getSrc(!s)); } - return insn ? insn->asCmp() : NULL; + + return NULL; } void @@ -574,6 +584,11 @@ ConstantFolding::expr(Instruction *i, case OP_POPCNT: res.data.u32 = util_bitcount(a->data.u32 & b->data.u32); break; + case OP_PFETCH: + // The two arguments to pfetch are logically added together. Normally + // the second argument will not be constant, but that can happen. + res.data.u32 = a->data.u32 + b->data.u32; + break; default: return; } @@ -588,7 +603,9 @@ ConstantFolding::expr(Instruction *i, i->getSrc(0)->reg.data = res.data; - if (i->op == OP_MAD || i->op == OP_FMA) { + switch (i->op) { + case OP_MAD: + case OP_FMA: { i->op = OP_ADD; i->setSrc(1, i->getSrc(0)); @@ -603,8 +620,14 @@ ConstantFolding::expr(Instruction *i, bld.setPosition(i, false); i->setSrc(1, bld.loadImm(NULL, res.data.u32)); } - } else { + break; + } + case OP_PFETCH: + // Leave PFETCH alone... we just folded its 2 args into 1. + break; + default: i->op = i->saturate ? OP_SAT : OP_MOV; /* SAT handled by unary() */ + break; } i->subOp = 0; } @@ -946,33 +969,82 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) case OP_SET: // TODO: SET_AND,OR,XOR { + /* This optimizes the case where the output of a set is being compared + * to zero. Since the set can only produce 0/-1 (int) or 0/1 (float), we + * can be a lot cleverer in our comparison. + */ CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t)); CondCode cc, ccZ; - if (i->src(t).mod != Modifier(0)) - return; - if (imm0.reg.data.u32 != 0 || !si || si->op != OP_SET) + if (imm0.reg.data.u32 != 0 || !si) return; cc = si->setCond; ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U); + // We do everything assuming var (cmp) 0, reverse the condition if 0 is + // first. if (s == 0) ccZ = reverseCondCode(ccZ); + // If there is a negative modifier, we need to undo that, by flipping + // the comparison to zero. + if (i->src(t).mod.neg()) + ccZ = reverseCondCode(ccZ); + // If this is a signed comparison, we expect the input to be a regular + // boolean, i.e. 0/-1. However the rest of the logic assumes that true + // is positive, so just flip the sign. 
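
The switch that follows encodes these folding rules. A tiny self-contained check in C, assuming the boolean has already been normalised so that true is positive (0/1), as the sign flip above arranges:

#include <assert.h>

static void check_set_vs_zero_fold(void)
{
    for (int b = 0; b <= 1; ++b) {   /* b is the result of a SET: 0 or 1 */
        assert(!(b < 0));            /* bool <  0: never true  -> CC_FL  */
        assert(b >= 0);              /* bool >= 0: always true -> CC_TR  */
        assert((b == 0) == !b);      /* bool == 0: !bool -> inverse cond */
        assert((b != 0) ==  b);      /* bool != 0:  bool -> keep cond    */
    }
}
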
+ if (i->sType == TYPE_S32) { + assert(!isFloatType(si->dType)); + ccZ = reverseCondCode(ccZ); + } switch (ccZ) { - case CC_LT: cc = CC_FL; break; - case CC_GE: cc = CC_TR; break; - case CC_EQ: cc = inverseCondCode(cc); break; - case CC_LE: cc = inverseCondCode(cc); break; - case CC_GT: break; - case CC_NE: break; + case CC_LT: cc = CC_FL; break; // bool < 0 -- this is never true + case CC_GE: cc = CC_TR; break; // bool >= 0 -- this is always true + case CC_EQ: cc = inverseCondCode(cc); break; // bool == 0 -- !bool + case CC_LE: cc = inverseCondCode(cc); break; // bool <= 0 -- !bool + case CC_GT: break; // bool > 0 -- bool + case CC_NE: break; // bool != 0 -- bool default: return; } + + // Update the condition of this SET to be identical to the origin set, + // but with the updated condition code. The original SET should get + // DCE'd, ideally. + i->op = si->op; i->asCmp()->setCond = cc; i->setSrc(0, si->src(0)); i->setSrc(1, si->src(1)); + if (si->srcExists(2)) + i->setSrc(2, si->src(2)); i->sType = si->sType; } break; + case OP_AND: + { + CmpInstruction *cmp = i->getSrc(t)->getInsn()->asCmp(); + if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1) + return; + if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32)) + return; + if (imm0.reg.data.f32 != 1.0) + return; + if (i->getSrc(t)->getInsn()->dType != TYPE_U32) + return; + + i->getSrc(t)->getInsn()->dType = TYPE_F32; + if (i->src(t).mod != Modifier(0)) { + assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT)); + i->src(t).mod = Modifier(0); + cmp->setCond = inverseCondCode(cmp->setCond); + } + i->op = OP_MOV; + i->setSrc(s, NULL); + if (t) { + i->setSrc(0, i->getSrc(t)); + i->setSrc(t, NULL); + } + } + break; + case OP_SHL: { if (s != 1 || i->src(0).mod != Modifier(0)) @@ -2216,7 +2288,7 @@ FlatteningPass::visit(BasicBlock *bb) insn->op != OP_LINTERP && // probably just nve4 insn->op != OP_PINTERP && // probably just nve4 ((insn->op != OP_LOAD && insn->op != OP_STORE) || - typeSizeof(insn->dType) <= 4) && + (typeSizeof(insn->dType) <= 4 && !insn->src(0).isIndirect(0))) && !insn->isNop()) { insn->join = 1; bb->remove(bb->getExit()); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp index 178a1671c3f..ca545a6024a 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp @@ -84,7 +84,7 @@ static const struct opProperties _initProps[] = // neg abs not sat c[] s[], a[], imm { OP_ADD, 0x3, 0x0, 0x0, 0x8, 0x2, 0x1, 0x1, 0x2 }, { OP_SUB, 0x3, 0x0, 0x0, 0x8, 0x2, 0x1, 0x1, 0x2 }, - { OP_MUL, 0x3, 0x0, 0x0, 0x8, 0x2, 0x1, 0x1, 0x2 }, + { OP_MUL, 0x3, 0x0, 0x0, 0x0, 0x2, 0x1, 0x1, 0x2 }, { OP_MAX, 0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 }, { OP_MIN, 0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 }, { OP_MAD, 0x7, 0x0, 0x0, 0x8, 0x6, 0x1, 0x1, 0x0 }, // special constraint @@ -188,6 +188,9 @@ void TargetNV50::initOpInfo() if (prop->mSat & 8) opInfo[prop->op].dstMods = NV50_IR_MOD_SAT; } + + if (chipset >= 0xa0) + opInfo[OP_MUL].dstMods = NV50_IR_MOD_SAT; } unsigned int @@ -413,6 +416,8 @@ TargetNV50::isOpSupported(operation op, DataType ty) const return false; case OP_SAD: return ty == TYPE_S32; + case OP_SET: + return !isFloatType(ty); default: return true; } diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.c b/src/gallium/drivers/nouveau/nouveau_buffer.c index 32fa65c8a51..09cdbb53ecb 100644 --- a/src/gallium/drivers/nouveau/nouveau_buffer.c +++ 
b/src/gallium/drivers/nouveau/nouveau_buffer.c @@ -658,13 +658,13 @@ nouveau_buffer_create(struct pipe_screen *pscreen, switch (buffer->base.usage) { case PIPE_USAGE_DEFAULT: case PIPE_USAGE_IMMUTABLE: - buffer->domain = NOUVEAU_BO_VRAM; + buffer->domain = NV_VRAM_DOMAIN(screen); break; case PIPE_USAGE_DYNAMIC: /* For most apps, we'd have to do staging transfers to avoid sync * with this usage, and GART -> GART copies would be suboptimal. */ - buffer->domain = NOUVEAU_BO_VRAM; + buffer->domain = NV_VRAM_DOMAIN(screen); break; case PIPE_USAGE_STAGING: case PIPE_USAGE_STREAM: @@ -676,7 +676,7 @@ nouveau_buffer_create(struct pipe_screen *pscreen, } } else { if (buffer->base.bind & screen->vidmem_bindings) - buffer->domain = NOUVEAU_BO_VRAM; + buffer->domain = NV_VRAM_DOMAIN(screen); else if (buffer->base.bind & screen->sysmem_bindings) buffer->domain = NOUVEAU_BO_GART; diff --git a/src/gallium/drivers/nouveau/nouveau_heap.h b/src/gallium/drivers/nouveau/nouveau_heap.h index d0b22844ad0..a3d64a65623 100644 --- a/src/gallium/drivers/nouveau/nouveau_heap.h +++ b/src/gallium/drivers/nouveau/nouveau_heap.h @@ -23,6 +23,26 @@ #ifndef __NOUVEAU_HEAP_H__ #define __NOUVEAU_HEAP_H__ +/* This datastructure represents a memory allocation heap. Fundamentally, this + * is a doubly-linked list with a few properties, and a usage convention. + * + * On initial allocation, there is a single node with the full size that's + * marked as not in-use. As allocations are made, blocks are taken off the end + * of that first node, and inserted right after it. If the first node doesn't + * have enough free space, we look for free space down in the rest of the + * list. This can happen if an allocation is made and then freed. + * + * The first node will remain with in_use == 0 even if the whole heap is + * exhausted. Another invariant is that there will never be two sequential + * in_use == 0 nodes. If a node is freed and it has one (or both) adjacent + * free nodes, they are merged into one, and the relevant heap entries are + * freed. + * + * The pattern to free the whole heap is to start with the first node and then + * just free the "next" node, until there is no next node. This should assure + * that at the end the first (and only) node is not in use and contains the + * full size of the heap. 
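
A short usage sketch of the convention this comment describes, using the nouveau_heap API exactly as it appears elsewhere in this patch (nouveau_heap_init/alloc/free); the sizes are illustrative only:

static void heap_usage_sketch(void)
{
    struct nouveau_heap *heap = NULL, *a = NULL, *b = NULL;

    nouveau_heap_init(&heap, 0, 1 << 20);      /* one free node, full size   */
    nouveau_heap_alloc(heap, 0x100, NULL, &a); /* carved off the first node  */
    nouveau_heap_alloc(heap, 0x200, NULL, &b); /* inserted right after it    */
    nouveau_heap_free(&a);                     /* merges adjacent free nodes */

    /* Tear-down pattern from the comment: free "next" until only the
     * first (free) node, spanning the whole heap, remains. */
    while (heap->next)
        nouveau_heap_free(&heap->next);
}
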
+ */ struct nouveau_heap { struct nouveau_heap *prev; struct nouveau_heap *next; diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c index b4f1413fd8b..c6e5074db19 100644 --- a/src/gallium/drivers/nouveau/nouveau_screen.c +++ b/src/gallium/drivers/nouveau/nouveau_screen.c @@ -164,6 +164,16 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev) size = sizeof(nvc0_data); } + /* + * Set default VRAM domain if not overridden + */ + if (!screen->vram_domain) { + if (dev->vram_size > 0) + screen->vram_domain = NOUVEAU_BO_VRAM; + else + screen->vram_domain = NOUVEAU_BO_GART; + } + ret = nouveau_object_new(&dev->object, 0, NOUVEAU_FIFO_CHANNEL_CLASS, data, size, &screen->channel); if (ret) diff --git a/src/gallium/drivers/nouveau/nouveau_screen.h b/src/gallium/drivers/nouveau/nouveau_screen.h index cf06f7e88aa..30041b271c9 100644 --- a/src/gallium/drivers/nouveau/nouveau_screen.h +++ b/src/gallium/drivers/nouveau/nouveau_screen.h @@ -51,6 +51,8 @@ struct nouveau_screen { boolean hint_buf_keep_sysmem_copy; + unsigned vram_domain; + struct { unsigned profiles_checked; unsigned profiles_present; @@ -94,6 +96,8 @@ struct nouveau_screen { #endif }; +#define NV_VRAM_DOMAIN(screen) ((screen)->vram_domain) + #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS # define NOUVEAU_DRV_STAT(s, n, v) do { \ (s)->stats.named.n += (v); \ diff --git a/src/gallium/drivers/nouveau/nv30/nv30_clear.c b/src/gallium/drivers/nouveau/nv30/nv30_clear.c index 1ab8929cc38..83fd1fa38dd 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_clear.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_clear.c @@ -58,7 +58,7 @@ nv30_clear(struct pipe_context *pipe, unsigned buffers, struct pipe_framebuffer_state *fb = &nv30->framebuffer; uint32_t colr = 0, zeta = 0, mode = 0; - if (!nv30_state_validate(nv30, TRUE)) + if (!nv30_state_validate(nv30, NV30_NEW_FRAMEBUFFER | NV30_NEW_SCISSOR, TRUE)) return; if (buffers & PIPE_CLEAR_COLOR && fb->nr_cbufs) { diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.h b/src/gallium/drivers/nouveau/nv30/nv30_context.h index 7b32aaee936..592cdbe24f9 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_context.h +++ b/src/gallium/drivers/nouveau/nv30/nv30_context.h @@ -204,7 +204,7 @@ void nv30_render_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info); boolean -nv30_state_validate(struct nv30_context *nv30, boolean hwtnl); +nv30_state_validate(struct nv30_context *nv30, uint32_t mask, boolean hwtnl); void nv30_state_release(struct nv30_context *nv30); diff --git a/src/gallium/drivers/nouveau/nv30/nv30_draw.c b/src/gallium/drivers/nouveau/nv30/nv30_draw.c index 3575c3d29fa..c1665b7ad2f 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_draw.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_draw.c @@ -71,12 +71,12 @@ nv30_render_allocate_vertices(struct vbuf_render *render, struct nv30_render *r = nv30_render(render); struct nv30_context *nv30 = r->nv30; - r->length = vertex_size * nr_vertices; + r->length = (uint32_t)vertex_size * (uint32_t)nr_vertices; if (r->offset + r->length >= render->max_vertex_buffer_bytes) { pipe_resource_reference(&r->buffer, NULL); r->buffer = pipe_buffer_create(&nv30->screen->base.base, - PIPE_BIND_VERTEX_BUFFER, 0, + PIPE_BIND_VERTEX_BUFFER, PIPE_USAGE_STREAM, render->max_vertex_buffer_bytes); if (!r->buffer) return FALSE; @@ -91,10 +91,14 @@ static void * nv30_render_map_vertices(struct vbuf_render *render) { struct nv30_render *r = nv30_render(render); - char *map = 
pipe_buffer_map(&r->nv30->base.pipe, r->buffer, - PIPE_TRANSFER_WRITE | - PIPE_TRANSFER_UNSYNCHRONIZED, &r->transfer); - return map + r->offset; + char *map = pipe_buffer_map_range( + &r->nv30->base.pipe, r->buffer, + r->offset, r->length, + PIPE_TRANSFER_WRITE | + PIPE_TRANSFER_DISCARD_RANGE, + &r->transfer); + assert(map); + return map; } static void @@ -103,6 +107,7 @@ nv30_render_unmap_vertices(struct vbuf_render *render, { struct nv30_render *r = nv30_render(render); pipe_buffer_unmap(&r->nv30->base.pipe, r->transfer); + r->transfer = NULL; } static void @@ -126,10 +131,10 @@ nv30_render_draw_elements(struct vbuf_render *render, for (i = 0; i < r->vertex_info.num_attribs; i++) { PUSH_RESRC(push, NV30_3D(VTXBUF(i)), BUFCTX_VTXTMP, nv04_resource(r->buffer), r->offset + r->vtxptr[i], - NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, 0); + NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, NV30_3D_VTXBUF_DMA1); } - if (!nv30_state_validate(nv30, FALSE)) + if (!nv30_state_validate(nv30, ~0, FALSE)) return; BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1); @@ -171,10 +176,10 @@ nv30_render_draw_arrays(struct vbuf_render *render, unsigned start, uint nr) for (i = 0; i < r->vertex_info.num_attribs; i++) { PUSH_RESRC(push, NV30_3D(VTXBUF(i)), BUFCTX_VTXTMP, nv04_resource(r->buffer), r->offset + r->vtxptr[i], - NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, 0); + NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, NV30_3D_VTXBUF_DMA1); } - if (!nv30_state_validate(nv30, FALSE)) + if (!nv30_state_validate(nv30, ~0, FALSE)) return; BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1); @@ -213,22 +218,24 @@ static const struct { [TGSI_SEMANTIC_BCOLOR ] = { EMIT_4F, INTERP_LINEAR , 1, 3, 0x00000004 }, [TGSI_SEMANTIC_FOG ] = { EMIT_4F, INTERP_PERSPECTIVE, 5, 5, 0x00000010 }, [TGSI_SEMANTIC_PSIZE ] = { EMIT_1F_PSIZE, INTERP_POS , 6, 6, 0x00000020 }, - [TGSI_SEMANTIC_GENERIC ] = { EMIT_4F, INTERP_PERSPECTIVE, 8, 7, 0x00004000 } + [TGSI_SEMANTIC_TEXCOORD] = { EMIT_4F, INTERP_PERSPECTIVE, 8, 7, 0x00004000 }, }; static boolean vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx) { - struct pipe_screen *pscreen = &r->nv30->screen->base.base; + struct nv30_screen *screen = r->nv30->screen; struct nv30_fragprog *fp = r->nv30->fragprog.program; struct vertex_info *vinfo = &r->vertex_info; enum pipe_format format; uint emit = EMIT_OMIT; uint result = *idx; - if (sem == TGSI_SEMANTIC_GENERIC && result >= 8) { - for (result = 0; result < 8; result++) { - if (fp->texcoord[result] == *idx) { + if (sem == TGSI_SEMANTIC_GENERIC) { + uint num_texcoords = (screen->eng3d->oclass < NV40_3D_CLASS) ? 
8 : 10; + for (result = 0; result < num_texcoords; result++) { + if (fp->texcoord[result] == *idx + 8) { + sem = TGSI_SEMANTIC_TEXCOORD; emit = vroute[sem].emit; break; } @@ -243,11 +250,11 @@ vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx) draw_emit_vertex_attr(vinfo, emit, vroute[sem].interp, attrib); format = draw_translate_vinfo_format(emit); - r->vtxfmt[attrib] = nv30_vtxfmt(pscreen, format)->hw; - r->vtxptr[attrib] = vinfo->size | NV30_3D_VTXBUF_DMA1; + r->vtxfmt[attrib] = nv30_vtxfmt(&screen->base.base, format)->hw; + r->vtxptr[attrib] = vinfo->size; vinfo->size += draw_translate_vinfo_size(emit); - if (nv30_screen(pscreen)->eng3d->oclass < NV40_3D_CLASS) { + if (screen->eng3d->oclass < NV40_3D_CLASS) { r->vtxprog[attrib][0] = 0x001f38d8; r->vtxprog[attrib][1] = 0x0080001b | (attrib << 9); r->vtxprog[attrib][2] = 0x0836106c; @@ -259,7 +266,12 @@ vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx) r->vtxprog[attrib][3] = 0x6041ff80 | (result + vroute[sem].vp40) << 2; } - *idx = vroute[sem].ow40 << result; + if (result < 8) + *idx = vroute[sem].ow40 << result; + else { + assert(sem == TGSI_SEMANTIC_TEXCOORD); + *idx = 0x00001000 << (result - 8); + } return TRUE; } @@ -313,7 +325,7 @@ nv30_render_validate(struct nv30_context *nv30) while (pntc && attrib < 16) { uint index = ffs(pntc) - 1; pntc &= ~(1 << index); - if (vroute_add(r, attrib, TGSI_SEMANTIC_GENERIC, &index)) { + if (vroute_add(r, attrib, TGSI_SEMANTIC_TEXCOORD, &index)) { vp_attribs |= (1 << attrib++); vp_results |= index; } @@ -398,17 +410,17 @@ nv30_render_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) if (nv30->vertprog.constbuf) { void *map = nv04_resource(nv30->vertprog.constbuf)->data; draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX, 0, - map, nv30->vertprog.constbuf_nr); + map, nv30->vertprog.constbuf_nr * 16); + } else { + draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX, 0, NULL, 0); } } for (i = 0; i < nv30->num_vtxbufs; i++) { const void *map = nv30->vtxbuf[i].user_buffer; if (!map) { - if (!nv30->vtxbuf[i].buffer) { - continue; - } - map = pipe_buffer_map(pipe, nv30->vtxbuf[i].buffer, + if (nv30->vtxbuf[i].buffer) + map = pipe_buffer_map(pipe, nv30->vtxbuf[i].buffer, PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_READ, &transfer[i]); } @@ -418,9 +430,9 @@ nv30_render_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) if (info->indexed) { const void *map = nv30->idxbuf.user_buffer; if (!map) - pipe_buffer_map(pipe, nv30->idxbuf.buffer, - PIPE_TRANSFER_UNSYNCHRONIZED | - PIPE_TRANSFER_READ, &transferi); + map = pipe_buffer_map(pipe, nv30->idxbuf.buffer, + PIPE_TRANSFER_UNSYNCHRONIZED | + PIPE_TRANSFER_READ, &transferi); draw_set_indexes(draw, (ubyte *) map + nv30->idxbuf.offset, nv30->idxbuf.index_size, ~0); @@ -444,6 +456,12 @@ nv30_render_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) static void nv30_render_destroy(struct vbuf_render *render) { + struct nv30_render *r = nv30_render(render); + + if (r->transfer) + pipe_buffer_unmap(&r->nv30->base.pipe, r->transfer); + pipe_resource_reference(&r->buffer, NULL); + nouveau_heap_free(&r->vertprog); FREE(render); } diff --git a/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c index a05bfe10ee9..7f227868f73 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c @@ -23,6 +23,7 @@ * */ +#include "draw/draw_context.h" #include "tgsi/tgsi_parse.h" #include 
"nv_object.xml.h" @@ -147,8 +148,12 @@ nv30_fp_state_delete(struct pipe_context *pipe, void *hwcso) pipe_resource_reference(&fp->buffer, NULL); + if (fp->draw) + draw_delete_fragment_shader(nv30_context(pipe)->draw, fp->draw); + FREE((void *)fp->pipe.tokens); FREE(fp->insn); + FREE(fp->consts); FREE(fp); } diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c index eeb714864e2..2e38a1978ae 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c @@ -161,6 +161,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_POLYGON_OFFSET_CLAMP: case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: return 0; case PIPE_CAP_VENDOR_ID: @@ -251,6 +252,7 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: return 0; default: debug_printf("unknown vertex shader param %d\n", param); @@ -291,6 +293,7 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: return 0; default: debug_printf("unknown fragment shader param %d\n", param); @@ -523,7 +526,7 @@ nv30_screen_create(struct nouveau_device *dev) ret = nouveau_bo_wrap(screen->base.device, fifo->notify, &screen->notify); if (ret == 0) - nouveau_bo_map(screen->notify, 0, screen->base.client); + ret = nouveau_bo_map(screen->notify, 0, screen->base.client); if (ret) FAIL_SCREEN_INIT("error mapping notifier memory: %d\n", ret); diff --git a/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c b/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c index 0f9d19dd68e..a954dcce562 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c @@ -272,15 +272,13 @@ nv30_validate_clip(struct nv30_context *nv30) uint32_t clpd_enable = 0; for (i = 0; i < 6; i++) { - if (nv30->rast->pipe.clip_plane_enable & (1 << i)) { - if (nv30->dirty & NV30_NEW_CLIP) { - BEGIN_NV04(push, NV30_3D(VP_UPLOAD_CONST_ID), 5); - PUSH_DATA (push, i); - PUSH_DATAp(push, nv30->clip.ucp[i], 4); - } - - clpd_enable |= 1 << (1 + 4*i); + if (nv30->dirty & NV30_NEW_CLIP) { + BEGIN_NV04(push, NV30_3D(VP_UPLOAD_CONST_ID), 5); + PUSH_DATA (push, i); + PUSH_DATAp(push, nv30->clip.ucp[i], 4); } + if (nv30->rast->pipe.clip_plane_enable & (1 << i)) + clpd_enable |= 2 << (4*i); } BEGIN_NV04(push, NV30_3D(VP_CLIP_PLANES_ENABLE), 1); @@ -389,7 +387,7 @@ static struct state_validate hwtnl_validate_list[] = { { nv30_validate_stipple, NV30_NEW_STIPPLE }, { nv30_validate_scissor, NV30_NEW_SCISSOR | NV30_NEW_RASTERIZER }, { nv30_validate_viewport, NV30_NEW_VIEWPORT }, - { nv30_validate_clip, NV30_NEW_CLIP }, + { nv30_validate_clip, NV30_NEW_CLIP | NV30_NEW_RASTERIZER }, { nv30_fragprog_validate, NV30_NEW_FRAGPROG | NV30_NEW_FRAGCONST }, { nv30_vertprog_validate, NV30_NEW_VERTPROG | NV30_NEW_VERTCONST | NV30_NEW_FRAGPROG | NV30_NEW_RASTERIZER }, @@ -456,7 +454,7 @@ nv30_state_context_switch(struct nv30_context *nv30) } boolean -nv30_state_validate(struct nv30_context *nv30, boolean hwtnl) +nv30_state_validate(struct 
nv30_context *nv30, uint32_t mask, boolean hwtnl) { struct nouveau_screen *screen = &nv30->screen->base; struct nouveau_pushbuf *push = nv30->base.pushbuf; @@ -481,14 +479,16 @@ nv30_state_validate(struct nv30_context *nv30, boolean hwtnl) else validate = swtnl_validate_list; - if (nv30->dirty) { + mask &= nv30->dirty; + + if (mask) { while (validate->func) { - if (nv30->dirty & validate->mask) + if (mask & validate->mask) validate->func(nv30); validate++; } - nv30->dirty = 0; + nv30->dirty &= ~mask; } nouveau_pushbuf_bufctx(push, bctx); diff --git a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c index 67ab8295218..d4e384b21d2 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c @@ -564,7 +564,7 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) if (nv30->vbo_user && !(nv30->dirty & (NV30_NEW_VERTEX | NV30_NEW_ARRAYS))) nv30_update_user_vbufs(nv30); - nv30_state_validate(nv30, TRUE); + nv30_state_validate(nv30, ~0, TRUE); if (nv30->draw_flags) { nv30_render_vbo(pipe, info); return; diff --git a/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c b/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c index 3c1b7e714ea..4d4145d10b5 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c @@ -23,6 +23,7 @@ * */ +#include "draw/draw_context.h" #include "util/u_dynarray.h" #include "tgsi/tgsi_parse.h" @@ -237,6 +238,10 @@ nv30_vp_state_delete(struct pipe_context *pipe, void *hwcso) if (vp->translated) nv30_vertprog_destroy(vp); + + if (vp->draw) + draw_delete_vertex_shader(nv30_context(pipe)->draw, vp->draw); + FREE((void *)vp->pipe.tokens); FREE(vp); } diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c index bbdca8102f0..9ef16965f39 100644 --- a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c +++ b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c @@ -327,6 +327,8 @@ nv40_fp_rep(struct nvfx_fpc *fpc, unsigned count, unsigned target) //util_dynarray_append(&fpc->loop_stack, unsigned, target); } +#if 0 +/* documentation only */ /* warning: this only works forward, and probably only if not inside any IF */ static void nv40_fp_bra(struct nvfx_fpc *fpc, unsigned target) @@ -352,6 +354,7 @@ nv40_fp_bra(struct nvfx_fpc *fpc, unsigned target) reloc.location = fpc->inst_offset + 3; util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc); } +#endif static void nv40_fp_brk(struct nvfx_fpc *fpc) @@ -528,7 +531,7 @@ nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc, dst = tgsi_dst(fpc, &finst->Dst[0]); mask = tgsi_mask(finst->Dst[0].Register.WriteMask); - sat = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE); + sat = finst->Instruction.Saturate; switch (finst->Instruction.Opcode) { case TGSI_OPCODE_ABS: @@ -1201,17 +1204,3 @@ out_err: tgsi_dump(fp->pipe.tokens, 0); goto out; } - -static inline void -nvfx_fp_memcpy(void* dst, const void* src, size_t len) -{ -#ifndef PIPE_ARCH_BIG_ENDIAN - memcpy(dst, src, len); -#else - size_t i; - for(i = 0; i < len; i += 4) { - uint32_t v = *(uint32_t*)((char*)src + i); - *(uint32_t*)((char*)dst + i) = (v >> 16) | (v << 16); - } -#endif -} diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c index 29d506b6e9b..1ce0589be71 100644 --- a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c +++ b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c @@ -539,7 +539,7 @@ 
nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc, final_dst = dst = tgsi_dst(vpc, &finst->Dst[0]); mask = tgsi_mask(finst->Dst[0].Register.WriteMask); - if(finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE) { + if(finst->Instruction.Saturate) { assert(finst->Instruction.Opcode != TGSI_OPCODE_ARL); if (vpc->is_nv4x) sat = TRUE; @@ -796,7 +796,7 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc, return FALSE; } - if(finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE && !vpc->is_nv4x) { + if(finst->Instruction.Saturate && !vpc->is_nv4x) { if (!vpc->r_0_1.type) vpc->r_0_1 = constant(vpc, -1, 0, 1, 0, 0); nvfx_vp_emit(vpc, arith(0, VEC, MAX, dst, mask, nvfx_src(dst), swz(nvfx_src(vpc->r_0_1), X, X, X, X), none)); @@ -872,9 +872,8 @@ nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc, } break; case TGSI_SEMANTIC_EDGEFLAG: - /* not really an error just a fallback */ - NOUVEAU_ERR("cannot handle edgeflag output\n"); - return FALSE; + vpc->r_result[idx] = nvfx_reg(NVFXSR_NONE, 0); + return TRUE; default: NOUVEAU_ERR("bad output semantic\n"); return FALSE; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c index 2cfd5db5ea0..5b5d3912c20 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_context.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c @@ -138,8 +138,11 @@ nv50_destroy(struct pipe_context *pipe) { struct nv50_context *nv50 = nv50_context(pipe); - if (nv50_context_screen(nv50)->cur_ctx == nv50) - nv50_context_screen(nv50)->cur_ctx = NULL; + if (nv50->screen->cur_ctx == nv50) { + nv50->screen->cur_ctx = NULL; + /* Save off the state in case another context gets created */ + nv50->screen->save_state = nv50->state; + } nouveau_pushbuf_bufctx(nv50->base.pushbuf, NULL); nouveau_pushbuf_kick(nv50->base.pushbuf, nv50->base.pushbuf->channel); @@ -290,6 +293,10 @@ nv50_create(struct pipe_screen *pscreen, void *priv) pipe->get_sample_position = nv50_context_get_sample_position; if (!screen->cur_ctx) { + /* Restore the last context's state here, normally handled during + * context switch + */ + nv50->state = screen->save_state; screen->cur_ctx = nv50; nouveau_pushbuf_bufctx(screen->base.pushbuf, nv50->bufctx); } diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h index 45eb554eb4f..1f123ef7e92 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_context.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h @@ -104,28 +104,7 @@ struct nv50_context { uint32_t dirty; boolean cb_dirty; - struct { - uint32_t instance_elts; /* bitmask of per-instance elements */ - uint32_t instance_base; - uint32_t interpolant_ctrl; - uint32_t semantic_color; - uint32_t semantic_psize; - int32_t index_bias; - boolean uniform_buffer_bound[3]; - boolean prim_restart; - boolean point_sprite; - boolean rt_serialize; - boolean flushed; - boolean rasterizer_discard; - uint8_t tls_required; - boolean new_tls_space; - uint8_t num_vtxbufs; - uint8_t num_vtxelts; - uint8_t num_textures[3]; - uint8_t num_samplers[3]; - uint8_t prim_size; - uint16_t scissor; - } state; + struct nv50_graph_state state; struct nv50_blend_stateobj *blend; struct nv50_rasterizer_stateobj *rast; @@ -191,12 +170,6 @@ nv50_context(struct pipe_context *pipe) return (struct nv50_context *)pipe; } -static INLINE struct nv50_screen * -nv50_context_screen(struct nv50_context *nv50) -{ - return nv50_screen(&nv50->base.screen->base); -} - /* return index used in nv50_context arrays for a specific shader type */ static INLINE unsigned 
nv50_context_shader_stage(unsigned pipe) diff --git a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c index 744a3a5bf8b..f15d8f3ecb6 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c @@ -377,7 +377,7 @@ nv50_miptree_create(struct pipe_screen *pscreen, if (!bo_config.nv50.memtype && (pt->bind & PIPE_BIND_SHARED)) mt->base.domain = NOUVEAU_BO_GART; else - mt->base.domain = NOUVEAU_BO_VRAM; + mt->base.domain = NV_VRAM_DOMAIN(nouveau_screen(pscreen)); bo_flags = mt->base.domain | NOUVEAU_BO_NOSNOOP; if (mt->base.base.bind & (PIPE_BIND_CURSOR | PIPE_BIND_DISPLAY_TARGET)) @@ -419,7 +419,7 @@ nv50_miptree_from_handle(struct pipe_screen *pscreen, FREE(mt); return NULL; } - mt->base.domain = NOUVEAU_BO_VRAM; + mt->base.domain = mt->base.bo->flags & NOUVEAU_BO_APER; mt->base.address = mt->base.bo->offset; mt->base.base = *templ; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c b/src/gallium/drivers/nouveau/nv50/nv50_query.c index 6690aa282eb..81f7474e36b 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_query.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c @@ -27,6 +27,11 @@ #include "nv50/nv50_context.h" #include "nv_object.xml.h" +#define NV50_QUERY_STATE_READY 0 +#define NV50_QUERY_STATE_ACTIVE 1 +#define NV50_QUERY_STATE_ENDED 2 +#define NV50_QUERY_STATE_FLUSHED 3 + /* XXX: Nested queries, and simultaneous queries on multiple gallium contexts * (since we use only a single GPU channel per screen) will not work properly. * @@ -42,10 +47,10 @@ struct nv50_query { struct nouveau_bo *bo; uint32_t base; uint32_t offset; /* base + i * 32 */ - boolean ready; - boolean flushed; + uint8_t state; boolean is64bit; struct nouveau_mm_allocation *mm; + struct nouveau_fence *fence; }; #define NV50_QUERY_ALLOC_SPACE 256 @@ -65,7 +70,7 @@ nv50_query_allocate(struct nv50_context *nv50, struct nv50_query *q, int size) if (q->bo) { nouveau_bo_ref(NULL, &q->bo); if (q->mm) { - if (q->ready) + if (q->state == NV50_QUERY_STATE_READY) nouveau_mm_free(q->mm); else nouveau_fence_work(screen->base.fence.current, nouveau_mm_free_work, @@ -92,6 +97,7 @@ static void nv50_query_destroy(struct pipe_context *pipe, struct pipe_query *pq) { nv50_query_allocate(nv50_context(pipe), nv50_query(pq), 0); + nouveau_fence_ref(NULL, &nv50_query(pq)->fence); FREE(nv50_query(pq)); } @@ -112,7 +118,8 @@ nv50_query_create(struct pipe_context *pipe, unsigned type, unsigned index) q->is64bit = (type == PIPE_QUERY_PRIMITIVES_GENERATED || type == PIPE_QUERY_PRIMITIVES_EMITTED || - type == PIPE_QUERY_SO_STATISTICS); + type == PIPE_QUERY_SO_STATISTICS || + type == PIPE_QUERY_PIPELINE_STATISTICS); q->type = type; if (q->type == PIPE_QUERY_OCCLUSION_COUNTER) { @@ -200,7 +207,7 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq) default: break; } - q->ready = FALSE; + q->state = NV50_QUERY_STATE_ACTIVE; return true; } @@ -211,6 +218,8 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq) struct nouveau_pushbuf *push = nv50->base.pushbuf; struct nv50_query *q = nv50_query(pq); + q->state = NV50_QUERY_STATE_ENDED; + switch (q->type) { case PIPE_QUERY_OCCLUSION_COUNTER: nv50_query_get(push, q, 0, 0x0100f002); @@ -253,19 +262,27 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq) break; case PIPE_QUERY_TIMESTAMP_DISJOINT: /* This query is not issued on GPU because disjoint is forced to FALSE */ - q->ready = TRUE; + q->state = NV50_QUERY_STATE_READY; break; default: assert(0); 
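
The boolean pair (ready, flushed) becomes a four-state lifecycle here: READY -> ACTIVE (begin) -> ENDED (end) -> FLUSHED (result polled without wait), returning to READY once the GPU result lands. A compact C sketch of the update rule this patch introduces, with simplified stand-in fields for the real query struct:

enum query_state { READY, ACTIVE, ENDED, FLUSHED };

struct query_sketch {
    enum query_state state;
    int is64bit;          /* 64-bit results are fenced...              */
    int fence_signalled;  /* ...stand-in for nouveau_fence_signalled() */
    unsigned seq_written; /* stand-in for q->data[0]                   */
    unsigned sequence;    /* ...32-bit ones poll a sequence number     */
};

static void query_update_sketch(struct query_sketch *q)
{
    if (q->is64bit ? q->fence_signalled
                   : (q->seq_written == q->sequence))
        q->state = READY; /* result can now be read without waiting */
}
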
break; } - q->ready = q->flushed = FALSE; + + if (q->is64bit) + nouveau_fence_ref(nv50->screen->base.fence.current, &q->fence); } -static INLINE boolean -nv50_query_ready(struct nv50_query *q) +static INLINE void +nv50_query_update(struct nv50_query *q) { - return q->ready || (!q->is64bit && (q->data[0] == q->sequence)); + if (q->is64bit) { + if (nouveau_fence_signalled(q->fence)) + q->state = NV50_QUERY_STATE_READY; + } else { + if (q->data[0] == q->sequence) + q->state = NV50_QUERY_STATE_READY; + } } static boolean @@ -280,13 +297,14 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq, uint64_t *data64 = (uint64_t *)q->data; int i; - if (!q->ready) /* update ? */ - q->ready = nv50_query_ready(q); - if (!q->ready) { + if (q->state != NV50_QUERY_STATE_READY) + nv50_query_update(q); + + if (q->state != NV50_QUERY_STATE_READY) { if (!wait) { /* for broken apps that spin on GL_QUERY_RESULT_AVAILABLE */ - if (!q->flushed) { - q->flushed = TRUE; + if (q->state != NV50_QUERY_STATE_FLUSHED) { + q->state = NV50_QUERY_STATE_FLUSHED; PUSH_KICK(nv50->base.pushbuf); } return FALSE; @@ -294,7 +312,7 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq, if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nv50->screen->base.client)) return FALSE; } - q->ready = TRUE; + q->state = NV50_QUERY_STATE_READY; switch (q->type) { case PIPE_QUERY_GPU_FINISHED: @@ -434,6 +452,7 @@ nv50_query_pushbuf_submit(struct nouveau_pushbuf *push, /* XXX: does this exist ? */ #define NV50_IB_ENTRY_1_NO_PREFETCH (0 << (31 - 8)) + PUSH_REFN(push, q->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART); nouveau_pushbuf_space(push, 0, 0, 1); nouveau_pushbuf_data(push, q->bo, q->offset + result_offset, 4 | NV50_IB_ENTRY_1_NO_PREFETCH); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index 829dfbc13fa..6583a353578 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -209,6 +209,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_VERTEXID_NOBASE: case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: /* potentially supported on some hw */ case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: return 0; case PIPE_CAP_VENDOR_ID: @@ -290,6 +291,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: return 0; default: NOUVEAU_ERR("unknown PIPE_SHADER_CAP %d\n", param); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.h b/src/gallium/drivers/nouveau/nv50/nv50_screen.h index f8ce365135a..881051b1862 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.h @@ -25,10 +25,34 @@ struct nv50_context; struct nv50_blitter; +struct nv50_graph_state { + uint32_t instance_elts; /* bitmask of per-instance elements */ + uint32_t instance_base; + uint32_t interpolant_ctrl; + uint32_t semantic_color; + uint32_t semantic_psize; + int32_t index_bias; + boolean uniform_buffer_bound[3]; + boolean prim_restart; + boolean point_sprite; + boolean rt_serialize; + boolean flushed; + boolean rasterizer_discard; + uint8_t tls_required; + boolean new_tls_space; + uint8_t num_vtxbufs; + uint8_t num_vtxelts; + uint8_t num_textures[3]; + uint8_t num_samplers[3]; + uint8_t prim_size; + uint16_t scissor; +}; + struct 
nv50_screen { struct nouveau_screen base; struct nv50_context *cur_ctx; + struct nv50_graph_state save_state; struct nouveau_bo *code; struct nouveau_bo *uniforms; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c index 290750459cf..d4d41af3c61 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_state.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c @@ -811,12 +811,12 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, nv50->constbuf[s][i].user = (cb && cb->user_buffer) ? TRUE : FALSE; if (nv50->constbuf[s][i].user) { nv50->constbuf[s][i].u.data = cb->user_buffer; - nv50->constbuf[s][i].size = cb->buffer_size; + nv50->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000); nv50->constbuf_valid[s] |= 1 << i; } else if (res) { nv50->constbuf[s][i].offset = cb->buffer_offset; - nv50->constbuf[s][i].size = align(cb->buffer_size, 0x100); + nv50->constbuf[s][i].size = MIN2(align(cb->buffer_size, 0x100), 0x10000); nv50->constbuf_valid[s] |= 1 << i; } else { nv50->constbuf_valid[s] &= ~(1 << i); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c index 85e19b4c623..116bf4bba7c 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c @@ -394,6 +394,8 @@ nv50_switch_pipe_context(struct nv50_context *ctx_to) if (ctx_from) ctx_to->state = ctx_from->state; + else + ctx_to->state = ctx_to->screen->save_state; ctx_to->dirty = ~0; ctx_to->viewports_dirty = ~0; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c index c1590eefe9f..1fd33b8aa59 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c @@ -628,6 +628,7 @@ nv50_draw_elements(struct nv50_context *nv50, boolean shorten, BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1); PUSH_DATA (push, prim); + PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain); nouveau_pushbuf_space(push, 8, 0, 1); switch (index_size) { diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c index ad287a2af6b..56fc83d3679 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c @@ -57,7 +57,7 @@ nvc0_screen_compute_setup(struct nvc0_screen *screen, return ret; } - ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 1 << 12, NULL, + ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, 1 << 12, NULL, &screen->parm); if (ret) return ret; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c index 7662fb50f61..a35c3f66142 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c @@ -139,8 +139,12 @@ nvc0_destroy(struct pipe_context *pipe) { struct nvc0_context *nvc0 = nvc0_context(pipe); - if (nvc0->screen->cur_ctx == nvc0) + if (nvc0->screen->cur_ctx == nvc0) { nvc0->screen->cur_ctx = NULL; + nvc0->screen->save_state = nvc0->state; + nvc0->screen->save_state.tfb = NULL; + } + /* Unset bufctx, we don't want to revalidate any resources after the flush. * Other contexts will always set their bufctx again on action calls. 
*/ @@ -303,6 +307,7 @@ nvc0_create(struct pipe_screen *pscreen, void *priv) pipe->get_sample_position = nvc0_context_get_sample_position; if (!screen->cur_ctx) { + nvc0->state = screen->save_state; screen->cur_ctx = nvc0; nouveau_pushbuf_bufctx(screen->base.pushbuf, nvc0->bufctx); } @@ -324,7 +329,7 @@ nvc0_create(struct pipe_screen *pscreen, void *priv) /* add permanently resident buffers to bufctxts */ - flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD; + flags = NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_RD; BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->text); BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->uniform_bo); @@ -335,7 +340,7 @@ nvc0_create(struct pipe_screen *pscreen, void *priv) BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->parm); } - flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR; + flags = NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_RDWR; if (screen->poly_cache) BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->poly_cache); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h index ef251f35a1b..a8d7593b398 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h @@ -113,29 +113,7 @@ struct nvc0_context { uint32_t dirty; uint32_t dirty_cp; /* dirty flags for compute state */ - struct { - boolean flushed; - boolean rasterizer_discard; - boolean early_z_forced; - boolean prim_restart; - uint32_t instance_elts; /* bitmask of per-instance elements */ - uint32_t instance_base; - uint32_t constant_vbos; - uint32_t constant_elts; - int32_t index_bias; - uint16_t scissor; - uint8_t vbo_mode; /* 0 = normal, 1 = translate, 3 = translate, forced */ - uint8_t num_vtxbufs; - uint8_t num_vtxelts; - uint8_t num_textures[6]; - uint8_t num_samplers[6]; - uint8_t tls_required; /* bitmask of shader types using l[] */ - uint8_t c14_bound; /* whether immediate array constbuf is bound */ - uint8_t clip_enable; - uint32_t clip_mode; - uint32_t uniform_buffer_bound[5]; - struct nvc0_transform_feedback_state *tfb; - } state; + struct nvc0_graph_state state; struct nvc0_blend_stateobj *blend; struct nvc0_rasterizer_stateobj *rast; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c index fc75fc6a4a1..3875bbf4ca4 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c @@ -302,7 +302,7 @@ nvc0_miptree_create(struct pipe_screen *pscreen, if (!bo_config.nvc0.memtype && (pt->usage == PIPE_USAGE_STAGING || pt->bind & PIPE_BIND_SHARED)) mt->base.domain = NOUVEAU_BO_GART; else - mt->base.domain = NOUVEAU_BO_VRAM; + mt->base.domain = NV_VRAM_DOMAIN(nouveau_screen(pscreen)); bo_flags = mt->base.domain | NOUVEAU_BO_NOSNOOP; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c index c156e918dc5..e1f5a8c4416 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c @@ -392,7 +392,7 @@ nvc0_gp_gen_header(struct nvc0_program *gp, struct nv50_ir_prog_info *info) break; } - gp->hdr[4] = info->prop.gp.maxVertices & 0x1ff; + gp->hdr[4] = MIN2(info->prop.gp.maxVertices, 1024); return nvc0_vtgp_gen_header(gp, info); } @@ -683,11 +683,12 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog) ret = nouveau_heap_alloc(screen->text_heap, size, prog, &prog->mem); if (ret) { struct nouveau_heap *heap = screen->text_heap; - struct nouveau_heap 
*iter; - for (iter = heap; iter && iter->next != heap; iter = iter->next) { - struct nvc0_program *evict = iter->priv; - if (evict) - nouveau_heap_free(&evict->mem); + /* Note that the code library, which is allocated before anything else, + * does not have a priv pointer. We can stop once we hit it. + */ + while (heap->next && heap->next->priv) { + struct nvc0_program *evict = heap->next->priv; + nouveau_heap_free(&evict->mem); } debug_printf("WARNING: out of code space, evicting all shaders.\n"); ret = nouveau_heap_alloc(heap, size, prog, &prog->mem); @@ -734,12 +735,12 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog) if (!is_cp) nvc0->base.push_data(&nvc0->base, screen->text, prog->code_base, - NOUVEAU_BO_VRAM, NVC0_SHADER_HEADER_SIZE, prog->hdr); + NV_VRAM_DOMAIN(&screen->base), NVC0_SHADER_HEADER_SIZE, prog->hdr); nvc0->base.push_data(&nvc0->base, screen->text, code_pos, - NOUVEAU_BO_VRAM, prog->code_size, prog->code); + NV_VRAM_DOMAIN(&screen->base), prog->code_size, prog->code); if (prog->immd_size) nvc0->base.push_data(&nvc0->base, - screen->text, prog->immd_base, NOUVEAU_BO_VRAM, + screen->text, prog->immd_base, NV_VRAM_DOMAIN(&screen->base), prog->immd_size, prog->immd_data); BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(MEM_BARRIER), 1); @@ -770,7 +771,7 @@ nvc0_program_library_upload(struct nvc0_context *nvc0) return; nvc0->base.push_data(&nvc0->base, - screen->text, screen->lib_code->start, NOUVEAU_BO_VRAM, + screen->text, screen->lib_code->start, NV_VRAM_DOMAIN(&screen->base), size, code); /* no need for a memory barrier, will be emitted with first program */ } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c index 52032eb6f83..aea6cbda02d 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c @@ -617,6 +617,7 @@ nvc0_query_pushbuf_submit(struct nouveau_pushbuf *push, #define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8)) + PUSH_REFN(push, q->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART); nouveau_pushbuf_space(push, 0, 0, 1); nouveau_pushbuf_data(push, q->bo, q->offset + result_offset, 4 | NVC0_IB_ENTRY_1_NO_PREFETCH); @@ -1407,11 +1408,14 @@ nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen, count += NVC0_QUERY_DRV_STAT_COUNT; if (screen->base.device->drm_version >= 0x01000101) { - if (screen->base.class_3d >= NVE4_3D_CLASS) { - count += NVE4_PM_QUERY_COUNT; - } else if (screen->compute) { - count += NVC0_PM_QUERY_COUNT; /* NVC0_COMPUTE is not always enabled */ + if (screen->base.class_3d == NVE4_3D_CLASS) { + count += NVE4_PM_QUERY_COUNT; + } else + if (screen->base.class_3d < NVE4_3D_CLASS) { + /* NVC0_COMPUTE is not always enabled */ + count += NVC0_PM_QUERY_COUNT; + } } } @@ -1437,19 +1441,21 @@ nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen, } else #endif if (id < count) { - if (screen->base.class_3d >= NVE4_3D_CLASS) { - info->name = nve4_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT]; - info->query_type = NVE4_PM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT); - info->max_value.u64 = - (id < NVE4_PM_QUERY_METRIC_MP_OCCUPANCY) ? 
0 : 100; - info->group_id = NVC0_QUERY_MP_COUNTER_GROUP; - return 1; - } else if (screen->compute) { - info->name = nvc0_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT]; - info->query_type = NVC0_PM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT); - info->group_id = NVC0_QUERY_MP_COUNTER_GROUP; - return 1; + if (screen->base.class_3d == NVE4_3D_CLASS) { + info->name = nve4_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT]; + info->query_type = NVE4_PM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT); + info->max_value.u64 = + (id < NVE4_PM_QUERY_METRIC_MP_OCCUPANCY) ? 0 : 100; + info->group_id = NVC0_QUERY_MP_COUNTER_GROUP; + return 1; + } else + if (screen->base.class_3d < NVE4_3D_CLASS) { + info->name = nvc0_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT]; + info->query_type = NVC0_PM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT); + info->group_id = NVC0_QUERY_MP_COUNTER_GROUP; + return 1; + } } } /* user asked for info about non-existing query */ @@ -1469,10 +1475,13 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen, #endif if (screen->base.device->drm_version >= 0x01000101) { - if (screen->base.class_3d >= NVE4_3D_CLASS) { - count++; - } else if (screen->compute) { - count++; /* NVC0_COMPUTE is not always enabled */ + if (screen->compute) { + if (screen->base.class_3d == NVE4_3D_CLASS) { + count++; + } else + if (screen->base.class_3d < NVE4_3D_CLASS) { + count++; /* NVC0_COMPUTE is not always enabled */ + } } } @@ -1480,25 +1489,28 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen, return count; if (id == NVC0_QUERY_MP_COUNTER_GROUP) { - info->name = "MP counters"; - info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_GPU; - - if (screen->base.class_3d >= NVE4_3D_CLASS) { - info->num_queries = NVE4_PM_QUERY_COUNT; - - /* On NVE4+, each multiprocessor have 8 hardware counters separated - * in two distinct domains, but we allow only one active query - * simultaneously because some of them use more than one hardware - * counter and this will result in an undefined behaviour. */ - info->max_active_queries = 1; /* TODO: handle multiple hw counters */ - return 1; - } else if (screen->compute) { - info->num_queries = NVC0_PM_QUERY_COUNT; - - /* On NVC0:NVE4, each multiprocessor have 8 hardware counters - * in a single domain. */ - info->max_active_queries = 8; - return 1; + if (screen->compute) { + info->name = "MP counters"; + info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_GPU; + + if (screen->base.class_3d == NVE4_3D_CLASS) { + info->num_queries = NVE4_PM_QUERY_COUNT; + + /* On NVE4+, each multiprocessor have 8 hardware counters separated + * in two distinct domains, but we allow only one active query + * simultaneously because some of them use more than one hardware + * counter and this will result in an undefined behaviour. */ + info->max_active_queries = 1; /* TODO: handle multiple hw counters */ + return 1; + } else + if (screen->base.class_3d < NVE4_3D_CLASS) { + info->num_queries = NVC0_PM_QUERY_COUNT; + + /* On NVC0:NVE4, each multiprocessor have 8 hardware counters + * in a single domain. 
*/ + info->max_active_queries = 8; + return 1; + } } } #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 748c9e7c8b9..56c230e42fc 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -193,6 +193,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: case PIPE_CAP_VERTEXID_NOBASE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: return 0; case PIPE_CAP_VENDOR_ID: @@ -296,6 +297,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, return 1; case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: return 0; case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: return 16; /* would be 32 in linked (OpenGL-style) mode */ @@ -581,7 +583,7 @@ nvc0_screen_resize_tls_area(struct nvc0_screen *screen, size = align(size, 1 << 17); - ret = nouveau_bo_new(screen->base.device, NOUVEAU_BO_VRAM, 1 << 17, size, + ret = nouveau_bo_new(screen->base.device, NV_VRAM_DOMAIN(&screen->base), 1 << 17, size, NULL, &bo); if (ret) { NOUVEAU_ERR("failed to allocate TLS area, size: 0x%"PRIx64"\n", size); @@ -644,6 +646,11 @@ nvc0_screen_create(struct nouveau_device *dev) screen->base.sysmem_bindings |= PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER; + if (screen->base.vram_domain & NOUVEAU_BO_GART) { + screen->base.sysmem_bindings |= screen->base.vidmem_bindings; + screen->base.vidmem_bindings = 0; + } + pscreen->destroy = nvc0_screen_destroy; pscreen->context_create = nvc0_create; pscreen->is_format_supported = nvc0_screen_is_format_supported; @@ -822,7 +829,7 @@ nvc0_screen_create(struct nouveau_device *dev) nvc0_magic_3d_init(push, screen->eng3d->oclass); - ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 17, 1 << 20, NULL, + ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 1 << 17, 1 << 20, NULL, &screen->text); if (ret) goto fail; @@ -832,12 +839,12 @@ nvc0_screen_create(struct nouveau_device *dev) */ nouveau_heap_init(&screen->text_heap, 0, (1 << 20) - 0x100); - ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 12, 6 << 16, NULL, + ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 1 << 12, 6 << 16, NULL, &screen->uniform_bo); if (ret) goto fail; - PUSH_REFN (push, screen->uniform_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + PUSH_REFN (push, screen->uniform_bo, NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_WR); for (i = 0; i < 5; ++i) { /* TIC and TSC entries for each unit (nve4+ only) */ @@ -908,7 +915,7 @@ nvc0_screen_create(struct nouveau_device *dev) PUSH_DATA (push, 0); if (screen->eng3d->oclass < GM107_3D_CLASS) { - ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 17, 1 << 20, NULL, + ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 1 << 17, 1 << 20, NULL, &screen->poly_cache); if (ret) goto fail; @@ -919,7 +926,7 @@ nvc0_screen_create(struct nouveau_device *dev) PUSH_DATA (push, 3); } - ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 17, 1 << 17, NULL, + ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 1 << 17, 1 << 17, NULL, &screen->txc); if (ret) goto fail; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h index 1a7d5027a7c..ef2bd43f006 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h @@ -27,10 
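
Condensing the class checks this hunk settles on: a hypothetical helper (not in the patch) showing how many simultaneously active MP-counter queries each generation exposes, under the assumption that GM107+ is deliberately left unhandled here:

static int mp_counter_max_active(unsigned class_3d, int has_compute)
{
    if (!has_compute)
        return 0;          /* NVC0_COMPUTE is not always enabled */
    if (class_3d == NVE4_3D_CLASS)
        return 1;          /* two domains; some metrics need several counters */
    if (class_3d < NVE4_3D_CLASS)
        return 8;          /* eight counters in a single domain */
    return 0;              /* GM107+: no MP counters exposed by this patch */
}
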
+27,35 @@ struct nvc0_context; struct nvc0_blitter; +struct nvc0_graph_state { + boolean flushed; + boolean rasterizer_discard; + boolean early_z_forced; + boolean prim_restart; + uint32_t instance_elts; /* bitmask of per-instance elements */ + uint32_t instance_base; + uint32_t constant_vbos; + uint32_t constant_elts; + int32_t index_bias; + uint16_t scissor; + uint8_t vbo_mode; /* 0 = normal, 1 = translate, 3 = translate, forced */ + uint8_t num_vtxbufs; + uint8_t num_vtxelts; + uint8_t num_textures[6]; + uint8_t num_samplers[6]; + uint8_t tls_required; /* bitmask of shader types using l[] */ + uint8_t c14_bound; /* whether immediate array constbuf is bound */ + uint8_t clip_enable; + uint32_t clip_mode; + uint32_t uniform_buffer_bound[5]; + struct nvc0_transform_feedback_state *tfb; +}; + struct nvc0_screen { struct nouveau_screen base; struct nvc0_context *cur_ctx; + struct nvc0_graph_state save_state; int num_occlusion_queries_active; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c index 516b33b76d5..e0842784a88 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c @@ -34,7 +34,7 @@ nvc0_program_update_context_state(struct nvc0_context *nvc0, struct nouveau_pushbuf *push = nvc0->base.pushbuf; if (prog && prog->need_tls) { - const uint32_t flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR; + const uint32_t flags = NV_VRAM_DOMAIN(&nvc0->screen->base) | NOUVEAU_BO_RDWR; if (!nvc0->state.tls_required) BCTX_REFN_bo(nvc0->bufctx_3d, TLS, flags, nvc0->screen->tls); nvc0->state.tls_required |= 1 << stage; @@ -262,11 +262,13 @@ nvc0_tfb_validate(struct nvc0_context *nvc0) if (tfb) targ->stride = tfb->stride[b]; + buf = nv04_resource(targ->pipe.buffer); + + BCTX_REFN(nvc0->bufctx_3d, TFB, buf, WR); + if (!(nvc0->tfbbuf_dirty & (1 << b))) continue; - buf = nv04_resource(targ->pipe.buffer); - if (!targ->clean) nvc0_query_fifo_wait(push, targ->pq); BEGIN_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 5); @@ -280,7 +282,6 @@ nvc0_tfb_validate(struct nvc0_context *nvc0) PUSH_DATA(push, 0); /* TFB_BUFFER_OFFSET */ targ->clean = FALSE; } - BCTX_REFN(nvc0->bufctx_3d, TFB, buf, WR); } for (; b < 4; ++b) IMMED_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 0); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c index dca06f4cddb..6b7a211e71b 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c @@ -413,24 +413,6 @@ nvc0_zsa_state_delete(struct pipe_context *pipe, void *hwcso) #define NV50_TSC_WRAP_CASE(n) \ case PIPE_TEX_WRAP_##n: return NV50_TSC_WRAP_##n -static INLINE unsigned -nv50_tsc_wrap_mode(unsigned wrap) -{ - switch (wrap) { - NV50_TSC_WRAP_CASE(REPEAT); - NV50_TSC_WRAP_CASE(MIRROR_REPEAT); - NV50_TSC_WRAP_CASE(CLAMP_TO_EDGE); - NV50_TSC_WRAP_CASE(CLAMP_TO_BORDER); - NV50_TSC_WRAP_CASE(CLAMP); - NV50_TSC_WRAP_CASE(MIRROR_CLAMP_TO_EDGE); - NV50_TSC_WRAP_CASE(MIRROR_CLAMP_TO_BORDER); - NV50_TSC_WRAP_CASE(MIRROR_CLAMP); - default: - NOUVEAU_ERR("unknown wrap mode: %d\n", wrap); - return NV50_TSC_WRAP_REPEAT; - } -} - static void nvc0_sampler_state_delete(struct pipe_context *pipe, void *hwcso) { @@ -811,12 +793,12 @@ nvc0_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, nvc0->constbuf[s][i].user = (cb && cb->user_buffer) ? 
TRUE : FALSE; if (nvc0->constbuf[s][i].user) { nvc0->constbuf[s][i].u.data = cb->user_buffer; - nvc0->constbuf[s][i].size = cb->buffer_size; + nvc0->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000); nvc0->constbuf_valid[s] |= 1 << i; } else if (cb) { nvc0->constbuf[s][i].offset = cb->buffer_offset; - nvc0->constbuf[s][i].size = align(cb->buffer_size, 0x100); + nvc0->constbuf[s][i].size = MIN2(align(cb->buffer_size, 0x100), 0x10000); nvc0->constbuf_valid[s] |= 1 << i; } else { diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c index 6051f128f66..c52399ab312 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c @@ -439,7 +439,7 @@ nvc0_constbufs_validate(struct nvc0_context *nvc0) BEGIN_NVC0(push, NVC0_3D(CB_BIND(s)), 1); PUSH_DATA (push, (0 << 4) | 1); } - nvc0_cb_push(&nvc0->base, bo, NOUVEAU_BO_VRAM, + nvc0_cb_push(&nvc0->base, bo, NV_VRAM_DOMAIN(&nvc0->screen->base), base, nvc0->state.uniform_buffer_bound[s], 0, (size + 3) / 4, nvc0->constbuf[s][0].u.data); @@ -543,6 +543,8 @@ nvc0_switch_pipe_context(struct nvc0_context *ctx_to) if (ctx_from) ctx_to->state = ctx_from->state; + else + ctx_to->state = ctx_to->screen->save_state; ctx_to->dirty = ~0; ctx_to->viewports_dirty = ~0; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c index 4404d8c1a74..a820de7259a 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c @@ -1152,6 +1152,12 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info) NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32 | NVC0_3D_VERTEX_ATTRIB_FORMAT_CONST); } + if (nvc0->state.instance_elts) { + nvc0->state.instance_elts = 0; + BEGIN_NVC0(push, NVC0_3D(MACRO_VERTEX_ARRAY_PER_INSTANCE), 2); + PUSH_DATA (push, n); + PUSH_DATA (push, 0); + } nvc0->state.num_vtxelts = 2; for (i = 0; i < info->dst.box.depth; ++i, z += dz) { diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c index 457f27c8311..ddc0409ca86 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c @@ -396,7 +396,7 @@ nvc0_validate_tsc(struct nvc0_context *nvc0, int s) tsc->id = nvc0_screen_tsc_alloc(nvc0->screen, tsc); nvc0_m2mf_push_linear(&nvc0->base, nvc0->screen->txc, - 65536 + tsc->id * 32, NOUVEAU_BO_VRAM, + 65536 + tsc->id * 32, NV_VRAM_DOMAIN(&nvc0->screen->base), 32, tsc->tsc); need_flush = TRUE; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c index 657b8c0fe82..8cf2584b0ce 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c @@ -829,6 +829,7 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info) } PUSH_DATA(push, nvc0_prim_gl(info->mode)); #define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8)) + PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain); nouveau_pushbuf_space(push, 0, 0, 1); nouveau_pushbuf_data(push, buf->bo, offset, NVC0_IB_ENTRY_1_NO_PREFETCH | size); diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c index f243316b899..fce02a7cc57 100644 --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c @@ -63,7 +63,7 @@ nve4_screen_compute_setup(struct nvc0_screen *screen, 
return ret; } - ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, NVE4_CP_PARAM_SIZE, NULL, + ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, NVE4_CP_PARAM_SIZE, NULL, &screen->parm); if (ret) return ret; diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c index a7b59d8bfbb..a7bca915f57 100644 --- a/src/gallium/drivers/r300/r300_screen.c +++ b/src/gallium/drivers/r300/r300_screen.c @@ -190,6 +190,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_POLYGON_OFFSET_CLAMP: case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: return 0; /* SWTCL-only features. */ @@ -273,6 +274,7 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: return (is_r500 ? 256 : 32) * sizeof(float[4]); case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: + case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: return 1; case PIPE_SHADER_CAP_MAX_TEMPS: return is_r500 ? 128 : is_r400 ? 64 : 32; @@ -332,6 +334,7 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e case PIPE_SHADER_CAP_MAX_PREDS: return 0; /* unused */ case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: + case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: return 1; case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS: case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS: diff --git a/src/gallium/drivers/r300/r300_tgsi_to_rc.c b/src/gallium/drivers/r300/r300_tgsi_to_rc.c index 69afb4caeaa..23ed2cf2532 100644 --- a/src/gallium/drivers/r300/r300_tgsi_to_rc.c +++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.c @@ -133,13 +133,7 @@ static unsigned translate_opcode(unsigned opcode) static unsigned translate_saturate(unsigned saturate) { - switch(saturate) { - default: - fprintf(stderr, "Unknown saturate mode: %i\n", saturate); - /* fall-through */ - case TGSI_SAT_NONE: return RC_SATURATE_NONE; - case TGSI_SAT_ZERO_ONE: return RC_SATURATE_ZERO_ONE; - } + return saturate ? RC_SATURATE_ZERO_ONE : RC_SATURATE_NONE; } static unsigned translate_register_file(unsigned file) diff --git a/src/gallium/drivers/r600/Android.mk b/src/gallium/drivers/r600/Android.mk index e9357597a9b..bfe39873089 100644 --- a/src/gallium/drivers/r600/Android.mk +++ b/src/gallium/drivers/r600/Android.mk @@ -33,6 +33,10 @@ LOCAL_SRC_FILES := $(C_SOURCES) $(CXX_SOURCES) LOCAL_SHARED_LIBRARIES := libdrm libdrm_radeon LOCAL_MODULE := libmesa_pipe_r600 +ifeq ($(MESA_LOLLIPOP_BUILD),true) +LOCAL_C_INCLUDES := external/libcxx/include +else include external/stlport/libstlport.mk +endif include $(GALLIUM_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index 21e5d42adc3..e122b607b86 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -332,6 +332,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: case PIPE_CAP_SAMPLER_VIEW_TARGET: case PIPE_CAP_VERTEXID_NOBASE: + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: return 0; /* Stream output. 
*/ @@ -475,6 +476,7 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e case PIPE_SHADER_CAP_SUBROUTINES: return 0; case PIPE_SHADER_CAP_INTEGERS: + case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: return 1; case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 87b6e6e06ec..af7622e9b34 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -617,98 +617,100 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx) switch (d->Declaration.File) { case TGSI_FILE_INPUT: - i = ctx->shader->ninput; - assert(i < Elements(ctx->shader->input)); - ctx->shader->ninput += count; - ctx->shader->input[i].name = d->Semantic.Name; - ctx->shader->input[i].sid = d->Semantic.Index; - ctx->shader->input[i].interpolate = d->Interp.Interpolate; - ctx->shader->input[i].interpolate_location = d->Interp.Location; - ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First; - if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { - ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]); - switch (ctx->shader->input[i].name) { - case TGSI_SEMANTIC_FACE: - if (ctx->face_gpr != -1) - ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */ - else - ctx->face_gpr = ctx->shader->input[i].gpr; - break; - case TGSI_SEMANTIC_COLOR: - ctx->colors_used++; - break; - case TGSI_SEMANTIC_POSITION: - ctx->fragcoord_input = i; - break; - case TGSI_SEMANTIC_PRIMID: - /* set this for now */ - ctx->shader->gs_prim_id_input = true; - ctx->shader->ps_prim_id_input = i; - break; - } - if (ctx->bc->chip_class >= EVERGREEN) { - if ((r = evergreen_interp_input(ctx, i))) - return r; + for (j = 0; j < count; j++) { + i = ctx->shader->ninput + j; + assert(i < Elements(ctx->shader->input)); + ctx->shader->input[i].name = d->Semantic.Name; + ctx->shader->input[i].sid = d->Semantic.Index + j; + ctx->shader->input[i].interpolate = d->Interp.Interpolate; + ctx->shader->input[i].interpolate_location = d->Interp.Location; + ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j; + if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { + ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]); + switch (ctx->shader->input[i].name) { + case TGSI_SEMANTIC_FACE: + if (ctx->face_gpr != -1) + ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */ + else + ctx->face_gpr = ctx->shader->input[i].gpr; + break; + case TGSI_SEMANTIC_COLOR: + ctx->colors_used++; + break; + case TGSI_SEMANTIC_POSITION: + ctx->fragcoord_input = i; + break; + case TGSI_SEMANTIC_PRIMID: + /* set this for now */ + ctx->shader->gs_prim_id_input = true; + ctx->shader->ps_prim_id_input = i; + break; + } + if (ctx->bc->chip_class >= EVERGREEN) { + if ((r = evergreen_interp_input(ctx, i))) + return r; + } + } else if (ctx->type == TGSI_PROCESSOR_GEOMETRY) { + /* FIXME probably skip inputs if they aren't passed in the ring */ + ctx->shader->input[i].ring_offset = ctx->next_ring_offset; + ctx->next_ring_offset += 16; + if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID) + ctx->shader->gs_prim_id_input = true; } - } else if (ctx->type == TGSI_PROCESSOR_GEOMETRY) { - /* FIXME probably skip inputs if they aren't passed in the ring */ - ctx->shader->input[i].ring_offset = ctx->next_ring_offset; - ctx->next_ring_offset += 16; - if 
(ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID) - ctx->shader->gs_prim_id_input = true; - } - for (j = 1; j < count; ++j) { - ctx->shader->input[i + j] = ctx->shader->input[i]; - ctx->shader->input[i + j].gpr += j; } + ctx->shader->ninput += count; break; case TGSI_FILE_OUTPUT: - i = ctx->shader->noutput++; - assert(i < Elements(ctx->shader->output)); - ctx->shader->output[i].name = d->Semantic.Name; - ctx->shader->output[i].sid = d->Semantic.Index; - ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First; - ctx->shader->output[i].interpolate = d->Interp.Interpolate; - ctx->shader->output[i].write_mask = d->Declaration.UsageMask; - if (ctx->type == TGSI_PROCESSOR_VERTEX || - ctx->type == TGSI_PROCESSOR_GEOMETRY) { - ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]); - switch (d->Semantic.Name) { - case TGSI_SEMANTIC_CLIPDIST: - ctx->shader->clip_dist_write |= d->Declaration.UsageMask << (d->Semantic.Index << 2); - break; - case TGSI_SEMANTIC_PSIZE: - ctx->shader->vs_out_misc_write = 1; - ctx->shader->vs_out_point_size = 1; - break; - case TGSI_SEMANTIC_EDGEFLAG: - ctx->shader->vs_out_misc_write = 1; - ctx->shader->vs_out_edgeflag = 1; - ctx->edgeflag_output = i; - break; - case TGSI_SEMANTIC_VIEWPORT_INDEX: - ctx->shader->vs_out_misc_write = 1; - ctx->shader->vs_out_viewport = 1; - break; - case TGSI_SEMANTIC_LAYER: - ctx->shader->vs_out_misc_write = 1; - ctx->shader->vs_out_layer = 1; - break; - case TGSI_SEMANTIC_CLIPVERTEX: - ctx->clip_vertex_write = TRUE; - ctx->cv_output = i; - break; - } - if (ctx->type == TGSI_PROCESSOR_GEOMETRY) { - ctx->gs_out_ring_offset += 16; - } - } else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { - switch (d->Semantic.Name) { - case TGSI_SEMANTIC_COLOR: - ctx->shader->nr_ps_max_color_exports++; - break; + for (j = 0; j < count; j++) { + i = ctx->shader->noutput + j; + assert(i < Elements(ctx->shader->output)); + ctx->shader->output[i].name = d->Semantic.Name; + ctx->shader->output[i].sid = d->Semantic.Index + j; + ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j; + ctx->shader->output[i].interpolate = d->Interp.Interpolate; + ctx->shader->output[i].write_mask = d->Declaration.UsageMask; + if (ctx->type == TGSI_PROCESSOR_VERTEX || + ctx->type == TGSI_PROCESSOR_GEOMETRY) { + ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]); + switch (d->Semantic.Name) { + case TGSI_SEMANTIC_CLIPDIST: + ctx->shader->clip_dist_write |= d->Declaration.UsageMask << + ((d->Semantic.Index + j) << 2); + break; + case TGSI_SEMANTIC_PSIZE: + ctx->shader->vs_out_misc_write = 1; + ctx->shader->vs_out_point_size = 1; + break; + case TGSI_SEMANTIC_EDGEFLAG: + ctx->shader->vs_out_misc_write = 1; + ctx->shader->vs_out_edgeflag = 1; + ctx->edgeflag_output = i; + break; + case TGSI_SEMANTIC_VIEWPORT_INDEX: + ctx->shader->vs_out_misc_write = 1; + ctx->shader->vs_out_viewport = 1; + break; + case TGSI_SEMANTIC_LAYER: + ctx->shader->vs_out_misc_write = 1; + ctx->shader->vs_out_layer = 1; + break; + case TGSI_SEMANTIC_CLIPVERTEX: + ctx->clip_vertex_write = TRUE; + ctx->cv_output = i; + break; + } + if (ctx->type == TGSI_PROCESSOR_GEOMETRY) { + ctx->gs_out_ring_offset += 16; + } + } else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { + switch (d->Semantic.Name) { + case TGSI_SEMANTIC_COLOR: + ctx->shader->nr_ps_max_color_exports++; + break; + } } } + ctx->shader->noutput += count; break; case TGSI_FILE_TEMPORARY: if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) { @@ -723,6 +725,7 @@ 
static int tgsi_declaration(struct r600_shader_ctx *ctx) case TGSI_FILE_CONSTANT: case TGSI_FILE_SAMPLER: + case TGSI_FILE_SAMPLER_VIEW: case TGSI_FILE_ADDRESS: break; @@ -1337,7 +1340,7 @@ static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output int i, j, r; /* Sanity checking. */ - if (so->num_outputs > PIPE_MAX_SHADER_OUTPUTS) { + if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) { R600_ERR("Too many stream outputs: %d\n", so->num_outputs); r = -EINVAL; goto out_err; diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index c50c7055851..13dc9ee8c10 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -95,22 +95,23 @@ static void r600_texture_barrier(struct pipe_context *ctx) static unsigned r600_conv_pipe_prim(unsigned prim) { static const unsigned prim_conv[] = { - V_008958_DI_PT_POINTLIST, - V_008958_DI_PT_LINELIST, - V_008958_DI_PT_LINELOOP, - V_008958_DI_PT_LINESTRIP, - V_008958_DI_PT_TRILIST, - V_008958_DI_PT_TRISTRIP, - V_008958_DI_PT_TRIFAN, - V_008958_DI_PT_QUADLIST, - V_008958_DI_PT_QUADSTRIP, - V_008958_DI_PT_POLYGON, - V_008958_DI_PT_LINELIST_ADJ, - V_008958_DI_PT_LINESTRIP_ADJ, - V_008958_DI_PT_TRILIST_ADJ, - V_008958_DI_PT_TRISTRIP_ADJ, - V_008958_DI_PT_RECTLIST + [PIPE_PRIM_POINTS] = V_008958_DI_PT_POINTLIST, + [PIPE_PRIM_LINES] = V_008958_DI_PT_LINELIST, + [PIPE_PRIM_LINE_LOOP] = V_008958_DI_PT_LINELOOP, + [PIPE_PRIM_LINE_STRIP] = V_008958_DI_PT_LINESTRIP, + [PIPE_PRIM_TRIANGLES] = V_008958_DI_PT_TRILIST, + [PIPE_PRIM_TRIANGLE_STRIP] = V_008958_DI_PT_TRISTRIP, + [PIPE_PRIM_TRIANGLE_FAN] = V_008958_DI_PT_TRIFAN, + [PIPE_PRIM_QUADS] = V_008958_DI_PT_QUADLIST, + [PIPE_PRIM_QUAD_STRIP] = V_008958_DI_PT_QUADSTRIP, + [PIPE_PRIM_POLYGON] = V_008958_DI_PT_POLYGON, + [PIPE_PRIM_LINES_ADJACENCY] = V_008958_DI_PT_LINELIST_ADJ, + [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_008958_DI_PT_LINESTRIP_ADJ, + [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_008958_DI_PT_TRILIST_ADJ, + [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_008958_DI_PT_TRISTRIP_ADJ, + [R600_PRIM_RECTANGLE_LIST] = V_008958_DI_PT_RECTLIST }; + assert(prim < Elements(prim_conv)); return prim_conv[prim]; } diff --git a/src/gallium/drivers/radeon/Android.mk b/src/gallium/drivers/radeon/Android.mk index d61579280ea..6997a6d3ec3 100644 --- a/src/gallium/drivers/radeon/Android.mk +++ b/src/gallium/drivers/radeon/Android.mk @@ -30,6 +30,10 @@ include $(CLEAR_VARS) LOCAL_SRC_FILES := $(C_SOURCES) +ifeq ($(MESA_ENABLE_LLVM),true) +LOCAL_SRC_FILES += $(LLVM_C_FILES) +endif + LOCAL_SHARED_LIBRARIES := libdrm libdrm_radeon LOCAL_MODULE := libmesa_pipe_radeon diff --git a/src/gallium/drivers/radeon/Makefile.sources b/src/gallium/drivers/radeon/Makefile.sources index c655fe5787b..f63790c329e 100644 --- a/src/gallium/drivers/radeon/Makefile.sources +++ b/src/gallium/drivers/radeon/Makefile.sources @@ -12,6 +12,7 @@ C_SOURCES := \ radeon_uvd.c \ radeon_uvd.h \ radeon_vce_40_2_2.c \ + radeon_vce_50.c \ radeon_vce.c \ radeon_vce.h \ radeon_video.c \ diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c index 42e681dc7d2..3def4446882 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.c +++ b/src/gallium/drivers/radeon/r600_pipe_common.c @@ -107,11 +107,10 @@ void r600_draw_rectangle(struct blitter_context *blitter, void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw) { - /* The number of dwords we already used in the DMA so far. 
*/ - num_dw += ctx->rings.dma.cs->cdw; /* Flush if there's not enough space. */ - if (num_dw > RADEON_MAX_CMDBUF_DWORDS) { + if ((num_dw + ctx->rings.dma.cs->cdw) > RADEON_MAX_CMDBUF_DWORDS) { ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + assert((num_dw + ctx->rings.dma.cs->cdw) <= RADEON_MAX_CMDBUF_DWORDS); } } diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h index 8612ef8daf7..6a9557b0b73 100644 --- a/src/gallium/drivers/radeon/radeon_llvm.h +++ b/src/gallium/drivers/radeon/radeon_llvm.h @@ -33,7 +33,6 @@ #define RADEON_LLVM_MAX_INPUTS 32 * 4 #define RADEON_LLVM_MAX_OUTPUTS 32 * 4 -#define RADEON_LLVM_MAX_ARRAYS 16 #define RADEON_LLVM_INITIAL_CF_DEPTH 4 @@ -130,8 +129,7 @@ struct radeon_llvm_context { unsigned loop_depth; unsigned loop_depth_max; - struct tgsi_declaration_range arrays[RADEON_LLVM_MAX_ARRAYS]; - unsigned num_arrays; + struct tgsi_declaration_range *arrays; LLVMValueRef main_fn; diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c index 624077c7465..25580b6bd4c 100644 --- a/src/gallium/drivers/radeon/radeon_llvm_emit.c +++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c @@ -86,10 +86,18 @@ static void init_r600_target() { static unsigned initialized = 0; if (!initialized) { +#if HAVE_LLVM < 0x0307 LLVMInitializeR600TargetInfo(); LLVMInitializeR600Target(); LLVMInitializeR600TargetMC(); LLVMInitializeR600AsmPrinter(); +#else + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetMC(); + LLVMInitializeAMDGPUAsmPrinter(); + +#endif initialized = 1; } } diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c index 20e506b7c5e..c8c980d9d32 100644 --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c @@ -85,8 +85,9 @@ get_array_range(struct lp_build_tgsi_context *bld_base, unsigned File, const struct tgsi_ind_register *reg) { struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); + if (File != TGSI_FILE_TEMPORARY || reg->ArrayID == 0 || - reg->ArrayID > RADEON_LLVM_MAX_ARRAYS) { + reg->ArrayID > bld_base->info->array_max[TGSI_FILE_TEMPORARY]) { struct tgsi_declaration_range range; range.First = 0; range.Last = bld_base->info->file_max[File]; @@ -252,8 +253,14 @@ static void emit_declaration( } case TGSI_FILE_TEMPORARY: - if (decl->Declaration.Array && decl->Array.ArrayID <= RADEON_LLVM_MAX_ARRAYS) + if (decl->Declaration.Array) { + if (!ctx->arrays) { + int size = bld_base->info->array_max[TGSI_FILE_TEMPORARY]; + ctx->arrays = MALLOC(sizeof(ctx->arrays[0]) * size); + } + ctx->arrays[decl->Array.ArrayID - 1] = decl->Range; + } if (uses_temp_indirect_addressing(bld_base)) { lp_emit_declaration_soa(bld_base, decl); break; @@ -314,6 +321,21 @@ static void emit_declaration( } } +static LLVMValueRef radeon_llvm_saturate(struct lp_build_tgsi_context *bld_base, + LLVMValueRef value) +{ + struct lp_build_emit_data clamp_emit_data; + + memset(&clamp_emit_data, 0, sizeof(clamp_emit_data)); + clamp_emit_data.arg_count = 3; + clamp_emit_data.args[0] = value; + clamp_emit_data.args[2] = bld_base->base.one; + clamp_emit_data.args[1] = bld_base->base.zero; + + return lp_build_emit_llvm(bld_base, TGSI_OPCODE_CLAMP, + &clamp_emit_data); +} + static void emit_store( struct lp_build_tgsi_context * bld_base, @@ -324,7 +346,6 @@ emit_store( struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); struct 
lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); struct gallivm_state *gallivm = bld->bld_base.base.gallivm; - struct lp_build_context base = bld->bld_base.base; const struct tgsi_full_dst_register *reg = &inst->Dst[0]; LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; LLVMValueRef temp_ptr; @@ -350,28 +371,8 @@ emit_store( TGSI_FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) { LLVMValueRef value = dst[chan_index]; - if (inst->Instruction.Saturate != TGSI_SAT_NONE) { - struct lp_build_emit_data clamp_emit_data; - - memset(&clamp_emit_data, 0, sizeof(clamp_emit_data)); - clamp_emit_data.arg_count = 3; - clamp_emit_data.args[0] = value; - clamp_emit_data.args[2] = base.one; - - switch(inst->Instruction.Saturate) { - case TGSI_SAT_ZERO_ONE: - clamp_emit_data.args[1] = base.zero; - break; - case TGSI_SAT_MINUS_PLUS_ONE: - clamp_emit_data.args[1] = LLVMConstReal( - base.elem_type, -1.0f); - break; - default: - assert(0); - } - value = lp_build_emit_llvm(bld_base, TGSI_OPCODE_CLAMP, - &clamp_emit_data); - } + if (inst->Instruction.Saturate) + value = radeon_llvm_saturate(bld_base, value); if (reg->Register.File == TGSI_FILE_ADDRESS) { temp_ptr = bld->addr[reg->Register.Index][chan_index]; @@ -1438,8 +1439,6 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) /* Allocate outputs */ ctx->soa.outputs = ctx->outputs; - ctx->num_arrays = 0; - /* XXX: Is there a better way to initialize all this ? */ lp_set_default_actions(bld_base); @@ -1628,8 +1627,11 @@ void radeon_llvm_dispose(struct radeon_llvm_context * ctx) { LLVMDisposeModule(ctx->soa.bld_base.base.gallivm->module); LLVMContextDispose(ctx->soa.bld_base.base.gallivm->context); + FREE(ctx->arrays); + ctx->arrays = NULL; FREE(ctx->temps); ctx->temps = NULL; + ctx->temps_count = 0; FREE(ctx->loop); ctx->loop = NULL; ctx->loop_depth_max = 0; diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c index e220f40165b..a6567379fe3 100644 --- a/src/gallium/drivers/radeon/radeon_vce.c +++ b/src/gallium/drivers/radeon/radeon_vce.c @@ -44,6 +44,10 @@ #include "radeon_video.h" #include "radeon_vce.h" +#define FW_40_2_2 ((40 << 24) | (2 << 16) | (2 << 8)) +#define FW_50_0_1 ((50 << 24) | (0 << 16) | (1 << 8)) +#define FW_50_1_2 ((50 << 24) | (1 << 16) | (2 << 8)) + /** * flush commands to the hardware */ @@ -183,6 +187,44 @@ static unsigned get_cpb_num(struct rvce_encoder *enc) } /** + * Get the slot for the currently encoded frame + */ +struct rvce_cpb_slot *current_slot(struct rvce_encoder *enc) +{ + return LIST_ENTRY(struct rvce_cpb_slot, enc->cpb_slots.prev, list); +} + +/** + * Get the slot for L0 + */ +struct rvce_cpb_slot *l0_slot(struct rvce_encoder *enc) +{ + return LIST_ENTRY(struct rvce_cpb_slot, enc->cpb_slots.next, list); +} + +/** + * Get the slot for L1 + */ +struct rvce_cpb_slot *l1_slot(struct rvce_encoder *enc) +{ + return LIST_ENTRY(struct rvce_cpb_slot, enc->cpb_slots.next->next, list); +} + +/** + * Calculate the offsets into the CPB + */ +void rvce_frame_offset(struct rvce_encoder *enc, struct rvce_cpb_slot *slot, + unsigned *luma_offset, unsigned *chroma_offset) +{ + unsigned pitch = align(enc->luma->level[0].pitch_bytes, 128); + unsigned vpitch = align(enc->luma->npix_y, 16); + unsigned fsize = pitch * (vpitch + vpitch / 2); + + *luma_offset = slot->index * fsize; + *chroma_offset = *luma_offset + pitch * vpitch; +} + +/** * destroy this video encoder */ static void rvce_destroy(struct pipe_video_codec *encoder) @@ -406,7 +448,19 @@ struct pipe_video_codec 
*rvce_create_encoder(struct pipe_context *context, reset_cpb(enc); - radeon_vce_40_2_2_init(enc); + switch (rscreen->info.vce_fw_version) { + case FW_40_2_2: + radeon_vce_40_2_2_init(enc); + break; + + case FW_50_0_1: + case FW_50_1_2: + radeon_vce_50_init(enc); + break; + + default: + goto error; + } return &enc->base; @@ -426,5 +480,7 @@ error: */ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen) { - return rscreen->info.vce_fw_version == ((40 << 24) | (2 << 16) | (2 << 8)); + return rscreen->info.vce_fw_version == FW_40_2_2 || + rscreen->info.vce_fw_version == FW_50_0_1 || + rscreen->info.vce_fw_version == FW_50_1_2; } diff --git a/src/gallium/drivers/radeon/radeon_vce.h b/src/gallium/drivers/radeon/radeon_vce.h index 1cf018006a8..8319ef48cd5 100644 --- a/src/gallium/drivers/radeon/radeon_vce.h +++ b/src/gallium/drivers/radeon/radeon_vce.h @@ -104,6 +104,13 @@ struct rvce_encoder { bool use_vui; }; +/* CPB handling functions */ +struct rvce_cpb_slot *current_slot(struct rvce_encoder *enc); +struct rvce_cpb_slot *l0_slot(struct rvce_encoder *enc); +struct rvce_cpb_slot *l1_slot(struct rvce_encoder *enc); +void rvce_frame_offset(struct rvce_encoder *enc, struct rvce_cpb_slot *slot, + unsigned *luma_offset, unsigned *chroma_offset); + struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, const struct pipe_video_codec *templat, struct radeon_winsys* ws, @@ -114,4 +121,7 @@ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen); /* init vce fw 40.2.2 specific callbacks */ void radeon_vce_40_2_2_init(struct rvce_encoder *enc); +/* init vce fw 50 specific callbacks */ +void radeon_vce_50_init(struct rvce_encoder *enc); + #endif diff --git a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c index 09029575547..51b17b5f6a8 100644 --- a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c +++ b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c @@ -46,32 +46,6 @@ static const unsigned profiles[7] = { 66, 77, 88, 100, 110, 122, 244 }; -static struct rvce_cpb_slot *current_slot(struct rvce_encoder *enc) -{ - return LIST_ENTRY(struct rvce_cpb_slot, enc->cpb_slots.prev, list); -} - -static struct rvce_cpb_slot *l0_slot(struct rvce_encoder *enc) -{ - return LIST_ENTRY(struct rvce_cpb_slot, enc->cpb_slots.next, list); -} - -static struct rvce_cpb_slot *l1_slot(struct rvce_encoder *enc) -{ - return LIST_ENTRY(struct rvce_cpb_slot, enc->cpb_slots.next->next, list); -} - -static void frame_offset(struct rvce_encoder *enc, struct rvce_cpb_slot *slot, - unsigned *luma_offset, unsigned *chroma_offset) -{ - unsigned pitch = align(enc->luma->level[0].pitch_bytes, 128); - unsigned vpitch = align(enc->luma->npix_y, 16); - unsigned fsize = pitch * (vpitch + vpitch / 2); - - *luma_offset = slot->index * fsize; - *chroma_offset = *luma_offset + pitch * vpitch; -} - static void session(struct rvce_encoder *enc) { RVCE_BEGIN(0x00000001); // session cmd @@ -369,7 +343,7 @@ static void encode(struct rvce_encoder *enc) if(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P || enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) { struct rvce_cpb_slot *l0 = l0_slot(enc); - frame_offset(enc, l0, &luma_offset, &chroma_offset); + rvce_frame_offset(enc, l0, &luma_offset, &chroma_offset); RVCE_CS(l0->picture_type); // encPicType RVCE_CS(l0->frame_num); // frameNumber RVCE_CS(l0->pic_order_cnt); // pictureOrderCount @@ -395,7 +369,7 @@ static void encode(struct rvce_encoder *enc) RVCE_CS(0x00000000); // pictureStructure 
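
The FW_40_2_2 / FW_50_0_1 / FW_50_1_2 constants above pack the VCE firmware version reported in rscreen->info.vce_fw_version as (major << 24) | (minor << 16) | (patch << 8). A self-contained sketch of that packing and its inverse; the helper name is illustrative only:

#include <stdint.h>
#include <stdio.h>

static uint32_t vce_fw_pack(unsigned maj, unsigned min, unsigned pat)
{
   return ((uint32_t)maj << 24) | ((uint32_t)min << 16) | ((uint32_t)pat << 8);
}

int main(void)
{
   uint32_t fw = vce_fw_pack(50, 1, 2); /* same value as FW_50_1_2 */
   printf("VCE firmware %u.%u.%u\n",
          (unsigned)(fw >> 24) & 0xff,
          (unsigned)(fw >> 16) & 0xff,
          (unsigned)(fw >> 8) & 0xff);
   return 0;
}
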
if(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) { struct rvce_cpb_slot *l1 = l1_slot(enc); - frame_offset(enc, l1, &luma_offset, &chroma_offset); + rvce_frame_offset(enc, l1, &luma_offset, &chroma_offset); RVCE_CS(l1->picture_type); // encPicType RVCE_CS(l1->frame_num); // frameNumber RVCE_CS(l1->pic_order_cnt); // pictureOrderCount @@ -409,7 +383,7 @@ static void encode(struct rvce_encoder *enc) RVCE_CS(0xffffffff); // chromaOffset } - frame_offset(enc, current_slot(enc), &luma_offset, &chroma_offset); + rvce_frame_offset(enc, current_slot(enc), &luma_offset, &chroma_offset); RVCE_CS(luma_offset); // encReconstructedLumaOffset RVCE_CS(chroma_offset); // encReconstructedChromaOffset RVCE_CS(0x00000000); // encColocBufferOffset diff --git a/src/gallium/drivers/radeon/radeon_vce_50.c b/src/gallium/drivers/radeon/radeon_vce_50.c new file mode 100644 index 00000000000..84a2bfb117e --- /dev/null +++ b/src/gallium/drivers/radeon/radeon_vce_50.c @@ -0,0 +1,228 @@ +/************************************************************************** + * + * Copyright 2013 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* + * Authors: + * Christian König <[email protected]> + * + */ + +#include <stdio.h> + +#include "pipe/p_video_codec.h" + +#include "util/u_video.h" +#include "util/u_memory.h" + +#include "vl/vl_video_buffer.h" + +#include "r600_pipe_common.h" +#include "radeon_video.h" +#include "radeon_vce.h" + +static void task_info(struct rvce_encoder *enc, uint32_t taskOperation) +{ + RVCE_BEGIN(0x00000002); // task info + RVCE_CS(0xffffffff); // offsetOfNextTaskInfo + RVCE_CS(taskOperation); // taskOperation + RVCE_CS(0x00000000); // referencePictureDependency + RVCE_CS(0x00000000); // collocateFlagDependency + RVCE_CS(0x00000000); // feedbackIndex + RVCE_CS(0x00000000); // videoBitstreamRingIndex + RVCE_END(); +} + +static void rate_control(struct rvce_encoder *enc) +{ + RVCE_BEGIN(0x04000005); // rate control + RVCE_CS(enc->pic.rate_ctrl.rate_ctrl_method); // encRateControlMethod + RVCE_CS(enc->pic.rate_ctrl.target_bitrate); // encRateControlTargetBitRate + RVCE_CS(enc->pic.rate_ctrl.peak_bitrate); // encRateControlPeakBitRate + RVCE_CS(enc->pic.rate_ctrl.frame_rate_num); // encRateControlFrameRateNum + RVCE_CS(0x00000000); // encGOPSize + RVCE_CS(enc->pic.quant_i_frames); // encQP_I + RVCE_CS(enc->pic.quant_p_frames); // encQP_P + RVCE_CS(enc->pic.quant_b_frames); // encQP_B + RVCE_CS(enc->pic.rate_ctrl.vbv_buffer_size); // encVBVBufferSize + RVCE_CS(enc->pic.rate_ctrl.frame_rate_den); // encRateControlFrameRateDen + RVCE_CS(0x00000000); // encVBVBufferLevel + RVCE_CS(0x00000000); // encMaxAUSize + RVCE_CS(0x00000000); // encQPInitialMode + RVCE_CS(enc->pic.rate_ctrl.target_bits_picture); // encTargetBitsPerPicture + RVCE_CS(enc->pic.rate_ctrl.peak_bits_picture_integer); // encPeakBitsPerPictureInteger + RVCE_CS(enc->pic.rate_ctrl.peak_bits_picture_fraction); // encPeakBitsPerPictureFractional + RVCE_CS(0x00000000); // encMinQP + RVCE_CS(0x00000033); // encMaxQP + RVCE_CS(0x00000000); // encSkipFrameEnable + RVCE_CS(0x00000000); // encFillerDataEnable + RVCE_CS(0x00000000); // encEnforceHRD + RVCE_CS(0x00000000); // encBPicsDeltaQP + RVCE_CS(0x00000000); // encReferenceBPicsDeltaQP + RVCE_CS(0x00000000); // encRateControlReInitDisable + RVCE_CS(0x00000000); // encLCVBRInitQPFlag + RVCE_CS(0x00000000); // encLCVBRSATDBasedNonlinearBitBudgetFlag + RVCE_END(); +} + +static void encode(struct rvce_encoder *enc) +{ + int i; + unsigned luma_offset, chroma_offset; + + task_info(enc, 0x00000003); + + RVCE_BEGIN(0x05000001); // context buffer + RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains); // encodeContextAddressHi + RVCE_CS(0x00000000); // encodeContextAddressLo + RVCE_END(); + + RVCE_BEGIN(0x05000004); // video bitstream buffer + RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT); // videoBitstreamRingAddressHi + RVCE_CS(0x00000000); // videoBitstreamRingAddressLo + RVCE_CS(enc->bs_size); // videoBitstreamRingSize + RVCE_END(); + + RVCE_BEGIN(0x03000001); // encode + RVCE_CS(enc->pic.frame_num ? 
0x0 : 0x11); // insertHeaders + RVCE_CS(0x00000000); // pictureStructure + RVCE_CS(enc->bs_size); // allowedMaxBitstreamSize + RVCE_CS(0x00000000); // forceRefreshMap + RVCE_CS(0x00000000); // insertAUD + RVCE_CS(0x00000000); // endOfSequence + RVCE_CS(0x00000000); // endOfStream + RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureLumaAddressHi + RVCE_CS(enc->luma->level[0].offset); // inputPictureLumaAddressLo + RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureChromaAddressHi + RVCE_CS(enc->chroma->level[0].offset); // inputPictureChromaAddressLo + RVCE_CS(align(enc->luma->npix_y, 16)); // encInputFrameYPitch + RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch + RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch + RVCE_CS(0x00010000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading) + RVCE_CS(0x00000000); // encInputPicTileConfig + RVCE_CS(enc->pic.picture_type); // encPicType + RVCE_CS(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR); // encIdrFlag + RVCE_CS(0x00000000); // encIdrPicId + RVCE_CS(0x00000000); // encMGSKeyPic + RVCE_CS(!enc->pic.not_referenced); // encReferenceFlag + RVCE_CS(0x00000000); // encTemporalLayerIndex + RVCE_CS(0x00000000); // num_ref_idx_active_override_flag + RVCE_CS(0x00000000); // num_ref_idx_l0_active_minus1 + RVCE_CS(0x00000000); // num_ref_idx_l1_active_minus1 + + i = enc->pic.frame_num - enc->pic.ref_idx_l0; + if (i > 1 && enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P) { + RVCE_CS(0x00000001); // encRefListModificationOp + RVCE_CS(i - 1); // encRefListModificationNum + } else { + RVCE_CS(0x00000000); // encRefListModificationOp + RVCE_CS(0x00000000); // encRefListModificationNum + } + + for (i = 0; i < 3; ++i) { + RVCE_CS(0x00000000); // encRefListModificationOp + RVCE_CS(0x00000000); // encRefListModificationNum + } + for (i = 0; i < 4; ++i) { + RVCE_CS(0x00000000); // encDecodedPictureMarkingOp + RVCE_CS(0x00000000); // encDecodedPictureMarkingNum + RVCE_CS(0x00000000); // encDecodedPictureMarkingIdx + RVCE_CS(0x00000000); // encDecodedRefBasePictureMarkingOp + RVCE_CS(0x00000000); // encDecodedRefBasePictureMarkingNum + } + + // encReferencePictureL0[0] + RVCE_CS(0x00000000); // pictureStructure + if(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P || + enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) { + struct rvce_cpb_slot *l0 = l0_slot(enc); + rvce_frame_offset(enc, l0, &luma_offset, &chroma_offset); + RVCE_CS(l0->picture_type); // encPicType + RVCE_CS(l0->frame_num); // frameNumber + RVCE_CS(l0->pic_order_cnt); // pictureOrderCount + RVCE_CS(luma_offset); // lumaOffset + RVCE_CS(chroma_offset); // chromaOffset + } else { + RVCE_CS(0x00000000); // encPicType + RVCE_CS(0x00000000); // frameNumber + RVCE_CS(0x00000000); // pictureOrderCount + RVCE_CS(0xffffffff); // lumaOffset + RVCE_CS(0xffffffff); // chromaOffset + } + + // encReferencePictureL0[1] + RVCE_CS(0x00000000); // pictureStructure + RVCE_CS(0x00000000); // encPicType + RVCE_CS(0x00000000); // frameNumber + RVCE_CS(0x00000000); // pictureOrderCount + RVCE_CS(0xffffffff); // lumaOffset + RVCE_CS(0xffffffff); // chromaOffset + + // encReferencePictureL1[0] + RVCE_CS(0x00000000); // pictureStructure + if(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) { + struct rvce_cpb_slot *l1 = l1_slot(enc); + rvce_frame_offset(enc, l1, &luma_offset, &chroma_offset); + RVCE_CS(l1->picture_type); // encPicType + RVCE_CS(l1->frame_num); // frameNumber + RVCE_CS(l1->pic_order_cnt); // pictureOrderCount 
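
rvce_frame_offset(), used just below for the reconstructed picture, treats each CPB slot as a packed NV12 frame: the luma pitch is aligned to 128 bytes, the row count to 16, and one slot spans pitch * (vpitch + vpitch / 2) bytes (full-height luma plus half-height chroma). A worked sketch of the same arithmetic; the 1280x720 frame is an assumed example, not taken from the patch:

#include <stdio.h>

#define ALIGN_POT(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
   unsigned pitch  = ALIGN_POT(1280u, 128u);        /* luma pitch in bytes */
   unsigned vpitch = ALIGN_POT(720u, 16u);          /* padded row count */
   unsigned fsize  = pitch * (vpitch + vpitch / 2); /* one NV12 CPB slot */
   unsigned slot   = 2;                             /* third slot in the CPB */

   unsigned luma_offset   = slot * fsize;
   unsigned chroma_offset = luma_offset + pitch * vpitch;

   printf("slot %u: luma @ %u, chroma @ %u, slot size %u\n",
          slot, luma_offset, chroma_offset, fsize);
   return 0;
}
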
+ RVCE_CS(luma_offset); // lumaOffset + RVCE_CS(chroma_offset); // chromaOffset + } else { + RVCE_CS(0x00000000); // encPicType + RVCE_CS(0x00000000); // frameNumber + RVCE_CS(0x00000000); // pictureOrderCount + RVCE_CS(0xffffffff); // lumaOffset + RVCE_CS(0xffffffff); // chromaOffset + } + + rvce_frame_offset(enc, current_slot(enc), &luma_offset, &chroma_offset); + RVCE_CS(luma_offset); // encReconstructedLumaOffset + RVCE_CS(chroma_offset); // encReconstructedChromaOffset + RVCE_CS(0x00000000); // encColocBufferOffset + RVCE_CS(0x00000000); // encReconstructedRefBasePictureLumaOffset + RVCE_CS(0x00000000); // encReconstructedRefBasePictureChromaOffset + RVCE_CS(0x00000000); // encReferenceRefBasePictureLumaOffset + RVCE_CS(0x00000000); // encReferenceRefBasePictureChromaOffset + RVCE_CS(0x00000000); // pictureCount + RVCE_CS(enc->pic.frame_num); // frameNumber + RVCE_CS(enc->pic.pic_order_cnt); // pictureOrderCount + RVCE_CS(0x00000000); // numIPicRemainInRCGOP + RVCE_CS(0x00000000); // numPPicRemainInRCGOP + RVCE_CS(0x00000000); // numBPicRemainInRCGOP + RVCE_CS(0x00000000); // numIRPicRemainInRCGOP + RVCE_CS(0x00000000); // enableIntraRefresh + RVCE_END(); +} + +void radeon_vce_50_init(struct rvce_encoder *enc) +{ + radeon_vce_40_2_2_init(enc); + + /* only the two below are different */ + enc->rate_control = rate_control; + enc->encode = encode; +} diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources index 774dc2285c0..2876c0ae735 100644 --- a/src/gallium/drivers/radeonsi/Makefile.sources +++ b/src/gallium/drivers/radeonsi/Makefile.sources @@ -1,4 +1,5 @@ C_SOURCES := \ + cik_sdma.c \ si_blit.c \ si_commands.c \ si_compute.c \ diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c new file mode 100644 index 00000000000..86111cb86e8 --- /dev/null +++ b/src/gallium/drivers/radeonsi/cik_sdma.c @@ -0,0 +1,364 @@ +/* + * Copyright 2010 Jerome Glisse <[email protected]> + * Copyright 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Jerome Glisse + */ + +#include "sid.h" +#include "si_pipe.h" +#include "../radeon/r600_cs.h" + +#include "util/u_format.h" + +static uint32_t cik_micro_tile_mode(struct si_screen *sscreen, unsigned tile_mode) +{ + if (sscreen->b.info.si_tile_mode_array_valid) { + uint32_t gb_tile_mode = sscreen->b.info.si_tile_mode_array[tile_mode]; + + return G_009910_MICRO_TILE_MODE_NEW(gb_tile_mode); + } + + /* The kernel cannot return the tile mode array. Guess? */ + return V_009910_ADDR_SURF_THIN_MICRO_TILING; +} + +static void cik_sdma_do_copy_buffer(struct si_context *ctx, + struct pipe_resource *dst, + struct pipe_resource *src, + uint64_t dst_offset, + uint64_t src_offset, + uint64_t size) +{ + struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs; + unsigned i, ncopy, csize; + struct r600_resource *rdst = (struct r600_resource*)dst; + struct r600_resource *rsrc = (struct r600_resource*)src; + + dst_offset += r600_resource(dst)->gpu_address; + src_offset += r600_resource(src)->gpu_address; + + ncopy = (size + CIK_SDMA_COPY_MAX_SIZE - 1) / CIK_SDMA_COPY_MAX_SIZE; + r600_need_dma_space(&ctx->b, ncopy * 7); + + r600_context_bo_reloc(&ctx->b, &ctx->b.rings.dma, rsrc, RADEON_USAGE_READ, + RADEON_PRIO_MIN); + r600_context_bo_reloc(&ctx->b, &ctx->b.rings.dma, rdst, RADEON_USAGE_WRITE, + RADEON_PRIO_MIN); + + for (i = 0; i < ncopy; i++) { + csize = size < CIK_SDMA_COPY_MAX_SIZE ? size : CIK_SDMA_COPY_MAX_SIZE; + cs->buf[cs->cdw++] = CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, + CIK_SDMA_COPY_SUB_OPCODE_LINEAR, + 0); + cs->buf[cs->cdw++] = csize; + cs->buf[cs->cdw++] = 0; /* src/dst endian swap */ + cs->buf[cs->cdw++] = src_offset; + cs->buf[cs->cdw++] = src_offset >> 32; + cs->buf[cs->cdw++] = dst_offset; + cs->buf[cs->cdw++] = dst_offset >> 32; + dst_offset += csize; + src_offset += csize; + size -= csize; + } +} + +static void cik_sdma_copy_buffer(struct si_context *ctx, + struct pipe_resource *dst, + struct pipe_resource *src, + uint64_t dst_offset, + uint64_t src_offset, + uint64_t size) +{ + struct r600_resource *rdst = (struct r600_resource*)dst; + + /* Mark the buffer range of the destination as valid (initialized), + * so that transfer_map knows it should wait for the GPU when mapping + * that range. */ + util_range_add(&rdst->valid_buffer_range, dst_offset, + dst_offset + size); + + cik_sdma_do_copy_buffer(ctx, dst, src, dst_offset, src_offset, size); +} + +static void cik_sdma_copy_tile(struct si_context *ctx, + struct pipe_resource *dst, + unsigned dst_level, + struct pipe_resource *src, + unsigned src_level, + unsigned y, + unsigned copy_height, + unsigned y_align, + unsigned pitch, + unsigned bpe) +{ + struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs; + struct si_screen *sscreen = ctx->screen; + struct r600_texture *rsrc = (struct r600_texture*)src; + struct r600_texture *rdst = (struct r600_texture*)dst; + struct r600_texture *rlinear, *rtiled; + unsigned linear_lvl, tiled_lvl; + unsigned array_mode, lbpe, pitch_tile_max, slice_tile_max, size; + unsigned ncopy, height, cheight, detile, i, src_mode, dst_mode; + unsigned sub_op, bank_h, bank_w, mt_aspect, nbanks, tile_split, mt; + uint64_t base, addr; + unsigned pipe_config, tile_mode_index; + + dst_mode = rdst->surface.level[dst_level].mode; + src_mode = rsrc->surface.level[src_level].mode; + /* downcast linear aligned to linear to simplify test */ + src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode; + dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ?
RADEON_SURF_MODE_LINEAR : dst_mode; + assert(dst_mode != src_mode); + assert(src_mode == RADEON_SURF_MODE_LINEAR || dst_mode == RADEON_SURF_MODE_LINEAR); + + sub_op = CIK_SDMA_COPY_SUB_OPCODE_TILED; + lbpe = util_logbase2(bpe); + pitch_tile_max = ((pitch / bpe) / 8) - 1; + + detile = dst_mode == RADEON_SURF_MODE_LINEAR; + rlinear = detile ? rdst : rsrc; + rtiled = detile ? rsrc : rdst; + linear_lvl = detile ? dst_level : src_level; + tiled_lvl = detile ? src_level : dst_level; + + assert(!util_format_is_depth_and_stencil(rtiled->resource.b.b.format)); + + array_mode = si_array_mode(rtiled->surface.level[tiled_lvl].mode); + slice_tile_max = (rtiled->surface.level[tiled_lvl].nblk_x * + rtiled->surface.level[tiled_lvl].nblk_y) / (8*8) - 1; + height = rlinear->surface.level[linear_lvl].nblk_y; + base = rtiled->surface.level[tiled_lvl].offset; + addr = rlinear->surface.level[linear_lvl].offset; + bank_h = cik_bank_wh(rtiled->surface.bankh); + bank_w = cik_bank_wh(rtiled->surface.bankw); + mt_aspect = cik_macro_tile_aspect(rtiled->surface.mtilea); + tile_split = cik_tile_split(rtiled->surface.tile_split); + tile_mode_index = si_tile_mode_index(rtiled, tiled_lvl, false); + nbanks = si_num_banks(sscreen, rtiled); + base += rtiled->resource.gpu_address; + addr += rlinear->resource.gpu_address; + + pipe_config = cik_db_pipe_config(sscreen, tile_mode_index); + mt = cik_micro_tile_mode(sscreen, tile_mode_index); + + size = (copy_height * pitch) / 4; + cheight = copy_height; + if (((cheight * pitch) / 4) > CIK_SDMA_COPY_MAX_SIZE) { + cheight = (CIK_SDMA_COPY_MAX_SIZE * 4) / pitch; + cheight &= ~(y_align - 1); + } + ncopy = (copy_height + cheight - 1) / cheight; + r600_need_dma_space(&ctx->b, ncopy * 12); + + r600_context_bo_reloc(&ctx->b, &ctx->b.rings.dma, &rsrc->resource, + RADEON_USAGE_READ, RADEON_PRIO_MIN); + r600_context_bo_reloc(&ctx->b, &ctx->b.rings.dma, &rdst->resource, + RADEON_USAGE_WRITE, RADEON_PRIO_MIN); + + copy_height = size * 4 / pitch; + for (i = 0; i < ncopy; i++) { + cheight = copy_height; + if (((cheight * pitch) / 4) > CIK_SDMA_COPY_MAX_SIZE) { + cheight = (CIK_SDMA_COPY_MAX_SIZE * 4) / pitch; + cheight &= ~(y_align - 1); + } + size = (cheight * pitch) / 4; + + cs->buf[cs->cdw++] = CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, + sub_op, detile << 15); + cs->buf[cs->cdw++] = base; + cs->buf[cs->cdw++] = base >> 32; + cs->buf[cs->cdw++] = ((height - 1) << 16) | pitch_tile_max; + cs->buf[cs->cdw++] = slice_tile_max; + cs->buf[cs->cdw++] = (pipe_config << 26) | (mt_aspect << 24) | + (nbanks << 21) | (bank_h << 18) | (bank_w << 15) | + (tile_split << 11) | (mt << 8) | (array_mode << 3) | + lbpe; + cs->buf[cs->cdw++] = y << 16; /* | x */ + cs->buf[cs->cdw++] = 0; /* z */; + cs->buf[cs->cdw++] = addr & 0xfffffffc; + cs->buf[cs->cdw++] = addr >> 32; + cs->buf[cs->cdw++] = (pitch / bpe) - 1; + cs->buf[cs->cdw++] = size; + + copy_height -= cheight; + y += cheight; + } +} + +void cik_sdma_copy(struct pipe_context *ctx, + struct pipe_resource *dst, + unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, + unsigned src_level, + const struct pipe_box *src_box) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct r600_texture *rsrc = (struct r600_texture*)src; + struct r600_texture *rdst = (struct r600_texture*)dst; + unsigned dst_pitch, src_pitch, bpe, dst_mode, src_mode; + unsigned src_w, dst_w; + unsigned src_x, src_y; + unsigned copy_height, y_align; + unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz; + + if (sctx->b.rings.dma.cs == NULL) { + 
goto fallback; + } + + if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) { + cik_sdma_copy_buffer(sctx, dst, src, dst_x, src_box->x, src_box->width); + return; + } + + /* Before re-enabling this, please make sure you can hit all newly + * enabled paths in your testing, preferably with both piglit (in + * particular the streaming-texture-leak test) and real world apps + * (e.g. the UE4 Elemental demo). + */ + goto fallback; + + if (src->format != dst->format || + rdst->surface.nsamples > 1 || rsrc->surface.nsamples > 1 || + rdst->dirty_level_mask & (1 << dst_level)) { + goto fallback; + } + + if (rsrc->dirty_level_mask & (1 << src_level)) { + if (rsrc->htile_buffer) + goto fallback; + + ctx->flush_resource(ctx, src); + } + + src_x = util_format_get_nblocksx(src->format, src_box->x); + dst_x = util_format_get_nblocksx(src->format, dst_x); + src_y = util_format_get_nblocksy(src->format, src_box->y); + dst_y = util_format_get_nblocksy(src->format, dst_y); + + dst_pitch = rdst->surface.level[dst_level].pitch_bytes; + src_pitch = rsrc->surface.level[src_level].pitch_bytes; + src_w = rsrc->surface.level[src_level].npix_x; + dst_w = rdst->surface.level[dst_level].npix_x; + + if (src_pitch != dst_pitch || src_box->x || dst_x || src_w != dst_w || + src_box->width != src_w || + rsrc->surface.level[src_level].nblk_y != + rdst->surface.level[dst_level].nblk_y) { + /* FIXME CIK can do partial blit */ + goto fallback; + } + + bpe = rdst->surface.bpe; + copy_height = src_box->height / rsrc->surface.blk_h; + dst_mode = rdst->surface.level[dst_level].mode; + src_mode = rsrc->surface.level[src_level].mode; + /* downcast linear aligned to linear to simplify test */ + src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode; + dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode; + + /* Dimensions must be aligned to (macro)tiles */ + switch (src_mode == RADEON_SURF_MODE_LINEAR ? 
dst_mode : src_mode) { + case RADEON_SURF_MODE_1D: + if ((src_x % 8) || (src_y % 8) || (dst_x % 8) || (dst_y % 8) || + (copy_height % 8)) + goto fallback; + y_align = 8; + break; + case RADEON_SURF_MODE_2D: { + unsigned mtilew, mtileh, num_banks; + + switch (si_num_banks(sctx->screen, rsrc)) { + case V_02803C_ADDR_SURF_2_BANK: + default: + num_banks = 2; + break; + case V_02803C_ADDR_SURF_4_BANK: + num_banks = 4; + break; + case V_02803C_ADDR_SURF_8_BANK: + num_banks = 8; + break; + case V_02803C_ADDR_SURF_16_BANK: + num_banks = 16; + break; + } + + mtilew = (8 * rsrc->surface.bankw * + sctx->screen->b.tiling_info.num_channels) * + rsrc->surface.mtilea; + assert(!(mtilew & (mtilew - 1))); + mtileh = (8 * rsrc->surface.bankh * num_banks) / + rsrc->surface.mtilea; + assert(!(mtileh & (mtileh - 1))); + + if ((src_x & (mtilew - 1)) || (src_y & (mtileh - 1)) || + (dst_x & (mtilew - 1)) || (dst_y & (mtileh - 1)) || + (copy_height & (mtileh - 1))) + goto fallback; + + y_align = mtileh; + break; + } + default: + y_align = 1; + } + + if (src_mode == dst_mode) { + uint64_t dst_offset, src_offset; + unsigned src_h, dst_h; + + src_h = rsrc->surface.level[src_level].npix_y; + dst_h = rdst->surface.level[dst_level].npix_y; + + if (src_box->depth > 1 && + (src_y || dst_y || src_h != dst_h || src_box->height != src_h)) + goto fallback; + + /* A simple DMA blit will do. NOTE: the code here assumes + * dst_pitch == src_pitch. + */ + src_offset = rsrc->surface.level[src_level].offset; + src_offset += rsrc->surface.level[src_level].slice_size * src_box->z; + src_offset += src_y * src_pitch + src_x * bpe; + dst_offset = rdst->surface.level[dst_level].offset; + dst_offset += rdst->surface.level[dst_level].slice_size * dst_z; + dst_offset += dst_y * dst_pitch + dst_x * bpe; + cik_sdma_do_copy_buffer(sctx, dst, src, dst_offset, src_offset, + src_box->depth * + rsrc->surface.level[src_level].slice_size); + } else { + if (dst_y != src_y || src_box->depth > 1 || src_box->z || dst_z) + goto fallback; + + cik_sdma_copy_tile(sctx, dst, dst_level, src, src_level, + src_y, copy_height, y_align, dst_pitch, bpe); + } + return; + +fallback: + si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz, + src, src_level, src_box); +} diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c index db523eef318..7a0076e7aa9 100644 --- a/src/gallium/drivers/radeonsi/si_dma.c +++ b/src/gallium/drivers/radeonsi/si_dma.c @@ -30,21 +30,6 @@ #include "util/u_format.h" -static unsigned si_array_mode(unsigned mode) -{ - switch (mode) { - case RADEON_SURF_MODE_LINEAR_ALIGNED: - return V_009910_ARRAY_LINEAR_ALIGNED; - case RADEON_SURF_MODE_1D: - return V_009910_ARRAY_1D_TILED_THIN1; - case RADEON_SURF_MODE_2D: - return V_009910_ARRAY_2D_TILED_THIN1; - default: - case RADEON_SURF_MODE_LINEAR: - return V_009910_ARRAY_LINEAR_GENERAL; - } -} - static uint32_t si_micro_tile_mode(struct si_screen *sscreen, unsigned tile_mode) { if (sscreen->b.info.si_tile_mode_array_valid) { @@ -240,11 +225,6 @@ void si_dma_copy(struct pipe_context *ctx, goto fallback; } - /* TODO: Implement DMA copy for CIK */ - if (sctx->b.chip_class >= CIK) { - goto fallback; - } - if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) { si_dma_copy_buffer(sctx, dst, src, dst_x, src_box->x, src_box->width); return; diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index e68c30e8c7c..53ae71a8c92 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@
-251,6 +251,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_POLYGON_OFFSET_CLAMP: case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: + case PIPE_CAP_TGSI_TEXCOORD: return 1; case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: @@ -286,13 +287,13 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_FRAGMENT_COLOR_CLAMPED: case PIPE_CAP_VERTEX_COLOR_CLAMPED: case PIPE_CAP_USER_VERTEX_BUFFERS: - case PIPE_CAP_TGSI_TEXCOORD: case PIPE_CAP_FAKE_SW_MSAA: case PIPE_CAP_TEXTURE_GATHER_OFFSETS: case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: case PIPE_CAP_SAMPLER_VIEW_TARGET: case PIPE_CAP_VERTEXID_NOBASE: + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: return 0; case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: @@ -451,6 +452,7 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: return 0; case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: return 1; } return 0; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index f98c7a83744..2d67342f160 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -237,6 +237,15 @@ struct si_context { unsigned spi_tmpring_size; }; +/* cik_sdma.c */ +void cik_sdma_copy(struct pipe_context *ctx, + struct pipe_resource *dst, + unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, + unsigned src_level, + const struct pipe_box *src_box); + /* si_blit.c */ void si_init_blit_functions(struct si_context *sctx); void si_flush_depth_textures(struct si_context *sctx, diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 89f02ab0410..47e5f96cbed 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -128,21 +128,10 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index) case TGSI_SEMANTIC_CLIPDIST: assert(index <= 1); return 2 + index; - case TGSI_SEMANTIC_CLIPVERTEX: - return 4; - case TGSI_SEMANTIC_COLOR: - assert(index <= 1); - return 5 + index; - case TGSI_SEMANTIC_BCOLOR: - assert(index <= 1); - return 7 + index; - case TGSI_SEMANTIC_FOG: - return 9; - case TGSI_SEMANTIC_EDGEFLAG: - return 10; case TGSI_SEMANTIC_GENERIC: - assert(index <= 63-11); - return 11 + index; + assert(index <= 63-4); + return 4 + index; + default: assert(0); return 63; @@ -1183,6 +1172,7 @@ handle_semantic: continue; case TGSI_SEMANTIC_PRIMID: case TGSI_SEMANTIC_FOG: + case TGSI_SEMANTIC_TEXCOORD: case TGSI_SEMANTIC_GENERIC: target = V_008DFC_SQ_EXP_PARAM + param_count; shader->vs_output_param_offset[i] = param_count; diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 7f0fdd599dc..6c18836d189 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -44,6 +44,21 @@ static void si_init_atom(struct r600_atom *atom, struct r600_atom **list_elem, *list_elem = atom; } +unsigned si_array_mode(unsigned mode) +{ + switch (mode) { + case RADEON_SURF_MODE_LINEAR_ALIGNED: + return V_009910_ARRAY_LINEAR_ALIGNED; + case RADEON_SURF_MODE_1D: + return V_009910_ARRAY_1D_TILED_THIN1; + case RADEON_SURF_MODE_2D: + return V_009910_ARRAY_2D_TILED_THIN1; + default: + case RADEON_SURF_MODE_LINEAR: + return 
V_009910_ARRAY_LINEAR_GENERAL; + } +} + uint32_t si_num_banks(struct si_screen *sscreen, struct r600_texture *tex) { if (sscreen->b.chip_class == CIK && @@ -636,18 +651,14 @@ static void *si_create_rs_state(struct pipe_context *ctx, rs->offset_units = state->offset_units; rs->offset_scale = state->offset_scale * 12.0f; - tmp = S_0286D4_FLAT_SHADE_ENA(1); - if (state->sprite_coord_enable) { - tmp |= S_0286D4_PNT_SPRITE_ENA(1) | - S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) | - S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) | - S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) | - S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1); - if (state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT) { - tmp |= S_0286D4_PNT_SPRITE_TOP_1(1); - } - } - si_pm4_set_reg(pm4, R_0286D4_SPI_INTERP_CONTROL_0, tmp); + si_pm4_set_reg(pm4, R_0286D4_SPI_INTERP_CONTROL_0, + S_0286D4_FLAT_SHADE_ENA(1) | + S_0286D4_PNT_SPRITE_ENA(1) | + S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) | + S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) | + S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) | + S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) | + S_0286D4_PNT_SPRITE_TOP_1(state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT)); /* point size 12.4 fixed point */ tmp = (unsigned)(state->point_size * 8.0); @@ -2910,11 +2921,16 @@ void si_init_state_functions(struct si_context *sctx) sctx->b.b.set_polygon_stipple = si_set_polygon_stipple; sctx->b.b.set_min_samples = si_set_min_samples; - sctx->b.dma_copy = si_dma_copy; sctx->b.set_occlusion_query_state = si_set_occlusion_query_state; sctx->b.need_gfx_cs_space = si_need_gfx_cs_space; sctx->b.b.draw_vbo = si_draw_vbo; + + if (sctx->b.chip_class >= CIK) { + sctx->b.dma_copy = cik_sdma_copy; + } else { + sctx->b.dma_copy = si_dma_copy; + } } static void diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 2f8a943846a..5e68b162137 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -261,6 +261,7 @@ unsigned cik_bank_wh(unsigned bankwh); unsigned cik_db_pipe_config(struct si_screen *sscreen, unsigned tile_mode); unsigned cik_macro_tile_aspect(unsigned macro_tile_aspect); unsigned cik_tile_split(unsigned tile_split); +unsigned si_array_mode(unsigned mode); uint32_t si_num_banks(struct si_screen *sscreen, struct r600_texture *tex); unsigned si_tile_mode_index(struct r600_texture *rtex, unsigned level, bool stencil); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 1bbc6b3ca7a..208c8523ef1 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -182,8 +182,13 @@ static void si_shader_vs(struct si_shader *shader) for (nparams = 0, i = 0 ; i < info->num_outputs; i++) { switch (info->output_semantic_name[i]) { case TGSI_SEMANTIC_CLIPVERTEX: + case TGSI_SEMANTIC_CLIPDIST: + case TGSI_SEMANTIC_CULLDIST: case TGSI_SEMANTIC_POSITION: case TGSI_SEMANTIC_PSIZE: + case TGSI_SEMANTIC_EDGEFLAG: + case TGSI_SEMANTIC_VIEWPORT_INDEX: + case TGSI_SEMANTIC_LAYER: break; default: nparams++; @@ -351,21 +356,25 @@ static INLINE void si_shader_selector_key(struct pipe_context *ctx, union si_shader_key *key) { struct si_context *sctx = (struct si_context *)ctx; - memset(key, 0, sizeof(*key)); + unsigned i; - if (sel->type == PIPE_SHADER_VERTEX) { - unsigned i; - if (!sctx->vertex_elements) - 
return; + memset(key, 0, sizeof(*key)); - for (i = 0; i < sctx->vertex_elements->count; ++i) - key->vs.instance_divisors[i] = sctx->vertex_elements->elements[i].instance_divisor; + switch (sel->type) { + case PIPE_SHADER_VERTEX: + if (sctx->vertex_elements) + for (i = 0; i < sctx->vertex_elements->count; ++i) + key->vs.instance_divisors[i] = + sctx->vertex_elements->elements[i].instance_divisor; if (sctx->gs_shader) { key->vs.as_es = 1; key->vs.gs_used_inputs = sctx->gs_shader->gs_used_inputs; } - } else if (sel->type == PIPE_SHADER_FRAGMENT) { + break; + case PIPE_SHADER_GEOMETRY: + break; + case PIPE_SHADER_FRAGMENT: { struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]) @@ -393,11 +402,14 @@ static INLINE void si_shader_selector_key(struct pipe_context *ctx, } key->ps.alpha_func = PIPE_FUNC_ALWAYS; - /* Alpha-test should be disabled if colorbuffer 0 is integer. */ if (sctx->queued.named.dsa && !sctx->framebuffer.cb0_is_integer) key->ps.alpha_func = sctx->queued.named.dsa->alpha_func; + break; + } + default: + assert(0); } } @@ -580,15 +592,22 @@ static void si_delete_shader_selector(struct pipe_context *ctx, while (p) { c = p->next_variant; - if (sel->type == PIPE_SHADER_GEOMETRY) { + switch (sel->type) { + case PIPE_SHADER_VERTEX: + if (p->key.vs.as_es) + si_pm4_delete_state(sctx, es, p->pm4); + else + si_pm4_delete_state(sctx, vs, p->pm4); + break; + case PIPE_SHADER_GEOMETRY: si_pm4_delete_state(sctx, gs, p->pm4); si_pm4_delete_state(sctx, vs, p->gs_copy_shader->pm4); - } else if (sel->type == PIPE_SHADER_FRAGMENT) + break; + case PIPE_SHADER_FRAGMENT: si_pm4_delete_state(sctx, ps, p->pm4); - else if (p->key.vs.as_es) - si_pm4_delete_state(sctx, es, p->pm4); - else - si_pm4_delete_state(sctx, vs, p->pm4); + break; + } + si_shader_destroy(ctx, p); free(p); p = c; @@ -661,8 +680,9 @@ bcolor: (interpolate == TGSI_INTERPOLATE_COLOR && sctx->flatshade)) tmp |= S_028644_FLAT_SHADE(1); - if (name == TGSI_SEMANTIC_GENERIC && - sctx->sprite_coord_enable & (1 << index)) { + if (name == TGSI_SEMANTIC_PCOORD || + (name == TGSI_SEMANTIC_TEXCOORD && + sctx->sprite_coord_enable & (1 << index))) { tmp |= S_028644_PT_SPRITE_TEX(1); } @@ -835,8 +855,15 @@ static void si_update_spi_tmpring_size(struct si_context *sctx) si_pm4_bind_state(sctx, ps, sctx->ps_shader->current->pm4); if (si_update_scratch_buffer(sctx, sctx->gs_shader)) si_pm4_bind_state(sctx, gs, sctx->gs_shader->current->pm4); - if (si_update_scratch_buffer(sctx, sctx->vs_shader)) - si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4); + + /* VS can be bound as ES or VS. */ + if (sctx->gs_shader) { + if (si_update_scratch_buffer(sctx, sctx->vs_shader)) + si_pm4_bind_state(sctx, es, sctx->vs_shader->current->pm4); + } else { + if (si_update_scratch_buffer(sctx, sctx->vs_shader)) + si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4); + } } /* The LLVM shader backend should be reporting aligned scratch_sizes. 
*/ diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h index afe011b15c7..35d5ee232a0 100644 --- a/src/gallium/drivers/radeonsi/sid.h +++ b/src/gallium/drivers/radeonsi/sid.h @@ -4516,6 +4516,13 @@ #define V_009910_ADDR_SURF_8_BANK 0x02 #define V_009910_ADDR_SURF_16_BANK 0x03 /* CIK */ +#define S_009910_MICRO_TILE_MODE_NEW(x) (((x) & 0x07) << 22) +#define G_009910_MICRO_TILE_MODE_NEW(x) (((x) >> 22) & 0x07) +#define C_009910_MICRO_TILE_MODE_NEW(x) 0xFE3FFFFF +#define V_009910_ADDR_SURF_DISPLAY_MICRO_TILING 0x00 +#define V_009910_ADDR_SURF_THIN_MICRO_TILING 0x01 +#define V_009910_ADDR_SURF_DEPTH_MICRO_TILING 0x02 +#define V_009910_ADDR_SURF_ROTATED_MICRO_TILING 0x03 #define R_00B01C_SPI_SHADER_PGM_RSRC3_PS 0x00B01C #define S_00B01C_CU_EN(x) (((x) & 0xFFFF) << 0) #define G_00B01C_CU_EN(x) (((x) >> 0) & 0xFFFF) @@ -8696,5 +8703,29 @@ #define SI_DMA_PACKET_CONSTANT_FILL 0xd #define SI_DMA_PACKET_NOP 0xf +/* CIK async DMA packets */ +#define CIK_SDMA_PACKET(op, sub_op, n) ((((n) & 0xFFFF) << 16) | \ + (((sub_op) & 0xFF) << 8) | \ + (((op) & 0xFF) << 0)) +/* CIK async DMA packet types */ +#define CIK_SDMA_OPCODE_NOP 0x0 +#define CIK_SDMA_OPCODE_COPY 0x1 +#define CIK_SDMA_COPY_SUB_OPCODE_LINEAR 0x0 +#define CIK_SDMA_COPY_SUB_OPCODE_TILED 0x1 +#define CIK_SDMA_COPY_SUB_OPCODE_SOA 0x3 +#define CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW 0x4 +#define CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW 0x5 +#define CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW 0x6 +#define CIK_SDMA_OPCODE_WRITE 0x2 +#define SDMA_WRITE_SUB_OPCODE_LINEAR 0x0 +#define SDMA_WRTIE_SUB_OPCODE_TILED 0x1 +#define CIK_SDMA_OPCODE_INDIRECT_BUFFER 0x4 +#define CIK_SDMA_PACKET_FENCE 0x5 +#define CIK_SDMA_PACKET_TRAP 0x6 +#define CIK_SDMA_PACKET_SEMAPHORE 0x7 +#define CIK_SDMA_PACKET_CONSTANT_FILL 0xb +#define CIK_SDMA_PACKET_SRBM_WRITE 0xe +#define CIK_SDMA_COPY_MAX_SIZE 0x1fffff + #endif /* _SID_H */ diff --git a/src/gallium/drivers/rbug/rbug_public.h b/src/gallium/drivers/rbug/rbug_public.h index b66740b49cd..83f9c94e31f 100644 --- a/src/gallium/drivers/rbug/rbug_public.h +++ b/src/gallium/drivers/rbug/rbug_public.h @@ -28,6 +28,10 @@ #ifndef RBUG_PUBLIC_H #define RBUG_PUBLIC_H +#ifdef __cplusplus +extern "C" { +#endif + struct pipe_screen; struct pipe_context; @@ -37,4 +41,8 @@ rbug_screen_create(struct pipe_screen *screen); boolean rbug_enabled(void); +#ifdef __cplusplus +} +#endif + #endif /* RBUG_PUBLIC_H */ diff --git a/src/gallium/drivers/softpipe/sp_public.h b/src/gallium/drivers/softpipe/sp_public.h index 62d0903d87a..88a9b5e6643 100644 --- a/src/gallium/drivers/softpipe/sp_public.h +++ b/src/gallium/drivers/softpipe/sp_public.h @@ -1,10 +1,18 @@ #ifndef SP_PUBLIC_H #define SP_PUBLIC_H +#ifdef __cplusplus +extern "C" { +#endif + struct pipe_screen; struct sw_winsys; struct pipe_screen * softpipe_create_screen(struct sw_winsys *winsys); +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c index e77387082bc..76105b4c0ec 100644 --- a/src/gallium/drivers/softpipe/sp_query.c +++ b/src/gallium/drivers/softpipe/sp_query.c @@ -277,7 +277,7 @@ softpipe_check_render_cond(struct softpipe_context *sp) b = pipe->get_query_result(pipe, sp->render_cond_query, wait, (void*)&result); if (b) - return (!result == sp->render_cond_cond); + return (!result) == sp->render_cond_cond; else return TRUE; } diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index d289e28a6f8..a688d319bb8 100644 --- 
a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -191,7 +191,9 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_ENDIANNESS: return PIPE_ENDIAN_NATIVE; case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: + return 4; case PIPE_CAP_TEXTURE_GATHER_SM5: + return 1; case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: case PIPE_CAP_TEXTURE_QUERY_LOD: case PIPE_CAP_SAMPLE_SHADING: @@ -200,13 +202,15 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: return 1; case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: - case PIPE_CAP_SAMPLER_VIEW_TARGET: return 0; + case PIPE_CAP_SAMPLER_VIEW_TARGET: + return 1; case PIPE_CAP_FAKE_SW_MSAA: return 1; case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: + return -32; case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET: - return 0; + return 31; case PIPE_CAP_DRAW_INDIRECT: return 1; @@ -237,6 +241,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) return 0; case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: return 0; } /* should only get here on unhandled cases */ diff --git a/src/gallium/drivers/softpipe/sp_state_sampler.c b/src/gallium/drivers/softpipe/sp_state_sampler.c index e56fb5b1485..d7a3360713f 100644 --- a/src/gallium/drivers/softpipe/sp_state_sampler.c +++ b/src/gallium/drivers/softpipe/sp_state_sampler.c @@ -202,7 +202,7 @@ prepare_shader_sampling( struct pipe_resource *res = view->texture; int j; - if (res->target != PIPE_BUFFER) { + if (view->target != PIPE_BUFFER) { first_level = view->u.tex.first_level; last_level = view->u.tex.last_level; assert(first_level <= last_level); @@ -214,15 +214,17 @@ prepare_shader_sampling( row_stride[j] = sp_tex->stride[j]; img_stride[j] = sp_tex->img_stride[j]; } - if (res->target == PIPE_TEXTURE_1D_ARRAY || - res->target == PIPE_TEXTURE_2D_ARRAY || - res->target == PIPE_TEXTURE_CUBE_ARRAY) { + if (view->target == PIPE_TEXTURE_1D_ARRAY || + view->target == PIPE_TEXTURE_2D_ARRAY || + view->target == PIPE_TEXTURE_CUBE || + view->target == PIPE_TEXTURE_CUBE_ARRAY) { num_layers = view->u.tex.last_layer - view->u.tex.first_layer + 1; for (j = first_level; j <= last_level; j++) { mip_offsets[j] += view->u.tex.first_layer * sp_tex->img_stride[j]; } - if (res->target == PIPE_TEXTURE_CUBE_ARRAY) { + if (view->target == PIPE_TEXTURE_CUBE || + view->target == PIPE_TEXTURE_CUBE_ARRAY) { assert(num_layers % 6 == 0); } assert(view->u.tex.first_layer <= view->u.tex.last_layer); diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c index 68dcf57240d..1010b63de2c 100644 --- a/src/gallium/drivers/softpipe/sp_tex_sample.c +++ b/src/gallium/drivers/softpipe/sp_tex_sample.c @@ -131,68 +131,80 @@ repeat(int coord, unsigned size) * \param icoord returns the integer texcoords */ static void -wrap_nearest_repeat(float s, unsigned size, int *icoord) +wrap_nearest_repeat(float s, unsigned size, int offset, int *icoord) { /* s limited to [0,1) */ /* i limited to [0,size-1] */ int i = util_ifloor(s * size); - *icoord = repeat(i, size); + *icoord = repeat(i + offset, size); } static void -wrap_nearest_clamp(float s, unsigned size, int *icoord) +wrap_nearest_clamp(float s, unsigned size, int offset, int *icoord) { /* s limited to [0,1] */ /* i limited to [0,size-1] */ + s *= size; + s += offset; if (s <= 0.0F) *icoord = 0; - else if (s >= 1.0F) + else if (s >= size) *icoord = size - 1; else - *icoord = 
util_ifloor(s * size); + *icoord = util_ifloor(s); } static void -wrap_nearest_clamp_to_edge(float s, unsigned size, int *icoord) +wrap_nearest_clamp_to_edge(float s, unsigned size, int offset, int *icoord) { /* s limited to [min,max] */ /* i limited to [0, size-1] */ - const float min = 1.0F / (2.0F * size); - const float max = 1.0F - min; + const float min = 0.5F; + const float max = (float)size - 0.5F; + + s *= size; + s += offset; + if (s < min) *icoord = 0; else if (s > max) *icoord = size - 1; else - *icoord = util_ifloor(s * size); + *icoord = util_ifloor(s); } static void -wrap_nearest_clamp_to_border(float s, unsigned size, int *icoord) +wrap_nearest_clamp_to_border(float s, unsigned size, int offset, int *icoord) { /* s limited to [min,max] */ /* i limited to [-1, size] */ - const float min = -1.0F / (2.0F * size); - const float max = 1.0F - min; + const float min = -0.5F; + const float max = size + 0.5F; + + s *= size; + s += offset; + if (s <= min) *icoord = -1; else if (s >= max) *icoord = size; else - *icoord = util_ifloor(s * size); + *icoord = util_ifloor(s); } - static void -wrap_nearest_mirror_repeat(float s, unsigned size, int *icoord) +wrap_nearest_mirror_repeat(float s, unsigned size, int offset, int *icoord) { const float min = 1.0F / (2.0F * size); const float max = 1.0F - min; - const int flr = util_ifloor(s); - float u = frac(s); + int flr; + float u; + + s += (float)offset / size; + flr = util_ifloor(s); + u = frac(s); if (flr & 1) u = 1.0F - u; if (u < min) @@ -205,51 +217,52 @@ static void -wrap_nearest_mirror_clamp(float s, unsigned size, int *icoord) +wrap_nearest_mirror_clamp(float s, unsigned size, int offset, int *icoord) { /* s limited to [0,1] */ /* i limited to [0,size-1] */ - const float u = fabsf(s); + const float u = fabsf(s * size + offset); if (u <= 0.0F) *icoord = 0; - else if (u >= 1.0F) + else if (u >= size) *icoord = size - 1; else - *icoord = util_ifloor(u * size); + *icoord = util_ifloor(u); } static void -wrap_nearest_mirror_clamp_to_edge(float s, unsigned size, int *icoord) +wrap_nearest_mirror_clamp_to_edge(float s, unsigned size, int offset, int *icoord) { /* s limited to [min,max] */ /* i limited to [0, size-1] */ - const float min = 1.0F / (2.0F * size); - const float max = 1.0F - min; - const float u = fabsf(s); + const float min = 0.5F; + const float max = (float)size - 0.5F; + const float u = fabsf(s * size + offset); + if (u < min) *icoord = 0; else if (u > max) *icoord = size - 1; else - *icoord = util_ifloor(u * size); + *icoord = util_ifloor(u); } static void -wrap_nearest_mirror_clamp_to_border(float s, unsigned size, int *icoord) +wrap_nearest_mirror_clamp_to_border(float s, unsigned size, int offset, int *icoord) { - /* s limited to [min,max] */ - /* i limited to [0, size-1] */ - const float min = -1.0F / (2.0F * size); - const float max = 1.0F - min; - const float u = fabsf(s); + /* u limited to [-0.5, size+0.5] */ + const float min = -0.5F; + const float max = (float)size + 0.5F; + const float u = fabsf(s * size + offset); + if (u < min) *icoord = -1; else if (u > max) *icoord = size; else - *icoord = util_ifloor(u * size); + *icoord = util_ifloor(u); } @@ -264,22 +277,23 @@ wrap_nearest_mirror_clamp_to_border(float s, unsigned size, int *icoord) * \param icoord returns the computed integer texture coord */ static void -wrap_linear_repeat(float s, unsigned size, +wrap_linear_repeat(float s, unsigned size, int offset, int *icoord0, int *icoord1, float *w) { float u
= s * size - 0.5F; - *icoord0 = repeat(util_ifloor(u), size); + *icoord0 = repeat(util_ifloor(u) + offset, size); *icoord1 = repeat(*icoord0 + 1, size); *w = frac(u); } static void -wrap_linear_clamp(float s, unsigned size, +wrap_linear_clamp(float s, unsigned size, int offset, int *icoord0, int *icoord1, float *w) { - float u = CLAMP(s, 0.0F, 1.0F); - u = u * size - 0.5f; + float u = CLAMP(s * size + offset, 0.0F, (float)size); + + u = u - 0.5f; *icoord0 = util_ifloor(u); *icoord1 = *icoord0 + 1; *w = frac(u); @@ -287,11 +301,11 @@ wrap_linear_clamp(float s, unsigned size, static void -wrap_linear_clamp_to_edge(float s, unsigned size, +wrap_linear_clamp_to_edge(float s, unsigned size, int offset, int *icoord0, int *icoord1, float *w) { - float u = CLAMP(s, 0.0F, 1.0F); - u = u * size - 0.5f; + float u = CLAMP(s * size + offset, 0.0F, (float)size); + u = u - 0.5f; *icoord0 = util_ifloor(u); *icoord1 = *icoord0 + 1; if (*icoord0 < 0) @@ -303,13 +317,13 @@ wrap_linear_clamp_to_edge(float s, unsigned size, static void -wrap_linear_clamp_to_border(float s, unsigned size, +wrap_linear_clamp_to_border(float s, unsigned size, int offset, int *icoord0, int *icoord1, float *w) { - const float min = -1.0F / (2.0F * size); - const float max = 1.0F - min; - float u = CLAMP(s, min, max); - u = u * size - 0.5f; + const float min = -0.5F; + const float max = (float)size + 0.5F; + float u = CLAMP(s * size + offset, min, max); + u = u - 0.5f; *icoord0 = util_ifloor(u); *icoord1 = *icoord0 + 1; *w = frac(u); @@ -317,11 +331,15 @@ wrap_linear_clamp_to_border(float s, unsigned size, static void -wrap_linear_mirror_repeat(float s, unsigned size, +wrap_linear_mirror_repeat(float s, unsigned size, int offset, int *icoord0, int *icoord1, float *w) { - const int flr = util_ifloor(s); - float u = frac(s); + int flr; + float u; + + s += (float)offset / size; + flr = util_ifloor(s); + u = frac(s); if (flr & 1) u = 1.0F - u; u = u * size - 0.5F; @@ -336,14 +354,12 @@ wrap_linear_mirror_repeat(float s, unsigned size, static void -wrap_linear_mirror_clamp(float s, unsigned size, +wrap_linear_mirror_clamp(float s, unsigned size, int offset, int *icoord0, int *icoord1, float *w) { - float u = fabsf(s); - if (u >= 1.0F) + float u = fabsf(s * size + offset); + if (u >= size) u = (float) size; - else - u *= size; u -= 0.5F; *icoord0 = util_ifloor(u); *icoord1 = *icoord0 + 1; @@ -352,14 +368,12 @@ wrap_linear_mirror_clamp(float s, unsigned size, static void -wrap_linear_mirror_clamp_to_edge(float s, unsigned size, +wrap_linear_mirror_clamp_to_edge(float s, unsigned size, int offset, int *icoord0, int *icoord1, float *w) { - float u = fabsf(s); - if (u >= 1.0F) + float u = fabsf(s * size + offset); + if (u >= size) u = (float) size; - else - u *= size; u -= 0.5F; *icoord0 = util_ifloor(u); *icoord1 = *icoord0 + 1; @@ -372,18 +386,16 @@ wrap_linear_mirror_clamp_to_edge(float s, unsigned size, static void -wrap_linear_mirror_clamp_to_border(float s, unsigned size, +wrap_linear_mirror_clamp_to_border(float s, unsigned size, int offset, int *icoord0, int *icoord1, float *w) { - const float min = -1.0F / (2.0F * size); - const float max = 1.0F - min; - float u = fabsf(s); + const float min = -0.5F; + const float max = size + 0.5F; + float u = fabsf(s * size + offset); if (u <= min) - u = min * size; + u = min; else if (u >= max) - u = max * size; - else - u *= size; + u = max; u -= 0.5F; *icoord0 = util_ifloor(u); *icoord1 = *icoord0 + 1; @@ -395,10 +407,10 @@ wrap_linear_mirror_clamp_to_border(float s, unsigned size, * 
PIPE_TEX_WRAP_CLAMP for nearest sampling, unnormalized coords. */ static void -wrap_nearest_unorm_clamp(float s, unsigned size, int *icoord) +wrap_nearest_unorm_clamp(float s, unsigned size, int offset, int *icoord) { int i = util_ifloor(s); - *icoord = CLAMP(i, 0, (int) size-1); + *icoord = CLAMP(i + offset, 0, (int) size-1); } @@ -406,9 +418,9 @@ wrap_nearest_unorm_clamp(float s, unsigned size, int *icoord) * PIPE_TEX_WRAP_CLAMP_TO_BORDER for nearest sampling, unnormalized coords. */ static void -wrap_nearest_unorm_clamp_to_border(float s, unsigned size, int *icoord) +wrap_nearest_unorm_clamp_to_border(float s, unsigned size, int offset, int *icoord) { - *icoord = util_ifloor( CLAMP(s, -0.5F, (float) size + 0.5F) ); + *icoord = util_ifloor( CLAMP(s + offset, -0.5F, (float) size + 0.5F) ); } @@ -416,9 +428,9 @@ wrap_nearest_unorm_clamp_to_border(float s, unsigned size, int *icoord) * PIPE_TEX_WRAP_CLAMP_TO_EDGE for nearest sampling, unnormalized coords. */ static void -wrap_nearest_unorm_clamp_to_edge(float s, unsigned size, int *icoord) +wrap_nearest_unorm_clamp_to_edge(float s, unsigned size, int offset, int *icoord) { - *icoord = util_ifloor( CLAMP(s, 0.5F, (float) size - 0.5F) ); + *icoord = util_ifloor( CLAMP(s + offset, 0.5F, (float) size - 0.5F) ); } @@ -426,11 +438,11 @@ wrap_nearest_unorm_clamp_to_edge(float s, unsigned size, int *icoord) * PIPE_TEX_WRAP_CLAMP for linear sampling, unnormalized coords. */ static void -wrap_linear_unorm_clamp(float s, unsigned size, +wrap_linear_unorm_clamp(float s, unsigned size, int offset, int *icoord0, int *icoord1, float *w) { /* Not exactly what the spec says, but it matches NVIDIA output */ - float u = CLAMP(s - 0.5F, 0.0f, (float) size - 1.0f); + float u = CLAMP(s + offset - 0.5F, 0.0f, (float) size - 1.0f); *icoord0 = util_ifloor(u); *icoord1 = *icoord0 + 1; *w = frac(u); @@ -441,10 +453,10 @@ wrap_linear_unorm_clamp(float s, unsigned size, * PIPE_TEX_WRAP_CLAMP_TO_BORDER for linear sampling, unnormalized coords. */ static void -wrap_linear_unorm_clamp_to_border(float s, unsigned size, +wrap_linear_unorm_clamp_to_border(float s, unsigned size, int offset, int *icoord0, int *icoord1, float *w) { - float u = CLAMP(s, -0.5F, (float) size + 0.5F); + float u = CLAMP(s + offset, -0.5F, (float) size + 0.5F); u -= 0.5F; *icoord0 = util_ifloor(u); *icoord1 = *icoord0 + 1; @@ -458,10 +470,10 @@ wrap_linear_unorm_clamp_to_border(float s, unsigned size, * PIPE_TEX_WRAP_CLAMP_TO_EDGE for linear sampling, unnormalized coords. */ static void -wrap_linear_unorm_clamp_to_edge(float s, unsigned size, +wrap_linear_unorm_clamp_to_edge(float s, unsigned size, int offset, int *icoord0, int *icoord1, float *w) { - float u = CLAMP(s, +0.5F, (float) size - 0.5F); + float u = CLAMP(s + offset, +0.5F, (float) size - 0.5F); u -= 0.5F; *icoord0 = util_ifloor(u); *icoord1 = *icoord0 + 1; @@ -474,11 +486,11 @@ wrap_linear_unorm_clamp_to_edge(float s, unsigned size, /** * Do coordinate to array index conversion. For array textures. 
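 * With this change the index clamps to the sampler view's [first_layer, last_layer] window rather than to [0, array_size - 1]; e.g. coord = 1.3 with first_layer = 4 and last_layer = 9 gives CLAMP(util_ifloor(1.3 + 0.5F), 4, 9) = 4.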
*/ -static INLINE void -wrap_array_layer(float coord, unsigned size, int *layer) +static INLINE int +coord_to_layer(float coord, unsigned first_layer, unsigned last_layer) { int c = util_ifloor(coord + 0.5F); - *layer = CLAMP(c, 0, (int) size - 1); + return CLAMP(c, (int)first_layer, (int)last_layer); } @@ -757,61 +769,6 @@ get_next_ycoord(unsigned face, unsigned fall_off_index, int max, int xc, int yc) } -static INLINE const float * -get_texel_cube_seamless(const struct sp_sampler_view *sp_sview, - union tex_tile_address addr, int x, int y, - float *corner) -{ - const struct pipe_resource *texture = sp_sview->base.texture; - unsigned level = addr.bits.level; - unsigned face = addr.bits.face; - int new_x, new_y, max_x; - - max_x = (int) u_minify(texture->width0, level); - - assert(texture->width0 == texture->height0); - new_x = x; - new_y = y; - - /* change the face */ - if (x < 0) { - /* - * Cheat with corners. They are difficult and I believe because we don't get - * per-pixel faces we can actually have multiple corner texels per pixel, - * which screws things up majorly in any case (as the per spec behavior is - * to average the 3 remaining texels, which we might not have). - * Hence just make sure that the 2nd coord is clamped, will simply pick the - * sample which would have fallen off the x coord, but not y coord. - * So the filter weight of the samples will be wrong, but at least this - * ensures that only valid texels near the corner are used. - */ - if (y < 0 || y >= max_x) { - y = CLAMP(y, 0, max_x - 1); - } - new_x = get_next_xcoord(face, 0, max_x -1, x, y); - new_y = get_next_ycoord(face, 0, max_x -1, x, y); - face = get_next_face(face, 0); - } else if (x >= max_x) { - if (y < 0 || y >= max_x) { - y = CLAMP(y, 0, max_x - 1); - } - new_x = get_next_xcoord(face, 1, max_x -1, x, y); - new_y = get_next_ycoord(face, 1, max_x -1, x, y); - face = get_next_face(face, 1); - } else if (y < 0) { - new_x = get_next_xcoord(face, 2, max_x -1, x, y); - new_y = get_next_ycoord(face, 2, max_x -1, x, y); - face = get_next_face(face, 2); - } else if (y >= max_x) { - new_x = get_next_xcoord(face, 3, max_x -1, x, y); - new_y = get_next_ycoord(face, 3, max_x -1, x, y); - face = get_next_face(face, 3); - } - - addr.bits.face = face; - return get_texel_2d_no_border( sp_sview, addr, new_x, new_y ); -} - /* Gather a quad of adjacent texels within a tile: */ static INLINE void @@ -948,6 +905,60 @@ get_texel_2d_array(const struct sp_sampler_view *sp_sview, } +static INLINE const float * +get_texel_cube_seamless(const struct sp_sampler_view *sp_sview, + union tex_tile_address addr, int x, int y, + float *corner, int layer, unsigned face) +{ + const struct pipe_resource *texture = sp_sview->base.texture; + unsigned level = addr.bits.level; + int new_x, new_y, max_x; + + max_x = (int) u_minify(texture->width0, level); + + assert(texture->width0 == texture->height0); + new_x = x; + new_y = y; + + /* change the face */ + if (x < 0) { + /* + * Cheat with corners. They are difficult and I believe because we don't get + * per-pixel faces we can actually have multiple corner texels per pixel, + * which screws things up majorly in any case (as the per spec behavior is + * to average the 3 remaining texels, which we might not have). + * Hence just make sure that the 2nd coord is clamped, will simply pick the + * sample which would have fallen off the x coord, but not y coord. + * So the filter weight of the samples will be wrong, but at least this + * ensures that only valid texels near the corner are used. 
+ */ + if (y < 0 || y >= max_x) { + y = CLAMP(y, 0, max_x - 1); + } + new_x = get_next_xcoord(face, 0, max_x -1, x, y); + new_y = get_next_ycoord(face, 0, max_x -1, x, y); + face = get_next_face(face, 0); + } else if (x >= max_x) { + if (y < 0 || y >= max_x) { + y = CLAMP(y, 0, max_x - 1); + } + new_x = get_next_xcoord(face, 1, max_x -1, x, y); + new_y = get_next_ycoord(face, 1, max_x -1, x, y); + face = get_next_face(face, 1); + } else if (y < 0) { + new_x = get_next_xcoord(face, 2, max_x -1, x, y); + new_y = get_next_ycoord(face, 2, max_x -1, x, y); + face = get_next_face(face, 2); + } else if (y >= max_x) { + new_x = get_next_xcoord(face, 3, max_x -1, x, y); + new_y = get_next_ycoord(face, 3, max_x -1, x, y); + face = get_next_face(face, 3); + } + + return get_texel_3d_no_border(sp_sview, addr, new_x, new_y, layer + face); +} + + /* Get texel pointer for cube array texture */ static INLINE const float * get_texel_cube_array(const struct sp_sampler_view *sp_sview, @@ -1008,22 +1019,18 @@ print_sample_4(const char *function, float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZ static INLINE void img_filter_2d_linear_repeat_POT(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, - float s, - float t, - float p, - unsigned level, - unsigned face_id, + const struct img_filter_args *args, float *rgba) { - unsigned xpot = pot_level_size(sp_sview->xpot, level); - unsigned ypot = pot_level_size(sp_sview->ypot, level); + unsigned xpot = pot_level_size(sp_sview->xpot, args->level); + unsigned ypot = pot_level_size(sp_sview->ypot, args->level); int xmax = (xpot - 1) & (TEX_TILE_SIZE - 1); /* MIN2(TEX_TILE_SIZE, xpot) - 1; */ int ymax = (ypot - 1) & (TEX_TILE_SIZE - 1); /* MIN2(TEX_TILE_SIZE, ypot) - 1; */ union tex_tile_address addr; int c; - float u = s * xpot - 0.5F; - float v = t * ypot - 0.5F; + float u = (args->s * xpot - 0.5F) + args->offset[0]; + float v = (args->t * ypot - 0.5F) + args->offset[1]; int uflr = util_ifloor(u); int vflr = util_ifloor(v); @@ -1037,7 +1044,7 @@ img_filter_2d_linear_repeat_POT(struct sp_sampler_view *sp_sview, const float *tx[4]; addr.value = 0; - addr.bits.level = level; + addr.bits.level = args->level; /* Can we fetch all four at once: */ @@ -1066,21 +1073,17 @@ img_filter_2d_linear_repeat_POT(struct sp_sampler_view *sp_sview, static INLINE void img_filter_2d_nearest_repeat_POT(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, - float s, - float t, - float p, - unsigned level, - unsigned face_id, + const struct img_filter_args *args, float rgba[TGSI_QUAD_SIZE]) { - unsigned xpot = pot_level_size(sp_sview->xpot, level); - unsigned ypot = pot_level_size(sp_sview->ypot, level); + unsigned xpot = pot_level_size(sp_sview->xpot, args->level); + unsigned ypot = pot_level_size(sp_sview->ypot, args->level); const float *out; union tex_tile_address addr; int c; - float u = s * xpot; - float v = t * ypot; + float u = args->s * xpot + args->offset[0]; + float v = args->t * ypot + args->offset[1]; int uflr = util_ifloor(u); int vflr = util_ifloor(v); @@ -1089,7 +1092,7 @@ img_filter_2d_nearest_repeat_POT(struct sp_sampler_view *sp_sview, int y0 = vflr & (ypot - 1); addr.value = 0; - addr.bits.level = level; + addr.bits.level = args->level; out = get_texel_2d_no_border(sp_sview, addr, x0, y0); for (c = 0; c < TGSI_QUAD_SIZE; c++) @@ -1104,26 +1107,22 @@ img_filter_2d_nearest_repeat_POT(struct sp_sampler_view *sp_sview, static INLINE void img_filter_2d_nearest_clamp_POT(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, - float s, - float t, - float p, 
- unsigned level, - unsigned face_id, + const struct img_filter_args *args, float rgba[TGSI_QUAD_SIZE]) { - unsigned xpot = pot_level_size(sp_sview->xpot, level); - unsigned ypot = pot_level_size(sp_sview->ypot, level); + unsigned xpot = pot_level_size(sp_sview->xpot, args->level); + unsigned ypot = pot_level_size(sp_sview->ypot, args->level); union tex_tile_address addr; int c; - float u = s * xpot; - float v = t * ypot; + float u = args->s * xpot + args->offset[0]; + float v = args->t * ypot + args->offset[1]; int x0, y0; const float *out; addr.value = 0; - addr.bits.level = level; + addr.bits.level = args->level; x0 = util_ifloor(u); if (x0 < 0) @@ -1150,11 +1149,7 @@ img_filter_2d_nearest_clamp_POT(struct sp_sampler_view *sp_sview, static void img_filter_1d_nearest(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, - float s, - float t, - float p, - unsigned level, - unsigned face_id, + const struct img_filter_args *args, float rgba[TGSI_QUAD_SIZE]) { const struct pipe_resource *texture = sp_sview->base.texture; @@ -1164,14 +1159,14 @@ img_filter_1d_nearest(struct sp_sampler_view *sp_sview, const float *out; int c; - width = u_minify(texture->width0, level); + width = u_minify(texture->width0, args->level); assert(width > 0); addr.value = 0; - addr.bits.level = level; + addr.bits.level = args->level; - sp_samp->nearest_texcoord_s(s, width, &x); + sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x); out = get_texel_2d(sp_sview, sp_samp, addr, x, 0); for (c = 0; c < TGSI_QUAD_SIZE; c++) @@ -1186,11 +1181,7 @@ img_filter_1d_nearest(struct sp_sampler_view *sp_sview, static void img_filter_1d_array_nearest(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, - float s, - float t, - float p, - unsigned level, - unsigned face_id, + const struct img_filter_args *args, float *rgba) { const struct pipe_resource *texture = sp_sview->base.texture; @@ -1200,15 +1191,16 @@ img_filter_1d_array_nearest(struct sp_sampler_view *sp_sview, const float *out; int c; - width = u_minify(texture->width0, level); + width = u_minify(texture->width0, args->level); assert(width > 0); addr.value = 0; - addr.bits.level = level; + addr.bits.level = args->level; - sp_samp->nearest_texcoord_s(s, width, &x); - wrap_array_layer(t, texture->array_size, &layer); + sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x); + layer = coord_to_layer(args->t, sp_sview->base.u.tex.first_layer, + sp_sview->base.u.tex.last_layer); out = get_texel_1d_array(sp_sview, sp_samp, addr, x, layer); for (c = 0; c < TGSI_QUAD_SIZE; c++) @@ -1223,11 +1215,7 @@ img_filter_1d_array_nearest(struct sp_sampler_view *sp_sview, static void img_filter_2d_nearest(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, - float s, - float t, - float p, - unsigned level, - unsigned face_id, + const struct img_filter_args *args, float *rgba) { const struct pipe_resource *texture = sp_sview->base.texture; @@ -1237,17 +1225,17 @@ img_filter_2d_nearest(struct sp_sampler_view *sp_sview, const float *out; int c; - width = u_minify(texture->width0, level); - height = u_minify(texture->height0, level); + width = u_minify(texture->width0, args->level); + height = u_minify(texture->height0, args->level); assert(width > 0); assert(height > 0); addr.value = 0; - addr.bits.level = level; + addr.bits.level = args->level; - sp_samp->nearest_texcoord_s(s, width, &x); - sp_samp->nearest_texcoord_t(t, height, &y); + sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x); + sp_samp->nearest_texcoord_t(args->t, 
height, args->offset[1], &y); out = get_texel_2d(sp_sview, sp_samp, addr, x, y); for (c = 0; c < TGSI_QUAD_SIZE; c++) @@ -1262,11 +1250,7 @@ img_filter_2d_nearest(struct sp_sampler_view *sp_sview, static void img_filter_2d_array_nearest(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, - float s, - float t, - float p, - unsigned level, - unsigned face_id, + const struct img_filter_args *args, float *rgba) { const struct pipe_resource *texture = sp_sview->base.texture; @@ -1276,18 +1260,19 @@ img_filter_2d_array_nearest(struct sp_sampler_view *sp_sview, const float *out; int c; - width = u_minify(texture->width0, level); - height = u_minify(texture->height0, level); + width = u_minify(texture->width0, args->level); + height = u_minify(texture->height0, args->level); assert(width > 0); assert(height > 0); addr.value = 0; - addr.bits.level = level; + addr.bits.level = args->level; - sp_samp->nearest_texcoord_s(s, width, &x); - sp_samp->nearest_texcoord_t(t, height, &y); - wrap_array_layer(p, texture->array_size, &layer); + sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x); + sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y); + layer = coord_to_layer(args->p, sp_sview->base.u.tex.first_layer, + sp_sview->base.u.tex.last_layer); out = get_texel_2d_array(sp_sview, sp_samp, addr, x, y, layer); for (c = 0; c < TGSI_QUAD_SIZE; c++) @@ -1299,54 +1284,43 @@ img_filter_2d_array_nearest(struct sp_sampler_view *sp_sview, } -static INLINE union tex_tile_address -face(union tex_tile_address addr, unsigned face ) -{ - addr.bits.face = face; - return addr; -} - - static void img_filter_cube_nearest(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, - float s, - float t, - float p, - unsigned level, - unsigned face_id, + const struct img_filter_args *args, float *rgba) { const struct pipe_resource *texture = sp_sview->base.texture; int width, height; - int x, y; + int x, y, layerface; union tex_tile_address addr; const float *out; int c; - width = u_minify(texture->width0, level); - height = u_minify(texture->height0, level); + width = u_minify(texture->width0, args->level); + height = u_minify(texture->height0, args->level); assert(width > 0); assert(height > 0); addr.value = 0; - addr.bits.level = level; + addr.bits.level = args->level; /* * If NEAREST filtering is done within a miplevel, always apply wrap * mode CLAMP_TO_EDGE. 
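 * (ARB_seamless_cube_map only alters LINEAR filtering; for NEAREST the per-spec behaviour reduces to CLAMP_TO_EDGE on the already-selected face, so no cross-face fetch is needed here.)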
*/ if (sp_samp->base.seamless_cube_map) { - wrap_nearest_clamp_to_edge(s, width, &x); - wrap_nearest_clamp_to_edge(t, height, &y); + wrap_nearest_clamp_to_edge(args->s, width, args->offset[0], &x); + wrap_nearest_clamp_to_edge(args->t, height, args->offset[1], &y); } else { /* Would probably make sense to ignore mode and just do edge clamp */ - sp_samp->nearest_texcoord_s(s, width, &x); - sp_samp->nearest_texcoord_t(t, height, &y); + sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x); + sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y); } - out = get_texel_2d(sp_sview, sp_samp, face(addr, face_id), x, y); + layerface = args->face_id + sp_sview->base.u.tex.first_layer; + out = get_texel_cube_array(sp_sview, sp_samp, addr, x, y, layerface); for (c = 0; c < TGSI_QUAD_SIZE; c++) rgba[TGSI_NUM_CHANNELS*c] = out[c]; @@ -1358,34 +1332,32 @@ img_filter_cube_nearest(struct sp_sampler_view *sp_sview, static void img_filter_cube_array_nearest(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, - float s, - float t, - float p, - unsigned level, - unsigned face_id, + const struct img_filter_args *args, float *rgba) { const struct pipe_resource *texture = sp_sview->base.texture; int width, height; - int x, y, layer; + int x, y, layerface; union tex_tile_address addr; const float *out; int c; - width = u_minify(texture->width0, level); - height = u_minify(texture->height0, level); + width = u_minify(texture->width0, args->level); + height = u_minify(texture->height0, args->level); assert(width > 0); assert(height > 0); addr.value = 0; - addr.bits.level = level; + addr.bits.level = args->level; - sp_samp->nearest_texcoord_s(s, width, &x); - sp_samp->nearest_texcoord_t(t, height, &y); - wrap_array_layer(p, texture->array_size, &layer); + sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x); + sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y); + layerface = coord_to_layer(6 * args->p + sp_sview->base.u.tex.first_layer, + sp_sview->base.u.tex.first_layer, + sp_sview->base.u.tex.last_layer - 5) + args->face_id; - out = get_texel_cube_array(sp_sview, sp_samp, addr, x, y, layer * 6 + face_id); + out = get_texel_cube_array(sp_sview, sp_samp, addr, x, y, layerface); for (c = 0; c < TGSI_QUAD_SIZE; c++) rgba[TGSI_NUM_CHANNELS*c] = out[c]; @@ -1397,11 +1369,7 @@ img_filter_cube_array_nearest(struct sp_sampler_view *sp_sview, static void img_filter_3d_nearest(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, - float s, - float t, - float p, - unsigned level, - unsigned face_id, + const struct img_filter_args *args, float *rgba) { const struct pipe_resource *texture = sp_sview->base.texture; @@ -1411,20 +1379,20 @@ img_filter_3d_nearest(struct sp_sampler_view *sp_sview, const float *out; int c; - width = u_minify(texture->width0, level); - height = u_minify(texture->height0, level); - depth = u_minify(texture->depth0, level); + width = u_minify(texture->width0, args->level); + height = u_minify(texture->height0, args->level); + depth = u_minify(texture->depth0, args->level); assert(width > 0); assert(height > 0); assert(depth > 0); - sp_samp->nearest_texcoord_s(s, width, &x); - sp_samp->nearest_texcoord_t(t, height, &y); - sp_samp->nearest_texcoord_p(p, depth, &z); + sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x); + sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y); + sp_samp->nearest_texcoord_p(args->p, depth, args->offset[2], &z); addr.value = 0; - addr.bits.level = level; + addr.bits.level = 
args->level; out = get_texel_3d(sp_sview, sp_samp, addr, x, y, z); for (c = 0; c < TGSI_QUAD_SIZE; c++) @@ -1435,11 +1403,7 @@ img_filter_3d_nearest(struct sp_sampler_view *sp_sview, static void img_filter_1d_linear(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, - float s, - float t, - float p, - unsigned level, - unsigned face_id, + const struct img_filter_args *args, float *rgba) { const struct pipe_resource *texture = sp_sview->base.texture; @@ -1450,14 +1414,14 @@ img_filter_1d_linear(struct sp_sampler_view *sp_sview, const float *tx0, *tx1; int c; - width = u_minify(texture->width0, level); + width = u_minify(texture->width0, args->level); assert(width > 0); addr.value = 0; - addr.bits.level = level; + addr.bits.level = args->level; - sp_samp->linear_texcoord_s(s, width, &x0, &x1, &xw); + sp_samp->linear_texcoord_s(args->s, width, args->offset[0], &x0, &x1, &xw); tx0 = get_texel_2d(sp_sview, sp_samp, addr, x0, 0); tx1 = get_texel_2d(sp_sview, sp_samp, addr, x1, 0); @@ -1471,11 +1435,7 @@ img_filter_1d_linear(struct sp_sampler_view *sp_sview, static void img_filter_1d_array_linear(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, - float s, - float t, - float p, - unsigned level, - unsigned face_id, + const struct img_filter_args *args, float *rgba) { const struct pipe_resource *texture = sp_sview->base.texture; @@ -1486,15 +1446,16 @@ img_filter_1d_array_linear(struct sp_sampler_view *sp_sview, const float *tx0, *tx1; int c; - width = u_minify(texture->width0, level); + width = u_minify(texture->width0, args->level); assert(width > 0); addr.value = 0; - addr.bits.level = level; + addr.bits.level = args->level; - sp_samp->linear_texcoord_s(s, width, &x0, &x1, &xw); - wrap_array_layer(t, texture->array_size, &layer); + sp_samp->linear_texcoord_s(args->s, width, args->offset[0], &x0, &x1, &xw); + layer = coord_to_layer(args->t, sp_sview->base.u.tex.first_layer, + sp_sview->base.u.tex.last_layer); tx0 = get_texel_1d_array(sp_sview, sp_samp, addr, x0, layer); tx1 = get_texel_1d_array(sp_sview, sp_samp, addr, x1, layer); @@ -1504,15 +1465,77 @@ img_filter_1d_array_linear(struct sp_sampler_view *sp_sview, rgba[TGSI_NUM_CHANNELS*c] = lerp(xw, tx0[c], tx1[c]); } +/* + * Retrieve the gathered value; we need to convert it to the + * interface TGSI expects, and take component select + * and swizzling into account. + */ +static float +get_gather_value(const struct sp_sampler_view *sp_sview, + int chan_in, int comp_sel, + const float *tx[4]) +{ + int chan; + unsigned swizzle; + + /* + * softpipe samples in a different order + * than TGSI expects, so we need to swizzle + * the samples into the correct slots.
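 * The texels in tx[] arrive as {(x0,y0), (x1,y0), (x0,y1), (x1,y1)} from the bilinear fetch, while gather results are returned in (i0,j1), (i1,j1), (i1,j0), (i0,j0) order, hence the 0 -> 2, 1 -> 3, 2 -> 1, 3 -> 0 remap below.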
+ */ + switch (chan_in) { + case 0: + chan = 2; + break; + case 1: + chan = 3; + break; + case 2: + chan = 1; + break; + case 3: + chan = 0; + break; + default: + assert(0); + return 0.0; + } + + /* pick which component to use for the swizzle */ + switch (comp_sel) { + case 0: + swizzle = sp_sview->base.swizzle_r; + break; + case 1: + swizzle = sp_sview->base.swizzle_g; + break; + case 2: + swizzle = sp_sview->base.swizzle_b; + break; + case 3: + swizzle = sp_sview->base.swizzle_a; + break; + default: + assert(0); + return 0.0; + } + + /* get correct result using the channel and swizzle */ + switch (swizzle) { + case PIPE_SWIZZLE_ZERO: + return 0.0; + case PIPE_SWIZZLE_ONE: + return 1.0; + default: + return tx[chan][swizzle]; + } +} + static void img_filter_2d_linear(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, - float s, - float t, - float p, - unsigned level, - unsigned face_id, + const struct img_filter_args *args, float *rgba) { const struct pipe_resource *texture = sp_sview->base.texture; @@ -1520,42 +1543,45 @@ img_filter_2d_linear(struct sp_sampler_view *sp_sview, int x0, y0, x1, y1; float xw, yw; /* weights */ union tex_tile_address addr; - const float *tx0, *tx1, *tx2, *tx3; + const float *tx[4]; int c; - width = u_minify(texture->width0, level); - height = u_minify(texture->height0, level); + width = u_minify(texture->width0, args->level); + height = u_minify(texture->height0, args->level); assert(width > 0); assert(height > 0); addr.value = 0; - addr.bits.level = level; + addr.bits.level = args->level; - sp_samp->linear_texcoord_s(s, width, &x0, &x1, &xw); - sp_samp->linear_texcoord_t(t, height, &y0, &y1, &yw); + sp_samp->linear_texcoord_s(args->s, width, args->offset[0], &x0, &x1, &xw); + sp_samp->linear_texcoord_t(args->t, height, args->offset[1], &y0, &y1, &yw); - tx0 = get_texel_2d(sp_sview, sp_samp, addr, x0, y0); - tx1 = get_texel_2d(sp_sview, sp_samp, addr, x1, y0); - tx2 = get_texel_2d(sp_sview, sp_samp, addr, x0, y1); - tx3 = get_texel_2d(sp_sview, sp_samp, addr, x1, y1); + tx[0] = get_texel_2d(sp_sview, sp_samp, addr, x0, y0); + tx[1] = get_texel_2d(sp_sview, sp_samp, addr, x1, y0); + tx[2] = get_texel_2d(sp_sview, sp_samp, addr, x0, y1); + tx[3] = get_texel_2d(sp_sview, sp_samp, addr, x1, y1); - /* interpolate R, G, B, A */ - for (c = 0; c < TGSI_QUAD_SIZE; c++) - rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw, - tx0[c], tx1[c], - tx2[c], tx3[c]); + if (args->gather_only) { + for (c = 0; c < TGSI_QUAD_SIZE; c++) + rgba[TGSI_NUM_CHANNELS*c] = get_gather_value(sp_sview, c, + args->gather_comp, + tx); + } else { + /* interpolate R, G, B, A */ + for (c = 0; c < TGSI_QUAD_SIZE; c++) + rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw, + tx[0][c], tx[1][c], + tx[2][c], tx[3][c]); + } } static void img_filter_2d_array_linear(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, - float s, - float t, - float p, - unsigned level, - unsigned face_id, + const struct img_filter_args *args, float *rgba) { const struct pipe_resource *texture = sp_sview->base.texture; @@ -1563,63 +1589,67 @@ img_filter_2d_array_linear(struct sp_sampler_view *sp_sview, int x0, y0, x1, y1, layer; float xw, yw; /* weights */ union tex_tile_address addr; - const float *tx0, *tx1, *tx2, *tx3; + const float *tx[4]; int c; - width = u_minify(texture->width0, level); - height = u_minify(texture->height0, level); + width = u_minify(texture->width0, args->level); + height = u_minify(texture->height0, args->level); assert(width > 0); assert(height > 0); addr.value = 0; - addr.bits.level = level; 
- - sp_samp->linear_texcoord_s(s, width, &x0, &x1, &xw); - sp_samp->linear_texcoord_t(t, height, &y0, &y1, &yw); - wrap_array_layer(p, texture->array_size, &layer); - - tx0 = get_texel_2d_array(sp_sview, sp_samp, addr, x0, y0, layer); - tx1 = get_texel_2d_array(sp_sview, sp_samp, addr, x1, y0, layer); - tx2 = get_texel_2d_array(sp_sview, sp_samp, addr, x0, y1, layer); - tx3 = get_texel_2d_array(sp_sview, sp_samp, addr, x1, y1, layer); - - /* interpolate R, G, B, A */ - for (c = 0; c < TGSI_QUAD_SIZE; c++) - rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw, - tx0[c], tx1[c], - tx2[c], tx3[c]); + addr.bits.level = args->level; + + sp_samp->linear_texcoord_s(args->s, width, args->offset[0], &x0, &x1, &xw); + sp_samp->linear_texcoord_t(args->t, height, args->offset[1], &y0, &y1, &yw); + layer = coord_to_layer(args->p, sp_sview->base.u.tex.first_layer, + sp_sview->base.u.tex.last_layer); + + tx[0] = get_texel_2d_array(sp_sview, sp_samp, addr, x0, y0, layer); + tx[1] = get_texel_2d_array(sp_sview, sp_samp, addr, x1, y0, layer); + tx[2] = get_texel_2d_array(sp_sview, sp_samp, addr, x0, y1, layer); + tx[3] = get_texel_2d_array(sp_sview, sp_samp, addr, x1, y1, layer); + + if (args->gather_only) { + for (c = 0; c < TGSI_QUAD_SIZE; c++) + rgba[TGSI_NUM_CHANNELS*c] = get_gather_value(sp_sview, c, + args->gather_comp, + tx); + } else { + /* interpolate R, G, B, A */ + for (c = 0; c < TGSI_QUAD_SIZE; c++) + rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw, + tx[0][c], tx[1][c], + tx[2][c], tx[3][c]); + } } static void img_filter_cube_linear(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, - float s, - float t, - float p, - unsigned level, - unsigned face_id, + const struct img_filter_args *args, float *rgba) { const struct pipe_resource *texture = sp_sview->base.texture; int width, height; - int x0, y0, x1, y1; + int x0, y0, x1, y1, layer; float xw, yw; /* weights */ - union tex_tile_address addr, addrj; - const float *tx0, *tx1, *tx2, *tx3; + union tex_tile_address addr; + const float *tx[4]; float corner0[TGSI_QUAD_SIZE], corner1[TGSI_QUAD_SIZE], corner2[TGSI_QUAD_SIZE], corner3[TGSI_QUAD_SIZE]; int c; - width = u_minify(texture->width0, level); - height = u_minify(texture->height0, level); + width = u_minify(texture->width0, args->level); + height = u_minify(texture->height0, args->level); assert(width > 0); assert(height > 0); addr.value = 0; - addr.bits.level = level; + addr.bits.level = args->level; /* * For seamless if LINEAR filtering is done within a miplevel, @@ -1627,43 +1657,47 @@ img_filter_cube_linear(struct sp_sampler_view *sp_sview, */ if (sp_samp->base.seamless_cube_map) { /* Note this is a bit overkill, actual clamping is not required */ - wrap_linear_clamp_to_border(s, width, &x0, &x1, &xw); - wrap_linear_clamp_to_border(t, height, &y0, &y1, &yw); + wrap_linear_clamp_to_border(args->s, width, args->offset[0], &x0, &x1, &xw); + wrap_linear_clamp_to_border(args->t, height, args->offset[1], &y0, &y1, &yw); } else { /* Would probably make sense to ignore mode and just do edge clamp */ - sp_samp->linear_texcoord_s(s, width, &x0, &x1, &xw); - sp_samp->linear_texcoord_t(t, height, &y0, &y1, &yw); + sp_samp->linear_texcoord_s(args->s, width, args->offset[0], &x0, &x1, &xw); + sp_samp->linear_texcoord_t(args->t, height, args->offset[1], &y0, &y1, &yw); } - addrj = face(addr, face_id); + layer = sp_sview->base.u.tex.first_layer; if (sp_samp->base.seamless_cube_map) { - tx0 = get_texel_cube_seamless(sp_sview, addrj, x0, y0, corner0); - tx1 = get_texel_cube_seamless(sp_sview, addrj, x1, y0, 
corner1); - tx2 = get_texel_cube_seamless(sp_sview, addrj, x0, y1, corner2); - tx3 = get_texel_cube_seamless(sp_sview, addrj, x1, y1, corner3); + tx[0] = get_texel_cube_seamless(sp_sview, addr, x0, y0, corner0, layer, args->face_id); + tx[1] = get_texel_cube_seamless(sp_sview, addr, x1, y0, corner1, layer, args->face_id); + tx[2] = get_texel_cube_seamless(sp_sview, addr, x0, y1, corner2, layer, args->face_id); + tx[3] = get_texel_cube_seamless(sp_sview, addr, x1, y1, corner3, layer, args->face_id); } else { - tx0 = get_texel_2d(sp_sview, sp_samp, addrj, x0, y0); - tx1 = get_texel_2d(sp_sview, sp_samp, addrj, x1, y0); - tx2 = get_texel_2d(sp_sview, sp_samp, addrj, x0, y1); - tx3 = get_texel_2d(sp_sview, sp_samp, addrj, x1, y1); + tx[0] = get_texel_cube_array(sp_sview, sp_samp, addr, x0, y0, layer + args->face_id); + tx[1] = get_texel_cube_array(sp_sview, sp_samp, addr, x1, y0, layer + args->face_id); + tx[2] = get_texel_cube_array(sp_sview, sp_samp, addr, x0, y1, layer + args->face_id); + tx[3] = get_texel_cube_array(sp_sview, sp_samp, addr, x1, y1, layer + args->face_id); + } + + if (args->gather_only) { + for (c = 0; c < TGSI_QUAD_SIZE; c++) + rgba[TGSI_NUM_CHANNELS*c] = get_gather_value(sp_sview, c, + args->gather_comp, + tx); + } else { + /* interpolate R, G, B, A */ + for (c = 0; c < TGSI_QUAD_SIZE; c++) + rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw, + tx[0][c], tx[1][c], + tx[2][c], tx[3][c]); } - /* interpolate R, G, B, A */ - for (c = 0; c < TGSI_QUAD_SIZE; c++) - rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw, - tx0[c], tx1[c], - tx2[c], tx3[c]); } static void img_filter_cube_array_linear(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, - float s, - float t, - float p, - unsigned level, - unsigned face_id, + const struct img_filter_args *args, float *rgba) { const struct pipe_resource *texture = sp_sview->base.texture; @@ -1671,42 +1705,68 @@ img_filter_cube_array_linear(struct sp_sampler_view *sp_sview, int x0, y0, x1, y1, layer; float xw, yw; /* weights */ union tex_tile_address addr; - const float *tx0, *tx1, *tx2, *tx3; + const float *tx[4]; + float corner0[TGSI_QUAD_SIZE], corner1[TGSI_QUAD_SIZE], + corner2[TGSI_QUAD_SIZE], corner3[TGSI_QUAD_SIZE]; int c; - width = u_minify(texture->width0, level); - height = u_minify(texture->height0, level); + width = u_minify(texture->width0, args->level); + height = u_minify(texture->height0, args->level); assert(width > 0); assert(height > 0); addr.value = 0; - addr.bits.level = level; + addr.bits.level = args->level; - sp_samp->linear_texcoord_s(s, width, &x0, &x1, &xw); - sp_samp->linear_texcoord_t(t, height, &y0, &y1, &yw); - wrap_array_layer(p, texture->array_size, &layer); + /* + * For seamless if LINEAR filtering is done within a miplevel, + * always apply wrap mode CLAMP_TO_BORDER. 
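 * Border clamp deliberately lets a coordinate land one texel outside the face, so that get_texel_cube_seamless can detect the fall-off and fetch from the adjacent face instead.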
+ */ + if (sp_samp->base.seamless_cube_map) { + /* Note this is a bit overkill, actual clamping is not required */ + wrap_linear_clamp_to_border(args->s, width, args->offset[0], &x0, &x1, &xw); + wrap_linear_clamp_to_border(args->t, height, args->offset[1], &y0, &y1, &yw); + } else { + /* Would probably make sense to ignore mode and just do edge clamp */ + sp_samp->linear_texcoord_s(args->s, width, args->offset[0], &x0, &x1, &xw); + sp_samp->linear_texcoord_t(args->t, height, args->offset[1], &y0, &y1, &yw); + } - tx0 = get_texel_cube_array(sp_sview, sp_samp, addr, x0, y0, layer * 6 + face_id); - tx1 = get_texel_cube_array(sp_sview, sp_samp, addr, x1, y0, layer * 6 + face_id); - tx2 = get_texel_cube_array(sp_sview, sp_samp, addr, x0, y1, layer * 6 + face_id); - tx3 = get_texel_cube_array(sp_sview, sp_samp, addr, x1, y1, layer * 6 + face_id); + layer = coord_to_layer(6 * args->p + sp_sview->base.u.tex.first_layer, + sp_sview->base.u.tex.first_layer, + sp_sview->base.u.tex.last_layer - 5); - /* interpolate R, G, B, A */ - for (c = 0; c < TGSI_QUAD_SIZE; c++) - rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw, - tx0[c], tx1[c], - tx2[c], tx3[c]); + if (sp_samp->base.seamless_cube_map) { + tx[0] = get_texel_cube_seamless(sp_sview, addr, x0, y0, corner0, layer, args->face_id); + tx[1] = get_texel_cube_seamless(sp_sview, addr, x1, y0, corner1, layer, args->face_id); + tx[2] = get_texel_cube_seamless(sp_sview, addr, x0, y1, corner2, layer, args->face_id); + tx[3] = get_texel_cube_seamless(sp_sview, addr, x1, y1, corner3, layer, args->face_id); + } else { + tx[0] = get_texel_cube_array(sp_sview, sp_samp, addr, x0, y0, layer + args->face_id); + tx[1] = get_texel_cube_array(sp_sview, sp_samp, addr, x1, y0, layer + args->face_id); + tx[2] = get_texel_cube_array(sp_sview, sp_samp, addr, x0, y1, layer + args->face_id); + tx[3] = get_texel_cube_array(sp_sview, sp_samp, addr, x1, y1, layer + args->face_id); + } + + if (args->gather_only) { + for (c = 0; c < TGSI_QUAD_SIZE; c++) + rgba[TGSI_NUM_CHANNELS*c] = get_gather_value(sp_sview, c, + args->gather_comp, + tx); + } else { + /* interpolate R, G, B, A */ + for (c = 0; c < TGSI_QUAD_SIZE; c++) + rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw, + tx[0][c], tx[1][c], + tx[2][c], tx[3][c]); + } } static void img_filter_3d_linear(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, - float s, - float t, - float p, - unsigned level, - unsigned face_id, + const struct img_filter_args *args, float *rgba) { const struct pipe_resource *texture = sp_sview->base.texture; @@ -1717,21 +1777,20 @@ img_filter_3d_linear(struct sp_sampler_view *sp_sview, const float *tx00, *tx01, *tx02, *tx03, *tx10, *tx11, *tx12, *tx13; int c; - width = u_minify(texture->width0, level); - height = u_minify(texture->height0, level); - depth = u_minify(texture->depth0, level); + width = u_minify(texture->width0, args->level); + height = u_minify(texture->height0, args->level); + depth = u_minify(texture->depth0, args->level); addr.value = 0; - addr.bits.level = level; + addr.bits.level = args->level; assert(width > 0); assert(height > 0); assert(depth > 0); - sp_samp->linear_texcoord_s(s, width, &x0, &x1, &xw); - sp_samp->linear_texcoord_t(t, height, &y0, &y1, &yw); - sp_samp->linear_texcoord_p(p, depth, &z0, &z1, &zw); - + sp_samp->linear_texcoord_s(args->s, width, args->offset[0], &x0, &x1, &xw); + sp_samp->linear_texcoord_t(args->t, height, args->offset[1], &y0, &y1, &yw); + sp_samp->linear_texcoord_p(args->p, depth, args->offset[2], &z0, &z1, &zw); tx00 = get_texel_3d(sp_sview, sp_samp, 
addr, x0, y0, z0); tx01 = get_texel_3d(sp_sview, sp_samp, addr, x1, y0, z0); @@ -1837,6 +1896,7 @@ compute_lambda_lod(struct sp_sampler_view *sp_sview, } break; case tgsi_sampler_lod_zero: + case tgsi_sampler_gather: /* this is all static state in the sampler really need clamp here? */ lod[0] = lod[1] = lod[2] = lod[3] = CLAMP(lod_bias, min_lod, max_lod); break; @@ -1846,6 +1906,12 @@ compute_lambda_lod(struct sp_sampler_view *sp_sview, } } +static INLINE unsigned +get_gather_component(const float lod_in[TGSI_QUAD_SIZE]) +{ + /* gather component is stored in lod_in slot as unsigned */ + return (*(unsigned int *)lod_in) & 0x3; +} static void mip_filter_linear(struct sp_sampler_view *sp_sview, @@ -1857,36 +1923,45 @@ mip_filter_linear(struct sp_sampler_view *sp_sview, const float p[TGSI_QUAD_SIZE], const float c0[TGSI_QUAD_SIZE], const float lod_in[TGSI_QUAD_SIZE], - enum tgsi_sampler_control control, + const struct filter_args *filt_args, float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]) { const struct pipe_sampler_view *psview = &sp_sview->base; int j; float lod[TGSI_QUAD_SIZE]; + struct img_filter_args args; - compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, control, lod); + compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, filt_args->control, lod); + + args.offset = filt_args->offset; + args.gather_only = filt_args->control == tgsi_sampler_gather; + args.gather_comp = get_gather_component(lod_in); for (j = 0; j < TGSI_QUAD_SIZE; j++) { int level0 = psview->u.tex.first_level + (int)lod[j]; - if (lod[j] < 0.0) - mag_filter(sp_sview, sp_samp, s[j], t[j], p[j], - psview->u.tex.first_level, - sp_sview->faces[j], &rgba[0][j]); - - else if (level0 >= (int) psview->u.tex.last_level) - min_filter(sp_sview, sp_samp, s[j], t[j], p[j], psview->u.tex.last_level, - sp_sview->faces[j], &rgba[0][j]); + args.s = s[j]; + args.t = t[j]; + args.p = p[j]; + args.face_id = sp_sview->faces[j]; + if (lod[j] < 0.0) { + args.level = psview->u.tex.first_level; + mag_filter(sp_sview, sp_samp, &args, &rgba[0][j]); + } + else if (level0 >= (int) psview->u.tex.last_level) { + args.level = psview->u.tex.last_level; + min_filter(sp_sview, sp_samp, &args, &rgba[0][j]); + } else { float levelBlend = frac(lod[j]); float rgbax[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]; int c; - min_filter(sp_sview, sp_samp, s[j], t[j], p[j], level0, - sp_sview->faces[j], &rgbax[0][0]); - min_filter(sp_sview, sp_samp, s[j], t[j], p[j], level0+1, - sp_sview->faces[j], &rgbax[0][1]); + args.level = level0; + min_filter(sp_sview, sp_samp, &args, &rgbax[0][0]); + args.level = level0+1; + min_filter(sp_sview, sp_samp, &args, &rgbax[0][1]); for (c = 0; c < 4; c++) { rgba[c][j] = lerp(levelBlend, rgbax[c][0], rgbax[c][1]); @@ -1915,25 +1990,33 @@ mip_filter_nearest(struct sp_sampler_view *sp_sview, const float p[TGSI_QUAD_SIZE], const float c0[TGSI_QUAD_SIZE], const float lod_in[TGSI_QUAD_SIZE], - enum tgsi_sampler_control control, + const struct filter_args *filt_args, float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]) { const struct pipe_sampler_view *psview = &sp_sview->base; float lod[TGSI_QUAD_SIZE]; int j; + struct img_filter_args args; + + args.offset = filt_args->offset; + args.gather_only = filt_args->control == tgsi_sampler_gather; + args.gather_comp = get_gather_component(lod_in); - compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, control, lod); + compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, filt_args->control, lod); for (j = 0; j < TGSI_QUAD_SIZE; j++) { - if (lod[j] < 0.0) - mag_filter(sp_sview, sp_samp, s[j], t[j], p[j], 
- psview->u.tex.first_level, - sp_sview->faces[j], &rgba[0][j]); - else { + args.s = s[j]; + args.t = t[j]; + args.p = p[j]; + args.face_id = sp_sview->faces[j]; + + if (lod[j] < 0.0) { + args.level = psview->u.tex.first_level; + mag_filter(sp_sview, sp_samp, &args, &rgba[0][j]); + } else { int level = psview->u.tex.first_level + (int)(lod[j] + 0.5F); - level = MIN2(level, (int)psview->u.tex.last_level); - min_filter(sp_sview, sp_samp, s[j], t[j], p[j], - level, sp_sview->faces[j], &rgba[0][j]); + args.level = MIN2(level, (int)psview->u.tex.last_level); + min_filter(sp_sview, sp_samp, &args, &rgba[0][j]); } } @@ -1953,24 +2036,29 @@ mip_filter_none(struct sp_sampler_view *sp_sview, const float p[TGSI_QUAD_SIZE], const float c0[TGSI_QUAD_SIZE], const float lod_in[TGSI_QUAD_SIZE], - enum tgsi_sampler_control control, + const struct filter_args *filt_args, float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]) { float lod[TGSI_QUAD_SIZE]; int j; + struct img_filter_args args; + + args.level = sp_sview->base.u.tex.first_level; + args.offset = filt_args->offset; + args.gather_only = filt_args->control == tgsi_sampler_gather; - compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, control, lod); + compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, filt_args->control, lod); for (j = 0; j < TGSI_QUAD_SIZE; j++) { - if (lod[j] < 0.0) { - mag_filter(sp_sview, sp_samp, s[j], t[j], p[j], - sp_sview->base.u.tex.first_level, - sp_sview->faces[j], &rgba[0][j]); + args.s = s[j]; + args.t = t[j]; + args.p = p[j]; + args.face_id = sp_sview->faces[j]; + if (lod[j] < 0.0) { + mag_filter(sp_sview, sp_samp, &args, &rgba[0][j]); } else { - min_filter(sp_sview, sp_samp, s[j], t[j], p[j], - sp_sview->base.u.tex.first_level, - sp_sview->faces[j], &rgba[0][j]); + min_filter(sp_sview, sp_samp, &args, &rgba[0][j]); } } } @@ -1986,15 +2074,21 @@ mip_filter_none_no_filter_select(struct sp_sampler_view *sp_sview, const float p[TGSI_QUAD_SIZE], const float c0[TGSI_QUAD_SIZE], const float lod_in[TGSI_QUAD_SIZE], - enum tgsi_sampler_control control, + const struct filter_args *filt_args, float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]) { int j; - - for (j = 0; j < TGSI_QUAD_SIZE; j++) - mag_filter(sp_sview, sp_samp, s[j], t[j], p[j], - sp_sview->base.u.tex.first_level, - sp_sview->faces[j], &rgba[0][j]); + struct img_filter_args args; + args.level = sp_sview->base.u.tex.first_level; + args.offset = filt_args->offset; + args.gather_only = filt_args->control == tgsi_sampler_gather; + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + args.s = s[j]; + args.t = t[j]; + args.p = p[j]; + args.face_id = sp_sview->faces[j]; + mag_filter(sp_sview, sp_samp, &args, &rgba[0][j]); + } } @@ -2050,7 +2144,7 @@ img_filter_2d_ewa(struct sp_sampler_view *sp_sview, float scaling = 1.0f / (1 << level0); int width = u_minify(texture->width0, level0); int height = u_minify(texture->height0, level0); - + struct img_filter_args args; float ux = dudx * scaling; float vx = dvdx * scaling; float uy = dudy * scaling; @@ -2100,7 +2194,8 @@ img_filter_2d_ewa(struct sp_sampler_view *sp_sview, * full, then the pixel values are read from the image. */ ddq = 2 * A; - + + args.level = level; for (j = 0; j < TGSI_QUAD_SIZE; j++) { /* Heckbert MS thesis, p. 
59; scan over the bounding box of the ellipse * and incrementally update the value of Ax^2+Bxy*Cy^2; when this @@ -2117,6 +2212,8 @@ img_filter_2d_ewa(struct sp_sampler_view *sp_sview, float num[4] = {0.0F, 0.0F, 0.0F, 0.0F}; buffer_next = 0; den = 0; + args.face_id = sp_sview->faces[j]; + U = u0 - tex_u; for (v = v0; v <= v1; ++v) { float V = v - tex_v; @@ -2148,8 +2245,10 @@ img_filter_2d_ewa(struct sp_sampler_view *sp_sview, * accelerated img_filter_2d_nearest_XXX functions. */ for (jj = 0; jj < buffer_next; jj++) { - min_filter(sp_sview, sp_samp, s_buffer[jj], t_buffer[jj], p[jj], - level, sp_sview->faces[j], &rgba_temp[0][jj]); + args.s = s_buffer[jj]; + args.t = t_buffer[jj]; + args.p = p[jj]; + min_filter(sp_sview, sp_samp, &args, &rgba_temp[0][jj]); num[0] += weight_buffer[jj] * rgba_temp[0][jj]; num[1] += weight_buffer[jj] * rgba_temp[1][jj]; num[2] += weight_buffer[jj] * rgba_temp[2][jj]; @@ -2176,8 +2275,10 @@ img_filter_2d_ewa(struct sp_sampler_view *sp_sview, * accelerated img_filter_2d_nearest_XXX functions. */ for (jj = 0; jj < buffer_next; jj++) { - min_filter(sp_sview, sp_samp, s_buffer[jj], t_buffer[jj], p[jj], - level, sp_sview->faces[j], &rgba_temp[0][jj]); + args.s = s_buffer[jj]; + args.t = t_buffer[jj]; + args.p = p[jj]; + min_filter(sp_sview, sp_samp, &args, &rgba_temp[0][jj]); num[0] += weight_buffer[jj] * rgba_temp[0][jj]; num[1] += weight_buffer[jj] * rgba_temp[1][jj]; num[2] += weight_buffer[jj] * rgba_temp[2][jj]; @@ -2196,8 +2297,10 @@ img_filter_2d_ewa(struct sp_sampler_view *sp_sview, rgba[2]=0; rgba[3]=0;*/ /* not enough pixels in resampling, resort to direct interpolation */ - min_filter(sp_sview, sp_samp, s[j], t[j], p[j], level, - sp_sview->faces[j], &rgba_temp[0][j]); + args.s = s[j]; + args.t = t[j]; + args.p = p[j]; + min_filter(sp_sview, sp_samp, &args, &rgba_temp[0][j]); den = 1; num[0] = rgba_temp[0][j]; num[1] = rgba_temp[1][j]; @@ -2226,7 +2329,7 @@ mip_filter_linear_aniso(struct sp_sampler_view *sp_sview, const float p[TGSI_QUAD_SIZE], const float c0[TGSI_QUAD_SIZE], const float lod_in[TGSI_QUAD_SIZE], - enum tgsi_sampler_control control, + const struct filter_args *filt_args, float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]) { const struct pipe_resource *texture = sp_sview->base.texture; @@ -2241,11 +2344,12 @@ mip_filter_linear_aniso(struct sp_sampler_view *sp_sview, float dudy = (s[QUAD_TOP_LEFT] - s[QUAD_BOTTOM_LEFT]) * s_to_u; float dvdx = (t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT]) * t_to_v; float dvdy = (t[QUAD_TOP_LEFT] - t[QUAD_BOTTOM_LEFT]) * t_to_v; - - if (control == tgsi_sampler_lod_bias || - control == tgsi_sampler_lod_none || + struct img_filter_args args; + + if (filt_args->control == tgsi_sampler_lod_bias || + filt_args->control == tgsi_sampler_lod_none || /* XXX FIXME */ - control == tgsi_sampler_derivs_explicit) { + filt_args->control == tgsi_sampler_derivs_explicit) { /* note: instead of working with Px and Py, we will use the * squared length instead, to avoid sqrt. 
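 * Comparing squared lengths preserves the ordering, and the square root folds into the LOD computation below, since 0.5*log2(x) = log2(sqrt(x)).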
*/ @@ -2282,12 +2386,12 @@ mip_filter_linear_aniso(struct sp_sampler_view *sp_sview, * this since 0.5*log(x) = log(sqrt(x)) */ lambda = 0.5F * util_fast_log2(Pmin2) + sp_samp->base.lod_bias; - compute_lod(&sp_samp->base, control, lambda, lod_in, lod); + compute_lod(&sp_samp->base, filt_args->control, lambda, lod_in, lod); } else { - assert(control == tgsi_sampler_lod_explicit || - control == tgsi_sampler_lod_zero); - compute_lod(&sp_samp->base, control, sp_samp->base.lod_bias, lod_in, lod); + assert(filt_args->control == tgsi_sampler_lod_explicit || + filt_args->control == tgsi_sampler_lod_zero); + compute_lod(&sp_samp->base, filt_args->control, sp_samp->base.lod_bias, lod_in, lod); } /* XXX: Take into account all lod values. @@ -2300,9 +2404,14 @@ mip_filter_linear_aniso(struct sp_sampler_view *sp_sview, */ if (level0 >= (int) psview->u.tex.last_level) { int j; - for (j = 0; j < TGSI_QUAD_SIZE; j++) - min_filter(sp_sview, sp_samp, s[j], t[j], p[j], psview->u.tex.last_level, - sp_sview->faces[j], &rgba[0][j]); + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + args.s = s[j]; + args.t = t[j]; + args.p = p[j]; + args.level = psview->u.tex.last_level; + args.face_id = sp_sview->faces[j]; + min_filter(sp_sview, sp_samp, &args, &rgba[0][j]); + } } else { /* don't bother interpolating between multiple LODs; it doesn't @@ -2334,29 +2443,33 @@ mip_filter_linear_2d_linear_repeat_POT( const float p[TGSI_QUAD_SIZE], const float c0[TGSI_QUAD_SIZE], const float lod_in[TGSI_QUAD_SIZE], - enum tgsi_sampler_control control, + const struct filter_args *filt_args, float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]) { const struct pipe_sampler_view *psview = &sp_sview->base; int j; float lod[TGSI_QUAD_SIZE]; - compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, control, lod); + compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, filt_args->control, lod); for (j = 0; j < TGSI_QUAD_SIZE; j++) { int level0 = psview->u.tex.first_level + (int)lod[j]; - + struct img_filter_args args; /* Catches both negative and large values of level0: */ + args.s = s[j]; + args.t = t[j]; + args.p = p[j]; + args.face_id = sp_sview->faces[j]; + args.offset = filt_args->offset; + args.gather_only = filt_args->control == tgsi_sampler_gather; if ((unsigned)level0 >= psview->u.tex.last_level) { if (level0 < 0) - img_filter_2d_linear_repeat_POT(sp_sview, sp_samp, s[j], t[j], p[j], - psview->u.tex.first_level, - sp_sview->faces[j], &rgba[0][j]); + args.level = psview->u.tex.first_level; else - img_filter_2d_linear_repeat_POT(sp_sview, sp_samp, s[j], t[j], p[j], - psview->u.tex.last_level, - sp_sview->faces[j], &rgba[0][j]); + args.level = psview->u.tex.last_level; + img_filter_2d_linear_repeat_POT(sp_sview, sp_samp, &args, + &rgba[0][j]); } else { @@ -2364,10 +2477,10 @@ mip_filter_linear_2d_linear_repeat_POT( float rgbax[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]; int c; - img_filter_2d_linear_repeat_POT(sp_sview, sp_samp, s[j], t[j], p[j], level0, - sp_sview->faces[j], &rgbax[0][0]); - img_filter_2d_linear_repeat_POT(sp_sview, sp_samp, s[j], t[j], p[j], level0+1, - sp_sview->faces[j], &rgbax[0][1]); + args.level = level0; + img_filter_2d_linear_repeat_POT(sp_sview, sp_samp, &args, &rgbax[0][0]); + args.level = level0+1; + img_filter_2d_linear_repeat_POT(sp_sview, sp_samp, &args, &rgbax[0][1]); for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[c][j] = lerp(levelBlend, rgbax[c][0], rgbax[c][1]); @@ -2395,11 +2508,12 @@ sample_compare(struct sp_sampler_view *sp_sview, float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]) { const struct pipe_sampler_state *sampler = 
&sp_samp->base; - int j; - int k[4]; + int j, v; + int k[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]; float pc[4]; const struct util_format_description *format_desc; unsigned chan_type; + bool is_gather = (control == tgsi_sampler_gather); /** * Compare texcoord 'p' (aka R) against texture value 'rgba[0]' @@ -2408,13 +2522,13 @@ sample_compare(struct sp_sampler_view *sp_sview, * RGBA channels. We look at the red channel here. */ - if (sp_sview->base.texture->target == PIPE_TEXTURE_2D_ARRAY || - sp_sview->base.texture->target == PIPE_TEXTURE_CUBE) { + if (sp_sview->base.target == PIPE_TEXTURE_2D_ARRAY || + sp_sview->base.target == PIPE_TEXTURE_CUBE) { pc[0] = c0[0]; pc[1] = c0[1]; pc[2] = c0[2]; pc[3] = c0[3]; - } else if (sp_sview->base.texture->target == PIPE_TEXTURE_CUBE_ARRAY) { + } else if (sp_sview->base.target == PIPE_TEXTURE_CUBE_ARRAY) { pc[0] = c1[0]; pc[1] = c1[1]; pc[2] = c1[2]; @@ -2443,65 +2557,74 @@ sample_compare(struct sp_sampler_view *sp_sview, pc[3] = CLAMP(pc[3], 0.0F, 1.0F); } - /* compare four texcoords vs. four texture samples */ - switch (sampler->compare_func) { - case PIPE_FUNC_LESS: - k[0] = pc[0] < rgba[0][0]; - k[1] = pc[1] < rgba[0][1]; - k[2] = pc[2] < rgba[0][2]; - k[3] = pc[3] < rgba[0][3]; - break; - case PIPE_FUNC_LEQUAL: - k[0] = pc[0] <= rgba[0][0]; - k[1] = pc[1] <= rgba[0][1]; - k[2] = pc[2] <= rgba[0][2]; - k[3] = pc[3] <= rgba[0][3]; - break; - case PIPE_FUNC_GREATER: - k[0] = pc[0] > rgba[0][0]; - k[1] = pc[1] > rgba[0][1]; - k[2] = pc[2] > rgba[0][2]; - k[3] = pc[3] > rgba[0][3]; - break; - case PIPE_FUNC_GEQUAL: - k[0] = pc[0] >= rgba[0][0]; - k[1] = pc[1] >= rgba[0][1]; - k[2] = pc[2] >= rgba[0][2]; - k[3] = pc[3] >= rgba[0][3]; - break; - case PIPE_FUNC_EQUAL: - k[0] = pc[0] == rgba[0][0]; - k[1] = pc[1] == rgba[0][1]; - k[2] = pc[2] == rgba[0][2]; - k[3] = pc[3] == rgba[0][3]; - break; - case PIPE_FUNC_NOTEQUAL: - k[0] = pc[0] != rgba[0][0]; - k[1] = pc[1] != rgba[0][1]; - k[2] = pc[2] != rgba[0][2]; - k[3] = pc[3] != rgba[0][3]; - break; - case PIPE_FUNC_ALWAYS: - k[0] = k[1] = k[2] = k[3] = 1; - break; - case PIPE_FUNC_NEVER: - k[0] = k[1] = k[2] = k[3] = 0; - break; - default: - k[0] = k[1] = k[2] = k[3] = 0; - assert(0); - break; + for (v = 0; v < (is_gather ? TGSI_NUM_CHANNELS : 1); v++) { + /* compare four texcoords vs. 
four texture samples */ + switch (sampler->compare_func) { + case PIPE_FUNC_LESS: + k[v][0] = pc[0] < rgba[v][0]; + k[v][1] = pc[1] < rgba[v][1]; + k[v][2] = pc[2] < rgba[v][2]; + k[v][3] = pc[3] < rgba[v][3]; + break; + case PIPE_FUNC_LEQUAL: + k[v][0] = pc[0] <= rgba[v][0]; + k[v][1] = pc[1] <= rgba[v][1]; + k[v][2] = pc[2] <= rgba[v][2]; + k[v][3] = pc[3] <= rgba[v][3]; + break; + case PIPE_FUNC_GREATER: + k[v][0] = pc[0] > rgba[v][0]; + k[v][1] = pc[1] > rgba[v][1]; + k[v][2] = pc[2] > rgba[v][2]; + k[v][3] = pc[3] > rgba[v][3]; + break; + case PIPE_FUNC_GEQUAL: + k[v][0] = pc[0] >= rgba[v][0]; + k[v][1] = pc[1] >= rgba[v][1]; + k[v][2] = pc[2] >= rgba[v][2]; + k[v][3] = pc[3] >= rgba[v][3]; + break; + case PIPE_FUNC_EQUAL: + k[v][0] = pc[0] == rgba[v][0]; + k[v][1] = pc[1] == rgba[v][1]; + k[v][2] = pc[2] == rgba[v][2]; + k[v][3] = pc[3] == rgba[v][3]; + break; + case PIPE_FUNC_NOTEQUAL: + k[v][0] = pc[0] != rgba[v][0]; + k[v][1] = pc[1] != rgba[v][1]; + k[v][2] = pc[2] != rgba[v][2]; + k[v][3] = pc[3] != rgba[v][3]; + break; + case PIPE_FUNC_ALWAYS: + k[v][0] = k[v][1] = k[v][2] = k[v][3] = 1; + break; + case PIPE_FUNC_NEVER: + k[v][0] = k[v][1] = k[v][2] = k[v][3] = 0; + break; + default: + k[v][0] = k[v][1] = k[v][2] = k[v][3] = 0; + assert(0); + break; + } } - for (j = 0; j < TGSI_QUAD_SIZE; j++) { - rgba[0][j] = k[j]; - rgba[1][j] = k[j]; - rgba[2][j] = k[j]; - rgba[3][j] = 1.0F; + if (is_gather) { + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + for (v = 0; v < TGSI_NUM_CHANNELS; v++) { + rgba[v][j] = k[v][j]; + } + } + } else { + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + rgba[0][j] = k[0][j]; + rgba[1][j] = k[0][j]; + rgba[2][j] = k[0][j]; + rgba[3][j] = 1.0F; + } } } - static void do_swizzling(const struct pipe_sampler_view *sview, float in[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE], @@ -2679,9 +2802,9 @@ any_swizzle(const struct pipe_sampler_view *view) static img_filter_func get_img_filter(const struct sp_sampler_view *sp_sview, const struct pipe_sampler_state *sampler, - unsigned filter) + unsigned filter, bool gather) { - switch (sp_sview->base.texture->target) { + switch (sp_sview->base.target) { case PIPE_BUFFER: case PIPE_TEXTURE_1D: if (filter == PIPE_TEX_FILTER_NEAREST) @@ -2699,7 +2822,7 @@ get_img_filter(const struct sp_sampler_view *sp_sview, case PIPE_TEXTURE_RECT: /* Try for fast path: */ - if (sp_sview->pot2d && + if (!gather && sp_sview->pot2d && sampler->wrap_s == sampler->wrap_t && sampler->normalized_coords) { @@ -2769,35 +2892,38 @@ sample_mip(struct sp_sampler_view *sp_sview, const float p[TGSI_QUAD_SIZE], const float c0[TGSI_QUAD_SIZE], const float lod[TGSI_QUAD_SIZE], - enum tgsi_sampler_control control, + const struct filter_args *filt_args, float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]) { mip_filter_func mip_filter; img_filter_func min_img_filter = NULL; img_filter_func mag_img_filter = NULL; - if (sp_sview->pot2d & sp_samp->min_mag_equal_repeat_linear) { + if (filt_args->control == tgsi_sampler_gather) { + mip_filter = mip_filter_nearest; + min_img_filter = get_img_filter(sp_sview, &sp_samp->base, PIPE_TEX_FILTER_LINEAR, true); + } else if (sp_sview->pot2d & sp_samp->min_mag_equal_repeat_linear) { mip_filter = mip_filter_linear_2d_linear_repeat_POT; } else { mip_filter = sp_samp->mip_filter; - min_img_filter = get_img_filter(sp_sview, &sp_samp->base, sp_samp->min_img_filter); + min_img_filter = get_img_filter(sp_sview, &sp_samp->base, sp_samp->min_img_filter, false); if (sp_samp->min_mag_equal) { mag_img_filter = min_img_filter; } else { - mag_img_filter = 
get_img_filter(sp_sview, &sp_samp->base, sp_samp->base.mag_img_filter); + mag_img_filter = get_img_filter(sp_sview, &sp_samp->base, sp_samp->base.mag_img_filter, false); } } mip_filter(sp_sview, sp_samp, min_img_filter, mag_img_filter, - s, t, p, c0, lod, control, rgba); + s, t, p, c0, lod, filt_args, rgba); if (sp_samp->base.compare_mode != PIPE_TEX_COMPARE_NONE) { - sample_compare(sp_sview, sp_samp, s, t, p, c0, lod, control, rgba); + sample_compare(sp_sview, sp_samp, s, t, p, c0, lod, filt_args->control, rgba); } - if (sp_sview->need_swizzle) { + if (sp_sview->need_swizzle && filt_args->control != tgsi_sampler_gather) { float rgba_temp[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]; memcpy(rgba_temp, rgba, sizeof(rgba_temp)); do_swizzling(&sp_sview->base, rgba_temp, rgba); @@ -2818,7 +2944,7 @@ sample_cube(struct sp_sampler_view *sp_sview, const float p[TGSI_QUAD_SIZE], const float c0[TGSI_QUAD_SIZE], const float c1[TGSI_QUAD_SIZE], - enum tgsi_sampler_control control, + const struct filter_args *filt_args, float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]) { unsigned j; @@ -2896,7 +3022,7 @@ sample_cube(struct sp_sampler_view *sp_sview, } } - sample_mip(sp_sview, sp_samp, ssss, tttt, pppp, c0, c1, control, rgba); + sample_mip(sp_sview, sp_samp, ssss, tttt, pppp, c0, c1, filt_args, rgba); } @@ -2907,7 +3033,7 @@ sp_get_dims(struct sp_sampler_view *sp_sview, int level, const struct pipe_sampler_view *view = &sp_sview->base; const struct pipe_resource *texture = view->texture; - if (texture->target == PIPE_BUFFER) { + if (view->target == PIPE_BUFFER) { dims[0] = (view->u.buf.last_element - view->u.buf.first_element) + 1; /* the other values are undefined, but let's avoid potential valgrind * warnings. @@ -2924,7 +3050,7 @@ sp_get_dims(struct sp_sampler_view *sp_sview, int level, dims[3] = view->u.tex.last_level - view->u.tex.first_level + 1; dims[0] = u_minify(texture->width0, level); - switch(texture->target) { + switch (view->target) { case PIPE_TEXTURE_1D_ARRAY: dims[1] = view->u.tex.last_layer - view->u.tex.first_layer + 1; /* fallthrough */ @@ -2975,13 +3101,16 @@ sp_get_texels(struct sp_sampler_view *sp_sview, addr.value = 0; /* TODO write a better test for LOD */ - addr.bits.level = lod[0]; + addr.bits.level = sp_sview->base.target == PIPE_BUFFER ? 
0 : + CLAMP(lod[0] + sp_sview->base.u.tex.first_level, + sp_sview->base.u.tex.first_level, + sp_sview->base.u.tex.last_level); width = u_minify(texture->width0, addr.bits.level); height = u_minify(texture->height0, addr.bits.level); depth = u_minify(texture->depth0, addr.bits.level); - switch(texture->target) { + switch (sp_sview->base.target) { case PIPE_BUFFER: case PIPE_TEXTURE_1D: for (j = 0; j < TGSI_QUAD_SIZE; j++) { @@ -2995,7 +3124,8 @@ sp_get_texels(struct sp_sampler_view *sp_sview, case PIPE_TEXTURE_1D_ARRAY: for (j = 0; j < TGSI_QUAD_SIZE; j++) { int x = CLAMP(v_i[j] + offset[0], 0, width - 1); - int y = CLAMP(v_j[j], sp_sview->base.u.tex.first_layer, sp_sview->base.u.tex.last_layer); + int y = CLAMP(v_j[j], sp_sview->base.u.tex.first_layer, + sp_sview->base.u.tex.last_layer); tx = get_texel_2d_no_border(sp_sview, addr, x, y); for (c = 0; c < 4; c++) { rgba[c][j] = tx[c]; @@ -3017,7 +3147,8 @@ sp_get_texels(struct sp_sampler_view *sp_sview, for (j = 0; j < TGSI_QUAD_SIZE; j++) { int x = CLAMP(v_i[j] + offset[0], 0, width - 1); int y = CLAMP(v_j[j] + offset[1], 0, height - 1); - int layer = CLAMP(v_k[j], sp_sview->base.u.tex.first_layer, sp_sview->base.u.tex.last_layer); + int layer = CLAMP(v_k[j], sp_sview->base.u.tex.first_layer, + sp_sview->base.u.tex.last_layer); tx = get_texel_3d_no_border(sp_sview, addr, x, y, layer); for (c = 0; c < 4; c++) { rgba[c][j] = tx[c]; @@ -3140,7 +3271,7 @@ softpipe_get_lambda_func(const struct pipe_sampler_view *view, unsigned shader) if (shader != PIPE_SHADER_FRAGMENT) return compute_lambda_vert; - switch (view->texture->target) { + switch (view->target) { case PIPE_BUFFER: case PIPE_TEXTURE_1D: case PIPE_TEXTURE_1D_ARRAY: @@ -3176,19 +3307,49 @@ softpipe_create_sampler_view(struct pipe_context *pipe, pipe_resource_reference(&view->texture, resource); view->context = pipe; +#ifdef DEBUG + /* + * This is possibly too lenient, but the primary reason is just + * to catch state trackers which forget to initialize this, so + * it only catches clearly impossible view targets. 
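The DEBUG checks that follow boil down to a small target-compatibility predicate. Restated as a standalone helper for clarity, assuming the PIPE_TEXTURE_* enums from gallium's pipe/p_defines.h (the helper name is invented, not part of the patch):

#include <stdbool.h>
#include "pipe/p_defines.h"

/* True when a sampler view target is a plausible reinterpretation of
 * the underlying resource target, per the asserts just below. */
static bool
view_target_compatible(enum pipe_texture_target view_t,
                       enum pipe_texture_target res_t)
{
   if (view_t == res_t)
      return true;
   switch (view_t) {
   case PIPE_TEXTURE_1D:
      return res_t == PIPE_TEXTURE_1D_ARRAY;
   case PIPE_TEXTURE_1D_ARRAY:
      return res_t == PIPE_TEXTURE_1D;
   case PIPE_TEXTURE_2D:
      return res_t == PIPE_TEXTURE_2D_ARRAY ||
             res_t == PIPE_TEXTURE_CUBE ||
             res_t == PIPE_TEXTURE_CUBE_ARRAY;
   case PIPE_TEXTURE_2D_ARRAY:
      return res_t == PIPE_TEXTURE_2D ||
             res_t == PIPE_TEXTURE_CUBE ||
             res_t == PIPE_TEXTURE_CUBE_ARRAY;
   case PIPE_TEXTURE_CUBE:
      return res_t == PIPE_TEXTURE_CUBE_ARRAY ||
             res_t == PIPE_TEXTURE_2D_ARRAY;
   case PIPE_TEXTURE_CUBE_ARRAY:
      return res_t == PIPE_TEXTURE_CUBE ||
             res_t == PIPE_TEXTURE_2D_ARRAY;
   default:
      return false;
   }
}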
+ */ + if (view->target != resource->target) { + if (view->target == PIPE_TEXTURE_1D) + assert(resource->target == PIPE_TEXTURE_1D_ARRAY); + else if (view->target == PIPE_TEXTURE_1D_ARRAY) + assert(resource->target == PIPE_TEXTURE_1D); + else if (view->target == PIPE_TEXTURE_2D) + assert(resource->target == PIPE_TEXTURE_2D_ARRAY || + resource->target == PIPE_TEXTURE_CUBE || + resource->target == PIPE_TEXTURE_CUBE_ARRAY); + else if (view->target == PIPE_TEXTURE_2D_ARRAY) + assert(resource->target == PIPE_TEXTURE_2D || + resource->target == PIPE_TEXTURE_CUBE || + resource->target == PIPE_TEXTURE_CUBE_ARRAY); + else if (view->target == PIPE_TEXTURE_CUBE) + assert(resource->target == PIPE_TEXTURE_CUBE_ARRAY || + resource->target == PIPE_TEXTURE_2D_ARRAY); + else if (view->target == PIPE_TEXTURE_CUBE_ARRAY) + assert(resource->target == PIPE_TEXTURE_CUBE || + resource->target == PIPE_TEXTURE_2D_ARRAY); + else + assert(0); + } +#endif + if (any_swizzle(view)) { sview->need_swizzle = TRUE; } - if (resource->target == PIPE_TEXTURE_CUBE || - resource->target == PIPE_TEXTURE_CUBE_ARRAY) + if (view->target == PIPE_TEXTURE_CUBE || + view->target == PIPE_TEXTURE_CUBE_ARRAY) sview->get_samples = sample_cube; else { sview->get_samples = sample_mip; } sview->pot2d = spr->pot && - (resource->target == PIPE_TEXTURE_2D || - resource->target == PIPE_TEXTURE_RECT); + (view->target == PIPE_TEXTURE_2D || + view->target == PIPE_TEXTURE_RECT); sview->xpot = util_logbase2( resource->width0 ); sview->ypot = util_logbase2( resource->height0 ); @@ -3230,7 +3391,7 @@ sp_tgsi_get_samples(struct tgsi_sampler *tgsi_sampler, float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]) { struct sp_tgsi_sampler *sp_samp = (struct sp_tgsi_sampler *)tgsi_sampler; - + struct filter_args filt_args; assert(sview_index < PIPE_MAX_SHADER_SAMPLER_VIEWS); assert(sampler_index < PIPE_MAX_SAMPLERS); assert(sp_samp->sp_sampler[sampler_index]); @@ -3244,9 +3405,12 @@ sp_tgsi_get_samples(struct tgsi_sampler *tgsi_sampler, } return; } + + filt_args.control = control; + filt_args.offset = offset; sp_samp->sp_sview[sview_index].get_samples(&sp_samp->sp_sview[sview_index], sp_samp->sp_sampler[sampler_index], - s, t, p, c0, lod, control, rgba); + s, t, p, c0, lod, &filt_args, rgba); } diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.h b/src/gallium/drivers/softpipe/sp_tex_sample.h index 00a97c5186b..7d1aafc4473 100644 --- a/src/gallium/drivers/softpipe/sp_tex_sample.h +++ b/src/gallium/drivers/softpipe/sp_tex_sample.h @@ -38,10 +38,12 @@ struct sp_sampler; typedef void (*wrap_nearest_func)(float s, unsigned size, + int offset, int *icoord); typedef void (*wrap_linear_func)(float s, unsigned size, + int offset, int *icoord0, int *icoord1, float *w); @@ -51,15 +53,27 @@ typedef float (*compute_lambda_func)(const struct sp_sampler_view *sp_sview, const float t[TGSI_QUAD_SIZE], const float p[TGSI_QUAD_SIZE]); +struct img_filter_args { + float s; + float t; + float p; + unsigned level; + unsigned face_id; + const int8_t *offset; + bool gather_only; + int gather_comp; +}; + typedef void (*img_filter_func)(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, - float s, - float t, - float p, - unsigned level, - unsigned face_id, + const struct img_filter_args *args, float *rgba); +struct filter_args { + enum tgsi_sampler_control control; + const int8_t *offset; +}; + typedef void (*mip_filter_func)(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, img_filter_func min_filter, @@ -69,7 +83,7 @@ typedef void (*mip_filter_func)(struct 
sp_sampler_view *sp_sview, const float p[TGSI_QUAD_SIZE], const float c0[TGSI_QUAD_SIZE], const float lod[TGSI_QUAD_SIZE], - enum tgsi_sampler_control control, + const struct filter_args *args, float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]); @@ -80,7 +94,7 @@ typedef void (*filter_func)(struct sp_sampler_view *sp_sview, const float p[TGSI_QUAD_SIZE], const float c0[TGSI_QUAD_SIZE], const float lod[TGSI_QUAD_SIZE], - enum tgsi_sampler_control control, + const struct filter_args *args, float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]); diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c index ab8ba60849a..4a421a8f882 100644 --- a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c +++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c @@ -151,7 +151,7 @@ sp_tex_tile_cache_set_sampler_view(struct softpipe_tex_tile_cache *tc, tc->entries[i].addr.bits.invalid = 1; } - tc->tex_face = -1; /* any invalid value here */ + tc->tex_z = -1; /* any invalid value here */ } } @@ -172,7 +172,7 @@ sp_flush_tex_tile_cache(struct softpipe_tex_tile_cache *tc) for (pos = 0; pos < Elements(tc->entries); pos++) { tc->entries[pos].addr.bits.invalid = 1; } - tc->tex_face = -1; + tc->tex_z = -1; } } @@ -190,8 +190,7 @@ tex_cache_pos( union tex_tile_address addr ) { uint entry = (addr.bits.x + addr.bits.y * 9 + - addr.bits.z * 3 + - addr.bits.face + + addr.bits.z + addr.bits.level * 7); return entry % NUM_TEX_TILE_ENTRIES; @@ -226,7 +225,6 @@ sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc, /* check if we need to get a new transfer */ if (!tc->tex_trans || - tc->tex_face != addr.bits.face || tc->tex_level != addr.bits.level || tc->tex_z != addr.bits.z) { /* get new transfer (view into texture) */ @@ -245,7 +243,7 @@ sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc, } else { height = u_minify(tc->texture->height0, addr.bits.level); - layer = addr.bits.face + addr.bits.z; + layer = addr.bits.z; } tc->tex_trans_map = @@ -255,7 +253,6 @@ sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc, PIPE_TRANSFER_READ | PIPE_TRANSFER_UNSYNCHRONIZED, 0, 0, width, height, &tc->tex_trans); - tc->tex_face = addr.bits.face; tc->tex_level = addr.bits.level; tc->tex_z = addr.bits.z; } diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.h b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h index 4eb42460552..2233effc439 100644 --- a/src/gallium/drivers/softpipe/sp_tex_tile_cache.h +++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h @@ -55,7 +55,6 @@ union tex_tile_address { unsigned x:TEX_ADDR_BITS; /* 16K / TILE_SIZE */ unsigned y:TEX_ADDR_BITS; /* 16K / TILE_SIZE */ unsigned z:TEX_Z_BITS; /* 16K -- z not tiled */ - unsigned face:3; unsigned level:4; unsigned invalid:1; } bits; @@ -94,7 +93,7 @@ struct softpipe_tex_tile_cache struct pipe_transfer *tex_trans; void *tex_trans_map; - int tex_face, tex_level, tex_z; + int tex_level, tex_z; unsigned swizzle_r; unsigned swizzle_g; @@ -141,7 +140,6 @@ tex_tile_address( unsigned x, addr.bits.x = x / TEX_TILE_SIZE; addr.bits.y = y / TEX_TILE_SIZE; addr.bits.z = z; - addr.bits.face = face; addr.bits.level = level; return addr; diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c index b75f0386449..56e486786df 100644 --- a/src/gallium/drivers/svga/svga_screen.c +++ b/src/gallium/drivers/svga/svga_screen.c @@ -308,6 +308,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param) return 1; case PIPE_CAP_UMA: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: + case 
PIPE_CAP_DEVICE_RESET_STATUS_QUERY: return 0; } @@ -376,6 +377,7 @@ static int svga_get_shader_param(struct pipe_screen *screen, unsigned shader, en case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: return 0; } /* If we get here, we failed to handle a cap above */ @@ -433,6 +435,7 @@ static int svga_get_shader_param(struct pipe_screen *screen, unsigned shader, en case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: return 0; } /* If we get here, we failed to handle a cap above */ diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c index 7a12b52e2dd..bac956066a5 100644 --- a/src/gallium/drivers/svga/svga_tgsi_insn.c +++ b/src/gallium/drivers/svga/svga_tgsi_insn.c @@ -1900,7 +1900,7 @@ emit_tex(struct svga_shader_emitter *emit, emit->key.fkey.tex[unit].swizzle_b != PIPE_SWIZZLE_BLUE || emit->key.fkey.tex[unit].swizzle_a != PIPE_SWIZZLE_ALPHA); - boolean saturate = insn->Instruction.Saturate != TGSI_SAT_NONE; + boolean saturate = insn->Instruction.Saturate; /* If doing compare processing or tex swizzle or saturation, we need to put * the fetched color into a temporary so it can be used as a source later on. diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c index 0b56517e696..0013c963e7a 100644 --- a/src/gallium/drivers/trace/tr_context.c +++ b/src/gallium/drivers/trace/tr_context.c @@ -553,6 +553,8 @@ trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe, TRACE_SHADER_STATE(fs) TRACE_SHADER_STATE(vs) TRACE_SHADER_STATE(gs) +TRACE_SHADER_STATE(tcs) +TRACE_SHADER_STATE(tes) #undef TRACE_SHADER_STATE @@ -1508,6 +1510,23 @@ static void trace_context_memory_barrier(struct pipe_context *_context, } +static void trace_context_set_tess_state(struct pipe_context *_context, + const float default_outer_level[4], + const float default_inner_level[2]) +{ + struct trace_context *tr_context = trace_context(_context); + struct pipe_context *context = tr_context->pipe; + + trace_dump_call_begin("pipe_context", "set_tess_state"); + trace_dump_arg(ptr, context); + trace_dump_arg_array(float, default_outer_level, 4); + trace_dump_arg_array(float, default_inner_level, 2); + trace_dump_call_end(); + + context->set_tess_state(context, default_outer_level, default_inner_level); +} + + static const struct debug_named_value rbug_blocker_flags[] = { {"before", 1, NULL}, {"after", 2, NULL}, @@ -1566,6 +1585,12 @@ trace_context_create(struct trace_screen *tr_scr, TR_CTX_INIT(create_gs_state); TR_CTX_INIT(bind_gs_state); TR_CTX_INIT(delete_gs_state); + TR_CTX_INIT(create_tcs_state); + TR_CTX_INIT(bind_tcs_state); + TR_CTX_INIT(delete_tcs_state); + TR_CTX_INIT(create_tes_state); + TR_CTX_INIT(bind_tes_state); + TR_CTX_INIT(delete_tes_state); TR_CTX_INIT(create_vertex_elements_state); TR_CTX_INIT(bind_vertex_elements_state); TR_CTX_INIT(delete_vertex_elements_state); @@ -1597,6 +1622,7 @@ trace_context_create(struct trace_screen *tr_scr, TR_CTX_INIT(flush); TR_CTX_INIT(texture_barrier); TR_CTX_INIT(memory_barrier); + TR_CTX_INIT(set_tess_state); TR_CTX_INIT(transfer_map); TR_CTX_INIT(transfer_unmap); diff --git a/src/gallium/drivers/trace/tr_dump_state.c b/src/gallium/drivers/trace/tr_dump_state.c index 71273380434..9bf4a722d80 100644 --- 
a/src/gallium/drivers/trace/tr_dump_state.c +++ b/src/gallium/drivers/trace/tr_dump_state.c @@ -709,6 +709,8 @@ void trace_dump_draw_info(const struct pipe_draw_info *state) trace_dump_member(uint, state, start_instance); trace_dump_member(uint, state, instance_count); + trace_dump_member(uint, state, vertices_per_patch); + trace_dump_member(int, state, index_bias); trace_dump_member(uint, state, min_index); trace_dump_member(uint, state, max_index); diff --git a/src/gallium/drivers/trace/tr_public.h b/src/gallium/drivers/trace/tr_public.h index aee4937dd4f..b03133f8d97 100644 --- a/src/gallium/drivers/trace/tr_public.h +++ b/src/gallium/drivers/trace/tr_public.h @@ -28,6 +28,8 @@ #ifndef TR_PUBLIC_H #define TR_PUBLIC_H +#include "pipe/p_compiler.h" + #ifdef __cplusplus extern "C" { #endif diff --git a/src/gallium/drivers/vc4/kernel/Makefile.am b/src/gallium/drivers/vc4/Android.mk index 1ae5f1c2e83..f42a152aa8c 100644 --- a/src/gallium/drivers/vc4/kernel/Makefile.am +++ b/src/gallium/drivers/vc4/Android.mk @@ -1,4 +1,4 @@ -# Copyright © 2014 Broadcom +# Copyright (C) 2014 Emil Velikov <[email protected]> # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), @@ -7,34 +7,31 @@ # and/or sell copies of the Software, and to permit persons to whom the # Software is furnished to do so, subject to the following conditions: # -# The above copyright notice and this permission notice (including the next -# paragraph) shall be included in all copies or substantial portions of the -# Software. +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. -include Makefile.sources -include $(top_srcdir)/src/gallium/Automake.inc +LOCAL_PATH := $(call my-dir) -if USE_VC4_SIMULATOR -SIM_CFLAGS = -DUSE_VC4_SIMULATOR=1 -endif +# get C_SOURCES +include $(LOCAL_PATH)/Makefile.sources -AM_CFLAGS = \ - $(LIBDRM_CFLAGS) \ - $(GALLIUM_DRIVER_CFLAGS) \ - $(SIM_CFLAGS) \ - -I$(top_srcdir)/src/mesa/ \ - -I$(srcdir)/../ \ - $() +include $(CLEAR_VARS) -noinst_LTLIBRARIES = libvc4_kernel.la +LOCAL_SRC_FILES := \ + $(C_SOURCES) -libvc4_kernel_la_SOURCES = $(C_SOURCES) -libvc4_kernel_la_LDFLAGS = $(SIM_LDFLAGS) +LOCAL_SHARED_LIBRARIES := libdrm +# We need libmesa_glsl to get NIR's generated include directories. +LOCAL_STATIC_LIBRARIES := libmesa_glsl +LOCAL_MODULE := libmesa_pipe_vc4 + +include $(GALLIUM_COMMON_MK) +include $(BUILD_STATIC_LIBRARY) diff --git a/src/gallium/drivers/vc4/Makefile.am b/src/gallium/drivers/vc4/Makefile.am index 3fc591f10c1..3f62ce21a9f 100644 --- a/src/gallium/drivers/vc4/Makefile.am +++ b/src/gallium/drivers/vc4/Makefile.am @@ -19,7 +19,7 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS # IN THE SOFTWARE. 
-SUBDIRS = kernel +AUTOMAKE_OPTIONS = subdir-objects include Makefile.sources include $(top_srcdir)/src/gallium/Automake.inc @@ -39,5 +39,5 @@ AM_CFLAGS = \ noinst_LTLIBRARIES = libvc4.la libvc4_la_SOURCES = $(C_SOURCES) -libvc4_la_LIBADD = $(SIM_LIB) kernel/libvc4_kernel.la +libvc4_la_LIBADD = $(SIM_LIB) libvc4_la_LDFLAGS = $(SIM_LDFLAGS) diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources index 49474df3548..1eb029e67e7 100644 --- a/src/gallium/drivers/vc4/Makefile.sources +++ b/src/gallium/drivers/vc4/Makefile.sources @@ -1,4 +1,10 @@ C_SOURCES := \ + kernel/vc4_drv.h \ + kernel/vc4_gem.c \ + kernel/vc4_packet.h \ + kernel/vc4_render_cl.c \ + kernel/vc4_validate.c \ + kernel/vc4_validate_shaders.c \ vc4_blit.c \ vc4_bufmgr.c \ vc4_bufmgr.h \ @@ -20,7 +26,6 @@ C_SOURCES := \ vc4_opt_dead_code.c \ vc4_opt_small_immediates.c \ vc4_opt_vpm_writes.c \ - vc4_packet.h \ vc4_program.c \ vc4_qir.c \ vc4_qir_lower_uniforms.c \ diff --git a/src/gallium/drivers/vc4/kernel/Makefile.sources b/src/gallium/drivers/vc4/kernel/Makefile.sources deleted file mode 100644 index 7d17a898ebf..00000000000 --- a/src/gallium/drivers/vc4/kernel/Makefile.sources +++ /dev/null @@ -1,6 +0,0 @@ -C_SOURCES := \ - vc4_drv.h \ - vc4_gem.c \ - vc4_validate.c \ - vc4_validate_shaders.c \ - $() diff --git a/src/gallium/drivers/vc4/kernel/vc4_drv.h b/src/gallium/drivers/vc4/kernel/vc4_drv.h index 325f944bf25..1fd8aa9fb28 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_drv.h +++ b/src/gallium/drivers/vc4/kernel/vc4_drv.h @@ -28,8 +28,6 @@ enum vc4_bo_mode { VC4_MODE_UNDECIDED, - VC4_MODE_TILE_ALLOC, - VC4_MODE_TSDA, VC4_MODE_RENDER, VC4_MODE_SHADER, }; @@ -52,6 +50,11 @@ struct vc4_exec_info { struct vc4_bo_exec_state *bo; uint32_t bo_count; + /* List of other BOs used in the job that need to be released + * once the job is complete. + */ + struct list_head unref_list; + /* Current unvalidated indices into @bo loaded by the non-hardware * VC4_PACKET_GEM_HANDLES. 
*/ @@ -83,14 +86,11 @@ struct vc4_exec_info { uint32_t shader_state_count; bool found_tile_binning_mode_config_packet; - bool found_tile_rendering_mode_config_packet; bool found_start_tile_binning_packet; bool found_increment_semaphore_packet; - bool found_wait_on_semaphore_packet; uint8_t bin_tiles_x, bin_tiles_y; - uint32_t fb_width, fb_height; - uint32_t tile_alloc_init_block_size; - struct drm_gem_cma_object *tile_alloc_bo; + struct drm_gem_cma_object *tile_bo; + uint32_t tile_alloc_offset; /** * Computed addresses pointing into exec_bo where we start the @@ -157,13 +157,10 @@ struct vc4_validated_shader_info /* vc4_validate.c */ int -vc4_validate_cl(struct drm_device *dev, - void *validated, - void *unvalidated, - uint32_t len, - bool is_bin, - bool has_bin, - struct vc4_exec_info *exec); +vc4_validate_bin_cl(struct drm_device *dev, + void *validated, + void *unvalidated, + struct vc4_exec_info *exec); int vc4_validate_shader_recs(struct drm_device *dev, struct vc4_exec_info *exec); @@ -171,4 +168,16 @@ vc4_validate_shader_recs(struct drm_device *dev, struct vc4_exec_info *exec); struct vc4_validated_shader_info * vc4_validate_shader(struct drm_gem_cma_object *shader_obj); +bool vc4_use_bo(struct vc4_exec_info *exec, + uint32_t hindex, + enum vc4_bo_mode mode, + struct drm_gem_cma_object **obj); + +int vc4_get_rcl(struct drm_device *dev, struct vc4_exec_info *exec); + +bool vc4_check_tex_size(struct vc4_exec_info *exec, + struct drm_gem_cma_object *fbo, + uint32_t offset, uint8_t tiling_format, + uint32_t width, uint32_t height, uint8_t cpp); + #endif /* VC4_DRV_H */ diff --git a/src/gallium/drivers/vc4/kernel/vc4_gem.c b/src/gallium/drivers/vc4/kernel/vc4_gem.c index ac29ab35dbc..e4b7fea5968 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_gem.c +++ b/src/gallium/drivers/vc4/kernel/vc4_gem.c @@ -25,24 +25,26 @@ #include "vc4_drv.h" -int -vc4_cl_validate(struct drm_device *dev, struct vc4_exec_info *exec) +/* + * Copies in the user's binning command list and generates the validated bin + * CL, along with associated data (shader records, uniforms). 
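vc4_get_bcl below packs the bin CL, shader records, and uniforms into a single buffer, guarding each running offset against uint32_t wraparound. A sketch of that layout-plus-guard pattern under stated assumptions (ROUNDUP and the function name are stand-ins, and the real code additionally bounds shader_rec_count):

#include <stdbool.h>
#include <stdint.h>

#define ROUNDUP(x, align) (((x) + (align) - 1) / (align) * (align))

/* Illustrative only: compute the sub-buffer offsets the way the kernel
 * hunk does, and reject any layout whose running offsets wrapped. */
static bool
layout_exec_buffer(uint32_t bin_cl_size, uint32_t shader_rec_size,
                   uint32_t uniforms_size,
                   uint32_t *shader_rec_offset, uint32_t *uniforms_offset,
                   uint32_t *exec_size)
{
   uint32_t bin_offset = 0;

   /* Shader records want 16-byte alignment after the bin CL. */
   *shader_rec_offset = ROUNDUP(bin_offset + bin_cl_size, 16);
   *uniforms_offset = *shader_rec_offset + shader_rec_size;
   *exec_size = *uniforms_offset + uniforms_size;

   /* Offsets must be non-decreasing; a decrease means an addition
    * wrapped and a later copy would land out of bounds. */
   return *uniforms_offset >= *shader_rec_offset &&
          *exec_size >= *uniforms_offset;
}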
+ */ +static int +vc4_get_bcl(struct drm_device *dev, struct vc4_exec_info *exec) { struct drm_vc4_submit_cl *args = exec->args; void *temp = NULL; - void *bin, *render; + void *bin; int ret = 0; uint32_t bin_offset = 0; - uint32_t render_offset = bin_offset + args->bin_cl_size; - uint32_t shader_rec_offset = roundup(render_offset + - args->render_cl_size, 16); + uint32_t shader_rec_offset = roundup(bin_offset + args->bin_cl_size, + 16); uint32_t uniforms_offset = shader_rec_offset + args->shader_rec_size; uint32_t exec_size = uniforms_offset + args->uniforms_size; uint32_t temp_size = exec_size + (sizeof(struct vc4_shader_state) * args->shader_rec_count); - if (shader_rec_offset < render_offset || - uniforms_offset < shader_rec_offset || + if (uniforms_offset < shader_rec_offset || exec_size < uniforms_offset || args->shader_rec_count >= (UINT_MAX / sizeof(struct vc4_shader_state)) || @@ -66,7 +68,6 @@ vc4_cl_validate(struct drm_device *dev, struct vc4_exec_info *exec) goto fail; } bin = temp + bin_offset; - render = temp + render_offset; exec->shader_rec_u = temp + shader_rec_offset; exec->uniforms_u = temp + uniforms_offset; exec->shader_state = temp + exec_size; @@ -80,14 +81,6 @@ vc4_cl_validate(struct drm_device *dev, struct vc4_exec_info *exec) goto fail; } - ret = copy_from_user(render, - (void __user *)(uintptr_t)args->render_cl, - args->render_cl_size); - if (ret) { - DRM_ERROR("Failed to copy in render cl\n"); - goto fail; - } - ret = copy_from_user(exec->shader_rec_u, (void __user *)(uintptr_t)args->shader_rec, args->shader_rec_size); @@ -114,8 +107,10 @@ vc4_cl_validate(struct drm_device *dev, struct vc4_exec_info *exec) } #endif + list_addtail(&to_vc4_bo(&exec->exec_bo->base)->unref_head, + &exec->unref_list); + exec->ct0ca = exec->exec_bo->paddr + bin_offset; - exec->ct1ca = exec->exec_bo->paddr + render_offset; exec->shader_rec_v = exec->exec_bo->vaddr + shader_rec_offset; exec->shader_rec_p = exec->exec_bo->paddr + shader_rec_offset; @@ -125,23 +120,10 @@ vc4_cl_validate(struct drm_device *dev, struct vc4_exec_info *exec) exec->uniforms_p = exec->exec_bo->paddr + uniforms_offset; exec->uniforms_size = args->uniforms_size; - ret = vc4_validate_cl(dev, - exec->exec_bo->vaddr + bin_offset, - bin, - args->bin_cl_size, - true, - args->bin_cl_size != 0, - exec); - if (ret) - goto fail; - - ret = vc4_validate_cl(dev, - exec->exec_bo->vaddr + render_offset, - render, - args->render_cl_size, - false, - args->bin_cl_size != 0, - exec); + ret = vc4_validate_bin_cl(dev, + exec->exec_bo->vaddr + bin_offset, + bin, + exec); if (ret) goto fail; @@ -152,4 +134,25 @@ fail: return ret; } +int +vc4_cl_validate(struct drm_device *dev, struct vc4_exec_info *exec) +{ + int ret = 0; + + if (exec->args->bin_cl_size != 0) { + ret = vc4_get_bcl(dev, exec); + if (ret) + goto fail; + } else { + exec->ct0ca = exec->ct0ea = 0; + } + + ret = vc4_get_rcl(dev, exec); + if (ret) + goto fail; + +fail: + return ret; +} + #endif /* USE_VC4_SIMULATOR */ diff --git a/src/gallium/drivers/vc4/vc4_packet.h b/src/gallium/drivers/vc4/kernel/vc4_packet.h index 181f2e01dc9..88cfc0fa9f0 100644 --- a/src/gallium/drivers/vc4/vc4_packet.h +++ b/src/gallium/drivers/vc4/kernel/vc4_packet.h @@ -81,6 +81,38 @@ enum vc4_packet { VC4_PACKET_GEM_HANDLES = 254, } __attribute__ ((__packed__)); +#define VC4_PACKET_HALT_SIZE 1 +#define VC4_PACKET_NOP_SIZE 1 +#define VC4_PACKET_FLUSH_SIZE 1 +#define VC4_PACKET_FLUSH_ALL_SIZE 1 +#define VC4_PACKET_START_TILE_BINNING_SIZE 1 +#define VC4_PACKET_INCREMENT_SEMAPHORE_SIZE 1 +#define 
VC4_PACKET_WAIT_ON_SEMAPHORE_SIZE 1 +#define VC4_PACKET_BRANCH_TO_SUB_LIST_SIZE 5 +#define VC4_PACKET_STORE_MS_TILE_BUFFER_SIZE 1 +#define VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF_SIZE 1 +#define VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE 7 +#define VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE 7 +#define VC4_PACKET_GL_INDEXED_PRIMITIVE_SIZE 14 +#define VC4_PACKET_GL_ARRAY_PRIMITIVE_SIZE 10 +#define VC4_PACKET_PRIMITIVE_LIST_FORMAT_SIZE 2 +#define VC4_PACKET_GL_SHADER_STATE_SIZE 5 +#define VC4_PACKET_NV_SHADER_STATE_SIZE 5 +#define VC4_PACKET_CONFIGURATION_BITS_SIZE 4 +#define VC4_PACKET_FLAT_SHADE_FLAGS_SIZE 5 +#define VC4_PACKET_POINT_SIZE_SIZE 5 +#define VC4_PACKET_LINE_WIDTH_SIZE 5 +#define VC4_PACKET_RHT_X_BOUNDARY_SIZE 3 +#define VC4_PACKET_DEPTH_OFFSET_SIZE 5 +#define VC4_PACKET_CLIP_WINDOW_SIZE 9 +#define VC4_PACKET_VIEWPORT_OFFSET_SIZE 5 +#define VC4_PACKET_CLIPPER_XY_SCALING_SIZE 9 +#define VC4_PACKET_CLIPPER_Z_SCALING_SIZE 9 +#define VC4_PACKET_TILE_BINNING_MODE_CONFIG_SIZE 16 +#define VC4_PACKET_TILE_RENDERING_MODE_CONFIG_SIZE 11 +#define VC4_PACKET_CLEAR_COLORS_SIZE 14 +#define VC4_PACKET_TILE_COORDINATES_SIZE 3 +#define VC4_PACKET_GEM_HANDLES_SIZE 9 #define VC4_MASK(high, low) (((1 << ((high) - (low) + 1)) - 1) << (low)) /* Using the GNU statement expression extension */ @@ -117,18 +149,19 @@ enum vc4_packet { /** @{ * - * byte 1 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and + * byte 0-1 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and * VC4_PACKET_LOAD_TILE_BUFFER_GENERAL */ -#define VC4_STORE_TILE_BUFFER_DISABLE_VG_MASK_CLEAR (1 << 7) -#define VC4_STORE_TILE_BUFFER_DISABLE_ZS_CLEAR (1 << 6) -#define VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR (1 << 5) -#define VC4_STORE_TILE_BUFFER_DISABLE_SWAP (1 << 4) - -#define VC4_LOADSTORE_TILE_BUFFER_RGBA8888 (0 << 0) -#define VC4_LOADSTORE_TILE_BUFFER_BGR565_DITHER (1 << 0) -#define VC4_LOADSTORE_TILE_BUFFER_BGR565 (2 << 0) -#define VC4_LOADSTORE_TILE_BUFFER_MASK (3 << 0) +#define VC4_STORE_TILE_BUFFER_DISABLE_VG_MASK_CLEAR (1 << 15) +#define VC4_STORE_TILE_BUFFER_DISABLE_ZS_CLEAR (1 << 14) +#define VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR (1 << 13) +#define VC4_STORE_TILE_BUFFER_DISABLE_SWAP (1 << 12) + +#define VC4_LOADSTORE_TILE_BUFFER_FORMAT_MASK VC4_MASK(9, 8) +#define VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT 8 +#define VC4_LOADSTORE_TILE_BUFFER_RGBA8888 0 +#define VC4_LOADSTORE_TILE_BUFFER_BGR565_DITHER 1 +#define VC4_LOADSTORE_TILE_BUFFER_BGR565 2 /** @} */ /** @{ @@ -136,21 +169,24 @@ enum vc4_packet { * byte 0 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and * VC4_PACKET_LOAD_TILE_BUFFER_GENERAL */ +#define VC4_STORE_TILE_BUFFER_MODE_MASK VC4_MASK(7, 6) +#define VC4_STORE_TILE_BUFFER_MODE_SHIFT 6 #define VC4_STORE_TILE_BUFFER_MODE_SAMPLE0 (0 << 6) #define VC4_STORE_TILE_BUFFER_MODE_DECIMATE_X4 (1 << 6) #define VC4_STORE_TILE_BUFFER_MODE_DECIMATE_X16 (2 << 6) /** The values of the field are VC4_TILING_FORMAT_* */ -#define VC4_LOADSTORE_TILE_BUFFER_FORMAT_MASK (3 << 4) -#define VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT 4 - - -#define VC4_LOADSTORE_TILE_BUFFER_NONE (0 << 0) -#define VC4_LOADSTORE_TILE_BUFFER_COLOR (1 << 0) -#define VC4_LOADSTORE_TILE_BUFFER_ZS (2 << 0) -#define VC4_LOADSTORE_TILE_BUFFER_Z (3 << 0) -#define VC4_LOADSTORE_TILE_BUFFER_VG_MASK (4 << 0) -#define VC4_LOADSTORE_TILE_BUFFER_FULL (5 << 0) +#define VC4_LOADSTORE_TILE_BUFFER_TILING_MASK VC4_MASK(5, 4) +#define VC4_LOADSTORE_TILE_BUFFER_TILING_SHIFT 4 + +#define VC4_LOADSTORE_TILE_BUFFER_BUFFER_MASK VC4_MASK(2, 0) +#define VC4_LOADSTORE_TILE_BUFFER_BUFFER_SHIFT 0 +#define 
VC4_LOADSTORE_TILE_BUFFER_NONE 0 +#define VC4_LOADSTORE_TILE_BUFFER_COLOR 1 +#define VC4_LOADSTORE_TILE_BUFFER_ZS 2 +#define VC4_LOADSTORE_TILE_BUFFER_Z 3 +#define VC4_LOADSTORE_TILE_BUFFER_VG_MASK 4 +#define VC4_LOADSTORE_TILE_BUFFER_FULL 5 /** @} */ #define VC4_INDEX_BUFFER_U8 (0 << 4) @@ -196,15 +232,19 @@ enum vc4_packet { /** @{ bits in the last u8 of VC4_PACKET_TILE_BINNING_MODE_CONFIG */ #define VC4_BIN_CONFIG_DB_NON_MS (1 << 7) -#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_32 (0 << 5) -#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_64 (1 << 5) -#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_128 (2 << 5) -#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_256 (3 << 5) +#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_MASK VC4_MASK(6, 5) +#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_SHIFT 5 +#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_32 0 +#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_64 1 +#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_128 2 +#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_256 3 -#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_32 (0 << 3) -#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_64 (1 << 3) -#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_128 (2 << 3) -#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_256 (3 << 3) +#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_MASK VC4_MASK(4, 3) +#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_SHIFT 3 +#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_32 0 +#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_64 1 +#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_128 2 +#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_256 3 #define VC4_BIN_CONFIG_AUTO_INIT_TSDA (1 << 2) #define VC4_BIN_CONFIG_TILE_BUFFER_64BIT (1 << 1) @@ -219,17 +259,18 @@ enum vc4_packet { #define VC4_RENDER_CONFIG_ENABLE_VG_MASK (1 << 8) /** The values of the field are VC4_TILING_FORMAT_* */ -#define VC4_RENDER_CONFIG_MEMORY_FORMAT_MASK (3 << 6) +#define VC4_RENDER_CONFIG_MEMORY_FORMAT_MASK VC4_MASK(7, 6) #define VC4_RENDER_CONFIG_MEMORY_FORMAT_SHIFT 6 #define VC4_RENDER_CONFIG_DECIMATE_MODE_1X (0 << 4) #define VC4_RENDER_CONFIG_DECIMATE_MODE_4X (1 << 4) #define VC4_RENDER_CONFIG_DECIMATE_MODE_16X (2 << 4) -#define VC4_RENDER_CONFIG_FORMAT_BGR565_DITHERED (0 << 2) -#define VC4_RENDER_CONFIG_FORMAT_RGBA8888 (1 << 2) -#define VC4_RENDER_CONFIG_FORMAT_BGR565 (2 << 2) -#define VC4_RENDER_CONFIG_FORMAT_MASK (3 << 2) +#define VC4_RENDER_CONFIG_FORMAT_MASK VC4_MASK(3, 2) +#define VC4_RENDER_CONFIG_FORMAT_SHIFT 2 +#define VC4_RENDER_CONFIG_FORMAT_BGR565_DITHERED 0 +#define VC4_RENDER_CONFIG_FORMAT_RGBA8888 1 +#define VC4_RENDER_CONFIG_FORMAT_BGR565 2 #define VC4_RENDER_CONFIG_TILE_BUFFER_64BIT (1 << 1) #define VC4_RENDER_CONFIG_MS_MODE_4X (1 << 0) diff --git a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c new file mode 100644 index 00000000000..e2d907ad91f --- /dev/null +++ b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c @@ -0,0 +1,447 @@ +/* + * Copyright © 2014-2015 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
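Stepping back to the vc4_packet.h changes above: the switch from pre-shifted values to MASK/SHIFT pairs lets one macro pair pack and unpack any field. VC4_MASK is quoted from the header; the VC4_SET_FIELD/VC4_GET_FIELD bodies below are simplified stand-ins (the header mentions the GNU statement expression extension for its field helpers, which can also range-check the value):

#include <assert.h>
#include <stdint.h>

#define VC4_MASK(high, low) (((1 << ((high) - (low) + 1)) - 1) << (low))
/* Simplified stand-ins for the header's field helpers: */
#define VC4_SET_FIELD(value, field) (((value) << field##_SHIFT) & field##_MASK)
#define VC4_GET_FIELD(word, field)  (((word) & field##_MASK) >> field##_SHIFT)

#define VC4_LOADSTORE_TILE_BUFFER_BUFFER_MASK  VC4_MASK(2, 0)
#define VC4_LOADSTORE_TILE_BUFFER_BUFFER_SHIFT 0
#define VC4_LOADSTORE_TILE_BUFFER_ZS           2

int main(void)
{
   uint16_t bits = VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_ZS,
                                 VC4_LOADSTORE_TILE_BUFFER_BUFFER);
   assert(VC4_GET_FIELD(bits, VC4_LOADSTORE_TILE_BUFFER_BUFFER) ==
          VC4_LOADSTORE_TILE_BUFFER_ZS);
   return 0;
}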
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * DOC: Render command list generation + * + * In the VC4 driver, render command list generation is performed by the + * kernel instead of userspace. We do this because validating a + * user-submitted command list is hard to get right and has high CPU overhead, + * while the number of valid configurations for render command lists is + * actually fairly low. + */ + +#include "vc4_drv.h" +#include "vc4_packet.h" + +struct vc4_rcl_setup { + struct drm_gem_cma_object *color_read; + struct drm_gem_cma_object *color_ms_write; + struct drm_gem_cma_object *zs_read; + struct drm_gem_cma_object *zs_write; + + struct drm_gem_cma_object *rcl; + u32 next_offset; +}; + +static inline void rcl_u8(struct vc4_rcl_setup *setup, u8 val) +{ + *(u8 *)(setup->rcl->vaddr + setup->next_offset) = val; + setup->next_offset += 1; +} + +static inline void rcl_u16(struct vc4_rcl_setup *setup, u16 val) +{ + *(u16 *)(setup->rcl->vaddr + setup->next_offset) = val; + setup->next_offset += 2; +} + +static inline void rcl_u32(struct vc4_rcl_setup *setup, u32 val) +{ + *(u32 *)(setup->rcl->vaddr + setup->next_offset) = val; + setup->next_offset += 4; +} + + +/* + * Emits a no-op STORE_TILE_BUFFER_GENERAL. + * + * If we emit a PACKET_TILE_COORDINATES, it must be followed by a store of + * some sort before another load is triggered. + */ +static void vc4_store_before_load(struct vc4_rcl_setup *setup) +{ + rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL); + rcl_u16(setup, + VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_NONE, + VC4_LOADSTORE_TILE_BUFFER_BUFFER) | + VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR | + VC4_STORE_TILE_BUFFER_DISABLE_ZS_CLEAR | + VC4_STORE_TILE_BUFFER_DISABLE_VG_MASK_CLEAR); + rcl_u32(setup, 0); /* no address, since we're in None mode */ +} + +/* + * Emits a PACKET_TILE_COORDINATES if one isn't already pending. + * + * The tile coordinates packet triggers a pending load if there is one, is + * used for clipping during rendering, and determines where loads/stores happen + * relative to their base address. + */ +static void vc4_tile_coordinates(struct vc4_rcl_setup *setup, + uint32_t x, uint32_t y) +{ + rcl_u8(setup, VC4_PACKET_TILE_COORDINATES); + rcl_u8(setup, x); + rcl_u8(setup, y); +} + +static void emit_tile(struct vc4_exec_info *exec, + struct vc4_rcl_setup *setup, + uint8_t x, uint8_t y, bool first, bool last) +{ + bool has_bin = exec->args->bin_cl_size != 0; + + /* Note that the load doesn't actually occur until the + * tile coords packet is processed, and only one load + * may be outstanding at a time. + */ + if (setup->color_read) { + rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL); + rcl_u16(setup, exec->args->color_read.bits); + rcl_u32(setup, + setup->color_read->paddr + + exec->args->color_read.offset); + } + + if (setup->zs_read) { + if (setup->color_read) { + /* Exec previous load.
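The rcl_u8/u16/u32 emitters above write through a kernel CMA mapping with casted stores. The same emit pattern against an ordinary byte buffer, for illustration (struct and names invented; memcpy sidesteps the alignment concerns the casts raise):

#include <stdint.h>
#include <string.h>

struct rcl_buf {
   uint8_t *vaddr;        /* backing storage */
   uint32_t next_offset;  /* write cursor, as in vc4_rcl_setup */
};

static void emit_u8(struct rcl_buf *b, uint8_t v)
{
   memcpy(b->vaddr + b->next_offset, &v, sizeof(v));
   b->next_offset += sizeof(v);
}

static void emit_u16(struct rcl_buf *b, uint16_t v)
{
   memcpy(b->vaddr + b->next_offset, &v, sizeof(v));
   b->next_offset += sizeof(v);
}

static void emit_u32(struct rcl_buf *b, uint32_t v)
{
   memcpy(b->vaddr + b->next_offset, &v, sizeof(v));
   b->next_offset += sizeof(v);
}

Sizing the buffer up front and asserting that the cursor equals the precomputed size once emission finishes, as vc4_create_rcl_bo does below with BUG_ON, catches any drift between the size accounting and the emitters.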
*/ + vc4_tile_coordinates(setup, x, y); + vc4_store_before_load(setup); + } + + rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL); + rcl_u16(setup, exec->args->zs_read.bits); + rcl_u32(setup, + setup->zs_read->paddr + exec->args->zs_read.offset); + } + + /* Clipping depends on tile coordinates having been + * emitted, so we always need one here. + */ + vc4_tile_coordinates(setup, x, y); + + /* Wait for the binner before jumping to the first + * tile's lists. + */ + if (first && has_bin) + rcl_u8(setup, VC4_PACKET_WAIT_ON_SEMAPHORE); + + if (has_bin) { + rcl_u8(setup, VC4_PACKET_BRANCH_TO_SUB_LIST); + rcl_u32(setup, (exec->tile_bo->paddr + + exec->tile_alloc_offset + + (y * exec->bin_tiles_x + x) * 32)); + } + + if (setup->zs_write) { + rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL); + rcl_u16(setup, exec->args->zs_write.bits | + (setup->color_ms_write ? + VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR : 0)); + rcl_u32(setup, + (setup->zs_write->paddr + exec->args->zs_write.offset) | + ((last && !setup->color_ms_write) ? + VC4_LOADSTORE_TILE_BUFFER_EOF : 0)); + } + + if (setup->color_ms_write) { + if (setup->zs_write) { + /* Reset after previous store */ + vc4_tile_coordinates(setup, x, y); + } + + if (last) + rcl_u8(setup, VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF); + else + rcl_u8(setup, VC4_PACKET_STORE_MS_TILE_BUFFER); + } +} + +static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec, + struct vc4_rcl_setup *setup) +{ + bool has_bin = exec->args->bin_cl_size != 0; + uint8_t min_x_tile = exec->args->min_x_tile; + uint8_t min_y_tile = exec->args->min_y_tile; + uint8_t max_x_tile = exec->args->max_x_tile; + uint8_t max_y_tile = exec->args->max_y_tile; + uint8_t xtiles = max_x_tile - min_x_tile + 1; + uint8_t ytiles = max_y_tile - min_y_tile + 1; + uint8_t x, y; + uint32_t size, loop_body_size; + + size = VC4_PACKET_TILE_RENDERING_MODE_CONFIG_SIZE; + loop_body_size = VC4_PACKET_TILE_COORDINATES_SIZE; + + if (exec->args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) { + size += VC4_PACKET_CLEAR_COLORS_SIZE + + VC4_PACKET_TILE_COORDINATES_SIZE + + VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE; + } + + if (setup->color_read) { + loop_body_size += (VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE); + } + if (setup->zs_read) { + if (setup->color_read) { + loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE; + loop_body_size += VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE; + } + loop_body_size += VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE; + } + + if (has_bin) { + size += VC4_PACKET_WAIT_ON_SEMAPHORE_SIZE; + loop_body_size += VC4_PACKET_BRANCH_TO_SUB_LIST_SIZE; + } + + if (setup->zs_write) + loop_body_size += VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE; + if (setup->color_ms_write) { + if (setup->zs_write) + loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE; + loop_body_size += VC4_PACKET_STORE_MS_TILE_BUFFER_SIZE; + } + size += xtiles * ytiles * loop_body_size; + + setup->rcl = drm_gem_cma_create(dev, size); + if (!setup->rcl) + return -ENOMEM; + list_addtail(&to_vc4_bo(&setup->rcl->base)->unref_head, + &exec->unref_list); + + rcl_u8(setup, VC4_PACKET_TILE_RENDERING_MODE_CONFIG); + rcl_u32(setup, + (setup->color_ms_write ? + (setup->color_ms_write->paddr + + exec->args->color_ms_write.offset) : + 0)); + rcl_u16(setup, exec->args->width); + rcl_u16(setup, exec->args->height); + rcl_u16(setup, exec->args->color_ms_write.bits); + + /* The tile buffer gets cleared when the previous tile is stored. 
If + * the clear values changed between frames, then the tile buffer has + * stale clear values in it, so we have to do a store in None mode (no + * writes) so that we trigger the tile buffer clear. + */ + if (exec->args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) { + rcl_u8(setup, VC4_PACKET_CLEAR_COLORS); + rcl_u32(setup, exec->args->clear_color[0]); + rcl_u32(setup, exec->args->clear_color[1]); + rcl_u32(setup, exec->args->clear_z); + rcl_u8(setup, exec->args->clear_s); + + vc4_tile_coordinates(setup, 0, 0); + + rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL); + rcl_u16(setup, VC4_LOADSTORE_TILE_BUFFER_NONE); + rcl_u32(setup, 0); /* no address, since we're in None mode */ + } + + for (y = min_y_tile; y <= max_y_tile; y++) { + for (x = min_x_tile; x <= max_x_tile; x++) { + bool first = (x == min_x_tile && y == min_y_tile); + bool last = (x == max_x_tile && y == max_y_tile); + emit_tile(exec, setup, x, y, first, last); + } + } + + BUG_ON(setup->next_offset != size); + exec->ct1ca = setup->rcl->paddr; + exec->ct1ea = setup->rcl->paddr + setup->next_offset; + + return 0; +} + +static int vc4_rcl_surface_setup(struct vc4_exec_info *exec, + struct drm_gem_cma_object **obj, + struct drm_vc4_submit_rcl_surface *surf) +{ + uint8_t tiling = VC4_GET_FIELD(surf->bits, + VC4_LOADSTORE_TILE_BUFFER_TILING); + uint8_t buffer = VC4_GET_FIELD(surf->bits, + VC4_LOADSTORE_TILE_BUFFER_BUFFER); + uint8_t format = VC4_GET_FIELD(surf->bits, + VC4_LOADSTORE_TILE_BUFFER_FORMAT); + int cpp; + + if (surf->pad != 0) { + DRM_ERROR("Padding unset\n"); + return -EINVAL; + } + + if (surf->hindex == ~0) + return 0; + + if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj)) + return -EINVAL; + + if (surf->bits & ~(VC4_LOADSTORE_TILE_BUFFER_TILING_MASK | + VC4_LOADSTORE_TILE_BUFFER_BUFFER_MASK | + VC4_LOADSTORE_TILE_BUFFER_FORMAT_MASK)) { + DRM_ERROR("Unknown bits in load/store: 0x%04x\n", + surf->bits); + return -EINVAL; + } + + if (tiling > VC4_TILING_FORMAT_LT) { + DRM_ERROR("Bad tiling format\n"); + return -EINVAL; + } + + if (buffer == VC4_LOADSTORE_TILE_BUFFER_ZS) { + if (format != 0) { + DRM_ERROR("No color format should be set for ZS\n"); + return -EINVAL; + } + cpp = 4; + } else if (buffer == VC4_LOADSTORE_TILE_BUFFER_COLOR) { + switch (format) { + case VC4_LOADSTORE_TILE_BUFFER_BGR565: + case VC4_LOADSTORE_TILE_BUFFER_BGR565_DITHER: + cpp = 2; + break; + case VC4_LOADSTORE_TILE_BUFFER_RGBA8888: + cpp = 4; + break; + default: + DRM_ERROR("Bad tile buffer format\n"); + return -EINVAL; + } + } else { + DRM_ERROR("Bad load/store buffer %d.\n", buffer); + return -EINVAL; + } + + if (surf->offset & 0xf) { + DRM_ERROR("load/store buffer must be 16b aligned.\n"); + return -EINVAL; + } + + if (!vc4_check_tex_size(exec, *obj, surf->offset, tiling, + exec->args->width, exec->args->height, cpp)) { + return -EINVAL; + } + + return 0; +} + +static int +vc4_rcl_ms_surface_setup(struct vc4_exec_info *exec, + struct drm_gem_cma_object **obj, + struct drm_vc4_submit_rcl_surface *surf) +{ + uint8_t tiling = VC4_GET_FIELD(surf->bits, + VC4_RENDER_CONFIG_MEMORY_FORMAT); + uint8_t format = VC4_GET_FIELD(surf->bits, + VC4_RENDER_CONFIG_FORMAT); + int cpp; + + if (surf->pad != 0) { + DRM_ERROR("Padding unset\n"); + return -EINVAL; + } + + if (surf->bits & ~(VC4_RENDER_CONFIG_MEMORY_FORMAT_MASK | + VC4_RENDER_CONFIG_FORMAT_MASK)) { + DRM_ERROR("Unknown bits in render config: 0x%04x\n", + surf->bits); + return -EINVAL; + } + + if (surf->hindex == ~0) + return 0; + + if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj)) + return 
-EINVAL; + + if (tiling > VC4_TILING_FORMAT_LT) { + DRM_ERROR("Bad tiling format\n"); + return -EINVAL; + } + + switch (format) { + case VC4_RENDER_CONFIG_FORMAT_BGR565_DITHERED: + case VC4_RENDER_CONFIG_FORMAT_BGR565: + cpp = 2; + break; + case VC4_RENDER_CONFIG_FORMAT_RGBA8888: + cpp = 4; + break; + default: + DRM_ERROR("Bad tile buffer format\n"); + return -EINVAL; + } + + if (!vc4_check_tex_size(exec, *obj, surf->offset, tiling, + exec->args->width, exec->args->height, cpp)) { + return -EINVAL; + } + + return 0; +} + +int vc4_get_rcl(struct drm_device *dev, struct vc4_exec_info *exec) +{ + struct vc4_rcl_setup setup = {0}; + struct drm_vc4_submit_cl *args = exec->args; + bool has_bin = args->bin_cl_size != 0; + int ret; + + if (args->min_x_tile > args->max_x_tile || + args->min_y_tile > args->max_y_tile) { + DRM_ERROR("Bad render tile set (%d,%d)-(%d,%d)\n", + args->min_x_tile, args->min_y_tile, + args->max_x_tile, args->max_y_tile); + return -EINVAL; + } + + if (has_bin && + (args->max_x_tile > exec->bin_tiles_x || + args->max_y_tile > exec->bin_tiles_y)) { + DRM_ERROR("Render tiles (%d,%d) outside of bin config (%d,%d)\n", + args->max_x_tile, args->max_y_tile, + exec->bin_tiles_x, exec->bin_tiles_y); + return -EINVAL; + } + + ret = vc4_rcl_surface_setup(exec, &setup.color_read, &args->color_read); + if (ret) + return ret; + + ret = vc4_rcl_ms_surface_setup(exec, &setup.color_ms_write, + &args->color_ms_write); + if (ret) + return ret; + + ret = vc4_rcl_surface_setup(exec, &setup.zs_read, &args->zs_read); + if (ret) + return ret; + + ret = vc4_rcl_surface_setup(exec, &setup.zs_write, &args->zs_write); + if (ret) + return ret; + + /* We shouldn't even have the job submitted to us if there's no + * surface to write out. + */ + if (!setup.color_ms_write && !setup.zs_write) { + DRM_ERROR("RCL requires color or Z/S write\n"); + return -EINVAL; + } + + return vc4_create_rcl_bo(dev, exec, &setup); +} diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate.c b/src/gallium/drivers/vc4/kernel/vc4_validate.c index 2d04a4a7b9a..a0b67a7e50b 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_validate.c +++ b/src/gallium/drivers/vc4/kernel/vc4_validate.c @@ -94,7 +94,7 @@ size_is_lt(uint32_t width, uint32_t height, int cpp) height <= 4 * utile_height(cpp)); } -static bool +bool vc4_use_bo(struct vc4_exec_info *exec, uint32_t hindex, enum vc4_bo_mode mode, @@ -147,33 +147,39 @@ gl_shader_rec_size(uint32_t pointer_bits) return 36 + attribute_count * 8; } -static bool -check_tex_size(struct vc4_exec_info *exec, struct drm_gem_cma_object *fbo, - uint32_t offset, uint8_t tiling_format, - uint32_t width, uint32_t height, uint8_t cpp) +bool +vc4_check_tex_size(struct vc4_exec_info *exec, struct drm_gem_cma_object *fbo, + uint32_t offset, uint8_t tiling_format, + uint32_t width, uint32_t height, uint8_t cpp) { uint32_t aligned_width, aligned_height, stride, size; uint32_t utile_w = utile_width(cpp); uint32_t utile_h = utile_height(cpp); - /* The values are limited by the packet/texture parameter bitfields, - * so we don't need to worry as much about integer overflow. + /* The shaded vertex format stores signed 12.4 fixed point + * (-2048,2047) offsets from the viewport center, so we should + * never have a render target larger than 4096. The texture + * unit can only sample from 2048x2048, so it's even more + * restricted. This lets us avoid worrying about overflow in + * our math. 
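A quick check of that claim with concrete, illustrative numbers: a 32bpp utile is 4x4 texels, so even a maximal 4096x4096 surface rounded up to T-format's 8-utile granularity stays far below 2^32 bytes:

#include <assert.h>
#include <stdint.h>

#define ROUND_UP(x, a) (((x) + (a) - 1) / (a) * (a))

int main(void)
{
   uint32_t cpp = 4, utile_w = 4, utile_h = 4;  /* 64-byte utile at 32bpp */
   uint32_t w = ROUND_UP(4096u, utile_w * 8);   /* T-format tile alignment */
   uint32_t h = ROUND_UP(4096u, utile_h * 8);
   uint64_t size = (uint64_t)w * cpp * h;       /* 4096 * 4 * 4096 bytes */
   assert(size == 64u << 20);                   /* 64 MiB, well under 2^32 */
   return 0;
}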
*/ - BUG_ON(width > 65535); - BUG_ON(height > 65535); + if (width > 4096 || height > 4096) { + DRM_ERROR("Surface dimensions (%d,%d) too large\n", width, height); + return false; + } switch (tiling_format) { case VC4_TILING_FORMAT_LINEAR: - aligned_width = roundup(width, utile_w); + aligned_width = round_up(width, utile_w); aligned_height = height; break; case VC4_TILING_FORMAT_T: - aligned_width = roundup(width, utile_w * 8); - aligned_height = roundup(height, utile_h * 8); + aligned_width = round_up(width, utile_w * 8); + aligned_height = round_up(height, utile_h * 8); break; case VC4_TILING_FORMAT_LT: - aligned_width = roundup(width, utile_w); - aligned_height = roundup(height, utile_h); + aligned_width = round_up(width, utile_w); + aligned_height = round_up(height, utile_h); break; default: DRM_ERROR("buffer tiling %d unsupported\n", tiling_format); @@ -181,13 +187,6 @@ check_tex_size(struct vc4_exec_info *exec, struct drm_gem_cma_object *fbo, stride = aligned_width * cpp; - - if (INT_MAX / stride < aligned_height) { - DRM_ERROR("Overflow in fbo size (%dx%d -> %dx%d)\n", - width, height, - aligned_width, aligned_height); - return false; - } size = stride * aligned_height; if (size + offset < size || @@ -249,122 +248,6 @@ validate_increment_semaphore(VALIDATE_ARGS) } static int -validate_wait_on_semaphore(VALIDATE_ARGS) -{ - if (exec->found_wait_on_semaphore_packet) { - DRM_ERROR("Duplicate VC4_PACKET_WAIT_ON_SEMAPHORE\n"); - return -EINVAL; - } - exec->found_wait_on_semaphore_packet = true; - - if (!exec->found_increment_semaphore_packet) { - DRM_ERROR("VC4_PACKET_WAIT_ON_SEMAPHORE without " - "VC4_PACKET_INCREMENT_SEMAPHORE\n"); - return -EINVAL; - } - - return 0; -} - -static int -validate_branch_to_sublist(VALIDATE_ARGS) -{ - struct drm_gem_cma_object *target; - uint32_t offset; - - if (!vc4_use_handle(exec, 0, VC4_MODE_TILE_ALLOC, &target)) - return -EINVAL; - - if (target != exec->tile_alloc_bo) { - DRM_ERROR("Jumping to BOs other than tile alloc unsupported\n"); - return -EINVAL; - } - - if (!exec->found_wait_on_semaphore_packet) { - DRM_ERROR("Jumping to tile alloc before binning finished.\n"); - return -EINVAL; - } - - offset = *(uint32_t *)(untrusted + 0); - if (offset % exec->tile_alloc_init_block_size || - offset / exec->tile_alloc_init_block_size >= - exec->bin_tiles_x * exec->bin_tiles_y) { - DRM_ERROR("VC4_PACKET_BRANCH_TO_SUB_LIST must jump to initial " - "tile allocation space.\n"); - return -EINVAL; - } - - *(uint32_t *)(validated + 0) = target->paddr + offset; - - return 0; -} - -/** - * validate_loadstore_tile_buffer_general() - Validation for - * VC4_PACKET_LOAD_TILE_BUFFER_GENERAL and - * VC4_PACKET_STORE_TILE_BUFFER_GENERAL. - * - * The two packets are nearly the same, except for the TLB-clearing management - * bits not being present for loads. Additionally, while stores are executed - * immediately (using the current tile coordinates), loads are queued to be - * executed when the tile coordinates packet occurs. - * - * Note that coordinates packets are validated to be within the declared - * bin_x/y, which themselves are verified to match the rendering-configuration - * FB width and height (which the hardware uses to clip loads and stores).
- */ -static int -validate_loadstore_tile_buffer_general(VALIDATE_ARGS) -{ - uint32_t packet_b0 = *(uint8_t *)(untrusted + 0); - uint32_t packet_b1 = *(uint8_t *)(untrusted + 1); - struct drm_gem_cma_object *fbo; - uint32_t buffer_type = packet_b0 & 0xf; - uint32_t untrusted_address, offset, cpp; - - switch (buffer_type) { - case VC4_LOADSTORE_TILE_BUFFER_NONE: - return 0; - case VC4_LOADSTORE_TILE_BUFFER_COLOR: - if ((packet_b1 & VC4_LOADSTORE_TILE_BUFFER_MASK) == - VC4_LOADSTORE_TILE_BUFFER_RGBA8888) { - cpp = 4; - } else { - cpp = 2; - } - break; - - case VC4_LOADSTORE_TILE_BUFFER_Z: - case VC4_LOADSTORE_TILE_BUFFER_ZS: - cpp = 4; - break; - - default: - DRM_ERROR("Load/store type %d unsupported\n", buffer_type); - return -EINVAL; - } - - if (!vc4_use_handle(exec, 0, VC4_MODE_RENDER, &fbo)) - return -EINVAL; - - untrusted_address = *(uint32_t *)(untrusted + 2); - offset = untrusted_address & ~0xf; - - if (!check_tex_size(exec, fbo, offset, - ((packet_b0 & - VC4_LOADSTORE_TILE_BUFFER_FORMAT_MASK) >> - VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT), - exec->fb_width, exec->fb_height, cpp)) { - return -EINVAL; - } - - *(uint32_t *)(validated + 2) = (offset + fbo->paddr + - (untrusted_address & 0xf)); - - return 0; -} - -static int validate_indexed_prim_list(VALIDATE_ARGS) { struct drm_gem_cma_object *ib; @@ -492,14 +375,10 @@ validate_nv_shader_state(VALIDATE_ARGS) static int validate_tile_binning_config(VALIDATE_ARGS) { - struct drm_gem_cma_object *tile_allocation; - struct drm_gem_cma_object *tile_state_data_array; + struct drm_device *dev = exec->exec_bo->base.dev; uint8_t flags; - uint32_t tile_allocation_size; - - if (!vc4_use_handle(exec, 0, VC4_MODE_TILE_ALLOC, &tile_allocation) || - !vc4_use_handle(exec, 1, VC4_MODE_TSDA, &tile_state_data_array)) - return -EINVAL; + uint32_t tile_state_size, tile_alloc_size; + uint32_t tile_count; if (exec->found_tile_binning_mode_config_packet) { DRM_ERROR("Duplicate VC4_PACKET_TILE_BINNING_MODE_CONFIG\n"); @@ -509,6 +388,7 @@ validate_tile_binning_config(VALIDATE_ARGS) exec->bin_tiles_x = *(uint8_t *)(untrusted + 12); exec->bin_tiles_y = *(uint8_t *)(untrusted + 13); + tile_count = exec->bin_tiles_x * exec->bin_tiles_y; flags = *(uint8_t *)(untrusted + 14); if (exec->bin_tiles_x == 0 || @@ -518,15 +398,6 @@ validate_tile_binning_config(VALIDATE_ARGS) return -EINVAL; } - /* Our validation relies on the user not getting to set up their own - * tile state/tile allocation BO contents. 
- */ - if (!(flags & VC4_BIN_CONFIG_AUTO_INIT_TSDA)) { - DRM_ERROR("binning config missing " - "VC4_BIN_CONFIG_AUTO_INIT_TSDA\n"); - return -EINVAL; - } - if (flags & (VC4_BIN_CONFIG_DB_NON_MS | VC4_BIN_CONFIG_TILE_BUFFER_64BIT | VC4_BIN_CONFIG_MS_MODE_4X)) { @@ -534,94 +405,52 @@ validate_tile_binning_config(VALIDATE_ARGS) return -EINVAL; } - if (*(uint32_t *)(untrusted + 0) != 0) { - DRM_ERROR("tile allocation offset != 0 unsupported\n"); - return -EINVAL; - } - tile_allocation_size = *(uint32_t *)(untrusted + 4); - if (tile_allocation_size > tile_allocation->base.size) { - DRM_ERROR("tile allocation size %d > BO size %d\n", - tile_allocation_size, tile_allocation->base.size); - return -EINVAL; - } - *(uint32_t *)validated = tile_allocation->paddr; - exec->tile_alloc_bo = tile_allocation; - - exec->tile_alloc_init_block_size = 1 << (5 + ((flags >> 5) & 3)); - if (exec->bin_tiles_x * exec->bin_tiles_y * - exec->tile_alloc_init_block_size > tile_allocation_size) { - DRM_ERROR("tile init exceeds tile alloc size (%d vs %d)\n", - exec->bin_tiles_x * exec->bin_tiles_y * - exec->tile_alloc_init_block_size, - tile_allocation_size); - return -EINVAL; - } - if (*(uint32_t *)(untrusted + 8) != 0) { - DRM_ERROR("TSDA offset != 0 unsupported\n"); - return -EINVAL; - } - if (exec->bin_tiles_x * exec->bin_tiles_y * 48 > - tile_state_data_array->base.size) { - DRM_ERROR("TSDA of %db too small for %dx%d bin config\n", - tile_state_data_array->base.size, - exec->bin_tiles_x, exec->bin_tiles_y); - } - *(uint32_t *)(validated + 8) = tile_state_data_array->paddr; - - return 0; -} - -static int -validate_tile_rendering_mode_config(VALIDATE_ARGS) -{ - struct drm_gem_cma_object *fbo; - uint32_t flags, offset, cpp; - - if (exec->found_tile_rendering_mode_config_packet) { - DRM_ERROR("Duplicate VC4_PACKET_TILE_RENDERING_MODE_CONFIG\n"); - return -EINVAL; - } - exec->found_tile_rendering_mode_config_packet = true; - - if (!vc4_use_handle(exec, 0, VC4_MODE_RENDER, &fbo)) - return -EINVAL; - - exec->fb_width = *(uint16_t *)(untrusted + 4); - exec->fb_height = *(uint16_t *)(untrusted + 6); - - flags = *(uint16_t *)(untrusted + 8); - if ((flags & VC4_RENDER_CONFIG_FORMAT_MASK) == - VC4_RENDER_CONFIG_FORMAT_RGBA8888) { - cpp = 4; - } else { - cpp = 2; - } - - offset = *(uint32_t *)untrusted; - if (!check_tex_size(exec, fbo, offset, - ((flags & - VC4_RENDER_CONFIG_MEMORY_FORMAT_MASK) >> - VC4_RENDER_CONFIG_MEMORY_FORMAT_SHIFT), - exec->fb_width, exec->fb_height, cpp)) { - return -EINVAL; - } - - *(uint32_t *)validated = fbo->paddr + offset; - - return 0; -} - -static int -validate_tile_coordinates(VALIDATE_ARGS) -{ - uint8_t tile_x = *(uint8_t *)(untrusted + 0); - uint8_t tile_y = *(uint8_t *)(untrusted + 1); + /* The tile state data array is 48 bytes per tile, and we put it at + * the start of a BO containing both it and the tile alloc. + */ + tile_state_size = 48 * tile_count; + + /* Since the tile alloc array will follow us, align. */ + exec->tile_alloc_offset = roundup(tile_state_size, 4096); + + *(uint8_t *)(validated + 14) = + ((flags & ~(VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_MASK | + VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_MASK)) | + VC4_BIN_CONFIG_AUTO_INIT_TSDA | + VC4_SET_FIELD(VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_32, + VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE) | + VC4_SET_FIELD(VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_128, + VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE)); + + /* Initial block size. 
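+	 *
+	 * Worked example (illustrative, not from the patch): a 1920x1080
+	 * framebuffer bins as 30x17 = 510 tiles, so tile_state_size is
+	 * 48 * 510 = 24480 bytes, tile_alloc_offset rounds up to 24576,
+	 * and the initial allocation below is 32 * 510 = 16320 bytes,
+	 * rounded up to 16384 by the hardware's 256-byte granularity.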
*/ + tile_alloc_size = 32 * tile_count; + + /* + * The initial allocation gets rounded to the next 256 bytes before + * the hardware starts fulfilling further allocations. + */ + tile_alloc_size = roundup(tile_alloc_size, 256); - if (tile_x * 64 >= exec->fb_width || tile_y * 64 >= exec->fb_height) { - DRM_ERROR("Tile coordinates %d,%d > render config %dx%d\n", - tile_x, tile_y, exec->fb_width, exec->fb_height); - return -EINVAL; - } + /* Add space for the extra allocations. This is what gets used first, + * before overflow memory. It must have at least 4096 bytes, but we + * want to avoid overflow memory usage if possible. + */ + tile_alloc_size += 1024 * 1024; + + exec->tile_bo = drm_gem_cma_create(dev, exec->tile_alloc_offset + + tile_alloc_size); + if (!exec->tile_bo) + return -ENOMEM; + list_addtail(&to_vc4_bo(&exec->tile_bo->base)->unref_head, + &exec->unref_list); + + /* tile alloc address. */ + *(uint32_t *)(validated + 0) = (exec->tile_bo->paddr + + exec->tile_alloc_offset); + /* tile alloc size. */ + *(uint32_t *)(validated + 4) = tile_alloc_size; + /* tile state address. */ + *(uint32_t *)(validated + 8) = exec->tile_bo->paddr; return 0; } @@ -633,78 +462,60 @@ validate_gem_handles(VALIDATE_ARGS) return 0; } +#define VC4_DEFINE_PACKET(packet, name, func) \ + [packet] = { packet ## _SIZE, name, func } + static const struct cmd_info { - bool bin; - bool render; uint16_t len; const char *name; int (*func)(struct vc4_exec_info *exec, void *validated, void *untrusted); } cmd_info[] = { - [VC4_PACKET_HALT] = { 1, 1, 1, "halt", NULL }, - [VC4_PACKET_NOP] = { 1, 1, 1, "nop", NULL }, - [VC4_PACKET_FLUSH] = { 1, 1, 1, "flush", NULL }, - [VC4_PACKET_FLUSH_ALL] = { 1, 0, 1, "flush all state", validate_flush_all }, - [VC4_PACKET_START_TILE_BINNING] = { 1, 0, 1, "start tile binning", validate_start_tile_binning }, - [VC4_PACKET_INCREMENT_SEMAPHORE] = { 1, 0, 1, "increment semaphore", validate_increment_semaphore }, - [VC4_PACKET_WAIT_ON_SEMAPHORE] = { 0, 1, 1, "wait on semaphore", validate_wait_on_semaphore }, - /* BRANCH_TO_SUB_LIST is actually supported in the binner as well, but - * we only use it from the render CL in order to jump into the tile - * allocation BO. 
- */ - [VC4_PACKET_BRANCH_TO_SUB_LIST] = { 0, 1, 5, "branch to sublist", validate_branch_to_sublist }, - [VC4_PACKET_STORE_MS_TILE_BUFFER] = { 0, 1, 1, "store MS resolved tile color buffer", NULL }, - [VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF] = { 0, 1, 1, "store MS resolved tile color buffer and EOF", NULL }, + VC4_DEFINE_PACKET(VC4_PACKET_HALT, "halt", NULL), + VC4_DEFINE_PACKET(VC4_PACKET_NOP, "nop", NULL), + VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, "flush", NULL), + VC4_DEFINE_PACKET(VC4_PACKET_FLUSH_ALL, "flush all state", validate_flush_all), + VC4_DEFINE_PACKET(VC4_PACKET_START_TILE_BINNING, "start tile binning", validate_start_tile_binning), + VC4_DEFINE_PACKET(VC4_PACKET_INCREMENT_SEMAPHORE, "increment semaphore", validate_increment_semaphore), - [VC4_PACKET_STORE_TILE_BUFFER_GENERAL] = { 0, 1, 7, "Store Tile Buffer General", validate_loadstore_tile_buffer_general }, - [VC4_PACKET_LOAD_TILE_BUFFER_GENERAL] = { 0, 1, 7, "Load Tile Buffer General", validate_loadstore_tile_buffer_general }, + VC4_DEFINE_PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE, "Indexed Primitive List", validate_indexed_prim_list), - [VC4_PACKET_GL_INDEXED_PRIMITIVE] = { 1, 1, 14, "Indexed Primitive List", validate_indexed_prim_list }, - - [VC4_PACKET_GL_ARRAY_PRIMITIVE] = { 1, 1, 10, "Vertex Array Primitives", validate_gl_array_primitive }, + VC4_DEFINE_PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE, "Vertex Array Primitives", validate_gl_array_primitive), /* This is only used by clipped primitives (packets 48 and 49), which * we don't support parsing yet. */ - [VC4_PACKET_PRIMITIVE_LIST_FORMAT] = { 1, 1, 2, "primitive list format", NULL }, - - [VC4_PACKET_GL_SHADER_STATE] = { 1, 1, 5, "GL Shader State", validate_gl_shader_state }, - [VC4_PACKET_NV_SHADER_STATE] = { 1, 1, 5, "NV Shader State", validate_nv_shader_state }, - - [VC4_PACKET_CONFIGURATION_BITS] = { 1, 1, 4, "configuration bits", NULL }, - [VC4_PACKET_FLAT_SHADE_FLAGS] = { 1, 1, 5, "flat shade flags", NULL }, - [VC4_PACKET_POINT_SIZE] = { 1, 1, 5, "point size", NULL }, - [VC4_PACKET_LINE_WIDTH] = { 1, 1, 5, "line width", NULL }, - [VC4_PACKET_RHT_X_BOUNDARY] = { 1, 1, 3, "RHT X boundary", NULL }, - [VC4_PACKET_DEPTH_OFFSET] = { 1, 1, 5, "Depth Offset", NULL }, - [VC4_PACKET_CLIP_WINDOW] = { 1, 1, 9, "Clip Window", NULL }, - [VC4_PACKET_VIEWPORT_OFFSET] = { 1, 1, 5, "Viewport Offset", NULL }, - [VC4_PACKET_CLIPPER_XY_SCALING] = { 1, 1, 9, "Clipper XY Scaling", NULL }, + VC4_DEFINE_PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT, "primitive list format", NULL), + + VC4_DEFINE_PACKET(VC4_PACKET_GL_SHADER_STATE, "GL Shader State", validate_gl_shader_state), + VC4_DEFINE_PACKET(VC4_PACKET_NV_SHADER_STATE, "NV Shader State", validate_nv_shader_state), + + VC4_DEFINE_PACKET(VC4_PACKET_CONFIGURATION_BITS, "configuration bits", NULL), + VC4_DEFINE_PACKET(VC4_PACKET_FLAT_SHADE_FLAGS, "flat shade flags", NULL), + VC4_DEFINE_PACKET(VC4_PACKET_POINT_SIZE, "point size", NULL), + VC4_DEFINE_PACKET(VC4_PACKET_LINE_WIDTH, "line width", NULL), + VC4_DEFINE_PACKET(VC4_PACKET_RHT_X_BOUNDARY, "RHT X boundary", NULL), + VC4_DEFINE_PACKET(VC4_PACKET_DEPTH_OFFSET, "Depth Offset", NULL), + VC4_DEFINE_PACKET(VC4_PACKET_CLIP_WINDOW, "Clip Window", NULL), + VC4_DEFINE_PACKET(VC4_PACKET_VIEWPORT_OFFSET, "Viewport Offset", NULL), + VC4_DEFINE_PACKET(VC4_PACKET_CLIPPER_XY_SCALING, "Clipper XY Scaling", NULL), /* Note: The docs say this was also 105, but it was 106 in the * initial userland code drop. 
*/ - [VC4_PACKET_CLIPPER_Z_SCALING] = { 1, 1, 9, "Clipper Z Scale and Offset", NULL }, - - [VC4_PACKET_TILE_BINNING_MODE_CONFIG] = { 1, 0, 16, "tile binning configuration", validate_tile_binning_config }, + VC4_DEFINE_PACKET(VC4_PACKET_CLIPPER_Z_SCALING, "Clipper Z Scale and Offset", NULL), - [VC4_PACKET_TILE_RENDERING_MODE_CONFIG] = { 0, 1, 11, "tile rendering mode configuration", validate_tile_rendering_mode_config}, + VC4_DEFINE_PACKET(VC4_PACKET_TILE_BINNING_MODE_CONFIG, "tile binning configuration", validate_tile_binning_config), - [VC4_PACKET_CLEAR_COLORS] = { 0, 1, 14, "Clear Colors", NULL }, - - [VC4_PACKET_TILE_COORDINATES] = { 0, 1, 3, "Tile Coordinates", validate_tile_coordinates }, - - [VC4_PACKET_GEM_HANDLES] = { 1, 1, 9, "GEM handles", validate_gem_handles }, + VC4_DEFINE_PACKET(VC4_PACKET_GEM_HANDLES, "GEM handles", validate_gem_handles), }; int -vc4_validate_cl(struct drm_device *dev, - void *validated, - void *unvalidated, - uint32_t len, - bool is_bin, - bool has_bin, - struct vc4_exec_info *exec) +vc4_validate_bin_cl(struct drm_device *dev, + void *validated, + void *unvalidated, + struct vc4_exec_info *exec) { + uint32_t len = exec->args->bin_cl_size; uint32_t dst_offset = 0; uint32_t src_offset = 0; @@ -732,14 +543,6 @@ vc4_validate_cl(struct drm_device *dev, src_offset, cmd, info->name, info->len); #endif - if ((is_bin && !info->bin) || - (!is_bin && !info->render)) { - DRM_ERROR("0x%08x: packet %d (%s) invalid for %s\n", - src_offset, cmd, info->name, - is_bin ? "binner" : "render"); - return -EINVAL; - } - if (src_offset + info->len > len) { DRM_ERROR("0x%08x: packet %d (%s) length 0x%08x " "exceeds bounds (0x%08x)\n", @@ -770,30 +573,16 @@ vc4_validate_cl(struct drm_device *dev, break; } - if (is_bin) { - exec->ct0ea = exec->ct0ca + dst_offset; + exec->ct0ea = exec->ct0ca + dst_offset; - if (has_bin && !exec->found_start_tile_binning_packet) { - DRM_ERROR("Bin CL missing VC4_PACKET_START_TILE_BINNING\n"); - return -EINVAL; - } - } else { - if (!exec->found_tile_rendering_mode_config_packet) { - DRM_ERROR("Render CL missing VC4_PACKET_TILE_RENDERING_MODE_CONFIG\n"); - return -EINVAL; - } + if (!exec->found_start_tile_binning_packet) { + DRM_ERROR("Bin CL missing VC4_PACKET_START_TILE_BINNING\n"); + return -EINVAL; + } - /* Make sure that they actually consumed the semaphore - * increment from the bin CL. Otherwise a later submit would - * have render execute immediately. - */ - if (exec->found_wait_on_semaphore_packet != has_bin) { - DRM_ERROR("Render CL %s VC4_PACKET_WAIT_ON_SEMAPHORE\n", - exec->found_wait_on_semaphore_packet ? - "has" : "missing"); - return -EINVAL; - } - exec->ct1ea = exec->ct1ca + dst_offset; + if (!exec->found_increment_semaphore_packet) { + DRM_ERROR("Bin CL missing VC4_PACKET_INCREMENT_SEMAPHORE\n"); + return -EINVAL; } return 0; @@ -814,10 +603,10 @@ reloc_tex(struct vc4_exec_info *exec, uint32_t p3 = (sample->p_offset[3] != ~0 ? 
*(uint32_t *)(uniform_data_u + sample->p_offset[3]) : 0); uint32_t *validated_p0 = exec->uniforms_v + sample->p_offset[0]; - uint32_t offset = p0 & ~0xfff; - uint32_t miplevels = (p0 & 15); - uint32_t width = (p1 >> 8) & 2047; - uint32_t height = (p1 >> 20) & 2047; + uint32_t offset = p0 & VC4_TEX_P0_OFFSET_MASK; + uint32_t miplevels = VC4_GET_FIELD(p0, VC4_TEX_P0_MIPLVLS); + uint32_t width = VC4_GET_FIELD(p1, VC4_TEX_P1_WIDTH); + uint32_t height = VC4_GET_FIELD(p1, VC4_TEX_P1_HEIGHT); uint32_t cpp, tiling_format, utile_w, utile_h; uint32_t i; uint32_t cube_map_stride = 0; @@ -845,16 +634,18 @@ reloc_tex(struct vc4_exec_info *exec, if (height == 0) height = 2048; - if (p0 & (1 << 9)) { - if ((p2 & (3 << 30)) == (1 << 30)) - cube_map_stride = p2 & 0x3ffff000; - if ((p3 & (3 << 30)) == (1 << 30)) { + if (p0 & VC4_TEX_P0_CMMODE_MASK) { + if (VC4_GET_FIELD(p2, VC4_TEX_P2_PTYPE) == + VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE) + cube_map_stride = p2 & VC4_TEX_P2_CMST_MASK; + if (VC4_GET_FIELD(p3, VC4_TEX_P2_PTYPE) == + VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE) { if (cube_map_stride) { DRM_ERROR("Cube map stride set twice\n"); return false; } - cube_map_stride = p3 & 0x3ffff000; + cube_map_stride = p3 & VC4_TEX_P2_CMST_MASK; } if (!cube_map_stride) { DRM_ERROR("Cube map stride not set\n"); @@ -862,7 +653,8 @@ reloc_tex(struct vc4_exec_info *exec, } } - type = ((p0 >> 4) & 15) | ((p1 >> 31) << 4); + type = (VC4_GET_FIELD(p0, VC4_TEX_P0_TYPE) | + (VC4_GET_FIELD(p1, VC4_TEX_P1_TYPE4) << 4)); switch (type) { case VC4_TEXTURE_TYPE_RGBA8888: @@ -905,8 +697,8 @@ reloc_tex(struct vc4_exec_info *exec, tiling_format = VC4_TILING_FORMAT_T; } - if (!check_tex_size(exec, tex, offset + cube_map_stride * 5, - tiling_format, width, height, cpp)) { + if (!vc4_check_tex_size(exec, tex, offset + cube_map_stride * 5, + tiling_format, width, height, cpp)) { return false; } @@ -927,15 +719,15 @@ reloc_tex(struct vc4_exec_info *exec, switch (tiling_format) { case VC4_TILING_FORMAT_T: - aligned_width = roundup(level_width, utile_w * 8); - aligned_height = roundup(level_height, utile_h * 8); + aligned_width = round_up(level_width, utile_w * 8); + aligned_height = round_up(level_height, utile_h * 8); break; case VC4_TILING_FORMAT_LT: - aligned_width = roundup(level_width, utile_w); - aligned_height = roundup(level_height, utile_h); + aligned_width = round_up(level_width, utile_w); + aligned_height = round_up(level_height, utile_h); break; default: - aligned_width = roundup(level_width, utile_w); + aligned_width = round_up(level_width, utile_w); aligned_height = level_height; break; } diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c b/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c index e5a75c5f8c2..ab9a6512e82 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c +++ b/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c @@ -58,7 +58,8 @@ struct vc4_shader_validation_state { * * This is used for the validation of direct address memory reads. 
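	 *
	 * (Index layout, for reference: entries 0..31 track regfile A,
	 * 32..63 regfile B, and 64..67 the accumulators r0-r3 -- the same
	 * mapping raddr_add_a_to_live_reg_index() below produces.)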
*/
-	uint32_t live_clamp_offsets[32 + 32 + 4];
+	uint32_t live_min_clamp_offsets[32 + 32 + 4];
+	bool live_max_clamp_regs[32 + 32 + 4];
 };
 
 static uint32_t
@@ -77,6 +78,25 @@ waddr_to_live_reg_index(uint32_t waddr, bool is_b)
 	}
 }
 
+static uint32_t
+raddr_add_a_to_live_reg_index(uint64_t inst)
+{
+	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+	uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
+	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
+	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
+
+	if (add_a == QPU_MUX_A) {
+		return raddr_a;
+	} else if (add_a == QPU_MUX_B && sig != QPU_SIG_SMALL_IMM) {
+		return 32 + raddr_b;
+	} else if (add_a <= QPU_MUX_R3) {
+		return 64 + add_a;
+	} else {
+		return ~0;
+	}
+}
+
 static bool
 is_tmu_submit(uint32_t waddr)
 {
@@ -136,9 +156,8 @@ check_tmu_write(uint64_t inst,
 	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
 
 	if (is_direct) {
-		uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
 		uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
-		uint32_t clamp_offset = ~0;
+		uint32_t clamp_reg, clamp_offset;
 
 		if (sig == QPU_SIG_SMALL_IMM) {
 			DRM_ERROR("direct TMU read used small immediate\n");
@@ -159,14 +178,13 @@ check_tmu_write(uint64_t inst,
 		 * This is arbitrary, but simpler than supporting flipping the
 		 * two either way.
 		 */
-		if (add_a == QPU_MUX_A) {
-			clamp_offset = validation_state->live_clamp_offsets[raddr_a];
-		} else if (add_a == QPU_MUX_B) {
-			clamp_offset = validation_state->live_clamp_offsets[32 + raddr_b];
-		} else if (add_a <= QPU_MUX_R4) {
-			clamp_offset = validation_state->live_clamp_offsets[64 + add_a];
+		clamp_reg = raddr_add_a_to_live_reg_index(inst);
+		if (clamp_reg == ~0) {
+			DRM_ERROR("direct TMU load wasn't clamped\n");
+			return false;
 		}
 
+		clamp_offset = validation_state->live_min_clamp_offsets[clamp_reg];
 		if (clamp_offset == ~0) {
 			DRM_ERROR("direct TMU load wasn't clamped\n");
 			return false;
@@ -229,8 +247,6 @@ check_register_write(uint64_t inst,
 	uint32_t waddr = (is_mul ?
 			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
 			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
-	bool is_b = is_mul != ((inst & QPU_WS) != 0);
-	uint32_t live_reg_index;
 
 	switch (waddr) {
 	case QPU_W_UNIFORMS_ADDRESS:
@@ -285,14 +301,6 @@ check_register_write(uint64_t inst,
 		return true;
 	}
 
-	/* Clear out the live offset clamp tracking for the written register.
-	 * If this particular instruction is setting up an offset clamp, it'll
-	 * get tracked immediately after we return.
-	 */
-	live_reg_index = waddr_to_live_reg_index(waddr, is_b);
-	if (live_reg_index != ~0)
-		validation_state->live_clamp_offsets[live_reg_index] = ~0;
-
 	return true;
}
 
@@ -301,26 +309,72 @@ track_live_clamps(uint64_t inst,
 		  struct vc4_validated_shader_info *validated_shader,
 		  struct vc4_shader_validation_state *validation_state)
 {
+	uint32_t op_add = QPU_GET_FIELD(inst, QPU_OP_ADD);
 	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
+	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
+	uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
+	uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
 	uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
 	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
 	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
 	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
-	bool is_b = inst & QPU_WS;
-	uint32_t live_reg_index;
+	bool ws = inst & QPU_WS;
+	uint32_t lri_add_a, lri_add, lri_mul;
+	bool add_a_is_min_0;
 
-	if (QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_MIN)
+	/* Check whether OP_ADD's A argument comes from a live MAX(x, 0),
+	 * before we clear previous live state.
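+	 *
+	 * The instruction pattern being matched looks like this (sketch,
+	 * register choices arbitrary):
+	 *
+	 *   MAX  ra1, x, 0        ; live_max_clamp_regs[ra1] set
+	 *   MIN  ra2, ra1, unif   ; live_min_clamp_offsets[ra2] recorded
+	 *   ADD  t0s, ra2, base   ; direct TMU read, clamp verified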
+	 */
+	lri_add_a = raddr_add_a_to_live_reg_index(inst);
+	add_a_is_min_0 = (lri_add_a != ~0 &&
+			  validation_state->live_max_clamp_regs[lri_add_a]);
+
+	/* Clear live state for registers written by our instruction. */
+	lri_add = waddr_to_live_reg_index(waddr_add, ws);
+	lri_mul = waddr_to_live_reg_index(waddr_mul, !ws);
+	if (lri_mul != ~0) {
+		validation_state->live_max_clamp_regs[lri_mul] = false;
+		validation_state->live_min_clamp_offsets[lri_mul] = ~0;
+	}
+	if (lri_add != ~0) {
+		validation_state->live_max_clamp_regs[lri_add] = false;
+		validation_state->live_min_clamp_offsets[lri_add] = ~0;
+	} else {
+		/* Nothing further to do for live tracking, since only ADDs
+		 * generate new live clamp registers.
+		 */
 		return;
+	}
+
+	/* Now, handle remaining live clamp tracking for the ADD operation. */
 
-	if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
-	    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF &&
-	      sig != QPU_SIG_SMALL_IMM)) {
+	if (cond_add != QPU_COND_ALWAYS)
 		return;
-	}
 
-	live_reg_index = waddr_to_live_reg_index(waddr_add, is_b);
-	if (live_reg_index != ~0) {
-		validation_state->live_clamp_offsets[live_reg_index] =
+	if (op_add == QPU_A_MAX) {
+		/* Track live clamps of a value to a minimum of 0 (in either
+		 * arg).
+		 */
+		if (sig != QPU_SIG_SMALL_IMM || raddr_b != 0 ||
+		    (add_a != QPU_MUX_B && add_b != QPU_MUX_B)) {
+			return;
+		}
+
+		validation_state->live_max_clamp_regs[lri_add] = true;
+	} else if (op_add == QPU_A_MIN) {
+		/* Track live clamps of a value clamped to a minimum of 0 and
+		 * a maximum of some uniform's offset.
+		 */
+		if (!add_a_is_min_0)
+			return;
+
+		if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
+		    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF &&
+		      sig != QPU_SIG_SMALL_IMM)) {
+			return;
+		}
+
+		validation_state->live_min_clamp_offsets[lri_add] =
 			validated_shader->uniforms_size;
 	}
 }
@@ -382,8 +436,8 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 	for (i = 0; i < 8; i++)
 		validation_state.tmu_setup[i / 4].p_offset[i % 4] = ~0;
 
-	for (i = 0; i < ARRAY_SIZE(validation_state.live_clamp_offsets); i++)
-		validation_state.live_clamp_offsets[i] = ~0;
+	for (i = 0; i < ARRAY_SIZE(validation_state.live_min_clamp_offsets); i++)
+		validation_state.live_min_clamp_offsets[i] = ~0;
 
 	shader = shader_obj->vaddr;
 	max_ip = shader_obj->base.size / sizeof(uint64_t);
diff --git a/src/gallium/drivers/vc4/vc4_blit.c b/src/gallium/drivers/vc4/vc4_blit.c
index 2d524c40b4d..d29e2c9c318 100644
--- a/src/gallium/drivers/vc4/vc4_blit.c
+++ b/src/gallium/drivers/vc4/vc4_blit.c
@@ -26,86 +26,7 @@
 #include "util/u_blitter.h"
 #include "vc4_context.h"
 
-static void
-vc4_tile_blit_color_rcl(struct vc4_context *vc4,
-                        struct vc4_surface *dst_surf,
-                        struct vc4_surface *src_surf)
-{
-	struct vc4_resource *src = vc4_resource(src_surf->base.texture);
-	struct vc4_resource *dst = vc4_resource(dst_surf->base.texture);
-
-	uint32_t min_x_tile = 0;
-	uint32_t min_y_tile = 0;
-	uint32_t max_x_tile = (dst_surf->base.width - 1) / 64;
-	uint32_t max_y_tile = (dst_surf->base.height - 1) / 64;
-	uint32_t xtiles = max_x_tile - min_x_tile + 1;
-	uint32_t ytiles = max_y_tile - min_y_tile + 1;
-	uint32_t reloc_size = 9;
-	uint32_t config_size = 11 + reloc_size;
-	uint32_t loadstore_size = 7 + reloc_size;
-	uint32_t tilecoords_size = 3;
-	cl_ensure_space(&vc4->rcl,
-			config_size +
-			xtiles * ytiles * (loadstore_size * 2 +
-					   tilecoords_size * 1));
-	cl_ensure_space(&vc4->bo_handles, 2 * sizeof(uint32_t));
-	cl_ensure_space(&vc4->bo_pointers, 2 * sizeof(struct vc4_bo *));
-
-	cl_start_reloc(&vc4->rcl, 1);
-
cl_u8(&vc4->rcl, VC4_PACKET_TILE_RENDERING_MODE_CONFIG); - cl_reloc(vc4, &vc4->rcl, dst->bo, dst_surf->offset); - cl_u16(&vc4->rcl, dst_surf->base.width); - cl_u16(&vc4->rcl, dst_surf->base.height); - cl_u16(&vc4->rcl, ((dst_surf->tiling << - VC4_RENDER_CONFIG_MEMORY_FORMAT_SHIFT) | - (vc4_rt_format_is_565(dst_surf->base.format) ? - VC4_RENDER_CONFIG_FORMAT_BGR565 : - VC4_RENDER_CONFIG_FORMAT_RGBA8888))); - - uint32_t src_hindex = vc4_gem_hindex(vc4, src->bo); - - for (int y = min_y_tile; y <= max_y_tile; y++) { - for (int x = min_x_tile; x <= max_x_tile; x++) { - bool end_of_frame = (x == max_x_tile && - y == max_y_tile); - - cl_start_reloc(&vc4->rcl, 1); - cl_u8(&vc4->rcl, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL); - cl_u8(&vc4->rcl, - VC4_LOADSTORE_TILE_BUFFER_COLOR | - (src_surf->tiling << - VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT)); - cl_u8(&vc4->rcl, - vc4_rt_format_is_565(src_surf->base.format) ? - VC4_LOADSTORE_TILE_BUFFER_BGR565 : - VC4_LOADSTORE_TILE_BUFFER_RGBA8888); - cl_reloc_hindex(&vc4->rcl, src_hindex, - src_surf->offset); - - cl_u8(&vc4->rcl, VC4_PACKET_TILE_COORDINATES); - cl_u8(&vc4->rcl, x); - cl_u8(&vc4->rcl, y); - - if (end_of_frame) { - cl_u8(&vc4->rcl, - VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF); - } else { - cl_u8(&vc4->rcl, - VC4_PACKET_STORE_MS_TILE_BUFFER); - } - } - } - - vc4->draw_min_x = 0; - vc4->draw_min_y = 0; - vc4->draw_max_x = dst_surf->base.width; - vc4->draw_max_y = dst_surf->base.height; - - dst->writes++; - vc4->needs_flush = true; -} - -static struct vc4_surface * +static struct pipe_surface * vc4_get_blit_surface(struct pipe_context *pctx, struct pipe_resource *prsc, unsigned level) { @@ -117,7 +38,7 @@ vc4_get_blit_surface(struct pipe_context *pctx, tmpl.u.tex.first_layer = 0; tmpl.u.tex.last_layer = 0; - return vc4_surface(pctx->create_surface(pctx, prsc, &tmpl)); + return pctx->create_surface(pctx, prsc, &tmpl); } static bool @@ -141,17 +62,28 @@ vc4_tile_blit(struct pipe_context *pctx, const struct pipe_blit_info *info) if (info->dst.resource->format != info->src.resource->format) return false; - struct vc4_surface *dst_surf = + vc4_flush(pctx); + + struct pipe_surface *dst_surf = vc4_get_blit_surface(pctx, info->dst.resource, info->dst.level); - struct vc4_surface *src_surf = + struct pipe_surface *src_surf = vc4_get_blit_surface(pctx, info->src.resource, info->src.level); - vc4_flush(pctx); - vc4_tile_blit_color_rcl(vc4, dst_surf, src_surf); + pipe_surface_reference(&vc4->color_read, src_surf); + pipe_surface_reference(&vc4->color_write, dst_surf); + pipe_surface_reference(&vc4->zs_read, NULL); + pipe_surface_reference(&vc4->zs_write, NULL); + vc4->draw_min_x = 0; + vc4->draw_min_y = 0; + vc4->draw_max_x = dst_surf->width; + vc4->draw_max_y = dst_surf->height; + vc4->draw_width = dst_surf->width; + vc4->draw_height = dst_surf->height; + vc4->needs_flush = true; vc4_job_submit(vc4); - pctx->surface_destroy(pctx, &dst_surf->base); - pctx->surface_destroy(pctx, &src_surf->base); + pipe_surface_reference(&dst_surf, NULL); + pipe_surface_reference(&src_surf, NULL); return true; } diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c index 4bb2c711e16..cbdb9e89cf6 100644 --- a/src/gallium/drivers/vc4/vc4_bufmgr.c +++ b/src/gallium/drivers/vc4/vc4_bufmgr.c @@ -34,8 +34,46 @@ #include "vc4_context.h" #include "vc4_screen.h" -#define container_of(ptr, type, field) \ - (type*)((char*)ptr - offsetof(type, field)) +static bool dump_stats = false; + +static void +vc4_bo_dump_stats(struct vc4_screen *screen) +{ + struct 
vc4_bo_cache *cache = &screen->bo_cache;
+
+	fprintf(stderr, "  BOs allocated: %d\n", screen->bo_count);
+	fprintf(stderr, "  BOs size: %dkb\n", screen->bo_size / 1024);
+	fprintf(stderr, "  BOs cached: %d\n", cache->bo_count);
+	fprintf(stderr, "  BOs cached size: %dkb\n", cache->bo_size / 1024);
+
+	if (!list_empty(&cache->time_list)) {
+		struct vc4_bo *first = LIST_ENTRY(struct vc4_bo,
+						  cache->time_list.next,
+						  time_list);
+		struct vc4_bo *last = LIST_ENTRY(struct vc4_bo,
+						 cache->time_list.prev,
+						 time_list);
+
+		fprintf(stderr, "  oldest cache time: %ld\n",
+			(long)first->free_time);
+		fprintf(stderr, "  newest cache time: %ld\n",
+			(long)last->free_time);
+
+		struct timespec time;
+		clock_gettime(CLOCK_MONOTONIC, &time);
+		fprintf(stderr, "  now: %ld\n",
+			(long)time.tv_sec);
+	}
+}
+
+static void
+vc4_bo_remove_from_cache(struct vc4_bo_cache *cache, struct vc4_bo *bo)
+{
+	list_del(&bo->time_list);
+	list_del(&bo->size_list);
+	cache->bo_count--;
+	cache->bo_size -= bo->size;
+}
 
 static struct vc4_bo *
 vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name)
@@ -48,12 +86,21 @@ vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name)
 	struct vc4_bo *bo = NULL;
 
 	pipe_mutex_lock(cache->lock);
-	if (!is_empty_list(&cache->size_list[page_index])) {
-		struct simple_node *node = last_elem(&cache->size_list[page_index]);
-		bo = container_of(node, struct vc4_bo, size_list);
+	if (!list_empty(&cache->size_list[page_index])) {
+		bo = LIST_ENTRY(struct vc4_bo, cache->size_list[page_index].next,
+				size_list);
+
+		/* Check that the BO has gone idle.  If not, then we want to
+		 * allocate something new instead, since we assume that the
+		 * user will proceed to CPU map it and fill it with stuff.
+		 */
+		if (!vc4_bo_wait(bo, 0)) {
+			pipe_mutex_unlock(cache->lock);
+			return NULL;
+		}
+
 		pipe_reference_init(&bo->reference, 1);
-		remove_from_list(&bo->time_list);
-		remove_from_list(&bo->size_list);
+		vc4_bo_remove_from_cache(cache, bo);
 
 		bo->name = name;
 	}
@@ -70,8 +117,14 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name)
 	size = align(size, 4096);
 
 	bo = vc4_bo_from_cache(screen, size, name);
-	if (bo)
+	if (bo) {
+		if (dump_stats) {
+			fprintf(stderr, "Allocated %s %dkb from cache:\n",
+				name, size / 1024);
+			vc4_bo_dump_stats(screen);
+		}
 		return bo;
+	}
 
 	bo = CALLOC_STRUCT(vc4_bo);
 	if (!bo)
@@ -108,6 +161,13 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name)
 		abort();
 	}
 
+	screen->bo_count++;
+	screen->bo_size += bo->size;
+	if (dump_stats) {
+		fprintf(stderr, "Allocated %s %dkb:\n", name, size / 1024);
+		vc4_bo_dump_stats(screen);
+	}
+
 	return bo;
 }
 
@@ -145,26 +205,47 @@ vc4_bo_free(struct vc4_bo *bo)
 	if (ret != 0)
 		fprintf(stderr, "close object %d: %s\n",
 			bo->handle, strerror(errno));
+	screen->bo_count--;
+	screen->bo_size -= bo->size;
+
+	if (dump_stats) {
+		fprintf(stderr, "Freed %s%s%dkb:\n",
+			bo->name ? bo->name : "",
+			bo->name ?
" " : "", + bo->size / 1024); + vc4_bo_dump_stats(screen); + } + free(bo); } static void free_stale_bos(struct vc4_screen *screen, time_t time) { - while (!is_empty_list(&screen->bo_cache.time_list)) { - struct simple_node *node = - first_elem(&screen->bo_cache.time_list); - struct vc4_bo *bo = container_of(node, struct vc4_bo, time_list); + struct vc4_bo_cache *cache = &screen->bo_cache; + bool freed_any = false; + + list_for_each_entry_safe(struct vc4_bo, bo, &cache->time_list, + time_list) { + if (dump_stats && !freed_any) { + fprintf(stderr, "Freeing stale BOs:\n"); + vc4_bo_dump_stats(screen); + freed_any = true; + } /* If it's more than a second old, free it. */ if (time - bo->free_time > 2) { - remove_from_list(&bo->time_list); - remove_from_list(&bo->size_list); + vc4_bo_remove_from_cache(cache, bo); vc4_bo_free(bo); } else { break; } } + + if (dump_stats && freed_any) { + fprintf(stderr, "Freed stale BOs:\n"); + vc4_bo_dump_stats(screen); + } } void @@ -180,16 +261,16 @@ vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time) } if (cache->size_list_size <= page_index) { - struct simple_node *new_list = - ralloc_array(screen, struct simple_node, page_index + 1); + struct list_head *new_list = + ralloc_array(screen, struct list_head, page_index + 1); /* Move old list contents over (since the array has moved, and - * therefore the pointers to the list heads have to change. + * therefore the pointers to the list heads have to change). */ for (int i = 0; i < cache->size_list_size; i++) { - struct simple_node *old_head = &cache->size_list[i]; - if (is_empty_list(old_head)) - make_empty_list(&new_list[i]); + struct list_head *old_head = &cache->size_list[i]; + if (list_empty(old_head)) + list_inithead(&new_list[i]); else { new_list[i].next = old_head->next; new_list[i].prev = old_head->prev; @@ -198,15 +279,23 @@ vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time) } } for (int i = cache->size_list_size; i < page_index + 1; i++) - make_empty_list(&new_list[i]); + list_inithead(&new_list[i]); cache->size_list = new_list; cache->size_list_size = page_index + 1; } bo->free_time = time; - insert_at_tail(&cache->size_list[page_index], &bo->size_list); - insert_at_tail(&cache->time_list, &bo->time_list); + list_addtail(&bo->size_list, &cache->size_list[page_index]); + list_addtail(&bo->time_list, &cache->time_list); + cache->bo_count++; + cache->bo_size += bo->size; + if (dump_stats) { + fprintf(stderr, "Freed %s %dkb to cache:\n", + bo->name, bo->size / 1024); + vc4_bo_dump_stats(screen); + } + bo->name = NULL; free_stale_bos(screen, time); } @@ -286,6 +375,7 @@ vc4_bo_get_dmabuf(struct vc4_bo *bo) bo->handle); return -1; } + bo->private = false; return fd; } @@ -342,15 +432,17 @@ vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns) ret = 0; } - if (ret == -ETIME) { - return false; - } else if (ret != 0) { - fprintf(stderr, "wait failed\n"); - abort(); - } else { + if (ret == 0) { screen->finished_seqno = wait.seqno; return true; } + + if (errno != ETIME) { + fprintf(stderr, "wait failed: %d\n", ret); + abort(); + } + + return false; } bool @@ -369,14 +461,15 @@ vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns) else ret = 0; - if (ret == -ETIME) { - return false; - } else if (ret != 0) { - fprintf(stderr, "wait failed\n"); - abort(); - } else { + if (ret == 0) return true; + + if (errno != ETIME) { + fprintf(stderr, "wait failed: %d\n", ret); + abort(); } + + return false; } void * @@ -437,12 +530,14 @@ vc4_bufmgr_destroy(struct 
pipe_screen *pscreen) struct vc4_screen *screen = vc4_screen(pscreen); struct vc4_bo_cache *cache = &screen->bo_cache; - while (!is_empty_list(&cache->time_list)) { - struct simple_node *node = first_elem(&cache->time_list); - struct vc4_bo *bo = container_of(node, struct vc4_bo, time_list); - - remove_from_list(&bo->time_list); - remove_from_list(&bo->size_list); + list_for_each_entry_safe(struct vc4_bo, bo, &cache->time_list, + time_list) { + vc4_bo_remove_from_cache(cache, bo); vc4_bo_free(bo); } + + if (dump_stats) { + fprintf(stderr, "BO stats after screen destroy:\n"); + vc4_bo_dump_stats(screen); + } } diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.h b/src/gallium/drivers/vc4/vc4_bufmgr.h index f9559e999a1..7320695ca8e 100644 --- a/src/gallium/drivers/vc4/vc4_bufmgr.h +++ b/src/gallium/drivers/vc4/vc4_bufmgr.h @@ -44,9 +44,9 @@ struct vc4_bo { #endif /** Entry in the linked list of buffers freed, by age. */ - struct simple_node time_list; + struct list_head time_list; /** Entry in the per-page-count linked list of buffers freed (by age). */ - struct simple_node size_list; + struct list_head size_list; /** Approximate second when the bo was freed. */ time_t free_time; /** diff --git a/src/gallium/drivers/vc4/vc4_cl.h b/src/gallium/drivers/vc4/vc4_cl.h index 32a2e717379..4a50e790942 100644 --- a/src/gallium/drivers/vc4/vc4_cl.h +++ b/src/gallium/drivers/vc4/vc4_cl.h @@ -29,7 +29,7 @@ #include "util/u_math.h" #include "util/macros.h" -#include "vc4_packet.h" +#include "kernel/vc4_packet.h" struct vc4_bo; diff --git a/src/gallium/drivers/vc4/vc4_cl_dump.c b/src/gallium/drivers/vc4/vc4_cl_dump.c index 14239840d32..69055081daa 100644 --- a/src/gallium/drivers/vc4/vc4_cl_dump.c +++ b/src/gallium/drivers/vc4/vc4_cl_dump.c @@ -174,6 +174,37 @@ dump_VC4_PACKET_CLIPPER_Z_SCALING(void *cl, uint32_t offset, uint32_t hw_offset) } static void +dump_VC4_PACKET_TILE_BINNING_MODE_CONFIG(void *cl, uint32_t offset, uint32_t hw_offset) +{ + uint32_t *tile_alloc_addr = cl + offset; + uint32_t *tile_alloc_size = cl + offset + 4; + uint32_t *tile_state_addr = cl + offset + 8; + uint8_t *bin_x = cl + offset + 12; + uint8_t *bin_y = cl + offset + 13; + uint8_t *flags = cl + offset + 14; + + fprintf(stderr, "0x%08x 0x%08x: tile alloc addr 0x%08x\n", + offset, hw_offset, + *tile_alloc_addr); + + fprintf(stderr, "0x%08x 0x%08x: tile alloc size %db\n", + offset + 4, hw_offset + 4, + *tile_alloc_size); + + fprintf(stderr, "0x%08x 0x%08x: tile state addr 0x%08x\n", + offset + 8, hw_offset + 8, + *tile_state_addr); + + fprintf(stderr, "0x%08x 0x%08x: tiles (%d, %d)\n", + offset + 12, hw_offset + 12, + *bin_x, *bin_y); + + fprintf(stderr, "0x%08x 0x%08x: flags 0x%02x\n", + offset + 14, hw_offset + 14, + *flags); +} + +static void dump_VC4_PACKET_TILE_RENDERING_MODE_CONFIG(void *cl, uint32_t offset, uint32_t hw_offset) { uint32_t *render_offset = cl + offset; @@ -311,7 +342,7 @@ static const struct packet_info { PACKET_DUMP(VC4_PACKET_CLIPPER_XY_SCALING, 9), PACKET_DUMP(VC4_PACKET_CLIPPER_Z_SCALING, 9), - PACKET(VC4_PACKET_TILE_BINNING_MODE_CONFIG, 16), + PACKET_DUMP(VC4_PACKET_TILE_BINNING_MODE_CONFIG, 16), PACKET_DUMP(VC4_PACKET_TILE_RENDERING_MODE_CONFIG, 11), PACKET(VC4_PACKET_CLEAR_COLORS, 14), PACKET_DUMP(VC4_PACKET_TILE_COORDINATES, 3), diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c index b394c186efb..630f8e68896 100644 --- a/src/gallium/drivers/vc4/vc4_context.c +++ b/src/gallium/drivers/vc4/vc4_context.c @@ -29,6 +29,7 @@ #include "util/u_inlines.h" #include 
"util/u_memory.h" #include "util/u_blitter.h" +#include "util/u_upload_mgr.h" #include "indices/u_primconvert.h" #include "pipe/p_screen.h" @@ -36,270 +37,12 @@ #include "vc4_context.h" #include "vc4_resource.h" -/** - * Emits a no-op STORE_TILE_BUFFER_GENERAL. - * - * If we emit a PACKET_TILE_COORDINATES, it must be followed by a store of - * some sort before another load is triggered. - */ -static void -vc4_store_before_load(struct vc4_context *vc4, bool *coords_emitted) -{ - if (!*coords_emitted) - return; - - cl_u8(&vc4->rcl, VC4_PACKET_STORE_TILE_BUFFER_GENERAL); - cl_u8(&vc4->rcl, VC4_LOADSTORE_TILE_BUFFER_NONE); - cl_u8(&vc4->rcl, (VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR | - VC4_STORE_TILE_BUFFER_DISABLE_ZS_CLEAR | - VC4_STORE_TILE_BUFFER_DISABLE_VG_MASK_CLEAR)); - cl_u32(&vc4->rcl, 0); /* no address, since we're in None mode */ - - *coords_emitted = false; -} - -/** - * Emits a PACKET_TILE_COORDINATES if one isn't already pending. - * - * The tile coordinates packet triggers a pending load if there is one, are - * used for clipping during rendering, and determine where loads/stores happen - * relative to their base address. - */ -static void -vc4_tile_coordinates(struct vc4_context *vc4, uint32_t x, uint32_t y, - bool *coords_emitted) -{ - if (*coords_emitted) - return; - - cl_u8(&vc4->rcl, VC4_PACKET_TILE_COORDINATES); - cl_u8(&vc4->rcl, x); - cl_u8(&vc4->rcl, y); - - *coords_emitted = true; -} - -static void -vc4_setup_rcl(struct vc4_context *vc4) -{ - struct vc4_surface *csurf = vc4_surface(vc4->framebuffer.cbufs[0]); - struct vc4_resource *ctex = csurf ? vc4_resource(csurf->base.texture) : NULL; - struct vc4_surface *zsurf = vc4_surface(vc4->framebuffer.zsbuf); - struct vc4_resource *ztex = zsurf ? vc4_resource(zsurf->base.texture) : NULL; - - if (!csurf) - vc4->resolve &= ~PIPE_CLEAR_COLOR0; - if (!zsurf) - vc4->resolve &= ~(PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL); - uint32_t resolve_uncleared = vc4->resolve & ~vc4->cleared; - uint32_t width = vc4->framebuffer.width; - uint32_t height = vc4->framebuffer.height; - uint32_t stride_in_tiles = align(width, 64) / 64; - - assert(vc4->draw_min_x != ~0 && vc4->draw_min_y != ~0); - uint32_t min_x_tile = vc4->draw_min_x / 64; - uint32_t min_y_tile = vc4->draw_min_y / 64; - uint32_t max_x_tile = (vc4->draw_max_x - 1) / 64; - uint32_t max_y_tile = (vc4->draw_max_y - 1) / 64; - uint32_t xtiles = max_x_tile - min_x_tile + 1; - uint32_t ytiles = max_y_tile - min_y_tile + 1; - -#if 0 - fprintf(stderr, "RCL: resolve 0x%x clear 0x%x resolve uncleared 0x%x\n", - vc4->resolve, - vc4->cleared, - resolve_uncleared); -#endif - - uint32_t reloc_size = 9; - uint32_t clear_size = 14; - uint32_t config_size = 11 + reloc_size; - uint32_t loadstore_size = 7 + reloc_size; - uint32_t tilecoords_size = 3; - uint32_t branch_size = 5 + reloc_size; - uint32_t color_store_size = 1; - uint32_t semaphore_size = 1; - cl_ensure_space(&vc4->rcl, - clear_size + - config_size + - loadstore_size + - semaphore_size + - xtiles * ytiles * (loadstore_size * 4 + - tilecoords_size * 3 + - branch_size + - color_store_size)); - - if (vc4->cleared) { - cl_u8(&vc4->rcl, VC4_PACKET_CLEAR_COLORS); - cl_u32(&vc4->rcl, vc4->clear_color[0]); - cl_u32(&vc4->rcl, vc4->clear_color[1]); - cl_u32(&vc4->rcl, vc4->clear_depth); - cl_u8(&vc4->rcl, vc4->clear_stencil); - } - - /* The rendering mode config determines the pointer that's used for - * VC4_PACKET_STORE_MS_TILE_BUFFER address computations. 
The kernel - * could handle a no-relocation rendering mode config and deny those - * packets, but instead we just tell the kernel we're doing our color - * rendering to the Z buffer, and just don't emit any of those - * packets. - */ - struct vc4_surface *render_surf = csurf ? csurf : zsurf; - struct vc4_resource *render_tex = vc4_resource(render_surf->base.texture); - cl_start_reloc(&vc4->rcl, 1); - cl_u8(&vc4->rcl, VC4_PACKET_TILE_RENDERING_MODE_CONFIG); - cl_reloc(vc4, &vc4->rcl, render_tex->bo, render_surf->offset); - cl_u16(&vc4->rcl, width); - cl_u16(&vc4->rcl, height); - cl_u16(&vc4->rcl, ((render_surf->tiling << - VC4_RENDER_CONFIG_MEMORY_FORMAT_SHIFT) | - (vc4_rt_format_is_565(render_surf->base.format) ? - VC4_RENDER_CONFIG_FORMAT_BGR565 : - VC4_RENDER_CONFIG_FORMAT_RGBA8888))); - - /* The tile buffer normally gets cleared when the previous tile is - * stored. If the clear values changed between frames, then the tile - * buffer has stale clear values in it, so we have to do a store in - * None mode (no writes) so that we trigger the tile buffer clear. - * - * Excess clearing is only a performance cost, since per-tile contents - * will be loaded/stored in the loop below. - */ - if (vc4->cleared & (PIPE_CLEAR_COLOR0 | - PIPE_CLEAR_DEPTH | - PIPE_CLEAR_STENCIL)) { - cl_u8(&vc4->rcl, VC4_PACKET_TILE_COORDINATES); - cl_u8(&vc4->rcl, 0); - cl_u8(&vc4->rcl, 0); - - cl_u8(&vc4->rcl, VC4_PACKET_STORE_TILE_BUFFER_GENERAL); - cl_u16(&vc4->rcl, VC4_LOADSTORE_TILE_BUFFER_NONE); - cl_u32(&vc4->rcl, 0); /* no address, since we're in None mode */ - } - - uint32_t color_hindex = ctex ? vc4_gem_hindex(vc4, ctex->bo) : 0; - uint32_t depth_hindex = ztex ? vc4_gem_hindex(vc4, ztex->bo) : 0; - uint32_t tile_alloc_hindex = vc4_gem_hindex(vc4, vc4->tile_alloc); - - for (int y = min_y_tile; y <= max_y_tile; y++) { - for (int x = min_x_tile; x <= max_x_tile; x++) { - bool end_of_frame = (x == max_x_tile && - y == max_y_tile); - bool coords_emitted = false; - - /* Note that the load doesn't actually occur until the - * tile coords packet is processed, and only one load - * may be outstanding at a time. - */ - if (resolve_uncleared & PIPE_CLEAR_COLOR) { - vc4_store_before_load(vc4, &coords_emitted); - - cl_start_reloc(&vc4->rcl, 1); - cl_u8(&vc4->rcl, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL); - cl_u8(&vc4->rcl, - VC4_LOADSTORE_TILE_BUFFER_COLOR | - (csurf->tiling << - VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT)); - cl_u8(&vc4->rcl, - vc4_rt_format_is_565(csurf->base.format) ? - VC4_LOADSTORE_TILE_BUFFER_BGR565 : - VC4_LOADSTORE_TILE_BUFFER_RGBA8888); - cl_reloc_hindex(&vc4->rcl, color_hindex, - csurf->offset); - - vc4_tile_coordinates(vc4, x, y, &coords_emitted); - } - - if (resolve_uncleared & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) { - vc4_store_before_load(vc4, &coords_emitted); - - cl_start_reloc(&vc4->rcl, 1); - cl_u8(&vc4->rcl, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL); - cl_u8(&vc4->rcl, - VC4_LOADSTORE_TILE_BUFFER_ZS | - (zsurf->tiling << - VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT)); - cl_u8(&vc4->rcl, 0); - cl_reloc_hindex(&vc4->rcl, depth_hindex, - zsurf->offset); - - vc4_tile_coordinates(vc4, x, y, &coords_emitted); - } - - /* Clipping depends on tile coordinates having been - * emitted, so make sure it's happened even if - * everything was cleared to start. - */ - vc4_tile_coordinates(vc4, x, y, &coords_emitted); - - /* Wait for the binner before jumping to the first - * tile's lists. 
- */ - if (x == min_x_tile && y == min_y_tile) - cl_u8(&vc4->rcl, VC4_PACKET_WAIT_ON_SEMAPHORE); - - cl_start_reloc(&vc4->rcl, 1); - cl_u8(&vc4->rcl, VC4_PACKET_BRANCH_TO_SUB_LIST); - cl_reloc_hindex(&vc4->rcl, tile_alloc_hindex, - (y * stride_in_tiles + x) * 32); - - if (vc4->resolve & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) { - vc4_tile_coordinates(vc4, x, y, &coords_emitted); - - cl_start_reloc(&vc4->rcl, 1); - cl_u8(&vc4->rcl, VC4_PACKET_STORE_TILE_BUFFER_GENERAL); - cl_u8(&vc4->rcl, - VC4_LOADSTORE_TILE_BUFFER_ZS | - (zsurf->tiling << - VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT)); - cl_u8(&vc4->rcl, - VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR); - cl_reloc_hindex(&vc4->rcl, depth_hindex, - zsurf->offset | - ((end_of_frame && - !(vc4->resolve & PIPE_CLEAR_COLOR0)) ? - VC4_LOADSTORE_TILE_BUFFER_EOF : 0)); - - coords_emitted = false; - } - - if (vc4->resolve & PIPE_CLEAR_COLOR0) { - vc4_tile_coordinates(vc4, x, y, &coords_emitted); - if (end_of_frame) { - cl_u8(&vc4->rcl, - VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF); - } else { - cl_u8(&vc4->rcl, - VC4_PACKET_STORE_MS_TILE_BUFFER); - } - - coords_emitted = false; - } - - /* One of the bits needs to have been set that would - * have triggered an EOF. - */ - assert(vc4->resolve & (PIPE_CLEAR_COLOR0 | - PIPE_CLEAR_DEPTH | - PIPE_CLEAR_STENCIL)); - /* Any coords emitted must also have been consumed by - * a store. - */ - assert(!coords_emitted); - } - } - - if (vc4->resolve & PIPE_CLEAR_COLOR0) - ctex->writes++; - - if (vc4->resolve & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) - ztex->writes++; -} - void vc4_flush(struct pipe_context *pctx) { struct vc4_context *vc4 = vc4_context(pctx); + struct pipe_surface *cbuf = vc4->framebuffer.cbufs[0]; + struct pipe_surface *zsbuf = vc4->framebuffer.zsbuf; if (!vc4->needs_flush) return; @@ -322,7 +65,31 @@ vc4_flush(struct pipe_context *pctx) /* The FLUSH caps all of our bin lists with a VC4_PACKET_RETURN. 
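	 *
	 * (Summary of the surface setup below, for reference: each of color
	 * and Z/S is attached as a write surface when it is being resolved,
	 * and additionally as a read surface unless this job began by
	 * clearing that buffer, in which case there is nothing to load.)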
*/ cl_u8(&vc4->bcl, VC4_PACKET_FLUSH); - vc4_setup_rcl(vc4); + if (cbuf && (vc4->resolve & PIPE_CLEAR_COLOR0)) { + pipe_surface_reference(&vc4->color_write, cbuf); + if (!(vc4->cleared & PIPE_CLEAR_COLOR0)) { + pipe_surface_reference(&vc4->color_read, cbuf); + } else { + pipe_surface_reference(&vc4->color_read, NULL); + } + + } else { + pipe_surface_reference(&vc4->color_write, NULL); + pipe_surface_reference(&vc4->color_read, NULL); + } + + if (vc4->framebuffer.zsbuf && + (vc4->resolve & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) { + pipe_surface_reference(&vc4->zs_write, zsbuf); + if (!(vc4->cleared & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) { + pipe_surface_reference(&vc4->zs_read, zsbuf); + } else { + pipe_surface_reference(&vc4->zs_read, NULL); + } + } else { + pipe_surface_reference(&vc4->zs_write, NULL); + pipe_surface_reference(&vc4->zs_read, NULL); + } vc4_job_submit(vc4); } @@ -410,12 +177,13 @@ vc4_context_destroy(struct pipe_context *pctx) if (vc4->primconvert) util_primconvert_destroy(vc4->primconvert); + if (vc4->uploader) + u_upload_destroy(vc4->uploader); + util_slab_destroy(&vc4->transfer_pool); pipe_surface_reference(&vc4->framebuffer.cbufs[0], NULL); pipe_surface_reference(&vc4->framebuffer.zsbuf, NULL); - vc4_bo_unreference(&vc4->tile_alloc); - vc4_bo_unreference(&vc4->tile_state); vc4_program_fini(pctx); @@ -466,6 +234,9 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv) if (!vc4->primconvert) goto fail; + vc4->uploader = u_upload_create(pctx, 16 * 1024, 4, + PIPE_BIND_INDEX_BUFFER); + vc4_debug |= saved_shaderdb_flag; return &vc4->base; diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h index d89f1974e12..d5d6be16f6e 100644 --- a/src/gallium/drivers/vc4/vc4_context.h +++ b/src/gallium/drivers/vc4/vc4_context.h @@ -178,12 +178,18 @@ struct vc4_context { struct vc4_screen *screen; struct vc4_cl bcl; - struct vc4_cl rcl; struct vc4_cl shader_rec; struct vc4_cl uniforms; struct vc4_cl bo_handles; struct vc4_cl bo_pointers; uint32_t shader_rec_count; + + /** @{ Surfaces to submit rendering for. */ + struct pipe_surface *color_read; + struct pipe_surface *color_write; + struct pipe_surface *zs_read; + struct pipe_surface *zs_write; + /** @} */ /** @{ * Bounding box of the scissor across all queued drawing. * @@ -194,9 +200,13 @@ struct vc4_context { uint32_t draw_max_x; uint32_t draw_max_y; /** @} */ - - struct vc4_bo *tile_alloc; - struct vc4_bo *tile_state; + /** @{ + * Width/height of the color framebuffer being rendered to, + * for VC4_TILE_RENDERING_MODE_CONFIG. + */ + uint32_t draw_width; + uint32_t draw_height; + /** @} */ struct util_slab_mempool transfer_pool; struct blitter_context *blitter; @@ -243,6 +253,8 @@ struct vc4_context { /** Seqno of the last CL flush's job. */ uint64_t last_emit_seqno; + struct u_upload_mgr *uploader; + /** @{ Current pipeline state objects */ struct pipe_scissor_state scissor; struct pipe_blend_state *blend; diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c index 16418bf12da..5e6d70d6f33 100644 --- a/src/gallium/drivers/vc4/vc4_draw.c +++ b/src/gallium/drivers/vc4/vc4_draw.c @@ -72,44 +72,15 @@ vc4_start_draw(struct vc4_context *vc4) uint32_t tilew = align(width, 64) / 64; uint32_t tileh = align(height, 64) / 64; - /* Tile alloc memory setup: We use an initial alloc size of 32b. 
The - * hardware then aligns that to 256b (we use 4096, because all of our - * BO allocations align to that anyway), then for some reason the - * simulator wants an extra page available, even if you have overflow - * memory set up. - * - * XXX: The binner only does 28-bit addressing math, so the tile alloc - * and tile state should be in the same BO and that BO needs to not - * cross a 256MB boundary, somehow. - */ - uint32_t tile_alloc_size = 32 * tilew * tileh; - tile_alloc_size = align(tile_alloc_size, 4096); - tile_alloc_size += 4096; - uint32_t tile_state_size = 48 * tilew * tileh; - if (!vc4->tile_alloc || vc4->tile_alloc->size < tile_alloc_size) { - vc4_bo_unreference(&vc4->tile_alloc); - vc4->tile_alloc = vc4_bo_alloc(vc4->screen, tile_alloc_size, - "tile_alloc"); - } - if (!vc4->tile_state || vc4->tile_state->size < tile_state_size) { - vc4_bo_unreference(&vc4->tile_state); - vc4->tile_state = vc4_bo_alloc(vc4->screen, tile_state_size, - "tile_state"); - } - // Tile state data is 48 bytes per tile, I think it can be thrown away // as soon as binning is finished. - cl_start_reloc(&vc4->bcl, 2); cl_u8(&vc4->bcl, VC4_PACKET_TILE_BINNING_MODE_CONFIG); - cl_reloc(vc4, &vc4->bcl, vc4->tile_alloc, 0); - cl_u32(&vc4->bcl, vc4->tile_alloc->size); - cl_reloc(vc4, &vc4->bcl, vc4->tile_state, 0); + cl_u32(&vc4->bcl, 0); /* tile alloc addr, filled by kernel */ + cl_u32(&vc4->bcl, 0); /* tile alloc size, filled by kernel */ + cl_u32(&vc4->bcl, 0); /* tile state addr, filled by kernel */ cl_u8(&vc4->bcl, tilew); cl_u8(&vc4->bcl, tileh); - cl_u8(&vc4->bcl, - VC4_BIN_CONFIG_AUTO_INIT_TSDA | - VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_32 | - VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_32); + cl_u8(&vc4->bcl, 0); /* flags, filled by kernel. */ /* START_TILE_BINNING resets the statechange counters in the hardware, * which are what is used when a primitive is binned to a tile to @@ -129,6 +100,8 @@ vc4_start_draw(struct vc4_context *vc4) vc4->needs_flush = true; vc4->draw_call_queued = true; + vc4->draw_width = width; + vc4->draw_height = height; } static void @@ -266,13 +239,17 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) * definitions, up to but not including QUADS. 
*/ if (info->indexed) { - struct vc4_resource *rsc = vc4_resource(vc4->indexbuf.buffer); uint32_t offset = vc4->indexbuf.offset; uint32_t index_size = vc4->indexbuf.index_size; - if (rsc->shadow_parent) { - vc4_update_shadow_index_buffer(pctx, &vc4->indexbuf); - offset = 0; + struct pipe_resource *prsc; + if (vc4->indexbuf.index_size == 4) { + prsc = vc4_get_shadow_index_buffer(pctx, &vc4->indexbuf, + info->count, &offset); + index_size = 2; + } else { + prsc = vc4->indexbuf.buffer; } + struct vc4_resource *rsc = vc4_resource(prsc); cl_start_reloc(&vc4->bcl, 1); cl_u8(&vc4->bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE); @@ -284,6 +261,9 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) cl_u32(&vc4->bcl, info->count); cl_reloc(vc4, &vc4->bcl, rsc->bo, offset); cl_u32(&vc4->bcl, max_index); + + if (vc4->indexbuf.index_size == 4) + pipe_resource_reference(&prsc, NULL); } else { cl_u8(&vc4->bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE); cl_u8(&vc4->bcl, info->mode); diff --git a/src/gallium/drivers/vc4/vc4_drm.h b/src/gallium/drivers/vc4/vc4_drm.h index 062fd3b687e..5f1ee4fa125 100644 --- a/src/gallium/drivers/vc4/vc4_drm.h +++ b/src/gallium/drivers/vc4/vc4_drm.h @@ -38,6 +38,15 @@ #define DRM_IOCTL_VC4_CREATE_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo) #define DRM_IOCTL_VC4_MMAP_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo) +struct drm_vc4_submit_rcl_surface { + uint32_t hindex; /* Handle index, or ~0 if not present. */ + uint32_t offset; /* Offset to start of buffer. */ + /* + * Bits for either render config (color_ms_write) or load/store packet. + */ + uint16_t bits; + uint16_t pad; +}; /** * struct drm_vc4_submit_cl - ioctl argument for submitting commands to the 3D @@ -62,16 +71,6 @@ struct drm_vc4_submit_cl { */ uint64_t bin_cl; - /* Pointer to the render command list. - * - * The render command list contains a set of packets to load the - * current tile's state (reading from memory, or just clearing it) - * into the GPU, then call into the tile allocation BO to run the - * stored rendering for that tile, then store the tile's state back to - * memory. - */ - uint64_t render_cl; - /* Pointer to the shader records. * * Shader records are the structures read by the hardware that contain @@ -102,8 +101,6 @@ struct drm_vc4_submit_cl { /* Size in bytes of the binner command list. */ uint32_t bin_cl_size; - /* Size in bytes of the render command list */ - uint32_t render_cl_size; /* Size in bytes of the set of shader records. */ uint32_t shader_rec_size; /* Number of shader records. @@ -119,8 +116,25 @@ struct drm_vc4_submit_cl { /* Number of BO handles passed in (size is that times 4). */ uint32_t bo_handle_count; + /* RCL setup: */ + uint16_t width; + uint16_t height; + uint8_t min_x_tile; + uint8_t min_y_tile; + uint8_t max_x_tile; + uint8_t max_y_tile; + struct drm_vc4_submit_rcl_surface color_read; + struct drm_vc4_submit_rcl_surface color_ms_write; + struct drm_vc4_submit_rcl_surface zs_read; + struct drm_vc4_submit_rcl_surface zs_write; + uint32_t clear_color[2]; + uint32_t clear_z; + uint8_t clear_s; + + uint32_t pad:24; + +#define VC4_SUBMIT_CL_USE_CLEAR_COLOR (1 << 0) uint32_t flags; - uint32_t pad; /* Returned value of the seqno of this render job (for the * wait ioctl). 
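For orientation, a minimal sketch of filling the new RCL-setup fields for a full-screen clear-and-draw to a 1920x1080 RGBA8888 target (illustrative values only; per the struct comments above, hindex indexes the submitted BO handle list and ~0 marks an absent surface; bin_cl_buf is a hypothetical binner CL buffer):

	struct drm_vc4_submit_cl submit = {
		.bin_cl = (uintptr_t)bin_cl_buf,
		.bin_cl_size = bin_cl_size,
		.width = 1920,
		.height = 1080,
		.min_x_tile = 0,
		.min_y_tile = 0,
		.max_x_tile = (1920 - 1) / 64,		/* 29 */
		.max_y_tile = (1080 - 1) / 64,		/* 16 */
		.color_read = { .hindex = ~0 },		/* cleared: nothing to load */
		.color_ms_write = { .hindex = 0 },	/* .bits carries format/tiling */
		.zs_read = { .hindex = ~0 },
		.zs_write = { .hindex = ~0 },
		.clear_color = { 0xff0000ff, 0xff0000ff },
		.flags = VC4_SUBMIT_CL_USE_CLEAR_COLOR,
	};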
diff --git a/src/gallium/drivers/vc4/vc4_job.c b/src/gallium/drivers/vc4/vc4_job.c index 76037162102..dcade15443a 100644 --- a/src/gallium/drivers/vc4/vc4_job.c +++ b/src/gallium/drivers/vc4/vc4_job.c @@ -33,7 +33,6 @@ void vc4_job_init(struct vc4_context *vc4) { vc4_init_cl(vc4, &vc4->bcl); - vc4_init_cl(vc4, &vc4->rcl); vc4_init_cl(vc4, &vc4->shader_rec); vc4_init_cl(vc4, &vc4->uniforms); vc4_init_cl(vc4, &vc4->bo_handles); @@ -50,7 +49,6 @@ vc4_job_reset(struct vc4_context *vc4) vc4_bo_unreference(&referenced_bos[i]); } vc4_reset_cl(&vc4->bcl); - vc4_reset_cl(&vc4->rcl); vc4_reset_cl(&vc4->shader_rec); vc4_reset_cl(&vc4->uniforms); vc4_reset_cl(&vc4->bo_handles); @@ -75,6 +73,70 @@ vc4_job_reset(struct vc4_context *vc4) vc4->draw_max_y = 0; } +static void +vc4_submit_setup_rcl_surface(struct vc4_context *vc4, + struct drm_vc4_submit_rcl_surface *submit_surf, + struct pipe_surface *psurf, + bool is_depth, bool is_write) +{ + struct vc4_surface *surf = vc4_surface(psurf); + + if (!surf) { + submit_surf->hindex = ~0; + return; + } + + struct vc4_resource *rsc = vc4_resource(psurf->texture); + submit_surf->hindex = vc4_gem_hindex(vc4, rsc->bo); + submit_surf->offset = surf->offset; + + if (is_depth) { + submit_surf->bits = + VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_ZS, + VC4_LOADSTORE_TILE_BUFFER_BUFFER); + + } else { + submit_surf->bits = + VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_COLOR, + VC4_LOADSTORE_TILE_BUFFER_BUFFER) | + VC4_SET_FIELD(vc4_rt_format_is_565(psurf->format) ? + VC4_LOADSTORE_TILE_BUFFER_BGR565 : + VC4_LOADSTORE_TILE_BUFFER_RGBA8888, + VC4_LOADSTORE_TILE_BUFFER_FORMAT); + } + submit_surf->bits |= + VC4_SET_FIELD(surf->tiling, VC4_LOADSTORE_TILE_BUFFER_TILING); + + if (is_write) + rsc->writes++; +} + +static void +vc4_submit_setup_ms_rcl_surface(struct vc4_context *vc4, + struct drm_vc4_submit_rcl_surface *submit_surf, + struct pipe_surface *psurf) +{ + struct vc4_surface *surf = vc4_surface(psurf); + + if (!surf) { + submit_surf->hindex = ~0; + return; + } + + struct vc4_resource *rsc = vc4_resource(psurf->texture); + submit_surf->hindex = vc4_gem_hindex(vc4, rsc->bo); + submit_surf->offset = surf->offset; + + submit_surf->bits = + VC4_SET_FIELD(vc4_rt_format_is_565(surf->base.format) ? + VC4_RENDER_CONFIG_FORMAT_BGR565 : + VC4_RENDER_CONFIG_FORMAT_RGBA8888, + VC4_RENDER_CONFIG_FORMAT) | + VC4_SET_FIELD(surf->tiling, VC4_RENDER_CONFIG_MEMORY_FORMAT); + + rsc->writes++; +} + /** * Submits the job to the kernel and then reinitializes it. 
*/ @@ -84,26 +146,49 @@ vc4_job_submit(struct vc4_context *vc4) if (vc4_debug & VC4_DEBUG_CL) { fprintf(stderr, "BCL:\n"); vc4_dump_cl(vc4->bcl.base, vc4->bcl.next - vc4->bcl.base, false); - fprintf(stderr, "RCL:\n"); - vc4_dump_cl(vc4->rcl.base, vc4->rcl.next - vc4->rcl.base, true); } struct drm_vc4_submit_cl submit; memset(&submit, 0, sizeof(submit)); + cl_ensure_space(&vc4->bo_handles, 4 * sizeof(uint32_t)); + cl_ensure_space(&vc4->bo_pointers, 4 * sizeof(struct vc4_bo *)); + + vc4_submit_setup_rcl_surface(vc4, &submit.color_read, + vc4->color_read, false, false); + vc4_submit_setup_ms_rcl_surface(vc4, &submit.color_ms_write, + vc4->color_write); + vc4_submit_setup_rcl_surface(vc4, &submit.zs_read, + vc4->zs_read, true, false); + vc4_submit_setup_rcl_surface(vc4, &submit.zs_write, + vc4->zs_write, true, true); + submit.bo_handles = (uintptr_t)vc4->bo_handles.base; submit.bo_handle_count = (vc4->bo_handles.next - vc4->bo_handles.base) / 4; submit.bin_cl = (uintptr_t)vc4->bcl.base; submit.bin_cl_size = vc4->bcl.next - vc4->bcl.base; - submit.render_cl = (uintptr_t)vc4->rcl.base; - submit.render_cl_size = vc4->rcl.next - vc4->rcl.base; submit.shader_rec = (uintptr_t)vc4->shader_rec.base; submit.shader_rec_size = vc4->shader_rec.next - vc4->shader_rec.base; submit.shader_rec_count = vc4->shader_rec_count; submit.uniforms = (uintptr_t)vc4->uniforms.base; submit.uniforms_size = vc4->uniforms.next - vc4->uniforms.base; + assert(vc4->draw_min_x != ~0 && vc4->draw_min_y != ~0); + submit.min_x_tile = vc4->draw_min_x / 64; + submit.min_y_tile = vc4->draw_min_y / 64; + submit.max_x_tile = (vc4->draw_max_x - 1) / 64; + submit.max_y_tile = (vc4->draw_max_y - 1) / 64; + submit.width = vc4->draw_width; + submit.height = vc4->draw_height; + if (vc4->cleared) { + submit.flags |= VC4_SUBMIT_CL_USE_CLEAR_COLOR; + submit.clear_color[0] = vc4->clear_color[0]; + submit.clear_color[1] = vc4->clear_color[1]; + submit.clear_z = vc4->clear_depth; + submit.clear_s = vc4->clear_stencil; + } + if (!(vc4_debug & VC4_DEBUG_NORAST)) { int ret; diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c index e40e0f3b71b..7978ea1829f 100644 --- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c +++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c @@ -136,11 +136,8 @@ bool qir_opt_algebraic(struct vc4_compile *c) { bool progress = false; - struct simple_node *node; - - foreach(node, &c->instructions) { - struct qinst *inst = (struct qinst *)node; + list_for_each_entry(struct qinst, inst, &c->instructions, link) { switch (inst->op) { case QOP_SEL_X_Y_ZS: case QOP_SEL_X_Y_ZC: diff --git a/src/gallium/drivers/vc4/vc4_opt_constant_folding.c b/src/gallium/drivers/vc4/vc4_opt_constant_folding.c index ac9be5c9642..15ec9f07260 100644 --- a/src/gallium/drivers/vc4/vc4_opt_constant_folding.c +++ b/src/gallium/drivers/vc4/vc4_opt_constant_folding.c @@ -98,10 +98,8 @@ bool qir_opt_constant_folding(struct vc4_compile *c) { bool progress = false; - struct simple_node *node; - foreach(node, &c->instructions) { - struct qinst *inst = (struct qinst *)node; + list_for_each_entry(struct qinst, inst, &c->instructions, link) { if (constant_fold(c, inst)) progress = true; } diff --git a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c index 5189a401248..d6d2fbf257f 100644 --- a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c +++ b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c @@ -38,13 +38,10 @@ bool qir_opt_copy_propagation(struct vc4_compile 
*c) { bool progress = false; - struct simple_node *node; bool debug = false; struct qreg *movs = calloc(c->num_temps, sizeof(struct qreg)); - foreach(node, &c->instructions) { - struct qinst *inst = (struct qinst *)node; - + list_for_each_entry(struct qinst, inst, &c->instructions, link) { for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { int index = inst->src[i].index; if (inst->src[i].file == QFILE_TEMP && diff --git a/src/gallium/drivers/vc4/vc4_opt_cse.c b/src/gallium/drivers/vc4/vc4_opt_cse.c index 71794f7d1cf..92c8260eb59 100644 --- a/src/gallium/drivers/vc4/vc4_opt_cse.c +++ b/src/gallium/drivers/vc4/vc4_opt_cse.c @@ -121,7 +121,6 @@ bool qir_opt_cse(struct vc4_compile *c) { bool progress = false; - struct simple_node *node, *t; uint32_t sf_count = 0, r4_count = 0; struct hash_table *ht = _mesa_hash_table_create(NULL, NULL, @@ -129,9 +128,7 @@ qir_opt_cse(struct vc4_compile *c) if (!ht) return false; - foreach_s(node, t, &c->instructions) { - struct qinst *inst = (struct qinst *)node; - + list_for_each_entry(struct qinst, inst, &c->instructions, link) { if (qir_has_side_effects(c, inst) || qir_has_side_effect_reads(c, inst)) { continue; diff --git a/src/gallium/drivers/vc4/vc4_opt_dead_code.c b/src/gallium/drivers/vc4/vc4_opt_dead_code.c index e4ead46c9c2..ffd42422de8 100644 --- a/src/gallium/drivers/vc4/vc4_opt_dead_code.c +++ b/src/gallium/drivers/vc4/vc4_opt_dead_code.c @@ -86,7 +86,7 @@ qir_opt_dead_code(struct vc4_compile *c) /* Whether we're eliminating texture setup currently. */ bool dce_tex = false; - struct simple_node *node, *t; + struct list_head *node, *t; for (node = c->instructions.prev, t = node->prev; &c->instructions != node; node = t, t = t->prev) { diff --git a/src/gallium/drivers/vc4/vc4_opt_small_immediates.c b/src/gallium/drivers/vc4/vc4_opt_small_immediates.c index a329ac69d11..d6e98f0aebf 100644 --- a/src/gallium/drivers/vc4/vc4_opt_small_immediates.c +++ b/src/gallium/drivers/vc4/vc4_opt_small_immediates.c @@ -37,11 +37,8 @@ bool qir_opt_small_immediates(struct vc4_compile *c) { bool progress = false; - struct simple_node *node; - - foreach(node, &c->instructions) { - struct qinst *inst = (struct qinst *)node; + list_for_each_entry(struct qinst, inst, &c->instructions, link) { /* The small immediate value sits in the raddr B field, so we * can't have 2 small immediates in one instruction (unless * they're the same value, but that should be optimized away diff --git a/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c b/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c index e9711f222cd..e04f02859d5 100644 --- a/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c +++ b/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c @@ -37,15 +37,12 @@ qir_opt_vpm_writes(struct vc4_compile *c) return false; bool progress = false; - struct simple_node *node; struct qinst *vpm_writes[64] = { 0 }; uint32_t use_count[c->num_temps]; uint32_t vpm_write_count = 0; memset(&use_count, 0, sizeof(use_count)); - foreach(node, &c->instructions) { - struct qinst *inst = (struct qinst *)node; - + list_for_each_entry(struct qinst, inst, &c->instructions, link) { switch (inst->dst.file) { case QFILE_VPM: vpm_writes[vpm_write_count++] = inst; @@ -102,7 +99,8 @@ qir_opt_vpm_writes(struct vc4_compile *c) * to maintain the order of the VPM writes. 
*/ assert(!vpm_writes[i]->sf); - move_to_tail(&vpm_writes[i]->link, &inst->link); + list_del(&inst->link); + list_addtail(&inst->link, &vpm_writes[i]->link); qir_remove_instruction(c, vpm_writes[i]); c->defs[inst->dst.index] = NULL; diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index bf156f9b42d..ba47c51d9bd 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -147,6 +147,9 @@ indirect_uniform_load(struct vc4_compile *c, indirect_offset = qir_ADD(c, indirect_offset, qir_uniform_ui(c, (range->dst_offset + offset))); + + /* Clamp to [0, array size). Note that MIN/MAX are signed. */ + indirect_offset = qir_MAX(c, indirect_offset, qir_uniform_ui(c, 0)); indirect_offset = qir_MIN(c, indirect_offset, qir_uniform_ui(c, (range->dst_offset + range->size - 4))); @@ -322,7 +325,9 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) switch (instr->src[i].src_type) { case nir_tex_src_coord: s = ntq_get_src(c, instr->src[i].src, 0); - if (instr->sampler_dim != GLSL_SAMPLER_DIM_1D) + if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D) + t = qir_uniform_f(c, 0.5); + else t = ntq_get_src(c, instr->src[i].src, 1); if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) r = ntq_get_src(c, instr->src[i].src, 2); @@ -1849,8 +1854,6 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) switch (instr->intrinsic) { case nir_intrinsic_load_uniform: - assert(instr->const_index[1] == 1); - for (int i = 0; i < instr->num_components; i++) { dest[i] = qir_uniform(c, QUNIFORM_UNIFORM, instr->const_index[0] * 4 + i); @@ -1858,8 +1861,6 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_uniform_indirect: - assert(instr->const_index[1] == 1); - for (int i = 0; i < instr->num_components; i++) { dest[i] = indirect_uniform_load(c, ntq_get_src(c, instr->src[0], 0), @@ -1870,8 +1871,6 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_input: - assert(instr->const_index[1] == 1); - for (int i = 0; i < instr->num_components; i++) dest[i] = c->inputs[instr->const_index[0] * 4 + i]; @@ -2215,11 +2214,9 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, shader->program_id = vc4->next_compiled_program_id++; if (stage == QSTAGE_FRAG) { bool input_live[c->num_input_semantics]; - struct simple_node *node; memset(input_live, 0, sizeof(input_live)); - foreach(node, &c->instructions) { - struct qinst *inst = (struct qinst *)node; + list_for_each_entry(struct qinst, inst, &c->instructions, link) { for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { if (inst->src[i].file == QFILE_VARY) input_live[inst->src[i].index] = true; diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index e2e6a5cdf16..1c96ef4795f 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -22,7 +22,6 @@ */ #include "util/u_memory.h" -#include "util/simple_list.h" #include "util/ralloc.h" #include "vc4_qir.h" @@ -301,10 +300,7 @@ qir_dump_inst(struct vc4_compile *c, struct qinst *inst) void qir_dump(struct vc4_compile *c) { - struct simple_node *node; - - foreach(node, &c->instructions) { - struct qinst *inst = (struct qinst *)node; + list_for_each_entry(struct qinst, inst, &c->instructions, link) { qir_dump_inst(c, inst); fprintf(stderr, "\n"); } @@ -370,7 +366,7 @@ qir_emit(struct vc4_compile *c, struct qinst *inst) if (inst->dst.file == QFILE_TEMP) c->defs[inst->dst.index] = 
inst; - insert_at_tail(&c->instructions, &inst->link); + list_addtail(&inst->link, &c->instructions); } bool @@ -384,7 +380,7 @@ qir_compile_init(void) { struct vc4_compile *c = rzalloc(NULL, struct vc4_compile); - make_empty_list(&c->instructions); + list_inithead(&c->instructions); c->output_position_index = -1; c->output_clipvertex_index = -1; @@ -403,7 +399,7 @@ qir_remove_instruction(struct vc4_compile *c, struct qinst *qinst) if (qinst->dst.file == QFILE_TEMP) c->defs[qinst->dst.index] = NULL; - remove_from_list(&qinst->link); + list_del(&qinst->link); free(qinst->src); free(qinst); } @@ -420,9 +416,9 @@ qir_follow_movs(struct vc4_compile *c, struct qreg reg) void qir_compile_destroy(struct vc4_compile *c) { - while (!is_empty_list(&c->instructions)) { + while (!list_empty(&c->instructions)) { struct qinst *qinst = - (struct qinst *)first_elem(&c->instructions); + (struct qinst *)c->instructions.next; qir_remove_instruction(c, qinst); } @@ -478,7 +474,7 @@ void qir_SF(struct vc4_compile *c, struct qreg src) { struct qinst *last_inst = NULL; - if (!is_empty_list(&c->instructions)) + if (!list_empty(&c->instructions)) last_inst = (struct qinst *)c->instructions.prev; if (!last_inst || diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index adc2c89d2c1..732cfd0b306 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -33,7 +33,7 @@ #include "util/macros.h" #include "glsl/nir/nir.h" -#include "util/simple_list.h" +#include "util/list.h" #include "util/u_math.h" enum qfile { @@ -162,12 +162,12 @@ enum qop { }; struct queued_qpu_inst { - struct simple_node link; + struct list_head link; uint64_t inst; }; struct qinst { - struct simple_node link; + struct list_head link; enum qop op; struct qreg dst; @@ -356,10 +356,10 @@ struct vc4_compile { struct qreg undef; enum qstage stage; uint32_t num_temps; - struct simple_node instructions; + struct list_head instructions; uint32_t immediates[1024]; - struct simple_node qpu_inst_list; + struct list_head qpu_inst_list; uint64_t *qpu_insts; uint32_t qpu_inst_count; uint32_t qpu_inst_size; diff --git a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c index 63f5eb22858..910c89dca79 100644 --- a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c +++ b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c @@ -88,7 +88,6 @@ is_lowerable_uniform(struct qinst *inst, int i) void qir_lower_uniforms(struct vc4_compile *c) { - struct simple_node *node; struct hash_table *ht = _mesa_hash_table_create(c, index_hash, index_compare); @@ -96,8 +95,7 @@ qir_lower_uniforms(struct vc4_compile *c) * than one uniform referenced, and add those uniform values to the * ht. 
*/ - foreach(node, &c->instructions) { - struct qinst *inst = (struct qinst *)node; + list_for_each_entry(struct qinst, inst, &c->instructions, link) { uint32_t nsrc = qir_get_op_nsrc(inst->op); uint32_t count = 0; @@ -137,10 +135,9 @@ qir_lower_uniforms(struct vc4_compile *c) struct qreg temp = qir_get_temp(c); struct qreg unif = { QFILE_UNIF, max_index }; struct qinst *mov = qir_inst(QOP_MOV, temp, unif, c->undef); - insert_at_head(&c->instructions, &mov->link); + list_add(&mov->link, &c->instructions); c->defs[temp.index] = mov; - foreach(node, &c->instructions) { - struct qinst *inst = (struct qinst *)node; + list_for_each_entry(struct qinst, inst, &c->instructions, link) { uint32_t nsrc = qir_get_op_nsrc(inst->op); uint32_t count = 0; diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index eeb8d3a21ff..99afe4b8798 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -47,14 +47,14 @@ queue(struct vc4_compile *c, uint64_t inst) { struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst); q->inst = inst; - insert_at_tail(&c->qpu_inst_list, &q->link); + list_addtail(&q->link, &c->qpu_inst_list); } static uint64_t * last_inst(struct vc4_compile *c) { struct queued_qpu_inst *q = - (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list); + (struct queued_qpu_inst *)c->qpu_inst_list.prev; return &q->inst; } @@ -117,11 +117,11 @@ fixup_raddr_conflict(struct vc4_compile *c, return; if (mux0 == QPU_MUX_A) { - queue(c, qpu_a_MOV(qpu_rb(31), *src1)); - *src1 = qpu_rb(31); + queue(c, qpu_a_MOV(qpu_rb(31), *src0)); + *src0 = qpu_rb(31); } else { - queue(c, qpu_a_MOV(qpu_ra(31), *src1)); - *src1 = qpu_ra(31); + queue(c, qpu_a_MOV(qpu_ra(31), *src0)); + *src0 = qpu_ra(31); } } @@ -144,7 +144,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) QPU_UNPACK_16B_TO_F32, }; - make_empty_list(&c->qpu_inst_list); + list_inithead(&c->qpu_inst_list); switch (c->stage) { case QSTAGE_VERT: @@ -170,10 +170,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) break; } - struct simple_node *node; - foreach(node, &c->instructions) { - struct qinst *qinst = (struct qinst *)node; - + list_for_each_entry(struct qinst, qinst, &c->instructions, link) { #if 0 fprintf(stderr, "translating qinst to qpu: "); qir_dump_inst(qinst); diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c index f523b4c6fb0..19cbf7bb98c 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c +++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c @@ -43,7 +43,7 @@ static bool debug; struct schedule_node_child; struct schedule_node { - struct simple_node link; + struct list_head link; struct queued_qpu_inst *inst; struct schedule_node_child *children; uint32_t child_count; @@ -400,22 +400,21 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) } static void -calculate_forward_deps(struct vc4_compile *c, struct simple_node *schedule_list) +calculate_forward_deps(struct vc4_compile *c, struct list_head *schedule_list) { - struct simple_node *node; struct schedule_state state; memset(&state, 0, sizeof(state)); state.dir = F; - foreach(node, schedule_list) - calculate_deps(&state, (struct schedule_node *)node); + list_for_each_entry(struct schedule_node, node, schedule_list, link) + calculate_deps(&state, node); } static void -calculate_reverse_deps(struct vc4_compile *c, struct simple_node *schedule_list) +calculate_reverse_deps(struct vc4_compile *c, struct 
list_head *schedule_list) { - struct simple_node *node; + struct list_head *node; struct schedule_state state; memset(&state, 0, sizeof(state)); @@ -507,15 +506,13 @@ get_instruction_priority(uint64_t inst) static struct schedule_node * choose_instruction_to_schedule(struct choose_scoreboard *scoreboard, - struct simple_node *schedule_list, + struct list_head *schedule_list, struct schedule_node *prev_inst) { struct schedule_node *chosen = NULL; - struct simple_node *node; int chosen_prio = 0; - foreach(node, schedule_list) { - struct schedule_node *n = (struct schedule_node *)node; + list_for_each_entry(struct schedule_node, n, schedule_list, link) { uint64_t inst = n->inst->inst; /* "An instruction must not read from a location in physical @@ -596,14 +593,11 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, } static void -dump_state(struct simple_node *schedule_list) +dump_state(struct list_head *schedule_list) { - struct simple_node *node; - uint32_t i = 0; - foreach(node, schedule_list) { - struct schedule_node *n = (struct schedule_node *)node; + list_for_each_entry(struct schedule_node, n, schedule_list, link) { fprintf(stderr, "%3d: ", i++); vc4_qpu_disasm(&n->inst->inst, 1); fprintf(stderr, "\n"); @@ -639,7 +633,7 @@ compute_delay(struct schedule_node *n) } static void -mark_instruction_scheduled(struct simple_node *schedule_list, +mark_instruction_scheduled(struct list_head *schedule_list, struct schedule_node *node, bool war_only) { @@ -658,16 +652,15 @@ mark_instruction_scheduled(struct simple_node *schedule_list, child->parent_count--; if (child->parent_count == 0) - insert_at_head(schedule_list, &child->link); + list_add(&child->link, schedule_list); node->children[i].node = NULL; } } static void -schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list) +schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list) { - struct simple_node *node, *t; struct choose_scoreboard scoreboard; /* We reorder the uniforms as we schedule instructions, so save the @@ -693,14 +686,12 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list) } /* Remove non-DAG heads from the list. */ - foreach_s(node, t, schedule_list) { - struct schedule_node *n = (struct schedule_node *)node; - + list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) { if (n->parent_count != 0) - remove_from_list(&n->link); + list_del(&n->link); } - while (!is_empty_list(schedule_list)) { + while (!list_empty(schedule_list)) { struct schedule_node *chosen = choose_instruction_to_schedule(&scoreboard, schedule_list, @@ -724,7 +715,7 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list) * find an instruction to pair with it. 
*/ if (chosen) { - remove_from_list(&chosen->link); + list_del(&chosen->link); mark_instruction_scheduled(schedule_list, chosen, true); if (chosen->uniform != -1) { c->uniform_data[next_uniform] = @@ -738,7 +729,7 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list) schedule_list, chosen); if (merge) { - remove_from_list(&merge->link); + list_del(&merge->link); inst = qpu_merge_inst(inst, merge->inst->inst); assert(inst != 0); if (merge->uniform != -1) { @@ -813,16 +804,14 @@ void qpu_schedule_instructions(struct vc4_compile *c) { void *mem_ctx = ralloc_context(NULL); - struct simple_node schedule_list; - struct simple_node *node; + struct list_head schedule_list; - make_empty_list(&schedule_list); + list_inithead(&schedule_list); if (debug) { fprintf(stderr, "Pre-schedule instructions\n"); - foreach(node, &c->qpu_inst_list) { - struct queued_qpu_inst *q = - (struct queued_qpu_inst *)node; + list_for_each_entry(struct queued_qpu_inst, q, + &c->qpu_inst_list, link) { vc4_qpu_disasm(&q->inst, 1); fprintf(stderr, "\n"); } @@ -831,7 +820,7 @@ qpu_schedule_instructions(struct vc4_compile *c) /* Wrap each instruction in a scheduler structure. */ uint32_t next_uniform = 0; - while (!is_empty_list(&c->qpu_inst_list)) { + while (!list_empty(&c->qpu_inst_list)) { struct queued_qpu_inst *inst = (struct queued_qpu_inst *)c->qpu_inst_list.next; struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node); @@ -844,16 +833,15 @@ qpu_schedule_instructions(struct vc4_compile *c) } else { n->uniform = -1; } - remove_from_list(&inst->link); - insert_at_tail(&schedule_list, &n->link); + list_del(&inst->link); + list_addtail(&n->link, &schedule_list); } assert(next_uniform == c->num_uniforms); calculate_forward_deps(c, &schedule_list); calculate_reverse_deps(c, &schedule_list); - foreach(node, &schedule_list) { - struct schedule_node *n = (struct schedule_node *)node; + list_for_each_entry(struct schedule_node, n, &schedule_list, link) { compute_delay(n); } diff --git a/src/gallium/drivers/vc4/vc4_query.c b/src/gallium/drivers/vc4/vc4_query.c index 1792becb08f..270832eae3a 100644 --- a/src/gallium/drivers/vc4/vc4_query.c +++ b/src/gallium/drivers/vc4/vc4_query.c @@ -50,9 +50,10 @@ vc4_destroy_query(struct pipe_context *ctx, struct pipe_query *query) free(query); } -static void +static boolean vc4_begin_query(struct pipe_context *ctx, struct pipe_query *query) { + return true; } static void diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c index f40547b8154..3b0b890b66a 100644 --- a/src/gallium/drivers/vc4/vc4_register_allocate.c +++ b/src/gallium/drivers/vc4/vc4_register_allocate.c @@ -161,7 +161,6 @@ node_to_temp_priority(const void *in_a, const void *in_b) struct qpu_reg * vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) { - struct simple_node *node; struct node_to_temp_map map[c->num_temps]; uint32_t temp_to_node[c->num_temps]; uint32_t def[c->num_temps]; @@ -189,9 +188,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) /* Compute the live ranges so we can figure out interference. 
*/ uint32_t ip = 0; - foreach(node, &c->instructions) { - struct qinst *inst = (struct qinst *)node; - + list_for_each_entry(struct qinst, inst, &c->instructions, link) { if (inst->dst.file == QFILE_TEMP) { def[inst->dst.index] = ip; use[inst->dst.index] = ip; @@ -227,9 +224,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) } /* Figure out our register classes and preallocated registers*/ - foreach(node, &c->instructions) { - struct qinst *inst = (struct qinst *)node; - + list_for_each_entry(struct qinst, inst, &c->instructions, link) { switch (inst->op) { case QOP_FRAG_Z: ra_set_node_reg(g, temp_to_node[inst->dst.index], diff --git a/src/gallium/drivers/vc4/vc4_reorder_uniforms.c b/src/gallium/drivers/vc4/vc4_reorder_uniforms.c index 109724369d5..7f11fba2340 100644 --- a/src/gallium/drivers/vc4/vc4_reorder_uniforms.c +++ b/src/gallium/drivers/vc4/vc4_reorder_uniforms.c @@ -42,10 +42,8 @@ qir_reorder_uniforms(struct vc4_compile *c) uint32_t *uniform_index = NULL; uint32_t uniform_index_size = 0; uint32_t next_uniform = 0; - struct simple_node *node; - foreach(node, &c->instructions) { - struct qinst *inst = (struct qinst *)node; + list_for_each_entry(struct qinst, inst, &c->instructions, link) { for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { if (inst->src[i].file != QFILE_UNIF) continue; diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c index 3f180d5845d..cab76406055 100644 --- a/src/gallium/drivers/vc4/vc4_resource.c +++ b/src/gallium/drivers/vc4/vc4_resource.c @@ -26,6 +26,7 @@ #include "util/u_format.h" #include "util/u_inlines.h" #include "util/u_surface.h" +#include "util/u_upload_mgr.h" #include "vc4_screen.h" #include "vc4_context.h" @@ -161,6 +162,8 @@ vc4_resource_transfer_map(struct pipe_context *pctx, /* We need to align the box to utile boundaries, since that's * what load/store operate on. */ + uint32_t orig_width = ptrans->box.width; + uint32_t orig_height = ptrans->box.height; uint32_t box_start_x = ptrans->box.x & (utile_w - 1); uint32_t box_start_y = ptrans->box.y & (utile_h - 1); ptrans->box.width += box_start_x; @@ -174,7 +177,9 @@ vc4_resource_transfer_map(struct pipe_context *pctx, ptrans->layer_stride = ptrans->stride; trans->map = malloc(ptrans->stride * ptrans->box.height); - if (usage & PIPE_TRANSFER_READ) { + if (usage & PIPE_TRANSFER_READ || + ptrans->box.width != orig_width || + ptrans->box.height != orig_height) { vc4_load_tiled_image(trans->map, ptrans->stride, buf + slice->offset + box->z * rsc->cube_map_stride, @@ -638,41 +643,37 @@ vc4_update_shadow_baselevel_texture(struct pipe_context *pctx, * was in user memory, it would be nice to not have uploaded it to a VBO * before translating. 
*/ -void -vc4_update_shadow_index_buffer(struct pipe_context *pctx, - const struct pipe_index_buffer *ib) +struct pipe_resource * +vc4_get_shadow_index_buffer(struct pipe_context *pctx, + const struct pipe_index_buffer *ib, + uint32_t count, + uint32_t *shadow_offset) { - struct vc4_resource *shadow = vc4_resource(ib->buffer); - struct vc4_resource *orig = vc4_resource(shadow->shadow_parent); - uint32_t count = shadow->base.b.width0 / 2; - - if (shadow->writes == orig->writes) - return; - + struct vc4_context *vc4 = vc4_context(pctx); + struct vc4_resource *orig = vc4_resource(ib->buffer); perf_debug("Fallback conversion for %d uint indices\n", count); + void *data; + struct pipe_resource *shadow_rsc = NULL; + u_upload_alloc(vc4->uploader, 0, count * 2, + shadow_offset, &shadow_rsc, &data); + uint16_t *dst = data; + struct pipe_transfer *src_transfer; uint32_t *src = pipe_buffer_map_range(pctx, &orig->base.b, ib->offset, count * 4, PIPE_TRANSFER_READ, &src_transfer); - struct pipe_transfer *dst_transfer; - uint16_t *dst = pipe_buffer_map_range(pctx, &shadow->base.b, - 0, - count * 2, - PIPE_TRANSFER_WRITE, &dst_transfer); - for (int i = 0; i < count; i++) { uint32_t src_index = src[i]; assert(src_index <= 0xffff); dst[i] = src_index; } - pctx->transfer_unmap(pctx, dst_transfer); pctx->transfer_unmap(pctx, src_transfer); - shadow->writes = orig->writes; + return shadow_rsc; } void diff --git a/src/gallium/drivers/vc4/vc4_resource.h b/src/gallium/drivers/vc4/vc4_resource.h index 2ed848bc7b9..ab8f5d3cd55 100644 --- a/src/gallium/drivers/vc4/vc4_resource.h +++ b/src/gallium/drivers/vc4/vc4_resource.h @@ -26,7 +26,7 @@ #define VC4_RESOURCE_H #include "vc4_screen.h" -#include "vc4_packet.h" +#include "kernel/vc4_packet.h" #include "util/u_transfer.h" struct vc4_transfer { @@ -45,7 +45,6 @@ struct vc4_resource_slice { struct vc4_surface { struct pipe_surface base; uint32_t offset; - uint32_t stride; uint8_t tiling; }; @@ -107,8 +106,10 @@ struct pipe_resource *vc4_resource_create(struct pipe_screen *pscreen, const struct pipe_resource *tmpl); void vc4_update_shadow_baselevel_texture(struct pipe_context *pctx, struct pipe_sampler_view *view); -void vc4_update_shadow_index_buffer(struct pipe_context *pctx, - const struct pipe_index_buffer *ib); +struct pipe_resource *vc4_get_shadow_index_buffer(struct pipe_context *pctx, + const struct pipe_index_buffer *ib, + uint32_t count, + uint32_t *offset); void vc4_dump_surface(struct pipe_surface *psurf); #endif /* VC4_RESOURCE_H */ diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index 84aae918326..f63bead0fbb 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -175,6 +175,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_POLYGON_OFFSET_CLAMP: case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: return 0; /* Stream output. 
*/ @@ -322,6 +323,7 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: return 0; case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: @@ -458,7 +460,7 @@ vc4_screen_create(int fd) pscreen->is_format_supported = vc4_screen_is_format_supported; screen->fd = fd; - make_empty_list(&screen->bo_cache.time_list); + list_inithead(&screen->bo_cache.time_list); vc4_fence_init(screen); diff --git a/src/gallium/drivers/vc4/vc4_screen.h b/src/gallium/drivers/vc4/vc4_screen.h index 60626285d4d..5992e371093 100644 --- a/src/gallium/drivers/vc4/vc4_screen.h +++ b/src/gallium/drivers/vc4/vc4_screen.h @@ -27,7 +27,7 @@ #include "pipe/p_screen.h" #include "os/os_thread.h" #include "state_tracker/drm_driver.h" -#include "vc4_qir.h" +#include "util/list.h" struct vc4_bo; @@ -61,13 +61,19 @@ struct vc4_screen { struct vc4_bo_cache { /** List of struct vc4_bo freed, by age. */ - struct simple_node time_list; + struct list_head time_list; /** List of struct vc4_bo freed, per size, by age. */ - struct simple_node *size_list; + struct list_head *size_list; uint32_t size_list_size; pipe_mutex lock; + + uint32_t bo_size; + uint32_t bo_count; } bo_cache; + + uint32_t bo_size; + uint32_t bo_count; }; static inline struct vc4_screen * diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c index 2f72e722fc5..b58013dd2ee 100644 --- a/src/gallium/drivers/vc4/vc4_simulator.c +++ b/src/gallium/drivers/vc4/vc4_simulator.c @@ -39,11 +39,13 @@ vc4_wrap_bo_with_cma(struct drm_device *dev, struct vc4_bo *bo) { struct vc4_context *vc4 = dev->vc4; struct vc4_screen *screen = vc4->screen; - struct drm_gem_cma_object *obj = CALLOC_STRUCT(drm_gem_cma_object); + struct drm_vc4_bo *drm_bo = CALLOC_STRUCT(drm_vc4_bo); + struct drm_gem_cma_object *obj = &drm_bo->base; uint32_t size = align(bo->size, 4096); - obj->bo = bo; + drm_bo->bo = bo; obj->base.size = size; + obj->base.dev = dev; obj->vaddr = screen->simulator_mem_base + dev->simulator_mem_next; obj->paddr = simpenrose_hw_addr(obj->vaddr); @@ -94,7 +96,7 @@ vc4_simulator_unpin_bos(struct vc4_exec_info *exec) { for (int i = 0; i < exec->bo_count; i++) { struct drm_gem_cma_object *obj = exec->bo[i].bo; - struct vc4_bo *bo = obj->bo; + struct vc4_bo *bo = to_vc4_bo(&obj->base)->bo; memcpy(bo->map, obj->vaddr, bo->size); @@ -124,6 +126,7 @@ vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args) int ret; memset(&exec, 0, sizeof(exec)); + list_inithead(&exec.unref_list); if (ctex && ctex->bo->simulator_winsys_map) { #if 0 @@ -176,8 +179,12 @@ vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args) if (ret) return ret; - vc4_bo_unreference(&exec.exec_bo->bo); - free(exec.exec_bo); + list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec.unref_list, + unref_head) { + list_del(&bo->unref_head); + vc4_bo_unreference(&bo->bo); + free(bo); + } if (ctex && ctex->bo->simulator_winsys_map) { for (int y = 0; y < ctex->base.b.height0; y++) { diff --git a/src/gallium/drivers/vc4/vc4_simulator_validate.h b/src/gallium/drivers/vc4/vc4_simulator_validate.h index 1f0c6b67c0f..2bb36b253bb 100644 --- a/src/gallium/drivers/vc4/vc4_simulator_validate.h +++ b/src/gallium/drivers/vc4/vc4_simulator_validate.h @@ -43,6 +43,7 @@ struct vc4_exec_info; #define kfree(ptr) free(ptr) #define 
krealloc(ptr, size, args) realloc(ptr, size) #define roundup(x, y) align(x, y) +#define round_up(x, y) align(x, y) #define max(x, y) MAX2(x, y) #define min(x, y) MIN2(x, y) #define BUG_ON(condition) assert(!(condition)) @@ -63,16 +64,27 @@ struct drm_device { uint32_t simulator_mem_next; }; -struct drm_gem_cma_object { - struct vc4_bo *bo; +struct drm_gem_object { + uint32_t size; + struct drm_device *dev; +}; - struct { - uint32_t size; - } base; +struct drm_gem_cma_object { + struct drm_gem_object base; uint32_t paddr; void *vaddr; }; +struct drm_vc4_bo { + struct drm_gem_cma_object base; + struct vc4_bo *bo; + struct list_head unref_head; +}; + +static inline struct drm_vc4_bo *to_vc4_bo(struct drm_gem_object *obj) +{ + return (struct drm_vc4_bo *)obj; +} struct drm_gem_cma_object * drm_gem_cma_create(struct drm_device *dev, size_t size); diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c index 80e963ea2ee..4a1d4c3a4d6 100644 --- a/src/gallium/drivers/vc4/vc4_state.c +++ b/src/gallium/drivers/vc4/vc4_state.c @@ -304,24 +304,8 @@ vc4_set_index_buffer(struct pipe_context *pctx, if (ib) { assert(!ib->user_buffer); - - if (ib->index_size == 4) { - struct pipe_resource tmpl = *ib->buffer; - assert(tmpl.format == PIPE_FORMAT_R8_UNORM); - assert(tmpl.height0 == 1); - tmpl.width0 = (tmpl.width0 - ib->offset) / 2; - struct pipe_resource *pshadow = - vc4_resource_create(&vc4->screen->base, &tmpl); - struct vc4_resource *shadow = vc4_resource(pshadow); - pipe_resource_reference(&shadow->shadow_parent, ib->buffer); - - pipe_resource_reference(&vc4->indexbuf.buffer, NULL); - vc4->indexbuf.buffer = pshadow; - vc4->indexbuf.index_size = 2; - } else { - pipe_resource_reference(&vc4->indexbuf.buffer, ib->buffer); - vc4->indexbuf.index_size = ib->index_size; - } + pipe_resource_reference(&vc4->indexbuf.buffer, ib->buffer); + vc4->indexbuf.index_size = ib->index_size; vc4->indexbuf.offset = ib->offset; } else { pipe_resource_reference(&vc4->indexbuf.buffer, NULL); @@ -538,6 +522,7 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, struct pipe_resource tmpl = shadow_parent->base.b; struct vc4_resource *clone; + tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET; tmpl.width0 = u_minify(tmpl.width0, so->u.tex.first_level); tmpl.height0 = u_minify(tmpl.height0, so->u.tex.first_level); tmpl.last_level = so->u.tex.last_level - so->u.tex.first_level; @@ -547,6 +532,8 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, clone->shadow_parent = &shadow_parent->base.b; /* Flag it as needing update of the contents from the parent. */ clone->writes = shadow_parent->writes - 1; + + assert(clone->vc4_format != VC4_TEXTURE_TYPE_RGBA32R); } so->texture = prsc; so->reference.count = 1;
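Most of the QIR and QPU-scheduler churn above is a single mechanical migration: util/simple_list.h's foreach()/insert_at_tail()/remove_from_list() are replaced by util/list.h's list_for_each_entry()/list_addtail()/list_del(), eliminating the manual (struct qinst *)node casts. The following is a simplified, self-contained model of the new idiom; the struct and macros are illustrative reimplementations written for this note, not the actual util/list.h header, though the type-as-first-argument form of list_for_each_entry() matches the calls in the hunks above.

#include <stddef.h>
#include <stdio.h>

/* Intrusive doubly-linked list with a sentinel head, as in util/list.h
 * (illustrative reimplementation). */
struct list_head {
        struct list_head *prev, *next;
};

static void
list_inithead(struct list_head *list)
{
        list->prev = list->next = list;
}

static void
list_addtail(struct list_head *item, struct list_head *list)
{
        item->next = list;
        item->prev = list->prev;
        list->prev->next = item;
        list->prev = item;
}

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

/* Declares the cursor with the right type, so callers no longer cast. */
#define list_for_each_entry(type, pos, head, member)                   \
        for (type *pos = container_of((head)->next, type, member);     \
             &pos->member != (head);                                   \
             pos = container_of(pos->member.next, type, member))

struct demo_inst {
        struct list_head link;
        int op;
};

int
main(void)
{
        struct list_head instructions;
        struct demo_inst a = { .op = 1 }, b = { .op = 2 };

        list_inithead(&instructions);
        list_addtail(&a.link, &instructions);
        list_addtail(&b.link, &instructions);

        list_for_each_entry(struct demo_inst, inst, &instructions, link)
                printf("op %d\n", inst->op);

        return 0;
}

One consequence is visible in the vc4_opt_dead_code.c hunk: reverse iteration still walks raw struct list_head pointers by hand, since this form of list_for_each_entry() only goes forward.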