Diffstat (limited to 'src/gallium/drivers')
414 files changed, 16038 insertions, 7910 deletions
diff --git a/src/gallium/drivers/freedreno/Android.mk b/src/gallium/drivers/freedreno/Android.mk
index a6712b2c115..ed51835e1fb 100644
--- a/src/gallium/drivers/freedreno/Android.mk
+++ b/src/gallium/drivers/freedreno/Android.mk
@@ -28,7 +28,9 @@ include $(CLEAR_VARS)
 LOCAL_SRC_FILES := \
 	$(C_SOURCES) \
 	$(a2xx_SOURCES) \
-	$(a3xx_SOURCES)
+	$(a3xx_SOURCES) \
+	$(a4xx_SOURCES) \
+	$(ir3_SOURCES)
 
 LOCAL_CFLAGS := \
 	-Wno-packed-bitfield-compat
@@ -37,6 +39,7 @@ LOCAL_C_INCLUDES := \
 	$(LOCAL_PATH)/ir3
 
 LOCAL_SHARED_LIBRARIES := libdrm libdrm_freedreno
+LOCAL_STATIC_LIBRARIES := libmesa_glsl
 LOCAL_MODULE := libmesa_pipe_freedreno
 
 include $(GALLIUM_COMMON_MK)
diff --git a/src/gallium/drivers/freedreno/Makefile.am b/src/gallium/drivers/freedreno/Makefile.am
index cbf62c6daae..dff95ba5270 100644
--- a/src/gallium/drivers/freedreno/Makefile.am
+++ b/src/gallium/drivers/freedreno/Makefile.am
@@ -1,5 +1,3 @@
-AUTOMAKE_OPTIONS = subdir-objects
-
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
diff --git a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
index f4f6b94c1ea..c4516baf2ec 100644
--- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
+++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
@@ -8,15 +8,15 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2013-11-30 14:47:15)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2013-03-31 16:51:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2014-06-02 15:21:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2014-11-13 22:44:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14895 bytes, from 2015-04-19 15:23:28)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-04-12 18:16:35)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 59314 bytes, from 2015-04-19 16:21:40)
-
-Copyright (C) 2013-2014 by the following authors:
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63728 bytes, from 2015-08-05 18:07:28)
+
+Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <[email protected]> (robclark)
 
 Permission is hereby granted, free of charge, to any person obtaining
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_blend.h b/src/gallium/drivers/freedreno/a2xx/fd2_blend.h
index 7cafcd3747e..3c8d8f7c09f 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_blend.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_blend.h
@@ -39,7 +39,7 @@ struct fd2_blend_stateobj {
 	uint32_t rb_colormask;
 };
 
-static INLINE struct fd2_blend_stateobj *
+static inline struct fd2_blend_stateobj *
 fd2_blend_stateobj(struct pipe_blend_state *blend)
 {
 	return (struct fd2_blend_stateobj *)blend;
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_context.c b/src/gallium/drivers/freedreno/a2xx/fd2_context.c
index a0bf01ffd1f..6089ebc1516 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_context.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_context.c
@@ -67,7 +67,7 @@ create_solid_vertexbuf(struct pipe_context *pctx)
 }
 
 static const uint8_t a22x_primtypes[PIPE_PRIM_MAX] = {
-	[PIPE_PRIM_POINTS] = DI_PT_POINTLIST_A2XX,
+	[PIPE_PRIM_POINTS] = DI_PT_POINTLIST_PSIZE,
 	[PIPE_PRIM_LINES] = DI_PT_LINELIST,
 	[PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP,
 	[PIPE_PRIM_LINE_LOOP] = DI_PT_LINELOOP,
@@ -77,7 +77,7 @@ static const uint8_t a22x_primtypes[PIPE_PRIM_MAX] = {
 };
 
 static const uint8_t a20x_primtypes[PIPE_PRIM_MAX] = {
-	[PIPE_PRIM_POINTS] = DI_PT_POINTLIST_A2XX,
+	[PIPE_PRIM_POINTS] = DI_PT_POINTLIST_PSIZE,
 	[PIPE_PRIM_LINES] = DI_PT_LINELIST,
 	[PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP,
 	[PIPE_PRIM_TRIANGLES] = DI_PT_TRILIST,
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_context.h b/src/gallium/drivers/freedreno/a2xx/fd2_context.h
index de845f07a85..74147107930 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_context.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_context.h
@@ -40,7 +40,7 @@ struct fd2_context {
 	struct pipe_resource *solid_vertexbuf;
 };
 
-static INLINE struct fd2_context *
+static inline struct fd2_context *
 fd2_context(struct fd_context *ctx)
 {
 	return (struct fd2_context *)ctx;
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.h b/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.h
index adc0653132b..9e53cd3be75 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.h
@@ -43,7 +43,7 @@ struct fd2_rasterizer_stateobj {
 	uint32_t pa_su_sc_mode_cntl;
 };
 
-static INLINE struct fd2_rasterizer_stateobj *
+static inline struct fd2_rasterizer_stateobj *
 fd2_rasterizer_stateobj(struct pipe_rasterizer_state *rast)
 {
 	return (struct fd2_rasterizer_stateobj *)rast;
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_texture.h b/src/gallium/drivers/freedreno/a2xx/fd2_texture.h
index 4fffa08b3c3..5c9236851bd 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_texture.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_texture.h
@@ -42,7 +42,7 @@ struct fd2_sampler_stateobj {
 	uint32_t tex0, tex3, tex4, tex5;
 };
 
-static INLINE struct fd2_sampler_stateobj *
+static inline struct fd2_sampler_stateobj *
 fd2_sampler_stateobj(struct pipe_sampler_state *samp)
 {
 	return (struct fd2_sampler_stateobj *)samp;
@@ -54,7 +54,7 @@ struct fd2_pipe_sampler_view {
 	uint32_t tex0, tex2, tex3;
 };
 
-static INLINE struct fd2_pipe_sampler_view *
+static inline struct fd2_pipe_sampler_view *
 fd2_pipe_sampler_view(struct pipe_sampler_view *pview)
 {
 	return (struct fd2_pipe_sampler_view *)pview;
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_zsa.h b/src/gallium/drivers/freedreno/a2xx/fd2_zsa.h
index dda1e552174..15609ad0267 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_zsa.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_zsa.h
@@ -44,7 +44,7 @@ struct fd2_zsa_stateobj {
 	uint32_t rb_stencilrefmask_bf;
 };
 
-static INLINE struct fd2_zsa_stateobj *
+static inline struct fd2_zsa_stateobj *
 fd2_zsa_stateobj(struct pipe_depth_stencil_alpha_state *zsa)
 {
 	return (struct fd2_zsa_stateobj *)zsa;
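Nearly all of the header hunks above make the same two mechanical changes: Mesa's old INLINE wrapper macro gives way to plain C99 inline, applied to the small downcast helpers that each driver state object defines around its base gallium CSO struct. The idiom in isolation, as a minimal sketch (illustrative names, not the driver's actual types):

	#include <stdint.h>

	struct base_cso { uint32_t flags; };     /* stands in for e.g. pipe_blend_state */

	struct hw_blend_stateobj {
		struct base_cso base;            /* must stay the first member */
		uint32_t rb_colormask;           /* driver-specific packed state */
	};

	static inline struct hw_blend_stateobj *
	hw_blend_stateobj(struct base_cso *blend)
	{
		/* valid because 'base' is the first member of the wrapper */
		return (struct hw_blend_stateobj *)blend;
	}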
diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
index a3bc74eda85..8e8cf6a03f2 100644
--- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
+++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
@@ -8,13 +8,13 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2013-11-30 14:47:15)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2013-03-31 16:51:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2014-06-02 15:21:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2014-11-13 22:44:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14895 bytes, from 2015-04-19 15:23:28)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-04-12 18:16:35)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 59314 bytes, from 2015-04-19 16:21:40)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63728 bytes, from 2015-08-05 18:07:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <[email protected]> (robclark)
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_blend.h b/src/gallium/drivers/freedreno/a3xx/fd3_blend.h
index 4f6eeb74481..142df7c300f 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_blend.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_blend.h
@@ -32,6 +32,8 @@
 #include "pipe/p_state.h"
 #include "pipe/p_context.h"
 
+#include "freedreno_util.h"
+
 struct fd3_blend_stateobj {
 	struct pipe_blend_state base;
 	struct {
@@ -42,10 +44,10 @@ struct fd3_blend_stateobj {
 		/* Blend control bits for alpha channel */
 		uint32_t blend_control_alpha;
 		uint32_t control;
-	} rb_mrt[4];
+	} rb_mrt[A3XX_MAX_RENDER_TARGETS];
 };
 
-static INLINE struct fd3_blend_stateobj *
+static inline struct fd3_blend_stateobj *
 fd3_blend_stateobj(struct pipe_blend_state *blend)
 {
 	return (struct fd3_blend_stateobj *)blend;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.c b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
index 7e5a99ea571..dc33783e398 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
@@ -88,7 +88,7 @@ create_blit_texcoord_vertexbuf(struct pipe_context *pctx)
 }
 
 static const uint8_t primtypes[PIPE_PRIM_MAX] = {
-	[PIPE_PRIM_POINTS] = DI_PT_POINTLIST_A3XX,
+	[PIPE_PRIM_POINTS] = DI_PT_POINTLIST,
 	[PIPE_PRIM_LINES] = DI_PT_LINELIST,
 	[PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP,
 	[PIPE_PRIM_LINE_LOOP] = DI_PT_LINELOOP,
@@ -121,6 +121,7 @@ fd3_context_create(struct pipe_screen *pscreen, void *priv)
 	fd3_gmem_init(pctx);
 	fd3_texture_init(pctx);
 	fd3_prog_init(pctx);
+	fd3_emit_init(pctx);
 
 	pctx = fd_context_init(&fd3_ctx->base, pscreen, primtypes, priv);
 	if (!pctx)
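The primtypes tables that fd2/fd3/fd4 hand to fd_context_init() map gallium's PIPE_PRIM_* enum to the hardware's pc_di_primtype encoding via designated initializers, so any mode left out of the table stays zero and can be treated as unsupported. A compact stand-alone version of the idiom (enum values invented for the sketch, not the real DI_PT_* encodings):

	#include <stdint.h>
	#include <stdio.h>

	enum prim { PRIM_POINTS, PRIM_LINES, PRIM_TRIANGLES, PRIM_MAX };

	/* entries not listed are zero-initialized, doubling as "unsupported" */
	static const uint8_t hw_primtypes[PRIM_MAX] = {
		[PRIM_POINTS]    = 1,   /* stand-in for DI_PT_POINTLIST */
		[PRIM_TRIANGLES] = 4,   /* stand-in for DI_PT_TRILIST   */
	};

	int main(void)
	{
		enum prim mode = PRIM_LINES;
		if (!hw_primtypes[mode])
			fprintf(stderr, "unsupported primitive %d\n", mode);
		return 0;
	}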
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.h b/src/gallium/drivers/freedreno/a3xx/fd3_context.h
index 77e4605e550..6e20b2ff9bc 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.h
@@ -112,7 +112,7 @@ struct fd3_context {
 	struct ir3_shader_key last_key;
 };
 
-static INLINE struct fd3_context *
+static inline struct fd3_context *
 fd3_context(struct fd_context *ctx)
 {
 	return (struct fd3_context *)ctx;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
index b5838b58eb2..a9498835011 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@@ -60,6 +60,9 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	const struct pipe_draw_info *info = emit->info;
 	enum pc_di_primtype primtype = ctx->primtypes[info->mode];
 
+	if (!(fd3_emit_get_vp(emit) && fd3_emit_get_fp(emit)))
+		return;
+
 	fd3_emit_state(ctx, ring, emit);
 
 	if (emit->dirty & (FD_DIRTY_VTXBUF | FD_DIRTY_VTXSTATE))
@@ -79,8 +82,8 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
 			info->restart_index : 0xffffffff);
 
 	if (ctx->rasterizer && ctx->rasterizer->point_size_per_vertex &&
-			info->mode == PIPE_PRIM_POINTS)
-		primtype = DI_PT_POINTLIST_A2XX;
+			(info->mode == PIPE_PRIM_POINTS))
+		primtype = DI_PT_POINTLIST_PSIZE;
 
 	fd_draw_emit(ctx, ring, primtype,
@@ -240,10 +243,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
 		.vtx = &fd3_ctx->solid_vbuf_state,
 		.prog = &ctx->solid_prog,
 		.key = {
-			.half_precision = (fd3_half_precision(pfb->cbufs[0]) &&
-					fd3_half_precision(pfb->cbufs[1]) &&
-					fd3_half_precision(pfb->cbufs[2]) &&
-					fd3_half_precision(pfb->cbufs[3])),
+			.half_precision = fd_half_precision(pfb),
 		},
 	};
@@ -321,7 +321,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
 				A3XX_RB_STENCIL_CONTROL_ZFAIL_BF(STENCIL_KEEP));
 	}
 
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < A3XX_MAX_RENDER_TARGETS; i++) {
 		OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(i), 1);
 		OUT_RING(ring, A3XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY) |
 				A3XX_RB_MRT_CONTROL_DITHER_MODE(DITHER_ALWAYS) |
@@ -342,7 +342,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
 
 	fd3_emit_vertex_bufs(ring, &emit);
 
-	fd3_emit_constant(ring, SB_FRAG_SHADER, 0, 0, 4, color->ui, NULL);
+	fd3_emit_const(ring, SHADER_FRAGMENT, 0, 0, 4, color->ui, NULL);
 
 	OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1);
 	OUT_RING(ring, A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(0) |
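fd3_clear() (and the gmem restore path further down) used to and together a per-surface fd3_half_precision() check for each of the four cbufs by hand; both callers now use a single shared fd_half_precision(pfb) helper. Judging from the fd3_half_precision() body removed from fd3_format.h below, the shared helper plausibly amounts to this loop (a sketch under that assumption, not the verbatim core function):

	#include <stdbool.h>
	#include "pipe/p_state.h"
	#include "util/u_format.h"

	/* half_precision is only safe if every bound color buffer survives
	 * the f32 -> f16 round trip that clear/blit colors go through */
	static bool
	half_precision_ok(const struct pipe_framebuffer_state *pfb)
	{
		unsigned i;
		for (i = 0; i < pfb->nr_cbufs; i++) {
			const struct pipe_surface *surf = pfb->cbufs[i];
			if (!surf)
				continue;
			/* pure-int values would be mangled by cov.f32f16: */
			if (util_format_is_pure_integer(surf->format))
				return false;
			/* don't silently truncate 32-bit float render targets: */
			if (util_format_is_float(surf->format) &&
					util_format_get_component_bits(surf->format,
						UTIL_FORMAT_COLORSPACE_RGB, 0) == 32)
				return false;
		}
		return true;
	}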
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
index 07cc2266d08..752e7f88cb9 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -43,19 +43,26 @@
 #include "fd3_format.h"
 #include "fd3_zsa.h"
 
+static const enum adreno_state_block sb[] = {
+	[SHADER_VERTEX] = SB_VERT_SHADER,
+	[SHADER_FRAGMENT] = SB_FRAG_SHADER,
+};
+
 /* regid: base const register
  * prsc or dwords: buffer containing constant values
  * sizedwords: size of const value buffer
  */
 void
-fd3_emit_constant(struct fd_ringbuffer *ring,
-		enum adreno_state_block sb,
+fd3_emit_const(struct fd_ringbuffer *ring, enum shader_t type,
 		uint32_t regid, uint32_t offset, uint32_t sizedwords,
 		const uint32_t *dwords, struct pipe_resource *prsc)
 {
 	uint32_t i, sz;
 	enum adreno_state_src src;
 
+	debug_assert((regid % 4) == 0);
+	debug_assert((sizedwords % 4) == 0);
+
 	if (prsc) {
 		sz = 0;
 		src = SS_INDIRECT;
@@ -67,7 +74,7 @@ fd3_emit_constant(struct fd_ringbuffer *ring,
 	OUT_PKT3(ring, CP_LOAD_STATE, 2 + sz);
 	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/2) |
 			CP_LOAD_STATE_0_STATE_SRC(src) |
-			CP_LOAD_STATE_0_STATE_BLOCK(sb) |
+			CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) |
 			CP_LOAD_STATE_0_NUM_UNIT(sizedwords/2));
 	if (prsc) {
 		struct fd_bo *bo = fd_resource(prsc)->bo;
@@ -84,89 +91,31 @@
 }
 
 static void
-emit_constants(struct fd_ringbuffer *ring,
-		enum adreno_state_block sb,
-		struct fd_constbuf_stateobj *constbuf,
-		struct ir3_shader_variant *shader,
-		bool emit_immediates)
+fd3_emit_const_bo(struct fd_ringbuffer *ring, enum shader_t type, boolean write,
+		uint32_t regid, uint32_t num, struct fd_bo **bos, uint32_t *offsets)
 {
-	uint32_t enabled_mask = constbuf->enabled_mask;
-	uint32_t max_const;
-	int i;
-
-	// XXX TODO only emit dirty consts.. but we need to keep track if
-	// they are clobbered by a clear, gmem2mem, or mem2gmem..
-	constbuf->dirty_mask = enabled_mask;
-
-	/* in particular, with binning shader we may end up with unused
-	 * consts, ie. we could end up w/ constlen that is smaller
-	 * than first_immediate. In that case truncate the user consts
-	 * early to avoid HLSQ lockup caused by writing too many consts
-	 */
-	max_const = MIN2(shader->first_driver_param, shader->constlen);
-
-	/* emit user constants: */
-	if (enabled_mask & 1) {
-		const unsigned index = 0;
-		struct pipe_constant_buffer *cb = &constbuf->cb[index];
-		unsigned size = align(cb->buffer_size, 4) / 4; /* size in dwords */
+	uint32_t i;
 
-		// I expect that size should be a multiple of vec4's:
-		assert(size == align(size, 4));
+	debug_assert((regid % 4) == 0);
+	debug_assert((num % 4) == 0);
 
-		/* and even if the start of the const buffer is before
-		 * first_immediate, the end may not be:
-		 */
-		size = MIN2(size, 4 * max_const);
-
-		if (size && constbuf->dirty_mask & (1 << index)) {
-			fd3_emit_constant(ring, sb, 0,
-					cb->buffer_offset, size,
-					cb->user_buffer, cb->buffer);
-			constbuf->dirty_mask &= ~(1 << index);
-		}
-
-		enabled_mask &= ~(1 << index);
-	}
-
-	if (shader->constlen > shader->first_driver_param) {
-		uint32_t params = MIN2(4, shader->constlen - shader->first_driver_param);
-		/* emit ubos: */
-		OUT_PKT3(ring, CP_LOAD_STATE, 2 + params * 4);
-		OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(shader->first_driver_param * 2) |
-				CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
-				CP_LOAD_STATE_0_STATE_BLOCK(sb) |
-				CP_LOAD_STATE_0_NUM_UNIT(params * 2));
-		OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
-				CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS));
-
-		for (i = 1; i <= params * 4; i++) {
-			struct pipe_constant_buffer *cb = &constbuf->cb[i];
-			assert(!cb->user_buffer);
-			if ((enabled_mask & (1 << i)) && cb->buffer)
-				OUT_RELOC(ring, fd_resource(cb->buffer)->bo, cb->buffer_offset, 0, 0);
-			else
-				OUT_RING(ring, 0xbad00000 | ((i - 1) << 16));
-		}
-	}
-
-	/* emit shader immediates: */
-	if (shader && emit_immediates) {
-		int size = shader->immediates_count;
-		uint32_t base = shader->first_immediate;
-
-		/* truncate size to avoid writing constants that shader
-		 * does not use:
-		 */
-		size = MIN2(size + base, shader->constlen) - base;
-
-		/* convert out of vec4: */
-		base *= 4;
-		size *= 4;
-
-		if (size > 0) {
-			fd3_emit_constant(ring, sb, base,
-					0, size, shader->immediates[0].val, NULL);
+	OUT_PKT3(ring, CP_LOAD_STATE, 2 + num);
+	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/2) |
+			CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
+			CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) |
+			CP_LOAD_STATE_0_NUM_UNIT(num/2));
+	OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
+			CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS));
+
+	for (i = 0; i < num; i++) {
+		if (bos[i]) {
+			if (write) {
+				OUT_RELOCW(ring, bos[i], offsets[i], 0, 0);
+			} else {
+				OUT_RELOC(ring, bos[i], offsets[i], 0, 0);
+			}
+		} else {
+			OUT_RING(ring, 0xbad00000 | (i << 16));
 		}
 	}
 }
@@ -302,14 +251,15 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 				CP_LOAD_STATE_1_EXT_SRC_ADDR(0));
 		for (i = 0; i < tex->num_textures; i++) {
 			static const struct fd3_pipe_sampler_view dummy_view = {
+					.base.target = PIPE_TEXTURE_1D, /* anything !PIPE_BUFFER */
 					.base.u.tex.first_level = 1,
 			};
 			const struct fd3_pipe_sampler_view *view = tex->textures[i] ?
 					fd3_pipe_sampler_view(tex->textures[i]) : &dummy_view;
 			struct fd_resource *rsc = fd_resource(view->base.texture);
-			unsigned start = view->base.u.tex.first_level;
-			unsigned end   = view->base.u.tex.last_level;
+			unsigned start = fd_sampler_first_level(&view->base);
+			unsigned end   = fd_sampler_last_level(&view->base);;
 
 			for (j = 0; j < (end - start + 1); j++) {
 				struct fd_resource_slice *slice =
@@ -392,6 +342,7 @@ fd3_emit_gmem_restore_tex(struct fd_ringbuffer *ring,
 			format = fd3_gmem_restore_format(rsc->base.b.format);
 		}
 
+		/* note: PIPE_BUFFER disallowed for surfaces */
 		unsigned lvl = psurf[i]->u.tex.level;
 		struct fd_resource_slice *slice = fd_resource_slice(rsc, lvl);
@@ -444,7 +395,9 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit)
 	uint32_t total_in = 0;
 	const struct fd_vertex_state *vtx = emit->vtx;
 	struct ir3_shader_variant *vp = fd3_emit_get_vp(emit);
-	unsigned vertex_regid = regid(63, 0), instance_regid = regid(63, 0);
+	unsigned vertex_regid = regid(63, 0);
+	unsigned instance_regid = regid(63, 0);
+	unsigned vtxcnt_regid = regid(63, 0);
 
 	for (i = 0; i < vp->inputs_count; i++) {
 		uint8_t semantic = sem2name(vp->inputs[i].semantic);
@@ -452,14 +405,17 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit)
 			vertex_regid = vp->inputs[i].regid;
 		else if (semantic == TGSI_SEMANTIC_INSTANCEID)
 			instance_regid = vp->inputs[i].regid;
+		else if (semantic == IR3_SEMANTIC_VTXCNT)
+			vtxcnt_regid = vp->inputs[i].regid;
 		else if (i < vtx->vtx->num_elements && vp->inputs[i].compmask)
 			last = i;
 	}
 
 	/* hw doesn't like to be configured for zero vbo's, it seems: */
-	if (vtx->vtx->num_elements == 0 &&
-			vertex_regid == regid(63, 0) &&
-			instance_regid == regid(63, 0))
+	if ((vtx->vtx->num_elements == 0) &&
+			(vertex_regid == regid(63, 0)) &&
+			(instance_regid == regid(63, 0)) &&
+			(vtxcnt_regid == regid(63, 0)))
 		return;
 
 	for (i = 0, j = 0; i <= last; i++) {
@@ -472,8 +428,9 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit)
 			enum pipe_format pfmt = elem->src_format;
 			enum a3xx_vtx_fmt fmt = fd3_pipe2vtx(pfmt);
 			bool switchnext = (i != last) ||
-					vertex_regid != regid(63, 0) ||
-					instance_regid != regid(63, 0);
+					(vertex_regid != regid(63, 0)) ||
+					(instance_regid != regid(63, 0)) ||
+					(vtxcnt_regid != regid(63, 0));
 			bool isint = util_format_is_pure_integer(pfmt);
 			uint32_t fs = util_format_get_blocksize(pfmt);
@@ -512,6 +469,10 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit)
 	OUT_RING(ring, A3XX_VFD_CONTROL_1_MAXSTORAGE(1) | // XXX
 			A3XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) |
 			A3XX_VFD_CONTROL_1_REGID4INST(instance_regid));
+
+	OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1);
+	OUT_RING(ring, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) |
+			A3XX_VFD_VS_THREADING_THRESHOLD_REGID_VTXCNT(vtxcnt_regid));
 }
 
 void
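A note on the vtxcnt plumbing above: the new IR3_SEMANTIC_VTXCNT input follows the driver's existing convention that regid(63, 0) means "no register allocated". Every consumer initializes its regid to that sentinel and only programs the hardware differently when a shader variant actually claimed the input. The convention in isolation (REGID here is a stand-in for the driver's regid() macro, encoding assumed):

	#include <stdbool.h>
	#include <stdint.h>

	/* stand-in for regid(num, comp); r63.x is out of range for real
	 * allocations, so it doubles as an "unused" sentinel */
	#define REGID(num, comp) (((uint32_t)(num) << 2) | (comp))
	#define REGID_UNUSED     REGID(63, 0)

	struct vtx_regs {
		uint32_t vertex_regid, instance_regid, vtxcnt_regid;
	};

	static bool
	any_sysval_used(const struct vtx_regs *r)
	{
		return (r->vertex_regid != REGID_UNUSED) ||
		       (r->instance_regid != REGID_UNUSED) ||
		       (r->vtxcnt_regid != REGID_UNUSED);
	}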
@@ -669,33 +630,12 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	OUT_PKT3(ring, CP_EVENT_WRITE, 1);
 	OUT_RING(ring, HLSQ_FLUSH);
 
-	if ((dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) &&
-			/* evil hack to deal sanely with clear path: */
-			(emit->prog == &ctx->prog)) {
-		fd_wfi(ctx, ring);
-		emit_constants(ring, SB_VERT_SHADER,
-				&ctx->constbuf[PIPE_SHADER_VERTEX],
-				vp, emit->prog->dirty & FD_SHADER_DIRTY_VP);
-		if (!emit->key.binning_pass) {
-			emit_constants(ring, SB_FRAG_SHADER,
-					&ctx->constbuf[PIPE_SHADER_FRAGMENT],
-					fp, emit->prog->dirty & FD_SHADER_DIRTY_FP);
-		}
-	}
-
-	/* emit driver params every time */
-	if (emit->info && emit->prog == &ctx->prog) {
-		uint32_t vertex_params[4] = {
-			emit->info->indexed ? emit->info->index_bias : emit->info->start,
-			0,
-			0,
-			0
-		};
-		if (vp->constlen >= vp->first_driver_param + 4) {
-			fd3_emit_constant(ring, SB_VERT_SHADER,
-					(vp->first_driver_param + 4) * 4,
-					0, 4, vertex_params, NULL);
-		}
+	if (emit->prog == &ctx->prog) { /* evil hack to deal sanely with clear path */
+		ir3_emit_consts(vp, ring, emit->info, dirty);
+		if (!emit->key.binning_pass)
+			ir3_emit_consts(fp, ring, emit->info, dirty);
+		/* mark clean after emitting consts: */
+		ctx->prog.dirty = 0;
 	}
 
 	if ((dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) && ctx->blend) {
@@ -930,3 +870,11 @@ fd3_emit_restore(struct fd_context *ctx)
 
 	ctx->needs_rb_fbd = true;
 }
+
+void
+fd3_emit_init(struct pipe_context *pctx)
+{
+	struct fd_context *ctx = fd_context(pctx);
+	ctx->emit_const = fd3_emit_const;
+	ctx->emit_const_bo = fd3_emit_const_bo;
+}
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
index 8f21919c9a7..795654706a7 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
@@ -37,10 +37,8 @@
 #include "ir3_shader.h"
 
 struct fd_ringbuffer;
-enum adreno_state_block;
 
-void fd3_emit_constant(struct fd_ringbuffer *ring,
-		enum adreno_state_block sb,
+void fd3_emit_const(struct fd_ringbuffer *ring, enum shader_t type,
 		uint32_t regid, uint32_t offset, uint32_t sizedwords,
 		const uint32_t *dwords, struct pipe_resource *prsc);
 
@@ -90,4 +88,6 @@ void fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
 void fd3_emit_restore(struct fd_context *ctx);
 
+void fd3_emit_init(struct pipe_context *pctx);
+
 #endif /* FD3_EMIT_H */
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_format.h b/src/gallium/drivers/freedreno/a3xx/fd3_format.h
index 6afc3015901..05c5ea3d247 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_format.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_format.h
@@ -41,27 +41,4 @@ enum a3xx_color_swap fd3_pipe2swap(enum pipe_format format);
 uint32_t fd3_tex_swiz(enum pipe_format format, unsigned swizzle_r,
 		unsigned swizzle_g, unsigned swizzle_b, unsigned swizzle_a);
 
-static INLINE bool
-fd3_half_precision(const struct pipe_surface *surface)
-{
-	enum pipe_format format;
-
-	if (!surface)
-		return true;
-
-	format = surface->format;
-
-	/* colors are provided in consts, which go through cov.f32f16, which will
-	 * break these values
-	 */
-	if (util_format_is_pure_integer(format))
-		return false;
-
-	/* avoid losing precision on 32-bit float formats */
-	if (util_format_is_float(format) &&
-			util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0) == 32)
-		return false;
-
-	return true;
-}
-
 #endif /* FD3_FORMAT_H_ */
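fd3_emit_init() is the piece that lets generation-independent code (the new ir3_emit_consts path) upload constants without knowing the a3xx packet format: the shared fd_context now carries emit_const/emit_const_bo function pointers that each generation fills in at context creation. A condensed sketch of that indirection (struct and enum abbreviated from what the diff shows, not the full definitions):

	#include <stdint.h>

	struct fd_ringbuffer;
	struct pipe_resource;
	enum shader_t { SHADER_VERTEX, SHADER_FRAGMENT };

	/* sketch of the per-generation hooks wired up by fd3_emit_init()
	 * and fd4_emit_init(); other context fields elided */
	struct fd_context_sketch {
		void (*emit_const)(struct fd_ringbuffer *ring, enum shader_t type,
				uint32_t regid, uint32_t offset, uint32_t sizedwords,
				const uint32_t *dwords, struct pipe_resource *prsc);
		/* ... emit_const_bo has the matching BO-array signature ... */
	};

	/* generation-independent code then only ever calls the hook: */
	static void
	upload_clear_color(struct fd_context_sketch *ctx,
			struct fd_ringbuffer *ring, const uint32_t color[4])
	{
		/* both generations assert vec4 alignment of regid/size */
		ctx->emit_const(ring, SHADER_FRAGMENT, 0, 0, 4, color, NULL);
	}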
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
index 7d3975761dd..9a5b45e2fcb 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
@@ -57,7 +57,7 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 		tile_mode = LINEAR;
 	}
 
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < A3XX_MAX_RENDER_TARGETS; i++) {
 		enum pipe_format pformat = 0;
 		enum a3xx_color_fmt format = 0;
 		enum a3xx_color_swap swap = WZYX;
@@ -537,10 +537,7 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 			/* NOTE: They all use the same VP, this is for vtx bufs. */
 			.prog = &ctx->blit_prog[0],
 			.key = {
-				.half_precision = (fd3_half_precision(pfb->cbufs[0]) &&
-						fd3_half_precision(pfb->cbufs[1]) &&
-						fd3_half_precision(pfb->cbufs[2]) &&
-						fd3_half_precision(pfb->cbufs[3]))
+				.half_precision = fd_half_precision(pfb),
 			},
 	};
 	float x0, y0, x1, y1;
@@ -654,6 +651,7 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 
 	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_COLOR)) {
 		emit.prog = &ctx->blit_prog[pfb->nr_cbufs - 1];
+		emit.fp = NULL;      /* frag shader changed so clear cache */
 		fd3_program_emit(ring, &emit, pfb->nr_cbufs, pfb->cbufs);
 		emit_mem2gmem_surf(ctx, gmem->cbuf_base, pfb->cbufs, pfb->nr_cbufs, bin_w);
 	}
@@ -674,6 +672,7 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 			emit.prog = &ctx->blit_zs;
 			emit.key.half_precision = false;
 		}
+		emit.fp = NULL;      /* frag shader changed so clear cache */
 		fd3_program_emit(ring, &emit, 1, &pfb->zsbuf);
 		emit_mem2gmem_surf(ctx, gmem->zsbuf_base, &pfb->zsbuf, 1, bin_w);
 	}
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
index 57fcaa9020e..b5360797745 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@@ -51,7 +51,7 @@ create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state
 		enum shader_t type)
 {
 	struct fd3_shader_stateobj *so = CALLOC_STRUCT(fd3_shader_stateobj);
-	so->shader = ir3_shader_create(pctx, cso->tokens, type);
+	so->shader = ir3_shader_create(pctx, cso, type);
 	return so;
 }
 
@@ -136,6 +136,8 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
 	int constmode;
 	int i, j, k;
 
+	debug_assert(nr <= ARRAY_SIZE(color_regid));
+
 	vp = fd3_emit_get_vp(emit);
 
 	if (emit->key.binning_pass) {
@@ -202,12 +204,12 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
 		color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] =
 			ir3_find_output_regid(fp, ir3_semantic_name(TGSI_SEMANTIC_COLOR, 0));
 	} else {
-		for (int i = 0; i < fp->outputs_count; i++) {
+		for (i = 0; i < fp->outputs_count; i++) {
 			ir3_semantic sem = fp->outputs[i].semantic;
 			unsigned idx = sem2idx(sem);
 			if (sem2name(sem) != TGSI_SEMANTIC_COLOR)
 				continue;
-			assert(idx < 4);
+			debug_assert(idx < ARRAY_SIZE(color_regid));
 			color_regid[idx] = fp->outputs[i].regid;
 		}
 	}
@@ -449,10 +451,6 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
 		OUT_RING(ring, flatshade[1]);        /* SP_FS_FLAT_SHAD_MODE_REG_1 */
 	}
 
-	OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1);
-	OUT_RING(ring, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) |
-			A3XX_VFD_VS_THREADING_THRESHOLD_REGID_VTXCNT(252));
-
 	if (vpbuffer == BUFFER)
 		emit_shader(ring, vp);
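The two "emit.fp = NULL" lines in fd3_gmem.c deserve a note: struct fd3_emit memoizes the vp/fp shader-variant lookups behind fd3_emit_get_vp()/fd3_emit_get_fp(), and the mem2gmem path swaps emit.prog mid-stream, so the cached fragment variant has to be dropped before the next fd3_program_emit(). The shape of the pattern, with hypothetical names rather than the driver's structs:

	#include <stddef.h>

	struct program;
	struct variant;                 /* stand-in for ir3_shader_variant */
	struct variant *lookup_fp_variant(const struct program *p);

	struct emit_state {
		const struct program *prog; /* where variants come from */
		struct variant *fp;         /* memoized lookup; NULL = not cached */
	};

	static struct variant *
	emit_get_fp(struct emit_state *emit)
	{
		if (!emit->fp)
			emit->fp = lookup_fp_variant(emit->prog);
		return emit->fp;
	}

	/* any caller swapping programs mid-emit must invalidate the memo:
	 *     emit.prog = other_prog;  emit.fp = NULL;                     */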
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_query.c b/src/gallium/drivers/freedreno/a3xx/fd3_query.c
index 7abab543427..8fc0a0d4229 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_query.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_query.c
@@ -64,7 +64,7 @@ occlusion_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring)
 
 	OUT_PKT3(ring, CP_DRAW_INDX, 3);
 	OUT_RING(ring, 0x00000000);
-	OUT_RING(ring, DRAW(DI_PT_POINTLIST_A2XX, DI_SRC_SEL_AUTO_INDEX,
+	OUT_RING(ring, DRAW(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX,
 			INDEX_SIZE_IGN, USE_VISIBILITY, 0));
 	OUT_RING(ring, 0);             /* NumIndices */
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.h b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.h
index 7e9c1f51f59..765d9719524 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.h
@@ -44,7 +44,7 @@ struct fd3_rasterizer_stateobj {
 	uint32_t pc_prim_vtx_cntl;
 };
 
-static INLINE struct fd3_rasterizer_stateobj *
+static inline struct fd3_rasterizer_stateobj *
 fd3_rasterizer_stateobj(struct pipe_rasterizer_state *rast)
 {
 	return (struct fd3_rasterizer_stateobj *)rast;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
index 094dcf376e5..722fe360202 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
@@ -105,7 +105,7 @@ void
 fd3_screen_init(struct pipe_screen *pscreen)
 {
 	struct fd_screen *screen = fd_screen(pscreen);
-	screen->max_rts = 4;
+	screen->max_rts = A3XX_MAX_RENDER_TARGETS;
 	screen->compiler = ir3_compiler_create(screen->gpu_id);
 	pscreen->context_create = fd3_context_create;
 	pscreen->is_format_supported = fd3_screen_is_format_supported;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
index a278bf5c603..c30658d0e7b 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
@@ -210,8 +210,8 @@ fd3_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
 {
 	struct fd3_pipe_sampler_view *so = CALLOC_STRUCT(fd3_pipe_sampler_view);
 	struct fd_resource *rsc = fd_resource(prsc);
-	unsigned lvl = cso->u.tex.first_level;
-	unsigned miplevels = cso->u.tex.last_level - lvl;
+	unsigned lvl = fd_sampler_first_level(cso);
+	unsigned miplevels = fd_sampler_last_level(cso) - lvl;
 	uint32_t sz2 = 0;
 
 	if (!so)
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_texture.h b/src/gallium/drivers/freedreno/a3xx/fd3_texture.h
index c38fd847f27..d5afb03cd7a 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.h
@@ -43,7 +43,7 @@ struct fd3_sampler_stateobj {
 	bool saturate_s, saturate_t, saturate_r;
 };
 
-static INLINE struct fd3_sampler_stateobj *
+static inline struct fd3_sampler_stateobj *
 fd3_sampler_stateobj(struct pipe_sampler_state *samp)
 {
 	return (struct fd3_sampler_stateobj *)samp;
@@ -54,7 +54,7 @@ struct fd3_pipe_sampler_view {
 	uint32_t texconst0, texconst1, texconst2, texconst3;
 };
 
-static INLINE struct fd3_pipe_sampler_view *
+static inline struct fd3_pipe_sampler_view *
 fd3_pipe_sampler_view(struct pipe_sampler_view *pview)
 {
 	return (struct fd3_pipe_sampler_view *)pview;
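Several hunks in this series replace direct reads of u.tex.first_level/last_level with fd_sampler_first_level()/fd_sampler_last_level(). The motivation visible in the diff is buffer textures: for PIPE_BUFFER sampler views the u.tex fields are not meaningful (hence the dummy_view gaining ".base.target = PIPE_TEXTURE_1D /* anything !PIPE_BUFFER */" above), so a shared accessor can special-case them once. A plausible shape for such helpers, inferred from how they are used here rather than copied from the core headers:

	#include "pipe/p_state.h"

	static inline unsigned
	sampler_first_level(const struct pipe_sampler_view *view)
	{
		/* buffer "textures" have no mip levels to speak of: */
		if (view->target == PIPE_BUFFER)
			return 0;
		return view->u.tex.first_level;
	}

	static inline unsigned
	sampler_last_level(const struct pipe_sampler_view *view)
	{
		if (view->target == PIPE_BUFFER)
			return 0;
		return view->u.tex.last_level;
	}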
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_zsa.h b/src/gallium/drivers/freedreno/a3xx/fd3_zsa.h
index 352c3dd5432..d4dc5954da5 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_zsa.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_zsa.h
@@ -45,7 +45,7 @@ struct fd3_zsa_stateobj {
 	uint32_t rb_stencilrefmask_bf;
 };
 
-static INLINE struct fd3_zsa_stateobj *
+static inline struct fd3_zsa_stateobj *
 fd3_zsa_stateobj(struct pipe_depth_stencil_alpha_state *zsa)
 {
 	return (struct fd3_zsa_stateobj *)zsa;
diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
index 0e7d3cf6db1..563f70ac5eb 100644
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -8,13 +8,13 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2013-11-30 14:47:15)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2013-03-31 16:51:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2014-06-02 15:21:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2014-11-13 22:44:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14895 bytes, from 2015-04-19 15:23:28)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-04-12 18:16:35)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 59314 bytes, from 2015-04-19 16:21:40)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63728 bytes, from 2015-08-05 18:07:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <[email protected]> (robclark)
@@ -227,6 +227,7 @@ enum a4xx_depth_format {
 	DEPTH4_NONE = 0,
 	DEPTH4_16 = 1,
 	DEPTH4_24_8 = 2,
+	DEPTH4_32 = 3,
 };
 
 enum a4xx_tess_spacing {
@@ -570,6 +571,15 @@ static inline uint32_t A4XX_RB_FS_OUTPUT_SAMPLE_MASK(uint32_t val)
 	return ((val) << A4XX_RB_FS_OUTPUT_SAMPLE_MASK__SHIFT) & A4XX_RB_FS_OUTPUT_SAMPLE_MASK__MASK;
 }
 
+#define REG_A4XX_RB_SAMPLE_COUNT_CONTROL	0x000020fa
+#define A4XX_RB_SAMPLE_COUNT_CONTROL_COPY	0x00000002
+#define A4XX_RB_SAMPLE_COUNT_CONTROL_ADDR__MASK	0xfffffffc
+#define A4XX_RB_SAMPLE_COUNT_CONTROL_ADDR__SHIFT	2
+static inline uint32_t A4XX_RB_SAMPLE_COUNT_CONTROL_ADDR(uint32_t val)
+{
+	return ((val >> 2) << A4XX_RB_SAMPLE_COUNT_CONTROL_ADDR__SHIFT) & A4XX_RB_SAMPLE_COUNT_CONTROL_ADDR__MASK;
+}
+
 #define REG_A4XX_RB_RENDER_COMPONENTS	0x000020fb
 #define A4XX_RB_RENDER_COMPONENTS_RT0__MASK	0x0000000f
 #define A4XX_RB_RENDER_COMPONENTS_RT0__SHIFT	0
@@ -811,6 +821,23 @@ static inline uint32_t A4XX_RB_STENCIL_CONTROL_ZFAIL_BF(enum adreno_stencil_op v
 
 #define REG_A4XX_RB_STENCIL_CONTROL2	0x00002107
 #define A4XX_RB_STENCIL_CONTROL2_STENCIL_BUFFER	0x00000001
 
+#define REG_A4XX_RB_STENCIL_INFO	0x00002108
+#define A4XX_RB_STENCIL_INFO_SEPARATE_STENCIL	0x00000001
+#define A4XX_RB_STENCIL_INFO_STENCIL_BASE__MASK	0xfffff000
+#define A4XX_RB_STENCIL_INFO_STENCIL_BASE__SHIFT	12
+static inline uint32_t A4XX_RB_STENCIL_INFO_STENCIL_BASE(uint32_t val)
+{
+	return ((val >> 12) << A4XX_RB_STENCIL_INFO_STENCIL_BASE__SHIFT) & A4XX_RB_STENCIL_INFO_STENCIL_BASE__MASK;
+}
+
+#define REG_A4XX_RB_STENCIL_PITCH	0x00002109
+#define A4XX_RB_STENCIL_PITCH__MASK	0xffffffff
+#define A4XX_RB_STENCIL_PITCH__SHIFT	0
+static inline uint32_t A4XX_RB_STENCIL_PITCH(uint32_t val)
+{
+	return ((val >> 5) << A4XX_RB_STENCIL_PITCH__SHIFT) & A4XX_RB_STENCIL_PITCH__MASK;
+}
+
 #define REG_A4XX_RB_STENCILREFMASK	0x0000210b
 #define A4XX_RB_STENCILREFMASK_STENCILREF__MASK	0x000000ff
 #define A4XX_RB_STENCILREFMASK_STENCILREF__SHIFT	0
@@ -1167,6 +1194,8 @@ static inline uint32_t REG_A4XX_CP_SCRATCH_REG(uint32_t i0) { return 0x00000578
 
 #define REG_A4XX_SP_VS_STATUS	0x00000ec0
 
+#define REG_A4XX_SP_MODE_CONTROL	0x00000ec3
+
 #define REG_A4XX_SP_PERFCTR_SP_SEL_11	0x00000ecf
 
 #define REG_A4XX_SP_SP_CTRL_REG	0x000022c0
@@ -1432,6 +1461,20 @@ static inline uint32_t A4XX_SP_FS_MRT_REG_MRTFORMAT(enum a4xx_color_fmt val)
 	return ((val) << A4XX_SP_FS_MRT_REG_MRTFORMAT__SHIFT) & A4XX_SP_FS_MRT_REG_MRTFORMAT__MASK;
 }
 
+#define REG_A4XX_SP_CS_CTRL_REG0	0x00002300
+
+#define REG_A4XX_SP_CS_OBJ_OFFSET_REG	0x00002301
+
+#define REG_A4XX_SP_CS_OBJ_START	0x00002302
+
+#define REG_A4XX_SP_CS_PVT_MEM_PARAM	0x00002303
+
+#define REG_A4XX_SP_CS_PVT_MEM_ADDR	0x00002304
+
+#define REG_A4XX_SP_CS_PVT_MEM_SIZE	0x00002305
+
+#define REG_A4XX_SP_CS_LENGTH_REG	0x00002306
+
 #define REG_A4XX_SP_HS_OBJ_OFFSET_REG	0x0000230d
 #define A4XX_SP_HS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__MASK	0x01ff0000
 #define A4XX_SP_HS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__SHIFT	16
@@ -1454,6 +1497,76 @@ static inline uint32_t A4XX_SP_HS_OBJ_OFFSET_REG_SHADEROBJOFFSET(uint32_t val)
 
 #define REG_A4XX_SP_HS_LENGTH_REG	0x00002312
 
+#define REG_A4XX_SP_DS_PARAM_REG	0x0000231a
+#define A4XX_SP_DS_PARAM_REG_POSREGID__MASK	0x000000ff
+#define A4XX_SP_DS_PARAM_REG_POSREGID__SHIFT	0
+static inline uint32_t A4XX_SP_DS_PARAM_REG_POSREGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_PARAM_REG_POSREGID__SHIFT) & A4XX_SP_DS_PARAM_REG_POSREGID__MASK;
+}
+#define A4XX_SP_DS_PARAM_REG_TOTALGSOUTVAR__MASK	0xfff00000
+#define A4XX_SP_DS_PARAM_REG_TOTALGSOUTVAR__SHIFT	20
+static inline uint32_t A4XX_SP_DS_PARAM_REG_TOTALGSOUTVAR(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_PARAM_REG_TOTALGSOUTVAR__SHIFT) & A4XX_SP_DS_PARAM_REG_TOTALGSOUTVAR__MASK;
+}
+
+static inline uint32_t REG_A4XX_SP_DS_OUT(uint32_t i0) { return 0x0000231b + 0x1*i0; }
+
+static inline uint32_t REG_A4XX_SP_DS_OUT_REG(uint32_t i0) { return 0x0000231b + 0x1*i0; }
+#define A4XX_SP_DS_OUT_REG_A_REGID__MASK	0x000001ff
+#define A4XX_SP_DS_OUT_REG_A_REGID__SHIFT	0
+static inline uint32_t A4XX_SP_DS_OUT_REG_A_REGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_OUT_REG_A_REGID__SHIFT) & A4XX_SP_DS_OUT_REG_A_REGID__MASK;
+}
+#define A4XX_SP_DS_OUT_REG_A_COMPMASK__MASK	0x00001e00
+#define A4XX_SP_DS_OUT_REG_A_COMPMASK__SHIFT	9
+static inline uint32_t A4XX_SP_DS_OUT_REG_A_COMPMASK(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_OUT_REG_A_COMPMASK__SHIFT) & A4XX_SP_DS_OUT_REG_A_COMPMASK__MASK;
+}
+#define A4XX_SP_DS_OUT_REG_B_REGID__MASK	0x01ff0000
+#define A4XX_SP_DS_OUT_REG_B_REGID__SHIFT	16
+static inline uint32_t A4XX_SP_DS_OUT_REG_B_REGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_OUT_REG_B_REGID__SHIFT) & A4XX_SP_DS_OUT_REG_B_REGID__MASK;
+}
+#define A4XX_SP_DS_OUT_REG_B_COMPMASK__MASK	0x1e000000
+#define A4XX_SP_DS_OUT_REG_B_COMPMASK__SHIFT	25
+static inline uint32_t A4XX_SP_DS_OUT_REG_B_COMPMASK(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_OUT_REG_B_COMPMASK__SHIFT) & A4XX_SP_DS_OUT_REG_B_COMPMASK__MASK;
+}
+
+static inline uint32_t REG_A4XX_SP_DS_VPC_DST(uint32_t i0) { return 0x0000232c + 0x1*i0; }
+
+static inline uint32_t REG_A4XX_SP_DS_VPC_DST_REG(uint32_t i0) { return 0x0000232c + 0x1*i0; }
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC0__MASK	0x000000ff
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC0__SHIFT	0
+static inline uint32_t A4XX_SP_DS_VPC_DST_REG_OUTLOC0(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_VPC_DST_REG_OUTLOC0__SHIFT) & A4XX_SP_DS_VPC_DST_REG_OUTLOC0__MASK;
+}
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC1__MASK	0x0000ff00
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC1__SHIFT	8
+static inline uint32_t A4XX_SP_DS_VPC_DST_REG_OUTLOC1(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_VPC_DST_REG_OUTLOC1__SHIFT) & A4XX_SP_DS_VPC_DST_REG_OUTLOC1__MASK;
+}
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC2__MASK	0x00ff0000
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC2__SHIFT	16
+static inline uint32_t A4XX_SP_DS_VPC_DST_REG_OUTLOC2(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_VPC_DST_REG_OUTLOC2__SHIFT) & A4XX_SP_DS_VPC_DST_REG_OUTLOC2__MASK;
+}
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC3__MASK	0xff000000
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC3__SHIFT	24
+static inline uint32_t A4XX_SP_DS_VPC_DST_REG_OUTLOC3(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_VPC_DST_REG_OUTLOC3__SHIFT) & A4XX_SP_DS_VPC_DST_REG_OUTLOC3__MASK;
+}
+
 #define REG_A4XX_SP_DS_OBJ_OFFSET_REG	0x00002334
 #define A4XX_SP_DS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__MASK	0x01ff0000
 #define A4XX_SP_DS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__SHIFT	16
@@ -1476,6 +1589,82 @@ static inline uint32_t A4XX_SP_DS_OBJ_OFFSET_REG_SHADEROBJOFFSET(uint32_t val)
 
 #define REG_A4XX_SP_DS_LENGTH_REG	0x00002339
 
+#define REG_A4XX_SP_GS_PARAM_REG	0x00002341
+#define A4XX_SP_GS_PARAM_REG_POSREGID__MASK	0x000000ff
+#define A4XX_SP_GS_PARAM_REG_POSREGID__SHIFT	0
+static inline uint32_t A4XX_SP_GS_PARAM_REG_POSREGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_PARAM_REG_POSREGID__SHIFT) & A4XX_SP_GS_PARAM_REG_POSREGID__MASK;
+}
+#define A4XX_SP_GS_PARAM_REG_PRIMREGID__MASK	0x0000ff00
+#define A4XX_SP_GS_PARAM_REG_PRIMREGID__SHIFT	8
+static inline uint32_t A4XX_SP_GS_PARAM_REG_PRIMREGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_PARAM_REG_PRIMREGID__SHIFT) & A4XX_SP_GS_PARAM_REG_PRIMREGID__MASK;
+}
+#define A4XX_SP_GS_PARAM_REG_TOTALGSOUTVAR__MASK	0xfff00000
+#define A4XX_SP_GS_PARAM_REG_TOTALGSOUTVAR__SHIFT	20
+static inline uint32_t A4XX_SP_GS_PARAM_REG_TOTALGSOUTVAR(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_PARAM_REG_TOTALGSOUTVAR__SHIFT) & A4XX_SP_GS_PARAM_REG_TOTALGSOUTVAR__MASK;
+}
+
+static inline uint32_t REG_A4XX_SP_GS_OUT(uint32_t i0) { return 0x00002342 + 0x1*i0; }
+
+static inline uint32_t REG_A4XX_SP_GS_OUT_REG(uint32_t i0) { return 0x00002342 + 0x1*i0; }
+#define A4XX_SP_GS_OUT_REG_A_REGID__MASK	0x000001ff
+#define A4XX_SP_GS_OUT_REG_A_REGID__SHIFT	0
+static inline uint32_t A4XX_SP_GS_OUT_REG_A_REGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_OUT_REG_A_REGID__SHIFT) & A4XX_SP_GS_OUT_REG_A_REGID__MASK;
+}
+#define A4XX_SP_GS_OUT_REG_A_COMPMASK__MASK	0x00001e00
+#define A4XX_SP_GS_OUT_REG_A_COMPMASK__SHIFT	9
+static inline uint32_t A4XX_SP_GS_OUT_REG_A_COMPMASK(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_OUT_REG_A_COMPMASK__SHIFT) & A4XX_SP_GS_OUT_REG_A_COMPMASK__MASK;
+}
+#define A4XX_SP_GS_OUT_REG_B_REGID__MASK	0x01ff0000
+#define A4XX_SP_GS_OUT_REG_B_REGID__SHIFT	16
+static inline uint32_t A4XX_SP_GS_OUT_REG_B_REGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_OUT_REG_B_REGID__SHIFT) & A4XX_SP_GS_OUT_REG_B_REGID__MASK;
+}
+#define A4XX_SP_GS_OUT_REG_B_COMPMASK__MASK	0x1e000000
+#define A4XX_SP_GS_OUT_REG_B_COMPMASK__SHIFT	25
+static inline uint32_t A4XX_SP_GS_OUT_REG_B_COMPMASK(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_OUT_REG_B_COMPMASK__SHIFT) & A4XX_SP_GS_OUT_REG_B_COMPMASK__MASK;
+}
+
+static inline uint32_t REG_A4XX_SP_GS_VPC_DST(uint32_t i0) { return 0x00002353 + 0x1*i0; }
+
+static inline uint32_t REG_A4XX_SP_GS_VPC_DST_REG(uint32_t i0) { return 0x00002353 + 0x1*i0; }
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC0__MASK	0x000000ff
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC0__SHIFT	0
+static inline uint32_t A4XX_SP_GS_VPC_DST_REG_OUTLOC0(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_VPC_DST_REG_OUTLOC0__SHIFT) & A4XX_SP_GS_VPC_DST_REG_OUTLOC0__MASK;
+}
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC1__MASK	0x0000ff00
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC1__SHIFT	8
+static inline uint32_t A4XX_SP_GS_VPC_DST_REG_OUTLOC1(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_VPC_DST_REG_OUTLOC1__SHIFT) & A4XX_SP_GS_VPC_DST_REG_OUTLOC1__MASK;
+}
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC2__MASK	0x00ff0000
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC2__SHIFT	16
+static inline uint32_t A4XX_SP_GS_VPC_DST_REG_OUTLOC2(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_VPC_DST_REG_OUTLOC2__SHIFT) & A4XX_SP_GS_VPC_DST_REG_OUTLOC2__MASK;
+}
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC3__MASK	0xff000000
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC3__SHIFT	24
+static inline uint32_t A4XX_SP_GS_VPC_DST_REG_OUTLOC3(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_VPC_DST_REG_OUTLOC3__SHIFT) & A4XX_SP_GS_VPC_DST_REG_OUTLOC3__MASK;
+}
+
 #define REG_A4XX_SP_GS_OBJ_OFFSET_REG	0x0000235b
 #define A4XX_SP_GS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__MASK	0x01ff0000
 #define A4XX_SP_GS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__SHIFT	16
@@ -1677,6 +1866,18 @@ static inline uint32_t A4XX_VFD_CONTROL_3_REGID_VTXCNT(uint32_t val)
 {
 	return ((val) << A4XX_VFD_CONTROL_3_REGID_VTXCNT__SHIFT) & A4XX_VFD_CONTROL_3_REGID_VTXCNT__MASK;
 }
+#define A4XX_VFD_CONTROL_3_REGID_TESSX__MASK	0x00ff0000
+#define A4XX_VFD_CONTROL_3_REGID_TESSX__SHIFT	16
+static inline uint32_t A4XX_VFD_CONTROL_3_REGID_TESSX(uint32_t val)
+{
+	return ((val) << A4XX_VFD_CONTROL_3_REGID_TESSX__SHIFT) & A4XX_VFD_CONTROL_3_REGID_TESSX__MASK;
+}
+#define A4XX_VFD_CONTROL_3_REGID_TESSY__MASK	0xff000000
+#define A4XX_VFD_CONTROL_3_REGID_TESSY__SHIFT	24
+static inline uint32_t A4XX_VFD_CONTROL_3_REGID_TESSY(uint32_t val)
+{
+	return ((val) << A4XX_VFD_CONTROL_3_REGID_TESSY__SHIFT) & A4XX_VFD_CONTROL_3_REGID_TESSY__MASK;
+}
 
 #define REG_A4XX_VFD_CONTROL_4	0x00002204
 
@@ -1758,6 +1959,8 @@ static inline uint32_t A4XX_VFD_DECODE_INSTR_SHIFTCNT(uint32_t val)
 
 #define REG_A4XX_TPL1_DEBUG_ECO_CONTROL	0x00000f00
 
+#define REG_A4XX_TPL1_TP_MODE_CONTROL	0x00000f03
+
 #define REG_A4XX_TPL1_PERFCTR_TP_SEL_7	0x00000f0b
 
 #define REG_A4XX_TPL1_TP_TEX_OFFSET	0x00002380
@@ -1800,6 +2003,10 @@ static inline uint32_t A4XX_TPL1_TP_TEX_COUNT_GS(uint32_t val)
 
 #define REG_A4XX_TPL1_TP_FS_BORDER_COLOR_BASE_ADDR	0x000023a1
 
+#define REG_A4XX_TPL1_TP_CS_BORDER_COLOR_BASE_ADDR	0x000023a4
+
+#define REG_A4XX_TPL1_TP_CS_SAMPLER_BASE_ADDR	0x000023a5
+
 #define REG_A4XX_TPL1_TP_CS_TEXMEMOBJ_BASE_ADDR	0x000023a6
 
 #define REG_A4XX_GRAS_TSE_STATUS	0x00000c80
@@ -2078,6 +2285,8 @@ static inline uint32_t A4XX_GRAS_SC_EXTENT_WINDOW_TL_Y(uint32_t val)
 
 #define REG_A4XX_HLSQ_DEBUG_ECO_CONTROL	0x00000e04
 
+#define REG_A4XX_HLSQ_MODE_CONTROL	0x00000e05
+
 #define REG_A4XX_HLSQ_PERF_PIPE_MASK	0x00000e0e
 
 #define REG_A4XX_HLSQ_CONTROL_0_REG	0x000023c0
@@ -2158,6 +2367,8 @@ static inline uint32_t A4XX_HLSQ_CONTROL_3_REG_REGID(uint32_t val)
 	return ((val) << A4XX_HLSQ_CONTROL_3_REG_REGID__SHIFT) & A4XX_HLSQ_CONTROL_3_REG_REGID__MASK;
 }
 
+#define REG_A4XX_HLSQ_CONTROL_4_REG	0x000023c4
+
 #define REG_A4XX_HLSQ_VS_CONTROL_REG	0x000023c5
 #define A4XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__MASK	0x000000ff
 #define A4XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__SHIFT	0
@@ -2293,6 +2504,36 @@ static inline uint32_t A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH(uint32_t val)
 	return ((val) << A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH__SHIFT) & A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH__MASK;
 }
 
+#define REG_A4XX_HLSQ_CS_CONTROL	0x000023ca
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_0	0x000023cd
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_1	0x000023ce
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_2	0x000023cf
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_3	0x000023d0
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_4	0x000023d1
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_5	0x000023d2
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_6	0x000023d3
+
+#define REG_A4XX_HLSQ_CL_CONTROL_0	0x000023d4
+
+#define REG_A4XX_HLSQ_CL_CONTROL_1	0x000023d5
+
+#define REG_A4XX_HLSQ_CL_KERNEL_CONST	0x000023d6
+
+#define REG_A4XX_HLSQ_CL_KERNEL_GROUP_X	0x000023d7
+
+#define REG_A4XX_HLSQ_CL_KERNEL_GROUP_Y	0x000023d8
+
+#define REG_A4XX_HLSQ_CL_KERNEL_GROUP_Z	0x000023d9
+
+#define REG_A4XX_HLSQ_CL_WG_OFFSET	0x000023da
+
 #define REG_A4XX_HLSQ_UPDATE_CONTROL	0x000023db
 
 #define REG_A4XX_PC_BINNING_COMMAND	0x00000d00
@@ -2389,16 +2630,10 @@ static inline uint32_t A4XX_PC_HS_PARAM_PRIMTYPE(enum adreno_pa_su_sc_draw val)
 
 #define REG_A4XX_UNKNOWN_0D01	0x00000d01
 
-#define REG_A4XX_UNKNOWN_0E05	0x00000e05
-
 #define REG_A4XX_UNKNOWN_0E42	0x00000e42
 
 #define REG_A4XX_UNKNOWN_0EC2	0x00000ec2
 
-#define REG_A4XX_UNKNOWN_0EC3	0x00000ec3
-
-#define REG_A4XX_UNKNOWN_0F03	0x00000f03
-
 #define REG_A4XX_UNKNOWN_2001	0x00002001
 
 #define REG_A4XX_UNKNOWN_209B	0x0000209b
@@ -2439,6 +2674,8 @@ static inline uint32_t A4XX_UNKNOWN_20F7(float val)
 
 #define REG_A4XX_UNKNOWN_22D7	0x000022d7
 
+#define REG_A4XX_UNKNOWN_2352	0x00002352
+
 #define REG_A4XX_TEX_SAMP_0	0x00000000
 #define A4XX_TEX_SAMP_0_MIPFILTER_LINEAR_NEAR	0x00000001
 #define A4XX_TEX_SAMP_0_XY_MAG__MASK	0x00000006
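These generated headers all follow the same rnndb pattern: every register field gets a __MASK/__SHIFT pair plus an inline packing helper, and multi-field register values are built by OR-ing helpers together (note also that address and pitch fields pre-shift their argument, as in the "(val >> 5)" of A4XX_RB_STENCIL_PITCH above, so callers pass plain byte values). A small self-contained illustration of how driver code consumes the pattern; the RT0 values match the A4XX_RB_RENDER_COMPONENTS field shown earlier, while RT1 is filled in by analogy:

	#include <stdint.h>

	#define RB_RENDER_COMPONENTS_RT0__MASK  0x0000000f
	#define RB_RENDER_COMPONENTS_RT0__SHIFT 0
	static inline uint32_t RB_RENDER_COMPONENTS_RT0(uint32_t val)
	{
		return ((val) << RB_RENDER_COMPONENTS_RT0__SHIFT) &
				RB_RENDER_COMPONENTS_RT0__MASK;
	}
	#define RB_RENDER_COMPONENTS_RT1__MASK  0x000000f0  /* assumed */
	#define RB_RENDER_COMPONENTS_RT1__SHIFT 4           /* assumed */
	static inline uint32_t RB_RENDER_COMPONENTS_RT1(uint32_t val)
	{
		return ((val) << RB_RENDER_COMPONENTS_RT1__SHIFT) &
				RB_RENDER_COMPONENTS_RT1__MASK;
	}

	/* fields compose by OR; out-of-range values are silently masked: */
	static uint32_t pack_components(uint32_t rt0, uint32_t rt1)
	{
		return RB_RENDER_COMPONENTS_RT0(rt0) |
		       RB_RENDER_COMPONENTS_RT1(rt1);
	}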
independent blend state"); - return NULL; - } - so = CALLOC_STRUCT(fd4_blend_stateobj); if (!so) return NULL; @@ -96,7 +91,12 @@ fd4_blend_state_create(struct pipe_context *pctx, so->base = *cso; for (i = 0; i < ARRAY_SIZE(so->rb_mrt); i++) { - const struct pipe_rt_blend_state *rt = &cso->rt[i]; + const struct pipe_rt_blend_state *rt; + + if (cso->independent_blend_enable) + rt = &cso->rt[i]; + else + rt = &cso->rt[0]; so->rb_mrt[i].blend_control = A4XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(fd_blend_factor(rt->rgb_src_factor)) | @@ -115,7 +115,7 @@ fd4_blend_state_create(struct pipe_context *pctx, A4XX_RB_MRT_CONTROL_READ_DEST_ENABLE | A4XX_RB_MRT_CONTROL_BLEND | A4XX_RB_MRT_CONTROL_BLEND2; - so->rb_fs_output |= A4XX_RB_FS_OUTPUT_ENABLE_BLEND(1); + mrt_blend |= (1 << i); } if (reads_dest) @@ -125,5 +125,7 @@ fd4_blend_state_create(struct pipe_context *pctx, so->rb_mrt[i].buf_info |= A4XX_RB_MRT_BUF_INFO_DITHER_MODE(DITHER_ALWAYS); } + so->rb_fs_output = A4XX_RB_FS_OUTPUT_ENABLE_BLEND(mrt_blend); + return so; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_blend.h b/src/gallium/drivers/freedreno/a4xx/fd4_blend.h index 33641da5e2c..7620d00a625 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_blend.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_blend.h @@ -32,17 +32,19 @@ #include "pipe/p_state.h" #include "pipe/p_context.h" +#include "freedreno_util.h" + struct fd4_blend_stateobj { struct pipe_blend_state base; struct { uint32_t control; uint32_t buf_info; uint32_t blend_control; - } rb_mrt[8]; + } rb_mrt[A4XX_MAX_RENDER_TARGETS]; uint32_t rb_fs_output; }; -static INLINE struct fd4_blend_stateobj * +static inline struct fd4_blend_stateobj * fd4_blend_stateobj(struct pipe_blend_state *blend) { return (struct fd4_blend_stateobj *)blend; diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.c b/src/gallium/drivers/freedreno/a4xx/fd4_context.c index 2321876dd48..e172d350517 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_context.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.c @@ -86,7 +86,7 @@ create_blit_texcoord_vertexbuf(struct pipe_context *pctx) } static const uint8_t primtypes[PIPE_PRIM_MAX] = { - [PIPE_PRIM_POINTS] = DI_PT_POINTLIST_A3XX, + [PIPE_PRIM_POINTS] = DI_PT_POINTLIST, [PIPE_PRIM_LINES] = DI_PT_LINELIST, [PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP, [PIPE_PRIM_LINE_LOOP] = DI_PT_LINELOOP, @@ -119,6 +119,7 @@ fd4_context_create(struct pipe_screen *pscreen, void *priv) fd4_gmem_init(pctx); fd4_texture_init(pctx); fd4_prog_init(pctx); + fd4_emit_init(pctx); pctx = fd_context_init(&fd4_ctx->base, pscreen, primtypes, priv); if (!pctx) diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.h b/src/gallium/drivers/freedreno/a4xx/fd4_context.h index 53e1bf6a2e6..0b749916841 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_context.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.h @@ -90,7 +90,7 @@ struct fd4_context { struct ir3_shader_key last_key; }; -static INLINE struct fd4_context * +static inline struct fd4_context * fd4_context(struct fd_context *ctx) { return (struct fd4_context *)ctx; diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c index de5a306af60..2bd2ca23d54 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c @@ -48,6 +48,9 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring, { const struct pipe_draw_info *info = emit->info; + if (!(fd4_emit_get_vp(emit) && fd4_emit_get_fp(emit))) + return; + fd4_emit_state(ctx, 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
index de5a306af60..2bd2ca23d54 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
@@ -48,6 +48,9 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
 {
 	const struct pipe_draw_info *info = emit->info;
 
+	if (!(fd4_emit_get_vp(emit) && fd4_emit_get_fp(emit)))
+		return;
+
 	fd4_emit_state(ctx, ring, emit);
 
 	if (emit->dirty & (FD_DIRTY_VTXBUF | FD_DIRTY_VTXSTATE))
@@ -108,7 +111,6 @@ static void
 fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
 {
 	struct fd4_context *fd4_ctx = fd4_context(ctx);
-	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
 	struct fd4_emit emit = {
 		.vtx = &ctx->vtx,
 		.prog = &ctx->prog,
@@ -129,8 +131,9 @@ fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
 			.fsaturate_t = fd4_ctx->fsaturate_t,
 			.fsaturate_r = fd4_ctx->fsaturate_r,
 		},
-		.format = fd4_emit_format(pfb->cbufs[0]),
-		.pformat = pipe_surface_format(pfb->cbufs[0]),
+		.rasterflat = ctx->rasterizer && ctx->rasterizer->flatshade,
+		.sprite_coord_enable = ctx->rasterizer ? ctx->rasterizer->sprite_coord_enable : false,
+		.sprite_coord_mode = ctx->rasterizer ? ctx->rasterizer->sprite_coord_mode : false,
 	};
 	unsigned dirty;
 
@@ -170,20 +173,16 @@ fd4_clear(struct fd_context *ctx, unsigned buffers,
 	struct fd4_context *fd4_ctx = fd4_context(ctx);
 	struct fd_ringbuffer *ring = ctx->ring;
 	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
+	unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0};
 	unsigned dirty = ctx->dirty;
-	unsigned ce, i;
+	unsigned i;
 	struct fd4_emit emit = {
 		.vtx = &fd4_ctx->solid_vbuf_state,
 		.prog = &ctx->solid_prog,
 		.key = {
-			.half_precision = true,
+			.half_precision = fd_half_precision(pfb),
 		},
-		.format = fd4_emit_format(pfb->cbufs[0]),
 	};
-	uint32_t colr = 0;
-
-	if ((buffers & PIPE_CLEAR_COLOR) && pfb->nr_cbufs)
-		colr = pack_rgba(pfb->cbufs[0]->format, color->f);
 
 	dirty &= FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR;
 	dirty |= FD_DIRTY_PROG;
@@ -257,16 +256,15 @@ fd4_clear(struct fd_context *ctx, unsigned buffers,
 	if (buffers & PIPE_CLEAR_COLOR) {
 		OUT_PKT0(ring, REG_A4XX_RB_ALPHA_CONTROL, 1);
 		OUT_RING(ring, A4XX_RB_ALPHA_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER));
-		ce = 0xf;
-	} else {
-		ce = 0x0;
 	}
 
-	for (i = 0; i < 8; i++) {
+	for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
+		mrt_comp[i] = (buffers & (PIPE_CLEAR_COLOR0 << i)) ? 0xf : 0x0;
+
 		OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1);
 		OUT_RING(ring, A4XX_RB_MRT_CONTROL_FASTCLEAR |
 				A4XX_RB_MRT_CONTROL_B11 |
-				A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(ce));
+				A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0xf));
 
 		OUT_PKT0(ring, REG_A4XX_RB_MRT_BLEND_CONTROL(i), 1);
 		OUT_RING(ring, A4XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(FACTOR_ONE) |
@@ -277,6 +275,16 @@ fd4_clear(struct fd_context *ctx, unsigned buffers,
 				A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(FACTOR_ZERO));
 	}
 
+	OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1);
+	OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) |
+			A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) |
+			A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) |
+			A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) |
+			A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) |
+			A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) |
+			A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) |
+			A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7]));
+
 	fd4_emit_vertex_bufs(ring, &emit);
 
 	OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1);
@@ -285,14 +293,8 @@ fd4_clear(struct fd_context *ctx, unsigned buffers,
 	OUT_PKT0(ring, REG_A4XX_GRAS_CLEAR_CNTL, 1);
 	OUT_RING(ring, 0x00000000);
 
-	OUT_PKT0(ring, REG_A4XX_RB_CLEAR_COLOR_DW0, 4);
-	OUT_RING(ring, colr);  /* RB_CLEAR_COLOR_DW0 */
-	OUT_RING(ring, colr);  /* RB_CLEAR_COLOR_DW1 */
-	OUT_RING(ring, colr);  /* RB_CLEAR_COLOR_DW2 */
-	OUT_RING(ring, colr);  /* RB_CLEAR_COLOR_DW3 */
-
 	/* until fastclear works: */
-	fd4_emit_constant(ring, SB_FRAG_SHADER, 0, 0, 4, color->ui, NULL);
+	fd4_emit_const(ring, SHADER_FRAGMENT, 0, 0, 4, color->ui, NULL);
 
 	OUT_PKT0(ring, REG_A4XX_VFD_INDEX_OFFSET, 2);
 	OUT_RING(ring, 0);            /* VFD_INDEX_OFFSET */
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.h b/src/gallium/drivers/freedreno/a4xx/fd4_draw.h
index 1bd376ca6ec..b89a30a7c4b 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.h
@@ -106,6 +106,7 @@ fd4_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
 {
 	struct pipe_index_buffer *idx = &ctx->indexbuf;
 	struct fd_bo *idx_bo = NULL;
+	enum pc_di_primtype primtype = ctx->primtypes[info->mode];
 	enum a4xx_index_size idx_type;
 	enum pc_di_src_sel src_sel;
 	uint32_t idx_size, idx_offset;
@@ -126,7 +127,12 @@ fd4_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		src_sel = DI_SRC_SEL_AUTO_INDEX;
 	}
 
-	fd4_draw(ctx, ring, ctx->primtypes[info->mode], vismode, src_sel,
+	/* points + psize -> spritelist: */
+	if (ctx->rasterizer && ctx->rasterizer->point_size_per_vertex &&
+			(info->mode == PIPE_PRIM_POINTS))
+		primtype = DI_PT_POINTLIST_PSIZE;
+
+	fd4_draw(ctx, ring, primtype, vismode, src_sel,
 			info->count, info->instance_count,
 			idx_type, idx_size, idx_offset, idx_bo);
 }
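The mrt_comp[] change above is the interesting part of the new fd4 clear path: instead of one component-enable value shared by every render target, the clear derives a per-RT RGBA write mask from the PIPE_CLEAR_COLORn request bits and programs it through RB_RENDER_COMPONENTS, so only the buffers actually being cleared are written. The derivation in isolation (the PIPE_CLEAR_COLOR0 bit value is an assumption for the sketch; gallium reserves the low bits for depth/stencil):

	#include <stdint.h>

	#define CLEAR_COLOR0 (1u << 2)   /* assumed gallium bit layout */
	#define MAX_RTS 8

	/* one 4-bit RGBA mask per render target, 0xf only where a
	 * PIPE_CLEAR_COLORn bit was set, as fd4_clear() now does: */
	static void
	clear_write_masks(unsigned buffers, uint8_t comp[MAX_RTS])
	{
		for (unsigned i = 0; i < MAX_RTS; i++)
			comp[i] = (buffers & (CLEAR_COLOR0 << i)) ? 0xf : 0x0;
	}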
enum adreno_state_src src; + debug_assert((regid % 4) == 0); + debug_assert((sizedwords % 4) == 0); + if (prsc) { sz = 0; src = 0x2; // TODO ?? @@ -67,7 +74,7 @@ fd4_emit_constant(struct fd_ringbuffer *ring, OUT_PKT3(ring, CP_LOAD_STATE, 2 + sz); OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/4) | CP_LOAD_STATE_0_STATE_SRC(src) | - CP_LOAD_STATE_0_STATE_BLOCK(sb) | + CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) | CP_LOAD_STATE_0_NUM_UNIT(sizedwords/4)); if (prsc) { struct fd_bo *bo = fd_resource(prsc)->bo; @@ -84,89 +91,31 @@ fd4_emit_constant(struct fd_ringbuffer *ring, } static void -emit_constants(struct fd_ringbuffer *ring, - enum adreno_state_block sb, - struct fd_constbuf_stateobj *constbuf, - struct ir3_shader_variant *shader, - bool emit_immediates) +fd4_emit_const_bo(struct fd_ringbuffer *ring, enum shader_t type, boolean write, + uint32_t regid, uint32_t num, struct fd_bo **bos, uint32_t *offsets) { - uint32_t enabled_mask = constbuf->enabled_mask; - uint32_t max_const; - int i; - - // XXX TODO only emit dirty consts.. but we need to keep track if - // they are clobbered by a clear, gmem2mem, or mem2gmem.. - constbuf->dirty_mask = enabled_mask; - - /* in particular, with binning shader we may end up with unused - * consts, ie. we could end up w/ constlen that is smaller - * than first_immediate. In that case truncate the user consts - * early to avoid HLSQ lockup caused by writing too many consts - */ - max_const = MIN2(shader->first_driver_param, shader->constlen); - - /* emit user constants: */ - if (enabled_mask & 1) { - const unsigned index = 0; - struct pipe_constant_buffer *cb = &constbuf->cb[index]; - unsigned size = align(cb->buffer_size, 4) / 4; /* size in dwords */ - - // I expect that size should be a multiple of vec4's: - assert(size == align(size, 4)); - - /* and even if the start of the const buffer is before - * first_immediate, the end may not be: - */ - size = MIN2(size, 4 * max_const); - - if (size && (constbuf->dirty_mask & (1 << index))) { - fd4_emit_constant(ring, sb, 0, - cb->buffer_offset, size, - cb->user_buffer, cb->buffer); - constbuf->dirty_mask &= ~(1 << index); - } - - enabled_mask &= ~(1 << index); - } - - /* emit ubos: */ - if (shader->constlen > shader->first_driver_param) { - uint32_t params = MIN2(4, shader->constlen - shader->first_driver_param); - OUT_PKT3(ring, CP_LOAD_STATE, 2 + params * 4); - OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(shader->first_driver_param) | - CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | - CP_LOAD_STATE_0_STATE_BLOCK(sb) | - CP_LOAD_STATE_0_NUM_UNIT(params)); - OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) | - CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS)); - - for (i = 1; i <= params * 4; i++) { - struct pipe_constant_buffer *cb = &constbuf->cb[i]; - assert(!cb->user_buffer); - if ((enabled_mask & (1 << i)) && cb->buffer) - OUT_RELOC(ring, fd_resource(cb->buffer)->bo, cb->buffer_offset, 0, 0); - else - OUT_RING(ring, 0xbad00000 | ((i - 1) << 16)); - } - } - - /* emit shader immediates: */ - if (shader && emit_immediates) { - int size = shader->immediates_count; - uint32_t base = shader->first_immediate; - - /* truncate size to avoid writing constants that shader - * does not use: - */ - size = MIN2(size + base, shader->constlen) - base; + uint32_t i; - /* convert out of vec4: */ - base *= 4; - size *= 4; + debug_assert((regid % 4) == 0); + debug_assert((num % 4) == 0); - if (size > 0) { - fd4_emit_constant(ring, sb, base, - 0, size, shader->immediates[0].val, NULL); + OUT_PKT3(ring, CP_LOAD_STATE, 2 + num); + OUT_RING(ring, 
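
The renamed fd4_emit_const keeps a dual calling convention: pass a CPU-side dword array for a direct (SS_DIRECT) upload, or a BO-backed pipe_resource for an indirect load (the src = 0x2 case above, still marked TODO in the diff). Usage sketch; the first call is taken verbatim from the clear path earlier in this diff, the second is a hypothetical indirect upload:

    /* direct: four dwords of clear color into fragment consts: */
    fd4_emit_const(ring, SHADER_FRAGMENT, 0, 0, 4, color->ui, NULL);

    /* indirect: consts sourced from a BO-backed resource at 'offset',
     * no CPU-side copy of the data needed: */
    fd4_emit_const(ring, SHADER_VERTEX, regid, offset, sizedwords, NULL, prsc);
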
CP_LOAD_STATE_0_DST_OFF(regid/4) | + CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | + CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) | + CP_LOAD_STATE_0_NUM_UNIT(num/4)); + OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) | + CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS)); + + for (i = 0; i < num; i++) { + if (bos[i]) { + if (write) { + OUT_RELOCW(ring, bos[i], offsets[i], 0, 0); + } else { + OUT_RELOC(ring, bos[i], offsets[i], 0, 0); + } + } else { + OUT_RING(ring, 0xbad00000 | (i << 16)); } } } @@ -223,15 +172,19 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring, const struct fd4_pipe_sampler_view *view = tex->textures[i] ? fd4_pipe_sampler_view(tex->textures[i]) : &dummy_view; - struct fd_resource *rsc = fd_resource(view->base.texture); - unsigned start = view->base.u.tex.first_level; - uint32_t offset = fd_resource_offset(rsc, start, 0); + unsigned start = fd_sampler_first_level(&view->base); OUT_RING(ring, view->texconst0); OUT_RING(ring, view->texconst1); OUT_RING(ring, view->texconst2); OUT_RING(ring, view->texconst3); - OUT_RELOC(ring, rsc->bo, offset, view->textconst4, 0); + if (view->base.texture) { + struct fd_resource *rsc = fd_resource(view->base.texture); + uint32_t offset = fd_resource_offset(rsc, start, 0); + OUT_RELOC(ring, rsc->bo, offset, view->textconst4, 0); + } else { + OUT_RING(ring, 0x00000000); + } OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); @@ -244,51 +197,110 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring, * special cases.. */ void -fd4_emit_gmem_restore_tex(struct fd_ringbuffer *ring, struct pipe_surface *psurf) +fd4_emit_gmem_restore_tex(struct fd_ringbuffer *ring, unsigned nr_bufs, + struct pipe_surface **bufs) { - struct fd_resource *rsc = fd_resource(psurf->texture); - unsigned lvl = psurf->u.tex.level; - struct fd_resource_slice *slice = fd_resource_slice(rsc, lvl); - uint32_t offset = fd_resource_offset(rsc, lvl, psurf->u.tex.first_layer); - enum pipe_format format = fd4_gmem_restore_format(psurf->format); + unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS]; + int i; - debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); + for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { + mrt_comp[i] = (i < nr_bufs) ? 
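
Note the poison value in fd4_emit_const_bo above: a UBO slot with no buffer bound still has to occupy its dword in the CP_LOAD_STATE payload, so it is filled with an obviously-invalid address that folds in the slot index, making a GPU fault traceable back to the offending binding:

    /* unbound slot i -> 0xbad00000 | (i << 16), e.g. slot 2 becomes
     * 0xbad20000; a fault on such an address identifies the slot: */
    OUT_RING(ring, 0xbad00000 | (i << 16));
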
0xf : 0; + } /* output sampler state: */ - OUT_PKT3(ring, CP_LOAD_STATE, 4); + OUT_PKT3(ring, CP_LOAD_STATE, 2 + (2 * nr_bufs)); OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(0) | CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | CP_LOAD_STATE_0_STATE_BLOCK(SB_FRAG_TEX) | - CP_LOAD_STATE_0_NUM_UNIT(1)); + CP_LOAD_STATE_0_NUM_UNIT(nr_bufs)); OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER) | CP_LOAD_STATE_1_EXT_SRC_ADDR(0)); - OUT_RING(ring, A4XX_TEX_SAMP_0_XY_MAG(A4XX_TEX_NEAREST) | - A4XX_TEX_SAMP_0_XY_MIN(A4XX_TEX_NEAREST) | - A4XX_TEX_SAMP_0_WRAP_S(A4XX_TEX_CLAMP_TO_EDGE) | - A4XX_TEX_SAMP_0_WRAP_T(A4XX_TEX_CLAMP_TO_EDGE) | - A4XX_TEX_SAMP_0_WRAP_R(A4XX_TEX_REPEAT)); - OUT_RING(ring, 0x00000000); + for (i = 0; i < nr_bufs; i++) { + OUT_RING(ring, A4XX_TEX_SAMP_0_XY_MAG(A4XX_TEX_NEAREST) | + A4XX_TEX_SAMP_0_XY_MIN(A4XX_TEX_NEAREST) | + A4XX_TEX_SAMP_0_WRAP_S(A4XX_TEX_CLAMP_TO_EDGE) | + A4XX_TEX_SAMP_0_WRAP_T(A4XX_TEX_CLAMP_TO_EDGE) | + A4XX_TEX_SAMP_0_WRAP_R(A4XX_TEX_REPEAT)); + OUT_RING(ring, 0x00000000); + } /* emit texture state: */ - OUT_PKT3(ring, CP_LOAD_STATE, 10); + OUT_PKT3(ring, CP_LOAD_STATE, 2 + (8 * nr_bufs)); OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(0) | CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | CP_LOAD_STATE_0_STATE_BLOCK(SB_FRAG_TEX) | - CP_LOAD_STATE_0_NUM_UNIT(1)); + CP_LOAD_STATE_0_NUM_UNIT(nr_bufs)); OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS) | CP_LOAD_STATE_1_EXT_SRC_ADDR(0)); - OUT_RING(ring, A4XX_TEX_CONST_0_FMT(fd4_pipe2tex(format)) | - A4XX_TEX_CONST_0_TYPE(A4XX_TEX_2D) | - fd4_tex_swiz(format, PIPE_SWIZZLE_RED, PIPE_SWIZZLE_GREEN, - PIPE_SWIZZLE_BLUE, PIPE_SWIZZLE_ALPHA)); - OUT_RING(ring, A4XX_TEX_CONST_1_WIDTH(psurf->width) | - A4XX_TEX_CONST_1_HEIGHT(psurf->height)); - OUT_RING(ring, A4XX_TEX_CONST_2_PITCH(slice->pitch * rsc->cpp)); - OUT_RING(ring, 0x00000000); - OUT_RELOC(ring, rsc->bo, offset, 0, 0); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); + for (i = 0; i < nr_bufs; i++) { + if (bufs[i]) { + struct fd_resource *rsc = fd_resource(bufs[i]->texture); + /* note: PIPE_BUFFER disallowed for surfaces */ + unsigned lvl = bufs[i]->u.tex.level; + struct fd_resource_slice *slice = fd_resource_slice(rsc, lvl); + uint32_t offset = fd_resource_offset(rsc, lvl, bufs[i]->u.tex.first_layer); + enum pipe_format format = fd4_gmem_restore_format(bufs[i]->format); + + /* The restore blit_zs shader expects stencil in sampler 0, + * and depth in sampler 1 + */ + if (rsc->stencil && (i == 0)) { + rsc = rsc->stencil; + format = fd4_gmem_restore_format(rsc->base.b.format); + } + + /* z32 restore is accomplished using depth write. If there is + * no stencil component (ie. 
PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) + * then no render target: + * + * (The same applies for z32_s8x24, since for stencil sampler + * state the above 'if' will replace 'format' with s8) + */ + if ((format == PIPE_FORMAT_Z32_FLOAT) || + (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)) + mrt_comp[i] = 0; + + debug_assert(bufs[i]->u.tex.first_layer == bufs[i]->u.tex.last_layer); + + OUT_RING(ring, A4XX_TEX_CONST_0_FMT(fd4_pipe2tex(format)) | + A4XX_TEX_CONST_0_TYPE(A4XX_TEX_2D) | + fd4_tex_swiz(format, PIPE_SWIZZLE_RED, PIPE_SWIZZLE_GREEN, + PIPE_SWIZZLE_BLUE, PIPE_SWIZZLE_ALPHA)); + OUT_RING(ring, A4XX_TEX_CONST_1_WIDTH(bufs[i]->width) | + A4XX_TEX_CONST_1_HEIGHT(bufs[i]->height)); + OUT_RING(ring, A4XX_TEX_CONST_2_PITCH(slice->pitch * rsc->cpp)); + OUT_RING(ring, 0x00000000); + OUT_RELOC(ring, rsc->bo, offset, 0, 0); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + } else { + OUT_RING(ring, A4XX_TEX_CONST_0_FMT(0) | + A4XX_TEX_CONST_0_TYPE(A4XX_TEX_2D) | + A4XX_TEX_CONST_0_SWIZ_X(A4XX_TEX_ONE) | + A4XX_TEX_CONST_0_SWIZ_Y(A4XX_TEX_ONE) | + A4XX_TEX_CONST_0_SWIZ_Z(A4XX_TEX_ONE) | + A4XX_TEX_CONST_0_SWIZ_W(A4XX_TEX_ONE)); + OUT_RING(ring, A4XX_TEX_CONST_1_WIDTH(0) | + A4XX_TEX_CONST_1_HEIGHT(0)); + OUT_RING(ring, A4XX_TEX_CONST_2_PITCH(0)); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + } + } + + OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1); + OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) | + A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) | + A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) | + A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) | + A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) | + A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) | + A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) | + A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7])); } void @@ -298,7 +310,9 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit) uint32_t total_in = 0; const struct fd_vertex_state *vtx = emit->vtx; struct ir3_shader_variant *vp = fd4_emit_get_vp(emit); - unsigned vertex_regid = regid(63, 0), instance_regid = regid(63, 0); + unsigned vertex_regid = regid(63, 0); + unsigned instance_regid = regid(63, 0); + unsigned vtxcnt_regid = regid(63, 0); for (i = 0; i < vp->inputs_count; i++) { uint8_t semantic = sem2name(vp->inputs[i].semantic); @@ -306,6 +320,8 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit) vertex_regid = vp->inputs[i].regid; else if (semantic == TGSI_SEMANTIC_INSTANCEID) instance_regid = vp->inputs[i].regid; + else if (semantic == IR3_SEMANTIC_VTXCNT) + vtxcnt_regid = vp->inputs[i].regid; else if ((i < vtx->vtx->num_elements) && vp->inputs[i].compmask) last = i; } @@ -313,7 +329,8 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit) /* hw doesn't like to be configured for zero vbo's, it seems: */ if ((vtx->vtx->num_elements == 0) && (vertex_regid == regid(63, 0)) && - (instance_regid == regid(63, 0))) + (instance_regid == regid(63, 0)) && + (vtxcnt_regid == regid(63, 0))) return; for (i = 0, j = 0; i <= last; i++) { @@ -327,7 +344,8 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit) enum a4xx_vtx_fmt fmt = fd4_pipe2vtx(pfmt); bool switchnext = (i != last) || (vertex_regid != regid(63, 0)) || - (instance_regid != regid(63, 0)); + (instance_regid != regid(63, 0)) || + (vtxcnt_regid != regid(63, 0)); bool isint = util_format_is_pure_integer(pfmt); uint32_t fs = 
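
The vtxcnt plumbing added to fd4_emit_vertex_bufs above feeds stream-out: regid(63, 0) is the ir3 "not used" sentinel, and VFD_CONTROL_3, a few lines further on, is now programmed with the real register whenever the vertex shader consumes IR3_SEMANTIC_VTXCNT. A sketch of the sentinel convention, assuming ir3's (register << 2) | component regid encoding:

    /* 63.x, i.e. regid(63, 0), means "no register assigned": */
    bool uses_sysvals = (vertex_regid   != regid(63, 0)) ||
                        (instance_regid != regid(63, 0)) ||
                        (vtxcnt_regid   != regid(63, 0));
    /* vertex-buffer setup is skipped only when there are no vertex
     * elements and none of these system values are consumed. */
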
util_format_get_blocksize(pfmt); uint32_t off = vb->buffer_offset + elem->src_offset; @@ -368,7 +386,7 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit) A4XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) | A4XX_VFD_CONTROL_1_REGID4INST(instance_regid)); OUT_RING(ring, 0x00000000); /* XXX VFD_CONTROL_2 */ - OUT_RING(ring, A4XX_VFD_CONTROL_3_REGID_VTXCNT(regid(63, 0))); + OUT_RING(ring, A4XX_VFD_CONTROL_3_REGID_VTXCNT(vtxcnt_regid)); OUT_RING(ring, 0x00000000); /* XXX VFD_CONTROL_4 */ /* cache invalidate, otherwise vertex fetch could see @@ -389,6 +407,25 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, emit_marker(ring, 5); + if ((dirty & FD_DIRTY_FRAMEBUFFER) && !emit->key.binning_pass) { + struct pipe_framebuffer_state *pfb = &ctx->framebuffer; + unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0}; + + for (unsigned i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { + mrt_comp[i] = ((i < pfb->nr_cbufs) && pfb->cbufs[i]) ? 0xf : 0; + } + + OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1); + OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) | + A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) | + A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) | + A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) | + A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) | + A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) | + A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) | + A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7])); + } + if ((dirty & (FD_DIRTY_ZSA | FD_DIRTY_PROG)) && !emit->key.binning_pass) { uint32_t val = fd4_zsa_stateobj(ctx->zsa)->rb_render_control; @@ -513,43 +550,24 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZSCALE_0(ctx->viewport.scale[2])); } - if (dirty & FD_DIRTY_PROG) - fd4_program_emit(ring, emit); - - if ((dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) && - /* evil hack to deal sanely with clear path: */ - (emit->prog == &ctx->prog)) { - fd_wfi(ctx, ring); - emit_constants(ring, SB_VERT_SHADER, - &ctx->constbuf[PIPE_SHADER_VERTEX], - vp, emit->prog->dirty & FD_SHADER_DIRTY_VP); - if (!emit->key.binning_pass) { - emit_constants(ring, SB_FRAG_SHADER, - &ctx->constbuf[PIPE_SHADER_FRAGMENT], - fp, emit->prog->dirty & FD_SHADER_DIRTY_FP); - } + if (dirty & FD_DIRTY_PROG) { + struct pipe_framebuffer_state *pfb = &ctx->framebuffer; + fd4_program_emit(ring, emit, pfb->nr_cbufs, pfb->cbufs); } - /* emit driver params every time */ - if (emit->info && emit->prog == &ctx->prog) { - uint32_t vertex_params[4] = { - emit->info->indexed ? 
emit->info->index_bias : emit->info->start, - 0, - 0, - 0 - }; - if (vp->constlen >= vp->first_driver_param + 4) { - fd4_emit_constant(ring, SB_VERT_SHADER, - (vp->first_driver_param + 4) * 4, - 0, 4, vertex_params, NULL); - } + if (emit->prog == &ctx->prog) { /* evil hack to deal sanely with clear path */ + ir3_emit_consts(vp, ring, emit->info, dirty); + if (!emit->key.binning_pass) + ir3_emit_consts(fp, ring, emit->info, dirty); + /* mark clean after emitting consts: */ + ctx->prog.dirty = 0; } if ((dirty & FD_DIRTY_BLEND) && ctx->blend) { struct fd4_blend_stateobj *blend = fd4_blend_stateobj(ctx->blend); uint32_t i; - for (i = 0; i < 8; i++) { + for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1); OUT_RING(ring, blend->rb_mrt[i].control); @@ -607,10 +625,10 @@ fd4_emit_restore(struct fd_context *ctx) OUT_PKT0(ring, REG_A4XX_GRAS_DEBUG_ECO_CONTROL, 1); OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_0EC3, 1); + OUT_PKT0(ring, REG_A4XX_SP_MODE_CONTROL, 1); OUT_RING(ring, 0x00000006); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_0F03, 1); + OUT_PKT0(ring, REG_A4XX_TPL1_TP_MODE_CONTROL, 1); OUT_RING(ring, 0x0000003a); OUT_PKT0(ring, REG_A4XX_UNKNOWN_0D01, 1); @@ -629,7 +647,7 @@ fd4_emit_restore(struct fd_context *ctx) OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000012); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_0E05, 1); + OUT_PKT0(ring, REG_A4XX_HLSQ_MODE_CONTROL, 1); OUT_RING(ring, 0x00000000); OUT_PKT0(ring, REG_A4XX_UNKNOWN_0CC5, 1); @@ -752,9 +770,6 @@ fd4_emit_restore(struct fd_context *ctx) OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT, 1); OUT_RING(ring, A4XX_RB_FS_OUTPUT_SAMPLE_MASK(0xffff)); - OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1); - OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(0xf)); - OUT_PKT0(ring, REG_A4XX_GRAS_CLEAR_CNTL, 1); OUT_RING(ring, A4XX_GRAS_CLEAR_CNTL_NOT_FASTCLEAR); @@ -763,3 +778,11 @@ fd4_emit_restore(struct fd_context *ctx) ctx->needs_rb_fbd = true; } + +void +fd4_emit_init(struct pipe_context *pctx) +{ + struct fd_context *ctx = fd_context(pctx); + ctx->emit_const = fd4_emit_const; + ctx->emit_const_bo = fd4_emit_const_bo; +} diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h index 7d059f8e532..ab7850e50b0 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h @@ -37,15 +37,13 @@ #include "ir3_shader.h" struct fd_ringbuffer; -enum adreno_state_block; -void fd4_emit_constant(struct fd_ringbuffer *ring, - enum adreno_state_block sb, +void fd4_emit_const(struct fd_ringbuffer *ring, enum shader_t type, uint32_t regid, uint32_t offset, uint32_t sizedwords, const uint32_t *dwords, struct pipe_resource *prsc); void fd4_emit_gmem_restore_tex(struct fd_ringbuffer *ring, - struct pipe_surface *psurf); + unsigned nr_bufs, struct pipe_surface **bufs); /* grouped together emit-state for prog/vertex/state emit: */ struct fd4_emit { @@ -53,10 +51,12 @@ struct fd4_emit { const struct fd_program_stateobj *prog; const struct pipe_draw_info *info; struct ir3_shader_key key; - enum a4xx_color_fmt format; - enum pipe_format pformat; uint32_t dirty; + uint32_t sprite_coord_enable; /* bitmask */ + bool sprite_coord_mode; + bool rasterflat; + /* cached to avoid repeated lookups of same variants: */ struct ir3_shader_variant *vp, *fp; /* TODO: other shader stages.. 
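
fd4_emit_init, added at the end of the fd4_emit.c hunk above, hooks the a4xx const emitters into function pointers on fd_context; that indirection is what lets the const-emission logic move out of the per-generation backend into shared ir3 code (the ir3_emit_consts calls above). A hypothetical call site in generation-independent code might look like:

    /* sketch: shared code uploads consts without knowing the GPU
     * generation ('type', 'base', 'dwords' etc. are assumed names): */
    ctx->emit_const(ring, type, base, 0, sizedwords, dwords, NULL);
    ctx->emit_const_bo(ring, type, false /* read-only */,
            regid, num, bos, offsets);
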
*/ @@ -96,4 +96,6 @@ void fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, void fd4_emit_restore(struct fd_context *ctx); +void fd4_emit_init(struct pipe_context *pctx); + #endif /* FD4_EMIT_H */ diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_format.c b/src/gallium/drivers/freedreno/a4xx/fd4_format.c index 29abe0b0cc3..3e0045449eb 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_format.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_format.c @@ -89,6 +89,14 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = { _T(L8_UNORM, 8_UNORM, R8_UNORM, WZYX), _T(I8_UNORM, 8_UNORM, NONE, WZYX), + /* NOTE: should be TFMT_8_UINT (which then gets remapped to + * TFMT_8_UNORM for mem2gmem in _gmem_restore_format()), but + * we don't know TFMT_8_UINT yet.. so just use TFMT_8_UNORM + * for now.. sampling from stencil as a texture might not + * work right, but at least should be fine for zsbuf.. + */ + _T(S8_UINT, 8_UNORM, R8_UNORM, WZYX), + /* 16-bit */ V_(R16_UNORM, 16_UNORM, NONE, WZYX), V_(R16_SNORM, 16_SNORM, NONE, WZYX), @@ -96,7 +104,7 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = { VT(R16_SINT, 16_SINT, R16_SINT, WZYX), V_(R16_USCALED, 16_UINT, NONE, WZYX), V_(R16_SSCALED, 16_UINT, NONE, WZYX), - VT(R16_FLOAT, 16_FLOAT, NONE, WZYX), + VT(R16_FLOAT, 16_FLOAT, R16_FLOAT,WZYX), _T(A16_UINT, 16_UINT, NONE, WZYX), _T(A16_SINT, 16_SINT, NONE, WZYX), @@ -132,7 +140,7 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = { VT(R32_SINT, 32_SINT, R32_SINT, WZYX), V_(R32_USCALED, 32_UINT, NONE, WZYX), V_(R32_SSCALED, 32_UINT, NONE, WZYX), - VT(R32_FLOAT, 32_FLOAT, NONE, WZYX), + VT(R32_FLOAT, 32_FLOAT, R32_FLOAT,WZYX), V_(R32_FIXED, 32_FIXED, NONE, WZYX), _T(A32_UINT, 32_UINT, NONE, WZYX), @@ -148,7 +156,7 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = { VT(R16G16_SINT, 16_16_SINT, R16G16_SINT, WZYX), V_(R16G16_USCALED, 16_16_UINT, NONE, WZYX), V_(R16G16_SSCALED, 16_16_SINT, NONE, WZYX), - VT(R16G16_FLOAT, 16_16_FLOAT, NONE, WZYX), + VT(R16G16_FLOAT, 16_16_FLOAT, R16G16_FLOAT,WZYX), _T(L16A16_UINT, 16_16_UINT, NONE, WZYX), _T(L16A16_SINT, 16_16_SINT, NONE, WZYX), @@ -191,7 +199,8 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = { _T(Z24X8_UNORM, X8Z24_UNORM, R8G8B8A8_UNORM, WZYX), _T(Z24_UNORM_S8_UINT, X8Z24_UNORM, R8G8B8A8_UNORM, WZYX), - /*_T(Z32_FLOAT, Z32_FLOAT, R8G8B8A8_UNORM, WZYX),*/ + _T(Z32_FLOAT, 32_FLOAT, R8G8B8A8_UNORM, WZYX), + _T(Z32_FLOAT_S8X24_UINT, 32_FLOAT,R8G8B8A8_UNORM, WZYX), /* 48-bit */ V_(R16G16B16_UNORM, 16_16_16_UNORM, NONE, WZYX), @@ -218,7 +227,7 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = { VT(R32G32_SINT, 32_32_SINT, R32G32_SINT, WZYX), V_(R32G32_USCALED, 32_32_UINT, NONE, WZYX), V_(R32G32_SSCALED, 32_32_SINT, NONE, WZYX), - VT(R32G32_FLOAT, 32_32_FLOAT, NONE, WZYX), + VT(R32G32_FLOAT, 32_32_FLOAT, R32G32_FLOAT,WZYX), V_(R32G32_FIXED, 32_32_FIXED, NONE, WZYX), _T(L32A32_UINT, 32_32_UINT, NONE, WZYX), @@ -282,6 +291,9 @@ fd4_pipe2swap(enum pipe_format format) enum a4xx_tex_fetchsize fd4_pipe2fetchsize(enum pipe_format format) { + if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) + format = PIPE_FORMAT_Z32_FLOAT; + switch (util_format_get_blocksizebits(format)) { case 8: return TFETCH4_1_BYTE; case 16: return TFETCH4_2_BYTE; @@ -312,6 +324,8 @@ fd4_gmem_restore_format(enum pipe_format format) return PIPE_FORMAT_R8G8B8A8_UNORM; case PIPE_FORMAT_Z16_UNORM: return PIPE_FORMAT_R8G8_UNORM; + case PIPE_FORMAT_S8_UINT: + return PIPE_FORMAT_R8_UNORM; default: return format; } @@ -328,6 +342,9 @@ fd4_pipe2depth(enum 
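
The format-table changes above are what make separate stencil and 32-bit float depth workable: S8_UINT becomes restorable by mapping to R8_UNORM for the mem2gmem blit, the float formats gain render-target entries, and Z32_FLOAT_S8X24_UINT takes its fetch size from the depth plane alone, because with separate stencil the "interleaved" pipe format is actually stored as two planes. The special case, as in fd4_pipe2fetchsize above:

    /* stencil lives in its own resource, so size texture fetches for
     * the 32bpp depth plane (-> TFETCH4_4_BYTE), not the 64bpp
     * combined format: */
    if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
        format = PIPE_FORMAT_Z32_FLOAT;
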
pipe_format format) case PIPE_FORMAT_X8Z24_UNORM: case PIPE_FORMAT_S8_UINT_Z24_UNORM: return DEPTH4_24_8; + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return DEPTH4_32; default: return ~0; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c index 9a905062071..81c37f72565 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c @@ -44,12 +44,6 @@ #include "fd4_format.h" #include "fd4_zsa.h" -static const struct ir3_shader_key key = { - // XXX should set this based on render target format! We don't - // want half_precision if float32 render target!!! - .half_precision = true, -}; - static void emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, struct pipe_surface **bufs, uint32_t *bases, uint32_t bin_w) @@ -63,7 +57,7 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, tile_mode = TILE4_LINEAR; } - for (i = 0; i < 8; i++) { + for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { enum a4xx_color_fmt format = 0; enum a3xx_color_swap swap = WZYX; struct fd_resource *rsc = NULL; @@ -74,11 +68,23 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, if ((i < nr_bufs) && bufs[i]) { struct pipe_surface *psurf = bufs[i]; + enum pipe_format pformat = 0; rsc = fd_resource(psurf->texture); + pformat = psurf->format; + + /* In case we're drawing to Z32F_S8, the "color" actually goes to + * the stencil + */ + if (rsc->stencil) { + rsc = rsc->stencil; + pformat = rsc->base.b.format; + bases++; + } + slice = fd_resource_slice(rsc, psurf->u.tex.level); - format = fd4_pipe2color(psurf->format); - swap = fd4_pipe2swap(psurf->format); + format = fd4_pipe2color(pformat); + swap = fd4_pipe2swap(pformat); debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); @@ -94,6 +100,8 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, } else { stride = slice->pitch * rsc->cpp; } + } else if ((i < nr_bufs) && bases) { + base = bases[i]; } OUT_PKT0(ring, REG_A4XX_RB_MRT_BUF_INFO(i), 3); @@ -101,7 +109,7 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, A4XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) | A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(stride) | A4XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap)); - if (bin_w || (i >= nr_bufs)) { + if (bin_w || (i >= nr_bufs) || !bufs[i]) { OUT_RING(ring, base); OUT_RING(ring, A4XX_RB_MRT_CONTROL3_STRIDE(stride)); } else { @@ -115,30 +123,26 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, } } -static uint32_t -depth_base(struct fd_context *ctx) -{ - struct fd_gmem_stateobj *gmem = &ctx->gmem; - struct pipe_framebuffer_state *pfb = &ctx->framebuffer; - uint32_t cpp = 4; - if (pfb->cbufs[0]) { - struct fd_resource *rsc = - fd_resource(pfb->cbufs[0]->texture); - cpp = rsc->cpp; - } - return align(gmem->bin_w * gmem->bin_h * cpp, 0x4000); -} - /* transfer from gmem to system memory (ie. 
normal RAM) */ static void -emit_gmem2mem_surf(struct fd_context *ctx, +emit_gmem2mem_surf(struct fd_context *ctx, bool stencil, uint32_t base, struct pipe_surface *psurf) { struct fd_ringbuffer *ring = ctx->ring; struct fd_resource *rsc = fd_resource(psurf->texture); - struct fd_resource_slice *slice = &rsc->slices[psurf->u.tex.level]; - uint32_t offset = fd_resource_offset(rsc, psurf->u.tex.level, + enum pipe_format pformat = psurf->format; + struct fd_resource_slice *slice; + uint32_t offset; + + if (stencil) { + debug_assert(rsc->stencil); + rsc = rsc->stencil; + pformat = rsc->base.b.format; + } + + slice = &rsc->slices[psurf->u.tex.level]; + offset = fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer); debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); @@ -150,10 +154,10 @@ emit_gmem2mem_surf(struct fd_context *ctx, OUT_RELOCW(ring, rsc->bo, offset, 0, 0); /* RB_COPY_DEST_BASE */ OUT_RING(ring, A4XX_RB_COPY_DEST_PITCH_PITCH(slice->pitch * rsc->cpp)); OUT_RING(ring, A4XX_RB_COPY_DEST_INFO_TILE(TILE4_LINEAR) | - A4XX_RB_COPY_DEST_INFO_FORMAT(fd4_pipe2color(psurf->format)) | + A4XX_RB_COPY_DEST_INFO_FORMAT(fd4_pipe2color(pformat)) | A4XX_RB_COPY_DEST_INFO_COMPONENT_ENABLE(0xf) | A4XX_RB_COPY_DEST_INFO_ENDIAN(ENDIAN_NONE) | - A4XX_RB_COPY_DEST_INFO_SWAP(fd4_pipe2swap(psurf->format))); + A4XX_RB_COPY_DEST_INFO_SWAP(fd4_pipe2swap(pformat))); fd4_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, DI_SRC_SEL_AUTO_INDEX, 2, 1, INDEX_SIZE_IGN, 0, 0, NULL); @@ -163,13 +167,15 @@ static void fd4_emit_tile_gmem2mem(struct fd_context *ctx, struct fd_tile *tile) { struct fd4_context *fd4_ctx = fd4_context(ctx); + struct fd_gmem_stateobj *gmem = &ctx->gmem; struct fd_ringbuffer *ring = ctx->ring; struct pipe_framebuffer_state *pfb = &ctx->framebuffer; struct fd4_emit emit = { .vtx = &fd4_ctx->solid_vbuf_state, .prog = &ctx->solid_prog, - .key = key, - .format = fd4_emit_format(pfb->cbufs[0]), + .key = { + .half_precision = true, + }, }; OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1); @@ -238,16 +244,26 @@ fd4_emit_tile_gmem2mem(struct fd_context *ctx, struct fd_tile *tile) OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ OUT_RING(ring, 0); /* ??? 
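
Removing depth_base() (above) fixes a real limitation: it derived the depth tile offset from cbuf0's cpp alone, which cannot work with multiple color buffers or separate stencil. The per-buffer offsets are instead precomputed in the gmem state object; a sketch of the fields the hunks just below rely on (the actual declarations live in the shared freedreno headers, not in this diff):

    struct fd_gmem_stateobj {
        /* ... */
        uint32_t cbuf_base[MAX_RENDER_TARGETS]; /* per-MRT tile offset */
        uint32_t zsbuf_base[2];                 /* [0] depth, [1] stencil */
    };
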
UNKNOWN_2209 */ - fd4_program_emit(ring, &emit); + fd4_program_emit(ring, &emit, 0, NULL); fd4_emit_vertex_bufs(ring, &emit); if (ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { - uint32_t base = depth_base(ctx); - emit_gmem2mem_surf(ctx, base, pfb->zsbuf); + struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); + if (!rsc->stencil || (ctx->resolve & FD_BUFFER_DEPTH)) + emit_gmem2mem_surf(ctx, false, ctx->gmem.zsbuf_base[0], pfb->zsbuf); + if (rsc->stencil && (ctx->resolve & FD_BUFFER_STENCIL)) + emit_gmem2mem_surf(ctx, true, ctx->gmem.zsbuf_base[1], pfb->zsbuf); } if (ctx->resolve & FD_BUFFER_COLOR) { - emit_gmem2mem_surf(ctx, 0, pfb->cbufs[0]); + unsigned i; + for (i = 0; i < pfb->nr_cbufs; i++) { + if (!pfb->cbufs[i]) + continue; + if (!(ctx->resolve & (PIPE_CLEAR_COLOR0 << i))) + continue; + emit_gmem2mem_surf(ctx, false, gmem->cbuf_base[i], pfb->cbufs[i]); + } } OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); @@ -260,14 +276,25 @@ fd4_emit_tile_gmem2mem(struct fd_context *ctx, struct fd_tile *tile) /* transfer from system memory to gmem */ static void -emit_mem2gmem_surf(struct fd_context *ctx, uint32_t base, - struct pipe_surface *psurf, uint32_t bin_w) +emit_mem2gmem_surf(struct fd_context *ctx, uint32_t *bases, + struct pipe_surface **bufs, uint32_t nr_bufs, uint32_t bin_w) { struct fd_ringbuffer *ring = ctx->ring; + struct pipe_surface *zsbufs[2]; + + emit_mrt(ring, nr_bufs, bufs, bases, bin_w); + + if (bufs[0] && (bufs[0]->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)) { + /* The gmem_restore_tex logic will put the first buffer's stencil + * as color. Supply it with the proper information to make that + * happen. + */ + zsbufs[0] = zsbufs[1] = bufs[0]; + bufs = zsbufs; + nr_bufs = 2; + } - emit_mrt(ring, 1, &psurf, &base, bin_w); - - fd4_emit_gmem_restore_tex(ring, psurf); + fd4_emit_gmem_restore_tex(ring, nr_bufs, bufs); fd4_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, DI_SRC_SEL_AUTO_INDEX, 2, 1, INDEX_SIZE_IGN, 0, 0, NULL); @@ -282,10 +309,14 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile) struct pipe_framebuffer_state *pfb = &ctx->framebuffer; struct fd4_emit emit = { .vtx = &fd4_ctx->blit_vbuf_state, + .sprite_coord_enable = 1, + /* NOTE: They all use the same VP, this is for vtx bufs. */ .prog = &ctx->blit_prog[0], - .key = key, - .format = fd4_emit_format(pfb->cbufs[0]), + .key = { + .half_precision = fd_half_precision(pfb), + }, }; + unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0}; float x0, y0, x1, y1; unsigned bin_w = tile->bin_w; unsigned bin_h = tile->bin_h; @@ -304,7 +335,9 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile) OUT_RING(ring, fui(x1)); OUT_RING(ring, fui(y1)); - for (i = 0; i < 8; i++) { + for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { + mrt_comp[i] = ((i < pfb->nr_cbufs) && pfb->cbufs[i]) ? 
0xf : 0; + OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1); OUT_RING(ring, A4XX_RB_MRT_CONTROL_FASTCLEAR | A4XX_RB_MRT_CONTROL_B11 | @@ -319,6 +352,16 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile) A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(FACTOR_ZERO)); } + OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1); + OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) | + A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) | + A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) | + A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) | + A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) | + A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) | + A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) | + A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7])); + OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); OUT_RING(ring, 0x8); /* XXX RB_RENDER_CONTROL */ @@ -381,7 +424,6 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile) OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ OUT_RING(ring, 0); /* ??? UNKNOWN_2209 */ - fd4_program_emit(ring, &emit); fd4_emit_vertex_bufs(ring, &emit); /* for gmem pitch/base calculations, we need to use the non- @@ -390,11 +432,46 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile) bin_w = gmem->bin_w; bin_h = gmem->bin_h; - if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) - emit_mem2gmem_surf(ctx, depth_base(ctx), pfb->zsbuf, bin_w); + if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_COLOR)) { + emit.prog = &ctx->blit_prog[pfb->nr_cbufs - 1]; + emit.fp = NULL; /* frag shader changed so clear cache */ + fd4_program_emit(ring, &emit, pfb->nr_cbufs, pfb->cbufs); + emit_mem2gmem_surf(ctx, gmem->cbuf_base, pfb->cbufs, pfb->nr_cbufs, bin_w); + } - if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_COLOR)) - emit_mem2gmem_surf(ctx, 0, pfb->cbufs[0], bin_w); + if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { + switch (pfb->zsbuf->format) { + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + case PIPE_FORMAT_Z32_FLOAT: + emit.prog = (pfb->zsbuf->format == PIPE_FORMAT_Z32_FLOAT) ? + &ctx->blit_z : &ctx->blit_zs; + emit.key.half_precision = false; + + OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1); + OUT_RING(ring, A4XX_RB_DEPTH_CONTROL_Z_ENABLE | + A4XX_RB_DEPTH_CONTROL_Z_WRITE_ENABLE | + A4XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_ALWAYS) | + A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE); + + OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1); + OUT_RING(ring, A4XX_GRAS_ALPHA_CONTROL_ALPHA_TEST_ENABLE); + + OUT_PKT0(ring, REG_A4XX_GRAS_CL_CLIP_CNTL, 1); + OUT_RING(ring, 0x80000); /* GRAS_CL_CLIP_CNTL */ + + break; + default: + /* Non-float can use a regular color write. It's split over 8-bit + * components, so half precision is always sufficient. 
+ */ + emit.prog = &ctx->blit_prog[0]; + emit.key.half_precision = true; + break; + } + emit.fp = NULL; /* frag shader changed so clear cache */ + fd4_program_emit(ring, &emit, 1, &pfb->zsbuf); + emit_mem2gmem_surf(ctx, gmem->zsbuf_base, &pfb->zsbuf, 1, bin_w); + } OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | @@ -534,21 +611,35 @@ fd4_emit_tile_prep(struct fd_context *ctx, struct fd_tile *tile) struct fd_ringbuffer *ring = ctx->ring; struct pipe_framebuffer_state *pfb = &ctx->framebuffer; struct fd_gmem_stateobj *gmem = &ctx->gmem; - uint32_t reg; - OUT_PKT0(ring, REG_A4XX_RB_DEPTH_INFO, 3); - reg = A4XX_RB_DEPTH_INFO_DEPTH_BASE(depth_base(ctx)); if (pfb->zsbuf) { - reg |= A4XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd4_pipe2depth(pfb->zsbuf->format)); - } - OUT_RING(ring, reg); - if (pfb->zsbuf) { - uint32_t cpp = util_format_get_blocksize(pfb->zsbuf->format); + struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); + uint32_t cpp = rsc->cpp; + + OUT_PKT0(ring, REG_A4XX_RB_DEPTH_INFO, 3); + OUT_RING(ring, A4XX_RB_DEPTH_INFO_DEPTH_BASE(gmem->zsbuf_base[0]) | + A4XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd4_pipe2depth(pfb->zsbuf->format))); OUT_RING(ring, A4XX_RB_DEPTH_PITCH(cpp * gmem->bin_w)); OUT_RING(ring, A4XX_RB_DEPTH_PITCH2(cpp * gmem->bin_w)); + + OUT_PKT0(ring, REG_A4XX_RB_STENCIL_INFO, 2); + if (rsc->stencil) { + OUT_RING(ring, A4XX_RB_STENCIL_INFO_SEPARATE_STENCIL | + A4XX_RB_STENCIL_INFO_STENCIL_BASE(gmem->zsbuf_base[1])); + OUT_RING(ring, A4XX_RB_STENCIL_PITCH(rsc->stencil->cpp * gmem->bin_w)); + } else { + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + } } else { + OUT_PKT0(ring, REG_A4XX_RB_DEPTH_INFO, 3); OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_PKT0(ring, REG_A4XX_RB_STENCIL_INFO, 2); + OUT_RING(ring, 0); /* RB_STENCIL_INFO */ + OUT_RING(ring, 0); /* RB_STENCIL_PITCH */ } OUT_PKT0(ring, REG_A4XX_GRAS_DEPTH_CONTROL, 1); @@ -586,7 +677,7 @@ fd4_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile) OUT_RING(ring, CP_SET_BIN_1_X1(x1) | CP_SET_BIN_1_Y1(y1)); OUT_RING(ring, CP_SET_BIN_2_X2(x2) | CP_SET_BIN_2_Y2(y2)); - emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, gmem->bin_w); + emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, gmem->cbuf_base, gmem->bin_w); /* setup scissor/offset for current tile: */ OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c index e8f5837f7ce..1a6d0142132 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c @@ -31,8 +31,6 @@ #include "util/u_memory.h" #include "util/u_inlines.h" #include "util/u_format.h" -#include "tgsi/tgsi_dump.h" -#include "tgsi/tgsi_parse.h" #include "freedreno_program.h" @@ -53,7 +51,7 @@ create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state enum shader_t type) { struct fd4_shader_stateobj *so = CALLOC_STRUCT(fd4_shader_stateobj); - so->shader = ir3_shader_create(pctx, cso->tokens, type); + so->shader = ir3_shader_create(pctx, cso, type); return so; } @@ -213,14 +211,17 @@ setup_stages(struct fd4_emit *emit, struct stage *s) } void -fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit) +fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, + int nr, struct pipe_surface **bufs) { struct stage s[MAX_STAGES]; - uint32_t pos_regid, posz_regid, psize_regid, color_regid; + uint32_t pos_regid, 
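
The depth/stencil restore path above cannot reuse the color blit: z32 restore is done with an actual depth write, so the blit_z / blit_zs programs output gl_FragDepth, the depth test is forced to ALWAYS with writes enabled, and half precision must stay off because a mediump output would truncate a float32 depth value. The program selection, condensed from the hunk above:

    emit.prog = (pfb->zsbuf->format == PIPE_FORMAT_Z32_FLOAT)
            ? &ctx->blit_z      /* depth only */
            : &ctx->blit_zs;    /* depth + stencil */
    emit.key.half_precision = false;
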
posz_regid, psize_regid, color_regid[8]; uint32_t face_regid, coord_regid, zwcoord_regid; int constmode; int i, j, k; + debug_assert(nr <= ARRAY_SIZE(color_regid)); + setup_stages(emit, s); /* blob seems to always use constmode currently: */ @@ -232,11 +233,30 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit) ir3_semantic_name(TGSI_SEMANTIC_POSITION, 0)); psize_regid = ir3_find_output_regid(s[VS].v, ir3_semantic_name(TGSI_SEMANTIC_PSIZE, 0)); - color_regid = ir3_find_output_regid(s[FS].v, - ir3_semantic_name(TGSI_SEMANTIC_COLOR, 0)); + if (s[FS].v->color0_mrt) { + color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] = + color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] = + ir3_find_output_regid(s[FS].v, ir3_semantic_name(TGSI_SEMANTIC_COLOR, 0)); + } else { + const struct ir3_shader_variant *fp = s[FS].v; + memset(color_regid, 0, sizeof(color_regid)); + for (i = 0; i < fp->outputs_count; i++) { + ir3_semantic sem = fp->outputs[i].semantic; + unsigned idx = sem2idx(sem); + if (sem2name(sem) != TGSI_SEMANTIC_COLOR) + continue; + debug_assert(idx < ARRAY_SIZE(color_regid)); + color_regid[idx] = fp->outputs[i].regid; + } + } + + /* adjust regids for alpha output formats. there is no alpha render + * format, so it's just treated like red + */ + for (i = 0; i < nr; i++) + if (util_format_is_alpha(pipe_surface_format(bufs[i]))) + color_regid[i] += 3; - if (util_format_is_alpha(emit->pformat)) - color_regid += 3; /* TODO get these dynamically: */ face_regid = s[FS].v->frag_face ? regid(0,0) : regid(63,0); @@ -419,29 +439,24 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit) A4XX_RB_RENDER_CONTROL2_WCOORD)); OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT_REG, 1); - OUT_RING(ring, A4XX_RB_FS_OUTPUT_REG_MRT(1) | + OUT_RING(ring, A4XX_RB_FS_OUTPUT_REG_MRT(MAX2(1, nr)) | COND(s[FS].v->writes_pos, A4XX_RB_FS_OUTPUT_REG_FRAG_WRITES_Z)); OUT_PKT0(ring, REG_A4XX_SP_FS_OUTPUT_REG, 1); - if (s[FS].v->writes_pos) { - OUT_RING(ring, 0x00000001 | - A4XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE | - A4XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid)); - } else { - OUT_RING(ring, 0x00000001); - } + OUT_RING(ring, A4XX_SP_FS_OUTPUT_REG_MRT(MAX2(1, nr)) | + COND(s[FS].v->writes_pos, A4XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE) | + A4XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid)); OUT_PKT0(ring, REG_A4XX_SP_FS_MRT_REG(0), 8); - OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(color_regid) | - A4XX_SP_FS_MRT_REG_MRTFORMAT(emit->format) | - COND(emit->key.half_precision, A4XX_SP_FS_MRT_REG_HALF_PRECISION)); - OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0)); - OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0)); - OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0)); - OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0)); - OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0)); - OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0)); - OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0)); + for (i = 0; i < 8; i++) { + enum a4xx_color_fmt format = 0; + if (i < nr) + format = fd4_emit_format(bufs[i]); + OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(color_regid[i]) | + A4XX_SP_FS_MRT_REG_MRTFORMAT(format) | + COND(emit->key.half_precision, + A4XX_SP_FS_MRT_REG_HALF_PRECISION)); + } if (emit->key.binning_pass) { OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2); @@ -450,10 +465,10 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit) COND(s[VS].v->writes_psize, A4XX_VPC_ATTR_PSIZE)); OUT_RING(ring, 0x00000000); } else { - uint32_t vinterp[8], flatshade[2]; + uint32_t vinterp[8], vpsrepl[8]; memset(vinterp, 0, sizeof(vinterp)); - memset(flatshade, 0, 
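
The alpha-format adjustment above deserves a note: ir3 regids encode (register << 2) | component, so color_regid[i] += 3 moves the MRT write from the shader output's .x to its .w. There is no dedicated alpha render format; A8 and friends are laid out like "red", so the value written as red must be the shader's alpha component:

    for (i = 0; i < nr; i++)
        if (util_format_is_alpha(pipe_surface_format(bufs[i])))
            color_regid[i] += 3;   /* .x -> .w of the vec4 output */
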
sizeof(flatshade)); + memset(vpsrepl, 0, sizeof(vpsrepl)); /* looks like we need to do int varyings in the frag * shader on a4xx (no flatshad reg? or a420.0 bug?): @@ -470,29 +485,40 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit) * something like the code below instead of workaround * in the shader: */ -#if 0 - /* figure out VARYING_INTERP / FLAT_SHAD register values: */ + /* figure out VARYING_INTERP / VARYING_PS_REPL register values: */ for (j = -1; (j = ir3_next_varying(s[FS].v, j)) < (int)s[FS].v->inputs_count; ) { uint32_t interp = s[FS].v->inputs[j].interpolate; + + /* TODO might be cleaner to just +8 in SP_VS_VPC_DST_REG + * instead.. rather than -8 everywhere else.. + */ + uint32_t inloc = s[FS].v->inputs[j].inloc - 8; + + /* currently assuming varyings aligned to 4 (not + * packed): + */ + debug_assert((inloc % 4) == 0); + if ((interp == TGSI_INTERPOLATE_CONSTANT) || ((interp == TGSI_INTERPOLATE_COLOR) && emit->rasterflat)) { - /* TODO might be cleaner to just +8 in SP_VS_VPC_DST_REG - * instead.. rather than -8 everywhere else.. - */ - uint32_t loc = s[FS].v->inputs[j].inloc - 8; - - /* currently assuming varyings aligned to 4 (not - * packed): - */ - debug_assert((loc % 4) == 0); + uint32_t loc = inloc; for (i = 0; i < 4; i++, loc++) { vinterp[loc / 16] |= 1 << ((loc % 16) * 2); - flatshade[loc / 32] |= 1 << (loc % 32); + //flatshade[loc / 32] |= 1 << (loc % 32); } } + + /* Replace the .xy coordinates with S/T from the point sprite. Set + * interpolation bits for .zw such that they become .01 + */ + if (emit->sprite_coord_enable & (1 << sem2idx(s[FS].v->inputs[j].semantic))) { + vpsrepl[inloc / 16] |= (emit->sprite_coord_mode ? 0x0d : 0x09) + << ((inloc % 16) * 2); + vinterp[(inloc + 2) / 16] |= 2 << (((inloc + 2) % 16) * 2); + vinterp[(inloc + 3) / 16] |= 3 << (((inloc + 3) % 16) * 2); + } } -#endif OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2); OUT_RING(ring, A4XX_VPC_ATTR_TOTALATTR(s[FS].v->total_in) | @@ -509,7 +535,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit) OUT_PKT0(ring, REG_A4XX_VPC_VARYING_PS_REPL_MODE(0), 8); for (i = 0; i < 8; i++) - OUT_RING(ring, s[FS].v->shader->vpsrepl[i]); /* VPC_VARYING_PS_REPL[i] */ + OUT_RING(ring, vpsrepl[i]); /* VPC_VARYING_PS_REPL[i] */ } if (s[VS].instrlen) @@ -520,19 +546,6 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit) emit_shader(ring, s[FS].v); } -/* hack.. until we figure out how to deal w/ vpsrepl properly.. 
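
The vpsrepl computation above replaces the fixed table from fix_blit_fp (removed just below). VARYING_PS_REPL and VARYING_INTERP hold one 2-bit selector per scalar varying component, sixteen components per register, so the 4-bit constant 0x09 / 0x0d covers the .x and .y selectors of a texcoord in one shift (the two encodings appear to differ only in T orientation, per sprite_coord_mode), while .zw are forced to constant 0 and 1 through vinterp:

    /* inloc is the scalar location of the varying's .x component: */
    vpsrepl[inloc / 16] |= (emit->sprite_coord_mode ? 0x0d : 0x09)
            << ((inloc % 16) * 2);                /* .xy <- sprite S,T */
    vinterp[(inloc + 2) / 16] |= 2 << (((inloc + 2) % 16) * 2); /* .z <- 0 */
    vinterp[(inloc + 3) / 16] |= 3 << (((inloc + 3) % 16) * 2); /* .w <- 1 */
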
*/ -static void -fix_blit_fp(struct pipe_context *pctx) -{ - struct fd_context *ctx = fd_context(pctx); - struct fd4_shader_stateobj *so = ctx->blit_prog[0].fp; - - so->shader->vpsrepl[0] = 0x99999999; - so->shader->vpsrepl[1] = 0x99999999; - so->shader->vpsrepl[2] = 0x99999999; - so->shader->vpsrepl[3] = 0x99999999; -} - void fd4_prog_init(struct pipe_context *pctx) { @@ -543,6 +556,4 @@ fd4_prog_init(struct pipe_context *pctx) pctx->delete_vs_state = fd4_vp_state_delete; fd_prog_init(pctx); - - fix_blit_fp(pctx); } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.h b/src/gallium/drivers/freedreno/a4xx/fd4_program.h index 52306a4c60d..8dfccaf9d74 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_program.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.h @@ -39,7 +39,8 @@ struct fd4_shader_stateobj { struct fd4_emit; -void fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit); +void fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, + int nr, struct pipe_surface **bufs); void fd4_prog_init(struct pipe_context *pctx); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_query.c b/src/gallium/drivers/freedreno/a4xx/fd4_query.c index 6db1c11b94b..4f69e0c1694 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_query.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_query.c @@ -31,9 +31,93 @@ #include "freedreno_util.h" #include "fd4_query.h" +#include "fd4_draw.h" #include "fd4_format.h" + +struct fd_rb_samp_ctrs { + uint64_t ctr[16]; +}; + +/* + * Occlusion Query: + * + * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they + * interpret results + */ + +static struct fd_hw_sample * +occlusion_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring) +{ + struct fd_hw_sample *samp = + fd_hw_sample_init(ctx, sizeof(struct fd_rb_samp_ctrs)); + + /* low bits of sample addr should be zero (since they are control + * flags in RB_SAMPLE_COUNT_CONTROL): + */ + debug_assert((samp->offset & 0x3) == 0); + + /* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of + * HW_QUERY_BASE_REG register: + */ + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A4XX_RB_SAMPLE_COUNT_CONTROL) | 0x80000000); + OUT_RING(ring, HW_QUERY_BASE_REG); + OUT_RING(ring, A4XX_RB_SAMPLE_COUNT_CONTROL_COPY | + samp->offset); + + OUT_PKT3(ring, CP_DRAW_INDX_OFFSET, 3); + OUT_RING(ring, DRAW4(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX, + INDEX4_SIZE_32_BIT, USE_VISIBILITY)); + OUT_RING(ring, 1); /* NumInstances */ + OUT_RING(ring, 0); /* NumIndices */ + + fd_event_write(ctx, ring, ZPASS_DONE); + + return samp; +} + +static uint64_t +count_samples(const struct fd_rb_samp_ctrs *start, + const struct fd_rb_samp_ctrs *end) +{ + return end->ctr[0] - start->ctr[0]; +} + +static void +occlusion_counter_accumulate_result(struct fd_context *ctx, + const void *start, const void *end, + union pipe_query_result *result) +{ + uint64_t n = count_samples(start, end); + result->u64 += n; +} + +static void +occlusion_predicate_accumulate_result(struct fd_context *ctx, + const void *start, const void *end, + union pipe_query_result *result) +{ + uint64_t n = count_samples(start, end); + result->b |= (n > 0); +} + +static const struct fd_hw_sample_provider occlusion_counter = { + .query_type = PIPE_QUERY_OCCLUSION_COUNTER, + .active = FD_STAGE_DRAW, + .get_sample = occlusion_get_sample, + .accumulate_result = occlusion_counter_accumulate_result, +}; + +static const struct fd_hw_sample_provider occlusion_predicate = { + .query_type = PIPE_QUERY_OCCLUSION_PREDICATE, + 
.active = FD_STAGE_DRAW, + .get_sample = occlusion_get_sample, + .accumulate_result = occlusion_predicate_accumulate_result, +}; + void fd4_query_context_init(struct pipe_context *pctx) { - /* TODO */ + fd_hw_query_register_provider(pctx, &occlusion_counter); + fd_hw_query_register_provider(pctx, &occlusion_predicate); } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c index e54b606a285..dc7e98b149d 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c @@ -50,7 +50,7 @@ fd4_rasterizer_state_create(struct pipe_context *pctx, if (cso->point_size_per_vertex) { psize_min = util_get_min_point_size(cso); - psize_max = 8192; + psize_max = 4092; } else { /* Force the point size to be as if the vertex output was disabled. */ psize_min = cso->point_size; @@ -67,9 +67,9 @@ fd4_rasterizer_state_create(struct pipe_context *pctx, */ so->gras_cl_clip_cntl = 0x80000; /* ??? */ so->gras_su_point_minmax = - A4XX_GRAS_SU_POINT_MINMAX_MIN(psize_min/2) | - A4XX_GRAS_SU_POINT_MINMAX_MAX(psize_max/2); - so->gras_su_point_size = A4XX_GRAS_SU_POINT_SIZE(cso->point_size/2); + A4XX_GRAS_SU_POINT_MINMAX_MIN(psize_min) | + A4XX_GRAS_SU_POINT_MINMAX_MAX(psize_max); + so->gras_su_point_size = A4XX_GRAS_SU_POINT_SIZE(cso->point_size); so->gras_su_poly_offset_scale = A4XX_GRAS_SU_POLY_OFFSET_SCALE(cso->offset_scale); so->gras_su_poly_offset_offset = diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h index 06c728f2f1f..64e81a9983b 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h @@ -44,7 +44,7 @@ struct fd4_rasterizer_stateobj { uint32_t pc_prim_vtx_cntl; }; -static INLINE struct fd4_rasterizer_stateobj * +static inline struct fd4_rasterizer_stateobj * fd4_rasterizer_stateobj(struct pipe_rasterizer_state *rast) { return (struct fd4_rasterizer_stateobj *)rast; diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c index e8cbb2d201a..d8ea414f300 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c @@ -102,7 +102,7 @@ void fd4_screen_init(struct pipe_screen *pscreen) { struct fd_screen *screen = fd_screen(pscreen); - screen->max_rts = 1; + screen->max_rts = A4XX_MAX_RENDER_TARGETS; screen->compiler = ir3_compiler_create(screen->gpu_id); pscreen->context_create = fd4_context_create; pscreen->is_format_supported = fd4_screen_is_format_supported; diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c index 6ba25d0816d..d2bc5fee6c0 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c @@ -150,8 +150,8 @@ fd4_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc, { struct fd4_pipe_sampler_view *so = CALLOC_STRUCT(fd4_pipe_sampler_view); struct fd_resource *rsc = fd_resource(prsc); - unsigned lvl = cso->u.tex.first_level; - unsigned miplevels = cso->u.tex.last_level - lvl; + unsigned lvl = fd_sampler_first_level(cso); + unsigned miplevels = fd_sampler_last_level(cso) - lvl; if (!so) return NULL; diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_texture.h b/src/gallium/drivers/freedreno/a4xx/fd4_texture.h index 579ed87f14b..84ee7ecb50c 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.h +++ 
b/src/gallium/drivers/freedreno/a4xx/fd4_texture.h @@ -42,7 +42,7 @@ struct fd4_sampler_stateobj { uint32_t texsamp0, texsamp1; }; -static INLINE struct fd4_sampler_stateobj * +static inline struct fd4_sampler_stateobj * fd4_sampler_stateobj(struct pipe_sampler_state *samp) { return (struct fd4_sampler_stateobj *)samp; @@ -53,7 +53,7 @@ struct fd4_pipe_sampler_view { uint32_t texconst0, texconst1, texconst2, texconst3, textconst4; }; -static INLINE struct fd4_pipe_sampler_view * +static inline struct fd4_pipe_sampler_view * fd4_pipe_sampler_view(struct pipe_sampler_view *pview) { return (struct fd4_pipe_sampler_view *)pview; diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h b/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h index 033317cf620..6a92a9b6785 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h @@ -47,7 +47,7 @@ struct fd4_zsa_stateobj { uint32_t rb_stencilrefmask_bf; }; -static INLINE struct fd4_zsa_stateobj * +static inline struct fd4_zsa_stateobj * fd4_zsa_stateobj(struct pipe_depth_stencil_alpha_state *zsa) { return (struct fd4_zsa_stateobj *)zsa; diff --git a/src/gallium/drivers/freedreno/adreno_common.xml.h b/src/gallium/drivers/freedreno/adreno_common.xml.h index b23aa830770..00b6acba065 100644 --- a/src/gallium/drivers/freedreno/adreno_common.xml.h +++ b/src/gallium/drivers/freedreno/adreno_common.xml.h @@ -8,15 +8,15 @@ http://github.com/freedreno/envytools/ git clone https://github.com/freedreno/envytools.git The rules-ng-ng source files this header was generated from are: -- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2013-11-30 14:47:15) -- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2013-03-31 16:51:27) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2014-06-02 15:21:30) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2014-11-13 22:44:30) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14895 bytes, from 2015-04-19 15:23:28) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-04-12 18:16:35) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 59314 bytes, from 2015-04-19 16:21:40) - -Copyright (C) 2013-2014 by the following authors: +- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07) +- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2015-05-20 20:03:14) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-05-20 20:03:14) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63728 bytes, from 2015-08-05 18:07:28) + +Copyright (C) 2013-2015 by the following authors: - Rob Clark <[email protected]> (robclark) Permission is hereby granted, free of charge, to any person obtaining diff --git a/src/gallium/drivers/freedreno/adreno_pm4.xml.h b/src/gallium/drivers/freedreno/adreno_pm4.xml.h index 2b24c5b4e78..98a90e26679 100644 --- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h +++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h @@ 
-8,13 +8,13 @@ http://github.com/freedreno/envytools/ git clone https://github.com/freedreno/envytools.git The rules-ng-ng source files this header was generated from are: -- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2013-11-30 14:47:15) -- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2013-03-31 16:51:27) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2014-06-02 15:21:30) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2014-11-13 22:44:30) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14895 bytes, from 2015-04-19 15:23:28) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-04-12 18:16:35) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 59314 bytes, from 2015-04-19 16:21:40) +- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07) +- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2015-05-20 20:03:14) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-05-20 20:03:14) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63728 bytes, from 2015-08-05 18:07:28) Copyright (C) 2013-2015 by the following authors: - Rob Clark <[email protected]> (robclark) @@ -67,7 +67,7 @@ enum vgt_event_type { enum pc_di_primtype { DI_PT_NONE = 0, - DI_PT_POINTLIST_A2XX = 1, + DI_PT_POINTLIST_PSIZE = 1, DI_PT_LINELIST = 2, DI_PT_LINESTRIP = 3, DI_PT_TRILIST = 4, @@ -75,7 +75,7 @@ enum pc_di_primtype { DI_PT_TRISTRIP = 6, DI_PT_LINELOOP = 7, DI_PT_RECTLIST = 8, - DI_PT_POINTLIST_A3XX = 9, + DI_PT_POINTLIST = 9, DI_PT_LINE_ADJ = 10, DI_PT_LINESTRIP_ADJ = 11, DI_PT_TRI_ADJ = 12, diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c index 668ef3629bf..8e6d43150ce 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.c +++ b/src/gallium/drivers/freedreno/freedreno_context.c @@ -94,9 +94,7 @@ void fd_context_render(struct pipe_context *pctx) { struct fd_context *ctx = fd_context(pctx); - struct pipe_framebuffer_state *pfb = &ctx->framebuffer; struct fd_resource *rsc, *rsc_tmp; - int i; DBG("needs_flush: %d", ctx->needs_flush); @@ -118,20 +116,11 @@ fd_context_render(struct pipe_context *pctx) ctx->gmem_reason = 0; ctx->num_draws = 0; - for (i = 0; i < pfb->nr_cbufs; i++) - if (pfb->cbufs[i]) - fd_resource(pfb->cbufs[i]->texture)->dirty = false; - if (pfb->zsbuf) { - rsc = fd_resource(pfb->zsbuf->texture); - rsc->dirty = false; - if (rsc->stencil) - rsc->stencil->dirty = false; - } - /* go through all the used resources and clear their reading flag */ LIST_FOR_EACH_ENTRY_SAFE(rsc, rsc_tmp, &ctx->used_resources, list) { - assert(rsc->reading); - rsc->reading = false; + debug_assert(rsc->status != 0); + rsc->status = 0; + rsc->pending_ctx = NULL; list_delinit(&rsc->list); } @@ -144,8 +133,10 @@ fd_context_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence, { fd_context_render(pctx); - if (fence) + if (fence) { + fd_screen_fence_ref(pctx->screen, fence, 
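
The pc_di_primtype renames above (DI_PT_POINTLIST_A2XX -> DI_PT_POINTLIST_PSIZE, DI_PT_POINTLIST_A3XX -> DI_PT_POINTLIST) reflect that the distinction is the per-vertex point size, not the GPU generation, and they are exactly what the fd4_draw.h hunk near the top of this diff keys off:

    /* points + per-vertex point size -> spritelist primtype: */
    if (ctx->rasterizer && ctx->rasterizer->point_size_per_vertex &&
        (info->mode == PIPE_PRIM_POINTS))
        primtype = DI_PT_POINTLIST_PSIZE;
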
NULL); *fence = fd_fence_create(pctx); + } } void diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h index e420f1e5bd9..509a90fdf23 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.h +++ b/src/gallium/drivers/freedreno/freedreno_context.h @@ -82,6 +82,20 @@ struct fd_vertex_stateobj { unsigned num_elements; }; +struct fd_streamout_stateobj { + struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS]; + unsigned num_targets; + /* Track offset from vtxcnt for streamout data. This counter + * is just incremented by # of vertices on each draw until + * reset or new streamout buffer bound. + * + * When we eventually have GS, the CPU won't actually know the + * number of vertices per draw, so I think we'll have to do + * something more clever. + */ + unsigned offsets[PIPE_MAX_SO_BUFFERS]; +}; + /* group together the vertex and vertexbuf state.. for ease of passing * around, and because various internal operations (gmem<->mem, etc) * need their own vertex state: @@ -179,7 +193,7 @@ struct fd_context { struct fd_program_stateobj solid_prog; // TODO move to screen? /* shaders used by mem->gmem blits: */ - struct fd_program_stateobj blit_prog[8]; // TODO move to screen? + struct fd_program_stateobj blit_prog[MAX_RENDER_TARGETS]; // TODO move to screen? struct fd_program_stateobj blit_z, blit_zs; /* do we need to mem2gmem before rendering. We don't, if for example, @@ -319,6 +333,7 @@ struct fd_context { FD_DIRTY_VTXBUF = (1 << 15), FD_DIRTY_INDEXBUF = (1 << 16), FD_DIRTY_SCISSOR = (1 << 17), + FD_DIRTY_STREAMOUT = (1 << 18), } dirty; struct pipe_blend_state *blend; @@ -339,6 +354,7 @@ struct fd_context { struct pipe_viewport_state viewport; struct fd_constbuf_stateobj constbuf[PIPE_SHADER_TYPES]; struct pipe_index_buffer indexbuf; + struct fd_streamout_stateobj streamout; /* GMEM/tile handling fxns: */ void (*emit_tile_init)(struct fd_context *ctx); @@ -351,18 +367,25 @@ struct fd_context { void (*emit_sysmem_prep)(struct fd_context *ctx); /* draw: */ - void (*draw_vbo)(struct fd_context *pctx, const struct pipe_draw_info *info); + void (*draw_vbo)(struct fd_context *ctx, const struct pipe_draw_info *info); void (*clear)(struct fd_context *ctx, unsigned buffers, const union pipe_color_union *color, double depth, unsigned stencil); + + /* constant emit: (note currently not used/needed for a2xx) */ + void (*emit_const)(struct fd_ringbuffer *ring, enum shader_t type, + uint32_t regid, uint32_t offset, uint32_t sizedwords, + const uint32_t *dwords, struct pipe_resource *prsc); + void (*emit_const_bo)(struct fd_ringbuffer *ring, enum shader_t type, boolean write, + uint32_t regid, uint32_t num, struct fd_bo **bos, uint32_t *offsets); }; -static INLINE struct fd_context * +static inline struct fd_context * fd_context(struct pipe_context *pctx) { return (struct fd_context *)pctx; } -static INLINE struct pipe_scissor_state * +static inline struct pipe_scissor_state * fd_context_get_scissor(struct fd_context *ctx) { if (ctx->rasterizer && ctx->rasterizer->scissor) @@ -370,13 +393,13 @@ fd_context_get_scissor(struct fd_context *ctx) return &ctx->disabled_scissor; } -static INLINE bool +static inline bool fd_supported_prim(struct fd_context *ctx, unsigned prim) { return (1 << prim) & ctx->primtype_mask; } -static INLINE void +static inline void fd_reset_wfi(struct fd_context *ctx) { ctx->needs_wfi = true; diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c index 
c9e317c7dc9..6831a58749c 100644 --- a/src/gallium/drivers/freedreno/freedreno_draw.c +++ b/src/gallium/drivers/freedreno/freedreno_draw.c @@ -40,7 +40,8 @@ #include "freedreno_util.h" static void -resource_reading(struct fd_context *ctx, struct pipe_resource *prsc) +resource_used(struct fd_context *ctx, struct pipe_resource *prsc, + enum fd_resource_status status) { struct fd_resource *rsc; @@ -48,9 +49,29 @@ resource_reading(struct fd_context *ctx, struct pipe_resource *prsc) return; rsc = fd_resource(prsc); - rsc->reading = true; + rsc->status |= status; + if (rsc->stencil) + rsc->stencil->status |= status; + + /* TODO resources can actually be shared across contexts, + * so I'm not sure a single list-head will do the trick? + */ + debug_assert((rsc->pending_ctx == ctx) || !rsc->pending_ctx); list_delinit(&rsc->list); list_addtail(&rsc->list, &ctx->used_resources); + rsc->pending_ctx = ctx; +} + +static void +resource_read(struct fd_context *ctx, struct pipe_resource *prsc) +{ + resource_used(ctx, prsc, FD_PENDING_READ); +} + +static void +resource_written(struct fd_context *ctx, struct pipe_resource *prsc) +{ + resource_used(ctx, prsc, FD_PENDING_WRITE); } static void @@ -59,7 +80,7 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) struct fd_context *ctx = fd_context(pctx); struct pipe_framebuffer_state *pfb = &ctx->framebuffer; struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx); - unsigned i, buffers = 0; + unsigned i, prims, buffers = 0; /* if we supported transform feedback, we'd have to disable this: */ if (((scissor->maxx - scissor->minx) * @@ -69,6 +90,8 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) /* emulate unsupported primitives: */ if (!fd_supported_prim(ctx, info->mode)) { + if (ctx->streamout.num_targets > 0) + debug_error("stream-out with emulated prims"); util_primconvert_save_index_buffer(ctx->primconvert, &ctx->indexbuf); util_primconvert_save_rasterizer_state(ctx->primconvert, ctx->rasterizer); util_primconvert_draw_vbo(ctx->primconvert, info); @@ -83,17 +106,13 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) if (fd_depth_enabled(ctx)) { buffers |= FD_BUFFER_DEPTH; - fd_resource(pfb->zsbuf->texture)->dirty = true; + resource_written(ctx, pfb->zsbuf->texture); ctx->gmem_reason |= FD_GMEM_DEPTH_ENABLED; } if (fd_stencil_enabled(ctx)) { - struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); buffers |= FD_BUFFER_STENCIL; - if (rsc->stencil) - rsc->stencil->dirty = true; - else - rsc->dirty = true; + resource_written(ctx, pfb->zsbuf->texture); ctx->gmem_reason |= FD_GMEM_STENCIL_ENABLED; } @@ -108,7 +127,7 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) surf = pfb->cbufs[i]->texture; - fd_resource(surf)->dirty = true; + resource_written(ctx, surf); buffers |= PIPE_CLEAR_COLOR0 << i; if (surf->nr_samples > 1) @@ -120,32 +139,38 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) /* Skip over buffer 0, that is sent along with the command stream */ for (i = 1; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { - resource_reading(ctx, ctx->constbuf[PIPE_SHADER_VERTEX].cb[i].buffer); - resource_reading(ctx, ctx->constbuf[PIPE_SHADER_FRAGMENT].cb[i].buffer); + resource_read(ctx, ctx->constbuf[PIPE_SHADER_VERTEX].cb[i].buffer); + resource_read(ctx, ctx->constbuf[PIPE_SHADER_FRAGMENT].cb[i].buffer); } /* Mark VBOs as being read */ for (i = 0; i < ctx->vtx.vertexbuf.count; i++) { assert(!ctx->vtx.vertexbuf.vb[i].user_buffer); - 
resource_reading(ctx, ctx->vtx.vertexbuf.vb[i].buffer); + resource_read(ctx, ctx->vtx.vertexbuf.vb[i].buffer); } /* Mark index buffer as being read */ - resource_reading(ctx, ctx->indexbuf.buffer); + resource_read(ctx, ctx->indexbuf.buffer); /* Mark textures as being read */ for (i = 0; i < ctx->verttex.num_textures; i++) if (ctx->verttex.textures[i]) - resource_reading(ctx, ctx->verttex.textures[i]->texture); + resource_read(ctx, ctx->verttex.textures[i]->texture); for (i = 0; i < ctx->fragtex.num_textures; i++) if (ctx->fragtex.textures[i]) - resource_reading(ctx, ctx->fragtex.textures[i]->texture); + resource_read(ctx, ctx->fragtex.textures[i]->texture); + + /* Mark streamout buffers as being written.. */ + for (i = 0; i < ctx->streamout.num_targets; i++) + if (ctx->streamout.targets[i]) + resource_written(ctx, ctx->streamout.targets[i]->buffer); ctx->num_draws++; + prims = u_reduced_prims_for_vertices(info->mode, info->count); + ctx->stats.draw_calls++; - ctx->stats.prims_emitted += - u_reduced_prims_for_vertices(info->mode, info->count); + ctx->stats.prims_emitted += prims; /* any buffers that haven't been cleared yet, we need to restore: */ ctx->restore |= buffers & (FD_BUFFER_ALL & ~ctx->cleared); @@ -159,6 +184,9 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_DRAW); ctx->draw_vbo(ctx, info); + for (i = 0; i < ctx->streamout.num_targets; i++) + ctx->streamout.offsets[i] += prims; + /* if an app (or, well, piglit test) does many thousands of draws * without flush (or anything which implicitly flushes, like * changing render targets), we can exceed the ringbuffer size. @@ -216,15 +244,10 @@ fd_clear(struct pipe_context *pctx, unsigned buffers, if (buffers & PIPE_CLEAR_COLOR) for (i = 0; i < pfb->nr_cbufs; i++) if (buffers & (PIPE_CLEAR_COLOR0 << i)) - fd_resource(pfb->cbufs[i]->texture)->dirty = true; + resource_written(ctx, pfb->cbufs[i]->texture); if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) { - struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); - if (rsc->stencil && buffers & PIPE_CLEAR_STENCIL) - rsc->stencil->dirty = true; - if (!rsc->stencil || buffers & PIPE_CLEAR_DEPTH) - rsc->dirty = true; - + resource_written(ctx, pfb->zsbuf->texture); ctx->gmem_reason |= FD_GMEM_CLEARS_DEPTH_STENCIL; } @@ -242,7 +265,8 @@ fd_clear(struct pipe_context *pctx, unsigned buffers, FD_DIRTY_SAMPLE_MASK | FD_DIRTY_PROG | FD_DIRTY_CONSTBUF | - FD_DIRTY_BLEND; + FD_DIRTY_BLEND | + FD_DIRTY_FRAMEBUFFER; if (fd_mesa_debug & FD_DBG_DCLEAR) ctx->dirty = 0xffffffff; diff --git a/src/gallium/drivers/freedreno/freedreno_fence.c b/src/gallium/drivers/freedreno/freedreno_fence.c index 375e58f7022..04a9feacd58 100644 --- a/src/gallium/drivers/freedreno/freedreno_fence.c +++ b/src/gallium/drivers/freedreno/freedreno_fence.c @@ -69,6 +69,9 @@ boolean fd_screen_fence_finish(struct pipe_screen *screen, struct pipe_fence_handle *fence, uint64_t timeout) { + if (!timeout) + return fd_screen_fence_signalled(screen, fence); + if (fd_pipe_wait(fence->screen->pipe, fence->timestamp)) return false; diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c index c105378ec4e..648db9baee5 100644 --- a/src/gallium/drivers/freedreno/freedreno_gmem.c +++ b/src/gallium/drivers/freedreno/freedreno_gmem.c @@ -82,7 +82,7 @@ total_size(uint8_t cbuf_cpp[], uint8_t zsbuf_cpp[2], { uint32_t total = 0, i; - for (i = 0; i < 4; i++) { + for (i = 0; i < MAX_RENDER_TARGETS; i++) { if 
(cbuf_cpp[i]) { gmem->cbuf_base[i] = align(total, 0x4000); total = gmem->cbuf_base[i] + cbuf_cpp[i] * bin_w * bin_h; @@ -113,7 +113,7 @@ calculate_tiles(struct fd_context *ctx) uint32_t nbins_x = 1, nbins_y = 1; uint32_t bin_w, bin_h; uint32_t max_width = bin_width(ctx); - uint8_t cbuf_cpp[4] = {0}, zsbuf_cpp[2] = {0}; + uint8_t cbuf_cpp[MAX_RENDER_TARGETS] = {0}, zsbuf_cpp[2] = {0}; uint32_t i, j, t, xoff, yoff; uint32_t tpp_x, tpp_y; bool has_zs = !!(ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)); @@ -162,12 +162,17 @@ calculate_tiles(struct fd_context *ctx) bin_w = align(width / nbins_x, 32); } + if (fd_mesa_debug & FD_DBG_MSGS) { + debug_printf("binning input: cbuf cpp:"); + for (i = 0; i < pfb->nr_cbufs; i++) + debug_printf(" %d", cbuf_cpp[i]); + debug_printf(", zsbuf cpp: %d; %dx%d\n", + zsbuf_cpp[0], width, height); + } + /* then find a bin width/height that satisfies the memory * constraints: */ - DBG("binning input: cbuf cpp: %d %d %d %d, zsbuf cpp: %d; %dx%d", - cbuf_cpp[0], cbuf_cpp[1], cbuf_cpp[2], cbuf_cpp[3], zsbuf_cpp[0], - width, height); while (total_size(cbuf_cpp, zsbuf_cpp, bin_w, bin_h, gmem) > gmem_size) { if (bin_w > bin_h) { nbins_x++; diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.h b/src/gallium/drivers/freedreno/freedreno_gmem.h index 5867235db90..38b557eb077 100644 --- a/src/gallium/drivers/freedreno/freedreno_gmem.h +++ b/src/gallium/drivers/freedreno/freedreno_gmem.h @@ -31,6 +31,8 @@ #include "pipe/p_context.h" +#include "freedreno_util.h" + /* per-pipe configuration for hw binning: */ struct fd_vsc_pipe { struct fd_bo *bo; @@ -47,9 +49,9 @@ struct fd_tile { struct fd_gmem_stateobj { struct pipe_scissor_state scissor; - uint32_t cbuf_base[4]; + uint32_t cbuf_base[MAX_RENDER_TARGETS]; uint32_t zsbuf_base[2]; - uint8_t cbuf_cpp[4]; + uint8_t cbuf_cpp[MAX_RENDER_TARGETS]; uint8_t zsbuf_cpp[2]; uint16_t bin_h, nbins_y; uint16_t bin_w, nbins_x; diff --git a/src/gallium/drivers/freedreno/freedreno_program.c b/src/gallium/drivers/freedreno/freedreno_program.c index 5e344e69146..e6a647852a3 100644 --- a/src/gallium/drivers/freedreno/freedreno_program.c +++ b/src/gallium/drivers/freedreno/freedreno_program.c @@ -96,7 +96,11 @@ fd_prog_blit(struct pipe_context *pctx, int rts, bool depth) { int i; struct ureg_src tc; - struct ureg_program *ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT); + struct ureg_program *ureg; + + debug_assert(rts <= MAX_RENDER_TARGETS); + + ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT); if (!ureg) return NULL; diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c index 95f79df565e..709ad4eb55b 100644 --- a/src/gallium/drivers/freedreno/freedreno_resource.c +++ b/src/gallium/drivers/freedreno/freedreno_resource.c @@ -42,6 +42,14 @@ #include <errno.h> + +static bool +pending(struct fd_resource *rsc, enum fd_resource_status status) +{ + return (rsc->status & status) || + (rsc->stencil && (rsc->stencil->status & status)); +} + static void fd_invalidate_resource(struct fd_context *ctx, struct pipe_resource *prsc) { @@ -72,11 +80,11 @@ fd_invalidate_resource(struct fd_context *ctx, struct pipe_resource *prsc) /* Textures */ for (i = 0; i < ctx->verttex.num_textures && !(ctx->dirty & FD_DIRTY_VERTTEX); i++) { - if (ctx->verttex.textures[i]->texture == prsc) + if (ctx->verttex.textures[i] && (ctx->verttex.textures[i]->texture == prsc)) ctx->dirty |= FD_DIRTY_VERTTEX; } for (i = 0; i < ctx->fragtex.num_textures && !(ctx->dirty & FD_DIRTY_FRAGTEX); i++) { - if 
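
Note: the calculate_tiles() hunk above keeps the existing bin-fitting strategy while sizing its arrays by MAX_RENDER_TARGETS. As a standalone illustration of that strategy (not driver code; the single-cpp footprint is a simplification of the real per-target, 0x4000-aligned total_size() math, and all names here are made up):

#include <stdint.h>

static uint32_t align32(uint32_t v) { return (v + 31) & ~31u; }

/* grow the bin grid along the larger axis until one bin's
 * footprint fits in gmem_size bytes:
 */
static void fit_bins(uint32_t width, uint32_t height, uint32_t cpp,
		uint32_t gmem_size, uint32_t *bin_w, uint32_t *bin_h)
{
	uint32_t nbins_x = 1, nbins_y = 1;
	uint32_t bw = align32(width), bh = align32(height);

	while ((uint64_t)bw * bh * cpp > gmem_size) {
		if (bw > bh) {
			nbins_x++;
			bw = align32(width / nbins_x);
		} else {
			nbins_y++;
			bh = align32(height / nbins_y);
		}
	}
	*bin_w = bw;
	*bin_h = bh;
}
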
(ctx->fragtex.textures[i]->texture == prsc) + if (ctx->fragtex.textures[i] && (ctx->fragtex.textures[i]->texture == prsc)) ctx->dirty |= FD_DIRTY_FRAGTEX; } } @@ -97,7 +105,8 @@ realloc_bo(struct fd_resource *rsc, uint32_t size) rsc->bo = fd_bo_new(screen->dev, size, flags); rsc->timestamp = 0; - rsc->dirty = rsc->reading = false; + rsc->status = 0; + rsc->pending_ctx = NULL; list_delinit(&rsc->list); util_range_set_empty(&rsc->valid_buffer_range); } @@ -238,8 +247,9 @@ fd_resource_transfer_map(struct pipe_context *pctx, /* If the GPU is writing to the resource, or if it is reading from the * resource and we're trying to write to it, flush the renders. */ - if (rsc->dirty || (rsc->stencil && rsc->stencil->dirty) || - ((ptrans->usage & PIPE_TRANSFER_WRITE) && rsc->reading)) + if (((ptrans->usage & PIPE_TRANSFER_WRITE) && + pending(rsc, FD_PENDING_READ | FD_PENDING_WRITE)) || + pending(rsc, FD_PENDING_WRITE)) fd_context_render(pctx); /* The GPU keeps track of how the various bo's are being used, and @@ -646,6 +656,8 @@ fd_blitter_pipe_begin(struct fd_context *ctx) util_blitter_save_vertex_buffer_slot(ctx->blitter, ctx->vtx.vertexbuf.vb); util_blitter_save_vertex_elements(ctx->blitter, ctx->vtx.vtx); util_blitter_save_vertex_shader(ctx->blitter, ctx->prog.vp); + util_blitter_save_so_targets(ctx->blitter, ctx->streamout.num_targets, + ctx->streamout.targets); util_blitter_save_rasterizer(ctx->blitter, ctx->rasterizer); util_blitter_save_viewport(ctx->blitter, &ctx->viewport); util_blitter_save_scissor(ctx->blitter, &ctx->scissor); @@ -675,7 +687,7 @@ fd_flush_resource(struct pipe_context *pctx, struct pipe_resource *prsc) { struct fd_resource *rsc = fd_resource(prsc); - if (rsc->dirty || (rsc->stencil && rsc->stencil->dirty)) + if (pending(rsc, FD_PENDING_WRITE | FD_PENDING_READ)) fd_context_render(pctx); } diff --git a/src/gallium/drivers/freedreno/freedreno_resource.h b/src/gallium/drivers/freedreno/freedreno_resource.h index 0634923fcb2..7549becaa1f 100644 --- a/src/gallium/drivers/freedreno/freedreno_resource.h +++ b/src/gallium/drivers/freedreno/freedreno_resource.h @@ -60,6 +60,15 @@ struct fd_resource_slice { uint32_t size0; /* size of first layer in slice */ }; +/* status of queued up but not flushed reads and write operations. + * In _transfer_map() we need to know if queued up rendering needs + * to be flushed to preserve the order of cpu and gpu access. + */ +enum fd_resource_status { + FD_PENDING_WRITE = 0x01, + FD_PENDING_READ = 0x02, +}; + struct fd_resource { struct u_resource base; struct fd_bo *bo; @@ -68,17 +77,23 @@ struct fd_resource { uint32_t layer_size; struct fd_resource_slice slices[MAX_MIP_LEVELS]; uint32_t timestamp; - bool dirty, reading; /* buffer range that has been initialized */ struct util_range valid_buffer_range; /* reference to the resource holding stencil data for a z32_s8 texture */ + /* TODO rename to secondary or auxiliary? */ struct fd_resource *stencil; + /* pending read/write state: */ + enum fd_resource_status status; + /* resources accessed by queued but not flushed draws are tracked + * in the used_resources list. 
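
Note: the fd_resource_transfer_map() hunk above encodes the standard ordering rule for the new status bits: a CPU write conflicts with any queued GPU access, while a CPU read only conflicts with queued GPU writes. A minimal sketch of that decision, with illustrative names rather than the driver's:

#include <stdbool.h>

enum status_bits { PENDING_WRITE = 0x01, PENDING_READ = 0x02 };

/* returns true if queued rendering must be flushed before the
 * CPU touches the buffer:
 */
static bool needs_flush(unsigned status, bool cpu_write)
{
	if (cpu_write)
		return status & (PENDING_READ | PENDING_WRITE);
	return status & PENDING_WRITE;
}
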
+ */ struct list_head list; + struct fd_context *pending_ctx; }; -static INLINE struct fd_resource * +static inline struct fd_resource * fd_resource(struct pipe_resource *ptex) { return (struct fd_resource *)ptex; @@ -89,13 +104,13 @@ struct fd_transfer { void *staging; }; -static INLINE struct fd_transfer * +static inline struct fd_transfer * fd_transfer(struct pipe_transfer *ptrans) { return (struct fd_transfer *)ptrans; } -static INLINE struct fd_resource_slice * +static inline struct fd_resource_slice * fd_resource_slice(struct fd_resource *rsc, unsigned level) { assert(level <= rsc->base.b.last_level); @@ -103,7 +118,7 @@ fd_resource_slice(struct fd_resource *rsc, unsigned level) } /* get offset for specified mipmap level and texture/array layer */ -static INLINE uint32_t +static inline uint32_t fd_resource_offset(struct fd_resource *rsc, unsigned level, unsigned layer) { struct fd_resource_slice *slice = fd_resource_slice(rsc, level); diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index b3b5462b437..b55f5b36ca9 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -68,7 +68,8 @@ static const struct debug_named_value debug_options[] = { {"fraghalf", FD_DBG_FRAGHALF, "Use half-precision in fragment shader"}, {"nobin", FD_DBG_NOBIN, "Disable hw binning"}, {"optmsgs", FD_DBG_OPTMSGS,"Enable optimizer debug messages"}, - {"glsl120", FD_DBG_GLSL120,"Temporary flag to force GLSL 120 (rather than 130) on a3xx+"}, + {"glsl120", FD_DBG_GLSL120,"Temporary flag to force GLSL 1.20 (rather than 1.30) on a3xx+"}, + {"shaderdb", FD_DBG_SHADERDB, "Enable shaderdb output"}, DEBUG_NAMED_VALUE_END }; @@ -163,9 +164,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TEXTURE_BARRIER: case PIPE_CAP_TEXTURE_MIRROR_CLAMP: case PIPE_CAP_CUBE_MAP_ARRAY: - case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: - case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: - case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: case PIPE_CAP_START_INSTANCE: case PIPE_CAP_COMPUTE: @@ -175,10 +173,23 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_PRIMITIVE_RESTART: case PIPE_CAP_TGSI_INSTANCEID: case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: - return is_a3xx(screen) || is_a4xx(screen); - case PIPE_CAP_INDEP_BLEND_ENABLE: case PIPE_CAP_INDEP_BLEND_FUNC: + case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: + return is_a3xx(screen) || is_a4xx(screen); + + case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: + /* ignoring first/last_element.. but I guess that should be + * easy to add.. + */ + return 0; + case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: + /* I think 32k on a4xx.. and we could possibly emulate more + * by pretending 2d/rect textures and splitting high bits + * of index into 2nd dimension.. + */ + return 16383; + case PIPE_CAP_DEPTH_CLIP_DISABLE: return is_a3xx(screen); @@ -188,7 +199,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_GLSL_FEATURE_LEVEL: if (glsl120) return 120; - return (is_a3xx(screen) || is_a4xx(screen)) ? 130 : 120; + return is_ir3(screen) ? 130 : 120; /* Unsupported features. 
*/ case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: @@ -218,6 +229,10 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 0; case PIPE_CAP_MAX_VIEWPORTS: @@ -225,9 +240,17 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) /* Stream output. */ case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: + if (is_ir3(screen)) + return PIPE_MAX_SO_BUFFERS; + return 0; case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: + if (is_ir3(screen)) + return 1; + return 0; case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: + if (is_ir3(screen)) + return 16 * 4; /* should only be shader out limit? */ return 0; /* Geometry shader output, unsupported. */ @@ -258,9 +281,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_QUERY_TIMESTAMP: return 0; case PIPE_CAP_OCCLUSION_QUERY: - /* TODO still missing on a4xx, but we lie to get gl2.. - * it's not a feature, it's a bug! - */ return is_a3xx(screen) || is_a4xx(screen); case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: @@ -357,7 +377,7 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, */ return ((is_a3xx(screen) || is_a4xx(screen)) ? 4096 : 64) * sizeof(float[4]); case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: - return (is_a3xx(screen) || is_a4xx(screen)) ? 16 : 1; + return is_ir3(screen) ? 16 : 1; case PIPE_SHADER_CAP_MAX_PREDS: return 0; /* nothing uses this */ case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: @@ -379,7 +399,7 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_INTEGERS: if (glsl120) return 0; - return (is_a3xx(screen) || is_a4xx(screen)) ? 1 : 0; + return is_ir3(screen) ? 1 : 0; case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: return 16; @@ -546,7 +566,6 @@ fd_screen_create(struct fd_device *dev) pscreen->get_timestamp = fd_screen_get_timestamp; pscreen->fence_reference = fd_screen_fence_ref; - pscreen->fence_signalled = fd_screen_fence_signalled; pscreen->fence_finish = fd_screen_fence_finish; util_format_s3tc_init(); diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h index dbc2808262a..4e5c3a61958 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.h +++ b/src/gallium/drivers/freedreno/freedreno_screen.h @@ -56,7 +56,7 @@ struct fd_screen { int64_t cpu_gpu_time_delta; }; -static INLINE struct fd_screen * +static inline struct fd_screen * fd_screen(struct pipe_screen *pscreen) { return (struct fd_screen *)pscreen; @@ -73,6 +73,7 @@ struct fd_bo * fd_screen_bo_from_handle(struct pipe_screen *pscreen, struct pipe_screen * fd_screen_create(struct fd_device *dev); /* is a3xx patch revision 0? */ +/* TODO a306.0 probably doesn't need this.. be more clever?? */ static inline boolean is_a3xx_p0(struct fd_screen *screen) { @@ -91,4 +92,11 @@ is_a4xx(struct fd_screen *screen) return (screen->gpu_id >= 400) && (screen->gpu_id < 500); } +/* is it using the ir3 compiler (shader isa introduced with a3xx)? 
*/ +static inline boolean +is_ir3(struct fd_screen *screen) +{ + return is_a3xx(screen) || is_a4xx(screen); +} + #endif /* FREEDRENO_SCREEN_H_ */ diff --git a/src/gallium/drivers/freedreno/freedreno_state.c b/src/gallium/drivers/freedreno/freedreno_state.c index 77aa4f21d3b..7bf8bdb4507 100644 --- a/src/gallium/drivers/freedreno/freedreno_state.c +++ b/src/gallium/drivers/freedreno/freedreno_state.c @@ -300,6 +300,67 @@ fd_vertex_state_bind(struct pipe_context *pctx, void *hwcso) ctx->dirty |= FD_DIRTY_VTXSTATE; } +static struct pipe_stream_output_target * +fd_create_stream_output_target(struct pipe_context *pctx, + struct pipe_resource *prsc, unsigned buffer_offset, + unsigned buffer_size) +{ + struct pipe_stream_output_target *target; + + target = CALLOC_STRUCT(pipe_stream_output_target); + if (!target) + return NULL; + + pipe_reference_init(&target->reference, 1); + pipe_resource_reference(&target->buffer, prsc); + + target->context = pctx; + target->buffer_offset = buffer_offset; + target->buffer_size = buffer_size; + + return target; +} + +static void +fd_stream_output_target_destroy(struct pipe_context *pctx, + struct pipe_stream_output_target *target) +{ + pipe_resource_reference(&target->buffer, NULL); + FREE(target); +} + +static void +fd_set_stream_output_targets(struct pipe_context *pctx, + unsigned num_targets, struct pipe_stream_output_target **targets, + const unsigned *offsets) +{ + struct fd_context *ctx = fd_context(pctx); + struct fd_streamout_stateobj *so = &ctx->streamout; + unsigned i; + + debug_assert(num_targets <= ARRAY_SIZE(so->targets)); + + for (i = 0; i < num_targets; i++) { + boolean changed = targets[i] != so->targets[i]; + boolean append = (offsets[i] == (unsigned)-1); + + if (!changed && append) + continue; + + so->offsets[i] = 0; + + pipe_so_target_reference(&so->targets[i], targets[i]); + } + + for (; i < so->num_targets; i++) { + pipe_so_target_reference(&so->targets[i], NULL); + } + + so->num_targets = num_targets; + + ctx->dirty |= FD_DIRTY_STREAMOUT; +} + void fd_state_init(struct pipe_context *pctx) { @@ -328,4 +389,8 @@ fd_state_init(struct pipe_context *pctx) pctx->create_vertex_elements_state = fd_vertex_state_create; pctx->delete_vertex_elements_state = fd_vertex_state_delete; pctx->bind_vertex_elements_state = fd_vertex_state_bind; + + pctx->create_stream_output_target = fd_create_stream_output_target; + pctx->stream_output_target_destroy = fd_stream_output_target_destroy; + pctx->set_stream_output_targets = fd_set_stream_output_targets; } diff --git a/src/gallium/drivers/freedreno/freedreno_surface.c b/src/gallium/drivers/freedreno/freedreno_surface.c index 250fe4bc0f5..70c44eb79c3 100644 --- a/src/gallium/drivers/freedreno/freedreno_surface.c +++ b/src/gallium/drivers/freedreno/freedreno_surface.c @@ -41,7 +41,8 @@ fd_create_surface(struct pipe_context *pctx, // struct fd_resource* tex = fd_resource(ptex); struct fd_surface* surface = CALLOC_STRUCT(fd_surface); - assert(surf_tmpl->u.tex.first_layer == surf_tmpl->u.tex.last_layer); + debug_assert(ptex->target != PIPE_BUFFER); + debug_assert(surf_tmpl->u.tex.first_layer == surf_tmpl->u.tex.last_layer); if (surface) { struct pipe_surface *psurf = &surface->base; diff --git a/src/gallium/drivers/freedreno/freedreno_surface.h b/src/gallium/drivers/freedreno/freedreno_surface.h index 3293f33dd84..2de37cee2dd 100644 --- a/src/gallium/drivers/freedreno/freedreno_surface.h +++ b/src/gallium/drivers/freedreno/freedreno_surface.h @@ -40,7 +40,7 @@ struct fd_surface { uint16_t depth; }; -static INLINE 
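
Note: fd_set_stream_output_targets() above resets the software write offset whenever a target changes or an explicit offset is supplied, and keeps accumulating when asked to append (offset == -1); fd_draw_vbo() then advances every bound target per draw. A compact standalone sketch of that bookkeeping (types simplified, no reference counting; not driver code):

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

#define MAX_SO_BUFFERS 4

struct so_state {
	void *targets[MAX_SO_BUFFERS];
	unsigned num_targets;
	unsigned offsets[MAX_SO_BUFFERS];
};

static void so_set_targets(struct so_state *so, unsigned n,
		void * const *targets, const unsigned *offsets)
{
	assert(n <= MAX_SO_BUFFERS);

	for (unsigned i = 0; i < n; i++) {
		bool changed = targets[i] != so->targets[i];
		bool append = (offsets[i] == (unsigned)-1);

		/* keep the running offset only when appending to an
		 * unchanged target:
		 */
		if (!changed && append)
			continue;

		so->offsets[i] = 0;
		so->targets[i] = targets[i];
	}
	for (unsigned i = n; i < so->num_targets; i++)
		so->targets[i] = NULL;
	so->num_targets = n;
}

/* after each draw, advance every bound target: */
static void so_post_draw(struct so_state *so, unsigned prims)
{
	for (unsigned i = 0; i < so->num_targets; i++)
		so->offsets[i] += prims;
}
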
struct fd_surface * +static inline struct fd_surface * fd_surface(struct pipe_surface *psurf) { return (struct fd_surface *)psurf; diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h index deb0e602ce2..7129a1bddd1 100644 --- a/src/gallium/drivers/freedreno/freedreno_util.h +++ b/src/gallium/drivers/freedreno/freedreno_util.h @@ -40,6 +40,7 @@ #include "util/u_dynarray.h" #include "util/u_pack_color.h" +#include "disasm.h" #include "adreno_common.xml.h" #include "adreno_pm4.xml.h" @@ -53,6 +54,12 @@ enum adreno_stencil_op fd_stencil_op(unsigned op); /* TBD if it is same on a2xx, but for now: */ #define MAX_MIP_LEVELS A3XX_MAX_MIP_LEVELS +#define A2XX_MAX_RENDER_TARGETS 1 +#define A3XX_MAX_RENDER_TARGETS 4 +#define A4XX_MAX_RENDER_TARGETS 8 + +#define MAX_RENDER_TARGETS A4XX_MAX_RENDER_TARGETS + #define FD_DBG_MSGS 0x0001 #define FD_DBG_DISASM 0x0002 #define FD_DBG_DCLEAR 0x0004 @@ -64,6 +71,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op); #define FD_DBG_NOBIN 0x0100 #define FD_DBG_OPTMSGS 0x0200 #define FD_DBG_GLSL120 0x0400 +#define FD_DBG_SHADERDB 0x0800 extern int fd_mesa_debug; extern bool fd_binning_enabled; @@ -108,6 +116,58 @@ pipe_surface_format(struct pipe_surface *psurf) return psurf->format; } +static inline bool +fd_surface_half_precision(const struct pipe_surface *psurf) +{ + enum pipe_format format; + + if (!psurf) + return true; + + format = psurf->format; + + /* colors are provided in consts, which go through cov.f32f16, which will + * break these values + */ + if (util_format_is_pure_integer(format)) + return false; + + /* avoid losing precision on 32-bit float formats */ + if (util_format_is_float(format) && + util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0) == 32) + return false; + + return true; +} + +static inline unsigned +fd_sampler_first_level(const struct pipe_sampler_view *view) +{ + if (view->target == PIPE_BUFFER) + return 0; + return view->u.tex.first_level; +} + +static inline unsigned +fd_sampler_last_level(const struct pipe_sampler_view *view) +{ + if (view->target == PIPE_BUFFER) + return 0; + return view->u.tex.last_level; +} + +static inline bool +fd_half_precision(struct pipe_framebuffer_state *pfb) +{ + unsigned i; + + for (i = 0; i < pfb->nr_cbufs; i++) + if (!fd_surface_half_precision(pfb->cbufs[i])) + return false; + + return true; +} + #define LOG_DWORDS 0 static inline void emit_marker(struct fd_ringbuffer *ring, int scratch_idx); diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c index 48ae7c71b9f..83ed5ffdca0 100644 --- a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c +++ b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c @@ -103,7 +103,7 @@ static void print_reg(reg_t reg, bool full, bool r, bool c, bool im, } else if ((reg.num == REG_P0) && !c) { printf("p0.%c", component[reg.comp]); } else { - printf("%s%c%d.%c", full ? "" : "h", type, reg.num, component[reg.comp]); + printf("%s%c%d.%c", full ? 
"" : "h", type, reg.num & 0x3f, component[reg.comp]); } } @@ -122,6 +122,32 @@ static void print_reg_src(reg_t reg, bool full, bool r, bool c, bool im, print_reg(reg, full, r, c, im, neg, abs, addr_rel); } +/* TODO switch to using reginfo struct everywhere, since more readable + * than passing a bunch of bools to print_reg_src + */ + +struct reginfo { + reg_t reg; + bool full; + bool r; + bool c; + bool im; + bool neg; + bool abs; + bool addr_rel; +}; + +static void print_src(struct reginfo *info) +{ + print_reg_src(info->reg, info->full, info->r, info->c, info->im, + info->neg, info->abs, info->addr_rel); +} + +//static void print_dst(struct reginfo *info) +//{ +// print_reg_dst(info->reg, info->full, info->addr_rel); +//} + static void print_instr_cat0(instr_t *instr) { instr_cat0_t *cat0 = &instr->cat0; @@ -454,10 +480,70 @@ static void print_instr_cat6(instr_t *instr) { instr_cat6_t *cat6 = &instr->cat6; char sd = 0, ss = 0; /* dst/src address space */ - bool full = type_size(cat6->type) == 32; bool nodst = false; + struct reginfo dst, src1, src2; + int src1off = 0, dstoff = 0; - printf(".%s ", type[cat6->type]); + memset(&dst, 0, sizeof(dst)); + memset(&src1, 0, sizeof(src1)); + memset(&src2, 0, sizeof(src2)); + + switch (cat6->opc) { + case OPC_RESINFO: + case OPC_RESFMT: + dst.full = type_size(cat6->type) == 32; + src1.full = type_size(cat6->type) == 32; + src2.full = type_size(cat6->type) == 32; + break; + case OPC_L2G: + case OPC_G2L: + dst.full = true; + src1.full = true; + src2.full = true; + break; + case OPC_STG: + case OPC_STL: + case OPC_STP: + case OPC_STI: + case OPC_STLW: + case OPC_STGB_4D_4: + case OPC_STIB: + dst.full = true; + src1.full = type_size(cat6->type) == 32; + src2.full = type_size(cat6->type) == 32; + break; + default: + dst.full = type_size(cat6->type) == 32; + src1.full = true; + src2.full = true; + break; + } + + switch (cat6->opc) { + case OPC_PREFETCH: + case OPC_RESINFO: + break; + case OPC_ATOMIC_ADD: + case OPC_ATOMIC_SUB: + case OPC_ATOMIC_XCHG: + case OPC_ATOMIC_INC: + case OPC_ATOMIC_DEC: + case OPC_ATOMIC_CMPXCHG: + case OPC_ATOMIC_MIN: + case OPC_ATOMIC_MAX: + case OPC_ATOMIC_AND: + case OPC_ATOMIC_OR: + case OPC_ATOMIC_XOR: + ss = cat6->g ? 'g' : 'l'; + printf(".%c", ss); + printf(".%s", type[cat6->type]); + break; + default: + dst.im = cat6->g && !cat6->dst_off; + printf(".%s", type[cat6->type]); + break; + } + printf(" "); switch (cat6->opc) { case OPC_STG: @@ -499,68 +585,65 @@ static void print_instr_cat6(instr_t *instr) break; case OPC_STI: - full = false; // XXX or inverts?? + dst.full = false; // XXX or inverts?? 
break; } - if (cat6->has_off) { - if (!nodst) { - if (sd) - printf("%c[", sd); - print_reg_dst((reg_t)(cat6->a.dst), full, false); - if (sd) - printf("]"); - printf(", "); - } - if (ss) - printf("%c[", ss); - print_reg_src((reg_t)(cat6->a.src1), true, - false, false, cat6->a.src1_im, false, false, false); - if (cat6->a.off) - printf("%+d", cat6->a.off); - if (ss) - printf("]"); - printf(", "); - print_reg_src((reg_t)(cat6->a.src2), full, - false, false, cat6->a.src2_im, false, false, false); + if (cat6->dst_off) { + dst.reg = (reg_t)(cat6->c.dst); + dstoff = cat6->c.off; } else { - if (!nodst) { - if (sd) - printf("%c[", sd); - print_reg_dst((reg_t)(cat6->b.dst), full, false); - if (sd) - printf("]"); - printf(", "); - } - if (ss) - printf("%c[", ss); - print_reg_src((reg_t)(cat6->b.src1), true, - false, false, cat6->b.src1_im, false, false, false); - if (ss) + dst.reg = (reg_t)(cat6->d.dst); + } + + if (cat6->src_off) { + src1.reg = (reg_t)(cat6->a.src1); + src1.im = cat6->a.src1_im; + src2.reg = (reg_t)(cat6->a.src2); + src2.im = cat6->a.src2_im; + src1off = cat6->a.off; + } else { + src1.reg = (reg_t)(cat6->b.src1); + src1.im = cat6->b.src1_im; + src2.reg = (reg_t)(cat6->b.src2); + src2.im = cat6->b.src2_im; + } + + if (!nodst) { + if (sd) + printf("%c[", sd); + /* note: dst might actually be a src (ie. address to store to) */ + print_src(&dst); + if (dstoff) + printf("%+d", dstoff); + if (sd) printf("]"); printf(", "); - print_reg_src((reg_t)(cat6->b.src2), full, - false, false, cat6->b.src2_im, false, false, false); } - if (debug & PRINT_VERBOSE) { - switch (cat6->opc) { - case OPC_LDG: - case OPC_LDP: - /* load instructions: */ - if (cat6->a.dummy2|cat6->a.dummy3) - printf("\t{6: %x,%x}", cat6->a.dummy2, cat6->a.dummy3); - break; - case OPC_STG: - case OPC_STP: - case OPC_STI: - /* store instructions: */ - if (cat6->b.dummy2|cat6->b.dummy2) - printf("\t{6: %x,%x}", cat6->b.dummy2, cat6->b.dummy3); - if (cat6->b.ignore0) - printf("\t{?? 
%x}", cat6->b.ignore0); - break; - } + if (ss) + printf("%c[", ss); + + /* can have a larger than normal immed, so hack: */ + if (src1.im) { + printf("%u", src1.reg.dummy13); + } else { + print_src(&src1); + } + + if (src1off) + printf("%+d", src1off); + if (ss) + printf("]"); + + switch (cat6->opc) { + case OPC_RESINFO: + case OPC_RESFMT: + break; + default: + printf(", "); + print_src(&src2); + break; } } @@ -711,19 +794,19 @@ struct opc_info { OPC(6, OPC_LDLW, ldlw), OPC(6, OPC_STLW, stlw), OPC(6, OPC_RESFMT, resfmt), - OPC(6, OPC_RESINFO, resinf), - OPC(6, OPC_ATOMIC_ADD_L, atomic.add.l), - OPC(6, OPC_ATOMIC_SUB_L, atomic.sub.l), - OPC(6, OPC_ATOMIC_XCHG_L, atomic.xchg.l), - OPC(6, OPC_ATOMIC_INC_L, atomic.inc.l), - OPC(6, OPC_ATOMIC_DEC_L, atomic.dec.l), - OPC(6, OPC_ATOMIC_CMPXCHG_L, atomic.cmpxchg.l), - OPC(6, OPC_ATOMIC_MIN_L, atomic.min.l), - OPC(6, OPC_ATOMIC_MAX_L, atomic.max.l), - OPC(6, OPC_ATOMIC_AND_L, atomic.and.l), - OPC(6, OPC_ATOMIC_OR_L, atomic.or.l), - OPC(6, OPC_ATOMIC_XOR_L, atomic.xor.l), - OPC(6, OPC_LDGB_TYPED_4D, ldgb.typed.4d), + OPC(6, OPC_RESINFO, resinfo), + OPC(6, OPC_ATOMIC_ADD, atomic.add), + OPC(6, OPC_ATOMIC_SUB, atomic.sub), + OPC(6, OPC_ATOMIC_XCHG, atomic.xchg), + OPC(6, OPC_ATOMIC_INC, atomic.inc), + OPC(6, OPC_ATOMIC_DEC, atomic.dec), + OPC(6, OPC_ATOMIC_CMPXCHG, atomic.cmpxchg), + OPC(6, OPC_ATOMIC_MIN, atomic.min), + OPC(6, OPC_ATOMIC_MAX, atomic.max), + OPC(6, OPC_ATOMIC_AND, atomic.and), + OPC(6, OPC_ATOMIC_OR, atomic.or), + OPC(6, OPC_ATOMIC_XOR, atomic.xor), + OPC(6, OPC_LDGB_TYPED_4D, ldgb.typed.3d), OPC(6, OPC_STGB_4D_4, stgb.4d.4), OPC(6, OPC_STIB, stib), OPC(6, OPC_LDC_4, ldc.4), diff --git a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h index efb07ea479e..c3fb68d511c 100644 --- a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h +++ b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h @@ -173,17 +173,17 @@ typedef enum { OPC_STLW = 11, OPC_RESFMT = 14, OPC_RESINFO = 15, - OPC_ATOMIC_ADD_L = 16, - OPC_ATOMIC_SUB_L = 17, - OPC_ATOMIC_XCHG_L = 18, - OPC_ATOMIC_INC_L = 19, - OPC_ATOMIC_DEC_L = 20, - OPC_ATOMIC_CMPXCHG_L = 21, - OPC_ATOMIC_MIN_L = 22, - OPC_ATOMIC_MAX_L = 23, - OPC_ATOMIC_AND_L = 24, - OPC_ATOMIC_OR_L = 25, - OPC_ATOMIC_XOR_L = 26, + OPC_ATOMIC_ADD = 16, + OPC_ATOMIC_SUB = 17, + OPC_ATOMIC_XCHG = 18, + OPC_ATOMIC_INC = 19, + OPC_ATOMIC_DEC = 20, + OPC_ATOMIC_CMPXCHG = 21, + OPC_ATOMIC_MIN = 22, + OPC_ATOMIC_MAX = 23, + OPC_ATOMIC_AND = 24, + OPC_ATOMIC_OR = 25, + OPC_ATOMIC_XOR = 26, OPC_LDGB_TYPED_4D = 27, OPC_STGB_4D_4 = 28, OPC_STIB = 29, @@ -575,7 +575,7 @@ typedef struct PACKED { uint32_t opc_cat : 3; } instr_cat5_t; -/* [src1 + off], src2: */ +/* dword0 encoding for src_off: [src1 + off], src2: */ typedef struct PACKED { /* dword0: */ uint32_t mustbe1 : 1; @@ -586,37 +586,50 @@ typedef struct PACKED { uint32_t src2 : 8; /* dword1: */ - uint32_t dst : 8; - uint32_t dummy2 : 9; - uint32_t type : 3; - uint32_t dummy3 : 2; - uint32_t opc : 5; - uint32_t jmp_tgt : 1; - uint32_t sync : 1; - uint32_t opc_cat : 3; + uint32_t dword1; } instr_cat6a_t; -/* [src1], src2: */ +/* dword0 encoding for !src_off: [src1], src2 */ typedef struct PACKED { /* dword0: */ uint32_t mustbe0 : 1; - uint32_t src1 : 8; - uint32_t ignore0 : 13; + uint32_t src1 : 13; + uint32_t ignore0 : 8; uint32_t src1_im : 1; uint32_t src2_im : 1; uint32_t src2 : 8; /* dword1: */ - uint32_t dst : 8; - uint32_t dummy2 : 9; - uint32_t type : 3; - uint32_t dummy3 : 2; - uint32_t opc : 5; - uint32_t jmp_tgt : 1; - uint32_t 
sync : 1; - uint32_t opc_cat : 3; + uint32_t dword1; } instr_cat6b_t; +/* dword1 encoding for dst_off: */ +typedef struct PACKED { + /* dword0: */ + uint32_t dword0; + + /* note: there is some weird stuff going on where sometimes + * cat6->a.off is involved.. but that seems like a bug in + * the blob, since it is used even if !cat6->src_off + * It would make sense for there to be some more bits to + * bring us to 11 bits worth of offset, but not sure.. + */ + int32_t off : 8; + uint32_t mustbe1 : 1; + uint32_t dst : 8; + uint32_t pad1 : 15; +} instr_cat6c_t; + +/* dword1 encoding for !dst_off: */ +typedef struct PACKED { + /* dword0: */ + uint32_t dword0; + + uint32_t dst : 8; + uint32_t mustbe0 : 1; + uint32_t pad0 : 23; +} instr_cat6d_t; + /* I think some of the other cat6 instructions use additional * sub-encodings.. */ @@ -624,16 +637,20 @@ typedef struct PACKED { typedef union PACKED { instr_cat6a_t a; instr_cat6b_t b; + instr_cat6c_t c; + instr_cat6d_t d; struct PACKED { /* dword0: */ - uint32_t has_off : 1; + uint32_t src_off : 1; uint32_t pad1 : 31; /* dword1: */ - uint32_t dst : 8; - uint32_t dummy2 : 9; + uint32_t pad2 : 8; + uint32_t dst_off : 1; + uint32_t pad3 : 8; uint32_t type : 3; - uint32_t dummy3 : 2; + uint32_t g : 1; /* or in some cases it means dst immed */ + uint32_t pad4 : 1; uint32_t opc : 5; uint32_t jmp_tgt : 1; uint32_t sync : 1; diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c index a166b67d7cf..b24825cff85 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.c +++ b/src/gallium/drivers/freedreno/ir3/ir3.c @@ -499,32 +499,51 @@ static int emit_cat5(struct ir3_instruction *instr, void *ptr, static int emit_cat6(struct ir3_instruction *instr, void *ptr, struct ir3_info *info) { - struct ir3_register *dst = instr->regs[0]; - struct ir3_register *src1 = instr->regs[1]; - struct ir3_register *src2 = (instr->regs_count >= 3) ? instr->regs[2] : NULL; + struct ir3_register *dst, *src1, *src2; instr_cat6_t *cat6 = ptr; - iassert(instr->regs_count >= 2); + /* the "dst" for a store instruction is (from the perspective + * of data flow in the shader, ie. register use/def, etc) in + * fact a register that is read by the instruction, rather + * than written: + */ + if (is_store(instr)) { + iassert(instr->regs_count >= 3); - if (instr->cat6.offset || instr->opc == OPC_LDG) { + dst = instr->regs[1]; + src1 = instr->regs[2]; + src2 = (instr->regs_count >= 4) ? instr->regs[3] : NULL; + } else { + iassert(instr->regs_count >= 2); + + dst = instr->regs[0]; + src1 = instr->regs[1]; + src2 = (instr->regs_count >= 3) ? instr->regs[2] : NULL; + } + + + /* TODO we need a more comprehensive list about which instructions + * can be encoded which way. Or possibly use IR3_INSTR_0 flag to + * indicate to use the src_off encoding even if offset is zero + * (but then what to do about dst_off?) 
+ */ + if (instr->cat6.src_offset || (instr->opc == OPC_LDG)) { instr_cat6a_t *cat6a = ptr; - cat6->has_off = true; + cat6->src_off = true; - cat6a->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); cat6a->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED); cat6a->src1_im = !!(src1->flags & IR3_REG_IMMED); if (src2) { cat6a->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED); cat6a->src2_im = !!(src2->flags & IR3_REG_IMMED); } - cat6a->off = instr->cat6.offset; + cat6a->off = instr->cat6.src_offset; } else { instr_cat6b_t *cat6b = ptr; - cat6->has_off = false; + cat6->src_off = false; - cat6b->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); cat6b->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED); cat6b->src1_im = !!(src1->flags & IR3_REG_IMMED); if (src2) { @@ -533,10 +552,22 @@ static int emit_cat6(struct ir3_instruction *instr, void *ptr, } } + if (instr->cat6.dst_offset || (instr->opc == OPC_STG)) { + instr_cat6c_t *cat6c = ptr; + cat6->dst_off = true; + cat6c->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + cat6c->off = instr->cat6.dst_offset; + } else { + instr_cat6d_t *cat6d = ptr; + cat6->dst_off = false; + cat6d->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + } + cat6->type = instr->cat6.type; cat6->opc = instr->opc; cat6->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); cat6->sync = !!(instr->flags & IR3_INSTR_SY); + cat6->g = !!(instr->flags & IR3_INSTR_G); cat6->opc_cat = 6; return 0; @@ -669,7 +700,6 @@ struct ir3_instruction * ir3_instr_create(struct ir3_block *block, return ir3_instr_create2(block, category, opc, 4); } -/* only used by old compiler: */ struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr) { struct ir3_instruction *new_instr = instr_create(instr->block, @@ -708,6 +738,17 @@ struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, } void +ir3_instr_set_address(struct ir3_instruction *instr, + struct ir3_instruction *addr) +{ + if (instr->address != addr) { + struct ir3 *ir = instr->block->shader; + instr->address = addr; + array_insert(ir->indirects, instr); + } +} + +void ir3_block_clear_mark(struct ir3_block *block) { list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) @@ -723,15 +764,16 @@ ir3_clear_mark(struct ir3 *ir) } /* note: this will destroy instr->depth, don't do it until after sched! */ -void +unsigned ir3_count_instructions(struct ir3 *ir) { - unsigned ip = 0; + unsigned cnt = 0; list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - instr->ip = ip++; + instr->ip = cnt++; } block->start_ip = list_first_entry(&block->instr_list, struct ir3_instruction, node)->ip; block->end_ip = list_last_entry(&block->instr_list, struct ir3_instruction, node)->ip; } + return cnt; } diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h index 9c35a763d58..12f2ebe18db 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.h +++ b/src/gallium/drivers/freedreno/ir3/ir3.h @@ -172,6 +172,7 @@ struct ir3_instruction { IR3_INSTR_P = 0x080, IR3_INSTR_S = 0x100, IR3_INSTR_S2EN = 0x200, + IR3_INSTR_G = 0x400, /* meta-flags, for intermediate stages of IR, ie. 
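
Note: with the reworked encodings above, the two flag bits shared by all cat6 variants select which sub-struct is live: src_off (dword0 bit 0) picks instr_cat6a over instr_cat6b, and dst_off (dword1 bit 8) picks instr_cat6c over instr_cat6d. A small decode sketch under that assumption (LSB-first bitfield packing, as these headers assume; this is not the actual disassembler):

#include <stdint.h>
#include <stdio.h>

static void cat6_which_encoding(uint32_t dword0, uint32_t dword1)
{
	int src_off = dword0 & 1;        /* instr_cat6a vs instr_cat6b */
	int dst_off = (dword1 >> 8) & 1; /* instr_cat6c vs instr_cat6d */

	printf("src: %s, dst: %s\n",
			src_off ? "cat6a [src1 + off], src2" : "cat6b [src1], src2",
			dst_off ? "cat6c [dst + off]" : "cat6d [dst]");
}
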
* before register assignment is done: */ @@ -209,7 +210,8 @@ struct ir3_instruction { } cat5; struct { type_t type; - int offset; + int src_offset; + int dst_offset; int iim_val; } cat6; /* for meta-instructions, just used to hold extra data @@ -285,6 +287,8 @@ struct ir3_instruction { /* an instruction can reference at most one address register amongst * it's src/dst registers. Beyond that, you need to insert mov's. + * + * NOTE: do not write this directly, use ir3_instr_set_address() */ struct ir3_instruction *address; @@ -365,6 +369,12 @@ struct ir3 { unsigned predicates_count, predicates_sz; struct ir3_instruction **predicates; + /* Track instructions which do not write a register but other- + * wise must not be discarded (such as kill, stg, etc) + */ + unsigned keeps_count, keeps_sz; + struct ir3_instruction **keeps; + /* List of blocks: */ struct list_head block_list; @@ -420,6 +430,9 @@ const char *ir3_instr_name(struct ir3_instruction *instr); struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, int num, int flags); +void ir3_instr_set_address(struct ir3_instruction *instr, + struct ir3_instruction *addr); + static inline bool ir3_instr_check_mark(struct ir3_instruction *instr) { if (instr->flags & IR3_INSTR_MARK) @@ -431,7 +444,7 @@ static inline bool ir3_instr_check_mark(struct ir3_instruction *instr) void ir3_block_clear_mark(struct ir3_block *block); void ir3_clear_mark(struct ir3 *shader); -void ir3_count_instructions(struct ir3 *ir); +unsigned ir3_count_instructions(struct ir3 *ir); static inline int ir3_instr_regno(struct ir3_instruction *instr, struct ir3_register *reg) @@ -547,6 +560,26 @@ is_store(struct ir3_instruction *instr) return false; } +static inline bool is_load(struct ir3_instruction *instr) +{ + if (is_mem(instr)) { + switch (instr->opc) { + case OPC_LDG: + case OPC_LDL: + case OPC_LDP: + case OPC_L2G: + case OPC_LDLW: + case OPC_LDC_4: + case OPC_LDLV: + /* probably some others too.. */ + return true; + default: + break; + } + } + return false; +} + static inline bool is_input(struct ir3_instruction *instr) { /* in some cases, ldlv is used to fetch varying without @@ -1036,6 +1069,7 @@ ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, /* cat6 instructions: */ INSTR2(6, LDLV) INSTR2(6, LDG) +INSTR3(6, STG) /* ************************************************************************* */ /* split this out or find some helper to use.. like main/bitset.h.. */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c index ad9d2719d59..ede29f445dc 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c @@ -43,127 +43,15 @@ #include "instr-a3xx.h" #include "ir3.h" -static void dump_reg(const char *name, uint32_t r) -{ - if (r != regid(63,0)) - debug_printf("; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]); -} - -static void dump_semantic(struct ir3_shader_variant *so, - unsigned sem, const char *name) -{ - uint32_t regid; - regid = ir3_find_output_regid(so, ir3_semantic_name(sem, 0)); - dump_reg(name, regid); -} - static void dump_info(struct ir3_shader_variant *so, const char *str) { uint32_t *bin; - const char *type = (so->type == SHADER_VERTEX) ? 
"VERT" : "FRAG"; - - // for debug, dump some before/after info: + const char *type = ir3_shader_stage(so->shader); // TODO make gpu_id configurable on cmdline bin = ir3_shader_assemble(so, 320); - if (fd_mesa_debug & FD_DBG_DISASM) { - struct ir3 *ir = so->ir; - struct ir3_register *reg; - uint8_t regid; - unsigned i; - - debug_printf("; %s: %s\n", type, str); - - for (i = 0; i < ir->ninputs; i++) { - if (!ir->inputs[i]) { - debug_printf("; in%d unused\n", i); - continue; - } - reg = ir->inputs[i]->regs[0]; - regid = reg->num; - debug_printf("@in(%sr%d.%c)\tin%d\n", - (reg->flags & IR3_REG_HALF) ? "h" : "", - (regid >> 2), "xyzw"[regid & 0x3], i); - } - - for (i = 0; i < ir->noutputs; i++) { - if (!ir->outputs[i]) { - debug_printf("; out%d unused\n", i); - continue; - } - /* kill shows up as a virtual output.. skip it! */ - if (is_kill(ir->outputs[i])) - continue; - reg = ir->outputs[i]->regs[0]; - regid = reg->num; - debug_printf("@out(%sr%d.%c)\tout%d\n", - (reg->flags & IR3_REG_HALF) ? "h" : "", - (regid >> 2), "xyzw"[regid & 0x3], i); - } - - for (i = 0; i < so->immediates_count; i++) { - debug_printf("@const(c%d.x)\t", so->first_immediate + i); - debug_printf("0x%08x, 0x%08x, 0x%08x, 0x%08x\n", - so->immediates[i].val[0], - so->immediates[i].val[1], - so->immediates[i].val[2], - so->immediates[i].val[3]); - } - - disasm_a3xx(bin, so->info.sizedwords, 0, so->type); - - debug_printf("; %s: outputs:", type); - for (i = 0; i < so->outputs_count; i++) { - uint8_t regid = so->outputs[i].regid; - ir3_semantic sem = so->outputs[i].semantic; - debug_printf(" r%d.%c (%u:%u)", - (regid >> 2), "xyzw"[regid & 0x3], - sem2name(sem), sem2idx(sem)); - } - debug_printf("\n"); - debug_printf("; %s: inputs:", type); - for (i = 0; i < so->inputs_count; i++) { - uint8_t regid = so->inputs[i].regid; - ir3_semantic sem = so->inputs[i].semantic; - debug_printf(" r%d.%c (%u:%u,cm=%x,il=%u,b=%u)", - (regid >> 2), "xyzw"[regid & 0x3], - sem2name(sem), sem2idx(sem), - so->inputs[i].compmask, - so->inputs[i].inloc, - so->inputs[i].bary); - } - debug_printf("\n"); - } - - /* print generic shader info: */ - debug_printf("; %s: %u instructions, %d half, %d full\n", type, - so->info.instrs_count, - so->info.max_half_reg + 1, - so->info.max_reg + 1); - - /* print shader type specific info: */ - switch (so->type) { - case SHADER_VERTEX: - dump_semantic(so, TGSI_SEMANTIC_POSITION, "pos"); - dump_semantic(so, TGSI_SEMANTIC_PSIZE, "psize"); - break; - case SHADER_FRAGMENT: - dump_reg("pos (bary)", so->pos_regid); - dump_semantic(so, TGSI_SEMANTIC_POSITION, "posz"); - dump_semantic(so, TGSI_SEMANTIC_COLOR, "color"); - /* these two are hard-coded since we don't know how to - * program them to anything but all 0's... 
- */ - if (so->frag_coord) - debug_printf("; fragcoord: r0.x\n"); - if (so->frag_face) - debug_printf("; fragface: hr0.x\n"); - break; - case SHADER_COMPUTE: - break; - } + debug_printf("; %s: %s\n", type, str); + ir3_shader_disasm(so, bin); free(bin); - - debug_printf("\n"); } @@ -205,8 +93,7 @@ static void print_usage(void) printf(" --saturate-s MASK - bitmask of samplers to saturate S coord\n"); printf(" --saturate-t MASK - bitmask of samplers to saturate T coord\n"); printf(" --saturate-r MASK - bitmask of samplers to saturate R coord\n"); - printf(" --nocp - disable copy propagation\n"); - printf(" --nir - use NIR compiler\n"); + printf(" --stream-out - enable stream-out (aka transform feedback)\n"); printf(" --help - show this message\n"); } @@ -218,6 +105,7 @@ int main(int argc, char **argv) struct tgsi_parse_context parse; struct ir3_compiler *compiler; struct ir3_shader_variant v; + struct ir3_shader s; struct ir3_shader_key key = {}; const char *info; void *ptr; @@ -225,6 +113,9 @@ int main(int argc, char **argv) fd_mesa_debug |= FD_DBG_DISASM; + memset(&s, 0, sizeof(s)); + memset(&v, 0, sizeof(v)); + /* cmdline args which impact shader variant get spit out in a * comment on the first line.. a quick/dirty way to preserve * that info so when ir3test recompiles the shader with a new @@ -281,6 +172,24 @@ int main(int argc, char **argv) continue; } + if (!strcmp(argv[n], "--stream-out")) { + struct pipe_stream_output_info *so = &s.stream_output; + debug_printf(" %s", argv[n]); + /* TODO more dynamic config based on number of outputs, etc + * rather than just hard-code for first output: + */ + so->num_outputs = 1; + so->stride[0] = 4; + so->output[0].register_index = 0; + so->output[0].start_component = 0; + so->output[0].num_components = 4; + so->output[0].output_buffer = 0; + so->output[0].dst_offset = 2; + so->output[0].stream = 0; + n++; + continue; + } + if (!strcmp(argv[n], "--help")) { print_usage(); return 0; @@ -292,9 +201,6 @@ int main(int argc, char **argv) filename = argv[n]; - memset(&v, 0, sizeof(v)); - v.key = key; - ret = read_file(filename, &ptr, &size); if (ret) { print_usage(); @@ -307,16 +213,21 @@ int main(int argc, char **argv) if (!tgsi_text_translate(ptr, toks, Elements(toks))) errx(1, "could not parse `%s'", filename); + s.tokens = toks; + + v.key = key; + v.shader = &s; + tgsi_parse_init(&parse, toks); switch (parse.FullHeader.Processor.Processor) { case TGSI_PROCESSOR_FRAGMENT: - v.type = SHADER_FRAGMENT; + s.type = v.type = SHADER_FRAGMENT; break; case TGSI_PROCESSOR_VERTEX: - v.type = SHADER_VERTEX; + s.type = v.type = SHADER_VERTEX; break; case TGSI_PROCESSOR_COMPUTE: - v.type = SHADER_COMPUTE; + s.type = v.type = SHADER_COMPUTE; break; } @@ -324,7 +235,7 @@ int main(int argc, char **argv) compiler = ir3_compiler_create(320); info = "NIR compiler"; - ret = ir3_compile_shader_nir(compiler, &v, toks, key); + ret = ir3_compile_shader_nir(compiler, &v); if (ret) { fprintf(stderr, "compiler failed!\n"); return ret; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h index 86b1161d9cb..697afeba61a 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h @@ -36,14 +36,13 @@ struct ir3_ra_reg_set; struct ir3_compiler { uint32_t gpu_id; struct ir3_ra_reg_set *set; + uint32_t shader_count; }; struct ir3_compiler * ir3_compiler_create(uint32_t gpu_id); void ir3_compiler_destroy(struct ir3_compiler *compiler); int ir3_compile_shader_nir(struct 
ir3_compiler *compiler, - struct ir3_shader_variant *so, - const struct tgsi_token *tokens, - struct ir3_shader_key key); + struct ir3_shader_variant *so); #endif /* IR3_COMPILER_H_ */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index 48b1d8f3606..0ab33455ed1 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -117,10 +117,6 @@ struct ir3_compile { /* for looking up which system value is which */ unsigned sysval_semantics[8]; - /* list of kill instructions: */ - struct ir3_instruction *kill[16]; - unsigned int kill_count; - /* set if we encounter something we can't handle yet, so we * can bail cleanly and fallback to TGSI compiler f/e */ @@ -153,6 +149,7 @@ static struct nir_shader *to_nir(const struct tgsi_token *tokens) nir_opt_global_to_local(s); nir_convert_to_ssa(s); nir_lower_idiv(s); + nir_lower_load_const_to_scalar(s); do { progress = false; @@ -261,13 +258,29 @@ compile_init(struct ir3_compiler *compiler, so->first_driver_param = so->first_immediate = ctx->s->num_uniforms; - /* one (vec4) slot for vertex id base: */ - if (so->type == SHADER_VERTEX) - so->first_immediate++; + /* Layout of constant registers: + * + * num_uniform * vec4 - user consts + * 4 * vec4 - UBO addresses + * if (vertex shader) { + * 1 * vec4 - driver params (IR3_DP_*) + * 1 * vec4 - stream-out addresses + * } + * + * TODO this could be made more dynamic, to at least skip sections + * that we don't need.. + */ /* reserve 4 (vec4) slots for ubo base addresses: */ so->first_immediate += 4; + if (so->type == SHADER_VERTEX) { + /* one (vec4) slot for driver params (see ir3_driver_param): */ + so->first_immediate++; + /* one (vec4) slot for stream-output base addresses: */ + so->first_immediate++; + } + return ctx; } @@ -637,9 +650,8 @@ create_uniform_indirect(struct ir3_compile *ctx, unsigned n, mov->cat1.dst_type = TYPE_U32; ir3_reg_create(mov, 0, 0); ir3_reg_create(mov, n, IR3_REG_CONST | IR3_REG_RELATIV); - mov->address = address; - array_insert(ctx->ir->indirects, mov); + ir3_instr_set_address(mov, address); return mov; } @@ -677,9 +689,8 @@ create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n, src->instr = collect; src->size = arrsz; src->offset = n; - mov->address = address; - array_insert(ctx->ir->indirects, mov); + ir3_instr_set_address(mov, address); return mov; } @@ -700,25 +711,21 @@ create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n, dst->size = arrsz; dst->offset = n; ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src; - mov->address = address; mov->fanin = collect; - array_insert(ctx->ir->indirects, mov); + ir3_instr_set_address(mov, address); return mov; } static struct ir3_instruction * -create_input(struct ir3_block *block, struct ir3_instruction *instr, - unsigned n) +create_input(struct ir3_block *block, unsigned n) { struct ir3_instruction *in; in = ir3_instr_create(block, -1, OPC_META_INPUT); in->inout.block = block; ir3_reg_create(in, n, 0); - if (instr) - ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr; return in; } @@ -750,7 +757,7 @@ create_frag_coord(struct ir3_compile *ctx, unsigned comp) compile_assert(ctx, !ctx->frag_coord[comp]); - ctx->frag_coord[comp] = create_input(ctx->block, NULL, 0); + ctx->frag_coord[comp] = create_input(ctx->block, 0); switch (comp) { case 0: /* .x */ @@ -789,7 +796,7 @@ create_frag_face(struct ir3_compile *ctx, unsigned comp) case 0: /* .x */ compile_assert(ctx, 
!ctx->frag_face); - ctx->frag_face = create_input(block, NULL, 0); + ctx->frag_face = create_input(block, 0); ctx->frag_face->regs[0]->flags |= IR3_REG_HALF; /* for faceness, we always get -1 or 0 (int).. but TGSI expects @@ -817,6 +824,14 @@ create_frag_face(struct ir3_compile *ctx, unsigned comp) } } +static struct ir3_instruction * +create_driver_param(struct ir3_compile *ctx, enum ir3_driver_param dp) +{ + /* first four vec4 sysval's reserved for UBOs: */ + unsigned r = regid(ctx->so->first_driver_param + 4, dp); + return create_uniform(ctx, r); +} + /* helper for instructions that produce multiple consecutive scalar * outputs which need to have a split/fanout meta instruction inserted */ @@ -1218,7 +1233,7 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr, struct ir3_instruction *load = ir3_LDG(b, addr, 0, create_immed(b, 1), 0); load->cat6.type = TYPE_U32; - load->cat6.offset = off + i * 4; /* byte offset */ + load->cat6.src_offset = off + i * 4; /* byte offset */ dst[i] = load; } } @@ -1307,7 +1322,7 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) * store_output_indirect? or move this into * create_indirect_store()? */ - for (int j = i; j < arr->length; j += 4) { + for (int j = i; j < arr->length; j += intr->num_components) { struct ir3_instruction *split; split = ir3_instr_create(ctx->block, -1, OPC_META_FO); @@ -1318,6 +1333,13 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) arr->arr[j] = split; } } + /* fixup fanout/split neighbors: */ + for (int i = 0; i < arr->length; i++) { + arr->arr[i]->cp.right = (i < (arr->length - 1)) ? + arr->arr[i+1] : NULL; + arr->arr[i]->cp.left = (i > 0) ? + arr->arr[i-1] : NULL; + } break; } default: @@ -1372,6 +1394,11 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) dst[i] = create_uniform_indirect(ctx, n, get_addr(ctx, src[0])); } + /* NOTE: if relative addressing is used, we set constlen in + * the compiler (to worst-case value) since we don't know in + * the assembler what the max addr reg value can be: + */ + ctx->so->constlen = ctx->s->num_uniforms; break; case nir_intrinsic_load_ubo: case nir_intrinsic_load_ubo_indirect: @@ -1409,9 +1436,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) break; case nir_intrinsic_load_base_vertex: if (!ctx->basevertex) { - /* first four vec4 sysval's reserved for UBOs: */ - unsigned r = regid(ctx->so->first_driver_param + 4, 0); - ctx->basevertex = create_uniform(ctx, r); + ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE); add_sysval_input(ctx, TGSI_SEMANTIC_BASEVERTEX, ctx->basevertex); } @@ -1419,7 +1444,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) break; case nir_intrinsic_load_vertex_id_zero_base: if (!ctx->vertex_id) { - ctx->vertex_id = create_input(ctx->block, NULL, 0); + ctx->vertex_id = create_input(ctx->block, 0); add_sysval_input(ctx, TGSI_SEMANTIC_VERTEXID_NOBASE, ctx->vertex_id); } @@ -1427,7 +1452,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) break; case nir_intrinsic_load_instance_id: if (!ctx->instance_id) { - ctx->instance_id = create_input(ctx->block, NULL, 0); + ctx->instance_id = create_input(ctx->block, 0); add_sysval_input(ctx, TGSI_SEMANTIC_INSTANCEID, ctx->instance_id); } @@ -1456,7 +1481,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) kill = ir3_KILL(b, cond, 0); array_insert(ctx->ir->predicates, kill); - ctx->kill[ctx->kill_count++] = kill; + 
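
Note: the constant-register layout spelled out in compile_init() above makes the addressing in create_driver_param() mechanical. A sketch of the same arithmetic, assuming the layout comment holds (the helper name is hypothetical):

/* vec4 slots, in order: user consts, 4 UBO address slots, then for
 * vertex shaders one driver-param slot and one stream-out slot:
 */
static unsigned first_immediate(unsigned num_uniforms, int is_vertex)
{
	unsigned n = num_uniforms;
	n += 4;              /* UBO base addresses */
	if (is_vertex) {
		n += 1;      /* driver params (IR3_DP_*) */
		n += 1;      /* stream-out base addresses */
	}
	return n;
}
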
array_insert(ctx->ir->keeps, kill); ctx->so->has_kill = true; break; @@ -1950,6 +1975,115 @@ emit_cf_list(struct ir3_compile *ctx, struct exec_list *list) } } +/* emit stream-out code. At this point, the current block is the original + * (nir) end block, and nir ensures that all flow control paths terminate + * into the end block. We re-purpose the original end block to generate + * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional + * block holding stream-out write instructions, followed by the new end + * block: + * + * blockOrigEnd { + * p0.x = (vtxcnt < maxvtxcnt) + * // succs: blockStreamOut, blockNewEnd + * } + * blockStreamOut { + * ... stream-out instructions ... + * // succs: blockNewEnd + * } + * blockNewEnd { + * } + */ +static void +emit_stream_out(struct ir3_compile *ctx) +{ + struct ir3_shader_variant *v = ctx->so; + struct ir3 *ir = ctx->ir; + struct pipe_stream_output_info *strmout = + &ctx->so->shader->stream_output; + struct ir3_block *orig_end_block, *stream_out_block, *new_end_block; + struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond; + struct ir3_instruction *bases[PIPE_MAX_SO_BUFFERS]; + + /* create vtxcnt input in input block at top of shader, + * so that it is seen as live over the entire duration + * of the shader: + */ + vtxcnt = create_input(ctx->in_block, 0); + add_sysval_input(ctx, IR3_SEMANTIC_VTXCNT, vtxcnt); + + maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX); + + /* at this point, we are at the original 'end' block, + * re-purpose this block to stream-out condition, then + * append stream-out block and new-end block + */ + orig_end_block = ctx->block; + + stream_out_block = ir3_block_create(ir); + list_addtail(&stream_out_block->node, &ir->block_list); + + new_end_block = ir3_block_create(ir); + list_addtail(&new_end_block->node, &ir->block_list); + + orig_end_block->successors[0] = stream_out_block; + orig_end_block->successors[1] = new_end_block; + stream_out_block->successors[0] = new_end_block; + + /* setup 'if (vtxcnt < maxvtxcnt)' condition: */ + cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0); + cond->regs[0]->num = regid(REG_P0, 0); + cond->cat2.condition = IR3_COND_LT; + + /* condition goes on previous block to the conditional, + * since it is used to pick which of the two successor + * paths to take: + */ + orig_end_block->condition = cond; + + /* switch to stream_out_block to generate the stream-out + * instructions: + */ + ctx->block = stream_out_block; + + /* Calculate base addresses based on vtxcnt. Instructions + * generated for bases not used in following loop will be + * stripped out in the backend. 
+ */ + for (unsigned i = 0; i < PIPE_MAX_SO_BUFFERS; i++) { + unsigned stride = strmout->stride[i]; + struct ir3_instruction *base, *off; + + base = create_uniform(ctx, regid(v->first_driver_param + 5, i)); + + /* 24-bit should be enough: */ + off = ir3_MUL_U(ctx->block, vtxcnt, 0, + create_immed(ctx->block, stride * 4), 0); + + bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0); + } + + /* Generate the per-output store instructions: */ + for (unsigned i = 0; i < strmout->num_outputs; i++) { + for (unsigned j = 0; j < strmout->output[i].num_components; j++) { + unsigned c = j + strmout->output[i].start_component; + struct ir3_instruction *base, *out, *stg; + + base = bases[strmout->output[i].output_buffer]; + out = ctx->ir->outputs[regid(strmout->output[i].register_index, c)]; + + stg = ir3_STG(ctx->block, base, 0, out, 0, + create_immed(ctx->block, 1), 0); + stg->cat6.type = TYPE_U32; + stg->cat6.dst_offset = (strmout->output[i].dst_offset + j) * 4; + + array_insert(ctx->ir->keeps, stg); + } + } + + /* and finally switch to the new_end_block: */ + ctx->block = new_end_block; +} + static void emit_function(struct ir3_compile *ctx, nir_function_impl *impl) { @@ -1960,6 +2094,24 @@ emit_function(struct ir3_compile *ctx, nir_function_impl *impl) * into which we emit the 'end' instruction. */ compile_assert(ctx, list_empty(&ctx->block->instr_list)); + + /* If stream-out (aka transform-feedback) enabled, emit the + * stream-out instructions, followed by a new empty block (into + * which the 'end' instruction lands). + * + * NOTE: it is done in this order, rather than inserting before + * we emit end_block, because NIR guarantees that all blocks + * flow into end_block, and that end_block has no successors. + * So by re-purposing end_block as the first block of stream- + * out, we guarantee that all exit paths flow into the stream- + * out instructions. + */ + if ((ctx->so->shader->stream_output.num_outputs > 0) && + !ctx->so->key.binning_pass) { + debug_assert(ctx->so->type == SHADER_VERTEX); + emit_stream_out(ctx); + } + ir3_END(ctx->block); } @@ -1974,7 +2126,7 @@ setup_input(struct ir3_compile *ctx, nir_variable *in) unsigned semantic_index = in->data.index; unsigned n = in->data.driver_location; - DBG("; in: %u:%u, len=%ux%u, loc=%u\n", + DBG("; in: %u:%u, len=%ux%u, loc=%u", semantic_name, semantic_index, array_len, ncomp, n); @@ -2045,7 +2197,7 @@ setup_input(struct ir3_compile *ctx, nir_variable *in) so->inputs[n].inloc + i - 8, use_ldlv); } } else { - instr = create_input(ctx->block, NULL, idx); + instr = create_input(ctx->block, idx); } ctx->ir->inputs[idx] = instr; @@ -2069,7 +2221,7 @@ setup_output(struct ir3_compile *ctx, nir_variable *out) unsigned n = out->data.driver_location; unsigned comp = 0; - DBG("; out: %u:%u, len=%ux%u, loc=%u\n", + DBG("; out: %u:%u, len=%ux%u, loc=%u", semantic_name, semantic_index, array_len, ncomp, n); @@ -2098,6 +2250,10 @@ setup_output(struct ir3_compile *ctx, nir_variable *out) so->writes_pos = true; break; case TGSI_SEMANTIC_COLOR: + if (semantic_index == -1) { + semantic_index = 0; + so->color0_mrt = 1; + } break; default: compile_error(ctx, "unknown FS semantic name: %s\n", @@ -2136,13 +2292,9 @@ emit_instructions(struct ir3_compile *ctx) ninputs = exec_list_length(&ctx->s->inputs) * 4; noutputs = exec_list_length(&ctx->s->outputs) * 4; - /* we need to allocate big enough outputs array so that - * we can stuff the kill's at the end. 
Likewise for vtx - shaders, we need to leave room for sysvals: + /* For vtx shaders, we need to leave room for sysvals: */ - if (ctx->so->type == SHADER_FRAGMENT) { - noutputs += ARRAY_SIZE(ctx->kill); - } else if (ctx->so->type == SHADER_VERTEX) { + if (ctx->so->type == SHADER_VERTEX) { ninputs += 8; } @@ -2153,9 +2305,7 @@ emit_instructions(struct ir3_compile *ctx) ctx->in_block = ctx->block; list_addtail(&ctx->block->node, &ctx->ir->block_list); - if (ctx->so->type == SHADER_FRAGMENT) { - ctx->ir->noutputs -= ARRAY_SIZE(ctx->kill); - } else if (ctx->so->type == SHADER_VERTEX) { + if (ctx->so->type == SHADER_VERTEX) { ctx->ir->ninputs -= 8; } @@ -2254,13 +2404,13 @@ fixup_frag_inputs(struct ir3_compile *ctx) so->pos_regid = regid; /* r0.x */ - instr = create_input(ctx->in_block, NULL, ir->ninputs); + instr = create_input(ctx->in_block, ir->ninputs); instr->regs[0]->num = regid++; inputs[ir->ninputs++] = instr; ctx->frag_pos->regs[1]->instr = instr; /* r0.y */ - instr = create_input(ctx->in_block, NULL, ir->ninputs); + instr = create_input(ctx->in_block, ir->ninputs); instr->regs[0]->num = regid++; inputs[ir->ninputs++] = instr; ctx->frag_pos->regs[2]->instr = instr; @@ -2270,9 +2420,7 @@ fixup_frag_inputs(struct ir3_compile *ctx) int ir3_compile_shader_nir(struct ir3_compiler *compiler, - struct ir3_shader_variant *so, - const struct tgsi_token *tokens, - struct ir3_shader_key key) + struct ir3_shader_variant *so) { struct ir3_compile *ctx; struct ir3 *ir; @@ -2282,7 +2430,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, assert(!so->ir); - ctx = compile_init(compiler, so, tokens); + ctx = compile_init(compiler, so, so->shader->tokens); if (!ctx) { DBG("INIT failed!"); ret = -1; @@ -2307,7 +2455,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, fixup_frag_inputs(ctx); /* at this point, for binning pass, throw away unneeded outputs: */ - if (key.binning_pass) { + if (so->key.binning_pass) { for (i = 0, j = 0; i < so->outputs_count; i++) { unsigned name = sem2name(so->outputs[i].semantic); unsigned idx = sem2idx(so->outputs[i].semantic); @@ -2332,7 +2480,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, /* if we want half-precision outputs, mark the output registers * as half: */ - if (key.half_precision) { + if (so->key.half_precision) { for (i = 0; i < ir->noutputs; i++) { struct ir3_instruction *out = ir->outputs[i]; if (!out) @@ -2353,15 +2501,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, } } - /* at this point, we want the kill's in the outputs array too, * so that they get scheduled (since they have no dst).. 
we've - * already ensured that the array is big enough in push_block(): - */ - if (so->type == SHADER_FRAGMENT) { - for (i = 0; i < ctx->kill_count; i++) - ir->outputs[ir->noutputs++] = ctx->kill[i]; - } - if (fd_mesa_debug & FD_DBG_OPTMSGS) { printf("BEFORE CP:\n"); ir3_print(ir); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c index 8c7c80f7aae..be4e4e81109 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c @@ -291,7 +291,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) instr->regs[n+1] = src_reg; if (src_reg->flags & IR3_REG_RELATIV) - instr->address = reg->instr->address; + ir3_instr_set_address(instr, reg->instr->address); return; } @@ -300,7 +300,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) !conflicts(instr->address, reg->instr->address)) { src_reg->flags = new_flags; instr->regs[n+1] = src_reg; - instr->address = reg->instr->address; + ir3_instr_set_address(instr, reg->instr->address); return; } @@ -389,7 +389,7 @@ instr_cp(struct ir3_instruction *instr, unsigned *flags) } if (instr->address) - instr->address = instr_cp(instr->address, NULL); + ir3_instr_set_address(instr, instr_cp(instr->address, NULL)); return instr; } @@ -408,6 +408,10 @@ ir3_cp(struct ir3 *ir) } } + for (unsigned i = 0; i < ir->keeps_count; i++) { + ir->keeps[i] = instr_cp(ir->keeps[i], NULL); + } + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { if (block->condition) block->condition = instr_cp(block->condition, NULL); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c index 3a108243479..97df0c2ac99 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c @@ -156,6 +156,9 @@ ir3_depth(struct ir3 *ir) if (ir->outputs[i]) ir3_instr_depth(ir->outputs[i]); + for (i = 0; i < ir->keeps_count; i++) + ir3_instr_depth(ir->keeps[i]); + /* We also need to account for if-condition: */ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { if (block->condition) @@ -167,6 +170,15 @@ ir3_depth(struct ir3 *ir) remove_unused_by_block(block); } + /* note that we can end up with unused indirects, but we should + * not end up with unused predicates. 
+ */ + for (i = 0; i < ir->indirects_count; i++) { + struct ir3_instruction *instr = ir->indirects[i]; + if (instr->depth == DEPTH_UNUSED) + ir->indirects[i] = NULL; + } + /* cleanup unused inputs: */ for (i = 0; i < ir->ninputs; i++) { struct ir3_instruction *in = ir->inputs[i]; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_group.c b/src/gallium/drivers/freedreno/ir3/ir3_group.c index 70d9b08e019..ca28aefd502 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_group.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_group.c @@ -236,6 +236,11 @@ find_neighbors(struct ir3 *ir) instr_find_neighbors(instr); } } + + for (i = 0; i < ir->keeps_count; i++) { + struct ir3_instruction *instr = ir->keeps[i]; + instr_find_neighbors(instr); + } } void diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c index f4a4223ae17..e94293f6d6b 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c @@ -182,14 +182,14 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) */ ctx->has_samp = true; regmask_set(&needs_sy, n->regs[0]); - } else if (is_mem(n)) { + } else if (is_load(n)) { regmask_set(&needs_sy, n->regs[0]); } /* both tex/sfu appear to not always immediately consume * their src register(s): */ - if (is_tex(n) || is_sfu(n) || is_mem(n)) { + if (is_tex(n) || is_sfu(n) || is_load(n)) { foreach_src(reg, n) { if (reg_gpr(reg)) regmask_set(&needs_ss_war, reg); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c index f377982dd5e..07e03d26908 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_print.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_print.c @@ -175,6 +175,20 @@ print_instr(struct ir3_instruction *instr, int lvl) printf("]"); } + if (instr->cp.left) { + printf(", left=_"); + printf("["); + print_instr_name(instr->cp.left); + printf("]"); + } + + if (instr->cp.right) { + printf(", right=_"); + printf("["); + print_instr_name(instr->cp.right); + printf("]"); + } + if (is_meta(instr)) { if (instr->opc == OPC_META_FO) { printf(", off=%d", instr->fo.off); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c index e5aba859fab..eaf3b3c35e8 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c @@ -241,6 +241,21 @@ ir3_ra_alloc_reg_set(void *memctx) return set; } +/* additional block-data (per-block) */ +struct ir3_ra_block_data { + BITSET_WORD *def; /* variables defined before used in block */ + BITSET_WORD *use; /* variables used before defined in block */ + BITSET_WORD *livein; /* which defs reach entry point of block */ + BITSET_WORD *liveout; /* which defs reach exit point of block */ +}; + +/* additional instruction-data (per-instruction) */ +struct ir3_ra_instr_data { + /* cached instruction 'definer' info: */ + struct ir3_instruction *defn; + int off, sz, cls; +}; + /* register-assign context, per-shader */ struct ir3_ra_ctx { struct ir3 *ir; @@ -254,14 +269,7 @@ struct ir3_ra_ctx { unsigned class_base[total_class_count]; unsigned instr_cnt; unsigned *def, *use; /* def/use table */ -}; - -/* additional block-data (per-block) */ -struct ir3_ra_block_data { - BITSET_WORD *def; /* variables defined before used in block */ - BITSET_WORD *use; /* variables used before defined in block */ - BITSET_WORD *livein; /* which defs reach entry point of block */ - BITSET_WORD *liveout; /* which defs reach exit point of block 
*/ + struct ir3_ra_instr_data *instrd; }; static bool @@ -291,8 +299,6 @@ is_temp(struct ir3_register *reg) { if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)) return false; - if (reg->flags & IR3_REG_RELATIV) // TODO - return false; if ((reg->num == regid(REG_A0, 0)) || (reg->num == regid(REG_P0, 0))) return false; @@ -309,28 +315,45 @@ writes_gpr(struct ir3_instruction *instr) } static struct ir3_instruction * -get_definer(struct ir3_instruction *instr, int *sz, int *off) +get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, + int *sz, int *off) { + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; struct ir3_instruction *d = NULL; + + if (instr->fanin) + return get_definer(ctx, instr->fanin, sz, off); + + if (id->defn) { + *sz = id->sz; + *off = id->off; + return id->defn; + } + if (is_meta(instr) && (instr->opc == OPC_META_FI)) { /* What about the case where collect is subset of array, we * need to find the distance between where actual array starts * and fanin.. that probably doesn't happen currently. */ struct ir3_register *src; + int dsz, doff; /* note: don't use foreach_ssa_src as this gets called once * while assigning regs (which clears SSA flag) */ - foreach_src(src, instr) { + foreach_src_n(src, n, instr) { + struct ir3_instruction *dd; if (!src->instr) continue; - if ((!d) || (src->instr->ip < d->ip)) - d = src->instr; - } - *sz = instr->regs_count - 1; - *off = 0; + dd = get_definer(ctx, src->instr, &dsz, &doff); + + if ((!d) || (dd->ip < d->ip)) { + d = dd; + *sz = dsz; + *off = doff - n; + } + } } else if (instr->cp.right || instr->cp.left) { /* covers also the meta:fo case, which ends up w/ single @@ -386,7 +409,7 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off) struct ir3_instruction *dd; int dsz, doff; - dd = get_definer(phi, &dsz, &doff); + dd = get_definer(ctx, phi, &dsz, &doff); *sz = MAX2(*sz, dsz); *off = doff; @@ -401,6 +424,7 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off) * the phi, so we don't need to chase definers */ struct ir3_register *src; + struct ir3_instruction *dd = d; /* note: don't use foreach_ssa_src as this gets called once * while assigning regs (which clears SSA flag) @@ -408,16 +432,18 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off) foreach_src(src, d) { if (!src->instr) continue; - if (src->instr->ip < d->ip) - d = src->instr; + if (src->instr->ip < dd->ip) + dd = src->instr; } + + d = dd; } if (is_meta(d) && (d->opc == OPC_META_FO)) { struct ir3_instruction *dd; int dsz, doff; - dd = get_definer(d->regs[1]->instr, &dsz, &doff); + dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff); /* by definition, should come before: */ debug_assert(dd->ip < d->ip); @@ -429,9 +455,30 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off) d = dd; } + id->defn = d; + id->sz = *sz; + id->off = *off; + return d; } +static void +ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block) +{ + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + if (instr->regs_count == 0) + continue; + /* couple special cases: */ + if (writes_addr(instr) || writes_pred(instr)) { + id->cls = -1; + continue; + } + id->defn = get_definer(ctx, instr, &id->sz, &id->off); + id->cls = size_to_class(id->sz, is_half(id->defn)); + } +} + /* give each instruction a name (and ip), and count up the # of names * of each class */ @@ -439,8 +486,11 @@ static void ra_block_name_instructions(struct ir3_ra_ctx *ctx, 
struct ir3_block *block) { list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - struct ir3_instruction *defn; - int cls, sz, off; + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + +#ifdef DEBUG + instr->name = ~0; +#endif ctx->instr_cnt++; @@ -450,9 +500,7 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) if (!writes_gpr(instr)) continue; - defn = get_definer(instr, &sz, &off); - - if (defn != instr) + if (id->defn != instr) continue; /* arrays which don't fit in one of the pre-defined class @@ -460,9 +508,8 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) * * TODO but we still need to allocate names for them, don't we?? */ - cls = size_to_class(sz, is_half(defn)); - if (cls >= 0) { - instr->name = ctx->class_alloc_count[cls]++; + if (id->cls >= 0) { + instr->name = ctx->class_alloc_count[id->cls]++; ctx->alloc_count++; } } @@ -471,8 +518,16 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) static void ra_init(struct ir3_ra_ctx *ctx) { + unsigned n; + ir3_clear_mark(ctx->ir); - ir3_count_instructions(ctx->ir); + n = ir3_count_instructions(ctx->ir); + + ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n); + + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + ra_block_find_definers(ctx, block); + } list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { ra_block_name_instructions(ctx, block); @@ -488,6 +543,7 @@ ra_init(struct ir3_ra_ctx *ctx) } ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count); + ralloc_steal(ctx->g, ctx->instrd); ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); } @@ -555,39 +611,36 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) */ if (writes_gpr(instr)) { - struct ir3_instruction *defn; - int cls, sz, off; + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - defn = get_definer(instr, &sz, &off); - if (defn == instr) { + if (id->defn == instr) { /* arrays which don't fit in one of the pre-defined class * sizes are pre-colored: */ - cls = size_to_class(sz, is_half(defn)); - if (cls >= 0) { - unsigned name = ra_name(ctx, cls, defn); + if (id->cls >= 0) { + unsigned name = ra_name(ctx, id->cls, id->defn); - ctx->def[name] = defn->ip; - ctx->use[name] = defn->ip; + ctx->def[name] = id->defn->ip; + ctx->use[name] = id->defn->ip; /* since we are in SSA at this point: */ debug_assert(!BITSET_TEST(bd->use, name)); BITSET_SET(bd->def, name); - if (is_half(defn)) { + if (is_half(id->defn)) { ra_set_node_class(ctx->g, name, - ctx->set->half_classes[cls - class_count]); + ctx->set->half_classes[id->cls - class_count]); } else { ra_set_node_class(ctx->g, name, - ctx->set->classes[cls]); + ctx->set->classes[id->cls]); } /* extend the live range for phi srcs, which may come * from the bottom of the loop */ - if (defn->regs[0]->flags & IR3_REG_PHI_SRC) { - struct ir3_instruction *phi = defn->regs[0]->instr; + if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) { + struct ir3_instruction *phi = id->defn->regs[0]->instr; foreach_ssa_src(src, phi) { /* if src is after phi, then we need to extend * the liverange to the end of src's block: @@ -606,13 +659,10 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) foreach_ssa_src(src, instr) { if (writes_gpr(src)) { - struct ir3_instruction *srcdefn; - int cls, sz, off; + struct ir3_ra_instr_data *id = 
&ctx->instrd[src->ip]; - srcdefn = get_definer(src, &sz, &off); - cls = size_to_class(sz, is_half(srcdefn)); - if (cls >= 0) { - unsigned name = ra_name(ctx, cls, srcdefn); + if (id->cls >= 0) { + unsigned name = ra_name(ctx, id->cls, id->defn); ctx->use[name] = MAX2(ctx->use[name], instr->ip); if (!BITSET_TEST(bd->def, name)) BITSET_SET(bd->use, name); @@ -704,13 +754,10 @@ ra_add_interference(struct ir3_ra_ctx *ctx) /* need to fix things up to keep outputs live: */ for (unsigned i = 0; i < ir->noutputs; i++) { struct ir3_instruction *instr = ir->outputs[i]; - struct ir3_instruction *defn; - int cls, sz, off; + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - defn = get_definer(instr, &sz, &off); - cls = size_to_class(sz, is_half(defn)); - if (cls >= 0) { - unsigned name = ra_name(ctx, cls, defn); + if (id->cls >= 0) { + unsigned name = ra_name(ctx, id->cls, id->defn); ctx->use[name] = ctx->instr_cnt; } } @@ -780,15 +827,12 @@ static void reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg, struct ir3_instruction *instr) { - struct ir3_instruction *defn; - int cls, sz, off; + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - defn = get_definer(instr, &sz, &off); - cls = size_to_class(sz, is_half(defn)); - if (cls >= 0) { - unsigned name = ra_name(ctx, cls, defn); + if (id->cls >= 0) { + unsigned name = ra_name(ctx, id->cls, id->defn); unsigned r = ra_get_node_reg(ctx->g, name); - unsigned num = ctx->set->ra_reg_to_gpr[r] + off; + unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off; if (reg->flags & IR3_REG_RELATIV) num += reg->offset; @@ -796,7 +840,7 @@ reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg, reg->num = num; reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC); - if (is_half(defn)) + if (is_half(id->defn)) reg->flags |= IR3_REG_HALF; } } @@ -851,19 +895,16 @@ ra_alloc(struct ir3_ra_ctx *ctx) for (j = 0; i < ir->ninputs; i++) { struct ir3_instruction *instr = ir->inputs[i]; if (instr) { - struct ir3_instruction *defn; - int cls, sz, off; + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - defn = get_definer(instr, &sz, &off); - if (defn == instr) { + if (id->defn == instr) { unsigned name, reg; - cls = size_to_class(sz, is_half(defn)); - name = ra_name(ctx, cls, defn); - reg = ctx->set->gpr_to_ra_reg[cls][j]; + name = ra_name(ctx, id->cls, id->defn); + reg = ctx->set->gpr_to_ra_reg[id->cls][j]; ra_set_node_reg(ctx->g, name, reg); - j += sz; + j += id->sz; } } } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c index 49a4426d163..2ee325518f7 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c @@ -80,12 +80,12 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) list_delinit(&instr->node); if (writes_addr(instr)) { - assert(ctx->addr == NULL); + debug_assert(ctx->addr == NULL); ctx->addr = instr; } if (writes_pred(instr)) { - assert(ctx->pred == NULL); + debug_assert(ctx->pred == NULL); ctx->pred = instr; } @@ -180,13 +180,13 @@ check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, * free: */ if (writes_addr(instr) && ctx->addr) { - assert(ctx->addr != instr); + debug_assert(ctx->addr != instr); notes->addr_conflict = true; return true; } if (writes_pred(instr) && ctx->pred) { - assert(ctx->pred != instr); + debug_assert(ctx->pred != instr); notes->pred_conflict = true; return true; } @@ -261,6 +261,20 @@ instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, return 0; } 
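[Editor's note: the scheduler change that follows gates address-register writes on whether any consumer would become ready. As a minimal standalone sketch of that readiness rule -- not part of the patch, with simplified stand-in types rather than the real ir3 structures -- consider:]

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical, simplified instruction node (not the real ir3_instruction). */
struct insn {
	const char *name;
	bool scheduled;
	struct insn *srcs[4];   /* up to 3 SSA sources, NULL-terminated */
	struct insn *address;   /* addr-register dependency, if any */
};

/* Could 'insn' be scheduled if 'src' were scheduled, i.e. are all of its
 * *other* sources already placed?  This mirrors the intent of the
 * could_sched() helper added below. */
static bool could_sched(const struct insn *insn, const struct insn *src)
{
	for (size_t i = 0; insn->srcs[i]; i++)
		if (insn->srcs[i] != src && !insn->srcs[i]->scheduled)
			return false;
	return true;
}

/* Is scheduling the addr-writer 'aw' useful right now, i.e. does it
 * immediately unblock at least one unscheduled indirect that uses it? */
static bool addr_write_useful(struct insn **indirects, size_t n,
		struct insn *aw)
{
	for (size_t i = 0; i < n; i++)
		if (indirects[i] && !indirects[i]->scheduled &&
				indirects[i]->address == aw &&
				could_sched(indirects[i], aw))
			return true;
	return false;
}

int main(void)
{
	struct insn imm   = { "imm",      true,  { NULL },         NULL };
	struct insn mova  = { "mova",     false, { &imm, NULL },   NULL };
	struct insn other = { "other",    false, { NULL },         NULL };
	struct insn ind   = { "mov(rel)", false, { &other, NULL }, &mova };
	struct insn *indirects[] = { &ind };

	/* 'other' is not scheduled yet, so writing a0 now would be wasted: */
	printf("useful: %d\n", addr_write_useful(indirects, 1, &mova));
	other.scheduled = true;
	printf("useful: %d\n", addr_write_useful(indirects, 1, &mova));
	return 0;
}

[Deferring the address write until a consumer is otherwise ready matters because there is only one address register; scheduling it early would pin a0 while no indirect could consume it, which is exactly the deadlock that split_addr() later breaks by cloning the writer. End of editor's note; the patch resumes below.]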
+/* could an instruction be scheduled if specified ssa src was scheduled? */ +static bool +could_sched(struct ir3_instruction *instr, struct ir3_instruction *src) +{ + struct ir3_instruction *other_src; + foreach_ssa_src(other_src, instr) { + /* if dependency not scheduled, we aren't ready yet: */ + if ((src != other_src) && !is_scheduled(other_src)) { + return false; + } + } + return true; +} + /* move eligible instructions to the priority list: */ static unsigned add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, @@ -272,6 +286,31 @@ add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, int e = instr_eligibility(ctx, notes, instr); if (e < 0) continue; + + /* For instructions that write address register we need to + * make sure there is at least one instruction that uses the + * addr value which is otherwise ready. + * + * TODO if any instructions use pred register and have other + * src args, we would need to do the same for writes_pred().. + */ + if (unlikely(writes_addr(instr))) { + struct ir3 *ir = instr->block->shader; + bool ready = false; + for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) { + struct ir3_instruction *indirect = ir->indirects[i]; + if (!indirect) + continue; + if (indirect->address != instr) + continue; + ready = could_sched(indirect, instr); + } + + /* nothing could be scheduled, so keep looking: */ + if (!ready) + continue; + } + min_delay = MIN2(min_delay, e); if (e == 0) { /* remove from unscheduled list and into priority queue: */ @@ -287,20 +326,25 @@ add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, * instructions which depend on the current address register * to a clone of the instruction which wrote the address reg. */ -static void +static struct ir3_instruction * split_addr(struct ir3_sched_ctx *ctx) { - struct ir3 *ir = ctx->addr->block->shader; + struct ir3 *ir; struct ir3_instruction *new_addr = NULL; unsigned i; debug_assert(ctx->addr); + ir = ctx->addr->block->shader; + for (i = 0; i < ir->indirects_count; i++) { struct ir3_instruction *indirect = ir->indirects[i]; + if (!indirect) + continue; + /* skip instructions already scheduled: */ - if (indirect->flags & IR3_INSTR_MARK) + if (is_scheduled(indirect)) continue; /* remap remaining instructions using current addr @@ -312,32 +356,36 @@ split_addr(struct ir3_sched_ctx *ctx) /* original addr is scheduled, but new one isn't: */ new_addr->flags &= ~IR3_INSTR_MARK; } - indirect->address = new_addr; + ir3_instr_set_address(indirect, new_addr); } } /* all remaining indirects remapped to new addr: */ ctx->addr = NULL; + + return new_addr; } /* "spill" the predicate register by remapping any unscheduled * instructions which depend on the current predicate register * to a clone of the instruction which wrote the address reg. 
*/ -static void +static struct ir3_instruction * split_pred(struct ir3_sched_ctx *ctx) { - struct ir3 *ir = ctx->pred->block->shader; + struct ir3 *ir; struct ir3_instruction *new_pred = NULL; unsigned i; debug_assert(ctx->pred); + ir = ctx->pred->block->shader; + for (i = 0; i < ir->predicates_count; i++) { struct ir3_instruction *predicated = ir->predicates[i]; /* skip instructions already scheduled: */ - if (predicated->flags & IR3_INSTR_MARK) + if (is_scheduled(predicated)) continue; /* remap remaining instructions using current pred @@ -358,6 +406,8 @@ split_pred(struct ir3_sched_ctx *ctx) /* all remaining predicated remapped to new pred: */ ctx->pred = NULL; + + return new_pred; } static void @@ -407,20 +457,32 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) schedule(ctx, instr); } else if (delay == ~0) { + struct ir3_instruction *new_instr = NULL; + /* nothing available to schedule.. if we are blocked on * address/predicate register conflict, then break the * deadlock by cloning the instruction that wrote that * reg: */ if (notes.addr_conflict) { - split_addr(ctx); + new_instr = split_addr(ctx); } else if (notes.pred_conflict) { - split_pred(ctx); + new_instr = split_pred(ctx); } else { debug_assert(0); ctx->error = true; return; } + + if (new_instr) { + list_del(&new_instr->node); + list_addtail(&new_instr->node, &unscheduled_list); + /* the original instr that wrote addr/pred may have + * originated from a different block: + */ + new_instr->block = block; + } + } else { /* and if we run out of instructions that can be scheduled, * then it is time for nop's: diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c index b5b038100cc..312174c0c6d 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c @@ -46,7 +46,8 @@ delete_variant(struct ir3_shader_variant *v) { if (v->ir) ir3_destroy(v->ir); - fd_bo_del(v->bo); + if (v->bo) + fd_bo_del(v->bo); free(v); } @@ -139,6 +140,32 @@ assemble_variant(struct ir3_shader_variant *v) memcpy(fd_bo_map(v->bo), bin, sz); + if (fd_mesa_debug & FD_DBG_DISASM) { + struct ir3_shader_key key = v->key; + DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type, + key.binning_pass, key.color_two_side, key.half_precision); + ir3_shader_disasm(v, bin); + } + + if (fd_mesa_debug & FD_DBG_SHADERDB) { + /* print generic shader info: */ + fprintf(stderr, "SHADER-DB: %s prog %d/%d: %u instructions, %u dwords\n", + ir3_shader_stage(v->shader), + v->shader->id, v->id, + v->info.instrs_count, + v->info.sizedwords); + fprintf(stderr, "SHADER-DB: %s prog %d/%d: %u half, %u full\n", + ir3_shader_stage(v->shader), + v->shader->id, v->id, + v->info.max_half_reg + 1, + v->info.max_reg + 1); + fprintf(stderr, "SHADER-DB: %s prog %d/%d: %u const, %u constlen\n", + ir3_shader_stage(v->shader), + v->shader->id, v->id, + v->info.max_const + 1, + v->constlen); + } + free(bin); /* no need to keep the ir around beyond this point: */ @@ -150,12 +177,12 @@ static struct ir3_shader_variant * create_variant(struct ir3_shader *shader, struct ir3_shader_key key) { struct ir3_shader_variant *v = CALLOC_STRUCT(ir3_shader_variant); - const struct tgsi_token *tokens = shader->tokens; int ret; if (!v) return NULL; + v->id = ++shader->variant_count; v->shader = shader; v->key = key; v->type = shader->type; @@ -163,10 +190,10 @@ create_variant(struct ir3_shader *shader, struct ir3_shader_key key) if (fd_mesa_debug & FD_DBG_DISASM) { DBG("dump tgsi: type=%d, 
k={bp=%u,cts=%u,hp=%u}", shader->type, key.binning_pass, key.color_two_side, key.half_precision); - tgsi_dump(tokens, 0); + tgsi_dump(shader->tokens, 0); } - ret = ir3_compile_shader_nir(shader->compiler, v, tokens, key); + ret = ir3_compile_shader_nir(shader->compiler, v); if (ret) { debug_error("compile failed!"); goto fail; @@ -178,12 +205,6 @@ create_variant(struct ir3_shader *shader, struct ir3_shader_key key) goto fail; } - if (fd_mesa_debug & FD_DBG_DISASM) { - DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type, - key.binning_pass, key.color_two_side, key.half_precision); - disasm_a3xx(fd_bo_map(v->bo), v->info.sizedwords, 0, v->type); - } - return v; fail: @@ -228,8 +249,10 @@ ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key) /* compile new variant if it doesn't exist already: */ v = create_variant(shader, key); - v->next = shader->variants; - shader->variants = v; + if (v) { + v->next = shader->variants; + shader->variants = v; + } return v; } @@ -249,13 +272,372 @@ ir3_shader_destroy(struct ir3_shader *shader) } struct ir3_shader * -ir3_shader_create(struct pipe_context *pctx, const struct tgsi_token *tokens, +ir3_shader_create(struct pipe_context *pctx, + const struct pipe_shader_state *cso, enum shader_t type) { struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader); shader->compiler = fd_context(pctx)->screen->compiler; + shader->id = ++shader->compiler->shader_count; shader->pctx = pctx; shader->type = type; - shader->tokens = tgsi_dup_tokens(tokens); + shader->tokens = tgsi_dup_tokens(cso->tokens); + shader->stream_output = cso->stream_output; + if (fd_mesa_debug & FD_DBG_SHADERDB) { + /* if shader-db run, create a standard variant immediately + * (as otherwise nothing will trigger the shader to be + * actually compiled) + */ + static struct ir3_shader_key key = {}; + ir3_shader_variant(shader, key); + } return shader; } + +static void dump_reg(const char *name, uint32_t r) +{ + if (r != regid(63,0)) + debug_printf("; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]); +} + +static void dump_semantic(struct ir3_shader_variant *so, + unsigned sem, const char *name) +{ + uint32_t regid; + regid = ir3_find_output_regid(so, ir3_semantic_name(sem, 0)); + dump_reg(name, regid); +} + +void +ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin) +{ + struct ir3 *ir = so->ir; + struct ir3_register *reg; + const char *type = ir3_shader_stage(so->shader); + uint8_t regid; + unsigned i; + + for (i = 0; i < ir->ninputs; i++) { + if (!ir->inputs[i]) { + debug_printf("; in%d unused\n", i); + continue; + } + reg = ir->inputs[i]->regs[0]; + regid = reg->num; + debug_printf("@in(%sr%d.%c)\tin%d\n", + (reg->flags & IR3_REG_HALF) ? "h" : "", + (regid >> 2), "xyzw"[regid & 0x3], i); + } + + for (i = 0; i < ir->noutputs; i++) { + if (!ir->outputs[i]) { + debug_printf("; out%d unused\n", i); + continue; + } + /* kill shows up as a virtual output.. skip it! */ + if (is_kill(ir->outputs[i])) + continue; + reg = ir->outputs[i]->regs[0]; + regid = reg->num; + debug_printf("@out(%sr%d.%c)\tout%d\n", + (reg->flags & IR3_REG_HALF) ? 
"h" : "", + (regid >> 2), "xyzw"[regid & 0x3], i); + } + + for (i = 0; i < so->immediates_count; i++) { + debug_printf("@const(c%d.x)\t", so->first_immediate + i); + debug_printf("0x%08x, 0x%08x, 0x%08x, 0x%08x\n", + so->immediates[i].val[0], + so->immediates[i].val[1], + so->immediates[i].val[2], + so->immediates[i].val[3]); + } + + disasm_a3xx(bin, so->info.sizedwords, 0, so->type); + + debug_printf("; %s: outputs:", type); + for (i = 0; i < so->outputs_count; i++) { + uint8_t regid = so->outputs[i].regid; + ir3_semantic sem = so->outputs[i].semantic; + debug_printf(" r%d.%c (%u:%u)", + (regid >> 2), "xyzw"[regid & 0x3], + sem2name(sem), sem2idx(sem)); + } + debug_printf("\n"); + debug_printf("; %s: inputs:", type); + for (i = 0; i < so->inputs_count; i++) { + uint8_t regid = so->inputs[i].regid; + ir3_semantic sem = so->inputs[i].semantic; + debug_printf(" r%d.%c (%u:%u,cm=%x,il=%u,b=%u)", + (regid >> 2), "xyzw"[regid & 0x3], + sem2name(sem), sem2idx(sem), + so->inputs[i].compmask, + so->inputs[i].inloc, + so->inputs[i].bary); + } + debug_printf("\n"); + + /* print generic shader info: */ + debug_printf("; %s prog %d/%d: %u instructions, %d half, %d full\n", + type, so->shader->id, so->id, + so->info.instrs_count, + so->info.max_half_reg + 1, + so->info.max_reg + 1); + + debug_printf("; %d const, %u constlen\n", + so->info.max_const + 1, + so->constlen); + + /* print shader type specific info: */ + switch (so->type) { + case SHADER_VERTEX: + dump_semantic(so, TGSI_SEMANTIC_POSITION, "pos"); + dump_semantic(so, TGSI_SEMANTIC_PSIZE, "psize"); + break; + case SHADER_FRAGMENT: + dump_reg("pos (bary)", so->pos_regid); + dump_semantic(so, TGSI_SEMANTIC_POSITION, "posz"); + dump_semantic(so, TGSI_SEMANTIC_COLOR, "color"); + /* these two are hard-coded since we don't know how to + * program them to anything but all 0's... + */ + if (so->frag_coord) + debug_printf("; fragcoord: r0.x\n"); + if (so->frag_face) + debug_printf("; fragface: hr0.x\n"); + break; + case SHADER_COMPUTE: + break; + } + + debug_printf("\n"); +} + +/* This has to reach into the fd_context a bit more than the rest of + * ir3, but it needs to be aligned with the compiler, so both agree + * on which const regs hold what. And the logic is identical between + * a3xx/a4xx, the only difference is small details in the actual + * CP_LOAD_STATE packets (which is handled inside the generation + * specific ctx->emit_const(_bo)() fxns) + */ + +#include "freedreno_resource.h" + +static void +emit_user_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, + struct fd_constbuf_stateobj *constbuf) +{ + struct fd_context *ctx = fd_context(v->shader->pctx); + const unsigned index = 0; /* user consts are index 0 */ + /* TODO save/restore dirty_mask for binning pass instead: */ + uint32_t dirty_mask = constbuf->enabled_mask; + + if (dirty_mask & (1 << index)) { + struct pipe_constant_buffer *cb = &constbuf->cb[index]; + unsigned size = align(cb->buffer_size, 4) / 4; /* size in dwords */ + + /* in particular, with binning shader we may end up with + * unused consts, ie. we could end up w/ constlen that is + * smaller than first_driver_param. 
In that case truncate + * the user consts early to avoid HLSQ lockup caused by + * writing too many consts + */ + uint32_t max_const = MIN2(v->first_driver_param, v->constlen); + + // I expect that size should be a multiple of vec4's: + assert(size == align(size, 4)); + + /* and even if the start of the const buffer is before + * first_immediate, the end may not be: + */ + size = MIN2(size, 4 * max_const); + + if (size > 0) { + fd_wfi(ctx, ring); + ctx->emit_const(ring, v->type, 0, + cb->buffer_offset, size, + cb->user_buffer, cb->buffer); + constbuf->dirty_mask &= ~(1 << index); + } + } +} + +static void +emit_ubos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, + struct fd_constbuf_stateobj *constbuf) +{ + uint32_t offset = v->first_driver_param; /* UBOs after user consts */ + if (v->constlen > offset) { + struct fd_context *ctx = fd_context(v->shader->pctx); + uint32_t params = MIN2(4, v->constlen - offset) * 4; + uint32_t offsets[params]; + struct fd_bo *bos[params]; + + for (uint32_t i = 0; i < params; i++) { + const uint32_t index = i + 1; /* UBOs start at index 1 */ + struct pipe_constant_buffer *cb = &constbuf->cb[index]; + assert(!cb->user_buffer); + + if ((constbuf->enabled_mask & (1 << index)) && cb->buffer) { + offsets[i] = cb->buffer_offset; + bos[i] = fd_resource(cb->buffer)->bo; + } else { + offsets[i] = 0; + bos[i] = NULL; + } + } + + fd_wfi(ctx, ring); + ctx->emit_const_bo(ring, v->type, false, offset * 4, params, bos, offsets); + } +} + +static void +emit_immediates(struct ir3_shader_variant *v, struct fd_ringbuffer *ring) +{ + struct fd_context *ctx = fd_context(v->shader->pctx); + int size = v->immediates_count; + uint32_t base = v->first_immediate; + + /* truncate size to avoid writing constants that shader + * does not use: + */ + size = MIN2(size + base, v->constlen) - base; + + /* convert out of vec4: */ + base *= 4; + size *= 4; + + if (size > 0) { + fd_wfi(ctx, ring); + ctx->emit_const(ring, v->type, base, + 0, size, v->immediates[0].val, NULL); + } +} + +/* emit stream-out buffers: */ +static void +emit_tfbos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring) +{ + uint32_t offset = v->first_driver_param + 5; /* streamout addresses after driver-params*/ + if (v->constlen > offset) { + struct fd_context *ctx = fd_context(v->shader->pctx); + struct fd_streamout_stateobj *so = &ctx->streamout; + struct pipe_stream_output_info *info = &v->shader->stream_output; + uint32_t params = 4; + uint32_t offsets[params]; + struct fd_bo *bos[params]; + + for (uint32_t i = 0; i < params; i++) { + struct pipe_stream_output_target *target = so->targets[i]; + + if (target) { + offsets[i] = (so->offsets[i] * info->stride[i] * 4) + + target->buffer_offset; + bos[i] = fd_resource(target->buffer)->bo; + } else { + offsets[i] = 0; + bos[i] = NULL; + } + } + + fd_wfi(ctx, ring); + ctx->emit_const_bo(ring, v->type, true, offset * 4, params, bos, offsets); + } +} + +static uint32_t +max_tf_vtx(struct ir3_shader_variant *v) +{ + struct fd_context *ctx = fd_context(v->shader->pctx); + struct fd_streamout_stateobj *so = &ctx->streamout; + struct pipe_stream_output_info *info = &v->shader->stream_output; + uint32_t maxvtxcnt = 0x7fffffff; + + if (v->key.binning_pass) + return 0; + if (v->shader->stream_output.num_outputs == 0) + return 0; + if (so->num_targets == 0) + return 0; + + /* offset to write to is: + * + * total_vtxcnt = vtxcnt + offsets[i] + * offset = total_vtxcnt * stride[i] + * + * offset = vtxcnt * stride[i] ; calculated in shader + * + offsets[i] * stride[i] ; 
calculated at emit_tfbos() + * + * assuming for each vtx, each target buffer will have data written + * up to 'offset + stride[i]', that leaves maxvtxcnt as: + * + * buffer_size = (maxvtxcnt * stride[i]) + stride[i] + * maxvtxcnt = (buffer_size - stride[i]) / stride[i] + * + * but shader is actually doing a less-than (rather than less-than- + * equal) check, so we can drop the -stride[i]. + * + * TODO is assumption about `offset + stride[i]` legit? + */ + for (unsigned i = 0; i < so->num_targets; i++) { + struct pipe_stream_output_target *target = so->targets[i]; + unsigned stride = info->stride[i] * 4; /* convert dwords->bytes */ + if (target) { + uint32_t max = target->buffer_size / stride; + maxvtxcnt = MIN2(maxvtxcnt, max); + } + } + + return maxvtxcnt; +} + +void +ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, + const struct pipe_draw_info *info, uint32_t dirty) +{ + struct fd_context *ctx = fd_context(v->shader->pctx); + + if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) { + struct fd_constbuf_stateobj *constbuf; + bool shader_dirty; + + if (v->type == SHADER_VERTEX) { + constbuf = &ctx->constbuf[PIPE_SHADER_VERTEX]; + shader_dirty = !!(ctx->prog.dirty & FD_SHADER_DIRTY_VP); + } else if (v->type == SHADER_FRAGMENT) { + constbuf = &ctx->constbuf[PIPE_SHADER_FRAGMENT]; + shader_dirty = !!(ctx->prog.dirty & FD_SHADER_DIRTY_FP); + } else { + unreachable("bad shader type"); + return; + } + + emit_user_consts(v, ring, constbuf); + emit_ubos(v, ring, constbuf); + if (shader_dirty) + emit_immediates(v, ring); + } + + /* emit driver params every time: */ + /* TODO skip emit if shader doesn't use driver params to avoid WFI.. */ + if (info && (v->type == SHADER_VERTEX)) { + uint32_t offset = v->first_driver_param + 4; /* driver params after UBOs */ + if (v->constlen >= offset) { + uint32_t vertex_params[4] = { + [IR3_DP_VTXID_BASE] = info->indexed ? 
+ info->index_bias : info->start, + [IR3_DP_VTXCNT_MAX] = max_tf_vtx(v), + }; + + fd_wfi(ctx, ring); + ctx->emit_const(ring, SHADER_VERTEX, offset * 4, 0, + ARRAY_SIZE(vertex_params), vertex_params, NULL); + + /* if needed, emit stream-out buffer addresses: */ + if (vertex_params[IR3_DP_VTXCNT_MAX] > 0) { + emit_tfbos(v, ring); + } + } + } +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h index 9f1b0769180..1bbbdbd224d 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h @@ -29,9 +29,22 @@ #ifndef IR3_SHADER_H_ #define IR3_SHADER_H_ +#include "pipe/p_state.h" + #include "ir3.h" #include "disasm.h" +/* driver param indices: */ +enum ir3_driver_param { + IR3_DP_VTXID_BASE = 0, + IR3_DP_VTXCNT_MAX = 1, +}; + +/* internal semantic used for passing vtxcnt to vertex shader to + * implement transform feedback: + */ +#define IR3_SEMANTIC_VTXCNT (TGSI_SEMANTIC_COUNT + 0) + typedef uint16_t ir3_semantic; /* semantic name + index */ static inline ir3_semantic ir3_semantic_name(uint8_t name, uint16_t index) @@ -100,6 +113,9 @@ ir3_shader_key_equal(struct ir3_shader_key *a, struct ir3_shader_key *b) struct ir3_shader_variant { struct fd_bo *bo; + /* variant id (for debug) */ + uint32_t id; + struct ir3_shader_key key; struct ir3_info info; @@ -192,26 +208,44 @@ struct ir3_shader_variant { struct ir3_shader { enum shader_t type; + /* shader id (for debug): */ + uint32_t id; + uint32_t variant_count; + struct ir3_compiler *compiler; struct pipe_context *pctx; const struct tgsi_token *tokens; + struct pipe_stream_output_info stream_output; struct ir3_shader_variant *variants; - - /* so far, only used for blit_prog shader.. values for - * VPC_VARYING_PS_REPL[i].MODE - */ - uint32_t vpsrepl[8]; }; void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id); struct ir3_shader * ir3_shader_create(struct pipe_context *pctx, - const struct tgsi_token *tokens, enum shader_t type); + const struct pipe_shader_state *cso, enum shader_t type); void ir3_shader_destroy(struct ir3_shader *shader); struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key); +void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin); + +struct fd_ringbuffer; +void ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, + const struct pipe_draw_info *info, uint32_t dirty); + +static inline const char * +ir3_shader_stage(struct ir3_shader *shader) +{ + switch (shader->type) { + case SHADER_VERTEX: return "VERT"; + case SHADER_FRAGMENT: return "FRAG"; + case SHADER_COMPUTE: return "CL"; + default: + unreachable("invalid type"); + return NULL; + } +} /* * Helper/util: diff --git a/src/gallium/drivers/i915/i915_batchbuffer.h b/src/gallium/drivers/i915/i915_batchbuffer.h index dcf63543219..6466fa594f9 100644 --- a/src/gallium/drivers/i915/i915_batchbuffer.h +++ b/src/gallium/drivers/i915/i915_batchbuffer.h @@ -33,20 +33,20 @@ struct i915_context; -static INLINE size_t +static inline size_t i915_winsys_batchbuffer_space(struct i915_winsys_batchbuffer *batch) { return batch->size - (batch->ptr - batch->map); } -static INLINE boolean +static inline boolean i915_winsys_batchbuffer_check(struct i915_winsys_batchbuffer *batch, size_t dwords) { return dwords * 4 <= i915_winsys_batchbuffer_space(batch); } -static INLINE void +static inline void i915_winsys_batchbuffer_dword_unchecked(struct i915_winsys_batchbuffer *batch, unsigned dword) { @@ -54,7 
+54,7 @@ i915_winsys_batchbuffer_dword_unchecked(struct i915_winsys_batchbuffer *batch, batch->ptr += 4; } -static INLINE void +static inline void i915_winsys_batchbuffer_float(struct i915_winsys_batchbuffer *batch, float f) { @@ -64,7 +64,7 @@ i915_winsys_batchbuffer_float(struct i915_winsys_batchbuffer *batch, i915_winsys_batchbuffer_dword_unchecked(batch, uif.ui); } -static INLINE void +static inline void i915_winsys_batchbuffer_dword(struct i915_winsys_batchbuffer *batch, unsigned dword) { @@ -72,7 +72,7 @@ i915_winsys_batchbuffer_dword(struct i915_winsys_batchbuffer *batch, i915_winsys_batchbuffer_dword_unchecked(batch, dword); } -static INLINE void +static inline void i915_winsys_batchbuffer_write(struct i915_winsys_batchbuffer *batch, void *data, size_t size) @@ -83,7 +83,7 @@ i915_winsys_batchbuffer_write(struct i915_winsys_batchbuffer *batch, batch->ptr += size; } -static INLINE boolean +static inline boolean i915_winsys_validate_buffers(struct i915_winsys_batchbuffer *batch, struct i915_winsys_buffer **buffers, int num_of_buffers) @@ -91,7 +91,7 @@ i915_winsys_validate_buffers(struct i915_winsys_batchbuffer *batch, return batch->iws->validate_buffers(batch, buffers, num_of_buffers); } -static INLINE int +static inline int i915_winsys_batchbuffer_reloc(struct i915_winsys_batchbuffer *batch, struct i915_winsys_buffer *buffer, enum i915_winsys_buffer_usage usage, diff --git a/src/gallium/drivers/i915/i915_context.h b/src/gallium/drivers/i915/i915_context.h index 40abf3c577f..c8c7d64f5cb 100644 --- a/src/gallium/drivers/i915/i915_context.h +++ b/src/gallium/drivers/i915/i915_context.h @@ -339,7 +339,7 @@ struct i915_context { #define I915_DST_VARS 4 #define I915_DST_RECT 8 -static INLINE +static inline void i915_set_flush_dirty(struct i915_context *i915, unsigned flush) { i915->hardware_dirty |= I915_HW_FLUSH; @@ -408,7 +408,7 @@ struct pipe_context *i915_create_context(struct pipe_screen *screen, * Inline conversion functions. These are better-typed than the * macros used previously: */ -static INLINE struct i915_context * +static inline struct i915_context * i915_context( struct pipe_context *pipe ) { return (struct i915_context *)pipe; diff --git a/src/gallium/drivers/i915/i915_debug.h b/src/gallium/drivers/i915/i915_debug.h index 079882c811f..0f12a592ae8 100644 --- a/src/gallium/drivers/i915/i915_debug.h +++ b/src/gallium/drivers/i915/i915_debug.h @@ -48,13 +48,13 @@ struct i915_winsys_batchbuffer; extern unsigned i915_debug; #ifdef DEBUG -static INLINE boolean +static inline boolean I915_DBG_ON(unsigned flags) { return i915_debug & flags; } -static INLINE void +static inline void I915_DBG(unsigned flags, const char *fmt, ...) { if (I915_DBG_ON(flags)) { @@ -67,7 +67,7 @@ I915_DBG(unsigned flags, const char *fmt, ...) } #else #define I915_DBG_ON(flags) (0) -static INLINE void I915_DBG(unsigned flags, const char *fmt, ...) {} +static inline void I915_DBG(unsigned flags, const char *fmt, ...) 
{} #endif void i915_debug_init(struct i915_screen *i915); diff --git a/src/gallium/drivers/i915/i915_fpc.h b/src/gallium/drivers/i915/i915_fpc.h index a4dbcb4d271..adc42542fea 100644 --- a/src/gallium/drivers/i915/i915_fpc.h +++ b/src/gallium/drivers/i915/i915_fpc.h @@ -136,7 +136,7 @@ struct i915_fp_compile { /* One neat thing about the UREG representation: */ -static INLINE int +static inline int swizzle(int reg, uint x, uint y, uint z, uint w) { assert(x <= SRC_ONE); diff --git a/src/gallium/drivers/i915/i915_fpc_translate.c b/src/gallium/drivers/i915/i915_fpc_translate.c index 38a33888166..456be9d92ca 100644 --- a/src/gallium/drivers/i915/i915_fpc_translate.c +++ b/src/gallium/drivers/i915/i915_fpc_translate.c @@ -111,7 +111,7 @@ static const float cos_constants[4] = { 1.0, /** * component-wise negation of ureg */ -static INLINE int +static inline int negate(int reg, int x, int y, int z, int w) { /* Another neat thing about the UREG representation */ diff --git a/src/gallium/drivers/i915/i915_prim_emit.c b/src/gallium/drivers/i915/i915_prim_emit.c index 248e21e02da..ea84efd1d17 100644 --- a/src/gallium/drivers/i915/i915_prim_emit.c +++ b/src/gallium/drivers/i915/i915_prim_emit.c @@ -53,7 +53,7 @@ struct setup_stage { /** * Basically a cast wrapper. */ -static INLINE struct setup_stage *setup_stage( struct draw_stage *stage ) +static inline struct setup_stage *setup_stage( struct draw_stage *stage ) { return (struct setup_stage *)stage; } @@ -65,7 +65,7 @@ static INLINE struct setup_stage *setup_stage( struct draw_stage *stage ) * have a couple of slots at the beginning (1-dword header, 4-dword * clip pos) that we ignore here. */ -static INLINE void +static inline void emit_hw_vertex( struct i915_context *i915, const struct vertex_header *vertex) { @@ -124,7 +124,7 @@ emit_hw_vertex( struct i915_context *i915, -static INLINE void +static inline void emit_prim( struct draw_stage *stage, struct prim_header *prim, unsigned hwprim, diff --git a/src/gallium/drivers/i915/i915_prim_vbuf.c b/src/gallium/drivers/i915/i915_prim_vbuf.c index d134dbb1620..8f61f151e0c 100644 --- a/src/gallium/drivers/i915/i915_prim_vbuf.c +++ b/src/gallium/drivers/i915/i915_prim_vbuf.c @@ -96,7 +96,7 @@ struct i915_vbuf_render { /** * Basically a cast wrapper. 
*/ -static INLINE struct i915_vbuf_render * +static inline struct i915_vbuf_render * i915_vbuf_render(struct vbuf_render *render) { assert(render); diff --git a/src/gallium/drivers/i915/i915_resource.h b/src/gallium/drivers/i915/i915_resource.h index ef99cfb5d3c..77fe8b70f79 100644 --- a/src/gallium/drivers/i915/i915_resource.h +++ b/src/gallium/drivers/i915/i915_resource.h @@ -94,14 +94,14 @@ void i915_init_resource_functions(struct i915_context *i915); extern struct u_resource_vtbl i915_buffer_vtbl; extern struct u_resource_vtbl i915_texture_vtbl; -static INLINE struct i915_texture *i915_texture(struct pipe_resource *resource) +static inline struct i915_texture *i915_texture(struct pipe_resource *resource) { struct i915_texture *tex = (struct i915_texture *)resource; assert(tex->b.vtbl == &i915_texture_vtbl); return tex; } -static INLINE struct i915_buffer *i915_buffer(struct pipe_resource *resource) +static inline struct i915_buffer *i915_buffer(struct pipe_resource *resource) { struct i915_buffer *tex = (struct i915_buffer *)resource; assert(tex->b.vtbl == &i915_buffer_vtbl); diff --git a/src/gallium/drivers/i915/i915_resource_texture.c b/src/gallium/drivers/i915/i915_resource_texture.c index 8ef73d6f2c2..9a3279ccb75 100644 --- a/src/gallium/drivers/i915/i915_resource_texture.c +++ b/src/gallium/drivers/i915/i915_resource_texture.c @@ -89,25 +89,25 @@ static const int bottom_offsets[6] = { [PIPE_TEX_FACE_NEG_Z] = 16 + 5 * 8, }; -static INLINE unsigned +static inline unsigned align_nblocksx(enum pipe_format format, unsigned width, unsigned align_to) { return align(util_format_get_nblocksx(format, width), align_to); } -static INLINE unsigned +static inline unsigned align_nblocksy(enum pipe_format format, unsigned width, unsigned align_to) { return align(util_format_get_nblocksy(format, width), align_to); } -static INLINE unsigned +static inline unsigned get_pot_stride(enum pipe_format format, unsigned width) { return util_next_power_of_two(util_format_get_stride(format, width)); } -static INLINE const char* +static inline const char* get_tiling_string(enum i915_winsys_buffer_tile tile) { switch(tile) { diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c index 0590da07b9a..19a94a8e019 100644 --- a/src/gallium/drivers/i915/i915_screen.c +++ b/src/gallium/drivers/i915/i915_screen.c @@ -243,6 +243,10 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap) case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 0; case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: @@ -463,21 +467,15 @@ i915_fence_reference(struct pipe_screen *screen, } static boolean -i915_fence_signalled(struct pipe_screen *screen, - struct pipe_fence_handle *fence) -{ - struct i915_screen *is = i915_screen(screen); - - return is->iws->fence_signalled(is->iws, fence) == 1; -} - -static boolean i915_fence_finish(struct pipe_screen *screen, struct pipe_fence_handle *fence, uint64_t timeout) { struct i915_screen *is = i915_screen(screen); + if (!timeout) + return is->iws->fence_signalled(is->iws, fence) == 1; + return is->iws->fence_finish(is->iws, fence) == 1; } @@ -565,7 +563,6 @@ i915_screen_create(struct i915_winsys *iws) is->base.context_create = i915_create_context; is->base.fence_reference = i915_fence_reference; - is->base.fence_signalled = 
i915_fence_signalled; is->base.fence_finish = i915_fence_finish; i915_init_screen_resource_functions(is); diff --git a/src/gallium/drivers/i915/i915_screen.h b/src/gallium/drivers/i915/i915_screen.h index 99d3ffd3af9..3be941a1561 100644 --- a/src/gallium/drivers/i915/i915_screen.h +++ b/src/gallium/drivers/i915/i915_screen.h @@ -59,7 +59,7 @@ struct i915_screen */ -static INLINE struct i915_screen * +static inline struct i915_screen * i915_screen(struct pipe_screen *pscreen) { return (struct i915_screen *) pscreen; diff --git a/src/gallium/drivers/i915/i915_state_dynamic.c b/src/gallium/drivers/i915/i915_state_dynamic.c index 4050cd4ac44..1c29e8ae671 100644 --- a/src/gallium/drivers/i915/i915_state_dynamic.c +++ b/src/gallium/drivers/i915/i915_state_dynamic.c @@ -46,7 +46,7 @@ * (active) state every time a 4kb boundary is crossed. */ -static INLINE void set_dynamic(struct i915_context *i915, +static inline void set_dynamic(struct i915_context *i915, unsigned offset, const unsigned state) { @@ -60,7 +60,7 @@ static INLINE void set_dynamic(struct i915_context *i915, -static INLINE void set_dynamic_array(struct i915_context *i915, +static inline void set_dynamic_array(struct i915_context *i915, unsigned offset, const unsigned *src, unsigned dwords) diff --git a/src/gallium/drivers/i915/i915_state_immediate.c b/src/gallium/drivers/i915/i915_state_immediate.c index d244a349fce..c4a6cae1beb 100644 --- a/src/gallium/drivers/i915/i915_state_immediate.c +++ b/src/gallium/drivers/i915/i915_state_immediate.c @@ -39,7 +39,7 @@ /* Convinience function to check immediate state. */ -static INLINE void set_immediate(struct i915_context *i915, +static inline void set_immediate(struct i915_context *i915, unsigned offset, const unsigned state) { diff --git a/src/gallium/drivers/i915/i915_state_inlines.h b/src/gallium/drivers/i915/i915_state_inlines.h index d4c5ab69555..015ea32933b 100644 --- a/src/gallium/drivers/i915/i915_state_inlines.h +++ b/src/gallium/drivers/i915/i915_state_inlines.h @@ -34,7 +34,7 @@ #include "i915_reg.h" -static INLINE unsigned +static inline unsigned i915_translate_compare_func(unsigned func) { switch (func) { @@ -59,7 +59,7 @@ i915_translate_compare_func(unsigned func) } } -static INLINE unsigned +static inline unsigned i915_translate_shadow_compare_func(unsigned func) { switch (func) { @@ -84,7 +84,7 @@ i915_translate_shadow_compare_func(unsigned func) } } -static INLINE unsigned +static inline unsigned i915_translate_stencil_op(unsigned op) { switch (op) { @@ -109,7 +109,7 @@ i915_translate_stencil_op(unsigned op) } } -static INLINE unsigned +static inline unsigned i915_translate_blend_factor(unsigned factor) { switch (factor) { @@ -148,7 +148,7 @@ i915_translate_blend_factor(unsigned factor) } } -static INLINE unsigned +static inline unsigned i915_translate_blend_func(unsigned mode) { switch (mode) { @@ -168,7 +168,7 @@ i915_translate_blend_func(unsigned mode) } -static INLINE unsigned +static inline unsigned i915_translate_logic_op(unsigned opcode) { switch (opcode) { @@ -211,7 +211,7 @@ i915_translate_logic_op(unsigned opcode) -static INLINE boolean i915_validate_vertices( unsigned hw_prim, unsigned nr ) +static inline boolean i915_validate_vertices( unsigned hw_prim, unsigned nr ) { boolean ok; diff --git a/src/gallium/drivers/ilo/Makefile.am b/src/gallium/drivers/ilo/Makefile.am index a8785a5e8c4..1f14153748e 100644 --- a/src/gallium/drivers/ilo/Makefile.am +++ b/src/gallium/drivers/ilo/Makefile.am @@ -21,8 +21,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
OTHER # DEALINGS IN THE SOFTWARE. -AUTOMAKE_OPTIONS = subdir-objects - include Makefile.sources include $(top_srcdir)/src/gallium/Automake.inc diff --git a/src/gallium/drivers/ilo/Makefile.sources b/src/gallium/drivers/ilo/Makefile.sources index e1bbb9a0781..7a7db938f92 100644 --- a/src/gallium/drivers/ilo/Makefile.sources +++ b/src/gallium/drivers/ilo/Makefile.sources @@ -1,5 +1,4 @@ C_SOURCES := \ - core/ilo_buffer.h \ core/ilo_builder.c \ core/ilo_builder.h \ core/ilo_builder_3d.h \ @@ -43,6 +42,7 @@ C_SOURCES := \ core/ilo_state_viewport.h \ core/ilo_state_zs.c \ core/ilo_state_zs.h \ + core/ilo_vma.h \ core/intel_winsys.h \ ilo_blit.c \ ilo_blit.h \ @@ -65,8 +65,6 @@ C_SOURCES := \ ilo_public.h \ ilo_query.c \ ilo_query.h \ - ilo_resource.c \ - ilo_resource.h \ ilo_render.c \ ilo_render.h \ ilo_render_gen.h \ @@ -76,6 +74,8 @@ C_SOURCES := \ ilo_render_gen8.c \ ilo_render_media.c \ ilo_render_surface.c \ + ilo_resource.c \ + ilo_resource.h \ ilo_screen.c \ ilo_screen.h \ ilo_shader.c \ diff --git a/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h b/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h index 6d9e3699125..5efe9da2d22 100644 --- a/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h +++ b/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h @@ -39,6 +39,7 @@ #include "ilo_state_shader.h" #include "ilo_state_viewport.h" #include "ilo_state_zs.h" +#include "ilo_vma.h" #include "ilo_builder.h" #include "ilo_builder_3d_top.h" @@ -674,9 +675,10 @@ gen6_3DSTATE_DEPTH_BUFFER(struct ilo_builder *builder, dw[5] |= builder->mocs << GEN8_DEPTH_DW5_MOCS__SHIFT; - if (zs->depth_bo) { - ilo_builder_batch_reloc64(builder, pos + 2, zs->depth_bo, - zs->depth[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE); + if (zs->z_vma) { + ilo_builder_batch_reloc64(builder, pos + 2, zs->z_vma->bo, + zs->z_vma->bo_offset + zs->depth[1], + (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE); } } else { dw[1] = zs->depth[0]; @@ -691,9 +693,10 @@ gen6_3DSTATE_DEPTH_BUFFER(struct ilo_builder *builder, else dw[6] |= builder->mocs << GEN6_DEPTH_DW6_MOCS__SHIFT; - if (zs->depth_bo) { - ilo_builder_batch_reloc(builder, pos + 2, zs->depth_bo, - zs->depth[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE); + if (zs->z_vma) { + ilo_builder_batch_reloc(builder, pos + 2, zs->z_vma->bo, + zs->z_vma->bo_offset + zs->depth[1], + (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE); } } } @@ -724,9 +727,10 @@ gen6_3DSTATE_STENCIL_BUFFER(struct ilo_builder *builder, dw[1] |= builder->mocs << GEN8_STENCIL_DW1_MOCS__SHIFT; - if (zs->stencil_bo) { - ilo_builder_batch_reloc64(builder, pos + 2, zs->stencil_bo, - zs->stencil[1], (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE); + if (zs->s_vma) { + ilo_builder_batch_reloc64(builder, pos + 2, zs->s_vma->bo, + zs->s_vma->bo_offset + zs->stencil[1], + (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE); } } else { dw[1] = zs->stencil[0]; @@ -734,9 +738,10 @@ gen6_3DSTATE_STENCIL_BUFFER(struct ilo_builder *builder, dw[1] |= builder->mocs << GEN6_STENCIL_DW1_MOCS__SHIFT; - if (zs->stencil_bo) { - ilo_builder_batch_reloc(builder, pos + 2, zs->stencil_bo, - zs->stencil[1], (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE); + if (zs->s_vma) { + ilo_builder_batch_reloc(builder, pos + 2, zs->s_vma->bo, + zs->s_vma->bo_offset + zs->stencil[1], + (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE); } } } @@ -767,9 +772,10 @@ gen6_3DSTATE_HIER_DEPTH_BUFFER(struct ilo_builder *builder, dw[1] |= builder->mocs << GEN8_HIZ_DW1_MOCS__SHIFT; - if (zs->hiz_bo) { - ilo_builder_batch_reloc64(builder, pos + 2, zs->hiz_bo, - zs->hiz[1], (zs->z_readonly) ? 
0 : INTEL_RELOC_WRITE); + if (zs->hiz_vma) { + ilo_builder_batch_reloc64(builder, pos + 2, zs->hiz_vma->bo, + zs->hiz_vma->bo_offset + zs->hiz[1], + (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE); } } else { dw[1] = zs->hiz[0]; @@ -777,9 +783,10 @@ gen6_3DSTATE_HIER_DEPTH_BUFFER(struct ilo_builder *builder, dw[1] |= builder->mocs << GEN6_HIZ_DW1_MOCS__SHIFT; - if (zs->hiz_bo) { - ilo_builder_batch_reloc(builder, pos + 2, zs->hiz_bo, - zs->hiz[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE); + if (zs->hiz_vma) { + ilo_builder_batch_reloc(builder, pos + 2, zs->hiz_vma->bo, + zs->hiz_vma->bo_offset + zs->hiz[1], + (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE); } } } diff --git a/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h b/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h index 8d30095e6f6..6e94fb25f1f 100644 --- a/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h +++ b/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h @@ -39,6 +39,7 @@ #include "ilo_state_surface.h" #include "ilo_state_urb.h" #include "ilo_state_vf.h" +#include "ilo_vma.h" #include "ilo_builder.h" static inline void @@ -318,8 +319,10 @@ gen6_3DSTATE_VERTEX_BUFFERS(struct ilo_builder *builder, dw[3] = 0; if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) { - if (b->need_bo) - ilo_builder_batch_reloc64(builder, pos + 1, b->bo, b->vb[1], 0); + if (b->vma) { + ilo_builder_batch_reloc64(builder, pos + 1, b->vma->bo, + b->vma->bo_offset + b->vb[1], 0); + } dw[3] |= b->vb[2]; } else { @@ -331,9 +334,11 @@ gen6_3DSTATE_VERTEX_BUFFERS(struct ilo_builder *builder, dw[3] |= vf->user_instancing[elem][1]; } - if (b->need_bo) { - ilo_builder_batch_reloc(builder, pos + 1, b->bo, b->vb[1], 0); - ilo_builder_batch_reloc(builder, pos + 2, b->bo, b->vb[2], 0); + if (b->vma) { + ilo_builder_batch_reloc(builder, pos + 1, b->vma->bo, + b->vma->bo_offset + b->vb[1], 0); + ilo_builder_batch_reloc(builder, pos + 2, b->vma->bo, + b->vma->bo_offset + b->vb[2], 0); } } @@ -429,9 +434,11 @@ gen6_3DSTATE_INDEX_BUFFER(struct ilo_builder *builder, pos = ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = dw0; - if (ib->need_bo) { - ilo_builder_batch_reloc(builder, pos + 1, ib->bo, ib->ib[1], 0); - ilo_builder_batch_reloc(builder, pos + 2, ib->bo, ib->ib[2], 0); + if (ib->vma) { + ilo_builder_batch_reloc(builder, pos + 1, ib->vma->bo, + ib->vma->bo_offset + ib->ib[1], 0); + ilo_builder_batch_reloc(builder, pos + 2, ib->vma->bo, + ib->vma->bo_offset + ib->ib[2], 0); } else { dw[1] = 0; dw[2] = 0; @@ -456,8 +463,9 @@ gen8_3DSTATE_INDEX_BUFFER(struct ilo_builder *builder, dw[1] = ib->ib[0] | builder->mocs << GEN8_IB_DW1_MOCS__SHIFT; - if (ib->need_bo) { - ilo_builder_batch_reloc64(builder, pos + 2, ib->bo, ib->ib[1], 0); + if (ib->vma) { + ilo_builder_batch_reloc64(builder, pos + 2, ib->vma->bo, + ib->vma->bo_offset + ib->ib[1], 0); } else { dw[2] = 0; dw[3] = 0; @@ -801,11 +809,11 @@ gen7_3DSTATE_SO_BUFFER(struct ilo_builder *builder, builder->mocs << GEN7_SO_BUF_DW1_MOCS__SHIFT | sol->strides[buffer] << GEN7_SO_BUF_DW1_PITCH__SHIFT; - if (sb->need_bo) { - ilo_builder_batch_reloc(builder, pos + 2, sb->bo, - sb->so_buf[0], INTEL_RELOC_WRITE); - ilo_builder_batch_reloc(builder, pos + 3, sb->bo, - sb->so_buf[1], INTEL_RELOC_WRITE); + if (sb->vma) { + ilo_builder_batch_reloc(builder, pos + 2, sb->vma->bo, + sb->vma->bo_offset + sb->so_buf[0], INTEL_RELOC_WRITE); + ilo_builder_batch_reloc(builder, pos + 3, sb->vma->bo, + sb->vma->bo_offset + sb->so_buf[1], INTEL_RELOC_WRITE); } else { dw[2] = 0; dw[3] = 0; @@ -832,9 +840,9 @@ gen8_3DSTATE_SO_BUFFER(struct ilo_builder 
*builder, buffer << GEN7_SO_BUF_DW1_INDEX__SHIFT | builder->mocs << GEN8_SO_BUF_DW1_MOCS__SHIFT; - if (sb->need_bo) { - ilo_builder_batch_reloc64(builder, pos + 2, sb->bo, - sb->so_buf[1], INTEL_RELOC_WRITE); + if (sb->vma) { + ilo_builder_batch_reloc64(builder, pos + 2, sb->vma->bo, + sb->vma->bo_offset + sb->so_buf[1], INTEL_RELOC_WRITE); } else { dw[2] = 0; dw[3] = 0; @@ -842,9 +850,10 @@ gen8_3DSTATE_SO_BUFFER(struct ilo_builder *builder, dw[4] = sb->so_buf[2]; - if (sb->need_write_offset_bo) { - ilo_builder_batch_reloc64(builder, pos + 5, sb->write_offset_bo, - sizeof(uint32_t) * buffer, INTEL_RELOC_WRITE); + if (sb->write_offset_vma) { + ilo_builder_batch_reloc64(builder, pos + 5, sb->write_offset_vma->bo, + sb->write_offset_vma->bo_offset + sizeof(uint32_t) * buffer, + INTEL_RELOC_WRITE); } else { dw[5] = 0; dw[6] = 0; @@ -1254,14 +1263,15 @@ gen6_SURFACE_STATE(struct ilo_builder *builder, ILO_BUILDER_ITEM_SURFACE, state_align, state_len, &dw); memcpy(dw, surf->surface, state_len << 2); - if (surf->bo) { + if (surf->vma) { const uint32_t mocs = (surf->scanout) ? (GEN8_MOCS_MT_PTE | GEN8_MOCS_CT_L3) : builder->mocs; dw[1] |= mocs << GEN8_SURFACE_DW1_MOCS__SHIFT; - ilo_builder_surface_reloc64(builder, state_offset, 8, surf->bo, - surf->surface[8], (surf->readonly) ? 0 : INTEL_RELOC_WRITE); + ilo_builder_surface_reloc64(builder, state_offset, 8, surf->vma->bo, + surf->vma->bo_offset + surf->surface[8], + (surf->readonly) ? 0 : INTEL_RELOC_WRITE); } } else { state_align = 32; @@ -1271,15 +1281,16 @@ gen6_SURFACE_STATE(struct ilo_builder *builder, ILO_BUILDER_ITEM_SURFACE, state_align, state_len, &dw); memcpy(dw, surf->surface, state_len << 2); - if (surf->bo) { + if (surf->vma) { /* * For scanouts, we should not enable caching in LLC. Since we only * enable that on Gen8+, we are fine here. */ dw[5] |= builder->mocs << GEN6_SURFACE_DW5_MOCS__SHIFT; - ilo_builder_surface_reloc(builder, state_offset, 1, surf->bo, - surf->surface[1], (surf->readonly) ? 0 : INTEL_RELOC_WRITE); + ilo_builder_surface_reloc(builder, state_offset, 1, surf->vma->bo, + surf->vma->bo_offset + surf->surface[1], + (surf->readonly) ? 
0 : INTEL_RELOC_WRITE); } } diff --git a/src/gallium/drivers/ilo/core/ilo_core.h b/src/gallium/drivers/ilo/core/ilo_core.h index 0a7f7d9d3fe..da7db90a54b 100644 --- a/src/gallium/drivers/ilo/core/ilo_core.h +++ b/src/gallium/drivers/ilo/core/ilo_core.h @@ -29,15 +29,9 @@ #define ILO_CORE_H #include "pipe/p_compiler.h" -#include "pipe/p_defines.h" -#include "pipe/p_format.h" #include "util/u_debug.h" -#include "util/list.h" -#include "util/u_format.h" -#include "util/u_inlines.h" #include "util/u_math.h" #include "util/u_memory.h" -#include "util/u_pointer.h" #endif /* ILO_CORE_H */ diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c index 0d837d8a9d5..fa547ac5c36 100644 --- a/src/gallium/drivers/ilo/core/ilo_image.c +++ b/src/gallium/drivers/ilo/core/ilo_image.c @@ -40,269 +40,356 @@ enum { IMAGE_TILING_W) }; -struct ilo_image_params { - const struct ilo_dev *dev; - const struct pipe_resource *templ; - unsigned valid_tilings; +struct ilo_image_layout { + enum ilo_image_walk_type walk; + bool interleaved_samples; - bool compressed; + uint8_t valid_tilings; + enum gen_surface_tiling tiling; - unsigned h0, h1; - unsigned max_x, max_y; + enum ilo_image_aux_type aux; + + int align_i; + int align_j; + + struct ilo_image_lod *lods; + int walk_layer_h0; + int walk_layer_h1; + int walk_layer_height; + int monolithic_width; + int monolithic_height; }; -static void -img_get_slice_size(const struct ilo_image *img, - const struct ilo_image_params *params, - unsigned level, unsigned *width, unsigned *height) +static enum ilo_image_walk_type +image_get_gen6_walk(const struct ilo_dev *dev, + const struct ilo_image_info *info) { - const struct pipe_resource *templ = params->templ; - unsigned w, h; + ILO_DEV_ASSERT(dev, 6, 6); - w = u_minify(img->width0, level); - h = u_minify(img->height0, level); + /* TODO we want LODs to be page-aligned */ + if (info->type == GEN6_SURFTYPE_3D) + return ILO_IMAGE_WALK_3D; /* - * From the Sandy Bridge PRM, volume 1 part 1, page 114: + * From the Sandy Bridge PRM, volume 1 part 1, page 115: * - * "The dimensions of the mip maps are first determined by applying the - * sizing algorithm presented in Non-Power-of-Two Mipmaps above. Then, - * if necessary, they are padded out to compression block boundaries." + * "The separate stencil buffer does not support mip mapping, thus the + * storage for LODs other than LOD 0 is not needed. The following + * QPitch equation applies only to the separate stencil buffer: + * + * QPitch = h_0" + * + * Use ILO_IMAGE_WALK_LOD and manually offset to the (page-aligned) levels + * when bound. 
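 *
 * Editor's note, not part of the original comment: image_get_gen6_lods()
 * aligns each LOD's x/y to 64 in the WALK_LOD case, and a W tile is one
 * 4 KiB page, so every level of the one-byte-per-texel stencil buffer
 * starts on its own page. Level lv can then be bound at the whole-tile
 * byte offset (lods[lv].y / 64 * (bo_stride / 64) + lods[lv].x / 64) *
 * 4096, the same trick image_set_gen6_hiz() uses for walk_lod_offsets.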
*/ - w = align(w, img->block_width); - h = align(h, img->block_height); + if (info->bind_zs && info->format == GEN6_FORMAT_R8_UINT) + return ILO_IMAGE_WALK_LOD; + + /* compact spacing is not supported otherwise */ + return ILO_IMAGE_WALK_LAYER; +} + +static enum ilo_image_walk_type +image_get_gen7_walk(const struct ilo_dev *dev, + const struct ilo_image_info *info) +{ + ILO_DEV_ASSERT(dev, 7, 8); + + if (info->type == GEN6_SURFTYPE_3D) + return ILO_IMAGE_WALK_3D; /* - * From the Sandy Bridge PRM, volume 1 part 1, page 111: - * - * "If the surface is multisampled (4x), these values must be adjusted - * as follows before proceeding: + * From the Ivy Bridge PRM, volume 1 part 1, page 111: * - * W_L = ceiling(W_L / 2) * 4 - * H_L = ceiling(H_L / 2) * 4" + * "note that the depth buffer and stencil buffer have an implied value + * of ARYSPC_FULL" * - * From the Ivy Bridge PRM, volume 1 part 1, page 108: + * From the Ivy Bridge PRM, volume 4 part 1, page 66: * - * "If the surface is multisampled and it is a depth or stencil surface - * or Multisampled Surface StorageFormat in SURFACE_STATE is - * MSFMT_DEPTH_STENCIL, W_L and H_L must be adjusted as follows before - * proceeding: + * "If Multisampled Surface Storage Format is MSFMT_MSS and Number of + * Multisamples is not MULTISAMPLECOUNT_1, this field (Surface Array + * Spacing) must be set to ARYSPC_LOD0." + */ + if (info->sample_count > 1) + assert(info->level_count == 1); + return (info->bind_zs || info->level_count > 1) ? + ILO_IMAGE_WALK_LAYER : ILO_IMAGE_WALK_LOD; +} + +static bool +image_get_gen6_interleaved_samples(const struct ilo_dev *dev, + const struct ilo_image_info *info) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * Gen6 supports only interleaved samples. It is not explicitly stated, + * but on Gen7+, render targets are expected to be UMS/CMS (samples + * non-interleaved) and depth/stencil buffers are expected to be IMS + * (samples interleaved). * - * #samples W_L = H_L = - * 2 ceiling(W_L / 2) * 4 HL [no adjustment] - * 4 ceiling(W_L / 2) * 4 ceiling(H_L / 2) * 4 - * 8 ceiling(W_L / 2) * 8 ceiling(H_L / 2) * 4 - * 16 ceiling(W_L / 2) * 8 ceiling(H_L / 2) * 8" + * See "Multisampled Surface Storage Format" field of SURFACE_STATE. + */ + return (ilo_dev_gen(dev) == ILO_GEN(6) || info->bind_zs); +} + +static uint8_t +image_get_gen6_valid_tilings(const struct ilo_dev *dev, + const struct ilo_image_info *info) +{ + uint8_t valid_tilings = IMAGE_TILING_ALL; + + ILO_DEV_ASSERT(dev, 6, 8); + + if (info->valid_tilings) + valid_tilings &= info->valid_tilings; + + /* + * From the Sandy Bridge PRM, volume 1 part 2, page 32: * - * For interleaved samples (4x), where pixels + * "Display/Overlay Y-Major not supported. + * X-Major required for Async Flips" + */ + if (unlikely(info->bind_scanout)) + valid_tilings &= IMAGE_TILING_X; + + /* + * From the Sandy Bridge PRM, volume 3 part 2, page 158: * - * (x, y ) (x+1, y ) - * (x, y+1) (x+1, y+1) + * "The cursor surface address must be 4K byte aligned. The cursor must + * be in linear memory, it cannot be tiled." + */ + if (unlikely(info->bind_cursor)) + valid_tilings &= IMAGE_TILING_NONE; + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 318: * - * would be is occupied by + * "[DevSNB+]: This field (Tiled Surface) must be set to TRUE. Linear + * Depth Buffer is not supported." 
* - * (x, y , si0) (x+1, y , si0) (x, y , si1) (x+1, y , si1) - * (x, y+1, si0) (x+1, y+1, si0) (x, y+1, si1) (x+1, y+1, si1) - * (x, y , si2) (x+1, y , si2) (x, y , si3) (x+1, y , si3) - * (x, y+1, si2) (x+1, y+1, si2) (x, y+1, si3) (x+1, y+1, si3) + * "The Depth Buffer, if tiled, must use Y-Major tiling." * - * Thus the need to + * From the Sandy Bridge PRM, volume 1 part 2, page 22: * - * w = align(w, 2) * 2; - * y = align(y, 2) * 2; + * "W-Major Tile Format is used for separate stencil." */ - if (img->interleaved_samples) { - switch (templ->nr_samples) { - case 0: - case 1: - break; - case 2: - w = align(w, 2) * 2; - break; - case 4: - w = align(w, 2) * 2; - h = align(h, 2) * 2; - break; - case 8: - w = align(w, 2) * 4; - h = align(h, 2) * 2; - break; - case 16: - w = align(w, 2) * 4; - h = align(h, 2) * 4; - break; - default: - assert(!"unsupported sample count"); - break; - } + if (info->bind_zs) { + if (info->format == GEN6_FORMAT_R8_UINT) + valid_tilings &= IMAGE_TILING_W; + else + valid_tilings &= IMAGE_TILING_Y; } - /* - * From the Ivy Bridge PRM, volume 1 part 1, page 108: - * - * "For separate stencil buffer, the width must be mutiplied by 2 and - * height divided by 2..." - * - * To make things easier (for transfer), we will just double the stencil - * stride in 3DSTATE_STENCIL_BUFFER. - */ - w = align(w, img->align_i); - h = align(h, img->align_j); + if (info->bind_surface_sampler || + info->bind_surface_dp_render || + info->bind_surface_dp_typed) { + /* + * From the Haswell PRM, volume 2d, page 233: + * + * "If Number of Multisamples is not MULTISAMPLECOUNT_1, this field + * (Tiled Surface) must be TRUE." + */ + if (info->sample_count > 1) + valid_tilings &= ~IMAGE_TILING_NONE; - *width = w; - *height = h; -} + if (ilo_dev_gen(dev) < ILO_GEN(8)) + valid_tilings &= ~IMAGE_TILING_W; + } -static unsigned -img_get_num_layers(const struct ilo_image *img, - const struct ilo_image_params *params) -{ - const struct pipe_resource *templ = params->templ; - unsigned num_layers = templ->array_size; + if (info->bind_surface_dp_render) { + /* + * From the Sandy Bridge PRM, volume 1 part 2, page 32: + * + * "NOTE: 128BPE Format Color buffer ( render target ) MUST be + * either TileX or Linear." + * + * From the Haswell PRM, volume 5, page 32: + * + * "NOTE: 128 BPP format color buffer (render target) supports + * Linear, TiledX and TiledY." + */ + if (ilo_dev_gen(dev) < ILO_GEN(7.5) && info->block_size == 16) + valid_tilings &= ~IMAGE_TILING_Y; - /* samples of the same index are stored in a layer */ - if (templ->nr_samples > 1 && !img->interleaved_samples) - num_layers *= templ->nr_samples; + /* + * From the Ivy Bridge PRM, volume 4 part 1, page 63: + * + * "This field (Surface Vertical Aligment) must be set to VALIGN_4 + * for all tiled Y Render Target surfaces." + * + * "VALIGN_4 is not supported for surface format R32G32B32_FLOAT." + * + * R32G32B32_FLOAT is not renderable and we only need an assert() here. 
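Editor's aside, a self-contained sketch rather than code from this commit: image_get_gen6_valid_tilings() above works by intersecting constraints, starting from all tilings and AND-ing bits away until only the usable ones remain. The flag values below are stand-ins for the IMAGE_TILING_* masks defined near the top of ilo_image.c.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

enum {
   TILING_NONE = 1 << 0,
   TILING_X    = 1 << 1,
   TILING_Y    = 1 << 2,
   TILING_W    = 1 << 3,
   TILING_ALL  = (1 << 4) - 1
};

static uint8_t
pick_valid_tilings(bool bind_scanout, bool bind_zs, bool is_stencil)
{
   uint8_t valid = TILING_ALL;

   if (bind_scanout)
      valid &= TILING_X;                          /* display wants X-major */
   if (bind_zs)
      valid &= is_stencil ? TILING_W : TILING_Y;  /* W for S8, Y for depth */

   /* conflicting bind flags would leave no tiling bit set at all */
   assert(valid);
   return valid;
}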
+ */ + if (ilo_dev_gen(dev) >= ILO_GEN(7) && ilo_dev_gen(dev) <= ILO_GEN(7.5)) + assert(info->format != GEN6_FORMAT_R32G32B32_FLOAT); + } - return num_layers; + return valid_tilings; } -static void -img_init_layer_height(struct ilo_image *img, - struct ilo_image_params *params) +static uint64_t +image_get_gen6_estimated_size(const struct ilo_dev *dev, + const struct ilo_image_info *info) { - const struct pipe_resource *templ = params->templ; - unsigned num_layers; + /* padding not considered */ + const uint64_t slice_size = info->width * info->height * + info->block_size / (info->block_width * info->block_height); + const uint64_t slice_count = + info->depth * info->array_size * info->sample_count; + const uint64_t estimated_size = slice_size * slice_count; - if (img->walk != ILO_IMAGE_WALK_LAYER) - return; + ILO_DEV_ASSERT(dev, 6, 8); - num_layers = img_get_num_layers(img, params); - if (num_layers <= 1) - return; + if (info->level_count == 1) + return estimated_size; + else + return estimated_size * 4 / 3; +} + +static enum gen_surface_tiling +image_get_gen6_tiling(const struct ilo_dev *dev, + const struct ilo_image_info *info, + uint8_t valid_tilings) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + switch (valid_tilings) { + case IMAGE_TILING_NONE: + return GEN6_TILING_NONE; + case IMAGE_TILING_X: + return GEN6_TILING_X; + case IMAGE_TILING_Y: + return GEN6_TILING_Y; + case IMAGE_TILING_W: + return GEN8_TILING_W; + default: + break; + } /* - * From the Sandy Bridge PRM, volume 1 part 1, page 115: - * - * "The following equation is used for surface formats other than - * compressed textures: - * - * QPitch = (h0 + h1 + 11j)" - * - * "The equation for compressed textures (BC* and FXT1 surface formats) - * follows: - * - * QPitch = (h0 + h1 + 11j) / 4" - * - * "[DevSNB] Errata: Sampler MSAA Qpitch will be 4 greater than the - * value calculated in the equation above, for every other odd Surface - * Height starting from 1 i.e. 1,5,9,13" - * - * From the Ivy Bridge PRM, volume 1 part 1, page 111-112: + * X-tiling has the property that vertically adjacent pixels are usually in + * the same page. When the image size is less than a page, the image + * height is 1, or when the image is not accessed in blocks, there is no + * reason to tile. * - * "If Surface Array Spacing is set to ARYSPC_FULL (note that the depth - * buffer and stencil buffer have an implied value of ARYSPC_FULL): - * - * QPitch = (h0 + h1 + 12j) - * QPitch = (h0 + h1 + 12j) / 4 (compressed) - * - * (There are many typos or missing words here...)" - * - * To access the N-th slice, an offset of (Stride * QPitch * N) is added to - * the base address. The PRM divides QPitch by 4 for compressed formats - * because the block height for those formats are 4, and it wants QPitch to - * mean the number of memory rows, as opposed to texel rows, between - * slices. Since we use texel rows everywhere, we do not need to divide - * QPitch by 4. + * Y-tiling is similar, where vertically adjacent pixels are usually in the + * same cacheline. */ - img->walk_layer_height = params->h0 + params->h1 + - ((ilo_dev_gen(params->dev) >= ILO_GEN(7)) ? 
12 : 11) * img->align_j; + if (valid_tilings & IMAGE_TILING_NONE) { + const uint64_t estimated_size = + image_get_gen6_estimated_size(dev, info); - if (ilo_dev_gen(params->dev) == ILO_GEN(6) && templ->nr_samples > 1 && - img->height0 % 4 == 1) - img->walk_layer_height += 4; + if (info->height == 1 || !(info->bind_surface_sampler || + info->bind_surface_dp_render || + info->bind_surface_dp_typed)) + return GEN6_TILING_NONE; + + if (estimated_size <= 64 || + estimated_size > info->prefer_linear_threshold) + return GEN6_TILING_NONE; + + if (estimated_size <= 2048) + valid_tilings &= ~IMAGE_TILING_X; + } - params->max_y += img->walk_layer_height * (num_layers - 1); + return (valid_tilings & IMAGE_TILING_Y) ? GEN6_TILING_Y : + (valid_tilings & IMAGE_TILING_X) ? GEN6_TILING_X : + GEN6_TILING_NONE; } -static void -img_init_lods(struct ilo_image *img, - struct ilo_image_params *params) +static bool +image_get_gen6_hiz_enable(const struct ilo_dev *dev, + const struct ilo_image_info *info) { - const struct pipe_resource *templ = params->templ; - unsigned cur_x, cur_y; - unsigned lv; + ILO_DEV_ASSERT(dev, 6, 8); - cur_x = 0; - cur_y = 0; - for (lv = 0; lv <= templ->last_level; lv++) { - unsigned lod_w, lod_h; + /* depth buffer? */ + if (!info->bind_zs || + info->format == GEN6_FORMAT_R8_UINT || + info->interleaved_stencil) + return false; - img_get_slice_size(img, params, lv, &lod_w, &lod_h); + /* we want to be able to force 8x4 alignments */ + if (info->type == GEN6_SURFTYPE_1D) + return false; - img->lods[lv].x = cur_x; - img->lods[lv].y = cur_y; - img->lods[lv].slice_width = lod_w; - img->lods[lv].slice_height = lod_h; + if (info->aux_disable) + return false; - switch (img->walk) { - case ILO_IMAGE_WALK_LAYER: - /* MIPLAYOUT_BELOW */ - if (lv == 1) - cur_x += lod_w; - else - cur_y += lod_h; - break; - case ILO_IMAGE_WALK_LOD: - lod_h *= img_get_num_layers(img, params); - if (lv == 1) - cur_x += lod_w; - else - cur_y += lod_h; + if (ilo_debug & ILO_DEBUG_NOHIZ) + return false; - /* every LOD begins at tile boundaries */ - if (templ->last_level > 0) { - assert(img->format == PIPE_FORMAT_S8_UINT); - cur_x = align(cur_x, 64); - cur_y = align(cur_y, 64); - } - break; - case ILO_IMAGE_WALK_3D: - { - const unsigned num_slices = u_minify(templ->depth0, lv); - const unsigned num_slices_per_row = 1 << lv; - const unsigned num_rows = - (num_slices + num_slices_per_row - 1) / num_slices_per_row; + return true; +} - lod_w *= num_slices_per_row; - lod_h *= num_rows; +static bool +image_get_gen7_mcs_enable(const struct ilo_dev *dev, + const struct ilo_image_info *info, + enum gen_surface_tiling tiling) +{ + ILO_DEV_ASSERT(dev, 7, 8); - cur_y += lod_h; - } - break; - } + if (!info->bind_surface_sampler && !info->bind_surface_dp_render) + return false; - if (params->max_x < img->lods[lv].x + lod_w) - params->max_x = img->lods[lv].x + lod_w; - if (params->max_y < img->lods[lv].y + lod_h) - params->max_y = img->lods[lv].y + lod_h; + /* + * From the Ivy Bridge PRM, volume 4 part 1, page 77: + * + * "For Render Target and Sampling Engine Surfaces:If the surface is + * multisampled (Number of Multisamples any value other than + * MULTISAMPLECOUNT_1), this field (MCS Enable) must be enabled." 
+ * + * "This field must be set to 0 for all SINT MSRTs when all RT channels + * are not written" + */ + if (info->sample_count > 1) { + if (ilo_dev_gen(dev) < ILO_GEN(8)) + assert(!info->is_integer); + return true; } - if (img->walk == ILO_IMAGE_WALK_LAYER) { - params->h0 = img->lods[0].slice_height; + if (info->aux_disable) + return false; - if (templ->last_level > 0) - params->h1 = img->lods[1].slice_height; - else - img_get_slice_size(img, params, 1, &cur_x, ¶ms->h1); + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 326: + * + * "When MCS is buffer is used for color clear of non-multisampler + * render target, the following restrictions apply. + * - Support is limited to tiled render targets. + * - Support is for non-mip-mapped and non-array surface types only. + * - Clear is supported only on the full RT; i.e., no partial clear or + * overlapping clears. + * - MCS buffer for non-MSRT is supported only for RT formats 32bpp, + * 64bpp and 128bpp. + * ..." + * + * How about SURFTYPE_3D? + */ + if (!info->bind_surface_dp_render || + tiling == GEN6_TILING_NONE || + info->level_count > 1 || + info->array_size > 1) + return false; + + switch (info->block_size) { + case 4: + case 8: + case 16: + return true; + default: + return false; } } static void -img_init_alignments(struct ilo_image *img, - const struct ilo_image_params *params) +image_get_gen6_alignments(const struct ilo_dev *dev, + const struct ilo_image_info *info, + int *align_i, int *align_j) { - const struct pipe_resource *templ = params->templ; + ILO_DEV_ASSERT(dev, 6, 6); /* * From the Sandy Bridge PRM, volume 1 part 1, page 113: @@ -335,13 +422,33 @@ img_init_alignments(struct ilo_image *img, * * align_i align_j * compressed formats block width block height - * PIPE_FORMAT_S8_UINT 4 2 + * GEN6_FORMAT_R8_UINT 4 2 * other depth/stencil formats 4 4 * 4x multisampled 4 4 * bpp 96 4 2 * others 4 2 or 4 */ + *align_i = (info->compressed) ? info->block_width : 4; + if (info->compressed) { + *align_j = info->block_height; + } else if (info->bind_zs) { + *align_j = (info->format == GEN6_FORMAT_R8_UINT) ? 2 : 4; + } else { + *align_j = (info->sample_count > 1 || info->block_size != 12) ? 
4 : 2; + } +} + +static void +image_get_gen7_alignments(const struct ilo_dev *dev, + const struct ilo_image_info *info, + enum gen_surface_tiling tiling, + int *align_i, int *align_j) +{ + int i, j; + + ILO_DEV_ASSERT(dev, 7, 8); + /* * From the Ivy Bridge PRM, volume 1 part 1, page 110: * @@ -383,465 +490,301 @@ img_init_alignments(struct ilo_image *img, * * align_i align_j * compressed formats block width block height - * PIPE_FORMAT_Z16_UNORM 8 4 - * PIPE_FORMAT_S8_UINT 8 8 + * GEN6_FORMAT_R16_UNORM 8 4 + * GEN6_FORMAT_R8_UINT 8 8 * other depth/stencil formats 4 4 * 2x or 4x multisampled 4 or 8 4 * tiled Y 4 or 8 4 (if rt) - * PIPE_FORMAT_R32G32B32_FLOAT 4 or 8 2 + * GEN6_FORMAT_R32G32B32_FLOAT 4 or 8 2 * others 4 or 8 2 or 4 */ - - if (params->compressed) { - /* this happens to be the case */ - img->align_i = img->block_width; - img->align_j = img->block_height; - } else if (templ->bind & PIPE_BIND_DEPTH_STENCIL) { - if (ilo_dev_gen(params->dev) >= ILO_GEN(7)) { - switch (img->format) { - case PIPE_FORMAT_Z16_UNORM: - img->align_i = 8; - img->align_j = 4; - break; - case PIPE_FORMAT_S8_UINT: - img->align_i = 8; - img->align_j = 8; - break; - default: - img->align_i = 4; - img->align_j = 4; - break; - } - } else { - switch (img->format) { - case PIPE_FORMAT_S8_UINT: - img->align_i = 4; - img->align_j = 2; - break; - default: - img->align_i = 4; - img->align_j = 4; - break; - } + if (info->compressed) { + i = info->block_width; + j = info->block_height; + } else if (info->bind_zs) { + switch (info->format) { + case GEN6_FORMAT_R16_UNORM: + i = 8; + j = 4; + break; + case GEN6_FORMAT_R8_UINT: + i = 8; + j = 8; + break; + default: + i = 4; + j = 4; + break; } } else { const bool valign_4 = - (templ->nr_samples > 1) || - (ilo_dev_gen(params->dev) >= ILO_GEN(8)) || - (ilo_dev_gen(params->dev) >= ILO_GEN(7) && - img->tiling == GEN6_TILING_Y && - (templ->bind & PIPE_BIND_RENDER_TARGET)); - - if (ilo_dev_gen(params->dev) >= ILO_GEN(7) && - ilo_dev_gen(params->dev) <= ILO_GEN(7.5) && valign_4) - assert(img->format != PIPE_FORMAT_R32G32B32_FLOAT); - - img->align_i = 4; - img->align_j = (valign_4) ? 4 : 2; - } + (info->sample_count > 1 || ilo_dev_gen(dev) >= ILO_GEN(8) || + (tiling == GEN6_TILING_Y && info->bind_surface_dp_render)); - /* - * the fact that align i and j are multiples of block width and height - * respectively is what makes the size of the bo a multiple of the block - * size, slices start at block boundaries, and many of the computations - * work. - */ - assert(img->align_i % img->block_width == 0); - assert(img->align_j % img->block_height == 0); + if (ilo_dev_gen(dev) < ILO_GEN(8) && valign_4) + assert(info->format != GEN6_FORMAT_R32G32B32_FLOAT); - /* make sure align() works */ - assert(util_is_power_of_two(img->align_i) && - util_is_power_of_two(img->align_j)); - assert(util_is_power_of_two(img->block_width) && - util_is_power_of_two(img->block_height)); + i = 4; + j = (valign_4) ? 
4 : 2; + } + + *align_i = i; + *align_j = j; } -static void -img_init_tiling(struct ilo_image *img, - const struct ilo_image_params *params) +static bool +image_init_gen6_hardware_layout(const struct ilo_dev *dev, + const struct ilo_image_info *info, + struct ilo_image_layout *layout) { - const struct pipe_resource *templ = params->templ; - unsigned preferred_tilings = params->valid_tilings; - - /* no fencing nor BLT support */ - if (preferred_tilings & ~IMAGE_TILING_W) - preferred_tilings &= ~IMAGE_TILING_W; - - if (templ->bind & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW)) { - /* - * heuristically set a minimum width/height for enabling tiling - */ - if (img->width0 < 64 && (preferred_tilings & ~IMAGE_TILING_X)) - preferred_tilings &= ~IMAGE_TILING_X; - - if ((img->width0 < 32 || img->height0 < 16) && - (img->width0 < 16 || img->height0 < 32) && - (preferred_tilings & ~IMAGE_TILING_Y)) - preferred_tilings &= ~IMAGE_TILING_Y; - } else { - /* force linear if we are not sure where the texture is bound to */ - if (preferred_tilings & IMAGE_TILING_NONE) - preferred_tilings &= IMAGE_TILING_NONE; - } + ILO_DEV_ASSERT(dev, 6, 8); - /* prefer tiled over linear */ - if (preferred_tilings & IMAGE_TILING_Y) - img->tiling = GEN6_TILING_Y; - else if (preferred_tilings & IMAGE_TILING_X) - img->tiling = GEN6_TILING_X; - else if (preferred_tilings & IMAGE_TILING_W) - img->tiling = GEN8_TILING_W; + if (ilo_dev_gen(dev) >= ILO_GEN(7)) + layout->walk = image_get_gen7_walk(dev, info); else - img->tiling = GEN6_TILING_NONE; -} + layout->walk = image_get_gen6_walk(dev, info); -static void -img_init_walk_gen7(struct ilo_image *img, - const struct ilo_image_params *params) -{ - const struct pipe_resource *templ = params->templ; + layout->interleaved_samples = + image_get_gen6_interleaved_samples(dev, info); - /* - * It is not explicitly states, but render targets are expected to be - * UMS/CMS (samples non-interleaved) and depth/stencil buffers are expected - * to be IMS (samples interleaved). - * - * See "Multisampled Surface Storage Format" field of SURFACE_STATE. - */ - if (templ->bind & PIPE_BIND_DEPTH_STENCIL) { - /* - * From the Ivy Bridge PRM, volume 1 part 1, page 111: - * - * "note that the depth buffer and stencil buffer have an implied - * value of ARYSPC_FULL" - */ - img->walk = (templ->target == PIPE_TEXTURE_3D) ? - ILO_IMAGE_WALK_3D : ILO_IMAGE_WALK_LAYER; + layout->valid_tilings = image_get_gen6_valid_tilings(dev, info); + if (!layout->valid_tilings) + return false; - img->interleaved_samples = true; - } else { - /* - * From the Ivy Bridge PRM, volume 4 part 1, page 66: - * - * "If Multisampled Surface Storage Format is MSFMT_MSS and Number - * of Multisamples is not MULTISAMPLECOUNT_1, this field (Surface - * Array Spacing) must be set to ARYSPC_LOD0." - * - * As multisampled resources are not mipmapped, we never use - * ARYSPC_FULL for them. - */ - if (templ->nr_samples > 1) - assert(templ->last_level == 0); + layout->tiling = image_get_gen6_tiling(dev, info, layout->valid_tilings); - img->walk = - (templ->target == PIPE_TEXTURE_3D) ? ILO_IMAGE_WALK_3D : - (templ->last_level > 0) ? 
ILO_IMAGE_WALK_LAYER : - ILO_IMAGE_WALK_LOD; + if (image_get_gen6_hiz_enable(dev, info)) + layout->aux = ILO_IMAGE_AUX_HIZ; + else if (ilo_dev_gen(dev) >= ILO_GEN(7) && + image_get_gen7_mcs_enable(dev, info, layout->tiling)) + layout->aux = ILO_IMAGE_AUX_MCS; + else + layout->aux = ILO_IMAGE_AUX_NONE; - img->interleaved_samples = false; + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + image_get_gen7_alignments(dev, info, layout->tiling, + &layout->align_i, &layout->align_j); + } else { + image_get_gen6_alignments(dev, info, + &layout->align_i, &layout->align_j); } + + return true; } -static void -img_init_walk_gen6(struct ilo_image *img, - const struct ilo_image_params *params) +static bool +image_init_gen6_transfer_layout(const struct ilo_dev *dev, + const struct ilo_image_info *info, + struct ilo_image_layout *layout) { - /* - * From the Sandy Bridge PRM, volume 1 part 1, page 115: - * - * "The separate stencil buffer does not support mip mapping, thus the - * storage for LODs other than LOD 0 is not needed. The following - * QPitch equation applies only to the separate stencil buffer: - * - * QPitch = h_0" - * - * GEN6 does not support compact spacing otherwise. - */ - img->walk = - (params->templ->target == PIPE_TEXTURE_3D) ? ILO_IMAGE_WALK_3D : - (img->format == PIPE_FORMAT_S8_UINT) ? ILO_IMAGE_WALK_LOD : - ILO_IMAGE_WALK_LAYER; + ILO_DEV_ASSERT(dev, 6, 8); + + /* we can define our own layout to save space */ + layout->walk = ILO_IMAGE_WALK_LOD; + layout->interleaved_samples = false; + layout->valid_tilings = IMAGE_TILING_NONE; + layout->tiling = GEN6_TILING_NONE; + layout->aux = ILO_IMAGE_AUX_NONE; + layout->align_i = info->block_width; + layout->align_j = info->block_height; - /* GEN6 supports only interleaved samples */ - img->interleaved_samples = true; + return true; } static void -img_init_walk(struct ilo_image *img, - const struct ilo_image_params *params) +image_get_gen6_slice_size(const struct ilo_dev *dev, + const struct ilo_image_info *info, + const struct ilo_image_layout *layout, + uint8_t level, + int *width, int *height) { - if (ilo_dev_gen(params->dev) >= ILO_GEN(7)) - img_init_walk_gen7(img, params); - else - img_init_walk_gen6(img, params); -} + int w, h; -static unsigned -img_get_valid_tilings(const struct ilo_image *img, - const struct ilo_image_params *params) -{ - const struct pipe_resource *templ = params->templ; - const enum pipe_format format = img->format; - unsigned valid_tilings = params->valid_tilings; + ILO_DEV_ASSERT(dev, 6, 8); - /* - * From the Sandy Bridge PRM, volume 1 part 2, page 32: - * - * "Display/Overlay Y-Major not supported. - * X-Major required for Async Flips" - */ - if (unlikely(templ->bind & PIPE_BIND_SCANOUT)) - valid_tilings &= IMAGE_TILING_X; + w = u_minify(info->width, level); + h = u_minify(info->height, level); /* - * From the Sandy Bridge PRM, volume 3 part 2, page 158: + * From the Sandy Bridge PRM, volume 1 part 1, page 114: * - * "The cursor surface address must be 4K byte aligned. The cursor must - * be in linear memory, it cannot be tiled." + * "The dimensions of the mip maps are first determined by applying the + * sizing algorithm presented in Non-Power-of-Two Mipmaps above. Then, + * if necessary, they are padded out to compression block boundaries." 
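 *
 * Editor's worked example, not from the commit: a 13x7 level of a
 * 4x4-block compressed format pads here to align(13, 4) x align(7, 4) =
 * 16x8 texels, before the align_i/align_j rounding applied at the end of
 * this function.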
*/ -   if (unlikely(templ->bind & (PIPE_BIND_CURSOR | PIPE_BIND_LINEAR))) -      valid_tilings &= IMAGE_TILING_NONE; +   w = align(w, info->block_width); +   h = align(h, info->block_height);    /* -    * From the Sandy Bridge PRM, volume 2 part 1, page 318: +    * From the Sandy Bridge PRM, volume 1 part 1, page 111: * -    * "[DevSNB+]: This field (Tiled Surface) must be set to TRUE. Linear -    * Depth Buffer is not supported." +    * "If the surface is multisampled (4x), these values must be adjusted +    * as follows before proceeding: * -    * "The Depth Buffer, if tiled, must use Y-Major tiling." +    * W_L = ceiling(W_L / 2) * 4 +    * H_L = ceiling(H_L / 2) * 4" * -    * From the Sandy Bridge PRM, volume 1 part 2, page 22: +    * From the Ivy Bridge PRM, volume 1 part 1, page 108: * -    * "W-Major Tile Format is used for separate stencil." +    * "If the surface is multisampled and it is a depth or stencil surface +    * or Multisampled Surface StorageFormat in SURFACE_STATE is +    * MSFMT_DEPTH_STENCIL, W_L and H_L must be adjusted as follows before +    * proceeding: + * + * #samples W_L = H_L = + * 2 ceiling(W_L / 2) * 4 HL [no adjustment] + * 4 ceiling(W_L / 2) * 4 ceiling(H_L / 2) * 4 + * 8 ceiling(W_L / 2) * 8 ceiling(H_L / 2) * 4 + * 16 ceiling(W_L / 2) * 8 ceiling(H_L / 2) * 8" + * + * For interleaved samples (4x), where pixels + * + * (x, y ) (x+1, y ) + * (x, y+1) (x+1, y+1) + * + * would be occupied by + * + * (x, y , si0) (x+1, y , si0) (x, y , si1) (x+1, y , si1) + * (x, y+1, si0) (x+1, y+1, si0) (x, y+1, si1) (x+1, y+1, si1) + * (x, y , si2) (x+1, y , si2) (x, y , si3) (x+1, y , si3) + * (x, y+1, si2) (x+1, y+1, si2) (x, y+1, si3) (x+1, y+1, si3) + * + * Thus the need to + * + * w = align(w, 2) * 2; + * h = align(h, 2) * 2; */ - if (templ->bind & PIPE_BIND_DEPTH_STENCIL) { - switch (format) { - case PIPE_FORMAT_S8_UINT: - valid_tilings &= IMAGE_TILING_W; + if (layout->interleaved_samples) { + switch (info->sample_count) { + case 1: + break; + case 2: + w = align(w, 2) * 2; + break; + case 4: + w = align(w, 2) * 2; + h = align(h, 2) * 2; + break; + case 8: + w = align(w, 2) * 4; + h = align(h, 2) * 2; + break; + case 16: + w = align(w, 2) * 4; + h = align(h, 2) * 4; break; default: - valid_tilings &= IMAGE_TILING_Y; + assert(!"unsupported sample count"); break; } } - if (templ->bind & PIPE_BIND_RENDER_TARGET) { - /* - * From the Sandy Bridge PRM, volume 1 part 2, page 32: - * - * "NOTE: 128BPE Format Color buffer ( render target ) MUST be - * either TileX or Linear." - * - * From the Haswell PRM, volume 5, page 32: - * - * "NOTE: 128 BPP format color buffer (render target) supports - * Linear, TiledX and TiledY." - */ - if (ilo_dev_gen(params->dev) < ILO_GEN(7.5) && img->block_size == 16) - valid_tilings &= ~IMAGE_TILING_Y; - - /* - * From the Ivy Bridge PRM, volume 4 part 1, page 63: - * - * "This field (Surface Vertical Aligment) must be set to VALIGN_4 - * for all tiled Y Render Target surfaces." - * - * "VALIGN_4 is not supported for surface format R32G32B32_FLOAT." 
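Editor's worked example for the interleaved-sample mapping quoted earlier in this hunk: at 4x IMS a 13x7 depth miplevel stores w = align(13, 2) * 2 = 28 by h = align(7, 2) * 2 = 16 texels, each axis rounding up to even and then doubling to make room for the four samples.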
- */ - if (ilo_dev_gen(params->dev) >= ILO_GEN(7) && - ilo_dev_gen(params->dev) <= ILO_GEN(7.5) && - img->format == PIPE_FORMAT_R32G32B32_FLOAT) - valid_tilings &= ~IMAGE_TILING_Y; - - valid_tilings &= ~IMAGE_TILING_W; - } - - if (templ->bind & PIPE_BIND_SAMPLER_VIEW) { - if (ilo_dev_gen(params->dev) < ILO_GEN(8)) - valid_tilings &= ~IMAGE_TILING_W; - } - - /* no conflicting binding flags */ - assert(valid_tilings); - - return valid_tilings; -} - -static void -img_init_size_and_format(struct ilo_image *img, - struct ilo_image_params *params) -{ - const struct pipe_resource *templ = params->templ; - enum pipe_format format = templ->format; - bool require_separate_stencil = false; - - img->target = templ->target; - img->width0 = templ->width0; - img->height0 = templ->height0; - img->depth0 = templ->depth0; - img->array_size = templ->array_size; - img->level_count = templ->last_level + 1; - img->sample_count = (templ->nr_samples) ? templ->nr_samples : 1; - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 317: + * From the Ivy Bridge PRM, volume 1 part 1, page 108: * - * "This field (Separate Stencil Buffer Enable) must be set to the same - * value (enabled or disabled) as Hierarchical Depth Buffer Enable." + * "For separate stencil buffer, the width must be mutiplied by 2 and + * height divided by 2..." * - * GEN7+ requires separate stencil buffers. + * To make things easier (for transfer), we will just double the stencil + * stride in 3DSTATE_STENCIL_BUFFER. */ - if (templ->bind & PIPE_BIND_DEPTH_STENCIL) { - if (ilo_dev_gen(params->dev) >= ILO_GEN(7)) - require_separate_stencil = true; - else - require_separate_stencil = (img->aux.type == ILO_IMAGE_AUX_HIZ); - } - - switch (format) { - case PIPE_FORMAT_ETC1_RGB8: - format = PIPE_FORMAT_R8G8B8X8_UNORM; - break; - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - if (require_separate_stencil) { - format = PIPE_FORMAT_Z24X8_UNORM; - img->separate_stencil = true; - } - break; - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - if (require_separate_stencil) { - format = PIPE_FORMAT_Z32_FLOAT; - img->separate_stencil = true; - } - break; - default: - break; - } + w = align(w, layout->align_i); + h = align(h, layout->align_j); - img->format = format; - img->block_width = util_format_get_blockwidth(format); - img->block_height = util_format_get_blockheight(format); - img->block_size = util_format_get_blocksize(format); - - params->valid_tilings = img_get_valid_tilings(img, params); - params->compressed = util_format_is_compressed(img->format); + *width = w; + *height = h; } -static bool -img_want_mcs(const struct ilo_image *img, - const struct ilo_image_params *params) +static int +image_get_gen6_layer_count(const struct ilo_dev *dev, + const struct ilo_image_info *info, + const struct ilo_image_layout *layout) { - const struct pipe_resource *templ = params->templ; - bool want_mcs = false; + int count = info->array_size; - /* MCS is for RT on GEN7+ */ - if (ilo_dev_gen(params->dev) < ILO_GEN(7)) - return false; + ILO_DEV_ASSERT(dev, 6, 8); - if (templ->target != PIPE_TEXTURE_2D || - !(templ->bind & PIPE_BIND_RENDER_TARGET)) - return false; - - /* - * From the Ivy Bridge PRM, volume 4 part 1, page 77: - * - * "For Render Target and Sampling Engine Surfaces:If the surface is - * multisampled (Number of Multisamples any value other than - * MULTISAMPLECOUNT_1), this field (MCS Enable) must be enabled." 
- * - * "This field must be set to 0 for all SINT MSRTs when all RT channels - * are not written" - */ - if (templ->nr_samples > 1 && !util_format_is_pure_sint(templ->format)) { - want_mcs = true; - } else if (templ->nr_samples <= 1) { - /* - * From the Ivy Bridge PRM, volume 2 part 1, page 326: - * - * "When MCS is buffer is used for color clear of non-multisampler - * render target, the following restrictions apply. - * - Support is limited to tiled render targets. - * - Support is for non-mip-mapped and non-array surface types - * only. - * - Clear is supported only on the full RT; i.e., no partial clear - * or overlapping clears. - * - MCS buffer for non-MSRT is supported only for RT formats - * 32bpp, 64bpp and 128bpp. - * ..." - */ - if (img->tiling != GEN6_TILING_NONE && - templ->last_level == 0 && templ->array_size == 1) { - switch (img->block_size) { - case 4: - case 8: - case 16: - want_mcs = true; - break; - default: - break; - } - } - } + /* samples of the same index are stored in a layer */ + if (!layout->interleaved_samples) + count *= info->sample_count; - return want_mcs; + return count; } -static bool -img_want_hiz(const struct ilo_image *img, - const struct ilo_image_params *params) +static void +image_get_gen6_walk_layer_heights(const struct ilo_dev *dev, + const struct ilo_image_info *info, + struct ilo_image_layout *layout) { - const struct pipe_resource *templ = params->templ; - const struct util_format_description *desc = - util_format_description(templ->format); + ILO_DEV_ASSERT(dev, 6, 8); - if (ilo_debug & ILO_DEBUG_NOHIZ) - return false; + layout->walk_layer_h0 = layout->lods[0].slice_height; - /* we want 8x4 aligned levels */ - if (templ->target == PIPE_TEXTURE_1D) - return false; - - if (!(templ->bind & PIPE_BIND_DEPTH_STENCIL)) - return false; - - if (!util_format_has_depth(desc)) - return false; + if (info->level_count > 1) { + layout->walk_layer_h1 = layout->lods[1].slice_height; + } else { + int dummy; + image_get_gen6_slice_size(dev, info, layout, 1, + &dummy, &layout->walk_layer_h1); + } - /* no point in having HiZ */ - if (templ->usage == PIPE_USAGE_STAGING) - return false; + if (image_get_gen6_layer_count(dev, info, layout) == 1) { + layout->walk_layer_height = 0; + return; + } /* - * As can be seen in img_calculate_hiz_size(), HiZ may not be enabled - * for every level. This is generally fine except on GEN6, where HiZ and - * separate stencil are enabled and disabled at the same time. When the - * format is PIPE_FORMAT_Z32_FLOAT_S8X24_UINT, enabling and disabling HiZ - * can result in incompatible formats. + * From the Sandy Bridge PRM, volume 1 part 1, page 115: + * + * "The following equation is used for surface formats other than + * compressed textures: + * + * QPitch = (h0 + h1 + 11j)" + * + * "The equation for compressed textures (BC* and FXT1 surface formats) + * follows: + * + * QPitch = (h0 + h1 + 11j) / 4" + * + * "[DevSNB] Errata: Sampler MSAA Qpitch will be 4 greater than the + * value calculated in the equation above, for every other odd Surface + * Height starting from 1 i.e. 1,5,9,13" + * + * From the Ivy Bridge PRM, volume 1 part 1, page 111-112: + * + * "If Surface Array Spacing is set to ARYSPC_FULL (note that the depth + * buffer and stencil buffer have an implied value of ARYSPC_FULL): + * + * QPitch = (h0 + h1 + 12j) + * QPitch = (h0 + h1 + 12j) / 4 (compressed) + * + * (There are many typos or missing words here...)" + * + * To access the N-th slice, an offset of (Stride * QPitch * N) is added to + * the base address. 
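 *
 * Editor's worked example, values assumed: on Gen7 with h0 = 64, h1 = 32
 * and j = align_j = 4, QPitch = 64 + 32 + 12 * 4 = 144 texel rows, so
 * slice N begins Stride * 144 * N bytes past the base address.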
The PRM divides QPitch by 4 for compressed formats + * because the block height for those formats are 4, and it wants QPitch to + * mean the number of memory rows, as opposed to texel rows, between + * slices. Since we use texel rows everywhere, we do not need to divide + * QPitch by 4. */ - if (ilo_dev_gen(params->dev) == ILO_GEN(6) && - templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT && - templ->last_level) - return false; + layout->walk_layer_height = layout->walk_layer_h0 + layout->walk_layer_h1 + + ((ilo_dev_gen(dev) >= ILO_GEN(7)) ? 12 : 11) * layout->align_j; - return true; -} - -static void -img_init_aux(struct ilo_image *img, - const struct ilo_image_params *params) -{ - if (img_want_hiz(img, params)) - img->aux.type = ILO_IMAGE_AUX_HIZ; - else if (img_want_mcs(img, params)) - img->aux.type = ILO_IMAGE_AUX_MCS; + if (ilo_dev_gen(dev) == ILO_GEN(6) && info->sample_count > 1 && + info->height % 4 == 1) + layout->walk_layer_height += 4; } static void -img_align(struct ilo_image *img, struct ilo_image_params *params) +image_get_gen6_monolithic_size(const struct ilo_dev *dev, + const struct ilo_image_info *info, + struct ilo_image_layout *layout, + int max_x, int max_y) { - const struct pipe_resource *templ = params->templ; int align_w = 1, align_h = 1, pad_h = 0; + ILO_DEV_ASSERT(dev, 6, 8); + /* * From the Sandy Bridge PRM, volume 1 part 1, page 118: * @@ -864,15 +807,15 @@ img_align(struct ilo_image *img, struct ilo_image_params *params) * padding purposes. The value of 4 for j still applies for mip level * alignment and QPitch calculation." */ - if (templ->bind & PIPE_BIND_SAMPLER_VIEW) { - align_w = MAX2(align_w, img->align_i); - align_h = MAX2(align_h, img->align_j); + if (info->bind_surface_sampler) { + align_w = MAX2(align_w, layout->align_i); + align_h = MAX2(align_h, layout->align_j); - if (templ->target == PIPE_TEXTURE_CUBE) + if (info->type == GEN6_SURFTYPE_CUBE) pad_h += 2; - if (params->compressed) - align_h = MAX2(align_h, img->align_j * 2); + if (info->compressed) + align_h = MAX2(align_h, layout->align_j * 2); } /* @@ -881,149 +824,288 @@ img_align(struct ilo_image *img, struct ilo_image_params *params) * "If the surface contains an odd number of rows of data, a final row * below the surface must be allocated." */ - if (templ->bind & PIPE_BIND_RENDER_TARGET) + if (info->bind_surface_dp_render) align_h = MAX2(align_h, 2); /* * Depth Buffer Clear/Resolve works in 8x4 sample blocks. Pad to allow HiZ * for unaligned non-mipmapped and non-array images. 
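 *
 * Editor's worked example, values assumed: a non-mipmapped, non-array
 * 13x7 depth image is padded to monolithic_width = align(13, 8) = 16 and
 * monolithic_height = align(7, 4) = 8, so clears and resolves can cover
 * it with whole 8x4 blocks and HiZ can stay enabled for level 0.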
*/ - if (img->aux.type == ILO_IMAGE_AUX_HIZ && - templ->last_level == 0 && - templ->array_size == 1 && - templ->depth0 == 1) { + if (layout->aux == ILO_IMAGE_AUX_HIZ && + info->level_count == 1 && info->array_size == 1 && info->depth == 1) { align_w = MAX2(align_w, 8); align_h = MAX2(align_h, 4); } - params->max_x = align(params->max_x, align_w); - params->max_y = align(params->max_y + pad_h, align_h); + layout->monolithic_width = align(max_x, align_w); + layout->monolithic_height = align(max_y + pad_h, align_h); } -/* note that this may force the texture to be linear */ static void -img_calculate_bo_size(struct ilo_image *img, - const struct ilo_image_params *params) +image_get_gen6_lods(const struct ilo_dev *dev, + const struct ilo_image_info *info, + struct ilo_image_layout *layout) { - assert(params->max_x % img->block_width == 0); - assert(params->max_y % img->block_height == 0); - assert(img->walk_layer_height % img->block_height == 0); + const int layer_count = image_get_gen6_layer_count(dev, info, layout); + int cur_x, cur_y, max_x, max_y; + uint8_t lv; - img->bo_stride = - (params->max_x / img->block_width) * img->block_size; - img->bo_height = params->max_y / img->block_height; + ILO_DEV_ASSERT(dev, 6, 8); - while (true) { - unsigned w = img->bo_stride, h = img->bo_height; - unsigned align_w, align_h; + cur_x = 0; + cur_y = 0; + max_x = 0; + max_y = 0; + for (lv = 0; lv < info->level_count; lv++) { + int slice_w, slice_h, lod_w, lod_h; - /* - * From the Haswell PRM, volume 5, page 163: - * - * "For linear surfaces, additional padding of 64 bytes is required - * at the bottom of the surface. This is in addition to the padding - * required above." - */ - if (ilo_dev_gen(params->dev) >= ILO_GEN(7.5) && - (params->templ->bind & PIPE_BIND_SAMPLER_VIEW) && - img->tiling == GEN6_TILING_NONE) - h += (64 + img->bo_stride - 1) / img->bo_stride; + image_get_gen6_slice_size(dev, info, layout, lv, &slice_w, &slice_h); - /* - * From the Sandy Bridge PRM, volume 4 part 1, page 81: - * - * "- For linear render target surfaces, the pitch must be a - * multiple of the element size for non-YUV surface formats. - * Pitch must be a multiple of 2 * element size for YUV surface - * formats. - * - For other linear surfaces, the pitch can be any multiple of - * bytes. - * - For tiled surfaces, the pitch must be a multiple of the tile - * width." - * - * Different requirements may exist when the bo is used in different - * places, but our alignments here should be good enough that we do not - * need to check params->templ->bind. - */ - switch (img->tiling) { - case GEN6_TILING_X: - align_w = 512; - align_h = 8; + layout->lods[lv].x = cur_x; + layout->lods[lv].y = cur_y; + layout->lods[lv].slice_width = slice_w; + layout->lods[lv].slice_height = slice_h; + + switch (layout->walk) { + case ILO_IMAGE_WALK_LAYER: + lod_w = slice_w; + lod_h = slice_h; + + /* MIPLAYOUT_BELOW */ + if (lv == 1) + cur_x += lod_w; + else + cur_y += lod_h; break; - case GEN6_TILING_Y: - align_w = 128; - align_h = 32; + case ILO_IMAGE_WALK_LOD: + lod_w = slice_w; + lod_h = slice_h * layer_count; + + if (lv == 1) + cur_x += lod_w; + else + cur_y += lod_h; + + /* every LOD begins at tile boundaries */ + if (info->level_count > 1) { + assert(info->format == GEN6_FORMAT_R8_UINT); + cur_x = align(cur_x, 64); + cur_y = align(cur_y, 64); + } break; - case GEN8_TILING_W: - /* - * From the Sandy Bridge PRM, volume 1 part 2, page 22: - * - * "A 4KB tile is subdivided into 8-high by 8-wide array of - * Blocks for W-Major Tiles (W Tiles). 
Each Block is 8 rows by 8 - * bytes." - */ - align_w = 64; - align_h = 64; + case ILO_IMAGE_WALK_3D: + { + const int slice_count = u_minify(info->depth, lv); + const int slice_count_per_row = 1 << lv; + const int row_count = + (slice_count + slice_count_per_row - 1) / slice_count_per_row; + + lod_w = slice_w * slice_count_per_row; + lod_h = slice_h * row_count; + } + + cur_y += lod_h; break; default: - assert(img->tiling == GEN6_TILING_NONE); - /* some good enough values */ - align_w = 64; - align_h = 2; + assert(!"unknown walk type"); + lod_w = 0; + lod_h = 0; break; } - w = align(w, align_w); - h = align(h, align_h); - - /* make sure the bo is mappable */ - if (img->tiling != GEN6_TILING_NONE) { - /* - * Usually only the first 256MB of the GTT is mappable. - * - * See also how intel_context::max_gtt_map_object_size is calculated. - */ - const size_t mappable_gtt_size = 256 * 1024 * 1024; - - /* - * Be conservative. We may be able to switch from VALIGN_4 to - * VALIGN_2 if the image was Y-tiled, but let's keep it simple. - */ - if (mappable_gtt_size / w / 4 < h) { - if (params->valid_tilings & IMAGE_TILING_NONE) { - img->tiling = GEN6_TILING_NONE; - /* MCS support for non-MSRTs is limited to tiled RTs */ - if (img->aux.type == ILO_IMAGE_AUX_MCS && - params->templ->nr_samples <= 1) - img->aux.type = ILO_IMAGE_AUX_NONE; - - continue; - } else { - ilo_warn("cannot force texture to be linear\n"); - } - } - } + if (max_x < layout->lods[lv].x + lod_w) + max_x = layout->lods[lv].x + lod_w; + if (max_y < layout->lods[lv].y + lod_h) + max_y = layout->lods[lv].y + lod_h; + } + + if (layout->walk == ILO_IMAGE_WALK_LAYER) { + image_get_gen6_walk_layer_heights(dev, info, layout); + if (layer_count > 1) + max_y += layout->walk_layer_height * (layer_count - 1); + } else { + layout->walk_layer_h0 = 0; + layout->walk_layer_h1 = 0; + layout->walk_layer_height = 0; + } + + image_get_gen6_monolithic_size(dev, info, layout, max_x, max_y); +} + +static bool +image_bind_gpu(const struct ilo_image_info *info) +{ + return (info->bind_surface_sampler || + info->bind_surface_dp_render || + info->bind_surface_dp_typed || + info->bind_zs || + info->bind_scanout || + info->bind_cursor); +} + +static bool +image_validate_gen6(const struct ilo_dev *dev, + const struct ilo_image_info *info) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 314: + * + * "The separate stencil buffer is always enabled, thus the field in + * 3DSTATE_DEPTH_BUFFER to explicitly enable the separate stencil + * buffer has been removed Surface formats with interleaved depth and + * stencil are no longer supported" + */ + if (ilo_dev_gen(dev) >= ILO_GEN(7) && info->bind_zs) + assert(!info->interleaved_stencil); + + return true; +} + +static bool +image_get_gen6_layout(const struct ilo_dev *dev, + const struct ilo_image_info *info, + struct ilo_image_layout *layout) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + if (!image_validate_gen6(dev, info)) + return false; + + if (image_bind_gpu(info) || info->level_count > 1) { + if (!image_init_gen6_hardware_layout(dev, info, layout)) + return false; + } else { + if (!image_init_gen6_transfer_layout(dev, info, layout)) + return false; + } + + /* + * the fact that align i and j are multiples of block width and height + * respectively is what makes the size of the bo a multiple of the block + * size, slices start at block boundaries, and many of the computations + * work. 
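 *
 * Editor's note: the power-of-two asserts below exist because align()
 * from util/u_math.h rounds up by masking, align(x, a) =
 * (x + a - 1) & ~(a - 1), e.g. align(13, 4) = 16, and that identity only
 * holds when a is a power of two.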
+ */ + assert(layout->align_i % info->block_width == 0); + assert(layout->align_j % info->block_height == 0); + + /* make sure align() works */ + assert(util_is_power_of_two(layout->align_i) && + util_is_power_of_two(layout->align_j)); + assert(util_is_power_of_two(info->block_width) && + util_is_power_of_two(info->block_height)); + + image_get_gen6_lods(dev, info, layout); + + assert(layout->walk_layer_height % info->block_height == 0); + assert(layout->monolithic_width % info->block_width == 0); + assert(layout->monolithic_height % info->block_height == 0); + + return true; +} + +static bool +image_set_gen6_bo_size(struct ilo_image *img, + const struct ilo_dev *dev, + const struct ilo_image_info *info, + const struct ilo_image_layout *layout) +{ + int stride, height; + int align_w, align_h; + + ILO_DEV_ASSERT(dev, 6, 8); + + stride = (layout->monolithic_width / info->block_width) * info->block_size; + height = layout->monolithic_height / info->block_height; + + /* + * From the Haswell PRM, volume 5, page 163: + * + * "For linear surfaces, additional padding of 64 bytes is required + * at the bottom of the surface. This is in addition to the padding + * required above." + */ + if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && info->bind_surface_sampler && + layout->tiling == GEN6_TILING_NONE) + height += (64 + stride - 1) / stride; - img->bo_stride = w; - img->bo_height = h; + /* + * From the Sandy Bridge PRM, volume 4 part 1, page 81: + * + * "- For linear render target surfaces, the pitch must be a multiple + * of the element size for non-YUV surface formats. Pitch must be a + * multiple of 2 * element size for YUV surface formats. + * + * - For other linear surfaces, the pitch can be any multiple of + * bytes. + * - For tiled surfaces, the pitch must be a multiple of the tile + * width." + * + * Different requirements may exist when the image is used in different + * places, but our alignments here should be good enough that we do not + * need to check info->bind_x. + */ + switch (layout->tiling) { + case GEN6_TILING_X: + align_w = 512; + align_h = 8; + break; + case GEN6_TILING_Y: + align_w = 128; + align_h = 32; + break; + case GEN8_TILING_W: + /* + * From the Sandy Bridge PRM, volume 1 part 2, page 22: + * + * "A 4KB tile is subdivided into 8-high by 8-wide array of + * Blocks for W-Major Tiles (W Tiles). Each Block is 8 rows by 8 + * bytes." 
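 *
 * Editor's note: that is 8 blocks * 8 bytes = 64 bytes across and
 * 8 blocks * 8 rows = 64 rows down, 64 * 64 = 4096 bytes, exactly one
 * 4 KiB page per W tile, which is where the align_w = align_h = 64
 * below comes from.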
+ */ + align_w = 64; + align_h = 64; + break; + default: + assert(layout->tiling == GEN6_TILING_NONE); + /* some good enough values */ + align_w = 64; + align_h = 2; break; } + + if (info->force_bo_stride) { + if (info->force_bo_stride % align_w || info->force_bo_stride < stride) + return false; + + img->bo_stride = info->force_bo_stride; + } else { + img->bo_stride = align(stride, align_w); + } + + img->bo_height = align(height, align_h); + + return true; } -static void -img_calculate_hiz_size(struct ilo_image *img, - const struct ilo_image_params *params) +static bool +image_set_gen6_hiz(struct ilo_image *img, + const struct ilo_dev *dev, + const struct ilo_image_info *info, + const struct ilo_image_layout *layout) { - const struct pipe_resource *templ = params->templ; - const unsigned hz_align_j = 8; + const int hz_align_j = 8; enum ilo_image_walk_type hz_walk; - unsigned hz_width, hz_height, lv; - unsigned hz_clear_w, hz_clear_h; + int hz_width, hz_height; + int hz_clear_w, hz_clear_h; + uint8_t lv; + + ILO_DEV_ASSERT(dev, 6, 8); - assert(img->aux.type == ILO_IMAGE_AUX_HIZ); + assert(layout->aux == ILO_IMAGE_AUX_HIZ); - assert(img->walk == ILO_IMAGE_WALK_LAYER || - img->walk == ILO_IMAGE_WALK_3D); + assert(layout->walk == ILO_IMAGE_WALK_LAYER || + layout->walk == ILO_IMAGE_WALK_3D); /* * From the Sandy Bridge PRM, volume 2 part 1, page 312: @@ -1036,8 +1118,8 @@ img_calculate_hiz_size(struct ilo_image *img, * * We will put all LODs in a single bo with ILO_IMAGE_WALK_LOD. */ - if (ilo_dev_gen(params->dev) >= ILO_GEN(7)) - hz_walk = img->walk; + if (ilo_dev_gen(dev) >= ILO_GEN(7)) + hz_walk = layout->walk; else hz_walk = ILO_IMAGE_WALK_LOD; @@ -1051,16 +1133,16 @@ img_calculate_hiz_size(struct ilo_image *img, switch (hz_walk) { case ILO_IMAGE_WALK_LAYER: { - const unsigned h0 = align(params->h0, hz_align_j); - const unsigned h1 = align(params->h1, hz_align_j); - const unsigned htail = - ((ilo_dev_gen(params->dev) >= ILO_GEN(7)) ? 12 : 11) * hz_align_j; - const unsigned hz_qpitch = h0 + h1 + htail; + const int h0 = align(layout->walk_layer_h0, hz_align_j); + const int h1 = align(layout->walk_layer_h1, hz_align_j); + const int htail = + ((ilo_dev_gen(dev) >= ILO_GEN(7)) ? 
12 : 11) * hz_align_j; + const int hz_qpitch = h0 + h1 + htail; - hz_width = align(img->lods[0].slice_width, 16); + hz_width = align(layout->lods[0].slice_width, 16); - hz_height = hz_qpitch * templ->array_size / 2; - if (ilo_dev_gen(params->dev) >= ILO_GEN(7)) + hz_height = hz_qpitch * info->array_size / 2; + if (ilo_dev_gen(dev) >= ILO_GEN(7)) hz_height = align(hz_height, 8); img->aux.walk_layer_height = hz_qpitch; @@ -1068,27 +1150,27 @@ img_calculate_hiz_size(struct ilo_image *img, break; case ILO_IMAGE_WALK_LOD: { - unsigned lod_tx[PIPE_MAX_TEXTURE_LEVELS]; - unsigned lod_ty[PIPE_MAX_TEXTURE_LEVELS]; - unsigned cur_tx, cur_ty; + int lod_tx[ILO_IMAGE_MAX_LEVEL_COUNT]; + int lod_ty[ILO_IMAGE_MAX_LEVEL_COUNT]; + int cur_tx, cur_ty; /* figure out the tile offsets of LODs */ hz_width = 0; hz_height = 0; cur_tx = 0; cur_ty = 0; - for (lv = 0; lv <= templ->last_level; lv++) { - unsigned tw, th; + for (lv = 0; lv < info->level_count; lv++) { + int tw, th; lod_tx[lv] = cur_tx; lod_ty[lv] = cur_ty; - tw = align(img->lods[lv].slice_width, 16); - th = align(img->lods[lv].slice_height, hz_align_j) * - templ->array_size / 2; + tw = align(layout->lods[lv].slice_width, 16); + th = align(layout->lods[lv].slice_height, hz_align_j) * + info->array_size / 2; /* convert to Y-tiles */ - tw = align(tw, 128) / 128; - th = align(th, 32) / 32; + tw = (tw + 127) / 128; + th = (th + 31) / 32; if (hz_width < cur_tx + tw) hz_width = cur_tx + tw; @@ -1102,22 +1184,23 @@ img_calculate_hiz_size(struct ilo_image *img, } /* convert tile offsets to memory offsets */ - for (lv = 0; lv <= templ->last_level; lv++) { + for (lv = 0; lv < info->level_count; lv++) { img->aux.walk_lod_offsets[lv] = (lod_ty[lv] * hz_width + lod_tx[lv]) * 4096; } + hz_width *= 128; hz_height *= 32; } break; case ILO_IMAGE_WALK_3D: - hz_width = align(img->lods[0].slice_width, 16); + hz_width = align(layout->lods[0].slice_width, 16); hz_height = 0; - for (lv = 0; lv <= templ->last_level; lv++) { - const unsigned h = align(img->lods[lv].slice_height, hz_align_j); + for (lv = 0; lv < info->level_count; lv++) { + const int h = align(layout->lods[lv].slice_height, hz_align_j); /* according to the formula, slices are packed together vertically */ - hz_height += h * u_minify(templ->depth0, lv); + hz_height += h * u_minify(info->depth, lv); } hz_height /= 2; break; @@ -1136,8 +1219,7 @@ img_calculate_hiz_size(struct ilo_image *img, */ hz_clear_w = 8; hz_clear_h = 4; - switch (templ->nr_samples) { - case 0: + switch (info->sample_count) { case 1: default: break; @@ -1158,33 +1240,38 @@ img_calculate_hiz_size(struct ilo_image *img, break; } - for (lv = 0; lv <= templ->last_level; lv++) { - if (u_minify(img->width0, lv) % hz_clear_w || - u_minify(img->height0, lv) % hz_clear_h) + for (lv = 0; lv < info->level_count; lv++) { + if (u_minify(info->width, lv) % hz_clear_w || + u_minify(info->height, lv) % hz_clear_h) break; img->aux.enables |= 1 << lv; } - /* we padded to allow this in img_align() */ - if (templ->last_level == 0 && templ->array_size == 1 && templ->depth0 == 1) + /* we padded to allow this in image_get_gen6_monolithic_size() */ + if (info->level_count == 1 && info->array_size == 1 && info->depth == 1) img->aux.enables |= 0x1; /* align to Y-tile */ img->aux.bo_stride = align(hz_width, 128); img->aux.bo_height = align(hz_height, 32); + + return true; } -static void -img_calculate_mcs_size(struct ilo_image *img, - const struct ilo_image_params *params) +static bool +image_set_gen7_mcs(struct ilo_image *img, + const struct ilo_dev *dev, + const 
struct ilo_image_info *info, + const struct ilo_image_layout *layout) { - const struct pipe_resource *templ = params->templ; int mcs_width, mcs_height, mcs_cpp; int downscale_x, downscale_y; - assert(img->aux.type == ILO_IMAGE_AUX_MCS); + ILO_DEV_ASSERT(dev, 7, 8); + + assert(layout->aux == ILO_IMAGE_AUX_MCS); - if (templ->nr_samples > 1) { + if (info->sample_count > 1) { /* * From the Ivy Bridge PRM, volume 2 part 1, page 326, the clear * rectangle is scaled down by 8x2 for 4X MSAA and 2x2 for 8X MSAA. The @@ -1198,7 +1285,7 @@ img_calculate_mcs_size(struct ilo_image *img, * RT. Similarly, we could reason that an OWord in 4X MCS maps to a 8x2 * pixel block in the RT. */ - switch (templ->nr_samples) { + switch (info->sample_count) { case 2: case 4: downscale_x = 8; @@ -1217,7 +1304,7 @@ img_calculate_mcs_size(struct ilo_image *img, break; default: assert(!"unsupported sample count"); - return; + return false; break; } @@ -1226,8 +1313,8 @@ img_calculate_mcs_size(struct ilo_image *img, * clear rectangle cannot be masked. The scale-down clear rectangle * thus must be aligned to 2x2, and we need to pad. */ - mcs_width = align(img->width0, downscale_x * 2); - mcs_height = align(img->height0, downscale_y * 2); + mcs_width = align(info->width, downscale_x * 2); + mcs_height = align(info->height, downscale_y * 2); } else { /* * From the Ivy Bridge PRM, volume 2 part 1, page 327: @@ -1262,18 +1349,18 @@ img_calculate_mcs_size(struct ilo_image *img, * anything except for the size of the allocated MCS. Let's see if we * hit out-of-bound access. */ - switch (img->tiling) { + switch (layout->tiling) { case GEN6_TILING_X: - downscale_x = 64 / img->block_size; + downscale_x = 64 / info->block_size; downscale_y = 2; break; case GEN6_TILING_Y: - downscale_x = 32 / img->block_size; + downscale_x = 32 / info->block_size; downscale_y = 4; break; default: assert(!"unsupported tiling mode"); - return; + return false; break; } @@ -1290,181 +1377,75 @@ img_calculate_mcs_size(struct ilo_image *img, * The scaled-down clear rectangle must be aligned to 4x4 instead of * 2x2, and we need to pad. */ - mcs_width = align(img->width0, downscale_x * 4) / downscale_x; - mcs_height = align(img->height0, downscale_y * 4) / downscale_y; + mcs_width = align(info->width, downscale_x * 4) / downscale_x; + mcs_height = align(info->height, downscale_y * 4) / downscale_y; mcs_cpp = 16; /* an OWord */ } - img->aux.enables = (1 << (templ->last_level + 1)) - 1; + img->aux.enables = (1 << info->level_count) - 1; /* align to Y-tile */ img->aux.bo_stride = align(mcs_width * mcs_cpp, 128); img->aux.bo_height = align(mcs_height, 32); -} - -static void -img_init(struct ilo_image *img, - struct ilo_image_params *params) -{ - /* there are hard dependencies between every function here */ - - img_init_aux(img, params); - img_init_size_and_format(img, params); - img_init_walk(img, params); - img_init_tiling(img, params); - img_init_alignments(img, params); - img_init_lods(img, params); - img_init_layer_height(img, params); - - img_align(img, params); - img_calculate_bo_size(img, params); - img->scanout = (params->templ->bind & PIPE_BIND_SCANOUT); - - switch (img->aux.type) { - case ILO_IMAGE_AUX_HIZ: - img_calculate_hiz_size(img, params); - break; - case ILO_IMAGE_AUX_MCS: - img_calculate_mcs_size(img, params); - break; - default: - break; - } -}
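To make the single-sampled MCS sizing above concrete, here is a standalone restatement of the arithmetic for a hypothetical 1920x1080 X-tiled render target with 4-byte pixels (illustrative values only; the driver code above is authoritative):

#include <stdio.h>

static unsigned align_to(unsigned v, unsigned a) /* a must be a power of two */
{
   return (v + a - 1) & ~(a - 1);
}

int main(void)
{
   const unsigned width = 1920, height = 1080, block_size = 4;

   /* GEN6_TILING_X constants from the switch above */
   const unsigned downscale_x = 64 / block_size; /* 16 */
   const unsigned downscale_y = 2;
   const unsigned mcs_cpp = 16; /* an OWord */

   const unsigned mcs_width =
      align_to(width, downscale_x * 4) / downscale_x;  /* 120 */
   const unsigned mcs_height =
      align_to(height, downscale_y * 4) / downscale_y; /* 540 */

   /* align to Y-tile, as the driver does */
   printf("bo_stride %u, bo_height %u\n",
          align_to(mcs_width * mcs_cpp, 128), /* 1920 */
          align_to(mcs_height, 32));          /* 544 */
   return 0;
}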
- -/** - * The texture is for transfer only. We can define our own layout to save - * space. - */ -static void -img_init_for_transfer(struct ilo_image *img, - const struct ilo_dev *dev, - const struct pipe_resource *templ) -{ - const unsigned num_layers = (templ->target == PIPE_TEXTURE_3D) ? - templ->depth0 : templ->array_size; - unsigned layer_width, layer_height; - - assert(templ->last_level == 0); - assert(templ->nr_samples <= 1); - - img->aux.type = ILO_IMAGE_AUX_NONE; - - img->target = templ->target; - img->width0 = templ->width0; - img->height0 = templ->height0; - img->depth0 = templ->depth0; - img->array_size = templ->array_size; - img->level_count = 1; - img->sample_count = 1; - - img->format = templ->format; - img->block_width = util_format_get_blockwidth(templ->format); - img->block_height = util_format_get_blockheight(templ->format); - img->block_size = util_format_get_blocksize(templ->format); - - img->walk = ILO_IMAGE_WALK_LOD; - - img->tiling = GEN6_TILING_NONE; - - img->align_i = img->block_width; - img->align_j = img->block_height; - - assert(util_is_power_of_two(img->block_width) && - util_is_power_of_two(img->block_height)); - - /* use packed layout */ - layer_width = align(templ->width0, img->align_i); - layer_height = align(templ->height0, img->align_j); - - img->lods[0].slice_width = layer_width; - img->lods[0].slice_height = layer_height; - - img->bo_stride = (layer_width / img->block_width) * img->block_size; - img->bo_stride = align(img->bo_stride, 64); - - img->bo_height = (layer_height / img->block_height) * num_layers; + return true; } -/** - * Initialize the image. Callers should zero-initialize \p img first. - */ -void ilo_image_init(struct ilo_image *img, - const struct ilo_dev *dev, - const struct pipe_resource *templ) +bool +ilo_image_init(struct ilo_image *img, + const struct ilo_dev *dev, + const struct ilo_image_info *info) { - struct ilo_image_params params; - bool transfer_only; + struct ilo_image_layout layout; assert(ilo_is_zeroed(img, sizeof(*img))); - /* use transfer layout when the texture is never bound to GPU */ - transfer_only = !(templ->bind & ~(PIPE_BIND_TRANSFER_WRITE | - PIPE_BIND_TRANSFER_READ)); - if (transfer_only && templ->last_level == 0 && templ->nr_samples <= 1) { - img_init_for_transfer(img, dev, templ); - return; - } + memset(&layout, 0, sizeof(layout)); + layout.lods = img->lods; - memset(&params, 0, sizeof(params)); - params.dev = dev; - params.templ = templ; - params.valid_tilings = IMAGE_TILING_ALL; + if (!image_get_gen6_layout(dev, info, &layout)) + return false; - img_init(img, &params); -} + img->type = info->type; -bool -ilo_image_init_for_imported(struct ilo_image *img, - const struct ilo_dev *dev, - const struct pipe_resource *templ, - enum gen_surface_tiling tiling, - unsigned bo_stride) -{ - struct ilo_image_params params; + img->format = info->format; + img->block_width = info->block_width; + img->block_height = info->block_height; + img->block_size = info->block_size; - assert(ilo_is_zeroed(img, sizeof(*img))); + img->width0 = info->width; + img->height0 = info->height; + img->depth0 = info->depth; + img->array_size = info->array_size; + img->level_count = info->level_count; + img->sample_count = info->sample_count; - if ((tiling == GEN6_TILING_X && bo_stride % 512) || - (tiling == GEN6_TILING_Y && bo_stride % 128) || - (tiling == GEN8_TILING_W && bo_stride % 64)) - return false; + img->walk = layout.walk; + img->interleaved_samples = layout.interleaved_samples; - memset(&params, 0, sizeof(params)); - params.dev = dev; - params.templ = templ; - params.valid_tilings = 1 << tiling;
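With the rewrite above, ilo_image_init() consumes a caller-filled ilo_image_info and reports failure instead of falling back silently. A hedged sketch of the expected calling convention; the field values and the GEN6_FORMAT_B8G8R8A8_UNORM choice are illustrative, not taken from this patch:

struct ilo_image img;
struct ilo_image_info info;

memset(&img, 0, sizeof(img));  /* ilo_image_init() asserts it is zeroed */
memset(&info, 0, sizeof(info));

info.type = GEN6_SURFTYPE_2D;
info.format = GEN6_FORMAT_B8G8R8A8_UNORM; /* illustrative */
info.block_width = 1;
info.block_height = 1;
info.block_size = 4;
info.width = 512;
info.height = 512;
info.depth = 1;
info.array_size = 1;
info.level_count = 1;
info.sample_count = 1;
info.valid_tilings = 0xff;        /* let the layout code pick */
info.bind_surface_sampler = true;

if (!ilo_image_init(&img, dev, &info))
   return false; /* unsupported combination */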
- img_init(img, &params); + img->aux.type = layout.aux; - assert(img->tiling == tiling); - if (img->bo_stride > bo_stride) - return false; - - img->bo_stride = bo_stride; - - /* assume imported RTs are also scanouts */ - if (!img->scanout) - img->scanout = (templ->bind & PIPE_BIND_RENDER_TARGET); + img->align_i = layout.align_i; + img->align_j = layout.align_j; - return true; -} + img->walk_layer_height = layout.walk_layer_height; -bool -ilo_image_disable_aux(struct ilo_image *img, const struct ilo_dev *dev) -{ - /* HiZ is required for separate stencil on Gen6 */ - if (ilo_dev_gen(dev) == ILO_GEN(6) && - img->aux.type == ILO_IMAGE_AUX_HIZ && - img->separate_stencil) + if (!image_set_gen6_bo_size(img, dev, info, &layout)) return false; - /* MCS is required for multisample images */ - if (img->aux.type == ILO_IMAGE_AUX_MCS && - img->sample_count > 1) - return false; + img->scanout = info->bind_scanout; - img->aux.enables = 0x0; + switch (layout.aux) { + case ILO_IMAGE_AUX_HIZ: + image_set_gen6_hiz(img, dev, info, &layout); + break; + case ILO_IMAGE_AUX_MCS: + image_set_gen7_mcs(img, dev, info, &layout); + break; + default: + break; + } return true; } diff --git a/src/gallium/drivers/ilo/core/ilo_image.h b/src/gallium/drivers/ilo/core/ilo_image.h index af15e856028..646ed6f5727 100644 --- a/src/gallium/drivers/ilo/core/ilo_image.h +++ b/src/gallium/drivers/ilo/core/ilo_image.h @@ -29,11 +29,17 @@ #define ILO_IMAGE_H #include "genhw/genhw.h" -#include "intel_winsys.h" #include "ilo_core.h" #include "ilo_dev.h" +/* + * From the Ivy Bridge PRM, volume 4 part 1, page 75: + * + * "(MIP Count / LOD) representing [1,15] MIP levels" + */ +#define ILO_IMAGE_MAX_LEVEL_COUNT 15 + enum ilo_image_aux_type { ILO_IMAGE_AUX_NONE, ILO_IMAGE_AUX_HIZ, @@ -68,6 +74,49 @@ enum ilo_image_walk_type { ILO_IMAGE_WALK_3D, }; +struct ilo_image_info { + enum gen_surface_type type; + + enum gen_surface_format format; + bool interleaved_stencil; + bool is_integer; + /* width, height and size of pixel blocks */ + bool compressed; + unsigned block_width; + unsigned block_height; + unsigned block_size; + + /* image size */ + uint16_t width; + uint16_t height; + uint16_t depth; + uint16_t array_size; + uint8_t level_count; + uint8_t sample_count; + + /* disable optional aux */ + bool aux_disable; + + /* tilings to consider, if any bit is set */ + uint8_t valid_tilings; + + /* + * prefer GEN6_TILING_NONE when the (estimated) image size exceeds the + * threshold + */ + uint32_t prefer_linear_threshold; + + /* force a stride when non-zero */ + uint32_t force_bo_stride; + + bool bind_surface_sampler; + bool bind_surface_dp_render; + bool bind_surface_dp_typed; + bool bind_zs; + bool bind_scanout; + bool bind_cursor; +}; + /* * When the walk type is ILO_IMAGE_WALK_LAYER, there is only a slice in each * LOD and this is used to describe LODs in the first array layer. Otherwise, @@ -88,7 +137,10 @@ struct ilo_image_lod { * Texture layout. 
*/ struct ilo_image { - enum pipe_texture_target target; + enum gen_surface_type type; + + enum gen_surface_format format; + bool interleaved_stencil; /* size, format, etc for programming hardware states */ unsigned width0; @@ -97,8 +149,6 @@ struct ilo_image { unsigned array_size; unsigned level_count; unsigned sample_count; - enum pipe_format format; - bool separate_stencil; /* * width, height, and size of pixel blocks for conversion between pixel @@ -117,7 +167,7 @@ struct ilo_image { unsigned align_i; unsigned align_j; - struct ilo_image_lod lods[PIPE_MAX_TEXTURE_LEVELS]; + struct ilo_image_lod lods[ILO_IMAGE_MAX_LEVEL_COUNT]; /* physical layer height for ILO_IMAGE_WALK_LAYER */ unsigned walk_layer_height; @@ -136,36 +186,18 @@ struct ilo_image { unsigned enables; /* LOD offsets for ILO_IMAGE_WALK_LOD */ - unsigned walk_lod_offsets[PIPE_MAX_TEXTURE_LEVELS]; + unsigned walk_lod_offsets[ILO_IMAGE_MAX_LEVEL_COUNT]; unsigned walk_layer_height; unsigned bo_stride; unsigned bo_height; - - /* managed by users */ - struct intel_bo *bo; } aux; - - /* managed by users */ - struct intel_bo *bo; }; -struct pipe_resource; - -void +bool ilo_image_init(struct ilo_image *img, const struct ilo_dev *dev, - const struct pipe_resource *templ); - -bool -ilo_image_init_for_imported(struct ilo_image *img, - const struct ilo_dev *dev, - const struct pipe_resource *templ, - enum gen_surface_tiling tiling, - unsigned bo_stride); - -bool -ilo_image_disable_aux(struct ilo_image *img, const struct ilo_dev *dev); + const struct ilo_image_info *info); static inline bool ilo_image_can_enable_aux(const struct ilo_image *img, unsigned level) diff --git a/src/gallium/drivers/ilo/core/ilo_state_sol.c b/src/gallium/drivers/ilo/core/ilo_state_sol.c index 38c0b719ab3..6ef2c91a592 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_sol.c +++ b/src/gallium/drivers/ilo/core/ilo_state_sol.c @@ -26,7 +26,7 @@ */ #include "ilo_debug.h" -#include "ilo_buffer.h" +#include "ilo_vma.h" #include "ilo_state_sol.h" static bool @@ -270,9 +270,6 @@ sol_buffer_validate_gen7(const struct ilo_dev *dev, { ILO_DEV_ASSERT(dev, 7, 8); - if (info->buf) - assert(info->offset < info->buf->bo_size && info->size); - /* * From the Ivy Bridge PRM, volume 2 part 1, page 208: * @@ -281,9 +278,17 @@ sol_buffer_validate_gen7(const struct ilo_dev *dev, */ assert(info->offset % 4 == 0); + if (info->vma) { + assert(info->vma->vm_alignment % 4 == 0); + assert(info->size && info->offset + info->size <= info->vma->vm_size); + } + /* Gen8+ only */ - if (info->write_offset_load || info->write_offset_save) - assert(ilo_dev_gen(dev) >= ILO_GEN(8)); + if (info->write_offset_load || info->write_offset_save) { + assert(ilo_dev_gen(dev) >= ILO_GEN(8) && info->write_offset_vma); + assert(info->write_offset_offset + sizeof(uint32_t) <= + info->write_offset_vma->vm_size); + } /* * From the Broadwell PRM, volume 2b, page 206: @@ -304,25 +309,15 @@ static uint32_t sol_buffer_get_gen6_size(const struct ilo_dev *dev, const struct ilo_state_sol_buffer_info *info) { - uint32_t size; - ILO_DEV_ASSERT(dev, 6, 8); - if (!info->buf) - return 0; - - size = (info->offset + info->size <= info->buf->bo_size) ? info->size : - info->buf->bo_size - info->offset; - /* * From the Ivy Bridge PRM, volume 2 part 1, page 208: * * "(Surface End Address) This field specifies the ending DWord * address..." */ - size &= ~3; - - return size; + return (info->vma) ? 
info->size & ~3 : 0; } static bool @@ -359,7 +354,7 @@ sol_buffer_set_gen8_3dstate_so_buffer(struct ilo_state_sol_buffer *sb, dw1 = 0; - if (info->buf) + if (info->vma) dw1 |= GEN8_SO_BUF_DW1_ENABLE; if (info->write_offset_load) dw1 |= GEN8_SO_BUF_DW1_OFFSET_WRITE_ENABLE; @@ -429,6 +424,15 @@ ilo_state_sol_init_disabled(struct ilo_state_sol *sol, return ilo_state_sol_init(sol, dev, &info); } +uint32_t +ilo_state_sol_buffer_size(const struct ilo_dev *dev, uint32_t size, + uint32_t *alignment) +{ + /* DWord aligned without padding */ + *alignment = 4; + return size; +} + bool ilo_state_sol_buffer_init(struct ilo_state_sol_buffer *sb, const struct ilo_dev *dev, @@ -443,9 +447,8 @@ ilo_state_sol_buffer_init(struct ilo_state_sol_buffer *sb, else ret &= sol_buffer_set_gen7_3dstate_so_buffer(sb, dev, info); - sb->need_bo = (info->size > 0); - sb->need_write_offset_bo = (info->write_offset_save || - (info->write_offset_load && !info->write_offset_imm_enable)); + sb->vma = info->vma; + sb->write_offset_vma = info->write_offset_vma; assert(ret); diff --git a/src/gallium/drivers/ilo/core/ilo_state_sol.h b/src/gallium/drivers/ilo/core/ilo_state_sol.h index 2513fcb4979..92c5f94725b 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_sol.h +++ b/src/gallium/drivers/ilo/core/ilo_state_sol.h @@ -107,17 +107,17 @@ struct ilo_state_sol { uint8_t decl_count; }; -struct ilo_buffer; +struct ilo_vma; struct ilo_state_sol_buffer_info { - const struct ilo_buffer *buf; + const struct ilo_vma *vma; uint32_t offset; uint32_t size; - /* - * Gen8+ only. When enabled, require a write offset bo of at least - * (sizeof(uint32_t) * ILO_STATE_SOL_MAX_BUFFER_COUNT) bytes - */ + /* Gen8+ only; at least sizeof(uint32_t) bytes */ + const struct ilo_vma *write_offset_vma; + uint32_t write_offset_offset; + bool write_offset_load; bool write_offset_save; @@ -126,14 +126,10 @@ struct ilo_state_sol_buffer_info { }; struct ilo_state_sol_buffer { - uint32_t so_buf[4]; - - bool need_bo; - bool need_write_offset_bo; + uint32_t so_buf[5]; - /* managed by users */ - struct intel_bo *bo; - struct intel_bo *write_offset_bo; + const struct ilo_vma *vma; + const struct ilo_vma *write_offset_vma; }; static inline size_t @@ -154,6 +150,10 @@ ilo_state_sol_init_disabled(struct ilo_state_sol *sol, const struct ilo_dev *dev, bool render_disable); +uint32_t +ilo_state_sol_buffer_size(const struct ilo_dev *dev, uint32_t size, + uint32_t *alignment); + bool ilo_state_sol_buffer_init(struct ilo_state_sol_buffer *sb, const struct ilo_dev *dev, diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.c b/src/gallium/drivers/ilo/core/ilo_state_surface.c index 5be9f8f6270..40fe15f316f 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_surface.c +++ b/src/gallium/drivers/ilo/core/ilo_state_surface.c @@ -26,8 +26,8 @@ */ #include "ilo_debug.h" -#include "ilo_buffer.h" #include "ilo_image.h" +#include "ilo_vma.h" #include "ilo_state_surface.h" static bool @@ -94,31 +94,13 @@ surface_set_gen7_null_SURFACE_STATE(struct ilo_state_surface *surf, return true; } -static bool -surface_validate_gen6_buffer(const struct ilo_dev *dev, - const struct ilo_state_surface_buffer_info *info) +static uint32_t +surface_get_gen6_buffer_offset_alignment(const struct ilo_dev *dev, + const struct ilo_state_surface_buffer_info *info) { - ILO_DEV_ASSERT(dev, 6, 8); - - /* SVB writes are Gen6-only */ - if (ilo_dev_gen(dev) >= ILO_GEN(7)) - assert(info->access != ILO_STATE_SURFACE_ACCESS_DP_SVB); - - if (info->offset + info->size > info->buf->bo_size) { - ilo_warn("invalid 
buffer range\n"); - return false; - } + uint32_t alignment; - /* - * From the Sandy Bridge PRM, volume 4 part 1, page 81: - * - * "For surfaces of type SURFTYPE_BUFFER: [0,2047] -> [1B, 2048B] - * For surfaces of type SURFTYPE_STRBUF: [0,2047] -> [1B, 2048B]" - */ - if (!info->struct_size || info->struct_size > 2048) { - ilo_warn("invalid buffer struct size\n"); - return false; - } + ILO_DEV_ASSERT(dev, 6, 8); /* * From the Ivy Bridge PRM, volume 4 part 1, page 68: @@ -132,76 +114,153 @@ surface_validate_gen6_buffer(const struct ilo_dev *dev, * "Certain message types used to access surfaces have more stringent * alignment requirements. Please refer to the specific message * documentation for additional restrictions." - * - * From the Ivy Bridge PRM, volume 4 part 1, page 233, 235, and 237: - * - * "the surface base address must be OWord aligned" - * - * for OWord Block Read/Write, Unaligned OWord Block Read, and OWord Dual - * Block Read/Write. - * - * From the Ivy Bridge PRM, volume 4 part 1, page 246 and 249: - * - * "The surface base address must be DWord aligned" - * - * for DWord Scattered Read/Write and Byte Scattered Read/Write. - * - * We have to rely on users to correctly set info->struct_size here. DWord - * Scattered Read/Write has conflicting pitch and alignment, but we do not - * use them yet so we are fine. - * - * It is unclear if sampling engine surfaces require aligned offsets. */ - if (info->access != ILO_STATE_SURFACE_ACCESS_DP_SVB) { - assert(info->struct_size % info->format_size == 0); + switch (info->access) { + case ILO_STATE_SURFACE_ACCESS_SAMPLER: + /* no alignment requirements */ + alignment = 1; + break; + case ILO_STATE_SURFACE_ACCESS_DP_RENDER: + case ILO_STATE_SURFACE_ACCESS_DP_TYPED: + /* element-size aligned */ + alignment = info->format_size; - if (info->offset % info->struct_size) { - ilo_warn("bad buffer offset\n"); - return false; - } - } + assert(info->struct_size % alignment == 0); + break; + case ILO_STATE_SURFACE_ACCESS_DP_UNTYPED: + /* + * Nothing is said about Untyped* messages, but I think they require the + * base address to be DWord aligned. + */ + alignment = 4; - if (info->format == GEN6_FORMAT_RAW) { /* - * From the Sandy Bridge PRM, volume 4 part 1, page 97: + * From the Ivy Bridge PRM, volume 4 part 1, page 70: + * + * "For linear surfaces with Surface Type of SURFTYPE_STRBUF, the + * pitch must be a multiple of 4 bytes." + */ + if (info->struct_size > 1) + assert(info->struct_size % alignment == 0); + break; + case ILO_STATE_SURFACE_ACCESS_DP_DATA: + /* + * From the Ivy Bridge PRM, volume 4 part 1, page 233, 235, and 237: + * + * "the surface base address must be OWord aligned" + * + * for OWord Block Read/Write, Unaligned OWord Block Read, and OWord + * Dual Block Read/Write. + * + * From the Ivy Bridge PRM, volume 4 part 1, page 246 and 249: * - * ""RAW" is supported only with buffers and structured buffers - * accessed via the untyped surface read/write and untyped atomic - * operation messages, which do not have a column in the table." + * "The surface base address must be DWord aligned" * - * We do not have a specific access mode for untyped messages. + * for DWord Scattered Read/Write and Byte Scattered Read/Write. */ - assert(info->access == ILO_STATE_SURFACE_ACCESS_DP_UNTYPED); + alignment = (info->format_size > 4) ? 16 : 4; /* - * Nothing is said about Untyped* messages, but I guess they require the - * base address to be DWord aligned. 
+ * From the Ivy Bridge PRM, volume 4 part 1, page 233, 235, 237, and + * 246: + * + * "the surface pitch is ignored, the surface is treated as a + * 1-dimensional surface. An element size (pitch) of 16 bytes is + * used to determine the size of the buffer for out-of-bounds + * checking if using the surface state model." + * + * for OWord Block Read/Write, Unaligned OWord Block Read, OWord + * Dual Block Read/Write, and DWord Scattered Read/Write. + * + * From the Ivy Bridge PRM, volume 4 part 1, page 248: + * + * "The surface pitch is ignored, the surface is treated as a + * 1-dimensional surface. An element size (pitch) of 4 bytes is + * used to determine the size of the buffer for out-of-bounds + * checking if using the surface state model." + * + * for Byte Scattered Read/Write. + * + * It is programmable on Gen7.5+. */ - if (info->offset % 4) { - ilo_warn("bad RAW buffer offset\n"); - return false; + if (ilo_dev_gen(dev) < ILO_GEN(7.5)) { + const int fixed = (info->format_size > 1) ? 16 : 4; + assert(info->struct_size == fixed); } + break; + case ILO_STATE_SURFACE_ACCESS_DP_SVB: + /* + * From the Sandy Bridge PRM, volume 4 part 1, page 259: + * + * "Both the surface base address and surface pitch must be DWord + * aligned." + */ + alignment = 4; - if (info->struct_size > 1) { - /* no STRBUF on Gen6 */ - if (ilo_dev_gen(dev) == ILO_GEN(6)) { - ilo_warn("no STRBUF support\n"); - return false; - } + assert(info->struct_size % alignment == 0); + break; + default: + assert(!"unknown access"); + alignment = 1; + break; + } - /* - * From the Ivy Bridge PRM, volume 4 part 1, page 70: - * - * "For linear surfaces with Surface Type of SURFTYPE_STRBUF, the - * pitch must be a multiple of 4 bytes." - */ - if (info->struct_size % 4) { - ilo_warn("bad STRBUF pitch\n"); - return false; - } - } + return alignment; +} + +static bool +surface_validate_gen6_buffer(const struct ilo_dev *dev, + const struct ilo_state_surface_buffer_info *info) +{ + uint32_t alignment; + + ILO_DEV_ASSERT(dev, 6, 8); + + if (info->offset + info->size > info->vma->vm_size) { + ilo_warn("invalid buffer range\n"); + return false; } + /* + * From the Sandy Bridge PRM, volume 4 part 1, page 81: + * + * "For surfaces of type SURFTYPE_BUFFER: [0,2047] -> [1B, 2048B] + * For surfaces of type SURFTYPE_STRBUF: [0,2047] -> [1B, 2048B]" + */ + if (!info->struct_size || info->struct_size > 2048) { + ilo_warn("invalid buffer struct size\n"); + return false; + } + + alignment = surface_get_gen6_buffer_offset_alignment(dev, info); + if (info->offset % alignment || info->vma->vm_alignment % alignment) { + ilo_warn("bad buffer offset\n"); + return false; + } + + /* no STRBUF on Gen6 */ + if (info->format == GEN6_FORMAT_RAW && info->struct_size > 1) + assert(ilo_dev_gen(dev) >= ILO_GEN(7)); + + /* SVB writes are Gen6 only */ + if (info->access == ILO_STATE_SURFACE_ACCESS_DP_SVB) + assert(ilo_dev_gen(dev) == ILO_GEN(6)); + + /* + * From the Ivy Bridge PRM, volume 4 part 1, page 83: + * + * "NOTE: "RAW" is supported only with buffers and structured buffers + * accessed via the untyped surface read/write and untyped atomic + * operation messages, which do not have a column in the table." + * + * From the Ivy Bridge PRM, volume 4 part 1, page 252: + * + * "For untyped messages, the Surface Format must be RAW and the + * Surface Type must be SURFTYPE_BUFFER or SURFTYPE_STRBUF." 
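The refactored helper above reduces those PRM rules to a small per-access table. A standalone restatement (hypothetical names; the driver's own switch is the authority):

enum access {
   ACCESS_SAMPLER, ACCESS_DP_RENDER, ACCESS_DP_TYPED,
   ACCESS_DP_UNTYPED, ACCESS_DP_DATA, ACCESS_DP_SVB,
};

/* required buffer offset alignment, per the quotes above */
static unsigned
buffer_offset_alignment(enum access a, unsigned format_size)
{
   switch (a) {
   case ACCESS_SAMPLER:    return 1;           /* no requirement */
   case ACCESS_DP_RENDER:
   case ACCESS_DP_TYPED:   return format_size; /* element-size aligned */
   case ACCESS_DP_UNTYPED: return 4;           /* DWord */
   case ACCESS_DP_DATA:    return (format_size > 4) ? 16 : 4; /* OWord/DWord */
   case ACCESS_DP_SVB:     return 4;           /* DWord */
   default:                return 1;
   }
}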
+ */ + assert((info->access == ILO_STATE_SURFACE_ACCESS_DP_UNTYPED) == + (info->format == GEN6_FORMAT_RAW)); + return true; } @@ -215,8 +274,7 @@ surface_get_gen6_buffer_struct_count(const struct ilo_dev *dev, ILO_DEV_ASSERT(dev, 6, 8); c = info->size / info->struct_size; - if (info->access == ILO_STATE_SURFACE_ACCESS_DP_SVB && - info->format_size < info->size - info->struct_size * c) + if (info->format_size < info->size - info->struct_size * c) c++; /* @@ -367,29 +425,6 @@ surface_set_gen7_buffer_SURFACE_STATE(struct ilo_state_surface *surf, return true; } -static enum gen_surface_type -get_gen6_surface_type(const struct ilo_dev *dev, const struct ilo_image *img) -{ - ILO_DEV_ASSERT(dev, 6, 8); - - switch (img->target) { - case PIPE_TEXTURE_1D: - case PIPE_TEXTURE_1D_ARRAY: - return GEN6_SURFTYPE_1D; - case PIPE_TEXTURE_2D: - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_RECT: - case PIPE_TEXTURE_2D_ARRAY: - case PIPE_TEXTURE_CUBE_ARRAY: - return GEN6_SURFTYPE_2D; - case PIPE_TEXTURE_3D: - return GEN6_SURFTYPE_3D; - default: - assert(!"unknown texture target"); - return GEN6_SURFTYPE_NULL; - } -} - static bool surface_validate_gen6_image(const struct ilo_dev *dev, const struct ilo_state_surface_image_info *info) @@ -408,6 +443,17 @@ surface_validate_gen6_image(const struct ilo_dev *dev, break; } + assert(info->img && info->vma); + + if (info->img->tiling != GEN6_TILING_NONE) + assert(info->vma->vm_alignment % 4096 == 0); + + if (info->aux_vma) { + assert(ilo_image_can_enable_aux(info->img, info->level_base)); + /* always tiled */ + assert(info->aux_vma->vm_alignment % 4096 == 0); + } + /* * From the Sandy Bridge PRM, volume 4 part 1, page 78: * @@ -418,16 +464,18 @@ surface_validate_gen6_image(const struct ilo_dev *dev, assert(info->img->bo_stride && info->img->bo_stride <= 512 * 1024 && info->img->width0 <= info->img->bo_stride); - if (info->is_cube_map) { - assert(get_gen6_surface_type(dev, info->img) == GEN6_SURFTYPE_2D); + if (info->type != info->img->type) { + assert(info->type == GEN6_SURFTYPE_2D && + info->img->type == GEN6_SURFTYPE_CUBE); + } - /* - * From the Sandy Bridge PRM, volume 4 part 1, page 78: - * - * "For cube maps, Width must be set equal to the Height." - */ + /* + * From the Sandy Bridge PRM, volume 4 part 1, page 78: + * + * "For cube maps, Width must be set equal to the Height." + */ + if (info->type == GEN6_SURFTYPE_CUBE) assert(info->img->width0 == info->img->height0); - } /* * From the Sandy Bridge PRM, volume 4 part 1, page 72: @@ -463,20 +511,21 @@ surface_validate_gen6_image(const struct ilo_dev *dev, } static void -get_gen6_max_extent(const struct ilo_dev *dev, - const struct ilo_image *img, - uint16_t *max_w, uint16_t *max_h) +surface_get_gen6_image_max_extent(const struct ilo_dev *dev, + const struct ilo_state_surface_image_info *info, + uint16_t *max_w, uint16_t *max_h) { const uint16_t max_size = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 
16384 : 8192; ILO_DEV_ASSERT(dev, 6, 8); - switch (get_gen6_surface_type(dev, img)) { + switch (info->type) { case GEN6_SURFTYPE_1D: *max_w = max_size; *max_h = 1; break; case GEN6_SURFTYPE_2D: + case GEN6_SURFTYPE_CUBE: *max_w = max_size; *max_h = max_size; break; @@ -504,7 +553,7 @@ surface_get_gen6_image_extent(const struct ilo_dev *dev, w = info->img->width0; h = info->img->height0; - get_gen6_max_extent(dev, info->img, &max_w, &max_h); + surface_get_gen6_image_max_extent(dev, info, &max_w, &max_h); assert(w && h && w <= max_w && h <= max_h); *width = w - 1; @@ -555,16 +604,17 @@ surface_get_gen6_image_slices(const struct ilo_dev *dev, * layers to (86 * 6), about 512. */ - switch (get_gen6_surface_type(dev, info->img)) { + switch (info->type) { case GEN6_SURFTYPE_1D: case GEN6_SURFTYPE_2D: + case GEN6_SURFTYPE_CUBE: max_slice = (ilo_dev_gen(dev) >= ILO_GEN(7.5)) ? 2048 : 512; assert(info->img->array_size <= max_slice); max_slice = info->img->array_size; d = info->slice_count; - if (info->is_cube_map) { + if (info->type == GEN6_SURFTYPE_CUBE) { if (info->access == ILO_STATE_SURFACE_ACCESS_SAMPLER) { if (!d || d % 6) { ilo_warn("invalid cube slice count\n"); @@ -877,7 +927,6 @@ surface_set_gen6_image_SURFACE_STATE(struct ilo_state_surface *surf, uint8_t min_lod, mip_count; enum gen_sample_count sample_count; uint32_t alignments; - enum gen_surface_type type; uint32_t dw0, dw2, dw3, dw4, dw5; ILO_DEV_ASSERT(dev, 6, 6); @@ -897,10 +946,7 @@ surface_set_gen6_image_SURFACE_STATE(struct ilo_state_surface *surf, if (info->img->sample_count > 1) assert(info->img->interleaved_samples); - type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE : - get_gen6_surface_type(dev, info->img); - - dw0 = type << GEN6_SURFACE_DW0_TYPE__SHIFT | + dw0 = info->type << GEN6_SURFACE_DW0_TYPE__SHIFT | info->format << GEN6_SURFACE_DW0_FORMAT__SHIFT | GEN6_SURFACE_DW0_MIPLAYOUT_BELOW; @@ -927,7 +973,7 @@ surface_set_gen6_image_SURFACE_STATE(struct ilo_state_surface *surf, * "When TEXCOORDMODE_CLAMP is used when accessing a cube map, this * field must be programmed to 111111b (all faces enabled)." */ - if (info->is_cube_map && + if (info->type == GEN6_SURFTYPE_CUBE && info->access == ILO_STATE_SURFACE_ACCESS_SAMPLER) { dw0 |= GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE_AVERAGE | GEN6_SURFACE_DW0_CUBE_FACE_ENABLES__MASK; @@ -956,7 +1002,7 @@ surface_set_gen6_image_SURFACE_STATE(struct ilo_state_surface *surf, surf->surface[4] = dw4; surf->surface[5] = dw5; - surf->type = type; + surf->type = info->type; surf->min_lod = min_lod; surf->mip_count = mip_count; @@ -972,7 +1018,6 @@ surface_set_gen7_image_SURFACE_STATE(struct ilo_state_surface *surf, uint8_t min_lod, mip_count; uint32_t alignments; enum gen_sample_count sample_count; - enum gen_surface_type type; uint32_t dw0, dw1, dw2, dw3, dw4, dw5, dw7; ILO_DEV_ASSERT(dev, 7, 8); @@ -986,10 +1031,7 @@ surface_set_gen7_image_SURFACE_STATE(struct ilo_state_surface *surf, !surface_get_gen6_image_alignments(dev, info, &alignments)) return false; - type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE : - get_gen6_surface_type(dev, info->img); - - dw0 = type << GEN7_SURFACE_DW0_TYPE__SHIFT | + dw0 = info->type << GEN7_SURFACE_DW0_TYPE__SHIFT | info->format << GEN7_SURFACE_DW0_FORMAT__SHIFT | alignments; @@ -1023,7 +1065,7 @@ surface_set_gen7_image_SURFACE_STATE(struct ilo_state_surface *surf, * field must be programmed to 111111b (all faces enabled). This field * is ignored unless the Surface Type is SURFTYPE_CUBE." 
*/ - if (info->is_cube_map && + if (info->type == GEN6_SURFTYPE_CUBE && info->access == ILO_STATE_SURFACE_ACCESS_SAMPLER) dw0 |= GEN7_SURFACE_DW0_CUBE_FACE_ENABLES__MASK; @@ -1087,13 +1129,61 @@ surface_set_gen7_image_SURFACE_STATE(struct ilo_state_surface *surf, surf->surface[12] = 0; } - surf->type = type; + surf->type = info->type; surf->min_lod = min_lod; surf->mip_count = mip_count; return true; } +uint32_t +ilo_state_surface_buffer_size(const struct ilo_dev *dev, + enum ilo_state_surface_access access, + uint32_t size, uint32_t *alignment) +{ + switch (access) { + case ILO_STATE_SURFACE_ACCESS_SAMPLER: + /* + * From the Sandy Bridge PRM, volume 1 part 1, page 118: + * + * "For buffers, which have no inherent "height," padding + * requirements are different. A buffer must be padded to the next + * multiple of 256 array elements, with an additional 16 bytes + * added beyond that to account for the L1 cache line." + * + * Assuming tightly packed GEN6_FORMAT_R32G32B32A32_FLOAT, the size + * needs to be padded to 4096 (= 16 * 256). + */ + *alignment = 1; + size = align(size, 4096) + 16; + break; + case ILO_STATE_SURFACE_ACCESS_DP_RENDER: + case ILO_STATE_SURFACE_ACCESS_DP_TYPED: + /* element-size aligned for worst cases */ + *alignment = 16; + break; + case ILO_STATE_SURFACE_ACCESS_DP_UNTYPED: + /* DWord aligned? */ + *alignment = 4; + break; + case ILO_STATE_SURFACE_ACCESS_DP_DATA: + /* OWord aligned */ + *alignment = 16; + size = align(size, 16); + break; + case ILO_STATE_SURFACE_ACCESS_DP_SVB: + /* always DWord aligned */ + *alignment = 4; + break; + default: + assert(!"unknown access"); + *alignment = 1; + break; + } + + return size; +} + bool ilo_state_surface_init_for_null(struct ilo_state_surface *surf, const struct ilo_dev *dev) @@ -1107,6 +1197,7 @@ ilo_state_surface_init_for_null(struct ilo_state_surface *surf, else ret &= surface_set_gen6_null_SURFACE_STATE(surf, dev); + surf->vma = NULL; surf->type = GEN6_SURFTYPE_NULL; surf->readonly = true; @@ -1129,6 +1220,7 @@ ilo_state_surface_init_for_buffer(struct ilo_state_surface *surf, else ret &= surface_set_gen6_buffer_SURFACE_STATE(surf, dev, info); + surf->vma = info->vma; surf->readonly = info->readonly; assert(ret); @@ -1150,6 +1242,9 @@ ilo_state_surface_init_for_image(struct ilo_state_surface *surf, else ret &= surface_set_gen6_image_SURFACE_STATE(surf, dev, info); + surf->vma = info->vma; + surf->aux_vma = info->aux_vma; + surf->is_integer = info->is_integer; surf->readonly = info->readonly; surf->scanout = info->img->scanout; diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.h b/src/gallium/drivers/ilo/core/ilo_state_surface.h index 9c025428d50..e78c7c97db1 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_surface.h +++ b/src/gallium/drivers/ilo/core/ilo_state_surface.h @@ -29,14 +29,10 @@ #define ILO_STATE_SURFACE_H #include "genhw/genhw.h" -#include "intel_winsys.h" #include "ilo_core.h" #include "ilo_dev.h" -struct ilo_buffer; -struct ilo_image; - enum ilo_state_surface_access { ILO_STATE_SURFACE_ACCESS_SAMPLER, /* sampling engine surfaces */ ILO_STATE_SURFACE_ACCESS_DP_RENDER, /* render target surfaces */ @@ -46,42 +42,51 @@ enum ilo_state_surface_access { ILO_STATE_SURFACE_ACCESS_DP_SVB, }; +struct ilo_vma; +struct ilo_image; + struct ilo_state_surface_buffer_info { - const struct ilo_buffer *buf; + const struct ilo_vma *vma; + uint32_t offset; + uint32_t size; enum ilo_state_surface_access access; + /* format_size may be less than, equal to, or greater than struct_size */ enum gen_surface_format format; 
uint8_t format_size; bool readonly; uint16_t struct_size; - - uint32_t offset; - uint32_t size; }; struct ilo_state_surface_image_info { const struct ilo_image *img; + uint8_t level_base; + uint8_t level_count; + uint16_t slice_base; + uint16_t slice_count; + + const struct ilo_vma *vma; + const struct ilo_vma *aux_vma; enum ilo_state_surface_access access; + enum gen_surface_type type; + enum gen_surface_format format; bool is_integer; bool readonly; - bool is_cube_map; bool is_array; - - uint8_t level_base; - uint8_t level_count; - uint16_t slice_base; - uint16_t slice_count; }; struct ilo_state_surface { uint32_t surface[13]; + const struct ilo_vma *vma; + const struct ilo_vma *aux_vma; + enum gen_surface_type type; uint8_t min_lod; uint8_t mip_count; @@ -89,9 +94,6 @@ struct ilo_state_surface { bool readonly; bool scanout; - - /* managed by users */ - struct intel_bo *bo; }; bool @@ -99,6 +101,11 @@ ilo_state_surface_valid_format(const struct ilo_dev *dev, enum ilo_state_surface_access access, enum gen_surface_format format); +uint32_t +ilo_state_surface_buffer_size(const struct ilo_dev *dev, + enum ilo_state_surface_access access, + uint32_t size, uint32_t *alignment); + bool ilo_state_surface_init_for_null(struct ilo_state_surface *surf, const struct ilo_dev *dev); diff --git a/src/gallium/drivers/ilo/core/ilo_state_vf.c b/src/gallium/drivers/ilo/core/ilo_state_vf.c index ddc75428ed7..9faf835fef2 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_vf.c +++ b/src/gallium/drivers/ilo/core/ilo_state_vf.c @@ -26,7 +26,7 @@ */ #include "ilo_debug.h" -#include "ilo_buffer.h" +#include "ilo_vma.h" #include "ilo_state_vf.h" static bool @@ -479,8 +479,8 @@ vertex_buffer_validate_gen6(const struct ilo_dev *dev, { ILO_DEV_ASSERT(dev, 6, 8); - if (info->buf) - assert(info->offset < info->buf->bo_size && info->size); + if (info->vma) + assert(info->size && info->offset + info->size <= info->vma->vm_size); /* * From the Sandy Bridge PRM, volume 2 part 1, page 86: @@ -500,6 +500,9 @@ vertex_buffer_validate_gen6(const struct ilo_dev *dev, * aligned address, and BufferPitch must be a multiple of 64-bits." */ if (info->cv_has_double) { + if (info->vma) + assert(info->vma->vm_alignment % 8 == 0); + assert(info->stride % 8 == 0); assert((info->offset + info->cv_double_vertex_offset_mod_8) % 8 == 0); } @@ -512,12 +515,7 @@ vertex_buffer_get_gen6_size(const struct ilo_dev *dev, const struct ilo_state_vertex_buffer_info *info) { ILO_DEV_ASSERT(dev, 6, 8); - - if (!info->buf) - return 0; - - return (info->offset + info->size <= info->buf->bo_size) ? info->size : - info->buf->bo_size - info->offset; + return (info->vma) ? info->size : 0; } static bool @@ -537,7 +535,7 @@ vertex_buffer_set_gen8_vertex_buffer_state(struct ilo_state_vertex_buffer *vb, if (ilo_dev_gen(dev) >= ILO_GEN(7)) dw0 |= GEN7_VB_DW0_ADDR_MODIFIED; - if (!info->buf) + if (!info->vma) dw0 |= GEN6_VB_DW0_IS_NULL; STATIC_ASSERT(ARRAY_SIZE(vb->vb) >= 3); @@ -551,7 +549,7 @@ vertex_buffer_set_gen8_vertex_buffer_state(struct ilo_state_vertex_buffer *vb, vb->vb[2] = (size) ? 
info->offset + size - 1 : 0; } - vb->need_bo = (info->buf != NULL); + vb->vma = info->vma; return true; } @@ -586,8 +584,10 @@ index_buffer_validate_gen6(const struct ilo_dev *dev, */ assert(info->offset % format_size == 0); - if (info->buf) - assert(info->offset < info->buf->bo_size && info->size); + if (info->vma) { + assert(info->vma->vm_alignment % format_size == 0); + assert(info->size && info->offset + info->size <= info->vma->vm_size); + } return true; } @@ -600,12 +600,10 @@ index_buffer_get_gen6_size(const struct ilo_dev *dev, ILO_DEV_ASSERT(dev, 6, 8); - if (!info->buf) + if (!info->vma) return 0; - size = (info->offset + info->size <= info->buf->bo_size) ? info->size : - info->buf->bo_size - info->offset; - + size = info->size; if (ilo_dev_gen(dev) < ILO_GEN(8)) { const uint32_t format_size = get_index_format_size(info->format); size -= (size % format_size); @@ -638,7 +636,7 @@ index_buffer_set_gen8_3DSTATE_INDEX_BUFFER(struct ilo_state_index_buffer *ib, ib->ib[2] = (size) ? info->offset + size - 1 : 0; } - ib->need_bo = (info->buf != NULL); + ib->vma = info->vma; return true; } @@ -949,6 +947,15 @@ ilo_state_vf_get_delta(const struct ilo_state_vf *vf, } } +uint32_t +ilo_state_vertex_buffer_size(const struct ilo_dev *dev, uint32_t size, + uint32_t *alignment) +{ + /* align for doubles without padding */ + *alignment = 8; + return size; +} + /** * No need to initialize first. */ @@ -966,6 +973,15 @@ ilo_state_vertex_buffer_set_info(struct ilo_state_vertex_buffer *vb, return ret; } +uint32_t +ilo_state_index_buffer_size(const struct ilo_dev *dev, uint32_t size, + uint32_t *alignment) +{ + /* align for the worst case without padding */ + *alignment = get_index_format_size(GEN6_INDEX_DWORD); + return size; +} + /** * No need to initialize first. 
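These new *_size() helpers exist so that resource allocation can happen before any bo does: the caller queries the padded size and the required alignment, records both in an ilo_vma (see ilo_vma.h later in this diff), and attaches the bo afterwards. A hedged sketch of the pattern, with dev, bo, and index_count standing in for real state (not code from this patch):

uint32_t alignment, size;
struct ilo_vma vma;

/* worst case is GEN6_INDEX_DWORD, so alignment comes back as 4 */
size = ilo_state_index_buffer_size(dev, index_count * sizeof(uint32_t),
                                   &alignment);

memset(&vma, 0, sizeof(vma));
ilo_vma_init(&vma, dev, size, alignment);
ilo_vma_set_bo(&vma, dev, bo, 0 /* bo_offset */);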
*/ diff --git a/src/gallium/drivers/ilo/core/ilo_state_vf.h b/src/gallium/drivers/ilo/core/ilo_state_vf.h index f15c63a248a..16b128bf63c 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_vf.h +++ b/src/gallium/drivers/ilo/core/ilo_state_vf.h @@ -126,10 +126,10 @@ struct ilo_state_vf_delta { uint32_t dirty; }; -struct ilo_buffer; +struct ilo_vma; struct ilo_state_vertex_buffer_info { - const struct ilo_buffer *buf; + const struct ilo_vma *vma; uint32_t offset; uint32_t size; @@ -143,14 +143,11 @@ struct ilo_state_vertex_buffer_info { struct ilo_state_vertex_buffer { uint32_t vb[3]; - bool need_bo; - - /* managed by users */ - struct intel_bo *bo; + const struct ilo_vma *vma; }; struct ilo_state_index_buffer_info { - const struct ilo_buffer *buf; + const struct ilo_vma *vma; uint32_t offset; uint32_t size; @@ -160,10 +157,7 @@ struct ilo_state_index_buffer_info { struct ilo_state_index_buffer { uint32_t ib[3]; - bool need_bo; - - /* managed by users */ - struct intel_bo *bo; + const struct ilo_vma *vma; }; static inline size_t @@ -215,11 +209,19 @@ ilo_state_vf_get_delta(const struct ilo_state_vf *vf, const struct ilo_state_vf *old, struct ilo_state_vf_delta *delta); +uint32_t +ilo_state_vertex_buffer_size(const struct ilo_dev *dev, uint32_t size, + uint32_t *alignment); + bool ilo_state_vertex_buffer_set_info(struct ilo_state_vertex_buffer *vb, const struct ilo_dev *dev, const struct ilo_state_vertex_buffer_info *info); +uint32_t +ilo_state_index_buffer_size(const struct ilo_dev *dev, uint32_t size, + uint32_t *alignment); + bool ilo_state_index_buffer_set_info(struct ilo_state_index_buffer *ib, const struct ilo_dev *dev, diff --git a/src/gallium/drivers/ilo/core/ilo_state_zs.c b/src/gallium/drivers/ilo/core/ilo_state_zs.c index 901fedb5599..827632764b2 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_zs.c +++ b/src/gallium/drivers/ilo/core/ilo_state_zs.c @@ -25,10 +25,9 @@ * Chia-I Wu <[email protected]> */ -#include "intel_winsys.h" - #include "ilo_debug.h" #include "ilo_image.h" +#include "ilo_vma.h" #include "ilo_state_zs.h" static bool @@ -56,70 +55,9 @@ zs_set_gen6_null_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs, zs->depth[3] = 0; zs->depth[4] = 0; - zs->depth_format = format; - return true; } -static enum gen_surface_type -get_gen6_surface_type(const struct ilo_dev *dev, const struct ilo_image *img) -{ - ILO_DEV_ASSERT(dev, 6, 8); - - switch (img->target) { - case PIPE_TEXTURE_1D: - case PIPE_TEXTURE_1D_ARRAY: - return GEN6_SURFTYPE_1D; - case PIPE_TEXTURE_2D: - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_RECT: - case PIPE_TEXTURE_2D_ARRAY: - case PIPE_TEXTURE_CUBE_ARRAY: - return GEN6_SURFTYPE_2D; - case PIPE_TEXTURE_3D: - return GEN6_SURFTYPE_3D; - default: - assert(!"unknown texture target"); - return GEN6_SURFTYPE_NULL; - } -} - -static enum gen_depth_format -get_gen6_depth_format(const struct ilo_dev *dev, const struct ilo_image *img) -{ - ILO_DEV_ASSERT(dev, 6, 8); - - if (ilo_dev_gen(dev) >= ILO_GEN(7)) { - switch (img->format) { - case PIPE_FORMAT_Z32_FLOAT: - return GEN6_ZFORMAT_D32_FLOAT; - case PIPE_FORMAT_Z24X8_UNORM: - return GEN6_ZFORMAT_D24_UNORM_X8_UINT; - case PIPE_FORMAT_Z16_UNORM: - return GEN6_ZFORMAT_D16_UNORM; - default: - assert(!"unknown depth format"); - return GEN6_ZFORMAT_D32_FLOAT; - } - } else { - switch (img->format) { - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - return GEN6_ZFORMAT_D32_FLOAT_S8X24_UINT; - case PIPE_FORMAT_Z32_FLOAT: - return GEN6_ZFORMAT_D32_FLOAT; - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - return GEN6_ZFORMAT_D24_UNORM_S8_UINT; - 
case PIPE_FORMAT_Z24X8_UNORM: - return GEN6_ZFORMAT_D24_UNORM_X8_UINT; - case PIPE_FORMAT_Z16_UNORM: - return GEN6_ZFORMAT_D16_UNORM; - default: - assert(!"unknown depth format"); - return GEN6_ZFORMAT_D32_FLOAT; - } - } -} - static bool zs_validate_gen6(const struct ilo_dev *dev, const struct ilo_state_zs_info *info) @@ -128,63 +66,102 @@ zs_validate_gen6(const struct ilo_dev *dev, ILO_DEV_ASSERT(dev, 6, 8); + assert(!info->z_img == !info->z_vma); + assert(!info->s_img == !info->s_vma); + + /* all tiled */ + if (info->z_img) { + assert(info->z_img->tiling == GEN6_TILING_Y); + assert(info->z_vma->vm_alignment % 4096 == 0); + } + if (info->s_img) { + assert(info->s_img->tiling == GEN8_TILING_W); + assert(info->s_vma->vm_alignment % 4096 == 0); + } + if (info->hiz_vma) { + assert(info->z_img && + ilo_image_can_enable_aux(info->z_img, info->level)); + assert(info->z_vma->vm_alignment % 4096 == 0); + } + /* * From the Ivy Bridge PRM, volume 2 part 1, page 315: * - * The stencil buffer has a format of S8_UINT, and shares Surface + * "The stencil buffer has a format of S8_UINT, and shares Surface * Type, Height, Width, and Depth, Minimum Array Element, Render * Target View Extent, Depth Coordinate Offset X/Y, LOD, and Depth - * Buffer Object Control State fields of the depth buffer. + * Buffer Object Control State fields of the depth buffer." */ - if (info->z_img == info->s_img) { - assert(info->z_img->target == info->s_img->target && - info->z_img->width0 == info->s_img->width0 && + if (info->z_img && info->s_img && info->z_img != info->s_img) { + assert(info->z_img->type == info->s_img->type && info->z_img->height0 == info->s_img->height0 && info->z_img->depth0 == info->s_img->depth0); } - assert(info->level < img->level_count); - assert(img->bo_stride); - - if (info->hiz_enable) { - assert(info->z_img && - ilo_image_can_enable_aux(info->z_img, info->level)); + if (info->type != img->type) { + assert(info->type == GEN6_SURFTYPE_2D && + img->type == GEN6_SURFTYPE_CUBE); } - if (info->is_cube_map) { - assert(get_gen6_surface_type(dev, img) == GEN6_SURFTYPE_2D); - + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + switch (info->format) { + case GEN6_ZFORMAT_D32_FLOAT: + case GEN6_ZFORMAT_D24_UNORM_X8_UINT: + case GEN6_ZFORMAT_D16_UNORM: + break; + default: + assert(!"unknown depth format"); + break; + } + } else { /* - * From the Sandy Bridge PRM, volume 2 part 1, page 323: + * From the Ironlake PRM, volume 2 part 1, page 330: + * + * "If this field (Separate Stencil Buffer Enable) is disabled, the + * Surface Format of the depth buffer cannot be D24_UNORM_X8_UINT." + * + * From the Sandy Bridge PRM, volume 2 part 1, page 321: * - * "For cube maps, Width must be set equal to Height." + * "[DevSNB]: This field (Separate Stencil Buffer Enable) must be + * set to the same value (enabled or disabled) as Hierarchical + * Depth Buffer Enable." */ - assert(img->width0 == img->height0); + if (info->hiz_vma) + assert(info->format != GEN6_ZFORMAT_D24_UNORM_S8_UINT); + else + assert(info->format != GEN6_ZFORMAT_D24_UNORM_X8_UINT); } - if (info->z_img) - assert(info->z_img->tiling == GEN6_TILING_Y); - if (info->s_img) - assert(info->s_img->tiling == GEN8_TILING_W); + assert(info->level < img->level_count); + assert(img->bo_stride); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 323: + * + * "For cube maps, Width must be set equal to Height." 
+ */ + if (info->type == GEN6_SURFTYPE_CUBE) + assert(img->width0 == img->height0); return true; } static void -get_gen6_max_extent(const struct ilo_dev *dev, - const struct ilo_image *img, - uint16_t *max_w, uint16_t *max_h) +zs_get_gen6_max_extent(const struct ilo_dev *dev, + const struct ilo_state_zs_info *info, + uint16_t *max_w, uint16_t *max_h) { const uint16_t max_size = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 16384 : 8192; ILO_DEV_ASSERT(dev, 6, 8); - switch (get_gen6_surface_type(dev, img)) { + switch (info->type) { case GEN6_SURFTYPE_1D: *max_w = max_size; *max_h = 1; break; case GEN6_SURFTYPE_2D: + case GEN6_SURFTYPE_CUBE: *max_w = max_size; *max_h = max_size; break; @@ -274,7 +251,7 @@ zs_get_gen6_depth_extent(const struct ilo_dev *dev, w = img->width0; h = img->height0; - if (info->hiz_enable) { + if (info->hiz_vma) { uint16_t align_w, align_h; get_gen6_hiz_alignments(dev, info->z_img, &align_w, &align_h); @@ -290,7 +267,7 @@ zs_get_gen6_depth_extent(const struct ilo_dev *dev, h = align(h, align_h); } - get_gen6_max_extent(dev, img, &max_w, &max_h); + zs_get_gen6_max_extent(dev, info, &max_w, &max_h); assert(w && h && w <= max_w && h <= max_h); *width = w - 1; @@ -319,16 +296,17 @@ zs_get_gen6_depth_slices(const struct ilo_dev *dev, * surfaces. If the volume texture is MIP-mapped, this field specifies * the depth of the base MIP level." */ - switch (get_gen6_surface_type(dev, img)) { + switch (info->type) { case GEN6_SURFTYPE_1D: case GEN6_SURFTYPE_2D: + case GEN6_SURFTYPE_CUBE: max_slice = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 2048 : 512; assert(img->array_size <= max_slice); max_slice = img->array_size; d = info->slice_count; - if (info->is_cube_map) { + if (info->type == GEN6_SURFTYPE_CUBE) { /* * Minumum Array Element and Depth must be 0; Render Target View * Extent is ignored. @@ -408,8 +386,6 @@ zs_set_gen6_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs, const struct ilo_state_zs_info *info) { uint16_t width, height, depth, array_base, view_extent; - enum gen_surface_type type; - enum gen_depth_format format; uint32_t dw1, dw2, dw3, dw4; ILO_DEV_ASSERT(dev, 6, 6); @@ -420,37 +396,15 @@ zs_set_gen6_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs, &view_extent)) return false; - type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE : - (info->z_img) ? get_gen6_surface_type(dev, info->z_img) : - get_gen6_surface_type(dev, info->s_img); - - format = (info->z_img) ? get_gen6_depth_format(dev, info->z_img) : - GEN6_ZFORMAT_D32_FLOAT; - - /* - * From the Ironlake PRM, volume 2 part 1, page 330: - * - * "If this field (Separate Stencil Buffer Enable) is disabled, the - * Surface Format of the depth buffer cannot be D24_UNORM_X8_UINT." - * - * From the Sandy Bridge PRM, volume 2 part 1, page 321: - * - * "[DevSNB]: This field (Separate Stencil Buffer Enable) must be set - * to the same value (enabled or disabled) as Hierarchical Depth - * Buffer Enable." 
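On Gen6 the two quotes above tie the D24 formats to the HiZ decision: HiZ implies separate stencil, which forbids D24_UNORM_S8_UINT, and no HiZ forbids D24_UNORM_X8_UINT. A tiny illustrative helper capturing what the asserts enforce (not part of the patch):

/* Gen6 only: the D24 variant the asserts above permit */
static enum gen_depth_format
gen6_d24_format_for_hiz(bool has_hiz)
{
   return has_hiz ? GEN6_ZFORMAT_D24_UNORM_X8_UINT :
                    GEN6_ZFORMAT_D24_UNORM_S8_UINT;
}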
- */ - if (!info->hiz_enable && format == GEN6_ZFORMAT_D24_UNORM_X8_UINT) - format = GEN6_ZFORMAT_D24_UNORM_S8_UINT; - /* info->z_readonly and info->s_readonly are ignored on Gen6 */ - dw1 = type << GEN6_DEPTH_DW1_TYPE__SHIFT | + dw1 = info->type << GEN6_DEPTH_DW1_TYPE__SHIFT | GEN6_TILING_Y << GEN6_DEPTH_DW1_TILING__SHIFT | - format << GEN6_DEPTH_DW1_FORMAT__SHIFT; + info->format << GEN6_DEPTH_DW1_FORMAT__SHIFT; if (info->z_img) dw1 |= (info->z_img->bo_stride - 1) << GEN6_DEPTH_DW1_PITCH__SHIFT; - if (info->hiz_enable || !info->z_img) { + if (info->hiz_vma || !info->z_img) { dw1 |= GEN6_DEPTH_DW1_HIZ_ENABLE | GEN6_DEPTH_DW1_SEPARATE_STENCIL; } @@ -471,8 +425,6 @@ zs_set_gen6_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs, zs->depth[3] = dw4; zs->depth[4] = 0; - zs->depth_format = format; - return true; } @@ -481,8 +433,6 @@ zs_set_gen7_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs, const struct ilo_dev *dev, const struct ilo_state_zs_info *info) { - enum gen_surface_type type; - enum gen_depth_format format; uint16_t width, height, depth; uint16_t array_base, view_extent; uint32_t dw1, dw2, dw3, dw4, dw6; @@ -495,20 +445,13 @@ zs_set_gen7_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs, &view_extent)) return false; - type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE : - (info->z_img) ? get_gen6_surface_type(dev, info->z_img) : - get_gen6_surface_type(dev, info->s_img); - - format = (info->z_img) ? get_gen6_depth_format(dev, info->z_img) : - GEN6_ZFORMAT_D32_FLOAT; - - dw1 = type << GEN7_DEPTH_DW1_TYPE__SHIFT | - format << GEN7_DEPTH_DW1_FORMAT__SHIFT; + dw1 = info->type << GEN7_DEPTH_DW1_TYPE__SHIFT | + info->format << GEN7_DEPTH_DW1_FORMAT__SHIFT; if (info->z_img) { if (!info->z_readonly) dw1 |= GEN7_DEPTH_DW1_DEPTH_WRITE_ENABLE; - if (info->hiz_enable) + if (info->hiz_vma) dw1 |= GEN7_DEPTH_DW1_HIZ_ENABLE; dw1 |= (info->z_img->bo_stride - 1) << GEN7_DEPTH_DW1_PITCH__SHIFT; @@ -539,8 +482,6 @@ zs_set_gen7_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs, zs->depth[3] = dw4; zs->depth[4] = dw6; - zs->depth_format = format; - return true; } @@ -683,11 +624,15 @@ ilo_state_zs_init(struct ilo_state_zs *zs, const struct ilo_dev *dev, else ret &= zs_set_gen6_null_3DSTATE_STENCIL_BUFFER(zs, dev); - if (info->z_img && info->hiz_enable) + if (info->z_img && info->hiz_vma) ret &= zs_set_gen6_3DSTATE_HIER_DEPTH_BUFFER(zs, dev, info); else ret &= zs_set_gen6_null_3DSTATE_HIER_DEPTH_BUFFER(zs, dev); + zs->z_vma = info->z_vma; + zs->s_vma = info->s_vma; + zs->hiz_vma = info->hiz_vma; + zs->z_readonly = info->z_readonly; zs->s_readonly = info->s_readonly; @@ -703,6 +648,8 @@ ilo_state_zs_init_for_null(struct ilo_state_zs *zs, struct ilo_state_zs_info info; memset(&info, 0, sizeof(info)); + info.type = GEN6_SURFTYPE_NULL; + info.format = GEN6_ZFORMAT_D32_FLOAT; return ilo_state_zs_init(zs, dev, &info); } @@ -720,8 +667,11 @@ ilo_state_zs_disable_hiz(struct ilo_state_zs *zs, */ assert(ilo_dev_gen(dev) >= ILO_GEN(7)); - zs->depth[0] &= ~GEN7_DEPTH_DW1_HIZ_ENABLE; - zs_set_gen6_null_3DSTATE_HIER_DEPTH_BUFFER(zs, dev); + if (zs->hiz_vma) { + zs->depth[0] &= ~GEN7_DEPTH_DW1_HIZ_ENABLE; + zs_set_gen6_null_3DSTATE_HIER_DEPTH_BUFFER(zs, dev); + zs->hiz_vma = NULL; + } return true; } diff --git a/src/gallium/drivers/ilo/core/ilo_state_zs.h b/src/gallium/drivers/ilo/core/ilo_state_zs.h index 98212daf74f..6a25a873897 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_zs.h +++ b/src/gallium/drivers/ilo/core/ilo_state_zs.h @@ -29,28 +29,31 @@ #define ILO_STATE_ZS_H #include "genhw/genhw.h" -#include "intel_winsys.h" #include 
"ilo_core.h" #include "ilo_dev.h" +struct ilo_vma; struct ilo_image; struct ilo_state_zs_info { - /* both are optional */ + /* both optional */ const struct ilo_image *z_img; const struct ilo_image *s_img; + uint8_t level; + uint16_t slice_base; + uint16_t slice_count; + + const struct ilo_vma *z_vma; + const struct ilo_vma *s_vma; + const struct ilo_vma *hiz_vma; + + enum gen_surface_type type; + enum gen_depth_format format; /* ignored prior to Gen7 */ bool z_readonly; bool s_readonly; - - bool hiz_enable; - bool is_cube_map; - - uint8_t level; - uint16_t slice_base; - uint16_t slice_count; }; struct ilo_state_zs { @@ -58,16 +61,12 @@ struct ilo_state_zs { uint32_t stencil[3]; uint32_t hiz[3]; - /* TODO move this to ilo_image */ - enum gen_depth_format depth_format; + const struct ilo_vma *z_vma; + const struct ilo_vma *s_vma; + const struct ilo_vma *hiz_vma; bool z_readonly; bool s_readonly; - - /* managed by users */ - struct intel_bo *depth_bo; - struct intel_bo *stencil_bo; - struct intel_bo *hiz_bo; }; bool @@ -83,11 +82,4 @@ bool ilo_state_zs_disable_hiz(struct ilo_state_zs *zs, const struct ilo_dev *dev); -static inline enum gen_depth_format -ilo_state_zs_get_depth_format(const struct ilo_state_zs *zs, - const struct ilo_dev *dev) -{ - return zs->depth_format; -} - #endif /* ILO_STATE_ZS_H */ diff --git a/src/gallium/drivers/ilo/core/ilo_buffer.h b/src/gallium/drivers/ilo/core/ilo_vma.h index ca3c61ff890..ad2a1d4b33e 100644 --- a/src/gallium/drivers/ilo/core/ilo_buffer.h +++ b/src/gallium/drivers/ilo/core/ilo_vma.h @@ -1,7 +1,7 @@ /* * Mesa 3-D graphics library * - * Copyright (C) 2012-2013 LunarG, Inc. + * Copyright (C) 2015 LunarG, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -25,40 +25,49 @@ * Chia-I Wu <[email protected]> */ -#ifndef ILO_BUFFER_H -#define ILO_BUFFER_H - -#include "intel_winsys.h" +#ifndef ILO_VMA_H +#define ILO_VMA_H #include "ilo_core.h" #include "ilo_debug.h" #include "ilo_dev.h" -struct ilo_buffer { - unsigned bo_size; +struct intel_bo; + +/** + * A virtual memory area. + */ +struct ilo_vma { + /* address space */ + uint32_t vm_size; + uint32_t vm_alignment; - /* managed by users */ + /* backing storage */ struct intel_bo *bo; + uint32_t bo_offset; }; -static inline void -ilo_buffer_init(struct ilo_buffer *buf, const struct ilo_dev *dev, - unsigned size, uint32_t bind, uint32_t flags) +static inline bool +ilo_vma_init(struct ilo_vma *vma, const struct ilo_dev *dev, + uint32_t size, uint32_t alignment) { - assert(ilo_is_zeroed(buf, sizeof(*buf))); + assert(ilo_is_zeroed(vma, sizeof(*vma))); + assert(size && alignment); + + vma->vm_alignment = alignment; + vma->vm_size = size; - buf->bo_size = size; + return true; +} + +static inline void +ilo_vma_set_bo(struct ilo_vma *vma, const struct ilo_dev *dev, + struct intel_bo *bo, uint32_t offset) +{ + assert(offset % vma->vm_alignment == 0); - /* - * From the Sandy Bridge PRM, volume 1 part 1, page 118: - * - * "For buffers, which have no inherent "height," padding requirements - * are different. A buffer must be padded to the next multiple of 256 - * array elements, with an additional 16 bytes added beyond that to - * account for the L1 cache line." 
- */ - if (bind & PIPE_BIND_SAMPLER_VIEW) - buf->bo_size = align(buf->bo_size, 256) + 16; + vma->bo = bo; + vma->bo_offset = offset; } -#endif /* ILO_BUFFER_H */ +#endif /* ILO_VMA_H */ diff --git a/src/gallium/drivers/ilo/ilo_blitter_blt.c b/src/gallium/drivers/ilo/ilo_blitter_blt.c index d55dc35e360..66203e86137 100644 --- a/src/gallium/drivers/ilo/ilo_blitter_blt.c +++ b/src/gallium/drivers/ilo/ilo_blitter_blt.c @@ -127,7 +127,7 @@ ilo_blitter_blt_end(struct ilo_blitter *blitter, uint32_t swctrl) static bool buf_clear_region(struct ilo_blitter *blitter, - struct ilo_buffer *buf, unsigned offset, + struct ilo_buffer_resource *buf, unsigned offset, uint32_t val, unsigned size, enum gen6_blt_mask value_mask, enum gen6_blt_mask write_mask) @@ -140,8 +140,8 @@ buf_clear_region(struct ilo_blitter *blitter, if (offset % cpp || size % cpp) return false; - dst.bo = buf->bo; - dst.offset = offset; + dst.bo = buf->vma.bo; + dst.offset = buf->vma.bo_offset + offset; ilo_blitter_blt_begin(blitter, GEN6_COLOR_BLT__SIZE * (1 + size / 32764 / gen6_blt_max_scanlines), @@ -179,25 +179,26 @@ buf_clear_region(struct ilo_blitter *blitter, static bool buf_copy_region(struct ilo_blitter *blitter, - struct ilo_buffer *dst_buf, unsigned dst_offset, - struct ilo_buffer *src_buf, unsigned src_offset, + struct ilo_buffer_resource *dst_buf, unsigned dst_offset, + struct ilo_buffer_resource *src_buf, unsigned src_offset, unsigned size) { const uint8_t rop = 0xcc; /* SRCCOPY */ struct ilo_builder *builder = &blitter->ilo->cp->builder; struct gen6_blt_bo dst, src; - dst.bo = dst_buf->bo; - dst.offset = dst_offset; + dst.bo = dst_buf->vma.bo; + dst.offset = dst_buf->vma.bo_offset + dst_offset; dst.pitch = 0; - src.bo = src_buf->bo; - src.offset = src_offset; + src.bo = src_buf->vma.bo; + src.offset = src_buf->vma.bo_offset + src_offset; src.pitch = 0; ilo_blitter_blt_begin(blitter, GEN6_SRC_COPY_BLT__SIZE * (1 + size / 32764 / gen6_blt_max_scanlines), - dst_buf->bo, GEN6_TILING_NONE, src_buf->bo, GEN6_TILING_NONE); + dst_buf->vma.bo, GEN6_TILING_NONE, + src_buf->vma.bo, GEN6_TILING_NONE); while (size) { unsigned width, height; @@ -258,14 +259,14 @@ tex_clear_region(struct ilo_blitter *blitter, if (dst_box->width * cpp > gen6_blt_max_bytes_per_scanline) return false; - dst.bo = dst_tex->image.bo; - dst.offset = 0; + dst.bo = dst_tex->vma.bo; + dst.offset = dst_tex->vma.bo_offset; dst.pitch = dst_tex->image.bo_stride; dst.tiling = dst_tex->image.tiling; swctrl = ilo_blitter_blt_begin(blitter, GEN6_XY_COLOR_BLT__SIZE * dst_box->depth, - dst_tex->image.bo, dst_tex->image.tiling, NULL, GEN6_TILING_NONE); + dst_tex->vma.bo, dst_tex->image.tiling, NULL, GEN6_TILING_NONE); for (slice = 0; slice < dst_box->depth; slice++) { unsigned x, y; @@ -299,7 +300,7 @@ tex_copy_region(struct ilo_blitter *blitter, const struct pipe_box *src_box) { const struct util_format_description *desc = - util_format_description(dst_tex->image.format); + util_format_description(dst_tex->image_format); const unsigned max_extent = 32767; /* INT16_MAX */ const uint8_t rop = 0xcc; /* SRCCOPY */ struct ilo_builder *builder = &blitter->ilo->cp->builder; @@ -347,13 +348,13 @@ tex_copy_region(struct ilo_blitter *blitter, break; } - dst.bo = dst_tex->image.bo; - dst.offset = 0; + dst.bo = dst_tex->vma.bo; + dst.offset = dst_tex->vma.bo_offset; dst.pitch = dst_tex->image.bo_stride; dst.tiling = dst_tex->image.tiling; - src.bo = src_tex->image.bo; - src.offset = 0; + src.bo = src_tex->vma.bo; + src.offset = src_tex->vma.bo_offset; src.pitch = 
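The point of the new struct is to separate an address range from its backing storage, so one bo can back several areas at distinct offsets; every consumer then biases hardware offsets by bo_offset, as the blitter hunks above now do. A sketch under that assumption (sizes are arbitrary):

```c
#include <string.h>

static void
example_shared_bo(const struct ilo_dev *dev, struct intel_winsys *winsys)
{
   struct ilo_vma a, b;
   struct intel_bo *bo;

   memset(&a, 0, sizeof(a));   /* ilo_vma_init() asserts a zeroed struct */
   memset(&b, 0, sizeof(b));
   ilo_vma_init(&a, dev, 64 * 1024, 4096);
   ilo_vma_init(&b, dev, 64 * 1024, 4096);

   /* one 128KB bo backs both areas */
   bo = intel_winsys_alloc_bo(winsys, "shared", 128 * 1024, false);
   ilo_vma_set_bo(&a, dev, bo, 0);
   ilo_vma_set_bo(&b, dev, bo, 64 * 1024); /* offset must respect vm_alignment */

   /* a write at byte off within b lands at bo offset b.bo_offset + off */
}
```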
src_tex->image.bo_stride; src.tiling = src_tex->image.tiling; @@ -423,8 +424,8 @@ ilo_blitter_blt_copy_resource(struct ilo_blitter *blitter, src_box->height == 1 && src_box->depth == 1); - success = buf_copy_region(blitter, - ilo_buffer(dst), dst_offset, ilo_buffer(src), src_offset, size); + success = buf_copy_region(blitter, ilo_buffer_resource(dst), dst_offset, + ilo_buffer_resource(src), src_offset, size); } else if (dst->target != PIPE_BUFFER && src->target != PIPE_BUFFER) { success = tex_copy_region(blitter, @@ -488,7 +489,7 @@ ilo_blitter_blt_clear_rt(struct ilo_blitter *blitter, if (offset + size > end) size = end - offset; - success = buf_clear_region(blitter, ilo_buffer(rt->texture), + success = buf_clear_region(blitter, ilo_buffer_resource(rt->texture), offset, packed.ui[0], size, mask, mask); } else { diff --git a/src/gallium/drivers/ilo/ilo_blitter_rectlist.c b/src/gallium/drivers/ilo/ilo_blitter_rectlist.c index 13c8f500680..86e67084d6e 100644 --- a/src/gallium/drivers/ilo/ilo_blitter_rectlist.c +++ b/src/gallium/drivers/ilo/ilo_blitter_rectlist.c @@ -318,7 +318,7 @@ hiz_can_clear_zs(const struct ilo_blitter *blitter, * The truth is when HiZ is enabled, separate stencil is also enabled on * all GENs. The depth buffer format cannot be combined depth/stencil. */ - switch (tex->image.format) { + switch (tex->image_format) { case PIPE_FORMAT_Z16_UNORM: if (ilo_dev_gen(blitter->ilo->dev) == ILO_GEN(6) && tex->base.width0 % 16) @@ -355,7 +355,7 @@ ilo_blitter_rectlist_clear_zs(struct ilo_blitter *blitter, if (ilo_dev_gen(blitter->ilo->dev) >= ILO_GEN(8)) clear_value = fui(depth); else - clear_value = util_pack_z(tex->image.format, depth); + clear_value = util_pack_z(tex->image_format, depth); ilo_blit_resolve_surface(blitter->ilo, zs, ILO_TEXTURE_RENDER_WRITE | ILO_TEXTURE_CLEAR); diff --git a/src/gallium/drivers/ilo/ilo_common.h b/src/gallium/drivers/ilo/ilo_common.h index 9ebbf76e81e..3dbe79fb872 100644 --- a/src/gallium/drivers/ilo/ilo_common.h +++ b/src/gallium/drivers/ilo/ilo_common.h @@ -28,6 +28,14 @@ #ifndef ILO_COMMON_H #define ILO_COMMON_H +#include "pipe/p_format.h" +#include "pipe/p_defines.h" + +#include "util/list.h" +#include "util/u_format.h" +#include "util/u_inlines.h" +#include "util/u_pointer.h" + #include "core/ilo_core.h" #include "core/ilo_debug.h" #include "core/ilo_dev.h" diff --git a/src/gallium/drivers/ilo/ilo_context.c b/src/gallium/drivers/ilo/ilo_context.c index 3d5c7b636a8..b9a16aab81d 100644 --- a/src/gallium/drivers/ilo/ilo_context.c +++ b/src/gallium/drivers/ilo/ilo_context.c @@ -62,6 +62,8 @@ ilo_flush(struct pipe_context *pipe, (flags & PIPE_FLUSH_END_OF_FRAME) ? 
"frame end" : "user request"); if (f) { + struct pipe_screen *screen = pipe->screen; + screen->fence_reference(screen, f, NULL); *f = ilo_screen_fence_create(pipe->screen, ilo->cp->last_submitted_bo); } } diff --git a/src/gallium/drivers/ilo/ilo_draw.c b/src/gallium/drivers/ilo/ilo_draw.c index e8e1a4cd14c..433348d9326 100644 --- a/src/gallium/drivers/ilo/ilo_draw.c +++ b/src/gallium/drivers/ilo/ilo_draw.c @@ -444,6 +444,7 @@ draw_vbo_with_sw_restart(struct ilo_context *ilo, const struct pipe_draw_info *info) { const struct ilo_ib_state *ib = &ilo->state_vector.ib; + const struct ilo_vma *vma; union { const void *ptr; const uint8_t *u8; @@ -453,10 +454,12 @@ draw_vbo_with_sw_restart(struct ilo_context *ilo, /* we will draw with IB mapped */ if (ib->state.buffer) { - u.ptr = intel_bo_map(ilo_buffer(ib->state.buffer)->bo, false); + vma = ilo_resource_get_vma(ib->state.buffer); + u.ptr = intel_bo_map(vma->bo, false); if (u.ptr) - u.u8 += ib->state.offset; + u.u8 += vma->bo_offset + ib->state.offset; } else { + vma = NULL; u.ptr = ib->state.user_buffer; } @@ -500,8 +503,8 @@ draw_vbo_with_sw_restart(struct ilo_context *ilo, #undef DRAW_VBO_WITH_SW_RESTART - if (ib->state.buffer) - intel_bo_unmap(ilo_buffer(ib->state.buffer)->bo); + if (vma) + intel_bo_unmap(vma->bo); } static bool diff --git a/src/gallium/drivers/ilo/ilo_format.h b/src/gallium/drivers/ilo/ilo_format.h index 4e955c09c14..0a19c02659e 100644 --- a/src/gallium/drivers/ilo/ilo_format.h +++ b/src/gallium/drivers/ilo/ilo_format.h @@ -165,4 +165,39 @@ ilo_format_translate_vertex(const struct ilo_dev *dev, return ilo_format_translate(dev, format, PIPE_BIND_VERTEX_BUFFER); } +static inline enum gen_depth_format +ilo_format_translate_depth(const struct ilo_dev *dev, + enum pipe_format format) +{ + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + switch (format) { + case PIPE_FORMAT_Z32_FLOAT: + return GEN6_ZFORMAT_D32_FLOAT; + case PIPE_FORMAT_Z24X8_UNORM: + return GEN6_ZFORMAT_D24_UNORM_X8_UINT; + case PIPE_FORMAT_Z16_UNORM: + return GEN6_ZFORMAT_D16_UNORM; + default: + assert(!"unknown depth format"); + return GEN6_ZFORMAT_D32_FLOAT; + } + } else { + switch (format) { + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return GEN6_ZFORMAT_D32_FLOAT_S8X24_UINT; + case PIPE_FORMAT_Z32_FLOAT: + return GEN6_ZFORMAT_D32_FLOAT; + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + return GEN6_ZFORMAT_D24_UNORM_S8_UINT; + case PIPE_FORMAT_Z24X8_UNORM: + return GEN6_ZFORMAT_D24_UNORM_X8_UINT; + case PIPE_FORMAT_Z16_UNORM: + return GEN6_ZFORMAT_D16_UNORM; + default: + assert(!"unknown depth format"); + return GEN6_ZFORMAT_D32_FLOAT; + } + } +} + #endif /* ILO_FORMAT_H */ diff --git a/src/gallium/drivers/ilo/ilo_render_surface.c b/src/gallium/drivers/ilo/ilo_render_surface.c index ad053564294..3bf8646b344 100644 --- a/src/gallium/drivers/ilo/ilo_render_surface.c +++ b/src/gallium/drivers/ilo/ilo_render_surface.c @@ -42,14 +42,17 @@ gen6_so_SURFACE_STATE(struct ilo_builder *builder, const struct pipe_stream_output_info *so_info, int so_index) { - struct ilo_buffer *buf = ilo_buffer(so->buffer); struct ilo_state_surface_buffer_info info; struct ilo_state_surface surf; ILO_DEV_ASSERT(builder->dev, 6, 6); memset(&info, 0, sizeof(info)); - info.buf = buf; + + info.vma = ilo_resource_get_vma(so->buffer); + info.offset = so->buffer_offset + so_info->output[so_index].dst_offset * 4; + info.size = so->buffer_size - so_info->output[so_index].dst_offset * 4; + info.access = ILO_STATE_SURFACE_ACCESS_DP_SVB; switch (so_info->output[so_index].num_components) { @@ -78,12 +81,9 @@ 
gen6_so_SURFACE_STATE(struct ilo_builder *builder, info.struct_size = so_info->stride[so_info->output[so_index].output_buffer] * 4; - info.offset = so->buffer_offset + so_info->output[so_index].dst_offset * 4; - info.size = so->buffer_size - so_info->output[so_index].dst_offset * 4; memset(&surf, 0, sizeof(surf)); ilo_state_surface_init_for_buffer(&surf, builder->dev, &info); - surf.bo = info.buf->bo; return gen6_SURFACE_STATE(builder, &surf); } @@ -482,18 +482,19 @@ gen6_emit_launch_grid_surface_const(struct ilo_render *r, return; memset(&info, 0, sizeof(info)); - info.buf = ilo_buffer(session->input->buffer); + + info.vma = ilo_resource_get_vma(session->input->buffer); + info.offset = session->input->buffer_offset; + info.size = session->input->buffer_size; + info.access = ILO_STATE_SURFACE_ACCESS_DP_UNTYPED; info.format = GEN6_FORMAT_RAW; info.format_size = 1; info.struct_size = 1; info.readonly = true; - info.offset = session->input->buffer_offset; - info.size = session->input->buffer_size; memset(&surf, 0, sizeof(surf)); ilo_state_surface_init_for_buffer(&surf, r->dev, &info); - surf.bo = info.buf->bo; assert(count == 1 && session->input->buffer); surface_state[base] = gen6_SURFACE_STATE(r->builder, &surf); @@ -538,23 +539,23 @@ gen6_emit_launch_grid_surface_global(struct ilo_render *r, surface_state += base; for (i = 0; i < count; i++) { if (i < vec->global_binding.count && bindings[i].resource) { - const struct ilo_buffer *buf = ilo_buffer(bindings[i].resource); struct ilo_state_surface_buffer_info info; struct ilo_state_surface surf; assert(bindings[i].resource->target == PIPE_BUFFER); memset(&info, 0, sizeof(info)); - info.buf = buf; + + info.vma = ilo_resource_get_vma(bindings[i].resource); + info.size = info.vma->vm_size; + info.access = ILO_STATE_SURFACE_ACCESS_DP_UNTYPED; info.format = GEN6_FORMAT_RAW; info.format_size = 1; info.struct_size = 1; - info.size = buf->bo_size; memset(&surf, 0, sizeof(surf)); ilo_state_surface_init_for_buffer(&surf, r->dev, &info); - surf.bo = info.buf->bo; surface_state[i] = gen6_SURFACE_STATE(r->builder, &surf); } else { diff --git a/src/gallium/drivers/ilo/ilo_resource.c b/src/gallium/drivers/ilo/ilo_resource.c index be9fd10a84c..9026ba9a983 100644 --- a/src/gallium/drivers/ilo/ilo_resource.c +++ b/src/gallium/drivers/ilo/ilo_resource.c @@ -25,7 +25,12 @@ * Chia-I Wu <[email protected]> */ +#include "core/ilo_state_vf.h" +#include "core/ilo_state_sol.h" +#include "core/ilo_state_surface.h" + #include "ilo_screen.h" +#include "ilo_format.h" #include "ilo_resource.h" /* @@ -83,6 +88,134 @@ resource_get_cpu_init(const struct pipe_resource *templ) PIPE_BIND_STREAM_OUTPUT)) ? 
false : true; } +static enum gen_surface_type +get_surface_type(enum pipe_texture_target target) +{ + switch (target) { + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_1D_ARRAY: + return GEN6_SURFTYPE_1D; + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_2D_ARRAY: + return GEN6_SURFTYPE_2D; + case PIPE_TEXTURE_3D: + return GEN6_SURFTYPE_3D; + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + return GEN6_SURFTYPE_CUBE; + default: + assert(!"unknown texture target"); + return GEN6_SURFTYPE_NULL; + } +} + +static enum pipe_format +resource_get_image_format(const struct pipe_resource *templ, + const struct ilo_dev *dev, + bool *separate_stencil_ret) +{ + enum pipe_format format = templ->format; + bool separate_stencil; + + /* silently promote ETC1 */ + if (templ->format == PIPE_FORMAT_ETC1_RGB8) + format = PIPE_FORMAT_R8G8B8X8_UNORM; + + /* separate stencil buffers */ + separate_stencil = false; + if ((templ->bind & PIPE_BIND_DEPTH_STENCIL) && + util_format_is_depth_and_stencil(templ->format)) { + switch (templ->format) { + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + /* Gen6 requires HiZ to be available for all levels */ + if (ilo_dev_gen(dev) >= ILO_GEN(7) || templ->last_level == 0) { + format = PIPE_FORMAT_Z32_FLOAT; + separate_stencil = true; + } + break; + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + format = PIPE_FORMAT_Z24X8_UNORM; + separate_stencil = true; + break; + default: + break; + } + } + + if (separate_stencil_ret) + *separate_stencil_ret = separate_stencil; + + return format; +} + +static inline enum gen_surface_format +pipe_to_surface_format(const struct ilo_dev *dev, enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return GEN6_FORMAT_R32_FLOAT_X8X24_TYPELESS; + case PIPE_FORMAT_Z32_FLOAT: + return GEN6_FORMAT_R32_FLOAT; + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_Z24X8_UNORM: + return GEN6_FORMAT_R24_UNORM_X8_TYPELESS; + case PIPE_FORMAT_Z16_UNORM: + return GEN6_FORMAT_R16_UNORM; + case PIPE_FORMAT_S8_UINT: + return GEN6_FORMAT_R8_UINT; + default: + return ilo_format_translate_color(dev, format); + } +} + +static void +resource_get_image_info(const struct pipe_resource *templ, + const struct ilo_dev *dev, + enum pipe_format image_format, + struct ilo_image_info *info) +{ + memset(info, 0, sizeof(*info)); + + info->type = get_surface_type(templ->target); + + info->format = pipe_to_surface_format(dev, image_format); + info->interleaved_stencil = util_format_is_depth_and_stencil(image_format); + info->is_integer = util_format_is_pure_integer(image_format); + info->compressed = util_format_is_compressed(image_format); + info->block_width = util_format_get_blockwidth(image_format); + info->block_height = util_format_get_blockheight(image_format); + info->block_size = util_format_get_blocksize(image_format); + + info->width = templ->width0; + info->height = templ->height0; + info->depth = templ->depth0; + info->array_size = templ->array_size; + info->level_count = templ->last_level + 1; + info->sample_count = (templ->nr_samples) ? templ->nr_samples : 1; + + info->aux_disable = (templ->usage == PIPE_USAGE_STAGING); + + if (templ->bind & PIPE_BIND_LINEAR) + info->valid_tilings = 1 << GEN6_TILING_NONE; + + /* + * Tiled images must be mapped via GTT to get a linear view. Prefer linear + * images when the image size is greater than one-fourth of the mappable + * aperture. 
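A sketch of what resource_get_image_format() decides for the common depth/stencil templates (it is a static function; the calls below are for illustration only, and ETC1 is likewise silently promoted to R8G8B8X8):

```c
static void
check_image_format_split(const struct ilo_dev *dev)
{
   struct pipe_resource templ;
   bool sep = false;

   memset(&templ, 0, sizeof(templ));
   templ.bind = PIPE_BIND_DEPTH_STENCIL;

   /* Z24S8 always splits into a Z24X8 image plus a separate S8 texture */
   templ.format = PIPE_FORMAT_Z24_UNORM_S8_UINT;
   assert(resource_get_image_format(&templ, dev, &sep) ==
          PIPE_FORMAT_Z24X8_UNORM && sep);

   /* mipmapped Z32F_S8X24 splits on Gen7+, but stays interleaved on Gen6,
    * where HiZ (and thus separate stencil) cannot cover all levels */
   templ.format = PIPE_FORMAT_Z32_FLOAT_S8X24_UINT;
   templ.last_level = 3;
   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
      assert(resource_get_image_format(&templ, dev, &sep) ==
             PIPE_FORMAT_Z32_FLOAT && sep);
   } else {
      assert(resource_get_image_format(&templ, dev, &sep) ==
             PIPE_FORMAT_Z32_FLOAT_S8X24_UINT && !sep);
   }
}
```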
+ */ + if (templ->bind & (PIPE_BIND_TRANSFER_WRITE | PIPE_BIND_TRANSFER_READ)) + info->prefer_linear_threshold = dev->aperture_mappable / 4; + + info->bind_surface_sampler = (templ->bind & PIPE_BIND_SAMPLER_VIEW); + info->bind_surface_dp_render = (templ->bind & PIPE_BIND_RENDER_TARGET); + info->bind_surface_dp_typed = (templ->bind & + (PIPE_BIND_SHADER_IMAGE | PIPE_BIND_COMPUTE_RESOURCE)); + info->bind_zs = (templ->bind & PIPE_BIND_DEPTH_STENCIL); + info->bind_scanout = (templ->bind & PIPE_BIND_SCANOUT); + info->bind_cursor = (templ->bind & PIPE_BIND_CURSOR); +} + static enum gen_surface_tiling winsys_to_surface_tiling(enum intel_tiling_mode tiling) { @@ -178,8 +311,8 @@ tex_create_bo(struct ilo_texture *tex) if (!bo) return false; - intel_bo_unref(tex->image.bo); - tex->image.bo = bo; + intel_bo_unref(tex->vma.bo); + ilo_vma_set_bo(&tex->vma, &is->dev, bo, 0); return true; } @@ -206,7 +339,7 @@ tex_create_separate_stencil(struct ilo_texture *tex) tex->separate_s8 = ilo_texture(s8); - assert(tex->separate_s8->image.format == PIPE_FORMAT_S8_UINT); + assert(tex->separate_s8->image_format == PIPE_FORMAT_S8_UINT); return true; } @@ -215,15 +348,16 @@ static bool tex_create_hiz(struct ilo_texture *tex) { const struct pipe_resource *templ = &tex->base; + const uint32_t size = tex->image.aux.bo_stride * tex->image.aux.bo_height; struct ilo_screen *is = ilo_screen(tex->base.screen); struct intel_bo *bo; - bo = intel_winsys_alloc_bo(is->dev.winsys, "hiz texture", - tex->image.aux.bo_stride * tex->image.aux.bo_height, false); + bo = intel_winsys_alloc_bo(is->dev.winsys, "hiz texture", size, false); if (!bo) return false; - tex->image.aux.bo = bo; + ilo_vma_init(&tex->aux_vma, &is->dev, size, 4096); + ilo_vma_set_bo(&tex->aux_vma, &is->dev, bo, 0); if (tex->imported) { unsigned lv; @@ -246,17 +380,18 @@ tex_create_hiz(struct ilo_texture *tex) static bool tex_create_mcs(struct ilo_texture *tex) { + const uint32_t size = tex->image.aux.bo_stride * tex->image.aux.bo_height; struct ilo_screen *is = ilo_screen(tex->base.screen); struct intel_bo *bo; assert(tex->image.aux.enables == (1 << (tex->base.last_level + 1)) - 1); - bo = intel_winsys_alloc_bo(is->dev.winsys, "mcs texture", - tex->image.aux.bo_stride * tex->image.aux.bo_height, false); + bo = intel_winsys_alloc_bo(is->dev.winsys, "mcs texture", size, false); if (!bo) return false; - tex->image.aux.bo = bo; + ilo_vma_init(&tex->aux_vma, &is->dev, size, 4096); + ilo_vma_set_bo(&tex->aux_vma, &is->dev, bo, 0); return true; } @@ -267,8 +402,8 @@ tex_destroy(struct ilo_texture *tex) if (tex->separate_s8) tex_destroy(tex->separate_s8); - intel_bo_unref(tex->image.bo); - intel_bo_unref(tex->image.aux.bo); + intel_bo_unref(tex->vma.bo); + intel_bo_unref(tex->aux_vma.bo); tex_free_slices(tex); FREE(tex); @@ -277,24 +412,16 @@ tex_destroy(struct ilo_texture *tex) static bool tex_alloc_bos(struct ilo_texture *tex) { - struct ilo_screen *is = ilo_screen(tex->base.screen); - if (!tex->imported && !tex_create_bo(tex)) return false; - /* allocate separate stencil resource */ - if (tex->image.separate_stencil && !tex_create_separate_stencil(tex)) - return false; - switch (tex->image.aux.type) { case ILO_IMAGE_AUX_HIZ: - if (!tex_create_hiz(tex) && - !ilo_image_disable_aux(&tex->image, &is->dev)) + if (!tex_create_hiz(tex)) return false; break; case ILO_IMAGE_AUX_MCS: - if (!tex_create_mcs(tex) && - !ilo_image_disable_aux(&tex->image, &is->dev)) + if (!tex_create_mcs(tex)) return false; break; default: @@ -304,9 +431,10 @@ tex_alloc_bos(struct ilo_texture *tex) 
return true; } -static bool +static struct intel_bo * tex_import_handle(struct ilo_texture *tex, - const struct winsys_handle *handle) + const struct winsys_handle *handle, + struct ilo_image_info *info) { struct ilo_screen *is = ilo_screen(tex->base.screen); const struct pipe_resource *templ = &tex->base; @@ -317,45 +445,94 @@ tex_import_handle(struct ilo_texture *tex, bo = intel_winsys_import_handle(is->dev.winsys, name, handle, tex->image.bo_height, &tiling, &pitch); - if (!bo) - return false; + /* modify image info */ + if (bo) { + const uint8_t valid_tilings = 1 << winsys_to_surface_tiling(tiling); - if (!ilo_image_init_for_imported(&tex->image, &is->dev, templ, - winsys_to_surface_tiling(tiling), pitch)) { - ilo_err("failed to import handle for texture\n"); - intel_bo_unref(bo); - return false; - } + if (info->valid_tilings && !(info->valid_tilings & valid_tilings)) { + intel_bo_unref(bo); + return NULL; + } - tex->image.bo = bo; + info->valid_tilings = valid_tilings; + info->force_bo_stride = pitch; - tex->imported = true; + /* assume imported RTs are also scanouts */ + if (!info->bind_scanout) + info->bind_scanout = (templ->usage & PIPE_BIND_RENDER_TARGET); + } - return true; + return bo; } static bool tex_init_image(struct ilo_texture *tex, - const struct winsys_handle *handle) + const struct winsys_handle *handle, + bool *separate_stencil) { struct ilo_screen *is = ilo_screen(tex->base.screen); const struct pipe_resource *templ = &tex->base; struct ilo_image *img = &tex->image; + struct intel_bo *imported_bo = NULL;; + struct ilo_image_info info; + + tex->image_format = resource_get_image_format(templ, + &is->dev, separate_stencil); + resource_get_image_info(templ, &is->dev, tex->image_format, &info); if (handle) { - if (!tex_import_handle(tex, handle)) + imported_bo = tex_import_handle(tex, handle, &info); + if (!imported_bo) return false; - } else { - ilo_image_init(img, &is->dev, templ); } - if (img->bo_height > ilo_max_resource_size / img->bo_stride) + if (!ilo_image_init(img, &is->dev, &info)) { + intel_bo_unref(imported_bo); return false; + } + + /* + * HiZ requires 8x4 alignment and some levels might need HiZ disabled. It + * is generally fine except on Gen6, where HiZ and separate stencil must be + * enabled together. For PIPE_FORMAT_Z24X8_UNORM with separate stencil, we + * can live with stencil values being interleaved for levels where HiZ is + * disabled. But it is not the case for PIPE_FORMAT_Z32_FLOAT with + * separate stencil. If HiZ was disabled for a level, we had to change the + * format to PIPE_FORMAT_Z32_FLOAT_S8X24_UINT for the level and that format + * had a different bpp. In other words, HiZ has to be available for all + * levels. 
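The "different bpp" the comment refers to is concrete: the split and interleaved Z32 formats differ by a factor of two, so flipping a single level's format would change that level's size and shift every later level's offset. A minimal check:

```c
#include <assert.h>
#include "util/u_format.h"

static void
check_z32f_blocksizes(void)
{
   assert(util_format_get_blocksize(PIPE_FORMAT_Z32_FLOAT) == 4);
   assert(util_format_get_blocksize(PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) == 8);
}
```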
+ */ + if (ilo_dev_gen(&is->dev) == ILO_GEN(6) && + templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT && + tex->image_format == PIPE_FORMAT_Z32_FLOAT && + img->aux.enables != (1 << templ->last_level)) { + tex->image_format = templ->format; + info.format = pipe_to_surface_format(&is->dev, tex->image_format); + info.interleaved_stencil = true; + + memset(img, 0, sizeof(*img)); + if (!ilo_image_init(img, &is->dev, &info)) { + intel_bo_unref(imported_bo); + return false; + } + } + + if (img->bo_height > ilo_max_resource_size / img->bo_stride || + !ilo_vma_init(&tex->vma, &is->dev, img->bo_stride * img->bo_height, + 4096)) { + intel_bo_unref(imported_bo); + return false; + } + + if (imported_bo) { + ilo_vma_set_bo(&tex->vma, &is->dev, imported_bo, 0); + tex->imported = true; + } if (templ->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) { /* require on-the-fly tiling/untiling or format conversion */ - if (img->tiling == GEN8_TILING_W || img->separate_stencil || - img->format != templ->format) + if (img->tiling == GEN8_TILING_W || *separate_stencil || + tex->image_format != templ->format) return false; } @@ -371,6 +548,7 @@ tex_create(struct pipe_screen *screen, const struct winsys_handle *handle) { struct ilo_texture *tex; + bool separate_stencil; tex = CALLOC_STRUCT(ilo_texture); if (!tex) @@ -380,12 +558,13 @@ tex_create(struct pipe_screen *screen, tex->base.screen = screen; pipe_reference_init(&tex->base.reference, 1); - if (!tex_init_image(tex, handle)) { + if (!tex_init_image(tex, handle, &separate_stencil)) { FREE(tex); return NULL; } - if (!tex_alloc_bos(tex)) { + if (!tex_alloc_bos(tex) || + (separate_stencil && !tex_create_separate_stencil(tex))) { tex_destroy(tex); return NULL; } @@ -406,7 +585,7 @@ tex_get_handle(struct ilo_texture *tex, struct winsys_handle *handle) else tiling = surface_to_winsys_tiling(tex->image.tiling); - err = intel_winsys_export_handle(is->dev.winsys, tex->image.bo, tiling, + err = intel_winsys_export_handle(is->dev.winsys, tex->vma.bo, tiling, tex->image.bo_stride, tex->image.bo_height, handle); return !err; @@ -420,13 +599,12 @@ buf_create_bo(struct ilo_buffer_resource *buf) const bool cpu_init = resource_get_cpu_init(&buf->base); struct intel_bo *bo; - bo = intel_winsys_alloc_bo(is->dev.winsys, name, - buf->buffer.bo_size, cpu_init); + bo = intel_winsys_alloc_bo(is->dev.winsys, name, buf->bo_size, cpu_init); if (!bo) return false; - intel_bo_unref(buf->buffer.bo); - buf->buffer.bo = bo; + intel_bo_unref(buf->vma.bo); + ilo_vma_set_bo(&buf->vma, &is->dev, bo, 0); return true; } @@ -434,7 +612,7 @@ buf_create_bo(struct ilo_buffer_resource *buf) static void buf_destroy(struct ilo_buffer_resource *buf) { - intel_bo_unref(buf->buffer.bo); + intel_bo_unref(buf->vma.bo); FREE(buf); } @@ -443,6 +621,7 @@ buf_create(struct pipe_screen *screen, const struct pipe_resource *templ) { const struct ilo_screen *is = ilo_screen(screen); struct ilo_buffer_resource *buf; + uint32_t alignment; unsigned size; buf = CALLOC_STRUCT(ilo_buffer_resource); @@ -471,10 +650,17 @@ buf_create(struct pipe_screen *screen, const struct pipe_resource *templ) ilo_dev_gen(&is->dev) < ILO_GEN(7.5)) size = align(size, 4096); - ilo_buffer_init(&buf->buffer, &is->dev, size, templ->bind, templ->flags); + if (templ->bind & PIPE_BIND_VERTEX_BUFFER) + size = ilo_state_vertex_buffer_size(&is->dev, size, &alignment); + if (templ->bind & PIPE_BIND_INDEX_BUFFER) + size = ilo_state_index_buffer_size(&is->dev, size, &alignment); + if (templ->bind & PIPE_BIND_STREAM_OUTPUT) + size = 
ilo_state_sol_buffer_size(&is->dev, size, &alignment); + + buf->bo_size = size; + ilo_vma_init(&buf->vma, &is->dev, buf->bo_size, 4096); - if (buf->buffer.bo_size < templ->width0 || - buf->buffer.bo_size > ilo_max_resource_size || + if (buf->bo_size < templ->width0 || buf->bo_size > ilo_max_resource_size || !buf_create_bo(buf)) { FREE(buf); return NULL; @@ -487,13 +673,30 @@ static boolean ilo_can_create_resource(struct pipe_screen *screen, const struct pipe_resource *templ) { + struct ilo_screen *is = ilo_screen(screen); + enum pipe_format image_format; + struct ilo_image_info info; struct ilo_image img; if (templ->target == PIPE_BUFFER) return (templ->width0 <= ilo_max_resource_size); + image_format = resource_get_image_format(templ, &is->dev, NULL); + resource_get_image_info(templ, &is->dev, image_format, &info); + memset(&img, 0, sizeof(img)); - ilo_image_init(&img, &ilo_screen(screen)->dev, templ); + ilo_image_init(&img, &ilo_screen(screen)->dev, &info); + + /* as in tex_init_image() */ + if (ilo_dev_gen(&is->dev) == ILO_GEN(6) && + templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT && + image_format == PIPE_FORMAT_Z32_FLOAT && + img.aux.enables != (1 << templ->last_level)) { + info.format = pipe_to_surface_format(&is->dev, templ->format); + info.interleaved_stencil = true; + memset(&img, 0, sizeof(img)); + ilo_image_init(&img, &ilo_screen(screen)->dev, &info); + } return (img.bo_height <= ilo_max_resource_size / img.bo_stride); } diff --git a/src/gallium/drivers/ilo/ilo_resource.h b/src/gallium/drivers/ilo/ilo_resource.h index d602e0cbf70..8378af54741 100644 --- a/src/gallium/drivers/ilo/ilo_resource.h +++ b/src/gallium/drivers/ilo/ilo_resource.h @@ -29,8 +29,8 @@ #define ILO_RESOURCE_H #include "core/intel_winsys.h" -#include "core/ilo_buffer.h" #include "core/ilo_image.h" +#include "core/ilo_vma.h" #include "ilo_common.h" #include "ilo_screen.h" @@ -92,7 +92,10 @@ struct ilo_texture { bool imported; + enum pipe_format image_format; struct ilo_image image; + struct ilo_vma vma; + struct ilo_vma aux_vma; /* XXX thread-safety */ struct ilo_texture_slice *slices[PIPE_MAX_TEXTURE_LEVELS]; @@ -103,14 +106,15 @@ struct ilo_texture { struct ilo_buffer_resource { struct pipe_resource base; - struct ilo_buffer buffer; + uint32_t bo_size; + struct ilo_vma vma; }; -static inline struct ilo_buffer * -ilo_buffer(struct pipe_resource *res) +static inline struct ilo_buffer_resource * +ilo_buffer_resource(struct pipe_resource *res) { - return (res && res->target == PIPE_BUFFER) ? - &((struct ilo_buffer_resource *) res)->buffer : NULL; + return (struct ilo_buffer_resource *) + ((res && res->target == PIPE_BUFFER) ? res : NULL); } static inline struct ilo_texture * @@ -127,13 +131,14 @@ bool ilo_resource_rename_bo(struct pipe_resource *res); /** - * Return the bo of the resource. + * Return the VMA of the resource. */ -static inline struct intel_bo * -ilo_resource_get_bo(struct pipe_resource *res) +static inline const struct ilo_vma * +ilo_resource_get_vma(struct pipe_resource *res) { return (res->target == PIPE_BUFFER) ? 
- ilo_buffer(res)->bo : ilo_texture(res)->image.bo; + &((struct ilo_buffer_resource *) res)->vma : + &((struct ilo_texture *) res)->vma; } static inline struct ilo_texture_slice * diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c index 94105559b80..ab4d1377c9f 100644 --- a/src/gallium/drivers/ilo/ilo_screen.c +++ b/src/gallium/drivers/ilo/ilo_screen.c @@ -193,6 +193,7 @@ ilo_get_compute_param(struct pipe_screen *screen, uint32_t max_clock_frequency; uint32_t max_compute_units; uint32_t images_supported; + uint32_t subgroup_size; } val; const void *ptr; int size; @@ -284,6 +285,13 @@ ilo_get_compute_param(struct pipe_screen *screen, ptr = &val.images_supported; size = sizeof(val.images_supported); break; + case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: + /* best case is actually SIMD32 */ + val.subgroup_size = 16; + + ptr = &val.subgroup_size; + size = sizeof(val.subgroup_size); + break; default: ptr = NULL; size = 0; @@ -443,6 +451,8 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_TEXTURE_GATHER_SM5: return 0; case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: return true; case PIPE_CAP_FAKE_SW_MSAA: case PIPE_CAP_TEXTURE_QUERY_LOD: @@ -457,6 +467,8 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 0; case PIPE_CAP_VENDOR_ID: @@ -665,13 +677,6 @@ ilo_screen_fence_finish(struct pipe_screen *screen, return signaled; } -static boolean -ilo_screen_fence_signalled(struct pipe_screen *screen, - struct pipe_fence_handle *fence) -{ - return ilo_screen_fence_finish(screen, fence, 0); -} - /** * Create a fence for \p bo. When \p bo is not NULL, it must be submitted * before waited on or checked. 
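With fence_signalled removed just above (it was only a zero-timeout fence_finish, as the deleted wrapper shows), callers can poll as sketched below; the ilo_context.c hunk earlier shows the matching reference discipline of releasing the old fence through fence_reference() instead of silently overwriting it:

```c
/* what pipe_screen::fence_signalled amounted to: a zero-timeout wait */
static boolean
fence_signalled(struct pipe_screen *screen, struct pipe_fence_handle *fence)
{
   return screen->fence_finish(screen, fence, 0);
}
```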
@@ -738,7 +743,6 @@ ilo_screen_create(struct intel_winsys *ws) is->base.flush_frontbuffer = NULL; is->base.fence_reference = ilo_screen_fence_reference; - is->base.fence_signalled = ilo_screen_fence_signalled; is->base.fence_finish = ilo_screen_fence_finish; is->base.get_driver_query_info = NULL; diff --git a/src/gallium/drivers/ilo/ilo_state.c b/src/gallium/drivers/ilo/ilo_state.c index 63534f33fa7..d89765a9d23 100644 --- a/src/gallium/drivers/ilo/ilo_state.c +++ b/src/gallium/drivers/ilo/ilo_state.c @@ -379,13 +379,12 @@ finalize_cbuf_state(struct ilo_context *ilo, u_upload_data(ilo->uploader, 0, cbuf->cso[i].info.size, cbuf->cso[i].user_buffer, &offset, &cbuf->cso[i].resource); - cbuf->cso[i].info.buf = ilo_buffer(cbuf->cso[i].resource); + cbuf->cso[i].info.vma = ilo_resource_get_vma(cbuf->cso[i].resource); cbuf->cso[i].info.offset = offset; memset(&cbuf->cso[i].surface, 0, sizeof(cbuf->cso[i].surface)); ilo_state_surface_init_for_buffer(&cbuf->cso[i].surface, ilo->dev, &cbuf->cso[i].info); - cbuf->cso[i].surface.bo = cbuf->cso[i].info.buf->bo; ilo->state_vector.dirty |= ILO_DIRTY_CBUF; } @@ -466,11 +465,9 @@ finalize_index_buffer(struct ilo_context *ilo) memset(&info, 0, sizeof(info)); if (vec->ib.hw_resource) { - info.buf = ilo_buffer(vec->ib.hw_resource); - info.size = info.buf->bo_size; + info.vma = ilo_resource_get_vma(vec->ib.hw_resource); + info.size = info.vma->vm_size; info.format = ilo_translate_index_size(vec->ib.hw_index_size); - - vec->ib.ib.bo = info.buf->bo; } ilo_state_index_buffer_set_info(&vec->ib.ib, dev, &info); @@ -532,13 +529,11 @@ finalize_vertex_buffers(struct ilo_context *ilo) const struct pipe_vertex_buffer *cso = &vec->vb.states[pipe_idx]; if (cso->buffer) { - info.buf = ilo_buffer(cso->buffer); + info.vma = ilo_resource_get_vma(cso->buffer); info.offset = cso->buffer_offset; - info.size = info.buf->bo_size; + info.size = info.vma->vm_size - cso->buffer_offset; info.stride = cso->stride; - - vec->vb.vb[i].bo = info.buf->bo; } else { memset(&info, 0, sizeof(info)); } @@ -1566,24 +1561,23 @@ ilo_set_constant_buffer(struct pipe_context *pipe, cso->info.size = buf[i].buffer_size; if (buf[i].buffer) { - cso->info.buf = ilo_buffer(buf[i].buffer); + cso->info.vma = ilo_resource_get_vma(buf[i].buffer); cso->info.offset = buf[i].buffer_offset; memset(&cso->surface, 0, sizeof(cso->surface)); ilo_state_surface_init_for_buffer(&cso->surface, dev, &cso->info); - cso->surface.bo = cso->info.buf->bo; cso->user_buffer = NULL; cbuf->enabled_mask |= 1 << (index + i); } else if (buf[i].user_buffer) { - cso->info.buf = NULL; + cso->info.vma = NULL; /* buffer_offset does not apply for user buffer */ cso->user_buffer = buf[i].user_buffer; cbuf->enabled_mask |= 1 << (index + i); } else { - cso->info.buf = NULL; + cso->info.vma = NULL; cso->info.size = 0; cso->user_buffer = NULL; @@ -1596,7 +1590,7 @@ ilo_set_constant_buffer(struct pipe_context *pipe, pipe_resource_reference(&cso->resource, NULL); - cso->info.buf = NULL; + cso->info.vma = NULL; cso->info.size = 0; cso->user_buffer = NULL; @@ -1705,10 +1699,11 @@ ilo_set_framebuffer_state(struct pipe_context *pipe, if (state->zsbuf) { const struct ilo_surface_cso *cso = (const struct ilo_surface_cso *) state->zsbuf; + const struct ilo_texture *tex = ilo_texture(cso->base.texture); - fb->has_hiz = cso->u.zs.hiz_bo; + fb->has_hiz = cso->u.zs.hiz_vma; fb->depth_offset_format = - ilo_state_zs_get_depth_format(&cso->u.zs, dev); + ilo_format_translate_depth(dev, tex->image_format); } else { fb->has_hiz = false; fb->depth_offset_format = 
GEN6_ZFORMAT_D32_FLOAT; @@ -1854,10 +1849,11 @@ ilo_set_sampler_views(struct pipe_context *pipe, unsigned shader, } static void -ilo_set_shader_resources(struct pipe_context *pipe, - unsigned start, unsigned count, - struct pipe_surface **surfaces) +ilo_set_shader_images(struct pipe_context *pipe, unsigned shader, + unsigned start, unsigned count, + struct pipe_image_view **views) { +#if 0 struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector; struct ilo_resource_state *dst = &vec->resource; unsigned i; @@ -1886,6 +1882,7 @@ ilo_set_shader_resources(struct pipe_context *pipe, } vec->dirty |= ILO_DIRTY_RESOURCE; +#endif } static void @@ -1945,12 +1942,11 @@ ilo_create_stream_output_target(struct pipe_context *pipe, target->base.buffer_size = buffer_size; memset(&info, 0, sizeof(info)); - info.buf = ilo_buffer(res); + info.vma = ilo_resource_get_vma(res); info.offset = buffer_offset; info.size = buffer_size; ilo_state_sol_buffer_init(&target->sb, dev, &info); - target->sb.bo = info.buf->bo; return &target->base; } @@ -2018,18 +2014,17 @@ ilo_create_sampler_view(struct pipe_context *pipe, struct ilo_state_surface_buffer_info info; memset(&info, 0, sizeof(info)); - info.buf = ilo_buffer(res); + info.vma = ilo_resource_get_vma(res); + info.offset = templ->u.buf.first_element * info.struct_size; + info.size = (templ->u.buf.last_element - + templ->u.buf.first_element + 1) * info.struct_size; info.access = ILO_STATE_SURFACE_ACCESS_SAMPLER; info.format = ilo_format_translate_color(dev, templ->format); info.format_size = util_format_get_blocksize(templ->format); info.struct_size = info.format_size; info.readonly = true; - info.offset = templ->u.buf.first_element * info.struct_size; - info.size = (templ->u.buf.last_element - - templ->u.buf.first_element + 1) * info.struct_size; ilo_state_surface_init_for_buffer(&view->surface, dev, &info); - view->surface.bo = info.buf->bo; } else { struct ilo_texture *tex = ilo_texture(res); struct ilo_state_surface_image_info info; @@ -2042,32 +2037,31 @@ ilo_create_sampler_view(struct pipe_context *pipe, } memset(&info, 0, sizeof(info)); + info.img = &tex->image; + info.level_base = templ->u.tex.first_level; + info.level_count = templ->u.tex.last_level - + templ->u.tex.first_level + 1; + info.slice_base = templ->u.tex.first_layer; + info.slice_count = templ->u.tex.last_layer - + templ->u.tex.first_layer + 1; + info.vma = &tex->vma; info.access = ILO_STATE_SURFACE_ACCESS_SAMPLER; + info.type = tex->image.type; if (templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT && - tex->image.separate_stencil) { + tex->separate_s8) { info.format = ilo_format_translate_texture(dev, PIPE_FORMAT_Z32_FLOAT); } else { info.format = ilo_format_translate_texture(dev, templ->format); } - info.is_cube_map = (tex->image.target == PIPE_TEXTURE_CUBE || - tex->image.target == PIPE_TEXTURE_CUBE_ARRAY); info.is_array = util_resource_is_array_texture(&tex->base); info.readonly = true; - info.level_base = templ->u.tex.first_level; - info.level_count = templ->u.tex.last_level - - templ->u.tex.first_level + 1; - info.slice_base = templ->u.tex.first_layer; - info.slice_count = templ->u.tex.last_layer - - templ->u.tex.first_layer + 1; - ilo_state_surface_init_for_image(&view->surface, dev, &info); - view->surface.bo = info.img->bo; } return &view->base; @@ -2111,18 +2105,27 @@ ilo_create_surface(struct pipe_context *pipe, assert(tex->base.target != PIPE_BUFFER); memset(&info, 0, sizeof(info)); + info.img = &tex->image; - info.access = ILO_STATE_SURFACE_ACCESS_DP_RENDER; - info.format = 
ilo_format_translate_render(dev, templ->format); - info.is_array = util_resource_is_array_texture(&tex->base); info.level_base = templ->u.tex.level; info.level_count = 1; info.slice_base = templ->u.tex.first_layer; info.slice_count = templ->u.tex.last_layer - templ->u.tex.first_layer + 1; + info.vma = &tex->vma; + if (ilo_image_can_enable_aux(&tex->image, templ->u.tex.level)) + info.aux_vma = &tex->aux_vma; + + info.access = ILO_STATE_SURFACE_ACCESS_DP_RENDER; + + info.type = (tex->image.type == GEN6_SURFTYPE_CUBE) ? + GEN6_SURFTYPE_2D : tex->image.type; + + info.format = ilo_format_translate_render(dev, templ->format); + info.is_array = util_resource_is_array_texture(&tex->base); + ilo_state_surface_init_for_image(&surf->u.rt, dev, &info); - surf->u.rt.bo = info.img->bo; } else { struct ilo_state_zs_info info; @@ -2131,13 +2134,19 @@ ilo_create_surface(struct pipe_context *pipe, memset(&info, 0, sizeof(info)); if (templ->format == PIPE_FORMAT_S8_UINT) { + info.s_vma = &tex->vma; info.s_img = &tex->image; } else { + info.z_vma = &tex->vma; info.z_img = &tex->image; - info.s_img = (tex->separate_s8) ? &tex->separate_s8->image : NULL; - info.hiz_enable = - ilo_image_can_enable_aux(&tex->image, templ->u.tex.level); + if (tex->separate_s8) { + info.s_vma = &tex->separate_s8->vma; + info.s_img = &tex->separate_s8->image; + } + + if (ilo_image_can_enable_aux(&tex->image, templ->u.tex.level)) + info.hiz_vma = &tex->aux_vma; } info.level = templ->u.tex.level; @@ -2145,16 +2154,15 @@ ilo_create_surface(struct pipe_context *pipe, info.slice_count = templ->u.tex.last_layer - templ->u.tex.first_layer + 1; - ilo_state_zs_init(&surf->u.zs, dev, &info); + info.type = (tex->image.type == GEN6_SURFTYPE_CUBE) ? + GEN6_SURFTYPE_2D : tex->image.type; - if (info.z_img) { - surf->u.zs.depth_bo = info.z_img->bo; - if (info.hiz_enable) - surf->u.zs.hiz_bo = info.z_img->aux.bo; - } + info.format = ilo_format_translate_depth(dev, tex->image_format); + if (ilo_dev_gen(dev) == ILO_GEN(6) && !info.hiz_vma && + tex->image_format == PIPE_FORMAT_Z24X8_UNORM) + info.format = GEN6_ZFORMAT_D24_UNORM_S8_UINT; - if (info.s_img) - surf->u.zs.stencil_bo = info.s_img->bo; + ilo_state_zs_init(&surf->u.zs, dev, &info); } return &surf->base; @@ -2339,7 +2347,7 @@ ilo_init_state_functions(struct ilo_context *ilo) ilo->base.set_scissor_states = ilo_set_scissor_states; ilo->base.set_viewport_states = ilo_set_viewport_states; ilo->base.set_sampler_views = ilo_set_sampler_views; - ilo->base.set_shader_resources = ilo_set_shader_resources; + ilo->base.set_shader_images = ilo_set_shader_images; ilo->base.set_vertex_buffers = ilo_set_vertex_buffers; ilo->base.set_index_buffer = ilo_set_index_buffer; @@ -2451,7 +2459,6 @@ void ilo_state_vector_resource_renamed(struct ilo_state_vector *vec, struct pipe_resource *res) { - struct intel_bo *bo = ilo_resource_get_bo(res); uint32_t states = 0; unsigned sh, i; @@ -2482,10 +2489,6 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec, for (i = 0; i < vec->so.count; i++) { if (vec->so.states[i]->buffer == res) { - struct ilo_stream_output_target *target = - (struct ilo_stream_output_target *) vec->so.states[i]; - - target->sb.bo = ilo_buffer(res)->bo; states |= ILO_DIRTY_SO; break; } @@ -2503,7 +2506,6 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec, [PIPE_SHADER_GEOMETRY] = ILO_DIRTY_VIEW_GS, [PIPE_SHADER_COMPUTE] = ILO_DIRTY_VIEW_CS, }; - cso->surface.bo = bo; states |= view_dirty_bits[sh]; break; @@ -2515,7 +2517,6 @@ ilo_state_vector_resource_renamed(struct 
ilo_state_vector *vec, struct ilo_cbuf_cso *cbuf = &vec->cbuf[sh].cso[i]; if (cbuf->resource == res) { - cbuf->surface.bo = bo; states |= ILO_DIRTY_CBUF; break; } @@ -2528,7 +2529,6 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec, (struct ilo_surface_cso *) vec->resource.states[i]; if (cso->base.texture == res) { - cso->u.rt.bo = bo; states |= ILO_DIRTY_RESOURCE; break; } @@ -2540,27 +2540,19 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec, struct ilo_surface_cso *cso = (struct ilo_surface_cso *) vec->fb.state.cbufs[i]; if (cso && cso->base.texture == res) { - cso->u.rt.bo = bo; states |= ILO_DIRTY_FB; break; } } - if (vec->fb.state.zsbuf && vec->fb.state.zsbuf->texture == res) { - struct ilo_surface_cso *cso = - (struct ilo_surface_cso *) vec->fb.state.zsbuf; - - cso->u.zs.depth_bo = bo; - + if (vec->fb.state.zsbuf && vec->fb.state.zsbuf->texture == res) states |= ILO_DIRTY_FB; - } } for (i = 0; i < vec->cs_resource.count; i++) { struct ilo_surface_cso *cso = (struct ilo_surface_cso *) vec->cs_resource.states[i]; if (cso->base.texture == res) { - cso->u.rt.bo = bo; states |= ILO_DIRTY_CS_RESOURCE; break; } diff --git a/src/gallium/drivers/ilo/ilo_state.h b/src/gallium/drivers/ilo/ilo_state.h index 3e6fd8a2554..66c93007eb1 100644 --- a/src/gallium/drivers/ilo/ilo_state.h +++ b/src/gallium/drivers/ilo/ilo_state.h @@ -202,7 +202,7 @@ struct ilo_cbuf_state { }; struct ilo_resource_state { - struct pipe_surface *states[PIPE_MAX_SHADER_RESOURCES]; + struct pipe_surface *states[PIPE_MAX_SHADER_IMAGES]; unsigned count; }; diff --git a/src/gallium/drivers/ilo/ilo_transfer.c b/src/gallium/drivers/ilo/ilo_transfer.c index ec41473f94a..5abd3bebf68 100644 --- a/src/gallium/drivers/ilo/ilo_transfer.c +++ b/src/gallium/drivers/ilo/ilo_transfer.c @@ -100,7 +100,7 @@ resource_get_transfer_method(struct pipe_resource *res, m = ILO_TRANSFER_MAP_SW_ZS; need_convert = true; } - } else if (tex->image.format != tex->base.format) { + } else if (tex->image_format != tex->base.format) { m = ILO_TRANSFER_MAP_SW_CONVERT; need_convert = true; } @@ -268,23 +268,27 @@ xfer_alloc_staging_sys(struct ilo_transfer *xfer) static void * xfer_map(struct ilo_transfer *xfer) { + const struct ilo_vma *vma; void *ptr; switch (xfer->method) { case ILO_TRANSFER_MAP_CPU: - ptr = intel_bo_map(ilo_resource_get_bo(xfer->base.resource), - xfer->base.usage & PIPE_TRANSFER_WRITE); + vma = ilo_resource_get_vma(xfer->base.resource); + ptr = intel_bo_map(vma->bo, xfer->base.usage & PIPE_TRANSFER_WRITE); break; case ILO_TRANSFER_MAP_GTT: - ptr = intel_bo_map_gtt(ilo_resource_get_bo(xfer->base.resource)); + vma = ilo_resource_get_vma(xfer->base.resource); + ptr = intel_bo_map_gtt(vma->bo); break; case ILO_TRANSFER_MAP_GTT_ASYNC: - ptr = intel_bo_map_gtt_async(ilo_resource_get_bo(xfer->base.resource)); + vma = ilo_resource_get_vma(xfer->base.resource); + ptr = intel_bo_map_gtt_async(vma->bo); break; case ILO_TRANSFER_MAP_STAGING: { const struct ilo_screen *is = ilo_screen(xfer->staging.res->screen); - struct intel_bo *bo = ilo_resource_get_bo(xfer->staging.res); + + vma = ilo_resource_get_vma(xfer->staging.res); /* * We want a writable, optionally persistent and coherent, mapping @@ -292,25 +296,29 @@ xfer_map(struct ilo_transfer *xfer) * this turns out to be fairly simple. 
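The transfer hunks here all follow one pattern: intel_bo_map*() returns a bo-relative pointer, so each successful mapping is biased by the VMA's byte offset before use. In miniature:

```c
static void *
map_vma(const struct ilo_vma *vma, bool for_write)
{
   void *ptr = intel_bo_map(vma->bo, for_write);

   /* step past other suballocations sharing the bo */
   if (ptr)
      ptr = (void *) ((char *) ptr + vma->bo_offset);

   return ptr;
}
```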
*/ if (is->dev.has_llc) - ptr = intel_bo_map(bo, true); + ptr = intel_bo_map(vma->bo, true); else - ptr = intel_bo_map_gtt(bo); + ptr = intel_bo_map_gtt(vma->bo); if (ptr && xfer->staging.res->target == PIPE_BUFFER) ptr += (xfer->base.box.x % ILO_TRANSFER_MAP_BUFFER_ALIGNMENT); - } break; case ILO_TRANSFER_MAP_SW_CONVERT: case ILO_TRANSFER_MAP_SW_ZS: + vma = NULL; ptr = xfer->staging.sys; break; default: assert(!"unknown mapping method"); + vma = NULL; ptr = NULL; break; } + if (ptr && vma) + ptr = (void *) ((char *) ptr + vma->bo_offset); + return ptr; } @@ -324,10 +332,10 @@ xfer_unmap(struct ilo_transfer *xfer) case ILO_TRANSFER_MAP_CPU: case ILO_TRANSFER_MAP_GTT: case ILO_TRANSFER_MAP_GTT_ASYNC: - intel_bo_unmap(ilo_resource_get_bo(xfer->base.resource)); + intel_bo_unmap(ilo_resource_get_vma(xfer->base.resource)->bo); break; case ILO_TRANSFER_MAP_STAGING: - intel_bo_unmap(ilo_resource_get_bo(xfer->staging.res)); + intel_bo_unmap(ilo_resource_get_vma(xfer->staging.res)->bo); break; default: break; @@ -541,9 +549,12 @@ tex_staging_sys_map_bo(struct ilo_texture *tex, if (prefer_cpu && (tex->image.tiling == GEN6_TILING_NONE || !linear_view)) - ptr = intel_bo_map(tex->image.bo, !for_read_back); + ptr = intel_bo_map(tex->vma.bo, !for_read_back); else - ptr = intel_bo_map_gtt(tex->image.bo); + ptr = intel_bo_map_gtt(tex->vma.bo); + + if (ptr) + ptr = (void *) ((char *) ptr + tex->vma.bo_offset); return ptr; } @@ -551,7 +562,7 @@ tex_staging_sys_map_bo(struct ilo_texture *tex, static void tex_staging_sys_unmap_bo(struct ilo_texture *tex) { - intel_bo_unmap(tex->image.bo); + intel_bo_unmap(tex->vma.bo); } static bool @@ -590,7 +601,7 @@ tex_staging_sys_zs_read(struct ilo_texture *tex, s8_tile_offset = tex_tile_choose_offset_func(s8_tex, &s8_tiles_per_row); if (tex->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT) { - assert(tex->image.format == PIPE_FORMAT_Z24X8_UNORM); + assert(tex->image_format == PIPE_FORMAT_Z24X8_UNORM); dst_cpp = 4; dst_s8_pos = 3; @@ -598,7 +609,7 @@ tex_staging_sys_zs_read(struct ilo_texture *tex, } else { assert(tex->base.format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); - assert(tex->image.format == PIPE_FORMAT_Z32_FLOAT); + assert(tex->image_format == PIPE_FORMAT_Z32_FLOAT); dst_cpp = 8; dst_s8_pos = 4; @@ -644,7 +655,7 @@ tex_staging_sys_zs_read(struct ilo_texture *tex, tex_staging_sys_unmap_bo(s8_tex); } else { - assert(tex->image.format == PIPE_FORMAT_S8_UINT); + assert(tex->image_format == PIPE_FORMAT_S8_UINT); for (slice = 0; slice < box->depth; slice++) { unsigned mem_x, mem_y; @@ -717,7 +728,7 @@ tex_staging_sys_zs_write(struct ilo_texture *tex, s8_tile_offset = tex_tile_choose_offset_func(s8_tex, &s8_tiles_per_row); if (tex->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT) { - assert(tex->image.format == PIPE_FORMAT_Z24X8_UNORM); + assert(tex->image_format == PIPE_FORMAT_Z24X8_UNORM); src_cpp = 4; src_s8_pos = 3; @@ -725,7 +736,7 @@ tex_staging_sys_zs_write(struct ilo_texture *tex, } else { assert(tex->base.format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); - assert(tex->image.format == PIPE_FORMAT_Z32_FLOAT); + assert(tex->image_format == PIPE_FORMAT_Z32_FLOAT); src_cpp = 8; src_s8_pos = 4; @@ -771,7 +782,7 @@ tex_staging_sys_zs_write(struct ilo_texture *tex, tex_staging_sys_unmap_bo(s8_tex); } else { - assert(tex->image.format == PIPE_FORMAT_S8_UINT); + assert(tex->image_format == PIPE_FORMAT_S8_UINT); for (slice = 0; slice < box->depth; slice++) { unsigned mem_x, mem_y; @@ -829,8 +840,8 @@ tex_staging_sys_convert_write(struct ilo_texture *tex, else dst_slice_stride = 0; - 
if (unlikely(tex->image.format == tex->base.format)) { - util_copy_box(dst, tex->image.format, tex->image.bo_stride, + if (unlikely(tex->image_format == tex->base.format)) { + util_copy_box(dst, tex->image_format, tex->image.bo_stride, dst_slice_stride, 0, 0, 0, box->width, box->height, box->depth, xfer->staging.sys, xfer->base.stride, xfer->base.layer_stride, 0, 0, 0); @@ -842,7 +853,7 @@ tex_staging_sys_convert_write(struct ilo_texture *tex, switch (tex->base.format) { case PIPE_FORMAT_ETC1_RGB8: - assert(tex->image.format == PIPE_FORMAT_R8G8B8X8_UNORM); + assert(tex->image_format == PIPE_FORMAT_R8G8B8X8_UNORM); for (slice = 0; slice < box->depth; slice++) { const void *src = @@ -1055,7 +1066,7 @@ choose_transfer_method(struct ilo_context *ilo, struct ilo_transfer *xfer) return false; /* see if we can avoid blocking */ - if (is_bo_busy(ilo, ilo_resource_get_bo(res), &need_submit)) { + if (is_bo_busy(ilo, ilo_resource_get_vma(res)->bo, &need_submit)) { bool resource_renamed; if (!xfer_unblock(xfer, &resource_renamed)) { @@ -1078,11 +1089,11 @@ static void buf_pwrite(struct ilo_context *ilo, struct pipe_resource *res, unsigned usage, int offset, int size, const void *data) { - struct ilo_buffer *buf = ilo_buffer(res); + struct ilo_buffer_resource *buf = ilo_buffer_resource(res); bool need_submit; /* see if we can avoid blocking */ - if (is_bo_busy(ilo, buf->bo, &need_submit)) { + if (is_bo_busy(ilo, buf->vma.bo, &need_submit)) { bool unblocked = false; if ((usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) && @@ -1103,9 +1114,12 @@ buf_pwrite(struct ilo_context *ilo, struct pipe_resource *res, templ.bind = PIPE_BIND_TRANSFER_WRITE; staging = ilo->base.screen->resource_create(ilo->base.screen, &templ); if (staging) { + const struct ilo_vma *staging_vma = ilo_resource_get_vma(staging); struct pipe_box staging_box; - intel_bo_pwrite(ilo_buffer(staging)->bo, 0, size, data); + /* offset by staging_vma->bo_offset for pwrite */ + intel_bo_pwrite(staging_vma->bo, staging_vma->bo_offset, + size, data); u_box_1d(0, size, &staging_box); ilo_blitter_blt_copy_resource(ilo->blitter, @@ -1123,7 +1137,8 @@ buf_pwrite(struct ilo_context *ilo, struct pipe_resource *res, ilo_cp_submit(ilo->cp, "syncing for pwrites"); } - intel_bo_pwrite(buf->bo, offset, size, data); + /* offset by buf->vma.bo_offset for pwrite */ + intel_bo_pwrite(buf->vma.bo, buf->vma.bo_offset + offset, size, data); } static void diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.c b/src/gallium/drivers/llvmpipe/lp_bld_blend.c index 1de43f77ee0..1feb415c9e5 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_blend.c +++ b/src/gallium/drivers/llvmpipe/lp_bld_blend.c @@ -78,7 +78,7 @@ lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func) /** * Whether the blending factors are complementary of each other. 
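The trick below works because gallium numbers each inverted blend factor as its plain counterpart plus 0x10 (PIPE_BLENDFACTOR_SRC_ALPHA is 0x03, PIPE_BLENDFACTOR_INV_SRC_ALPHA is 0x13), so complementary pairs differ exactly in bit 4:

```c
static void
check_complementary_factors(void)
{
   /* classic alpha blending: SRC_ALPHA vs INV_SRC_ALPHA */
   assert(lp_build_blend_factor_complementary(PIPE_BLENDFACTOR_SRC_ALPHA,
                                              PIPE_BLENDFACTOR_INV_SRC_ALPHA));

   /* DST_ALPHA (0x04) is not SRC_ALPHA's complement */
   assert(!lp_build_blend_factor_complementary(PIPE_BLENDFACTOR_SRC_ALPHA,
                                               PIPE_BLENDFACTOR_DST_ALPHA));
}
```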
*/ -static INLINE boolean +static inline boolean lp_build_blend_factor_complementary(unsigned src_factor, unsigned dst_factor) { return dst_factor == (src_factor ^ 0x10); diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h index 0d47c0d517c..c273b25f096 100644 --- a/src/gallium/drivers/llvmpipe/lp_context.h +++ b/src/gallium/drivers/llvmpipe/lp_context.h @@ -169,7 +169,7 @@ llvmpipe_user_buffer_create(struct pipe_screen *screen, unsigned bind_flags); -static INLINE struct llvmpipe_context * +static inline struct llvmpipe_context * llvmpipe_context( struct pipe_context *pipe ) { return (struct llvmpipe_context *)pipe; diff --git a/src/gallium/drivers/llvmpipe/lp_debug.h b/src/gallium/drivers/llvmpipe/lp_debug.h index e0f7d8e1bc3..1038c5fe151 100644 --- a/src/gallium/drivers/llvmpipe/lp_debug.h +++ b/src/gallium/drivers/llvmpipe/lp_debug.h @@ -71,7 +71,7 @@ extern int LP_DEBUG; void st_debug_init( void ); -static INLINE void +static inline void LP_DBG( unsigned flag, const char *fmt, ... ) { if (LP_DEBUG & flag) diff --git a/src/gallium/drivers/llvmpipe/lp_fence.h b/src/gallium/drivers/llvmpipe/lp_fence.h index 3c591187801..d7f0c153ec8 100644 --- a/src/gallium/drivers/llvmpipe/lp_fence.h +++ b/src/gallium/drivers/llvmpipe/lp_fence.h @@ -72,7 +72,7 @@ llvmpipe_init_screen_fence_funcs(struct pipe_screen *screen); void lp_fence_destroy(struct lp_fence *fence); -static INLINE void +static inline void lp_fence_reference(struct lp_fence **ptr, struct lp_fence *f) { @@ -85,7 +85,7 @@ lp_fence_reference(struct lp_fence **ptr, *ptr = f; } -static INLINE boolean +static inline boolean lp_fence_issued(const struct lp_fence *fence) { return fence->issued; diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h index c209f47f0f5..c19f9318006 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.h +++ b/src/gallium/drivers/llvmpipe/lp_rast.h @@ -184,7 +184,7 @@ union lp_rast_cmd_arg { /* Cast wrappers. Hopefully these compile to noops! */ -static INLINE union lp_rast_cmd_arg +static inline union lp_rast_cmd_arg lp_rast_arg_inputs( const struct lp_rast_shader_inputs *shade_tile ) { union lp_rast_cmd_arg arg; @@ -192,7 +192,7 @@ lp_rast_arg_inputs( const struct lp_rast_shader_inputs *shade_tile ) return arg; } -static INLINE union lp_rast_cmd_arg +static inline union lp_rast_cmd_arg lp_rast_arg_triangle( const struct lp_rast_triangle *triangle, unsigned plane_mask) { @@ -208,7 +208,7 @@ lp_rast_arg_triangle( const struct lp_rast_triangle *triangle, * All planes are enabled, so instead of the plane mask we pass the upper * left coordinates of the a block that fully encloses the triangle. 
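The wrappers' purpose in miniature: call sites hand over typed pointers, the binner stores a single by-value union, and with inlining the conversion compiles away. Member names in this sketch are illustrative, not lp_rast's actual ones:

```c
#include <stdint.h>

union cmd_arg {
   const void *ptr;
   struct { uint64_t value; uint64_t mask; } clear_zs;
};

static inline union cmd_arg
cmd_arg_ptr(const void *p)
{
   union cmd_arg arg;
   arg.ptr = p;
   return arg;   /* ideally a no-op copy after inlining */
}
```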
*/ -static INLINE union lp_rast_cmd_arg +static inline union lp_rast_cmd_arg lp_rast_arg_triangle_contained( const struct lp_rast_triangle *triangle, unsigned x, unsigned y) { @@ -218,7 +218,7 @@ lp_rast_arg_triangle_contained( const struct lp_rast_triangle *triangle, return arg; } -static INLINE union lp_rast_cmd_arg +static inline union lp_rast_cmd_arg lp_rast_arg_state( const struct lp_rast_state *state ) { union lp_rast_cmd_arg arg; @@ -226,7 +226,7 @@ lp_rast_arg_state( const struct lp_rast_state *state ) return arg; } -static INLINE union lp_rast_cmd_arg +static inline union lp_rast_cmd_arg lp_rast_arg_fence( struct lp_fence *fence ) { union lp_rast_cmd_arg arg; @@ -235,7 +235,7 @@ lp_rast_arg_fence( struct lp_fence *fence ) } -static INLINE union lp_rast_cmd_arg +static inline union lp_rast_cmd_arg lp_rast_arg_clearzs( uint64_t value, uint64_t mask ) { union lp_rast_cmd_arg arg; @@ -245,7 +245,7 @@ lp_rast_arg_clearzs( uint64_t value, uint64_t mask ) } -static INLINE union lp_rast_cmd_arg +static inline union lp_rast_cmd_arg lp_rast_arg_query( struct llvmpipe_query *pq ) { union lp_rast_cmd_arg arg; @@ -253,7 +253,7 @@ lp_rast_arg_query( struct llvmpipe_query *pq ) return arg; } -static INLINE union lp_rast_cmd_arg +static inline union lp_rast_cmd_arg lp_rast_arg_null( void ) { union lp_rast_cmd_arg arg; @@ -312,7 +312,7 @@ lp_debug_draw_bins_by_coverage( struct lp_scene *scene ); #include <emmintrin.h> #include "util/u_sse.h" -static INLINE __m128i +static inline __m128i lp_plane_to_m128i(const struct lp_rast_plane *plane) { return _mm_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx, diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h index e6ebbcd526d..9aa7e874657 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h +++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h @@ -145,7 +145,7 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task, * Get the pointer to a 4x4 color block (within a 64x64 tile). * \param x, y location of 4x4 block in window coords */ -static INLINE uint8_t * +static inline uint8_t * lp_rast_get_color_block_pointer(struct lp_rasterizer_task *task, unsigned buf, unsigned x, unsigned y, unsigned layer) @@ -186,7 +186,7 @@ lp_rast_get_color_block_pointer(struct lp_rasterizer_task *task, * Get the pointer to a 4x4 depth block (within a 64x64 tile). * \param x, y location of 4x4 block in window coords */ -static INLINE uint8_t * +static inline uint8_t * lp_rast_get_depth_block_pointer(struct lp_rasterizer_task *task, unsigned x, unsigned y, unsigned layer) { @@ -222,7 +222,7 @@ lp_rast_get_depth_block_pointer(struct lp_rasterizer_task *task, * triangle in/out tests. 
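 *
 * (Most of the llvmpipe churn in this patch is the mechanical
 * INLINE -> inline rename: INLINE was a portability macro from when MSVC
 * only accepted __inline, and with C99 assumed tree-wide the keyword is
 * used directly. Roughly -- the old definition is quoted from memory and
 * may differ in detail:)
 */

#if defined(_MSC_VER)
#  define INLINE __inline /* pre-C99 MSVC spelling */
#else
#  define INLINE inline
#endif

static INLINE int add_one_old(int x) { return x + 1; }
static inline int add_one_new(int x) { return x + 1; }

/*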
* \param x, y location of 4x4 block in window coords */ -static INLINE void +static inline void lp_rast_shade_quads_all( struct lp_rasterizer_task *task, const struct lp_rast_shader_inputs *inputs, unsigned x, unsigned y ) diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c index 41f6fbfa059..c9b9221d87c 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c @@ -63,7 +63,7 @@ block_full_16(struct lp_rasterizer_task *task, block_full_4(task, tri, x + ix, y + iy); } -static INLINE unsigned +static inline unsigned build_mask_linear(int64_t c, int64_t dcdx, int64_t dcdy) { unsigned mask = 0; @@ -94,7 +94,7 @@ build_mask_linear(int64_t c, int64_t dcdx, int64_t dcdy) } -static INLINE void +static inline void build_masks(int64_t c, int64_t cdiff, int64_t dcdx, @@ -167,7 +167,7 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, #include "util/u_sse.h" -static INLINE void +static inline void build_masks_32(int c, int cdiff, int dcdx, @@ -213,7 +213,7 @@ build_masks_32(int c, } -static INLINE unsigned +static inline unsigned build_mask_linear_32(int c, int dcdx, int dcdy) { __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); @@ -239,7 +239,7 @@ build_mask_linear_32(int c, int dcdx, int dcdy) return _mm_movemask_epi8(result); } -static INLINE unsigned +static inline unsigned sign_bits4(const __m128i *cstep, int cdiff) { diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h index a226ff0c485..b1464bb54c4 100644 --- a/src/gallium/drivers/llvmpipe/lp_scene.h +++ b/src/gallium/drivers/llvmpipe/lp_scene.h @@ -207,7 +207,7 @@ boolean lp_scene_is_resource_referenced(const struct lp_scene *scene, * Allocate space for a command/data in the bin's data buffer. * Grow the block list if needed. */ -static INLINE void * +static inline void * lp_scene_alloc( struct lp_scene *scene, unsigned size) { struct data_block_list *list = &scene->data; @@ -240,7 +240,7 @@ lp_scene_alloc( struct lp_scene *scene, unsigned size) /** * As above, but with specific alignment. */ -static INLINE void * +static inline void * lp_scene_alloc_aligned( struct lp_scene *scene, unsigned size, unsigned alignment ) { @@ -272,7 +272,7 @@ lp_scene_alloc_aligned( struct lp_scene *scene, unsigned size, /* Put back data if we decide not to use it, eg. culled triangles. */ -static INLINE void +static inline void lp_scene_putback_data( struct lp_scene *scene, unsigned size) { struct data_block_list *list = &scene->data; @@ -282,7 +282,7 @@ lp_scene_putback_data( struct lp_scene *scene, unsigned size) /** Return pointer to a particular tile's bin. */ -static INLINE struct cmd_bin * +static inline struct cmd_bin * lp_scene_get_bin(struct lp_scene *scene, unsigned x, unsigned y) { return &scene->tile[x][y]; @@ -296,7 +296,7 @@ lp_scene_bin_reset(struct lp_scene *scene, unsigned x, unsigned y); /* Add a command to bin[x][y]. */ -static INLINE boolean +static inline boolean lp_scene_bin_command( struct lp_scene *scene, unsigned x, unsigned y, unsigned cmd, @@ -328,7 +328,7 @@ lp_scene_bin_command( struct lp_scene *scene, } -static INLINE boolean +static inline boolean lp_scene_bin_cmd_with_state( struct lp_scene *scene, unsigned x, unsigned y, const struct lp_rast_state *state, @@ -354,7 +354,7 @@ lp_scene_bin_cmd_with_state( struct lp_scene *scene, /* Add a command to all active bins. 
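 *
 * (From the lp_rast_tri.c hunks above: build_mask_linear() steps a linear
 * edge function c + x*dcdx + y*dcdy across a 4x4 block to build a 16-bit
 * coverage mask. A plain scalar rendition -- the real code keys off the
 * sign bit and has 32-bit and SSE variants, so the sign convention and
 * bit order here are illustrative:)
 */

static unsigned
coverage_mask_4x4_sketch(int64_t c, int64_t dcdx, int64_t dcdy)
{
   unsigned mask = 0;
   for (int y = 0; y < 4; y++) {
      int64_t cx = c + y * dcdy;
      for (int x = 0; x < 4; x++, cx += dcdx) {
         if (cx > 0) /* "inside" the edge */
            mask |= 1u << (y * 4 + x);
      }
   }
   return mask;
}

/*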
*/ -static INLINE boolean +static inline boolean lp_scene_bin_everywhere( struct lp_scene *scene, unsigned cmd, const union lp_rast_cmd_arg arg ) @@ -371,7 +371,7 @@ lp_scene_bin_everywhere( struct lp_scene *scene, } -static INLINE unsigned +static inline unsigned lp_scene_get_num_bins( const struct lp_scene *scene ) { return scene->tiles_x * scene->tiles_y; diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c index 47f1897c732..14eeab03387 100644 --- a/src/gallium/drivers/llvmpipe/lp_screen.c +++ b/src/gallium/drivers/llvmpipe/lp_screen.c @@ -288,10 +288,14 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_VERTEXID_NOBASE: return 0; case PIPE_CAP_POLYGON_OFFSET_CLAMP: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: return 1; case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 0; } /* should only get here on unhandled cases */ @@ -529,18 +533,6 @@ llvmpipe_fence_reference(struct pipe_screen *screen, /** - * Has the fence been executed/finished? - */ -static boolean -llvmpipe_fence_signalled(struct pipe_screen *screen, - struct pipe_fence_handle *fence) -{ - struct lp_fence *f = (struct lp_fence *) fence; - return lp_fence_signalled(f); -} - - -/** * Wait for the fence to finish. */ static boolean @@ -550,6 +542,9 @@ llvmpipe_fence_finish(struct pipe_screen *screen, { struct lp_fence *f = (struct lp_fence *) fence_handle; + if (!timeout) + return lp_fence_signalled(f); + lp_fence_wait(f); return TRUE; } @@ -601,7 +596,6 @@ llvmpipe_create_screen(struct sw_winsys *winsys) screen->base.context_create = llvmpipe_create_context; screen->base.flush_frontbuffer = llvmpipe_flush_frontbuffer; screen->base.fence_reference = llvmpipe_fence_reference; - screen->base.fence_signalled = llvmpipe_fence_signalled; screen->base.fence_finish = llvmpipe_fence_finish; screen->base.get_timestamp = llvmpipe_get_timestamp; diff --git a/src/gallium/drivers/llvmpipe/lp_screen.h b/src/gallium/drivers/llvmpipe/lp_screen.h index 8b8ea1afac9..00bf20c8c5f 100644 --- a/src/gallium/drivers/llvmpipe/lp_screen.h +++ b/src/gallium/drivers/llvmpipe/lp_screen.h @@ -62,7 +62,7 @@ struct llvmpipe_screen -static INLINE struct llvmpipe_screen * +static inline struct llvmpipe_screen * llvmpipe_screen( struct pipe_screen *pipe ) { return (struct llvmpipe_screen *)pipe; diff --git a/src/gallium/drivers/llvmpipe/lp_setup.h b/src/gallium/drivers/llvmpipe/lp_setup.h index c944ad26756..a42df2dc9e0 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.h +++ b/src/gallium/drivers/llvmpipe/lp_setup.h @@ -159,7 +159,7 @@ void lp_setup_end_query(struct lp_setup_context *setup, struct llvmpipe_query *pq); -static INLINE unsigned +static inline unsigned lp_clamp_viewport_idx(int idx) { return (PIPE_MAX_VIEWPORTS > idx && idx >= 0) ? 
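/* (On the lp_screen.c hunk above: the separate fence_signalled() hook is
 * dropped; a non-blocking query is now spelled as fence_finish() with a
 * zero timeout, e.g.
 *
 *    done = screen->fence_finish(screen, fence, 0);               poll
 *    screen->fence_finish(screen, fence, PIPE_TIMEOUT_INFINITE);  wait
 *
 * With timeout == 0 the new code returns lp_fence_signalled() right away;
 * any other value still falls through to lp_fence_wait(). The infinite-
 * timeout macro name is an assumption from gallium's p_defines.h.) */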
idx : 0; diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c index 6c05b90e64a..a190254d9df 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_line.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c @@ -233,7 +233,7 @@ static void setup_line_coefficients( struct lp_setup_context *setup, -static INLINE int subpixel_snap( float a ) +static inline int subpixel_snap( float a ) { return util_iround(FIXED_ONE * a); } @@ -262,14 +262,14 @@ print_line(struct lp_setup_context *setup, } -static INLINE boolean sign(float x){ +static inline boolean sign(float x){ return x >= 0; } /* Used on positive floats only: */ -static INLINE float fracf(float f) +static inline float fracf(float f) { return f - floorf(f); } diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c index f065676a7fb..75544b52493 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_point.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c @@ -296,7 +296,7 @@ setup_point_coefficients( struct lp_setup_context *setup, } -static INLINE int +static inline int subpixel_snap(float a) { return util_iround(FIXED_ONE * a); diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c index a2f55ed3a1e..98a9d4bc28b 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -48,13 +48,13 @@ #include <emmintrin.h> #endif -static INLINE int +static inline int subpixel_snap(float a) { return util_iround(FIXED_ONE * a); } -static INLINE float +static inline float fixed_to_float(int a) { return a * (1.0f / FIXED_ONE); @@ -579,7 +579,7 @@ do_triangle_ccw(struct lp_setup_context *setup, * * Undefined if no bit set exists, so code should check against 0 first. 
*/ -static INLINE uint32_t +static inline uint32_t floor_pot(uint32_t n) { #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86) @@ -841,7 +841,7 @@ static void retry_triangle_ccw( struct lp_setup_context *setup, /** * Calculate fixed position data for a triangle */ -static INLINE void +static inline void calc_fixed_position( struct lp_setup_context *setup, struct fixed_position* position, const float (*v0)[4], @@ -873,7 +873,7 @@ calc_fixed_position( struct lp_setup_context *setup, * Rotate a triangle, flipping its clockwise direction, * Swaps values for xy[0] and xy[1] */ -static INLINE void +static inline void rotate_fixed_position_01( struct fixed_position* position ) { int x, y; @@ -898,7 +898,7 @@ rotate_fixed_position_01( struct fixed_position* position ) * Rotate a triangle, flipping its clockwise direction, * Swaps values for xy[1] and xy[2] */ -static INLINE void +static inline void rotate_fixed_position_12( struct fixed_position* position ) { int x, y; diff --git a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c index 89992007849..534c5f48a64 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c @@ -122,7 +122,7 @@ lp_setup_set_primitive(struct vbuf_render *vbr, unsigned prim) typedef const float (*const_float4_ptr)[4]; -static INLINE const_float4_ptr get_vert( const void *vertex_buffer, +static inline const_float4_ptr get_vert( const void *vertex_buffer, int index, int stride ) { diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index b5ce8683f1a..fd6c49aacd8 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -840,7 +840,7 @@ store_unswizzled_block(struct gallivm_state *gallivm, * * A format which has irregular channel sizes such as R3_G3_B2 or R5_G6_B5. */ -static INLINE boolean +static inline boolean is_arithmetic_format(const struct util_format_description *format_desc) { boolean arith = false; @@ -860,7 +860,7 @@ is_arithmetic_format(const struct util_format_description *format_desc) * to floats for blending, and furthermore has "natural" packed AoS -> unpacked * SoA conversion. */ -static INLINE boolean +static inline boolean format_expands_to_float_soa(const struct util_format_description *format_desc) { if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT || @@ -876,7 +876,7 @@ format_expands_to_float_soa(const struct util_format_description *format_desc) * * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte */ -static INLINE void +static inline void lp_mem_type_from_format_desc(const struct util_format_description *format_desc, struct lp_type* type) { @@ -924,7 +924,7 @@ lp_mem_type_from_format_desc(const struct util_format_description *format_desc, * * e.g. RGBA16F = 4x float, R3G3B2 = 3x byte */ -static INLINE void +static inline void lp_blend_type_from_format_desc(const struct util_format_description *format_desc, struct lp_type* type) { @@ -996,7 +996,7 @@ lp_blend_type_from_format_desc(const struct util_format_description *format_desc * * but we try to avoid division and multiplication through shifts. 
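 *
 * (For the common widening case the shift trick is plain bit replication;
 * a scalar rendition of what scale_bits() emits as LLVM IR, e.g. for a
 * 5-bit channel of B5G6R5 going to 8 bits:)
 */

static inline uint8_t
scale_5_to_8(uint8_t x)
{
   /* replicate the top bits into the new low bits: 0x1f -> 0xff, 0 -> 0;
    * this matches round(x * 255.0 / 31.0) for every 5-bit input */
   return (uint8_t)((x << 3) | (x >> 2));
}

/*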
*/ -static INLINE LLVMValueRef +static inline LLVMValueRef scale_bits(struct gallivm_state *gallivm, int src_bits, int dst_bits, @@ -1108,7 +1108,7 @@ scale_bits(struct gallivm_state *gallivm, /** * If RT is a smallfloat (needing denorms) format */ -static INLINE int +static inline int have_smallfloat_format(struct lp_type dst_type, enum pipe_format format) { @@ -2880,7 +2880,7 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe, /** * Return the blend factor equivalent to a destination alpha of one. */ -static INLINE unsigned +static inline unsigned force_dst_alpha_one(unsigned factor, boolean clamped_zero) { switch(factor) { diff --git a/src/gallium/drivers/llvmpipe/lp_test.h b/src/gallium/drivers/llvmpipe/lp_test.h index 4b6c8a7a6a5..e1b51c9c9a6 100644 --- a/src/gallium/drivers/llvmpipe/lp_test.h +++ b/src/gallium/drivers/llvmpipe/lp_test.h @@ -77,7 +77,7 @@ unsigned __int64 __rdtsc(); #elif defined(PIPE_CC_GCC) && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) -static INLINE uint64_t +static inline uint64_t rdtsc(void) { uint32_t hi, lo; diff --git a/src/gallium/drivers/llvmpipe/lp_texture.h b/src/gallium/drivers/llvmpipe/lp_texture.h index 9fbd3a21648..3d315bb9a73 100644 --- a/src/gallium/drivers/llvmpipe/lp_texture.h +++ b/src/gallium/drivers/llvmpipe/lp_texture.h @@ -106,21 +106,21 @@ struct llvmpipe_transfer /** cast wrappers */ -static INLINE struct llvmpipe_resource * +static inline struct llvmpipe_resource * llvmpipe_resource(struct pipe_resource *pt) { return (struct llvmpipe_resource *) pt; } -static INLINE const struct llvmpipe_resource * +static inline const struct llvmpipe_resource * llvmpipe_resource_const(const struct pipe_resource *pt) { return (const struct llvmpipe_resource *) pt; } -static INLINE struct llvmpipe_transfer * +static inline struct llvmpipe_transfer * llvmpipe_transfer(struct pipe_transfer *pt) { return (struct llvmpipe_transfer *) pt; @@ -131,7 +131,7 @@ void llvmpipe_init_screen_resource_funcs(struct pipe_screen *screen); void llvmpipe_init_context_resource_funcs(struct pipe_context *pipe); -static INLINE boolean +static inline boolean llvmpipe_resource_is_texture(const struct pipe_resource *resource) { switch (resource->target) { @@ -153,7 +153,7 @@ llvmpipe_resource_is_texture(const struct pipe_resource *resource) } -static INLINE boolean +static inline boolean llvmpipe_resource_is_1d(const struct pipe_resource *resource) { switch (resource->target) { @@ -175,7 +175,7 @@ llvmpipe_resource_is_1d(const struct pipe_resource *resource) } -static INLINE unsigned +static inline unsigned llvmpipe_layer_stride(struct pipe_resource *resource, unsigned level) { @@ -185,7 +185,7 @@ llvmpipe_layer_stride(struct pipe_resource *resource, } -static INLINE unsigned +static inline unsigned llvmpipe_resource_stride(struct pipe_resource *resource, unsigned level) { diff --git a/src/gallium/drivers/nouveau/Makefile.am b/src/gallium/drivers/nouveau/Makefile.am index d05f0a17ab4..c52d62e54a2 100644 --- a/src/gallium/drivers/nouveau/Makefile.am +++ b/src/gallium/drivers/nouveau/Makefile.am @@ -20,8 +20,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. 
-AUTOMAKE_OPTIONS = subdir-objects - include Makefile.sources include $(top_srcdir)/src/gallium/Automake.inc diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp index ca3c806e92f..cce60550ae5 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp @@ -1153,8 +1153,8 @@ nv50_ir_generate_code(struct nv50_ir_prog_info *info) switch (info->type) { PROG_TYPE_CASE(VERTEX, VERTEX); -// PROG_TYPE_CASE(HULL, TESSELLATION_CONTROL); -// PROG_TYPE_CASE(DOMAIN, TESSELLATION_EVAL); + PROG_TYPE_CASE(TESS_CTRL, TESSELLATION_CONTROL); + PROG_TYPE_CASE(TESS_EVAL, TESSELLATION_EVAL); PROG_TYPE_CASE(GEOMETRY, GEOMETRY); PROG_TYPE_CASE(FRAGMENT, FRAGMENT); PROG_TYPE_CASE(COMPUTE, COMPUTE); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h index 529dcb9bdc2..3ddaeafebbd 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h @@ -106,6 +106,7 @@ enum operation OP_MEMBAR, // memory barrier (mfence, lfence, sfence) OP_VFETCH, // indirection 0 in attribute space, indirection 1 is vertex base OP_PFETCH, // fetch base address of vertex src0 (immediate) [+ src1] + OP_AFETCH, // fetch base address of shader input (a[%r1+0x10]) OP_EXPORT, OP_LINTERP, OP_PINTERP, @@ -372,7 +373,8 @@ enum SVSemantic SV_SAMPLE_INDEX, SV_SAMPLE_POS, SV_SAMPLE_MASK, - SV_TESS_FACTOR, + SV_TESS_OUTER, + SV_TESS_INNER, SV_TESS_COORD, SV_TID, SV_CTAID, diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp index 51b9225156b..fa8ee072a92 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp @@ -332,6 +332,9 @@ BasicBlock::splitBefore(Instruction *insn, bool attach) BasicBlock *bb = new BasicBlock(func); assert(!insn || insn->op != OP_PHI); + bb->joinAt = joinAt; + joinAt = NULL; + splitCommon(insn, bb, attach); return bb; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp index 708c5b322ee..19418c0e0f1 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp @@ -428,8 +428,7 @@ BuildUtil::mkSysVal(SVSemantic svName, uint32_t svIndex) { Symbol *sym = new_Symbol(prog, FILE_SYSTEM_VALUE, 0); - assert(svIndex < 4 || - (svName == SV_CLIP_DISTANCE || svName == SV_TESS_FACTOR)); + assert(svIndex < 4 || svName == SV_CLIP_DISTANCE); switch (svName) { case SV_POSITION: @@ -438,7 +437,9 @@ BuildUtil::mkSysVal(SVSemantic svName, uint32_t svIndex) case SV_POINT_SIZE: case SV_POINT_COORD: case SV_CLIP_DISTANCE: - case SV_TESS_FACTOR: + case SV_TESS_OUTER: + case SV_TESS_INNER: + case SV_TESS_COORD: sym->reg.type = TYPE_F32; break; default: diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h index dba56bf2716..2b9edcf9172 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h @@ -69,18 +69,6 @@ struct nv50_ir_varying # define NV50_IR_DEBUG_REG_ALLOC 0 #endif -#define NV50_SEMANTIC_CLIPDISTANCE (TGSI_SEMANTIC_COUNT + 0) -#define NV50_SEMANTIC_TESSFACTOR (TGSI_SEMANTIC_COUNT + 7) -#define NV50_SEMANTIC_TESSCOORD (TGSI_SEMANTIC_COUNT + 8) -#define NV50_SEMANTIC_COUNT (TGSI_SEMANTIC_COUNT + 10) - -#define 
NV50_TESS_PART_FRACT_ODD 0 -#define NV50_TESS_PART_FRACT_EVEN 1 -#define NV50_TESS_PART_POW2 2 -#define NV50_TESS_PART_INTEGER 3 - -#define NV50_PRIM_PATCHES PIPE_PRIM_MAX - struct nv50_ir_prog_symbol { uint32_t label; @@ -151,10 +139,10 @@ struct nv50_ir_prog_info } gp; struct { unsigned numColourResults; - boolean writesDepth; - boolean earlyFragTests; - boolean separateFragData; - boolean usesDiscard; + bool writesDepth; + bool earlyFragTests; + bool separateFragData; + bool usesDiscard; } fp; struct { uint32_t inputOffset; /* base address for user args */ @@ -180,11 +168,11 @@ struct nv50_ir_prog_info int8_t viewportId; /* output index of ViewportIndex */ uint8_t fragDepth; /* output index of FragDepth */ uint8_t sampleMask; /* output index of SampleMask */ - boolean sampleInterp; /* perform sample interp on all fp inputs */ + bool sampleInterp; /* perform sample interp on all fp inputs */ uint8_t backFaceColor[2]; /* input/output indices of back face colour */ uint8_t globalAccess; /* 1 for read, 2 for wr, 3 for rw */ - boolean fp64; /* program uses fp64 math */ - boolean nv50styleSurfaces; /* generate gX[] access for raw buffers */ + bool fp64; /* program uses fp64 math */ + bool nv50styleSurfaces; /* generate gX[] access for raw buffers */ uint8_t resInfoCBSlot; /* cX[] used for tex handles, surface info */ uint16_t texBindBase; /* base address for tex handles (nve4) */ uint16_t suInfoBase; /* base address for surface info (nve4) */ diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp index ab8bf2e5504..f06056f8f17 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp @@ -77,6 +77,7 @@ private: void emitMOV(const Instruction *); void emitINTERP(const Instruction *); + void emitAFETCH(const Instruction *); void emitPFETCH(const Instruction *); void emitVFETCH(const Instruction *); void emitEXPORT(const Instruction *); @@ -120,6 +121,8 @@ private: void emitPIXLD(const Instruction *); + void emitBAR(const Instruction *); + void emitFlow(const Instruction *); inline void defId(const ValueDef&, const int pos); @@ -1250,6 +1253,13 @@ CodeEmitterGK110::emitPIXLD(const Instruction *i) } void +CodeEmitterGK110::emitBAR(const Instruction *i) +{ + /* TODO */ + emitNOP(i); +} + +void CodeEmitterGK110::emitFlow(const Instruction *i) { const FlowInstruction *f = i->asFlow(); @@ -1330,6 +1340,23 @@ CodeEmitterGK110::emitFlow(const Instruction *i) } void +CodeEmitterGK110::emitAFETCH(const Instruction *i) +{ + uint32_t offset = i->src(0).get()->reg.data.offset & 0x7ff; + + code[0] = 0x00000002 | (offset << 23); + code[1] = 0x7d000000 | (offset >> 9); + + if (i->getSrc(0)->reg.file == FILE_SHADER_OUTPUT) + code[1] |= 0x8; + + emitPredicate(i); + + defId(i->def(0), 2); + srcId(i->src(0).getIndirect(0), 10); +} + +void CodeEmitterGK110::emitPFETCH(const Instruction *i) { uint32_t prim = i->src(0).get()->reg.data.u32; @@ -1698,6 +1725,9 @@ CodeEmitterGK110::emitInstruction(Instruction *insn) case OP_EXPORT: emitEXPORT(insn); break; + case OP_AFETCH: + emitAFETCH(insn); + break; case OP_PFETCH: emitPFETCH(insn); break; @@ -1856,6 +1886,9 @@ CodeEmitterGK110::emitInstruction(Instruction *insn) emitNOP(insn); insn->join = 1; break; + case OP_BAR: + emitBAR(insn); + break; case OP_PHI: case OP_UNION: case OP_CONSTRAINT: diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp index 399a6f1db13..ef5c87d0437 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp @@ -174,6 +174,7 @@ private: void emitALD(); void emitAST(); void emitISBERD(); + void emitAL2P(); void emitIPA(); void emitPIXLD(); @@ -2204,6 +2205,17 @@ CodeEmitterGM107::emitISBERD() } void +CodeEmitterGM107::emitAL2P() +{ + emitInsn (0xefa00000); + emitField(0x2f, 2, (insn->getDef(0)->reg.size / 4) - 1); + emitO (0x20); + emitField(0x14, 11, insn->src(0).get()->reg.data.offset); + emitGPR (0x08, insn->src(0).getIndirect(0)); + emitGPR (0x00, insn->def(0)); +} + +void CodeEmitterGM107::emitIPA() { int ipam = 0, ipas = 0; @@ -2441,8 +2453,14 @@ CodeEmitterGM107::emitTXQ() break; } - emitInsn (0xdf4a0000); - emitField(0x24, 13, insn->tex.r); + if (insn->tex.rIndirectSrc >= 0) { + emitInsn (0xdf500000); + } else { + emitInsn (0xdf480000); + emitField(0x24, 13, insn->tex.r); + } + + emitField(0x31, 1, insn->tex.liveOnly); emitField(0x1f, 4, insn->tex.mask); emitField(0x16, 6, type); emitGPR (0x08, insn->src(0)); @@ -2753,6 +2771,9 @@ CodeEmitterGM107::emitInstruction(Instruction *i) case OP_PFETCH: emitISBERD(); break; + case OP_AFETCH: + emitAL2P(); + break; case OP_LINTERP: case OP_PINTERP: emitIPA(); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp index 1bfc8e32e84..67ea6df773c 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp @@ -499,10 +499,14 @@ CodeEmitterNV50::emitForm_MAD(const Instruction *i) setSrc(i, 2, 2); if (i->getIndirect(0, 0)) { - assert(!i->getIndirect(1, 0)); + assert(!i->srcExists(1) || !i->getIndirect(1, 0)); + assert(!i->srcExists(2) || !i->getIndirect(2, 0)); setAReg16(i, 0); - } else { + } else if (i->srcExists(1) && i->getIndirect(1, 0)) { + assert(!i->srcExists(2) || !i->getIndirect(2, 0)); setAReg16(i, 1); + } else { + setAReg16(i, 2); } } @@ -546,7 +550,7 @@ CodeEmitterNV50::emitForm_MUL(const Instruction *i) } // usual immediate form -// - 1 to 3 sources where last is immediate (rir, gir) +// - 1 to 3 sources where second is immediate (rir, gir) // - no address or predicate possible void CodeEmitterNV50::emitForm_IMM(const Instruction *i) @@ -562,7 +566,7 @@ CodeEmitterNV50::emitForm_IMM(const Instruction *i) if (Target::operationSrcNr[i->op] > 1) { setSrc(i, 0, 0); setImmediate(i, 1); - setSrc(i, 2, 1); + // If there is another source, it has to be the same as the dest reg. 
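/* (The emitForm_MAD change above enforces the same family of constraint:
 * at most one of a MAD's sources may be indirectly addressed, and the
 * single address-register field must come from whichever source that is,
 * hence the new src0 -> src1 -> src2 cascade with asserts that the other
 * sources are direct.) */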
} else { setImmediate(i, 0); } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp index 472e3a84119..f607f3ba3ec 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp @@ -85,6 +85,7 @@ private: void emitCCTL(const Instruction *); void emitINTERP(const Instruction *); + void emitAFETCH(const Instruction *); void emitPFETCH(const Instruction *); void emitVFETCH(const Instruction *); void emitEXPORT(const Instruction *); @@ -1450,6 +1451,7 @@ CodeEmitterNVC0::emitBAR(const Instruction *i) ImmediateValue *imm = i->getSrc(0)->asImm(); assert(imm); code[0] |= imm->reg.data.u32 << 20; + code[1] |= 0x8000; } // thread count @@ -1460,6 +1462,7 @@ CodeEmitterNVC0::emitBAR(const Instruction *i) assert(imm); code[0] |= imm->reg.data.u32 << 26; code[1] |= imm->reg.data.u32 >> 6; + code[1] |= 0x4000; } if (i->srcExists(2) && (i->predSrc != 2)) { @@ -1494,6 +1497,21 @@ CodeEmitterNVC0::emitBAR(const Instruction *i) } void +CodeEmitterNVC0::emitAFETCH(const Instruction *i) +{ + code[0] = 0x00000006; + code[1] = 0x0c000000 | (i->src(0).get()->reg.data.offset & 0x7ff); + + if (i->getSrc(0)->reg.file == FILE_SHADER_OUTPUT) + code[0] |= 0x200; + + emitPredicate(i); + + defId(i->def(0), 14); + srcId(i->src(0).getIndirect(0), 20); +} + +void CodeEmitterNVC0::emitPFETCH(const Instruction *i) { uint32_t prim = i->src(0).get()->reg.data.u32; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index ecd115f9807..4847a0f3355 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -372,6 +372,10 @@ static nv50_ir::SVSemantic translateSysVal(uint sysval) case TGSI_SEMANTIC_SAMPLEPOS: return nv50_ir::SV_SAMPLE_POS; case TGSI_SEMANTIC_SAMPLEMASK: return nv50_ir::SV_SAMPLE_MASK; case TGSI_SEMANTIC_INVOCATIONID: return nv50_ir::SV_INVOCATION_ID; + case TGSI_SEMANTIC_TESSCOORD: return nv50_ir::SV_TESS_COORD; + case TGSI_SEMANTIC_TESSOUTER: return nv50_ir::SV_TESS_OUTER; + case TGSI_SEMANTIC_TESSINNER: return nv50_ir::SV_TESS_INNER; + case TGSI_SEMANTIC_VERTICESIN: return nv50_ir::SV_VERTEX_COUNT; default: assert(0); return nv50_ir::SV_CLOCK; @@ -434,7 +438,6 @@ nv50_ir::DataType Instruction::inferSrcType() const case TGSI_OPCODE_USLT: case TGSI_OPCODE_USNE: case TGSI_OPCODE_USHR: - case TGSI_OPCODE_UCMP: case TGSI_OPCODE_ATOMUADD: case TGSI_OPCODE_ATOMXCHG: case TGSI_OPCODE_ATOMCAS: @@ -827,7 +830,7 @@ Source::Source(struct nv50_ir_prog_info *prog) : info(prog) if (prog->dbgFlags & NV50_IR_DEBUG_BASIC) tgsi_dump(tokens, 0); - mainTempsInLMem = FALSE; + mainTempsInLMem = false; } Source::~Source() @@ -938,7 +941,7 @@ void Source::scanProperty(const struct tgsi_full_property *prop) info->prop.gp.instanceCount = prop->u[0].Data; break; case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS: - info->prop.fp.separateFragData = TRUE; + info->prop.fp.separateFragData = true; break; case TGSI_PROPERTY_FS_COORD_ORIGIN: case TGSI_PROPERTY_FS_COORD_PIXEL_CENTER: @@ -947,6 +950,24 @@ void Source::scanProperty(const struct tgsi_full_property *prop) case TGSI_PROPERTY_VS_PROHIBIT_UCPS: info->io.genUserClip = -1; break; + case TGSI_PROPERTY_TCS_VERTICES_OUT: + info->prop.tp.outputPatchSize = prop->u[0].Data; + break; + case TGSI_PROPERTY_TES_PRIM_MODE: + info->prop.tp.domain = prop->u[0].Data; + break; + case 
TGSI_PROPERTY_TES_SPACING: + info->prop.tp.partitioning = prop->u[0].Data; + break; + case TGSI_PROPERTY_TES_VERTEX_ORDER_CW: + info->prop.tp.winding = prop->u[0].Data; + break; + case TGSI_PROPERTY_TES_POINT_MODE: + if (prop->u[0].Data) + info->prop.tp.outputPrim = PIPE_PRIM_POINTS; + else + info->prop.tp.outputPrim = PIPE_PRIM_TRIANGLES; /* anything but points */ + break; default: INFO("unhandled TGSI property %d\n", prop->Property.PropertyName); break; @@ -1035,6 +1056,11 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) if (decl->Interp.Location || info->io.sampleInterp) info->in[i].centroid = 1; } + + if (sn == TGSI_SEMANTIC_PATCH) + info->in[i].patch = 1; + if (sn == TGSI_SEMANTIC_PATCH) + info->numPatchConstants = MAX2(info->numPatchConstants, si + 1); } } break; @@ -1069,6 +1095,13 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) case TGSI_SEMANTIC_VIEWPORT_INDEX: info->io.viewportId = i; break; + case TGSI_SEMANTIC_PATCH: + info->numPatchConstants = MAX2(info->numPatchConstants, si + 1); + /* fallthrough */ + case TGSI_SEMANTIC_TESSOUTER: + case TGSI_SEMANTIC_TESSINNER: + info->out[i].patch = 1; + break; default: break; } @@ -1092,6 +1125,13 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) info->sv[i].sn = sn; info->sv[i].si = si; info->sv[i].input = inferSysValDirection(sn); + + switch (sn) { + case TGSI_SEMANTIC_TESSOUTER: + case TGSI_SEMANTIC_TESSINNER: + info->sv[i].patch = 1; + break; + } } break; case TGSI_FILE_RESOURCE: @@ -1156,7 +1196,7 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst) } else if (insn.getDst(0).getFile() == TGSI_FILE_TEMPORARY) { if (insn.getDst(0).isIndirect(0)) - mainTempsInLMem = TRUE; + mainTempsInLMem = true; } } @@ -1164,12 +1204,22 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst) Instruction::SrcRegister src = insn.getSrc(s); if (src.getFile() == TGSI_FILE_TEMPORARY) { if (src.isIndirect(0)) - mainTempsInLMem = TRUE; + mainTempsInLMem = true; } else if (src.getFile() == TGSI_FILE_RESOURCE) { if (src.getIndex(0) == TGSI_RESOURCE_GLOBAL) info->io.globalAccess |= (insn.getOpcode() == TGSI_OPCODE_LOAD) ? 0x1 : 0x2; + } else + if (src.getFile() == TGSI_FILE_OUTPUT) { + if (src.isIndirect(0)) { + // We don't know which one is accessed, just mark everything for + // reading. This is an extremely unlikely occurrence. 
+ for (unsigned i = 0; i < info->numOutputs; ++i) + info->out[i].oread = 1; + } else { + info->out[src.getIndex(0)].oread = 1; + } } if (src.getFile() != TGSI_FILE_INPUT) continue; @@ -1246,6 +1296,7 @@ private: Value *shiftAddress(Value *); Value *getVertexBase(int s); + Value *getOutputBase(int s); DataArray *getArrayForFile(unsigned file, int idx); Value *fetchSrc(int s, int c); Value *acquireDst(int d, int c); @@ -1343,6 +1394,8 @@ private: Value *vtxBase[5]; // base address of vertex in primitive (for TP/GP) uint8_t vtxBaseValid; + Value *outBase; // base address of vertex out patch (for TCP) + Stack condBBs; // fork BB, then else clause BB Stack joinBBs; // fork BB, for inserting join ops on ENDIF Stack loopBBs; // loop headers @@ -1476,6 +1529,22 @@ Converter::getVertexBase(int s) } Value * +Converter::getOutputBase(int s) +{ + assert(s < 5); + if (!(vtxBaseValid & (1 << s))) { + Value *offset = loadImm(NULL, tgsi.getSrc(s).getIndex(1)); + if (tgsi.getSrc(s).isIndirect(1)) + offset = mkOp2v(OP_ADD, TYPE_U32, getSSA(), + fetchSrc(tgsi.getSrc(s).getIndirect(1), 0, NULL), + offset); + vtxBaseValid |= 1 << s; + vtxBase[s] = mkOp2v(OP_ADD, TYPE_U32, getSSA(), outBase, offset); + } + return vtxBase[s]; +} + +Value * Converter::fetchSrc(int s, int c) { Value *res; @@ -1488,6 +1557,9 @@ Converter::fetchSrc(int s, int c) if (src.is2D()) { switch (src.getFile()) { + case TGSI_FILE_OUTPUT: + dimRel = getOutputBase(s); + break; case TGSI_FILE_INPUT: dimRel = getVertexBase(s); break; @@ -1542,6 +1614,7 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr) const int idx2d = src.is2D() ? src.getIndex(1) : 0; const int idx = src.getIndex(0); const int swz = src.getSwizzle(c); + Instruction *ld; switch (src.getFile()) { case TGSI_FILE_IMMEDIATE: @@ -1569,13 +1642,19 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr) if (ptr) return mkLoadv(TYPE_U32, srcToSym(src, c), ptr); } - return mkLoadv(TYPE_U32, srcToSym(src, c), shiftAddress(ptr)); + ld = mkLoad(TYPE_U32, getSSA(), srcToSym(src, c), shiftAddress(ptr)); + ld->perPatch = info->in[idx].patch; + return ld->getDef(0); case TGSI_FILE_OUTPUT: - assert(!"load from output file"); - return NULL; + assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL); + ld = mkLoad(TYPE_U32, getSSA(), srcToSym(src, c), shiftAddress(ptr)); + ld->perPatch = info->out[idx].patch; + return ld->getDef(0); case TGSI_FILE_SYSTEM_VALUE: assert(!ptr); - return mkOp1v(OP_RDSV, TYPE_U32, getSSA(), srcToSym(src, c)); + ld = mkOp1(OP_RDSV, TYPE_U32, getSSA(), srcToSym(src, c)); + ld->perPatch = info->sv[idx].patch; + return ld->getDef(0); default: return getArrayForFile(src.getFile(), idx2d)->load( sub.cur->values, idx, swz, shiftAddress(ptr)); @@ -1645,7 +1724,8 @@ Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c, viewport != NULL) mkOp1(OP_MOV, TYPE_U32, viewport, val); else - mkStore(OP_EXPORT, TYPE_U32, dstToSym(dst, c), ptr, val); + mkStore(OP_EXPORT, TYPE_U32, dstToSym(dst, c), ptr, val)->perPatch = + info->out[idx].patch; } } else if (f == TGSI_FILE_TEMPORARY || @@ -1687,6 +1767,7 @@ Converter::insertConvergenceOps(BasicBlock *conv, BasicBlock *fork) join->fixed = 1; conv->insertHead(join); + assert(!fork->joinAt); fork->joinAt = new_FlowInstruction(func, OP_JOINAT, conv); fork->insertBefore(fork->getExit(), fork->joinAt); } @@ -1728,7 +1809,7 @@ Converter::handleTXQ(Value *dst0[4], enum TexQuery query) } tex->setSrc((c = 0), fetchSrc(0, 0)); // mip level - setTexRS(tex, c, 1, -1); + setTexRS(tex, 
++c, 1, -1); bb->insertTail(tex); } @@ -2569,6 +2650,8 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) } break; case TGSI_OPCODE_UCMP: + srcTy = TYPE_U32; + /* fallthrough */ case TGSI_OPCODE_CMP: FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { src0 = fetchSrc(0, c); @@ -3282,10 +3365,21 @@ Converter::run() clipVtx[c] = getScratch(); } - if (prog->getType() == Program::TYPE_FRAGMENT) { + switch (prog->getType()) { + case Program::TYPE_TESSELLATION_CONTROL: + outBase = mkOp2v( + OP_SUB, TYPE_U32, getSSA(), + mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_LANEID, 0)), + mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_INVOCATION_ID, 0))); + break; + case Program::TYPE_FRAGMENT: { Symbol *sv = mkSysVal(SV_POSITION, 3); fragCoord[3] = mkOp1v(OP_RDSV, TYPE_F32, getSSA(), sv); mkOp1(OP_RCP, TYPE_F32, fragCoord[3], fragCoord[3]); + break; + } + default: + break; } if (info->io.viewportId >= 0) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp index 596ac95d489..1f3fce2bb9a 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp @@ -176,7 +176,7 @@ GM107LoweringPass::handlePOPCNT(Instruction *i) i->getSrc(0), i->getSrc(1)); i->setSrc(0, tmp); i->setSrc(1, NULL); - return TRUE; + return true; } // diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp index 2c7f7e326b2..bea293bac99 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp @@ -871,6 +871,7 @@ NV50LoweringPreSSA::handleTXL(TexInstruction *i) BasicBlock *joinBB = i->bb->splitAfter(i); bld.setPosition(currBB, true); + assert(!currBB->joinAt); currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL); for (int l = 0; l <= 3; ++l) { diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 7a5d1ce0299..c3c302da5c8 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -559,6 +559,12 @@ NVC0LegalizePostRA::visit(BasicBlock *bb) } else if (i->isNop()) { bb->remove(i); + } else + if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC && + prog->getType() != Program::TYPE_COMPUTE) { + // It seems like barriers are never required for tessellation since + // the warp size is 32, and there are always at most 32 tcs threads. + bb->remove(i); } else { // TODO: Move this to before register allocation for operations that // need the $c register ! 
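/* (Why the bar.sync removal above is safe: an NVC0 warp is 32 threads
 * wide and a tessellation-control patch has at most 32 invocations, so
 * every thread that would meet at the barrier already runs in lock-step
 * inside one warp. Compute stays excluded because a work-group can span
 * several warps.) */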
@@ -956,7 +962,43 @@ NVC0LoweringPass::handleTXD(TexInstruction *txd) bool NVC0LoweringPass::handleTXQ(TexInstruction *txq) { - // TODO: indirect resource/sampler index + if (txq->tex.rIndirectSrc < 0) + return true; + + Value *ticRel = txq->getIndirectR(); + const int chipset = prog->getTarget()->getChipset(); + + txq->setIndirectS(NULL); + txq->tex.sIndirectSrc = -1; + + assert(ticRel); + + if (chipset < NVISA_GK104_CHIPSET) { + LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa + + txq->setSrc(txq->tex.rIndirectSrc, NULL); + if (txq->tex.r) + ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), + ticRel, bld.mkImm(txq->tex.r)); + + bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17)); + + txq->moveSources(0, 1); + txq->setSrc(0, src); + } else { + Value *hnd = loadTexHandle( + bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), + txq->getIndirectR(), bld.mkImm(2)), + txq->tex.r); + txq->tex.r = 0xff; + txq->tex.s = 0x1f; + + txq->setIndirectR(NULL); + txq->moveSources(0, 1); + txq->setSrc(0, hnd); + txq->tex.rIndirectSrc = 0; + } + return true; } @@ -1485,6 +1527,10 @@ NVC0LoweringPass::handleRDSV(Instruction *i) i->op = OP_MOV; i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0)); } + if (sv == SV_VERTEX_COUNT) { + bld.setPosition(i, true); + bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0), bld.mkImm(0x808)); + } return true; } @@ -1554,7 +1600,7 @@ NVC0LoweringPass::handleRDSV(Instruction *i) ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK; break; default: - if (prog->getType() == Program::TYPE_TESSELLATION_EVAL) + if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch) vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0)); ld = bld.mkFetch(i->getDef(0), i->dType, FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx); @@ -1705,6 +1751,7 @@ NVC0LoweringPass::checkPredicate(Instruction *insn) bool NVC0LoweringPass::visit(Instruction *i) { + bool ret = true; bld.setPosition(i, false); if (i->cc != CC_ALWAYS) @@ -1736,7 +1783,8 @@ NVC0LoweringPass::visit(Instruction *i) case OP_SQRT: return handleSQRT(i); case OP_EXPORT: - return handleEXPORT(i); + ret = handleEXPORT(i); + break; case OP_EMIT: case OP_RESTART: return handleOUT(i); @@ -1775,6 +1823,9 @@ NVC0LoweringPass::visit(Instruction *i) i->setIndirect(0, 0, ptr); i->subOp = NV50_IR_SUBOP_LDC_IS; } + } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) { + assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL); + i->op = OP_VFETCH; } break; case OP_ATOM: @@ -1796,7 +1847,20 @@ NVC0LoweringPass::visit(Instruction *i) default: break; } - return true; + + /* Kepler+ has a special opcode to compute a new base address to be used + * for indirect loads. 
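 * Conceptually, the rewrite emitted below turns
 *
 *    val  = VFETCH a[0x80 + %r]
 * into
 *    base = AFETCH a[0x80 + %r]    (hw resolves the attribute address)
 *    val  = VFETCH a[0x00 + base]  (original offset folded into base)
 *
 * which is why the cloned source's reg offset is zeroed and the load's
 * indirect is re-pointed at the AFETCH result.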
+ */ + if (targ->getChipset() >= NVISA_GK104_CHIPSET && !i->perPatch && + (i->op == OP_VFETCH || i->op == OP_EXPORT) && i->src(0).isIndirect(0)) { + Instruction *afetch = bld.mkOp1(OP_AFETCH, TYPE_U32, bld.getSSA(), + cloneShallow(func, i->getSrc(0))); + afetch->setIndirect(0, 0, i->getIndirect(0, 0)); + i->src(0).get()->reg.data.offset = 0; + i->setIndirect(0, 0, afetch->getDef(0)); + } + + return ret; } bool diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index ae739eeda83..cea96dcdfc5 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -608,9 +608,12 @@ ConstantFolding::expr(Instruction *i, case OP_FMA: { i->op = OP_ADD; + /* Move the immediate to the second arg, otherwise the ADD operation + * won't be emittable + */ i->setSrc(1, i->getSrc(0)); - i->src(1).mod = i->src(2).mod; i->setSrc(0, i->getSrc(2)); + i->src(0).mod = i->src(2).mod; i->setSrc(2, NULL); ImmediateValue src0; @@ -2082,6 +2085,8 @@ MemoryOpt::runOpt(BasicBlock *bb) } if (ldst->getPredicate()) // TODO: handle predicated ld/st continue; + if (ldst->perPatch) // TODO: create separate per-patch lists + continue; if (isLoad) { DataFile file = ldst->src(0).getFile(); @@ -2515,6 +2520,8 @@ Instruction::isResultEqual(const Instruction *that) const case FILE_MEMORY_CONST: case FILE_SHADER_INPUT: return true; + case FILE_SHADER_OUTPUT: + return bb->getProgram()->getType() == Program::TYPE_TESSELLATION_EVAL; default: return false; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp index ef3de6ff92a..9ebdc6586db 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp @@ -135,6 +135,7 @@ const char *operationStr[OP_LAST + 1] = "membar", "vfetch", "pfetch", + "afetch", "export", "linterp", "pinterp", @@ -258,7 +259,8 @@ static const char *SemanticStr[SV_LAST + 1] = "SAMPLE_INDEX", "SAMPLE_POS", "SAMPLE_MASK", - "TESS_FACTOR", + "TESS_OUTER", + "TESS_INNER", "TESS_COORD", "TID", "CTAID", diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp index 898653c9953..78bc97f4397 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp @@ -2066,6 +2066,8 @@ RegAlloc::InsertConstraintsPass::visit(BasicBlock *bb) condenseDefs(i); if (i->src(0).isIndirect(0) && typeSizeof(i->dType) >= 8) addHazard(i, i->src(0).getIndirect(0)); + if (i->src(0).isIndirect(1) && typeSizeof(i->dType) >= 8) + addHazard(i, i->src(0).getIndirect(1)); } else if (i->op == OP_UNION || i->op == OP_MERGE || diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp index 7992f539782..fe530c76b62 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp @@ -41,7 +41,7 @@ const uint8_t Target::operationSrcNr[] = 0, 0, 0, 0, 0, // BRA, CALL, RET, CONT, BREAK, 0, 0, 0, // PRERET,CONT,BREAK 0, 0, 0, 0, 0, 0, // BRKPT, JOINAT, JOIN, DISCARD, EXIT, MEMBAR - 1, 1, 2, 1, 2, // VFETCH, PFETCH, EXPORT, LINTERP, PINTERP + 1, 1, 1, 2, 1, 2, // VFETCH, PFETCH, AFETCH, EXPORT, LINTERP, PINTERP 1, 1, // EMIT, RESTART 1, 1, 1, // TEX, TXB, TXL, 1, 1, 1, 1, 1, 1, 2, // TXF, TXQ, TXD, TXG, TXLQ, TEXCSAA, TEXPREP @@ -96,8 +96,8 @@ 
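/* (On the ConstantFolding::expr() OP_FMA hunk above: when fma(a, b, c)
 * folds a*b into an immediate k, the replacement is add(c, k) and k must
 * land in source slot 1, since the immediate encodings only accept it
 * there. The fix also moves c's modifier along with c into slot 0; the
 * old code left it on slot 1, i.e. on the immediate.) */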
const OpClass Target::operationClass[] = OPCLASS_FLOW, OPCLASS_FLOW, // MEMBAR OPCLASS_CONTROL, - // VFETCH, PFETCH, EXPORT - OPCLASS_LOAD, OPCLASS_OTHER, OPCLASS_STORE, + // VFETCH, PFETCH, AFETCH, EXPORT + OPCLASS_LOAD, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_STORE, // LINTERP, PINTERP OPCLASS_SFU, OPCLASS_SFU, // EMIT, RESTART diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp index ca545a6024a..f3ddcaa5199 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp @@ -118,7 +118,7 @@ void TargetNV50::initOpInfo() static const uint32_t shortForm[(OP_LAST + 31) / 32] = { // MOV,ADD,SUB,MUL,MAD,SAD,L/PINTERP,RCP,TEX,TXF - 0x00014e40, 0x00000040, 0x00000498, 0x00000000 + 0x00014e40, 0x00000040, 0x00000930, 0x00000000 }; static const operation noDestList[] = { diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp index 7d4a859dde4..27df0eba66b 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp @@ -286,7 +286,8 @@ TargetNVC0::getSVAddress(DataFile shaderFile, const Symbol *sym) const case SV_CLIP_DISTANCE: return 0x2c0 + idx * 4; case SV_POINT_COORD: return 0x2e0 + idx * 4; case SV_FACE: return 0x3fc; - case SV_TESS_FACTOR: return 0x000 + idx * 4; + case SV_TESS_OUTER: return 0x000 + idx * 4; + case SV_TESS_INNER: return 0x010 + idx * 4; case SV_TESS_COORD: return 0x2f0 + idx * 4; case SV_NTID: return kepler ? (0x00 + idx * 4) : ~0; case SV_NCTAID: return kepler ? (0x0c + idx * 4) : ~0; diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.c b/src/gallium/drivers/nouveau/nouveau_buffer.c index 09cdbb53ecb..67e181e803a 100644 --- a/src/gallium/drivers/nouveau/nouveau_buffer.c +++ b/src/gallium/drivers/nouveau/nouveau_buffer.c @@ -22,13 +22,13 @@ struct nouveau_transfer { uint32_t offset; }; -static INLINE struct nouveau_transfer * +static inline struct nouveau_transfer * nouveau_transfer(struct pipe_transfer *transfer) { return (struct nouveau_transfer *)transfer; } -static INLINE boolean +static inline bool nouveau_buffer_malloc(struct nv04_resource *buf) { if (!buf->data) @@ -36,16 +36,11 @@ nouveau_buffer_malloc(struct nv04_resource *buf) return !!buf->data; } -static INLINE boolean +static inline bool nouveau_buffer_allocate(struct nouveau_screen *screen, struct nv04_resource *buf, unsigned domain) { - uint32_t size = buf->base.width0; - - if (buf->base.bind & (PIPE_BIND_CONSTANT_BUFFER | - PIPE_BIND_COMPUTE_RESOURCE | - PIPE_BIND_SHADER_RESOURCE)) - size = align(size, 0x100); + uint32_t size = align(buf->base.width0, 0x100); if (domain == NOUVEAU_BO_VRAM) { buf->mm = nouveau_mm_allocate(screen->mm_VRAM, size, @@ -58,12 +53,12 @@ nouveau_buffer_allocate(struct nouveau_screen *screen, buf->mm = nouveau_mm_allocate(screen->mm_GART, size, &buf->bo, &buf->offset); if (!buf->bo) - return FALSE; + return false; NOUVEAU_DRV_STAT(screen, buf_obj_current_bytes_sys, buf->base.width0); } else { assert(domain == 0); if (!nouveau_buffer_malloc(buf)) - return FALSE; + return false; } buf->domain = domain; if (buf->bo) @@ -71,10 +66,10 @@ nouveau_buffer_allocate(struct nouveau_screen *screen, util_range_set_empty(&buf->valid_buffer_range); - return TRUE; + return true; } -static INLINE void +static inline void release_allocation(struct nouveau_mm_allocation **mm, struct 
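/* (From here on the patch also migrates nouveau from gallium's legacy
 * boolean / TRUE / FALSE typedef to C99 <stdbool.h> bool / true / false.
 * The two are value-compatible, so expressions like "return !!buf->data;"
 * keep their meaning; the gain is real bool semantics and one less
 * portability typedef.) */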
nouveau_fence *fence) { @@ -82,7 +77,7 @@ release_allocation(struct nouveau_mm_allocation **mm, (*mm) = NULL; } -INLINE void +inline void nouveau_buffer_release_gpu_storage(struct nv04_resource *buf) { nouveau_bo_ref(NULL, &buf->bo); @@ -98,7 +93,7 @@ nouveau_buffer_release_gpu_storage(struct nv04_resource *buf) buf->domain = 0; } -static INLINE boolean +static inline bool nouveau_buffer_reallocate(struct nouveau_screen *screen, struct nv04_resource *buf, unsigned domain) { @@ -139,13 +134,13 @@ nouveau_buffer_destroy(struct pipe_screen *pscreen, */ static uint8_t * nouveau_transfer_staging(struct nouveau_context *nv, - struct nouveau_transfer *tx, boolean permit_pb) + struct nouveau_transfer *tx, bool permit_pb) { const unsigned adj = tx->base.box.x & NOUVEAU_MIN_BUFFER_MAP_ALIGN_MASK; const unsigned size = align(tx->base.box.width, 4) + adj; if (!nv->push_data) - permit_pb = FALSE; + permit_pb = false; if ((size <= NOUVEAU_TRANSFER_PUSHBUF_THRESHOLD) && permit_pb) { tx->map = align_malloc(size, NOUVEAU_MIN_BUFFER_MAP_ALIGN); @@ -167,7 +162,7 @@ nouveau_transfer_staging(struct nouveau_context *nv, * buffer. Also updates buf->data if present. * * Maybe just migrate to GART right away if we actually need to do this. */ -static boolean +static bool nouveau_transfer_read(struct nouveau_context *nv, struct nouveau_transfer *tx) { struct nv04_resource *buf = nv04_resource(tx->base.resource); @@ -180,12 +175,12 @@ nouveau_transfer_read(struct nouveau_context *nv, struct nouveau_transfer *tx) buf->bo, buf->offset + base, buf->domain, size); if (nouveau_bo_wait(tx->bo, NOUVEAU_BO_RD, nv->client)) - return FALSE; + return false; if (buf->data) memcpy(buf->data + base, tx->map, size); - return TRUE; + return true; } static void @@ -195,7 +190,7 @@ nouveau_transfer_write(struct nouveau_context *nv, struct nouveau_transfer *tx, struct nv04_resource *buf = nv04_resource(tx->base.resource); uint8_t *data = tx->map + offset; const unsigned base = tx->base.box.x + offset; - const boolean can_cb = !((base | size) & 3); + const bool can_cb = !((base | size) & 3); if (buf->data) memcpy(data, buf->data + base, size); @@ -224,32 +219,32 @@ nouveau_transfer_write(struct nouveau_context *nv, struct nouveau_transfer *tx, /* Does a CPU wait for the buffer's backing data to become reliably accessible * for write/read by waiting on the buffer's relevant fences. 
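 *
 * (Distilled: a reader only has to wait out pending GPU writes, while a
 * writer must wait for every outstanding GPU access. A sketch over the
 * two fences the resource keeps -- fence_wr tracks the last write, fence
 * the last use of any kind:)
 */

static bool
buffer_wait_sketch(struct nouveau_fence *fence,    /* last use */
                   struct nouveau_fence *fence_wr, /* last write */
                   bool will_write)
{
   if (!will_write)
      return !fence_wr || nouveau_fence_wait(fence_wr);
   return !fence || nouveau_fence_wait(fence);
}

/*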
*/ -static INLINE boolean +static inline bool nouveau_buffer_sync(struct nv04_resource *buf, unsigned rw) { if (rw == PIPE_TRANSFER_READ) { if (!buf->fence_wr) - return TRUE; + return true; NOUVEAU_DRV_STAT_RES(buf, buf_non_kernel_fence_sync_count, !nouveau_fence_signalled(buf->fence_wr)); if (!nouveau_fence_wait(buf->fence_wr)) - return FALSE; + return false; } else { if (!buf->fence) - return TRUE; + return true; NOUVEAU_DRV_STAT_RES(buf, buf_non_kernel_fence_sync_count, !nouveau_fence_signalled(buf->fence)); if (!nouveau_fence_wait(buf->fence)) - return FALSE; + return false; nouveau_fence_ref(NULL, &buf->fence); } nouveau_fence_ref(NULL, &buf->fence_wr); - return TRUE; + return true; } -static INLINE boolean +static inline bool nouveau_buffer_busy(struct nv04_resource *buf, unsigned rw) { if (rw == PIPE_TRANSFER_READ) @@ -258,7 +253,7 @@ nouveau_buffer_busy(struct nv04_resource *buf, unsigned rw) return (buf->fence && !nouveau_fence_signalled(buf->fence)); } -static INLINE void +static inline void nouveau_buffer_transfer_init(struct nouveau_transfer *tx, struct pipe_resource *resource, const struct pipe_box *box, @@ -280,7 +275,7 @@ nouveau_buffer_transfer_init(struct nouveau_transfer *tx, tx->map = NULL; } -static INLINE void +static inline void nouveau_buffer_transfer_del(struct nouveau_context *nv, struct nouveau_transfer *tx) { @@ -297,11 +292,11 @@ nouveau_buffer_transfer_del(struct nouveau_context *nv, } /* Creates a cache in system memory of the buffer data. */ -static boolean +static bool nouveau_buffer_cache(struct nouveau_context *nv, struct nv04_resource *buf) { struct nouveau_transfer tx; - boolean ret; + bool ret; tx.base.resource = &buf->base; tx.base.box.x = 0; tx.base.box.width = buf->base.width0; @@ -310,13 +305,13 @@ nouveau_buffer_cache(struct nouveau_context *nv, struct nv04_resource *buf) if (!buf->data) if (!nouveau_buffer_malloc(buf)) - return FALSE; + return false; if (!(buf->status & NOUVEAU_BUFFER_STATUS_DIRTY)) - return TRUE; + return true; nv->stats.buf_cache_count++; - if (!nouveau_transfer_staging(nv, &tx, FALSE)) - return FALSE; + if (!nouveau_transfer_staging(nv, &tx, false)) + return false; ret = nouveau_transfer_read(nv, &tx); if (ret) { @@ -335,15 +330,15 @@ nouveau_buffer_cache(struct nouveau_context *nv, struct nv04_resource *buf) * resource. This can be useful if we would otherwise have to wait for a read * operation to complete on this data. */ -static INLINE boolean +static inline bool nouveau_buffer_should_discard(struct nv04_resource *buf, unsigned usage) { if (!(usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE)) - return FALSE; + return false; if (unlikely(buf->base.bind & PIPE_BIND_SHARED)) - return FALSE; + return false; if (unlikely(usage & PIPE_TRANSFER_PERSISTENT)) - return FALSE; + return false; return buf->mm && nouveau_buffer_busy(buf, PIPE_TRANSFER_WRITE); } @@ -413,7 +408,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe, * back into VRAM on unmap. */ if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) buf->status &= NOUVEAU_BUFFER_STATUS_REALLOC_MASK; - nouveau_transfer_staging(nv, tx, TRUE); + nouveau_transfer_staging(nv, tx, true); } else { if (buf->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) { /* The GPU is currently writing to this buffer. 
Copy its current @@ -424,13 +419,13 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe, align_free(buf->data); buf->data = NULL; } - nouveau_transfer_staging(nv, tx, FALSE); + nouveau_transfer_staging(nv, tx, false); nouveau_transfer_read(nv, tx); } else { /* The buffer is currently idle. Create a staging area for writes, * and make sure that the cached data is up-to-date. */ if (usage & PIPE_TRANSFER_WRITE) - nouveau_transfer_staging(nv, tx, TRUE); + nouveau_transfer_staging(nv, tx, true); if (!buf->data) nouveau_buffer_cache(nv, buf); } @@ -482,7 +477,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe, if (usage & PIPE_TRANSFER_DISCARD_RANGE) { /* The whole range is being discarded, so it doesn't matter what was * there before. No need to copy anything over. */ - nouveau_transfer_staging(nv, tx, TRUE); + nouveau_transfer_staging(nv, tx, true); map = tx->map; } else if (nouveau_buffer_busy(buf, PIPE_TRANSFER_READ)) { @@ -493,7 +488,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe, } else { /* It is expected that the returned buffer be a representation of the * data in question, so we must copy it over from the buffer. */ - nouveau_transfer_staging(nv, tx, TRUE); + nouveau_transfer_staging(nv, tx, true); if (tx->map) memcpy(tx->map, map, box->width); map = tx->map; @@ -544,7 +539,7 @@ nouveau_buffer_transfer_unmap(struct pipe_context *pipe, const uint8_t bind = buf->base.bind; /* make sure we invalidate dedicated caches */ if (bind & (PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER)) - nv->vbo_dirty = TRUE; + nv->vbo_dirty = true; } util_range_add(&buf->valid_buffer_range, @@ -639,7 +634,7 @@ nouveau_buffer_create(struct pipe_screen *pscreen, { struct nouveau_screen *screen = nouveau_screen(pscreen); struct nv04_resource *buffer; - boolean ret; + bool ret; buffer = CALLOC_STRUCT(nv04_resource); if (!buffer) @@ -683,7 +678,7 @@ nouveau_buffer_create(struct pipe_screen *pscreen, } ret = nouveau_buffer_allocate(screen, buffer, buffer->domain); - if (ret == FALSE) + if (ret == false) goto fail; if (buffer->domain == NOUVEAU_BO_VRAM && screen->hint_buf_keep_sysmem_copy) @@ -730,20 +725,20 @@ nouveau_user_buffer_create(struct pipe_screen *pscreen, void *ptr, return &buffer->base; } -static INLINE boolean +static inline bool nouveau_buffer_data_fetch(struct nouveau_context *nv, struct nv04_resource *buf, struct nouveau_bo *bo, unsigned offset, unsigned size) { if (!nouveau_buffer_malloc(buf)) - return FALSE; + return false; if (nouveau_bo_map(bo, NOUVEAU_BO_RD, nv->client)) - return FALSE; + return false; memcpy(buf->data, (uint8_t *)bo->map + offset, size); - return TRUE; + return true; } /* Migrate a linear buffer (vertex, index, constants) USER -> GART -> VRAM. 
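 * The staged route exists because user memory is not GPU-visible: data is
 * first copied into GART (CPU-mappable and GPU-readable), and only then
 * can the GPU move it into VRAM; the direct USER -> VRAM case below goes
 * through a staging nouveau_transfer_write() instead.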
*/ -boolean +bool nouveau_buffer_migrate(struct nouveau_context *nv, struct nv04_resource *buf, const unsigned new_domain) { @@ -758,7 +753,7 @@ nouveau_buffer_migrate(struct nouveau_context *nv, if (new_domain == NOUVEAU_BO_GART && old_domain == 0) { if (!nouveau_buffer_allocate(screen, buf, new_domain)) - return FALSE; + return false; ret = nouveau_bo_map(buf->bo, 0, nv->client); if (ret) return ret; @@ -771,7 +766,7 @@ nouveau_buffer_migrate(struct nouveau_context *nv, if (new_domain == NOUVEAU_BO_VRAM) { /* keep a system memory copy of our data in case we hit a fallback */ if (!nouveau_buffer_data_fetch(nv, buf, buf->bo, buf->offset, size)) - return FALSE; + return false; if (nouveau_mesa_debug) debug_printf("migrating %u KiB to VRAM\n", size / 1024); } @@ -792,28 +787,28 @@ nouveau_buffer_migrate(struct nouveau_context *nv, if (new_domain == NOUVEAU_BO_VRAM && old_domain == 0) { struct nouveau_transfer tx; if (!nouveau_buffer_allocate(screen, buf, NOUVEAU_BO_VRAM)) - return FALSE; + return false; tx.base.resource = &buf->base; tx.base.box.x = 0; tx.base.box.width = buf->base.width0; tx.bo = NULL; tx.map = NULL; - if (!nouveau_transfer_staging(nv, &tx, FALSE)) - return FALSE; + if (!nouveau_transfer_staging(nv, &tx, false)) + return false; nouveau_transfer_write(nv, &tx, 0, tx.base.box.width); nouveau_buffer_transfer_del(nv, &tx); } else - return FALSE; + return false; assert(buf->domain == new_domain); - return TRUE; + return true; } /* Migrate data from glVertexAttribPointer(non-VBO) user buffers to GART. * We'd like to only allocate @size bytes here, but then we'd have to rebase * the vertex indices ... */ -boolean +bool nouveau_user_buffer_upload(struct nouveau_context *nv, struct nv04_resource *buf, unsigned base, unsigned size) @@ -825,20 +820,20 @@ nouveau_user_buffer_upload(struct nouveau_context *nv, buf->base.width0 = base + size; if (!nouveau_buffer_reallocate(screen, buf, NOUVEAU_BO_GART)) - return FALSE; + return false; ret = nouveau_bo_map(buf->bo, 0, nv->client); if (ret) - return FALSE; + return false; memcpy((uint8_t *)buf->bo->map + buf->offset + base, buf->data + base, size); - return TRUE; + return true; } /* Scratch data allocation. */ -static INLINE int +static inline int nouveau_scratch_bo_alloc(struct nouveau_context *nv, struct nouveau_bo **pbo, unsigned size) { @@ -875,7 +870,7 @@ nouveau_scratch_runout_release(struct nouveau_context *nv) /* Allocate an extra bo if we can't fit everything we need simultaneously. * (Could happen for very large user arrays.) */ -static INLINE boolean +static inline bool nouveau_scratch_runout(struct nouveau_context *nv, unsigned size) { int ret; @@ -909,7 +904,7 @@ nouveau_scratch_runout(struct nouveau_context *nv, unsigned size) /* Continue to next scratch buffer, if available (no wrapping, large enough). * Allocate it if it has not yet been created. 
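A minimal sketch of the ring-advance rule implemented by nouveau_scratch_next() below: move to the next slot unless the request exceeds one slab or the ring would collide with buffers still in flight at the wrap marker. The toy_* names and the slab count are hypothetical; the real code also lazily allocates the slot's bo.

#include <stdbool.h>

#define TOY_MAX_SCRATCH_BUFS 4

struct toy_scratch {
   unsigned id;      /* slot currently being filled */
   unsigned wrap;    /* oldest slot the GPU may still be using */
   unsigned bo_size; /* fixed size of every scratch slab */
};

bool
toy_scratch_next(struct toy_scratch *s, unsigned size)
{
   unsigned next = (s->id + 1) % TOY_MAX_SCRATCH_BUFS;

   /* Refuse, as the driver does, when the request cannot fit in one slab
    * or the ring would catch up with buffers still in flight. */
   if (size > s->bo_size || next == s->wrap)
      return false;

   s->id = next;
   return true;
}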
*/ -static INLINE boolean +static inline bool nouveau_scratch_next(struct nouveau_context *nv, unsigned size) { struct nouveau_bo *bo; @@ -917,14 +912,14 @@ nouveau_scratch_next(struct nouveau_context *nv, unsigned size) const unsigned i = (nv->scratch.id + 1) % NOUVEAU_MAX_SCRATCH_BUFS; if ((size > nv->scratch.bo_size) || (i == nv->scratch.wrap)) - return FALSE; + return false; nv->scratch.id = i; bo = nv->scratch.bo[i]; if (!bo) { ret = nouveau_scratch_bo_alloc(nv, &bo, nv->scratch.bo_size); if (ret) - return FALSE; + return false; nv->scratch.bo[i] = bo; } nv->scratch.current = bo; @@ -937,10 +932,10 @@ nouveau_scratch_next(struct nouveau_context *nv, unsigned size) return !ret; } -static boolean +static bool nouveau_scratch_more(struct nouveau_context *nv, unsigned min_size) { - boolean ret; + bool ret; ret = nouveau_scratch_next(nv, min_size); if (!ret) diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.h b/src/gallium/drivers/nouveau/nouveau_buffer.h index de77f481da3..7e6a6cc804b 100644 --- a/src/gallium/drivers/nouveau/nouveau_buffer.h +++ b/src/gallium/drivers/nouveau/nouveau_buffer.h @@ -58,7 +58,7 @@ nouveau_copy_buffer(struct nouveau_context *, struct nv04_resource *dst, unsigned dst_pos, struct nv04_resource *src, unsigned src_pos, unsigned size); -boolean +bool nouveau_buffer_migrate(struct nouveau_context *, struct nv04_resource *, unsigned domain); @@ -66,20 +66,20 @@ void * nouveau_resource_map_offset(struct nouveau_context *, struct nv04_resource *, uint32_t offset, uint32_t flags); -static INLINE void +static inline void nouveau_resource_unmap(struct nv04_resource *res) { /* no-op */ } -static INLINE struct nv04_resource * +static inline struct nv04_resource * nv04_resource(struct pipe_resource *resource) { return (struct nv04_resource *)resource; } /* is resource mapped into the GPU's address space (i.e. VRAM or GART) ? */ -static INLINE boolean +static inline bool nouveau_resource_mapped_by_gpu(struct pipe_resource *resource) { return nv04_resource(resource)->domain != 0; @@ -93,7 +93,7 @@ struct pipe_resource * nouveau_user_buffer_create(struct pipe_screen *screen, void *ptr, unsigned bytes, unsigned usage); -boolean +bool nouveau_user_buffer_upload(struct nouveau_context *, struct nv04_resource *, unsigned base, unsigned size); diff --git a/src/gallium/drivers/nouveau/nouveau_context.h b/src/gallium/drivers/nouveau/nouveau_context.h index c2ba0159afe..24deb7ee4c0 100644 --- a/src/gallium/drivers/nouveau/nouveau_context.h +++ b/src/gallium/drivers/nouveau/nouveau_context.h @@ -13,7 +13,7 @@ struct nouveau_context { struct nouveau_client *client; struct nouveau_pushbuf *pushbuf; - boolean vbo_dirty; + bool vbo_dirty; void (*copy_data)(struct nouveau_context *, struct nouveau_bo *dst, unsigned, unsigned, @@ -53,7 +53,7 @@ struct nouveau_context { } stats; }; -static INLINE struct nouveau_context * +static inline struct nouveau_context * nouveau_context(struct pipe_context *pipe) { return (struct nouveau_context *)pipe; @@ -69,7 +69,7 @@ nouveau_scratch_runout_release(struct nouveau_context *); * because we don't want to un-bo_ref each allocation every time. This is less * work, and we need the wrap index anyway for extreme situations. 
*/ -static INLINE void +static inline void nouveau_scratch_done(struct nouveau_context *nv) { nv->scratch.wrap = nv->scratch.id; @@ -84,7 +84,7 @@ void * nouveau_scratch_get(struct nouveau_context *, unsigned size, uint64_t *gpu_addr, struct nouveau_bo **); -static INLINE void +static inline void nouveau_context_destroy(struct nouveau_context *ctx) { int i; @@ -96,7 +96,7 @@ nouveau_context_destroy(struct nouveau_context *ctx) FREE(ctx); } -static INLINE void +static inline void nouveau_context_update_frame_stats(struct nouveau_context *nv) { nv->stats.buf_cache_frame <<= 1; @@ -104,7 +104,7 @@ nouveau_context_update_frame_stats(struct nouveau_context *nv) nv->stats.buf_cache_count = 0; nv->stats.buf_cache_frame |= 1; if ((nv->stats.buf_cache_frame & 0xf) == 0xf) - nv->screen->hint_buf_keep_sysmem_copy = TRUE; + nv->screen->hint_buf_keep_sysmem_copy = true; } } diff --git a/src/gallium/drivers/nouveau/nouveau_fence.c b/src/gallium/drivers/nouveau/nouveau_fence.c index 17a5174594d..abcdb479954 100644 --- a/src/gallium/drivers/nouveau/nouveau_fence.c +++ b/src/gallium/drivers/nouveau/nouveau_fence.c @@ -28,13 +28,13 @@ #include <sched.h> #endif -boolean +bool nouveau_fence_new(struct nouveau_screen *screen, struct nouveau_fence **fence, - boolean emit) + bool emit) { *fence = CALLOC_STRUCT(nouveau_fence); if (!*fence) - return FALSE; + return false; (*fence)->screen = screen; (*fence)->ref = 1; @@ -43,7 +43,7 @@ nouveau_fence_new(struct nouveau_screen *screen, struct nouveau_fence **fence, if (emit) nouveau_fence_emit(*fence); - return TRUE; + return true; } static void @@ -58,7 +58,7 @@ nouveau_fence_trigger_work(struct nouveau_fence *fence) } } -boolean +bool nouveau_fence_work(struct nouveau_fence *fence, void (*func)(void *), void *data) { @@ -66,16 +66,16 @@ nouveau_fence_work(struct nouveau_fence *fence, if (!fence || fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) { func(data); - return TRUE; + return true; } work = CALLOC_STRUCT(nouveau_fence_work); if (!work) - return FALSE; + return false; work->func = func; work->data = data; LIST_ADD(&work->list, &fence->work); - return TRUE; + return true; } void @@ -132,7 +132,7 @@ nouveau_fence_del(struct nouveau_fence *fence) } void -nouveau_fence_update(struct nouveau_screen *screen, boolean flushed) +nouveau_fence_update(struct nouveau_screen *screen, bool flushed) { struct nouveau_fence *fence; struct nouveau_fence *next = NULL; @@ -167,21 +167,21 @@ nouveau_fence_update(struct nouveau_screen *screen, boolean flushed) #define NOUVEAU_FENCE_MAX_SPINS (1 << 31) -boolean +bool nouveau_fence_signalled(struct nouveau_fence *fence) { struct nouveau_screen *screen = fence->screen; if (fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) - return TRUE; + return true; if (fence->state >= NOUVEAU_FENCE_STATE_EMITTED) - nouveau_fence_update(screen, FALSE); + nouveau_fence_update(screen, false); return fence->state == NOUVEAU_FENCE_STATE_SIGNALLED; } -boolean +bool nouveau_fence_wait(struct nouveau_fence *fence) { struct nouveau_screen *screen = fence->screen; @@ -195,16 +195,16 @@ nouveau_fence_wait(struct nouveau_fence *fence) if (fence->state < NOUVEAU_FENCE_STATE_FLUSHED) if (nouveau_pushbuf_kick(screen->pushbuf, screen->pushbuf->channel)) - return FALSE; + return false; if (fence == screen->fence.current) nouveau_fence_next(screen); do { - nouveau_fence_update(screen, FALSE); + nouveau_fence_update(screen, false); if (fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) - return TRUE; + return true; if (!spins) NOUVEAU_DRV_STAT(screen, 
any_non_kernel_fence_sync_count, 1); spins++; @@ -218,7 +218,7 @@ nouveau_fence_wait(struct nouveau_fence *fence) fence->sequence, screen->fence.sequence_ack, screen->fence.sequence); - return FALSE; + return false; } void @@ -229,5 +229,5 @@ nouveau_fence_next(struct nouveau_screen *screen) nouveau_fence_ref(NULL, &screen->fence.current); - nouveau_fence_new(screen, &screen->fence.current, FALSE); + nouveau_fence_new(screen, &screen->fence.current, false); } diff --git a/src/gallium/drivers/nouveau/nouveau_fence.h b/src/gallium/drivers/nouveau/nouveau_fence.h index 7bb132a5d15..a1587051b0f 100644 --- a/src/gallium/drivers/nouveau/nouveau_fence.h +++ b/src/gallium/drivers/nouveau/nouveau_fence.h @@ -29,15 +29,15 @@ struct nouveau_fence { void nouveau_fence_emit(struct nouveau_fence *); void nouveau_fence_del(struct nouveau_fence *); -boolean nouveau_fence_new(struct nouveau_screen *, struct nouveau_fence **, - boolean emit); -boolean nouveau_fence_work(struct nouveau_fence *, void (*)(void *), void *); -void nouveau_fence_update(struct nouveau_screen *, boolean flushed); -void nouveau_fence_next(struct nouveau_screen *); -boolean nouveau_fence_wait(struct nouveau_fence *); -boolean nouveau_fence_signalled(struct nouveau_fence *); - -static INLINE void +bool nouveau_fence_new(struct nouveau_screen *, struct nouveau_fence **, + bool emit); +bool nouveau_fence_work(struct nouveau_fence *, void (*)(void *), void *); +void nouveau_fence_update(struct nouveau_screen *, bool flushed); +void nouveau_fence_next(struct nouveau_screen *); +bool nouveau_fence_wait(struct nouveau_fence *); +bool nouveau_fence_signalled(struct nouveau_fence *); + +static inline void nouveau_fence_ref(struct nouveau_fence *fence, struct nouveau_fence **ref) { if (fence) @@ -51,7 +51,7 @@ nouveau_fence_ref(struct nouveau_fence *fence, struct nouveau_fence **ref) *ref = fence; } -static INLINE struct nouveau_fence * +static inline struct nouveau_fence * nouveau_fence(struct pipe_fence_handle *fence) { return (struct nouveau_fence *)fence; diff --git a/src/gallium/drivers/nouveau/nouveau_gldefs.h b/src/gallium/drivers/nouveau/nouveau_gldefs.h index ff97aaa9af0..1538c7b6e57 100644 --- a/src/gallium/drivers/nouveau/nouveau_gldefs.h +++ b/src/gallium/drivers/nouveau/nouveau_gldefs.h @@ -1,7 +1,7 @@ #ifndef __NOUVEAU_GLDEFS_H__ #define __NOUVEAU_GLDEFS_H__ -static INLINE unsigned +static inline unsigned nvgl_blend_func(unsigned factor) { switch (factor) { @@ -40,7 +40,7 @@ nvgl_blend_func(unsigned factor) } } -static INLINE unsigned +static inline unsigned nvgl_blend_eqn(unsigned func) { switch (func) { @@ -59,7 +59,7 @@ nvgl_blend_eqn(unsigned func) } } -static INLINE unsigned +static inline unsigned nvgl_logicop_func(unsigned func) { switch (func) { @@ -100,7 +100,7 @@ nvgl_logicop_func(unsigned func) } } -static INLINE unsigned +static inline unsigned nvgl_comparison_op(unsigned op) { switch (op) { @@ -125,7 +125,7 @@ nvgl_comparison_op(unsigned op) } } -static INLINE unsigned +static inline unsigned nvgl_polygon_mode(unsigned mode) { switch (mode) { @@ -140,7 +140,7 @@ nvgl_polygon_mode(unsigned mode) } } -static INLINE unsigned +static inline unsigned nvgl_stencil_op(unsigned op) { switch (op) { @@ -165,7 +165,7 @@ nvgl_stencil_op(unsigned op) } } -static INLINE unsigned +static inline unsigned nvgl_primitive(unsigned prim) { switch (prim) { case PIPE_PRIM_POINTS: diff --git a/src/gallium/drivers/nouveau/nouveau_mm.c b/src/gallium/drivers/nouveau/nouveau_mm.c index 9c454c56db0..43b3d99f48a 100644 --- 
a/src/gallium/drivers/nouveau/nouveau_mm.c +++ b/src/gallium/drivers/nouveau/nouveau_mm.c @@ -70,7 +70,7 @@ mm_slab_alloc(struct mm_slab *slab) return -1; } -static INLINE void +static inline void mm_slab_free(struct mm_slab *slab, int i) { assert(i < slab->count); @@ -79,7 +79,7 @@ mm_slab_free(struct mm_slab *slab, int i) assert(slab->free <= slab->count); } -static INLINE int +static inline int mm_get_order(uint32_t size) { int s = __builtin_clz(size) ^ 31; @@ -104,7 +104,7 @@ mm_bucket_by_size(struct nouveau_mman *cache, unsigned size) } /* size of bo allocation for slab with chunks of (1 << chunk_order) bytes */ -static INLINE uint32_t +static inline uint32_t mm_default_slab_size(unsigned chunk_order) { static const int8_t slab_order[MM_MAX_ORDER - MM_MIN_ORDER + 1] = @@ -263,7 +263,7 @@ nouveau_mm_create(struct nouveau_device *dev, uint32_t domain, return cache; } -static INLINE void +static inline void nouveau_mm_free_slabs(struct list_head *head) { struct mm_slab *slab, *next; diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c index c6e5074db19..b2290e7e784 100644 --- a/src/gallium/drivers/nouveau/nouveau_screen.c +++ b/src/gallium/drivers/nouveau/nouveau_screen.c @@ -68,17 +68,13 @@ nouveau_screen_fence_ref(struct pipe_screen *pscreen, } static boolean -nouveau_screen_fence_signalled(struct pipe_screen *screen, - struct pipe_fence_handle *pfence) -{ - return nouveau_fence_signalled(nouveau_fence(pfence)); -} - -static boolean nouveau_screen_fence_finish(struct pipe_screen *screen, struct pipe_fence_handle *pfence, uint64_t timeout) { + if (!timeout) + return nouveau_fence_signalled(nouveau_fence(pfence)); + return nouveau_fence_wait(nouveau_fence(pfence)); } @@ -115,7 +111,7 @@ nouveau_screen_bo_from_handle(struct pipe_screen *pscreen, } -boolean +bool nouveau_screen_bo_get_handle(struct pipe_screen *pscreen, struct nouveau_bo *bo, unsigned stride, @@ -127,11 +123,11 @@ nouveau_screen_bo_get_handle(struct pipe_screen *pscreen, return nouveau_bo_name_get(bo, &whandle->handle) == 0; } else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) { whandle->handle = bo->handle; - return TRUE; + return true; } else if (whandle->type == DRM_API_HANDLE_TYPE_FD) { return nouveau_bo_set_prime(bo, (int *)&whandle->handle) == 0; } else { - return FALSE; + return false; } } @@ -203,7 +199,6 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev) pscreen->get_timestamp = nouveau_screen_get_timestamp; pscreen->fence_reference = nouveau_screen_fence_ref; - pscreen->fence_signalled = nouveau_screen_fence_signalled; pscreen->fence_finish = nouveau_screen_fence_finish; util_format_s3tc_init(); @@ -214,7 +209,8 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev) PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | PIPE_BIND_CURSOR | PIPE_BIND_SAMPLER_VIEW | - PIPE_BIND_SHADER_RESOURCE | PIPE_BIND_COMPUTE_RESOURCE | + PIPE_BIND_SHADER_BUFFER | PIPE_BIND_SHADER_IMAGE | + PIPE_BIND_COMPUTE_RESOURCE | PIPE_BIND_GLOBAL; screen->sysmem_bindings = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_STREAM_OUTPUT | diff --git a/src/gallium/drivers/nouveau/nouveau_screen.h b/src/gallium/drivers/nouveau/nouveau_screen.h index 30041b271c9..4fdde9fbf3d 100644 --- a/src/gallium/drivers/nouveau/nouveau_screen.h +++ b/src/gallium/drivers/nouveau/nouveau_screen.h @@ -49,7 +49,7 @@ struct nouveau_screen { int64_t cpu_gpu_time_delta; - boolean hint_buf_keep_sysmem_copy; + bool hint_buf_keep_sysmem_copy; unsigned vram_domain; @@ -112,15 
+112,15 @@ struct nouveau_screen { # define NOUVEAU_DRV_STAT_IFD(x) #endif -static INLINE struct nouveau_screen * +static inline struct nouveau_screen * nouveau_screen(struct pipe_screen *pscreen) { return (struct nouveau_screen *)pscreen; } -boolean nouveau_drm_screen_unref(struct nouveau_screen *screen); +bool nouveau_drm_screen_unref(struct nouveau_screen *screen); -boolean +bool nouveau_screen_bo_get_handle(struct pipe_screen *pscreen, struct nouveau_bo *bo, unsigned stride, diff --git a/src/gallium/drivers/nouveau/nouveau_statebuf.h b/src/gallium/drivers/nouveau/nouveau_statebuf.h index 4f8bd7bdf16..f38014091ba 100644 --- a/src/gallium/drivers/nouveau/nouveau_statebuf.h +++ b/src/gallium/drivers/nouveau/nouveau_statebuf.h @@ -20,7 +20,7 @@ struct nouveau_statebuf_builder #define sb_data(sb, v) *(sb).p++ = (v) #endif -static INLINE uint32_t sb_header(unsigned subc, unsigned mthd, unsigned size) +static inline uint32_t sb_header(unsigned subc, unsigned mthd, unsigned size) { return (size << 18) | (subc << 13) | mthd; } diff --git a/src/gallium/drivers/nouveau/nouveau_video.c b/src/gallium/drivers/nouveau/nouveau_video.c index d6330fa63a8..e414a534418 100644 --- a/src/gallium/drivers/nouveau/nouveau_video.c +++ b/src/gallium/drivers/nouveau/nouveau_video.c @@ -100,7 +100,7 @@ nouveau_vpe_fini(struct nouveau_decoder *dec) { dec->current = dec->future = dec->past = 8; } -static INLINE void +static inline void nouveau_vpe_mb_dct_blocks(struct nouveau_decoder *dec, const struct pipe_mpeg12_macroblock *mb) { int cbb; @@ -125,7 +125,7 @@ nouveau_vpe_mb_dct_blocks(struct nouveau_decoder *dec, const struct pipe_mpeg12_ } } -static INLINE void +static inline void nouveau_vpe_mb_data_blocks(struct nouveau_decoder *dec, const struct pipe_mpeg12_macroblock *mb) { int cbb; @@ -143,7 +143,7 @@ nouveau_vpe_mb_data_blocks(struct nouveau_decoder *dec, const struct pipe_mpeg12 } } -static INLINE void +static inline void nouveau_vpe_mb_dct_header(struct nouveau_decoder *dec, const struct pipe_mpeg12_macroblock *mb, bool luma) @@ -187,7 +187,7 @@ nouveau_vpe_mb_dct_header(struct nouveau_decoder *dec, x | (y << NV17_MPEG_CMD_MB_COORDS_Y__SHIFT)); } -static INLINE unsigned int +static inline unsigned int nouveau_vpe_mb_mv_flags(bool luma, int mv_h, int mv_v, bool forward, bool first, bool vert) { unsigned mc_header = 0; @@ -228,7 +228,7 @@ static int div_up(int val, int mult) { return val / mult; } -static INLINE void +static inline void nouveau_vpe_mb_mv(struct nouveau_decoder *dec, unsigned mc_header, bool luma, bool frame, bool forward, bool vert, int x, int y, const short motions[2], @@ -296,16 +296,16 @@ nouveau_vpe_mb_mv_header(struct nouveau_decoder *dec, case PIPE_MPEG12_MO_TYPE_DUAL_PRIME: { base = NV17_MPEG_CMD_CHROMA_MV_HEADER_COUNT_2; if (forward) { - nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE, FALSE, - x, y, mb->PMV[0][0], dec->past, TRUE); - nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE, TRUE, - x, y2, mb->PMV[0][0], dec->past, FALSE); + nouveau_vpe_mb_mv(dec, base, luma, frame, true, false, + x, y, mb->PMV[0][0], dec->past, true); + nouveau_vpe_mb_mv(dec, base, luma, frame, true, true, + x, y2, mb->PMV[0][0], dec->past, false); } if (backward && forward) { - nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, TRUE, - x, y, mb->PMV[1][0], dec->future, TRUE); - nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, FALSE, - x, y2, mb->PMV[1][1], dec->future, FALSE); + nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, true, + x, y, mb->PMV[1][0], dec->future, true); + nouveau_vpe_mb_mv(dec, 
base, luma, frame, !forward, false, + x, y2, mb->PMV[1][1], dec->future, false); } else assert(!backward); break; } @@ -320,13 +320,13 @@ nouveau_vpe_mb_mv_header(struct nouveau_decoder *dec, if (frame) base |= NV17_MPEG_CMD_CHROMA_MV_HEADER_TYPE_FRAME; if (forward) - nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE, + nouveau_vpe_mb_mv(dec, base, luma, frame, true, dec->picture_structure != PIPE_MPEG12_PICTURE_STRUCTURE_FIELD_TOP, - x, y, mb->PMV[0][0], dec->past, TRUE); + x, y, mb->PMV[0][0], dec->past, true); if (backward && forward) - nouveau_vpe_mb_mv(dec, base, luma, frame, FALSE, + nouveau_vpe_mb_mv(dec, base, luma, frame, false, dec->picture_structure == PIPE_MPEG12_PICTURE_STRUCTURE_FIELD_TOP, - x, y, mb->PMV[0][1], dec->future, TRUE); + x, y, mb->PMV[0][1], dec->future, true); else assert(!backward); break; } @@ -341,11 +341,11 @@ mv1: base |= NV17_MPEG_CMD_CHROMA_MV_HEADER_TYPE_FRAME; /* frame 16x16 */ if (forward) - nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE, FALSE, - x, y, mb->PMV[0][0], dec->past, TRUE); + nouveau_vpe_mb_mv(dec, base, luma, frame, true, false, + x, y, mb->PMV[0][0], dec->past, true); if (backward) - nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, FALSE, - x, y, mb->PMV[0][1], dec->future, TRUE); + nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, false, + x, y, mb->PMV[0][1], dec->future, true); return; mv2: @@ -353,20 +353,20 @@ mv2: if (!frame) base |= NV17_MPEG_CMD_CHROMA_MV_HEADER_MV_SPLIT_HALF_MB; if (forward) { - nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE, + nouveau_vpe_mb_mv(dec, base, luma, frame, true, mb->motion_vertical_field_select & PIPE_MPEG12_FS_FIRST_FORWARD, - x, y, mb->PMV[0][0], dec->past, TRUE); - nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE, + x, y, mb->PMV[0][0], dec->past, true); + nouveau_vpe_mb_mv(dec, base, luma, frame, true, mb->motion_vertical_field_select & PIPE_MPEG12_FS_SECOND_FORWARD, - x, y2, mb->PMV[1][0], dec->past, FALSE); + x, y2, mb->PMV[1][0], dec->past, false); } if (backward) { nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, mb->motion_vertical_field_select & PIPE_MPEG12_FS_FIRST_BACKWARD, - x, y, mb->PMV[0][1], dec->future, TRUE); + x, y, mb->PMV[0][1], dec->future, true); nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, mb->motion_vertical_field_select & PIPE_MPEG12_FS_SECOND_BACKWARD, - x, y2, mb->PMV[1][1], dec->future, FALSE); + x, y2, mb->PMV[1][1], dec->future, false); } } @@ -438,14 +438,14 @@ nouveau_decoder_decode_macroblock(struct pipe_video_codec *decoder, mb = (const struct pipe_mpeg12_macroblock *)pipe_mb; for (i = 0; i < num_macroblocks; ++i, mb++) { if (mb->macroblock_type & PIPE_MPEG12_MB_TYPE_INTRA) { - nouveau_vpe_mb_dct_header(dec, mb, TRUE); - nouveau_vpe_mb_dct_header(dec, mb, FALSE); + nouveau_vpe_mb_dct_header(dec, mb, true); + nouveau_vpe_mb_dct_header(dec, mb, false); } else { - nouveau_vpe_mb_mv_header(dec, mb, TRUE); - nouveau_vpe_mb_dct_header(dec, mb, TRUE); + nouveau_vpe_mb_mv_header(dec, mb, true); + nouveau_vpe_mb_dct_header(dec, mb, true); - nouveau_vpe_mb_mv_header(dec, mb, FALSE); - nouveau_vpe_mb_dct_header(dec, mb, FALSE); + nouveau_vpe_mb_mv_header(dec, mb, false); + nouveau_vpe_mb_dct_header(dec, mb, false); } if (dec->base.entrypoint <= PIPE_VIDEO_ENTRYPOINT_IDCT) nouveau_vpe_mb_dct_blocks(dec, mb); diff --git a/src/gallium/drivers/nouveau/nouveau_video.h b/src/gallium/drivers/nouveau/nouveau_video.h index 08d48b371fd..fd1bd527deb 100644 --- a/src/gallium/drivers/nouveau/nouveau_video.h +++ b/src/gallium/drivers/nouveau/nouveau_video.h @@ -45,7 +45,7 @@ 
struct nouveau_decoder { #define NV31_VIDEO_BIND_CMD NV31_MPEG_IMAGE_Y_OFFSET__LEN #define NV31_VIDEO_BIND_COUNT (NV31_MPEG_IMAGE_Y_OFFSET__LEN + 1) -static INLINE void +static inline void nouveau_vpe_write(struct nouveau_decoder *dec, unsigned data) { dec->cmds[dec->ofs++] = data; } @@ -54,33 +54,33 @@ nouveau_vpe_write(struct nouveau_decoder *dec, unsigned data) { #define NV31_MPEG(mthd) SUBC_MPEG(NV31_MPEG_##mthd) #define NV84_MPEG(mthd) SUBC_MPEG(NV84_MPEG_##mthd) -static INLINE uint32_t +static inline uint32_t NV04_FIFO_PKHDR(int subc, int mthd, unsigned size) { return 0x00000000 | (size << 18) | (subc << 13) | mthd; } -static INLINE uint32_t +static inline uint32_t NV04_FIFO_PKHDR_NI(int subc, int mthd, unsigned size) { return 0x40000000 | (size << 18) | (subc << 13) | mthd; } -static INLINE void +static inline void BEGIN_NV04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) { PUSH_SPACE(push, size + 1); PUSH_DATA (push, NV04_FIFO_PKHDR(subc, mthd, size)); } -static INLINE void +static inline void BEGIN_NI04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) { PUSH_SPACE(push, size + 1); PUSH_DATA (push, NV04_FIFO_PKHDR_NI(subc, mthd, size)); } -static INLINE void +static inline void PUSH_MTHDl(struct nouveau_pushbuf *push, int subc, int mthd, struct nouveau_bo *bo, uint32_t offset, struct nouveau_bufctx *ctx, int bin, uint32_t rw) diff --git a/src/gallium/drivers/nouveau/nouveau_vp3_video.h b/src/gallium/drivers/nouveau/nouveau_vp3_video.h index 279a1ce18ef..33e3bef3df3 100644 --- a/src/gallium/drivers/nouveau/nouveau_vp3_video.h +++ b/src/gallium/drivers/nouveau/nouveau_vp3_video.h @@ -135,22 +135,22 @@ struct comm { uint32_t parse_endpos[0x10]; // 1c0 }; -static INLINE uint32_t nouveau_vp3_video_align(uint32_t h) +static inline uint32_t nouveau_vp3_video_align(uint32_t h) { return ((h+0x3f)&~0x3f); }; -static INLINE uint32_t mb(uint32_t coord) +static inline uint32_t mb(uint32_t coord) { return (coord + 0xf)>>4; } -static INLINE uint32_t mb_half(uint32_t coord) +static inline uint32_t mb_half(uint32_t coord) { return (coord + 0x1f)>>5; } -static INLINE uint64_t +static inline uint64_t nouveau_vp3_video_addr(struct nouveau_vp3_decoder *dec, struct nouveau_vp3_video_buffer *target) { uint64_t ret; @@ -161,7 +161,7 @@ nouveau_vp3_video_addr(struct nouveau_vp3_decoder *dec, struct nouveau_vp3_video return dec->ref_bo->offset + ret; } -static INLINE void +static inline void nouveau_vp3_ycbcr_offsets(struct nouveau_vp3_decoder *dec, uint32_t *y2, uint32_t *cbcr, uint32_t *cbcr2) { @@ -182,7 +182,7 @@ nouveau_vp3_ycbcr_offsets(struct nouveau_vp3_decoder *dec, uint32_t *y2, } } -static INLINE void +static inline void nouveau_vp3_inter_sizes(struct nouveau_vp3_decoder *dec, uint32_t slice_count, uint32_t *slice_size, uint32_t *bucket_size, uint32_t *ring_size) diff --git a/src/gallium/drivers/nouveau/nouveau_winsys.h b/src/gallium/drivers/nouveau/nouveau_winsys.h index 51effb1d8d2..389a229eb78 100644 --- a/src/gallium/drivers/nouveau/nouveau_winsys.h +++ b/src/gallium/drivers/nouveau/nouveau_winsys.h @@ -15,34 +15,34 @@ #define NOUVEAU_MIN_BUFFER_MAP_ALIGN 64 #define NOUVEAU_MIN_BUFFER_MAP_ALIGN_MASK (NOUVEAU_MIN_BUFFER_MAP_ALIGN - 1) -static INLINE uint32_t +static inline uint32_t PUSH_AVAIL(struct nouveau_pushbuf *push) { return push->end - push->cur; } -static INLINE boolean +static inline bool PUSH_SPACE(struct nouveau_pushbuf *push, uint32_t size) { if (PUSH_AVAIL(push) < size) return nouveau_pushbuf_space(push, size, 0, 0) == 0; - return TRUE; + return 
true; } -static INLINE void +static inline void PUSH_DATA(struct nouveau_pushbuf *push, uint32_t data) { *push->cur++ = data; } -static INLINE void +static inline void PUSH_DATAp(struct nouveau_pushbuf *push, const void *data, uint32_t size) { memcpy(push->cur, data, size * 4); push->cur += size; } -static INLINE void +static inline void PUSH_DATAf(struct nouveau_pushbuf *push, float f) { union { float f; uint32_t i; } u; @@ -50,7 +50,7 @@ PUSH_DATAf(struct nouveau_pushbuf *push, float f) PUSH_DATA(push, u.i); } -static INLINE void +static inline void PUSH_KICK(struct nouveau_pushbuf *push) { nouveau_pushbuf_kick(push, push->channel); @@ -60,7 +60,7 @@ PUSH_KICK(struct nouveau_pushbuf *push) #define NOUVEAU_RESOURCE_FLAG_LINEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 0) #define NOUVEAU_RESOURCE_FLAG_DRV_PRIV (PIPE_RESOURCE_FLAG_DRV_PRIV << 1) -static INLINE uint32_t +static inline uint32_t nouveau_screen_transfer_flags(unsigned pipe) { uint32_t flags = 0; diff --git a/src/gallium/drivers/nouveau/nv30/nv30-40_3d.xml.h b/src/gallium/drivers/nouveau/nv30/nv30-40_3d.xml.h index 447f4b3b7ae..95468e580dd 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30-40_3d.xml.h +++ b/src/gallium/drivers/nouveau/nv30/nv30-40_3d.xml.h @@ -1459,6 +1459,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV40_3D_VTX_CACHE_INVALIDATE 0x00001714 +#define NV40_3D_VB_ELEMENT_BASE 0x0000173c + #define NV30_3D_VTXFMT(i0) (0x00001740 + 0x4*(i0)) #define NV30_3D_VTXFMT__ESIZE 0x00000004 #define NV30_3D_VTXFMT__LEN 0x00000010 diff --git a/src/gallium/drivers/nouveau/nv30/nv30_clear.c b/src/gallium/drivers/nouveau/nv30/nv30_clear.c index 83fd1fa38dd..118cac77277 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_clear.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_clear.c @@ -32,7 +32,7 @@ #include "nv30/nv30_context.h" #include "nv30/nv30_format.h" -static INLINE uint32_t +static inline uint32_t pack_rgba(enum pipe_format format, const float *rgba) { union util_color uc; @@ -40,7 +40,7 @@ pack_rgba(enum pipe_format format, const float *rgba) return uc.ui[0]; } -static INLINE uint32_t +static inline uint32_t pack_zeta(enum pipe_format format, double depth, unsigned stencil) { uint32_t zuint = (uint32_t)(depth * 4294967295.0); @@ -58,7 +58,7 @@ nv30_clear(struct pipe_context *pipe, unsigned buffers, struct pipe_framebuffer_state *fb = &nv30->framebuffer; uint32_t colr = 0, zeta = 0, mode = 0; - if (!nv30_state_validate(nv30, NV30_NEW_FRAMEBUFFER | NV30_NEW_SCISSOR, TRUE)) + if (!nv30_state_validate(nv30, NV30_NEW_FRAMEBUFFER | NV30_NEW_SCISSOR, true)) return; if (buffers & PIPE_CLEAR_COLOR && fb->nr_cbufs) { diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.c b/src/gallium/drivers/nouveau/nv30/nv30_context.c index 617b0887810..6e88ed725d6 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_context.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_context.c @@ -45,7 +45,7 @@ nv30_context_kick_notify(struct nouveau_pushbuf *push) screen = &nv30->screen->base; nouveau_fence_next(screen); - nouveau_fence_update(screen, TRUE); + nouveau_fence_update(screen, true); if (push->bufctx) { struct nouveau_bufref *bref; @@ -165,6 +165,12 @@ nv30_context_destroy(struct pipe_context *pipe) if (nv30->draw) draw_destroy(nv30->draw); + if (nv30->blit_vp) + nouveau_heap_free(&nv30->blit_vp); + + if (nv30->blit_fp) + pipe_resource_reference(&nv30->blit_fp, NULL); + if (nv30->screen->base.pushbuf->user_priv == &nv30->bufctx) nv30->screen->base.pushbuf->user_priv = NULL; @@ -233,7 +239,7 @@ nv30_context_create(struct 
pipe_screen *pscreen, void *priv) nv30->config.aniso = NV40_3D_TEX_WRAP_ANISO_MIP_FILTER_OPTIMIZATION_OFF; - if (debug_get_bool_option("NV30_SWTNL", FALSE)) + if (debug_get_bool_option("NV30_SWTNL", false)) nv30->draw_flags |= NV30_NEW_SWTNL; nv30->sample_mask = 0xffff; diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.h b/src/gallium/drivers/nouveau/nv30/nv30_context.h index 592cdbe24f9..d5c18bb62dc 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_context.h +++ b/src/gallium/drivers/nouveau/nv30/nv30_context.h @@ -51,7 +51,8 @@ struct nv30_context { unsigned rt_enable; unsigned scissor_off; unsigned num_vtxelts; - boolean prim_restart; + int index_bias; + bool prim_restart; struct nv30_fragprog *fragprog; } state; @@ -114,17 +115,17 @@ struct nv30_context { uint32_t vbo_user; unsigned vbo_min_index; unsigned vbo_max_index; - boolean vbo_push_hint; + bool vbo_push_hint; struct nouveau_heap *blit_vp; struct pipe_resource *blit_fp; struct pipe_query *render_cond_query; unsigned render_cond_mode; - boolean render_cond_cond; + bool render_cond_cond; }; -static INLINE struct nv30_context * +static inline struct nv30_context * nv30_context(struct pipe_context *pipe) { return (struct nv30_context *)pipe; @@ -203,8 +204,8 @@ nv30_draw_init(struct pipe_context *pipe); void nv30_render_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info); -boolean -nv30_state_validate(struct nv30_context *nv30, uint32_t mask, boolean hwtnl); +bool +nv30_state_validate(struct nv30_context *nv30, uint32_t mask, bool hwtnl); void nv30_state_release(struct nv30_context *nv30); @@ -213,7 +214,7 @@ nv30_state_release(struct nv30_context *nv30); #define NV30_PRIM_GL_CASE(n) \ case PIPE_PRIM_##n: return NV30_3D_VERTEX_BEGIN_END_##n -static INLINE unsigned +static inline unsigned nv30_prim_gl(unsigned prim) { switch (prim) { diff --git a/src/gallium/drivers/nouveau/nv30/nv30_draw.c b/src/gallium/drivers/nouveau/nv30/nv30_draw.c index c1665b7ad2f..098d6e499fa 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_draw.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_draw.c @@ -52,7 +52,7 @@ struct nv30_render { uint32_t prim; }; -static INLINE struct nv30_render * +static inline struct nv30_render * nv30_render(struct vbuf_render *render) { return (struct nv30_render *)render; @@ -79,12 +79,12 @@ nv30_render_allocate_vertices(struct vbuf_render *render, PIPE_BIND_VERTEX_BUFFER, PIPE_USAGE_STREAM, render->max_vertex_buffer_bytes); if (!r->buffer) - return FALSE; + return false; r->offset = 0; } - return TRUE; + return true; } static void * @@ -134,7 +134,7 @@ nv30_render_draw_elements(struct vbuf_render *render, NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, NV30_3D_VTXBUF_DMA1); } - if (!nv30_state_validate(nv30, ~0, FALSE)) + if (!nv30_state_validate(nv30, ~0, false)) return; BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1); @@ -179,7 +179,7 @@ nv30_render_draw_arrays(struct vbuf_render *render, unsigned start, uint nr) NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, NV30_3D_VTXBUF_DMA1); } - if (!nv30_state_validate(nv30, ~0, FALSE)) + if (!nv30_state_validate(nv30, ~0, false)) return; BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1); @@ -221,7 +221,7 @@ static const struct { [TGSI_SEMANTIC_TEXCOORD] = { EMIT_4F, INTERP_PERSPECTIVE, 8, 7, 0x00004000 }, }; -static boolean +static bool vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx) { struct nv30_screen *screen = r->nv30->screen; @@ -245,7 +245,7 @@ vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx) } if (emit == EMIT_OMIT) - return FALSE; + return 
false; draw_emit_vertex_attr(vinfo, emit, vroute[sem].interp, attrib); format = draw_translate_vinfo_format(emit); @@ -272,10 +272,10 @@ vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx) assert(sem == TGSI_SEMANTIC_TEXCOORD); *idx = 0x00001000 << (result - 8); } - return TRUE; + return true; } -static boolean +static bool nv30_render_validate(struct nv30_context *nv30) { struct nv30_render *r = nv30_render(nv30->draw->render); @@ -300,7 +300,7 @@ nv30_render_validate(struct nv30_context *nv30) } if (nouveau_heap_alloc(heap, 16, &r->vertprog, &r->vertprog)) - return FALSE; + return false; } } @@ -370,7 +370,7 @@ nv30_render_validate(struct nv30_context *nv30) } vinfo->size /= 4; - return TRUE; + return true; } void @@ -519,6 +519,6 @@ nv30_draw_init(struct pipe_context *pipe) draw_set_rasterize_stage(draw, stage); draw_wide_line_threshold(draw, 10000000.f); draw_wide_point_threshold(draw, 10000000.f); - draw_wide_point_sprites(draw, TRUE); + draw_wide_point_sprites(draw, true); nv30->draw = draw; } diff --git a/src/gallium/drivers/nouveau/nv30/nv30_format.h b/src/gallium/drivers/nouveau/nv30/nv30_format.h index 8bf4a37299f..fa1e922fb65 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_format.h +++ b/src/gallium/drivers/nouveau/nv30/nv30_format.h @@ -27,28 +27,28 @@ struct nv30_texfmt { }; extern const struct nv30_format_info nv30_format_info_table[]; -static INLINE const struct nv30_format_info * +static inline const struct nv30_format_info * nv30_format_info(struct pipe_screen *pscreen, enum pipe_format format) { return &nv30_format_info_table[format]; } extern const struct nv30_format nv30_format_table[]; -static INLINE const struct nv30_format * +static inline const struct nv30_format * nv30_format(struct pipe_screen *pscreen, enum pipe_format format) { return &nv30_format_table[format]; } extern const struct nv30_vtxfmt nv30_vtxfmt_table[]; -static INLINE const struct nv30_vtxfmt * +static inline const struct nv30_vtxfmt * nv30_vtxfmt(struct pipe_screen *pscreen, enum pipe_format format) { return &nv30_vtxfmt_table[format]; } extern const struct nv30_texfmt nv30_texfmt_table[]; -static INLINE const struct nv30_texfmt * +static inline const struct nv30_texfmt * nv30_texfmt(struct pipe_screen *pscreen, enum pipe_format format) { return &nv30_texfmt_table[format]; diff --git a/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c index 7f227868f73..6de61bcc1c0 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c @@ -37,22 +37,26 @@ nv30_fragprog_upload(struct nv30_context *nv30) struct nouveau_context *nv = &nv30->base; struct nv30_fragprog *fp = nv30->fragprog.program; struct pipe_context *pipe = &nv30->base.pipe; - struct pipe_transfer *transfer; - uint32_t *map; - int i; (void)i; - if (unlikely(!fp->buffer)) { + if (unlikely(!fp->buffer)) fp->buffer = pipe_buffer_create(pipe->screen, 0, 0, fp->insn_len * 4); - } - map = pipe_buffer_map(pipe, fp->buffer, PIPE_TRANSFER_WRITE, &transfer); #ifndef PIPE_ARCH_BIG_ENDIAN - memcpy(map, fp->insn, fp->insn_len * 4); + pipe_buffer_write(pipe, fp->buffer, 0, fp->insn_len * 4, fp->insn); #else - for (i = 0; i < fp->insn_len; i++) - *map++ = (fp->insn[i] >> 16) | (fp->insn[i] << 16); + { + struct pipe_transfer *transfer; + uint32_t *map; + int i; + + map = pipe_buffer_map(pipe, fp->buffer, + PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE, + &transfer); + for (i = 0; i < fp->insn_len; i++) + *map++ = (fp->insn[i] 
>> 16) | (fp->insn[i] << 16); + pipe_buffer_unmap(pipe, transfer); + } #endif - pipe_buffer_unmap(pipe, transfer); if (nv04_resource(fp->buffer)->domain != NOUVEAU_BO_VRAM) nouveau_buffer_migrate(nv, nv04_resource(fp->buffer), NOUVEAU_BO_VRAM); @@ -64,7 +68,7 @@ nv30_fragprog_validate(struct nv30_context *nv30) struct nouveau_pushbuf *push = nv30->base.pushbuf; struct nouveau_object *eng3d = nv30->screen->eng3d; struct nv30_fragprog *fp = nv30->fragprog.program; - boolean upload = FALSE; + bool upload = false; int i; if (!fp->translated) { @@ -72,7 +76,7 @@ if (!fp->translated) return; - upload = TRUE; + upload = true; } /* update constants, also needs to be done on every fp switch as we @@ -89,7 +93,7 @@ if (!memcmp(&fp->insn[off], &cbuf[idx], 4 * 4)) continue; memcpy(&fp->insn[off], &cbuf[idx], 4 * 4); - upload = TRUE; + upload = true; } } @@ -161,8 +165,15 @@ static void nv30_fp_state_bind(struct pipe_context *pipe, void *hwcso) { struct nv30_context *nv30 = nv30_context(pipe); + struct nv30_fragprog *fp = hwcso; + + /* reset the bufctx so that we don't keep a dangling reference to the fp + * code + */ + if (fp != nv30->state.fragprog) + PUSH_RESET(nv30->base.pushbuf, BUFCTX_FRAGPROG); - nv30->fragprog.program = hwcso; + nv30->fragprog.program = fp; nv30->dirty |= NV30_NEW_FRAGPROG; } diff --git a/src/gallium/drivers/nouveau/nv30/nv30_miptree.c b/src/gallium/drivers/nouveau/nv30/nv30_miptree.c index 1a4b8929c0f..c75b4b95fd8 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_miptree.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_miptree.c @@ -33,7 +33,7 @@ #include "nv30/nv30_resource.h" #include "nv30/nv30_transfer.h" -static INLINE unsigned +static inline unsigned layer_offset(struct pipe_resource *pt, unsigned level, unsigned layer) { struct nv30_miptree *mt = nv30_miptree(pt); @@ -54,7 +54,7 @@ nv30_miptree_get_handle(struct pipe_screen *pscreen, unsigned stride; if (!mt || !mt->base.bo) - return FALSE; + return false; stride = mt->level[0].pitch; @@ -78,13 +78,13 @@ struct nv30_transfer { unsigned nblocksy; }; -static INLINE struct nv30_transfer * +static inline struct nv30_transfer * nv30_transfer(struct pipe_transfer *ptx) { return (struct nv30_transfer *)ptx; } -static INLINE void +static inline void define_rect(struct pipe_resource *pt, unsigned level, unsigned z, unsigned x, unsigned y, unsigned w, unsigned h, struct nv30_rect *rect) @@ -242,8 +242,8 @@ nv30_miptree_transfer_map(struct pipe_context *pipe, struct pipe_resource *pt, tx->base.level = level; tx->base.usage = usage; tx->base.box = *box; - tx->base.stride = util_format_get_nblocksx(pt->format, box->width) * - util_format_get_blocksize(pt->format); + tx->base.stride = align(util_format_get_nblocksx(pt->format, box->width) * + util_format_get_blocksize(pt->format), 64); tx->base.layer_stride = util_format_get_nblocksy(pt->format, box->height) * tx->base.stride; @@ -372,7 +372,7 @@ nv30_miptree_create(struct pipe_screen *pscreen, } if (!mt->uniform_pitch) - mt->swizzled = TRUE; + mt->swizzled = true; size = 0; for (l = 0; l <= pt->last_level; l++) { diff --git a/src/gallium/drivers/nouveau/nv30/nv30_push.c b/src/gallium/drivers/nouveau/nv30/nv30_push.c index e0734fa70d3..67ab0508c17 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_push.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_push.c @@ -47,12 +47,12 @@ struct push_context { struct translate *translate; - boolean primitive_restart; + bool primitive_restart; uint32_t 
prim; uint32_t restart_index; }; -static INLINE unsigned +static inline unsigned prim_restart_search_i08(uint8_t *elts, unsigned push, uint8_t index) { unsigned i; @@ -62,7 +62,7 @@ prim_restart_search_i08(uint8_t *elts, unsigned push, uint8_t index) return i; } -static INLINE unsigned +static inline unsigned prim_restart_search_i16(uint16_t *elts, unsigned push, uint16_t index) { unsigned i; @@ -72,7 +72,7 @@ prim_restart_search_i16(uint16_t *elts, unsigned push, uint16_t index) return i; } -static INLINE unsigned +static inline unsigned prim_restart_search_i32(uint32_t *elts, unsigned push, uint32_t index) { unsigned i; @@ -199,7 +199,7 @@ nv30_push_vbo(struct nv30_context *nv30, const struct pipe_draw_info *info) { struct push_context ctx; unsigned i, index_size; - boolean apply_bias = info->indexed && info->index_bias; + bool apply_bias = info->indexed && info->index_bias; ctx.push = nv30->base.pushbuf; ctx.translate = nv30->vertex->translate; @@ -241,7 +241,7 @@ nv30_push_vbo(struct nv30_context *nv30, const struct pipe_draw_info *info) } else { ctx.idxbuf = NULL; index_size = 0; - ctx.primitive_restart = FALSE; + ctx.primitive_restart = false; ctx.restart_index = 0; } diff --git a/src/gallium/drivers/nouveau/nv30/nv30_query.c b/src/gallium/drivers/nouveau/nv30/nv30_query.c index 516ee83168e..3980be9579a 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_query.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_query.c @@ -98,7 +98,7 @@ struct nv30_query { uint64_t result; }; -static INLINE struct nv30_query * +static inline struct nv30_query * nv30_query(struct pipe_query *pipe) { return (struct nv30_query *)pipe; @@ -208,7 +208,7 @@ nv30_query_result(struct pipe_context *pipe, struct pipe_query *pq, if (ntfy1) { while (ntfy1[3] & 0xff000000) { if (!wait) - return FALSE; + return false; } switch (q->type) { @@ -228,7 +228,7 @@ nv30_query_result(struct pipe_context *pipe, struct pipe_query *pq, } *res64 = q->result; - return TRUE; + return true; } static void diff --git a/src/gallium/drivers/nouveau/nv30/nv30_resource.c b/src/gallium/drivers/nouveau/nv30/nv30_resource.c index 38fac8af898..a98a6464de8 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_resource.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_resource.c @@ -42,12 +42,12 @@ nv30_memory_barrier(struct pipe_context *pipe, unsigned flags) if (!nv30->vtxbuf[i].buffer) continue; if (nv30->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) - nv30->base.vbo_dirty = TRUE; + nv30->base.vbo_dirty = true; } if (nv30->idxbuf.buffer && nv30->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) - nv30->base.vbo_dirty = TRUE; + nv30->base.vbo_dirty = true; } } diff --git a/src/gallium/drivers/nouveau/nv30/nv30_resource.h b/src/gallium/drivers/nouveau/nv30/nv30_resource.h index 1981c8d9ab9..8dac7795c9d 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_resource.h +++ b/src/gallium/drivers/nouveau/nv30/nv30_resource.h @@ -15,7 +15,7 @@ struct nv30_surface { uint16_t depth; }; -static INLINE struct nv30_surface * +static inline struct nv30_surface * nv30_surface(struct pipe_surface *ps) { return (struct nv30_surface *)ps; @@ -32,13 +32,13 @@ struct nv30_miptree { struct nv30_miptree_level level[13]; uint32_t uniform_pitch; uint32_t layer_size; - boolean swizzled; + bool swizzled; unsigned ms_mode; unsigned ms_x:1; unsigned ms_y:1; }; -static INLINE struct nv30_miptree * +static inline struct nv30_miptree * nv30_miptree(struct pipe_resource *pt) { return (struct nv30_miptree *)pt; diff --git 
a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c index 2e38a1978ae..7aad26ba18b 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c @@ -69,6 +69,8 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return PIPE_ENDIAN_LITTLE; case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: return 16; + case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: + return NOUVEAU_MIN_BUFFER_MAP_ALIGN; case PIPE_CAP_MAX_VIEWPORTS: return 1; case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: @@ -96,6 +98,9 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: return 1; + /* nv35 capabilities */ + case PIPE_CAP_DEPTH_BOUNDS_TEST: + return eng3d->oclass == NV35_3D_CLASS || eng3d->oclass >= NV40_3D_CLASS; /* nv4x capabilities */ case PIPE_CAP_BLEND_EQUATION_SEPARATE: case PIPE_CAP_NPOT_TEXTURES: @@ -135,7 +140,6 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: case PIPE_CAP_START_INSTANCE: case PIPE_CAP_TEXTURE_MULTISAMPLE: - case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_QUERY_PIPELINE_STATISTICS: @@ -162,6 +166,9 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: return 0; case PIPE_CAP_VENDOR_ID: @@ -313,12 +320,12 @@ nv30_screen_is_format_supported(struct pipe_screen *pscreen, unsigned bindings) { if (sample_count > 4) - return FALSE; + return false; if (!(0x00000017 & (1 << sample_count))) - return FALSE; + return false; if (!util_format_is_supported(format, bindings)) { - return FALSE; + return false; } /* transfers & shared are always supported */ @@ -656,6 +663,6 @@ nv30_screen_create(struct nouveau_device *dev) nouveau_pushbuf_kick(push, push->channel); - nouveau_fence_new(&screen->base, &screen->base.fence.current, FALSE); + nouveau_fence_new(&screen->base, &screen->base.fence.current, false); return pscreen; } diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.h b/src/gallium/drivers/nouveau/nv30/nv30_screen.h index 3f2e47fec99..7b17b88097c 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_screen.h +++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.h @@ -40,7 +40,7 @@ struct nv30_screen { struct nouveau_heap *vp_data_heap; }; -static INLINE struct nv30_screen * +static inline struct nv30_screen * nv30_screen(struct pipe_screen *pscreen) { return (struct nv30_screen *)pscreen; diff --git a/src/gallium/drivers/nouveau/nv30/nv30_state.c b/src/gallium/drivers/nouveau/nv30/nv30_state.c index 708ba34c1e5..fd604c2266d 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_state.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_state.c @@ -211,6 +211,7 @@ static void * nv30_zsa_state_create(struct pipe_context *pipe, const struct pipe_depth_stencil_alpha_state *cso) { + struct nouveau_object *eng3d = nv30_context(pipe)->screen->eng3d; struct nv30_zsa_stateobj *so; so = CALLOC_STRUCT(nv30_zsa_stateobj); @@ -223,6 +224,13 @@ nv30_zsa_state_create(struct pipe_context *pipe, SB_DATA (so, cso->depth.writemask); SB_DATA (so, cso->depth.enabled); + if (eng3d->oclass == 
NV35_3D_CLASS || eng3d->oclass >= NV40_3D_CLASS) { + SB_MTHD35(so, DEPTH_BOUNDS_TEST_ENABLE, 3); + SB_DATA (so, cso->depth.bounds_test); + SB_DATA (so, fui(cso->depth.bounds_min)); + SB_DATA (so, fui(cso->depth.bounds_max)); + } + if (cso->stencil[0].enabled) { SB_MTHD30(so, STENCIL_ENABLE(0), 3); SB_DATA (so, 1); diff --git a/src/gallium/drivers/nouveau/nv30/nv30_state.h b/src/gallium/drivers/nouveau/nv30/nv30_state.h index e27e16fae82..ed3b8103a00 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_state.h +++ b/src/gallium/drivers/nouveau/nv30/nv30_state.h @@ -13,6 +13,8 @@ #define SB_DATA(so, u) (so)->data[(so)->size++] = (u) #define SB_MTHD30(so, mthd, size) \ SB_DATA((so), ((size) << 18) | (7 << 13) | NV30_3D_##mthd) +#define SB_MTHD35(so, mthd, size) \ + SB_DATA((so), ((size) << 18) | (7 << 13) | NV35_3D_##mthd) #define SB_MTHD40(so, mthd, size) \ SB_DATA((so), ((size) << 18) | (7 << 13) | NV40_3D_##mthd) @@ -30,7 +32,7 @@ struct nv30_rasterizer_stateobj { struct nv30_zsa_stateobj { struct pipe_depth_stencil_alpha_state pipe; - unsigned data[32]; + unsigned data[36]; unsigned size; }; @@ -80,7 +82,7 @@ struct nv30_vertprog { struct tgsi_shader_info info; struct draw_vertex_shader *draw; - boolean translated; + bool translated; unsigned enabled_ucps; uint16_t texcoord[10]; @@ -109,7 +111,7 @@ struct nv30_fragprog { struct tgsi_shader_info info; struct draw_fragment_shader *draw; - boolean translated; + bool translated; uint32_t *insn; unsigned insn_len; diff --git a/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c b/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c index a954dcce562..8957634f0fa 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c @@ -453,8 +453,8 @@ nv30_state_context_switch(struct nv30_context *nv30) nv30->base.pushbuf->user_priv = &nv30->bufctx; } -boolean -nv30_state_validate(struct nv30_context *nv30, uint32_t mask, boolean hwtnl) +bool +nv30_state_validate(struct nv30_context *nv30, uint32_t mask, bool hwtnl) { struct nouveau_screen *screen = &nv30->screen->base; struct nouveau_pushbuf *push = nv30->base.pushbuf; @@ -494,7 +494,7 @@ nv30_state_validate(struct nv30_context *nv30, uint32_t mask, boolean hwtnl) nouveau_pushbuf_bufctx(push, bctx); if (nouveau_pushbuf_validate(push)) { nouveau_pushbuf_bufctx(push, NULL); - return FALSE; + return false; } /*XXX*/ @@ -528,7 +528,7 @@ nv30_state_validate(struct nv30_context *nv30, uint32_t mask, boolean hwtnl) } } - return TRUE; + return true; } void diff --git a/src/gallium/drivers/nouveau/nv30/nv30_texture.c b/src/gallium/drivers/nouveau/nv30/nv30_texture.c index c3567217442..bfe21cceaa2 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_texture.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_texture.c @@ -37,7 +37,7 @@ #define NV40_WRAP(n) \ case PIPE_TEX_WRAP_##n: ret = NV40_3D_TEX_WRAP_S_##n; break -static INLINE unsigned +static inline unsigned wrap_mode(unsigned pipe) { unsigned ret = NV30_3D_TEX_WRAP_S_REPEAT; @@ -58,7 +58,7 @@ wrap_mode(unsigned pipe) return ret >> NV30_3D_TEX_WRAP_S__SHIFT; } -static INLINE unsigned +static inline unsigned filter_mode(const struct pipe_sampler_state *cso) { unsigned filter; @@ -104,7 +104,7 @@ filter_mode(const struct pipe_sampler_state *cso) return filter; } -static INLINE unsigned +static inline unsigned compare_mode(const struct pipe_sampler_state *cso) { if (cso->compare_mode != PIPE_TEX_COMPARE_R_TO_TEXTURE) @@ -201,7 +201,7 @@ nv30_bind_sampler_states(struct pipe_context *pipe, } } 
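The SB_MTHD30/35/40 macros above all pack a FIFO method header the same way the driver does elsewhere: (size << 18) | (subc << 13) | mthd, with subchannel 7 hardcoded for the 3D engine. A small worked example; the method offset 0x0a00 is made up purely for illustration.

#include <assert.h>
#include <stdint.h>

/* Same bit layout as sb_header() and NV04_FIFO_PKHDR() in this patch:
 * payload size in bits 18+, subchannel in bits 13..17, method in the low bits. */
uint32_t
toy_pkhdr(unsigned subc, unsigned mthd, unsigned size)
{
   return (size << 18) | (subc << 13) | mthd;
}

int main(void)
{
   /* subchannel 7, hypothetical method 0x0a00, payload of 3 dwords */
   uint32_t hdr = toy_pkhdr(7, 0x0a00, 3);
   assert(hdr == ((3u << 18) | (7u << 13) | 0x0a00u));
   return 0;
}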
-static INLINE uint32_t +static inline uint32_t swizzle(const struct nv30_texfmt *fmt, unsigned cmp, unsigned swz) { uint32_t data = fmt->swz[swz].src << 8; diff --git a/src/gallium/drivers/nouveau/nv30/nv30_transfer.c b/src/gallium/drivers/nouveau/nv30/nv30_transfer.c index 99bc0994ac2..214da6568c3 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_transfer.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_transfer.c @@ -41,33 +41,33 @@ * of different ways. */ -static INLINE boolean +static inline bool nv30_transfer_scaled(struct nv30_rect *src, struct nv30_rect *dst) { if (src->x1 - src->x0 != dst->x1 - dst->x0) - return TRUE; + return true; if (src->y1 - src->y0 != dst->y1 - dst->y0) - return TRUE; - return FALSE; + return true; + return false; } -static INLINE boolean +static inline bool nv30_transfer_blit(XFER_ARGS) { if (nv30->screen->eng3d->oclass < NV40_3D_CLASS) - return FALSE; + return false; if (dst->offset & 63 || dst->pitch & 63 || dst->d > 1) - return FALSE; + return false; if (dst->w < 2 || dst->h < 2) - return FALSE; + return false; if (dst->cpp > 4 || (dst->cpp == 1 && !dst->pitch)) - return FALSE; + return false; if (src->cpp > 4) - return FALSE; - return TRUE; + return false; + return true; } -static INLINE struct nouveau_heap * +static inline struct nouveau_heap * nv30_transfer_rect_vertprog(struct nv30_context *nv30) { struct nouveau_heap *heap = nv30->screen->vp_exec_heap; @@ -108,7 +108,7 @@ nv30_transfer_rect_vertprog(struct nv30_context *nv30) } -static INLINE struct nv04_resource * +static inline struct nv04_resource * nv30_transfer_rect_fragprog(struct nv30_context *nv30) { struct nv04_resource *fp = nv04_resource(nv30->blit_fp); @@ -368,29 +368,29 @@ nv30_transfer_rect_blit(XFER_ARGS) PUSH_DATA (push, NV30_3D_VERTEX_BEGIN_END_STOP); } -static boolean +static bool nv30_transfer_sifm(XFER_ARGS) { if (!src->pitch || (src->w | src->h) > 1024 || src->w < 2 || src->h < 2) - return FALSE; + return false; if (src->d > 1 || dst->d > 1) - return FALSE; + return false; if (dst->offset & 63) - return FALSE; + return false; if (!dst->pitch) { if ((dst->w | dst->h) > 2048 || dst->w < 2 || dst->h < 2) - return FALSE; + return false; } else { if (dst->domain != NOUVEAU_BO_VRAM) - return FALSE; + return false; if (dst->pitch & 63) - return FALSE; + return false; } - return TRUE; + return true; } static void @@ -481,14 +481,14 @@ nv30_transfer_rect_sifm(XFER_ARGS) * that name is still accurate on nv4x) error. 
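nv30_transfer_rect() further down selects among these paths through a table of { name, possible(), execute() } entries, trying each predicate in order and running the first method whose constraints match. A condensed sketch of that dispatch pattern, with simplified stand-in types and a single CPU path; none of the toy_* names exist in the driver.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_rect { unsigned pitch, w, h; };

typedef bool (*toy_possible_fn)(const struct toy_rect *, const struct toy_rect *);
typedef void (*toy_execute_fn)(const struct toy_rect *, const struct toy_rect *);

static bool
toy_cpu_possible(const struct toy_rect *src, const struct toy_rect *dst)
{
   return src->w == dst->w && src->h == dst->h; /* a plain copy cannot scale */
}

static void
toy_cpu_execute(const struct toy_rect *src, const struct toy_rect *dst)
{
   (void)src; (void)dst;
   puts("copied on the CPU");
}

static const struct {
   const char *name;
   toy_possible_fn possible;
   toy_execute_fn execute;
} toy_methods[] = {
   { "cpu", toy_cpu_possible, toy_cpu_execute },
};

int main(void)
{
   struct toy_rect src = { 256, 16, 16 }, dst = { 256, 16, 16 };
   for (size_t i = 0; i < sizeof(toy_methods) / sizeof(toy_methods[0]); i++) {
      if (toy_methods[i].possible(&src, &dst)) {
         toy_methods[i].execute(&src, &dst);
         break;
      }
   }
   return 0;
}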
*/ -static boolean +static bool nv30_transfer_m2mf(XFER_ARGS) { if (!src->pitch || !dst->pitch) - return FALSE; + return false; if (nv30_transfer_scaled(src, dst)) - return FALSE; - return TRUE; + return false; + return true; } static void @@ -540,12 +540,12 @@ nv30_transfer_rect_m2mf(XFER_ARGS) } } -static boolean +static bool nv30_transfer_cpu(XFER_ARGS) { if (nv30_transfer_scaled(src, dst)) - return FALSE; - return TRUE; + return false; + return true; } static char * @@ -554,7 +554,7 @@ linear_ptr(struct nv30_rect *rect, char *base, int x, int y, int z) return base + (y * rect->pitch) + (x * rect->cpp); } -static INLINE unsigned +static inline unsigned swizzle2d(unsigned v, unsigned s) { v = (v | (v << 8)) & 0x00ff00ff; @@ -614,7 +614,7 @@ swizzle3d_ptr(struct nv30_rect *rect, char *base, int x, int y, int z) typedef char *(*get_ptr_t)(struct nv30_rect *, char *, int, int, int); -static INLINE get_ptr_t +static inline get_ptr_t get_ptr(struct nv30_rect *rect) { if (rect->pitch) @@ -653,7 +653,7 @@ nv30_transfer_rect(struct nv30_context *nv30, enum nv30_transfer_filter filter, { static const struct { char *name; - boolean (*possible)(XFER_ARGS); + bool (*possible)(XFER_ARGS); void (*execute)(XFER_ARGS); } *method, methods[] = { { "m2mf", nv30_transfer_m2mf, nv30_transfer_rect_m2mf }, diff --git a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c index d4e384b21d2..8494549e9b1 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c @@ -79,7 +79,7 @@ nv30_emit_vtxattr(struct nv30_context *nv30, struct pipe_vertex_buffer *vb, } } -static INLINE void +static inline void nv30_vbuf_range(struct nv30_context *nv30, int vbi, uint32_t *base, uint32_t *size) { @@ -119,7 +119,7 @@ nv30_prevalidate_vbufs(struct nv30_context *nv30) } else { nouveau_buffer_migrate(&nv30->base, buf, NOUVEAU_BO_GART); } - nv30->base.vbo_dirty = TRUE; + nv30->base.vbo_dirty = true; } } } @@ -160,10 +160,10 @@ nv30_update_user_vbufs(struct nv30_context *nv30) NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, NV30_3D_VTXBUF_DMA1); } - nv30->base.vbo_dirty = TRUE; + nv30->base.vbo_dirty = true; } -static INLINE void +static inline void nv30_release_user_vbufs(struct nv30_context *nv30) { uint32_t vbo_user = nv30->vbo_user; @@ -202,6 +202,9 @@ nv30_vbo_validate(struct nv30_context *nv30) return; redefine = MAX2(vertex->num_elements, nv30->state.num_vtxelts); + if (redefine == 0) + return; + BEGIN_NV04(push, NV30_3D(VTXFMT(0)), redefine); for (i = 0; i < vertex->num_elements; i++) { @@ -221,7 +224,7 @@ nv30_vbo_validate(struct nv30_context *nv30) for (i = 0; i < vertex->num_elements; i++) { struct nv04_resource *res; unsigned offset; - boolean user; + bool user; ve = &vertex->pipe[i]; vb = &nv30->vtxbuf[ve->vertex_buffer_index]; @@ -254,14 +257,12 @@ nv30_vertex_state_create(struct pipe_context *pipe, unsigned num_elements, struct translate_key transkey; unsigned i; - assert(num_elements); - so = MALLOC(sizeof(*so) + sizeof(*so->element) * num_elements); if (!so) return NULL; memcpy(so->pipe, elements, sizeof(*elements) * num_elements); so->num_elements = num_elements; - so->need_conversion = FALSE; + so->need_conversion = false; transkey.nr_elements = 0; transkey.output_stride = 0; @@ -284,7 +285,7 @@ nv30_vertex_state_create(struct pipe_context *pipe, unsigned num_elements, return NULL; } so->element[i].state = nv30_vtxfmt(pipe->screen, fmt)->hw; - so->need_conversion = TRUE; + so->need_conversion = true; } if (1) { @@ -452,7 +453,7 @@ 
nv30_draw_elements_inline_u32_short(struct nouveau_pushbuf *push, } static void -nv30_draw_elements(struct nv30_context *nv30, boolean shorten, +nv30_draw_elements(struct nv30_context *nv30, bool shorten, unsigned mode, unsigned start, unsigned count, unsigned instance_count, int32_t index_bias) { @@ -461,13 +462,11 @@ nv30_draw_elements(struct nv30_context *nv30, boolean shorten, struct nouveau_object *eng3d = nv30->screen->eng3d; unsigned prim = nv30_prim_gl(mode); -#if 0 /*XXX*/ - if (index_bias != nv30->state.index_bias) { - BEGIN_NV04(push, NV30_3D(VB_ELEMENT_BASE), 1); + if (eng3d->oclass >= NV40_3D_CLASS && index_bias != nv30->state.index_bias) { + BEGIN_NV04(push, NV40_3D(VB_ELEMENT_BASE), 1); PUSH_DATA (push, index_bias); nv30->state.index_bias = index_bias; } -#endif if (eng3d->oclass == NV40_3D_CLASS && index_size > 1 && nv30->idxbuf.buffer) { @@ -564,7 +563,7 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) if (nv30->vbo_user && !(nv30->dirty & (NV30_NEW_VERTEX | NV30_NEW_ARRAYS))) nv30_update_user_vbufs(nv30); - nv30_state_validate(nv30, ~0, TRUE); + nv30_state_validate(nv30, ~0, true); if (nv30->draw_flags) { nv30_render_vbo(pipe, info); return; @@ -578,17 +577,17 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) if (!nv30->vtxbuf[i].buffer) continue; if (nv30->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) - nv30->base.vbo_dirty = TRUE; + nv30->base.vbo_dirty = true; } if (!nv30->base.vbo_dirty && nv30->idxbuf.buffer && nv30->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) - nv30->base.vbo_dirty = TRUE; + nv30->base.vbo_dirty = true; if (nv30->base.vbo_dirty) { BEGIN_NV04(push, NV30_3D(VTX_CACHE_INVALIDATE_1710), 1); PUSH_DATA (push, 0); - nv30->base.vbo_dirty = FALSE; + nv30->base.vbo_dirty = false; } if (!info->indexed) { @@ -596,7 +595,7 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) info->mode, info->start, info->count, info->instance_count); } else { - boolean shorten = info->max_index <= 65535; + bool shorten = info->max_index <= 65535; if (info->primitive_restart != nv30->state.prim_restart) { if (info->primitive_restart) { @@ -605,7 +604,7 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) PUSH_DATA (push, info->restart_index); if (info->restart_index > 65535) - shorten = FALSE; + shorten = false; } else { BEGIN_NV04(push, NV40_3D(PRIM_RESTART_ENABLE), 1); PUSH_DATA (push, 0); @@ -617,7 +616,7 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) PUSH_DATA (push, info->restart_index); if (info->restart_index > 65535) - shorten = FALSE; + shorten = false; } nv30_draw_elements(nv30, shorten, diff --git a/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c b/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c index 4d4145d10b5..ee0a6280d7a 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c @@ -48,7 +48,7 @@ nv30_vertprog_destroy(struct nv30_vertprog *vp) vp->consts = NULL; vp->nr_consts = 0; - vp->translated = FALSE; + vp->translated = false; } void @@ -58,8 +58,8 @@ nv30_vertprog_validate(struct nv30_context *nv30) struct nouveau_object *eng3d = nv30->screen->eng3d; struct nv30_vertprog *vp = nv30->vertprog.program; struct nv30_fragprog *fp = nv30->fragprog.program; - boolean upload_code = FALSE; - boolean upload_data = FALSE; + bool upload_code = false; + bool upload_data = false; unsigned i; if (nv30->dirty & NV30_NEW_FRAGPROG) { @@ -125,7 +125,7 
@@ nv30_vertprog_validate(struct nv30_context *nv30) } } - upload_code = TRUE; + upload_code = true; } if (vp->nr_consts && !vp->data) { @@ -166,8 +166,8 @@ nv30_vertprog_validate(struct nv30_context *nv30) } } - upload_code = TRUE; - upload_data = TRUE; + upload_code = true; + upload_data = true; } if (vp->nr_consts) { diff --git a/src/gallium/drivers/nouveau/nv30/nv30_winsys.h b/src/gallium/drivers/nouveau/nv30/nv30_winsys.h index 5cee5df60ce..2324b517c44 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_winsys.h +++ b/src/gallium/drivers/nouveau/nv30/nv30_winsys.h @@ -19,34 +19,34 @@ #define NV40_3D_PRIM_RESTART_ENABLE 0x1dac #define NV40_3D_PRIM_RESTART_INDEX 0x1db0 -static INLINE void +static inline void PUSH_RELOC(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t offset, uint32_t flags, uint32_t vor, uint32_t tor) { nouveau_pushbuf_reloc(push, bo, offset, flags, vor, tor); } -static INLINE struct nouveau_bufctx * +static inline struct nouveau_bufctx * bufctx(struct nouveau_pushbuf *push) { struct nouveau_bufctx **pctx = push->user_priv; return *pctx; } -static INLINE void +static inline void PUSH_RESET(struct nouveau_pushbuf *push, int bin) { nouveau_bufctx_reset(bufctx(push), bin); } -static INLINE void +static inline void PUSH_REFN(struct nouveau_pushbuf *push, int bin, struct nouveau_bo *bo, uint32_t access) { nouveau_bufctx_refn(bufctx(push), bin, bo, access); } -static INLINE void +static inline void PUSH_MTHDl(struct nouveau_pushbuf *push, int subc, int mthd, int bin, struct nouveau_bo *bo, uint32_t offset, uint32_t access) { @@ -55,7 +55,7 @@ PUSH_MTHDl(struct nouveau_pushbuf *push, int subc, int mthd, int bin, PUSH_DATA(push, bo->offset + offset); } -static INLINE void +static inline void PUSH_MTHDo(struct nouveau_pushbuf *push, int subc, int mthd, int bin, struct nouveau_bo *bo, uint32_t access, uint32_t vor, uint32_t tor) { @@ -67,7 +67,7 @@ PUSH_MTHDo(struct nouveau_pushbuf *push, int subc, int mthd, int bin, PUSH_DATA(push, tor); } -static INLINE void +static inline void PUSH_MTHDs(struct nouveau_pushbuf *push, int subc, int mthd, int bin, struct nouveau_bo *bo, uint32_t data, uint32_t access, uint32_t vor, uint32_t tor) @@ -80,7 +80,7 @@ PUSH_MTHDs(struct nouveau_pushbuf *push, int subc, int mthd, int bin, PUSH_DATA(push, data | tor); } -static INLINE struct nouveau_bufref * +static inline struct nouveau_bufref * PUSH_MTHD(struct nouveau_pushbuf *push, int subc, int mthd, int bin, struct nouveau_bo *bo, uint32_t data, uint32_t access, uint32_t vor, uint32_t tor) @@ -99,7 +99,7 @@ PUSH_MTHD(struct nouveau_pushbuf *push, int subc, int mthd, int bin, return bref; } -static INLINE void +static inline void PUSH_RESRC(struct nouveau_pushbuf *push, int subc, int mthd, int bin, struct nv04_resource *r, uint32_t data, uint32_t access, uint32_t vor, uint32_t tor) @@ -108,14 +108,14 @@ PUSH_RESRC(struct nouveau_pushbuf *push, int subc, int mthd, int bin, r->domain | access, vor, tor)->priv = r; } -static INLINE void +static inline void BEGIN_NV04(struct nouveau_pushbuf *push, int subc, int mthd, int size) { PUSH_SPACE(push, size + 1); PUSH_DATA (push, 0x00000000 | (size << 18) | (subc << 13) | mthd); } -static INLINE void +static inline void BEGIN_NI04(struct nouveau_pushbuf *push, int subc, int mthd, int size) { PUSH_SPACE(push, size + 1); diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c index 9ef16965f39..e68d23e5587 100644 --- a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c +++ 
b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c @@ -44,7 +44,7 @@ struct nvfx_fpc { struct util_dynarray label_relocs; }; -static INLINE struct nvfx_reg +static inline struct nvfx_reg temp(struct nvfx_fpc *fpc) { int idx = __builtin_ctzll(~fpc->r_temps); @@ -60,7 +60,7 @@ temp(struct nvfx_fpc *fpc) return nvfx_reg(NVFXSR_TEMP, idx); } -static INLINE void +static inline void release_temps(struct nvfx_fpc *fpc) { fpc->r_temps &= ~fpc->r_temps_discard; @@ -373,7 +373,7 @@ nv40_fp_brk(struct nvfx_fpc *fpc) hw[3] = 0; } -static INLINE struct nvfx_src +static inline struct nvfx_src tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc) { struct nvfx_src src; @@ -415,7 +415,7 @@ tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc) return src; } -static INLINE struct nvfx_reg +static inline struct nvfx_reg tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst) { switch (fdst->Register.File) { case TGSI_FILE_OUTPUT: @@ -430,7 +430,7 @@ tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst) { } } -static INLINE int +static inline int tgsi_mask(uint tgsi) { int mask = 0; @@ -442,7 +442,7 @@ tgsi_mask(uint tgsi) return mask; } -static boolean +static bool nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc, const struct tgsi_full_instruction *finst) { @@ -455,7 +455,7 @@ nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc, int i; if (finst->Instruction.Opcode == TGSI_OPCODE_END) - return TRUE; + return true; for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { const struct tgsi_full_src_register *fsrc; @@ -525,7 +525,7 @@ nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc, break; default: NOUVEAU_ERR("bad src file\n"); - return FALSE; + return false; } } @@ -868,12 +868,12 @@ nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc, default: NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); - return FALSE; + return false; } out: release_temps(fpc); - return TRUE; + return true; nv3x_cflow: { static int warned = 0; @@ -887,7 +887,7 @@ nv3x_cflow: goto out; } -static boolean +static bool nvfx_fragprog_parse_decl_input(struct nvfx_fpc *fpc, const struct tgsi_full_declaration *fdec) { @@ -917,17 +917,17 @@ nvfx_fragprog_parse_decl_input(struct nvfx_fpc *fpc, case TGSI_SEMANTIC_GENERIC: case TGSI_SEMANTIC_PCOORD: /* will be assigned to remaining TC slots later */ - return TRUE; + return true; default: assert(0); - return FALSE; + return false; } fpc->r_input[idx] = nvfx_reg(NVFXSR_INPUT, hw); - return TRUE; + return true; } -static boolean +static bool nvfx_fragprog_assign_generic(struct nvfx_fpc *fpc, const struct tgsi_full_declaration *fdec) { @@ -954,16 +954,16 @@ nvfx_fragprog_assign_generic(struct nvfx_fpc *fpc, } hw = NVFX_FP_OP_INPUT_SRC_TC(hw); fpc->r_input[idx] = nvfx_reg(NVFXSR_INPUT, hw); - return TRUE; + return true; } } - return FALSE; + return false; default: - return TRUE; + return true; } } -static boolean +static bool nvfx_fragprog_parse_decl_output(struct nvfx_fpc *fpc, const struct tgsi_full_declaration *fdec) { @@ -984,20 +984,20 @@ nvfx_fragprog_parse_decl_output(struct nvfx_fpc *fpc, } if(hw > ((fpc->is_nv4x) ? 
4 : 2)) { NOUVEAU_ERR("bad rcol index\n"); - return FALSE; + return false; } break; default: NOUVEAU_ERR("bad output semantic\n"); - return FALSE; + return false; } fpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw); fpc->r_temps |= (1ULL << hw); - return TRUE; + return true; } -static boolean +static bool nvfx_fragprog_prepare(struct nvfx_fpc *fpc) { struct tgsi_parse_context p; @@ -1081,17 +1081,17 @@ nvfx_fragprog_prepare(struct nvfx_fpc *fpc) fpc->r_temps_discard = 0ULL; } - return TRUE; + return true; out_err: FREE(fpc->r_temp); fpc->r_temp = NULL; tgsi_parse_free(&p); - return FALSE; + return false; } -DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_fp, "NVFX_DUMP_FP", FALSE) +DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_fp, "NVFX_DUMP_FP", false) void _nvfx_fragprog_translate(uint16_t oclass, struct nv30_fragprog *fp) @@ -1100,7 +1100,7 @@ _nvfx_fragprog_translate(uint16_t oclass, struct nv30_fragprog *fp) struct nvfx_fpc *fpc = NULL; struct util_dynarray insns; - fp->translated = FALSE; + fp->translated = false; fp->point_sprite_control = 0; fp->vp_or = 0; @@ -1182,7 +1182,7 @@ _nvfx_fragprog_translate(uint16_t oclass, struct nv30_fragprog *fp) debug_printf("\n"); } - fp->translated = TRUE; + fp->translated = true; out: tgsi_parse_free(&parse); diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_shader.h b/src/gallium/drivers/nouveau/nv30/nvfx_shader.h index 9538a793d7e..e66d8af7620 100644 --- a/src/gallium/drivers/nouveau/nv30/nvfx_shader.h +++ b/src/gallium/drivers/nouveau/nv30/nvfx_shader.h @@ -448,8 +448,8 @@ struct nvfx_insn struct nvfx_src src[3]; }; -static INLINE struct nvfx_insn -nvfx_insn(boolean sat, unsigned op, int unit, struct nvfx_reg dst, unsigned mask, struct nvfx_src s0, struct nvfx_src s1, struct nvfx_src s2) +static inline struct nvfx_insn +nvfx_insn(bool sat, unsigned op, int unit, struct nvfx_reg dst, unsigned mask, struct nvfx_src s0, struct nvfx_src s1, struct nvfx_src s2) { struct nvfx_insn insn = { .op = op, @@ -468,7 +468,7 @@ nvfx_insn(boolean sat, unsigned op, int unit, struct nvfx_reg dst, unsigned mask return insn; } -static INLINE struct nvfx_reg +static inline struct nvfx_reg nvfx_reg(int type, int index) { struct nvfx_reg temp = { @@ -478,7 +478,7 @@ nvfx_reg(int type, int index) return temp; } -static INLINE struct nvfx_src +static inline struct nvfx_src nvfx_src(struct nvfx_reg reg) { struct nvfx_src temp = { @@ -491,7 +491,7 @@ nvfx_src(struct nvfx_reg reg) return temp; } -static INLINE struct nvfx_src +static inline struct nvfx_src nvfx_src_swz(struct nvfx_src src, int x, int y, int z, int w) { struct nvfx_src dst = src; @@ -503,14 +503,14 @@ nvfx_src_swz(struct nvfx_src src, int x, int y, int z, int w) return dst; } -static INLINE struct nvfx_src +static inline struct nvfx_src nvfx_src_neg(struct nvfx_src src) { src.negate = !src.negate; return src; } -static INLINE struct nvfx_src +static inline struct nvfx_src nvfx_src_abs(struct nvfx_src src) { src.abs = 1; @@ -529,7 +529,7 @@ struct nv30_vertprog; void _nvfx_fragprog_translate(uint16_t oclass, struct nv30_fragprog *fp); -boolean +bool _nvfx_vertprog_translate(uint16_t oclass, struct nv30_vertprog *vp); #endif diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c index 1ce0589be71..5757eb1fb16 100644 --- a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c +++ b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c @@ -416,7 +416,7 @@ tgsi_src(struct nvfx_vpc *vpc, const struct tgsi_full_src_register *fsrc) { return src; } -static INLINE struct nvfx_reg +static 
inline struct nvfx_reg tgsi_dst(struct nvfx_vpc *vpc, const struct tgsi_full_dst_register *fdst) { struct nvfx_reg dst; @@ -455,7 +455,7 @@ tgsi_mask(uint tgsi) return mask; } -static boolean +static bool nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc, unsigned idx, const struct tgsi_full_instruction *finst) { @@ -466,7 +466,7 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc, struct nvfx_insn insn; struct nvfx_relocation reloc; struct nvfx_loop_entry loop; - boolean sat = FALSE; + bool sat = false; int mask; int ai = -1, ci = -1, ii = -1; int i; @@ -524,25 +524,25 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc, break; default: NOUVEAU_ERR("bad src file\n"); - return FALSE; + return false; } } for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { if(src[i].reg.type < 0) - return FALSE; + return false; } if(finst->Dst[0].Register.File == TGSI_FILE_ADDRESS && finst->Instruction.Opcode != TGSI_OPCODE_ARL) - return FALSE; + return false; final_dst = dst = tgsi_dst(vpc, &finst->Dst[0]); mask = tgsi_mask(finst->Dst[0].Register.WriteMask); if(finst->Instruction.Saturate) { assert(finst->Instruction.Opcode != TGSI_OPCODE_ARL); if (vpc->is_nv4x) - sat = TRUE; + sat = true; else if(dst.type != NVFXSR_TEMP) dst = temp(vpc); @@ -793,7 +793,7 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc, break; default: NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); - return FALSE; + return false; } if(finst->Instruction.Saturate && !vpc->is_nv4x) { @@ -804,10 +804,10 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc, } release_temps(vpc); - return TRUE; + return true; } -static boolean +static bool nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc, const struct tgsi_full_declaration *fdec) { @@ -825,7 +825,7 @@ nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc, vpc->r_result[idx] = temp(vpc); vpc->r_temps_discard = 0; vpc->cvtx_idx = idx; - return TRUE; + return true; case TGSI_SEMANTIC_COLOR: if (fdec->Semantic.Index == 0) { hw = NVFX_VP(INST_DEST_COL0); @@ -834,7 +834,7 @@ nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc, hw = NVFX_VP(INST_DEST_COL1); } else { NOUVEAU_ERR("bad colour semantic index\n"); - return FALSE; + return false; } break; case TGSI_SEMANTIC_BCOLOR: @@ -845,7 +845,7 @@ nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc, hw = NVFX_VP(INST_DEST_BFC1); } else { NOUVEAU_ERR("bad bcolour semantic index\n"); - return FALSE; + return false; } break; case TGSI_SEMANTIC_FOG: @@ -868,22 +868,22 @@ nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc, if (i == num_texcoords) { vpc->r_result[idx] = nvfx_reg(NVFXSR_NONE, 0); - return TRUE; + return true; } break; case TGSI_SEMANTIC_EDGEFLAG: vpc->r_result[idx] = nvfx_reg(NVFXSR_NONE, 0); - return TRUE; + return true; default: NOUVEAU_ERR("bad output semantic\n"); - return FALSE; + return false; } vpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw); - return TRUE; + return true; } -static boolean +static bool nvfx_vertprog_prepare(struct nvfx_vpc *vpc) { struct tgsi_parse_context p; @@ -924,7 +924,7 @@ nvfx_vertprog_prepare(struct nvfx_vpc *vpc) break; case TGSI_FILE_OUTPUT: if (!nvfx_vertprog_parse_decl_output(vpc, fdec)) - return FALSE; + return false; break; default: break; @@ -961,12 +961,12 @@ nvfx_vertprog_prepare(struct nvfx_vpc *vpc) } vpc->r_temps_discard = 0; - return TRUE; + return true; } -DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_vp, "NVFX_DUMP_VP", FALSE) +DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_vp, "NVFX_DUMP_VP", false) -boolean +bool _nvfx_vertprog_translate(uint16_t oclass, struct 
nv30_vertprog *vp) { struct tgsi_parse_context parse; @@ -975,13 +975,13 @@ _nvfx_vertprog_translate(uint16_t oclass, struct nv30_vertprog *vp) struct util_dynarray insns; int i, ucps; - vp->translated = FALSE; + vp->translated = false; vp->nr_insns = 0; vp->nr_consts = 0; vpc = CALLOC_STRUCT(nvfx_vpc); if (!vpc) - return FALSE; + return false; vpc->is_nv4x = (oclass >= NV40_3D_CLASS) ? ~0 : 0; vpc->vp = vp; vpc->pipe = vp->pipe; @@ -990,7 +990,7 @@ _nvfx_vertprog_translate(uint16_t oclass, struct nv30_vertprog *vp) if (!nvfx_vertprog_prepare(vpc)) { FREE(vpc); - return FALSE; + return false; } /* Redirect post-transform vertex position to a temp if user clip @@ -1108,7 +1108,7 @@ _nvfx_vertprog_translate(uint16_t oclass, struct nv30_vertprog *vp) debug_printf("\n"); } - vp->translated = TRUE; + vp->translated = true; out: tgsi_parse_free(&parse); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_blit.h b/src/gallium/drivers/nouveau/nv50/nv50_blit.h index 756c4c11bf6..0ccec568d3a 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_blit.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_blit.h @@ -37,7 +37,7 @@ nv50_resource_resolve(struct pipe_context *, const struct pipe_resolve_info *); #define NV50_BLIT_TEXTURE_2D_ARRAY 5 #define NV50_BLIT_MAX_TEXTURE_TYPES 6 -static INLINE unsigned +static inline unsigned nv50_blit_texture_type(enum pipe_texture_target target) { switch (target) { @@ -52,7 +52,7 @@ nv50_blit_texture_type(enum pipe_texture_target target) } } -static INLINE unsigned +static inline unsigned nv50_blit_get_tgsi_texture_target(enum pipe_texture_target target) { switch (target) { @@ -67,7 +67,7 @@ nv50_blit_get_tgsi_texture_target(enum pipe_texture_target target) } } -static INLINE enum pipe_texture_target +static inline enum pipe_texture_target nv50_blit_reinterpret_pipe_texture_target(enum pipe_texture_target target) { switch (target) { @@ -81,7 +81,7 @@ nv50_blit_reinterpret_pipe_texture_target(enum pipe_texture_target target) } } -static INLINE unsigned +static inline unsigned nv50_blit_get_filter(const struct pipe_blit_info *info) { if (info->dst.resource->nr_samples < info->src.resource->nr_samples) @@ -102,7 +102,7 @@ nv50_blit_get_filter(const struct pipe_blit_info *info) /* Since shaders cannot export stencil, we cannot copy stencil values when * rendering to ZETA, so we attach the ZS surface to a colour render target. 
*/ -static INLINE enum pipe_format +static inline enum pipe_format nv50_blit_zeta_to_colour_format(enum pipe_format format) { switch (format) { @@ -127,7 +127,7 @@ nv50_blit_zeta_to_colour_format(enum pipe_format format) } -static INLINE uint16_t +static inline uint16_t nv50_blit_derive_color_mask(const struct pipe_blit_info *info) { const unsigned mask = info->mask; @@ -162,7 +162,7 @@ nv50_blit_derive_color_mask(const struct pipe_blit_info *info) return color_mask; } -static INLINE uint32_t +static inline uint32_t nv50_blit_eng2d_get_mask(const struct pipe_blit_info *info) { uint32_t mask = 0; @@ -191,8 +191,8 @@ nv50_blit_eng2d_get_mask(const struct pipe_blit_info *info) # define nv50_format_table nvc0_format_table #endif -/* return TRUE for formats that can be converted among each other by NVC0_2D */ -static INLINE boolean +/* return true for formats that can be converted among each other by NVC0_2D */ +static inline bool nv50_2d_dst_format_faithful(enum pipe_format format) { const uint64_t mask = @@ -201,7 +201,7 @@ nv50_2d_dst_format_faithful(enum pipe_format format) uint8_t id = nv50_format_table[format].rt; return (id >= 0xc0) && (mask & (1ULL << (id - 0xc0))); } -static INLINE boolean +static inline bool nv50_2d_src_format_faithful(enum pipe_format format) { const uint64_t mask = @@ -211,7 +211,7 @@ nv50_2d_src_format_faithful(enum pipe_format format) return (id >= 0xc0) && (mask & (1ULL << (id - 0xc0))); } -static INLINE boolean +static inline bool nv50_2d_format_supported(enum pipe_format format) { uint8_t id = nv50_format_table[format].rt; @@ -219,7 +219,7 @@ nv50_2d_format_supported(enum pipe_format format) (NV50_ENG2D_SUPPORTED_FORMATS & (1ULL << (id - 0xc0))); } -static INLINE boolean +static inline bool nv50_2d_dst_format_ops_supported(enum pipe_format format) { uint8_t id = nv50_format_table[format].rt; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c index 5b5d3912c20..f8d46db7c67 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_context.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c @@ -64,12 +64,12 @@ nv50_memory_barrier(struct pipe_context *pipe, unsigned flags) if (!nv50->vtxbuf[i].buffer) continue; if (nv50->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) - nv50->base.vbo_dirty = TRUE; + nv50->base.vbo_dirty = true; } if (nv50->idxbuf.buffer && nv50->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) - nv50->base.vbo_dirty = TRUE; + nv50->base.vbo_dirty = true; for (s = 0; s < 3 && !nv50->cb_dirty; ++s) { uint32_t valid = nv50->constbuf_valid[s]; @@ -87,7 +87,7 @@ nv50_memory_barrier(struct pipe_context *pipe, unsigned flags) continue; if (res->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) - nv50->cb_dirty = TRUE; + nv50->cb_dirty = true; } } } @@ -100,9 +100,9 @@ nv50_default_kick_notify(struct nouveau_pushbuf *push) if (screen) { nouveau_fence_next(&screen->base); - nouveau_fence_update(&screen->base, TRUE); + nouveau_fence_update(&screen->base, true); if (screen->cur_ctx) - screen->cur_ctx->state.flushed = TRUE; + screen->cur_ctx->state.flushed = true; } } @@ -310,7 +310,7 @@ nv50_create(struct pipe_screen *pscreen, void *priv) nv50->base.invalidate_resource_storage = nv50_invalidate_resource_storage; if (screen->base.device->chipset < 0x84 || - debug_get_bool_option("NOUVEAU_PMPEG", FALSE)) { + debug_get_bool_option("NOUVEAU_PMPEG", false)) { /* PMPEG */ nouveau_context_init_vdec(&nv50->base); } else if (screen->base.device->chipset < 0x98 || @@ -351,7 +351,7 @@ out_err: } 
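
The nv50_2d_*_format_faithful() and nv50_2d_format_supported() helpers converted above all reduce to one 64-bit membership test: 2D-engine render-target format ids start at 0xc0, so (id - 0xc0) indexes a bit in a capability mask. A self-contained sketch of that check follows; the mask value is invented for illustration and is not NV50_ENG2D_SUPPORTED_FORMATS.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* invented capability mask: pretend ids 0xc0 and 0xc2 are supported */
static const uint64_t supported = (1ULL << 0) | (1ULL << 2);

static bool format_supported(uint8_t id)
{
   /* ids below 0xc0 are not 2D-engine render-target formats at all */
   return (id >= 0xc0) && (supported & (1ULL << (id - 0xc0)));
}

int main(void)
{
   assert(format_supported(0xc0));
   assert(!format_supported(0xc1));
   assert(format_supported(0xc2));
   assert(!format_supported(0x00));
   return 0;
}
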
void -nv50_bufctx_fence(struct nouveau_bufctx *bufctx, boolean on_flush) +nv50_bufctx_fence(struct nouveau_bufctx *bufctx, bool on_flush) { struct nouveau_list *list = on_flush ? &bufctx->current : &bufctx->pending; struct nouveau_list *it; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h index 1f123ef7e92..ce12e714774 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_context.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h @@ -91,7 +91,7 @@ struct nv50_blitctx; -boolean nv50_blitctx_create(struct nv50_context *); +bool nv50_blitctx_create(struct nv50_context *); struct nv50_context { struct nouveau_context base; @@ -102,7 +102,7 @@ struct nv50_context { struct nouveau_bufctx *bufctx; uint32_t dirty; - boolean cb_dirty; + bool cb_dirty; struct nv50_graph_state state; @@ -152,26 +152,26 @@ struct nv50_context { unsigned sample_mask; unsigned min_samples; - boolean vbo_push_hint; + bool vbo_push_hint; uint32_t rt_array_mode; struct pipe_query *cond_query; - boolean cond_cond; /* inverted rendering condition */ + bool cond_cond; /* inverted rendering condition */ uint cond_mode; uint32_t cond_condmode; /* the calculated condition */ struct nv50_blitctx *blit; }; -static INLINE struct nv50_context * +static inline struct nv50_context * nv50_context(struct pipe_context *pipe) { return (struct nv50_context *)pipe; } /* return index used in nv50_context arrays for a specific shader type */ -static INLINE unsigned +static inline unsigned nv50_context_shader_stage(unsigned pipe) { switch (pipe) { @@ -188,7 +188,7 @@ nv50_context_shader_stage(unsigned pipe) /* nv50_context.c */ struct pipe_context *nv50_create(struct pipe_screen *, void *); -void nv50_bufctx_fence(struct nouveau_bufctx *, boolean on_flush); +void nv50_bufctx_fence(struct nouveau_bufctx *, bool on_flush); void nv50_default_kick_notify(struct nouveau_pushbuf *); @@ -202,7 +202,7 @@ void nv50_query_pushbuf_submit(struct nouveau_pushbuf *, void nv84_query_fifo_wait(struct nouveau_pushbuf *, struct pipe_query *); void nva0_so_target_save_offset(struct pipe_context *, struct pipe_stream_output_target *, - unsigned index, boolean seralize); + unsigned index, bool seralize); #define NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0) @@ -221,8 +221,8 @@ extern void nv50_init_state_functions(struct nv50_context *); /* nv50_state_validate.c */ /* @words: check for space before emitting relocs */ -extern boolean nv50_state_validate(struct nv50_context *, uint32_t state_mask, - unsigned space_words); +extern bool nv50_state_validate(struct nv50_context *, uint32_t state_mask, + unsigned space_words); /* nv50_surface.c */ extern void nv50_clear(struct pipe_context *, unsigned buffers, diff --git a/src/gallium/drivers/nouveau/nv50/nv50_formats.c b/src/gallium/drivers/nouveau/nv50/nv50_formats.c index 0f86ba1de0d..49a93bf1d91 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_formats.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_formats.c @@ -44,7 +44,7 @@ */ #define U_V PIPE_BIND_VERTEX_BUFFER #define U_T PIPE_BIND_SAMPLER_VIEW -#define U_I PIPE_BIND_SHADER_RESOURCE | PIPE_BIND_COMPUTE_RESOURCE +#define U_I PIPE_BIND_SHADER_BUFFER | PIPE_BIND_SHADER_IMAGE | PIPE_BIND_COMPUTE_RESOURCE #define U_TR PIPE_BIND_RENDER_TARGET | U_T #define U_IR U_TR | U_I #define U_TB PIPE_BIND_BLENDABLE | U_TR diff --git a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c index f15d8f3ecb6..92d49e49ff2 100644 --- 
a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c @@ -30,7 +30,7 @@ uint32_t nv50_tex_choose_tile_dims_helper(unsigned nx, unsigned ny, unsigned nz, - boolean is_3d) + bool is_3d) { uint32_t tile_mode = 0x000; @@ -59,13 +59,13 @@ nv50_tex_choose_tile_dims_helper(unsigned nx, unsigned ny, unsigned nz, } static uint32_t -nv50_tex_choose_tile_dims(unsigned nx, unsigned ny, unsigned nz, boolean is_3d) +nv50_tex_choose_tile_dims(unsigned nx, unsigned ny, unsigned nz, bool is_3d) { return nv50_tex_choose_tile_dims_helper(nx, ny * 2, nz, is_3d); } static uint32_t -nv50_mt_choose_storage_type(struct nv50_miptree *mt, boolean compressed) +nv50_mt_choose_storage_type(struct nv50_miptree *mt, bool compressed) { const unsigned ms = util_logbase2(mt->base.base.nr_samples); uint32_t tile_flags; @@ -184,7 +184,7 @@ nv50_miptree_get_handle(struct pipe_screen *pscreen, unsigned stride; if (!mt || !mt->base.bo) - return FALSE; + return false; stride = mt->level[0].pitch; @@ -204,7 +204,7 @@ const struct u_resource_vtbl nv50_miptree_vtbl = u_default_transfer_inline_write /* transfer_inline_write */ }; -static INLINE boolean +static inline bool nv50_miptree_init_ms_mode(struct nv50_miptree *mt) { switch (mt->base.base.nr_samples) { @@ -228,12 +228,12 @@ nv50_miptree_init_ms_mode(struct nv50_miptree *mt) break; default: NOUVEAU_ERR("invalid nr_samples: %u\n", mt->base.base.nr_samples); - return FALSE; + return false; } - return TRUE; + return true; } -boolean +bool nv50_miptree_init_layout_linear(struct nv50_miptree *mt, unsigned pitch_align) { struct pipe_resource *pt = &mt->base.base; @@ -241,12 +241,12 @@ nv50_miptree_init_layout_linear(struct nv50_miptree *mt, unsigned pitch_align) unsigned h = pt->height0; if (util_format_is_depth_or_stencil(pt->format)) - return FALSE; + return false; if ((pt->last_level > 0) || (pt->depth0 > 1) || (pt->array_size > 1)) - return FALSE; + return false; if (mt->ms_x | mt->ms_y) - return FALSE; + return false; mt->level[0].pitch = align(pt->width0 * blocksize, pitch_align); @@ -256,7 +256,7 @@ nv50_miptree_init_layout_linear(struct nv50_miptree *mt, unsigned pitch_align) mt->total_size = mt->level[0].pitch * h; - return TRUE; + return true; } static void @@ -335,7 +335,7 @@ nv50_miptree_create(struct pipe_screen *pscreen, struct nouveau_device *dev = nouveau_screen(pscreen)->device; struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree); struct pipe_resource *pt = &mt->base.base; - boolean compressed = dev->drm_version >= 0x01000101; + bool compressed = dev->drm_version >= 0x01000101; int ret; union nouveau_bo_config bo_config; uint32_t bo_flags; @@ -438,7 +438,7 @@ nv50_miptree_from_handle(struct pipe_screen *pscreen, /* Offset of zslice @z from start of level @l. 
*/ -INLINE unsigned +inline unsigned nv50_mt_zslice_offset(const struct nv50_miptree *mt, unsigned l, unsigned z) { const struct pipe_resource *pt = &mt->base.base; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c index aaca4c550d9..02dc3677259 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_program.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c @@ -25,7 +25,7 @@ #include "codegen/nv50_ir_driver.h" -static INLINE unsigned +static inline unsigned bitcount4(const uint32_t val) { static const uint8_t cnt[16] @@ -104,7 +104,7 @@ nv50_vertprog_assign_slots(struct nv50_ir_prog_info *info) prog->vp.bfc[info->out[i].si] = i; break; case TGSI_SEMANTIC_LAYER: - prog->gp.has_layer = TRUE; + prog->gp.has_layer = true; prog->gp.layerid = n; break; case TGSI_SEMANTIC_VIEWPORT_INDEX: @@ -316,7 +316,7 @@ nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info, return so; } -boolean +bool nv50_program_translate(struct nv50_program *prog, uint16_t chipset) { struct nv50_ir_prog_info *info; @@ -325,7 +325,7 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset) info = CALLOC_STRUCT(nv50_ir_prog_info); if (!info) - return FALSE; + return false; info->type = prog->type; info->target = chipset; @@ -410,7 +410,7 @@ out: return !ret; } -boolean +bool nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog) { struct nouveau_heap *heap; @@ -423,7 +423,7 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog) case PIPE_SHADER_FRAGMENT: heap = nv50->screen->gp_code_heap; break; default: assert(!"invalid program type"); - return FALSE; + return false; } ret = nouveau_heap_alloc(heap, size, prog, &prog->mem); @@ -440,7 +440,7 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog) ret = nouveau_heap_alloc(heap, size, prog, &prog->mem); if (ret) { NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size); - return FALSE; + return false; } } prog->code_base = prog->mem->start; @@ -448,10 +448,10 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog) ret = nv50_tls_realloc(nv50->screen, prog->tls_space); if (ret < 0) { nouveau_heap_free(&prog->mem); - return FALSE; + return false; } if (ret > 0) - nv50->state.new_tls_space = TRUE; + nv50->state.new_tls_space = true; if (prog->fixups) nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0); @@ -463,7 +463,7 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog) BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1); PUSH_DATA (nv50->base.pushbuf, 0); - return TRUE; + return true; } void diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.h b/src/gallium/drivers/nouveau/nv50/nv50_program.h index fe6bd6025be..5d3ff5644d2 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_program.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_program.h @@ -53,7 +53,7 @@ struct nv50_program { struct pipe_shader_state pipe; ubyte type; - boolean translated; + bool translated; uint32_t *code; unsigned code_size; @@ -104,8 +104,8 @@ struct nv50_program { struct nv50_stream_output_state *so; }; -boolean nv50_program_translate(struct nv50_program *, uint16_t chipset); -boolean nv50_program_upload_code(struct nv50_context *, struct nv50_program *); +bool nv50_program_translate(struct nv50_program *, uint16_t chipset); +bool nv50_program_upload_code(struct nv50_context *, struct nv50_program *); void nv50_program_destroy(struct 
nv50_context *, struct nv50_program *); #endif /* __NV50_PROG_H__ */ diff --git a/src/gallium/drivers/nouveau/nv50/nv50_push.c b/src/gallium/drivers/nouveau/nv50/nv50_push.c index a3a397c52c1..f31eaa0e314 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_push.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_push.c @@ -23,13 +23,13 @@ struct push_context { struct translate *translate; - boolean primitive_restart; + bool primitive_restart; uint32_t prim; uint32_t restart_index; uint32_t instance_id; }; -static INLINE unsigned +static inline unsigned prim_restart_search_i08(uint8_t *elts, unsigned push, uint8_t index) { unsigned i; @@ -39,7 +39,7 @@ prim_restart_search_i08(uint8_t *elts, unsigned push, uint8_t index) return i; } -static INLINE unsigned +static inline unsigned prim_restart_search_i16(uint16_t *elts, unsigned push, uint16_t index) { unsigned i; @@ -49,7 +49,7 @@ prim_restart_search_i16(uint16_t *elts, unsigned push, uint16_t index) return i; } -static INLINE unsigned +static inline unsigned prim_restart_search_i32(uint32_t *elts, unsigned push, uint32_t index) { unsigned i; @@ -179,7 +179,7 @@ emit_vertices_seq(struct push_context *ctx, unsigned start, unsigned count) #define NV50_PRIM_GL_CASE(n) \ case PIPE_PRIM_##n: return NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n -static INLINE unsigned +static inline unsigned nv50_prim_gl(unsigned prim) { switch (prim) { @@ -212,7 +212,7 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info) unsigned i, index_size; unsigned inst_count = info->instance_count; unsigned vert_count = info->count; - boolean apply_bias = info->indexed && info->index_bias; + bool apply_bias = info->indexed && info->index_bias; ctx.push = nv50->base.pushbuf; ctx.translate = nv50->vertex->translate; @@ -258,12 +258,12 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info) NOUVEAU_ERR("draw_stream_output not supported on pre-NVA0 cards\n"); return; } - pipe->get_query_result(pipe, targ->pq, TRUE, (void *)&vert_count); + pipe->get_query_result(pipe, targ->pq, true, (void *)&vert_count); vert_count /= targ->stride; } ctx.idxbuf = NULL; index_size = 0; - ctx.primitive_restart = FALSE; + ctx.primitive_restart = false; ctx.restart_index = 0; } diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c b/src/gallium/drivers/nouveau/nv50/nv50_query.c index 81f7474e36b..f4adbf8c653 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_query.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c @@ -48,20 +48,21 @@ struct nv50_query { uint32_t base; uint32_t offset; /* base + i * 32 */ uint8_t state; - boolean is64bit; + bool is64bit; + int nesting; /* only used for occlusion queries */ struct nouveau_mm_allocation *mm; struct nouveau_fence *fence; }; #define NV50_QUERY_ALLOC_SPACE 256 -static INLINE struct nv50_query * +static inline struct nv50_query * nv50_query(struct pipe_query *pipe) { return (struct nv50_query *)pipe; } -static boolean +static bool nv50_query_allocate(struct nv50_context *nv50, struct nv50_query *q, int size) { struct nv50_screen *screen = nv50->screen; @@ -80,17 +81,17 @@ nv50_query_allocate(struct nv50_context *nv50, struct nv50_query *q, int size) if (size) { q->mm = nouveau_mm_allocate(screen->base.mm_GART, size, &q->bo, &q->base); if (!q->bo) - return FALSE; + return false; q->offset = q->base; ret = nouveau_bo_map(q->bo, 0, screen->base.client); if (ret) { nv50_query_allocate(nv50, q, 0); - return FALSE; + return false; } q->data = (uint32_t *)((uint8_t *)q->bo->map + q->base); } - return TRUE; + return true; 
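
The prim_restart_search_i08/i16/i32 helpers in nv50_push.c, re-marked inline above, share one job: count how many indices can be pushed before the primitive-restart marker appears. A sketch of the 16-bit variant with a tiny test; the loop body is reconstructed from the visible pattern, since the hunks show only its head and tail.

#include <assert.h>
#include <stdint.h>

/* count leading indices that precede the restart marker */
static inline unsigned
prim_restart_search_i16(const uint16_t *elts, unsigned push, uint16_t index)
{
   unsigned i;
   for (i = 0; i < push; ++i)
      if (elts[i] == index)
         break;
   return i;
}

int main(void)
{
   const uint16_t elts[] = { 0, 1, 2, 0xffff, 3, 4 };
   assert(prim_restart_search_i16(elts, 6, 0xffff) == 3); /* 3 before marker */
   assert(prim_restart_search_i16(elts, 3, 0xffff) == 3); /* marker absent */
   return 0;
}
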
} static void @@ -153,8 +154,8 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq) struct nv50_query *q = nv50_query(pq); /* For occlusion queries we have to change the storage, because a previous - * query might set the initial render conition to FALSE even *after* we re- - * initialized it to TRUE. + * query might set the initial render condition to false even *after* we re- + * initialized it to true. */ if (q->type == PIPE_QUERY_OCCLUSION_COUNTER) { q->offset += 32; @@ -166,7 +167,7 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq) * query ? */ q->data[0] = q->sequence; /* initialize sequence */ - q->data[1] = 1; /* initial render condition = TRUE */ + q->data[1] = 1; /* initial render condition = true */ q->data[4] = q->sequence + 1; /* for comparison COND_MODE */ q->data[5] = 0; } @@ -175,11 +176,16 @@ switch (q->type) { case PIPE_QUERY_OCCLUSION_COUNTER: - PUSH_SPACE(push, 4); - BEGIN_NV04(push, NV50_3D(COUNTER_RESET), 1); - PUSH_DATA (push, NV50_3D_COUNTER_RESET_SAMPLECNT); - BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1); - PUSH_DATA (push, 1); + q->nesting = nv50->screen->num_occlusion_queries_active++; + if (q->nesting) { + nv50_query_get(push, q, 0x10, 0x0100f002); + } else { + PUSH_SPACE(push, 4); + BEGIN_NV04(push, NV50_3D(COUNTER_RESET), 1); + PUSH_DATA (push, NV50_3D_COUNTER_RESET_SAMPLECNT); + BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1); + PUSH_DATA (push, 1); + } break; case PIPE_QUERY_PRIMITIVES_GENERATED: nv50_query_get(push, q, 0x10, 0x06805002); @@ -223,9 +229,11 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq) switch (q->type) { case PIPE_QUERY_OCCLUSION_COUNTER: nv50_query_get(push, q, 0, 0x0100f002); - PUSH_SPACE(push, 2); - BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1); - PUSH_DATA (push, 0); + if (--nv50->screen->num_occlusion_queries_active == 0) { + PUSH_SPACE(push, 2); + BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1); + PUSH_DATA (push, 0); + } break; case PIPE_QUERY_PRIMITIVES_GENERATED: nv50_query_get(push, q, 0, 0x06805002); @@ -261,7 +269,7 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq) nv50_query_get(push, q, 0, 0x0d005002 | (q->index << 5)); break; case PIPE_QUERY_TIMESTAMP_DISJOINT: - /* This query is not issued on GPU because disjoint is forced to FALSE */ + /* This query is not issued on GPU because disjoint is forced to false */ q->state = NV50_QUERY_STATE_READY; break; default: @@ -273,7 +281,7 @@ nouveau_fence_ref(nv50->screen->base.fence.current, &q->fence); } -static INLINE void +static inline void nv50_query_update(struct nv50_query *q) { if (q->is64bit) { @@ -293,7 +301,7 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq, struct nv50_query *q = nv50_query(pq); uint64_t *res64 = (uint64_t *)result; uint32_t *res32 = (uint32_t *)result; - boolean *res8 = (boolean *)result; + uint8_t *res8 = (uint8_t *)result; uint64_t *data64 = (uint64_t *)q->data; int i; @@ -307,19 +315,19 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq, q->state = NV50_QUERY_STATE_FLUSHED; PUSH_KICK(nv50->base.pushbuf); } - return FALSE; + return false; } if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nv50->screen->base.client)) - return FALSE; + return false; } q->state = NV50_QUERY_STATE_READY; switch (q->type) { case PIPE_QUERY_GPU_FINISHED: - res8[0] = TRUE; + res8[0] = true; break; case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 
sequence, u32 count, u64 time */ - res64[0] = q->data[1]; + res64[0] = q->data[1] - q->data[5]; break; case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */ case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */ @@ -338,7 +346,7 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq, break; case PIPE_QUERY_TIMESTAMP_DISJOINT: res64[0] = 1000000000; - res8[8] = FALSE; + res8[8] = false; break; case PIPE_QUERY_TIME_ELAPSED: res64[0] = data64[1] - data64[3]; @@ -347,10 +355,10 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq, res32[0] = q->data[1]; break; default: - return FALSE; + return false; } - return TRUE; + return true; } void @@ -377,7 +385,7 @@ nv50_render_condition(struct pipe_context *pipe, struct nouveau_pushbuf *push = nv50->base.pushbuf; struct nv50_query *q; uint32_t cond; - boolean wait = + bool wait = mode != PIPE_RENDER_COND_NO_WAIT && mode != PIPE_RENDER_COND_BY_REGION_NO_WAIT; @@ -391,13 +399,12 @@ nv50_render_condition(struct pipe_context *pipe, case PIPE_QUERY_SO_OVERFLOW_PREDICATE: cond = condition ? NV50_3D_COND_MODE_EQUAL : NV50_3D_COND_MODE_NOT_EQUAL; - wait = TRUE; + wait = true; break; case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: if (likely(!condition)) { - /* XXX: Placeholder, handle nesting here if available */ - if (unlikely(false)) + if (unlikely(q->nesting)) cond = wait ? NV50_3D_COND_MODE_NOT_EQUAL : NV50_3D_COND_MODE_ALWAYS; else @@ -461,7 +468,7 @@ nv50_query_pushbuf_submit(struct nouveau_pushbuf *push, void nva0_so_target_save_offset(struct pipe_context *pipe, struct pipe_stream_output_target *ptarg, - unsigned index, boolean serialize) + unsigned index, bool serialize) { struct nv50_so_target *targ = nv50_so_target(ptarg); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_resource.h b/src/gallium/drivers/nouveau/nv50/nv50_resource.h index f7ee1354a92..a46e622c597 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_resource.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_resource.h @@ -35,7 +35,7 @@ nv50_screen_init_resource_functions(struct pipe_screen *pscreen); uint32_t nv50_tex_choose_tile_dims_helper(unsigned nx, unsigned ny, unsigned nz, - boolean is_3d); + bool is_3d); struct nv50_miptree_level { uint32_t offset; @@ -50,13 +50,13 @@ struct nv50_miptree { struct nv50_miptree_level level[NV50_MAX_TEXTURE_LEVELS]; uint32_t total_size; uint32_t layer_stride; - boolean layout_3d; /* TRUE if layer count varies with mip level */ + bool layout_3d; /* true if layer count varies with mip level */ uint8_t ms_x; /* log2 of number of samples in x/y dimension */ uint8_t ms_y; uint8_t ms_mode; }; -static INLINE struct nv50_miptree * +static inline struct nv50_miptree * nv50_miptree(struct pipe_resource *pt) { return (struct nv50_miptree *)pt; @@ -70,7 +70,7 @@ nv50_miptree(struct pipe_resource *pt) /* Internal functions: */ -boolean +bool nv50_miptree_init_layout_linear(struct nv50_miptree *mt, unsigned pitch_align); struct pipe_resource * @@ -98,13 +98,13 @@ struct nv50_surface { uint16_t depth; }; -static INLINE struct nv50_surface * +static inline struct nv50_surface * nv50_surface(struct pipe_surface *ps) { return (struct nv50_surface *)ps; } -static INLINE enum pipe_format +static inline enum pipe_format nv50_zs_to_s_format(enum pipe_format format) { switch (format) { diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index 6583a353578..30e6e042fbf 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ 
b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -51,19 +51,19 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen, unsigned bindings) { if (sample_count > 8) - return FALSE; + return false; if (!(0x117 & (1 << sample_count))) /* 0, 1, 2, 4 or 8 */ - return FALSE; + return false; if (sample_count == 8 && util_format_get_blocksizebits(format) >= 128) - return FALSE; + return false; if (!util_format_is_supported(format, bindings)) - return FALSE; + return false; switch (format) { case PIPE_FORMAT_Z16_UNORM: if (nv50_screen(pscreen)->tesla->oclass < NVA0_3D_CLASS) - return FALSE; + return false; break; default: break; @@ -176,6 +176,9 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_CLIP_HALFZ: case PIPE_CAP_POLYGON_OFFSET_CLAMP: case PIPE_CAP_QUERY_PIPELINE_STATISTICS: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 1; case PIPE_CAP_SEAMLESS_CUBE_MAP: return 1; /* class_3d >= NVA0_3D_CLASS; */ @@ -210,6 +213,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: /* potentially supported on some hw */ case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: return 0; case PIPE_CAP_VENDOR_ID: @@ -286,7 +290,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: /* The chip could handle more sampler views than samplers */ case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: - return MIN2(32, PIPE_MAX_SAMPLERS); + return MIN2(16, PIPE_MAX_SAMPLERS); case PIPE_SHADER_CAP_DOUBLES: case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: @@ -454,7 +458,7 @@ nv50_screen_init_hwctx(struct nv50_screen *screen) BEGIN_NV04(push, NV50_3D(UNK1400_LANES), 1); PUSH_DATA (push, 0xf); - if (debug_get_bool_option("NOUVEAU_SHADER_WATCHDOG", TRUE)) { + if (debug_get_bool_option("NOUVEAU_SHADER_WATCHDOG", true)) { BEGIN_NV04(push, NV50_3D(WATCHDOG_TIMER), 1); PUSH_DATA (push, 0x18); } @@ -734,7 +738,7 @@ nv50_screen_create(struct nouveau_device *dev) nv50_screen_init_resource_functions(pscreen); if (screen->base.device->chipset < 0x84 || - debug_get_bool_option("NOUVEAU_PMPEG", FALSE)) { + debug_get_bool_option("NOUVEAU_PMPEG", false)) { /* PMPEG */ nouveau_screen_init_vdec(&screen->base); } else if (screen->base.device->chipset < 0x98 || @@ -890,7 +894,7 @@ nv50_screen_create(struct nouveau_device *dev) nv50_screen_init_hwctx(screen); - nouveau_fence_new(&screen->base, &screen->base.fence.current, FALSE); + nouveau_fence_new(&screen->base, &screen->base.fence.current, false); return pscreen; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.h b/src/gallium/drivers/nouveau/nv50/nv50_screen.h index 881051b1862..ce51f0fc254 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.h @@ -32,14 +32,14 @@ struct nv50_graph_state { uint32_t semantic_color; uint32_t semantic_psize; int32_t index_bias; - boolean uniform_buffer_bound[3]; - boolean prim_restart; - boolean point_sprite; - boolean rt_serialize; - boolean flushed; - boolean rasterizer_discard; + bool uniform_buffer_bound[3]; + bool prim_restart; + bool point_sprite; + bool rt_serialize; + bool flushed; + bool rasterizer_discard; uint8_t tls_required; - boolean new_tls_space; + bool new_tls_space; uint8_t num_vtxbufs; uint8_t num_vtxelts; uint8_t 
num_textures[3]; @@ -54,6 +54,8 @@ struct nv50_screen { struct nv50_context *cur_ctx; struct nv50_graph_state save_state; + int num_occlusion_queries_active; + struct nouveau_bo *code; struct nouveau_bo *uniforms; struct nouveau_bo *txc; /* TIC (offset 0) and TSC (65536) */ @@ -95,19 +97,19 @@ struct nv50_screen { struct nouveau_object *m2mf; }; -static INLINE struct nv50_screen * +static inline struct nv50_screen * nv50_screen(struct pipe_screen *screen) { return (struct nv50_screen *)screen; } -boolean nv50_blitter_create(struct nv50_screen *); +bool nv50_blitter_create(struct nv50_screen *); void nv50_blitter_destroy(struct nv50_screen *); int nv50_screen_tic_alloc(struct nv50_screen *, void *); int nv50_screen_tsc_alloc(struct nv50_screen *, void *); -static INLINE void +static inline void nv50_resource_fence(struct nv04_resource *res, uint32_t flags) { struct nv50_screen *screen = nv50_screen(res->base.screen); @@ -119,7 +121,7 @@ nv50_resource_fence(struct nv04_resource *res, uint32_t flags) } } -static INLINE void +static inline void nv50_resource_validate(struct nv04_resource *res, uint32_t flags) { if (likely(res->bo)) { @@ -142,21 +144,21 @@ struct nv50_format { extern const struct nv50_format nv50_format_table[]; -static INLINE void +static inline void nv50_screen_tic_unlock(struct nv50_screen *screen, struct nv50_tic_entry *tic) { if (tic->id >= 0) screen->tic.lock[tic->id / 32] &= ~(1 << (tic->id % 32)); } -static INLINE void +static inline void nv50_screen_tsc_unlock(struct nv50_screen *screen, struct nv50_tsc_entry *tsc) { if (tsc->id >= 0) screen->tsc.lock[tsc->id / 32] &= ~(1 << (tsc->id % 32)); } -static INLINE void +static inline void nv50_screen_tic_free(struct nv50_screen *screen, struct nv50_tic_entry *tic) { if (tic->id >= 0) { @@ -165,7 +167,7 @@ nv50_screen_tic_free(struct nv50_screen *screen, struct nv50_tic_entry *tic) } } -static INLINE void +static inline void nv50_screen_tsc_free(struct nv50_screen *screen, struct nv50_tsc_entry *tsc) { if (tsc->id >= 0) { diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c index c698782d8bd..b033ce5c6dc 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c @@ -60,7 +60,7 @@ nv50_constbufs_validate(struct nv50_context *nv50) continue; } if (!nv50->state.uniform_buffer_bound[s]) { - nv50->state.uniform_buffer_bound[s] = TRUE; + nv50->state.uniform_buffer_bound[s] = true; BEGIN_NV04(push, NV50_3D(SET_PROGRAM_CB), 1); PUSH_DATA (push, (b << 12) | (i << 8) | p | 1); } @@ -99,33 +99,35 @@ nv50_constbufs_validate(struct nv50_context *nv50) PUSH_DATA (push, (b << 12) | (i << 8) | p | 1); BCTX_REFN(nv50->bufctx_3d, CB(s, i), res, RD); + + nv50->cb_dirty = 1; /* Force cache flush for UBO. 
*/ } else { BEGIN_NV04(push, NV50_3D(SET_PROGRAM_CB), 1); PUSH_DATA (push, (i << 8) | p | 0); } if (i == 0) - nv50->state.uniform_buffer_bound[s] = FALSE; + nv50->state.uniform_buffer_bound[s] = false; } } } } -static boolean +static bool nv50_program_validate(struct nv50_context *nv50, struct nv50_program *prog) { if (!prog->translated) { prog->translated = nv50_program_translate( prog, nv50->screen->base.device->chipset); if (!prog->translated) - return FALSE; + return false; } else if (prog->mem) - return TRUE; + return true; return nv50_program_upload_code(nv50, prog); } -static INLINE void +static inline void nv50_program_update_context_state(struct nv50_context *nv50, struct nv50_program *prog, int stage) { @@ -136,7 +138,7 @@ nv50_program_update_context_state(struct nv50_context *nv50, nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TLS); if (!nv50->state.tls_required || nv50->state.new_tls_space) BCTX_REFN_bo(nv50->bufctx_3d, TLS, flags, nv50->screen->tls_bo); - nv50->state.new_tls_space = FALSE; + nv50->state.new_tls_space = false; nv50->state.tls_required |= 1 << stage; } else { if (nv50->state.tls_required == (1 << stage)) @@ -243,11 +245,11 @@ nv50_sprite_coords_validate(struct nv50_context *nv50) for (i = 0; i < 8; ++i) PUSH_DATA(push, 0); - nv50->state.point_sprite = FALSE; + nv50->state.point_sprite = false; } return; } else { - nv50->state.point_sprite = TRUE; + nv50->state.point_sprite = true; } memset(pntc, 0, sizeof(pntc)); @@ -646,7 +648,7 @@ nv50_stream_output_validate(struct nv50_context *nv50) nv50_query_pushbuf_submit(push, targ->pq, 0x4); } else { PUSH_DATA(push, 0); - targ->clean = FALSE; + targ->clean = false; } } else { const unsigned limit = targ->pipe.buffer_size / diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c index d4d41af3c61..9505a0b4085 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_state.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c @@ -62,7 +62,7 @@ * in advance to maintain elegant separate shader objects.) 
*/ -static INLINE uint32_t +static inline uint32_t nv50_colormask(unsigned mask) { uint32_t ret = 0; @@ -82,7 +82,7 @@ nv50_colormask(unsigned mask) #define NV50_BLEND_FACTOR_CASE(a, b) \ case PIPE_BLENDFACTOR_##a: return NV50_BLEND_FACTOR_##b -static INLINE uint32_t +static inline uint32_t nv50_blend_fac(unsigned factor) { switch (factor) { @@ -116,7 +116,7 @@ nv50_blend_state_create(struct pipe_context *pipe, { struct nv50_blend_stateobj *so = CALLOC_STRUCT(nv50_blend_stateobj); int i; - boolean emit_common_func = cso->rt[0].blend_enable; + bool emit_common_func = cso->rt[0].blend_enable; uint32_t ms; if (nv50_context(pipe)->screen->tesla->oclass >= NVA3_3D_CLASS) { @@ -137,11 +137,11 @@ nv50_blend_state_create(struct pipe_context *pipe, for (i = 0; i < 8; ++i) { SB_DATA(so, cso->rt[i].blend_enable); if (cso->rt[i].blend_enable) - emit_common_func = TRUE; + emit_common_func = true; } if (nv50_context(pipe)->screen->tesla->oclass >= NVA3_3D_CLASS) { - emit_common_func = FALSE; + emit_common_func = false; for (i = 0; i < 8; ++i) { if (!cso->rt[i].blend_enable) @@ -373,6 +373,16 @@ nv50_zsa_state_create(struct pipe_context *pipe, SB_DATA (so, 0); } + SB_BEGIN_3D(so, DEPTH_BOUNDS_EN, 1); + if (cso->depth.bounds_test) { + SB_DATA (so, 1); + SB_BEGIN_3D(so, DEPTH_BOUNDS(0), 2); + SB_DATA (so, fui(cso->depth.bounds_min)); + SB_DATA (so, fui(cso->depth.bounds_max)); + } else { + SB_DATA (so, 0); + } + if (cso->stencil[0].enabled) { SB_BEGIN_3D(so, STENCIL_ENABLE, 5); SB_DATA (so, 1); @@ -439,7 +449,7 @@ nv50_zsa_state_delete(struct pipe_context *pipe, void *hwcso) #define NV50_TSC_WRAP_CASE(n) \ case PIPE_TEX_WRAP_##n: return NV50_TSC_WRAP_##n -static INLINE unsigned +static inline unsigned nv50_tsc_wrap_mode(unsigned wrap) { switch (wrap) { @@ -572,7 +582,7 @@ nv50_sampler_state_delete(struct pipe_context *pipe, void *hwcso) FREE(hwcso); } -static INLINE void +static inline void nv50_stage_sampler_states_bind(struct nv50_context *nv50, int s, unsigned nr, void **hwcso) { @@ -650,7 +660,7 @@ nv50_sampler_view_destroy(struct pipe_context *pipe, FREE(nv50_tic_entry(view)); } -static INLINE void +static inline void nv50_stage_set_sampler_views(struct nv50_context *nv50, int s, unsigned nr, struct pipe_sampler_view **views) @@ -808,7 +818,7 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, pipe_resource_reference(&nv50->constbuf[s][i].u.buf, res); - nv50->constbuf[s][i].user = (cb && cb->user_buffer) ? TRUE : FALSE; + nv50->constbuf[s][i].user = (cb && cb->user_buffer) ? 
true : false; if (nv50->constbuf[s][i].user) { nv50->constbuf[s][i].u.data = cb->user_buffer; nv50->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000); @@ -1041,7 +1051,7 @@ nv50_so_target_create(struct pipe_context *pipe, } else { targ->pq = NULL; } - targ->clean = TRUE; + targ->clean = true; targ->pipe.buffer_size = size; targ->pipe.buffer_offset = offset; @@ -1075,32 +1085,32 @@ nv50_set_stream_output_targets(struct pipe_context *pipe, { struct nv50_context *nv50 = nv50_context(pipe); unsigned i; - boolean serialize = TRUE; - const boolean can_resume = nv50->screen->base.class_3d >= NVA0_3D_CLASS; + bool serialize = true; + const bool can_resume = nv50->screen->base.class_3d >= NVA0_3D_CLASS; assert(num_targets <= 4); for (i = 0; i < num_targets; ++i) { - const boolean changed = nv50->so_target[i] != targets[i]; - const boolean append = (offsets[i] == (unsigned)-1); + const bool changed = nv50->so_target[i] != targets[i]; + const bool append = (offsets[i] == (unsigned)-1); if (!changed && append) continue; nv50->so_targets_dirty |= 1 << i; if (can_resume && changed && nv50->so_target[i]) { nva0_so_target_save_offset(pipe, nv50->so_target[i], i, serialize); - serialize = FALSE; + serialize = false; } if (targets[i] && !append) - nv50_so_target(targets[i])->clean = TRUE; + nv50_so_target(targets[i])->clean = true; pipe_so_target_reference(&nv50->so_target[i], targets[i]); } for (; i < nv50->num_so_targets; ++i) { if (can_resume && nv50->so_target[i]) { nva0_so_target_save_offset(pipe, nv50->so_target[i], i, serialize); - serialize = FALSE; + serialize = false; } pipe_so_target_reference(&nv50->so_target[i], NULL); nv50->so_targets_dirty |= 1 << i; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c index 116bf4bba7c..985603df5fa 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c @@ -2,7 +2,7 @@ #include "nv50/nv50_context.h" #include "nv50/nv50_defs.xml.h" -static INLINE void +static inline void nv50_fb_set_null_rt(struct nouveau_pushbuf *push, unsigned i) { BEGIN_NV04(push, NV50_3D(RT_ADDRESS_HIGH(i)), 4); @@ -82,7 +82,7 @@ nv50_validate_fb(struct nv50_context *nv50) ms_mode = mt->ms_mode; if (mt->base.status & NOUVEAU_BUFFER_STATUS_GPU_READING) - nv50->state.rt_serialize = TRUE; + nv50->state.rt_serialize = true; mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING; mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING; @@ -111,7 +111,7 @@ nv50_validate_fb(struct nv50_context *nv50) ms_mode = mt->ms_mode; if (mt->base.status & NOUVEAU_BUFFER_STATUS_GPU_READING) - nv50->state.rt_serialize = TRUE; + nv50->state.rt_serialize = true; mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING; mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING; @@ -275,7 +275,7 @@ nv50_validate_viewport(struct nv50_context *nv50) nv50->viewports_dirty = 0; } -static INLINE void +static inline void nv50_check_program_ucps(struct nv50_context *nv50, struct nv50_program *vp, uint8_t mask) { @@ -296,6 +296,23 @@ nv50_check_program_ucps(struct nv50_context *nv50, nv50_fp_linkage_validate(nv50); } +/* alpha test is disabled if there are no color RTs, so make sure we have at + * least one if alpha test is enabled. Note that this must run after + * nv50_validate_fb, otherwise that will override the RT count setting. 
+ */ +static void +nv50_validate_derived_2(struct nv50_context *nv50) +{ + struct nouveau_pushbuf *push = nv50->base.pushbuf; + + if (nv50->zsa && nv50->zsa->pipe.alpha.enabled && + nv50->framebuffer.nr_cbufs == 0) { + nv50_fb_set_null_rt(push, 0); + BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1); + PUSH_DATA (push, (076543210 << 4) | 1); + } +} + static void nv50_validate_clip(struct nv50_context *nv50) { @@ -456,6 +473,7 @@ static struct state_validate { { nv50_gp_linkage_validate, NV50_NEW_GMTYPROG | NV50_NEW_VERTPROG }, { nv50_validate_derived_rs, NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER | NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG }, + { nv50_validate_derived_2, NV50_NEW_ZSA | NV50_NEW_FRAMEBUFFER }, { nv50_validate_clip, NV50_NEW_CLIP | NV50_NEW_RASTERIZER | NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG }, { nv50_constbufs_validate, NV50_NEW_CONSTBUF }, @@ -468,7 +486,7 @@ static struct state_validate { }; #define validate_list_len (sizeof(validate_list) / sizeof(validate_list[0])) -boolean +bool nv50_state_validate(struct nv50_context *nv50, uint32_t mask, unsigned words) { uint32_t state_mask; @@ -490,19 +508,19 @@ nv50_state_validate(struct nv50_context *nv50, uint32_t mask, unsigned words) nv50->dirty &= ~state_mask; if (nv50->state.rt_serialize) { - nv50->state.rt_serialize = FALSE; + nv50->state.rt_serialize = false; BEGIN_NV04(nv50->base.pushbuf, SUBC_3D(NV50_GRAPH_SERIALIZE), 1); PUSH_DATA (nv50->base.pushbuf, 0); } - nv50_bufctx_fence(nv50->bufctx_3d, FALSE); + nv50_bufctx_fence(nv50->bufctx_3d, false); } nouveau_pushbuf_bufctx(nv50->base.pushbuf, nv50->bufctx_3d); ret = nouveau_pushbuf_validate(nv50->base.pushbuf); if (unlikely(nv50->state.flushed)) { - nv50->state.flushed = FALSE; - nv50_bufctx_fence(nv50->bufctx_3d, TRUE); + nv50->state.flushed = false; + nv50_bufctx_fence(nv50->bufctx_3d, true); } return !ret; } diff --git a/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h index eea5327b6cb..cf75d1eb11b 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h @@ -31,7 +31,7 @@ struct nv50_rasterizer_stateobj { struct nv50_zsa_stateobj { struct pipe_depth_stencil_alpha_state pipe; int size; - uint32_t state[29]; + uint32_t state[34]; }; struct nv50_constbuf { @@ -41,7 +41,7 @@ struct nv50_constbuf { } u; uint32_t size; /* max 65536 */ uint32_t offset; - boolean user; /* should only be TRUE if u.data is valid and non-NULL */ + bool user; /* should only be true if u.data is valid and non-NULL */ }; struct nv50_vertex_element { @@ -56,7 +56,7 @@ struct nv50_vertex_stateobj { unsigned num_elements; uint32_t instance_elts; uint32_t instance_bufs; - boolean need_conversion; + bool need_conversion; unsigned vertex_size; unsigned packet_vertex_limit; struct nv50_vertex_element element[0]; @@ -66,10 +66,10 @@ struct nv50_so_target { struct pipe_stream_output_target pipe; struct pipe_query *pq; unsigned stride; - boolean clean; + bool clean; }; -static INLINE struct nv50_so_target * +static inline struct nv50_so_target * nv50_so_target(struct pipe_stream_output_target *ptarg) { return (struct nv50_so_target *)ptarg; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_stateobj_tex.h b/src/gallium/drivers/nouveau/nv50/nv50_stateobj_tex.h index 99548cbdb42..e0793bb6ec4 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_stateobj_tex.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_stateobj_tex.h @@ -9,7 +9,7 @@ struct nv50_tsc_entry { uint32_t tsc[8]; }; -static INLINE struct nv50_tsc_entry * +static 
inline struct nv50_tsc_entry * nv50_tsc_entry(void *hwcso) { return (struct nv50_tsc_entry *)hwcso; @@ -21,7 +21,7 @@ struct nv50_tic_entry { uint32_t tic[8]; }; -static INLINE struct nv50_tic_entry * +static inline struct nv50_tic_entry * nv50_tic_entry(struct pipe_sampler_view *view) { return (struct nv50_tic_entry *)view; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c index dc9852d4e47..b1ae01692cb 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c @@ -49,8 +49,8 @@ #define NOUVEAU_DRIVER 0x50 #include "nv50/nv50_blit.h" -static INLINE uint8_t -nv50_2d_format(enum pipe_format format, boolean dst, boolean dst_src_equal) +static inline uint8_t +nv50_2d_format(enum pipe_format format, bool dst, bool dst_src_equal) { uint8_t id = nv50_format_table[format].rt; @@ -76,7 +76,7 @@ nv50_2d_format(enum pipe_format format, boolean dst, boolean dst_src_equal) static int nv50_2d_texture_set(struct nouveau_pushbuf *push, int dst, struct nv50_miptree *mt, unsigned level, unsigned layer, - enum pipe_format pformat, boolean dst_src_pformat_equal) + enum pipe_format pformat, bool dst_src_pformat_equal) { struct nouveau_bo *bo = mt->base.bo; uint32_t width, height, depth; @@ -153,7 +153,7 @@ nv50_2d_texture_do_copy(struct nouveau_pushbuf *push, const enum pipe_format dfmt = dst->base.base.format; const enum pipe_format sfmt = src->base.base.format; int ret; - boolean eqfmt = dfmt == sfmt; + bool eqfmt = dfmt == sfmt; if (!PUSH_SPACE(push, 2 * 16 + 32)) return PIPE_ERROR; @@ -196,7 +196,7 @@ nv50_resource_copy_region(struct pipe_context *pipe, { struct nv50_context *nv50 = nv50_context(pipe); int ret; - boolean m2mf; + bool m2mf; unsigned dst_layer = dstz, src_layer = src_box->z; if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) { @@ -658,7 +658,7 @@ nv50_blitter_make_vp(struct nv50_blitter *blit) }; blit->vp.type = PIPE_SHADER_VERTEX; - blit->vp.translated = TRUE; + blit->vp.translated = true; blit->vp.code = (uint32_t *)code; /* const_cast */ blit->vp.code_size = sizeof(code); blit->vp.max_gpr = 4; @@ -687,24 +687,24 @@ nv50_blitter_make_fp(struct pipe_context *pipe, const unsigned target = nv50_blit_get_tgsi_texture_target(ptarg); - boolean tex_rgbaz = FALSE; - boolean tex_s = FALSE; - boolean cvt_un8 = FALSE; + bool tex_rgbaz = false; + bool tex_s = false; + bool cvt_un8 = false; if (mode != NV50_BLIT_MODE_PASS && mode != NV50_BLIT_MODE_Z24X8 && mode != NV50_BLIT_MODE_X8Z24) - tex_s = TRUE; + tex_s = true; if (mode != NV50_BLIT_MODE_X24S8 && mode != NV50_BLIT_MODE_S8X24 && mode != NV50_BLIT_MODE_XS) - tex_rgbaz = TRUE; + tex_rgbaz = true; if (mode != NV50_BLIT_MODE_PASS && mode != NV50_BLIT_MODE_ZS && mode != NV50_BLIT_MODE_XS) - cvt_un8 = TRUE; + cvt_un8 = true; ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT); if (!ureg) @@ -1271,7 +1271,7 @@ nv50_blit_eng2d(struct nv50_context *nv50, const struct pipe_blit_info *info) int i; uint32_t mode; uint32_t mask = nv50_blit_eng2d_get_mask(info); - boolean b; + bool b; mode = nv50_blit_get_filter(info) ? 
NV50_2D_BLIT_CONTROL_FILTER_BILINEAR : @@ -1410,7 +1410,7 @@ nv50_blit_eng2d(struct nv50_context *nv50, const struct pipe_blit_info *info) PUSH_DATA (push, srcy >> 32); } } - nv50_bufctx_fence(nv50->bufctx, FALSE); + nv50_bufctx_fence(nv50->bufctx, false); nouveau_bufctx_reset(nv50->bufctx, NV50_BIND_2D); @@ -1432,71 +1432,82 @@ static void nv50_blit(struct pipe_context *pipe, const struct pipe_blit_info *info) { struct nv50_context *nv50 = nv50_context(pipe); - boolean eng3d = FALSE; + struct nouveau_pushbuf *push = nv50->base.pushbuf; + bool eng3d = FALSE; if (util_format_is_depth_or_stencil(info->dst.resource->format)) { if (!(info->mask & PIPE_MASK_ZS)) return; if (info->dst.resource->format == PIPE_FORMAT_Z32_FLOAT || info->dst.resource->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) - eng3d = TRUE; + eng3d = true; if (info->filter != PIPE_TEX_FILTER_NEAREST) - eng3d = TRUE; + eng3d = true; } else { if (!(info->mask & PIPE_MASK_RGBA)) return; if (info->mask != PIPE_MASK_RGBA) - eng3d = TRUE; + eng3d = true; } if (nv50_miptree(info->src.resource)->layout_3d) { - eng3d = TRUE; + eng3d = true; } else if (info->src.box.depth != info->dst.box.depth) { - eng3d = TRUE; + eng3d = true; debug_printf("blit: cannot filter array or cube textures in z direction"); } if (!eng3d && info->dst.format != info->src.format) { if (!nv50_2d_dst_format_faithful(info->dst.format) || !nv50_2d_src_format_faithful(info->src.format)) { - eng3d = TRUE; + eng3d = true; } else if (!nv50_2d_src_format_faithful(info->src.format)) { if (!util_format_is_luminance(info->src.format)) { if (util_format_is_intensity(info->src.format)) - eng3d = TRUE; + eng3d = true; else if (!nv50_2d_dst_format_ops_supported(info->dst.format)) - eng3d = TRUE; + eng3d = true; else eng3d = !nv50_2d_format_supported(info->src.format); } } else if (util_format_is_luminance_alpha(info->src.format)) - eng3d = TRUE; + eng3d = true; } if (info->src.resource->nr_samples == 8 && info->dst.resource->nr_samples <= 1) - eng3d = TRUE; + eng3d = true; /* FIXME: can't make this work with eng2d anymore */ if ((info->src.resource->nr_samples | 1) != (info->dst.resource->nr_samples | 1)) - eng3d = TRUE; + eng3d = true; /* FIXME: find correct src coordinate adjustments */ if ((info->src.box.width != info->dst.box.width && info->src.box.width != -info->dst.box.width) || (info->src.box.height != info->dst.box.height && info->src.box.height != -info->dst.box.height)) - eng3d = TRUE; + eng3d = true; + + if (nv50->screen->num_occlusion_queries_active) { + BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1); + PUSH_DATA (push, 0); + } if (!eng3d) nv50_blit_eng2d(nv50, info); else nv50_blit_3d(nv50, info); + + if (nv50->screen->num_occlusion_queries_active) { + BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1); + PUSH_DATA (push, 1); + } } static void @@ -1505,13 +1516,13 @@ nv50_flush_resource(struct pipe_context *ctx, { } -boolean +bool nv50_blitter_create(struct nv50_screen *screen) { screen->blitter = CALLOC_STRUCT(nv50_blitter); if (!screen->blitter) { NOUVEAU_ERR("failed to allocate blitter struct\n"); - return FALSE; + return false; } pipe_mutex_init(screen->blitter->mutex); @@ -1519,7 +1530,7 @@ nv50_blitter_create(struct nv50_screen *screen) nv50_blitter_make_vp(screen->blitter); nv50_blitter_make_sampler(screen->blitter); - return TRUE; + return true; } void @@ -1542,20 +1553,20 @@ nv50_blitter_destroy(struct nv50_screen *screen) FREE(blitter); } -boolean +bool nv50_blitctx_create(struct nv50_context *nv50) { nv50->blit = CALLOC_STRUCT(nv50_blitctx); if (!nv50->blit) 
{ NOUVEAU_ERR("failed to allocate blit context\n"); - return FALSE; + return false; } nv50->blit->nv50 = nv50; nv50->blit->rast.pipe.half_pixel_center = 1; - return TRUE; + return true; } void diff --git a/src/gallium/drivers/nouveau/nv50/nv50_tex.c b/src/gallium/drivers/nouveau/nv50/nv50_tex.c index d69c8d6ff0d..fc6374d1b1b 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_tex.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_tex.c @@ -31,8 +31,8 @@ (NV50_TIC_0_MAPA__MASK | NV50_TIC_0_MAPB__MASK | \ NV50_TIC_0_MAPG__MASK | NV50_TIC_0_MAPR__MASK) -static INLINE uint32_t -nv50_tic_swizzle(uint32_t tc, unsigned swz, boolean tex_int) +static inline uint32_t +nv50_tic_swizzle(uint32_t tc, unsigned swz, bool tex_int) { switch (swz) { case PIPE_SWIZZLE_RED: @@ -71,6 +71,7 @@ nv50_create_texture_view(struct pipe_context *pipe, uint32_t flags, enum pipe_texture_target target) { + const uint32_t class_3d = nouveau_context(pipe)->screen->class_3d; const struct util_format_description *desc; uint64_t addr; uint32_t *tic; @@ -78,7 +79,7 @@ nv50_create_texture_view(struct pipe_context *pipe, uint32_t depth; struct nv50_tic_entry *view; struct nv50_miptree *mt = nv50_miptree(texture); - boolean tex_int; + bool tex_int; view = MALLOC_STRUCT(nv50_tic_entry); if (!view) @@ -192,7 +193,7 @@ nv50_create_texture_view(struct pipe_context *pipe, break; default: NOUVEAU_ERR("invalid texture target: %d\n", mt->base.base.target); - return FALSE; + return false; } tic[3] = (flags & NV50_TEXVIEW_FILTER_MSAA8) ? 0x20000000 : 0x00300000; @@ -201,11 +202,17 @@ nv50_create_texture_view(struct pipe_context *pipe, tic[5] = (mt->base.base.height0 << mt->ms_y) & 0xffff; tic[5] |= depth << 16; - tic[5] |= mt->base.base.last_level << NV50_TIC_5_LAST_LEVEL__SHIFT; + if (class_3d > NV50_3D_CLASS) + tic[5] |= mt->base.base.last_level << NV50_TIC_5_LAST_LEVEL__SHIFT; + else + tic[5] |= view->pipe.u.tex.last_level << NV50_TIC_5_LAST_LEVEL__SHIFT; tic[6] = (mt->ms_x > 1) ? 
0x88000000 : 0x03000000; /* sampling points */ - tic[7] = (view->pipe.u.tex.last_level << 4) | view->pipe.u.tex.first_level; + if (class_3d > NV50_3D_CLASS) + tic[7] = (view->pipe.u.tex.last_level << 4) | view->pipe.u.tex.first_level; + else + tic[7] = 0; if (unlikely(!(tic[2] & NV50_TIC_2_NORMALIZED_COORDS))) if (mt->base.base.last_level) @@ -214,13 +221,13 @@ nv50_create_texture_view(struct pipe_context *pipe, return &view->pipe; } -static boolean +static bool nv50_validate_tic(struct nv50_context *nv50, int s) { struct nouveau_pushbuf *push = nv50->base.pushbuf; struct nouveau_bo *txc = nv50->screen->txc; unsigned i; - boolean need_flush = FALSE; + bool need_flush = false; assert(nv50->num_textures[s] <= PIPE_MAX_SAMPLERS); for (i = 0; i < nv50->num_textures[s]; ++i) { @@ -263,7 +270,7 @@ nv50_validate_tic(struct nv50_context *nv50, int s) BEGIN_NI04(push, NV50_2D(SIFC_DATA), 8); PUSH_DATAp(push, &tic->tic[0], 8); - need_flush = TRUE; + need_flush = true; } else if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) { BEGIN_NV04(push, NV50_3D(TEX_CACHE_CTL), 1); @@ -309,7 +316,7 @@ nv50_validate_tic(struct nv50_context *nv50, int s) void nv50_validate_textures(struct nv50_context *nv50) { - boolean need_flush; + bool need_flush; need_flush = nv50_validate_tic(nv50, 0); need_flush |= nv50_validate_tic(nv50, 1); @@ -321,12 +328,12 @@ void nv50_validate_textures(struct nv50_context *nv50) } } -static boolean +static bool nv50_validate_tsc(struct nv50_context *nv50, int s) { struct nouveau_pushbuf *push = nv50->base.pushbuf; unsigned i; - boolean need_flush = FALSE; + bool need_flush = false; assert(nv50->num_samplers[s] <= PIPE_MAX_SAMPLERS); for (i = 0; i < nv50->num_samplers[s]; ++i) { @@ -343,7 +350,7 @@ nv50_validate_tsc(struct nv50_context *nv50, int s) nv50_sifc_linear_u8(&nv50->base, nv50->screen->txc, 65536 + tsc->id * 32, NOUVEAU_BO_VRAM, 32, tsc->tsc); - need_flush = TRUE; + need_flush = true; } nv50->screen->tsc.lock[tsc->id / 32] |= 1 << (tsc->id % 32); @@ -361,7 +368,7 @@ nv50_validate_tsc(struct nv50_context *nv50, int s) void nv50_validate_samplers(struct nv50_context *nv50) { - boolean need_flush; + bool need_flush; need_flush = nv50_validate_tsc(nv50, 0); need_flush |= nv50_validate_tsc(nv50, 1); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c index 1fd33b8aa59..6324726acec 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c @@ -58,7 +58,7 @@ nv50_vertex_state_create(struct pipe_context *pipe, so->num_elements = num_elements; so->instance_elts = 0; so->instance_bufs = 0; - so->need_conversion = FALSE; + so->need_conversion = false; memset(so->vb_access_size, 0, sizeof(so->vb_access_size)); @@ -89,7 +89,7 @@ nv50_vertex_state_create(struct pipe_context *pipe, return NULL; } so->element[i].state = nv50_format_table[fmt].vtx; - so->need_conversion = TRUE; + so->need_conversion = true; } so->element[i].state |= i; @@ -188,7 +188,7 @@ nv50_emit_vtxattr(struct nv50_context *nv50, struct pipe_vertex_buffer *vb, } } -static INLINE void +static inline void nv50_user_vbuf_range(struct nv50_context *nv50, unsigned vbi, uint32_t *base, uint32_t *size) { @@ -229,7 +229,7 @@ nv50_upload_user_buffers(struct nv50_context *nv50, BCTX_REFN_bo(nv50->bufctx_3d, VERTEX_TMP, NOUVEAU_BO_GART | NOUVEAU_BO_RD, bo); } - nv50->base.vbo_dirty = TRUE; + nv50->base.vbo_dirty = true; } static void @@ -275,10 +275,10 @@ nv50_update_user_vbufs(struct nv50_context *nv50) PUSH_DATAh(push, address[b] + 
ve->src_offset); PUSH_DATA (push, address[b] + ve->src_offset); } - nv50->base.vbo_dirty = TRUE; + nv50->base.vbo_dirty = true; } -static INLINE void +static inline void nv50_release_user_vbufs(struct nv50_context *nv50) { if (nv50->vbo_user) { @@ -316,7 +316,7 @@ nv50_vertex_arrays_validate(struct nv50_context *nv50) struct nv04_resource *buf = nv04_resource(nv50->vtxbuf[i].buffer); if (buf && buf->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) { buf->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING; - nv50->base.vbo_dirty = TRUE; + nv50->base.vbo_dirty = true; break; } } @@ -382,6 +382,11 @@ nv50_vertex_arrays_validate(struct nv50_context *nv50) if (nv50->vbo_user & (1 << b)) { address = addrs[b] + ve->pipe.src_offset; limit = addrs[b] + limits[b]; + } else + if (!vb->buffer) { + BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FETCH(i)), 1); + PUSH_DATA (push, 0); + continue; } else { struct nv04_resource *buf = nv04_resource(vb->buffer); if (!(refd & (1 << b))) { @@ -418,7 +423,7 @@ nv50_vertex_arrays_validate(struct nv50_context *nv50) #define NV50_PRIM_GL_CASE(n) \ case PIPE_PRIM_##n: return NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n -static INLINE unsigned +static inline unsigned nv50_prim_gl(unsigned prim) { switch (prim) { @@ -585,7 +590,7 @@ nv50_draw_elements_inline_u32_short(struct nouveau_pushbuf *push, } static void -nv50_draw_elements(struct nv50_context *nv50, boolean shorten, +nv50_draw_elements(struct nv50_context *nv50, bool shorten, unsigned mode, unsigned start, unsigned count, unsigned instance_count, int32_t index_bias) { @@ -746,9 +751,9 @@ nv50_draw_vbo_kick_notify(struct nouveau_pushbuf *chan) { struct nv50_screen *screen = chan->user_priv; - nouveau_fence_update(&screen->base, TRUE); + nouveau_fence_update(&screen->base, true); - nv50_bufctx_fence(screen->cur_ctx->bufctx_3d, TRUE); + nv50_bufctx_fence(screen->cur_ctx->bufctx_3d, true); } void @@ -801,7 +806,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) continue; if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) - nv50->cb_dirty = TRUE; + nv50->cb_dirty = true; } } @@ -809,7 +814,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) if (nv50->cb_dirty) { BEGIN_NV04(push, NV50_3D(CODE_CB_FLUSH), 1); PUSH_DATA (push, 0); - nv50->cb_dirty = FALSE; + nv50->cb_dirty = false; } if (nv50->vbo_fifo) { @@ -830,21 +835,21 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) if (!nv50->vtxbuf[i].buffer) continue; if (nv50->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) - nv50->base.vbo_dirty = TRUE; + nv50->base.vbo_dirty = true; } if (!nv50->base.vbo_dirty && nv50->idxbuf.buffer && nv50->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) - nv50->base.vbo_dirty = TRUE; + nv50->base.vbo_dirty = true; if (nv50->base.vbo_dirty) { BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FLUSH), 1); PUSH_DATA (push, 0); - nv50->base.vbo_dirty = FALSE; + nv50->base.vbo_dirty = false; } if (info->indexed) { - boolean shorten = info->max_index <= 65535; + bool shorten = info->max_index <= 65535; if (info->primitive_restart != nv50->state.prim_restart) { if (info->primitive_restart) { @@ -853,7 +858,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) PUSH_DATA (push, info->restart_index); if (info->restart_index > 65535) - shorten = FALSE; + shorten = false; } else { BEGIN_NV04(push, NV50_3D(PRIM_RESTART_ENABLE), 1); PUSH_DATA (push, 0); @@ -865,7 +870,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) PUSH_DATA 
(push, info->restart_index); if (info->restart_index > 65535) - shorten = FALSE; + shorten = false; } nv50_draw_elements(nv50, shorten, diff --git a/src/gallium/drivers/nouveau/nv50/nv50_winsys.h b/src/gallium/drivers/nouveau/nv50/nv50_winsys.h index e8578c8be6f..76f1b41ea70 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_winsys.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_winsys.h @@ -16,14 +16,14 @@ #endif -static INLINE void +static inline void nv50_add_bufctx_resident_bo(struct nouveau_bufctx *bufctx, int bin, unsigned flags, struct nouveau_bo *bo) { nouveau_bufctx_refn(bufctx, bin, bo, flags)->priv = NULL; } -static INLINE void +static inline void nv50_add_bufctx_resident(struct nouveau_bufctx *bufctx, int bin, struct nv04_resource *res, unsigned flags) { @@ -39,7 +39,7 @@ nv50_add_bufctx_resident(struct nouveau_bufctx *bufctx, int bin, #define BCTX_REFN(bctx, bin, res, acc) \ nv50_add_bufctx_resident(bctx, NV50_BIND_##bin, res, NOUVEAU_BO_##acc) -static INLINE void +static inline void PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags) { struct nouveau_pushbuf_refn ref = { bo, flags }; @@ -61,39 +61,39 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags) #define NV50_COMPUTE(n) SUBC_COMPUTE(NV50_COMPUTE_##n) -static INLINE uint32_t +static inline uint32_t NV50_FIFO_PKHDR(int subc, int mthd, unsigned size) { return 0x00000000 | (size << 18) | (subc << 13) | mthd; } -static INLINE uint32_t +static inline uint32_t NV50_FIFO_PKHDR_NI(int subc, int mthd, unsigned size) { return 0x40000000 | (size << 18) | (subc << 13) | mthd; } -static INLINE uint32_t +static inline uint32_t NV50_FIFO_PKHDR_L(int subc, int mthd) { return 0x00030000 | (subc << 13) | mthd; } -static INLINE uint32_t +static inline uint32_t nouveau_bo_memtype(const struct nouveau_bo *bo) { return bo->config.nv50.memtype; } -static INLINE void +static inline void PUSH_DATAh(struct nouveau_pushbuf *push, uint64_t data) { *push->cur++ = (uint32_t)(data >> 32); } -static INLINE void +static inline void BEGIN_NV04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) { #ifndef NV50_PUSH_EXPLICIT_SPACE_CHECKING @@ -102,7 +102,7 @@ BEGIN_NV04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) PUSH_DATA (push, NV50_FIFO_PKHDR(subc, mthd, size)); } -static INLINE void +static inline void BEGIN_NI04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) { #ifndef NV50_PUSH_EXPLICIT_SPACE_CHECKING @@ -112,7 +112,7 @@ BEGIN_NI04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) } /* long, non-incremental, nv50-only */ -static INLINE void +static inline void BEGIN_NL50(struct nouveau_pushbuf *push, int subc, int mthd, uint32_t size) { #ifndef NV50_PUSH_EXPLICIT_SPACE_CHECKING diff --git a/src/gallium/drivers/nouveau/nv50/nv84_video.h b/src/gallium/drivers/nouveau/nv50/nv84_video.h index 2edba389dbf..09773c12974 100644 --- a/src/gallium/drivers/nouveau/nv50/nv84_video.h +++ b/src/gallium/drivers/nouveau/nv50/nv84_video.h @@ -102,12 +102,12 @@ struct nv84_decoder { uint8_t mpeg12_non_intra_matrix[64]; }; -static INLINE uint32_t mb(uint32_t coord) +static inline uint32_t mb(uint32_t coord) { return (coord + 0xf)>>4; } -static INLINE uint32_t mb_half(uint32_t coord) +static inline uint32_t mb_half(uint32_t coord) { return (coord + 0x1f)>>5; } diff --git a/src/gallium/drivers/nouveau/nv50/nv84_video_vp.c b/src/gallium/drivers/nouveau/nv50/nv84_video_vp.c index f3480b2e00e..8b121477a37 100644 --- 
a/src/gallium/drivers/nouveau/nv50/nv84_video_vp.c +++ b/src/gallium/drivers/nouveau/nv50/nv84_video_vp.c @@ -221,7 +221,7 @@ nv84_decoder_vp_h264(struct nv84_decoder *dec, PUSH_KICK (push); } -static INLINE int16_t inverse_quantize(int16_t val, uint8_t quant, int mpeg1) { +static inline int16_t inverse_quantize(int16_t val, uint8_t quant, int mpeg1) { int16_t ret = val * quant / 16; if (mpeg1 && ret) { if (ret > 0) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c index 56fc83d3679..47bd123621b 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c @@ -121,51 +121,51 @@ nvc0_screen_compute_setup(struct nvc0_screen *screen, return 0; } -boolean +bool nvc0_compute_validate_program(struct nvc0_context *nvc0) { struct nvc0_program *prog = nvc0->compprog; if (prog->mem) - return TRUE; + return true; if (!prog->translated) { prog->translated = nvc0_program_translate( prog, nvc0->screen->base.device->chipset); if (!prog->translated) - return FALSE; + return false; } if (unlikely(!prog->code_size)) - return FALSE; + return false; if (likely(prog->code_size)) { if (nvc0_program_upload_code(nvc0, prog)) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; BEGIN_NVC0(push, NVC0_COMPUTE(FLUSH), 1); PUSH_DATA (push, NVC0_COMPUTE_FLUSH_CODE); - return TRUE; + return true; } } - return FALSE; + return false; } -static boolean +static bool nvc0_compute_state_validate(struct nvc0_context *nvc0) { if (!nvc0_compute_validate_program(nvc0)) - return FALSE; + return false; /* TODO: textures, samplers, surfaces, global memory buffers */ - nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, FALSE); + nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, false); nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_cp); if (unlikely(nouveau_pushbuf_validate(nvc0->base.pushbuf))) - return FALSE; + return false; if (unlikely(nvc0->state.flushed)) - nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, TRUE); + nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true); - return TRUE; + return true; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.h b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.h index 9a1a71760d7..168a6d1bee2 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.h @@ -4,7 +4,7 @@ #include "nv50/nv50_defs.xml.h" #include "nvc0/nvc0_compute.xml.h" -boolean +bool nvc0_compute_validate_program(struct nvc0_context *nvc0); #endif /* NVC0_COMPUTE_H */ diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c index a35c3f66142..84f8db6a8ac 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c @@ -63,12 +63,12 @@ nvc0_memory_barrier(struct pipe_context *pipe, unsigned flags) if (!nvc0->vtxbuf[i].buffer) continue; if (nvc0->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) - nvc0->base.vbo_dirty = TRUE; + nvc0->base.vbo_dirty = true; } if (nvc0->idxbuf.buffer && nvc0->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) - nvc0->base.vbo_dirty = TRUE; + nvc0->base.vbo_dirty = true; for (s = 0; s < 5 && !nvc0->cb_dirty; ++s) { uint32_t valid = nvc0->constbuf_valid[s]; @@ -86,7 +86,7 @@ nvc0_memory_barrier(struct pipe_context *pipe, unsigned flags) continue; if (res->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) - nvc0->cb_dirty = TRUE; + nvc0->cb_dirty = true; } } } @@ -164,9 +164,9 @@ nvc0_default_kick_notify(struct nouveau_pushbuf *push) 
if (screen) { nouveau_fence_next(&screen->base); - nouveau_fence_update(&screen->base, TRUE); + nouveau_fence_update(&screen->base, true); if (screen->cur_ctx) - screen->cur_ctx->state.flushed = TRUE; + screen->cur_ctx->state.flushed = true; NOUVEAU_DRV_STAT(&screen->base, pushbuf_count, 1); } } @@ -378,7 +378,7 @@ out_err: void nvc0_bufctx_fence(struct nvc0_context *nvc0, struct nouveau_bufctx *bufctx, - boolean on_flush) + bool on_flush) { struct nouveau_list *list = on_flush ? &bufctx->current : &bufctx->pending; struct nouveau_list *it; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h index a8d7593b398..f4499423a10 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h @@ -54,6 +54,7 @@ #define NVC0_NEW_IDXBUF (1 << 22) #define NVC0_NEW_SURFACES (1 << 23) #define NVC0_NEW_MIN_SAMPLES (1 << 24) +#define NVC0_NEW_TESSFACTOR (1 << 25) #define NVC0_NEW_CP_PROGRAM (1 << 0) #define NVC0_NEW_CP_SURFACES (1 << 1) @@ -93,7 +94,7 @@ struct nvc0_blitctx; -boolean nvc0_blitctx_create(struct nvc0_context *); +bool nvc0_blitctx_create(struct nvc0_context *); void nvc0_blitctx_destroy(struct nvc0_context *); struct nvc0_context { @@ -130,7 +131,7 @@ struct nvc0_context { struct nvc0_constbuf constbuf[6][NVC0_MAX_PIPE_CONSTBUFS]; uint16_t constbuf_dirty[6]; uint16_t constbuf_valid[6]; - boolean cb_dirty; + bool cb_dirty; struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS]; unsigned num_vtxbufs; @@ -164,14 +165,17 @@ struct nvc0_context { unsigned sample_mask; unsigned min_samples; - boolean vbo_push_hint; + float default_tess_outer[4]; + float default_tess_inner[2]; + + bool vbo_push_hint; uint8_t tfbbuf_dirty; struct pipe_stream_output_target *tfbbuf[4]; unsigned num_tfbbufs; struct pipe_query *cond_query; - boolean cond_cond; /* inverted rendering condition */ + bool cond_cond; /* inverted rendering condition */ uint cond_mode; uint32_t cond_condmode; /* the calculated condition */ @@ -184,19 +188,19 @@ struct nvc0_context { struct util_dynarray global_residents; }; -static INLINE struct nvc0_context * +static inline struct nvc0_context * nvc0_context(struct pipe_context *pipe) { return (struct nvc0_context *)pipe; } -static INLINE unsigned +static inline unsigned nvc0_shader_stage(unsigned pipe) { switch (pipe) { case PIPE_SHADER_VERTEX: return 0; -/* case PIPE_SHADER_TESSELLATION_CONTROL: return 1; */ -/* case PIPE_SHADER_TESSELLATION_EVALUATION: return 2; */ + case PIPE_SHADER_TESS_CTRL: return 1; + case PIPE_SHADER_TESS_EVAL: return 2; case PIPE_SHADER_GEOMETRY: return 3; case PIPE_SHADER_FRAGMENT: return 4; case PIPE_SHADER_COMPUTE: return 5; @@ -210,15 +214,15 @@ nvc0_shader_stage(unsigned pipe) /* nvc0_context.c */ struct pipe_context *nvc0_create(struct pipe_screen *, void *); void nvc0_bufctx_fence(struct nvc0_context *, struct nouveau_bufctx *, - boolean on_flush); + bool on_flush); void nvc0_default_kick_notify(struct nouveau_pushbuf *); /* nvc0_draw.c */ extern struct draw_stage *nvc0_draw_render_stage(struct nvc0_context *); /* nvc0_program.c */ -boolean nvc0_program_translate(struct nvc0_program *, uint16_t chipset); -boolean nvc0_program_upload_code(struct nvc0_context *, struct nvc0_program *); +bool nvc0_program_translate(struct nvc0_program *, uint16_t chipset); +bool nvc0_program_upload_code(struct nvc0_context *, struct nvc0_program *); void nvc0_program_destroy(struct nvc0_context *, struct nvc0_program *); void nvc0_program_library_upload(struct nvc0_context 
*); uint32_t nvc0_program_symbol_offset(const struct nvc0_program *, @@ -231,7 +235,7 @@ void nvc0_query_pushbuf_submit(struct nouveau_pushbuf *, void nvc0_query_fifo_wait(struct nouveau_pushbuf *, struct pipe_query *); void nvc0_so_target_save_offset(struct pipe_context *, struct pipe_stream_output_target *, unsigned i, - boolean *serialize); + bool *serialize); #define NVC0_QUERY_TFB_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0) @@ -250,8 +254,8 @@ extern void nvc0_init_state_functions(struct nvc0_context *); /* nvc0_state_validate.c */ void nvc0_validate_global_residents(struct nvc0_context *, struct nouveau_bufctx *, int bin); -extern boolean nvc0_state_validate(struct nvc0_context *, uint32_t state_mask, - unsigned space_words); +extern bool nvc0_state_validate(struct nvc0_context *, uint32_t state_mask, + unsigned space_words); /* nvc0_surface.c */ extern void nvc0_clear(struct pipe_context *, unsigned buffers, @@ -260,7 +264,7 @@ extern void nvc0_clear(struct pipe_context *, unsigned buffers, extern void nvc0_init_surface_functions(struct nvc0_context *); /* nvc0_tex.c */ -boolean nve4_validate_tsc(struct nvc0_context *nvc0, int s); +bool nve4_validate_tsc(struct nvc0_context *nvc0, int s); void nvc0_validate_textures(struct nvc0_context *); void nvc0_validate_samplers(struct nvc0_context *); void nve4_set_tex_handles(struct nvc0_context *); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c index 3875bbf4ca4..15991c3d2bd 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c @@ -29,13 +29,13 @@ #include "nvc0/nvc0_resource.h" static uint32_t -nvc0_tex_choose_tile_dims(unsigned nx, unsigned ny, unsigned nz, boolean is_3d) +nvc0_tex_choose_tile_dims(unsigned nx, unsigned ny, unsigned nz, bool is_3d) { return nv50_tex_choose_tile_dims_helper(nx, ny, nz, is_3d); } static uint32_t -nvc0_mt_choose_storage_type(struct nv50_miptree *mt, boolean compressed) +nvc0_mt_choose_storage_type(struct nv50_miptree *mt, bool compressed) { const unsigned ms = util_logbase2(mt->base.base.nr_samples); @@ -133,7 +133,7 @@ nvc0_mt_choose_storage_type(struct nv50_miptree *mt, boolean compressed) return tile_flags; } -static INLINE boolean +static inline bool nvc0_miptree_init_ms_mode(struct nv50_miptree *mt) { switch (mt->base.base.nr_samples) { @@ -157,9 +157,9 @@ nvc0_miptree_init_ms_mode(struct nv50_miptree *mt) break; default: NOUVEAU_ERR("invalid nr_samples: %u\n", mt->base.base.nr_samples); - return FALSE; + return false; } - return TRUE; + return true; } static void @@ -250,7 +250,7 @@ nvc0_miptree_create(struct pipe_screen *pscreen, struct nouveau_device *dev = nouveau_screen(pscreen)->device; struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree); struct pipe_resource *pt = &mt->base.base; - boolean compressed = dev->drm_version >= 0x01000101; + bool compressed = dev->drm_version >= 0x01000101; int ret; union nouveau_bo_config bo_config; uint32_t bo_flags; @@ -325,7 +325,7 @@ nvc0_miptree_create(struct pipe_screen *pscreen, } /* Offset of zslice @z from start of level @l. 
*/ -INLINE unsigned +inline unsigned nvc0_mt_zslice_offset(const struct nv50_miptree *mt, unsigned l, unsigned z) { const struct pipe_resource *pt = &mt->base.base; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c index e1f5a8c4416..507a2507fe3 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c @@ -31,24 +31,25 @@ * 124 scalar varying values. */ static uint32_t -nvc0_shader_input_address(unsigned sn, unsigned si, unsigned ubase) +nvc0_shader_input_address(unsigned sn, unsigned si) { switch (sn) { - case NV50_SEMANTIC_TESSFACTOR: return 0x000 + si * 0x4; + case TGSI_SEMANTIC_TESSOUTER: return 0x000 + si * 0x4; + case TGSI_SEMANTIC_TESSINNER: return 0x010 + si * 0x4; + case TGSI_SEMANTIC_PATCH: return 0x020 + si * 0x10; case TGSI_SEMANTIC_PRIMID: return 0x060; case TGSI_SEMANTIC_LAYER: return 0x064; case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068; case TGSI_SEMANTIC_PSIZE: return 0x06c; case TGSI_SEMANTIC_POSITION: return 0x070; - case TGSI_SEMANTIC_GENERIC: return ubase + si * 0x10; + case TGSI_SEMANTIC_GENERIC: return 0x080 + si * 0x10; case TGSI_SEMANTIC_FOG: return 0x2e8; case TGSI_SEMANTIC_COLOR: return 0x280 + si * 0x10; case TGSI_SEMANTIC_BCOLOR: return 0x2a0 + si * 0x10; - case NV50_SEMANTIC_CLIPDISTANCE: return 0x2c0 + si * 0x4; case TGSI_SEMANTIC_CLIPDIST: return 0x2c0 + si * 0x10; case TGSI_SEMANTIC_CLIPVERTEX: return 0x270; case TGSI_SEMANTIC_PCOORD: return 0x2e0; - case NV50_SEMANTIC_TESSCOORD: return 0x2f0; + case TGSI_SEMANTIC_TESSCOORD: return 0x2f0; case TGSI_SEMANTIC_INSTANCEID: return 0x2f8; case TGSI_SEMANTIC_VERTEXID: return 0x2fc; case TGSI_SEMANTIC_TEXCOORD: return 0x300 + si * 0x10; @@ -60,20 +61,21 @@ nvc0_shader_input_address(unsigned sn, unsigned si, unsigned ubase) } static uint32_t -nvc0_shader_output_address(unsigned sn, unsigned si, unsigned ubase) +nvc0_shader_output_address(unsigned sn, unsigned si) { switch (sn) { - case NV50_SEMANTIC_TESSFACTOR: return 0x000 + si * 0x4; + case TGSI_SEMANTIC_TESSOUTER: return 0x000 + si * 0x4; + case TGSI_SEMANTIC_TESSINNER: return 0x010 + si * 0x4; + case TGSI_SEMANTIC_PATCH: return 0x020 + si * 0x10; case TGSI_SEMANTIC_PRIMID: return 0x060; case TGSI_SEMANTIC_LAYER: return 0x064; case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068; case TGSI_SEMANTIC_PSIZE: return 0x06c; case TGSI_SEMANTIC_POSITION: return 0x070; - case TGSI_SEMANTIC_GENERIC: return ubase + si * 0x10; + case TGSI_SEMANTIC_GENERIC: return 0x080 + si * 0x10; case TGSI_SEMANTIC_FOG: return 0x2e8; case TGSI_SEMANTIC_COLOR: return 0x280 + si * 0x10; case TGSI_SEMANTIC_BCOLOR: return 0x2a0 + si * 0x10; - case NV50_SEMANTIC_CLIPDISTANCE: return 0x2c0 + si * 0x4; case TGSI_SEMANTIC_CLIPDIST: return 0x2c0 + si * 0x10; case TGSI_SEMANTIC_CLIPVERTEX: return 0x270; case TGSI_SEMANTIC_TEXCOORD: return 0x300 + si * 0x10; @@ -95,7 +97,7 @@ nvc0_vp_assign_input_slots(struct nv50_ir_prog_info *info) case TGSI_SEMANTIC_VERTEXID: info->in[i].mask = 0x1; info->in[i].slot[0] = - nvc0_shader_input_address(info->in[i].sn, 0, 0) / 4; + nvc0_shader_input_address(info->in[i].sn, 0) / 4; continue; default: break; @@ -111,18 +113,11 @@ nvc0_vp_assign_input_slots(struct nv50_ir_prog_info *info) static int nvc0_sp_assign_input_slots(struct nv50_ir_prog_info *info) { - unsigned ubase = MAX2(0x80, 0x20 + info->numPatchConstants * 0x10); unsigned offset; unsigned i, c; for (i = 0; i < info->numInputs; ++i) { - offset = nvc0_shader_input_address(info->in[i].sn, - 
info->in[i].si, ubase); - if (info->in[i].patch && offset >= 0x20) - offset = 0x20 + info->in[i].si * 0x10; - - if (info->in[i].sn == NV50_SEMANTIC_TESSCOORD) - info->in[i].mask &= 3; + offset = nvc0_shader_input_address(info->in[i].sn, info->in[i].si); for (c = 0; c < 4; ++c) info->in[i].slot[c] = (offset + c * 0x4) / 4; @@ -157,15 +152,11 @@ nvc0_fp_assign_output_slots(struct nv50_ir_prog_info *info) static int nvc0_sp_assign_output_slots(struct nv50_ir_prog_info *info) { - unsigned ubase = MAX2(0x80, 0x20 + info->numPatchConstants * 0x10); unsigned offset; unsigned i, c; for (i = 0; i < info->numOutputs; ++i) { - offset = nvc0_shader_output_address(info->out[i].sn, - info->out[i].si, ubase); - if (info->out[i].patch && offset >= 0x20) - offset = 0x20 + info->out[i].si * 0x10; + offset = nvc0_shader_output_address(info->out[i].sn, info->out[i].si); for (c = 0; c < 4; ++c) info->out[i].slot[c] = (offset + c * 0x4) / 4; @@ -193,7 +184,7 @@ nvc0_program_assign_varying_slots(struct nv50_ir_prog_info *info) return ret; } -static INLINE void +static inline void nvc0_vtgp_hdr_update_oread(struct nvc0_program *vp, uint8_t slot) { uint8_t min = (vp->hdr[4] >> 12) & 0xff; @@ -216,12 +207,8 @@ nvc0_vtgp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info) continue; for (c = 0; c < 4; ++c) { a = info->in[i].slot[c]; - if (info->in[i].mask & (1 << c)) { - if (info->in[i].sn != NV50_SEMANTIC_TESSCOORD) - vp->hdr[5 + a / 32] |= 1 << (a % 32); - else - nvc0_vtgp_hdr_update_oread(vp, info->in[i].slot[c]); - } + if (info->in[i].mask & (1 << c)) + vp->hdr[5 + a / 32] |= 1 << (a % 32); } } @@ -250,6 +237,14 @@ nvc0_vtgp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info) case TGSI_SEMANTIC_VERTEXID: vp->hdr[10] |= 1 << 31; break; + case TGSI_SEMANTIC_TESSCOORD: + /* We don't have the mask, nor the slots populated. While this could + * be achieved, the vast majority of the time if either of the coords + * are read, then both will be read. 
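+ * Both 32-bit halves of the TESSCOORD slot (0x2f0 and 0x2f4) are
+ * therefore marked as read below.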
+ */ + nvc0_vtgp_hdr_update_oread(vp, 0x2f0 / 4); + nvc0_vtgp_hdr_update_oread(vp, 0x2f4 / 4); + break; default: break; } @@ -277,7 +272,6 @@ nvc0_vp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info) return nvc0_vtgp_gen_header(vp, info); } -#if defined(PIPE_SHADER_HULL) || defined(PIPE_SHADER_DOMAIN) static void nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info) { @@ -305,14 +299,13 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info) tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CONNECTED; switch (info->prop.tp.partitioning) { - case PIPE_TESS_PART_INTEGER: - case PIPE_TESS_PART_POW2: + case PIPE_TESS_SPACING_EQUAL: tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_EQUAL; break; - case PIPE_TESS_PART_FRACT_ODD: + case PIPE_TESS_SPACING_FRACTIONAL_ODD: tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_ODD; break; - case PIPE_TESS_PART_FRACT_EVEN: + case PIPE_TESS_SPACING_FRACTIONAL_EVEN: tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_EVEN; break; default: @@ -320,9 +313,7 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info) break; } } -#endif -#ifdef PIPE_SHADER_HULL static int nvc0_tcp_gen_header(struct nvc0_program *tcp, struct nv50_ir_prog_info *info) { @@ -346,9 +337,7 @@ nvc0_tcp_gen_header(struct nvc0_program *tcp, struct nv50_ir_prog_info *info) return 0; } -#endif -#ifdef PIPE_SHADER_DOMAIN static int nvc0_tep_gen_header(struct nvc0_program *tep, struct nv50_ir_prog_info *info) { @@ -365,7 +354,6 @@ nvc0_tep_gen_header(struct nvc0_program *tep, struct nv50_ir_prog_info *info) return 0; } -#endif static int nvc0_gp_gen_header(struct nvc0_program *gp, struct nv50_ir_prog_info *info) @@ -523,7 +511,7 @@ nvc0_program_dump(struct nvc0_program *prog) } #endif -boolean +bool nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset) { struct nv50_ir_prog_info *info; @@ -531,7 +519,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset) info = CALLOC_STRUCT(nv50_ir_prog_info); if (!info) - return FALSE; + return false; info->type = prog->type; info->target = chipset; @@ -598,16 +586,12 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset) case PIPE_SHADER_VERTEX: ret = nvc0_vp_gen_header(prog, info); break; -#ifdef PIPE_SHADER_HULL - case PIPE_SHADER_HULL: + case PIPE_SHADER_TESS_CTRL: ret = nvc0_tcp_gen_header(prog, info); break; -#endif -#ifdef PIPE_SHADER_DOMAIN - case PIPE_SHADER_DOMAIN: + case PIPE_SHADER_TESS_EVAL: ret = nvc0_tep_gen_header(prog, info); break; -#endif case PIPE_SHADER_GEOMETRY: ret = nvc0_gp_gen_header(prog, info); break; @@ -630,7 +614,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset) assert(info->bin.tlsSpace < (1 << 24)); prog->hdr[0] |= 1 << 26; prog->hdr[1] |= align(info->bin.tlsSpace, 0x10); /* l[] size */ - prog->need_tls = TRUE; + prog->need_tls = true; } /* TODO: factor 2 only needed where joinat/precont is used, * and we only have to count non-uniform branches @@ -638,7 +622,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset) /* if ((info->maxCFDepth * 2) > 16) { prog->hdr[2] |= (((info->maxCFDepth * 2) + 47) / 48) * 0x200; - prog->need_tls = TRUE; + prog->need_tls = true; } */ if (info->io.globalAccess) @@ -655,11 +639,11 @@ out: return !ret; } -boolean +bool nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog) { struct nvc0_screen *screen = nvc0->screen; - const boolean is_cp = prog->type == PIPE_SHADER_COMPUTE; + const bool is_cp = 
prog->type == PIPE_SHADER_COMPUTE; int ret; uint32_t size = prog->code_size + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE); uint32_t lib_pos = screen->lib_code->start; @@ -694,7 +678,7 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog) ret = nouveau_heap_alloc(heap, size, prog, &prog->mem); if (ret) { NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size); - return FALSE; + return false; } IMMED_NVC0(nvc0->base.pushbuf, NVC0_3D(SERIALIZE), 0); } @@ -729,7 +713,7 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog) nv50_ir_relocate_code(prog->relocs, prog->code, code_pos, lib_pos, 0); #ifdef DEBUG - if (debug_get_bool_option("NV50_PROG_DEBUG", FALSE)) + if (debug_get_bool_option("NV50_PROG_DEBUG", false)) nvc0_program_dump(prog); #endif @@ -746,7 +730,7 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog) BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(MEM_BARRIER), 1); PUSH_DATA (nvc0->base.pushbuf, 0x1011); - return TRUE; + return true; } /* Upload code for builtin functions like integer division emulation. */ diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h index 3fd9d21b4c4..390e0c7a4f0 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h @@ -21,8 +21,8 @@ struct nvc0_program { struct pipe_shader_state pipe; ubyte type; - boolean translated; - boolean need_tls; + bool translated; + bool need_tls; uint8_t num_gprs; uint32_t *code; @@ -41,7 +41,7 @@ struct nvc0_program { uint8_t clip_enable; /* mask of defined clip planes */ uint8_t num_ucps; /* also set to max if ClipDistance is used */ uint8_t edgeflag; /* attribute index of edgeflag input */ - boolean need_vertex_id; + bool need_vertex_id; } vp; struct { uint8_t early_z; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c index aea6cbda02d..f7b85a8e931 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c @@ -44,7 +44,7 @@ struct nvc0_query { uint32_t base; uint32_t offset; /* base + i * rotate */ uint8_t state; - boolean is64bit; + bool is64bit; uint8_t rotate; int nesting; /* only used for occlusion queries */ union { @@ -62,13 +62,13 @@ static void nvc0_mp_pm_query_end(struct nvc0_context *, struct nvc0_query *); static boolean nvc0_mp_pm_query_result(struct nvc0_context *, struct nvc0_query *, void *, boolean); -static INLINE struct nvc0_query * +static inline struct nvc0_query * nvc0_query(struct pipe_query *pipe) { return (struct nvc0_query *)pipe; } -static boolean +static bool nvc0_query_allocate(struct nvc0_context *nvc0, struct nvc0_query *q, int size) { struct nvc0_screen *screen = nvc0->screen; @@ -87,17 +87,17 @@ nvc0_query_allocate(struct nvc0_context *nvc0, struct nvc0_query *q, int size) if (size) { q->u.mm = nouveau_mm_allocate(screen->base.mm_GART, size, &q->bo, &q->base); if (!q->bo) - return FALSE; + return false; q->offset = q->base; ret = nouveau_bo_map(q->bo, 0, screen->base.client); if (ret) { nvc0_query_allocate(nvc0, q, 0); - return FALSE; + return false; } q->data = (uint32_t *)((uint8_t *)q->bo->map + q->base); } - return TRUE; + return true; } static void @@ -126,17 +126,17 @@ nvc0_query_create(struct pipe_context *pipe, unsigned type, unsigned index) space = NVC0_QUERY_ALLOC_SPACE; break; case PIPE_QUERY_PIPELINE_STATISTICS: - q->is64bit = TRUE; + q->is64bit = true; space = 512; break; 
case PIPE_QUERY_SO_STATISTICS: case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - q->is64bit = TRUE; + q->is64bit = true; space = 64; break; case PIPE_QUERY_PRIMITIVES_GENERATED: case PIPE_QUERY_PRIMITIVES_EMITTED: - q->is64bit = TRUE; + q->is64bit = true; q->index = index; space = 32; break; @@ -257,11 +257,11 @@ nvc0_query_begin(struct pipe_context *pipe, struct pipe_query *pq) struct nvc0_context *nvc0 = nvc0_context(pipe); struct nouveau_pushbuf *push = nvc0->base.pushbuf; struct nvc0_query *q = nvc0_query(pq); - boolean ret = true; + bool ret = true; /* For occlusion queries we have to change the storage, because a previous - * query might set the initial render conition to FALSE even *after* we re- - * initialized it to TRUE. + * query might set the initial render conition to false even *after* we re- + * initialized it to true. */ if (q->rotate) { nvc0_query_rotate(nvc0, q); @@ -270,7 +270,7 @@ nvc0_query_begin(struct pipe_context *pipe, struct pipe_query *pq) * query ? */ q->data[0] = q->sequence; /* initialize sequence */ - q->data[1] = 1; /* initial render condition = TRUE */ + q->data[1] = 1; /* initial render condition = true */ q->data[4] = q->sequence + 1; /* for comparison COND_MODE */ q->data[5] = 0; } @@ -401,7 +401,7 @@ nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq) nvc0_query_get(push, q, 0x00, 0x0d005002 | (q->index << 5)); break; case PIPE_QUERY_TIMESTAMP_DISJOINT: - /* This query is not issued on GPU because disjoint is forced to FALSE */ + /* This query is not issued on GPU because disjoint is forced to false */ q->state = NVC0_QUERY_STATE_READY; break; default: @@ -422,7 +422,7 @@ nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq) nouveau_fence_ref(nvc0->screen->base.fence.current, &q->fence); } -static INLINE void +static inline void nvc0_query_update(struct nouveau_client *cli, struct nvc0_query *q) { if (q->is64bit) { @@ -442,7 +442,7 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq, struct nvc0_query *q = nvc0_query(pq); uint64_t *res64 = (uint64_t*)result; uint32_t *res32 = (uint32_t*)result; - boolean *res8 = (boolean*)result; + uint8_t *res8 = (uint8_t*)result; uint64_t *data64 = (uint64_t *)q->data; unsigned i; @@ -450,7 +450,7 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq, if (q->type >= NVC0_QUERY_DRV_STAT(0) && q->type <= NVC0_QUERY_DRV_STAT_LAST) { res64[0] = q->u.value; - return TRUE; + return true; } else #endif if ((q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_LAST) || @@ -468,17 +468,17 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq, /* flush for silly apps that spin on GL_QUERY_RESULT_AVAILABLE */ PUSH_KICK(nvc0->base.pushbuf); } - return FALSE; + return false; } if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->screen->base.client)) - return FALSE; + return false; NOUVEAU_DRV_STAT(&nvc0->screen->base, query_sync_count, 1); } q->state = NVC0_QUERY_STATE_READY; switch (q->type) { case PIPE_QUERY_GPU_FINISHED: - res8[0] = TRUE; + res8[0] = true; break; case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */ res64[0] = q->data[1] - q->data[5]; @@ -502,7 +502,7 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq, break; case PIPE_QUERY_TIMESTAMP_DISJOINT: res64[0] = 1000000000; - res8[8] = FALSE; + res8[8] = false; break; case PIPE_QUERY_TIME_ELAPSED: res64[0] = data64[1] - data64[3]; @@ -516,10 +516,10 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq, break; default: assert(0); /* can't happen, we 
don't create queries with invalid type */ - return FALSE; + return false; } - return TRUE; + return true; } void @@ -549,7 +549,7 @@ nvc0_render_condition(struct pipe_context *pipe, struct nouveau_pushbuf *push = nvc0->base.pushbuf; struct nvc0_query *q; uint32_t cond; - boolean wait = + bool wait = mode != PIPE_RENDER_COND_NO_WAIT && mode != PIPE_RENDER_COND_BY_REGION_NO_WAIT; @@ -563,7 +563,7 @@ nvc0_render_condition(struct pipe_context *pipe, case PIPE_QUERY_SO_OVERFLOW_PREDICATE: cond = condition ? NVC0_3D_COND_MODE_EQUAL : NVC0_3D_COND_MODE_NOT_EQUAL; - wait = TRUE; + wait = true; break; case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: @@ -626,12 +626,12 @@ nvc0_query_pushbuf_submit(struct nouveau_pushbuf *push, void nvc0_so_target_save_offset(struct pipe_context *pipe, struct pipe_stream_output_target *ptarg, - unsigned index, boolean *serialize) + unsigned index, bool *serialize) { struct nvc0_so_target *targ = nvc0_so_target(ptarg); if (*serialize) { - *serialize = FALSE; + *serialize = false; PUSH_SPACE(nvc0_context(pipe)->base.pushbuf, 1); IMMED_NVC0(nvc0_context(pipe)->base.pushbuf, NVC0_3D(SERIALIZE), 0); @@ -1080,7 +1080,7 @@ nvc0_mp_pm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q) { struct nvc0_screen *screen = nvc0->screen; struct nouveau_pushbuf *push = nvc0->base.pushbuf; - const boolean is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS; + const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS; const struct nvc0_mp_pm_query_cfg *cfg; unsigned i, c; unsigned num_ab[2] = { 0, 0 }; @@ -1101,7 +1101,7 @@ nvc0_mp_pm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q) PUSH_SPACE(push, 4 * 8 * (is_nve4 ? 1 : 6) + 6); if (!screen->pm.mp_counters_enabled) { - screen->pm.mp_counters_enabled = TRUE; + screen->pm.mp_counters_enabled = true; BEGIN_NVC0(push, SUBC_SW(0x06ac), 1); PUSH_DATA (push, 0x1fcb); } @@ -1168,7 +1168,7 @@ nvc0_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q) struct nvc0_screen *screen = nvc0->screen; struct pipe_context *pipe = &nvc0->base.pipe; struct nouveau_pushbuf *push = nvc0->base.pushbuf; - const boolean is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS; + const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS; uint32_t mask; uint32_t input[3]; const uint block[3] = { 32, is_nve4 ? 
4 : 1, 1 }; @@ -1181,7 +1181,7 @@ nvc0_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q) if (unlikely(!screen->pm.prog)) { struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program); prog->type = PIPE_SHADER_COMPUTE; - prog->translated = TRUE; + prog->translated = true; prog->num_gprs = 14; prog->parm_size = 12; if (is_nve4) { @@ -1249,9 +1249,9 @@ nvc0_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q) } } -static INLINE boolean +static inline bool nvc0_mp_pm_query_read_data(uint32_t count[32][4], - struct nvc0_context *nvc0, boolean wait, + struct nvc0_context *nvc0, bool wait, struct nvc0_query *q, const struct nvc0_mp_pm_query_cfg *cfg, unsigned mp_count) @@ -1264,19 +1264,19 @@ nvc0_mp_pm_query_read_data(uint32_t count[32][4], for (c = 0; c < cfg->num_counters; ++c) { if (q->data[b + 8] != q->sequence) { if (!wait) - return FALSE; + return false; if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client)) - return FALSE; + return false; } count[p][c] = q->data[b + q->ctr[c]]; } } - return TRUE; + return true; } -static INLINE boolean +static inline bool nve4_mp_pm_query_read_data(uint32_t count[32][4], - struct nvc0_context *nvc0, boolean wait, + struct nvc0_context *nvc0, bool wait, struct nvc0_query *q, const struct nvc0_mp_pm_query_cfg *cfg, unsigned mp_count) @@ -1291,9 +1291,9 @@ nve4_mp_pm_query_read_data(uint32_t count[32][4], for (d = 0; d < ((q->ctr[c] & ~3) ? 1 : 4); ++d) { if (q->data[b + 20 + d] != q->sequence) { if (!wait) - return FALSE; + return false; if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client)) - return FALSE; + return false; } if (q->ctr[c] & ~0x3) count[p][c] = q->data[b + 16 + (q->ctr[c] & 3)]; @@ -1302,7 +1302,7 @@ nve4_mp_pm_query_read_data(uint32_t count[32][4], } } } - return TRUE; + return true; } /* Metric calculations: @@ -1325,7 +1325,7 @@ nvc0_mp_pm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q, unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32); unsigned p, c; const struct nvc0_mp_pm_query_cfg *cfg; - boolean ret; + bool ret; cfg = nvc0_mp_pm_query_get_cfg(nvc0, q); @@ -1334,7 +1334,7 @@ nvc0_mp_pm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q, else ret = nvc0_mp_pm_query_read_data(count, nvc0, wait, q, cfg, mp_count); if (!ret) - return FALSE; + return false; if (cfg->op == NVC0_COUNTER_OPn_SUM) { for (c = 0; c < cfg->num_counters; ++c) @@ -1394,7 +1394,7 @@ nvc0_mp_pm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q, } *(uint64_t *)result = value; - return TRUE; + return true; } int diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 56c230e42fc..ab19b26f156 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -44,16 +44,16 @@ nvc0_screen_is_format_supported(struct pipe_screen *pscreen, unsigned bindings) { if (sample_count > 8) - return FALSE; + return false; if (!(0x117 & (1 << sample_count))) /* 0, 1, 2, 4 or 8 */ - return FALSE; + return false; if (!util_format_is_supported(format, bindings)) - return FALSE; + return false; if ((bindings & PIPE_BIND_SAMPLER_VIEW) && (target != PIPE_BUFFER)) if (util_format_get_blocksizebits(format) == 3 * 32) - return FALSE; + return false; /* transfers & shared are always supported */ bindings &= ~(PIPE_BIND_TRANSFER_READ | @@ -120,6 +120,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50; case PIPE_CAP_ENDIANNESS: 
return PIPE_ENDIAN_LITTLE; + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + return 30; /* supported caps */ case PIPE_CAP_TEXTURE_MIRROR_CLAMP: @@ -163,7 +165,6 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_USER_CONSTANT_BUFFERS: case PIPE_CAP_USER_INDEX_BUFFERS: case PIPE_CAP_USER_VERTEX_BUFFERS: - case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: case PIPE_CAP_TEXTURE_QUERY_LOD: case PIPE_CAP_SAMPLE_SHADING: case PIPE_CAP_TEXTURE_GATHER_OFFSETS: @@ -174,11 +175,16 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_CLIP_HALFZ: case PIPE_CAP_POLYGON_OFFSET_CLAMP: case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 1; case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: return (class_3d >= NVE4_3D_CLASS) ? 1 : 0; case PIPE_CAP_COMPUTE: return (class_3d == NVE4_3D_CLASS) ? 1 : 0; + case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: + return nouveau_screen(pscreen)->vram_domain & NOUVEAU_BO_VRAM ? 1 : 0; /* unsupported caps */ case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT: @@ -226,13 +232,14 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, switch (shader) { case PIPE_SHADER_VERTEX: - /* - case PIPE_SHADER_TESSELLATION_CONTROL: - case PIPE_SHADER_TESSELLATION_EVALUATION: - */ case PIPE_SHADER_GEOMETRY: case PIPE_SHADER_FRAGMENT: break; + case PIPE_SHADER_TESS_CTRL: + case PIPE_SHADER_TESS_EVAL: + if (class_3d >= GM107_3D_CLASS) + return 0; + break; case PIPE_SHADER_COMPUTE: if (class_3d != NVE4_3D_CLASS) return 0; @@ -341,6 +348,7 @@ nvc0_screen_get_compute_param(struct pipe_screen *pscreen, enum pipe_compute_cap param, void *data) { uint64_t *data64 = (uint64_t *)data; + uint32_t *data32 = (uint32_t *)data; const uint16_t obj_class = nvc0_screen(pscreen)->compute->oclass; switch (param) { @@ -372,6 +380,9 @@ nvc0_screen_get_compute_param(struct pipe_screen *pscreen, case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: /* c[], arbitrary limit */ data64[0] = 4096; return 8; + case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: + data32[0] = 32; + return 4; default: return 0; } @@ -550,7 +561,7 @@ nvc0_screen_init_compute(struct nvc0_screen *screen) /* Using COMPUTE has weird effects on 3D state, we need to * investigate this further before enabling it by default. */ - if (debug_get_bool_option("NVC0_COMPUTE", FALSE)) + if (debug_get_bool_option("NVC0_COMPUTE", false)) return nvc0_screen_compute_setup(screen, screen->base.pushbuf); return 0; case 0xe0: @@ -564,7 +575,7 @@ nvc0_screen_init_compute(struct nvc0_screen *screen) } } -boolean +bool nvc0_screen_resize_tls_area(struct nvc0_screen *screen, uint32_t lpos, uint32_t lneg, uint32_t cstack) { @@ -574,7 +585,7 @@ nvc0_screen_resize_tls_area(struct nvc0_screen *screen, if (size >= (1 << 20)) { NOUVEAU_ERR("requested TLS size too large: 0x%"PRIx64"\n", size); - return FALSE; + return false; } size *= (screen->base.device->chipset >= 0xe0) ? 
64 : 48; /* max warps */ @@ -587,11 +598,11 @@ nvc0_screen_resize_tls_area(struct nvc0_screen *screen, NULL, &bo); if (ret) { NOUVEAU_ERR("failed to allocate TLS area, size: 0x%"PRIx64"\n", size); - return FALSE; + return false; } nouveau_bo_ref(NULL, &screen->tls); screen->tls = bo; - return TRUE; + return true; } #define FAIL_SCREEN_INIT(str, err) \ @@ -610,6 +621,7 @@ nvc0_screen_create(struct nouveau_device *dev) struct nouveau_pushbuf *push; uint64_t value; uint32_t obj_class; + uint32_t flags; int ret; unsigned i; @@ -665,8 +677,11 @@ nvc0_screen_create(struct nouveau_device *dev) screen->base.base.get_video_param = nouveau_vp3_screen_get_video_param; screen->base.base.is_video_format_supported = nouveau_vp3_screen_video_supported; - ret = nouveau_bo_new(dev, NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 0, 4096, NULL, - &screen->fence.bo); + flags = NOUVEAU_BO_GART | NOUVEAU_BO_MAP; + if (dev->drm_version >= 0x01000202) + flags |= NOUVEAU_BO_COHERENT; + + ret = nouveau_bo_new(dev, flags, 0, 4096, NULL, &screen->fence.bo); if (ret) goto fail; nouveau_bo_map(screen->fence.bo, 0, NULL); @@ -781,7 +796,7 @@ nvc0_screen_create(struct nouveau_device *dev) BEGIN_NVC0(push, NVC0_3D(COND_MODE), 1); PUSH_DATA (push, NVC0_3D_COND_MODE_ALWAYS); - if (debug_get_bool_option("NOUVEAU_SHADER_WATCHDOG", TRUE)) { + if (debug_get_bool_option("NOUVEAU_SHADER_WATCHDOG", true)) { /* kill shaders after about 1 second (at 100 MHz) */ BEGIN_NVC0(push, NVC0_3D(WATCHDOG_TIMER), 1); PUSH_DATA (push, 0x17); @@ -1012,6 +1027,7 @@ nvc0_screen_create(struct nouveau_device *dev) PUSH_DATA (push, 0x20); BEGIN_NVC0(push, NVC0_3D(SP_SELECT(0)), 1); PUSH_DATA (push, 0x00); + screen->save_state.patch_vertices = 3; BEGIN_NVC0(push, NVC0_3D(POINT_COORD_REPLACE), 1); PUSH_DATA (push, 0); @@ -1031,7 +1047,7 @@ nvc0_screen_create(struct nouveau_device *dev) if (!nvc0_blitter_create(screen)) goto fail; - nouveau_fence_new(&screen->base, &screen->base.fence.current, FALSE); + nouveau_fence_new(&screen->base, &screen->base.fence.current, false); return pscreen; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h index ef2bd43f006..d8826ae0c0d 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h @@ -28,16 +28,17 @@ struct nvc0_context; struct nvc0_blitter; struct nvc0_graph_state { - boolean flushed; - boolean rasterizer_discard; - boolean early_z_forced; - boolean prim_restart; + bool flushed; + bool rasterizer_discard; + bool early_z_forced; + bool prim_restart; uint32_t instance_elts; /* bitmask of per-instance elements */ uint32_t instance_base; uint32_t constant_vbos; uint32_t constant_elts; int32_t index_bias; uint16_t scissor; + uint8_t patch_vertices; uint8_t vbo_mode; /* 0 = normal, 1 = translate, 3 = translate, forced */ uint8_t num_vtxbufs; uint8_t num_vtxelts; @@ -95,7 +96,7 @@ struct nvc0_screen { struct nvc0_program *prog; /* compute state object to read MP counters */ struct pipe_query *mp_counter[8]; /* counter to query allocation */ uint8_t num_mp_pm_active[2]; - boolean mp_counters_enabled; + bool mp_counters_enabled; } pm; struct nouveau_object *eng3d; /* sqrt(1/2)|kepler> + sqrt(1/2)|fermi> */ @@ -105,7 +106,7 @@ struct nvc0_screen { struct nouveau_object *nvsw; }; -static INLINE struct nvc0_screen * +static inline struct nvc0_screen * nvc0_screen(struct pipe_screen *screen) { return (struct nvc0_screen *)screen; @@ -276,7 +277,7 @@ int nvc0_screen_get_driver_query_info(struct pipe_screen *, unsigned, int 
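The fence-buffer hunk above enables NOUVEAU_BO_COHERENT only on new enough kernels. The 0x01000202 threshold reads naturally if nouveau's drm_version packs major << 24 | minor << 8 | patchlevel; that packing is an assumption here, not something the diff states:

    #include <stdint.h>

    /* Assumed libdrm_nouveau version packing (unverified assumption). */
    static inline uint32_t
    nouveau_drm_version_pack(unsigned major, unsigned minor, unsigned patch)
    {
       return ((uint32_t)major << 24) | ((uint32_t)minor << 8) | patch;
    }

    /* Under that packing, 0x01000202 == version 1.2.2, i.e. coherent
     * fence buffers would require nouveau DRM 1.2.2 or newer. */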
nvc0_screen_get_driver_query_group_info(struct pipe_screen *, unsigned, struct pipe_driver_query_group_info *); -boolean nvc0_blitter_create(struct nvc0_screen *); +bool nvc0_blitter_create(struct nvc0_screen *); void nvc0_blitter_destroy(struct nvc0_screen *); void nvc0_screen_make_buffers_resident(struct nvc0_screen *); @@ -287,10 +288,10 @@ int nvc0_screen_tsc_alloc(struct nvc0_screen *, void *); int nve4_screen_compute_setup(struct nvc0_screen *, struct nouveau_pushbuf *); int nvc0_screen_compute_setup(struct nvc0_screen *, struct nouveau_pushbuf *); -boolean nvc0_screen_resize_tls_area(struct nvc0_screen *, uint32_t lpos, - uint32_t lneg, uint32_t cstack); +bool nvc0_screen_resize_tls_area(struct nvc0_screen *, uint32_t lpos, + uint32_t lneg, uint32_t cstack); -static INLINE void +static inline void nvc0_resource_fence(struct nv04_resource *res, uint32_t flags) { struct nvc0_screen *screen = nvc0_screen(res->base.screen); @@ -302,7 +303,7 @@ nvc0_resource_fence(struct nv04_resource *res, uint32_t flags) } } -static INLINE void +static inline void nvc0_resource_validate(struct nv04_resource *res, uint32_t flags) { if (likely(res->bo)) { @@ -325,21 +326,21 @@ struct nvc0_format { extern const struct nvc0_format nvc0_format_table[]; -static INLINE void +static inline void nvc0_screen_tic_unlock(struct nvc0_screen *screen, struct nv50_tic_entry *tic) { if (tic->id >= 0) screen->tic.lock[tic->id / 32] &= ~(1 << (tic->id % 32)); } -static INLINE void +static inline void nvc0_screen_tsc_unlock(struct nvc0_screen *screen, struct nv50_tsc_entry *tsc) { if (tsc->id >= 0) screen->tsc.lock[tsc->id / 32] &= ~(1 << (tsc->id % 32)); } -static INLINE void +static inline void nvc0_screen_tic_free(struct nvc0_screen *screen, struct nv50_tic_entry *tic) { if (tic->id >= 0) { @@ -348,7 +349,7 @@ nvc0_screen_tic_free(struct nvc0_screen *screen, struct nv50_tic_entry *tic) } } -static INLINE void +static inline void nvc0_screen_tsc_free(struct nvc0_screen *screen, struct nv50_tsc_entry *tsc) { if (tsc->id >= 0) { diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c index e0842784a88..8aa127adc0a 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c @@ -27,7 +27,7 @@ #include "nvc0/nvc0_context.h" -static INLINE void +static inline void nvc0_program_update_context_state(struct nvc0_context *nvc0, struct nvc0_program *prog, int stage) { @@ -63,22 +63,22 @@ nvc0_program_update_context_state(struct nvc0_context *nvc0, } } -static INLINE boolean +static inline bool nvc0_program_validate(struct nvc0_context *nvc0, struct nvc0_program *prog) { if (prog->mem) - return TRUE; + return true; if (!prog->translated) { prog->translated = nvc0_program_translate( prog, nvc0->screen->base.device->chipset); if (!prog->translated) - return FALSE; + return false; } if (likely(prog->code_size)) return nvc0_program_upload_code(nvc0, prog); - return TRUE; /* stream output info only */ + return true; /* stream output info only */ } void @@ -147,9 +147,6 @@ nvc0_tctlprog_validate(struct nvc0_context *nvc0) PUSH_DATA (push, tp->code_base); BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(2)), 1); PUSH_DATA (push, tp->num_gprs); - - if (tp->tp.input_patch_size <= 32) - IMMED_NVC0(push, NVC0_3D(PATCH_VERTICES), tp->tp.input_patch_size); } else { BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 1); PUSH_DATA (push, 0x20); @@ -192,7 +189,7 @@ nvc0_gmtyprog_validate(struct nvc0_context *nvc0) /* we allow GPs with no 
code for specifying stream output state only */ if (gp && gp->code_size) { - const boolean gp_selects_layer = !!(gp->hdr[13] & (1 << 9)); + const bool gp_selects_layer = !!(gp->hdr[13] & (1 << 9)); BEGIN_NVC0(push, NVC0_3D(MACRO_GP_SELECT), 1); PUSH_DATA (push, 0x41); @@ -280,7 +277,7 @@ nvc0_tfb_validate(struct nvc0_context *nvc0) nvc0_query_pushbuf_submit(push, targ->pq, 0x4); } else { PUSH_DATA(push, 0); /* TFB_BUFFER_OFFSET */ - targ->clean = FALSE; + targ->clean = false; } } for (; b < 4; ++b) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c index 6b7a211e71b..2a33857d9df 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c @@ -35,7 +35,7 @@ #include "nouveau_gldefs.h" -static INLINE uint32_t +static inline uint32_t nvc0_colormask(unsigned mask) { uint32_t ret = 0; @@ -55,7 +55,7 @@ nvc0_colormask(unsigned mask) #define NVC0_BLEND_FACTOR_CASE(a, b) \ case PIPE_BLENDFACTOR_##a: return NV50_BLEND_FACTOR_##b -static INLINE uint32_t +static inline uint32_t nvc0_blend_fac(unsigned factor) { switch (factor) { @@ -92,8 +92,8 @@ nvc0_blend_state_create(struct pipe_context *pipe, int r; /* reference */ uint32_t ms; uint8_t blend_en = 0; - boolean indep_masks = FALSE; - boolean indep_funcs = FALSE; + bool indep_masks = false; + bool indep_funcs = false; so->pipe = *cso; @@ -111,7 +111,7 @@ nvc0_blend_state_create(struct pipe_context *pipe, cso->rt[i].alpha_func != cso->rt[r].alpha_func || cso->rt[i].alpha_src_factor != cso->rt[r].alpha_src_factor || cso->rt[i].alpha_dst_factor != cso->rt[r].alpha_dst_factor) { - indep_funcs = TRUE; + indep_funcs = true; break; } } @@ -120,7 +120,7 @@ nvc0_blend_state_create(struct pipe_context *pipe, for (i = 1; i < 8; ++i) { if (cso->rt[i].colormask != cso->rt[0].colormask) { - indep_masks = TRUE; + indep_masks = true; break; } } @@ -351,6 +351,13 @@ nvc0_zsa_state_create(struct pipe_context *pipe, SB_DATA (so, nvgl_comparison_op(cso->depth.func)); } + SB_IMMED_3D(so, DEPTH_BOUNDS_EN, cso->depth.bounds_test); + if (cso->depth.bounds_test) { + SB_BEGIN_3D(so, DEPTH_BOUNDS(0), 2); + SB_DATA (so, fui(cso->depth.bounds_min)); + SB_DATA (so, fui(cso->depth.bounds_max)); + } + if (cso->stencil[0].enabled) { SB_BEGIN_3D(so, STENCIL_ENABLE, 5); SB_DATA (so, 1); @@ -428,7 +435,7 @@ nvc0_sampler_state_delete(struct pipe_context *pipe, void *hwcso) FREE(hwcso); } -static INLINE void +static inline void nvc0_stage_sampler_states_bind(struct nvc0_context *nvc0, int s, unsigned nr, void **hwcso) { @@ -508,6 +515,14 @@ nvc0_bind_sampler_states(struct pipe_context *pipe, unsigned shader, assert(start == 0); nvc0_stage_sampler_states_bind(nvc0_context(pipe), 0, nr, s); break; + case PIPE_SHADER_TESS_CTRL: + assert(start == 0); + nvc0_stage_sampler_states_bind(nvc0_context(pipe), 1, nr, s); + break; + case PIPE_SHADER_TESS_EVAL: + assert(start == 0); + nvc0_stage_sampler_states_bind(nvc0_context(pipe), 2, nr, s); + break; case PIPE_SHADER_GEOMETRY: assert(start == 0); nvc0_stage_sampler_states_bind(nvc0_context(pipe), 3, nr, s); @@ -537,7 +552,7 @@ nvc0_sampler_view_destroy(struct pipe_context *pipe, FREE(nv50_tic_entry(view)); } -static INLINE void +static inline void nvc0_stage_set_sampler_views(struct nvc0_context *nvc0, int s, unsigned nr, struct pipe_sampler_view **views) @@ -633,6 +648,12 @@ nvc0_set_sampler_views(struct pipe_context *pipe, unsigned shader, case PIPE_SHADER_VERTEX: nvc0_stage_set_sampler_views(nvc0_context(pipe), 0, nr, views); break; + case 
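A sizing note for the depth-bounds hunk above: it appends one SB_IMMED_3D() and one two-word SB_BEGIN_3D() to the zsa state buffer, which is why the stateobj array grows from 26 to 30 uint32_t in the nvc0_stateobj.h hunk further down. The accounting, assuming one header word per macro (inferred from the 26-to-30 delta, not spelled out in the diff):

    /* Word budget added to struct nvc0_zsa_stateobj::state by depth bounds. */
    enum {
       ZSA_WORDS_OLD         = 26,
       DEPTH_BOUNDS_EN_WORDS = 1,     /* SB_IMMED_3D: header-only method */
       DEPTH_BOUNDS_WORDS    = 1 + 2, /* SB_BEGIN_3D header + min + max  */
       ZSA_WORDS_NEW         = ZSA_WORDS_OLD + DEPTH_BOUNDS_EN_WORDS +
                               DEPTH_BOUNDS_WORDS /* == 30 */
    };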
PIPE_SHADER_TESS_CTRL: + nvc0_stage_set_sampler_views(nvc0_context(pipe), 1, nr, views); + break; + case PIPE_SHADER_TESS_EVAL: + nvc0_stage_set_sampler_views(nvc0_context(pipe), 2, nr, views); + break; case PIPE_SHADER_GEOMETRY: nvc0_stage_set_sampler_views(nvc0_context(pipe), 3, nr, views); break; @@ -734,6 +755,38 @@ nvc0_gp_state_bind(struct pipe_context *pipe, void *hwcso) } static void * +nvc0_tcp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + return nvc0_sp_state_create(pipe, cso, PIPE_SHADER_TESS_CTRL); +} + +static void +nvc0_tcp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nvc0_context *nvc0 = nvc0_context(pipe); + + nvc0->tctlprog = hwcso; + nvc0->dirty |= NVC0_NEW_TCTLPROG; +} + +static void * +nvc0_tep_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + return nvc0_sp_state_create(pipe, cso, PIPE_SHADER_TESS_EVAL); +} + +static void +nvc0_tep_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nvc0_context *nvc0 = nvc0_context(pipe); + + nvc0->tevlprog = hwcso; + nvc0->dirty |= NVC0_NEW_TEVLPROG; +} + +static void * nvc0_cp_state_create(struct pipe_context *pipe, const struct pipe_compute_state *cso) { @@ -790,7 +843,7 @@ nvc0_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, pipe_resource_reference(&nvc0->constbuf[s][i].u.buf, res); - nvc0->constbuf[s][i].user = (cb && cb->user_buffer) ? TRUE : FALSE; + nvc0->constbuf[s][i].user = (cb && cb->user_buffer) ? true : false; if (nvc0->constbuf[s][i].user) { nvc0->constbuf[s][i].u.data = cb->user_buffer; nvc0->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000); @@ -934,6 +987,18 @@ nvc0_set_viewport_states(struct pipe_context *pipe, } static void +nvc0_set_tess_state(struct pipe_context *pipe, + const float default_tess_outer[4], + const float default_tess_inner[2]) +{ + struct nvc0_context *nvc0 = nvc0_context(pipe); + + memcpy(nvc0->default_tess_outer, default_tess_outer, 4 * sizeof(float)); + memcpy(nvc0->default_tess_inner, default_tess_inner, 2 * sizeof(float)); + nvc0->dirty |= NVC0_NEW_TESSFACTOR; +} + +static void nvc0_set_vertex_buffers(struct pipe_context *pipe, unsigned start_slot, unsigned count, const struct pipe_vertex_buffer *vb) @@ -1018,7 +1083,7 @@ nvc0_so_target_create(struct pipe_context *pipe, FREE(targ); return NULL; } - targ->clean = TRUE; + targ->clean = true; targ->pipe.buffer_size = size; targ->pipe.buffer_offset = offset; @@ -1051,13 +1116,13 @@ nvc0_set_transform_feedback_targets(struct pipe_context *pipe, { struct nvc0_context *nvc0 = nvc0_context(pipe); unsigned i; - boolean serialize = TRUE; + bool serialize = true; assert(num_targets <= 4); for (i = 0; i < num_targets; ++i) { - const boolean changed = nvc0->tfbbuf[i] != targets[i]; - const boolean append = (offsets[i] == ((unsigned)-1)); + const bool changed = nvc0->tfbbuf[i] != targets[i]; + const bool append = (offsets[i] == ((unsigned)-1)); if (!changed && append) continue; nvc0->tfbbuf_dirty |= 1 << i; @@ -1066,7 +1131,7 @@ nvc0_set_transform_feedback_targets(struct pipe_context *pipe, nvc0_so_target_save_offset(pipe, nvc0->tfbbuf[i], i, &serialize); if (targets[i] && !append) - nvc0_so_target(targets[i])->clean = TRUE; + nvc0_so_target(targets[i])->clean = true; pipe_so_target_reference(&nvc0->tfbbuf[i], targets[i]); } @@ -1125,16 +1190,18 @@ nvc0_set_compute_resources(struct pipe_context *pipe, } static void -nvc0_set_shader_resources(struct pipe_context *pipe, - unsigned start, unsigned nr, - struct pipe_surface 
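nvc0_set_tess_state() above caches the default tessellation levels used when no control shader supplies them; nvc0_validate_tess_state() in the state-validate hunks below then pushes all six floats in a single TESS_LEVEL_OUTER(0) burst. At the API end these defaults correspond to the GL 4.0 patch parameters (illustrative sketch; assumes a core context with the tessellation entry points loaded):

    #include <GL/gl.h>
    #include <GL/glext.h>

    static void
    set_default_tess_levels(void)
    {
       const GLfloat outer[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
       const GLfloat inner[2] = { 1.0f, 1.0f };
       /* Only consulted when drawing GL_PATCHES without a TCS bound. */
       glPatchParameterfv(GL_PATCH_DEFAULT_OUTER_LEVEL, outer);
       glPatchParameterfv(GL_PATCH_DEFAULT_INNER_LEVEL, inner);
    }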
**resources) +nvc0_set_shader_images(struct pipe_context *pipe, unsigned shader, + unsigned start_slot, unsigned count, + struct pipe_image_view **views) { - nvc0_bind_surfaces_range(nvc0_context(pipe), 0, start, nr, resources); +#if 0 + nvc0_bind_surfaces_range(nvc0_context(pipe), 0, start, nr, views); nvc0_context(pipe)->dirty |= NVC0_NEW_SURFACES; +#endif } -static INLINE void +static inline void nvc0_set_global_handle(uint32_t *phandle, struct pipe_resource *res) { struct nv04_resource *buf = nv04_resource(res); @@ -1218,12 +1285,18 @@ nvc0_init_state_functions(struct nvc0_context *nvc0) pipe->create_vs_state = nvc0_vp_state_create; pipe->create_fs_state = nvc0_fp_state_create; pipe->create_gs_state = nvc0_gp_state_create; + pipe->create_tcs_state = nvc0_tcp_state_create; + pipe->create_tes_state = nvc0_tep_state_create; pipe->bind_vs_state = nvc0_vp_state_bind; pipe->bind_fs_state = nvc0_fp_state_bind; pipe->bind_gs_state = nvc0_gp_state_bind; + pipe->bind_tcs_state = nvc0_tcp_state_bind; + pipe->bind_tes_state = nvc0_tep_state_bind; pipe->delete_vs_state = nvc0_sp_state_delete; pipe->delete_fs_state = nvc0_sp_state_delete; pipe->delete_gs_state = nvc0_sp_state_delete; + pipe->delete_tcs_state = nvc0_sp_state_delete; + pipe->delete_tes_state = nvc0_sp_state_delete; pipe->create_compute_state = nvc0_cp_state_create; pipe->bind_compute_state = nvc0_cp_state_bind; @@ -1239,6 +1312,7 @@ nvc0_init_state_functions(struct nvc0_context *nvc0) pipe->set_polygon_stipple = nvc0_set_polygon_stipple; pipe->set_scissor_states = nvc0_set_scissor_states; pipe->set_viewport_states = nvc0_set_viewport_states; + pipe->set_tess_state = nvc0_set_tess_state; pipe->create_vertex_elements_state = nvc0_vertex_state_create; pipe->delete_vertex_elements_state = nvc0_vertex_state_delete; @@ -1253,8 +1327,14 @@ nvc0_init_state_functions(struct nvc0_context *nvc0) pipe->set_global_binding = nvc0_set_global_bindings; pipe->set_compute_resources = nvc0_set_compute_resources; - pipe->set_shader_resources = nvc0_set_shader_resources; + pipe->set_shader_images = nvc0_set_shader_images; nvc0->sample_mask = ~0; nvc0->min_samples = 1; + nvc0->default_tess_outer[0] = + nvc0->default_tess_outer[1] = + nvc0->default_tess_outer[2] = + nvc0->default_tess_outer[3] = 1.0; + nvc0->default_tess_inner[0] = + nvc0->default_tess_inner[1] = 1.0; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c index c52399ab312..ce1119c284d 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c @@ -55,7 +55,7 @@ nvc0_validate_zcull(struct nvc0_context *nvc0) } #endif -static INLINE void +static inline void nvc0_fb_set_null_rt(struct nouveau_pushbuf *push, unsigned i) { BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(i)), 6); @@ -74,7 +74,7 @@ nvc0_validate_fb(struct nvc0_context *nvc0) struct pipe_framebuffer_state *fb = &nvc0->framebuffer; unsigned i, ms; unsigned ms_mode = NVC0_3D_MULTISAMPLE_MODE_MS1; - boolean serialize = FALSE; + bool serialize = false; nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB); @@ -136,7 +136,7 @@ nvc0_validate_fb(struct nvc0_context *nvc0) } if (res->status & NOUVEAU_BUFFER_STATUS_GPU_READING) - serialize = TRUE; + serialize = true; res->status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING; res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING; @@ -168,7 +168,7 @@ nvc0_validate_fb(struct nvc0_context *nvc0) ms_mode = mt->ms_mode; if (mt->base.status & NOUVEAU_BUFFER_STATUS_GPU_READING) - 
serialize = TRUE; + serialize = true; mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING; mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING; @@ -309,7 +309,7 @@ nvc0_validate_viewport(struct nvc0_context *nvc0) nvc0->viewports_dirty = 0; } -static INLINE void +static inline void nvc0_upload_uclip_planes(struct nvc0_context *nvc0, unsigned s) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; @@ -324,7 +324,7 @@ nvc0_upload_uclip_planes(struct nvc0_context *nvc0, unsigned s) PUSH_DATAp(push, &nvc0->clip.ucp[0][0], PIPE_MAX_CLIP_PLANES * 4); } -static INLINE void +static inline void nvc0_check_program_ucps(struct nvc0_context *nvc0, struct nvc0_program *vp, uint8_t mask) { @@ -339,7 +339,7 @@ nvc0_check_program_ucps(struct nvc0_context *nvc0, nvc0_vertprog_validate(nvc0); else if (likely(vp == nvc0->gmtyprog)) - nvc0_vertprog_validate(nvc0); + nvc0_gmtyprog_validate(nvc0); else nvc0_tevlprog_validate(nvc0); } @@ -455,6 +455,8 @@ nvc0_constbufs_validate(struct nvc0_context *nvc0) PUSH_DATA (push, (i << 4) | 1); BCTX_REFN(nvc0->bufctx_3d, CB(s, i), res, RD); + + nvc0->cb_dirty = 1; /* Force cache flush for UBO. */ } else { BEGIN_NVC0(push, NVC0_3D(CB_BIND(s)), 1); PUSH_DATA (push, (i << 4) | 0); @@ -518,12 +520,12 @@ static void nvc0_validate_derived_1(struct nvc0_context *nvc0) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; - boolean rasterizer_discard; + bool rasterizer_discard; if (nvc0->rast && nvc0->rast->pipe.rasterizer_discard) { - rasterizer_discard = TRUE; + rasterizer_discard = true; } else { - boolean zs = nvc0->zsa && + bool zs = nvc0->zsa && (nvc0->zsa->pipe.depth.enabled || nvc0->zsa->pipe.stencil[0].enabled); rasterizer_discard = !zs && (!nvc0->fragprog || !nvc0->fragprog->hdr[18]); @@ -535,6 +537,33 @@ nvc0_validate_derived_1(struct nvc0_context *nvc0) } } +/* alpha test is disabled if there are no color RTs, so make sure we have at + * least one if alpha test is enabled. Note that this must run after + * nvc0_validate_fb, otherwise that will override the RT count setting. 
+ */ +static void +nvc0_validate_derived_2(struct nvc0_context *nvc0) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + + if (nvc0->zsa && nvc0->zsa->pipe.alpha.enabled && + nvc0->framebuffer.nr_cbufs == 0) { + nvc0_fb_set_null_rt(push, 0); + BEGIN_NVC0(push, NVC0_3D(RT_CONTROL), 1); + PUSH_DATA (push, (076543210 << 4) | 1); + } +} + +static void +nvc0_validate_tess_state(struct nvc0_context *nvc0) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + + BEGIN_NVC0(push, NVC0_3D(TESS_LEVEL_OUTER(0)), 6); + PUSH_DATAp(push, nvc0->default_tess_outer, 4); + PUSH_DATAp(push, nvc0->default_tess_inner, 2); +} + static void nvc0_switch_pipe_context(struct nvc0_context *ctx_to) { @@ -593,10 +622,12 @@ static struct state_validate { { nvc0_vertprog_validate, NVC0_NEW_VERTPROG }, { nvc0_tctlprog_validate, NVC0_NEW_TCTLPROG }, { nvc0_tevlprog_validate, NVC0_NEW_TEVLPROG }, + { nvc0_validate_tess_state, NVC0_NEW_TESSFACTOR }, { nvc0_gmtyprog_validate, NVC0_NEW_GMTYPROG }, { nvc0_fragprog_validate, NVC0_NEW_FRAGPROG }, { nvc0_validate_derived_1, NVC0_NEW_FRAGPROG | NVC0_NEW_ZSA | NVC0_NEW_RASTERIZER }, + { nvc0_validate_derived_2, NVC0_NEW_ZSA | NVC0_NEW_FRAMEBUFFER }, { nvc0_validate_clip, NVC0_NEW_CLIP | NVC0_NEW_RASTERIZER | NVC0_NEW_VERTPROG | NVC0_NEW_TEVLPROG | @@ -613,7 +644,7 @@ static struct state_validate { }; #define validate_list_len (sizeof(validate_list) / sizeof(validate_list[0])) -boolean +bool nvc0_state_validate(struct nvc0_context *nvc0, uint32_t mask, unsigned words) { uint32_t state_mask; @@ -634,15 +665,15 @@ nvc0_state_validate(struct nvc0_context *nvc0, uint32_t mask, unsigned words) } nvc0->dirty &= ~state_mask; - nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, FALSE); + nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, false); } nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_3d); ret = nouveau_pushbuf_validate(nvc0->base.pushbuf); if (unlikely(nvc0->state.flushed)) { - nvc0->state.flushed = FALSE; - nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, TRUE); + nvc0->state.flushed = false; + nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, true); } return !ret; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h index 1d70b7c7b23..18fcc12dea3 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h @@ -29,7 +29,7 @@ struct nvc0_rasterizer_stateobj { struct nvc0_zsa_stateobj { struct pipe_depth_stencil_alpha_state pipe; int size; - uint32_t state[26]; + uint32_t state[30]; }; struct nvc0_constbuf { @@ -39,7 +39,7 @@ struct nvc0_constbuf { } u; uint32_t size; uint32_t offset; - boolean user; /* should only be TRUE if u.data is valid and non-NULL */ + bool user; /* should only be true if u.data is valid and non-NULL */ }; struct nvc0_vertex_element { @@ -55,8 +55,8 @@ struct nvc0_vertex_stateobj { unsigned num_elements; uint32_t instance_elts; uint32_t instance_bufs; - boolean shared_slots; - boolean need_conversion; /* e.g. VFETCH cannot convert f64 to f32 */ + bool shared_slots; + bool need_conversion; /* e.g. 
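The constant in nvc0_validate_derived_2() above is an octal literal. Assuming RT_CONTROL keeps the render-target count in its low four bits with an eight-slot, three-bits-per-entry RT index map above them (a layout inferred from the code, not stated in the diff), 076543210 is simply the identity map 7,6,5,4,3,2,1,0. Sketch of the packing:

    #include <stdint.h>

    /* Build an RT_CONTROL word with an identity RT map and `count` RTs. */
    static uint32_t
    rt_control_identity(unsigned count)
    {
       uint32_t map = 0;
       unsigned i;
       for (i = 0; i < 8; i++)
          map |= (uint32_t)i << (3 * i); /* one octal digit per slot */
       /* map == 076543210, so rt_control_identity(1) matches the push. */
       return (map << 4) | count;
    }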
VFETCH cannot convert f64 to f32 */ unsigned size; /* size of vertex in bytes (when packed) */ struct nvc0_vertex_element element[0]; }; @@ -65,10 +65,10 @@ struct nvc0_so_target { struct pipe_stream_output_target pipe; struct pipe_query *pq; unsigned stride; - boolean clean; + bool clean; }; -static INLINE struct nvc0_so_target * +static inline struct nvc0_so_target * nvc0_so_target(struct pipe_stream_output_target *ptarg) { return (struct nvc0_so_target *)ptarg; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c index a820de7259a..51a6f93f891 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c @@ -47,8 +47,8 @@ #define NOUVEAU_DRIVER 0xc0 #include "nv50/nv50_blit.h" -static INLINE uint8_t -nvc0_2d_format(enum pipe_format format, boolean dst, boolean dst_src_equal) +static inline uint8_t +nvc0_2d_format(enum pipe_format format, bool dst, bool dst_src_equal) { uint8_t id = nvc0_format_table[format].rt; @@ -81,9 +81,9 @@ nvc0_2d_format(enum pipe_format format, boolean dst, boolean dst_src_equal) } static int -nvc0_2d_texture_set(struct nouveau_pushbuf *push, boolean dst, +nvc0_2d_texture_set(struct nouveau_pushbuf *push, bool dst, struct nv50_miptree *mt, unsigned level, unsigned layer, - enum pipe_format pformat, boolean dst_src_pformat_equal) + enum pipe_format pformat, bool dst_src_pformat_equal) { struct nouveau_bo *bo = mt->base.bo; uint32_t width, height, depth; @@ -161,16 +161,16 @@ nvc0_2d_texture_do_copy(struct nouveau_pushbuf *push, const enum pipe_format dfmt = dst->base.base.format; const enum pipe_format sfmt = src->base.base.format; int ret; - boolean eqfmt = dfmt == sfmt; + bool eqfmt = dfmt == sfmt; if (!PUSH_SPACE(push, 2 * 16 + 32)) return PIPE_ERROR; - ret = nvc0_2d_texture_set(push, TRUE, dst, dst_level, dz, dfmt, eqfmt); + ret = nvc0_2d_texture_set(push, true, dst, dst_level, dz, dfmt, eqfmt); if (ret) return ret; - ret = nvc0_2d_texture_set(push, FALSE, src, src_level, sz, sfmt, eqfmt); + ret = nvc0_2d_texture_set(push, false, src, src_level, sz, sfmt, eqfmt); if (ret) return ret; @@ -189,7 +189,7 @@ nvc0_2d_texture_do_copy(struct nouveau_pushbuf *push, PUSH_DATA (push, 0); PUSH_DATA (push, sx << src->ms_x); PUSH_DATA (push, 0); - PUSH_DATA (push, sy << src->ms_x); + PUSH_DATA (push, sy << src->ms_y); return 0; } @@ -203,7 +203,7 @@ nvc0_resource_copy_region(struct pipe_context *pipe, { struct nvc0_context *nvc0 = nvc0_context(pipe); int ret; - boolean m2mf; + bool m2mf; unsigned dst_layer = dstz, src_layer = src_box->z; if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) { @@ -704,7 +704,7 @@ nvc0_blitter_make_vp(struct nvc0_blitter *blit) }; blit->vp.type = PIPE_SHADER_VERTEX; - blit->vp.translated = TRUE; + blit->vp.translated = true; if (blit->screen->base.class_3d >= GM107_3D_CLASS) { blit->vp.code = (uint32_t *)code_gm107; /* const_cast */ blit->vp.code_size = sizeof(code_gm107); @@ -1217,7 +1217,7 @@ nvc0_blit_eng2d(struct nvc0_context *nvc0, const struct pipe_blit_info *info) int i; uint32_t mode; uint32_t mask = nv50_blit_eng2d_get_mask(info); - boolean b; + bool b; mode = nv50_blit_get_filter(info) ? 
NV50_2D_BLIT_CONTROL_FILTER_BILINEAR : @@ -1376,39 +1376,40 @@ static void nvc0_blit(struct pipe_context *pipe, const struct pipe_blit_info *info) { struct nvc0_context *nvc0 = nvc0_context(pipe); - boolean eng3d = FALSE; + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + bool eng3d = false; if (util_format_is_depth_or_stencil(info->dst.resource->format)) { if (!(info->mask & PIPE_MASK_ZS)) return; if (info->dst.resource->format == PIPE_FORMAT_Z32_FLOAT || info->dst.resource->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) - eng3d = TRUE; + eng3d = true; if (info->filter != PIPE_TEX_FILTER_NEAREST) - eng3d = TRUE; + eng3d = true; } else { if (!(info->mask & PIPE_MASK_RGBA)) return; if (info->mask != PIPE_MASK_RGBA) - eng3d = TRUE; + eng3d = true; } if (nv50_miptree(info->src.resource)->layout_3d) { - eng3d = TRUE; + eng3d = true; } else if (info->src.box.depth != info->dst.box.depth) { - eng3d = TRUE; + eng3d = true; debug_printf("blit: cannot filter array or cube textures in z direction"); } if (!eng3d && info->dst.format != info->src.format) { if (!nv50_2d_dst_format_faithful(info->dst.format)) { - eng3d = TRUE; + eng3d = true; } else if (!nv50_2d_src_format_faithful(info->src.format)) { if (!util_format_is_luminance(info->src.format)) { if (!nv50_2d_dst_format_ops_supported(info->dst.format)) - eng3d = TRUE; + eng3d = true; else if (util_format_is_intensity(info->src.format)) eng3d = info->src.format != PIPE_FORMAT_I8_UNORM; @@ -1420,30 +1421,36 @@ nvc0_blit(struct pipe_context *pipe, const struct pipe_blit_info *info) } } else if (util_format_is_luminance_alpha(info->src.format)) - eng3d = TRUE; + eng3d = true; } if (info->src.resource->nr_samples == 8 && info->dst.resource->nr_samples <= 1) - eng3d = TRUE; + eng3d = true; #if 0 /* FIXME: can't make this work with eng2d anymore, at least not on nv50 */ if (info->src.resource->nr_samples > 1 || info->dst.resource->nr_samples > 1) - eng3d = TRUE; + eng3d = true; #endif /* FIXME: find correct src coordinates adjustments */ if ((info->src.box.width != info->dst.box.width && info->src.box.width != -info->dst.box.width) || (info->src.box.height != info->dst.box.height && info->src.box.height != -info->dst.box.height)) - eng3d = TRUE; + eng3d = true; + + if (nvc0->screen->num_occlusion_queries_active) + IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 0); if (!eng3d) nvc0_blit_eng2d(nvc0, info); else nvc0_blit_3d(nvc0, info); + if (nvc0->screen->num_occlusion_queries_active) + IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 1); + NOUVEAU_DRV_STAT(&nvc0->screen->base, tex_blit_count, 1); } @@ -1453,13 +1460,13 @@ nvc0_flush_resource(struct pipe_context *ctx, { } -boolean +bool nvc0_blitter_create(struct nvc0_screen *screen) { screen->blitter = CALLOC_STRUCT(nvc0_blitter); if (!screen->blitter) { NOUVEAU_ERR("failed to allocate blitter struct\n"); - return FALSE; + return false; } screen->blitter->screen = screen; @@ -1468,7 +1475,7 @@ nvc0_blitter_create(struct nvc0_screen *screen) nvc0_blitter_make_vp(screen->blitter); nvc0_blitter_make_sampler(screen->blitter); - return TRUE; + return true; } void @@ -1491,20 +1498,20 @@ nvc0_blitter_destroy(struct nvc0_screen *screen) FREE(blitter); } -boolean +bool nvc0_blitctx_create(struct nvc0_context *nvc0) { nvc0->blit = CALLOC_STRUCT(nvc0_blitctx); if (!nvc0->blit) { NOUVEAU_ERR("failed to allocate blit context\n"); - return FALSE; + return false; } nvc0->blit->nvc0 = nvc0; nvc0->blit->rast.pipe.half_pixel_center = 1; - return TRUE; + return true; } void diff --git 
a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c index ddc0409ca86..d19082e0e15 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c @@ -34,8 +34,8 @@ (NV50_TIC_0_MAPA__MASK | NV50_TIC_0_MAPB__MASK | \ NV50_TIC_0_MAPG__MASK | NV50_TIC_0_MAPR__MASK) -static INLINE uint32_t -nv50_tic_swizzle(uint32_t tc, unsigned swz, boolean tex_int) +static inline uint32_t +nv50_tic_swizzle(uint32_t tc, unsigned swz, bool tex_int) { switch (swz) { case PIPE_SWIZZLE_RED: @@ -82,7 +82,7 @@ nvc0_create_texture_view(struct pipe_context *pipe, uint32_t depth; struct nv50_tic_entry *view; struct nv50_miptree *mt; - boolean tex_int; + bool tex_int; view = MALLOC_STRUCT(nv50_tic_entry); if (!view) @@ -195,7 +195,7 @@ nvc0_create_texture_view(struct pipe_context *pipe, default: NOUVEAU_ERR("unexpected/invalid texture target: %d\n", mt->base.base.target); - return FALSE; + return false; } tic[3] = (flags & NV50_TEXVIEW_FILTER_MSAA8) ? 0x20000000 : 0x00300000; @@ -226,7 +226,7 @@ nvc0_create_texture_view(struct pipe_context *pipe, return &view->pipe; } -static boolean +static bool nvc0_validate_tic(struct nvc0_context *nvc0, int s) { uint32_t commands[32]; @@ -234,12 +234,12 @@ nvc0_validate_tic(struct nvc0_context *nvc0, int s) struct nouveau_bo *txc = nvc0->screen->txc; unsigned i; unsigned n = 0; - boolean need_flush = FALSE; + bool need_flush = false; for (i = 0; i < nvc0->num_textures[s]; ++i) { struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]); struct nv04_resource *res; - const boolean dirty = !!(nvc0->textures_dirty[s] & (1 << i)); + const bool dirty = !!(nvc0->textures_dirty[s] & (1 << i)); if (!tic) { if (dirty) @@ -263,7 +263,7 @@ nvc0_validate_tic(struct nvc0_context *nvc0, int s) BEGIN_NIC0(push, NVC0_M2MF(DATA), 8); PUSH_DATAp(push, &tic->tic[0], 8); - need_flush = TRUE; + need_flush = true; } else if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) { BEGIN_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 1); @@ -295,18 +295,18 @@ nvc0_validate_tic(struct nvc0_context *nvc0, int s) return need_flush; } -static boolean +static bool nve4_validate_tic(struct nvc0_context *nvc0, unsigned s) { struct nouveau_bo *txc = nvc0->screen->txc; struct nouveau_pushbuf *push = nvc0->base.pushbuf; unsigned i; - boolean need_flush = FALSE; + bool need_flush = false; for (i = 0; i < nvc0->num_textures[s]; ++i) { struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]); struct nv04_resource *res; - const boolean dirty = !!(nvc0->textures_dirty[s] & (1 << i)); + const bool dirty = !!(nvc0->textures_dirty[s] & (1 << i)); if (!tic) { nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID; @@ -328,7 +328,7 @@ nve4_validate_tic(struct nvc0_context *nvc0, unsigned s) PUSH_DATA (push, 0x1001); PUSH_DATAp(push, &tic->tic[0], 8); - need_flush = TRUE; + need_flush = true; } else if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) { BEGIN_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 1); @@ -356,16 +356,14 @@ nve4_validate_tic(struct nvc0_context *nvc0, unsigned s) void nvc0_validate_textures(struct nvc0_context *nvc0) { - boolean need_flush; - - if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) { - need_flush = nve4_validate_tic(nvc0, 0); - need_flush |= nve4_validate_tic(nvc0, 3); - need_flush |= nve4_validate_tic(nvc0, 4); - } else { - need_flush = nvc0_validate_tic(nvc0, 0); - need_flush |= nvc0_validate_tic(nvc0, 3); - need_flush |= nvc0_validate_tic(nvc0, 4); + bool need_flush = false; + int i; + + for (i = 0; i < 5; i++) { + if 
(nvc0->screen->base.class_3d >= NVE4_3D_CLASS) + need_flush |= nve4_validate_tic(nvc0, i); + else + need_flush |= nvc0_validate_tic(nvc0, i); } if (need_flush) { @@ -374,14 +372,14 @@ void nvc0_validate_textures(struct nvc0_context *nvc0) } } -static boolean +static bool nvc0_validate_tsc(struct nvc0_context *nvc0, int s) { uint32_t commands[16]; struct nouveau_pushbuf *push = nvc0->base.pushbuf; unsigned i; unsigned n = 0; - boolean need_flush = FALSE; + bool need_flush = false; for (i = 0; i < nvc0->num_samplers[s]; ++i) { struct nv50_tsc_entry *tsc = nv50_tsc_entry(nvc0->samplers[s][i]); @@ -398,7 +396,7 @@ nvc0_validate_tsc(struct nvc0_context *nvc0, int s) nvc0_m2mf_push_linear(&nvc0->base, nvc0->screen->txc, 65536 + tsc->id * 32, NV_VRAM_DOMAIN(&nvc0->screen->base), 32, tsc->tsc); - need_flush = TRUE; + need_flush = true; } nvc0->screen->tsc.lock[tsc->id / 32] |= 1 << (tsc->id % 32); @@ -418,13 +416,13 @@ nvc0_validate_tsc(struct nvc0_context *nvc0, int s) return need_flush; } -boolean +bool nve4_validate_tsc(struct nvc0_context *nvc0, int s) { struct nouveau_bo *txc = nvc0->screen->txc; struct nouveau_pushbuf *push = nvc0->base.pushbuf; unsigned i; - boolean need_flush = FALSE; + bool need_flush = false; for (i = 0; i < nvc0->num_samplers[s]; ++i) { struct nv50_tsc_entry *tsc = nv50_tsc_entry(nvc0->samplers[s][i]); @@ -447,7 +445,7 @@ nve4_validate_tsc(struct nvc0_context *nvc0, int s) PUSH_DATA (push, 0x1001); PUSH_DATAp(push, &tsc->tsc[0], 8); - need_flush = TRUE; + need_flush = true; } nvc0->screen->tsc.lock[tsc->id / 32] |= 1 << (tsc->id % 32); @@ -466,16 +464,14 @@ nve4_validate_tsc(struct nvc0_context *nvc0, int s) void nvc0_validate_samplers(struct nvc0_context *nvc0) { - boolean need_flush; - - if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) { - need_flush = nve4_validate_tsc(nvc0, 0); - need_flush |= nve4_validate_tsc(nvc0, 3); - need_flush |= nve4_validate_tsc(nvc0, 4); - } else { - need_flush = nvc0_validate_tsc(nvc0, 0); - need_flush |= nvc0_validate_tsc(nvc0, 3); - need_flush |= nvc0_validate_tsc(nvc0, 4); + bool need_flush = false; + int i; + + for (i = 0; i < 5; i++) { + if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) + need_flush |= nve4_validate_tsc(nvc0, i); + else + need_flush |= nvc0_validate_tsc(nvc0, i); } if (need_flush) { @@ -645,13 +641,13 @@ nve4_set_surface_info(struct nouveau_pushbuf *push, } } -static INLINE void +static inline void nvc0_update_surface_bindings(struct nvc0_context *nvc0) { /* TODO */ } -static INLINE void +static inline void nve4_update_surface_bindings(struct nvc0_context *nvc0) { /* TODO */ diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c index 45c6f7cc3ca..7cc5b4b1f48 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c @@ -329,17 +329,17 @@ nve4_m2mf_copy_linear(struct nouveau_context *nv, } -static INLINE boolean +static inline bool nvc0_mt_transfer_can_map_directly(struct nv50_miptree *mt) { if (mt->base.domain == NOUVEAU_BO_VRAM) - return FALSE; + return false; if (mt->base.base.usage != PIPE_USAGE_STAGING) - return FALSE; + return false; return !nouveau_bo_memtype(mt->base.bo); } -static INLINE boolean +static inline bool nvc0_mt_sync(struct nvc0_context *nvc0, struct nv50_miptree *mt, unsigned usage) { if (!mt->base.mm) { diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c index 8cf2584b0ce..6f9e7906713 100644 --- 
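The texture/sampler validation rewrites above replace the explicit calls for stages 0, 3 and 4 with a loop over all five graphics stages, since tessellation now occupies the slots in between. The numbering is implied by the sampler/view bind hunks earlier in this diff and by nve4_validate_tsc(nvc0, 5) in the compute code below; written out as a sketch:

    /* nvc0 shader-stage indices as implied by the hunks in this diff. */
    enum nvc0_stage {
       NVC0_STAGE_VP  = 0, /* PIPE_SHADER_VERTEX    */
       NVC0_STAGE_TCP = 1, /* PIPE_SHADER_TESS_CTRL */
       NVC0_STAGE_TEP = 2, /* PIPE_SHADER_TESS_EVAL */
       NVC0_STAGE_GP  = 3, /* PIPE_SHADER_GEOMETRY  */
       NVC0_STAGE_FP  = 4, /* PIPE_SHADER_FRAGMENT  */
       NVC0_STAGE_CP  = 5  /* compute, validated separately */
    };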
a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c @@ -61,8 +61,8 @@ nvc0_vertex_state_create(struct pipe_context *pipe, so->num_elements = num_elements; so->instance_elts = 0; so->instance_bufs = 0; - so->shared_slots = FALSE; - so->need_conversion = FALSE; + so->shared_slots = false; + so->need_conversion = false; memset(so->vb_access_size, 0, sizeof(so->vb_access_size)); @@ -93,7 +93,7 @@ nvc0_vertex_state_create(struct pipe_context *pipe, return NULL; } so->element[i].state = nvc0_format_table[fmt].vtx; - so->need_conversion = TRUE; + so->need_conversion = true; } size = util_format_get_blocksize(fmt); @@ -141,7 +141,7 @@ nvc0_vertex_state_create(struct pipe_context *pipe, if (so->instance_elts || src_offset_max >= (1 << 14)) return so; - so->shared_slots = TRUE; + so->shared_slots = true; for (i = 0; i < num_elements; ++i) { const unsigned b = elements[i].vertex_buffer_index; @@ -196,7 +196,7 @@ nvc0_set_constant_vertex_attrib(struct nvc0_context *nvc0, const unsigned a) push->cur += 5; } -static INLINE void +static inline void nvc0_user_vbuf_range(struct nvc0_context *nvc0, int vbi, uint32_t *base, uint32_t *size) { @@ -214,7 +214,7 @@ nvc0_user_vbuf_range(struct nvc0_context *nvc0, int vbi, } } -static INLINE void +static inline void nvc0_release_user_vbufs(struct nvc0_context *nvc0) { if (nvc0->vbo_user) { @@ -265,7 +265,7 @@ nvc0_update_user_vbufs(struct nvc0_context *nvc0) PUSH_DATAh(push, address[b] + ve->src_offset); PUSH_DATA (push, address[b] + ve->src_offset); } - nvc0->base.vbo_dirty = TRUE; + nvc0->base.vbo_dirty = true; } static void @@ -419,7 +419,7 @@ nvc0_vertex_arrays_validate(struct nvc0_context *nvc0) uint32_t const_vbos; unsigned i; uint8_t vbo_mode; - boolean update_vertex; + bool update_vertex; nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX); @@ -529,7 +529,7 @@ nvc0_idxbuf_validate(struct nvc0_context *nvc0) #define NVC0_PRIM_GL_CASE(n) \ case PIPE_PRIM_##n: return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n -static INLINE unsigned +static inline unsigned nvc0_prim_gl(unsigned prim) { switch (prim) { @@ -547,8 +547,7 @@ nvc0_prim_gl(unsigned prim) NVC0_PRIM_GL_CASE(LINE_STRIP_ADJACENCY); NVC0_PRIM_GL_CASE(TRIANGLES_ADJACENCY); NVC0_PRIM_GL_CASE(TRIANGLE_STRIP_ADJACENCY); - /* - NVC0_PRIM_GL_CASE(PATCHES); */ + NVC0_PRIM_GL_CASE(PATCHES); default: return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_POINTS; } @@ -559,7 +558,7 @@ nvc0_draw_vbo_kick_notify(struct nouveau_pushbuf *push) { struct nvc0_screen *screen = push->user_priv; - nouveau_fence_update(&screen->base, TRUE); + nouveau_fence_update(&screen->base, true); NOUVEAU_DRV_STAT(&screen->base, pushbuf_count, 1); } @@ -695,7 +694,7 @@ nvc0_draw_elements_inline_u32_short(struct nouveau_pushbuf *push, } static void -nvc0_draw_elements(struct nvc0_context *nvc0, boolean shorten, +nvc0_draw_elements(struct nvc0_context *nvc0, bool shorten, unsigned mode, unsigned start, unsigned count, unsigned instance_count, int32_t index_bias) { @@ -835,8 +834,8 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info) buf->bo, offset, NVC0_IB_ENTRY_1_NO_PREFETCH | size); } -static INLINE void -nvc0_update_prim_restart(struct nvc0_context *nvc0, boolean en, uint32_t index) +static inline void +nvc0_update_prim_restart(struct nvc0_context *nvc0, bool en, uint32_t index) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; @@ -889,6 +888,12 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) } } + if (info->mode == PIPE_PRIM_PATCHES && + 
nvc0->state.patch_vertices != info->vertices_per_patch) { + nvc0->state.patch_vertices = info->vertices_per_patch; + IMMED_NVC0(push, NVC0_3D(PATCH_VERTICES), nvc0->state.patch_vertices); + } + /* 8 as minimum to avoid immediate double validation of new buffers */ nvc0_state_validate(nvc0, ~0, 8); @@ -910,13 +915,13 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) continue; if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) - nvc0->cb_dirty = TRUE; + nvc0->cb_dirty = true; } } if (nvc0->cb_dirty) { IMMED_NVC0(push, NVC0_3D(MEM_BARRIER), 0x1011); - nvc0->cb_dirty = FALSE; + nvc0->cb_dirty = false; } if (nvc0->state.vbo_mode) { @@ -940,19 +945,19 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) if (!nvc0->vtxbuf[i].buffer) continue; if (nvc0->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) - nvc0->base.vbo_dirty = TRUE; + nvc0->base.vbo_dirty = true; } if (!nvc0->base.vbo_dirty && nvc0->idxbuf.buffer && nvc0->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) - nvc0->base.vbo_dirty = TRUE; + nvc0->base.vbo_dirty = true; nvc0_update_prim_restart(nvc0, info->primitive_restart, info->restart_index); if (nvc0->base.vbo_dirty) { if (nvc0->screen->eng3d->oclass < GM107_3D_CLASS) IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FLUSH), 0); - nvc0->base.vbo_dirty = FALSE; + nvc0->base.vbo_dirty = false; } if (unlikely(info->indirect)) { @@ -962,10 +967,10 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) nvc0_draw_stream_output(nvc0, info); } else if (info->indexed) { - boolean shorten = info->max_index <= 65535; + bool shorten = info->max_index <= 65535; if (info->primitive_restart && info->restart_index > 65535) - shorten = FALSE; + shorten = false; nvc0_draw_elements(nvc0, shorten, info->mode, info->start, info->count, diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c index f180087161d..8b23a4887da 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c @@ -21,12 +21,12 @@ struct push_context { uint32_t restart_index; uint32_t instance_id; - boolean prim_restart; - boolean need_vertex_id; + bool prim_restart; + bool need_vertex_id; struct { - boolean enabled; - boolean value; + bool enabled; + bool value; unsigned stride; const uint8_t *data; } edgeflag; @@ -47,7 +47,7 @@ nvc0_push_context_init(struct nvc0_context *nvc0, struct push_context *ctx) ctx->need_vertex_id = nvc0->vertprog->vp.need_vertex_id && (nvc0->vertex->num_elements < 32); - ctx->edgeflag.value = TRUE; + ctx->edgeflag.value = true; ctx->edgeflag.enabled = nvc0->vertprog->vp.edgeflag < PIPE_MAX_ATTRIBS; /* silence warnings */ @@ -55,7 +55,7 @@ nvc0_push_context_init(struct nvc0_context *nvc0, struct push_context *ctx) ctx->edgeflag.stride = 0; } -static INLINE void +static inline void nvc0_vertex_configure_translate(struct nvc0_context *nvc0, int32_t index_bias) { struct translate *translate = nvc0->vertex->translate; @@ -78,7 +78,7 @@ nvc0_vertex_configure_translate(struct nvc0_context *nvc0, int32_t index_bias) } } -static INLINE void +static inline void nvc0_push_map_idxbuf(struct push_context *ctx, struct nvc0_context *nvc0) { if (nvc0->idxbuf.buffer) { @@ -90,7 +90,7 @@ nvc0_push_map_idxbuf(struct push_context *ctx, struct nvc0_context *nvc0) } } -static INLINE void +static inline void nvc0_push_map_edgeflag(struct push_context *ctx, struct nvc0_context *nvc0, int32_t index_bias) { @@ -112,7 
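With NVC0_PRIM_GL_CASE(PATCHES) enabled and the draw-time check above, PATCH_VERTICES is now emitted lazily from nvc0_draw_vbo() rather than from nvc0_tctlprog_validate() (that removal appears earlier in this diff), so back-to-back patch draws with an unchanged size emit no redundant method. The GL calls that drive this path look roughly like the following (sketch; assumes a GL 4.0 context):

    #include <GL/gl.h>
    #include <GL/glext.h>

    static void
    draw_patches_twice(GLint n_control_points)
    {
       /* st/mesa forwards this as pipe_draw_info::vertices_per_patch. */
       glPatchParameteri(GL_PATCH_VERTICES, 3);
       glDrawArrays(GL_PATCHES, 0, n_control_points);
       /* Same patch size: the driver skips the PATCH_VERTICES method. */
       glDrawArrays(GL_PATCHES, 0, n_control_points);
    }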
+112,7 @@ nvc0_push_map_edgeflag(struct push_context *ctx, struct nvc0_context *nvc0, ctx->edgeflag.data += (intptr_t)index_bias * vb->stride; } -static INLINE unsigned +static inline unsigned prim_restart_search_i08(const uint8_t *elts, unsigned push, uint8_t index) { unsigned i; @@ -120,7 +120,7 @@ prim_restart_search_i08(const uint8_t *elts, unsigned push, uint8_t index) return i; } -static INLINE unsigned +static inline unsigned prim_restart_search_i16(const uint16_t *elts, unsigned push, uint16_t index) { unsigned i; @@ -128,7 +128,7 @@ prim_restart_search_i16(const uint16_t *elts, unsigned push, uint16_t index) return i; } -static INLINE unsigned +static inline unsigned prim_restart_search_i32(const uint32_t *elts, unsigned push, uint32_t index) { unsigned i; @@ -136,21 +136,21 @@ prim_restart_search_i32(const uint32_t *elts, unsigned push, uint32_t index) return i; } -static INLINE boolean +static inline bool ef_value(const struct push_context *ctx, uint32_t index) { float *pf = (float *)&ctx->edgeflag.data[index * ctx->edgeflag.stride]; - return *pf ? TRUE : FALSE; + return *pf ? true : false; } -static INLINE boolean +static inline bool ef_toggle(struct push_context *ctx) { ctx->edgeflag.value = !ctx->edgeflag.value; return ctx->edgeflag.value; } -static INLINE unsigned +static inline unsigned ef_toggle_search_i08(struct push_context *ctx, const uint8_t *elts, unsigned n) { unsigned i; @@ -158,7 +158,7 @@ ef_toggle_search_i08(struct push_context *ctx, const uint8_t *elts, unsigned n) return i; } -static INLINE unsigned +static inline unsigned ef_toggle_search_i16(struct push_context *ctx, const uint16_t *elts, unsigned n) { unsigned i; @@ -166,7 +166,7 @@ ef_toggle_search_i16(struct push_context *ctx, const uint16_t *elts, unsigned n) return i; } -static INLINE unsigned +static inline unsigned ef_toggle_search_i32(struct push_context *ctx, const uint32_t *elts, unsigned n) { unsigned i; @@ -174,7 +174,7 @@ ef_toggle_search_i32(struct push_context *ctx, const uint32_t *elts, unsigned n) return i; } -static INLINE unsigned +static inline unsigned ef_toggle_search_seq(struct push_context *ctx, unsigned start, unsigned n) { unsigned i; @@ -182,7 +182,7 @@ ef_toggle_search_seq(struct push_context *ctx, unsigned start, unsigned n) return i; } -static INLINE void * +static inline void * nvc0_push_setup_vertex_array(struct nvc0_context *nvc0, const unsigned count) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; @@ -409,7 +409,7 @@ disp_vertices_seq(struct push_context *ctx, unsigned start, unsigned count) #define NVC0_PRIM_GL_CASE(n) \ case PIPE_PRIM_##n: return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n -static INLINE unsigned +static inline unsigned nvc0_prim_gl(unsigned prim) { switch (prim) { @@ -427,8 +427,7 @@ nvc0_prim_gl(unsigned prim) NVC0_PRIM_GL_CASE(LINE_STRIP_ADJACENCY); NVC0_PRIM_GL_CASE(TRIANGLES_ADJACENCY); NVC0_PRIM_GL_CASE(TRIANGLE_STRIP_ADJACENCY); - /* - NVC0_PRIM_GL_CASE(PATCHES); */ + NVC0_PRIM_GL_CASE(PATCHES); default: return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_POINTS; } @@ -483,7 +482,7 @@ nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info) struct pipe_context *pipe = &nvc0->base.pipe; struct nvc0_so_target *targ; targ = nvc0_so_target(info->count_from_stream_output); - pipe->get_query_result(pipe, targ->pq, TRUE, (void *)&vert_count); + pipe->get_query_result(pipe, targ->pq, true, (void *)&vert_count); vert_count /= targ->stride; } ctx.idxbuf = NULL; /* shut up warnings */ @@ -560,7 +559,7 @@ nvc0_push_vbo(struct nvc0_context *nvc0, const 
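The loop bodies of the prim_restart_search_i08/i16/i32 helpers above are elided by the diff context; each scans forward for the restart index and returns how many indices can be pushed before hitting it. A plausible reconstruction of the 8-bit variant (sketch; the committed body may differ in style):

    #include <stdint.h>

    /* Count leading indices that are not the primitive-restart index. */
    static inline unsigned
    prim_restart_search_i08(const uint8_t *elts, unsigned push, uint8_t index)
    {
       unsigned i;
       for (i = 0; i < push && elts[i] != index; ++i)
          ;
       return i;
    }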
struct pipe_draw_info *info) NOUVEAU_DRV_STAT(&nvc0->screen->base, draw_calls_fallback_count, 1); } -static INLINE void +static inline void copy_indices_u8(uint32_t *dst, const uint8_t *elts, uint32_t bias, unsigned n) { unsigned i; @@ -568,7 +567,7 @@ copy_indices_u8(uint32_t *dst, const uint8_t *elts, uint32_t bias, unsigned n) dst[i] = elts[i] + bias; } -static INLINE void +static inline void copy_indices_u16(uint32_t *dst, const uint16_t *elts, uint32_t bias, unsigned n) { unsigned i; @@ -576,7 +575,7 @@ copy_indices_u16(uint32_t *dst, const uint16_t *elts, uint32_t bias, unsigned n) dst[i] = elts[i] + bias; } -static INLINE void +static inline void copy_indices_u32(uint32_t *dst, const uint32_t *elts, uint32_t bias, unsigned n) { unsigned i; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h b/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h index 725e889683f..4ea8ca3cfa2 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h @@ -15,14 +15,14 @@ #endif -static INLINE void +static inline void nv50_add_bufctx_resident_bo(struct nouveau_bufctx *bufctx, int bin, unsigned flags, struct nouveau_bo *bo) { nouveau_bufctx_refn(bufctx, bin, bo, flags)->priv = NULL; } -static INLINE void +static inline void nvc0_add_resident(struct nouveau_bufctx *bufctx, int bin, struct nv04_resource *res, unsigned flags) { @@ -38,7 +38,7 @@ nvc0_add_resident(struct nouveau_bufctx *bufctx, int bin, #define BCTX_REFN(bctx, bin, res, acc) \ nvc0_add_resident(bctx, NVC0_BIND_##bin, res, NOUVEAU_BO_##acc) -static INLINE void +static inline void PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags) { struct nouveau_pushbuf_refn ref = { bo, flags }; @@ -69,46 +69,46 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags) #define NVC0_3D_SERIALIZE NV50_GRAPH_SERIALIZE -static INLINE uint32_t +static inline uint32_t NVC0_FIFO_PKHDR_SQ(int subc, int mthd, unsigned size) { return 0x20000000 | (size << 16) | (subc << 13) | (mthd >> 2); } -static INLINE uint32_t +static inline uint32_t NVC0_FIFO_PKHDR_NI(int subc, int mthd, unsigned size) { return 0x60000000 | (size << 16) | (subc << 13) | (mthd >> 2); } -static INLINE uint32_t +static inline uint32_t NVC0_FIFO_PKHDR_IL(int subc, int mthd, uint16_t data) { assert(data < 0x2000); return 0x80000000 | (data << 16) | (subc << 13) | (mthd >> 2); } -static INLINE uint32_t +static inline uint32_t NVC0_FIFO_PKHDR_1I(int subc, int mthd, unsigned size) { return 0xa0000000 | (size << 16) | (subc << 13) | (mthd >> 2); } -static INLINE uint8_t +static inline uint8_t nouveau_bo_memtype(const struct nouveau_bo *bo) { return bo->config.nvc0.memtype; } -static INLINE void +static inline void PUSH_DATAh(struct nouveau_pushbuf *push, uint64_t data) { *push->cur++ = (uint32_t)(data >> 32); } -static INLINE void +static inline void BEGIN_NVC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) { #ifndef NVC0_PUSH_EXPLICIT_SPACE_CHECKING @@ -117,7 +117,7 @@ BEGIN_NVC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) PUSH_DATA (push, NVC0_FIFO_PKHDR_SQ(subc, mthd, size)); } -static INLINE void +static inline void BEGIN_NIC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) { #ifndef NVC0_PUSH_EXPLICIT_SPACE_CHECKING @@ -126,7 +126,7 @@ BEGIN_NIC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) PUSH_DATA (push, NVC0_FIFO_PKHDR_NI(subc, mthd, size)); } -static INLINE void +static inline void BEGIN_1IC0(struct 
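The NVC0_FIFO_PKHDR_* helpers above pack a Fermi FIFO command header as mode | (size << 16) | (subchannel << 13) | (method >> 2); method offsets are stored in 32-bit-word units, hence the shift. A worked example (subchannel 0 and method 0x1234 are made-up values for illustration):

    #include <assert.h>
    #include <stdint.h>

    static uint32_t
    pkhdr_sq(int subc, int mthd, unsigned size)
    {
       /* 0x20000000 selects the "increasing methods" packet mode. */
       return 0x20000000 | (size << 16) | (subc << 13) | (mthd >> 2);
    }

    int
    main(void)
    {
       /* Header for 2 data words at method 0x1234 on subchannel 0. */
       assert(pkhdr_sq(0, 0x1234, 2) == 0x2002048d);
       return 0;
    }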
nouveau_pushbuf *push, int subc, int mthd, unsigned size) { #ifndef NVC0_PUSH_EXPLICIT_SPACE_CHECKING @@ -135,7 +135,7 @@ BEGIN_1IC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) PUSH_DATA (push, NVC0_FIFO_PKHDR_1I(subc, mthd, size)); } -static INLINE void +static inline void IMMED_NVC0(struct nouveau_pushbuf *push, int subc, int mthd, uint16_t data) { #ifndef NVC0_PUSH_EXPLICIT_SPACE_CHECKING diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c index fce02a7cc57..d3e5676873e 100644 --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c @@ -250,7 +250,7 @@ nve4_compute_validate_surfaces(struct nvc0_context *nvc0) static void nve4_compute_validate_samplers(struct nvc0_context *nvc0) { - boolean need_flush = nve4_validate_tsc(nvc0, 5); + bool need_flush = nve4_validate_tsc(nvc0, 5); if (need_flush) { BEGIN_NVC0(nvc0->base.pushbuf, NVE4_COMPUTE(TSC_FLUSH), 1); PUSH_DATA (nvc0->base.pushbuf, 0); @@ -299,11 +299,11 @@ nve4_compute_set_tex_handles(struct nvc0_context *nvc0) } -static boolean +static bool nve4_compute_state_validate(struct nvc0_context *nvc0) { if (!nvc0_compute_validate_program(nvc0)) - return FALSE; + return false; if (nvc0->dirty_cp & NVC0_NEW_CP_TEXTURES) nve4_compute_validate_textures(nvc0); if (nvc0->dirty_cp & NVC0_NEW_CP_SAMPLERS) @@ -316,15 +316,15 @@ nve4_compute_state_validate(struct nvc0_context *nvc0) nvc0_validate_global_residents(nvc0, nvc0->bufctx_cp, NVC0_BIND_CP_GLOBAL); - nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, FALSE); + nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, false); nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_cp); if (unlikely(nouveau_pushbuf_validate(nvc0->base.pushbuf))) - return FALSE; + return false; if (unlikely(nvc0->state.flushed)) - nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, TRUE); + nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true); - return TRUE; + return true; } @@ -364,7 +364,7 @@ nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input, PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB); } -static INLINE uint8_t +static inline uint8_t nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size) { if (shared_size > (32 << 10)) @@ -413,7 +413,7 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0, nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, NVE4_CP_INPUT_SIZE); } -static INLINE struct nve4_cp_launch_desc * +static inline struct nve4_cp_launch_desc * nve4_compute_alloc_launch_desc(struct nouveau_context *nv, struct nouveau_bo **pbo, uint64_t *pgpuaddr) { @@ -505,7 +505,7 @@ nve4_compute_validate_textures(struct nvc0_context *nvc0) for (i = 0; i < nvc0->num_textures[s]; ++i) { struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]); struct nv04_resource *res; - const boolean dirty = !!(nvc0->textures_dirty[s] & (1 << i)); + const bool dirty = !!(nvc0->textures_dirty[s] & (1 << i)); if (!tic) { nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID; @@ -575,18 +575,18 @@ nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc) { const uint32_t *data = (const uint32_t *)desc; unsigned i; - boolean zero = FALSE; + bool zero = false; debug_printf("COMPUTE LAUNCH DESCRIPTOR:\n"); for (i = 0; i < sizeof(*desc); i += 4) { if (data[i / 4]) { debug_printf("[%x]: 0x%08x\n", i, data[i / 4]); - zero = FALSE; + zero = false; } else if (!zero) { debug_printf("...\n"); - zero = TRUE; + zero = true; } } @@ -606,7 +606,7 @@ nve4_compute_dump_launch_desc(const struct 
nve4_cp_launch_desc *desc) for (i = 0; i < 8; ++i) { uint64_t address; uint32_t size = desc->cb[i].size; - boolean valid = !!(desc->cb_mask & (1 << i)); + bool valid = !!(desc->cb_mask & (1 << i)); address = ((uint64_t)desc->cb[i].address_h << 32) | desc->cb[i].address_l; diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h index 4d7af54d860..7364a68a579 100644 --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h @@ -68,7 +68,7 @@ struct nve4_cp_launch_desc u32 unk48[16]; }; -static INLINE void +static inline void nve4_cp_launch_desc_init_default(struct nve4_cp_launch_desc *desc) { memset(desc, 0, sizeof(*desc)); @@ -78,7 +78,7 @@ nve4_cp_launch_desc_init_default(struct nve4_cp_launch_desc *desc) desc->unk47_20 = 0x300; } -static INLINE void +static inline void nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc, unsigned index, struct nouveau_bo *bo, @@ -96,7 +96,7 @@ nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc, desc->cb_mask |= 1 << index; } -static INLINE void +static inline void nve4_cp_launch_desc_set_ctx_cb(struct nve4_cp_launch_desc *desc, unsigned index, const struct nvc0_constbuf *cb) diff --git a/src/gallium/drivers/r300/Makefile.am b/src/gallium/drivers/r300/Makefile.am index dd1a5ede19b..081f332683e 100644 --- a/src/gallium/drivers/r300/Makefile.am +++ b/src/gallium/drivers/r300/Makefile.am @@ -1,5 +1,3 @@ -AUTOMAKE_OPTIONS = subdir-objects - include Makefile.sources include $(top_srcdir)/src/gallium/Automake.inc diff --git a/src/gallium/drivers/r300/r300_blit.c b/src/gallium/drivers/r300/r300_blit.c index baf05cea965..6ea8f24cc14 100644 --- a/src/gallium/drivers/r300/r300_blit.c +++ b/src/gallium/drivers/r300/r300_blit.c @@ -382,7 +382,7 @@ static void r300_clear(struct pipe_context* pipe, r300_get_num_cs_end_dwords(r300); /* Reserve CS space. */ - if (dwords > (RADEON_MAX_CMDBUF_DWORDS - r300->cs->cdw)) { + if (dwords > (r300->cs->max_dw - r300->cs->cdw)) { r300_flush(&r300->context, RADEON_FLUSH_ASYNC, NULL); } diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c index c35aa3b24aa..8c24ad6d98a 100644 --- a/src/gallium/drivers/r300/r300_context.c +++ b/src/gallium/drivers/r300/r300_context.c @@ -94,6 +94,8 @@ static void r300_destroy_context(struct pipe_context* context) if (r300->cs) r300->rws->cs_destroy(r300->cs); + if (r300->ctx) + r300->rws->ctx_destroy(r300->ctx); rc_destroy_regalloc_state(&r300->fs_regalloc_state); @@ -382,7 +384,11 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen, sizeof(struct pipe_transfer), 64, UTIL_SLAB_SINGLETHREADED); - r300->cs = rws->cs_create(rws, RING_GFX, r300_flush_callback, r300, NULL); + r300->ctx = rws->ctx_create(rws); + if (!r300->ctx) + goto fail; + + r300->cs = rws->cs_create(r300->ctx, RING_GFX, r300_flush_callback, r300, NULL); if (r300->cs == NULL) goto fail; diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h index 3873c9a31c1..18ae11a3a24 100644 --- a/src/gallium/drivers/r300/r300_context.h +++ b/src/gallium/drivers/r300/r300_context.h @@ -449,6 +449,8 @@ struct r300_context { /* The interface to the windowing system, etc. */ struct radeon_winsys *rws; + /* The submission context. */ + struct radeon_winsys_ctx *ctx; /* The command stream. */ struct radeon_winsys_cs *cs; /* Screen. 
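The r300 hunks swap the compile-time RADEON_MAX_CMDBUF_DWORDS limit for the winsys-reported cs->max_dw, and create the command stream from a new per-context radeon_winsys_ctx. The reserve-or-flush pattern used in r300_blit.c above (and again in r300_render.c below), condensed into one sketch (r300_reserve_cs_space is an illustrative name):

    #include "r300_context.h"

    /* Flush if the command stream cannot take `dwords` more entries. */
    static void
    r300_reserve_cs_space(struct r300_context *r300, unsigned dwords)
    {
       if (dwords > r300->cs->max_dw - r300->cs->cdw)
          r300_flush(&r300->context, RADEON_FLUSH_ASYNC, NULL);
    }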
*/ @@ -647,32 +649,32 @@ struct r300_context { for (atom = r300->first_dirty; atom != r300->last_dirty; atom++) /* Convenience cast wrappers. */ -static INLINE struct r300_query* r300_query(struct pipe_query* q) +static inline struct r300_query* r300_query(struct pipe_query* q) { return (struct r300_query*)q; } -static INLINE struct r300_surface* r300_surface(struct pipe_surface* surf) +static inline struct r300_surface* r300_surface(struct pipe_surface* surf) { return (struct r300_surface*)surf; } -static INLINE struct r300_resource* r300_resource(struct pipe_resource* tex) +static inline struct r300_resource* r300_resource(struct pipe_resource* tex) { return (struct r300_resource*)tex; } -static INLINE struct r300_context* r300_context(struct pipe_context* context) +static inline struct r300_context* r300_context(struct pipe_context* context) { return (struct r300_context*)context; } -static INLINE struct r300_fragment_shader *r300_fs(struct r300_context *r300) +static inline struct r300_fragment_shader *r300_fs(struct r300_context *r300) { return (struct r300_fragment_shader*)r300->fs.state; } -static INLINE void r300_mark_atom_dirty(struct r300_context *r300, +static inline void r300_mark_atom_dirty(struct r300_context *r300, struct r300_atom *atom) { atom->dirty = TRUE; @@ -688,7 +690,7 @@ static INLINE void r300_mark_atom_dirty(struct r300_context *r300, } } -static INLINE struct pipe_surface * +static inline struct pipe_surface * r300_get_nonnull_cb(struct pipe_framebuffer_state *fb, unsigned i) { if (fb->cbufs[i]) @@ -777,12 +779,12 @@ void r300_update_derived_state(struct r300_context* r300); void r500_dump_rs_block(struct r300_rs_block *rs); -static INLINE boolean CTX_DBG_ON(struct r300_context * ctx, unsigned flags) +static inline boolean CTX_DBG_ON(struct r300_context * ctx, unsigned flags) { return SCREEN_DBG_ON(ctx->screen, flags); } -static INLINE void CTX_DBG(struct r300_context * ctx, unsigned flags, +static inline void CTX_DBG(struct r300_context * ctx, unsigned flags, const char * fmt, ...) { if (CTX_DBG_ON(ctx, flags)) { diff --git a/src/gallium/drivers/r300/r300_cs.h b/src/gallium/drivers/r300/r300_cs.h index 37f9641ab3e..fc150542d4b 100644 --- a/src/gallium/drivers/r300/r300_cs.h +++ b/src/gallium/drivers/r300/r300_cs.h @@ -46,7 +46,7 @@ #ifdef DEBUG #define BEGIN_CS(size) do { \ - assert(size <= (RADEON_MAX_CMDBUF_DWORDS - cs_copy->cdw)); \ + assert(size <= (cs_copy->max_dw - cs_copy->cdw)); \ cs_count = size; \ } while (0) diff --git a/src/gallium/drivers/r300/r300_fs.h b/src/gallium/drivers/r300/r300_fs.h index 39eb73da65d..b39624dad5f 100644 --- a/src/gallium/drivers/r300/r300_fs.h +++ b/src/gallium/drivers/r300/r300_fs.h @@ -77,14 +77,14 @@ void r300_shader_read_fs_inputs(struct tgsi_shader_info* info, /* Return TRUE if the shader was switched and should be re-emitted. */ boolean r300_pick_fragment_shader(struct r300_context* r300); -static INLINE boolean r300_fragment_shader_writes_depth(struct r300_fragment_shader *fs) +static inline boolean r300_fragment_shader_writes_depth(struct r300_fragment_shader *fs) { if (!fs) return FALSE; return (fs->shader->code.writes_depth) ? 
TRUE : FALSE; } -static INLINE boolean r300_fragment_shader_writes_all(struct r300_fragment_shader *fs) +static inline boolean r300_fragment_shader_writes_all(struct r300_fragment_shader *fs) { if (!fs) return FALSE; diff --git a/src/gallium/drivers/r300/r300_query.c b/src/gallium/drivers/r300/r300_query.c index 01b83b87fcf..4dd8156f616 100644 --- a/src/gallium/drivers/r300/r300_query.c +++ b/src/gallium/drivers/r300/r300_query.c @@ -146,10 +146,11 @@ static boolean r300_get_query_result(struct pipe_context* pipe, if (q->type == PIPE_QUERY_GPU_FINISHED) { if (wait) { - r300->rws->buffer_wait(q->buf, RADEON_USAGE_READWRITE); + r300->rws->buffer_wait(q->buf, PIPE_TIMEOUT_INFINITE, + RADEON_USAGE_READWRITE); vresult->b = TRUE; } else { - vresult->b = !r300->rws->buffer_is_busy(q->buf, RADEON_USAGE_READWRITE); + vresult->b = r300->rws->buffer_wait(q->buf, 0, RADEON_USAGE_READWRITE); } return vresult->b; } @@ -168,8 +169,6 @@ static boolean r300_get_query_result(struct pipe_context* pipe, map++; } - r300->rws->buffer_unmap(q->cs_buf); - if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE) { vresult->b = temp != 0; } else { diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c index 4c951d14f10..0487b11e775 100644 --- a/src/gallium/drivers/r300/r300_render.c +++ b/src/gallium/drivers/r300/r300_render.c @@ -215,7 +215,7 @@ static boolean r300_reserve_cs_dwords(struct r300_context *r300, cs_dwords += r300_get_num_cs_end_dwords(r300); /* Reserve requested CS space. */ - if (cs_dwords > (RADEON_MAX_CMDBUF_DWORDS - r300->cs->cdw)) { + if (cs_dwords > (r300->cs->max_dw - r300->cs->cdw)) { r300_flush(&r300->context, RADEON_FLUSH_ASYNC, NULL); flushed = TRUE; } @@ -871,7 +871,7 @@ struct r300_render { uint8_t *vbo_ptr; }; -static INLINE struct r300_render* +static inline struct r300_render* r300_render(struct vbuf_render* render) { return (struct r300_render*)render; diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c index a7bca915f57..4ca0b268bde 100644 --- a/src/gallium/drivers/r300/r300_screen.c +++ b/src/gallium/drivers/r300/r300_screen.c @@ -191,6 +191,10 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 0; /* SWTCL-only features. 
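
[Note: the r300_query change above is part of a winsys interface cleanup: the separate buffer_is_busy() entry point is folded into buffer_wait(), whose timeout argument selects between a non-blocking poll and a blocking wait. A sketch of the two call sites, assuming buffer_wait() returns true once the buffer is idle within the timeout:]

    if (wait) {
        /* Block until the GPU is done with the buffer. */
        r300->rws->buffer_wait(q->buf, PIPE_TIMEOUT_INFINITE,
                               RADEON_USAGE_READWRITE);
        vresult->b = TRUE;
    } else {
        /* Timeout 0 is a pure poll: true iff the buffer is idle
         * right now, i.e. the inverse of the old buffer_is_busy(). */
        vresult->b = r300->rws->buffer_wait(q->buf, 0,
                                            RADEON_USAGE_READWRITE);
    }
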
*/ @@ -427,7 +431,7 @@ static int r300_get_video_param(struct pipe_screen *screen, * Whether the format matches: * PIPE_FORMAT_?10?10?10?2_UNORM */ -static INLINE boolean +static inline boolean util_format_is_rgba1010102_variant(const struct util_format_description *desc) { static const unsigned size[4] = {10, 10, 10, 2}; @@ -660,14 +664,6 @@ static void r300_fence_reference(struct pipe_screen *screen, rws->fence_reference(ptr, fence); } -static boolean r300_fence_signalled(struct pipe_screen *screen, - struct pipe_fence_handle *fence) -{ - struct radeon_winsys *rws = r300_screen(screen)->rws; - - return rws->fence_wait(rws, fence, 0); -} - static boolean r300_fence_finish(struct pipe_screen *screen, struct pipe_fence_handle *fence, uint64_t timeout) @@ -712,7 +708,6 @@ struct pipe_screen* r300_screen_create(struct radeon_winsys *rws) r300screen->screen.is_video_format_supported = vl_video_buffer_is_format_supported; r300screen->screen.context_create = r300_create_context; r300screen->screen.fence_reference = r300_fence_reference; - r300screen->screen.fence_signalled = r300_fence_signalled; r300screen->screen.fence_finish = r300_fence_finish; r300_init_screen_resource_functions(r300screen); diff --git a/src/gallium/drivers/r300/r300_screen.h b/src/gallium/drivers/r300/r300_screen.h index 7bba39bf12b..e15c3c7de0c 100644 --- a/src/gallium/drivers/r300/r300_screen.h +++ b/src/gallium/drivers/r300/r300_screen.h @@ -51,11 +51,11 @@ struct r300_screen { /* Convenience cast wrappers. */ -static INLINE struct r300_screen* r300_screen(struct pipe_screen* screen) { +static inline struct r300_screen* r300_screen(struct pipe_screen* screen) { return (struct r300_screen*)screen; } -static INLINE struct radeon_winsys * +static inline struct radeon_winsys * radeon_winsys(struct pipe_screen *screen) { return r300_screen(screen)->rws; } @@ -102,12 +102,12 @@ radeon_winsys(struct pipe_screen *screen) { #define DBG_P_STAT (1 << 25) /*@}*/ -static INLINE boolean SCREEN_DBG_ON(struct r300_screen * screen, unsigned flags) +static inline boolean SCREEN_DBG_ON(struct r300_screen * screen, unsigned flags) { return (screen->debug & flags) ? TRUE : FALSE; } -static INLINE void SCREEN_DBG(struct r300_screen * screen, unsigned flags, +static inline void SCREEN_DBG(struct r300_screen * screen, unsigned flags, const char * fmt, ...) { if (SCREEN_DBG_ON(screen, flags)) { diff --git a/src/gallium/drivers/r300/r300_screen_buffer.c b/src/gallium/drivers/r300/r300_screen_buffer.c index de557b57776..6451a2c8df2 100644 --- a/src/gallium/drivers/r300/r300_screen_buffer.c +++ b/src/gallium/drivers/r300/r300_screen_buffer.c @@ -96,7 +96,7 @@ r300_buffer_transfer_map( struct pipe_context *context, /* Check if mapping this buffer would cause waiting for the GPU. */ if (r300->rws->cs_is_buffer_referenced(r300->cs, rbuf->cs_buf, RADEON_USAGE_READWRITE) || - r300->rws->buffer_is_busy(rbuf->buf, RADEON_USAGE_READWRITE)) { + !r300->rws->buffer_wait(rbuf->buf, 0, RADEON_USAGE_READWRITE)) { unsigned i; struct pb_buffer *new_buf; diff --git a/src/gallium/drivers/r300/r300_screen_buffer.h b/src/gallium/drivers/r300/r300_screen_buffer.h index b4c8520039b..14b849c8c93 100644 --- a/src/gallium/drivers/r300/r300_screen_buffer.h +++ b/src/gallium/drivers/r300/r300_screen_buffer.h @@ -46,7 +46,7 @@ struct pipe_resource *r300_buffer_create(struct pipe_screen *screen, /* Inline functions. 
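
[Note: removing r300_fence_signalled() above loses no functionality, since callers can get a non-blocking answer from fence_finish() with a zero timeout. A hypothetical helper, not driver code, assuming the (screen, fence, timeout) signature shown in this diff:]

    static boolean fence_is_signalled(struct pipe_screen *screen,
                                      struct pipe_fence_handle *fence)
    {
        /* Timeout 0 turns the blocking wait into a poll, which is
         * exactly what the deleted fence_signalled hook did. */
        return screen->fence_finish(screen, fence, 0);
    }
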
*/ -static INLINE struct r300_buffer *r300_buffer(struct pipe_resource *buffer) +static inline struct r300_buffer *r300_buffer(struct pipe_resource *buffer) { return (struct r300_buffer *)buffer; } diff --git a/src/gallium/drivers/r300/r300_shader_semantics.h b/src/gallium/drivers/r300/r300_shader_semantics.h index b756048c6c7..93bbc9d4a96 100644 --- a/src/gallium/drivers/r300/r300_shader_semantics.h +++ b/src/gallium/drivers/r300/r300_shader_semantics.h @@ -46,7 +46,7 @@ struct r300_shader_semantics { int num_generic; }; -static INLINE void r300_shader_semantics_reset( +static inline void r300_shader_semantics_reset( struct r300_shader_semantics* info) { int i; diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c index e886df87a60..d99d5ae0152 100644 --- a/src/gallium/drivers/r300/r300_state.c +++ b/src/gallium/drivers/r300/r300_state.c @@ -844,7 +844,7 @@ static void r300_tex_set_tiling_flags(struct r300_context *r300, tex->tex.macrotile[level]) { r300->rws->buffer_set_tiling(tex->buf, r300->cs, tex->tex.microtile, tex->tex.macrotile[level], - 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, tex->tex.stride_in_bytes[0], false); tex->surface_level = level; diff --git a/src/gallium/drivers/r300/r300_state_inlines.h b/src/gallium/drivers/r300/r300_state_inlines.h index feec494c4dc..fbd91cda9fe 100644 --- a/src/gallium/drivers/r300/r300_state_inlines.h +++ b/src/gallium/drivers/r300/r300_state_inlines.h @@ -32,13 +32,13 @@ /* Some maths. These should probably find their way to u_math, if needed. */ -static INLINE int pack_float_16_6x(float f) { +static inline int pack_float_16_6x(float f) { return ((int)(f * 6.0) & 0xffff); } /* Blend state. */ -static INLINE uint32_t r300_translate_blend_function(int blend_func, +static inline uint32_t r300_translate_blend_function(int blend_func, boolean clamp) { switch (blend_func) { @@ -60,7 +60,7 @@ static INLINE uint32_t r300_translate_blend_function(int blend_func, return 0; } -static INLINE uint32_t r300_translate_blend_factor(int blend_fact) +static inline uint32_t r300_translate_blend_factor(int blend_fact) { switch (blend_fact) { case PIPE_BLENDFACTOR_ONE: @@ -113,7 +113,7 @@ static INLINE uint32_t r300_translate_blend_factor(int blend_fact) /* DSA state. */ -static INLINE uint32_t r300_translate_depth_stencil_function(int zs_func) +static inline uint32_t r300_translate_depth_stencil_function(int zs_func) { switch (zs_func) { case PIPE_FUNC_NEVER: @@ -141,7 +141,7 @@ static INLINE uint32_t r300_translate_depth_stencil_function(int zs_func) return 0; } -static INLINE uint32_t r300_translate_stencil_op(int s_op) +static inline uint32_t r300_translate_stencil_op(int s_op) { switch (s_op) { case PIPE_STENCIL_OP_KEEP: @@ -168,7 +168,7 @@ static INLINE uint32_t r300_translate_stencil_op(int s_op) return 0; } -static INLINE uint32_t r300_translate_alpha_function(int alpha_func) +static inline uint32_t r300_translate_alpha_function(int alpha_func) { switch (alpha_func) { case PIPE_FUNC_NEVER: @@ -195,7 +195,7 @@ static INLINE uint32_t r300_translate_alpha_function(int alpha_func) return 0; } -static INLINE uint32_t +static inline uint32_t r300_translate_polygon_mode_front(unsigned mode) { switch (mode) { @@ -213,7 +213,7 @@ r300_translate_polygon_mode_front(unsigned mode) { } } -static INLINE uint32_t +static inline uint32_t r300_translate_polygon_mode_back(unsigned mode) { switch (mode) { @@ -233,7 +233,7 @@ r300_translate_polygon_mode_back(unsigned mode) { /* Texture sampler state. 
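
[Note: pack_float_16_6x() above converts a float into a 16-bit fixed-point field, apparently in sixth-of-a-unit steps given the * 6.0 scale in its body. A worked check of the arithmetic:]

    /* 2.5 units * 6 = 15 -> 0x000F; the & 0xffff keeps the result
     * within the 16-bit register field. */
    assert(pack_float_16_6x(2.5f) == 15);
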
*/ -static INLINE uint32_t r300_translate_wrap(int wrap) +static inline uint32_t r300_translate_wrap(int wrap) { switch (wrap) { case PIPE_TEX_WRAP_REPEAT: @@ -259,7 +259,7 @@ static INLINE uint32_t r300_translate_wrap(int wrap) } } -static INLINE uint32_t r300_translate_tex_filters(int min, int mag, int mip, +static inline uint32_t r300_translate_tex_filters(int min, int mag, int mip, boolean is_anisotropic) { uint32_t retval = 0; @@ -308,7 +308,7 @@ static INLINE uint32_t r300_translate_tex_filters(int min, int mag, int mip, return retval; } -static INLINE uint32_t r300_anisotropy(unsigned max_aniso) +static inline uint32_t r300_anisotropy(unsigned max_aniso) { if (max_aniso >= 16) { return R300_TX_MAX_ANISO_16_TO_1; @@ -323,7 +323,7 @@ static INLINE uint32_t r300_anisotropy(unsigned max_aniso) } } -static INLINE uint32_t r500_anisotropy(unsigned max_aniso) +static inline uint32_t r500_anisotropy(unsigned max_aniso) { if (!max_aniso) { return 0; @@ -336,7 +336,7 @@ static INLINE uint32_t r500_anisotropy(unsigned max_aniso) } /* Translate pipe_formats into PSC vertex types. */ -static INLINE uint16_t +static inline uint16_t r300_translate_vertex_data_type(enum pipe_format format) { uint32_t result = 0; const struct util_format_description *desc; @@ -410,7 +410,7 @@ r300_translate_vertex_data_type(enum pipe_format format) { return result; } -static INLINE uint16_t +static inline uint16_t r300_translate_vertex_data_swizzle(enum pipe_format format) { const struct util_format_description *desc; unsigned i, swizzle = 0; diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c index 6c01c0d21e4..5e4d50df27d 100644 --- a/src/gallium/drivers/r300/r300_texture.c +++ b/src/gallium/drivers/r300/r300_texture.c @@ -1063,7 +1063,7 @@ r300_texture_create_object(struct r300_screen *rscreen, rws->buffer_set_tiling(tex->buf, NULL, tex->tex.microtile, tex->tex.macrotile[0], - 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, tex->tex.stride_in_bytes[0], false); return tex; diff --git a/src/gallium/drivers/r300/r300_transfer.c b/src/gallium/drivers/r300/r300_transfer.c index b87164ba836..44303792f51 100644 --- a/src/gallium/drivers/r300/r300_transfer.c +++ b/src/gallium/drivers/r300/r300_transfer.c @@ -41,7 +41,7 @@ struct r300_transfer { }; /* Convenience cast wrapper. 
*/ -static INLINE struct r300_transfer* +static inline struct r300_transfer* r300_transfer(struct pipe_transfer* transfer) { return (struct r300_transfer*)transfer; @@ -120,7 +120,7 @@ r300_texture_transfer_map(struct pipe_context *ctx, referenced_hw = TRUE; } else { referenced_hw = - r300->rws->buffer_is_busy(tex->buf, RADEON_USAGE_READWRITE); + !r300->rws->buffer_wait(tex->buf, 0, RADEON_USAGE_READWRITE); } trans = CALLOC_STRUCT(r300_transfer); @@ -251,16 +251,12 @@ void r300_texture_transfer_unmap(struct pipe_context *ctx, struct r300_resource *tex = r300_resource(transfer->resource); if (trans->linear_texture) { - rws->buffer_unmap(trans->linear_texture->cs_buf); - if (transfer->usage & PIPE_TRANSFER_WRITE) { r300_copy_into_tiled_texture(ctx, trans); } pipe_resource_reference( (struct pipe_resource**)&trans->linear_texture, NULL); - } else { - rws->buffer_unmap(tex->cs_buf); } FREE(transfer); } diff --git a/src/gallium/drivers/r600/Makefile.am b/src/gallium/drivers/r600/Makefile.am index dc0d90d759b..8317da727a2 100644 --- a/src/gallium/drivers/r600/Makefile.am +++ b/src/gallium/drivers/r600/Makefile.am @@ -1,5 +1,3 @@ -AUTOMAKE_OPTIONS = subdir-objects - include Makefile.sources include $(top_srcdir)/src/gallium/Automake.inc diff --git a/src/gallium/drivers/r600/eg_asm.c b/src/gallium/drivers/r600/eg_asm.c index 295cb4d80b7..42e8b0b1761 100644 --- a/src/gallium/drivers/r600/eg_asm.c +++ b/src/gallium/drivers/r600/eg_asm.c @@ -160,6 +160,9 @@ int egcm_load_index_reg(struct r600_bytecode *bc, unsigned id, bool inside_alu_c alu.op = ALU_OP1_MOVA_INT; alu.src[0].sel = bc->index_reg[id]; alu.src[0].chan = 0; + if (bc->chip_class == CAYMAN) + alu.dst.sel = id == 0 ? CM_V_SQ_MOVA_DST_CF_IDX0 : CM_V_SQ_MOVA_DST_CF_IDX1; + alu.last = 1; r = r600_bytecode_add_alu(bc, &alu); if (r) @@ -167,12 +170,14 @@ int egcm_load_index_reg(struct r600_bytecode *bc, unsigned id, bool inside_alu_c bc->ar_loaded = 0; /* clobbered */ - memset(&alu, 0, sizeof(alu)); - alu.op = id == 0 ? ALU_OP0_SET_CF_IDX0 : ALU_OP0_SET_CF_IDX1; - alu.last = 1; - r = r600_bytecode_add_alu(bc, &alu); - if (r) - return r; + if (bc->chip_class == EVERGREEN) { + memset(&alu, 0, sizeof(alu)); + alu.op = id == 0 ? 
ALU_OP0_SET_CF_IDX0 : ALU_OP0_SET_CF_IDX1; + alu.last = 1; + r = r600_bytecode_add_alu(bc, &alu); + if (r) + return r; + } /* Must split ALU group as index only applies to following group */ if (inside_alu_clause) { diff --git a/src/gallium/drivers/r600/eg_sq.h b/src/gallium/drivers/r600/eg_sq.h index b534872f062..97e230f56c7 100644 --- a/src/gallium/drivers/r600/eg_sq.h +++ b/src/gallium/drivers/r600/eg_sq.h @@ -521,4 +521,11 @@ #define V_SQ_REL_ABSOLUTE 0 #define V_SQ_REL_RELATIVE 1 + +/* CAYMAN has special encoding for MOVA_INT destination */ +#define CM_V_SQ_MOVA_DST_AR_X 0 +#define CM_V_SQ_MOVA_DST_CF_PC 1 +#define CM_V_SQ_MOVA_DST_CF_IDX0 2 +#define CM_V_SQ_MOVA_DST_CF_IDX1 3 + #endif diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c index 4c3c34cd664..c52e43e9c2a 100644 --- a/src/gallium/drivers/r600/evergreen_compute.c +++ b/src/gallium/drivers/r600/evergreen_compute.c @@ -163,7 +163,7 @@ static void evergreen_cs_set_vertex_buffer( rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE; state->enabled_mask |= 1 << vb_index; state->dirty_mask |= 1 << vb_index; - state->atom.dirty = true; + r600_mark_atom_dirty(rctx, &state->atom); } static void evergreen_cs_set_constant_buffer( @@ -226,7 +226,7 @@ void *evergreen_create_compute_state( } #else memset(&shader->binary, 0, sizeof(shader->binary)); - radeon_elf_read(code, header->num_bytes, &shader->binary, true); + radeon_elf_read(code, header->num_bytes, &shader->binary); r600_create_shader(&shader->bc, &shader->binary, &use_kill); shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen, @@ -487,6 +487,12 @@ static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout, /* Emit constant buffer state */ r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom); + /* Emit sampler state */ + r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom); + + /* Emit sampler view (texture resource) state */ + r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom); + /* Emit compute shader state */ r600_emit_atom(ctx, &ctx->cs_shader_state.atom); @@ -655,25 +661,6 @@ static void evergreen_set_compute_resources(struct pipe_context * ctx_, } } -void evergreen_set_cs_sampler_view(struct pipe_context *ctx_, - unsigned start_slot, unsigned count, - struct pipe_sampler_view **views) -{ - struct r600_pipe_sampler_view **resource = - (struct r600_pipe_sampler_view **)views; - - for (unsigned i = 0; i < count; i++) { - if (resource[i]) { - assert(i+1 < 12); - /* XXX: Implement */ - assert(!"Compute samplers not implemented."); - ///FETCH0 = VTX0 (param buffer), - //FETCH1 = VTX1 (global buffer pool), FETCH2... 
= TEX - } - } -} - - static void evergreen_set_global_binding( struct pipe_context *ctx_, unsigned first, unsigned n, struct pipe_resource **resources, diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index 4ddbc0beba5..6a91d4709f4 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -32,7 +32,7 @@ #include "evergreen_compute.h" #include "util/u_math.h" -static INLINE unsigned evergreen_array_mode(unsigned mode) +static inline unsigned evergreen_array_mode(unsigned mode) { switch (mode) { case RADEON_SURF_MODE_LINEAR_ALIGNED: return V_028C70_ARRAY_LINEAR_ALIGNED; @@ -485,7 +485,7 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx, /* offset */ rs->offset_units = state->offset_units; - rs->offset_scale = state->offset_scale * 12.0f; + rs->offset_scale = state->offset_scale * 16.0f; rs->offset_enable = state->offset_point || state->offset_line || state->offset_tri; if (state->point_size_per_vertex) { @@ -896,7 +896,7 @@ static void evergreen_set_scissor_states(struct pipe_context *ctx, for (i = start_slot; i < start_slot + num_scissors; i++) { rctx->scissor[i].scissor = state[i - start_slot]; - rctx->scissor[i].atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->scissor[i].atom); } } @@ -1028,7 +1028,10 @@ void evergreen_init_color_surface(struct r600_context *rctx, macro_aspect = rtex->surface.mtilea; bankw = rtex->surface.bankw; bankh = rtex->surface.bankh; - fmask_bankh = rtex->fmask.bank_height; + if (rtex->fmask.size) + fmask_bankh = rtex->fmask.bank_height; + else + fmask_bankh = rtex->surface.bankh; tile_split = eg_tile_split(tile_split); macro_aspect = eg_macro_tile_aspect(macro_aspect); bankw = eg_bank_wh(bankw); @@ -1149,10 +1152,11 @@ void evergreen_init_color_surface(struct r600_context *rctx, surf->cb_color_attrib = color_attrib; if (rtex->fmask.size) { surf->cb_color_fmask = (base_offset + rtex->fmask.offset) >> 8; + surf->cb_color_fmask_slice = S_028C88_TILE_MAX(rtex->fmask.slice_tile_max); } else { surf->cb_color_fmask = surf->cb_color_base; + surf->cb_color_fmask_slice = S_028C88_TILE_MAX(slice); } - surf->cb_color_fmask_slice = S_028C88_TILE_MAX(rtex->fmask.slice_tile_max); surf->color_initialized = true; } @@ -1342,11 +1346,11 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx, if (rctx->alphatest_state.bypass != alphatest_bypass) { rctx->alphatest_state.bypass = alphatest_bypass; - rctx->alphatest_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom); } if (rctx->alphatest_state.cb0_export_16bpc != export_16bpc) { rctx->alphatest_state.cb0_export_16bpc = export_16bpc; - rctx->alphatest_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom); } } @@ -1362,28 +1366,28 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx, if (state->zsbuf->format != rctx->poly_offset_state.zs_format) { rctx->poly_offset_state.zs_format = state->zsbuf->format; - rctx->poly_offset_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->poly_offset_state.atom); } if (rctx->db_state.rsurf != surf) { rctx->db_state.rsurf = surf; - rctx->db_state.atom.dirty = true; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_state.atom); + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } } else if (rctx->db_state.rsurf) { rctx->db_state.rsurf = NULL; - rctx->db_state.atom.dirty = true; - rctx->db_misc_state.atom.dirty = true; + 
r600_mark_atom_dirty(rctx, &rctx->db_state.atom); + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } if (rctx->cb_misc_state.nr_cbufs != state->nr_cbufs) { rctx->cb_misc_state.nr_cbufs = state->nr_cbufs; - rctx->cb_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom); } if (state->nr_cbufs == 0 && rctx->alphatest_state.bypass) { rctx->alphatest_state.bypass = false; - rctx->alphatest_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom); } log_samples = util_logbase2(rctx->framebuffer.nr_samples); @@ -1392,7 +1396,7 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx, rctx->b.family == CHIP_RV770) && rctx->db_misc_state.log_samples != log_samples) { rctx->db_misc_state.log_samples = log_samples; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } @@ -1420,7 +1424,7 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx, rctx->framebuffer.atom.num_dw += 4; } - rctx->framebuffer.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->framebuffer.atom); r600_set_sample_locations_constant_buffer(rctx); } @@ -1434,7 +1438,7 @@ static void evergreen_set_min_samples(struct pipe_context *ctx, unsigned min_sam rctx->ps_iter_samples = min_samples; if (rctx->framebuffer.nr_samples > 1) { - rctx->framebuffer.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->framebuffer.atom); } } @@ -1732,10 +1736,10 @@ static void evergreen_emit_cb_misc_state(struct r600_context *rctx, struct r600_ r600_write_context_reg_seq(cs, R_028238_CB_TARGET_MASK, 2); radeon_emit(cs, a->blend_colormask & fb_colormask); /* R_028238_CB_TARGET_MASK */ - /* Always enable the first colorbuffer in CB_SHADER_MASK. This - * will assure that the alpha-test will work even if there is - * no colorbuffer bound. */ - radeon_emit(cs, 0xf | (a->dual_src_blend ? ps_colormask : 0) | fb_colormask); /* R_02823C_CB_SHADER_MASK */ + /* This must match the used export instructions exactly. + * Other values may lead to undefined behavior and hangs. + */ + radeon_emit(cs, ps_colormask); /* R_02823C_CB_SHADER_MASK */ } static void evergreen_emit_db_state(struct r600_context *rctx, struct r600_atom *atom) @@ -1980,7 +1984,7 @@ static void evergreen_emit_cs_constant_buffers(struct r600_context *rctx, struct static void evergreen_emit_sampler_views(struct r600_context *rctx, struct r600_samplerview_state *state, - unsigned resource_id_base) + unsigned resource_id_base, unsigned pkt_flags) { struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; uint32_t dirty_mask = state->dirty_mask; @@ -1993,7 +1997,7 @@ static void evergreen_emit_sampler_views(struct r600_context *rctx, rview = state->views[resource_index]; assert(rview); - radeon_emit(cs, PKT3(PKT3_SET_RESOURCE, 8, 0)); + radeon_emit(cs, PKT3(PKT3_SET_RESOURCE, 8, 0) | pkt_flags); radeon_emit(cs, (resource_id_base + resource_index) * 8); radeon_emit_array(cs, rview->tex_resource_words, 8); @@ -2002,11 +2006,11 @@ static void evergreen_emit_sampler_views(struct r600_context *rctx, rview->tex_resource->b.b.nr_samples > 1 ? 
RADEON_PRIO_SHADER_TEXTURE_MSAA : RADEON_PRIO_SHADER_TEXTURE_RO); - radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags); radeon_emit(cs, reloc); if (!rview->skip_mip_address_reloc) { - radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags); radeon_emit(cs, reloc); } } @@ -2015,23 +2019,33 @@ static void evergreen_emit_sampler_views(struct r600_context *rctx, static void evergreen_emit_vs_sampler_views(struct r600_context *rctx, struct r600_atom *atom) { - evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_VERTEX].views, 176 + R600_MAX_CONST_BUFFERS); + evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_VERTEX].views, + 176 + R600_MAX_CONST_BUFFERS, 0); } static void evergreen_emit_gs_sampler_views(struct r600_context *rctx, struct r600_atom *atom) { - evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY].views, 336 + R600_MAX_CONST_BUFFERS); + evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY].views, + 336 + R600_MAX_CONST_BUFFERS, 0); } static void evergreen_emit_ps_sampler_views(struct r600_context *rctx, struct r600_atom *atom) { - evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT].views, R600_MAX_CONST_BUFFERS); + evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT].views, + R600_MAX_CONST_BUFFERS, 0); +} + +static void evergreen_emit_cs_sampler_views(struct r600_context *rctx, struct r600_atom *atom) +{ + evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views, + 816 + 2, RADEON_CP_PACKET3_COMPUTE_MODE); } static void evergreen_emit_sampler_states(struct r600_context *rctx, struct r600_textures_info *texinfo, unsigned resource_id_base, - unsigned border_index_reg) + unsigned border_index_reg, + unsigned pkt_flags) { struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; uint32_t dirty_mask = texinfo->states.dirty_mask; @@ -2043,7 +2057,7 @@ static void evergreen_emit_sampler_states(struct r600_context *rctx, rstate = texinfo->states.states[i]; assert(rstate); - radeon_emit(cs, PKT3(PKT3_SET_SAMPLER, 3, 0)); + radeon_emit(cs, PKT3(PKT3_SET_SAMPLER, 3, 0) | pkt_flags); radeon_emit(cs, (resource_id_base + i) * 3); radeon_emit_array(cs, rstate->tex_sampler_words, 3); @@ -2058,17 +2072,27 @@ static void evergreen_emit_sampler_states(struct r600_context *rctx, static void evergreen_emit_vs_sampler_states(struct r600_context *rctx, struct r600_atom *atom) { - evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_VERTEX], 18, R_00A414_TD_VS_SAMPLER0_BORDER_INDEX); + evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_VERTEX], 18, + R_00A414_TD_VS_SAMPLER0_BORDER_INDEX, 0); } static void evergreen_emit_gs_sampler_states(struct r600_context *rctx, struct r600_atom *atom) { - evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY], 36, R_00A428_TD_GS_SAMPLER0_BORDER_INDEX); + evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY], 36, + R_00A428_TD_GS_SAMPLER0_BORDER_INDEX, 0); } static void evergreen_emit_ps_sampler_states(struct r600_context *rctx, struct r600_atom *atom) { - evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT], 0, R_00A400_TD_PS_SAMPLER0_BORDER_INDEX); + evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT], 0, + R_00A400_TD_PS_SAMPLER0_BORDER_INDEX, 0); +} + +static void evergreen_emit_cs_sampler_states(struct r600_context *rctx, struct r600_atom *atom) +{ + 
evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE], 90, + R_00A464_TD_CS_SAMPLER0_BORDER_INDEX, + RADEON_CP_PACKET3_COMPUTE_MODE); } static void evergreen_emit_sample_mask(struct r600_context *rctx, struct r600_atom *a) @@ -3176,7 +3200,7 @@ void evergreen_update_db_shader_control(struct r600_context * rctx) if (db_shader_control != rctx->db_misc_state.db_shader_control) { rctx->db_misc_state.db_shader_control = db_shader_control; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } } @@ -3431,12 +3455,14 @@ void evergreen_init_state_functions(struct r600_context *rctx) r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_VERTEX].states.atom, id++, evergreen_emit_vs_sampler_states, 0); r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY].states.atom, id++, evergreen_emit_gs_sampler_states, 0); r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT].states.atom, id++, evergreen_emit_ps_sampler_states, 0); + r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom, id++, evergreen_emit_cs_sampler_states, 0); /* resources */ r600_init_atom(rctx, &rctx->vertex_buffer_state.atom, id++, evergreen_fs_emit_vertex_buffers, 0); r600_init_atom(rctx, &rctx->cs_vertex_buffer_state.atom, id++, evergreen_cs_emit_vertex_buffers, 0); r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_VERTEX].views.atom, id++, evergreen_emit_vs_sampler_views, 0); r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY].views.atom, id++, evergreen_emit_gs_sampler_views, 0); r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT].views.atom, id++, evergreen_emit_ps_sampler_views, 0); + r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom, id++, evergreen_emit_cs_sampler_views, 0); r600_init_atom(rctx, &rctx->vgt_state.atom, id++, r600_emit_vgt_state, 10); @@ -3466,8 +3492,8 @@ void evergreen_init_state_functions(struct r600_context *rctx) } r600_init_atom(rctx, &rctx->stencil_ref.atom, id++, r600_emit_stencil_ref, 4); r600_init_atom(rctx, &rctx->vertex_fetch_shader.atom, id++, evergreen_emit_vertex_fetch_shader, 5); - rctx->atoms[id++] = &rctx->b.streamout.begin_atom; - rctx->atoms[id++] = &rctx->b.streamout.enable_atom; + r600_add_atom(rctx, &rctx->b.streamout.begin_atom, id++); + r600_add_atom(rctx, &rctx->b.streamout.enable_atom, id++); r600_init_atom(rctx, &rctx->vertex_shader.atom, id++, r600_emit_shader, 23); r600_init_atom(rctx, &rctx->pixel_shader.atom, id++, r600_emit_shader, 0); r600_init_atom(rctx, &rctx->geometry_shader.atom, id++, r600_emit_shader, 0); diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h index cd4ff46b103..ad6ad434b78 100644 --- a/src/gallium/drivers/r600/evergreend.h +++ b/src/gallium/drivers/r600/evergreend.h @@ -1253,6 +1253,11 @@ #define R_00A430_TD_GS_SAMPLER0_BORDER_GREEN 0x00A430 #define R_00A434_TD_GS_SAMPLER0_BORDER_BLUE 0x00A434 #define R_00A438_TD_GS_SAMPLER0_BORDER_ALPHA 0x00A438 +#define R_00A464_TD_CS_SAMPLER0_BORDER_INDEX 0x00A464 +#define R_00A468_TD_CS_SAMPLER0_BORDER_RED 0x00A468 +#define R_00A46C_TD_CS_SAMPLER0_BORDER_GREEN 0x00A46C +#define R_00A470_TD_CS_SAMPLER0_BORDER_BLUE 0x00A470 +#define R_00A474_TD_CS_SAMPLER0_BORDER_ALPHA 0x00A474 #define R_03C000_SQ_TEX_SAMPLER_WORD0_0 0x03C000 #define S_03C000_CLAMP_X(x) (((x) & 0x7) << 0) diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c index 01262a59e90..b0002c3b50f 100644 --- a/src/gallium/drivers/r600/r600_blit.c +++ 
b/src/gallium/drivers/r600/r600_blit.c @@ -145,7 +145,7 @@ static void r600_blit_decompress_depth(struct pipe_context *ctx, rctx->db_misc_state.copy_depth = util_format_has_depth(desc); rctx->db_misc_state.copy_stencil = util_format_has_stencil(desc); rctx->db_misc_state.copy_sample = first_sample; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); for (level = first_level; level <= last_level; level++) { if (!staging && !(texture->dirty_level_mask & (1 << level))) @@ -162,7 +162,7 @@ static void r600_blit_decompress_depth(struct pipe_context *ctx, if (sample != rctx->db_misc_state.copy_sample) { rctx->db_misc_state.copy_sample = sample; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } surf_tmpl.format = texture->resource.b.b.format; @@ -197,7 +197,7 @@ static void r600_blit_decompress_depth(struct pipe_context *ctx, /* reenable compression in DB_RENDER_CONTROL */ rctx->db_misc_state.flush_depthstencil_through_cb = false; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } static void r600_blit_decompress_depth_in_place(struct r600_context *rctx, @@ -210,7 +210,7 @@ static void r600_blit_decompress_depth_in_place(struct r600_context *rctx, /* Enable decompression in DB_RENDER_CONTROL */ rctx->db_misc_state.flush_depthstencil_in_place = true; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); surf_tmpl.format = texture->resource.b.b.format; @@ -248,7 +248,7 @@ static void r600_blit_decompress_depth_in_place(struct r600_context *rctx, /* Disable decompression in DB_RENDER_CONTROL */ rctx->db_misc_state.flush_depthstencil_in_place = false; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } void r600_decompress_depth_textures(struct r600_context *rctx, @@ -396,6 +396,8 @@ static void r600_clear(struct pipe_context *ctx, unsigned buffers, if (buffers & PIPE_CLEAR_COLOR && rctx->b.chip_class >= EVERGREEN) { evergreen_do_fast_color_clear(&rctx->b, fb, &rctx->framebuffer.atom, &buffers, color); + if (!buffers) + return; /* all buffers have been fast cleared */ } if (buffers & PIPE_CLEAR_COLOR) { @@ -435,10 +437,10 @@ static void r600_clear(struct pipe_context *ctx, unsigned buffers, fb->zsbuf->u.tex.last_layer == util_max_layer(&rtex->resource.b.b, level)) { if (rtex->depth_clear_value != depth) { rtex->depth_clear_value = depth; - rctx->db_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_state.atom); } rctx->db_misc_state.htile_clear = true; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } } @@ -451,7 +453,7 @@ static void r600_clear(struct pipe_context *ctx, unsigned buffers, /* disable fast clear */ if (rctx->db_misc_state.htile_clear) { rctx->db_misc_state.htile_clear = false; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } } diff --git a/src/gallium/drivers/r600/r600_formats.h b/src/gallium/drivers/r600/r600_formats.h index fa374d92e6f..9533aaa1378 100644 --- a/src/gallium/drivers/r600/r600_formats.h +++ b/src/gallium/drivers/r600/r600_formats.h @@ -64,7 +64,7 @@ #define ENDIAN_8IN32 2 #define ENDIAN_8IN64 3 -static INLINE unsigned r600_endian_swap(unsigned size) +static inline unsigned r600_endian_swap(unsigned size) { if (R600_BIG_ENDIAN) { switch (size) { @@ -82,7 +82,7 @@ static INLINE unsigned r600_endian_swap(unsigned size) } } 
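
[Note: the early return added to r600_clear() above relies on evergreen_do_fast_color_clear() clearing bits out of the `buffers` mask for every surface it managed to fast-clear, so the slow clear path only runs for whatever remains:]

    if (buffers & PIPE_CLEAR_COLOR && rctx->b.chip_class >= EVERGREEN) {
        /* May remove PIPE_CLEAR_COLOR bits from `buffers`. */
        evergreen_do_fast_color_clear(&rctx->b, fb,
                                      &rctx->framebuffer.atom,
                                      &buffers, color);
        if (!buffers)
            return; /* all buffers have been fast cleared */
    }
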
-static INLINE bool r600_is_vertex_format_supported(enum pipe_format format) +static inline bool r600_is_vertex_format_supported(enum pipe_format format) { const struct util_format_description *desc = util_format_description(format); unsigned i; diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c index 8eb0c6806b9..64451516c23 100644 --- a/src/gallium/drivers/r600/r600_hw_context.c +++ b/src/gallium/drivers/r600/r600_hw_context.c @@ -51,13 +51,13 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, unsigned i; /* The number of dwords all the dirty states would take. */ - for (i = 0; i < R600_NUM_ATOMS; i++) { - if (ctx->atoms[i] && ctx->atoms[i]->dirty) { - num_dw += ctx->atoms[i]->num_dw; - if (ctx->screen->b.trace_bo) { - num_dw += R600_TRACE_CS_DWORDS; - } + i = r600_next_dirty_atom(ctx, 0); + while (i < R600_NUM_ATOMS) { + num_dw += ctx->atoms[i]->num_dw; + if (ctx->screen->b.trace_bo) { + num_dw += R600_TRACE_CS_DWORDS; } + i = r600_next_dirty_atom(ctx, i + 1); } /* The upper-bound of how much space a draw command would take. */ @@ -68,7 +68,8 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, } /* Count in queries_suspend. */ - num_dw += ctx->b.num_cs_dw_nontimer_queries_suspend; + num_dw += ctx->b.num_cs_dw_nontimer_queries_suspend + + ctx->b.num_cs_dw_timer_queries_suspend; /* Count in streamout_end at the end of CS. */ if (ctx->b.streamout.begin_emitted) { @@ -92,7 +93,7 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, num_dw += 10; /* Flush if there's not enough space. */ - if (num_dw > RADEON_MAX_CMDBUF_DWORDS) { + if (num_dw > ctx->b.rings.gfx.cs->max_dw) { ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); } } @@ -295,43 +296,45 @@ void r600_begin_new_cs(struct r600_context *ctx) r600_emit_command_buffer(ctx->b.rings.gfx.cs, &ctx->start_cs_cmd); /* Re-emit states. 
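
[Note: the bulk of this patch converts direct `atom.dirty = true` writes into r600_mark_atom_dirty() calls. The point is an invariant: atom->dirty and the per-context dirty_atoms bitmask (added in r600_pipe.h further down) must always change together, so r600_need_cs_space() above can walk only the set bits instead of scanning every atom slot. Condensed from r600_set_atom_dirty() in this diff:]

    /* Keep the flag and the bitmask in sync -- never write one alone. */
    atom->dirty = dirty;
    if (dirty)
        rctx->dirty_atoms[atom->id / R600_DIRTY_ATOM_WORD_BITS] |=
            1ul << (atom->id % R600_DIRTY_ATOM_WORD_BITS);
    else
        rctx->dirty_atoms[atom->id / R600_DIRTY_ATOM_WORD_BITS] &=
            ~(1ul << (atom->id % R600_DIRTY_ATOM_WORD_BITS));
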
*/ - ctx->alphatest_state.atom.dirty = true; - ctx->blend_color.atom.dirty = true; - ctx->cb_misc_state.atom.dirty = true; - ctx->clip_misc_state.atom.dirty = true; - ctx->clip_state.atom.dirty = true; - ctx->db_misc_state.atom.dirty = true; - ctx->db_state.atom.dirty = true; - ctx->framebuffer.atom.dirty = true; - ctx->pixel_shader.atom.dirty = true; - ctx->poly_offset_state.atom.dirty = true; - ctx->vgt_state.atom.dirty = true; - ctx->sample_mask.atom.dirty = true; + r600_mark_atom_dirty(ctx, &ctx->alphatest_state.atom); + r600_mark_atom_dirty(ctx, &ctx->blend_color.atom); + r600_mark_atom_dirty(ctx, &ctx->cb_misc_state.atom); + r600_mark_atom_dirty(ctx, &ctx->clip_misc_state.atom); + r600_mark_atom_dirty(ctx, &ctx->clip_state.atom); + r600_mark_atom_dirty(ctx, &ctx->db_misc_state.atom); + r600_mark_atom_dirty(ctx, &ctx->db_state.atom); + r600_mark_atom_dirty(ctx, &ctx->framebuffer.atom); + r600_mark_atom_dirty(ctx, &ctx->pixel_shader.atom); + r600_mark_atom_dirty(ctx, &ctx->poly_offset_state.atom); + r600_mark_atom_dirty(ctx, &ctx->vgt_state.atom); + r600_mark_atom_dirty(ctx, &ctx->sample_mask.atom); for (i = 0; i < R600_MAX_VIEWPORTS; i++) { - ctx->scissor[i].atom.dirty = true; - ctx->viewport[i].atom.dirty = true; - } - ctx->config_state.atom.dirty = true; - ctx->stencil_ref.atom.dirty = true; - ctx->vertex_fetch_shader.atom.dirty = true; - ctx->export_shader.atom.dirty = true; - ctx->shader_stages.atom.dirty = true; + r600_mark_atom_dirty(ctx, &ctx->scissor[i].atom); + r600_mark_atom_dirty(ctx, &ctx->viewport[i].atom); + } + if (ctx->b.chip_class < EVERGREEN) { + r600_mark_atom_dirty(ctx, &ctx->config_state.atom); + } + r600_mark_atom_dirty(ctx, &ctx->stencil_ref.atom); + r600_mark_atom_dirty(ctx, &ctx->vertex_fetch_shader.atom); + r600_mark_atom_dirty(ctx, &ctx->export_shader.atom); + r600_mark_atom_dirty(ctx, &ctx->shader_stages.atom); if (ctx->gs_shader) { - ctx->geometry_shader.atom.dirty = true; - ctx->gs_rings.atom.dirty = true; + r600_mark_atom_dirty(ctx, &ctx->geometry_shader.atom); + r600_mark_atom_dirty(ctx, &ctx->gs_rings.atom); } - ctx->vertex_shader.atom.dirty = true; - ctx->b.streamout.enable_atom.dirty = true; + r600_mark_atom_dirty(ctx, &ctx->vertex_shader.atom); + r600_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom); if (ctx->blend_state.cso) - ctx->blend_state.atom.dirty = true; + r600_mark_atom_dirty(ctx, &ctx->blend_state.atom); if (ctx->dsa_state.cso) - ctx->dsa_state.atom.dirty = true; + r600_mark_atom_dirty(ctx, &ctx->dsa_state.atom); if (ctx->rasterizer_state.cso) - ctx->rasterizer_state.atom.dirty = true; + r600_mark_atom_dirty(ctx, &ctx->rasterizer_state.atom); if (ctx->b.chip_class <= R700) { - ctx->seamless_cube_map.atom.dirty = true; + r600_mark_atom_dirty(ctx, &ctx->seamless_cube_map.atom); } ctx->vertex_buffer_state.dirty_mask = ctx->vertex_buffer_state.enabled_mask; diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index 72e2dc42f7e..faf538ccbb5 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -84,7 +84,7 @@ static void llvm_load_system_value( #else LLVMValueRef reg = lp_build_const_int32( ctx->soa.bld_base.base.gallivm, chan); - ctx->system_values[index] = build_intrinsic( + ctx->system_values[index] = lp_build_intrinsic( ctx->soa.bld_base.base.gallivm->builder, "llvm.R600.load.input", ctx->soa.bld_base.base.elem_type, ®, 1, @@ -111,9 +111,9 @@ llvm_load_input_vector( Args[ArgCount++] = LLVMBuildExtractElement(ctx->gallivm.builder, IJIndex, 
lp_build_const_int32(&(ctx->gallivm), 2 * (ijregs % 2) + 1), ""); LLVMValueRef HalfVec[2] = { - build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.xy", + lp_build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.xy", VecType, Args, ArgCount, LLVMReadNoneAttribute), - build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.zw", + lp_build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.zw", VecType, Args, ArgCount, LLVMReadNoneAttribute) }; LLVMValueRef MaskInputs[4] = { @@ -127,7 +127,7 @@ llvm_load_input_vector( Mask, ""); } else { VecType = LLVMVectorType(ctx->soa.bld_base.base.elem_type, 4); - return build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.const", + return lp_build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.const", VecType, Args, ArgCount, LLVMReadNoneAttribute); } } @@ -153,7 +153,7 @@ llvm_load_input_helper( arg_count = 1; } - return build_intrinsic(bb->gallivm->builder, intrinsic, + return lp_build_intrinsic(bb->gallivm->builder, intrinsic, bb->elem_type, &arg[0], arg_count, LLVMReadNoneAttribute); } #endif @@ -332,7 +332,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) args[2] = lp_build_const_int32(base->gallivm, so->output[i].output_buffer); args[3] = lp_build_const_int32(base->gallivm, ((1 << num_components) - 1) << start_component); lp_build_intrinsic(base->gallivm->builder, "llvm.R600.store.stream.output", - LLVMVoidTypeInContext(base->gallivm->context), args, 4); + LLVMVoidTypeInContext(base->gallivm->context), args, 4, 0); } } @@ -356,7 +356,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) args[0] = output; args[1] = lp_build_const_int32(base->gallivm, next_pos++); args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS); - build_intrinsic( + lp_build_intrinsic( base->gallivm->builder, "llvm.R600.store.swizzle", LLVMVoidTypeInContext(base->gallivm->context), @@ -373,7 +373,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) LLVMValueRef base_vector = llvm_load_const_buffer(bld_base, offset, CONSTANT_BUFFER_1_ADDR_SPACE); args[0] = output; args[1] = base_vector; - adjusted_elements[chan] = build_intrinsic(base->gallivm->builder, + adjusted_elements[chan] = lp_build_intrinsic(base->gallivm->builder, "llvm.AMDGPU.dp4", bld_base->base.elem_type, args, 2, LLVMReadNoneAttribute); } @@ -381,7 +381,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) adjusted_elements, 4); args[1] = lp_build_const_int32(base->gallivm, next_pos++); args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS); - build_intrinsic( + lp_build_intrinsic( base->gallivm->builder, "llvm.R600.store.swizzle", LLVMVoidTypeInContext(base->gallivm->context), @@ -394,14 +394,14 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) args[0] = output; args[1] = lp_build_const_int32(base->gallivm, next_pos++); args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS); - build_intrinsic( + lp_build_intrinsic( base->gallivm->builder, "llvm.R600.store.swizzle", LLVMVoidTypeInContext(base->gallivm->context), args, 3, 0); args[1] = lp_build_const_int32(base->gallivm, next_param++); args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM); - build_intrinsic( + lp_build_intrinsic( base->gallivm->builder, "llvm.R600.store.swizzle", LLVMVoidTypeInContext(base->gallivm->context), @@ -418,7 +418,7 @@ static void llvm_emit_epilogue(struct 
lp_build_tgsi_context * bld_base) args[0] = lp_build_gather_values(base->gallivm, elements, 4); args[1] = lp_build_const_int32(base->gallivm, next_param++); args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM); - build_intrinsic( + lp_build_intrinsic( base->gallivm->builder, "llvm.R600.store.swizzle", LLVMVoidTypeInContext(base->gallivm->context), @@ -430,7 +430,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) args[0] = output; args[1] = lp_build_const_int32(base->gallivm, next_param++); args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM); - build_intrinsic( + lp_build_intrinsic( base->gallivm->builder, "llvm.R600.store.swizzle", LLVMVoidTypeInContext(base->gallivm->context), @@ -449,7 +449,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) for (unsigned j = 0; j < ctx->color_buffer_count; j++) { args[1] = lp_build_const_int32(base->gallivm, j); args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL); - build_intrinsic( + lp_build_intrinsic( base->gallivm->builder, "llvm.R600.store.swizzle", LLVMVoidTypeInContext(base->gallivm->context), @@ -458,7 +458,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) } else { args[1] = lp_build_const_int32(base->gallivm, color_count++); args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL); - build_intrinsic( + lp_build_intrinsic( base->gallivm->builder, "llvm.R600.store.swizzle", LLVMVoidTypeInContext(base->gallivm->context), @@ -543,7 +543,7 @@ static void llvm_emit_tex( case TGSI_OPCODE_TXF: { args[0] = LLVMBuildExtractElement(gallivm->builder, emit_data->args[0], lp_build_const_int32(gallivm, 0), ""); args[1] = lp_build_const_int32(gallivm, R600_MAX_CONST_BUFFERS); - emit_data->output[0] = build_intrinsic(gallivm->builder, + emit_data->output[0] = lp_build_intrinsic(gallivm->builder, "llvm.R600.load.texbuf", emit_data->dst_type, args, 2, LLVMReadNoneAttribute); if (ctx->chip_class >= EVERGREEN) @@ -658,7 +658,7 @@ static void llvm_emit_tex( lp_build_const_int32(gallivm, 1), lp_build_const_int32(gallivm, 1) }; - LLVMValueRef ptr = build_intrinsic(gallivm->builder, + LLVMValueRef ptr = lp_build_intrinsic(gallivm->builder, "llvm.R600.ldptr", emit_data->dst_type, ldptr_args, 10, LLVMReadNoneAttribute); LLVMValueRef Tmp = LLVMBuildExtractElement(gallivm->builder, args[0], @@ -679,7 +679,7 @@ static void llvm_emit_tex( } } - emit_data->output[0] = build_intrinsic(gallivm->builder, + emit_data->output[0] = lp_build_intrinsic(gallivm->builder, action->intr_name, emit_data->dst_type, args, c, LLVMReadNoneAttribute); @@ -754,7 +754,131 @@ static struct lp_build_tgsi_action dot_action = { .intr_name = "llvm.AMDGPU.dp4" }; +static void txd_fetch_args( + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + const struct tgsi_full_instruction * inst = emit_data->inst; + + LLVMValueRef coords[4]; + unsigned chan, src; + for (src = 0; src < 3; src++) { + for (chan = 0; chan < 4; chan++) + coords[chan] = lp_build_emit_fetch(bld_base, inst, src, chan); + + emit_data->args[src] = lp_build_gather_values(bld_base->base.gallivm, + coords, 4); + } + emit_data->arg_count = 3; + emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); +} + + +static void txp_fetch_args( + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + const struct tgsi_full_instruction * inst = 
emit_data->inst; + LLVMValueRef src_w; + unsigned chan; + LLVMValueRef coords[5]; + + emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); + src_w = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W); + + for (chan = 0; chan < 3; chan++ ) { + LLVMValueRef arg = lp_build_emit_fetch(bld_base, + emit_data->inst, 0, chan); + coords[chan] = lp_build_emit_llvm_binary(bld_base, + TGSI_OPCODE_DIV, arg, src_w); + } + coords[3] = bld_base->base.one; + + if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE || + inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || + inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || + inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) && + inst->Instruction.Opcode != TGSI_OPCODE_TXQ && + inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) { + radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, NULL); + } + emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm, + coords, 4); + emit_data->arg_count = 1; +} + +static void tex_fetch_args( + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + const struct tgsi_full_instruction * inst = emit_data->inst; + + LLVMValueRef coords[5]; + unsigned chan; + for (chan = 0; chan < 4; chan++) { + coords[chan] = lp_build_emit_fetch(bld_base, inst, 0, chan); + } + + if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 || + inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || + inst->Instruction.Opcode == TGSI_OPCODE_TXL2) { + /* These instructions have additional operand that should be packed + * into the cube coord vector by radeon_llvm_emit_prepare_cube_coords. + * That operand should be passed as a float value in the args array + * right after the coord vector. After packing it's not used anymore, + * that's why arg_count is not increased */ + coords[4] = lp_build_emit_fetch(bld_base, inst, 1, 0); + } + + if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE || + inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || + inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || + inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) && + inst->Instruction.Opcode != TGSI_OPCODE_TXQ && + inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) { + radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, NULL); + } + + emit_data->arg_count = 1; + emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm, + coords, 4); + emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); +} + +static void txf_fetch_args( + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + const struct tgsi_full_instruction * inst = emit_data->inst; + struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); + const struct tgsi_texture_offset * off = inst->TexOffsets; + LLVMTypeRef offset_type = bld_base->int_bld.elem_type; + + /* fetch tex coords */ + tex_fetch_args(bld_base, emit_data); + + /* fetch tex offsets */ + if (inst->Texture.NumOffsets) { + assert(inst->Texture.NumOffsets == 1); + + emit_data->args[1] = LLVMConstBitCast( + bld->immediates[off->Index][off->SwizzleX], + offset_type); + emit_data->args[2] = LLVMConstBitCast( + bld->immediates[off->Index][off->SwizzleY], + offset_type); + emit_data->args[3] = LLVMConstBitCast( + bld->immediates[off->Index][off->SwizzleZ], + offset_type); + } else { + emit_data->args[1] = bld_base->int_bld.zero; + emit_data->args[2] = bld_base->int_bld.zero; + emit_data->args[3] = bld_base->int_bld.zero; + } + + emit_data->arg_count = 4; +} LLVMModuleRef r600_tgsi_llvm( struct radeon_llvm_context * ctx, 
@@ -783,7 +907,6 @@ LLVMModuleRef r600_tgsi_llvm( bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = llvm_fetch_const; bld_base->emit_prologue = llvm_emit_prologue; bld_base->emit_epilogue = llvm_emit_epilogue; - ctx->userdata = ctx; ctx->load_input = llvm_load_input; ctx->load_system_value = llvm_load_system_value; @@ -791,18 +914,42 @@ LLVMModuleRef r600_tgsi_llvm( bld_base->op_actions[TGSI_OPCODE_DP3] = dot_action; bld_base->op_actions[TGSI_OPCODE_DP4] = dot_action; bld_base->op_actions[TGSI_OPCODE_DPH] = dot_action; + bld_base->op_actions[TGSI_OPCODE_DDX].intr_name = "llvm.AMDGPU.ddx"; + bld_base->op_actions[TGSI_OPCODE_DDX].fetch_args = tex_fetch_args; bld_base->op_actions[TGSI_OPCODE_DDX].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_DDY].intr_name = "llvm.AMDGPU.ddy"; + bld_base->op_actions[TGSI_OPCODE_DDY].fetch_args = tex_fetch_args; bld_base->op_actions[TGSI_OPCODE_DDY].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_TEX].fetch_args = tex_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TEX].intr_name = "llvm.AMDGPU.tex"; bld_base->op_actions[TGSI_OPCODE_TEX].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_TEX2].fetch_args = tex_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TEX2].intr_name = "llvm.AMDGPU.tex"; bld_base->op_actions[TGSI_OPCODE_TEX2].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_TXB].fetch_args = tex_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TXB].intr_name = "llvm.AMDGPU.txb"; bld_base->op_actions[TGSI_OPCODE_TXB].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_TXB2].fetch_args = tex_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TXB2].intr_name = "llvm.AMDGPU.txb"; bld_base->op_actions[TGSI_OPCODE_TXB2].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_TXD].fetch_args = txd_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TXD].intr_name = "llvm.AMDGPU.txd"; bld_base->op_actions[TGSI_OPCODE_TXD].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_TXF].fetch_args = txf_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TXF].intr_name = "llvm.AMDGPU.txf"; + bld_base->op_actions[TGSI_OPCODE_TXF].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_TXL].fetch_args = tex_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TXL].intr_name = "llvm.AMDGPU.txl"; bld_base->op_actions[TGSI_OPCODE_TXL].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_TXL2].fetch_args = tex_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TXL2].intr_name = "llvm.AMDGPU.txl"; bld_base->op_actions[TGSI_OPCODE_TXL2].emit = llvm_emit_tex; - bld_base->op_actions[TGSI_OPCODE_TXF].emit = llvm_emit_tex; - bld_base->op_actions[TGSI_OPCODE_TXQ].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_TXP].fetch_args = txp_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TXP].intr_name = "llvm.AMDGPU.tex"; bld_base->op_actions[TGSI_OPCODE_TXP].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = tex_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TXQ].intr_name = "llvm.AMDGPU.txq"; + bld_base->op_actions[TGSI_OPCODE_TXQ].emit = llvm_emit_tex; bld_base->op_actions[TGSI_OPCODE_CMP].emit = emit_cndlt; lp_build_tgsi_llvm(bld_base, tokens); @@ -881,7 +1028,7 @@ unsigned r600_llvm_compile( const char * gpu_family = r600_get_llvm_processor_name(family); memset(&binary, 0, sizeof(struct radeon_shader_binary)); - r = radeon_llvm_compile(mod, &binary, gpu_family, dump, NULL); + r = radeon_llvm_compile(mod, &binary, gpu_family, dump, dump, NULL); r = r600_create_shader(bc, &binary, use_kill); diff --git 
a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index e122b607b86..6ffe5615fbf 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -120,6 +120,7 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen, void rctx->b.b.screen = screen; rctx->b.b.priv = priv; rctx->b.b.destroy = r600_destroy_context; + rctx->b.set_atom_dirty = (void *)r600_set_atom_dirty; if (!r600_common_context_init(&rctx->b, &rscreen->b)) goto fail; @@ -176,7 +177,7 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen, void goto fail; } - rctx->b.rings.gfx.cs = ws->cs_create(ws, RING_GFX, + rctx->b.rings.gfx.cs = ws->cs_create(rctx->b.ctx, RING_GFX, r600_context_gfx_flush, rctx, rscreen->b.trace_bo ? rscreen->b.trace_bo->cs_buf : NULL); @@ -268,8 +269,14 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_SAMPLE_SHADING: case PIPE_CAP_CLIP_HALFZ: case PIPE_CAP_POLYGON_OFFSET_CLAMP: + case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: return 1; + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + return rscreen->b.info.drm_major == 2 && rscreen->b.info.drm_minor >= 43; + case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: return !R600_BIG_ENDIAN && rscreen->b.info.has_userptr; @@ -329,10 +336,10 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_VERTEX_COLOR_CLAMPED: case PIPE_CAP_USER_VERTEX_BUFFERS: case PIPE_CAP_TEXTURE_GATHER_OFFSETS: - case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: case PIPE_CAP_SAMPLER_VIEW_TARGET: case PIPE_CAP_VERTEXID_NOBASE: - case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 0; /* Stream output. */ diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h index 4ea270d3839..9b66105641a 100644 --- a/src/gallium/drivers/r600/r600_pipe.h +++ b/src/gallium/drivers/r600/r600_pipe.h @@ -36,7 +36,7 @@ #include "util/list.h" #include "util/u_transfer.h" -#define R600_NUM_ATOMS 73 +#define R600_NUM_ATOMS 75 #define R600_MAX_VIEWPORTS 16 @@ -85,6 +85,9 @@ #define R600_BIG_ENDIAN 0 #endif +#define R600_DIRTY_ATOM_WORD_BITS (sizeof(unsigned long) * 8) +#define R600_DIRTY_ATOM_ARRAY_LEN DIV_ROUND_UP(R600_NUM_ATOMS, R600_DIRTY_ATOM_WORD_BITS) + struct r600_context; struct r600_bytecode; struct r600_shader_key; @@ -426,6 +429,8 @@ struct r600_context { /* State binding slots are here. */ struct r600_atom *atoms[R600_NUM_ATOMS]; + /* Dirty atom bitmask for fast tests */ + unsigned long dirty_atoms[R600_DIRTY_ATOM_ARRAY_LEN]; /* States for CS initialization. */ struct r600_command_buffer start_cs_cmd; /* invariant state mostly */ /** Compute specific registers initializations. 
The start_cs_cmd atom @@ -490,37 +495,92 @@ struct r600_context { struct r600_isa *isa; }; -static INLINE void r600_emit_command_buffer(struct radeon_winsys_cs *cs, +static inline void r600_emit_command_buffer(struct radeon_winsys_cs *cs, struct r600_command_buffer *cb) { - assert(cs->cdw + cb->num_dw <= RADEON_MAX_CMDBUF_DWORDS); + assert(cs->cdw + cb->num_dw <= cs->max_dw); memcpy(cs->buf + cs->cdw, cb->buf, 4 * cb->num_dw); cs->cdw += cb->num_dw; } +static inline void r600_set_atom_dirty(struct r600_context *rctx, + struct r600_atom *atom, + bool dirty) +{ + unsigned long mask; + unsigned int w; + + atom->dirty = dirty; + + assert(atom->id != 0); + w = atom->id / R600_DIRTY_ATOM_WORD_BITS; + mask = 1ul << (atom->id % R600_DIRTY_ATOM_WORD_BITS); + if (dirty) + rctx->dirty_atoms[w] |= mask; + else + rctx->dirty_atoms[w] &= ~mask; +} + +static inline void r600_mark_atom_dirty(struct r600_context *rctx, + struct r600_atom *atom) +{ + r600_set_atom_dirty(rctx, atom, true); +} + +static inline unsigned int r600_next_dirty_atom(struct r600_context *rctx, + unsigned int id) +{ +#if !defined(DEBUG) && defined(HAVE___BUILTIN_CTZ) + unsigned int w = id / R600_DIRTY_ATOM_WORD_BITS; + unsigned int bit = id % R600_DIRTY_ATOM_WORD_BITS; + unsigned long bits, mask = (1ul << bit) - 1; + + for (; w < R600_DIRTY_ATOM_ARRAY_LEN; w++, mask = 0ul) { + bits = rctx->dirty_atoms[w] & ~mask; + if (bits == 0) + continue; + return w * R600_DIRTY_ATOM_WORD_BITS + __builtin_ctzl(bits); + } + + return R600_NUM_ATOMS; +#else + for (; id < R600_NUM_ATOMS; id++) { + bool dirty = !!(rctx->dirty_atoms[id / R600_DIRTY_ATOM_WORD_BITS] & + (1ul << (id % R600_DIRTY_ATOM_WORD_BITS))); + assert(dirty == (rctx->atoms[id] && rctx->atoms[id]->dirty)); + if (dirty) + break; + } + + return id; +#endif +} + void r600_trace_emit(struct r600_context *rctx); -static INLINE void r600_emit_atom(struct r600_context *rctx, struct r600_atom *atom) +static inline void r600_emit_atom(struct r600_context *rctx, struct r600_atom *atom) { atom->emit(&rctx->b, atom); - atom->dirty = false; + r600_set_atom_dirty(rctx, atom, false); if (rctx->screen->b.trace_bo) { r600_trace_emit(rctx); } } -static INLINE void r600_set_cso_state(struct r600_cso_state *state, void *cso) +static inline void r600_set_cso_state(struct r600_context *rctx, + struct r600_cso_state *state, void *cso) { state->cso = cso; - state->atom.dirty = cso != NULL; + r600_set_atom_dirty(rctx, &state->atom, cso != NULL); } -static INLINE void r600_set_cso_state_with_cb(struct r600_cso_state *state, void *cso, +static inline void r600_set_cso_state_with_cb(struct r600_context *rctx, + struct r600_cso_state *state, void *cso, struct r600_command_buffer *cb) { state->cb = cb; state->atom.num_dw = cb ? 
cb->num_dw : 0; - r600_set_cso_state(state, cso); + r600_set_cso_state(rctx, state, cso); } /* compute_memory_pool.c */ @@ -529,11 +589,6 @@ void compute_memory_pool_delete(struct compute_memory_pool* pool); struct compute_memory_pool* compute_memory_pool_new( struct r600_screen *rscreen); -/* evergreen_compute.c */ -void evergreen_set_cs_sampler_view(struct pipe_context *ctx_, - unsigned start_slot, unsigned count, - struct pipe_sampler_view **views); - /* evergreen_state.c */ struct pipe_sampler_view * evergreen_create_sampler_view_custom(struct pipe_context *ctx, @@ -656,6 +711,7 @@ void r600_emit_clip_misc_state(struct r600_context *rctx, struct r600_atom *atom void r600_emit_stencil_ref(struct r600_context *rctx, struct r600_atom *atom); void r600_emit_viewport_state(struct r600_context *rctx, struct r600_atom *atom); void r600_emit_shader(struct r600_context *rctx, struct r600_atom *a); +void r600_add_atom(struct r600_context *rctx, struct r600_atom *atom, unsigned id); void r600_init_atom(struct r600_context *rctx, struct r600_atom *atom, unsigned id, void (*emit)(struct r600_context *ctx, struct r600_atom *state), unsigned num_dw); @@ -719,19 +775,19 @@ struct pipe_video_buffer *r600_video_buffer_create(struct pipe_context *pipe, /*Evergreen Compute packet3*/ #define PKT3C(op, count, predicate) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count) | PKT3_PREDICATE(predicate) | RADEON_CP_PACKET3_COMPUTE_MODE) -static INLINE void r600_store_value(struct r600_command_buffer *cb, unsigned value) +static inline void r600_store_value(struct r600_command_buffer *cb, unsigned value) { cb->buf[cb->num_dw++] = value; } -static INLINE void r600_store_array(struct r600_command_buffer *cb, unsigned num, unsigned *ptr) +static inline void r600_store_array(struct r600_command_buffer *cb, unsigned num, unsigned *ptr) { assert(cb->num_dw+num <= cb->max_num_dw); memcpy(&cb->buf[cb->num_dw], ptr, num * sizeof(ptr[0])); cb->num_dw += num; } -static INLINE void r600_store_config_reg_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num) +static inline void r600_store_config_reg_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num) { assert(reg < R600_CONTEXT_REG_OFFSET); assert(cb->num_dw+2+num <= cb->max_num_dw); @@ -743,7 +799,7 @@ static INLINE void r600_store_config_reg_seq(struct r600_command_buffer *cb, uns * Needs cb->pkt_flags set to RADEON_CP_PACKET3_COMPUTE_MODE for compute * shaders. */ -static INLINE void r600_store_context_reg_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num) +static inline void r600_store_context_reg_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num) { assert(reg >= R600_CONTEXT_REG_OFFSET && reg < R600_CTL_CONST_OFFSET); assert(cb->num_dw+2+num <= cb->max_num_dw); @@ -755,7 +811,7 @@ static INLINE void r600_store_context_reg_seq(struct r600_command_buffer *cb, un * Needs cb->pkt_flags set to RADEON_CP_PACKET3_COMPUTE_MODE for compute * shaders. 
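
The dirty-atom helpers above pair the existing per-atom flag with a word-array bitmask, so the draw path can jump straight to the next dirty atom with a count-trailing-zeros instruction instead of scanning all R600_NUM_ATOMS slots. A minimal standalone sketch of the same walk, assuming a GCC/Clang-style __builtin_ctzl and using stand-in names (NUM_ATOMS, dirty[], next_dirty) rather than the driver's:

#define NUM_ATOMS 75
#define WORD_BITS (sizeof(unsigned long) * 8)
#define NUM_WORDS ((NUM_ATOMS + WORD_BITS - 1) / WORD_BITS)

static unsigned long dirty[NUM_WORDS];      /* one bit per atom id */

static unsigned next_dirty(unsigned id)     /* first dirty atom >= id */
{
   unsigned w = id / WORD_BITS;
   unsigned long bits, mask = (1ul << (id % WORD_BITS)) - 1;

   for (; w < NUM_WORDS; w++, mask = 0ul) {
      bits = dirty[w] & ~mask;              /* drop bits below id */
      if (bits)
         return w * WORD_BITS + __builtin_ctzl(bits);
   }
   return NUM_ATOMS;                        /* nothing left */
}

The debug build keeps the slow linear scan and asserts that the bitmask agrees with each atom->dirty flag, which is why r600_set_atom_dirty still writes both.
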
*/ -static INLINE void r600_store_ctl_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num) +static inline void r600_store_ctl_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num) { assert(reg >= R600_CTL_CONST_OFFSET); assert(cb->num_dw+2+num <= cb->max_num_dw); @@ -763,7 +819,7 @@ static INLINE void r600_store_ctl_const_seq(struct r600_command_buffer *cb, unsi cb->buf[cb->num_dw++] = (reg - R600_CTL_CONST_OFFSET) >> 2; } -static INLINE void r600_store_loop_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num) +static inline void r600_store_loop_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num) { assert(reg >= R600_LOOP_CONST_OFFSET); assert(cb->num_dw+2+num <= cb->max_num_dw); @@ -775,7 +831,7 @@ static INLINE void r600_store_loop_const_seq(struct r600_command_buffer *cb, uns * Needs cb->pkt_flags set to RADEON_CP_PACKET3_COMPUTE_MODE for compute * shaders. */ -static INLINE void eg_store_loop_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num) +static inline void eg_store_loop_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num) { assert(reg >= EG_LOOP_CONST_OFFSET); assert(cb->num_dw+2+num <= cb->max_num_dw); @@ -783,31 +839,31 @@ static INLINE void eg_store_loop_const_seq(struct r600_command_buffer *cb, unsig cb->buf[cb->num_dw++] = (reg - EG_LOOP_CONST_OFFSET) >> 2; } -static INLINE void r600_store_config_reg(struct r600_command_buffer *cb, unsigned reg, unsigned value) +static inline void r600_store_config_reg(struct r600_command_buffer *cb, unsigned reg, unsigned value) { r600_store_config_reg_seq(cb, reg, 1); r600_store_value(cb, value); } -static INLINE void r600_store_context_reg(struct r600_command_buffer *cb, unsigned reg, unsigned value) +static inline void r600_store_context_reg(struct r600_command_buffer *cb, unsigned reg, unsigned value) { r600_store_context_reg_seq(cb, reg, 1); r600_store_value(cb, value); } -static INLINE void r600_store_ctl_const(struct r600_command_buffer *cb, unsigned reg, unsigned value) +static inline void r600_store_ctl_const(struct r600_command_buffer *cb, unsigned reg, unsigned value) { r600_store_ctl_const_seq(cb, reg, 1); r600_store_value(cb, value); } -static INLINE void r600_store_loop_const(struct r600_command_buffer *cb, unsigned reg, unsigned value) +static inline void r600_store_loop_const(struct r600_command_buffer *cb, unsigned reg, unsigned value) { r600_store_loop_const_seq(cb, reg, 1); r600_store_value(cb, value); } -static INLINE void eg_store_loop_const(struct r600_command_buffer *cb, unsigned reg, unsigned value) +static inline void eg_store_loop_const(struct r600_command_buffer *cb, unsigned reg, unsigned value) { eg_store_loop_const_seq(cb, reg, 1); r600_store_value(cb, value); @@ -816,28 +872,28 @@ static INLINE void eg_store_loop_const(struct r600_command_buffer *cb, unsigned void r600_init_command_buffer(struct r600_command_buffer *cb, unsigned num_dw); void r600_release_command_buffer(struct r600_command_buffer *cb); -static INLINE void r600_write_compute_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) +static inline void r600_write_compute_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) { r600_write_context_reg_seq(cs, reg, num); /* Set the compute bit on the packet header */ cs->buf[cs->cdw - 2] |= RADEON_CP_PACKET3_COMPUTE_MODE; } -static INLINE void r600_write_ctl_const_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) +static inline void 
r600_write_ctl_const_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) { assert(reg >= R600_CTL_CONST_OFFSET); - assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS); + assert(cs->cdw+2+num <= cs->max_dw); cs->buf[cs->cdw++] = PKT3(PKT3_SET_CTL_CONST, num, 0); cs->buf[cs->cdw++] = (reg - R600_CTL_CONST_OFFSET) >> 2; } -static INLINE void r600_write_compute_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) +static inline void r600_write_compute_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) { r600_write_compute_context_reg_seq(cs, reg, 1); radeon_emit(cs, value); } -static INLINE void r600_write_context_reg_flag(struct radeon_winsys_cs *cs, unsigned reg, unsigned value, unsigned flag) +static inline void r600_write_context_reg_flag(struct radeon_winsys_cs *cs, unsigned reg, unsigned value, unsigned flag) { if (flag & RADEON_CP_PACKET3_COMPUTE_MODE) { r600_write_compute_context_reg(cs, reg, value); @@ -846,7 +902,7 @@ static INLINE void r600_write_context_reg_flag(struct radeon_winsys_cs *cs, unsi } } -static INLINE void r600_write_ctl_const(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) +static inline void r600_write_ctl_const(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) { r600_write_ctl_const_seq(cs, reg, 1); radeon_emit(cs, value); @@ -855,21 +911,21 @@ static INLINE void r600_write_ctl_const(struct radeon_winsys_cs *cs, unsigned re /* * common helpers */ -static INLINE uint32_t S_FIXED(float value, uint32_t frac_bits) +static inline uint32_t S_FIXED(float value, uint32_t frac_bits) { return value * (1 << frac_bits); } #define ALIGN_DIVUP(x, y) (((x) + (y) - 1) / (y)) /* 12.4 fixed-point */ -static INLINE unsigned r600_pack_float_12p4(float x) +static inline unsigned r600_pack_float_12p4(float x) { return x <= 0 ? 0 : x >= 4096 ? 0xffff : x * 16; } /* Return if the depth format can be read without the DB->CB copy on r6xx-r7xx. 
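
S_FIXED() and r600_pack_float_12p4() above are the float-to-fixed-point conversions used when filling registers; a few worked values make the encodings concrete:

/* S_FIXED(value, frac_bits) == value * (1 << frac_bits):
 *   S_FIXED(1.5f, 8)  -> 384  (1.5 * 256, i.e. an 8-bit fraction)
 *   S_FIXED(0.25f, 4) -> 4    (0.25 * 16, i.e. a 4-bit fraction)
 *
 * r600_pack_float_12p4() is the clamped 12.4 case:
 *   r600_pack_float_12p4(2.5f)    -> 40      (2.5 * 16)
 *   r600_pack_float_12p4(-1.0f)   -> 0       (clamped low)
 *   r600_pack_float_12p4(5000.0f) -> 0xffff  (clamped at 4096)
 */
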
*/ -static INLINE bool r600_can_read_depth(struct r600_texture *rtex) +static inline bool r600_can_read_depth(struct r600_texture *rtex) { return rtex->resource.b.b.nr_samples <= 1 && (rtex->resource.b.b.format == PIPE_FORMAT_Z16_UNORM || @@ -880,7 +936,7 @@ static INLINE bool r600_can_read_depth(struct r600_texture *rtex) #define V_028A6C_OUTPRIM_TYPE_LINESTRIP 1 #define V_028A6C_OUTPRIM_TYPE_TRISTRIP 2 -static INLINE unsigned r600_conv_prim_to_gs_out(unsigned mode) +static inline unsigned r600_conv_prim_to_gs_out(unsigned mode) { static const int prim_conv[] = { V_028A6C_OUTPRIM_TYPE_POINTLIST, diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index af7622e9b34..8d1f95abddc 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -310,6 +310,7 @@ struct r600_shader_ctx { int gs_next_vertex; struct r600_shader *gs_for_vs; int gs_export_gpr_treg; + unsigned enabled_stream_buffers_mask; }; struct r600_shader_tgsi_instruction { @@ -1402,6 +1403,9 @@ static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output * with MEM_STREAM instructions */ output.array_size = 0xFFF; output.comp_mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component; + + ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer); + if (ctx->bc->chip_class >= EVERGREEN) { switch (so->output[i].output_buffer) { case 0: @@ -1718,6 +1722,8 @@ static int generate_gs_copy_shader(struct r600_context *rctx, gs->gs_copy_shader = cshader; ctx.bc->nstack = 1; + + cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask; cshader->shader.ring_item_size = ocnt * 16; return r600_bytecode_build(ctx.bc); @@ -1931,15 +1937,14 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL; ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] + ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1; + ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1; + ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2; + if (ctx.type == TGSI_PROCESSOR_GEOMETRY) { - ctx.gs_export_gpr_treg = ctx.bc->ar_reg + 1; - ctx.temp_reg = ctx.bc->ar_reg + 2; - ctx.bc->index_reg[0] = ctx.bc->ar_reg + 3; - ctx.bc->index_reg[1] = ctx.bc->ar_reg + 4; + ctx.gs_export_gpr_treg = ctx.bc->ar_reg + 3; + ctx.temp_reg = ctx.bc->ar_reg + 4; } else { - ctx.temp_reg = ctx.bc->ar_reg + 1; - ctx.bc->index_reg[0] = ctx.bc->ar_reg + 2; - ctx.bc->index_reg[1] = ctx.bc->ar_reg + 3; + ctx.temp_reg = ctx.bc->ar_reg + 3; } shader->max_arrays = 0; @@ -2086,7 +2091,6 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, radeon_llvm_ctx.chip_class = ctx.bc->chip_class; radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN); radeon_llvm_ctx.stream_outputs = &so; - radeon_llvm_ctx.clip_vertex = ctx.cv_output; radeon_llvm_ctx.alpha_to_one = key.alpha_to_one; radeon_llvm_ctx.has_compressed_msaa_texturing = ctx.bc->has_compressed_msaa_texturing; @@ -2262,6 +2266,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, so.num_outputs && !use_llvm) emit_streamout(&ctx, &so); + pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask; convert_edgeflag_to_int(&ctx); if (ring_outputs) { @@ -2485,6 +2490,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, output[j].array_base = 0; output[j].op = CF_OP_EXPORT; j++; + shader->nr_ps_color_exports++; } noutput = j; diff --git a/src/gallium/drivers/r600/r600_shader.h 
b/src/gallium/drivers/r600/r600_shader.h index dd359d7e959..5d05c8153d7 100644 --- a/src/gallium/drivers/r600/r600_shader.h +++ b/src/gallium/drivers/r600/r600_shader.h @@ -125,6 +125,7 @@ struct r600_pipe_shader { struct r600_shader_key key; unsigned db_shader_control; unsigned ps_depth_export; + unsigned enabled_stream_buffers_mask; }; /* return the table index 0-5 for TGSI_INTERPOLATE_LINEAR/PERSPECTIVE and diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c index 960dfcedfef..5cc2283792d 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -473,7 +473,7 @@ static void *r600_create_rs_state(struct pipe_context *ctx, /* offset */ rs->offset_units = state->offset_units; - rs->offset_scale = state->offset_scale * 12.0f; + rs->offset_scale = state->offset_scale * 16.0f; rs->offset_enable = state->offset_point || state->offset_line || state->offset_tri; if (state->point_size_per_vertex) { @@ -802,7 +802,7 @@ static void r600_set_scissor_states(struct pipe_context *ctx, return; for (i = start_slot ; i < start_slot + num_scissors; i++) { - rctx->scissor[i].atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->scissor[i].atom); } } @@ -1193,7 +1193,7 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx, if (rctx->alphatest_state.bypass != alphatest_bypass) { rctx->alphatest_state.bypass = alphatest_bypass; - rctx->alphatest_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom); } } @@ -1209,28 +1209,28 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx, if (state->zsbuf->format != rctx->poly_offset_state.zs_format) { rctx->poly_offset_state.zs_format = state->zsbuf->format; - rctx->poly_offset_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->poly_offset_state.atom); } if (rctx->db_state.rsurf != surf) { rctx->db_state.rsurf = surf; - rctx->db_state.atom.dirty = true; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_state.atom); + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } } else if (rctx->db_state.rsurf) { rctx->db_state.rsurf = NULL; - rctx->db_state.atom.dirty = true; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_state.atom); + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } if (rctx->cb_misc_state.nr_cbufs != state->nr_cbufs) { rctx->cb_misc_state.nr_cbufs = state->nr_cbufs; - rctx->cb_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom); } if (state->nr_cbufs == 0 && rctx->alphatest_state.bypass) { rctx->alphatest_state.bypass = false; - rctx->alphatest_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom); } /* Calculate the CS size. 
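
The enabled_stream_buffers_mask added here and in r600_shader.c records, one bit per buffer, which of the four streamout buffers a shader actually writes; emit_streamout() sets bit so->output[i].output_buffer for every declared output. A sketch of that accumulation (pipe_stream_output_info is the regular Gallium struct; how the mask is consumed lives in streamout code outside these hunks, so that part is an assumption):

#include "pipe/p_state.h"

/* mirrors the accumulation in emit_streamout() */
static unsigned written_buffers(const struct pipe_stream_output_info *so)
{
   unsigned mask = 0, i;

   for (i = 0; i < so->num_outputs; i++)
      mask |= 1u << so->output[i].output_buffer;  /* buffers 0-3 */

   return mask;
}

Presumably hw_enabled_mask, the new VGT_STRMOUT_BUFFER_CONFIG state in r600_streamout, ends up as this mask combined with the buffers that are actually bound.
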
*/ @@ -1250,7 +1250,7 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx, rctx->framebuffer.atom.num_dw += 2; } - rctx->framebuffer.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->framebuffer.atom); r600_set_sample_locations_constant_buffer(rctx); } @@ -1541,9 +1541,9 @@ static void r600_set_min_samples(struct pipe_context *ctx, unsigned min_samples) rctx->ps_iter_samples = min_samples; if (rctx->framebuffer.nr_samples > 1) { - rctx->rasterizer_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->rasterizer_state.atom); if (rctx->b.chip_class == R600) - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } } @@ -2089,7 +2089,7 @@ bool r600_adjust_gprs(struct r600_context *rctx) if (rctx->config_state.sq_gpr_resource_mgmt_1 != tmp || rctx->config_state.sq_gpr_resource_mgmt_2 != tmp2) { rctx->config_state.sq_gpr_resource_mgmt_1 = tmp; rctx->config_state.sq_gpr_resource_mgmt_2 = tmp2; - rctx->config_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->config_state.atom); rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE; } return true; @@ -2796,11 +2796,11 @@ void r600_update_db_shader_control(struct r600_context * rctx) if (db_shader_control != rctx->db_misc_state.db_shader_control) { rctx->db_misc_state.db_shader_control = db_shader_control; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } } -static INLINE unsigned r600_array_mode(unsigned mode) +static inline unsigned r600_array_mode(unsigned mode) { switch (mode) { case RADEON_SURF_MODE_LINEAR_ALIGNED: return V_0280A0_ARRAY_LINEAR_ALIGNED; @@ -3074,8 +3074,8 @@ void r600_init_state_functions(struct r600_context *rctx) r600_init_atom(rctx, &rctx->config_state.atom, id++, r600_emit_config_state, 3); r600_init_atom(rctx, &rctx->stencil_ref.atom, id++, r600_emit_stencil_ref, 4); r600_init_atom(rctx, &rctx->vertex_fetch_shader.atom, id++, r600_emit_vertex_fetch_shader, 5); - rctx->atoms[id++] = &rctx->b.streamout.begin_atom; - rctx->atoms[id++] = &rctx->b.streamout.enable_atom; + r600_add_atom(rctx, &rctx->b.streamout.begin_atom, id++); + r600_add_atom(rctx, &rctx->b.streamout.enable_atom, id++); r600_init_atom(rctx, &rctx->vertex_shader.atom, id++, r600_emit_shader, 23); r600_init_atom(rctx, &rctx->pixel_shader.atom, id++, r600_emit_shader, 0); r600_init_atom(rctx, &rctx->geometry_shader.atom, id++, r600_emit_shader, 0); diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index 13dc9ee8c10..aa4a8d0240f 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -47,18 +47,26 @@ void r600_release_command_buffer(struct r600_command_buffer *cb) FREE(cb->buf); } +void r600_add_atom(struct r600_context *rctx, + struct r600_atom *atom, + unsigned id) +{ + assert(id < R600_NUM_ATOMS); + assert(rctx->atoms[id] == NULL); + rctx->atoms[id] = atom; + atom->id = id; + atom->dirty = false; +} + void r600_init_atom(struct r600_context *rctx, struct r600_atom *atom, unsigned id, void (*emit)(struct r600_context *ctx, struct r600_atom *state), unsigned num_dw) { - assert(id < R600_NUM_ATOMS); - assert(rctx->atoms[id] == NULL); - rctx->atoms[id] = atom; atom->emit = (void*)emit; atom->num_dw = num_dw; - atom->dirty = false; + r600_add_atom(rctx, atom, id); } void r600_emit_cso_state(struct r600_context *rctx, struct r600_atom *atom) @@ -127,11 +135,11 @@ static void r600_bind_blend_state_internal(struct r600_context *rctx, 
rctx->dual_src_blend = blend->dual_src_blend; if (!blend_disable) { - r600_set_cso_state_with_cb(&rctx->blend_state, blend, &blend->buffer); + r600_set_cso_state_with_cb(rctx, &rctx->blend_state, blend, &blend->buffer); color_control = blend->cb_color_control; } else { /* Blending is disabled. */ - r600_set_cso_state_with_cb(&rctx->blend_state, blend, &blend->buffer_no_blend); + r600_set_cso_state_with_cb(rctx, &rctx->blend_state, blend, &blend->buffer_no_blend); color_control = blend->cb_color_control_no_blend; } @@ -150,7 +158,7 @@ static void r600_bind_blend_state_internal(struct r600_context *rctx, update_cb = true; } if (update_cb) { - rctx->cb_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom); } } @@ -160,7 +168,7 @@ static void r600_bind_blend_state(struct pipe_context *ctx, void *state) struct r600_blend_state *blend = (struct r600_blend_state *)state; if (blend == NULL) { - r600_set_cso_state_with_cb(&rctx->blend_state, NULL, NULL); + r600_set_cso_state_with_cb(rctx, &rctx->blend_state, NULL, NULL); return; } @@ -173,7 +181,7 @@ static void r600_set_blend_color(struct pipe_context *ctx, struct r600_context *rctx = (struct r600_context *)ctx; rctx->blend_color.state = *state; - rctx->blend_color.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->blend_color.atom); } void r600_emit_blend_color(struct r600_context *rctx, struct r600_atom *atom) @@ -210,7 +218,7 @@ static void r600_set_clip_state(struct pipe_context *ctx, struct pipe_constant_buffer cb; rctx->clip_state.state = *state; - rctx->clip_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->clip_state.atom); cb.buffer = NULL; cb.user_buffer = state->ucp; @@ -226,7 +234,7 @@ static void r600_set_stencil_ref(struct pipe_context *ctx, struct r600_context *rctx = (struct r600_context *)ctx; rctx->stencil_ref.state = *state; - rctx->stencil_ref.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->stencil_ref.atom); } void r600_emit_stencil_ref(struct r600_context *rctx, struct r600_atom *atom) @@ -274,11 +282,11 @@ static void r600_bind_dsa_state(struct pipe_context *ctx, void *state) struct r600_stencil_ref ref; if (state == NULL) { - r600_set_cso_state_with_cb(&rctx->dsa_state, NULL, NULL); + r600_set_cso_state_with_cb(rctx, &rctx->dsa_state, NULL, NULL); return; } - r600_set_cso_state_with_cb(&rctx->dsa_state, dsa, &dsa->buffer); + r600_set_cso_state_with_cb(rctx, &rctx->dsa_state, dsa, &dsa->buffer); ref.ref_value[0] = rctx->stencil_ref.pipe_state.ref_value[0]; ref.ref_value[1] = rctx->stencil_ref.pipe_state.ref_value[1]; @@ -293,7 +301,7 @@ static void r600_bind_dsa_state(struct pipe_context *ctx, void *state) * we are having lockup on evergreen so do not enable * hyperz when not writing zbuffer */ - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } } @@ -304,7 +312,7 @@ static void r600_bind_dsa_state(struct pipe_context *ctx, void *state) rctx->alphatest_state.sx_alpha_ref != dsa->alpha_ref) { rctx->alphatest_state.sx_alpha_test_control = dsa->sx_alpha_test_control; rctx->alphatest_state.sx_alpha_ref = dsa->alpha_ref; - rctx->alphatest_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom); } } @@ -318,14 +326,14 @@ static void r600_bind_rs_state(struct pipe_context *ctx, void *state) rctx->rasterizer = rs; - r600_set_cso_state_with_cb(&rctx->rasterizer_state, rs, &rs->buffer); + r600_set_cso_state_with_cb(rctx, &rctx->rasterizer_state, rs, &rs->buffer); if (rs->offset_enable && (rs->offset_units != 
rctx->poly_offset_state.offset_units || rs->offset_scale != rctx->poly_offset_state.offset_scale)) { rctx->poly_offset_state.offset_units = rs->offset_units; rctx->poly_offset_state.offset_scale = rs->offset_scale; - rctx->poly_offset_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->poly_offset_state.atom); } /* Update clip_misc_state. */ @@ -333,14 +341,14 @@ static void r600_bind_rs_state(struct pipe_context *ctx, void *state) rctx->clip_misc_state.clip_plane_enable != rs->clip_plane_enable) { rctx->clip_misc_state.pa_cl_clip_cntl = rs->pa_cl_clip_cntl; rctx->clip_misc_state.clip_plane_enable = rs->clip_plane_enable; - rctx->clip_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom); } /* Workaround for a missing scissor enable on r600. */ if (rctx->b.chip_class == R600 && rs->scissor_enable != rctx->scissor[0].enable) { rctx->scissor[0].enable = rs->scissor_enable; - rctx->scissor[0].atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->scissor[0].atom); } /* Re-emit PA_SC_LINE_STIPPLE. */ @@ -378,7 +386,7 @@ void r600_sampler_states_dirty(struct r600_context *rctx, state->atom.num_dw = util_bitcount(state->dirty_mask & state->has_bordercolor_mask) * 11 + util_bitcount(state->dirty_mask & ~state->has_bordercolor_mask) * 5; - state->atom.dirty = true; + r600_mark_atom_dirty(rctx, &state->atom); } } @@ -399,9 +407,9 @@ static void r600_bind_sampler_states(struct pipe_context *pipe, assert(start == 0); /* XXX fix below */ - if (shader != PIPE_SHADER_VERTEX && - shader != PIPE_SHADER_FRAGMENT) { - return; + if (!states) { + disable_mask = ~0u; + count = 0; } for (i = 0; i < count; i++) { @@ -443,7 +451,7 @@ static void r600_bind_sampler_states(struct pipe_context *pipe, /* change in TA_CNTL_AUX need a pipeline flush */ rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE; rctx->seamless_cube_map.enabled = seamless_cube_map; - rctx->seamless_cube_map.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->seamless_cube_map.atom); } } @@ -483,7 +491,7 @@ static void r600_bind_vertex_elements(struct pipe_context *ctx, void *state) { struct r600_context *rctx = (struct r600_context *)ctx; - r600_set_cso_state(&rctx->vertex_fetch_shader, state); + r600_set_cso_state(rctx, &rctx->vertex_fetch_shader, state); } static void r600_delete_vertex_elements(struct pipe_context *ctx, void *state) @@ -513,7 +521,7 @@ void r600_vertex_buffers_dirty(struct r600_context *rctx) rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE; rctx->vertex_buffer_state.atom.num_dw = (rctx->b.chip_class >= EVERGREEN ? 12 : 11) * util_bitcount(rctx->vertex_buffer_state.dirty_mask); - rctx->vertex_buffer_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->vertex_buffer_state.atom); } } @@ -570,7 +578,7 @@ void r600_sampler_views_dirty(struct r600_context *rctx, rctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE; state->atom.num_dw = (rctx->b.chip_class >= EVERGREEN ? 
14 : 13) * util_bitcount(state->dirty_mask); - state->atom.dirty = true; + r600_mark_atom_dirty(rctx, &state->atom); } } @@ -593,9 +601,9 @@ static void r600_set_sampler_views(struct pipe_context *pipe, unsigned shader, assert(start == 0); /* XXX fix below */ - if (shader == PIPE_SHADER_COMPUTE) { - evergreen_set_cs_sampler_view(pipe, start, count, views); - return; + if (!views) { + disable_mask = ~0u; + count = 0; } remaining_mask = dst->views.enabled_mask & disable_mask; @@ -673,7 +681,7 @@ static void r600_set_viewport_states(struct pipe_context *ctx, for (i = start_slot; i < start_slot + num_viewports; i++) { rctx->viewport[i].state = state[i - start_slot]; - rctx->viewport[i].atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->viewport[i].atom); } } @@ -694,7 +702,7 @@ void r600_emit_viewport_state(struct r600_context *rctx, struct r600_atom *atom) } /* Compute the key for the hw shader variant */ -static INLINE struct r600_shader_key r600_shader_selector_key(struct pipe_context * ctx, +static inline struct r600_shader_key r600_shader_selector_key(struct pipe_context * ctx, struct r600_pipe_shader_selector * sel) { struct r600_context *rctx = (struct r600_context *)ctx; @@ -913,7 +921,7 @@ void r600_constant_buffers_dirty(struct r600_context *rctx, struct r600_constbuf rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE; state->atom.num_dw = rctx->b.chip_class >= EVERGREEN ? util_bitcount(state->dirty_mask)*20 : util_bitcount(state->dirty_mask)*19; - state->atom.dirty = true; + r600_mark_atom_dirty(rctx, &state->atom); } } @@ -982,7 +990,7 @@ static void r600_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask return; rctx->sample_mask.sample_mask = sample_mask; - rctx->sample_mask.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->sample_mask.atom); } /* @@ -1107,27 +1115,28 @@ static void update_shader_atom(struct pipe_context *ctx, struct r600_shader_state *state, struct r600_pipe_shader *shader) { + struct r600_context *rctx = (struct r600_context *)ctx; + state->shader = shader; if (shader) { state->atom.num_dw = shader->command_buffer.num_dw; - state->atom.dirty = true; r600_context_add_resource_size(ctx, (struct pipe_resource *)shader->bo); } else { state->atom.num_dw = 0; - state->atom.dirty = false; } + r600_mark_atom_dirty(rctx, &state->atom); } static void update_gs_block_state(struct r600_context *rctx, unsigned enable) { if (rctx->shader_stages.geom_enable != enable) { rctx->shader_stages.geom_enable = enable; - rctx->shader_stages.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->shader_stages.atom); } if (rctx->gs_rings.enable != enable) { rctx->gs_rings.enable = enable; - rctx->gs_rings.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->gs_rings.atom); if (enable && !rctx->gs_rings.esgs_ring.buffer) { unsigned size = 0x1C000; @@ -1192,7 +1201,7 @@ static bool r600_update_derived_state(struct r600_context *rctx) if (!rctx->shader_stages.geom_enable) { rctx->shader_stages.geom_enable = true; - rctx->shader_stages.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->shader_stages.atom); } /* gs_shader provides GS and VS (copy shader) */ @@ -1206,8 +1215,9 @@ static bool r600_update_derived_state(struct r600_context *rctx) rctx->clip_misc_state.pa_cl_vs_out_cntl = rctx->gs_shader->current->gs_copy_shader->pa_cl_vs_out_cntl; rctx->clip_misc_state.clip_dist_write = rctx->gs_shader->current->gs_copy_shader->shader.clip_dist_write; rctx->clip_misc_state.clip_disable = rctx->gs_shader->current->shader.vs_position_window_space; - 
rctx->clip_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom); } + rctx->b.streamout.enabled_stream_buffers_mask = rctx->gs_shader->current->gs_copy_shader->enabled_stream_buffers_mask; } r600_shader_select(ctx, rctx->vs_shader, &vs_dirty); @@ -1223,7 +1233,7 @@ static bool r600_update_derived_state(struct r600_context *rctx) update_shader_atom(ctx, &rctx->geometry_shader, NULL); update_shader_atom(ctx, &rctx->export_shader, NULL); rctx->shader_stages.geom_enable = false; - rctx->shader_stages.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->shader_stages.atom); } r600_shader_select(ctx, rctx->vs_shader, &vs_dirty); @@ -1240,8 +1250,9 @@ static bool r600_update_derived_state(struct r600_context *rctx) rctx->clip_misc_state.pa_cl_vs_out_cntl = rctx->vs_shader->current->pa_cl_vs_out_cntl; rctx->clip_misc_state.clip_dist_write = rctx->vs_shader->current->shader.clip_dist_write; rctx->clip_misc_state.clip_disable = rctx->vs_shader->current->shader.vs_position_window_space; - rctx->clip_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom); } + rctx->b.streamout.enabled_stream_buffers_mask = rctx->vs_shader->current->enabled_stream_buffers_mask; } } @@ -1252,7 +1263,7 @@ static bool r600_update_derived_state(struct r600_context *rctx) if (rctx->cb_misc_state.nr_ps_color_outputs != rctx->ps_shader->current->nr_ps_color_outputs) { rctx->cb_misc_state.nr_ps_color_outputs = rctx->ps_shader->current->nr_ps_color_outputs; - rctx->cb_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom); } if (rctx->b.chip_class <= R700) { @@ -1260,7 +1271,7 @@ static bool r600_update_derived_state(struct r600_context *rctx) if (rctx->cb_misc_state.multiwrite != multiwrite) { rctx->cb_misc_state.multiwrite = multiwrite; - rctx->cb_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom); } } @@ -1274,7 +1285,7 @@ static bool r600_update_derived_state(struct r600_context *rctx) r600_update_ps_state(ctx, rctx->ps_shader->current); } - rctx->shader_stages.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->shader_stages.atom); update_shader_atom(ctx, &rctx->pixel_shader, rctx->ps_shader->current); } @@ -1409,7 +1420,6 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info data += info.indirect_offset / sizeof(unsigned); start = data[2] * ib.index_size; count = data[0]; - rctx->b.ws->buffer_unmap(indirect_resource->cs_buf); } else { start = 0; @@ -1454,24 +1464,23 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info rctx->vgt_state.vgt_multi_prim_ib_reset_en = info.primitive_restart; rctx->vgt_state.vgt_multi_prim_ib_reset_indx = info.restart_index; rctx->vgt_state.vgt_indx_offset = info.index_bias; - rctx->vgt_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->vgt_state.atom); } /* Workaround for hardware deadlock on certain R600 ASICs: write into a CB register. */ if (rctx->b.chip_class == R600) { rctx->b.flags |= R600_CONTEXT_PS_PARTIAL_FLUSH; - rctx->cb_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom); } /* Emit states. */ r600_need_cs_space(rctx, ib.user_buffer ? 
5 : 0, TRUE); r600_flush_emit(rctx); - for (i = 0; i < R600_NUM_ATOMS; i++) { - if (rctx->atoms[i] == NULL || !rctx->atoms[i]->dirty) { - continue; - } + i = r600_next_dirty_atom(rctx, 0); + while (i < R600_NUM_ATOMS) { r600_emit_atom(rctx, rctx->atoms[i]); + i = r600_next_dirty_atom(rctx, i + 1); } if (rctx->b.chip_class == CAYMAN) { @@ -2490,7 +2499,7 @@ static void r600_set_occlusion_query_state(struct pipe_context *ctx, bool enable if (rctx->db_misc_state.occlusion_query_enabled != enable) { rctx->db_misc_state.occlusion_query_enabled = enable; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } } diff --git a/src/gallium/drivers/r600/sb/sb_sched.cpp b/src/gallium/drivers/r600/sb/sb_sched.cpp index 2e38a62c05a..62680788c5e 100644 --- a/src/gallium/drivers/r600/sb/sb_sched.cpp +++ b/src/gallium/drivers/r600/sb/sb_sched.cpp @@ -489,7 +489,7 @@ bool alu_group_tracker::try_reserve(alu_node* n) { n->bc.bank_swizzle = 0; - if (!trans & fbs) + if (!trans && fbs) n->bc.bank_swizzle = VEC_210; if (gpr.try_reserve(n)) { diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c index fc5f6c29870..cb9809f2449 100644 --- a/src/gallium/drivers/radeon/r600_buffer_common.c +++ b/src/gallium/drivers/radeon/r600_buffer_common.c @@ -84,7 +84,7 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx, } } - if (busy || ctx->ws->buffer_is_busy(resource->buf, rusage)) { + if (busy || !ctx->ws->buffer_wait(resource->buf, 0, rusage)) { if (usage & PIPE_TRANSFER_DONTBLOCK) { return NULL; } else { @@ -121,7 +121,8 @@ bool r600_init_resource(struct r600_common_screen *rscreen, /* Older kernels didn't always flush the HDP cache before * CS execution */ - if (rscreen->info.drm_minor < 40) { + if (rscreen->info.drm_major == 2 && + rscreen->info.drm_minor < 40) { res->domains = RADEON_DOMAIN_GTT; flags |= RADEON_FLAG_GTT_WC; break; @@ -147,7 +148,8 @@ bool r600_init_resource(struct r600_common_screen *rscreen, * Write-combined CPU mappings are fine, the kernel ensures all CPU * writes finish before the GPU executes a command stream. */ - if (rscreen->info.drm_minor < 40) + if (rscreen->info.drm_major == 2 && + rscreen->info.drm_minor < 40) res->domains = RADEON_DOMAIN_GTT; else if (res->domains & RADEON_DOMAIN_VRAM) flags |= RADEON_FLAG_CPU_ACCESS; @@ -161,6 +163,9 @@ bool r600_init_resource(struct r600_common_screen *rscreen, flags |= RADEON_FLAG_NO_CPU_ACCESS; } + if (rscreen->debug_flags & DBG_NO_WC) + flags &= ~RADEON_FLAG_GTT_WC; + /* Allocate a new resource. */ new_buf = rscreen->ws->buffer_create(rscreen->ws, size, alignment, use_reusable_pool, @@ -274,7 +279,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, /* Check if mapping this buffer would cause waiting for the GPU. */ if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) || - rctx->ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) { + !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) { rctx->invalidate_buffer(&rctx->b, &rbuffer->b.b); } /* At this point, the buffer is always idle. */ @@ -288,7 +293,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, /* Check if mapping this buffer would cause waiting for the GPU. 
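
The winsys change running through these hunks replaces the dedicated busy probe with a zero-timeout wait, so one entry point covers both polling and blocking. Stated as a comment, since the winsys structures are not reproduced here:

/* old:  busy = ws->buffer_is_busy(buf, usage);
 * new:  busy = !ws->buffer_wait(buf, 0, usage);   timeout 0 = poll
 *
 * A nonzero timeout turns the same call into a bounded wait, and an
 * infinite one blocks until the buffer is idle for the given usage,
 * returning true on success. */
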
*/ if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) || - rctx->ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) { + !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) { /* Do a wait-free write-only transfer using a temporary buffer. */ unsigned offset; struct r600_resource *staging = NULL; diff --git a/src/gallium/drivers/radeon/r600_cs.h b/src/gallium/drivers/radeon/r600_cs.h index b51eebbc68e..03a04b754d6 100644 --- a/src/gallium/drivers/radeon/r600_cs.h +++ b/src/gallium/drivers/radeon/r600_cs.h @@ -33,7 +33,7 @@ #include "r600_pipe_common.h" #include "r600d_common.h" -static INLINE unsigned r600_context_bo_reloc(struct r600_common_context *rctx, +static inline unsigned r600_context_bo_reloc(struct r600_common_context *rctx, struct r600_ring *ring, struct r600_resource *rbo, enum radeon_bo_usage usage, @@ -59,7 +59,7 @@ static INLINE unsigned r600_context_bo_reloc(struct r600_common_context *rctx, rbo->domains, priority) * 4; } -static INLINE void r600_emit_reloc(struct r600_common_context *rctx, +static inline void r600_emit_reloc(struct r600_common_context *rctx, struct r600_ring *ring, struct r600_resource *rbo, enum radeon_bo_usage usage, enum radeon_bo_priority priority) @@ -74,57 +74,57 @@ static INLINE void r600_emit_reloc(struct r600_common_context *rctx, } } -static INLINE void r600_write_config_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) +static inline void r600_write_config_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) { assert(reg < R600_CONTEXT_REG_OFFSET); - assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS); + assert(cs->cdw+2+num <= cs->max_dw); radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0)); radeon_emit(cs, (reg - R600_CONFIG_REG_OFFSET) >> 2); } -static INLINE void r600_write_config_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) +static inline void r600_write_config_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) { r600_write_config_reg_seq(cs, reg, 1); radeon_emit(cs, value); } -static INLINE void r600_write_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) +static inline void r600_write_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) { assert(reg >= R600_CONTEXT_REG_OFFSET); - assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS); + assert(cs->cdw+2+num <= cs->max_dw); radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0)); radeon_emit(cs, (reg - R600_CONTEXT_REG_OFFSET) >> 2); } -static INLINE void r600_write_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) +static inline void r600_write_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) { r600_write_context_reg_seq(cs, reg, 1); radeon_emit(cs, value); } -static INLINE void si_write_sh_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) +static inline void si_write_sh_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) { assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END); - assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS); + assert(cs->cdw+2+num <= cs->max_dw); radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0)); radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2); } -static INLINE void si_write_sh_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) +static inline void si_write_sh_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) { si_write_sh_reg_seq(cs, reg, 1); radeon_emit(cs, value); } -static INLINE void cik_write_uconfig_reg_seq(struct radeon_winsys_cs 
*cs, unsigned reg, unsigned num) +static inline void cik_write_uconfig_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) { assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END); - assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS); + assert(cs->cdw+2+num <= cs->max_dw); radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, 0)); radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2); } -static INLINE void cik_write_uconfig_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) +static inline void cik_write_uconfig_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) { cik_write_uconfig_reg_seq(cs, reg, 1); radeon_emit(cs, value); diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c index 3def4446882..ed5d1dabdc3 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.c +++ b/src/gallium/drivers/radeon/r600_pipe_common.c @@ -108,9 +108,9 @@ void r600_draw_rectangle(struct blitter_context *blitter, void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw) { /* Flush if there's not enough space. */ - if ((num_dw + ctx->rings.dma.cs->cdw) > RADEON_MAX_CMDBUF_DWORDS) { + if ((num_dw + ctx->rings.dma.cs->cdw) > ctx->rings.dma.cs->max_dw) { ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); - assert((num_dw + ctx->rings.dma.cs->cdw) <= RADEON_MAX_CMDBUF_DWORDS); + assert((num_dw + ctx->rings.dma.cs->cdw) <= ctx->rings.dma.cs->max_dw); } } @@ -132,10 +132,11 @@ void r600_preflush_suspend_features(struct r600_common_context *ctx) } /* suspend queries */ - ctx->nontimer_queries_suspended = false; + ctx->queries_suspended_for_flush = false; if (ctx->num_cs_dw_nontimer_queries_suspend) { r600_suspend_nontimer_queries(ctx); - ctx->nontimer_queries_suspended = true; + r600_suspend_timer_queries(ctx); + ctx->queries_suspended_for_flush = true; } ctx->streamout.suspended = false; @@ -153,8 +154,9 @@ void r600_postflush_resume_features(struct r600_common_context *ctx) } /* resume queries */ - if (ctx->nontimer_queries_suspended) { + if (ctx->queries_suspended_for_flush) { r600_resume_nontimer_queries(ctx); + r600_resume_timer_queries(ctx); } /* Re-enable render condition. 
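
r600_need_dma_space() above shows the sizing pattern now that each command stream carries its own limit (cs->max_dw) instead of the compile-time RADEON_MAX_CMDBUF_DWORDS: flush whenever the next packet would not fit, then assert it fits in the fresh IB. A self-contained sketch with stub types:

#include <assert.h>

struct cs { unsigned cdw, max_dw; };          /* stand-in for radeon_winsys_cs */

static void flush(struct cs *cs) { cs->cdw = 0; }  /* submit + reset (stub) */

static void need_space(struct cs *cs, unsigned num_dw)
{
   if (cs->cdw + num_dw > cs->max_dw)
      flush(cs);
   assert(cs->cdw + num_dw <= cs->max_dw);    /* must fit in an empty IB */
}
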
*/ @@ -196,6 +198,19 @@ static void r600_flush_dma_ring(void *ctx, unsigned flags, rctx->rings.dma.flushing = false; } +static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + unsigned latest = rctx->ws->query_value(rctx->ws, + RADEON_GPU_RESET_COUNTER); + + if (rctx->gpu_reset_counter == latest) + return PIPE_NO_RESET; + + rctx->gpu_reset_counter = latest; + return PIPE_UNKNOWN_CONTEXT_RESET; +} + bool r600_common_context_init(struct r600_common_context *rctx, struct r600_common_screen *rscreen) { @@ -222,6 +237,13 @@ bool r600_common_context_init(struct r600_common_context *rctx, rctx->b.memory_barrier = r600_memory_barrier; rctx->b.flush = r600_flush_from_st; + if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 43) { + rctx->b.get_device_reset_status = r600_get_reset_status; + rctx->gpu_reset_counter = + rctx->ws->query_value(rctx->ws, + RADEON_GPU_RESET_COUNTER); + } + LIST_INITHEAD(&rctx->texture_buffers); r600_init_context_texture_functions(rctx); @@ -240,8 +262,12 @@ bool r600_common_context_init(struct r600_common_context *rctx, if (!rctx->uploader) return false; + rctx->ctx = rctx->ws->ctx_create(rctx->ws); + if (!rctx->ctx) + return false; + if (rscreen->info.r600_has_dma && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) { - rctx->rings.dma.cs = rctx->ws->cs_create(rctx->ws, RING_DMA, + rctx->rings.dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA, r600_flush_dma_ring, rctx, NULL); rctx->rings.dma.flush = r600_flush_dma_ring; @@ -252,12 +278,12 @@ bool r600_common_context_init(struct r600_common_context *rctx, void r600_common_context_cleanup(struct r600_common_context *rctx) { - if (rctx->rings.gfx.cs) { + if (rctx->rings.gfx.cs) rctx->ws->cs_destroy(rctx->rings.gfx.cs); - } - if (rctx->rings.dma.cs) { + if (rctx->rings.dma.cs) rctx->ws->cs_destroy(rctx->rings.dma.cs); - } + if (rctx->ctx) + rctx->ws->ctx_destroy(rctx->ctx); if (rctx->uploader) { u_upload_destroy(rctx->uploader); @@ -313,6 +339,11 @@ static const struct debug_named_value common_debug_options[] = { { "gs", DBG_GS, "Print geometry shaders" }, { "ps", DBG_PS, "Print pixel shaders" }, { "cs", DBG_CS, "Print compute shaders" }, + { "tcs", DBG_TCS, "Print tessellation control shaders" }, + { "tes", DBG_TES, "Print tessellation evaluation shaders" }, + { "noir", DBG_NO_IR, "Don't print the LLVM IR"}, + { "notgsi", DBG_NO_TGSI, "Don't print the TGSI"}, + { "noasm", DBG_NO_ASM, "Don't print disassembled shaders"}, /* features */ { "nodma", DBG_NO_ASYNC_DMA, "Disable asynchronous DMA" }, @@ -324,6 +355,7 @@ static const struct debug_named_value common_debug_options[] = { { "switch_on_eop", DBG_SWITCH_ON_EOP, "Program WD/IA to switch on end-of-packet." }, { "forcedma", DBG_FORCE_DMA, "Use asynchronous DMA for all operations when possible." }, { "precompile", DBG_PRECOMPILE, "Compile one shader variant at shader creation." 
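
r600_get_reset_status() above reduces robustness reporting to a counter compare: the kernel bumps RADEON_GPU_RESET_COUNTER on every GPU reset, the context caches the value it saw at creation, and any difference is reported once. The skeleton of that pattern, with query_counter() as a hypothetical stand-in for ws->query_value():

unsigned query_counter(void);       /* hypothetical kernel query */

static unsigned cached;             /* seeded at context creation */

static int context_was_reset(void)
{
   unsigned latest = query_counter();

   if (latest == cached)
      return 0;                     /* PIPE_NO_RESET */

   cached = latest;                 /* report each reset only once */
   return 1;                        /* PIPE_UNKNOWN_CONTEXT_RESET */
}
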
}, + { "nowc", DBG_NO_WC, "Disable GTT write combining" }, DEBUG_NAMED_VALUE_END /* must be last */ }; @@ -338,11 +370,9 @@ static const char* r600_get_device_vendor(struct pipe_screen* pscreen) return "AMD"; } -static const char* r600_get_name(struct pipe_screen* pscreen) +static const char* r600_get_chip_name(struct r600_common_screen *rscreen) { - struct r600_common_screen *rscreen = (struct r600_common_screen*)pscreen; - - switch (rscreen->family) { + switch (rscreen->info.family) { case CHIP_R600: return "AMD R600"; case CHIP_RV610: return "AMD RV610"; case CHIP_RV630: return "AMD RV630"; @@ -378,10 +408,21 @@ static const char* r600_get_name(struct pipe_screen* pscreen) case CHIP_KABINI: return "AMD KABINI"; case CHIP_HAWAII: return "AMD HAWAII"; case CHIP_MULLINS: return "AMD MULLINS"; + case CHIP_TONGA: return "AMD TONGA"; + case CHIP_ICELAND: return "AMD ICELAND"; + case CHIP_CARRIZO: return "AMD CARRIZO"; + case CHIP_FIJI: return "AMD FIJI"; default: return "AMD unknown"; } } +static const char* r600_get_name(struct pipe_screen* pscreen) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen*)pscreen; + + return rscreen->renderer_string; +} + static float r600_get_paramf(struct pipe_screen* pscreen, enum pipe_capf param) { @@ -495,6 +536,10 @@ const char *r600_get_llvm_processor_name(enum radeon_family family) #else return "kabini"; #endif + case CHIP_TONGA: return "tonga"; + case CHIP_ICELAND: return "iceland"; + case CHIP_CARRIZO: return "carrizo"; + case CHIP_FIJI: return "fiji"; default: return ""; } } @@ -636,6 +681,12 @@ static int r600_get_compute_param(struct pipe_screen *screen, return sizeof(uint32_t); case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: break; /* unused */ + case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: + if (ret) { + uint32_t *subgroup_size = ret; + *subgroup_size = r600_wavefront_size(rscreen->family); + } + return sizeof(uint32_t); } fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param); @@ -656,25 +707,33 @@ static int r600_get_driver_query_info(struct pipe_screen *screen, { struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; struct pipe_driver_query_info list[] = { + {"num-compilations", R600_QUERY_NUM_COMPILATIONS, {0}, PIPE_DRIVER_QUERY_TYPE_UINT64, + PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE}, + {"num-shaders-created", R600_QUERY_NUM_SHADERS_CREATED, {0}, PIPE_DRIVER_QUERY_TYPE_UINT64, + PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE}, {"draw-calls", R600_QUERY_DRAW_CALLS, {0}}, {"requested-VRAM", R600_QUERY_REQUESTED_VRAM, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES}, {"requested-GTT", R600_QUERY_REQUESTED_GTT, {rscreen->info.gart_size}, PIPE_DRIVER_QUERY_TYPE_BYTES}, - {"buffer-wait-time", R600_QUERY_BUFFER_WAIT_TIME, {0}}, + {"buffer-wait-time", R600_QUERY_BUFFER_WAIT_TIME, {0}, PIPE_DRIVER_QUERY_TYPE_MICROSECONDS, + PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE}, {"num-cs-flushes", R600_QUERY_NUM_CS_FLUSHES, {0}}, - {"num-bytes-moved", R600_QUERY_NUM_BYTES_MOVED, {0}, PIPE_DRIVER_QUERY_TYPE_BYTES}, + {"num-bytes-moved", R600_QUERY_NUM_BYTES_MOVED, {0}, PIPE_DRIVER_QUERY_TYPE_BYTES, + PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE}, {"VRAM-usage", R600_QUERY_VRAM_USAGE, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES}, {"GTT-usage", R600_QUERY_GTT_USAGE, {rscreen->info.gart_size}, PIPE_DRIVER_QUERY_TYPE_BYTES}, + {"GPU-load", R600_QUERY_GPU_LOAD, {100}}, {"temperature", R600_QUERY_GPU_TEMPERATURE, {100}}, - {"shader-clock", R600_QUERY_CURRENT_GPU_SCLK, {0}}, - {"memory-clock", R600_QUERY_CURRENT_GPU_MCLK, {0}}, - 
{"GPU-load", R600_QUERY_GPU_LOAD, {100}} + {"shader-clock", R600_QUERY_CURRENT_GPU_SCLK, {0}, PIPE_DRIVER_QUERY_TYPE_HZ}, + {"memory-clock", R600_QUERY_CURRENT_GPU_MCLK, {0}, PIPE_DRIVER_QUERY_TYPE_HZ}, }; unsigned num_queries; if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42) num_queries = Elements(list); + else if (rscreen->info.drm_major == 3) + num_queries = Elements(list) - 3; else - num_queries = 8; + num_queries = Elements(list) - 4; if (!info) return num_queries; @@ -695,14 +754,6 @@ static void r600_fence_reference(struct pipe_screen *screen, rws->fence_reference(ptr, fence); } -static boolean r600_fence_signalled(struct pipe_screen *screen, - struct pipe_fence_handle *fence) -{ - struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws; - - return rws->fence_wait(rws, fence, 0); -} - static boolean r600_fence_finish(struct pipe_screen *screen, struct pipe_fence_handle *fence, uint64_t timeout) @@ -837,8 +888,22 @@ struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen, bool r600_common_screen_init(struct r600_common_screen *rscreen, struct radeon_winsys *ws) { + char llvm_string[32] = {}; + ws->query_info(ws, &rscreen->info); +#if HAVE_LLVM + snprintf(llvm_string, sizeof(llvm_string), + ", LLVM %i.%i.%i", (HAVE_LLVM >> 8) & 0xff, + HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH); +#endif + + snprintf(rscreen->renderer_string, sizeof(rscreen->renderer_string), + "%s (DRM %i.%i.%i%s)", + r600_get_chip_name(rscreen), rscreen->info.drm_major, + rscreen->info.drm_minor, rscreen->info.drm_patchlevel, + llvm_string); + rscreen->b.get_name = r600_get_name; rscreen->b.get_vendor = r600_get_vendor; rscreen->b.get_device_vendor = r600_get_device_vendor; @@ -848,7 +913,6 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen, rscreen->b.get_timestamp = r600_get_timestamp; rscreen->b.fence_finish = r600_fence_finish; rscreen->b.fence_reference = r600_fence_reference; - rscreen->b.fence_signalled = r600_fence_signalled; rscreen->b.resource_destroy = u_resource_destroy_vtbl; rscreen->b.resource_from_user_memory = r600_buffer_from_user_memory; @@ -874,7 +938,9 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen, pipe_mutex_init(rscreen->aux_context_lock); pipe_mutex_init(rscreen->gpu_load_mutex); - if (rscreen->info.drm_minor >= 28 && (rscreen->debug_flags & DBG_TRACE_CS)) { + if (((rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 28) || + rscreen->info.drm_major == 3) && + (rscreen->debug_flags & DBG_TRACE_CS)) { rscreen->trace_bo = (struct r600_resource*)pipe_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM, PIPE_USAGE_STAGING, @@ -922,10 +988,8 @@ void r600_destroy_common_screen(struct r600_common_screen *rscreen) pipe_mutex_destroy(rscreen->aux_context_lock); rscreen->aux_context->destroy(rscreen->aux_context); - if (rscreen->trace_bo) { - rscreen->ws->buffer_unmap(rscreen->trace_bo->cs_buf); + if (rscreen->trace_bo) pipe_resource_reference((struct pipe_resource**)&rscreen->trace_bo, NULL); - } rscreen->ws->destroy(rscreen->ws); FREE(rscreen); @@ -941,6 +1005,10 @@ bool r600_can_dump_shader(struct r600_common_screen *rscreen, switch (tgsi_get_processor_type(tokens)) { case TGSI_PROCESSOR_VERTEX: return (rscreen->debug_flags & DBG_VS) != 0; + case TGSI_PROCESSOR_TESS_CTRL: + return (rscreen->debug_flags & DBG_TCS) != 0; + case TGSI_PROCESSOR_TESS_EVAL: + return (rscreen->debug_flags & DBG_TES) != 0; case TGSI_PROCESSOR_GEOMETRY: return (rscreen->debug_flags & DBG_GS) != 0; case TGSI_PROCESSOR_FRAGMENT: diff --git 
a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h index 6ce81d33ddd..29db1cc4e07 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/src/gallium/drivers/radeon/r600_pipe_common.h @@ -59,6 +59,8 @@ #define R600_QUERY_CURRENT_GPU_SCLK (PIPE_QUERY_DRIVER_SPECIFIC + 9) #define R600_QUERY_CURRENT_GPU_MCLK (PIPE_QUERY_DRIVER_SPECIFIC + 10) #define R600_QUERY_GPU_LOAD (PIPE_QUERY_DRIVER_SPECIFIC + 11) +#define R600_QUERY_NUM_COMPILATIONS (PIPE_QUERY_DRIVER_SPECIFIC + 12) +#define R600_QUERY_NUM_SHADERS_CREATED (PIPE_QUERY_DRIVER_SPECIFIC + 13) #define R600_CONTEXT_STREAMOUT_FLUSH (1u << 0) #define R600_CONTEXT_PRIVATE_FLAG (1u << 1) @@ -79,17 +81,23 @@ #define DBG_GS (1 << 7) #define DBG_PS (1 << 8) #define DBG_CS (1 << 9) +#define DBG_TCS (1 << 10) +#define DBG_TES (1 << 11) +#define DBG_NO_IR (1 << 12) +#define DBG_NO_TGSI (1 << 13) +#define DBG_NO_ASM (1 << 14) +/* Bits 21-31 are reserved for the r600g driver. */ /* features */ -#define DBG_NO_ASYNC_DMA (1 << 10) -#define DBG_NO_HYPERZ (1 << 11) -#define DBG_NO_DISCARD_RANGE (1 << 12) -#define DBG_NO_2D_TILING (1 << 13) -#define DBG_NO_TILING (1 << 14) -#define DBG_SWITCH_ON_EOP (1 << 15) -#define DBG_FORCE_DMA (1 << 16) -#define DBG_PRECOMPILE (1 << 17) -#define DBG_INFO (1 << 18) -/* The maximum allowed bit is 20. */ +#define DBG_NO_ASYNC_DMA (1llu << 32) +#define DBG_NO_HYPERZ (1llu << 33) +#define DBG_NO_DISCARD_RANGE (1llu << 34) +#define DBG_NO_2D_TILING (1llu << 35) +#define DBG_NO_TILING (1llu << 36) +#define DBG_SWITCH_ON_EOP (1llu << 37) +#define DBG_FORCE_DMA (1llu << 38) +#define DBG_PRECOMPILE (1llu << 39) +#define DBG_INFO (1llu << 40) +#define DBG_NO_WC (1llu << 41) #define R600_MAP_BUFFER_ALIGNMENT 64 @@ -127,9 +135,8 @@ struct radeon_shader_binary { struct radeon_shader_reloc *relocs; unsigned reloc_count; - /** Set to 1 if the disassembly for this binary has been dumped to - * stderr. */ - int disassembled; + /** Disassembled shader in a string. */ + char *disasm_string; }; struct r600_resource { @@ -214,7 +221,6 @@ struct r600_texture { float depth_clear_value; bool non_disp_tiling; /* R600-Cayman only */ - unsigned mipmap_shift; }; struct r600_surface { @@ -236,6 +242,7 @@ struct r600_surface { unsigned cb_color_pitch; /* EG and later */ unsigned cb_color_slice; /* EG and later */ unsigned cb_color_attrib; /* EG and later */ + unsigned cb_dcc_control; /* VI and later */ unsigned cb_color_fmask; /* CB_COLORn_FMASK (EG and later) or CB_COLORn_FRAG (r600) */ unsigned cb_color_fmask_slice; /* EG and later */ unsigned cb_color_cmask; /* CB_COLORn_TILE (r600 only) */ @@ -272,7 +279,7 @@ struct r600_common_screen { enum chip_class chip_class; struct radeon_info info; struct r600_tiling_info tiling_info; - unsigned debug_flags; + uint64_t debug_flags; bool has_cp_dma; bool has_streamout; @@ -285,12 +292,23 @@ struct r600_common_screen { uint32_t *trace_ptr; unsigned cs_count; + /* This must be in the screen, because UE4 uses one context for + * compilation and another one for rendering. + */ + unsigned num_compilations; + /* Along with ST_DEBUG=precompile, this should show if applications + * are loading shaders on demand. This is a monotonic counter. + */ + unsigned num_shaders_created; + /* GPU load thread. 
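
Moving the feature flags up to bits 32-41 is why every definition switches from (1 << n) to (1llu << n) and why debug_flags widens from unsigned to uint64_t: shifting a 32-bit int past bit 31 is undefined behavior, and a 32-bit holder would drop the new bits in any case. A worked comparison:

/*   1    << 33   undefined: shift count >= width of int
 *   1u   << 33   still undefined for a 32-bit unsigned
 *   1llu << 33   0x200000000, a well-defined 64-bit constant
 *
 * so tests such as (rscreen->debug_flags & DBG_NO_WC) only behave once
 * debug_flags itself is uint64_t. */
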
*/ pipe_mutex gpu_load_mutex; pipe_thread gpu_load_thread; unsigned gpu_load_counter_busy; unsigned gpu_load_counter_idle; - unsigned gpu_load_stop_thread; /* bool */ + volatile unsigned gpu_load_stop_thread; /* bool */ + + char renderer_string[64]; }; /* This encapsulates a state or an operation which can emitted into the GPU @@ -298,6 +316,7 @@ struct r600_common_screen { struct r600_atom { void (*emit)(struct r600_common_context *ctx, struct r600_atom *state); unsigned num_dw; + unsigned short id; /* used by r600 only */ bool dirty; }; @@ -327,6 +346,10 @@ struct r600_streamout { /* External state which comes from the vertex shader, * it must be set explicitly when binding a shader. */ unsigned *stride_in_dw; + unsigned enabled_stream_buffers_mask; /* stream0 buffers0-3 in 4 LSB */ + + /* The state of VGT_STRMOUT_BUFFER_(CONFIG|EN). */ + unsigned hw_enabled_mask; /* The state of VGT_STRMOUT_(CONFIG|EN). */ struct r600_atom enable_atom; @@ -352,10 +375,12 @@ struct r600_common_context { struct r600_common_screen *screen; struct radeon_winsys *ws; + struct radeon_winsys_ctx *ctx; enum radeon_family family; enum chip_class chip_class; struct r600_rings rings; unsigned initial_gfx_cs_size; + unsigned gpu_reset_counter; struct u_upload_mgr *uploader; struct u_suballocator *allocator_so_filled_size; @@ -376,11 +401,14 @@ struct r600_common_context { int num_occlusion_queries; /* Keep track of non-timer queries, because they should be suspended * during context flushing. - * The timer queries (TIME_ELAPSED) shouldn't be suspended. */ + * The timer queries (TIME_ELAPSED) shouldn't be suspended for blits, + * but they should be suspended between IBs. */ struct list_head active_nontimer_queries; + struct list_head active_timer_queries; unsigned num_cs_dw_nontimer_queries_suspend; + unsigned num_cs_dw_timer_queries_suspend; /* If queries have been suspended. */ - bool nontimer_queries_suspended; + bool queries_suspended_for_flush; /* Additional hardware info. */ unsigned backend_mask; unsigned max_db; /* for OQ */ @@ -441,6 +469,9 @@ struct r600_common_context { /* This ensures there is enough space in the command stream. */ void (*need_gfx_cs_space)(struct pipe_context *ctx, unsigned num_dw, bool include_draw_vbo); + + void (*set_atom_dirty)(struct r600_common_context *ctx, + struct r600_atom *atom, bool dirty); }; /* r600_buffer.c */ @@ -495,6 +526,8 @@ unsigned r600_gpu_load_end(struct r600_common_screen *rscreen, uint64_t begin); void r600_query_init(struct r600_common_context *rctx); void r600_suspend_nontimer_queries(struct r600_common_context *ctx); void r600_resume_nontimer_queries(struct r600_common_context *ctx); +void r600_suspend_timer_queries(struct r600_common_context *ctx); +void r600_resume_timer_queries(struct r600_common_context *ctx); void r600_query_init_backend_mask(struct r600_common_context *ctx); /* r600_streamout.c */ @@ -549,12 +582,12 @@ void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples, /* Inline helpers. 
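
The set_atom_dirty hook added to r600_common_context exists because the dirty-atom bitmask lives in r600-only state that the shared radeon code cannot reach; r600_create_context installs r600_set_atom_dirty behind it, with a (void *) cast bridging the r600_context vs. r600_common_context parameter types. A sketch of the indirection with illustrative names:

#include <stdbool.h>

struct atom;                        /* opaque to the shared code */

struct common_ctx {
   void (*set_atom_dirty)(struct common_ctx *ctx,
                          struct atom *atom, bool dirty);
};

/* shared code no longer pokes atom->dirty directly: */
static void shared_path(struct common_ctx *ctx, struct atom *a)
{
   ctx->set_atom_dirty(ctx, a, true);   /* backend keeps its bitmask in sync */
}
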
*/ -static INLINE struct r600_resource *r600_resource(struct pipe_resource *r) +static inline struct r600_resource *r600_resource(struct pipe_resource *r) { return (struct r600_resource*)r; } -static INLINE void +static inline void r600_resource_reference(struct r600_resource **ptr, struct r600_resource *res) { pipe_resource_reference((struct pipe_resource **)ptr, @@ -570,6 +603,26 @@ static inline unsigned r600_tex_aniso_filter(unsigned filter) /* else */ return 4; } +static inline unsigned r600_wavefront_size(enum radeon_family family) +{ + switch (family) { + case CHIP_RV610: + case CHIP_RS780: + case CHIP_RV620: + case CHIP_RS880: + return 16; + case CHIP_RV630: + case CHIP_RV635: + case CHIP_RV730: + case CHIP_RV710: + case CHIP_PALM: + case CHIP_CEDAR: + return 32; + default: + return 64; + } +} + #define COMPUTE_DBG(rscreen, fmt, args...) \ do { \ if ((rscreen->b.debug_flags & DBG_COMPUTE)) fprintf(stderr, fmt, ##args); \ diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c index 71f4a1522f9..7057aa19a7c 100644 --- a/src/gallium/drivers/radeon/r600_query.c +++ b/src/gallium/drivers/radeon/r600_query.c @@ -54,6 +54,8 @@ struct r600_query { uint64_t end_result; /* Fence for GPU_FINISHED. */ struct pipe_fence_handle *fence; + /* For transform feedback: which stream the query is for */ + unsigned stream; }; @@ -90,6 +92,8 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c case R600_QUERY_CURRENT_GPU_SCLK: case R600_QUERY_CURRENT_GPU_MCLK: case R600_QUERY_GPU_LOAD: + case R600_QUERY_NUM_COMPILATIONS: + case R600_QUERY_NUM_SHADERS_CREATED: return NULL; } @@ -118,7 +122,6 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c } results += 4 * ctx->max_db; } - ctx->ws->buffer_unmap(buf->cs_buf); break; case PIPE_QUERY_TIME_ELAPSED: case PIPE_QUERY_TIMESTAMP: @@ -130,7 +133,6 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c case PIPE_QUERY_PIPELINE_STATISTICS: results = r600_buffer_map_sync_with_rings(ctx, buf, PIPE_TRANSFER_WRITE); memset(results, 0, buf_size); - ctx->ws->buffer_unmap(buf->cs_buf); break; default: assert(0); @@ -157,6 +159,17 @@ static void r600_update_occlusion_query_state(struct r600_common_context *rctx, } } +static unsigned event_type_for_stream(struct r600_query *query) +{ + switch (query->stream) { + default: + case 0: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS; + case 1: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS1; + case 2: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS2; + case 3: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS3; + } +} + static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_query *query) { struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; @@ -191,7 +204,7 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q case PIPE_QUERY_SO_STATISTICS: case PIPE_QUERY_SO_OVERFLOW_PREDICATE: radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3)); + radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3)); radeon_emit(cs, va); radeon_emit(cs, (va >> 32UL) & 0xFF); break; @@ -215,9 +228,10 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q r600_emit_reloc(ctx, &ctx->rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE, RADEON_PRIO_MIN); - if (!r600_is_timer_query(query->type)) { + if (r600_is_timer_query(query->type)) + ctx->num_cs_dw_timer_queries_suspend += query->num_cs_dw; 
+ else ctx->num_cs_dw_nontimer_queries_suspend += query->num_cs_dw; - } } static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_query *query) @@ -248,7 +262,7 @@ case PIPE_QUERY_SO_OVERFLOW_PREDICATE: va += query->buffer.results_end + query->result_size/2; radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3)); + radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3)); radeon_emit(cs, va); radeon_emit(cs, (va >> 32UL) & 0xFF); break; @@ -279,9 +293,10 @@ query->buffer.results_end += query->result_size; if (r600_query_needs_begin(query->type)) { - if (!r600_is_timer_query(query->type)) { + if (r600_is_timer_query(query->type)) + ctx->num_cs_dw_timer_queries_suspend -= query->num_cs_dw; + else ctx->num_cs_dw_nontimer_queries_suspend -= query->num_cs_dw; - } } r600_update_occlusion_query_state(ctx, query->type, -1); @@ -292,6 +307,13 @@ static void r600_emit_query_predication(struct r600_common_context *ctx, struct int operation, bool flag_wait) { struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; + uint32_t op = PRED_OP(operation); + + /* if true then invert, see GL_ARB_conditional_render_inverted */ + if (ctx->current_render_cond_cond) + op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible/overflow */ + else + op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible/overflow */ if (operation == PREDICATION_OP_CLEAR) { ctx->need_gfx_cs_space(&ctx->b, 3, FALSE); @@ -302,24 +324,21 @@ } else { struct r600_query_buffer *qbuf; unsigned count; - uint32_t op; - /* Find how many results there are. */ count = 0; for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { count += qbuf->results_end / query->result_size; } - + ctx->need_gfx_cs_space(&ctx->b, 5 * count, TRUE); - - op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE | - (flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW); - + + op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW; + /* emit predicate packets for all data blocks */ for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { unsigned results_base = 0; uint64_t va = qbuf->buf->gpu_address; - + while (results_base < qbuf->results_end) { radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); radeon_emit(cs, (va + results_base) & 0xFFFFFFFFUL); @@ -327,7 +346,7 @@ r600_emit_reloc(ctx, &ctx->rings.gfx, qbuf->buf, RADEON_USAGE_READ, RADEON_PRIO_MIN); results_base += query->result_size; - + /* set CONTINUE bit for all packets except the first */ op |= PREDICATION_CONTINUE; } @@ -369,6 +388,7 @@ static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned q /* NumPrimitivesWritten, PrimitiveStorageNeeded. */ query->result_size = 32; query->num_cs_dw = 6; + query->stream = index; break; case PIPE_QUERY_PIPELINE_STATISTICS: /* 11 values on EG, 8 on R600.
*/ @@ -390,6 +410,8 @@ static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned q case R600_QUERY_CURRENT_GPU_SCLK: case R600_QUERY_CURRENT_GPU_MCLK: case R600_QUERY_GPU_LOAD: + case R600_QUERY_NUM_COMPILATIONS: + case R600_QUERY_NUM_SHADERS_CREATED: skip_allocation = true; break; default: @@ -454,7 +476,7 @@ static boolean r600_begin_query(struct pipe_context *ctx, rquery->begin_result = 0; return true; case R600_QUERY_BUFFER_WAIT_TIME: - rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS); + rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS) / 1000; return true; case R600_QUERY_NUM_CS_FLUSHES: rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_CS_FLUSHES); @@ -465,6 +487,12 @@ static boolean r600_begin_query(struct pipe_context *ctx, case R600_QUERY_GPU_LOAD: rquery->begin_result = r600_gpu_load_begin(rctx->screen); return true; + case R600_QUERY_NUM_COMPILATIONS: + rquery->begin_result = p_atomic_read(&rctx->screen->num_compilations); + return true; + case R600_QUERY_NUM_SHADERS_CREATED: + rquery->begin_result = p_atomic_read(&rctx->screen->num_shaders_created); + return true; } /* Discard the old query buffers. */ @@ -477,7 +505,7 @@ static boolean r600_begin_query(struct pipe_context *ctx, /* Obtain a new buffer if the current one can't be mapped without a stall. */ if (r600_rings_is_buffer_referenced(rctx, rquery->buffer.buf->cs_buf, RADEON_USAGE_READWRITE) || - rctx->ws->buffer_is_busy(rquery->buffer.buf->buf, RADEON_USAGE_READWRITE)) { + !rctx->ws->buffer_wait(rquery->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) { pipe_resource_reference((struct pipe_resource**)&rquery->buffer.buf, NULL); rquery->buffer.buf = r600_new_query_buffer(rctx, rquery->type); } @@ -487,9 +515,10 @@ static boolean r600_begin_query(struct pipe_context *ctx, r600_emit_query_begin(rctx, rquery); - if (!r600_is_timer_query(rquery->type)) { + if (r600_is_timer_query(rquery->type)) + LIST_ADDTAIL(&rquery->list, &rctx->active_timer_queries); + else LIST_ADDTAIL(&rquery->list, &rctx->active_nontimer_queries); - } return true; } @@ -515,7 +544,7 @@ static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query) rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_REQUESTED_GTT_MEMORY); return; case R600_QUERY_BUFFER_WAIT_TIME: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS); + rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS) / 1000; return; case R600_QUERY_NUM_CS_FLUSHES: rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_CS_FLUSHES); @@ -541,13 +570,18 @@ static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query) case R600_QUERY_GPU_LOAD: rquery->end_result = r600_gpu_load_end(rctx->screen, rquery->begin_result); return; + case R600_QUERY_NUM_COMPILATIONS: + rquery->end_result = p_atomic_read(&rctx->screen->num_compilations); + return; + case R600_QUERY_NUM_SHADERS_CREATED: + rquery->end_result = p_atomic_read(&rctx->screen->num_shaders_created); + return; } r600_emit_query_end(rctx, rquery); - if (r600_query_needs_begin(rquery->type) && !r600_is_timer_query(rquery->type)) { + if (r600_query_needs_begin(rquery->type)) LIST_DELINIT(&rquery->list); - } } static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index, @@ -601,6 +635,8 @@ static boolean r600_get_query_buffer_result(struct r600_common_context *ctx, case R600_QUERY_GPU_TEMPERATURE: case 
R600_QUERY_CURRENT_GPU_SCLK: case R600_QUERY_CURRENT_GPU_MCLK: + case R600_QUERY_NUM_COMPILATIONS: + case R600_QUERY_NUM_SHADERS_CREATED: result->u64 = query->end_result - query->begin_result; return TRUE; case R600_QUERY_GPU_LOAD: @@ -751,7 +787,6 @@ static boolean r600_get_query_buffer_result(struct r600_common_context *ctx, assert(0); } - ctx->ws->buffer_unmap(qbuf->buf->cs_buf); return TRUE; } @@ -823,22 +858,37 @@ static void r600_render_condition(struct pipe_context *ctx, } } -void r600_suspend_nontimer_queries(struct r600_common_context *ctx) +static void r600_suspend_queries(struct r600_common_context *ctx, + struct list_head *query_list, + unsigned *num_cs_dw_queries_suspend) { struct r600_query *query; - LIST_FOR_EACH_ENTRY(query, &ctx->active_nontimer_queries, list) { + LIST_FOR_EACH_ENTRY(query, query_list, list) { r600_emit_query_end(ctx, query); } - assert(ctx->num_cs_dw_nontimer_queries_suspend == 0); + assert(*num_cs_dw_queries_suspend == 0); +} + +void r600_suspend_nontimer_queries(struct r600_common_context *ctx) +{ + r600_suspend_queries(ctx, &ctx->active_nontimer_queries, + &ctx->num_cs_dw_nontimer_queries_suspend); +} + +void r600_suspend_timer_queries(struct r600_common_context *ctx) +{ + r600_suspend_queries(ctx, &ctx->active_timer_queries, + &ctx->num_cs_dw_timer_queries_suspend); } -static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx) +static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx, + struct list_head *query_list) { struct r600_query *query; unsigned num_dw = 0; - LIST_FOR_EACH_ENTRY(query, &ctx->active_nontimer_queries, list) { + LIST_FOR_EACH_ENTRY(query, query_list, list) { /* begin + end */ num_dw += query->num_cs_dw * 2; @@ -857,21 +907,35 @@ static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context * return num_dw; } -void r600_resume_nontimer_queries(struct r600_common_context *ctx) +static void r600_resume_queries(struct r600_common_context *ctx, + struct list_head *query_list, + unsigned *num_cs_dw_queries_suspend) { struct r600_query *query; + unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, query_list); - assert(ctx->num_cs_dw_nontimer_queries_suspend == 0); + assert(*num_cs_dw_queries_suspend == 0); /* Check CS space here. Resuming must not be interrupted by flushes. 
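The refactor here folds the timer and non-timer suspend/resume paths into one worker parameterized by the query list and its dword counter, so the public entry points shrink to wrappers. The shape of that refactor, reduced to a standalone toy (the example_ names are hypothetical and the GPU work is a printf):

#include <stdio.h>

struct example_ctx {
	unsigned num_cs_dw_nontimer_suspend;
	unsigned num_cs_dw_timer_suspend;
};

/* Shared worker: ends the queries tracked by one counter. */
static void example_suspend(unsigned *num_cs_dw_suspend, const char *what)
{
	printf("suspending %s queries (%u reserved dwords)\n",
	       what, *num_cs_dw_suspend);
	*num_cs_dw_suspend = 0;
}

/* The two entry points become one-line wrappers, mirroring
 * r600_suspend_nontimer_queries() and r600_suspend_timer_queries(). */
static void example_suspend_nontimer(struct example_ctx *ctx)
{
	example_suspend(&ctx->num_cs_dw_nontimer_suspend, "non-timer");
}

static void example_suspend_timer(struct example_ctx *ctx)
{
	example_suspend(&ctx->num_cs_dw_timer_suspend, "timer");
}

int main(void)
{
	struct example_ctx ctx = { 24, 12 };

	/* Per the comment above: a blit only pauses the non-timer class,
	 * while a flush pauses both so no query spans an IB boundary. */
	example_suspend_nontimer(&ctx);
	example_suspend_timer(&ctx);
	return 0;
}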
*/ - ctx->need_gfx_cs_space(&ctx->b, - r600_queries_num_cs_dw_for_resuming(ctx), TRUE); + ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, TRUE); - LIST_FOR_EACH_ENTRY(query, &ctx->active_nontimer_queries, list) { + LIST_FOR_EACH_ENTRY(query, query_list, list) { r600_emit_query_begin(ctx, query); } } +void r600_resume_nontimer_queries(struct r600_common_context *ctx) +{ + r600_resume_queries(ctx, &ctx->active_nontimer_queries, + &ctx->num_cs_dw_nontimer_queries_suspend); +} + +void r600_resume_timer_queries(struct r600_common_context *ctx) +{ + r600_resume_queries(ctx, &ctx->active_timer_queries, + &ctx->num_cs_dw_timer_queries_suspend); +} + /* Get backends mask */ void r600_query_init_backend_mask(struct r600_common_context *ctx) { @@ -919,7 +983,6 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx) results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_WRITE); if (results) { memset(results, 0, ctx->max_db * 4 * 4); - ctx->ws->buffer_unmap(buffer->cs_buf); /* emit EVENT_WRITE for ZPASS_DONE */ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); @@ -937,7 +1000,6 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx) if (results[i*4 + 1]) mask |= (1<<i); } - ctx->ws->buffer_unmap(buffer->cs_buf); } } @@ -966,4 +1028,5 @@ void r600_query_init(struct r600_common_context *rctx) rctx->b.render_condition = r600_render_condition; LIST_INITHEAD(&rctx->active_nontimer_queries); + LIST_INITHEAD(&rctx->active_timer_queries); } diff --git a/src/gallium/drivers/radeon/r600_streamout.c b/src/gallium/drivers/radeon/r600_streamout.c index bc8bf97ef89..0853f636a27 100644 --- a/src/gallium/drivers/radeon/r600_streamout.c +++ b/src/gallium/drivers/radeon/r600_streamout.c @@ -88,8 +88,7 @@ void r600_streamout_buffers_dirty(struct r600_common_context *rctx) 12 + /* flush_vgt_streamout */ num_bufs * 11; /* STRMOUT_BUFFER_UPDATE, BUFFER_SIZE */ - begin->num_dw = 12 + /* flush_vgt_streamout */ - 3; /* VGT_STRMOUT_BUFFER_CONFIG */ + begin->num_dw = 12; /* flush_vgt_streamout */ if (rctx->chip_class >= SI) { begin->num_dw += num_bufs * 4; /* SET_CONTEXT_REG */ @@ -105,7 +104,7 @@ void r600_streamout_buffers_dirty(struct r600_common_context *rctx) (num_bufs - num_bufs_appended) * 6 + /* STRMOUT_BUFFER_UPDATE */ (rctx->family > CHIP_R600 && rctx->family < CHIP_RS780 ? 2 : 0); /* SURFACE_BASE_UPDATE */ - begin->dirty = true; + rctx->set_atom_dirty(rctx, begin, true); r600_set_streamout_enable(rctx, true); } @@ -146,7 +145,7 @@ void r600_set_streamout_targets(struct pipe_context *ctx, if (num_targets) { r600_streamout_buffers_dirty(rctx); } else { - rctx->streamout.begin_atom.dirty = false; + rctx->set_atom_dirty(rctx, &rctx->streamout.begin_atom, false); r600_set_streamout_enable(rctx, false); } } @@ -192,11 +191,6 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r r600_flush_vgt_streamout(rctx); - r600_write_context_reg(cs, rctx->chip_class >= EVERGREEN ? - R_028B98_VGT_STRMOUT_BUFFER_CONFIG : - R_028B20_VGT_STRMOUT_BUFFER_EN, - rctx->streamout.enabled_mask); - for (i = 0; i < rctx->streamout.num_targets; i++) { if (!t[i]) continue; @@ -326,20 +320,42 @@ static bool r600_get_strmout_en(struct r600_common_context *rctx) static void r600_emit_streamout_enable(struct r600_common_context *rctx, struct r600_atom *atom) { - r600_write_context_reg(rctx->rings.gfx.cs, - rctx->chip_class >= EVERGREEN ? 
- R_028B94_VGT_STRMOUT_CONFIG : - R_028AB0_VGT_STRMOUT_EN, - S_028B94_STREAMOUT_0_EN(r600_get_strmout_en(rctx))); + unsigned strmout_config_reg = R_028AB0_VGT_STRMOUT_EN; + unsigned strmout_config_val = S_028B94_STREAMOUT_0_EN(r600_get_strmout_en(rctx)); + unsigned strmout_buffer_reg = R_028B20_VGT_STRMOUT_BUFFER_EN; + unsigned strmout_buffer_val = rctx->streamout.hw_enabled_mask & + rctx->streamout.enabled_stream_buffers_mask; + + if (rctx->chip_class >= EVERGREEN) { + strmout_buffer_reg = R_028B98_VGT_STRMOUT_BUFFER_CONFIG; + + strmout_config_reg = R_028B94_VGT_STRMOUT_CONFIG; + strmout_config_val |= + S_028B94_RAST_STREAM(0) | + S_028B94_STREAMOUT_1_EN(r600_get_strmout_en(rctx)) | + S_028B94_STREAMOUT_2_EN(r600_get_strmout_en(rctx)) | + S_028B94_STREAMOUT_3_EN(r600_get_strmout_en(rctx)); + } + r600_write_context_reg(rctx->rings.gfx.cs, strmout_buffer_reg, strmout_buffer_val); + r600_write_context_reg(rctx->rings.gfx.cs, strmout_config_reg, strmout_config_val); } static void r600_set_streamout_enable(struct r600_common_context *rctx, bool enable) { bool old_strmout_en = r600_get_strmout_en(rctx); + unsigned old_hw_enabled_mask = rctx->streamout.hw_enabled_mask; rctx->streamout.streamout_enabled = enable; - if (old_strmout_en != r600_get_strmout_en(rctx)) - rctx->streamout.enable_atom.dirty = true; + + rctx->streamout.hw_enabled_mask = rctx->streamout.enabled_mask | + (rctx->streamout.enabled_mask << 4) | + (rctx->streamout.enabled_mask << 8) | + (rctx->streamout.enabled_mask << 12); + + if ((old_strmout_en != r600_get_strmout_en(rctx)) || + (old_hw_enabled_mask != rctx->streamout.hw_enabled_mask)) { + rctx->set_atom_dirty(rctx, &rctx->streamout.enable_atom, true); + } } void r600_update_prims_generated_query_state(struct r600_common_context *rctx, @@ -354,8 +370,9 @@ void r600_update_prims_generated_query_state(struct r600_common_context *rctx, rctx->streamout.prims_gen_query_enabled = rctx->streamout.num_prims_gen_queries != 0; - if (old_strmout_en != r600_get_strmout_en(rctx)) - rctx->streamout.enable_atom.dirty = true; + if (old_strmout_en != r600_get_strmout_en(rctx)) { + rctx->set_atom_dirty(rctx, &rctx->streamout.enable_atom, true); + } } } @@ -365,5 +382,5 @@ void r600_streamout_init(struct r600_common_context *rctx) rctx->b.stream_output_target_destroy = r600_so_target_destroy; rctx->streamout.begin_atom.emit = r600_emit_streamout_begin; rctx->streamout.enable_atom.emit = r600_emit_streamout_enable; - rctx->streamout.enable_atom.num_dw = 3; + rctx->streamout.enable_atom.num_dw = 6; } diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c index dc510c99749..54696910e43 100644 --- a/src/gallium/drivers/radeon/r600_texture.c +++ b/src/gallium/drivers/radeon/r600_texture.c @@ -243,10 +243,11 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen, RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR, surface->level[0].mode >= RADEON_SURF_MODE_2D ? 
RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR, + surface->pipe_config, surface->bankw, surface->bankh, surface->tile_split, surface->stencil_tile_split, - surface->mtilea, + surface->mtilea, surface->num_banks, surface->level[0].pitch_bytes, (surface->flags & RADEON_SURF_SCANOUT) != 0); @@ -489,7 +490,7 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen, unsigned num_pipes = rscreen->tiling_info.num_channels; if (rscreen->chip_class <= EVERGREEN && - rscreen->info.drm_minor < 26) + rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 26) return 0; /* HW bug on R6xx. */ @@ -501,7 +502,7 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen, /* HTILE is broken with 1D tiling on old kernels and CIK. */ if (rscreen->chip_class >= CIK && rtex->surface.level[0].mode == RADEON_SURF_MODE_1D && - rscreen->info.drm_minor < 38) + rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 38) return 0; switch (num_pipes) { @@ -706,6 +707,7 @@ static unsigned r600_choose_tiling(struct r600_common_screen *rscreen, const struct pipe_resource *templ) { const struct util_format_description *desc = util_format_description(templ->format); + bool force_tiling = templ->flags & R600_RESOURCE_FLAG_FORCE_TILING; /* MSAA resources must be 2D tiled. */ if (templ->nr_samples > 1) @@ -715,10 +717,16 @@ static unsigned r600_choose_tiling(struct r600_common_screen *rscreen, if (templ->flags & R600_RESOURCE_FLAG_TRANSFER) return RADEON_SURF_MODE_LINEAR_ALIGNED; + /* r600g: force tiling on TEXTURE_2D and TEXTURE_3D compute resources. */ + if (rscreen->chip_class >= R600 && rscreen->chip_class <= CAYMAN && + (templ->bind & PIPE_BIND_COMPUTE_RESOURCE) && + (templ->target == PIPE_TEXTURE_2D || + templ->target == PIPE_TEXTURE_3D)) + force_tiling = true; + /* Handle common candidates for the linear mode. * Compressed textures must always be tiled. */ - if (!(templ->flags & R600_RESOURCE_FLAG_FORCE_TILING) && - !util_format_is_compressed(templ->format)) { + if (!force_tiling && !util_format_is_compressed(templ->format)) { /* Not everything can be linear, so we cannot enforce it * for all textures. */ if ((rscreen->debug_flags & DBG_NO_TILING) && @@ -934,7 +942,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, use_staging_texture = TRUE; } else if (!(usage & PIPE_TRANSFER_READ) && (r600_rings_is_buffer_referenced(rctx, rtex->resource.cs_buf, RADEON_USAGE_READWRITE) || - rctx->ws->buffer_is_busy(rtex->resource.buf, RADEON_USAGE_READWRITE))) { + !rctx->ws->buffer_wait(rtex->resource.buf, 0, RADEON_USAGE_READWRITE))) { /* Use a staging texture for uploads if the underlying BO is busy. 
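The buffer_is_busy winsys call is replaced throughout this series by buffer_wait with a zero timeout: a wait that returns immediately without success means the buffer is still busy, which is exactly what the staging-texture fallback needs to know. A toy model of the idiom (the real call queries kernel fence state):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for the winsys entry point: returns true once the buffer is
 * idle, false if it is still busy when the timeout expires. */
static bool example_buffer_wait(bool buffer_idle, uint64_t timeout_ns)
{
	if (buffer_idle)
		return true;
	return timeout_ns != 0; /* toy: pretend any nonzero wait succeeds */
}

int main(void)
{
	bool idle = false;

	/* wait(0) used as a busy probe, as in the hunk above. */
	if (!example_buffer_wait(idle, 0))
		printf("buffer busy: upload through a staging texture\n");
	return 0;
}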
*/ use_staging_texture = TRUE; } @@ -1059,18 +1067,9 @@ static void r600_texture_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer* transfer) { struct r600_transfer *rtransfer = (struct r600_transfer*)transfer; - struct r600_common_context *rctx = (struct r600_common_context*)ctx; - struct radeon_winsys_cs_handle *buf; struct pipe_resource *texture = transfer->resource; struct r600_texture *rtex = (struct r600_texture*)texture; - if (rtransfer->staging) { - buf = rtransfer->staging->cs_buf; - } else { - buf = r600_resource(transfer->resource)->cs_buf; - } - rctx->ws->buffer_unmap(buf); - if ((transfer->usage & PIPE_TRANSFER_WRITE) && rtransfer->staging) { if (rtex->is_depth && rtex->resource.b.b.nr_samples <= 1) { ctx->resource_copy_region(ctx, texture, transfer->level, @@ -1262,7 +1261,9 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, /* fast color clear with 1D tiling doesn't work on old kernels and CIK */ if (tex->surface.level[0].mode == RADEON_SURF_MODE_1D && - rctx->chip_class >= CIK && rctx->screen->info.drm_minor < 38) { + rctx->chip_class >= CIK && + rctx->screen->info.drm_major == 2 && + rctx->screen->info.drm_minor < 38) { continue; } @@ -1278,7 +1279,7 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, tex->cmask.offset, tex->cmask.size, 0, true); tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level; - fb_state->dirty = true; + rctx->set_atom_dirty(rctx, fb_state, true); *buffers &= ~clear_bit; } } diff --git a/src/gallium/drivers/radeon/r600d_common.h b/src/gallium/drivers/radeon/r600d_common.h index 74c8d8782a6..115042d153e 100644 --- a/src/gallium/drivers/radeon/r600d_common.h +++ b/src/gallium/drivers/radeon/r600d_common.h @@ -66,6 +66,9 @@ #define PKT3_SET_SH_REG 0x76 /* SI and later */ #define PKT3_SET_UCONFIG_REG 0x79 /* CIK and later */ +#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS1 0x1 /* EG and later */ +#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS2 0x2 /* EG and later */ +#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS3 0x3 /* EG and later */ #define EVENT_TYPE_PS_PARTIAL_FLUSH 0x10 #define EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT 0x14 #define EVENT_TYPE_ZPASS_DONE 0x15 @@ -177,7 +180,7 @@ #define S_028804_INTERPOLATE_SRC_Z(x) (((x) & 0x1) << 19) #define S_028804_STATIC_ANCHOR_ASSOCIATIONS(x) (((x) & 0x1) << 20) #define S_028804_ALPHA_TO_MASK_EQAA_DISABLE(x) (((x) & 0x1) << 21) -#define S_028804_OVERRASTERIZATION_AMOUNT(x) (((x) & 0x7) << 24) +#define S_028804_OVERRASTERIZATION_AMOUNT(x) (((x) & 0x07) << 24) #define S_028804_ENABLE_POSTZ_OVERRASTERIZATION(x) (((x) & 0x1) << 27) #define CM_R_028BDC_PA_SC_LINE_CNTL 0x28bdc #define S_028BDC_EXPAND_LINE_WIDTH(x) (((x) & 0x1) << 9) diff --git a/src/gallium/drivers/radeon/radeon_elf_util.c b/src/gallium/drivers/radeon/radeon_elf_util.c index 9b508227fd4..2e45d439e7a 100644 --- a/src/gallium/drivers/radeon/radeon_elf_util.c +++ b/src/gallium/drivers/radeon/radeon_elf_util.c @@ -103,8 +103,7 @@ static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols, } void radeon_elf_read(const char *elf_data, unsigned elf_size, - struct radeon_shader_binary *binary, - unsigned debug) + struct radeon_shader_binary *binary) { char *elf_buffer; Elf *elf; @@ -124,7 +123,6 @@ void radeon_elf_read(const char *elf_data, unsigned elf_size, elf = elf_memory(elf_buffer, elf_size); elf_getshdrstrndx(elf, §ion_str_index); - binary->disassembled = 0; while ((section = elf_nextscn(elf, section))) { const char *name; @@ -145,12 +143,11 @@ void radeon_elf_read(const char *elf_data, unsigned 
elf_size, binary->config_size = section_data->d_size; binary->config = MALLOC(binary->config_size * sizeof(unsigned char)); memcpy(binary->config, section_data->d_buf, binary->config_size); - } else if (debug && !strcmp(name, ".AMDGPU.disasm")) { - binary->disassembled = 1; + } else if (!strcmp(name, ".AMDGPU.disasm")) { + /* Always read disassembly if it's available. */ section_data = elf_getdata(section, section_data); - fprintf(stderr, "\nShader Disassembly:\n\n"); - fprintf(stderr, "%.*s\n", (int)section_data->d_size, - (char *)section_data->d_buf); + binary->disasm_string = strndup(section_data->d_buf, + section_data->d_size); } else if (!strncmp(name, ".rodata", 7)) { section_data = elf_getdata(section, section_data); binary->rodata_size = section_data->d_size; diff --git a/src/gallium/drivers/radeon/radeon_elf_util.h b/src/gallium/drivers/radeon/radeon_elf_util.h index ab83f98ea69..ea4ab2f14b2 100644 --- a/src/gallium/drivers/radeon/radeon_elf_util.h +++ b/src/gallium/drivers/radeon/radeon_elf_util.h @@ -37,7 +37,7 @@ struct radeon_shader_reloc; * radeon_shader_binary object. */ void radeon_elf_read(const char *elf_data, unsigned elf_size, - struct radeon_shader_binary *binary, unsigned debug); + struct radeon_shader_binary *binary); /** * @returns A pointer to the start of the configuration information for diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h index 6a9557b0b73..e967ad2214e 100644 --- a/src/gallium/drivers/radeon/radeon_llvm.h +++ b/src/gallium/drivers/radeon/radeon_llvm.h @@ -58,7 +58,6 @@ struct radeon_llvm_context { unsigned type; unsigned face_gpr; unsigned two_side; - unsigned clip_vertex; unsigned inputs_count; struct r600_shader_io * r600_inputs; struct r600_shader_io * r600_outputs; @@ -72,21 +71,6 @@ struct radeon_llvm_context { /*=== Front end configuration ===*/ - /* Special Intrinsics */ - - /** Write to an output register: float store_output(float, i32) */ - const char * store_output_intr; - - /** Swizzle a vector value: <4 x float> swizzle(<4 x float>, i32) - * The swizzle is an unsigned integer that encodes a TGSI_SWIZZLE_* value - * in 2-bits. - * Swizzle{0-1} = X Channel - * Swizzle{2-3} = Y Channel - * Swizzle{4-5} = Z Channel - * Swizzle{6-7} = W Channel - */ - const char * swizzle_intr; - /* Instructions that are not described by any of the TGSI opcodes. */ /** This function is responsible for initilizing the inputs array and will be @@ -100,9 +84,6 @@ struct radeon_llvm_context { unsigned index, const struct tgsi_full_declaration *decl); - /** User data to use with the callbacks */ - void * userdata; - /** This array contains the input values for the shader. Typically these * values will be in the form of a target intrinsic that will inform the * backend how to load the actual inputs to the shader. 
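Rather than printing the .AMDGPU.disasm section to stderr while parsing the ELF, the payload is now kept in binary->disasm_string so the caller decides when, and whether, to show it. ELF section data is not NUL-terminated, which is why strndup() is the right copy primitive; a standalone sketch of that step:

#define _GNU_SOURCE /* strndup() is POSIX.1-2008 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	/* Stand-in for a section payload: raw bytes, no terminator. */
	const char section_buf[] = { 's', '_', 'e', 'n', 'd', 'p', 'g', 'm' };
	size_t section_size = sizeof(section_buf);

	/* strndup() copies at most section_size bytes and appends the
	 * NUL, so the disassembly can be handled as an ordinary string. */
	char *disasm_string = strndup(section_buf, section_size);
	if (disasm_string) {
		printf("%s\n", disasm_string);
		free(disasm_string);
	}
	return 0;
}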
@@ -146,6 +127,8 @@ static inline LLVMTypeRef tgsi2llvmtype( case TGSI_TYPE_UNSIGNED: case TGSI_TYPE_SIGNED: return LLVMInt32TypeInContext(ctx); + case TGSI_TYPE_DOUBLE: + return LLVMDoubleTypeInContext(ctx); case TGSI_TYPE_UNTYPED: case TGSI_TYPE_FLOAT: return LLVMFloatTypeInContext(ctx); @@ -171,8 +154,9 @@ static inline LLVMValueRef bitcast( void radeon_llvm_emit_prepare_cube_coords(struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data, - LLVMValueRef *coords_arg); + struct lp_build_emit_data * emit_data, + LLVMValueRef *coords_arg, + LLVMValueRef *derivs_arg); void radeon_llvm_context_init(struct radeon_llvm_context * ctx); @@ -191,20 +175,29 @@ unsigned radeon_llvm_reg_index_soa(unsigned index, unsigned chan); void radeon_llvm_finalize_module(struct radeon_llvm_context * ctx); -LLVMValueRef -build_intrinsic(LLVMBuilderRef builder, - const char *name, - LLVMTypeRef ret_type, - LLVMValueRef *args, - unsigned num_args, - LLVMAttribute attr); - void build_tgsi_intrinsic_nomem( const struct lp_build_tgsi_action * action, struct lp_build_tgsi_context * bld_base, struct lp_build_emit_data * emit_data); - +LLVMValueRef +radeon_llvm_emit_fetch_double(struct lp_build_tgsi_context *bld_base, + LLVMValueRef ptr, + LLVMValueRef ptr2); + +LLVMValueRef radeon_llvm_saturate(struct lp_build_tgsi_context *bld_base, + LLVMValueRef value); + +LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base, + const struct tgsi_full_src_register *reg, + enum tgsi_opcode_type type, + unsigned swizzle); + +void radeon_llvm_emit_store( + struct lp_build_tgsi_context * bld_base, + const struct tgsi_full_instruction * inst, + const struct tgsi_opcode_info * info, + LLVMValueRef dst[4]); #endif /* RADEON_LLVM_H */ diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c index 25580b6bd4c..00025590137 100644 --- a/src/gallium/drivers/radeon/radeon_llvm_emit.c +++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c @@ -62,6 +62,8 @@ void radeon_llvm_shader_type(LLVMValueRef F, unsigned type) switch (type) { case TGSI_PROCESSOR_VERTEX: + case TGSI_PROCESSOR_TESS_CTRL: + case TGSI_PROCESSOR_TESS_EVAL: llvm_type = RADEON_LLVM_SHADER_VS; break; case TGSI_PROCESSOR_GEOMETRY: @@ -142,7 +144,8 @@ static void radeonDiagnosticHandler(LLVMDiagnosticInfoRef di, void *context) * @returns 0 for success, 1 for failure */ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary, - const char *gpu_family, unsigned dump, LLVMTargetMachineRef tm) + const char *gpu_family, bool dump_ir, bool dump_asm, + LLVMTargetMachineRef tm) { char cpu[CPU_STRING_LEN]; @@ -165,17 +168,15 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar } strncpy(cpu, gpu_family, CPU_STRING_LEN); memset(fs, 0, sizeof(fs)); - if (dump) { + if (dump_asm) strncpy(fs, "+DumpCode", FS_STRING_LEN); - } tm = LLVMCreateTargetMachine(target, triple, cpu, fs, LLVMCodeGenLevelDefault, LLVMRelocDefault, LLVMCodeModelDefault); dispose_tm = true; } - if (dump) { + if (dump_ir) LLVMDumpModule(M); - } /* Setup Diagnostic Handler*/ llvm_ctx = LLVMGetModuleContext(M); @@ -204,7 +205,7 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar buffer_size = LLVMGetBufferSize(out_buffer); buffer_data = LLVMGetBufferStart(out_buffer); - radeon_elf_read(buffer_data, buffer_size, binary, dump); + radeon_elf_read(buffer_data, buffer_size, binary); /* Clean up */ LLVMDisposeMemoryBuffer(out_buffer); diff --git 
a/src/gallium/drivers/radeon/radeon_llvm_emit.h b/src/gallium/drivers/radeon/radeon_llvm_emit.h index 3ccef78e36d..e20aed94c6b 100644 --- a/src/gallium/drivers/radeon/radeon_llvm_emit.h +++ b/src/gallium/drivers/radeon/radeon_llvm_emit.h @@ -29,6 +29,7 @@ #include <llvm-c/Core.h> #include <llvm-c/TargetMachine.h> +#include <stdbool.h> struct radeon_shader_binary; @@ -36,11 +37,8 @@ void radeon_llvm_shader_type(LLVMValueRef F, unsigned type); LLVMTargetRef radeon_llvm_get_r600_target(const char *triple); -unsigned radeon_llvm_compile( - LLVMModuleRef M, - struct radeon_shader_binary *binary, - const char * gpu_family, - unsigned dump, - LLVMTargetMachineRef tm); +unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary, + const char *gpu_family, bool dump_ir, bool dump_asm, + LLVMTargetMachineRef tm); #endif /* RADEON_LLVM_EMIT_H */ diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c index c8c980d9d32..56694700a47 100644 --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c @@ -109,12 +109,27 @@ emit_array_index( return LLVMBuildAdd(gallivm->builder, addr, lp_build_const_int32(gallivm, offset), ""); } -static LLVMValueRef -emit_fetch( +LLVMValueRef +radeon_llvm_emit_fetch_double( struct lp_build_tgsi_context *bld_base, - const struct tgsi_full_src_register *reg, - enum tgsi_opcode_type type, - unsigned swizzle); + LLVMValueRef ptr, + LLVMValueRef ptr2) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMValueRef result; + + result = LLVMGetUndef(LLVMVectorType(LLVMIntTypeInContext(bld_base->base.gallivm->context, 32), bld_base->base.type.length * 2)); + + result = LLVMBuildInsertElement(builder, + result, + bitcast(bld_base, TGSI_TYPE_UNSIGNED, ptr), + bld_base->int_bld.zero, ""); + result = LLVMBuildInsertElement(builder, + result, + bitcast(bld_base, TGSI_TYPE_UNSIGNED, ptr2), + bld_base->int_bld.one, ""); + return bitcast(bld_base, TGSI_TYPE_DOUBLE, result); +} static LLVMValueRef emit_array_fetch( @@ -136,7 +151,7 @@ emit_array_fetch( for (i = 0; i < size; ++i) { tmp_reg.Register.Index = i + range.First; - LLVMValueRef temp = emit_fetch(bld_base, &tmp_reg, type, swizzle); + LLVMValueRef temp = radeon_llvm_emit_fetch(bld_base, &tmp_reg, type, swizzle); result = LLVMBuildInsertElement(builder, result, temp, lp_build_const_int32(gallivm, i), ""); } @@ -150,23 +165,21 @@ static bool uses_temp_indirect_addressing( return (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)); } -static LLVMValueRef -emit_fetch( - struct lp_build_tgsi_context *bld_base, - const struct tgsi_full_src_register *reg, - enum tgsi_opcode_type type, - unsigned swizzle) +LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base, + const struct tgsi_full_src_register *reg, + enum tgsi_opcode_type type, + unsigned swizzle) { struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); LLVMBuilderRef builder = bld_base->base.gallivm->builder; - LLVMValueRef result = NULL, ptr; + LLVMValueRef result = NULL, ptr, ptr2; if (swizzle == ~0) { LLVMValueRef values[TGSI_NUM_CHANNELS]; unsigned chan; for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { - values[chan] = emit_fetch(bld_base, reg, type, chan); + values[chan] = radeon_llvm_emit_fetch(bld_base, reg, type, chan); } return lp_build_gather_values(bld_base->base.gallivm, values, TGSI_NUM_CHANNELS); @@ -184,11 +197,27 @@ 
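radeon_llvm_emit_fetch_double below reassembles a 64-bit double from two 32-bit register channels by inserting them into an i32 vector and bitcasting the pair. The same reinterpretation written on the CPU for illustration, assuming the low channel occupies the low 32 bits (matching the element order used in the IR):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static double example_pack_double(uint32_t lo, uint32_t hi)
{
	uint64_t bits = ((uint64_t)hi << 32) | lo;
	double d;

	memcpy(&d, &bits, sizeof(d)); /* a bitcast, not a value conversion */
	return d;
}

int main(void)
{
	/* 0x3FF0000000000000 is 1.0 in IEEE-754 binary64. */
	printf("%f\n", example_pack_double(0x00000000u, 0x3FF00000u));
	return 0;
}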
emit_fetch( switch(reg->Register.File) { case TGSI_FILE_IMMEDIATE: { LLVMTypeRef ctype = tgsi2llvmtype(bld_base, type); - return LLVMConstBitCast(bld->immediates[reg->Register.Index][swizzle], ctype); + if (type == TGSI_TYPE_DOUBLE) { + result = LLVMGetUndef(LLVMVectorType(LLVMIntTypeInContext(bld_base->base.gallivm->context, 32), bld_base->base.type.length * 2)); + result = LLVMConstInsertElement(result, + bld->immediates[reg->Register.Index][swizzle], + bld_base->int_bld.zero); + result = LLVMConstInsertElement(result, + bld->immediates[reg->Register.Index][swizzle + 1], + bld_base->int_bld.one); + return LLVMConstBitCast(result, ctype); + } else { + return LLVMConstBitCast(bld->immediates[reg->Register.Index][swizzle], ctype); + } } case TGSI_FILE_INPUT: result = ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle)]; + if (type == TGSI_TYPE_DOUBLE) { + ptr = result; + ptr2 = ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle + 1)]; + return radeon_llvm_emit_fetch_double(bld_base, ptr, ptr2); + } break; case TGSI_FILE_TEMPORARY: @@ -199,11 +228,23 @@ emit_fetch( break; } ptr = ctx->temps[reg->Register.Index * TGSI_NUM_CHANNELS + swizzle]; + if (type == TGSI_TYPE_DOUBLE) { + ptr2 = ctx->temps[reg->Register.Index * TGSI_NUM_CHANNELS + swizzle + 1]; + return radeon_llvm_emit_fetch_double(bld_base, + LLVMBuildLoad(builder, ptr, ""), + LLVMBuildLoad(builder, ptr2, "")); + } result = LLVMBuildLoad(builder, ptr, ""); break; case TGSI_FILE_OUTPUT: ptr = lp_get_output_ptr(bld, reg->Register.Index, swizzle); + if (type == TGSI_TYPE_DOUBLE) { + ptr2 = lp_get_output_ptr(bld, reg->Register.Index, swizzle + 1); + return radeon_llvm_emit_fetch_double(bld_base, + LLVMBuildLoad(builder, ptr, ""), + LLVMBuildLoad(builder, ptr2, "")); + } result = LLVMBuildLoad(builder, ptr, ""); break; @@ -321,8 +362,8 @@ static void emit_declaration( } } -static LLVMValueRef radeon_llvm_saturate(struct lp_build_tgsi_context *bld_base, - LLVMValueRef value) +LLVMValueRef radeon_llvm_saturate(struct lp_build_tgsi_context *bld_base, + LLVMValueRef value) { struct lp_build_emit_data clamp_emit_data; @@ -336,8 +377,7 @@ static LLVMValueRef radeon_llvm_saturate(struct lp_build_tgsi_context *bld_base, &clamp_emit_data); } -static void -emit_store( +void radeon_llvm_emit_store( struct lp_build_tgsi_context * bld_base, const struct tgsi_full_instruction * inst, const struct tgsi_opcode_info * info, @@ -348,9 +388,10 @@ emit_store( struct gallivm_state *gallivm = bld->bld_base.base.gallivm; const struct tgsi_full_dst_register *reg = &inst->Dst[0]; LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; - LLVMValueRef temp_ptr; + LLVMValueRef temp_ptr, temp_ptr2 = NULL; unsigned chan, chan_index; boolean is_vec_store = FALSE; + enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(inst->Instruction.Opcode); if (dst[0]) { LLVMTypeKind k = LLVMGetTypeKind(LLVMTypeOf(dst[0])); @@ -371,6 +412,8 @@ emit_store( TGSI_FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) { LLVMValueRef value = dst[chan_index]; + if (dtype == TGSI_TYPE_DOUBLE && (chan_index == 1 || chan_index == 3)) + continue; if (inst->Instruction.Saturate) value = radeon_llvm_saturate(bld_base, value); @@ -379,8 +422,9 @@ emit_store( LLVMBuildStore(builder, value, temp_ptr); continue; } - - value = bitcast(bld_base, TGSI_TYPE_FLOAT, value); + + if (dtype != TGSI_TYPE_DOUBLE) + value = bitcast(bld_base, TGSI_TYPE_FLOAT, value); if (reg->Register.Indirect) { struct tgsi_declaration_range range = get_array_range(bld_base, @@ -418,6 +462,8 
@@ emit_store( switch(reg->Register.File) { case TGSI_FILE_OUTPUT: temp_ptr = bld->outputs[reg->Register.Index][chan_index]; + if (dtype == TGSI_TYPE_DOUBLE) + temp_ptr2 = bld->outputs[reg->Register.Index][chan_index + 1]; break; case TGSI_FILE_TEMPORARY: @@ -428,12 +474,28 @@ emit_store( break; } temp_ptr = ctx->temps[ TGSI_NUM_CHANNELS * reg->Register.Index + chan_index]; + if (dtype == TGSI_TYPE_DOUBLE) + temp_ptr2 = ctx->temps[ TGSI_NUM_CHANNELS * reg->Register.Index + chan_index + 1]; + break; default: return; } - LLVMBuildStore(builder, value, temp_ptr); + if (dtype != TGSI_TYPE_DOUBLE) + LLVMBuildStore(builder, value, temp_ptr); + else { + LLVMValueRef ptr = LLVMBuildBitCast(builder, value, + LLVMVectorType(LLVMIntTypeInContext(bld_base->base.gallivm->context, 32), 2), ""); + LLVMValueRef val2; + value = LLVMBuildExtractElement(builder, ptr, + bld_base->uint_bld.zero, ""); + val2 = LLVMBuildExtractElement(builder, ptr, + bld_base->uint_bld.one, ""); + + LLVMBuildStore(builder, bitcast(bld_base, TGSI_TYPE_FLOAT, value), temp_ptr); + LLVMBuildStore(builder, bitcast(bld_base, TGSI_TYPE_FLOAT, val2), temp_ptr2); + } } } } @@ -686,34 +748,26 @@ static void kil_emit( } } -void radeon_llvm_emit_prepare_cube_coords( - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data, - LLVMValueRef *coords_arg) +static void radeon_llvm_cube_to_2d_coords(struct lp_build_tgsi_context *bld_base, + LLVMValueRef *in, LLVMValueRef *out) { - - unsigned target = emit_data->inst->Texture.Texture; - unsigned opcode = emit_data->inst->Instruction.Opcode; struct gallivm_state * gallivm = bld_base->base.gallivm; LLVMBuilderRef builder = gallivm->builder; LLVMTypeRef type = bld_base->base.elem_type; LLVMValueRef coords[4]; LLVMValueRef mad_args[3]; - LLVMValueRef idx; - struct LLVMOpaqueValue *cube_vec; - LLVMValueRef v; + LLVMValueRef v, cube_vec; unsigned i; - cube_vec = lp_build_gather_values(bld_base->base.gallivm, coords_arg, 4); - v = build_intrinsic(builder, "llvm.AMDGPU.cube", LLVMVectorType(type, 4), + cube_vec = lp_build_gather_values(bld_base->base.gallivm, in, 4); + v = lp_build_intrinsic(builder, "llvm.AMDGPU.cube", LLVMVectorType(type, 4), &cube_vec, 1, LLVMReadNoneAttribute); - for (i = 0; i < 4; ++i) { - idx = lp_build_const_int32(gallivm, i); - coords[i] = LLVMBuildExtractElement(builder, v, idx, ""); - } + for (i = 0; i < 4; ++i) + coords[i] = LLVMBuildExtractElement(builder, v, + lp_build_const_int32(gallivm, i), ""); - coords[2] = build_intrinsic(builder, "fabs", + coords[2] = lp_build_intrinsic(builder, "llvm.fabs.f32", type, &coords[2], 1, LLVMReadNoneAttribute); coords[2] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_RCP, coords[2]); @@ -729,10 +783,60 @@ void radeon_llvm_emit_prepare_cube_coords( mad_args[0], mad_args[1], mad_args[2]); /* apply xyz = yxw swizzle to coords */ - coords[2] = coords[3]; - coords[3] = coords[1]; - coords[1] = coords[0]; - coords[0] = coords[3]; + out[0] = coords[1]; + out[1] = coords[0]; + out[2] = coords[3]; +} + +void radeon_llvm_emit_prepare_cube_coords( + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data, + LLVMValueRef *coords_arg, + LLVMValueRef *derivs_arg) +{ + + unsigned target = emit_data->inst->Texture.Texture; + unsigned opcode = emit_data->inst->Instruction.Opcode; + struct gallivm_state * gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef coords[4]; + unsigned i; + + radeon_llvm_cube_to_2d_coords(bld_base, coords_arg, coords); + + if (opcode ==
TGSI_OPCODE_TXD && derivs_arg) { + LLVMValueRef derivs[4]; + int axis; + + /* Convert cube derivatives to 2D derivatives. */ + for (axis = 0; axis < 2; axis++) { + LLVMValueRef shifted_cube_coords[4], shifted_coords[4]; + + /* Shift the cube coordinates by the derivatives to get + * the cube coordinates of the "neighboring pixel". + */ + for (i = 0; i < 3; i++) + shifted_cube_coords[i] = + LLVMBuildFAdd(builder, coords_arg[i], + derivs_arg[axis*3+i], ""); + shifted_cube_coords[3] = LLVMGetUndef(bld_base->base.elem_type); + + /* Project the shifted cube coordinates onto the face. */ + radeon_llvm_cube_to_2d_coords(bld_base, shifted_cube_coords, + shifted_coords); + + /* Subtract both sets of 2D coordinates to get 2D derivatives. + * This won't work if the shifted coordinates ended up + * in a different face. + */ + for (i = 0; i < 2; i++) + derivs[axis * 2 + i] = + LLVMBuildFSub(builder, shifted_coords[i], + coords[i], ""); + } + + memcpy(derivs_arg, derivs, sizeof(derivs)); + } if (target == TGSI_TEXTURE_CUBE_ARRAY || target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { @@ -756,140 +860,6 @@ void radeon_llvm_emit_prepare_cube_coords( memcpy(coords_arg, coords, sizeof(coords)); } -static void txd_fetch_args( - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - const struct tgsi_full_instruction * inst = emit_data->inst; - - LLVMValueRef coords[4]; - unsigned chan, src; - for (src = 0; src < 3; src++) { - for (chan = 0; chan < 4; chan++) - coords[chan] = lp_build_emit_fetch(bld_base, inst, src, chan); - - emit_data->args[src] = lp_build_gather_values(bld_base->base.gallivm, - coords, 4); - } - emit_data->arg_count = 3; - emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); -} - - -static void txp_fetch_args( - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - const struct tgsi_full_instruction * inst = emit_data->inst; - LLVMValueRef src_w; - unsigned chan; - LLVMValueRef coords[5]; - - emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); - src_w = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W); - - for (chan = 0; chan < 3; chan++ ) { - LLVMValueRef arg = lp_build_emit_fetch(bld_base, - emit_data->inst, 0, chan); - coords[chan] = lp_build_emit_llvm_binary(bld_base, - TGSI_OPCODE_DIV, arg, src_w); - } - coords[3] = bld_base->base.one; - - if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE || - inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || - inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || - inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) && - inst->Instruction.Opcode != TGSI_OPCODE_TXQ && - inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) { - radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords); - } - - emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm, - coords, 4); - emit_data->arg_count = 1; -} - -static void tex_fetch_args( - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - /* XXX: lp_build_swizzle_aos() was failing with wrong arg types, - * when we used CHAN_ALL. 
We should be able to get this to work, - * but for now we will swizzle it ourselves - emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst, - 0, CHAN_ALL); - - */ - - const struct tgsi_full_instruction * inst = emit_data->inst; - - LLVMValueRef coords[5]; - unsigned chan; - for (chan = 0; chan < 4; chan++) { - coords[chan] = lp_build_emit_fetch(bld_base, inst, 0, chan); - } - - if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 || - inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || - inst->Instruction.Opcode == TGSI_OPCODE_TXL2) { - /* These instructions have additional operand that should be packed - * into the cube coord vector by radeon_llvm_emit_prepare_cube_coords. - * That operand should be passed as a float value in the args array - * right after the coord vector. After packing it's not used anymore, - * that's why arg_count is not increased */ - coords[4] = lp_build_emit_fetch(bld_base, inst, 1, 0); - } - - if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE || - inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || - inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || - inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) && - inst->Instruction.Opcode != TGSI_OPCODE_TXQ && - inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) { - radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords); - } - - emit_data->arg_count = 1; - emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm, - coords, 4); - emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); -} - -static void txf_fetch_args( - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - const struct tgsi_full_instruction * inst = emit_data->inst; - struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); - const struct tgsi_texture_offset * off = inst->TexOffsets; - LLVMTypeRef offset_type = bld_base->int_bld.elem_type; - - /* fetch tex coords */ - tex_fetch_args(bld_base, emit_data); - - /* fetch tex offsets */ - if (inst->Texture.NumOffsets) { - assert(inst->Texture.NumOffsets == 1); - - emit_data->args[1] = LLVMConstBitCast( - bld->immediates[off->Index][off->SwizzleX], - offset_type); - emit_data->args[2] = LLVMConstBitCast( - bld->immediates[off->Index][off->SwizzleY], - offset_type); - emit_data->args[3] = LLVMConstBitCast( - bld->immediates[off->Index][off->SwizzleZ], - offset_type); - } else { - emit_data->args[1] = bld_base->int_bld.zero; - emit_data->args[2] = bld_base->int_bld.zero; - emit_data->args[3] = bld_base->int_bld.zero; - } - - emit_data->arg_count = 4; -} - static void emit_icmp( const struct lp_build_tgsi_action * action, struct lp_build_tgsi_context * bld_base, @@ -996,6 +966,35 @@ static void emit_fcmp( emit_data->output[emit_data->chan] = v; } +static void emit_dcmp( + const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMContextRef context = bld_base->base.gallivm->context; + LLVMRealPredicate pred; + + /* Use ordered for everything but NE (which is usual for + * float comparisons) + */ + switch (emit_data->inst->Instruction.Opcode) { + case TGSI_OPCODE_DSEQ: pred = LLVMRealOEQ; break; + case TGSI_OPCODE_DSGE: pred = LLVMRealOGE; break; + case TGSI_OPCODE_DSLT: pred = LLVMRealOLT; break; + case TGSI_OPCODE_DSNE: pred = LLVMRealUNE; break; + default: assert(!"unknown instruction"); pred = 0; break; + } + + LLVMValueRef v = LLVMBuildFCmp(builder, pred, + emit_data->args[0], 
emit_data->args[1],""); + + v = LLVMBuildSExtOrBitCast(builder, v, + LLVMInt32TypeInContext(context), ""); + + emit_data->output[emit_data->chan] = v; +} + static void emit_not( const struct lp_build_tgsi_action * action, struct lp_build_tgsi_context * bld_base, @@ -1161,6 +1160,40 @@ static void emit_ineg( emit_data->args[0], ""); } +static void emit_dneg( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + emit_data->output[emit_data->chan] = LLVMBuildFNeg(builder, + emit_data->args[0], ""); +} + +static void emit_frac( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + char *intr; + + if (emit_data->info->opcode == TGSI_OPCODE_FRC) + intr = "llvm.floor.f32"; + else if (emit_data->info->opcode == TGSI_OPCODE_DFRAC) + intr = "llvm.floor.f64"; + else { + assert(0); + return; + } + + LLVMValueRef floor = lp_build_intrinsic(builder, intr, emit_data->dst_type, + &emit_data->args[0], 1, + LLVMReadNoneAttribute); + emit_data->output[emit_data->chan] = LLVMBuildFSub(builder, + emit_data->args[0], floor, ""); +} + static void emit_f2i( const struct lp_build_tgsi_action * action, struct lp_build_tgsi_context * bld_base, @@ -1215,58 +1248,16 @@ static void emit_immediate(struct lp_build_tgsi_context * bld_base, ctx->soa.num_immediates++; } -LLVMValueRef -build_intrinsic(LLVMBuilderRef builder, - const char *name, - LLVMTypeRef ret_type, - LLVMValueRef *args, - unsigned num_args, - LLVMAttribute attr) -{ - LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder))); - LLVMValueRef function; - - function = LLVMGetNamedFunction(module, name); - if(!function) { - LLVMTypeRef arg_types[LP_MAX_FUNC_ARGS]; - unsigned i; - - assert(num_args <= LP_MAX_FUNC_ARGS); - - for(i = 0; i < num_args; ++i) { - assert(args[i]); - arg_types[i] = LLVMTypeOf(args[i]); - } - - function = lp_declare_intrinsic(module, name, ret_type, arg_types, num_args); - - if (attr) - LLVMAddFunctionAttr(function, attr); - } - - return LLVMBuildCall(builder, function, args, num_args, ""); -} - -static void build_tgsi_intrinsic( - const struct lp_build_tgsi_action * action, - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data, - LLVMAttribute attr) -{ - struct lp_build_context * base = &bld_base->base; - emit_data->output[emit_data->chan] = build_intrinsic( - base->gallivm->builder, action->intr_name, - emit_data->dst_type, emit_data->args, - emit_data->arg_count, attr); -} - void -build_tgsi_intrinsic_nomem( - const struct lp_build_tgsi_action * action, - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) +build_tgsi_intrinsic_nomem(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) { - build_tgsi_intrinsic(action, bld_base, emit_data, LLVMReadNoneAttribute); + struct lp_build_context * base = &bld_base->base; + emit_data->output[emit_data->chan] = + lp_build_intrinsic(base->gallivm->builder, action->intr_name, + emit_data->dst_type, emit_data->args, + emit_data->arg_count, LLVMReadNoneAttribute); } static void emit_bfi(const struct lp_build_tgsi_action * action, @@ -1322,7 +1313,7 @@ static void emit_lsb(const struct lp_build_tgsi_action * action, }; emit_data->output[emit_data->chan] = 
- build_intrinsic(gallivm->builder, "llvm.cttz.i32", + lp_build_intrinsic(gallivm->builder, "llvm.cttz.i32", emit_data->dst_type, args, Elements(args), LLVMReadNoneAttribute); } @@ -1341,7 +1332,7 @@ static void emit_umsb(const struct lp_build_tgsi_action * action, }; LLVMValueRef msb = - build_intrinsic(builder, "llvm.ctlz.i32", + lp_build_intrinsic(builder, "llvm.ctlz.i32", emit_data->dst_type, args, Elements(args), LLVMReadNoneAttribute); @@ -1368,7 +1359,7 @@ static void emit_imsb(const struct lp_build_tgsi_action * action, LLVMValueRef arg = emit_data->args[0]; LLVMValueRef msb = - build_intrinsic(builder, "llvm.AMDGPU.flbit.i32", + lp_build_intrinsic(builder, "llvm.AMDGPU.flbit.i32", emit_data->dst_type, &arg, 1, LLVMReadNoneAttribute); @@ -1407,12 +1398,8 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) ctx->gallivm.context); ctx->gallivm.builder = LLVMCreateBuilderInContext(ctx->gallivm.context); - ctx->store_output_intr = "llvm.AMDGPU.store.output."; - ctx->swizzle_intr = "llvm.AMDGPU.swizzle"; struct lp_build_tgsi_context * bld_base = &ctx->soa.bld_base; - /* XXX: We need to revisit this.I think the correct way to do this is - * to use length = 4 here and use the elem_bld for everything. */ type.floating = TRUE; type.fixed = FALSE; type.sign = TRUE; @@ -1423,28 +1410,32 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) lp_build_context_init(&bld_base->base, &ctx->gallivm, type); lp_build_context_init(&ctx->soa.bld_base.uint_bld, &ctx->gallivm, lp_uint_type(type)); lp_build_context_init(&ctx->soa.bld_base.int_bld, &ctx->gallivm, lp_int_type(type)); + { + struct lp_type dbl_type; + dbl_type = type; + dbl_type.width *= 2; + lp_build_context_init(&ctx->soa.bld_base.dbl_bld, &ctx->gallivm, dbl_type); + } bld_base->soa = 1; - bld_base->emit_store = emit_store; + bld_base->emit_store = radeon_llvm_emit_store; bld_base->emit_swizzle = emit_swizzle; bld_base->emit_declaration = emit_declaration; bld_base->emit_immediate = emit_immediate; - bld_base->emit_fetch_funcs[TGSI_FILE_IMMEDIATE] = emit_fetch; - bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = emit_fetch; - bld_base->emit_fetch_funcs[TGSI_FILE_TEMPORARY] = emit_fetch; - bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = emit_fetch; + bld_base->emit_fetch_funcs[TGSI_FILE_IMMEDIATE] = radeon_llvm_emit_fetch; + bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = radeon_llvm_emit_fetch; + bld_base->emit_fetch_funcs[TGSI_FILE_TEMPORARY] = radeon_llvm_emit_fetch; + bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = radeon_llvm_emit_fetch; bld_base->emit_fetch_funcs[TGSI_FILE_SYSTEM_VALUE] = fetch_system_value; /* Allocate outputs */ ctx->soa.outputs = ctx->outputs; - /* XXX: Is there a better way to initialize all this ? 
*/ - lp_set_default_actions(bld_base); bld_base->op_actions[TGSI_OPCODE_ABS].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_ABS].intr_name = "fabs"; + bld_base->op_actions[TGSI_OPCODE_ABS].intr_name = "llvm.fabs.f32"; bld_base->op_actions[TGSI_OPCODE_AND].emit = emit_and; bld_base->op_actions[TGSI_OPCODE_ARL].emit = emit_arl; bld_base->op_actions[TGSI_OPCODE_BFI].emit = emit_bfi; @@ -1453,7 +1444,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_BREV].intr_name = "llvm.AMDGPU.brev"; bld_base->op_actions[TGSI_OPCODE_BRK].emit = brk_emit; bld_base->op_actions[TGSI_OPCODE_CEIL].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_CEIL].intr_name = "ceil"; + bld_base->op_actions[TGSI_OPCODE_CEIL].intr_name = "llvm.ceil.f32"; bld_base->op_actions[TGSI_OPCODE_CLAMP].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_CLAMP].intr_name = "llvm.AMDIL.clamp."; bld_base->op_actions[TGSI_OPCODE_CMP].emit = build_tgsi_intrinsic_nomem; @@ -1461,21 +1452,30 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_CONT].emit = cont_emit; bld_base->op_actions[TGSI_OPCODE_COS].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_COS].intr_name = "llvm.cos.f32"; - bld_base->op_actions[TGSI_OPCODE_DDX].intr_name = "llvm.AMDGPU.ddx"; - bld_base->op_actions[TGSI_OPCODE_DDX].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_DDY].intr_name = "llvm.AMDGPU.ddy"; - bld_base->op_actions[TGSI_OPCODE_DDY].fetch_args = tex_fetch_args; + bld_base->op_actions[TGSI_OPCODE_DABS].emit = build_tgsi_intrinsic_nomem; + bld_base->op_actions[TGSI_OPCODE_DABS].intr_name = "llvm.fabs.f64"; + bld_base->op_actions[TGSI_OPCODE_DFMA].emit = build_tgsi_intrinsic_nomem; + bld_base->op_actions[TGSI_OPCODE_DFMA].intr_name = "llvm.fma.f64"; + bld_base->op_actions[TGSI_OPCODE_DFRAC].emit = emit_frac; + bld_base->op_actions[TGSI_OPCODE_DNEG].emit = emit_dneg; + bld_base->op_actions[TGSI_OPCODE_DSEQ].emit = emit_dcmp; + bld_base->op_actions[TGSI_OPCODE_DSGE].emit = emit_dcmp; + bld_base->op_actions[TGSI_OPCODE_DSLT].emit = emit_dcmp; + bld_base->op_actions[TGSI_OPCODE_DSNE].emit = emit_dcmp; + bld_base->op_actions[TGSI_OPCODE_DRSQ].emit = build_tgsi_intrinsic_nomem; + bld_base->op_actions[TGSI_OPCODE_DRSQ].intr_name = "llvm.AMDGPU.rsq.f64"; + bld_base->op_actions[TGSI_OPCODE_DSQRT].emit = build_tgsi_intrinsic_nomem; + bld_base->op_actions[TGSI_OPCODE_DSQRT].intr_name = "llvm.sqrt.f64"; bld_base->op_actions[TGSI_OPCODE_ELSE].emit = else_emit; bld_base->op_actions[TGSI_OPCODE_ENDIF].emit = endif_emit; bld_base->op_actions[TGSI_OPCODE_ENDLOOP].emit = endloop_emit; bld_base->op_actions[TGSI_OPCODE_EX2].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_EX2].intr_name = "llvm.AMDIL.exp."; bld_base->op_actions[TGSI_OPCODE_FLR].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_FLR].intr_name = "floor"; + bld_base->op_actions[TGSI_OPCODE_FLR].intr_name = "llvm.floor.f32"; bld_base->op_actions[TGSI_OPCODE_FMA].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_FMA].intr_name = "llvm.fma.f32"; - bld_base->op_actions[TGSI_OPCODE_FRC].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_FRC].intr_name = "llvm.AMDIL.fraction."; + bld_base->op_actions[TGSI_OPCODE_FRC].emit = emit_frac; bld_base->op_actions[TGSI_OPCODE_F2I].emit = emit_f2i; bld_base->op_actions[TGSI_OPCODE_F2U].emit = emit_f2u; 
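TGSI_OPCODE_FRC and TGSI_OPCODE_DFRAC are now both routed through emit_frac, which lowers the fractional part as x - floor(x) on top of llvm.floor.f32/f64 instead of the old AMDIL fraction intrinsic. The identity it relies on, checked on the CPU (link with -lm):

#include <math.h>
#include <stdio.h>

static double example_frac(double x)
{
	return x - floor(x);
}

int main(void)
{
	printf("%f\n", example_frac(2.75));  /* 0.750000 */
	printf("%f\n", example_frac(-1.25)); /* 0.750000, floor(-1.25) = -2 */
	return 0;
}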
bld_base->op_actions[TGSI_OPCODE_FSEQ].emit = emit_fcmp; @@ -1520,6 +1520,9 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_POW].intr_name = "llvm.pow.f32"; bld_base->op_actions[TGSI_OPCODE_ROUND].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_ROUND].intr_name = "llvm.AMDIL.round.nearest."; + bld_base->op_actions[TGSI_OPCODE_RSQ].intr_name = + HAVE_LLVM >= 0x0305 ? "llvm.AMDGPU.rsq.clamped.f32" : "llvm.AMDGPU.rsq"; + bld_base->op_actions[TGSI_OPCODE_RSQ].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_SGE].emit = emit_cmp; bld_base->op_actions[TGSI_OPCODE_SEQ].emit = emit_cmp; bld_base->op_actions[TGSI_OPCODE_SHL].emit = emit_shl; @@ -1532,26 +1535,6 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_SQRT].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_SQRT].intr_name = "llvm.sqrt.f32"; bld_base->op_actions[TGSI_OPCODE_SSG].emit = emit_ssg; - bld_base->op_actions[TGSI_OPCODE_TEX].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TEX].intr_name = "llvm.AMDGPU.tex"; - bld_base->op_actions[TGSI_OPCODE_TEX2].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TEX2].intr_name = "llvm.AMDGPU.tex"; - bld_base->op_actions[TGSI_OPCODE_TXB].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXB].intr_name = "llvm.AMDGPU.txb"; - bld_base->op_actions[TGSI_OPCODE_TXB2].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXB2].intr_name = "llvm.AMDGPU.txb"; - bld_base->op_actions[TGSI_OPCODE_TXD].fetch_args = txd_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXD].intr_name = "llvm.AMDGPU.txd"; - bld_base->op_actions[TGSI_OPCODE_TXF].fetch_args = txf_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXF].intr_name = "llvm.AMDGPU.txf"; - bld_base->op_actions[TGSI_OPCODE_TXL].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXL].intr_name = "llvm.AMDGPU.txl"; - bld_base->op_actions[TGSI_OPCODE_TXL2].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXL2].intr_name = "llvm.AMDGPU.txl"; - bld_base->op_actions[TGSI_OPCODE_TXP].fetch_args = txp_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXP].intr_name = "llvm.AMDGPU.tex"; - bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXQ].intr_name = "llvm.AMDGPU.txq"; bld_base->op_actions[TGSI_OPCODE_TRUNC].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_TRUNC].intr_name = "llvm.AMDGPU.trunc"; bld_base->op_actions[TGSI_OPCODE_UADD].emit = emit_uadd; @@ -1571,13 +1554,6 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_U2F].emit = emit_u2f; bld_base->op_actions[TGSI_OPCODE_XOR].emit = emit_xor; bld_base->op_actions[TGSI_OPCODE_UCMP].emit = emit_ucmp; - - bld_base->rsq_action.emit = build_tgsi_intrinsic_nomem; -#if HAVE_LLVM >= 0x0305 - bld_base->rsq_action.intr_name = "llvm.AMDGPU.rsq.clamped.f32"; -#else - bld_base->rsq_action.intr_name = "llvm.AMDGPU.rsq"; -#endif } void radeon_llvm_create_func(struct radeon_llvm_context * ctx, diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c index be58d0b9ce3..16ee5410273 100644 --- a/src/gallium/drivers/radeon/radeon_uvd.c +++ b/src/gallium/drivers/radeon/radeon_uvd.c @@ -57,6 +57,7 @@ #define FB_BUFFER_OFFSET 0x1000 #define FB_BUFFER_SIZE 2048 +#define IT_SCALING_TABLE_SIZE 992 /* UVD decoder representation */ struct 
ruvd_decoder { @@ -65,6 +66,7 @@ struct ruvd_decoder { ruvd_set_dtb set_dtb; unsigned stream_handle; + unsigned stream_type; unsigned frame_number; struct pipe_screen *screen; @@ -73,15 +75,18 @@ struct ruvd_decoder { unsigned cur_buffer; - struct rvid_buffer msg_fb_buffers[NUM_BUFFERS]; + struct rvid_buffer msg_fb_it_buffers[NUM_BUFFERS]; struct ruvd_msg *msg; uint32_t *fb; + uint8_t *it; struct rvid_buffer bs_buffers[NUM_BUFFERS]; void* bs_ptr; unsigned bs_size; struct rvid_buffer dpb; + bool use_legacy; + struct rvid_buffer ctx; }; /* flush IB to the hardware */ @@ -107,19 +112,34 @@ static void send_cmd(struct ruvd_decoder *dec, unsigned cmd, reloc_idx = dec->ws->cs_add_reloc(dec->cs, cs_buf, usage, domain, RADEON_PRIO_MIN); - set_reg(dec, RUVD_GPCOM_VCPU_DATA0, off); - set_reg(dec, RUVD_GPCOM_VCPU_DATA1, reloc_idx * 4); + if (!dec->use_legacy) { + uint64_t addr; + addr = dec->ws->buffer_get_virtual_address(cs_buf); + addr = addr + off; + set_reg(dec, RUVD_GPCOM_VCPU_DATA0, addr); + set_reg(dec, RUVD_GPCOM_VCPU_DATA1, addr >> 32); + } else { + set_reg(dec, RUVD_GPCOM_VCPU_DATA0, off); + set_reg(dec, RUVD_GPCOM_VCPU_DATA1, reloc_idx * 4); + } set_reg(dec, RUVD_GPCOM_VCPU_CMD, cmd << 1); } -/* map the next available message/feedback buffer */ -static void map_msg_fb_buf(struct ruvd_decoder *dec) +/* does the codec need an IT buffer? */ +static bool have_it(struct ruvd_decoder *dec) +{ + return dec->stream_type == RUVD_CODEC_H264_PERF || + dec->stream_type == RUVD_CODEC_H265; +} + +/* map the next available message/feedback/IT scaling buffer */ +static void map_msg_fb_it_buf(struct ruvd_decoder *dec) { struct rvid_buffer* buf; uint8_t *ptr; /* grab the current message/feedback buffer */ - buf = &dec->msg_fb_buffers[dec->cur_buffer]; + buf = &dec->msg_fb_it_buffers[dec->cur_buffer]; /* and map it for CPU access */ ptr = dec->ws->buffer_map(buf->res->cs_buf, dec->cs, PIPE_TRANSFER_WRITE); @@ -127,6 +147,8 @@ static void map_msg_fb_buf(struct ruvd_decoder *dec) /* calc buffer offsets */ dec->msg = (struct ruvd_msg *)ptr; dec->fb = (uint32_t *)(ptr + FB_BUFFER_OFFSET); + if (have_it(dec)) + dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET + FB_BUFFER_SIZE); } /* unmap and send a message command to the VCPU */ @@ -139,12 +161,13 @@ static void send_msg_buf(struct ruvd_decoder *dec) return; /* grab the current message buffer */ - buf = &dec->msg_fb_buffers[dec->cur_buffer]; + buf = &dec->msg_fb_it_buffers[dec->cur_buffer]; /* unmap the buffer */ dec->ws->buffer_unmap(buf->res->cs_buf); dec->msg = NULL; dec->fb = NULL; + dec->it = NULL; /* and send it to the hardware */ send_cmd(dec, RUVD_CMD_MSG_BUFFER, buf->res->cs_buf, 0, @@ -159,11 +182,12 @@ static void next_buffer(struct ruvd_decoder *dec) } /* convert the profile into something UVD understands */ -static uint32_t profile2stream_type(enum pipe_video_profile profile) +static uint32_t profile2stream_type(struct ruvd_decoder *dec, unsigned family) { - switch (u_reduce_video_profile(profile)) { + switch (u_reduce_video_profile(dec->base.profile)) { case PIPE_VIDEO_FORMAT_MPEG4_AVC: - return RUVD_CODEC_H264; + return (family >= CHIP_TONGA) ?
+ RUVD_CODEC_H264_PERF : RUVD_CODEC_H264; case PIPE_VIDEO_FORMAT_VC1: return RUVD_CODEC_VC1; @@ -174,23 +198,46 @@ static uint32_t profile2stream_type(enum pipe_video_profile profile) case PIPE_VIDEO_FORMAT_MPEG4: return RUVD_CODEC_MPEG4; + case PIPE_VIDEO_FORMAT_HEVC: + return RUVD_CODEC_H265; + default: assert(0); return 0; } } +static unsigned calc_ctx_size(struct ruvd_decoder *dec) +{ + unsigned width_in_mb, height_in_mb, ctx_size; + + unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH); + unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT); + + unsigned max_references = dec->base.max_references + 1; + + if (dec->base.width * dec->base.height >= 4096*2000) + max_references = MAX2(max_references, 8); + else + max_references = MAX2(max_references, 17); + + width = align (width, 16); + height = align (height, 16); + ctx_size = ((width + 255) / 16)*((height + 255) / 16) * 16 * max_references + 52 * 1024; + return ctx_size; +} + /* calculate size of reference picture buffer */ -static unsigned calc_dpb_size(const struct pipe_video_codec *templ) +static unsigned calc_dpb_size(struct ruvd_decoder *dec) { unsigned width_in_mb, height_in_mb, image_size, dpb_size; // always align them to MB size for dpb calculation - unsigned width = align(templ->width, VL_MACROBLOCK_WIDTH); - unsigned height = align(templ->height, VL_MACROBLOCK_HEIGHT); + unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH); + unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT); // always one more for currently decoded picture - unsigned max_references = templ->max_references + 1; + unsigned max_references = dec->base.max_references + 1; // aligned size of a single frame image_size = width * height; @@ -201,19 +248,67 @@ static unsigned calc_dpb_size(const struct pipe_video_codec *templ) width_in_mb = width / VL_MACROBLOCK_WIDTH; height_in_mb = align(height / VL_MACROBLOCK_HEIGHT, 2); - switch (u_reduce_video_profile(templ->profile)) { - case PIPE_VIDEO_FORMAT_MPEG4_AVC: - // the firmware seems to allways assume a minimum of ref frames - max_references = MAX2(NUM_H264_REFS, max_references); - - // reference picture buffer - dpb_size = image_size * max_references; + switch (u_reduce_video_profile(dec->base.profile)) { + case PIPE_VIDEO_FORMAT_MPEG4_AVC: { + if (!dec->use_legacy) { + unsigned fs_in_mb = width_in_mb * height_in_mb; + unsigned alignment = 64, num_dpb_buffer; + + if (dec->stream_type == RUVD_CODEC_H264_PERF) + alignment = 256; + switch(dec->base.level) { + case 30: + num_dpb_buffer = 8100 / fs_in_mb; + break; + case 31: + num_dpb_buffer = 18000 / fs_in_mb; + break; + case 32: + num_dpb_buffer = 20480 / fs_in_mb; + break; + case 41: + num_dpb_buffer = 32768 / fs_in_mb; + break; + case 42: + num_dpb_buffer = 34816 / fs_in_mb; + break; + case 50: + num_dpb_buffer = 110400 / fs_in_mb; + break; + case 51: + num_dpb_buffer = 184320 / fs_in_mb; + break; + default: + num_dpb_buffer = 184320 / fs_in_mb; + break; + } + num_dpb_buffer++; + max_references = MAX2(MIN2(NUM_H264_REFS, num_dpb_buffer), max_references); + dpb_size = image_size * max_references; + dpb_size += max_references * align(width_in_mb * height_in_mb * 192, alignment); + dpb_size += align(width_in_mb * height_in_mb * 32, alignment); + } else { + // the firmware seems to always assume a minimum of ref frames + max_references = MAX2(NUM_H264_REFS, max_references); + // reference picture buffer + dpb_size = image_size * max_references; + // macroblock context buffer + dpb_size += width_in_mb * height_in_mb *
max_references * 192; + // IT surface buffer + dpb_size += width_in_mb * height_in_mb * 32; + } + break; + } - // macroblock context buffer - dpb_size += width_in_mb * height_in_mb * max_references * 192; + case PIPE_VIDEO_FORMAT_HEVC: + if (dec->base.width * dec->base.height >= 4096*2000) + max_references = MAX2(max_references, 8); + else + max_references = MAX2(max_references, 17); - // IT surface buffer - dpb_size += width_in_mb * height_in_mb * 32; + width = align (width, 16); + height = align (height, 16); + dpb_size = align((width * height * 3) / 2, 256) * max_references; break; case PIPE_VIDEO_FORMAT_VC1: @@ -250,6 +345,8 @@ static unsigned calc_dpb_size(const struct pipe_video_codec *templ) // IT surface buffer dpb_size += align(width_in_mb * height_in_mb * 32, 64); + + dpb_size = MAX2(dpb_size, 30 * 1024 * 1024); break; default: @@ -263,6 +360,12 @@ static unsigned calc_dpb_size(const struct pipe_video_codec *templ) return dpb_size; } +/* free associated data in the video buffer callback */ +static void ruvd_destroy_associated_data(void *data) +{ + /* NOOP, since we only use an intptr */ +} + /* get h264 specific message bits */ static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_picture_desc *pic) { @@ -286,10 +389,8 @@ static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_ assert(0); break; } - if (((dec->base.width * dec->base.height) >> 8) <= 1620) - result.level = 30; - else - result.level = 41; + + result.level = dec->base.level; result.sps_info_flags = 0; result.sps_info_flags |= pic->pps->sps->direct_8x8_inference_flag << 0; @@ -338,6 +439,11 @@ static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_ memcpy(result.scaling_list_4x4, pic->pps->ScalingList4x4, 6*16); memcpy(result.scaling_list_8x8, pic->pps->ScalingList8x8, 2*64); + if (dec->stream_type == RUVD_CODEC_H264_PERF) { + memcpy(dec->it, result.scaling_list_4x4, 6*16); + memcpy((dec->it + 96), result.scaling_list_8x8, 2*64); + } + result.num_ref_frames = pic->num_ref_frames; result.num_ref_idx_l0_active_minus1 = pic->num_ref_idx_l0_active_minus1; @@ -354,6 +460,151 @@ static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_ return result; } +/* get h265 specific message bits */ +static struct ruvd_h265 get_h265_msg(struct ruvd_decoder *dec, struct pipe_video_buffer *target, + struct pipe_h265_picture_desc *pic) +{ + struct ruvd_h265 result; + unsigned i; + + memset(&result, 0, sizeof(result)); + + result.sps_info_flags = 0; + result.sps_info_flags |= pic->pps->sps->scaling_list_enabled_flag << 0; + result.sps_info_flags |= pic->pps->sps->amp_enabled_flag << 1; + result.sps_info_flags |= pic->pps->sps->sample_adaptive_offset_enabled_flag << 2; + result.sps_info_flags |= pic->pps->sps->pcm_enabled_flag << 3; + result.sps_info_flags |= pic->pps->sps->pcm_loop_filter_disabled_flag << 4; + result.sps_info_flags |= pic->pps->sps->long_term_ref_pics_present_flag << 5; + result.sps_info_flags |= pic->pps->sps->sps_temporal_mvp_enabled_flag << 6; + result.sps_info_flags |= pic->pps->sps->strong_intra_smoothing_enabled_flag << 7; + result.sps_info_flags |= pic->pps->sps->separate_colour_plane_flag << 8; + if (((struct r600_common_screen*)dec->screen)->family == CHIP_CARRIZO) + result.sps_info_flags |= 1 << 9; + + result.chroma_format = pic->pps->sps->chroma_format_idc; + result.bit_depth_luma_minus8 = pic->pps->sps->bit_depth_luma_minus8; + result.bit_depth_chroma_minus8 = pic->pps->sps->bit_depth_chroma_minus8; + 
result.log2_max_pic_order_cnt_lsb_minus4 = pic->pps->sps->log2_max_pic_order_cnt_lsb_minus4; + result.sps_max_dec_pic_buffering_minus1 = pic->pps->sps->sps_max_dec_pic_buffering_minus1; + result.log2_min_luma_coding_block_size_minus3 = pic->pps->sps->log2_min_luma_coding_block_size_minus3; + result.log2_diff_max_min_luma_coding_block_size = pic->pps->sps->log2_diff_max_min_luma_coding_block_size; + result.log2_min_transform_block_size_minus2 = pic->pps->sps->log2_min_transform_block_size_minus2; + result.log2_diff_max_min_transform_block_size = pic->pps->sps->log2_diff_max_min_transform_block_size; + result.max_transform_hierarchy_depth_inter = pic->pps->sps->max_transform_hierarchy_depth_inter; + result.max_transform_hierarchy_depth_intra = pic->pps->sps->max_transform_hierarchy_depth_intra; + result.pcm_sample_bit_depth_luma_minus1 = pic->pps->sps->pcm_sample_bit_depth_luma_minus1; + result.pcm_sample_bit_depth_chroma_minus1 = pic->pps->sps->pcm_sample_bit_depth_chroma_minus1; + result.log2_min_pcm_luma_coding_block_size_minus3 = pic->pps->sps->log2_min_pcm_luma_coding_block_size_minus3; + result.log2_diff_max_min_pcm_luma_coding_block_size = pic->pps->sps->log2_diff_max_min_pcm_luma_coding_block_size; + result.num_short_term_ref_pic_sets = pic->pps->sps->num_short_term_ref_pic_sets; + + result.pps_info_flags = 0; + result.pps_info_flags |= pic->pps->dependent_slice_segments_enabled_flag << 0; + result.pps_info_flags |= pic->pps->output_flag_present_flag << 1; + result.pps_info_flags |= pic->pps->sign_data_hiding_enabled_flag << 2; + result.pps_info_flags |= pic->pps->cabac_init_present_flag << 3; + result.pps_info_flags |= pic->pps->constrained_intra_pred_flag << 4; + result.pps_info_flags |= pic->pps->transform_skip_enabled_flag << 5; + result.pps_info_flags |= pic->pps->cu_qp_delta_enabled_flag << 6; + result.pps_info_flags |= pic->pps->pps_slice_chroma_qp_offsets_present_flag << 7; + result.pps_info_flags |= pic->pps->weighted_pred_flag << 8; + result.pps_info_flags |= pic->pps->weighted_bipred_flag << 9; + result.pps_info_flags |= pic->pps->transquant_bypass_enabled_flag << 10; + result.pps_info_flags |= pic->pps->tiles_enabled_flag << 11; + result.pps_info_flags |= pic->pps->entropy_coding_sync_enabled_flag << 12; + result.pps_info_flags |= pic->pps->uniform_spacing_flag << 13; + result.pps_info_flags |= pic->pps->loop_filter_across_tiles_enabled_flag << 14; + result.pps_info_flags |= pic->pps->pps_loop_filter_across_slices_enabled_flag << 15; + result.pps_info_flags |= pic->pps->deblocking_filter_override_enabled_flag << 16; + result.pps_info_flags |= pic->pps->pps_deblocking_filter_disabled_flag << 17; + result.pps_info_flags |= pic->pps->lists_modification_present_flag << 18; + result.pps_info_flags |= pic->pps->slice_segment_header_extension_present_flag << 19; + //result.pps_info_flags |= pic->pps->deblocking_filter_control_present_flag; ??? 
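/* Editor's note: the sps_info_flags/pps_info_flags words built above all follow one pattern: each one-bit bitstream flag is shifted to the fixed position the UVD firmware expects and OR'd into a 32-bit word. A hypothetical helper capturing the idiom (the driver deliberately writes it out longhand, one flag per line): static inline uint32_t ruvd_pack_flag(uint32_t word, uint32_t flag, unsigned bit) { return word | ((flag & 1) << bit); } e.g. result.pps_info_flags = ruvd_pack_flag(result.pps_info_flags, pic->pps->tiles_enabled_flag, 11); */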
+ + result.num_extra_slice_header_bits = pic->pps->num_extra_slice_header_bits; + result.num_long_term_ref_pic_sps = pic->pps->sps->num_long_term_ref_pics_sps; + result.num_ref_idx_l0_default_active_minus1 = pic->pps->num_ref_idx_l0_default_active_minus1; + result.num_ref_idx_l1_default_active_minus1 = pic->pps->num_ref_idx_l1_default_active_minus1; + result.pps_cb_qp_offset = pic->pps->pps_cb_qp_offset; + result.pps_cr_qp_offset = pic->pps->pps_cr_qp_offset; + result.pps_beta_offset_div2 = pic->pps->pps_beta_offset_div2; + result.pps_tc_offset_div2 = pic->pps->pps_tc_offset_div2; + result.diff_cu_qp_delta_depth = pic->pps->diff_cu_qp_delta_depth; + result.num_tile_columns_minus1 = pic->pps->num_tile_columns_minus1; + result.num_tile_rows_minus1 = pic->pps->num_tile_rows_minus1; + result.log2_parallel_merge_level_minus2 = pic->pps->log2_parallel_merge_level_minus2; + result.init_qp_minus26 = pic->pps->init_qp_minus26; + + for (i = 0; i < 19; ++i) + result.column_width_minus1[i] = pic->pps->column_width_minus1[i]; + + for (i = 0; i < 21; ++i) + result.row_height_minus1[i] = pic->pps->row_height_minus1[i]; + + result.num_delta_pocs_ref_rps_idx = pic->NumDeltaPocsOfRefRpsIdx; + result.curr_idx = pic->CurrPicOrderCntVal; + result.curr_poc = pic->CurrPicOrderCntVal; + + vl_video_buffer_set_associated_data(target, &dec->base, + (void *)(uintptr_t)pic->CurrPicOrderCntVal, + &ruvd_destroy_associated_data); + + for (i = 0; i < 16; ++i) { + struct pipe_video_buffer *ref = pic->ref[i]; + uintptr_t ref_pic = 0; + + result.poc_list[i] = pic->PicOrderCntVal[i]; + + if (ref) + ref_pic = (uintptr_t)vl_video_buffer_get_associated_data(ref, &dec->base); + else + ref_pic = 0x7F; + result.ref_pic_list[i] = ref_pic; + } + + for (i = 0; i < 8; ++i) { + result.ref_pic_set_st_curr_before[i] = 0xFF; + result.ref_pic_set_st_curr_after[i] = 0xFF; + result.ref_pic_set_lt_curr[i] = 0xFF; + } + + for (i = 0; i < pic->NumPocStCurrBefore; ++i) + result.ref_pic_set_st_curr_before[i] = pic->RefPicSetStCurrBefore[i]; + + for (i = 0; i < pic->NumPocStCurrAfter; ++i) + result.ref_pic_set_st_curr_after[i] = pic->RefPicSetStCurrAfter[i]; + + for (i = 0; i < pic->NumPocLtCurr; ++i) + result.ref_pic_set_lt_curr[i] = pic->RefPicSetLtCurr[i]; + + for (i = 0; i < 6; ++i) + result.ucScalingListDCCoefSizeID2[i] = pic->pps->sps->ScalingListDCCoeff16x16[i]; + + for (i = 0; i < 2; ++i) + result.ucScalingListDCCoefSizeID3[i] = pic->pps->sps->ScalingListDCCoeff32x32[i]; + + memcpy(dec->it, pic->pps->sps->ScalingList4x4, 6 * 16); + memcpy(dec->it + 96, pic->pps->sps->ScalingList8x8, 6 * 64); + memcpy(dec->it + 480, pic->pps->sps->ScalingList16x16, 6 * 64); + memcpy(dec->it + 864, pic->pps->sps->ScalingList32x32, 2 * 64); + + /* TODO + result.highestTid; + result.isNonRef; + + IDRPicFlag; + RAPPicFlag; + NumPocTotalCurr; + NumShortTermPictureSliceHeaderBits; + NumLongTermPictureSliceHeaderBits; + + IsLongTerm[16]; + */ + + return result; +} + /* get vc1 specific message bits */ static struct ruvd_vc1 get_vc1_msg(struct pipe_vc1_picture_desc *pic) { @@ -556,7 +807,7 @@ static void ruvd_destroy(struct pipe_video_codec *decoder) assert(decoder); - map_msg_fb_buf(dec); + map_msg_fb_it_buf(dec); memset(dec->msg, 0, sizeof(*dec->msg)); dec->msg->size = sizeof(*dec->msg); dec->msg->msg_type = RUVD_MSG_DESTROY; @@ -568,21 +819,17 @@ static void ruvd_destroy(struct pipe_video_codec *decoder) dec->ws->cs_destroy(dec->cs); for (i = 0; i < NUM_BUFFERS; ++i) { - rvid_destroy_buffer(&dec->msg_fb_buffers[i]); + 
rvid_destroy_buffer(&dec->msg_fb_it_buffers[i]); rvid_destroy_buffer(&dec->bs_buffers[i]); } rvid_destroy_buffer(&dec->dpb); + if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC) + rvid_destroy_buffer(&dec->ctx); FREE(dec); } -/* free associated data in the video buffer callback */ -static void ruvd_destroy_associated_data(void *data) -{ - /* NOOP, since we only use an intptr */ -} - /** * start decoding of a new frame */ @@ -670,7 +917,7 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, { struct ruvd_decoder *dec = (struct ruvd_decoder*)decoder; struct radeon_winsys_cs_handle *dt; - struct rvid_buffer *msg_fb_buf, *bs_buf; + struct rvid_buffer *msg_fb_it_buf, *bs_buf; unsigned bs_size; assert(decoder); @@ -678,26 +925,27 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, if (!dec->bs_ptr) return; - msg_fb_buf = &dec->msg_fb_buffers[dec->cur_buffer]; + msg_fb_it_buf = &dec->msg_fb_it_buffers[dec->cur_buffer]; bs_buf = &dec->bs_buffers[dec->cur_buffer]; bs_size = align(dec->bs_size, 128); memset(dec->bs_ptr, 0, bs_size - dec->bs_size); dec->ws->buffer_unmap(bs_buf->res->cs_buf); - map_msg_fb_buf(dec); + map_msg_fb_it_buf(dec); dec->msg->size = sizeof(*dec->msg); dec->msg->msg_type = RUVD_MSG_DECODE; dec->msg->stream_handle = dec->stream_handle; dec->msg->status_report_feedback_number = dec->frame_number; - dec->msg->body.decode.stream_type = profile2stream_type(dec->base.profile); + dec->msg->body.decode.stream_type = dec->stream_type; dec->msg->body.decode.decode_flags = 0x1; dec->msg->body.decode.width_in_samples = dec->base.width; dec->msg->body.decode.height_in_samples = dec->base.height; dec->msg->body.decode.dpb_size = dec->dpb.res->buf->size; dec->msg->body.decode.bsd_size = bs_size; + dec->msg->body.decode.db_pitch = dec->base.width; dt = dec->set_dtb(dec->msg, (struct vl_video_buffer *)target); @@ -706,6 +954,10 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, dec->msg->body.decode.codec.h264 = get_h264_msg(dec, (struct pipe_h264_picture_desc*)picture); break; + case PIPE_VIDEO_FORMAT_HEVC: + dec->msg->body.decode.codec.h265 = get_h265_msg(dec, target, (struct pipe_h265_picture_desc*)picture); + break; + case PIPE_VIDEO_FORMAT_VC1: dec->msg->body.decode.codec.vc1 = get_vc1_msg((struct pipe_vc1_picture_desc*)picture); break; @@ -733,12 +985,19 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, send_cmd(dec, RUVD_CMD_DPB_BUFFER, dec->dpb.res->cs_buf, 0, RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM); + if (u_reduce_video_profile(picture->profile) == PIPE_VIDEO_FORMAT_HEVC) { + send_cmd(dec, RUVD_CMD_CONTEXT_BUFFER, dec->ctx.res->cs_buf, 0, + RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM); + } send_cmd(dec, RUVD_CMD_BITSTREAM_BUFFER, bs_buf->res->cs_buf, 0, RADEON_USAGE_READ, RADEON_DOMAIN_GTT); send_cmd(dec, RUVD_CMD_DECODING_TARGET_BUFFER, dt, 0, RADEON_USAGE_WRITE, RADEON_DOMAIN_VRAM); - send_cmd(dec, RUVD_CMD_FEEDBACK_BUFFER, msg_fb_buf->res->cs_buf, + send_cmd(dec, RUVD_CMD_FEEDBACK_BUFFER, msg_fb_it_buf->res->cs_buf, FB_BUFFER_OFFSET, RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT); + if (have_it(dec)) + send_cmd(dec, RUVD_CMD_ITSCALING_TABLE_BUFFER, msg_fb_it_buf->res->cs_buf, + FB_BUFFER_OFFSET + FB_BUFFER_SIZE, RADEON_USAGE_READ, RADEON_DOMAIN_GTT); set_reg(dec, RUVD_ENGINE_CNTL, 1); flush(dec); @@ -760,7 +1019,8 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, ruvd_set_dtb set_dtb) { struct radeon_winsys* ws = ((struct r600_common_context *)context)->ws; - unsigned dpb_size = 
calc_dpb_size(templ); + struct r600_common_context *rctx = (struct r600_common_context*)context; + unsigned dpb_size; unsigned width = templ->width, height = templ->height; unsigned bs_buf_size; struct radeon_info info; @@ -791,6 +1051,9 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, if (!dec) return NULL; + if (info.drm_major < 3) + dec->use_legacy = TRUE; + dec->base = *templ; dec->base.context = context; dec->base.width = width; @@ -803,11 +1066,12 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, dec->base.end_frame = ruvd_end_frame; dec->base.flush = ruvd_flush; + dec->stream_type = profile2stream_type(dec, info.family); dec->set_dtb = set_dtb; dec->stream_handle = rvid_alloc_stream_handle(); dec->screen = context->screen; dec->ws = ws; - dec->cs = ws->cs_create(ws, RING_UVD, NULL, NULL, NULL); + dec->cs = ws->cs_create(rctx->ctx, RING_UVD, NULL, NULL, NULL); if (!dec->cs) { RVID_ERR("Can't get command submission context.\n"); goto error; @@ -815,10 +1079,12 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, bs_buf_size = width * height * 512 / (16 * 16); for (i = 0; i < NUM_BUFFERS; ++i) { - unsigned msg_fb_size = FB_BUFFER_OFFSET + FB_BUFFER_SIZE; + unsigned msg_fb_it_size = FB_BUFFER_OFFSET + FB_BUFFER_SIZE; STATIC_ASSERT(sizeof(struct ruvd_msg) <= FB_BUFFER_OFFSET); - if (!rvid_create_buffer(dec->screen, &dec->msg_fb_buffers[i], - msg_fb_size, PIPE_USAGE_STAGING)) { + if (have_it(dec)) + msg_fb_it_size += IT_SCALING_TABLE_SIZE; + if (!rvid_create_buffer(dec->screen, &dec->msg_fb_it_buffers[i], + msg_fb_it_size, PIPE_USAGE_STAGING)) { RVID_ERR("Can't allocate message buffers.\n"); goto error; } @@ -829,10 +1095,12 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, goto error; } - rvid_clear_buffer(context, &dec->msg_fb_buffers[i]); + rvid_clear_buffer(context, &dec->msg_fb_it_buffers[i]); rvid_clear_buffer(context, &dec->bs_buffers[i]); } + dpb_size = calc_dpb_size(dec); + if (!rvid_create_buffer(dec->screen, &dec->dpb, dpb_size, PIPE_USAGE_DEFAULT)) { RVID_ERR("Can't allocate dpb.\n"); goto error; @@ -840,14 +1108,23 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, rvid_clear_buffer(context, &dec->dpb); - map_msg_fb_buf(dec); + if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC) { + unsigned ctx_size = calc_ctx_size(dec); + if (!rvid_create_buffer(dec->screen, &dec->ctx, ctx_size, PIPE_USAGE_DEFAULT)) { + RVID_ERR("Can't allocate context buffer.\n"); + goto error; + } + rvid_clear_buffer(context, &dec->ctx); + } + + map_msg_fb_it_buf(dec); dec->msg->size = sizeof(*dec->msg); dec->msg->msg_type = RUVD_MSG_CREATE; dec->msg->stream_handle = dec->stream_handle; - dec->msg->body.create.stream_type = profile2stream_type(dec->base.profile); + dec->msg->body.create.stream_type = dec->stream_type; dec->msg->body.create.width_in_samples = dec->base.width; dec->msg->body.create.height_in_samples = dec->base.height; - dec->msg->body.create.dpb_size = dec->dpb.res->buf->size; + dec->msg->body.create.dpb_size = dpb_size; send_msg_buf(dec); flush(dec); next_buffer(dec); @@ -858,11 +1135,13 @@ error: if (dec->cs) dec->ws->cs_destroy(dec->cs); for (i = 0; i < NUM_BUFFERS; ++i) { - rvid_destroy_buffer(&dec->msg_fb_buffers[i]); + rvid_destroy_buffer(&dec->msg_fb_it_buffers[i]); rvid_destroy_buffer(&dec->bs_buffers[i]); } rvid_destroy_buffer(&dec->dpb); + if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC) +
rvid_destroy_buffer(&dec->ctx); FREE(dec); diff --git a/src/gallium/drivers/radeon/radeon_uvd.h b/src/gallium/drivers/radeon/radeon_uvd.h index 7442865c9ec..452fbd60880 100644 --- a/src/gallium/drivers/radeon/radeon_uvd.h +++ b/src/gallium/drivers/radeon/radeon_uvd.h @@ -62,6 +62,8 @@ #define RUVD_CMD_DECODING_TARGET_BUFFER 0x00000002 #define RUVD_CMD_FEEDBACK_BUFFER 0x00000003 #define RUVD_CMD_BITSTREAM_BUFFER 0x00000100 +#define RUVD_CMD_ITSCALING_TABLE_BUFFER 0x00000204 +#define RUVD_CMD_CONTEXT_BUFFER 0x00000206 /* UVD message types */ #define RUVD_MSG_CREATE 0 @@ -73,6 +75,8 @@ #define RUVD_CODEC_VC1 0x00000001 #define RUVD_CODEC_MPEG2 0x00000003 #define RUVD_CODEC_MPEG4 0x00000004 +#define RUVD_CODEC_H264_PERF 0x00000007 +#define RUVD_CODEC_H265 0x00000010 /* UVD decode target buffer tiling mode */ #define RUVD_TILE_LINEAR 0x00000000 @@ -171,6 +175,66 @@ struct ruvd_h264 { } mvc; }; +struct ruvd_h265 { + uint32_t sps_info_flags; + uint32_t pps_info_flags; + + uint8_t chroma_format; + uint8_t bit_depth_luma_minus8; + uint8_t bit_depth_chroma_minus8; + uint8_t log2_max_pic_order_cnt_lsb_minus4; + + uint8_t sps_max_dec_pic_buffering_minus1; + uint8_t log2_min_luma_coding_block_size_minus3; + uint8_t log2_diff_max_min_luma_coding_block_size; + uint8_t log2_min_transform_block_size_minus2; + + uint8_t log2_diff_max_min_transform_block_size; + uint8_t max_transform_hierarchy_depth_inter; + uint8_t max_transform_hierarchy_depth_intra; + uint8_t pcm_sample_bit_depth_luma_minus1; + + uint8_t pcm_sample_bit_depth_chroma_minus1; + uint8_t log2_min_pcm_luma_coding_block_size_minus3; + uint8_t log2_diff_max_min_pcm_luma_coding_block_size; + uint8_t num_extra_slice_header_bits; + + uint8_t num_short_term_ref_pic_sets; + uint8_t num_long_term_ref_pic_sps; + uint8_t num_ref_idx_l0_default_active_minus1; + uint8_t num_ref_idx_l1_default_active_minus1; + + int8_t pps_cb_qp_offset; + int8_t pps_cr_qp_offset; + int8_t pps_beta_offset_div2; + int8_t pps_tc_offset_div2; + + uint8_t diff_cu_qp_delta_depth; + uint8_t num_tile_columns_minus1; + uint8_t num_tile_rows_minus1; + uint8_t log2_parallel_merge_level_minus2; + + uint16_t column_width_minus1[19]; + uint16_t row_height_minus1[21]; + + int8_t init_qp_minus26; + uint8_t num_delta_pocs_ref_rps_idx; + uint8_t curr_idx; + uint8_t reserved1; + int32_t curr_poc; + uint8_t ref_pic_list[16]; + int32_t poc_list[16]; + uint8_t ref_pic_set_st_curr_before[8]; + uint8_t ref_pic_set_st_curr_after[8]; + uint8_t ref_pic_set_lt_curr[8]; + + uint8_t ucScalingListDCCoefSizeID2[6]; + uint8_t ucScalingListDCCoefSizeID3[2]; + + uint8_t highestTid; + uint8_t isNonRef; +}; + struct ruvd_vc1 { uint32_t profile; uint32_t level; @@ -327,6 +391,7 @@ struct ruvd_msg { union { struct ruvd_h264 h264; + struct ruvd_h265 h265; struct ruvd_vc1 vc1; struct ruvd_mpeg2 mpeg2; struct ruvd_mpeg4 mpeg4; diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c index a6567379fe3..7eab974a3df 100644 --- a/src/gallium/drivers/radeon/radeon_vce.c +++ b/src/gallium/drivers/radeon/radeon_vce.c @@ -47,6 +47,8 @@ #define FW_40_2_2 ((40 << 24) | (2 << 16) | (2 << 8)) #define FW_50_0_1 ((50 << 24) | (0 << 16) | (1 << 8)) #define FW_50_1_2 ((50 << 24) | (1 << 16) | (2 << 8)) +#define FW_50_10_2 ((50 << 24) | (10 << 16) | (2 << 8)) +#define FW_50_17_3 ((50 << 24) | (17 << 16) | (3 << 8)) /** * flush commands to the hardware @@ -54,6 +56,8 @@ static void flush(struct rvce_encoder *enc) { enc->ws->cs_flush(enc->cs, RADEON_FLUSH_ASYNC, NULL, 0); + enc->task_info_idx = 
0; + enc->bs_idx = 0; } #if 0 @@ -214,7 +218,7 @@ struct rvce_cpb_slot *l1_slot(struct rvce_encoder *enc) * Calculate the offsets into the CPB */ void rvce_frame_offset(struct rvce_encoder *enc, struct rvce_cpb_slot *slot, - unsigned *luma_offset, unsigned *chroma_offset) + signed *luma_offset, signed *chroma_offset) { unsigned pitch = align(enc->luma->level[0].pitch_bytes, 128); unsigned vpitch = align(enc->luma->npix_y, 16); @@ -278,24 +282,19 @@ static void rvce_begin_frame(struct pipe_video_codec *encoder, enc->fb = &fb; enc->session(enc); enc->create(enc); - enc->rate_control(enc); - need_rate_control = false; - enc->config_extension(enc); - enc->motion_estimation(enc); - enc->rdo(enc); - if (enc->use_vui) - enc->vui(enc); - enc->pic_control(enc); + enc->config(enc); enc->feedback(enc); flush(enc); //dump_feedback(enc, &fb); rvid_destroy_buffer(&fb); + need_rate_control = false; } - enc->session(enc); - - if (need_rate_control) - enc->rate_control(enc); + if (need_rate_control) { + enc->session(enc); + enc->config(enc); + flush(enc); + } } static void rvce_encode_bitstream(struct pipe_video_codec *encoder, @@ -312,6 +311,8 @@ static void rvce_encode_bitstream(struct pipe_video_codec *encoder, RVID_ERR("Can't create feedback buffer.\n"); return; } + if (!enc->cs->cdw) + enc->session(enc); enc->encode(enc); enc->feedback(enc); } @@ -324,7 +325,8 @@ static void rvce_end_frame(struct pipe_video_codec *encoder, struct rvce_cpb_slot *slot = LIST_ENTRY( struct rvce_cpb_slot, enc->cpb_slots.prev, list); - flush(enc); + if (!enc->dual_inst || enc->bs_idx > 1) + flush(enc); /* update the CPB backtrack with the just encoded frame */ slot->picture_type = enc->pic.picture_type; @@ -363,6 +365,9 @@ static void rvce_get_feedback(struct pipe_video_codec *encoder, */ static void rvce_flush(struct pipe_video_codec *encoder) { + struct rvce_encoder *enc = (struct rvce_encoder*)encoder; + + flush(enc); } static void rvce_cs_flush(void *ctx, unsigned flags, @@ -377,6 +382,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, rvce_get_buffer get_buffer) { struct r600_common_screen *rscreen = (struct r600_common_screen *)context->screen; + struct r600_common_context *rctx = (struct r600_common_context*)context; struct rvce_encoder *enc; struct pipe_video_buffer *tmp_buf, templat = {}; struct radeon_surf *tmp_surf; @@ -395,8 +401,17 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, if (!enc) return NULL; + if (rscreen->info.drm_major == 3) + enc->use_vm = true; if ((rscreen->info.drm_major > 2) || (rscreen->info.drm_minor >= 42)) enc->use_vui = true; + if (rscreen->info.family >= CHIP_TONGA) + enc->dual_pipe = true; + /* TODO enable B frame with dual instance */ + if ((rscreen->info.family >= CHIP_TONGA) && + (templ->max_references == 1) && + (rscreen->info.vce_harvest_config == 0)) + enc->dual_inst = true; enc->base = *templ; enc->base.context = context; @@ -411,7 +426,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, enc->screen = context->screen; enc->ws = ws; - enc->cs = ws->cs_create(ws, RING_VCE, rvce_cs_flush, enc, NULL); + enc->cs = ws->cs_create(rctx->ctx, RING_VCE, rvce_cs_flush, enc, NULL); if (!enc->cs) { RVID_ERR("Can't get command submission context.\n"); goto error; @@ -436,6 +451,9 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, cpb_size = cpb_size * align(tmp_surf->npix_y, 16); cpb_size = cpb_size * 3 / 2; cpb_size = cpb_size * enc->cpb_num; + if (enc->dual_pipe) + cpb_size += 
RVCE_MAX_AUX_BUFFER_NUM * + RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE * 2; tmp_buf->destroy(tmp_buf); if (!rvid_create_buffer(enc->screen, &enc->cpb, cpb_size, PIPE_USAGE_DEFAULT)) { RVID_ERR("Can't create CPB buffer.\n"); @@ -455,6 +473,8 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, case FW_50_0_1: case FW_50_1_2: + case FW_50_10_2: + case FW_50_17_3: radeon_vce_50_init(enc); break; @@ -482,5 +502,29 @@ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen) { return rscreen->info.vce_fw_version == FW_40_2_2 || rscreen->info.vce_fw_version == FW_50_0_1 || - rscreen->info.vce_fw_version == FW_50_1_2; + rscreen->info.vce_fw_version == FW_50_1_2 || + rscreen->info.vce_fw_version == FW_50_10_2 || + rscreen->info.vce_fw_version == FW_50_17_3; +} + +/** + * Add the buffer as relocation to the current command submission + */ +void rvce_add_buffer(struct rvce_encoder *enc, struct radeon_winsys_cs_handle *buf, + enum radeon_bo_usage usage, enum radeon_bo_domain domain, + signed offset) +{ + int reloc_idx; + + reloc_idx = enc->ws->cs_add_reloc(enc->cs, buf, usage, domain, RADEON_PRIO_MIN); + if (enc->use_vm) { + uint64_t addr; + addr = enc->ws->buffer_get_virtual_address(buf); + addr = addr + offset; + RVCE_CS(addr >> 32); + RVCE_CS(addr); + } else { + RVCE_CS(reloc_idx * 4); + RVCE_CS(offset); + } } diff --git a/src/gallium/drivers/radeon/radeon_vce.h b/src/gallium/drivers/radeon/radeon_vce.h index 8319ef48cd5..624bda479f8 100644 --- a/src/gallium/drivers/radeon/radeon_vce.h +++ b/src/gallium/drivers/radeon/radeon_vce.h @@ -36,15 +36,16 @@ #include "util/list.h" -#define RVCE_RELOC(buf, usage, domain) (enc->ws->cs_add_reloc(enc->cs, (buf), (usage), domain, RADEON_PRIO_MIN)) - #define RVCE_CS(value) (enc->cs->buf[enc->cs->cdw++] = (value)) #define RVCE_BEGIN(cmd) { uint32_t *begin = &enc->cs->buf[enc->cs->cdw++]; RVCE_CS(cmd) -#define RVCE_READ(buf, domain) RVCE_CS(RVCE_RELOC(buf, RADEON_USAGE_READ, domain) * 4) -#define RVCE_WRITE(buf, domain) RVCE_CS(RVCE_RELOC(buf, RADEON_USAGE_WRITE, domain) * 4) -#define RVCE_READWRITE(buf, domain) RVCE_CS(RVCE_RELOC(buf, RADEON_USAGE_READWRITE, domain) * 4) +#define RVCE_READ(buf, domain, off) rvce_add_buffer(enc, (buf), RADEON_USAGE_READ, (domain), (off)) +#define RVCE_WRITE(buf, domain, off) rvce_add_buffer(enc, (buf), RADEON_USAGE_WRITE, (domain), (off)) +#define RVCE_READWRITE(buf, domain, off) rvce_add_buffer(enc, (buf), RADEON_USAGE_READWRITE, (domain), (off)) #define RVCE_END() *begin = (&enc->cs->buf[enc->cs->cdw] - begin) * 4; } +#define RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE (4096 * 16 * 2.5) +#define RVCE_MAX_AUX_BUFFER_NUM 4 + struct r600_common_screen; /* driver dependent callback */ @@ -76,8 +77,12 @@ struct rvce_encoder { void (*motion_estimation)(struct rvce_encoder *enc); void (*rdo)(struct rvce_encoder *enc); void (*vui)(struct rvce_encoder *enc); + void (*config)(struct rvce_encoder *enc); void (*encode)(struct rvce_encoder *enc); void (*destroy)(struct rvce_encoder *enc); + void (*task_info)(struct rvce_encoder *enc, uint32_t op, + uint32_t dep, uint32_t fb_idx, + uint32_t ring_idx); unsigned stream_handle; @@ -101,7 +106,14 @@ struct rvce_encoder { struct rvid_buffer *fb; struct rvid_buffer cpb; struct pipe_h264_enc_picture_desc pic; - bool use_vui; + + unsigned task_info_idx; + unsigned bs_idx; + + bool use_vm; + bool use_vui; + bool dual_pipe; + bool dual_inst; }; /* CPB handling functions */ @@ -109,7 +121,7 @@ struct rvce_cpb_slot *current_slot(struct rvce_encoder *enc); struct rvce_cpb_slot 
*l0_slot(struct rvce_encoder *enc); struct rvce_cpb_slot *l1_slot(struct rvce_encoder *enc); void rvce_frame_offset(struct rvce_encoder *enc, struct rvce_cpb_slot *slot, - unsigned *luma_offset, unsigned *chroma_offset); + signed *luma_offset, signed *chroma_offset); struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, const struct pipe_video_codec *templat, @@ -118,6 +130,10 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen); +void rvce_add_buffer(struct rvce_encoder *enc, struct radeon_winsys_cs_handle *buf, + enum radeon_bo_usage usage, enum radeon_bo_domain domain, + signed offset); + /* init vce fw 40.2.2 specific callbacks */ void radeon_vce_40_2_2_init(struct rvce_encoder *enc); diff --git a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c index 51b17b5f6a8..e64fbc7afb0 100644 --- a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c +++ b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c @@ -53,30 +53,38 @@ static void session(struct rvce_encoder *enc) RVCE_END(); } -static void task_info(struct rvce_encoder *enc, uint32_t taskOperation) +static void task_info(struct rvce_encoder *enc, uint32_t op, + uint32_t dep, uint32_t fb_idx, uint32_t ring_idx) { RVCE_BEGIN(0x00000002); // task info + if (op == 0x3) { + if (enc->task_info_idx) { + uint32_t offs = enc->cs->cdw - enc->task_info_idx + 3; + // Update offsetOfNextTaskInfo + enc->cs->buf[enc->task_info_idx] = offs; + } + enc->task_info_idx = enc->cs->cdw; + } RVCE_CS(0xffffffff); // offsetOfNextTaskInfo - RVCE_CS(taskOperation); // taskOperation - RVCE_CS(0x00000000); // referencePictureDependency + RVCE_CS(op); // taskOperation + RVCE_CS(dep); // referencePictureDependency RVCE_CS(0x00000000); // collocateFlagDependency - RVCE_CS(0x00000000); // feedbackIndex - RVCE_CS(0x00000000); // videoBitstreamRingIndex + RVCE_CS(fb_idx); // feedbackIndex + RVCE_CS(ring_idx); // videoBitstreamRingIndex RVCE_END(); } static void feedback(struct rvce_encoder *enc) { RVCE_BEGIN(0x05000005); // feedback buffer - RVCE_WRITE(enc->fb->res->cs_buf, enc->fb->res->domains); // feedbackRingAddressHi - RVCE_CS(0x00000000); // feedbackRingAddressLo + RVCE_WRITE(enc->fb->res->cs_buf, enc->fb->res->domains, 0x0); // feedbackRingAddressHi/Lo RVCE_CS(0x00000001); // feedbackRingSize RVCE_END(); } static void create(struct rvce_encoder *enc) { - task_info(enc, 0x00000000); + enc->task_info(enc, 0x00000000, 0, 0, 0); RVCE_BEGIN(0x01000001); // create cmd RVCE_CS(0x00000000); // encUseCircularBuffer @@ -272,21 +280,31 @@ static void vui(struct rvce_encoder *enc) RVCE_END(); } +static void config(struct rvce_encoder *enc) +{ + enc->task_info(enc, 0x00000002, 0, 0xffffffff, 0); + enc->rate_control(enc); + enc->config_extension(enc); + enc->motion_estimation(enc); + enc->rdo(enc); + if (enc->use_vui) + enc->vui(enc); + enc->pic_control(enc); +} + static void encode(struct rvce_encoder *enc) { + signed luma_offset, chroma_offset; int i; - unsigned luma_offset, chroma_offset; - task_info(enc, 0x00000003); + enc->task_info(enc, 0x00000003, 0, 0, 0); RVCE_BEGIN(0x05000001); // context buffer - RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains); // encodeContextAddressHi - RVCE_CS(0x00000000); // encodeContextAddressLo + RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0x0); // encodeContextAddressHi/Lo RVCE_END(); RVCE_BEGIN(0x05000004); // video bitstream buffer - RVCE_WRITE(enc->bs_handle, 
RADEON_DOMAIN_GTT); // videoBitstreamRingAddressHi - RVCE_CS(0x00000000); // videoBitstreamRingAddressLo + RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT, 0x0); // videoBitstreamRingAddressHi/Lo RVCE_CS(enc->bs_size); // videoBitstreamRingSize RVCE_END(); @@ -298,10 +316,10 @@ static void encode(struct rvce_encoder *enc) RVCE_CS(0x00000000); // insertAUD RVCE_CS(0x00000000); // endOfSequence RVCE_CS(0x00000000); // endOfStream - RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureLumaAddressHi - RVCE_CS(enc->luma->level[0].offset); // inputPictureLumaAddressLo - RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureChromaAddressHi - RVCE_CS(enc->chroma->level[0].offset); // inputPictureChromaAddressLo + RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM, + enc->luma->level[0].offset); // inputPictureLumaAddressHi/Lo + RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM, + enc->chroma->level[0].offset); // inputPictureChromaAddressHi/Lo RVCE_CS(align(enc->luma->npix_y, 16)); // encInputFrameYPitch RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch @@ -404,7 +422,7 @@ static void encode(struct rvce_encoder *enc) static void destroy(struct rvce_encoder *enc) { - task_info(enc, 0x00000001); + enc->task_info(enc, 0x00000001, 0, 0, 0); RVCE_BEGIN(0x02000001); // destroy RVCE_END(); @@ -413,6 +431,7 @@ static void destroy(struct rvce_encoder *enc) void radeon_vce_40_2_2_init(struct rvce_encoder *enc) { enc->session = session; + enc->task_info = task_info; enc->create = create; enc->feedback = feedback; enc->rate_control = rate_control; @@ -421,6 +440,7 @@ void radeon_vce_40_2_2_init(struct rvce_encoder *enc) enc->motion_estimation = motion_estimation; enc->rdo = rdo; enc->vui = vui; + enc->config = config; enc->encode = encode; enc->destroy = destroy; } diff --git a/src/gallium/drivers/radeon/radeon_vce_50.c b/src/gallium/drivers/radeon/radeon_vce_50.c index 84a2bfb117e..afdab18c0d3 100644 --- a/src/gallium/drivers/radeon/radeon_vce_50.c +++ b/src/gallium/drivers/radeon/radeon_vce_50.c @@ -44,18 +44,6 @@ #include "radeon_video.h" #include "radeon_vce.h" -static void task_info(struct rvce_encoder *enc, uint32_t taskOperation) -{ - RVCE_BEGIN(0x00000002); // task info - RVCE_CS(0xffffffff); // offsetOfNextTaskInfo - RVCE_CS(taskOperation); // taskOperation - RVCE_CS(0x00000000); // referencePictureDependency - RVCE_CS(0x00000000); // collocateFlagDependency - RVCE_CS(0x00000000); // feedbackIndex - RVCE_CS(0x00000000); // videoBitstreamRingIndex - RVCE_END(); -} - static void rate_control(struct rvce_encoder *enc) { RVCE_BEGIN(0x04000005); // rate control @@ -90,22 +78,46 @@ static void rate_control(struct rvce_encoder *enc) static void encode(struct rvce_encoder *enc) { + signed luma_offset, chroma_offset, bs_offset; + unsigned dep, bs_idx = enc->bs_idx++; int i; - unsigned luma_offset, chroma_offset; - task_info(enc, 0x00000003); + if (enc->dual_inst) { + if (bs_idx == 0) + dep = 1; + else if (enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR) + dep = 0; + else + dep = 2; + } else + dep = 0; + + enc->task_info(enc, 0x00000003, dep, 0, bs_idx); RVCE_BEGIN(0x05000001); // context buffer - RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains); // encodeContextAddressHi - RVCE_CS(0x00000000); // encodeContextAddressLo + RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0); // encodeContextAddressHi/Lo RVCE_END(); + bs_offset = -(signed)(bs_idx * enc->bs_size); + RVCE_BEGIN(0x05000004); // video bitstream 
buffer - RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT); // videoBitstreamRingAddressHi - RVCE_CS(0x00000000); // videoBitstreamRingAddressLo + RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT, bs_offset); // videoBitstreamRingAddressHi/Lo RVCE_CS(enc->bs_size); // videoBitstreamRingSize RVCE_END(); + if (enc->dual_pipe) { + unsigned aux_offset = enc->cpb.res->buf->size - + RVCE_MAX_AUX_BUFFER_NUM * RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE * 2; + RVCE_BEGIN(0x05000002); // auxiliary buffer + for (i = 0; i < 8; ++i) { + RVCE_CS(aux_offset); + aux_offset += RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE; + } + for (i = 0; i < 8; ++i) + RVCE_CS(RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE); + RVCE_END(); + } + RVCE_BEGIN(0x03000001); // encode RVCE_CS(enc->pic.frame_num ? 0x0 : 0x11); // insertHeaders RVCE_CS(0x00000000); // pictureStructure @@ -114,14 +126,17 @@ static void encode(struct rvce_encoder *enc) RVCE_CS(0x00000000); // insertAUD RVCE_CS(0x00000000); // endOfSequence RVCE_CS(0x00000000); // endOfStream - RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureLumaAddressHi - RVCE_CS(enc->luma->level[0].offset); // inputPictureLumaAddressLo - RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureChromaAddressHi - RVCE_CS(enc->chroma->level[0].offset); // inputPictureChromaAddressLo + RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM, + enc->luma->level[0].offset); // inputPictureLumaAddressHi/Lo + RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM, + enc->chroma->level[0].offset); // inputPictureChromaAddressHi/Lo RVCE_CS(align(enc->luma->npix_y, 16)); // encInputFrameYPitch RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch - RVCE_CS(0x00010000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading) + if (enc->dual_pipe) + RVCE_CS(0x00000000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading) + else + RVCE_CS(0x00010000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading) RVCE_CS(0x00000000); // encInputPicTileConfig RVCE_CS(enc->pic.picture_type); // encPicType RVCE_CS(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR); // encIdrFlag diff --git a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c index 826e0763c08..3a1834b948f 100644 --- a/src/gallium/drivers/radeon/radeon_video.c +++ b/src/gallium/drivers/radeon/radeon_video.c @@ -214,9 +214,9 @@ int rvid_get_video_param(struct pipe_screen *screen, case PIPE_VIDEO_CAP_NPOT_TEXTURES: return 1; case PIPE_VIDEO_CAP_MAX_WIDTH: - return 2048; + return (rscreen->family < CHIP_TONGA) ? 2048 : 4096; case PIPE_VIDEO_CAP_MAX_HEIGHT: - return 1152; + return (rscreen->family < CHIP_TONGA) ? 1152 : 2304; case PIPE_VIDEO_CAP_PREFERED_FORMAT: return PIPE_FORMAT_NV12; case PIPE_VIDEO_CAP_PREFERS_INTERLACED: @@ -225,6 +225,8 @@ int rvid_get_video_param(struct pipe_screen *screen, return false; case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: return true; + case PIPE_VIDEO_CAP_STACKED_FRAMES: + return (rscreen->family < CHIP_TONGA) ? 
1 : 2; default: return 0; } @@ -262,20 +264,28 @@ int rvid_get_video_param(struct pipe_screen *screen, /* FIXME: VC-1 simple/main profile is broken */ return profile == PIPE_VIDEO_PROFILE_VC1_ADVANCED && entrypoint != PIPE_VIDEO_ENTRYPOINT_ENCODE; + case PIPE_VIDEO_FORMAT_HEVC: + /* Carrizo only supports HEVC Main */ + return rscreen->family >= CHIP_CARRIZO && + profile == PIPE_VIDEO_PROFILE_HEVC_MAIN; default: return false; } case PIPE_VIDEO_CAP_NPOT_TEXTURES: return 1; case PIPE_VIDEO_CAP_MAX_WIDTH: - return 2048; + return (rscreen->family < CHIP_TONGA) ? 2048 : 4096; case PIPE_VIDEO_CAP_MAX_HEIGHT: - return 1152; + return (rscreen->family < CHIP_TONGA) ? 1152 : 2304; case PIPE_VIDEO_CAP_PREFERED_FORMAT: return PIPE_FORMAT_NV12; case PIPE_VIDEO_CAP_PREFERS_INTERLACED: + if (u_reduce_video_profile(profile) == PIPE_VIDEO_FORMAT_HEVC) + return false; //The hardware doesn't support interlaced HEVC. return true; case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: + if (u_reduce_video_profile(profile) == PIPE_VIDEO_FORMAT_HEVC) + return false; //The hardware doesn't support interlaced HEVC. return true; case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: return true; @@ -300,6 +310,8 @@ int rvid_get_video_param(struct pipe_screen *screen, case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN: case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH: return 41; + case PIPE_VIDEO_PROFILE_HEVC_MAIN: + return 186; default: return 0; } diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h index 3bfbb6d75b7..7ab6e56e099 100644 --- a/src/gallium/drivers/radeon/radeon_winsys.h +++ b/src/gallium/drivers/radeon/radeon_winsys.h @@ -42,12 +42,9 @@ #include "pipebuffer/pb_buffer.h" -#define RADEON_MAX_CMDBUF_DWORDS (16 * 1024) - #define RADEON_FLUSH_ASYNC (1 << 0) #define RADEON_FLUSH_KEEP_TILING_FLAGS (1 << 1) /* needs DRM 2.12.0 */ -#define RADEON_FLUSH_COMPUTE (1 << 2) -#define RADEON_FLUSH_END_OF_FRAME (1 << 3) +#define RADEON_FLUSH_END_OF_FRAME (1 << 2) /* Tiling flags. */ enum radeon_bo_layout { @@ -136,6 +133,10 @@ enum radeon_family { CHIP_KABINI, CHIP_HAWAII, CHIP_MULLINS, + CHIP_TONGA, + CHIP_ICELAND, + CHIP_CARRIZO, + CHIP_FIJI, CHIP_LAST, }; @@ -150,10 +151,12 @@ enum chip_class { CAYMAN, SI, CIK, + VI, }; enum ring_type { RING_GFX = 0, + RING_COMPUTE, RING_DMA, RING_UVD, RING_VCE, @@ -169,9 +172,10 @@ enum radeon_value_id { RADEON_NUM_BYTES_MOVED, RADEON_VRAM_USAGE, RADEON_GTT_USAGE, - RADEON_GPU_TEMPERATURE, + RADEON_GPU_TEMPERATURE, /* DRM 2.42.0 */ RADEON_CURRENT_SCLK, - RADEON_CURRENT_MCLK + RADEON_CURRENT_MCLK, + RADEON_GPU_RESET_COUNTER, /* DRM 2.43.0 */ }; enum radeon_bo_priority { @@ -192,9 +196,11 @@ enum radeon_bo_priority { struct winsys_handle; struct radeon_winsys_cs_handle; +struct radeon_winsys_ctx; struct radeon_winsys_cs { unsigned cdw; /* Number of used dwords. */ + unsigned max_dw; /* Maximum number of dwords. */ uint32_t *buf; /* The command buffer. 
*/ enum ring_type ring_type; }; @@ -238,6 +244,7 @@ struct radeon_info { boolean cik_macrotile_mode_array_valid; uint32_t cik_macrotile_mode_array[16]; + uint32_t vce_harvest_config; }; enum radeon_feature_id { @@ -317,6 +324,8 @@ struct radeon_surf { struct radeon_surf_level stencil_level[RADEON_SURF_MAX_LEVEL]; uint32_t tiling_index[RADEON_SURF_MAX_LEVEL]; uint32_t stencil_tiling_index[RADEON_SURF_MAX_LEVEL]; + uint32_t pipe_config; + uint32_t num_banks; }; struct radeon_winsys { @@ -398,24 +407,15 @@ struct radeon_winsys { void (*buffer_unmap)(struct radeon_winsys_cs_handle *buf); /** - * Return TRUE if a buffer object is being used by the GPU. - * - * \param buf A winsys buffer object. - * \param usage Only check whether the buffer is busy for the given usage. - */ - boolean (*buffer_is_busy)(struct pb_buffer *buf, - enum radeon_bo_usage usage); - - /** - * Wait for a buffer object until it is not used by a GPU. This is - * equivalent to a fence placed after the last command using the buffer, - * and synchronizing to the fence. + * Wait for the buffer and return true if the buffer is not used + * by the device. * - * \param buf A winsys buffer object to wait for. - * \param usage Only wait until the buffer is idle for the given usage, - * but may still be busy for some other usage. + * The timeout of 0 will only return the status. + * The timeout of PIPE_TIMEOUT_INFINITE will always wait until the buffer + * is idle. */ - void (*buffer_wait)(struct pb_buffer *buf, enum radeon_bo_usage usage); + bool (*buffer_wait)(struct pb_buffer *buf, uint64_t timeout, + enum radeon_bo_usage usage); /** * Return tiling flags describing a memory layout of a buffer object. @@ -450,10 +450,11 @@ struct radeon_winsys { struct radeon_winsys_cs *rcs, enum radeon_bo_layout microtile, enum radeon_bo_layout macrotile, + unsigned pipe_config, unsigned bankw, unsigned bankh, unsigned tile_split, unsigned stencil_tile_split, - unsigned mtilea, + unsigned mtilea, unsigned num_banks, unsigned stride, bool scanout); @@ -515,15 +516,31 @@ struct radeon_winsys { *************************************************************************/ /** + * Create a command submission context. + * Various command streams can be submitted to the same context. + */ + struct radeon_winsys_ctx *(*ctx_create)(struct radeon_winsys *ws); + + /** + * Destroy a context. + */ + void (*ctx_destroy)(struct radeon_winsys_ctx *ctx); + + /** + * Query a GPU reset status. + */ + enum pipe_reset_status (*ctx_query_reset_status)(struct radeon_winsys_ctx *ctx); + + /** * Create a command stream. * - * \param ws The winsys this function is called from. + * \param ctx The submission context * \param ring_type The ring type (GFX, DMA, UVD) * \param flush Flush callback function associated with the command stream. * \param user User pointer that will be passed to the flush callback. 
* \param trace_buf Trace buffer when tracing is enabled */ - struct radeon_winsys_cs *(*cs_create)(struct radeon_winsys *ws, + struct radeon_winsys_cs *(*cs_create)(struct radeon_winsys_ctx *ctx, enum ring_type ring_type, void (*flush)(void *ctx, unsigned flags, struct pipe_fence_handle **fence), @@ -668,12 +685,12 @@ struct radeon_winsys { }; -static INLINE void radeon_emit(struct radeon_winsys_cs *cs, uint32_t value) +static inline void radeon_emit(struct radeon_winsys_cs *cs, uint32_t value) { cs->buf[cs->cdw++] = value; } -static INLINE void radeon_emit_array(struct radeon_winsys_cs *cs, +static inline void radeon_emit_array(struct radeon_winsys_cs *cs, const uint32_t *values, unsigned count) { memcpy(cs->buf+cs->cdw, values, count * 4); diff --git a/src/gallium/drivers/radeonsi/Automake.inc b/src/gallium/drivers/radeonsi/Automake.inc index 8686fffd71c..5a9dcfd9fd6 100644 --- a/src/gallium/drivers/radeonsi/Automake.inc +++ b/src/gallium/drivers/radeonsi/Automake.inc @@ -5,10 +5,12 @@ TARGET_CPPFLAGS += -DGALLIUM_RADEONSI TARGET_LIB_DEPS += \ $(top_builddir)/src/gallium/drivers/radeonsi/libradeonsi.la \ $(RADEON_LIBS) \ - $(LIBDRM_LIBS) + $(LIBDRM_LIBS) \ + $(AMDGPU_LIBS) TARGET_RADEON_WINSYS = \ - $(top_builddir)/src/gallium/winsys/radeon/drm/libradeonwinsys.la + $(top_builddir)/src/gallium/winsys/radeon/drm/libradeonwinsys.la \ + $(top_builddir)/src/gallium/winsys/amdgpu/drm/libamdgpuwinsys.la TARGET_RADEON_COMMON = \ $(top_builddir)/src/gallium/drivers/radeon/libradeon.la diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources index 2876c0ae735..a0b1414f4bb 100644 --- a/src/gallium/drivers/radeonsi/Makefile.sources +++ b/src/gallium/drivers/radeonsi/Makefile.sources @@ -3,6 +3,7 @@ C_SOURCES := \ si_blit.c \ si_commands.c \ si_compute.c \ + si_cp_dma.c \ si_descriptors.c \ sid.h \ si_dma.c \ diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c index 86111cb86e8..47b586f171e 100644 --- a/src/gallium/drivers/radeonsi/cik_sdma.c +++ b/src/gallium/drivers/radeonsi/cik_sdma.c @@ -27,7 +27,7 @@ #include "sid.h" #include "si_pipe.h" -#include "../radeon/r600_cs.h" +#include "radeon/r600_cs.h" #include "util/u_format.h" diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 1f2c4082dbc..48972bd170c 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -57,17 +57,19 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op) util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer); util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader); util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader); + util_blitter_save_tessctrl_shader(sctx->blitter, sctx->tcs_shader); + util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader); util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader); util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements); if (sctx->queued.named.sample_mask) { util_blitter_save_sample_mask(sctx->blitter, sctx->queued.named.sample_mask->sample_mask); } - if (sctx->queued.named.viewport) { - util_blitter_save_viewport(sctx->blitter, &sctx->queued.named.viewport->viewport); + if (sctx->queued.named.viewport[0]) { + util_blitter_save_viewport(sctx->blitter, &sctx->queued.named.viewport[0]->viewport); } - if (sctx->queued.named.scissor) { - util_blitter_save_scissor(sctx->blitter, 
&sctx->queued.named.scissor->scissor); + if (sctx->queued.named.scissor[0]) { + util_blitter_save_scissor(sctx->blitter, &sctx->queued.named.scissor[0]->scissor); } util_blitter_save_vertex_buffer_slot(sctx->blitter, sctx->vertex_buffer); util_blitter_save_so_targets(sctx->blitter, sctx->b.streamout.num_targets, @@ -146,7 +148,7 @@ static void si_blit_decompress_depth(struct pipe_context *ctx, struct pipe_surface *zsurf, *cbsurf, surf_tmpl; sctx->dbcb_copy_sample = sample; - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); surf_tmpl.format = texture->resource.b.b.format; surf_tmpl.u.tex.level = level; @@ -180,7 +182,7 @@ static void si_blit_decompress_depth(struct pipe_context *ctx, sctx->dbcb_depth_copy_enabled = false; sctx->dbcb_stencil_copy_enabled = false; - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); } static void si_blit_decompress_depth_in_place(struct si_context *sctx, @@ -192,7 +194,7 @@ static void si_blit_decompress_depth_in_place(struct si_context *sctx, unsigned layer, max_layer, checked_last_layer, level; sctx->db_inplace_flush_enabled = true; - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); surf_tmpl.format = texture->resource.b.b.format; @@ -230,7 +232,7 @@ static void si_blit_decompress_depth_in_place(struct si_context *sctx, } sctx->db_inplace_flush_enabled = false; - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); } void si_flush_depth_textures(struct si_context *sctx, @@ -340,6 +342,8 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers, if (buffers & PIPE_CLEAR_COLOR) { evergreen_do_fast_color_clear(&sctx->b, fb, &sctx->framebuffer.atom, &buffers, color); + if (!buffers) + return; /* all buffers have been fast cleared */ } if (buffers & PIPE_CLEAR_COLOR) { @@ -374,9 +378,9 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers, } zstex->depth_clear_value = depth; - sctx->framebuffer.atom.dirty = true; /* updates DB_DEPTH_CLEAR */ + si_mark_atom_dirty(sctx, &sctx->framebuffer.atom); /* updates DB_DEPTH_CLEAR */ sctx->db_depth_clear = true; - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); } si_blitter_begin(ctx, SI_CLEAR); @@ -389,7 +393,7 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers, sctx->db_depth_clear = false; sctx->db_depth_disable_expclear = false; zstex->depth_cleared = true; - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); } } @@ -455,89 +459,6 @@ struct texture_orig_info { unsigned npix0_y; }; -static void si_compressed_to_blittable(struct pipe_resource *tex, - unsigned level, - struct texture_orig_info *orig) -{ - struct r600_texture *rtex = (struct r600_texture*)tex; - unsigned pixsize = util_format_get_blocksize(rtex->resource.b.b.format); - int new_format; - int new_height, new_width; - - orig->format = tex->format; - orig->width0 = tex->width0; - orig->height0 = tex->height0; - orig->npix0_x = rtex->surface.level[0].npix_x; - orig->npix0_y = rtex->surface.level[0].npix_y; - orig->npix_x = rtex->surface.level[level].npix_x; - orig->npix_y = rtex->surface.level[level].npix_y; - - if (pixsize == 8) - new_format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */ - else - new_format = PIPE_FORMAT_R32G32B32A32_UINT; /* 128-bit block */ - - new_width = util_format_get_nblocksx(tex->format, orig->width0); - new_height = util_format_get_nblocksy(tex->format, orig->height0); - 
- tex->width0 = new_width; - tex->height0 = new_height; - tex->format = new_format; - rtex->surface.level[0].npix_x = util_format_get_nblocksx(orig->format, orig->npix0_x); - rtex->surface.level[0].npix_y = util_format_get_nblocksy(orig->format, orig->npix0_y); - rtex->surface.level[level].npix_x = util_format_get_nblocksx(orig->format, orig->npix_x); - rtex->surface.level[level].npix_y = util_format_get_nblocksy(orig->format, orig->npix_y); - - /* By dividing the dimensions by 4, we effectively decrement - * last_level by 2, therefore the last 2 mipmap levels disappear and - * aren't blittable. Note that the last 3 mipmap levels (4x4, 2x2, - * 1x1) have equal slice sizes, which is an important assumption - * for this to work. - * - * In order to make the last 2 mipmap levels blittable, we have to - * add the slice size of the last mipmap level to the texture - * address, so that even though the hw thinks it reads last_level-2, - * it will actually read last_level-1, and if we add the slice size*2, - * it will read last_level. That's how this workaround works. - */ - if (level > rtex->resource.b.b.last_level-2) - rtex->mipmap_shift = level - (rtex->resource.b.b.last_level-2); -} - -static void si_change_format(struct pipe_resource *tex, - unsigned level, - struct texture_orig_info *orig, - enum pipe_format format) -{ - struct r600_texture *rtex = (struct r600_texture*)tex; - - orig->format = tex->format; - orig->width0 = tex->width0; - orig->height0 = tex->height0; - orig->npix0_x = rtex->surface.level[0].npix_x; - orig->npix0_y = rtex->surface.level[0].npix_y; - orig->npix_x = rtex->surface.level[level].npix_x; - orig->npix_y = rtex->surface.level[level].npix_y; - - tex->format = format; -} - -static void si_reset_blittable_to_orig(struct pipe_resource *tex, - unsigned level, - struct texture_orig_info *orig) -{ - struct r600_texture *rtex = (struct r600_texture*)tex; - - tex->format = orig->format; - tex->width0 = orig->width0; - tex->height0 = orig->height0; - rtex->surface.level[0].npix_x = orig->npix0_x; - rtex->surface.level[0].npix_y = orig->npix0_y; - rtex->surface.level[level].npix_x = orig->npix_x; - rtex->surface.level[level].npix_y = orig->npix_y; - rtex->mipmap_shift = 0; -} - void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level, @@ -547,114 +468,116 @@ void si_resource_copy_region(struct pipe_context *ctx, const struct pipe_box *src_box) { struct si_context *sctx = (struct si_context *)ctx; - struct r600_texture *rdst = (struct r600_texture*)dst; struct pipe_surface *dst_view, dst_templ; struct pipe_sampler_view src_templ, *src_view; - struct texture_orig_info orig_info[2]; + unsigned dst_width, dst_height, src_width0, src_height0; + unsigned src_force_level = 0; struct pipe_box sbox, dstbox; - boolean restore_orig[2]; - /* Fallback for buffers. */ + /* Handle buffers first. */ if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) { si_copy_buffer(sctx, dst, src, dstx, src_box->x, src_box->width, false); return; } - memset(orig_info, 0, sizeof(orig_info)); + assert(u_max_sample(dst) == u_max_sample(src)); /* The driver doesn't decompress resources automatically while * u_blitter is rendering. 
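 *
 * For the compressed path below, a worked example of the reinterpretation:
 * a 255x255 DXT5 level (4x4 blocks, 128 bits each) is copied as a 64x64
 * R32G32B32A32_UINT image, because util_format_get_nblocksx/y round the
 * pixel dimensions up to whole blocks:
 *
 *   nblocksx = (255 + 3) / 4 = 64
 *
 * The same block scaling is applied to the source box and to dstx/dsty,
 * so the source and destination views stay consistent.
 *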
*/ si_decompress_subresource(ctx, src, src_level, src_box->z, src_box->z + src_box->depth - 1); - restore_orig[0] = restore_orig[1] = FALSE; + dst_width = u_minify(dst->width0, dst_level); + dst_height = u_minify(dst->height0, dst_level); + src_width0 = src->width0; + src_height0 = src->height0; + + util_blitter_default_dst_texture(&dst_templ, dst, dst_level, dstz); + util_blitter_default_src_texture(&src_templ, src, src_level); if (util_format_is_compressed(src->format) && util_format_is_compressed(dst->format)) { - si_compressed_to_blittable(src, src_level, &orig_info[0]); - restore_orig[0] = TRUE; - sbox.x = util_format_get_nblocksx(orig_info[0].format, src_box->x); - sbox.y = util_format_get_nblocksy(orig_info[0].format, src_box->y); + unsigned blocksize = util_format_get_blocksize(src->format); + + if (blocksize == 8) + src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */ + else + src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; /* 128-bit block */ + dst_templ.format = src_templ.format; + + dst_width = util_format_get_nblocksx(dst->format, dst_width); + dst_height = util_format_get_nblocksy(dst->format, dst_height); + src_width0 = util_format_get_nblocksx(src->format, src_width0); + src_height0 = util_format_get_nblocksy(src->format, src_height0); + + dstx = util_format_get_nblocksx(dst->format, dstx); + dsty = util_format_get_nblocksy(dst->format, dsty); + + sbox.x = util_format_get_nblocksx(src->format, src_box->x); + sbox.y = util_format_get_nblocksy(src->format, src_box->y); sbox.z = src_box->z; - sbox.width = util_format_get_nblocksx(orig_info[0].format, src_box->width); - sbox.height = util_format_get_nblocksy(orig_info[0].format, src_box->height); + sbox.width = util_format_get_nblocksx(src->format, src_box->width); + sbox.height = util_format_get_nblocksy(src->format, src_box->height); sbox.depth = src_box->depth; src_box = &sbox; - si_compressed_to_blittable(dst, dst_level, &orig_info[1]); - restore_orig[1] = TRUE; - /* translate the dst box as well */ - dstx = util_format_get_nblocksx(orig_info[1].format, dstx); - dsty = util_format_get_nblocksy(orig_info[1].format, dsty); - } else if (!util_blitter_is_copy_supported(sctx->blitter, dst, src)) { + src_force_level = src_level; + } else if (!util_blitter_is_copy_supported(sctx->blitter, dst, src) || + /* also *8_SNORM has precision issues, use UNORM instead */ + util_format_is_snorm(src->format)) { if (util_format_is_subsampled_422(src->format)) { - /* XXX untested */ - si_change_format(src, src_level, &orig_info[0], - PIPE_FORMAT_R8G8B8A8_UINT); - si_change_format(dst, dst_level, &orig_info[1], - PIPE_FORMAT_R8G8B8A8_UINT); + src_templ.format = PIPE_FORMAT_R8G8B8A8_UINT; + dst_templ.format = PIPE_FORMAT_R8G8B8A8_UINT; + + dst_width = util_format_get_nblocksx(dst->format, dst_width); + src_width0 = util_format_get_nblocksx(src->format, src_width0); + + dstx = util_format_get_nblocksx(dst->format, dstx); sbox = *src_box; - sbox.x = util_format_get_nblocksx(orig_info[0].format, src_box->x); - sbox.width = util_format_get_nblocksx(orig_info[0].format, src_box->width); + sbox.x = util_format_get_nblocksx(src->format, src_box->x); + sbox.width = util_format_get_nblocksx(src->format, src_box->width); src_box = &sbox; - dstx = util_format_get_nblocksx(orig_info[1].format, dstx); - - restore_orig[0] = TRUE; - restore_orig[1] = TRUE; } else { unsigned blocksize = util_format_get_blocksize(src->format); switch (blocksize) { case 1: - si_change_format(src, src_level, &orig_info[0], - PIPE_FORMAT_R8_UNORM); - 
si_change_format(dst, dst_level, &orig_info[1], - PIPE_FORMAT_R8_UNORM); + dst_templ.format = PIPE_FORMAT_R8_UNORM; + src_templ.format = PIPE_FORMAT_R8_UNORM; break; case 2: - si_change_format(src, src_level, &orig_info[0], - PIPE_FORMAT_R8G8_UNORM); - si_change_format(dst, dst_level, &orig_info[1], - PIPE_FORMAT_R8G8_UNORM); + dst_templ.format = PIPE_FORMAT_R8G8_UNORM; + src_templ.format = PIPE_FORMAT_R8G8_UNORM; break; case 4: - si_change_format(src, src_level, &orig_info[0], - PIPE_FORMAT_R8G8B8A8_UNORM); - si_change_format(dst, dst_level, &orig_info[1], - PIPE_FORMAT_R8G8B8A8_UNORM); + dst_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM; + src_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM; break; case 8: - si_change_format(src, src_level, &orig_info[0], - PIPE_FORMAT_R16G16B16A16_UINT); - si_change_format(dst, dst_level, &orig_info[1], - PIPE_FORMAT_R16G16B16A16_UINT); + dst_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; + src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; break; case 16: - si_change_format(src, src_level, &orig_info[0], - PIPE_FORMAT_R32G32B32A32_UINT); - si_change_format(dst, dst_level, &orig_info[1], - PIPE_FORMAT_R32G32B32A32_UINT); + dst_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; + src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; break; default: fprintf(stderr, "Unhandled format %s with blocksize %u\n", util_format_short_name(src->format), blocksize); assert(0); } - restore_orig[0] = TRUE; - restore_orig[1] = TRUE; } } /* Initialize the surface. */ - util_blitter_default_dst_texture(&dst_templ, dst, dst_level, dstz); dst_view = r600_create_surface_custom(ctx, dst, &dst_templ, - rdst->surface.level[dst_level].npix_x, - rdst->surface.level[dst_level].npix_y); + dst_width, dst_height); /* Initialize the sampler view. */ - util_blitter_default_src_texture(&src_templ, src, src_level); - src_view = ctx->create_sampler_view(ctx, src, &src_templ); + src_view = si_create_sampler_view_custom(ctx, src, &src_templ, + src_width0, src_height0, + src_force_level); u_box_3d(dstx, dsty, dstz, abs(src_box->width), abs(src_box->height), abs(src_box->depth), &dstbox); @@ -662,18 +585,12 @@ void si_resource_copy_region(struct pipe_context *ctx, /* Copy. */ si_blitter_begin(ctx, SI_COPY); util_blitter_blit_generic(sctx->blitter, dst_view, &dstbox, - src_view, src_box, src->width0, src->height0, + src_view, src_box, src_width0, src_height0, PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL); si_blitter_end(ctx); pipe_surface_reference(&dst_view, NULL); pipe_sampler_view_reference(&src_view, NULL); - - if (restore_orig[0]) - si_reset_blittable_to_orig(src, src_level, &orig_info[0]); - - if (restore_orig[1]) - si_reset_blittable_to_orig(dst, dst_level, &orig_info[1]); } /* For MSAA integer resolving to work, we change the format to NORM using this function. */ diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 89bef2e7afd..d4fe5653687 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -137,14 +137,14 @@ static void *si_create_compute_state( } #else - radeon_elf_read(code, header->num_bytes, &program->shader.binary, true); + radeon_elf_read(code, header->num_bytes, &program->shader.binary); /* init_scratch_buffer patches the shader code with the scratch address, * so we need to call it before si_shader_binary_read() which uploads * the shader code to the GPU. 
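 *
 * I.e. the required order, restating the calls used here, is:
 *
 *   radeon_elf_read(code, header->num_bytes, &program->shader.binary);
 *   init_scratch_buffer(sctx, program);      (patch the scratch address)
 *   si_shader_binary_read(sctx->screen, &program->shader);      (upload)
 *
 * Swapping the last two steps would upload unpatched code.
 *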
*/ init_scratch_buffer(sctx, program); - si_shader_binary_read(sctx->screen, &program->shader, &program->shader.binary); + si_shader_binary_read(sctx->screen, &program->shader); #endif program->input_buffer = si_resource_create_custom(sctx->b.b.screen, @@ -309,8 +309,6 @@ static void si_launch_grid( kernel_args[i]); } - sctx->b.ws->buffer_unmap(input_buffer->cs_buf); - kernel_args_va = input_buffer->gpu_address; kernel_args_va += kernel_args_offset; diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c new file mode 100644 index 00000000000..f8a9da45a10 --- /dev/null +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -0,0 +1,265 @@ +/* + * Copyright 2013 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Marek Olšák <[email protected]> + */ + +#include "si_pipe.h" +#include "sid.h" +#include "radeon/r600_cs.h" + + +/* Set this if you want the 3D engine to wait until CP DMA is done. + * It should be set on the last CP DMA packet. */ +#define R600_CP_DMA_SYNC (1 << 0) /* R600+ */ + +/* Set this if the source data was used as a destination in a previous CP DMA + * packet. It's for preventing a read-after-write (RAW) hazard between two + * CP DMA packets. */ +#define SI_CP_DMA_RAW_WAIT (1 << 1) /* SI+ */ +#define CIK_CP_DMA_USE_L2 (1 << 2) + +/* Emit a CP DMA packet to do a copy from one buffer to another. + * The size must fit in bits [20:0]. + */ +static void si_emit_cp_dma_copy_buffer(struct si_context *sctx, + uint64_t dst_va, uint64_t src_va, + unsigned size, unsigned flags) +{ + struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0; + uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0; + uint32_t sel = flags & CIK_CP_DMA_USE_L2 ? 
+ PKT3_CP_DMA_SRC_SEL(3) | PKT3_CP_DMA_DST_SEL(3) : 0; + + assert(size); + assert((size & ((1<<21)-1)) == size); + + if (sctx->b.chip_class >= CIK) { + radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(cs, sync_flag | sel); /* CP_SYNC [31] */ + radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ + radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ + radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ + radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + } else { + radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); + radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ + radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */ + radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ + radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + } +} + +/* Emit a CP DMA packet to clear a buffer. The size must fit in bits [20:0]. */ +static void si_emit_cp_dma_clear_buffer(struct si_context *sctx, + uint64_t dst_va, unsigned size, + uint32_t clear_value, unsigned flags) +{ + struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0; + uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0; + uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? PKT3_CP_DMA_DST_SEL(3) : 0; + + assert(size); + assert((size & ((1<<21)-1)) == size); + + if (sctx->b.chip_class >= CIK) { + radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(cs, sync_flag | dst_sel | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */ + radeon_emit(cs, clear_value); /* DATA [31:0] */ + radeon_emit(cs, 0); + radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [15:0] */ + radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + } else { + radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); + radeon_emit(cs, clear_value); /* DATA [31:0] */ + radeon_emit(cs, sync_flag | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */ + radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ + radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + } +} + +/* The max number of bytes to copy per packet. */ +#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8) + +static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, + unsigned offset, unsigned size, unsigned value, + bool is_framebuffer) +{ + struct si_context *sctx = (struct si_context*)ctx; + unsigned flush_flags, tc_l2_flag; + + if (!size) + return; + + /* Mark the buffer range of destination as valid (initialized), + * so that transfer_map knows it should wait for the GPU when mapping + * that range. */ + util_range_add(&r600_resource(dst)->valid_buffer_range, offset, + offset + size); + + /* Fallback for unaligned clears. */ + if (offset % 4 != 0 || size % 4 != 0) { + uint32_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf, + sctx->b.rings.gfx.cs, + PIPE_TRANSFER_WRITE); + size /= 4; + for (unsigned i = 0; i < size; i++) + *map++ = value; + return; + } + + uint64_t va = r600_resource(dst)->gpu_address + offset; + + /* Flush the caches where the resource is bound. 
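+ *
+ * Only the framebuffer caches are flushed when the destination is a
+ * framebuffer; everything else invalidates TC L1 and KCACHE, plus TC L2
+ * on SI only, since SI's CP DMA is uncached while CIK+ can write through
+ * L2 instead:
+ *
+ *   tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+ *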
*/ + if (is_framebuffer) { + flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; + tc_l2_flag = 0; + } else { + flush_flags = SI_CONTEXT_INV_TC_L1 | + (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) | + SI_CONTEXT_INV_KCACHE; + tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2; + } + + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + flush_flags; + + while (size) { + unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); + unsigned dma_flags = tc_l2_flag; + + si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), + FALSE); + + /* This must be done after need_cs_space. */ + r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, + (struct r600_resource*)dst, RADEON_USAGE_WRITE, + RADEON_PRIO_MIN); + + /* Flush the caches for the first copy only. + * Also wait for the previous CP DMA operations. */ + if (sctx->b.flags) { + si_emit_cache_flush(&sctx->b, NULL); + dma_flags |= SI_CP_DMA_RAW_WAIT; /* same as WAIT_UNTIL=CP_DMA_IDLE */ + } + + /* Do the synchronization after the last copy, so that all data is written to memory. */ + if (size == byte_count) + dma_flags |= R600_CP_DMA_SYNC; + + /* Emit the clear packet. */ + si_emit_cp_dma_clear_buffer(sctx, va, byte_count, value, dma_flags); + + size -= byte_count; + va += byte_count; + } + + /* Flush the caches again in case the 3D engine has been prefetching + * the resource. */ + sctx->b.flags |= flush_flags; + + if (tc_l2_flag) + r600_resource(dst)->TC_L2_dirty = true; +} + +void si_copy_buffer(struct si_context *sctx, + struct pipe_resource *dst, struct pipe_resource *src, + uint64_t dst_offset, uint64_t src_offset, unsigned size, + bool is_framebuffer) +{ + unsigned flush_flags, tc_l2_flag; + + if (!size) + return; + + /* Mark the buffer range of destination as valid (initialized), + * so that transfer_map knows it should wait for the GPU when mapping + * that range. */ + util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset, + dst_offset + size); + + dst_offset += r600_resource(dst)->gpu_address; + src_offset += r600_resource(src)->gpu_address; + + /* Flush the caches where the resource is bound. */ + if (is_framebuffer) { + flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; + tc_l2_flag = 0; + } else { + flush_flags = SI_CONTEXT_INV_TC_L1 | + (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) | + SI_CONTEXT_INV_KCACHE; + tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2; + } + + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + flush_flags; + + while (size) { + unsigned sync_flags = tc_l2_flag; + unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); + + si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), FALSE); + + /* Flush the caches for the first copy only. Also wait for old CP DMA packets to complete. */ + if (sctx->b.flags) { + si_emit_cache_flush(&sctx->b, NULL); + sync_flags |= SI_CP_DMA_RAW_WAIT; + } + + /* Do the synchronization after the last copy, so that all data is written to memory. */ + if (size == byte_count) { + sync_flags |= R600_CP_DMA_SYNC; + } + + /* This must be done after r600_need_cs_space. 
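+ *
+ * (If si_need_cs_space() had to flush the IB to make room, a relocation
+ * added earlier would have gone out with the previous IB, and the new IB
+ * would miss it.) For the surrounding loop, the chunking in numbers:
+ * CP_DMA_MAX_BYTE_COUNT is (1 << 21) - 8 = 2097144, so a 5 MiB (5242880
+ * byte) copy is emitted as
+ *
+ *   2097144 + 2097144 + 1048592 bytes,
+ *
+ * with SI_CP_DMA_RAW_WAIT on the first packet (right after the cache
+ * flush) and R600_CP_DMA_SYNC only on the last one.
+ *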
*/ + r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)src, + RADEON_USAGE_READ, RADEON_PRIO_MIN); + r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)dst, + RADEON_USAGE_WRITE, RADEON_PRIO_MIN); + + si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, sync_flags); + + size -= byte_count; + src_offset += byte_count; + dst_offset += byte_count; + } + + /* Flush the caches again in case the 3D engine has been prefetching + * the resource. */ + sctx->b.flags |= flush_flags; + + if (tc_l2_flag) + r600_resource(dst)->TC_L2_dirty = true; +} + +void si_init_cp_dma_functions(struct si_context *sctx) +{ + sctx->b.clear_buffer = si_clear_buffer; +} diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index bbfd36dcbeb..890be071596 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -24,14 +24,23 @@ * Marek Olšák <[email protected]> */ -/* Resource binding slots and sampler states (each described with 8 or 4 dwords) - * live in memory on SI. +/* Resource binding slots and sampler states (each described with 8 or + * 4 dwords) are stored in lists in memory which is accessed by shaders + * using scalar load instructions. * - * This file is responsible for managing lists of resources and sampler states - * in memory and binding them, which means updating those structures in memory. + * This file is responsible for managing such lists. It keeps a copy of all + * descriptors in CPU memory and re-uploads a whole list if some slots have + * been changed. * - * There is also code for updating shader pointers to resources and sampler - * states. CP DMA functions are here too. + * This code is also reponsible for updating shader pointers to those lists. + * + * Note that CP DMA can't be used for updating the lists, because a GPU hang + * could leave the list in a mid-IB state and the next IB would get wrong + * descriptors and the whole context would be unusable at that point. + * (Note: The register shadowing can't be used due to the same reason) + * + * Also, uploading descriptors to newly allocated memory doesn't require + * a KCACHE flush. */ #include "radeon/r600_cs.h" @@ -42,7 +51,6 @@ #include "util/u_memory.h" #include "util/u_upload_mgr.h" -#define SI_NUM_CONTEXTS 16 /* NULL image and buffer descriptor. * @@ -64,284 +72,62 @@ static uint32_t null_descriptor[8] = { * descriptor */ }; -/* Set this if you want the 3D engine to wait until CP DMA is done. - * It should be set on the last CP DMA packet. */ -#define R600_CP_DMA_SYNC (1 << 0) /* R600+ */ - -/* Set this if the source data was used as a destination in a previous CP DMA - * packet. It's for preventing a read-after-write (RAW) hazard between two - * CP DMA packets. */ -#define SI_CP_DMA_RAW_WAIT (1 << 1) /* SI+ */ -#define CIK_CP_DMA_USE_L2 (1 << 2) - -/* Emit a CP DMA packet to do a copy from one buffer to another. - * The size must fit in bits [20:0]. - */ -static void si_emit_cp_dma_copy_buffer(struct si_context *sctx, - uint64_t dst_va, uint64_t src_va, - unsigned size, unsigned flags) -{ - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; - uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0; - uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0; - uint32_t sel = flags & CIK_CP_DMA_USE_L2 ? 
- PKT3_CP_DMA_SRC_SEL(3) | PKT3_CP_DMA_DST_SEL(3) : 0; - - assert(size); - assert((size & ((1<<21)-1)) == size); - - if (sctx->b.chip_class >= CIK) { - radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); - radeon_emit(cs, sync_flag | sel); /* CP_SYNC [31] */ - radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ - radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ - radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ - radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ - radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ - } else { - radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); - radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ - radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */ - radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ - radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ - radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ - } -} - -/* Emit a CP DMA packet to clear a buffer. The size must fit in bits [20:0]. */ -static void si_emit_cp_dma_clear_buffer(struct si_context *sctx, - uint64_t dst_va, unsigned size, - uint32_t clear_value, unsigned flags) -{ - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; - uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0; - uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0; - uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? PKT3_CP_DMA_DST_SEL(3) : 0; - - assert(size); - assert((size & ((1<<21)-1)) == size); - - if (sctx->b.chip_class >= CIK) { - radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); - radeon_emit(cs, sync_flag | dst_sel | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */ - radeon_emit(cs, clear_value); /* DATA [31:0] */ - radeon_emit(cs, 0); - radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ - radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [15:0] */ - radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ - } else { - radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); - radeon_emit(cs, clear_value); /* DATA [31:0] */ - radeon_emit(cs, sync_flag | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */ - radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ - radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ - radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ - } -} - -static void si_init_descriptors(struct si_context *sctx, - struct si_descriptors *desc, - unsigned shader_userdata_reg, +static void si_init_descriptors(struct si_descriptors *desc, + unsigned shader_userdata_index, unsigned element_dw_size, - unsigned num_elements, - void (*emit_func)(struct si_context *ctx, struct r600_atom *state)) + unsigned num_elements) { + int i; + assert(num_elements <= sizeof(desc->enabled_mask)*8); - assert(num_elements <= sizeof(desc->dirty_mask)*8); - desc->atom.emit = (void*)emit_func; - desc->shader_userdata_reg = shader_userdata_reg; + desc->list = CALLOC(num_elements, element_dw_size * 4); desc->element_dw_size = element_dw_size; desc->num_elements = num_elements; - desc->context_size = num_elements * element_dw_size * 4; - - desc->buffer = (struct r600_resource*) - pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM, - PIPE_USAGE_DEFAULT, - SI_NUM_CONTEXTS * desc->context_size); - - r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer, - RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA); - - /* We don't check for CS space here, because this should be called - * only once at context initialization. 
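 *
 * (In the replacement below there is no GPU-side arena to initialize at
 * all: si_init_descriptors() only CALLOCs a CPU copy of the list, and
 * si_upload_descriptors() re-uploads the whole list into freshly
 * allocated memory whenever list_dirty is set:
 *
 *   u_upload_alloc(sctx->b.uploader, 0, list_size, &desc->buffer_offset,
 *                  (struct pipe_resource**)&desc->buffer, &ptr);
 *   util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
 *
 * which is also why descriptor updates no longer need a KCACHE flush.)
 *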
*/ - si_emit_cp_dma_clear_buffer(sctx, desc->buffer->gpu_address, - desc->buffer->b.b.width0, 0, - R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2); + desc->list_dirty = true; /* upload the list before the next draw */ + desc->shader_userdata_offset = shader_userdata_index * 4; + + /* Initialize the array to NULL descriptors if the element size is 8. */ + if (element_dw_size == 8) + for (i = 0; i < num_elements; i++) + memcpy(desc->list + i*element_dw_size, null_descriptor, + sizeof(null_descriptor)); } static void si_release_descriptors(struct si_descriptors *desc) { pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL); + FREE(desc->list); } -static void si_update_descriptors(struct si_context *sctx, +static bool si_upload_descriptors(struct si_context *sctx, struct si_descriptors *desc) { - if (desc->dirty_mask) { - desc->atom.num_dw = - 7 + /* copy */ - (4 + desc->element_dw_size) * util_bitcount64(desc->dirty_mask) + /* update */ - 4; /* pointer update */ - - if (desc->shader_userdata_reg >= R_00B130_SPI_SHADER_USER_DATA_VS_0 && - desc->shader_userdata_reg < R_00B230_SPI_SHADER_USER_DATA_GS_0) - desc->atom.num_dw += 4; /* second pointer update */ - - desc->atom.dirty = true; - - /* TODO: Investigate if these flushes can be removed after - * adding CE support. */ - - /* The descriptors are read with the K cache. */ - sctx->b.flags |= SI_CONTEXT_INV_KCACHE; - - /* Since SI uses uncached CP DMA to update descriptors, - * we have to flush TC L2, which is used to fetch constants - * along with KCACHE. */ - if (sctx->b.chip_class == SI) - sctx->b.flags |= SI_CONTEXT_INV_TC_L2; - } else { - desc->atom.dirty = false; - } -} + unsigned list_size = desc->num_elements * desc->element_dw_size * 4; + void *ptr; -static void si_emit_shader_pointer(struct si_context *sctx, - struct r600_atom *atom) -{ - struct si_descriptors *desc = (struct si_descriptors*)atom; - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; - uint64_t va = desc->buffer->gpu_address + - desc->current_context_id * desc->context_size + - desc->buffer_offset; + if (!desc->list_dirty) + return true; - radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0)); - radeon_emit(cs, (desc->shader_userdata_reg - SI_SH_REG_OFFSET) >> 2); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + u_upload_alloc(sctx->b.uploader, 0, list_size, + &desc->buffer_offset, + (struct pipe_resource**)&desc->buffer, &ptr); + if (!desc->buffer) + return false; /* skip the draw call */ - if (desc->shader_userdata_reg >= R_00B130_SPI_SHADER_USER_DATA_VS_0 && - desc->shader_userdata_reg < R_00B230_SPI_SHADER_USER_DATA_GS_0) { - radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0)); - radeon_emit(cs, (desc->shader_userdata_reg + - (R_00B330_SPI_SHADER_USER_DATA_ES_0 - - R_00B130_SPI_SHADER_USER_DATA_VS_0) - - SI_SH_REG_OFFSET) >> 2); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - } -} + util_memcpy_cpu_to_le32(ptr, desc->list, list_size); -static void si_emit_descriptors(struct si_context *sctx, - struct si_descriptors *desc, - uint32_t **descriptors) -{ - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; - uint64_t va_base; - int packet_start = 0; - int packet_size = 0; - int last_index = desc->num_elements; /* point to a non-existing element */ - uint64_t dirty_mask = desc->dirty_mask; - unsigned new_context_id = (desc->current_context_id + 1) % SI_NUM_CONTEXTS; - - assert(dirty_mask); - - va_base = desc->buffer->gpu_address; - - /* Copy the descriptors to a new context slot. 
*/ - si_emit_cp_dma_copy_buffer(sctx, - va_base + new_context_id * desc->context_size, - va_base + desc->current_context_id * desc->context_size, - desc->context_size, R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2); - - va_base += new_context_id * desc->context_size; - - /* Update the descriptors. - * Updates of consecutive descriptors are merged to one WRITE_DATA packet. - * - * XXX When unbinding lots of resources, consider clearing the memory - * with CP DMA instead of emitting zeros. - */ - while (dirty_mask) { - int i = u_bit_scan64(&dirty_mask); - - assert(i < desc->num_elements); - - if (last_index+1 == i && packet_size) { - /* Append new data at the end of the last packet. */ - packet_size += desc->element_dw_size; - cs->buf[packet_start] = PKT3(PKT3_WRITE_DATA, packet_size, 0); - } else { - /* Start a new packet. */ - uint64_t va = va_base + i * desc->element_dw_size * 4; - - packet_start = cs->cdw; - packet_size = 2 + desc->element_dw_size; - - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, packet_size, 0)); - radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(sctx->b.chip_class == SI ? - PKT3_WRITE_DATA_DST_SEL_MEM_SYNC : - PKT3_WRITE_DATA_DST_SEL_TC_L2) | - PKT3_WRITE_DATA_WR_CONFIRM | - PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME)); - radeon_emit(cs, va & 0xFFFFFFFFUL); - radeon_emit(cs, (va >> 32UL) & 0xFFFFFFFFUL); - } - - radeon_emit_array(cs, descriptors[i], desc->element_dw_size); - - last_index = i; - } - - desc->dirty_mask = 0; - desc->current_context_id = new_context_id; - - /* Now update the shader userdata pointer. */ - si_emit_shader_pointer(sctx, &desc->atom); -} + r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer, + RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA); -static unsigned si_get_shader_user_data_base(unsigned shader) -{ - switch (shader) { - case PIPE_SHADER_VERTEX: - return R_00B130_SPI_SHADER_USER_DATA_VS_0; - case PIPE_SHADER_GEOMETRY: - return R_00B230_SPI_SHADER_USER_DATA_GS_0; - case PIPE_SHADER_FRAGMENT: - return R_00B030_SPI_SHADER_USER_DATA_PS_0; - default: - assert(0); - return 0; - } + desc->list_dirty = false; + desc->pointer_dirty = true; + si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom); + return true; } /* SAMPLER VIEWS */ -static void si_emit_sampler_views(struct si_context *sctx, struct r600_atom *atom) -{ - struct si_sampler_views *views = (struct si_sampler_views*)atom; - - si_emit_descriptors(sctx, &views->desc, views->desc_data); -} - -static void si_init_sampler_views(struct si_context *sctx, - struct si_sampler_views *views, - unsigned shader) -{ - int i; - - si_init_descriptors(sctx, &views->desc, - si_get_shader_user_data_base(shader) + - SI_SGPR_RESOURCE * 4, - 8, SI_NUM_SAMPLER_VIEWS, si_emit_sampler_views); - - for (i = 0; i < views->desc.num_elements; i++) { - views->desc_data[i] = null_descriptor; - views->desc.dirty_mask |= 1llu << i; - } - si_update_descriptors(sctx, &views->desc); -} - static void si_release_sampler_views(struct si_sampler_views *views) { int i; @@ -382,10 +168,10 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx, si_get_resource_ro_priority(rview->resource)); } + if (!views->desc.buffer) + return; r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA); - - si_emit_shader_pointer(sctx, &views->desc.atom); } static void si_set_sampler_view(struct si_context *sctx, unsigned shader, @@ -406,17 +192,16 @@ static void si_set_sampler_view(struct si_context *sctx, unsigned shader, rview->resource, RADEON_USAGE_READ, 
si_get_resource_ro_priority(rview->resource)); - pipe_sampler_view_reference(&views->views[slot], view); - views->desc_data[slot] = view_desc; + memcpy(views->desc.list + slot*8, view_desc, 8*4); views->desc.enabled_mask |= 1llu << slot; } else { pipe_sampler_view_reference(&views->views[slot], NULL); - views->desc_data[slot] = null_descriptor; + memcpy(views->desc.list + slot*8, null_descriptor, 8*4); views->desc.enabled_mask &= ~(1llu << slot); } - views->desc.dirty_mask |= 1llu << slot; + views->desc.list_dirty = true; } static void si_set_sampler_views(struct pipe_context *ctx, @@ -475,25 +260,17 @@ static void si_set_sampler_views(struct pipe_context *ctx, NULL, NULL); } } - - si_update_descriptors(sctx, &samplers->views.desc); } /* SAMPLER STATES */ -static void si_emit_sampler_states(struct si_context *sctx, struct r600_atom *atom) -{ - struct si_sampler_states *states = (struct si_sampler_states*)atom; - - si_emit_descriptors(sctx, &states->desc, states->desc_data); -} - static void si_sampler_states_begin_new_cs(struct si_context *sctx, struct si_sampler_states *states) { + if (!states->desc.buffer) + return; r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA); - si_emit_shader_pointer(sctx, &states->desc.atom); } void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader, @@ -513,66 +290,39 @@ void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader, for (i = 0; i < count; i++) { unsigned slot = start + i; - if (!sstates[i]) { - samplers->desc.dirty_mask &= ~(1llu << slot); + if (!sstates[i]) continue; - } - samplers->desc_data[slot] = sstates[i]->val; - samplers->desc.dirty_mask |= 1llu << slot; + memcpy(samplers->desc.list + slot*4, sstates[i]->val, 4*4); + samplers->desc.list_dirty = true; } - - si_update_descriptors(sctx, &samplers->desc); } /* BUFFER RESOURCES */ -static void si_emit_buffer_resources(struct si_context *sctx, struct r600_atom *atom) -{ - struct si_buffer_resources *buffers = (struct si_buffer_resources*)atom; - - si_emit_descriptors(sctx, &buffers->desc, buffers->desc_data); -} - -static void si_init_buffer_resources(struct si_context *sctx, - struct si_buffer_resources *buffers, - unsigned num_buffers, unsigned shader, +static void si_init_buffer_resources(struct si_buffer_resources *buffers, + unsigned num_buffers, unsigned shader_userdata_index, enum radeon_bo_usage shader_usage, enum radeon_bo_priority priority) { - int i; - - buffers->num_buffers = num_buffers; buffers->shader_usage = shader_usage; buffers->priority = priority; buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*)); - buffers->desc_storage = CALLOC(num_buffers, sizeof(uint32_t) * 4); - - /* si_emit_descriptors only accepts an array of arrays. - * This adds such an array. 
*/ - buffers->desc_data = CALLOC(num_buffers, sizeof(uint32_t*)); - for (i = 0; i < num_buffers; i++) { - buffers->desc_data[i] = &buffers->desc_storage[i*4]; - } - si_init_descriptors(sctx, &buffers->desc, - si_get_shader_user_data_base(shader) + - shader_userdata_index*4, 4, num_buffers, - si_emit_buffer_resources); + si_init_descriptors(&buffers->desc, shader_userdata_index, 4, + num_buffers); } static void si_release_buffer_resources(struct si_buffer_resources *buffers) { int i; - for (i = 0; i < buffers->num_buffers; i++) { + for (i = 0; i < buffers->desc.num_elements; i++) { pipe_resource_reference(&buffers->buffers[i], NULL); } FREE(buffers->buffers); - FREE(buffers->desc_storage); - FREE(buffers->desc_data); si_release_descriptors(&buffers->desc); } @@ -590,11 +340,11 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx, buffers->shader_usage, buffers->priority); } + if (!buffers->desc.buffer) + return; r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, buffers->desc.buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA); - - si_emit_shader_pointer(sctx, &buffers->desc.atom); } /* VERTEX BUFFERS */ @@ -617,14 +367,15 @@ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx) (struct r600_resource*)sctx->vertex_buffer[vb].buffer, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO); } + + if (!desc->buffer) + return; r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA); - - si_emit_shader_pointer(sctx, &desc->atom); } -void si_update_vertex_buffers(struct si_context *sctx) +static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) { struct si_descriptors *desc = &sctx->vertex_buffers; bool bound[SI_NUM_VERTEX_BUFFERS] = {}; @@ -632,8 +383,10 @@ void si_update_vertex_buffers(struct si_context *sctx) uint64_t va; uint32_t *ptr; + if (!sctx->vertex_buffers_dirty) + return true; if (!count || !sctx->vertex_elements) - return; + return true; /* Vertex buffer descriptors are the only ones which are uploaded * directly through a staging buffer and don't go through @@ -641,13 +394,14 @@ void si_update_vertex_buffers(struct si_context *sctx) */ u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset, (struct pipe_resource**)&desc->buffer, (void**)&ptr); + if (!desc->buffer) + return false; r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA); assert(count <= SI_NUM_VERTEX_BUFFERS); - assert(desc->current_context_id == 0); for (i = 0; i < count; i++) { struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i]; @@ -675,7 +429,8 @@ void si_update_vertex_buffers(struct si_context *sctx) desc[0] = va & 0xFFFFFFFF; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(vb->stride); - if (vb->stride) + + if (sctx->b.chip_class <= CIK && vb->stride) /* Round up by rounding down and adding 1 */ desc[2] = (vb->buffer->width0 - offset - sctx->vertex_elements->format_size[i]) / @@ -693,13 +448,14 @@ void si_update_vertex_buffers(struct si_context *sctx) } } - desc->atom.num_dw = 8; /* update 2 shader pointers (VS+ES) */ - desc->atom.dirty = true; - /* Don't flush the const cache. It would have a very negative effect * on performance (confirmed by testing). New descriptors are always * uploaded to a fresh new buffer, so I don't think flushing the const * cache is needed. 
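 *
 * (For the NUM_RECORDS computation above, "round up by rounding down and
 * adding 1" in numbers: with width0 = 1000, offset = 4, stride = 16 and
 * format_size = 12, element i is fetchable iff
 * offset + i*stride + format_size <= width0, so
 *
 *   desc[2] = (1000 - 4 - 12) / 16 + 1 = 61 + 1 = 62 records,
 *
 * the last of which ends at byte 4 + 61*16 + 12 = 992 <= 1000.)
 *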
*/ + desc->pointer_dirty = true; + si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom); + sctx->vertex_buffers_dirty = false; + return true; } @@ -724,7 +480,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s if (shader >= SI_NUM_SHADERS) return; - assert(slot < buffers->num_buffers); + assert(slot < buffers->desc.num_elements); pipe_resource_reference(&buffers->buffers[slot], NULL); /* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy @@ -751,7 +507,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s } /* Set the descriptor. */ - uint32_t *desc = buffers->desc_data[slot]; + uint32_t *desc = buffers->desc.list + slot*4; desc[0] = va; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(0); @@ -770,12 +526,11 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s buffers->desc.enabled_mask |= 1llu << slot; } else { /* Clear the descriptor. */ - memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4); + memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4); buffers->desc.enabled_mask &= ~(1llu << slot); } - buffers->desc.dirty_mask |= 1llu << slot; - si_update_descriptors(sctx, &buffers->desc); + buffers->desc.list_dirty = true; } /* RING BUFFERS */ @@ -784,7 +539,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot, struct pipe_resource *buffer, unsigned stride, unsigned num_records, bool add_tid, bool swizzle, - unsigned element_size, unsigned index_stride) + unsigned element_size, unsigned index_stride, uint64_t offset) { struct si_context *sctx = (struct si_context *)ctx; struct si_buffer_resources *buffers = &sctx->rw_buffers[shader]; @@ -795,13 +550,13 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot, /* The stride field in the resource descriptor has 14 bits */ assert(stride < (1 << 14)); - assert(slot < buffers->num_buffers); + assert(slot < buffers->desc.num_elements); pipe_resource_reference(&buffers->buffers[slot], NULL); if (buffer) { uint64_t va; - va = r600_resource(buffer)->gpu_address; + va = r600_resource(buffer)->gpu_address + offset; switch (element_size) { default: @@ -839,8 +594,11 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot, break; } + if (sctx->b.chip_class >= VI && stride) + num_records *= stride; + /* Set the descriptor. */ - uint32_t *desc = buffers->desc_data[slot]; + uint32_t *desc = buffers->desc.list + slot*4; desc[0] = va; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride) | @@ -863,12 +621,11 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot, buffers->desc.enabled_mask |= 1llu << slot; } else { /* Clear the descriptor. */ - memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4); + memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4); buffers->desc.enabled_mask &= ~(1llu << slot); } - buffers->desc.dirty_mask |= 1llu << slot; - si_update_descriptors(sctx, &buffers->desc); + buffers->desc.list_dirty = true; } /* STREAMOUT BUFFERS */ @@ -929,15 +686,21 @@ static void si_set_streamout_targets(struct pipe_context *ctx, struct pipe_resource *buffer = targets[i]->buffer; uint64_t va = r600_resource(buffer)->gpu_address; - /* Set the descriptor. */ - uint32_t *desc = buffers->desc_data[bufidx]; + /* Set the descriptor. + * + * On VI, the format must be non-INVALID, otherwise + * the buffer will be considered not bound and store + * instructions will be no-ops. 
+ */ + uint32_t *desc = buffers->desc.list + bufidx*4; desc[0] = va; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32); desc[2] = 0xffffffff; desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); /* Set the resource. */ pipe_resource_reference(&buffers->buffers[bufidx], @@ -948,24 +711,22 @@ static void si_set_streamout_targets(struct pipe_context *ctx, buffers->desc.enabled_mask |= 1llu << bufidx; } else { /* Clear the descriptor and unset the resource. */ - memset(buffers->desc_data[bufidx], 0, + memset(buffers->desc.list + bufidx*4, 0, sizeof(uint32_t) * 4); pipe_resource_reference(&buffers->buffers[bufidx], NULL); buffers->desc.enabled_mask &= ~(1llu << bufidx); } - buffers->desc.dirty_mask |= 1llu << bufidx; } for (; i < old_num_targets; i++) { bufidx = SI_SO_BUF_OFFSET + i; /* Clear the descriptor and unset the resource. */ - memset(buffers->desc_data[bufidx], 0, sizeof(uint32_t) * 4); + memset(buffers->desc.list + bufidx*4, 0, sizeof(uint32_t) * 4); pipe_resource_reference(&buffers->buffers[bufidx], NULL); buffers->desc.enabled_mask &= ~(1llu << bufidx); - buffers->desc.dirty_mask |= 1llu << bufidx; } - si_update_descriptors(sctx, &buffers->desc); + buffers->desc.list_dirty = true; } static void si_desc_reset_buffer_offset(struct pipe_context *ctx, @@ -1034,22 +795,19 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource /* Read/Write buffers. */ for (shader = 0; shader < SI_NUM_SHADERS; shader++) { struct si_buffer_resources *buffers = &sctx->rw_buffers[shader]; - bool found = false; uint64_t mask = buffers->desc.enabled_mask; while (mask) { i = u_bit_scan64(&mask); if (buffers->buffers[i] == buf) { - si_desc_reset_buffer_offset(ctx, buffers->desc_data[i], + si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4, old_va, buf); + buffers->desc.list_dirty = true; r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, rbuffer, buffers->shader_usage, buffers->priority); - buffers->desc.dirty_mask |= 1llu << i; - found = true; - if (i >= SI_SO_BUF_OFFSET && shader == PIPE_SHADER_VERTEX) { /* Update the streamout state. */ if (sctx->b.streamout.begin_emitted) { @@ -1061,34 +819,25 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource } } } - if (found) { - si_update_descriptors(sctx, &buffers->desc); - } } /* Constant buffers. */ for (shader = 0; shader < SI_NUM_SHADERS; shader++) { struct si_buffer_resources *buffers = &sctx->const_buffers[shader]; - bool found = false; uint64_t mask = buffers->desc.enabled_mask; while (mask) { unsigned i = u_bit_scan64(&mask); if (buffers->buffers[i] == buf) { - si_desc_reset_buffer_offset(ctx, buffers->desc_data[i], + si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4, old_va, buf); + buffers->desc.list_dirty = true; r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, rbuffer, buffers->shader_usage, buffers->priority); - - buffers->desc.dirty_mask |= 1llu << i; - found = true; } } - if (found) { - si_update_descriptors(sctx, &buffers->desc); - } } /* Texture buffers - update virtual addresses in sampler view descriptors. */ @@ -1100,223 +849,211 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource /* Texture buffers - update bindings. 
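 *
 * As in the other invalidation loops above, only bound slots are visited,
 * by peeling bits off enabled_mask:
 *
 *   uint64_t mask = views->desc.enabled_mask;
 *   while (mask) {
 *           unsigned i = u_bit_scan64(&mask);
 *           ...
 *   }
 *
 * where u_bit_scan64() returns the index of the lowest set bit and clears
 * it, so the loop ends once every bound slot has been handled.
 *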
*/ for (shader = 0; shader < SI_NUM_SHADERS; shader++) { struct si_sampler_views *views = &sctx->samplers[shader].views; - bool found = false; uint64_t mask = views->desc.enabled_mask; while (mask) { unsigned i = u_bit_scan64(&mask); if (views->views[i]->texture == buf) { + si_desc_reset_buffer_offset(ctx, views->desc.list + i*8+4, + old_va, buf); + views->desc.list_dirty = true; + r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, rbuffer, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO); - - views->desc.dirty_mask |= 1llu << i; - found = true; } } - if (found) { - si_update_descriptors(sctx, &views->desc); - } } } -/* CP DMA */ - -/* The max number of bytes to copy per packet. */ -#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8) +/* SHADER USER DATA */ -static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, - unsigned offset, unsigned size, unsigned value, - bool is_framebuffer) +static void si_mark_shader_pointers_dirty(struct si_context *sctx, + unsigned shader) { - struct si_context *sctx = (struct si_context*)ctx; - unsigned flush_flags, tc_l2_flag; + sctx->const_buffers[shader].desc.pointer_dirty = true; + sctx->rw_buffers[shader].desc.pointer_dirty = true; + sctx->samplers[shader].views.desc.pointer_dirty = true; + sctx->samplers[shader].states.desc.pointer_dirty = true; - if (!size) - return; + if (shader == PIPE_SHADER_VERTEX) + sctx->vertex_buffers.pointer_dirty = true; - /* Mark the buffer range of destination as valid (initialized), - * so that transfer_map knows it should wait for the GPU when mapping - * that range. */ - util_range_add(&r600_resource(dst)->valid_buffer_range, offset, - offset + size); - - /* Fallback for unaligned clears. */ - if (offset % 4 != 0 || size % 4 != 0) { - uint32_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf, - sctx->b.rings.gfx.cs, - PIPE_TRANSFER_WRITE); - size /= 4; - for (unsigned i = 0; i < size; i++) - *map++ = value; - return; - } + si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom); +} - uint64_t va = r600_resource(dst)->gpu_address + offset; +static void si_shader_userdata_begin_new_cs(struct si_context *sctx) +{ + int i; - /* Flush the caches where the resource is bound. */ - if (is_framebuffer) { - flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; - tc_l2_flag = 0; - } else { - flush_flags = SI_CONTEXT_INV_TC_L1 | - (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) | - SI_CONTEXT_INV_KCACHE; - tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2; + for (i = 0; i < SI_NUM_SHADERS; i++) { + si_mark_shader_pointers_dirty(sctx, i); } +} - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - flush_flags; - - while (size) { - unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); - unsigned dma_flags = tc_l2_flag; - - si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), - FALSE); - - /* This must be done after need_cs_space. */ - r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, - (struct r600_resource*)dst, RADEON_USAGE_WRITE, - RADEON_PRIO_MIN); - - /* Flush the caches for the first copy only. - * Also wait for the previous CP DMA operations. */ - if (sctx->b.flags) { - si_emit_cache_flush(&sctx->b, NULL); - dma_flags |= SI_CP_DMA_RAW_WAIT; /* same as WAIT_UNTIL=CP_DMA_IDLE */ - } - - /* Do the synchronization after the last copy, so that all data is written to memory. */ - if (size == byte_count) - dma_flags |= R600_CP_DMA_SYNC; +/* Set a base register address for user data constants in the given shader. 
+ * This assigns a mapping from PIPE_SHADER_* to SPI_SHADER_USER_DATA_*. + */ +static void si_set_user_data_base(struct si_context *sctx, + unsigned shader, uint32_t new_base) +{ + uint32_t *base = &sctx->shader_userdata.sh_base[shader]; - /* Emit the clear packet. */ - si_emit_cp_dma_clear_buffer(sctx, va, byte_count, value, dma_flags); + if (*base != new_base) { + *base = new_base; - size -= byte_count; - va += byte_count; + if (new_base) + si_mark_shader_pointers_dirty(sctx, shader); } - - /* Flush the caches again in case the 3D engine has been prefetching - * the resource. */ - sctx->b.flags |= flush_flags; - - if (tc_l2_flag) - r600_resource(dst)->TC_L2_dirty = true; } -void si_copy_buffer(struct si_context *sctx, - struct pipe_resource *dst, struct pipe_resource *src, - uint64_t dst_offset, uint64_t src_offset, unsigned size, - bool is_framebuffer) +/* This must be called when these shaders are changed from non-NULL to NULL + * and vice versa: + * - geometry shader + * - tessellation control shader + * - tessellation evaluation shader + */ +void si_shader_change_notify(struct si_context *sctx) { - unsigned flush_flags, tc_l2_flag; - - if (!size) - return; - - /* Mark the buffer range of destination as valid (initialized), - * so that transfer_map knows it should wait for the GPU when mapping - * that range. */ - util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset, - dst_offset + size); - - dst_offset += r600_resource(dst)->gpu_address; - src_offset += r600_resource(src)->gpu_address; - - /* Flush the caches where the resource is bound. */ - if (is_framebuffer) { - flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; - tc_l2_flag = 0; + /* VS can be bound as VS, ES, or LS. */ + if (sctx->tes_shader) + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, + R_00B530_SPI_SHADER_USER_DATA_LS_0); + else if (sctx->gs_shader) + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, + R_00B330_SPI_SHADER_USER_DATA_ES_0); + else + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, + R_00B130_SPI_SHADER_USER_DATA_VS_0); + + /* TES can be bound as ES, VS, or not bound. */ + if (sctx->tes_shader) { + if (sctx->gs_shader) + si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, + R_00B330_SPI_SHADER_USER_DATA_ES_0); + else + si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, + R_00B130_SPI_SHADER_USER_DATA_VS_0); } else { - flush_flags = SI_CONTEXT_INV_TC_L1 | - (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) | - SI_CONTEXT_INV_KCACHE; - tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2; + si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, 0); } +} - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - flush_flags; +static void si_emit_shader_pointer(struct si_context *sctx, + struct si_descriptors *desc, + unsigned sh_base, bool keep_dirty) +{ + struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + uint64_t va; - while (size) { - unsigned sync_flags = tc_l2_flag; - unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); + if (!desc->pointer_dirty || !desc->buffer) + return; - si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), FALSE); + va = desc->buffer->gpu_address + + desc->buffer_offset; - /* Flush the caches for the first copy only. Also wait for old CP DMA packets to complete. 
*/ - if (sctx->b.flags) { - si_emit_cache_flush(&sctx->b, NULL); - sync_flags |= SI_CP_DMA_RAW_WAIT; - } + radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0)); + radeon_emit(cs, (sh_base + desc->shader_userdata_offset - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); - /* Do the synchronization after the last copy, so that all data is written to memory. */ - if (size == byte_count) { - sync_flags |= R600_CP_DMA_SYNC; - } + desc->pointer_dirty = keep_dirty; +} - /* This must be done after r600_need_cs_space. */ - r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)src, - RADEON_USAGE_READ, RADEON_PRIO_MIN); - r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)dst, - RADEON_USAGE_WRITE, RADEON_PRIO_MIN); +static void si_emit_shader_userdata(struct si_context *sctx, + struct r600_atom *atom) +{ + unsigned i; + uint32_t *sh_base = sctx->shader_userdata.sh_base; + + if (sctx->gs_shader) { + /* The VS copy shader needs these for clipping, streamout, and rings. */ + unsigned vs_base = R_00B130_SPI_SHADER_USER_DATA_VS_0; + unsigned i = PIPE_SHADER_VERTEX; + + si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, vs_base, true); + si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, vs_base, true); + + /* The TESSEVAL shader needs this for the ESGS ring buffer. */ + si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, + R_00B330_SPI_SHADER_USER_DATA_ES_0, true); + } else if (sctx->tes_shader) { + /* The TESSEVAL shader needs this for streamout. */ + si_emit_shader_pointer(sctx, &sctx->rw_buffers[PIPE_SHADER_VERTEX].desc, + R_00B130_SPI_SHADER_USER_DATA_VS_0, true); + } - si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, sync_flags); + for (i = 0; i < SI_NUM_SHADERS; i++) { + unsigned base = sh_base[i]; - size -= byte_count; - src_offset += byte_count; - dst_offset += byte_count; - } + if (!base) + continue; - /* Flush the caches again in case the 3D engine has been prefetching - * the resource. */ - sctx->b.flags |= flush_flags; + if (i != PIPE_SHADER_TESS_EVAL) + si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, base, false); - if (tc_l2_flag) - r600_resource(dst)->TC_L2_dirty = true; + si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, base, false); + si_emit_shader_pointer(sctx, &sctx->samplers[i].views.desc, base, false); + si_emit_shader_pointer(sctx, &sctx->samplers[i].states.desc, base, false); + } + si_emit_shader_pointer(sctx, &sctx->vertex_buffers, sh_base[PIPE_SHADER_VERTEX], false); } -/* INIT/DEINIT */ +/* INIT/DEINIT/UPLOAD */ void si_init_all_descriptors(struct si_context *sctx) { int i; for (i = 0; i < SI_NUM_SHADERS; i++) { - si_init_buffer_resources(sctx, &sctx->const_buffers[i], - SI_NUM_CONST_BUFFERS, i, SI_SGPR_CONST, + si_init_buffer_resources(&sctx->const_buffers[i], + SI_NUM_CONST_BUFFERS, SI_SGPR_CONST, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO); - si_init_buffer_resources(sctx, &sctx->rw_buffers[i], - i == PIPE_SHADER_VERTEX ? 
- SI_NUM_RW_BUFFERS : SI_NUM_RING_BUFFERS, - i, SI_SGPR_RW_BUFFERS, + si_init_buffer_resources(&sctx->rw_buffers[i], + SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS, RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RESOURCE_RW); - si_init_sampler_views(sctx, &sctx->samplers[i].views, i); - - si_init_descriptors(sctx, &sctx->samplers[i].states.desc, - si_get_shader_user_data_base(i) + SI_SGPR_SAMPLER * 4, - 4, SI_NUM_SAMPLER_STATES, si_emit_sampler_states); - - sctx->atoms.s.const_buffers[i] = &sctx->const_buffers[i].desc.atom; - sctx->atoms.s.rw_buffers[i] = &sctx->rw_buffers[i].desc.atom; - sctx->atoms.s.sampler_views[i] = &sctx->samplers[i].views.desc.atom; - sctx->atoms.s.sampler_states[i] = &sctx->samplers[i].states.desc.atom; + si_init_descriptors(&sctx->samplers[i].views.desc, + SI_SGPR_RESOURCE, 8, SI_NUM_SAMPLER_VIEWS); + si_init_descriptors(&sctx->samplers[i].states.desc, + SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES); } - si_init_descriptors(sctx, &sctx->vertex_buffers, - si_get_shader_user_data_base(PIPE_SHADER_VERTEX) + - SI_SGPR_VERTEX_BUFFER*4, 4, SI_NUM_VERTEX_BUFFERS, - si_emit_shader_pointer); - sctx->atoms.s.vertex_buffers = &sctx->vertex_buffers.atom; + si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER, + 4, SI_NUM_VERTEX_BUFFERS); /* Set pipe_context functions. */ sctx->b.b.set_constant_buffer = si_set_constant_buffer; sctx->b.b.set_sampler_views = si_set_sampler_views; sctx->b.b.set_stream_output_targets = si_set_streamout_targets; - sctx->b.clear_buffer = si_clear_buffer; sctx->b.invalidate_buffer = si_invalidate_buffer; + + /* Shader user data. */ + sctx->atoms.s.shader_userdata = &sctx->shader_userdata.atom; + sctx->shader_userdata.atom.emit = (void*)si_emit_shader_userdata; + + /* Upper bound, 4 pointers per shader, +1 for vertex buffers, +2 for the VS copy shader. */ + sctx->shader_userdata.atom.num_dw = (SI_NUM_SHADERS * 4 + 1 + 2) * 4; + + /* Set default and immutable mappings. 
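The num_dw bound above follows from each shader pointer costing one 4-dword SET_SH_REG packet (header, register index, VA low, VA high). Below is a standalone sketch of the payload math in si_emit_shader_pointer() and the resulting atom budget; the SI_SH_REG_OFFSET value, SI_NUM_SHADERS == 5, and the concrete addresses are assumptions for the demo, not taken from sid.h.

#include <stdint.h>
#include <stdio.h>

#define SI_SH_REG_OFFSET 0xB000 /* assumed start of the SH register window */

int main(void)
{
	/* si_emit_shader_pointer(): byte register address -> dword index,
	 * then the 64-bit descriptor VA split into two dwords. */
	uint32_t sh_base = 0xB130;           /* R_00B130_SPI_SHADER_USER_DATA_VS_0 */
	uint32_t userdata_offset = 4 * 4;    /* hypothetical SGPR offset in bytes */
	uint64_t va = 0x0000001234567890ull; /* invented descriptor address */

	printf("reg index: 0x%x\n",
	       (sh_base + userdata_offset - SI_SH_REG_OFFSET) >> 2);
	printf("payload: 0x%08x 0x%08x\n",
	       (uint32_t)va, (uint32_t)(va >> 32)); /* low dword first */

	/* Atom budget: 4 descriptor pointers per shader, +1 vertex-buffer
	 * pointer, +2 for the VS copy shader; 4 dwords per pointer.
	 * SI_NUM_SHADERS is assumed to be 5 (VS, TCS, TES, GS, PS). */
	unsigned num_dw = (5 * 4 + 1 + 2) * 4;
	printf("num_dw upper bound: %u\n", num_dw); /* 92 */
	return 0;
}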
*/ + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0); + si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, R_00B430_SPI_SHADER_USER_DATA_HS_0); + si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, R_00B230_SPI_SHADER_USER_DATA_GS_0); + si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0); +} + +bool si_upload_shader_descriptors(struct si_context *sctx) +{ + int i; + + for (i = 0; i < SI_NUM_SHADERS; i++) { + if (!si_upload_descriptors(sctx, &sctx->const_buffers[i].desc) || + !si_upload_descriptors(sctx, &sctx->rw_buffers[i].desc) || + !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc) || + !si_upload_descriptors(sctx, &sctx->samplers[i].states.desc)) + return false; + } + return si_upload_vertex_buffer_descriptors(sctx); } void si_release_all_descriptors(struct si_context *sctx) @@ -1343,4 +1080,5 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx) si_sampler_states_begin_new_cs(sctx, &sctx->samplers[i].states); } si_vertex_buffers_begin_new_cs(sctx); + si_shader_userdata_begin_new_cs(sctx); } diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c index 313ced7f5d1..307dc391431 100644 --- a/src/gallium/drivers/radeonsi/si_hw_context.c +++ b/src/gallium/drivers/radeonsi/si_hw_context.c @@ -30,10 +30,32 @@ void si_need_cs_space(struct si_context *ctx, unsigned num_dw, boolean count_draw_in) { + struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs; int i; + /* If the CS is sufficiently large, don't count the space needed + * and just flush if there is less than 8192 dwords left. */ + if (cs->max_dw >= 24 * 1024) { + if (cs->cdw > cs->max_dw - 8 * 1024) + ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + return; + } + + /* There are two memory usage counters in the winsys for all buffers + * that have been added (cs_add_reloc) and two counters in the pipe + * driver for those that haven't been added yet. + */ + if (!ctx->b.ws->cs_memory_below_limit(ctx->b.rings.gfx.cs, ctx->b.vram, ctx->b.gtt)) { + ctx->b.gtt = 0; + ctx->b.vram = 0; + ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + return; + } + ctx->b.gtt = 0; + ctx->b.vram = 0; + /* The number of dwords we already used in the CS so far. */ - num_dw += ctx->b.rings.gfx.cs->cdw; + num_dw += cs->cdw; if (count_draw_in) { for (i = 0; i < SI_NUM_ATOMS(ctx); i++) { @@ -50,7 +72,8 @@ void si_need_cs_space(struct si_context *ctx, unsigned num_dw, } /* Count in queries_suspend. */ - num_dw += ctx->b.num_cs_dw_nontimer_queries_suspend; + num_dw += ctx->b.num_cs_dw_nontimer_queries_suspend + + ctx->b.num_cs_dw_timer_queries_suspend; /* Count in streamout_end at the end of CS. */ if (ctx->b.streamout.begin_emitted) { @@ -72,7 +95,7 @@ void si_need_cs_space(struct si_context *ctx, unsigned num_dw, #endif /* Flush if there's not enough space. 
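The fast path added to si_need_cs_space() above is easy to model on its own: for a large command stream, skip the exact accounting and flush once fewer than 8192 (8 * 1024) dwords remain. A self-contained sketch, with struct cs standing in for struct radeon_winsys_cs:

#include <stdbool.h>
#include <stdio.h>

struct cs { unsigned cdw, max_dw; }; /* dwords used / capacity */

static bool needs_flush(const struct cs *cs)
{
	if (cs->max_dw >= 24 * 1024)              /* CS is "sufficiently large" */
		return cs->cdw > cs->max_dw - 8 * 1024; /* < 8192 dwords left */
	return false; /* small CS: fall through to exact dword counting */
}

int main(void)
{
	struct cs cs = { .cdw = 17 * 1024, .max_dw = 24 * 1024 };
	printf("flush: %d\n", needs_flush(&cs)); /* 17K used > 16K limit -> 1 */
	return 0;
}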
*/ - if (num_dw > RADEON_MAX_CMDBUF_DWORDS) { + if (num_dw > cs->max_dw) { ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); } } @@ -82,9 +105,16 @@ void si_context_gfx_flush(void *context, unsigned flags, { struct si_context *ctx = context; struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs; - - if (cs->cdw == ctx->b.initial_gfx_cs_size && !fence) + struct radeon_winsys *ws = ctx->b.ws; + + if (cs->cdw == ctx->b.initial_gfx_cs_size && + (!fence || ctx->last_gfx_fence)) { + if (fence) + ws->fence_reference(fence, ctx->last_gfx_fence); + if (!(flags & RADEON_FLUSH_ASYNC)) + ws->cs_sync_flush(cs); return; + } ctx->b.rings.gfx.flushing = true; @@ -101,9 +131,13 @@ void si_context_gfx_flush(void *context, unsigned flags, flags |= RADEON_FLUSH_KEEP_TILING_FLAGS; /* Flush the CS. */ - ctx->b.ws->cs_flush(cs, flags, fence, ctx->screen->b.cs_count++); + ws->cs_flush(cs, flags, &ctx->last_gfx_fence, + ctx->screen->b.cs_count++); ctx->b.rings.gfx.flushing = false; + if (fence) + ws->fence_reference(fence, ctx->last_gfx_fence); + #if SI_TRACE_CS if (ctx->screen->b.trace_bo) { struct si_screen *sscreen = ctx->screen; @@ -111,7 +145,7 @@ void si_context_gfx_flush(void *context, unsigned flags, for (i = 0; i < 10; i++) { usleep(5); - if (!ctx->b.ws->buffer_is_busy(sscreen->b.trace_bo->buf, RADEON_USAGE_READWRITE)) { + if (!ws->buffer_is_busy(sscreen->b.trace_bo->buf, RADEON_USAGE_READWRITE)) { break; } } @@ -130,7 +164,8 @@ void si_context_gfx_flush(void *context, unsigned flags, void si_begin_new_cs(struct si_context *ctx) { /* Flush read caches at the beginning of CS. */ - ctx->b.flags |= SI_CONTEXT_INV_TC_L1 | + ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER | + SI_CONTEXT_INV_TC_L1 | SI_CONTEXT_INV_TC_L2 | SI_CONTEXT_INV_KCACHE | SI_CONTEXT_INV_ICACHE; @@ -143,24 +178,32 @@ void si_begin_new_cs(struct si_context *ctx) /* The CS initialization should be emitted before everything else. */ si_pm4_emit(ctx, ctx->init_config); - ctx->clip_regs.dirty = true; - ctx->framebuffer.atom.dirty = true; - ctx->msaa_sample_locs.dirty = true; - ctx->msaa_config.dirty = true; - ctx->db_render_state.dirty = true; - ctx->b.streamout.enable_atom.dirty = true; + si_mark_atom_dirty(ctx, &ctx->clip_regs); + si_mark_atom_dirty(ctx, &ctx->framebuffer.atom); + si_mark_atom_dirty(ctx, &ctx->msaa_sample_locs); + si_mark_atom_dirty(ctx, &ctx->msaa_config); + si_mark_atom_dirty(ctx, &ctx->db_render_state); + si_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom); si_all_descriptors_begin_new_cs(ctx); r600_postflush_resume_features(&ctx->b); ctx->b.initial_gfx_cs_size = ctx->b.rings.gfx.cs->cdw; + + /* Invalidate various draw states so that they are emitted before + * the first draw call. 
*/ si_invalidate_draw_sh_constants(ctx); ctx->last_primitive_restart_en = -1; ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN; ctx->last_gs_out_prim = -1; ctx->last_prim = -1; ctx->last_multi_vgt_param = -1; + ctx->last_ls_hs_config = -1; ctx->last_rast_prim = -1; ctx->last_sc_line_stipple = ~0; ctx->emit_scratch_reloc = true; + ctx->last_ls = NULL; + ctx->last_tcs = NULL; + ctx->last_tes_sh_base = -1; + ctx->last_num_tcs_input_cp = -1; } diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 53ae71a8c92..473a2e9ad12 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -36,32 +36,42 @@ static void si_destroy_context(struct pipe_context *context) { struct si_context *sctx = (struct si_context *)context; + int i; si_release_all_descriptors(sctx); pipe_resource_reference(&sctx->esgs_ring, NULL); pipe_resource_reference(&sctx->gsvs_ring, NULL); + pipe_resource_reference(&sctx->tf_ring, NULL); pipe_resource_reference(&sctx->null_const_buf.buffer, NULL); r600_resource_reference(&sctx->border_color_table, NULL); r600_resource_reference(&sctx->scratch_buffer, NULL); + sctx->b.ws->fence_reference(&sctx->last_gfx_fence, NULL); si_pm4_free_state(sctx, sctx->init_config, ~0); si_pm4_delete_state(sctx, gs_rings, sctx->gs_rings); - si_pm4_delete_state(sctx, gs_onoff, sctx->gs_on); - si_pm4_delete_state(sctx, gs_onoff, sctx->gs_off); + si_pm4_delete_state(sctx, tf_ring, sctx->tf_state); + for (i = 0; i < Elements(sctx->vgt_shader_config); i++) + si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]); if (sctx->pstipple_sampler_state) sctx->b.b.delete_sampler_state(&sctx->b.b, sctx->pstipple_sampler_state); - if (sctx->dummy_pixel_shader) { + if (sctx->dummy_pixel_shader) sctx->b.b.delete_fs_state(&sctx->b.b, sctx->dummy_pixel_shader); - } - sctx->b.b.delete_depth_stencil_alpha_state(&sctx->b.b, sctx->custom_dsa_flush); - sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_resolve); - sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_decompress); - sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_fastclear); + if (sctx->fixed_func_tcs_shader) + sctx->b.b.delete_tcs_state(&sctx->b.b, sctx->fixed_func_tcs_shader); + if (sctx->custom_dsa_flush) + sctx->b.b.delete_depth_stencil_alpha_state(&sctx->b.b, sctx->custom_dsa_flush); + if (sctx->custom_blend_resolve) + sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_resolve); + if (sctx->custom_blend_decompress) + sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_decompress); + if (sctx->custom_blend_fastclear) + sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_fastclear); util_unreference_framebuffer_state(&sctx->framebuffer.state); - util_blitter_destroy(sctx->blitter); + if (sctx->blitter) + util_blitter_destroy(sctx->blitter); si_pm4_cleanup(sctx); @@ -74,6 +84,14 @@ static void si_destroy_context(struct pipe_context *context) FREE(sctx); } +static enum pipe_reset_status +si_amdgpu_get_reset_status(struct pipe_context *ctx) +{ + struct si_context *sctx = (struct si_context *)ctx; + + return sctx->b.ws->ctx_query_reset_status(sctx->b.ctx); +} + static struct pipe_context *si_create_context(struct pipe_screen *screen, void *priv) { struct si_context *sctx = CALLOC_STRUCT(si_context); @@ -91,13 +109,18 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void * sctx->b.b.screen = screen; /* this must be set first */ sctx->b.b.priv = priv; sctx->b.b.destroy = 
si_destroy_context; + sctx->b.set_atom_dirty = (void *)si_set_atom_dirty; sctx->screen = sscreen; /* Easy accessing of screen/winsys. */ if (!r600_common_context_init(&sctx->b, &sscreen->b)) goto fail; + if (sscreen->b.info.drm_major == 3) + sctx->b.b.get_device_reset_status = si_amdgpu_get_reset_status; + si_init_blit_functions(sctx); si_init_compute_functions(sctx); + si_init_cp_dma_functions(sctx); if (sscreen->b.info.has_uvd) { sctx->b.b.create_video_codec = si_uvd_create_decoder; @@ -107,7 +130,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void * sctx->b.b.create_video_buffer = vl_video_buffer_create; } - sctx->b.rings.gfx.cs = ws->cs_create(ws, RING_GFX, si_context_gfx_flush, + sctx->b.rings.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush, sctx, sscreen->b.trace_bo ? sscreen->b.trace_bo->cs_buf : NULL); sctx->b.rings.gfx.flush = si_context_gfx_flush; @@ -127,17 +150,8 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void * sctx->atoms.s.streamout_begin = &sctx->b.streamout.begin_atom; sctx->atoms.s.streamout_enable = &sctx->b.streamout.enable_atom; - switch (sctx->b.chip_class) { - case SI: - case CIK: - si_init_state_functions(sctx); - si_init_shader_functions(sctx); - si_init_config(sctx); - break; - default: - R600_ERR("Unsupported chip class %d.\n", sctx->b.chip_class); - goto fail; - } + si_init_state_functions(sctx); + si_init_shader_functions(sctx); if (sscreen->b.debug_flags & DBG_FORCE_DMA) sctx->b.b.resource_copy_region = sctx->b.dma_copy; @@ -181,7 +195,9 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void * r600_target = radeon_llvm_get_r600_target(triple); sctx->tm = LLVMCreateTargetMachine(r600_target, triple, r600_get_llvm_processor_name(sscreen->b.family), - "+DumpCode,+vgpr-spilling", + sctx->b.chip_class >= VI ? + "+DumpCode" : + "+DumpCode,+vgpr-spilling", LLVMCodeGenLevelDefault, LLVMRelocDefault, LLVMCodeModelDefault); @@ -252,15 +268,27 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: case PIPE_CAP_TGSI_TEXCOORD: + case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: + case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 1; case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: return !SI_BIG_ENDIAN && sscreen->b.info.has_userptr; + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + return (sscreen->b.info.drm_major == 2 && + sscreen->b.info.drm_minor >= 43) || + sscreen->b.info.drm_major == 3; + case PIPE_CAP_TEXTURE_MULTISAMPLE: /* 2D tiling on CIK is supported since DRM 2.35.0 */ return sscreen->b.chip_class < CIK || - sscreen->b.info.drm_minor >= 35; + (sscreen->b.info.drm_major == 2 && + sscreen->b.info.drm_minor >= 35) || + sscreen->b.info.drm_major == 3; case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: return R600_MAP_BUFFER_ALIGNMENT; @@ -270,7 +298,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) return 4; case PIPE_CAP_GLSL_FEATURE_LEVEL: - return 330; + return HAVE_LLVM >= 0x0307 ? 
410 : 330; case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: return MIN2(sscreen->b.info.vram_size, 0xFFFFFFFF); @@ -289,13 +317,13 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_USER_VERTEX_BUFFERS: case PIPE_CAP_FAKE_SW_MSAA: case PIPE_CAP_TEXTURE_GATHER_OFFSETS: - case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: - case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: case PIPE_CAP_SAMPLER_VIEW_TARGET: case PIPE_CAP_VERTEXID_NOBASE: - case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: return 0; + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + return 30; + case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: return PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600; @@ -314,7 +342,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: return 4095; case PIPE_CAP_MAX_VERTEX_STREAMS: - return 1; + return 4; case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: return 2048; @@ -335,7 +363,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) return 8; case PIPE_CAP_MAX_VIEWPORTS: - return 1; + return 16; /* Timer queries, present when the clock frequency is non zero. */ case PIPE_CAP_QUERY_TIMESTAMP: @@ -375,6 +403,13 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu case PIPE_SHADER_VERTEX: case PIPE_SHADER_GEOMETRY: break; + case PIPE_SHADER_TESS_CTRL: + case PIPE_SHADER_TESS_EVAL: + /* LLVM 3.6.2 is required for tessellation because of bug fixes there */ + if (HAVE_LLVM < 0x0306 || + (HAVE_LLVM == 0x0306 && MESA_LLVM_VERSION_PATCH < 2)) + return 0; + break; case PIPE_SHADER_COMPUTE: switch (param) { case PIPE_SHADER_CAP_PREFERRED_IR: @@ -401,7 +436,6 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu } break; default: - /* TODO: support tessellation */ return 0; } @@ -433,7 +467,7 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu /* Indirection of geometry shader input dimension is not * handled yet */ - return shader < PIPE_SHADER_GEOMETRY; + return shader != PIPE_SHADER_GEOMETRY; case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: @@ -448,6 +482,7 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu case PIPE_SHADER_CAP_PREFERRED_IR: return PIPE_SHADER_IR_TGSI; case PIPE_SHADER_CAP_DOUBLES: + return HAVE_LLVM >= 0x0307; case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: return 0; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 2d67342f160..553e1f32683 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -48,7 +48,8 @@ #define SI_MAX_DRAW_CS_DWORDS \ (/*scratch:*/ 3 + /*derived prim state:*/ 3 + \ - /*draw regs:*/ 16 + /*draw packets:*/ 31) + /*draw regs:*/ 18 + /*draw packets:*/ 31 +\ + /*derived tess state:*/ 19) /* Instruction cache. 
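The HAVE_LLVM comparisons above rely on the version being packed as (major << 8) | minor, so 0x0306 means LLVM 3.6 and 0x0307 means 3.7. A tiny demo of how the GLSL feature level and the tessellation gate read under that encoding (the 3.6.2 patch-level check is simplified here to the minor version only):

#include <stdio.h>

#define LLVM_VERSION(maj, min) (((maj) << 8) | (min))

int main(void)
{
	unsigned have_llvm = LLVM_VERSION(3, 7); /* pretend we build with 3.7 */

	printf("GLSL feature level: %d\n", have_llvm >= 0x0307 ? 410 : 330);
	printf("tessellation: %s\n",
	       have_llvm < 0x0306 ? "off" : "on"); /* upstream also wants 3.6.2 */
	return 0;
}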
*/ #define SI_CONTEXT_INV_ICACHE (R600_CONTEXT_PRIVATE_FLAG << 0) @@ -125,8 +126,6 @@ struct si_framebuffer { #define SI_NUM_ATOMS(sctx) (sizeof((sctx)->atoms)/sizeof((sctx)->atoms.array[0])) -#define SI_NUM_SHADERS (PIPE_SHADER_GEOMETRY+1) - struct si_context { struct r600_common_context b; struct blitter_context *blitter; @@ -137,17 +136,12 @@ struct si_context { void *pstipple_sampler_state; struct si_screen *screen; struct si_pm4_state *init_config; + struct pipe_fence_handle *last_gfx_fence; + struct si_shader_selector *fixed_func_tcs_shader; union { struct { /* The order matters. */ - struct r600_atom *vertex_buffers; - struct r600_atom *const_buffers[SI_NUM_SHADERS]; - struct r600_atom *rw_buffers[SI_NUM_SHADERS]; - struct r600_atom *sampler_views[SI_NUM_SHADERS]; - struct r600_atom *sampler_states[SI_NUM_SHADERS]; - /* Caches must be flushed after resource descriptors are - * updated in memory. */ struct r600_atom *cache_flush; struct r600_atom *streamout_begin; struct r600_atom *streamout_enable; /* must be after streamout_begin */ @@ -156,6 +150,7 @@ struct si_context { struct r600_atom *db_render_state; struct r600_atom *msaa_config; struct r600_atom *clip_regs; + struct r600_atom *shader_userdata; } s; struct r600_atom *array[0]; } atoms; @@ -168,7 +163,10 @@ struct si_context { struct si_shader_selector *ps_shader; struct si_shader_selector *gs_shader; struct si_shader_selector *vs_shader; + struct si_shader_selector *tcs_shader; + struct si_shader_selector *tes_shader; struct si_cs_shader_state cs_shader_state; + struct si_shader_data shader_userdata; /* shader information */ unsigned sprite_coord_enable; bool flatshade; @@ -194,13 +192,16 @@ struct si_context { /* With rasterizer discard, there doesn't have to be a pixel shader. * In that case, we bind this one: */ void *dummy_pixel_shader; - struct si_pm4_state *gs_on; - struct si_pm4_state *gs_off; - struct si_pm4_state *gs_rings; struct r600_atom cache_flush; struct pipe_constant_buffer null_const_buf; /* used for set_constant_buffer(NULL) on CIK */ + + /* VGT states. */ + struct si_pm4_state *vgt_shader_config[4]; + struct si_pm4_state *gs_rings; struct pipe_resource *esgs_ring; struct pipe_resource *gsvs_ring; + struct si_pm4_state *tf_state; + struct pipe_resource *tf_ring; LLVMTargetMachineRef tm; @@ -218,7 +219,7 @@ struct si_context { bool db_depth_disable_expclear; unsigned ps_db_shader_control; - /* Draw state. */ + /* Emitted draw state. */ int last_base_vertex; int last_start_instance; int last_sh_base_reg; @@ -227,6 +228,7 @@ struct si_context { int last_gs_out_prim; int last_prim; int last_multi_vgt_param; + int last_ls_hs_config; int last_rast_prim; unsigned last_sc_line_stipple; int current_rast_prim; /* primitive type after TES, GS */ @@ -235,6 +237,12 @@ struct si_context { boolean emit_scratch_reloc; unsigned scratch_waves; unsigned spi_tmpring_size; + + /* Emitted derived tessellation state. 
*/ + struct si_shader *last_ls; /* local shader (VS) */ + struct si_shader_selector *last_tcs; + int last_num_tcs_input_cp; + int last_tes_sh_base; }; /* cik_sdma.c */ @@ -260,6 +268,13 @@ void si_resource_copy_region(struct pipe_context *ctx, unsigned src_level, const struct pipe_box *src_box); +/* si_cp_dma.c */ +void si_copy_buffer(struct si_context *sctx, + struct pipe_resource *dst, struct pipe_resource *src, + uint64_t dst_offset, uint64_t src_offset, unsigned size, + bool is_framebuffer); +void si_init_cp_dma_functions(struct si_context *sctx); + /* si_dma.c */ void si_dma_copy(struct pipe_context *ctx, struct pipe_resource *dst, @@ -293,7 +308,7 @@ struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe, * common helpers */ -static INLINE struct r600_resource * +static inline struct r600_resource * si_resource_create_custom(struct pipe_screen *screen, unsigned usage, unsigned size) { @@ -302,7 +317,7 @@ si_resource_create_custom(struct pipe_screen *screen, PIPE_BIND_CUSTOM, usage, size)); } -static INLINE void +static inline void si_invalidate_draw_sh_constants(struct si_context *sctx) { sctx->last_base_vertex = SI_BASE_VERTEX_UNKNOWN; @@ -310,4 +325,18 @@ si_invalidate_draw_sh_constants(struct si_context *sctx) sctx->last_sh_base_reg = -1; /* reset to an unknown value */ } +static inline void +si_set_atom_dirty(struct si_context *sctx, + struct r600_atom *atom, bool dirty) +{ + atom->dirty = dirty; +} + +static inline void +si_mark_atom_dirty(struct si_context *sctx, + struct r600_atom *atom) +{ + si_set_atom_dirty(sctx, atom, true); +} + #endif diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 47e5f96cbed..4288e9b2ab1 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -31,6 +31,7 @@ #include "gallivm/lp_bld_intr.h" #include "gallivm/lp_bld_logic.h" #include "gallivm/lp_bld_arit.h" +#include "gallivm/lp_bld_bitarit.h" #include "gallivm/lp_bld_flow.h" #include "radeon/r600_cs.h" #include "radeon/radeon_llvm.h" @@ -71,18 +72,25 @@ struct si_shader_context int param_streamout_write_index; int param_streamout_offset[4]; int param_vertex_id; + int param_rel_auto_id; + int param_vs_prim_id; int param_instance_id; + int param_tes_u; + int param_tes_v; + int param_tes_rel_patch_id; + int param_tes_patch_id; + int param_es2gs_offset; LLVMTargetMachineRef tm; LLVMValueRef const_md; LLVMValueRef const_resource[SI_NUM_CONST_BUFFERS]; - LLVMValueRef ddxy_lds; + LLVMValueRef lds; LLVMValueRef *constants[SI_NUM_CONST_BUFFERS]; LLVMValueRef resources[SI_NUM_SAMPLER_VIEWS]; LLVMValueRef samplers[SI_NUM_SAMPLER_STATES]; LLVMValueRef so_buffers[4]; LLVMValueRef esgs_ring; - LLVMValueRef gsvs_ring; - LLVMValueRef gs_next_vertex; + LLVMValueRef gsvs_ring[4]; + LLVMValueRef gs_next_vertex[4]; }; static struct si_shader_context * si_shader_context( @@ -129,12 +137,29 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index) assert(index <= 1); return 2 + index; case TGSI_SEMANTIC_GENERIC: - assert(index <= 63-4); - return 4 + index; + if (index <= 63-4) + return 4 + index; + else + /* same explanation as in the default statement, + * the only user hitting this is st/nine. + */ + return 0; + + /* patch indices are completely separate and thus start from 0 */ + case TGSI_SEMANTIC_TESSOUTER: + return 0; + case TGSI_SEMANTIC_TESSINNER: + return 1; + case TGSI_SEMANTIC_PATCH: + return 2 + index; default: - assert(0); - return 63; + /* Don't fail here. 
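The unique-index mapping in this hunk keeps per-patch semantics in a namespace of their own. A small model of just the cases visible in the patch; the enum values are stand-ins for the TGSI_SEMANTIC_* tokens, and semantics not shown here are omitted:

#include <stdio.h>

enum sem { GENERIC, TESSOUTER, TESSINNER, PATCH, OTHER };

static unsigned unique_index(enum sem name, unsigned index)
{
	switch (name) {
	case GENERIC:   return index <= 63 - 4 ? 4 + index : 0;
	case TESSOUTER: return 0; /* patch indices start over from 0 */
	case TESSINNER: return 1;
	case PATCH:     return 2 + index;
	default:        return 0; /* don't fail; see the comment above */
	}
}

int main(void)
{
	printf("GENERIC[5] -> %u\n", unique_index(GENERIC, 5)); /* 9 */
	printf("PATCH[3]   -> %u\n", unique_index(PATCH, 3));   /* 5 */
	return 0;
}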
The result of this function is only used + * for LS, TCS, TES, and GS, where legacy GL semantics can't + * occur, but this function is called for all vertex shaders + * before it's known whether LS will be compiled or not. + */ + return 0; } } @@ -205,6 +230,136 @@ static LLVMValueRef unpack_param(struct si_shader_context *si_shader_ctx, return value; } +static LLVMValueRef get_rel_patch_id(struct si_shader_context *si_shader_ctx) +{ + switch (si_shader_ctx->type) { + case TGSI_PROCESSOR_TESS_CTRL: + return unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 0, 8); + + case TGSI_PROCESSOR_TESS_EVAL: + return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + si_shader_ctx->param_tes_rel_patch_id); + + default: + assert(0); + return NULL; + } +} + +/* Tessellation shaders pass outputs to the next shader using LDS. + * + * LS outputs = TCS inputs + * TCS outputs = TES inputs + * + * The LDS layout is: + * - TCS inputs for patch 0 + * - TCS inputs for patch 1 + * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2) + * - ... + * - TCS outputs for patch 0 = get_tcs_out_patch0_offset + * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset + * - TCS outputs for patch 1 + * - Per-patch TCS outputs for patch 1 + * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2) + * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2) + * - ... + * + * All three shaders VS(LS), TCS, TES share the same LDS space. + */ + +static LLVMValueRef +get_tcs_in_patch_stride(struct si_shader_context *si_shader_ctx) +{ + if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX) + return unpack_param(si_shader_ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13); + else if (si_shader_ctx->type == TGSI_PROCESSOR_TESS_CTRL) + return unpack_param(si_shader_ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13); + else { + assert(0); + return NULL; + } +} + +static LLVMValueRef +get_tcs_out_patch_stride(struct si_shader_context *si_shader_ctx) +{ + return unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13); +} + +static LLVMValueRef +get_tcs_out_patch0_offset(struct si_shader_context *si_shader_ctx) +{ + return lp_build_mul_imm(&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld, + unpack_param(si_shader_ctx, + SI_PARAM_TCS_OUT_OFFSETS, + 0, 16), + 4); +} + +static LLVMValueRef +get_tcs_out_patch0_patch_data_offset(struct si_shader_context *si_shader_ctx) +{ + return lp_build_mul_imm(&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld, + unpack_param(si_shader_ctx, + SI_PARAM_TCS_OUT_OFFSETS, + 16, 16), + 4); +} + +static LLVMValueRef +get_tcs_in_current_patch_offset(struct si_shader_context *si_shader_ctx) +{ + struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm; + LLVMValueRef patch_stride = get_tcs_in_patch_stride(si_shader_ctx); + LLVMValueRef rel_patch_id = get_rel_patch_id(si_shader_ctx); + + return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, ""); +} + +static LLVMValueRef +get_tcs_out_current_patch_offset(struct si_shader_context *si_shader_ctx) +{ + struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm; + LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(si_shader_ctx); + LLVMValueRef patch_stride = get_tcs_out_patch_stride(si_shader_ctx); + LLVMValueRef rel_patch_id = get_rel_patch_id(si_shader_ctx); + + return LLVMBuildAdd(gallivm->builder, patch0_offset, + LLVMBuildMul(gallivm->builder, patch_stride, + rel_patch_id, ""), + ""); +} + +static LLVMValueRef +get_tcs_out_current_patch_data_offset(struct 
si_shader_context *si_shader_ctx) +{ + struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm; + LLVMValueRef patch0_patch_data_offset = + get_tcs_out_patch0_patch_data_offset(si_shader_ctx); + LLVMValueRef patch_stride = get_tcs_out_patch_stride(si_shader_ctx); + LLVMValueRef rel_patch_id = get_rel_patch_id(si_shader_ctx); + + return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset, + LLVMBuildMul(gallivm->builder, patch_stride, + rel_patch_id, ""), + ""); +} + +static void build_indexed_store(struct si_shader_context *si_shader_ctx, + LLVMValueRef base_ptr, LLVMValueRef index, + LLVMValueRef value) +{ + struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base; + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMValueRef indices[2], pointer; + + indices[0] = bld_base->uint_bld.zero; + indices[1] = index; + + pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, ""); + LLVMBuildStore(gallivm->builder, value, pointer); +} + /** * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad. * It's equivalent to doing a load from &base_ptr[index]. @@ -308,7 +463,7 @@ static void declare_input_vs( args[0] = t_list; args[1] = attribute_offset; args[2] = buffer_index; - input = build_intrinsic(gallivm->builder, + input = lp_build_intrinsic(gallivm->builder, "llvm.SI.vs.load.input", vec4_type, args, 3, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); @@ -323,6 +478,285 @@ static void declare_input_vs( } } +static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base, + unsigned swizzle) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + + if (swizzle > 0) + return bld_base->uint_bld.zero; + + switch (si_shader_ctx->type) { + case TGSI_PROCESSOR_VERTEX: + return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + si_shader_ctx->param_vs_prim_id); + case TGSI_PROCESSOR_TESS_CTRL: + return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + SI_PARAM_PATCH_ID); + case TGSI_PROCESSOR_TESS_EVAL: + return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + si_shader_ctx->param_tes_patch_id); + case TGSI_PROCESSOR_GEOMETRY: + return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + SI_PARAM_PRIMITIVE_ID); + default: + assert(0); + return bld_base->uint_bld.zero; + } +} + +/** + * Return the value of tgsi_ind_register for indexing. + * This is the indirect index with the constant offset added to it. + */ +static LLVMValueRef get_indirect_index(struct si_shader_context *si_shader_ctx, + const struct tgsi_ind_register *ind, + int rel_index) +{ + struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm; + LLVMValueRef result; + + result = si_shader_ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle]; + result = LLVMBuildLoad(gallivm->builder, result, ""); + result = LLVMBuildAdd(gallivm->builder, result, + lp_build_const_int32(gallivm, rel_index), ""); + return result; +} + +/** + * Calculate a dword address given an input or output register and a stride. + */ +static LLVMValueRef get_dw_address(struct si_shader_context *si_shader_ctx, + const struct tgsi_full_dst_register *dst, + const struct tgsi_full_src_register *src, + LLVMValueRef vertex_dw_stride, + LLVMValueRef base_addr) +{ + struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm; + struct tgsi_shader_info *info = &si_shader_ctx->shader->selector->info; + ubyte *name, *index, *array_first; + int first, param; + struct tgsi_full_dst_register reg; + + /* Set the register description. 
The address computation is the same + * for sources and destinations. */ + if (src) { + reg.Register.File = src->Register.File; + reg.Register.Index = src->Register.Index; + reg.Register.Indirect = src->Register.Indirect; + reg.Register.Dimension = src->Register.Dimension; + reg.Indirect = src->Indirect; + reg.Dimension = src->Dimension; + reg.DimIndirect = src->DimIndirect; + } else + reg = *dst; + + /* If the register is 2-dimensional (e.g. an array of vertices + * in a primitive), calculate the base address of the vertex. */ + if (reg.Register.Dimension) { + LLVMValueRef index; + + if (reg.Dimension.Indirect) + index = get_indirect_index(si_shader_ctx, &reg.DimIndirect, + reg.Dimension.Index); + else + index = lp_build_const_int32(gallivm, reg.Dimension.Index); + + base_addr = LLVMBuildAdd(gallivm->builder, base_addr, + LLVMBuildMul(gallivm->builder, index, + vertex_dw_stride, ""), ""); + } + + /* Get information about the register. */ + if (reg.Register.File == TGSI_FILE_INPUT) { + name = info->input_semantic_name; + index = info->input_semantic_index; + array_first = info->input_array_first; + } else if (reg.Register.File == TGSI_FILE_OUTPUT) { + name = info->output_semantic_name; + index = info->output_semantic_index; + array_first = info->output_array_first; + } else { + assert(0); + return NULL; + } + + if (reg.Register.Indirect) { + /* Add the relative address of the element. */ + LLVMValueRef ind_index; + + if (reg.Indirect.ArrayID) + first = array_first[reg.Indirect.ArrayID]; + else + first = reg.Register.Index; + + ind_index = get_indirect_index(si_shader_ctx, &reg.Indirect, + reg.Register.Index - first); + + base_addr = LLVMBuildAdd(gallivm->builder, base_addr, + LLVMBuildMul(gallivm->builder, ind_index, + lp_build_const_int32(gallivm, 4), ""), ""); + + param = si_shader_io_get_unique_index(name[first], index[first]); + } else { + param = si_shader_io_get_unique_index(name[reg.Register.Index], + index[reg.Register.Index]); + } + + /* Add the base address of the element. */ + return LLVMBuildAdd(gallivm->builder, base_addr, + lp_build_const_int32(gallivm, param * 4), ""); +} + +/** + * Load from LDS. + * + * \param type output value type + * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4 + * \param dw_addr address in dwords + */ +static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base, + enum tgsi_opcode_type type, unsigned swizzle, + LLVMValueRef dw_addr) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMValueRef value; + + if (swizzle == ~0) { + LLVMValueRef values[TGSI_NUM_CHANNELS]; + + for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) + values[chan] = lds_load(bld_base, type, chan, dw_addr); + + return lp_build_gather_values(bld_base->base.gallivm, values, + TGSI_NUM_CHANNELS); + } + + dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr, + lp_build_const_int32(gallivm, swizzle)); + + value = build_indexed_load(si_shader_ctx, si_shader_ctx->lds, dw_addr); + return LLVMBuildBitCast(gallivm->builder, value, + tgsi2llvmtype(bld_base, type), ""); +} + +/** + * Store to LDS. 
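A scalar model of the lds_load()/lds_store() pair: LDS is addressed in dwords, a swizzle selects the channel, and swizzle == ~0 gathers a whole vec4 by issuing four channel loads. The uint32_t array below is a stand-in for LDS; the values are invented.

#include <stdint.h>
#include <stdio.h>

static uint32_t lds[256]; /* pretend LDS, dword-addressed */

static uint32_t lds_load_chan(uint32_t dw_addr, unsigned swizzle)
{
	return lds[dw_addr + swizzle]; /* effective address = base + swizzle */
}

static void lds_load_vec4(uint32_t dw_addr, uint32_t out[4])
{
	for (unsigned chan = 0; chan < 4; chan++) /* the swizzle == ~0 case */
		out[chan] = lds_load_chan(dw_addr, chan);
}

int main(void)
{
	for (unsigned i = 0; i < 8; i++)
		lds[i] = 100 + i;

	uint32_t v[4];
	lds_load_vec4(4, v);
	printf("%u %u %u %u\n", v[0], v[1], v[2], v[3]); /* 104 105 106 107 */
	return 0;
}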
+ * + * \param swizzle offset (typically 0..3) + * \param dw_addr address in dwords + * \param value value to store + */ +static void lds_store(struct lp_build_tgsi_context * bld_base, + unsigned swizzle, LLVMValueRef dw_addr, + LLVMValueRef value) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct gallivm_state *gallivm = bld_base->base.gallivm; + + dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr, + lp_build_const_int32(gallivm, swizzle)); + + value = LLVMBuildBitCast(gallivm->builder, value, + LLVMInt32TypeInContext(gallivm->context), ""); + build_indexed_store(si_shader_ctx, si_shader_ctx->lds, + dw_addr, value); +} + +static LLVMValueRef fetch_input_tcs( + struct lp_build_tgsi_context *bld_base, + const struct tgsi_full_src_register *reg, + enum tgsi_opcode_type type, unsigned swizzle) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + LLVMValueRef dw_addr, stride; + + stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8); + dw_addr = get_tcs_in_current_patch_offset(si_shader_ctx); + dw_addr = get_dw_address(si_shader_ctx, NULL, reg, stride, dw_addr); + + return lds_load(bld_base, type, swizzle, dw_addr); +} + +static LLVMValueRef fetch_output_tcs( + struct lp_build_tgsi_context *bld_base, + const struct tgsi_full_src_register *reg, + enum tgsi_opcode_type type, unsigned swizzle) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + LLVMValueRef dw_addr, stride; + + if (reg->Register.Dimension) { + stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8); + dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx); + dw_addr = get_dw_address(si_shader_ctx, NULL, reg, stride, dw_addr); + } else { + dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx); + dw_addr = get_dw_address(si_shader_ctx, NULL, reg, NULL, dw_addr); + } + + return lds_load(bld_base, type, swizzle, dw_addr); +} + +static LLVMValueRef fetch_input_tes( + struct lp_build_tgsi_context *bld_base, + const struct tgsi_full_src_register *reg, + enum tgsi_opcode_type type, unsigned swizzle) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + LLVMValueRef dw_addr, stride; + + if (reg->Register.Dimension) { + stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8); + dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx); + dw_addr = get_dw_address(si_shader_ctx, NULL, reg, stride, dw_addr); + } else { + dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx); + dw_addr = get_dw_address(si_shader_ctx, NULL, reg, NULL, dw_addr); + } + + return lds_load(bld_base, type, swizzle, dw_addr); +} + +static void store_output_tcs(struct lp_build_tgsi_context * bld_base, + const struct tgsi_full_instruction * inst, + const struct tgsi_opcode_info * info, + LLVMValueRef dst[4]) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + const struct tgsi_full_dst_register *reg = &inst->Dst[0]; + unsigned chan_index; + LLVMValueRef dw_addr, stride; + + /* Only handle per-patch and per-vertex outputs here. + * Vectors will be lowered to scalars and this function will be called again. 
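The stride fetches in fetch_input_tcs()/fetch_input_tes() all go through unpack_param(param, shift, width), which is a plain bitfield extract on an SGPR. An equivalent in standalone C, with a hypothetical layout word packed the way the (0, 13) and (13, 8) call sites imply:

#include <stdint.h>
#include <stdio.h>

static uint32_t unpack_param(uint32_t sgpr, unsigned shift, unsigned width)
{
	return (sgpr >> shift) & ((1u << width) - 1);
}

int main(void)
{
	/* Hypothetical TCS_IN_LAYOUT value: patch stride 384 in bits 0..12,
	 * vertex stride 24 in bits 13..20 (both in dwords). */
	uint32_t layout = (24u << 13) | 384u;

	printf("patch stride:  %u dwords\n", unpack_param(layout, 0, 13));
	printf("vertex stride: %u dwords\n", unpack_param(layout, 13, 8));
	return 0;
}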
+ */ + if (reg->Register.File != TGSI_FILE_OUTPUT || + (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) { + radeon_llvm_emit_store(bld_base, inst, info, dst); + return; + } + + if (reg->Register.Dimension) { + stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8); + dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx); + dw_addr = get_dw_address(si_shader_ctx, reg, NULL, stride, dw_addr); + } else { + dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx); + dw_addr = get_dw_address(si_shader_ctx, reg, NULL, NULL, dw_addr); + } + + TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) { + LLVMValueRef value = dst[chan_index]; + + if (inst->Instruction.Saturate) + value = radeon_llvm_saturate(bld_base, value); + + lds_store(bld_base, chan_index, dw_addr, value); + } +} + static LLVMValueRef fetch_input_gs( struct lp_build_tgsi_context *bld_base, const struct tgsi_full_src_register *reg, @@ -342,13 +776,8 @@ static LLVMValueRef fetch_input_gs( unsigned semantic_name = info->input_semantic_name[reg->Register.Index]; unsigned semantic_index = info->input_semantic_index[reg->Register.Index]; - if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID) { - if (swizzle == 0) - return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, - SI_PARAM_PRIMITIVE_ID); - else - return uint->zero; - } + if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID) + return get_primitive_id(bld_base, swizzle); if (!reg->Register.Dimension) return NULL; @@ -380,7 +809,7 @@ static LLVMValueRef fetch_input_gs( args[1] = vtx_offset; args[2] = lp_build_const_int32(gallivm, (get_param_index(semantic_name, semantic_index, - shader->selector->gs_used_inputs) * 4 + + shader->selector->inputs_read) * 4 + swizzle) * 256); args[3] = uint->zero; args[4] = uint->one; /* OFFEN */ @@ -390,13 +819,42 @@ static LLVMValueRef fetch_input_gs( args[8] = uint->zero; /* TFE */ return LLVMBuildBitCast(gallivm->builder, - build_intrinsic(gallivm->builder, + lp_build_intrinsic(gallivm->builder, "llvm.SI.buffer.load.dword.i32.i32", i32, args, 9, LLVMReadOnlyAttribute | LLVMNoUnwindAttribute), tgsi2llvmtype(bld_base, type), ""); } +static int lookup_interp_param_index(unsigned interpolate, unsigned location) +{ + switch (interpolate) { + case TGSI_INTERPOLATE_CONSTANT: + return 0; + + case TGSI_INTERPOLATE_LINEAR: + if (location == TGSI_INTERPOLATE_LOC_SAMPLE) + return SI_PARAM_LINEAR_SAMPLE; + else if (location == TGSI_INTERPOLATE_LOC_CENTROID) + return SI_PARAM_LINEAR_CENTROID; + else + return SI_PARAM_LINEAR_CENTER; + break; + case TGSI_INTERPOLATE_COLOR: + case TGSI_INTERPOLATE_PERSPECTIVE: + if (location == TGSI_INTERPOLATE_LOC_SAMPLE) + return SI_PARAM_PERSP_SAMPLE; + else if (location == TGSI_INTERPOLATE_LOC_CENTROID) + return SI_PARAM_PERSP_CENTROID; + else + return SI_PARAM_PERSP_CENTER; + break; + default: + fprintf(stderr, "Warning: Unhandled interpolation mode.\n"); + return -1; + } +} + static void declare_input_fs( struct radeon_llvm_context *radeon_bld, unsigned input_index, @@ -411,7 +869,8 @@ static void declare_input_fs( LLVMTypeRef input_type = LLVMFloatTypeInContext(gallivm->context); LLVMValueRef main_fn = radeon_bld->main_fn; - LLVMValueRef interp_param; + LLVMValueRef interp_param = NULL; + int interp_param_idx; const char * intr_name; /* This value is: @@ -460,31 +919,13 @@ static void declare_input_fs( attr_number = lp_build_const_int32(gallivm, shader->ps_input_param_offset[input_index]); - switch (decl->Interp.Interpolate) { - case TGSI_INTERPOLATE_CONSTANT: - 
interp_param = 0; - break; - case TGSI_INTERPOLATE_LINEAR: - if (decl->Interp.Location == TGSI_INTERPOLATE_LOC_SAMPLE) - interp_param = LLVMGetParam(main_fn, SI_PARAM_LINEAR_SAMPLE); - else if (decl->Interp.Location == TGSI_INTERPOLATE_LOC_CENTROID) - interp_param = LLVMGetParam(main_fn, SI_PARAM_LINEAR_CENTROID); - else - interp_param = LLVMGetParam(main_fn, SI_PARAM_LINEAR_CENTER); - break; - case TGSI_INTERPOLATE_COLOR: - case TGSI_INTERPOLATE_PERSPECTIVE: - if (decl->Interp.Location == TGSI_INTERPOLATE_LOC_SAMPLE) - interp_param = LLVMGetParam(main_fn, SI_PARAM_PERSP_SAMPLE); - else if (decl->Interp.Location == TGSI_INTERPOLATE_LOC_CENTROID) - interp_param = LLVMGetParam(main_fn, SI_PARAM_PERSP_CENTROID); - else - interp_param = LLVMGetParam(main_fn, SI_PARAM_PERSP_CENTER); - break; - default: - fprintf(stderr, "Warning: Unhandled interpolation mode.\n"); + shader->ps_input_interpolate[input_index] = decl->Interp.Interpolate; + interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate, + decl->Interp.Location); + if (interp_param_idx == -1) return; - } + else if (interp_param_idx) + interp_param = LLVMGetParam(main_fn, interp_param_idx); /* fs.constant returns the param from the middle vertex, so it's not * really useful for flat shading. It's meant to be used for custom @@ -522,12 +963,12 @@ static void declare_input_fs( args[0] = llvm_chan; args[1] = attr_number; - front = build_intrinsic(gallivm->builder, intr_name, + front = lp_build_intrinsic(gallivm->builder, intr_name, input_type, args, args[3] ? 4 : 3, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); args[1] = back_attr_number; - back = build_intrinsic(gallivm->builder, intr_name, + back = lp_build_intrinsic(gallivm->builder, intr_name, input_type, args, args[3] ? 4 : 3, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); @@ -548,7 +989,7 @@ static void declare_input_fs( args[2] = params; args[3] = interp_param; radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] = - build_intrinsic(gallivm->builder, intr_name, + lp_build_intrinsic(gallivm->builder, intr_name, input_type, args, args[3] ? 4 : 3, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] = @@ -566,7 +1007,7 @@ static void declare_input_fs( args[2] = params; args[3] = interp_param; radeon_bld->inputs[soa_index] = - build_intrinsic(gallivm->builder, intr_name, + lp_build_intrinsic(gallivm->builder, intr_name, input_type, args, args[3] ? 
4 : 3, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); } @@ -587,10 +1028,35 @@ static LLVMValueRef buffer_load_const(LLVMBuilderRef builder, LLVMValueRef resou { LLVMValueRef args[2] = {resource, offset}; - return build_intrinsic(builder, "llvm.SI.load.const", return_type, args, 2, + return lp_build_intrinsic(builder, "llvm.SI.load.const", return_type, args, 2, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); } +static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id) +{ + struct si_shader_context *si_shader_ctx = + si_shader_context(&radeon_bld->soa.bld_base); + struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld; + struct gallivm_state *gallivm = &radeon_bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST); + LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_DRIVER_STATE_CONST_BUF); + LLVMValueRef resource = build_indexed_load_const(si_shader_ctx, desc, buf_index); + + /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */ + LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8); + LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), ""); + + LLVMValueRef pos[4] = { + buffer_load_const(builder, resource, offset0, radeon_bld->soa.bld_base.base.elem_type), + buffer_load_const(builder, resource, offset1, radeon_bld->soa.bld_base.base.elem_type), + lp_build_const_float(gallivm, 0), + lp_build_const_float(gallivm, 0) + }; + + return lp_build_gather_values(gallivm, pos, 4); +} + static void declare_system_value( struct radeon_llvm_context * radeon_bld, unsigned index, @@ -598,6 +1064,7 @@ static void declare_system_value( { struct si_shader_context *si_shader_ctx = si_shader_context(&radeon_bld->soa.bld_base); + struct lp_build_context *bld = &radeon_bld->soa.bld_base.base; struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld; struct gallivm_state *gallivm = &radeon_bld->gallivm; LLVMValueRef value = 0; @@ -626,30 +1093,23 @@ static void declare_system_value( SI_PARAM_BASE_VERTEX); break; + case TGSI_SEMANTIC_INVOCATIONID: + if (si_shader_ctx->type == TGSI_PROCESSOR_TESS_CTRL) + value = unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 8, 5); + else if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY) + value = LLVMGetParam(radeon_bld->main_fn, + SI_PARAM_GS_INSTANCE_ID); + else + assert(!"INVOCATIONID not implemented"); + break; + case TGSI_SEMANTIC_SAMPLEID: value = get_sample_id(radeon_bld); break; case TGSI_SEMANTIC_SAMPLEPOS: - { - LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST); - LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_DRIVER_STATE_CONST_BUF); - LLVMValueRef resource = build_indexed_load_const(si_shader_ctx, desc, buf_index); - - /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */ - LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, get_sample_id(radeon_bld), 8); - LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), ""); - - LLVMValueRef pos[4] = { - buffer_load_const(builder, resource, offset0, radeon_bld->soa.bld_base.base.elem_type), - buffer_load_const(builder, resource, offset1, radeon_bld->soa.bld_base.base.elem_type), - lp_build_const_float(gallivm, 0), - lp_build_const_float(gallivm, 0) - }; - value = lp_build_gather_values(gallivm, pos, 4); + value = load_sample_position(radeon_bld, 
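A host-side model of load_sample_position(): sample positions live in a driver constant buffer as packed vec2 floats, so sample i reads floats at byte offsets i*8 and i*8+4. The position table below is illustrative only, not the hardware's real MSAA pattern.

#include <stdio.h>

static const float sample_pos[4][2] = { /* hypothetical 4x MSAA positions */
	{0.375f, 0.125f}, {0.875f, 0.375f}, {0.125f, 0.625f}, {0.625f, 0.875f},
};

int main(void)
{
	const unsigned char *buf = (const unsigned char *)sample_pos;

	for (unsigned id = 0; id < 4; id++) {
		float x = *(const float *)(buf + id * 8);     /* offset0 */
		float y = *(const float *)(buf + id * 8 + 4); /* offset1 */
		printf("sample %u: (%.3f, %.3f)\n", id, x, y);
	}
	return 0;
}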
get_sample_id(radeon_bld)); break; - } case TGSI_SEMANTIC_SAMPLEMASK: /* Smoothing isn't MSAA in GL, but it's MSAA in hardware. @@ -660,6 +1120,48 @@ static void declare_system_value( value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE); break; + case TGSI_SEMANTIC_TESSCOORD: + { + LLVMValueRef coord[4] = { + LLVMGetParam(radeon_bld->main_fn, si_shader_ctx->param_tes_u), + LLVMGetParam(radeon_bld->main_fn, si_shader_ctx->param_tes_v), + bld->zero, + bld->zero + }; + + /* For triangles, the vector should be (u, v, 1-u-v). */ + if (si_shader_ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == + PIPE_PRIM_TRIANGLES) + coord[2] = lp_build_sub(bld, bld->one, + lp_build_add(bld, coord[0], coord[1])); + + value = lp_build_gather_values(gallivm, coord, 4); + break; + } + + case TGSI_SEMANTIC_VERTICESIN: + value = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6); + break; + + case TGSI_SEMANTIC_TESSINNER: + case TGSI_SEMANTIC_TESSOUTER: + { + LLVMValueRef dw_addr; + int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0); + + dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx); + dw_addr = LLVMBuildAdd(gallivm->builder, dw_addr, + lp_build_const_int32(gallivm, param * 4), ""); + + value = lds_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT, + ~0, dw_addr); + break; + } + + case TGSI_SEMANTIC_PRIMID: + value = get_primitive_id(&radeon_bld->soa.bld_base, 0); + break; + default: assert(!"unknown system value"); return; @@ -679,7 +1181,7 @@ static LLVMValueRef fetch_constant( const struct tgsi_ind_register *ireg = &reg->Indirect; unsigned buf, idx; - LLVMValueRef addr; + LLVMValueRef addr, bufp; LLVMValueRef result; if (swizzle == LP_CHAN_ALL) { @@ -694,8 +1196,24 @@ static LLVMValueRef fetch_constant( buf = reg->Register.Dimension ? 
reg->Dimension.Index : 0; idx = reg->Register.Index * 4 + swizzle; - if (!reg->Register.Indirect) - return bitcast(bld_base, type, si_shader_ctx->constants[buf][idx]); + if (!reg->Register.Indirect && !reg->Dimension.Indirect) { + if (type != TGSI_TYPE_DOUBLE) + return bitcast(bld_base, type, si_shader_ctx->constants[buf][idx]); + else { + return radeon_llvm_emit_fetch_double(bld_base, + si_shader_ctx->constants[buf][idx], + si_shader_ctx->constants[buf][idx + 1]); + } + } + + if (reg->Register.Dimension && reg->Dimension.Indirect) { + LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST); + LLVMValueRef index; + index = get_indirect_index(si_shader_ctx, &reg->DimIndirect, + reg->Dimension.Index); + bufp = build_indexed_load_const(si_shader_ctx, ptr, index); + } else + bufp = si_shader_ctx->const_resource[buf]; addr = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle]; addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg"); @@ -703,10 +1221,26 @@ static LLVMValueRef fetch_constant( addr = lp_build_add(&bld_base->uint_bld, addr, lp_build_const_int32(base->gallivm, idx * 4)); - result = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_resource[buf], - addr, base->elem_type); + result = buffer_load_const(base->gallivm->builder, bufp, + addr, bld_base->base.elem_type); + + if (type != TGSI_TYPE_DOUBLE) + result = bitcast(bld_base, type, result); + else { + LLVMValueRef addr2, result2; + addr2 = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1]; + addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2"); + addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16); + addr2 = lp_build_add(&bld_base->uint_bld, addr2, + lp_build_const_int32(base->gallivm, idx * 4)); + + result2 = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_resource[buf], + addr2, bld_base->base.elem_type); - return bitcast(bld_base, type, result); + result = radeon_llvm_emit_fetch_double(bld_base, + result, result2); + } + return result; } /* Initialize arguments for the shader export intrinsic */ @@ -745,7 +1279,7 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, args[0] = values[2 * chan]; args[1] = values[2 * chan + 1]; args[chan + 5] = - build_intrinsic(base->gallivm->builder, + lp_build_intrinsic(base->gallivm->builder, "llvm.SI.packf16", LLVMInt32TypeInContext(base->gallivm->context), args, 2, @@ -827,12 +1361,12 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base, lp_build_const_float(gallivm, 1.0f), lp_build_const_float(gallivm, -1.0f)); - build_intrinsic(gallivm->builder, + lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill", LLVMVoidTypeInContext(gallivm->context), &arg, 1, 0); } else { - build_intrinsic(gallivm->builder, + lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kilp", LLVMVoidTypeInContext(gallivm->context), NULL, 0, 0); @@ -853,7 +1387,7 @@ static void si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base SI_PARAM_SAMPLE_COVERAGE); coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage); - coverage = build_intrinsic(gallivm->builder, "llvm.ctpop.i32", + coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32", bld_base->int_bld.elem_type, &coverage, 1, LLVMReadNoneAttribute); @@ -983,16 +1517,16 @@ static void build_tbuffer_store(struct si_shader_context *shader, lp_build_intrinsic(gallivm->builder, name, LLVMVoidTypeInContext(gallivm->context), - args, Elements(args)); + args, Elements(args), 0); } -static void 
build_streamout_store(struct si_shader_context *shader, - LLVMValueRef rsrc, - LLVMValueRef vdata, - unsigned num_channels, - LLVMValueRef vaddr, - LLVMValueRef soffset, - unsigned inst_offset) +static void build_tbuffer_store_dwords(struct si_shader_context *shader, + LLVMValueRef rsrc, + LLVMValueRef vdata, + unsigned num_channels, + LLVMValueRef vaddr, + LLVMValueRef soffset, + unsigned inst_offset) { static unsigned dfmt[] = { V_008F0C_BUF_DATA_FORMAT_32, @@ -1025,13 +1559,16 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader, LLVMValueRef so_vtx_count = unpack_param(shader, shader->param_streamout_config, 16, 7); - LLVMValueRef tid = build_intrinsic(builder, "llvm.SI.tid", i32, + LLVMValueRef tid = lp_build_intrinsic(builder, "llvm.SI.tid", i32, NULL, 0, LLVMReadNoneAttribute); /* can_emit = tid < so_vtx_count; */ LLVMValueRef can_emit = LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, ""); + LLVMValueRef stream_id = + unpack_param(shader, shader->param_streamout_config, 24, 2); + /* Emit the streamout code conditionally. This actually avoids * out-of-bounds buffer access. The hw tells us via the SGPR * (so_vtx_count) which threads are allowed to emit streamout data. */ @@ -1071,7 +1608,9 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader, unsigned reg = so->output[i].register_index; unsigned start = so->output[i].start_component; unsigned num_comps = so->output[i].num_components; + unsigned stream = so->output[i].stream; LLVMValueRef out[4]; + struct lp_build_if_state if_ctx_stream; assert(num_comps && num_comps <= 4); if (!num_comps || num_comps > 4) @@ -1105,11 +1644,18 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader, break; } - build_streamout_store(shader, shader->so_buffers[buf_idx], - vdata, num_comps, - so_write_offset[buf_idx], - LLVMConstInt(i32, 0, 0), - so->output[i].dst_offset*4); + LLVMValueRef can_emit_stream = + LLVMBuildICmp(builder, LLVMIntEQ, + stream_id, + lp_build_const_int32(gallivm, stream), ""); + + lp_build_if(&if_ctx_stream, gallivm, can_emit_stream); + build_tbuffer_store_dwords(shader, shader->so_buffers[buf_idx], + vdata, num_comps, + so_write_offset[buf_idx], + LLVMConstInt(i32, 0, 0), + so->output[i].dst_offset*4); + lp_build_endif(&if_ctx_stream); } } lp_build_endif(&if_ctx); @@ -1128,7 +1674,7 @@ static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base, &si_shader_ctx->radeon_bld.soa.bld_base.uint_bld; LLVMValueRef args[9]; LLVMValueRef pos_args[4][9] = { { 0 } }; - LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL; + LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL; unsigned semantic_name, semantic_index; unsigned target; unsigned param_count = 0; @@ -1154,7 +1700,12 @@ handle_semantic: continue; case TGSI_SEMANTIC_LAYER: layer_value = outputs[i].values[0]; - continue; + semantic_name = TGSI_SEMANTIC_GENERIC; + goto handle_semantic; + case TGSI_SEMANTIC_VIEWPORT_INDEX: + viewport_index_value = outputs[i].values[0]; + semantic_name = TGSI_SEMANTIC_GENERIC; + goto handle_semantic; case TGSI_SEMANTIC_POSITION: target = V_008DFC_SQ_EXP_POS; break; @@ -1195,7 +1746,7 @@ handle_semantic: lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", LLVMVoidTypeInContext(base->gallivm->context), - args, 9); + args, 9, 0); } if (semantic_name == TGSI_SEMANTIC_CLIPDIST) { @@ -1204,6 +1755,8 @@ handle_semantic: } } + shader->nr_param_exports = param_count; + /* We need to add the position output manually 
if it's missing. */ if (!pos_args[0][0]) { pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */ @@ -1220,11 +1773,13 @@ handle_semantic: /* Write the misc vector (point size, edgeflag, layer, viewport). */ if (shader->selector->info.writes_psize || shader->selector->info.writes_edgeflag || + shader->selector->info.writes_viewport_index || shader->selector->info.writes_layer) { pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */ shader->selector->info.writes_psize | (shader->selector->info.writes_edgeflag << 1) | - (shader->selector->info.writes_layer << 2)); + (shader->selector->info.writes_layer << 2) | + (shader->selector->info.writes_viewport_index << 3)); pos_args[1][1] = uint->zero; /* EXEC mask */ pos_args[1][2] = uint->zero; /* last export? */ pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1); @@ -1255,6 +1810,9 @@ handle_semantic: if (shader->selector->info.writes_layer) pos_args[1][7] = layer_value; + + if (shader->selector->info.writes_viewport_index) + pos_args[1][8] = viewport_index_value; } for (i = 0; i < 4; i++) @@ -1276,7 +1834,133 @@ handle_semantic: lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", LLVMVoidTypeInContext(base->gallivm->context), - pos_args[i], 9); + pos_args[i], 9, 0); + } +} + +/* This only writes the tessellation factor levels. */ +static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct gallivm_state *gallivm = bld_base->base.gallivm; + struct si_shader *shader = si_shader_ctx->shader; + unsigned tess_inner_index, tess_outer_index; + LLVMValueRef lds_base, lds_inner, lds_outer; + LLVMValueRef tf_base, rel_patch_id, byteoffset, buffer, rw_buffers; + LLVMValueRef out[6], vec0, vec1, invocation_id; + unsigned stride, outer_comps, inner_comps, i; + struct lp_build_if_state if_ctx; + + invocation_id = unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 8, 5); + + /* Do this only for invocation 0, because the tess levels are per-patch, + * not per-vertex. + * + * This can't jump, because invocation 0 executes this. It should + * at least mask out the loads and stores for other invocations. + */ + lp_build_if(&if_ctx, gallivm, + LLVMBuildICmp(gallivm->builder, LLVMIntEQ, + invocation_id, bld_base->uint_bld.zero, "")); + + /* Determine the layout of one tess factor element in the buffer. */ + switch (shader->key.tcs.prim_mode) { + case PIPE_PRIM_LINES: + stride = 2; /* 2 dwords, 1 vec2 store */ + outer_comps = 2; + inner_comps = 0; + break; + case PIPE_PRIM_TRIANGLES: + stride = 4; /* 4 dwords, 1 vec4 store */ + outer_comps = 3; + inner_comps = 1; + break; + case PIPE_PRIM_QUADS: + stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */ + outer_comps = 4; + inner_comps = 2; + break; + default: + assert(0); + return; + } + + /* Load tess_inner and tess_outer from LDS. + * Any invocation can write them, so we can't get them from a temporary. 
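+ * e.g. for PIPE_PRIM_TRIANGLES (stride = 4) the levels end up as
+ * out[0..2] = outer and out[3] = inner, written below with a single
+ * vec4 store at tf_base + rel_patch_id * 16 bytes.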
+ */ + tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0); + tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0); + + lds_base = get_tcs_out_current_patch_data_offset(si_shader_ctx); + lds_inner = LLVMBuildAdd(gallivm->builder, lds_base, + lp_build_const_int32(gallivm, + tess_inner_index * 4), ""); + lds_outer = LLVMBuildAdd(gallivm->builder, lds_base, + lp_build_const_int32(gallivm, + tess_outer_index * 4), ""); + + for (i = 0; i < outer_comps; i++) + out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer); + for (i = 0; i < inner_comps; i++) + out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner); + + /* Convert the outputs to vectors for stores. */ + vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4)); + vec1 = NULL; + + if (stride > 4) + vec1 = lp_build_gather_values(gallivm, out+4, stride - 4); + + /* Get the buffer. */ + rw_buffers = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + SI_PARAM_RW_BUFFERS); + buffer = build_indexed_load_const(si_shader_ctx, rw_buffers, + lp_build_const_int32(gallivm, SI_RING_TESS_FACTOR)); + + /* Get the offset. */ + tf_base = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + SI_PARAM_TESS_FACTOR_OFFSET); + rel_patch_id = get_rel_patch_id(si_shader_ctx); + byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id, + lp_build_const_int32(gallivm, 4 * stride), ""); + + /* Store the outputs. */ + build_tbuffer_store_dwords(si_shader_ctx, buffer, vec0, + MIN2(stride, 4), byteoffset, tf_base, 0); + if (vec1) + build_tbuffer_store_dwords(si_shader_ctx, buffer, vec1, + stride - 4, byteoffset, tf_base, 16); + lp_build_endif(&if_ctx); +} + +static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context * bld_base) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader *shader = si_shader_ctx->shader; + struct tgsi_shader_info *info = &shader->selector->info; + struct gallivm_state *gallivm = bld_base->base.gallivm; + unsigned i, chan; + LLVMValueRef vertex_id = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + si_shader_ctx->param_rel_auto_id); + LLVMValueRef vertex_dw_stride = + unpack_param(si_shader_ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8); + LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id, + vertex_dw_stride, ""); + + /* Write outputs to LDS. The next shader (TCS aka HS) will read + * its inputs from it. */ + for (i = 0; i < info->num_outputs; i++) { + LLVMValueRef *out_ptr = si_shader_ctx->radeon_bld.soa.outputs[i]; + unsigned name = info->output_semantic_name[i]; + unsigned index = info->output_semantic_index[i]; + int param = si_shader_io_get_unique_index(name, index); + LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr, + lp_build_const_int32(gallivm, param * 4), ""); + + for (chan = 0; chan < 4; chan++) { + lds_store(bld_base, chan, dw_addr, + LLVMBuildLoad(gallivm->builder, out_ptr[chan], "")); + } } } @@ -1288,17 +1972,25 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base) struct tgsi_shader_info *info = &es->selector->info; LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context); LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, - SI_PARAM_ES2GS_OFFSET); + si_shader_ctx->param_es2gs_offset); + uint64_t enabled_outputs = si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL ? 
+ es->key.tes.es_enabled_outputs : + es->key.vs.es_enabled_outputs; unsigned chan; int i; for (i = 0; i < info->num_outputs; i++) { LLVMValueRef *out_ptr = si_shader_ctx->radeon_bld.soa.outputs[i]; - int param_index = get_param_index(info->output_semantic_name[i], - info->output_semantic_index[i], - es->key.vs.gs_used_inputs); + int param_index; + if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX || + info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER) + continue; + + param_index = get_param_index(info->output_semantic_name[i], + info->output_semantic_index[i], + enabled_outputs); if (param_index < 0) continue; @@ -1326,7 +2018,7 @@ static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base) args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE); args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID); - build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", + lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", LLVMVoidTypeInContext(gallivm->context), args, 2, LLVMNoUnwindAttribute); } @@ -1339,7 +2031,7 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base) struct si_shader_output_values *outputs = NULL; int i,j; - outputs = MALLOC(info->num_outputs * sizeof(outputs[0])); + outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0])); for (i = 0; i < info->num_outputs; i++) { outputs[i].name = info->output_semantic_name[i]; @@ -1352,7 +2044,19 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base) ""); } - si_llvm_export_vs(bld_base, outputs, info->num_outputs); + /* Export PrimitiveID when PS needs it. */ + if (si_vs_exports_prim_id(si_shader_ctx->shader)) { + outputs[i].name = TGSI_SEMANTIC_PRIMID; + outputs[i].sid = 0; + outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, + get_primitive_id(bld_base, 0)); + outputs[i].values[1] = bld_base->base.undef; + outputs[i].values[2] = bld_base->base.undef; + outputs[i].values[3] = bld_base->base.undef; + i++; + } + + si_llvm_export_vs(bld_base, outputs, i); FREE(outputs); } @@ -1417,7 +2121,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base) lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", LLVMVoidTypeInContext(base->gallivm->context), - last_args, 9); + last_args, 9, 0); } /* This instruction will be emitted at the end of the shader. 
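 * (args[2] of llvm.SI.export is the "done"/last-export flag -- cf. the
 * "last export?" note on pos_args earlier -- which is why the closing
 * export is kept in last_args and only emitted once all others are out.)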
*/ @@ -1434,14 +2138,14 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base) lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", LLVMVoidTypeInContext(base->gallivm->context), - args, 9); + args, 9, 0); } } } else { lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", LLVMVoidTypeInContext(base->gallivm->context), - args, 9); + args, 9, 0); } } @@ -1503,7 +2207,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base) lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", LLVMVoidTypeInContext(base->gallivm->context), - args, 9); + args, 9, 0); else memcpy(last_args, args, sizeof(args)); } @@ -1534,7 +2238,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base) lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", LLVMVoidTypeInContext(base->gallivm->context), - last_args, 9); + last_args, 9, 0); } static void build_tex_intrinsic(const struct lp_build_tgsi_action * action, @@ -1563,15 +2267,36 @@ static void tex_fetch_args( const struct tgsi_full_instruction * inst = emit_data->inst; unsigned opcode = inst->Instruction.Opcode; unsigned target = inst->Texture.Texture; - LLVMValueRef coords[5]; + LLVMValueRef coords[5], derivs[6]; LLVMValueRef address[16]; int ref_pos; unsigned num_coords = tgsi_util_get_texture_coord_dim(target, &ref_pos); unsigned count = 0; unsigned chan; - unsigned sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1; - unsigned sampler_index = emit_data->inst->Src[sampler_src].Register.Index; + unsigned sampler_src; + unsigned sampler_index; + unsigned num_deriv_channels = 0; bool has_offset = HAVE_LLVM >= 0x0305 ? inst->Texture.NumOffsets > 0 : false; + LLVMValueRef res_ptr, samp_ptr; + + sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1; + sampler_index = emit_data->inst->Src[sampler_src].Register.Index; + + if (emit_data->inst->Src[sampler_src].Register.Indirect) { + const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src]; + LLVMValueRef ind_index; + + ind_index = get_indirect_index(si_shader_ctx, ®->Indirect, reg->Register.Index); + + res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE); + res_ptr = build_indexed_load_const(si_shader_ctx, res_ptr, ind_index); + + samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER); + samp_ptr = build_indexed_load_const(si_shader_ctx, samp_ptr, ind_index); + } else { + res_ptr = si_shader_ctx->resources[sampler_index]; + samp_ptr = si_shader_ctx->samplers[sampler_index]; + } if (target == TGSI_TEXTURE_BUFFER) { LLVMTypeRef i128 = LLVMIntTypeInContext(gallivm->context, 128); @@ -1580,7 +2305,7 @@ static void tex_fetch_args( LLVMTypeRef v16i8 = LLVMVectorType(i8, 16); /* Bitcast and truncate v8i32 to v16i8. 
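 * The 256-bit sampler view is reinterpreted as v2i128; element 1 selects
 * its upper four dwords, which hold the 128-bit buffer descriptor that
 * llvm.SI.vs.load.input consumes.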
*/ - LLVMValueRef res = si_shader_ctx->resources[sampler_index]; + LLVMValueRef res = res_ptr; res = LLVMBuildBitCast(gallivm->builder, res, v2i128, ""); res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, ""); res = LLVMBuildBitCast(gallivm->builder, res, v16i8, ""); @@ -1649,18 +2374,13 @@ static void tex_fetch_args( } } - if (target == TGSI_TEXTURE_CUBE || - target == TGSI_TEXTURE_CUBE_ARRAY || - target == TGSI_TEXTURE_SHADOWCUBE || - target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) - radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords); - /* Pack user derivatives */ if (opcode == TGSI_OPCODE_TXD) { - int num_deriv_channels, param; + int param, num_src_deriv_channels; switch (target) { case TGSI_TEXTURE_3D: + num_src_deriv_channels = 3; num_deriv_channels = 3; break; case TGSI_TEXTURE_2D: @@ -1669,27 +2389,44 @@ static void tex_fetch_args( case TGSI_TEXTURE_SHADOWRECT: case TGSI_TEXTURE_2D_ARRAY: case TGSI_TEXTURE_SHADOW2D_ARRAY: + num_src_deriv_channels = 2; + num_deriv_channels = 2; + break; case TGSI_TEXTURE_CUBE: case TGSI_TEXTURE_SHADOWCUBE: case TGSI_TEXTURE_CUBE_ARRAY: case TGSI_TEXTURE_SHADOWCUBE_ARRAY: + /* Cube derivatives will be converted to 2D. */ + num_src_deriv_channels = 3; num_deriv_channels = 2; break; case TGSI_TEXTURE_1D: case TGSI_TEXTURE_SHADOW1D: case TGSI_TEXTURE_1D_ARRAY: case TGSI_TEXTURE_SHADOW1D_ARRAY: + num_src_deriv_channels = 1; num_deriv_channels = 1; break; default: assert(0); /* no other targets are valid here */ } - for (param = 1; param <= 2; param++) - for (chan = 0; chan < num_deriv_channels; chan++) - address[count++] = lp_build_emit_fetch(bld_base, inst, param, chan); + for (param = 0; param < 2; param++) + for (chan = 0; chan < num_src_deriv_channels; chan++) + derivs[param * num_src_deriv_channels + chan] = + lp_build_emit_fetch(bld_base, inst, param+1, chan); } + if (target == TGSI_TEXTURE_CUBE || + target == TGSI_TEXTURE_CUBE_ARRAY || + target == TGSI_TEXTURE_SHADOWCUBE || + target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) + radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, derivs); + + if (opcode == TGSI_OPCODE_TXD) + for (int i = 0; i < num_deriv_channels * 2; i++) + address[count++] = derivs[i]; + /* Pack texture coordinates */ address[count++] = coords[0]; if (num_coords > 1) @@ -1806,7 +2543,7 @@ static void tex_fetch_args( } /* Resource */ - emit_data->args[1] = si_shader_ctx->resources[sampler_index]; + emit_data->args[1] = res_ptr; if (opcode == TGSI_OPCODE_TXF) { /* add tex offsets */ @@ -1889,7 +2626,7 @@ static void tex_fetch_args( dmask = 1 << gather_comp; } - emit_data->args[2] = si_shader_ctx->samplers[sampler_index]; + emit_data->args[2] = samp_ptr; emit_data->args[3] = lp_build_const_int32(gallivm, dmask); emit_data->args[4] = lp_build_const_int32(gallivm, is_rect); /* unorm */ emit_data->args[5] = lp_build_const_int32(gallivm, 0); /* r128 */ @@ -1905,7 +2642,7 @@ static void tex_fetch_args( LLVMFloatTypeInContext(gallivm->context), 4); } else { - emit_data->args[2] = si_shader_ctx->samplers[sampler_index]; + emit_data->args[2] = samp_ptr; emit_data->args[3] = lp_build_const_int32(gallivm, target); emit_data->arg_count = 4; @@ -1940,7 +2677,7 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action * action, emit_data->inst->Texture.NumOffsets > 0 : false; if (target == TGSI_TEXTURE_BUFFER) { - emit_data->output[emit_data->chan] = build_intrinsic( + emit_data->output[emit_data->chan] = lp_build_intrinsic( base->gallivm->builder, "llvm.SI.vs.load.input", emit_data->dst_type, 
emit_data->args, emit_data->arg_count, @@ -1989,7 +2726,7 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action * action, is_shadow ? ".c" : "", infix, has_offset ? ".o" : "", LLVMGetVectorSize(LLVMTypeOf(emit_data->args[0]))); - emit_data->output[emit_data->chan] = build_intrinsic( + emit_data->output[emit_data->chan] = lp_build_intrinsic( base->gallivm->builder, intr_name, emit_data->dst_type, emit_data->args, emit_data->arg_count, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); @@ -2036,7 +2773,7 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action * action, sprintf(intr_name, "%s.v%ui32", name, LLVMGetVectorSize(LLVMTypeOf(emit_data->args[0]))); - emit_data->output[emit_data->chan] = build_intrinsic( + emit_data->output[emit_data->chan] = lp_build_intrinsic( base->gallivm->builder, intr_name, emit_data->dst_type, emit_data->args, emit_data->arg_count, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); @@ -2050,17 +2787,47 @@ static void txq_fetch_args( struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); const struct tgsi_full_instruction *inst = emit_data->inst; struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; unsigned target = inst->Texture.Texture; + LLVMValueRef res_ptr; + + if (inst->Src[1].Register.Indirect) { + const struct tgsi_full_src_register *reg = &inst->Src[1]; + LLVMValueRef ind_index; + + ind_index = get_indirect_index(si_shader_ctx, ®->Indirect, reg->Register.Index); + + res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE); + res_ptr = build_indexed_load_const(si_shader_ctx, res_ptr, + ind_index); + } else + res_ptr = si_shader_ctx->resources[inst->Src[1].Register.Index]; if (target == TGSI_TEXTURE_BUFFER) { LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context); LLVMTypeRef v8i32 = LLVMVectorType(i32, 8); /* Read the size from the buffer descriptor directly. */ - LLVMValueRef size = si_shader_ctx->resources[inst->Src[1].Register.Index]; - size = LLVMBuildBitCast(gallivm->builder, size, v8i32, ""); - size = LLVMBuildExtractElement(gallivm->builder, size, - lp_build_const_int32(gallivm, 6), ""); + LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, v8i32, ""); + LLVMValueRef size = LLVMBuildExtractElement(builder, res, + lp_build_const_int32(gallivm, 6), ""); + + if (si_shader_ctx->screen->b.chip_class >= VI) { + /* On VI, the descriptor contains the size in bytes, + * but TXQ must return the size in elements. + * The stride is always non-zero for resources using TXQ. + */ + LLVMValueRef stride = + LLVMBuildExtractElement(builder, res, + lp_build_const_int32(gallivm, 5), ""); + stride = LLVMBuildLShr(builder, stride, + lp_build_const_int32(gallivm, 16), ""); + stride = LLVMBuildAnd(builder, stride, + lp_build_const_int32(gallivm, 0x3FFF), ""); + + size = LLVMBuildUDiv(builder, size, stride, ""); + } + emit_data->args[0] = size; return; } @@ -2069,7 +2836,7 @@ static void txq_fetch_args( emit_data->args[0] = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X); /* Resource */ - emit_data->args[1] = si_shader_ctx->resources[inst->Src[1].Register.Index]; + emit_data->args[1] = res_ptr; /* Texture target */ if (target == TGSI_TEXTURE_CUBE_ARRAY || @@ -2116,6 +2883,35 @@ static void build_txq_intrinsic(const struct lp_build_tgsi_action * action, } } +/* + * SI implements derivatives using the local data store (LDS) + * All writes to the LDS happen in all executing threads at + * the same time. 
TID is the Thread ID for the current + * thread and is a value between 0 and 63, representing + * the thread's position in the wavefront. + * + * For the pixel shader, threads are grouped into quads of four pixels. + * The TIDs of the pixels of a quad are: + * + * +------+------+ + * |4n + 0|4n + 1| + * +------+------+ + * |4n + 2|4n + 3| + * +------+------+ + * + * So, masking the TID with 0xfffffffc yields the TID of the top left pixel + * of the quad, masking with 0xfffffffd yields the TID of the top pixel of + * the current pixel's column, and masking with 0xfffffffe yields the TID + * of the left pixel of the current pixel's row. + * + * Adding 1 yields the TID of the pixel to the right of the left pixel, and + * adding 2 yields the TID of the pixel below the top pixel. + */ +/* masks for thread ID. */ +#define TID_MASK_TOP_LEFT 0xfffffffc +#define TID_MASK_TOP 0xfffffffd +#define TID_MASK_LEFT 0xfffffffe + static void si_llvm_emit_ddxy( const struct lp_build_tgsi_action * action, struct lp_build_tgsi_context * bld_base, @@ -2132,25 +2928,34 @@ static void si_llvm_emit_ddxy( LLVMTypeRef i32; unsigned swizzle[4]; unsigned c; + int idx; + unsigned mask; i32 = LLVMInt32TypeInContext(gallivm->context); indices[0] = bld_base->uint_bld.zero; - indices[1] = build_intrinsic(gallivm->builder, "llvm.SI.tid", i32, + indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", i32, NULL, 0, LLVMReadNoneAttribute); - store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->ddxy_lds, + store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, indices, 2, ""); + if (opcode == TGSI_OPCODE_DDX_FINE) + mask = TID_MASK_LEFT; + else if (opcode == TGSI_OPCODE_DDY_FINE) + mask = TID_MASK_TOP; + else + mask = TID_MASK_TOP_LEFT; + indices[1] = LLVMBuildAnd(gallivm->builder, indices[1], - lp_build_const_int32(gallivm, 0xfffffffc), ""); - load_ptr0 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->ddxy_lds, + lp_build_const_int32(gallivm, mask), ""); + load_ptr0 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, indices, 2, ""); + /* For DDX we want the next X pixel, for DDY the next Y pixel. */ + idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2; indices[1] = LLVMBuildAdd(gallivm->builder, indices[1], - lp_build_const_int32(gallivm, - opcode == TGSI_OPCODE_DDX ? 1 : 2), - ""); - load_ptr1 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->ddxy_lds, + lp_build_const_int32(gallivm, idx), ""); + load_ptr1 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, indices, 2, ""); for (c = 0; c < 4; ++c) { @@ -2184,6 +2989,247 @@ static void si_llvm_emit_ddxy( emit_data->output[0] = lp_build_gather_values(gallivm, result, 4); } +/* + * This takes an I,J coordinate pair, + * and works out the X and Y derivatives. + * It returns DDX(I), DDX(J), DDY(I), DDY(J).
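+ * The method mirrors si_llvm_emit_ddxy above: each thread stores its
+ * value to LDS at its TID, then per channel
+ *   DDX = lds[(tid & TID_MASK_LEFT) + 1] - lds[tid & TID_MASK_LEFT]
+ *   DDY = lds[(tid & TID_MASK_TOP) + 2] - lds[tid & TID_MASK_TOP]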
+ */ +static LLVMValueRef si_llvm_emit_ddxy_interp( + struct lp_build_tgsi_context *bld_base, + LLVMValueRef interp_ij) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct gallivm_state *gallivm = bld_base->base.gallivm; + struct lp_build_context *base = &bld_base->base; + LLVMValueRef indices[2]; + LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2; + LLVMValueRef tl, tr, bl, result[4]; + LLVMTypeRef i32; + unsigned c; + + i32 = LLVMInt32TypeInContext(gallivm->context); + + indices[0] = bld_base->uint_bld.zero; + indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", i32, + NULL, 0, LLVMReadNoneAttribute); + store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, + indices, 2, ""); + + temp = LLVMBuildAnd(gallivm->builder, indices[1], + lp_build_const_int32(gallivm, TID_MASK_LEFT), ""); + + temp2 = LLVMBuildAnd(gallivm->builder, indices[1], + lp_build_const_int32(gallivm, TID_MASK_TOP), ""); + + indices[1] = temp; + load_ptr_x = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, + indices, 2, ""); + + indices[1] = temp2; + load_ptr_y = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, + indices, 2, ""); + + indices[1] = LLVMBuildAdd(gallivm->builder, temp, + lp_build_const_int32(gallivm, 1), ""); + load_ptr_ddx = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, + indices, 2, ""); + + indices[1] = LLVMBuildAdd(gallivm->builder, temp2, + lp_build_const_int32(gallivm, 2), ""); + load_ptr_ddy = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, + indices, 2, ""); + + for (c = 0; c < 2; ++c) { + LLVMValueRef store_val; + LLVMValueRef c_ll = lp_build_const_int32(gallivm, c); + + store_val = LLVMBuildExtractElement(gallivm->builder, + interp_ij, c_ll, ""); + LLVMBuildStore(gallivm->builder, + store_val, + store_ptr); + + tl = LLVMBuildLoad(gallivm->builder, load_ptr_x, ""); + tl = LLVMBuildBitCast(gallivm->builder, tl, base->elem_type, ""); + + tr = LLVMBuildLoad(gallivm->builder, load_ptr_ddx, ""); + tr = LLVMBuildBitCast(gallivm->builder, tr, base->elem_type, ""); + + result[c] = LLVMBuildFSub(gallivm->builder, tr, tl, ""); + + tl = LLVMBuildLoad(gallivm->builder, load_ptr_y, ""); + tl = LLVMBuildBitCast(gallivm->builder, tl, base->elem_type, ""); + + bl = LLVMBuildLoad(gallivm->builder, load_ptr_ddy, ""); + bl = LLVMBuildBitCast(gallivm->builder, bl, base->elem_type, ""); + + result[c + 2] = LLVMBuildFSub(gallivm->builder, bl, tl, ""); + } + + return lp_build_gather_values(gallivm, result, 4); +} + +static void interp_fetch_args( + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct gallivm_state *gallivm = bld_base->base.gallivm; + const struct tgsi_full_instruction *inst = emit_data->inst; + + if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) { + /* offset is in second src, first two channels */ + emit_data->args[0] = lp_build_emit_fetch(bld_base, + emit_data->inst, 1, + 0); + emit_data->args[1] = lp_build_emit_fetch(bld_base, + emit_data->inst, 1, + 1); + emit_data->arg_count = 2; + } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { + LLVMValueRef sample_position; + LLVMValueRef sample_id; + LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f); + + /* fetch sample ID, then fetch its sample position, + * and place into first two channels. 
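+ * For sample s this yields
+ *   args[0] = sample_position[s].x - 0.5
+ *   args[1] = sample_position[s].y - 0.5
+ * i.e. an offset from the pixel center, the same form that
+ * TGSI_OPCODE_INTERP_OFFSET supplies directly.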
+ */ + sample_id = lp_build_emit_fetch(bld_base, + emit_data->inst, 1, 0); + sample_id = LLVMBuildBitCast(gallivm->builder, sample_id, + LLVMInt32TypeInContext(gallivm->context), + ""); + sample_position = load_sample_position(&si_shader_ctx->radeon_bld, sample_id); + + emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder, + sample_position, + lp_build_const_int32(gallivm, 0), ""); + + emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, ""); + emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder, + sample_position, + lp_build_const_int32(gallivm, 1), ""); + emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, ""); + emit_data->arg_count = 2; + } +} + +static void build_interp_intrinsic(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader *shader = si_shader_ctx->shader; + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMValueRef interp_param; + const struct tgsi_full_instruction *inst = emit_data->inst; + const char *intr_name; + int input_index; + int chan; + int i; + LLVMValueRef attr_number; + LLVMTypeRef input_type = LLVMFloatTypeInContext(gallivm->context); + LLVMValueRef params = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK); + int interp_param_idx; + unsigned location; + + assert(inst->Src[0].Register.File == TGSI_FILE_INPUT); + input_index = inst->Src[0].Register.Index; + + if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || + inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) + location = TGSI_INTERPOLATE_LOC_CENTER; + else + location = TGSI_INTERPOLATE_LOC_CENTROID; + + interp_param_idx = lookup_interp_param_index(shader->ps_input_interpolate[input_index], + location); + if (interp_param_idx == -1) + return; + else if (interp_param_idx) + interp_param = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, interp_param_idx); + else + interp_param = NULL; + + attr_number = lp_build_const_int32(gallivm, + shader->ps_input_param_offset[input_index]); + + if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || + inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { + LLVMValueRef ij_out[2]; + LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param); + + /* + * take the I then J parameters, and the DDX/Y for it, and + * calculate the IJ inputs for the interpolator. 
+ * temp1 = ddx * offset/sample.x + I; + * interp_param.I = ddy * offset/sample.y + temp1; + * temp1 = ddx * offset/sample.x + J; + * interp_param.J = ddy * offset/sample.y + temp1; + */ + for (i = 0; i < 2; i++) { + LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i); + LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2); + LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder, + ddxy_out, ix_ll, ""); + LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder, + ddxy_out, iy_ll, ""); + LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder, + interp_param, ix_ll, ""); + LLVMValueRef temp1, temp2; + + interp_el = LLVMBuildBitCast(gallivm->builder, interp_el, + LLVMFloatTypeInContext(gallivm->context), ""); + + temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], ""); + + temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, ""); + + temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], ""); + + temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, ""); + + ij_out[i] = LLVMBuildBitCast(gallivm->builder, + temp2, + LLVMIntTypeInContext(gallivm->context, 32), ""); + } + interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2); + } + + intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant"; + for (chan = 0; chan < 2; chan++) { + LLVMValueRef args[4]; + LLVMValueRef llvm_chan; + unsigned schan; + + schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan); + llvm_chan = lp_build_const_int32(gallivm, schan); + + args[0] = llvm_chan; + args[1] = attr_number; + args[2] = params; + args[3] = interp_param; + + emit_data->output[chan] = + lp_build_intrinsic(gallivm->builder, intr_name, + input_type, args, args[3] ? 4 : 3, + LLVMReadNoneAttribute | LLVMNoUnwindAttribute); + } +} + +static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates; + struct tgsi_src_register src0 = emit_data->inst->Src[0].Register; + unsigned stream; + + assert(src0.File == TGSI_FILE_IMMEDIATE); + + stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3; + return stream; +} + /* Emit one vertex from the geometry shader */ static void si_llvm_emit_vertex( const struct lp_build_tgsi_action *action, @@ -2203,9 +3249,14 @@ static void si_llvm_emit_vertex( LLVMValueRef args[2]; unsigned chan; int i; + unsigned stream; + + stream = si_llvm_get_stream(bld_base, emit_data); /* Write vertex attribute values to GSVS ring */ - gs_next_vertex = LLVMBuildLoad(gallivm->builder, si_shader_ctx->gs_next_vertex, ""); + gs_next_vertex = LLVMBuildLoad(gallivm->builder, + si_shader_ctx->gs_next_vertex[stream], + ""); /* If this thread has already emitted the declared maximum number of * vertices, kill it: excessive vertex emissions are not supposed to @@ -2218,8 +3269,9 @@ static void si_llvm_emit_vertex( kill = lp_build_select(&bld_base->base, can_emit, lp_build_const_float(gallivm, 1.0f), lp_build_const_float(gallivm, -1.0f)); - build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill", - LLVMVoidTypeInContext(gallivm->context), &kill, 1, 0); + + lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill", + LLVMVoidTypeInContext(gallivm->context), &kill, 1, 0); for (i = 0; i < info->num_outputs; i++) { LLVMValueRef *out_ptr = @@ -2237,7 +3289,7 @@ static void si_llvm_emit_vertex( out_val = LLVMBuildBitCast(gallivm->builder, out_val, i32, ""); build_tbuffer_store(si_shader_ctx, - si_shader_ctx->gsvs_ring, 
+ si_shader_ctx->gsvs_ring[stream], out_val, 1, voffset, soffset, 0, V_008F0C_BUF_DATA_FORMAT_32, @@ -2247,12 +3299,13 @@ static void si_llvm_emit_vertex( } gs_next_vertex = lp_build_add(uint, gs_next_vertex, lp_build_const_int32(gallivm, 1)); - LLVMBuildStore(gallivm->builder, gs_next_vertex, si_shader_ctx->gs_next_vertex); + + LLVMBuildStore(gallivm->builder, gs_next_vertex, si_shader_ctx->gs_next_vertex[stream]); /* Signal vertex emission */ - args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS); + args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8)); args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID); - build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", + lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", LLVMVoidTypeInContext(gallivm->context), args, 2, LLVMNoUnwindAttribute); } @@ -2266,15 +3319,28 @@ static void si_llvm_emit_primitive( struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMValueRef args[2]; + unsigned stream; /* Signal primitive cut */ - args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS); + stream = si_llvm_get_stream(bld_base, emit_data); + args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8)); args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID); - build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", + lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", LLVMVoidTypeInContext(gallivm->context), args, 2, LLVMNoUnwindAttribute); } +static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + + lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.barrier.local", + LLVMVoidTypeInContext(gallivm->context), NULL, 0, + LLVMNoUnwindAttribute); +} + static const struct lp_build_tgsi_action tex_action = { .fetch_args = tex_fetch_args, .emit = build_tex_intrinsic, @@ -2286,6 +3352,11 @@ static const struct lp_build_tgsi_action txq_action = { .intr_name = "llvm.SI.resinfo" }; +static const struct lp_build_tgsi_action interp_action = { + .fetch_args = interp_fetch_args, + .emit = build_interp_intrinsic, +}; + static void create_meta_data(struct si_shader_context *si_shader_ctx) { struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm; @@ -2304,6 +3375,27 @@ static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements) CONST_ADDR_SPACE); } +static void declare_streamout_params(struct si_shader_context *si_shader_ctx, + struct pipe_stream_output_info *so, + LLVMTypeRef *params, LLVMTypeRef i32, + unsigned *num_params) +{ + int i; + + /* Streamout SGPRs. */ + if (so->num_outputs) { + params[si_shader_ctx->param_streamout_config = (*num_params)++] = i32; + params[si_shader_ctx->param_streamout_write_index = (*num_params)++] = i32; + } + /* A streamout buffer offset is loaded if the stride is non-zero. 
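+ * e.g. a shader writing to one streamout buffer (only stride[0]
+ * non-zero) receives three SGPRs here: streamout_config,
+ * streamout_write_index and streamout_offset[0].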
*/ + for (i = 0; i < 4; i++) { + if (!so->stride[i]) + continue; + + params[si_shader_ctx->param_streamout_offset[i] = (*num_params)++] = i32; + } +} + static void create_function(struct si_shader_context *si_shader_ctx) { struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base; @@ -2336,8 +3428,10 @@ static void create_function(struct si_shader_context *si_shader_ctx) num_params = SI_PARAM_START_INSTANCE+1; if (shader->key.vs.as_es) { - params[SI_PARAM_ES2GS_OFFSET] = i32; - num_params++; + params[si_shader_ctx->param_es2gs_offset = num_params++] = i32; + } else if (shader->key.vs.as_ls) { + params[SI_PARAM_LS_OUT_LAYOUT] = i32; + num_params = SI_PARAM_LS_OUT_LAYOUT+1; } else { if (shader->is_gs_copy_shader) { last_array_pointer = SI_PARAM_CONST; @@ -2345,30 +3439,52 @@ static void create_function(struct si_shader_context *si_shader_ctx) } /* The locations of the other parameters are assigned dynamically. */ - - /* Streamout SGPRs. */ - if (shader->selector->so.num_outputs) { - params[si_shader_ctx->param_streamout_config = num_params++] = i32; - params[si_shader_ctx->param_streamout_write_index = num_params++] = i32; - } - /* A streamout buffer offset is loaded if the stride is non-zero. */ - for (i = 0; i < 4; i++) { - if (!shader->selector->so.stride[i]) - continue; - - params[si_shader_ctx->param_streamout_offset[i] = num_params++] = i32; - } + declare_streamout_params(si_shader_ctx, &shader->selector->so, + params, i32, &num_params); } last_sgpr = num_params-1; /* VGPRs */ params[si_shader_ctx->param_vertex_id = num_params++] = i32; - params[num_params++] = i32; /* unused*/ - params[num_params++] = i32; /* unused */ + params[si_shader_ctx->param_rel_auto_id = num_params++] = i32; + params[si_shader_ctx->param_vs_prim_id = num_params++] = i32; params[si_shader_ctx->param_instance_id = num_params++] = i32; break; + case TGSI_PROCESSOR_TESS_CTRL: + params[SI_PARAM_TCS_OUT_OFFSETS] = i32; + params[SI_PARAM_TCS_OUT_LAYOUT] = i32; + params[SI_PARAM_TCS_IN_LAYOUT] = i32; + params[SI_PARAM_TESS_FACTOR_OFFSET] = i32; + last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET; + + /* VGPRs */ + params[SI_PARAM_PATCH_ID] = i32; + params[SI_PARAM_REL_IDS] = i32; + num_params = SI_PARAM_REL_IDS+1; + break; + + case TGSI_PROCESSOR_TESS_EVAL: + params[SI_PARAM_TCS_OUT_OFFSETS] = i32; + params[SI_PARAM_TCS_OUT_LAYOUT] = i32; + num_params = SI_PARAM_TCS_OUT_LAYOUT+1; + + if (shader->key.tes.as_es) { + params[si_shader_ctx->param_es2gs_offset = num_params++] = i32; + } else { + declare_streamout_params(si_shader_ctx, &shader->selector->so, + params, i32, &num_params); + } + last_sgpr = num_params - 1; + + /* VGPRs */ + params[si_shader_ctx->param_tes_u = num_params++] = f32; + params[si_shader_ctx->param_tes_v = num_params++] = f32; + params[si_shader_ctx->param_tes_rel_patch_id = num_params++] = i32; + params[si_shader_ctx->param_tes_patch_id = num_params++] = i32; + break; + case TGSI_PROCESSOR_GEOMETRY: params[SI_PARAM_GS2VS_OFFSET] = i32; params[SI_PARAM_GS_WAVE_ID] = i32; @@ -2435,12 +3551,35 @@ static void create_function(struct si_shader_context *si_shader_ctx) if (bld_base->info && (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 || - bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0)) - si_shader_ctx->ddxy_lds = + bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 || + bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 || + bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 || + bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 || + 
bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0)) + si_shader_ctx->lds = LLVMAddGlobalInAddressSpace(gallivm->module, LLVMArrayType(i32, 64), "ddxy_lds", LOCAL_ADDR_SPACE); + + if ((si_shader_ctx->type == TGSI_PROCESSOR_VERTEX && shader->key.vs.as_ls) || + si_shader_ctx->type == TGSI_PROCESSOR_TESS_CTRL || + si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL) { + /* This is the upper bound, maximum is 32 inputs times 32 vertices */ + unsigned vertex_data_dw_size = 32*32*4; + unsigned patch_data_dw_size = 32*4; + /* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */ + unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size; + unsigned lds_dwords = patch_dw_size; + + /* The actual size is computed outside of the shader to reduce + * the number of shader variants. */ + si_shader_ctx->lds = + LLVMAddGlobalInAddressSpace(gallivm->module, + LLVMArrayType(i32, lds_dwords), + "tess_lds", + LOCAL_ADDR_SPACE); + } } static void preload_constants(struct si_shader_context *si_shader_ctx) @@ -2517,9 +3656,13 @@ static void preload_streamout_buffers(struct si_shader_context *si_shader_ctx) struct gallivm_state * gallivm = bld_base->base.gallivm; unsigned i; - if (si_shader_ctx->type != TGSI_PROCESSOR_VERTEX || - si_shader_ctx->shader->key.vs.as_es || - !si_shader_ctx->shader->selector->so.num_outputs) + /* Streamout can only be used if the shader is compiled as VS. */ + if (!si_shader_ctx->shader->selector->so.num_outputs || + (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX && + (si_shader_ctx->shader->key.vs.as_es || + si_shader_ctx->shader->key.vs.as_ls)) || + (si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL && + si_shader_ctx->shader->key.tes.as_es)) return; LLVMValueRef buf_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, @@ -2550,6 +3693,8 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx) if ((si_shader_ctx->type == TGSI_PROCESSOR_VERTEX && si_shader_ctx->shader->key.vs.as_es) || + (si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL && + si_shader_ctx->shader->key.tes.as_es) || si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY) { LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_ESGS); @@ -2557,13 +3702,21 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx) build_indexed_load_const(si_shader_ctx, buf_ptr, offset); } - if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY || - si_shader_ctx->shader->is_gs_copy_shader) { + if (si_shader_ctx->shader->is_gs_copy_shader) { LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS); - si_shader_ctx->gsvs_ring = + si_shader_ctx->gsvs_ring[0] = build_indexed_load_const(si_shader_ctx, buf_ptr, offset); } + if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY) { + int i; + for (i = 0; i < 4; i++) { + LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS + i); + + si_shader_ctx->gsvs_ring[i] = + build_indexed_load_const(si_shader_ctx, buf_ptr, offset); + } + } } void si_shader_binary_read_config(const struct si_screen *sscreen, @@ -2637,26 +3790,54 @@ void si_shader_apply_scratch_relocs(struct si_context *sctx, } } -int si_shader_binary_read(struct si_screen *sscreen, - struct si_shader *shader, - const struct radeon_shader_binary *binary) +int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader) { + const struct radeon_shader_binary *binary = &shader->binary; + unsigned code_size = binary->code_size + binary->rodata_size; + unsigned char *ptr; + + r600_resource_reference(&shader->bo, NULL); + shader->bo = 
si_resource_create_custom(&sscreen->b.b, + PIPE_USAGE_IMMUTABLE, + code_size); + if (!shader->bo) + return -ENOMEM; + ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL, + PIPE_TRANSFER_READ_WRITE); + util_memcpy_cpu_to_le32(ptr, binary->code, binary->code_size); + if (binary->rodata_size > 0) { + ptr += binary->code_size; + util_memcpy_cpu_to_le32(ptr, binary->rodata, + binary->rodata_size); + } + + sscreen->b.ws->buffer_unmap(shader->bo->cs_buf); + return 0; +} + +int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader) +{ + const struct radeon_shader_binary *binary = &shader->binary; unsigned i; - unsigned code_size; - unsigned char *ptr; bool dump = r600_can_dump_shader(&sscreen->b, shader->selector ? shader->selector->tokens : NULL); si_shader_binary_read_config(sscreen, shader, 0); + si_shader_binary_upload(sscreen, shader); if (dump) { - if (!binary->disassembled) { - fprintf(stderr, "SI CODE:\n"); - for (i = 0; i < binary->code_size; i+=4 ) { - fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i, binary->code[i + 3], - binary->code[i + 2], binary->code[i + 1], - binary->code[i]); + if (!(sscreen->b.debug_flags & DBG_NO_ASM)) { + if (binary->disasm_string) { + fprintf(stderr, "\nShader Disassembly:\n\n"); + fprintf(stderr, "%s\n", binary->disasm_string); + } else { + fprintf(stderr, "SI CODE:\n"); + for (i = 0; i < binary->code_size; i+=4 ) { + fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i, binary->code[i + 3], + binary->code[i + 2], binary->code[i + 1], + binary->code[i]); + } } } @@ -2666,26 +3847,6 @@ int si_shader_binary_read(struct si_screen *sscreen, shader->num_sgprs, shader->num_vgprs, binary->code_size, shader->lds_size, shader->scratch_bytes_per_wave); } - - /* copy new shader */ - code_size = binary->code_size + binary->rodata_size; - r600_resource_reference(&shader->bo, NULL); - shader->bo = si_resource_create_custom(&sscreen->b.b, PIPE_USAGE_IMMUTABLE, - code_size); - if (shader->bo == NULL) { - return -ENOMEM; - } - - - ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL, PIPE_TRANSFER_READ_WRITE); - util_memcpy_cpu_to_le32(ptr, binary->code, binary->code_size); - if (binary->rodata_size > 0) { - ptr += binary->code_size; - util_memcpy_cpu_to_le32(ptr, binary->rodata, binary->rodata_size); - } - - sscreen->b.ws->buffer_unmap(shader->bo->cs_buf); - return 0; } @@ -2693,15 +3854,16 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader, LLVMTargetMachineRef tm, LLVMModuleRef mod) { int r = 0; - bool dump = r600_can_dump_shader(&sscreen->b, - shader->selector ? shader->selector->tokens : NULL); - r = radeon_llvm_compile(mod, &shader->binary, - r600_get_llvm_processor_name(sscreen->b.family), dump, tm); + bool dump_asm = r600_can_dump_shader(&sscreen->b, + shader->selector ? 
shader->selector->tokens : NULL); + bool dump_ir = dump_asm && !(sscreen->b.debug_flags & DBG_NO_IR); - if (r) { + r = radeon_llvm_compile(mod, &shader->binary, + r600_get_llvm_processor_name(sscreen->b.family), dump_ir, dump_asm, tm); + if (r) return r; - } - r = si_shader_binary_read(sscreen, shader, &shader->binary); + + r = si_shader_binary_read(sscreen, shader); FREE(shader->binary.config); FREE(shader->binary.rodata); @@ -2709,7 +3871,8 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader, if (shader->scratch_bytes_per_wave == 0) { FREE(shader->binary.code); FREE(shader->binary.relocs); - memset(&shader->binary, 0, sizeof(shader->binary)); + memset(&shader->binary, 0, + offsetof(struct radeon_shader_binary, disasm_string)); } return r; } @@ -2741,7 +3904,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen, preload_streamout_buffers(si_shader_ctx); preload_ring_buffers(si_shader_ctx); - args[0] = si_shader_ctx->gsvs_ring; + args[0] = si_shader_ctx->gsvs_ring[0]; args[1] = lp_build_mul_imm(uint, LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, si_shader_ctx->param_vertex_id), @@ -2767,7 +3930,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen, outputs[i].values[chan] = LLVMBuildBitCast(gallivm->builder, - build_intrinsic(gallivm->builder, + lp_build_intrinsic(gallivm->builder, "llvm.SI.buffer.load.dword.i32.i32", LLVMInt32TypeInContext(gallivm->context), args, 9, @@ -2807,9 +3970,21 @@ static void si_dump_key(unsigned shader, union si_shader_key *key) fprintf(stderr, "}\n"); if (key->vs.as_es) - fprintf(stderr, " gs_used_inputs = 0x%"PRIx64"\n", - key->vs.gs_used_inputs); + fprintf(stderr, " es_enabled_outputs = 0x%"PRIx64"\n", + key->vs.es_enabled_outputs); fprintf(stderr, " as_es = %u\n", key->vs.as_es); + fprintf(stderr, " as_ls = %u\n", key->vs.as_ls); + break; + + case PIPE_SHADER_TESS_CTRL: + fprintf(stderr, " prim_mode = %u\n", key->tcs.prim_mode); + break; + + case PIPE_SHADER_TESS_EVAL: + if (key->tes.as_es) + fprintf(stderr, " es_enabled_outputs = 0x%"PRIx64"\n", + key->tes.es_enabled_outputs); + fprintf(stderr, " as_es = %u\n", key->tes.as_es); break; case PIPE_SHADER_GEOMETRY: @@ -2851,7 +4026,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, /* Dump TGSI code before doing TGSI->LLVM conversion in case the * conversion fails. */ - if (dump) { + if (dump && !(sscreen->b.debug_flags & DBG_NO_TGSI)) { si_dump_key(sel->type, &shader->key); tgsi_dump(tokens, 0); si_dump_streamout(&sel->so); @@ -2873,6 +4048,10 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, bld_base->info = poly_stipple ? 
&stipple_shader_info : &sel->info; bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant; + bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action; + bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action; + bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action; + bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action; bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action; bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action; @@ -2888,9 +4067,12 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy; bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy; + bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy; + bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy; bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex; bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive; + bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier; if (HAVE_LLVM >= 0x0306) { bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem; @@ -2908,11 +4090,25 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, switch (si_shader_ctx.type) { case TGSI_PROCESSOR_VERTEX: si_shader_ctx.radeon_bld.load_input = declare_input_vs; - if (shader->key.vs.as_es) { + if (shader->key.vs.as_ls) + bld_base->emit_epilogue = si_llvm_emit_ls_epilogue; + else if (shader->key.vs.as_es) bld_base->emit_epilogue = si_llvm_emit_es_epilogue; - } else { + else + bld_base->emit_epilogue = si_llvm_emit_vs_epilogue; + break; + case TGSI_PROCESSOR_TESS_CTRL: + bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs; + bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs; + bld_base->emit_store = store_output_tcs; + bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue; + break; + case TGSI_PROCESSOR_TESS_EVAL: + bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes; + if (shader->key.tes.as_es) + bld_base->emit_epilogue = si_llvm_emit_es_epilogue; + else bld_base->emit_epilogue = si_llvm_emit_vs_epilogue; - } break; case TGSI_PROCESSOR_GEOMETRY: bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs; @@ -2946,9 +4142,12 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, preload_ring_buffers(&si_shader_ctx); if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) { - si_shader_ctx.gs_next_vertex = - lp_build_alloca(bld_base->base.gallivm, - bld_base->uint_bld.elem_type, ""); + int i; + for (i = 0; i < 4; i++) { + si_shader_ctx.gs_next_vertex[i] = + lp_build_alloca(bld_base->base.gallivm, + bld_base->uint_bld.elem_type, ""); + } } if (!lp_build_tgsi_llvm(bld_base, tokens)) { @@ -3000,4 +4199,5 @@ void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader) FREE(shader->binary.code); FREE(shader->binary.relocs); + FREE(shader->binary.disasm_string); } diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 51055afe36a..cd845c12e64 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -26,6 +26,46 @@ * Christian König <[email protected]> */ +/* How linking tessellation shader inputs and outputs works. + * + * Inputs and outputs between shaders are stored in a buffer. This buffer + * lives in LDS (typical case for tessellation), but it can also live + * in memory. Each input or output has a fixed location within a vertex. 
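+ * A location is one vec4 slot (4 dwords); e.g. an output at location 4
+ * starts at dword 16 of its vertex.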
+ * The highest used input or output determines the stride between vertices. + * + * Since tessellation is only enabled in the OpenGL core profile, + * only these semantics are valid for per-vertex data: + * + * Name Location + * + * POSITION 0 + * PSIZE 1 + * CLIPDIST0..1 2..3 + * CULLDIST0..1 (not implemented) + * GENERIC0..31 4..35 + * + * For example, a shader only writing GENERIC0 has the output stride of 5. + * + * Only these semantics are valid for per-patch data: + * + * Name Location + * + * TESSOUTER 0 + * TESSINNER 1 + * PATCH0..29 2..31 + * + * That's how independent shaders agree on input and output locations. + * The si_shader_io_get_unique_index function assigns the locations. + * + * Other required information for calculating the input and output addresses + * like the vertex stride, the patch stride, and the offsets where per-vertex + * and per-patch data start, is passed to the shader via user data SGPRs. + * The offsets and strides are calculated at draw time and aren't available + * at compile time. + * + * The same approach should be used for linking ES->GS in the future. + */ + #ifndef SI_SHADER_H #define SI_SHADER_H @@ -43,9 +83,16 @@ struct radeon_shader_reloc; #define SI_SGPR_VERTEX_BUFFER 8 /* VS only */ #define SI_SGPR_BASE_VERTEX 10 /* VS only */ #define SI_SGPR_START_INSTANCE 11 /* VS only */ +#define SI_SGPR_LS_OUT_LAYOUT 12 /* VS(LS) only */ +#define SI_SGPR_TCS_OUT_OFFSETS 8 /* TCS & TES only */ +#define SI_SGPR_TCS_OUT_LAYOUT 9 /* TCS & TES only */ +#define SI_SGPR_TCS_IN_LAYOUT 10 /* TCS only */ #define SI_SGPR_ALPHA_REF 8 /* PS only */ #define SI_VS_NUM_USER_SGPR 12 +#define SI_LS_NUM_USER_SGPR 13 +#define SI_TCS_NUM_USER_SGPR 11 +#define SI_TES_NUM_USER_SGPR 10 #define SI_GS_NUM_USER_SGPR 8 #define SI_GSCOPY_NUM_USER_SGPR 4 #define SI_PS_NUM_USER_SGPR 9 @@ -62,8 +109,30 @@ struct radeon_shader_reloc; #define SI_PARAM_START_INSTANCE 6 /* the other VS parameters are assigned dynamically */ -/* ES only parameters */ -#define SI_PARAM_ES2GS_OFFSET 7 +/* Offsets where TCS outputs and TCS patch outputs live in LDS: + * [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32 + * [16:31] = TCS output patch0 offset for per-patch / 16, max = NUM_PATCHES*32*32* + 32*32 + */ +#define SI_PARAM_TCS_OUT_OFFSETS 4 /* for TCS & TES */ + +/* Layout of TCS outputs / TES inputs: + * [0:12] = stride between output patches in dwords, num_outputs * num_vertices * 4, max = 32*32*4 + * [13:20] = stride between output vertices in dwords = num_inputs * 4, max = 32*4 + * [26:31] = gl_PatchVerticesIn, max = 32 + */ +#define SI_PARAM_TCS_OUT_LAYOUT 5 /* for TCS & TES */ + +/* Layout of LS outputs / TCS inputs + * [0:12] = stride between patches in dwords = num_inputs * num_vertices * 4, max = 32*32*4 + * [13:20] = stride between vertices in dwords = num_inputs * 4, max = 32*4 + */ +#define SI_PARAM_TCS_IN_LAYOUT 6 /* TCS only */ +#define SI_PARAM_LS_OUT_LAYOUT 7 /* same value as TCS_IN_LAYOUT, LS only */ + +/* TCS only parameters. 
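 * (Note the index overlap with SI_PARAM_LS_OUT_LAYOUT above: both are 7,
 * which is harmless because the LS and TCS parameter lists are never
 * used by the same shader.)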
*/ +#define SI_PARAM_TESS_FACTOR_OFFSET 7 +#define SI_PARAM_PATCH_ID 8 +#define SI_PARAM_REL_IDS 9 /* GS only parameters */ #define SI_PARAM_GS2VS_OFFSET 4 @@ -115,9 +184,25 @@ struct si_shader_selector { unsigned gs_output_prim; unsigned gs_max_out_vertices; - uint64_t gs_used_inputs; /* mask of "get_unique_index" bits */ + unsigned gs_num_invocations; + + /* masks of "get_unique_index" bits */ + uint64_t inputs_read; + uint64_t outputs_written; + uint32_t patch_outputs_written; }; +/* Valid shader configurations: + * + * API shaders VS | TCS | TES | GS |pass| PS + * are compiled as: | | | |thru| + * | | | | | + * Only VS & PS: VS | -- | -- | -- | -- | PS + * With GS: ES | -- | -- | GS | VS | PS + * With Tessel.: LS | HS | VS | -- | -- | PS + * With both: LS | HS | ES | GS | VS | PS + */ + union si_shader_key { struct { unsigned export_16bpc:8; @@ -130,11 +215,25 @@ union si_shader_key { } ps; struct { unsigned instance_divisors[SI_NUM_VERTEX_BUFFERS]; - /* The mask of "get_unique_index" bits, needed for ES, - * it describes how the ES->GS ring buffer is laid out. */ - uint64_t gs_used_inputs; - unsigned as_es:1; + /* Mask of "get_unique_index" bits - which outputs are read + * by the next stage (needed by ES). + * This describes how outputs are laid out in memory. */ + uint64_t es_enabled_outputs; + unsigned as_es:1; /* export shader */ + unsigned as_ls:1; /* local shader */ + unsigned export_prim_id; /* when PS needs it and GS is disabled */ } vs; + struct { + unsigned prim_mode:3; + } tcs; /* tessellation control shader */ + struct { + /* Mask of "get_unique_index" bits - which outputs are read + * by the next stage (needed by ES). + * This describes how outputs are laid out in memory. */ + uint64_t es_enabled_outputs; + unsigned as_es:1; /* export shader */ + unsigned export_prim_id; /* when PS needs it and GS is disabled */ + } tes; /* tessellation evaluation shader */ }; struct si_shader { @@ -161,27 +260,47 @@ struct si_shader { unsigned nparam; unsigned vs_output_param_offset[PIPE_MAX_SHADER_OUTPUTS]; unsigned ps_input_param_offset[PIPE_MAX_SHADER_INPUTS]; - + unsigned ps_input_interpolate[PIPE_MAX_SHADER_INPUTS]; bool uses_instanceid; unsigned nr_pos_exports; + unsigned nr_param_exports; bool is_gs_copy_shader; bool dx10_clamp_mode; /* convert NaNs to 0 */ + + unsigned ls_rsrc1; + unsigned ls_rsrc2; }; static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx) { - return sctx->gs_shader ? 
&sctx->gs_shader->info - : &sctx->vs_shader->info; + if (sctx->gs_shader) + return &sctx->gs_shader->info; + else if (sctx->tes_shader) + return &sctx->tes_shader->info; + else + return &sctx->vs_shader->info; } static inline struct si_shader* si_get_vs_state(struct si_context *sctx) { if (sctx->gs_shader) return sctx->gs_shader->current->gs_copy_shader; + else if (sctx->tes_shader) + return sctx->tes_shader->current; else return sctx->vs_shader->current; } +static inline bool si_vs_exports_prim_id(struct si_shader *shader) +{ + if (shader->selector->type == PIPE_SHADER_VERTEX) + return shader->key.vs.export_prim_id; + else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) + return shader->key.tes.export_prim_id; + else + return false; +} + /* radeonsi_shader.c */ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, struct si_shader *shader); @@ -189,8 +308,8 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader, LLVMTargetMachineRef tm, LLVMModuleRef mod); void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader); unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index); -int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader, - const struct radeon_shader_binary *binary); +int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader); +int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader); void si_shader_apply_scratch_relocs(struct si_context *sctx, struct si_shader *shader, uint64_t scratch_va); diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 6c18836d189..c923ea7e154 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -61,7 +61,7 @@ unsigned si_array_mode(unsigned mode) uint32_t si_num_banks(struct si_screen *sscreen, struct r600_texture *tex) { - if (sscreen->b.chip_class == CIK && + if (sscreen->b.chip_class >= CIK && sscreen->b.info.cik_macrotile_mode_array_valid) { unsigned index, tileb; @@ -489,11 +489,14 @@ static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom) S_02881C_USE_VTX_POINT_SIZE(info->writes_psize) | S_02881C_USE_VTX_EDGE_FLAG(info->writes_edgeflag) | S_02881C_USE_VTX_RENDER_TARGET_INDX(info->writes_layer) | + S_02881C_USE_VTX_VIEWPORT_INDX(info->writes_viewport_index) | S_02881C_VS_OUT_CCDIST0_VEC_ENA((clipdist_mask & 0x0F) != 0) | S_02881C_VS_OUT_CCDIST1_VEC_ENA((clipdist_mask & 0xF0) != 0) | S_02881C_VS_OUT_MISC_VEC_ENA(info->writes_psize || info->writes_edgeflag || - info->writes_layer) | + info->writes_layer || + info->writes_viewport_index) | + S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(1) | (sctx->queued.named.rasterizer->clip_plane_enable & clipdist_mask)); r600_write_context_reg(cs, R_028810_PA_CL_CLIP_CNTL, @@ -509,20 +512,26 @@ static void si_set_scissor_states(struct pipe_context *ctx, const struct pipe_scissor_state *state) { struct si_context *sctx = (struct si_context *)ctx; - struct si_state_scissor *scissor = CALLOC_STRUCT(si_state_scissor); - struct si_pm4_state *pm4 = &scissor->pm4; - - if (scissor == NULL) - return; + struct si_state_scissor *scissor; + struct si_pm4_state *pm4; + int i; - scissor->scissor = *state; - si_pm4_set_reg(pm4, R_028250_PA_SC_VPORT_SCISSOR_0_TL, - S_028250_TL_X(state->minx) | S_028250_TL_Y(state->miny) | - S_028250_WINDOW_OFFSET_DISABLE(1)); - si_pm4_set_reg(pm4, R_028254_PA_SC_VPORT_SCISSOR_0_BR, - S_028254_BR_X(state->maxx) | S_028254_BR_Y(state->maxy)); + for 
(i = start_slot; i < start_slot + num_scissors; i++) { + int idx = i - start_slot; + int offset = i * 4 * 2; - si_pm4_set_state(sctx, scissor, scissor); + scissor = CALLOC_STRUCT(si_state_scissor); + if (scissor == NULL) + return; + pm4 = &scissor->pm4; + scissor->scissor = state[idx]; + si_pm4_set_reg(pm4, R_028250_PA_SC_VPORT_SCISSOR_0_TL + offset, + S_028250_TL_X(state[idx].minx) | S_028250_TL_Y(state[idx].miny) | + S_028250_WINDOW_OFFSET_DISABLE(1)); + si_pm4_set_reg(pm4, R_028254_PA_SC_VPORT_SCISSOR_0_BR + offset, + S_028254_BR_X(state[idx].maxx) | S_028254_BR_Y(state[idx].maxy)); + si_pm4_set_state(sctx, scissor[i], scissor); + } } static void si_set_viewport_states(struct pipe_context *ctx, @@ -531,21 +540,29 @@ static void si_set_viewport_states(struct pipe_context *ctx, const struct pipe_viewport_state *state) { struct si_context *sctx = (struct si_context *)ctx; - struct si_state_viewport *viewport = CALLOC_STRUCT(si_state_viewport); - struct si_pm4_state *pm4 = &viewport->pm4; + struct si_state_viewport *viewport; + struct si_pm4_state *pm4; + int i; - if (viewport == NULL) - return; + for (i = start_slot; i < start_slot + num_viewports; i++) { + int idx = i - start_slot; + int offset = i * 4 * 6; - viewport->viewport = *state; - si_pm4_set_reg(pm4, R_02843C_PA_CL_VPORT_XSCALE_0, fui(state->scale[0])); - si_pm4_set_reg(pm4, R_028440_PA_CL_VPORT_XOFFSET_0, fui(state->translate[0])); - si_pm4_set_reg(pm4, R_028444_PA_CL_VPORT_YSCALE_0, fui(state->scale[1])); - si_pm4_set_reg(pm4, R_028448_PA_CL_VPORT_YOFFSET_0, fui(state->translate[1])); - si_pm4_set_reg(pm4, R_02844C_PA_CL_VPORT_ZSCALE_0, fui(state->scale[2])); - si_pm4_set_reg(pm4, R_028450_PA_CL_VPORT_ZOFFSET_0, fui(state->translate[2])); + viewport = CALLOC_STRUCT(si_state_viewport); + if (!viewport) + return; + pm4 = &viewport->pm4; + + viewport->viewport = state[idx]; + si_pm4_set_reg(pm4, R_02843C_PA_CL_VPORT_XSCALE + offset, fui(state[idx].scale[0])); + si_pm4_set_reg(pm4, R_028440_PA_CL_VPORT_XOFFSET + offset, fui(state[idx].translate[0])); + si_pm4_set_reg(pm4, R_028444_PA_CL_VPORT_YSCALE + offset, fui(state[idx].scale[1])); + si_pm4_set_reg(pm4, R_028448_PA_CL_VPORT_YOFFSET + offset, fui(state[idx].translate[1])); + si_pm4_set_reg(pm4, R_02844C_PA_CL_VPORT_ZSCALE + offset, fui(state[idx].scale[2])); + si_pm4_set_reg(pm4, R_028450_PA_CL_VPORT_ZOFFSET + offset, fui(state[idx].translate[2])); - si_pm4_set_state(sctx, viewport, viewport); + si_pm4_set_state(sctx, viewport[i], viewport); + } } /* @@ -649,7 +666,7 @@ static void *si_create_rs_state(struct pipe_context *ctx, /* offset */ rs->offset_units = state->offset_units; - rs->offset_scale = state->offset_scale * 12.0f; + rs->offset_scale = state->offset_scale * 16.0f; si_pm4_set_reg(pm4, R_0286D4_SPI_INTERP_CONTROL_0, S_0286D4_FLAT_SHADE_ENA(1) | @@ -718,12 +735,12 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) if (sctx->framebuffer.nr_samples > 1 && (!old_rs || old_rs->multisample_enable != rs->multisample_enable)) - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); si_pm4_bind_state(sctx, rasterizer, rs); si_update_fb_rs_state(sctx); - sctx->clip_regs.dirty = true; + si_mark_atom_dirty(sctx, &sctx->clip_regs); } static void si_delete_rs_state(struct pipe_context *ctx, void *state) @@ -821,7 +838,8 @@ static void *si_create_dsa_state(struct pipe_context *ctx, db_depth_control = S_028800_Z_ENABLE(state->depth.enabled) | S_028800_Z_WRITE_ENABLE(state->depth.writemask) | - S_028800_ZFUNC(state->depth.func); + 
S_028800_ZFUNC(state->depth.func) | + S_028800_DEPTH_BOUNDS_ENABLE(state->depth.bounds_test); /* stencil */ if (state->stencil[0].enabled) { @@ -850,9 +868,12 @@ static void *si_create_dsa_state(struct pipe_context *ctx, dsa->alpha_func = PIPE_FUNC_ALWAYS; } - /* misc */ si_pm4_set_reg(pm4, R_028800_DB_DEPTH_CONTROL, db_depth_control); si_pm4_set_reg(pm4, R_02842C_DB_STENCIL_CONTROL, db_stencil_control); + if (state->depth.bounds_test) { + si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, fui(state->depth.bounds_min)); + si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, fui(state->depth.bounds_max)); + } return dsa; } @@ -888,7 +909,7 @@ static void si_set_occlusion_query_state(struct pipe_context *ctx, bool enable) { struct si_context *sctx = (struct si_context*)ctx; - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); } static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *state) @@ -1157,7 +1178,9 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, int first_non_void) { struct si_screen *sscreen = (struct si_screen*)screen; - bool enable_s3tc = sscreen->b.info.drm_minor >= 31; + bool enable_compressed_formats = (sscreen->b.info.drm_major == 2 && + sscreen->b.info.drm_minor >= 31) || + sscreen->b.info.drm_major == 3; boolean uniform = TRUE; int i; @@ -1200,7 +1223,7 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, } if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) { - if (!enable_s3tc) + if (!enable_compressed_formats) goto out_unknown; switch (format) { @@ -1220,7 +1243,7 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, } if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC) { - if (!enable_s3tc) + if (!enable_compressed_formats) goto out_unknown; switch (format) { @@ -1249,8 +1272,7 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, } if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { - - if (!enable_s3tc) + if (!enable_compressed_formats) goto out_unknown; if (!util_format_s3tc_enabled) { @@ -1606,7 +1628,6 @@ boolean si_is_format_supported(struct pipe_screen *screen, unsigned sample_count, unsigned usage) { - struct si_screen *sscreen = (struct si_screen *)screen; unsigned retval = 0; if (target >= PIPE_MAX_TEXTURE_TYPES) { @@ -1618,8 +1639,7 @@ boolean si_is_format_supported(struct pipe_screen *screen, return FALSE; if (sample_count > 1) { - /* 2D tiling on CIK is supported since DRM 2.35.0 */ - if (sscreen->b.chip_class >= CIK && sscreen->b.info.drm_minor < 35) + if (!screen->get_param(screen, PIPE_CAP_TEXTURE_MULTISAMPLE)) return FALSE; switch (sample_count) { @@ -1826,6 +1846,9 @@ static void si_initialize_color_surface(struct si_context *sctx, surf->cb_color_info = color_info; surf->cb_color_attrib = color_attrib; + if (sctx->b.chip_class >= VI) + surf->cb_dcc_control = S_028C78_OVERWRITE_COMBINER_DISABLE(1); + if (rtex->fmask.size) { surf->cb_color_fmask = (offset + rtex->fmask.offset) >> 8; surf->cb_color_fmask_slice = S_028C88_TILE_MAX(rtex->fmask.slice_tile_max); @@ -2023,7 +2046,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, util_format_is_pure_integer(state->cbufs[0]->format); if (sctx->framebuffer.cb0_is_integer != old_cb0_is_integer) - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); for (i = 0; i < state->nr_cbufs; i++) { if (!state->cbufs[i]) @@ -2043,6 +2066,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, if (rtex->fmask.size && rtex->cmask.size) { 
sctx->framebuffer.compressed_cb_mask |= 1 << i; } + r600_context_add_resource_size(ctx, surf->base.texture); } /* Set the 16BPC export for possible dual-src blending. */ if (i == 1 && surf && surf->export_16bpc) { @@ -2057,20 +2081,21 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, if (!surf->depth_initialized) { si_init_depth_surface(sctx, surf); } + r600_context_add_resource_size(ctx, surf->base.texture); } si_update_fb_rs_state(sctx); si_update_fb_blend_state(sctx); - sctx->framebuffer.atom.num_dw = state->nr_cbufs*15 + (8 - state->nr_cbufs)*3; + sctx->framebuffer.atom.num_dw = state->nr_cbufs*16 + (8 - state->nr_cbufs)*3; sctx->framebuffer.atom.num_dw += state->zsbuf ? 26 : 4; sctx->framebuffer.atom.num_dw += 3; /* WINDOW_SCISSOR_BR */ sctx->framebuffer.atom.num_dw += 18; /* MSAA sample locations */ - sctx->framebuffer.atom.dirty = true; + si_mark_atom_dirty(sctx, &sctx->framebuffer.atom); if (sctx->framebuffer.nr_samples != old_nr_samples) { - sctx->msaa_config.dirty = true; - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->msaa_config); + si_mark_atom_dirty(sctx, &sctx->db_render_state); /* Set sample locations as fragment shader constants. */ switch (sctx->framebuffer.nr_samples) { @@ -2107,7 +2132,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, old_nr_samples != SI_NUM_SMOOTH_AA_SAMPLES) && (sctx->framebuffer.nr_samples != SI_NUM_SMOOTH_AA_SAMPLES || old_nr_samples != 1)) - sctx->msaa_sample_locs.dirty = true; + si_mark_atom_dirty(sctx, &sctx->msaa_sample_locs); } } @@ -2141,20 +2166,24 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom RADEON_PRIO_COLOR_META); } - r600_write_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 13); + r600_write_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, + sctx->b.chip_class >= VI ? 14 : 13); radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */ radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */ radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */ radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */ radeon_emit(cs, cb->cb_color_info | tex->cb_color_info); /* R_028C70_CB_COLOR0_INFO */ radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */ - radeon_emit(cs, 0); /* R_028C78 unused */ + radeon_emit(cs, cb->cb_dcc_control); /* R_028C78_CB_COLOR0_DCC_CONTROL */ radeon_emit(cs, tex->cmask.base_address_reg); /* R_028C7C_CB_COLOR0_CMASK */ radeon_emit(cs, tex->cmask.slice_tile_max); /* R_028C80_CB_COLOR0_CMASK_SLICE */ radeon_emit(cs, cb->cb_color_fmask); /* R_028C84_CB_COLOR0_FMASK */ radeon_emit(cs, cb->cb_color_fmask_slice); /* R_028C88_CB_COLOR0_FMASK_SLICE */ radeon_emit(cs, tex->color_clear_value[0]); /* R_028C8C_CB_COLOR0_CLEAR_WORD0 */ radeon_emit(cs, tex->color_clear_value[1]); /* R_028C90_CB_COLOR0_CLEAR_WORD1 */ + + if (sctx->b.chip_class >= VI) + radeon_emit(cs, 0); /* R_028C94_CB_COLOR0_DCC_BASE */ } /* set CB_COLOR1_INFO for possible dual-src blending */ if (i == 1 && state->cbufs[0]) { @@ -2249,22 +2278,35 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples) sctx->ps_iter_samples = min_samples; if (sctx->framebuffer.nr_samples > 1) - sctx->msaa_config.dirty = true; + si_mark_atom_dirty(sctx, &sctx->msaa_config); } /* * Samplers */ -static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx, - struct pipe_resource *texture, - const struct pipe_sampler_view *state) +/** + * Create a sampler view. 
+ * + * @param ctx context + * @param texture texture + * @param state sampler view template + * @param width0 width0 override (for compressed textures as int) + * @param height0 height0 override (for compressed textures as int) + * @param force_level set the base address to the level (for compressed textures) + */ +struct pipe_sampler_view * +si_create_sampler_view_custom(struct pipe_context *ctx, + struct pipe_resource *texture, + const struct pipe_sampler_view *state, + unsigned width0, unsigned height0, + unsigned force_level) { struct si_context *sctx = (struct si_context*)ctx; struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view); struct r600_texture *tmp = (struct r600_texture*)texture; const struct util_format_description *desc; - unsigned format, num_format; + unsigned format, num_format, base_level, first_level, last_level; uint32_t pitch = 0; unsigned char state_swizzle[4], swizzle[4]; unsigned height, depth, width; @@ -2297,7 +2339,7 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx /* Buffer resource. */ if (texture->target == PIPE_BUFFER) { - unsigned stride; + unsigned stride, num_records; desc = util_format_description(state->format); first_non_void = util_format_get_first_non_void_channel(state->format); @@ -2306,10 +2348,16 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void); num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void); + num_records = state->u.buf.last_element + 1 - state->u.buf.first_element; + num_records = MIN2(num_records, texture->width0 / stride); + + if (sctx->b.chip_class >= VI) + num_records *= stride; + view->state[4] = va; view->state[5] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride); - view->state[6] = state->u.buf.last_element + 1 - state->u.buf.first_element; + view->state[6] = num_records; view->state[7] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | @@ -2437,13 +2485,25 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx format = 0; } - /* not supported any more */ - //endian = si_colorformat_endian_swap(format); + base_level = 0; + first_level = state->u.tex.first_level; + last_level = state->u.tex.last_level; + width = width0; + height = height0; + depth = texture->depth0; - width = surflevel[0].npix_x; - height = surflevel[0].npix_y; - depth = surflevel[0].npix_z; - pitch = surflevel[0].nblk_x * util_format_get_blockwidth(pipe_format); + if (force_level) { + assert(force_level == first_level && + force_level == last_level); + base_level = force_level; + first_level = 0; + last_level = 0; + width = u_minify(width, force_level); + height = u_minify(height, force_level); + depth = u_minify(depth, force_level); + } + + pitch = surflevel[base_level].nblk_x * util_format_get_blockwidth(pipe_format); if (texture->target == PIPE_TEXTURE_1D_ARRAY) { height = 1; @@ -2453,8 +2513,7 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx } else if (texture->target == PIPE_TEXTURE_CUBE_ARRAY) depth = texture->array_size / 6; - va = tmp->resource.gpu_address + surflevel[0].offset; - va += tmp->mipmap_shift * surflevel[texture->last_level].slice_size * tmp->surface.array_size; + va = tmp->resource.gpu_address + surflevel[base_level].offset; view->state[0] = va >> 8; view->state[1] = 
(S_008F14_BASE_ADDRESS_HI(va >> 40) | @@ -2467,11 +2526,11 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) | S_008F1C_BASE_LEVEL(texture->nr_samples > 1 ? - 0 : state->u.tex.first_level - tmp->mipmap_shift) | + 0 : first_level) | S_008F1C_LAST_LEVEL(texture->nr_samples > 1 ? util_logbase2(texture->nr_samples) : - state->u.tex.last_level - tmp->mipmap_shift) | - S_008F1C_TILING_INDEX(si_tile_mode_index(tmp, 0, false)) | + last_level) | + S_008F1C_TILING_INDEX(si_tile_mode_index(tmp, base_level, false)) | S_008F1C_POW2_PAD(texture->last_level > 0) | S_008F1C_TYPE(si_tex_dim(texture->target, texture->nr_samples))); view->state[4] = (S_008F20_DEPTH(depth - 1) | S_008F20_PITCH(pitch - 1)); @@ -2523,6 +2582,16 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx return &view->base; } +static struct pipe_sampler_view * +si_create_sampler_view(struct pipe_context *ctx, + struct pipe_resource *texture, + const struct pipe_sampler_view *state) +{ + return si_create_sampler_view_custom(ctx, texture, state, + texture ? texture->width0 : 0, + texture ? texture->height0 : 0, 0); +} + static void si_sampler_view_destroy(struct pipe_context *ctx, struct pipe_sampler_view *state) { @@ -2765,6 +2834,7 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, pipe_resource_reference(&dsti->buffer, src->buffer); dsti->buffer_offset = src->buffer_offset; dsti->stride = src->stride; + r600_context_add_resource_size(ctx, src->buffer); } } else { for (i = 0; i < count; i++) { @@ -2782,6 +2852,7 @@ static void si_set_index_buffer(struct pipe_context *ctx, if (ib) { pipe_resource_reference(&sctx->index_buffer.buffer, ib->buffer); memcpy(&sctx->index_buffer, ib, sizeof(*ib)); + r600_context_add_resource_size(ctx, ib->buffer); } else { pipe_resource_reference(&sctx->index_buffer.buffer, NULL); } @@ -2845,6 +2916,30 @@ static void si_set_polygon_stipple(struct pipe_context *ctx, } } +static void si_set_tess_state(struct pipe_context *ctx, + const float default_outer_level[4], + const float default_inner_level[2]) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_constant_buffer cb; + float array[8]; + + memcpy(array, default_outer_level, sizeof(float) * 4); + memcpy(array+4, default_inner_level, sizeof(float) * 2); + + cb.buffer = NULL; + cb.user_buffer = NULL; + cb.buffer_size = sizeof(array); + + si_upload_const_buffer(sctx, (struct r600_resource**)&cb.buffer, + (void*)array, sizeof(array), + &cb.buffer_offset); + + ctx->set_constant_buffer(ctx, PIPE_SHADER_TESS_CTRL, + SI_DRIVER_STATE_CONST_BUF, &cb); + pipe_resource_reference(&cb.buffer, NULL); +} + static void si_texture_barrier(struct pipe_context *ctx) { struct si_context *sctx = (struct si_context *)ctx; @@ -2870,6 +2965,8 @@ static void si_need_gfx_cs_space(struct pipe_context *ctx, unsigned num_dw, si_need_cs_space((struct si_context*)ctx, num_dw, include_draw_vbo); } +static void si_init_config(struct si_context *sctx); + void si_init_state_functions(struct si_context *sctx) { si_init_atom(&sctx->framebuffer.atom, &sctx->atoms.s.framebuffer, si_emit_framebuffer_state, 0); @@ -2920,6 +3017,7 @@ void si_init_state_functions(struct si_context *sctx) sctx->b.b.texture_barrier = si_texture_barrier; sctx->b.b.set_polygon_stipple = si_set_polygon_stipple; sctx->b.b.set_min_samples = si_set_min_samples; + sctx->b.b.set_tess_state = si_set_tess_state; 
sctx->b.set_occlusion_query_state = si_set_occlusion_query_state; sctx->b.need_gfx_cs_space = si_need_gfx_cs_space; @@ -2931,24 +3029,31 @@ void si_init_state_functions(struct si_context *sctx) } else { sctx->b.dma_copy = si_dma_copy; } + + si_init_config(sctx); } static void si_write_harvested_raster_configs(struct si_context *sctx, struct si_pm4_state *pm4, - unsigned raster_config) + unsigned raster_config, + unsigned raster_config_1) { unsigned sh_per_se = MAX2(sctx->screen->b.info.max_sh_per_se, 1); unsigned num_se = MAX2(sctx->screen->b.info.max_se, 1); unsigned rb_mask = sctx->screen->b.info.si_backend_enabled_mask; - unsigned num_rb = sctx->screen->b.info.r600_num_backends; - unsigned rb_per_pkr = num_rb / num_se / sh_per_se; + unsigned num_rb = MIN2(sctx->screen->b.info.r600_num_backends, 16); + unsigned rb_per_pkr = MIN2(num_rb / num_se / sh_per_se, 2); unsigned rb_per_se = num_rb / num_se; - unsigned se0_mask = (1 << rb_per_se) - 1; - unsigned se1_mask = se0_mask << rb_per_se; + unsigned se_mask[4]; unsigned se; - assert(num_se == 1 || num_se == 2); + se_mask[0] = ((1 << rb_per_se) - 1) & rb_mask; + se_mask[1] = (se_mask[0] << rb_per_se) & rb_mask; + se_mask[2] = (se_mask[1] << rb_per_se) & rb_mask; + se_mask[3] = (se_mask[2] << rb_per_se) & rb_mask; + + assert(num_se == 1 || num_se == 2 || num_se == 4); assert(sh_per_se == 1 || sh_per_se == 2); assert(rb_per_pkr == 1 || rb_per_pkr == 2); @@ -2956,17 +3061,16 @@ si_write_harvested_raster_configs(struct si_context *sctx, * fields are for, so I'm leaving them as their default * values. */ - se0_mask &= rb_mask; - se1_mask &= rb_mask; - if (num_se == 2 && (!se0_mask || !se1_mask)) { - raster_config &= C_028350_SE_MAP; + if ((num_se > 2) && ((!se_mask[0] && !se_mask[1]) || + (!se_mask[2] && !se_mask[3]))) { + raster_config_1 &= C_028354_SE_PAIR_MAP; - if (!se0_mask) { - raster_config |= - S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3); + if (!se_mask[0] && !se_mask[1]) { + raster_config_1 |= + S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_3); } else { - raster_config |= - S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0); + raster_config_1 |= + S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_0); } } @@ -2974,10 +3078,23 @@ si_write_harvested_raster_configs(struct si_context *sctx, unsigned raster_config_se = raster_config; unsigned pkr0_mask = ((1 << rb_per_pkr) - 1) << (se * rb_per_se); unsigned pkr1_mask = pkr0_mask << rb_per_pkr; + int idx = (se / 2) * 2; + + if ((num_se > 1) && (!se_mask[idx] || !se_mask[idx + 1])) { + raster_config_se &= C_028350_SE_MAP; + + if (!se_mask[idx]) { + raster_config_se |= + S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3); + } else { + raster_config_se |= + S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0); + } + } pkr0_mask &= rb_mask; pkr1_mask &= rb_mask; - if (sh_per_se == 2 && (!pkr0_mask || !pkr1_mask)) { + if (rb_per_se > 2 && (!pkr0_mask || !pkr1_mask)) { raster_config_se &= C_028350_PKR_MAP; if (!pkr0_mask) { @@ -2989,7 +3106,7 @@ si_write_harvested_raster_configs(struct si_context *sctx, } } - if (rb_per_pkr == 2) { + if (rb_per_se >= 2) { unsigned rb0_mask = 1 << (se * rb_per_se); unsigned rb1_mask = rb0_mask << 1; @@ -3007,7 +3124,7 @@ si_write_harvested_raster_configs(struct si_context *sctx, } } - if (sh_per_se == 2) { + if (rb_per_se > 2) { rb0_mask = 1 << (se * rb_per_se + rb_per_pkr); rb1_mask = rb0_mask << 1; rb0_mask &= rb_mask; @@ -3026,19 +3143,28 @@ si_write_harvested_raster_configs(struct si_context *sctx, } } - si_pm4_set_reg(pm4, GRBM_GFX_INDEX, - 
SE_INDEX(se) | SH_BROADCAST_WRITES | - INSTANCE_BROADCAST_WRITES); + /* GRBM_GFX_INDEX is privileged on VI */ + if (sctx->b.chip_class <= CIK) + si_pm4_set_reg(pm4, GRBM_GFX_INDEX, + SE_INDEX(se) | SH_BROADCAST_WRITES | + INSTANCE_BROADCAST_WRITES); si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config_se); + if (sctx->b.chip_class >= CIK) + si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1); } - si_pm4_set_reg(pm4, GRBM_GFX_INDEX, - SE_BROADCAST_WRITES | SH_BROADCAST_WRITES | - INSTANCE_BROADCAST_WRITES); + /* GRBM_GFX_INDEX is privileged on VI */ + if (sctx->b.chip_class <= CIK) + si_pm4_set_reg(pm4, GRBM_GFX_INDEX, + SE_BROADCAST_WRITES | SH_BROADCAST_WRITES | + INSTANCE_BROADCAST_WRITES); } -void si_init_config(struct si_context *sctx) +static void si_init_config(struct si_context *sctx) { + unsigned num_rb = MIN2(sctx->screen->b.info.r600_num_backends, 16); + unsigned rb_mask = sctx->screen->b.info.si_backend_enabled_mask; + unsigned raster_config, raster_config_1; struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); if (pm4 == NULL) @@ -3046,24 +3172,18 @@ void si_init_config(struct si_context *sctx) si_cmd_context_control(pm4); - si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, 0x0); - si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, 0x0); + si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64)); + si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0)); /* FIXME calculate these values somehow ??? */ si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, 0x80); si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40); si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2); - si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, 0x0); si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0); si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0); si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); - si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, 0); - si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, 0); - si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, 0); - si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT, 0); - si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); si_pm4_set_reg(pm4, R_028AB4_VGT_REUSE_OFF, 0); si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0); @@ -3076,62 +3196,78 @@ void si_init_config(struct si_context *sctx) si_pm4_set_reg(pm4, R_02882C_PA_SU_PRIM_FILTER_CNTL, 0); - if (sctx->b.chip_class >= CIK) { - switch (sctx->screen->b.family) { - case CHIP_BONAIRE: - si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x16000012); - si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0); - break; - case CHIP_HAWAII: - si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x3a00161a); - si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0x0000002e); - break; - case CHIP_KAVERI: - /* XXX todo */ - case CHIP_KABINI: - /* XXX todo */ - case CHIP_MULLINS: - /* XXX todo */ - default: - si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0); - si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0); - break; - } - } else { - unsigned rb_mask = sctx->screen->b.info.si_backend_enabled_mask; - unsigned num_rb = sctx->screen->b.info.r600_num_backends; - unsigned raster_config; - - switch (sctx->screen->b.family) { - case CHIP_TAHITI: - case CHIP_PITCAIRN: - raster_config = 0x2a00126a; - break; - case CHIP_VERDE: - raster_config = 0x0000124a; - break; - case CHIP_OLAND: - raster_config = 0x00000082; - break; - case CHIP_HAINAN: - raster_config = 0; - break; - default: - fprintf(stderr, - "radeonsi: Unknown GPU, using 0 for raster_config\n"); 
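/* [Editor's sketch, not part of the patch] si_write_harvested_raster_configs
 * above replaces the old two-SE se0_mask/se1_mask pair with a four-entry
 * se_mask[] so that 4-SE parts can be harvested too.  Each entry is the
 * previous one shifted up by rb_per_se and re-masked against the global
 * rb_mask, so a fully harvested engine yields 0.  A standalone restatement
 * of just that step (the helper name is hypothetical):
 */
static void si_sketch_se_masks(unsigned rb_mask, unsigned rb_per_se,
			       unsigned se_mask[4])
{
	se_mask[0] = ((1u << rb_per_se) - 1) & rb_mask;
	for (unsigned se = 1; se < 4; se++)
		se_mask[se] = (se_mask[se - 1] << rb_per_se) & rb_mask;
}
/* E.g. rb_mask = 0xffff with rb_per_se = 4 gives 0x000f, 0x00f0, 0x0f00 and
 * 0xf000 for the four shader engines. */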
- raster_config = 0; - break; - } + switch (sctx->screen->b.family) { + case CHIP_TAHITI: + case CHIP_PITCAIRN: + raster_config = 0x2a00126a; + raster_config_1 = 0x00000000; + break; + case CHIP_VERDE: + raster_config = 0x0000124a; + raster_config_1 = 0x00000000; + break; + case CHIP_OLAND: + raster_config = 0x00000082; + raster_config_1 = 0x00000000; + break; + case CHIP_HAINAN: + raster_config = 0x00000000; + raster_config_1 = 0x00000000; + break; + case CHIP_BONAIRE: + raster_config = 0x16000012; + raster_config_1 = 0x00000000; + break; + case CHIP_HAWAII: + raster_config = 0x3a00161a; + raster_config_1 = 0x0000002e; + break; + case CHIP_FIJI: + /* Fiji should be same as Hawaii, but that causes corruption in some cases */ + raster_config = 0x16000012; /* 0x3a00161a */ + raster_config_1 = 0x0000002a; /* 0x0000002e */ + break; + case CHIP_TONGA: + raster_config = 0x16000012; + raster_config_1 = 0x0000002a; + break; + case CHIP_ICELAND: + raster_config = 0x00000002; + raster_config_1 = 0x00000000; + break; + case CHIP_CARRIZO: + raster_config = 0x00000002; + raster_config_1 = 0x00000000; + break; + case CHIP_KAVERI: + /* KV should be 0x00000002, but that causes problems with radeon */ + raster_config = 0x00000000; /* 0x00000002 */ + raster_config_1 = 0x00000000; + break; + case CHIP_KABINI: + case CHIP_MULLINS: + raster_config = 0x00000000; + raster_config_1 = 0x00000000; + break; + default: + fprintf(stderr, + "radeonsi: Unknown GPU, using 0 for raster_config\n"); + raster_config = 0x00000000; + raster_config_1 = 0x00000000; + break; + } - /* Always use the default config when all backends are enabled - * (or when we failed to determine the enabled backends). - */ - if (!rb_mask || util_bitcount(rb_mask) >= num_rb) { - si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, - raster_config); - } else { - si_write_harvested_raster_configs(sctx, pm4, raster_config); - } + /* Always use the default config when all backends are enabled + * (or when we failed to determine the enabled backends). 
+ */ + if (!rb_mask || util_bitcount(rb_mask) >= num_rb) { + si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, + raster_config); + if (sctx->b.chip_class >= CIK) + si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, + raster_config_1); + } else { + si_write_harvested_raster_configs(sctx, pm4, raster_config, raster_config_1); } si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1)); @@ -3153,8 +3289,6 @@ void si_init_config(struct si_context *sctx) si_pm4_set_reg(pm4, R_028BEC_PA_CL_GB_VERT_DISC_ADJ, fui(1.0)); si_pm4_set_reg(pm4, R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, fui(1.0)); si_pm4_set_reg(pm4, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ, fui(1.0)); - si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, 0); - si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, 0); si_pm4_set_reg(pm4, R_028028_DB_STENCIL_CLEAR, 0); si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0); si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0); @@ -3173,10 +3307,21 @@ void si_init_config(struct si_context *sctx) si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0); if (sctx->b.chip_class >= CIK) { + si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xfffc)); + si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 0); + si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, S_00B31C_CU_EN(0xfffe)); + si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, S_00B21C_CU_EN(0xffff)); si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, S_00B118_CU_EN(0xffff)); si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(0)); si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, S_00B01C_CU_EN(0xffff)); } + if (sctx->b.chip_class >= VI) { + si_pm4_set_reg(pm4, R_028424_CB_DCC_CONTROL, + S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(1)); + si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 30); + si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 32); + } + sctx->init_config = pm4; } diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 5e68b162137..b8f63c5dd36 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -30,6 +30,8 @@ #include "si_pm4.h" #include "radeon/r600_pipe_common.h" +#define SI_NUM_SHADERS (PIPE_SHADER_TESS_EVAL+1) + struct si_screen; struct si_shader; @@ -92,18 +94,21 @@ union si_state { struct si_pm4_state *blend_color; struct si_pm4_state *clip; struct si_state_sample_mask *sample_mask; - struct si_state_scissor *scissor; - struct si_state_viewport *viewport; + struct si_state_scissor *scissor[16]; + struct si_state_viewport *viewport[16]; struct si_state_rasterizer *rasterizer; struct si_state_dsa *dsa; struct si_pm4_state *fb_rs; struct si_pm4_state *fb_blend; struct si_pm4_state *dsa_stencil_ref; struct si_pm4_state *ta_bordercolor_base; + struct si_pm4_state *ls; + struct si_pm4_state *hs; struct si_pm4_state *es; struct si_pm4_state *gs; struct si_pm4_state *gs_rings; - struct si_pm4_state *gs_onoff; + struct si_pm4_state *tf_ring; + struct si_pm4_state *vgt_shader_config; struct si_pm4_state *vs; struct si_pm4_state *ps; struct si_pm4_state *spi; @@ -111,6 +116,11 @@ union si_state { struct si_pm4_state *array[0]; }; +struct si_shader_data { + struct r600_atom atom; + uint32_t sh_base[SI_NUM_SHADERS]; +}; + #define SI_NUM_USER_SAMPLERS 16 /* AKA OpenGL textures units per shader */ #define SI_POLY_STIPPLE_SAMPLER SI_NUM_USER_SAMPLERS #define SI_NUM_SAMPLERS (SI_POLY_STIPPLE_SAMPLER + 1) @@ -135,68 +145,61 @@ union si_state { * Ring 
buffers: 0..1 * Streamout buffers: 2..5 */ -#define SI_RING_ESGS 0 -#define SI_RING_GSVS 1 -#define SI_NUM_RING_BUFFERS 2 +#define SI_RING_TESS_FACTOR 0 /* for HS (TCS) */ +#define SI_RING_ESGS 0 /* for ES, GS */ +#define SI_RING_GSVS 1 /* for GS, VS */ +#define SI_RING_GSVS_1 2 /* 1, 2, 3 for GS */ +#define SI_RING_GSVS_2 3 +#define SI_RING_GSVS_3 4 +#define SI_NUM_RING_BUFFERS 5 #define SI_SO_BUF_OFFSET SI_NUM_RING_BUFFERS #define SI_NUM_RW_BUFFERS (SI_SO_BUF_OFFSET + 4) #define SI_NUM_VERTEX_BUFFERS 16 -/* This represents resource descriptors in memory, such as buffer resources, +/* This represents descriptors in memory, such as buffer resources, * image resources, and sampler states. */ struct si_descriptors { - struct r600_atom atom; - - /* The size of one resource descriptor. */ + /* The list of descriptors in malloc'd memory. */ + uint32_t *list; + /* The size of one descriptor. */ unsigned element_dw_size; - /* The maximum number of resource descriptors. */ + /* The maximum number of descriptors. */ unsigned num_elements; + /* Whether the list has been changed and should be re-uploaded. */ + bool list_dirty; - /* The buffer where resource descriptors are stored. */ + /* The buffer where the descriptors have been uploaded. */ struct r600_resource *buffer; unsigned buffer_offset; - /* The i-th bit is set if that element is dirty (changed but not emitted). */ - uint64_t dirty_mask; /* The i-th bit is set if that element is enabled (non-NULL resource). */ uint64_t enabled_mask; - /* We can't update descriptors directly because the GPU might be - * reading them at the same time, so we have to update them - * in a copy-on-write manner. Each such copy is called a context, - * which is just another array descriptors in the same buffer. */ - unsigned current_context_id; - /* The size of a context, should be equal to 4*element_dw_size*num_elements. */ - unsigned context_size; - - /* The shader userdata register where the 64-bit pointer to the descriptor + /* The shader userdata offset within a shader where the 64-bit pointer to the descriptor * array will be stored. */ - unsigned shader_userdata_reg; + unsigned shader_userdata_offset; + /* Whether the pointer should be re-emitted. 
*/ + bool pointer_dirty; }; struct si_sampler_views { struct si_descriptors desc; struct pipe_sampler_view *views[SI_NUM_SAMPLER_VIEWS]; - uint32_t *desc_data[SI_NUM_SAMPLER_VIEWS]; }; struct si_sampler_states { struct si_descriptors desc; - uint32_t *desc_data[SI_NUM_SAMPLER_STATES]; void *saved_states[2]; /* saved for u_blitter */ }; struct si_buffer_resources { struct si_descriptors desc; - unsigned num_buffers; enum radeon_bo_usage shader_usage; /* READ, WRITE, or READWRITE */ enum radeon_bo_priority priority; struct pipe_resource **buffers; /* this has num_buffers elements */ - uint32_t *desc_storage; /* this has num_buffers*4 elements */ - uint32_t **desc_data; /* an array of pointers pointing to desc_storage */ }; #define si_pm4_block_idx(member) \ @@ -232,20 +235,18 @@ struct si_buffer_resources { /* si_descriptors.c */ void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader, unsigned start, unsigned count, void **states); -void si_update_vertex_buffers(struct si_context *sctx); void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot, struct pipe_resource *buffer, unsigned stride, unsigned num_records, bool add_tid, bool swizzle, - unsigned element_size, unsigned index_stride); + unsigned element_size, unsigned index_stride, uint64_t offset); void si_init_all_descriptors(struct si_context *sctx); +bool si_upload_shader_descriptors(struct si_context *sctx); void si_release_all_descriptors(struct si_context *sctx); void si_all_descriptors_begin_new_cs(struct si_context *sctx); -void si_copy_buffer(struct si_context *sctx, - struct pipe_resource *dst, struct pipe_resource *src, - uint64_t dst_offset, uint64_t src_offset, unsigned size, bool is_framebuffer); void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer, const uint8_t *ptr, unsigned size, uint32_t *const_offset); +void si_shader_change_notify(struct si_context *sctx); /* si_state.c */ struct si_shader_selector; @@ -256,7 +257,6 @@ boolean si_is_format_supported(struct pipe_screen *screen, unsigned sample_count, unsigned usage); void si_init_state_functions(struct si_context *sctx); -void si_init_config(struct si_context *sctx); unsigned cik_bank_wh(unsigned bankwh); unsigned cik_db_pipe_config(struct si_screen *sscreen, unsigned tile_mode); unsigned cik_macro_tile_aspect(unsigned macro_tile_aspect); @@ -264,6 +264,12 @@ unsigned cik_tile_split(unsigned tile_split); unsigned si_array_mode(unsigned mode); uint32_t si_num_banks(struct si_screen *sscreen, struct r600_texture *tex); unsigned si_tile_mode_index(struct r600_texture *rtex, unsigned level, bool stencil); +struct pipe_sampler_view * +si_create_sampler_view_custom(struct pipe_context *ctx, + struct pipe_resource *texture, + const struct pipe_sampler_view *state, + unsigned width0, unsigned height0, + unsigned force_level); /* si_state_shader.c */ void si_update_shaders(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 2e77d85a80d..4c21655596c 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -31,6 +31,7 @@ #include "util/u_index_modify.h" #include "util/u_upload_mgr.h" +#include "util/u_prim.h" static void si_decompress_textures(struct si_context *sctx) { @@ -64,6 +65,7 @@ static unsigned si_conv_pipe_prim(unsigned mode) [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_008958_DI_PT_LINESTRIP_ADJ, [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_008958_DI_PT_TRILIST_ADJ, 
[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_008958_DI_PT_TRISTRIP_ADJ, + [PIPE_PRIM_PATCHES] = V_008958_DI_PT_PATCH, [R600_PRIM_RECTANGLE_LIST] = V_008958_DI_PT_RECTLIST }; assert(mode < Elements(prim_conv)); @@ -87,6 +89,7 @@ static unsigned si_conv_prim_to_gs_out(unsigned mode) [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, + [PIPE_PRIM_PATCHES] = V_028A6C_OUTPRIM_TYPE_POINTLIST, [R600_PRIM_RECTANGLE_LIST] = V_028A6C_OUTPRIM_TYPE_TRISTRIP }; assert(mode < Elements(prim_conv)); @@ -94,8 +97,128 @@ static unsigned si_conv_prim_to_gs_out(unsigned mode) return prim_conv[mode]; } +/** + * This calculates the LDS size for tessellation shaders (VS, TCS, TES). + * LS.LDS_SIZE is shared by all 3 shader stages. + * + * The information about LDS and other non-compile-time parameters is then + * written to userdata SGPRs. + */ +static void si_emit_derived_tess_state(struct si_context *sctx, + const struct pipe_draw_info *info, + unsigned *num_patches) +{ + struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct si_shader_selector *ls = sctx->vs_shader; + /* The TES pointer will only be used for sctx->last_tcs. + * It would be wrong to think that TCS = TES. */ + struct si_shader_selector *tcs = + sctx->tcs_shader ? sctx->tcs_shader : sctx->tes_shader; + unsigned tes_sh_base = sctx->shader_userdata.sh_base[PIPE_SHADER_TESS_EVAL]; + unsigned num_tcs_input_cp = info->vertices_per_patch; + unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs; + unsigned num_tcs_patch_outputs; + unsigned input_vertex_size, output_vertex_size, pervertex_output_patch_size; + unsigned input_patch_size, output_patch_size, output_patch0_offset; + unsigned perpatch_output_offset, lds_size, ls_rsrc2; + unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets; + + *num_patches = 1; /* TODO: calculate this */ + + if (sctx->last_ls == ls->current && + sctx->last_tcs == tcs && + sctx->last_tes_sh_base == tes_sh_base && + sctx->last_num_tcs_input_cp == num_tcs_input_cp) + return; + + sctx->last_ls = ls->current; + sctx->last_tcs = tcs; + sctx->last_tes_sh_base = tes_sh_base; + sctx->last_num_tcs_input_cp = num_tcs_input_cp; + + /* This calculates how shader inputs and outputs among VS, TCS, and TES + * are laid out in LDS. */ + num_tcs_inputs = util_last_bit64(ls->outputs_written); + + if (sctx->tcs_shader) { + num_tcs_outputs = util_last_bit64(tcs->outputs_written); + num_tcs_output_cp = tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; + num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written); + } else { + /* No TCS. Route varyings from LS to TES. */ + num_tcs_outputs = num_tcs_inputs; + num_tcs_output_cp = num_tcs_input_cp; + num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */ + } + + input_vertex_size = num_tcs_inputs * 16; + output_vertex_size = num_tcs_outputs * 16; + + input_patch_size = num_tcs_input_cp * input_vertex_size; + + pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size; + output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; + + output_patch0_offset = sctx->tcs_shader ? 
input_patch_size * *num_patches : 0; + perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size; + + lds_size = output_patch0_offset + output_patch_size * *num_patches; + ls_rsrc2 = ls->current->ls_rsrc2; + + if (sctx->b.chip_class >= CIK) { + assert(lds_size <= 65536); + ls_rsrc2 |= S_00B52C_LDS_SIZE(align(lds_size, 512) / 512); + } else { + assert(lds_size <= 32768); + ls_rsrc2 |= S_00B52C_LDS_SIZE(align(lds_size, 256) / 256); + } + + /* Due to a hw bug, RSRC2_LS must be written twice with another + * LS register written in between. */ + if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII) + si_write_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2); + si_write_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2); + radeon_emit(cs, ls->current->ls_rsrc1); + radeon_emit(cs, ls_rsrc2); + + /* Compute userdata SGPRs. */ + assert(((input_vertex_size / 4) & ~0xff) == 0); + assert(((output_vertex_size / 4) & ~0xff) == 0); + assert(((input_patch_size / 4) & ~0x1fff) == 0); + assert(((output_patch_size / 4) & ~0x1fff) == 0); + assert(((output_patch0_offset / 16) & ~0xffff) == 0); + assert(((perpatch_output_offset / 16) & ~0xffff) == 0); + assert(num_tcs_input_cp <= 32); + assert(num_tcs_output_cp <= 32); + + tcs_in_layout = (input_patch_size / 4) | + ((input_vertex_size / 4) << 13); + tcs_out_layout = (output_patch_size / 4) | + ((output_vertex_size / 4) << 13); + tcs_out_offsets = (output_patch0_offset / 16) | + ((perpatch_output_offset / 16) << 16); + + /* Set them for LS. */ + si_write_sh_reg(cs, + R_00B530_SPI_SHADER_USER_DATA_LS_0 + SI_SGPR_LS_OUT_LAYOUT * 4, + tcs_in_layout); + + /* Set them for TCS. */ + si_write_sh_reg_seq(cs, + R_00B430_SPI_SHADER_USER_DATA_HS_0 + SI_SGPR_TCS_OUT_OFFSETS * 4, 3); + radeon_emit(cs, tcs_out_offsets); + radeon_emit(cs, tcs_out_layout | (num_tcs_input_cp << 26)); + radeon_emit(cs, tcs_in_layout); + + /* Set them for TES. */ + si_write_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TCS_OUT_OFFSETS * 4, 2); + radeon_emit(cs, tcs_out_offsets); + radeon_emit(cs, tcs_out_layout | (num_tcs_output_cp << 26)); +} + static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, - const struct pipe_draw_info *info) + const struct pipe_draw_info *info, + unsigned num_patches) { struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; unsigned prim = info->mode; @@ -104,11 +227,41 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, /* SWITCH_ON_EOP(0) is always preferable. */ bool wd_switch_on_eop = false; bool ia_switch_on_eop = false; + bool ia_switch_on_eoi = false; bool partial_vs_wave = false; + bool partial_es_wave = false; if (sctx->gs_shader) primgroup_size = 64; /* recommended with a GS */ + if (sctx->tes_shader) { + unsigned num_cp_out = + sctx->tcs_shader ? + sctx->tcs_shader->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : + info->vertices_per_patch; + unsigned max_size = 256 / MAX2(info->vertices_per_patch, num_cp_out); + + primgroup_size = MIN2(primgroup_size, max_size); + + /* primgroup_size must be set to a multiple of NUM_PATCHES */ + primgroup_size = (primgroup_size / num_patches) * num_patches; + + /* SWITCH_ON_EOI must be set if PrimID is used. + * If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */ + if ((sctx->tcs_shader && sctx->tcs_shader->info.uses_primid) || + sctx->tes_shader->info.uses_primid) { + ia_switch_on_eoi = true; + partial_es_wave = true; + } + + /* Bug with tessellation and GS on Bonaire and older 2 SE chips. 
*/ + if ((sctx->b.family == CHIP_TAHITI || + sctx->b.family == CHIP_PITCAIRN || + sctx->b.family == CHIP_BONAIRE) && + sctx->gs_shader) + partial_vs_wave = true; + } + /* This is a hardware requirement. */ if ((rs && rs->line_stipple_enable) || (sctx->b.screen->debug_flags & DBG_SWITCH_ON_EOP)) { @@ -139,14 +292,52 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, (info->indirect || info->instance_count > 1)) wd_switch_on_eop = true; + /* USE_OPAQUE doesn't work when WD_SWITCH_ON_EOP is 0. */ + if (info->count_from_stream_output) + wd_switch_on_eop = true; + /* If the WD switch is false, the IA switch must be false too. */ assert(wd_switch_on_eop || !ia_switch_on_eop); } + /* Hw bug with single-primitive instances and SWITCH_ON_EOI + * on multi-SE chips. */ + if (sctx->b.screen->info.max_se >= 2 && ia_switch_on_eoi && + (info->indirect || + (info->instance_count > 1 && + u_prims_for_vertices(info->mode, info->count) <= 1))) + sctx->b.flags |= SI_CONTEXT_VGT_FLUSH; + + /* Instancing bug on 2 SE chips. */ + if (sctx->b.screen->info.max_se == 2 && ia_switch_on_eoi && + (info->indirect || info->instance_count > 1)) + partial_vs_wave = true; + return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) | + S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) | S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) | + S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) | S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1) | - S_028AA8_WD_SWITCH_ON_EOP(sctx->b.chip_class >= CIK ? wd_switch_on_eop : 0); + S_028AA8_WD_SWITCH_ON_EOP(sctx->b.chip_class >= CIK ? wd_switch_on_eop : 0) | + S_028AA8_MAX_PRIMGRP_IN_WAVE(sctx->b.chip_class >= VI ? 2 : 0); +} + +static unsigned si_get_ls_hs_config(struct si_context *sctx, + const struct pipe_draw_info *info, + unsigned num_patches) +{ + unsigned num_output_cp; + + if (!sctx->tes_shader) + return 0; + + num_output_cp = sctx->tcs_shader ? + sctx->tcs_shader->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : + info->vertices_per_patch; + + return S_028B58_NUM_PATCHES(num_patches) | + S_028B58_HS_NUM_INPUT_CP(info->vertices_per_patch) | + S_028B58_HS_NUM_OUTPUT_CP(num_output_cp); } static void si_emit_scratch_reloc(struct si_context *sctx) @@ -202,22 +393,31 @@ static void si_emit_draw_registers(struct si_context *sctx, struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; unsigned prim = si_conv_pipe_prim(info->mode); unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim); - unsigned ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info); + unsigned ia_multi_vgt_param, ls_hs_config, num_patches = 0; + + if (sctx->tes_shader) + si_emit_derived_tess_state(sctx, info, &num_patches); + + ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, num_patches); + ls_hs_config = si_get_ls_hs_config(sctx, info, num_patches); /* Draw state. 
*/ if (prim != sctx->last_prim || - ia_multi_vgt_param != sctx->last_multi_vgt_param) { + ia_multi_vgt_param != sctx->last_multi_vgt_param || + ls_hs_config != sctx->last_ls_hs_config) { if (sctx->b.chip_class >= CIK) { radeon_emit(cs, PKT3(PKT3_DRAW_PREAMBLE, 2, 0)); radeon_emit(cs, prim); /* VGT_PRIMITIVE_TYPE */ radeon_emit(cs, ia_multi_vgt_param); /* IA_MULTI_VGT_PARAM */ - radeon_emit(cs, 0); /* VGT_LS_HS_CONFIG */ + radeon_emit(cs, ls_hs_config); /* VGT_LS_HS_CONFIG */ } else { r600_write_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, prim); r600_write_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param); + r600_write_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config); } sctx->last_prim = prim; sctx->last_multi_vgt_param = ia_multi_vgt_param; + sctx->last_ls_hs_config = ls_hs_config; } if (gs_out_prim != sctx->last_gs_out_prim) { @@ -245,8 +445,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_index_buffer *ib) { struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; - unsigned sh_base_reg = (sctx->gs_shader ? R_00B330_SPI_SHADER_USER_DATA_ES_0 : - R_00B130_SPI_SHADER_USER_DATA_VS_0); + unsigned sh_base_reg = sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX]; if (info->count_from_stream_output) { struct r600_so_target *t = @@ -275,12 +474,24 @@ static void si_emit_draw_packets(struct si_context *sctx, if (info->indexed) { radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0)); - if (ib->index_size == 4) { - radeon_emit(cs, V_028A7C_VGT_INDEX_32 | (SI_BIG_ENDIAN ? - V_028A7C_VGT_DMA_SWAP_32_BIT : 0)); - } else { - radeon_emit(cs, V_028A7C_VGT_INDEX_16 | (SI_BIG_ENDIAN ? - V_028A7C_VGT_DMA_SWAP_16_BIT : 0)); + /* index type */ + switch (ib->index_size) { + case 1: + radeon_emit(cs, V_028A7C_VGT_INDEX_8); + break; + case 2: + radeon_emit(cs, V_028A7C_VGT_INDEX_16 | + (SI_BIG_ENDIAN && sctx->b.chip_class <= CIK ? + V_028A7C_VGT_DMA_SWAP_16_BIT : 0)); + break; + case 4: + radeon_emit(cs, V_028A7C_VGT_INDEX_32 | + (SI_BIG_ENDIAN && sctx->b.chip_class <= CIK ? + V_028A7C_VGT_DMA_SWAP_32_BIT : 0)); + break; + default: + assert(!"unreachable"); + return; } } @@ -406,9 +617,14 @@ void si_emit_cache_flush(struct r600_common_context *sctx, struct r600_atom *ato if (sctx->flags & SI_CONTEXT_INV_TC_L1) cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1); - if (sctx->flags & SI_CONTEXT_INV_TC_L2) + if (sctx->flags & SI_CONTEXT_INV_TC_L2) { cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1); + /* TODO: this might not be needed. */ + if (sctx->chip_class >= VI) + cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1); + } + if (sctx->flags & SI_CONTEXT_FLUSH_AND_INV_CB) { cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) | @@ -520,8 +736,14 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) (info->indexed || !info->count_from_stream_output)) return; - if (!sctx->ps_shader || !sctx->vs_shader) + if (!sctx->ps_shader || !sctx->vs_shader) { + assert(0); return; + } + if (!!sctx->tes_shader != (info->mode == PIPE_PRIM_PATCHES)) { + assert(0); + return; + } si_decompress_textures(sctx); @@ -532,15 +754,15 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) * current_rast_prim for this draw_vbo call. 
*/ if (sctx->gs_shader) sctx->current_rast_prim = sctx->gs_shader->gs_output_prim; + else if (sctx->tes_shader) + sctx->current_rast_prim = + sctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]; else sctx->current_rast_prim = info->mode; si_update_shaders(sctx); - - if (sctx->vertex_buffers_dirty) { - si_update_vertex_buffers(sctx); - sctx->vertex_buffers_dirty = false; - } + if (!si_upload_shader_descriptors(sctx)) + return; if (info->indexed) { /* Initialize the index buffer struct. */ @@ -550,7 +772,8 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) ib.offset = sctx->index_buffer.offset; /* Translate or upload, if needed. */ - if (ib.index_size == 1) { + /* 8-bit indices are supported on VI. */ + if (sctx->b.chip_class <= CIK && ib.index_size == 1) { struct pipe_resource *out_buffer = NULL; unsigned out_offset, start, count, start_offset; void *ptr; @@ -585,6 +808,8 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) } } + /* TODO: VI should read index buffers through TC, so this shouldn't be + * needed on VI. */ if (info->indexed && r600_resource(ib.buffer)->TC_L2_dirty) { sctx->b.flags |= SI_CONTEXT_INV_TC_L2; r600_resource(ib.buffer)->TC_L2_dirty = false; @@ -592,7 +817,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) /* Check flush flags. */ if (sctx->b.flags) - sctx->atoms.s.cache_flush->dirty = true; + si_mark_atom_dirty(sctx, sctx->atoms.s.cache_flush); si_need_cs_space(sctx, 0, TRUE); @@ -618,7 +843,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) /* Workaround for a VGT hang when streamout is enabled. * It must be done after drawing. */ - if (sctx->b.family == CHIP_HAWAII && + if ((sctx->b.family == CHIP_HAWAII || sctx->b.family == CHIP_TONGA) && (sctx->b.streamout.streamout_enabled || sctx->b.streamout.prims_gen_query_enabled)) { sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 208c8523ef1..0347014948d 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -30,9 +30,135 @@ #include "sid.h" #include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_ureg.h" #include "util/u_memory.h" #include "util/u_simple_shaders.h" +static void si_set_tesseval_regs(struct si_shader *shader, + struct si_pm4_state *pm4) +{ + struct tgsi_shader_info *info = &shader->selector->info; + unsigned tes_prim_mode = info->properties[TGSI_PROPERTY_TES_PRIM_MODE]; + unsigned tes_spacing = info->properties[TGSI_PROPERTY_TES_SPACING]; + bool tes_vertex_order_cw = info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW]; + bool tes_point_mode = info->properties[TGSI_PROPERTY_TES_POINT_MODE]; + unsigned type, partitioning, topology; + + switch (tes_prim_mode) { + case PIPE_PRIM_LINES: + type = V_028B6C_TESS_ISOLINE; + break; + case PIPE_PRIM_TRIANGLES: + type = V_028B6C_TESS_TRIANGLE; + break; + case PIPE_PRIM_QUADS: + type = V_028B6C_TESS_QUAD; + break; + default: + assert(0); + return; + } + + switch (tes_spacing) { + case PIPE_TESS_SPACING_FRACTIONAL_ODD: + partitioning = V_028B6C_PART_FRAC_ODD; + break; + case PIPE_TESS_SPACING_FRACTIONAL_EVEN: + partitioning = V_028B6C_PART_FRAC_EVEN; + break; + case PIPE_TESS_SPACING_EQUAL: + partitioning = V_028B6C_PART_INTEGER; + break; + default: + assert(0); + return; + } + + if (tes_point_mode) + topology = V_028B6C_OUTPUT_POINT; + else if (tes_prim_mode == 
PIPE_PRIM_LINES) + topology = V_028B6C_OUTPUT_LINE; + else if (tes_vertex_order_cw) + /* for some reason, this must be the other way around */ + topology = V_028B6C_OUTPUT_TRIANGLE_CCW; + else + topology = V_028B6C_OUTPUT_TRIANGLE_CW; + + si_pm4_set_reg(pm4, R_028B6C_VGT_TF_PARAM, + S_028B6C_TYPE(type) | + S_028B6C_PARTITIONING(partitioning) | + S_028B6C_TOPOLOGY(topology)); +} + +static void si_shader_ls(struct si_shader *shader) +{ + struct si_pm4_state *pm4; + unsigned num_sgprs, num_user_sgprs; + unsigned vgpr_comp_cnt; + uint64_t va; + + pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state); + if (pm4 == NULL) + return; + + va = shader->bo->gpu_address; + si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA); + + /* We need at least 2 components for LS. + * VGPR0-3: (VertexID, RelAutoindex, ???, InstanceID). */ + vgpr_comp_cnt = shader->uses_instanceid ? 3 : 1; + + num_user_sgprs = SI_LS_NUM_USER_SGPR; + num_sgprs = shader->num_sgprs; + if (num_user_sgprs > num_sgprs) { + /* Last 2 reserved SGPRs are used for VCC */ + num_sgprs = num_user_sgprs + 2; + } + assert(num_sgprs <= 104); + + si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); + si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40); + + shader->ls_rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) | + S_00B528_SGPRS((num_sgprs - 1) / 8) | + S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt); + shader->ls_rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) | + S_00B52C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0); +} + +static void si_shader_hs(struct si_shader *shader) +{ + struct si_pm4_state *pm4; + unsigned num_sgprs, num_user_sgprs; + uint64_t va; + + pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state); + if (pm4 == NULL) + return; + + va = shader->bo->gpu_address; + si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA); + + num_user_sgprs = SI_TCS_NUM_USER_SGPR; + num_sgprs = shader->num_sgprs; + /* One SGPR after user SGPRs is pre-loaded with tessellation factor + * buffer offset. */ + if ((num_user_sgprs + 1) > num_sgprs) { + /* Last 2 reserved SGPRs are used for VCC */ + num_sgprs = num_user_sgprs + 1 + 2; + } + assert(num_sgprs <= 104); + + si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8); + si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, va >> 40); + si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS, + S_00B428_VGPRS((shader->num_vgprs - 1) / 4) | + S_00B428_SGPRS((num_sgprs - 1) / 8)); + si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, + S_00B42C_USER_SGPR(num_user_sgprs) | + S_00B42C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); +} + static void si_shader_es(struct si_shader *shader) { struct si_pm4_state *pm4; @@ -48,9 +174,15 @@ static void si_shader_es(struct si_shader *shader) va = shader->bo->gpu_address; si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA); - vgpr_comp_cnt = shader->uses_instanceid ? 3 : 0; + if (shader->selector->type == PIPE_SHADER_VERTEX) { + vgpr_comp_cnt = shader->uses_instanceid ? 
3 : 0; + num_user_sgprs = SI_VS_NUM_USER_SGPR; + } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { + vgpr_comp_cnt = 3; /* all components are needed for TES */ + num_user_sgprs = SI_TES_NUM_USER_SGPR; + } else + assert(0); - num_user_sgprs = SI_VS_NUM_USER_SGPR; num_sgprs = shader->num_sgprs; /* One SGPR after user SGPRs is pre-loaded with es2gs_offset */ if ((num_user_sgprs + 1) > num_sgprs) { @@ -69,17 +201,37 @@ static void si_shader_es(struct si_shader *shader) si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES, S_00B32C_USER_SGPR(num_user_sgprs) | S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); + + if (shader->selector->type == PIPE_SHADER_TESS_EVAL) + si_set_tesseval_regs(shader, pm4); +} + +static unsigned si_gs_get_max_stream(struct si_shader *shader) +{ + struct pipe_stream_output_info *so = &shader->selector->so; + unsigned max_stream = 0, i; + + if (so->num_outputs == 0) + return 0; + + for (i = 0; i < so->num_outputs; i++) { + if (so->output[i].stream > max_stream) + max_stream = so->output[i].stream; + } + return max_stream; } static void si_shader_gs(struct si_shader *shader) { - unsigned gs_vert_itemsize = shader->selector->info.num_outputs * (16 >> 2); + unsigned gs_vert_itemsize = shader->selector->info.num_outputs * 16; unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices; - unsigned gsvs_itemsize = gs_vert_itemsize * gs_max_vert_out; + unsigned gsvs_itemsize = (gs_vert_itemsize * gs_max_vert_out) >> 2; + unsigned gs_num_invocations = shader->selector->gs_num_invocations; unsigned cut_mode; struct si_pm4_state *pm4; unsigned num_sgprs, num_user_sgprs; uint64_t va; + unsigned max_stream = si_gs_get_max_stream(shader); /* The GSVS_RING_ITEMSIZE register takes 15 bits */ assert(gsvs_itemsize < (1 << 15)); @@ -107,16 +259,23 @@ static void si_shader_gs(struct si_shader *shader) S_028A40_GS_WRITE_OPTIMIZE(1)); si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, gsvs_itemsize); - si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize); - si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize); + si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * ((max_stream >= 2) ? 2 : 1)); + si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize * ((max_stream >= 3) ? 3 : 1)); si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE, - util_bitcount64(shader->selector->gs_used_inputs) * (16 >> 2)); - si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize); + util_bitcount64(shader->selector->inputs_read) * (16 >> 2)); + si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * (max_stream + 1)); si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, gs_max_vert_out); - si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize); + si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize >> 2); + si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? gs_vert_itemsize >> 2 : 0); + si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= 2) ? gs_vert_itemsize >> 2 : 0); + si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= 3) ? 
gs_vert_itemsize >> 2 : 0); + + si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT, + S_028B90_CNT(MIN2(gs_num_invocations, 127)) | + S_028B90_ENABLE(gs_num_invocations > 0)); va = shader->bo->gpu_address; si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA); @@ -143,19 +302,29 @@ static void si_shader_gs(struct si_shader *shader) static void si_shader_vs(struct si_shader *shader) { - struct tgsi_shader_info *info = &shader->selector->info; struct si_pm4_state *pm4; unsigned num_sgprs, num_user_sgprs; - unsigned nparams, i, vgpr_comp_cnt; + unsigned nparams, vgpr_comp_cnt; uint64_t va; unsigned window_space = shader->selector->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + bool enable_prim_id = si_vs_exports_prim_id(shader); pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state); if (pm4 == NULL) return; + /* If this is the GS copy shader, the GS state writes this register. + * Otherwise, the VS state writes it. + */ + if (!shader->is_gs_copy_shader) { + si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, + S_028A40_MODE(enable_prim_id ? V_028A40_GS_SCENARIO_A : 0)); + si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, enable_prim_id); + } else + si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, 0); + va = shader->bo->gpu_address; si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA); @@ -163,8 +332,11 @@ static void si_shader_vs(struct si_shader *shader) vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */ num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR; } else if (shader->selector->type == PIPE_SHADER_VERTEX) { - vgpr_comp_cnt = shader->uses_instanceid ? 3 : 0; + vgpr_comp_cnt = shader->uses_instanceid ? 3 : (enable_prim_id ? 2 : 0); num_user_sgprs = SI_VS_NUM_USER_SGPR; + } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { + vgpr_comp_cnt = 3; /* all components are needed for TES */ + num_user_sgprs = SI_TES_NUM_USER_SGPR; } else assert(0); @@ -175,28 +347,8 @@ static void si_shader_vs(struct si_shader *shader) } assert(num_sgprs <= 104); - /* Certain attributes (position, psize, etc.) don't count as params. - * VS is required to export at least one param and r600_shader_from_tgsi() - * takes care of adding a dummy export. - */ - for (nparams = 0, i = 0 ; i < info->num_outputs; i++) { - switch (info->output_semantic_name[i]) { - case TGSI_SEMANTIC_CLIPVERTEX: - case TGSI_SEMANTIC_CLIPDIST: - case TGSI_SEMANTIC_CULLDIST: - case TGSI_SEMANTIC_POSITION: - case TGSI_SEMANTIC_PSIZE: - case TGSI_SEMANTIC_EDGEFLAG: - case TGSI_SEMANTIC_VIEWPORT_INDEX: - case TGSI_SEMANTIC_LAYER: - break; - default: - nparams++; - } - } - if (nparams < 1) - nparams = 1; - + /* VS is required to export at least one param. 
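Editor's note: the SPI_VS_OUT_CONFIG field below is biased by one, i.e.
VS_EXPORT_COUNT(nparams - 1), so a field value of 0 already means one
param export and there is no encoding for zero; that is why nparams is
clamped with MAX2 rather than left at shader->nr_param_exports. Worked
example with an illustrative value: a position-only VS reporting
nr_param_exports == 0 still programs VS_EXPORT_COUNT(MAX2(0, 1) - 1),
i.e. VS_EXPORT_COUNT(0), one dummy export.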
*/ + nparams = MAX2(shader->nr_param_exports, 1); si_pm4_set_reg(pm4, R_0286C4_SPI_VS_OUT_CONFIG, S_0286C4_VS_EXPORT_COUNT(nparams - 1)); @@ -236,6 +388,9 @@ static void si_shader_vs(struct si_shader *shader) S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) | S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) | S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1)); + + if (shader->selector->type == PIPE_SHADER_TESS_EVAL) + si_set_tesseval_regs(shader, pm4); } static void si_shader_ps(struct si_shader *shader) @@ -333,7 +488,18 @@ static void si_shader_init_pm4_state(struct si_shader *shader) switch (shader->selector->type) { case PIPE_SHADER_VERTEX: - if (shader->key.vs.as_es) + if (shader->key.vs.as_ls) + si_shader_ls(shader); + else if (shader->key.vs.as_es) + si_shader_es(shader); + else + si_shader_vs(shader); + break; + case PIPE_SHADER_TESS_CTRL: + si_shader_hs(shader); + break; + case PIPE_SHADER_TESS_EVAL: + if (shader->key.tes.as_es) si_shader_es(shader); else si_shader_vs(shader); @@ -351,7 +517,7 @@ static void si_shader_init_pm4_state(struct si_shader *shader) } /* Compute the key for the hw shader variant */ -static INLINE void si_shader_selector_key(struct pipe_context *ctx, +static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_shader_selector *sel, union si_shader_key *key) { @@ -367,10 +533,27 @@ static INLINE void si_shader_selector_key(struct pipe_context *ctx, key->vs.instance_divisors[i] = sctx->vertex_elements->elements[i].instance_divisor; - if (sctx->gs_shader) { + if (sctx->tes_shader) + key->vs.as_ls = 1; + else if (sctx->gs_shader) { key->vs.as_es = 1; - key->vs.gs_used_inputs = sctx->gs_shader->gs_used_inputs; + key->vs.es_enabled_outputs = sctx->gs_shader->inputs_read; } + + if (!sctx->gs_shader && sctx->ps_shader && + sctx->ps_shader->info.uses_primid) + key->vs.export_prim_id = 1; + break; + case PIPE_SHADER_TESS_CTRL: + key->tcs.prim_mode = + sctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]; + break; + case PIPE_SHADER_TESS_EVAL: + if (sctx->gs_shader) { + key->tes.as_es = 1; + key->tes.es_enabled_outputs = sctx->gs_shader->inputs_read; + } else if (sctx->ps_shader && sctx->ps_shader->info.uses_primid) + key->tes.export_prim_id = 1; break; case PIPE_SHADER_GEOMETRY: break; @@ -468,6 +651,7 @@ static int si_shader_select(struct pipe_context *ctx, } si_shader_init_pm4_state(shader); sel->num_shaders++; + p_atomic_inc(&sctx->screen->b.num_compilations); } return 0; @@ -485,6 +669,7 @@ static void *si_create_shader_state(struct pipe_context *ctx, sel->tokens = tgsi_dup_tokens(state->tokens); sel->so = state->stream_output; tgsi_scan_shader(state->tokens, &sel->info); + p_atomic_inc(&sscreen->b.num_shaders_created); switch (pipe_shader_type) { case PIPE_SHADER_GEOMETRY: @@ -492,6 +677,8 @@ static void *si_create_shader_state(struct pipe_context *ctx, sel->info.properties[TGSI_PROPERTY_GS_OUTPUT_PRIM]; sel->gs_max_out_vertices = sel->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES]; + sel->gs_num_invocations = + sel->info.properties[TGSI_PROPERTY_GS_INVOCATIONS]; for (i = 0; i < sel->info.num_inputs; i++) { unsigned name = sel->info.input_semantic_name[i]; @@ -501,10 +688,31 @@ static void *si_create_shader_state(struct pipe_context *ctx, case TGSI_SEMANTIC_PRIMID: break; default: - sel->gs_used_inputs |= + sel->inputs_read |= 1llu << si_shader_io_get_unique_index(name, index); } } + break; + + case PIPE_SHADER_VERTEX: + case PIPE_SHADER_TESS_CTRL: + for (i = 0; i < 
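/* Editor's note: this loop builds 64-bit usage masks keyed by
 * si_shader_io_get_unique_index(). A standalone sketch of the same
 * accumulation pattern, where unique_index() stands in for the driver's
 * mapping function (hypothetical names throughout):
 *
 *   uint64_t outputs_written = 0, patch_outputs_written = 0;
 *
 *   for (unsigned j = 0; j < num_outputs; j++) {
 *       uint64_t bit = 1llu << unique_index(name[j], index[j]);
 *
 *       if (name[j] == TGSI_SEMANTIC_TESSINNER ||
 *           name[j] == TGSI_SEMANTIC_TESSOUTER ||
 *           name[j] == TGSI_SEMANTIC_PATCH)
 *           patch_outputs_written |= bit;   // per-patch storage class
 *       else
 *           outputs_written |= bit;         // per-vertex storage class
 *   }
 *
 * Keeping per-patch and per-vertex outputs in separate masks lets the
 * tessellation linkage size the two storage regions independently. */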
sel->info.num_outputs; i++) { + unsigned name = sel->info.output_semantic_name[i]; + unsigned index = sel->info.output_semantic_index[i]; + + switch (name) { + case TGSI_SEMANTIC_TESSINNER: + case TGSI_SEMANTIC_TESSOUTER: + case TGSI_SEMANTIC_PATCH: + sel->patch_outputs_written |= + 1llu << si_shader_io_get_unique_index(name, index); + break; + default: + sel->outputs_written |= + 1llu << si_shader_io_get_unique_index(name, index); + } + } + break; } if (sscreen->b.debug_flags & DBG_PRECOMPILE) @@ -531,6 +739,18 @@ static void *si_create_vs_state(struct pipe_context *ctx, return si_create_shader_state(ctx, state, PIPE_SHADER_VERTEX); } +static void *si_create_tcs_state(struct pipe_context *ctx, + const struct pipe_shader_state *state) +{ + return si_create_shader_state(ctx, state, PIPE_SHADER_TESS_CTRL); +} + +static void *si_create_tes_state(struct pipe_context *ctx, + const struct pipe_shader_state *state) +{ + return si_create_shader_state(ctx, state, PIPE_SHADER_TESS_EVAL); +} + static void si_bind_vs_shader(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; @@ -540,20 +760,58 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state) return; sctx->vs_shader = sel; - sctx->clip_regs.dirty = true; + si_mark_atom_dirty(sctx, &sctx->clip_regs); } static void si_bind_gs_shader(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; struct si_shader_selector *sel = state; + bool enable_changed = !!sctx->gs_shader != !!sel; if (sctx->gs_shader == sel) return; sctx->gs_shader = sel; - sctx->clip_regs.dirty = true; + si_mark_atom_dirty(sctx, &sctx->clip_regs); sctx->last_rast_prim = -1; /* reset this so that it gets updated */ + + if (enable_changed) + si_shader_change_notify(sctx); +} + +static void si_bind_tcs_shader(struct pipe_context *ctx, void *state) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = state; + bool enable_changed = !!sctx->tcs_shader != !!sel; + + if (sctx->tcs_shader == sel) + return; + + sctx->tcs_shader = sel; + + if (enable_changed) + sctx->last_tcs = NULL; /* invalidate derived tess state */ +} + +static void si_bind_tes_shader(struct pipe_context *ctx, void *state) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = state; + bool enable_changed = !!sctx->tes_shader != !!sel; + + if (sctx->tes_shader == sel) + return; + + sctx->tes_shader = sel; + si_mark_atom_dirty(sctx, &sctx->clip_regs); + sctx->last_rast_prim = -1; /* reset this so that it gets updated */ + + if (enable_changed) { + si_shader_change_notify(sctx); + sctx->last_tes_sh_base = -1; /* invalidate derived tess state */ + } } static void si_make_dummy_ps(struct si_context *sctx) @@ -594,7 +852,18 @@ static void si_delete_shader_selector(struct pipe_context *ctx, c = p->next_variant; switch (sel->type) { case PIPE_SHADER_VERTEX: - if (p->key.vs.as_es) + if (p->key.vs.as_ls) + si_pm4_delete_state(sctx, ls, p->pm4); + else if (p->key.vs.as_es) + si_pm4_delete_state(sctx, es, p->pm4); + else + si_pm4_delete_state(sctx, vs, p->pm4); + break; + case PIPE_SHADER_TESS_CTRL: + si_pm4_delete_state(sctx, hs, p->pm4); + break; + case PIPE_SHADER_TESS_EVAL: + if (p->key.tes.as_es) si_pm4_delete_state(sctx, es, p->pm4); else si_pm4_delete_state(sctx, vs, p->pm4); @@ -653,6 +922,30 @@ static void si_delete_ps_shader(struct pipe_context *ctx, void *state) si_delete_shader_selector(ctx, sel); } +static void si_delete_tcs_shader(struct pipe_context 
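/* Editor's note: the bind_*_shader callbacks above share one idiom for
 * detecting a stage being turned on or off, as opposed to merely swapped:
 *
 *   bool enable_changed = !!sctx->tcs_shader != !!sel;
 *
 * The double negation collapses each pointer to 0 or 1, so the test fires
 * only on NULL <-> non-NULL transitions. Only then is it worth calling
 * si_shader_change_notify() or invalidating the derived tessellation
 * state (last_tcs, last_tes_sh_base); replacing one selector with another
 * of the same stage leaves the pipeline configuration itself unchanged. */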
*ctx, void *state) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = (struct si_shader_selector *)state; + + if (sctx->tcs_shader == sel) { + sctx->tcs_shader = NULL; + } + + si_delete_shader_selector(ctx, sel); +} + +static void si_delete_tes_shader(struct pipe_context *ctx, void *state) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = (struct si_shader_selector *)state; + + if (sctx->tes_shader == sel) { + sctx->tes_shader = NULL; + } + + si_delete_shader_selector(ctx, sel); +} + static void si_update_spi_map(struct si_context *sctx) { struct si_shader *ps = sctx->ps_shader->current; @@ -694,7 +987,10 @@ bcolor: } } - if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(tmp)) { + if (name == TGSI_SEMANTIC_PRIMID) + /* PrimID is written after the last output. */ + tmp |= S_028644_OFFSET(vs->vs_output_param_offset[vsinfo->num_outputs]); + else if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(tmp)) { /* No corresponding output found, load defaults into input. * Don't set any other bits. * (FLAT_SHADE=1 completely changes behavior) */ @@ -720,7 +1016,7 @@ bcolor: static void si_init_gs_rings(struct si_context *sctx) { unsigned esgs_ring_size = 128 * 1024; - unsigned gsvs_ring_size = 64 * 1024 * 1024; + unsigned gsvs_ring_size = 60 * 1024 * 1024; assert(!sctx->gs_rings); sctx->gs_rings = CALLOC_STRUCT(si_pm4_state); @@ -732,6 +1028,12 @@ static void si_init_gs_rings(struct si_context *sctx) PIPE_USAGE_DEFAULT, gsvs_ring_size); if (sctx->b.chip_class >= CIK) { + if (sctx->b.chip_class >= VI) { + /* The maximum sizes are 63.999 MB on VI, because + * the register fields only have 18 bits. */ + assert(esgs_ring_size / 256 < (1 << 18)); + assert(gsvs_ring_size / 256 < (1 << 18)); + } si_pm4_set_reg(sctx->gs_rings, R_030900_VGT_ESGS_RING_SIZE, esgs_ring_size / 256); si_pm4_set_reg(sctx->gs_rings, R_030904_VGT_GSVS_RING_SIZE, @@ -745,15 +1047,42 @@ static void si_init_gs_rings(struct si_context *sctx) si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS, sctx->esgs_ring, 0, esgs_ring_size, - true, true, 4, 64); + true, true, 4, 64, 0); si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS, sctx->esgs_ring, 0, esgs_ring_size, - false, false, 0, 0); + false, false, 0, 0, 0); si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS, sctx->gsvs_ring, 0, gsvs_ring_size, - false, false, 0, 0); + false, false, 0, 0, 0); } +static void si_update_gs_rings(struct si_context *sctx) +{ + unsigned gs_vert_itemsize = sctx->gs_shader->info.num_outputs * 16; + unsigned gs_max_vert_out = sctx->gs_shader->gs_max_out_vertices; + unsigned gsvs_itemsize = gs_vert_itemsize * gs_max_vert_out; + uint64_t offset; + + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS, + sctx->gsvs_ring, gsvs_itemsize, + 64, true, true, 4, 16, 0); + + offset = gsvs_itemsize * 64; + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_1, + sctx->gsvs_ring, gsvs_itemsize, + 64, true, true, 4, 16, offset); + + offset = (gsvs_itemsize * 2) * 64; + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_2, + sctx->gsvs_ring, gsvs_itemsize, + 64, true, true, 4, 16, offset); + + offset = (gsvs_itemsize * 3) * 64; + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_3, + sctx->gsvs_ring, gsvs_itemsize, + 64, true, true, 4, 16, offset); + +} /** * @returns 1 if \p sel has been updated to use a new scratch buffer and 0 * otherwise. 
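Editor's note on the ring setup above: the VI-only assertion follows from
the register layout. VGT_GSVS_RING_SIZE stores the size in 256-byte units
in an 18-bit field, so the hard ceiling is (1 << 18) * 256 = 64 MiB
exclusive, which is why gsvs_ring_size drops from 64 to 60 MiB in this
patch. In si_update_gs_rings() the four per-stream bindings are then laid
out back to back, stream n starting at byte offset

    n * gsvs_itemsize * 64

where 64 matches the record count passed to si_set_ring_buffer(). This
restates the arithmetic in the code above; it adds no new behavior.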
@@ -763,7 +1092,6 @@ static unsigned si_update_scratch_buffer(struct si_context *sctx, { struct si_shader *shader; uint64_t scratch_va = sctx->scratch_buffer->gpu_address; - unsigned char *ptr; if (!sel) return 0; @@ -784,12 +1112,7 @@ static unsigned si_update_scratch_buffer(struct si_context *sctx, si_shader_apply_scratch_relocs(sctx, shader, scratch_va); /* Replace the shader bo with a new bo that has the relocs applied. */ - r600_resource_reference(&shader->bo, NULL); - shader->bo = si_resource_create_custom(&sctx->screen->b.b, PIPE_USAGE_IMMUTABLE, - shader->binary.code_size); - ptr = sctx->screen->b.ws->buffer_map(shader->bo->cs_buf, NULL, PIPE_TRANSFER_WRITE); - util_memcpy_cpu_to_le32(ptr, shader->binary.code, shader->binary.code_size); - sctx->screen->b.ws->buffer_unmap(shader->bo->cs_buf); + si_shader_binary_upload(sctx->screen, shader); /* Update the shader state to use the new shader bo. */ si_shader_init_pm4_state(shader); @@ -818,10 +1141,14 @@ static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_context *sctx, static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx) { - - return MAX3(si_get_scratch_buffer_bytes_per_wave(sctx, sctx->ps_shader), - si_get_scratch_buffer_bytes_per_wave(sctx, sctx->gs_shader), - si_get_scratch_buffer_bytes_per_wave(sctx, sctx->vs_shader)); + unsigned bytes = 0; + + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->ps_shader)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->gs_shader)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->vs_shader)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->tcs_shader)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->tes_shader)); + return bytes; } static void si_update_spi_tmpring_size(struct si_context *sctx) @@ -855,15 +1182,29 @@ static void si_update_spi_tmpring_size(struct si_context *sctx) si_pm4_bind_state(sctx, ps, sctx->ps_shader->current->pm4); if (si_update_scratch_buffer(sctx, sctx->gs_shader)) si_pm4_bind_state(sctx, gs, sctx->gs_shader->current->pm4); + if (si_update_scratch_buffer(sctx, sctx->tcs_shader)) + si_pm4_bind_state(sctx, hs, sctx->tcs_shader->current->pm4); - /* VS can be bound as ES or VS. */ - if (sctx->gs_shader) { + /* VS can be bound as LS, ES, or VS. */ + if (sctx->tes_shader) { + if (si_update_scratch_buffer(sctx, sctx->vs_shader)) + si_pm4_bind_state(sctx, ls, sctx->vs_shader->current->pm4); + } else if (sctx->gs_shader) { if (si_update_scratch_buffer(sctx, sctx->vs_shader)) si_pm4_bind_state(sctx, es, sctx->vs_shader->current->pm4); } else { if (si_update_scratch_buffer(sctx, sctx->vs_shader)) si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4); } + + /* TES can be bound as ES or VS. */ + if (sctx->gs_shader) { + if (si_update_scratch_buffer(sctx, sctx->tes_shader)) + si_pm4_bind_state(sctx, es, sctx->tes_shader->current->pm4); + } else { + if (si_update_scratch_buffer(sctx, sctx->tes_shader)) + si_pm4_bind_state(sctx, vs, sctx->tes_shader->current->pm4); + } } /* The LLVM shader backend should be reporting aligned scratch_sizes. 
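Editor's note: with tessellation enabled, si_get_max_scratch_bytes_per_wave()
above takes the maximum over all five stages, and the rebinding block
encodes the hardware stage mapping:

    tess?  gs?    VS slot runs   TES slot runs
    no     no     VS             -
    no     yes    ES             -
    yes    no     LS             VS
    yes    yes    LS             ES

The alignment requirement stated above exists because the WAVESIZE field
programmed just below counts 1 KiB granules, hence the
scratch_bytes_per_wave >> 10.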
*/ @@ -874,60 +1215,187 @@ static void si_update_spi_tmpring_size(struct si_context *sctx) S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10); } +static void si_init_tess_factor_ring(struct si_context *sctx) +{ + assert(!sctx->tf_state); + sctx->tf_state = CALLOC_STRUCT(si_pm4_state); + + sctx->tf_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM, + PIPE_USAGE_DEFAULT, + 32768 * sctx->screen->b.info.max_se); + sctx->b.clear_buffer(&sctx->b.b, sctx->tf_ring, 0, + sctx->tf_ring->width0, fui(0), false); + assert(((sctx->tf_ring->width0 / 4) & C_030938_SIZE) == 0); + + if (sctx->b.chip_class >= CIK) { + si_pm4_set_reg(sctx->tf_state, R_030938_VGT_TF_RING_SIZE, + S_030938_SIZE(sctx->tf_ring->width0 / 4)); + si_pm4_set_reg(sctx->tf_state, R_030940_VGT_TF_MEMORY_BASE, + r600_resource(sctx->tf_ring)->gpu_address >> 8); + } else { + si_pm4_set_reg(sctx->tf_state, R_008988_VGT_TF_RING_SIZE, + S_008988_SIZE(sctx->tf_ring->width0 / 4)); + si_pm4_set_reg(sctx->tf_state, R_0089B8_VGT_TF_MEMORY_BASE, + r600_resource(sctx->tf_ring)->gpu_address >> 8); + } + si_pm4_add_bo(sctx->tf_state, r600_resource(sctx->tf_ring), + RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RESOURCE_RW); + si_pm4_bind_state(sctx, tf_ring, sctx->tf_state); + + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_TESS_CTRL, + SI_RING_TESS_FACTOR, sctx->tf_ring, 0, + sctx->tf_ring->width0, false, false, 0, 0, 0); + + sctx->b.flags |= SI_CONTEXT_VGT_FLUSH; +} + +/** + * This is used when TCS is NULL in the VS->TCS->TES chain. In this case, + * VS passes its outputs to TES directly, so the fixed-function shader only + * has to write TESSOUTER and TESSINNER. + */ +static void si_generate_fixed_func_tcs(struct si_context *sctx) +{ + struct ureg_src const0, const1; + struct ureg_dst tessouter, tessinner; + struct ureg_program *ureg = ureg_create(TGSI_PROCESSOR_TESS_CTRL); + + if (!ureg) + return; /* if we get here, we're screwed */ + + assert(!sctx->fixed_func_tcs_shader); + + ureg_DECL_constant2D(ureg, 0, 1, SI_DRIVER_STATE_CONST_BUF); + const0 = ureg_src_dimension(ureg_src_register(TGSI_FILE_CONSTANT, 0), + SI_DRIVER_STATE_CONST_BUF); + const1 = ureg_src_dimension(ureg_src_register(TGSI_FILE_CONSTANT, 1), + SI_DRIVER_STATE_CONST_BUF); + + tessouter = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSOUTER, 0); + tessinner = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSINNER, 0); + + ureg_MOV(ureg, tessouter, const0); + ureg_MOV(ureg, tessinner, const1); + ureg_END(ureg); + + sctx->fixed_func_tcs_shader = + ureg_create_shader_and_destroy(ureg, &sctx->b.b); + assert(sctx->fixed_func_tcs_shader); +} + +static void si_update_vgt_shader_config(struct si_context *sctx) +{ + /* Calculate the index of the config. 
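(Editor's note: the index packs the two enable bits as
index = 2 * has_tess + has_gs, and each of the four resulting configs is
built once and cached in sctx->vgt_shader_config.) The encoding is: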
+ * 0 = VS, 1 = VS+GS, 2 = VS+Tess, 3 = VS+Tess+GS */ + unsigned index = 2*!!sctx->tes_shader + !!sctx->gs_shader; + struct si_pm4_state **pm4 = &sctx->vgt_shader_config[index]; + + if (!*pm4) { + uint32_t stages = 0; + + *pm4 = CALLOC_STRUCT(si_pm4_state); + + if (sctx->tes_shader) { + stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) | + S_028B54_HS_EN(1); + + if (sctx->gs_shader) + stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) | + S_028B54_GS_EN(1) | + S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER); + else + stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS); + } else if (sctx->gs_shader) { + stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) | + S_028B54_GS_EN(1) | + S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER); + } + + si_pm4_set_reg(*pm4, R_028B54_VGT_SHADER_STAGES_EN, stages); + } + si_pm4_bind_state(sctx, vgt_shader_config, *pm4); +} + +static void si_update_so(struct si_context *sctx, struct si_shader_selector *shader) +{ + struct pipe_stream_output_info *so = &shader->so; + uint32_t enabled_stream_buffers_mask = 0; + int i; + + for (i = 0; i < so->num_outputs; i++) + enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << (so->output[i].stream * 4); + sctx->b.streamout.enabled_stream_buffers_mask = enabled_stream_buffers_mask; + sctx->b.streamout.stride_in_dw = shader->so.stride; +} + void si_update_shaders(struct si_context *sctx) { struct pipe_context *ctx = (struct pipe_context*)sctx; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - if (sctx->gs_shader) { - si_shader_select(ctx, sctx->gs_shader); - si_pm4_bind_state(sctx, gs, sctx->gs_shader->current->pm4); - si_pm4_bind_state(sctx, vs, sctx->gs_shader->current->gs_copy_shader->pm4); + /* Update stages before GS. */ + if (sctx->tes_shader) { + if (!sctx->tf_state) + si_init_tess_factor_ring(sctx); - sctx->b.streamout.stride_in_dw = sctx->gs_shader->so.stride; + /* VS as LS */ + si_shader_select(ctx, sctx->vs_shader); + si_pm4_bind_state(sctx, ls, sctx->vs_shader->current->pm4); + + if (sctx->tcs_shader) { + si_shader_select(ctx, sctx->tcs_shader); + si_pm4_bind_state(sctx, hs, sctx->tcs_shader->current->pm4); + } else { + if (!sctx->fixed_func_tcs_shader) + si_generate_fixed_func_tcs(sctx); + si_shader_select(ctx, sctx->fixed_func_tcs_shader); + si_pm4_bind_state(sctx, hs, + sctx->fixed_func_tcs_shader->current->pm4); + } + si_shader_select(ctx, sctx->tes_shader); + if (sctx->gs_shader) { + /* TES as ES */ + si_pm4_bind_state(sctx, es, sctx->tes_shader->current->pm4); + } else { + /* TES as VS */ + si_pm4_bind_state(sctx, vs, sctx->tes_shader->current->pm4); + si_update_so(sctx, sctx->tes_shader); + } + } else if (sctx->gs_shader) { + /* VS as ES */ si_shader_select(ctx, sctx->vs_shader); si_pm4_bind_state(sctx, es, sctx->vs_shader->current->pm4); + } else { + /* VS as VS */ + si_shader_select(ctx, sctx->vs_shader); + si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4); + si_update_so(sctx, sctx->vs_shader); + } + + /* Update GS. 
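Editor's note: when a GS is bound, the hardware VS stage runs the GS copy
shader (bound into the vs slot below), and streamout state comes from the
GS selector via si_update_so(). That helper packs one enable bit per
target buffer into a nibble per vertex stream:

    mask |= (1 << output_buffer) << (stream * 4);

so, for example, an output routed to buffer 2 on stream 1 sets bit 6
(0x40). The example values here are illustrative, not from the patch.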
*/ + if (sctx->gs_shader) { + si_shader_select(ctx, sctx->gs_shader); + si_pm4_bind_state(sctx, gs, sctx->gs_shader->current->pm4); + si_pm4_bind_state(sctx, vs, sctx->gs_shader->current->gs_copy_shader->pm4); + si_update_so(sctx, sctx->gs_shader); if (!sctx->gs_rings) si_init_gs_rings(sctx); + if (sctx->emitted.named.gs_rings != sctx->gs_rings) sctx->b.flags |= SI_CONTEXT_VGT_FLUSH; si_pm4_bind_state(sctx, gs_rings, sctx->gs_rings); - si_set_ring_buffer(ctx, PIPE_SHADER_GEOMETRY, SI_RING_GSVS, - sctx->gsvs_ring, - sctx->gs_shader->gs_max_out_vertices * - sctx->gs_shader->info.num_outputs * 16, - 64, true, true, 4, 16); - - if (!sctx->gs_on) { - sctx->gs_on = CALLOC_STRUCT(si_pm4_state); - - si_pm4_set_reg(sctx->gs_on, R_028B54_VGT_SHADER_STAGES_EN, - S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) | - S_028B54_GS_EN(1) | - S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER)); - } - si_pm4_bind_state(sctx, gs_onoff, sctx->gs_on); + si_update_gs_rings(sctx); } else { - si_shader_select(ctx, sctx->vs_shader); - si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4); - - sctx->b.streamout.stride_in_dw = sctx->vs_shader->so.stride; - - if (!sctx->gs_off) { - sctx->gs_off = CALLOC_STRUCT(si_pm4_state); - - si_pm4_set_reg(sctx->gs_off, R_028A40_VGT_GS_MODE, 0); - si_pm4_set_reg(sctx->gs_off, R_028B54_VGT_SHADER_STAGES_EN, 0); - } - si_pm4_bind_state(sctx, gs_onoff, sctx->gs_off); si_pm4_bind_state(sctx, gs_rings, NULL); si_pm4_bind_state(sctx, gs, NULL); si_pm4_bind_state(sctx, es, NULL); } + si_update_vgt_shader_config(sctx); + si_shader_select(ctx, sctx->ps_shader); if (!sctx->ps_shader->current) { @@ -957,29 +1425,35 @@ void si_update_shaders(struct si_context *sctx) if (sctx->ps_db_shader_control != sctx->ps_shader->current->db_shader_control) { sctx->ps_db_shader_control = sctx->ps_shader->current->db_shader_control; - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); } if (sctx->smoothing_enabled != sctx->ps_shader->current->key.ps.poly_line_smoothing) { sctx->smoothing_enabled = sctx->ps_shader->current->key.ps.poly_line_smoothing; - sctx->msaa_config.dirty = true; + si_mark_atom_dirty(sctx, &sctx->msaa_config); if (sctx->b.chip_class == SI) - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); } } void si_init_shader_functions(struct si_context *sctx) { sctx->b.b.create_vs_state = si_create_vs_state; + sctx->b.b.create_tcs_state = si_create_tcs_state; + sctx->b.b.create_tes_state = si_create_tes_state; sctx->b.b.create_gs_state = si_create_gs_state; sctx->b.b.create_fs_state = si_create_fs_state; sctx->b.b.bind_vs_state = si_bind_vs_shader; + sctx->b.b.bind_tcs_state = si_bind_tcs_shader; + sctx->b.b.bind_tes_state = si_bind_tes_shader; sctx->b.b.bind_gs_state = si_bind_gs_shader; sctx->b.b.bind_fs_state = si_bind_ps_shader; sctx->b.b.delete_vs_state = si_delete_vs_shader; + sctx->b.b.delete_tcs_state = si_delete_tcs_shader; + sctx->b.b.delete_tes_state = si_delete_tes_shader; sctx->b.b.delete_gs_state = si_delete_gs_shader; sctx->b.b.delete_fs_state = si_delete_ps_shader; } diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h index 35d5ee232a0..66fdf35c8af 100644 --- a/src/gallium/drivers/radeonsi/sid.h +++ b/src/gallium/drivers/radeonsi/sid.h @@ -206,6 +206,398 @@ * 6. 
COMMAND [29:22] | BYTE_COUNT [20:0] */ + +#define R_000E4C_SRBM_STATUS2 0x000E4C +#define S_000E4C_SDMA_RQ_PENDING(x) (((x) & 0x1) << 0) +#define G_000E4C_SDMA_RQ_PENDING(x) (((x) >> 0) & 0x1) +#define C_000E4C_SDMA_RQ_PENDING 0xFFFFFFFE +#define S_000E4C_TST_RQ_PENDING(x) (((x) & 0x1) << 1) +#define G_000E4C_TST_RQ_PENDING(x) (((x) >> 1) & 0x1) +#define C_000E4C_TST_RQ_PENDING 0xFFFFFFFD +#define S_000E4C_SDMA1_RQ_PENDING(x) (((x) & 0x1) << 2) +#define G_000E4C_SDMA1_RQ_PENDING(x) (((x) >> 2) & 0x1) +#define C_000E4C_SDMA1_RQ_PENDING 0xFFFFFFFB +#define S_000E4C_VCE0_RQ_PENDING(x) (((x) & 0x1) << 3) +#define G_000E4C_VCE0_RQ_PENDING(x) (((x) >> 3) & 0x1) +#define C_000E4C_VCE0_RQ_PENDING 0xFFFFFFF7 +#define S_000E4C_VP8_BUSY(x) (((x) & 0x1) << 4) +#define G_000E4C_VP8_BUSY(x) (((x) >> 4) & 0x1) +#define C_000E4C_VP8_BUSY 0xFFFFFFEF +#define S_000E4C_SDMA_BUSY(x) (((x) & 0x1) << 5) +#define G_000E4C_SDMA_BUSY(x) (((x) >> 5) & 0x1) +#define C_000E4C_SDMA_BUSY 0xFFFFFFDF +#define S_000E4C_SDMA1_BUSY(x) (((x) & 0x1) << 6) +#define G_000E4C_SDMA1_BUSY(x) (((x) >> 6) & 0x1) +#define C_000E4C_SDMA1_BUSY 0xFFFFFFBF +#define S_000E4C_VCE0_BUSY(x) (((x) & 0x1) << 7) +#define G_000E4C_VCE0_BUSY(x) (((x) >> 7) & 0x1) +#define C_000E4C_VCE0_BUSY 0xFFFFFF7F +#define S_000E4C_XDMA_BUSY(x) (((x) & 0x1) << 8) +#define G_000E4C_XDMA_BUSY(x) (((x) >> 8) & 0x1) +#define C_000E4C_XDMA_BUSY 0xFFFFFEFF +#define S_000E4C_CHUB_BUSY(x) (((x) & 0x1) << 9) +#define G_000E4C_CHUB_BUSY(x) (((x) >> 9) & 0x1) +#define C_000E4C_CHUB_BUSY 0xFFFFFDFF +#define S_000E4C_SDMA2_BUSY(x) (((x) & 0x1) << 10) +#define G_000E4C_SDMA2_BUSY(x) (((x) >> 10) & 0x1) +#define C_000E4C_SDMA2_BUSY 0xFFFFFBFF +#define S_000E4C_SDMA3_BUSY(x) (((x) & 0x1) << 11) +#define G_000E4C_SDMA3_BUSY(x) (((x) >> 11) & 0x1) +#define C_000E4C_SDMA3_BUSY 0xFFFFF7FF +#define S_000E4C_SAMSCP_BUSY(x) (((x) & 0x1) << 12) +#define G_000E4C_SAMSCP_BUSY(x) (((x) >> 12) & 0x1) +#define C_000E4C_SAMSCP_BUSY 0xFFFFEFFF +#define S_000E4C_ISP_BUSY(x) (((x) & 0x1) << 13) +#define G_000E4C_ISP_BUSY(x) (((x) >> 13) & 0x1) +#define C_000E4C_ISP_BUSY 0xFFFFDFFF +#define S_000E4C_VCE1_BUSY(x) (((x) & 0x1) << 14) +#define G_000E4C_VCE1_BUSY(x) (((x) >> 14) & 0x1) +#define C_000E4C_VCE1_BUSY 0xFFFFBFFF +#define S_000E4C_ODE_BUSY(x) (((x) & 0x1) << 15) +#define G_000E4C_ODE_BUSY(x) (((x) >> 15) & 0x1) +#define C_000E4C_ODE_BUSY 0xFFFF7FFF +#define S_000E4C_SDMA2_RQ_PENDING(x) (((x) & 0x1) << 16) +#define G_000E4C_SDMA2_RQ_PENDING(x) (((x) >> 16) & 0x1) +#define C_000E4C_SDMA2_RQ_PENDING 0xFFFEFFFF +#define S_000E4C_SDMA3_RQ_PENDING(x) (((x) & 0x1) << 17) +#define G_000E4C_SDMA3_RQ_PENDING(x) (((x) >> 17) & 0x1) +#define C_000E4C_SDMA3_RQ_PENDING 0xFFFDFFFF +#define S_000E4C_SAMSCP_RQ_PENDING(x) (((x) & 0x1) << 18) +#define G_000E4C_SAMSCP_RQ_PENDING(x) (((x) >> 18) & 0x1) +#define C_000E4C_SAMSCP_RQ_PENDING 0xFFFBFFFF +#define S_000E4C_ISP_RQ_PENDING(x) (((x) & 0x1) << 19) +#define G_000E4C_ISP_RQ_PENDING(x) (((x) >> 19) & 0x1) +#define C_000E4C_ISP_RQ_PENDING 0xFFF7FFFF +#define S_000E4C_VCE1_RQ_PENDING(x) (((x) & 0x1) << 20) +#define G_000E4C_VCE1_RQ_PENDING(x) (((x) >> 20) & 0x1) +#define C_000E4C_VCE1_RQ_PENDING 0xFFEFFFFF +#define R_000E50_SRBM_STATUS 0x000E50 +#define S_000E50_UVD_RQ_PENDING(x) (((x) & 0x1) << 1) +#define G_000E50_UVD_RQ_PENDING(x) (((x) >> 1) & 0x1) +#define C_000E50_UVD_RQ_PENDING 0xFFFFFFFD +#define S_000E50_SAMMSP_RQ_PENDING(x) (((x) & 0x1) << 2) +#define G_000E50_SAMMSP_RQ_PENDING(x) (((x) >> 2) & 0x1) +#define C_000E50_SAMMSP_RQ_PENDING 0xFFFFFFFB 
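/* Editor's note: the generated definitions below follow one convention
 * per field: S_<reg>_<FIELD>(x) shifts a value into position,
 * G_<reg>_<FIELD>(x) extracts it, and C_<reg>_<FIELD> is the complement
 * mask for clearing the field in a read-modify-write. For status
 * registers like SRBM_STATUS the getters are the useful half; a sketch,
 * with read_reg() as a hypothetical MMIO accessor:
 *
 *   uint32_t v = read_reg(R_000E50_SRBM_STATUS);
 *   if (G_000E50_GRBM_RQ_PENDING(v) || G_000E50_UVD_BUSY(v))
 *       ;   // not idle yet, keep polling
 *
 * The three constants per field are consistent by construction, e.g.
 * S_000E50_GRBM_RQ_PENDING(1) == 1 << 5 == 0x20 and
 * C_000E50_GRBM_RQ_PENDING == ~0x20 == 0xFFFFFFDF. */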
+#define S_000E50_ACP_RQ_PENDING(x) (((x) & 0x1) << 3) +#define G_000E50_ACP_RQ_PENDING(x) (((x) >> 3) & 0x1) +#define C_000E50_ACP_RQ_PENDING 0xFFFFFFF7 +#define S_000E50_SMU_RQ_PENDING(x) (((x) & 0x1) << 4) +#define G_000E50_SMU_RQ_PENDING(x) (((x) >> 4) & 0x1) +#define C_000E50_SMU_RQ_PENDING 0xFFFFFFEF +#define S_000E50_GRBM_RQ_PENDING(x) (((x) & 0x1) << 5) +#define G_000E50_GRBM_RQ_PENDING(x) (((x) >> 5) & 0x1) +#define C_000E50_GRBM_RQ_PENDING 0xFFFFFFDF +#define S_000E50_HI_RQ_PENDING(x) (((x) & 0x1) << 6) +#define G_000E50_HI_RQ_PENDING(x) (((x) >> 6) & 0x1) +#define C_000E50_HI_RQ_PENDING 0xFFFFFFBF +#define S_000E50_VMC_BUSY(x) (((x) & 0x1) << 8) +#define G_000E50_VMC_BUSY(x) (((x) >> 8) & 0x1) +#define C_000E50_VMC_BUSY 0xFFFFFEFF +#define S_000E50_MCB_BUSY(x) (((x) & 0x1) << 9) +#define G_000E50_MCB_BUSY(x) (((x) >> 9) & 0x1) +#define C_000E50_MCB_BUSY 0xFFFFFDFF +#define S_000E50_MCB_NON_DISPLAY_BUSY(x) (((x) & 0x1) << 10) +#define G_000E50_MCB_NON_DISPLAY_BUSY(x) (((x) >> 10) & 0x1) +#define C_000E50_MCB_NON_DISPLAY_BUSY 0xFFFFFBFF +#define S_000E50_MCC_BUSY(x) (((x) & 0x1) << 11) +#define G_000E50_MCC_BUSY(x) (((x) >> 11) & 0x1) +#define C_000E50_MCC_BUSY 0xFFFFF7FF +#define S_000E50_MCD_BUSY(x) (((x) & 0x1) << 12) +#define G_000E50_MCD_BUSY(x) (((x) >> 12) & 0x1) +#define C_000E50_MCD_BUSY 0xFFFFEFFF +#define S_000E50_VMC1_BUSY(x) (((x) & 0x1) << 13) +#define G_000E50_VMC1_BUSY(x) (((x) >> 13) & 0x1) +#define C_000E50_VMC1_BUSY 0xFFFFDFFF +#define S_000E50_SEM_BUSY(x) (((x) & 0x1) << 14) +#define G_000E50_SEM_BUSY(x) (((x) >> 14) & 0x1) +#define C_000E50_SEM_BUSY 0xFFFFBFFF +#define S_000E50_ACP_BUSY(x) (((x) & 0x1) << 16) +#define G_000E50_ACP_BUSY(x) (((x) >> 16) & 0x1) +#define C_000E50_ACP_BUSY 0xFFFEFFFF +#define S_000E50_IH_BUSY(x) (((x) & 0x1) << 17) +#define G_000E50_IH_BUSY(x) (((x) >> 17) & 0x1) +#define C_000E50_IH_BUSY 0xFFFDFFFF +#define S_000E50_UVD_BUSY(x) (((x) & 0x1) << 19) +#define G_000E50_UVD_BUSY(x) (((x) >> 19) & 0x1) +#define C_000E50_UVD_BUSY 0xFFF7FFFF +#define S_000E50_SAMMSP_BUSY(x) (((x) & 0x1) << 20) +#define G_000E50_SAMMSP_BUSY(x) (((x) >> 20) & 0x1) +#define C_000E50_SAMMSP_BUSY 0xFFEFFFFF +#define S_000E50_GCATCL2_BUSY(x) (((x) & 0x1) << 21) +#define G_000E50_GCATCL2_BUSY(x) (((x) >> 21) & 0x1) +#define C_000E50_GCATCL2_BUSY 0xFFDFFFFF +#define S_000E50_OSATCL2_BUSY(x) (((x) & 0x1) << 22) +#define G_000E50_OSATCL2_BUSY(x) (((x) >> 22) & 0x1) +#define C_000E50_OSATCL2_BUSY 0xFFBFFFFF +#define S_000E50_BIF_BUSY(x) (((x) & 0x1) << 29) +#define G_000E50_BIF_BUSY(x) (((x) >> 29) & 0x1) +#define C_000E50_BIF_BUSY 0xDFFFFFFF +#define R_000E54_SRBM_STATUS3 0x000E54 +#define S_000E54_MCC0_BUSY(x) (((x) & 0x1) << 0) +#define G_000E54_MCC0_BUSY(x) (((x) >> 0) & 0x1) +#define C_000E54_MCC0_BUSY 0xFFFFFFFE +#define S_000E54_MCC1_BUSY(x) (((x) & 0x1) << 1) +#define G_000E54_MCC1_BUSY(x) (((x) >> 1) & 0x1) +#define C_000E54_MCC1_BUSY 0xFFFFFFFD +#define S_000E54_MCC2_BUSY(x) (((x) & 0x1) << 2) +#define G_000E54_MCC2_BUSY(x) (((x) >> 2) & 0x1) +#define C_000E54_MCC2_BUSY 0xFFFFFFFB +#define S_000E54_MCC3_BUSY(x) (((x) & 0x1) << 3) +#define G_000E54_MCC3_BUSY(x) (((x) >> 3) & 0x1) +#define C_000E54_MCC3_BUSY 0xFFFFFFF7 +#define S_000E54_MCC4_BUSY(x) (((x) & 0x1) << 4) +#define G_000E54_MCC4_BUSY(x) (((x) >> 4) & 0x1) +#define C_000E54_MCC4_BUSY 0xFFFFFFEF +#define S_000E54_MCC5_BUSY(x) (((x) & 0x1) << 5) +#define G_000E54_MCC5_BUSY(x) (((x) >> 5) & 0x1) +#define C_000E54_MCC5_BUSY 0xFFFFFFDF +#define S_000E54_MCC6_BUSY(x) (((x) & 0x1) << 6) +#define 
G_000E54_MCC6_BUSY(x) (((x) >> 6) & 0x1) +#define C_000E54_MCC6_BUSY 0xFFFFFFBF +#define S_000E54_MCC7_BUSY(x) (((x) & 0x1) << 7) +#define G_000E54_MCC7_BUSY(x) (((x) >> 7) & 0x1) +#define C_000E54_MCC7_BUSY 0xFFFFFF7F +#define S_000E54_MCD0_BUSY(x) (((x) & 0x1) << 8) +#define G_000E54_MCD0_BUSY(x) (((x) >> 8) & 0x1) +#define C_000E54_MCD0_BUSY 0xFFFFFEFF +#define S_000E54_MCD1_BUSY(x) (((x) & 0x1) << 9) +#define G_000E54_MCD1_BUSY(x) (((x) >> 9) & 0x1) +#define C_000E54_MCD1_BUSY 0xFFFFFDFF +#define S_000E54_MCD2_BUSY(x) (((x) & 0x1) << 10) +#define G_000E54_MCD2_BUSY(x) (((x) >> 10) & 0x1) +#define C_000E54_MCD2_BUSY 0xFFFFFBFF +#define S_000E54_MCD3_BUSY(x) (((x) & 0x1) << 11) +#define G_000E54_MCD3_BUSY(x) (((x) >> 11) & 0x1) +#define C_000E54_MCD3_BUSY 0xFFFFF7FF +#define S_000E54_MCD4_BUSY(x) (((x) & 0x1) << 12) +#define G_000E54_MCD4_BUSY(x) (((x) >> 12) & 0x1) +#define C_000E54_MCD4_BUSY 0xFFFFEFFF +#define S_000E54_MCD5_BUSY(x) (((x) & 0x1) << 13) +#define G_000E54_MCD5_BUSY(x) (((x) >> 13) & 0x1) +#define C_000E54_MCD5_BUSY 0xFFFFDFFF +#define S_000E54_MCD6_BUSY(x) (((x) & 0x1) << 14) +#define G_000E54_MCD6_BUSY(x) (((x) >> 14) & 0x1) +#define C_000E54_MCD6_BUSY 0xFFFFBFFF +#define S_000E54_MCD7_BUSY(x) (((x) & 0x1) << 15) +#define G_000E54_MCD7_BUSY(x) (((x) >> 15) & 0x1) +#define C_000E54_MCD7_BUSY 0xFFFF7FFF +#define R_00D034_SDMA0_STATUS_REG 0x00D034 +#define S_00D034_IDLE(x) (((x) & 0x1) << 0) +#define G_00D034_IDLE(x) (((x) >> 0) & 0x1) +#define C_00D034_IDLE 0xFFFFFFFE +#define S_00D034_REG_IDLE(x) (((x) & 0x1) << 1) +#define G_00D034_REG_IDLE(x) (((x) >> 1) & 0x1) +#define C_00D034_REG_IDLE 0xFFFFFFFD +#define S_00D034_RB_EMPTY(x) (((x) & 0x1) << 2) +#define G_00D034_RB_EMPTY(x) (((x) >> 2) & 0x1) +#define C_00D034_RB_EMPTY 0xFFFFFFFB +#define S_00D034_RB_FULL(x) (((x) & 0x1) << 3) +#define G_00D034_RB_FULL(x) (((x) >> 3) & 0x1) +#define C_00D034_RB_FULL 0xFFFFFFF7 +#define S_00D034_RB_CMD_IDLE(x) (((x) & 0x1) << 4) +#define G_00D034_RB_CMD_IDLE(x) (((x) >> 4) & 0x1) +#define C_00D034_RB_CMD_IDLE 0xFFFFFFEF +#define S_00D034_RB_CMD_FULL(x) (((x) & 0x1) << 5) +#define G_00D034_RB_CMD_FULL(x) (((x) >> 5) & 0x1) +#define C_00D034_RB_CMD_FULL 0xFFFFFFDF +#define S_00D034_IB_CMD_IDLE(x) (((x) & 0x1) << 6) +#define G_00D034_IB_CMD_IDLE(x) (((x) >> 6) & 0x1) +#define C_00D034_IB_CMD_IDLE 0xFFFFFFBF +#define S_00D034_IB_CMD_FULL(x) (((x) & 0x1) << 7) +#define G_00D034_IB_CMD_FULL(x) (((x) >> 7) & 0x1) +#define C_00D034_IB_CMD_FULL 0xFFFFFF7F +#define S_00D034_BLOCK_IDLE(x) (((x) & 0x1) << 8) +#define G_00D034_BLOCK_IDLE(x) (((x) >> 8) & 0x1) +#define C_00D034_BLOCK_IDLE 0xFFFFFEFF +#define S_00D034_INSIDE_IB(x) (((x) & 0x1) << 9) +#define G_00D034_INSIDE_IB(x) (((x) >> 9) & 0x1) +#define C_00D034_INSIDE_IB 0xFFFFFDFF +#define S_00D034_EX_IDLE(x) (((x) & 0x1) << 10) +#define G_00D034_EX_IDLE(x) (((x) >> 10) & 0x1) +#define C_00D034_EX_IDLE 0xFFFFFBFF +#define S_00D034_EX_IDLE_POLL_TIMER_EXPIRE(x) (((x) & 0x1) << 11) +#define G_00D034_EX_IDLE_POLL_TIMER_EXPIRE(x) (((x) >> 11) & 0x1) +#define C_00D034_EX_IDLE_POLL_TIMER_EXPIRE 0xFFFFF7FF +#define S_00D034_PACKET_READY(x) (((x) & 0x1) << 12) +#define G_00D034_PACKET_READY(x) (((x) >> 12) & 0x1) +#define C_00D034_PACKET_READY 0xFFFFEFFF +#define S_00D034_MC_WR_IDLE(x) (((x) & 0x1) << 13) +#define G_00D034_MC_WR_IDLE(x) (((x) >> 13) & 0x1) +#define C_00D034_MC_WR_IDLE 0xFFFFDFFF +#define S_00D034_SRBM_IDLE(x) (((x) & 0x1) << 14) +#define G_00D034_SRBM_IDLE(x) (((x) >> 14) & 0x1) +#define C_00D034_SRBM_IDLE 0xFFFFBFFF +#define 
S_00D034_CONTEXT_EMPTY(x) (((x) & 0x1) << 15) +#define G_00D034_CONTEXT_EMPTY(x) (((x) >> 15) & 0x1) +#define C_00D034_CONTEXT_EMPTY 0xFFFF7FFF +#define S_00D034_DELTA_RPTR_FULL(x) (((x) & 0x1) << 16) +#define G_00D034_DELTA_RPTR_FULL(x) (((x) >> 16) & 0x1) +#define C_00D034_DELTA_RPTR_FULL 0xFFFEFFFF +#define S_00D034_RB_MC_RREQ_IDLE(x) (((x) & 0x1) << 17) +#define G_00D034_RB_MC_RREQ_IDLE(x) (((x) >> 17) & 0x1) +#define C_00D034_RB_MC_RREQ_IDLE 0xFFFDFFFF +#define S_00D034_IB_MC_RREQ_IDLE(x) (((x) & 0x1) << 18) +#define G_00D034_IB_MC_RREQ_IDLE(x) (((x) >> 18) & 0x1) +#define C_00D034_IB_MC_RREQ_IDLE 0xFFFBFFFF +#define S_00D034_MC_RD_IDLE(x) (((x) & 0x1) << 19) +#define G_00D034_MC_RD_IDLE(x) (((x) >> 19) & 0x1) +#define C_00D034_MC_RD_IDLE 0xFFF7FFFF +#define S_00D034_DELTA_RPTR_EMPTY(x) (((x) & 0x1) << 20) +#define G_00D034_DELTA_RPTR_EMPTY(x) (((x) >> 20) & 0x1) +#define C_00D034_DELTA_RPTR_EMPTY 0xFFEFFFFF +#define S_00D034_MC_RD_RET_STALL(x) (((x) & 0x1) << 21) +#define G_00D034_MC_RD_RET_STALL(x) (((x) >> 21) & 0x1) +#define C_00D034_MC_RD_RET_STALL 0xFFDFFFFF +#define S_00D034_MC_RD_NO_POLL_IDLE(x) (((x) & 0x1) << 22) +#define G_00D034_MC_RD_NO_POLL_IDLE(x) (((x) >> 22) & 0x1) +#define C_00D034_MC_RD_NO_POLL_IDLE 0xFFBFFFFF +#define S_00D034_PREV_CMD_IDLE(x) (((x) & 0x1) << 25) +#define G_00D034_PREV_CMD_IDLE(x) (((x) >> 25) & 0x1) +#define C_00D034_PREV_CMD_IDLE 0xFDFFFFFF +#define S_00D034_SEM_IDLE(x) (((x) & 0x1) << 26) +#define G_00D034_SEM_IDLE(x) (((x) >> 26) & 0x1) +#define C_00D034_SEM_IDLE 0xFBFFFFFF +#define S_00D034_SEM_REQ_STALL(x) (((x) & 0x1) << 27) +#define G_00D034_SEM_REQ_STALL(x) (((x) >> 27) & 0x1) +#define C_00D034_SEM_REQ_STALL 0xF7FFFFFF +#define S_00D034_SEM_RESP_STATE(x) (((x) & 0x03) << 28) +#define G_00D034_SEM_RESP_STATE(x) (((x) >> 28) & 0x03) +#define C_00D034_SEM_RESP_STATE 0xCFFFFFFF +#define S_00D034_INT_IDLE(x) (((x) & 0x1) << 30) +#define G_00D034_INT_IDLE(x) (((x) >> 30) & 0x1) +#define C_00D034_INT_IDLE 0xBFFFFFFF +#define S_00D034_INT_REQ_STALL(x) (((x) & 0x1) << 31) +#define G_00D034_INT_REQ_STALL(x) (((x) >> 31) & 0x1) +#define C_00D034_INT_REQ_STALL 0x7FFFFFFF +#define R_00D834_SDMA1_STATUS_REG 0x00D834 +#define R_008008_GRBM_STATUS2 0x008008 +#define S_008008_ME0PIPE1_CMDFIFO_AVAIL(x) (((x) & 0x0F) << 0) +#define G_008008_ME0PIPE1_CMDFIFO_AVAIL(x) (((x) >> 0) & 0x0F) +#define C_008008_ME0PIPE1_CMDFIFO_AVAIL 0xFFFFFFF0 +#define S_008008_ME0PIPE1_CF_RQ_PENDING(x) (((x) & 0x1) << 4) +#define G_008008_ME0PIPE1_CF_RQ_PENDING(x) (((x) >> 4) & 0x1) +#define C_008008_ME0PIPE1_CF_RQ_PENDING 0xFFFFFFEF +#define S_008008_ME0PIPE1_PF_RQ_PENDING(x) (((x) & 0x1) << 5) +#define G_008008_ME0PIPE1_PF_RQ_PENDING(x) (((x) >> 5) & 0x1) +#define C_008008_ME0PIPE1_PF_RQ_PENDING 0xFFFFFFDF +#define S_008008_ME1PIPE0_RQ_PENDING(x) (((x) & 0x1) << 6) +#define G_008008_ME1PIPE0_RQ_PENDING(x) (((x) >> 6) & 0x1) +#define C_008008_ME1PIPE0_RQ_PENDING 0xFFFFFFBF +#define S_008008_ME1PIPE1_RQ_PENDING(x) (((x) & 0x1) << 7) +#define G_008008_ME1PIPE1_RQ_PENDING(x) (((x) >> 7) & 0x1) +#define C_008008_ME1PIPE1_RQ_PENDING 0xFFFFFF7F +#define S_008008_ME1PIPE2_RQ_PENDING(x) (((x) & 0x1) << 8) +#define G_008008_ME1PIPE2_RQ_PENDING(x) (((x) >> 8) & 0x1) +#define C_008008_ME1PIPE2_RQ_PENDING 0xFFFFFEFF +#define S_008008_ME1PIPE3_RQ_PENDING(x) (((x) & 0x1) << 9) +#define G_008008_ME1PIPE3_RQ_PENDING(x) (((x) >> 9) & 0x1) +#define C_008008_ME1PIPE3_RQ_PENDING 0xFFFFFDFF +#define S_008008_ME2PIPE0_RQ_PENDING(x) (((x) & 0x1) << 10) +#define G_008008_ME2PIPE0_RQ_PENDING(x) (((x) >> 
10) & 0x1) +#define C_008008_ME2PIPE0_RQ_PENDING 0xFFFFFBFF +#define S_008008_ME2PIPE1_RQ_PENDING(x) (((x) & 0x1) << 11) +#define G_008008_ME2PIPE1_RQ_PENDING(x) (((x) >> 11) & 0x1) +#define C_008008_ME2PIPE1_RQ_PENDING 0xFFFFF7FF +#define S_008008_ME2PIPE2_RQ_PENDING(x) (((x) & 0x1) << 12) +#define G_008008_ME2PIPE2_RQ_PENDING(x) (((x) >> 12) & 0x1) +#define C_008008_ME2PIPE2_RQ_PENDING 0xFFFFEFFF +#define S_008008_ME2PIPE3_RQ_PENDING(x) (((x) & 0x1) << 13) +#define G_008008_ME2PIPE3_RQ_PENDING(x) (((x) >> 13) & 0x1) +#define C_008008_ME2PIPE3_RQ_PENDING 0xFFFFDFFF +#define S_008008_RLC_RQ_PENDING(x) (((x) & 0x1) << 14) +#define G_008008_RLC_RQ_PENDING(x) (((x) >> 14) & 0x1) +#define C_008008_RLC_RQ_PENDING 0xFFFFBFFF +#define S_008008_RLC_BUSY(x) (((x) & 0x1) << 24) +#define G_008008_RLC_BUSY(x) (((x) >> 24) & 0x1) +#define C_008008_RLC_BUSY 0xFEFFFFFF +#define S_008008_TC_BUSY(x) (((x) & 0x1) << 25) +#define G_008008_TC_BUSY(x) (((x) >> 25) & 0x1) +#define C_008008_TC_BUSY 0xFDFFFFFF +#define S_008008_TCC_CC_RESIDENT(x) (((x) & 0x1) << 26) +#define G_008008_TCC_CC_RESIDENT(x) (((x) >> 26) & 0x1) +#define C_008008_TCC_CC_RESIDENT 0xFBFFFFFF +#define S_008008_CPF_BUSY(x) (((x) & 0x1) << 28) +#define G_008008_CPF_BUSY(x) (((x) >> 28) & 0x1) +#define C_008008_CPF_BUSY 0xEFFFFFFF +#define S_008008_CPC_BUSY(x) (((x) & 0x1) << 29) +#define G_008008_CPC_BUSY(x) (((x) >> 29) & 0x1) +#define C_008008_CPC_BUSY 0xDFFFFFFF +#define S_008008_CPG_BUSY(x) (((x) & 0x1) << 30) +#define G_008008_CPG_BUSY(x) (((x) >> 30) & 0x1) +#define C_008008_CPG_BUSY 0xBFFFFFFF +#define R_008010_GRBM_STATUS 0x008010 +#define S_008010_ME0PIPE0_CMDFIFO_AVAIL(x) (((x) & 0x0F) << 0) +#define G_008010_ME0PIPE0_CMDFIFO_AVAIL(x) (((x) >> 0) & 0x0F) +#define C_008010_ME0PIPE0_CMDFIFO_AVAIL 0xFFFFFFF0 +#define S_008010_SRBM_RQ_PENDING(x) (((x) & 0x1) << 5) +#define G_008010_SRBM_RQ_PENDING(x) (((x) >> 5) & 0x1) +#define C_008010_SRBM_RQ_PENDING 0xFFFFFFDF +#define S_008010_ME0PIPE0_CF_RQ_PENDING(x) (((x) & 0x1) << 7) +#define G_008010_ME0PIPE0_CF_RQ_PENDING(x) (((x) >> 7) & 0x1) +#define C_008010_ME0PIPE0_CF_RQ_PENDING 0xFFFFFF7F +#define S_008010_ME0PIPE0_PF_RQ_PENDING(x) (((x) & 0x1) << 8) +#define G_008010_ME0PIPE0_PF_RQ_PENDING(x) (((x) >> 8) & 0x1) +#define C_008010_ME0PIPE0_PF_RQ_PENDING 0xFFFFFEFF +#define S_008010_GDS_DMA_RQ_PENDING(x) (((x) & 0x1) << 9) +#define G_008010_GDS_DMA_RQ_PENDING(x) (((x) >> 9) & 0x1) +#define C_008010_GDS_DMA_RQ_PENDING 0xFFFFFDFF +#define S_008010_DB_CLEAN(x) (((x) & 0x1) << 12) +#define G_008010_DB_CLEAN(x) (((x) >> 12) & 0x1) +#define C_008010_DB_CLEAN 0xFFFFEFFF +#define S_008010_CB_CLEAN(x) (((x) & 0x1) << 13) +#define G_008010_CB_CLEAN(x) (((x) >> 13) & 0x1) +#define C_008010_CB_CLEAN 0xFFFFDFFF +#define S_008010_TA_BUSY(x) (((x) & 0x1) << 14) +#define G_008010_TA_BUSY(x) (((x) >> 14) & 0x1) +#define C_008010_TA_BUSY 0xFFFFBFFF +#define S_008010_GDS_BUSY(x) (((x) & 0x1) << 15) +#define G_008010_GDS_BUSY(x) (((x) >> 15) & 0x1) +#define C_008010_GDS_BUSY 0xFFFF7FFF +#define S_008010_WD_BUSY_NO_DMA(x) (((x) & 0x1) << 16) +#define G_008010_WD_BUSY_NO_DMA(x) (((x) >> 16) & 0x1) +#define C_008010_WD_BUSY_NO_DMA 0xFFFEFFFF +#define S_008010_VGT_BUSY(x) (((x) & 0x1) << 17) +#define G_008010_VGT_BUSY(x) (((x) >> 17) & 0x1) +#define C_008010_VGT_BUSY 0xFFFDFFFF +#define S_008010_IA_BUSY_NO_DMA(x) (((x) & 0x1) << 18) +#define G_008010_IA_BUSY_NO_DMA(x) (((x) >> 18) & 0x1) +#define C_008010_IA_BUSY_NO_DMA 0xFFFBFFFF +#define S_008010_IA_BUSY(x) (((x) & 0x1) << 19) +#define G_008010_IA_BUSY(x) 
(((x) >> 19) & 0x1) +#define C_008010_IA_BUSY 0xFFF7FFFF +#define S_008010_SX_BUSY(x) (((x) & 0x1) << 20) +#define G_008010_SX_BUSY(x) (((x) >> 20) & 0x1) +#define C_008010_SX_BUSY 0xFFEFFFFF +#define S_008010_WD_BUSY(x) (((x) & 0x1) << 21) +#define G_008010_WD_BUSY(x) (((x) >> 21) & 0x1) +#define C_008010_WD_BUSY 0xFFDFFFFF +#define S_008010_SPI_BUSY(x) (((x) & 0x1) << 22) +#define G_008010_SPI_BUSY(x) (((x) >> 22) & 0x1) +#define C_008010_SPI_BUSY 0xFFBFFFFF +#define S_008010_BCI_BUSY(x) (((x) & 0x1) << 23) +#define G_008010_BCI_BUSY(x) (((x) >> 23) & 0x1) +#define C_008010_BCI_BUSY 0xFF7FFFFF +#define S_008010_SC_BUSY(x) (((x) & 0x1) << 24) +#define G_008010_SC_BUSY(x) (((x) >> 24) & 0x1) +#define C_008010_SC_BUSY 0xFEFFFFFF +#define S_008010_PA_BUSY(x) (((x) & 0x1) << 25) +#define G_008010_PA_BUSY(x) (((x) >> 25) & 0x1) +#define C_008010_PA_BUSY 0xFDFFFFFF +#define S_008010_DB_BUSY(x) (((x) & 0x1) << 26) +#define G_008010_DB_BUSY(x) (((x) >> 26) & 0x1) +#define C_008010_DB_BUSY 0xFBFFFFFF +#define S_008010_CP_COHERENCY_BUSY(x) (((x) & 0x1) << 28) +#define G_008010_CP_COHERENCY_BUSY(x) (((x) >> 28) & 0x1) +#define C_008010_CP_COHERENCY_BUSY 0xEFFFFFFF +#define S_008010_CP_BUSY(x) (((x) & 0x1) << 29) +#define G_008010_CP_BUSY(x) (((x) >> 29) & 0x1) +#define C_008010_CP_BUSY 0xDFFFFFFF +#define S_008010_CB_BUSY(x) (((x) & 0x1) << 30) +#define G_008010_CB_BUSY(x) (((x) >> 30) & 0x1) +#define C_008010_CB_BUSY 0xBFFFFFFF +#define S_008010_GUI_ACTIVE(x) (((x) & 0x1) << 31) +#define G_008010_GUI_ACTIVE(x) (((x) >> 31) & 0x1) +#define C_008010_GUI_ACTIVE 0x7FFFFFFF #define GRBM_GFX_INDEX 0x802C #define INSTANCE_INDEX(x) ((x) << 0) #define SH_INDEX(x) ((x) << 8) @@ -276,12 +668,155 @@ #define C_0085F0_SH_ICACHE_ACTION_ENA 0xDFFFFFFF #define R_0085F4_CP_COHER_SIZE 0x0085F4 #define R_0085F8_CP_COHER_BASE 0x0085F8 - +#define R_008014_GRBM_STATUS_SE0 0x008014 +#define S_008014_DB_CLEAN(x) (((x) & 0x1) << 1) +#define G_008014_DB_CLEAN(x) (((x) >> 1) & 0x1) +#define C_008014_DB_CLEAN 0xFFFFFFFD +#define S_008014_CB_CLEAN(x) (((x) & 0x1) << 2) +#define G_008014_CB_CLEAN(x) (((x) >> 2) & 0x1) +#define C_008014_CB_CLEAN 0xFFFFFFFB +#define S_008014_BCI_BUSY(x) (((x) & 0x1) << 22) +#define G_008014_BCI_BUSY(x) (((x) >> 22) & 0x1) +#define C_008014_BCI_BUSY 0xFFBFFFFF +#define S_008014_VGT_BUSY(x) (((x) & 0x1) << 23) +#define G_008014_VGT_BUSY(x) (((x) >> 23) & 0x1) +#define C_008014_VGT_BUSY 0xFF7FFFFF +#define S_008014_PA_BUSY(x) (((x) & 0x1) << 24) +#define G_008014_PA_BUSY(x) (((x) >> 24) & 0x1) +#define C_008014_PA_BUSY 0xFEFFFFFF +#define S_008014_TA_BUSY(x) (((x) & 0x1) << 25) +#define G_008014_TA_BUSY(x) (((x) >> 25) & 0x1) +#define C_008014_TA_BUSY 0xFDFFFFFF +#define S_008014_SX_BUSY(x) (((x) & 0x1) << 26) +#define G_008014_SX_BUSY(x) (((x) >> 26) & 0x1) +#define C_008014_SX_BUSY 0xFBFFFFFF +#define S_008014_SPI_BUSY(x) (((x) & 0x1) << 27) +#define G_008014_SPI_BUSY(x) (((x) >> 27) & 0x1) +#define C_008014_SPI_BUSY 0xF7FFFFFF +#define S_008014_SC_BUSY(x) (((x) & 0x1) << 29) +#define G_008014_SC_BUSY(x) (((x) >> 29) & 0x1) +#define C_008014_SC_BUSY 0xDFFFFFFF +#define S_008014_DB_BUSY(x) (((x) & 0x1) << 30) +#define G_008014_DB_BUSY(x) (((x) >> 30) & 0x1) +#define C_008014_DB_BUSY 0xBFFFFFFF +#define S_008014_CB_BUSY(x) (((x) & 0x1) << 31) +#define G_008014_CB_BUSY(x) (((x) >> 31) & 0x1) +#define C_008014_CB_BUSY 0x7FFFFFFF +#define R_008018_GRBM_STATUS_SE1 0x008018 +#define S_008018_DB_CLEAN(x) (((x) & 0x1) << 1) +#define G_008018_DB_CLEAN(x) (((x) >> 1) & 0x1) +#define C_008018_DB_CLEAN 
0xFFFFFFFD +#define S_008018_CB_CLEAN(x) (((x) & 0x1) << 2) +#define G_008018_CB_CLEAN(x) (((x) >> 2) & 0x1) +#define C_008018_CB_CLEAN 0xFFFFFFFB +#define S_008018_BCI_BUSY(x) (((x) & 0x1) << 22) +#define G_008018_BCI_BUSY(x) (((x) >> 22) & 0x1) +#define C_008018_BCI_BUSY 0xFFBFFFFF +#define S_008018_VGT_BUSY(x) (((x) & 0x1) << 23) +#define G_008018_VGT_BUSY(x) (((x) >> 23) & 0x1) +#define C_008018_VGT_BUSY 0xFF7FFFFF +#define S_008018_PA_BUSY(x) (((x) & 0x1) << 24) +#define G_008018_PA_BUSY(x) (((x) >> 24) & 0x1) +#define C_008018_PA_BUSY 0xFEFFFFFF +#define S_008018_TA_BUSY(x) (((x) & 0x1) << 25) +#define G_008018_TA_BUSY(x) (((x) >> 25) & 0x1) +#define C_008018_TA_BUSY 0xFDFFFFFF +#define S_008018_SX_BUSY(x) (((x) & 0x1) << 26) +#define G_008018_SX_BUSY(x) (((x) >> 26) & 0x1) +#define C_008018_SX_BUSY 0xFBFFFFFF +#define S_008018_SPI_BUSY(x) (((x) & 0x1) << 27) +#define G_008018_SPI_BUSY(x) (((x) >> 27) & 0x1) +#define C_008018_SPI_BUSY 0xF7FFFFFF +#define S_008018_SC_BUSY(x) (((x) & 0x1) << 29) +#define G_008018_SC_BUSY(x) (((x) >> 29) & 0x1) +#define C_008018_SC_BUSY 0xDFFFFFFF +#define S_008018_DB_BUSY(x) (((x) & 0x1) << 30) +#define G_008018_DB_BUSY(x) (((x) >> 30) & 0x1) +#define C_008018_DB_BUSY 0xBFFFFFFF +#define S_008018_CB_BUSY(x) (((x) & 0x1) << 31) +#define G_008018_CB_BUSY(x) (((x) >> 31) & 0x1) +#define C_008018_CB_BUSY 0x7FFFFFFF +#define R_008038_GRBM_STATUS_SE2 0x008038 +#define S_008038_DB_CLEAN(x) (((x) & 0x1) << 1) +#define G_008038_DB_CLEAN(x) (((x) >> 1) & 0x1) +#define C_008038_DB_CLEAN 0xFFFFFFFD +#define S_008038_CB_CLEAN(x) (((x) & 0x1) << 2) +#define G_008038_CB_CLEAN(x) (((x) >> 2) & 0x1) +#define C_008038_CB_CLEAN 0xFFFFFFFB +#define S_008038_BCI_BUSY(x) (((x) & 0x1) << 22) +#define G_008038_BCI_BUSY(x) (((x) >> 22) & 0x1) +#define C_008038_BCI_BUSY 0xFFBFFFFF +#define S_008038_VGT_BUSY(x) (((x) & 0x1) << 23) +#define G_008038_VGT_BUSY(x) (((x) >> 23) & 0x1) +#define C_008038_VGT_BUSY 0xFF7FFFFF +#define S_008038_PA_BUSY(x) (((x) & 0x1) << 24) +#define G_008038_PA_BUSY(x) (((x) >> 24) & 0x1) +#define C_008038_PA_BUSY 0xFEFFFFFF +#define S_008038_TA_BUSY(x) (((x) & 0x1) << 25) +#define G_008038_TA_BUSY(x) (((x) >> 25) & 0x1) +#define C_008038_TA_BUSY 0xFDFFFFFF +#define S_008038_SX_BUSY(x) (((x) & 0x1) << 26) +#define G_008038_SX_BUSY(x) (((x) >> 26) & 0x1) +#define C_008038_SX_BUSY 0xFBFFFFFF +#define S_008038_SPI_BUSY(x) (((x) & 0x1) << 27) +#define G_008038_SPI_BUSY(x) (((x) >> 27) & 0x1) +#define C_008038_SPI_BUSY 0xF7FFFFFF +#define S_008038_SC_BUSY(x) (((x) & 0x1) << 29) +#define G_008038_SC_BUSY(x) (((x) >> 29) & 0x1) +#define C_008038_SC_BUSY 0xDFFFFFFF +#define S_008038_DB_BUSY(x) (((x) & 0x1) << 30) +#define G_008038_DB_BUSY(x) (((x) >> 30) & 0x1) +#define C_008038_DB_BUSY 0xBFFFFFFF +#define S_008038_CB_BUSY(x) (((x) & 0x1) << 31) +#define G_008038_CB_BUSY(x) (((x) >> 31) & 0x1) +#define C_008038_CB_BUSY 0x7FFFFFFF +#define R_00803C_GRBM_STATUS_SE3 0x00803C +#define S_00803C_DB_CLEAN(x) (((x) & 0x1) << 1) +#define G_00803C_DB_CLEAN(x) (((x) >> 1) & 0x1) +#define C_00803C_DB_CLEAN 0xFFFFFFFD +#define S_00803C_CB_CLEAN(x) (((x) & 0x1) << 2) +#define G_00803C_CB_CLEAN(x) (((x) >> 2) & 0x1) +#define C_00803C_CB_CLEAN 0xFFFFFFFB +#define S_00803C_BCI_BUSY(x) (((x) & 0x1) << 22) +#define G_00803C_BCI_BUSY(x) (((x) >> 22) & 0x1) +#define C_00803C_BCI_BUSY 0xFFBFFFFF +#define S_00803C_VGT_BUSY(x) (((x) & 0x1) << 23) +#define G_00803C_VGT_BUSY(x) (((x) >> 23) & 0x1) +#define C_00803C_VGT_BUSY 0xFF7FFFFF +#define S_00803C_PA_BUSY(x) (((x) & 0x1) << 24) 
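/* Editor's note: GRBM_STATUS_SE0..SE3 repeat one field layout per shader
 * engine, so an idle check can loop over the instances and reuse the SE0
 * getters, since the bit positions are identical in all four registers.
 * A sketch under that assumption (read_reg() and num_se are hypothetical):
 *
 *   static const unsigned grbm_status_se[] = {
 *       R_008014_GRBM_STATUS_SE0, R_008018_GRBM_STATUS_SE1,
 *       R_008038_GRBM_STATUS_SE2, R_00803C_GRBM_STATUS_SE3,
 *   };
 *
 *   bool busy = false;
 *   for (unsigned se = 0; se < num_se; se++) {
 *       uint32_t v = read_reg(grbm_status_se[se]);   // hypothetical read
 *       busy |= G_008014_SPI_BUSY(v) || G_008014_SC_BUSY(v);
 *   }
 */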
+#define G_00803C_PA_BUSY(x) (((x) >> 24) & 0x1) +#define C_00803C_PA_BUSY 0xFEFFFFFF +#define S_00803C_TA_BUSY(x) (((x) & 0x1) << 25) +#define G_00803C_TA_BUSY(x) (((x) >> 25) & 0x1) +#define C_00803C_TA_BUSY 0xFDFFFFFF +#define S_00803C_SX_BUSY(x) (((x) & 0x1) << 26) +#define G_00803C_SX_BUSY(x) (((x) >> 26) & 0x1) +#define C_00803C_SX_BUSY 0xFBFFFFFF +#define S_00803C_SPI_BUSY(x) (((x) & 0x1) << 27) +#define G_00803C_SPI_BUSY(x) (((x) >> 27) & 0x1) +#define C_00803C_SPI_BUSY 0xF7FFFFFF +#define S_00803C_SC_BUSY(x) (((x) & 0x1) << 29) +#define G_00803C_SC_BUSY(x) (((x) >> 29) & 0x1) +#define C_00803C_SC_BUSY 0xDFFFFFFF +#define S_00803C_DB_BUSY(x) (((x) & 0x1) << 30) +#define G_00803C_DB_BUSY(x) (((x) >> 30) & 0x1) +#define C_00803C_DB_BUSY 0xBFFFFFFF +#define S_00803C_CB_BUSY(x) (((x) & 0x1) << 31) +#define G_00803C_CB_BUSY(x) (((x) >> 31) & 0x1) +#define C_00803C_CB_BUSY 0x7FFFFFFF /* CIK */ +#define R_0300FC_CP_STRMOUT_CNTL 0x0300FC +#define S_0300FC_OFFSET_UPDATE_DONE(x) (((x) & 0x1) << 0) +#define G_0300FC_OFFSET_UPDATE_DONE(x) (((x) >> 0) & 0x1) +#define C_0300FC_OFFSET_UPDATE_DONE 0xFFFFFFFE #define R_0301E4_CP_COHER_BASE_HI 0x0301E4 #define S_0301E4_COHER_BASE_HI_256B(x) (((x) & 0xFF) << 0) #define G_0301E4_COHER_BASE_HI_256B(x) (((x) >> 0) & 0xFF) #define C_0301E4_COHER_BASE_HI_256B 0xFFFFFF00 +#define R_0301EC_CP_COHER_START_DELAY 0x0301EC +#define S_0301EC_START_DELAY_COUNT(x) (((x) & 0x3F) << 0) +#define G_0301EC_START_DELAY_COUNT(x) (((x) >> 0) & 0x3F) +#define C_0301EC_START_DELAY_COUNT 0xFFFFFFC0 #define R_0301F0_CP_COHER_CNTL 0x0301F0 #define S_0301F0_DEST_BASE_0_ENA(x) (((x) & 0x1) << 0) #define G_0301F0_DEST_BASE_0_ENA(x) (((x) >> 0) & 0x1) @@ -289,6 +824,14 @@ #define S_0301F0_DEST_BASE_1_ENA(x) (((x) & 0x1) << 1) #define G_0301F0_DEST_BASE_1_ENA(x) (((x) >> 1) & 0x1) #define C_0301F0_DEST_BASE_1_ENA 0xFFFFFFFD +/* VI */ +#define S_0301F0_TC_SD_ACTION_ENA(x) (((x) & 0x1) << 2) +#define G_0301F0_TC_SD_ACTION_ENA(x) (((x) >> 2) & 0x1) +#define C_0301F0_TC_SD_ACTION_ENA 0xFFFFFFFB +#define S_0301F0_TC_NC_ACTION_ENA(x) (((x) & 0x1) << 3) +#define G_0301F0_TC_NC_ACTION_ENA(x) (((x) >> 3) & 0x1) +#define C_0301F0_TC_NC_ACTION_ENA 0xFFFFFFF7 +/* */ #define S_0301F0_CB0_DEST_BASE_ENA(x) (((x) & 0x1) << 6) #define G_0301F0_CB0_DEST_BASE_ENA(x) (((x) >> 6) & 0x1) #define C_0301F0_CB0_DEST_BASE_ENA 0xFFFFFFBF @@ -319,7 +862,7 @@ #define S_0301F0_TCL1_VOL_ACTION_ENA(x) (((x) & 0x1) << 15) #define G_0301F0_TCL1_VOL_ACTION_ENA(x) (((x) >> 15) & 0x1) #define C_0301F0_TCL1_VOL_ACTION_ENA 0xFFFF7FFF -#define S_0301F0_TC_VOL_ACTION_ENA(x) (((x) & 0x1) << 16) +#define S_0301F0_TC_VOL_ACTION_ENA(x) (((x) & 0x1) << 16) /* not on VI */ #define G_0301F0_TC_VOL_ACTION_ENA(x) (((x) >> 16) & 0x1) #define C_0301F0_TC_VOL_ACTION_ENA 0xFFFEFFFF #define S_0301F0_TC_WB_ACTION_ENA(x) (((x) & 0x1) << 18) @@ -352,8 +895,389 @@ #define S_0301F0_SH_ICACHE_ACTION_ENA(x) (((x) & 0x1) << 29) #define G_0301F0_SH_ICACHE_ACTION_ENA(x) (((x) >> 29) & 0x1) #define C_0301F0_SH_ICACHE_ACTION_ENA 0xDFFFFFFF +/* VI */ +#define S_0301F0_SH_KCACHE_WB_ACTION_ENA(x) (((x) & 0x1) << 30) +#define G_0301F0_SH_KCACHE_WB_ACTION_ENA(x) (((x) >> 30) & 0x1) +#define C_0301F0_SH_KCACHE_WB_ACTION_ENA 0xBFFFFFFF +#define S_0301F0_SH_SD_ACTION_ENA(x) (((x) & 0x1) << 31) +#define G_0301F0_SH_SD_ACTION_ENA(x) (((x) >> 31) & 0x1) +#define C_0301F0_SH_SD_ACTION_ENA 0x7FFFFFFF +/* */ #define R_0301F4_CP_COHER_SIZE 0x0301F4 #define R_0301F8_CP_COHER_BASE 0x0301F8 +#define R_0301FC_CP_COHER_STATUS 0x0301FC +#define 
S_0301FC_MATCHING_GFX_CNTX(x) (((x) & 0xFF) << 0) +#define G_0301FC_MATCHING_GFX_CNTX(x) (((x) >> 0) & 0xFF) +#define C_0301FC_MATCHING_GFX_CNTX 0xFFFFFF00 +#define S_0301FC_MEID(x) (((x) & 0x03) << 24) +#define G_0301FC_MEID(x) (((x) >> 24) & 0x03) +#define C_0301FC_MEID 0xFCFFFFFF +#define S_0301FC_PHASE1_STATUS(x) (((x) & 0x1) << 30) +#define G_0301FC_PHASE1_STATUS(x) (((x) >> 30) & 0x1) +#define C_0301FC_PHASE1_STATUS 0xBFFFFFFF +#define S_0301FC_STATUS(x) (((x) & 0x1) << 31) +#define G_0301FC_STATUS(x) (((x) >> 31) & 0x1) +#define C_0301FC_STATUS 0x7FFFFFFF +#define R_008210_CP_CPC_STATUS 0x008210 +#define S_008210_MEC1_BUSY(x) (((x) & 0x1) << 0) +#define G_008210_MEC1_BUSY(x) (((x) >> 0) & 0x1) +#define C_008210_MEC1_BUSY 0xFFFFFFFE +#define S_008210_MEC2_BUSY(x) (((x) & 0x1) << 1) +#define G_008210_MEC2_BUSY(x) (((x) >> 1) & 0x1) +#define C_008210_MEC2_BUSY 0xFFFFFFFD +#define S_008210_DC0_BUSY(x) (((x) & 0x1) << 2) +#define G_008210_DC0_BUSY(x) (((x) >> 2) & 0x1) +#define C_008210_DC0_BUSY 0xFFFFFFFB +#define S_008210_DC1_BUSY(x) (((x) & 0x1) << 3) +#define G_008210_DC1_BUSY(x) (((x) >> 3) & 0x1) +#define C_008210_DC1_BUSY 0xFFFFFFF7 +#define S_008210_RCIU1_BUSY(x) (((x) & 0x1) << 4) +#define G_008210_RCIU1_BUSY(x) (((x) >> 4) & 0x1) +#define C_008210_RCIU1_BUSY 0xFFFFFFEF +#define S_008210_RCIU2_BUSY(x) (((x) & 0x1) << 5) +#define G_008210_RCIU2_BUSY(x) (((x) >> 5) & 0x1) +#define C_008210_RCIU2_BUSY 0xFFFFFFDF +#define S_008210_ROQ1_BUSY(x) (((x) & 0x1) << 6) +#define G_008210_ROQ1_BUSY(x) (((x) >> 6) & 0x1) +#define C_008210_ROQ1_BUSY 0xFFFFFFBF +#define S_008210_ROQ2_BUSY(x) (((x) & 0x1) << 7) +#define G_008210_ROQ2_BUSY(x) (((x) >> 7) & 0x1) +#define C_008210_ROQ2_BUSY 0xFFFFFF7F +#define S_008210_TCIU_BUSY(x) (((x) & 0x1) << 10) +#define G_008210_TCIU_BUSY(x) (((x) >> 10) & 0x1) +#define C_008210_TCIU_BUSY 0xFFFFFBFF +#define S_008210_SCRATCH_RAM_BUSY(x) (((x) & 0x1) << 11) +#define G_008210_SCRATCH_RAM_BUSY(x) (((x) >> 11) & 0x1) +#define C_008210_SCRATCH_RAM_BUSY 0xFFFFF7FF +#define S_008210_QU_BUSY(x) (((x) & 0x1) << 12) +#define G_008210_QU_BUSY(x) (((x) >> 12) & 0x1) +#define C_008210_QU_BUSY 0xFFFFEFFF +#define S_008210_ATCL2IU_BUSY(x) (((x) & 0x1) << 13) +#define G_008210_ATCL2IU_BUSY(x) (((x) >> 13) & 0x1) +#define C_008210_ATCL2IU_BUSY 0xFFFFDFFF +#define S_008210_CPG_CPC_BUSY(x) (((x) & 0x1) << 29) +#define G_008210_CPG_CPC_BUSY(x) (((x) >> 29) & 0x1) +#define C_008210_CPG_CPC_BUSY 0xDFFFFFFF +#define S_008210_CPF_CPC_BUSY(x) (((x) & 0x1) << 30) +#define G_008210_CPF_CPC_BUSY(x) (((x) >> 30) & 0x1) +#define C_008210_CPF_CPC_BUSY 0xBFFFFFFF +#define S_008210_CPC_BUSY(x) (((x) & 0x1) << 31) +#define G_008210_CPC_BUSY(x) (((x) >> 31) & 0x1) +#define C_008210_CPC_BUSY 0x7FFFFFFF +#define R_008214_CP_CPC_BUSY_STAT 0x008214 +#define S_008214_MEC1_LOAD_BUSY(x) (((x) & 0x1) << 0) +#define G_008214_MEC1_LOAD_BUSY(x) (((x) >> 0) & 0x1) +#define C_008214_MEC1_LOAD_BUSY 0xFFFFFFFE +#define S_008214_MEC1_SEMAPOHRE_BUSY(x) (((x) & 0x1) << 1) +#define G_008214_MEC1_SEMAPOHRE_BUSY(x) (((x) >> 1) & 0x1) +#define C_008214_MEC1_SEMAPOHRE_BUSY 0xFFFFFFFD +#define S_008214_MEC1_MUTEX_BUSY(x) (((x) & 0x1) << 2) +#define G_008214_MEC1_MUTEX_BUSY(x) (((x) >> 2) & 0x1) +#define C_008214_MEC1_MUTEX_BUSY 0xFFFFFFFB +#define S_008214_MEC1_MESSAGE_BUSY(x) (((x) & 0x1) << 3) +#define G_008214_MEC1_MESSAGE_BUSY(x) (((x) >> 3) & 0x1) +#define C_008214_MEC1_MESSAGE_BUSY 0xFFFFFFF7 +#define S_008214_MEC1_EOP_QUEUE_BUSY(x) (((x) & 0x1) << 4) +#define G_008214_MEC1_EOP_QUEUE_BUSY(x) (((x) >> 4) & 0x1) 
+#define C_008214_MEC1_EOP_QUEUE_BUSY 0xFFFFFFEF +#define S_008214_MEC1_IQ_QUEUE_BUSY(x) (((x) & 0x1) << 5) +#define G_008214_MEC1_IQ_QUEUE_BUSY(x) (((x) >> 5) & 0x1) +#define C_008214_MEC1_IQ_QUEUE_BUSY 0xFFFFFFDF +#define S_008214_MEC1_IB_QUEUE_BUSY(x) (((x) & 0x1) << 6) +#define G_008214_MEC1_IB_QUEUE_BUSY(x) (((x) >> 6) & 0x1) +#define C_008214_MEC1_IB_QUEUE_BUSY 0xFFFFFFBF +#define S_008214_MEC1_TC_BUSY(x) (((x) & 0x1) << 7) +#define G_008214_MEC1_TC_BUSY(x) (((x) >> 7) & 0x1) +#define C_008214_MEC1_TC_BUSY 0xFFFFFF7F +#define S_008214_MEC1_DMA_BUSY(x) (((x) & 0x1) << 8) +#define G_008214_MEC1_DMA_BUSY(x) (((x) >> 8) & 0x1) +#define C_008214_MEC1_DMA_BUSY 0xFFFFFEFF +#define S_008214_MEC1_PARTIAL_FLUSH_BUSY(x) (((x) & 0x1) << 9) +#define G_008214_MEC1_PARTIAL_FLUSH_BUSY(x) (((x) >> 9) & 0x1) +#define C_008214_MEC1_PARTIAL_FLUSH_BUSY 0xFFFFFDFF +#define S_008214_MEC1_PIPE0_BUSY(x) (((x) & 0x1) << 10) +#define G_008214_MEC1_PIPE0_BUSY(x) (((x) >> 10) & 0x1) +#define C_008214_MEC1_PIPE0_BUSY 0xFFFFFBFF +#define S_008214_MEC1_PIPE1_BUSY(x) (((x) & 0x1) << 11) +#define G_008214_MEC1_PIPE1_BUSY(x) (((x) >> 11) & 0x1) +#define C_008214_MEC1_PIPE1_BUSY 0xFFFFF7FF +#define S_008214_MEC1_PIPE2_BUSY(x) (((x) & 0x1) << 12) +#define G_008214_MEC1_PIPE2_BUSY(x) (((x) >> 12) & 0x1) +#define C_008214_MEC1_PIPE2_BUSY 0xFFFFEFFF +#define S_008214_MEC1_PIPE3_BUSY(x) (((x) & 0x1) << 13) +#define G_008214_MEC1_PIPE3_BUSY(x) (((x) >> 13) & 0x1) +#define C_008214_MEC1_PIPE3_BUSY 0xFFFFDFFF +#define S_008214_MEC2_LOAD_BUSY(x) (((x) & 0x1) << 16) +#define G_008214_MEC2_LOAD_BUSY(x) (((x) >> 16) & 0x1) +#define C_008214_MEC2_LOAD_BUSY 0xFFFEFFFF +#define S_008214_MEC2_SEMAPOHRE_BUSY(x) (((x) & 0x1) << 17) +#define G_008214_MEC2_SEMAPOHRE_BUSY(x) (((x) >> 17) & 0x1) +#define C_008214_MEC2_SEMAPOHRE_BUSY 0xFFFDFFFF +#define S_008214_MEC2_MUTEX_BUSY(x) (((x) & 0x1) << 18) +#define G_008214_MEC2_MUTEX_BUSY(x) (((x) >> 18) & 0x1) +#define C_008214_MEC2_MUTEX_BUSY 0xFFFBFFFF +#define S_008214_MEC2_MESSAGE_BUSY(x) (((x) & 0x1) << 19) +#define G_008214_MEC2_MESSAGE_BUSY(x) (((x) >> 19) & 0x1) +#define C_008214_MEC2_MESSAGE_BUSY 0xFFF7FFFF +#define S_008214_MEC2_EOP_QUEUE_BUSY(x) (((x) & 0x1) << 20) +#define G_008214_MEC2_EOP_QUEUE_BUSY(x) (((x) >> 20) & 0x1) +#define C_008214_MEC2_EOP_QUEUE_BUSY 0xFFEFFFFF +#define S_008214_MEC2_IQ_QUEUE_BUSY(x) (((x) & 0x1) << 21) +#define G_008214_MEC2_IQ_QUEUE_BUSY(x) (((x) >> 21) & 0x1) +#define C_008214_MEC2_IQ_QUEUE_BUSY 0xFFDFFFFF +#define S_008214_MEC2_IB_QUEUE_BUSY(x) (((x) & 0x1) << 22) +#define G_008214_MEC2_IB_QUEUE_BUSY(x) (((x) >> 22) & 0x1) +#define C_008214_MEC2_IB_QUEUE_BUSY 0xFFBFFFFF +#define S_008214_MEC2_TC_BUSY(x) (((x) & 0x1) << 23) +#define G_008214_MEC2_TC_BUSY(x) (((x) >> 23) & 0x1) +#define C_008214_MEC2_TC_BUSY 0xFF7FFFFF +#define S_008214_MEC2_DMA_BUSY(x) (((x) & 0x1) << 24) +#define G_008214_MEC2_DMA_BUSY(x) (((x) >> 24) & 0x1) +#define C_008214_MEC2_DMA_BUSY 0xFEFFFFFF +#define S_008214_MEC2_PARTIAL_FLUSH_BUSY(x) (((x) & 0x1) << 25) +#define G_008214_MEC2_PARTIAL_FLUSH_BUSY(x) (((x) >> 25) & 0x1) +#define C_008214_MEC2_PARTIAL_FLUSH_BUSY 0xFDFFFFFF +#define S_008214_MEC2_PIPE0_BUSY(x) (((x) & 0x1) << 26) +#define G_008214_MEC2_PIPE0_BUSY(x) (((x) >> 26) & 0x1) +#define C_008214_MEC2_PIPE0_BUSY 0xFBFFFFFF +#define S_008214_MEC2_PIPE1_BUSY(x) (((x) & 0x1) << 27) +#define G_008214_MEC2_PIPE1_BUSY(x) (((x) >> 27) & 0x1) +#define C_008214_MEC2_PIPE1_BUSY 0xF7FFFFFF +#define S_008214_MEC2_PIPE2_BUSY(x) (((x) & 0x1) << 28) +#define G_008214_MEC2_PIPE2_BUSY(x) 
(((x) >> 28) & 0x1) +#define C_008214_MEC2_PIPE2_BUSY 0xEFFFFFFF +#define S_008214_MEC2_PIPE3_BUSY(x) (((x) & 0x1) << 29) +#define G_008214_MEC2_PIPE3_BUSY(x) (((x) >> 29) & 0x1) +#define C_008214_MEC2_PIPE3_BUSY 0xDFFFFFFF +#define R_008218_CP_CPC_STALLED_STAT1 0x008218 +#define S_008218_RCIU_TX_FREE_STALL(x) (((x) & 0x1) << 3) +#define G_008218_RCIU_TX_FREE_STALL(x) (((x) >> 3) & 0x1) +#define C_008218_RCIU_TX_FREE_STALL 0xFFFFFFF7 +#define S_008218_RCIU_PRIV_VIOLATION(x) (((x) & 0x1) << 4) +#define G_008218_RCIU_PRIV_VIOLATION(x) (((x) >> 4) & 0x1) +#define C_008218_RCIU_PRIV_VIOLATION 0xFFFFFFEF +#define S_008218_TCIU_TX_FREE_STALL(x) (((x) & 0x1) << 6) +#define G_008218_TCIU_TX_FREE_STALL(x) (((x) >> 6) & 0x1) +#define C_008218_TCIU_TX_FREE_STALL 0xFFFFFFBF +#define S_008218_MEC1_DECODING_PACKET(x) (((x) & 0x1) << 8) +#define G_008218_MEC1_DECODING_PACKET(x) (((x) >> 8) & 0x1) +#define C_008218_MEC1_DECODING_PACKET 0xFFFFFEFF +#define S_008218_MEC1_WAIT_ON_RCIU(x) (((x) & 0x1) << 9) +#define G_008218_MEC1_WAIT_ON_RCIU(x) (((x) >> 9) & 0x1) +#define C_008218_MEC1_WAIT_ON_RCIU 0xFFFFFDFF +#define S_008218_MEC1_WAIT_ON_RCIU_READ(x) (((x) & 0x1) << 10) +#define G_008218_MEC1_WAIT_ON_RCIU_READ(x) (((x) >> 10) & 0x1) +#define C_008218_MEC1_WAIT_ON_RCIU_READ 0xFFFFFBFF +#define S_008218_MEC1_WAIT_ON_ROQ_DATA(x) (((x) & 0x1) << 13) +#define G_008218_MEC1_WAIT_ON_ROQ_DATA(x) (((x) >> 13) & 0x1) +#define C_008218_MEC1_WAIT_ON_ROQ_DATA 0xFFFFDFFF +#define S_008218_MEC2_DECODING_PACKET(x) (((x) & 0x1) << 16) +#define G_008218_MEC2_DECODING_PACKET(x) (((x) >> 16) & 0x1) +#define C_008218_MEC2_DECODING_PACKET 0xFFFEFFFF +#define S_008218_MEC2_WAIT_ON_RCIU(x) (((x) & 0x1) << 17) +#define G_008218_MEC2_WAIT_ON_RCIU(x) (((x) >> 17) & 0x1) +#define C_008218_MEC2_WAIT_ON_RCIU 0xFFFDFFFF +#define S_008218_MEC2_WAIT_ON_RCIU_READ(x) (((x) & 0x1) << 18) +#define G_008218_MEC2_WAIT_ON_RCIU_READ(x) (((x) >> 18) & 0x1) +#define C_008218_MEC2_WAIT_ON_RCIU_READ 0xFFFBFFFF +#define S_008218_MEC2_WAIT_ON_ROQ_DATA(x) (((x) & 0x1) << 21) +#define G_008218_MEC2_WAIT_ON_ROQ_DATA(x) (((x) >> 21) & 0x1) +#define C_008218_MEC2_WAIT_ON_ROQ_DATA 0xFFDFFFFF +#define S_008218_ATCL2IU_WAITING_ON_FREE(x) (((x) & 0x1) << 22) +#define G_008218_ATCL2IU_WAITING_ON_FREE(x) (((x) >> 22) & 0x1) +#define C_008218_ATCL2IU_WAITING_ON_FREE 0xFFBFFFFF +#define S_008218_ATCL2IU_WAITING_ON_TAGS(x) (((x) & 0x1) << 23) +#define G_008218_ATCL2IU_WAITING_ON_TAGS(x) (((x) >> 23) & 0x1) +#define C_008218_ATCL2IU_WAITING_ON_TAGS 0xFF7FFFFF +#define S_008218_ATCL1_WAITING_ON_TRANS(x) (((x) & 0x1) << 24) +#define G_008218_ATCL1_WAITING_ON_TRANS(x) (((x) >> 24) & 0x1) +#define C_008218_ATCL1_WAITING_ON_TRANS 0xFEFFFFFF +#define R_00821C_CP_CPF_STATUS 0x00821C +#define S_00821C_POST_WPTR_GFX_BUSY(x) (((x) & 0x1) << 0) +#define G_00821C_POST_WPTR_GFX_BUSY(x) (((x) >> 0) & 0x1) +#define C_00821C_POST_WPTR_GFX_BUSY 0xFFFFFFFE +#define S_00821C_CSF_BUSY(x) (((x) & 0x1) << 1) +#define G_00821C_CSF_BUSY(x) (((x) >> 1) & 0x1) +#define C_00821C_CSF_BUSY 0xFFFFFFFD +#define S_00821C_ROQ_ALIGN_BUSY(x) (((x) & 0x1) << 4) +#define G_00821C_ROQ_ALIGN_BUSY(x) (((x) >> 4) & 0x1) +#define C_00821C_ROQ_ALIGN_BUSY 0xFFFFFFEF +#define S_00821C_ROQ_RING_BUSY(x) (((x) & 0x1) << 5) +#define G_00821C_ROQ_RING_BUSY(x) (((x) >> 5) & 0x1) +#define C_00821C_ROQ_RING_BUSY 0xFFFFFFDF +#define S_00821C_ROQ_INDIRECT1_BUSY(x) (((x) & 0x1) << 6) +#define G_00821C_ROQ_INDIRECT1_BUSY(x) (((x) >> 6) & 0x1) +#define C_00821C_ROQ_INDIRECT1_BUSY 0xFFFFFFBF +#define 
S_00821C_ROQ_INDIRECT2_BUSY(x) (((x) & 0x1) << 7) +#define G_00821C_ROQ_INDIRECT2_BUSY(x) (((x) >> 7) & 0x1) +#define C_00821C_ROQ_INDIRECT2_BUSY 0xFFFFFF7F +#define S_00821C_ROQ_STATE_BUSY(x) (((x) & 0x1) << 8) +#define G_00821C_ROQ_STATE_BUSY(x) (((x) >> 8) & 0x1) +#define C_00821C_ROQ_STATE_BUSY 0xFFFFFEFF +#define S_00821C_ROQ_CE_RING_BUSY(x) (((x) & 0x1) << 9) +#define G_00821C_ROQ_CE_RING_BUSY(x) (((x) >> 9) & 0x1) +#define C_00821C_ROQ_CE_RING_BUSY 0xFFFFFDFF +#define S_00821C_ROQ_CE_INDIRECT1_BUSY(x) (((x) & 0x1) << 10) +#define G_00821C_ROQ_CE_INDIRECT1_BUSY(x) (((x) >> 10) & 0x1) +#define C_00821C_ROQ_CE_INDIRECT1_BUSY 0xFFFFFBFF +#define S_00821C_ROQ_CE_INDIRECT2_BUSY(x) (((x) & 0x1) << 11) +#define G_00821C_ROQ_CE_INDIRECT2_BUSY(x) (((x) >> 11) & 0x1) +#define C_00821C_ROQ_CE_INDIRECT2_BUSY 0xFFFFF7FF +#define S_00821C_SEMAPHORE_BUSY(x) (((x) & 0x1) << 12) +#define G_00821C_SEMAPHORE_BUSY(x) (((x) >> 12) & 0x1) +#define C_00821C_SEMAPHORE_BUSY 0xFFFFEFFF +#define S_00821C_INTERRUPT_BUSY(x) (((x) & 0x1) << 13) +#define G_00821C_INTERRUPT_BUSY(x) (((x) >> 13) & 0x1) +#define C_00821C_INTERRUPT_BUSY 0xFFFFDFFF +#define S_00821C_TCIU_BUSY(x) (((x) & 0x1) << 14) +#define G_00821C_TCIU_BUSY(x) (((x) >> 14) & 0x1) +#define C_00821C_TCIU_BUSY 0xFFFFBFFF +#define S_00821C_HQD_BUSY(x) (((x) & 0x1) << 15) +#define G_00821C_HQD_BUSY(x) (((x) >> 15) & 0x1) +#define C_00821C_HQD_BUSY 0xFFFF7FFF +#define S_00821C_PRT_BUSY(x) (((x) & 0x1) << 16) +#define G_00821C_PRT_BUSY(x) (((x) >> 16) & 0x1) +#define C_00821C_PRT_BUSY 0xFFFEFFFF +#define S_00821C_ATCL2IU_BUSY(x) (((x) & 0x1) << 17) +#define G_00821C_ATCL2IU_BUSY(x) (((x) >> 17) & 0x1) +#define C_00821C_ATCL2IU_BUSY 0xFFFDFFFF +#define S_00821C_CPF_GFX_BUSY(x) (((x) & 0x1) << 26) +#define G_00821C_CPF_GFX_BUSY(x) (((x) >> 26) & 0x1) +#define C_00821C_CPF_GFX_BUSY 0xFBFFFFFF +#define S_00821C_CPF_CMP_BUSY(x) (((x) & 0x1) << 27) +#define G_00821C_CPF_CMP_BUSY(x) (((x) >> 27) & 0x1) +#define C_00821C_CPF_CMP_BUSY 0xF7FFFFFF +#define S_00821C_GRBM_CPF_STAT_BUSY(x) (((x) & 0x03) << 28) +#define G_00821C_GRBM_CPF_STAT_BUSY(x) (((x) >> 28) & 0x03) +#define C_00821C_GRBM_CPF_STAT_BUSY 0xCFFFFFFF +#define S_00821C_CPC_CPF_BUSY(x) (((x) & 0x1) << 30) +#define G_00821C_CPC_CPF_BUSY(x) (((x) >> 30) & 0x1) +#define C_00821C_CPC_CPF_BUSY 0xBFFFFFFF +#define S_00821C_CPF_BUSY(x) (((x) & 0x1) << 31) +#define G_00821C_CPF_BUSY(x) (((x) >> 31) & 0x1) +#define C_00821C_CPF_BUSY 0x7FFFFFFF +#define R_008220_CP_CPF_BUSY_STAT 0x008220 +#define S_008220_REG_BUS_FIFO_BUSY(x) (((x) & 0x1) << 0) +#define G_008220_REG_BUS_FIFO_BUSY(x) (((x) >> 0) & 0x1) +#define C_008220_REG_BUS_FIFO_BUSY 0xFFFFFFFE +#define S_008220_CSF_RING_BUSY(x) (((x) & 0x1) << 1) +#define G_008220_CSF_RING_BUSY(x) (((x) >> 1) & 0x1) +#define C_008220_CSF_RING_BUSY 0xFFFFFFFD +#define S_008220_CSF_INDIRECT1_BUSY(x) (((x) & 0x1) << 2) +#define G_008220_CSF_INDIRECT1_BUSY(x) (((x) >> 2) & 0x1) +#define C_008220_CSF_INDIRECT1_BUSY 0xFFFFFFFB +#define S_008220_CSF_INDIRECT2_BUSY(x) (((x) & 0x1) << 3) +#define G_008220_CSF_INDIRECT2_BUSY(x) (((x) >> 3) & 0x1) +#define C_008220_CSF_INDIRECT2_BUSY 0xFFFFFFF7 +#define S_008220_CSF_STATE_BUSY(x) (((x) & 0x1) << 4) +#define G_008220_CSF_STATE_BUSY(x) (((x) >> 4) & 0x1) +#define C_008220_CSF_STATE_BUSY 0xFFFFFFEF +#define S_008220_CSF_CE_INDR1_BUSY(x) (((x) & 0x1) << 5) +#define G_008220_CSF_CE_INDR1_BUSY(x) (((x) >> 5) & 0x1) +#define C_008220_CSF_CE_INDR1_BUSY 0xFFFFFFDF +#define S_008220_CSF_CE_INDR2_BUSY(x) (((x) & 0x1) << 6) +#define 
G_008220_CSF_CE_INDR2_BUSY(x) (((x) >> 6) & 0x1) +#define C_008220_CSF_CE_INDR2_BUSY 0xFFFFFFBF +#define S_008220_CSF_ARBITER_BUSY(x) (((x) & 0x1) << 7) +#define G_008220_CSF_ARBITER_BUSY(x) (((x) >> 7) & 0x1) +#define C_008220_CSF_ARBITER_BUSY 0xFFFFFF7F +#define S_008220_CSF_INPUT_BUSY(x) (((x) & 0x1) << 8) +#define G_008220_CSF_INPUT_BUSY(x) (((x) >> 8) & 0x1) +#define C_008220_CSF_INPUT_BUSY 0xFFFFFEFF +#define S_008220_OUTSTANDING_READ_TAGS(x) (((x) & 0x1) << 9) +#define G_008220_OUTSTANDING_READ_TAGS(x) (((x) >> 9) & 0x1) +#define C_008220_OUTSTANDING_READ_TAGS 0xFFFFFDFF +#define S_008220_HPD_PROCESSING_EOP_BUSY(x) (((x) & 0x1) << 11) +#define G_008220_HPD_PROCESSING_EOP_BUSY(x) (((x) >> 11) & 0x1) +#define C_008220_HPD_PROCESSING_EOP_BUSY 0xFFFFF7FF +#define S_008220_HQD_DISPATCH_BUSY(x) (((x) & 0x1) << 12) +#define G_008220_HQD_DISPATCH_BUSY(x) (((x) >> 12) & 0x1) +#define C_008220_HQD_DISPATCH_BUSY 0xFFFFEFFF +#define S_008220_HQD_IQ_TIMER_BUSY(x) (((x) & 0x1) << 13) +#define G_008220_HQD_IQ_TIMER_BUSY(x) (((x) >> 13) & 0x1) +#define C_008220_HQD_IQ_TIMER_BUSY 0xFFFFDFFF +#define S_008220_HQD_DMA_OFFLOAD_BUSY(x) (((x) & 0x1) << 14) +#define G_008220_HQD_DMA_OFFLOAD_BUSY(x) (((x) >> 14) & 0x1) +#define C_008220_HQD_DMA_OFFLOAD_BUSY 0xFFFFBFFF +#define S_008220_HQD_WAIT_SEMAPHORE_BUSY(x) (((x) & 0x1) << 15) +#define G_008220_HQD_WAIT_SEMAPHORE_BUSY(x) (((x) >> 15) & 0x1) +#define C_008220_HQD_WAIT_SEMAPHORE_BUSY 0xFFFF7FFF +#define S_008220_HQD_SIGNAL_SEMAPHORE_BUSY(x) (((x) & 0x1) << 16) +#define G_008220_HQD_SIGNAL_SEMAPHORE_BUSY(x) (((x) >> 16) & 0x1) +#define C_008220_HQD_SIGNAL_SEMAPHORE_BUSY 0xFFFEFFFF +#define S_008220_HQD_MESSAGE_BUSY(x) (((x) & 0x1) << 17) +#define G_008220_HQD_MESSAGE_BUSY(x) (((x) >> 17) & 0x1) +#define C_008220_HQD_MESSAGE_BUSY 0xFFFDFFFF +#define S_008220_HQD_PQ_FETCHER_BUSY(x) (((x) & 0x1) << 18) +#define G_008220_HQD_PQ_FETCHER_BUSY(x) (((x) >> 18) & 0x1) +#define C_008220_HQD_PQ_FETCHER_BUSY 0xFFFBFFFF +#define S_008220_HQD_IB_FETCHER_BUSY(x) (((x) & 0x1) << 19) +#define G_008220_HQD_IB_FETCHER_BUSY(x) (((x) >> 19) & 0x1) +#define C_008220_HQD_IB_FETCHER_BUSY 0xFFF7FFFF +#define S_008220_HQD_IQ_FETCHER_BUSY(x) (((x) & 0x1) << 20) +#define G_008220_HQD_IQ_FETCHER_BUSY(x) (((x) >> 20) & 0x1) +#define C_008220_HQD_IQ_FETCHER_BUSY 0xFFEFFFFF +#define S_008220_HQD_EOP_FETCHER_BUSY(x) (((x) & 0x1) << 21) +#define G_008220_HQD_EOP_FETCHER_BUSY(x) (((x) >> 21) & 0x1) +#define C_008220_HQD_EOP_FETCHER_BUSY 0xFFDFFFFF +#define S_008220_HQD_CONSUMED_RPTR_BUSY(x) (((x) & 0x1) << 22) +#define G_008220_HQD_CONSUMED_RPTR_BUSY(x) (((x) >> 22) & 0x1) +#define C_008220_HQD_CONSUMED_RPTR_BUSY 0xFFBFFFFF +#define S_008220_HQD_FETCHER_ARB_BUSY(x) (((x) & 0x1) << 23) +#define G_008220_HQD_FETCHER_ARB_BUSY(x) (((x) >> 23) & 0x1) +#define C_008220_HQD_FETCHER_ARB_BUSY 0xFF7FFFFF +#define S_008220_HQD_ROQ_ALIGN_BUSY(x) (((x) & 0x1) << 24) +#define G_008220_HQD_ROQ_ALIGN_BUSY(x) (((x) >> 24) & 0x1) +#define C_008220_HQD_ROQ_ALIGN_BUSY 0xFEFFFFFF +#define S_008220_HQD_ROQ_EOP_BUSY(x) (((x) & 0x1) << 25) +#define G_008220_HQD_ROQ_EOP_BUSY(x) (((x) >> 25) & 0x1) +#define C_008220_HQD_ROQ_EOP_BUSY 0xFDFFFFFF +#define S_008220_HQD_ROQ_IQ_BUSY(x) (((x) & 0x1) << 26) +#define G_008220_HQD_ROQ_IQ_BUSY(x) (((x) >> 26) & 0x1) +#define C_008220_HQD_ROQ_IQ_BUSY 0xFBFFFFFF +#define S_008220_HQD_ROQ_PQ_BUSY(x) (((x) & 0x1) << 27) +#define G_008220_HQD_ROQ_PQ_BUSY(x) (((x) >> 27) & 0x1) +#define C_008220_HQD_ROQ_PQ_BUSY 0xF7FFFFFF +#define S_008220_HQD_ROQ_IB_BUSY(x) (((x) & 0x1) << 28) 
+#define G_008220_HQD_ROQ_IB_BUSY(x) (((x) >> 28) & 0x1) +#define C_008220_HQD_ROQ_IB_BUSY 0xEFFFFFFF +#define S_008220_HQD_WPTR_POLL_BUSY(x) (((x) & 0x1) << 29) +#define G_008220_HQD_WPTR_POLL_BUSY(x) (((x) >> 29) & 0x1) +#define C_008220_HQD_WPTR_POLL_BUSY 0xDFFFFFFF +#define S_008220_HQD_PQ_BUSY(x) (((x) & 0x1) << 30) +#define G_008220_HQD_PQ_BUSY(x) (((x) >> 30) & 0x1) +#define C_008220_HQD_PQ_BUSY 0xBFFFFFFF +#define S_008220_HQD_IB_BUSY(x) (((x) & 0x1) << 31) +#define G_008220_HQD_IB_BUSY(x) (((x) >> 31) & 0x1) +#define C_008220_HQD_IB_BUSY 0x7FFFFFFF +#define R_008224_CP_CPF_STALLED_STAT1 0x008224 +#define S_008224_RING_FETCHING_DATA(x) (((x) & 0x1) << 0) +#define G_008224_RING_FETCHING_DATA(x) (((x) >> 0) & 0x1) +#define C_008224_RING_FETCHING_DATA 0xFFFFFFFE +#define S_008224_INDR1_FETCHING_DATA(x) (((x) & 0x1) << 1) +#define G_008224_INDR1_FETCHING_DATA(x) (((x) >> 1) & 0x1) +#define C_008224_INDR1_FETCHING_DATA 0xFFFFFFFD +#define S_008224_INDR2_FETCHING_DATA(x) (((x) & 0x1) << 2) +#define G_008224_INDR2_FETCHING_DATA(x) (((x) >> 2) & 0x1) +#define C_008224_INDR2_FETCHING_DATA 0xFFFFFFFB +#define S_008224_STATE_FETCHING_DATA(x) (((x) & 0x1) << 3) +#define G_008224_STATE_FETCHING_DATA(x) (((x) >> 3) & 0x1) +#define C_008224_STATE_FETCHING_DATA 0xFFFFFFF7 +#define S_008224_TCIU_WAITING_ON_FREE(x) (((x) & 0x1) << 5) +#define G_008224_TCIU_WAITING_ON_FREE(x) (((x) >> 5) & 0x1) +#define C_008224_TCIU_WAITING_ON_FREE 0xFFFFFFDF +#define S_008224_TCIU_WAITING_ON_TAGS(x) (((x) & 0x1) << 6) +#define G_008224_TCIU_WAITING_ON_TAGS(x) (((x) >> 6) & 0x1) +#define C_008224_TCIU_WAITING_ON_TAGS 0xFFFFFFBF +#define S_008224_ATCL2IU_WAITING_ON_FREE(x) (((x) & 0x1) << 7) +#define G_008224_ATCL2IU_WAITING_ON_FREE(x) (((x) >> 7) & 0x1) +#define C_008224_ATCL2IU_WAITING_ON_FREE 0xFFFFFF7F +#define S_008224_ATCL2IU_WAITING_ON_TAGS(x) (((x) & 0x1) << 8) +#define G_008224_ATCL2IU_WAITING_ON_TAGS(x) (((x) >> 8) & 0x1) +#define C_008224_ATCL2IU_WAITING_ON_TAGS 0xFFFFFEFF +#define S_008224_ATCL1_WAITING_ON_TRANS(x) (((x) & 0x1) << 9) +#define G_008224_ATCL1_WAITING_ON_TRANS(x) (((x) >> 9) & 0x1) +#define C_008224_ATCL1_WAITING_ON_TRANS 0xFFFFFDFF #define R_030230_CP_COHER_SIZE_HI 0x030230 #define S_030230_COHER_SIZE_HI_256B(x) (((x) & 0xFF) << 0) #define G_030230_COHER_SIZE_HI_256B(x) (((x) >> 0) & 0xFF) @@ -375,10 +1299,6 @@ #define C_0088C4_ES_LIMIT 0xFFE0FFFF #define R_0088C8_VGT_ESGS_RING_SIZE 0x0088C8 #define R_0088CC_VGT_GSVS_RING_SIZE 0x0088CC -/* CIK */ -#define R_030900_VGT_ESGS_RING_SIZE 0x030900 -#define R_030904_VGT_GSVS_RING_SIZE 0x030904 -/* */ #define R_0088D4_VGT_GS_VERTEX_REUSE 0x0088D4 #define S_0088D4_VERT_REUSE(x) (((x) & 0x1F) << 0) #define G_0088D4_VERT_REUSE(x) (((x) >> 0) & 0x1F) @@ -461,7 +1381,293 @@ #define S_008B10_CURRENT_COUNT(x) (((x) & 0xFF) << 8) #define G_008B10_CURRENT_COUNT(x) (((x) >> 8) & 0xFF) #define C_008B10_CURRENT_COUNT 0xFFFF00FF +#define R_008670_CP_STALLED_STAT3 0x008670 +#define S_008670_CE_TO_CSF_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 0) +#define G_008670_CE_TO_CSF_NOT_RDY_TO_RCV(x) (((x) >> 0) & 0x1) +#define C_008670_CE_TO_CSF_NOT_RDY_TO_RCV 0xFFFFFFFE +#define S_008670_CE_TO_RAM_INIT_FETCHER_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 1) +#define G_008670_CE_TO_RAM_INIT_FETCHER_NOT_RDY_TO_RCV(x) (((x) >> 1) & 0x1) +#define C_008670_CE_TO_RAM_INIT_FETCHER_NOT_RDY_TO_RCV 0xFFFFFFFD +#define S_008670_CE_WAITING_ON_DATA_FROM_RAM_INIT_FETCHER(x) (((x) & 0x1) << 2) +#define G_008670_CE_WAITING_ON_DATA_FROM_RAM_INIT_FETCHER(x) (((x) >> 2) & 0x1) +#define 
C_008670_CE_WAITING_ON_DATA_FROM_RAM_INIT_FETCHER 0xFFFFFFFB +#define S_008670_CE_TO_RAM_INIT_NOT_RDY(x) (((x) & 0x1) << 3) +#define G_008670_CE_TO_RAM_INIT_NOT_RDY(x) (((x) >> 3) & 0x1) +#define C_008670_CE_TO_RAM_INIT_NOT_RDY 0xFFFFFFF7 +#define S_008670_CE_TO_RAM_DUMP_NOT_RDY(x) (((x) & 0x1) << 4) +#define G_008670_CE_TO_RAM_DUMP_NOT_RDY(x) (((x) >> 4) & 0x1) +#define C_008670_CE_TO_RAM_DUMP_NOT_RDY 0xFFFFFFEF +#define S_008670_CE_TO_RAM_WRITE_NOT_RDY(x) (((x) & 0x1) << 5) +#define G_008670_CE_TO_RAM_WRITE_NOT_RDY(x) (((x) >> 5) & 0x1) +#define C_008670_CE_TO_RAM_WRITE_NOT_RDY 0xFFFFFFDF +#define S_008670_CE_TO_INC_FIFO_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 6) +#define G_008670_CE_TO_INC_FIFO_NOT_RDY_TO_RCV(x) (((x) >> 6) & 0x1) +#define C_008670_CE_TO_INC_FIFO_NOT_RDY_TO_RCV 0xFFFFFFBF +#define S_008670_CE_TO_WR_FIFO_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 7) +#define G_008670_CE_TO_WR_FIFO_NOT_RDY_TO_RCV(x) (((x) >> 7) & 0x1) +#define C_008670_CE_TO_WR_FIFO_NOT_RDY_TO_RCV 0xFFFFFF7F +#define S_008670_CE_WAITING_ON_BUFFER_DATA(x) (((x) & 0x1) << 10) +#define G_008670_CE_WAITING_ON_BUFFER_DATA(x) (((x) >> 10) & 0x1) +#define C_008670_CE_WAITING_ON_BUFFER_DATA 0xFFFFFBFF +#define S_008670_CE_WAITING_ON_CE_BUFFER_FLAG(x) (((x) & 0x1) << 11) +#define G_008670_CE_WAITING_ON_CE_BUFFER_FLAG(x) (((x) >> 11) & 0x1) +#define C_008670_CE_WAITING_ON_CE_BUFFER_FLAG 0xFFFFF7FF +#define S_008670_CE_WAITING_ON_DE_COUNTER(x) (((x) & 0x1) << 12) +#define G_008670_CE_WAITING_ON_DE_COUNTER(x) (((x) >> 12) & 0x1) +#define C_008670_CE_WAITING_ON_DE_COUNTER 0xFFFFEFFF +#define S_008670_CE_WAITING_ON_DE_COUNTER_UNDERFLOW(x) (((x) & 0x1) << 13) +#define G_008670_CE_WAITING_ON_DE_COUNTER_UNDERFLOW(x) (((x) >> 13) & 0x1) +#define C_008670_CE_WAITING_ON_DE_COUNTER_UNDERFLOW 0xFFFFDFFF +#define S_008670_TCIU_WAITING_ON_FREE(x) (((x) & 0x1) << 14) +#define G_008670_TCIU_WAITING_ON_FREE(x) (((x) >> 14) & 0x1) +#define C_008670_TCIU_WAITING_ON_FREE 0xFFFFBFFF +#define S_008670_TCIU_WAITING_ON_TAGS(x) (((x) & 0x1) << 15) +#define G_008670_TCIU_WAITING_ON_TAGS(x) (((x) >> 15) & 0x1) +#define C_008670_TCIU_WAITING_ON_TAGS 0xFFFF7FFF +#define S_008670_CE_STALLED_ON_TC_WR_CONFIRM(x) (((x) & 0x1) << 16) +#define G_008670_CE_STALLED_ON_TC_WR_CONFIRM(x) (((x) >> 16) & 0x1) +#define C_008670_CE_STALLED_ON_TC_WR_CONFIRM 0xFFFEFFFF +#define S_008670_CE_STALLED_ON_ATOMIC_RTN_DATA(x) (((x) & 0x1) << 17) +#define G_008670_CE_STALLED_ON_ATOMIC_RTN_DATA(x) (((x) >> 17) & 0x1) +#define C_008670_CE_STALLED_ON_ATOMIC_RTN_DATA 0xFFFDFFFF +#define S_008670_ATCL2IU_WAITING_ON_FREE(x) (((x) & 0x1) << 18) +#define G_008670_ATCL2IU_WAITING_ON_FREE(x) (((x) >> 18) & 0x1) +#define C_008670_ATCL2IU_WAITING_ON_FREE 0xFFFBFFFF +#define S_008670_ATCL2IU_WAITING_ON_TAGS(x) (((x) & 0x1) << 19) +#define G_008670_ATCL2IU_WAITING_ON_TAGS(x) (((x) >> 19) & 0x1) +#define C_008670_ATCL2IU_WAITING_ON_TAGS 0xFFF7FFFF +#define S_008670_ATCL1_WAITING_ON_TRANS(x) (((x) & 0x1) << 20) +#define G_008670_ATCL1_WAITING_ON_TRANS(x) (((x) >> 20) & 0x1) +#define C_008670_ATCL1_WAITING_ON_TRANS 0xFFEFFFFF +#define R_008674_CP_STALLED_STAT1 0x008674 +#define S_008674_RBIU_TO_DMA_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 0) +#define G_008674_RBIU_TO_DMA_NOT_RDY_TO_RCV(x) (((x) >> 0) & 0x1) +#define C_008674_RBIU_TO_DMA_NOT_RDY_TO_RCV 0xFFFFFFFE +#define S_008674_RBIU_TO_SEM_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 2) +#define G_008674_RBIU_TO_SEM_NOT_RDY_TO_RCV(x) (((x) >> 2) & 0x1) +#define C_008674_RBIU_TO_SEM_NOT_RDY_TO_RCV 0xFFFFFFFB +#define S_008674_RBIU_TO_MEMWR_NOT_RDY_TO_RCV(x) (((x) & 
0x1) << 4) +#define G_008674_RBIU_TO_MEMWR_NOT_RDY_TO_RCV(x) (((x) >> 4) & 0x1) +#define C_008674_RBIU_TO_MEMWR_NOT_RDY_TO_RCV 0xFFFFFFEF +#define S_008674_ME_HAS_ACTIVE_CE_BUFFER_FLAG(x) (((x) & 0x1) << 10) +#define G_008674_ME_HAS_ACTIVE_CE_BUFFER_FLAG(x) (((x) >> 10) & 0x1) +#define C_008674_ME_HAS_ACTIVE_CE_BUFFER_FLAG 0xFFFFFBFF +#define S_008674_ME_HAS_ACTIVE_DE_BUFFER_FLAG(x) (((x) & 0x1) << 11) +#define G_008674_ME_HAS_ACTIVE_DE_BUFFER_FLAG(x) (((x) >> 11) & 0x1) +#define C_008674_ME_HAS_ACTIVE_DE_BUFFER_FLAG 0xFFFFF7FF +#define S_008674_ME_STALLED_ON_TC_WR_CONFIRM(x) (((x) & 0x1) << 12) +#define G_008674_ME_STALLED_ON_TC_WR_CONFIRM(x) (((x) >> 12) & 0x1) +#define C_008674_ME_STALLED_ON_TC_WR_CONFIRM 0xFFFFEFFF +#define S_008674_ME_STALLED_ON_ATOMIC_RTN_DATA(x) (((x) & 0x1) << 13) +#define G_008674_ME_STALLED_ON_ATOMIC_RTN_DATA(x) (((x) >> 13) & 0x1) +#define C_008674_ME_STALLED_ON_ATOMIC_RTN_DATA 0xFFFFDFFF +#define S_008674_ME_WAITING_ON_TC_READ_DATA(x) (((x) & 0x1) << 14) +#define G_008674_ME_WAITING_ON_TC_READ_DATA(x) (((x) >> 14) & 0x1) +#define C_008674_ME_WAITING_ON_TC_READ_DATA 0xFFFFBFFF +#define S_008674_ME_WAITING_ON_REG_READ_DATA(x) (((x) & 0x1) << 15) +#define G_008674_ME_WAITING_ON_REG_READ_DATA(x) (((x) >> 15) & 0x1) +#define C_008674_ME_WAITING_ON_REG_READ_DATA 0xFFFF7FFF +#define S_008674_RCIU_WAITING_ON_GDS_FREE(x) (((x) & 0x1) << 23) +#define G_008674_RCIU_WAITING_ON_GDS_FREE(x) (((x) >> 23) & 0x1) +#define C_008674_RCIU_WAITING_ON_GDS_FREE 0xFF7FFFFF +#define S_008674_RCIU_WAITING_ON_GRBM_FREE(x) (((x) & 0x1) << 24) +#define G_008674_RCIU_WAITING_ON_GRBM_FREE(x) (((x) >> 24) & 0x1) +#define C_008674_RCIU_WAITING_ON_GRBM_FREE 0xFEFFFFFF +#define S_008674_RCIU_WAITING_ON_VGT_FREE(x) (((x) & 0x1) << 25) +#define G_008674_RCIU_WAITING_ON_VGT_FREE(x) (((x) >> 25) & 0x1) +#define C_008674_RCIU_WAITING_ON_VGT_FREE 0xFDFFFFFF +#define S_008674_RCIU_STALLED_ON_ME_READ(x) (((x) & 0x1) << 26) +#define G_008674_RCIU_STALLED_ON_ME_READ(x) (((x) >> 26) & 0x1) +#define C_008674_RCIU_STALLED_ON_ME_READ 0xFBFFFFFF +#define S_008674_RCIU_STALLED_ON_DMA_READ(x) (((x) & 0x1) << 27) +#define G_008674_RCIU_STALLED_ON_DMA_READ(x) (((x) >> 27) & 0x1) +#define C_008674_RCIU_STALLED_ON_DMA_READ 0xF7FFFFFF +#define S_008674_RCIU_STALLED_ON_APPEND_READ(x) (((x) & 0x1) << 28) +#define G_008674_RCIU_STALLED_ON_APPEND_READ(x) (((x) >> 28) & 0x1) +#define C_008674_RCIU_STALLED_ON_APPEND_READ 0xEFFFFFFF +#define S_008674_RCIU_HALTED_BY_REG_VIOLATION(x) (((x) & 0x1) << 29) +#define G_008674_RCIU_HALTED_BY_REG_VIOLATION(x) (((x) >> 29) & 0x1) +#define C_008674_RCIU_HALTED_BY_REG_VIOLATION 0xDFFFFFFF +#define R_008678_CP_STALLED_STAT2 0x008678 +#define S_008678_PFP_TO_CSF_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 0) +#define G_008678_PFP_TO_CSF_NOT_RDY_TO_RCV(x) (((x) >> 0) & 0x1) +#define C_008678_PFP_TO_CSF_NOT_RDY_TO_RCV 0xFFFFFFFE +#define S_008678_PFP_TO_MEQ_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 1) +#define G_008678_PFP_TO_MEQ_NOT_RDY_TO_RCV(x) (((x) >> 1) & 0x1) +#define C_008678_PFP_TO_MEQ_NOT_RDY_TO_RCV 0xFFFFFFFD +#define S_008678_PFP_TO_RCIU_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 2) +#define G_008678_PFP_TO_RCIU_NOT_RDY_TO_RCV(x) (((x) >> 2) & 0x1) +#define C_008678_PFP_TO_RCIU_NOT_RDY_TO_RCV 0xFFFFFFFB +#define S_008678_PFP_TO_VGT_WRITES_PENDING(x) (((x) & 0x1) << 4) +#define G_008678_PFP_TO_VGT_WRITES_PENDING(x) (((x) >> 4) & 0x1) +#define C_008678_PFP_TO_VGT_WRITES_PENDING 0xFFFFFFEF +#define S_008678_PFP_RCIU_READ_PENDING(x) (((x) & 0x1) << 5) +#define G_008678_PFP_RCIU_READ_PENDING(x) (((x) >> 5) & 
0x1) +#define C_008678_PFP_RCIU_READ_PENDING 0xFFFFFFDF +#define S_008678_PFP_WAITING_ON_BUFFER_DATA(x) (((x) & 0x1) << 8) +#define G_008678_PFP_WAITING_ON_BUFFER_DATA(x) (((x) >> 8) & 0x1) +#define C_008678_PFP_WAITING_ON_BUFFER_DATA 0xFFFFFEFF +#define S_008678_ME_WAIT_ON_CE_COUNTER(x) (((x) & 0x1) << 9) +#define G_008678_ME_WAIT_ON_CE_COUNTER(x) (((x) >> 9) & 0x1) +#define C_008678_ME_WAIT_ON_CE_COUNTER 0xFFFFFDFF +#define S_008678_ME_WAIT_ON_AVAIL_BUFFER(x) (((x) & 0x1) << 10) +#define G_008678_ME_WAIT_ON_AVAIL_BUFFER(x) (((x) >> 10) & 0x1) +#define C_008678_ME_WAIT_ON_AVAIL_BUFFER 0xFFFFFBFF +#define S_008678_GFX_CNTX_NOT_AVAIL_TO_ME(x) (((x) & 0x1) << 11) +#define G_008678_GFX_CNTX_NOT_AVAIL_TO_ME(x) (((x) >> 11) & 0x1) +#define C_008678_GFX_CNTX_NOT_AVAIL_TO_ME 0xFFFFF7FF +#define S_008678_ME_RCIU_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 12) +#define G_008678_ME_RCIU_NOT_RDY_TO_RCV(x) (((x) >> 12) & 0x1) +#define C_008678_ME_RCIU_NOT_RDY_TO_RCV 0xFFFFEFFF +#define S_008678_ME_TO_CONST_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 13) +#define G_008678_ME_TO_CONST_NOT_RDY_TO_RCV(x) (((x) >> 13) & 0x1) +#define C_008678_ME_TO_CONST_NOT_RDY_TO_RCV 0xFFFFDFFF +#define S_008678_ME_WAITING_DATA_FROM_PFP(x) (((x) & 0x1) << 14) +#define G_008678_ME_WAITING_DATA_FROM_PFP(x) (((x) >> 14) & 0x1) +#define C_008678_ME_WAITING_DATA_FROM_PFP 0xFFFFBFFF +#define S_008678_ME_WAITING_ON_PARTIAL_FLUSH(x) (((x) & 0x1) << 15) +#define G_008678_ME_WAITING_ON_PARTIAL_FLUSH(x) (((x) >> 15) & 0x1) +#define C_008678_ME_WAITING_ON_PARTIAL_FLUSH 0xFFFF7FFF +#define S_008678_MEQ_TO_ME_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 16) +#define G_008678_MEQ_TO_ME_NOT_RDY_TO_RCV(x) (((x) >> 16) & 0x1) +#define C_008678_MEQ_TO_ME_NOT_RDY_TO_RCV 0xFFFEFFFF +#define S_008678_STQ_TO_ME_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 17) +#define G_008678_STQ_TO_ME_NOT_RDY_TO_RCV(x) (((x) >> 17) & 0x1) +#define C_008678_STQ_TO_ME_NOT_RDY_TO_RCV 0xFFFDFFFF +#define S_008678_ME_WAITING_DATA_FROM_STQ(x) (((x) & 0x1) << 18) +#define G_008678_ME_WAITING_DATA_FROM_STQ(x) (((x) >> 18) & 0x1) +#define C_008678_ME_WAITING_DATA_FROM_STQ 0xFFFBFFFF +#define S_008678_PFP_STALLED_ON_TC_WR_CONFIRM(x) (((x) & 0x1) << 19) +#define G_008678_PFP_STALLED_ON_TC_WR_CONFIRM(x) (((x) >> 19) & 0x1) +#define C_008678_PFP_STALLED_ON_TC_WR_CONFIRM 0xFFF7FFFF +#define S_008678_PFP_STALLED_ON_ATOMIC_RTN_DATA(x) (((x) & 0x1) << 20) +#define G_008678_PFP_STALLED_ON_ATOMIC_RTN_DATA(x) (((x) >> 20) & 0x1) +#define C_008678_PFP_STALLED_ON_ATOMIC_RTN_DATA 0xFFEFFFFF +#define S_008678_EOPD_FIFO_NEEDS_SC_EOP_DONE(x) (((x) & 0x1) << 21) +#define G_008678_EOPD_FIFO_NEEDS_SC_EOP_DONE(x) (((x) >> 21) & 0x1) +#define C_008678_EOPD_FIFO_NEEDS_SC_EOP_DONE 0xFFDFFFFF +#define S_008678_EOPD_FIFO_NEEDS_WR_CONFIRM(x) (((x) & 0x1) << 22) +#define G_008678_EOPD_FIFO_NEEDS_WR_CONFIRM(x) (((x) >> 22) & 0x1) +#define C_008678_EOPD_FIFO_NEEDS_WR_CONFIRM 0xFFBFFFFF +#define S_008678_STRMO_WR_OF_PRIM_DATA_PENDING(x) (((x) & 0x1) << 23) +#define G_008678_STRMO_WR_OF_PRIM_DATA_PENDING(x) (((x) >> 23) & 0x1) +#define C_008678_STRMO_WR_OF_PRIM_DATA_PENDING 0xFF7FFFFF +#define S_008678_PIPE_STATS_WR_DATA_PENDING(x) (((x) & 0x1) << 24) +#define G_008678_PIPE_STATS_WR_DATA_PENDING(x) (((x) >> 24) & 0x1) +#define C_008678_PIPE_STATS_WR_DATA_PENDING 0xFEFFFFFF +#define S_008678_APPEND_RDY_WAIT_ON_CS_DONE(x) (((x) & 0x1) << 25) +#define G_008678_APPEND_RDY_WAIT_ON_CS_DONE(x) (((x) >> 25) & 0x1) +#define C_008678_APPEND_RDY_WAIT_ON_CS_DONE 0xFDFFFFFF +#define S_008678_APPEND_RDY_WAIT_ON_PS_DONE(x) (((x) & 0x1) << 26) +#define 
G_008678_APPEND_RDY_WAIT_ON_PS_DONE(x) (((x) >> 26) & 0x1) +#define C_008678_APPEND_RDY_WAIT_ON_PS_DONE 0xFBFFFFFF +#define S_008678_APPEND_WAIT_ON_WR_CONFIRM(x) (((x) & 0x1) << 27) +#define G_008678_APPEND_WAIT_ON_WR_CONFIRM(x) (((x) >> 27) & 0x1) +#define C_008678_APPEND_WAIT_ON_WR_CONFIRM 0xF7FFFFFF +#define S_008678_APPEND_ACTIVE_PARTITION(x) (((x) & 0x1) << 28) +#define G_008678_APPEND_ACTIVE_PARTITION(x) (((x) >> 28) & 0x1) +#define C_008678_APPEND_ACTIVE_PARTITION 0xEFFFFFFF +#define S_008678_APPEND_WAITING_TO_SEND_MEMWRITE(x) (((x) & 0x1) << 29) +#define G_008678_APPEND_WAITING_TO_SEND_MEMWRITE(x) (((x) >> 29) & 0x1) +#define C_008678_APPEND_WAITING_TO_SEND_MEMWRITE 0xDFFFFFFF +#define S_008678_SURF_SYNC_NEEDS_IDLE_CNTXS(x) (((x) & 0x1) << 30) +#define G_008678_SURF_SYNC_NEEDS_IDLE_CNTXS(x) (((x) >> 30) & 0x1) +#define C_008678_SURF_SYNC_NEEDS_IDLE_CNTXS 0xBFFFFFFF +#define S_008678_SURF_SYNC_NEEDS_ALL_CLEAN(x) (((x) & 0x1) << 31) +#define G_008678_SURF_SYNC_NEEDS_ALL_CLEAN(x) (((x) >> 31) & 0x1) +#define C_008678_SURF_SYNC_NEEDS_ALL_CLEAN 0x7FFFFFFF +#define R_008680_CP_STAT 0x008680 +#define S_008680_ROQ_RING_BUSY(x) (((x) & 0x1) << 9) +#define G_008680_ROQ_RING_BUSY(x) (((x) >> 9) & 0x1) +#define C_008680_ROQ_RING_BUSY 0xFFFFFDFF +#define S_008680_ROQ_INDIRECT1_BUSY(x) (((x) & 0x1) << 10) +#define G_008680_ROQ_INDIRECT1_BUSY(x) (((x) >> 10) & 0x1) +#define C_008680_ROQ_INDIRECT1_BUSY 0xFFFFFBFF +#define S_008680_ROQ_INDIRECT2_BUSY(x) (((x) & 0x1) << 11) +#define G_008680_ROQ_INDIRECT2_BUSY(x) (((x) >> 11) & 0x1) +#define C_008680_ROQ_INDIRECT2_BUSY 0xFFFFF7FF +#define S_008680_ROQ_STATE_BUSY(x) (((x) & 0x1) << 12) +#define G_008680_ROQ_STATE_BUSY(x) (((x) >> 12) & 0x1) +#define C_008680_ROQ_STATE_BUSY 0xFFFFEFFF +#define S_008680_DC_BUSY(x) (((x) & 0x1) << 13) +#define G_008680_DC_BUSY(x) (((x) >> 13) & 0x1) +#define C_008680_DC_BUSY 0xFFFFDFFF +#define S_008680_ATCL2IU_BUSY(x) (((x) & 0x1) << 14) +#define G_008680_ATCL2IU_BUSY(x) (((x) >> 14) & 0x1) +#define C_008680_ATCL2IU_BUSY 0xFFFFBFFF +#define S_008680_PFP_BUSY(x) (((x) & 0x1) << 15) +#define G_008680_PFP_BUSY(x) (((x) >> 15) & 0x1) +#define C_008680_PFP_BUSY 0xFFFF7FFF +#define S_008680_MEQ_BUSY(x) (((x) & 0x1) << 16) +#define G_008680_MEQ_BUSY(x) (((x) >> 16) & 0x1) +#define C_008680_MEQ_BUSY 0xFFFEFFFF +#define S_008680_ME_BUSY(x) (((x) & 0x1) << 17) +#define G_008680_ME_BUSY(x) (((x) >> 17) & 0x1) +#define C_008680_ME_BUSY 0xFFFDFFFF +#define S_008680_QUERY_BUSY(x) (((x) & 0x1) << 18) +#define G_008680_QUERY_BUSY(x) (((x) >> 18) & 0x1) +#define C_008680_QUERY_BUSY 0xFFFBFFFF +#define S_008680_SEMAPHORE_BUSY(x) (((x) & 0x1) << 19) +#define G_008680_SEMAPHORE_BUSY(x) (((x) >> 19) & 0x1) +#define C_008680_SEMAPHORE_BUSY 0xFFF7FFFF +#define S_008680_INTERRUPT_BUSY(x) (((x) & 0x1) << 20) +#define G_008680_INTERRUPT_BUSY(x) (((x) >> 20) & 0x1) +#define C_008680_INTERRUPT_BUSY 0xFFEFFFFF +#define S_008680_SURFACE_SYNC_BUSY(x) (((x) & 0x1) << 21) +#define G_008680_SURFACE_SYNC_BUSY(x) (((x) >> 21) & 0x1) +#define C_008680_SURFACE_SYNC_BUSY 0xFFDFFFFF +#define S_008680_DMA_BUSY(x) (((x) & 0x1) << 22) +#define G_008680_DMA_BUSY(x) (((x) >> 22) & 0x1) +#define C_008680_DMA_BUSY 0xFFBFFFFF +#define S_008680_RCIU_BUSY(x) (((x) & 0x1) << 23) +#define G_008680_RCIU_BUSY(x) (((x) >> 23) & 0x1) +#define C_008680_RCIU_BUSY 0xFF7FFFFF +#define S_008680_SCRATCH_RAM_BUSY(x) (((x) & 0x1) << 24) +#define G_008680_SCRATCH_RAM_BUSY(x) (((x) >> 24) & 0x1) +#define C_008680_SCRATCH_RAM_BUSY 0xFEFFFFFF +#define S_008680_CPC_CPG_BUSY(x) (((x) & 
0x1) << 25) +#define G_008680_CPC_CPG_BUSY(x) (((x) >> 25) & 0x1) +#define C_008680_CPC_CPG_BUSY 0xFDFFFFFF +#define S_008680_CE_BUSY(x) (((x) & 0x1) << 26) +#define G_008680_CE_BUSY(x) (((x) >> 26) & 0x1) +#define C_008680_CE_BUSY 0xFBFFFFFF +#define S_008680_TCIU_BUSY(x) (((x) & 0x1) << 27) +#define G_008680_TCIU_BUSY(x) (((x) >> 27) & 0x1) +#define C_008680_TCIU_BUSY 0xF7FFFFFF +#define S_008680_ROQ_CE_RING_BUSY(x) (((x) & 0x1) << 28) +#define G_008680_ROQ_CE_RING_BUSY(x) (((x) >> 28) & 0x1) +#define C_008680_ROQ_CE_RING_BUSY 0xEFFFFFFF +#define S_008680_ROQ_CE_INDIRECT1_BUSY(x) (((x) & 0x1) << 29) +#define G_008680_ROQ_CE_INDIRECT1_BUSY(x) (((x) >> 29) & 0x1) +#define C_008680_ROQ_CE_INDIRECT1_BUSY 0xDFFFFFFF +#define S_008680_ROQ_CE_INDIRECT2_BUSY(x) (((x) & 0x1) << 30) +#define G_008680_ROQ_CE_INDIRECT2_BUSY(x) (((x) >> 30) & 0x1) +#define C_008680_ROQ_CE_INDIRECT2_BUSY 0xBFFFFFFF +#define S_008680_CP_BUSY(x) (((x) & 0x1) << 31) +#define G_008680_CP_BUSY(x) (((x) >> 31) & 0x1) +#define C_008680_CP_BUSY 0x7FFFFFFF /* CIK */ +#define R_030800_GRBM_GFX_INDEX 0x030800 +#define S_030800_INSTANCE_INDEX(x) (((x) & 0xFF) << 0) +#define G_030800_INSTANCE_INDEX(x) (((x) >> 0) & 0xFF) +#define C_030800_INSTANCE_INDEX 0xFFFFFF00 +#define S_030800_SH_INDEX(x) (((x) & 0xFF) << 8) +#define G_030800_SH_INDEX(x) (((x) >> 8) & 0xFF) +#define C_030800_SH_INDEX 0xFFFF00FF +#define S_030800_SE_INDEX(x) (((x) & 0xFF) << 16) +#define G_030800_SE_INDEX(x) (((x) >> 16) & 0xFF) +#define C_030800_SE_INDEX 0xFF00FFFF +#define S_030800_SH_BROADCAST_WRITES(x) (((x) & 0x1) << 29) +#define G_030800_SH_BROADCAST_WRITES(x) (((x) >> 29) & 0x1) +#define C_030800_SH_BROADCAST_WRITES 0xDFFFFFFF +#define S_030800_INSTANCE_BROADCAST_WRITES(x) (((x) & 0x1) << 30) +#define G_030800_INSTANCE_BROADCAST_WRITES(x) (((x) >> 30) & 0x1) +#define C_030800_INSTANCE_BROADCAST_WRITES 0xBFFFFFFF +#define S_030800_SE_BROADCAST_WRITES(x) (((x) & 0x1) << 31) +#define G_030800_SE_BROADCAST_WRITES(x) (((x) >> 31) & 0x1) +#define C_030800_SE_BROADCAST_WRITES 0x7FFFFFFF +#define R_030900_VGT_ESGS_RING_SIZE 0x030900 +#define R_030904_VGT_GSVS_RING_SIZE 0x030904 #define R_030908_VGT_PRIMITIVE_TYPE 0x030908 #define S_030908_PRIM_TYPE(x) (((x) & 0x3F) << 0) #define G_030908_PRIM_TYPE(x) (((x) >> 0) & 0x3F) @@ -530,6 +1736,34 @@ #define S_030A04_CURRENT_COUNT(x) (((x) & 0xFF) << 8) #define G_030A04_CURRENT_COUNT(x) (((x) >> 8) & 0xFF) #define C_030A04_CURRENT_COUNT 0xFFFF00FF +#define R_030A10_PA_SC_SCREEN_EXTENT_MIN_0 0x030A10 +#define S_030A10_X(x) (((x) & 0xFFFF) << 0) +#define G_030A10_X(x) (((x) >> 0) & 0xFFFF) +#define C_030A10_X 0xFFFF0000 +#define S_030A10_Y(x) (((x) & 0xFFFF) << 16) +#define G_030A10_Y(x) (((x) >> 16) & 0xFFFF) +#define C_030A10_Y 0x0000FFFF +#define R_030A14_PA_SC_SCREEN_EXTENT_MAX_0 0x030A14 +#define S_030A14_X(x) (((x) & 0xFFFF) << 0) +#define G_030A14_X(x) (((x) >> 0) & 0xFFFF) +#define C_030A14_X 0xFFFF0000 +#define S_030A14_Y(x) (((x) & 0xFFFF) << 16) +#define G_030A14_Y(x) (((x) >> 16) & 0xFFFF) +#define C_030A14_Y 0x0000FFFF +#define R_030A18_PA_SC_SCREEN_EXTENT_MIN_1 0x030A18 +#define S_030A18_X(x) (((x) & 0xFFFF) << 0) +#define G_030A18_X(x) (((x) >> 0) & 0xFFFF) +#define C_030A18_X 0xFFFF0000 +#define S_030A18_Y(x) (((x) & 0xFFFF) << 16) +#define G_030A18_Y(x) (((x) >> 16) & 0xFFFF) +#define C_030A18_Y 0x0000FFFF +#define R_030A2C_PA_SC_SCREEN_EXTENT_MAX_1 0x030A2C +#define S_030A2C_X(x) (((x) & 0xFFFF) << 0) +#define G_030A2C_X(x) (((x) >> 0) & 0xFFFF) +#define C_030A2C_X 0xFFFF0000 +#define S_030A2C_Y(x) (((x) 
& 0xFFFF) << 16) +#define G_030A2C_Y(x) (((x) >> 16) & 0xFFFF) +#define C_030A2C_Y 0x0000FFFF /* */ #define R_008BF0_PA_SC_ENHANCE 0x008BF0 #define S_008BF0_ENABLE_PA_SC_OUT_OF_ORDER(x) (((x) & 0x1) << 0) @@ -608,6 +1842,32 @@ #define V_008DFC_SQ_VGPR 0x00 /* */ #define R_008DFC_SQ_INST 0x008DFC +#define R_030D20_SQC_CACHES 0x030D20 +#define S_030D20_TARGET_INST(x) (((x) & 0x1) << 0) +#define G_030D20_TARGET_INST(x) (((x) >> 0) & 0x1) +#define C_030D20_TARGET_INST 0xFFFFFFFE +#define S_030D20_TARGET_DATA(x) (((x) & 0x1) << 1) +#define G_030D20_TARGET_DATA(x) (((x) >> 1) & 0x1) +#define C_030D20_TARGET_DATA 0xFFFFFFFD +#define S_030D20_INVALIDATE(x) (((x) & 0x1) << 2) +#define G_030D20_INVALIDATE(x) (((x) >> 2) & 0x1) +#define C_030D20_INVALIDATE 0xFFFFFFFB +#define S_030D20_WRITEBACK(x) (((x) & 0x1) << 3) +#define G_030D20_WRITEBACK(x) (((x) >> 3) & 0x1) +#define C_030D20_WRITEBACK 0xFFFFFFF7 +#define S_030D20_VOL(x) (((x) & 0x1) << 4) +#define G_030D20_VOL(x) (((x) >> 4) & 0x1) +#define C_030D20_VOL 0xFFFFFFEF +#define S_030D20_COMPLETE(x) (((x) & 0x1) << 16) +#define G_030D20_COMPLETE(x) (((x) >> 16) & 0x1) +#define C_030D20_COMPLETE 0xFFFEFFFF +#define R_030D24_SQC_WRITEBACK 0x030D24 +#define S_030D24_DWB(x) (((x) & 0x1) << 0) +#define G_030D24_DWB(x) (((x) >> 0) & 0x1) +#define C_030D24_DWB 0xFFFFFFFE +#define S_030D24_DIRTY(x) (((x) & 0x1) << 1) +#define G_030D24_DIRTY(x) (((x) >> 1) & 0x1) +#define C_030D24_DIRTY 0xFFFFFFFD #define R_008DFC_SQ_VOP1 0x008DFC #define S_008DFC_SRC0(x) (((x) & 0x1FF) << 0) #define G_008DFC_SRC0(x) (((x) >> 0) & 0x1FF) @@ -3740,7 +5000,17 @@ #define C_008DFC_ENCODING 0x03FFFFFF #define V_008DFC_SQ_ENC_MUBUF_FIELD 0x38 #endif +#define R_030E00_TA_CS_BC_BASE_ADDR 0x030E00 +#define R_030E04_TA_CS_BC_BASE_ADDR_HI 0x030E04 +#define S_030E04_ADDRESS(x) (((x) & 0xFF) << 0) +#define G_030E04_ADDRESS(x) (((x) >> 0) & 0xFF) +#define C_030E04_ADDRESS 0xFFFFFF00 +#define R_030F00_DB_OCCLUSION_COUNT0_LOW 0x030F00 #define R_008F00_SQ_BUF_RSRC_WORD0 0x008F00 +#define R_030F04_DB_OCCLUSION_COUNT0_HI 0x030F04 +#define S_030F04_COUNT_HI(x) (((x) & 0x7FFFFFFF) << 0) +#define G_030F04_COUNT_HI(x) (((x) >> 0) & 0x7FFFFFFF) +#define C_030F04_COUNT_HI 0x80000000 #define R_008F04_SQ_BUF_RSRC_WORD1 0x008F04 #define S_008F04_BASE_ADDRESS_HI(x) (((x) & 0xFFFF) << 0) #define G_008F04_BASE_ADDRESS_HI(x) (((x) >> 0) & 0xFFFF) @@ -3754,7 +5024,12 @@ #define S_008F04_SWIZZLE_ENABLE(x) (((x) & 0x1) << 31) #define G_008F04_SWIZZLE_ENABLE(x) (((x) >> 31) & 0x1) #define C_008F04_SWIZZLE_ENABLE 0x7FFFFFFF +#define R_030F08_DB_OCCLUSION_COUNT1_LOW 0x030F08 #define R_008F08_SQ_BUF_RSRC_WORD2 0x008F08 +#define R_030F0C_DB_OCCLUSION_COUNT1_HI 0x030F0C +#define S_030F0C_COUNT_HI(x) (((x) & 0x7FFFFFFF) << 0) +#define G_030F0C_COUNT_HI(x) (((x) >> 0) & 0x7FFFFFFF) +#define C_030F0C_COUNT_HI 0x80000000 #define R_008F0C_SQ_BUF_RSRC_WORD3 0x008F0C #define S_008F0C_DST_SEL_X(x) (((x) & 0x07) << 0) #define G_008F0C_DST_SEL_X(x) (((x) >> 0) & 0x07) @@ -3862,7 +5137,12 @@ #define V_008F0C_SQ_RSRC_BUF_RSVD_1 0x01 #define V_008F0C_SQ_RSRC_BUF_RSVD_2 0x02 #define V_008F0C_SQ_RSRC_BUF_RSVD_3 0x03 +#define R_030F10_DB_OCCLUSION_COUNT2_LOW 0x030F10 #define R_008F10_SQ_IMG_RSRC_WORD0 0x008F10 +#define R_030F14_DB_OCCLUSION_COUNT2_HI 0x030F14 +#define S_030F14_COUNT_HI(x) (((x) & 0x7FFFFFFF) << 0) +#define G_030F14_COUNT_HI(x) (((x) >> 0) & 0x7FFFFFFF) +#define C_030F14_COUNT_HI 0x80000000 #define R_008F14_SQ_IMG_RSRC_WORD1 0x008F14 #define S_008F14_BASE_ADDRESS_HI(x) (((x) & 0xFF) << 0) #define 
G_008F14_BASE_ADDRESS_HI(x) (((x) >> 0) & 0xFF) @@ -3961,6 +5241,7 @@ #define G_008F14_MTYPE(x) (((x) >> 30) & 0x03) #define C_008F14_MTYPE 0x3FFFFFFF /* */ +#define R_030F18_DB_OCCLUSION_COUNT3_LOW 0x030F18 #define R_008F18_SQ_IMG_RSRC_WORD2 0x008F18 #define S_008F18_WIDTH(x) (((x) & 0x3FFF) << 0) #define G_008F18_WIDTH(x) (((x) >> 0) & 0x3FFF) @@ -3974,6 +5255,10 @@ #define S_008F18_INTERLACED(x) (((x) & 0x1) << 31) #define G_008F18_INTERLACED(x) (((x) >> 31) & 0x1) #define C_008F18_INTERLACED 0x7FFFFFFF +#define R_030F1C_DB_OCCLUSION_COUNT3_HI 0x030F1C +#define S_030F1C_COUNT_HI(x) (((x) & 0x7FFFFFFF) << 0) +#define G_030F1C_COUNT_HI(x) (((x) >> 0) & 0x7FFFFFFF) +#define C_030F1C_COUNT_HI 0x80000000 #define R_008F1C_SQ_IMG_RSRC_WORD3 0x008F1C #define S_008F1C_DST_SEL_X(x) (((x) & 0x07) << 0) #define G_008F1C_DST_SEL_X(x) (((x) >> 0) & 0x07) @@ -4084,6 +5369,23 @@ #define G_008F28_LOD_HDW_CNT_EN(x) (((x) >> 20) & 0x1) #define C_008F28_LOD_HDW_CNT_EN 0xFFEFFFFF /* */ +/* VI */ +#define S_008F28_COMPRESSION_EN(x) (((x) & 0x1) << 21) +#define G_008F28_COMPRESSION_EN(x) (((x) >> 21) & 0x1) +#define C_008F28_COMPRESSION_EN 0xFFDFFFFF +#define S_008F28_ALPHA_IS_ON_MSB(x) (((x) & 0x1) << 22) +#define G_008F28_ALPHA_IS_ON_MSB(x) (((x) >> 22) & 0x1) +#define C_008F28_ALPHA_IS_ON_MSB 0xFFBFFFFF +#define S_008F28_COLOR_TRANSFORM(x) (((x) & 0x1) << 23) +#define G_008F28_COLOR_TRANSFORM(x) (((x) >> 23) & 0x1) +#define C_008F28_COLOR_TRANSFORM 0xFF7FFFFF +#define S_008F28_LOST_ALPHA_BITS(x) (((x) & 0x0F) << 24) +#define G_008F28_LOST_ALPHA_BITS(x) (((x) >> 24) & 0x0F) +#define C_008F28_LOST_ALPHA_BITS 0xF0FFFFFF +#define S_008F28_LOST_COLOR_BITS(x) (((x) & 0x0F) << 28) +#define G_008F28_LOST_COLOR_BITS(x) (((x) >> 28) & 0x0F) +#define C_008F28_LOST_COLOR_BITS 0x0FFFFFFF +/* */ #define R_008F2C_SQ_IMG_RSRC_WORD7 0x008F2C #define R_008F30_SQ_IMG_SAMP_WORD0 0x008F30 #define S_008F30_CLAMP_X(x) (((x) & 0x07) << 0) @@ -4148,6 +5450,11 @@ #define S_008F30_FILTER_MODE(x) (((x) & 0x03) << 29) #define G_008F30_FILTER_MODE(x) (((x) >> 29) & 0x03) #define C_008F30_FILTER_MODE 0x9FFFFFFF +/* VI */ +#define S_008F30_COMPAT_MODE(x) (((x) & 0x1) << 31) +#define G_008F30_COMPAT_MODE(x) (((x) >> 31) & 0x1) +#define C_008F30_COMPAT_MODE 0x7FFFFFFF +/* */ #define R_008F34_SQ_IMG_SAMP_WORD1 0x008F34 #define S_008F34_MIN_LOD(x) (((x) & 0xFFF) << 0) #define G_008F34_MIN_LOD(x) (((x) >> 0) & 0xFFF) @@ -4313,6 +5620,11 @@ #define G_008F44_OFFSET(x) (((x) >> 0) & 0xFFFFFF) #define C_008F44_OFFSET 0xFF000000 /* */ +#define R_030FF8_DB_ZPASS_COUNT_LOW 0x030FF8 +#define R_030FFC_DB_ZPASS_COUNT_HI 0x030FFC +#define S_030FFC_COUNT_HI(x) (((x) & 0x7FFFFFFF) << 0) +#define G_030FFC_COUNT_HI(x) (((x) >> 0) & 0x7FFFFFFF) +#define C_030FFC_COUNT_HI 0x80000000 #define R_009100_SPI_CONFIG_CNTL 0x009100 #define S_009100_GPR_WRITE_PRIORITY(x) (((x) & 0x1FFFFF) << 0) #define G_009100_GPR_WRITE_PRIORITY(x) (((x) >> 0) & 0x1FFFFF) @@ -4437,6 +5749,34 @@ #define S_009858_MSAA16_Y(x) (((x) & 0x03) << 18) #define G_009858_MSAA16_Y(x) (((x) >> 18) & 0x03) #define C_009858_MSAA16_Y 0xFFF3FFFF +#define R_0098F8_GB_ADDR_CONFIG 0x0098F8 +#define S_0098F8_NUM_PIPES(x) (((x) & 0x07) << 0) +#define G_0098F8_NUM_PIPES(x) (((x) >> 0) & 0x07) +#define C_0098F8_NUM_PIPES 0xFFFFFFF8 +#define S_0098F8_PIPE_INTERLEAVE_SIZE(x) (((x) & 0x07) << 4) +#define G_0098F8_PIPE_INTERLEAVE_SIZE(x) (((x) >> 4) & 0x07) +#define C_0098F8_PIPE_INTERLEAVE_SIZE 0xFFFFFF8F +#define S_0098F8_BANK_INTERLEAVE_SIZE(x) (((x) & 0x07) << 8) +#define G_0098F8_BANK_INTERLEAVE_SIZE(x) 
(((x) >> 8) & 0x07) +#define C_0098F8_BANK_INTERLEAVE_SIZE 0xFFFFF8FF +#define S_0098F8_NUM_SHADER_ENGINES(x) (((x) & 0x03) << 12) +#define G_0098F8_NUM_SHADER_ENGINES(x) (((x) >> 12) & 0x03) +#define C_0098F8_NUM_SHADER_ENGINES 0xFFFFCFFF +#define S_0098F8_SHADER_ENGINE_TILE_SIZE(x) (((x) & 0x07) << 16) +#define G_0098F8_SHADER_ENGINE_TILE_SIZE(x) (((x) >> 16) & 0x07) +#define C_0098F8_SHADER_ENGINE_TILE_SIZE 0xFFF8FFFF +#define S_0098F8_NUM_GPUS(x) (((x) & 0x07) << 20) +#define G_0098F8_NUM_GPUS(x) (((x) >> 20) & 0x07) +#define C_0098F8_NUM_GPUS 0xFF8FFFFF +#define S_0098F8_MULTI_GPU_TILE_SIZE(x) (((x) & 0x03) << 24) +#define G_0098F8_MULTI_GPU_TILE_SIZE(x) (((x) >> 24) & 0x03) +#define C_0098F8_MULTI_GPU_TILE_SIZE 0xFCFFFFFF +#define S_0098F8_ROW_SIZE(x) (((x) & 0x03) << 28) +#define G_0098F8_ROW_SIZE(x) (((x) >> 28) & 0x03) +#define C_0098F8_ROW_SIZE 0xCFFFFFFF +#define S_0098F8_NUM_LOWER_PIPES(x) (((x) & 0x1) << 30) +#define G_0098F8_NUM_LOWER_PIPES(x) (((x) >> 30) & 0x1) +#define C_0098F8_NUM_LOWER_PIPES 0xBFFFFFFF #define R_009910_GB_TILE_MODE0 0x009910 #define S_009910_MICRO_TILE_MODE(x) (((x) & 0x03) << 0) #define G_009910_MICRO_TILE_MODE(x) (((x) >> 0) & 0x03) @@ -4515,14 +5855,88 @@ #define V_009910_ADDR_SURF_4_BANK 0x01 #define V_009910_ADDR_SURF_8_BANK 0x02 #define V_009910_ADDR_SURF_16_BANK 0x03 -/* CIK */ #define S_009910_MICRO_TILE_MODE_NEW(x) (((x) & 0x07) << 22) #define G_009910_MICRO_TILE_MODE_NEW(x) (((x) >> 22) & 0x07) -#define C_009910_MICRO_TILE_MODE_NEW(x) 0xFE3FFFFF +#define C_009910_MICRO_TILE_MODE_NEW 0xFE3FFFFF #define V_009910_ADDR_SURF_DISPLAY_MICRO_TILING 0x00 #define V_009910_ADDR_SURF_THIN_MICRO_TILING 0x01 #define V_009910_ADDR_SURF_DEPTH_MICRO_TILING 0x02 #define V_009910_ADDR_SURF_ROTATED_MICRO_TILING 0x03 +#define S_009910_SAMPLE_SPLIT(x) (((x) & 0x03) << 25) +#define G_009910_SAMPLE_SPLIT(x) (((x) >> 25) & 0x03) +#define C_009910_SAMPLE_SPLIT 0xF9FFFFFF +#define R_009914_GB_TILE_MODE1 0x009914 +#define R_009918_GB_TILE_MODE2 0x009918 +#define R_00991C_GB_TILE_MODE3 0x00991C +#define R_009920_GB_TILE_MODE4 0x009920 +#define R_009924_GB_TILE_MODE5 0x009924 +#define R_009928_GB_TILE_MODE6 0x009928 +#define R_00992C_GB_TILE_MODE7 0x00992C +#define R_009930_GB_TILE_MODE8 0x009930 +#define R_009934_GB_TILE_MODE9 0x009934 +#define R_009938_GB_TILE_MODE10 0x009938 +#define R_00993C_GB_TILE_MODE11 0x00993C +#define R_009940_GB_TILE_MODE12 0x009940 +#define R_009944_GB_TILE_MODE13 0x009944 +#define R_009948_GB_TILE_MODE14 0x009948 +#define R_00994C_GB_TILE_MODE15 0x00994C +#define R_009950_GB_TILE_MODE16 0x009950 +#define R_009954_GB_TILE_MODE17 0x009954 +#define R_009958_GB_TILE_MODE18 0x009958 +#define R_00995C_GB_TILE_MODE19 0x00995C +#define R_009960_GB_TILE_MODE20 0x009960 +#define R_009964_GB_TILE_MODE21 0x009964 +#define R_009968_GB_TILE_MODE22 0x009968 +#define R_00996C_GB_TILE_MODE23 0x00996C +#define R_009970_GB_TILE_MODE24 0x009970 +#define R_009974_GB_TILE_MODE25 0x009974 +#define R_009978_GB_TILE_MODE26 0x009978 +#define R_00997C_GB_TILE_MODE27 0x00997C +#define R_009980_GB_TILE_MODE28 0x009980 +#define R_009984_GB_TILE_MODE29 0x009984 +#define R_009988_GB_TILE_MODE30 0x009988 +#define R_00998C_GB_TILE_MODE31 0x00998C +/* CIK */ +#define R_009990_GB_MACROTILE_MODE0 0x009990 +#define S_009990_BANK_WIDTH(x) (((x) & 0x03) << 0) +#define G_009990_BANK_WIDTH(x) (((x) >> 0) & 0x03) +#define C_009990_BANK_WIDTH 0xFFFFFFFC +#define S_009990_BANK_HEIGHT(x) (((x) & 0x03) << 2) +#define G_009990_BANK_HEIGHT(x) (((x) >> 2) & 0x03) +#define C_009990_BANK_HEIGHT 
0xFFFFFFF3 +#define S_009990_MACRO_TILE_ASPECT(x) (((x) & 0x03) << 4) +#define G_009990_MACRO_TILE_ASPECT(x) (((x) >> 4) & 0x03) +#define C_009990_MACRO_TILE_ASPECT 0xFFFFFFCF +#define S_009990_NUM_BANKS(x) (((x) & 0x03) << 6) +#define G_009990_NUM_BANKS(x) (((x) >> 6) & 0x03) +#define C_009990_NUM_BANKS 0xFFFFFF3F +#define R_009994_GB_MACROTILE_MODE1 0x009994 +#define R_009998_GB_MACROTILE_MODE2 0x009998 +#define R_00999C_GB_MACROTILE_MODE3 0x00999C +#define R_0099A0_GB_MACROTILE_MODE4 0x0099A0 +#define R_0099A4_GB_MACROTILE_MODE5 0x0099A4 +#define R_0099A8_GB_MACROTILE_MODE6 0x0099A8 +#define R_0099AC_GB_MACROTILE_MODE7 0x0099AC +#define R_0099B0_GB_MACROTILE_MODE8 0x0099B0 +#define R_0099B4_GB_MACROTILE_MODE9 0x0099B4 +#define R_0099B8_GB_MACROTILE_MODE10 0x0099B8 +#define R_0099BC_GB_MACROTILE_MODE11 0x0099BC +#define R_0099C0_GB_MACROTILE_MODE12 0x0099C0 +#define R_0099C4_GB_MACROTILE_MODE13 0x0099C4 +#define R_0099C8_GB_MACROTILE_MODE14 0x0099C8 +#define R_0099CC_GB_MACROTILE_MODE15 0x0099CC +/* */ +#define R_00B000_SPI_SHADER_TBA_LO_PS 0x00B000 +#define R_00B004_SPI_SHADER_TBA_HI_PS 0x00B004 +#define S_00B004_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B004_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B004_MEM_BASE 0xFFFFFF00 +#define R_00B008_SPI_SHADER_TMA_LO_PS 0x00B008 +#define R_00B00C_SPI_SHADER_TMA_HI_PS 0x00B00C +#define S_00B00C_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B00C_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B00C_MEM_BASE 0xFFFFFF00 +/* CIK */ #define R_00B01C_SPI_SHADER_PGM_RSRC3_PS 0x00B01C #define S_00B01C_CU_EN(x) (((x) & 0xFFFF) << 0) #define G_00B01C_CU_EN(x) (((x) >> 0) & 0xFFFF) @@ -4582,6 +5996,9 @@ #define S_00B02C_USER_SGPR(x) (((x) & 0x1F) << 1) #define G_00B02C_USER_SGPR(x) (((x) >> 1) & 0x1F) #define C_00B02C_USER_SGPR 0xFFFFFFC1 +#define S_00B02C_TRAP_PRESENT(x) (((x) & 0x1) << 6) +#define G_00B02C_TRAP_PRESENT(x) (((x) >> 6) & 0x1) +#define C_00B02C_TRAP_PRESENT 0xFFFFFFBF #define S_00B02C_WAVE_CNT_EN(x) (((x) & 0x1) << 7) #define G_00B02C_WAVE_CNT_EN(x) (((x) >> 7) & 0x1) #define C_00B02C_WAVE_CNT_EN 0xFFFFFF7F @@ -4591,6 +6008,9 @@ #define S_00B02C_EXCP_EN(x) (((x) & 0x7F) << 16) /* mask is 0x1FF on CIK */ #define G_00B02C_EXCP_EN(x) (((x) >> 16) & 0x7F) /* mask is 0x1FF on CIK */ #define C_00B02C_EXCP_EN 0xFF80FFFF /* mask is 0x1FF on CIK */ +#define S_00B02C_EXCP_EN_CIK(x) (((x) & 0x1FF) << 16) +#define G_00B02C_EXCP_EN_CIK(x) (((x) >> 16) & 0x1FF) +#define C_00B02C_EXCP_EN_CIK 0xFE00FFFF #define R_00B030_SPI_SHADER_USER_DATA_PS_0 0x00B030 #define R_00B034_SPI_SHADER_USER_DATA_PS_1 0x00B034 #define R_00B038_SPI_SHADER_USER_DATA_PS_2 0x00B038 @@ -4607,6 +6027,16 @@ #define R_00B064_SPI_SHADER_USER_DATA_PS_13 0x00B064 #define R_00B068_SPI_SHADER_USER_DATA_PS_14 0x00B068 #define R_00B06C_SPI_SHADER_USER_DATA_PS_15 0x00B06C +#define R_00B100_SPI_SHADER_TBA_LO_VS 0x00B100 +#define R_00B104_SPI_SHADER_TBA_HI_VS 0x00B104 +#define S_00B104_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B104_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B104_MEM_BASE 0xFFFFFF00 +#define R_00B108_SPI_SHADER_TMA_LO_VS 0x00B108 +#define R_00B10C_SPI_SHADER_TMA_HI_VS 0x00B10C +#define S_00B10C_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B10C_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B10C_MEM_BASE 0xFFFFFF00 /* CIK */ #define R_00B118_SPI_SHADER_PGM_RSRC3_VS 0x00B118 #define S_00B118_CU_EN(x) (((x) & 0xFFFF) << 0) @@ -4674,6 +6104,9 @@ #define S_00B12C_USER_SGPR(x) (((x) & 0x1F) << 1) #define G_00B12C_USER_SGPR(x) (((x) >> 1) & 0x1F) #define C_00B12C_USER_SGPR 
0xFFFFFFC1 +#define S_00B12C_TRAP_PRESENT(x) (((x) & 0x1) << 6) +#define G_00B12C_TRAP_PRESENT(x) (((x) >> 6) & 0x1) +#define C_00B12C_TRAP_PRESENT 0xFFFFFFBF #define S_00B12C_OC_LDS_EN(x) (((x) & 0x1) << 7) #define G_00B12C_OC_LDS_EN(x) (((x) >> 7) & 0x1) #define C_00B12C_OC_LDS_EN 0xFFFFFF7F @@ -4695,6 +6128,14 @@ #define S_00B12C_EXCP_EN(x) (((x) & 0x7F) << 13) /* mask is 0x1FF on CIK */ #define G_00B12C_EXCP_EN(x) (((x) >> 13) & 0x7F) /* mask is 0x1FF on CIK */ #define C_00B12C_EXCP_EN 0xFFF01FFF /* mask is 0x1FF on CIK */ +#define S_00B12C_EXCP_EN_CIK(x) (((x) & 0x1FF) << 13) +#define G_00B12C_EXCP_EN_CIK(x) (((x) >> 13) & 0x1FF) +#define C_00B12C_EXCP_EN_CIK 0xFFC01FFF +/* VI */ +#define S_00B12C_DISPATCH_DRAW_EN(x) (((x) & 0x1) << 24) +#define G_00B12C_DISPATCH_DRAW_EN(x) (((x) >> 24) & 0x1) +#define C_00B12C_DISPATCH_DRAW_EN 0xFEFFFFFF +/* */ #define R_00B130_SPI_SHADER_USER_DATA_VS_0 0x00B130 #define R_00B134_SPI_SHADER_USER_DATA_VS_1 0x00B134 #define R_00B138_SPI_SHADER_USER_DATA_VS_2 0x00B138 @@ -4711,6 +6152,16 @@ #define R_00B164_SPI_SHADER_USER_DATA_VS_13 0x00B164 #define R_00B168_SPI_SHADER_USER_DATA_VS_14 0x00B168 #define R_00B16C_SPI_SHADER_USER_DATA_VS_15 0x00B16C +#define R_00B200_SPI_SHADER_TBA_LO_GS 0x00B200 +#define R_00B204_SPI_SHADER_TBA_HI_GS 0x00B204 +#define S_00B204_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B204_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B204_MEM_BASE 0xFFFFFF00 +#define R_00B208_SPI_SHADER_TMA_LO_GS 0x00B208 +#define R_00B20C_SPI_SHADER_TMA_HI_GS 0x00B20C +#define S_00B20C_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B20C_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B20C_MEM_BASE 0xFFFFFF00 /* CIK */ #define R_00B21C_SPI_SHADER_PGM_RSRC3_GS 0x00B21C #define S_00B21C_CU_EN(x) (((x) & 0xFFFF) << 0) @@ -4723,6 +6174,11 @@ #define G_00B21C_LOCK_LOW_THRESHOLD(x) (((x) >> 22) & 0x0F) #define C_00B21C_LOCK_LOW_THRESHOLD 0xFC3FFFFF /* */ +/* VI */ +#define S_00B21C_GROUP_FIFO_DEPTH(x) (((x) & 0x3F) << 26) +#define G_00B21C_GROUP_FIFO_DEPTH(x) (((x) >> 26) & 0x3F) +#define C_00B21C_GROUP_FIFO_DEPTH 0x03FFFFFF +/* */ #define R_00B220_SPI_SHADER_PGM_LO_GS 0x00B220 #define R_00B224_SPI_SHADER_PGM_HI_GS 0x00B224 #define S_00B224_MEM_BASE(x) (((x) & 0xFF) << 0) @@ -4771,10 +6227,41 @@ #define S_00B22C_USER_SGPR(x) (((x) & 0x1F) << 1) #define G_00B22C_USER_SGPR(x) (((x) >> 1) & 0x1F) #define C_00B22C_USER_SGPR 0xFFFFFFC1 +#define S_00B22C_TRAP_PRESENT(x) (((x) & 0x1) << 6) +#define G_00B22C_TRAP_PRESENT(x) (((x) >> 6) & 0x1) +#define C_00B22C_TRAP_PRESENT 0xFFFFFFBF #define S_00B22C_EXCP_EN(x) (((x) & 0x7F) << 7) /* mask is 0x1FF on CIK */ #define G_00B22C_EXCP_EN(x) (((x) >> 7) & 0x7F) /* mask is 0x1FF on CIK */ #define C_00B22C_EXCP_EN 0xFFFFC07F /* mask is 0x1FF on CIK */ +#define S_00B22C_EXCP_EN_CIK(x) (((x) & 0x1FF) << 7) +#define G_00B22C_EXCP_EN_CIK(x) (((x) >> 7) & 0x1FF) +#define C_00B22C_EXCP_EN_CIK 0xFFFF007F #define R_00B230_SPI_SHADER_USER_DATA_GS_0 0x00B230 +#define R_00B234_SPI_SHADER_USER_DATA_GS_1 0x00B234 +#define R_00B238_SPI_SHADER_USER_DATA_GS_2 0x00B238 +#define R_00B23C_SPI_SHADER_USER_DATA_GS_3 0x00B23C +#define R_00B240_SPI_SHADER_USER_DATA_GS_4 0x00B240 +#define R_00B244_SPI_SHADER_USER_DATA_GS_5 0x00B244 +#define R_00B248_SPI_SHADER_USER_DATA_GS_6 0x00B248 +#define R_00B24C_SPI_SHADER_USER_DATA_GS_7 0x00B24C +#define R_00B250_SPI_SHADER_USER_DATA_GS_8 0x00B250 +#define R_00B254_SPI_SHADER_USER_DATA_GS_9 0x00B254 +#define R_00B258_SPI_SHADER_USER_DATA_GS_10 0x00B258 +#define R_00B25C_SPI_SHADER_USER_DATA_GS_11 0x00B25C +#define 
R_00B260_SPI_SHADER_USER_DATA_GS_12 0x00B260 +#define R_00B264_SPI_SHADER_USER_DATA_GS_13 0x00B264 +#define R_00B268_SPI_SHADER_USER_DATA_GS_14 0x00B268 +#define R_00B26C_SPI_SHADER_USER_DATA_GS_15 0x00B26C +#define R_00B300_SPI_SHADER_TBA_LO_ES 0x00B300 +#define R_00B304_SPI_SHADER_TBA_HI_ES 0x00B304 +#define S_00B304_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B304_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B304_MEM_BASE 0xFFFFFF00 +#define R_00B308_SPI_SHADER_TMA_LO_ES 0x00B308 +#define R_00B30C_SPI_SHADER_TMA_HI_ES 0x00B30C +#define S_00B30C_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B30C_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B30C_MEM_BASE 0xFFFFFF00 /* CIK */ #define R_00B31C_SPI_SHADER_PGM_RSRC3_ES 0x00B31C #define S_00B31C_CU_EN(x) (((x) & 0xFFFF) << 0) @@ -4787,6 +6274,11 @@ #define G_00B31C_LOCK_LOW_THRESHOLD(x) (((x) >> 22) & 0x0F) #define C_00B31C_LOCK_LOW_THRESHOLD 0xFC3FFFFF /* */ +/* VI */ +#define S_00B31C_GROUP_FIFO_DEPTH(x) (((x) & 0x3F) << 26) +#define G_00B31C_GROUP_FIFO_DEPTH(x) (((x) >> 26) & 0x3F) +#define C_00B31C_GROUP_FIFO_DEPTH 0x03FFFFFF +/* */ #define R_00B320_SPI_SHADER_PGM_LO_ES 0x00B320 #define R_00B324_SPI_SHADER_PGM_HI_ES 0x00B324 #define S_00B324_MEM_BASE(x) (((x) & 0xFF) << 0) @@ -4838,6 +6330,9 @@ #define S_00B32C_USER_SGPR(x) (((x) & 0x1F) << 1) #define G_00B32C_USER_SGPR(x) (((x) >> 1) & 0x1F) #define C_00B32C_USER_SGPR 0xFFFFFFC1 +#define S_00B32C_TRAP_PRESENT(x) (((x) & 0x1) << 6) +#define G_00B32C_TRAP_PRESENT(x) (((x) >> 6) & 0x1) +#define C_00B32C_TRAP_PRESENT 0xFFFFFFBF #define S_00B32C_OC_LDS_EN(x) (((x) & 0x1) << 7) #define G_00B32C_OC_LDS_EN(x) (((x) >> 7) & 0x1) #define C_00B32C_OC_LDS_EN 0xFFFFFF7F @@ -4848,6 +6343,31 @@ #define G_00B32C_LDS_SIZE(x) (((x) >> 20) & 0x1FF) /* CIK, for on-chip GS */ #define C_00B32C_LDS_SIZE 0xE00FFFFF /* CIK, for on-chip GS */ #define R_00B330_SPI_SHADER_USER_DATA_ES_0 0x00B330 +#define R_00B334_SPI_SHADER_USER_DATA_ES_1 0x00B334 +#define R_00B338_SPI_SHADER_USER_DATA_ES_2 0x00B338 +#define R_00B33C_SPI_SHADER_USER_DATA_ES_3 0x00B33C +#define R_00B340_SPI_SHADER_USER_DATA_ES_4 0x00B340 +#define R_00B344_SPI_SHADER_USER_DATA_ES_5 0x00B344 +#define R_00B348_SPI_SHADER_USER_DATA_ES_6 0x00B348 +#define R_00B34C_SPI_SHADER_USER_DATA_ES_7 0x00B34C +#define R_00B350_SPI_SHADER_USER_DATA_ES_8 0x00B350 +#define R_00B354_SPI_SHADER_USER_DATA_ES_9 0x00B354 +#define R_00B358_SPI_SHADER_USER_DATA_ES_10 0x00B358 +#define R_00B35C_SPI_SHADER_USER_DATA_ES_11 0x00B35C +#define R_00B360_SPI_SHADER_USER_DATA_ES_12 0x00B360 +#define R_00B364_SPI_SHADER_USER_DATA_ES_13 0x00B364 +#define R_00B368_SPI_SHADER_USER_DATA_ES_14 0x00B368 +#define R_00B36C_SPI_SHADER_USER_DATA_ES_15 0x00B36C +#define R_00B400_SPI_SHADER_TBA_LO_HS 0x00B400 +#define R_00B404_SPI_SHADER_TBA_HI_HS 0x00B404 +#define S_00B404_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B404_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B404_MEM_BASE 0xFFFFFF00 +#define R_00B408_SPI_SHADER_TMA_LO_HS 0x00B408 +#define R_00B40C_SPI_SHADER_TMA_HI_HS 0x00B40C +#define S_00B40C_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B40C_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B40C_MEM_BASE 0xFFFFFF00 /* CIK */ #define R_00B41C_SPI_SHADER_PGM_RSRC3_HS 0x00B41C #define S_00B41C_WAVE_LIMIT(x) (((x) & 0x3F) << 0) @@ -4857,6 +6377,11 @@ #define G_00B41C_LOCK_LOW_THRESHOLD(x) (((x) >> 6) & 0x0F) #define C_00B41C_LOCK_LOW_THRESHOLD 0xFFFFFC3F /* */ +/* VI */ +#define S_00B41C_GROUP_FIFO_DEPTH(x) (((x) & 0x3F) << 10) +#define G_00B41C_GROUP_FIFO_DEPTH(x) (((x) >> 10) & 0x3F) +#define 
C_00B41C_GROUP_FIFO_DEPTH 0xFFFF03FF +/* */ #define R_00B420_SPI_SHADER_PGM_LO_HS 0x00B420 #define R_00B424_SPI_SHADER_PGM_HI_HS 0x00B424 #define S_00B424_MEM_BASE(x) (((x) & 0xFF) << 0) @@ -4902,6 +6427,9 @@ #define S_00B42C_USER_SGPR(x) (((x) & 0x1F) << 1) #define G_00B42C_USER_SGPR(x) (((x) >> 1) & 0x1F) #define C_00B42C_USER_SGPR 0xFFFFFFC1 +#define S_00B42C_TRAP_PRESENT(x) (((x) & 0x1) << 6) +#define G_00B42C_TRAP_PRESENT(x) (((x) >> 6) & 0x1) +#define C_00B42C_TRAP_PRESENT 0xFFFFFFBF #define S_00B42C_OC_LDS_EN(x) (((x) & 0x1) << 7) #define G_00B42C_OC_LDS_EN(x) (((x) >> 7) & 0x1) #define C_00B42C_OC_LDS_EN 0xFFFFFF7F @@ -4912,6 +6440,31 @@ #define G_00B42C_EXCP_EN(x) (((x) >> 9) & 0x7F) /* mask is 0x1FF on CIK */ #define C_00B42C_EXCP_EN 0xFFFF01FF /* mask is 0x1FF on CIK */ #define R_00B430_SPI_SHADER_USER_DATA_HS_0 0x00B430 +#define R_00B434_SPI_SHADER_USER_DATA_HS_1 0x00B434 +#define R_00B438_SPI_SHADER_USER_DATA_HS_2 0x00B438 +#define R_00B43C_SPI_SHADER_USER_DATA_HS_3 0x00B43C +#define R_00B440_SPI_SHADER_USER_DATA_HS_4 0x00B440 +#define R_00B444_SPI_SHADER_USER_DATA_HS_5 0x00B444 +#define R_00B448_SPI_SHADER_USER_DATA_HS_6 0x00B448 +#define R_00B44C_SPI_SHADER_USER_DATA_HS_7 0x00B44C +#define R_00B450_SPI_SHADER_USER_DATA_HS_8 0x00B450 +#define R_00B454_SPI_SHADER_USER_DATA_HS_9 0x00B454 +#define R_00B458_SPI_SHADER_USER_DATA_HS_10 0x00B458 +#define R_00B45C_SPI_SHADER_USER_DATA_HS_11 0x00B45C +#define R_00B460_SPI_SHADER_USER_DATA_HS_12 0x00B460 +#define R_00B464_SPI_SHADER_USER_DATA_HS_13 0x00B464 +#define R_00B468_SPI_SHADER_USER_DATA_HS_14 0x00B468 +#define R_00B46C_SPI_SHADER_USER_DATA_HS_15 0x00B46C +#define R_00B500_SPI_SHADER_TBA_LO_LS 0x00B500 +#define R_00B504_SPI_SHADER_TBA_HI_LS 0x00B504 +#define S_00B504_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B504_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B504_MEM_BASE 0xFFFFFF00 +#define R_00B508_SPI_SHADER_TMA_LO_LS 0x00B508 +#define R_00B50C_SPI_SHADER_TMA_HI_LS 0x00B50C +#define S_00B50C_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B50C_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B50C_MEM_BASE 0xFFFFFF00 /* CIK */ #define R_00B51C_SPI_SHADER_PGM_RSRC3_LS 0x00B51C #define S_00B51C_CU_EN(x) (((x) & 0xFFFF) << 0) @@ -4924,6 +6477,11 @@ #define G_00B51C_LOCK_LOW_THRESHOLD(x) (((x) >> 22) & 0x0F) #define C_00B51C_LOCK_LOW_THRESHOLD 0xFC3FFFFF /* */ +/* VI */ +#define S_00B51C_GROUP_FIFO_DEPTH(x) (((x) & 0x3F) << 26) +#define G_00B51C_GROUP_FIFO_DEPTH(x) (((x) >> 26) & 0x3F) +#define C_00B51C_GROUP_FIFO_DEPTH 0x03FFFFFF +/* */ #define R_00B520_SPI_SHADER_PGM_LO_LS 0x00B520 #define R_00B524_SPI_SHADER_PGM_HI_LS 0x00B524 #define S_00B524_MEM_BASE(x) (((x) & 0xFF) << 0) @@ -4972,6 +6530,9 @@ #define S_00B52C_USER_SGPR(x) (((x) & 0x1F) << 1) #define G_00B52C_USER_SGPR(x) (((x) >> 1) & 0x1F) #define C_00B52C_USER_SGPR 0xFFFFFFC1 +#define S_00B52C_TRAP_PRESENT(x) (((x) & 0x1) << 6) +#define G_00B52C_TRAP_PRESENT(x) (((x) >> 6) & 0x1) +#define C_00B52C_TRAP_PRESENT 0xFFFFFFBF #define S_00B52C_LDS_SIZE(x) (((x) & 0x1FF) << 7) #define G_00B52C_LDS_SIZE(x) (((x) >> 7) & 0x1FF) #define C_00B52C_LDS_SIZE 0xFFFF007F @@ -4979,6 +6540,21 @@ #define G_00B52C_EXCP_EN(x) (((x) >> 16) & 0x7F) /* mask is 0x1FF on CIK */ #define C_00B52C_EXCP_EN 0xFF80FFFF /* mask is 0x1FF on CIK */ #define R_00B530_SPI_SHADER_USER_DATA_LS_0 0x00B530 +#define R_00B534_SPI_SHADER_USER_DATA_LS_1 0x00B534 +#define R_00B538_SPI_SHADER_USER_DATA_LS_2 0x00B538 +#define R_00B53C_SPI_SHADER_USER_DATA_LS_3 0x00B53C +#define R_00B540_SPI_SHADER_USER_DATA_LS_4 0x00B540 
+#define R_00B544_SPI_SHADER_USER_DATA_LS_5 0x00B544 +#define R_00B548_SPI_SHADER_USER_DATA_LS_6 0x00B548 +#define R_00B54C_SPI_SHADER_USER_DATA_LS_7 0x00B54C +#define R_00B550_SPI_SHADER_USER_DATA_LS_8 0x00B550 +#define R_00B554_SPI_SHADER_USER_DATA_LS_9 0x00B554 +#define R_00B558_SPI_SHADER_USER_DATA_LS_10 0x00B558 +#define R_00B55C_SPI_SHADER_USER_DATA_LS_11 0x00B55C +#define R_00B560_SPI_SHADER_USER_DATA_LS_12 0x00B560 +#define R_00B564_SPI_SHADER_USER_DATA_LS_13 0x00B564 +#define R_00B568_SPI_SHADER_USER_DATA_LS_14 0x00B568 +#define R_00B56C_SPI_SHADER_USER_DATA_LS_15 0x00B56C #define R_00B800_COMPUTE_DISPATCH_INITIATOR 0x00B800 #define S_00B800_COMPUTE_SHADER_EN(x) (((x) & 0x1) << 0) #define G_00B800_COMPUTE_SHADER_EN(x) (((x) >> 0) & 0x1) @@ -5049,6 +6625,16 @@ #define S_00B82C_MAX_WAVE_ID(x) (((x) & 0xFFF) << 0) #define G_00B82C_MAX_WAVE_ID(x) (((x) >> 0) & 0xFFF) #define C_00B82C_MAX_WAVE_ID 0xFFFFF000 +/* CIK */ +#define R_00B828_COMPUTE_PIPELINESTAT_ENABLE 0x00B828 +#define S_00B828_PIPELINESTAT_ENABLE(x) (((x) & 0x1) << 0) +#define G_00B828_PIPELINESTAT_ENABLE(x) (((x) >> 0) & 0x1) +#define C_00B828_PIPELINESTAT_ENABLE 0xFFFFFFFE +#define R_00B82C_COMPUTE_PERFCOUNT_ENABLE 0x00B82C +#define S_00B82C_PERFCOUNT_ENABLE(x) (((x) & 0x1) << 0) +#define G_00B82C_PERFCOUNT_ENABLE(x) (((x) >> 0) & 0x1) +#define C_00B82C_PERFCOUNT_ENABLE 0xFFFFFFFE +/* */ #define R_00B830_COMPUTE_PGM_LO 0x00B830 #define R_00B834_COMPUTE_PGM_HI 0x00B834 #define S_00B834_DATA(x) (((x) & 0xFF) << 0) @@ -5059,6 +6645,16 @@ #define G_00B834_INST_ATC(x) (((x) >> 8) & 0x1) #define C_00B834_INST_ATC 0xFFFFFEFF /* */ +#define R_00B838_COMPUTE_TBA_LO 0x00B838 +#define R_00B83C_COMPUTE_TBA_HI 0x00B83C +#define S_00B83C_DATA(x) (((x) & 0xFF) << 0) +#define G_00B83C_DATA(x) (((x) >> 0) & 0xFF) +#define C_00B83C_DATA 0xFFFFFF00 +#define R_00B840_COMPUTE_TMA_LO 0x00B840 +#define R_00B844_COMPUTE_TMA_HI 0x00B844 +#define S_00B844_DATA(x) (((x) & 0xFF) << 0) +#define G_00B844_DATA(x) (((x) >> 0) & 0xFF) +#define C_00B844_DATA 0xFFFFFF00 #define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 #define S_00B848_VGPRS(x) (((x) & 0x3F) << 0) #define G_00B848_VGPRS(x) (((x) >> 0) & 0x3F) @@ -5099,6 +6695,9 @@ #define S_00B84C_USER_SGPR(x) (((x) & 0x1F) << 1) #define G_00B84C_USER_SGPR(x) (((x) >> 1) & 0x1F) #define C_00B84C_USER_SGPR 0xFFFFFFC1 +#define S_00B84C_TRAP_PRESENT(x) (((x) & 0x1) << 6) +#define G_00B84C_TRAP_PRESENT(x) (((x) >> 6) & 0x1) +#define C_00B84C_TRAP_PRESENT 0xFFFFFFBF #define S_00B84C_TGID_X_EN(x) (((x) & 0x1) << 7) #define G_00B84C_TGID_X_EN(x) (((x) >> 7) & 0x1) #define C_00B84C_TGID_X_EN 0xFFFFFF7F @@ -5125,6 +6724,10 @@ #define S_00B84C_EXCP_EN(x) (((x) & 0x7F) << 24) #define G_00B84C_EXCP_EN(x) (((x) >> 24) & 0x7F) #define C_00B84C_EXCP_EN 0x80FFFFFF +#define R_00B850_COMPUTE_VMID 0x00B850 +#define S_00B850_DATA(x) (((x) & 0x0F) << 0) +#define G_00B850_DATA(x) (((x) >> 0) & 0x0F) +#define C_00B850_DATA 0xFFFFFFF0 #define R_00B854_COMPUTE_RESOURCE_LIMITS 0x00B854 #define S_00B854_WAVES_PER_SH(x) (((x) & 0x3F) << 0) /* mask is 0x3FF on CIK */ #define G_00B854_WAVES_PER_SH(x) (((x) >> 0) & 0x3F) /* mask is 0x3FF on CIK */ @@ -5167,7 +6770,84 @@ #define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12) #define G_00B860_WAVESIZE(x) (((x) >> 12) & 0x1FFF) #define C_00B860_WAVESIZE 0xFE000FFF +/* CIK */ +#define R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2 0x00B864 +#define S_00B864_SH0_CU_EN(x) (((x) & 0xFFFF) << 0) +#define G_00B864_SH0_CU_EN(x) (((x) >> 0) & 0xFFFF) +#define C_00B864_SH0_CU_EN 0xFFFF0000 +#define 
S_00B864_SH1_CU_EN(x) (((x) & 0xFFFF) << 16) +#define G_00B864_SH1_CU_EN(x) (((x) >> 16) & 0xFFFF) +#define C_00B864_SH1_CU_EN 0x0000FFFF +#define R_00B868_COMPUTE_STATIC_THREAD_MGMT_SE3 0x00B868 +#define S_00B868_SH0_CU_EN(x) (((x) & 0xFFFF) << 0) +#define G_00B868_SH0_CU_EN(x) (((x) >> 0) & 0xFFFF) +#define C_00B868_SH0_CU_EN 0xFFFF0000 +#define S_00B868_SH1_CU_EN(x) (((x) & 0xFFFF) << 16) +#define G_00B868_SH1_CU_EN(x) (((x) >> 16) & 0xFFFF) +#define C_00B868_SH1_CU_EN 0x0000FFFF +#define R_00B86C_COMPUTE_RESTART_X 0x00B86C +#define R_00B870_COMPUTE_RESTART_Y 0x00B870 +#define R_00B874_COMPUTE_RESTART_Z 0x00B874 +#define R_00B87C_COMPUTE_MISC_RESERVED 0x00B87C +#define S_00B87C_SEND_SEID(x) (((x) & 0x03) << 0) +#define G_00B87C_SEND_SEID(x) (((x) >> 0) & 0x03) +#define C_00B87C_SEND_SEID 0xFFFFFFFC +#define S_00B87C_RESERVED2(x) (((x) & 0x1) << 2) +#define G_00B87C_RESERVED2(x) (((x) >> 2) & 0x1) +#define C_00B87C_RESERVED2 0xFFFFFFFB +#define S_00B87C_RESERVED3(x) (((x) & 0x1) << 3) +#define G_00B87C_RESERVED3(x) (((x) >> 3) & 0x1) +#define C_00B87C_RESERVED3 0xFFFFFFF7 +#define S_00B87C_RESERVED4(x) (((x) & 0x1) << 4) +#define G_00B87C_RESERVED4(x) (((x) >> 4) & 0x1) +#define C_00B87C_RESERVED4 0xFFFFFFEF +/* VI */ +#define S_00B87C_WAVE_ID_BASE(x) (((x) & 0xFFF) << 5) +#define G_00B87C_WAVE_ID_BASE(x) (((x) >> 5) & 0xFFF) +#define C_00B87C_WAVE_ID_BASE 0xFFFE001F +#define R_00B880_COMPUTE_DISPATCH_ID 0x00B880 +#define R_00B884_COMPUTE_THREADGROUP_ID 0x00B884 +#define R_00B888_COMPUTE_RELAUNCH 0x00B888 +#define S_00B888_PAYLOAD(x) (((x) & 0x3FFFFFFF) << 0) +#define G_00B888_PAYLOAD(x) (((x) >> 0) & 0x3FFFFFFF) +#define C_00B888_PAYLOAD 0xC0000000 +#define S_00B888_IS_EVENT(x) (((x) & 0x1) << 30) +#define G_00B888_IS_EVENT(x) (((x) >> 30) & 0x1) +#define C_00B888_IS_EVENT 0xBFFFFFFF +#define S_00B888_IS_STATE(x) (((x) & 0x1) << 31) +#define G_00B888_IS_STATE(x) (((x) >> 31) & 0x1) +#define C_00B888_IS_STATE 0x7FFFFFFF +#define R_00B88C_COMPUTE_WAVE_RESTORE_ADDR_LO 0x00B88C +#define R_00B890_COMPUTE_WAVE_RESTORE_ADDR_HI 0x00B890 +#define S_00B890_ADDR(x) (((x) & 0xFFFF) << 0) +#define G_00B890_ADDR(x) (((x) >> 0) & 0xFFFF) +#define C_00B890_ADDR 0xFFFF0000 +#define R_00B894_COMPUTE_WAVE_RESTORE_CONTROL 0x00B894 +#define S_00B894_ATC(x) (((x) & 0x1) << 0) +#define G_00B894_ATC(x) (((x) >> 0) & 0x1) +#define C_00B894_ATC 0xFFFFFFFE +#define S_00B894_MTYPE(x) (((x) & 0x03) << 1) +#define G_00B894_MTYPE(x) (((x) >> 1) & 0x03) +#define C_00B894_MTYPE 0xFFFFFFF9 +/* */ +/* */ #define R_00B900_COMPUTE_USER_DATA_0 0x00B900 +#define R_00B904_COMPUTE_USER_DATA_1 0x00B904 +#define R_00B908_COMPUTE_USER_DATA_2 0x00B908 +#define R_00B90C_COMPUTE_USER_DATA_3 0x00B90C +#define R_00B910_COMPUTE_USER_DATA_4 0x00B910 +#define R_00B914_COMPUTE_USER_DATA_5 0x00B914 +#define R_00B918_COMPUTE_USER_DATA_6 0x00B918 +#define R_00B91C_COMPUTE_USER_DATA_7 0x00B91C +#define R_00B920_COMPUTE_USER_DATA_8 0x00B920 +#define R_00B924_COMPUTE_USER_DATA_9 0x00B924 +#define R_00B928_COMPUTE_USER_DATA_10 0x00B928 +#define R_00B92C_COMPUTE_USER_DATA_11 0x00B92C +#define R_00B930_COMPUTE_USER_DATA_12 0x00B930 +#define R_00B934_COMPUTE_USER_DATA_13 0x00B934 +#define R_00B938_COMPUTE_USER_DATA_14 0x00B938 +#define R_00B93C_COMPUTE_USER_DATA_15 0x00B93C +#define R_00B9FC_COMPUTE_NOWHERE 0x00B9FC #define R_028000_DB_RENDER_CONTROL 0x028000 #define S_028000_DEPTH_CLEAR_ENABLE(x) (((x) & 0x1) << 0) #define G_028000_DEPTH_CLEAR_ENABLE(x) (((x) >> 0) & 0x1) @@ -5196,6 +6876,11 @@ #define S_028000_COPY_SAMPLE(x) (((x) & 0x0F) << 8) 
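(A second hypothetical aside: every field in this header comes as an S_/G_/C_ triple, such as the TRAP_PRESENT bits added above, where S_ shifts a value into place, G_ extracts it, and C_ is the AND-mask that clears it. A sketch of packing COMPUTE_PGM_RSRC2 with these helpers; the function and argument names are invented, not from the patch.)

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Illustrative sketch only: compose a register word by OR-ing S_ macros,
 * read a field back with G_, and clear one with its C_ mask. */
static uint32_t pack_compute_pgm_rsrc2(unsigned user_sgprs, bool trap)
{
   uint32_t rsrc2 = S_00B84C_USER_SGPR(user_sgprs) |
                    S_00B84C_TGID_X_EN(1);

   /* Read-modify-write idiom: C_00B84C_TRAP_PRESENT == 0xFFFFFFBF clears
    * bit 6 before the new value is OR-ed in. */
   rsrc2 = (rsrc2 & C_00B84C_TRAP_PRESENT) | S_00B84C_TRAP_PRESENT(trap);

   assert(G_00B84C_USER_SGPR(rsrc2) == (user_sgprs & 0x1F));
   return rsrc2;
}

(End of aside; the patch resumes with the DB_RENDER_CONTROL hunk.)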
#define G_028000_COPY_SAMPLE(x) (((x) >> 8) & 0x0F) #define C_028000_COPY_SAMPLE 0xFFFFF0FF +/* VI */ +#define S_028000_DECOMPRESS_ENABLE(x) (((x) & 0x1) << 12) +#define G_028000_DECOMPRESS_ENABLE(x) (((x) >> 12) & 0x1) +#define C_028000_DECOMPRESS_ENABLE 0xFFFFEFFF +/* */ #define R_028004_DB_COUNT_CONTROL 0x028004 #define S_028004_ZPASS_INCREMENT_DISABLE(x) (((x) & 0x1) << 0) #define G_028004_ZPASS_INCREMENT_DISABLE(x) (((x) >> 0) & 0x1) @@ -5474,9 +7159,6 @@ #define S_028040_NUM_SAMPLES(x) (((x) & 0x03) << 2) #define G_028040_NUM_SAMPLES(x) (((x) >> 2) & 0x03) #define C_028040_NUM_SAMPLES 0xFFFFFFF3 -#define S_028040_TILE_MODE_INDEX(x) (((x) & 0x07) << 20) /* not on CIK */ -#define G_028040_TILE_MODE_INDEX(x) (((x) >> 20) & 0x07) /* not on CIK */ -#define C_028040_TILE_MODE_INDEX 0xFF8FFFFF /* not on CIK */ /* CIK */ #define S_028040_TILE_SPLIT(x) (((x) & 0x07) << 13) #define G_028040_TILE_SPLIT(x) (((x) >> 13) & 0x07) @@ -5489,6 +7171,14 @@ #define V_028040_ADDR_SURF_TILE_SPLIT_2KB 0x05 #define V_028040_ADDR_SURF_TILE_SPLIT_4KB 0x06 /* */ +#define S_028040_TILE_MODE_INDEX(x) (((x) & 0x07) << 20) /* not on CIK */ +#define G_028040_TILE_MODE_INDEX(x) (((x) >> 20) & 0x07) /* not on CIK */ +#define C_028040_TILE_MODE_INDEX 0xFF8FFFFF /* not on CIK */ +/* VI */ +#define S_028040_DECOMPRESS_ON_N_ZPLANES(x) (((x) & 0x0F) << 23) +#define G_028040_DECOMPRESS_ON_N_ZPLANES(x) (((x) >> 23) & 0x0F) +#define C_028040_DECOMPRESS_ON_N_ZPLANES 0xF87FFFFF +/* */ #define S_028040_ALLOW_EXPCLEAR(x) (((x) & 0x1) << 27) #define G_028040_ALLOW_EXPCLEAR(x) (((x) >> 27) & 0x1) #define C_028040_ALLOW_EXPCLEAR 0xF7FFFFFF @@ -5498,6 +7188,11 @@ #define S_028040_TILE_SURFACE_ENABLE(x) (((x) & 0x1) << 29) #define G_028040_TILE_SURFACE_ENABLE(x) (((x) >> 29) & 0x1) #define C_028040_TILE_SURFACE_ENABLE 0xDFFFFFFF +/* VI */ +#define S_028040_CLEAR_DISALLOWED(x) (((x) & 0x1) << 30) +#define G_028040_CLEAR_DISALLOWED(x) (((x) >> 30) & 0x1) +#define C_028040_CLEAR_DISALLOWED 0xBFFFFFFF +/* */ #define S_028040_ZRANGE_PRECISION(x) (((x) & 0x1) << 31) #define G_028040_ZRANGE_PRECISION(x) (((x) >> 31) & 0x1) #define C_028040_ZRANGE_PRECISION 0x7FFFFFFF @@ -5507,9 +7202,6 @@ #define C_028044_FORMAT 0xFFFFFFFE #define V_028044_STENCIL_INVALID 0x00 #define V_028044_STENCIL_8 0x01 -#define S_028044_TILE_MODE_INDEX(x) (((x) & 0x07) << 20) /* not on CIK */ -#define G_028044_TILE_MODE_INDEX(x) (((x) >> 20) & 0x07) /* not on CIK */ -#define C_028044_TILE_MODE_INDEX 0xFF8FFFFF /* not on CIK */ /* CIK */ #define S_028044_TILE_SPLIT(x) (((x) & 0x07) << 13) #define G_028044_TILE_SPLIT(x) (((x) >> 13) & 0x07) @@ -5522,12 +7214,20 @@ #define V_028044_ADDR_SURF_TILE_SPLIT_2KB 0x05 #define V_028044_ADDR_SURF_TILE_SPLIT_4KB 0x06 /* */ +#define S_028044_TILE_MODE_INDEX(x) (((x) & 0x07) << 20) /* not on CIK */ +#define G_028044_TILE_MODE_INDEX(x) (((x) >> 20) & 0x07) /* not on CIK */ +#define C_028044_TILE_MODE_INDEX 0xFF8FFFFF /* not on CIK */ #define S_028044_ALLOW_EXPCLEAR(x) (((x) & 0x1) << 27) #define G_028044_ALLOW_EXPCLEAR(x) (((x) >> 27) & 0x1) #define C_028044_ALLOW_EXPCLEAR 0xF7FFFFFF #define S_028044_TILE_STENCIL_DISABLE(x) (((x) & 0x1) << 29) #define G_028044_TILE_STENCIL_DISABLE(x) (((x) >> 29) & 0x1) #define C_028044_TILE_STENCIL_DISABLE 0xDFFFFFFF +/* VI */ +#define S_028044_CLEAR_DISALLOWED(x) (((x) & 0x1) << 30) +#define G_028044_CLEAR_DISALLOWED(x) (((x) >> 30) & 0x1) +#define C_028044_CLEAR_DISALLOWED 0xBFFFFFFF +/* */ #define R_028048_DB_Z_READ_BASE 0x028048 #define R_02804C_DB_STENCIL_READ_BASE 0x02804C #define 
R_028050_DB_Z_WRITE_BASE 0x028050 @@ -5549,7 +7249,13 @@ #define S_028084_ADDRESS(x) (((x) & 0xFF) << 0) #define G_028084_ADDRESS(x) (((x) >> 0) & 0xFF) #define C_028084_ADDRESS 0xFFFFFF00 -/* */ +#define R_0281E8_COHER_DEST_BASE_HI_0 0x0281E8 +#define R_0281EC_COHER_DEST_BASE_HI_1 0x0281EC +#define R_0281F0_COHER_DEST_BASE_HI_2 0x0281F0 +#define R_0281F4_COHER_DEST_BASE_HI_3 0x0281F4 +/* */ +#define R_0281F8_COHER_DEST_BASE_2 0x0281F8 +#define R_0281FC_COHER_DEST_BASE_3 0x0281FC #define R_028200_PA_SC_WINDOW_OFFSET 0x028200 #define S_028200_WINDOW_X_OFFSET(x) (((x) & 0xFFFF) << 0) #define G_028200_WINDOW_X_OFFSET(x) (((x) >> 0) & 0xFFFF) @@ -5694,6 +7400,8 @@ #define S_028244_BR_Y(x) (((x) & 0x7FFF) << 16) #define G_028244_BR_Y(x) (((x) >> 16) & 0x7FFF) #define C_028244_BR_Y 0x8000FFFF +#define R_028248_COHER_DEST_BASE_0 0x028248 +#define R_02824C_COHER_DEST_BASE_1 0x02824C #define R_028250_PA_SC_VPORT_SCISSOR_0_TL 0x028250 #define S_028250_TL_X(x) (((x) & 0x7FFF) << 0) #define G_028250_TL_X(x) (((x) >> 0) & 0x7FFF) @@ -5711,8 +7419,68 @@ #define S_028254_BR_Y(x) (((x) & 0x7FFF) << 16) #define G_028254_BR_Y(x) (((x) >> 16) & 0x7FFF) #define C_028254_BR_Y 0x8000FFFF +#define R_028258_PA_SC_VPORT_SCISSOR_1_TL 0x028258 +#define R_02825C_PA_SC_VPORT_SCISSOR_1_BR 0x02825C +#define R_028260_PA_SC_VPORT_SCISSOR_2_TL 0x028260 +#define R_028264_PA_SC_VPORT_SCISSOR_2_BR 0x028264 +#define R_028268_PA_SC_VPORT_SCISSOR_3_TL 0x028268 +#define R_02826C_PA_SC_VPORT_SCISSOR_3_BR 0x02826C +#define R_028270_PA_SC_VPORT_SCISSOR_4_TL 0x028270 +#define R_028274_PA_SC_VPORT_SCISSOR_4_BR 0x028274 +#define R_028278_PA_SC_VPORT_SCISSOR_5_TL 0x028278 +#define R_02827C_PA_SC_VPORT_SCISSOR_5_BR 0x02827C +#define R_028280_PA_SC_VPORT_SCISSOR_6_TL 0x028280 +#define R_028284_PA_SC_VPORT_SCISSOR_6_BR 0x028284 +#define R_028288_PA_SC_VPORT_SCISSOR_7_TL 0x028288 +#define R_02828C_PA_SC_VPORT_SCISSOR_7_BR 0x02828C +#define R_028290_PA_SC_VPORT_SCISSOR_8_TL 0x028290 +#define R_028294_PA_SC_VPORT_SCISSOR_8_BR 0x028294 +#define R_028298_PA_SC_VPORT_SCISSOR_9_TL 0x028298 +#define R_02829C_PA_SC_VPORT_SCISSOR_9_BR 0x02829C +#define R_0282A0_PA_SC_VPORT_SCISSOR_10_TL 0x0282A0 +#define R_0282A4_PA_SC_VPORT_SCISSOR_10_BR 0x0282A4 +#define R_0282A8_PA_SC_VPORT_SCISSOR_11_TL 0x0282A8 +#define R_0282AC_PA_SC_VPORT_SCISSOR_11_BR 0x0282AC +#define R_0282B0_PA_SC_VPORT_SCISSOR_12_TL 0x0282B0 +#define R_0282B4_PA_SC_VPORT_SCISSOR_12_BR 0x0282B4 +#define R_0282B8_PA_SC_VPORT_SCISSOR_13_TL 0x0282B8 +#define R_0282BC_PA_SC_VPORT_SCISSOR_13_BR 0x0282BC +#define R_0282C0_PA_SC_VPORT_SCISSOR_14_TL 0x0282C0 +#define R_0282C4_PA_SC_VPORT_SCISSOR_14_BR 0x0282C4 +#define R_0282C8_PA_SC_VPORT_SCISSOR_15_TL 0x0282C8 +#define R_0282CC_PA_SC_VPORT_SCISSOR_15_BR 0x0282CC #define R_0282D0_PA_SC_VPORT_ZMIN_0 0x0282D0 #define R_0282D4_PA_SC_VPORT_ZMAX_0 0x0282D4 +#define R_0282D8_PA_SC_VPORT_ZMIN_1 0x0282D8 +#define R_0282DC_PA_SC_VPORT_ZMAX_1 0x0282DC +#define R_0282E0_PA_SC_VPORT_ZMIN_2 0x0282E0 +#define R_0282E4_PA_SC_VPORT_ZMAX_2 0x0282E4 +#define R_0282E8_PA_SC_VPORT_ZMIN_3 0x0282E8 +#define R_0282EC_PA_SC_VPORT_ZMAX_3 0x0282EC +#define R_0282F0_PA_SC_VPORT_ZMIN_4 0x0282F0 +#define R_0282F4_PA_SC_VPORT_ZMAX_4 0x0282F4 +#define R_0282F8_PA_SC_VPORT_ZMIN_5 0x0282F8 +#define R_0282FC_PA_SC_VPORT_ZMAX_5 0x0282FC +#define R_028300_PA_SC_VPORT_ZMIN_6 0x028300 +#define R_028304_PA_SC_VPORT_ZMAX_6 0x028304 +#define R_028308_PA_SC_VPORT_ZMIN_7 0x028308 +#define R_02830C_PA_SC_VPORT_ZMAX_7 0x02830C +#define R_028310_PA_SC_VPORT_ZMIN_8 0x028310 +#define 
R_028314_PA_SC_VPORT_ZMAX_8 0x028314 +#define R_028318_PA_SC_VPORT_ZMIN_9 0x028318 +#define R_02831C_PA_SC_VPORT_ZMAX_9 0x02831C +#define R_028320_PA_SC_VPORT_ZMIN_10 0x028320 +#define R_028324_PA_SC_VPORT_ZMAX_10 0x028324 +#define R_028328_PA_SC_VPORT_ZMIN_11 0x028328 +#define R_02832C_PA_SC_VPORT_ZMAX_11 0x02832C +#define R_028330_PA_SC_VPORT_ZMIN_12 0x028330 +#define R_028334_PA_SC_VPORT_ZMAX_12 0x028334 +#define R_028338_PA_SC_VPORT_ZMIN_13 0x028338 +#define R_02833C_PA_SC_VPORT_ZMAX_13 0x02833C +#define R_028340_PA_SC_VPORT_ZMIN_14 0x028340 +#define R_028344_PA_SC_VPORT_ZMAX_14 0x028344 +#define R_028348_PA_SC_VPORT_ZMIN_15 0x028348 +#define R_02834C_PA_SC_VPORT_ZMAX_15 0x02834C #define R_028350_PA_SC_RASTER_CONFIG 0x028350 #define S_028350_RB_MAP_PKR0(x) (((x) & 0x03) << 0) #define G_028350_RB_MAP_PKR0(x) (((x) >> 0) & 0x03) @@ -5834,6 +7602,13 @@ #define V_028354_RASTER_CONFIG_SE_PAIR_YSEL_16_WIDE_TILE 0x01 #define V_028354_RASTER_CONFIG_SE_PAIR_YSEL_32_WIDE_TILE 0x02 #define V_028354_RASTER_CONFIG_SE_PAIR_YSEL_64_WIDE_TILE 0x03 +#define R_028358_PA_SC_SCREEN_EXTENT_CONTROL 0x028358 +#define S_028358_SLICE_EVEN_ENABLE(x) (((x) & 0x03) << 0) +#define G_028358_SLICE_EVEN_ENABLE(x) (((x) >> 0) & 0x03) +#define C_028358_SLICE_EVEN_ENABLE 0xFFFFFFFC +#define S_028358_SLICE_ODD_ENABLE(x) (((x) & 0x03) << 2) +#define G_028358_SLICE_ODD_ENABLE(x) (((x) >> 2) & 0x03) +#define C_028358_SLICE_ODD_ENABLE 0xFFFFFFF3 /* */ #define R_028400_VGT_MAX_VTX_INDX 0x028400 #define R_028404_VGT_MIN_VTX_INDX 0x028404 @@ -5843,6 +7618,18 @@ #define R_028418_CB_BLEND_GREEN 0x028418 #define R_02841C_CB_BLEND_BLUE 0x02841C #define R_028420_CB_BLEND_ALPHA 0x028420 +/* VI */ +#define R_028424_CB_DCC_CONTROL 0x028424 +#define S_028424_OVERWRITE_COMBINER_DISABLE(x) (((x) & 0x1) << 0) +#define G_028424_OVERWRITE_COMBINER_DISABLE(x) (((x) >> 0) & 0x1) +#define C_028424_OVERWRITE_COMBINER_DISABLE 0xFFFFFFFE +#define S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(x) (((x) & 0x1) << 1) +#define G_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(x) (((x) >> 1) & 0x1) +#define C_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE 0xFFFFFFFD +#define S_028424_OVERWRITE_COMBINER_WATERMARK(x) (((x) & 0x1F) << 2) +#define G_028424_OVERWRITE_COMBINER_WATERMARK(x) (((x) >> 2) & 0x1F) +#define C_028424_OVERWRITE_COMBINER_WATERMARK 0xFFFFFF83 +/* */ #define R_02842C_DB_STENCIL_CONTROL 0x02842C #define S_02842C_STENCILFAIL(x) (((x) & 0x0F) << 0) #define G_02842C_STENCILFAIL(x) (((x) >> 0) & 0x0F) @@ -5984,12 +7771,102 @@ #define S_028434_STENCILOPVAL_BF(x) (((x) & 0xFF) << 24) #define G_028434_STENCILOPVAL_BF(x) (((x) >> 24) & 0xFF) #define C_028434_STENCILOPVAL_BF 0x00FFFFFF -#define R_02843C_PA_CL_VPORT_XSCALE_0 0x02843C -#define R_028440_PA_CL_VPORT_XOFFSET_0 0x028440 -#define R_028444_PA_CL_VPORT_YSCALE_0 0x028444 -#define R_028448_PA_CL_VPORT_YOFFSET_0 0x028448 -#define R_02844C_PA_CL_VPORT_ZSCALE_0 0x02844C -#define R_028450_PA_CL_VPORT_ZOFFSET_0 0x028450 +#define R_02843C_PA_CL_VPORT_XSCALE 0x02843C +#define R_028440_PA_CL_VPORT_XOFFSET 0x028440 +#define R_028444_PA_CL_VPORT_YSCALE 0x028444 +#define R_028448_PA_CL_VPORT_YOFFSET 0x028448 +#define R_02844C_PA_CL_VPORT_ZSCALE 0x02844C +#define R_028450_PA_CL_VPORT_ZOFFSET 0x028450 +#define R_028454_PA_CL_VPORT_XSCALE_1 0x028454 +#define R_028458_PA_CL_VPORT_XOFFSET_1 0x028458 +#define R_02845C_PA_CL_VPORT_YSCALE_1 0x02845C +#define R_028460_PA_CL_VPORT_YOFFSET_1 0x028460 +#define R_028464_PA_CL_VPORT_ZSCALE_1 0x028464 +#define R_028468_PA_CL_VPORT_ZOFFSET_1 0x028468 +#define 
R_02846C_PA_CL_VPORT_XSCALE_2 0x02846C +#define R_028470_PA_CL_VPORT_XOFFSET_2 0x028470 +#define R_028474_PA_CL_VPORT_YSCALE_2 0x028474 +#define R_028478_PA_CL_VPORT_YOFFSET_2 0x028478 +#define R_02847C_PA_CL_VPORT_ZSCALE_2 0x02847C +#define R_028480_PA_CL_VPORT_ZOFFSET_2 0x028480 +#define R_028484_PA_CL_VPORT_XSCALE_3 0x028484 +#define R_028488_PA_CL_VPORT_XOFFSET_3 0x028488 +#define R_02848C_PA_CL_VPORT_YSCALE_3 0x02848C +#define R_028490_PA_CL_VPORT_YOFFSET_3 0x028490 +#define R_028494_PA_CL_VPORT_ZSCALE_3 0x028494 +#define R_028498_PA_CL_VPORT_ZOFFSET_3 0x028498 +#define R_02849C_PA_CL_VPORT_XSCALE_4 0x02849C +#define R_0284A0_PA_CL_VPORT_XOFFSET_4 0x0284A0 +#define R_0284A4_PA_CL_VPORT_YSCALE_4 0x0284A4 +#define R_0284A8_PA_CL_VPORT_YOFFSET_4 0x0284A8 +#define R_0284AC_PA_CL_VPORT_ZSCALE_4 0x0284AC +#define R_0284B0_PA_CL_VPORT_ZOFFSET_4 0x0284B0 +#define R_0284B4_PA_CL_VPORT_XSCALE_5 0x0284B4 +#define R_0284B8_PA_CL_VPORT_XOFFSET_5 0x0284B8 +#define R_0284BC_PA_CL_VPORT_YSCALE_5 0x0284BC +#define R_0284C0_PA_CL_VPORT_YOFFSET_5 0x0284C0 +#define R_0284C4_PA_CL_VPORT_ZSCALE_5 0x0284C4 +#define R_0284C8_PA_CL_VPORT_ZOFFSET_5 0x0284C8 +#define R_0284CC_PA_CL_VPORT_XSCALE_6 0x0284CC +#define R_0284D0_PA_CL_VPORT_XOFFSET_6 0x0284D0 +#define R_0284D4_PA_CL_VPORT_YSCALE_6 0x0284D4 +#define R_0284D8_PA_CL_VPORT_YOFFSET_6 0x0284D8 +#define R_0284DC_PA_CL_VPORT_ZSCALE_6 0x0284DC +#define R_0284E0_PA_CL_VPORT_ZOFFSET_6 0x0284E0 +#define R_0284E4_PA_CL_VPORT_XSCALE_7 0x0284E4 +#define R_0284E8_PA_CL_VPORT_XOFFSET_7 0x0284E8 +#define R_0284EC_PA_CL_VPORT_YSCALE_7 0x0284EC +#define R_0284F0_PA_CL_VPORT_YOFFSET_7 0x0284F0 +#define R_0284F4_PA_CL_VPORT_ZSCALE_7 0x0284F4 +#define R_0284F8_PA_CL_VPORT_ZOFFSET_7 0x0284F8 +#define R_0284FC_PA_CL_VPORT_XSCALE_8 0x0284FC +#define R_028500_PA_CL_VPORT_XOFFSET_8 0x028500 +#define R_028504_PA_CL_VPORT_YSCALE_8 0x028504 +#define R_028508_PA_CL_VPORT_YOFFSET_8 0x028508 +#define R_02850C_PA_CL_VPORT_ZSCALE_8 0x02850C +#define R_028510_PA_CL_VPORT_ZOFFSET_8 0x028510 +#define R_028514_PA_CL_VPORT_XSCALE_9 0x028514 +#define R_028518_PA_CL_VPORT_XOFFSET_9 0x028518 +#define R_02851C_PA_CL_VPORT_YSCALE_9 0x02851C +#define R_028520_PA_CL_VPORT_YOFFSET_9 0x028520 +#define R_028524_PA_CL_VPORT_ZSCALE_9 0x028524 +#define R_028528_PA_CL_VPORT_ZOFFSET_9 0x028528 +#define R_02852C_PA_CL_VPORT_XSCALE_10 0x02852C +#define R_028530_PA_CL_VPORT_XOFFSET_10 0x028530 +#define R_028534_PA_CL_VPORT_YSCALE_10 0x028534 +#define R_028538_PA_CL_VPORT_YOFFSET_10 0x028538 +#define R_02853C_PA_CL_VPORT_ZSCALE_10 0x02853C +#define R_028540_PA_CL_VPORT_ZOFFSET_10 0x028540 +#define R_028544_PA_CL_VPORT_XSCALE_11 0x028544 +#define R_028548_PA_CL_VPORT_XOFFSET_11 0x028548 +#define R_02854C_PA_CL_VPORT_YSCALE_11 0x02854C +#define R_028550_PA_CL_VPORT_YOFFSET_11 0x028550 +#define R_028554_PA_CL_VPORT_ZSCALE_11 0x028554 +#define R_028558_PA_CL_VPORT_ZOFFSET_11 0x028558 +#define R_02855C_PA_CL_VPORT_XSCALE_12 0x02855C +#define R_028560_PA_CL_VPORT_XOFFSET_12 0x028560 +#define R_028564_PA_CL_VPORT_YSCALE_12 0x028564 +#define R_028568_PA_CL_VPORT_YOFFSET_12 0x028568 +#define R_02856C_PA_CL_VPORT_ZSCALE_12 0x02856C +#define R_028570_PA_CL_VPORT_ZOFFSET_12 0x028570 +#define R_028574_PA_CL_VPORT_XSCALE_13 0x028574 +#define R_028578_PA_CL_VPORT_XOFFSET_13 0x028578 +#define R_02857C_PA_CL_VPORT_YSCALE_13 0x02857C +#define R_028580_PA_CL_VPORT_YOFFSET_13 0x028580 +#define R_028584_PA_CL_VPORT_ZSCALE_13 0x028584 +#define R_028588_PA_CL_VPORT_ZOFFSET_13 0x028588 +#define R_02858C_PA_CL_VPORT_XSCALE_14 
0x02858C +#define R_028590_PA_CL_VPORT_XOFFSET_14 0x028590 +#define R_028594_PA_CL_VPORT_YSCALE_14 0x028594 +#define R_028598_PA_CL_VPORT_YOFFSET_14 0x028598 +#define R_02859C_PA_CL_VPORT_ZSCALE_14 0x02859C +#define R_0285A0_PA_CL_VPORT_ZOFFSET_14 0x0285A0 +#define R_0285A4_PA_CL_VPORT_XSCALE_15 0x0285A4 +#define R_0285A8_PA_CL_VPORT_XOFFSET_15 0x0285A8 +#define R_0285AC_PA_CL_VPORT_YSCALE_15 0x0285AC +#define R_0285B0_PA_CL_VPORT_YOFFSET_15 0x0285B0 +#define R_0285B4_PA_CL_VPORT_ZSCALE_15 0x0285B4 +#define R_0285B8_PA_CL_VPORT_ZOFFSET_15 0x0285B8 #define R_0285BC_PA_CL_UCP_0_X 0x0285BC #define R_0285C0_PA_CL_UCP_0_Y 0x0285C0 #define R_0285C4_PA_CL_UCP_0_Z 0x0285C4 @@ -6036,6 +7913,26 @@ #define G_028644_DUP(x) (((x) >> 18) & 0x1) #define C_028644_DUP 0xFFFBFFFF /* */ +/* VI */ +#define S_028644_FP16_INTERP_MODE(x) (((x) & 0x1) << 19) +#define G_028644_FP16_INTERP_MODE(x) (((x) >> 19) & 0x1) +#define C_028644_FP16_INTERP_MODE 0xFFF7FFFF +#define S_028644_USE_DEFAULT_ATTR1(x) (((x) & 0x1) << 20) +#define G_028644_USE_DEFAULT_ATTR1(x) (((x) >> 20) & 0x1) +#define C_028644_USE_DEFAULT_ATTR1 0xFFEFFFFF +#define S_028644_DEFAULT_VAL_ATTR1(x) (((x) & 0x03) << 21) +#define G_028644_DEFAULT_VAL_ATTR1(x) (((x) >> 21) & 0x03) +#define C_028644_DEFAULT_VAL_ATTR1 0xFF9FFFFF +#define S_028644_PT_SPRITE_TEX_ATTR1(x) (((x) & 0x1) << 23) +#define G_028644_PT_SPRITE_TEX_ATTR1(x) (((x) >> 23) & 0x1) +#define C_028644_PT_SPRITE_TEX_ATTR1 0xFF7FFFFF +#define S_028644_ATTR0_VALID(x) (((x) & 0x1) << 24) +#define G_028644_ATTR0_VALID(x) (((x) >> 24) & 0x1) +#define C_028644_ATTR0_VALID 0xFEFFFFFF +#define S_028644_ATTR1_VALID(x) (((x) & 0x1) << 25) +#define G_028644_ATTR1_VALID(x) (((x) >> 25) & 0x1) +#define C_028644_ATTR1_VALID 0xFDFFFFFF +/* */ #define R_028648_SPI_PS_INPUT_CNTL_1 0x028648 #define R_02864C_SPI_PS_INPUT_CNTL_2 0x02864C #define R_028650_SPI_PS_INPUT_CNTL_3 0x028650 @@ -6559,6 +8456,10 @@ #define R_028794_CB_BLEND5_CONTROL 0x028794 #define R_028798_CB_BLEND6_CONTROL 0x028798 #define R_02879C_CB_BLEND7_CONTROL 0x02879C +#define R_0287CC_CS_COPY_STATE 0x0287CC +#define S_0287CC_SRC_STATE_ID(x) (((x) & 0x07) << 0) +#define G_0287CC_SRC_STATE_ID(x) (((x) >> 0) & 0x07) +#define C_0287CC_SRC_STATE_ID 0xFFFFFFF8 #define R_0287D4_PA_CL_POINT_X_RAD 0x0287D4 #define R_0287D8_PA_CL_POINT_Y_RAD 0x0287D8 #define R_0287DC_PA_CL_POINT_SIZE 0x0287DC @@ -6588,6 +8489,10 @@ #define G_0287F0_USE_OPAQUE(x) (((x) >> 6) & 0x1) #define C_0287F0_USE_OPAQUE 0xFFFFFFBF #define R_0287F4_VGT_IMMED_DATA 0x0287F4 /* not on CIK */ +#define R_0287F8_VGT_EVENT_ADDRESS_REG 0x0287F8 +#define S_0287F8_ADDRESS_LOW(x) (((x) & 0xFFFFFFF) << 0) +#define G_0287F8_ADDRESS_LOW(x) (((x) >> 0) & 0xFFFFFFF) +#define C_0287F8_ADDRESS_LOW 0xF0000000 #define R_028800_DB_DEPTH_CONTROL 0x028800 #define S_028800_STENCIL_ENABLE(x) (((x) & 0x1) << 0) #define G_028800_STENCIL_ENABLE(x) (((x) >> 0) & 0x1) @@ -6644,36 +8549,42 @@ #define G_028800_DISABLE_COLOR_WRITES_ON_DEPTH_PASS(x) (((x) >> 31) & 0x1) #define C_028800_DISABLE_COLOR_WRITES_ON_DEPTH_PASS 0x7FFFFFFF #define R_028804_DB_EQAA 0x028804 -#define S_028804_MAX_ANCHOR_SAMPLES(x) (((x) & 0x7) << 0) -#define G_028804_MAX_ANCHOR_SAMPLES(x) (((x) >> 0) & 0x7) -#define C_028804_MAX_ANCHOR_SAMPLES (~(((~0) & 0x7) << 0)) -#define S_028804_PS_ITER_SAMPLES(x) (((x) & 0x7) << 4) -#define G_028804_PS_ITER_SAMPLES(x) (((x) >> 4) & 0x7) -#define C_028804_PS_ITER_SAMPLES (~(((~0) & 0x7) << 4)) -#define S_028804_MASK_EXPORT_NUM_SAMPLES(x) (((x) & 0x7) << 8) -#define G_028804_MASK_EXPORT_NUM_SAMPLES(x) (((x) 
>> 8) & 0x7) -#define C_028804_MASK_EXPORT_NUM_SAMPLES (~(((~0) & 0x7) << 8)) -#define S_028804_ALPHA_TO_MASK_NUM_SAMPLES(x) (((x) & 0x7) << 12) -#define G_028804_ALPHA_TO_MASK_NUM_SAMPLES(x) (((x) >> 12) & 0x7) -#define C_028804_ALPHA_TO_MASK_NUM_SAMPLES (~(((~0) & 0x7) << 12)) -#define S_028804_HIGH_QUALITY_INTERSECTIONS(x) (((x) & 0x1) << 16) -#define G_028804_HIGH_QUALITY_INTERSECTIONS(x) (((x) >> 16) & 0x1) -#define C_028804_HIGH_QUALITY_INTERSECTIONS (~(((~0) & 0x1) << 16)) -#define S_028804_INCOHERENT_EQAA_READS(x) (((x) & 0x1) << 17) -#define G_028804_INCOHERENT_EQAA_READS(x) (((x) >> 17) & 0x1) -#define C_028804_INCOHERENT_EQAA_READS (~(((~0) & 0x1) << 17)) -#define S_028804_INTERPOLATE_COMP_Z(x) (((x) & 0x1) << 18) -#define G_028804_INTERPOLATE_COMP_Z(x) (((x) >> 18) & 0x1) -#define C_028804_INTERPOLATE_COMP_Z (~(((~0) >> 18) & 0x1)) -#define S_028804_INTERPOLATE_SRC_Z(x) (((x) & 0x1) << 19) -#define G_028804_INTERPOLATE_SRC_Z(x) (((x) >> 19) & 0x1) -#define C_028804_INTERPOLATE_SRC_Z (~(((~0) & 0x1) << 19)) -#define S_028804_STATIC_ANCHOR_ASSOCIATIONS(x) (((x) & 0x1) << 20) -#define G_028804_STATIC_ANCHOR_ASSOCIATIONS(x) (((x) >> 20) & 0x1) -#define C_028804_STATIC_ANCHOR_ASSOCIATIONS (~(((~0) & 0x1) << 20)) -#define S_028804_ALPHA_TO_MASK_EQAA_DISABLE(x) (((x) & 0x1) << 21) -#define G_028804_ALPHA_TO_MASK_EQAA_DISABLE(x) (((x) >> 21) & 0x1) -#define C_028804_ALPHA_TO_MASK_EQAA_DISABLE (~(((~0) & 0x1) << 21)) +#define S_028804_MAX_ANCHOR_SAMPLES(x) (((x) & 0x7) << 0) +#define G_028804_MAX_ANCHOR_SAMPLES(x) (((x) >> 0) & 0x07) +#define C_028804_MAX_ANCHOR_SAMPLES 0xFFFFFFF8 +#define S_028804_PS_ITER_SAMPLES(x) (((x) & 0x7) << 4) +#define G_028804_PS_ITER_SAMPLES(x) (((x) >> 4) & 0x07) +#define C_028804_PS_ITER_SAMPLES 0xFFFFFF8F +#define S_028804_MASK_EXPORT_NUM_SAMPLES(x) (((x) & 0x7) << 8) +#define G_028804_MASK_EXPORT_NUM_SAMPLES(x) (((x) >> 8) & 0x07) +#define C_028804_MASK_EXPORT_NUM_SAMPLES 0xFFFFF8FF +#define S_028804_ALPHA_TO_MASK_NUM_SAMPLES(x) (((x) & 0x7) << 12) +#define G_028804_ALPHA_TO_MASK_NUM_SAMPLES(x) (((x) >> 12) & 0x07) +#define C_028804_ALPHA_TO_MASK_NUM_SAMPLES 0xFFFF8FFF +#define S_028804_HIGH_QUALITY_INTERSECTIONS(x) (((x) & 0x1) << 16) +#define G_028804_HIGH_QUALITY_INTERSECTIONS(x) (((x) >> 16) & 0x1) +#define C_028804_HIGH_QUALITY_INTERSECTIONS 0xFFFEFFFF +#define S_028804_INCOHERENT_EQAA_READS(x) (((x) & 0x1) << 17) +#define G_028804_INCOHERENT_EQAA_READS(x) (((x) >> 17) & 0x1) +#define C_028804_INCOHERENT_EQAA_READS 0xFFFDFFFF +#define S_028804_INTERPOLATE_COMP_Z(x) (((x) & 0x1) << 18) +#define G_028804_INTERPOLATE_COMP_Z(x) (((x) >> 18) & 0x1) +#define C_028804_INTERPOLATE_COMP_Z 0xFFFBFFFF +#define S_028804_INTERPOLATE_SRC_Z(x) (((x) & 0x1) << 19) +#define G_028804_INTERPOLATE_SRC_Z(x) (((x) >> 19) & 0x1) +#define C_028804_INTERPOLATE_SRC_Z 0xFFF7FFFF +#define S_028804_STATIC_ANCHOR_ASSOCIATIONS(x) (((x) & 0x1) << 20) +#define G_028804_STATIC_ANCHOR_ASSOCIATIONS(x) (((x) >> 20) & 0x1) +#define C_028804_STATIC_ANCHOR_ASSOCIATIONS 0xFFEFFFFF +#define S_028804_ALPHA_TO_MASK_EQAA_DISABLE(x) (((x) & 0x1) << 21) +#define G_028804_ALPHA_TO_MASK_EQAA_DISABLE(x) (((x) >> 21) & 0x1) +#define C_028804_ALPHA_TO_MASK_EQAA_DISABLE 0xFFDFFFFF +#define S_028804_OVERRASTERIZATION_AMOUNT(x) (((x) & 0x07) << 24) +#define G_028804_OVERRASTERIZATION_AMOUNT(x) (((x) >> 24) & 0x07) +#define C_028804_OVERRASTERIZATION_AMOUNT 0xF8FFFFFF +#define S_028804_ENABLE_POSTZ_OVERRASTERIZATION(x) (((x) & 0x1) << 27) +#define G_028804_ENABLE_POSTZ_OVERRASTERIZATION(x) (((x) >> 27) & 
0x1) +#define C_028804_ENABLE_POSTZ_OVERRASTERIZATION 0xF7FFFFFF #define R_028808_CB_COLOR_CONTROL 0x028808 #define S_028808_DEGAMMA_ENABLE(x) (((x) & 0x1) << 3) #define G_028808_DEGAMMA_ENABLE(x) (((x) >> 3) & 0x1) @@ -6977,6 +8888,11 @@ #define S_02881C_USE_VTX_GS_CUT_FLAG(x) (((x) & 0x1) << 25) #define G_02881C_USE_VTX_GS_CUT_FLAG(x) (((x) >> 25) & 0x1) #define C_02881C_USE_VTX_GS_CUT_FLAG 0xFDFFFFFF +/* VI */ +#define S_02881C_USE_VTX_LINE_WIDTH(x) (((x) & 0x1) << 26) +#define G_02881C_USE_VTX_LINE_WIDTH(x) (((x) >> 26) & 0x1) +#define C_02881C_USE_VTX_LINE_WIDTH 0xFBFFFFFF +/* */ #define R_028820_PA_CL_NANINF_CNTL 0x028820 #define S_028820_VTE_XY_INF_DISCARD(x) (((x) & 0x1) << 0) #define G_028820_VTE_XY_INF_DISCARD(x) (((x) >> 0) & 0x1) @@ -7447,9 +9363,21 @@ #define S_028A4C_PS_ITER_SAMPLE(x) (((x) & 0x1) << 16) #define G_028A4C_PS_ITER_SAMPLE(x) (((x) >> 16) & 0x1) #define C_028A4C_PS_ITER_SAMPLE 0xFFFEFFFF -#define S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISC(x) (((x) & 0x1) << 17) -#define G_028A4C_MULTI_SHADER_ENGINE_PRIM_DISC(x) (((x) >> 17) & 0x1) -#define C_028A4C_MULTI_SHADER_ENGINE_PRIM_DISC 0xFFFDFFFF +#define S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(x) (((x) & 0x1) << 17) +#define G_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(x) (((x) >> 17) & 0x1) +#define C_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE 0xFFFDFFFF +#define S_028A4C_MULTI_GPU_SUPERTILE_ENABLE(x) (((x) & 0x1) << 18) +#define G_028A4C_MULTI_GPU_SUPERTILE_ENABLE(x) (((x) >> 18) & 0x1) +#define C_028A4C_MULTI_GPU_SUPERTILE_ENABLE 0xFFFBFFFF +#define S_028A4C_GPU_ID_OVERRIDE_ENABLE(x) (((x) & 0x1) << 19) +#define G_028A4C_GPU_ID_OVERRIDE_ENABLE(x) (((x) >> 19) & 0x1) +#define C_028A4C_GPU_ID_OVERRIDE_ENABLE 0xFFF7FFFF +#define S_028A4C_GPU_ID_OVERRIDE(x) (((x) & 0x0F) << 20) +#define G_028A4C_GPU_ID_OVERRIDE(x) (((x) >> 20) & 0x0F) +#define C_028A4C_GPU_ID_OVERRIDE 0xFF0FFFFF +#define S_028A4C_MULTI_GPU_PRIM_DISCARD_ENABLE(x) (((x) & 0x1) << 24) +#define G_028A4C_MULTI_GPU_PRIM_DISCARD_ENABLE(x) (((x) >> 24) & 0x1) +#define C_028A4C_MULTI_GPU_PRIM_DISCARD_ENABLE 0xFEFFFFFF #define S_028A4C_FORCE_EOV_CNTDWN_ENABLE(x) (((x) & 0x1) << 25) #define G_028A4C_FORCE_EOV_CNTDWN_ENABLE(x) (((x) >> 25) & 0x1) #define C_028A4C_FORCE_EOV_CNTDWN_ENABLE 0xFDFFFFFF @@ -7515,6 +9443,7 @@ #define C_028A7C_INDEX_TYPE 0xFFFFFFFC #define V_028A7C_VGT_INDEX_16 0x00 #define V_028A7C_VGT_INDEX_32 0x01 +#define V_028A7C_VGT_INDEX_8 0x02 /* VI */ #define S_028A7C_SWAP_MODE(x) (((x) & 0x03) << 2) #define G_028A7C_SWAP_MODE(x) (((x) >> 2) & 0x03) #define C_028A7C_SWAP_MODE 0xFFFFFFF3 @@ -7544,6 +9473,12 @@ #define G_028A7C_REQ_PATH(x) (((x) >> 10) & 0x1) #define C_028A7C_REQ_PATH 0xFFFFFBFF /* */ +/* VI */ +#define S_028A7C_MTYPE(x) (((x) & 0x03) << 11) +#define G_028A7C_MTYPE(x) (((x) >> 11) & 0x03) +#define C_028A7C_MTYPE 0xFFFFE7FF +/* */ +#define R_028A80_WD_ENHANCE 0x028A80 #define R_028A84_VGT_PRIMITIVEID_EN 0x028A84 #define S_028A84_PRIMITIVEID_EN(x) (((x) & 0x1) << 0) #define G_028A84_PRIMITIVEID_EN(x) (((x) >> 0) & 0x1) @@ -7642,6 +9577,10 @@ #define S_028AA8_WD_SWITCH_ON_EOP(x) (((x) & 0x1) << 20) #define G_028AA8_WD_SWITCH_ON_EOP(x) (((x) >> 20) & 0x1) #define C_028AA8_WD_SWITCH_ON_EOP 0xFFEFFFFF +/* VI */ +#define S_028AA8_MAX_PRIMGRP_IN_WAVE(x) (((x) & 0x0F) << 28) +#define G_028AA8_MAX_PRIMGRP_IN_WAVE(x) (((x) >> 28) & 0x0F) +#define C_028AA8_MAX_PRIMGRP_IN_WAVE 0x0FFFFFFF /* */ #define R_028AAC_VGT_ESGS_RING_ITEMSIZE 0x028AAC #define S_028AAC_ITEMSIZE(x) (((x) & 0x7FFF) << 0) @@ -7681,6 +9620,11 @@ #define 
S_028ABC_DST_OUTSIDE_ZERO_TO_ONE(x) (((x) & 0x1) << 16) #define G_028ABC_DST_OUTSIDE_ZERO_TO_ONE(x) (((x) >> 16) & 0x1) #define C_028ABC_DST_OUTSIDE_ZERO_TO_ONE 0xFFFEFFFF +/* VI */ +#define S_028ABC_TC_COMPATIBLE(x) (((x) & 0x1) << 17) +#define G_028ABC_TC_COMPATIBLE(x) (((x) >> 17) & 0x1) +#define C_028ABC_TC_COMPATIBLE 0xFFFDFFFF +/* */ #define R_028AC0_DB_SRESULTS_COMPARE_STATE0 0x028AC0 #define S_028AC0_COMPAREFUNC0(x) (((x) & 0x07) << 0) #define G_028AC0_COMPAREFUNC0(x) (((x) >> 0) & 0x07) @@ -7770,6 +9714,21 @@ #define S_028B38_MAX_VERT_OUT(x) (((x) & 0x7FF) << 0) #define G_028B38_MAX_VERT_OUT(x) (((x) >> 0) & 0x7FF) #define C_028B38_MAX_VERT_OUT 0xFFFFF800 +/* VI */ +#define R_028B50_VGT_TESS_DISTRIBUTION 0x028B50 +#define S_028B50_ACCUM_ISOLINE(x) (((x) & 0xFF) << 0) +#define G_028B50_ACCUM_ISOLINE(x) (((x) >> 0) & 0xFF) +#define C_028B50_ACCUM_ISOLINE 0xFFFFFF00 +#define S_028B50_ACCUM_TRI(x) (((x) & 0xFF) << 8) +#define G_028B50_ACCUM_TRI(x) (((x) >> 8) & 0xFF) +#define C_028B50_ACCUM_TRI 0xFFFF00FF +#define S_028B50_ACCUM_QUAD(x) (((x) & 0xFF) << 16) +#define G_028B50_ACCUM_QUAD(x) (((x) >> 16) & 0xFF) +#define C_028B50_ACCUM_QUAD 0xFF00FFFF +#define S_028B50_DONUT_SPLIT(x) (((x) & 0xFF) << 24) +#define G_028B50_DONUT_SPLIT(x) (((x) >> 24) & 0xFF) +#define C_028B50_DONUT_SPLIT 0x00FFFFFF +/* */ #define R_028B54_VGT_SHADER_STAGES_EN 0x028B54 #define S_028B54_LS_EN(x) (((x) & 0x03) << 0) #define G_028B54_LS_EN(x) (((x) >> 0) & 0x03) @@ -7798,6 +9757,20 @@ #define S_028B54_DYNAMIC_HS(x) (((x) & 0x1) << 8) #define G_028B54_DYNAMIC_HS(x) (((x) >> 8) & 0x1) #define C_028B54_DYNAMIC_HS 0xFFFFFEFF +/* VI */ +#define S_028B54_DISPATCH_DRAW_EN(x) (((x) & 0x1) << 9) +#define G_028B54_DISPATCH_DRAW_EN(x) (((x) >> 9) & 0x1) +#define C_028B54_DISPATCH_DRAW_EN 0xFFFFFDFF +#define S_028B54_DIS_DEALLOC_ACCUM_0(x) (((x) & 0x1) << 10) +#define G_028B54_DIS_DEALLOC_ACCUM_0(x) (((x) >> 10) & 0x1) +#define C_028B54_DIS_DEALLOC_ACCUM_0 0xFFFFFBFF +#define S_028B54_DIS_DEALLOC_ACCUM_1(x) (((x) & 0x1) << 11) +#define G_028B54_DIS_DEALLOC_ACCUM_1(x) (((x) >> 11) & 0x1) +#define C_028B54_DIS_DEALLOC_ACCUM_1 0xFFFFF7FF +#define S_028B54_VS_WAVE_ID_EN(x) (((x) & 0x1) << 12) +#define G_028B54_VS_WAVE_ID_EN(x) (((x) >> 12) & 0x1) +#define C_028B54_VS_WAVE_ID_EN 0xFFFFEFFF +/* */ #define R_028B58_VGT_LS_HS_CONFIG 0x028B58 #define S_028B58_NUM_PATCHES(x) (((x) & 0xFF) << 0) #define G_028B58_NUM_PATCHES(x) (((x) >> 0) & 0xFF) @@ -7848,6 +9821,9 @@ #define S_028B6C_RESERVED_REDUC_AXIS(x) (((x) & 0x1) << 8) /* not on CIK */ #define G_028B6C_RESERVED_REDUC_AXIS(x) (((x) >> 8) & 0x1) /* not on CIK */ #define C_028B6C_RESERVED_REDUC_AXIS 0xFFFFFEFF /* not on CIK */ +#define S_028B6C_DEPRECATED(x) (((x) & 0x1) << 9) +#define G_028B6C_DEPRECATED(x) (((x) >> 9) & 0x1) +#define C_028B6C_DEPRECATED 0xFFFFFDFF #define S_028B6C_NUM_DS_WAVES_PER_SIMD(x) (((x) & 0x0F) << 10) #define G_028B6C_NUM_DS_WAVES_PER_SIMD(x) (((x) >> 10) & 0x0F) #define C_028B6C_NUM_DS_WAVES_PER_SIMD 0xFFFFC3FF @@ -7862,6 +9838,14 @@ #define V_028B6C_VGT_POLICY_STREAM 0x01 #define V_028B6C_VGT_POLICY_BYPASS 0x02 /* */ +/* VI */ +#define S_028B6C_DISTRIBUTION_MODE(x) (((x) & 0x03) << 17) +#define G_028B6C_DISTRIBUTION_MODE(x) (((x) >> 17) & 0x03) +#define C_028B6C_DISTRIBUTION_MODE 0xFFF9FFFF +#define S_028B6C_MTYPE(x) (((x) & 0x03) << 19) +#define G_028B6C_MTYPE(x) (((x) >> 19) & 0x03) +#define C_028B6C_MTYPE 0xFFE7FFFF +/* */ #define R_028B70_DB_ALPHA_TO_MASK 0x028B70 #define S_028B70_ALPHA_TO_MASK_ENABLE(x) (((x) & 0x1) << 0) #define 
G_028B70_ALPHA_TO_MASK_ENABLE(x) (((x) >> 0) & 0x1) @@ -8001,6 +9985,22 @@ #define S_028BDC_DX10_DIAMOND_TEST_ENA(x) (((x) & 0x1) << 12) #define G_028BDC_DX10_DIAMOND_TEST_ENA(x) (((x) >> 12) & 0x1) #define C_028BDC_DX10_DIAMOND_TEST_ENA 0xFFFFEFFF +#define R_028BE0_PA_SC_AA_CONFIG 0x028BE0 +#define S_028BE0_MSAA_NUM_SAMPLES(x) (((x) & 0x7) << 0) +#define G_028BE0_MSAA_NUM_SAMPLES(x) (((x) >> 0) & 0x07) +#define C_028BE0_MSAA_NUM_SAMPLES 0xFFFFFFF8 +#define S_028BE0_AA_MASK_CENTROID_DTMN(x) (((x) & 0x1) << 4) +#define G_028BE0_AA_MASK_CENTROID_DTMN(x) (((x) >> 4) & 0x1) +#define C_028BE0_AA_MASK_CENTROID_DTMN 0xFFFFFFEF +#define S_028BE0_MAX_SAMPLE_DIST(x) (((x) & 0xf) << 13) +#define G_028BE0_MAX_SAMPLE_DIST(x) (((x) >> 13) & 0x0F) +#define C_028BE0_MAX_SAMPLE_DIST 0xFFFE1FFF +#define S_028BE0_MSAA_EXPOSED_SAMPLES(x) (((x) & 0x7) << 20) +#define G_028BE0_MSAA_EXPOSED_SAMPLES(x) (((x) >> 20) & 0x07) +#define C_028BE0_MSAA_EXPOSED_SAMPLES 0xFF8FFFFF +#define S_028BE0_DETAIL_TO_EXPOSED_MODE(x) (((x) & 0x3) << 24) +#define G_028BE0_DETAIL_TO_EXPOSED_MODE(x) (((x) >> 24) & 0x03) +#define C_028BE0_DETAIL_TO_EXPOSED_MODE 0xFCFFFFFF #define R_028BE4_PA_SU_VTX_CNTL 0x028BE4 #define S_028BE4_PIX_CENTER(x) (((x) & 0x1) << 0) #define G_028BE4_PIX_CENTER(x) (((x) >> 0) & 0x1) @@ -8569,6 +10569,17 @@ #define G_028C70_FMASK_COMPRESSION_DISABLE(x) (((x) >> 26) & 0x1) #define C_028C70_FMASK_COMPRESSION_DISABLE 0xFBFFFFFF /* */ +/* VI */ +#define S_028C70_FMASK_COMPRESS_1FRAG_ONLY(x) (((x) & 0x1) << 27) +#define G_028C70_FMASK_COMPRESS_1FRAG_ONLY(x) (((x) >> 27) & 0x1) +#define C_028C70_FMASK_COMPRESS_1FRAG_ONLY 0xF7FFFFFF +#define S_028C70_DCC_ENABLE(x) (((x) & 0x1) << 28) +#define G_028C70_DCC_ENABLE(x) (((x) >> 28) & 0x1) +#define C_028C70_DCC_ENABLE 0xEFFFFFFF +#define S_028C70_CMASK_ADDR_TYPE(x) (((x) & 0x03) << 29) +#define G_028C70_CMASK_ADDR_TYPE(x) (((x) >> 29) & 0x03) +#define C_028C70_CMASK_ADDR_TYPE 0x9FFFFFFF +/* */ #define R_028C74_CB_COLOR0_ATTRIB 0x028C74 #define S_028C74_TILE_MODE_INDEX(x) (((x) & 0x1F) << 0) #define G_028C74_TILE_MODE_INDEX(x) (((x) >> 0) & 0x1F) @@ -8576,7 +10587,9 @@ #define S_028C74_FMASK_TILE_MODE_INDEX(x) (((x) & 0x1F) << 5) #define G_028C74_FMASK_TILE_MODE_INDEX(x) (((x) >> 5) & 0x1F) #define C_028C74_FMASK_TILE_MODE_INDEX 0xFFFFFC1F -#define S_028C74_FMASK_BANK_HEIGHT(x) (((x) & 0x3) << 10) /* SI errata */ +#define S_028C74_FMASK_BANK_HEIGHT(x) (((x) & 0x03) << 10) +#define G_028C74_FMASK_BANK_HEIGHT(x) (((x) >> 10) & 0x03) +#define C_028C74_FMASK_BANK_HEIGHT 0xFFFFF3FF #define S_028C74_NUM_SAMPLES(x) (((x) & 0x07) << 12) #define G_028C74_NUM_SAMPLES(x) (((x) >> 12) & 0x07) #define C_028C74_NUM_SAMPLES 0xFFFF8FFF @@ -8586,6 +10599,36 @@ #define S_028C74_FORCE_DST_ALPHA_1(x) (((x) & 0x1) << 17) #define G_028C74_FORCE_DST_ALPHA_1(x) (((x) >> 17) & 0x1) #define C_028C74_FORCE_DST_ALPHA_1 0xFFFDFFFF +/* VI */ +#define R_028C78_CB_COLOR0_DCC_CONTROL 0x028C78 +#define S_028C78_OVERWRITE_COMBINER_DISABLE(x) (((x) & 0x1) << 0) +#define G_028C78_OVERWRITE_COMBINER_DISABLE(x) (((x) >> 0) & 0x1) +#define C_028C78_OVERWRITE_COMBINER_DISABLE 0xFFFFFFFE +#define S_028C78_KEY_CLEAR_ENABLE(x) (((x) & 0x1) << 1) +#define G_028C78_KEY_CLEAR_ENABLE(x) (((x) >> 1) & 0x1) +#define C_028C78_KEY_CLEAR_ENABLE 0xFFFFFFFD +#define S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(x) (((x) & 0x03) << 2) +#define G_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(x) (((x) >> 2) & 0x03) +#define C_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE 0xFFFFFFF3 +#define S_028C78_MIN_COMPRESSED_BLOCK_SIZE(x) (((x) & 0x1) << 4) +#define 
G_028C78_MIN_COMPRESSED_BLOCK_SIZE(x) (((x) >> 4) & 0x1) +#define C_028C78_MIN_COMPRESSED_BLOCK_SIZE 0xFFFFFFEF +#define S_028C78_MAX_COMPRESSED_BLOCK_SIZE(x) (((x) & 0x03) << 5) +#define G_028C78_MAX_COMPRESSED_BLOCK_SIZE(x) (((x) >> 5) & 0x03) +#define C_028C78_MAX_COMPRESSED_BLOCK_SIZE 0xFFFFFF9F +#define S_028C78_COLOR_TRANSFORM(x) (((x) & 0x03) << 7) +#define G_028C78_COLOR_TRANSFORM(x) (((x) >> 7) & 0x03) +#define C_028C78_COLOR_TRANSFORM 0xFFFFFE7F +#define S_028C78_INDEPENDENT_64B_BLOCKS(x) (((x) & 0x1) << 9) +#define G_028C78_INDEPENDENT_64B_BLOCKS(x) (((x) >> 9) & 0x1) +#define C_028C78_INDEPENDENT_64B_BLOCKS 0xFFFFFDFF +#define S_028C78_LOSSY_RGB_PRECISION(x) (((x) & 0x0F) << 10) +#define G_028C78_LOSSY_RGB_PRECISION(x) (((x) >> 10) & 0x0F) +#define C_028C78_LOSSY_RGB_PRECISION 0xFFFFC3FF +#define S_028C78_LOSSY_ALPHA_PRECISION(x) (((x) & 0x0F) << 14) +#define G_028C78_LOSSY_ALPHA_PRECISION(x) (((x) >> 14) & 0x0F) +#define C_028C78_LOSSY_ALPHA_PRECISION 0xFFFC3FFF +/* */ #define R_028C7C_CB_COLOR0_CMASK 0x028C7C #define R_028C80_CB_COLOR0_CMASK_SLICE 0x028C80 #define S_028C80_TILE_MAX(x) (((x) & 0x3FFF) << 0) @@ -8598,90 +10641,105 @@ #define C_028C88_TILE_MAX 0xFFC00000 #define R_028C8C_CB_COLOR0_CLEAR_WORD0 0x028C8C #define R_028C90_CB_COLOR0_CLEAR_WORD1 0x028C90 +#define R_028C94_CB_COLOR0_DCC_BASE 0x028C94 /* VI */ #define R_028C9C_CB_COLOR1_BASE 0x028C9C #define R_028CA0_CB_COLOR1_PITCH 0x028CA0 #define R_028CA4_CB_COLOR1_SLICE 0x028CA4 #define R_028CA8_CB_COLOR1_VIEW 0x028CA8 #define R_028CAC_CB_COLOR1_INFO 0x028CAC #define R_028CB0_CB_COLOR1_ATTRIB 0x028CB0 -#define R_028CD4_CB_COLOR1_CMASK 0x028CB8 +#define R_028CB4_CB_COLOR1_DCC_CONTROL 0x028CB4 /* VI */ +#define R_028CB8_CB_COLOR1_CMASK 0x028CB8 #define R_028CBC_CB_COLOR1_CMASK_SLICE 0x028CBC #define R_028CC0_CB_COLOR1_FMASK 0x028CC0 #define R_028CC4_CB_COLOR1_FMASK_SLICE 0x028CC4 #define R_028CC8_CB_COLOR1_CLEAR_WORD0 0x028CC8 #define R_028CCC_CB_COLOR1_CLEAR_WORD1 0x028CCC +#define R_028CD0_CB_COLOR1_DCC_BASE 0x028CD0 /* VI */ #define R_028CD8_CB_COLOR2_BASE 0x028CD8 #define R_028CDC_CB_COLOR2_PITCH 0x028CDC #define R_028CE0_CB_COLOR2_SLICE 0x028CE0 #define R_028CE4_CB_COLOR2_VIEW 0x028CE4 #define R_028CE8_CB_COLOR2_INFO 0x028CE8 #define R_028CEC_CB_COLOR2_ATTRIB 0x028CEC +#define R_028CF0_CB_COLOR2_DCC_CONTROL 0x028CF0 /* VI */ #define R_028CF4_CB_COLOR2_CMASK 0x028CF4 #define R_028CF8_CB_COLOR2_CMASK_SLICE 0x028CF8 #define R_028CFC_CB_COLOR2_FMASK 0x028CFC #define R_028D00_CB_COLOR2_FMASK_SLICE 0x028D00 #define R_028D04_CB_COLOR2_CLEAR_WORD0 0x028D04 #define R_028D08_CB_COLOR2_CLEAR_WORD1 0x028D08 +#define R_028D0C_CB_COLOR2_DCC_BASE 0x028D0C /* VI */ #define R_028D14_CB_COLOR3_BASE 0x028D14 #define R_028D18_CB_COLOR3_PITCH 0x028D18 #define R_028D1C_CB_COLOR3_SLICE 0x028D1C #define R_028D20_CB_COLOR3_VIEW 0x028D20 #define R_028D24_CB_COLOR3_INFO 0x028D24 #define R_028D28_CB_COLOR3_ATTRIB 0x028D28 +#define R_028D2C_CB_COLOR3_DCC_CONTROL 0x028D2C /* VI */ #define R_028D30_CB_COLOR3_CMASK 0x028D30 #define R_028D34_CB_COLOR3_CMASK_SLICE 0x028D34 #define R_028D38_CB_COLOR3_FMASK 0x028D38 #define R_028D3C_CB_COLOR3_FMASK_SLICE 0x028D3C #define R_028D40_CB_COLOR3_CLEAR_WORD0 0x028D40 #define R_028D44_CB_COLOR3_CLEAR_WORD1 0x028D44 +#define R_028D48_CB_COLOR3_DCC_BASE 0x028D48 /* VI */ #define R_028D50_CB_COLOR4_BASE 0x028D50 #define R_028D54_CB_COLOR4_PITCH 0x028D54 #define R_028D58_CB_COLOR4_SLICE 0x028D58 #define R_028D5C_CB_COLOR4_VIEW 0x028D5C #define R_028D60_CB_COLOR4_INFO 0x028D60 #define R_028D64_CB_COLOR4_ATTRIB 
0x028D64 +#define R_028D68_CB_COLOR4_DCC_CONTROL 0x028D68 /* VI */ #define R_028D6C_CB_COLOR4_CMASK 0x028D6C #define R_028D70_CB_COLOR4_CMASK_SLICE 0x028D70 #define R_028D74_CB_COLOR4_FMASK 0x028D74 #define R_028D78_CB_COLOR4_FMASK_SLICE 0x028D78 #define R_028D7C_CB_COLOR4_CLEAR_WORD0 0x028D7C #define R_028D80_CB_COLOR4_CLEAR_WORD1 0x028D80 +#define R_028D84_CB_COLOR4_DCC_BASE 0x028D84 /* VI */ #define R_028D8C_CB_COLOR5_BASE 0x028D8C #define R_028D90_CB_COLOR5_PITCH 0x028D90 #define R_028D94_CB_COLOR5_SLICE 0x028D94 #define R_028D98_CB_COLOR5_VIEW 0x028D98 #define R_028D9C_CB_COLOR5_INFO 0x028D9C #define R_028DA0_CB_COLOR5_ATTRIB 0x028DA0 +#define R_028DA4_CB_COLOR5_DCC_CONTROL 0x028DA4 /* VI */ #define R_028DA8_CB_COLOR5_CMASK 0x028DA8 #define R_028DAC_CB_COLOR5_CMASK_SLICE 0x028DAC #define R_028DB0_CB_COLOR5_FMASK 0x028DB0 #define R_028DB4_CB_COLOR5_FMASK_SLICE 0x028DB4 #define R_028DB8_CB_COLOR5_CLEAR_WORD0 0x028DB8 #define R_028DBC_CB_COLOR5_CLEAR_WORD1 0x028DBC +#define R_028DC0_CB_COLOR5_DCC_BASE 0x028DC0 /* VI */ #define R_028DC8_CB_COLOR6_BASE 0x028DC8 #define R_028DCC_CB_COLOR6_PITCH 0x028DCC #define R_028DD0_CB_COLOR6_SLICE 0x028DD0 #define R_028DD4_CB_COLOR6_VIEW 0x028DD4 #define R_028DD8_CB_COLOR6_INFO 0x028DD8 #define R_028DDC_CB_COLOR6_ATTRIB 0x028DDC +#define R_028DE0_CB_COLOR6_DCC_CONTROL 0x028DE0 /* VI */ #define R_028DE4_CB_COLOR6_CMASK 0x028DE4 #define R_028DE8_CB_COLOR6_CMASK_SLICE 0x028DE8 #define R_028DEC_CB_COLOR6_FMASK 0x028DEC #define R_028DF0_CB_COLOR6_FMASK_SLICE 0x028DF0 #define R_028DF4_CB_COLOR6_CLEAR_WORD0 0x028DF4 #define R_028DF8_CB_COLOR6_CLEAR_WORD1 0x028DF8 +#define R_028DFC_CB_COLOR6_DCC_BASE 0x028DFC /* VI */ #define R_028E04_CB_COLOR7_BASE 0x028E04 #define R_028E08_CB_COLOR7_PITCH 0x028E08 #define R_028E0C_CB_COLOR7_SLICE 0x028E0C #define R_028E10_CB_COLOR7_VIEW 0x028E10 #define R_028E14_CB_COLOR7_INFO 0x028E14 #define R_028E18_CB_COLOR7_ATTRIB 0x028E18 +#define R_028E1C_CB_COLOR7_DCC_CONTROL 0x028E1C /* VI */ #define R_028E20_CB_COLOR7_CMASK 0x028E20 #define R_028E24_CB_COLOR7_CMASK_SLICE 0x028E24 #define R_028E28_CB_COLOR7_FMASK 0x028E28 #define R_028E2C_CB_COLOR7_FMASK_SLICE 0x028E2C #define R_028E30_CB_COLOR7_CLEAR_WORD0 0x028E30 #define R_028E34_CB_COLOR7_CLEAR_WORD1 0x028E34 +#define R_028E38_CB_COLOR7_DCC_BASE 0x028E38 /* VI */ /* SI async DMA packets */ #define SI_DMA_PACKET(cmd, sub_cmd, n) ((((cmd) & 0xF) << 28) | \ diff --git a/src/gallium/drivers/rbug/rbug_context.h b/src/gallium/drivers/rbug/rbug_context.h index 5e7b9d4dee4..e99f6edc523 100644 --- a/src/gallium/drivers/rbug/rbug_context.h +++ b/src/gallium/drivers/rbug/rbug_context.h @@ -79,7 +79,7 @@ struct rbug_context { struct rbug_list shaders; }; -static INLINE struct rbug_context * +static inline struct rbug_context * rbug_context(struct pipe_context *pipe) { return (struct rbug_context *)pipe; diff --git a/src/gallium/drivers/rbug/rbug_objects.h b/src/gallium/drivers/rbug/rbug_objects.h index 3fba3334228..02973e07996 100644 --- a/src/gallium/drivers/rbug/rbug_objects.h +++ b/src/gallium/drivers/rbug/rbug_objects.h @@ -93,7 +93,7 @@ struct rbug_transfer }; -static INLINE struct rbug_resource * +static inline struct rbug_resource * rbug_resource(struct pipe_resource *_resource) { if (!_resource) @@ -102,7 +102,7 @@ rbug_resource(struct pipe_resource *_resource) return (struct rbug_resource *)_resource; } -static INLINE struct rbug_sampler_view * +static inline struct rbug_sampler_view * rbug_sampler_view(struct pipe_sampler_view *_sampler_view) { if (!_sampler_view) @@ -111,7 +111,7 
@@ rbug_sampler_view(struct pipe_sampler_view *_sampler_view) return (struct rbug_sampler_view *)_sampler_view; } -static INLINE struct rbug_surface * +static inline struct rbug_surface * rbug_surface(struct pipe_surface *_surface) { if (!_surface) @@ -120,7 +120,7 @@ rbug_surface(struct pipe_surface *_surface) return (struct rbug_surface *)_surface; } -static INLINE struct rbug_transfer * +static inline struct rbug_transfer * rbug_transfer(struct pipe_transfer *_transfer) { if (!_transfer) @@ -129,7 +129,7 @@ rbug_transfer(struct pipe_transfer *_transfer) return (struct rbug_transfer *)_transfer; } -static INLINE struct rbug_shader * +static inline struct rbug_shader * rbug_shader(void *_state) { if (!_state) @@ -137,7 +137,7 @@ rbug_shader(void *_state) return (struct rbug_shader *)_state; } -static INLINE struct pipe_resource * +static inline struct pipe_resource * rbug_resource_unwrap(struct pipe_resource *_resource) { if (!_resource) @@ -145,7 +145,7 @@ rbug_resource_unwrap(struct pipe_resource *_resource) return rbug_resource(_resource)->resource; } -static INLINE struct pipe_sampler_view * +static inline struct pipe_sampler_view * rbug_sampler_view_unwrap(struct pipe_sampler_view *_sampler_view) { if (!_sampler_view) @@ -153,7 +153,7 @@ rbug_sampler_view_unwrap(struct pipe_sampler_view *_sampler_view) return rbug_sampler_view(_sampler_view)->sampler_view; } -static INLINE struct pipe_surface * +static inline struct pipe_surface * rbug_surface_unwrap(struct pipe_surface *_surface) { if (!_surface) @@ -161,7 +161,7 @@ rbug_surface_unwrap(struct pipe_surface *_surface) return rbug_surface(_surface)->surface; } -static INLINE struct pipe_transfer * +static inline struct pipe_transfer * rbug_transfer_unwrap(struct pipe_transfer *_transfer) { if (!_transfer) @@ -169,7 +169,7 @@ rbug_transfer_unwrap(struct pipe_transfer *_transfer) return rbug_transfer(_transfer)->transfer; } -static INLINE void * +static inline void * rbug_shader_unwrap(void *_state) { struct rbug_shader *shader; diff --git a/src/gallium/drivers/rbug/rbug_screen.c b/src/gallium/drivers/rbug/rbug_screen.c index d5a3164e217..7da4e81560a 100644 --- a/src/gallium/drivers/rbug/rbug_screen.c +++ b/src/gallium/drivers/rbug/rbug_screen.c @@ -226,17 +226,6 @@ rbug_screen_fence_reference(struct pipe_screen *_screen, } static boolean -rbug_screen_fence_signalled(struct pipe_screen *_screen, - struct pipe_fence_handle *fence) -{ - struct rbug_screen *rb_screen = rbug_screen(_screen); - struct pipe_screen *screen = rb_screen->screen; - - return screen->fence_signalled(screen, - fence); -} - -static boolean rbug_screen_fence_finish(struct pipe_screen *_screen, struct pipe_fence_handle *fence, uint64_t timeout) @@ -288,7 +277,6 @@ rbug_screen_create(struct pipe_screen *screen) rb_screen->base.resource_destroy = rbug_screen_resource_destroy; rb_screen->base.flush_frontbuffer = rbug_screen_flush_frontbuffer; rb_screen->base.fence_reference = rbug_screen_fence_reference; - rb_screen->base.fence_signalled = rbug_screen_fence_signalled; rb_screen->base.fence_finish = rbug_screen_fence_finish; rb_screen->screen = screen; diff --git a/src/gallium/drivers/rbug/rbug_screen.h b/src/gallium/drivers/rbug/rbug_screen.h index a53afac05e9..fd92374beda 100644 --- a/src/gallium/drivers/rbug/rbug_screen.h +++ b/src/gallium/drivers/rbug/rbug_screen.h @@ -60,7 +60,7 @@ struct rbug_screen struct rbug_list transfers; }; -static INLINE struct rbug_screen * +static inline struct rbug_screen * rbug_screen(struct pipe_screen *screen) { return (struct rbug_screen 
*)screen; diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h index 50a73369c1d..577df814b29 100644 --- a/src/gallium/drivers/softpipe/sp_context.h +++ b/src/gallium/drivers/softpipe/sp_context.h @@ -203,7 +203,7 @@ struct softpipe_context { }; -static INLINE struct softpipe_context * +static inline struct softpipe_context * softpipe_context( struct pipe_context *pipe ) { return (struct softpipe_context *)pipe; diff --git a/src/gallium/drivers/softpipe/sp_fence.c b/src/gallium/drivers/softpipe/sp_fence.c index c2897ed1ef8..6168236ec96 100644 --- a/src/gallium/drivers/softpipe/sp_fence.c +++ b/src/gallium/drivers/softpipe/sp_fence.c @@ -41,15 +41,6 @@ softpipe_fence_reference(struct pipe_screen *screen, static boolean -softpipe_fence_signalled(struct pipe_screen *screen, - struct pipe_fence_handle *fence) -{ - assert(fence); - return TRUE; -} - - -static boolean softpipe_fence_finish(struct pipe_screen *screen, struct pipe_fence_handle *fence, uint64_t timeout) @@ -64,5 +55,4 @@ softpipe_init_screen_fence_funcs(struct pipe_screen *screen) { screen->fence_reference = softpipe_fence_reference; screen->fence_finish = softpipe_fence_finish; - screen->fence_signalled = softpipe_fence_signalled; } diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c index 369ab6ed8d4..89411777ec9 100644 --- a/src/gallium/drivers/softpipe/sp_fs_exec.c +++ b/src/gallium/drivers/softpipe/sp_fs_exec.c @@ -52,7 +52,7 @@ struct sp_exec_fragment_shader /** cast wrapper */ -static INLINE struct sp_exec_fragment_shader * +static inline struct sp_exec_fragment_shader * sp_exec_fragment_shader(const struct sp_fragment_shader_variant *var) { return (struct sp_exec_fragment_shader *) var; diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c index 18eca611669..f8a3eacdb37 100644 --- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c +++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c @@ -145,7 +145,7 @@ sp_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim) } -static INLINE cptrf4 get_vert( const void *vertex_buffer, +static inline cptrf4 get_vert( const void *vertex_buffer, int index, int stride ) { diff --git a/src/gallium/drivers/softpipe/sp_quad_blend.c b/src/gallium/drivers/softpipe/sp_quad_blend.c index a32bd7fd241..5b458450cd8 100644 --- a/src/gallium/drivers/softpipe/sp_quad_blend.c +++ b/src/gallium/drivers/softpipe/sp_quad_blend.c @@ -63,7 +63,7 @@ struct blend_quad_stage /** cast wrapper */ -static INLINE struct blend_quad_stage * +static inline struct blend_quad_stage * blend_quad_stage(struct quad_stage *stage) { return (struct blend_quad_stage *) stage; diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c index 82c58d04527..395bc70f2cf 100644 --- a/src/gallium/drivers/softpipe/sp_quad_fs.c +++ b/src/gallium/drivers/softpipe/sp_quad_fs.c @@ -56,7 +56,7 @@ struct quad_shade_stage /** cast wrapper */ -static INLINE struct quad_shade_stage * +static inline struct quad_shade_stage * quad_shade_stage(struct quad_stage *qs) { return (struct quad_shade_stage *) qs; @@ -67,7 +67,7 @@ quad_shade_stage(struct quad_stage *qs) * Execute fragment shader for the four fragments in the quad. 
* \return TRUE if quad is alive, FALSE if all four pixels are killed */ -static INLINE boolean +static inline boolean shade_quad(struct quad_stage *qs, struct quad_header *quad) { struct softpipe_context *softpipe = qs->softpipe; diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index a688d319bb8..0bfd9c3578c 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -234,6 +234,8 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: return 1; case PIPE_CAP_CLIP_HALFZ: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: return 1; case PIPE_CAP_VERTEXID_NOBASE: return 0; @@ -242,6 +244,8 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 0; } /* should only get here on unhandled cases */ diff --git a/src/gallium/drivers/softpipe/sp_screen.h b/src/gallium/drivers/softpipe/sp_screen.h index d39e9f48e80..f0e929111c2 100644 --- a/src/gallium/drivers/softpipe/sp_screen.h +++ b/src/gallium/drivers/softpipe/sp_screen.h @@ -49,7 +49,7 @@ struct softpipe_screen { boolean use_llvm; }; -static INLINE struct softpipe_screen * +static inline struct softpipe_screen * softpipe_screen( struct pipe_screen *pipe ) { return (struct softpipe_screen *)pipe; diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c index 6704015112b..ff3cb9fe5e1 100644 --- a/src/gallium/drivers/softpipe/sp_setup.c +++ b/src/gallium/drivers/softpipe/sp_setup.c @@ -125,7 +125,7 @@ struct setup_context { /** * Clip setup->quad against the scissor/surface bounds. */ -static INLINE void +static inline void quad_clip(struct setup_context *setup, struct quad_header *quad) { const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect; @@ -156,7 +156,7 @@ quad_clip(struct setup_context *setup, struct quad_header *quad) /** * Emit a quad (pass to next stage) with clipping. */ -static INLINE void +static inline void clip_emit_quad(struct setup_context *setup, struct quad_header *quad) { quad_clip( setup, quad ); @@ -178,14 +178,14 @@ clip_emit_quad(struct setup_context *setup, struct quad_header *quad) * Given an X or Y coordinate, return the block/quad coordinate that it * belongs to. */ -static INLINE int +static inline int block(int x) { return x & ~(2-1); } -static INLINE int +static inline int block_x(int x) { return x & ~(16-1); @@ -1039,7 +1039,7 @@ setup_line_coefficients(struct setup_context *setup, /** * Plot a pixel in a line segment. */ -static INLINE void +static inline void plot(struct setup_context *setup, int x, int y) { const int iy = y & 1; diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c index 1010b63de2c..565fca632c6 100644 --- a/src/gallium/drivers/softpipe/sp_tex_sample.c +++ b/src/gallium/drivers/softpipe/sp_tex_sample.c @@ -58,7 +58,7 @@ * of improperly weighted linear-filtered textures. * The tests/texwrap.c demo is a good test. 
*/ -static INLINE float +static inline float frac(float f) { return f - floorf(f); @@ -69,7 +69,7 @@ frac(float f) /** * Linear interpolation macro */ -static INLINE float +static inline float lerp(float a, float v0, float v1) { return v0 + a * (v1 - v0); @@ -84,7 +84,7 @@ lerp(float a, float v0, float v1) * optimization! If we find that's not true on some systems, convert * to a macro. */ -static INLINE float +static inline float lerp_2d(float a, float b, float v00, float v10, float v01, float v11) { @@ -97,7 +97,7 @@ lerp_2d(float a, float b, /** * As above, but 3D interpolation of 8 values. */ -static INLINE float +static inline float lerp_3d(float a, float b, float c, float v000, float v100, float v010, float v110, float v001, float v101, float v011, float v111) @@ -115,7 +115,7 @@ lerp_3d(float a, float b, float c, * value. To avoid that problem we add a large multiple of the size * (rather than using a conditional). */ -static INLINE int +static inline int repeat(int coord, unsigned size) { return (coord + size * 1024) % size; @@ -486,7 +486,7 @@ wrap_linear_unorm_clamp_to_edge(float s, unsigned size, int offset, /** * Do coordinate to array index conversion. For array textures. */ -static INLINE int +static inline int coord_to_layer(float coord, unsigned first_layer, unsigned last_layer) { int c = util_ifloor(coord + 0.5F); @@ -587,7 +587,7 @@ compute_lambda_vert(const struct sp_sampler_view *sview, -static INLINE const float * +static inline const float * get_texel_2d_no_border(const struct sp_sampler_view *sp_sview, union tex_tile_address addr, int x, int y) { @@ -603,7 +603,7 @@ get_texel_2d_no_border(const struct sp_sampler_view *sp_sview, } -static INLINE const float * +static inline const float * get_texel_2d(const struct sp_sampler_view *sp_sview, const struct sp_sampler *sp_samp, union tex_tile_address addr, int x, int y) @@ -695,7 +695,7 @@ static const unsigned face_array[PIPE_TEX_FACE_MAX][4] = { PIPE_TEX_FACE_POS_Y, PIPE_TEX_FACE_NEG_Y } }; -static INLINE unsigned +static inline unsigned get_next_face(unsigned face, int idx) { return face_array[face][idx]; @@ -705,7 +705,7 @@ get_next_face(unsigned face, int idx) * return a new xcoord based on old face, old coords, cube size * and fall_off_index (0 for x-, 1 for x+, 2 for y-, 3 for y+) */ -static INLINE int +static inline int get_next_xcoord(unsigned face, unsigned fall_off_index, int max, int xc, int yc) { if ((face == 0 && fall_off_index != 1) || @@ -743,7 +743,7 @@ get_next_xcoord(unsigned face, unsigned fall_off_index, int max, int xc, int yc) * return a new ycoord based on old face, old coords, cube size * and fall_off_index (0 for x-, 1 for x+, 2 for y-, 3 for y+) */ -static INLINE int +static inline int get_next_ycoord(unsigned face, unsigned fall_off_index, int max, int xc, int yc) { if ((fall_off_index <= 1) && (face <= 1 || face >= 4)) { @@ -771,7 +771,7 @@ get_next_ycoord(unsigned face, unsigned fall_off_index, int max, int xc, int yc) /* Gather a quad of adjacent texels within a tile: */ -static INLINE void +static inline void get_texel_quad_2d_no_border_single_tile(const struct sp_sampler_view *sp_sview, union tex_tile_address addr, unsigned x, unsigned y, @@ -795,7 +795,7 @@ get_texel_quad_2d_no_border_single_tile(const struct sp_sampler_view *sp_sview, /* Gather a quad of potentially non-adjacent texels: */ -static INLINE void +static inline void get_texel_quad_2d_no_border(const struct sp_sampler_view *sp_sview, union tex_tile_address addr, int x0, int y0, @@ -810,7 +810,7 @@ 
get_texel_quad_2d_no_border(const struct sp_sampler_view *sp_sview, /* Can involve a lot of unnecessary checks for border color: */ -static INLINE void +static inline void get_texel_quad_2d(const struct sp_sampler_view *sp_sview, const struct sp_sampler *sp_samp, union tex_tile_address addr, @@ -828,7 +828,7 @@ get_texel_quad_2d(const struct sp_sampler_view *sp_sview, /* 3d variants: */ -static INLINE const float * +static inline const float * get_texel_3d_no_border(const struct sp_sampler_view *sp_sview, union tex_tile_address addr, int x, int y, int z) { @@ -846,7 +846,7 @@ get_texel_3d_no_border(const struct sp_sampler_view *sp_sview, } -static INLINE const float * +static inline const float * get_texel_3d(const struct sp_sampler_view *sp_sview, const struct sp_sampler *sp_samp, union tex_tile_address addr, int x, int y, int z) @@ -866,7 +866,7 @@ get_texel_3d(const struct sp_sampler_view *sp_sview, /* Get texel pointer for 1D array texture */ -static INLINE const float * +static inline const float * get_texel_1d_array(const struct sp_sampler_view *sp_sview, const struct sp_sampler *sp_samp, union tex_tile_address addr, int x, int y) @@ -884,7 +884,7 @@ get_texel_1d_array(const struct sp_sampler_view *sp_sview, /* Get texel pointer for 2D array texture */ -static INLINE const float * +static inline const float * get_texel_2d_array(const struct sp_sampler_view *sp_sview, const struct sp_sampler *sp_samp, union tex_tile_address addr, int x, int y, int layer) @@ -905,7 +905,7 @@ get_texel_2d_array(const struct sp_sampler_view *sp_sview, } -static INLINE const float * +static inline const float * get_texel_cube_seamless(const struct sp_sampler_view *sp_sview, union tex_tile_address addr, int x, int y, float *corner, int layer, unsigned face) @@ -960,7 +960,7 @@ get_texel_cube_seamless(const struct sp_sampler_view *sp_sview, /* Get texel pointer for cube array texture */ -static INLINE const float * +static inline const float * get_texel_cube_array(const struct sp_sampler_view *sp_sview, const struct sp_sampler *sp_samp, union tex_tile_address addr, int x, int y, int layer) @@ -986,7 +986,7 @@ get_texel_cube_array(const struct sp_sampler_view *sp_sview, * If level = 2, then we'll return 64 (the width at level=2). * Return 1 if level > base_pot. */ -static INLINE unsigned +static inline unsigned pot_level_size(unsigned base_pot, unsigned level) { return (base_pot >= level) ? (1 << (base_pot - level)) : 1; @@ -1016,7 +1016,7 @@ print_sample_4(const char *function, float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZ /* Some image-filter fastpaths: */ -static INLINE void +static inline void img_filter_2d_linear_repeat_POT(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, const struct img_filter_args *args, @@ -1070,7 +1070,7 @@ img_filter_2d_linear_repeat_POT(struct sp_sampler_view *sp_sview, } -static INLINE void +static inline void img_filter_2d_nearest_repeat_POT(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, const struct img_filter_args *args, @@ -1104,7 +1104,7 @@ img_filter_2d_nearest_repeat_POT(struct sp_sampler_view *sp_sview, } -static INLINE void +static inline void img_filter_2d_nearest_clamp_POT(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, const struct img_filter_args *args, @@ -1819,7 +1819,7 @@ img_filter_3d_linear(struct sp_sampler_view *sp_sview, * \param lod_in per-fragment lod_bias or explicit_lod. * \param lod returns the per-fragment lod. 
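[Editor's note] Two of the sp_tex_sample.c helpers renamed above carry the core of softpipe's bilinear path: repeat() implements GL_REPEAT wrapping without a conditional by adding a large multiple of the texture size before the modulo (plain % misbehaves for negative coordinates), and lerp_2d() blends four texels with two weights. A self-contained sketch of both, assuming coordinates stay within the range the offset covers:

#include <assert.h>
#include <stdio.h>

/* GL_REPEAT wrap: adding size * 1024 keeps the value being reduced
 * positive for any plausible texel coordinate, so the result is
 * always in [0, size) without a branch. */
static int repeat_wrap(int coord, unsigned size)
{
   return (coord + size * 1024) % size;
}

/* Bilinear blend of four texel values with weights a (x) and b (y). */
static float lerp_2d(float a, float b,
                     float v00, float v10, float v01, float v11)
{
   const float temp0 = v00 + a * (v10 - v00);
   const float temp1 = v01 + a * (v11 - v01);
   return temp0 + b * (temp1 - temp0);
}

int main(void)
{
   assert(repeat_wrap(-1, 8) == 7);   /* negative coord wraps to far edge */
   assert(repeat_wrap(9, 8) == 1);
   printf("%f\n", lerp_2d(0.5f, 0.5f, 0.0f, 1.0f, 1.0f, 2.0f)); /* 1.0 */
   return 0;
}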
*/ -static INLINE void +static inline void compute_lod(const struct pipe_sampler_state *sampler, enum tgsi_sampler_control control, const float biased_lambda, @@ -1859,7 +1859,7 @@ compute_lod(const struct pipe_sampler_state *sampler, * \param lod_in per-fragment lod_bias or explicit_lod. * \param lod results per-fragment lod. */ -static INLINE void +static inline void compute_lambda_lod(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, const float s[TGSI_QUAD_SIZE], @@ -1906,7 +1906,7 @@ compute_lambda_lod(struct sp_sampler_view *sp_sview, } } -static INLINE unsigned +static inline unsigned get_gather_component(const float lod_in[TGSI_QUAD_SIZE]) { /* gather component is stored in lod_in slot as unsigned */ @@ -2789,7 +2789,7 @@ get_linear_wrap(unsigned mode) /** * Is swizzling needed for the given state key? */ -static INLINE bool +static inline bool any_swizzle(const struct pipe_sampler_view *view) { return (view->swizzle_r != PIPE_SWIZZLE_RED || diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c index 4a421a8f882..21f38b2f859 100644 --- a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c +++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c @@ -185,7 +185,7 @@ sp_flush_tex_tile_cache(struct softpipe_tex_tile_cache *tc) * This is basically a direct-map cache. * XXX There's probably lots of ways in which we can improve this. */ -static INLINE uint +static inline uint tex_cache_pos( union tex_tile_address addr ) { uint entry = (addr.bits.x + diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.h b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h index 2233effc439..b7ad222d715 100644 --- a/src/gallium/drivers/softpipe/sp_tex_tile_cache.h +++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h @@ -127,7 +127,7 @@ extern const struct softpipe_tex_cached_tile * sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc, union tex_tile_address addr ); -static INLINE union tex_tile_address +static inline union tex_tile_address tex_tile_address( unsigned x, unsigned y, unsigned z, @@ -147,7 +147,7 @@ tex_tile_address( unsigned x, /* Quickly retrieve tile if it matches last lookup. */ -static INLINE const struct softpipe_tex_cached_tile * +static inline const struct softpipe_tex_cached_tile * sp_get_cached_tile_tex(struct softpipe_tex_tile_cache *tc, union tex_tile_address addr ) { diff --git a/src/gallium/drivers/softpipe/sp_texture.h b/src/gallium/drivers/softpipe/sp_texture.h index 1701bf574d9..fbf741a9c72 100644 --- a/src/gallium/drivers/softpipe/sp_texture.h +++ b/src/gallium/drivers/softpipe/sp_texture.h @@ -81,13 +81,13 @@ struct softpipe_transfer /** cast wrappers */ -static INLINE struct softpipe_resource * +static inline struct softpipe_resource * softpipe_resource(struct pipe_resource *pt) { return (struct softpipe_resource *) pt; } -static INLINE struct softpipe_transfer * +static inline struct softpipe_transfer * softpipe_transfer(struct pipe_transfer *pt) { return (struct softpipe_transfer *) pt; @@ -99,7 +99,7 @@ softpipe_transfer(struct pipe_transfer *pt) * This is a short-cut instead of using map()/unmap(), which should * probably be fixed. 
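[Editor's note] tex_cache_pos() in the sp_tex_tile_cache.c hunk above maps a tile address to a slot in a direct-mapped cache: mix the address fields into one integer and reduce it modulo the entry count, so each address owns exactly one slot and a collision simply evicts. A sketch of the scheme with invented field weights and entry count (the real softpipe hash uses more fields):

#include <stdio.h>

#define NUM_ENTRIES 32  /* hypothetical cache size */

struct tile_addr {
   unsigned x, y, level;
};

/* Direct-mapped: one candidate slot per address, so a lookup is a
 * single address compare rather than a search. */
static unsigned tile_cache_pos(struct tile_addr addr)
{
   return (addr.x + addr.y * 9 + addr.level * 7) % NUM_ENTRIES;
}

int main(void)
{
   struct tile_addr a = { 3, 2, 1 };
   printf("slot %u\n", tile_cache_pos(a)); /* (3 + 18 + 7) % 32 = 28 */
   return 0;
}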
*/ -static INLINE void * +static inline void * softpipe_resource_data(struct pipe_resource *pt) { if (!pt) diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c index b763f526e61..9cc8ac12525 100644 --- a/src/gallium/drivers/softpipe/sp_tile_cache.c +++ b/src/gallium/drivers/softpipe/sp_tile_cache.c @@ -52,7 +52,7 @@ sp_alloc_tile(struct softpipe_tile_cache *tc); (((x) + (y) * 5 + (l) * 10) % NUM_ENTRIES) -static INLINE int addr_to_clear_pos(union tile_address addr) +static inline int addr_to_clear_pos(union tile_address addr) { int pos; pos = addr.bits.layer * (MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE); @@ -63,7 +63,7 @@ static INLINE int addr_to_clear_pos(union tile_address addr) /** * Is the tile at (x,y) in cleared state? */ -static INLINE uint +static inline uint is_clear_flag_set(const uint *bitvec, union tile_address addr, unsigned max) { int pos, bit; @@ -77,7 +77,7 @@ is_clear_flag_set(const uint *bitvec, union tile_address addr, unsigned max) /** * Mark the tile at (x,y) as not cleared. */ -static INLINE void +static inline void clear_clear_flag(uint *bitvec, union tile_address addr, unsigned max) { int pos; diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.h b/src/gallium/drivers/softpipe/sp_tile_cache.h index 167e1ffcada..2c0bafad651 100644 --- a/src/gallium/drivers/softpipe/sp_tile_cache.h +++ b/src/gallium/drivers/softpipe/sp_tile_cache.h @@ -128,7 +128,7 @@ sp_find_cached_tile(struct softpipe_tile_cache *tc, union tile_address addr ); -static INLINE union tile_address +static inline union tile_address tile_address( unsigned x, unsigned y, unsigned layer ) { @@ -143,7 +143,7 @@ tile_address( unsigned x, /* Quickly retrieve tile if it matches last lookup. */ -static INLINE struct softpipe_cached_tile * +static inline struct softpipe_cached_tile * sp_get_cached_tile(struct softpipe_tile_cache *tc, int x, int y, int layer ) { diff --git a/src/gallium/drivers/svga/Makefile.am b/src/gallium/drivers/svga/Makefile.am index e0a8cad7208..d46de95e4b4 100644 --- a/src/gallium/drivers/svga/Makefile.am +++ b/src/gallium/drivers/svga/Makefile.am @@ -20,8 +20,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. 
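[Editor's note] The sp_tile_cache.c hunks above keep per-tile "cleared" state in a flat bit vector: a tile address is flattened to a bit position, then tested or cleared with shift-and-mask. A minimal sketch of that bookkeeping, with an invented grid size:

#include <assert.h>
#include <string.h>

#define TILES_X 64
#define TILES_Y 64
#define NUM_BITS (TILES_X * TILES_Y)

static int tile_pos(int x, int y) { return y * TILES_X + x; }

static unsigned is_clear_flag_set(const unsigned *bitvec, int x, int y)
{
   int pos = tile_pos(x, y);
   return bitvec[pos / 32] & (1u << (pos & 31));
}

static void clear_clear_flag(unsigned *bitvec, int x, int y)
{
   int pos = tile_pos(x, y);
   bitvec[pos / 32] &= ~(1u << (pos & 31));
}

int main(void)
{
   unsigned flags[NUM_BITS / 32];
   memset(flags, 0xff, sizeof flags);   /* every tile starts "cleared" */
   assert(is_clear_flag_set(flags, 5, 7));
   clear_clear_flag(flags, 5, 7);       /* tile (5,7) now holds real data */
   assert(!is_clear_flag_set(flags, 5, 7));
   return 0;
}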
-AUTOMAKE_OPTIONS = subdir-objects - include Makefile.sources include $(top_srcdir)/src/gallium/Automake.inc diff --git a/src/gallium/drivers/svga/SConscript b/src/gallium/drivers/svga/SConscript index bb4d034f1eb..0ee624616f9 100644 --- a/src/gallium/drivers/svga/SConscript +++ b/src/gallium/drivers/svga/SConscript @@ -11,7 +11,6 @@ if env['suncc']: if env['gcc'] or env['clang']: env.Append(CPPDEFINES = [ 'HAVE_STDINT_H', - 'HAVE_SYS_TYPES_H', ]) env.Prepend(CPPPATH = [ diff --git a/src/gallium/drivers/svga/include/svga3d_shaderdefs.h b/src/gallium/drivers/svga/include/svga3d_shaderdefs.h index 355edfdb702..5e00906ce36 100644 --- a/src/gallium/drivers/svga/include/svga3d_shaderdefs.h +++ b/src/gallium/drivers/svga/include/svga3d_shaderdefs.h @@ -507,7 +507,7 @@ static const uint32 SVGA3D_OUTPUT_REG_DEPTH_NUM_PS20 = 1; *---------------------------------------------------------------------- */ -static INLINE SVGA3dShaderRegType +static inline SVGA3dShaderRegType SVGA3dShaderGetRegType(uint32 token) { SVGA3dShaderSrcToken src; diff --git a/src/gallium/drivers/svga/include/svga_overlay.h b/src/gallium/drivers/svga/include/svga_overlay.h index 0f242dd402c..ccbf7912e6d 100644 --- a/src/gallium/drivers/svga/include/svga_overlay.h +++ b/src/gallium/drivers/svga/include/svga_overlay.h @@ -133,7 +133,7 @@ struct { *---------------------------------------------------------------------- */ -static INLINE Bool +static inline Bool VMwareVideoGetAttributes(const SVGAOverlayFormat format, // IN uint32 *width, // IN / OUT uint32 *height, // IN / OUT diff --git a/src/gallium/drivers/svga/svga_cmd.c b/src/gallium/drivers/svga/svga_cmd.c index 474b75c3c86..b271832171d 100644 --- a/src/gallium/drivers/svga/svga_cmd.c +++ b/src/gallium/drivers/svga/svga_cmd.c @@ -57,7 +57,7 @@ *---------------------------------------------------------------------- */ -static INLINE void +static inline void surface_to_surfaceid(struct svga_winsys_context *swc, // IN struct pipe_surface *surface, // IN SVGA3dSurfaceImageId *id, // OUT diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h index 630f5f77d66..71f038df8c1 100644 --- a/src/gallium/drivers/svga/svga_context.h +++ b/src/gallium/drivers/svga/svga_context.h @@ -485,20 +485,20 @@ svga_context_create(struct pipe_screen *screen, * Inline conversion functions. These are better-typed than the * macros used previously: */ -static INLINE struct svga_context * +static inline struct svga_context * svga_context( struct pipe_context *pipe ) { return (struct svga_context *)pipe; } -static INLINE boolean +static inline boolean svga_have_gb_objects(const struct svga_context *svga) { return svga_screen(svga->pipe.screen)->sws->have_gb_objects; } -static INLINE boolean +static inline boolean svga_have_gb_dma(const struct svga_context *svga) { return svga_screen(svga->pipe.screen)->sws->have_gb_dma; diff --git a/src/gallium/drivers/svga/svga_debug.h b/src/gallium/drivers/svga/svga_debug.h index 3a3fcd8fae2..82c9b602d5d 100644 --- a/src/gallium/drivers/svga/svga_debug.h +++ b/src/gallium/drivers/svga/svga_debug.h @@ -53,7 +53,7 @@ extern int SVGA_DEBUG; #define DBSTR(x) "" #endif -static INLINE void +static inline void SVGA_DBG( unsigned flag, const char *fmt, ... 
) { #ifdef DEBUG diff --git a/src/gallium/drivers/svga/svga_draw_private.h b/src/gallium/drivers/svga/svga_draw_private.h index 1b054038e9f..9ab87e8259a 100644 --- a/src/gallium/drivers/svga/svga_draw_private.h +++ b/src/gallium/drivers/svga/svga_draw_private.h @@ -57,7 +57,7 @@ static const unsigned svga_hw_prims = * PIPE_PRIM_QUADS, PIPE_PRIM_QUAD_STRIP or PIPE_PRIM_POLYGON. We convert * those to other types of primitives with index/translation code. */ -static INLINE unsigned +static inline unsigned svga_translate_prim(unsigned mode, unsigned vcount,unsigned *prim_count) { switch (mode) { diff --git a/src/gallium/drivers/svga/svga_pipe_blend.c b/src/gallium/drivers/svga/svga_pipe_blend.c index 594eec7166e..2890516c0cf 100644 --- a/src/gallium/drivers/svga/svga_pipe_blend.c +++ b/src/gallium/drivers/svga/svga_pipe_blend.c @@ -33,7 +33,7 @@ #include "svga_hw_reg.h" -static INLINE unsigned +static inline unsigned svga_translate_blend_factor(unsigned factor) { switch (factor) { @@ -58,7 +58,7 @@ svga_translate_blend_factor(unsigned factor) } } -static INLINE unsigned +static inline unsigned svga_translate_blend_func(unsigned mode) { switch (mode) { diff --git a/src/gallium/drivers/svga/svga_pipe_depthstencil.c b/src/gallium/drivers/svga/svga_pipe_depthstencil.c index cb07dbe09a3..8db21fd7476 100644 --- a/src/gallium/drivers/svga/svga_pipe_depthstencil.c +++ b/src/gallium/drivers/svga/svga_pipe_depthstencil.c @@ -32,7 +32,7 @@ #include "svga_hw_reg.h" -static INLINE unsigned +static inline unsigned svga_translate_compare_func(unsigned func) { switch (func) { @@ -50,7 +50,7 @@ svga_translate_compare_func(unsigned func) } } -static INLINE unsigned +static inline unsigned svga_translate_stencil_op(unsigned op) { switch (op) { diff --git a/src/gallium/drivers/svga/svga_pipe_query.c b/src/gallium/drivers/svga/svga_pipe_query.c index a97a9c46cf8..208a2cd14bf 100644 --- a/src/gallium/drivers/svga/svga_pipe_query.c +++ b/src/gallium/drivers/svga/svga_pipe_query.c @@ -59,7 +59,7 @@ struct svga_query { /** cast wrapper */ -static INLINE struct svga_query * +static inline struct svga_query * svga_query( struct pipe_query *q ) { return (struct svga_query *)q; diff --git a/src/gallium/drivers/svga/svga_pipe_sampler.c b/src/gallium/drivers/svga/svga_pipe_sampler.c index 8a87bb467aa..effd490dd22 100644 --- a/src/gallium/drivers/svga/svga_pipe_sampler.c +++ b/src/gallium/drivers/svga/svga_pipe_sampler.c @@ -35,7 +35,7 @@ #include "svga_debug.h" -static INLINE unsigned +static inline unsigned translate_wrap_mode(unsigned wrap) { switch (wrap) { @@ -68,7 +68,7 @@ translate_wrap_mode(unsigned wrap) } } -static INLINE unsigned translate_img_filter( unsigned filter ) +static inline unsigned translate_img_filter( unsigned filter ) { switch (filter) { case PIPE_TEX_FILTER_NEAREST: return SVGA3D_TEX_FILTER_NEAREST; @@ -79,7 +79,7 @@ static INLINE unsigned translate_img_filter( unsigned filter ) } } -static INLINE unsigned translate_mip_filter( unsigned filter ) +static inline unsigned translate_mip_filter( unsigned filter ) { switch (filter) { case PIPE_TEX_MIPFILTER_NONE: return SVGA3D_TEX_FILTER_NONE; diff --git a/src/gallium/drivers/svga/svga_resource_buffer.c b/src/gallium/drivers/svga/svga_resource_buffer.c index d2c7762e7ff..13f85cddbd5 100644 --- a/src/gallium/drivers/svga/svga_resource_buffer.c +++ b/src/gallium/drivers/svga/svga_resource_buffer.c @@ -45,7 +45,7 @@ * Vertex and index buffers need hardware backing. Constant buffers * do not. No other types of buffers currently supported. 
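[Editor's note] Most of the svga_pipe_*.c functions touched above are thin translation layers: a switch from a gallium enum to the SVGA3D equivalent, with an assert-and-fallback default so new enum values fail loudly in debug builds but stay safe in release builds. A generic sketch of the pattern (both enums invented for illustration):

#include <assert.h>

enum pipe_filter { PIPE_FILTER_NEAREST, PIPE_FILTER_LINEAR };
enum svga_filter { SVGA_FILTER_NEAREST = 1, SVGA_FILTER_LINEAR = 2 };

static unsigned translate_filter(unsigned filter)
{
   switch (filter) {
   case PIPE_FILTER_NEAREST: return SVGA_FILTER_NEAREST;
   case PIPE_FILTER_LINEAR:  return SVGA_FILTER_LINEAR;
   default:
      assert(!"unexpected filter");   /* catches new enum values in debug */
      return SVGA_FILTER_NEAREST;     /* harmless fallback in release */
   }
}

int main(void)
{
   assert(translate_filter(PIPE_FILTER_LINEAR) == SVGA_FILTER_LINEAR);
   return 0;
}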
*/ -static INLINE boolean +static inline boolean svga_buffer_needs_hw_storage(unsigned usage) { return usage & (PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER); diff --git a/src/gallium/drivers/svga/svga_resource_buffer.h b/src/gallium/drivers/svga/svga_resource_buffer.h index 83b3d342aec..e838beb6661 100644 --- a/src/gallium/drivers/svga/svga_resource_buffer.h +++ b/src/gallium/drivers/svga/svga_resource_buffer.h @@ -190,7 +190,7 @@ struct svga_buffer }; -static INLINE struct svga_buffer * +static inline struct svga_buffer * svga_buffer(struct pipe_resource *buffer) { if (buffer) { @@ -205,7 +205,7 @@ svga_buffer(struct pipe_resource *buffer) * Returns TRUE for user buffers. We may * decide to use an alternate upload path for these buffers. */ -static INLINE boolean +static inline boolean svga_buffer_is_user_buffer( struct pipe_resource *buffer ) { if (buffer) { @@ -219,7 +219,7 @@ svga_buffer_is_user_buffer( struct pipe_resource *buffer ) * Returns a pointer to a struct svga_winsys_screen given a * struct svga_buffer. */ -static INLINE struct svga_winsys_screen * +static inline struct svga_winsys_screen * svga_buffer_winsys_screen(struct svga_buffer *sbuf) { return svga_screen(sbuf->b.b.screen)->sws; @@ -230,7 +230,7 @@ svga_buffer_winsys_screen(struct svga_buffer *sbuf) * Returns whether a buffer has hardware storage that is * visible to the GPU. */ -static INLINE boolean +static inline boolean svga_buffer_has_hw_storage(struct svga_buffer *sbuf) { if (svga_buffer_winsys_screen(sbuf)->have_gb_objects) @@ -242,7 +242,7 @@ svga_buffer_has_hw_storage(struct svga_buffer *sbuf) /** * Map the hardware storage of a buffer. */ -static INLINE void * +static inline void * svga_buffer_hw_storage_map(struct svga_context *svga, struct svga_buffer *sbuf, unsigned flags, boolean *retry) @@ -259,7 +259,7 @@ svga_buffer_hw_storage_map(struct svga_context *svga, /** * Unmap the hardware storage of a buffer. */ -static INLINE void +static inline void svga_buffer_hw_storage_unmap(struct svga_context *svga, struct svga_buffer *sbuf) { diff --git a/src/gallium/drivers/svga/svga_resource_texture.h b/src/gallium/drivers/svga/svga_resource_texture.h index 1ff42fabab9..19dadfb8828 100644 --- a/src/gallium/drivers/svga/svga_resource_texture.h +++ b/src/gallium/drivers/svga/svga_resource_texture.h @@ -106,7 +106,7 @@ struct svga_transfer }; -static INLINE struct svga_texture *svga_texture( struct pipe_resource *resource ) +static inline struct svga_texture *svga_texture( struct pipe_resource *resource ) { struct svga_texture *tex = (struct svga_texture *)resource; assert(tex == NULL || tex->b.vtbl == &svga_texture_vtbl); @@ -114,7 +114,7 @@ static INLINE struct svga_texture *svga_texture( struct pipe_resource *resource } -static INLINE struct svga_transfer * +static inline struct svga_transfer * svga_transfer(struct pipe_transfer *transfer) { assert(transfer); @@ -127,7 +127,7 @@ svga_transfer(struct pipe_transfer *transfer) * This is used to track updates to textures when we draw into * them via a surface. */ -static INLINE void +static inline void svga_age_texture_view(struct svga_texture *tex, unsigned level) { assert(level < Elements(tex->view_age)); @@ -138,7 +138,7 @@ svga_age_texture_view(struct svga_texture *tex, unsigned level) /** * Mark the given texture face/level as being defined. 
*/ -static INLINE void +static inline void svga_define_texture_level(struct svga_texture *tex, unsigned face,unsigned level) { @@ -148,7 +148,7 @@ svga_define_texture_level(struct svga_texture *tex, } -static INLINE bool +static inline bool svga_is_texture_level_defined(const struct svga_texture *tex, unsigned face, unsigned level) { @@ -177,7 +177,7 @@ check_face_level(const struct svga_texture *tex, } -static INLINE void +static inline void svga_set_texture_rendered_to(struct svga_texture *tex, unsigned face, unsigned level) { @@ -186,7 +186,7 @@ svga_set_texture_rendered_to(struct svga_texture *tex, } -static INLINE void +static inline void svga_clear_texture_rendered_to(struct svga_texture *tex, unsigned face, unsigned level) { @@ -195,7 +195,7 @@ svga_clear_texture_rendered_to(struct svga_texture *tex, } -static INLINE boolean +static inline boolean svga_was_texture_rendered_to(const struct svga_texture *tex, unsigned face, unsigned level) { diff --git a/src/gallium/drivers/svga/svga_sampler_view.h b/src/gallium/drivers/svga/svga_sampler_view.h index 2087c1be85e..7f14323f84f 100644 --- a/src/gallium/drivers/svga/svga_sampler_view.h +++ b/src/gallium/drivers/svga/svga_sampler_view.h @@ -86,7 +86,7 @@ svga_destroy_sampler_view_priv(struct svga_sampler_view *v); void svga_debug_describe_sampler_view(char *buf, const struct svga_sampler_view *sv); -static INLINE void +static inline void svga_sampler_view_reference(struct svga_sampler_view **ptr, struct svga_sampler_view *v) { struct svga_sampler_view *old = *ptr; diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c index 56e486786df..66c3deaa9e7 100644 --- a/src/gallium/drivers/svga/svga_screen.c +++ b/src/gallium/drivers/svga/svga_screen.c @@ -309,6 +309,10 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_UMA: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 0; } @@ -443,7 +447,9 @@ static int svga_get_shader_param(struct pipe_screen *screen, unsigned shader, en return 0; case PIPE_SHADER_GEOMETRY: case PIPE_SHADER_COMPUTE: - /* no support for geometry or compute shaders at this time */ + case PIPE_SHADER_TESS_CTRL: + case PIPE_SHADER_TESS_EVAL: + /* no support for geometry, tess or compute shaders at this time */ return 0; default: debug_printf("Unexpected shader type (%u) query\n", shader); @@ -543,21 +549,15 @@ svga_fence_reference(struct pipe_screen *screen, static boolean -svga_fence_signalled(struct pipe_screen *screen, - struct pipe_fence_handle *fence) -{ - struct svga_winsys_screen *sws = svga_screen(screen)->sws; - return sws->fence_signalled(sws, fence, 0) == 0; -} - - -static boolean svga_fence_finish(struct pipe_screen *screen, struct pipe_fence_handle *fence, uint64_t timeout) { struct svga_winsys_screen *sws = svga_screen(screen)->sws; + if (!timeout) + return sws->fence_signalled(sws, fence, 0) == 0; + SVGA_DBG(DEBUG_DMA|DEBUG_PERF, "%s fence_ptr %p\n", __FUNCTION__, fence); @@ -645,7 +645,6 @@ svga_screen_create(struct svga_winsys_screen *sws) screen->is_format_supported = svga_is_format_supported; screen->context_create = svga_context_create; screen->fence_reference = svga_fence_reference; - screen->fence_signalled = svga_fence_signalled; screen->fence_finish = svga_fence_finish; screen->get_driver_query_info = svga_get_driver_query_info; svgascreen->sws 
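[Editor's note] The svga_screen.c hunk above folds the old fence_signalled() screen hook into fence_finish(): a zero timeout now means "poll, don't block", so one entry point covers both queries. A sketch of that calling convention with a stubbed winsys (the types and callbacks here are stand-ins, not the real svga_winsys interface):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct fence { bool signalled; };

/* Stand-in for sws->fence_signalled(): returns 0 when signalled. */
static int winsys_fence_signalled(struct fence *f) { return f->signalled ? 0 : 1; }

/* Stand-in for the blocking winsys wait. */
static int winsys_fence_wait(struct fence *f) { f->signalled = true; return 0; }

/* One entry point: timeout == 0 is a non-blocking query, anything
 * else waits — mirroring the removal of the separate hook. */
static bool fence_finish(struct fence *f, uint64_t timeout)
{
   if (!timeout)
      return winsys_fence_signalled(f) == 0;
   return winsys_fence_wait(f) == 0;
}

int main(void)
{
   struct fence f = { false };
   printf("poll: %d\n", fence_finish(&f, 0));          /* 0: not signalled */
   printf("wait: %d\n", fence_finish(&f, UINT64_MAX)); /* 1: done */
   return 0;
}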
= sws; diff --git a/src/gallium/drivers/svga/svga_screen.h b/src/gallium/drivers/svga/svga_screen.h index b85191c4b26..ea1e743dfe5 100644 --- a/src/gallium/drivers/svga/svga_screen.h +++ b/src/gallium/drivers/svga/svga_screen.h @@ -82,7 +82,7 @@ struct svga_screen #ifndef DEBUG /** cast wrapper */ -static INLINE struct svga_screen * +static inline struct svga_screen * svga_screen(struct pipe_screen *pscreen) { return (struct svga_screen *) pscreen; diff --git a/src/gallium/drivers/svga/svga_screen_cache.c b/src/gallium/drivers/svga/svga_screen_cache.c index f63f7836187..3c765394a88 100644 --- a/src/gallium/drivers/svga/svga_screen_cache.c +++ b/src/gallium/drivers/svga/svga_screen_cache.c @@ -76,7 +76,7 @@ surface_size(const struct svga_host_surface_cache_key *key) /** * Compute the bucket for this key. */ -static INLINE unsigned +static inline unsigned svga_screen_cache_bucket(const struct svga_host_surface_cache_key *key) { return util_hash_crc32(key, sizeof *key) % SVGA_HOST_SURFACE_CACHE_BUCKETS; diff --git a/src/gallium/drivers/svga/svga_shader.h b/src/gallium/drivers/svga/svga_shader.h index fd500ae4401..5102159b96a 100644 --- a/src/gallium/drivers/svga/svga_shader.h +++ b/src/gallium/drivers/svga/svga_shader.h @@ -44,7 +44,7 @@ svga_destroy_shader_variant(struct svga_context *svga, /** * Check if a shader's bytecode exceeds the device limits. */ -static INLINE boolean +static inline boolean svga_shader_too_large(const struct svga_context *svga, const struct svga_shader_variant *variant) { diff --git a/src/gallium/drivers/svga/svga_state_fs.c b/src/gallium/drivers/svga/svga_state_fs.c index 566a79407e5..8cdce742b3b 100644 --- a/src/gallium/drivers/svga/svga_state_fs.c +++ b/src/gallium/drivers/svga/svga_state_fs.c @@ -41,7 +41,7 @@ -static INLINE int +static inline int compare_fs_keys(const struct svga_fs_compile_key *a, const struct svga_fs_compile_key *b) { diff --git a/src/gallium/drivers/svga/svga_state_rss.c b/src/gallium/drivers/svga/svga_state_rss.c index fb56b3d36ba..ebb98373e2b 100644 --- a/src/gallium/drivers/svga/svga_state_rss.c +++ b/src/gallium/drivers/svga/svga_state_rss.c @@ -61,7 +61,7 @@ do { \ } while (0) -static INLINE void +static inline void svga_queue_rs( struct rs_queue *q, unsigned rss, unsigned value ) diff --git a/src/gallium/drivers/svga/svga_state_tss.c b/src/gallium/drivers/svga/svga_state_tss.c index 0ab571c0588..41334bd7cb9 100644 --- a/src/gallium/drivers/svga/svga_state_tss.c +++ b/src/gallium/drivers/svga/svga_state_tss.c @@ -274,7 +274,7 @@ do { \ } while (0) -static INLINE void +static inline void svga_queue_tss( struct ts_queue *q, unsigned unit, unsigned tss, diff --git a/src/gallium/drivers/svga/svga_state_vs.c b/src/gallium/drivers/svga/svga_state_vs.c index 545c9d7420f..c2a0f1ee6b1 100644 --- a/src/gallium/drivers/svga/svga_state_vs.c +++ b/src/gallium/drivers/svga/svga_state_vs.c @@ -41,7 +41,7 @@ #include "svga_hw_reg.h" -static INLINE int +static inline int compare_vs_keys(const struct svga_vs_compile_key *a, const struct svga_vs_compile_key *b) { diff --git a/src/gallium/drivers/svga/svga_surface.h b/src/gallium/drivers/svga/svga_surface.h index 7b8f6f018d2..2fa72a1c8f0 100644 --- a/src/gallium/drivers/svga/svga_surface.h +++ b/src/gallium/drivers/svga/svga_surface.h @@ -84,7 +84,7 @@ svga_texture_copy_handle(struct svga_context *svga, unsigned width, unsigned height, unsigned depth); -static INLINE struct svga_surface * +static inline struct svga_surface * svga_surface(struct pipe_surface *surface) { assert(surface); @@ -92,7 +92,7 @@ 
svga_surface(struct pipe_surface *surface) } -static INLINE const struct svga_surface * +static inline const struct svga_surface * svga_surface_const(const struct pipe_surface *surface) { assert(surface); diff --git a/src/gallium/drivers/svga/svga_swtnl_private.h b/src/gallium/drivers/svga/svga_swtnl_private.h index 608950d7af6..e2106e1e8e6 100644 --- a/src/gallium/drivers/svga/svga_swtnl_private.h +++ b/src/gallium/drivers/svga/svga_swtnl_private.h @@ -76,7 +76,7 @@ struct svga_vbuf_render { /** * Basically a cast wrapper. */ -static INLINE struct svga_vbuf_render * +static inline struct svga_vbuf_render * svga_vbuf_render( struct vbuf_render *render ) { assert(render); diff --git a/src/gallium/drivers/svga/svga_tgsi.c b/src/gallium/drivers/svga/svga_tgsi.c index 9aafd851264..2e2ff5e4673 100644 --- a/src/gallium/drivers/svga/svga_tgsi.c +++ b/src/gallium/drivers/svga/svga_tgsi.c @@ -84,7 +84,7 @@ svga_shader_expand(struct svga_shader_emitter *emit) } -static INLINE boolean +static inline boolean reserve(struct svga_shader_emitter *emit, unsigned nr_dwords) { if (emit->ptr - emit->buf + nr_dwords * sizeof(unsigned) >= emit->size) { diff --git a/src/gallium/drivers/svga/svga_tgsi.h b/src/gallium/drivers/svga/svga_tgsi.h index e7a2a134ca5..5c47a4ad39f 100644 --- a/src/gallium/drivers/svga/svga_tgsi.h +++ b/src/gallium/drivers/svga/svga_tgsi.h @@ -124,7 +124,7 @@ struct svga_shader_variant * The real use of this information is matching vertex elements to * fragment shader inputs in the case where vertex shader is disabled. */ -static INLINE void svga_generate_vdecl_semantics( unsigned idx, +static inline void svga_generate_vdecl_semantics( unsigned idx, unsigned *usage, unsigned *usage_index ) { @@ -140,12 +140,12 @@ static INLINE void svga_generate_vdecl_semantics( unsigned idx, -static INLINE unsigned svga_vs_key_size( const struct svga_vs_compile_key *key ) +static inline unsigned svga_vs_key_size( const struct svga_vs_compile_key *key ) { return sizeof *key; } -static INLINE unsigned svga_fs_key_size( const struct svga_fs_compile_key *key ) +static inline unsigned svga_fs_key_size( const struct svga_fs_compile_key *key ) { return (const char *)&key->tex[key->num_textures] - (const char *)key; } diff --git a/src/gallium/drivers/svga/svga_tgsi_emit.h b/src/gallium/drivers/svga/svga_tgsi_emit.h index 1894296e6d7..1a1dac23507 100644 --- a/src/gallium/drivers/svga/svga_tgsi_emit.h +++ b/src/gallium/drivers/svga/svga_tgsi_emit.h @@ -167,7 +167,7 @@ svga_translate_decl_sm30(struct svga_shader_emitter *emit, /** Emit the given SVGA3dShaderInstToken opcode */ -static INLINE boolean +static inline boolean emit_instruction(struct svga_shader_emitter *emit, SVGA3dShaderInstToken opcode) { @@ -176,7 +176,7 @@ emit_instruction(struct svga_shader_emitter *emit, /** Generate a SVGA3dShaderInstToken for the given SVGA3D shader opcode */ -static INLINE SVGA3dShaderInstToken +static inline SVGA3dShaderInstToken inst_token(unsigned opcode) { SVGA3dShaderInstToken inst; @@ -192,7 +192,7 @@ inst_token(unsigned opcode) * Generate a SVGA3dShaderInstToken for the given SVGA3D shader opcode * with the predication flag set. */ -static INLINE SVGA3dShaderInstToken +static inline SVGA3dShaderInstToken inst_token_predicated(unsigned opcode) { SVGA3dShaderInstToken inst; @@ -209,7 +209,7 @@ inst_token_predicated(unsigned opcode) * Generate a SVGA3dShaderInstToken for a SETP instruction (set predicate) * using the given comparison operator (one of SVGA3DOPCOMP_xx). 
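[Editor's note] inst_token() and its siblings in the svga_tgsi_emit.h hunks above build SVGA3D shader tokens by filling a bitfield struct that overlays a raw 32-bit value. A compact sketch of the idiom — the field layout below is invented, not the real SVGA3dShaderInstToken layout, and the checks assume the common LSB-first bitfield allocation of GCC/Clang on little-endian targets:

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical instruction token: opcode in the low bits, a
 * predication flag above it, overlaid on the raw dword. */
union inst_token {
   struct {
      unsigned op         : 16;
      unsigned predicated : 1;
      unsigned pad        : 15;
   } bits;
   uint32_t value;
};

static union inst_token inst_token(unsigned op)
{
   union inst_token t;
   memset(&t, 0, sizeof t);   /* zero the padding before setting fields */
   t.bits.op = op;
   return t;
}

static union inst_token inst_token_predicated(unsigned op)
{
   union inst_token t = inst_token(op);
   t.bits.predicated = 1;
   return t;
}

int main(void)
{
   assert(inst_token(7).value == 7);
   assert(inst_token_predicated(7).value == (7 | (1u << 16)));
   return 0;
}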
*/ -static INLINE SVGA3dShaderInstToken +static inline SVGA3dShaderInstToken inst_token_setp(unsigned operator) { SVGA3dShaderInstToken inst; @@ -227,7 +227,7 @@ inst_token_setp(unsigned operator) * Note that this function is used to create tokens for output registers, * temp registers AND constants (see emit_def_const()). */ -static INLINE SVGA3dShaderDestToken +static inline SVGA3dShaderDestToken dst_register(unsigned file, int number) { SVGA3dShaderDestToken dest; @@ -255,7 +255,7 @@ dst_register(unsigned file, int number) * Apply a writemask to the given SVGA3dShaderDestToken, returning a * new SVGA3dShaderDestToken. */ -static INLINE SVGA3dShaderDestToken +static inline SVGA3dShaderDestToken writemask(SVGA3dShaderDestToken dest, unsigned mask) { assert(dest.mask & mask); @@ -265,7 +265,7 @@ writemask(SVGA3dShaderDestToken dest, unsigned mask) /** Create a SVGA3dShaderSrcToken given a register file and number */ -static INLINE SVGA3dShaderSrcToken +static inline SVGA3dShaderSrcToken src_token(unsigned file, int number) { SVGA3dShaderSrcToken src; @@ -289,7 +289,7 @@ src_token(unsigned file, int number) /** Create a src_register given a register file and register number */ -static INLINE struct src_register +static inline struct src_register src_register(unsigned file, int number) { struct src_register src; @@ -301,7 +301,7 @@ src_register(unsigned file, int number) } /** Translate src_register into SVGA3dShaderDestToken */ -static INLINE SVGA3dShaderDestToken +static inline SVGA3dShaderDestToken dst(struct src_register src) { return dst_register(SVGA3dShaderGetRegType(src.base.value), src.base.num); @@ -309,7 +309,7 @@ dst(struct src_register src) /** Translate SVGA3dShaderDestToken to a src_register */ -static INLINE struct src_register +static inline struct src_register src(SVGA3dShaderDestToken dst) { return src_register(SVGA3dShaderGetRegType(dst.value), dst.num); diff --git a/src/gallium/drivers/svga/svgadump/svga_shader.h b/src/gallium/drivers/svga/svgadump/svga_shader.h index 5db64bf135b..0a2e3d5f345 100644 --- a/src/gallium/drivers/svga/svgadump/svga_shader.h +++ b/src/gallium/drivers/svga/svgadump/svga_shader.h @@ -56,7 +56,7 @@ struct sh_reg unsigned is_reg:1; }; -static INLINE unsigned +static inline unsigned sh_reg_type( struct sh_reg reg ) { return reg.type_lo | (reg.type_hi << 3); @@ -138,7 +138,7 @@ struct sh_dstreg unsigned is_reg:1; }; -static INLINE unsigned +static inline unsigned sh_dstreg_type( struct sh_dstreg reg ) { return reg.type_lo | (reg.type_hi << 3); @@ -169,7 +169,7 @@ struct sh_srcreg unsigned is_reg:1; }; -static INLINE unsigned +static inline unsigned sh_srcreg_type( struct sh_srcreg reg ) { return reg.type_lo | (reg.type_hi << 3); diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c index 0013c963e7a..7f6d0645112 100644 --- a/src/gallium/drivers/trace/tr_context.c +++ b/src/gallium/drivers/trace/tr_context.c @@ -49,13 +49,13 @@ struct trace_query }; -static INLINE struct trace_query * +static inline struct trace_query * trace_query(struct pipe_query *query) { return (struct trace_query *)query; } -static INLINE struct pipe_query * +static inline struct pipe_query * trace_query_unwrap(struct pipe_query *query) { if (query) { @@ -66,7 +66,7 @@ trace_query_unwrap(struct pipe_query *query) } -static INLINE struct pipe_resource * +static inline struct pipe_resource * trace_resource_unwrap(struct trace_context *tr_ctx, struct pipe_resource *resource) { @@ -82,7 +82,7 @@ trace_resource_unwrap(struct trace_context 
*tr_ctx, } -static INLINE struct pipe_surface * +static inline struct pipe_surface * trace_surface_unwrap(struct trace_context *tr_ctx, struct pipe_surface *surface) { @@ -105,7 +105,7 @@ trace_surface_unwrap(struct trace_context *tr_ctx, } -static INLINE void +static inline void trace_context_draw_vbo(struct pipe_context *_pipe, const struct pipe_draw_info *info) { @@ -125,7 +125,7 @@ trace_context_draw_vbo(struct pipe_context *_pipe, } -static INLINE struct pipe_query * +static inline struct pipe_query * trace_context_create_query(struct pipe_context *_pipe, unsigned query_type, unsigned index) @@ -163,7 +163,7 @@ trace_context_create_query(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_destroy_query(struct pipe_context *_pipe, struct pipe_query *_query) { @@ -185,7 +185,7 @@ trace_context_destroy_query(struct pipe_context *_pipe, } -static INLINE boolean +static inline boolean trace_context_begin_query(struct pipe_context *_pipe, struct pipe_query *query) { @@ -207,7 +207,7 @@ trace_context_begin_query(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_end_query(struct pipe_context *_pipe, struct pipe_query *query) { @@ -227,7 +227,7 @@ trace_context_end_query(struct pipe_context *_pipe, } -static INLINE boolean +static inline boolean trace_context_get_query_result(struct pipe_context *_pipe, struct pipe_query *_query, boolean wait, @@ -262,7 +262,7 @@ trace_context_get_query_result(struct pipe_context *_pipe, } -static INLINE void * +static inline void * trace_context_create_blend_state(struct pipe_context *_pipe, const struct pipe_blend_state *state) { @@ -285,7 +285,7 @@ trace_context_create_blend_state(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_bind_blend_state(struct pipe_context *_pipe, void *state) { @@ -303,7 +303,7 @@ trace_context_bind_blend_state(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_delete_blend_state(struct pipe_context *_pipe, void *state) { @@ -321,7 +321,7 @@ trace_context_delete_blend_state(struct pipe_context *_pipe, } -static INLINE void * +static inline void * trace_context_create_sampler_state(struct pipe_context *_pipe, const struct pipe_sampler_state *state) { @@ -344,7 +344,7 @@ trace_context_create_sampler_state(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_bind_sampler_states(struct pipe_context *_pipe, unsigned shader, unsigned start, @@ -371,7 +371,7 @@ trace_context_bind_sampler_states(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_delete_sampler_state(struct pipe_context *_pipe, void *state) { @@ -389,7 +389,7 @@ trace_context_delete_sampler_state(struct pipe_context *_pipe, } -static INLINE void * +static inline void * trace_context_create_rasterizer_state(struct pipe_context *_pipe, const struct pipe_rasterizer_state *state) { @@ -412,7 +412,7 @@ trace_context_create_rasterizer_state(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_bind_rasterizer_state(struct pipe_context *_pipe, void *state) { @@ -430,7 +430,7 @@ trace_context_bind_rasterizer_state(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_delete_rasterizer_state(struct pipe_context *_pipe, void *state) { @@ -448,7 +448,7 @@ trace_context_delete_rasterizer_state(struct pipe_context *_pipe, } -static INLINE void * +static inline void * trace_context_create_depth_stencil_alpha_state(struct pipe_context 
*_pipe, const struct pipe_depth_stencil_alpha_state *state) { @@ -471,7 +471,7 @@ trace_context_create_depth_stencil_alpha_state(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_bind_depth_stencil_alpha_state(struct pipe_context *_pipe, void *state) { @@ -489,7 +489,7 @@ trace_context_bind_depth_stencil_alpha_state(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe, void *state) { @@ -508,7 +508,7 @@ trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe, #define TRACE_SHADER_STATE(shader_type) \ - static INLINE void * \ + static inline void * \ trace_context_create_##shader_type##_state(struct pipe_context *_pipe, \ const struct pipe_shader_state *state) \ { \ @@ -524,7 +524,7 @@ trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe, return result; \ } \ \ - static INLINE void \ + static inline void \ trace_context_bind_##shader_type##_state(struct pipe_context *_pipe, \ void *state) \ { \ @@ -537,7 +537,7 @@ trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe, trace_dump_call_end(); \ } \ \ - static INLINE void \ + static inline void \ trace_context_delete_##shader_type##_state(struct pipe_context *_pipe, \ void *state) \ { \ @@ -559,7 +559,7 @@ TRACE_SHADER_STATE(tes) #undef TRACE_SHADER_STATE -static INLINE void * +static inline void * trace_context_create_vertex_elements_state(struct pipe_context *_pipe, unsigned num_elements, const struct pipe_vertex_element *elements) @@ -587,7 +587,7 @@ trace_context_create_vertex_elements_state(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_bind_vertex_elements_state(struct pipe_context *_pipe, void *state) { @@ -605,7 +605,7 @@ trace_context_bind_vertex_elements_state(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_delete_vertex_elements_state(struct pipe_context *_pipe, void *state) { @@ -623,7 +623,7 @@ trace_context_delete_vertex_elements_state(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_set_blend_color(struct pipe_context *_pipe, const struct pipe_blend_color *state) { @@ -641,7 +641,7 @@ trace_context_set_blend_color(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_set_stencil_ref(struct pipe_context *_pipe, const struct pipe_stencil_ref *state) { @@ -659,7 +659,7 @@ trace_context_set_stencil_ref(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_set_clip_state(struct pipe_context *_pipe, const struct pipe_clip_state *state) { @@ -676,7 +676,7 @@ trace_context_set_clip_state(struct pipe_context *_pipe, trace_dump_call_end(); } -static INLINE void +static inline void trace_context_set_sample_mask(struct pipe_context *_pipe, unsigned sample_mask) { @@ -693,7 +693,7 @@ trace_context_set_sample_mask(struct pipe_context *_pipe, trace_dump_call_end(); } -static INLINE void +static inline void trace_context_set_constant_buffer(struct pipe_context *_pipe, uint shader, uint index, struct pipe_constant_buffer *constant_buffer) @@ -721,7 +721,7 @@ trace_context_set_constant_buffer(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_set_framebuffer_state(struct pipe_context *_pipe, const struct pipe_framebuffer_state *state) { @@ -751,7 +751,7 @@ trace_context_set_framebuffer_state(struct pipe_context *_pipe, } -static INLINE void +static inline void 
trace_context_set_polygon_stipple(struct pipe_context *_pipe, const struct pipe_poly_stipple *state) { @@ -769,7 +769,7 @@ trace_context_set_polygon_stipple(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_set_scissor_states(struct pipe_context *_pipe, unsigned start_slot, unsigned num_scissors, @@ -791,7 +791,7 @@ trace_context_set_scissor_states(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_set_viewport_states(struct pipe_context *_pipe, unsigned start_slot, unsigned num_viewports, @@ -938,7 +938,7 @@ trace_context_surface_destroy(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_set_sampler_views(struct pipe_context *_pipe, unsigned shader, unsigned start, @@ -974,7 +974,7 @@ trace_context_set_sampler_views(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_set_vertex_buffers(struct pipe_context *_pipe, unsigned start_slot, unsigned num_buffers, const struct pipe_vertex_buffer *buffers) @@ -1008,7 +1008,7 @@ trace_context_set_vertex_buffers(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_set_index_buffer(struct pipe_context *_pipe, const struct pipe_index_buffer *ib) { @@ -1033,7 +1033,7 @@ trace_context_set_index_buffer(struct pipe_context *_pipe, } -static INLINE struct pipe_stream_output_target * +static inline struct pipe_stream_output_target * trace_context_create_stream_output_target(struct pipe_context *_pipe, struct pipe_resource *res, unsigned buffer_offset, @@ -1063,7 +1063,7 @@ trace_context_create_stream_output_target(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_stream_output_target_destroy( struct pipe_context *_pipe, struct pipe_stream_output_target *target) @@ -1082,7 +1082,7 @@ trace_context_stream_output_target_destroy( } -static INLINE void +static inline void trace_context_set_stream_output_targets(struct pipe_context *_pipe, unsigned num_targets, struct pipe_stream_output_target **tgs, @@ -1104,7 +1104,7 @@ trace_context_set_stream_output_targets(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_resource_copy_region(struct pipe_context *_pipe, struct pipe_resource *dst, unsigned dst_level, @@ -1139,7 +1139,7 @@ trace_context_resource_copy_region(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_blit(struct pipe_context *_pipe, const struct pipe_blit_info *_info) { @@ -1181,7 +1181,7 @@ trace_context_flush_resource(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_clear(struct pipe_context *_pipe, unsigned buffers, const union pipe_color_union *color, @@ -1210,7 +1210,7 @@ trace_context_clear(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_clear_render_target(struct pipe_context *_pipe, struct pipe_surface *dst, const union pipe_color_union *color, @@ -1237,7 +1237,7 @@ trace_context_clear_render_target(struct pipe_context *_pipe, trace_dump_call_end(); } -static INLINE void +static inline void trace_context_clear_depth_stencil(struct pipe_context *_pipe, struct pipe_surface *dst, unsigned clear_flags, @@ -1269,7 +1269,7 @@ trace_context_clear_depth_stencil(struct pipe_context *_pipe, trace_dump_call_end(); } -static INLINE void +static inline void trace_context_flush(struct pipe_context *_pipe, struct pipe_fence_handle **fence, unsigned flags) @@ -1291,7 +1291,7 @@ trace_context_flush(struct pipe_context *_pipe, } 
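[Editor's note] Every trace_context_* function above follows one template: cast the tracing proxy back to its wrapper type, dump the call and its arguments, then forward to the real context. A skeletal sketch of the proxy pattern with stand-in types (the trace_dump_* machinery reduced to printf):

#include <stdio.h>

struct pipe_context { void (*set_sample_mask)(struct pipe_context *, unsigned); };

struct trace_context {
   struct pipe_context base;   /* must be first: makes the cast valid */
   struct pipe_context *pipe;  /* the wrapped, real context */
};

static struct trace_context *trace_context(struct pipe_context *p)
{
   return (struct trace_context *)p;
}

static void trace_set_sample_mask(struct pipe_context *_pipe, unsigned mask)
{
   struct trace_context *tr = trace_context(_pipe);

   printf("call set_sample_mask(mask=%u)\n", mask); /* stand-in dump */
   tr->pipe->set_sample_mask(tr->pipe, mask);       /* forward */
}

static void real_set_sample_mask(struct pipe_context *p, unsigned mask)
{
   (void)p;
   printf("real driver got mask %u\n", mask);
}

int main(void)
{
   struct pipe_context real = { real_set_sample_mask };
   struct trace_context tr = { { trace_set_sample_mask }, &real };
   tr.base.set_sample_mask(&tr.base, 0xf);
   return 0;
}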
-static INLINE void +static inline void trace_context_destroy(struct pipe_context *_pipe) { struct trace_context *tr_ctx = trace_context(_pipe); diff --git a/src/gallium/drivers/trace/tr_context.h b/src/gallium/drivers/trace/tr_context.h index 1e5ad88d034..ad57d9d5243 100644 --- a/src/gallium/drivers/trace/tr_context.h +++ b/src/gallium/drivers/trace/tr_context.h @@ -54,7 +54,7 @@ void trace_context_check(const struct pipe_context *pipe); -static INLINE struct trace_context * +static inline struct trace_context * trace_context(struct pipe_context *pipe) { assert(pipe); diff --git a/src/gallium/drivers/trace/tr_dump.c b/src/gallium/drivers/trace/tr_dump.c index 753b92d8b54..601e2cbbec5 100644 --- a/src/gallium/drivers/trace/tr_dump.c +++ b/src/gallium/drivers/trace/tr_dump.c @@ -64,7 +64,7 @@ static long unsigned call_no = 0; static boolean dumping = FALSE; -static INLINE void +static inline void trace_dump_write(const char *buf, size_t size) { if (stream) { @@ -73,14 +73,14 @@ trace_dump_write(const char *buf, size_t size) } -static INLINE void +static inline void trace_dump_writes(const char *s) { trace_dump_write(s, strlen(s)); } -static INLINE void +static inline void trace_dump_writef(const char *format, ...) { static char buf[1024]; @@ -93,7 +93,7 @@ trace_dump_writef(const char *format, ...) } -static INLINE void +static inline void trace_dump_escape(const char *str) { const unsigned char *p = (const unsigned char *)str; @@ -117,7 +117,7 @@ trace_dump_escape(const char *str) } -static INLINE void +static inline void trace_dump_indent(unsigned level) { unsigned i; @@ -126,14 +126,14 @@ trace_dump_indent(unsigned level) } -static INLINE void +static inline void trace_dump_newline(void) { trace_dump_writes("\n"); } -static INLINE void +static inline void trace_dump_tag(const char *name) { trace_dump_writes("<"); @@ -142,7 +142,7 @@ trace_dump_tag(const char *name) } -static INLINE void +static inline void trace_dump_tag_begin(const char *name) { trace_dump_writes("<"); @@ -150,7 +150,7 @@ trace_dump_tag_begin(const char *name) trace_dump_writes(">"); } -static INLINE void +static inline void trace_dump_tag_begin1(const char *name, const char *attr1, const char *value1) { @@ -164,7 +164,7 @@ trace_dump_tag_begin1(const char *name, } -static INLINE void +static inline void trace_dump_tag_begin2(const char *name, const char *attr1, const char *value1, const char *attr2, const char *value2) @@ -183,7 +183,7 @@ trace_dump_tag_begin2(const char *name, } -static INLINE void +static inline void trace_dump_tag_begin3(const char *name, const char *attr1, const char *value1, const char *attr2, const char *value2, @@ -207,7 +207,7 @@ trace_dump_tag_begin3(const char *name, } -static INLINE void +static inline void trace_dump_tag_end(const char *name) { trace_dump_writes("</"); diff --git a/src/gallium/drivers/trace/tr_dump_defines.h b/src/gallium/drivers/trace/tr_dump_defines.h index 0c83c2b68f1..b38d63eac59 100644 --- a/src/gallium/drivers/trace/tr_dump_defines.h +++ b/src/gallium/drivers/trace/tr_dump_defines.h @@ -34,7 +34,7 @@ #include "tr_dump.h" -static INLINE void +static inline void trace_dump_format(enum pipe_format format) { if (!trace_dumping_enabled_locked()) @@ -44,7 +44,7 @@ trace_dump_format(enum pipe_format format) } -static INLINE void +static inline void trace_dump_query_type(unsigned value) { if (!trace_dumping_enabled_locked()) diff --git a/src/gallium/drivers/trace/tr_screen.c b/src/gallium/drivers/trace/tr_screen.c index 266626defa8..1d86a378eea 100644 --- 
a/src/gallium/drivers/trace/tr_screen.c +++ b/src/gallium/drivers/trace/tr_screen.c @@ -370,29 +370,6 @@ trace_screen_fence_reference(struct pipe_screen *_screen, static boolean -trace_screen_fence_signalled(struct pipe_screen *_screen, - struct pipe_fence_handle *fence) -{ - struct trace_screen *tr_scr = trace_screen(_screen); - struct pipe_screen *screen = tr_scr->screen; - int result; - - trace_dump_call_begin("pipe_screen", "fence_signalled"); - - trace_dump_arg(ptr, screen); - trace_dump_arg(ptr, fence); - - result = screen->fence_signalled(screen, fence); - - trace_dump_ret(bool, result); - - trace_dump_call_end(); - - return result; -} - - -static boolean trace_screen_fence_finish(struct pipe_screen *_screen, struct pipe_fence_handle *fence, uint64_t timeout) @@ -503,7 +480,6 @@ trace_screen_create(struct pipe_screen *screen) tr_scr->base.resource_get_handle = trace_screen_resource_get_handle; tr_scr->base.resource_destroy = trace_screen_resource_destroy; tr_scr->base.fence_reference = trace_screen_fence_reference; - tr_scr->base.fence_signalled = trace_screen_fence_signalled; tr_scr->base.fence_finish = trace_screen_fence_finish; tr_scr->base.flush_frontbuffer = trace_screen_flush_frontbuffer; tr_scr->base.get_timestamp = trace_screen_get_timestamp; diff --git a/src/gallium/drivers/trace/tr_texture.h b/src/gallium/drivers/trace/tr_texture.h index 5e45c3c2f8f..e48b7b39e24 100644 --- a/src/gallium/drivers/trace/tr_texture.h +++ b/src/gallium/drivers/trace/tr_texture.h @@ -85,7 +85,7 @@ struct trace_transfer }; -static INLINE struct trace_resource * +static inline struct trace_resource * trace_resource(struct pipe_resource *texture) { if(!texture) @@ -95,7 +95,7 @@ trace_resource(struct pipe_resource *texture) } -static INLINE struct trace_surface * +static inline struct trace_surface * trace_surface(struct pipe_surface *surface) { if(!surface) @@ -105,7 +105,7 @@ trace_surface(struct pipe_surface *surface) } -static INLINE struct trace_sampler_view * +static inline struct trace_sampler_view * trace_sampler_view(struct pipe_sampler_view *sampler_view) { if (!sampler_view) @@ -114,7 +114,7 @@ trace_sampler_view(struct pipe_sampler_view *sampler_view) } -static INLINE struct trace_transfer * +static inline struct trace_transfer * trace_transfer(struct pipe_transfer *transfer) { if(!transfer) diff --git a/src/gallium/drivers/vc4/Makefile.am b/src/gallium/drivers/vc4/Makefile.am index 3f62ce21a9f..f4a57ba3404 100644 --- a/src/gallium/drivers/vc4/Makefile.am +++ b/src/gallium/drivers/vc4/Makefile.am @@ -19,8 +19,6 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS # IN THE SOFTWARE. 
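[Editor's note] The tr_dump.c helpers renamed earlier (trace_dump_tag_begin/_end over the trace_dump_write/writes primitives) emit the trace as nested XML-style tags layered on a single write routine. A tiny sketch of that layering, with stream selection and locking omitted:

#include <stdio.h>
#include <string.h>

static void dump_write(const char *buf, size_t size) { fwrite(buf, 1, size, stdout); }
static void dump_writes(const char *s)               { dump_write(s, strlen(s)); }

static void tag_begin(const char *name)
{
   dump_writes("<");
   dump_writes(name);
   dump_writes(">");
}

static void tag_end(const char *name)
{
   dump_writes("</");
   dump_writes(name);
   dump_writes(">");
}

int main(void)
{
   tag_begin("call");
   dump_writes("clear");
   tag_end("call");
   dump_writes("\n");   /* prints: <call>clear</call> */
   return 0;
}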
-AUTOMAKE_OPTIONS = subdir-objects - include Makefile.sources include $(top_srcdir)/src/gallium/Automake.inc @@ -30,10 +28,10 @@ SIM_LDFLAGS = -lsimpenrose endif AM_CFLAGS = \ + -I$(top_builddir)/src/glsl/nir \ $(LIBDRM_CFLAGS) \ $(GALLIUM_DRIVER_CFLAGS) \ $(SIM_CFLAGS) \ - -I$(top_srcdir)/src/mesa/ \ $() noinst_LTLIBRARIES = libvc4.la diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources index 1eb029e67e7..6fb40c20562 100644 --- a/src/gallium/drivers/vc4/Makefile.sources +++ b/src/gallium/drivers/vc4/Makefile.sources @@ -19,6 +19,8 @@ C_SOURCES := \ vc4_fence.c \ vc4_formats.c \ vc4_job.c \ + vc4_nir_lower_blend.c \ + vc4_nir_lower_io.c \ vc4_opt_algebraic.c \ vc4_opt_constant_folding.c \ vc4_opt_copy_propagation.c \ @@ -49,4 +51,5 @@ C_SOURCES := \ vc4_state.c \ vc4_tiling.c \ vc4_tiling.h \ + vc4_uniforms.c \ $() diff --git a/src/gallium/drivers/vc4/kernel/vc4_drv.h b/src/gallium/drivers/vc4/kernel/vc4_drv.h index 1fd8aa9fb28..ffc973735ae 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_drv.h +++ b/src/gallium/drivers/vc4/kernel/vc4_drv.h @@ -26,17 +26,6 @@ #include "vc4_simulator_validate.h" -enum vc4_bo_mode { - VC4_MODE_UNDECIDED, - VC4_MODE_RENDER, - VC4_MODE_SHADER, -}; - -struct vc4_bo_exec_state { - struct drm_gem_cma_object *bo; - enum vc4_bo_mode mode; -}; - struct vc4_exec_info { /* Sequence number for this bin/render job. */ uint64_t seqno; @@ -47,7 +36,7 @@ struct vc4_exec_info { /* This is the array of BOs that were looked up at the start of exec. * Command validation will use indices into this array. */ - struct vc4_bo_exec_state *bo; + struct drm_gem_cma_object **bo; uint32_t bo_count; /* List of other BOs used in the job that need to be released @@ -72,7 +61,6 @@ struct vc4_exec_info { * command lists. */ struct vc4_shader_state { - uint8_t packet; uint32_t addr; /* Maximum vertex index referenced by any primitive using this * shader state. @@ -88,6 +76,7 @@ struct vc4_exec_info { bool found_tile_binning_mode_config_packet; bool found_start_tile_binning_packet; bool found_increment_semaphore_packet; + bool found_flush; uint8_t bin_tiles_x, bin_tiles_y; struct drm_gem_cma_object *tile_bo; uint32_t tile_alloc_offset; @@ -99,6 +88,9 @@ struct vc4_exec_info { uint32_t ct0ca, ct0ea; uint32_t ct1ca, ct1ea; + /* Pointer to the unvalidated bin CL (if present). */ + void *bin_u; + /* Pointers to the shader recs. 
These paddr gets incremented as CL * packets are relocated in validate_gl_shader_state, and the vaddrs * (u and v) get incremented and size decremented as the shader recs @@ -168,10 +160,8 @@ vc4_validate_shader_recs(struct drm_device *dev, struct vc4_exec_info *exec); struct vc4_validated_shader_info * vc4_validate_shader(struct drm_gem_cma_object *shader_obj); -bool vc4_use_bo(struct vc4_exec_info *exec, - uint32_t hindex, - enum vc4_bo_mode mode, - struct drm_gem_cma_object **obj); +struct drm_gem_cma_object *vc4_use_bo(struct vc4_exec_info *exec, + uint32_t hindex); int vc4_get_rcl(struct drm_device *dev, struct vc4_exec_info *exec); diff --git a/src/gallium/drivers/vc4/kernel/vc4_gem.c b/src/gallium/drivers/vc4/kernel/vc4_gem.c index e4b7fea5968..93f9ec7ed9b 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_gem.c +++ b/src/gallium/drivers/vc4/kernel/vc4_gem.c @@ -112,6 +112,8 @@ vc4_get_bcl(struct drm_device *dev, struct vc4_exec_info *exec) exec->ct0ca = exec->exec_bo->paddr + bin_offset; + exec->bin_u = bin; + exec->shader_rec_v = exec->exec_bo->vaddr + shader_rec_offset; exec->shader_rec_p = exec->exec_bo->paddr + shader_rec_offset; exec->shader_rec_size = args->shader_rec_size; diff --git a/src/gallium/drivers/vc4/kernel/vc4_packet.h b/src/gallium/drivers/vc4/kernel/vc4_packet.h index 88cfc0fa9f0..771e2b78761 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_packet.h +++ b/src/gallium/drivers/vc4/kernel/vc4_packet.h @@ -88,16 +88,22 @@ enum vc4_packet { #define VC4_PACKET_START_TILE_BINNING_SIZE 1 #define VC4_PACKET_INCREMENT_SEMAPHORE_SIZE 1 #define VC4_PACKET_WAIT_ON_SEMAPHORE_SIZE 1 +#define VC4_PACKET_BRANCH_SIZE 5 #define VC4_PACKET_BRANCH_TO_SUB_LIST_SIZE 5 #define VC4_PACKET_STORE_MS_TILE_BUFFER_SIZE 1 #define VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF_SIZE 1 +#define VC4_PACKET_STORE_FULL_RES_TILE_BUFFER_SIZE 5 +#define VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER_SIZE 5 #define VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE 7 #define VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE 7 #define VC4_PACKET_GL_INDEXED_PRIMITIVE_SIZE 14 #define VC4_PACKET_GL_ARRAY_PRIMITIVE_SIZE 10 +#define VC4_PACKET_COMPRESSED_PRIMITIVE_SIZE 1 +#define VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE_SIZE 1 #define VC4_PACKET_PRIMITIVE_LIST_FORMAT_SIZE 2 #define VC4_PACKET_GL_SHADER_STATE_SIZE 5 #define VC4_PACKET_NV_SHADER_STATE_SIZE 5 +#define VC4_PACKET_VG_SHADER_STATE_SIZE 5 #define VC4_PACKET_CONFIGURATION_BITS_SIZE 4 #define VC4_PACKET_FLAT_SHADE_FLAGS_SIZE 5 #define VC4_PACKET_POINT_SIZE_SIZE 5 @@ -106,6 +112,7 @@ enum vc4_packet { #define VC4_PACKET_DEPTH_OFFSET_SIZE 5 #define VC4_PACKET_CLIP_WINDOW_SIZE 9 #define VC4_PACKET_VIEWPORT_OFFSET_SIZE 5 +#define VC4_PACKET_Z_CLIPPING_SIZE 9 #define VC4_PACKET_CLIPPER_XY_SCALING_SIZE 9 #define VC4_PACKET_CLIPPER_Z_SCALING_SIZE 9 #define VC4_PACKET_TILE_BINNING_MODE_CONFIG_SIZE 16 @@ -136,6 +143,16 @@ enum vc4_packet { /** @{ * + * low bits of VC4_PACKET_STORE_FULL_RES_TILE_BUFFER and + * VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER. 
+ */ +#define VC4_LOADSTORE_FULL_RES_EOF (1 << 3) +#define VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL (1 << 2) +#define VC4_LOADSTORE_FULL_RES_DISABLE_ZS (1 << 1) +#define VC4_LOADSTORE_FULL_RES_DISABLE_COLOR (1 << 0) + +/** @{ + * * byte 2 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and * VC4_PACKET_LOAD_TILE_BUFFER_GENERAL (low bits of the address) */ diff --git a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c index e2d907ad91f..b827eb7e9e1 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c +++ b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c @@ -100,7 +100,8 @@ static void emit_tile(struct vc4_exec_info *exec, struct vc4_rcl_setup *setup, uint8_t x, uint8_t y, bool first, bool last) { - bool has_bin = exec->args->bin_cl_size != 0; + struct drm_vc4_submit_cl *args = exec->args; + bool has_bin = args->bin_cl_size != 0; /* Note that the load doesn't actually occur until the * tile coords packet is processed, and only one load @@ -108,10 +109,9 @@ static void emit_tile(struct vc4_exec_info *exec, */ if (setup->color_read) { rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL); - rcl_u16(setup, exec->args->color_read.bits); + rcl_u16(setup, args->color_read.bits); rcl_u32(setup, - setup->color_read->paddr + - exec->args->color_read.offset); + setup->color_read->paddr + args->color_read.offset); } if (setup->zs_read) { @@ -122,9 +122,8 @@ static void emit_tile(struct vc4_exec_info *exec, } rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL); - rcl_u16(setup, exec->args->zs_read.bits); - rcl_u32(setup, - setup->zs_read->paddr + exec->args->zs_read.offset); + rcl_u16(setup, args->zs_read.bits); + rcl_u32(setup, setup->zs_read->paddr + args->zs_read.offset); } /* Clipping depends on tile coordinates having been @@ -147,11 +146,11 @@ static void emit_tile(struct vc4_exec_info *exec, if (setup->zs_write) { rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL); - rcl_u16(setup, exec->args->zs_write.bits | + rcl_u16(setup, args->zs_write.bits | (setup->color_ms_write ? VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR : 0)); rcl_u32(setup, - (setup->zs_write->paddr + exec->args->zs_write.offset) | + (setup->zs_write->paddr + args->zs_write.offset) | ((last && !setup->color_ms_write) ? 
VC4_LOADSTORE_TILE_BUFFER_EOF : 0)); } @@ -172,11 +171,12 @@ static void emit_tile(struct vc4_exec_info *exec, static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec, struct vc4_rcl_setup *setup) { - bool has_bin = exec->args->bin_cl_size != 0; - uint8_t min_x_tile = exec->args->min_x_tile; - uint8_t min_y_tile = exec->args->min_y_tile; - uint8_t max_x_tile = exec->args->max_x_tile; - uint8_t max_y_tile = exec->args->max_y_tile; + struct drm_vc4_submit_cl *args = exec->args; + bool has_bin = args->bin_cl_size != 0; + uint8_t min_x_tile = args->min_x_tile; + uint8_t min_y_tile = args->min_y_tile; + uint8_t max_x_tile = args->max_x_tile; + uint8_t max_y_tile = args->max_y_tile; uint8_t xtiles = max_x_tile - min_x_tile + 1; uint8_t ytiles = max_y_tile - min_y_tile + 1; uint8_t x, y; @@ -185,7 +185,7 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec, size = VC4_PACKET_TILE_RENDERING_MODE_CONFIG_SIZE; loop_body_size = VC4_PACKET_TILE_COORDINATES_SIZE; - if (exec->args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) { + if (args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) { size += VC4_PACKET_CLEAR_COLORS_SIZE + VC4_PACKET_TILE_COORDINATES_SIZE + VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE; @@ -208,7 +208,7 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec, } if (setup->zs_write) - loop_body_size += VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE; + loop_body_size += VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE; if (setup->color_ms_write) { if (setup->zs_write) loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE; @@ -226,23 +226,23 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec, rcl_u32(setup, (setup->color_ms_write ? (setup->color_ms_write->paddr + - exec->args->color_ms_write.offset) : + args->color_ms_write.offset) : 0)); - rcl_u16(setup, exec->args->width); - rcl_u16(setup, exec->args->height); - rcl_u16(setup, exec->args->color_ms_write.bits); + rcl_u16(setup, args->width); + rcl_u16(setup, args->height); + rcl_u16(setup, args->color_ms_write.bits); /* The tile buffer gets cleared when the previous tile is stored. If * the clear values changed between frames, then the tile buffer has * stale clear values in it, so we have to do a store in None mode (no * writes) so that we trigger the tile buffer clear. 
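/* A condensed view of the sizing contract in vc4_create_rcl_bo (a sketch, not
 * part of the patch): the render list is emitted into a buffer of a
 * precomputed size, so each branch of the size accounting must mirror an emit
 * path one-to-one.  The zs_write hunk above swaps in the store packet's size;
 * load and store are both 7 bytes per vc4_packet.h, so the old total was
 * still correct in practice -- a latent copy-and-paste rather than an overflow.
 */
static uint32_t rcl_size_for(uint32_t xtiles, uint32_t ytiles,
                             uint32_t loop_body_size)
{
        uint32_t size = VC4_PACKET_TILE_RENDERING_MODE_CONFIG_SIZE;

        size += xtiles * ytiles * loop_body_size; /* one loop body per tile */
        return size; /* every later rcl_u8/u16/u32 must land inside this */
}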
*/ - if (exec->args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) { + if (args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) { rcl_u8(setup, VC4_PACKET_CLEAR_COLORS); - rcl_u32(setup, exec->args->clear_color[0]); - rcl_u32(setup, exec->args->clear_color[1]); - rcl_u32(setup, exec->args->clear_z); - rcl_u8(setup, exec->args->clear_s); + rcl_u32(setup, args->clear_color[0]); + rcl_u32(setup, args->clear_color[1]); + rcl_u32(setup, args->clear_z); + rcl_u8(setup, args->clear_s); vc4_tile_coordinates(setup, 0, 0); @@ -286,7 +286,8 @@ static int vc4_rcl_surface_setup(struct vc4_exec_info *exec, if (surf->hindex == ~0) return 0; - if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj)) + *obj = vc4_use_bo(exec, surf->hindex); + if (!*obj) return -EINVAL; if (surf->bits & ~(VC4_LOADSTORE_TILE_BUFFER_TILING_MASK | @@ -365,7 +366,8 @@ vc4_rcl_ms_surface_setup(struct vc4_exec_info *exec, if (surf->hindex == ~0) return 0; - if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj)) + *obj = vc4_use_bo(exec, surf->hindex); + if (!*obj) return -EINVAL; if (tiling > VC4_TILING_FORMAT_LT) { diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate.c b/src/gallium/drivers/vc4/kernel/vc4_validate.c index a0b67a7e50b..b248831113c 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_validate.c +++ b/src/gallium/drivers/vc4/kernel/vc4_validate.c @@ -94,42 +94,42 @@ size_is_lt(uint32_t width, uint32_t height, int cpp) height <= 4 * utile_height(cpp)); } -bool -vc4_use_bo(struct vc4_exec_info *exec, - uint32_t hindex, - enum vc4_bo_mode mode, - struct drm_gem_cma_object **obj) +struct drm_gem_cma_object * +vc4_use_bo(struct vc4_exec_info *exec, uint32_t hindex) { - *obj = NULL; + struct drm_gem_cma_object *obj; + struct drm_vc4_bo *bo; if (hindex >= exec->bo_count) { DRM_ERROR("BO index %d greater than BO count %d\n", hindex, exec->bo_count); - return false; + return NULL; } + obj = exec->bo[hindex]; + bo = to_vc4_bo(&obj->base); - if (exec->bo[hindex].mode != mode) { - if (exec->bo[hindex].mode == VC4_MODE_UNDECIDED) { - exec->bo[hindex].mode = mode; - } else { - DRM_ERROR("BO index %d reused with mode %d vs %d\n", - hindex, exec->bo[hindex].mode, mode); - return false; - } + if (bo->validated_shader) { + DRM_ERROR("Trying to use shader BO as something other than " + "a shader\n"); + return NULL; } - *obj = exec->bo[hindex].bo; - return true; + return obj; +} + +static struct drm_gem_cma_object * +vc4_use_handle(struct vc4_exec_info *exec, uint32_t gem_handles_packet_index) +{ + return vc4_use_bo(exec, exec->bo_index[gem_handles_packet_index]); } static bool -vc4_use_handle(struct vc4_exec_info *exec, - uint32_t gem_handles_packet_index, - enum vc4_bo_mode mode, - struct drm_gem_cma_object **obj) +validate_bin_pos(struct vc4_exec_info *exec, void *untrusted, uint32_t pos) { - return vc4_use_bo(exec, exec->bo_index[gem_handles_packet_index], - mode, obj); + /* Note that the untrusted pointer passed to these functions is + * incremented past the packet byte. 
+ */ + return (untrusted - 1 == exec->bin_u + pos); } static uint32_t @@ -201,14 +201,15 @@ vc4_check_tex_size(struct vc4_exec_info *exec, struct drm_gem_cma_object *fbo, return true; } + static int -validate_flush_all(VALIDATE_ARGS) +validate_flush(VALIDATE_ARGS) { - if (exec->found_increment_semaphore_packet) { - DRM_ERROR("VC4_PACKET_FLUSH_ALL after " - "VC4_PACKET_INCREMENT_SEMAPHORE\n"); + if (!validate_bin_pos(exec, untrusted, exec->args->bin_cl_size - 1)) { + DRM_ERROR("Bin CL must end with VC4_PACKET_FLUSH\n"); return -EINVAL; } + exec->found_flush = true; return 0; } @@ -233,17 +234,13 @@ validate_start_tile_binning(VALIDATE_ARGS) static int validate_increment_semaphore(VALIDATE_ARGS) { - if (exec->found_increment_semaphore_packet) { - DRM_ERROR("Duplicate VC4_PACKET_INCREMENT_SEMAPHORE\n"); + if (!validate_bin_pos(exec, untrusted, exec->args->bin_cl_size - 2)) { + DRM_ERROR("Bin CL must end with " + "VC4_PACKET_INCREMENT_SEMAPHORE\n"); return -EINVAL; } exec->found_increment_semaphore_packet = true; - /* Once we've found the semaphore increment, there should be one FLUSH - * then the end of the command list. The FLUSH actually triggers the - * increment, so we only need to make sure there - */ - return 0; } @@ -257,11 +254,6 @@ validate_indexed_prim_list(VALIDATE_ARGS) uint32_t index_size = (*(uint8_t *)(untrusted + 0) >> 4) ? 2 : 1; struct vc4_shader_state *shader_state; - if (exec->found_increment_semaphore_packet) { - DRM_ERROR("Drawing after VC4_PACKET_INCREMENT_SEMAPHORE\n"); - return -EINVAL; - } - /* Check overflow condition */ if (exec->shader_state_count == 0) { DRM_ERROR("shader state must precede primitives\n"); @@ -272,7 +264,8 @@ validate_indexed_prim_list(VALIDATE_ARGS) if (max_index > shader_state->max_index) shader_state->max_index = max_index; - if (!vc4_use_handle(exec, 0, VC4_MODE_RENDER, &ib)) + ib = vc4_use_handle(exec, 0); + if (!ib) return -EINVAL; if (offset > ib->base.size || @@ -295,11 +288,6 @@ validate_gl_array_primitive(VALIDATE_ARGS) uint32_t max_index; struct vc4_shader_state *shader_state; - if (exec->found_increment_semaphore_packet) { - DRM_ERROR("Drawing after VC4_PACKET_INCREMENT_SEMAPHORE\n"); - return -EINVAL; - } - /* Check overflow condition */ if (exec->shader_state_count == 0) { DRM_ERROR("shader state must precede primitives\n"); @@ -329,7 +317,6 @@ validate_gl_shader_state(VALIDATE_ARGS) return -EINVAL; } - exec->shader_state[i].packet = VC4_PACKET_GL_SHADER_STATE; exec->shader_state[i].addr = *(uint32_t *)untrusted; exec->shader_state[i].max_index = 0; @@ -348,31 +335,6 @@ validate_gl_shader_state(VALIDATE_ARGS) } static int -validate_nv_shader_state(VALIDATE_ARGS) -{ - uint32_t i = exec->shader_state_count++; - - if (i >= exec->shader_state_size) { - DRM_ERROR("More requests for shader states than declared\n"); - return -EINVAL; - } - - exec->shader_state[i].packet = VC4_PACKET_NV_SHADER_STATE; - exec->shader_state[i].addr = *(uint32_t *)untrusted; - - if (exec->shader_state[i].addr & 15) { - DRM_ERROR("NV shader state address 0x%08x misaligned\n", - exec->shader_state[i].addr); - return -EINVAL; - } - - *(uint32_t *)validated = (exec->shader_state[i].addr + - exec->shader_rec_p); - - return 0; -} - -static int validate_tile_binning_config(VALIDATE_ARGS) { struct drm_device *dev = exec->exec_bo->base.dev; @@ -473,8 +435,8 @@ static const struct cmd_info { } cmd_info[] = { VC4_DEFINE_PACKET(VC4_PACKET_HALT, "halt", NULL), VC4_DEFINE_PACKET(VC4_PACKET_NOP, "nop", NULL), - VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, "flush", NULL), - 
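/* Sketch of the new termination rule (names from the patch; the kernel
 * context around vc4_exec_info is assumed): instead of tracking "no draws
 * after the semaphore", the validator pins the two packets to fixed
 * positions.  Since "untrusted" points one byte past the opcode, a packet
 * sits at position pos exactly when untrusted - 1 == bin_u + pos: FLUSH must
 * be the last byte of the bin CL and INCREMENT_SEMAPHORE the byte before it.
 */
static bool bin_cl_terminated(struct vc4_exec_info *exec,
                              void *flush_untrusted, void *sema_untrusted)
{
        uint32_t size = exec->args->bin_cl_size;

        return flush_untrusted - 1 == exec->bin_u + size - 1 &&
               sema_untrusted - 1 == exec->bin_u + size - 2;
}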
VC4_DEFINE_PACKET(VC4_PACKET_FLUSH_ALL, "flush all state", validate_flush_all), + VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, "flush", validate_flush), + VC4_DEFINE_PACKET(VC4_PACKET_FLUSH_ALL, "flush all state", NULL), VC4_DEFINE_PACKET(VC4_PACKET_START_TILE_BINNING, "start tile binning", validate_start_tile_binning), VC4_DEFINE_PACKET(VC4_PACKET_INCREMENT_SEMAPHORE, "increment semaphore", validate_increment_semaphore), @@ -488,7 +450,7 @@ static const struct cmd_info { VC4_DEFINE_PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT, "primitive list format", NULL), VC4_DEFINE_PACKET(VC4_PACKET_GL_SHADER_STATE, "GL Shader State", validate_gl_shader_state), - VC4_DEFINE_PACKET(VC4_PACKET_NV_SHADER_STATE, "NV Shader State", validate_nv_shader_state), + /* We don't support validating NV shader states. */ VC4_DEFINE_PACKET(VC4_PACKET_CONFIGURATION_BITS, "configuration bits", NULL), VC4_DEFINE_PACKET(VC4_PACKET_FLAT_SHADE_FLAGS, "flat shade flags", NULL), @@ -525,7 +487,7 @@ vc4_validate_bin_cl(struct drm_device *dev, u8 cmd = *(uint8_t *)src_pkt; const struct cmd_info *info; - if (cmd > ARRAY_SIZE(cmd_info)) { + if (cmd >= ARRAY_SIZE(cmd_info)) { DRM_ERROR("0x%08x: packet %d out of bounds\n", src_offset, cmd); return -EINVAL; @@ -580,8 +542,16 @@ vc4_validate_bin_cl(struct drm_device *dev, return -EINVAL; } - if (!exec->found_increment_semaphore_packet) { - DRM_ERROR("Bin CL missing VC4_PACKET_INCREMENT_SEMAPHORE\n"); + /* The bin CL must be ended with INCREMENT_SEMAPHORE and FLUSH. The + * semaphore is used to trigger the render CL to start up, and the + * FLUSH is what caps the bin lists with + * VC4_PACKET_RETURN_FROM_SUB_LIST (so they jump back to the main + * render CL when they get called to) and actually triggers the queued + * semaphore increment. + */ + if (!exec->found_increment_semaphore_packet || !exec->found_flush) { + DRM_ERROR("Bin CL missing VC4_PACKET_INCREMENT_SEMAPHORE + " + "VC4_PACKET_FLUSH\n"); return -EINVAL; } @@ -612,18 +582,19 @@ reloc_tex(struct vc4_exec_info *exec, uint32_t cube_map_stride = 0; enum vc4_texture_data_type type; - if (!vc4_use_bo(exec, texture_handle_index, VC4_MODE_RENDER, &tex)) + tex = vc4_use_bo(exec, texture_handle_index); + if (!tex) return false; if (sample->is_direct) { uint32_t remaining_size = tex->base.size - p0; if (p0 > tex->base.size - 4) { DRM_ERROR("UBO offset greater than UBO size\n"); - return false; + goto fail; } if (p1 > remaining_size - 4) { DRM_ERROR("UBO clamp would allow reads outside of UBO\n"); - return false; + goto fail; } *validated_p0 = tex->paddr + p0; return true; @@ -642,14 +613,14 @@ reloc_tex(struct vc4_exec_info *exec, VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE) { if (cube_map_stride) { DRM_ERROR("Cube map stride set twice\n"); - return false; + goto fail; } cube_map_stride = p3 & VC4_TEX_P2_CMST_MASK; } if (!cube_map_stride) { DRM_ERROR("Cube map stride not set\n"); - return false; + goto fail; } } @@ -683,7 +654,7 @@ reloc_tex(struct vc4_exec_info *exec, case VC4_TEXTURE_TYPE_YUV422R: default: DRM_ERROR("Texture format %d unsupported\n", type); - return false; + goto fail; } utile_w = utile_width(cpp); utile_h = utile_height(cpp); @@ -699,7 +670,7 @@ reloc_tex(struct vc4_exec_info *exec, if (!vc4_check_tex_size(exec, tex, offset + cube_map_stride * 5, tiling_format, width, height, cpp)) { - return false; + goto fail; } /* The mipmap levels are stored before the base of the texture. 
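/* The same off-by-one is fixed three times in this patch (cmd_info above,
 * packet_info in vc4_cl_dump.c, vc4_format_table in vc4_formats.c): with N
 * entries the valid indices are 0..N-1, so a `>` bounds check lets index N
 * through to read one past the end.  In miniature (self-contained):
 */
#include <stdio.h>
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

int main(void)
{
        const char *names[] = { "halt", "nop", "flush" };
        unsigned cmd = 3;                  /* one past the end */

        if (cmd > ARRAY_SIZE(names))       /* old check: 3 > 3 is false, bug */
                return 1;
        if (cmd >= ARRAY_SIZE(names))      /* new check: 3 >= 3 rejects it */
                return 1;
        printf("%s\n", names[cmd]);        /* only reached for valid cmd */
        return 0;
}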
Make @@ -740,7 +711,7 @@ reloc_tex(struct vc4_exec_info *exec, i, level_width, level_height, aligned_width, aligned_height, level_size, offset); - return false; + goto fail; } offset -= level_size; @@ -749,54 +720,37 @@ reloc_tex(struct vc4_exec_info *exec, *validated_p0 = tex->paddr + p0; return true; + fail: + DRM_INFO("Texture p0 at %d: 0x%08x\n", sample->p_offset[0], p0); + DRM_INFO("Texture p1 at %d: 0x%08x\n", sample->p_offset[1], p1); + DRM_INFO("Texture p2 at %d: 0x%08x\n", sample->p_offset[2], p2); + DRM_INFO("Texture p3 at %d: 0x%08x\n", sample->p_offset[3], p3); + return false; } static int -validate_shader_rec(struct drm_device *dev, - struct vc4_exec_info *exec, - struct vc4_shader_state *state) +validate_gl_shader_rec(struct drm_device *dev, + struct vc4_exec_info *exec, + struct vc4_shader_state *state) { uint32_t *src_handles; void *pkt_u, *pkt_v; - enum shader_rec_reloc_type { - RELOC_CODE, - RELOC_VBO, - }; - struct shader_rec_reloc { - enum shader_rec_reloc_type type; - uint32_t offset; - }; - static const struct shader_rec_reloc gl_relocs[] = { - { RELOC_CODE, 4 }, /* fs */ - { RELOC_CODE, 16 }, /* vs */ - { RELOC_CODE, 28 }, /* cs */ + static const uint32_t shader_reloc_offsets[] = { + 4, /* fs */ + 16, /* vs */ + 28, /* cs */ }; - static const struct shader_rec_reloc nv_relocs[] = { - { RELOC_CODE, 4 }, /* fs */ - { RELOC_VBO, 12 } - }; - const struct shader_rec_reloc *relocs; - struct drm_gem_cma_object *bo[ARRAY_SIZE(gl_relocs) + 8]; - uint32_t nr_attributes = 0, nr_fixed_relocs, nr_relocs, packet_size; + uint32_t shader_reloc_count = ARRAY_SIZE(shader_reloc_offsets); + struct drm_gem_cma_object *bo[shader_reloc_count + 8]; + uint32_t nr_attributes, nr_relocs, packet_size; int i; - struct vc4_validated_shader_info *validated_shader = NULL; - - if (state->packet == VC4_PACKET_NV_SHADER_STATE) { - relocs = nv_relocs; - nr_fixed_relocs = ARRAY_SIZE(nv_relocs); - packet_size = 16; - } else { - relocs = gl_relocs; - nr_fixed_relocs = ARRAY_SIZE(gl_relocs); - - nr_attributes = state->addr & 0x7; - if (nr_attributes == 0) - nr_attributes = 8; - packet_size = gl_shader_rec_size(state->addr); - } - nr_relocs = nr_fixed_relocs + nr_attributes; + nr_attributes = state->addr & 0x7; + if (nr_attributes == 0) + nr_attributes = 8; + packet_size = gl_shader_rec_size(state->addr); + nr_relocs = ARRAY_SIZE(shader_reloc_offsets) + nr_attributes; if (nr_relocs * 4 > exec->shader_rec_size) { DRM_ERROR("overflowed shader recs reading %d handles " "from %d bytes left\n", @@ -826,21 +780,30 @@ validate_shader_rec(struct drm_device *dev, exec->shader_rec_v += roundup(packet_size, 16); exec->shader_rec_size -= packet_size; - for (i = 0; i < nr_relocs; i++) { - enum vc4_bo_mode mode; - - if (i < nr_fixed_relocs && relocs[i].type == RELOC_CODE) - mode = VC4_MODE_SHADER; - else - mode = VC4_MODE_RENDER; + if (!(*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD)) { + DRM_ERROR("Multi-threaded fragment shaders not supported.\n"); + return -EINVAL; + } - if (!vc4_use_bo(exec, src_handles[i], mode, &bo[i])) { - return false; + for (i = 0; i < shader_reloc_count; i++) { + if (src_handles[i] > exec->bo_count) { + DRM_ERROR("Shader handle %d too big\n", src_handles[i]); + return -EINVAL; } + + bo[i] = exec->bo[src_handles[i]]; + if (!bo[i]) + return -EINVAL; + } + for (i = shader_reloc_count; i < nr_relocs; i++) { + bo[i] = vc4_use_bo(exec, src_handles[i]); + if (!bo[i]) + return -EINVAL; } - for (i = 0; i < nr_fixed_relocs; i++) { - uint32_t o = relocs[i].offset; + for (i = 0; i < 
shader_reloc_count; i++) { + struct vc4_validated_shader_info *validated_shader; + uint32_t o = shader_reloc_offsets[i]; uint32_t src_offset = *(uint32_t *)(pkt_u + o); uint32_t *texture_handles_u; void *uniform_data_u; @@ -848,58 +811,50 @@ validate_shader_rec(struct drm_device *dev, *(uint32_t *)(pkt_v + o) = bo[i]->paddr + src_offset; - switch (relocs[i].type) { - case RELOC_CODE: - if (src_offset != 0) { - DRM_ERROR("Shaders must be at offset 0 of " - "the BO.\n"); - goto fail; - } + if (src_offset != 0) { + DRM_ERROR("Shaders must be at offset 0 of " + "the BO.\n"); + return -EINVAL; + } - kfree(validated_shader); - validated_shader = vc4_validate_shader(bo[i]); - if (!validated_shader) - goto fail; + validated_shader = to_vc4_bo(&bo[i]->base)->validated_shader; + if (!validated_shader) + return -EINVAL; - if (validated_shader->uniforms_src_size > - exec->uniforms_size) { - DRM_ERROR("Uniforms src buffer overflow\n"); - goto fail; - } + if (validated_shader->uniforms_src_size > + exec->uniforms_size) { + DRM_ERROR("Uniforms src buffer overflow\n"); + return -EINVAL; + } - texture_handles_u = exec->uniforms_u; - uniform_data_u = (texture_handles_u + - validated_shader->num_texture_samples); - - memcpy(exec->uniforms_v, uniform_data_u, - validated_shader->uniforms_size); - - for (tex = 0; - tex < validated_shader->num_texture_samples; - tex++) { - if (!reloc_tex(exec, - uniform_data_u, - &validated_shader->texture_samples[tex], - texture_handles_u[tex])) { - goto fail; - } - } + texture_handles_u = exec->uniforms_u; + uniform_data_u = (texture_handles_u + + validated_shader->num_texture_samples); - *(uint32_t *)(pkt_v + o + 4) = exec->uniforms_p; + memcpy(exec->uniforms_v, uniform_data_u, + validated_shader->uniforms_size); - exec->uniforms_u += validated_shader->uniforms_src_size; - exec->uniforms_v += validated_shader->uniforms_size; - exec->uniforms_p += validated_shader->uniforms_size; + for (tex = 0; + tex < validated_shader->num_texture_samples; + tex++) { + if (!reloc_tex(exec, + uniform_data_u, + &validated_shader->texture_samples[tex], + texture_handles_u[tex])) { + return -EINVAL; + } + } - break; + *(uint32_t *)(pkt_v + o + 4) = exec->uniforms_p; - case RELOC_VBO: - break; - } + exec->uniforms_u += validated_shader->uniforms_src_size; + exec->uniforms_v += validated_shader->uniforms_size; + exec->uniforms_p += validated_shader->uniforms_size; } for (i = 0; i < nr_attributes; i++) { - struct drm_gem_cma_object *vbo = bo[nr_fixed_relocs + i]; + struct drm_gem_cma_object *vbo = + bo[ARRAY_SIZE(shader_reloc_offsets) + i]; uint32_t o = 36 + i * 8; uint32_t offset = *(uint32_t *)(pkt_u + o + 0); uint32_t attr_size = *(uint8_t *)(pkt_u + o + 4) + 1; @@ -929,13 +884,7 @@ validate_shader_rec(struct drm_device *dev, *(uint32_t *)(pkt_v + o) = vbo->paddr + offset; } - kfree(validated_shader); - return 0; - -fail: - kfree(validated_shader); - return -EINVAL; } int @@ -946,7 +895,7 @@ vc4_validate_shader_recs(struct drm_device *dev, int ret = 0; for (i = 0; i < exec->shader_state_count; i++) { - ret = validate_shader_rec(dev, exec, &exec->shader_state[i]); + ret = validate_gl_shader_rec(dev, exec, &exec->shader_state[i]); if (ret) return ret; } diff --git a/src/gallium/drivers/vc4/vc4_blit.c b/src/gallium/drivers/vc4/vc4_blit.c index d29e2c9c318..e52a1941730 100644 --- a/src/gallium/drivers/vc4/vc4_blit.c +++ b/src/gallium/drivers/vc4/vc4_blit.c @@ -94,7 +94,7 @@ vc4_render_blit(struct pipe_context *ctx, struct pipe_blit_info *info) struct vc4_context *vc4 = vc4_context(ctx); if 
(!util_blitter_is_blit_supported(vc4->blitter, info)) { - fprintf(stderr, "blit unsupported %s -> %s", + fprintf(stderr, "blit unsupported %s -> %s\n", util_format_short_name(info->src.resource->format), util_format_short_name(info->dst.resource->format)); return false; @@ -135,7 +135,7 @@ vc4_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info) info.dst.resource->nr_samples <= 1 && !util_format_is_depth_or_stencil(info.src.resource->format) && !util_format_is_pure_integer(info.src.resource->format)) { - fprintf(stderr, "color resolve unimplemented"); + fprintf(stderr, "color resolve unimplemented\n"); return; } @@ -147,7 +147,7 @@ vc4_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info) } if (info.mask & PIPE_MASK_S) { - fprintf(stderr, "cannot blit stencil, skipping"); + fprintf(stderr, "cannot blit stencil, skipping\n"); info.mask &= ~PIPE_MASK_S; } diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c index cbdb9e89cf6..f7b41f5816d 100644 --- a/src/gallium/drivers/vc4/vc4_bufmgr.c +++ b/src/gallium/drivers/vc4/vc4_bufmgr.c @@ -1,5 +1,5 @@ /* - * Copyright © 2014 Broadcom + * Copyright © 2014-2015 Broadcom * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -94,7 +94,7 @@ vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name) * allocate something new instead, since we assume that the * user will proceed to CPU map it and fill it with stuff. */ - if (!vc4_bo_wait(bo, 0)) { + if (!vc4_bo_wait(bo, 0, NULL)) { pipe_mutex_unlock(cache->lock); return NULL; } @@ -381,15 +381,57 @@ vc4_bo_get_dmabuf(struct vc4_bo *bo) } struct vc4_bo * -vc4_bo_alloc_mem(struct vc4_screen *screen, const void *data, uint32_t size, - const char *name) +vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data, uint32_t size) { - void *map; struct vc4_bo *bo; + int ret; + + bo = CALLOC_STRUCT(vc4_bo); + if (!bo) + return NULL; + + pipe_reference_init(&bo->reference, 1); + bo->screen = screen; + bo->size = align(size, 4096); + bo->name = "code"; + bo->private = false; /* Make sure it doesn't go back to the cache. 
*/ + + if (!using_vc4_simulator) { + struct drm_vc4_create_shader_bo create = { + .size = size, + .data = (uintptr_t)data, + }; + + ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_CREATE_SHADER_BO, + &create); + bo->handle = create.handle; + } else { + struct drm_mode_create_dumb create; + memset(&create, 0, sizeof(create)); + + create.width = 128; + create.bpp = 8; + create.height = (size + 127) / 128; + + ret = drmIoctl(screen->fd, DRM_IOCTL_MODE_CREATE_DUMB, &create); + bo->handle = create.handle; + assert(create.size >= size); + + vc4_bo_map(bo); + memcpy(bo->map, data, size); + } + if (ret != 0) { + fprintf(stderr, "create shader ioctl failure\n"); + abort(); + } + + screen->bo_count++; + screen->bo_size += bo->size; + if (dump_stats) { + fprintf(stderr, "Allocated shader %dkb:\n", size / 1024); + vc4_bo_dump_stats(screen); + } - bo = vc4_bo_alloc(screen, size, name); - map = vc4_bo_map(bo); - memcpy(map, data, size); return bo; } @@ -413,63 +455,91 @@ vc4_bo_flink(struct vc4_bo *bo, uint32_t *name) return true; } +static int vc4_wait_seqno_ioctl(int fd, uint64_t seqno, uint64_t timeout_ns) +{ + if (using_vc4_simulator) + return 0; + + struct drm_vc4_wait_seqno wait = { + .seqno = seqno, + .timeout_ns = timeout_ns, + }; + int ret = drmIoctl(fd, DRM_IOCTL_VC4_WAIT_SEQNO, &wait); + if (ret == -1) + return -errno; + else + return 0; + +} + bool -vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns) +vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns, + const char *reason) { if (screen->finished_seqno >= seqno) return true; - struct drm_vc4_wait_seqno wait; - memset(&wait, 0, sizeof(wait)); - wait.seqno = seqno; - wait.timeout_ns = timeout_ns; - - int ret; - if (!using_vc4_simulator) - ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_WAIT_SEQNO, &wait); - else { - wait.seqno = screen->finished_seqno; - ret = 0; + if (unlikely(vc4_debug & VC4_DEBUG_PERF) && timeout_ns && reason) { + if (vc4_wait_seqno_ioctl(screen->fd, seqno, 0) == -ETIME) { + fprintf(stderr, "Blocking on seqno %lld for %s\n", + (long long)seqno, reason); + } } - if (ret == 0) { - screen->finished_seqno = wait.seqno; - return true; - } + int ret = vc4_wait_seqno_ioctl(screen->fd, seqno, timeout_ns); + if (ret) { + if (ret != -ETIME) { + fprintf(stderr, "wait failed: %d\n", ret); + abort(); + } - if (errno != ETIME) { - fprintf(stderr, "wait failed: %d\n", ret); - abort(); + return false; } - return false; + screen->finished_seqno = seqno; + return true; +} + +static int vc4_wait_bo_ioctl(int fd, uint32_t handle, uint64_t timeout_ns) +{ + if (using_vc4_simulator) + return 0; + + struct drm_vc4_wait_bo wait = { + .handle = handle, + .timeout_ns = timeout_ns, + }; + int ret = drmIoctl(fd, DRM_IOCTL_VC4_WAIT_BO, &wait); + if (ret == -1) + return -errno; + else + return 0; + } bool -vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns) +vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns, const char *reason) { struct vc4_screen *screen = bo->screen; - struct drm_vc4_wait_bo wait; - memset(&wait, 0, sizeof(wait)); - wait.handle = bo->handle; - wait.timeout_ns = timeout_ns; - - int ret; - if (!using_vc4_simulator) - ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_WAIT_BO, &wait); - else - ret = 0; + if (unlikely(vc4_debug & VC4_DEBUG_PERF) && timeout_ns && reason) { + if (vc4_wait_bo_ioctl(screen->fd, bo->handle, 0) == -ETIME) { + fprintf(stderr, "Blocking on %s BO for %s\n", + bo->name, reason); + } + } - if (ret == 0) - return true; + int ret = vc4_wait_bo_ioctl(screen->fd, bo->handle, timeout_ns); + 
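/* The wrapper convention above in isolation (a sketch; struct and ioctl
 * names here are placeholders): drmIoctl() returns -1 and sets errno on
 * failure, and normalizing that to 0-or-negative-errno lets callers probe
 * with a zero timeout, which is how the VC4_DEBUG_PERF "Blocking on ..."
 * messages detect a stall without actually waiting.
 */
#include <errno.h>
#include <stdint.h>
#include <xf86drm.h>                    /* drmIoctl() from libdrm */

struct hypothetical_wait { uint64_t timeout_ns; };

static int wait_ioctl(int fd, unsigned long req, uint64_t timeout_ns)
{
        struct hypothetical_wait wait = { .timeout_ns = timeout_ns };

        if (drmIoctl(fd, req, &wait) == -1)
                return -errno;          /* -ETIME when the wait timed out */
        return 0;
}
/* wait_ioctl(fd, req, 0) == -ETIME  =>  a real wait would have blocked. */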
if (ret) { + if (ret != -ETIME) { + fprintf(stderr, "wait failed: %d\n", ret); + abort(); + } - if (errno != ETIME) { - fprintf(stderr, "wait failed: %d\n", ret); - abort(); + return false; } - return false; + return true; } void * @@ -515,7 +585,7 @@ vc4_bo_map(struct vc4_bo *bo) { void *map = vc4_bo_map_unsynchronized(bo); - bool ok = vc4_bo_wait(bo, PIPE_TIMEOUT_INFINITE); + bool ok = vc4_bo_wait(bo, PIPE_TIMEOUT_INFINITE, "bo map"); if (!ok) { fprintf(stderr, "BO wait for map failed\n"); abort(); diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.h b/src/gallium/drivers/vc4/vc4_bufmgr.h index 7320695ca8e..b77506e242a 100644 --- a/src/gallium/drivers/vc4/vc4_bufmgr.h +++ b/src/gallium/drivers/vc4/vc4_bufmgr.h @@ -58,8 +58,8 @@ struct vc4_bo { struct vc4_bo *vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name); -struct vc4_bo *vc4_bo_alloc_mem(struct vc4_screen *screen, const void *data, - uint32_t size, const char *name); +struct vc4_bo *vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data, + uint32_t size); void vc4_bo_last_unreference(struct vc4_bo *bo); void vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time); struct vc4_bo *vc4_bo_open_name(struct vc4_screen *screen, uint32_t name, @@ -113,10 +113,11 @@ void * vc4_bo_map_unsynchronized(struct vc4_bo *bo); bool -vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns); +vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns, const char *reason); bool -vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns); +vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns, + const char *reason); void vc4_bufmgr_destroy(struct pipe_screen *pscreen); diff --git a/src/gallium/drivers/vc4/vc4_cl.c b/src/gallium/drivers/vc4/vc4_cl.c index 0700e885cbf..ced4f2dfa86 100644 --- a/src/gallium/drivers/vc4/vc4_cl.c +++ b/src/gallium/drivers/vc4/vc4_cl.c @@ -36,11 +36,12 @@ vc4_init_cl(struct vc4_context *vc4, struct vc4_cl *cl) void cl_ensure_space(struct vc4_cl *cl, uint32_t space) { - if ((cl->next - cl->base) + space <= cl->size) + uint32_t offset = cl_offset(cl); + + if (offset + space <= cl->size) return; uint32_t size = MAX2(cl->size + space, cl->size * 2); - uint32_t offset = cl->next -cl->base; cl->base = reralloc(ralloc_parent(cl->base), cl->base, uint8_t, size); cl->size = size; @@ -60,15 +61,20 @@ vc4_gem_hindex(struct vc4_context *vc4, struct vc4_bo *bo) uint32_t hindex; uint32_t *current_handles = vc4->bo_handles.base; - for (hindex = 0; - hindex < (vc4->bo_handles.next - vc4->bo_handles.base) / 4; - hindex++) { + for (hindex = 0; hindex < cl_offset(&vc4->bo_handles) / 4; hindex++) { if (current_handles[hindex] == bo->handle) return hindex; } - cl_u32(&vc4->bo_handles, bo->handle); - cl_ptr(&vc4->bo_pointers, vc4_bo_reference(bo)); + struct vc4_cl_out *out; + + out = cl_start(&vc4->bo_handles); + cl_u32(&out, bo->handle); + cl_end(&vc4->bo_handles, out); + + out = cl_start(&vc4->bo_pointers); + cl_ptr(&out, vc4_bo_reference(bo)); + cl_end(&vc4->bo_pointers, out); return hindex; } diff --git a/src/gallium/drivers/vc4/vc4_cl.h b/src/gallium/drivers/vc4/vc4_cl.h index 4a50e790942..bf4be0efc29 100644 --- a/src/gallium/drivers/vc4/vc4_cl.h +++ b/src/gallium/drivers/vc4/vc4_cl.h @@ -33,12 +33,20 @@ struct vc4_bo; +/** + * Undefined structure, used for typechecking that you're passing the pointers + * to these functions correctly. 
+ */ +struct vc4_cl_out; + struct vc4_cl { void *base; - void *next; + struct vc4_cl_out *next; + struct vc4_cl_out *reloc_next; uint32_t size; - uint32_t reloc_next; +#ifdef DEBUG uint32_t reloc_count; +#endif }; void vc4_init_cl(struct vc4_context *vc4, struct vc4_cl *cl); @@ -49,135 +57,149 @@ uint32_t vc4_gem_hindex(struct vc4_context *vc4, struct vc4_bo *bo); struct PACKED unaligned_16 { uint16_t x; }; struct PACKED unaligned_32 { uint32_t x; }; -static inline void -put_unaligned_32(void *ptr, uint32_t val) +static inline uint32_t cl_offset(struct vc4_cl *cl) { - struct unaligned_32 *p = ptr; - p->x = val; + return (char *)cl->next - (char *)cl->base; } static inline void -put_unaligned_16(void *ptr, uint16_t val) +cl_advance(struct vc4_cl_out **cl, uint32_t n) { - struct unaligned_16 *p = ptr; - p->x = val; + (*cl) = (struct vc4_cl_out *)((char *)(*cl) + n); } -static inline void -cl_u8(struct vc4_cl *cl, uint8_t n) +static inline struct vc4_cl_out * +cl_start(struct vc4_cl *cl) { - assert((cl->next - cl->base) + 1 <= cl->size); - - *(uint8_t *)cl->next = n; - cl->next++; + return cl->next; } static inline void -cl_u16(struct vc4_cl *cl, uint16_t n) +cl_end(struct vc4_cl *cl, struct vc4_cl_out *next) { - assert((cl->next - cl->base) + 2 <= cl->size); + cl->next = next; + assert(cl_offset(cl) <= cl->size); +} - put_unaligned_16(cl->next, n); - cl->next += 2; + +static inline void +put_unaligned_32(struct vc4_cl_out *ptr, uint32_t val) +{ + struct unaligned_32 *p = (void *)ptr; + p->x = val; } static inline void -cl_u32(struct vc4_cl *cl, uint32_t n) +put_unaligned_16(struct vc4_cl_out *ptr, uint16_t val) { - assert((cl->next - cl->base) + 4 <= cl->size); + struct unaligned_16 *p = (void *)ptr; + p->x = val; +} - put_unaligned_32(cl->next, n); - cl->next += 4; +static inline void +cl_u8(struct vc4_cl_out **cl, uint8_t n) +{ + *(uint8_t *)(*cl) = n; + cl_advance(cl, 1); } static inline void -cl_aligned_u32(struct vc4_cl *cl, uint32_t n) +cl_u16(struct vc4_cl_out **cl, uint16_t n) { - assert((cl->next - cl->base) + 4 <= cl->size); + put_unaligned_16(*cl, n); + cl_advance(cl, 2); +} - *(uint32_t *)cl->next = n; - cl->next += 4; +static inline void +cl_u32(struct vc4_cl_out **cl, uint32_t n) +{ + put_unaligned_32(*cl, n); + cl_advance(cl, 4); } static inline void -cl_ptr(struct vc4_cl *cl, void *ptr) +cl_aligned_u32(struct vc4_cl_out **cl, uint32_t n) { - assert((cl->next - cl->base) + sizeof(void *) <= cl->size); + *(uint32_t *)(*cl) = n; + cl_advance(cl, 4); +} - *(void **)cl->next = ptr; - cl->next += sizeof(void *); +static inline void +cl_ptr(struct vc4_cl_out **cl, void *ptr) +{ + *(struct vc4_cl_out **)(*cl) = ptr; + cl_advance(cl, sizeof(void *)); } static inline void -cl_f(struct vc4_cl *cl, float f) +cl_f(struct vc4_cl_out **cl, float f) { cl_u32(cl, fui(f)); } static inline void -cl_aligned_f(struct vc4_cl *cl, float f) +cl_aligned_f(struct vc4_cl_out **cl, float f) { cl_aligned_u32(cl, fui(f)); } static inline void -cl_start_reloc(struct vc4_cl *cl, uint32_t n) +cl_start_reloc(struct vc4_cl *cl, struct vc4_cl_out **out, uint32_t n) { assert(n == 1 || n == 2); +#ifdef DEBUG assert(cl->reloc_count == 0); cl->reloc_count = n; +#endif - cl_u8(cl, VC4_PACKET_GEM_HANDLES); - cl->reloc_next = cl->next - cl->base; - cl_u32(cl, 0); /* Space where hindex will be written. */ - cl_u32(cl, 0); /* Space where hindex will be written. */ + cl_u8(out, VC4_PACKET_GEM_HANDLES); + cl->reloc_next = *out; + cl_u32(out, 0); /* Space where hindex will be written. 
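/* Usage sketch of the reworked emit path (mirrors vc4_emit_state later in
 * this patch; assumes cl_ensure_space() was already called for the bytes
 * emitted): callers copy the write pointer into a local cursor with
 * cl_start(), emit through it, and commit with cl_end(), which also
 * bounds-checks.  Keeping the cursor in a local helps the compiler hold it
 * in a register across emits, and because struct vc4_cl_out is never
 * defined, passing the wrong pointer type to these helpers is a compile
 * error rather than silent corruption.
 */
static void emit_point_size(struct vc4_context *vc4, float size)
{
        struct vc4_cl_out *bcl = cl_start(&vc4->bcl);   /* grab cursor */

        cl_u8(&bcl, VC4_PACKET_POINT_SIZE);
        cl_f(&bcl, size);
        cl_end(&vc4->bcl, bcl);       /* write cursor back, assert bounds */
}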
*/ + cl_u32(out, 0); /* Space where hindex will be written. */ } -static inline void +static inline struct vc4_cl_out * cl_start_shader_reloc(struct vc4_cl *cl, uint32_t n) { +#ifdef DEBUG assert(cl->reloc_count == 0); cl->reloc_count = n; - cl->reloc_next = cl->next - cl->base; +#endif + cl->reloc_next = cl->next; + + /* Reserve the space where hindex will be written. */ + cl_advance(&cl->next, n * 4); - /* Space where hindex will be written. */ - cl->next += n * 4; + return cl->next; } static inline void -cl_reloc_hindex(struct vc4_cl *cl, uint32_t hindex, uint32_t offset) +cl_reloc(struct vc4_context *vc4, struct vc4_cl *cl, struct vc4_cl_out **cl_out, + struct vc4_bo *bo, uint32_t offset) { - *(uint32_t *)(cl->base + cl->reloc_next) = hindex; - cl->reloc_next += 4; + *(uint32_t *)cl->reloc_next = vc4_gem_hindex(vc4, bo); + cl_advance(&cl->reloc_next, 4); +#ifdef DEBUG cl->reloc_count--; +#endif - cl_u32(cl, offset); + cl_u32(cl_out, offset); } static inline void -cl_aligned_reloc_hindex(struct vc4_cl *cl, uint32_t hindex, uint32_t offset) +cl_aligned_reloc(struct vc4_context *vc4, struct vc4_cl *cl, + struct vc4_cl_out **cl_out, + struct vc4_bo *bo, uint32_t offset) { - *(uint32_t *)(cl->base + cl->reloc_next) = hindex; - cl->reloc_next += 4; + *(uint32_t *)cl->reloc_next = vc4_gem_hindex(vc4, bo); + cl_advance(&cl->reloc_next, 4); +#ifdef DEBUG cl->reloc_count--; +#endif - cl_aligned_u32(cl, offset); -} - -static inline void -cl_reloc(struct vc4_context *vc4, struct vc4_cl *cl, - struct vc4_bo *bo, uint32_t offset) -{ - cl_reloc_hindex(cl, vc4_gem_hindex(vc4, bo), offset); -} - -static inline void -cl_aligned_reloc(struct vc4_context *vc4, struct vc4_cl *cl, - struct vc4_bo *bo, uint32_t offset) -{ - cl_aligned_reloc_hindex(cl, vc4_gem_hindex(vc4, bo), offset); + cl_aligned_u32(cl_out, offset); } void cl_ensure_space(struct vc4_cl *cl, uint32_t size); diff --git a/src/gallium/drivers/vc4/vc4_cl_dump.c b/src/gallium/drivers/vc4/vc4_cl_dump.c index 69055081daa..6d748010baf 100644 --- a/src/gallium/drivers/vc4/vc4_cl_dump.c +++ b/src/gallium/drivers/vc4/vc4_cl_dump.c @@ -34,7 +34,7 @@ dump_float(void *cl, uint32_t offset, uint32_t hw_offset) void *f = cl + offset; fprintf(stderr, "0x%08x 0x%08x: %f (0x%08x)\n", - offset, hw_offset, *(float *)f, *(uint32_t *)f); + offset, hw_offset, uif(*(uint32_t *)f), *(uint32_t *)f); } static void @@ -47,7 +47,33 @@ dump_VC4_PACKET_BRANCH_TO_SUB_LIST(void *cl, uint32_t offset, uint32_t hw_offset } static void -dump_VC4_PACKET_STORE_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw_offset) +dump_loadstore_full(void *cl, uint32_t offset, uint32_t hw_offset) +{ + uint32_t bits = *(uint32_t *)(cl + offset); + + fprintf(stderr, "0x%08x 0x%08x: addr 0x%08x%s%s%s%s\n", + offset, hw_offset, + bits & ~0xf, + (bits & VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL) ? "" : " clear", + (bits & VC4_LOADSTORE_FULL_RES_DISABLE_ZS) ? "" : " zs", + (bits & VC4_LOADSTORE_FULL_RES_DISABLE_COLOR) ? "" : " color", + (bits & VC4_LOADSTORE_FULL_RES_EOF) ? 
" eof" : ""); +} + +static void +dump_VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER(void *cl, uint32_t offset, uint32_t hw_offset) +{ + dump_loadstore_full(cl, offset, hw_offset); +} + +static void +dump_VC4_PACKET_STORE_FULL_RES_TILE_BUFFER(void *cl, uint32_t offset, uint32_t hw_offset) +{ + dump_loadstore_full(cl, offset, hw_offset); +} + +static void +dump_loadstore_general(void *cl, uint32_t offset, uint32_t hw_offset) { uint8_t *bytes = cl + offset; uint32_t *addr = cl + offset + 2; @@ -125,6 +151,18 @@ dump_VC4_PACKET_STORE_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw } static void +dump_VC4_PACKET_STORE_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw_offset) +{ + dump_loadstore_general(cl, offset, hw_offset); +} + +static void +dump_VC4_PACKET_LOAD_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw_offset) +{ + dump_loadstore_general(cl, offset, hw_offset); +} + +static void dump_VC4_PACKET_FLAT_SHADE_FLAGS(void *cl, uint32_t offset, uint32_t hw_offset) { uint32_t *bits = cl + offset; @@ -291,63 +329,63 @@ dump_VC4_PACKET_GEM_HANDLES(void *cl, uint32_t offset, uint32_t hw_offset) offset, hw_offset, handles[0], handles[1]); } -#define PACKET_DUMP(name, size) [name] = { #name, size, dump_##name } -#define PACKET(name, size) [name] = { #name, size, NULL } +#define PACKET_DUMP(name) [name] = { #name, name ## _SIZE, dump_##name } +#define PACKET(name) [name] = { #name, name ## _SIZE, NULL } static const struct packet_info { const char *name; uint8_t size; void (*dump_func)(void *cl, uint32_t offset, uint32_t hw_offset); } packet_info[] = { - PACKET(VC4_PACKET_HALT, 1), - PACKET(VC4_PACKET_NOP, 1), - - PACKET(VC4_PACKET_FLUSH, 1), - PACKET(VC4_PACKET_FLUSH_ALL, 1), - PACKET(VC4_PACKET_START_TILE_BINNING, 1), - PACKET(VC4_PACKET_INCREMENT_SEMAPHORE, 1), - PACKET(VC4_PACKET_WAIT_ON_SEMAPHORE, 1), - - PACKET(VC4_PACKET_BRANCH, 5), - PACKET_DUMP(VC4_PACKET_BRANCH_TO_SUB_LIST, 5), - - PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER, 1), - PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF, 1), - PACKET(VC4_PACKET_STORE_FULL_RES_TILE_BUFFER, 5), - PACKET(VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER, 5), - PACKET_DUMP(VC4_PACKET_STORE_TILE_BUFFER_GENERAL, 7), - PACKET(VC4_PACKET_LOAD_TILE_BUFFER_GENERAL, 7), - - PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE, 14), - PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE, 10), - - PACKET(VC4_PACKET_COMPRESSED_PRIMITIVE, 48), - PACKET(VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE, 49), - - PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT, 2), - - PACKET(VC4_PACKET_GL_SHADER_STATE, 5), - PACKET(VC4_PACKET_NV_SHADER_STATE, 5), - PACKET(VC4_PACKET_VG_SHADER_STATE, 5), - - PACKET(VC4_PACKET_CONFIGURATION_BITS, 4), - PACKET_DUMP(VC4_PACKET_FLAT_SHADE_FLAGS, 5), - PACKET_DUMP(VC4_PACKET_POINT_SIZE, 5), - PACKET_DUMP(VC4_PACKET_LINE_WIDTH, 5), - PACKET(VC4_PACKET_RHT_X_BOUNDARY, 3), - PACKET(VC4_PACKET_DEPTH_OFFSET, 5), - PACKET(VC4_PACKET_CLIP_WINDOW, 9), - PACKET_DUMP(VC4_PACKET_VIEWPORT_OFFSET, 5), - PACKET(VC4_PACKET_Z_CLIPPING, 9), - PACKET_DUMP(VC4_PACKET_CLIPPER_XY_SCALING, 9), - PACKET_DUMP(VC4_PACKET_CLIPPER_Z_SCALING, 9), - - PACKET_DUMP(VC4_PACKET_TILE_BINNING_MODE_CONFIG, 16), - PACKET_DUMP(VC4_PACKET_TILE_RENDERING_MODE_CONFIG, 11), - PACKET(VC4_PACKET_CLEAR_COLORS, 14), - PACKET_DUMP(VC4_PACKET_TILE_COORDINATES, 3), - - PACKET_DUMP(VC4_PACKET_GEM_HANDLES, 9), + PACKET(VC4_PACKET_HALT), + PACKET(VC4_PACKET_NOP), + + PACKET(VC4_PACKET_FLUSH), + PACKET(VC4_PACKET_FLUSH_ALL), + PACKET(VC4_PACKET_START_TILE_BINNING), + PACKET(VC4_PACKET_INCREMENT_SEMAPHORE), + 
PACKET(VC4_PACKET_WAIT_ON_SEMAPHORE), + + PACKET(VC4_PACKET_BRANCH), + PACKET_DUMP(VC4_PACKET_BRANCH_TO_SUB_LIST), + + PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER), + PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF), + PACKET_DUMP(VC4_PACKET_STORE_FULL_RES_TILE_BUFFER), + PACKET_DUMP(VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER), + PACKET_DUMP(VC4_PACKET_STORE_TILE_BUFFER_GENERAL), + PACKET_DUMP(VC4_PACKET_LOAD_TILE_BUFFER_GENERAL), + + PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE), + PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE), + + PACKET(VC4_PACKET_COMPRESSED_PRIMITIVE), + PACKET(VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE), + + PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT), + + PACKET(VC4_PACKET_GL_SHADER_STATE), + PACKET(VC4_PACKET_NV_SHADER_STATE), + PACKET(VC4_PACKET_VG_SHADER_STATE), + + PACKET(VC4_PACKET_CONFIGURATION_BITS), + PACKET_DUMP(VC4_PACKET_FLAT_SHADE_FLAGS), + PACKET_DUMP(VC4_PACKET_POINT_SIZE), + PACKET_DUMP(VC4_PACKET_LINE_WIDTH), + PACKET(VC4_PACKET_RHT_X_BOUNDARY), + PACKET(VC4_PACKET_DEPTH_OFFSET), + PACKET(VC4_PACKET_CLIP_WINDOW), + PACKET_DUMP(VC4_PACKET_VIEWPORT_OFFSET), + PACKET(VC4_PACKET_Z_CLIPPING), + PACKET_DUMP(VC4_PACKET_CLIPPER_XY_SCALING), + PACKET_DUMP(VC4_PACKET_CLIPPER_Z_SCALING), + + PACKET_DUMP(VC4_PACKET_TILE_BINNING_MODE_CONFIG), + PACKET_DUMP(VC4_PACKET_TILE_RENDERING_MODE_CONFIG), + PACKET(VC4_PACKET_CLEAR_COLORS), + PACKET_DUMP(VC4_PACKET_TILE_COORDINATES), + + PACKET_DUMP(VC4_PACKET_GEM_HANDLES), }; void @@ -359,7 +397,7 @@ vc4_dump_cl(void *cl, uint32_t size, bool is_render) while (offset < size) { uint8_t header = cmds[offset]; - if (header > ARRAY_SIZE(packet_info) || + if (header >= ARRAY_SIZE(packet_info) || !packet_info[header].name) { fprintf(stderr, "0x%08x 0x%08x: Unknown packet 0x%02x (%d)!\n", offset, hw_offset, header, header); diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c index 630f8e68896..fff63158c9d 100644 --- a/src/gallium/drivers/vc4/vc4_context.c +++ b/src/gallium/drivers/vc4/vc4_context.c @@ -61,9 +61,11 @@ vc4_flush(struct pipe_context *pctx) * FLUSH completes. */ cl_ensure_space(&vc4->bcl, 8); - cl_u8(&vc4->bcl, VC4_PACKET_INCREMENT_SEMAPHORE); + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); + cl_u8(&bcl, VC4_PACKET_INCREMENT_SEMAPHORE); /* The FLUSH caps all of our bin lists with a VC4_PACKET_RETURN. */ - cl_u8(&vc4->bcl, VC4_PACKET_FLUSH); + cl_u8(&bcl, VC4_PACKET_FLUSH); + cl_end(&vc4->bcl, bcl); if (cbuf && (vc4->resolve & PIPE_CLEAR_COLOR0)) { pipe_surface_reference(&vc4->color_write, cbuf); @@ -103,8 +105,10 @@ vc4_pipe_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence, vc4_flush(pctx); if (fence) { + struct pipe_screen *screen = pctx->screen; struct vc4_fence *f = vc4_fence_create(vc4->screen, vc4->last_emit_seqno); + screen->fence_reference(screen, fence, NULL); *fence = (struct pipe_fence_handle *)f; } } @@ -126,8 +130,7 @@ vc4_cl_references_bo(struct pipe_context *pctx, struct vc4_bo *bo) * they match. 
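/* What the PACKET()/PACKET_DUMP() rework above buys (an illustrative,
 * self-contained miniature): token pasting derives each table entry's size
 * from the packet's _SIZE define, so the dump table can no longer drift out
 * of sync with vc4_packet.h.
 */
#include <stdio.h>

#define THING_A       3
#define THING_A_SIZE  5
#define ENTRY(name)   [name] = { #name, name ## _SIZE }

static const struct { const char *name; int size; } table[] = {
        ENTRY(THING_A),         /* expands to [3] = { "THING_A", 5 } */
};

int main(void)
{
        printf("%s: %d bytes\n", table[THING_A].name, table[THING_A].size);
        return 0;
}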
*/ struct vc4_bo **referenced_bos = vc4->bo_pointers.base; - for (int i = 0; i < (vc4->bo_handles.next - - vc4->bo_handles.base) / 4; i++) { + for (int i = 0; i < cl_offset(&vc4->bo_handles) / 4; i++) { if (referenced_bos[i] == bo) { return true; } diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h index d5d6be16f6e..654c46f3c0d 100644 --- a/src/gallium/drivers/vc4/vc4_context.h +++ b/src/gallium/drivers/vc4/vc4_context.h @@ -67,7 +67,20 @@ #define VC4_DIRTY_CLIP (1 << 20) #define VC4_DIRTY_UNCOMPILED_VS (1 << 21) #define VC4_DIRTY_UNCOMPILED_FS (1 << 22) -#define VC4_DIRTY_COMPILED_FS (1 << 24) +#define VC4_DIRTY_COMPILED_CS (1 << 23) +#define VC4_DIRTY_COMPILED_VS (1 << 24) +#define VC4_DIRTY_COMPILED_FS (1 << 25) + +struct vc4_sampler_view { + struct pipe_sampler_view base; + uint32_t texture_p0; + uint32_t texture_p1; +}; + +struct vc4_sampler_state { + struct pipe_sampler_state base; + uint32_t texture_p1; +}; struct vc4_texture_stateobj { struct pipe_sampler_view *textures[PIPE_MAX_SAMPLERS]; @@ -121,6 +134,12 @@ struct vc4_compiled_shader { struct vc4_ubo_range *ubo_ranges; uint32_t num_ubo_ranges; uint32_t ubo_size; + /** + * VC4_DIRTY_* flags that, when set in vc4->dirty, mean that the + * uniforms have to be rewritten (and therefore the shader state + * reemitted). + */ + uint32_t uniform_dirty_bits; /** bitmask of which inputs are color inputs, for flat shade handling. */ uint32_t color_inputs; @@ -238,6 +257,11 @@ struct vc4_context { */ bool draw_call_queued; + /** Maximum index buffer valid for the current shader_rec. */ + uint32_t max_index; + /** Last index bias baked into the current shader_rec. */ + uint32_t last_index_bias; + struct primconvert_context *primconvert; struct hash_table *fs_cache, *vs_cache; @@ -246,6 +270,7 @@ struct vc4_context { struct ra_regs *regs; unsigned int reg_class_any; + unsigned int reg_class_r4_or_a; unsigned int reg_class_a; uint8_t prim_mode; @@ -326,6 +351,18 @@ vc4_context(struct pipe_context *pcontext) return (struct vc4_context *)pcontext; } +static inline struct vc4_sampler_view * +vc4_sampler_view(struct pipe_sampler_view *psview) +{ + return (struct vc4_sampler_view *)psview; +} + +static inline struct vc4_sampler_state * +vc4_sampler_state(struct pipe_sampler_state *psampler) +{ + return (struct vc4_sampler_state *)psampler; +} + struct pipe_context *vc4_context_create(struct pipe_screen *pscreen, void *priv); void vc4_draw_init(struct pipe_context *pctx); @@ -337,6 +374,7 @@ void vc4_simulator_init(struct vc4_screen *screen); int vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args); +void vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader); void vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, struct vc4_constbuf_stateobj *cb, diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c index 5e6d70d6f33..a4e5e092b1a 100644 --- a/src/gallium/drivers/vc4/vc4_draw.c +++ b/src/gallium/drivers/vc4/vc4_draw.c @@ -71,37 +71,40 @@ vc4_start_draw(struct vc4_context *vc4) uint32_t height = vc4->framebuffer.height; uint32_t tilew = align(width, 64) / 64; uint32_t tileh = align(height, 64) / 64; + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); // Tile state data is 48 bytes per tile, I think it can be thrown away // as soon as binning is finished. 
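/* How the new uniform_dirty_bits and last_index_bias fields are consumed at
 * draw time (condensed from vc4_draw_vbo later in this patch): the GL shader
 * record is re-emitted only when state it actually bakes in has changed.
 * index_bias is folded into each attribute's base address (stride *
 * index_bias), so it is tracked on the side rather than as a dirty flag.
 */
static bool need_new_shader_rec(struct vc4_context *vc4,
                                const struct pipe_draw_info *info)
{
        uint32_t deps = VC4_DIRTY_VTXBUF | VC4_DIRTY_VTXSTATE |
                        VC4_DIRTY_PRIM_MODE | VC4_DIRTY_RASTERIZER |
                        VC4_DIRTY_COMPILED_CS | VC4_DIRTY_COMPILED_VS |
                        VC4_DIRTY_COMPILED_FS |
                        vc4->prog.cs->uniform_dirty_bits |
                        vc4->prog.vs->uniform_dirty_bits |
                        vc4->prog.fs->uniform_dirty_bits;

        return (vc4->dirty & deps) ||
               vc4->last_index_bias != info->index_bias;
}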
- cl_u8(&vc4->bcl, VC4_PACKET_TILE_BINNING_MODE_CONFIG); - cl_u32(&vc4->bcl, 0); /* tile alloc addr, filled by kernel */ - cl_u32(&vc4->bcl, 0); /* tile alloc size, filled by kernel */ - cl_u32(&vc4->bcl, 0); /* tile state addr, filled by kernel */ - cl_u8(&vc4->bcl, tilew); - cl_u8(&vc4->bcl, tileh); - cl_u8(&vc4->bcl, 0); /* flags, filled by kernel. */ + cl_u8(&bcl, VC4_PACKET_TILE_BINNING_MODE_CONFIG); + cl_u32(&bcl, 0); /* tile alloc addr, filled by kernel */ + cl_u32(&bcl, 0); /* tile alloc size, filled by kernel */ + cl_u32(&bcl, 0); /* tile state addr, filled by kernel */ + cl_u8(&bcl, tilew); + cl_u8(&bcl, tileh); + cl_u8(&bcl, 0); /* flags, filled by kernel. */ /* START_TILE_BINNING resets the statechange counters in the hardware, * which are what is used when a primitive is binned to a tile to * figure out what new state packets need to be written to that tile's * command list. */ - cl_u8(&vc4->bcl, VC4_PACKET_START_TILE_BINNING); + cl_u8(&bcl, VC4_PACKET_START_TILE_BINNING); /* Reset the current compressed primitives format. This gets modified * by VC4_PACKET_GL_INDEXED_PRIMITIVE and * VC4_PACKET_GL_ARRAY_PRIMITIVE, so it needs to be reset at the start * of every tile. */ - cl_u8(&vc4->bcl, VC4_PACKET_PRIMITIVE_LIST_FORMAT); - cl_u8(&vc4->bcl, (VC4_PRIMITIVE_LIST_FORMAT_16_INDEX | - VC4_PRIMITIVE_LIST_FORMAT_TYPE_TRIANGLES)); + cl_u8(&bcl, VC4_PACKET_PRIMITIVE_LIST_FORMAT); + cl_u8(&bcl, (VC4_PRIMITIVE_LIST_FORMAT_16_INDEX | + VC4_PRIMITIVE_LIST_FORMAT_TYPE_TRIANGLES)); vc4->needs_flush = true; vc4->draw_call_queued = true; vc4->draw_width = width; vc4->draw_height = height; + + cl_end(&vc4->bcl, bcl); } static void @@ -119,96 +122,67 @@ vc4_update_shadow_textures(struct pipe_context *pctx, } static void -vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) +vc4_emit_gl_shader_state(struct vc4_context *vc4, const struct pipe_draw_info *info) { - struct vc4_context *vc4 = vc4_context(pctx); - - if (info->mode >= PIPE_PRIM_QUADS) { - util_primconvert_save_index_buffer(vc4->primconvert, &vc4->indexbuf); - util_primconvert_save_rasterizer_state(vc4->primconvert, &vc4->rasterizer->base); - util_primconvert_draw_vbo(vc4->primconvert, info); - perf_debug("Fallback conversion for %d %s vertices\n", - info->count, u_prim_name(info->mode)); - return; - } - - /* Before setting up the draw, do any fixup blits necessary. */ - vc4_update_shadow_textures(pctx, &vc4->verttex); - vc4_update_shadow_textures(pctx, &vc4->fragtex); - - vc4_get_draw_cl_space(vc4); - + /* VC4_DIRTY_VTXSTATE */ struct vc4_vertex_stateobj *vtx = vc4->vtx; + /* VC4_DIRTY_VTXBUF */ struct vc4_vertexbuf_stateobj *vertexbuf = &vc4->vertexbuf; - if (vc4->prim_mode != info->mode) { - vc4->prim_mode = info->mode; - vc4->dirty |= VC4_DIRTY_PRIM_MODE; - } - - vc4_start_draw(vc4); - vc4_update_compiled_shaders(vc4, info->mode); - - vc4_emit_state(pctx); - vc4->dirty = 0; - - vc4_write_uniforms(vc4, vc4->prog.fs, - &vc4->constbuf[PIPE_SHADER_FRAGMENT], - &vc4->fragtex); - vc4_write_uniforms(vc4, vc4->prog.vs, - &vc4->constbuf[PIPE_SHADER_VERTEX], - &vc4->verttex); - vc4_write_uniforms(vc4, vc4->prog.cs, - &vc4->constbuf[PIPE_SHADER_VERTEX], - &vc4->verttex); - /* The simulator throws a fit if VS or CS don't read an attribute, so * we emit a dummy read. */ uint32_t num_elements_emit = MAX2(vtx->num_elements, 1); /* Emit the shader record. 
*/ - cl_start_shader_reloc(&vc4->shader_rec, 3 + num_elements_emit); - cl_u16(&vc4->shader_rec, + struct vc4_cl_out *shader_rec = + cl_start_shader_reloc(&vc4->shader_rec, 3 + num_elements_emit); + /* VC4_DIRTY_PRIM_MODE | VC4_DIRTY_RASTERIZER */ + cl_u16(&shader_rec, VC4_SHADER_FLAG_ENABLE_CLIPPING | + VC4_SHADER_FLAG_FS_SINGLE_THREAD | ((info->mode == PIPE_PRIM_POINTS && vc4->rasterizer->base.point_size_per_vertex) ? VC4_SHADER_FLAG_VS_POINT_SIZE : 0)); - cl_u8(&vc4->shader_rec, 0); /* fs num uniforms (unused) */ - cl_u8(&vc4->shader_rec, vc4->prog.fs->num_inputs); - cl_reloc(vc4, &vc4->shader_rec, vc4->prog.fs->bo, 0); - cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */ - - cl_u16(&vc4->shader_rec, 0); /* vs num uniforms */ - cl_u8(&vc4->shader_rec, vc4->prog.vs->vattrs_live); - cl_u8(&vc4->shader_rec, vc4->prog.vs->vattr_offsets[8]); - cl_reloc(vc4, &vc4->shader_rec, vc4->prog.vs->bo, 0); - cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */ - - cl_u16(&vc4->shader_rec, 0); /* cs num uniforms */ - cl_u8(&vc4->shader_rec, vc4->prog.cs->vattrs_live); - cl_u8(&vc4->shader_rec, vc4->prog.cs->vattr_offsets[8]); - cl_reloc(vc4, &vc4->shader_rec, vc4->prog.cs->bo, 0); - cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */ + + /* VC4_DIRTY_COMPILED_FS */ + cl_u8(&shader_rec, 0); /* fs num uniforms (unused) */ + cl_u8(&shader_rec, vc4->prog.fs->num_inputs); + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.fs->bo, 0); + cl_u32(&shader_rec, 0); /* UBO offset written by kernel */ + + /* VC4_DIRTY_COMPILED_VS */ + cl_u16(&shader_rec, 0); /* vs num uniforms */ + cl_u8(&shader_rec, vc4->prog.vs->vattrs_live); + cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[8]); + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.vs->bo, 0); + cl_u32(&shader_rec, 0); /* UBO offset written by kernel */ + + /* VC4_DIRTY_COMPILED_CS */ + cl_u16(&shader_rec, 0); /* cs num uniforms */ + cl_u8(&shader_rec, vc4->prog.cs->vattrs_live); + cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[8]); + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.cs->bo, 0); + cl_u32(&shader_rec, 0); /* UBO offset written by kernel */ uint32_t max_index = 0xffff; - uint32_t vpm_offset = 0; for (int i = 0; i < vtx->num_elements; i++) { struct pipe_vertex_element *elem = &vtx->pipe[i]; struct pipe_vertex_buffer *vb = &vertexbuf->vb[elem->vertex_buffer_index]; struct vc4_resource *rsc = vc4_resource(vb->buffer); - uint32_t offset = vb->buffer_offset + elem->src_offset; + /* not vc4->dirty tracked: vc4->last_index_bias */ + uint32_t offset = (vb->buffer_offset + + elem->src_offset + + vb->stride * info->index_bias); uint32_t vb_size = rsc->bo->size - offset; uint32_t elem_size = util_format_get_blocksize(elem->src_format); - cl_reloc(vc4, &vc4->shader_rec, rsc->bo, offset); - cl_u8(&vc4->shader_rec, elem_size - 1); - cl_u8(&vc4->shader_rec, vb->stride); - cl_u8(&vc4->shader_rec, vc4->prog.vs->vattr_offsets[i]); - cl_u8(&vc4->shader_rec, vc4->prog.cs->vattr_offsets[i]); - - vpm_offset += align(elem_size, 4); + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, rsc->bo, offset); + cl_u8(&shader_rec, elem_size - 1); + cl_u8(&shader_rec, vb->stride); + cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[i]); + cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[i]); if (vb->stride > 0) { max_index = MIN2(max_index, @@ -219,25 +193,89 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) if (vtx->num_elements == 0) { assert(num_elements_emit == 1); struct vc4_bo *bo = vc4_bo_alloc(vc4->screen, 
4096, "scratch VBO"); - cl_reloc(vc4, &vc4->shader_rec, bo, 0); - cl_u8(&vc4->shader_rec, 16 - 1); /* element size */ - cl_u8(&vc4->shader_rec, 0); /* stride */ - cl_u8(&vc4->shader_rec, 0); /* VS VPM offset */ - cl_u8(&vc4->shader_rec, 0); /* CS VPM offset */ + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, bo, 0); + cl_u8(&shader_rec, 16 - 1); /* element size */ + cl_u8(&shader_rec, 0); /* stride */ + cl_u8(&shader_rec, 0); /* VS VPM offset */ + cl_u8(&shader_rec, 0); /* CS VPM offset */ vc4_bo_unreference(&bo); } + cl_end(&vc4->shader_rec, shader_rec); + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); /* the actual draw call. */ - cl_u8(&vc4->bcl, VC4_PACKET_GL_SHADER_STATE); + cl_u8(&bcl, VC4_PACKET_GL_SHADER_STATE); assert(vtx->num_elements <= 8); /* Note that number of attributes == 0 in the packet means 8 * attributes. This field also contains the offset into shader_rec. */ - cl_u32(&vc4->bcl, num_elements_emit & 0x7); + cl_u32(&bcl, num_elements_emit & 0x7); + cl_end(&vc4->bcl, bcl); + + vc4_write_uniforms(vc4, vc4->prog.fs, + &vc4->constbuf[PIPE_SHADER_FRAGMENT], + &vc4->fragtex); + vc4_write_uniforms(vc4, vc4->prog.vs, + &vc4->constbuf[PIPE_SHADER_VERTEX], + &vc4->verttex); + vc4_write_uniforms(vc4, vc4->prog.cs, + &vc4->constbuf[PIPE_SHADER_VERTEX], + &vc4->verttex); + + vc4->last_index_bias = info->index_bias; + vc4->max_index = max_index; +} + +static void +vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) +{ + struct vc4_context *vc4 = vc4_context(pctx); + + if (info->mode >= PIPE_PRIM_QUADS) { + util_primconvert_save_index_buffer(vc4->primconvert, &vc4->indexbuf); + util_primconvert_save_rasterizer_state(vc4->primconvert, &vc4->rasterizer->base); + util_primconvert_draw_vbo(vc4->primconvert, info); + perf_debug("Fallback conversion for %d %s vertices\n", + info->count, u_prim_name(info->mode)); + return; + } + + /* Before setting up the draw, do any fixup blits necessary. */ + vc4_update_shadow_textures(pctx, &vc4->verttex); + vc4_update_shadow_textures(pctx, &vc4->fragtex); + + vc4_get_draw_cl_space(vc4); + + if (vc4->prim_mode != info->mode) { + vc4->prim_mode = info->mode; + vc4->dirty |= VC4_DIRTY_PRIM_MODE; + } + + vc4_start_draw(vc4); + vc4_update_compiled_shaders(vc4, info->mode); + + vc4_emit_state(pctx); + + if ((vc4->dirty & (VC4_DIRTY_VTXBUF | + VC4_DIRTY_VTXSTATE | + VC4_DIRTY_PRIM_MODE | + VC4_DIRTY_RASTERIZER | + VC4_DIRTY_COMPILED_CS | + VC4_DIRTY_COMPILED_VS | + VC4_DIRTY_COMPILED_FS | + vc4->prog.cs->uniform_dirty_bits | + vc4->prog.vs->uniform_dirty_bits | + vc4->prog.fs->uniform_dirty_bits)) || + vc4->last_index_bias != info->index_bias) { + vc4_emit_gl_shader_state(vc4, info); + } + + vc4->dirty = 0; /* Note that the primitive type fields match with OpenGL/gallium * definitions, up to but not including QUADS. */ + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); if (info->indexed) { uint32_t offset = vc4->indexbuf.offset; uint32_t index_size = vc4->indexbuf.index_size; @@ -251,25 +289,26 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) } struct vc4_resource *rsc = vc4_resource(prsc); - cl_start_reloc(&vc4->bcl, 1); - cl_u8(&vc4->bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE); - cl_u8(&vc4->bcl, + cl_start_reloc(&vc4->bcl, &bcl, 1); + cl_u8(&bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE); + cl_u8(&bcl, info->mode | (index_size == 2 ? 
VC4_INDEX_BUFFER_U16: VC4_INDEX_BUFFER_U8)); - cl_u32(&vc4->bcl, info->count); - cl_reloc(vc4, &vc4->bcl, rsc->bo, offset); - cl_u32(&vc4->bcl, max_index); + cl_u32(&bcl, info->count); + cl_reloc(vc4, &vc4->bcl, &bcl, rsc->bo, offset); + cl_u32(&bcl, vc4->max_index); if (vc4->indexbuf.index_size == 4) pipe_resource_reference(&prsc, NULL); } else { - cl_u8(&vc4->bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE); - cl_u8(&vc4->bcl, info->mode); - cl_u32(&vc4->bcl, info->count); - cl_u32(&vc4->bcl, info->start); + cl_u8(&bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE); + cl_u8(&bcl, info->mode); + cl_u32(&bcl, info->count); + cl_u32(&bcl, info->start); } + cl_end(&vc4->bcl, bcl); if (vc4->zsa && vc4->zsa->base.depth.enabled) { vc4->resolve |= PIPE_CLEAR_DEPTH; diff --git a/src/gallium/drivers/vc4/vc4_drm.h b/src/gallium/drivers/vc4/vc4_drm.h index 5f1ee4fa125..863ef8da8fb 100644 --- a/src/gallium/drivers/vc4/vc4_drm.h +++ b/src/gallium/drivers/vc4/vc4_drm.h @@ -31,12 +31,14 @@ #define DRM_VC4_WAIT_BO 0x02 #define DRM_VC4_CREATE_BO 0x03 #define DRM_VC4_MMAP_BO 0x04 +#define DRM_VC4_CREATE_SHADER_BO 0x05 #define DRM_IOCTL_VC4_SUBMIT_CL DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl) #define DRM_IOCTL_VC4_WAIT_SEQNO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno) #define DRM_IOCTL_VC4_WAIT_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_WAIT_BO, struct drm_vc4_wait_bo) #define DRM_IOCTL_VC4_CREATE_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo) #define DRM_IOCTL_VC4_MMAP_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo) +#define DRM_IOCTL_VC4_CREATE_SHADER_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_CREATE_SHADER_BO, struct drm_vc4_create_shader_bo) struct drm_vc4_submit_rcl_surface { uint32_t hindex; /* Handle index, or ~0 if not present. */ @@ -183,6 +185,29 @@ struct drm_vc4_create_bo { }; /** + * struct drm_vc4_create_shader_bo - ioctl argument for creating VC4 + * shader BOs. + * + * Since allowing a shader to be overwritten while it's also being + * executed from would allow privilege escalation, shaders must be + * created using this ioctl, and they can't be mmapped later. + */ +struct drm_vc4_create_shader_bo { + /* Size of the data argument. */ + uint32_t size; + /* Flags, currently must be 0. */ + uint32_t flags; + + /* Pointer to the data. */ + uint64_t data; + + /** Returned GEM handle for the BO. */ + uint32_t handle; + /* Pad, must be 0. */ + uint32_t pad; +}; + +/** * struct drm_vc4_mmap_bo - ioctl argument for mapping VC4 BOs. * * This doesn't actually perform an mmap.
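/* A userspace call sketch for the new ioctl (mirrors vc4_bo_alloc_shader
 * earlier in this patch; error handling trimmed).  data is carried as a
 * uint64_t so the struct has the same layout for 32- and 64-bit callers.
 */
#include <stdint.h>
#include <xf86drm.h>
#include "vc4_drm.h"

static uint32_t create_shader_handle(int fd, const void *code, uint32_t size)
{
        struct drm_vc4_create_shader_bo create = {
                .size = size,
                .data = (uintptr_t)code,
        };

        if (drmIoctl(fd, DRM_IOCTL_VC4_CREATE_SHADER_BO, &create) != 0)
                return 0;               /* 0 is never a valid GEM handle */
        return create.handle;
}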
Instead, it returns the diff --git a/src/gallium/drivers/vc4/vc4_emit.c b/src/gallium/drivers/vc4/vc4_emit.c index d2b54fccf91..ba064ff889b 100644 --- a/src/gallium/drivers/vc4/vc4_emit.c +++ b/src/gallium/drivers/vc4/vc4_emit.c @@ -28,23 +28,24 @@ vc4_emit_state(struct pipe_context *pctx) { struct vc4_context *vc4 = vc4_context(pctx); + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); if (vc4->dirty & (VC4_DIRTY_SCISSOR | VC4_DIRTY_VIEWPORT)) { float *vpscale = vc4->viewport.scale; float *vptranslate = vc4->viewport.translate; - float vp_minx = -fabs(vpscale[0]) + vptranslate[0]; - float vp_maxx = fabs(vpscale[0]) + vptranslate[0]; - float vp_miny = -fabs(vpscale[1]) + vptranslate[1]; - float vp_maxy = fabs(vpscale[1]) + vptranslate[1]; + float vp_minx = -fabsf(vpscale[0]) + vptranslate[0]; + float vp_maxx = fabsf(vpscale[0]) + vptranslate[0]; + float vp_miny = -fabsf(vpscale[1]) + vptranslate[1]; + float vp_maxy = fabsf(vpscale[1]) + vptranslate[1]; uint32_t minx = MAX2(vc4->scissor.minx, vp_minx); uint32_t miny = MAX2(vc4->scissor.miny, vp_miny); uint32_t maxx = MIN2(vc4->scissor.maxx, vp_maxx); uint32_t maxy = MIN2(vc4->scissor.maxy, vp_maxy); - cl_u8(&vc4->bcl, VC4_PACKET_CLIP_WINDOW); - cl_u16(&vc4->bcl, minx); - cl_u16(&vc4->bcl, miny); - cl_u16(&vc4->bcl, maxx - minx); - cl_u16(&vc4->bcl, maxy - miny); + cl_u8(&bcl, VC4_PACKET_CLIP_WINDOW); + cl_u16(&bcl, minx); + cl_u16(&bcl, miny); + cl_u16(&bcl, maxx - minx); + cl_u16(&bcl, maxy - miny); vc4->draw_min_x = MIN2(vc4->draw_min_x, minx); vc4->draw_min_y = MIN2(vc4->draw_min_y, miny); @@ -53,47 +54,49 @@ vc4_emit_state(struct pipe_context *pctx) } if (vc4->dirty & (VC4_DIRTY_RASTERIZER | VC4_DIRTY_ZSA)) { - cl_u8(&vc4->bcl, VC4_PACKET_CONFIGURATION_BITS); - cl_u8(&vc4->bcl, + cl_u8(&bcl, VC4_PACKET_CONFIGURATION_BITS); + cl_u8(&bcl, vc4->rasterizer->config_bits[0] | vc4->zsa->config_bits[0]); - cl_u8(&vc4->bcl, + cl_u8(&bcl, vc4->rasterizer->config_bits[1] | vc4->zsa->config_bits[1]); - cl_u8(&vc4->bcl, + cl_u8(&bcl, vc4->rasterizer->config_bits[2] | vc4->zsa->config_bits[2]); } if (vc4->dirty & VC4_DIRTY_RASTERIZER) { - cl_u8(&vc4->bcl, VC4_PACKET_DEPTH_OFFSET); - cl_u16(&vc4->bcl, vc4->rasterizer->offset_factor); - cl_u16(&vc4->bcl, vc4->rasterizer->offset_units); + cl_u8(&bcl, VC4_PACKET_DEPTH_OFFSET); + cl_u16(&bcl, vc4->rasterizer->offset_factor); + cl_u16(&bcl, vc4->rasterizer->offset_units); - cl_u8(&vc4->bcl, VC4_PACKET_POINT_SIZE); - cl_f(&vc4->bcl, vc4->rasterizer->point_size); + cl_u8(&bcl, VC4_PACKET_POINT_SIZE); + cl_f(&bcl, vc4->rasterizer->point_size); - cl_u8(&vc4->bcl, VC4_PACKET_LINE_WIDTH); - cl_f(&vc4->bcl, vc4->rasterizer->base.line_width); + cl_u8(&bcl, VC4_PACKET_LINE_WIDTH); + cl_f(&bcl, vc4->rasterizer->base.line_width); } if (vc4->dirty & VC4_DIRTY_VIEWPORT) { - cl_u8(&vc4->bcl, VC4_PACKET_CLIPPER_XY_SCALING); - cl_f(&vc4->bcl, vc4->viewport.scale[0] * 16.0f); - cl_f(&vc4->bcl, vc4->viewport.scale[1] * 16.0f); + cl_u8(&bcl, VC4_PACKET_CLIPPER_XY_SCALING); + cl_f(&bcl, vc4->viewport.scale[0] * 16.0f); + cl_f(&bcl, vc4->viewport.scale[1] * 16.0f); - cl_u8(&vc4->bcl, VC4_PACKET_CLIPPER_Z_SCALING); - cl_f(&vc4->bcl, vc4->viewport.translate[2]); - cl_f(&vc4->bcl, vc4->viewport.scale[2]); + cl_u8(&bcl, VC4_PACKET_CLIPPER_Z_SCALING); + cl_f(&bcl, vc4->viewport.translate[2]); + cl_f(&bcl, vc4->viewport.scale[2]); - cl_u8(&vc4->bcl, VC4_PACKET_VIEWPORT_OFFSET); - cl_u16(&vc4->bcl, 16 * vc4->viewport.translate[0]); - cl_u16(&vc4->bcl, 16 * vc4->viewport.translate[1]); + cl_u8(&bcl, VC4_PACKET_VIEWPORT_OFFSET); + 
cl_u16(&bcl, 16 * vc4->viewport.translate[0]); + cl_u16(&bcl, 16 * vc4->viewport.translate[1]); } if (vc4->dirty & VC4_DIRTY_FLAT_SHADE_FLAGS) { - cl_u8(&vc4->bcl, VC4_PACKET_FLAT_SHADE_FLAGS); - cl_u32(&vc4->bcl, vc4->rasterizer->base.flatshade ? + cl_u8(&bcl, VC4_PACKET_FLAT_SHADE_FLAGS); + cl_u32(&bcl, vc4->rasterizer->base.flatshade ? vc4->prog.fs->color_inputs : 0); } + + cl_end(&vc4->bcl, bcl); } diff --git a/src/gallium/drivers/vc4/vc4_fence.c b/src/gallium/drivers/vc4/vc4_fence.c index f2ee91de61a..b6fb2a8a460 100644 --- a/src/gallium/drivers/vc4/vc4_fence.c +++ b/src/gallium/drivers/vc4/vc4_fence.c @@ -60,16 +60,6 @@ vc4_fence_reference(struct pipe_screen *pscreen, } static boolean -vc4_fence_signalled(struct pipe_screen *pscreen, - struct pipe_fence_handle *pf) -{ - struct vc4_screen *screen = vc4_screen(pscreen); - struct vc4_fence *f = (struct vc4_fence *)pf; - - return vc4_wait_seqno(screen, f->seqno, 0); -} - -static boolean vc4_fence_finish(struct pipe_screen *pscreen, struct pipe_fence_handle *pf, uint64_t timeout_ns) @@ -77,7 +67,7 @@ vc4_fence_finish(struct pipe_screen *pscreen, struct vc4_screen *screen = vc4_screen(pscreen); struct vc4_fence *f = (struct vc4_fence *)pf; - return vc4_wait_seqno(screen, f->seqno, timeout_ns); + return vc4_wait_seqno(screen, f->seqno, timeout_ns, "fence wait"); } struct vc4_fence * @@ -98,6 +88,5 @@ void vc4_fence_init(struct vc4_screen *screen) { screen->base.fence_reference = vc4_fence_reference; - screen->base.fence_signalled = vc4_fence_signalled; screen->base.fence_finish = vc4_fence_finish; } diff --git a/src/gallium/drivers/vc4/vc4_formats.c b/src/gallium/drivers/vc4/vc4_formats.c index 004bac70c67..ffce61237de 100644 --- a/src/gallium/drivers/vc4/vc4_formats.c +++ b/src/gallium/drivers/vc4/vc4_formats.c @@ -108,7 +108,7 @@ static const struct vc4_format vc4_format_table[] = { static const struct vc4_format * get_format(enum pipe_format f) { - if (f > ARRAY_SIZE(vc4_format_table) || + if (f >= ARRAY_SIZE(vc4_format_table) || !vc4_format_table[f].present) return NULL; else diff --git a/src/gallium/drivers/vc4/vc4_job.c b/src/gallium/drivers/vc4/vc4_job.c index dcade15443a..7ebd9f160eb 100644 --- a/src/gallium/drivers/vc4/vc4_job.c +++ b/src/gallium/drivers/vc4/vc4_job.c @@ -44,8 +44,7 @@ void vc4_job_reset(struct vc4_context *vc4) { struct vc4_bo **referenced_bos = vc4->bo_pointers.base; - for (int i = 0; i < (vc4->bo_handles.next - - vc4->bo_handles.base) / 4; i++) { + for (int i = 0; i < cl_offset(&vc4->bo_handles) / 4; i++) { vc4_bo_unreference(&referenced_bos[i]); } vc4_reset_cl(&vc4->bcl); @@ -145,7 +144,7 @@ vc4_job_submit(struct vc4_context *vc4) { if (vc4_debug & VC4_DEBUG_CL) { fprintf(stderr, "BCL:\n"); - vc4_dump_cl(vc4->bcl.base, vc4->bcl.next - vc4->bcl.base, false); + vc4_dump_cl(vc4->bcl.base, cl_offset(&vc4->bcl), false); } struct drm_vc4_submit_cl submit; @@ -164,15 +163,14 @@ vc4_job_submit(struct vc4_context *vc4) vc4->zs_write, true, true); submit.bo_handles = (uintptr_t)vc4->bo_handles.base; - submit.bo_handle_count = (vc4->bo_handles.next - - vc4->bo_handles.base) / 4; + submit.bo_handle_count = cl_offset(&vc4->bo_handles) / 4; submit.bin_cl = (uintptr_t)vc4->bcl.base; - submit.bin_cl_size = vc4->bcl.next - vc4->bcl.base; + submit.bin_cl_size = cl_offset(&vc4->bcl); submit.shader_rec = (uintptr_t)vc4->shader_rec.base; - submit.shader_rec_size = vc4->shader_rec.next - vc4->shader_rec.base; + submit.shader_rec_size = cl_offset(&vc4->shader_rec); submit.shader_rec_count = vc4->shader_rec_count; submit.uniforms = 
(uintptr_t)vc4->uniforms.base; - submit.uniforms_size = vc4->uniforms.next - vc4->uniforms.base; + submit.uniforms_size = cl_offset(&vc4->uniforms); assert(vc4->draw_min_x != ~0 && vc4->draw_min_y != ~0); submit.min_x_tile = vc4->draw_min_x / 64; @@ -207,7 +205,7 @@ vc4_job_submit(struct vc4_context *vc4) if (vc4_debug & VC4_DEBUG_ALWAYS_SYNC) { if (!vc4_wait_seqno(vc4->screen, vc4->last_emit_seqno, - PIPE_TIMEOUT_INFINITE)) { + PIPE_TIMEOUT_INFINITE, "sync")) { fprintf(stderr, "Wait failed.\n"); abort(); } diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c new file mode 100644 index 00000000000..a372a6c0cdc --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c @@ -0,0 +1,431 @@ +/* + * Copyright © 2015 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * Implements most of the fixed function fragment pipeline in shader code. + * + * VC4 doesn't have any hardware support for blending, alpha test, logic ops, + * or color mask. Instead, you read the current contents of the destination + * from the tile buffer after having waited for the scoreboard (which is + * handled by vc4_qpu_emit.c), then do math using your output color and that + * destination value, and update the output color appropriately. + */ + +/** + * Lowers fixed-function blending to a load of the destination color and a + * series of ALU operations before the store of the output. + */ +#include "util/u_format.h" +#include "vc4_qir.h" +#include "glsl/nir/nir_builder.h" +#include "vc4_context.h" + +/** Emits a load of the previous fragment color from the tile buffer. 
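+ *
+ * A sketch of the contract, inferred from the code below: the load is a
+ * single-component nir_intrinsic_load_input at the special
+ * VC4_NIR_TLB_COLOR_READ_INPUT slot, and its result is the packed 32-bit
+ * destination color. vc4_nir_lower_blend_instr() then unpacks it with
+ * nir_unpack_unorm_4x8() and unswizzles the channels into RGBA order.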
*/ +static nir_ssa_def * +vc4_nir_get_dst_color(nir_builder *b) +{ + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_load_input); + load->num_components = 1; + load->const_index[0] = VC4_NIR_TLB_COLOR_READ_INPUT; + nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL); + nir_builder_instr_insert(b, &load->instr); + return &load->dest.ssa; +} + +static nir_ssa_def * +vc4_nir_srgb_decode(nir_builder *b, nir_ssa_def *srgb) +{ + nir_ssa_def *is_low = nir_flt(b, srgb, nir_imm_float(b, 0.04045)); + nir_ssa_def *low = nir_fmul(b, srgb, nir_imm_float(b, 1.0 / 12.92)); + nir_ssa_def *high = nir_fpow(b, + nir_fmul(b, + nir_fadd(b, srgb, + nir_imm_float(b, 0.055)), + nir_imm_float(b, 1.0 / 1.055)), + nir_imm_float(b, 2.4)); + + return nir_bcsel(b, is_low, low, high); +} + +static nir_ssa_def * +vc4_nir_srgb_encode(nir_builder *b, nir_ssa_def *linear) +{ + nir_ssa_def *is_low = nir_flt(b, linear, nir_imm_float(b, 0.0031308)); + nir_ssa_def *low = nir_fmul(b, linear, nir_imm_float(b, 12.92)); + nir_ssa_def *high = nir_fsub(b, + nir_fmul(b, + nir_imm_float(b, 1.055), + nir_fpow(b, + linear, + nir_imm_float(b, 0.41666))), + nir_imm_float(b, 0.055)); + + return nir_bcsel(b, is_low, low, high); +} + +static nir_ssa_def * +vc4_blend_channel(nir_builder *b, + nir_ssa_def **src, + nir_ssa_def **dst, + unsigned factor, + int channel) +{ + switch(factor) { + case PIPE_BLENDFACTOR_ONE: + return nir_imm_float(b, 1.0); + case PIPE_BLENDFACTOR_SRC_COLOR: + return src[channel]; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return src[3]; + case PIPE_BLENDFACTOR_DST_ALPHA: + return dst[3]; + case PIPE_BLENDFACTOR_DST_COLOR: + return dst[channel]; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + if (channel != 3) { + return nir_fmin(b, + src[3], + nir_fsub(b, + nir_imm_float(b, 1.0), + dst[3])); + } else { + return nir_imm_float(b, 1.0); + } + case PIPE_BLENDFACTOR_CONST_COLOR: + return vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_X + channel); + case PIPE_BLENDFACTOR_CONST_ALPHA: + return vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_W); + case PIPE_BLENDFACTOR_ZERO: + return nir_imm_float(b, 0.0); + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return nir_fsub(b, nir_imm_float(b, 1.0), src[channel]); + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return nir_fsub(b, nir_imm_float(b, 1.0), src[3]); + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return nir_fsub(b, nir_imm_float(b, 1.0), dst[3]); + case PIPE_BLENDFACTOR_INV_DST_COLOR: + return nir_fsub(b, nir_imm_float(b, 1.0), dst[channel]); + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + return nir_fsub(b, nir_imm_float(b, 1.0), + vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_X + channel)); + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + return nir_fsub(b, nir_imm_float(b, 1.0), + vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_W)); + + default: + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + /* Unsupported. 
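+ * The SRC1 factors would require dual-source blending, which this
+ * hardware doesn't provide, so a factor of 1.0 is returned as a safe
+ * fallback.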
*/ + fprintf(stderr, "Unknown blend factor %d\n", factor); + return nir_imm_float(b, 1.0); + } +} + +static nir_ssa_def * +vc4_blend_func(nir_builder *b, nir_ssa_def *src, nir_ssa_def *dst, + unsigned func) +{ + switch (func) { + case PIPE_BLEND_ADD: + return nir_fadd(b, src, dst); + case PIPE_BLEND_SUBTRACT: + return nir_fsub(b, src, dst); + case PIPE_BLEND_REVERSE_SUBTRACT: + return nir_fsub(b, dst, src); + case PIPE_BLEND_MIN: + return nir_fmin(b, src, dst); + case PIPE_BLEND_MAX: + return nir_fmax(b, src, dst); + + default: + /* Unsupported. */ + fprintf(stderr, "Unknown blend func %d\n", func); + return src; + + } +} + +static void +vc4_do_blending(struct vc4_compile *c, nir_builder *b, nir_ssa_def **result, + nir_ssa_def **src_color, nir_ssa_def **dst_color) +{ + struct pipe_rt_blend_state *blend = &c->fs_key->blend; + + if (!blend->blend_enable) { + for (int i = 0; i < 4; i++) + result[i] = src_color[i]; + return; + } + + /* Clamp the src color to [0, 1]. Dest is already clamped. */ + for (int i = 0; i < 4; i++) + src_color[i] = nir_fsat(b, src_color[i]); + + nir_ssa_def *src_blend[4], *dst_blend[4]; + for (int i = 0; i < 4; i++) { + int src_factor = ((i != 3) ? blend->rgb_src_factor : + blend->alpha_src_factor); + int dst_factor = ((i != 3) ? blend->rgb_dst_factor : + blend->alpha_dst_factor); + src_blend[i] = nir_fmul(b, src_color[i], + vc4_blend_channel(b, + src_color, dst_color, + src_factor, i)); + dst_blend[i] = nir_fmul(b, dst_color[i], + vc4_blend_channel(b, + src_color, dst_color, + dst_factor, i)); + } + + for (int i = 0; i < 4; i++) { + result[i] = vc4_blend_func(b, src_blend[i], dst_blend[i], + ((i != 3) ? blend->rgb_func : + blend->alpha_func)); + } +} + +static nir_ssa_def * +vc4_logicop(nir_builder *b, int logicop_func, + nir_ssa_def *src, nir_ssa_def *dst) +{ + switch (logicop_func) { + case PIPE_LOGICOP_CLEAR: + return nir_imm_int(b, 0); + case PIPE_LOGICOP_NOR: + return nir_inot(b, nir_ior(b, src, dst)); + case PIPE_LOGICOP_AND_INVERTED: + return nir_iand(b, nir_inot(b, src), dst); + case PIPE_LOGICOP_COPY_INVERTED: + return nir_inot(b, src); + case PIPE_LOGICOP_AND_REVERSE: + return nir_iand(b, src, nir_inot(b, dst)); + case PIPE_LOGICOP_INVERT: + return nir_inot(b, dst); + case PIPE_LOGICOP_XOR: + return nir_ixor(b, src, dst); + case PIPE_LOGICOP_NAND: + return nir_inot(b, nir_iand(b, src, dst)); + case PIPE_LOGICOP_AND: + return nir_iand(b, src, dst); + case PIPE_LOGICOP_EQUIV: + return nir_inot(b, nir_ixor(b, src, dst)); + case PIPE_LOGICOP_NOOP: + return dst; + case PIPE_LOGICOP_OR_INVERTED: + return nir_ior(b, nir_inot(b, src), dst); + case PIPE_LOGICOP_OR_REVERSE: + return nir_ior(b, src, nir_inot(b, dst)); + case PIPE_LOGICOP_OR: + return nir_ior(b, src, dst); + case PIPE_LOGICOP_SET: + return nir_imm_int(b, ~0); + default: + fprintf(stderr, "Unknown logic op %d\n", logicop_func); + /* FALLTHROUGH */ + case PIPE_LOGICOP_COPY: + return src; + } +} + +static nir_ssa_def * +vc4_nir_pipe_compare_func(nir_builder *b, int func, + nir_ssa_def *src0, nir_ssa_def *src1) +{ + switch (func) { + default: + fprintf(stderr, "Unknown compare func %d\n", func); + /* FALLTHROUGH */ + case PIPE_FUNC_NEVER: + return nir_imm_int(b, 0); + case PIPE_FUNC_ALWAYS: + return nir_imm_int(b, ~0); + case PIPE_FUNC_EQUAL: + return nir_feq(b, src0, src1); + case PIPE_FUNC_NOTEQUAL: + return nir_fne(b, src0, src1); + case PIPE_FUNC_GREATER: + return nir_flt(b, src1, src0); + case PIPE_FUNC_GEQUAL: + return nir_fge(b, src0, src1); + case PIPE_FUNC_LESS: + return nir_flt(b, src0, src1); + 
case PIPE_FUNC_LEQUAL: + return nir_fge(b, src1, src0); + } +} + +static void +vc4_nir_emit_alpha_test_discard(struct vc4_compile *c, nir_builder *b, + nir_ssa_def *alpha) +{ + if (!c->fs_key->alpha_test) + return; + + nir_ssa_def *alpha_ref = + vc4_nir_get_state_uniform(b, QUNIFORM_ALPHA_REF); + nir_ssa_def *condition = + vc4_nir_pipe_compare_func(b, c->fs_key->alpha_test_func, + alpha, alpha_ref); + + nir_intrinsic_instr *discard = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_discard_if); + discard->num_components = 1; + discard->src[0] = nir_src_for_ssa(nir_inot(b, condition)); + nir_builder_instr_insert(b, &discard->instr); +} + +static void +vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + enum pipe_format color_format = c->fs_key->color_format; + const uint8_t *format_swiz = vc4_get_format_swizzle(color_format); + + /* Pull out the float src/dst color components. */ + nir_ssa_def *packed_dst_color = vc4_nir_get_dst_color(b); + nir_ssa_def *dst_vec4 = nir_unpack_unorm_4x8(b, packed_dst_color); + nir_ssa_def *src_color[4], *unpacked_dst_color[4]; + for (unsigned i = 0; i < 4; i++) { + src_color[i] = nir_swizzle(b, intr->src[0].ssa, &i, 1, false); + unpacked_dst_color[i] = nir_swizzle(b, dst_vec4, &i, 1, false); + } + + /* Unswizzle the destination color. */ + nir_ssa_def *dst_color[4]; + for (unsigned i = 0; i < 4; i++) { + dst_color[i] = vc4_nir_get_swizzled_channel(b, + unpacked_dst_color, + format_swiz[i]); + } + + vc4_nir_emit_alpha_test_discard(c, b, src_color[3]); + + /* Turn dst color to linear. */ + if (util_format_is_srgb(color_format)) { + for (int i = 0; i < 3; i++) + dst_color[i] = vc4_nir_srgb_decode(b, dst_color[i]); + } + + nir_ssa_def *blend_color[4]; + vc4_do_blending(c, b, blend_color, src_color, dst_color); + + /* sRGB encode the output color */ + if (util_format_is_srgb(color_format)) { + for (int i = 0; i < 3; i++) + blend_color[i] = vc4_nir_srgb_encode(b, blend_color[i]); + } + + nir_ssa_def *swizzled_outputs[4]; + for (int i = 0; i < 4; i++) { + swizzled_outputs[i] = + vc4_nir_get_swizzled_channel(b, blend_color, + format_swiz[i]); + } + + nir_ssa_def *packed_color = + nir_pack_unorm_4x8(b, + nir_vec4(b, + swizzled_outputs[0], + swizzled_outputs[1], + swizzled_outputs[2], + swizzled_outputs[3])); + + packed_color = vc4_logicop(b, c->fs_key->logicop_func, + packed_color, packed_dst_color); + + /* If the bit isn't set in the color mask, then just return the + * original dst color, instead. + */ + uint32_t colormask = 0xffffffff; + for (int i = 0; i < 4; i++) { + if (format_swiz[i] < 4 && + !(c->fs_key->blend.colormask & (1 << format_swiz[i]))) { + colormask &= ~(0xff << (i * 8)); + } + } + packed_color = nir_ior(b, + nir_iand(b, packed_color, + nir_imm_int(b, colormask)), + nir_iand(b, packed_dst_color, + nir_imm_int(b, ~colormask))); + + /* Turn the old vec4 output into a store of the packed color. 
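+ * The store_output intrinsic itself stays in place; only its source is
+ * rewritten to the single packed dword, and num_components drops from 4
+ * to 1 to match.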
*/ + nir_instr_rewrite_src(&intr->instr, &intr->src[0], + nir_src_for_ssa(packed_color)); + intr->num_components = 1; +} + +static bool +vc4_nir_lower_blend_block(nir_block *block, void *state) +{ + struct vc4_compile *c = state; + + nir_foreach_instr(block, instr) { + if (instr->type != nir_instr_type_intrinsic) + continue; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; + + nir_variable *output_var = NULL; + foreach_list_typed(nir_variable, var, node, &c->s->outputs) { + if (var->data.driver_location == intr->const_index[0]) { + output_var = var; + break; + } + } + assert(output_var); + unsigned semantic_name = output_var->data.location; + + if (semantic_name != TGSI_SEMANTIC_COLOR) + continue; + + nir_function_impl *impl = + nir_cf_node_get_function(&block->cf_node); + nir_builder b; + nir_builder_init(&b, impl); + nir_builder_insert_before_instr(&b, &intr->instr); + vc4_nir_lower_blend_instr(c, &b, intr); + } + return true; +} + +void +vc4_nir_lower_blend(struct vc4_compile *c) +{ + nir_foreach_overload(c->s, overload) { + if (overload->impl) { + nir_foreach_block(overload->impl, + vc4_nir_lower_blend_block, c); + + nir_metadata_preserve(overload->impl, + nir_metadata_block_index | + nir_metadata_dominance); + } + } +} diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c new file mode 100644 index 00000000000..229d41147d8 --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c @@ -0,0 +1,291 @@ +/* + * Copyright © 2015 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "vc4_qir.h" +#include "tgsi/tgsi_info.h" +#include "glsl/nir/nir_builder.h" + +/** + * Walks the NIR generated by TGSI-to-NIR to lower its io intrinsics into + * something amenable to the VC4 architecture. + * + * Currently, it splits inputs, outputs, and uniforms into scalars, drops any + * non-position outputs in coordinate shaders, and fixes up the addressing on + * indirect uniform loads. + */ + +static void +replace_intrinsic_with_vec4(nir_builder *b, nir_intrinsic_instr *intr, + nir_ssa_def **comps) +{ + + /* Batch things back together into a vec4. This will get split by the + * later ALU scalarization pass. + */ + nir_ssa_def *vec = nir_vec4(b, comps[0], comps[1], comps[2], comps[3]); + + /* Replace the old intrinsic with a reference to our reconstructed + * vec4.
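+ * All uses of the old intrinsic's destination are rewritten to point at
+ * the reconstructed vec4, and the old intrinsic is then removed.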
+ */ + nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(vec), + ralloc_parent(b->impl)); + nir_instr_remove(&intr->instr); +} + +static void +vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + nir_builder_insert_before_instr(b, &intr->instr); + + if (c->stage == QSTAGE_FRAG && intr->const_index[0] == + VC4_NIR_TLB_COLOR_READ_INPUT) { + /* This doesn't need any lowering. */ + return; + } + + nir_variable *input_var = NULL; + foreach_list_typed(nir_variable, var, node, &c->s->inputs) { + if (var->data.driver_location == intr->const_index[0]) { + input_var = var; + break; + } + } + assert(input_var); + int semantic_name = input_var->data.location; + int semantic_index = input_var->data.index; + + /* All TGSI-to-NIR inputs are vec4. */ + assert(intr->num_components == 4); + + /* Generate scalar loads equivalent to the original VEC4. */ + nir_ssa_def *dests[4]; + for (unsigned i = 0; i < intr->num_components; i++) { + nir_intrinsic_instr *intr_comp = + nir_intrinsic_instr_create(c->s, nir_intrinsic_load_input); + intr_comp->num_components = 1; + intr_comp->const_index[0] = intr->const_index[0] * 4 + i; + nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL); + nir_builder_instr_insert(b, &intr_comp->instr); + + dests[i] = &intr_comp->dest.ssa; + } + + switch (c->stage) { + case QSTAGE_FRAG: + switch (semantic_name) { + case TGSI_SEMANTIC_FACE: + dests[0] = nir_fsub(b, + nir_imm_float(b, 1.0), + nir_fmul(b, + nir_i2f(b, dests[0]), + nir_imm_float(b, 2.0))); + dests[1] = nir_imm_float(b, 0.0); + dests[2] = nir_imm_float(b, 0.0); + dests[3] = nir_imm_float(b, 1.0); + break; + case TGSI_SEMANTIC_GENERIC: + if (c->fs_key->point_sprite_mask & + (1 << semantic_index)) { + if (!c->fs_key->is_points) { + dests[0] = nir_imm_float(b, 0.0); + dests[1] = nir_imm_float(b, 0.0); + } + if (c->fs_key->point_coord_upper_left) { + dests[1] = nir_fsub(b, + nir_imm_float(b, 1.0), + dests[1]); + } + dests[2] = nir_imm_float(b, 0.0); + dests[3] = nir_imm_float(b, 1.0); + } + break; + } + break; + case QSTAGE_COORD: + case QSTAGE_VERT: + break; + } + + replace_intrinsic_with_vec4(b, intr, dests); +} + +static void +vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + nir_variable *output_var = NULL; + foreach_list_typed(nir_variable, var, node, &c->s->outputs) { + if (var->data.driver_location == intr->const_index[0]) { + output_var = var; + break; + } + } + assert(output_var); + unsigned semantic_name = output_var->data.location; + + if (c->stage == QSTAGE_COORD && + (semantic_name != TGSI_SEMANTIC_POSITION && + semantic_name != TGSI_SEMANTIC_PSIZE)) { + nir_instr_remove(&intr->instr); + return; + } + + /* Color output is lowered by vc4_nir_lower_blend(). */ + if (c->stage == QSTAGE_FRAG && semantic_name == TGSI_SEMANTIC_COLOR) { + intr->const_index[0] *= 4; + return; + } + + /* All TGSI-to-NIR outputs are VEC4. 
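+ * The store is split into four single-component store_output
+ * intrinsics, one per channel, with const_index[0] scaled by 4 so that
+ * each scalar channel gets its own driver location.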
*/ + assert(intr->num_components == 4); + + nir_builder_insert_before_instr(b, &intr->instr); + + for (unsigned i = 0; i < intr->num_components; i++) { + nir_intrinsic_instr *intr_comp = + nir_intrinsic_instr_create(c->s, nir_intrinsic_store_output); + intr_comp->num_components = 1; + intr_comp->const_index[0] = intr->const_index[0] * 4 + i; + + assert(intr->src[0].is_ssa); + intr_comp->src[0] = nir_src_for_ssa(nir_swizzle(b, + intr->src[0].ssa, + &i, 1, false)); + nir_builder_instr_insert(b, &intr_comp->instr); + } + + nir_instr_remove(&intr->instr); +} + +static void +vc4_nir_lower_uniform(struct vc4_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + /* All TGSI-to-NIR uniform loads are vec4, but we may create dword + * loads in our lowering passes. + */ + if (intr->num_components == 1) + return; + assert(intr->num_components == 4); + + nir_builder_insert_before_instr(b, &intr->instr); + + /* Generate scalar loads equivalent to the original VEC4. */ + nir_ssa_def *dests[4]; + for (unsigned i = 0; i < intr->num_components; i++) { + nir_intrinsic_instr *intr_comp = + nir_intrinsic_instr_create(c->s, intr->intrinsic); + intr_comp->num_components = 1; + nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL); + + if (intr->intrinsic == nir_intrinsic_load_uniform_indirect) { + /* Convert the variable TGSI register index to a byte + * offset. + */ + intr_comp->src[0] = + nir_src_for_ssa(nir_ishl(b, + intr->src[0].ssa, + nir_imm_int(b, 4))); + + /* Convert the offset to be a byte index, too. */ + intr_comp->const_index[0] = (intr->const_index[0] * 16 + + i * 4); + } else { + /* We want a dword index for non-indirect uniform + * loads. + */ + intr_comp->const_index[0] = (intr->const_index[0] * 4 + + i); + } + + dests[i] = &intr_comp->dest.ssa; + + nir_builder_instr_insert(b, &intr_comp->instr); + } + + replace_intrinsic_with_vec4(b, intr, dests); +} + +static void +vc4_nir_lower_io_instr(struct vc4_compile *c, nir_builder *b, + struct nir_instr *instr) +{ + if (instr->type != nir_instr_type_intrinsic) + return; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + switch (intr->intrinsic) { + case nir_intrinsic_load_input: + vc4_nir_lower_input(c, b, intr); + break; + + case nir_intrinsic_store_output: + vc4_nir_lower_output(c, b, intr); + break; + + case nir_intrinsic_load_uniform: + case nir_intrinsic_load_uniform_indirect: + vc4_nir_lower_uniform(c, b, intr); + break; + + default: + break; + } +} + +static bool +vc4_nir_lower_io_block(nir_block *block, void *arg) +{ + struct vc4_compile *c = arg; + nir_function_impl *impl = + nir_cf_node_get_function(&block->cf_node); + + nir_builder b; + nir_builder_init(&b, impl); + + nir_foreach_instr_safe(block, instr) + vc4_nir_lower_io_instr(c, &b, instr); + + return true; +} + +static bool +vc4_nir_lower_io_impl(struct vc4_compile *c, nir_function_impl *impl) +{ + nir_foreach_block(impl, vc4_nir_lower_io_block, c); + + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + + return true; +} + +void +vc4_nir_lower_io(struct vc4_compile *c) +{ + nir_foreach_overload(c->s, overload) { + if (overload->impl) + vc4_nir_lower_io_impl(c, overload->impl); + } +} diff --git a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c index d6d2fbf257f..a755de9aa41 100644 --- a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c +++ b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c @@ -67,10 +67,7 @@ qir_opt_copy_propagation(struct vc4_compile *c) if 
(inst->op == QOP_MOV && inst->dst.file == QFILE_TEMP && - inst->src[0].file != QFILE_VPM && - !(inst->src[0].file == QFILE_TEMP && - (c->defs[inst->src[0].index]->op == QOP_TEX_RESULT || - c->defs[inst->src[0].index]->op == QOP_TLB_COLOR_READ))) { + inst->src[0].file != QFILE_VPM) { movs[inst->dst.index] = inst->src[0]; } } diff --git a/src/gallium/drivers/vc4/vc4_opt_cse.c b/src/gallium/drivers/vc4/vc4_opt_cse.c index 92c8260eb59..0e5480ea781 100644 --- a/src/gallium/drivers/vc4/vc4_opt_cse.c +++ b/src/gallium/drivers/vc4/vc4_opt_cse.c @@ -46,8 +46,7 @@ struct inst_key { struct qreg src[4]; /** * If the instruction depends on the flags, how many SFs have been - * seen before this instruction, or if it depends on r4, how many r4 - * writes have been seen. + * seen before this instruction. */ uint32_t implicit_arg_update_count; }; @@ -63,8 +62,7 @@ inst_key_equals(const void *a, const void *b) static struct qinst * vc4_find_cse(struct vc4_compile *c, struct hash_table *ht, - struct qinst *inst, uint32_t sf_count, - uint32_t r4_count) + struct qinst *inst, uint32_t sf_count) { if (inst->dst.file != QFILE_TEMP || inst->op == QOP_MOV || @@ -79,8 +77,6 @@ vc4_find_cse(struct vc4_compile *c, struct hash_table *ht, qir_get_op_nsrc(inst->op) * sizeof(key.src[0])); if (qir_depends_on_flags(inst)) key.implicit_arg_update_count = sf_count; - if (qir_reads_r4(inst)) - key.implicit_arg_update_count = r4_count; uint32_t hash = _mesa_hash_data(&key, sizeof(key)); struct hash_entry *entry = @@ -121,7 +117,7 @@ bool qir_opt_cse(struct vc4_compile *c) { bool progress = false; - uint32_t sf_count = 0, r4_count = 0; + uint32_t sf_count = 0; struct hash_table *ht = _mesa_hash_table_create(NULL, NULL, inst_key_equals); @@ -130,15 +126,15 @@ qir_opt_cse(struct vc4_compile *c) list_for_each_entry(struct qinst, inst, &c->instructions, link) { if (qir_has_side_effects(c, inst) || - qir_has_side_effect_reads(c, inst)) { + qir_has_side_effect_reads(c, inst) || + inst->op == QOP_TLB_COLOR_READ) { continue; } if (inst->sf) { sf_count++; } else { - struct qinst *cse = vc4_find_cse(c, ht, inst, - sf_count, r4_count); + struct qinst *cse = vc4_find_cse(c, ht, inst, sf_count); if (cse) { inst->src[0] = cse->dst; for (int i = 1; i < qir_get_op_nsrc(inst->op); @@ -154,9 +150,6 @@ qir_opt_cse(struct vc4_compile *c) } } } - - if (qir_writes_r4(inst)) - r4_count++; } ralloc_free(ht); diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index ba47c51d9bd..13c472152d8 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -23,21 +23,19 @@ */ #include <inttypes.h> -#include "pipe/p_state.h" #include "util/u_format.h" #include "util/u_hash.h" #include "util/u_math.h" #include "util/u_memory.h" -#include "util/u_pack_color.h" -#include "util/format_srgb.h" #include "util/ralloc.h" #include "util/hash_table.h" #include "tgsi/tgsi_dump.h" #include "tgsi/tgsi_info.h" #include "tgsi/tgsi_lowering.h" #include "tgsi/tgsi_parse.h" +#include "glsl/nir/nir.h" +#include "glsl/nir/nir_builder.h" #include "nir/tgsi_to_nir.h" - #include "vc4_context.h" #include "vc4_qpu.h" #include "vc4_qir.h" @@ -45,51 +43,8 @@ #include "simpenrose/simpenrose.h" #endif -struct vc4_key { - struct vc4_uncompiled_shader *shader_state; - struct { - enum pipe_format format; - unsigned compare_mode:1; - unsigned compare_func:3; - unsigned wrap_s:3; - unsigned wrap_t:3; - uint8_t swizzle[4]; - } tex[VC4_MAX_TEXTURE_SAMPLERS]; - uint8_t ucp_enables; -}; - -struct vc4_fs_key { - struct 
vc4_key base; - enum pipe_format color_format; - bool depth_enabled; - bool stencil_enabled; - bool stencil_twoside; - bool stencil_full_writemasks; - bool is_points; - bool is_lines; - bool alpha_test; - bool point_coord_upper_left; - bool light_twoside; - uint8_t alpha_test_func; - uint8_t logicop_func; - uint32_t point_sprite_mask; - - struct pipe_rt_blend_state blend; -}; - -struct vc4_vs_key { - struct vc4_key base; - - /** - * This is a proxy for the array of FS input semantics, which is - * larger than we would want to put in the key. - */ - uint64_t compiled_fs_id; - - enum pipe_format attr_formats[8]; - bool is_coord; - bool per_vertex_point_size; -}; +static struct qreg +ntq_get_src(struct vc4_compile *c, nir_src src, int i); static void resize_qreg_array(struct vc4_compile *c, @@ -113,10 +68,10 @@ resize_qreg_array(struct vc4_compile *c, } static struct qreg -indirect_uniform_load(struct vc4_compile *c, - struct qreg indirect_offset, - unsigned offset) +indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr) { + struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0); + uint32_t offset = intr->const_index[0]; struct vc4_compiler_ubo_range *range = NULL; unsigned i; for (i = 0; i < c->num_uniform_ranges; i++) { @@ -138,10 +93,6 @@ indirect_uniform_load(struct vc4_compile *c, }; offset -= range->src_offset; - /* Translate the user's TGSI register index from the TGSI register - * base to a byte offset. - */ - indirect_offset = qir_SHL(c, indirect_offset, qir_uniform_ui(c, 4)); /* Adjust for where we stored the TGSI register base. */ indirect_offset = qir_ADD(c, indirect_offset, @@ -155,24 +106,70 @@ indirect_uniform_load(struct vc4_compile *c, range->size - 4))); qir_TEX_DIRECT(c, indirect_offset, qir_uniform(c, QUNIFORM_UBO_ADDR, 0)); - struct qreg r4 = qir_TEX_RESULT(c); c->num_texture_samples++; - return qir_MOV(c, r4); + return qir_TEX_RESULT(c); } -static struct qreg * -ntq_get_dest(struct vc4_compile *c, nir_dest dest) +nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b, + enum quniform_contents contents) { - assert(!dest.is_ssa); - nir_register *reg = dest.reg.reg; - struct hash_entry *entry = _mesa_hash_table_search(c->def_ht, reg); - assert(reg->num_array_elems == 0); - assert(dest.reg.base_offset == 0); + nir_intrinsic_instr *intr = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_load_uniform); + intr->const_index[0] = VC4_NIR_STATE_UNIFORM_OFFSET + contents; + intr->num_components = 1; + nir_ssa_dest_init(&intr->instr, &intr->dest, 1, NULL); + nir_builder_instr_insert(b, &intr->instr); + return &intr->dest.ssa; +} - struct qreg *qregs = entry->data; +nir_ssa_def * +vc4_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz) +{ + switch (swiz) { + default: + case UTIL_FORMAT_SWIZZLE_NONE: + fprintf(stderr, "warning: unknown swizzle\n"); + /* FALLTHROUGH */ + case UTIL_FORMAT_SWIZZLE_0: + return nir_imm_float(b, 0.0); + case UTIL_FORMAT_SWIZZLE_1: + return nir_imm_float(b, 1.0); + case UTIL_FORMAT_SWIZZLE_X: + case UTIL_FORMAT_SWIZZLE_Y: + case UTIL_FORMAT_SWIZZLE_Z: + case UTIL_FORMAT_SWIZZLE_W: + return srcs[swiz]; + } +} + +static struct qreg * +ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def) +{ + struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, + def->num_components); + _mesa_hash_table_insert(c->def_ht, def, qregs); return qregs; } +static struct qreg * +ntq_get_dest(struct vc4_compile *c, nir_dest *dest) +{ + if (dest->is_ssa) { + struct qreg *qregs = ntq_init_ssa_def(c, &dest->ssa); + for (int 
i = 0; i < dest->ssa.num_components; i++) + qregs[i] = c->undef; + return qregs; + } else { + nir_register *reg = dest->reg.reg; + assert(dest->reg.base_offset == 0); + assert(reg->num_array_elems == 0); + struct hash_entry *entry = + _mesa_hash_table_search(c->def_ht, reg); + return entry->data; + } +} + static struct qreg ntq_get_src(struct vc4_compile *c, nir_src src, int i) { @@ -282,22 +279,6 @@ qir_srgb_decode(struct vc4_compile *c, struct qreg srgb) } static struct qreg -qir_srgb_encode(struct vc4_compile *c, struct qreg linear) -{ - struct qreg low = qir_FMUL(c, linear, qir_uniform_f(c, 12.92)); - struct qreg high = qir_FSUB(c, - qir_FMUL(c, - qir_uniform_f(c, 1.055), - qir_POW(c, - linear, - qir_uniform_f(c, 0.41666))), - qir_uniform_f(c, 0.055)); - - qir_SF(c, qir_FSUB(c, linear, qir_uniform_f(c, 0.0031308))); - return qir_SEL_X_Y_NS(c, low, high); -} - -static struct qreg ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1) { struct qreg src0_hi = qir_SHR(c, src0, @@ -410,13 +391,13 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) qir_TEX_S(c, s, texture_u[next_texture_u++]); c->num_texture_samples++; - struct qreg r4 = qir_TEX_RESULT(c); + struct qreg tex = qir_TEX_RESULT(c); enum pipe_format format = c->key->tex[unit].format; struct qreg unpacked[4]; if (util_format_is_depth_or_stencil(format)) { - struct qreg depthf = qir_ITOF(c, qir_SHR(c, r4, + struct qreg depthf = qir_ITOF(c, qir_SHR(c, tex, qir_uniform_ui(c, 8))); struct qreg normalized = qir_FMUL(c, depthf, qir_uniform_f(c, 1.0f/0xffffff)); @@ -468,7 +449,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) unpacked[i] = depth_output; } else { for (int i = 0; i < 4; i++) - unpacked[i] = qir_R4_UNPACK(c, r4, i); + unpacked[i] = qir_UNPACK_8_F(c, tex, i); } const uint8_t *format_swiz = vc4_get_format_swizzle(format); @@ -484,7 +465,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) texture_output[i]); } - struct qreg *dest = ntq_get_dest(c, instr->dest); + struct qreg *dest = ntq_get_dest(c, &instr->dest); for (int i = 0; i < 4; i++) { dest[i] = get_swizzled_channel(c, texture_output, c->key->tex[unit].swizzle[i]); @@ -558,7 +539,7 @@ ntq_fsin(struct vc4_compile *c, struct qreg src) struct qreg scaled_x = qir_FMUL(c, src, - qir_uniform_f(c, 1.0f / (M_PI * 2.0f))); + qir_uniform_f(c, 1.0 / (M_PI * 2.0))); struct qreg x = qir_FADD(c, ntq_ffract(c, scaled_x), @@ -756,26 +737,6 @@ emit_fragcoord_input(struct vc4_compile *c, int attr) c->inputs[attr * 4 + 3] = qir_RCP(c, qir_FRAG_W(c)); } -static void -emit_point_coord_input(struct vc4_compile *c, int attr) -{ - if (c->point_x.file == QFILE_NULL) { - c->point_x = qir_uniform_f(c, 0.0); - c->point_y = qir_uniform_f(c, 0.0); - } - - c->inputs[attr * 4 + 0] = c->point_x; - if (c->fs_key->point_coord_upper_left) { - c->inputs[attr * 4 + 1] = qir_FSUB(c, - qir_uniform_f(c, 1.0), - c->point_y); - } else { - c->inputs[attr * 4 + 1] = c->point_y; - } - c->inputs[attr * 4 + 2] = qir_uniform_f(c, 0.0); - c->inputs[attr * 4 + 3] = qir_uniform_f(c, 1.0); -} - static struct qreg emit_fragment_varying(struct vc4_compile *c, uint8_t semantic, uint8_t index, uint8_t swizzle) @@ -817,19 +778,6 @@ emit_fragment_input(struct vc4_compile *c, int attr, } static void -emit_face_input(struct vc4_compile *c, int attr) -{ - c->inputs[attr * 4 + 0] = qir_FSUB(c, - qir_uniform_f(c, 1.0), - qir_FMUL(c, - qir_ITOF(c, qir_FRAG_REV_FLAG(c)), - qir_uniform_f(c, 2.0))); - c->inputs[attr * 4 + 1] = qir_uniform_f(c, 0.0); - c->inputs[attr * 4 + 2] = 
qir_uniform_f(c, 0.0); - c->inputs[attr * 4 + 3] = qir_uniform_f(c, 1.0); -} - -static void add_output(struct vc4_compile *c, uint32_t decl_offset, uint8_t semantic_name, @@ -884,12 +832,38 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) srcs[i] = ntq_get_src(c, instr->src[i].src, instr->src[i].swizzle[0]); - struct qreg *dest = ntq_get_dest(c, instr->dest.dest); + struct qreg *dest = ntq_get_dest(c, &instr->dest.dest); for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) dest[i] = srcs[i]; return; } + if (instr->op == nir_op_pack_unorm_4x8) { + struct qreg result; + for (int i = 0; i < 4; i++) { + struct qreg src = ntq_get_src(c, instr->src[0].src, + instr->src[0].swizzle[i]); + if (i == 0) + result = qir_PACK_8888_F(c, src); + else + result = qir_PACK_8_F(c, result, src, i); + } + struct qreg *dest = ntq_get_dest(c, &instr->dest.dest); + *dest = result; + return; + } + + if (instr->op == nir_op_unpack_unorm_4x8) { + struct qreg src = ntq_get_src(c, instr->src[0].src, + instr->src[0].swizzle[0]); + struct qreg *dest = ntq_get_dest(c, &instr->dest.dest); + for (int i = 0; i < 4; i++) { + if (instr->dest.write_mask & (1 << i)) + dest[i] = qir_UNPACK_8_F(c, src, i); + } + return; + } + /* General case: We can just grab the one used channel per src. */ struct qreg src[nir_op_infos[instr->op].num_inputs]; for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { @@ -898,7 +872,7 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) /* Pick the channel to store the output in. */ assert(!instr->dest.saturate); - struct qreg *dest = ntq_get_dest(c, instr->dest.dest); + struct qreg *dest = ntq_get_dest(c, &instr->dest.dest); assert(util_is_power_of_two(instr->dest.write_mask)); dest += ffs(instr->dest.write_mask) - 1; @@ -1092,167 +1066,6 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) } } -static struct qreg -vc4_blend_channel(struct vc4_compile *c, - struct qreg *dst, - struct qreg *src, - struct qreg val, - unsigned factor, - int channel) -{ - switch(factor) { - case PIPE_BLENDFACTOR_ONE: - return val; - case PIPE_BLENDFACTOR_SRC_COLOR: - return qir_FMUL(c, val, src[channel]); - case PIPE_BLENDFACTOR_SRC_ALPHA: - return qir_FMUL(c, val, src[3]); - case PIPE_BLENDFACTOR_DST_ALPHA: - return qir_FMUL(c, val, dst[3]); - case PIPE_BLENDFACTOR_DST_COLOR: - return qir_FMUL(c, val, dst[channel]); - case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: - if (channel != 3) { - return qir_FMUL(c, - val, - qir_FMIN(c, - src[3], - qir_FSUB(c, - qir_uniform_f(c, 1.0), - dst[3]))); - } else { - return val; - } - case PIPE_BLENDFACTOR_CONST_COLOR: - return qir_FMUL(c, val, - qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR, - channel)); - case PIPE_BLENDFACTOR_CONST_ALPHA: - return qir_FMUL(c, val, - qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR, 3)); - case PIPE_BLENDFACTOR_ZERO: - return qir_uniform_f(c, 0.0); - case PIPE_BLENDFACTOR_INV_SRC_COLOR: - return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0), - src[channel])); - case PIPE_BLENDFACTOR_INV_SRC_ALPHA: - return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0), - src[3])); - case PIPE_BLENDFACTOR_INV_DST_ALPHA: - return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0), - dst[3])); - case PIPE_BLENDFACTOR_INV_DST_COLOR: - return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0), - dst[channel])); - case PIPE_BLENDFACTOR_INV_CONST_COLOR: - return qir_FMUL(c, val, - qir_FSUB(c, qir_uniform_f(c, 1.0), - qir_uniform(c, - QUNIFORM_BLEND_CONST_COLOR, - channel))); - 
case PIPE_BLENDFACTOR_INV_CONST_ALPHA: - return qir_FMUL(c, val, - qir_FSUB(c, qir_uniform_f(c, 1.0), - qir_uniform(c, - QUNIFORM_BLEND_CONST_COLOR, - 3))); - - default: - case PIPE_BLENDFACTOR_SRC1_COLOR: - case PIPE_BLENDFACTOR_SRC1_ALPHA: - case PIPE_BLENDFACTOR_INV_SRC1_COLOR: - case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: - /* Unsupported. */ - fprintf(stderr, "Unknown blend factor %d\n", factor); - return val; - } -} - -static struct qreg -vc4_blend_func(struct vc4_compile *c, - struct qreg src, struct qreg dst, - unsigned func) -{ - switch (func) { - case PIPE_BLEND_ADD: - return qir_FADD(c, src, dst); - case PIPE_BLEND_SUBTRACT: - return qir_FSUB(c, src, dst); - case PIPE_BLEND_REVERSE_SUBTRACT: - return qir_FSUB(c, dst, src); - case PIPE_BLEND_MIN: - return qir_FMIN(c, src, dst); - case PIPE_BLEND_MAX: - return qir_FMAX(c, src, dst); - - default: - /* Unsupported. */ - fprintf(stderr, "Unknown blend func %d\n", func); - return src; - - } -} - -/** - * Implements fixed function blending in shader code. - * - * VC4 doesn't have any hardware support for blending. Instead, you read the - * current contents of the destination from the tile buffer after having - * waited for the scoreboard (which is handled by vc4_qpu_emit.c), then do - * math using your output color and that destination value, and update the - * output color appropriately. - */ -static void -vc4_blend(struct vc4_compile *c, struct qreg *result, - struct qreg *dst_color, struct qreg *src_color) -{ - struct pipe_rt_blend_state *blend = &c->fs_key->blend; - - if (!blend->blend_enable) { - for (int i = 0; i < 4; i++) - result[i] = src_color[i]; - return; - } - - struct qreg clamped_src[4]; - struct qreg clamped_dst[4]; - for (int i = 0; i < 4; i++) { - clamped_src[i] = qir_SAT(c, src_color[i]); - clamped_dst[i] = qir_SAT(c, dst_color[i]); - } - src_color = clamped_src; - dst_color = clamped_dst; - - struct qreg src_blend[4], dst_blend[4]; - for (int i = 0; i < 3; i++) { - src_blend[i] = vc4_blend_channel(c, - dst_color, src_color, - src_color[i], - blend->rgb_src_factor, i); - dst_blend[i] = vc4_blend_channel(c, - dst_color, src_color, - dst_color[i], - blend->rgb_dst_factor, i); - } - src_blend[3] = vc4_blend_channel(c, - dst_color, src_color, - src_color[3], - blend->alpha_src_factor, 3); - dst_blend[3] = vc4_blend_channel(c, - dst_color, src_color, - dst_color[3], - blend->alpha_dst_factor, 3); - - for (int i = 0; i < 3; i++) { - result[i] = vc4_blend_func(c, - src_blend[i], dst_blend[i], - blend->rgb_func); - } - result[3] = vc4_blend_func(c, - src_blend[3], dst_blend[3], - blend->alpha_func); -} - static void clip_distance_discard(struct vc4_compile *c) { @@ -1276,167 +1089,15 @@ clip_distance_discard(struct vc4_compile *c) } static void -alpha_test_discard(struct vc4_compile *c) -{ - struct qreg src_alpha; - struct qreg alpha_ref = qir_uniform(c, QUNIFORM_ALPHA_REF, 0); - - if (!c->fs_key->alpha_test) - return; - - if (c->output_color_index != -1) - src_alpha = c->outputs[c->output_color_index + 3]; - else - src_alpha = qir_uniform_f(c, 1.0); - - if (c->discard.file == QFILE_NULL) - c->discard = qir_uniform_ui(c, 0); - - switch (c->fs_key->alpha_test_func) { - case PIPE_FUNC_NEVER: - c->discard = qir_uniform_ui(c, ~0); - break; - case PIPE_FUNC_ALWAYS: - break; - case PIPE_FUNC_EQUAL: - qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref)); - c->discard = qir_SEL_X_Y_ZS(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_NOTEQUAL: - qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref)); - c->discard = qir_SEL_X_Y_ZC(c, 
c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_GREATER: - qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref)); - c->discard = qir_SEL_X_Y_NC(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_GEQUAL: - qir_SF(c, qir_FSUB(c, alpha_ref, src_alpha)); - c->discard = qir_SEL_X_Y_NS(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_LESS: - qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref)); - c->discard = qir_SEL_X_Y_NS(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_LEQUAL: - qir_SF(c, qir_FSUB(c, alpha_ref, src_alpha)); - c->discard = qir_SEL_X_Y_NC(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - } -} - -static struct qreg -vc4_logicop(struct vc4_compile *c, struct qreg src, struct qreg dst) -{ - switch (c->fs_key->logicop_func) { - case PIPE_LOGICOP_CLEAR: - return qir_uniform_f(c, 0.0); - case PIPE_LOGICOP_NOR: - return qir_NOT(c, qir_OR(c, src, dst)); - case PIPE_LOGICOP_AND_INVERTED: - return qir_AND(c, qir_NOT(c, src), dst); - case PIPE_LOGICOP_COPY_INVERTED: - return qir_NOT(c, src); - case PIPE_LOGICOP_AND_REVERSE: - return qir_AND(c, src, qir_NOT(c, dst)); - case PIPE_LOGICOP_INVERT: - return qir_NOT(c, dst); - case PIPE_LOGICOP_XOR: - return qir_XOR(c, src, dst); - case PIPE_LOGICOP_NAND: - return qir_NOT(c, qir_AND(c, src, dst)); - case PIPE_LOGICOP_AND: - return qir_AND(c, src, dst); - case PIPE_LOGICOP_EQUIV: - return qir_NOT(c, qir_XOR(c, src, dst)); - case PIPE_LOGICOP_NOOP: - return dst; - case PIPE_LOGICOP_OR_INVERTED: - return qir_OR(c, qir_NOT(c, src), dst); - case PIPE_LOGICOP_OR_REVERSE: - return qir_OR(c, src, qir_NOT(c, dst)); - case PIPE_LOGICOP_OR: - return qir_OR(c, src, dst); - case PIPE_LOGICOP_SET: - return qir_uniform_ui(c, ~0); - case PIPE_LOGICOP_COPY: - default: - return src; - } -} - -static void emit_frag_end(struct vc4_compile *c) { clip_distance_discard(c); - alpha_test_discard(c); - - enum pipe_format color_format = c->fs_key->color_format; - const uint8_t *format_swiz = vc4_get_format_swizzle(color_format); - struct qreg tlb_read_color[4] = { c->undef, c->undef, c->undef, c->undef }; - struct qreg dst_color[4] = { c->undef, c->undef, c->undef, c->undef }; - struct qreg linear_dst_color[4] = { c->undef, c->undef, c->undef, c->undef }; - struct qreg packed_dst_color = c->undef; - - if (c->fs_key->blend.blend_enable || - c->fs_key->blend.colormask != 0xf || - c->fs_key->logicop_func != PIPE_LOGICOP_COPY) { - struct qreg r4 = qir_TLB_COLOR_READ(c); - for (int i = 0; i < 4; i++) - tlb_read_color[i] = qir_R4_UNPACK(c, r4, i); - for (int i = 0; i < 4; i++) { - dst_color[i] = get_swizzled_channel(c, - tlb_read_color, - format_swiz[i]); - if (util_format_is_srgb(color_format) && i != 3) { - linear_dst_color[i] = - qir_srgb_decode(c, dst_color[i]); - } else { - linear_dst_color[i] = dst_color[i]; - } - } - /* Save the packed value for logic ops. Can't reuse r4 - * because other things might smash it (like sRGB) - */ - packed_dst_color = qir_MOV(c, r4); - } - - struct qreg blend_color[4]; - struct qreg undef_array[4] = { - c->undef, c->undef, c->undef, c->undef - }; - vc4_blend(c, blend_color, linear_dst_color, - (c->output_color_index != -1 ? - c->outputs + c->output_color_index : - undef_array)); - - if (util_format_is_srgb(color_format)) { - for (int i = 0; i < 3; i++) - blend_color[i] = qir_srgb_encode(c, blend_color[i]); - } - - /* Debug: Sometimes you're getting a black output and just want to see - * if the FS is getting executed at all. Spam magenta into the color - * output. 
- */ - if (0) { - blend_color[0] = qir_uniform_f(c, 1.0); - blend_color[1] = qir_uniform_f(c, 0.0); - blend_color[2] = qir_uniform_f(c, 1.0); - blend_color[3] = qir_uniform_f(c, 0.5); - } - - struct qreg swizzled_outputs[4]; - for (int i = 0; i < 4; i++) { - swizzled_outputs[i] = get_swizzled_channel(c, blend_color, - format_swiz[i]); + struct qreg color; + if (c->output_color_index != -1) { + color = c->outputs[c->output_color_index]; + } else { + color = qir_uniform_ui(c, 0); } if (c->discard.file != QFILE_NULL) @@ -1463,47 +1124,7 @@ emit_frag_end(struct vc4_compile *c) qir_TLB_Z_WRITE(c, z); } - struct qreg packed_color = c->undef; - for (int i = 0; i < 4; i++) { - if (swizzled_outputs[i].file == QFILE_NULL) - continue; - if (packed_color.file == QFILE_NULL) { - packed_color = qir_PACK_8888_F(c, swizzled_outputs[i]); - } else { - packed_color = qir_PACK_8_F(c, - packed_color, - swizzled_outputs[i], - i); - } - } - - if (packed_color.file == QFILE_NULL) - packed_color = qir_uniform_ui(c, 0); - - if (c->fs_key->logicop_func != PIPE_LOGICOP_COPY) { - packed_color = vc4_logicop(c, packed_color, packed_dst_color); - } - - /* If the bit isn't set in the color mask, then just return the - * original dst color, instead. - */ - uint32_t colormask = 0xffffffff; - for (int i = 0; i < 4; i++) { - if (format_swiz[i] < 4 && - !(c->fs_key->blend.colormask & (1 << format_swiz[i]))) { - colormask &= ~(0xff << (i * 8)); - } - } - if (colormask != 0xffffffff) { - packed_color = qir_OR(c, - qir_AND(c, packed_color, - qir_uniform_ui(c, colormask)), - qir_AND(c, packed_dst_color, - qir_uniform_ui(c, ~colormask))); - } - - qir_emit(c, qir_inst(QOP_TLB_COLOR_WRITE, c->undef, - packed_color, c->undef)); + qir_TLB_COLOR_WRITE(c, color); } static void @@ -1695,6 +1316,7 @@ vc4_optimize_nir(struct nir_shader *s) progress = nir_opt_peephole_select(s) || progress; progress = nir_opt_algebraic(s) || progress; progress = nir_opt_constant_folding(s) || progress; + progress = nir_opt_undef(s) || progress; } while (progress); } @@ -1736,6 +1358,7 @@ ntq_setup_inputs(struct vc4_compile *c) unsigned loc = var->data.driver_location; assert(array_len == 1); + (void)array_len; resize_qreg_array(c, &c->inputs, &c->inputs_array_size, (loc + 1) * 4); @@ -1743,11 +1366,12 @@ ntq_setup_inputs(struct vc4_compile *c) if (semantic_name == TGSI_SEMANTIC_POSITION) { emit_fragcoord_input(c, loc); } else if (semantic_name == TGSI_SEMANTIC_FACE) { - emit_face_input(c, loc); + c->inputs[loc * 4 + 0] = qir_FRAG_REV_FLAG(c); } else if (semantic_name == TGSI_SEMANTIC_GENERIC && (c->fs_key->point_sprite_mask & (1 << semantic_index))) { - emit_point_coord_input(c, loc); + c->inputs[loc * 4 + 0] = c->point_x; + c->inputs[loc * 4 + 1] = c->point_y; } else { emit_fragment_input(c, loc, semantic_name, @@ -1770,6 +1394,13 @@ ntq_setup_outputs(struct vc4_compile *c) unsigned loc = var->data.driver_location * 4; assert(array_len == 1); + (void)array_len; + + /* NIR hack to pass through + * TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS */ + if (semantic_name == TGSI_SEMANTIC_COLOR && + semantic_index == -1) + semantic_index = 0; for (int i = 0; i < 4; i++) { add_output(c, @@ -1834,8 +1465,7 @@ ntq_setup_registers(struct vc4_compile *c, struct exec_list *list) static void ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr) { - struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, - instr->def.num_components); + struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); for (int i = 0; i < instr->def.num_components; i++) qregs[i] = 
qir_uniform_ui(c, instr->value.u[i]); @@ -1843,47 +1473,59 @@ } static void +ntq_emit_ssa_undef(struct vc4_compile *c, nir_ssa_undef_instr *instr) +{ + struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); + + /* QIR needs there to be *some* value, so pick 0 (same as for + * ntq_setup_registers()). + */ + for (int i = 0; i < instr->def.num_components; i++) + qregs[i] = qir_uniform_ui(c, 0); +} + +static void ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) { const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; struct qreg *dest = NULL; if (info->has_dest) { - dest = ntq_get_dest(c, instr->dest); + dest = ntq_get_dest(c, &instr->dest); } switch (instr->intrinsic) { case nir_intrinsic_load_uniform: - for (int i = 0; i < instr->num_components; i++) { - dest[i] = qir_uniform(c, QUNIFORM_UNIFORM, - instr->const_index[0] * 4 + i); + assert(instr->num_components == 1); + if (instr->const_index[0] < VC4_NIR_STATE_UNIFORM_OFFSET) { + *dest = qir_uniform(c, QUNIFORM_UNIFORM, + instr->const_index[0]); + } else { + *dest = qir_uniform(c, instr->const_index[0] - + VC4_NIR_STATE_UNIFORM_OFFSET, + 0); } break; case nir_intrinsic_load_uniform_indirect: - for (int i = 0; i < instr->num_components; i++) { - dest[i] = indirect_uniform_load(c, - ntq_get_src(c, instr->src[0], 0), - (instr->const_index[0] * - 4 + i) * sizeof(float)); - } + *dest = indirect_uniform_load(c, instr); break; case nir_intrinsic_load_input: - for (int i = 0; i < instr->num_components; i++) - dest[i] = c->inputs[instr->const_index[0] * 4 + i]; - + assert(instr->num_components == 1); + if (instr->const_index[0] == VC4_NIR_TLB_COLOR_READ_INPUT) { + *dest = qir_TLB_COLOR_READ(c); + } else { + *dest = c->inputs[instr->const_index[0]]; + } break; case nir_intrinsic_store_output: - for (int i = 0; i < instr->num_components; i++) { - c->outputs[instr->const_index[0] * 4 + i] = - qir_MOV(c, ntq_get_src(c, instr->src[0], i)); - } - c->num_outputs = MAX2(c->num_outputs, - instr->const_index[0] * 4 + - instr->num_components + 1); + assert(instr->num_components == 1); + c->outputs[instr->const_index[0]] = + qir_MOV(c, ntq_get_src(c, instr->src[0], 0)); + c->num_outputs = MAX2(c->num_outputs, instr->const_index[0] + 1); break; case nir_intrinsic_discard: @@ -1927,6 +1569,10 @@ ntq_emit_instr(struct vc4_compile *c, nir_instr *instr) ntq_emit_load_const(c, nir_instr_as_load_const(instr)); break; + case nir_instr_type_ssa_undef: + ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr)); + break; + case nir_instr_type_tex: ntq_emit_tex(c, nir_instr_as_tex(instr)); break; @@ -2084,13 +1730,17 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, c->s = tgsi_to_nir(tokens, &nir_options); nir_opt_global_to_local(c->s); nir_convert_to_ssa(c->s); + if (stage == QSTAGE_FRAG) + vc4_nir_lower_blend(c); + vc4_nir_lower_io(c); nir_lower_idiv(c->s); + nir_lower_load_const_to_scalar(c->s); vc4_optimize_nir(c->s); nir_remove_dead_variables(c->s); - nir_convert_from_ssa(c->s); + nir_convert_from_ssa(c->s, true); if (vc4_debug & VC4_DEBUG_SHADERDB) { fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d NIR instructions\n", @@ -2187,6 +1837,8 @@ copy_uniform_state_to_shader(struct vc4_compiled_shader *shader, memcpy(uinfo->contents, c->uniform_contents, count * sizeof(*uinfo->contents)); uinfo->num_texture_samples = c->num_texture_samples; + + vc4_set_shader_uniform_dirty_flags(shader); } static struct vc4_compiled_shader * @@ -2259,9 +1911,8 @@
vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, } copy_uniform_state_to_shader(shader, c); - shader->bo = vc4_bo_alloc_mem(vc4->screen, c->qpu_insts, - c->qpu_inst_count * sizeof(uint64_t), - "code"); + shader->bo = vc4_bo_alloc_shader(vc4->screen, c->qpu_insts, + c->qpu_inst_count * sizeof(uint64_t)); /* Copy the compiler UBO range state to the compiled shader, dropping * out arrays that were never referenced by an indirect load. @@ -2288,10 +1939,12 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, } } if (shader->ubo_size) { - fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n", - qir_get_stage_name(c->stage), - c->program_id, c->variant_id, - shader->ubo_size / 4); + if (vc4_debug & VC4_DEBUG_SHADERDB) { + fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n", + qir_get_stage_name(c->stage), + c->program_id, c->variant_id, + shader->ubo_size / 4); + } } qir_compile_destroy(c); @@ -2421,9 +2074,20 @@ vc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode) (prim_mode == PIPE_PRIM_POINTS && vc4->rasterizer->base.point_size_per_vertex); - vc4->prog.vs = vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base); + struct vc4_compiled_shader *vs = + vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base); + if (vs != vc4->prog.vs) { + vc4->prog.vs = vs; + vc4->dirty |= VC4_DIRTY_COMPILED_VS; + } + key->is_coord = true; - vc4->prog.cs = vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base); + struct vc4_compiled_shader *cs = + vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base); + if (cs != vc4->prog.cs) { + vc4->prog.cs = cs; + vc4->dirty |= VC4_DIRTY_COMPILED_CS; + } } void @@ -2490,305 +2154,6 @@ vc4_shader_state_delete(struct pipe_context *pctx, void *hwcso) free(so); } -static uint32_t translate_wrap(uint32_t p_wrap, bool using_nearest) -{ - switch (p_wrap) { - case PIPE_TEX_WRAP_REPEAT: - return 0; - case PIPE_TEX_WRAP_CLAMP_TO_EDGE: - return 1; - case PIPE_TEX_WRAP_MIRROR_REPEAT: - return 2; - case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - return 3; - case PIPE_TEX_WRAP_CLAMP: - return (using_nearest ? 
1 : 3); - default: - fprintf(stderr, "Unknown wrap mode %d\n", p_wrap); - assert(!"not reached"); - return 0; - } -} - -static void -write_texture_p0(struct vc4_context *vc4, - struct vc4_texture_stateobj *texstate, - uint32_t unit) -{ - struct pipe_sampler_view *texture = texstate->textures[unit]; - struct vc4_resource *rsc = vc4_resource(texture->texture); - - cl_reloc(vc4, &vc4->uniforms, rsc->bo, - VC4_SET_FIELD(rsc->slices[0].offset >> 12, VC4_TEX_P0_OFFSET) | - VC4_SET_FIELD(texture->u.tex.last_level - - texture->u.tex.first_level, VC4_TEX_P0_MIPLVLS) | - VC4_SET_FIELD(texture->target == PIPE_TEXTURE_CUBE, - VC4_TEX_P0_CMMODE) | - VC4_SET_FIELD(rsc->vc4_format & 15, VC4_TEX_P0_TYPE)); -} - -static void -write_texture_p1(struct vc4_context *vc4, - struct vc4_texture_stateobj *texstate, - uint32_t unit) -{ - struct pipe_sampler_view *texture = texstate->textures[unit]; - struct vc4_resource *rsc = vc4_resource(texture->texture); - struct pipe_sampler_state *sampler = texstate->samplers[unit]; - static const uint8_t minfilter_map[6] = { - VC4_TEX_P1_MINFILT_NEAR_MIP_NEAR, - VC4_TEX_P1_MINFILT_LIN_MIP_NEAR, - VC4_TEX_P1_MINFILT_NEAR_MIP_LIN, - VC4_TEX_P1_MINFILT_LIN_MIP_LIN, - VC4_TEX_P1_MINFILT_NEAREST, - VC4_TEX_P1_MINFILT_LINEAR, - }; - static const uint32_t magfilter_map[] = { - [PIPE_TEX_FILTER_NEAREST] = VC4_TEX_P1_MAGFILT_NEAREST, - [PIPE_TEX_FILTER_LINEAR] = VC4_TEX_P1_MAGFILT_LINEAR, - }; - - bool either_nearest = - (sampler->mag_img_filter == PIPE_TEX_MIPFILTER_NEAREST || - sampler->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST); - - cl_aligned_u32(&vc4->uniforms, - VC4_SET_FIELD(rsc->vc4_format >> 4, VC4_TEX_P1_TYPE4) | - VC4_SET_FIELD(texture->texture->height0 & 2047, - VC4_TEX_P1_HEIGHT) | - VC4_SET_FIELD(texture->texture->width0 & 2047, - VC4_TEX_P1_WIDTH) | - VC4_SET_FIELD(magfilter_map[sampler->mag_img_filter], - VC4_TEX_P1_MAGFILT) | - VC4_SET_FIELD(minfilter_map[sampler->min_mip_filter * 2 + - sampler->min_img_filter], - VC4_TEX_P1_MINFILT) | - VC4_SET_FIELD(translate_wrap(sampler->wrap_s, either_nearest), - VC4_TEX_P1_WRAP_S) | - VC4_SET_FIELD(translate_wrap(sampler->wrap_t, either_nearest), - VC4_TEX_P1_WRAP_T)); -} - -static void -write_texture_p2(struct vc4_context *vc4, - struct vc4_texture_stateobj *texstate, - uint32_t data) -{ - uint32_t unit = data & 0xffff; - struct pipe_sampler_view *texture = texstate->textures[unit]; - struct vc4_resource *rsc = vc4_resource(texture->texture); - - cl_aligned_u32(&vc4->uniforms, - VC4_SET_FIELD(VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE, - VC4_TEX_P2_PTYPE) | - VC4_SET_FIELD(rsc->cube_map_stride >> 12, VC4_TEX_P2_CMST) | - VC4_SET_FIELD((data >> 16) & 1, VC4_TEX_P2_BSLOD)); -} - - -#define SWIZ(x,y,z,w) { \ - UTIL_FORMAT_SWIZZLE_##x, \ - UTIL_FORMAT_SWIZZLE_##y, \ - UTIL_FORMAT_SWIZZLE_##z, \ - UTIL_FORMAT_SWIZZLE_##w \ -} - -static void -write_texture_border_color(struct vc4_context *vc4, - struct vc4_texture_stateobj *texstate, - uint32_t unit) -{ - struct pipe_sampler_state *sampler = texstate->samplers[unit]; - struct pipe_sampler_view *texture = texstate->textures[unit]; - struct vc4_resource *rsc = vc4_resource(texture->texture); - union util_color uc; - - const struct util_format_description *tex_format_desc = - util_format_description(texture->format); - - float border_color[4]; - for (int i = 0; i < 4; i++) - border_color[i] = sampler->border_color.f[i]; - if (util_format_is_srgb(texture->format)) { - for (int i = 0; i < 3; i++) - border_color[i] = - util_format_linear_to_srgb_float(border_color[i]); - } - - /* Turn the border 
color into the layout of channels that it would - * have when stored as texture contents. - */ - float storage_color[4]; - util_format_unswizzle_4f(storage_color, - border_color, - tex_format_desc->swizzle); - - /* Now, pack so that when the vc4_format-sampled texture contents are - * replaced with our border color, the vc4_get_format_swizzle() - * swizzling will get the right channels. - */ - if (util_format_is_depth_or_stencil(texture->format)) { - uc.ui[0] = util_pack_z(PIPE_FORMAT_Z24X8_UNORM, - sampler->border_color.f[0]) << 8; - } else { - switch (rsc->vc4_format) { - default: - case VC4_TEXTURE_TYPE_RGBA8888: - util_pack_color(storage_color, - PIPE_FORMAT_R8G8B8A8_UNORM, &uc); - break; - case VC4_TEXTURE_TYPE_RGBA4444: - util_pack_color(storage_color, - PIPE_FORMAT_A8B8G8R8_UNORM, &uc); - break; - case VC4_TEXTURE_TYPE_RGB565: - util_pack_color(storage_color, - PIPE_FORMAT_B8G8R8A8_UNORM, &uc); - break; - case VC4_TEXTURE_TYPE_ALPHA: - uc.ui[0] = float_to_ubyte(storage_color[0]) << 24; - break; - case VC4_TEXTURE_TYPE_LUMALPHA: - uc.ui[0] = ((float_to_ubyte(storage_color[1]) << 24) | - (float_to_ubyte(storage_color[0]) << 0)); - break; - } - } - - cl_aligned_u32(&vc4->uniforms, uc.ui[0]); -} - -static uint32_t -get_texrect_scale(struct vc4_texture_stateobj *texstate, - enum quniform_contents contents, - uint32_t data) -{ - struct pipe_sampler_view *texture = texstate->textures[data]; - uint32_t dim; - - if (contents == QUNIFORM_TEXRECT_SCALE_X) - dim = texture->texture->width0; - else - dim = texture->texture->height0; - - return fui(1.0f / dim); -} - -static struct vc4_bo * -vc4_upload_ubo(struct vc4_context *vc4, struct vc4_compiled_shader *shader, - const uint32_t *gallium_uniforms) -{ - if (!shader->ubo_size) - return NULL; - - struct vc4_bo *ubo = vc4_bo_alloc(vc4->screen, shader->ubo_size, "ubo"); - uint32_t *data = vc4_bo_map(ubo); - for (uint32_t i = 0; i < shader->num_ubo_ranges; i++) { - memcpy(data + shader->ubo_ranges[i].dst_offset, - gallium_uniforms + shader->ubo_ranges[i].src_offset, - shader->ubo_ranges[i].size); - } - - return ubo; -} - -void -vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, - struct vc4_constbuf_stateobj *cb, - struct vc4_texture_stateobj *texstate) -{ - struct vc4_shader_uniform_info *uinfo = &shader->uniforms; - const uint32_t *gallium_uniforms = cb->cb[0].user_buffer; - struct vc4_bo *ubo = vc4_upload_ubo(vc4, shader, gallium_uniforms); - - cl_ensure_space(&vc4->uniforms, (uinfo->count + - uinfo->num_texture_samples) * 4); - - cl_start_shader_reloc(&vc4->uniforms, uinfo->num_texture_samples); - - for (int i = 0; i < uinfo->count; i++) { - - switch (uinfo->contents[i]) { - case QUNIFORM_CONSTANT: - cl_aligned_u32(&vc4->uniforms, uinfo->data[i]); - break; - case QUNIFORM_UNIFORM: - cl_aligned_u32(&vc4->uniforms, - gallium_uniforms[uinfo->data[i]]); - break; - case QUNIFORM_VIEWPORT_X_SCALE: - cl_aligned_f(&vc4->uniforms, vc4->viewport.scale[0] * 16.0f); - break; - case QUNIFORM_VIEWPORT_Y_SCALE: - cl_aligned_f(&vc4->uniforms, vc4->viewport.scale[1] * 16.0f); - break; - - case QUNIFORM_VIEWPORT_Z_OFFSET: - cl_aligned_f(&vc4->uniforms, vc4->viewport.translate[2]); - break; - case QUNIFORM_VIEWPORT_Z_SCALE: - cl_aligned_f(&vc4->uniforms, vc4->viewport.scale[2]); - break; - - case QUNIFORM_USER_CLIP_PLANE: - cl_aligned_f(&vc4->uniforms, - vc4->clip.ucp[uinfo->data[i] / 4][uinfo->data[i] % 4]); - break; - - case QUNIFORM_TEXTURE_CONFIG_P0: - write_texture_p0(vc4, texstate, uinfo->data[i]); - break; - - case 
QUNIFORM_TEXTURE_CONFIG_P1: - write_texture_p1(vc4, texstate, uinfo->data[i]); - break; - - case QUNIFORM_TEXTURE_CONFIG_P2: - write_texture_p2(vc4, texstate, uinfo->data[i]); - break; - - case QUNIFORM_UBO_ADDR: - cl_aligned_reloc(vc4, &vc4->uniforms, ubo, 0); - break; - - case QUNIFORM_TEXTURE_BORDER_COLOR: - write_texture_border_color(vc4, texstate, uinfo->data[i]); - break; - - case QUNIFORM_TEXRECT_SCALE_X: - case QUNIFORM_TEXRECT_SCALE_Y: - cl_aligned_u32(&vc4->uniforms, - get_texrect_scale(texstate, - uinfo->contents[i], - uinfo->data[i])); - break; - - case QUNIFORM_BLEND_CONST_COLOR: - cl_aligned_f(&vc4->uniforms, - CLAMP(vc4->blend_color.color[uinfo->data[i]], 0, 1)); - break; - - case QUNIFORM_STENCIL: - cl_aligned_u32(&vc4->uniforms, - vc4->zsa->stencil_uniforms[uinfo->data[i]] | - (uinfo->data[i] <= 1 ? - (vc4->stencil_ref.ref_value[uinfo->data[i]] << 8) : - 0)); - break; - - case QUNIFORM_ALPHA_REF: - cl_aligned_f(&vc4->uniforms, - vc4->zsa->base.alpha.ref_value); - break; - } -#if 0 - uint32_t written_val = *(uint32_t *)(vc4->uniforms.next - 4); - fprintf(stderr, "%p: %d / 0x%08x (%f)\n", - shader, i, written_val, uif(written_val)); -#endif - } -} - static void vc4_fp_state_bind(struct pipe_context *pctx, void *hwcso) { diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index 1c96ef4795f..254140a72f5 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -96,10 +96,6 @@ static const struct qir_op_info qir_op_info[] = { [QOP_TEX_B] = { "tex_b", 0, 2 }, [QOP_TEX_DIRECT] = { "tex_direct", 0, 2 }, [QOP_TEX_RESULT] = { "tex_result", 1, 0, true }, - [QOP_R4_UNPACK_A] = { "r4_unpack_a", 1, 1 }, - [QOP_R4_UNPACK_B] = { "r4_unpack_b", 1, 1 }, - [QOP_R4_UNPACK_C] = { "r4_unpack_c", 1, 1 }, - [QOP_R4_UNPACK_D] = { "r4_unpack_d", 1, 1 }, [QOP_UNPACK_8A_F] = { "unpack_8a_f", 1, 1 }, [QOP_UNPACK_8B_F] = { "unpack_8b_f", 1, 1 }, [QOP_UNPACK_8C_F] = { "unpack_8c_f", 1, 1 }, @@ -234,20 +230,6 @@ qir_writes_r4(struct qinst *inst) } } -bool -qir_reads_r4(struct qinst *inst) -{ - switch (inst->op) { - case QOP_R4_UNPACK_A: - case QOP_R4_UNPACK_B: - case QOP_R4_UNPACK_C: - case QOP_R4_UNPACK_D: - return true; - default: - return false; - } -} - static void qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write) { diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index 732cfd0b306..cade795c12a 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -36,6 +36,11 @@ #include "util/list.h" #include "util/u_math.h" +#include "vc4_screen.h" +#include "pipe/p_state.h" + +struct nir_builder; + enum qfile { QFILE_NULL, QFILE_TEMP, @@ -155,10 +160,6 @@ enum qop { * the destination */ QOP_TEX_RESULT, - QOP_R4_UNPACK_A, - QOP_R4_UNPACK_B, - QOP_R4_UNPACK_C, - QOP_R4_UNPACK_D }; struct queued_qpu_inst { @@ -243,7 +244,11 @@ enum quniform_contents { QUNIFORM_TEXTURE_BORDER_COLOR, - QUNIFORM_BLEND_CONST_COLOR, + QUNIFORM_BLEND_CONST_COLOR_X, + QUNIFORM_BLEND_CONST_COLOR_Y, + QUNIFORM_BLEND_CONST_COLOR_Z, + QUNIFORM_BLEND_CONST_COLOR_W, + QUNIFORM_STENCIL, QUNIFORM_ALPHA_REF, @@ -280,6 +285,52 @@ struct vc4_compiler_ubo_range { bool used; }; +struct vc4_key { + struct vc4_uncompiled_shader *shader_state; + struct { + enum pipe_format format; + unsigned compare_mode:1; + unsigned compare_func:3; + unsigned wrap_s:3; + unsigned wrap_t:3; + uint8_t swizzle[4]; + } tex[VC4_MAX_TEXTURE_SAMPLERS]; + uint8_t ucp_enables; +}; + +struct vc4_fs_key { + struct vc4_key base; + enum 
pipe_format color_format; + bool depth_enabled; + bool stencil_enabled; + bool stencil_twoside; + bool stencil_full_writemasks; + bool is_points; + bool is_lines; + bool alpha_test; + bool point_coord_upper_left; + bool light_twoside; + uint8_t alpha_test_func; + uint8_t logicop_func; + uint32_t point_sprite_mask; + + struct pipe_rt_blend_state blend; +}; + +struct vc4_vs_key { + struct vc4_key base; + + /** + * This is a proxy for the array of FS input semantics, which is + * larger than we would want to put in the key. + */ + uint64_t compiled_fs_id; + + enum pipe_format attr_formats[8]; + bool is_coord; + bool per_vertex_point_size; +}; + struct vc4_compile { struct vc4_context *vc4; nir_shader *s; @@ -369,6 +420,16 @@ struct vc4_compile { uint32_t variant_id; }; +/* Special nir_load_input intrinsic index for loading the current TLB + * destination color. + */ +#define VC4_NIR_TLB_COLOR_READ_INPUT 2000000000 + +/* Special offset for nir_load_uniform values to get a QUNIFORM_* + * state-dependent value. + */ +#define VC4_NIR_STATE_UNIFORM_OFFSET 2000000000 + struct vc4_compile *qir_compile_init(void); void qir_compile_destroy(struct vc4_compile *c); struct qinst *qir_inst(enum qop op, struct qreg dst, @@ -393,7 +454,6 @@ bool qir_is_multi_instruction(struct qinst *inst); bool qir_is_tex(struct qinst *inst); bool qir_depends_on_flags(struct qinst *inst); bool qir_writes_r4(struct qinst *inst); -bool qir_reads_r4(struct qinst *inst); bool qir_src_needs_a_file(struct qinst *inst); struct qreg qir_follow_movs(struct vc4_compile *c, struct qreg reg); @@ -409,6 +469,12 @@ bool qir_opt_cse(struct vc4_compile *c); bool qir_opt_dead_code(struct vc4_compile *c); bool qir_opt_small_immediates(struct vc4_compile *c); bool qir_opt_vpm_writes(struct vc4_compile *c); +void vc4_nir_lower_blend(struct vc4_compile *c); +void vc4_nir_lower_io(struct vc4_compile *c); +nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b, + enum quniform_contents contents); +nir_ssa_def *vc4_nir_get_swizzled_channel(struct nir_builder *b, + nir_ssa_def **srcs, int swiz); void qir_lower_uniforms(struct vc4_compile *c); void qpu_schedule_instructions(struct vc4_compile *c); @@ -523,27 +589,12 @@ QIR_ALU0(FRAG_W) QIR_ALU0(FRAG_REV_FLAG) QIR_ALU0(TEX_RESULT) QIR_ALU0(TLB_COLOR_READ) +QIR_NODST_1(TLB_COLOR_WRITE) QIR_NODST_1(TLB_Z_WRITE) QIR_NODST_1(TLB_DISCARD_SETUP) QIR_NODST_1(TLB_STENCIL_SETUP) static inline struct qreg -qir_R4_UNPACK(struct vc4_compile *c, struct qreg r4, int i) -{ - struct qreg t = qir_get_temp(c); - qir_emit(c, qir_inst(QOP_R4_UNPACK_A + i, t, r4, c->undef)); - return t; -} - -static inline struct qreg -qir_SEL_X_0_COND(struct vc4_compile *c, int i) -{ - struct qreg t = qir_get_temp(c); - qir_emit(c, qir_inst(QOP_R4_UNPACK_A + i, t, c->undef, c->undef)); - return t; -} - -static inline struct qreg qir_UNPACK_8_F(struct vc4_compile *c, struct qreg src, int i) { struct qreg t = qir_get_temp(c); diff --git a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c index 910c89dca79..f087c3b81b5 100644 --- a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c +++ b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c @@ -52,7 +52,7 @@ static void add_uniform(struct hash_table *ht, struct qreg reg) { struct hash_entry *entry; - void *key = (void *)(uintptr_t)reg.index; + void *key = (void *)(uintptr_t)(reg.index + 1); entry = _mesa_hash_table_search(ht, key); if (entry) { @@ -66,7 +66,7 @@ static void remove_uniform(struct hash_table *ht, struct qreg reg) { struct 
hash_entry *entry; - void *key = (void *)(uintptr_t)reg.index; + void *key = (void *)(uintptr_t)(reg.index + 1); entry = _mesa_hash_table_search(ht, key); assert(entry); @@ -122,7 +122,7 @@ qir_lower_uniforms(struct vc4_compile *c) struct hash_entry *entry; hash_table_foreach(ht, entry) { uint32_t count = (uintptr_t)entry->data; - uint32_t index = (uintptr_t)entry->key; + uint32_t index = (uintptr_t)entry->key - 1; if (count > max_count) { max_count = count; max_index = index; diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h index c9ab6344589..fbb90ba12a0 100644 --- a/src/gallium/drivers/vc4/vc4_qpu.h +++ b/src/gallium/drivers/vc4/vc4_qpu.h @@ -122,23 +122,23 @@ static inline struct qpu_reg qpu_r3(void) { return qpu_rn(3); } static inline struct qpu_reg qpu_r4(void) { return qpu_rn(4); } static inline struct qpu_reg qpu_r5(void) { return qpu_rn(5); } -uint64_t qpu_NOP(void); -uint64_t qpu_a_MOV(struct qpu_reg dst, struct qpu_reg src); -uint64_t qpu_m_MOV(struct qpu_reg dst, struct qpu_reg src); +uint64_t qpu_NOP(void) ATTRIBUTE_CONST; +uint64_t qpu_a_MOV(struct qpu_reg dst, struct qpu_reg src) ATTRIBUTE_CONST; +uint64_t qpu_m_MOV(struct qpu_reg dst, struct qpu_reg src) ATTRIBUTE_CONST; uint64_t qpu_a_alu2(enum qpu_op_add op, struct qpu_reg dst, - struct qpu_reg src0, struct qpu_reg src1); + struct qpu_reg src0, struct qpu_reg src1) ATTRIBUTE_CONST; uint64_t qpu_m_alu2(enum qpu_op_mul op, struct qpu_reg dst, - struct qpu_reg src0, struct qpu_reg src1); -uint64_t qpu_merge_inst(uint64_t a, uint64_t b); -uint64_t qpu_load_imm_ui(struct qpu_reg dst, uint32_t val); -uint64_t qpu_set_sig(uint64_t inst, uint32_t sig); -uint64_t qpu_set_cond_add(uint64_t inst, uint32_t cond); -uint64_t qpu_set_cond_mul(uint64_t inst, uint32_t cond); -uint32_t qpu_encode_small_immediate(uint32_t i); - -bool qpu_waddr_is_tlb(uint32_t waddr); -bool qpu_inst_is_tlb(uint64_t inst); -int qpu_num_sf_accesses(uint64_t inst); + struct qpu_reg src0, struct qpu_reg src1) ATTRIBUTE_CONST; +uint64_t qpu_merge_inst(uint64_t a, uint64_t b) ATTRIBUTE_CONST; +uint64_t qpu_load_imm_ui(struct qpu_reg dst, uint32_t val) ATTRIBUTE_CONST; +uint64_t qpu_set_sig(uint64_t inst, uint32_t sig) ATTRIBUTE_CONST; +uint64_t qpu_set_cond_add(uint64_t inst, uint32_t cond) ATTRIBUTE_CONST; +uint64_t qpu_set_cond_mul(uint64_t inst, uint32_t cond) ATTRIBUTE_CONST; +uint32_t qpu_encode_small_immediate(uint32_t i) ATTRIBUTE_CONST; + +bool qpu_waddr_is_tlb(uint32_t waddr) ATTRIBUTE_CONST; +bool qpu_inst_is_tlb(uint64_t inst) ATTRIBUTE_CONST; +int qpu_num_sf_accesses(uint64_t inst) ATTRIBUTE_CONST; void qpu_serialize_one_inst(struct vc4_compile *c, uint64_t inst); static inline uint64_t diff --git a/src/gallium/drivers/vc4/vc4_qpu_disasm.c b/src/gallium/drivers/vc4/vc4_qpu_disasm.c index 55e0e6139b5..00aeb300a9b 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_disasm.c +++ b/src/gallium/drivers/vc4/vc4_qpu_disasm.c @@ -225,7 +225,7 @@ static const char *qpu_condflags[] = { }; #define DESC(array, index) \ - ((index > ARRAY_SIZE(array) || !(array)[index]) ? \ + ((index >= ARRAY_SIZE(array) || !(array)[index]) ? \ "???" 
: (array)[index]) static const char * diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index 99afe4b8798..f324056258c 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -234,6 +234,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) case QFILE_VPM: assert((int)qinst->src[i].index >= last_vpm_read_index); + (void)last_vpm_read_index; last_vpm_read_index = qinst->src[i].index; src[i] = qpu_ra(QPU_R_VPM); break; @@ -319,7 +320,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) abort(); } - queue(c, qpu_a_MOV(dst, qpu_r4())); + if (dst.mux != QPU_MUX_R4) + queue(c, qpu_a_MOV(dst, qpu_r4())); break; @@ -402,6 +404,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) *last_inst(c) = qpu_set_sig(*last_inst(c), QPU_SIG_COLOR_LOAD); + if (dst.mux != QPU_MUX_R4) + queue(c, qpu_a_MOV(dst, qpu_r4())); break; case QOP_TLB_COLOR_WRITE: @@ -451,21 +455,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) queue(c, qpu_NOP()); *last_inst(c) = qpu_set_sig(*last_inst(c), QPU_SIG_LOAD_TMU0); - - break; - - case QOP_R4_UNPACK_A: - case QOP_R4_UNPACK_B: - case QOP_R4_UNPACK_C: - case QOP_R4_UNPACK_D: - assert(src[0].mux == QPU_MUX_R4); - queue(c, qpu_a_MOV(dst, src[0])); - *last_inst(c) |= QPU_PM; - *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A + - (qinst->op - - QOP_R4_UNPACK_A), - QPU_UNPACK); - + if (dst.mux != QPU_MUX_R4) + queue(c, qpu_a_MOV(dst, qpu_r4())); break; case QOP_UNPACK_8A_F: @@ -474,20 +465,30 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) case QOP_UNPACK_8D_F: case QOP_UNPACK_16A_F: case QOP_UNPACK_16B_F: { - assert(src[0].mux == QPU_MUX_A); - - /* Since we're setting the pack bits, if the - * destination is in A it would get re-packed. - */ - queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ? - qpu_rb(31) : dst), - src[0], src[0])); - *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op - - QOP_UNPACK_8A_F], - QPU_UNPACK); + if (src[0].mux == QPU_MUX_R4) { + queue(c, qpu_a_MOV(dst, src[0])); + *last_inst(c) |= QPU_PM; + *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A + + (qinst->op - + QOP_UNPACK_8A_F), + QPU_UNPACK); + } else { + assert(src[0].mux == QPU_MUX_A); - if (dst.mux == QPU_MUX_A) { - queue(c, qpu_a_MOV(dst, qpu_rb(31))); + /* Since we're setting the pack bits, if the + * destination is in A it would get re-packed. + */ + queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ? + qpu_rb(31) : dst), + src[0], src[0])); + *last_inst(c) |= + QPU_SET_FIELD(unpack_map[qinst->op - + QOP_UNPACK_8A_F], + QPU_UNPACK); + + if (dst.mux == QPU_MUX_A) { + queue(c, qpu_a_MOV(dst, qpu_rb(31))); + } } } break; diff --git a/src/gallium/drivers/vc4/vc4_qpu_validate.c b/src/gallium/drivers/vc4/vc4_qpu_validate.c index 8471edbf62c..9cf6841f41c 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_validate.c +++ b/src/gallium/drivers/vc4/vc4_qpu_validate.c @@ -23,6 +23,13 @@ #include "vc4_qpu.h" +#ifdef NDEBUG +/* Since most of our code is used in assert()s, don't warn about dead code. 
*/ +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-function" +#endif + static bool writes_reg(uint64_t inst, uint32_t w) { diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c index 3b0b890b66a..a29db1f3abe 100644 --- a/src/gallium/drivers/vc4/vc4_register_allocate.c +++ b/src/gallium/drivers/vc4/vc4_register_allocate.c @@ -116,6 +116,8 @@ vc4_alloc_reg_set(struct vc4_context *vc4) vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs)); vc4->reg_class_any = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_r4_or_a = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_a = ra_alloc_reg_class(vc4->regs); for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) { /* Reserve ra31/rb31 for spilling fixup_raddr_conflict() in * vc4_qpu_emit.c @@ -126,15 +128,18 @@ vc4_alloc_reg_set(struct vc4_context *vc4) /* R4 can't be written as a general purpose register. (it's * TMU_NOSWAP as a write address). */ - if (vc4_regs[i].mux == QPU_MUX_R4) + if (vc4_regs[i].mux == QPU_MUX_R4) { + ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i); continue; + } ra_class_add_reg(vc4->regs, vc4->reg_class_any, i); } - vc4->reg_class_a = ra_alloc_reg_class(vc4->regs); - for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) + for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) { ra_class_add_reg(vc4->regs, vc4->reg_class_a, i); + ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i); + } ra_set_finalize(vc4->regs, NULL); } @@ -153,6 +158,10 @@ node_to_temp_priority(const void *in_a, const void *in_b) return a->priority - b->priority; } +#define CLASS_BIT_A (1 << 0) +#define CLASS_BIT_B_OR_ACC (1 << 1) +#define CLASS_BIT_R4 (1 << 2) + /** * Returns a mapping from QFILE_TEMP indices to struct qpu_regs. * @@ -165,6 +174,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) uint32_t temp_to_node[c->num_temps]; uint32_t def[c->num_temps]; uint32_t use[c->num_temps]; + uint8_t class_bits[c->num_temps]; struct qpu_reg *temp_registers = calloc(c->num_temps, sizeof(*temp_registers)); memset(def, 0, sizeof(def)); @@ -181,10 +191,6 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) struct ra_graph *g = ra_alloc_interference_graph(vc4->regs, c->num_temps); - for (uint32_t i = 0; i < c->num_temps; i++) { - ra_set_node_class(g, i, vc4->reg_class_any); - } - /* Compute the live ranges so we can figure out interference. */ uint32_t ip = 0; @@ -223,8 +229,33 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) temp_to_node[map[i].temp] = i; } - /* Figure out our register classes and preallocated registers*/ + /* Figure out our register classes and preallocated registers. We + * start with any temp being able to be in any file, then instructions + * incrementally remove bits that the temp definitely can't be in. + */ + memset(class_bits, + CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4, + sizeof(class_bits)); + + ip = 0; list_for_each_entry(struct qinst, inst, &c->instructions, link) { + if (qir_writes_r4(inst)) { + /* This instruction writes r4 (and optionally moves + * its result to a temp), so nothing else can be + * stored in r4 across it. + */ + for (int i = 0; i < c->num_temps; i++) { + if (def[i] < ip && use[i] > ip) + class_bits[i] &= ~CLASS_BIT_R4; + } + } else { + /* R4 can't be written as a general purpose + * register. (it's TMU_NOSWAP as a write address). 
+ */ + if (inst->dst.file == QFILE_TEMP) + class_bits[inst->dst.index] &= ~CLASS_BIT_R4; + } + switch (inst->op) { case QOP_FRAG_Z: ra_set_node_reg(g, temp_to_node[inst->dst.index], @@ -236,17 +267,9 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2); break; - case QOP_TEX_RESULT: - case QOP_TLB_COLOR_READ: - assert(vc4_regs[ACC_INDEX + 4].mux == QPU_MUX_R4); - ra_set_node_reg(g, temp_to_node[inst->dst.index], - ACC_INDEX + 4); - break; - case QOP_PACK_SCALED: /* The pack flags require an A-file dst register. */ - ra_set_node_class(g, temp_to_node[inst->dst.index], - vc4->reg_class_a); + class_bits[inst->dst.index] &= CLASS_BIT_A; break; default: @@ -254,8 +277,30 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) } if (qir_src_needs_a_file(inst)) { - ra_set_node_class(g, temp_to_node[inst->src[0].index], - vc4->reg_class_a); + class_bits[inst->src[0].index] &= CLASS_BIT_A; + } + ip++; + } + + for (uint32_t i = 0; i < c->num_temps; i++) { + int node = temp_to_node[i]; + + switch (class_bits[i]) { + case CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4: + case CLASS_BIT_A | CLASS_BIT_B_OR_ACC: + ra_set_node_class(g, node, vc4->reg_class_any); + break; + case CLASS_BIT_A | CLASS_BIT_R4: + ra_set_node_class(g, node, vc4->reg_class_r4_or_a); + break; + case CLASS_BIT_A: + ra_set_node_class(g, node, vc4->reg_class_a); + break; + default: + fprintf(stderr, "temp %d: bad class bits: 0x%x\n", + i, class_bits[i]); + abort(); + break; } } @@ -270,7 +315,11 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) } bool ok = ra_allocate(g); - assert(ok); + if (!ok) { + fprintf(stderr, "Failed to register allocate:\n"); + qir_dump(c); + abort(); + } for (uint32_t i = 0; i < c->num_temps; i++) { temp_registers[i] = vc4_regs[ra_get_node_reg(g, temp_to_node[i])]; diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c index cab76406055..5d5166fd818 100644 --- a/src/gallium/drivers/vc4/vc4_resource.c +++ b/src/gallium/drivers/vc4/vc4_resource.c @@ -102,6 +102,12 @@ vc4_resource_transfer_map(struct pipe_context *pctx, if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) { vc4_resource_bo_alloc(rsc); + + /* If it might be bound as one of our vertex buffers, make + * sure we re-emit vertex buffer state. 
+ */ + if (prsc->bind & PIPE_BIND_VERTEX_BUFFER) + vc4->dirty |= VC4_DIRTY_VTXBUF; } else if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { if (vc4_cl_references_bo(pctx, rsc->bo)) { if ((usage & PIPE_TRANSFER_DISCARD_RANGE) && @@ -110,6 +116,8 @@ vc4_resource_transfer_map(struct pipe_context *pctx, prsc->height0 == box->height && prsc->depth0 == box->depth) { vc4_resource_bo_alloc(rsc); + if (prsc->bind & PIPE_BIND_VERTEX_BUFFER) + vc4->dirty |= VC4_DIRTY_VTXBUF; } else { vc4_flush(pctx); } diff --git a/src/gallium/drivers/vc4/vc4_resource.h b/src/gallium/drivers/vc4/vc4_resource.h index ab8f5d3cd55..87571b75e8b 100644 --- a/src/gallium/drivers/vc4/vc4_resource.h +++ b/src/gallium/drivers/vc4/vc4_resource.h @@ -82,19 +82,19 @@ struct vc4_resource { struct pipe_resource *shadow_parent; }; -static INLINE struct vc4_resource * +static inline struct vc4_resource * vc4_resource(struct pipe_resource *prsc) { return (struct vc4_resource *)prsc; } -static INLINE struct vc4_surface * +static inline struct vc4_surface * vc4_surface(struct pipe_surface *psurf) { return (struct vc4_surface *)psurf; } -static INLINE struct vc4_transfer * +static inline struct vc4_transfer * vc4_transfer(struct pipe_transfer *ptrans) { return (struct vc4_transfer *)ptrans; diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index f63bead0fbb..2dee1d40e5f 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -176,6 +176,10 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 0; /* Stream output. */ @@ -489,6 +493,12 @@ vc4_screen_bo_get_handle(struct pipe_screen *pscreen, { whandle->stride = stride; + /* If we're passing some reference to our BO out to some other part of + * the system, then we can't do any optimizations about only us being + * the ones seeing it (like BO caching or shadow update avoidance). + */ + bo->private = false; + switch (whandle->type) { case DRM_API_HANDLE_TYPE_SHARED: return vc4_bo_flink(bo, &whandle->handle); diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c index b58013dd2ee..7cfd236349d 100644 --- a/src/gallium/drivers/vc4/vc4_simulator.c +++ b/src/gallium/drivers/vc4/vc4_simulator.c @@ -74,11 +74,12 @@ vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_exec_info *exec) struct vc4_bo **bos = vc4->bo_pointers.base; exec->bo_count = args->bo_handle_count; - exec->bo = calloc(exec->bo_count, sizeof(struct vc4_bo_exec_state)); + exec->bo = calloc(exec->bo_count, sizeof(void *)); for (int i = 0; i < exec->bo_count; i++) { struct vc4_bo *bo = bos[i]; struct drm_gem_cma_object *obj = vc4_wrap_bo_with_cma(dev, bo); + struct drm_vc4_bo *drm_bo = to_vc4_bo(&obj->base); #if 0 fprintf(stderr, "bo hindex %d: %s\n", i, bo->name); #endif @@ -86,7 +87,16 @@ vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_exec_info *exec) vc4_bo_map(bo); memcpy(obj->vaddr, bo->map, bo->size); - exec->bo[i].bo = obj; + exec->bo[i] = obj; + + /* The kernel does this validation at shader create ioctl + * time. 
+ */ + if (strcmp(bo->name, "code") == 0) { + drm_bo->validated_shader = vc4_validate_shader(obj); + if (!drm_bo->validated_shader) + abort(); + } } return 0; } @@ -95,7 +105,7 @@ static int vc4_simulator_unpin_bos(struct vc4_exec_info *exec) { for (int i = 0; i < exec->bo_count; i++) { - struct drm_gem_cma_object *obj = exec->bo[i].bo; + struct drm_gem_cma_object *obj = exec->bo[i]; struct vc4_bo *bo = to_vc4_bo(&obj->base)->bo; memcpy(bo->map, obj->vaddr, bo->size); diff --git a/src/gallium/drivers/vc4/vc4_simulator_validate.h b/src/gallium/drivers/vc4/vc4_simulator_validate.h index 2bb36b253bb..68ace0216aa 100644 --- a/src/gallium/drivers/vc4/vc4_simulator_validate.h +++ b/src/gallium/drivers/vc4/vc4_simulator_validate.h @@ -78,6 +78,7 @@ struct drm_gem_cma_object { struct drm_vc4_bo { struct drm_gem_cma_object base; struct vc4_bo *bo; + struct vc4_validated_shader_info *validated_shader; struct list_head unref_head; }; diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c index 4a1d4c3a4d6..8a759c2ca4c 100644 --- a/src/gallium/drivers/vc4/vc4_state.c +++ b/src/gallium/drivers/vc4/vc4_state.c @@ -107,7 +107,7 @@ vc4_create_rasterizer_state(struct pipe_context *pctx, /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835, * BCM21553). */ - so->point_size = MAX2(cso->point_size, .125); + so->point_size = MAX2(cso->point_size, .125f); if (cso->front_ccw) so->config_bits[0] |= VC4_CONFIG_BITS_CW_PRIMITIVES; @@ -461,11 +461,64 @@ vc4_get_stage_tex(struct vc4_context *vc4, unsigned shader) } } +static uint32_t translate_wrap(uint32_t p_wrap, bool using_nearest) +{ + switch (p_wrap) { + case PIPE_TEX_WRAP_REPEAT: + return 0; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + return 1; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + return 2; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + return 3; + case PIPE_TEX_WRAP_CLAMP: + return (using_nearest ? 
1 : 3); + default: + fprintf(stderr, "Unknown wrap mode %d\n", p_wrap); + assert(!"not reached"); + return 0; + } +} + static void * vc4_create_sampler_state(struct pipe_context *pctx, const struct pipe_sampler_state *cso) { - return vc4_generic_cso_state_create(cso, sizeof(*cso)); + static const uint8_t minfilter_map[6] = { + VC4_TEX_P1_MINFILT_NEAR_MIP_NEAR, + VC4_TEX_P1_MINFILT_LIN_MIP_NEAR, + VC4_TEX_P1_MINFILT_NEAR_MIP_LIN, + VC4_TEX_P1_MINFILT_LIN_MIP_LIN, + VC4_TEX_P1_MINFILT_NEAREST, + VC4_TEX_P1_MINFILT_LINEAR, + }; + static const uint32_t magfilter_map[] = { + [PIPE_TEX_FILTER_NEAREST] = VC4_TEX_P1_MAGFILT_NEAREST, + [PIPE_TEX_FILTER_LINEAR] = VC4_TEX_P1_MAGFILT_LINEAR, + }; + bool either_nearest = + (cso->mag_img_filter == PIPE_TEX_MIPFILTER_NEAREST || + cso->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST); + struct vc4_sampler_state *so = CALLOC_STRUCT(vc4_sampler_state); + + if (!so) + return NULL; + + memcpy(so, cso, sizeof(*cso)); + + so->texture_p1 = + (VC4_SET_FIELD(magfilter_map[cso->mag_img_filter], + VC4_TEX_P1_MAGFILT) | + VC4_SET_FIELD(minfilter_map[cso->min_mip_filter * 2 + + cso->min_img_filter], + VC4_TEX_P1_MINFILT) | + VC4_SET_FIELD(translate_wrap(cso->wrap_s, either_nearest), + VC4_TEX_P1_WRAP_S) | + VC4_SET_FIELD(translate_wrap(cso->wrap_t, either_nearest), + VC4_TEX_P1_WRAP_T)); + + return so; } static void @@ -499,13 +552,13 @@ static struct pipe_sampler_view * vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, const struct pipe_sampler_view *cso) { - struct pipe_sampler_view *so = malloc(sizeof(*so)); + struct vc4_sampler_view *so = malloc(sizeof(*so)); struct vc4_resource *rsc = vc4_resource(prsc); if (!so) return NULL; - *so = *cso; + so->base = *cso; pipe_reference(NULL, &prsc->reference); @@ -516,18 +569,19 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, * Also, Raspberry Pi doesn't support sampling from raster textures, * so we also have to copy to a temporary then. */ - if (so->u.tex.first_level || + if (cso->u.tex.first_level || rsc->vc4_format == VC4_TEXTURE_TYPE_RGBA32R) { struct vc4_resource *shadow_parent = vc4_resource(prsc); struct pipe_resource tmpl = shadow_parent->base.b; struct vc4_resource *clone; tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET; - tmpl.width0 = u_minify(tmpl.width0, so->u.tex.first_level); - tmpl.height0 = u_minify(tmpl.height0, so->u.tex.first_level); - tmpl.last_level = so->u.tex.last_level - so->u.tex.first_level; + tmpl.width0 = u_minify(tmpl.width0, cso->u.tex.first_level); + tmpl.height0 = u_minify(tmpl.height0, cso->u.tex.first_level); + tmpl.last_level = cso->u.tex.last_level - cso->u.tex.first_level; prsc = vc4_resource_create(pctx->screen, &tmpl); + rsc = vc4_resource(prsc); clone = vc4_resource(prsc); clone->shadow_parent = &shadow_parent->base.b; /* Flag it as needing update of the contents from the parent. 
*/ @@ -535,11 +589,23 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, assert(clone->vc4_format != VC4_TEXTURE_TYPE_RGBA32R); } - so->texture = prsc; - so->reference.count = 1; - so->context = pctx; - - return so; + so->base.texture = prsc; + so->base.reference.count = 1; + so->base.context = pctx; + + so->texture_p0 = + (VC4_SET_FIELD(rsc->slices[0].offset >> 12, VC4_TEX_P0_OFFSET) | + VC4_SET_FIELD(rsc->vc4_format & 15, VC4_TEX_P0_TYPE) | + VC4_SET_FIELD(cso->u.tex.last_level - + cso->u.tex.first_level, VC4_TEX_P0_MIPLVLS) | + VC4_SET_FIELD(cso->target == PIPE_TEXTURE_CUBE, + VC4_TEX_P0_CMMODE)); + so->texture_p1 = + (VC4_SET_FIELD(rsc->vc4_format >> 4, VC4_TEX_P1_TYPE4) | + VC4_SET_FIELD(prsc->height0 & 2047, VC4_TEX_P1_HEIGHT) | + VC4_SET_FIELD(prsc->width0 & 2047, VC4_TEX_P1_WIDTH)); + + return &so->base; } static void diff --git a/src/gallium/drivers/vc4/vc4_tiling.c b/src/gallium/drivers/vc4/vc4_tiling.c index f9801c9cefd..cf86eb0fa31 100644 --- a/src/gallium/drivers/vc4/vc4_tiling.c +++ b/src/gallium/drivers/vc4/vc4_tiling.c @@ -127,13 +127,10 @@ vc4_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp) static void check_box_utile_alignment(const struct pipe_box *box, int cpp) { - uint32_t utile_w = vc4_utile_width(cpp); - uint32_t utile_h = vc4_utile_height(cpp); - - assert(!(box->x & (utile_w - 1))); - assert(!(box->y & (utile_h - 1))); - assert(!(box->width & (utile_w - 1))); - assert(!(box->height & (utile_h - 1))); + assert(!(box->x & (vc4_utile_width(cpp) - 1))); + assert(!(box->y & (vc4_utile_height(cpp) - 1))); + assert(!(box->width & (vc4_utile_width(cpp) - 1))); + assert(!(box->height & (vc4_utile_height(cpp) - 1))); } static void diff --git a/src/gallium/drivers/vc4/vc4_tiling.h b/src/gallium/drivers/vc4/vc4_tiling.h index b5d10da3417..b90bba70200 100644 --- a/src/gallium/drivers/vc4/vc4_tiling.h +++ b/src/gallium/drivers/vc4/vc4_tiling.h @@ -24,9 +24,9 @@ #ifndef VC4_TILING_H #define VC4_TILING_H -uint32_t vc4_utile_width(int cpp); -uint32_t vc4_utile_height(int cpp); -bool vc4_size_is_lt(uint32_t width, uint32_t height, int cpp); +uint32_t vc4_utile_width(int cpp) ATTRIBUTE_CONST; +uint32_t vc4_utile_height(int cpp) ATTRIBUTE_CONST; +bool vc4_size_is_lt(uint32_t width, uint32_t height, int cpp) ATTRIBUTE_CONST; void vc4_load_utile(void *dst, void *src, uint32_t dst_stride, uint32_t cpp); void vc4_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp); void vc4_load_tiled_image(void *dst, uint32_t dst_stride, diff --git a/src/gallium/drivers/vc4/vc4_uniforms.c b/src/gallium/drivers/vc4/vc4_uniforms.c new file mode 100644 index 00000000000..85d6998205e --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_uniforms.c @@ -0,0 +1,344 @@ +/* + * Copyright © 2014-2015 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "util/u_pack_color.h" +#include "util/format_srgb.h" + +#include "vc4_context.h" +#include "vc4_qir.h" + +static void +write_texture_p0(struct vc4_context *vc4, + struct vc4_cl_out **uniforms, + struct vc4_texture_stateobj *texstate, + uint32_t unit) +{ + struct vc4_sampler_view *sview = + vc4_sampler_view(texstate->textures[unit]); + struct vc4_resource *rsc = vc4_resource(sview->base.texture); + + cl_reloc(vc4, &vc4->uniforms, uniforms, rsc->bo, sview->texture_p0); +} + +static void +write_texture_p1(struct vc4_context *vc4, + struct vc4_cl_out **uniforms, + struct vc4_texture_stateobj *texstate, + uint32_t unit) +{ + struct vc4_sampler_view *sview = + vc4_sampler_view(texstate->textures[unit]); + struct vc4_sampler_state *sampler = + vc4_sampler_state(texstate->samplers[unit]); + + cl_aligned_u32(uniforms, sview->texture_p1 | sampler->texture_p1); +} + +static void +write_texture_p2(struct vc4_context *vc4, + struct vc4_cl_out **uniforms, + struct vc4_texture_stateobj *texstate, + uint32_t data) +{ + uint32_t unit = data & 0xffff; + struct pipe_sampler_view *texture = texstate->textures[unit]; + struct vc4_resource *rsc = vc4_resource(texture->texture); + + cl_aligned_u32(uniforms, + VC4_SET_FIELD(VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE, + VC4_TEX_P2_PTYPE) | + VC4_SET_FIELD(rsc->cube_map_stride >> 12, VC4_TEX_P2_CMST) | + VC4_SET_FIELD((data >> 16) & 1, VC4_TEX_P2_BSLOD)); +} + + +#define SWIZ(x,y,z,w) { \ + UTIL_FORMAT_SWIZZLE_##x, \ + UTIL_FORMAT_SWIZZLE_##y, \ + UTIL_FORMAT_SWIZZLE_##z, \ + UTIL_FORMAT_SWIZZLE_##w \ +} + +static void +write_texture_border_color(struct vc4_context *vc4, + struct vc4_cl_out **uniforms, + struct vc4_texture_stateobj *texstate, + uint32_t unit) +{ + struct pipe_sampler_state *sampler = texstate->samplers[unit]; + struct pipe_sampler_view *texture = texstate->textures[unit]; + struct vc4_resource *rsc = vc4_resource(texture->texture); + union util_color uc; + + const struct util_format_description *tex_format_desc = + util_format_description(texture->format); + + float border_color[4]; + for (int i = 0; i < 4; i++) + border_color[i] = sampler->border_color.f[i]; + if (util_format_is_srgb(texture->format)) { + for (int i = 0; i < 3; i++) + border_color[i] = + util_format_linear_to_srgb_float(border_color[i]); + } + + /* Turn the border color into the layout of channels that it would + * have when stored as texture contents. + */ + float storage_color[4]; + util_format_unswizzle_4f(storage_color, + border_color, + tex_format_desc->swizzle); + + /* Now, pack so that when the vc4_format-sampled texture contents are + * replaced with our border color, the vc4_get_format_swizzle() + * swizzling will get the right channels. 
+ */ + if (util_format_is_depth_or_stencil(texture->format)) { + uc.ui[0] = util_pack_z(PIPE_FORMAT_Z24X8_UNORM, + sampler->border_color.f[0]) << 8; + } else { + switch (rsc->vc4_format) { + default: + case VC4_TEXTURE_TYPE_RGBA8888: + util_pack_color(storage_color, + PIPE_FORMAT_R8G8B8A8_UNORM, &uc); + break; + case VC4_TEXTURE_TYPE_RGBA4444: + util_pack_color(storage_color, + PIPE_FORMAT_A8B8G8R8_UNORM, &uc); + break; + case VC4_TEXTURE_TYPE_RGB565: + util_pack_color(storage_color, + PIPE_FORMAT_B8G8R8A8_UNORM, &uc); + break; + case VC4_TEXTURE_TYPE_ALPHA: + uc.ui[0] = float_to_ubyte(storage_color[0]) << 24; + break; + case VC4_TEXTURE_TYPE_LUMALPHA: + uc.ui[0] = ((float_to_ubyte(storage_color[1]) << 24) | + (float_to_ubyte(storage_color[0]) << 0)); + break; + } + } + + cl_aligned_u32(uniforms, uc.ui[0]); +} + +static uint32_t +get_texrect_scale(struct vc4_texture_stateobj *texstate, + enum quniform_contents contents, + uint32_t data) +{ + struct pipe_sampler_view *texture = texstate->textures[data]; + uint32_t dim; + + if (contents == QUNIFORM_TEXRECT_SCALE_X) + dim = texture->texture->width0; + else + dim = texture->texture->height0; + + return fui(1.0f / dim); +} + +static struct vc4_bo * +vc4_upload_ubo(struct vc4_context *vc4, + struct vc4_compiled_shader *shader, + const uint32_t *gallium_uniforms) +{ + if (!shader->ubo_size) + return NULL; + + struct vc4_bo *ubo = vc4_bo_alloc(vc4->screen, shader->ubo_size, "ubo"); + uint32_t *data = vc4_bo_map(ubo); + for (uint32_t i = 0; i < shader->num_ubo_ranges; i++) { + memcpy(data + shader->ubo_ranges[i].dst_offset, + gallium_uniforms + shader->ubo_ranges[i].src_offset, + shader->ubo_ranges[i].size); + } + + return ubo; +} + +void +vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, + struct vc4_constbuf_stateobj *cb, + struct vc4_texture_stateobj *texstate) +{ + struct vc4_shader_uniform_info *uinfo = &shader->uniforms; + const uint32_t *gallium_uniforms = cb->cb[0].user_buffer; + struct vc4_bo *ubo = vc4_upload_ubo(vc4, shader, gallium_uniforms); + + cl_ensure_space(&vc4->uniforms, (uinfo->count + + uinfo->num_texture_samples) * 4); + + struct vc4_cl_out *uniforms = + cl_start_shader_reloc(&vc4->uniforms, + uinfo->num_texture_samples); + + for (int i = 0; i < uinfo->count; i++) { + + switch (uinfo->contents[i]) { + case QUNIFORM_CONSTANT: + cl_aligned_u32(&uniforms, uinfo->data[i]); + break; + case QUNIFORM_UNIFORM: + cl_aligned_u32(&uniforms, + gallium_uniforms[uinfo->data[i]]); + break; + case QUNIFORM_VIEWPORT_X_SCALE: + cl_aligned_f(&uniforms, vc4->viewport.scale[0] * 16.0f); + break; + case QUNIFORM_VIEWPORT_Y_SCALE: + cl_aligned_f(&uniforms, vc4->viewport.scale[1] * 16.0f); + break; + + case QUNIFORM_VIEWPORT_Z_OFFSET: + cl_aligned_f(&uniforms, vc4->viewport.translate[2]); + break; + case QUNIFORM_VIEWPORT_Z_SCALE: + cl_aligned_f(&uniforms, vc4->viewport.scale[2]); + break; + + case QUNIFORM_USER_CLIP_PLANE: + cl_aligned_f(&uniforms, + vc4->clip.ucp[uinfo->data[i] / 4][uinfo->data[i] % 4]); + break; + + case QUNIFORM_TEXTURE_CONFIG_P0: + write_texture_p0(vc4, &uniforms, texstate, + uinfo->data[i]); + break; + + case QUNIFORM_TEXTURE_CONFIG_P1: + write_texture_p1(vc4, &uniforms, texstate, + uinfo->data[i]); + break; + + case QUNIFORM_TEXTURE_CONFIG_P2: + write_texture_p2(vc4, &uniforms, texstate, + uinfo->data[i]); + break; + + case QUNIFORM_UBO_ADDR: + cl_aligned_reloc(vc4, &vc4->uniforms, &uniforms, ubo, 0); + break; + + case QUNIFORM_TEXTURE_BORDER_COLOR: + write_texture_border_color(vc4, &uniforms, + 
texstate, uinfo->data[i]); + break; + + case QUNIFORM_TEXRECT_SCALE_X: + case QUNIFORM_TEXRECT_SCALE_Y: + cl_aligned_u32(&uniforms, + get_texrect_scale(texstate, + uinfo->contents[i], + uinfo->data[i])); + break; + + case QUNIFORM_BLEND_CONST_COLOR_X: + case QUNIFORM_BLEND_CONST_COLOR_Y: + case QUNIFORM_BLEND_CONST_COLOR_Z: + case QUNIFORM_BLEND_CONST_COLOR_W: + cl_aligned_f(&uniforms, + CLAMP(vc4->blend_color.color[uinfo->contents[i] - + QUNIFORM_BLEND_CONST_COLOR_X], + 0, 1)); + break; + + case QUNIFORM_STENCIL: + cl_aligned_u32(&uniforms, + vc4->zsa->stencil_uniforms[uinfo->data[i]] | + (uinfo->data[i] <= 1 ? + (vc4->stencil_ref.ref_value[uinfo->data[i]] << 8) : + 0)); + break; + + case QUNIFORM_ALPHA_REF: + cl_aligned_f(&uniforms, + vc4->zsa->base.alpha.ref_value); + break; + } +#if 0 + uint32_t written_val = *((uint32_t *)uniforms - 1); + fprintf(stderr, "%p: %d / 0x%08x (%f)\n", + shader, i, written_val, uif(written_val)); +#endif + } + + cl_end(&vc4->uniforms, uniforms); + + vc4_bo_unreference(&ubo); +} + +void +vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader) +{ + uint32_t dirty = 0; + + for (int i = 0; i < shader->uniforms.count; i++) { + switch (shader->uniforms.contents[i]) { + case QUNIFORM_CONSTANT: + break; + case QUNIFORM_UNIFORM: + case QUNIFORM_UBO_ADDR: + dirty |= VC4_DIRTY_CONSTBUF; + break; + + case QUNIFORM_VIEWPORT_X_SCALE: + case QUNIFORM_VIEWPORT_Y_SCALE: + case QUNIFORM_VIEWPORT_Z_OFFSET: + case QUNIFORM_VIEWPORT_Z_SCALE: + dirty |= VC4_DIRTY_VIEWPORT; + break; + + case QUNIFORM_USER_CLIP_PLANE: + dirty |= VC4_DIRTY_CLIP; + break; + + case QUNIFORM_TEXTURE_CONFIG_P0: + case QUNIFORM_TEXTURE_CONFIG_P1: + case QUNIFORM_TEXTURE_CONFIG_P2: + case QUNIFORM_TEXTURE_BORDER_COLOR: + case QUNIFORM_TEXRECT_SCALE_X: + case QUNIFORM_TEXRECT_SCALE_Y: + dirty |= VC4_DIRTY_TEXSTATE; + break; + + case QUNIFORM_BLEND_CONST_COLOR_X: + case QUNIFORM_BLEND_CONST_COLOR_Y: + case QUNIFORM_BLEND_CONST_COLOR_Z: + case QUNIFORM_BLEND_CONST_COLOR_W: + dirty |= VC4_DIRTY_BLEND_COLOR; + break; + + case QUNIFORM_STENCIL: + case QUNIFORM_ALPHA_REF: + dirty |= VC4_DIRTY_ZSA; + break; + } + } + + shader->uniform_dirty_bits = dirty; +}
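The new vc4_uniforms.c ends with vc4_set_shader_uniform_dirty_flags(), which records, per compiled shader, the union of VC4_DIRTY_* state groups that the shader's uniform stream actually reads. A minimal sketch of how a draw-time caller might consume uniform_dirty_bits follows; the wrapper name emit_stage_uniforms() is hypothetical, while vc4_write_uniforms(), vc4->dirty, and the VC4_DIRTY_* bits come from the patch itself:

/* Hypothetical draw-time helper: skip re-emitting a stage's uniform
 * stream unless some state group that stream depends on is dirty.
 * Assumes vc4->dirty accumulates VC4_DIRTY_* bits as state is bound,
 * as in vc4_context.
 */
static void
emit_stage_uniforms(struct vc4_context *vc4,
                    struct vc4_compiled_shader *shader,
                    struct vc4_constbuf_stateobj *cb,
                    struct vc4_texture_stateobj *texstate)
{
        /* uniform_dirty_bits was filled in at compile time by
         * vc4_set_shader_uniform_dirty_flags().
         */
        if (!(vc4->dirty & shader->uniform_dirty_bits))
                return;

        vc4_write_uniforms(vc4, shader, cb, texstate);
}

A real caller would also have to re-emit when the compiled shader itself changes, since a new shader implies a new uniform layout; that is what the VC4_DIRTY_COMPILED_VS and VC4_DIRTY_COMPILED_CS flags set in vc4_update_compiled_vs() above are for.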